global: snapshot

nym21
2026-01-16 15:17:42 +01:00
parent f39681bb2b
commit 3b00a92fa4
23 changed files with 4904 additions and 845 deletions

@@ -0,0 +1,174 @@
# Oracle Filter Analysis
## Summary
Analysis of ~20M outputs across 2017-2018 to find filters that distinguish accurate price signals from noise.
## Key Finding: Round USD is the Only Reliable Filter
| Filter | Accuracy Advantage | Consistency |
|--------|-------------------|-------------|
| **Round USD = True** | **+20% to +29%** | **12/12 months** |
| Round BTC | +12% to -8% | Flips with price |
| Value range/Decade | varies | Shifts with price |
| Same-day spend | ~3% | Weak |
| Micro-round sats | 0-5% | Inconsistent |
| Tx pattern | <5% | Weak |
| Is smaller output | ~3-4% | Weak |
## Why Other Filters Fail
### Round BTC (Unreliable)
- Jan-Mar 2017 ($1k): Round BTC = True is GOOD (+10-12%)
- Jun-Jul 2017 ($2.5k): Round BTC = True is BAD (-7%)
- Reason: Round BTC only correlates with accuracy when it happens to align with round USD at current price
### Value Range / Decade (Price-Dependent)
- At $1,000/BTC: Decade 5 (100k-1M sats) is good
- At $10,000/BTC: Decade 6 (1M-10M sats) is good
- At $100,000/BTC: Decade 7 (10M-100M sats) would be good
- These shift with price, making them useless as static filters
## The Round USD Insight
Round USD amounts ($1, $5, $10, $20, $50, $100, etc.) always map to the **same phase bins** regardless of the price's decade:
```
$100 at $10,000/BTC = 1,000,000 sats → log10 = 6.0 → phase = 0.0 → bin 0
$100 at $100,000/BTC = 100,000 sats → log10 = 5.0 → phase = 0.0 → bin 0
$100 at $1,000/BTC = 10,000,000 sats → log10 = 7.0 → phase = 0.0 → bin 0
```
The phase = `frac(log10(sats))` is **invariant** to price decade!
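This invariance is easy to check numerically. A minimal sketch in plain Python (standalone, independent of the oracle code):

```python
import math

def sats_phase(sats: int) -> float:
    """Fractional part of log10(sats): position within the decade."""
    return math.log10(sats) % 1.0

# $100 converted to sats at three power-of-ten price levels:
# the decade changes, the phase does not.
for price_usd in (1_000, 10_000, 100_000):
    sats = round(100 / price_usd * 100_000_000)
    print(price_usd, sats, sats_phase(sats))  # phase is 0.0 (up to float rounding)
```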
## Round USD Phase Bins
| USD Amount | log10(USD) | Phase = frac(log10) | Bin (×100) |
|------------|------------|---------------------|------------|
| $1, $10, $100, $1000 | 0, 1, 2, 3 | 0.00 | 0 |
| $1.50, $15, $150 | 0.18, 1.18, 2.18 | 0.18 | 18 |
| $2, $20, $200 | 0.30, 1.30, 2.30 | 0.30 | 30 |
| $2.50, $25, $250 | 0.40, 1.40, 2.40 | 0.40 | 40 |
| $3, $30, $300 | 0.48, 1.48, 2.48 | 0.48 | 48 |
| $4, $40, $400 | 0.60, 1.60, 2.60 | 0.60 | 60 |
| $5, $50, $500 | 0.70, 1.70, 2.70 | 0.70 | 70 |
| $6, $60, $600 | 0.78, 1.78, 2.78 | 0.78 | 78 |
| $7, $70, $700 | 0.85, 1.85, 2.85 | 0.85 | 85 |
| $8, $80, $800 | 0.90, 1.90, 2.90 | 0.90 | 90 |
| $9, $90, $900 | 0.95, 1.95, 2.95 | 0.95 | 95 |
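The bin column above follows mechanically from the definition; a quick sketch to reproduce it (plain Python, one representative amount per row):

```python
import math

ROUND_USD = [1, 1.5, 2, 2.5, 3, 4, 5, 6, 7, 8, 9]

# Multiplying an amount by 10 (e.g. $5 -> $50) adds 1 to log10 and leaves
# the fractional part unchanged, so one amount per row suffices.
bins = [round((math.log10(usd) % 1.0) * 100) for usd in ROUND_USD]
print(bins)  # [0, 18, 30, 40, 48, 60, 70, 78, 85, 90, 95]
```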
## Implementation Plan
### Approach: Phase-Based Round USD Filtering
Filter outputs to only those whose phase bin corresponds to a round USD amount. No price knowledge needed.
```rust
/// Phase bins where round USD amounts cluster.
/// Computed as: bin = round(frac(log10(usd_cents)) * 100)
const ROUND_USD_BINS: &[u8] = &[
    0,  // $1, $10, $100, $1000 (and $0.10, $0.01)
    18, // $1.50, $15, $150
    30, // $2, $20, $200
    40, // $2.50, $25, $250
    48, // $3, $30, $300
    60, // $4, $40, $400
    70, // $5, $50, $500
    78, // $6, $60, $600
    85, // $7, $70, $700
    90, // $8, $80, $800
    95, // $9, $90, $900
];

/// Check if a histogram bin corresponds to a round USD amount
fn is_round_usd_bin(bin: usize, tolerance: u8) -> bool {
    let phase_bin = (bin % 100) as u8;
    ROUND_USD_BINS.iter().any(|&round_bin| {
        let diff = if phase_bin >= round_bin {
            phase_bin - round_bin
        } else {
            round_bin - phase_bin
        };
        // Handle wraparound (bin 99 is close to bin 0)
        diff <= tolerance || (100 - diff) <= tolerance
    })
}
```
### Where to Apply Filter
In `compute.rs`, when adding outputs to histogram:
```rust
for sats in values {
    if let Some(bin) = Histogram::sats_to_bin(sats) {
        // Only include outputs in round-USD phase bins
        if is_round_usd_bin(bin, 2) { // ±2 bin tolerance
            block_sparse.push((bin as u16, 1.0));
            // ... rest of processing
        }
    }
}
```
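For experimenting outside the Rust codebase, the same filter can be sketched in Python. Note that `sats_to_bin` below is an assumption (100 log-spaced bins per decade of the output value); the real `Histogram::sats_to_bin` may differ:

```python
import math

ROUND_USD_BINS = [0, 18, 30, 40, 48, 60, 70, 78, 85, 90, 95]

def sats_to_bin(sats: int) -> int:
    """Assumed binning: 100 bins per decade of the output value."""
    return int(math.log10(sats) * 100)

def is_round_usd_bin(bin_index: int, tolerance: int = 2) -> bool:
    """True if the bin's phase is within `tolerance` of a round-USD bin (with wraparound)."""
    phase_bin = bin_index % 100
    return any(
        min(abs(phase_bin - rb), 100 - abs(phase_bin - rb)) <= tolerance
        for rb in ROUND_USD_BINS
    )

# 1,000,000 sats sits exactly on phase bin 0 and is kept;
# 1,234,567 sats lands on phase bin 9 and is dropped.
kept = [s for s in (1_000_000, 1_234_567) if is_round_usd_bin(sats_to_bin(s))]
```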
### Expected Impact
- Reduces histogram noise by ~60-70%, at a cost: only ~35% of accurate outputs are round USD, so some true signal is dropped as well
- Remaining outputs are 2-3x more likely to be accurate signals
- Stencil matching should be more reliable with cleaner signal
- Decade selection via anchors remains unchanged
### Alternative: Weighted Approach
Instead of hard filtering, weight round-USD bins higher:
```rust
let weight = if is_round_usd_bin(bin, 2) { 3.0 } else { 1.0 };
block_sparse.push((bin as u16, weight));
```
This preserves some signal from non-round outputs while emphasizing round USD.
## Bin Resolution: 100 vs 200
UTXOracle uses **200 bins per decade**. Current phase oracle uses 100.
| Resolution | Precision | Round USD cluster |
|------------|-----------|-------------------|
| 100 bins | 1% per bin | Wider, more overlap |
| 200 bins | 0.5% per bin | Tighter, cleaner separation |
**Round USD bins at 200 resolution:**
| USD Amount | Phase = frac(log10) | Bin (×200) |
|------------|---------------------|------------|
| $1, $10, $100 | 0.000 | 0 |
| $1.50, $15, $150 | 0.176 | 35 |
| $2, $20, $200 | 0.301 | 60 |
| $2.50, $25, $250 | 0.398 | 80 |
| $3, $30, $300 | 0.477 | 95 |
| $4, $40, $400 | 0.602 | 120 |
| $5, $50, $500 | 0.699 | 140 |
| $6, $60, $600 | 0.778 | 156 |
| $7, $70, $700 | 0.845 | 169 |
| $8, $80, $800 | 0.903 | 181 |
| $9, $90, $900 | 0.954 | 191 |
**Recommendation**: Use 200 bins for:
1. Compatibility with UTXOracle stencil
2. Tighter round-USD detection
3. Better separation of signal from noise
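The 200-bin column is the same computation with the resolution doubled; a quick standalone check:

```python
import math

ROUND_USD = [1, 1.5, 2, 2.5, 3, 4, 5, 6, 7, 8, 9]

# Same phase formula as before, scaled to 200 bins per decade.
bins_200 = [round((math.log10(usd) % 1.0) * 200) for usd in ROUND_USD]
print(bins_200)  # [0, 35, 60, 80, 95, 120, 140, 156, 169, 181, 191]
```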
## Questions to Resolve
1. **Tolerance**: ±2 bins (at 200) = ±1% vs ±4 bins = ±2%
2. **Hard filter vs weight**: Filter completely or just weight higher?
3. **Minimum count threshold**: What if too few outputs pass filter?
4. **Interaction with existing smooth_round_btc()**: Still needed?
5. **Migration**: Update PHASE_BINS constant from 100 to 200
## Validation Plan
1. Implement phase-based filtering
2. Run on 2017-2018 data
3. Compare accuracy vs current approach
4. Tune tolerance parameter

@@ -0,0 +1,282 @@
#!/usr/bin/env python3
"""
Test price phase detection from outputs alone.
The idea: Round USD outputs create a fingerprint pattern that reveals the price phase.
"""
import math
import http.client
import json
import time
API_HOST = "localhost"
API_PORT = 3110
# Round USD phases (fixed fingerprint)
# These are frac(log10(usd_cents)) for round USD values
ROUND_USD_PHASES = [
    0.00,  # $1, $10, $100, $1000
    0.18,  # $1.50, $15, $150
    0.30,  # $2, $20, $200
    0.40,  # $2.50, $25, $250
    0.48,  # $3, $30, $300
    0.60,  # $4, $40, $400
    0.70,  # $5, $50, $500
    0.78,  # $6, $60, $600
    0.85,  # $7, $70, $700
    0.90,  # $8, $80, $800
    0.95,  # $9, $90, $900
]
_conn = None


def get_conn():
    global _conn
    if _conn is None:
        _conn = http.client.HTTPConnection(API_HOST, API_PORT, timeout=300)
    return _conn


def reset_conn():
    global _conn
    if _conn:
        try:
            _conn.close()
        except Exception:
            pass
    _conn = None
def fetch(path: str, retries: int = 3):
    for attempt in range(retries):
        try:
            conn = get_conn()
            conn.request("GET", path)
            resp = conn.getresponse()
            data = resp.read().decode('utf-8')
            return json.loads(data)
        except Exception:
            reset_conn()
            if attempt < retries - 1:
                time.sleep(2)
            else:
                raise


def fetch_chunked(path_template: str, start: int, end: int, chunk_size: int = 25000) -> list:
    result = []
    for chunk_start in range(start, end, chunk_size):
        chunk_end = min(chunk_start + chunk_size, end)
        path = path_template.format(start=chunk_start, end=chunk_end)
        data = fetch(path)["data"]
        result.extend(data)
    return result
def get_sats_phase(sats: int) -> float:
    """Get the phase (fractional part of log10) for a sats value."""
    if sats <= 0:
        return 0.0
    return math.log10(sats) % 1.0


def count_round_usd_matches(outputs: list, price_phase: float, tolerance: float = 0.02) -> int:
    """
    Count how many outputs match round USD bins at the given price phase.

    At price phase P, round USD outputs should appear at
    sats_phase = (usd_phase - P) mod 1.
    """
    # Compute expected sats phases for round USD at this price phase
    expected_phases = [(usd_phase - price_phase) % 1.0 for usd_phase in ROUND_USD_PHASES]
    count = 0
    for sats in outputs:
        if sats is None or sats < 1000:
            continue
        sats_phase = get_sats_phase(sats)
        # Check if sats_phase matches any expected phase
        for exp_phase in expected_phases:
            diff = abs(sats_phase - exp_phase)
            # Handle wraparound (0.99 is close to 0.01)
            if diff < tolerance or diff > (1.0 - tolerance):
                count += 1
                break
    return count
def find_best_price_phase(outputs: list, tolerance: float = 0.02, resolution: int = 100) -> tuple:
    """
    Find the price phase that maximizes round USD matches.

    Returns (best_phase, best_count, all_counts).
    """
    counts = []
    best_phase = 0.0
    best_count = 0
    for i in range(resolution):
        price_phase = i / resolution
        count = count_round_usd_matches(outputs, price_phase, tolerance)
        counts.append(count)
        if count > best_count:
            best_count = count
            best_phase = price_phase
    return best_phase, best_count, counts


def actual_price_phase(price: float) -> float:
    """Get the actual price phase from a price."""
    return math.log10(price) % 1.0
def analyze_day(date_str: str, start_height: int, end_height: int, actual_price: float):
    """Analyze a single day's outputs."""
    # Get transaction range for these heights
    first_tx = fetch(f"/api/metric/first_txindex/height?start={start_height}&end={end_height}")
    first_txs = first_tx["data"]
    if not first_txs or len(first_txs) < 2:
        return None
    tx_start = first_txs[0]
    tx_end = first_txs[-1]
    # Get output range
    tx_first_out = fetch_chunked("/api/metric/first_txoutindex/txindex?start={start}&end={end}", tx_start, tx_end)
    if not tx_first_out:
        return None
    out_start = tx_first_out[0]
    out_end = tx_first_out[-1] + 10  # estimate
    # Fetch output values
    out_values = fetch_chunked("/api/metric/value/txoutindex?start={start}&end={end}", out_start, out_end)
    # Filter to a reasonable range (1000 sats to 100 BTC)
    outputs = [v for v in out_values if v and 1000 <= v <= 10_000_000_000]
    if len(outputs) < 1000:
        return None
    # Find the best price phase
    detected_phase, match_count, _ = find_best_price_phase(outputs, tolerance=0.02)
    # Compare with the actual phase
    actual_phase = actual_price_phase(actual_price)
    # Phase error (handle wraparound)
    phase_error = abs(detected_phase - actual_phase)
    if phase_error > 0.5:
        phase_error = 1.0 - phase_error
    return {
        'date': date_str,
        'actual_price': actual_price,
        'actual_phase': actual_phase,
        'detected_phase': detected_phase,
        'phase_error': phase_error,
        'match_count': match_count,
        'total_outputs': len(outputs),
        'match_pct': 100 * match_count / len(outputs),
    }
def main():
    print("=" * 60)
    print("PRICE PHASE DETECTION TEST")
    print("=" * 60)
    print("\nIdea: Round USD outputs form a fingerprint pattern.")
    print("Sliding this pattern across the histogram reveals the price phase.\n")
    # Fetch dates
    print("Fetching date index...")
    dates = fetch("/api/metric/date/dateindex?start=0&end=4000")["data"]
    # Fetch daily OHLC
    print("Fetching daily prices...")
    ohlc_data = fetch("/api/metric/price_ohlc/dateindex?start=2800&end=3600")["data"]
    # Fetch heights
    print("Fetching heights...")
    heights = fetch("/api/metric/first_height/dateindex?start=2800&end=3600")["data"]
    results = []
    # Test on 2017-2018 (roughly dateindex 2900-3600),
    # sampling every 7 days to speed things up
    for di in range(2900, 3550, 7):
        if di - 2800 >= len(ohlc_data) or di - 2800 >= len(heights):
            continue
        ohlc = ohlc_data[di - 2800]
        if not ohlc or len(ohlc) < 4:
            continue
        # Use close price as "actual"
        actual_price = ohlc[3]
        if not actual_price or actual_price <= 0:
            continue
        date_str = dates[di] if di < len(dates) else f"di={di}"
        start_height = heights[di - 2800]
        end_height = heights[di - 2800 + 1] if di - 2800 + 1 < len(heights) else start_height + 144
        if not start_height:
            continue
        print(f"\nAnalyzing {date_str} (${actual_price:.0f})...")
        try:
            result = analyze_day(date_str, start_height, end_height, actual_price)
            if result:
                results.append(result)
                print(f"  Actual phase:   {result['actual_phase']:.3f}")
                print(f"  Detected phase: {result['detected_phase']:.3f}")
                print(f"  Phase error:    {result['phase_error']:.3f} ({result['phase_error']*100:.1f}%)")
                print(f"  Matches: {result['match_count']:,} / {result['total_outputs']:,} ({result['match_pct']:.1f}%)")
        except Exception as e:
            print(f"  Error: {e}")
            continue
    # Summary
    if results:
        print("\n" + "=" * 60)
        print("SUMMARY")
        print("=" * 60)
        errors = [r['phase_error'] for r in results]
        avg_error = sum(errors) / len(errors)
        # Count how many fall within various thresholds
        within_01 = sum(1 for e in errors if e <= 0.01)
        within_02 = sum(1 for e in errors if e <= 0.02)
        within_05 = sum(1 for e in errors if e <= 0.05)
        within_10 = sum(1 for e in errors if e <= 0.10)
        print(f"\nTotal days analyzed: {len(results)}")
        print(f"Average phase error: {avg_error:.3f} ({avg_error*100:.1f}%)")
        print("\nPhase error distribution:")
        print(f"  ≤1%:  {within_01:3d} / {len(results)} ({100*within_01/len(results):.0f}%)")
        print(f"  ≤2%:  {within_02:3d} / {len(results)} ({100*within_02/len(results):.0f}%)")
        print(f"  ≤5%:  {within_05:3d} / {len(results)} ({100*within_05/len(results):.0f}%)")
        print(f"  ≤10%: {within_10:3d} / {len(results)} ({100*within_10/len(results):.0f}%)")
        # Show worst cases
        print("\nWorst cases:")
        worst = sorted(results, key=lambda r: -r['phase_error'])[:5]
        for r in worst:
            print(f"  {r['date']}: detected {r['detected_phase']:.2f} vs actual {r['actual_phase']:.2f} "
                  f"(error {r['phase_error']:.2f}, ${r['actual_price']:.0f})")
        # Show best cases
        print("\nBest cases:")
        best = sorted(results, key=lambda r: r['phase_error'])[:5]
        for r in best:
            print(f"  {r['date']}: detected {r['detected_phase']:.2f} vs actual {r['actual_phase']:.2f} "
                  f"(error {r['phase_error']:.3f}, ${r['actual_price']:.0f})")


if __name__ == "__main__":
    main()