Files
brk/research/analyze_price_signals.py
2026-01-16 15:17:42 +01:00

1082 lines
51 KiB
Python

#!/usr/bin/env python3
"""
Analyze ALL outputs to find characteristics that correlate with accurate price signals.
Uses txoutindex directly - no pre-filtering.
"""
import urllib.request
import http.client
import json
import math
import bisect
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Optional
import sys
import time
BASE_URL = "http://localhost:3110"
API_HOST = "localhost"
API_PORT = 3110
# Persistent connection
_conn = None
def get_conn():
global _conn
if _conn is None:
_conn = http.client.HTTPConnection(API_HOST, API_PORT, timeout=300)
return _conn
def reset_conn():
global _conn
if _conn:
try:
_conn.close()
except:
pass
_conn = None
# Monthly prices for 2017-2018 (CoinGecko open prices)
MONTHLY_PRICES = {
(2017, 1): 970, (2017, 2): 968, (2017, 3): 1190, (2017, 4): 1080,
(2017, 5): 1362, (2017, 6): 2299, (2017, 7): 2455, (2017, 8): 2865,
(2017, 9): 4738, (2017, 10): 4334, (2017, 11): 6440, (2017, 12): 9968,
(2018, 1): 13888, (2018, 2): 10116, (2018, 3): 10307, (2018, 4): 6922,
(2018, 5): 9243, (2018, 6): 7487, (2018, 7): 6386, (2018, 8): 7726,
(2018, 9): 7016, (2018, 10): 6566, (2018, 11): 6305, (2018, 12): 3972,
}
def fetch(path: str, retries: int = 5):
"""Fetch JSON from API with retry logic and connection reuse."""
for attempt in range(retries):
try:
conn = get_conn()
conn.request("GET", path)
resp = conn.getresponse()
data = resp.read().decode('utf-8')
return json.loads(data)
except Exception as e:
reset_conn() # Reset connection on error
if attempt < retries - 1:
wait_time = (attempt + 1) * 3 # 3, 6, 9, 12 seconds
print(f" Retry {attempt + 1}/{retries} after {wait_time}s: {type(e).__name__}")
time.sleep(wait_time)
else:
raise
def fetch_chunked(path_template: str, start: int, end: int, chunk_size: int = 25000) -> list:
"""Fetch data in chunks to avoid API limits."""
result = []
total_chunks = (end - start + chunk_size - 1) // chunk_size
for i, chunk_start in enumerate(range(start, end, chunk_size)):
chunk_end = min(chunk_start + chunk_size, end)
path = path_template.format(start=chunk_start, end=chunk_end)
if i % 20 == 0 and i > 0:
print(f" chunk {i}/{total_chunks}...")
data = fetch(path)["data"]
result.extend(data)
return result
def get_phase_bin_and_decade(sats: int) -> tuple:
"""Get (phase_bin, decade) for sats value. Returns (None, 0) if out of range."""
if sats < 1000 or sats > 10_000_000_000_000:
return None, 0
log_sats = math.log10(sats)
decade = int(math.floor(log_sats))
phase = log_sats - decade
return min(int(phase * 100), 99), decade
def get_phase_bin(sats: int) -> Optional[int]:
"""Get phase bin for sats value (0-99), or None if out of range."""
return get_phase_bin_and_decade(sats)[0]
def get_decade(sats: int) -> int:
"""Get the decade (power of 10) for sats value."""
if sats <= 0:
return 0
return int(math.floor(math.log10(sats)))
def sats_to_implied_btc_price(sats: int, btc_price: float) -> float:
"""
What BTC price does this output imply?
If someone paid X sats for $Y worth of goods, and BTC = btc_price,
then the implied price = (sats / 1e8) * btc_price.
But we don't know Y. So instead, assume the output represents ~$btc_price worth,
and see what price that implies: implied_price = btc_price * (1e8 / sats).
"""
if sats <= 0:
return 0
return btc_price * (100_000_000 / sats)
def bin_to_price(bin_idx: int, anchor: float) -> float:
"""Convert bin to price using anchor for decade selection."""
EXPONENT = 5.0
phase = (bin_idx + 0.5) / 100
raw_price = 10 ** (EXPONENT - phase)
decade_ratio = round(math.log10(anchor / raw_price))
return raw_price * (10 ** decade_ratio)
def build_bin_classifier(btc_price: float) -> dict:
"""
Precompute classification for each bin (0-99) at given BTC price.
Returns dict mapping bin -> category.
"""
EXPONENT = 5.0
result = {}
for bin_idx in range(100):
phase = (bin_idx + 0.5) / 100
raw_price = 10 ** (EXPONENT - phase)
best_error = float('inf')
best_decade = 0
for decade in range(-4, 5):
price = raw_price * (10 ** decade)
error = abs(price - btc_price) / btc_price
if error < best_error:
best_error = error
best_decade = decade
anchor_decade = round(math.log10(btc_price / raw_price))
if best_error <= 0.15:
result[bin_idx] = "accurate"
elif best_error <= 0.30:
result[bin_idx] = "close"
elif anchor_decade != best_decade:
result[bin_idx] = "wrong_decade"
else:
result[bin_idx] = "noise"
return result
def build_bin_classifier_range(low_price: float, high_price: float) -> dict:
"""
Precompute classification for each bin (0-99) given daily low-high range.
An output is "accurate" if its implied price falls within the daily range.
"""
EXPONENT = 5.0
result = {}
mid_price = (low_price + high_price) / 2
for bin_idx in range(100):
phase = (bin_idx + 0.5) / 100
raw_price = 10 ** (EXPONENT - phase)
# Find the best decade match using mid price as anchor
best_error_vs_mid = float('inf')
best_decade = 0
for decade in range(-4, 5):
price = raw_price * (10 ** decade)
error = abs(price - mid_price) / mid_price
if error < best_error_vs_mid:
best_error_vs_mid = error
best_decade = decade
# Get the implied price at best decade
implied_price = raw_price * (10 ** best_decade)
# Check if implied price falls within the daily range (with tolerance)
# ±5% for accurate (range already captures intraday variation)
# ±15% for close
range_low = low_price * 0.95
range_high = high_price * 1.05
anchor_decade = round(math.log10(mid_price / raw_price))
if range_low <= implied_price <= range_high:
result[bin_idx] = "accurate"
elif low_price * 0.85 <= implied_price <= high_price * 1.15:
result[bin_idx] = "close"
elif anchor_decade != best_decade:
result[bin_idx] = "wrong_decade"
else:
result[bin_idx] = "noise"
return result
def classify_accuracy_fast(bin_idx: Optional[int], classifier: dict) -> str:
"""Fast classification using precomputed bin classifier."""
if bin_idx is None:
return "noise"
return classifier.get(bin_idx, "noise")
# Precompute round BTC values as a set with tolerance ranges
_ROUND_VALUES = [1000, 10000, 20000, 30000, 50000, 100000, 200000, 300000, 500000,
1000000, 2000000, 3000000, 5000000, 10000000, 20000000, 30000000,
50000000, 100000000, 1000000000]
# Build set of all "close enough" values (within 0.1%)
_ROUND_SET = set()
for r in _ROUND_VALUES:
tol = int(r * 0.001)
for v in range(r - tol, r + tol + 1):
_ROUND_SET.add(v)
def is_round_btc(sats: int) -> bool:
"""Check if sats is a round BTC amount (within 0.1%)."""
return sats in _ROUND_SET
# Round USD values to check (in cents to avoid float issues) - sorted for binary search
_ROUND_USD_VALUES = [100, 500, 1000, 2000, 2500, 5000, 10000, 20000, 25000, 50000,
100000, 200000, 250000, 500000, 1000000, 2000000, 2500000,
5000000, 10000000] # $1 to $100,000
def is_round_usd(sats: int, btc_low: float, btc_high: float, tolerance: float = 0.05) -> bool:
"""Check if implied USD value is close to a round amount at any price in range."""
if sats <= 0 or btc_low <= 0 or btc_high <= 0:
return False
# Calculate implied USD range (low price = low USD, high price = high USD)
implied_usd_low = int(sats * btc_low / 1_000_000) # cents
implied_usd_high = int(sats * btc_high / 1_000_000) # cents
# Check if any round USD value falls within (or near) the implied range
for round_val in _ROUND_USD_VALUES:
# The implied USD at some point during the day could have been round_val
# if round_val is within [implied_low * (1-tol), implied_high * (1+tol)]
range_low = implied_usd_low * (1 - tolerance)
range_high = implied_usd_high * (1 + tolerance)
if range_low <= round_val <= range_high:
return True
return False
# Micro-round sats: specific round values with 0.01% tolerance (UTXOracle style)
# These are values like 50000, 100000, 200000, etc. that aren't caught by is_round_btc
_MICRO_ROUND_SATS = []
# 50k-100k range (step 10k)
for v in range(50000, 100000, 10000):
_MICRO_ROUND_SATS.append(v)
# 100k-1M range (step 10k)
for v in range(100000, 1000000, 10000):
_MICRO_ROUND_SATS.append(v)
# 1M-10M range (step 100k)
for v in range(1000000, 10000000, 100000):
_MICRO_ROUND_SATS.append(v)
# 10M-100M range (step 1M)
for v in range(10000000, 100000000, 1000000):
_MICRO_ROUND_SATS.append(v)
_MICRO_ROUND_SATS = sorted(set(_MICRO_ROUND_SATS))
def is_micro_round_sats(sats: int, tolerance: float = 0.0001) -> bool:
"""Check if sats is a micro-round amount (within 0.01% of specific values)."""
if sats <= 0:
return False
# Binary search for nearest
idx = bisect.bisect_left(_MICRO_ROUND_SATS, sats)
for i in [idx - 1, idx]:
if 0 <= i < len(_MICRO_ROUND_SATS):
round_val = _MICRO_ROUND_SATS[i]
if abs(sats - round_val) <= round_val * tolerance:
return True
return False
# Phase bins where round USD amounts cluster (0-199 at 200 bins/decade)
# Calculated as: bin = int(frac(log10(usd_cents)) * 200)
# This is price-independent - works regardless of BTC price level!
ROUND_USD_PHASE_BINS_200 = [
0, # $1, $10, $100, $1000 (log10 = 0, 1, 2, 3)
35, # $1.50, $15, $150 (log10 = 0.176)
60, # $2, $20, $200 (log10 = 0.301)
80, # $2.50, $25, $250 (log10 = 0.398)
95, # $3, $30, $300 (log10 = 0.477)
120, # $4, $40, $400 (log10 = 0.602)
140, # $5, $50, $500 (log10 = 0.699)
156, # $6, $60, $600 (log10 = 0.778)
169, # $7, $70, $700 (log10 = 0.845)
181, # $8, $80, $800 (log10 = 0.903)
191, # $9, $90, $900 (log10 = 0.954)
]
def is_round_usd_phase(sats: int, tolerance_bins: int) -> bool:
"""
Check if sats falls into a round-USD phase bin. NO PRICE NEEDED!
Uses 200 bins/decade resolution.
Tolerance in bins: 2 bins = 1%, 4 bins = 2%, 10 bins = 5%, 20 bins = 10%
"""
if sats < 1000: # Skip very small values
return False
phase = math.log10(sats) % 1.0 # fractional part (0.0 to 1.0)
bin_idx = int(phase * 200) # convert to bin (0 to 199)
for round_bin in ROUND_USD_PHASE_BINS_200:
diff = abs(bin_idx - round_bin)
# Handle wraparound (bin 199 is close to bin 0)
if diff <= tolerance_bins or (200 - diff) <= tolerance_bins:
return True
return False
def get_tx_pattern(input_count: int, output_count: int) -> str:
"""Categorize transaction by input/output pattern."""
if input_count == 1:
if output_count == 1:
return "1-to-1"
elif output_count == 2:
return "1-to-2"
else:
return "1-to-many"
elif input_count == 2:
if output_count == 1:
return "2-to-1"
elif output_count == 2:
return "2-to-2"
else:
return "2-to-many"
else: # many inputs (3+)
if output_count == 1:
return "many-to-1"
elif output_count == 2:
return "many-to-2"
else:
return "many-to-many"
@dataclass
class Stats:
"""Aggregated statistics."""
total: int = 0
by_output_count: dict = field(default_factory=lambda: defaultdict(int))
by_input_count: dict = field(default_factory=lambda: defaultdict(int))
by_output_type: dict = field(default_factory=lambda: defaultdict(int))
by_is_round: dict = field(default_factory=lambda: defaultdict(int))
by_same_day: dict = field(default_factory=lambda: defaultdict(int))
by_has_opreturn: dict = field(default_factory=lambda: defaultdict(int))
by_witness_size: dict = field(default_factory=lambda: defaultdict(int))
by_value_range: dict = field(default_factory=lambda: defaultdict(int))
by_both_round: dict = field(default_factory=lambda: defaultdict(int))
by_bin: dict = field(default_factory=lambda: defaultdict(int))
by_decade: dict = field(default_factory=lambda: defaultdict(int))
by_implied_usd_range: dict = field(default_factory=lambda: defaultdict(int))
# New: output position analysis (for 2-output txs only)
by_output_index: dict = field(default_factory=lambda: defaultdict(int))
by_is_smaller_output: dict = field(default_factory=lambda: defaultdict(int))
by_round_pattern: dict = field(default_factory=lambda: defaultdict(int)) # "only_this", "only_other", "both", "neither"
by_value_ratio: dict = field(default_factory=lambda: defaultdict(int)) # ratio of this output to total (for 2-out)
by_error_pct: dict = field(default_factory=lambda: defaultdict(int)) # how close to actual price
by_tx_total_value: dict = field(default_factory=lambda: defaultdict(int)) # total output value of tx
by_round_usd_10pct: dict = field(default_factory=lambda: defaultdict(int)) # 10% tolerance (price-based)
by_round_usd_5pct: dict = field(default_factory=lambda: defaultdict(int)) # 5% tolerance (price-based)
by_round_usd_2pct: dict = field(default_factory=lambda: defaultdict(int)) # 2% tolerance (price-based)
by_round_usd_1pct: dict = field(default_factory=lambda: defaultdict(int)) # 1% tolerance (price-based)
# Phase-based round USD (NO PRICE NEEDED) at different tolerances
by_phase_usd_1pct: dict = field(default_factory=lambda: defaultdict(int)) # 1% = ±2 bins
by_phase_usd_2pct: dict = field(default_factory=lambda: defaultdict(int)) # 2% = ±4 bins
by_phase_usd_5pct: dict = field(default_factory=lambda: defaultdict(int)) # 5% = ±10 bins
by_phase_usd_10pct: dict = field(default_factory=lambda: defaultdict(int)) # 10% = ±20 bins
by_tx_pattern: dict = field(default_factory=lambda: defaultdict(int)) # input->output pattern (1-to-2, many-to-1, etc)
by_value_similarity: dict = field(default_factory=lambda: defaultdict(int)) # how similar are 2-out values (for detecting splits)
by_is_micro_round: dict = field(default_factory=lambda: defaultdict(int)) # very specific round sat amounts (UTXOracle style)
def record(self, output_count, input_count, output_type, is_round,
same_day, has_opreturn, witness_size, sats, both_round, bin_idx,
btc_price, decade, output_index=None, is_smaller=None, round_pattern=None, value_ratio=None, error_pct=None, tx_total_sats=None,
round_usd_10pct=None, round_usd_5pct=None, round_usd_2pct=None, round_usd_1pct=None, tx_pattern=None, value_similarity=None, is_micro_round=None,
phase_usd_1pct=None, phase_usd_2pct=None, phase_usd_5pct=None, phase_usd_10pct=None):
self.total += 1
self.by_output_count[min(output_count, 5)] += 1
self.by_input_count[min(input_count, 5)] += 1
self.by_output_type[output_type] += 1
self.by_is_round[is_round] += 1
self.by_same_day[same_day] += 1
self.by_has_opreturn[has_opreturn] += 1
self.by_both_round[both_round] += 1
if bin_idx is not None:
self.by_bin[bin_idx] += 1
# Track decade (power of 10 of sats)
self.by_decade[decade] += 1
# Track output position (for 2-output txs)
if output_index is not None:
self.by_output_index[output_index] += 1
if is_smaller is not None:
self.by_is_smaller_output[is_smaller] += 1
if round_pattern is not None:
self.by_round_pattern[round_pattern] += 1
if value_ratio is not None:
self.by_value_ratio[value_ratio] += 1
if error_pct is not None:
if error_pct < 5:
self.by_error_pct["<5%"] += 1
elif error_pct < 10:
self.by_error_pct["5-10%"] += 1
elif error_pct < 15:
self.by_error_pct["10-15%"] += 1
elif error_pct < 25:
self.by_error_pct["15-25%"] += 1
elif error_pct < 50:
self.by_error_pct["25-50%"] += 1
else:
self.by_error_pct["50%+"] += 1
if tx_total_sats is not None and tx_total_sats > 0:
# Bucket by tx total value (in BTC terms)
if tx_total_sats < 1_000_000: # < 0.01 BTC
self.by_tx_total_value["<0.01 BTC"] += 1
elif tx_total_sats < 10_000_000: # < 0.1 BTC
self.by_tx_total_value["0.01-0.1 BTC"] += 1
elif tx_total_sats < 100_000_000: # < 1 BTC
self.by_tx_total_value["0.1-1 BTC"] += 1
elif tx_total_sats < 1_000_000_000: # < 10 BTC
self.by_tx_total_value["1-10 BTC"] += 1
else:
self.by_tx_total_value["10+ BTC"] += 1
# Track round USD at different tolerances
if round_usd_10pct is not None:
self.by_round_usd_10pct[round_usd_10pct] += 1
if round_usd_5pct is not None:
self.by_round_usd_5pct[round_usd_5pct] += 1
if round_usd_2pct is not None:
self.by_round_usd_2pct[round_usd_2pct] += 1
if round_usd_1pct is not None:
self.by_round_usd_1pct[round_usd_1pct] += 1
# Track transaction pattern (input-to-output pattern)
if tx_pattern is not None:
self.by_tx_pattern[tx_pattern] += 1
# Track value similarity (for 2-output txs)
if value_similarity is not None:
self.by_value_similarity[value_similarity] += 1
# Track micro-round sats
if is_micro_round is not None:
self.by_is_micro_round[is_micro_round] += 1
# Track phase-based round USD (no price needed)
if phase_usd_1pct is not None:
self.by_phase_usd_1pct[phase_usd_1pct] += 1
if phase_usd_2pct is not None:
self.by_phase_usd_2pct[phase_usd_2pct] += 1
if phase_usd_5pct is not None:
self.by_phase_usd_5pct[phase_usd_5pct] += 1
if phase_usd_10pct is not None:
self.by_phase_usd_10pct[phase_usd_10pct] += 1
# Track implied USD value (sats * btc_price / 1e8)
implied_usd = sats * btc_price / 100_000_000
if implied_usd < 1:
self.by_implied_usd_range["<$1"] += 1
elif implied_usd < 10:
self.by_implied_usd_range["$1-$10"] += 1
elif implied_usd < 100:
self.by_implied_usd_range["$10-$100"] += 1
elif implied_usd < 1000:
self.by_implied_usd_range["$100-$1k"] += 1
elif implied_usd < 10000:
self.by_implied_usd_range["$1k-$10k"] += 1
else:
self.by_implied_usd_range["$10k+"] += 1
# Witness size buckets
if witness_size == 0:
self.by_witness_size["0"] += 1
elif witness_size < 500:
self.by_witness_size["1-499"] += 1
elif witness_size < 1000:
self.by_witness_size["500-999"] += 1
elif witness_size < 2500:
self.by_witness_size["1000-2499"] += 1
else:
self.by_witness_size["2500+"] += 1
# Value ranges (sats)
if sats < 10000:
self.by_value_range["<10k"] += 1
elif sats < 100000:
self.by_value_range["10k-100k"] += 1
elif sats < 1000000:
self.by_value_range["100k-1M"] += 1
elif sats < 10000000:
self.by_value_range["1M-10M"] += 1
elif sats < 100000000:
self.by_value_range["10M-100M"] += 1
else:
self.by_value_range["100M+"] += 1
def print_stats(stats: Stats, label: str, log=print):
"""Print statistics with percentages."""
log(f"\n{'='*50}")
log(f"{label} (n={stats.total:,})")
log('='*50)
if stats.total == 0:
log("No data")
return
def pct(d):
return {k: f"{v:,} ({100*v/stats.total:.1f}%)" for k, v in sorted(d.items())}
log(f"\nOutput count: {pct(stats.by_output_count)}")
log(f"Input count: {pct(stats.by_input_count)}")
log(f"Output type: {pct(stats.by_output_type)}")
log(f"Is round BTC: {pct(stats.by_is_round)}")
log(f"Both outputs round: {pct(stats.by_both_round)}")
log(f"Same-day spend: {pct(stats.by_same_day)}")
log(f"Has OP_RETURN: {pct(stats.by_has_opreturn)}")
log(f"Witness size: {pct(stats.by_witness_size)}")
log(f"Value range (sats): {pct(stats.by_value_range)}")
log(f"Decade (10^N sats): {pct(stats.by_decade)}")
log(f"Implied USD value: {pct(stats.by_implied_usd_range)}")
if stats.by_output_index:
log(f"Output index (2-out only): {pct(stats.by_output_index)}")
if stats.by_is_smaller_output:
log(f"Is smaller output (2-out): {pct(stats.by_is_smaller_output)}")
if stats.by_round_pattern:
log(f"Round pattern (2-out): {pct(stats.by_round_pattern)}")
if stats.by_value_ratio:
log(f"Value ratio (2-out): {pct(stats.by_value_ratio)}")
if stats.by_error_pct:
log(f"Error from actual price: {pct(stats.by_error_pct)}")
if stats.by_tx_total_value:
log(f"Tx total value: {pct(stats.by_tx_total_value)}")
if stats.by_round_usd_10pct:
log(f"Round USD (10% tol): {pct(stats.by_round_usd_10pct)}")
if stats.by_round_usd_5pct:
log(f"Round USD (5% tol): {pct(stats.by_round_usd_5pct)}")
if stats.by_round_usd_2pct:
log(f"Round USD (2% tol): {pct(stats.by_round_usd_2pct)}")
if stats.by_round_usd_1pct:
log(f"Round USD (1% tol): {pct(stats.by_round_usd_1pct)}")
if stats.by_tx_pattern:
log(f"Tx pattern: {pct(stats.by_tx_pattern)}")
if stats.by_value_similarity:
log(f"Value similarity (2-out): {pct(stats.by_value_similarity)}")
if stats.by_is_micro_round:
log(f"Micro-round sats: {pct(stats.by_is_micro_round)}")
if stats.by_phase_usd_1pct:
log(f"Phase USD (1% tol): {pct(stats.by_phase_usd_1pct)}")
if stats.by_phase_usd_2pct:
log(f"Phase USD (2% tol): {pct(stats.by_phase_usd_2pct)}")
if stats.by_phase_usd_5pct:
log(f"Phase USD (5% tol): {pct(stats.by_phase_usd_5pct)}")
if stats.by_phase_usd_10pct:
log(f"Phase USD (10% tol): {pct(stats.by_phase_usd_10pct)}")
# Top bins
log(f"\nTop 10 bins:")
for bin_idx, count in sorted(stats.by_bin.items(), key=lambda x: -x[1])[:10]:
log(f" Bin {bin_idx}: {count:,} ({100*count/stats.total:.1f}%)")
def analyze_block_range(start_height: int, end_height: int, start_dateindex: int, end_dateindex: int):
"""Analyze all outputs in a block range using daily OHLC prices."""
print(f"\nFetching data for heights {start_height}-{end_height}...")
# Fetch daily OHLC prices for this date range (external price data)
print("Fetching daily OHLC prices...")
ohlc_data = fetch(f"/api/metric/price_ohlc/dateindex?start={start_dateindex}&end={end_dateindex}")["data"]
# OHLC format: [open, high, low, close] in dollars
# Store low, high, and mid price for each day (transactions happen throughout the day)
daily_prices = {} # dateindex -> (low, high, mid)
for i, ohlc in enumerate(ohlc_data):
if ohlc and len(ohlc) >= 4:
open_p, high, low, close = ohlc[0], ohlc[1], ohlc[2], ohlc[3]
mid = (open_p + close) / 2 # average of open/close, not low/high (avoid wick skew)
daily_prices[start_dateindex + i] = (low, high, mid)
all_lows = [p[0] for p in daily_prices.values()]
all_highs = [p[1] for p in daily_prices.values()]
print(f" Got prices for {len(daily_prices)} days (${min(all_lows):.0f} - ${max(all_highs):.0f})")
# Precompute bin classifiers for each unique price (cache for speed)
bin_classifier_cache = {}
def get_bin_classifier(low: float, high: float) -> dict:
"""Build classifier that checks if bin falls within daily low-high range."""
# Cache key is the rounded range
cache_key = (round(low / 10) * 10, round(high / 10) * 10)
if cache_key not in bin_classifier_cache:
bin_classifier_cache[cache_key] = build_bin_classifier_range(low, high)
return bin_classifier_cache[cache_key]
# Get transaction ranges
first_tx = fetch(f"/api/metric/first_txindex/height?start={start_height}&end={end_height+1}")
first_txs = first_tx["data"]
tx_start = first_txs[0]
tx_end = first_txs[-1] if len(first_txs) > 1 else tx_start + 10000
print(f"Transaction range: {tx_start}-{tx_end} ({tx_end-tx_start:,} txs)")
# Get transaction metadata (chunked for large ranges)
print("Fetching transaction data...")
tx_first_out = fetch_chunked("/api/metric/first_txoutindex/txindex?start={start}&end={end}", tx_start, tx_end)
tx_first_in = fetch_chunked("/api/metric/first_txinindex/txindex?start={start}&end={end}", tx_start, tx_end)
tx_base_size = fetch_chunked("/api/metric/base_size/txindex?start={start}&end={end}", tx_start, tx_end)
tx_total_size = fetch_chunked("/api/metric/total_size/txindex?start={start}&end={end}", tx_start, tx_end)
tx_output_count = fetch_chunked("/api/metric/output_count/txindex?start={start}&end={end}", tx_start, tx_end)
tx_input_count = fetch_chunked("/api/metric/input_count/txindex?start={start}&end={end}", tx_start, tx_end)
tx_height = fetch_chunked("/api/metric/height/txindex?start={start}&end={end}", tx_start, tx_end)
# Get output data
out_start = tx_first_out[0] if tx_first_out else 0
# Estimate out_end based on last tx's output count
last_tx_outputs = tx_output_count[-1] if tx_output_count else 10
out_end = tx_first_out[-1] + last_tx_outputs + 1 if tx_first_out else out_start + 10000
print(f"Output range: {out_start}-{out_end} ({out_end-out_start:,} outputs)")
print("Fetching output data...")
out_value = fetch_chunked("/api/metric/value/txoutindex?start={start}&end={end}", out_start, out_end)
out_type = fetch_chunked("/api/metric/outputtype/txoutindex?start={start}&end={end}", out_start, out_end)
# Get input data for same-day check
in_start = tx_first_in[0] if tx_first_in else 0
last_tx_inputs = tx_input_count[-1] if tx_input_count else 10
in_end = tx_first_in[-1] + last_tx_inputs + 1 if tx_first_in else in_start + 10000
print(f"Input range: {in_start}-{in_end} ({in_end-in_start:,} inputs)")
print("Fetching input data...")
# Get spent txoutindex for each input
in_spent_txoutindex = fetch_chunked("/api/metric/txoutindex/txinindex?start={start}&end={end}", in_start, in_end)
# For same-day spend detection, only check outputs created within our block range
# (outputs from before can't be same-day by definition)
# We'll use the output ranges we already have (out_start to out_end)
# Get height to dateindex for same-day check
height_dateindex = fetch_chunked("/api/metric/dateindex/height?start={start}&end={end}", start_height, end_height+1)
# Analyze each transaction
# Categories: accurate (<=15%), close (15-30%), wrong_decade, noise
stats_accurate = Stats()
stats_close = Stats()
stats_wrong_decade = Stats()
stats_noise = Stats()
num_txs = len(tx_first_out) - 1
print(f"Analyzing {num_txs:,} transactions...")
for i in range(num_txs):
if i % 100000 == 0 and i > 0:
print(f" Progress: {i:,}/{num_txs:,} ({100*i/num_txs:.0f}%)")
txindex = tx_start + i
out_count = tx_output_count[i] if i < len(tx_output_count) else 0
in_count = tx_input_count[i] if i < len(tx_input_count) else 0
base_size = tx_base_size[i] if i < len(tx_base_size) else 0
total_size = tx_total_size[i] if i < len(tx_total_size) else 0
witness_size = (total_size or 0) - (base_size or 0)
first_out = tx_first_out[i] - out_start
next_first_out = tx_first_out[i + 1] - out_start if i + 1 < len(tx_first_out) else first_out + out_count
first_in = tx_first_in[i] - in_start if i < len(tx_first_in) else 0
# Check for OP_RETURN
has_opreturn = False
output_types = []
output_values = []
for j in range(first_out, min(next_first_out, len(out_type))):
ot = out_type[j] if j < len(out_type) else "unknown"
output_types.append(ot)
if ot and ot.lower() == "opreturn":
has_opreturn = True
if j < len(out_value):
output_values.append(out_value[j])
# Get the daily price range for this transaction
tx_height_val = tx_height[i] if i < len(tx_height) else None
tx_dateindex_val = None
btc_price = None # mid price for round USD calculations
btc_low = None
btc_high = None
bin_classifier = None
if tx_height_val is not None:
tx_di_idx = tx_height_val - start_height
tx_dateindex_val = height_dateindex[tx_di_idx] if 0 <= tx_di_idx < len(height_dateindex) else None
if tx_dateindex_val is not None and tx_dateindex_val in daily_prices:
btc_low, btc_high, btc_price = daily_prices[tx_dateindex_val]
bin_classifier = get_bin_classifier(btc_low, btc_high)
# Skip if no price data for this day
if btc_price is None or bin_classifier is None:
continue
# Check same-day spend (was the spent output created within our analysis range?)
# Only outputs created within our range (out_start to out_end) can be same-day
same_day = False
if tx_dateindex_val is not None and in_count and in_count > 0:
for k in range(in_count):
in_idx = first_in + k
if 0 <= in_idx < len(in_spent_txoutindex):
spent_txoutindex = in_spent_txoutindex[in_idx]
if spent_txoutindex and spent_txoutindex < 2**62:
# Check if spent output is within our current range
if out_start <= spent_txoutindex < out_end:
# Binary search for the tx that contains this output
ti = bisect.bisect_right(tx_first_out, spent_txoutindex) - 1
if 0 <= ti < len(tx_height):
spent_tx_height = tx_height[ti]
if spent_tx_height:
spent_di_idx = spent_tx_height - start_height
if 0 <= spent_di_idx < len(height_dateindex):
if height_dateindex[spent_di_idx] == tx_dateindex_val:
same_day = True
break
# Compute transaction total output value
tx_total_sats = sum(v for v in output_values if v and v > 0)
# Check if both outputs are round (for 2-output txs)
both_round = False
if out_count == 2 and len(output_values) >= 2:
both_round = is_round_btc(output_values[0]) and is_round_btc(output_values[1])
# Precompute round status for each output (for 2-output analysis)
output_rounds = [is_round_btc(v) if v else False for v in output_values]
# Precompute value similarity for 2-output txs (same for both outputs)
value_similarity = None
if out_count == 2 and len(output_values) == 2:
v1, v2 = output_values[0] or 0, output_values[1] or 0
if v1 > 0 and v2 > 0:
similarity = min(v1, v2) / max(v1, v2)
if similarity > 0.95:
value_similarity = "nearly_equal"
elif similarity > 0.8:
value_similarity = "similar"
elif similarity > 0.5:
value_similarity = "moderate"
elif similarity > 0.2:
value_similarity = "different"
else:
value_similarity = "very_different"
# Analyze each output
for j, sats in enumerate(output_values):
if sats is None or sats < 1000:
continue
ot = output_types[j] if j < len(output_types) else "unknown"
is_round = output_rounds[j] # Use precomputed value
bin_idx, decade = get_phase_bin_and_decade(sats)
# Compute error: what price does this output imply vs actual daily range?
# Error is 0 if implied price falls within [low, high], otherwise distance to nearest edge
error_pct = None
if bin_idx is not None:
implied_price = bin_to_price(bin_idx, btc_price)
if implied_price < btc_low:
error_pct = 100 * (btc_low - implied_price) / btc_low
elif implied_price > btc_high:
error_pct = 100 * (implied_price - btc_high) / btc_high
else:
error_pct = 0 # within daily range
# Use precomputed bin classifier for speed
category = classify_accuracy_fast(bin_idx, bin_classifier)
if category == "accurate":
stats = stats_accurate
elif category == "close":
stats = stats_close
elif category == "wrong_decade":
stats = stats_wrong_decade
else:
stats = stats_noise
# Compute 2-output specific metrics
output_index = None
is_smaller = None
round_pattern = None
value_ratio = None
if out_count == 2 and len(output_values) == 2:
output_index = j # 0 or 1
other_idx = 1 - j
other_sats = output_values[other_idx] or 0
is_smaller = sats < other_sats
# Round pattern: which outputs are round?
this_round = output_rounds[j]
other_round = output_rounds[other_idx]
if this_round and other_round:
round_pattern = "both_round"
elif this_round and not other_round:
round_pattern = "only_this_round"
elif not this_round and other_round:
round_pattern = "only_other_round"
else:
round_pattern = "neither_round"
# Value ratio: what fraction of total is this output?
total_sats = sats + other_sats
if total_sats > 0:
ratio = sats / total_sats
if ratio < 0.1:
value_ratio = "<10%"
elif ratio < 0.3:
value_ratio = "10-30%"
elif ratio < 0.5:
value_ratio = "30-50%"
elif ratio < 0.7:
value_ratio = "50-70%"
elif ratio < 0.9:
value_ratio = "70-90%"
else:
value_ratio = ">90%"
# Compute round USD at different tolerances (price-based, using daily range)
round_usd_10pct = is_round_usd(sats, btc_low, btc_high, tolerance=0.10)
round_usd_5pct = is_round_usd(sats, btc_low, btc_high, tolerance=0.05)
round_usd_2pct = is_round_usd(sats, btc_low, btc_high, tolerance=0.02)
round_usd_1pct = is_round_usd(sats, btc_low, btc_high, tolerance=0.01)
# Compute phase-based round USD (NO PRICE NEEDED!)
# 200 bins/decade: 1%=±2bins, 2%=±4bins, 5%=±10bins, 10%=±20bins
phase_usd_1pct = is_round_usd_phase(sats, tolerance_bins=2)
phase_usd_2pct = is_round_usd_phase(sats, tolerance_bins=4)
phase_usd_5pct = is_round_usd_phase(sats, tolerance_bins=10)
phase_usd_10pct = is_round_usd_phase(sats, tolerance_bins=20)
tx_pattern = get_tx_pattern(in_count, out_count)
micro_round = is_micro_round_sats(sats)
stats.record(
output_count=out_count,
input_count=in_count,
output_type=ot,
is_round=is_round,
same_day=same_day,
has_opreturn=has_opreturn,
witness_size=witness_size,
sats=sats,
both_round=both_round,
bin_idx=bin_idx,
btc_price=btc_price,
decade=decade,
output_index=output_index,
is_smaller=is_smaller,
round_pattern=round_pattern,
value_ratio=value_ratio,
error_pct=error_pct,
tx_total_sats=tx_total_sats,
round_usd_10pct=round_usd_10pct,
round_usd_5pct=round_usd_5pct,
round_usd_2pct=round_usd_2pct,
round_usd_1pct=round_usd_1pct,
tx_pattern=tx_pattern,
value_similarity=value_similarity,
is_micro_round=micro_round,
phase_usd_1pct=phase_usd_1pct,
phase_usd_2pct=phase_usd_2pct,
phase_usd_5pct=phase_usd_5pct,
phase_usd_10pct=phase_usd_10pct
)
return stats_accurate, stats_close, stats_wrong_decade, stats_noise
def main():
# Open report file
report_file = open("research/price_signal_analysis_report.txt", "w")
def log(msg=""):
print(msg)
report_file.write(msg + "\n")
report_file.flush()
log("=" * 60)
log("PHASE ORACLE SIGNAL ANALYSIS")
log("=" * 60)
# Cache dates lookup (same for all months)
log("Fetching date index...")
dates = fetch("/api/metric/date/dateindex?start=0&end=4000")["data"]
# Analyze all months
for year in [2017, 2018]:
for month in range(1, 13): # All months
key = (year, month)
if key not in MONTHLY_PRICES:
continue
log(f"\n\n{'#'*60}")
log(f"# {year}-{month:02d}")
log('#'*60)
# Get heights and dateindexes for this month
try:
start_di = None
end_di = None
for i, d in enumerate(dates):
if d and d.startswith(f"{year}-{month:02d}"):
if start_di is None:
start_di = i
end_di = i + 1 # Keep updating to find last day of month
if start_di is None:
log(f"Could not find date index for {year}-{month:02d}")
continue
# Get all heights for the month
heights = fetch(f"/api/metric/first_height/dateindex?start={start_di}&end={end_di+1}")["data"]
start_height = heights[0]
end_height = heights[-1] if len(heights) > 1 else start_height + 1000
log(f"Date range: {dates[start_di]} to {dates[end_di-1]} (dateindex {start_di}-{end_di})")
# Analyze entire month with daily prices
accurate, close, wrong_decade, noise = analyze_block_range(start_height, end_height, start_di, end_di)
total_outputs = accurate.total + close.total + wrong_decade.total + noise.total
log(f"\n--- SUMMARY ---")
log(f"Total outputs analyzed: {total_outputs:,}")
log(f" Accurate (≤15% error): {accurate.total:,} ({100*accurate.total/total_outputs:.1f}%)")
log(f" Close (15-30% error): {close.total:,} ({100*close.total/total_outputs:.1f}%)")
log(f" Wrong decade: {wrong_decade.total:,} ({100*wrong_decade.total/total_outputs:.1f}%)")
log(f" Noise: {noise.total:,} ({100*noise.total/total_outputs:.1f}%)")
print_stats(accurate, "ACCURATE (within 15% of actual price)", log)
print_stats(noise, "NOISE (no decade matches)", log)
# Print ratio comparison
if accurate.total > 0 and noise.total > 0:
# Sanity check: show True/False split for key boolean fields
log(f"\n--- ROUND USD BREAKDOWN ---")
for name, acc_d, noise_d in [
("Round USD 10%", accurate.by_round_usd_10pct, noise.by_round_usd_10pct),
("Round USD 5%", accurate.by_round_usd_5pct, noise.by_round_usd_5pct),
("Round USD 2%", accurate.by_round_usd_2pct, noise.by_round_usd_2pct),
("Round USD 1%", accurate.by_round_usd_1pct, noise.by_round_usd_1pct),
]:
acc_true_pct = 100 * acc_d.get(True, 0) / accurate.total
acc_false_pct = 100 * acc_d.get(False, 0) / accurate.total
noise_true_pct = 100 * noise_d.get(True, 0) / noise.total
noise_false_pct = 100 * noise_d.get(False, 0) / noise.total
log(f"{name}: accurate True={acc_true_pct:.1f}% False={acc_false_pct:.1f}% | noise True={noise_true_pct:.1f}% False={noise_false_pct:.1f}%")
log(f"\n{'='*50}")
log("KEY DIFFERENCES (accurate vs noise):")
log('='*50)
def compare(name, acc_dict, noise_dict):
log(f"\n{name}:")
all_keys = set(acc_dict.keys()) | set(noise_dict.keys())
for k in sorted(all_keys):
acc_pct = 100 * acc_dict.get(k, 0) / accurate.total
noise_pct = 100 * noise_dict.get(k, 0) / noise.total
diff = acc_pct - noise_pct
if abs(diff) > 2: # Only show significant differences
log(f" {k}: {acc_pct:.1f}% vs {noise_pct:.1f}% (diff: {diff:+.1f}%)")
compare("Output count", accurate.by_output_count, noise.by_output_count)
compare("Input count", accurate.by_input_count, noise.by_input_count)
compare("Output type", accurate.by_output_type, noise.by_output_type)
compare("Is round BTC", accurate.by_is_round, noise.by_is_round)
compare("Both round", accurate.by_both_round, noise.by_both_round)
compare("Same-day spend", accurate.by_same_day, noise.by_same_day)
compare("OP_RETURN", accurate.by_has_opreturn, noise.by_has_opreturn)
compare("Witness size", accurate.by_witness_size, noise.by_witness_size)
compare("Value range (sats)", accurate.by_value_range, noise.by_value_range)
compare("Decade (10^N sats)", accurate.by_decade, noise.by_decade)
compare("Implied USD", accurate.by_implied_usd_range, noise.by_implied_usd_range)
compare("Output index (2-out)", accurate.by_output_index, noise.by_output_index)
compare("Is smaller (2-out)", accurate.by_is_smaller_output, noise.by_is_smaller_output)
compare("Round pattern (2-out)", accurate.by_round_pattern, noise.by_round_pattern)
compare("Value ratio (2-out)", accurate.by_value_ratio, noise.by_value_ratio)
compare("Error from price", accurate.by_error_pct, noise.by_error_pct)
compare("Tx total value", accurate.by_tx_total_value, noise.by_tx_total_value)
compare("Round USD (10%)", accurate.by_round_usd_10pct, noise.by_round_usd_10pct)
compare("Round USD (5%)", accurate.by_round_usd_5pct, noise.by_round_usd_5pct)
compare("Round USD (2%)", accurate.by_round_usd_2pct, noise.by_round_usd_2pct)
compare("Round USD (1%)", accurate.by_round_usd_1pct, noise.by_round_usd_1pct)
compare("Phase USD (1%)", accurate.by_phase_usd_1pct, noise.by_phase_usd_1pct)
compare("Phase USD (2%)", accurate.by_phase_usd_2pct, noise.by_phase_usd_2pct)
compare("Phase USD (5%)", accurate.by_phase_usd_5pct, noise.by_phase_usd_5pct)
compare("Phase USD (10%)", accurate.by_phase_usd_10pct, noise.by_phase_usd_10pct)
compare("Tx pattern", accurate.by_tx_pattern, noise.by_tx_pattern)
compare("Value similarity (2-out)", accurate.by_value_similarity, noise.by_value_similarity)
compare("Micro-round sats", accurate.by_is_micro_round, noise.by_is_micro_round)
# EXCLUSION RECOMMENDATIONS
log(f"\n{'='*50}")
log("EXCLUSION CANDIDATES (overrepresented in noise):")
log('='*50)
log("Characteristics where noise% > accurate% suggest exclusion filters:\n")
def find_exclusions(name, acc_dict, noise_dict, threshold=3.0):
"""Find characteristics overrepresented in noise (candidates for exclusion)."""
exclusions = []
for k in set(acc_dict.keys()) | set(noise_dict.keys()):
acc_pct = 100 * acc_dict.get(k, 0) / accurate.total
noise_pct = 100 * noise_dict.get(k, 0) / noise.total
diff = noise_pct - acc_pct # positive = more in noise
if diff > threshold and noise_pct > 1: # at least 1% of noise
exclusions.append((k, acc_pct, noise_pct, diff))
return sorted(exclusions, key=lambda x: -x[3]) # sort by diff descending
all_exclusions = []
for name, acc_d, noise_d in [
("Value range", accurate.by_value_range, noise.by_value_range),
("Implied USD", accurate.by_implied_usd_range, noise.by_implied_usd_range),
("Decade", accurate.by_decade, noise.by_decade),
("Output count", accurate.by_output_count, noise.by_output_count),
("Is round BTC", accurate.by_is_round, noise.by_is_round),
("Both round", accurate.by_both_round, noise.by_both_round),
("Tx pattern", accurate.by_tx_pattern, noise.by_tx_pattern),
("Value similarity", accurate.by_value_similarity, noise.by_value_similarity),
("Value ratio", accurate.by_value_ratio, noise.by_value_ratio),
("Round USD 10%", accurate.by_round_usd_10pct, noise.by_round_usd_10pct),
("Round USD 5%", accurate.by_round_usd_5pct, noise.by_round_usd_5pct),
("Round USD 2%", accurate.by_round_usd_2pct, noise.by_round_usd_2pct),
("Round USD 1%", accurate.by_round_usd_1pct, noise.by_round_usd_1pct),
("Phase USD 10%", accurate.by_phase_usd_10pct, noise.by_phase_usd_10pct),
("Phase USD 5%", accurate.by_phase_usd_5pct, noise.by_phase_usd_5pct),
("Phase USD 2%", accurate.by_phase_usd_2pct, noise.by_phase_usd_2pct),
("Phase USD 1%", accurate.by_phase_usd_1pct, noise.by_phase_usd_1pct),
("Tx total value", accurate.by_tx_total_value, noise.by_tx_total_value),
("Micro-round sats", accurate.by_is_micro_round, noise.by_is_micro_round),
]:
excl = find_exclusions(name, acc_d, noise_d)
for k, acc_pct, noise_pct, diff in excl:
all_exclusions.append((name, k, acc_pct, noise_pct, diff))
# Sort by impact (diff) and print
all_exclusions.sort(key=lambda x: -x[4])
for name, k, acc_pct, noise_pct, diff in all_exclusions[:15]:
log(f" EXCLUDE {name}={k}: noise {noise_pct:.1f}% vs accurate {acc_pct:.1f}% (+{diff:.1f}%)")
# Also show INCLUSION candidates (overrepresented in accurate)
log(f"\n{'='*50}")
log("INCLUSION SIGNALS (overrepresented in accurate):")
log('='*50)
log("Characteristics where accurate% > noise% are good signals:\n")
all_inclusions = []
for name, acc_d, noise_d in [
("Value range", accurate.by_value_range, noise.by_value_range),
("Implied USD", accurate.by_implied_usd_range, noise.by_implied_usd_range),
("Decade", accurate.by_decade, noise.by_decade),
("Output count", accurate.by_output_count, noise.by_output_count),
("Is round BTC", accurate.by_is_round, noise.by_is_round),
("Is smaller (2-out)", accurate.by_is_smaller_output, noise.by_is_smaller_output),
("Tx pattern", accurate.by_tx_pattern, noise.by_tx_pattern),
("Value similarity", accurate.by_value_similarity, noise.by_value_similarity),
("Value ratio", accurate.by_value_ratio, noise.by_value_ratio),
("Round USD 10%", accurate.by_round_usd_10pct, noise.by_round_usd_10pct),
("Round USD 5%", accurate.by_round_usd_5pct, noise.by_round_usd_5pct),
("Round USD 2%", accurate.by_round_usd_2pct, noise.by_round_usd_2pct),
("Round USD 1%", accurate.by_round_usd_1pct, noise.by_round_usd_1pct),
("Phase USD 10%", accurate.by_phase_usd_10pct, noise.by_phase_usd_10pct),
("Phase USD 5%", accurate.by_phase_usd_5pct, noise.by_phase_usd_5pct),
("Phase USD 2%", accurate.by_phase_usd_2pct, noise.by_phase_usd_2pct),
("Phase USD 1%", accurate.by_phase_usd_1pct, noise.by_phase_usd_1pct),
]:
for k in set(acc_d.keys()) | set(noise_d.keys()):
acc_pct = 100 * acc_d.get(k, 0) / accurate.total
noise_pct = 100 * noise_d.get(k, 0) / noise.total
diff = acc_pct - noise_pct # positive = more in accurate
if diff > 3.0 and acc_pct > 1:
all_inclusions.append((name, k, acc_pct, noise_pct, diff))
all_inclusions.sort(key=lambda x: -x[4])
for name, k, acc_pct, noise_pct, diff in all_inclusions[:15]:
log(f" KEEP {name}={k}: accurate {acc_pct:.1f}% vs noise {noise_pct:.1f}% (+{diff:.1f}%)")
except Exception as e:
log(f"Error: {e}")
import traceback
traceback.print_exc()
traceback.print_exc(file=report_file)
report_file.close()
print(f"\nReport saved to: research/price_signal_analysis_report.txt")
if __name__ == "__main__":
main()