mirror of
https://github.com/bitcoinresearchkit/brk.git
synced 2026-04-25 15:19:58 -07:00
457 lines
14 KiB
Rust
457 lines
14 KiB
Rust
//! Sweep round-value tolerance to find optimal rounding threshold.
|
||
//!
|
||
//! Tests different tolerance percentages (0%, 0.01%, 0.1%, 1%, etc.) for
|
||
//! detecting round BTC amounts, combined with several digit filter masks.
|
||
//!
|
||
//! Phase 1: single pass over indexer, store per-output relative errors.
|
||
//! Phase 2: sweep tolerance × mask combos across CPU cores.
|
||
//!
|
||
//! Run with: cargo run -p brk_oracle --example sweep_tolerance --release
|
||
|
||
use std::path::PathBuf;
|
||
use std::time::Instant;
|
||
|
||
use brk_indexer::Indexer;
|
||
use brk_oracle::{Config, NUM_BINS, Oracle, PRICES, START_HEIGHT, cents_to_bin, sats_to_bin};
|
||
use brk_types::{OutputType, Sats, TxIndex, TxOutIndex};
|
||
use vecdb::{AnyVec, ReadableVec, VecIndex};
|
||
|
||
const BINS_5PCT: f64 = 4.24;
|
||
const BINS_10PCT: f64 = 8.28;
|
||
const BINS_20PCT: f64 = 15.84;
|
||
|
||
fn bins_to_pct(bins: f64) -> f64 {
|
||
(10.0_f64.powf(bins / 200.0) - 1.0) * 100.0
|
||
}
|
||
|
||
fn seed_bin(start_height: usize) -> f64 {
|
||
let price: f64 = PRICES
|
||
.lines()
|
||
.nth(start_height - 1)
|
||
.expect("prices.txt too short")
|
||
.parse()
|
||
.expect("Failed to parse seed price");
|
||
cents_to_bin(price * 100.0)
|
||
}
|
||
|
||
fn leading_digit(sats: u64) -> u8 {
|
||
let log = (sats as f64).log10();
|
||
let magnitude = 10.0_f64.powf(log.floor());
|
||
let d = (sats as f64 / magnitude).round() as u8;
|
||
if d >= 10 { 1 } else { d }
|
||
}
|
||
|
||
/// Returns the relative error of `sats` from its nearest round value (d × 10^n).
|
||
/// e.g. 10_050 → leading=1, round_val=10_000, rel_err = 50/10000 = 0.005
|
||
fn relative_roundness(sats: u64) -> f64 {
|
||
let log = (sats as f64).log10();
|
||
let magnitude = 10.0_f64.powf(log.floor());
|
||
let leading = (sats as f64 / magnitude).round();
|
||
let round_val = leading * magnitude;
|
||
(sats as f64 - round_val).abs() / round_val
|
||
}
|
||
|
||
struct Stats {
|
||
total_sq_err: f64,
|
||
total_bias: f64,
|
||
max_err: f64,
|
||
total_blocks: u64,
|
||
gt_5pct: u64,
|
||
gt_10pct: u64,
|
||
gt_20pct: u64,
|
||
}
|
||
|
||
impl Stats {
|
||
fn new() -> Self {
|
||
Self {
|
||
total_sq_err: 0.0,
|
||
total_bias: 0.0,
|
||
max_err: 0.0,
|
||
total_blocks: 0,
|
||
gt_5pct: 0,
|
||
gt_10pct: 0,
|
||
gt_20pct: 0,
|
||
}
|
||
}
|
||
|
||
fn update(&mut self, err: f64) {
|
||
self.total_sq_err += err * err;
|
||
self.total_bias += err;
|
||
self.total_blocks += 1;
|
||
let abs_err = err.abs();
|
||
if abs_err > self.max_err {
|
||
self.max_err = abs_err;
|
||
}
|
||
if abs_err > BINS_5PCT {
|
||
self.gt_5pct += 1;
|
||
}
|
||
if abs_err > BINS_10PCT {
|
||
self.gt_10pct += 1;
|
||
}
|
||
if abs_err > BINS_20PCT {
|
||
self.gt_20pct += 1;
|
||
}
|
||
}
|
||
|
||
fn rmse_pct(&self) -> f64 {
|
||
bins_to_pct((self.total_sq_err / self.total_blocks as f64).sqrt())
|
||
}
|
||
|
||
fn max_pct(&self) -> f64 {
|
||
bins_to_pct(self.max_err)
|
||
}
|
||
|
||
fn bias(&self) -> f64 {
|
||
self.total_bias / self.total_blocks as f64
|
||
}
|
||
}
|
||
|
||
/// Per-output data: bin index, leading digit, relative error from round value.
|
||
struct RoundOutput {
|
||
bin: u16,
|
||
digit: u8,
|
||
rel_err: f32, // f32 is plenty of precision, saves memory
|
||
}
|
||
|
||
struct BlockData {
|
||
full_hist: Box<[u32; NUM_BINS]>,
|
||
round_outputs: Vec<RoundOutput>,
|
||
high_bin: f64,
|
||
low_bin: f64,
|
||
}
|
||
|
||
fn main() {
|
||
let t0 = Instant::now();
|
||
|
||
let data_dir = std::env::var("BRK_DIR")
|
||
.map(PathBuf::from)
|
||
.unwrap_or_else(|_| {
|
||
let home = std::env::var("HOME").unwrap();
|
||
PathBuf::from(home).join(".brk")
|
||
});
|
||
|
||
let indexer = Indexer::forced_import(&data_dir).expect("Failed to load indexer");
|
||
let total_heights = indexer.vecs.blocks.timestamp.len();
|
||
|
||
let manifest_dir = env!("CARGO_MANIFEST_DIR");
|
||
|
||
let height_ohlc: Vec<[f64; 4]> = serde_json::from_str(
|
||
&std::fs::read_to_string(format!("{manifest_dir}/examples/height_price_ohlc.json"))
|
||
.expect("Failed to read height_price_ohlc.json"),
|
||
)
|
||
.expect("Failed to parse height OHLC");
|
||
|
||
let height_bands: Vec<(f64, f64)> = height_ohlc
|
||
.iter()
|
||
.map(|ohlc| {
|
||
let high = ohlc[1];
|
||
let low = ohlc[2];
|
||
if high > 0.0 && low > 0.0 {
|
||
(cents_to_bin(high * 100.0), cents_to_bin(low * 100.0))
|
||
} else {
|
||
(0.0, 0.0)
|
||
}
|
||
})
|
||
.collect();
|
||
|
||
let sweep_start: usize = 575_000;
|
||
|
||
// Phase 1: precompute per-block data.
|
||
// Store all potentially-round outputs with their relative error so we can
|
||
// filter at different tolerance thresholds in Phase 2.
|
||
eprintln!("Phase 1: precomputing block data...");
|
||
|
||
let total_txs = indexer.vecs.transactions.height.len();
|
||
let total_outputs = indexer.vecs.outputs.value.len();
|
||
|
||
let first_tx_index: Vec<TxIndex> = indexer.vecs.transactions.first_tx_index.collect();
|
||
let out_first: Vec<TxOutIndex> = indexer.vecs.outputs.first_txout_index.collect();
|
||
|
||
let ref_config = Config::default();
|
||
let total_blocks = total_heights - sweep_start;
|
||
let mut blocks: Vec<BlockData> = Vec::with_capacity(total_blocks);
|
||
|
||
// Use the widest tolerance we'll test (5%) to decide what to store.
|
||
// Outputs beyond 5% relative error will never be filtered at any tolerance.
|
||
let max_tolerance: f64 = 0.05;
|
||
|
||
for h in START_HEIGHT..total_heights {
|
||
let ft = first_tx_index[h];
|
||
let next_ft = first_tx_index
|
||
.get(h + 1)
|
||
.copied()
|
||
.unwrap_or(TxIndex::from(total_txs));
|
||
|
||
let out_start = if ft.to_usize() + 1 < next_ft.to_usize() {
|
||
indexer
|
||
.vecs
|
||
.transactions
|
||
.first_txout_index
|
||
.collect_one(ft + 1)
|
||
.unwrap()
|
||
.to_usize()
|
||
} else {
|
||
out_first
|
||
.get(h + 1)
|
||
.copied()
|
||
.unwrap_or(TxOutIndex::from(total_outputs))
|
||
.to_usize()
|
||
};
|
||
let out_end = out_first
|
||
.get(h + 1)
|
||
.copied()
|
||
.unwrap_or(TxOutIndex::from(total_outputs))
|
||
.to_usize();
|
||
|
||
if h < sweep_start {
|
||
continue;
|
||
}
|
||
|
||
let values: Vec<Sats> = indexer
|
||
.vecs
|
||
.outputs
|
||
.value
|
||
.collect_range_at(out_start, out_end);
|
||
let output_types: Vec<OutputType> = indexer
|
||
.vecs
|
||
.outputs
|
||
.output_type
|
||
.collect_range_at(out_start, out_end);
|
||
|
||
let mut full_hist = Box::new([0u32; NUM_BINS]);
|
||
let mut round_outputs = Vec::new();
|
||
|
||
for (sats, output_type) in values.into_iter().zip(output_types) {
|
||
if ref_config.excluded_output_types.contains(&output_type) {
|
||
continue;
|
||
}
|
||
if *sats < ref_config.min_sats {
|
||
continue;
|
||
}
|
||
if let Some(bin) = sats_to_bin(sats) {
|
||
full_hist[bin] += 1;
|
||
let d = leading_digit(*sats);
|
||
if (1..=9).contains(&d) {
|
||
let rel_err = relative_roundness(*sats);
|
||
if rel_err <= max_tolerance {
|
||
round_outputs.push(RoundOutput {
|
||
bin: bin as u16,
|
||
digit: d,
|
||
rel_err: rel_err as f32,
|
||
});
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
let (high_bin, low_bin) = if h < height_bands.len() {
|
||
height_bands[h]
|
||
} else {
|
||
(0.0, 0.0)
|
||
};
|
||
|
||
blocks.push(BlockData {
|
||
full_hist,
|
||
round_outputs,
|
||
high_bin,
|
||
low_bin,
|
||
});
|
||
|
||
if (h - sweep_start).is_multiple_of(50_000) {
|
||
eprint!(
|
||
"\r {}/{} ({:.0}%)",
|
||
h - sweep_start,
|
||
total_blocks,
|
||
(h - sweep_start) as f64 / total_blocks as f64 * 100.0
|
||
);
|
||
}
|
||
}
|
||
|
||
let mem_hists = blocks.len() * std::mem::size_of::<[u32; NUM_BINS]>();
|
||
let mem_rounds: usize = blocks
|
||
.iter()
|
||
.map(|b| b.round_outputs.len() * std::mem::size_of::<RoundOutput>())
|
||
.sum();
|
||
eprintln!(
|
||
"\r {} blocks precomputed ({:.1} GB hists + {:.0} MB rounds) in {:.1}s",
|
||
blocks.len(),
|
||
mem_hists as f64 / 1e9,
|
||
mem_rounds as f64 / 1e6,
|
||
t0.elapsed().as_secs_f64()
|
||
);
|
||
|
||
// Phase 2: sweep tolerance × mask combos.
|
||
// Tolerances as fractions (not percentages).
|
||
let tolerances: &[(f64, &str)] = &[
|
||
(0.0, "0%"),
|
||
(0.0001, "0.01%"),
|
||
(0.0005, "0.05%"),
|
||
(0.001, "0.1%"),
|
||
(0.002, "0.2%"),
|
||
(0.005, "0.5%"),
|
||
(0.01, "1%"),
|
||
(0.02, "2%"),
|
||
(0.05, "5%"),
|
||
];
|
||
|
||
// 987654321
|
||
let masks: &[(u16, &str)] = &[
|
||
(0b0_0000_0000, "none"),
|
||
(0b0_0001_0111, "{1,2,3,5}"),
|
||
(0b0_0001_1111, "{1,2,3,4,5}"),
|
||
(0b0_0011_0111, "{1,2,3,5,6}"),
|
||
(0b0_0111_0111, "{1,2,3,5,6,7}"),
|
||
(0b1_1111_1111, "{1-9}"),
|
||
];
|
||
|
||
let num_configs = tolerances.len() * masks.len();
|
||
let num_threads = std::thread::available_parallelism()
|
||
.map(|n| n.get())
|
||
.unwrap_or(8);
|
||
eprintln!(
|
||
"Phase 2: sweeping {} configs ({} tolerances × {} masks) across {} threads...",
|
||
num_configs,
|
||
tolerances.len(),
|
||
masks.len(),
|
||
num_threads
|
||
);
|
||
|
||
let t1 = Instant::now();
|
||
let blocks = &blocks;
|
||
let tolerances_ref = tolerances;
|
||
let masks_ref = masks;
|
||
|
||
let all_results: Vec<(usize, usize, Stats)> = std::thread::scope(|s| {
|
||
let configs_per_thread = num_configs.div_ceil(num_threads);
|
||
|
||
let handles: Vec<_> = (0..num_threads)
|
||
.map(|t| {
|
||
s.spawn(move || {
|
||
let cfg_start = t * configs_per_thread;
|
||
let cfg_end = ((t + 1) * configs_per_thread).min(num_configs);
|
||
if cfg_start >= cfg_end {
|
||
return vec![];
|
||
}
|
||
let mut results = Vec::with_capacity(cfg_end - cfg_start);
|
||
|
||
for cfg_idx in cfg_start..cfg_end {
|
||
let ti = cfg_idx / masks_ref.len();
|
||
let mi = cfg_idx % masks_ref.len();
|
||
let (tolerance, _) = tolerances_ref[ti];
|
||
let (mask, _) = masks_ref[mi];
|
||
|
||
let mut oracle = Oracle::new(
|
||
seed_bin(sweep_start),
|
||
Config {
|
||
exclude_common_round_values: false,
|
||
..Default::default()
|
||
},
|
||
);
|
||
let mut stats = Stats::new();
|
||
|
||
for bd in blocks.iter() {
|
||
let mut hist = *bd.full_hist;
|
||
|
||
// Remove outputs matching this tolerance + mask.
|
||
let tol_f32 = tolerance as f32;
|
||
for ro in &bd.round_outputs {
|
||
if mask & (1 << (ro.digit - 1)) != 0 && ro.rel_err <= tol_f32 {
|
||
hist[ro.bin as usize] -= 1;
|
||
}
|
||
}
|
||
|
||
let ref_bin = oracle.process_histogram(&hist);
|
||
|
||
if bd.high_bin > 0.0 && bd.low_bin > 0.0 {
|
||
let err = if ref_bin < bd.high_bin {
|
||
ref_bin - bd.high_bin
|
||
} else if ref_bin > bd.low_bin {
|
||
ref_bin - bd.low_bin
|
||
} else {
|
||
0.0
|
||
};
|
||
stats.update(err);
|
||
}
|
||
}
|
||
|
||
results.push((ti, mi, stats));
|
||
}
|
||
|
||
results
|
||
})
|
||
})
|
||
.collect();
|
||
|
||
handles
|
||
.into_iter()
|
||
.flat_map(|h| h.join().unwrap())
|
||
.collect()
|
||
});
|
||
|
||
eprintln!(" Done in {:.1}s.", t1.elapsed().as_secs_f64());
|
||
|
||
// Print results grouped by tolerance.
|
||
println!();
|
||
println!(
|
||
"{:>8} {:>16} {:>8} {:>10} {:>10} {:>6} {:>6} {:>6} {:>8}",
|
||
"Tol", "Digits", "Blocks", "RMSE%", "Max%", ">5%", ">10%", ">20%", "Bias"
|
||
);
|
||
println!("{}", "-".repeat(88));
|
||
|
||
for (ti, &(_, tol_label)) in tolerances.iter().enumerate() {
|
||
for (mi, &(_, mask_label)) in masks.iter().enumerate() {
|
||
let (_, _, stats) = all_results
|
||
.iter()
|
||
.find(|(t, m, _)| *t == ti && *m == mi)
|
||
.unwrap();
|
||
println!(
|
||
"{:>8} {:>16} {:>8} {:>8.3}% {:>8.1}% {:>6} {:>6} {:>6} {:>+8.2}",
|
||
tol_label,
|
||
mask_label,
|
||
stats.total_blocks,
|
||
stats.rmse_pct(),
|
||
stats.max_pct(),
|
||
stats.gt_5pct,
|
||
stats.gt_10pct,
|
||
stats.gt_20pct,
|
||
stats.bias()
|
||
);
|
||
}
|
||
println!();
|
||
}
|
||
|
||
// Find overall best config by RMSE.
|
||
let best = all_results
|
||
.iter()
|
||
.min_by(|a, b| a.2.rmse_pct().partial_cmp(&b.2.rmse_pct()).unwrap())
|
||
.unwrap();
|
||
let (bti, bmi, bs) = best;
|
||
println!(
|
||
"Best: tolerance={}, digits={} → RMSE {:.3}%, Max {:.1}%, >5%: {}, >10%: {}, >20%: {}",
|
||
tolerances[*bti].1,
|
||
masks[*bmi].1,
|
||
bs.rmse_pct(),
|
||
bs.max_pct(),
|
||
bs.gt_5pct,
|
||
bs.gt_10pct,
|
||
bs.gt_20pct,
|
||
);
|
||
|
||
// Show current config for reference.
|
||
let current = all_results
|
||
.iter()
|
||
.find(|(t, m, _)| tolerances[*t].0 == 0.001 && masks[*m].0 == 0b0_0011_0111)
|
||
.unwrap();
|
||
let (_, _, cs) = current;
|
||
println!(
|
||
"Current: tolerance=0.1%, digits={{1,2,3,5,6}} → RMSE {:.3}%, Max {:.1}%, >5%: {}, >10%: {}, >20%: {}",
|
||
cs.rmse_pct(),
|
||
cs.max_pct(),
|
||
cs.gt_5pct,
|
||
cs.gt_10pct,
|
||
cs.gt_20pct,
|
||
);
|
||
|
||
println!("\nTotal time: {:.1}s", t0.elapsed().as_secs_f64());
|
||
}
|