From f559e4027ea8f94333251de908c29d5517fc663f Mon Sep 17 00:00:00 2001 From: nym21 Date: Thu, 19 Feb 2026 12:15:09 +0100 Subject: [PATCH] indexer: snapshot --- crates/brk_indexer/README.md | 105 ++++++++++++---------- crates/brk_indexer/src/processor/tx.rs | 8 +- crates/brk_indexer/src/processor/txout.rs | 2 +- crates/brk_indexer/src/readers.rs | 72 ++++++++------- crates/brk_indexer/src/vecs/addresses.rs | 23 ++--- crates/brk_indexer/src/vecs/mod.rs | 8 +- 6 files changed, 125 insertions(+), 93 deletions(-) diff --git a/crates/brk_indexer/README.md b/crates/brk_indexer/README.md index c18de4a36..28e1acc21 100644 --- a/crates/brk_indexer/README.md +++ b/crates/brk_indexer/README.md @@ -1,63 +1,79 @@ # brk_indexer -Full Bitcoin blockchain indexer for fast analytics queries. +Parses and indexes the entire Bitcoin blockchain so you can look up any block, transaction, input, output, or address by index in O(1). -## What It Enables +## How It's Organized -Transform raw Bitcoin blockchain data into indexed vectors and key-value stores optimized for analytics. Query any block, transaction, address, or UTXO without scanning the chain. +Every entity gets a sequential index in blockchain order: -## Key Features +- Block 0, 1, 2, ... → **height** +- Transaction 0, 1, 2, ... → **txindex** +- Input 0, 1, 2, ... → **txinindex** +- Output 0, 1, 2, ... → **txoutindex** +- Address 0, 1, 2, ... → **addressindex** (per address type) -- **Multi-phase block processing**: Parallel TXID computation, input/output processing, sequential finalization -- **Address indexing**: Maps addresses to their transaction history and UTXOs per address type -- **UTXO tracking**: Live outpoint→value lookups, address→unspent outputs -- **Reorg handling**: Automatic rollback to valid chain state on reorganization -- **Collision detection**: Validates rapidhash-based prefix lookups against known duplicate TXIDs -- **Incremental snapshots**: Periodic checkpoints for crash recovery +Data is stored in append-only vectors keyed by these indexes. Each block also stores the first index of each entity type it contains (e.g. `first_txindex`, `first_txoutindex`), so you can find all transactions, inputs, outputs, and addresses in any block in O(1). -## Core API +## What's Indexed -```rust,ignore -let mut indexer = Indexer::forced_import(&outputs_dir)?; +### Per Block (keyed by height) -// Index new blocks -let starting_indexes = indexer.index(&blocks, &client, &exit)?; +- Block hash, timestamp, difficulty, size, weight -// Access indexed data -let txindex = indexer.stores.txidprefix_to_txindex.get(&txid_prefix)?; -let blockhash = indexer.vecs.blocks.blockhash.get(height)?; -``` +### Per Transaction (keyed by txindex) -## Data Structures +- Txid, version, locktime, base size, total size, RBF flag, block height -**Vecs** (append-only vectors): -- `blocks`: `blockhash`, `timestamp`, `difficulty`, `total_size`, `weight` -- `transactions`: `txid`, `first_txinindex`, `first_txoutindex` -- `inputs`: `outpoint`, `txindex` -- `outputs`: `value`, `outputtype`, `typeindex`, `txindex` -- `addresses`: Per-type `p2pkhbytes`, `p2shbytes`, `p2wpkhbytes`, etc. +### Per Input (keyed by txinindex) -**Stores** (key-value lookups): -- `txidprefix_to_txindex` - TXID lookup via 10-byte prefix -- `blockhashprefix_to_height` - Block lookup via 4-byte prefix -- `addresstype_to_addresshash_to_addressindex` - Address lookup per type -- `addresstype_to_addressindex_and_unspentoutpoint` - Live UTXO set per address +- Spent outpoint, containing txindex, and the spent output's type and address index -## Processing Pipeline +### Per Output (keyed by txoutindex) -1. **Block metadata**: Store blockhash, difficulty, timestamp -2. **Compute TXIDs**: Parallel SHA256d across transactions -3. **Process inputs**: Lookup spent outpoints, resolve address info -4. **Process outputs**: Extract addresses, assign type indexes -5. **Finalize**: Sequential store updates, UTXO set mutations -6. **Commit**: Periodic flush to disk +- Value in satoshis, script type, address index within that type, containing txindex + +Script types: P2PK (compressed/uncompressed), P2PKH, P2SH, P2WPKH, P2WSH, P2TR, P2A, P2MS, OP_RETURN, Empty, Unknown + +### Per Address (keyed by addressindex, one set per type) + +- Raw address bytes (20-65 bytes depending on type: pubkey, pubkey hash, script hash, witness program, etc.) + +Address types each get their own index space: P2PK65, P2PK33, P2PKH, P2SH, P2WPKH, P2WSH, P2TR, P2A + +### Per Non-Address Script (OP_RETURN, P2MS, Empty, Unknown) + +- Containing txindex + +## Key-Value Stores + +On top of the vectors, key-value stores enable lookups that aren't sequential: + +| Store | Purpose | +|-------|---------| +| txid prefix → txindex | Look up a transaction by its txid | +| block hash prefix → height | Look up a block by its hash | +| address hash → addressindex | Look up an address (per type) | +| addressindex + txindex | All transactions involving an address | +| addressindex + outpoint | Unspent outputs for an address (live UTXO set) | +| height → coinbase tag | Miner-embedded message per block | + +## How It Works + +1. **Block metadata** — store block hash, difficulty, timestamp, size, weight +2. **Compute TXIDs** — parallel SHA256d across all transactions +3. **Process outputs** — classify script types, extract addresses, detect new unique addresses +4. **Process inputs** — resolve spent outpoints, look up address info +5. **Finalize** — update address stores, UTXO set mutations, push all vectors +6. **Snapshot** — periodic flush to disk for crash recovery + +Reorg handling is built-in: on chain reorganization, the indexer rolls back to the last valid state. ## Performance -| Machine | Time | Disk | Peak Disk | Memory | Peak Memory | -|---------|------|------|-----------|--------|-------------| -| MBP M3 Pro (36GB, internal SSD) | 3h | 247 GB | 314 GB | 5.2 GB | 11 GB | -| Mac Mini M4 (16GB, external SSD) | 4.9h | 233 GB | 303 GB | 5.4 GB | 11 GB | +| Version | Machine | Time | Disk | Peak Disk | Memory | Peak Memory | +|---------|---------|------|------|-----------|--------|-------------| +| v0.2.0-pre | MBP M3 Pro (36GB, internal SSD) | 2h40 | 239 GB | 302 GB | 5.9 GB | 13 GB | +| v0.1.0-alpha.0 | Mac Mini M4 (16GB, external SSD) | 4.9h | 233 GB | 303 GB | 5.4 GB | 11 GB | Full benchmark data: [bitcoinresearchkit/benches](https://github.com/bitcoinresearchkit/benches/tree/main/brk_indexer) @@ -67,8 +83,7 @@ Use [mimalloc v3](https://crates.io/crates/mimalloc) as the global allocator to ## Built On -- `vecdb` for append-only vectors -- `brk_cohort` for address type handling +- `vecdb` for append-only vectors — integer-compressed (`PcoVec`) or raw bytes (`BytesVec`) - `brk_iterator` for block iteration -- `brk_store` for key-value storage +- `brk_store` for key-value storage (fjall LSM) - `brk_types` for domain types diff --git a/crates/brk_indexer/src/processor/tx.rs b/crates/brk_indexer/src/processor/tx.rs index 1d9ab9120..8c84b1da6 100644 --- a/crates/brk_indexer/src/processor/tx.rs +++ b/crates/brk_indexer/src/processor/tx.rs @@ -19,7 +19,9 @@ impl<'a> BlockProcessor<'a> { .par_iter() .enumerate() .map(|(index, tx)| { - let txid = Txid::from(tx.compute_txid()); + let (btc_txid, base_size, total_size) = + self.block.compute_tx_id_and_sizes(index); + let txid = Txid::from(btc_txid); let txid_prefix = TxidPrefix::from(&txid); let prev_txindex_opt = if will_check_collisions { @@ -37,8 +39,8 @@ impl<'a> BlockProcessor<'a> { txid, txid_prefix, prev_txindex_opt, - base_size: tx.base_size() as u32, - total_size: tx.total_size() as u32, + base_size, + total_size, }) }) .collect() diff --git a/crates/brk_indexer/src/processor/txout.rs b/crates/brk_indexer/src/processor/txout.rs index e2f4cf820..515a9b74f 100644 --- a/crates/brk_indexer/src/processor/txout.rs +++ b/crates/brk_indexer/src/processor/txout.rs @@ -70,7 +70,7 @@ impl<'a> BlockProcessor<'a> { let prev_addressbytes = self.vecs.get_addressbytes_by_type( addresstype, typeindex, - self.readers.addressbytes.get_unwrap(addresstype), + &self.readers.addressbytes, ) .ok_or(Error::Internal("Missing addressbytes"))?; diff --git a/crates/brk_indexer/src/readers.rs b/crates/brk_indexer/src/readers.rs index 5e22bd022..6fbd49879 100644 --- a/crates/brk_indexer/src/readers.rs +++ b/crates/brk_indexer/src/readers.rs @@ -1,43 +1,55 @@ -use brk_cohort::ByAddressType; -use vecdb::Reader; +use brk_types::{ + OutputType, P2AAddressIndex, P2ABytes, P2PK33AddressIndex, P2PK33Bytes, P2PK65AddressIndex, + P2PK65Bytes, P2PKHAddressIndex, P2PKHBytes, P2SHAddressIndex, P2SHBytes, P2TRAddressIndex, + P2TRBytes, P2WPKHAddressIndex, P2WPKHBytes, P2WSHAddressIndex, P2WSHBytes, TxIndex, + TxOutIndex, Txid, TypeIndex, +}; +use vecdb::{BytesStrategy, VecReader}; use crate::Vecs; +pub struct AddressReaders { + pub p2pk65: VecReader>, + pub p2pk33: VecReader>, + pub p2pkh: VecReader>, + pub p2sh: VecReader>, + pub p2wpkh: VecReader>, + pub p2wsh: VecReader>, + pub p2tr: VecReader>, + pub p2a: VecReader>, +} + /// Readers for vectors that need to be accessed during block processing. -/// These provide consistent snapshots for reading while the main vectors are being modified. +/// +/// All fields use `VecReader` which caches the mmap base pointer for O(1) +/// random access without recomputing `region.start() + HEADER_OFFSET` per read. pub struct Readers { - pub txid: Reader, - pub txindex_to_first_txoutindex: Reader, - pub txoutindex_to_outputtype: Reader, - pub txoutindex_to_typeindex: Reader, - pub addressbytes: ByAddressType, + pub txid: VecReader>, + pub txindex_to_first_txoutindex: + VecReader>, + pub txoutindex_to_outputtype: + VecReader>, + pub txoutindex_to_typeindex: + VecReader>, + pub addressbytes: AddressReaders, } impl Readers { pub fn new(vecs: &Vecs) -> Self { Self { - txid: vecs.transactions.txid.create_reader(), - txindex_to_first_txoutindex: vecs.transactions.first_txoutindex.create_reader(), - txoutindex_to_outputtype: vecs.outputs.outputtype.create_reader(), - txoutindex_to_typeindex: vecs.outputs.typeindex.create_reader(), - addressbytes: ByAddressType { - p2pk65: vecs - .addresses - .p2pk65bytes - .create_reader(), - p2pk33: vecs - .addresses - .p2pk33bytes - .create_reader(), - p2pkh: vecs.addresses.p2pkhbytes.create_reader(), - p2sh: vecs.addresses.p2shbytes.create_reader(), - p2wpkh: vecs - .addresses - .p2wpkhbytes - .create_reader(), - p2wsh: vecs.addresses.p2wshbytes.create_reader(), - p2tr: vecs.addresses.p2trbytes.create_reader(), - p2a: vecs.addresses.p2abytes.create_reader(), + txid: vecs.transactions.txid.reader(), + txindex_to_first_txoutindex: vecs.transactions.first_txoutindex.reader(), + txoutindex_to_outputtype: vecs.outputs.outputtype.reader(), + txoutindex_to_typeindex: vecs.outputs.typeindex.reader(), + addressbytes: AddressReaders { + p2pk65: vecs.addresses.p2pk65bytes.reader(), + p2pk33: vecs.addresses.p2pk33bytes.reader(), + p2pkh: vecs.addresses.p2pkhbytes.reader(), + p2sh: vecs.addresses.p2shbytes.reader(), + p2wpkh: vecs.addresses.p2wpkhbytes.reader(), + p2wsh: vecs.addresses.p2wshbytes.reader(), + p2tr: vecs.addresses.p2trbytes.reader(), + p2a: vecs.addresses.p2abytes.reader(), }, } } diff --git a/crates/brk_indexer/src/vecs/addresses.rs b/crates/brk_indexer/src/vecs/addresses.rs index 002f35485..eecf0b95b 100644 --- a/crates/brk_indexer/src/vecs/addresses.rs +++ b/crates/brk_indexer/src/vecs/addresses.rs @@ -8,10 +8,11 @@ use brk_types::{ }; use rayon::prelude::*; use vecdb::{ - AnyStoredVec, BytesVec, Database, WritableVec, ImportableVec, PcoVec, Reader, ReadableVec, + AnyStoredVec, BytesVec, Database, WritableVec, ImportableVec, PcoVec, ReadableVec, Stamp, VecIndex, }; +use crate::AddressReaders; use crate::parallel_import; #[derive(Clone, Traversable)] @@ -164,46 +165,46 @@ impl AddressesVecs { .into_par_iter() } - /// Get address bytes by output type, using the reader for the specific address type. + /// Get address bytes by output type, using the cached VecReader for the specific address type. /// Returns None if the index doesn't exist yet. pub fn get_bytes_by_type( &self, addresstype: OutputType, typeindex: TypeIndex, - reader: &Reader, + readers: &AddressReaders, ) -> Option { match addresstype { OutputType::P2PK65 => self .p2pk65bytes - .get_pushed_or_read(typeindex.into(), reader) + .get_pushed_or_read(typeindex.into(), &readers.p2pk65) .map(AddressBytes::from), OutputType::P2PK33 => self .p2pk33bytes - .get_pushed_or_read(typeindex.into(), reader) + .get_pushed_or_read(typeindex.into(), &readers.p2pk33) .map(AddressBytes::from), OutputType::P2PKH => self .p2pkhbytes - .get_pushed_or_read(typeindex.into(), reader) + .get_pushed_or_read(typeindex.into(), &readers.p2pkh) .map(AddressBytes::from), OutputType::P2SH => self .p2shbytes - .get_pushed_or_read(typeindex.into(), reader) + .get_pushed_or_read(typeindex.into(), &readers.p2sh) .map(AddressBytes::from), OutputType::P2WPKH => self .p2wpkhbytes - .get_pushed_or_read(typeindex.into(), reader) + .get_pushed_or_read(typeindex.into(), &readers.p2wpkh) .map(AddressBytes::from), OutputType::P2WSH => self .p2wshbytes - .get_pushed_or_read(typeindex.into(), reader) + .get_pushed_or_read(typeindex.into(), &readers.p2wsh) .map(AddressBytes::from), OutputType::P2TR => self .p2trbytes - .get_pushed_or_read(typeindex.into(), reader) + .get_pushed_or_read(typeindex.into(), &readers.p2tr) .map(AddressBytes::from), OutputType::P2A => self .p2abytes - .get_pushed_or_read(typeindex.into(), reader) + .get_pushed_or_read(typeindex.into(), &readers.p2a) .map(AddressBytes::from), _ => unreachable!("get_bytes_by_type called with non-address type"), } diff --git a/crates/brk_indexer/src/vecs/mod.rs b/crates/brk_indexer/src/vecs/mod.rs index 295e16126..46b4f060f 100644 --- a/crates/brk_indexer/src/vecs/mod.rs +++ b/crates/brk_indexer/src/vecs/mod.rs @@ -4,7 +4,9 @@ use brk_error::Result; use brk_traversable::Traversable; use brk_types::{AddressBytes, AddressHash, Height, OutputType, TypeIndex, Version}; use rayon::prelude::*; -use vecdb::{AnyStoredVec, Database, Reader, Stamp}; +use vecdb::{AnyStoredVec, Database, Stamp}; + +use crate::AddressReaders; const PAGE_SIZE: usize = 4096; @@ -150,10 +152,10 @@ impl Vecs { &self, addresstype: OutputType, typeindex: TypeIndex, - reader: &Reader, + readers: &AddressReaders, ) -> Option { self.addresses - .get_bytes_by_type(addresstype, typeindex, reader) + .get_bytes_by_type(addresstype, typeindex, readers) } pub fn push_bytes_if_needed(&mut self, index: TypeIndex, bytes: AddressBytes) -> Result<()> {