mirror of
https://github.com/bitcoinresearchkit/brk.git
synced 2026-06-16 01:39:44 -07:00
interface: create super fast searcher
This commit is contained in:
@@ -1,9 +1,9 @@
|
||||
#![doc = include_str!("../README.md")]
|
||||
|
||||
use std::{collections::BTreeMap, sync::OnceLock};
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use brk_computer::Computer;
|
||||
use brk_error::{Error, Result};
|
||||
use brk_error::Result;
|
||||
use brk_indexer::Indexer;
|
||||
use brk_parser::Parser;
|
||||
use brk_structs::{
|
||||
@@ -11,11 +11,6 @@ use brk_structs::{
|
||||
TxidPath,
|
||||
};
|
||||
use brk_traversable::TreeNode;
|
||||
use nucleo_matcher::{
|
||||
Config, Matcher,
|
||||
pattern::{AtomKind, CaseMatching, Normalization, Pattern},
|
||||
};
|
||||
use quick_cache::sync::Cache;
|
||||
use vecdb::{AnyCollectableVec, AnyStoredVec};
|
||||
|
||||
mod chain;
|
||||
@@ -23,6 +18,7 @@ mod deser;
|
||||
mod metrics;
|
||||
mod pagination;
|
||||
mod params;
|
||||
mod searcher;
|
||||
mod vecs;
|
||||
|
||||
pub use metrics::{Output, Value};
|
||||
@@ -35,10 +31,10 @@ use crate::{
|
||||
vecs::{IndexToVec, MetricToVec},
|
||||
};
|
||||
|
||||
pub fn cached_errors() -> &'static Cache<String, String> {
|
||||
static CACHE: OnceLock<Cache<String, String>> = OnceLock::new();
|
||||
CACHE.get_or_init(|| Cache::new(1000))
|
||||
}
|
||||
// pub fn cached_errors() -> &'static Cache<String, String> {
|
||||
// static CACHE: OnceLock<Cache<String, String>> = OnceLock::new();
|
||||
// CACHE.get_or_init(|| Cache::new(1000))
|
||||
// }
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub struct Interface<'a> {
|
||||
@@ -75,64 +71,56 @@ impl<'a> Interface<'a> {
|
||||
get_transaction_info(txid, self)
|
||||
}
|
||||
|
||||
pub fn search(&self, params: &Params) -> Result<Vec<(String, &&dyn AnyCollectableVec)>> {
|
||||
let metrics = &params.metrics;
|
||||
let index = params.index;
|
||||
pub fn search_metric(&self, metric: &str, limit: usize) -> Vec<&str> {
|
||||
self.vecs.search(metric, limit)
|
||||
}
|
||||
|
||||
let ids_to_vec = self
|
||||
.vecs
|
||||
.index_to_metric_to_vec
|
||||
.get(&index)
|
||||
.ok_or(Error::String(format!(
|
||||
"Index \"{}\" isn't a valid index",
|
||||
index
|
||||
)))?;
|
||||
pub fn search_metric_with_index(
|
||||
&self,
|
||||
metric: &str,
|
||||
index: Index,
|
||||
// params: &Params,
|
||||
) -> Result<Vec<(String, &&dyn AnyCollectableVec)>> {
|
||||
todo!();
|
||||
|
||||
metrics.iter()
|
||||
.map(|metric| {
|
||||
let vec = ids_to_vec.get(metric.as_str()).ok_or_else(|| {
|
||||
let cached_errors = cached_errors();
|
||||
// let all_metrics = &self.vecs.metrics;
|
||||
// let metrics = &params.metrics;
|
||||
// let index = params.index;
|
||||
|
||||
if let Some(message) = cached_errors.get(metric) {
|
||||
return Error::String(message)
|
||||
}
|
||||
// let ids_to_vec = self
|
||||
// .vecs
|
||||
// .index_to_metric_to_vec
|
||||
// .get(&index)
|
||||
// .ok_or(Error::String(format!(
|
||||
// "Index \"{}\" isn't a valid index",
|
||||
// index
|
||||
// )))?;
|
||||
|
||||
let mut message = format!(
|
||||
"No vec named \"{}\" indexed by \"{}\" found.\n",
|
||||
metric,
|
||||
index
|
||||
);
|
||||
// metrics
|
||||
// .iter()
|
||||
// .map(|metric| {
|
||||
// let vec = ids_to_vec.get(metric.as_str()).ok_or_else(|| {
|
||||
// let matches: Vec<&str> = MATCHER.with(|matcher| {
|
||||
// let matcher = matcher.borrow();
|
||||
// let mut scored: Vec<(&str, i64)> = all_metrics
|
||||
// .iter()
|
||||
// .filter_map(|m| matcher.fuzzy_match(m, metric).map(|s| (*m, s)))
|
||||
// .collect();
|
||||
|
||||
let mut matcher = Matcher::new(Config::DEFAULT);
|
||||
// scored.sort_unstable_by_key(|&(_, s)| std::cmp::Reverse(s));
|
||||
// scored.into_iter().take(5).map(|(m, _)| m).collect()
|
||||
// });
|
||||
|
||||
let matches = Pattern::new(
|
||||
metric.as_str(),
|
||||
CaseMatching::Ignore,
|
||||
Normalization::Smart,
|
||||
AtomKind::Fuzzy,
|
||||
)
|
||||
.match_list(ids_to_vec.keys(), &mut matcher)
|
||||
.into_iter()
|
||||
.take(10)
|
||||
.map(|(s, _)| s)
|
||||
.collect::<Vec<_>>();
|
||||
// let mut message = format!("No vec \"{metric}\" for index \"{index}\".\n");
|
||||
// if !matches.is_empty() {
|
||||
// message += &format!("\nDid you mean: {matches:?}\n");
|
||||
// }
|
||||
|
||||
if !matches.is_empty() {
|
||||
message +=
|
||||
&format!("\nMaybe you meant one of the following: {matches:#?} ?\n");
|
||||
}
|
||||
|
||||
if let Some(index_to_vec) = self.metric_to_index_to_vec().get(metric.as_str()) {
|
||||
message += &format!("\nBut there is a vec named {metric} which supports the following indexes: {:#?}\n", index_to_vec.keys());
|
||||
}
|
||||
|
||||
cached_errors.insert(metric.clone(), message.clone());
|
||||
|
||||
Error::String(message)
|
||||
});
|
||||
vec.map(|vec| (metric.clone(), vec))
|
||||
})
|
||||
.collect::<Result<Vec<_>>>()
|
||||
// Error::String(message)
|
||||
// });
|
||||
// vec.map(|vec| (metric.clone(), vec))
|
||||
// })
|
||||
// .collect::<Result<Vec<_>>>()
|
||||
}
|
||||
|
||||
pub fn format(
|
||||
@@ -227,7 +215,8 @@ impl<'a> Interface<'a> {
|
||||
}
|
||||
|
||||
pub fn search_and_format(&self, params: Params) -> Result<Output> {
|
||||
self.format(self.search(&params)?, &params.rest)
|
||||
todo!()
|
||||
// self.format(self.search(&params)?, &params.rest)
|
||||
}
|
||||
|
||||
pub fn metric_to_index_to_vec(&self) -> &BTreeMap<&str, IndexToVec<'_>> {
|
||||
@@ -262,7 +251,7 @@ impl<'a> Interface<'a> {
|
||||
}
|
||||
|
||||
pub fn get_metrics_catalog(&self) -> &TreeNode {
|
||||
self.vecs.catalog.as_ref().unwrap()
|
||||
self.vecs.catalog()
|
||||
}
|
||||
|
||||
pub fn get_index_to_vecids(&self, paginated_index: PaginatedIndexParam) -> Vec<&str> {
|
||||
|
||||
@@ -0,0 +1,200 @@
|
||||
use std::{marker::PhantomData, ops::Neg, ptr};
|
||||
|
||||
use rustc_hash::{FxHashMap, FxHashSet};
|
||||
|
||||
/// Maximum number of query trigrams looked up during one fuzzy search.
const MAX_TRIGRAMS: usize = 9;

/// Characters that split items and queries into words.
const SEPARATORS: &[char] = &['_', '-', ' '];

/// Word and trigram index over a fixed set of borrowed strings.
///
/// Every item is split into words on [`SEPARATORS`]; each lowercased word and
/// each three-character window of a word maps to the set of items containing
/// it. Items are stored as raw `*const str` pointers so that set membership is
/// decided by address and length instead of by string content. As a
/// consequence, two equal strings located at different addresses are treated
/// as two distinct items.
pub struct NgramSearcher<'a> {
    /// Queries with more distinct words than this are rejected.
    max_word_count: usize,
    /// Query words longer than this (in bytes) are ignored.
    max_word_len: usize,
    /// Queries longer than this (in bytes, after lowercasing) are rejected.
    max_query_len: usize,
    /// Lowercased word -> items containing that word.
    word_index: FxHashMap<String, FxHashSet<*const str>>,
    /// Lowercased trigram -> items having a word that contains it.
    trigram_index: FxHashMap<[char; 3], FxHashSet<*const str>>,
    /// Ties the stored raw pointers to the lifetime of the indexed strings.
    _phantom: PhantomData<&'a str>,
}

// SAFETY: every stored pointer was created from a `&'a str` (which is itself
// `Send + Sync`), is only ever read, and `_phantom` keeps the searcher from
// outliving the strings it points to.
unsafe impl<'a> Send for NgramSearcher<'a> {}
// SAFETY: see the `Send` impl above; the searcher exposes no interior mutability.
unsafe impl<'a> Sync for NgramSearcher<'a> {}

/// Yields every window of three consecutive characters of `word`, in order.
///
/// Words with fewer than three characters yield nothing.
fn trigrams<'w>(word: &'w str) -> impl Iterator<Item = [char; 3]> + 'w {
    word.chars()
        .zip(word.chars().skip(1))
        .zip(word.chars().skip(2))
        .map(|((a, b), c)| [a, b, c])
}

/// Keeps the `limit` smallest elements of `candidates` according to `order`
/// and returns them sorted by `order`.
fn keep_best<T>(
    mut candidates: Vec<T>,
    limit: usize,
    mut order: impl FnMut(&T, &T) -> std::cmp::Ordering,
) -> Vec<T> {
    // Partial selection first so that only the survivors get fully sorted.
    if candidates.len() > limit {
        candidates.select_nth_unstable_by(limit, &mut order);
        candidates.truncate(limit);
    }
    candidates.sort_unstable_by(order);
    candidates
}

impl<'a> NgramSearcher<'a> {
    /// Builds the index over `items`.
    ///
    /// Words and trigrams are indexed in lowercase so that they can match the
    /// lowercased query used by [`NgramSearcher::search`]. The query limits
    /// are derived from the indexed data with a small slack: longest item
    /// plus 6 bytes, longest word plus 4 bytes, and highest per-item word
    /// count plus 2.
    pub fn new(items: &[&'a str]) -> Self {
        let mut word_index: FxHashMap<String, FxHashSet<*const str>> = FxHashMap::default();
        let mut trigram_index: FxHashMap<[char; 3], FxHashSet<*const str>> = FxHashMap::default();
        let mut max_word_len = 0usize;
        let mut max_query_len = 0usize;
        let mut max_word_count = 0usize;

        for &item in items {
            let item_ptr: *const str = item;
            let lowered = item.to_lowercase();
            max_query_len = max_query_len.max(lowered.len());

            let mut word_count = 0usize;
            for word in lowered.split(SEPARATORS) {
                // Empty segments (e.g. from "a__b") still count towards the
                // word-count limit but are not indexed.
                word_count += 1;
                if word.is_empty() {
                    continue;
                }

                max_word_len = max_word_len.max(word.len());
                word_index
                    .entry(word.to_string())
                    .or_default()
                    .insert(item_ptr);

                for trigram in trigrams(word) {
                    trigram_index.entry(trigram).or_default().insert(item_ptr);
                }
            }
            max_word_count = max_word_count.max(word_count);
        }

        Self {
            max_query_len: max_query_len + 6,
            max_word_len: max_word_len + 4,
            max_word_count: max_word_count + 2,
            word_index,
            trigram_index,
            _phantom: PhantomData,
        }
    }

    /// Returns up to `limit` indexed items matching `query`.
    ///
    /// The query is lowercased and split on [`SEPARATORS`]; duplicate words
    /// and words longer than the indexed maximum are dropped.
    ///
    /// * If every remaining word is an indexed word, the result is the set of
    ///   items containing all of them, shortest item first.
    /// * Otherwise the unknown words are broken into trigrams (at most
    ///   [`MAX_TRIGRAMS`] in total, taken in query order). Candidates are the
    ///   items containing all known words, or, when there is no known word,
    ///   any item hit by a trigram whose length is at least the query length
    ///   minus 3. Candidates need a score of at least half the number of
    ///   trigrams looked up and are ordered by score (highest first), then
    ///   by length (shortest first).
    ///
    /// Remaining ties are broken lexicographically so the output does not
    /// depend on hash iteration order. An empty vector is returned for an
    /// empty query, a `limit` of zero, or a query exceeding the length or
    /// word-count limits.
    pub fn search(&self, query: &str, limit: usize) -> Vec<&'a str> {
        let query_lower = query.to_lowercase();
        let query_len = query_lower.len();

        if limit == 0 || query_lower.is_empty() || query_len > self.max_query_len {
            return Vec::new();
        }

        // Distinct query words, kept in query order.
        let mut seen: FxHashSet<&str> = FxHashSet::default();
        let words: Vec<&str> = query_lower
            .split(SEPARATORS)
            .filter(|word| !word.is_empty() && word.len() <= self.max_word_len)
            .filter(|word| seen.insert(*word))
            .collect();

        if words.is_empty() || words.len() > self.max_word_count {
            return Vec::new();
        }

        let mut known_sets: Vec<&FxHashSet<*const str>> = Vec::new();
        let mut unknown_words: Vec<&str> = Vec::new();
        for word in words {
            match self.word_index.get(word) {
                Some(items) => known_sets.push(items),
                None => unknown_words.push(word),
            }
        }

        // Intersect the known-word sets, starting from the smallest one so the
        // working set can only shrink from its minimum size.
        known_sets.sort_unstable_by_key(|items| items.len());
        let pool = known_sets.split_first().map(|(smallest, larger)| {
            let mut pool: FxHashSet<*const str> = (**smallest).clone();
            for other in larger {
                if pool.is_empty() {
                    break;
                }
                pool.retain(|ptr| other.contains(ptr));
            }
            pool
        });

        let pool = match pool {
            // Every query word is an indexed word: no fuzzy scoring needed.
            Some(exact) if unknown_words.is_empty() => {
                let hits: Vec<&'a str> = exact
                    .into_iter()
                    // SAFETY: the pointer comes from `word_index`, which only
                    // holds pointers created from `&'a str` in `new`.
                    .map(|ptr| unsafe { &*ptr })
                    .collect();
                return keep_best(hits, limit, |a, b| {
                    a.len().cmp(&b.len()).then_with(|| a.cmp(b))
                });
            }
            other => other,
        };

        // Score candidates: pool members start at 1, every trigram hit adds 1.
        let has_pool = pool.is_some();
        let mut scores: FxHashMap<*const str, usize> = FxHashMap::default();
        scores.reserve(256);
        if let Some(pool) = pool {
            scores.extend(pool.into_iter().map(|ptr| (ptr, 1)));
        }

        let min_len = query_len.saturating_sub(3);
        let mut trigram_count: usize = 0;
        let query_trigrams = unknown_words
            .iter()
            .copied()
            .flat_map(|word| trigrams(word))
            .take(MAX_TRIGRAMS);

        for trigram in query_trigrams {
            // Misses are counted too: they raise the minimum score below.
            trigram_count += 1;

            if let Some(items) = self.trigram_index.get(&trigram) {
                for &ptr in items {
                    if has_pool {
                        // With known words, only items containing all of them
                        // are eligible; trigrams merely refine their score.
                        if let Some(score) = scores.get_mut(&ptr) {
                            *score += 1;
                        }
                    } else {
                        // SAFETY: the pointer comes from `trigram_index`, which
                        // only holds pointers created from `&'a str` in `new`.
                        let item_len = unsafe { &*ptr }.len();
                        if item_len >= min_len {
                            *scores.entry(ptr).or_default() += 1;
                        }
                    }
                }
            }
        }

        let min_score = trigram_count.div_ceil(2);
        let candidates: Vec<(&'a str, usize)> = scores
            .into_iter()
            .filter(|&(_, score)| score >= min_score)
            // SAFETY: every key of `scores` was copied from one of the two
            // indexes, which only hold pointers created from `&'a str`.
            .map(|(ptr, score)| (unsafe { &*ptr }, score))
            .collect();

        keep_best(candidates, limit, |a, b| {
            b.1.cmp(&a.1)
                .then_with(|| a.0.len().cmp(&b.0.len()))
                .then_with(|| a.0.cmp(b.0))
        })
        .into_iter()
        .map(|(item, _)| item)
        .collect()
    }
}
|
||||
@@ -7,7 +7,10 @@ use brk_traversable::{Traversable, TreeNode};
|
||||
use derive_deref::{Deref, DerefMut};
|
||||
use vecdb::AnyCollectableVec;
|
||||
|
||||
use crate::pagination::{PaginatedIndexParam, PaginatedMetrics, PaginationParam};
|
||||
use crate::{
|
||||
pagination::{PaginatedIndexParam, PaginatedMetrics, PaginationParam},
|
||||
searcher::NgramSearcher,
|
||||
};
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct Vecs<'a> {
|
||||
@@ -17,7 +20,9 @@ pub struct Vecs<'a> {
|
||||
pub indexes: Vec<IndexInfo>,
|
||||
pub distinct_metric_count: usize,
|
||||
pub total_metric_count: usize,
|
||||
pub catalog: Option<TreeNode>,
|
||||
pub longest_metric_len: usize,
|
||||
catalog: Option<TreeNode>,
|
||||
searcher: Option<NgramSearcher<'a>>,
|
||||
metric_to_indexes: BTreeMap<&'a str, Vec<Index>>,
|
||||
index_to_metrics: BTreeMap<Index, Vec<&'a str>>,
|
||||
}
|
||||
@@ -55,6 +60,12 @@ impl<'a> Vecs<'a> {
|
||||
sort_ids(&mut ids);
|
||||
|
||||
this.metrics = ids;
|
||||
this.longest_metric_len = this
|
||||
.metrics
|
||||
.iter()
|
||||
.map(|s| s.len())
|
||||
.max()
|
||||
.unwrap_or_default();
|
||||
this.distinct_metric_count = this.metric_to_index_to_vec.keys().count();
|
||||
this.total_metric_count = this
|
||||
.index_to_metric_to_vec
|
||||
@@ -95,6 +106,7 @@ impl<'a> Vecs<'a> {
|
||||
.simplify()
|
||||
.unwrap(),
|
||||
);
|
||||
this.searcher = Some(NgramSearcher::new(&this.metrics));
|
||||
|
||||
this
|
||||
}
|
||||
@@ -157,6 +169,18 @@ impl<'a> Vecs<'a> {
|
||||
|
||||
vec.iter().skip(start).take(end).cloned().collect()
|
||||
}
|
||||
|
||||
/// Returns the metric catalog tree.
///
/// # Panics
///
/// Panics if the catalog has not been set, which is the case for a `Vecs`
/// obtained from `Default::default()`.
pub fn catalog(&self) -> &TreeNode {
    self.catalog
        .as_ref()
        .expect("Vecs::catalog called before the catalog was built")
}
|
||||
|
||||
pub fn search(&self, metric: &str, limit: usize) -> Vec<&'_ str> {
|
||||
self.searcher().search(metric, limit)
|
||||
}
|
||||
|
||||
/// Returns the metric-name searcher.
///
/// # Panics
///
/// Panics if the searcher has not been set, which is the case for a `Vecs`
/// obtained from `Default::default()`.
fn searcher(&self) -> &NgramSearcher<'_> {
    self.searcher
        .as_ref()
        .expect("Vecs::searcher called before the searcher was built")
}
|
||||
}
|
||||
|
||||
#[derive(Default, Deref, DerefMut)]
|
||||
|
||||
Reference in New Issue
Block a user