interface: create super fast searcher

This commit is contained in:
nym21
2025-10-10 22:12:31 +02:00
parent 1821d5d57b
commit bb46481d7f
73 changed files with 484 additions and 286 deletions
+53 -64
View File
@@ -1,9 +1,9 @@
#![doc = include_str!("../README.md")]
use std::{collections::BTreeMap, sync::OnceLock};
use std::collections::BTreeMap;
use brk_computer::Computer;
use brk_error::{Error, Result};
use brk_error::Result;
use brk_indexer::Indexer;
use brk_parser::Parser;
use brk_structs::{
@@ -11,11 +11,6 @@ use brk_structs::{
TxidPath,
};
use brk_traversable::TreeNode;
use nucleo_matcher::{
Config, Matcher,
pattern::{AtomKind, CaseMatching, Normalization, Pattern},
};
use quick_cache::sync::Cache;
use vecdb::{AnyCollectableVec, AnyStoredVec};
mod chain;
@@ -23,6 +18,7 @@ mod deser;
mod metrics;
mod pagination;
mod params;
mod searcher;
mod vecs;
pub use metrics::{Output, Value};
@@ -35,10 +31,10 @@ use crate::{
vecs::{IndexToVec, MetricToVec},
};
pub fn cached_errors() -> &'static Cache<String, String> {
static CACHE: OnceLock<Cache<String, String>> = OnceLock::new();
CACHE.get_or_init(|| Cache::new(1000))
}
// pub fn cached_errors() -> &'static Cache<String, String> {
// static CACHE: OnceLock<Cache<String, String>> = OnceLock::new();
// CACHE.get_or_init(|| Cache::new(1000))
// }
#[allow(dead_code)]
pub struct Interface<'a> {
@@ -75,64 +71,56 @@ impl<'a> Interface<'a> {
get_transaction_info(txid, self)
}
pub fn search(&self, params: &Params) -> Result<Vec<(String, &&dyn AnyCollectableVec)>> {
let metrics = &params.metrics;
let index = params.index;
/// Looks up metric names matching `metric`, passing `limit` through to the
/// underlying searcher.
///
/// This is a thin wrapper that forwards both arguments unchanged to
/// `self.vecs.search`.
pub fn search_metric(&self, metric: &str, limit: usize) -> Vec<&str> {
self.vecs.search(metric, limit)
}
let ids_to_vec = self
.vecs
.index_to_metric_to_vec
.get(&index)
.ok_or(Error::String(format!(
"Index \"{}\" isn't a valid index",
index
)))?;
pub fn search_metric_with_index(
&self,
metric: &str,
index: Index,
// params: &Params,
) -> Result<Vec<(String, &&dyn AnyCollectableVec)>> {
todo!();
metrics.iter()
.map(|metric| {
let vec = ids_to_vec.get(metric.as_str()).ok_or_else(|| {
let cached_errors = cached_errors();
// let all_metrics = &self.vecs.metrics;
// let metrics = &params.metrics;
// let index = params.index;
if let Some(message) = cached_errors.get(metric) {
return Error::String(message)
}
// let ids_to_vec = self
// .vecs
// .index_to_metric_to_vec
// .get(&index)
// .ok_or(Error::String(format!(
// "Index \"{}\" isn't a valid index",
// index
// )))?;
let mut message = format!(
"No vec named \"{}\" indexed by \"{}\" found.\n",
metric,
index
);
// metrics
// .iter()
// .map(|metric| {
// let vec = ids_to_vec.get(metric.as_str()).ok_or_else(|| {
// let matches: Vec<&str> = MATCHER.with(|matcher| {
// let matcher = matcher.borrow();
// let mut scored: Vec<(&str, i64)> = all_metrics
// .iter()
// .filter_map(|m| matcher.fuzzy_match(m, metric).map(|s| (*m, s)))
// .collect();
let mut matcher = Matcher::new(Config::DEFAULT);
// scored.sort_unstable_by_key(|&(_, s)| std::cmp::Reverse(s));
// scored.into_iter().take(5).map(|(m, _)| m).collect()
// });
let matches = Pattern::new(
metric.as_str(),
CaseMatching::Ignore,
Normalization::Smart,
AtomKind::Fuzzy,
)
.match_list(ids_to_vec.keys(), &mut matcher)
.into_iter()
.take(10)
.map(|(s, _)| s)
.collect::<Vec<_>>();
// let mut message = format!("No vec \"{metric}\" for index \"{index}\".\n");
// if !matches.is_empty() {
// message += &format!("\nDid you mean: {matches:?}\n");
// }
if !matches.is_empty() {
message +=
&format!("\nMaybe you meant one of the following: {matches:#?} ?\n");
}
if let Some(index_to_vec) = self.metric_to_index_to_vec().get(metric.as_str()) {
message += &format!("\nBut there is a vec named {metric} which supports the following indexes: {:#?}\n", index_to_vec.keys());
}
cached_errors.insert(metric.clone(), message.clone());
Error::String(message)
});
vec.map(|vec| (metric.clone(), vec))
})
.collect::<Result<Vec<_>>>()
// Error::String(message)
// });
// vec.map(|vec| (metric.clone(), vec))
// })
// .collect::<Result<Vec<_>>>()
}
pub fn format(
@@ -227,7 +215,8 @@ impl<'a> Interface<'a> {
}
pub fn search_and_format(&self, params: Params) -> Result<Output> {
self.format(self.search(&params)?, &params.rest)
todo!()
// self.format(self.search(&params)?, &params.rest)
}
pub fn metric_to_index_to_vec(&self) -> &BTreeMap<&str, IndexToVec<'_>> {
@@ -262,7 +251,7 @@ impl<'a> Interface<'a> {
}
pub fn get_metrics_catalog(&self) -> &TreeNode {
self.vecs.catalog.as_ref().unwrap()
self.vecs.catalog()
}
pub fn get_index_to_vecids(&self, paginated_index: PaginatedIndexParam) -> Vec<&str> {
+200
View File
@@ -0,0 +1,200 @@
use std::{marker::PhantomData, ops::Neg, ptr};
use rustc_hash::{FxHashMap, FxHashSet};
/// Upper bound on the number of query trigrams looked up per search, summed
/// over all query words that are not present in the word index.
const MAX_TRIGRAMS: usize = 9;

/// In-memory index over a fixed list of names, supporting exact-word lookup
/// with a trigram-based fallback for words that are not indexed.
///
/// Each name is split into words on [`SEPARATORS`]. Every non-empty word maps
/// to the set of names containing it, and every 3-character window of a word
/// maps to the set of names containing that trigram.
///
/// Names are stored as raw `*const str` pointers into the `&'a str` values
/// passed to [`NgramSearcher::new`]; `_phantom` ties the searcher to that
/// borrow so the pointers cannot outlive the strings they point at.
pub struct NgramSearcher<'a> {
    /// Maximum number of distinct query words accepted by `search`.
    max_word_count: usize,
    /// Maximum byte length of a query word; longer words are dropped.
    max_word_len: usize,
    /// Maximum byte length of a (lowercased) query.
    max_query_len: usize,
    /// Word -> names containing exactly that word.
    word_index: FxHashMap<String, FxHashSet<*const str>>,
    /// Trigram -> names containing that trigram inside one of their words.
    trigram_index: FxHashMap<[char; 3], FxHashSet<*const str>>,
    _phantom: PhantomData<&'a str>,
}

// SAFETY: every raw pointer held by the indexes was derived from a `&'a str`
// given to `new`, is only ever read through, and `PhantomData<&'a str>` keeps
// that borrow alive for as long as the searcher exists. Shared `&str` data is
// itself `Send + Sync`, so sharing or moving the searcher across threads adds
// no new aliasing or mutation.
unsafe impl<'a> Send for NgramSearcher<'a> {}
unsafe impl<'a> Sync for NgramSearcher<'a> {}

/// Characters that split a name (or a query) into words.
const SEPARATORS: &[char] = &['_', '-', ' '];

impl<'a> NgramSearcher<'a> {
    /// Builds the word and trigram indexes for `items`.
    ///
    /// Also records the longest item, the longest word and the largest number
    /// of separator-delimited segments seen, each padded with a little slack,
    /// so that `search` can reject queries that cannot plausibly match.
    ///
    /// NOTE(review): words are indexed exactly as given while `search`
    /// lowercases the query, so items are presumably expected to be lowercase
    /// already — confirm against callers.
    pub fn new(items: &[&'a str]) -> Self {
        let mut word_index: FxHashMap<String, FxHashSet<*const str>> = FxHashMap::default();
        let mut trigram_index: FxHashMap<[char; 3], FxHashSet<*const str>> = FxHashMap::default();

        let mut max_word_len = 0;
        let mut max_query_len = 0;
        let mut max_words = 0;

        for &item in items {
            max_query_len = max_query_len.max(item.len());

            // Counts every separator-delimited segment, empty ones included,
            // which only makes the resulting bound slightly more permissive.
            let mut word_count = 0;
            for word in item.split(SEPARATORS) {
                word_count += 1;
                if word.is_empty() {
                    continue;
                }

                // Track the longest *word*, not the longest item.
                max_word_len = max_word_len.max(word.len());
                word_index.entry(word.to_string()).or_default().insert(item);

                // Byte length < 3 implies fewer than 3 chars: skip the allocation.
                if word.len() >= 3 {
                    let chars = word.chars().collect::<Vec<_>>();
                    for window in chars.windows(3) {
                        // SAFETY: `windows(3)` only yields slices of exactly
                        // three `char`s, `[char; 3]` has the same layout and
                        // alignment as three contiguous `char`s, and `char` is
                        // `Copy`, so reading them out by value is sound.
                        let trigram = unsafe { ptr::read(window.as_ptr() as *const [char; 3]) };
                        trigram_index.entry(trigram).or_default().insert(item);
                    }
                }
            }
            max_words = max_words.max(word_count);
        }

        Self {
            max_query_len: max_query_len + 6,
            max_word_len: max_word_len + 4,
            // Derived from the word count (previously the word length was used
            // here by mistake, leaving `max_words` unused).
            max_word_count: max_words + 2,
            word_index,
            trigram_index,
            _phantom: PhantomData,
        }
    }

    /// Returns at most `limit` indexed names matching `query`.
    ///
    /// The query is lowercased and split on [`SEPARATORS`]. Words present in
    /// the word index narrow the candidates to the names containing all of
    /// them. If every query word is known, those candidates are returned
    /// shortest first. Otherwise the remaining words are broken into trigrams
    /// (at most [`MAX_TRIGRAMS`] in total) which score the candidates; names
    /// reaching at least half of the looked-up trigram count are returned,
    /// best score first and shortest first among equal scores.
    ///
    /// An empty query, an over-long query, or a query with no usable words
    /// yields an empty vector.
    pub fn search(&self, query: &str, limit: usize) -> Vec<&'a str> {
        let query_lower = query.to_lowercase();
        let query_len = query_lower.len();
        if query.is_empty() || query_len > self.max_query_len {
            return vec![];
        }

        let words: FxHashSet<&str> = query_lower
            .split(SEPARATORS)
            .filter(|w| !w.is_empty() && w.len() <= self.max_word_len)
            .collect();
        if words.is_empty() || words.len() > self.max_word_count {
            return vec![];
        }

        // Names much shorter than the query are ignored in trigram-only mode.
        let min_len = query_len.saturating_sub(3);

        let mut known_sets = Vec::new();
        let mut unknown_words = Vec::new();
        for word in words {
            match self.word_index.get(word) {
                Some(items) => known_sets.push(items),
                None => unknown_words.push(word),
            }
        }

        // Largest set first, so that `pop` yields the smallest one: cloning and
        // shrinking the smallest set keeps the intersection cheap.
        known_sets.sort_unstable_by_key(|set| (set.len() as i64).neg());
        let pool: Option<FxHashSet<*const str>> = known_sets.pop().cloned().map(|mut intersect| {
            for other_set in known_sets.iter().rev() {
                intersect.retain(|item| other_set.contains(item));
                if intersect.is_empty() {
                    break;
                }
            }
            intersect
        });

        if unknown_words.is_empty() {
            if let Some(candidates) = &pool {
                let mut results: Vec<&'a str> = candidates
                    .iter()
                    // SAFETY: each pointer comes from a `&'a str` passed to
                    // `new`, which outlives `self`.
                    .map(|&item| unsafe { &*item })
                    .collect();

                // Partial selection first so only `limit` entries get sorted.
                if results.len() > limit {
                    results.select_nth_unstable_by_key(limit, |item| item.len());
                    results.truncate(limit);
                }
                results.sort_unstable_by_key(|item| item.len());
                return results;
            }
        }

        // Score candidates. Names from the exact-word pool start at 1.
        let some_pool = pool.is_some();
        let mut scores: FxHashMap<*const str, usize> = FxHashMap::default();
        scores.reserve(256);
        if let Some(pool) = &pool {
            scores.extend(pool.iter().map(|&item| (item, 1)));
        }

        let mut trigram_count = 0;
        'outer: for word in unknown_words {
            let mut chars = word.chars();
            // Work on chars rather than bytes: a multi-byte word can be three
            // bytes long yet hold a single char, which must not panic here.
            let (Some(mut a), Some(mut b)) = (chars.next(), chars.next()) else {
                continue;
            };

            for c in chars {
                if trigram_count >= MAX_TRIGRAMS {
                    break 'outer;
                }
                trigram_count += 1;

                let trigram = [a, b, c];
                // Slide the window before the lookup so that a missing trigram
                // does not leave `a`/`b` stale for the following ones.
                a = b;
                b = c;

                let Some(items) = self.trigram_index.get(&trigram) else {
                    continue;
                };

                if some_pool {
                    // Only names already in the exact-word pool may score.
                    for item in items {
                        if let Some(score) = scores.get_mut(item) {
                            *score += 1;
                        }
                    }
                } else {
                    for &item in items {
                        // SAFETY: see the pointer invariant stated above.
                        let len = unsafe { &*item }.len();
                        if len >= min_len {
                            *scores.entry(item).or_default() += 1;
                        }
                    }
                }
            }
        }

        // Keep names that matched at least half of the looked-up trigrams.
        let min_score = trigram_count.div_ceil(2);
        let mut results: Vec<(&'a str, usize)> = scores
            .into_iter()
            .filter(|(_, score)| *score >= min_score)
            // SAFETY: see the pointer invariant stated above.
            .map(|(item, score)| (unsafe { &*item }, score))
            .collect();

        if results.len() > limit {
            results.select_nth_unstable_by(limit, |a, b| {
                b.1.cmp(&a.1).then_with(|| a.0.len().cmp(&b.0.len()))
            });
            results.truncate(limit);
        }
        results.sort_unstable_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.len().cmp(&b.0.len())));

        results.into_iter().map(|(item, _)| item).collect()
    }
}
+26 -2
View File
@@ -7,7 +7,10 @@ use brk_traversable::{Traversable, TreeNode};
use derive_deref::{Deref, DerefMut};
use vecdb::AnyCollectableVec;
use crate::pagination::{PaginatedIndexParam, PaginatedMetrics, PaginationParam};
use crate::{
pagination::{PaginatedIndexParam, PaginatedMetrics, PaginationParam},
searcher::NgramSearcher,
};
#[derive(Default)]
pub struct Vecs<'a> {
@@ -17,7 +20,9 @@ pub struct Vecs<'a> {
pub indexes: Vec<IndexInfo>,
pub distinct_metric_count: usize,
pub total_metric_count: usize,
pub catalog: Option<TreeNode>,
pub longest_metric_len: usize,
catalog: Option<TreeNode>,
searcher: Option<NgramSearcher<'a>>,
metric_to_indexes: BTreeMap<&'a str, Vec<Index>>,
index_to_metrics: BTreeMap<Index, Vec<&'a str>>,
}
@@ -55,6 +60,12 @@ impl<'a> Vecs<'a> {
sort_ids(&mut ids);
this.metrics = ids;
this.longest_metric_len = this
.metrics
.iter()
.map(|s| s.len())
.max()
.unwrap_or_default();
this.distinct_metric_count = this.metric_to_index_to_vec.keys().count();
this.total_metric_count = this
.index_to_metric_to_vec
@@ -95,6 +106,7 @@ impl<'a> Vecs<'a> {
.simplify()
.unwrap(),
);
this.searcher = Some(NgramSearcher::new(&this.metrics));
this
}
@@ -157,6 +169,18 @@ impl<'a> Vecs<'a> {
vec.iter().skip(start).take(end).cloned().collect()
}
/// Returns the metrics catalog tree.
///
/// # Panics
///
/// Panics if the catalog has not been populated, which is the case for a
/// `Vecs` obtained through `Default` alone.
pub fn catalog(&self) -> &TreeNode {
    self.catalog
        .as_ref()
        .expect("Vecs::catalog must be initialised before it is read")
}
/// Searches the known metric names for `metric`, returning at most `limit`
/// matches.
///
/// Forwards to the internal `NgramSearcher`; panics if that searcher has not
/// been initialised (see `searcher`).
pub fn search(&self, metric: &str, limit: usize) -> Vec<&'_ str> {
self.searcher().search(metric, limit)
}
/// Returns the metric-name searcher.
///
/// # Panics
///
/// Panics if the searcher has not been populated, which is the case for a
/// `Vecs` obtained through `Default` alone.
fn searcher(&self) -> &NgramSearcher<'_> {
    self.searcher
        .as_ref()
        .expect("Vecs::searcher must be initialised before it is read")
}
}
#[derive(Default, Deref, DerefMut)]