From 637c1d44b337a64391c95c90d1012f60a475a427 Mon Sep 17 00:00:00 2001 From: Emma Forman Ling Date: Wed, 22 May 2024 16:55:40 -0700 Subject: [PATCH] TermMetadata RPC (#26203) This PR adds a new trait `TermMetadataFetcher` that will get the term metadata for a segment given term values and the number of documents containing that term that were deleted. This PR doesn't hook it up to the `build_new_segment` code, but the end goal is to get rid of `TermDictionary` lookups in backend. The searcher downloads the relevant segment, gets term ordinals based on the term value, and counts the number of terms that have been completely deleted (num_deleted_docs = doc frequency in the segment). GitOrigin-RevId: f911381c2b17272575c1bc672343e15d5c5f2723 --- crates/pb/protos/searchlight.proto | 22 ++++ crates/search/src/searcher/mod.rs | 3 + crates/search/src/searcher/searcher.rs | 144 ++++++++++++++++++++++++- 3 files changed, 168 insertions(+), 1 deletion(-) diff --git a/crates/pb/protos/searchlight.proto b/crates/pb/protos/searchlight.proto index 2d6aa795..0505314e 100644 --- a/crates/pb/protos/searchlight.proto +++ b/crates/pb/protos/searchlight.proto @@ -12,6 +12,7 @@ service Searchlight { rpc QueueVectorPrefetch(VectorPrefetchRequest) returns (VectorPrefetchResponse); rpc NumberOfSegments(SegmentRequest) returns (SegmentResponse); + rpc SegmentTermMetadata(SegmentTermMetadataRequest) returns (SegmentTermMetadataResponse); // Query a set of tokens against the term dictionary, optionally allowing // for fuzzy matching and prefix matching. Take the top `K` results with @@ -225,6 +226,27 @@ message SegmentResponse { uint32 number_of_segments = 1; } +message SegmentTermMetadataRequest { + StorageType storage_type = 1; + StorageKey segment = 2; + repeated TermValueDeleteCount term_values_and_delete_counts = 3; +} + +message TermValueDeleteCount { + optional bytes term_value = 1; + optional uint32 num_docs_deleted = 2; +} + +message SegmentTermMetadataResponse { + repeated TermOrdDeleteCount term_ords_and_delete_counts = 1; + optional uint64 num_terms_deleted = 2; +} + +message TermOrdDeleteCount { + optional uint64 term_ord = 1; + optional uint32 num_docs_deleted = 2; +} + message QueryTokensRequest { StorageType storage_type = 1; FragmentedTextSegmentPaths segment = 2; diff --git a/crates/search/src/searcher/mod.rs b/crates/search/src/searcher/mod.rs index e7a69fac..9484d861 100644 --- a/crates/search/src/searcher/mod.rs +++ b/crates/search/src/searcher/mod.rs @@ -16,7 +16,10 @@ pub use searcher::{ PostingListQuery, Searcher, SearcherImpl, + SegmentTermMetadata, + SegmentTermMetadataFetcher, Term, + TermValue, TextStorageKeys, TokenMatch, TokenQuery, diff --git a/crates/search/src/searcher/searcher.rs b/crates/search/src/searcher/searcher.rs index 2a38a8e0..60e25bb2 100644 --- a/crates/search/src/searcher/searcher.rs +++ b/crates/search/src/searcher/searcher.rs @@ -46,8 +46,10 @@ use pb::searchlight::{ FragmentedVectorSegmentPaths, MultiSegmentMetadata, QueryBm25StatsResponse, + SegmentTermMetadataResponse, SingleSegmentMetadata, StorageKey, + TermOrdDeleteCount, }; use storage::Storage; pub use tantivy::Term; @@ -61,6 +63,7 @@ use tantivy::{ EnableScoring, }, schema::Field, + termdict::TermOrdinal, SegmentReader, }; use text_search::tracker::{ @@ -128,6 +131,7 @@ use crate::{ TantivySearchIndexSchema, CREATION_TIME_FIELD_NAME, INTERNAL_ID_FIELD_NAME, + SEARCH_FIELD_ID, TS_FIELD_NAME, }; @@ -177,6 +181,87 @@ pub trait Searcher: VectorSearcher + Send + Sync + 'static { ) -> anyhow::Result>; } +#[cfg_attr( + any(test, feature = "testing"), + derive(proptest_derive::Arbitrary, PartialEq, Debug, Clone) +)] +/// Metadata about terms for a specific segment. +pub struct SegmentTermMetadata { + /// The number of documents containing the term that have been deleted, by + /// term ordinal. + pub term_documents_deleted: BTreeMap, + /// The number of terms that have been completely deleted from the segment. + pub num_terms_deleted: u64, +} + +impl TryFrom for SegmentTermMetadata { + type Error = anyhow::Error; + + fn try_from( + SegmentTermMetadataResponse { + term_ords_and_delete_counts, + num_terms_deleted, + }: SegmentTermMetadataResponse, + ) -> Result { + let term_documents_deleted = term_ords_and_delete_counts + .into_iter() + .map( + |TermOrdDeleteCount { + term_ord, + num_docs_deleted, + }| { + let term_ord = term_ord.context("Missing term ord")?; + let num_docs_deleted = num_docs_deleted.context("Missing term delete count")?; + anyhow::Ok::<(TermOrdinal, u32)>((term_ord, num_docs_deleted)) + }, + ) + .try_collect()?; + let num_terms_deleted = num_terms_deleted.context("Missing num terms deleted")?; + Ok(SegmentTermMetadata { + term_documents_deleted, + num_terms_deleted, + }) + } +} + +impl From for SegmentTermMetadataResponse { + fn from( + SegmentTermMetadata { + term_documents_deleted, + num_terms_deleted, + }: SegmentTermMetadata, + ) -> Self { + let term_ords_and_delete_counts = term_documents_deleted + .into_iter() + .map(|(term_ord, num_docs_deleted)| TermOrdDeleteCount { + term_ord: Some(term_ord), + num_docs_deleted: Some(num_docs_deleted), + }) + .collect(); + SegmentTermMetadataResponse { + term_ords_and_delete_counts, + num_terms_deleted: Some(num_terms_deleted), + } + } +} + +/// The value of a tantivy `Term`, should only be constructed from +/// `term.value_bytes()` or protos that contain the same bytes. +pub type TermValue = Vec; + +#[async_trait] +pub trait SegmentTermMetadataFetcher { + /// Gets the term ordinal from term values and determines how many terms + /// have been completely deleted from a segment, given the number of + /// documents deleted containing each term. + async fn segment_term_metadata( + &self, + search_storage: Arc, + segment: ObjectKey, + terms: BTreeMap, + ) -> anyhow::Result; +} + pub struct SearcherImpl { pub(crate) archive_cache: ArchiveCacheManager, segment_cache: SegmentCache, @@ -535,6 +620,43 @@ impl Searcher for SearcherImpl { } } +#[async_trait] +impl SegmentTermMetadataFetcher for SearcherImpl { + async fn segment_term_metadata( + &self, + search_storage: Arc, + segment: ObjectKey, + terms: BTreeMap, + ) -> anyhow::Result { + let segment_path = self + .archive_cache + .get(search_storage, &segment, SearchFileType::Text) + .await?; + let reader = index_reader_for_directory(segment_path)?; + let searcher = reader.searcher(); + // Multisegment indexes only write to one segment. + let segment = searcher.segment_reader(0); + let inverted_index = segment.inverted_index(Field::from_field_id(SEARCH_FIELD_ID))?; + let term_dict = inverted_index.terms(); + let mut term_documents_deleted = BTreeMap::new(); + let mut num_terms_deleted = 0; + for (term, num_documents_deleted) in terms { + let term_ord = term_dict + .term_ord(term)? + .context("Segment must contain term")?; + let doc_freq = term_dict.term_info_from_ord(term_ord).doc_freq; + if doc_freq == num_documents_deleted { + num_terms_deleted += 1; + } + term_documents_deleted.insert(term_ord, num_documents_deleted); + } + Ok(SegmentTermMetadata { + term_documents_deleted, + num_terms_deleted, + }) + } +} + #[async_trait] impl VectorSearcher for SearcherImpl { async fn execute_multi_segment_vector_query( @@ -1399,6 +1521,12 @@ mod tests { types::Timestamp, }; use futures::StreamExt; + use pb::searchlight::SegmentTermMetadataResponse; + use proptest::{ + arbitrary::any, + prelude::*, + proptest, + }; use runtime::testing::TestRuntime; use tantivy::{ Index, @@ -1411,13 +1539,17 @@ mod tests { }; use value::{ assert_obj, + testing::assert_roundtrips, FieldPath, InternalId, ResolvedDocumentId, TabletIdAndTableNumber, }; - use super::PostingListMatch; + use super::{ + PostingListMatch, + SegmentTermMetadata, + }; use crate::{ convex_query::OrTerm, disk_index::index_reader_for_directory, @@ -2036,4 +2168,14 @@ mod tests { id_tracker: StaticIdTracker::load_from_path(paths.id_tracker_path.clone())?, }) } + + proptest! { + #![proptest_config( + ProptestConfig { failure_persistence: None, ..ProptestConfig::default() } + )] + #[test] + fn term_metadata_roundtrips(term_metadata in any::()) { + assert_roundtrips::(term_metadata); + } + } }