Skip to content

Commit

Permalink
cleanup: make support for group_id and group_tracking_id filters more clear
Browse files Browse the repository at this point in the history
  • Loading branch information
skeptrunedev authored and cdxker committed Dec 4, 2024
1 parent d4e9e30 commit f603016
Show file tree
Hide file tree
Showing 8 changed files with 118 additions and 20 deletions.
66 changes: 61 additions & 5 deletions clients/ts-sdk/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -7188,7 +7188,7 @@
},
"ChunkFilter": {
"type": "object",
"description": "Filters is a JSON object which can be used to filter chunks. This is useful for when you want to filter chunks by arbitrary metadata. Unlike with tag filtering, there is a performance hit for filtering on metadata.",
"description": "ChunkFilter is a JSON object which can be used to filter chunks. This is useful for when you want to filter chunks by arbitrary metadata. Unlike with tag filtering, there is a performance hit for filtering on metadata.",
"properties": {
"must": {
"type": "array",
Expand Down Expand Up @@ -8097,9 +8097,10 @@
"$ref": "#/components/schemas/FieldCondition"
},
{
"$ref": "#/components/schemas/HasIDCondition"
"$ref": "#/components/schemas/HasChunkIDCondition"
}
]
],
"description": "Filters can be constructed using fields on the chunk objects, ids or tracking ids of chunks, or ids or tracking ids of groups."
},
"ContentChunkMetadata": {
"type": "object",
Expand Down Expand Up @@ -8791,6 +8792,14 @@
],
"nullable": true
},
"sort_options": {
"allOf": [
{
"$ref": "#/components/schemas/SortOptions"
}
],
"nullable": true
},
"topic_id": {
"type": "string",
"format": "uuid",
Expand Down Expand Up @@ -9508,6 +9517,14 @@
],
"nullable": true
},
"sort_options": {
"allOf": [
{
"$ref": "#/components/schemas/SortOptions"
}
],
"nullable": true
},
"topic_id": {
"type": "string",
"format": "uuid",
Expand Down Expand Up @@ -10176,6 +10193,7 @@
},
"FieldCondition": {
"type": "object",
"description": "FieldCondition is a JSON object which can be used to filter chunks by a field. This is useful for when you want to filter chunks by arbitrary metadata. To access fields inside of the metadata that you provide with the card, prefix the field name with `metadata.`.",
"required": [
"field"
],
Expand All @@ -10190,7 +10208,7 @@
},
"field": {
"type": "string",
"description": "Field is the name of the field to filter on. The field value will be used to check for an exact substring match on the metadata values for each existing chunk. This is useful for when you want to filter chunks by arbitrary metadata. To access fields inside of the metadata that you provide with the card, prefix the field name with `metadata.`."
"description": "Field is the name of the field to filter on. Commonly used fields are `timestamp`, `link`, `tag_set`, `location`, `num_value`, `group_ids`, and `group_tracking_ids`. The field value will be used to check for an exact substring match on the metadata values for each existing chunk. This is useful for when you want to filter chunks by arbitrary metadata. To access fields inside of the metadata that you provide with the card, prefix the field name with `metadata.`."
},
"geo_bounding_box": {
"allOf": [
Expand Down Expand Up @@ -10976,22 +10994,25 @@
}
}
},
"HasIDCondition": {
"HasChunkIDCondition": {
"type": "object",
"description": "HasChunkIDCondition is a JSON object which can be used to filter chunks by their ids or tracking ids. This is useful for when you want to include or exclude a specific, known set of chunks.",
"properties": {
"ids": {
"type": "array",
"items": {
"type": "string",
"format": "uuid"
},
"description": "Ids of the chunks to apply a match_any condition with. Only chunks with one of these ids will be returned.",
"nullable": true
},
"tracking_ids": {
"type": "array",
"items": {
"type": "string"
},
"description": "Tracking ids of the chunks to apply a match_any condition with. Only chunks with one of these tracking ids will be returned.",
"nullable": true
}
}
Expand Down Expand Up @@ -11441,6 +11462,25 @@
"updated_at": "2021-01-01 00:00:00.000"
}
},
"MmrOptions": {
"type": "object",
"description": "MMR Options lets you specify different methods to rerank the chunks in the result set using Maximal Marginal Relevance. If not specified, this defaults to the score of the chunks.",
"required": [
"use_mmr"
],
"properties": {
"mmr_lambda": {
"type": "number",
"format": "float",
"description": "Set mmr_lambda to a value between 0.0 and 1.0 to control the tradeoff between relevance and diversity. Closer to 1.0 will give more diverse results, closer to 0.0 will give more relevant results. If not specified, this defaults to 0.5.",
"nullable": true
},
"use_mmr": {
"type": "boolean",
"description": "Set use_mmr to true to use the Maximal Marginal Relevance algorithm to rerank the results."
}
}
},
"MultiQuery": {
"type": "object",
"description": "MultiQuery allows you to construct a dense vector from multiple queries with a weighted sum. This is useful for when you want to emphasize certain features of the query. This only works with Semantic Search and is not compatible with cross encoder re-ranking or highlights.",
Expand Down Expand Up @@ -13135,6 +13175,14 @@
],
"nullable": true
},
"sort_options": {
"allOf": [
{
"$ref": "#/components/schemas/SortOptions"
}
],
"nullable": true
},
"topic_id": {
"type": "string",
"format": "uuid",
Expand Down Expand Up @@ -15036,6 +15084,14 @@
],
"nullable": true
},
"mmr": {
"allOf": [
{
"$ref": "#/components/schemas/MmrOptions"
}
],
"nullable": true
},
"recency_bias": {
"type": "number",
"format": "float",
Expand Down
2 changes: 1 addition & 1 deletion clients/ts-sdk/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"files": [
"dist"
],
"version": "0.0.36",
"version": "0.0.37",
"license": "MIT",
"scripts": {
"lint": "eslint 'src/**/*.ts'",
Expand Down
41 changes: 37 additions & 4 deletions clients/ts-sdk/src/types.gen.ts
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ export type ChatMessageProxy = {
};

/**
* Filters is a JSON object which can be used to filter chunks. This is useful for when you want to filter chunks by arbitrary metadata. Unlike with tag filtering, there is a performance hit for filtering on metadata.
* ChunkFilter is a JSON object which can be used to filter chunks. This is useful for when you want to filter chunks by arbitrary metadata. Unlike with tag filtering, there is a performance hit for filtering on metadata.
*/
export type ChunkFilter = {
/**
Expand Down Expand Up @@ -485,7 +485,10 @@ export type ClusterAnalyticsFilter = {

export type ClusterAnalyticsResponse = SearchClusterResponse | SearchQueryResponse;

export type ConditionType = FieldCondition | HasIDCondition;
/**
* Filters can be constructed using fields on the chunk objects, ids or tracking ids of chunks, or ids or tracking ids of groups.
*/
export type ConditionType = FieldCondition | HasChunkIDCondition;

export type ContentChunkMetadata = {
chunk_html?: (string) | null;
Expand Down Expand Up @@ -720,6 +723,7 @@ export type CreateMessageReqPayload = {
*/
search_query?: (string) | null;
search_type?: ((SearchMethod) | null);
sort_options?: ((SortOptions) | null);
/**
* The ID of the topic to attach the message to.
*/
Expand Down Expand Up @@ -1033,6 +1037,7 @@ export type EditMessageReqPayload = {
*/
search_query?: (string) | null;
search_type?: ((SearchMethod) | null);
sort_options?: ((SortOptions) | null);
/**
* The id of the topic to edit the message at the given sort order for.
*/
Expand Down Expand Up @@ -1329,10 +1334,13 @@ export type event_type = 'view';

export type EventTypesFilter = 'add_to_cart' | 'purchase' | 'view' | 'click' | 'filter_clicked';

/**
* FieldCondition is a JSON object which can be used to filter chunks by a field. This is useful for when you want to filter chunks by arbitrary metadata. To access fields inside of the metadata that you provide with the card, prefix the field name with `metadata.`.
*/
export type FieldCondition = {
date_range?: ((DateRange) | null);
/**
* Field is the name of the field to filter on. The field value will be used to check for an exact substring match on the metadata values for each existing chunk. This is useful for when you want to filter chunks by arbitrary metadata. To access fields inside of the metadata that you provide with the card, prefix the field name with `metadata.`.
* Field is the name of the field to filter on. Commonly used fields are `timestamp`, `link`, `tag_set`, `location`, `num_value`, `group_ids`, and `group_tracking_ids`. The field value will be used to check for an exact substring match on the metadata values for each existing chunk. This is useful for when you want to filter chunks by arbitrary metadata. To access fields inside of the metadata that you provide with the card, prefix the field name with `metadata.`.
*/
field: string;
geo_bounding_box?: ((LocationBoundingBox) | null);
Expand Down Expand Up @@ -1597,8 +1605,17 @@ export type GroupsForChunk = {
slim_groups: Array<ChunkGroupAndFileId>;
};

export type HasIDCondition = {
/**
* HasChunkIDCondition is a JSON object which can be used to filter chunks by their ids or tracking ids. This is useful for when you want to include or exclude a specific, known set of chunks.
*/
export type HasChunkIDCondition = {
/**
* Ids of the chunks to apply a match_any condition with. Only chunks with one of these ids will be returned.
*/
ids?: Array<(string)> | null;
/**
* Tracking ids of the chunks to apply a match_any condition with. Only chunks with one of these tracking ids will be returned.
*/
tracking_ids?: Array<(string)> | null;
};

Expand Down Expand Up @@ -1777,6 +1794,20 @@ export type Message = {
updated_at: string;
};

/**
* MMR Options lets you specify different methods to rerank the chunks in the result set using Maximal Marginal Relevance. If not specified, this defaults to the score of the chunks.
*/
export type MmrOptions = {
/**
* Set mmr_lambda to a value between 0.0 and 1.0 to control the tradeoff between relevance and diversity. Closer to 1.0 will give more diverse results, closer to 0.0 will give more relevant results. If not specified, this defaults to 0.5.
*/
mmr_lambda?: (number) | null;
/**
* Set use_mmr to true to use the Maximal Marginal Relevance algorithm to rerank the results.
*/
use_mmr: boolean;
};

/**
* MultiQuery allows you to construct a dense vector from multiple queries with a weighted sum. This is useful for when you want to emphasize certain features of the query. This only works with Semantic Search and is not compatible with cross encoder re-ranking or highlights.
*/
Expand Down Expand Up @@ -2218,6 +2249,7 @@ export type RegenerateMessageReqPayload = {
*/
search_query?: (string) | null;
search_type?: ((SearchMethod) | null);
sort_options?: ((SortOptions) | null);
/**
* The id of the topic to regenerate the last message for.
*/
Expand Down Expand Up @@ -2723,6 +2755,7 @@ export type SortBySearchType = {
*/
export type SortOptions = {
location_bias?: ((GeoInfoWithBias) | null);
mmr?: ((MmrOptions) | null);
/**
* Recency Bias lets you determine how much of an effect the recency of chunks will have on the search results. If not specified, this defaults to 0.0. We recommend setting this to 1.0 for a gentle reranking of the results, >3.0 for a strong reranking of the results.
*/
Expand Down
2 changes: 2 additions & 0 deletions frontends/search/src/components/FilterModal.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,8 @@ export const FilterItem = (props: FilterItemProps) => {
"location",
"metadata",
"num_value",
"group_tracking_ids",
"group_ids",
"tracking_ids",
"ids",
]}
Expand Down
13 changes: 10 additions & 3 deletions server/src/data/models.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4204,14 +4204,20 @@ pub struct LocationPolygon {
#[allow(clippy::large_enum_variant)]
#[derive(Serialize, Deserialize, Debug, Clone, ToSchema)]
#[serde(untagged)]
/// Filters can be constructed using fields on the chunk objects, ids or tracking ids of chunks, or ids or tracking ids of groups.
pub enum ConditionType {
#[schema(title = "FieldCondition")]
Field(FieldCondition),
HasID(HasIDCondition),
#[schema(title = "HasChunkIDCondition")]
HasChunkId(HasChunkIDCondition),
}

#[derive(Serialize, Deserialize, Debug, Clone, ToSchema)]
pub struct HasIDCondition {
/// HasChunkIDCondition is a JSON object which can be used to filter chunks by their ids or tracking ids. This is useful for when you want to include or exclude a specific, known set of chunks.
pub struct HasChunkIDCondition {
/// Ids of the chunks to apply a match_any condition with. Only chunks with one of these ids will be returned.
pub ids: Option<Vec<uuid::Uuid>>,
/// Tracking ids of the chunks to apply a match_any condition with. Only chunks with one of these tracking ids will be returned.
pub tracking_ids: Option<Vec<String>>,
}

Expand All @@ -4226,8 +4232,9 @@ pub struct HasIDCondition {
"lt": 1.0
}
}))]
/// FieldCondition is a JSON object which can be used to filter chunks by a field. This is useful for when you want to filter chunks by arbitrary metadata. To access fields inside of the metadata that you provide with the card, prefix the field name with `metadata.`.
pub struct FieldCondition {
/// Field is the name of the field to filter on. The field value will be used to check for an exact substring match on the metadata values for each existing chunk. This is useful for when you want to filter chunks by arbitrary metadata. To access fields inside of the metadata that you provide with the card, prefix the field name with `metadata.`.
/// Field is the name of the field to filter on. Commonly used fields are `timestamp`, `link`, `tag_set`, `location`, `num_value`, `group_ids`, and `group_tracking_ids`. The field value will be used to check for an exact substring match on the metadata values for each existing chunk. This is useful for when you want to filter chunks by arbitrary metadata. To access fields inside of the metadata that you provide with the card, prefix the field name with `metadata.`.
pub field: String,
/// Match any lets you pass in an array of values that will return results if any of the items match. The match value will be used to check for an exact substring match on the metadata values for each existing chunk. If both match_all and match_any are provided, the match_any condition will be used.
#[serde(alias = "match")]
Expand Down
2 changes: 1 addition & 1 deletion server/src/handlers/chunk_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -948,7 +948,7 @@ pub async fn update_chunk_by_tracking_id(
}
]
}))]
/// Filters is a JSON object which can be used to filter chunks. This is useful for when you want to filter chunks by arbitrary metadata. Unlike with tag filtering, there is a performance hit for filtering on metadata.
/// ChunkFilter is a JSON object which can be used to filter chunks. This is useful for when you want to filter chunks by arbitrary metadata. Unlike with tag filtering, there is a performance hit for filtering on metadata.
pub struct ChunkFilter {
/// Only one of these field conditions has to match for the chunk to be included in the result set.
pub should: Option<Vec<ConditionType>>,
Expand Down
2 changes: 1 addition & 1 deletion server/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -519,7 +519,7 @@ impl Modify for SecurityAddon {
data::models::ContentChunkMetadata,
data::models::ChunkMetadataStringTagSet,
data::models::ConditionType,
data::models::HasIDCondition,
data::models::HasChunkIDCondition,
data::models::DistanceMetric,
data::models::PublicDatasetOptions,
data::models::Invitation,
Expand Down
10 changes: 5 additions & 5 deletions server/src/operators/search_operator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ use super::typo_operator::correct_query;
use crate::data::models::{
convert_to_date_time, ChunkGroup, ChunkGroupAndFileId, ChunkMetadata,
ChunkMetadataStringTagSet, ChunkMetadataTypes, ConditionType, ContentChunkMetadata, Dataset,
DatasetConfiguration, HasIDCondition, MmrOptions, QdrantChunkMetadata, QdrantSortBy,
DatasetConfiguration, HasChunkIDCondition, MmrOptions, QdrantChunkMetadata, QdrantSortBy,
QueryTypes, ReRankOptions, RedisPool, ScoreChunk, ScoreChunkDTO, SearchMethod,
SlimChunkMetadata, SortByField, SortBySearchType, SortOptions, UnifiedId,
};
Expand Down Expand Up @@ -157,7 +157,7 @@ async fn convert_group_tracking_ids_to_group_ids(
}

pub async fn get_qdrant_ids_from_condition(
cond: HasIDCondition,
cond: HasChunkIDCondition,
pool: web::Data<Pool>,
) -> Result<Vec<String>, ServiceError> {
if let Some(ids) = cond.ids {
Expand Down Expand Up @@ -220,7 +220,7 @@ pub async fn assemble_qdrant_filter(
filter.should.push(condition);
}
}
ConditionType::HasID(cond) => {
ConditionType::HasChunkId(cond) => {
filter.should.push(Condition::has_id(
get_qdrant_ids_from_condition(cond, pool.clone()).await?,
));
Expand All @@ -245,7 +245,7 @@ pub async fn assemble_qdrant_filter(
filter.must.push(condition);
}
}
ConditionType::HasID(cond) => {
ConditionType::HasChunkId(cond) => {
filter.must.push(Condition::has_id(
get_qdrant_ids_from_condition(cond, pool.clone()).await?,
));
Expand All @@ -270,7 +270,7 @@ pub async fn assemble_qdrant_filter(
filter.must_not.push(condition);
}
}
ConditionType::HasID(cond) => {
ConditionType::HasChunkId(cond) => {
filter.must_not.push(Condition::has_id(
get_qdrant_ids_from_condition(cond, pool.clone()).await?,
));
Expand Down

0 comments on commit f603016

Please sign in to comment.