cleanup: make support for group_id and group_tracking_id filters more clear
devflowinc · Dec 4, 2024 · f603016 · f603016
1 parent d4e9e30
commit f603016
Show file tree

Hide file tree

Showing 8 changed files with 118 additions and 20 deletions.
diff --git a/clients/ts-sdk/openapi.json b/clients/ts-sdk/openapi.json
@@ -7188,7 +7188,7 @@
       },
       "ChunkFilter": {
         "type": "object",
-        "description": "Filters is a JSON object which can be used to filter chunks. This is useful for when you want to filter chunks by arbitrary metadata. Unlike with tag filtering, there is a performance hit for filtering on metadata.",
+        "description": "ChunkFilter is a JSON object which can be used to filter chunks. This is useful for when you want to filter chunks by arbitrary metadata. Unlike with tag filtering, there is a performance hit for filtering on metadata.",
         "properties": {
           "must": {
             "type": "array",
@@ -8097,9 +8097,10 @@
             "$ref": "#/components/schemas/FieldCondition"
           },
           {
-            "$ref": "#/components/schemas/HasIDCondition"
+            "$ref": "#/components/schemas/HasChunkIDCondition"
           }
-        ]
+        ],
+        "description": "Filters can be constructed using either fields on the chunk objects, ids or tracking ids of chunks, and finally ids or tracking ids of groups."
       },
       "ContentChunkMetadata": {
         "type": "object",
@@ -8791,6 +8792,14 @@
             ],
             "nullable": true
           },
+          "sort_options": {
+            "allOf": [
+              {
+                "$ref": "#/components/schemas/SortOptions"
+              }
+            ],
+            "nullable": true
+          },
           "topic_id": {
             "type": "string",
             "format": "uuid",
@@ -9508,6 +9517,14 @@
             ],
             "nullable": true
           },
+          "sort_options": {
+            "allOf": [
+              {
+                "$ref": "#/components/schemas/SortOptions"
+              }
+            ],
+            "nullable": true
+          },
           "topic_id": {
             "type": "string",
             "format": "uuid",
@@ -10176,6 +10193,7 @@
       },
       "FieldCondition": {
         "type": "object",
+        "description": "FieldCondition is a JSON object which can be used to filter chunks by a field. This is useful for when you want to filter chunks by arbitrary metadata. To access fields inside of the metadata that you provide with the card, prefix the field name with `metadata.`.",
         "required": [
           "field"
         ],
@@ -10190,7 +10208,7 @@
           },
           "field": {
             "type": "string",
-            "description": "Field is the name of the field to filter on. The field value will be used to check for an exact substring match on the metadata values for each existing chunk. This is useful for when you want to filter chunks by arbitrary metadata. To access fields inside of the metadata that you provide with the card, prefix the field name with `metadata.`."
+            "description": "Field is the name of the field to filter on. Commonly used fields are `timestamp`, `link`, `tag_set`, `location`, `num_value`, `group_ids`, and `group_tracking_ids`. The field value will be used to check for an exact substring match on the metadata values for each existing chunk. This is useful for when you want to filter chunks by arbitrary metadata. To access fields inside of the metadata that you provide with the card, prefix the field name with `metadata.`."
           },
           "geo_bounding_box": {
             "allOf": [
@@ -10976,22 +10994,25 @@
           }
         }
       },
-      "HasIDCondition": {
+      "HasChunkIDCondition": {
         "type": "object",
+        "description": "HasChunkIDCondition is a JSON object which can be used to filter chunks by their ids or tracking ids. This is useful for when you want to filter chunks by their ids or tracking ids.",
         "properties": {
           "ids": {
             "type": "array",
             "items": {
               "type": "string",
               "format": "uuid"
             },
+            "description": "Ids of the chunks to apply a match_any condition with. Only chunks with one of these ids will be returned.",
             "nullable": true
           },
           "tracking_ids": {
             "type": "array",
             "items": {
               "type": "string"
             },
+            "description": "Tracking ids of the chunks to apply a match_any condition with. Only chunks with one of these tracking ids will be returned.",
             "nullable": true
           }
         }
@@ -11441,6 +11462,25 @@
           "updated_at": "2021-01-01 00:00:00.000"
         }
       },
+      "MmrOptions": {
+        "type": "object",
+        "description": "MMR Options lets you specify different methods to rerank the chunks in the result set using Maximal Marginal Relevance. If not specified, this defaults to the score of the chunks.",
+        "required": [
+          "use_mmr"
+        ],
+        "properties": {
+          "mmr_lambda": {
+            "type": "number",
+            "format": "float",
+            "description": "Set mmr_lambda to a value between 0.0 and 1.0 to control the tradeoff between relevance and diversity. Closer to 1.0 will give more diverse results, closer to 0.0 will give more relevant results. If not specified, this defaults to 0.5.",
+            "nullable": true
+          },
+          "use_mmr": {
+            "type": "boolean",
+            "description": "Set use_mmr to true to use the Maximal Marginal Relevance algorithm to rerank the results."
+          }
+        }
+      },
       "MultiQuery": {
         "type": "object",
         "description": "MultiQuery allows you to construct a dense vector from multiple queries with a weighted sum. This is useful for when you want to emphasize certain features of the query. This only works with Semantic Search and is not compatible with cross encoder re-ranking or highlights.",
@@ -13135,6 +13175,14 @@
             ],
             "nullable": true
           },
+          "sort_options": {
+            "allOf": [
+              {
+                "$ref": "#/components/schemas/SortOptions"
+              }
+            ],
+            "nullable": true
+          },
           "topic_id": {
             "type": "string",
             "format": "uuid",
@@ -15036,6 +15084,14 @@
             ],
             "nullable": true
           },
+          "mmr": {
+            "allOf": [
+              {
+                "$ref": "#/components/schemas/MmrOptions"
+              }
+            ],
+            "nullable": true
+          },
           "recency_bias": {
             "type": "number",
             "format": "float",

diff --git a/clients/ts-sdk/package.json b/clients/ts-sdk/package.json
@@ -6,7 +6,7 @@
   "files": [
     "dist"
   ],
-  "version": "0.0.36",
+  "version": "0.0.37",
   "license": "MIT",
   "scripts": {
     "lint": "eslint 'src/**/*.ts'",

diff --git a/clients/ts-sdk/src/types.gen.ts b/clients/ts-sdk/src/types.gen.ts
@@ -206,7 +206,7 @@ export type ChatMessageProxy = {
 };
 
 /**
- * Filters is a JSON object which can be used to filter chunks. This is useful for when you want to filter chunks by arbitrary metadata. Unlike with tag filtering, there is a performance hit for filtering on metadata.
+ * ChunkFilter is a JSON object which can be used to filter chunks. This is useful for when you want to filter chunks by arbitrary metadata. Unlike with tag filtering, there is a performance hit for filtering on metadata.
  */
 export type ChunkFilter = {
     /**
@@ -485,7 +485,10 @@ export type ClusterAnalyticsFilter = {
 
 export type ClusterAnalyticsResponse = SearchClusterResponse | SearchQueryResponse;
 
-export type ConditionType = FieldCondition | HasIDCondition;
+/**
+ * Filters can be constructed using either fields on the chunk objects, ids or tracking ids of chunks, and finally ids or tracking ids of groups.
+ */
+export type ConditionType = FieldCondition | HasChunkIDCondition;
 
 export type ContentChunkMetadata = {
     chunk_html?: (string) | null;
@@ -720,6 +723,7 @@ export type CreateMessageReqPayload = {
      */
     search_query?: (string) | null;
     search_type?: ((SearchMethod) | null);
+    sort_options?: ((SortOptions) | null);
     /**
      * The ID of the topic to attach the message to.
      */
@@ -1033,6 +1037,7 @@ export type EditMessageReqPayload = {
      */
     search_query?: (string) | null;
     search_type?: ((SearchMethod) | null);
+    sort_options?: ((SortOptions) | null);
     /**
      * The id of the topic to edit the message at the given sort order for.
      */
@@ -1329,10 +1334,13 @@ export type event_type = 'view';
 
 export type EventTypesFilter = 'add_to_cart' | 'purchase' | 'view' | 'click' | 'filter_clicked';
 
+/**
+ * FieldCondition is a JSON object which can be used to filter chunks by a field. This is useful for when you want to filter chunks by arbitrary metadata. To access fields inside of the metadata that you provide with the card, prefix the field name with `metadata.`.
+ */
 export type FieldCondition = {
     date_range?: ((DateRange) | null);
     /**
-     * Field is the name of the field to filter on. The field value will be used to check for an exact substring match on the metadata values for each existing chunk. This is useful for when you want to filter chunks by arbitrary metadata. To access fields inside of the metadata that you provide with the card, prefix the field name with `metadata.`.
+     * Field is the name of the field to filter on. Commonly used fields are `timestamp`, `link`, `tag_set`, `location`, `num_value`, `group_ids`, and `group_tracking_ids`. The field value will be used to check for an exact substring match on the metadata values for each existing chunk. This is useful for when you want to filter chunks by arbitrary metadata. To access fields inside of the metadata that you provide with the card, prefix the field name with `metadata.`.
      */
     field: string;
     geo_bounding_box?: ((LocationBoundingBox) | null);
@@ -1597,8 +1605,17 @@ export type GroupsForChunk = {
     slim_groups: Array<ChunkGroupAndFileId>;
 };
 
-export type HasIDCondition = {
+/**
+ * HasChunkIDCondition is a JSON object which can be used to filter chunks by their ids or tracking ids. This is useful for when you want to filter chunks by their ids or tracking ids.
+ */
+export type HasChunkIDCondition = {
+    /**
+     * Ids of the chunks to apply a match_any condition with. Only chunks with one of these ids will be returned.
+     */
     ids?: Array<(string)> | null;
+    /**
+     * Tracking ids of the chunks to apply a match_any condition with. Only chunks with one of these tracking ids will be returned.
+     */
     tracking_ids?: Array<(string)> | null;
 };
 
@@ -1777,6 +1794,20 @@ export type Message = {
     updated_at: string;
 };
 
+/**
+ * MMR Options lets you specify different methods to rerank the chunks in the result set using Maximal Marginal Relevance. If not specified, this defaults to the score of the chunks.
+ */
+export type MmrOptions = {
+    /**
+     * Set mmr_lambda to a value between 0.0 and 1.0 to control the tradeoff between relevance and diversity. Closer to 1.0 will give more diverse results, closer to 0.0 will give more relevant results. If not specified, this defaults to 0.5.
+     */
+    mmr_lambda?: (number) | null;
+    /**
+     * Set use_mmr to true to use the Maximal Marginal Relevance algorithm to rerank the results.
+     */
+    use_mmr: boolean;
+};
+
 /**
  * MultiQuery allows you to construct a dense vector from multiple queries with a weighted sum. This is useful for when you want to emphasize certain features of the query. This only works with Semantic Search and is not compatible with cross encoder re-ranking or highlights.
  */
@@ -2218,6 +2249,7 @@ export type RegenerateMessageReqPayload = {
      */
     search_query?: (string) | null;
     search_type?: ((SearchMethod) | null);
+    sort_options?: ((SortOptions) | null);
     /**
      * The id of the topic to regenerate the last message for.
      */
@@ -2723,6 +2755,7 @@ export type SortBySearchType = {
  */
 export type SortOptions = {
     location_bias?: ((GeoInfoWithBias) | null);
+    mmr?: ((MmrOptions) | null);
     /**
      * Recency Bias lets you determine how much of an effect the recency of chunks will have on the search results. If not specified, this defaults to 0.0. We recommend setting this to 1.0 for a gentle reranking of the results, >3.0 for a strong reranking of the results.
      */

diff --git a/frontends/search/src/components/FilterModal.tsx b/frontends/search/src/components/FilterModal.tsx
@@ -339,6 +339,8 @@ export const FilterItem = (props: FilterItemProps) => {
               "location",
               "metadata",
               "num_value",
+              "group_tracking_ids",
+              "group_ids",
               "tracking_ids",
               "ids",
             ]}

diff --git a/server/src/data/models.rs b/server/src/data/models.rs
@@ -4204,14 +4204,20 @@ pub struct LocationPolygon {
 #[allow(clippy::large_enum_variant)]
 #[derive(Serialize, Deserialize, Debug, Clone, ToSchema)]
 #[serde(untagged)]
+/// Filters can be constructed using either fields on the chunk objects, ids or tracking ids of chunks, and finally ids or tracking ids of groups.
 pub enum ConditionType {
+    #[schema(title = "FieldCondition")]
     Field(FieldCondition),
-    HasID(HasIDCondition),
+    #[schema(title = "HasChunkIDCondition")]
+    HasChunkId(HasChunkIDCondition),
 }
 
 #[derive(Serialize, Deserialize, Debug, Clone, ToSchema)]
-pub struct HasIDCondition {
+/// HasChunkIDCondition is a JSON object which can be used to filter chunks by their ids or tracking ids. This is useful for when you want to filter chunks by their ids or tracking ids.
+pub struct HasChunkIDCondition {
+    /// Ids of the chunks to apply a match_any condition with. Only chunks with one of these ids will be returned.
     pub ids: Option<Vec<uuid::Uuid>>,
+    /// Tracking ids of the chunks to apply a match_any condition with. Only chunks with one of these tracking ids will be returned.
     pub tracking_ids: Option<Vec<String>>,
 }
 
@@ -4226,8 +4232,9 @@ pub struct HasIDCondition {
         "lt": 1.0
     }
 }))]
+/// FieldCondition is a JSON object which can be used to filter chunks by a field. This is useful for when you want to filter chunks by arbitrary metadata. To access fields inside of the metadata that you provide with the card, prefix the field name with `metadata.`.
 pub struct FieldCondition {
-    /// Field is the name of the field to filter on. The field value will be used to check for an exact substring match on the metadata values for each existing chunk. This is useful for when you want to filter chunks by arbitrary metadata. To access fields inside of the metadata that you provide with the card, prefix the field name with `metadata.`.
+    /// Field is the name of the field to filter on. Commonly used fields are `timestamp`, `link`, `tag_set`, `location`, `num_value`, `group_ids`, and `group_tracking_ids`. The field value will be used to check for an exact substring match on the metadata values for each existing chunk. This is useful for when you want to filter chunks by arbitrary metadata. To access fields inside of the metadata that you provide with the card, prefix the field name with `metadata.`.
     pub field: String,
     /// Match any lets you pass in an array of values that will return results if any of the items match. The match value will be used to check for an exact substring match on the metadata values for each existing chunk. If both match_all and match_any are provided, the match_any condition will be used.
     #[serde(alias = "match")]

diff --git a/server/src/handlers/chunk_handler.rs b/server/src/handlers/chunk_handler.rs
@@ -948,7 +948,7 @@ pub async fn update_chunk_by_tracking_id(
         }
     ]
 }))]
-/// Filters is a JSON object which can be used to filter chunks. This is useful for when you want to filter chunks by arbitrary metadata. Unlike with tag filtering, there is a performance hit for filtering on metadata.
+/// ChunkFilter is a JSON object which can be used to filter chunks. This is useful for when you want to filter chunks by arbitrary metadata. Unlike with tag filtering, there is a performance hit for filtering on metadata.
 pub struct ChunkFilter {
     /// Only one of these field conditions has to match for the chunk to be included in the result set.
     pub should: Option<Vec<ConditionType>>,

diff --git a/server/src/lib.rs b/server/src/lib.rs
@@ -519,7 +519,7 @@ impl Modify for SecurityAddon {
             data::models::ContentChunkMetadata,
             data::models::ChunkMetadataStringTagSet,
             data::models::ConditionType,
-            data::models::HasIDCondition,
+            data::models::HasChunkIDCondition,
             data::models::DistanceMetric,
             data::models::PublicDatasetOptions,
             data::models::Invitation,

diff --git a/server/src/operators/search_operator.rs b/server/src/operators/search_operator.rs
@@ -17,7 +17,7 @@ use super::typo_operator::correct_query;
 use crate::data::models::{
     convert_to_date_time, ChunkGroup, ChunkGroupAndFileId, ChunkMetadata,
     ChunkMetadataStringTagSet, ChunkMetadataTypes, ConditionType, ContentChunkMetadata, Dataset,
-    DatasetConfiguration, HasIDCondition, MmrOptions, QdrantChunkMetadata, QdrantSortBy,
+    DatasetConfiguration, HasChunkIDCondition, MmrOptions, QdrantChunkMetadata, QdrantSortBy,
     QueryTypes, ReRankOptions, RedisPool, ScoreChunk, ScoreChunkDTO, SearchMethod,
     SlimChunkMetadata, SortByField, SortBySearchType, SortOptions, UnifiedId,
 };
@@ -157,7 +157,7 @@ async fn convert_group_tracking_ids_to_group_ids(
 }
 
 pub async fn get_qdrant_ids_from_condition(
-    cond: HasIDCondition,
+    cond: HasChunkIDCondition,
     pool: web::Data<Pool>,
 ) -> Result<Vec<String>, ServiceError> {
     if let Some(ids) = cond.ids {
@@ -220,7 +220,7 @@ pub async fn assemble_qdrant_filter(
                             filter.should.push(condition);
                         }
                     }
-                    ConditionType::HasID(cond) => {
+                    ConditionType::HasChunkId(cond) => {
                         filter.should.push(Condition::has_id(
                             get_qdrant_ids_from_condition(cond, pool.clone()).await?,
                         ));
@@ -245,7 +245,7 @@ pub async fn assemble_qdrant_filter(
                             filter.must.push(condition);
                         }
                     }
-                    ConditionType::HasID(cond) => {
+                    ConditionType::HasChunkId(cond) => {
                         filter.must.push(Condition::has_id(
                             get_qdrant_ids_from_condition(cond, pool.clone()).await?,
                         ));
@@ -270,7 +270,7 @@ pub async fn assemble_qdrant_filter(
                             filter.must_not.push(condition);
                         }
                     }
-                    ConditionType::HasID(cond) => {
+                    ConditionType::HasChunkId(cond) => {
                         filter.must_not.push(Condition::has_id(
                             get_qdrant_ids_from_condition(cond, pool.clone()).await?,
                         ));