Patch unstructured bool (#709)

Patch the Boolean field for the unstructured index: * Filtering on a string bool field now works correctly * Search can now return bool field values
marqo-ai · Feb 6, 2024 · d7bfb6b · d7bfb6b
1 parent 74bf999
commit d7bfb6b
Show file tree

Hide file tree

Showing 8 changed files with 347 additions and 3 deletions.
diff --git a/src/marqo/core/unstructured_vespa_index/unstructured_vespa_index.py b/src/marqo/core/unstructured_vespa_index/unstructured_vespa_index.py
@@ -126,8 +126,9 @@ def generate_equality_filter_string(node: search_filter.EqualityTerm) -> str:
             # Bool Filter
             if node.value.lower() in cls._FILTER_STRING_BOOL_VALUES:
                 filter_value = int(True if node.value.lower() == "true" else False)
-                return (f'({unstructured_common.BOOL_FIELDS} contains sameElement(key contains "{node.field}", '
-                        f'value = {filter_value}))')
+                bool_filter_string = (f'({unstructured_common.BOOL_FIELDS} contains '
+                                      f'sameElement(key contains "{node.field}", value = {filter_value}))')
+                filter_parts.append(bool_filter_string)
 
             # Short String Filter
             short_string_filter_string = (f'({unstructured_common.SHORT_STRINGS_FIELDS} '

diff --git a/src/marqo/core/unstructured_vespa_index/unstructured_vespa_schema.py b/src/marqo/core/unstructured_vespa_index/unstructured_vespa_schema.py
@@ -221,6 +221,7 @@ def _generate_unstructured_schema(cls, marqo_index: UnstructuredMarqoIndex) -> s
                     summary {cls._LONGS_STRINGS_FIELDS} type map<string, string> {{}}
                     summary {cls._SHORT_STRINGS_FIELDS} type map<string, string> {{}}
                     summary {cls._STRING_ARRAY} type array<string> {{}}
+                    summary {cls._BOOL_FIELDS} type map<string, byte> {{}}
                     summary {cls._INT_FIELDS} type map<string, long> {{}}
                     summary {cls._FLOAT_FIELDS} type map<string, double> {{}}
                     summary {cls._CHUNKS} type array<string> {{}}
@@ -232,6 +233,7 @@ def _generate_unstructured_schema(cls, marqo_index: UnstructuredMarqoIndex) -> s
                     summary {cls._LONGS_STRINGS_FIELDS} type map<string, string> {{}}
                     summary {cls._SHORT_STRINGS_FIELDS} type map<string, string> {{}}
                     summary {cls._STRING_ARRAY} type array<string> {{}}
+                    summary {cls._BOOL_FIELDS} type map<string, byte> {{}}
                     summary {cls._INT_FIELDS} type map<string, long> {{}}
                     summary {cls._FLOAT_FIELDS} type map<string, double> {{}}
                     summary {cls._CHUNKS} type array<string> {{}}

diff --git a/tests/core/test_vespa_schema.py b/tests/core/test_vespa_schema.py
@@ -47,3 +47,5 @@ def test_mixed_characters(self):
         index_name = "test_schema-name"
         expected = f"{constants.MARQO_RESERVED_PREFIX}test_00schema_01name"
         self.assertEqual(self.vespa_schema._get_vespa_schema_name(index_name), expected)
+
+
diff --git a/tests/core/unstructured_vespa_index/__init__.py b/tests/core/unstructured_vespa_index/__init__.py
diff --git a/tests/core/unstructured_vespa_index/test_schemas/unstructured_vespa_index_schema.sd b/tests/core/unstructured_vespa_index/test_schemas/unstructured_vespa_index_schema.sd
@@ -0,0 +1,171 @@
+schema marqo__test_00unstructured_00schema {
+    document {
+        field marqo__id type string {
+            indexing: attribute | summary
+            attribute: fast-search
+            rank: filter
+        }
+
+        field marqo__strings type array<string>{
+            indexing: index
+            index: enable-bm25
+        }
+
+        field marqo__long_string_fields type map<string, string> {
+            indexing: summary
+        }
+
+        field marqo__short_string_fields type map<string, string> {
+            indexing: summary
+            struct-field key { indexing : attribute
+                               attribute: fast-search
+                               rank: filter }
+            struct-field value { indexing : attribute
+                                  attribute: fast-search
+                                  rank: filter }
+        }
+
+        field marqo__string_array type array<string> {
+            indexing: attribute | summary
+            attribute: fast-search
+            rank: filter
+        }
+
+        field marqo__multimodal_params type map<string, string> {
+            indexing: summary
+        }
+
+        field marqo__int_fields type map<string, long> {
+            indexing: summary
+            struct-field key { indexing : attribute
+                               attribute: fast-search
+                               rank: filter }
+            struct-field value { indexing : attribute
+                               attribute: fast-search
+                               rank: filter }
+        }
+
+        field marqo__bool_fields type map<string, byte> {
+            indexing: summary
+            struct-field key { indexing : attribute
+                                attribute: fast-search
+                                rank: filter }
+            struct-field value { indexing : attribute
+                                  attribute: fast-search
+                                  rank: filter }
+            }
+
+        field marqo__float_fields type map<string, double> {
+            indexing: summary
+            struct-field key { indexing : attribute
+                               attribute: fast-search
+                               rank: filter }
+
+            struct-field value { indexing : attribute
+                               attribute: fast-search
+                               rank: filter }
+        }
+
+        field marqo__score_modifiers type tensor<float>(p{}) {
+            indexing: attribute | summary
+        }
+
+        field marqo__chunks type array<string> {
+            indexing: summary
+        }
+
+        field marqo__vector_count type int {
+            indexing: attribute | summary
+        }
+
+        field marqo__embeddings type tensor<float>(p{}, x[32]) {
+            indexing: attribute | index | summary
+            attribute {
+                distance-metric: prenormalized-angular
+            }
+            index {
+                hnsw {
+                    max-links-per-node: 16
+                    neighbors-to-explore-at-insert: 512
+                }
+            }
+        }
+    }
+
+    fieldset default {
+        fields: marqo__strings
+    }
+
+    rank-profile embedding_similarity inherits default {
+        inputs {
+            query(embedding_query) tensor<float>(x[32])
+        }
+        first-phase {
+            expression: closeness(field, marqo__embeddings)
+        }
+        match-features: closest(marqo__embeddings)
+    }
+
+    rank-profile bm25 inherits default {
+        first-phase {
+        expression: bm25(marqo__strings)
+        }
+    }
+
+    rank-profile modifiers inherits default {
+        inputs {
+            query(marqo__mult_weights) tensor<float>(p{})
+            query(marqo__add_weights) tensor<float>(p{})
+        }
+        function modify(score) {
+            expression: if (count(query(marqo__mult_weights)) == 0, 1, reduce(query(marqo__mult_weights) * attribute(marqo__score_modifiers), prod)) * score + reduce(query(marqo__add_weights) * attribute(marqo__score_modifiers), sum)
+       }
+    }
+
+    rank-profile bm25_modifiers inherits modifiers {
+        inputs {
+            query(marqo__mult_weights) tensor<float>(p{})
+            query(marqo__add_weights) tensor<float>(p{})
+        }
+        first-phase {
+            expression: modify(bm25(marqo__strings))
+        }
+    }
+
+    rank-profile embedding_similarity_modifiers inherits modifiers {
+        inputs {
+            query(marqo__mult_weights) tensor<float>(p{})
+            query(marqo__add_weights) tensor<float>(p{})
+            query(embedding_query) tensor<float>(x[32])
+        }
+        first-phase {
+            expression: modify(closeness(field, marqo__embeddings))
+        }
+        match-features: closest(marqo__embeddings)
+    }
+
+    document-summary all-non-vector-summary {
+        summary marqo__id type string {}
+        summary marqo__strings type array<string> {}
+        summary marqo__long_string_fields type map<string, string> {}
+        summary marqo__short_string_fields type map<string, string> {}
+        summary marqo__string_array type array<string> {}
+        summary marqo__bool_fields type map<string, byte> {}
+        summary marqo__int_fields type map<string, long> {}
+        summary marqo__float_fields type map<string, double> {}
+        summary marqo__chunks type array<string> {}
+    }
+
+    document-summary all-vector-summary {
+        summary marqo__id type string {}
+        summary marqo__strings type array<string> {}
+        summary marqo__long_string_fields type map<string, string> {}
+        summary marqo__short_string_fields type map<string, string> {}
+        summary marqo__string_array type array<string> {}
+        summary marqo__bool_fields type map<string, byte> {}
+        summary marqo__int_fields type map<string, long> {}
+        summary marqo__float_fields type map<string, double> {}
+        summary marqo__chunks type array<string> {}
+        summary marqo__embeddings type tensor<float>(p{}, x[32]) {}
+    }
+}
diff --git a/tests/core/unstructured_vespa_index/test_unstructured_vespa_schema.py b/tests/core/unstructured_vespa_index/test_unstructured_vespa_schema.py
@@ -0,0 +1,58 @@
+import os
+import re
+
+from marqo.core.models.marqo_index_request import MarqoIndexRequest
+from marqo.core.unstructured_vespa_index.unstructured_vespa_schema import UnstructuredVespaSchema
+from marqo.tensor_search.models.index_settings import IndexSettings
+from tests.marqo_test import MarqoTestCase
+
+
+class TestUnstructuredVespaSchema(MarqoTestCase):
+    def _read_schema_from_file(self, path: str) -> str:
+        currentdir = os.path.dirname(os.path.abspath(__file__))
+        abspath = os.path.join(currentdir, path)
+
+        with open(abspath, 'r') as f:
+            schema = f.read()
+
+        return schema
+
+    def _remove_whitespace_in_schema(self, schema: str) -> str:
+        """
+        This function removes as much whitespace as possible from a schema without affecting its semantics.
+        It is intended to help compare schemas independent of non-consequential syntactical differences such as
+        new lines and indentation. Note, however, that not every new line can be removed without breaking the schema.
+        """
+        chars = re.escape('{}=+-<>():,;[]|')
+
+        # Replace whitespace (including newlines) before or after any of the chars
+        pattern = rf"(\s*([{chars}])\s*)"
+        schema = re.sub(pattern, r"\2", schema)
+
+        # Replace multiple spaces with a single space
+        schema = re.sub(r' +', ' ', schema)
+
+        # Replace leading whitespace and blank lines
+        schema = re.sub(r'^\s+', '', schema, flags=re.MULTILINE)
+
+        return schema
+
+    def test_unstructured_index_schema_random_model(self):
+        """A test for the unstructured Vespa schema generation with a random model."""
+        index_name = "test_unstructured_schema"
+
+        test_marqo_index_request: MarqoIndexRequest = IndexSettings(
+            type="unstructured",
+            model="random/small"
+        ).to_marqo_index_request(index_name)
+
+        test_unstructured_schema_object = UnstructuredVespaSchema(test_marqo_index_request)
+
+        generated_schema, _ = test_unstructured_schema_object.generate_schema()
+
+        expected_schema = self._read_schema_from_file('test_schemas/unstructured_vespa_index_schema.sd')
+
+        self.assertEqual(
+            self._remove_whitespace_in_schema(expected_schema),
+            self._remove_whitespace_in_schema(generated_schema)
+        )
diff --git a/tests/tensor_search/integ_tests/test_search_structured.py b/tests/tensor_search/integ_tests/test_search_structured.py
@@ -535,7 +535,6 @@ def test_filtering_list_case_lexical(self):
                 if expected_id:
                     self.assertEqual(expected_id, res["hits"][0]["_id"])
 
-    #
     def test_filtering_list_case_image(self):
         hippo_img = 'https://raw.githubusercontent.com/marqo-ai/marqo-api-tests/mainline/assets/ai_hippo_realistic.png'
         tensor_search.add_documents(
Original file line number	Diff line number	Diff line change
Expand Up		@@ -47,3 +47,5 @@ def test_mixed_characters(self):
		index_name = "test_schema-name"
		expected = f"{constants.MARQO_RESERVED_PREFIX}test_00schema_01name"
		self.assertEqual(self.vespa_schema._get_vespa_schema_name(index_name), expected)