Skip to content

Commit

Permalink
Patch unstructured bool (#709)
Browse files Browse the repository at this point in the history
Patch Boolean field handling for unstructured indexes

* Filtering on a bool field with a string value ("true"/"false") now works correctly
* Search can return a bool field value
  • Loading branch information
wanliAlex authored Feb 6, 2024
1 parent 74bf999 commit d7bfb6b
Show file tree
Hide file tree
Showing 8 changed files with 347 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,9 @@ def generate_equality_filter_string(node: search_filter.EqualityTerm) -> str:
# Bool Filter
if node.value.lower() in cls._FILTER_STRING_BOOL_VALUES:
filter_value = int(True if node.value.lower() == "true" else False)
return (f'({unstructured_common.BOOL_FIELDS} contains sameElement(key contains "{node.field}", '
f'value = {filter_value}))')
bool_filter_string = (f'({unstructured_common.BOOL_FIELDS} contains '
f'sameElement(key contains "{node.field}", value = {filter_value}))')
filter_parts.append(bool_filter_string)

# Short String Filter
short_string_filter_string = (f'({unstructured_common.SHORT_STRINGS_FIELDS} '
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,7 @@ def _generate_unstructured_schema(cls, marqo_index: UnstructuredMarqoIndex) -> s
summary {cls._LONGS_STRINGS_FIELDS} type map<string, string> {{}}
summary {cls._SHORT_STRINGS_FIELDS} type map<string, string> {{}}
summary {cls._STRING_ARRAY} type array<string> {{}}
summary {cls._BOOL_FIELDS} type map<string, byte> {{}}
summary {cls._INT_FIELDS} type map<string, long> {{}}
summary {cls._FLOAT_FIELDS} type map<string, double> {{}}
summary {cls._CHUNKS} type array<string> {{}}
Expand All @@ -232,6 +233,7 @@ def _generate_unstructured_schema(cls, marqo_index: UnstructuredMarqoIndex) -> s
summary {cls._LONGS_STRINGS_FIELDS} type map<string, string> {{}}
summary {cls._SHORT_STRINGS_FIELDS} type map<string, string> {{}}
summary {cls._STRING_ARRAY} type array<string> {{}}
summary {cls._BOOL_FIELDS} type map<string, byte> {{}}
summary {cls._INT_FIELDS} type map<string, long> {{}}
summary {cls._FLOAT_FIELDS} type map<string, double> {{}}
summary {cls._CHUNKS} type array<string> {{}}
Expand Down
2 changes: 2 additions & 0 deletions tests/core/test_vespa_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,5 @@ def test_mixed_characters(self):
index_name = "test_schema-name"
expected = f"{constants.MARQO_RESERVED_PREFIX}test_00schema_01name"
self.assertEqual(self.vespa_schema._get_vespa_schema_name(index_name), expected)


Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
schema marqo__test_00unstructured_00schema {
document {
field marqo__id type string {
indexing: attribute | summary
attribute: fast-search
rank: filter
}

field marqo__strings type array<string>{
indexing: index
index: enable-bm25
}

field marqo__long_string_fields type map<string, string> {
indexing: summary
}

field marqo__short_string_fields type map<string, string> {
indexing: summary
struct-field key { indexing : attribute
attribute: fast-search
rank: filter }
struct-field value { indexing : attribute
attribute: fast-search
rank: filter }
}

field marqo__string_array type array<string> {
indexing: attribute | summary
attribute: fast-search
rank: filter
}

field marqo__multimodal_params type map<string, string> {
indexing: summary
}

field marqo__int_fields type map<string, long> {
indexing: summary
struct-field key { indexing : attribute
attribute: fast-search
rank: filter }
struct-field value { indexing : attribute
attribute: fast-search
rank: filter }
}

field marqo__bool_fields type map<string, byte> {
indexing: summary
struct-field key { indexing : attribute
attribute: fast-search
rank: filter }
struct-field value { indexing : attribute
attribute: fast-search
rank: filter }
}

field marqo__float_fields type map<string, double> {
indexing: summary
struct-field key { indexing : attribute
attribute: fast-search
rank: filter }

struct-field value { indexing : attribute
attribute: fast-search
rank: filter }
}

field marqo__score_modifiers type tensor<float>(p{}) {
indexing: attribute | summary
}

field marqo__chunks type array<string> {
indexing: summary
}

field marqo__vector_count type int {
indexing: attribute | summary
}

field marqo__embeddings type tensor<float>(p{}, x[32]) {
indexing: attribute | index | summary
attribute {
distance-metric: prenormalized-angular
}
index {
hnsw {
max-links-per-node: 16
neighbors-to-explore-at-insert: 512
}
}
}
}

fieldset default {
fields: marqo__strings
}

rank-profile embedding_similarity inherits default {
inputs {
query(embedding_query) tensor<float>(x[32])
}
first-phase {
expression: closeness(field, marqo__embeddings)
}
match-features: closest(marqo__embeddings)
}

rank-profile bm25 inherits default {
first-phase {
expression: bm25(marqo__strings)
}
}

rank-profile modifiers inherits default {
inputs {
query(marqo__mult_weights) tensor<float>(p{})
query(marqo__add_weights) tensor<float>(p{})
}
function modify(score) {
expression: if (count(query(marqo__mult_weights)) == 0, 1, reduce(query(marqo__mult_weights) * attribute(marqo__score_modifiers), prod)) * score + reduce(query(marqo__add_weights) * attribute(marqo__score_modifiers), sum)
}
}

rank-profile bm25_modifiers inherits modifiers {
inputs {
query(marqo__mult_weights) tensor<float>(p{})
query(marqo__add_weights) tensor<float>(p{})
}
first-phase {
expression: modify(bm25(marqo__strings))
}
}

rank-profile embedding_similarity_modifiers inherits modifiers {
inputs {
query(marqo__mult_weights) tensor<float>(p{})
query(marqo__add_weights) tensor<float>(p{})
query(embedding_query) tensor<float>(x[32])
}
first-phase {
expression: modify(closeness(field, marqo__embeddings))
}
match-features: closest(marqo__embeddings)
}

document-summary all-non-vector-summary {
summary marqo__id type string {}
summary marqo__strings type array<string> {}
summary marqo__long_string_fields type map<string, string> {}
summary marqo__short_string_fields type map<string, string> {}
summary marqo__string_array type array<string> {}
summary marqo__bool_fields type map<string, byte> {}
summary marqo__int_fields type map<string, long> {}
summary marqo__float_fields type map<string, double> {}
summary marqo__chunks type array<string> {}
}

document-summary all-vector-summary {
summary marqo__id type string {}
summary marqo__strings type array<string> {}
summary marqo__long_string_fields type map<string, string> {}
summary marqo__short_string_fields type map<string, string> {}
summary marqo__string_array type array<string> {}
summary marqo__bool_fields type map<string, byte> {}
summary marqo__int_fields type map<string, long> {}
summary marqo__float_fields type map<string, double> {}
summary marqo__chunks type array<string> {}
summary marqo__embeddings type tensor<float>(p{}, x[32]) {}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import os
import re

from marqo.core.models.marqo_index_request import MarqoIndexRequest
from marqo.core.unstructured_vespa_index.unstructured_vespa_schema import UnstructuredVespaSchema
from marqo.tensor_search.models.index_settings import IndexSettings
from tests.marqo_test import MarqoTestCase


class TestUnstructuredVespaSchema(MarqoTestCase):
    """Golden-file test for the Vespa schema generated for an unstructured index."""

    # Show the complete diff when the schema comparison fails; the default
    # unittest limit truncates long string diffs.
    # NOTE(review): assumes MarqoTestCase derives from unittest.TestCase - confirm.
    maxDiff = None

    def _read_schema_from_file(self, path: str) -> str:
        """Return the contents of a schema file.

        Args:
            path: File location relative to the directory that contains this test module.

        Returns:
            The full text of the file.
        """
        currentdir = os.path.dirname(os.path.abspath(__file__))
        abspath = os.path.join(currentdir, path)

        # Pin the encoding so the fixture is read identically regardless of the
        # platform's locale default.
        with open(abspath, 'r', encoding='utf-8') as f:
            return f.read()

    def _remove_whitespace_in_schema(self, schema: str) -> str:
        """
        This function removes as much whitespace as possible from a schema without affecting its semantics.
        It is intended to help compare schemas independent of non-consequential syntactical differences such as
        new lines and indentation. Note, however, that not every new line can be removed without breaking the schema.

        Args:
            schema: The schema text to normalise.

        Returns:
            The schema with whitespace around punctuation removed, runs of spaces collapsed to one space,
            and leading whitespace / blank lines stripped.
        """
        chars = re.escape('{}=+-<>():,;[]|')

        # Replace whitespace (including newlines) before or after any of the chars
        pattern = rf"(\s*([{chars}])\s*)"
        schema = re.sub(pattern, r"\2", schema)

        # Replace multiple spaces with a single space
        schema = re.sub(r' +', ' ', schema)

        # Replace leading whitespace and blank lines
        schema = re.sub(r'^\s+', '', schema, flags=re.MULTILINE)

        return schema

    def test_unstructured_index_schema_random_model(self):
        """A test for the unstructured Vespa schema generation with a random model."""
        index_name = "test_unstructured_schema"

        test_marqo_index_request: MarqoIndexRequest = IndexSettings(
            type="unstructured",
            model="random/small"
        ).to_marqo_index_request(index_name)

        test_unstructured_schema_object = UnstructuredVespaSchema(test_marqo_index_request)

        # generate_schema() returns a pair; only the schema text is checked here.
        generated_schema, _ = test_unstructured_schema_object.generate_schema()

        expected_schema = self._read_schema_from_file('test_schemas/unstructured_vespa_index_schema.sd')

        # Compare whitespace-normalised forms so indentation and line breaks do not matter.
        self.assertEqual(
            self._remove_whitespace_in_schema(expected_schema),
            self._remove_whitespace_in_schema(generated_schema)
        )
1 change: 0 additions & 1 deletion tests/tensor_search/integ_tests/test_search_structured.py
Original file line number Diff line number Diff line change
Expand Up @@ -535,7 +535,6 @@ def test_filtering_list_case_lexical(self):
if expected_id:
self.assertEqual(expected_id, res["hits"][0]["_id"])

#
def test_filtering_list_case_image(self):
hippo_img = 'https://raw.githubusercontent.com/marqo-ai/marqo-api-tests/mainline/assets/ai_hippo_realistic.png'
tensor_search.add_documents(
Expand Down
Loading

0 comments on commit d7bfb6b

Please sign in to comment.