-
Notifications
You must be signed in to change notification settings - Fork 198
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Patch the Boolean field for the unstructured index: (1) filtering on the string bool field now works correctly; (2) search can return a bool field value.
- Loading branch information
Showing
8 changed files
with
347 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
171 changes: 171 additions & 0 deletions
171
tests/core/unstructured_vespa_index/test_schemas/unstructured_vespa_index_schema.sd
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,171 @@ | ||
schema marqo__test_00unstructured_00schema {
    document {
        field marqo__id type string {
            indexing: attribute | summary
            attribute: fast-search
            rank: filter
        }

        field marqo__strings type array<string>{
            indexing: index
            index: enable-bm25
        }

        field marqo__long_string_fields type map<string, string> {
            indexing: summary
        }

        field marqo__short_string_fields type map<string, string> {
            indexing: summary
            struct-field key { indexing : attribute
                               attribute: fast-search
                               rank: filter }
            struct-field value { indexing : attribute
                                 attribute: fast-search
                                 rank: filter }
        }

        field marqo__string_array type array<string> {
            indexing: attribute | summary
            attribute: fast-search
            rank: filter
        }

        field marqo__multimodal_params type map<string, string> {
            indexing: summary
        }

        field marqo__int_fields type map<string, long> {
            indexing: summary
            struct-field key { indexing : attribute
                               attribute: fast-search
                               rank: filter }
            struct-field value { indexing : attribute
                                 attribute: fast-search
                                 rank: filter }
        }

        field marqo__bool_fields type map<string, byte> {
            indexing: summary
            struct-field key { indexing : attribute
                               attribute: fast-search
                               rank: filter }
            struct-field value { indexing : attribute
                                 attribute: fast-search
                                 rank: filter }
        }

        field marqo__float_fields type map<string, double> {
            indexing: summary
            struct-field key { indexing : attribute
                               attribute: fast-search
                               rank: filter }

            struct-field value { indexing : attribute
                                 attribute: fast-search
                                 rank: filter }
        }

        field marqo__score_modifiers type tensor<float>(p{}) {
            indexing: attribute | summary
        }

        field marqo__chunks type array<string> {
            indexing: summary
        }

        field marqo__vector_count type int {
            indexing: attribute | summary
        }

        field marqo__embeddings type tensor<float>(p{}, x[32]) {
            indexing: attribute | index | summary
            attribute {
                distance-metric: prenormalized-angular
            }
            index {
                hnsw {
                    max-links-per-node: 16
                    neighbors-to-explore-at-insert: 512
                }
            }
        }
    }

    fieldset default {
        fields: marqo__strings
    }

    rank-profile embedding_similarity inherits default {
        inputs {
            query(embedding_query) tensor<float>(x[32])
        }
        first-phase {
            expression: closeness(field, marqo__embeddings)
        }
        match-features: closest(marqo__embeddings)
    }

    rank-profile bm25 inherits default {
        first-phase {
            expression: bm25(marqo__strings)
        }
    }

    rank-profile modifiers inherits default {
        inputs {
            query(marqo__mult_weights) tensor<float>(p{})
            query(marqo__add_weights) tensor<float>(p{})
        }
        function modify(score) {
            expression: if (count(query(marqo__mult_weights)) == 0, 1, reduce(query(marqo__mult_weights) * attribute(marqo__score_modifiers), prod)) * score + reduce(query(marqo__add_weights) * attribute(marqo__score_modifiers), sum)
        }
    }

    rank-profile bm25_modifiers inherits modifiers {
        inputs {
            query(marqo__mult_weights) tensor<float>(p{})
            query(marqo__add_weights) tensor<float>(p{})
        }
        first-phase {
            expression: modify(bm25(marqo__strings))
        }
    }

    rank-profile embedding_similarity_modifiers inherits modifiers {
        inputs {
            query(marqo__mult_weights) tensor<float>(p{})
            query(marqo__add_weights) tensor<float>(p{})
            query(embedding_query) tensor<float>(x[32])
        }
        first-phase {
            expression: modify(closeness(field, marqo__embeddings))
        }
        match-features: closest(marqo__embeddings)
    }

    document-summary all-non-vector-summary {
        summary marqo__id type string {}
        summary marqo__strings type array<string> {}
        summary marqo__long_string_fields type map<string, string> {}
        summary marqo__short_string_fields type map<string, string> {}
        summary marqo__string_array type array<string> {}
        summary marqo__bool_fields type map<string, byte> {}
        summary marqo__int_fields type map<string, long> {}
        summary marqo__float_fields type map<string, double> {}
        summary marqo__chunks type array<string> {}
    }

    document-summary all-vector-summary {
        summary marqo__id type string {}
        summary marqo__strings type array<string> {}
        summary marqo__long_string_fields type map<string, string> {}
        summary marqo__short_string_fields type map<string, string> {}
        summary marqo__string_array type array<string> {}
        summary marqo__bool_fields type map<string, byte> {}
        summary marqo__int_fields type map<string, long> {}
        summary marqo__float_fields type map<string, double> {}
        summary marqo__chunks type array<string> {}
        summary marqo__embeddings type tensor<float>(p{}, x[32]) {}
    }
}
58 changes: 58 additions & 0 deletions
58
tests/core/unstructured_vespa_index/test_unstructured_vespa_schema.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
import os | ||
import re | ||
|
||
from marqo.core.models.marqo_index_request import MarqoIndexRequest | ||
from marqo.core.unstructured_vespa_index.unstructured_vespa_schema import UnstructuredVespaSchema | ||
from marqo.tensor_search.models.index_settings import IndexSettings | ||
from tests.marqo_test import MarqoTestCase | ||
|
||
|
||
class TestUnstructuredVespaSchema(MarqoTestCase):
    """Compares the Vespa schema text generated for an unstructured index with a stored fixture."""

    def _read_schema_from_file(self, path: str) -> str:
        """
        Return the full text of a schema file.

        Args:
            path: File location, resolved relative to the directory that contains this test module.

        Returns:
            The file content as a single string.
        """
        test_dir = os.path.dirname(os.path.abspath(__file__))

        with open(os.path.join(test_dir, path), 'r') as schema_file:
            return schema_file.read()

    def _remove_whitespace_in_schema(self, schema: str) -> str:
        """
        Normalise a schema so that two schemas can be compared regardless of layout.

        Whitespace is stripped wherever that cannot alter the meaning of the schema, which makes the comparison
        insensitive to indentation and to most line breaks. Some line breaks are significant and are kept,
        so the result is not necessarily a single line.

        Args:
            schema: The schema text to normalise.

        Returns:
            The schema text with non-essential whitespace removed.
        """
        punctuation = re.escape('{}=+-<>():,;[]|')

        # Step 1: drop any whitespace (newlines included) on either side of a punctuation character
        around_punctuation = re.compile(rf"(\s*([{punctuation}])\s*)")
        compact = around_punctuation.sub(r"\2", schema)

        # Step 2: squeeze every run of spaces down to one space
        compact = re.sub(r' +', ' ', compact)

        # Step 3: strip indentation at the start of each line, which also swallows blank lines
        return re.sub(r'^\s+', '', compact, flags=re.MULTILINE)

    def test_unstructured_index_schema_random_model(self):
        """The schema generated for an unstructured index using a random model matches the stored fixture."""
        index_name = "test_unstructured_schema"
        settings = IndexSettings(type="unstructured", model="random/small")
        test_marqo_index_request: MarqoIndexRequest = settings.to_marqo_index_request(index_name)

        # generate_schema() returns a pair; only its first element (the schema text) is checked here
        generated_schema, _ = UnstructuredVespaSchema(test_marqo_index_request).generate_schema()
        expected_schema = self._read_schema_from_file('test_schemas/unstructured_vespa_index_schema.sd')

        self.assertEqual(
            self._remove_whitespace_in_schema(expected_schema),
            self._remove_whitespace_in_schema(generated_schema)
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.