Lexical parser modification - best effort double quote parsing (#979)
Co-authored-by: yihanzhao <[email protected]>
vicilliar and papa99do authored Oct 24, 2024
1 parent 4e3451e commit d0226c1
Showing 7 changed files with 121 additions and 52 deletions.
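
In brief, the behavioral change: the old parser gave up on any malformed double quote and interpreted the whole query literally, while the new parser is best-effort: well-formed quote pairs still become required terms, and bad or unpaired quotes degrade to whitespace. A minimal sketch of the contrast; both expected outputs are taken verbatim from the updated cases in tests/tensor_search/test_utils.py below:

from marqo.tensor_search.utils import parse_lexical_query

# Old behavior: one unpaired quote made the parser bail out and split the
# whole query literally, extracting no required terms:
#   ([], ['"term1"', '"term2"', '"term3'])
# New best-effort behavior: the two well-formed pairs are salvaged as
# required terms and the unpaired quote is treated as whitespace.
print(parse_lexical_query('"term1" "term2" "term3'))  # (['term1', 'term2'], ['term3'])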
3 changes: 3 additions & 0 deletions src/marqo/core/exceptions.py
@@ -102,6 +102,9 @@ def __init__(self, error_message: str,
         self.error_message = error_message
         self.status_code = int(status_code)

+    def __str__(self) -> str:
+        return f'{self.error_code}: {self.error_message}'
+

 class DuplicateDocumentError(AddDocumentsError):
     pass
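
A quick sketch of what the new __str__ produces. The constructor call and the default error_code value of 'invalid_argument' are assumptions inferred from this diff and the TODO in the next file, not confirmed here:

# Hypothetical values, for illustration only.
err = AddDocumentsError('vectorise error')  # assumes error_code defaults to 'invalid_argument'
print(str(err))                             # 'invalid_argument: vectorise error'
print(err.error_message)                    # 'vectorise error'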
1 change: 1 addition & 0 deletions src/marqo/core/inference/tensor_fields_container.py
@@ -159,6 +159,7 @@ def _s2inference_vectorise(self, content_chunks: List[ContentChunkType],
             # Fail the whole batch due to a malfunctioning embedding model
             raise ModelError(f'Problem vectorising query. Reason: {str(model_error)}')
         except s2_inference_errors.S2InferenceError as e:
+            # TODO find a better error code, the existing one is 'invalid_argument'
             raise AddDocumentsError(e.message) from e

     @classmethod
2 changes: 1 addition & 1 deletion src/marqo/core/vespa_index/add_documents_handler.py
@@ -365,7 +365,7 @@ def _vectorise_tensor_fields_in_batch_per_add_doc_batch(self, model_config: Mode
         except AddDocumentsError as err:
             logger.error('Encountered problem when vectorising batch of documents. Reason: %s', err)
             raise InternalError(
-                message=f'Encountered problem when vectorising batch of documents. Reason: {str(err)}'
+                message=f'Encountered problem when vectorising batch of documents. Reason: {err.error_message}'
             )

         for doc_id, field_name, tensor_field_content in (
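
The switch from str(err) to err.error_message is needed because of the new AddDocumentsError.__str__ above: formatting the exception directly would now prepend the error code and change the user-facing message. A hedged comparison, reusing the 'vectorise error' message from the test file below:

# With __str__ defined, str(err) includes the error code; error_message does not.
f'Reason: {str(err)}'            # would yield e.g. 'Reason: invalid_argument: vectorise error'
f'Reason: {err.error_message}'   # yields 'Reason: vectorise error', which the tests assert on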
76 changes: 43 additions & 33 deletions src/marqo/tensor_search/utils.py
@@ -199,70 +199,80 @@ def parse_lexical_query(text: str) -> Tuple[List[str], List[str]]:
     """
     Find required terms enclosed within double quotes.
-    All other terms go into optional_terms, split by whitespace.
+    All other terms go into blob, split by whitespace. blob starts as a string then splits into
+    a list by space.
     Syntax:
         Required strings must be enclosed by quotes. These quotes must be enclosed by spaces or the start
-        or end of the text
+        or end of the text. Quotes always come in pairs.
+        Bad syntax rules:
+        - If 1 or both of the quotes in a pair has bad syntax (e.g. no space before opening quote or after closing
+          quote), both will be treated as whitespace.
+        - Incomplete quotes will be treated as whitespace.
     Notes:
-        Double quote can be either opening, closing, or escaped.
-        Escaped double quotes are interpreted literally.
-        If any double quotes exist that are neither opening, closing, nor escaped,
-        interpret entire string literally instead.
+        - Correct double quote can be either opening, closing, or escaped.
+        - Escaped double quotes are interpreted literally.
+          Users need to escape the backslash itself. (Single \ get ignored) -> q='dwayne \\"the rock\\" johnson'
     Return:
         2-tuple of <required terms> (for "must" clause) <optional terms> (for "should" clause)
     """
     required_terms = []
-    optional_terms = ""
+    blob = ""
     opening_quote_idx = None
+    current_quote_pair_is_faulty = False

     if not isinstance(text, str):
         raise TypeError("parse_lexical_query must have string as input")

     for i in range(len(text)):
-        # Add all characters to blob initially
-        optional_terms += text[i]
+        # Add every character to blob initially
+        blob += text[i]

         if text[i] == '"':
             # Check if ESCAPED
             if i > 0 and text[i - 1] == '\\':
-                # Read quote literally. Backslash should be ignored (both blob and required)
+                # Read quote literally. Backslash should be passed directly to Vespa.
                 pass

-            # Check if CLOSING QUOTE
-            # Closing " must have space on the right (or is last character) while opening exists.
-            elif (opening_quote_idx is not None) and (i == len(text) - 1 or text[i + 1] == " "):
-                # Add everything in between the quotes as a required term
-                new_required_term = text[opening_quote_idx + 1:i]
-                required_terms.append(new_required_term)
-
-                # Remove this required term from the optional blob
-                optional_terms = optional_terms[:-(len(new_required_term) + 2)]
-                opening_quote_idx = None
-
-            # Check if OPENING QUOTE
-            # Opening " must have space on the left (or is first character).
-            elif i == 0 or text[i - 1] == " ":
+            # OPENING QUOTE
+            elif (opening_quote_idx is None):
                 opening_quote_idx = i
+                blob_opening_quote_idx = len(blob) - 1  # Opening quote index in blob is different from text

-            # None of the above: Syntax error. Interpret text literally instead.
+                # Bad syntax opening quote: flag it, replace quote with whitespace
+                if not (i == 0 or text[i - 1] == " "):
+                    current_quote_pair_is_faulty = True
+                    blob = blob[:-1] + " "
+            # CLOSING QUOTE
             else:
-                return [], text.split()
+                # Good syntax closing: must have space on the right (or is last character) while opening exists.
+                if (i == len(text) - 1 or text[i + 1] == " ") and not current_quote_pair_is_faulty:
+                    # Add everything in between the quotes as a required term
+                    new_required_term = text[opening_quote_idx + 1:i]
+                    if new_required_term:  # Do not add empty strings as required terms
+                        required_terms.append(new_required_term)
+
+                    # Remove this required term from the blob
+                    blob = blob[:-(len(new_required_term) + 2)]
+
+                else:
+                    # Bad syntax closing: treat this and opening quote as whitespace
+                    blob = blob[:blob_opening_quote_idx] + " " + \
+                           blob[blob_opening_quote_idx + 1:-1] + " "
+
+                # Clean up flags
+                opening_quote_idx = None
+                current_quote_pair_is_faulty = False

+    # Unpaired quote will be turned to whitespace
     if opening_quote_idx is not None:
-        # string parsing finished with a quote still open: syntax error.
-        return [], text.split()
+        blob = blob[:blob_opening_quote_idx] + " " + blob[blob_opening_quote_idx + 1:]

     # Remove double/leading white spaces
-    optional_terms = optional_terms.split()
-
-    # Remove escape character. `\"` becomes just `"`
-    required_terms = [term.replace('\\"', '"') for term in required_terms]
-    optional_terms = [term.replace('\\"', '"') for term in optional_terms]
+    optional_terms = blob.split()

     return required_terms, optional_terms
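
Read together, the new rules are easiest to see on concrete inputs. A minimal usage sketch; every expected output below is taken verbatim from the updated cases in tests/tensor_search/test_utils.py:

from marqo.tensor_search.utils import parse_lexical_query

parse_lexical_query('"cookie"')    # (['cookie'], [])     well-formed pair -> required term
parse_lexical_query('"go"od"')     # ([], ['go', 'od'])   badly placed quotes degrade to whitespace
parse_lexical_query('"good')       # ([], ['good'])       unpaired quote degrades to whitespace
parse_lexical_query('\\"朋友\\"')   # ([], ['\\"朋友\\"'])  escaped quotes pass through literally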

2 changes: 1 addition & 1 deletion tests/core/vespa_index/test_add_documents_handler.py
@@ -315,7 +315,7 @@ def test_add_documents_should_fail_a_batch_using_vectorise_per_doc_strategy(self
             ])
         )

-        with self.assertRaises(InternalError) as context:
+        with self.assertRaisesStrict(InternalError) as context:
             handler.add_documents()

         self.assertEquals('Encountered problem when vectorising batch of documents. Reason: vectorise error',
48 changes: 47 additions & 1 deletion tests/tensor_search/integ_tests/test_search_combined.py
@@ -965,4 +965,50 @@ def test_search_query_CanAcceptDifferentSearchMethods(self):

         # A special case for no search method provided
         search_query = SearchQuery(q="test")
-        self.assertEqual(SearchMethod.TENSOR, search_query.searchMethod)
+        self.assertEqual(SearchMethod.TENSOR, search_query.searchMethod)
+
+    def test_lexical_search_DoesNotErrorWithEscapedQuotes(self):
+        """
+        Ensure that lexical search handles double quotes properly, both escaped and wrong quotes.
+        Expected behavior: escaped quotes are passed to Vespa. Incorrect quotes are treated like whitespace.
+        """
+
+        docs_list = [
+            {"_id": "doc1", "text_field_1": '1"2'},
+            {"_id": "doc2", "text_field_1": 'exact match'},
+            {"_id": "doc3", "text_field_1": 'exacto wrong syntax'},
+            {"_id": "doc4", "text_field_1": '"escaped"'},
+
+            {"_id": "red_herring_1", "text_field_1": '12'},
+            {"_id": "red_herring_2", "text_field_1": 'escaped'},
+            {"_id": "red_herring_3", "text_field_1": 'wrong"'}
+        ]
+        test_cases = [
+            ('1\\"2', ['doc1']),                           # Match off of '1"2'
+            ('"exact match"', ['doc2']),                   # Match off of 'exact match'
+            ('\\"escaped\\"', ['doc4', 'red_herring_2']),  # Match off of 'escaped' or '"escaped"'
+            ('"exacto" wrong"', ['doc3']),                 # Match properly off of 'wrong'
+            ('""', []),  # An empty quote pair should return no results (treated as whitespace)
+            ('"', []),   # A single unpaired quote should return no results (treated as whitespace)
+            ('', [])     # Empty string should return no results
+        ]
+
+        for index in [self.unstructured_default_text_index, self.structured_default_text_index]:
+            with self.subTest(index=index.type):
+                tensor_search.add_documents(
+                    config=self.config,
+                    add_docs_params=AddDocsParams(
+                        index_name=index.name,
+                        docs=docs_list,
+                        tensor_fields=["text_field_1"] if isinstance(index, UnstructuredMarqoIndex) else None
+                    )
+                )
+
+                for query, expected_ids in test_cases:
+                    with self.subTest(query=query):
+                        res = tensor_search.search(
+                            text=query, config=self.config, index_name=index.name,
+                            search_method=SearchMethod.LEXICAL
+                        )
+                        self.assertEqual(len(expected_ids), len(res['hits']))
+                        self.assertEqual(set(expected_ids), {hit['_id'] for hit in res['hits']})
41 changes: 25 additions & 16 deletions tests/tensor_search/test_utils.py
@@ -223,7 +223,7 @@ def test_parse_lexical_query(self):
             ('just "a long long " string', (["a long long "], ['just', 'string'])),
             ('"required 1 " not required " required2" again',
              (["required 1 ", " required2"], ['not', 'required', 'again'])),
-            ('"just" "just" "" a string', (["just", "just", ""], ['a', 'string'])),
+            ('"just" "just" "" a string', (["just", "just"], ['a', 'string'])),

             ('朋友你好', ([], ['朋友你好'])),
             ('朋友 "你好"', (["你好"], ['朋友'])),
@@ -234,30 +234,39 @@
             ('', ([], [])),
             ('"cookie"', (["cookie"], [])),
             ('"朋友"', (["朋友"], [])),
-            ('"', ([], ['"'])),
-            ('"""hello', ([], ['"""hello'])),
-            ('""" python docstring appeared"""', ([], ['"""', 'python', 'docstring', 'appeared"""'])),
-            ('""', ([''], [])),
+
+            # Badly formatted double quotes.
+            # Treat every 2 as a pair; bad/unpaired quotes become whitespace.
+            ('"', ([], [])),
+            ('"""hello', ([], ['hello'])),
+            ('""" python docstring appeared"""', ([], ['python', 'docstring', 'appeared'])),
+            ('""', ([], [])),
             ('what about backticks `?', ([], ['what', 'about', 'backticks', '`?'])),
             (
                 '\\" escaped quotes\\" what happens here?',
-                ([], ['"', 'escaped', 'quotes"', 'what', 'happens', 'here?'])
+                ([], ['\\"', 'escaped', 'quotes\\"', 'what', 'happens', 'here?'])
             ),
-            ('\\"朋友\\"', ([], ['"朋友"'])),
+            ('\\"朋友\\"', ([], ['\\"朋友\\"'])),
             ('double spaces get removed', ([], ['double', 'spaces', 'get', 'removed'])),
-            ('"go"od"', ([], ['"go"od"'])),
-            ('"ter"m1" term2', ([], ['"ter"m1"', 'term2'])),
-            ('"term1" "term2" "term3', ([], ['"term1"', '"term2"', '"term3'])),
-            ('"good', ([], ['"good'])),
-            ('"朋友', ([], ['"朋友'])),
+            ('"go"od"', ([], ['go', 'od'])),
+            ('"ter"m1" term2', ([], ['ter', 'm1', 'term2'])),
+            ('"term1" "term2" "term3', (['term1', 'term2'], ['term3'])),
+            ('"term1" "term2" "ter"m3', (['term1', 'term2'], ['ter', 'm3'])),
+            ('"term 1" "term "2 "term 3"', (['term 1', 'term 3'], ['term', '2'])),  # good syntax in between bad syntax
+            ('"good', ([], ['good'])),  # Unpaired quotes
+            ('"朋友', ([], ['朋友'])),
+            # Combination: good terms, bad terms (opening and closing), escaped quotes, spaces.
+            ('hello "term1" " term 2 " space b"adterm" "badte"rm "term \\"3" "unfinished',
+             (['term1', ' term 2 ', 'term \\"3'], ['hello', 'space', 'b', 'adterm', 'badte', 'rm', 'unfinished'])),

             # on Lucene, these unusual structures seem to get passed straight through as well.
             # The quotes seem to be completely ignored (with and without quotes yields identical results,
             # including scores):
-            ('"go"od" a"', ([], ['"go"od"', 'a"'])),
-            ('"sam"a', ([], ['"sam"a'])),
-            ('"sam"?', ([], ['"sam"?'])),
-            ('"朋友"你好', ([], ['"朋友"你好'])),
+            ('"go"od" a"', ([], ['go', 'od', 'a'])),
+            ('"sam"a', ([], ['sam', 'a'])),  # Good opening, bad closing
+            ('sa"ma" hello!', ([], ['sa', 'ma', 'hello!'])),  # Bad opening, good closing
+            ('"sam"?', ([], ['sam', '?'])),
+            ('"朋友"你好', ([], ['朋友', '你好'])),

         ]
         for input, expected_output in cases: