From d0226c14ddb45e4e3b2c21c1cdb9a2d1266f2af9 Mon Sep 17 00:00:00 2001
From: Joshua
Date: Thu, 24 Oct 2024 17:22:09 +0800
Subject: [PATCH] Lexical parser modification - best effort double quote
 parsing (#979)

Co-authored-by: yihanzhao
---
 src/marqo/core/exceptions.py                  |  3 +
 .../core/inference/tensor_fields_container.py |  1 +
 .../core/vespa_index/add_documents_handler.py |  2 +-
 src/marqo/tensor_search/utils.py              | 76 +++++++++++--------
 .../vespa_index/test_add_documents_handler.py |  2 +-
 .../integ_tests/test_search_combined.py       | 48 +++++++++++-
 tests/tensor_search/test_utils.py             | 41 ++++++----
 7 files changed, 121 insertions(+), 52 deletions(-)

diff --git a/src/marqo/core/exceptions.py b/src/marqo/core/exceptions.py
index a10755a8b..7467da5e3 100644
--- a/src/marqo/core/exceptions.py
+++ b/src/marqo/core/exceptions.py
@@ -102,6 +102,9 @@ def __init__(self, error_message: str,
         self.error_message = error_message
         self.status_code = int(status_code)
 
+    def __str__(self) -> str:
+        return f'{self.error_code}: {self.error_message}'
+
 
 class DuplicateDocumentError(AddDocumentsError):
     pass
diff --git a/src/marqo/core/inference/tensor_fields_container.py b/src/marqo/core/inference/tensor_fields_container.py
index 43c803823..a9dd57e12 100644
--- a/src/marqo/core/inference/tensor_fields_container.py
+++ b/src/marqo/core/inference/tensor_fields_container.py
@@ -159,6 +159,7 @@ def _s2inference_vectorise(self, content_chunks: List[ContentChunkType],
             # Fail the whole batch due to a malfunctioning embedding model
             raise ModelError(f'Problem vectorising query. Reason: {str(model_error)}')
         except s2_inference_errors.S2InferenceError as e:
+            # TODO: find a better error code; the existing one is 'invalid_argument'
             raise AddDocumentsError(e.message) from e
 
     @classmethod
diff --git a/src/marqo/core/vespa_index/add_documents_handler.py b/src/marqo/core/vespa_index/add_documents_handler.py
index ce7a3a802..b181d35b7 100644
--- a/src/marqo/core/vespa_index/add_documents_handler.py
+++ b/src/marqo/core/vespa_index/add_documents_handler.py
@@ -365,7 +365,7 @@ def _vectorise_tensor_fields_in_batch_per_add_doc_batch(self, model_config: Mode
         except AddDocumentsError as err:
             logger.error('Encountered problem when vectorising batch of documents. Reason: %s', err)
             raise InternalError(
-                message=f'Encountered problem when vectorising batch of documents. Reason: {str(err)}'
+                message=f'Encountered problem when vectorising batch of documents. Reason: {err.error_message}'
             )
 
         for doc_id, field_name, tensor_field_content in (
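Why the exceptions.py and add_documents_handler.py hunks above pair up: AddDocumentsError.__str__ now prepends the error code, so the handler switches from str(err) to err.error_message to keep the user-facing message free of that prefix. A minimal sketch of the difference, using a hypothetical stand-in class shaped like the patched one (the default error_code and status_code values are assumptions for illustration, not the real defaults):

    # Stand-in mirroring the patched AddDocumentsError; defaults are assumed
    class AddDocumentsError(Exception):
        def __init__(self, error_message: str, error_code: str = 'invalid_argument',
                     status_code: int = 400):
            self.error_message = error_message
            self.error_code = error_code
            self.status_code = int(status_code)

        def __str__(self) -> str:
            return f'{self.error_code}: {self.error_message}'

    err = AddDocumentsError('vectorise error')
    print(str(err))           # invalid_argument: vectorise error
    print(err.error_message)  # vectorise error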
diff --git a/src/marqo/tensor_search/utils.py b/src/marqo/tensor_search/utils.py
index 90072a240..be10e9216 100644
--- a/src/marqo/tensor_search/utils.py
+++ b/src/marqo/tensor_search/utils.py
@@ -199,17 +199,20 @@ def parse_lexical_query(text: str) -> Tuple[List[str], List[str]]:
     """
     Find required terms enclosed within double quotes.
 
-    All other terms go into optional_terms, split by whitespace.
+    All other terms go into blob, which starts as a string and is later split into
+    a list on whitespace.
 
     Syntax:
         Required strings must be enclosed by quotes. These quotes must be enclosed by spaces or the start
-        or end of the text
+        or end of the text. Quotes always come in pairs.
+        Bad syntax rules:
+        - If one or both quotes in a pair have bad syntax (e.g. no space before the opening quote or after
+        the closing quote), both are treated as whitespace.
+        - An unpaired quote is treated as whitespace.
 
     Notes:
-        - Double quote can be either opening, closing, or escaped.
-        - Escaped double quotes are interpreted literally.
-        - If any double quotes exist that are neither opening, closing, nor escaped,
-        interpret entire string literally instead.
+        - A correct double quote is either opening, closing, or escaped.
+        - Escaped double quotes are interpreted literally. Users need to escape the backslash itself.
         (A single \ gets ignored)
         -> q='dwayne \\"the rock\\" johnson'
 
@@ -217,52 +220,59 @@
     Returns:
         2-tuple of
         (for "must" clause) (for "should" clause)
     """
     required_terms = []
-    optional_terms = ""
+    blob = ""
     opening_quote_idx = None
+    current_quote_pair_is_faulty = False
 
     if not isinstance(text, str):
         raise TypeError("parse_lexical_query must have string as input")
 
     for i in range(len(text)):
-        # Add all characters to blob initially
-        optional_terms += text[i]
+        # Add every character to blob initially
+        blob += text[i]
 
         if text[i] == '"':
             # Check if ESCAPED
             if i > 0 and text[i - 1] == '\\':
-                # Read quote literally. Backslash should be ignored (both blob and required)
+                # Read quote literally. Backslash should be passed directly to Vespa.
                 pass
-            # Check if CLOSING QUOTE
-            # Closing " must have space on the right (or is last character) while opening exists.
-            elif (opening_quote_idx is not None) and (i == len(text) - 1 or text[i + 1] == " "):
-                # Add everything in between the quotes as a required term
-                new_required_term = text[opening_quote_idx + 1:i]
-                required_terms.append(new_required_term)
-
-                # Remove this required term from the optional blob
-                optional_terms = optional_terms[:-(len(new_required_term) + 2)]
-                opening_quote_idx = None
-
-            # Check if OPENING QUOTE
-            # Opening " must have space on the left (or is first character).
-            elif i == 0 or text[i - 1] == " ":
+            # OPENING QUOTE
+            elif opening_quote_idx is None:
                 opening_quote_idx = i
+                blob_opening_quote_idx = len(blob) - 1  # The opening quote's index in blob differs from its index in text
 
-            # None of the above: Syntax error. Interpret text literally instead.
+                # Bad syntax opening quote: flag it and replace the quote with whitespace
+                if not (i == 0 or text[i - 1] == " "):
+                    current_quote_pair_is_faulty = True
+                    blob = blob[:-1] + " "
+            # CLOSING QUOTE
             else:
-                return [], text.split()
+                # Good syntax closing: must have a space on the right (or be the last character) while an opening exists.
+                if (i == len(text) - 1 or text[i + 1] == " ") and not current_quote_pair_is_faulty:
+                    # Add everything in between the quotes as a required term
+                    new_required_term = text[opening_quote_idx + 1:i]
+                    if new_required_term:  # Do not add empty strings as required terms
+                        required_terms.append(new_required_term)
+                    # Remove this required term from the blob
+                    blob = blob[:-(len(new_required_term) + 2)]
+
+                else:
+                    # Bad syntax closing: treat this quote and the opening quote as whitespace
+                    blob = blob[:blob_opening_quote_idx] + " " + \
+                           blob[blob_opening_quote_idx + 1:-1] + " "
+
+                # Clean up flags
+                opening_quote_idx = None
+                current_quote_pair_is_faulty = False
 
+    # An unpaired quote is turned into whitespace
     if opening_quote_idx is not None:
-        # string parsing finished with a quote still open: syntax error.
-        return [], text.split()
+        blob = blob[:blob_opening_quote_idx] + " " + blob[blob_opening_quote_idx + 1:]
 
     # Remove double/leading white spaces
-    optional_terms = optional_terms.split()
-
-    # Remove escape character. `\"` becomes just `"`
-    required_terms = [term.replace('\\"', '"') for term in required_terms]
-    optional_terms = [term.replace('\\"', '"') for term in optional_terms]
+    optional_terms = blob.split()
 
     return required_terms, optional_terms
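For a concrete sense of the best-effort behavior above, here is a short illustrative session (outputs traced by hand from the rules the function implements):

    >>> from marqo.tensor_search.utils import parse_lexical_query
    >>> parse_lexical_query('"exact match" hello')  # well-formed pair becomes a required term
    (['exact match'], ['hello'])
    >>> parse_lexical_query('dwayne \\"the rock\\" johnson')  # escaped quotes pass through literally
    ([], ['dwayne', '\\"the', 'rock\\"', 'johnson'])
    >>> parse_lexical_query('"go"od"')  # bad and unpaired quotes collapse to whitespace
    ([], ['go', 'od'])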
`\"` becomes just `"` - required_terms = [term.replace('\\"', '"') for term in required_terms] - optional_terms = [term.replace('\\"', '"') for term in optional_terms] + optional_terms = blob.split() return required_terms, optional_terms diff --git a/tests/core/vespa_index/test_add_documents_handler.py b/tests/core/vespa_index/test_add_documents_handler.py index 5931f4037..718415723 100644 --- a/tests/core/vespa_index/test_add_documents_handler.py +++ b/tests/core/vespa_index/test_add_documents_handler.py @@ -315,7 +315,7 @@ def test_add_documents_should_fail_a_batch_using_vectorise_per_doc_strategy(self ]) ) - with self.assertRaises(InternalError) as context: + with self.assertRaisesStrict(InternalError) as context: handler.add_documents() self.assertEquals('Encountered problem when vectorising batch of documents. Reason: vectorise error', diff --git a/tests/tensor_search/integ_tests/test_search_combined.py b/tests/tensor_search/integ_tests/test_search_combined.py index e5e26674f..14d38ab19 100644 --- a/tests/tensor_search/integ_tests/test_search_combined.py +++ b/tests/tensor_search/integ_tests/test_search_combined.py @@ -965,4 +965,50 @@ def test_search_query_CanAcceptDifferentSearchMethods(self): # A special case for no search method provided search_query = SearchQuery(q="test") - self.assertEqual(SearchMethod.TENSOR, search_query.searchMethod) \ No newline at end of file + self.assertEqual(SearchMethod.TENSOR, search_query.searchMethod) + + def test_lexical_search_DoesNotErrorWithEscapedQuotes(self): + """ + Ensure that lexical search handles double quotes properly, both escaped and wrong quotes. + Expected behavior: escaped quotes are passed to vespa. Incorrect quotes are treated like whitespace. + """ + + docs_list = [ + {"_id": "doc1", "text_field_1": '1"2'}, + {"_id": "doc2", "text_field_1": 'exact match'}, + {"_id": "doc3", "text_field_1": 'exacto wrong syntax'}, + {"_id": "doc4", "text_field_1": '"escaped"'}, + + {"_id": "red_herring_1", "text_field_1": '12'}, + {"_id": "red_herring_2", "text_field_1": 'escaped'}, + {"_id": "red_herring_3", "text_field_1": 'wrong"'} + ] + test_cases = [ + ('1\\"2', ['doc1']), # Match off of '1"2' + ('"exact match"', ['doc2']), # Match off of 'exact match' + ('\\"escaped\\"', ['doc4', 'red_herring_2']), # Match off of 'escaped' or '"escaped"' + ('"exacto" wrong"', ['doc3']), # Match properly off of 'wrong' + ('""', []), # Single quote should return no results (treated as whitespace) + ('"', []), # Double quote should return no results (treated as whitespace) + ('', []) # Empty string should return no results + ] + + for index in [self.unstructured_default_text_index, self.structured_default_text_index]: + with self.subTest(index=index.type): + tensor_search.add_documents( + config=self.config, + add_docs_params=AddDocsParams( + index_name=index.name, + docs=docs_list, + tensor_fields=["text_field_1"] if isinstance(index, UnstructuredMarqoIndex) else None + ) + ) + + for query, expected_ids in test_cases: + with self.subTest(query=query): + res = tensor_search.search( + text=query, config=self.config, index_name=index.name, + search_method=SearchMethod.LEXICAL + ) + self.assertEqual(len(expected_ids), len(res['hits'])) + self.assertEqual(set(expected_ids), {hit['_id'] for hit in res['hits']}) diff --git a/tests/tensor_search/test_utils.py b/tests/tensor_search/test_utils.py index 5ee7a91af..a4501a7af 100644 --- a/tests/tensor_search/test_utils.py +++ b/tests/tensor_search/test_utils.py @@ -223,7 +223,7 @@ def test_parse_lexical_query(self): ('just 
"a long long " string', (["a long long "], ['just', 'string'])), ('"required 1 " not required " required2" again', (["required 1 ", " required2"], ['not', 'required', 'again'])), - ('"just" "just" "" a string', (["just", "just", ""], ['a', 'string'])), + ('"just" "just" "" a string', (["just", "just"], ['a', 'string'])), ('朋友你好', ([], ['朋友你好'])), ('朋友 "你好"', (["你好"], ['朋友'])), @@ -234,30 +234,39 @@ def test_parse_lexical_query(self): ('', ([], [])), ('"cookie"', (["cookie"], [])), ('"朋友"', (["朋友"], [])), - ('"', ([], ['"'])), - ('"""hello', ([], ['"""hello'])), - ('""" python docstring appeared"""', ([], ['"""', 'python', 'docstring', 'appeared"""'])), - ('""', ([''], [])), + + # Badly formatted double quotes. + # Treat every 2 as a pair, bad/unpaired quotes become whitespace. + ('"', ([], [])), + ('"""hello', ([], ['hello'])), + ('""" python docstring appeared"""', ([], ['python', 'docstring', 'appeared'])), + ('""', ([], [])), ('what about backticks `?', ([], ['what', 'about', 'backticks', '`?'])), ( '\\" escaped quotes\\" what happens here?', - ([], ['"', 'escaped', 'quotes"', 'what', 'happens', 'here?']) + ([], ['\\"', 'escaped', 'quotes\\"', 'what', 'happens', 'here?']) ), - ('\\"朋友\\"', ([], ['"朋友"'])), + ('\\"朋友\\"', ([], ['\\"朋友\\"'])), ('double spaces get removed', ([], ['double', 'spaces', 'get', 'removed'])), - ('"go"od"', ([], ['"go"od"'])), - ('"ter"m1" term2', ([], ['"ter"m1"', 'term2'])), - ('"term1" "term2" "term3', ([], ['"term1"', '"term2"', '"term3'])), - ('"good', ([], ['"good'])), - ('"朋友', ([], ['"朋友'])), + ('"go"od"', ([], ['go', 'od'])), + ('"ter"m1" term2', ([], ['ter','m1', 'term2'])), + ('"term1" "term2" "term3', (['term1', 'term2'], ['term3'])), + ('"term1" "term2" "ter"m3', (['term1', 'term2'], ['ter', 'm3'])), + ('"term 1" "term "2 "term 3"', (['term 1', 'term 3'], ['term', '2'])), # good syntax in between bad syntax + ('"good', ([], ['good'])), # Unpaired quotes + ('"朋友', ([], ['朋友'])), + # Combination: good terms, bad terms (opening and closing), escaped quotes, spaces. + ('hello "term1" " term 2 " space b"adterm" "badte"rm "term \\"3" "unfinished', + (['term1', ' term 2 ', 'term \\"3'], ['hello', 'space', 'b', 'adterm', 'badte', 'rm', 'unfinished'])), # on Lucene, these unusual structures seem to get passed straight through as well. # The quotes seem to be completely ignored (with and without quotes yields identical results, # including scores): - ('"go"od" a"', ([], ['"go"od"', 'a"'])), - ('"sam"a', ([], ['"sam"a'])), - ('"sam"?', ([], ['"sam"?'])), - ('"朋友"你好', ([], ['"朋友"你好'])), + ('"go"od" a"', ([], ['go', 'od', 'a'])), + ('"sam"a', ([], ['sam', 'a'])), # Good opening, bad closing + ('sa"ma" hello!', ([], ['sa', 'ma', 'hello!'])), # Bad opening, good closing + ('"sam"?', ([], ['sam', '?'])), + ('"朋友"你好', ([], ['朋友', '你好'])), ] for input, expected_output in cases: