From d0226c14ddb45e4e3b2c21c1cdb9a2d1266f2af9 Mon Sep 17 00:00:00 2001
From: Joshua
Date: Thu, 24 Oct 2024 17:22:09 +0800
Subject: [PATCH] Lexical parser modification - best effort double quote
 parsing (#979)

Co-authored-by: yihanzhao
---
 src/marqo/core/exceptions.py                  |  3 +
 .../core/inference/tensor_fields_container.py |  1 +
 .../core/vespa_index/add_documents_handler.py |  2 +-
 src/marqo/tensor_search/utils.py              | 76 +++++++++++--------
 .../vespa_index/test_add_documents_handler.py |  2 +-
 .../integ_tests/test_search_combined.py       | 48 +++++++++++-
 tests/tensor_search/test_utils.py             | 41 ++++++----
 7 files changed, 121 insertions(+), 52 deletions(-)

diff --git a/src/marqo/core/exceptions.py b/src/marqo/core/exceptions.py
index a10755a8b..7467da5e3 100644
--- a/src/marqo/core/exceptions.py
+++ b/src/marqo/core/exceptions.py
@@ -102,6 +102,9 @@ def __init__(self, error_message: str,
         self.error_message = error_message
         self.status_code = int(status_code)
 
+    def __str__(self) -> str:
+        return f'{self.error_code}: {self.error_message}'
+
 
 class DuplicateDocumentError(AddDocumentsError):
     pass
diff --git a/src/marqo/core/inference/tensor_fields_container.py b/src/marqo/core/inference/tensor_fields_container.py
index 43c803823..a9dd57e12 100644
--- a/src/marqo/core/inference/tensor_fields_container.py
+++ b/src/marqo/core/inference/tensor_fields_container.py
@@ -159,6 +159,7 @@ def _s2inference_vectorise(self, content_chunks: List[ContentChunkType],
             # Fail the whole batch due to a malfunctioning embedding model
             raise ModelError(f'Problem vectorising query. Reason: {str(model_error)}')
         except s2_inference_errors.S2InferenceError as e:
+            # TODO: find a better error code; the existing one is 'invalid_argument'
             raise AddDocumentsError(e.message) from e
 
     @classmethod
diff --git a/src/marqo/core/vespa_index/add_documents_handler.py b/src/marqo/core/vespa_index/add_documents_handler.py
index ce7a3a802..b181d35b7 100644
--- a/src/marqo/core/vespa_index/add_documents_handler.py
+++ b/src/marqo/core/vespa_index/add_documents_handler.py
@@ -365,7 +365,7 @@ def _vectorise_tensor_fields_in_batch_per_add_doc_batch(self, model_config: Mode
         except AddDocumentsError as err:
             logger.error('Encountered problem when vectorising batch of documents. Reason: %s', err)
             raise InternalError(
-                message=f'Encountered problem when vectorising batch of documents. Reason: {str(err)}'
+                message=f'Encountered problem when vectorising batch of documents. Reason: {err.error_message}'
             )
 
         for doc_id, field_name, tensor_field_content in (
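Why the exceptions.py and add_documents_handler.py hunks above pair up: AddDocumentsError.__str__ now prepends the error code, so the handler switches from str(err) to err.error_message to keep the user-facing message free of that prefix. A minimal sketch of the difference, using a hypothetical stand-in class shaped like the patched one (the default error_code and status_code values are assumptions for illustration, not the real defaults):

    # Stand-in mirroring the patched AddDocumentsError; defaults are assumed
    class AddDocumentsError(Exception):
        def __init__(self, error_message: str, error_code: str = 'invalid_argument',
                     status_code: int = 400):
            self.error_message = error_message
            self.error_code = error_code
            self.status_code = int(status_code)

        def __str__(self) -> str:
            return f'{self.error_code}: {self.error_message}'

    err = AddDocumentsError('vectorise error')
    print(str(err))           # invalid_argument: vectorise error
    print(err.error_message)  # vectorise error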
diff --git a/src/marqo/tensor_search/utils.py b/src/marqo/tensor_search/utils.py
index 90072a240..be10e9216 100644
--- a/src/marqo/tensor_search/utils.py
+++ b/src/marqo/tensor_search/utils.py
@@ -199,17 +199,20 @@ def parse_lexical_query(text: str) -> Tuple[List[str], List[str]]:
     """
     Find required terms enclosed within double quotes.
 
-    All other terms go into optional_terms, split by whitespace.
+    All other terms go into blob, which starts as a string and is later split into
+    a list on whitespace.
 
     Syntax:
         Required strings must be enclosed by quotes. These quotes must be enclosed by spaces or the start
-        or end of the text
+        or end of the text. Quotes always come in pairs.
+        Bad syntax rules:
+        - If one or both quotes in a pair have bad syntax (e.g. no space before the opening quote or after
+        the closing quote), both are treated as whitespace.
+        - An unpaired quote is treated as whitespace.
 
     Notes:
-        - Double quote can be either opening, closing, or escaped.
-        - Escaped double quotes are interpreted literally.
-        - If any double quotes exist that are neither opening, closing, nor escaped,
-        interpret entire string literally instead.
+        - A correct double quote is either opening, closing, or escaped.
+        - Escaped double quotes are interpreted literally. Users need to escape the backslash itself.
         (A single \ gets ignored)
         -> q='dwayne \\"the rock\\" johnson'
 
@@ -217,52 +220,59 @@
     Returns:
         2-tuple of
         (for "must" clause) (for "should" clause)
     """
     required_terms = []
-    optional_terms = ""
+    blob = ""
     opening_quote_idx = None
+    current_quote_pair_is_faulty = False
 
     if not isinstance(text, str):
         raise TypeError("parse_lexical_query must have string as input")
 
     for i in range(len(text)):
-        # Add all characters to blob initially
-        optional_terms += text[i]
+        # Add every character to blob initially
+        blob += text[i]
 
         if text[i] == '"':
             # Check if ESCAPED
             if i > 0 and text[i - 1] == '\\':
-                # Read quote literally. Backslash should be ignored (both blob and required)
+                # Read quote literally. Backslash should be passed directly to Vespa.
                 pass
-            # Check if CLOSING QUOTE
-            # Closing " must have space on the right (or is last character) while opening exists.
-            elif (opening_quote_idx is not None) and (i == len(text) - 1 or text[i + 1] == " "):
-                # Add everything in between the quotes as a required term
-                new_required_term = text[opening_quote_idx + 1:i]
-                required_terms.append(new_required_term)
-
-                # Remove this required term from the optional blob
-                optional_terms = optional_terms[:-(len(new_required_term) + 2)]
-                opening_quote_idx = None
-
-            # Check if OPENING QUOTE
-            # Opening " must have space on the left (or is first character).
-            elif i == 0 or text[i - 1] == " ":
+            # OPENING QUOTE
+            elif opening_quote_idx is None:
                 opening_quote_idx = i
+                blob_opening_quote_idx = len(blob) - 1  # The opening quote's index in blob differs from its index in text
 
-            # None of the above: Syntax error. Interpret text literally instead.
+                # Bad syntax opening quote: flag it and replace the quote with whitespace
+                if not (i == 0 or text[i - 1] == " "):
+                    current_quote_pair_is_faulty = True
+                    blob = blob[:-1] + " "
+            # CLOSING QUOTE
             else:
-                return [], text.split()
+                # Good syntax closing: must have a space on the right (or be the last character) while an opening exists.
+                if (i == len(text) - 1 or text[i + 1] == " ") and not current_quote_pair_is_faulty:
+                    # Add everything in between the quotes as a required term
+                    new_required_term = text[opening_quote_idx + 1:i]
+                    if new_required_term:  # Do not add empty strings as required terms
+                        required_terms.append(new_required_term)
+                    # Remove this required term from the blob
+                    blob = blob[:-(len(new_required_term) + 2)]
+
+                else:
+                    # Bad syntax closing: treat this quote and the opening quote as whitespace
+                    blob = blob[:blob_opening_quote_idx] + " " + \
+                           blob[blob_opening_quote_idx + 1:-1] + " "
+
+                # Clean up flags
+                opening_quote_idx = None
+                current_quote_pair_is_faulty = False
 
+    # An unpaired quote is turned into whitespace
     if opening_quote_idx is not None:
-        # string parsing finished with a quote still open: syntax error.
-        return [], text.split()
+        blob = blob[:blob_opening_quote_idx] + " " + blob[blob_opening_quote_idx + 1:]
 
     # Remove double/leading white spaces
-    optional_terms = optional_terms.split()
-
-    # Remove escape character. `\"` becomes just `"`
-    required_terms = [term.replace('\\"', '"') for term in required_terms]
-    optional_terms = [term.replace('\\"', '"') for term in optional_terms]
+    optional_terms = blob.split()
 
     return required_terms, optional_terms
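For a concrete sense of the best-effort behavior above, here is a short illustrative session (outputs traced by hand from the rules the function implements):

    >>> from marqo.tensor_search.utils import parse_lexical_query
    >>> parse_lexical_query('"exact match" hello')  # well-formed pair becomes a required term
    (['exact match'], ['hello'])
    >>> parse_lexical_query('dwayne \\"the rock\\" johnson')  # escaped quotes pass through literally
    ([], ['dwayne', '\\"the', 'rock\\"', 'johnson'])
    >>> parse_lexical_query('"go"od"')  # bad and unpaired quotes collapse to whitespace
    ([], ['go', 'od'])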
`\"` becomes just `"` - required_terms = [term.replace('\\"', '"') for term in required_terms] - optional_terms = [term.replace('\\"', '"') for term in optional_terms] + optional_terms = blob.split() return required_terms, optional_terms diff --git a/tests/core/vespa_index/test_add_documents_handler.py b/tests/core/vespa_index/test_add_documents_handler.py index 5931f4037..718415723 100644 --- a/tests/core/vespa_index/test_add_documents_handler.py +++ b/tests/core/vespa_index/test_add_documents_handler.py @@ -315,7 +315,7 @@ def test_add_documents_should_fail_a_batch_using_vectorise_per_doc_strategy(self ]) ) - with self.assertRaises(InternalError) as context: + with self.assertRaisesStrict(InternalError) as context: handler.add_documents() self.assertEquals('Encountered problem when vectorising batch of documents. Reason: vectorise error', diff --git a/tests/tensor_search/integ_tests/test_search_combined.py b/tests/tensor_search/integ_tests/test_search_combined.py index e5e26674f..14d38ab19 100644 --- a/tests/tensor_search/integ_tests/test_search_combined.py +++ b/tests/tensor_search/integ_tests/test_search_combined.py @@ -965,4 +965,50 @@ def test_search_query_CanAcceptDifferentSearchMethods(self): # A special case for no search method provided search_query = SearchQuery(q="test") - self.assertEqual(SearchMethod.TENSOR, search_query.searchMethod) \ No newline at end of file + self.assertEqual(SearchMethod.TENSOR, search_query.searchMethod) + + def test_lexical_search_DoesNotErrorWithEscapedQuotes(self): + """ + Ensure that lexical search handles double quotes properly, both escaped and wrong quotes. + Expected behavior: escaped quotes are passed to vespa. Incorrect quotes are treated like whitespace. + """ + + docs_list = [ + {"_id": "doc1", "text_field_1": '1"2'}, + {"_id": "doc2", "text_field_1": 'exact match'}, + {"_id": "doc3", "text_field_1": 'exacto wrong syntax'}, + {"_id": "doc4", "text_field_1": '"escaped"'}, + + {"_id": "red_herring_1", "text_field_1": '12'}, + {"_id": "red_herring_2", "text_field_1": 'escaped'}, + {"_id": "red_herring_3", "text_field_1": 'wrong"'} + ] + test_cases = [ + ('1\\"2', ['doc1']), # Match off of '1"2' + ('"exact match"', ['doc2']), # Match off of 'exact match' + ('\\"escaped\\"', ['doc4', 'red_herring_2']), # Match off of 'escaped' or '"escaped"' + ('"exacto" wrong"', ['doc3']), # Match properly off of 'wrong' + ('""', []), # Single quote should return no results (treated as whitespace) + ('"', []), # Double quote should return no results (treated as whitespace) + ('', []) # Empty string should return no results + ] + + for index in [self.unstructured_default_text_index, self.structured_default_text_index]: + with self.subTest(index=index.type): + tensor_search.add_documents( + config=self.config, + add_docs_params=AddDocsParams( + index_name=index.name, + docs=docs_list, + tensor_fields=["text_field_1"] if isinstance(index, UnstructuredMarqoIndex) else None + ) + ) + + for query, expected_ids in test_cases: + with self.subTest(query=query): + res = tensor_search.search( + text=query, config=self.config, index_name=index.name, + search_method=SearchMethod.LEXICAL + ) + self.assertEqual(len(expected_ids), len(res['hits'])) + self.assertEqual(set(expected_ids), {hit['_id'] for hit in res['hits']}) diff --git a/tests/tensor_search/test_utils.py b/tests/tensor_search/test_utils.py index 5ee7a91af..a4501a7af 100644 --- a/tests/tensor_search/test_utils.py +++ b/tests/tensor_search/test_utils.py @@ -223,7 +223,7 @@ def test_parse_lexical_query(self): ('just 
"a long long " string', (["a long long "], ['just', 'string'])), ('"required 1 " not required " required2" again', (["required 1 ", " required2"], ['not', 'required', 'again'])), - ('"just" "just" "" a string', (["just", "just", ""], ['a', 'string'])), + ('"just" "just" "" a string', (["just", "just"], ['a', 'string'])), ('朋友你好', ([], ['朋友你好'])), ('朋友 "你好"', (["你好"], ['朋友'])), @@ -234,30 +234,39 @@ def test_parse_lexical_query(self): ('', ([], [])), ('"cookie"', (["cookie"], [])), ('"朋友"', (["朋友"], [])), - ('"', ([], ['"'])), - ('"""hello', ([], ['"""hello'])), - ('""" python docstring appeared"""', ([], ['"""', 'python', 'docstring', 'appeared"""'])), - ('""', ([''], [])), + + # Badly formatted double quotes. + # Treat every 2 as a pair, bad/unpaired quotes become whitespace. + ('"', ([], [])), + ('"""hello', ([], ['hello'])), + ('""" python docstring appeared"""', ([], ['python', 'docstring', 'appeared'])), + ('""', ([], [])), ('what about backticks `?', ([], ['what', 'about', 'backticks', '`?'])), ( '\\" escaped quotes\\" what happens here?', - ([], ['"', 'escaped', 'quotes"', 'what', 'happens', 'here?']) + ([], ['\\"', 'escaped', 'quotes\\"', 'what', 'happens', 'here?']) ), - ('\\"朋友\\"', ([], ['"朋友"'])), + ('\\"朋友\\"', ([], ['\\"朋友\\"'])), ('double spaces get removed', ([], ['double', 'spaces', 'get', 'removed'])), - ('"go"od"', ([], ['"go"od"'])), - ('"ter"m1" term2', ([], ['"ter"m1"', 'term2'])), - ('"term1" "term2" "term3', ([], ['"term1"', '"term2"', '"term3'])), - ('"good', ([], ['"good'])), - ('"朋友', ([], ['"朋友'])), + ('"go"od"', ([], ['go', 'od'])), + ('"ter"m1" term2', ([], ['ter','m1', 'term2'])), + ('"term1" "term2" "term3', (['term1', 'term2'], ['term3'])), + ('"term1" "term2" "ter"m3', (['term1', 'term2'], ['ter', 'm3'])), + ('"term 1" "term "2 "term 3"', (['term 1', 'term 3'], ['term', '2'])), # good syntax in between bad syntax + ('"good', ([], ['good'])), # Unpaired quotes + ('"朋友', ([], ['朋友'])), + # Combination: good terms, bad terms (opening and closing), escaped quotes, spaces. + ('hello "term1" " term 2 " space b"adterm" "badte"rm "term \\"3" "unfinished', + (['term1', ' term 2 ', 'term \\"3'], ['hello', 'space', 'b', 'adterm', 'badte', 'rm', 'unfinished'])), # on Lucene, these unusual structures seem to get passed straight through as well. # The quotes seem to be completely ignored (with and without quotes yields identical results, # including scores): - ('"go"od" a"', ([], ['"go"od"', 'a"'])), - ('"sam"a', ([], ['"sam"a'])), - ('"sam"?', ([], ['"sam"?'])), - ('"朋友"你好', ([], ['"朋友"你好'])), + ('"go"od" a"', ([], ['go', 'od', 'a'])), + ('"sam"a', ([], ['sam', 'a'])), # Good opening, bad closing + ('sa"ma" hello!', ([], ['sa', 'ma', 'hello!'])), # Bad opening, good closing + ('"sam"?', ([], ['sam', '?'])), + ('"朋友"你好', ([], ['朋友', '你好'])), ] for input, expected_output in cases: