diff --git a/.github/workflows/largemodel_unit_test_CI.yml b/.github/workflows/largemodel_unit_test_CI.yml index 7a751f62d..f4186edad 100644 --- a/.github/workflows/largemodel_unit_test_CI.yml +++ b/.github/workflows/largemodel_unit_test_CI.yml @@ -80,12 +80,14 @@ jobs: uses: actions/checkout@v3 with: repository: marqo-ai/marqo-base + ref: 'releases/2.13' path: marqo-base - name: Install dependencies run: | - pip install -r marqo-base/requirements/amd64-gpu-requirements.txt + pip install -r marqo-base/requirements.txt # override base requirements with marqo requirements, if needed: + pip install -r marqo/requirements.txt pip install -r marqo/requirements.dev.txt pip install pytest==7.4.0 diff --git a/.github/workflows/unit_test_200gb_CI.yml b/.github/workflows/unit_test_200gb_CI.yml index 168b7fd71..e9c09a624 100644 --- a/.github/workflows/unit_test_200gb_CI.yml +++ b/.github/workflows/unit_test_200gb_CI.yml @@ -76,12 +76,14 @@ jobs: uses: actions/checkout@v3 with: repository: marqo-ai/marqo-base + ref: 'releases/2.13' path: marqo-base - name: Install dependencies run: | - pip install -r marqo-base/requirements/amd64-gpu-requirements.txt + pip install -r marqo-base/requirements.txt # override base requirements with marqo requirements, if needed: + pip install -r marqo/requirements.txt pip install -r marqo/requirements.dev.txt - name: Build Vespa diff --git a/requirements.txt b/requirements.txt index ef7f9adbe..943bda804 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,5 @@ readerwriterlock==1.0.9 kazoo==2.10.0 pycurl==7.45.3 huggingface-hub==0.25.0 -jinja2==3.1.4 \ No newline at end of file +jinja2==3.1.4 +orjson==3.10.14 \ No newline at end of file diff --git a/src/marqo/core/inference/tensor_fields_container.py b/src/marqo/core/inference/tensor_fields_container.py index a9dd57e12..46be95ac6 100644 --- a/src/marqo/core/inference/tensor_fields_container.py +++ b/src/marqo/core/inference/tensor_fields_container.py @@ -1,7 +1,7 @@ import hashlib import json from abc import ABC, abstractmethod -from typing import List, Dict, Set, Optional, Any, Generator, Tuple, cast, TypeVar +from typing import List, Dict, Set, Optional, Any, Generator, Tuple, cast, TypeVar, Callable, Union import numpy as np from PIL.Image import Image @@ -10,7 +10,7 @@ from marqo.core import constants from marqo.core.constants import MARQO_DOC_ID -from marqo.core.exceptions import AddDocumentsError, ModelError +from marqo.core.exceptions import AddDocumentsError, ModelError, InternalError from marqo.core.models.marqo_index import FieldType, TextPreProcessing, ImagePreProcessing from marqo.s2_inference import errors as s2_inference_errors from marqo.s2_inference import s2_inference @@ -493,7 +493,20 @@ def populate_tensor_from_existing_doc(self, existing_marqo_doc: Dict[str, Any], tensor_content.populate_chunks_and_embeddings(existing_tensor[constants.MARQO_DOC_CHUNKS], existing_tensor[constants.MARQO_DOC_EMBEDDINGS]) - def collect(self, doc_id: str, field_name: str, field_content: Any, field_type: Optional[FieldType]) -> Any: + def collect(self, doc_id: str, field_name: str, field_content: Any, + infer_field_type: Callable[[str, Any], FieldType]) -> Any: + """ + Collect tensor field content from the document if it is a tensor field. + + Args: + doc_id: document id + field_name: name of the field + field_content: content of the field + infer_field_type: A callable that takes the field content and field name and returns the field type, or + a FieldType enum value + Returns: + The field content + """ if field_name not in self._tensor_fields and field_name not in self._multimodal_sub_field_reverse_map: # not tensor fields, no need to collect return field_content @@ -511,6 +524,8 @@ def collect(self, doc_id: str, field_name: str, field_content: Any, field_type: f'Invalid type {type(field_content)} for tensor field {field_name}' ) + field_type = infer_field_type(field_name, field_content) + self._add_tensor_field_content( doc_id, field_name, TensorFieldContent( field_content=field_content, @@ -556,4 +571,4 @@ def collect_multi_modal_fields(self, doc_id: str, normalize_embeddings: bool): is_multimodal_subfield=False, normalize_embeddings=normalize_embeddings )) - yield field_name, weights + yield field_name, weights \ No newline at end of file diff --git a/src/marqo/core/semi_structured_vespa_index/semi_structured_add_document_handler.py b/src/marqo/core/semi_structured_vespa_index/semi_structured_add_document_handler.py index 74ffb4073..00ee8b697 100644 --- a/src/marqo/core/semi_structured_vespa_index/semi_structured_add_document_handler.py +++ b/src/marqo/core/semi_structured_vespa_index/semi_structured_add_document_handler.py @@ -40,16 +40,8 @@ def __init__(self, marqo_index: SemiStructuredMarqoIndex, add_docs_params: AddDo self.field_count_config = field_count_config def _handle_field(self, marqo_doc, field_name, field_content): - self._validate_field(field_name, field_content) - text_field_type = self._infer_field_type( - field_content, - media_download_headers=self.add_docs_params.media_download_headers - ) - content = self.tensor_fields_container.collect(marqo_doc[MARQO_DOC_ID], field_name, - field_content, text_field_type) - marqo_doc[field_name] = content - - if isinstance(content, str): + super()._handle_field(marqo_doc, field_name, field_content) + if isinstance(marqo_doc[field_name], str): self._add_lexical_field_to_index(field_name) def _to_vespa_doc(self, doc: Dict[str, Any]) -> VespaDocument: diff --git a/src/marqo/core/semi_structured_vespa_index/semi_structured_document.py b/src/marqo/core/semi_structured_vespa_index/semi_structured_document.py index 1540f7d21..dd9579999 100644 --- a/src/marqo/core/semi_structured_vespa_index/semi_structured_document.py +++ b/src/marqo/core/semi_structured_vespa_index/semi_structured_document.py @@ -62,7 +62,7 @@ def from_vespa_document(cls, document: Dict, marqo_index: SemiStructuredMarqoInd text_fields[field_name] = fields[field_name] return cls(id=document[cls._VESPA_DOC_ID], - fixed_fields=SemiStructuredVespaDocumentFields(**fields), + fixed_fields=SemiStructuredVespaDocumentFields.construct(**fields), tensor_fields=tensor_fields, text_fields=text_fields, raw_tensor_score=cls.extract_field(fields, common.VESPA_DOC_HYBRID_RAW_TENSOR_SCORE, None), diff --git a/src/marqo/core/structured_vespa_index/structured_add_document_handler.py b/src/marqo/core/structured_vespa_index/structured_add_document_handler.py index a8a7097d8..bab9d9f65 100644 --- a/src/marqo/core/structured_vespa_index/structured_add_document_handler.py +++ b/src/marqo/core/structured_vespa_index/structured_add_document_handler.py @@ -56,10 +56,15 @@ def _validate_add_docs_params(self, add_docs_params: AddDocsParams, marqo_index: def _handle_field(self, marqo_doc, field_name, field_content): self._validate_field(field_name, field_content) - field_type = self.marqo_index.field_map[field_name].type - content = self.tensor_fields_container.collect(marqo_doc[MARQO_DOC_ID], field_name, field_content, field_type) + content = self.tensor_fields_container.collect( + marqo_doc[MARQO_DOC_ID], field_name, field_content, + self._infer_field_type + ) marqo_doc[field_name] = content + def _infer_field_type(self, field_name:str, field_content: Any) -> FieldType: + return self.marqo_index.field_map[field_name].type + def _validate_field(self, field_name: str, field_content: Any) -> None: try: # TODO extract the validation logic somewhere else diff --git a/src/marqo/core/unstructured_vespa_index/unstructured_add_document_handler.py b/src/marqo/core/unstructured_vespa_index/unstructured_add_document_handler.py index 7f4e36740..e737e95ca 100644 --- a/src/marqo/core/unstructured_vespa_index/unstructured_add_document_handler.py +++ b/src/marqo/core/unstructured_vespa_index/unstructured_add_document_handler.py @@ -18,15 +18,13 @@ from marqo.core.vespa_index.add_documents_handler import AddDocumentsHandler, AddDocumentsError from marqo.s2_inference.errors import MediaDownloadError from marqo.s2_inference.multimodal_model_load import infer_modality, Modality - -from marqo.vespa.models import VespaDocument -from marqo.vespa.models.get_document_response import Document -from marqo.vespa.vespa_client import VespaClient - # TODO deps to tensor_search needs to be removed from marqo.tensor_search.constants import ALLOWED_UNSTRUCTURED_FIELD_TYPES from marqo.tensor_search.validation import validate_custom_vector, \ validate_map_numeric_field +from marqo.vespa.models import VespaDocument +from marqo.vespa.models.get_document_response import Document +from marqo.vespa.vespa_client import VespaClient class UnstructuredAddDocumentsHandler(AddDocumentsHandler): @@ -67,28 +65,52 @@ def _validate_doc(self, doc): def _handle_field(self, marqo_doc, field_name, field_content): self._validate_field(field_name, field_content) - text_field_type = self._infer_field_type(field_content, self.add_docs_params.media_download_headers) - content = self.tensor_fields_container.collect(marqo_doc[MARQO_DOC_ID], field_name, - field_content, text_field_type) + content = self.tensor_fields_container.collect( + marqo_doc[MARQO_DOC_ID], field_name, field_content, + self._infer_field_type + ) marqo_doc[field_name] = content - def _infer_field_type(self, field_content: Any, media_download_headers: Optional[Dict] = None) \ - -> Optional[FieldType]: - if not isinstance(field_content, str): - return None - - try: - modality = infer_modality(field_content, media_download_headers) - - if not self.marqo_index.treat_urls_and_pointers_as_media and modality in [Modality.AUDIO, Modality.VIDEO]: - modality = Modality.TEXT - - if not self.marqo_index.treat_urls_and_pointers_as_images and modality == Modality.IMAGE: + def _infer_field_type(self, field_name:str, field_content: Any) -> FieldType: + """Infer the field type based on the field content. This is used for both unstructured and semi-structured + indexes. + + We should only infer the field type if the field content is a string. + We only infer the field type if the index is configured to treat URLs and pointers as images or media. + + treatUrlsAndPointersAsMedia is a new parameter introduced in Marqo 2.12 to support the new modalities + of video and audio. Here is how it interacts with treatUrlsAndPointersAsImages: + Both False: All content is processed as text only. + treatUrlsAndPointersAsImages True, treatUrlsAndPointersAsMedia False: + Processes URLs and pointers as images + Does not process other media types (video, audio) + treatUrlsAndPointersAsImages False, treatUrlsAndPointersAsMedia True: + Invalid state since this is a conflict. + Both True: + Processes URLs and pointers as various media types (images, videos, audio) + + The values of treatUrlsAndPointersAsMedia and treatUrlsAndPointersAsImages are validated in the MarqoIndex class + so we do not need to validate them here. + + Args: + field_content: The content of the field. + Returns: + The inferred field type. + Raises: + AddDocumentsError: If the modality of the media content cannot be inferred. + """ + if (self.marqo_index.treat_urls_and_pointers_as_images is True or + self.marqo_index.treat_urls_and_pointers_as_media is True): + try: + modality = infer_modality(field_content, self.add_docs_params.media_download_headers) + except MediaDownloadError as err: + raise AddDocumentsError(err.message) from err + if ((self.marqo_index.treat_urls_and_pointers_as_media is False) and modality in + [Modality.AUDIO, Modality.VIDEO]): modality = Modality.TEXT - return MODALITY_FIELD_TYPE_MAP[modality] - except MediaDownloadError as err: - raise AddDocumentsError(err.message) from err + else: + return FieldType.Text def _validate_field(self, field_name: str, field_content: Any) -> None: try: diff --git a/src/marqo/core/vespa_index/add_documents_handler.py b/src/marqo/core/vespa_index/add_documents_handler.py index ecf77f997..53ab8e0e3 100644 --- a/src/marqo/core/vespa_index/add_documents_handler.py +++ b/src/marqo/core/vespa_index/add_documents_handler.py @@ -196,6 +196,13 @@ def _handle_field(self, marqo_doc, field_name, field_content) -> None: """ pass + @abstractmethod + def _infer_field_type(self, field_name: str, field_content: Any) -> FieldType: + """ + This method infers the field type of a field based on the field name and content. + """ + pass + @abstractmethod def _handle_multi_modal_fields(self, marqo_doc: Dict[str, Any]) -> None: """ diff --git a/src/marqo/tensor_search/api.py b/src/marqo/tensor_search/api.py index 8386b8382..c64111663 100644 --- a/src/marqo/tensor_search/api.py +++ b/src/marqo/tensor_search/api.py @@ -7,7 +7,7 @@ from fastapi import Depends, FastAPI, Request from fastapi.encoders import jsonable_encoder from fastapi.exceptions import RequestValidationError -from fastapi.responses import JSONResponse +from fastapi.responses import JSONResponse, ORJSONResponse from starlette.status import HTTP_422_UNPROCESSABLE_ENTITY from marqo import config, marqo_docs @@ -267,7 +267,7 @@ def create_index(index_name: str, settings: IndexSettings, marqo_config: config. def search(search_query: SearchQuery, index_name: str, device: str = Depends(api_validation.validate_device), marqo_config: config.Config = Depends(get_config)): with RequestMetricsStore.for_request().time(f"POST /indexes/{index_name}/search"): - return tensor_search.search( + result = tensor_search.search( config=marqo_config, text=search_query.q, index_name=index_name, highlights=search_query.showHighlights, searchable_attributes=search_query.searchableAttributes, @@ -284,6 +284,7 @@ def search(search_query: SearchQuery, index_name: str, device: str = Depends(api text_query_prefix=search_query.textQueryPrefix, hybrid_parameters=search_query.hybridParameters ) + return ORJSONResponse(result) @app.post("/indexes/{index_name}/recommend") diff --git a/src/marqo/tensor_search/tensor_search.py b/src/marqo/tensor_search/tensor_search.py index 5198eaa38..fabb9aa74 100644 --- a/src/marqo/tensor_search/tensor_search.py +++ b/src/marqo/tensor_search/tensor_search.py @@ -32,14 +32,13 @@ """ import copy import json -import traceback import typing import uuid import os from collections import defaultdict from contextlib import ExitStack from timeit import default_timer as timer -from typing import List, Optional, Union, Iterable, Sequence, Dict, Any, Tuple +from typing import List, Optional, Union, Iterable, Sequence, Dict, Any, Tuple, Set import numpy as np import psutil @@ -1775,16 +1774,26 @@ def gather_documents_from_response(response: QueryResult, marqo_index: MarqoInde """ Convert a VespaQueryResponse to a Marqo search response """ + + if (marqo_index.type in [IndexType.Unstructured, IndexType.SemiStructured] and + attributes_to_retrieve is not None): + # Unstructured index and Semi-structured index stores fixed fields (numeric, boolean, string arrays, etc.) in + # combined field. It needs to select attributes after converting vespa doc to marqo doc if + # attributes_to_retrieve is specified + metadata_fields_to_retrieve = {"_id", "_score", "_highlights"} + attributes_to_retrieve_set = set(attributes_to_retrieve).union(metadata_fields_to_retrieve) + else: + # If this set is None, we will return the marqo_doc as is. + attributes_to_retrieve_set = None + vespa_index = vespa_index_factory(marqo_index) hits = [] for doc in response.hits: - marqo_doc = vespa_index.to_marqo_document(doc.dict(), return_highlights=highlights) + marqo_doc = vespa_index.to_marqo_document(dict(doc), return_highlights=highlights) marqo_doc['_score'] = doc.relevance - if (marqo_index.type in [IndexType.Unstructured, IndexType.SemiStructured] and - attributes_to_retrieve is not None): - # For an unstructured index, we do the attributes_to_retrieve after search - marqo_doc = unstructured_index_attributes_to_retrieve(marqo_doc, attributes_to_retrieve) + if attributes_to_retrieve_set is not None: + marqo_doc = select_attributes(marqo_doc, attributes_to_retrieve_set) # Delete chunk data if constants.MARQO_DOC_TENSORS in marqo_doc: @@ -1794,16 +1803,18 @@ def gather_documents_from_response(response: QueryResult, marqo_index: MarqoInde return {'hits': hits} -def unstructured_index_attributes_to_retrieve(marqo_doc: Dict[str, Any], attributes_to_retrieve: List[str]) -> Dict[ - str, Any]: - # attributes_to_retrieve should already be validated at the start of search - attributes_to_retrieve = list(set(attributes_to_retrieve).union({"_id", "_score", "_highlights"})) - return {k: v for k, v in marqo_doc.items() if k in attributes_to_retrieve or - # Please note that numeric map fields are flattened for unstructured or semi-structured indexes. - # Therefore, when filtering on attributes_to_retrieve, we need to also include flattened map fields - # with the specified attributes as prefixes. We keep this behaviour only for compatibility reasons. - any([k.startswith(attribute + ".") for attribute in attributes_to_retrieve])} +def select_attributes(marqo_doc: Dict[str, Any], attributes_to_retrieve_set: Set[str]) -> Dict[str, Any]: + """ + Unstructured index and Semi-structured index retrieve all fixed fields (numeric, boolean, string arrays, etc.) + from Vespa when attributes_to_retrieve is specified. After converting the Vespa doc to Marqo doc, it needs to + filter out attributes not in the attributes_to_retrieve list. + Please note that numeric map fields are flattened for unstructured or semi-structured indexes. + Therefore, when filtering on attributes_to_retrieve, we need to also include flattened map fields + with the specified attributes as prefixes. We keep this behaviour only for compatibility reasons. + """ + return {k: v for k, v in marqo_doc.items() if k in attributes_to_retrieve_set or + '.' in k and k.split('.', maxsplit=1)[0] in attributes_to_retrieve_set} def assign_query_to_vector_job( q: BulkSearchQueryEntity, jobs: Dict[JHash, VectorisedJobs], diff --git a/src/marqo/version.py b/src/marqo/version.py index 1d4e365c8..ba79a954a 100644 --- a/src/marqo/version.py +++ b/src/marqo/version.py @@ -1,4 +1,4 @@ -__version__ = "2.13.4" +__version__ = "2.13.5" def get_version() -> str: return f"{__version__}" diff --git a/src/marqo/vespa/vespa_client.py b/src/marqo/vespa/vespa_client.py index 2f4310a34..4247a44f7 100644 --- a/src/marqo/vespa/vespa_client.py +++ b/src/marqo/vespa/vespa_client.py @@ -11,6 +11,7 @@ import httpcore import httpx +import orjson import marqo.logging import marqo.vespa.concurrency as conc @@ -245,7 +246,7 @@ def query(self, yql: str, hits: int = 10, ranking: str = None, model_restrict: s self._query_raise_for_status(resp) - return QueryResult(**resp.json()) + return QueryResult(**orjson.loads(resp.text)) def feed_document(self, document: VespaDocument, schema: str, timeout: int = 60) -> FeedDocumentResponse: """ diff --git a/tests/core/inference/test_tensor_fields_container.py b/tests/core/inference/test_tensor_fields_container.py index 663e5602e..61f3ed4fd 100644 --- a/tests/core/inference/test_tensor_fields_container.py +++ b/tests/core/inference/test_tensor_fields_container.py @@ -6,7 +6,7 @@ from marqo.core.constants import MARQO_DOC_ID, MARQO_DOC_TENSORS, MARQO_DOC_CHUNKS, MARQO_DOC_EMBEDDINGS from marqo.core.inference.tensor_fields_container import TensorFieldsContainer, MultiModalTensorFieldContent from marqo.core.exceptions import AddDocumentsError -from marqo.core.models.marqo_index import FieldType +from marqo.core.models.marqo_index import FieldType, Field @pytest.mark.unittest @@ -33,12 +33,16 @@ def test_initialisation(self): self.assertTrue(self.container.is_multimodal_field('combo_field2')) self.assertFalse(self.container.is_multimodal_field('combo_field3')) - self.assertEquals({'subfield1', 'tensor_field2'}, self.container.get_multimodal_sub_fields()) - self.assertEquals({'subfield1': 1.0}, self.container.get_multimodal_field_mapping('combo_field1')) - self.assertEquals({'subfield1': 2.0, 'tensor_field2': 5.0}, + self.assertEqual({'subfield1', 'tensor_field2'}, self.container.get_multimodal_sub_fields()) + self.assertEqual({'subfield1': 1.0}, self.container.get_multimodal_field_mapping('combo_field1')) + self.assertEqual({'subfield1': 2.0, 'tensor_field2': 5.0}, self.container.get_multimodal_field_mapping('combo_field2')) - self.assertEquals(0, len(self.container._tensor_field_map)) + self.assertEqual(0, len(self.container._tensor_field_map)) + + def _dummy_infer_field_type(self, field_type): + """A function that returns another function which always returns the field_type.""" + return lambda field_name, field_content: field_type def test_collect_non_tensor_fields(self): test_cases = [ @@ -57,26 +61,28 @@ def test_collect_non_tensor_fields(self): ] for (field_content, field_type) in test_cases: with self.subTest(msg=f'field_content {field_content} of type {field_type}'): - content = self.container.collect('doc_id1', 'field1', field_content, field_type) - self.assertEquals(field_content, content) + content = self.container.collect( + 'doc_id1', 'field1', field_content, self._dummy_infer_field_type(field_type) + ) + self.assertEqual(field_content, content) # verify that they won't be collected to tensor field maps - self.assertEquals(0, len(self.container._tensor_field_map)) + self.assertEqual(0, len(self.container._tensor_field_map)) def test_collect_custom_vector_field(self): content = self.container.collect('doc_id1', 'custom_vector_field1', { 'content': 'content1', 'vector': [1.0, 2.0] - }, None) + }, self._dummy_infer_field_type(None)) - self.assertEquals('content1', content) + self.assertEqual('content1', content) self.assertIn('doc_id1', self.container._tensor_field_map) self.assertIn('custom_vector_field1', self.container._tensor_field_map['doc_id1']) tensor_field_content = self.container._tensor_field_map['doc_id1']['custom_vector_field1'] - self.assertEquals('content1', tensor_field_content.field_content) - self.assertEquals(FieldType.CustomVector, tensor_field_content.field_type) - self.assertEquals(['content1'], tensor_field_content.chunks) - self.assertEquals([[0.4472135954999579, 0.8944271909999159]], tensor_field_content.embeddings) # normalised + self.assertEqual('content1', tensor_field_content.field_content) + self.assertEqual(FieldType.CustomVector, tensor_field_content.field_type) + self.assertEqual(['content1'], tensor_field_content.chunks) + self.assertEqual([[0.4472135954999579, 0.8944271909999159]], tensor_field_content.embeddings) # normalised self.assertTrue(tensor_field_content.is_tensor_field) self.assertFalse(tensor_field_content.is_multimodal_subfield) @@ -112,16 +118,17 @@ def test_collect_tensor_field_with_non_string_type(self): def test_collect_tensor_field_with_string_type(self): for text_field_type in [FieldType.Text, FieldType.ImagePointer, FieldType.AudioPointer, FieldType.VideoPointer]: with self.subTest(msg=f'field_type {text_field_type}'): - content = self.container.collect('doc_id1', 'tensor_field1', 'content', text_field_type) - self.assertEquals('content', content) + content = self.container.collect('doc_id1', 'tensor_field1', 'content', + self._dummy_infer_field_type(text_field_type)) + self.assertEqual('content', content) self.assertIn('doc_id1', self.container._tensor_field_map) self.assertIn('tensor_field1', self.container._tensor_field_map['doc_id1']) tensor_field_content = self.container._tensor_field_map['doc_id1']['tensor_field1'] - self.assertEquals('content', tensor_field_content.field_content) - self.assertEquals(text_field_type, tensor_field_content.field_type) - self.assertEquals([], tensor_field_content.chunks) - self.assertEquals([], tensor_field_content.embeddings) + self.assertEqual('content', tensor_field_content.field_content) + self.assertEqual(text_field_type, tensor_field_content.field_type) + self.assertEqual([], tensor_field_content.chunks) + self.assertEqual([], tensor_field_content.embeddings) self.assertTrue(tensor_field_content.is_tensor_field) self.assertFalse(tensor_field_content.is_multimodal_subfield) @@ -134,16 +141,16 @@ def test_collect_tensor_field_can_identify_toplevel_or_subfield(self): for (field_name, is_tensor_field, is_multimodal_subfield) in test_cases: with self.subTest(msg=f'{field_name}: is_tensor_field={is_tensor_field}, is_multimodal_subfield={is_multimodal_subfield}'): - self.container.collect('doc_id1', field_name, 'content', FieldType.Text) + self.container.collect('doc_id1', field_name, 'content', self._dummy_infer_field_type(FieldType.Text)) tensor_field_content = self.container._tensor_field_map['doc_id1'][field_name] - self.assertEquals(is_tensor_field, tensor_field_content.is_tensor_field) - self.assertEquals(is_multimodal_subfield, tensor_field_content.is_multimodal_subfield) + self.assertEqual(is_tensor_field, tensor_field_content.is_tensor_field) + self.assertEqual(is_multimodal_subfield, tensor_field_content.is_multimodal_subfield) def test_remove_doc(self): - self.container.collect('doc_id1', 'tensor_field1', 'content', FieldType.Text) - self.container.collect('doc_id1', 'tensor_field2', 'content', FieldType.Text) - self.container.collect('doc_id2', 'tensor_field2', 'content', FieldType.Text) + self.container.collect('doc_id1', 'tensor_field1', 'content', self._dummy_infer_field_type(FieldType.Text)) + self.container.collect('doc_id1', 'tensor_field2', 'content', self._dummy_infer_field_type(FieldType.Text)) + self.container.collect('doc_id2', 'tensor_field2', 'content', self._dummy_infer_field_type(FieldType.Text)) self.assertIn('doc_id1', self.container._tensor_field_map) self.assertIn('doc_id2', self.container._tensor_field_map) @@ -157,12 +164,14 @@ def test_remove_doc(self): def test_collect_multimodal_fields_should_return_all(self): fields = list(self.container.collect_multi_modal_fields('doc_id1', True)) - self.assertEquals(('combo_field1', {'subfield1': 1.0}), fields[0]) - self.assertEquals(('combo_field2', {'subfield1': 2.0, 'tensor_field2': 5.0}), fields[1]) + self.assertEqual(('combo_field1', {'subfield1': 1.0}), fields[0]) + self.assertEqual(('combo_field2', {'subfield1': 2.0, 'tensor_field2': 5.0}), fields[1]) def test_collect_multimodal_fields_should_populate_subfields(self): - self.container.collect('doc_id1', 'tensor_field2', 'tensor_field2_content', FieldType.Text) - self.container.collect('doc_id1', 'subfield1', 'subfield1_content', FieldType.Text) + self.container.collect('doc_id1', 'tensor_field2', 'tensor_field2_content', + self._dummy_infer_field_type(FieldType.Text)) + self.container.collect('doc_id1', 'subfield1', 'subfield1_content', + self._dummy_infer_field_type(FieldType.Text)) list(self.container.collect_multi_modal_fields('doc_id1', True)) @@ -170,34 +179,36 @@ def test_collect_multimodal_fields_should_populate_subfields(self): self.assertIn('combo_field1', self.container._tensor_field_map['doc_id1']) combo_field1 = cast(MultiModalTensorFieldContent, self.container._tensor_field_map['doc_id1']['combo_field1']) - self.assertEquals(FieldType.MultimodalCombination, combo_field1.field_type) - self.assertEquals('', combo_field1.field_content) + self.assertEqual(FieldType.MultimodalCombination, combo_field1.field_type) + self.assertEqual('', combo_field1.field_content) self.assertTrue(combo_field1.is_tensor_field) self.assertFalse(combo_field1.is_multimodal_subfield) - self.assertEquals({'subfield1': 1.0}, combo_field1.weights) - self.assertEquals({'subfield1': self.container._tensor_field_map['doc_id1']['subfield1']}, + self.assertEqual({'subfield1': 1.0}, combo_field1.weights) + self.assertEqual({'subfield1': self.container._tensor_field_map['doc_id1']['subfield1']}, combo_field1.subfields) self.assertTrue(combo_field1.normalize_embeddings) combo_field2 = cast(MultiModalTensorFieldContent, self.container._tensor_field_map['doc_id1']['combo_field2']) - self.assertEquals({'subfield1': self.container._tensor_field_map['doc_id1']['subfield1'], + self.assertEqual({'subfield1': self.container._tensor_field_map['doc_id1']['subfield1'], 'tensor_field2': self.container._tensor_field_map['doc_id1']['tensor_field2']}, combo_field2.subfields) def test_collect_multimodal_fields_should_not_populate_subfields_not_existing(self): - self.container.collect('doc_id1', 'tensor_field2', 'tensor_field2_content', FieldType.Text) + self.container.collect('doc_id1', 'tensor_field2', 'tensor_field2_content', + self._dummy_infer_field_type(FieldType.Text)) list(self.container.collect_multi_modal_fields('doc_id1', True)) combo_field1 = cast(MultiModalTensorFieldContent, self.container._tensor_field_map['doc_id1']['combo_field1']) - self.assertEquals({}, combo_field1.subfields) + self.assertEqual({}, combo_field1.subfields) combo_field2 = cast(MultiModalTensorFieldContent, self.container._tensor_field_map['doc_id1']['combo_field2']) - self.assertEquals({'tensor_field2': self.container._tensor_field_map['doc_id1']['tensor_field2']}, + self.assertEqual({'tensor_field2': self.container._tensor_field_map['doc_id1']['tensor_field2']}, combo_field2.subfields) def test_populate_tensor_from_existing_docs_will_not_populate_if_doc_id_does_not_match(self): - self.container.collect('doc_id1', 'tensor_field1', 'tensor_field1_content', FieldType.Text) + self.container.collect('doc_id1', 'tensor_field1', 'tensor_field1_content', + self._dummy_infer_field_type(FieldType.Text)) tensor_field1 = self.container._tensor_field_map['doc_id1']['tensor_field1'] self.container.populate_tensor_from_existing_doc({ @@ -208,11 +219,12 @@ def test_populate_tensor_from_existing_docs_will_not_populate_if_doc_id_does_not } }, {}) - self.assertEquals([], tensor_field1.chunks) - self.assertEquals([], tensor_field1.embeddings) + self.assertEqual([], tensor_field1.chunks) + self.assertEqual([], tensor_field1.embeddings) def test_populate_tensor_from_existing_docs_should_populate_if_doc_id_matches(self): - self.container.collect('doc_id1', 'tensor_field1', 'tensor_field1_content', FieldType.Text) + self.container.collect('doc_id1', 'tensor_field1', 'tensor_field1_content', + self._dummy_infer_field_type(FieldType.Text)) tensor_field1 = self.container._tensor_field_map['doc_id1']['tensor_field1'] self.container.populate_tensor_from_existing_doc({ @@ -223,11 +235,12 @@ def test_populate_tensor_from_existing_docs_should_populate_if_doc_id_matches(se } }, {}) - self.assertEquals(['tensor_field1_content'], tensor_field1.chunks) - self.assertEquals([[1.0, 2.0]], tensor_field1.embeddings) + self.assertEqual(['tensor_field1_content'], tensor_field1.chunks) + self.assertEqual([[1.0, 2.0]], tensor_field1.embeddings) def test_populate_tensor_from_existing_docs_will_not_populate_if_content_changes(self): - self.container.collect('doc_id1', 'tensor_field1', 'changed_content', FieldType.Text) + self.container.collect('doc_id1', 'tensor_field1', 'changed_content', + self._dummy_infer_field_type(FieldType.Text)) tensor_field1 = self.container._tensor_field_map['doc_id1']['tensor_field1'] self.container.populate_tensor_from_existing_doc({ @@ -238,11 +251,12 @@ def test_populate_tensor_from_existing_docs_will_not_populate_if_content_changes } }, {}) - self.assertEquals([], tensor_field1.chunks) - self.assertEquals([], tensor_field1.embeddings) + self.assertEqual([], tensor_field1.chunks) + self.assertEqual([], tensor_field1.embeddings) def test_populate_tensor_from_existing_docs_will_not_populate_if_field_does_not_exist(self): - self.container.collect('doc_id1', 'tensor_field1', 'tensor_field1_content', FieldType.Text) + self.container.collect('doc_id1', 'tensor_field1', 'tensor_field1_content', + self._dummy_infer_field_type(FieldType.Text)) tensor_field1 = self.container._tensor_field_map['doc_id1']['tensor_field1'] self.container.populate_tensor_from_existing_doc({ @@ -251,11 +265,12 @@ def test_populate_tensor_from_existing_docs_will_not_populate_if_field_does_not_ MARQO_DOC_TENSORS: {} }, {}) - self.assertEquals([], tensor_field1.chunks) - self.assertEquals([], tensor_field1.embeddings) + self.assertEqual([], tensor_field1.chunks) + self.assertEqual([], tensor_field1.embeddings) def test_populate_tensor_from_existing_docs_will_not_populate_if_embedding_does_not_exist(self): - self.container.collect('doc_id1', 'tensor_field1', 'tensor_field1_content', FieldType.Text) + self.container.collect('doc_id1', 'tensor_field1', 'tensor_field1_content', + self._dummy_infer_field_type(FieldType.Text)) tensor_field1 = self.container._tensor_field_map['doc_id1']['tensor_field1'] self.container.populate_tensor_from_existing_doc({ @@ -264,11 +279,12 @@ def test_populate_tensor_from_existing_docs_will_not_populate_if_embedding_does_ MARQO_DOC_TENSORS: {} # embedding for tensor_field1 does not exist in the existing doc }, {}) - self.assertEquals([], tensor_field1.chunks) - self.assertEquals([], tensor_field1.embeddings) + self.assertEqual([], tensor_field1.chunks) + self.assertEqual([], tensor_field1.embeddings) def test_populate_tensor_from_existing_docs_will_not_populate_if_existing_field_is_multimodal_combo_field(self): - self.container.collect('doc_id1', 'tensor_field1', 'tensor_field1_content', FieldType.Text) + self.container.collect('doc_id1', 'tensor_field1', 'tensor_field1_content', + self._dummy_infer_field_type(FieldType.Text)) tensor_field1 = self.container._tensor_field_map['doc_id1']['tensor_field1'] self.container.populate_tensor_from_existing_doc({ @@ -278,14 +294,14 @@ def test_populate_tensor_from_existing_docs_will_not_populate_if_existing_field_ } }, {'tensor_field1': {'subfield1': 1.0}}) # tensor_field1 is a multimodal combo field - self.assertEquals([], tensor_field1.chunks) - self.assertEquals([], tensor_field1.embeddings) + self.assertEqual([], tensor_field1.chunks) + self.assertEqual([], tensor_field1.embeddings) def test_populate_tensor_from_existing_docs_will_not_populate_for_custom_vector_field(self): self.container.collect('doc_id1', 'custom_vector_field1', { 'content': 'content1', 'vector': [1.0, 2.0] - }, None) + }, self._dummy_infer_field_type(None)) custom_vector_field1 = self.container._tensor_field_map['doc_id1']['custom_vector_field1'] self.container.populate_tensor_from_existing_doc({ @@ -296,8 +312,8 @@ def test_populate_tensor_from_existing_docs_will_not_populate_for_custom_vector_ } # embedding for tensor_field1 does not exist in the existing doc }, {}) - self.assertEquals(['content1'], custom_vector_field1.chunks) - self.assertEquals([[0.4472135954999579, 0.8944271909999159]], custom_vector_field1.embeddings) + self.assertEqual(['content1'], custom_vector_field1.chunks) + self.assertEqual([[0.4472135954999579, 0.8944271909999159]], custom_vector_field1.embeddings) def test_populate_tensor_from_existing_docs_will_not_populate_for_multimodal_field_if_it_does_not_exist(self): combo_field2 = self._get_combo_field2() @@ -309,8 +325,8 @@ def test_populate_tensor_from_existing_docs_will_not_populate_for_multimodal_fie MARQO_DOC_TENSORS: {} # embedding for combo_field2 does not exist in the existing doc }, {}) - self.assertEquals([], combo_field2.chunks) - self.assertEquals([], combo_field2.embeddings) + self.assertEqual([], combo_field2.chunks) + self.assertEqual([], combo_field2.embeddings) def test_populate_tensor_from_existing_docs_will_not_populate_multimodal_field_with_another_type(self): combo_field2 = self._get_combo_field2() @@ -325,8 +341,8 @@ def test_populate_tensor_from_existing_docs_will_not_populate_multimodal_field_w } # although called combo_field2, it is not a multimodal_tensor field in the existing doc }, {}) - self.assertEquals([], combo_field2.chunks) - self.assertEquals([], combo_field2.embeddings) + self.assertEqual([], combo_field2.chunks) + self.assertEqual([], combo_field2.embeddings) def test_populate_tensor_from_existing_docs_will_not_populate_multimodal_field_with_different_weight(self): combo_field2 = self._get_combo_field2() @@ -340,8 +356,8 @@ def test_populate_tensor_from_existing_docs_will_not_populate_multimodal_field_w } }, {'combo_field2': {'subfield1': 0.5, 'tensor_field2': 5.0}}) # weight is different - self.assertEquals([], combo_field2.chunks) - self.assertEquals([], combo_field2.embeddings) + self.assertEqual([], combo_field2.chunks) + self.assertEqual([], combo_field2.embeddings) def test_populate_tensor_from_existing_docs_will_not_populate_multimodal_field_with_different_subfields(self): combo_field2 = self._get_combo_field2() @@ -355,8 +371,8 @@ def test_populate_tensor_from_existing_docs_will_not_populate_multimodal_field_w } }, {'combo_field2': {'subfield1': 2.0, 'tensor_field2': 5.0}}) - self.assertEquals([], combo_field2.chunks) - self.assertEquals([], combo_field2.embeddings) + self.assertEqual([], combo_field2.chunks) + self.assertEqual([], combo_field2.embeddings) def test_populate_tensor_from_existing_docs_should_populate_multimodal_field_if_all_conditions_match(self): combo_field2 = self._get_combo_field2() @@ -370,22 +386,24 @@ def test_populate_tensor_from_existing_docs_should_populate_multimodal_field_if_ } # although called combo_field2, it is not a multimodal_tensor field in the existing doc }, {'combo_field2': {'subfield1': 2.0, 'tensor_field2': 5.0}}) - self.assertEquals(['combo_field2_content'], combo_field2.chunks) - self.assertEquals([[1.0, 2.0]], combo_field2.embeddings) + self.assertEqual(['combo_field2_content'], combo_field2.chunks) + self.assertEqual([[1.0, 2.0]], combo_field2.embeddings) def _get_combo_field2(self): - self.container.collect('doc_id1', 'tensor_field2', 'tensor_field2_content', FieldType.Text) - self.container.collect('doc_id1', 'subfield1', 'subfield1_content', FieldType.Text) + self.container.collect('doc_id1', 'tensor_field2', 'tensor_field2_content', + self._dummy_infer_field_type(FieldType.Text)) + self.container.collect('doc_id1', 'subfield1', 'subfield1_content', + self._dummy_infer_field_type(FieldType.Text)) list(self.container.collect_multi_modal_fields('doc_id1', True)) return self.container._tensor_field_map['doc_id1']['combo_field2'] def test_traversing_tensor_fields_to_vectorise_should_return_all_fields(self): - self.container.collect('doc_id1', 'tensor_field1', 'tensor_field1_content', FieldType.Text) - self.container.collect('doc_id1', 'tensor_field2', 'tensor_field2_content', FieldType.ImagePointer) - self.container.collect('doc_id1', 'subfield1', 'subfield1_content', FieldType.Text) + self.container.collect('doc_id1', 'tensor_field1', 'tensor_field1_content', self._dummy_infer_field_type(FieldType.Text)) + self.container.collect('doc_id1', 'tensor_field2', 'tensor_field2_content', self._dummy_infer_field_type(FieldType.ImagePointer)) + self.container.collect('doc_id1', 'subfield1', 'subfield1_content', self._dummy_infer_field_type(FieldType.Text)) list(self.container.collect_multi_modal_fields('doc_id1', True)) - self.container.collect('doc_id2', 'tensor_field1', 'tensor_field1_content', FieldType.AudioPointer) - self.container.collect('doc_id2', 'tensor_field2', 'tensor_field2_content', FieldType.VideoPointer) + self.container.collect('doc_id2', 'tensor_field1', 'tensor_field1_content', self._dummy_infer_field_type(FieldType.AudioPointer)) + self.container.collect('doc_id2', 'tensor_field2', 'tensor_field2_content', self._dummy_infer_field_type(FieldType.VideoPointer)) list(self.container.collect_multi_modal_fields('doc_id2', True)) fields = list(self.container.tensor_fields_to_vectorise(FieldType.Text, FieldType.ImagePointer, @@ -397,8 +415,10 @@ def test_traversing_tensor_fields_to_vectorise_should_return_all_fields(self): self.assertIn(('doc_id2', 'tensor_field2', self.container._tensor_field_map['doc_id2']['tensor_field2']), fields) def test_traversing_tensor_fields_to_vectorise_skips_resolved_fields(self): - self.container.collect('doc_id1', 'tensor_field1', 'tensor_field1_content', FieldType.Text) - self.container.collect('doc_id1', 'subfield1', 'subfield1_content', FieldType.Text) + self.container.collect('doc_id1', 'tensor_field1', 'tensor_field1_content', + self._dummy_infer_field_type(FieldType.Text)) + self.container.collect('doc_id1', 'subfield1', 'subfield1_content', + self._dummy_infer_field_type(FieldType.Text)) list(self.container.collect_multi_modal_fields('doc_id1', True)) # resolve tensor_field1 @@ -406,16 +426,20 @@ def test_traversing_tensor_fields_to_vectorise_skips_resolved_fields(self): tensor_field1.populate_chunks_and_embeddings(['hello world'], [[1.0, 1.2]]) fields = list(self.container.tensor_fields_to_vectorise(FieldType.Text)) - self.assertEquals(1, len(fields)) + self.assertEqual(1, len(fields)) (doc_id, field_name, _) = fields[0] - self.assertEquals('doc_id1', doc_id) - self.assertEquals('subfield1', field_name) + self.assertEqual('doc_id1', doc_id) + self.assertEqual('subfield1', field_name) def test_traversing_tensor_fields_to_vectorise_skips_removed_doc(self): - self.container.collect('doc_id1', 'tensor_field1', 'tensor_field1_content', FieldType.Text) - self.container.collect('doc_id1', 'tensor_field2', 'tensor_field2_content', FieldType.Text) - self.container.collect('doc_id2', 'tensor_field1', 'tensor_field1_content', FieldType.Text) - self.container.collect('doc_id2', 'tensor_field2', 'tensor_field2_content', FieldType.Text) + self.container.collect('doc_id1', 'tensor_field1', 'tensor_field1_content', + self._dummy_infer_field_type(FieldType.Text)) + self.container.collect('doc_id1', 'tensor_field2', 'tensor_field2_content', + self._dummy_infer_field_type(FieldType.Text)) + self.container.collect('doc_id2', 'tensor_field1', 'tensor_field1_content', + self._dummy_infer_field_type(FieldType.Text)) + self.container.collect('doc_id2', 'tensor_field2', 'tensor_field2_content', + self._dummy_infer_field_type(FieldType.Text)) fields = [] for doc_id, field_name, _ in self.container.tensor_fields_to_vectorise(FieldType.Text): @@ -423,25 +447,30 @@ def test_traversing_tensor_fields_to_vectorise_skips_removed_doc(self): if doc_id == 'doc_id1': # after taking in the first field of doc_id1, remove doc_id1 to simulate a failure self.container.remove_doc(doc_id) - self.assertEquals(3, len(fields)) + self.assertEqual(3, len(fields)) self.assertIn(('doc_id1', 'tensor_field1'), fields) self.assertIn(('doc_id2', 'tensor_field1'), fields) self.assertIn(('doc_id2', 'tensor_field2'), fields) def test_traversing_tensor_fields_to_vectorise_by_type(self): - self.container.collect('doc_id1', 'tensor_field1', 'tensor_field1_content', FieldType.Text) - self.container.collect('doc_id1', 'tensor_field2', 'tensor_field2_content', FieldType.ImagePointer) + self.container.collect('doc_id1', 'tensor_field1', 'tensor_field1_content', + self._dummy_infer_field_type(FieldType.Text)) + self.container.collect('doc_id1', 'tensor_field2', 'tensor_field2_content', + self._dummy_infer_field_type(FieldType.ImagePointer)) fields = list(self.container.tensor_fields_to_vectorise(FieldType.ImagePointer)) - self.assertEquals(1, len(fields)) + self.assertEqual(1, len(fields)) (doc_id, field_name, _) = fields[0] - self.assertEquals('doc_id1', doc_id) - self.assertEquals('tensor_field2', field_name) + self.assertEqual('doc_id1', doc_id) + self.assertEqual('tensor_field2', field_name) def test_traversing_tensor_fields_to_vectorise_skips_subfields_for_resolved_multimodal_fields(self): - self.container.collect('doc_id1', 'tensor_field1', 'tensor_field1_content', FieldType.Text) - self.container.collect('doc_id1', 'tensor_field2', 'tensor_field2_content', FieldType.Text) - self.container.collect('doc_id1', 'subfield1', 'subfield1_content', FieldType.Text) + self.container.collect('doc_id1', 'tensor_field1', 'tensor_field1_content', + self._dummy_infer_field_type(FieldType.Text)) + self.container.collect('doc_id1', 'tensor_field2', 'tensor_field2_content', + self._dummy_infer_field_type(FieldType.Text)) + self.container.collect('doc_id1', 'subfield1', 'subfield1_content', + self._dummy_infer_field_type(FieldType.Text)) list(self.container.collect_multi_modal_fields('doc_id1', True)) tensor_field2 = self.container._tensor_field_map['doc_id1']['tensor_field2'] @@ -452,39 +481,44 @@ def test_traversing_tensor_fields_to_vectorise_skips_subfields_for_resolved_mult combo_field2.populate_chunks_and_embeddings(['hello world'], [[1.0, 1.2]]) fields = list(self.container.tensor_fields_to_vectorise(FieldType.Text)) - self.assertEquals(1, len(fields)) - self.assertEquals('tensor_field1', fields[0][1]) + self.assertEqual(1, len(fields)) + self.assertEqual('tensor_field1', fields[0][1]) # subfield 1 does not need to be vectorised since all the combo fields using it are resolved # tensor_fields2 does not need to be vectorised since its embeddings are populated and the combo field that # needs it is already resolved def test_get_tensor_field_content_for_persisting(self): - self.container.collect('doc_id1', 'tensor_field1', 'tensor_field1_content', FieldType.Text) - self.container.collect('doc_id1', 'tensor_field2', 'tensor_field2_content', FieldType.Text) - self.container.collect('doc_id1', 'subfield1', 'subfield1_content', FieldType.Text) # subfield is not persisted + self.container.collect('doc_id1', 'tensor_field1', 'tensor_field1_content', + self._dummy_infer_field_type(FieldType.Text)) + self.container.collect('doc_id1', 'tensor_field2', 'tensor_field2_content', + self._dummy_infer_field_type(FieldType.Text)) + self.container.collect('doc_id1', 'subfield1', 'subfield1_content', + self._dummy_infer_field_type(FieldType.Text)) # subfield is not persisted list(self.container.collect_multi_modal_fields('doc_id1', True)) self.container._tensor_field_map['doc_id1']['tensor_field1'].populate_chunks_and_embeddings(['hello world'], [[1.0, 1.2]]) self.container._tensor_field_map['doc_id1']['tensor_field2'].populate_chunks_and_embeddings(['hello world'], [[1.0, 1.2]]) fields = self.container.get_tensor_field_content('doc_id1') - self.assertEquals(4, len(fields)) + self.assertEqual(4, len(fields)) self.assertIn('tensor_field1', fields) self.assertIn('tensor_field2', fields) self.assertIn('combo_field1', fields) self.assertIn('combo_field2', fields) def test_get_tensor_field_content_for_persisting_skips_multimodal_field_with_no_subfields(self): - self.container.collect('doc_id1', 'tensor_field1', 'tensor_field1_content', FieldType.Text) - self.container.collect('doc_id1', 'tensor_field2', 'tensor_field2_content', FieldType.Text) + self.container.collect('doc_id1', 'tensor_field1', 'tensor_field1_content', + self._dummy_infer_field_type(FieldType.Text)) + self.container.collect('doc_id1', 'tensor_field2', 'tensor_field2_content', + self._dummy_infer_field_type(FieldType.Text)) list(self.container.collect_multi_modal_fields('doc_id1', True)) self.container._tensor_field_map['doc_id1']['tensor_field1'].populate_chunks_and_embeddings(['hello world'], [[1.0, 1.2]]) self.container._tensor_field_map['doc_id1']['tensor_field2'].populate_chunks_and_embeddings(['hello world'], [[1.0, 1.2]]) fields = self.container.get_tensor_field_content('doc_id1') - self.assertEquals(3, len(fields)) + self.assertEqual(3, len(fields)) self.assertIn('tensor_field1', fields) self.assertIn('tensor_field2', fields) self.assertIn('combo_field2', fields) # combo_field2 has tensor_field2 as subfield diff --git a/tests/core/vespa_index/test_add_documents_handler.py b/tests/core/vespa_index/test_add_documents_handler.py index 718415723..28cff0819 100644 --- a/tests/core/vespa_index/test_add_documents_handler.py +++ b/tests/core/vespa_index/test_add_documents_handler.py @@ -5,26 +5,31 @@ import pytest from marqo.core.constants import MARQO_DOC_ID -from marqo.core.models.marqo_index import FieldType -from marqo.core.vespa_index.add_documents_handler import AddDocumentsResponseCollector, AddDocumentsHandler -from marqo.core.models.add_docs_params import AddDocsParams, BatchVectorisationMode +from marqo.core.exceptions import DuplicateDocumentError, AddDocumentsError, MarqoDocumentParsingError, \ + InternalError from marqo.core.inference.tensor_fields_container import TensorFieldsContainer -from marqo.core.exceptions import DuplicateDocumentError, AddDocumentsError, MarqoDocumentParsingError, InternalError +from marqo.core.models.add_docs_params import AddDocsParams, BatchVectorisationMode from marqo.core.models.marqo_add_documents_response import MarqoAddDocumentsItem +from marqo.core.models.marqo_index import FieldType +from marqo.core.unstructured_vespa_index.unstructured_add_document_handler import \ + UnstructuredAddDocumentsHandler +from marqo.core.vespa_index.add_documents_handler import AddDocumentsResponseCollector, AddDocumentsHandler from marqo.s2_inference import s2_inference from marqo.s2_inference.errors import S2InferenceError +from marqo.s2_inference.multimodal_model_load import Modality from marqo.vespa.models import VespaDocument, FeedBatchResponse, FeedBatchDocumentResponse from marqo.vespa.models.get_document_response import Document, GetBatchResponse, GetBatchDocumentResponse from tests.marqo_test import MarqoTestCase +from tests.marqo_test import TestAudioUrls, TestVideoUrls, TestImageUrls @pytest.mark.unittest class TestAddDocumentHandler(MarqoTestCase): - class DummyAddDocumentsHandler(AddDocumentsHandler): """ We create a dummy implementation of the AddDocumentsHandler to verify the main workflow """ + def __init__(self, **kwargs): super().__init__(**kwargs) self.handled_fields = [] @@ -38,7 +43,7 @@ def _create_tensor_fields_container(self) -> TensorFieldsContainer: def _handle_field(self, marqo_doc, field_name, field_content) -> None: doc_id = marqo_doc[MARQO_DOC_ID] marqo_doc[field_name] = field_content - self.tensor_fields_container.collect(doc_id, field_name, field_content, FieldType.Text) + self.tensor_fields_container.collect(doc_id, field_name, field_content, self._infer_field_type) self.handled_fields.append((doc_id, field_name)) def _handle_multi_modal_fields(self, marqo_doc: Dict[str, Any]) -> None: @@ -52,6 +57,9 @@ def _to_vespa_doc(self, marqo_doc: Dict[str, Any]) -> VespaDocument: self.to_vespa_doc_call_count += 1 return VespaDocument(id=marqo_doc[MARQO_DOC_ID], fields={}) + def _infer_field_type(self, field_name: str, field_content: Any) -> FieldType: + return FieldType.Text + @patch('marqo.vespa.vespa_client.VespaClient.feed_batch') @patch('marqo.vespa.vespa_client.VespaClient.get_batch') def test_add_documents_main_workflow_happy_path(self, mock_get_batch, mock_feed_batch): @@ -87,7 +95,7 @@ def test_add_documents_main_workflow_happy_path(self, mock_get_batch, mock_feed_ self.assertEqual('index1', response.index_name) self.assertEqual(3, len(response.items)) for i in range(3): - self.assertEqual(str(i+1), response.items[i].id) + self.assertEqual(str(i + 1), response.items[i].id) self.assertEqual(200, response.items[i].status) # verify the workflow call the abstract methods @@ -144,6 +152,7 @@ def handle_field_raise_error(self, marqo_doc, field_name, _) -> None: if field_name == 'field5': raise AddDocumentsError('some error') self.handled_fields.append((marqo_doc[MARQO_DOC_ID], field_name)) + handler._handle_field = handle_field_raise_error.__get__(handler) response = handler.add_documents() @@ -160,7 +169,8 @@ def handle_field_raise_error(self, marqo_doc, field_name, _) -> None: @patch('marqo.vespa.vespa_client.VespaClient.feed_batch') def test_add_documents_should_handle_various_errors(self, mock_feed_batch): mock_feed_batch.side_effect = [FeedBatchResponse(errors=False, responses=[ - FeedBatchDocumentResponse(id='id:index1:index1::1', pathId='path_id1', status=400, message='Could not parse field field1'), + FeedBatchDocumentResponse(id='id:index1:index1::1', pathId='path_id1', status=400, message= + 'Could not parse field field1'), FeedBatchDocumentResponse(id='id:index1:index1::2', pathId='path_id2', status=429, message='vespa error2'), FeedBatchDocumentResponse(id='id:index1:index1::3', pathId='path_id3', status=507, message='vespa error3'), ])] @@ -186,6 +196,7 @@ def to_vespa_doc_throw_error(_, marqo_doc: Dict[str, Any]) -> VespaDocument: if marqo_doc.get('bad_field') == 'bad_content': raise MarqoDocumentParsingError('MarqoDocumentParsingError') return VespaDocument(id=marqo_doc[MARQO_DOC_ID], fields={}) + handler._to_vespa_doc = to_vespa_doc_throw_error.__get__(handler) response = handler.add_documents() @@ -214,7 +225,8 @@ def to_vespa_doc_throw_error(_, marqo_doc: Dict[str, Any]) -> VespaDocument: code='doc_too_large'), MarqoAddDocumentsItem(status=400, id='', message="Can't index an empty dict.", error="Can't index an empty dict.", code='invalid_argument'), - MarqoAddDocumentsItem(status=400, id='', message='Docs must be dicts', error='Docs must be dicts', + MarqoAddDocumentsItem(status=400, id='', message='Docs must be dicts', + error='Docs must be dicts', code='invalid_argument') ], response.items) @@ -318,8 +330,150 @@ def test_add_documents_should_fail_a_batch_using_vectorise_per_doc_strategy(self with self.assertRaisesStrict(InternalError) as context: handler.add_documents() - self.assertEquals('Encountered problem when vectorising batch of documents. Reason: vectorise error', - str(context.exception)) + self.assertEqual('Encountered problem when vectorising batch of documents. Reason: vectorise error', + str(context.exception)) + + def test_unstructured_add_documents_handler_infer_modality_logic_image_false_and_media_false(self): + """Test the logic of the infer_modality method in UnstructuredAddDocumentsHandler when + both treat_urls_and_pointers_as_images and treat_urls_and_pointers_as_media are False.""" + unstructured_add_documents_handler = UnstructuredAddDocumentsHandler( + marqo_index=self.unstructured_marqo_index( + 'index1', 'index1', + treat_urls_and_pointers_as_images=False, + treat_urls_and_pointers_as_media=False + ), + add_docs_params=AddDocsParams( + index_name='index1', tensor_fields=[], docs=[{'_id': '1'}] + ), + vespa_client=self.vespa_client + ) + test_cases = [ + (TestAudioUrls.AUDIO1.value, "audio url should be treated as text"), + (TestVideoUrls.VIDEO1.value, "video url should be treated as text"), + (TestImageUrls.IMAGE1.value, "image url should be treated as text"), + ] + for url, test_case in test_cases: + with self.subTest(msg=test_case): + with patch("marqo.core.unstructured_vespa_index.unstructured_add_document_handler.infer_modality") as mock_infer_modality: + self.assertEqual( + FieldType.Text, unstructured_add_documents_handler. + _infer_field_type(field_name="dummy_field_name", field_content=url) + ) + mock_infer_modality.assert_not_called() + + def test_unstructured_add_documents_handler_infer_modality_logic_image_true_and_media_false(self): + """Test the logic of the infer_modality method in UnstructuredAddDocumentsHandler when + treat_urls_and_pointers_as_images=True and treat_urls_and_pointers_as_media=False.""" + unstructured_add_documents_handler = UnstructuredAddDocumentsHandler( + marqo_index=self.unstructured_marqo_index( + 'index1', 'index1', + treat_urls_and_pointers_as_images=True, + treat_urls_and_pointers_as_media=False + ), + add_docs_params=AddDocsParams( + index_name='index1', tensor_fields=[], docs=[{'_id': '1'}] + ), + vespa_client=self.vespa_client + ) + test_cases = [ + (TestAudioUrls.AUDIO1.value, "audio url should be treated as text", FieldType.Text), + (TestVideoUrls.VIDEO1.value, "video url should be treated as text", FieldType.Text), + (TestImageUrls.IMAGE1.value, "image url should be treated as image", FieldType.ImagePointer), + ] + + for url, test_case, expected_field_type in test_cases: + with self.subTest(msg=test_case): + self.assertEqual( + expected_field_type, + unstructured_add_documents_handler._infer_field_type( + field_name="dummy_field_name", + field_content=url + ) + ) + + def test_unstructured_add_documents_handler_infer_modality_logic_image_true_and_media_true(self): + """Test the logic of the infer_modality method in UnstructuredAddDocumentsHandler when + treat_urls_and_pointers_as_images=True and treat_urls_and_pointers_as_media=True.""" + unstructured_add_documents_handler = UnstructuredAddDocumentsHandler( + marqo_index=self.unstructured_marqo_index( + 'index1', 'index1', + treat_urls_and_pointers_as_images=True, + treat_urls_and_pointers_as_media=True + ), + add_docs_params=AddDocsParams( + index_name='index1', tensor_fields=[], docs=[{'_id': '1'}] + ), + vespa_client=self.vespa_client + ) + test_cases = [ + (TestAudioUrls.AUDIO1.value, "audio url should be treated as audio", FieldType.AudioPointer), + (TestVideoUrls.VIDEO1.value, "video url should be treated as video", FieldType.VideoPointer), + (TestImageUrls.IMAGE1.value, "image url should be treated as image", FieldType.ImagePointer), + ] + + for url, test_case, expected_field_type in test_cases: + with self.subTest(msg=test_case): + self.assertEqual( + expected_field_type, + unstructured_add_documents_handler._infer_field_type( + field_name="dummy_field_name", + field_content=url + ) + ) + + def test_collect_tensor_field_content_infer_modality_logic(self): + """A test to ensure collect_tensor_field_content method in UnstructuredAddDocumentsHandler infer modality + for tensor fields and multimodal sub-fields, but not for non-tensor fields.""" + unstructured_add_documents_handler = UnstructuredAddDocumentsHandler( + marqo_index=self.unstructured_marqo_index( + 'index1', 'index1', + treat_urls_and_pointers_as_images=True, + treat_urls_and_pointers_as_media=True + ), + add_docs_params=AddDocsParams( + index_name='index1', tensor_fields=["tensor_field", "my_multimodal_field"], docs=[{'_id': '1'}], + mappings={ + "my_multimodal_field": + { + "type": "multimodal_combination", + "weights": { + "text_field": 0.5, "image_field": 0.8 + } + } + } + ), + vespa_client=self.vespa_client + ) + test_doc = { + "non_tensor_field": TestAudioUrls.AUDIO1.value, + "tensor_field": "This is a tensor field so its modality should be inferred", + "text_field": "This is a sub field of my_multimodal_field so its modality should be inferred", + "image_field": TestImageUrls.IMAGE1.value, + "another_non_tensor_field": TestImageUrls.IMAGE1.value, + "_id": "test" + } + + test_cases = ( + ("non_tensor_field", "A non-tensor field should not be inferred for modality", False), + ("tensor_field", "A tensor field should be inferred for modality", True), + ("text_field", "A sub field of a multimodal field should be inferred for modality", True), + ("image_field", "A sub field of a multimodal field should be inferred for modality", True), + ("another_non_tensor_field", "A non-tensor field should not be inferred for modality", False), + + ) + + for field_name, msg, called in test_cases: + with self.subTest(f"{field_name} - {msg}"): + with patch("marqo.core.unstructured_vespa_index.unstructured_add_document_handler.infer_modality", + return_value=Modality.TEXT) as mock_infer_modality: + _ = unstructured_add_documents_handler._handle_field( + test_doc, field_name=field_name, + field_content=test_doc[field_name] + ) + if called: + mock_infer_modality.assert_called_once_with(test_doc[field_name], None) + else: + mock_infer_modality.assert_not_called() @pytest.mark.unittest @@ -362,7 +516,8 @@ def test_collect_error_response_should_capture_add_document_error_with_default_v def test_collect_error_response_should_capture_add_document_error_with_custom_values(self): collector = AddDocumentsResponseCollector() collector.collect_error_response('doc_id1', AddDocumentsError('error message 2', - error_code='err_code', status_code=403), loc=1) + error_code='err_code', + status_code=403), loc=1) self.assertTrue(collector.errors) loc, add_doc_item = collector.responses[0] self.assertEquals(1, loc) @@ -452,11 +607,13 @@ def test_collect_final_responses(self, mock_timer): self.assertEquals('index', response.index_name) self.assertEquals(1000, response.processingTimeMs) - self.assertEquals(4, len(response.items)) - self.assertEquals('doc_id4', response.items[0].id) # doc_id4 is the original doc_id - self.assertEquals('error message 4', response.items[0].message) - self.assertEquals('doc_id3', response.items[1].id) # doc_id3 should be returned since it's persisted - self.assertEquals('', response.items[2].id) # gen_doc_id2 is generated, should not be returned for error - self.assertEquals('error message 2', response.items[2].message) - self.assertEquals('', response.items[3].id) # doc_id1 error message does not contain id, this came last - self.assertEquals('error message 1', response.items[3].message) + self.assertEqual(4, len(response.items)) + self.assertEqual('doc_id4', response.items[0].id) # doc_id4 is the original doc_id + self.assertEqual('error message 4', response.items[0].message) + self.assertEqual('doc_id3', response.items[1].id) # doc_id3 should be returned since it's persisted + self.assertEqual('', + response.items[2].id) # gen_doc_id2 is generated, should not be returned for error + self.assertEqual('error message 2', response.items[2].message) + self.assertEqual('', + response.items[3].id) # doc_id1 error message does not contain id, this came last + self.assertEqual('error message 1', response.items[3].message) \ No newline at end of file diff --git a/tests/tensor_search/integ_tests/test_add_documents_semi_structured.py b/tests/tensor_search/integ_tests/test_add_documents_semi_structured.py index 381d39d8b..d4cfcad1e 100644 --- a/tests/tensor_search/integ_tests/test_add_documents_semi_structured.py +++ b/tests/tensor_search/integ_tests/test_add_documents_semi_structured.py @@ -754,4 +754,29 @@ def test_duplicate_ids_behaviour(self): self.assertEqual(1, len(r["items"])) number_of_docs_in_index = self.config.monitoring.get_index_stats_by_name( index_name=self.default_text_index).number_of_documents - self.assertEqual(number_of_docs, number_of_docs_in_index) \ No newline at end of file + self.assertEqual(number_of_docs, number_of_docs_in_index) + + def test_a_text_index_will_treat_a_url_as_text(self): + """Test that a text index will treat a URL as text and not download the image""" + valid_url = TestImageUrls.HIPPO_REALISTIC.value + invalid_url = TestImageUrls.HIPPO_REALISTIC.value + "invalid" + self.add_documents( + config=self.config, add_docs_params=AddDocsParams( + index_name=self.default_text_index, docs=[ + { + "_id": "1", + "title": invalid_url, + "non_tensor_field": valid_url + } + ], + device="cpu", tensor_fields=["title"] + ) + ) + doc = tensor_search.get_document_by_id( + config=self.config, index_name=self.default_text_index, document_id="1", show_vectors=True + ) + + self.assertEqual(invalid_url, doc["title"]) + self.assertEqual(valid_url, doc["non_tensor_field"]) + self.assertEqual(1, len(doc[enums.TensorField.tensor_facets])) + self.assertIn("title", doc[enums.TensorField.tensor_facets][0]) \ No newline at end of file diff --git a/tests/tensor_search/integ_tests/test_add_documents_unstructured.py b/tests/tensor_search/integ_tests/test_add_documents_unstructured.py index 4b231aa7f..490536eb6 100644 --- a/tests/tensor_search/integ_tests/test_add_documents_unstructured.py +++ b/tests/tensor_search/integ_tests/test_add_documents_unstructured.py @@ -765,4 +765,29 @@ def test_duplicate_ids_behaviour(self): self.assertEqual(1, len(r["items"])) number_of_docs_in_index = self.config.monitoring.get_index_stats_by_name( index_name=self.default_text_index).number_of_documents - self.assertEqual(number_of_docs, number_of_docs_in_index) \ No newline at end of file + self.assertEqual(number_of_docs, number_of_docs_in_index) + + def test_a_text_index_will_treat_a_url_as_text(self): + """Test that a text index will treat a URL as text and not download the image""" + valid_url = TestImageUrls.HIPPO_REALISTIC.value + invalid_url = TestImageUrls.HIPPO_REALISTIC.value + "invalid" + self.add_documents( + config=self.config, add_docs_params=AddDocsParams( + index_name=self.default_text_index, docs=[ + { + "_id": "1", + "title": invalid_url, + "non_tensor_field": valid_url + } + ], + device="cpu", tensor_fields=["title"] + ) + ) + doc = tensor_search.get_document_by_id( + config=self.config, index_name=self.default_text_index, document_id="1", show_vectors=True + ) + + self.assertEqual(invalid_url, doc["title"]) + self.assertEqual(valid_url, doc["non_tensor_field"]) + self.assertEqual(1, len(doc[enums.TensorField.tensor_facets])) + self.assertIn("title", doc[enums.TensorField.tensor_facets][0]) diff --git a/tests/tensor_search/test_unstructured_index_attributes_to_retrieve.py b/tests/tensor_search/test_unstructured_index_attributes_to_retrieve.py new file mode 100644 index 000000000..c47a9f627 --- /dev/null +++ b/tests/tensor_search/test_unstructured_index_attributes_to_retrieve.py @@ -0,0 +1,73 @@ +import unittest + +from marqo.tensor_search.tensor_search import select_attributes + + +class TestUnstructuredIndexSelectAttributes(unittest.TestCase): + + def test_return_meta_fields_when_attributes_to_retrieve_is_empty(self): + """test that ["_id", "_score", "_highlights"] will be returned when attributes_to_retrieve is empty""" + marqo_doc = { + '_id': '123', + '_score': 0.12, + '_highlights': [{'title': 'Stay Elevated Dress Shoes - Black'}], + 'some_other_fields': 'hello', + '_lexical_score': 0.11, # FIXME: this is a meta field for hybrid search, but isn't returned + '_tensor_score': 0.12, # FIXME: this is a meta field for hybrid search, but isn't returned + } + + expected_result = marqo_doc.copy() + del expected_result['some_other_fields'] + del expected_result['_lexical_score'] + del expected_result['_tensor_score'] + + filtered_doc = select_attributes(marqo_doc, {"_id", "_score", "_highlights"}) + self.assertEqual(expected_result, filtered_doc) + + def test_return_selected_field_and_meta_fields_when_attributes_to_retrieve_is_not_empty(self): + marqo_doc = { + '_id': '123', + '_score': 0.12, + 'string_field': 'aaa', + 'int_field': 123, + 'float_field': 1.23, + 'bool_field': True, + 'string_array_field': ['a', 'b'], + 'some_other_fields': 'hello', + } + + expected_result = marqo_doc.copy() + del expected_result['some_other_fields'] + + attributes_to_retrieve = {'string_field', 'int_field', 'float_field', 'bool_field', 'string_array_field', + "_id", "_score", "_highlights", "field_not_exist_in_doc"} + + filtered_doc = select_attributes(marqo_doc, attributes_to_retrieve) + self.assertEqual(expected_result, filtered_doc) + + def test_return_selected_map_field_flattened(self): + marqo_doc = { + '_id': '123', + '_score': 0.12, + 'int_map_field1.a': 1, + 'int_map_field1.b': 2, + 'int_map_field2.c': 3, + 'int_map_field2.d': 4, + 'float_map_field1.a': 1.0, + 'float_map_field1.b': 2.0, + 'float_map_field2.c': 3.0, + 'float_map_field2.d': 4.0, + 'some_other_fields': 'hello', + } + + expected_result = marqo_doc.copy() + del expected_result['some_other_fields'] + del expected_result['int_map_field2.c'] + del expected_result['int_map_field2.d'] + del expected_result['float_map_field1.a'] + del expected_result['float_map_field1.b'] + + attributes_to_retrieve = {'int_map_field1', 'float_map_field2', "_id", "_score", "_highlights"} + + filtered_doc = select_attributes(marqo_doc, attributes_to_retrieve) + self.assertEqual(expected_result, filtered_doc)