Merge branch 'feature/PI-477-query_by_tag_ignore_chunky_fields' into release/2024-09-13

megan-bower4 · megan-bower4 · commit 10d9fedd2d20 · 2024-09-13T13:43:36.000+01:00
diff --git a/src/api/searchSdsDevice/src/v1/steps.py b/src/api/searchSdsDevice/src/v1/steps.py
@@ -7,6 +7,8 @@
 from event.step_chain import StepChain
 from pydantic import ValidationError
 
+FIELDS_TO_DROP = ["tags"]
+
 
 def parse_event_query(data, cache):
     event = APIGatewayProxyEvent(data[StepChain.INIT])
@@ -31,7 +33,7 @@ def query_devices(data, cache) -> List[dict]:
     device_repo = DeviceRepository(
         table_name=cache["DYNAMODB_TABLE"], dynamodb_client=cache["DYNAMODB_CLIENT"]
     )
-    results = device_repo.query_by_tag(**query_params)
+    results = device_repo.query_by_tag(fields_to_drop=FIELDS_TO_DROP, **query_params)
     return [result.state() for result in results]
 
 
diff --git a/src/api/searchSdsDevice/tests/test_index.py b/src/api/searchSdsDevice/tests/test_index.py
@@ -41,7 +41,8 @@ def _create_device(device, product_team, params):
 
     questionnaire_response = questionnaire.respond(responses=response)
     cpmdevice.add_questionnaire_response(questionnaire_response=questionnaire_response)
-    cpmdevice.add_tag(**params)
+    tag_params = [params]
+    cpmdevice.add_tags(tags=tag_params)
     return cpmdevice
 
 
diff --git a/src/api/searchSdsEndpoint/src/v1/steps.py b/src/api/searchSdsEndpoint/src/v1/steps.py
@@ -7,6 +7,8 @@
 from event.step_chain import StepChain
 from pydantic import ValidationError
 
+FIELDS_TO_DROP = ["tags"]
+
 
 def parse_event_query(data, cache):
     event = APIGatewayProxyEvent(data[StepChain.INIT])
@@ -32,7 +34,7 @@ def query_endpoints(data, cache) -> List[dict]:
     device_repo = DeviceRepository(
         table_name=cache["DYNAMODB_TABLE"], dynamodb_client=cache["DYNAMODB_CLIENT"]
     )
-    results = device_repo.query_by_tag(**query_params)
+    results = device_repo.query_by_tag(fields_to_drop=FIELDS_TO_DROP, **query_params)
     return [result.state() for result in results]
 
 
diff --git a/src/api/searchSdsEndpoint/tests/test_index.py b/src/api/searchSdsEndpoint/tests/test_index.py
@@ -41,7 +41,8 @@ def _create_device(device, product_team, params):
 
     questionnaire_response = questionnaire.respond(responses=response)
     cpmdevice.add_questionnaire_response(questionnaire_response=questionnaire_response)
-    cpmdevice.add_tag(**params)
+    tag_params = [params]
+    cpmdevice.add_tags(tags=tag_params)
     return cpmdevice
 
 
diff --git a/src/etl/sds/worker/load_bulk/tests/test_load_bulk_worker.py b/src/etl/sds/worker/load_bulk/tests/test_load_bulk_worker.py
@@ -31,9 +31,10 @@ def all_devices(self) -> Generator[Device, None, None]:
         for device in devices:
             if not device.get("root"):
                 continue
-            device["tags"] = [
-                pkl_loads_gzip(tag) for tag in pkl_loads_gzip(device["tags"])
-            ]
+            if device.get("tags"):  # Only decompress if tags not empty
+                device["tags"] = [
+                    pkl_loads_gzip(tag) for tag in pkl_loads_gzip(device["tags"])
+                ]
             yield Device(**device)
 
     def count(self, by: DeviceType | DeviceKeyType):
diff --git a/src/layers/domain/core/device/v2.py b/src/layers/domain/core/device/v2.py
@@ -450,6 +450,10 @@ def state(self) -> dict:
     def is_active(self):
         return self.status is Status.ACTIVE
 
+    @classmethod
+    def get_all_fields(cls) -> set[str]:
+        return set(cls.__fields__.keys())
+
 
 class DeviceEventDeserializer(EventDeserializer):
     event_types = (
diff --git a/src/layers/domain/repository/device_repository/tests/v2/test_device_repository_tags_v2.py b/src/layers/domain/repository/device_repository/tests/v2/test_device_repository_tags_v2.py
@@ -1,7 +1,13 @@
+from collections import defaultdict
+
 import pytest
 from domain.core.device.v2 import Device, DeviceTag
 from domain.core.device_key.v2 import DeviceKeyType
-from domain.repository.device_repository.v2 import DeviceRepository
+from domain.core.enum import Status
+from domain.repository.device_repository.v2 import (
+    MANDATORY_DEVICE_FIELDS,
+    DeviceRepository,
+)
 
 
 @pytest.mark.integration
@@ -113,3 +119,50 @@ def test__device_repository__add_two_tags_and_then_clear(
 
     assert repository.query_by_tag(shoe_size=123) == []
     assert repository.query_by_tag(shoe_size=456) == []
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize(
+    "field_to_drop, expected_default_value",
+    [
+        (["tags"], set()),  # If 'tags' is dropped, it should default to an empty set
+        (["keys"], []),  # If 'keys' is dropped, it should default to an empty list
+        (["status"], Status.ACTIVE),  # 'status' should default to Status.ACTIVE
+        (["updated_on"], None),  # 'updated_on' should default to None
+        (["deleted_on"], None),  # 'deleted_on' should default to None
+        (
+            ["questionnaire_responses"],
+            defaultdict(dict),
+        ),  # 'questionnaire_responses' defaults to an empty dict
+    ],
+)
+def test__device_repository__drop_fields(
+    device: Device, repository: DeviceRepository, field_to_drop, expected_default_value
+):
+    repository.write(device)
+    (_device_123,) = repository.query_by_tag(abc=123)
+    assert _device_123.dict() == device.dict()
+
+    # Query with specific fields to drop
+    results = repository.query_by_tag(abc=123, fields_to_drop=field_to_drop)
+    assert len(results) == 1
+
+    device_result = results[0]
+
+    assert device_result.dict()[field_to_drop[0]] == expected_default_value
+    assert all(field in device_result.dict() for field in MANDATORY_DEVICE_FIELDS)
+
+
+@pytest.mark.integration
+def test__device_repository__drop_mandatory_fields(
+    device: Device, repository: DeviceRepository
+):
+    repository.write(device)
+    (_device_123,) = repository.query_by_tag(abc=123)
+    assert _device_123.dict() == device.dict()
+
+    # Query with mandatory fields to drop
+    fields_to_drop = list(MANDATORY_DEVICE_FIELDS)
+
+    with pytest.raises(ValueError, match="Cannot drop mandatory fields:"):
+        repository.query_by_tag(abc=123, fields_to_drop=fields_to_drop)
diff --git a/src/layers/domain/repository/device_repository/v2.py b/src/layers/domain/repository/device_repository/v2.py
@@ -33,6 +33,7 @@
 ROOT_FIELDS_TO_COMPRESS = [TAGS]
 NON_ROOT_FIELDS_TO_COMPRESS = ["questionnaire_responses"]
 BATCH_GET_SIZE = 100
+MANDATORY_DEVICE_FIELDS = {"name", "device_type", "product_team_id", "ods_code"}
 
 
 class TooManyResults(Exception):
@@ -42,36 +43,38 @@ class TooManyResults(Exception):
 def compress_device_fields(data: Event | dict, fields_to_compress=None) -> dict:
     _data = copy(data) if isinstance(data, dict) else asdict(data, recurse=False)
 
-    # pop unknown keys
+    # Pop unknown keys
     unknown_keys = _data.keys() - set(Device.__fields__)
     for k in unknown_keys:
         _data.pop(k)
 
-    # compress specified keys if they exist in the data
+    # Compress specified keys if they exist in the data
     fields_to_compress = (fields_to_compress or []) + ROOT_FIELDS_TO_COMPRESS
-    fields_to_compress_that_exist = [f for f in fields_to_compress if f in _data]
+    fields_to_compress_that_exist = [f for f in fields_to_compress if _data.get(f)]
     for field in fields_to_compress_that_exist:
+        # Fields that are missing or empty were filtered out above
         if field == TAGS:
-            # tags are doubly compressed: first compress each tag in the list,
-            # and then compress the entire list in the line directly after this
-            # if-block
+            # Tags are doubly compressed: first compress each tag in the list
             _data[field] = [pkl_dumps_gzip(tag) for tag in _data[field]]
+        # Compress the entire field (for tags, this is the second compression pass)
         _data[field] = pkl_dumps_gzip(_data[field])
     return _data
 
 
 def decompress_device_fields(device: dict):
     for field in ROOT_FIELDS_TO_COMPRESS:
-        device[field] = pkl_loads_gzip(device[field])
-        if field == TAGS:
-            # tags are doubly compressed, so first decompress the entire tag list
-            # in the line directly before this if-block, then decompress each tag
-            # in the list
-            device[field] = [pkl_loads_gzip(tag) for tag in device[field]]
-
-    if device["root"] is False:
+        if device.get(field):  # Check if the field is present and not empty
+            device[field] = pkl_loads_gzip(device[field])  # First decompression
+            if field == TAGS:  # Tags are doubly compressed.
+                # Second decompression: Decompress each tag in the list
+                device[field] = [pkl_loads_gzip(tag) for tag in device[field]]
+
+    # Decompress non-root fields if the device is not a root and fields exist
+    if not device.get("root"):  # Use get to handle missing 'root' field
         for field in NON_ROOT_FIELDS_TO_COMPRESS:
-            device[field] = pkl_loads_gzip(device[field])
+            if device.get(field):  # Check if the field is present and non-empty
+                device[field] = pkl_loads_gzip(device[field])
+
     return device
 
 
@@ -553,30 +556,64 @@ def read_inactive(self, *key_parts: str) -> Device:
         _device = unmarshall(item)
         return Device(**decompress_device_fields(_device))
 
-    def query_by_tag(self, **kwargs) -> list[Device]:
+    def query_by_tag(self, fields_to_drop: list[str] = None, **kwargs) -> list[Device]:
         """
-        Query the device by predefined tags:
-
-            repository.query_by_tag(foo="123", bar="456")
-
-        NB: the DeviceTag enforces that values (but not keys) are case insensitive
+        Query the device by predefined tags, optionally dropping specific fields from the query result.
+        Example: repository.query_by_tag(fields_to_drop=["field1", "field2"], foo="123", bar="456")
         """
+
         tag_value = DeviceTag(**kwargs).value
         pk = TableKey.DEVICE_TAG.key(tag_value)
 
-        # Initial query to retrieve a list of all the root-device pk's
-        response = self.client.query(
-            ExpressionAttributeValues={":pk": marshall_value(pk)},
-            KeyConditionExpression="pk = :pk",
-            TableName=self.table_name,
-        )
+        query_params = {
+            "ExpressionAttributeValues": {":pk": marshall_value(pk)},
+            "KeyConditionExpression": "pk = :pk",
+            "TableName": self.table_name,
+        }
+
+        # If fields to drop are provided, create a ProjectionExpression
+        if fields_to_drop:
+            all_fields = Device.get_all_fields()
+
+            # Ensure no mandatory fields are dropped
+            dropped_mandatory_fields = set(fields_to_drop) & MANDATORY_DEVICE_FIELDS
+            if dropped_mandatory_fields:
+                raise ValueError(
+                    f"Cannot drop mandatory fields: {', '.join(dropped_mandatory_fields)}"
+                )
+
+            fields_to_return = all_fields - set(fields_to_drop)
+
+            # DynamoDB ProjectionExpression, specifying which fields to return
+            query_params.update(_dynamodb_projection_expression(fields_to_return))
+
+        # Perform the DynamoDB query
+        response = self.client.query(**query_params)
+
         # Not yet implemented: pagination
         if "LastEvaluatedKey" in response:
             raise TooManyResults(f"Too many results for query '{kwargs}'")
 
-        # Convert to Device, sorted by 'pk', which would have been
-        # the expected behaviour if tags in the database were
-        # Device duplicates rather than references
+        # Convert to Device, sorted by 'id'
         compressed_devices = map(unmarshall, response["Items"])
         devices_as_dict = map(decompress_device_fields, compressed_devices)
+
         return [Device(**d) for d in sorted(devices_as_dict, key=lambda d: d["id"])]
+
+
+def _dynamodb_projection_expression(updated_fields: list[str]):
+    expression_attribute_names = {}
+    update_clauses = []
+
+    for field_name in updated_fields:
+        field_name_placeholder = f"#{field_name}"
+
+        update_clauses.append(field_name_placeholder)
+        expression_attribute_names[field_name_placeholder] = field_name
+
+    projection_expression = ", ".join(update_clauses)
+
+    return dict(
+        ProjectionExpression=projection_expression,
+        ExpressionAttributeNames=expression_attribute_names,
+    )
diff --git a/src/test_helpers/validate_search_response.py b/src/test_helpers/validate_search_response.py
@@ -4,7 +4,7 @@ def validate_result_body(result_body, devices, params):
     for index, result in enumerate(result_body):
         validate_device(result, devices[index])
         validate_keys(result["keys"], devices[index])
-        validate_tags(result["tags"], params)
+        validate_tags(result["tags"])
         validate_questionnaire_responses(result, devices[index], params)
 
 
@@ -18,10 +18,8 @@ def validate_keys(keys, device):
         assert key["key_value"] == device["device_key"]
 
 
-def validate_tags(tags, params):
-    for tag in tags:
-        for key, value in params.items():
-            assert [key, value.lower()] in tag
+def validate_tags(tags):
+    assert tags == []  # The tags field is dropped due to being chunky
 
 
 def validate_questionnaire_responses(result, device, params):