Skip to content

Commit a43fef4

Browse files
authored
Extra pruning of Structured Annotations (#151)
* Extra pruning of Structured Annotations * Adding asserts * Expand test cases * Typos * Modify exclude_metadata argument to be a Callable for custom metadata pruning * PR comments * Adding a minimal test * Adding minimal test * Better callback exception handling
1 parent 6bf3dec commit a43fef4

File tree

6 files changed

+135
-8
lines changed

6 files changed

+135
-8
lines changed

tests/__init__.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,9 @@ def generate_test_case(num_axes, num_ranges, max_value):
115115
return input_ranges, expected_output
116116

117117

118-
def generate_xml(has_macro=True, has_label=True, root_tag="OME", num_images=1):
118+
def generate_xml(
119+
has_macro=True, has_label=True, has_annotations=True, root_tag="OME", num_images=1
120+
):
119121
"""Generate synthetic XML strings with options to include 'macro' and 'label' images."""
120122

121123
# Create the root element
@@ -204,5 +206,37 @@ def generate_xml(has_macro=True, has_label=True, root_tag="OME", num_images=1):
204206
)
205207
tiffdata = ET.SubElement(pixels, "TiffData", IFD="2", PlaneCount="1")
206208

209+
if has_annotations:
210+
annotations = ET.SubElement(ome, "StructuredAnnotations")
211+
if has_macro:
212+
macro_annot = ET.SubElement(
213+
annotations,
214+
"CommentAnnotation",
215+
ID=f"Annotation:{random.randint}",
216+
Namespace="",
217+
)
218+
description = ET.SubElement(macro_annot, "Description")
219+
220+
# Based on standard
221+
description.text = "barcode_value"
222+
value = ET.SubElement(macro_annot, "Value")
223+
value.text = "random_text"
224+
if has_label:
225+
label_annot = ET.SubElement(
226+
annotations,
227+
"CommentAnnotation",
228+
ID=f"Annotation:{random.randint}",
229+
Namespace="",
230+
)
231+
description = ET.SubElement(
232+
label_annot,
233+
"Description",
234+
)
235+
value = ET.SubElement(label_annot, "Value")
236+
237+
# Based on standard
238+
description.text = "label_text"
239+
value.text = "random_text"
240+
207241
# Convert the ElementTree to a string
208242
return ET.tostring(ome, encoding="unicode")

tests/integration/converters/test_ome_tiff.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,31 @@ def test_ome_tiff_converter_exclude_original_metadata(
128128
with TileDBOpenSlide(str(tiledb_path)) as t:
129129
assert t.properties["original_metadata"] == "{}"
130130

131+
# Exclude with a custom function applies only to images with OME-XML metadata
132+
if filename != "UTM2GTIF.tiff":
133+
134+
def custom_pruning_function(xml_string: str):
135+
return "custom_metadata"
136+
137+
input_path = get_path(filename)
138+
tiledb_path = tmp_path / "to_tiledb_custom_metadata"
139+
OMETiffConverter.to_tiledb(
140+
input_path,
141+
str(tiledb_path),
142+
preserve_axes=preserve_axes,
143+
chunked=chunked,
144+
max_workers=max_workers,
145+
compressor=compressor,
146+
log=False,
147+
exclude_metadata=custom_pruning_function,
148+
)
149+
150+
with TileDBOpenSlide(str(tiledb_path)) as t:
151+
assert (
152+
t.properties["original_metadata"]
153+
== '{"ome_metadata": "custom_metadata"}'
154+
)
155+
131156

132157
@pytest.mark.parametrize(
133158
"filename,num_series", [("CMU-1-Small-Region.ome.tiff", 3), ("UTM2GTIF.tiff", 1)]

tests/unit/test_helpers.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,14 +63,25 @@ def test_validate_ingestion(num_axes, num_ranges, max_value):
6363

6464
@pytest.mark.parametrize("macro", [True, False])
6565
@pytest.mark.parametrize("has_label", [True, False])
66+
@pytest.mark.parametrize("annotations", [True, False])
6667
@pytest.mark.parametrize("num_images", [1, 2, 3])
6768
@pytest.mark.parametrize("root_tag", ["OME", "InvalidRoot"])
68-
def test_remove_ome_image_metadata(macro, has_label, num_images, root_tag):
69+
def test_remove_ome_image_metadata(macro, has_label, annotations, num_images, root_tag):
6970
original_xml_string = generate_xml(
70-
has_macro=macro, has_label=has_label, num_images=1, root_tag=root_tag
71+
has_macro=macro,
72+
has_label=has_label,
73+
has_annotations=annotations,
74+
num_images=1,
75+
root_tag=root_tag,
7176
)
7277

7378
excluded_metadata = remove_ome_image_metadata(original_xml_string)
79+
80+
namespaces = {"ome": "http://www.openmicroscopy.org/Schemas/OME/2016-06"}
81+
82+
barcode_xpath = ".//ome:StructuredAnnotations/ome:CommentAnnotation[ome:Description='barcode_value']"
83+
label_xpath = ".//ome:StructuredAnnotations/ome:CommentAnnotation[ome:Description='label_text']"
84+
7485
if root_tag == "OME":
7586
parsed_excluded = ET.fromstring(excluded_metadata)
7687

@@ -89,5 +100,10 @@ def test_remove_ome_image_metadata(macro, has_label, num_images, root_tag):
89100
)
90101
is None
91102
)
103+
104+
# Assert that the "barcode_value" and "label_text" comment annotations have been removed
105+
assert parsed_excluded.find(barcode_xpath, namespaces=namespaces) is None
106+
assert parsed_excluded.find(label_xpath, namespaces=namespaces) is None
107+
92108
else:
93109
assert remove_ome_image_metadata(original_xml_string) is None

tiledb/bioimg/converters/base.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from operator import itemgetter
1010
from typing import (
1111
Any,
12+
Callable,
1213
Dict,
1314
Generic,
1415
Iterator,
@@ -41,6 +42,7 @@
4142

4243
from .. import ATTR_NAME
4344
from ..helpers import (
45+
MetadataCallbackError,
4446
ReadWriteGroup,
4547
compute_channel_minmax,
4648
get_axes_mapper,
@@ -324,7 +326,7 @@ def to_tiledb(
324326
preserve_axes: bool = False,
325327
chunked: bool = False,
326328
max_workers: int = 0,
327-
exclude_metadata: bool = False,
329+
exclude_metadata: Union[bool, Callable[[str], str], None] = None,
328330
experimental_reader: bool = False,
329331
experimental_queue_limit: Tuple[int, int] = (10, 20),
330332
compressor: Optional[Union[Mapping[int, Any], Any]] = None,
@@ -350,7 +352,14 @@ def to_tiledb(
350352
original ones.
351353
:param max_workers: Maximum number of threads that can be used for conversion.
352354
Applicable only if chunked=True.
353-
:param exclude_metadata: If true, drop original metadata of the images and exclude them from being ingested.
355+
:param exclude_metadata: An optional argument that specifies how to transform the original metadata.
356+
It can be one of the following:
357+
* A callable (function, method, etc.) that takes the original OME-XML string and returns the transformed
358+
OME-XML string to store in its place, e.g. with some of the original metadata removed.
359+
* A boolean value:
360+
* ``True``: Indicates a specific built-in transformation should be applied. see: `remove_ome_image_metadata`
361+
* ``False``: Indicates no transformation should be applied.
362+
* ``None``: Indicates no transformation should be applied (same as ``False``).
354363
:param experimental_reader: If true, use the experimental tiff reader optimized for s3 reads.
355364
Experimental feature, use with caution
356365
:param experimental_queue_limit: When using the experimental reader, define the minimum and maximum number of
@@ -536,7 +545,15 @@ def to_tiledb(
536545
original_metadata = reader.original_metadata
537546
else:
538547
if ome_xml := reader.original_metadata.get("ome_metadata"):
539-
pruned_metadata = remove_ome_image_metadata(ome_xml)
548+
if isinstance(exclude_metadata, bool):
549+
pruned_metadata = remove_ome_image_metadata(ome_xml)
550+
elif callable(exclude_metadata):
551+
try:
552+
pruned_metadata = exclude_metadata(ome_xml)
553+
except Exception as exc:
554+
raise MetadataCallbackError(str(exc))
555+
else:
556+
raise TypeError("exclude_metadata must be bool or callable")
540557
original_metadata = (
541558
{"ome_metadata": pruned_metadata} if pruned_metadata else {}
542559
)

tiledb/bioimg/helpers.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -504,13 +504,27 @@ def remove_ome_image_metadata(xml_string: str) -> Union[str, Any]:
504504

505505
# Find all images
506506
images = root.findall("ome:Image", ns)
507+
structured_annotations = root.findall("ome:StructuredAnnotations", ns)
507508

508509
# Iterate over images and remove those with Name 'macro' or 'label'
509510
for image in images:
510511
name = image.attrib.get("Name")
511512
if name in ["macro", "label"]:
512513
root.remove(image)
513514

515+
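# Iterate over structured annotations and remove comment annotations whose Description contains "barcode" or "label"; drop the StructuredAnnotations element itself when no comment annotations are left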
516+
for sa in structured_annotations:
517+
comment_annotations = sa.findall("ome:CommentAnnotation", ns)
518+
comments_left = len(comment_annotations)
519+
for comment_annot in sa.findall("ome:CommentAnnotation", ns):
520+
desc = comment_annot.find("ome:Description", ns)
521+
if desc is not None:
522+
if "barcode" in str(desc.text) or "label" in str(desc.text):
523+
sa.remove(comment_annot)
524+
comments_left -= 1
525+
if comments_left == 0:
526+
root.remove(sa)
527+
514528
# Return the modified XML as a string
515529
# Regular expression pattern to match 'ns0', 'ns0:', or ':ns0'
516530
pattern = r"ns0:|:ns0|ns0"
@@ -524,3 +538,16 @@ def remove_ome_image_metadata(xml_string: str) -> Union[str, Any]:
524538
encoding="unicode",
525539
),
526540
)
541+
542+
543+
class MetadataCallbackError(Exception):
    """Error raised when an OME-XML metadata callback fails.

    The text passed in is appended to a fixed prefix so that every instance
    reads ``OME XML callback function failed <details>``.

    :param message: Optional details about the failure (typically the text of
        the exception raised inside the callback). Defaults to an empty string.
    """

    def __init__(self, message: str = ""):
        prefix = "OME XML callback function failed"
        # Only append the details when there are any; otherwise the message
        # would end with a dangling space.
        self.message = f"{prefix} {message}" if message else prefix
        super().__init__(self.message)

    def __str__(self) -> str:
        # The class name is kept in the rendered text for backward
        # compatibility with existing log/test expectations.
        return f"MetadataCallbackError: {self.message}"

tiledb/bioimg/wrappers.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from __future__ import annotations
22

33
import importlib.util
4-
from typing import Any, Optional
4+
from typing import Any, Callable, Optional, Union
55

66
try:
77
importlib.util.find_spec("tifffile")
@@ -30,7 +30,7 @@ def from_bioimg(
3030
converter: Converters = Converters.OMETIFF,
3131
*,
3232
verbose: bool = False,
33-
exclude_metadata: bool = False,
33+
exclude_metadata: Union[bool, Callable[[str], str], None] = None,
3434
tile_scale: int = 1,
3535
**kwargs: Any,
3636
) -> Any:
@@ -41,6 +41,14 @@ def from_bioimg(
4141
:param dest: The destination path where the TileDB image will be stored
4242
:param converter: The converter type to be used (tentative) soon automatically detected
4343
:param verbose: verbose logging, defaults to False
44+
:param exclude_metadata: An optional argument that specifies how to transform the original metadata.
45+
It can be one of the following:
46+
* A callable (function, method, etc.) that takes the original OME-XML string and returns the transformed
47+
OME-XML string to store in its place, e.g. with some of the original metadata removed.
48+
* A boolean value:
49+
* ``True``: Indicates a specific built-in transformation should be applied. see: `remove_ome_image_metadata`
50+
* ``False``: Indicates no transformation should be applied.
51+
* ``None``: Indicates no transformation should be applied (same as ``False``).
4452
:param kwargs: keyword arguments for custom ingestion behaviour
4553
:return: The converter class that was used for the ingestion
4654
"""

0 commit comments

Comments
 (0)