ome · joshmoore · Mar 2, 2022 · Mar 3, 2022 · Mar 3, 2022 · Mar 3, 2022
diff --git a/.github/workflows/pre.yml b/.github/workflows/pre.yml
@@ -41,6 +41,7 @@ jobs:
         run: |
           python -m pip install -U pip setuptools wheel pytest
           python -m pip install -r requirements/requirements-dev.txt
+          python -m pip install .  # Without -e for plugins
           tox -e ${{ matrix.toxenv }} --pre
 
       # If something goes wrong, we can open an issue in the repo

diff --git a/.isort.cfg b/.isort.cfg
@@ -1,5 +1,5 @@
 [settings]
-known_third_party = dask,numcodecs,numpy,pytest,scipy,setuptools,skimage,zarr
+known_third_party = dask,entrypoints,numcodecs,numpy,ome_types,pytest,scipy,setuptools,skimage,zarr
 multi_line_output = 3
 include_trailing_comma = True
 force_grid_wrap = 0

diff --git a/docs/source/api.rst b/docs/source/api.rst
@@ -5,6 +5,7 @@ API
     :maxdepth: 3
 
     api/cli
+    api/bioformats2raw
     api/csv
     api/data
     api/format

diff --git a/docs/source/api/bioformats2raw.rst b/docs/source/api/bioformats2raw.rst
@@ -0,0 +1,5 @@
+Bioformats2raw (``ome_zarr.bioformats2raw``)
+============================================
+
+.. automodule:: ome_zarr.bioformats2raw
+   :members:
diff --git a/docs/source/python.rst b/docs/source/python.rst
@@ -109,3 +109,13 @@ the data is available as `dask` arrays::
     viewer = napari.view_image(dask_data, channel_axis=0)
     if __name__ == '__main__':
         napari.run()
+
+Reading bioformats2raw filesets
+-------------------------------
+
+The output from bioformats2raw encapsulates *multiple* OME-NGFF images.
+This structure has been added to the OME-NGFF specification as transitional
+`"bioformats2raw.layout" <https://ngff.openmicroscopy.org/0.4/index.html#bf2raw>`_
+metadata. To read such filesets:
+
+TBD
diff --git a/ome_zarr/bioformats2raw.py b/ome_zarr/bioformats2raw.py
@@ -0,0 +1,133 @@
+"""Spec extension for reading bioformats2raw.layout.
+
+This specification detects and reads filesets which were created by
+bioformats2raw and therefore can have multiple multiscale image groups
+present. Each such image will be returned by the [ome_zarr.reader.Reader]
+as a separate [ome_zarr.reader.Node], but metadata which has been parsed
+from the OME-XML metadata associated with the specific image will be
+attached.
+
+TBD: Example
+"""
+
+import logging
+import os
+import re
+import tempfile
+from typing import Optional
+from xml.etree import ElementTree as ET
+
+import ome_types
+
+from ome_zarr.io import ZarrLocation
+from ome_zarr.reader import Node
+from ome_zarr.reader import Spec as Base
+
+__author__ = "Open Microscopy Environment (OME)"
+__copyright__ = "Open Microscopy Environment (OME)"
+__license__ = "BSD-2-Clause"
+
+_logger = logging.getLogger(__name__)
+
+
+class bioformats2raw(Base):
+    """A spec-type for reading multi-image filesets OME-XML
+    metadata.
+    """
+
+    @staticmethod
+    def matches(zarr: ZarrLocation) -> bool:
+        """Pass if the metadata for the zgroup contains
+        `{"bioformats2raw.layout": 3}`"""
+        layout = zarr.root_attrs.get("bioformats2raw.layout", None)
+        _logger.error(f"layout={layout == 3} zarr={zarr}")
+        return layout == 3
+
+    def __init__(self, node: Node) -> None:
+        """Load metadata from the three sources associated with this
+        specification: the OME zgroup metadata, the OME-XML file, and
+        the images zgroups themselves.
+        """
+        super().__init__(node)
+        try:
+            # Load OME/METADATA.ome.xml
+            data = self._handle(node)
+            if data.plates:
+                _logger.info("Plates detected. Skipping implicit loading")
+            else:
+                # Load the OME/ zgroup metadata
+                ome = node.zarr.create("OME")
+                if ome.exists:
+                    series_metadata = ome.zarr.root_attrs.get("series", None)
+                    if series_metadata is not None:
+                        node.metadata["series"] = series_metadata
+
+                # Load each individual image
+                for idx, image in enumerate(data.images):
+                    series = node.zarr.create(str(idx))
+                    assert series.exists(), f"{series} is missing"
+                    _logger.info("found %s", series)
+                    subnode = node.add(series)
+                    if subnode:
+                        subnode.metadata["ome-xml:index"] = idx
+                        subnode.metadata["ome-xml:image"] = image
+
+            node.metadata["ome-xml"] = data
+
+        except Exception:
+            _logger.exception("failed to parse metadata")
+
+    def _fix_xml(self, ns: str, elem: ET.Element) -> None:
+        """Correct invalid OME-XML.
+
+        Some versions of bioformats2raw did not include a MetadataOnly
+        tag.
+
+        Note: elem.insert() was not updating the object correctly.
+        """
+
+        if elem.tag == f"{ns}Pixels":
+            must_have = {f"{ns}BinData", f"{ns}TiffData", f"{ns}MetadataOnly"}
+            children = {x.tag for x in elem}
+
+            if not any(x in children for x in must_have):
+                # Needs fixing
+                metadata_only = ET.Element(f"{ns}MetadataOnly")
+
+                last_channel = -1
+                for idx, child in enumerate(elem):
+                    if child.tag == f"{ns}Channel":
+                        last_channel = idx
+                elem.insert(last_channel + 1, metadata_only)
+
+        elif elem.tag == f"{ns}Plane":
+            remove = None
+            for idx, child in enumerate(elem):
+                if child.tag == f"{ns}HashSHA1":
+                    remove = child
+            if remove:
+                elem.remove(remove)
+
+    def _parse_xml(self, filename: str) -> ome_types.model.OME:
+        """Generate [ome_types.model.OME] from OME-XML"""
+        # Parse the file and find the current schema
+        root = ET.parse(filename)
+        m = re.match(r"\{.*\}", root.getroot().tag)
+        ns = m.group(0) if m else ""
+
+        # Update the XML to include MetadataOnly
+        for child in list(root.iter()):
+            self._fix_xml(ns, child)
+        fixed = ET.tostring(root.getroot()).decode()
+
+        # Write file out for ome_types
+        with tempfile.NamedTemporaryFile() as t:
+            t.write(fixed.encode())
+            t.flush()
+            return ome_types.from_xml(t.name)
+
+    def _handle(self, node: Node) -> ome_types.model.OME:
+        """Main parsing method which looks for OME/METADATA.ome.xml"""
+        metadata = node.zarr.subpath("OME/METADATA.ome.xml")
+        _logger.info("Looking for metadata in %s", metadata)
+        if os.path.exists(metadata):
+            return self._parse_xml(metadata)
diff --git a/ome_zarr/reader.py b/ome_zarr/reader.py
@@ -1,12 +1,21 @@
-"""Reading logic for ome-zarr."""
+"""Reading logic for ome-zarr.
+
+The main class (Reader) is initialized with an [ome_zarr.io.ZarrLocation]
+as returned by [ome_zarr.io.parse_url] and walks up and down the Zarr
+hierarchy parsing each array or group into a [Node] which is aware of all
+meta(data) specifications ([Spec] class) which are available in the current
+runtime.
+"""
 
 import logging
 import math
 from abc import ABC
 from typing import Any, Dict, Iterator, List, Optional, Type, Union, cast, overload
 
 import dask.array as da
+import entrypoints
 import numpy as np
+import zarr
 from dask import delayed
 
 from .axes import Axes
@@ -45,21 +54,49 @@ def __init__(
         self.post_nodes: List[Node] = []
 
         # TODO: this should be some form of plugin infra over subclasses
+        found: List[Spec] = []
         if Labels.matches(zarr):
-            self.specs.append(Labels(self))
+            found.append(Labels(self))
+            self.specs.append(found[-1])
         if Label.matches(zarr):
-            self.specs.append(Label(self))
+            found.append(Label(self))
+            self.specs.append(found[-1])
         if Multiscales.matches(zarr):
-            self.specs.append(Multiscales(self))
+            found.append(Multiscales(self))
+            self.specs.append(found[-1])
         if OMERO.matches(zarr):
-            self.specs.append(OMERO(self))
+            found.append(OMERO(self))
+            self.specs.append(found[-1])
         if plate_labels:
-            self.specs.append(PlateLabels(self))
+            found.append(PlateLabels(self))
+            self.specs.append(found[-1])
         elif Plate.matches(zarr):
-            self.specs.append(Plate(self))
+            found.append(Plate(self))
+            self.specs.append(found[-1])
             # self.add(zarr, plate_labels=True)
         if Well.matches(zarr):
-            self.specs.append(Well(self))
+            found.append(Well(self))
+            self.specs.append(found[-1])
+
+        # Load all entrypoints and give them a chance
+        # to claim and parse the current node.
+        for key, value in entrypoints.get_group_named("ome_zarr.spec").items():
+            cls = value.load()
+            if cls.matches(zarr):
+                found.append(cls(self))
+                self.specs.append(found[-1])
+
+        # Anything that has not received a type at this point
+        # can be considered an implicit group.
+        if not found:
+            self.specs.append(Implicit(self))
+
+        if False:  # Temporarily disable. See #174
+            # Load up the hierarchy
+            if Leaf.matches(zarr):
+                self.specs.append(Leaf(self))
+            else:
+                self.specs.append(Root(self))
 
     @overload
     def first(self, spectype: Type["Well"]) -> Optional["Well"]:
@@ -178,6 +215,60 @@ def lookup(self, key: str, default: Any) -> Any:
         return self.zarr.root_attrs.get(key, default)
 
 
+class Implicit(Spec):
+    """
+    A spec-type which simply iterates over available zgroups.
+    """
+
+    @staticmethod
+    def matches(zarr: ZarrLocation) -> bool:
+        """Always return true"""
+        return True
+
+    def __init__(self, node: Node) -> None:
+        super().__init__(node)
+
+        for name in zarr.group(self.zarr.store).group_keys():
+            child_zarr = self.zarr.create(name)
+            if child_zarr.exists():
+                node.add(child_zarr)
+
+
+class Leaf(Spec):
+    """
+    A non-root level of the Zarr hierarchy
+    """
+
+    @staticmethod
+    def matches(zarr: ZarrLocation) -> bool:
+        """Return if the parent directory is within the zarr fileset"""
+
+        parent_zarr = zarr.create("..")
+        return bool(parent_zarr.exists() and (parent_zarr.zgroup or parent_zarr.zarray))
+
+    def __init__(self, node: Node) -> None:
+        super().__init__(node)
+        parent_zarr = node.zarr.create("..")
+        if parent_zarr.exists() and (parent_zarr.zgroup or parent_zarr.zarray):
+            node.add(parent_zarr)
+
+
+class Root(Spec):
+    """
+    Root of the Zarr fileset
+    """
+
+    @staticmethod
+    def matches(zarr: ZarrLocation) -> bool:
+        """Return if the parent directory is not within the zarr fileset"""
+
+        parent_zarr = zarr.create("..")
+        return parent_zarr.exists() and not (parent_zarr.zgroup or parent_zarr.zarray)
+
+    def __init__(self, node: Node) -> None:
+        super().__init__(node)
+
+
 class Labels(Spec):
     """Relatively small specification for the well-known "labels" group which only
     contains the name of subgroups which should be loaded as labeled images."""

diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt
@@ -1,6 +1,8 @@
 black
 cython >= 0.29.16
 numpy >= 1.16.0
+entrypoints
+ome-types
 pre-commit
 tox
 wheel

diff --git a/setup.py b/setup.py
@@ -24,6 +24,8 @@ def read(fname):
 install_requires += (["requests"],)
 install_requires += (["scikit-image"],)
 install_requires += (["toolz"],)
+install_requires += (["entrypoints"],)
+install_requires += (["ome-types"],)
 
 
 setup(
@@ -49,6 +51,7 @@ def read(fname):
     ],
     entry_points={
         "console_scripts": ["ome_zarr = ome_zarr.cli:main"],
+        "ome_zarr.spec": ["bioformats2raw = ome_zarr.bioformats2raw:bioformats2raw"],
     },
     tests_require=["pytest"],
 )
diff --git a/tests/data/bf2raw/fake-series-2.zarr/.zattrs b/tests/data/bf2raw/fake-series-2.zarr/.zattrs
@@ -0,0 +1,3 @@
+{
+  "bioformats2raw.layout" : 3
+}
diff --git a/tests/data/bf2raw/fake-series-2.zarr/.zgroup b/tests/data/bf2raw/fake-series-2.zarr/.zgroup
@@ -0,0 +1,3 @@
+{
+  "zarr_format" : 2
+}
diff --git a/tests/data/bf2raw/fake-series-2.zarr/0/.zattrs b/tests/data/bf2raw/fake-series-2.zarr/0/.zattrs
@@ -0,0 +1,38 @@
+{
+  "multiscales" : [ {
+    "metadata" : {
+      "method" : "loci.common.image.SimpleImageScaler",
+      "version" : "Bio-Formats 6.9.1"
+    },
+    "axes" : [ {
+      "name" : "t",
+      "type" : "time"
+    }, {
+      "name" : "c",
+      "type" : "channel"
+    }, {
+      "name" : "z",
+      "type" : "space"
+    }, {
+      "name" : "y",
+      "type" : "space"
+    }, {
+      "name" : "x",
+      "type" : "space"
+    } ],
+    "datasets" : [ {
+      "path" : "0",
+      "coordinateTransformations" : [ {
+        "scale" : [ 1.0, 1.0, 1.0, 1.0, 1.0 ],
+        "type" : "scale"
+      } ]
+    }, {
+      "path" : "1",
+      "coordinateTransformations" : [ {
+        "scale" : [ 1.0, 1.0, 1.0, 2.0, 2.0 ],
+        "type" : "scale"
+      } ]
+    } ],
+    "version" : "0.4"
+  } ]
+}
diff --git a/tests/data/bf2raw/fake-series-2.zarr/0/.zgroup b/tests/data/bf2raw/fake-series-2.zarr/0/.zgroup
@@ -0,0 +1,3 @@
+{
+  "zarr_format" : 2
+}