Merge pull request #67 from muhanadz/detect_header_datasetColumn
Add --detect_header and DatasetColumn support
sbesson authored Apr 5, 2022
2 parents 2c1b269 + 99383af commit 07a33f3
Showing 3 changed files with 79 additions and 9 deletions.
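The heart of the change-set is a mapping from pandas dtypes to OMERO.tables column types: object columns become strings ('s'), float64 doubles ('d'), int64 longs ('l') and bool booleans ('b'), while conserved names such as 'well', 'plate', 'image', 'dataset' and 'roi' map to dedicated column classes. A minimal sketch of the dtype side of that mapping, independent of OMERO (column names and values here are illustrative only):

import io

import pandas as pd

csv = io.StringIO("Well,Gene,Score,Count\nA1,kif11,0.5,3\nA2,rab5,1.2,7")
table = pd.read_csv(csv)
# Well/Gene are object, Score is float64, Count is int64
mapping = {'object': 's', 'float64': 'd', 'int64': 'l', 'bool': 'b'}
print([mapping[t.name] for t in table.dtypes])  # ['s', 's', 'd', 'l']

(In the real detect_headers below, 'Well' would additionally be recognised as a conserved header and emitted as 'well' rather than 's'.)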
3 changes: 2 additions & 1 deletion setup.py
@@ -127,7 +127,8 @@ def read(fname):
         'future',
         'omero-py>=5.6.0',
         'PyYAML',
-        'jinja2'
+        'jinja2',
+        'pandas'
     ],
     python_requires='>=3',
     tests_require=[
63 changes: 62 additions & 1 deletion src/omero_metadata/cli.py
@@ -32,6 +32,8 @@
 from omero.grid import LongColumn
 from omero.model.enums import UnitsLength
 
+import pandas as pd
+
 HELP = """Metadata utilities
 Provides access to and editing of the metadata which
@@ -242,6 +244,9 @@ def _configure(self, parser):
         populate.add_argument("--allow_nan", action="store_true", help=(
             "Allow empty values to become Nan in Long or Double columns"))
 
+        populate.add_argument("--manual_header", action="store_true", help=(
+            "Disable automatic header detection during population"))
+
         populateroi.add_argument(
             "--measurement", type=int, default=None,
             help="Index of the measurement to populate. By default, all")
@@ -483,6 +488,49 @@ def testtables(self, args):
         if not initialized:
             self.ctx.die(100, "Failed to initialize Table")
 
+    @staticmethod
+    def detect_headers(csv_path):
+        '''
+        Automatically detect headers from a CSV file by loading the
+        table into pandas and matching column names and dtypes to headers.
+        '''
+
+        conserved_headers = ['well', 'plate', 'image', 'dataset', 'roi']
+        headers = []
+        table = pd.read_csv(csv_path)
+        col_types = table.dtypes.values.tolist()
+        cols = list(table.columns)
+
+        for index, col_type in enumerate(col_types):
+            col = cols[index]
+            if col.lower() in conserved_headers:
+                headers.append(col.lower())
+            elif col.lower() == 'image id' or col.lower() == 'imageid' or \
+                    col.lower() == 'image_id':
+                headers.append('image')
+            elif col.lower() == 'roi id' or col.lower() == 'roiid' or \
+                    col.lower() == 'roi_id':
+                headers.append('roi')
+            elif col.lower() == 'dataset id' or \
+                    col.lower() == 'datasetid' or \
+                    col.lower() == 'dataset_id':
+                headers.append('dataset')
+            elif col.lower() == 'plate name' or col.lower() == 'platename' or \
+                    col.lower() == 'plate_name':
+                headers.append('plate')
+            elif col.lower() == 'well name' or col.lower() == 'wellname' or \
+                    col.lower() == 'well_name':
+                headers.append('well')
+            elif col_type.name == 'object':
+                headers.append('s')
+            elif col_type.name == 'float64':
+                headers.append('d')
+            elif col_type.name == 'int64':
+                headers.append('l')
+            elif col_type.name == 'bool':
+                headers.append('b')
+        return headers
+
     # WRITE
 
     def populate(self, args):
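A usage sketch for the new static method (assuming this branch of omero-metadata and its dependencies are installed; the CSV content is illustrative): conserved and ID-style column names take precedence, and only the remaining columns fall back to dtype letters.

from omero_metadata.cli import MetadataControl

with open('example.csv', 'w') as f:
    f.write('Well,Well Name,Gene,Score,Count\n')
    f.write('1,A1,kif11,0.5,3\n')

print(MetadataControl.detect_headers('example.csv'))
# -> ['well', 'well', 's', 'd', 'l']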
@@ -521,6 +569,19 @@ def populate(self, args):
             cfgid = cfgann.getFile().getId()
             md.linkAnnotation(cfgann)
 
+        header_type = None
+        # Automatically detect the header types by default, unless
+        # instructed not to (--manual_header) or the first row already
+        # contains a `# header` line
+        first_row = pd.read_csv(args.file, nrows=1, header=None)
+        if not args.manual_header and \
+                not first_row[0].str.contains('# header').bool():
+            omero_metadata.populate.log.info("Detecting header types")
+            header_type = MetadataControl.detect_headers(args.file)
+            if args.dry_run:
+                omero_metadata.populate.log.info(f"Header Types:{header_type}")
+        else:
+            omero_metadata.populate.log.info("Using user defined header types")
         loops = 0
         ms = 0
         wait = args.wait
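This guard keeps backwards compatibility with CSV files that already declare their types in an explicit `# header` first row, and detection can also be disabled per run, e.g. `omero metadata populate Plate:1 --file data.csv --manual_header`. A small sketch of the first-row check itself (file content illustrative):

import io

import pandas as pd

explicit = io.StringIO('# header well,s,d\nWell,Gene,Score\nA1,kif11,0.5\n')
first_row = pd.read_csv(explicit, nrows=1, header=None)
# Mirrors the check above; Series.bool() expects a single-element Series
# (it is deprecated in newer pandas, where .iloc[0] is equivalent)
print(first_row[0].str.contains('# header').bool())  # True -> detection skipped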
@@ -533,7 +594,7 @@ def populate(self, args):
             cfg=args.cfg, cfgid=cfgid, attach=args.attach,
             options=localcfg, batch_size=args.batch,
             loops=loops, ms=ms, dry_run=args.dry_run,
-            allow_nan=args.allow_nan)
+            allow_nan=args.allow_nan, column_types=header_type)
         ctx.parse()
 
     def rois(self, args):
22 changes: 15 additions & 7 deletions src/omero_metadata/populate.py
@@ -314,6 +314,13 @@ def _create_columns(self, klass):
                     self.DEFAULT_COLUMN_SIZE, list()))
                 # Ensure RoiColumn is named 'Roi'
                 column.name = "Roi"
+            if column.__class__ is DatasetColumn:
+                # Appending a 'Dataset Name' column would break the code:
+                # there is currently no method to populate it
+                # append.append(StringColumn(DATASET_NAME_COLUMN, '',
+                #     self.DEFAULT_COLUMN_SIZE, list()))
+                # Ensure DatasetColumn is named 'Dataset'
+                column.name = "Dataset"
             # If image/roi name, then add ID column"
             if column.name == IMAGE_NAME_COLUMN:
                 append.append(ImageColumn("Image", '', list()))
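DatasetColumn is one of the typed OMERO.tables columns in omero.grid, alongside ImageColumn, RoiColumn, WellColumn and PlateColumn; the renaming above ensures the resulting table column is always called 'Dataset' whatever the CSV header spelling. A minimal construction sketch (assuming omero-py is installed; the IDs are illustrative):

from omero.grid import DatasetColumn

# name, description, values (dataset IDs)
col = DatasetColumn('Dataset', '', [101, 102])
print(col.name, col.values)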
@@ -417,8 +424,6 @@ def resolve(self, column, value, row):
                     )
                     break
                 elif column.name.lower() == "dataset name":
-                    # DatasetColumn unimplemented at the moment
-                    # We can still access column names though
                     images_by_id = self.wrapper.images_by_id[
                         self.wrapper.datasets_by_name[column_value].id.val
                     ]
@@ -428,8 +433,6 @@ def resolve(self, column, value, row):
                     )
                     break
                 elif column.name.lower() == "dataset":
-                    # DatasetColumn unimplemented at the moment
-                    # We can still access column names though
                     images_by_id = self.wrapper.images_by_id[
                         self.wrapper.datasets_by_id[
                             int(column_value)].id.val
@@ -892,7 +895,10 @@ def get_image_name_by_id(self, iid, did=None):
 
     def resolve_dataset(self, column, row, value):
         try:
-            return self.datasets_by_name[value].id.val
+            if column.name.lower() == 'dataset':
+                return self.datasets_by_id[int(value)].id.val
+            else:
+                return self.datasets_by_name[value].id.val
         except KeyError:
             log.warn('Project is missing dataset: %s' % value)
             return Skip()
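The new branch mirrors the two accepted spellings: a 'Dataset' column carries numeric IDs and resolves through datasets_by_id, while 'Dataset Name' keeps resolving through datasets_by_name, and both return the dataset ID. A plain-dict sketch of the lookup logic (the wrapper's tables are stubbed with illustrative values):

# illustrative stand-ins for the wrapper's lookup tables
datasets_by_id = {101: {'id': 101, 'name': 'first-dataset'}}
datasets_by_name = {'first-dataset': {'id': 101, 'name': 'first-dataset'}}

def resolve_dataset(column_name, value):
    # both branches return the dataset ID, as in the method above
    if column_name.lower() == 'dataset':
        return datasets_by_id[int(value)]['id']  # CSV cell like '101'
    return datasets_by_name[value]['id']         # CSV cell like 'first-dataset'

print(resolve_dataset('Dataset', '101'))                 # 101
print(resolve_dataset('Dataset Name', 'first-dataset'))  # 101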
@@ -1243,6 +1249,8 @@ def preprocess_data(self, reader):
                             column.values.append(value)
                         elif column.name.lower() == "plate":
                             column.values.append(value)
+                        elif column.name.lower() == "dataset":
+                            column.values.append(value)
                     except TypeError:
                         log.error('Original value "%s" now "%s" of bad type!' % (
                             original_value, value))
@@ -1635,7 +1643,7 @@ class BulkToMapAnnotationContext(_QueryContext):
     def __init__(self, client, target_object, file=None, fileid=None,
                  cfg=None, cfgid=None, attach=False, options=None,
                  batch_size=1000, loops=10, ms=10, dry_run=False,
-                 allow_nan=False):
+                 allow_nan=False, **kwargs):
         """
         :param client: OMERO client object
         :param target_object: The object to be annotated
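Accepting **kwargs here lets the CLI pass column_types=header_type to whichever context class it instantiates: ParsingContext consumes the argument, while the two map-annotation contexts simply swallow and ignore it. A generic sketch of the pattern (class names are illustrative):

class ParsingLike:
    def __init__(self, column_types=None, **kwargs):
        self.column_types = column_types  # consumed

class MapAnnotationLike:
    def __init__(self, **kwargs):
        pass  # column_types silently ignored

for cls in (ParsingLike, MapAnnotationLike):
    cls(column_types=['s', 'd'])  # one call site serves both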
@@ -1968,7 +1976,7 @@ class DeleteMapAnnotationContext(_QueryContext):
     def __init__(self, client, target_object, file=None, fileid=None,
                  cfg=None, cfgid=None, attach=False, options=None,
                  batch_size=1000, loops=10, ms=500, dry_run=False,
-                 allow_nan=False):
+                 allow_nan=False, **kwargs):
 
         """
         :param client: OMERO client object
