Add --detect_header and DatasetColumn support #67

Merged: 12 commits, Apr 5, 2022
3 changes: 2 additions & 1 deletion setup.py
@@ -127,7 +127,8 @@ def read(fname):
'future',
'omero-py>=5.6.0',
'PyYAML',
'jinja2'
'jinja2',
'pandas'
sbesson marked this conversation as resolved.
],
python_requires='>=3',
tests_require=[
Expand Down
51 changes: 50 additions & 1 deletion src/omero_metadata/cli.py
@@ -32,6 +32,8 @@
from omero.grid import LongColumn
from omero.model.enums import UnitsLength

import pandas as pd

HELP = """Metadata utilities

Provides access to and editing of the metadata which
@@ -242,6 +244,9 @@ def _configure(self, parser):
populate.add_argument("--allow_nan", action="store_true", help=(
"Allow empty values to become Nan in Long or Double columns"))

populate.add_argument("--detect_header", action="store_true", help=(
Member:
Echoing my enthusiastic comment in https://github.com/ome/omero-metadata/pull/67/files#r816998882, assuming the functionality works as expected, is there any reason not to make this behavior the default and allow to disable the auto-detection via CLI instead?

In terms of the legacy behavior, we might also need to clarify the expectation for users when both the header auto-detection and the # headers row are present. Based on my reading of the code, the header auto-detection should set column_types and take precedence over the content of the CSV, is that correct?

"Automatically detect header row to populate"))

populateroi.add_argument(
"--measurement", type=int, default=None,
help="Index of the measurement to populate. By default, all")
@@ -483,6 +488,45 @@ def testtables(self, args):
if not initialized:
self.ctx.die(100, "Failed to initialize Table")

def detect_headers(self, csv_path):
'''
Automatically detect headers from a CSV file. This function loads
the table into pandas, detects each column's type and matches it to
a header type.
'''

conserved_headers = ['well', 'plate', 'image', 'dataset', 'roi']
headers = []
table = pd.read_csv(csv_path)
col_types = table.dtypes.values.tolist()
cols = list(table.columns)

for index, col_type in enumerate(col_types):
col = cols[index]
if col.lower() in conserved_headers:
headers.append(col.lower())
elif col.lower() == 'image name' or col.lower() == 'imagename' or \
col.lower() == 'image_name':
headers.append('image')
elif col.lower() == 'dataset name' or \
col.lower() == 'datasetname' or \
col.lower() == 'dataset_name':
headers.append('dataset')
elif col.lower() == 'plate name' or col.lower() == 'platename' or \
col.lower() == 'plate_name':
headers.append('plate')
elif col.lower() == 'well name' or col.lower() == 'wellname' or \
col.lower() == 'well_name':
headers.append('well')
elif col_type.name == 'object':
headers.append('s')
elif col_type.name == 'float64':
headers.append('d')
elif col_type.name == 'int64':
headers.append('l')
elif col_type.name == 'bool':
headers.append('b')
return headers
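As an illustration of what this detection produces, here is a standalone sketch using only pandas. The CSV content is hypothetical and the dtype-to-type mapping mirrors (in simplified form) the function above:

```python
import io
import pandas as pd

# Hypothetical CSV: one name column, one float, one int, one bool, one string.
csv_text = """Image Name,Area,Count,Valid,Comment
img1.tif,1.5,3,True,ok
img2.tif,2.0,5,False,bad
"""

table = pd.read_csv(io.StringIO(csv_text))
dtype_map = {'object': 's', 'float64': 'd', 'int64': 'l', 'bool': 'b'}
headers = []
for col, dtype in zip(table.columns, table.dtypes):
    name = col.lower().replace(' ', '_')
    if name == 'image_name':
        headers.append('image')
    else:
        headers.append(dtype_map[dtype.name])

print(headers)  # ['image', 'd', 'l', 'b', 's']
```

Pandas infers `float64`, `int64`, `bool`, and `object` from the cell values, which is what lets the detection distinguish `d`, `l`, `b`, and `s` columns without a `# header` row.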

# WRITE

def populate(self, args):
@@ -521,6 +565,11 @@ def populate(self, args):
cfgid = cfgann.getFile().getId()
md.linkAnnotation(cfgann)

header_type = None
if args.detect_header:
header_type = self.detect_headers(args.file)
if args.dry_run:
omero_metadata.populate.log.info(f"Header Types:{header_type}")
loops = 0
ms = 0
wait = args.wait
@@ -533,7 +582,7 @@
cfg=args.cfg, cfgid=cfgid, attach=args.attach,
options=localcfg, batch_size=args.batch,
loops=loops, ms=ms, dry_run=args.dry_run,
allow_nan=args.allow_nan)
allow_nan=args.allow_nan, column_types=header_type)
Member:
Similarly to #59, adding a new key-value argument will likely break other contexts e.g. BulkToMapAnnotationContext.
#60 is one way to fix this issue but as indicated in the PR, it would be good to find a more scalable strategy

Member Author:

I totally missed this; I was under the impression that column_types was already an existing keyword argument in the populate option, and that I was just passing in a non-None value instead of the default None.

But currently, omero metadata populate Screen:51 -n --file example.csv --detect_header --context bulkmap does fail with a similar exception. I will discuss with @emilroz to see if we could brainstorm a more scalable solution.

Member:

No worries. The absence of integration tests for the other contexts makes this use case hard to detect.
As mentioned in #59 (comment), using **kwargs might be an option to handle undefined keyword arguments in a lenient manner.
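The lenient **kwargs pattern suggested here can be sketched as follows. The class and argument names are illustrative stand-ins, not the actual omero_metadata context classes:

```python
# Sketch: each context accepts **kwargs and ignores keywords it does not
# use, so passing column_types no longer breaks unrelated contexts.

class ParsingContext:
    def __init__(self, client, target, column_types=None, **kwargs):
        self.column_types = column_types  # consumed by this context

class BulkToMapAnnotationContext:
    def __init__(self, client, target, **kwargs):
        # column_types and any other unknown keywords are ignored here
        self.client = client

# Both calls succeed even though only one context uses column_types:
ctx1 = ParsingContext(None, None, column_types=['image', 'd'])
ctx2 = BulkToMapAnnotationContext(None, None, column_types=['image', 'd'])
print(ctx1.column_types)  # ['image', 'd']
```

The trade-off is that misspelled keywords are silently swallowed instead of raising a TypeError, which is why cleaning up unused arguments later is worthwhile.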

Member Author (@muhanadz, Mar 1, 2022):

Addressed in 9b158f5
@sbesson Now that **kwargs is used for DeleteMapAnnotationContext and BulkToMapAnnotationContext, there are some keyword arguments that are not used and could be cleaned up. Do you think it's appropriate to clean up here or later in a different PR?

Member:

If that's okay with you I think cleaning up unused arguments in a follow-up PR would likely simplify the review process

ctx.parse()

def rois(self, args):
11 changes: 6 additions & 5 deletions src/omero_metadata/populate.py
@@ -416,8 +416,6 @@ def resolve(self, column, value, row):
)
break
elif column.name.lower() == "dataset name":
# DatasetColumn unimplemented at the moment
# We can still access column names though
images_by_id = self.wrapper.images_by_id[
self.wrapper.datasets_by_name[column_value].id.val
]
@@ -427,8 +425,6 @@
)
break
elif column.name.lower() == "dataset":
# DatasetColumn unimplemented at the moment
# We can still access column names though
images_by_id = self.wrapper.images_by_id[
self.wrapper.datasets_by_id[
int(column_value)].id.val
@@ -825,7 +821,10 @@ def get_image_name_by_id(self, iid, did=None):

def resolve_dataset(self, column, row, value):
try:
return self.datasets_by_name[value].id.val
if column.name.lower() == 'dataset':
Member:

I assume these improvements build on top of the DatasetColumn implementation in ome/omero-py#309. Is the requirement for the new API only client-side or will a server need to be upgraded with a version of omero-py containing these changes to use this new functionality?

Member Author:

Correct. During testing, I needed to upgrade my server's omero-py with the new DatasetColumn implementation to use the new --detect_header functionality. Otherwise, I'd get an exception about DatasetColumn not being implemented when using a dataset ID column with 'dataset' as a header type.

Member:

Do you remember which call led to the server exception and the exception type thrown by the server? As a general rule, we don't want this plugin to make particular assumptions about the server. It should be possible to handle both scenarios and fall back to the former logic if DatasetColumn is not implemented server-side. Catching the appropriate exception with a try/except block is the immediate solution that comes to mind, unless @joshmoore can think of an alternate way to introspect the server-side OMERO.py version.

Member:

Other versions are inspectable from the config service, but we'd need to teach the Java process how to inspect the Python process to get that info.

Member:

Makes sense. In terms of functional review, I think this means we should probably test this tool against two types of servers: with and without upgraded OMERO.py. The population should succeed in both cases but only the server with an upgraded OMERO.py should populate a DatasetColumn.
@muhanadz can you add some minimal handling to distinguish between upgraded and non-upgraded servers?

return self.datasets_by_id[int(value)].id.val
else:
return self.datasets_by_name[value].id.val
except KeyError:
log.warn('Project is missing dataset: %s' % value)
return Skip()
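The try/except fallback discussed in the thread above could be sketched like this. ServerError and both helper names are hypothetical stand-ins; the real code would catch whatever exception an older server actually raises for an unimplemented DatasetColumn:

```python
class ServerError(Exception):
    """Stand-in for the exception an older server raises."""

def make_dataset_column(server_has_dataset_column):
    # Stand-in for creating a DatasetColumn via the server.
    if not server_has_dataset_column:
        raise ServerError("DatasetColumn not implemented")
    return "DatasetColumn"

def choose_column(server_has_dataset_column):
    try:
        return make_dataset_column(server_has_dataset_column)
    except ServerError:
        # Fall back to the legacy representation: a LongColumn of IDs.
        return "LongColumn"

print(choose_column(True))   # DatasetColumn
print(choose_column(False))  # LongColumn
```

This keeps population working against both upgraded and non-upgraded servers; only the upgraded one actually gets a DatasetColumn.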
@@ -1160,6 +1159,8 @@ def preprocess_data(self, reader):
column.values.append(value)
elif column.name.lower() == "plate":
column.values.append(value)
elif column.name.lower() == "dataset":
column.values.append(value)
except TypeError:
log.error('Original value "%s" now "%s" of bad type!' % (
original_value, value))