From 29446de869ce18e002d9686fb2e28685e4db2b77 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Wed, 2 Feb 2022 12:10:51 +0000 Subject: [PATCH 01/12] Added support for DatasetColumn, specifically dataset ID column --- src/omero_metadata/populate.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/omero_metadata/populate.py b/src/omero_metadata/populate.py index b6b68a39..f3f1f136 100644 --- a/src/omero_metadata/populate.py +++ b/src/omero_metadata/populate.py @@ -416,8 +416,6 @@ def resolve(self, column, value, row): ) break elif column.name.lower() == "dataset name": - # DatasetColumn unimplemented at the momnet - # We can still access column names though images_by_id = self.wrapper.images_by_id[ self.wrapper.datasets_by_name[column_value].id.val ] @@ -427,8 +425,6 @@ def resolve(self, column, value, row): ) break elif column.name.lower() == "dataset": - # DatasetColumn unimplemented at the momnet - # We can still access column names though images_by_id = self.wrapper.images_by_id[ self.wrapper.datasets_by_id[ int(column_value)].id.val @@ -825,7 +821,10 @@ def get_image_name_by_id(self, iid, did=None): def resolve_dataset(self, column, row, value): try: - return self.datasets_by_name[value].id.val + if column.name.lower() == 'dataset': + return self.datasets_by_id[int(value)].id.val + else: + return self.datasets_by_name[value].id.val except KeyError: log.warn('Project is missing dataset: %s' % value) return Skip() @@ -1160,6 +1159,8 @@ def preprocess_data(self, reader): column.values.append(value) elif column.name.lower() == "plate": column.values.append(value) + elif column.name.lower() == "dataset": + column.values.append(value) except TypeError: log.error('Original value "%s" now "%s" of bad type!' 
% ( original_value, value)) From 30dd5a7b02bf13714caaca4e425dd0ef3305774d Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Wed, 2 Feb 2022 12:11:52 +0000 Subject: [PATCH 02/12] Added CLI argument and tool to automatically detect header type --- src/omero_metadata/cli.py | 51 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/src/omero_metadata/cli.py b/src/omero_metadata/cli.py index bc788371..624331e1 100755 --- a/src/omero_metadata/cli.py +++ b/src/omero_metadata/cli.py @@ -32,6 +32,8 @@ from omero.grid import LongColumn from omero.model.enums import UnitsLength +import pandas as pd + HELP = """Metadata utilities Provides access to and editing of the metadata which @@ -242,6 +244,9 @@ def _configure(self, parser): populate.add_argument("--allow_nan", action="store_true", help=( "Allow empty values to become Nan in Long or Double columns")) + populate.add_argument("--detect_header", action="store_true", help=( + "Automatically detect header row to populate")) + populateroi.add_argument( "--measurement", type=int, default=None, help="Index of the measurement to populate. By default, all") @@ -483,6 +488,44 @@ def testtables(self, args): if not initialized: self.ctx.die(100, "Failed to initialize Table") + def detect_headers(self, csv_path): + ''' + Function to automatically detect headers from a CSV file. 
This function + loads the table to pandas to detects the column type and match headers + ''' + + conserved_headers = ['well', 'plate', 'image', 'dataset', 'roi'] + headers = [] + table = pd.read_csv(csv_path) + col_types = table.dtypes.values.tolist() + cols = list(table.columns) + + for index, col_type in enumerate(col_types): + col = cols[index] + if col.lower() in conserved_headers: + headers.append(col.lower()) + elif col.lower() == 'image name' or col.lower() == 'imagename' or \ + col.lower() == 'image_name': + headers.append('image') + elif col.lower() == 'dataset name' or col.lower() == 'datasetname' or \ + col.lower() == 'dataset_name': + headers.append('dataset') + elif col.lower() == 'plate name' or col.lower() == 'platename' or \ + col.lower() == 'plate_name': + headers.append('plate') + elif col.lower() == 'well name' or col.lower() == 'wellname' or \ + col.lower() == 'well_name': + headers.append('well') + elif col_type.name == 'object': + headers.append('s') + elif col_type.name == 'float64': + headers.append('d') + elif col_type.name == 'int64': + headers.append('l') + elif col_type.name == 'bool': + headers.append('b') + return headers + # WRITE def populate(self, args): @@ -521,6 +564,12 @@ def populate(self, args): cfgid = cfgann.getFile().getId() md.linkAnnotation(cfgann) + header_type = None + if args.detect_header: + header_type = self.detect_headers(args.file) + if args.dry_run: + omero_metadata.populate.log.info(f"Header Types:{header_type}") + # add condition col_type = blarg, open arg.file, arg.detect_header loops = 0 ms = 0 wait = args.wait @@ -533,7 +582,7 @@ def populate(self, args): cfg=args.cfg, cfgid=cfgid, attach=args.attach, options=localcfg, batch_size=args.batch, loops=loops, ms=ms, dry_run=args.dry_run, - allow_nan=args.allow_nan) + allow_nan=args.allow_nan, column_types=header_type) ctx.parse() def rois(self, args): From 95d610632dbfd90b32e7c52df0c79bc502b6f840 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Wed, 2 Feb 2022 
12:24:59 +0000 Subject: [PATCH 03/12] Removed comment. --- src/omero_metadata/cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/omero_metadata/cli.py b/src/omero_metadata/cli.py index 624331e1..6937732d 100755 --- a/src/omero_metadata/cli.py +++ b/src/omero_metadata/cli.py @@ -569,7 +569,6 @@ def populate(self, args): header_type = self.detect_headers(args.file) if args.dry_run: omero_metadata.populate.log.info(f"Header Types:{header_type}") - # add condition col_type = blarg, open arg.file, arg.detect_header loops = 0 ms = 0 wait = args.wait From 3f559af3095478f0118aa63bfe6642d313f8f5e0 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Thu, 3 Feb 2022 14:59:45 +0000 Subject: [PATCH 04/12] Fix flake8 --- src/omero_metadata/cli.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/omero_metadata/cli.py b/src/omero_metadata/cli.py index 6937732d..6c67c9e8 100755 --- a/src/omero_metadata/cli.py +++ b/src/omero_metadata/cli.py @@ -505,16 +505,17 @@ def detect_headers(self, csv_path): if col.lower() in conserved_headers: headers.append(col.lower()) elif col.lower() == 'image name' or col.lower() == 'imagename' or \ - col.lower() == 'image_name': + col.lower() == 'image_name': headers.append('image') - elif col.lower() == 'dataset name' or col.lower() == 'datasetname' or \ - col.lower() == 'dataset_name': + elif col.lower() == 'dataset name' or \ + col.lower() == 'datasetname' or \ + col.lower() == 'dataset_name': headers.append('dataset') elif col.lower() == 'plate name' or col.lower() == 'platename' or \ - col.lower() == 'plate_name': + col.lower() == 'plate_name': headers.append('plate') elif col.lower() == 'well name' or col.lower() == 'wellname' or \ - col.lower() == 'well_name': + col.lower() == 'well_name': headers.append('well') elif col_type.name == 'object': headers.append('s') From 0f764486fc88f3c46490dc0ab37193b8f762e336 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Thu, 3 Feb 2022 15:09:01 +0000 Subject: [PATCH 
05/12] Added pandas module requirement --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index cdc649f8..bde27697 100644 --- a/setup.py +++ b/setup.py @@ -127,7 +127,8 @@ def read(fname): 'future', 'omero-py>=5.6.0', 'PyYAML', - 'jinja2' + 'jinja2', + 'pandas' ], python_requires='>=3', tests_require=[ From cfe97c4f0ccbbc2c9004418da55e252a098afcf8 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Wed, 23 Feb 2022 09:41:05 +0000 Subject: [PATCH 06/12] Modified code to not detect dataset/image header type for dataset_name/image_name columns and only dataset_id/image_id --- src/omero_metadata/cli.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/omero_metadata/cli.py b/src/omero_metadata/cli.py index 6c67c9e8..9af0a6e6 100755 --- a/src/omero_metadata/cli.py +++ b/src/omero_metadata/cli.py @@ -504,12 +504,15 @@ def detect_headers(self, csv_path): col = cols[index] if col.lower() in conserved_headers: headers.append(col.lower()) - elif col.lower() == 'image name' or col.lower() == 'imagename' or \ - col.lower() == 'image_name': + elif col.lower() == 'image id' or col.lower() == 'imageid' or \ + col.lower() == 'image_id': headers.append('image') - elif col.lower() == 'dataset name' or \ - col.lower() == 'datasetname' or \ - col.lower() == 'dataset_name': + elif col.lower() == 'roi id' or col.lower() == 'roiid' or \ + col.lower() == 'roi_id': + headers.append('roi') + elif col.lower() == 'dataset id' or \ + col.lower() == 'datasetid' or \ + col.lower() == 'dataset_id': headers.append('dataset') elif col.lower() == 'plate name' or col.lower() == 'platename' or \ col.lower() == 'plate_name': From 9daa9f364f70dfc48409a8534aec8ea6cd86ec46 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Thu, 24 Feb 2022 11:20:44 +0000 Subject: [PATCH 07/12] Added 'Dataset Name' column to be consistent with other column types.
Ensured DatasetColumn is named 'Dataset' --- src/omero_metadata/populate.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/omero_metadata/populate.py b/src/omero_metadata/populate.py index f3f1f136..0d329cc6 100644 --- a/src/omero_metadata/populate.py +++ b/src/omero_metadata/populate.py @@ -313,6 +313,11 @@ def _create_columns(self, klass): self.DEFAULT_COLUMN_SIZE, list())) # Ensure RoiColumn is named 'Roi' column.name = "Roi" + if column.__class__ is DatasetColumn: + append.append(StringColumn(DATASET_NAME_COLUMN, '', + self.DEFAULT_COLUMN_SIZE, list())) + # Ensure DatasetColumn is named 'Dataset' + column.name = "Dataset" # If image/roi name, then add ID column" if column.name == IMAGE_NAME_COLUMN: append.append(ImageColumn("Image", '', list())) From 9b158f539f766e9338f3742e5ed0cc68b1769e59 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Tue, 1 Mar 2022 11:41:01 +0000 Subject: [PATCH 08/12] Prevent other contexts from breaking when using --detect_header --- src/omero_metadata/populate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/omero_metadata/populate.py b/src/omero_metadata/populate.py index 0d329cc6..88b483e0 100644 --- a/src/omero_metadata/populate.py +++ b/src/omero_metadata/populate.py @@ -1556,7 +1556,7 @@ class BulkToMapAnnotationContext(_QueryContext): def __init__(self, client, target_object, file=None, fileid=None, cfg=None, cfgid=None, attach=False, options=None, batch_size=1000, loops=10, ms=10, dry_run=False, - allow_nan=False): + allow_nan=False, **kwargs): """ :param client: OMERO client object :param target_object: The object to be annotated @@ -1889,7 +1889,7 @@ class DeleteMapAnnotationContext(_QueryContext): def __init__(self, client, target_object, file=None, fileid=None, cfg=None, cfgid=None, attach=False, options=None, batch_size=1000, loops=10, ms=500, dry_run=False, - allow_nan=False): + allow_nan=False, **kwargs): """ :param client: OMERO client object From 
33de3ba587ee2a8cef7fcda49e5c924d746fe473 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Mon, 28 Mar 2022 14:23:20 +0100 Subject: [PATCH 09/12] Changed the code's default behaviour to use the new header detection method. User can now either pass '--manual_header' or a csv with '# header' header to bypass the auto-detect header method. --- src/omero_metadata/cli.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/omero_metadata/cli.py b/src/omero_metadata/cli.py index 9af0a6e6..e0fa15a1 100755 --- a/src/omero_metadata/cli.py +++ b/src/omero_metadata/cli.py @@ -244,8 +244,8 @@ def _configure(self, parser): populate.add_argument("--allow_nan", action="store_true", help=( "Allow empty values to become Nan in Long or Double columns")) - populate.add_argument("--detect_header", action="store_true", help=( - "Automatically detect header row to populate")) + populate.add_argument("--manual_header", action="store_true", help=( + "Disable automatic header detection row to populate")) populateroi.add_argument( "--measurement", type=int, default=None, @@ -569,10 +569,18 @@ def populate(self, args): md.linkAnnotation(cfgann) header_type = None - if args.detect_header: + # To use auto detect header by default unless instructed not to + # AND + # Check if first row contains `# header` + first_row = pd.read_csv(args.file, nrows=1, header=None) + if not args.manual_header and \ + not first_row[0].str.contains('# header'): + omero_metadata.populate.log.info("Detecting header types") header_type = self.detect_headers(args.file) if args.dry_run: omero_metadata.populate.log.info(f"Header Types:{header_type}") + else: + omero_metadata.populate.log.info("Using user defined header types") loops = 0 ms = 0 wait = args.wait From 3b8ff20c6203c380cc5b75335c98df740d807b39 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Tue, 29 Mar 2022 13:20:50 +0100 Subject: [PATCH 10/12] Removed the newly added 'Dataset Name' column as it wasn't fully implemented
later and caused a bug --- src/omero_metadata/populate.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/omero_metadata/populate.py b/src/omero_metadata/populate.py index 88b483e0..dc7c5194 100644 --- a/src/omero_metadata/populate.py +++ b/src/omero_metadata/populate.py @@ -314,8 +314,10 @@ def _create_columns(self, klass): # Ensure RoiColumn is named 'Roi' column.name = "Roi" if column.__class__ is DatasetColumn: - append.append(StringColumn(DATASET_NAME_COLUMN, '', - self.DEFAULT_COLUMN_SIZE, list())) + # This breaks the code, as currently there is no implementation + # of a method to populate the 'Dataset Name' column + # append.append(StringColumn(DATASET_NAME_COLUMN, '', + # self.DEFAULT_COLUMN_SIZE, list())) # Ensure DatasetColumn is named 'Dataset' column.name = "Dataset" # If image/roi name, then add ID column" From 06ad0626e8c03b4888eb34679e75e3ed4d1924e0 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Tue, 29 Mar 2022 13:21:44 +0100 Subject: [PATCH 11/12] Improved '--manual_header' help description and fixed a bug with '# header' detection --- src/omero_metadata/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/omero_metadata/cli.py b/src/omero_metadata/cli.py index e0fa15a1..b06c9109 100755 --- a/src/omero_metadata/cli.py +++ b/src/omero_metadata/cli.py @@ -245,7 +245,7 @@ def _configure(self, parser): "Allow empty values to become Nan in Long or Double columns")) populate.add_argument("--manual_header", action="store_true", help=( - "Disable automatic header detection row to populate")) + "Disable automatic header detection during population")) populateroi.add_argument( "--measurement", type=int, default=None, @@ -574,7 +574,7 @@ def populate(self, args): # Check if first row contains `# header` first_row = pd.read_csv(args.file, nrows=1, header=None) if not args.manual_header and \ - not first_row[0].str.contains('# header'): + not first_row[0].str.contains('# header').bool(): 
omero_metadata.populate.log.info("Detecting header types") header_type = self.detect_headers(args.file) if args.dry_run: From 99383af2e50af35f8669aad99dc1f683b2fd67f1 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Wed, 30 Mar 2022 13:45:33 +0100 Subject: [PATCH 12/12] Made detect_headers method a static method --- src/omero_metadata/cli.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/omero_metadata/cli.py b/src/omero_metadata/cli.py index b06c9109..edce60b6 100755 --- a/src/omero_metadata/cli.py +++ b/src/omero_metadata/cli.py @@ -488,7 +488,8 @@ def testtables(self, args): if not initialized: self.ctx.die(100, "Failed to initialize Table") - def detect_headers(self, csv_path): + @staticmethod + def detect_headers(csv_path): ''' Function to automatically detect headers from a CSV file. This function loads the table to pandas to detects the column type and match headers @@ -576,7 +577,7 @@ def populate(self, args): if not args.manual_header and \ not first_row[0].str.contains('# header').bool(): omero_metadata.populate.log.info("Detecting header types") - header_type = self.detect_headers(args.file) + header_type = MetadataControl.detect_headers(args.file) if args.dry_run: omero_metadata.populate.log.info(f"Header Types:{header_type}") else: