Merge pull request #67 from muhanadz/detect_header_datasetColumn
Add --detect_header and DatasetColumn support
sbesson authored Apr 5, 2022
2 parents 2c1b269 + 99383af commit 07a33f3
Showing 3 changed files with 79 additions and 9 deletions.
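The heart of the change-set is a mapping from pandas dtypes to OMERO.tables column types: object columns become strings ('s'), float64 doubles ('d'), int64 longs ('l') and bool booleans ('b'), while conserved names such as 'well', 'plate', 'image', 'dataset' and 'roi' map to dedicated column classes. A minimal sketch of the dtype side of that mapping, independent of OMERO (column names and values here are illustrative only):

import io

import pandas as pd

csv = io.StringIO("Well,Gene,Score,Count\nA1,kif11,0.5,3\nA2,rab5,1.2,7")
table = pd.read_csv(csv)
# Well/Gene are object, Score is float64, Count is int64
mapping = {'object': 's', 'float64': 'd', 'int64': 'l', 'bool': 'b'}
print([mapping[t.name] for t in table.dtypes])  # ['s', 's', 'd', 'l']

(In the real detect_headers below, 'Well' would additionally be recognised as a conserved header and emitted as 'well' rather than 's'.)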
3 changes: 2 additions & 1 deletion setup.py
@@ -127,7 +127,8 @@ def read(fname):
         'future',
         'omero-py>=5.6.0',
         'PyYAML',
-        'jinja2'
+        'jinja2',
+        'pandas'
     ],
     python_requires='>=3',
     tests_require=[
63 changes: 62 additions & 1 deletion src/omero_metadata/cli.py
@@ -32,6 +32,8 @@
 from omero.grid import LongColumn
 from omero.model.enums import UnitsLength
 
+import pandas as pd
+
 HELP = """Metadata utilities
 Provides access to and editing of the metadata which
@@ -242,6 +244,9 @@ def _configure(self, parser):
         populate.add_argument("--allow_nan", action="store_true", help=(
             "Allow empty values to become Nan in Long or Double columns"))
 
+        populate.add_argument("--manual_header", action="store_true", help=(
+            "Disable automatic header detection during population"))
+
         populateroi.add_argument(
             "--measurement", type=int, default=None,
             help="Index of the measurement to populate. By default, all")
@@ -483,6 +488,49 @@ def testtables(self, args):
         if not initialized:
             self.ctx.die(100, "Failed to initialize Table")
 
+    @staticmethod
+    def detect_headers(csv_path):
+        '''
+        Automatically detect headers from a CSV file by loading the
+        table into pandas and matching column names and dtypes to headers.
+        '''
+
+        conserved_headers = ['well', 'plate', 'image', 'dataset', 'roi']
+        headers = []
+        table = pd.read_csv(csv_path)
+        col_types = table.dtypes.values.tolist()
+        cols = list(table.columns)
+
+        for index, col_type in enumerate(col_types):
+            col = cols[index]
+            if col.lower() in conserved_headers:
+                headers.append(col.lower())
+            elif col.lower() == 'image id' or col.lower() == 'imageid' or \
+                    col.lower() == 'image_id':
+                headers.append('image')
+            elif col.lower() == 'roi id' or col.lower() == 'roiid' or \
+                    col.lower() == 'roi_id':
+                headers.append('roi')
+            elif col.lower() == 'dataset id' or \
+                    col.lower() == 'datasetid' or \
+                    col.lower() == 'dataset_id':
+                headers.append('dataset')
+            elif col.lower() == 'plate name' or col.lower() == 'platename' or \
+                    col.lower() == 'plate_name':
+                headers.append('plate')
+            elif col.lower() == 'well name' or col.lower() == 'wellname' or \
+                    col.lower() == 'well_name':
+                headers.append('well')
+            elif col_type.name == 'object':
+                headers.append('s')
+            elif col_type.name == 'float64':
+                headers.append('d')
+            elif col_type.name == 'int64':
+                headers.append('l')
+            elif col_type.name == 'bool':
+                headers.append('b')
+        return headers
+
     # WRITE
 
     def populate(self, args):
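A usage sketch for the new static method (assuming this branch of omero-metadata and its dependencies are installed; the CSV content is illustrative): conserved and ID-style column names take precedence, and only the remaining columns fall back to dtype letters.

from omero_metadata.cli import MetadataControl

with open('example.csv', 'w') as f:
    f.write('Well,Well Name,Gene,Score,Count\n')
    f.write('1,A1,kif11,0.5,3\n')

print(MetadataControl.detect_headers('example.csv'))
# -> ['well', 'well', 's', 'd', 'l']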
@@ -521,6 +569,19 @@ def populate(self, args):
             cfgid = cfgann.getFile().getId()
             md.linkAnnotation(cfgann)
 
+        header_type = None
+        # Automatically detect the header types by default, unless
+        # instructed not to (--manual_header) or the first row already
+        # contains a `# header` line
+        first_row = pd.read_csv(args.file, nrows=1, header=None)
+        if not args.manual_header and \
+                not first_row[0].str.contains('# header').bool():
+            omero_metadata.populate.log.info("Detecting header types")
+            header_type = MetadataControl.detect_headers(args.file)
+            if args.dry_run:
+                omero_metadata.populate.log.info(f"Header Types:{header_type}")
+        else:
+            omero_metadata.populate.log.info("Using user defined header types")
         loops = 0
         ms = 0
         wait = args.wait
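This guard keeps backwards compatibility with CSV files that already declare their types in an explicit `# header` first row, and detection can also be disabled per run, e.g. `omero metadata populate Plate:1 --file data.csv --manual_header`. A small sketch of the first-row check itself (file content illustrative):

import io

import pandas as pd

explicit = io.StringIO('# header well,s,d\nWell,Gene,Score\nA1,kif11,0.5\n')
first_row = pd.read_csv(explicit, nrows=1, header=None)
# Mirrors the check above; Series.bool() expects a single-element Series
# (it is deprecated in newer pandas, where .iloc[0] is equivalent)
print(first_row[0].str.contains('# header').bool())  # True -> detection skipped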
@@ -533,7 +594,7 @@ def populate(self, args):
             cfg=args.cfg, cfgid=cfgid, attach=args.attach,
             options=localcfg, batch_size=args.batch,
             loops=loops, ms=ms, dry_run=args.dry_run,
-            allow_nan=args.allow_nan)
+            allow_nan=args.allow_nan, column_types=header_type)
         ctx.parse()
 
     def rois(self, args):
22 changes: 15 additions & 7 deletions src/omero_metadata/populate.py
@@ -314,6 +314,13 @@ def _create_columns(self, klass):
                     self.DEFAULT_COLUMN_SIZE, list()))
                 # Ensure RoiColumn is named 'Roi'
                 column.name = "Roi"
+            if column.__class__ is DatasetColumn:
+                # Appending a 'Dataset Name' column would break the code:
+                # there is currently no method to populate it
+                # append.append(StringColumn(DATASET_NAME_COLUMN, '',
+                #     self.DEFAULT_COLUMN_SIZE, list()))
+                # Ensure DatasetColumn is named 'Dataset'
+                column.name = "Dataset"
             # If image/roi name, then add ID column"
             if column.name == IMAGE_NAME_COLUMN:
                 append.append(ImageColumn("Image", '', list()))
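DatasetColumn is one of the typed OMERO.tables columns in omero.grid, alongside ImageColumn, RoiColumn, WellColumn and PlateColumn; the renaming above ensures the resulting table column is always called 'Dataset' whatever the CSV header spelling. A minimal construction sketch (assuming omero-py is installed; the IDs are illustrative):

from omero.grid import DatasetColumn

# name, description, values (dataset IDs)
col = DatasetColumn('Dataset', '', [101, 102])
print(col.name, col.values)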
@@ -417,8 +424,6 @@ def resolve(self, column, value, row):
                     )
                     break
                 elif column.name.lower() == "dataset name":
-                    # DatasetColumn unimplemented at the moment
-                    # We can still access column names though
                     images_by_id = self.wrapper.images_by_id[
                         self.wrapper.datasets_by_name[column_value].id.val
                     ]
@@ -428,8 +433,6 @@ def resolve(self, column, value, row):
                     )
                     break
                 elif column.name.lower() == "dataset":
-                    # DatasetColumn unimplemented at the moment
-                    # We can still access column names though
                     images_by_id = self.wrapper.images_by_id[
                         self.wrapper.datasets_by_id[
                             int(column_value)].id.val
@@ -892,7 +895,10 @@ def get_image_name_by_id(self, iid, did=None):
 
     def resolve_dataset(self, column, row, value):
         try:
-            return self.datasets_by_name[value].id.val
+            if column.name.lower() == 'dataset':
+                return self.datasets_by_id[int(value)].id.val
+            else:
+                return self.datasets_by_name[value].id.val
         except KeyError:
             log.warn('Project is missing dataset: %s' % value)
             return Skip()
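The new branch mirrors the two accepted spellings: a 'Dataset' column carries numeric IDs and resolves through datasets_by_id, while 'Dataset Name' keeps resolving through datasets_by_name, and both return the dataset ID. A plain-dict sketch of the lookup logic (the wrapper's tables are stubbed with illustrative values):

# illustrative stand-ins for the wrapper's lookup tables
datasets_by_id = {101: {'id': 101, 'name': 'first-dataset'}}
datasets_by_name = {'first-dataset': {'id': 101, 'name': 'first-dataset'}}

def resolve_dataset(column_name, value):
    # both branches return the dataset ID, as in the method above
    if column_name.lower() == 'dataset':
        return datasets_by_id[int(value)]['id']  # CSV cell like '101'
    return datasets_by_name[value]['id']         # CSV cell like 'first-dataset'

print(resolve_dataset('Dataset', '101'))                 # 101
print(resolve_dataset('Dataset Name', 'first-dataset'))  # 101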
@@ -1243,6 +1249,8 @@ def preprocess_data(self, reader):
                             column.values.append(value)
                         elif column.name.lower() == "plate":
                             column.values.append(value)
+                        elif column.name.lower() == "dataset":
+                            column.values.append(value)
                     except TypeError:
                         log.error('Original value "%s" now "%s" of bad type!' % (
                             original_value, value))
@@ -1635,7 +1643,7 @@ class BulkToMapAnnotationContext(_QueryContext):
     def __init__(self, client, target_object, file=None, fileid=None,
                  cfg=None, cfgid=None, attach=False, options=None,
                  batch_size=1000, loops=10, ms=10, dry_run=False,
-                 allow_nan=False):
+                 allow_nan=False, **kwargs):
         """
         :param client: OMERO client object
         :param target_object: The object to be annotated
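Accepting **kwargs here lets the CLI pass column_types=header_type to whichever context class it instantiates: ParsingContext consumes the argument, while the two map-annotation contexts simply swallow and ignore it. A generic sketch of the pattern (class names are illustrative):

class ParsingLike:
    def __init__(self, column_types=None, **kwargs):
        self.column_types = column_types  # consumed

class MapAnnotationLike:
    def __init__(self, **kwargs):
        pass  # column_types silently ignored

for cls in (ParsingLike, MapAnnotationLike):
    cls(column_types=['s', 'd'])  # one call site serves both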
@@ -1968,7 +1976,7 @@ class DeleteMapAnnotationContext(_QueryContext):
     def __init__(self, client, target_object, file=None, fileid=None,
                  cfg=None, cfgid=None, attach=False, options=None,
                  batch_size=1000, loops=10, ms=500, dry_run=False,
-                 allow_nan=False):
+                 allow_nan=False, **kwargs):
 
         """
         :param client: OMERO client object
