diff --git a/isatools/isajson/validate.py b/isatools/isajson/validate.py index 79ed5a93..8d0d52e7 100644 --- a/isatools/isajson/validate.py +++ b/isatools/isajson/validate.py @@ -693,11 +693,12 @@ def check_measurement_technology_types(assay_json, configs): ) -def check_study_and_assay_graphs(study_json, configs): - def check_assay_graph(process_sequence_json, config): +def check_study_and_assay_graphs(study_json, configs, no_config): + def check_assay_graph(process_sequence_json, config, no_config): list_of_last_processes_in_sequence = [i for i in process_sequence_json if "nextProcess" not in i.keys()] - log.info("Checking against assay protocol sequence configuration {}".format(config["description"])) - config_protocol_sequence = [i["protocol"] for i in config["protocols"]] + if not no_config: + log.info("Checking against assay protocol sequence configuration {}".format(config["description"])) + config_protocol_sequence = [i["protocol"] for i in config["protocols"]] for process in list_of_last_processes_in_sequence: # build graphs backwards assay_graph = list() try: @@ -727,40 +728,48 @@ def check_assay_graph(process_sequence_json, config): break except KeyError: # this happens when we can"t find a previousProcess pass - assay_graph.reverse() - assay_protocol_sequence = [[j for j in i if not j.startswith("#")] for i in assay_graph] - assay_protocol_sequence = [i for j in assay_protocol_sequence for i in j] # flatten list - assay_protocol_sequence_of_interest = [i for i in assay_protocol_sequence if i in config_protocol_sequence] - # filter out protocols in sequence that are not of interest (additional ones to required by config) - squished_assay_protocol_sequence_of_interest = list() - prev_prot = None - for prot in assay_protocol_sequence_of_interest: # remove consecutive same protocols - if prev_prot != prot: - squished_assay_protocol_sequence_of_interest.append(prot) - prev_prot = prot - from isatools.utils import contains - if not contains(squished_assay_protocol_sequence_of_interest, config_protocol_sequence): - warnings.append({ - "message": "Process sequence is not valid against configuration", - "supplemental": "Config protocol sequence {} does not in assay protocol sequence {}".format( - config_protocol_sequence, - squished_assay_protocol_sequence_of_interest), - "code": 4004 - }) - log.warning("Configuration protocol sequence {} does not match study graph found in {}" - .format(config_protocol_sequence, assay_protocol_sequence)) + + if not no_config: + assay_graph.reverse() + assay_protocol_sequence = [[j for j in i if not j.startswith("#")] for i in assay_graph] + assay_protocol_sequence = [i for j in assay_protocol_sequence for i in j] # flatten list + assay_protocol_sequence_of_interest = [i for i in assay_protocol_sequence if i in config_protocol_sequence] + # filter out protocols in sequence that are not of interest (additional ones to required by config) + squished_assay_protocol_sequence_of_interest = list() + prev_prot = None + for prot in assay_protocol_sequence_of_interest: # remove consecutive same protocols + if prev_prot != prot: + squished_assay_protocol_sequence_of_interest.append(prot) + prev_prot = prot + from isatools.utils import contains + if not contains(squished_assay_protocol_sequence_of_interest, config_protocol_sequence): + warnings.append({ + "message": "Process sequence is not valid against configuration", + "supplemental": "Config protocol sequence {} does not in assay protocol sequence {}".format( + config_protocol_sequence, + squished_assay_protocol_sequence_of_interest), + "code": 4004 + }) + log.warning("Configuration protocol sequence {} does not match study graph found in {}" + .format(config_protocol_sequence, assay_protocol_sequence)) protocols_and_types = dict([(i["@id"], i["protocolType"]["annotationValue"]) for i in study_json["protocols"]]) # first check study graph - log.info("Loading configuration (study)") - config = configs["study"] - check_assay_graph(study_json["processSequence"], config) + if not no_config: + log.info("Loading configuration (study)") + config = configs["study"] + else: + config = {} + check_assay_graph(study_json["processSequence"], config, no_config) for assay_json in study_json["assays"]: m = assay_json["measurementType"]["annotationValue"] t = assay_json["technologyType"]["annotationValue"] - log.info("Loading configuration ({}, {})".format(m, t)) - config = configs[(m, t)] - check_assay_graph(assay_json["processSequence"], config) + if not no_config: + log.info("Loading configuration ({}, {})".format(m, t)) + config = configs[(m, t)] + else: + config = {} + check_assay_graph(assay_json["processSequence"], config, no_config) def check_study_groups(study_or_assay): @@ -811,7 +820,8 @@ def validate( fp, config_dir=default_config_dir, log_level=None, - base_schemas_dir="isa_model_version_1_0_schemas" + base_schemas_dir="isa_model_version_1_0_schemas", + no_config: bool = False ): if config_dir is None: config_dir = default_config_dir @@ -887,10 +897,11 @@ def validate( check_term_accession_used_no_source_ref(isa_json) # Rule 3010 log.info("Loading configurations from " + config_dir) configs = load_config(config_dir) # Rule 4001 - log.info("Checking measurement and technology types...") - for study_json in isa_json["studies"]: - for assay_json in study_json["assays"]: - check_measurement_technology_types(assay_json, configs) # Rule 4002 + if not no_config: + log.info("Checking measurement and technology types...") + for study_json in isa_json["studies"]: + for assay_json in study_json["assays"]: + check_measurement_technology_types(assay_json, configs) # Rule 4002 log.info("Checking against configuration schemas...") check_isa_schemas( isa_json=isa_json, @@ -907,7 +918,7 @@ def validate( fp.seek(0) # reset file pointer log.info("Checking study and assay graphs...") for study_json in isa_json["studies"]: - check_study_and_assay_graphs(study_json, configs) # Rule 4004 + check_study_and_assay_graphs(study_json, configs, no_config) # Rule 4004 fp.seek(0) # try load and do study groups check log.info("Checking study groups...") diff --git a/isatools/isatab/validate/core.py b/isatools/isatab/validate/core.py index da13d88b..5a15e452 100644 --- a/isatools/isatab/validate/core.py +++ b/isatools/isatab/validate/core.py @@ -169,7 +169,8 @@ def validate(fp: TextIO, config_dir: str = default_config_dir, origin: str or None = None, rules: dict = None, - log_level=None) -> dict: + log_level=None, + no_config: bool = False) -> dict: """ A function to validate an ISA investigation tab file :param fp: the investigation file handler @@ -177,6 +178,7 @@ def validate(fp: TextIO, :param origin: value accepted = mzml2isa or None :param rules: optional rules to run (default: all rules) :param log_level: optional log level (default: INFO) + :param no_config: whether or not to validate against configs (default: False) :return: a dictionary of the validation results (errors, warnings and info) """ if not log_level: @@ -191,6 +193,7 @@ def validate(fp: TextIO, "investigation_df_dict": i_df_dict, "dir_context": path.dirname(fp.name), "configs": config_dir, + "no_config": no_config } investigation_validator = ISAInvestigationValidator(**params, **built_rules['investigation']) diff --git a/isatools/isatab/validate/rules/core.py b/isatools/isatab/validate/rules/core.py index a3ad728c..d5cd9b34 100644 --- a/isatools/isatab/validate/rules/core.py +++ b/isatools/isatab/validate/rules/core.py @@ -5,6 +5,7 @@ from pandas import DataFrame +from isatools.io import isatab_configurator from isatools.utils import utf8_text_file_open from isatools.isatab.defaults import NUMBER_OF_STUDY_GROUPS from isatools.isatab.load import load_table @@ -112,7 +113,8 @@ def __init__(self, dir_context: str, configs: str, available_rules: list = INVESTIGATION_RULES_MAPPING, - rules_to_run: tuple = DEFAULT_INVESTIGATION_RULES): + rules_to_run: tuple = DEFAULT_INVESTIGATION_RULES, + no_config: bool = False): """ The ISA investigation validator class :param investigation_df_dict: a dictionary of DataFrames and lists of DataFrames representing the investigation file @@ -120,6 +122,7 @@ def __init__(self, :param configs: directory of the XML config files :param available_rules: a customizable list of all available rules for investigation objects :param rules_to_run: a customizable tuple of rules identifiers to run for investigation objects + :param no_config: whether or not to validate against configs (default: False) """ self.all_rules = Rules(rules_to_run=rules_to_run, available_rules=available_rules) self.has_validated = False @@ -127,7 +130,8 @@ def __init__(self, 'investigation_df_dict': investigation_df_dict, 'dir_context': dir_context, 'configs': configs, - 'term_source_refs': None + 'term_source_refs': None, + "no_config": no_config } self.all_rules.validate_rules(validator=self) @@ -140,7 +144,8 @@ def __init__(self, study_filename: str, study_df: DataFrame, available_rules: List = STUDY_RULES_MAPPING, - rules_to_run: tuple = DEFAULT_STUDY_RULES): + rules_to_run: tuple = DEFAULT_STUDY_RULES, + no_config: bool = False): """ The ISA study validator class :param validator: the investigation validator @@ -149,13 +154,15 @@ def __init__(self, :param study_df: the study dataframe :param available_rules: a customizable list of all available rules for investigation objects :param rules_to_run: a customizable tuple of rules identifiers to run for investigation objects + :param no_config: whether or not to validate against configs (default: False) """ self.all_rules = Rules(rules_to_run=rules_to_run, available_rules=available_rules) self.has_validated = False self.params = { **validator.params, 'study_df': study_df, - 'config': validator.params['configs'][('[sample]', '')], + 'config': validator.params['configs'][('[sample]', '')] if ('[sample]', '') in validator.params['configs'] + else isatab_configurator.IsaTabConfigFileType(), 'study_filename': study_filename } with utf8_text_file_open(path.join(self.params['dir_context'], study_filename)) as s_fp: @@ -183,7 +190,8 @@ def __init__(self, assay_filename: str = None, assay_df: DataFrame = None, available_rules: List = ASSAY_RULES_MAPPING, - rules_to_run: tuple = DEFAULT_ASSAY_RULES): + rules_to_run: tuple = DEFAULT_ASSAY_RULES, + no_config: bool = False): """ The ISA assay validator class :param assay_tables: list of assay tables @@ -193,6 +201,7 @@ def __init__(self, :param assay_df: the assay dataframe :param available_rules: a customizable list of all available rules for investigation objects :param rules_to_run: a customizable tuple of rules identifiers to run for investigation objects + :param no_config: whether or not to validate against configs (default: False) """ self.all_rules = Rules(rules_to_run=rules_to_run, available_rules=available_rules) self.has_validated = False @@ -207,7 +216,7 @@ def __init__(self, if assay_filename != '': lowered_mt = assay_df['Study Assay Measurement Type'].tolist()[assay_index].lower() lowered_tt = assay_df['Study Assay Technology Type'].tolist()[assay_index].lower() - self.params['config'] = self.params['configs'].get((lowered_mt, lowered_tt), None) + self.params['config'] = self.params['configs'].get((lowered_mt, lowered_tt), isatab_configurator.IsaTabConfigFileType()) if self.params['config']: with utf8_text_file_open(path.join(self.params['dir_context'], assay_filename)) as a_fp: self.params['assay_table'] = load_table(a_fp) diff --git a/isatools/isatab/validate/rules/defaults.py b/isatools/isatab/validate/rules/defaults.py index eaafb849..05d81f9c 100644 --- a/isatools/isatab/validate/rules/defaults.py +++ b/isatools/isatab/validate/rules/defaults.py @@ -45,9 +45,9 @@ {'rule': check_pubmed_ids_format, 'params': ['investigation_df_dict'], 'identifier': '3003'}, {'rule': check_ontology_sources, 'params': ['investigation_df_dict'], 'identifier': '3008'}, - {'rule': load_config, 'params': ['configs'], 'identifier': '4001'}, - {'rule': check_measurement_technology_types, 'params': ['investigation_df_dict', 'configs'], 'identifier': '4002'}, - {'rule': check_investigation_against_config, 'params': ['investigation_df_dict', 'configs'], 'identifier': '4003'}, + {'rule': load_config, 'params': ['configs', 'no_config'], 'identifier': '4001'}, + {'rule': check_measurement_technology_types, 'params': ['investigation_df_dict', 'configs', 'no_config'], 'identifier': '4002'}, + {'rule': check_investigation_against_config, 'params': ['investigation_df_dict', 'configs', 'no_config'], 'identifier': '4003'}, # copies {'rule': check_table_files_read, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '0008'}, @@ -58,22 +58,22 @@ STUDY_RULES_MAPPING = [ - {'rule': check_unit_field, 'params': ['study_sample_table', 'config'], 'identifier': '1099'}, + {'rule': check_unit_field, 'params': ['study_sample_table', 'config', 'no_config'], 'identifier': '1099'}, { 'rule': check_ontology_fields, - 'params': ['study_sample_table', 'config', 'term_source_refs'], + 'params': ['study_sample_table', 'config', 'term_source_refs', 'no_config'], 'identifier': '3010' }, - {'rule': check_required_fields, 'params': ['study_sample_table', 'config'], 'identifier': '4003'}, + {'rule': check_required_fields, 'params': ['study_sample_table', 'config', 'no_config'], 'identifier': '4003'}, {'rule': check_factor_value_presence, 'params': ['study_sample_table'], 'identifier': '4007'}, { 'rule': check_protocol_fields, - 'params': ['study_sample_table', 'config', 'protocol_names_and_types'], + 'params': ['study_sample_table', 'config', 'protocol_names_and_types', 'no_config'], 'identifier': '4009' }, - {'rule': check_field_values, 'params': ['study_sample_table', 'config'], 'identifier': '4011'}, + {'rule': check_field_values, 'params': ['study_sample_table', 'config', 'no_config'], 'identifier': '4011'}, {'rule': load_table_checks, 'params': ['study_sample_table', 'study_filename'], 'identifier': '4014'}, { @@ -83,30 +83,30 @@ }, # copies - {'rule': check_required_fields, 'params': ['study_sample_table', 'config'], 'identifier': '4008'}, - {'rule': check_required_fields, 'params': ['study_sample_table', 'config'], 'identifier': '4010'}, + {'rule': check_required_fields, 'params': ['study_sample_table', 'config', 'no_config'], 'identifier': '4008'}, + {'rule': check_required_fields, 'params': ['study_sample_table', 'config', 'no_config'], 'identifier': '4010'}, ] ASSAY_RULES_MAPPING = [ {'rule': check_sample_names, 'params': ['study_sample_table', 'assay_tables'], 'identifier': '0000'}, - {'rule': check_unit_field, 'params': ['assay_table', 'config'], 'identifier': '1099'}, + {'rule': check_unit_field, 'params': ['assay_table', 'config', 'no_config'], 'identifier': '1099'}, - {'rule': check_ontology_fields, 'params': ['assay_table', 'config', 'term_source_refs'], 'identifier': '3010'}, + {'rule': check_ontology_fields, 'params': ['assay_table', 'config', 'term_source_refs', 'no_config'], 'identifier': '3010'}, - {'rule': check_required_fields, 'params': ['assay_table', 'config'], 'identifier': '4003'}, + {'rule': check_required_fields, 'params': ['assay_table', 'config', 'no_config'], 'identifier': '4003'}, {'rule': check_factor_value_presence, 'params': ['assay_table'], 'identifier': '4007'}, { 'rule': check_protocol_fields, - 'params': ['assay_table', 'config', 'protocol_names_and_types'], + 'params': ['assay_table', 'config', 'protocol_names_and_types', 'no_config'], 'identifier': '4009' }, - {'rule': check_field_values, 'params': ['assay_table', 'config'], 'identifier': '4011'}, + {'rule': check_field_values, 'params': ['assay_table', 'config', 'no_config'], 'identifier': '4011'}, {'rule': load_table_checks, 'params': ['assay_table', 'assay_filename'], 'identifier': '4014'}, # copies - {'rule': check_required_fields, 'params': ['study_sample_table', 'config'], 'identifier': '4008'}, - {'rule': check_required_fields, 'params': ['study_sample_table', 'config'], 'identifier': '4010'}, + {'rule': check_required_fields, 'params': ['study_sample_table', 'config', 'no_config'], 'identifier': '4008'}, + {'rule': check_required_fields, 'params': ['study_sample_table', 'config', 'no_config'], 'identifier': '4010'}, { 'rule': check_study_groups, diff --git a/isatools/isatab/validate/rules/rules_10xx.py b/isatools/isatab/validate/rules/rules_10xx.py index 6bfc1b0c..4df40c95 100644 --- a/isatools/isatab/validate/rules/rules_10xx.py +++ b/isatools/isatab/validate/rules/rules_10xx.py @@ -315,11 +315,12 @@ def check_study_factor_names(i_df_dict): log.warning(warning) -def check_unit_field(table, cfg): +def check_unit_field(table, cfg, no_config): """Checks if unit columns are valid against a configuration :param table: Table DataFrame :param cfg: An ISA Configuration object + :param no_config: whether or not to validate against configs :return: True if all unit columns in table are OK, False if not OK """ @@ -340,28 +341,25 @@ def check_unit_value(cell_value, unit_value, cfield, filename): return False return True - result = True - for icol, header in enumerate(table.columns): - cfields = [i for i in cfg.get_isatab_configuration()[0].get_field() if i.header == header] - if len(cfields) != 1: - continue - cfield = cfields[0] - ucfields = [i for i in cfg.get_isatab_configuration()[0].get_unit_field() if i.pos == cfield.pos + 1] - if len(ucfields) != 1: - continue - ucfield = ucfields[0] - if ucfield.is_required: - rheader = None - rindx = icol + 1 - if rindx < len(table.columns): - rheader = table.columns[rindx] - if rheader is None or rheader.lower() != 'unit': - spl = "The field '{}' in the file '{}' misses a required 'Unit' column".format(header, table.filename) - validator.add_warning(message="Cell requires a Unit", supplemental=spl, code=4999) - log.warning("(W) {}".format(spl)) - result = False - else: - for irow in range(len(table.index)): - check = check_unit_value(table.iloc[irow][icol], table.iloc[irow][rindx], cfield, table.filename) - result = result and check - return result + if cfg.get_isatab_configuration() and not no_config: + for icol, header in enumerate(table.columns): + cfields = [i for i in cfg.get_isatab_configuration()[0].get_field() if i.header == header] + if len(cfields) != 1: + continue + cfield = cfields[0] + ucfields = [i for i in cfg.get_isatab_configuration()[0].get_unit_field() if i.pos == cfield.pos + 1] + if len(ucfields) != 1: + continue + ucfield = ucfields[0] + if ucfield.is_required: + rheader = None + rindx = icol + 1 + if rindx < len(table.columns): + rheader = table.columns[rindx] + if rheader is None or rheader.lower() != 'unit': + spl = "The field '{}' in the file '{}' misses a required 'Unit' column".format(header, table.filename) + validator.add_warning(message="Cell requires a Unit", supplemental=spl, code=4999) + log.warning("(W) {}".format(spl)) + else: + for irow in range(len(table.index)): + check_unit_value(table.iloc[irow][icol], table.iloc[irow][rindx], cfield, table.filename) diff --git a/isatools/isatab/validate/rules/rules_30xx.py b/isatools/isatab/validate/rules/rules_30xx.py index be876dc5..772c27a4 100644 --- a/isatools/isatab/validate/rules/rules_30xx.py +++ b/isatools/isatab/validate/rules/rules_30xx.py @@ -128,13 +128,14 @@ def check_ontology_sources(i_df_dict): return term_source_refs -def check_ontology_fields(table, cfg, tsrs): +def check_ontology_fields(table, cfg, tsrs, no_config): """ Checks ontology annotation columns are correct for a given configuration in a table :param table: Table DataFrame :param cfg: An ISA Configuration object :param tsrs: List of Term Source References from the Ontology Source + :param no_config: whether or not to validate against configs Reference section :return: True if OK, False if not OK """ @@ -167,34 +168,32 @@ def check_single_field(cell_value, source, acc, config_field, filename): return_value = False return return_value - result = True - nfields = len(table.columns) - for icol, header in enumerate(table.columns): - cfields = [i for i in cfg.get_isatab_configuration()[0].get_field() if i.header == header] - if len(cfields) != 1: - continue - cfield = cfields[0] - if cfield.get_recommended_ontologies() is None: - continue - rindx = icol + 1 - rrindx = icol + 2 - rheader = '' - rrheader = '' - if rindx < nfields: - rheader = table.columns[rindx] - if rrindx < nfields: - rrheader = table.columns[rrindx] - if 'term source ref' not in rheader.lower() or 'term accession number' not in rrheader.lower(): - warning = "(W) The Field '{}' should have values from ontologies and has no ontology headers instead" - log.warning(warning.format(header)) - result = False - continue - - for irow in range(len(table.index)): - result = result and check_single_field(table.iloc[irow][icol], - table.iloc[irow][rindx], - table.iloc[irow][rrindx], - cfield, - table.filename) - - return result + if cfg.get_isatab_configuration() and not no_config: + nfields = len(table.columns) + for icol, header in enumerate(table.columns): + cfields = [i for i in cfg.get_isatab_configuration()[0].get_field() if i.header == header] + if len(cfields) != 1: + continue + cfield = cfields[0] + if cfield.get_recommended_ontologies() is None: + continue + rindx = icol + 1 + rrindx = icol + 2 + rheader = '' + rrheader = '' + if rindx < nfields: + rheader = table.columns[rindx] + if rrindx < nfields: + rrheader = table.columns[rrindx] + if 'term source ref' not in rheader.lower() or 'term accession number' not in rrheader.lower(): + warning = "(W) The Field '{}' should have values from ontologies and has no ontology headers instead" + log.warning(warning.format(header)) + continue + + for irow in range(len(table.index)): + check_single_field(table.iloc[irow][icol], + table.iloc[irow][rindx], + table.iloc[irow][rrindx], + cfield, + table.filename) + diff --git a/isatools/isatab/validate/rules/rules_40xx.py b/isatools/isatab/validate/rules/rules_40xx.py index 6faedd1e..5f31e595 100644 --- a/isatools/isatab/validate/rules/rules_40xx.py +++ b/isatools/isatab/validate/rules/rules_40xx.py @@ -13,11 +13,12 @@ ) -def check_investigation_against_config(i_df_dict, configs): +def check_investigation_against_config(i_df_dict, configs, no_config): """Checks investigation file against the loaded configurations :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file :param configs: A dictionary of ISA Configuration objects + :param no_config: whether or not to validate against configs :return: None """ @@ -50,28 +51,33 @@ def check_section_against_required_fields_one_value(section, required, i=0): if required_value == '' or 'Unnamed: ' in required_value: add_warning(i, col, x) - config_fields = configs[('[investigation]', '')].get_isatab_configuration()[0].get_field() - required_fields = [i.header for i in config_fields if i.is_required] - check_section_against_required_fields_one_value(i_df_dict['investigation'], required_fields) - check_section_against_required_fields_one_value(i_df_dict['i_publications'], required_fields) - check_section_against_required_fields_one_value(i_df_dict['i_contacts'], required_fields) + if ('[investigation]', '') in configs and not no_config: + config_fields = configs[('[investigation]', '')].get_isatab_configuration()[0].get_field() + required_fields = [i.header for i in config_fields if i.is_required] + check_section_against_required_fields_one_value(i_df_dict['investigation'], required_fields) + check_section_against_required_fields_one_value(i_df_dict['i_publications'], required_fields) + check_section_against_required_fields_one_value(i_df_dict['i_contacts'], required_fields) - for x, study_df in enumerate(i_df_dict['studies']): - check_section_against_required_fields_one_value(i_df_dict['studies'][x], required_fields, x) - check_section_against_required_fields_one_value(i_df_dict['s_design_descriptors'][x], required_fields, x) - check_section_against_required_fields_one_value(i_df_dict['s_publications'][x], required_fields, x) - check_section_against_required_fields_one_value(i_df_dict['s_factors'][x], required_fields, x) - check_section_against_required_fields_one_value(i_df_dict['s_assays'][x], required_fields, x) - check_section_against_required_fields_one_value(i_df_dict['s_protocols'][x], required_fields, x) - check_section_against_required_fields_one_value(i_df_dict['s_contacts'][x], required_fields, x) + for x, study_df in enumerate(i_df_dict['studies']): + check_section_against_required_fields_one_value(i_df_dict['studies'][x], required_fields, x) + check_section_against_required_fields_one_value(i_df_dict['s_design_descriptors'][x], required_fields, x) + check_section_against_required_fields_one_value(i_df_dict['s_publications'][x], required_fields, x) + check_section_against_required_fields_one_value(i_df_dict['s_factors'][x], required_fields, x) + check_section_against_required_fields_one_value(i_df_dict['s_assays'][x], required_fields, x) + check_section_against_required_fields_one_value(i_df_dict['s_protocols'][x], required_fields, x) + check_section_against_required_fields_one_value(i_df_dict['s_contacts'][x], required_fields, x) -def load_config(config_dir): +def load_config(config_dir, no_config): """Rule 4001 :param config_dir: Path to a directory containing ISA Configuration XMLs + :param no_config: whether or not to validate against configs :return: A dictionary of ISA Configuration objects """ + if no_config: + return {} + configs = None try: configs = isatab_configurator.load(config_dir) @@ -79,29 +85,31 @@ def load_config(config_dir): spl = "On loading {}".format(config_dir) validator.add_error(message="Configurations could not be loaded", supplemental=spl, code=4001) log.error("(E) FileNotFoundError on trying to load from {}".format(config_dir)) - if configs is None: + if not configs: spl = "On loading {}".format(config_dir) - validator.add_error(message="Configurations could not be loaded", supplemental=spl, code=4001) - log.error("(E) Could not load configurations from {}".format(config_dir)) + validator.add_warning(message="Configurations could not be loaded", supplemental=spl, code=4001) + log.warning("(W) No configurations were loaded from the '{}' directory".format(config_dir)) else: for k in configs.keys(): message = "Loaded table configuration '{}' for measurement and technology {}" log.debug(message.format(str(configs[k].get_isatab_configuration()[0].table_name), str(k))) - if configs is None: - raise SystemError("No configuration to load so cannot proceed with validation!") return configs -def check_measurement_technology_types(i_df_dict, configs): +def check_measurement_technology_types(i_df_dict, configs, no_config): """Rule 4002 :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file :param configs: A dictionary of ISA Configuration objects + :param no_config: whether or not to validate against configs :return: None """ - for i, study_assays_df in enumerate(i_df_dict['s_assays']): - measurement_types = study_assays_df['Study Assay Measurement Type'].tolist() - technology_types = study_assays_df['Study Assay Technology Type'].tolist() + if no_config: + return + + for i, assay_df in enumerate(i_df_dict['s_assays']): + measurement_types = assay_df['Study Assay Measurement Type'].tolist() + technology_types = assay_df['Study Assay Technology Type'].tolist() if len(measurement_types) == len(technology_types): for x, measurement_type in enumerate(measurement_types): lowered_mt = measurement_types[x].lower() @@ -131,31 +139,34 @@ def check_factor_value_presence(table): log.warning("(W) {}".format(spl)) -def check_required_fields(table, cfg): +def check_required_fields(table, cfg, no_config): """Checks if the required fields by a configuration have empty cells :param table: Table as a DataFrame :param cfg: A ISA Configuration object + :param no_config: whether or not to validate against configs :return: None """ - for fheader in [i.header for i in cfg.get_isatab_configuration()[0].get_field() if i.is_required]: - found_field = [i for i in table.columns if i.lower() == fheader.lower()] - if len(found_field) == 0: - msg = "A required column in assay table is not present" - spl = "Required field '{}' not found in the file '{}'".format(fheader, table.filename) - validator.add_warning(message=msg, supplemental=spl, code=4010) - log.warning("(W) {}".format(spl)) - elif len(found_field) > 1: - spl = "Field '{}' cannot have multiple values in the file '{}'".format(fheader, table.filename) - validator.add_warning(message="Multiple columns found", supplemental=spl, code=4013) - log.warning("(W) {}".format(spl)) + if cfg.get_isatab_configuration() and not no_config: + for fheader in [i.header for i in cfg.get_isatab_configuration()[0].get_field() if i.is_required]: + found_field = [i for i in table.columns if i.lower() == fheader.lower()] + if len(found_field) == 0: + msg = "A required column in assay table is not present" + spl = "Required field '{}' not found in the file '{}'".format(fheader, table.filename) + validator.add_warning(message=msg, supplemental=spl, code=4010) + log.warning("(W) {}".format(spl)) + elif len(found_field) > 1: + spl = "Field '{}' cannot have multiple values in the file '{}'".format(fheader, table.filename) + validator.add_warning(message="Multiple columns found", supplemental=spl, code=4013) + log.warning("(W) {}".format(spl)) -def check_field_values(table, cfg): +def check_field_values(table, cfg, no_config): """Checks table fields against configuration :param table: Table DataFrame :param cfg: A ISA Configuration object + :param no_config: whether or not to validate against configs :return: None """ @@ -221,26 +232,25 @@ def check_single_field(cell_value, cfg_field): return False if not is_valid_value: msg = "A value does not correspond to the correct data type" - spl = "Invalid value '{}' for type '{}' of the field '{}'" - spl = spl.format(cell_value, data_type, cfg_field.header) + spl = "Invalid value '{}' for type '{}' of the field '{}' in the file '{}'" + spl = spl.format(cell_value, data_type, cfg_field.header, table.filename) + if data_type == 'list': + spl = spl + ". Value must be one of: " + cfg_field.list_values validator.add_warning(message=msg, supplemental=spl, code=4011) log.warning("(W) {}".format(spl)) - if data_type == 'list': - log.warning("(W) Value must be one of: " + cfg_field.list_values) return is_valid_value - result = True - for irow in range(len(table.index)): - ncols = len(table.columns) - for icol in range(0, ncols): - cfields = [i for i in cfg.get_isatab_configuration()[0].get_field() if i.header == table.columns[icol]] - if len(cfields) == 1: - cfield = cfields[0] - result = result and check_single_field(table.iloc[irow][cfield.header], cfield) - return result + if cfg.get_isatab_configuration() and not no_config: + for irow in range(len(table.index)): + ncols = len(table.columns) + for icol in range(0, ncols): + cfields = [i for i in cfg.get_isatab_configuration()[0].get_field() if i.header == table.columns[icol]] + if len(cfields) == 1: + cfield = cfields[0] + check_single_field(table.iloc[irow][cfield.header], cfield) -def check_protocol_fields(table, cfg, proto_map): +def check_protocol_fields(table, cfg, proto_map, no_config): from itertools import tee def pairwise(iterable): @@ -254,7 +264,7 @@ def pairwise(iterable): a, b = tee(iterable) next(b, None) return zip(a, b) - + field_headers = [i for i in table.columns if i.lower().endswith(' name') or i.lower().endswith(' data file') @@ -269,7 +279,7 @@ def pairwise(iterable): spl = "(W) Protocol REF column is not followed by a material or data node in file '" + table.filename + "'" validator.add_warning(message="Missing Protocol Value", supplemental=spl, code=1007) log.warning(spl) - if cfg.get_isatab_configuration(): + if cfg.get_isatab_configuration() and not no_config: for left, right in pairwise(field_headers): cleft = None cright = None diff --git a/tests/isatab/validate/test_core.py b/tests/isatab/validate/test_core.py index ecf8a745..9060cbf4 100644 --- a/tests/isatab/validate/test_core.py +++ b/tests/isatab/validate/test_core.py @@ -17,7 +17,7 @@ def test_b_ii_s_3(self): data_path = path.join(path.dirname(path.abspath(__file__)), '..', '..', 'data', 'tab', 'BII-S-3') with open(path.join(data_path, 'i_gilbert.txt'), 'r') as data_file: r = validate(fp=data_file, config_dir=self.default_conf, origin="") - self.assertEqual(len(r['warnings']), 10) + self.assertEqual(len(r['warnings']), 21) def test_mtbls267(self): data_path = path.join(path.dirname(path.abspath(__file__)), '..', '..', 'data', 'tab', 'MTBLS267-partial') @@ -42,7 +42,7 @@ def test_bii_s_7(self): data_path = path.join(path.dirname(path.abspath(__file__)), '..', '..', 'data', 'tab', 'BII-S-7') with open(path.join(data_path, 'i_matteo.txt'), 'r') as data_file: report = validate(fp=data_file, config_dir=self.default_conf) - self.assertEqual(len(report['warnings']), 14) + self.assertEqual(len(report['warnings']), 42) def test_print_rule(self): raw_rule = INVESTIGATION_RULES_MAPPING[0] @@ -82,7 +82,7 @@ def is_investigation(investigation_df): data_path = path.join(path.dirname(path.abspath(__file__)), '..', '..', 'data', 'tab', 'BII-S-3') with open(path.join(data_path, 'i_gilbert.txt'), 'r') as data_file: r = validate(data_file, rules=rules) - self.assertEqual(len(r['warnings']), 10) + self.assertEqual(len(r['warnings']), 21) rule = '12000' expected_error = {