From 01a489cd70b99df0c60374d8db1fedec627b1812 Mon Sep 17 00:00:00 2001 From: Jenke Scheen Date: Fri, 5 Apr 2024 11:32:01 +0200 Subject: [PATCH] add some more checks to CSV reader --- choppa/IO/input.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/choppa/IO/input.py b/choppa/IO/input.py index 2ccbbd7..7edb432 100644 --- a/choppa/IO/input.py +++ b/choppa/IO/input.py @@ -57,6 +57,16 @@ def read_fitness_csv( fitness_colname, confidence_colname, ]] + + # check that there aren't any NaNs and that fitness (and confidence) data is scalar + if fitness_df.isnull().values.any(): + raise ValueError(f"Found missing values in input CSV: {fitness_df[fitness_df.isnull().any(axis=1)]}") + if len(fitness_df[pd.to_numeric(fitness_df[fitness_colname], errors='coerce').isnull()]) > 0: + raise ValueError(f"Found non-numeric fitness values in input CSV: {fitness_df[pd.to_numeric(fitness_df[fitness_colname], errors='coerce').isnull()]}") + if confidence_colname is not None: + if len(fitness_df[pd.to_numeric(fitness_df[confidence_colname], errors='coerce').isnull()]) > 0: + raise ValueError(f"Found non-numeric confidence values in input CSV: {fitness_df[pd.to_numeric(fitness_df[confidence_colname], errors='coerce').isnull()]}") + logger.info(f"Successfully read fitness data:\n{fitness_df}") return fitness_df @@ -64,7 +74,7 @@ def read_fitness_csv( def df_to_basedict(fitness_df): """ Converts a `pandas` fitness dataframe (read by `FitnessFactory.read_fitness_csv`) into - a fitness basedict. + a `fitness basedict` which is essentially just an `OrderedDict`. """