From 01a489cd70b99df0c60374d8db1fedec627b1812 Mon Sep 17 00:00:00 2001
From: Jenke Scheen <jenke.scheen@choderalab.org>
Date: Fri, 5 Apr 2024 11:32:01 +0200
Subject: [PATCH] add some more checks to CSV reader

---
 choppa/IO/input.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/choppa/IO/input.py b/choppa/IO/input.py
index 2ccbbd7..7edb432 100644
--- a/choppa/IO/input.py
+++ b/choppa/IO/input.py
@@ -57,6 +57,16 @@ def read_fitness_csv(
             fitness_colname,
             confidence_colname,
         ]]
+
+        # check that there aren't any NaNs and that fitness (and confidence) data is numeric
+        if fitness_df.isnull().values.any():
+            raise ValueError(f"Found missing values in input CSV: {fitness_df[fitness_df.isnull().any(axis=1)]}")
+        if len(fitness_df[pd.to_numeric(fitness_df[fitness_colname], errors='coerce').isnull()]) > 0:
+            raise ValueError(f"Found non-numeric fitness values in input CSV: {fitness_df[pd.to_numeric(fitness_df[fitness_colname], errors='coerce').isnull()]}")
+        if confidence_colname is not None:
+            if len(fitness_df[pd.to_numeric(fitness_df[confidence_colname], errors='coerce').isnull()]) > 0:
+                raise ValueError(f"Found non-numeric confidence values in input CSV: {fitness_df[pd.to_numeric(fitness_df[confidence_colname], errors='coerce').isnull()]}")
+        
         logger.info(f"Successfully read fitness data:\n{fitness_df}")
 
         return fitness_df
@@ -64,7 +74,7 @@ def read_fitness_csv(
     def df_to_basedict(fitness_df):
         """
         Converts a `pandas` fitness dataframe (read by `FitnessFactory.read_fitness_csv`) into
-        a fitness basedict.
+        a `fitness basedict`, which is essentially just an `OrderedDict`.
         """