🐛 Fix flu explorer (#3326)

* 🐛 Fix flu explorer
owid · Oct 6, 2024 · 2e24240 · 2e24240
1 parent 995c0fb
commit 2e24240
Showing 1 changed file with 14 additions and 4 deletions.
diff --git a/etl/steps/data/explorers/who/latest/flu.py b/etl/steps/data/explorers/who/latest/flu.py
@@ -40,6 +40,10 @@ def run(dest_dir: str) -> None:
     tb_flunet = flunet_garden["flunet"]
     tb_fluid = fluid_garden["fluid"]
 
+    # Convert numeric types to float64 for consistency
+    tb_flunet = convert_to_float(tb_flunet)
+    tb_fluid = convert_to_float(tb_fluid)
+
     tb_flu = pd.DataFrame(pd.merge(tb_fluid, tb_flunet, on=["country", "date", "hemisphere", "year"], how="outer"))
     assert tb_flu[["country", "date"]].duplicated().sum() == 0
 
@@ -72,6 +76,12 @@ def run(dest_dir: str) -> None:
     ds_explorer.save()
 
 
+def convert_to_float(df: pd.DataFrame) -> pd.DataFrame:  # cast every numeric column of df to float64
+    cols = df.select_dtypes(include=["number"]).columns  # labels of the columns with a numeric dtype
+    df = df.astype({col: "float64" for col in cols})  # non-numeric columns are not in the mapping, so keep their dtype
+    return df
+
+
 def hold_back_data(df: pd.DataFrame, days_held_back: int) -> pd.DataFrame:
     """
     Removing the last {days_held_back} days from the data, these values are typically adjusted in the following weeks so are often very low when first released.
@@ -120,7 +130,7 @@ def create_full_time_series(df: pd.DataFrame) -> pd.DataFrame:
     especially important for the stacked bar charts which don't automatically fill missing dates with NAs
 
     """
-    filled_df = pd.DataFrame()
+    filled_dfs = []
     for country in df.country.drop_duplicates():
         country_df = df[df["country"] == country]
         min_date = country_df.date.min()
@@ -133,9 +143,9 @@ def create_full_time_series(df: pd.DataFrame) -> pd.DataFrame:
             assert len(date_series) == country_df.shape[0]
             assert country_df.country.isna().sum() == 0
 
-        filled_df = pd.concat([filled_df, country_df])
+        filled_dfs.append(country_df)
 
-    return filled_df
+    return pd.concat(filled_dfs)  # type: ignore
 
 
 def create_monthly_aggregates(df: pd.DataFrame, days_held_back: int) -> pd.DataFrame:
@@ -348,5 +358,5 @@ def remove_sparse_timeseries(
         cols = [x + type + "_zfilled" for x in strain_columns]
         for country in countries:
             if all(df.loc[(df["country"] == country), cols].fillna(0).sum() == 0):
-                df.loc[(df["country"] == country), cols] = np.NaN
+                df.loc[(df["country"] == country), cols] = np.nan
     return df