Skip to content

Commit

Permalink
🐛 Fix flu explorer (#3326)
Browse files Browse the repository at this point in the history
* 🐛 Fix flu explorer
  • Loading branch information
Marigold authored and paarriagadap committed Oct 6, 2024
1 parent 995c0fb commit 2e24240
Showing 1 changed file with 14 additions and 4 deletions.
18 changes: 14 additions & 4 deletions etl/steps/data/explorers/who/latest/flu.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ def run(dest_dir: str) -> None:
tb_flunet = flunet_garden["flunet"]
tb_fluid = fluid_garden["fluid"]

# Convert numeric types to float64 for consistency
tb_flunet = convert_to_float(tb_flunet)
tb_fluid = convert_to_float(tb_fluid)

tb_flu = pd.DataFrame(pd.merge(tb_fluid, tb_flunet, on=["country", "date", "hemisphere", "year"], how="outer"))
assert tb_flu[["country", "date"]].duplicated().sum() == 0

Expand Down Expand Up @@ -72,6 +76,12 @@ def run(dest_dir: str) -> None:
ds_explorer.save()


def convert_to_float(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with every numeric column cast to ``float64``.

    Columns are picked with ``select_dtypes(include=["number"])``, so integer
    and narrower float columns are widened, while non-numeric columns
    (strings, booleans, datetimes, ...) keep their dtype.

    Args:
        df: Table whose numeric columns should be normalised.

    Returns:
        A new frame with the same columns and index; ``df`` itself is not
        modified because ``astype`` returns a new object.
    """
    numeric_cols = df.select_dtypes(include=["number"]).columns
    # Cast only the selected columns; every other column passes through as-is.
    return df.astype({col: "float64" for col in numeric_cols})


def hold_back_data(df: pd.DataFrame, days_held_back: int) -> pd.DataFrame:
"""
Removing the last {days_held_back} days from the data, these values are typically adjusted in the following weeks so are often very low when first released.
Expand Down Expand Up @@ -120,7 +130,7 @@ def create_full_time_series(df: pd.DataFrame) -> pd.DataFrame:
especially important for the stacked bar charts which don't automatically fill missing dates with NAs
"""
filled_df = pd.DataFrame()
filled_dfs = []
for country in df.country.drop_duplicates():
country_df = df[df["country"] == country]
min_date = country_df.date.min()
Expand All @@ -133,9 +143,9 @@ def create_full_time_series(df: pd.DataFrame) -> pd.DataFrame:
assert len(date_series) == country_df.shape[0]
assert country_df.country.isna().sum() == 0

filled_df = pd.concat([filled_df, country_df])
filled_dfs.append(country_df)

return filled_df
return pd.concat(filled_dfs) # type: ignore


def create_monthly_aggregates(df: pd.DataFrame, days_held_back: int) -> pd.DataFrame:
Expand Down Expand Up @@ -348,5 +358,5 @@ def remove_sparse_timeseries(
cols = [x + type + "_zfilled" for x in strain_columns]
for country in countries:
if all(df.loc[(df["country"] == country), cols].fillna(0).sum() == 0):
df.loc[(df["country"] == country), cols] = np.NaN
df.loc[(df["country"] == country), cols] = np.nan
return df

0 comments on commit 2e24240

Please sign in to comment.