Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🐛 Fix flu explorer #3326

Merged
merged 2 commits into from
Sep 30, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions etl/steps/data/explorers/who/latest/flu.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ def run(dest_dir: str) -> None:
tb_flunet = flunet_garden["flunet"]
tb_fluid = fluid_garden["fluid"]

# Convert numeric types to float64 for consistency
tb_flunet = convert_to_float(tb_flunet)
tb_fluid = convert_to_float(tb_fluid)

tb_flu = pd.DataFrame(pd.merge(tb_fluid, tb_flunet, on=["country", "date", "hemisphere", "year"], how="outer"))
assert tb_flu[["country", "date"]].duplicated().sum() == 0

Expand Down Expand Up @@ -72,6 +76,12 @@ def run(dest_dir: str) -> None:
ds_explorer.save()


def convert_to_float(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with every numeric column cast to ``float64``.

    Numeric columns are those matched by ``select_dtypes(include=["number"])``.
    All other columns keep their existing dtype. The input frame itself is
    left untouched; the converted data is returned as a new object.

    Args:
        df: Table whose numeric columns should share a single dtype.

    Returns:
        A new frame with the same columns, numeric ones stored as ``float64``.
    """
    numeric_cols = df.select_dtypes(include=["number"]).columns
    # One astype call with a per-column mapping converts only the numeric
    # columns and passes every other column through unchanged.
    return df.astype({col: "float64" for col in numeric_cols})


def hold_back_data(df: pd.DataFrame, days_held_back: int) -> pd.DataFrame:
"""
Remove the last {days_held_back} days from the data; these values are typically adjusted in the following weeks, so they are often very low when first released.
Expand Down Expand Up @@ -120,7 +130,7 @@ def create_full_time_series(df: pd.DataFrame) -> pd.DataFrame:
especially important for the stacked bar charts which don't automatically fill missing dates with NAs

"""
filled_df = pd.DataFrame()
filled_dfs = []
for country in df.country.drop_duplicates():
country_df = df[df["country"] == country]
min_date = country_df.date.min()
Expand All @@ -133,9 +143,9 @@ def create_full_time_series(df: pd.DataFrame) -> pd.DataFrame:
assert len(date_series) == country_df.shape[0]
assert country_df.country.isna().sum() == 0

filled_df = pd.concat([filled_df, country_df])
filled_dfs.append(country_df)

return filled_df
return pd.concat(filled_dfs) # type: ignore


def create_monthly_aggregates(df: pd.DataFrame, days_held_back: int) -> pd.DataFrame:
Expand Down Expand Up @@ -348,5 +358,5 @@ def remove_sparse_timeseries(
cols = [x + type + "_zfilled" for x in strain_columns]
for country in countries:
if all(df.loc[(df["country"] == country), cols].fillna(0).sum() == 0):
df.loc[(df["country"] == country), cols] = np.NaN
df.loc[(df["country"] == country), cols] = np.nan
return df
Loading