diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_Item_Designations.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_Item_Designations.csv
index ace9042583..15f536d3f4 100644
--- a/resources/healthsystem/consumables/ResourceFile_Consumables_Item_Designations.csv
+++ b/resources/healthsystem/consumables/ResourceFile_Consumables_Item_Designations.csv
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c953a6136e61fe55181e477e6c1425ec010ac55b180351f0f466da2c2eb0d379
-size 67854
+oid sha256:79aafe10ade632b4eb487fefa572093a63c39b5cb56322c31619958242e2db8c
+size 72269
diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv
index 25249531b2..12d8b4dfc3 100644
--- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv
+++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c358a643e4def0e574b75f89f83d77f9c3366f668422e005150f4d69ebe8d7a7
-size 6169152
+oid sha256:01dda79d4ae913b782819f97fa909b86157956a04fa9a5ecc7238308e141fdc1
+size 45303490
diff --git a/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv b/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv
index 7ab675ecba..3c1b4d92a0 100644
--- a/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv
+++ b/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b5b0f417681cbdd2489e2f9c6634b2825c32beb9637dc045b56e308c910a102c
-size 90569
+oid sha256:380de2593127b48c6a43f14be6c55bf5c4f78a609b843e4cb66b8066ef7f9399
+size 124792
diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py
new file mode 100644
index 0000000000..5b3ee33ba7
--- /dev/null
+++ b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py
@@ -0,0 +1,1115 @@
+"""This file uses the results of running `impact_of_cons_availability_intervention.py`
+to extract summary results for the manuscript - "Rethinking economic evaluation of
+system-level interventions".
+The simulation is run for a short period of 5 years (2020 - 2025) because holding
+consumable availability constant is more justifiable in the short run than over a
+long period.
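+Summary tables and figures are saved to `outputs/impact_of_consumable_scenarios_results`.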
+""" + +import argparse +from pathlib import Path +import textwrap +from typing import Tuple + +import numpy as np +import pandas as pd +from matplotlib import pyplot as plt +import matplotlib.colors as mcolors +from matplotlib.ticker import FuncFormatter +from collections import Counter, defaultdict +import seaborn as sns +import squarify + +from tlo.analysis.utils import ( + CAUSE_OF_DEATH_OR_DALY_LABEL_TO_COLOR_MAP, + extract_results, + get_color_cause_of_death_or_daly_label, + make_age_grp_lookup, + order_of_cause_of_death_or_daly_label, + summarize, +) +import pickle + +from tlo import Date +from tlo.analysis.utils import ( + extract_params, + extract_results, + get_scenario_info, + get_scenario_outputs, + load_pickled_dataframes, + make_age_grp_lookup, + make_age_grp_types, + make_calendar_period_lookup, + make_calendar_period_type, + summarize, + write_log_to_excel, + parse_log_file, + COARSE_APPT_TYPE_TO_COLOR_MAP, + SHORT_TREATMENT_ID_TO_COLOR_MAP, + _standardize_short_treatment_id, + bin_hsi_event_details, + compute_mean_across_runs, + get_coarse_appt_type, + get_color_short_treatment_id, + order_of_short_treatment_ids, + plot_stacked_bar_chart, + squarify_neat, + unflatten_flattened_multi_index_in_logging, +) + +outputspath = Path('./outputs') +figurespath = Path(outputspath / 'impact_of_consumable_scenarios_results') +figurespath.mkdir(parents=True, exist_ok=True) # create directory if it doesn't exist +resourcefilepath = Path("./resources") + +# Declare period for which the results will be generated (defined inclusively) + +TARGET_PERIOD = (Date(2015, 1, 1), Date(2019, 12, 31)) + +make_graph_file_name = lambda stub: output_folder / f"{stub.replace('*', '_star_')}.png" # noqa: E731 + +_, age_grp_lookup = make_age_grp_lookup() + +def target_period() -> str: + """Returns the target period as a string of the form YYYY-YYYY""" + return "-".join(str(t.year) for t in TARGET_PERIOD) + +def drop_outside_period(_df): + """Return a dataframe which only includes for which the date is within the limits defined by TARGET_PERIOD""" + return _df.drop(index=_df.index[~_df['date'].between(*TARGET_PERIOD)]) + +def do_bar_plot_with_ci(_df, annotations=None, xticklabels_horizontal_and_wrapped=False): + """Make a vertical bar plot for each row of _df, using the columns to identify the height of the bar and the + extent of the error bar.""" + + yerr = np.array([ + (_df['median'] - _df['lower']).values, + (_df['upper'] - _df['median']).values, + ]) + + xticks = {(i + 0.5): k for i, k in enumerate(_df.index)} + + # Define color mapping based on index values + color_mapping = { + 'Actual': '#1f77b4', + 'Non-therapeutic consumables':'#ff7f0e', + 'Vital medicines': '#2ca02c', + 'Pharmacist-managed':'#d62728', + '75th percentile facility':'#9467bd', + '90th percentile facility':'#8c564b', + 'Best facility': '#e377c2', + 'Best facility (including DHO)': '#7f7f7f', + 'HIV supply chain': '#bcbd22', + 'EPI supply chain': '#17becf', + 'Perfect':'#31a354' + } + + colors = [_df.index[i] in color_mapping for i in range(len(_df.index))] + color_values = [color_mapping.get(idx, '#cccccc') for idx in _df.index] + + fig, ax = plt.subplots() + ax.bar( + xticks.keys(), + _df['median'].values, + yerr=yerr, + alpha=1, + color=color_values, + ecolor='black', + capsize=10, + label=xticks.values() + ) + if annotations: + for xpos, ypos, text in zip(xticks.keys(), _df['upper'].values, annotations): + ax.text(xpos, ypos * 1.05, text, horizontalalignment='center', fontsize=10) + + ax.set_xticks(list(xticks.keys())) + if not 
xticklabels_horizontal_and_wrapped: + wrapped_labs = ["\n".join(textwrap.wrap(_lab, 20)) for _lab in xticks.values()] + ax.set_xticklabels(wrapped_labs, rotation=45, ha='right', fontsize=10) + else: + wrapped_labs = ["\n".join(textwrap.wrap(_lab, 20)) for _lab in xticks.values()] + ax.set_xticklabels(wrapped_labs, fontsize=10) + + # Set font size for y-tick labels + ax.tick_params(axis='y', labelsize=10) + ax.tick_params(axis='x', labelsize=10) + + ax.grid(axis="y") + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + fig.tight_layout() + + return fig, ax + +def do_bar_plot_with_ci_and_heatmap(_df, annotations=None, xticklabels_horizontal_and_wrapped=False, heatmap_values=None, plt_title = 'unnamed_figure'): + """Create a bar plot with CI and a heatmap above it.""" + yerr = np.array([ + (_df['median'] - _df['lower']).values, + (_df['upper'] - _df['median']).values, + ]) + + xticks = {(i + 0.5): k for i, k in enumerate(_df.index)} + + # Define color mapping based on index values + color_mapping = { + 'Actual': '#1f77b4', + 'Non-therapeutic consumables':'#ff7f0e', + 'Vital medicines': '#2ca02c', + 'Pharmacist-managed':'#d62728', + '75th percentile facility':'#9467bd', + '90th percentile facility':'#8c564b', + 'Best facility': '#e377c2', + 'Best facility (including DHO)': '#7f7f7f', + 'HIV supply chain': '#bcbd22', + 'EPI supply chain': '#17becf', + 'Perfect':'#31a354' + } + + color_values = [color_mapping.get(idx, '#cccccc') for idx in _df.index] + + # Create a figure with two axes + fig, (heatmap_ax, ax) = plt.subplots( + nrows=2, ncols=1, gridspec_kw={"height_ratios": [0.3, 2]}, figsize=(10, 7) + ) + + # Heatmap axis + if heatmap_values: + cmap = plt.cm.YlGn + norm = mcolors.Normalize(vmin=min(heatmap_values), vmax=max(heatmap_values)) + heatmap_colors = [cmap(norm(value)) for value in heatmap_values] + + heatmap_ax.bar( + xticks.keys(), + [1] * len(heatmap_values), # Constant height for heatmap bars + color=heatmap_colors, + align='center', + width=0.8 + ) + + # Add data labels to heatmap bars + for xpos, value in zip(xticks.keys(), heatmap_values): + heatmap_ax.text( + xpos, 0.5, f"{value:.2f}", color='black', ha='center', va='center', fontsize= 12, weight='bold' + ) + + heatmap_ax.set_xticks(list(xticks.keys())) + heatmap_ax.set_xticklabels([]) + heatmap_ax.set_yticks([]) + heatmap_ax.set_ylabel('Average consumable \n availability under \n each scenario \n (Baseline = 0.52)', fontsize=10, rotation=0, labelpad=20) + heatmap_ax.spines['top'].set_visible(False) + heatmap_ax.spines['right'].set_visible(False) + heatmap_ax.spines['left'].set_visible(False) + heatmap_ax.spines['bottom'].set_visible(False) + + # Bar plot axis + ax.bar( + xticks.keys(), + _df['median'].values, + yerr=yerr, + alpha=1, + color=color_values, + ecolor='black', + capsize=10 + ) + if annotations: + for xpos, ypos, text in zip(xticks.keys(), _df['upper'].values, annotations): + ax.text(xpos, ypos * 1.05, text, horizontalalignment='center', fontsize=10) + + ax.set_xticks(list(xticks.keys())) + if not xticklabels_horizontal_and_wrapped: + wrapped_labs = ["\n".join(textwrap.wrap(_lab, 20)) for _lab in xticks.values()] + ax.set_xticklabels(wrapped_labs, rotation=45, ha='right', fontsize=10) + else: + wrapped_labs = ["\n".join(textwrap.wrap(_lab, 20)) for _lab in xticks.values()] + ax.set_xticklabels(wrapped_labs, fontsize=10) + + # Set font size for y-tick labels + ax.tick_params(axis='y', labelsize=10) + ax.tick_params(axis='x', labelsize=10) + + ax.grid(axis="y") + 
ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + + # Add global title + fig.suptitle(plt_title, fontsize=16, fontweight='bold') + + fig.tight_layout() + + return fig, (heatmap_ax, ax) + +def get_num_dalys(_df): + """Return total number of DALYS (Stacked) by label (total within the TARGET_PERIOD). + Throw error if not a record for every year in the TARGET PERIOD (to guard against inadvertently using + results from runs that crashed mid-way through the simulation. + """ + years_needed = [i.year for i in TARGET_PERIOD] + assert set(_df.year.unique()).issuperset(years_needed), "Some years are not recorded." + return pd.Series( + data=_df + .loc[_df.year.between(*years_needed)] + .drop(columns=['date', 'sex', 'age_range', 'year']) + .sum().sum() + ) + +def get_num_dalys_by_cause(_df): + """Return total number of DALYS (Stacked) by label (total within the TARGET_PERIOD). + Throw error if not a record for every year in the TARGET PERIOD (to guard against inadvertently using + results from runs that crashed mid-way through the simulation. + """ + years_needed = [i.year for i in TARGET_PERIOD] + assert set(_df.year.unique()).issuperset(years_needed), "Some years are not recorded." + return pd.Series( + data=_df + .loc[_df.year.between(*years_needed)] + .drop(columns=['date', 'sex', 'age_range', 'year']) + .sum() + ) + +def get_num_dalys_per_person_year(_df): + """Return total number of DALYS (Stacked) by label (total within the TARGET_PERIOD). + Throw error if not a record for every year in the TARGET PERIOD (to guard against inadvertently using + results from runs that crashed mid-way through the simulation. + """ + years_needed = [i.year for i in TARGET_PERIOD] + assert set(_df.year.unique()).issuperset(years_needed), "Some years are not recorded." + return pd.Series( + data=_df + .loc[_df.year.between(*years_needed)] + .drop(columns=['date', 'sex', 'age_range']) + .groupby('year').sum().sum(axis = 1) + ) + +def get_num_dalys_per_person_year_by_cause(_df): + """Return total number of DALYS (Stacked) by label (total within the TARGET_PERIOD). + Throw error if not a record for every year in the TARGET PERIOD (to guard against inadvertently using + results from runs that crashed mid-way through the simulation. + """ + years_needed = [i.year for i in TARGET_PERIOD] + assert set(_df.year.unique()).issuperset(years_needed), "Some years are not recorded." + return pd.Series( + data=_df + .loc[_df.year.between(*years_needed)] + .drop(columns=['date', 'sex', 'age_range']) + .groupby('year').sum().unstack() + ) +def extract_results_by_person_year(results_folder: Path, + module: str, + key: str, + column: str = None, + index: str = None, + custom_generate_series=None, + ) -> pd.DataFrame: + """Utility function to unpack results. + + Produces a dataframe from extracting information from a log with the column multi-index for the draw/run. + + If the column to be extracted exists in the log, the name of the `column` is provided as `column`. If the resulting + dataframe should be based on another column that exists in the log, this can be provided as 'index'. + + If instead, some work must be done to generate a new column from log, then a function can be provided to do this as + `custom_generate_series`. + + Optionally, with `do_scaling=True`, each element is multiplied by the scaling_factor recorded in the simulation. + + Note that if runs in the batch have failed (such that logs have not been generated), these are dropped silently. 
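+
+    For example (hypothetical shape): with two draws and two runs each, the returned
+    dataframe has one column per (draw, run) pair - (0, 0), (0, 1), (1, 0), (1, 1) -
+    each holding the DALYs per person-year summed over the extracted years.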
+ """ + + def get_population_size(_draw, _run): + """Helper function to get the multiplier from the simulation. + Note that if the scaling factor cannot be found a `KeyError` is thrown.""" + return load_pickled_dataframes( + results_folder, _draw, _run, 'tlo.methods.demography' + )['tlo.methods.demography']['population']['total'] + + if custom_generate_series is None: + # If there is no `custom_generate_series` provided, it implies that function required selects the specified + # column from the dataframe. + assert column is not None, "Must specify which column to extract" + else: + assert index is None, "Cannot specify an index if using custom_generate_series" + assert column is None, "Cannot specify a column if using custom_generate_series" + + def generate_series(dataframe: pd.DataFrame) -> pd.Series: + if custom_generate_series is None: + if index is not None: + return dataframe.set_index(index)[column] + else: + return dataframe.reset_index(drop=True)[column] + else: + return custom_generate_series(dataframe) + + # get number of draws and numbers of runs + info = get_scenario_info(results_folder) + + # Collect results from each draw/run + res = dict() + for draw in range(info['number_of_draws']): + for run in range(info['runs_per_draw']): + + draw_run = (draw, run) + + try: + df: pd.DataFrame = load_pickled_dataframes(results_folder, draw, run, module)[module][key] + output_from_eval: pd.Series = generate_series(df) + assert pd.Series == type(output_from_eval), 'Custom command does not generate a pd.Series' + res[draw_run] = output_from_eval.reset_index().drop(columns = ['year']).T / get_population_size(draw, run) + res[draw_run] = res[draw_run].sum(axis =1) + except KeyError: + # Some logs could not be found - probably because this run failed. + res[draw_run] = None + + # Use pd.concat to compile results (skips dict items where the values is None) + _concat = pd.concat(res, axis=1) + _concat.columns.names = ['draw', 'run'] # name the levels of the columns multi-index + return _concat + +def extract_results_by_person_year_by_cause(results_folder: Path, + module: str, + key: str, + column: str = None, + index: str = None, + custom_generate_series=None, + cause: str = None, + ) -> pd.DataFrame: + """Utility function to unpack results. + + Produces a dataframe from extracting information from a log with the column multi-index for the draw/run. + + If the column to be extracted exists in the log, the name of the `column` is provided as `column`. If the resulting + dataframe should be based on another column that exists in the log, this can be provided as 'index'. + + If instead, some work must be done to generate a new column from log, then a function can be provided to do this as + `custom_generate_series`. + + Optionally, with `do_scaling=True`, each element is multiplied by the scaling_factor recorded in the simulation. + + Note that if runs in the batch have failed (such that logs have not been generated), these are dropped silently. + """ + + def get_population_size(_draw, _run): + """Helper function to get the multiplier from the simulation. + Note that if the scaling factor cannot be found a `KeyError` is thrown.""" + return load_pickled_dataframes( + results_folder, _draw, _run, 'tlo.methods.demography' + )['tlo.methods.demography']['population']['total'] + + if custom_generate_series is None: + # If there is no `custom_generate_series` provided, it implies that function required selects the specified + # column from the dataframe. 
+ assert column is not None, "Must specify which column to extract" + else: + assert index is None, "Cannot specify an index if using custom_generate_series" + assert column is None, "Cannot specify a column if using custom_generate_series" + + def generate_series(dataframe: pd.DataFrame) -> pd.Series: + if custom_generate_series is None: + if index is not None: + return dataframe.set_index(index)[column] + else: + return dataframe.reset_index(drop=True)[column] + else: + return custom_generate_series(dataframe) + + # get number of draws and numbers of runs + info = get_scenario_info(results_folder) + + # Collect results from each draw/run + res = dict() + for draw in range(info['number_of_draws']): + for run in range(info['runs_per_draw']): + + draw_run = (draw, run) + + try: + df: pd.DataFrame = load_pickled_dataframes(results_folder, draw, run, module)[module][key] + output_from_eval: pd.Series = generate_series(df) + assert pd.Series == type(output_from_eval), 'Custom command does not generate a pd.Series' + output_from_eval = output_from_eval[output_from_eval.index.get_level_values(0) == cause].droplevel(0) + res[draw_run] = output_from_eval.reset_index().drop(columns = ['year']).T / get_population_size(draw, run) + res[draw_run] = res[draw_run].sum(axis =1) + except KeyError: + # Some logs could not be found - probably because this run failed. + res[draw_run] = None + + # Use pd.concat to compile results (skips dict items where the values is None) + _concat = pd.concat(res, axis=1) + _concat.columns.names = ['draw', 'run'] # name the levels of the columns multi-index + return _concat + +def find_difference_relative_to_comparison(_ser: pd.Series, + comparison: str, + scaled: bool = False, + drop_comparison: bool = True, + ): + """Find the difference in the values in a pd.Series with a multi-index, between the draws (level 0) + within the runs (level 1), relative to where draw = `comparison`. 
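+    For example (hypothetical values): with `comparison=0` and values
+    {(0, run_0): 10.0, (1, run_0): 7.0}, the entry for (1, run_0) is 7.0 - 10.0 = -3.0,
+    or (7.0 - 10.0) / 10.0 = -0.3 when `scaled=True`.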
+    The comparison is `X - COMPARISON`."""
+    return _ser \
+        .unstack(level=0) \
+        .apply(lambda x: (x - x[comparison]) / (x[comparison] if scaled else 1.0), axis=1) \
+        .drop(columns=([comparison] if drop_comparison else [])) \
+        .stack()
+
+# %% Gathering basic information
+
+# Find results_folder associated with a given batch_file and get most recent
+#results_folder = get_scenario_outputs('impact_of_consumable_scenarios.py', outputspath)
+results_folder = Path(outputspath / 'sakshi.mohan@york.ac.uk/impact_of_consumables_scenarios-2024-09-12T192454Z/')
+#results_folder = Path(outputspath / 'impact_of_consumables_scenarios-2024-09-12T155640Z/')
+
+# look at one log (so can decide what to extract)
+log = load_pickled_dataframes(results_folder)
+
+# get basic information about the results
+info = get_scenario_info(results_folder)
+
+# 1) Extract the parameters that have varied over the set of simulations
+params = extract_params(results_folder)
+params_dict = {'default': 'Actual', 'scenario1': 'Non-therapeutic consumables', 'scenario2': 'Vital medicines',
+               'scenario3': 'Pharmacist-managed', 'scenario4': 'Level 1b', 'scenario5': 'CHAM',
+               'scenario6': '75th percentile facility', 'scenario7': '90th percentile facility', 'scenario8': 'Best facility',
+               'scenario9': 'Best facility (including DHO)', 'scenario10': 'HIV supply chain', 'scenario11': 'EPI supply chain',
+               'scenario12': 'HIV moved to Govt supply chain', 'all': 'Perfect'}
+params_dict_df = pd.DataFrame.from_dict(params_dict, orient='index', columns=['name_of_scenario']).reset_index().rename(columns={'index': 'value'})
+params = params.merge(params_dict_df, on='value', how='left', validate='1:1')
+scenarios = params['name_of_scenario']  # readable scenario names, used for axis labels
+drop_scenarios = ['Level 1b', 'CHAM', 'Best facility (including DHO)', 'HIV moved to Govt supply chain']  # scenarios no longer considered important for comparison
+
+# %% Extracting results from run
+
+# 1. 
DALYs accrued and averted +################################### +# 1.1 Total DALYs accrued +#------------------------- +# Get total DALYs accrued +num_dalys = extract_results( + results_folder, + module='tlo.methods.healthburden', + key='dalys_stacked', + custom_generate_series=get_num_dalys, + do_scaling=True + ) + +# %% Chart of total number of DALYS +num_dalys_summarized = summarize(num_dalys).loc[0].unstack() +num_dalys_summarized['scenario'] = scenarios.to_list() +num_dalys_summarized = num_dalys_summarized.set_index('scenario') +num_dalys_summarized.to_csv(figurespath/ 'num_dalys_summarized.csv') + +# Plot DALYS accrued (with xtickabels horizontal and wrapped) +name_of_plot = f'Total DALYs accrued, {target_period()}' +chosen_num_dalys_summarized = num_dalys_summarized[~num_dalys_summarized.index.isin(drop_scenarios)] +fig, ax = do_bar_plot_with_ci( + (chosen_num_dalys_summarized / 1e6).clip(lower=0.0), + annotations=[ + f"{round(row['median']/1e6, 1)} \n ({round(row['lower']/1e6, 1)}-{round(row['upper']/1e6, 1)})" + for _, row in chosen_num_dalys_summarized.clip(lower=0.0).iterrows() + ], + xticklabels_horizontal_and_wrapped=False, +) +ax.set_title(name_of_plot) +ax.set_ylim(0, 120) +ax.set_yticks(np.arange(0, 120, 10)) +ax.set_ylabel('Total DALYs accrued \n(Millions)') +fig.tight_layout() +fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '')) +fig.show() +plt.close(fig) + +# 1.2 Total DALYs averted +#------------------------ +# Get absolute DALYs averted +num_dalys_averted = summarize( + -1.0 * + pd.DataFrame( + find_difference_relative_to_comparison( + num_dalys.loc[0], + comparison= 0) # sets the comparator to 0 which is the Actual scenario + ).T + ).iloc[0].unstack() +num_dalys_averted['scenario'] = scenarios.to_list()[1:12] +num_dalys_averted = num_dalys_averted.set_index('scenario') + +# Get percentage DALYs averted +pc_dalys_averted = 100.0 * summarize( + -1.0 * + pd.DataFrame( + find_difference_relative_to_comparison( + num_dalys.loc[0], + comparison= 0, # sets the comparator to 0 which is the Actual scenario + scaled=True) + ).T +).iloc[0].unstack() +pc_dalys_averted['scenario'] = scenarios.to_list()[1:12] +pc_dalys_averted = pc_dalys_averted.set_index('scenario') + +# %% Chart of number of DALYs averted +# Plot DALYS averted (with xtickabels horizontal and wrapped) +average_availability_under_scenarios = [0.59, 0.59, 0.6, 0.57, 0.63, 0.7, 0.79, 0.91, 1] +name_of_plot = f'Health impact of improved consumable availability\n at level 1 health facilities, {target_period()}' +chosen_num_dalys_averted = num_dalys_averted[~num_dalys_averted.index.isin(drop_scenarios)] +chosen_pc_dalys_averted = pc_dalys_averted[~pc_dalys_averted.index.isin(drop_scenarios)] +fig, (heatmap_ax, ax) = do_bar_plot_with_ci_and_heatmap( + (chosen_num_dalys_averted / 1e6), + annotations=[ + f"{round(row['median'], 1)} % \n ({round(row['lower'], 1)}- \n {round(row['upper'], 1)}) %" + for _, row in chosen_pc_dalys_averted.iterrows() + ], + xticklabels_horizontal_and_wrapped=False, + heatmap_values=average_availability_under_scenarios, + plt_title = name_of_plot +) +#ax.set_title(name_of_plot) +ax.set_ylim(0, 14) +ax.set_yticks(np.arange(0, 14, 2)) +ax.set_ylabel('Additional DALYS Averted \n(Millions)') +fig.tight_layout() +fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '').replace('\n', '')) +fig.show() +plt.close(fig) + +# 1.2 DALYs by disease area/intervention - for comparison of the magnitude of impact created by consumables interventions +num_dalys_by_cause = 
extract_results( + results_folder, + module='tlo.methods.healthburden', + key='dalys_stacked', + custom_generate_series=get_num_dalys_by_cause, + do_scaling=True + ) +num_dalys_by_cause_summarized = summarize(num_dalys_by_cause).unstack(level = 0) +num_dalys_by_cause_summarized = num_dalys_by_cause_summarized.reset_index() +num_dalys_by_cause_summarized = num_dalys_by_cause_summarized.rename(columns = {'level_2':'cause', 0: 'DALYs_accrued'}) +num_dalys_by_cause_summarized = num_dalys_by_cause_summarized.pivot(index=['draw','cause'], columns='stat', values='DALYs_accrued') +num_dalys_by_cause_summarized.to_csv(figurespath / 'num_dalys_by_cause_summarized.csv') + +# Get top 10 causes until Actual +num_dalys_by_cause_actual = num_dalys_by_cause_summarized[num_dalys_by_cause_summarized.index.get_level_values(0) == 0] +num_dalys_by_cause_actual = num_dalys_by_cause_actual.sort_values('mean', ascending = False) +num_dalys_by_cause_actual =num_dalys_by_cause_actual[0:10] +top_10_causes_of_dalys = num_dalys_by_cause_actual.index.get_level_values(1).unique() + +# Get DALYs aveterted by cause and plot bar chats +for cause in top_10_causes_of_dalys: + num_dalys_by_cause_pivoted = num_dalys_by_cause[num_dalys_by_cause.index == cause].unstack().reset_index().drop(columns = ['level_2']).set_index(['draw', 'run']) + num_dalys_averted_by_cause = summarize( + -1.0 * + pd.DataFrame( + find_difference_relative_to_comparison( + num_dalys_by_cause_pivoted.squeeze(), + comparison= 0) # sets the comparator to 0 which is the Actual scenario + ).T + ).iloc[0].unstack() + num_dalys_averted_by_cause['scenario'] = scenarios.to_list()[1:12] + num_dalys_averted_by_cause = num_dalys_averted_by_cause.set_index('scenario') + + # Get percentage DALYs averted + pc_dalys_averted_by_cause = 100.0 * summarize( + -1.0 * + pd.DataFrame( + find_difference_relative_to_comparison( + num_dalys_by_cause_pivoted.squeeze(), + comparison= 0, # sets the comparator to 0 which is the Actual scenario + scaled=True) + ).T + ).iloc[0].unstack() + pc_dalys_averted_by_cause['scenario'] = scenarios.to_list()[1:12] + pc_dalys_averted_by_cause = pc_dalys_averted_by_cause.set_index('scenario') + + # Create a plot of DALYs averted by cause + chosen_num_dalys_averted_by_cause = num_dalys_averted_by_cause[~num_dalys_averted_by_cause.index.isin(drop_scenarios)] + chosen_pc_dalys_averted_by_cause = pc_dalys_averted_by_cause[~pc_dalys_averted_by_cause.index.isin(drop_scenarios)] + name_of_plot = f'Additional DALYs averted vs Actual by cause - \n ({cause}), {target_period()}' + fig, ax = do_bar_plot_with_ci( + (chosen_num_dalys_averted_by_cause / 1e6).clip(lower=0.0), + annotations=[ + f"{round(row['mean'], 1)} % \n ({round(row['lower'], 1)}-{round(row['upper'], 1)}) %" + for _, row in chosen_pc_dalys_averted_by_cause.clip(lower=0.0).iterrows() + ], + xticklabels_horizontal_and_wrapped=False, + ) + if chosen_num_dalys_averted_by_cause.upper.max()/1e6 > 2: + y_limit = 8.5 + y_tick_gaps = 1 + else: + y_limit = 2.5 + y_tick_gaps = 0.5 + ax.set_title(name_of_plot) + ax.set_ylim(0, y_limit) + ax.set_yticks(np.arange(0, y_limit, y_tick_gaps)) + ax.set_ylabel(f'Additional DALYs averted \n(Millions)') + fig.tight_layout() + fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '').replace('/', '_').replace('\n', '')) + #fig.show() + plt.close(fig) + +''' +# PLot DALYs accrued by cause +for cause in top_10_causes_of_dalys: + name_of_plot = f'Total DALYs accrued by cause - \n {cause}, {target_period()}' + chosen_num_dalys_by_cause_summarized = 
num_dalys_by_cause_summarized[~num_dalys_by_cause_summarized.index.get_level_values(0).isin([4,5])] + chosen_num_dalys_by_cause_summarized = chosen_num_dalys_by_cause_summarized[chosen_num_dalys_by_cause_summarized.index.get_level_values(1) == cause] + fig, ax = do_bar_plot_with_ci( + (chosen_num_dalys_by_cause_summarized / 1e6).clip(lower=0.0), + annotations=[ + f"{round(row['mean'] / 1e6, 1)} \n ({round(row['lower'] / 1e6, 1)}-{round(row['upper'] / 1e6, 1)})" + for _, row in chosen_num_dalys_by_cause_summarized.clip(lower=0.0).iterrows() + ], + xticklabels_horizontal_and_wrapped=False, + ) + ax.set_title(name_of_plot) + if chosen_num_dalys_by_cause_summarized.upper.max()/1e6 > 5: + y_limit = 30 + y_tick_gap = 5 + else: + y_limit = 5 + y_tick_gap = 1 + ax.set_ylim(0, y_limit) + ax.set_yticks(np.arange(0, y_limit, y_tick_gap)) + ax.set_ylabel(f'Total DALYs accrued \n(Millions)') + fig.tight_layout() + fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '').replace('/', '_').replace('\n', '')) + fig.show() + plt.close(fig) + +# TODO Fix xticklabels in the plots above +''' + +# 1.3 Total DALYs averted per person +#---------------------------------------- +num_dalys_per_person_year = extract_results_by_person_year( + results_folder, + module='tlo.methods.healthburden', + key='dalys_stacked', + custom_generate_series=get_num_dalys_per_person_year, + ) + +num_dalys_averted_per_person_year = summarize( + -1.0 * + pd.DataFrame( + find_difference_relative_to_comparison( + num_dalys_per_person_year.loc[0], + comparison= 0) # sets the comparator to 0 which is the Actual scenario + ).T + ).iloc[0].unstack() +num_dalys_averted_per_person_year['scenario'] = scenarios.to_list()[1:12] +num_dalys_averted_per_person_year = num_dalys_averted_per_person_year.set_index('scenario') + +# Get percentage DALYs averted +pct_dalys_averted_per_person_year = 100.0 * summarize( + -1.0 * + pd.DataFrame( + find_difference_relative_to_comparison( + num_dalys_per_person_year.loc[0], + comparison= 0, # sets the comparator to 0 which is the Actual scenario + scaled=True) + ).T +).iloc[0].unstack() +pct_dalys_averted_per_person_year['scenario'] = scenarios.to_list()[1:12] +pct_dalys_averted_per_person_year = pct_dalys_averted_per_person_year.set_index('scenario') + +# %% Chart of number of DALYs averted +# Plot DALYS averted (with xtickabels horizontal and wrapped) +name_of_plot = f'Additional DALYs Averted Per Person vs Actual, \n {target_period()}' +chosen_num_dalys_averted_per_person_year = num_dalys_averted_per_person_year[~num_dalys_averted_per_person_year.index.isin(drop_scenarios)] +chosen_pct_dalys_averted_per_person_year = pct_dalys_averted_per_person_year[~pct_dalys_averted_per_person_year.index.isin(drop_scenarios)] +fig, ax = do_bar_plot_with_ci( + (chosen_num_dalys_averted_per_person_year).clip(lower=0.0), + annotations=[ + f"{round(row['mean'], 1)} % \n ({round(row['lower'], 1)}- \n {round(row['upper'], 1)}) %" + for _, row in chosen_pct_dalys_averted_per_person_year.clip(lower=0.0).iterrows() + ], + xticklabels_horizontal_and_wrapped=False, +) +ax.set_title(name_of_plot) +ax.set_ylim(0, 1.5) +ax.set_yticks(np.arange(0, 1.5, 0.2)) +ax.set_ylabel('Additional DALYs averted per person') +fig.tight_layout() +fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '').replace('\n', '')) +fig.show() +plt.close(fig) + +# 1.4 Total DALYs averted per person by cause +#------------------------------------------------- +for cause in top_10_causes_of_dalys: + 
num_dalys_per_person_year_by_cause = extract_results_by_person_year_by_cause( + results_folder, + module='tlo.methods.healthburden', + key='dalys_stacked', + custom_generate_series=get_num_dalys_per_person_year_by_cause, + cause = cause, + ) + + num_dalys_per_person_year_by_cause_pivoted = num_dalys_per_person_year_by_cause.unstack().reset_index().drop( + columns=['level_2']).set_index(['draw', 'run']) + num_dalys_averted_per_person_year_by_cause = summarize( + -1.0 * + pd.DataFrame( + find_difference_relative_to_comparison( + num_dalys_per_person_year_by_cause.squeeze(), + comparison=0) # sets the comparator to 0 which is the Actual scenario + ).T + ).iloc[0].unstack() + num_dalys_averted_per_person_year_by_cause['scenario'] = scenarios.to_list()[1:12] + num_dalys_averted_per_person_year_by_cause = num_dalys_averted_per_person_year_by_cause.set_index('scenario') + + # Get percentage DALYs averted + pct_dalys_averted_per_person_year_by_cause = 100.0 * summarize( + -1.0 * + pd.DataFrame( + find_difference_relative_to_comparison( + num_dalys_per_person_year_by_cause.squeeze(), + comparison=0, # sets the comparator to 0 which is the Actual scenario + scaled=True) + ).T + ).iloc[0].unstack() + pct_dalys_averted_per_person_year_by_cause['scenario'] = scenarios.to_list()[1:12] + pct_dalys_averted_per_person_year_by_cause = pct_dalys_averted_per_person_year_by_cause.set_index('scenario') + + # Create a plot of DALYs averted by cause + chosen_num_dalys_averted_per_person_year_by_cause = num_dalys_averted_per_person_year_by_cause[ + ~num_dalys_averted_per_person_year_by_cause.index.isin(drop_scenarios)] + chosen_pct_dalys_averted_per_person_year_by_cause = pct_dalys_averted_per_person_year_by_cause[~pct_dalys_averted_per_person_year_by_cause.index.isin(drop_scenarios)] + name_of_plot = f'Additional DALYs averted per person by cause - \n ({cause}), {target_period()}' + fig, ax = do_bar_plot_with_ci( + (chosen_num_dalys_averted_per_person_year_by_cause).clip(lower=0.0), + annotations=[ + f"{round(row['mean'], 1)} % \n ({round(row['lower'], 1)}-{round(row['upper'], 1)}) %" + for _, row in pct_dalys_averted_per_person_year_by_cause.clip(lower=0.0).iterrows() + ], + xticklabels_horizontal_and_wrapped=False, + ) + if chosen_num_dalys_averted_per_person_year_by_cause.upper.max() > 0.4: + y_limit = 0.55 + y_tick_gap = 0.1 + elif chosen_num_dalys_averted_per_person_year_by_cause.upper.max() > 0.18: + y_limit = 0.2 + y_tick_gap = 0.025 + else: + y_limit = 0.15 + y_tick_gap = 0.025 + ax.set_title(name_of_plot) + ax.set_ylim(0, y_limit) + ax.set_yticks(np.arange(0, y_limit, y_tick_gap)) + ax.set_ylabel(f'Additional DALYs averted per person') + fig.tight_layout() + fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '').replace('/', '_').replace('\n', '')) + #fig.show() + plt.close(fig) + +# 2. Health work time spent v DALYs accrued +############################################# +# DALYs averted per person on the Y-axis; Capacity of cadre used at levels 1a, 1b, and 2 on the Y-axis +# log['tlo.methods.healthsystem.summary']['Capacity_By_OfficerType_And_FacilityLevel']['OfficerType=Pharmacy|FacilityLevel=2'] +def get_capacity_used_by_cadre_and_level(_df): + """Return total number of DALYS (Stacked) by label (total within the TARGET_PERIOD). + Throw error if not a record for every year in the TARGET PERIOD (to guard against inadvertently using + results from runs that crashed mid-way through the simulation. 
+ """ + years_needed = [i.year for i in TARGET_PERIOD] + _df['year'] = _df.date.dt.year + #assert set(_df.year.unique()).issuperset(years_needed), "Some years are not recorded." + string_for_cols_to_drop1 = 'FacilityLevel=0|FacilityLevel=3|FacilityLevel=4|FacilityLevel=5' + string_for_cols_to_drop2 = 'OfficerType=DCSA|OfficerType=Dental|OfficerType=Laboratory|OfficerType=Mental|OfficerType=Nutrition|OfficerType=Radiography' + cols_to_drop1 = _df.columns[_df.columns.str.contains(string_for_cols_to_drop1)] + cols_to_drop2 = _df.columns[_df.columns.str.contains(string_for_cols_to_drop2)] + cols_to_drop = [*cols_to_drop1, *cols_to_drop2, 'year'] + return pd.Series( + data=_df + .loc[_df.year.between(*years_needed)] + .drop(columns= cols_to_drop) + .mean() + ) + +capacity_used = summarize(extract_results( + results_folder, + module='tlo.methods.healthsystem.summary', + key='Capacity_By_OfficerType_And_FacilityLevel', + custom_generate_series=get_capacity_used_by_cadre_and_level, + do_scaling = False, + )) + +#chosen_capacity_used.unstack().reset_index().drop(columns = ['level_2']).pivot(columns ='stat', index = 'draw') +for cadre_level in capacity_used.index: + print(cadre_level) + name_of_plot = f'Capacity used - \n {cadre_level}, {target_period()}' + scenarios_to_drop = capacity_used.columns[capacity_used.columns.get_level_values(0).isin([10])] + chosen_capacity_used = capacity_used.drop(columns = scenarios_to_drop) + chosen_capacity_used = chosen_capacity_used[chosen_capacity_used.index == cadre_level] + chosen_capacity_used = chosen_capacity_used.unstack().reset_index().drop(columns = ['level_2']).pivot(columns ='stat', index = 'draw').droplevel(0,axis = 1) + chosen_capacity_used['scenario'] = [*scenarios.to_list()[0:10], scenarios.to_list()[11]] # [*scenarios.to_list()[0:4],*scenarios.to_list()[6:10]] + #TODO fix above code to be automated + chosen_capacity_used = chosen_capacity_used.set_index('scenario') + fig, ax = do_bar_plot_with_ci( + (chosen_capacity_used), + annotations=[ + f"{round(row['mean'], 2)} \n ({round(row['lower'], 2)}-{round(row['upper'], 2)})" + for _, row in chosen_capacity_used.iterrows() + ], + xticklabels_horizontal_and_wrapped=False, + ) + ax.set_title(name_of_plot) + if chosen_capacity_used.upper.max() > 3: + y_limit = 3.5 + y_tick_gap = 0.5 + else: + y_limit = 2 + y_tick_gap = 0.25 + ax.set_ylim(0, y_limit) + ax.set_yticks(np.arange(0, y_limit, y_tick_gap)) + ax.set_ylabel(f'Capacity used \n (Proportion of capacity available)') + fig.tight_layout() + fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '').replace('/', '_').replace('\n', '_')) + fig.show() + plt.close(fig) + + +# %% Summarizing input resourcefile data + +# 1. 
Consumable availability by category and level
+#--------------------------------------------------
+tlo_availability_df = pd.read_csv(resourcefilepath / 'healthsystem' / 'consumables' / "ResourceFile_Consumables_availability_small.csv")
+
+# Attach district, facility level, program to this dataset
+mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv")
+districts = set(pd.read_csv(resourcefilepath / 'demography' / 'ResourceFile_Population_2010.csv')['District'])
+tlo_availability_df = tlo_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']],
+                                                on=['Facility_ID'], how='left')
+# Attach programs
+program_item_mapping = pd.read_csv(resourcefilepath / 'healthsystem' / 'consumables' / 'ResourceFile_Consumables_Item_Designations.csv')[['Item_Code', 'item_category']]
+program_item_mapping = program_item_mapping.rename(columns={'Item_Code': 'item_code'})[program_item_mapping.item_category.notna()]
+tlo_availability_df = tlo_availability_df.merge(program_item_mapping, on=['item_code'], how='left')
+
+# First a heatmap of current availability
+fac_levels = {'0': 'Health Post', '1a': 'Health Centers', '1b': 'Rural/Community \n Hospitals', '2': 'District Hospitals', '3': 'Central Hospitals', '4': 'Mental Hospital'}
+chosen_fac_levels_for_plot = ['0', '1a', '1b', '2', '3', '4']
+correct_order_of_levels = ['Health Post', 'Health Centers', 'Rural/Community \n Hospitals', 'District Hospitals', 'Central Hospitals', 'Mental Hospital']
+df_for_plots = tlo_availability_df[tlo_availability_df.Facility_Level.isin(chosen_fac_levels_for_plot)].copy()
+df_for_plots['Facility_Level'] = df_for_plots['Facility_Level'].map(fac_levels)
+
+scenario_list = [1, 2, 3, 6, 7, 8, 10, 11]
+chosen_availability_columns = ['available_prop'] + [f'available_prop_scenario{i}' for i in scenario_list]
+scenario_names_dict = {'available_prop': 'Actual', 'available_prop_scenario1': 'General consumables', 'available_prop_scenario2': 'Vital medicines',
+                       'available_prop_scenario3': 'Pharmacist- managed', 'available_prop_scenario4': 'Level 1b', 'available_prop_scenario5': 'CHAM',
+                       'available_prop_scenario6': '75th percentile facility', 'available_prop_scenario7': '90th percentile facility', 'available_prop_scenario8': 'Best facility',
+                       'available_prop_scenario9': 'Best facility (including DHO)', 'available_prop_scenario10': 'HIV supply chain', 'available_prop_scenario11': 'EPI supply chain',
+                       'available_prop_scenario12': 'HIV moved to Govt supply chain'}
+# recreate the chosen columns list based on the mapping above
+chosen_availability_columns = [scenario_names_dict[col] for col in chosen_availability_columns]
+df_for_plots = df_for_plots.rename(columns=scenario_names_dict)
+
+for avail_scenario in chosen_availability_columns:
+    # Generate a heatmap
+    # Pivot the DataFrame
+    aggregated_df = df_for_plots.groupby(['item_category', 'Facility_Level'])[avail_scenario].mean().reset_index()
+    heatmap_data = aggregated_df.pivot("item_category", "Facility_Level", avail_scenario)
+    heatmap_data = heatmap_data[correct_order_of_levels]  # Maintain the order
+
+    # Calculate the aggregate row and column
+    aggregate_col = aggregated_df.groupby('Facility_Level')[avail_scenario].mean()
+    aggregate_col = aggregate_col[correct_order_of_levels]
+    aggregate_row = aggregated_df.groupby('item_category')[avail_scenario].mean()
+    overall_aggregate = df_for_plots[avail_scenario].mean()
+
+    # Add aggregate row and column
+    heatmap_data['Average'] = aggregate_row
+    aggregate_col['Average'] = overall_aggregate
+    heatmap_data.loc['Average'] = aggregate_col
+
+    # Generate the heatmap
+    sns.set(font_scale=1.5)
+    plt.figure(figsize=(10, 8))
+    sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'})
+
+    # Customize the plot (title each figure with the name of the scenario column being plotted)
+    plt.title(avail_scenario)
+    plt.xlabel('Facility Level')
+    plt.ylabel('Disease/Public health \n program')
+    plt.xticks(rotation=90)
+    plt.yticks(rotation=0)
+
+    plt.savefig(figurespath / f'consumable_availability_heatmap_{avail_scenario}.png', dpi=300, bbox_inches='tight')
+    #plt.show()
+    plt.close()
+
+# TODO Justify the focus on levels 1a and 1b - where do HSIs occur?; at what level is there most misallocation within districts
+# TODO get graphs of percentage of successful HSIs under different scenarios for levels 1a and 1b
+# TODO is there a way to link consumables directly to DALYs (how many DALYs are lost due to stockouts of specific consumables)
+# TODO why are there no appointments at level 1b
+
+# 2. Consumable demand not met
+#-----------------------------------------
+# Number of units of item which were needed but not made available for the top 25 items
+# TODO ideally this should count the number of treatment IDs but this needs the detailed health system logger
+def consumables_availability_figure(results_folder: Path, output_folder: Path, resourcefilepath: Path):
+    """ 'Figure 3': Usage of consumables in the HealthSystem"""
+    make_graph_file_name = lambda stub: output_folder / "Fig3_consumables_availability_figure.png"  # noqa: E731
+
+    def get_counts_of_items_requested(_df):
+        _df = drop_outside_period(_df)
+
+        counts_of_available = defaultdict(int)
+        counts_of_not_available = defaultdict(int)
+
+        for _, row in _df.iterrows():
+            for item, num in row['Item_Available'].items():
+                counts_of_available[item] += num
+            for item, num in row['Item_NotAvailable'].items():  # eval(row['Item_NotAvailable'])
+                counts_of_not_available[item] += num
+
+        return pd.concat(
+            {'Available': pd.Series(counts_of_available), 'Not_Available': pd.Series(counts_of_not_available)},
+            axis=1
+        ).fillna(0).astype(int).stack()
+
+    cons_req = summarize(
+        extract_results(
+            results_folder,
+            module='tlo.methods.healthsystem.summary',
+            key='Consumables',
+            custom_generate_series=get_counts_of_items_requested,
+            do_scaling=True
+        ),
+        only_mean=True,
+        collapse_columns=True
+    )
+
+    cons = cons_req.unstack()
+    cons_names = pd.read_csv(
+        resourcefilepath / 'healthsystem' / 'consumables' / 'ResourceFile_Consumables_Items_and_Packages.csv'
+    )[['Item_Code', 'Items']].set_index('Item_Code').drop_duplicates()
+    cons_names.index = cons_names.index.astype(str)
+    cons = cons.merge(cons_names, left_index=True, right_index=True, how='left').set_index('Items')  # .astype(int)
+    cons = cons.assign(total=cons.sum(1)).sort_values('total').drop(columns='total')
+
+    cons.columns = pd.MultiIndex.from_tuples(cons.columns, names=['draw', 'stat', 'var'])
+    cons_not_available = cons.loc[:, cons.columns.get_level_values(2) == 'Not_Available']
+    # use a plain variable here (assigning to `.mean` would shadow DataFrame.mean)
+    cons_not_available_mean = cons_not_available.loc[:, cons_not_available.columns.get_level_values(1) == 'mean']
+    cons_available = cons.loc[:, cons.columns.get_level_values(2) == 'Available']
+
+    cons_not_available = cons_not_available.unstack().reset_index()
+    cons_not_available = cons_not_available.rename(columns={0: 'qty_not_available'})
+
+consumables_availability_figure(results_folder, outputspath, 
resourcefilepath) + +# TODO use squarify_plot to represent which consumables are most used in the system (by short Treatment_ID?) (not quantity but frequency) + +# HSI affected by missing consumables +# We need healthsystem logger for this + +# 3. Number of Health System Interactions +#----------------------------------------- +# HSIs taking place by level in the default scenario +def get_counts_of_hsis(_df): + _df = drop_outside_period(_df) + + # Initialize an empty dictionary to store the total counts + total_hsi_count = {} + + for date, appointment_dict in _df['Number_By_Appt_Type_Code_And_Level'].items(): + print(appointment_dict) + for level, appointments_at_level in appointment_dict.items(): + print(level, appointments_at_level) + total_hsi_count[level] = {} + for appointment_type, count in appointments_at_level.items(): + print(appointment_type, count) + if appointment_type in total_hsi_count: + total_hsi_count[level][appointment_type] += count + else: + total_hsi_count[level][appointment_type] = count + + total_hsi_count_series = pd.Series(total_hsi_count) + for level in ['0', '1a', '1b', '2', '3', '4']: + appointments_at_level = pd.Series(total_hsi_count_series[total_hsi_count_series.index == level].values[0], dtype='int') + # Create a list of tuples with the original index and the new level '1a' + new_index_tuples = [(idx, level) for idx in appointments_at_level.index] + # Create the new MultiIndex + new_index = pd.MultiIndex.from_tuples(new_index_tuples, names=['Appointment', 'Level']) + # Reindex the Series with the new MultiIndex + appointments_at_level_multiindex = appointments_at_level.copy() + appointments_at_level_multiindex.index = new_index + if level == '0': + appointments_all_levels = appointments_at_level_multiindex + else: + appointments_all_levels = pd.concat([appointments_all_levels, appointments_at_level_multiindex], axis = 0) + + return pd.Series(appointments_all_levels).fillna(0).astype(int) + +hsi_count = summarize( + extract_results( + results_folder, + module='tlo.methods.healthsystem.summary', + key='HSI_Event', + custom_generate_series=get_counts_of_hsis, + do_scaling=True + ), + only_mean=True, + collapse_columns=True +) + +hsi = hsi_count.assign(baseline_values=hsi_count[(0, 'mean')]).sort_values('baseline_values').drop(columns='baseline_values') +hsi.columns = pd.MultiIndex.from_tuples(hsi.columns, names=['draw', 'stat']) +#hsi = hsi.unstack().reset_index() +hsi_stacked = hsi.stack().stack().reset_index() +hsi_stacked = hsi_stacked.rename(columns={0: 'hsis_requested'}) + + +# 4.1 Number of Services delivered by long Treatment_ID +#------------------------------------------------------ +def get_counts_of_hsi_by_treatment_id(_df): + """Get the counts of the short TREATMENT_IDs occurring""" + _counts_by_treatment_id = _df \ + .loc[pd.to_datetime(_df['date']).between(*TARGET_PERIOD), 'TREATMENT_ID'] \ + .apply(pd.Series) \ + .sum() \ + .astype(int) + return _counts_by_treatment_id.groupby(level=0).sum() + +counts_of_hsi_by_treatment_id = summarize( + extract_results( + results_folder, + module='tlo.methods.healthsystem.summary', + key='HSI_Event', + custom_generate_series=get_counts_of_hsi_by_treatment_id, + do_scaling=True + ), + only_mean=True, + collapse_columns=True, +) + +counts_of_hsi_by_treatment_id = counts_of_hsi_by_treatment_id.assign(baseline_values=counts_of_hsi_by_treatment_id[(0, 'mean')]).sort_values('baseline_values').drop(columns='baseline_values') +hsi_by_treatment_id = counts_of_hsi_by_treatment_id.unstack().reset_index() 
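+# Give the flattened columns descriptive names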
+hsi_by_treatment_id = hsi_by_treatment_id.rename(columns={'level_2': 'Treatment_ID', 0: 'qty_of_HSIs'})
+
+# hsi[(0,'mean')].sum()/counts_of_hsi_by_treatment_id[(0,'mean')].sum()
+
+# 4.2 Number of Services delivered by short Treatment ID
+#--------------------------------------------------------
+def get_counts_of_hsi_by_short_treatment_id(_df):
+    """Get the counts of the short TREATMENT_IDs occurring (shortened, up to first underscore)"""
+    _counts_by_treatment_id = get_counts_of_hsi_by_treatment_id(_df)
+    _short_treatment_id = _counts_by_treatment_id.index.map(lambda x: x.split('_')[0] + "*")
+    return _counts_by_treatment_id.groupby(by=_short_treatment_id).sum()
+
+
+counts_of_hsi_by_treatment_id_short = summarize(
+    extract_results(
+        results_folder,
+        module='tlo.methods.healthsystem.summary',
+        key='HSI_Event',
+        custom_generate_series=get_counts_of_hsi_by_short_treatment_id,
+        do_scaling=True
+    ),
+    only_mean=True,
+    collapse_columns=True,
+)
+
+hsi_by_short_treatment_id = counts_of_hsi_by_treatment_id_short.unstack().reset_index()
+hsi_by_short_treatment_id = hsi_by_short_treatment_id.rename(columns={'level_2': 'Short_Treatment_ID', 0: 'qty_of_HSIs'})
+
+# Cost of consumables?
diff --git a/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py
new file mode 100644
index 0000000000..5a5acd6d14
--- /dev/null
+++ b/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py
@@ -0,0 +1,70 @@
+"""
+This file defines a batch run to calculate the health effect of updated consumable availability estimates
+as a result of a supply chain intervention. The following scenarios are considered:
+1. 'default' : the benchmark scenario with 2018 levels of consumable availability
+2. 'scenario1' : [Level 1a + 1b] All items perform as well as consumables other than drugs/diagnostic tests
+3. 'scenario2' : [Level 1a + 1b] 1 + All items perform as well as consumables classified as 'Vital' in the Essential Medicines List
+4. 'scenario3' : [Level 1a + 1b] 2 + All facilities perform as well as those in which consumable stock is managed by pharmacists
+5. 'scenario6' : [Level 1a + 1b] All facilities have the same probability of consumable availability as the 75th percentile best performing facility for each individual item
+6. 'scenario7' : [Level 1a + 1b] All facilities have the same probability of consumable availability as the 90th percentile best performing facility for each individual item
+7. 'scenario8' : [Level 1a + 1b] All facilities have the same probability of consumable availability as the 99th percentile best performing facility for each individual item
+8. 'scenario9' : [Level 1a + 1b + 2] All facilities have the same probability of consumable availability as the 99th percentile best performing facility for each individual item
+9. 'scenario10' : [Level 1a + 1b] All programs perform as well as HIV
+10. 'scenario11' : [Level 1a + 1b] All programs perform as well as EPI
+11. 'all' : all consumables are always available - provides the theoretical maximum health gains which can be made through improving consumable supply
+
+The batch runs are for a large population for a long time with all disease modules and full use of HSIs.
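+
+Draw numbers 0-10 map one-to-one, in the order listed above, onto the values passed to
+`cons_availability` in `draw_parameters` below ('default' for draw 0 through 'all' for draw 10).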
+Run on the batch system using: +```tlo batch-submit src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py``` +or locally using: + ```tlo scenario-run src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py``` +""" + +from pathlib import Path +from typing import Dict + +from tlo import Date, logging +from tlo.analysis.utils import get_parameters_for_status_quo, mix_scenarios +from tlo.methods.fullmodel import fullmodel +from tlo.scenario import BaseScenario + +class ImpactOfConsumablesScenarios(BaseScenario): + def __init__(self): + super().__init__() + self.seed = 99 + self.start_date = Date(2010, 1, 1) + self.end_date = Date(2019, 12, 31) + self.pop_size = 100_000 # large population size for final simulation - 100,000 + self.number_of_draws = 11 # <- 11 scenarios + self.runs_per_draw = 10 # <- repeated this many times + + def log_configuration(self): + return { + 'filename': 'impact_of_consumables_scenarios', + 'directory': './outputs', # <- (specified only for local running) + 'custom_levels': { + '*': logging.WARNING, + 'tlo.methods.demography': logging.INFO, + 'tlo.methods.healthburden': logging.INFO, + "tlo.methods.healthsystem.summary": logging.INFO, + } + } + + def modules(self): + return fullmodel(resourcefilepath=self.resources) + + def draw_parameters(self, draw_number, rng): + return { + 'HealthSystem': { + 'cons_availability': ['default', + 'scenario1', 'scenario2', 'scenario3', + 'scenario6', 'scenario7', 'scenario8', + 'scenario9', 'scenario10', 'scenario11', + 'all'][draw_number] + } + } + +if __name__ == '__main__': + from tlo.cli import scenario_run + + scenario_run([__file__]) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/prepare_hhfa_consumables_data_for_inferential_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/prepare_hhfa_consumables_data_for_inferential_analysis.py new file mode 100644 index 0000000000..6a496519e6 --- /dev/null +++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/prepare_hhfa_consumables_data_for_inferential_analysis.py @@ -0,0 +1,811 @@ +""" +This script prepares HHFA data on consumable availability for regression analysis on R. + +Inputs: +1. Raw HHFA data - Q1.dta (~Dropbox/Thanzi la Onse/07 - Data/HHFA_2018-19/0 raw/2_Final data/) +2. Cleaned variable names for HHFA data - variable_list.csv (~Dropbox/Thanzi la Onse/07 - Data/HHFA_2018-19/1 processing) +3. Relevant distance calculations for regression analysis - facility_distances_hhfa.csv (~Dropbox/Thanzi la Onse/07 - Data/HHFA_2018-19/2 clean/) +4. Consumable categorisations - items_hhfa.xlsx (~Dropbox/Thanzi la Onse/07 - Data/HHFA_2018-19/1 processing/) + +Outputs: +1. Cleaned consumable availability data ready for regression analysis - cleaned_hhfa_2019.csv (~Dropbox/Thanzi la Onse/07 - Data/HHFA_2018-19/2 clean/) + +Consumable availability is measured as probability of stockout at any point in time. 
+""" + +import calendar +import datetime +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from tabulate import tabulate +import copy + +from collections import defaultdict + +# Set local Dropbox source +path_to_dropbox = Path( # <-- point to the TLO dropbox locally + '/Users/sm2511/Dropbox/Thanzi la Onse' +) + +path_to_files_in_the_tlo_dropbox = path_to_dropbox / "07 - Data/HHFA_2018-19/" # <-- point to HHFA data folder in dropbox + +# define a timestamp for script outputs +timestamp = datetime.datetime.now().strftime("_%Y_%m_%d_%H_%M") + +# print the start time of the script +print('Script Start', datetime.datetime.now().strftime('%H:%M')) + +## 1. DATA IMPORT ## +######################################################################################### +raw_hhfa = pd.read_csv(path_to_files_in_the_tlo_dropbox / '0 raw/2_Final data/Q1.csv', + low_memory=False) # import 2018 data +varnames = pd.read_csv(path_to_files_in_the_tlo_dropbox / '1 processing/variable_list.csv', + encoding="ISO-8859-1") # import file with cleaned variable names + +# Rename HHFA columns using variable name mapping in loaded .csv +old_var_name = varnames['var'] +new_var_name = varnames['new_var_name'] + +hhfa = copy.deepcopy(raw_hhfa) +for i in range(len(new_var_name)): + if new_var_name[i] != np.nan: + hhfa.rename(columns={old_var_name[i]: new_var_name[i]}, + inplace=True) + else: + pass + +# Rename columns with missing data to "a" and then drop these columns since these will not be used in the analysis +hhfa.rename({np.nan: "a"}, axis="columns", inplace=True) +hhfa.drop(["a"], axis=1, inplace=True) + +## 2. FEATURE CLEANING/MANIPULATION ## +######################################################################################### +# Clean districts # +cond = hhfa['district'] == 'Blanytyre' +hhfa.loc[cond, 'district'] = 'Blantyre' + +cond = hhfa['district'] == 'Nkhatabay' +hhfa.loc[cond, 'district'] = 'Nkhata Bay' + +# Pvt for profit hospital incorrectly labelled District Hospital +cond = hhfa.fac_code == 5067 +hhfa.loc[cond, 'fac_type'] = 'Other Hospital' + +# Clean fac_owner +cond = hhfa.fac_owner == 'Private non profit' +hhfa.loc[cond, 'fac_owner'] = 'NGO' + +# Bed count = 0 if outpatient only +for var in ['bed_count', 'inpatient_visit_count', 'inpatient_days_count']: + cond = hhfa['outpatient_only'] == "Yes" + hhfa.loc[cond, var] = 0 + +# Number of functional computers = 0 if no functional computers +cond = hhfa.functional_computer == "No" +hhfa.loc[cond, 'functional_computer_no'] = 0 + +for var in ['functional_landline', 'fuctional_mobile', 'functional_radio', 'functional_computer']: + cond = hhfa[var].str.contains("Yes") + hhfa.loc[cond, var] = "Yes" + +# If facility has a function emergency vehicle, then it has an accessible emergency vehicle +cond = hhfa.functional_emergency_vehicle == "Yes" +hhfa.loc[cond, 'accessible_emergency_vehicle'] = "Yes" + +cond = hhfa.accessible_emergency_vehicle == "No" +hhfa.loc[cond, 'fuel_available_today'] = "No" + +cond = hhfa.accessible_emergency_vehicle == "No" +hhfa.loc[cond, 'purpose_last_vehicle_trip'] = "Not applicable" + +# Correct the "Don't knows" # +for var in ['fuel_available_today']: + cond = hhfa[var] == "98" + hhfa.loc[cond, var] = "Don't know" + +for var in ['functional_ambulance', 'functional_car', 'functional_bicycle', 'functional_motor_cycle', + 'functional_bike_ambulance']: + cond = hhfa[var] == "No" + var_no = var + '_no' + hhfa.loc[cond, var_no] = 0 + +# Daily opening hours can't be more than 24 hours 
+cond = hhfa.fac_daily_opening_hours > 24
+hhfa.loc[cond, 'fac_daily_opening_hours'] = 24
+
+# Water source within 500m if piped into facility or facility grounds
+hhfa.water_source_main = hhfa.water_source_main.str.lower()
+cond1 = hhfa['water_source_main'] == "piped into facility"
+cond2 = hhfa['water_source_main'] == "piped onto facility grounds"
+hhfa.loc[cond1 | cond2, 'water_source_main_within_500m'] = "Yes"
+
+# Edit water_source variable to reduce the number of categories
+cond_other_watersource = hhfa['water_source_main'].isin(['protected spring',
+                                                         'unprotected dug well',
+                                                         'unprotected spring',
+                                                         'tanker truck',
+                                                         'cart w/small tank/drum ………',
+                                                         'rainwater collection'])
+hhfa.loc[cond_other_watersource, 'water_source_main'] = 'other'
+
+# Convert water disruption duration to days
+cond_hours = hhfa['water_disruption_duration_units'] == "Hours"
+cond_weeks = hhfa['water_disruption_duration_units'] == "Weeks"
+cond_months = hhfa['water_disruption_duration_units'] == "Months"
+hhfa.loc[cond_hours, 'water_disruption_duration'] = hhfa['water_disruption_duration'] / 24
+hhfa.loc[cond_weeks, 'water_disruption_duration'] = hhfa['water_disruption_duration'] * 7
+hhfa.loc[cond_months, 'water_disruption_duration'] = hhfa['water_disruption_duration'] * 365 / 12
+
+# If no functional toilet, then no handwashing facility
+cond = hhfa.functional_toilet == "No"
+hhfa.loc[cond, 'functional_handwashing_facility'] = "No"
+
+# No CVD services at lower level facilities
+cond = hhfa.fac_type.str.contains("Hospital")
+hhfa.loc[~cond, 'service_cvd'] = "No"
+
+# Clean "other" category for how drug orders are placed
+hhfa.drug_resupply_calculation_system_other_spec = hhfa.drug_resupply_calculation_system_other_spec.str.lower()
+
+cond_notna = hhfa['drug_resupply_calculation_system_other_spec'].notna()
+cond_both = hhfa['drug_resupply_calculation_system_other_spec'].str.contains("both") | \
+            hhfa['drug_resupply_calculation_system_other_spec'].str.contains("for essentials") | \
+            hhfa['drug_resupply_calculation_system_other_spec'].str.contains("personal of emergency orders") | \
+            hhfa['drug_resupply_calculation_system_other_spec'].str.contains("pull & push")
+hhfa.loc[cond_both & cond_notna, 'drug_resupply_calculation_system'] = "Both push and pull"
+
+cond_pull = hhfa['drug_resupply_calculation_system_other_spec'].str.contains("buy") | \
+            hhfa['drug_resupply_calculation_system_other_spec'].str.contains("cstock") | \
+            hhfa['drug_resupply_calculation_system_other_spec'].str.contains("press") | \
+            hhfa['drug_resupply_calculation_system_other_spec'].str.contains("counting") | \
+            hhfa['drug_resupply_calculation_system_other_spec'].str.contains("the facility purchases") | \
+            hhfa['drug_resupply_calculation_system_other_spec'].str.contains("bought") | \
+            hhfa['drug_resupply_calculation_system_other_spec'].str.contains("purchased") | \
+            hhfa['drug_resupply_calculation_system_other_spec'].str.contains("private pharmacy")
+
+hhfa.loc[cond_pull & cond_notna, 'drug_resupply_calculation_system'] = "Facility itself (pull distribution system)"
+
+cond_push = 
hhfa['drug_resupply_calculation_system_other_spec'].str.contains("from the dho")
+hhfa.loc[
+    cond_push & cond_notna, 'drug_resupply_calculation_system'] = "A higher level facility (push distribution system)"
+
+# Clean "other" category for how drugs are transported
+# drug_transport_other_spec hasn't been cleaned as this hasn't been used in the final analysis
+
+# If a facility does not report whether a particular source of drugs is used but indicates
+# that another is used, mark the first as No
+source_drugs_varlist = ['source_drugs_cmst', 'source_drugs_local_warehouse', 'source_drugs_ngo',
+                        'source_drugs_donor', 'source_drugs_pvt']
+for source1 in source_drugs_varlist:
+    cond_source_empty = hhfa[source1].isna()
+    for source2 in source_drugs_varlist:
+        cond_source_other = hhfa[source2] == "Yes"
+        hhfa.loc[cond_source_empty & cond_source_other, source1] = "No"
+
+# If a facility does not report whether a particular drug transport system is used but indicates
+# that another is used, mark the first as No
+drug_transport_varlist = ['drug_transport_local_supplier', 'drug_transport_higher_level_supplier',
+                          'drug_transport_self', 'drug_transport_other']
+for transport1 in drug_transport_varlist:
+    cond_transport_empty = hhfa[transport1].isna()
+    for transport2 in drug_transport_varlist:
+        cond_transport_other = hhfa[transport2] == "Yes"
+        hhfa.loc[cond_transport_empty & cond_transport_other, transport1] = "No"
+
+# Drop outliers
+cond = hhfa.travel_time_to_district_hq > 1000
+hhfa.loc[cond, 'travel_time_to_district_hq'] = np.nan
+
+cond = hhfa.referrals_to_other_facs == "3"
+hhfa.loc[cond, 'referrals_to_other_facs'] = np.nan
+
+# Reduce the number of categories - whether functional refrigerator is available
+cond_yes1 = hhfa.functional_refrigerator_epi.str.contains("Yes")
+cond_yes2 = hhfa.vaccine_storage.str.contains("Yes")
+cond_notna = hhfa.functional_refrigerator_epi.notna()
+hhfa.loc[(cond_yes1 | cond_yes2) & cond_notna, 'functional_refrigerator_epi'] = "Yes"
+cond_no = hhfa.functional_refrigerator_epi.str.contains("No")
+hhfa.loc[cond_no & cond_notna, 'functional_refrigerator_epi'] = "No"
+
+cond_na = hhfa.service_epi == "No"
+hhfa.loc[cond_na, 'vaccine_storage'] = "No" # if no EPI services, vaccine storage is not needed
+
+cond_yes = hhfa.functional_refrigerator.str.replace(" ", "") == 'AVAILABLEANDFUNCTIONAL'
+hhfa.loc[cond_yes, 'functional_refrigerator'] = "Yes"
+cond_no1 = hhfa.functional_refrigerator.str.replace(" ", "") == 'NOTAVAILABLE'
+cond_no2 = hhfa.functional_refrigerator.str.replace(" ", "") == 'AVAILABLENOTFUNCTIONAL'
+hhfa.loc[cond_no1 | cond_no2, 'functional_refrigerator'] = "No"
+
+hhfa.functional_refrigerator_diagnostics = hhfa.functional_refrigerator_diagnostics.str.lower()
+cond_yes = hhfa.functional_refrigerator_diagnostics == "available and functional"
+cond_na = hhfa.functional_refrigerator_epi.isna()
+cond_notna = hhfa.functional_refrigerator_diagnostics.notna()
+hhfa.loc[cond_yes, 'functional_refrigerator_diagnostics'] = "Yes"
+
+# if the EPI section does not report on the refrigerator, use the report from the Diagnostics section
+hhfa.loc[cond_yes & cond_na, 'functional_refrigerator_epi'] = "Yes"
+cond_no = (hhfa.functional_refrigerator_diagnostics == "available not functional") | \
+          hhfa.functional_refrigerator_diagnostics.str.contains("not available")
+hhfa.loc[cond_no, 'functional_refrigerator_diagnostics'] = "No"
+hhfa.loc[cond_no & cond_na, 'functional_refrigerator_epi'] = "No"
+
+# Aggregate refrigerator availability
+cond_fridge_epi = 
hhfa.functional_refrigerator_epi == "Yes"
+cond_fridge_diag = hhfa.functional_refrigerator_diagnostics == "Yes"
+cond_vaccine_storage = hhfa.vaccine_storage == "Yes"
+hhfa.loc[cond_fridge_epi | cond_fridge_diag | cond_vaccine_storage, 'functional_refrigerator'] = "Yes"
+cond_no_fridge_epi = hhfa.functional_refrigerator_epi == "No"
+cond_no_fridge_diag = hhfa.functional_refrigerator_diagnostics == "No"
+hhfa.loc[cond_no_fridge_epi & cond_no_fridge_diag, 'functional_refrigerator'] = "No"
+cond_no_vaccine_storage = hhfa.vaccine_storage == "No"
+cond_fridge_na = hhfa.functional_refrigerator.isna()
+hhfa.loc[cond_no_vaccine_storage & cond_fridge_na, 'functional_refrigerator'] = "No"
+
+# convert fac_location to binary (Yes/No)
+hhfa = hhfa.rename(columns={'fac_location': 'fac_urban'})
+cond1 = hhfa.fac_urban.str.lower() == "rural"
+hhfa.loc[cond1, 'fac_urban'] = 'No'
+cond2 = hhfa.fac_urban.str.lower() == "urban"
+hhfa.loc[cond2, 'fac_urban'] = 'Yes'
+
+# Clean water disruption variable
+cond = hhfa['water_disruption_last_3mts'] == "No"
+hhfa.loc[cond, 'water_disruption_duration'] = 0
+
+# Change incharge_drug_orders to lower case
+hhfa.incharge_drug_orders = hhfa.incharge_drug_orders.str.lower()
+
+# Clean "other" category for incharge_drug_orders
+incharge_drug_orders_other_mapping = pd.read_csv(path_to_files_in_the_tlo_dropbox / '1 processing/incharge_drug_orders_other_mapping.csv')
+hhfa = pd.merge(hhfa, incharge_drug_orders_other_mapping[['incharge_drug_orders_other_spec',
+                                                          'cleaned_incharge_drug_orders']],
+                on = 'incharge_drug_orders_other_spec',
+                how = 'left')
+cond = hhfa.incharge_drug_orders == 'other'
+hhfa.loc[cond, 'incharge_drug_orders'] = hhfa['cleaned_incharge_drug_orders']
+
+## 3. CREATE CONSUMABLE AVAILABILITY DATAFRAME ##
+#########################################################################################
+# --- 3.1 Extract dataframe containing consumable availability from the raw HHFA dataframe --- #
+# Rename columns using the variable name mapping in the loaded .csv
+consumables = copy.deepcopy(raw_hhfa)
+for i in range(len(varnames['var'])):
+    # if HHFA variable has been mapped to a consumable and is not an equipment
+    if pd.notna(varnames['item'][i]) and pd.isna(varnames['equipment'][i]):
+        consumables.rename(columns={varnames['var'][i]: varnames['availability_metric'][i] + '_' + varnames['item'][i]},
+                           inplace=True)
+    elif varnames['var'][i] == 'Fcode': # keep facility code variable
+        consumables.rename(columns={varnames['var'][i]: varnames['new_var_name'][i]},
+                           inplace=True)
+
+    # Mark all other unmapped variables to be dropped
+    else:
+        consumables.rename(columns={varnames['var'][i]: 'Drop'},
+                           inplace=True)
+
+consumables = consumables.drop(columns='Drop')
+
+# Check that there are no duplicate consumable + metric entries
+assert len(consumables.columns[consumables.columns.duplicated(keep='last')]) == 0
+
+# --- 3.2 Convert availability categories to binary entries --- #
+# - 3.2.1 List of columns asking about consumable availability on the day of survey - #
+today_cols = [col for col in consumables.columns if 'today' in col]
+# Get list of availability entries
+i = 0
+for col in today_cols:
+    consumables[col] = consumables[col].str.replace(" ", "")
+    if i == 0:
+        today_categories = consumables[col].unique()
+    else:
+        today_categories = np.append(today_categories, consumables[col].unique())
+    i = 1
+
+today_categories = list(dict.fromkeys(today_categories))
+today_categories = [x for x in today_categories if pd.isnull(x) == False] # drop nan from list
+
+# Create dictionary to 
map availability options to a number
+today_categories_dict = {'NEVERAVAILABLE': -1}
+for i in ['NOTAVAILABLE', 'No', 'AVAILABLENOTFUNCTIONAL', 'NOTAVAILABLETODAY', 'AVAILABLENONVALID']:
+    today_categories_dict[i] = 0
+for i in ['AVAILABLE,OBSERVED', 'AVAILABLEANDFUNCTIONAL', 'ATLEASTONEVALID']:
+    today_categories_dict[i] = 1
+for i in ['AVAILABLE,NOTOBSERVED', 'Yes', "AVAILABLEDON'TKNOWIFFUNCTIONAL", 'REPORTEDAVAILABLEBUTNOTSEEN']:
+    today_categories_dict[i] = 2
+
+# Assert that every observed category is covered by the dictionary
+today_mapping = {k: today_categories_dict[k] for k in today_categories_dict.keys() & set(today_categories)}
+assert len(today_mapping) == len(today_categories)
+
+# - 3.2.2 List of columns asking about consumable availability during the 3 months before survey - #
+last_mth_or_3mts_cols = [col for col in consumables.columns if 'last' in col]
+# Get list of availability entries
+i = 0
+for col in last_mth_or_3mts_cols:
+    consumables[col] = consumables[col].str.replace(" ", "")
+    if i == 0:
+        last_mth_or_3mts_cols_categories = consumables[col].unique()
+    else:
+        last_mth_or_3mts_cols_categories = np.append(last_mth_or_3mts_cols_categories, consumables[col].unique())
+    i = 1
+
+last_mth_or_3mts_cols_categories = list(dict.fromkeys(last_mth_or_3mts_cols_categories))
+last_mth_or_3mts_cols_categories = [x for x in last_mth_or_3mts_cols_categories if
+                                    pd.isnull(x) == False] # drop nan from list
+
+# Create dictionary to map availability options to a number
+last_mth_or_3mts_cols_categories_dict = {'PRODUCTNOTOFFERED': -1}
+for i in ['NOTINDICATED', 'FACILITYRECORDNOTAVAILABLE']:
+    last_mth_or_3mts_cols_categories_dict[i] = -1
+for i in ['STOCK-OUTINTHEPAST3MONTHS', 'Yes']:
+    last_mth_or_3mts_cols_categories_dict[i] = 0
+for i in ['NOSTOCK-OUTINPAST3MONTHS', 'No']:
+    last_mth_or_3mts_cols_categories_dict[i] = 1
+
+# Assert that every observed category is covered by the dictionary
+last_mth_or_3mts_mapping = {k: last_mth_or_3mts_cols_categories_dict[k] for k in
+                            last_mth_or_3mts_cols_categories_dict.keys() & set(last_mth_or_3mts_cols_categories)}
+assert len(last_mth_or_3mts_mapping) == len(last_mth_or_3mts_cols_categories)
+
+# - 3.2.3 Recode all availability variables - #
+consumables_num = copy.deepcopy(consumables)
+for col in today_cols:
+    consumables_num[col] = consumables[col].map(today_categories_dict).fillna(consumables[col])
+for col in last_mth_or_3mts_cols:
+    consumables_num[col] = consumables[col].map(last_mth_or_3mts_cols_categories_dict).fillna(consumables[col])
+    # treat 'not indicated' and 'record not available' responses as missing
+    cond1 = (consumables[col] == 'NOTINDICATED')
+    cond2 = (consumables[col] == 'FACILITYRECORDNOTAVAILABLE')
+    consumables_num.loc[cond1 | cond2, col] = np.nan
+
+# - 3.2.4 Convert numeric availability variable to binary - #
+consumables_bin = copy.deepcopy(consumables_num)
+for col in today_cols:
+    cond0 = consumables_bin[col] == -1
+    cond1 = consumables_bin[col] == 2
+    consumables_bin.loc[cond0, col] = 0
+    consumables_bin.loc[cond1, col] = 1
+for col in last_mth_or_3mts_cols:
+    cond0 = consumables_bin[col] == -1
+    consumables_bin.loc[cond0, col] = 0
+
+## 4. 
RESHAPE DATA AND DROP DUPLICATE CONSUMABLE ENTRIES ##
+#########################################################################################
+consumables_long = pd.melt(consumables_bin, id_vars='fac_code', value_vars=today_cols,
+                           var_name='item', value_name='value')
+
+consumables_long = consumables_long.rename(columns={'value': 'available'})
+
+# Split consumable variable name into consumable name (item) and availability metric used
+consumables_long['temp'] = consumables_long.item.str.rfind('_', start=0, end=100)
+consumables_long['metric'] = consumables_long.apply(lambda x: x['item'][:x['temp']], axis=1)
+consumables_long['item'] = consumables_long.apply(lambda x: x['item'][x['temp'] + 1:], axis=1)
+consumables_long = consumables_long.drop(columns='temp')
+
+# For items which are duplicated, keep the metric under which more facilities have reported
+# Get list of items which are duplicated
+duplicated_items = consumables_long[consumables_long.duplicated(['item', 'fac_code'], keep=False)].item.unique()
+cond1 = consumables_long.item.isin(duplicated_items)
+cond2 = consumables_long.available.notna()
+
+# Generate variable denoting number of reporting facilities
+aggregated = consumables_long[cond2].groupby(['item', 'metric']).nunique()['fac_code']
+aggregated.name = 'report_count'
+consumables_long = consumables_long.join(aggregated, on=['item', 'metric'])
+
+# Sort in descending order of reporting facilities
+consumables_long = consumables_long.sort_values(['item', 'report_count', 'fac_code'], axis=0,
+                                                ascending=[True, False, True])
+
+# Drop duplicates based on number of reporting facilities
+# so that if there are two records of the same consumable, the one with more reported observations is preserved
+consumables_long_unique = copy.deepcopy(consumables_long)
+consumables_long_unique.drop_duplicates(subset=['item', 'fac_code'], keep='first', inplace=True)
+
+# Assert that all duplicates have been addressed
+assert len(consumables_long_unique[consumables_long_unique.duplicated(['item', 'fac_code'], keep=False)].item.unique()) == 0
+print(len(consumables_long[consumables_long.duplicated(['item', 'fac_code'], keep=False)].item.unique()),
+      "duplicate items were reduced to",
+      len(consumables_long_unique[consumables_long_unique.duplicated(['item', 'fac_code'], keep=False)].item.unique()))
+
+## 5. 
ACCOUNT FOR SUBSTITUTABLE CONSUMABLES ## +######################################################################################### +# --- 5.1 ART components --- +# Component 1: AZT or TDF or d4T +cond1 = consumables_long_unique['item'].isin(['Zidovudine (ZDV, AZT)', + 'Zidovudine (ZDV, AZT) syrup (ARVs)', + 'Tenofovir Disoproxil Fumarate (TDF) (ARVs)', + 'Zidovudine + Lamivudine (AZT + 3TC) (ARVs)', + 'Zidovudine + Lamivudine + Abacavir (AZT + 3TC + ABC) (ARVs)', + 'Zidovudine + Lamivudine + Nevirapine (AZT + 3TC + NVP) (ARVs)', + 'Tenofovir + Emtricitabine (TDF + FTC) (ARVs)', + 'Tenofovir + Lamivudine (TDF + 3TC) (ARVs)', + 'Tenofovir + Lamivudine + Efavirenz (TDF + 3TC + EFV) (ARVs)', + 'Tenofovir + Emtricitabine + Efavirenz (TDF + FTC + EFV)', + 'Stavudine 30 or 40 (D4T) (ARVs)', + 'Stavudine syrup (ARVs)', + 'Stavudine + Lamivudine (D4T + 3TC) (ARVs)', + 'Stavudine + Lamivudine + Nevirapine (D4T + 3TC + NVP) (ARVs)']) +## Component 2: 3TC or FTC +cond2 = consumables_long_unique['item'].isin(['Lamivudine (3TC) (ARVs)', + 'Emtricitabine (FTC) (ARVs)', + 'Lamivudine + Abacavir (3TC + ABC)', + 'Zidovudine + Lamivudine (AZT + 3TC) (ARVs)', + 'Zidovudine + Lamivudine + Abacavir (AZT + 3TC + ABC) (ARVs)', + 'Zidovudine + Lamivudine + Nevirapine (AZT + 3TC + NVP) (ARVs)', + 'Tenofovir + Emtricitabine (TDF + FTC) (ARVs)', + 'Tenofovir + Lamivudine (TDF + 3TC) (ARVs)', + 'Tenofovir + Lamivudine + Efavirenz (TDF + 3TC + EFV) (ARVs)', + 'Tenofovir + Emtricitabine + Efavirenz (TDF + FTC + EFV)', + 'Lamivudine (3TC) syrup (ARVs)', + 'Stavudine + Lamivudine (D4T + 3TC) (ARVs)', + 'Stavudine + Lamivudine + Nevirapine (D4T + 3TC + NVP) (ARVs)']) +## Component 3: Protease inhibitor +cond3 = consumables_long_unique['item'].isin(['Abacavir (ABC) (ARVs)', + 'Nevirapine (NVP) (ARVs)', + 'Nevirapine (NVP) syrup (ARVs)', + 'Efavirenz (EFV) (ARVs)', + 'Lamivudine + Abacavir (3TC + ABC)', + 'Zidovudine + Lamivudine + Abacavir (AZT + 3TC + ABC) (ARVs)', + 'Zidovudine + Lamivudine + Nevirapine (AZT + 3TC + NVP) (ARVs)', + 'Tenofovir + Lamivudine + Efavirenz (TDF + 3TC + EFV) (ARVs)', + 'Tenofovir + Emtricitabine + Efavirenz (TDF + FTC + EFV)', + 'Efavirenz (EFV) syrup (ARVs)', + 'Stavudine + Lamivudine + Nevirapine (D4T + 3TC + NVP) (ARVs)', + 'Lopinavir (LPV) (protease inhibitors)', + 'Indinavir (IDV) (protease inhibitors)', + 'Nelfinavir (NFV) (protease inhibitors)', + 'Saquinavir (SQV) (protease inhibitors)', + 'Ritonavir (RTV) (protease inhibitors)', + 'Atazanavir (ATV) (protease inhibitors)', + 'Fosamprenavir (FPV) (protease inhibitors)', + 'Tipranavir (TPV) (protease inhibitors)', + 'Darunavir (DPV) (protease inhibitors)']) + +# Component 1 +art_component_1 = consumables_long_unique[cond1].copy() +art_component_1.loc[:,'item'] = 'art_component_1' +art_component_1 = art_component_1.groupby( + ['fac_code', 'item'], + as_index=False).agg({'available': np.nanmax, + 'metric': 'first'}) + +# Component 2 +art_component_2 = consumables_long_unique[cond2].copy() +art_component_2['item'] = 'art_component_2' +art_component_2 = art_component_2.groupby( + ['fac_code', 'item'], + as_index=False).agg({'available': np.nanmax, + 'metric': 'first'}) + +# Component 3 +art_component_3 = consumables_long_unique[cond3].copy() +art_component_3['item'] = 'art_component_3' +art_component_3 = art_component_3.groupby( + ['fac_code', 'item'], + as_index=False).agg({'available': np.nanmax, + 'metric': 'first'}) + +# Append all datasets +art = pd.concat([art_component_1, art_component_2, art_component_3]) + +consumables_postart = 
pd.concat([art,consumables_long_unique[~cond1 & ~cond2 & ~cond3]]) + +print(consumables_postart.item.nunique(), + "left out of", + consumables_long_unique.item.nunique(), + "items left after accounting for ART substitutes") + +# --- 5.2 Tuberculosis treatment substitutes --- # +# Combinations used as per https://stoptb.org/assets/documents/gdf/whatis/faq-brochure.pdf = RHZE, RHZ, RH, EH, TH +# Component 1: Isoniazid +cond1 = consumables_postart['item'].isin(['Isoniazid', + 'Isoniazid + Rifampicin (2FDC)', + 'Isoniazid + Ethambutol (EH) (2FDC)', + 'Isoniazid + Rifampicin + Pyrazinamide (RHZ) (3FDC)', + 'Isoniazid + Rifampicin + Ethambutol (RHE) (3FDC)', + 'Isoniazid + Rifampicin + Pyrazinamide + Ethambutol (4FDC)']) + +# Component 2: Rifampicin +cond2 = consumables_postart['item'].isin(['Rifampicin', + 'Isoniazid + Rifampicin (2FDC)', + 'Isoniazid + Rifampicin + Pyrazinamide (RHZ) (3FDC)', + 'Isoniazid + Rifampicin + Ethambutol (RHE) (3FDC)', + 'Isoniazid + Rifampicin + Pyrazinamide + Ethambutol (4FDC)']) + +# Component 3: Pyrazinamide +cond3 = consumables_postart['item'].isin(['Pyrazinamide', + 'Isoniazid + Rifampicin + Pyrazinamide (RHZ) (3FDC)', + 'Isoniazid + Rifampicin + Pyrazinamide + Ethambutol (4FDC)']) + +# Component 4: Ethambutol +cond4 = consumables_postart['item'].isin(['Ethambutol', + 'Isoniazid + Ethambutol (EH) (2FDC)', + 'Isoniazid + Rifampicin + Ethambutol (RHE) (3FDC)', + 'Isoniazid + Rifampicin + Pyrazinamide + Ethambutol (4FDC)']) + +# Component 1 +tb_component_1 = consumables_postart[cond1].copy() +tb_component_1['item'] = 'Isoniazid' +tb_component_1 = tb_component_1.groupby( + ['fac_code', 'item'], + as_index=False).agg({'available': np.nanmax, + 'metric': 'first'}) +# Component 2 +tb_component_2 = consumables_postart[cond2].copy() +tb_component_2['item'] = 'Rifampicin' +tb_component_2 = tb_component_2.groupby( + ['fac_code', 'item'], + as_index=False).agg({'available': np.nanmax, + 'metric': 'first'}) +# Component 3 +tb_component_3 = consumables_postart[cond3].copy() +tb_component_3['item'] = 'Pyrazinamide' +tb_component_3 = tb_component_3.groupby( + ['fac_code', 'item'], + as_index=False).agg({'available': np.nanmax, + 'metric': 'first'}) +# Component 4 +tb_component_4 = consumables_postart[cond4].copy() +tb_component_4['item'] = 'Ethambutol' +tb_component_4 = tb_component_4.groupby( + ['fac_code', 'item'], + as_index=False).agg({'available': np.nanmax, + 'metric': 'first'}) + +# Append all datasets +tb = pd.concat([tb_component_1, tb_component_2, tb_component_3, tb_component_4]) + +consumables_posttb = pd.concat([tb, consumables_postart[~cond1 & ~cond2 & ~cond3 & ~cond4]]) + +print(consumables_posttb.item.nunique(), + "left out of", + consumables_long_unique.item.nunique(), + "items left after accounting for ART and TB substitutes") + +# --- 5.3 Iron and folic acid --- # +# Component 1: Iron +cond1 = consumables_posttb['item'].isin(['Iron tablet', + 'Iron and folic combined tablets']) + +# Component 2: Folic Acid +cond2 = consumables_posttb['item'].isin(['Folic acid tablet', + 'Iron and folic combined tablets']) + +# Component 1 +fefo_component_1 = consumables_posttb[cond1].copy() +fefo_component_1['item'] = 'Iron tablet' +fefo_component_1 = fefo_component_1.groupby( + ['fac_code', 'item'], + as_index=False).agg({'available': np.nanmax, + 'metric': 'first'}) +# Component 2 +fefo_component_2 = consumables_posttb[cond2].copy() +fefo_component_2['item'] = 'Folic acid tablet' +fefo_component_2 = fefo_component_2.groupby( + ['fac_code', 'item'], + 
as_index=False).agg({'available': np.nanmax, + 'metric': 'first'}) + +# Append all datasets +fefo = pd.concat([fefo_component_1, fefo_component_2]) + +consumables_postfefo = pd.concat([fefo,consumables_posttb[~cond1 & ~cond2]]) + +print(consumables_postfefo.item.nunique(), + "left out of", + consumables_long_unique.item.nunique(), + "items left after accounting for ART, TB, and FeFo substitutes") + +# --- 5.4 Other substitutes --- # +# Merge in information on which consumables are substitutable for each other +item_categories = pd.read_excel(path_to_files_in_the_tlo_dropbox / '1 processing/items_hhfa.xlsx', + index_col=0, sheet_name = "items_hhfa") + +cols = ['item', 'substitute_group'] +consumables_postfefo = pd.merge(consumables_postfefo,item_categories[cols], how = 'left', on = 'item') +cond = consumables_postfefo.substitute_group == 'manual fix' +consumables_postfefo.loc[cond, 'substitute_group'] = np.nan + +j = 0 +for i in consumables_postfefo.substitute_group.unique(): + cond = consumables_postfefo.substitute_group == i + sub_group = consumables_postfefo[cond].copy() + sub_group = sub_group.groupby(['fac_code', 'substitute_group'], as_index=False).agg({'available': np.nanmax, + 'metric': 'first', + 'item': 'first'}) + if j == 0: + sub_groups_all = sub_group + j = 1 + else: + sub_groups_all = pd.concat([sub_groups_all, sub_group]) + +# Append with the remaining data +consumables_final = consumables_postfefo[consumables_postfefo.substitute_group.isna()] +consumables_final = pd.concat([sub_groups_all, consumables_final]) + +print(consumables_final.item.nunique(), + "left out of", + consumables_long_unique.item.nunique(), + "items left after accounting for all substitutes") + +## 6. PREPARE DATA FOR REGRESSION ## +######################################################################################### +# Keep columns to be analysed +feature_cols = ['fac_code', 'fac_name', + # Main characteristics + 'district', 'fac_type','fac_owner', 'fac_urban', + + # types of services offered + 'outpatient_only','bed_count', 'inpatient_visit_count','inpatient_days_count', + 'service_fp','service_anc','service_pmtct','service_delivery','service_pnc','service_epi','service_imci', + 'service_hiv','service_tb','service_othersti','service_malaria','service_blood_transfusion','service_diagnostic', + 'service_cvd','service_chronic_respiratory_mgt', 'service_consumable_stock', + + # operation frequency + 'fac_weekly_opening_days','fac_daily_opening_hours', + + # utilities/facilities available + 'functional_landline','fuctional_mobile','functional_radio','functional_computer','functional_computer_no', + 'internet_access_today', + 'electricity', 'water_source_main','water_source_main_within_500m', # this is in minutes + 'functional_toilet','functional_handwashing_facility', + 'water_disruption_last_3mts','water_disruption_duration', # converted to days + + # vehicles available + 'functional_emergency_vehicle','accessible_emergency_vehicle','fuel_available_today','purpose_last_vehicle_trip', + + 'functional_ambulance', 'functional_ambulance_no', # Keep one of the two variables + 'functional_car', 'functional_car_no', + 'functional_motor_cycle', 'functional_motor_cycle_no', + 'functional_bike_ambulance', 'functional_bike_ambulance_no', + 'functional_bicycle', 'functional_bicycle_no', + + 'vaccine_storage','functional_refrigerator', + + # Drug ordering process + 'incharge_drug_orders','drug_resupply_calculation_system','drug_resupply_calculation_method','source_drugs_cmst', + 
'source_drugs_local_warehouse','source_drugs_ngo','source_drugs_donor','source_drugs_pvt', + + 'drug_transport_local_supplier','drug_transport_higher_level_supplier','drug_transport_self','drug_transport_other', + + 'drug_order_fulfilment_delay','drug_order_fulfilment_freq_last_3mts', + 'transport_to_district_hq','travel_time_to_district_hq', + + # referral system + 'referral_system_from_community','referrals_to_other_facs', + + # Drug management + 'mathealth_label_and_expdate_visible', 'mathealth_expdate_fefo', + 'childhealth_label_and_expdate_visible', 'childhealth_expdate_fefo'] + +# Merge consumable availability data +merged_df1 = pd.merge(consumables_final,hhfa[feature_cols], how = 'left', on = 'fac_code') + +# Merge distances data +fac_gis = pd.read_csv(path_to_files_in_the_tlo_dropbox / "2 clean/facility_distances_hhfa.csv") +merged_df2 = pd.merge(merged_df1, fac_gis[['fac_code', 'lat', 'long', 'lat_dh', 'long_dh', 'lat_rms', 'long_rms','rms', + 'dist_todh', 'dist_torms', 'drivetime_todh', 'drivetime_torms']], how = 'left', on = 'fac_code') + +# Merge item categories +item_categories = pd.read_excel(path_to_files_in_the_tlo_dropbox / '1 processing/items_hhfa.xlsx', + index_col=0, sheet_name = "items_hhfa") + +cols = ['drug_class_rx_list', 'item', 'mode_administration', 'program', 'item_type'] +df_for_regression = pd.merge(merged_df2,item_categories[cols], how = 'left', on = 'item') + +# Add information on program and other categorisations where this is not available +cond = df_for_regression.item.str.contains('art_component') +df_for_regression.loc[cond, 'program'] = 'hiv' +df_for_regression.loc[cond, 'mode_administration'] = 'oral' +df_for_regression.loc[cond, 'drug_class_rx_list'] = 'antiretroviral' +df_for_regression.loc[cond, 'item_type'] = 'drug' + +# Clean program to reduce the number of categories +cond_resp = df_for_regression.program == 'respiratory illness' +df_for_regression.loc[cond_resp,'program'] = 'acute lower respiratory infections' + +cond_road = df_for_regression.program == 'road traffic injuries' +df_for_regression.loc[cond_road,'program'] = 'surgical' + +cond_fungal = df_for_regression.program == 'fungal infection' +df_for_regression.loc[cond_fungal,'program'] = 'other' + +cond_iv = df_for_regression.program == 'IV and injectables' +df_for_regression.loc[cond_iv,'program'] = 'general' + +cond_ncd1 = df_for_regression.program == 'hypertension' +cond_ncd2 = df_for_regression.program == 'diabetes' +df_for_regression.loc[cond_ncd1|cond_ncd2,'program'] = 'ncds' + +cond_vit = df_for_regression.item == 'Vitamin A (retinol) capsule' +df_for_regression.loc[cond_vit, 'program'] = 'obstetric and newborn care' + +cond_mvit = df_for_regression.item == 'Multivitamins' +df_for_regression.loc[cond_mvit, 'program'] = 'general' + +cond_rutf = df_for_regression.item == 'Ready to use therapeutic food(RUTF)' +df_for_regression.loc[cond_rutf, 'program'] = 'child health' + +# Reduce the number of categories in mode_administration +cond = df_for_regression['mode_administration'] == "implant" +df_for_regression.loc[cond, 'mode_administration'] = "other" + +# Clean facility level +df_for_regression = df_for_regression.rename(columns = {'fac_type': 'fac_type_original'}) +df_for_regression['fac_type'] = "" + +cond_mch = (df_for_regression['fac_name'].str.contains('Mzuzu Cental Hospital')) +df_for_regression.loc[cond_mch, 'fac_name'] = 'Mzuzu Central Hospital' + +cond_level0 = (df_for_regression['fac_name'].str.contains('Health Post')) | \ + 
(df_for_regression['fac_type_original'].str.contains('Health Post')) +cond_level1a = (df_for_regression['fac_type_original'] == 'Clinic') | (df_for_regression['fac_type_original'] == 'Health Centre') | \ + (df_for_regression['fac_type_original'].str.contains('Dispensary')) | \ + (df_for_regression['fac_type_original'].str.contains('Maternity')) +cond_level1b = (df_for_regression['fac_type_original'].str.contains('Community Hospital')) | \ + (df_for_regression['fac_type_original'] == 'Other Hospital') +cond_level2 = (df_for_regression['fac_type_original'] == 'District Hospital') +cond_level3 = df_for_regression.fac_name.str.contains("Central Hospit") +cond_level4 = df_for_regression.fac_name.str.contains("Mental Hospit") + +df_for_regression.loc[cond_level0,'fac_type'] = 'Facility_level_0' +df_for_regression.loc[cond_level1a,'fac_type'] = 'Facility_level_1a' +df_for_regression.loc[cond_level1b,'fac_type'] = 'Facility_level_1b' +df_for_regression.loc[cond_level2,'fac_type'] = 'Facility_level_2' +df_for_regression.loc[cond_level3,'fac_type'] = 'Facility_level_3' +df_for_regression.loc[cond_level4,'fac_type'] = 'Facility_level_4' + +# Sort by facility type +df_for_regression['fac_type_original'] = pd.Categorical(df_for_regression['fac_type_original'], ['Health Post', + 'Dispensary', + 'Maternity', + 'Clinic', + 'Health Centre', + 'Rural/Community Hospital', + 'Other Hospital', + 'District Hospital', + 'Central Hospital']) +df_for_regression.sort_values(['fac_name','fac_type_original'], inplace = True) + +# Convert binary variables to numeric +fac_vars_binary = ['outpatient_only', 'fac_urban', + 'service_fp','service_anc','service_pmtct','service_delivery','service_pnc','service_epi','service_imci', + 'service_hiv','service_tb','service_othersti','service_malaria','service_blood_transfusion','service_diagnostic', + 'service_cvd', 'service_consumable_stock', + 'functional_landline','fuctional_mobile','functional_radio','functional_computer','internet_access_today', + 'electricity', 'functional_toilet','functional_handwashing_facility', + 'water_disruption_last_3mts', 'water_source_main_within_500m', + 'functional_emergency_vehicle','accessible_emergency_vehicle', + 'functional_ambulance', + 'functional_car', + 'functional_motor_cycle', + 'functional_bike_ambulance', + 'functional_bicycle', + 'vaccine_storage','functional_refrigerator', + 'source_drugs_cmst','source_drugs_local_warehouse','source_drugs_ngo','source_drugs_donor','source_drugs_pvt', + 'drug_transport_local_supplier','drug_transport_higher_level_supplier','drug_transport_self','drug_transport_other', + 'referral_system_from_community','referrals_to_other_facs', + 'mathealth_label_and_expdate_visible', 'mathealth_expdate_fefo', + 'childhealth_label_and_expdate_visible'] + +binary_dict = {'yes': 1, 'no': 0} + +df_for_regression_binvars_cleaned = copy.deepcopy(df_for_regression) +for col in fac_vars_binary: + df_for_regression_binvars_cleaned[col] = df_for_regression_binvars_cleaned[col].str.lower() + + # replace don't know as missing + cond = df_for_regression_binvars_cleaned[col] == "don't know" + df_for_regression_binvars_cleaned.loc[cond, col] = np.nan + + assert len([x for x in df_for_regression_binvars_cleaned[col].unique() if + pd.isnull(x) == False]) == 2 # verify that the variable is binary + + df_for_regression_binvars_cleaned[col] = df_for_regression_binvars_cleaned[col].map(binary_dict).fillna(df_for_regression_binvars_cleaned[col]) + +# Save dataset ready for regression analysis as a .csv file 
+df_for_regression_binvars_cleaned.to_csv(path_to_files_in_the_tlo_dropbox / '2 clean/cleaned_hhfa_2019.csv')
diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/data_setup.R b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/data_setup.R new file mode 100644 index 0000000000..b2cf39eaff --- /dev/null +++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/data_setup.R @@ -0,0 +1,136 @@
+# This script loads the data for descriptive and inferential analysis
+
+# 1. Load HHFA data
+####################
+df_orig <- read.csv(paste0(path_to_data, "cleaned_hhfa_2019.csv"))
+
+# 1.1 Assign a code to items
+#----------------------------
+df_orig <- df_orig %>%
+  group_by(item) %>%
+  mutate(item_code = cur_group_id()) %>%
+  ungroup()
+
+# 2. Filter data to be ignored
+###############################
+second_line_arvs <- c("Didanosine (DDI) (ARVs)",
+                      "Enfuvirtide (T-20) (ARVs)",
+                      "Delavirdine (DLV) (ARVs)")
+df <- df_orig %>%
+  filter(service_consumable_stock == 1, # drop facilities which do not stock consumables
+         program != 'mental health/epilepsy', # drop mental health commodities, which are rare at this level of care
+         fac_daily_opening_hours != 0, # drop facilities reporting 0 opening hours per day
+         !(item %in% second_line_arvs) # drop second-line ARVs
+  )
+
+# Formatting
+df$fac_code <- as.character(df$fac_code)
+
+# 3. Add consumable classification in the Essential Medicines List #
+#####################################################################
+essential_med_mapping <- read.csv(paste0(path_to_data, "essential_medicine_list_mapping.csv"))
+names(essential_med_mapping)[names(essential_med_mapping) == 'Consumable'] <- 'item'
+names(essential_med_mapping)[names(essential_med_mapping) == 'Therapeutic.priority'] <- 'eml_therapeutic_priority'
+
+df <- merge(df, essential_med_mapping[c('item','eml_therapeutic_priority')], by = 'item')
+# Generate binary variables for vital (V) and essential (E) therapeutic priority
+df$eml_priority_v <- 0
+df[which(df$eml_therapeutic_priority == "V"),]$eml_priority_v <- 1
+df$eml_priority_e <- 0
+df[which(df$eml_therapeutic_priority == "E"),]$eml_priority_e <- 1
+
+# 4. 
Define variable lists for analysis # +######################################### +fac_exp_vars <- c(# Main characteristics + 'district', 'fac_type','fac_owner', 'fac_urban', 'rms', 'item_drug', + + # types of services offered + 'outpatient_only','bed_count', 'inpatient_visit_count','inpatient_days_count', + 'service_fp','service_anc','service_pmtct','service_delivery','service_pnc','service_epi','service_imci', + 'service_hiv','service_tb','service_othersti','service_malaria','service_blood_transfusion', + 'service_cvd', 'service_diagnostic', + + # operation frequency + 'fac_weekly_opening_days','fac_daily_opening_hours', + + # utilities/facilities available + 'functional_landline','fuctional_mobile','functional_radio','functional_computer','functional_computer_no', + 'internet_access_today', + 'electricity', 'water_source_main','water_source_main_within_500m', # this is in minutes + 'functional_toilet','functional_handwashing_facility', + 'water_disruption_last_3mts','water_disruption_duration', # converted to days + + # vehicles available + 'functional_emergency_vehicle','accessible_emergency_vehicle','purpose_last_vehicle_trip', + + 'functional_ambulance', 'functional_ambulance_no', # Keep one of the two variables + 'functional_car', 'functional_car_no', + 'functional_motor_cycle', 'functional_motor_cycle_no', + 'functional_bike_ambulance', 'functional_bike_ambulance_no', + 'functional_bicycle', 'functional_bicycle_no', + 'vaccine_storage','functional_refrigerator', + + # Drug ordering process + 'incharge_drug_orders','drug_resupply_calculation_system','drug_resupply_calculation_method','source_drugs_cmst', + 'source_drugs_local_warehouse','source_drugs_ngo','source_drugs_donor','source_drugs_pvt', + + 'drug_transport_local_supplier','drug_transport_higher_level_supplier','drug_transport_self','drug_transport_other', + + 'drug_order_fulfilment_delay','drug_order_fulfilment_freq_last_3mts', + 'transport_to_district_hq','travel_time_to_district_hq', + + # referral system + 'referral_system_from_community','referrals_to_other_facs', + + # location w.r.t reference points + 'dist_todh', 'dist_torms', 'drivetime_todh', 'drivetime_torms') + +# Continuous variables +fac_vars_numeric <- c('bed_count', 'inpatient_visit_count','inpatient_days_count', 'functional_computer_no', + 'water_disruption_duration', + 'functional_ambulance_no', 'functional_car_no', 'functional_motor_cycle_no', 'functional_bike_ambulance_no', + 'functional_bicycle_no', + 'fac_weekly_opening_days','fac_daily_opening_hours', + 'drug_order_fulfilment_delay','drug_order_fulfilment_freq_last_3mts', + 'travel_time_to_district_hq', 'dist_todh', 'dist_torms', 'drivetime_todh', 'drivetime_torms') + +# Binary variables +fac_vars_binary <- c('outpatient_only', 'fac_urban', + 'service_fp','service_anc','service_pmtct','service_delivery','service_pnc','service_epi','service_imci', + 'service_hiv','service_tb','service_othersti','service_malaria','service_blood_transfusion', + 'service_cvd', 'service_diagnostic', + 'functional_landline','fuctional_mobile','functional_radio','functional_computer','internet_access_today', + 'electricity', 'functional_toilet','functional_handwashing_facility', + 'water_disruption_last_3mts', + 'functional_emergency_vehicle','accessible_emergency_vehicle', + 'functional_ambulance', + 'functional_car', + 'functional_motor_cycle', + 'functional_bike_ambulance', + 'functional_bicycle', + 'vaccine_storage','functional_refrigerator', + 
'source_drugs_cmst','source_drugs_local_warehouse','source_drugs_ngo','source_drugs_donor','source_drugs_pvt',
+                     'drug_transport_local_supplier','drug_transport_higher_level_supplier','drug_transport_self','drug_transport_other',
+                     'referral_system_from_community','referrals_to_other_facs')
+
+item_vars_binary <- c('item_drug', 'eml_priority_v', 'eml_priority_e')
+
+# Determinants as per literature
+stockout_factors_from_lit <- c('fac_type', 'fac_owner', 'fac_urban','functional_computer',
+                               'functional_emergency_vehicle', 'incharge_drug_orders',
+                               'dist_todh','dist_torms','drug_order_fulfilment_freq_last_3mts',
+                               'service_diagnostic', 'item_drug','eml_priority_v')
+# drug_resupply_calculation_system has been dropped since it is not clear whether it was accurately reported
+
+# Look at data by item and by facility
+df_not_na <- subset(df, !is.na(df$available))
+df_by_item <- df %>%
+  group_by(item) %>%
+  summarise(available = mean(available, na.rm = TRUE))
+
+df_by_fac <- df_not_na %>%
+  group_by(fac_code) %>%
+  summarise(available = mean(available),
+            drug_resupply_calculation_system = first(na.omit(drug_resupply_calculation_system)),
+            fac_owner = first(na.omit(fac_owner)),
+            functional_bike_ambulance_no = first(na.omit(functional_bike_ambulance_no)))
diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/feature_manipulation.R b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/feature_manipulation.R new file mode 100644 index 0000000000..307d526e77 --- /dev/null +++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/feature_manipulation.R @@ -0,0 +1,202 @@
+# This script performs feature manipulation on the cleaned HHFA dataset
+
+# 1. 
Feature manipulation #
+##########################################
+# 1.0 Clean binary variables
+#---------------------------------------
+# Convert item_type variable to binary
+df["item_drug"] <- 0
+df[which(df$item_type == "drug"),]$item_drug <- 1
+
+# 1.1 Cap variables/remove outliers
+#----------------------------------
+df$bed_count <- edit_outliers(df$bed_count, "upper")
+df$inpatient_visit_count <- edit_outliers(df$inpatient_visit_count, "upper")
+df$drug_order_fulfilment_delay <- edit_outliers(df$drug_order_fulfilment_delay, "upper")
+df$drug_order_fulfilment_freq_last_3mts <- edit_outliers(df$drug_order_fulfilment_freq_last_3mts, "upper")
+df$water_disruption_duration <- edit_outliers(df$water_disruption_duration, "upper")
+df$dist_todh <- edit_outliers(df$dist_todh, "upper")
+df$dist_torms <- edit_outliers(df$dist_torms, "upper")
+
+df$functional_ambulance_no <- edit_outliers(df$functional_ambulance_no, "upper")
+df$functional_car_no <- edit_outliers(df$functional_car_no, "upper")
+df$functional_bicycle_no <- edit_outliers(df$functional_bicycle_no, "upper")
+df$functional_motor_cycle_no <- edit_outliers(df$functional_motor_cycle_no, "upper")
+
+
+# 1.2 Rescale continuous variables
+#----------------------------------
+df <- df %>% dplyr::rename(
+  dist_todh_orig = dist_todh,
+  dist_torms_orig = dist_torms,
+  fac_daily_opening_hours_orig = fac_daily_opening_hours,
+  bed_count_orig = bed_count,
+  inpatient_visit_count_orig = inpatient_visit_count,
+  inpatient_days_count_orig = inpatient_days_count,
+  drug_order_fulfilment_delay_orig = drug_order_fulfilment_delay,
+  drug_order_fulfilment_freq_last_3mts_orig = drug_order_fulfilment_freq_last_3mts,
+  water_disruption_duration_orig = water_disruption_duration,
+  functional_bicycle_no_orig = functional_bicycle_no,
+  functional_motor_cycle_no_orig = functional_motor_cycle_no
+)
+
+df$dist_todh <- log(df$dist_todh_orig + 1)
+df$dist_torms <- log(df$dist_torms_orig + 1)
+df$fac_daily_opening_hours <- log(df$fac_daily_opening_hours_orig)
+df$bed_count <- log(df$bed_count_orig + 1)
+df$inpatient_visit_count <- log(df$inpatient_visit_count_orig + 1)
+df$inpatient_days_count <- log(df$inpatient_days_count_orig + 1)
+df$drug_order_fulfilment_delay <- log(df$drug_order_fulfilment_delay_orig + 1)
+df$drug_order_fulfilment_freq_last_3mts <- log(df$drug_order_fulfilment_freq_last_3mts_orig + 1)
+df$water_disruption_duration <- log(df$water_disruption_duration_orig + 1)
+
+df$functional_bicycle_no <- log(df$functional_bicycle_no_orig + 1)
+df$functional_motor_cycle_no <- log(df$functional_motor_cycle_no_orig + 1)
+
+
+# 1.3 Streamline categorical variables
+#-------------------------------------
+# Incharge of drug orders
+df[which(df$incharge_drug_orders == "pharmacy technician"|df$incharge_drug_orders == "pharmacist"),]$incharge_drug_orders <-
+  'pharmacist or pharmacy technician'
+
+# Water source
+df[which(df$water_source_main == "other"|df$water_source_main == "no water source"),]$water_source_main <-
+  'no convenient water source'
+
+# Generate categorical version of dist_torms (bands defined inclusively at the upper bound)
+df$dist_torms_cat <- ifelse(df$dist_torms_orig <= 10000, "0-10 kms",
+                     ifelse(df$dist_torms_orig <= 50000, "10-50 kms",
+                     ifelse(df$dist_torms_orig <= 100000, "50-100 kms",
+                     ifelse(df$dist_torms_orig <= 200000, "100-200 kms", "> 200 kms"))))
+
+df$dist_torms_cat <- ifelse(is.na(df$dist_torms_orig), NaN, df$dist_torms_cat)
+df$dist_torms_cat <- 
factor(df$dist_torms_cat, levels = c("0-10 kms", "10-50 kms", "50-100 kms", "100-200 kms", "> 200 kms")) # specify order
+
+# Generate categorical version of dist_todh (bands defined inclusively at the upper bound)
+df$dist_todh_cat <- ifelse(df$dist_todh_orig <= 10000, "0-10 kms",
+                    ifelse(df$dist_todh_orig <= 25000, "10-25 kms",
+                    ifelse(df$dist_todh_orig <= 50000, "25-50 kms",
+                    ifelse(df$dist_todh_orig <= 75000, "50-75 kms", "> 75 kms"))))
+
+df$dist_todh_cat <- ifelse(is.na(df$dist_todh_orig), NaN, df$dist_todh_cat)
+df$dist_todh_cat <- factor(df$dist_todh_cat, levels = c("0-10 kms", "10-25 kms", "25-50 kms", "50-75 kms", "> 75 kms")) # specify order
+
+# Generate categorical version of drug_order_fulfilment_freq_last_3mts
+df$drug_order_fulfilment_freq_last_3mts_cat <- as.character(df$drug_order_fulfilment_freq_last_3mts_orig)
+df$drug_order_fulfilment_freq_last_3mts_cat <- ifelse((df$drug_order_fulfilment_freq_last_3mts_orig > 3), ">= 4",
+                                                      df$drug_order_fulfilment_freq_last_3mts_cat)
+
+df$drug_order_fulfilment_freq_last_3mts_cat <- factor(df$drug_order_fulfilment_freq_last_3mts_cat, levels = c("1", "2", "3", ">= 4")) # specify order
+
+
+# Drug transport
+df$drug_transport_self <- ifelse(is.na(df$drug_transport_self), NaN, df$drug_transport_self)
+df$drug_transport_higher_level_supplier <- ifelse(is.na(df$drug_transport_higher_level_supplier), NaN, df$drug_transport_higher_level_supplier)
+df$drug_transport_local_supplier <- ifelse(is.na(df$drug_transport_local_supplier), NaN, df$drug_transport_local_supplier)
+df$drug_transport_other <- ifelse(is.na(df$drug_transport_other), NaN, df$drug_transport_other)
+
+
+# 1.4 Create a joint drug storage practice variable from individual ones
+# for oxytocin and amoxicillin
+#------------------------------------------------------------------------
+# Coding: 1 = both the maternal and child health tracers comply, 3 = neither complies, 2 = mixed
+df$label_and_expdate_visible <- ifelse((df$mathealth_label_and_expdate_visible == 1) &
+                                         (df$childhealth_label_and_expdate_visible == 1), 1,
+                                       ifelse((df$mathealth_label_and_expdate_visible == 0) &
+                                                (df$childhealth_label_and_expdate_visible == 0), 3, 2))
+
+df$expdate_fefo <- ifelse((df$mathealth_expdate_fefo == 1) &
+                            (df$childhealth_expdate_fefo == 1), 1,
+                          ifelse((df$mathealth_expdate_fefo == 0) &
+                                   (df$childhealth_expdate_fefo == 0), 3, 2))
+
+df$label_and_expdate_visible <- factor(df$label_and_expdate_visible)
+df$expdate_fefo <- factor(df$expdate_fefo)
+
+# 1.5 Clean classification of items into programs
+#------------------------------------------------
+df[which(df$item == "Oral hypoglycaemics"),]$program <- "ncds"
+df[which(df$program == "hygiene/antiseptic"),]$program <- "other - infection prevention"
+df[which(df$item == "Disposable latex gloves"),]$program <- "other - infection prevention"
+df[which(df$item == "Medical (surgical or procedural) masks"),]$program <- "other - infection prevention"
+df[which(df$item == "Eye protection (goggles, face shields)"),]$program <- "other - infection prevention"
+df[which(df$item == "Gowns"),]$program <- "other - infection prevention"
+df[which(df$program == "other"),]$program <- "general"
+df[which(df$item == "Simvastatin tablet or other statin"),]$program <- "ncds"
+df[which(df$item == "Cryptococcal antigen"),]$program <- "hiv"
+df[which(df$item == "Ephedrine (injection)"),]$program <- "surgical"
+df[which(df$item == "Fluconazole"),]$program <- "hiv"
+
+# 1.6 Clean consumable names for manuscript
+#-------------------------------------------------------------
+df[which(df$item == 
"slides and cover slips"),]$item <- "Slides and cover slips" +df[which(df$item == "art_component_1"),]$item <- "Antiretroviral treatment (ART) component 1 (ZDV/AZT/TDF/D4T)" +df[which(df$item == "art_component_2"),]$item <- "Antiretroviral treatment (ART) component 2 (3TC/FTC)" +df[which(df$item == "art_component_3"),]$item <- "Antiretroviral treatment (ART) component 3 (Protease inhibitor)" +df[which(df$item == "ACT"),]$item <- "Artemisinin-based combination therapy (ACT)" +df[which(df$item == "MRDT"),]$item <- "Malaria Rapid Diagnostic Test (MRDT)" +df[which(df$item == "SP"),]$item <- "Sulfadoxine/pyrimethamine" +df[which(df$item == "glucose inhibitors"),]$item <- "Glucose inhibitor" +df[which(df$item == "ACE inhibitor"),]$item <- "Angiotensin-converting enzyme(ACE) inhibitor" + +# 1.7 Clean level of care +#------------------------------------------- +#df[which(df$item == "Facility_level_1a"),]$item = "Primary level" +#df[which(df$item == "Facility_level_1b"),]$item = "Secondary level" + + +# 2. Create final dataframes for analysis +############################################################## +# Dataframes for sub-level analysis +df_level0 <- df[df$fac_type == 'Facility_level_0',] +df_level1a <- df[df$fac_type == 'Facility_level_1a',] +df_level1b <- df[df$fac_type == 'Facility_level_1b',] +df_level2 <- df[df$fac_type == 'Facility_level_2',] +df_level3 <- df[df$fac_type == 'Facility_level_2',] + + +# Set reference level for categorical variables +df$incharge_drug_orders <- relevel(factor(df$incharge_drug_orders), ref="drug store clerk") +df$district <- relevel(factor(df$district), ref="Lilongwe") +df$item <- relevel(factor(df$item), ref="Paracetamol cap/tab") +df$fac_owner <- relevel(factor(df$fac_owner), ref="Government") +df$fac_type <- relevel(factor(df$fac_type), ref="Facility_level_1a") +df$water_source_main <- relevel(factor(df$water_source_main), ref="piped into facility") +df$program <- relevel(factor(df$program), ref="general") +df$dist_torms_cat <- relevel(factor(df$dist_torms_cat), ref = "0-10 kms") +df$dist_todh_cat <- relevel(factor(df$dist_todh_cat), ref = "0-10 kms") +df$drug_order_fulfilment_freq_last_3mts_cat <- relevel(factor(df$drug_order_fulfilment_freq_last_3mts_cat), ref = "1") + +df_all_levels <- df + +# Dataframe for primary analysis - only levels 1a and 1b +df <- df %>% + filter(fac_type != 'Facility_level_4', # Zomba Mental Hospital + fac_type != 'Facility_level_3', # Central Hospitals + fac_type != 'Facility_level_2', # District Hospitals + fac_type != 'Facility_level_0', # Health posts + ) +# fac_types 2 and 3 are always urban and always government owned +# fac_type 0 has only 3 instances of non-government ownership, never provides cvd, +# never has a vehicle + +# 3 Create facility-level dataframe with the above adjustments +############################################################### +# For the dataframe with facilities in levels 1a and 1b +new_vars <- c('dist_torms_cat', 'dist_todh_cat', 'drug_order_fulfilment_freq_last_3mts_cat' ) +fac_features <- aggregate(df[unlist(c(fac_exp_vars, new_vars))], by = list(fac_code = df$fac_code), FUN = head, 1) +availability_by_facility <- aggregate( df[,'available'], list(fac_code = df$fac_code), + FUN = mean, na.rm = TRUE) +fac_reg_df <- merge(fac_features,availability_by_facility,by="fac_code") +fac_reg_df <- na.omit(fac_reg_df) + +# For the dataframe with all facilities +fac_features_all_levels <- aggregate(df_all_levels[unlist(c(fac_exp_vars, new_vars))], list(fac_code = df_all_levels$fac_code), FUN = head, 1) 
+availability_by_facility_all_levels <- aggregate( df_all_levels[,'available'], list(fac_code = df_all_levels$fac_code),
+                                                  FUN = mean, na.rm = TRUE)
+fac_reg_df_all_levels <- merge(fac_features_all_levels,availability_by_facility_all_levels,by="fac_code")
+fac_reg_df_all_levels <- na.omit(fac_reg_df_all_levels)
diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/load_packages_and_functions.R b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/load_packages_and_functions.R new file mode 100644 index 0000000000..2649203fbb --- /dev/null +++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/load_packages_and_functions.R @@ -0,0 +1,263 @@
+# This script loads the necessary packages for the regression analysis
+# and creates functions used in the feature manipulation and regression analysis scripts.
+
+# 1. Load libraries #
+#####################
+if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman") # install pacman only if not already available
+pacman::p_load(magrittr, # for %>% to work
+               estimatr, # for lm_robust to work (clustered SE)
+
+               dplyr,
+               modeest,
+               broom.mixed, # for tidy to work to generate conf int table
+               stringr,
+
+               #Excel packages
+               caTools,
+               readxl,
+               writexl,
+               openxlsx, #for write.xlsx
+               readr,
+               ggcorrplot,
+
+               # Regression packages
+               nlme, # random effects regression - lme
+               lmerTest, # random effects regression - lmer
+               ggfortify, # for diagnostic plots
+               glmmTMB, # Multilevel regression model
+               MASS, # to run stepAIC with BIC criterion
+
+               # visualisation packages
+               jtools, # for regression visualisations
+               sjPlot, # for plot_model
+               sjmisc, # for plot_model
+               viridis,
+               ggpubr,
+               ggplot2,
+               cowplot, # for plot_grid (combine multiple graphs)
+               gridExtra, # for combining forest plots (text grob)
+               grid,
+
+               # packages for tables
+               gtsummary, # to get p-values in summary tables
+               huxtable, # to export regression summary tables using export_summs
+               margins, # to calculate average marginal effects from logistic regression results
+               janitor, # for tabyl command - get summary of categorical variable
+               aod, # wald.test
+
+               MESS, # for model comparison
+               car, # to run Wald tests on regression outputs (Anova function)
+               caret, # to run k-fold cross validation
+               effects, # to allow for type= "eff" in plot_model
+               cvTools, # for k-fold cross validation
+               fastDummies # to create dummies from categorical variable
+               )
+
+# Set paths to store outputs
+dir.create(file.path(path_to_local_repo, "outputs/", "regression_analysis"))
+path_to_outputs <- paste0(path_to_local_repo, "outputs/", "regression_analysis/")
+dir.create(file.path(path_to_outputs, "regression_results"))
+dir.create(file.path(path_to_outputs, "predictions"))
+dir.create(file.path(paste0(path_to_outputs, "predictions/", "figures")))
+dir.create(file.path(path_to_outputs, "tables"))
+dir.create(file.path(path_to_outputs, "figures"))
+
+# 2. 
Create functions #
+#######################
+# 2.1 Cap variables (or remove outliers)
+#---------------------------------------
+edit_outliers <- function(varname, cap_direction){
+  caps <- quantile(varname, probs=c(.05, .99), na.rm = T)
+  if (cap_direction == "upper"){
+    varname[varname > caps[2]] <- caps[2]
+  } else if (cap_direction == "lower"){
+    varname[varname < caps[1]] <- caps[1]
+  } else {
+    varname[varname < caps[1]] <- caps[1]
+    varname[varname > caps[2]] <- caps[2]
+  }
+  return(varname)
+}
+
+# 2.2 Standardise variable
+#------------------------------
+standardise <- function(varname){
+  varname = (varname - mean(varname, na.rm = TRUE))/sd(varname, na.rm = TRUE)
+  return(varname)
+}
+
+# 2.3 Function for captioning and referencing images in RMD
+#--------------------------------------------------------------
+fig <- local({
+  i <- 0
+  ref <- list()
+  list(
+    cap=function(refName, text) {
+      i <<- i + 1
+      ref[[refName]] <<- i
+      paste("Figure ", i, ": ", text, sep="")
+    },
+    ref=function(refName) {
+      ref[[refName]]
+    })
+})
+
+# 2.4 Function to insert row into a dataframe
+#---------------------------------------------
+insertRow <- function(existingDF, newrow, r) {
+  existingDF[seq(r+1,nrow(existingDF)+1),] <- existingDF[seq(r,nrow(existingDF)),]
+  existingDF[r,] <- newrow
+  return(existingDF)
+}
+
+# 2.5 Create forest plot of model results - main regression
+#-------------------------------------------------------------------------------------------
+custom_forest_plot <- function(model, ylimit = 4){
+  # Choose model for graphing
+  model_forest <- model
+
+  # Cleaned row names for dataframes
+  vars_of_interest <- c("fac_typeFacility_level_1b", "fac_ownerCHAM", "fac_ownerNGO","fac_ownerPrivate for profit",
+                        "fac_urban",
+                        "functional_computer",
+                        "functional_emergency_vehicle",
+                        "service_diagnostic",
+                        "incharge_drug_orderscenter manager/owner","incharge_drug_ordersclinical officer",
+                        "incharge_drug_ordershsa/shsa", "incharge_drug_ordersmedical assistant",
+                        "incharge_drug_ordersnurse", "incharge_drug_ordersother", "incharge_drug_orderspharmacist or pharmacy technician",
+                        "incharge_drug_orderspharmacy assistant",
+                        "dist_todh_cat10-25 kms", "dist_todh_cat25-50 kms", "dist_todh_cat50-75 kms", "dist_todh_cat> 75 kms",
+                        "dist_torms_cat10-50 kms", "dist_torms_cat50-100 kms", "dist_torms_cat100-200 kms", "dist_torms_cat> 200 kms",
+                        "drug_order_fulfilment_freq_last_3mts_cat2", "drug_order_fulfilment_freq_last_3mts_cat3", "drug_order_fulfilment_freq_last_3mts_cat>= 4",
+                        "item_drug",
+                        "eml_priority_v")
+  reg_table_rownames <- data.frame(term = vars_of_interest,
+                                   newnames = c("...Level 1b",
+                                                "...Christian Health Association of Malawi (CHAM)", "...Non-governmental Organisation (NGO)", "...Private for profit",
+                                                "Facility is urban",
+                                                "Functional computer available",
+                                                "Emergency vehicle available",
+                                                "Diagnostic services available",
+                                                "...Center manager/owner", "...Clinical officer",
+                                                "...Health Surveillance Assistant (HSA)/Senior HSA", "...Medical assistant",
+                                                "...Nurse", "...Other",
+                                                "...Pharmacist or Pharmacy technician", "...Pharmacist assistant",
+                                                "...10-25 kms", "...25-50 kms", "...50-75 kms", "...> 75 kms",
+                                                "...10-50 kms", "...50-100 kms", "...100-200 kms", "...> 200 kms",
+                                                "...2", "...3", "...>=4",
+                                                "Consumable is a drug",
+                                                "Consumable is classified as vital"))
+
+
+  # Create the dataframe "forestdf" for the forest plot (Odds ratio, and upper/lower bounds)
+  #-----------------------------------------------------------------------------------------
+  forest_matrix 
+
+# 2.5 Create forest plot of model results - main regression
+#-------------------------------------------------------------------------------------------
+custom_forest_plot <- function(model, ylimit = 4){
+  # Choose model for graphing
+  model_forest <- model
+
+  # Cleaned row names for dataframes
+  vars_of_interest <- c("fac_typeFacility_level_1b", "fac_ownerCHAM", "fac_ownerNGO","fac_ownerPrivate for profit",
+                        "fac_urban",
+                        "functional_computer",
+                        "functional_emergency_vehicle",
+                        "service_diagnostic",
+                        "incharge_drug_orderscenter manager/owner","incharge_drug_ordersclinical officer",
+                        "incharge_drug_ordershsa/shsa", "incharge_drug_ordersmedical assistant",
+                        "incharge_drug_ordersnurse", "incharge_drug_ordersother", "incharge_drug_orderspharmacist or pharmacy technician",
+                        "incharge_drug_orderspharmacy assistant",
+                        "dist_todh_cat10-25 kms", "dist_todh_cat25-50 kms", "dist_todh_cat50-75 kms", "dist_todh_cat> 75 kms",
+                        "dist_torms_cat10-50 kms", "dist_torms_cat50-100 kms", "dist_torms_cat100-200 kms", "dist_torms_cat> 200 kms",
+                        "drug_order_fulfilment_freq_last_3mts_cat2", "drug_order_fulfilment_freq_last_3mts_cat3", "drug_order_fulfilment_freq_last_3mts_cat>= 4",
+                        "item_drug",
+                        "eml_priority_v")
+  reg_table_rownames <- data.frame(term = vars_of_interest,
+                                   newnames = c("...Level 1b",
+                                                "...Christian Health Association of Malawi (CHAM)", "...Non-governmental Organisation (NGO)", "...Private for profit",
+                                                "Facility is urban",
+                                                "Functional computer available",
+                                                "Emergency vehicle available",
+                                                "Diagnostic services available",
+                                                "...Center manager/owner", "...Clinical officer",
+                                                "...Health Surveillance Assistant (HSA)/Senior HSA", "...Medical assistant",
+                                                "...Nurse", "...Other",
+                                                "...Pharmacist or Pharmacy technician", "...Pharmacy assistant",
+                                                "...10-25 kms", "...25-50 kms", "...50-75 kms", "...> 75 kms",
+                                                "...10-50 kms", "...50-100 kms", "...100-200 kms", "...> 200 kms",
+                                                "...2", "...3", "...>=4",
+                                                "Consumable is a drug",
+                                                "Consumable is classified as vital"))
+
+  # Create the dataframe "forestdf" for the forest plot (odds ratio with upper/lower bounds)
+  #-----------------------------------------------------------------------------------------
+  forest_matrix <- tidy(model_forest, conf.int = TRUE, exponentiate = TRUE) # exponentiate converts log-odds to odds ratios
+  forest_matrix <- forest_matrix[which(forest_matrix$term %in% vars_of_interest),]
+  forest_matrix[c('estimate', 'conf.low', 'conf.high')] <- lapply(forest_matrix[c('estimate', 'conf.low', 'conf.high')], round, 2)
+  forest_matrix[c('p.value')] <- sprintf("%.4f", unlist(lapply(forest_matrix[c('p.value')], round, 4)))
+  forest_matrix[which(forest_matrix[c('p.value')] == "0.0000"),][c('p.value')] <- "<0.0001"
+
+  # Change rownames
+  forest_matrix$order <- 1:length(vars_of_interest)
+  forest_matrix <- merge(reg_table_rownames, forest_matrix, by = "term")
+  forest_matrix$term <- forest_matrix$newnames
+  forest_matrix <- forest_matrix[order(forest_matrix$order),]
+
+  forestdf <- structure(list(labels = structure(1:length(vars_of_interest), .Label = forest_matrix$term, class = "factor"),
+                             rr = forest_matrix$estimate, rrhigh = forest_matrix$conf.high, rrlow = forest_matrix$conf.low),
+                        class = "data.frame", row.names = c(NA, -29L)) # -29L = -length(vars_of_interest)
+
+  # Create the dataframe "fplottable" for the data table
+  #-----------------------------------------------------------
+  fplottable <- structure(list(labels = structure(1:length(vars_of_interest), .Label = forest_matrix$term, class = "factor"),
+                               ci = paste0(forest_matrix$estimate, " (", forest_matrix$conf.low, " - ", forest_matrix$conf.high, ")"),
+                               p = forest_matrix$p.value),
+                          class = "data.frame", row.names = c(NA, -29L))
+
+  # Add reference level rows to the above dataframes
+  #----------------------------------------------
+  for (df_name in c('fplottable', 'forestdf')){
+    df_results <- get(df_name)
+    df_results$labels <- as.character(df_results$labels) # change format to character in order to add new rows
+    r_fac_type <- c("Facility level (Ref: Level 1a)", rep(NA, dim(df_results)[2]-1))
+    r_fac_owner <- c("Facility owner (Ref: Government)", rep(NA, dim(df_results)[2]-1))
+    r_incharge_drug_orders <- c("Person in charge of drug orders (Ref: Drug store clerk)", rep(NA, dim(df_results)[2]-1))
+    r_dist_todh <- c("Distance from DHO (Ref: 0-10kms)", rep(NA, dim(df_results)[2]-1))
+    r_dist_torms <- c("Distance from RMS (Ref: 0-10kms)", rep(NA, dim(df_results)[2]-1))
+    r_drug_order_fulfilment_freq_last_3mts <- c("Quarterly drug order fulfillment frequency (Ref: 1)", rep(NA, dim(df_results)[2]-1))
+    regressor_headings = c("Facility level (Ref: Level 1a)", "Facility owner (Ref: Government)",
+                           "Person in charge of drug orders (Ref: Drug store clerk)", "Distance from DHO (Ref: 0-10kms)",
+                           "Distance from RMS (Ref: 0-10kms)", "Quarterly drug order fulfillment frequency (Ref: 1)")
+
+    a <- which(df_results$labels == "...Level 1b")
+    df_results <- insertRow(df_results, r_fac_type, a)
+    b <- which(df_results$labels == "...Christian Health Association of Malawi (CHAM)")
+    df_results <- insertRow(df_results, r_fac_owner, b)
+    c <- which(df_results$labels == "...Center manager/owner")
+    df_results <- insertRow(df_results, r_incharge_drug_orders, c)
+    d <- which(df_results$labels == "...10-25 kms")
+    df_results <- insertRow(df_results, r_dist_todh, d)
+    e <- which(df_results$labels == "...10-50 kms")
+    df_results <- insertRow(df_results, r_dist_torms, e)
+    f <- which(df_results$labels == "...2")
+    df_results <- insertRow(df_results, r_drug_order_fulfilment_freq_last_3mts, f)
+
+    # Add alternating colour scheme for the table rows
+    if((dim(df_results)[1] %% 2) == 0){
+      df_results$colour <- rep(c("white", "gray"), dim(df_results)[1]/2)
+    } else {
dim(df_results)[1]/2), "white") + } + + assign(df_name, df_results) + } + + column_headers_space1 <- c("", NA, NA, NA, "white") + forestdf <- insertRow(forestdf , column_headers_space1, 1) + forestdf <<- insertRow(forestdf , column_headers_space1, 1) + + column_headers <- c("", "Odds ratio (95% CI)", "p-value", "white") + column_headers_space2 <- c("", "__________________", "________", "white") + fplottable <- insertRow(fplottable , column_headers, 1) + fplottable <<- insertRow(fplottable , column_headers_space2, 2) + + + # Create data table for plot + #------------------------------- + # Ensure that the order of labels does not change in the table + fplottable$labels <- factor(fplottable$labels, levels = rev(unique(fplottable$labels))) + + data_table <<- ggplot(data = fplottable, aes(y = labels, fontface = ifelse(forestdf$labels %in% regressor_headings, "italic", "plain")), + family = "Times") + + geom_hline(aes(yintercept = labels, colour = colour), size = 3) + + geom_text(aes(x = 0, label = labels), hjust = 0, size = 2.5) + + geom_text(aes(x = 5, label = ci), size = 2.5) + + geom_text(aes(x = 7, label = p), hjust = 1, size = 2.5) + + scale_colour_identity() + + theme_void() + + theme(plot.margin = ggplot2::margin(5, 0, 32, 0)) + + # Create forest plot for plot + #------------------------------- + forestdf[c('rr', 'rrhigh', 'rrlow')] <- sapply(forestdf[c('rr', 'rrhigh', 'rrlow')],as.numeric) + + p <<- ggplot(forestdf, aes(x = rr, y = labels, xmin = rrlow, xmax = rrhigh)) + + geom_hline(aes(yintercept = labels, colour = colour), size = 3) + # creates the grid #4.5 + geom_pointrange(shape = 20, fill = "black") + # previous 22 + geom_vline(xintercept = 1, linetype = 3) + # vertical line at x = 1 + xlab("Odds Ratio with 95% Confidence Interval") + + ylab("") + + theme_classic() + + scale_colour_identity() + + scale_y_discrete(limits = rev(forestdf$labels)) + + scale_x_log10(limits = c(0.20, ylimit*(1.15)), + breaks = 0.25 * 2^(seq(0,log(4*ylimit)/log(2),1)), + labels = as.character(0.25 * 2^(seq(0,log(4*ylimit)/log(2),1))), expand = c(0,0), + oob=scales::squish) + + theme(axis.text.y = element_blank(), axis.title.y = element_blank()) + + theme(plot.margin = ggplot2::margin(5, 0, 5, 0)) + + theme(axis.title=element_text(size=6, face = "bold")) +} diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/main.R b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/main.R new file mode 100644 index 0000000000..641e52bf88 --- /dev/null +++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/main.R @@ -0,0 +1,23 @@ +# This script sources other scripts needed for the regression analysis and controls the workflow. 
+
+# Set file paths
+path_to_local_repo <- "/Users/sm2511/PycharmProjects/TLOmodel/" # Change this if different user
+path_to_dropbox <- "/Users/sm2511/Dropbox/Thanzi la Onse/" # Change this if different user
+path_to_files_in_dropbox <- paste0(path_to_dropbox, "05 - Resources/Module-healthsystem/consumables raw files/")
+path_to_data <- paste0(path_to_dropbox, "07 - Data/HHFA_2018-19/2 clean/")
+path_to_scripts <- paste0(path_to_local_repo, "src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/")
+
+# Load Libraries and Functions
+source(paste0(path_to_scripts, "load_packages_and_functions.R"))
+
+# Load Data
+source(paste0(path_to_scripts, "data_setup.R"))
+
+# Data Preparation
+source(paste0(path_to_scripts, "feature_manipulation.R"))
+
+# Regression analysis
+source(paste0(path_to_scripts, "regression_analysis.R"))
+
+# Prediction
+source(paste0(path_to_scripts, "predict.R"))
diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/predict.R b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/predict.R
new file mode 100644
index 0000000000..64f86b48c6
--- /dev/null
+++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/predict.R
@@ -0,0 +1,145 @@
+# This script loads the outputs of the final regression model
+# and generates predictions for consumable availability scenarios
+
+###########################################################
+# 1. Load regression model output
+###########################################################
+load(paste0(path_to_outputs, "regression_results/model_fac_item_re.rdta"))
+# TODO: Change the above path if the model output is stored elsewhere
+
+###########################################################
+# 2. Run predictions for policy evaluation
+###########################################################
+# 2.1 Setup
+#--------------------------------------------------------------------------------------------------------------------------------
+# Replace program strings with shorter text for plotting
+df_regress <- df_for_re_reg_sorted
+df_regress$program_plot <- df_regress$program
+rep_str <- c('acute lower respiratory infections'='alri','obstetric and newborn care'='obs&newb',
+             'other - infection prevention'='infection_prev', 'child health' = 'child')
+df_regress$program_plot <- str_replace_all(df_regress$program_plot, rep_str)
+df_regress <- df_regress[unlist(c(chosen_varlist_for_re_reg, 'program_plot'))]
+
+# Drop one item which seems to be causing an issue in the prediction
+drop_items <- c("Dipsticks for urine ketone bodies for rapid diagnostic ")
+df_regress <- df_regress %>%
+  filter(!(item %in% drop_items))
+# TODO check why this item needs to be dropped
+# TODO check why there is an extra column item.1 in the dataframe
+
+# Generate a column with regression-based predictions
+df_regress <- df_regress %>%
+  mutate(available_prob_predicted = predict(model_fac_item_re, newdata = df_regress, type = "response"))
+df_regress$fac_type_original <- df_regress$fac_type # because predictions are run on fac_type and the data is later collapsed on the basis of this variable
+
+# 2.2 Run predictions by changing the 5 characteristics which have the largest association with consumable availability
+#--------------------------------------------------------------------------------------------------------------------------------
+top_5_features_for_cons_availability <- list('item_drug' = 0,
+                                             'eml_priority_v' = 1,
+                                             'incharge_drug_orders' = 'pharmacist or pharmacy technician',
+                                             'fac_type' = 'Facility_level_1b',
+                                             'fac_owner' = 'CHAM')
+top_5_features_for_cons_availability_cumulative_names <- list('item_drug' = 'scen1',
+                                                              'eml_priority_v' = 1,
+                                                              'incharge_drug_orders' = 'pharmacist or pharmacy technician',
+                                                              'fac_type' = 'Facility_level_1b',
+                                                              'fac_owner' = 'CHAM') # intended for naming the output columns; currently unused below (columns are named change_proportion_scenario<j+1>)
+
+i <- 0
+j <- 0
+for (feature in names(top_5_features_for_cons_availability)){
+  if (i == 0){
+    print(paste0("Running predictions for ", feature, " = ", top_5_features_for_cons_availability[feature]))
+    df <- df_regress
+    df[feature] <- top_5_features_for_cons_availability[feature] # cumulatively change facility and consumable features
+    old_prediction <- df$available_prob_predicted # store the prediction from the previous run
+    i <- 1
+  } else{
+    print(paste0("Running predictions for ", feature, " = ", top_5_features_for_cons_availability[feature], " (cumulative, i.e. in addition to previous update)"))
+    df[feature] <- top_5_features_for_cons_availability[feature] # cumulatively change facility and consumable features
+    old_prediction <- df$available_prob_new_prediction # store the prediction from the previous run
+  }
+  #unique_values_of_chosen_features <- df %>% summarise_at(vars(eml_priority_v, item_drug, incharge_drug_orders, fac_type, fac_owner), ~list(unique(.)))
+
+  # Run prediction with the updated set of features
+  new_prediction <- predict(model_fac_item_re, newdata = df, type = "response") # predict availability under the cumulatively updated features
+  df$available_prob_new_prediction <- new_prediction
+
+  print(paste0("New mean probability of availability is ", mean(df$available_prob_new_prediction, na.rm = TRUE) * 100, "%"))
+
+  # Check that the prediction improves availability relative to the previous case
+  stopifnot(mean(old_prediction, na.rm = TRUE) <
+              mean(new_prediction, na.rm = TRUE))
+
+  # Calculate the proportional change in availability from the baseline prediction, which can be applied to the LMIS data
+  df$availability_change_prop <- df$available_prob_new_prediction/df$available_prob_predicted
+
+  if (j == 0){
+    # Collapse data
+    summary_pred <- df %>%
+      group_by(district, fac_type_original, program_plot, item) %>%
+      summarise_at(vars(available, available_prob_predicted, availability_change_prop), list(mean))
+
+    pred_col_number <- length(colnames(summary_pred))
+    colnames(summary_pred)[pred_col_number] <- paste0('change_proportion_scenario', j+1)
+    all_predictions_df <- summary_pred
+  } else{
+    # Collapse data
+    summary_pred <- df %>%
+      group_by(district, fac_type_original, program_plot, item) %>%
+      summarise_at(vars(availability_change_prop), list(mean))
+
+    pred_col_number <- length(colnames(summary_pred))
+    colnames(summary_pred)[pred_col_number] <- paste0('change_proportion_scenario', j+1) # names(top_5_features_for_cons_availability)[j]
+
+    all_predictions_df <- merge(all_predictions_df, summary_pred, by = c('district', 'fac_type_original', 'program_plot', 'item'))
+  }
+  j <- j + 1
+}
+
+###########################################################
+# 3. Export predictions
+###########################################################
+write.csv(all_predictions_df, paste0(path_to_outputs, "predictions/predicted_consumable_availability_regression_scenarios.csv"), row.names = TRUE)
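+
+# Note: the CSV exported above is the input expected by
+# generate_consumable_availability_scenarios_for_impact_analysis.py, which applies these
+# change proportions to the LMIS-based estimates in ResourceFile_Consumables_availability_small.csv.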
+
+################################################################
+# 4. Plot predicted availability under the 5 scenarios
+################################################################
+# Plot original values
+p_original <- ggplot(all_predictions_df, aes(item, district, fill = available_prob_predicted)) +
+  geom_tile() +
+  facet_wrap(~fac_type_original) +
+  scale_fill_viridis_c(limits = c(0, 1), option = "viridis", direction = -1) +
+  theme(axis.text.x = element_text(angle = 45, vjust = 0.7, size = 1),
+        axis.title.x = element_text(size = 8), axis.title.y = element_text(size = 8),
+        axis.text.y = element_text(size = 4),
+        legend.position = 'none',
+        plot.title = element_text(color = "black", size = 14, face = "bold", hjust = 0.5)) +
+  labs(title = "Probability of consumable availability - actual",
+       subtitle = paste0("Global average = ", round(mean(all_predictions_df$available_prob_predicted) * 100, 2), "%")) +
+  xlab("consumable")
+
+# Plot predicted values under scenario 1
+p_predict <- ggplot(all_predictions_df, aes(item, district, fill = available_prob_predicted * change_proportion_scenario1)) +
+  geom_tile() +
+  facet_wrap(~fac_type_original) +
+  scale_fill_viridis_c(limits = c(0, 1), option = "viridis", direction = -1) +
+  theme(axis.text.x = element_text(angle = 45, vjust = 0.7, size = 1),
+        axis.title.x = element_text(size = 8), axis.title.y = element_text(size = 8),
+        axis.text.y = element_text(size = 4),
+        legend.position = 'none',
+        plot.title = element_text(color = "black", size = 14, face = "bold", hjust = 0.5)) +
+  labs(title = "Probability of consumable availability - predicted \n (all items are consumables rather than drugs)",
+       subtitle = paste0("Global average = ", round(mean(all_predictions_df$available_prob_predicted * all_predictions_df$change_proportion_scenario1) * 100, 2), "%")) +
+  xlab("consumable")
+
+figure <- ggpubr::ggarrange(p_original, p_predict, # list of plots
+                            labels = "AUTO", # labels
+                            common.legend = TRUE, # common legend
+                            legend = "bottom", # legend position
+                            align = "hv", # align both horizontally and vertically
+                            nrow = 2) %>% # number of rows
+  ggexport(filename = paste0(path_to_outputs, "predictions/figures/pred_item_is_consumable_other_than_drug.pdf"))
+
+# TODO Update this figure to include all scenarios
diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/regression_analysis.R b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/regression_analysis.R
new file mode 100644
index 0000000000..c22ed31087
--- /dev/null
+++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/regression_analysis.R
@@ -0,0 +1,181 @@
+# This script performs the regression analyses
+# Step 1: Choose the subset of variables to include in the regression model - the Lancet Global Health paper describes
+# in detail how this was arrived at; the script which produces chosen_varlist is therefore not included in master
+# Link to paper - https://doi.org/10.1016/S2214-109X(24)00095-0
+# Link to branch which contains the full script - https://github.com/UCL/TLOmodel/tree/consumable_scenarios/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis
+
+# Step 2: Run the chosen regression model. See the links above for a description of how the facility and item
+# random effects model was chosen for the main result
+
+####################################################################
+# 1. Set up list of variables to be included in the regression model
+####################################################################
+# 1.1 Choose variable list for regression model
+#-----------------------------------------------
+chosen_varlist_orig <- c( "available", "fac_urban",
+                          "fac_type" , "functional_computer",
+                          "incharge_drug_orders", "functional_emergency_vehicle",
+                          "dist_todh", "dist_torms",
+                          "drug_order_fulfilment_freq_last_3mts", "service_diagnostic",
+                          "item", "functional_refrigerator",
+                          "fac_owner", "service_fp",
+                          "service_othersti", "functional_handwashing_facility",
+                          "service_hiv", "service_malaria",
+                          "source_drugs_ngo", "service_cvd",
+                          "service_tb", "bed_count",
+                          "outpatient_only", "water_source_main",
+                          "functional_toilet", "functional_landline",
+                          "fuctional_mobile", "service_imci", # NB "fuctional_mobile" is spelt as in the underlying dataset
+                          "drug_transport_self", "source_drugs_pvt",
+                          "rms", "item_drug",
+                          "eml_priority_v")
+
+# 1.2 Setting up categorical variables for regression analysis
+#--------------------------------------------------------------
+continuous_variables <- c("dist_todh", "dist_torms", "drug_order_fulfilment_freq_last_3mts")
+continuous_variables_cat_version <- c("dist_todh_cat", "dist_torms_cat",
+                                      "drug_order_fulfilment_freq_last_3mts_cat")
+
+chosen_varlist <- chosen_varlist_orig[!(chosen_varlist_orig %in% continuous_variables)]
+chosen_varlist <- unlist(c(chosen_varlist, continuous_variables_cat_version))
+
+# 1.3 Clean chosen independent variables list to ensure convergence of random effects model
+# - remove highly correlated variables
+# - remove variables with little variability
+#-------------------------------------------------------------------------------------------
+# Set up data for regression analysis
+full_df <- df[chosen_varlist]
+full_df <- na.omit(full_df)
+
+# Check correlation between independent variables
+varlist_check_corr <- chosen_varlist[!(chosen_varlist %in% c('district', 'fac_type', 'incharge_drug_orders',
+                                                             'fac_owner', 'water_source_main', 'item', 'rms',
+                                                             continuous_variables_cat_version))] # 'program', 'mode_administration',
+correlation_final_varlist <- cor(full_df[varlist_check_corr])
+corrplot_final_varlist <- ggcorrplot(correlation_final_varlist, lab_size = 1.5, p.mat = NULL,
+                                     insig = c("pch", "blank"), pch = 1, pch.col = "black", pch.cex = 1,
+                                     tl.cex = 5.5, lab = TRUE)
+ggsave(plot = corrplot_final_varlist, filename = paste0(path_to_outputs, "figures/correlation_final_varlist.png"))
+
+# Based on the above results, drop variables for the reasons listed below
+vars_low_variation <- c('service_cvd') # very few instances of no lab facilities/no diagnostic services; service_diagnostic was previously dropped
+vars_highly_correlated <- c('bed_count') # highly correlated with outpatient_only
+chosen_varlist <- chosen_varlist[!(chosen_varlist %in% vars_low_variation)]
+chosen_varlist <- chosen_varlist[!(chosen_varlist %in% vars_highly_correlated)]
+
+# Update correlation matrix after dropping the above variables
+varlist_check_corr_post <- chosen_varlist[!(chosen_varlist %in% c('district', 'fac_type', 'incharge_drug_orders',
+                                                                  'fac_owner', 'water_source_main', 'item', 'rms',
+                                                                  continuous_variables_cat_version))] # 'program', 'mode_administration',
+correlation_final_varlist_post <- cor(full_df[varlist_check_corr_post])
+corrplot_final_varlist_post <- ggcorrplot(correlation_final_varlist_post, lab_size = 1.5, p.mat = NULL,
+                                          insig = c("pch", "blank"), pch = 1, pch.col = "black", pch.cex = 1,
+                                          tl.cex = 5.5, lab = TRUE)
+
+ggsave(plot = corrplot_final_varlist_post, filename = paste0(path_to_outputs, "figures/correlation_final_varlist_post.png"))
+
+
+# List of binary variables
+bin_exp_vars <- c('fac_urban', 'functional_emergency_vehicle' ,'functional_computer',
+                  'service_diagnostic' , 'functional_refrigerator', 'functional_landline',
+                  'fuctional_mobile', 'functional_toilet', 'functional_handwashing_facility',
+                  'outpatient_only', 'service_hiv', 'service_othersti', 'service_malaria',
+                  'service_tb', 'service_fp', 'service_imci', 'source_drugs_ngo',
+                  'source_drugs_pvt', 'drug_transport_self', 'item_drug', "eml_priority_v")
+
+
+#######################################################################
+# 2. Set up Dataframe for Regression analysis
+#######################################################################
+# Drop items reported by too few facilities, and facilities reporting too few items
+df_not_na_by_fac <- df %>%
+  group_by(fac_code) %>%
+  summarise(available_count = sum(!is.na(available))) %>%
+  arrange(available_count)
+
+df_not_na_by_item <- df %>%
+  group_by(item) %>%
+  summarise(available_count = sum(!is.na(available))) %>%
+  arrange(available_count)
+
+# Make a list of items with less than 10% of facilities reporting, i.e. fewer than 100 non-missing reports
+# (these are the consumables relevant to higher level RMNCH services for which only 64 facilities submitted reports)
+items_with_too_few_obs <- subset(df_not_na_by_item, df_not_na_by_item$available_count < 100)['item']
+items_with_too_few_obs <- as.list(items_with_too_few_obs)
+
+# Make a list of facilities with less than 10% of items reported
+print(max(df_not_na_by_fac$available_count)) # maximum number of item reports per facility
+facs_with_too_few_obs <- subset(df_not_na_by_fac, df_not_na_by_fac$available_count <= 0.1*max(df_not_na_by_fac$available_count))['fac_code']
+facs_with_too_few_obs <- as.list(facs_with_too_few_obs)
+
+df_for_regs <- subset(df, !(fac_code %in% facs_with_too_few_obs$fac_code))
+df_for_regs <- subset(df_for_regs, !(item %in% items_with_too_few_obs$item))
+
+print(paste(length(facs_with_too_few_obs$fac_code), " facilities dropped."))
+print(paste(length(items_with_too_few_obs$item), " items dropped."))
+
+# Set up dataframe for facility and item random effects model
+#------------------------------------------------------------
+# Note that this is model 4 in the Lancet GH paper; models 1-3 are not included because they are not used for the analysis of the impact of consumable availability scenarios
+# Add variables needed for the random effects regression (re_reg)
+chosen_varlist_for_re_reg <- unlist(c(chosen_varlist, 'program', 'fac_code', 'item', 'district', 'item_type'))
+
+df_for_re_reg <- df_for_regs[chosen_varlist_for_re_reg]
+df_for_re_reg <- na.omit(df_for_re_reg)
+
+# Sort by fac_code
+df_for_re_reg_sorted <- df_for_re_reg[order(df_for_re_reg$fac_code, df_for_re_reg$item),]
+# Create a numeric id for fac_code (clustering variable)
+df_for_re_reg_sorted$fac_id <- as.integer(factor(df_for_re_reg_sorted$fac_code, levels=unique(df_for_re_reg_sorted$fac_code)))
+df_for_re_reg_sorted <- as.data.frame(df_for_re_reg_sorted)
+
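+# Note on the random-effects specification fitted below: in lme4 syntax,
+# (1|district/fac_code) expands to (1|district) + (1|district:fac_code), i.e. random
+# facility intercepts nested within district intercepts; (1|program/item) likewise
+# nests item intercepts within programs.
+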
+#######################################################################
+# 3. Run Regression Model
+#######################################################################
+# Facility and item random effects logistic regression (model 4 in the Lancet GH paper)
+model_fac_item_re <- glmer(available ~ fac_type + fac_owner + fac_urban + functional_computer +
+                             functional_emergency_vehicle + service_diagnostic +
+                             incharge_drug_orders +
+                             dist_todh_cat + dist_torms_cat +
+                             drug_order_fulfilment_freq_last_3mts_cat + rms +
+                             functional_refrigerator + functional_landline + fuctional_mobile +
+                             functional_toilet + functional_handwashing_facility +
+                             water_source_main +
+                             outpatient_only +
+                             service_hiv + service_othersti +
+                             service_malaria + service_tb +
+                             service_fp + service_imci +
+                             source_drugs_ngo + source_drugs_pvt +
+                             drug_transport_self + item_drug + eml_priority_v +
+                             (1|district/fac_code) + (1|program/item),
+                           family = binomial(logit),
+                           data = df_for_re_reg_sorted,
+                           control = glmerControl(optimizer = "bobyqa",
+                                                  optCtrl = list(maxfun = 1e5),
+                                                  calc.derivs = TRUE)
+)
+
+# Calculate the intra-class correlation (ICC) by grouping level
+icc_between_model_fac_item_re <- performance::icc(model_fac_item_re, by_group = TRUE)
+
+# Save regression results
+#-------------------------
+save(model_fac_item_re, file = paste0(path_to_outputs, "regression_results/model_fac_item_re.rdta"))
+
+# Summarise results in a table
+#--------------------------------
+t <- tbl_regression(model_fac_item_re, exponentiate = TRUE, conf.int = TRUE, pvalue_fun = ~style_sigfig(., digits = 4))
+
+tbl_merge <-
+  tbl_merge(
+    tbls = list(t),
+    tab_spanner = c("**Facility and Item RE**")
+  ) %>% # build gtsummary table
+  as_gt() # convert to gt table
+# gt::gtsave( # save table as image
+#   filename = reg_results1
+# )
diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/clean_fac_locations.py b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/clean_fac_locations.py
deleted file mode 100644
index 3dcd4fe56e..0000000000
--- a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/clean_fac_locations.py
+++ /dev/null
@@ -1,360 +0,0 @@
-"""
-This script generates GIS data on facilities:
-
-Outputs:
-* ResourceFile_Facility_locations.csv
-* facility_distances.csv
-
-The following variables are added to the dataset generated by consumables_avaialbility_estimation.py:
-1. facility GIS coordinates
-2. Distance and drive time to corresponding District Health office
-3. Distance and drive time to corresponding Regional Medical Store (warehouse)
-
-Inputs:
-Dropbox location - ~05 - Resources/Module-healthsystem/consumables raw files/gis_data/LMISFacilityLocations_raw.xlsx
-
-NB. The comment of this file are commented-out because the script requires dependencies that are not included in the
-TLO framework at the time of writing. 
-""" - - -""" -import datetime -from pathlib import Path - -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -import requests -import googlemaps as gmaps -import requests -from matplotlib.lines import Line2D - - -# Path to TLO directory -outputfilepath = Path("./outputs") -resourcefilepath = Path("./resources") -path_for_new_resourcefiles = resourcefilepath / "healthsystem/consumables" - -# Set local Dropbox source -path_to_dropbox = Path( # <-- point to the TLO dropbox locally - 'C:/Users/sm2511/Dropbox/Thanzi la Onse' - # '/Users/tbh03/Dropbox (SPH Imperial College)/Thanzi la Onse Theme 1 SHARE' -) - -path_to_files_in_the_tlo_dropbox = path_to_dropbox / "05 - Resources/Module-healthsystem/consumables raw files/" - -# define a timestamp for script outputs -timestamp = datetime.datetime.now().strftime("_%Y_%m_%d_%H_%M") - -# print the start time of the script -print('Script Start', datetime.datetime.now().strftime('%H:%M')) - -# Use googlemaps package to obtain GIS coordinates using facility names -GCODE_URL = 'https://maps.googleapis.com/maps/api/geocode/json?' -GCODE_KEY = '' # PLaceholder to enter googlemaps API -# gmaps = gmaps.Client(key=GCODE_KEY) - -# 1. Clean Master Health Facility Registry (MHFR) data -###################################################################### -# Clean locations for facilities for which GIS data was not available on incorrect in the MHFR -# --- 1.1 Load and set up data --- # -fac_gis = pd.read_excel(open(path_to_files_in_the_tlo_dropbox / 'gis_data/LMISFacilityLocations_raw.xlsx', - 'rb'), sheet_name='final_gis_data') -fac_gis = fac_gis.rename( - columns={'LMIS Facility List': 'fac_name', 'OWNERSHIP': 'fac_owner', 'TYPE': 'fac_type', 'STATUS': 'fac_status', - 'ZONE': 'zone', 'DISTRICT': 'district', 'DATE OPENED': 'open_date', 'LATITUDE': 'lat', - 'LONGITUDE': 'long'}) - -# Create a new column providing source of GIS data -fac_gis['gis_source'] = "" - -# Store unique district names -districts = fac_gis['district'].unique() - -# Preserve rows with missing or incorrect location data in order to derive GIS data using googlemaps API -cond1 = fac_gis['lat'] > -8.5 -cond2 = fac_gis['lat'] < -17.5 -cond3 = fac_gis['long'] > 36.5 -cond4 = fac_gis['long'] < 32.5 -conda = cond1 | cond2 | cond3 | cond4 # outside Malawi's boundaries -fac_gis_noloc = fac_gis[fac_gis.lat.isna() | conda] -fac_gis_noloc = fac_gis_noloc.reset_index() -fac_gis_noloc = fac_gis_noloc.drop(columns='index') - -# Edit data source -cond_originalmhfr = fac_gis.lat.notna() & ~conda -fac_gis.loc[cond_originalmhfr, 'gis_source'] = 'Master Health Facility Registry' -cond_manual = fac_gis['manual_entry'].notna() -fac_gis.loc[cond_manual, 'gis_source'] = 'Manual google search' - -fac_gis_clean = fac_gis[~conda & fac_gis.lat.notna()] # save clean portion of raw data to be appended later - - -# --- 1.2 Geocode facilities with missing data --- # -# Define a function to geocode locations based on names -def reverse_gcode(location): - location = str(location).replace(' ', '+') - nav_req = 'address={}&key={}'.format(location, GCODE_KEY) - request = GCODE_URL + nav_req - result = requests.get(request) - data = result.json() - status = data['status'] - - geo_location = {} - if str(status) == "OK": - sizeofjson = len(data['results'][0]['address_components']) - for i in range(sizeofjson): - sizeoftype = len(data['results'][0]['address_components'][i]['types']) - if sizeoftype == 3: - geo_location[data['results'][0]['address_components'][i]['types'][2]] = \ - 
data['results'][0]['address_components'][i]['long_name'] - - else: - if data['results'][0]['address_components'][i]['types'][0] == 'administrative_area_level_1': - geo_location['state'] = data['results'][0]['address_components'][i]['long_name'] - - elif data['results'][0]['address_components'][i]['types'][0] == 'administrative_area_level_2': - geo_location['city'] = data['results'][0]['address_components'][i]['long_name'] - geo_location['town'] = geo_location['city'] - - else: - geo_location[data['results'][0]['address_components'][i]['types'][0]] = \ - data['results'][0]['address_components'][i]['long_name'] - - formatted_address = data['results'][0]['formatted_address'] - geo_location['lat'] = data['results'][0]['geometry']['location']['lat'] - geo_location['lang'] = data['results'][0]['geometry']['location']['lng'] - geo_location['formatted_address'] = formatted_address - - return geo_location - - -# Extract latitude, longitude and city based on facility name -for i in range(len(fac_gis_noloc)): - try: - try: - try: - print("Processing facility", fac_gis_noloc['fac_name'][i]) - geo_info = reverse_gcode(fac_gis_noloc['fac_name'][i] + 'Malawi') - fac_gis_noloc['lat'][i] = geo_info['lat'] - fac_gis_noloc['long'][i] = geo_info['lang'] - fac_gis_noloc['gis_source'][i] = 'Google maps geolocation' - fac_gis_noloc['district'][i] = geo_info['city'] - except ValueError: - pass - except TypeError: - pass - except KeyError: - pass - -# Drop incorrect GIS coordinates from the above generated dataset -conda = fac_gis_noloc.district.isin(districts) # districts not from Malawi -cond1 = fac_gis_noloc['lat'] > -8.5 -cond2 = fac_gis_noloc['lat'] < -17.5 -cond3 = fac_gis_noloc['long'] > 36.5 -cond4 = fac_gis_noloc['long'] < 32.5 -condb = cond1 | cond2 | cond3 | cond4 # outside Malawi's boundaries -fac_gis_noloc.loc[~conda | condb, 'lat'] = np.nan -fac_gis_noloc.loc[~conda | condb, 'long'] = np.nan -fac_gis_noloc.loc[~conda | condb, 'district'] = np.nan - -cond = fac_gis_noloc.gis_source.isna() -fac_gis_noloc.loc[cond, 'lat'] = np.nan -fac_gis_noloc.loc[cond, 'long'] = np.nan - -# Append newly generated GIS information to the raw data -fac_gis = fac_gis_noloc.append(fac_gis_clean) - -# Drop incorrect GIS coordinates based on later comparison with district data from LMIS -list_of_incorrect_locations = ['Bilal Clinic', 'Biliwiri Health Centre', 'Chilonga Health care Health Centre', - 'Diamphwi Health Centre', 'Matope Health Centre (CHAM)', 'Nambazo Health Centre', - 'Nkhwayi Health Centre', 'Nsambe Health Centre (CHAM)', 'Padley Pio Health Centre', - 'Phanga Health Centre', 'Somba Clinic', "St. Martin's Molere Health Centre CHAM", - 'Ngapani Clinic', 'Mulungu Alinafe Clinic', 'Mdeza Health Centre', - 'Matandani Health Centre (CHAM)', - 'Sunrise Clinic', 'Sucoma Clinic'] -mapped_to_malawi = fac_gis.lat == -13.254308 -cond = fac_gis.fac_name.isin(list_of_incorrect_locations) | mapped_to_malawi -fac_gis.loc[cond, 'lat'] = np.nan -fac_gis.loc[cond, 'long'] = np.nan -fac_gis.loc[cond, 'gis_source'] = np.nan -fac_gis.loc[cond, 'district'] = np.nan - -# 2. 
Clean data using information from LMIS # -##################################################################################################### -# --- 2.1 Load and set up LMIS data --- # -stkout_df = pd.read_csv(path_for_new_resourcefiles / "ResourceFile_Consumables_availability_and_usage.csv", - low_memory=False) - -# Drop rows which can't be used in regression analysis -regsubset_cond1 = stkout_df['data_source'] == 'original_lmis_data' -regsubset_cond2 = stkout_df[ - 'fac_type_tlo'] == 'Facility_level_0' # since only one facility from Mchinji reported in OpenLMIS -stkout_df_reg = stkout_df[regsubset_cond1 & ~regsubset_cond2] - -# Clean some district names to match with master health facility registry -rename_districts = { - 'Nkhota Kota': 'Nkhotakota', - 'Nkhata bay': 'Nkhata Bay' -} -stkout_df['district'] = stkout_df['district'].replace(rename_districts) - -# Keep only relevant columns -lmis_district = stkout_df[['fac_name', 'fac_type_tlo', 'district']] -lmis_district = lmis_district.drop_duplicates() - -# --- 2.2 Clean district column and assign relevant DHO to each facility --- # -# Manual fixes before assigning DHO -# Master Health facility registry did not differentiate between Mzimba North and Mzimba South --> get this data -# and any other district discrepancies from LMIS -fac_gis = fac_gis.rename(columns={'district': 'district_mhfr'}) -fac_gis = pd.merge(fac_gis, lmis_district, how='left', on='fac_name') - -list_mhfr_district_is_correct = ['Chididi Health Centre', 'Chikowa Health Centre', - 'Chileka Health Centre'] -cond_mhfr_district_is_correct = fac_gis.fac_name.isin(list_mhfr_district_is_correct) -cond_lmis_district_missing = fac_gis.district.isna() -fac_gis.loc[cond_mhfr_district_is_correct | cond_lmis_district_missing, 'district'] = fac_gis.district_mhfr -fac_gis = fac_gis.drop(columns=['zone', 'district_mhfr', 'open_date', 'manual_entry']) - -# --- 1.3 Extract final file with GIS locations into .csv --- # -fac_gis = fac_gis[fac_gis['lat'].notna()] # Keep rows with GIS locations -fac_gis.to_csv(path_for_new_resourcefiles / "ResourceFile_Facility_locations.csv") - -# Locate the corresponding DHO for each facility -cond1 = fac_gis['fac_name'].str.contains('DHO') -cond2 = fac_gis['fac_name'].str.contains('istrict') -# Create columns indicating the coordinates of the corresponding DHO for each facility -dho_df = fac_gis[cond1 | cond2].reset_index() -# Rename columns -dho_df = dho_df.rename(columns={'lat': 'lat_dh', 'long': 'long_dh'}) - -# Merge main GIS dataframe with corresponding DHO -fac_gis = pd.merge(fac_gis, dho_df[['district', 'lat_dh', 'long_dh']], how='left', on='district') - -# --- 2.3 Assign relevant CMST Regional Medical Store to each facility --- # -# Create columns indicating the coordinates of the corresponding CMST warehouse (regional medical store) for each -# facility -fac_gis['lat_rms'] = np.nan -fac_gis['long_rms'] = np.nan -fac_gis['rms'] = np.nan - -# RMS Center (-13.980394, 33.783521) -cond_center1 = fac_gis['district'].isin(['Kasungu', 'Ntchisi', 'Dowa', 'Mchinji', 'Lilongwe', 'Ntcheu', - 'Dedza', 'Nkhotakota', 'Salima']) -cond_center2 = fac_gis['fac_name'].str.contains('Kamuzu Central Hospital') -fac_gis.loc[cond_center1 | cond_center2, 'lat_rms'] = -13.980394 -fac_gis.loc[cond_center1 | cond_center2, 'long_rms'] = 33.783521 -fac_gis.loc[cond_center1 | cond_center2, 'rms'] = 'RMS Center' - -# RMS North (-11.425590, 33.997467) -cond_north1 = fac_gis['district'].isin(['Nkhata Bay', 'Rumphi', 'Chitipa', 'Likoma', 'Karonga', - 'Mzimba North', 'Mzimba 
South']) -cond_north2 = fac_gis['fac_name'].str.contains('Mzuzu Central Hospital') -fac_gis.loc[cond_north1 | cond_north2, 'lat_rms'] = -11.425590 -fac_gis.loc[cond_north1 | cond_north2, 'long_rms'] = 33.997467 -fac_gis.loc[cond_north1 | cond_north2, 'rms'] = 'RMS North' - -# RMS South (-15.804544, 35.021192) -cond_south1 = fac_gis['district'].isin(['Blantyre', 'Balaka', 'Machinga', 'Zomba', 'Mangochi', 'Thyolo', 'Nsanje', - 'Chikwawa', 'Mwanza', 'Neno', 'Mulanje', 'Phalombe', 'Chiradzulu']) -cond_south2 = fac_gis['fac_name'].str.contains('Queen Elizabeth Central') -cond_south3 = fac_gis['fac_name'].str.contains('Zomba Central') -cond_south4 = fac_gis['fac_name'].str.contains('Zomba Mental') -fac_gis.loc[cond_south1 | cond_south2 | cond_south3 | cond_south4, 'lat_rms'] = -15.804544 -fac_gis.loc[cond_south1 | cond_south2 | cond_south3 | cond_south4, 'long_rms'] = 35.021192 -fac_gis.loc[cond_south1 | cond_south2 | cond_south3 | cond_south4, 'rms'] = 'RMS South' -fac_gis['district'].unique() - -# 3. Generate data on distance and travel time between facilities and DHO/RMS # -##################################################################################################### -# --- 3.1 Distance and travel time of each facility from the corresponding DHO --- # -fac_gis['dist_todh'] = np.nan -fac_gis['drivetime_todh'] = np.nan -for i in range(len(fac_gis)): - try: - # print("Processing facility", i) - latfac = fac_gis['lat'][i] - longfac = fac_gis['long'][i] - latdho = fac_gis['lat_dh'][i] - longdho = fac_gis['long_dh'][i] - origin = (latdho, longdho) - dest = (latfac, longfac) - - fac_gis['dist_todh'][i] = \ - gmaps.distance_matrix(origin, dest, mode='driving')['rows'][0]['elements'][0]['distance']['value'] - fac_gis['drivetime_todh'][i] = \ - gmaps.distance_matrix(origin, dest, mode='driving')['rows'][0]['elements'][0]['duration']['value'] - except: - pass - -# --- 3.2 Distance and travel time of each facility from the corresponding RMS --- # -fac_gis['dist_torms'] = np.nan -fac_gis['drivetime_torms'] = np.nan -for i in range(len(fac_gis)): - try: - # print("Processing facility", i) - latfac = fac_gis['lat'][i] - longfac = fac_gis['long'][i] - latdho = fac_gis['lat_rms'][i] - longdho = fac_gis['long_rms'][i] - origin = (latdho, longdho) - dest = (latfac, longfac) - - fac_gis['dist_torms'][i] = \ - gmaps.distance_matrix(origin, dest, mode='driving')['rows'][0]['elements'][0]['distance']['value'] - fac_gis['drivetime_torms'][i] = \ - gmaps.distance_matrix(origin, dest, mode='driving')['rows'][0]['elements'][0]['duration']['value'] - except: - pass - -# Update distance values from DH to 0 for levels 2 and above -cond1 = fac_gis['fac_type_tlo'] == 'Facility_level_2' -cond2 = fac_gis['fac_type_tlo'] == 'Facility_level_3' -cond3 = fac_gis['fac_type_tlo'] == 'Facility_level_4' -fac_gis.loc[cond1 | cond2 | cond3, 'dist_todh'] = 0 -fac_gis.loc[cond1 | cond2 | cond3, 'drivetime_todh'] = 0 - -# 4. Save data to be merge into Consumable availabilty dataset for regression analysis # -##################################################################################################### -# Keep only necessary columns and save as .csv -fac_gis = fac_gis[['district', 'rms', 'lat', 'long', 'lat_dh', 'long_dh', 'lat_rms', 'long_rms', - 'dist_torms', 'drivetime_torms', 'dist_todh', 'drivetime_todh', 'fac_name', 'gis_source']] - -# - 1.2.5 Export distances file to dropbox - # -fac_gis.to_csv(path_to_files_in_the_tlo_dropbox / 'gis_data/facility_distances.csv') - -# 5. 
Descriptive graphs # -##################################################################################################### -groups = fac_gis.groupby('district') - -# Scatterplot of distance and drive time to DHO -fig, ax = plt.subplots() -ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling -for name, group in groups: - ax.plot(group.dist_todh / 1000, group.drivetime_todh, marker='o', linestyle='', ms=5, label=name) -# Shrink current axis by 20% to fit legend -box = ax.get_position() -ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) -# Put a legend to the right of the current axis -ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) -plt.xlabel("Distance (kilometers)", fontsize=12) -plt.ylabel("Drive time (minutes)", fontsize=12) -plt.savefig('C:/Users/sm2511/OneDrive - University of York/Desktop/faclocation_wrtdh_new.png') - -# Scatterplot of distance and drive time to RMS -groups = fac_gis.groupby('rms') -fig, ax = plt.subplots() -ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling -for name, group in groups: - ax.plot(group.dist_torms / 1000, group.drivetime_torms, marker='o', linestyle='', ms=5, label=name) -# Shrink current axis by 20% to fit legend -box = ax.get_position() -ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) -# Put a legend to the right of the current axis -ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) -plt.xlabel("Distance (kilometers)", fontsize=12) -plt.ylabel("Drive time (minutes)", fontsize=12) -plt.savefig('C:/Users/sm2511/OneDrive - University of York/Desktop/faclocation_wrtrms.png') -""" diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py index 3615afd400..9f0c71f22c 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py @@ -30,14 +30,12 @@ from tlo.methods.consumables import check_format_of_consumables_file -# Set local Dropbox source -path_to_dropbox = Path( # <-- point to the TLO dropbox locally - '/Users/sm2511/Dropbox/Thanzi la Onse' - # '/Users/sejjj49/Dropbox/Thanzi la Onse' - # 'C:/Users/tmangal/Dropbox/Thanzi la Onse' +# Set local shared folder source +path_to_share = Path( # <-- point to the shared folder + '/Users/sm2511/Library/CloudStorage/OneDrive-SharedLibraries-ImperialCollegeLondon/TLOModel - WP - Documents/' ) -path_to_files_in_the_tlo_dropbox = path_to_dropbox / "05 - Resources/Module-healthsystem/consumables raw files/" +path_to_files_in_the_tlo_shared_drive = path_to_share / "07 - Data/Consumables data/" # define a timestamp for script outputs timestamp = datetime.datetime.now().strftime("_%Y_%m_%d_%H_%M") @@ -68,7 +66,7 @@ def change_colnames(df, NameChangeList): # Change column names ######################################################################################### # Import 2018 data -lmis_df = pd.read_csv(path_to_files_in_the_tlo_dropbox / 'ResourceFile_LMIS_2018.csv', low_memory=False) +lmis_df = pd.read_csv(path_to_files_in_the_tlo_shared_drive / 'OpenLMIS/2018/ResourceFile_LMIS_2018.csv', low_memory=False) # 1. 
BASIC CLEANING ## # Rename columns @@ -515,7 +513,7 @@ def custom_agg_stkout(x): unmatched_consumables = unmatched_consumables[unmatched_consumables['item_y'].isna()] # ** Extract stock availability data from HHFA and clean data ** -hhfa_df = pd.read_excel(path_to_files_in_the_tlo_dropbox / 'ResourceFile_hhfa_consumables.xlsx', sheet_name='hhfa_data') +hhfa_df = pd.read_excel(path_to_files_in_the_tlo_shared_drive / 'ResourceFile_hhfa_consumables.xlsx', sheet_name='hhfa_data') # Use the ratio of availability rates between levels 1b on one hand and levels 2 and 3 on the other to extrapolate # availability rates for levels 2 and 3 from the HHFA data @@ -541,7 +539,7 @@ def custom_agg_stkout(x): hhfa_df.loc[cond, var] = 1 # Add further assumptions on consumable availability from other sources -assumptions_df = pd.read_excel(open(path_to_files_in_the_tlo_dropbox / 'ResourceFile_hhfa_consumables.xlsx', 'rb'), +assumptions_df = pd.read_excel(open(path_to_files_in_the_tlo_shared_drive / 'ResourceFile_hhfa_consumables.xlsx', 'rb'), sheet_name='availability_assumptions') assumptions_df = assumptions_df[['item_code', 'available_prop_Facility_level_0', 'available_prop_Facility_level_1a', 'available_prop_Facility_level_1b', @@ -606,35 +604,54 @@ def custom_agg_stkout(x): stkout_df = pd.concat([stkout_df, hhfa_fac0], axis=0, ignore_index=True) # --- 6.4 Generate new category variable for analysis --- # -stkout_df['category'] = stkout_df['module_name'].str.lower() -cond_RH = (stkout_df['category'].str.contains('care_of_women_during_pregnancy')) | \ - (stkout_df['category'].str.contains('labour')) -cond_newborn = (stkout_df['category'].str.contains('newborn')) -cond_childhood = (stkout_df['category'] == 'acute lower respiratory infections') | \ - (stkout_df['category'] == 'measles') | \ - (stkout_df['category'] == 'diarrhoea') -cond_rti = stkout_df['category'] == 'road traffic injuries' -cond_cancer = stkout_df['category'].str.contains('cancer') -cond_ncds = (stkout_df['category'] == 'epilepsy') | \ - (stkout_df['category'] == 'depression') -stkout_df.loc[cond_RH, 'category'] = 'reproductive_health' -stkout_df.loc[cond_cancer, 'category'] = 'cancer' -stkout_df.loc[cond_newborn, 'category'] = 'neonatal_health' -stkout_df.loc[cond_childhood, 'category'] = 'other_childhood_illnesses' -stkout_df.loc[cond_rti, 'category'] = 'road_traffic_injuries' -stkout_df.loc[cond_ncds, 'category'] = 'ncds' - -cond_condom = stkout_df['item_code'] == 2 -stkout_df.loc[cond_condom, 'category'] = 'contraception' - -# Create a general consumables category -general_cons_list = [300, 33, 57, 58, 141, 5, 6, 10, 21, 23, 127, 24, 80, 93, 144, 149, 154, 40, 67, 73, 76, - 82, 101, 103, 88, 126, 135, 71, 98, 171, 133, 134, 244, 247] -diagnostics_cons_list = [41, 50, 128, 216, 2008, 47, 190, 191, 196, 206, 207, 163, 175, 184, - 187] # for now these have not been applied because most diagnostics are program specific - -cond_general = stkout_df['item_code'].isin(general_cons_list) -stkout_df.loc[cond_general, 'category'] = 'general' +def recategorize_modules_into_consumable_categories(_df): + _df['item_category'] = _df['module_name'].str.lower() + cond_RH = (_df['item_category'].str.contains('care_of_women_during_pregnancy')) | \ + (_df['item_category'].str.contains('labour')) + cond_newborn = (_df['item_category'].str.contains('newborn')) + cond_newborn[cond_newborn.isna()] = False + cond_childhood = (_df['item_category'] == 'acute lower respiratory infections') | \ + (_df['item_category'] == 'measles') | \ + (_df['item_category'] == 
'diarrhoea') + cond_rti = _df['item_category'] == 'road traffic injuries' + cond_cancer = _df['item_category'].str.contains('cancer') + cond_cancer[cond_cancer.isna()] = False + cond_ncds = (_df['item_category'] == 'epilepsy') | \ + (_df['item_category'] == 'depression') + _df.loc[cond_RH, 'item_category'] = 'reproductive_health' + _df.loc[cond_cancer, 'item_category'] = 'cancer' + _df.loc[cond_newborn, 'item_category'] = 'neonatal_health' + _df.loc[cond_childhood, 'item_category'] = 'other_childhood_illnesses' + _df.loc[cond_rti, 'item_category'] = 'road_traffic_injuries' + _df.loc[cond_ncds, 'item_category'] = 'ncds' + cond_condom = _df['item_code'] == 2 + _df.loc[cond_condom, 'item_category'] = 'contraception' + + # Create a general consumables category + general_cons_list = [300, 33, 57, 58, 141, 5, 6, 10, 21, 23, 127, 24, 80, 93, 144, 149, 154, 40, 67, 73, 76, + 82, 101, 103, 88, 126, 135, 71, 98, 171, 133, 134, 244, 247, 49, 112, 1933, 1960] + cond_general = _df['item_code'].isin(general_cons_list) + _df.loc[cond_general, 'item_category'] = 'general' + + # Fill gaps in categories + dict_for_missing_categories = {292: 'acute lower respiratory infections', 293: 'acute lower respiratory infections', + 307: 'reproductive_health', 2019: 'reproductive_health', + 2678: 'tb', 1171: 'other_childhood_illnesses', 1237: 'cancer', 1239: 'cancer'} + # Use map to create a new series from item_code to fill missing values in category + mapped_categories = _df['item_code'].map(dict_for_missing_categories) + # Use fillna on the 'item_category' column to fill missing values using the mapped_categories + _df['item_category'] = _df['item_category'].fillna(mapped_categories) + + return _df + +stkout_df = recategorize_modules_into_consumable_categories(stkout_df) +item_code_category_mapping = stkout_df[['item_category', 'item_code']].drop_duplicates() + +# Add item_category to ResourceFile_Consumables_Item_Designations +item_designations = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_Consumables_Item_Designations.csv') +item_designations = item_designations.drop(columns = 'item_category') +item_designations = item_designations.merge(item_code_category_mapping, left_on = 'Item_Code', right_on = 'item_code', how = 'left', validate = '1:1') +item_designations.drop(columns = 'item_code').to_csv(path_for_new_resourcefiles / 'ResourceFile_Consumables_Item_Designations.csv', index = False) # --- 6.5 Replace district/fac_name/month entries where missing --- # for var in ['district', 'fac_name', 'month']: @@ -822,12 +839,15 @@ def interpolate_missing_with_mean(_ser): # Check that there are not missing values assert not pd.isnull(full_set_interpolated).any().any() +full_set_interpolated = full_set_interpolated.reset_index() +#full_set_interpolated = full_set_interpolated.reset_index().merge(item_code_category_mapping, on = 'item_code', how = 'left', validate = 'm:1') + # --- Check that the exported file has the properties required of it by the model code. --- # -check_format_of_consumables_file(df=full_set_interpolated.reset_index(), fac_ids=fac_ids) +check_format_of_consumables_file(df=full_set_interpolated, fac_ids=fac_ids) # %% # Save -full_set_interpolated.reset_index().to_csv( +full_set_interpolated.to_csv( path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv", index=False ) @@ -849,7 +869,7 @@ def interpolate_missing_with_mean(_ser): hhfa_comparison_df = hhfa_comparison_df.rename({'fac_type_tlo': 'Facility_Level'}, axis=1) # ii. 
Collapse final model availability data by facility level -final_availability_df = full_set_interpolated.reset_index() +final_availability_df = full_set_interpolated mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv") final_availability_df = pd.merge(final_availability_df, mfl[['District', 'Facility_Level', 'Facility_ID']], how="left", on=['Facility_ID'], @@ -871,7 +891,6 @@ def interpolate_missing_with_mean(_ser): size = 10 comparison_df['consumable_labels'] = comparison_df['consumable_name_tlo'].str[:10] - # Define function to draw calibration plots at different levels of disaggregation def comparison_plot(level_of_disaggregation, group_by_var, colour): comparison_df_agg = comparison_df.groupby([group_by_var], diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/descriptive_stats.py b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/descriptive_stats.py deleted file mode 100644 index fc5c775bce..0000000000 --- a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/descriptive_stats.py +++ /dev/null @@ -1,63 +0,0 @@ -""" -This script generates the consumables availability dataset for regression analysis using the outputs of - -consumables_availability_estimation.py and clean_fac_locations.py - -and generates descriptive figures and tables. -""" -import datetime -from pathlib import Path - -import pandas as pd - -# import numpy as np -# import calendar -# import copy -# import matplotlib.pyplot as plt -# from matplotlib.lines import Line2D -# from matplotlib import pyplot # for figures -# import seaborn as sns -# import math - -# Path to TLO directory -outputfilepath = Path("./outputs") -resourcefilepath = Path("./resources") -path_for_new_resourcefiles = resourcefilepath / "healthsystem/consumables" - -# Set local Dropbox source -path_to_dropbox = Path( # <-- point to the TLO dropbox locally - 'C:/Users/sm2511/Dropbox/Thanzi la Onse' -) - -path_to_files_in_the_tlo_dropbox = path_to_dropbox / "05 - Resources/Module-healthsystem/consumables raw files/" - -# define a timestamp for script outputs -timestamp = datetime.datetime.now().strftime("_%Y_%m_%d_%H_%M") - -# print the start time of the script -print('Script Start', datetime.datetime.now().strftime('%H:%M')) - -# 1. 
DATA IMPORT AND CLEANING # -######################################################################################### -# --- 1.1 Import consumables availability data --- # -stkout_df = pd.read_csv(path_for_new_resourcefiles / "ResourceFile_Consumables_availability_and_usage.csv", - low_memory=False) - -# Drop rows which can't be used in regression analysis -regsubset_cond1 = stkout_df['data_source'] == 'original_lmis_data' -regsubset_cond2 = stkout_df[ - 'fac_type_tlo'] == 'Facility_level_0' # since only one facility from Mchinji reported in OpenLMIS -stkout_df_reg = stkout_df[regsubset_cond1 & ~regsubset_cond2] - -# Clean some district names to match with master health facility registry -rename_districts = { - 'Nkhota Kota': 'Nkhotakota', - 'Nkhata bay': 'Nkhata Bay' -} -stkout_df['district'] = stkout_df['district'].replace(rename_districts) - -# --- 1.2 Import GIS data --- # -fac_gis = pd.read_csv(path_to_files_in_the_tlo_dropbox / "gis_data/facility_distances.csv") - -# --- 1.3 Merge cleaned LMIS data with GIS data --- # -consumables_df = pd.merge(stkout_df.drop(columns=['district', 'Unnamed: 0']), fac_gis.drop(columns=['Unnamed: 0']), - how='left', on='fac_name') -consumables_df.to_csv(path_to_files_in_the_tlo_dropbox / 'consumables_df.csv') diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py new file mode 100644 index 0000000000..1d6a8dee84 --- /dev/null +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -0,0 +1,1001 @@ +""" +This script adds estimates of availability of consumables under different scenarios to the base Resource File: + +Outputs: +* Updated version of ResourceFile_Consumables_availability_small.csv (estimate of consumable available - smaller file for use in the + simulation) which includes new consumable availability estimates for all scenarios. The following scenarios are generated +1. 'default' : this is the benchmark scenario with 2018 levels of consumable availability +2. 'scenario1' : [Level 1a + 1b] All items perform as well as consumables other than drugs/diagnostic tests +3. 'scenario2' : [Level 1a + 1b] 1 + All items perform as well as consumables classified as 'Vital' in the Essential Medicines List +4. 'scenario3' : [Level 1a + 1b] 2 + All facilities perform as well as those in which consumables stock is managed by pharmacists +5. 'scenario4' : [Level 1a + 1b] 3 + Level 1a facilities perform as well as level 1b +6. 'scenario5' : [Level 1a + 1b] 4 + All facilities perform as well as CHAM facilities +7. 'scenario6' : [Level 1a + 1b] All facilities have the same probability of consumable availability as the 75th percentile best performing facility for each individual item +8. 'scenario7' : [Level 1a + 1b] All facilities have the same probability of consumable availability as the 90th percentile best performing facility for each individual item +9. 'scenario8' : [Level 1a + 1b] All facilities have the same probability of consumable availability as the 99th percentile best performing facility for each individual item +10. 
'scenario9' : [Level 1a + 1b + 2] All facilities have the same probability of consumable availability as the 99th percentile best performing facility for each individual item
+11. 'scenario10' : [Level 1a + 1b] All programs perform as well as HIV
+12. 'scenario11' : [Level 1a + 1b] All programs perform as well as EPI
+13. 'scenario12' : [Level 1a + 1b] HIV performs as well as other programs (excluding EPI)
+14. 'all': all consumables are always available - provides the theoretical maximum health gains which can be made through improving consumable supply
+
+Inputs:
+* outputs/regression_analysis/predictions/predicted_consumable_availability_regression_scenarios.csv - This file is hosted
+locally after running consumable_resource_analyses_with_hhfa/regression_analysis/main.R
+* `ResourceFile_Consumables_availability_small.csv` - This file contains the original consumable availability estimates
+from the OpenLMIS 2018 data
+* `ResourceFile_Consumables_matched.csv` - This file contains the crosswalk of HHFA consumables to consumables in the
+TLO model
+
+It creates one row for each consumable for availability at a specific facility and month when the data is extracted from
+the OpenLMIS dataset, and one row for each consumable for availability aggregated across all facilities when the data is
+extracted from the Harmonised Health Facility Assessment 2018/19.
+
+Consumable availability is measured as the probability of a consumable being available at any point in time.
+"""
+import calendar
+import datetime
+from collections import defaultdict
+from pathlib import Path
+import os
+
+import matplotlib.pyplot as plt
+from plotnine import *  # ggplot, aes, geom_point for R-style ggplots
+import seaborn as sns
+import numpy as np
+import pandas as pd
+
+from tlo.methods.consumables import check_format_of_consumables_file
+
+# define a timestamp for script outputs
+timestamp = datetime.datetime.now().strftime("_%Y_%m_%d_%H_%M")
+
+# print the start time of the script
+print('Script Start', datetime.datetime.now().strftime('%H:%M'))
+
+# define a pathway to the data folder (note: currently outside the TLO model directory)
+# remember to set working directory to TLOmodel/
+outputfilepath = Path("./outputs")
+resourcefilepath = Path("./resources")
+path_for_new_resourcefiles = resourcefilepath / "healthsystem/consumables"
+
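+# Worked example (hypothetical numbers) of how the regression-based change proportions are
+# applied: a consumable with available_prop = 0.6 in the LMIS data and a predicted
+# change_proportion of 1.25 would get a scenario availability of 0.6 * 1.25 = 0.75;
+# being a probability, the product would need to be capped at 1.
+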
Import and clean data files
+#**********************************
+# 1.1 Import TLO model availability data
+#------------------------------------------------------
+tlo_availability_df = pd.read_csv(path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv")
+# Drop any scenario data previously included in the resourcefile
+tlo_availability_df = tlo_availability_df[['Facility_ID', 'month', 'item_code', 'available_prop']]
+
+# Import item_category
+program_item_mapping = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_Consumables_Item_Designations.csv')[['Item_Code', 'item_category']]
+program_item_mapping = program_item_mapping[program_item_mapping.item_category.notna()].rename(columns={'Item_Code': 'item_code'})
+
+# 1.1.1 Attach district, facility level and item_category to this dataset
+#----------------------------------------------------------------
+# Get TLO Facility_ID for each district and facility level
+mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv")
+districts = set(pd.read_csv(resourcefilepath / 'demography' / 'ResourceFile_Population_2010.csv')['District'])
+fac_levels = {'0', '1a', '1b', '2', '3', '4'}
+tlo_availability_df = tlo_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']],
+                                                on=['Facility_ID'], how='left')
+
+tlo_availability_df = tlo_availability_df.merge(program_item_mapping,
+                                                on=['item_code'], how='left')
+
+# 1.2 Import scenario data
+#------------------------------------------------------
+scenario_availability_df = pd.read_csv(outputfilepath / "regression_analysis/predictions/predicted_consumable_availability_regression_scenarios.csv")
+scenario_availability_df = scenario_availability_df.drop(['Unnamed: 0'], axis=1)
+scenario_availability_df = scenario_availability_df.rename({'item': 'item_hhfa'}, axis=1)
+
+# Prepare scenario data to be merged to TLO model availability based on TLO model features
+# 1.2.1 Level of care
+#------------------------------------------------------
+scenario_availability_df['fac_type'] = scenario_availability_df['fac_type_original'].str.replace("Facility_level_", "")
+
+# 1.2.2 District
+#------------------------------------------------------
+# Do some mapping to make the Districts in the scenario file line up with the definition of Districts in the model
+rename_and_collapse_to_model_districts = {
+    'Mzimba South': 'Mzimba',
+    'Mzimba North': 'Mzimba',
+}
+scenario_availability_df = scenario_availability_df.rename({'district': 'district_original'}, axis=1)
+scenario_availability_df['District'] = scenario_availability_df['district_original'].replace(rename_and_collapse_to_model_districts)
+
+# Cities get the same results as their respective regions
+copy_source_to_destination = {
+    'Mzimba': 'Mzuzu City',
+    'Lilongwe': 'Lilongwe City',
+    'Zomba': 'Zomba City',
+    'Blantyre': 'Blantyre City',
+    'Nkhata Bay': 'Likoma'  # based on anecdotal evidence, assume that Likoma experiences the same change in availability as Nkhata Bay as a result of interventions, based on regression results
+}
+for source, destination in copy_source_to_destination.items():
+    new_rows = scenario_availability_df.loc[scenario_availability_df.District == source].copy()  # standardised district names
+    new_rows.District = destination
+    scenario_availability_df = pd.concat([scenario_availability_df, new_rows], ignore_index=True)
+
+assert sorted(set(districts)) == sorted(set(pd.unique(scenario_availability_df.District)))
+
+# 1.2.3 Facility_ID
+# 
#------------------------------------------------------ +# Merge-in facility_id +scenario_availability_df = scenario_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], + left_on=['District', 'fac_type'], + right_on=['District', 'Facility_Level'], how='left', indicator=True) +scenario_availability_df = scenario_availability_df.rename({'_merge': 'merge_facid'}, axis=1) + +# Extract list of District X Facility Level combinations for which there is no HHFA data +df_to_check_prediction_completeness = scenario_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], + left_on=['District', 'Facility_Level'], + right_on=['District', 'Facility_Level'], how='right', indicator=True) +cond_no_1b = (df_to_check_prediction_completeness['Facility_Level'].isin(['1b'])) & (df_to_check_prediction_completeness['_merge'] == 'right_only') +cond_no_1a = (df_to_check_prediction_completeness['Facility_Level'].isin(['1a'])) & (df_to_check_prediction_completeness['_merge'] == 'right_only') +districts_with_no_scenario_data_for_1b = df_to_check_prediction_completeness[cond_no_1b]['District'].unique() +districts_with_no_scenario_data_for_1a = df_to_check_prediction_completeness[cond_no_1a]['District'].unique() +districts_with_no_scenario_data_for_1b_only = np.setdiff1d(districts_with_no_scenario_data_for_1b, districts_with_no_scenario_data_for_1a) + +# According to HHFA data, Balaka, Machinga, Mwanza, Ntchisi and Salima do not have level 1b facilities +# Likoma was not included in the regression because of the limited variation within the district - only 4 facilities - we have assumed that the change of consumable +# availability in Likoma is equal to that predicted for Nkhata Bay + +# 1.2.4 Program +#------------------------------------------------------ +scenario_availability_df.loc[scenario_availability_df.program_plot == 'infection_prev', 'program_plot'] = 'general' # there is no separate infection_prevention category in the TLO availability data +map_model_programs_to_hhfa = { + 'contraception': 'contraception', + 'general': 'general', + 'reproductive_health': 'obs&newb', + 'road_traffic_injuries': 'surgical', + 'epi': 'epi', + 'neonatal_health': 'obs&newb', + 'other_childhood_illnesses': 'alri', + 'malaria': 'malaria', + 'tb': 'tb', + 'hiv': 'hiv', + 'undernutrition': 'child', + 'ncds': 'ncds', + 'cardiometabolicdisorders': 'ncds', + 'cancer': 'ncds', +} +# Reverse the map_model_programs_to_hhfa dictionary +hhfa_to_model_programs = {v: k for k, v in map_model_programs_to_hhfa.items()} + +scenario_availability_df['category_tlo'] = scenario_availability_df['program_plot'].replace(hhfa_to_model_programs) # TODO this might not be relevant + +# 1.2.5 Consumable/Item code and Category +#------------------------------------------------------ +# Load TLO - HHFA consumable name crosswalk +consumable_crosswalk_df = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_consumables_matched.csv', encoding='ISO-8859-1')[['item_code', 'consumable_name_tlo', +'item_hhfa_for_scenario_generation', 'hhfa_mapping_rationale_for_scenario_generation']] + +# Keep only item_codes in the availability dataframe +consumable_crosswalk_df = consumable_crosswalk_df.merge(tlo_availability_df[['item_code']], how = 'right', on = 'item_code') +# TODO is module_name used? +# TODO add new consumables Rifapentine to this? 
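+
+# Note on the merge below: the HHFA-TLO crosswalk is many-to-many (one HHFA item
+# can map to several TLO item codes and vice versa), hence validate="m:m" and the
+# explicit drop_duplicates on (Facility_ID, item_code) afterwards. A minimal,
+# illustrative sketch (toy frames, not the real data) of the indicator-based
+# diagnostic used to count unmatched rows:
+#   left = pd.DataFrame({'key': ['a']}); right = pd.DataFrame({'key': ['a', 'b']})
+#   merged = left.merge(right, on='key', how='right', indicator=True)
+#   (merged['_merge'] == 'right_only').sum()  # -> 1, i.e. one unmatched row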
+
+# Now merge in TLO item codes
+scenario_availability_df = scenario_availability_df.reset_index(drop=True)
+scenario_availability_df = scenario_availability_df.merge(consumable_crosswalk_df[['item_code', 'item_hhfa_for_scenario_generation', 'hhfa_mapping_rationale_for_scenario_generation', 'consumable_name_tlo']],
+                                                          left_on=['item_hhfa'], right_on=['item_hhfa_for_scenario_generation'], how='right', indicator=True, validate="m:m")
+scenario_availability_df = scenario_availability_df.drop_duplicates(['Facility_ID', 'item_code'])
+scenario_availability_df = scenario_availability_df.rename({'_merge': 'merge_itemcode'}, axis=1)
+print("Number of item codes from the TLO model for which no match was found in the regression-based scenario data = ",
+      scenario_availability_df.merge_itemcode.value_counts().get('right_only', 0))
+
+# Before merging the above dataframe with tlo_availability_df and applying a general interpolation rule to fill any
+# gaps, we need to make sure that any specific interpolation rules are applied to the scenario dataframe.
+
+# Further, a row needs to be added for level 1b under Balaka, Machinga, Mwanza, Ntchisi and Salima
+print("Number of unique facility IDs: \n - TLO consumable data = ", tlo_availability_df.Facility_ID.nunique(),
+      "\n - Scenario based on regression = ", scenario_availability_df.Facility_ID.nunique(),
+      "\nNumber of unique item codes: \n - TLO consumable availability data = ", tlo_availability_df.item_code.nunique(),
+      "\n - TLO consumable availability repository = ", consumable_crosswalk_df.item_code.nunique(),
+      "\n - Scenario based on regression = ", scenario_availability_df.item_code.nunique())
+
+# Extract list of TLO consumables which weren't matched with the availability prediction dataframe
+items_not_matched = scenario_availability_df['merge_itemcode'] == 'right_only'
+
+# Get average availability_change_prop value by facility_ID and category_tlo
+scenario_availability_df = scenario_availability_df.merge(program_item_mapping,
+                                                          on=['item_code'], validate="m:1",
+                                                          how="left")
+
+# 1.3 Initial interpolation
+#------------------------------------------------------
+# 1.3.1 Items not relevant to the regression analysis
+items_not_relevant_to_regression = (items_not_matched) & (scenario_availability_df['hhfa_mapping_rationale_for_scenario_generation'] == 'not relevant to logistic regression analysis')
+# For these items, replace availability_change_prop with 1, since we assume that the system-level intervention does not change their availability
+list_of_scenario_variables = ['change_proportion_scenario1', 'change_proportion_scenario2',
+                              'change_proportion_scenario3', 'change_proportion_scenario4', 'change_proportion_scenario5']
+for var in list_of_scenario_variables:
+    scenario_availability_df.loc[items_not_relevant_to_regression, var] = 1
+
+# 1.3.2 For level 1b in the districts where this level was not present in the regression analysis/HHFA dataset, assume
+# that the change is equal to the product of the (ratio of average change across districts for level 1b to
+# average change across districts for level 1a) and the change for each item_code for level 1a in that district
+#------------------------------------------------------------------------------------------------------------
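+# Worked illustration of the rule above (toy numbers, not taken from the data):
+# if, averaged across the districts with level 1b data, the scenario change is
+# 1.2 for level 1b and 1.1 for level 1a, the 1b:1a ratio is 1.2 / 1.1 ~= 1.09.
+# A district with no 1b data whose level 1a change for an item is 1.05 is then
+# assigned 1.05 * 1.09 ~= 1.15 as the level 1b change for that item.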
+average_change_across_districts = scenario_availability_df.groupby(['Facility_Level', 'item_code'])[list_of_scenario_variables].mean().reset_index()
+
+# Generate the ratio of the proportional changes to availability of level 1b to 1a in the districts for which level 1b data is available
+new_colnames_1a = {col: col + '_1a' if col in list_of_scenario_variables else col for col in average_change_across_districts.columns}
+new_colnames_1b = {col: col + '_1b' if col in list_of_scenario_variables else col for col in average_change_across_districts.columns}
+average_change_across_districts_for_1a = average_change_across_districts[average_change_across_districts.Facility_Level == "1a"].rename(new_colnames_1a, axis=1).drop('Facility_Level', axis=1)
+average_change_across_districts_for_1b = average_change_across_districts[average_change_across_districts.Facility_Level == "1b"].rename(new_colnames_1b, axis=1).drop('Facility_Level', axis=1)
+ratio_of_change_across_districts_1b_to_1a = average_change_across_districts_for_1a.merge(average_change_across_districts_for_1b,
+                                                                                         how="left", on=['item_code'])
+for var in list_of_scenario_variables:
+    var_ratio = 'ratio_' + var
+    var_1a = var + '_1a'
+    var_1b = var + '_1b'
+    ratio_of_change_across_districts_1b_to_1a[var_ratio] = (ratio_of_change_across_districts_1b_to_1a[var_1b])/(ratio_of_change_across_districts_1b_to_1a[var_1a])
+ratio_of_change_across_districts_1b_to_1a = ratio_of_change_across_districts_1b_to_1a.reset_index(drop=True)
+# TODO check if this ratio should be of the proportions minus 1
+
+# For districts with no level 1b data in the HHFA, use the ratio of change in level 1b facilities to level 1a facilities
+# to generate the expected proportional change in availability for level 1b facilities in those districts
+scenario_availability_df = scenario_availability_df.reset_index(drop=True)
+cond_districts_with_1b_missing = scenario_availability_df.District.isin(districts_with_no_scenario_data_for_1b_only)
+cond_1a = scenario_availability_df.Facility_Level == '1a'
+cond_1b = scenario_availability_df.Facility_Level == '1b'
+df_1a = scenario_availability_df[cond_districts_with_1b_missing & cond_1a]
+
+ratio_vars = ['ratio_' + item for item in list_of_scenario_variables]  # columns representing the ratio of change in level 1b facilities to level 1a facilities
+
+item_var = ['item_code']
+
+# First merge the dataframe with changes at level 1a with the ratio of 1b to 1a
+df_missing_1b_imputed = df_1a.merge(ratio_of_change_across_districts_1b_to_1a[item_var + ratio_vars],
+                                    on=item_var,
+                                    how='left', validate="m:1")
+
+# Then multiply the ratio of 1b to 1a with the change at level 1a to get the expected change at level 1b
+for var in list_of_scenario_variables:
+    df_missing_1b_imputed[var] = df_missing_1b_imputed[var] * df_missing_1b_imputed['ratio_' + var]
+# Update columns so the dataframe in fact refers to level 1b facilities
+df_missing_1b_imputed.Facility_Level = '1b'  # Update facility level to 1b
+# Replace Facility_IDs
+df_missing_1b_imputed = df_missing_1b_imputed.drop('Facility_ID', axis=1).merge(mfl[['District', 'Facility_Level', 'Facility_ID']],
+                                                                                on=['District', 'Facility_Level'],
+                                                                                how='left')
+# Append the new imputed level 1b dataframe to the original dataframe
+df_without_districts_with_no_1b_facilities = scenario_availability_df[~(cond_districts_with_1b_missing & cond_1b)]
+scenario_availability_df = pd.concat([df_without_districts_with_no_1b_facilities, df_missing_1b_imputed], ignore_index=True)
+
+# 2. 
Merge TLO model availability data with scenario data using crosswalk
+#*************************************************************************
+# 2.1 Merge the two datasets
+#------------------------------------------------------
+id_variables = ['item_code', 'Facility_ID']
+
+full_scenario_df = tlo_availability_df.merge(scenario_availability_df[id_variables + list_of_scenario_variables],
+                                             how='left', on=['Facility_ID', 'item_code'], indicator=True)
+full_scenario_df = full_scenario_df.rename({'_merge': 'merge_scenario'}, axis=1)
+full_scenario_df = full_scenario_df.drop_duplicates(['Facility_ID', 'item_code', 'month'])
+
+# Check that level 1b values have now been imputed
+#full_scenario_df[full_scenario_df.District == 'Balaka'].groupby(['District', 'Facility_Level'])['change_proportion_scenario1'].mean()
+
+# 2.2 Further imputation
+#------------------------------------------------------
+# 2.2.1 For all levels other than 1a and 1b, there will be no change in consumable availability
+#------------------------------------------------------------------------------------------------------------
+fac_levels_not_relevant_to_regression = full_scenario_df.Facility_Level.isin(['0', '2', '3', '4'])
+
+for var in list_of_scenario_variables:
+    full_scenario_df.loc[fac_levels_not_relevant_to_regression, var] = 1
+
+# 2.3 Final checks
+#------------------------------------------------------
+# 2.3.1 Check that the merged dataframe has the same number of unique items, facility IDs, and total
+# number of rows as the original small availability resource file
+#---------------------------------------------------------------------------------------------------------
+assert full_scenario_df.item_code.nunique() == tlo_availability_df.item_code.nunique()  # the number of items in the new dataframe is the same as in the original availability dataframe
+assert full_scenario_df.Facility_ID.nunique() == tlo_availability_df.Facility_ID.nunique()  # the number of Facility IDs in the new dataframe is the same as in the original availability dataframe
+assert len(full_scenario_df) == len(tlo_availability_df)  # the number of rows in the new dataframe is the same as in the original availability dataframe
+
+# 2.3.2 Construct dataset that conforms to the principles expected by the simulation: i.e. that there is an entry for every
+# facility_id and for every month for every item_code.
+#-----------------------------------------------------------------------------------------------------------------------
+# Generate the dataframe that has the desired size and shape
+fac_ids = set(mfl.loc[mfl.Facility_Level != '5'].Facility_ID)
+item_codes = set(tlo_availability_df.item_code.unique())
+months = range(1, 13)
+all_availability_columns = ['available_prop'] + list_of_scenario_variables
+
+# Create a MultiIndex from the product of fac_ids, months, and item_codes
+index = pd.MultiIndex.from_product([fac_ids, months, item_codes], names=['Facility_ID', 'month', 'item_code'])
+
+# Initialize a DataFrame with the MultiIndex and columns, filled with NaN
+full_set = pd.DataFrame(index=index, columns=all_availability_columns)
+full_set = full_set.astype(float)  # Ensure all columns are float type and filled with NaN
+
+# Insert the data, where it is available.
+full_set = full_set.combine_first(full_scenario_df.set_index(['Facility_ID', 'month', 'item_code'])[all_availability_columns])
+
+# Fill in the blanks with rules for interpolation. 
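+# The fallback hierarchy applied below to each scenario change column is, in order:
+#   (i)   the mean across the months with data for the same facility and item;
+#   (ii)  the mean across other facilities of the same level for the same item;
+#   (iii) the mean across other items of the same category at the same facility;
+#   (iv)  if none of the above yields any data, assume no change (a value of 1).
+# As an illustration of step (i) (toy values, not from the data):
+#   pd.Series([0.5, None, 0.7]).pipe(lambda s: s.fillna(s.mean()))  # -> 0.5, 0.6, 0.7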
+facilities_by_level = defaultdict(set)
+for ix, row in mfl.iterrows():
+    facilities_by_level[row['Facility_Level']].add(row['Facility_ID'])
+
+items_by_category = defaultdict(set)
+for ix, row in program_item_mapping.iterrows():
+    items_by_category[row['item_category']].add(row['item_code'])
+
+def get_other_facilities_of_same_level(_fac_id):
+    """Return a set of facility_id for other facilities that are of the same level as that provided."""
+    for v in facilities_by_level.values():
+        if _fac_id in v:
+            return v - {_fac_id}
+
+def get_other_items_of_same_category(_item_code):
+    """Return a set of item_codes for other items that are in the same category/program as that provided."""
+    for v in items_by_category.values():
+        if _item_code in v:
+            return v - {_item_code}
+
+def interpolate_missing_with_mean(_ser):
+    """Return a series in which any values that are null are replaced with the mean of the non-missing."""
+    if pd.isnull(_ser).all():
+        raise ValueError
+    return _ser.fillna(_ser.mean())
+
+# Create a new dataset that includes the interpolations (the operation is not done "in place", because the logic is based
+# on what results are missing before the interpolations in other facilities).
+full_set_interpolated = full_set * np.nan
+full_set_interpolated.available_prop = full_set.available_prop
+
+for fac in fac_ids:
+    for item in item_codes:
+        for col in list_of_scenario_variables:
+            print(f"Now doing: fac={fac}, item={item}, column={col}")
+
+            # Get records of the availability of this item in this facility.
+            _monthly_records = full_set.loc[(fac, slice(None), item), col].copy()
+
+            if pd.notnull(_monthly_records).any():
+                # If there is at least one record of this item at this facility, then interpolate the missing months from
+                # the months for which there are data on this item in this facility. (If none are missing, this has no effect).
+                _monthly_records = interpolate_missing_with_mean(_monthly_records)
+
+            else:
+                # If there is no record of this item at this facility, check to see if it's available at other facilities
+                # of the same level; or, if there is no record of the item at other facilities at this level, check to
+                # see if other items of this category are available at this facility
+                facilities = list(get_other_facilities_of_same_level(fac))
+
+                other_items = get_other_items_of_same_category(item)
+                items = list(other_items) if other_items else other_items
+
+                recorded_at_other_facilities_of_same_level = pd.notnull(
+                    full_set.loc[(facilities, slice(None), item), col]
+                ).any()
+
+                if not items:
+                    category_recorded_at_this_facility = False
+                else:
+                    category_recorded_at_this_facility = pd.notnull(
+                        full_set.loc[(fac, slice(None), items), col]
+                    ).any()
+
+                if recorded_at_other_facilities_of_same_level:
+                    # If the item is recorded at other facilities of the same level, use the average availability of the
+                    # item at those facilities.
+                    print("Data for facility ", fac, " extrapolated from other facilities within level - ", facilities)
+                    _monthly_records = interpolate_missing_with_mean(
+                        full_set.loc[(facilities, slice(None), item), col].groupby(level=1).mean()
+                    )
+
+                elif category_recorded_at_this_facility:
+                    # If other items of the same category are recorded at this facility, use the average availability
+                    # of those items.
+ print("Data for item ", item, " extrapolated from other items within category - ", items) + _monthly_records = interpolate_missing_with_mean( + full_set.loc[(fac, slice(None), items), col].groupby(level=1).mean() + ) + + else: + # If it is not recorded at other facilities of same level, then assume that there is no change + print("No interpolation worked") + _monthly_records = _monthly_records.fillna(1.0) + + # Insert values (including corrections) into the resulting dataset. + full_set_interpolated.loc[(fac, slice(None), item), col] = _monthly_records.values + # temporary code + assert full_set_interpolated.loc[(fac, slice(None), item), col].mean() >= 0 + +# 3. Generate regression-based scenario data on consumable availablity +#************************************************************************* +# Create new consumable availability estimates for TLO model consumables using +# estimates of proportional change from the regression analysis based on HHFA data +#------------------------------------------------------ +prefix = 'change_proportion_' +list_of_scenario_suffixes = [s.replace(prefix, '') for s in list_of_scenario_variables] + +for scenario in list_of_scenario_suffixes: + full_set_interpolated['available_prop_' + scenario] = full_set_interpolated['available_prop'] * full_set_interpolated['change_proportion_' + scenario] + availability_greater_than_1 = full_set_interpolated['available_prop_' + scenario] > 1 + full_set_interpolated.loc[availability_greater_than_1, 'available_prop_' + scenario] = 1 + + assert(sum(full_set_interpolated['available_prop_' + scenario].isna()) == + sum(full_set_interpolated['change_proportion_' + scenario].isna())) # make sure that there is an entry for every row in which there was previously data + +# 4. Generate best performing facility-based scenario data on consumable availability +#*************************************************************************************** +df = full_set_interpolated.reset_index().copy() + +# Try updating the avaiability to represent the 75th percentile by consumable +facility_levels = ['1a', '1b', '2'] +target_percentiles = [75, 90, 99] + +best_performing_facilities = {} +# Populate the dictionary +for level in facility_levels: + # Create an empty dictionary for the current level + best_performing_facilities[level] = {} + + for item in item_codes: + best_performing_facilities[level][item] = {} + # Get the mean availability by Facility for the current level + mean_consumable_availability = pd.DataFrame(df[(df.Facility_ID.isin(facilities_by_level[level])) & (df.item_code == item)] + .groupby('Facility_ID')['available_prop'].mean()).reset_index() + + # Calculate the percentile rank of each row for 'available_prop' + mean_consumable_availability['percentile_rank'] = mean_consumable_availability['available_prop'].rank(pct=True) * 100 + + # Find the row which is closest to the nth percentile rank for each target percentile + for target_perc in target_percentiles: + # Calculate the difference to target percentile + mean_consumable_availability['diff_to_target_' + str(target_perc)] = np.abs( + mean_consumable_availability['percentile_rank'] - target_perc) + + # Find the row with the minimum difference to the target percentile + closest_row = mean_consumable_availability.loc[ + mean_consumable_availability['diff_to_target_' + str(target_perc)].idxmin()] + + # Store the Facility_ID of the closest row in the dictionary for the current level + best_performing_facilities[level][item][str(target_perc) + 'th percentile'] = 
closest_row['Facility_ID'] + +print("Reference facilities at each level for each item: ", best_performing_facilities) + +# Obtain the updated availability estimates for level 1a for scenarios 6-8 +updated_availability_1a = df[['item_code', 'month']].drop_duplicates() +updated_availability_1b = df[['item_code', 'month']].drop_duplicates() +updated_availability_2 = df[['item_code', 'month']].drop_duplicates() +temporary_df = pd.DataFrame([]) +availability_dataframes = [updated_availability_1a, updated_availability_1b, updated_availability_2] + +i = 6 # start scenario counter +j = 0 # start level counter +for level in facility_levels: + for target_perc in target_percentiles: + for item in item_codes: + + print("Running level ", level, "; Running scenario ", str(i), "; Running item ", item) + reference_facility = df['Facility_ID'] == best_performing_facilities[level][item][str(target_perc) + 'th percentile'] + current_item = df['item_code'] == item + availability_at_reference_facility = df[reference_facility & current_item][['item_code', 'month', 'available_prop']] + + if temporary_df.empty: + temporary_df = availability_at_reference_facility + else: + temporary_df = pd.concat([temporary_df,availability_at_reference_facility], ignore_index = True) + + column_name = 'available_prop_scenario' + str(i) + temporary_df = temporary_df.rename(columns = {'available_prop': column_name }) + availability_dataframes[j] = availability_dataframes[j].merge(temporary_df, on = ['item_code', 'month'], how = 'left', validate = '1:1') + temporary_df = pd.DataFrame([]) + i = i + 1 + i = 6 # restart scenario counter + j = j + 1 # move to the next level + +# Merge the above scenario data to the full availability scenario dataframe +# 75, 90 and 99th percentile availability data for level 1a +df_new_1a = df[df['Facility_ID'].isin(facilities_by_level['1a'])].merge(availability_dataframes[0],on = ['item_code', 'month'], + how = 'left', + validate = "m:1") +# 75, 90 and 99th percentile availability data for level 1b +df_new_1b = df[df['Facility_ID'].isin(facilities_by_level['1b'])].merge(availability_dataframes[1],on = ['item_code', 'month'], + how = 'left', + validate = "m:1") + +# For scenarios 6-8, replace the new availability probabilities by the max(original availability, availability at target percentile) +for scen in [6,7,8]: + df_new_1a['available_prop_scenario' + str(scen)] = df_new_1a.apply( + lambda row: max(row['available_prop_scenario' + str(scen) ], row['available_prop']), axis=1) + df_new_1b['available_prop_scenario' + str(scen)] = df_new_1b.apply( + lambda row: max(row['available_prop_scenario' + str(scen) ], row['available_prop']), axis=1) + +# 75, 90 and 99th percentile availability data for level 2 +df_new_2 = df[df['Facility_ID'].isin(facilities_by_level['2'])].merge(availability_dataframes[2],on = ['item_code', 'month'], + how = 'left', + validate = "m:1") + +# Generate scenarios 6-8 +#------------------------ +# scenario 6: only levels 1a and 1b changed to availability at 75th percentile for the corresponding level +# scenario 7: only levels 1a and 1b changed to availability at 90th percentile for the corresponding level +# scenario 8: only levels 1a and 1b changed to availability at 99th percentile for the corresponding level +# Scenario 6-8 availability data for other levels +df_new_otherlevels = df[~df['Facility_ID'].isin(facilities_by_level['1a']|facilities_by_level['1b'])] +new_scenario_columns = ['available_prop_scenario6', 'available_prop_scenario7', 'available_prop_scenario8'] +for col in 
new_scenario_columns:
+    df_new_otherlevels[col] = df_new_otherlevels['available_prop']
+# Append the above dataframes
+df_new_scenarios6to8 = pd.concat([df_new_1a, df_new_1b, df_new_otherlevels], ignore_index=True)
+
+
+# Generate scenario 9
+#------------------------
+# scenario 9: levels 1a, 1b and 2 changed to availability at the 99th percentile for the corresponding level
+df_new_otherlevels = df_new_scenarios6to8[~df_new_scenarios6to8['Facility_ID'].isin(facilities_by_level['1a']|facilities_by_level['1b']|facilities_by_level['2'])].reset_index(drop=True)
+df_new_1a_scenario9 = df_new_scenarios6to8[df_new_scenarios6to8['Facility_ID'].isin(facilities_by_level['1a'])].reset_index(drop=True)
+df_new_1b_scenario9 = df_new_scenarios6to8[df_new_scenarios6to8['Facility_ID'].isin(facilities_by_level['1b'])].reset_index(drop=True)
+df_new_2_scenario9 = df_new_2[df_new_2['Facility_ID'].isin(facilities_by_level['2'])].reset_index(drop=True)
+new_scenario_columns = ['available_prop_scenario9']
+for col in new_scenario_columns:
+    df_new_otherlevels[col] = df_new_otherlevels['available_prop']
+    df_new_1a_scenario9[col] = df_new_1a_scenario9['available_prop_scenario8']
+    df_new_1b_scenario9[col] = df_new_1b_scenario9['available_prop_scenario8']
+    df_new_2_scenario9[col] = df_new_2_scenario9.apply(lambda row: max(row['available_prop_scenario8'], row['available_prop']), axis=1)
+
+# Append the above dataframes
+df_new_scenarios9 = pd.concat([df_new_1a_scenario9, df_new_1b_scenario9, df_new_2_scenario9, df_new_otherlevels], ignore_index=True)
+
+# 5. Generate scenarios based on the performance of vertical programs
+#***************************************************************************************
+df_scenarios_10to12 = tlo_availability_df.copy()  # copy so that the original availability dataframe is not modified in place
+cond_levels1a1b = (df_scenarios_10to12.Facility_Level == '1a') | (df_scenarios_10to12.Facility_Level == '1b')
+cond_hiv = df_scenarios_10to12.item_category == 'hiv'
+cond_epi = df_scenarios_10to12.item_category == 'epi'
+cond_cancer = df_scenarios_10to12.item_category == 'cancer'
+
+# Calculate the average available_prop for each Facility_Level for HIV, EPI, and all other programs
+avg_available_prop_hiv = df_scenarios_10to12[cond_hiv & cond_levels1a1b].groupby('Facility_Level')['available_prop'].mean()
+avg_available_prop_epi = df_scenarios_10to12[cond_epi & cond_levels1a1b].groupby('Facility_Level')['available_prop'].mean()
+avg_available_prop_other = df_scenarios_10to12[~cond_epi & ~cond_hiv & ~cond_cancer & cond_levels1a1b].groupby('Facility_Level')['available_prop'].mean()
+
+# Create new scenario variables
+df_scenarios_10to12['available_prop_scenario10'] = df_scenarios_10to12['available_prop']
+df_scenarios_10to12['available_prop_scenario11'] = df_scenarios_10to12['available_prop']
+df_scenarios_10to12['available_prop_scenario12'] = df_scenarios_10to12['available_prop']
+
+# Update scenario availability based on the average availability by level of the parallel supply chains
+df_scenarios_10to12['available_prop_scenario10'] = df_scenarios_10to12.apply(
+    lambda row: max(row['available_prop_scenario10'], avg_available_prop_hiv[row['Facility_Level']])
+    if row['Facility_Level'] in ['1a', '1b'] and row['item_category'] not in ['hiv', 'epi']
+    else row['available_prop_scenario10'], axis=1
+)
+
+df_scenarios_10to12['available_prop_scenario11'] = df_scenarios_10to12.apply(
+    lambda row: max(row['available_prop_scenario11'], avg_available_prop_epi[row['Facility_Level']])
+    if row['Facility_Level'] in ['1a', '1b'] and row['item_category'] not in ['hiv', 'epi']
+    else row['available_prop_scenario11'], axis=1
+)
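+
+# Scenario 12 differs from scenarios 10 and 11: it takes the *minimum* of the
+# current availability and the non-vertical-program average, so it is the one
+# scenario that can reduce availability (for HIV items only); this is why it is
+# excluded from the ">= baseline" assertions further below. A minimal sketch of
+# the row-wise clamping pattern used here (toy values, illustrative only):
+#   means = pd.Series({'1a': 0.6, '1b': 0.7})
+#   min(0.9, means['1a'])  # an HIV item at level 1a with availability 0.9 -> 0.6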
+df_scenarios_10to12['available_prop_scenario12'] = df_scenarios_10to12.apply(
+    lambda row: min(row['available_prop_scenario12'], avg_available_prop_other[row['Facility_Level']])
+    if row['Facility_Level'] in ['1a', '1b'] and row['item_category'] in ['hiv']
+    else row['available_prop_scenario12'], axis=1
+)
+
+# Add scenarios 6 to 12 to the original dataframe
+#------------------------------------------------------
+list_of_scenario_suffixes_first_stage = list_of_scenario_suffixes + ['scenario6', 'scenario7', 'scenario8', 'scenario9']
+list_of_scenario_variables_first_stage = ['available_prop_' + item for item in list_of_scenario_suffixes_first_stage]
+old_vars = ['Facility_ID', 'month', 'item_code']
+full_df_with_scenario = df_new_scenarios6to8[old_vars + ['available_prop'] + [col for col in list_of_scenario_variables_first_stage if col != 'available_prop_scenario9']].reset_index().drop('index', axis=1)
+full_df_with_scenario = full_df_with_scenario.merge(df_new_scenarios9[old_vars + ['available_prop_scenario9']], on=old_vars, how='left', validate="1:1")
+
+list_of_scenario_suffixes_second_stage = list_of_scenario_suffixes_first_stage + ['scenario10', 'scenario11', 'scenario12']
+final_list_of_scenario_vars = ['available_prop_' + item for item in list_of_scenario_suffixes_second_stage]
+full_df_with_scenario = full_df_with_scenario.merge(df_scenarios_10to12[old_vars + ['available_prop_scenario10', 'available_prop_scenario11', 'available_prop_scenario12']], on=old_vars, how='left', validate="1:1")
+
+#full_df_with_scenario = full_df_with_scenario.merge(program_item_mapping, on = 'item_code', validate = 'm:1', how = 'left')
+
+# --- Check that scenarios 6-11 always have at least the baseline probability of availability --- #
+# (scenario 12 is excluded because it can reduce the availability of HIV items)
+for scen in range(6, 12):
+    assert sum(full_df_with_scenario['available_prop_scenario' + str(scen)] < full_df_with_scenario['available_prop']) == 0
+    assert sum(full_df_with_scenario['available_prop_scenario' + str(scen)] >= full_df_with_scenario['available_prop']) == len(full_df_with_scenario)
+
+# --- Check that the exported file has the properties required of it by the model code. --- #
+check_format_of_consumables_file(df=full_df_with_scenario, fac_ids=fac_ids)
+
+# Save updated consumable availability resource file with scenario data
+full_df_with_scenario.to_csv(
+    path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv",
+    index=False
+)
+# TODO: Create a column providing the source of scenario data
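+
+# For reference, the model side of this PR (see the change to
+# src/tlo/methods/consumables.py below) selects the matching column by suffix
+# when one of the scenario modes is requested; a minimal sketch, assuming
+# `availability` holds a string such as 'scenario6':
+#   column = ('available_prop_' + availability) if availability.startswith('scenario') else 'available_prop'
+#   probs = full_df_with_scenario.set_index(['month', 'Facility_ID', 'item_code'])[column]
+
+# 6. 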
Plot new availability estimates by scenario +#********************************************************************************************* +full_df_with_scenario = pd.read_csv(path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv") + +# Create the directory if it doesn't exist +figurespath = outputfilepath / 'consumable_scenario_analysis' +if not os.path.exists(figurespath): + os.makedirs(figurespath) + +# Prepare the availability dataframe for descriptive plots +df_for_plots = full_df_with_scenario.merge(mfl[['Facility_ID', 'Facility_Level']], on = 'Facility_ID', how = 'left', validate = "m:1") +df_for_plots = df_for_plots.merge(program_item_mapping, on = 'item_code', how = 'left', validate = "m:1") +scenario_list = [1,2,3,6,7,8,10,11,12] +chosen_availability_columns = ['available_prop'] + [f'available_prop_scenario{i}' for i in + scenario_list] +scenario_names_dict = {'available_prop': 'Actual', 'available_prop_scenario1': 'Non-therapeutic \n consumables', 'available_prop_scenario2': 'Vital medicines', + 'available_prop_scenario3': 'Pharmacist-\n managed', 'available_prop_scenario4': 'Level 1b', 'available_prop_scenario5': 'CHAM', + 'available_prop_scenario6': '75th percentile\n facility', 'available_prop_scenario7': '90th percentile \n facility', 'available_prop_scenario8': 'Best \n facility', + 'available_prop_scenario9': 'Best facility \n (including DHO)','available_prop_scenario10': 'HIV supply \n chain', 'available_prop_scenario11': 'EPI supply \n chain', + 'available_prop_scenario12': 'HIV moved to \n Govt supply chain'} +# recreate the chosen columns list based on the mapping above +chosen_availability_columns = [scenario_names_dict[col] for col in chosen_availability_columns] +df_for_plots = df_for_plots.rename(columns = scenario_names_dict) + +# Generate a bar plot of average availability under each scenario by item_category and Facility_Level +def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): + df_for_bar_plot = _df.groupby([_x_axis_var])[chosen_availability_columns].mean() + df_for_bar_plot = df_for_bar_plot.reset_index().melt(id_vars=[_x_axis_var], value_vars=chosen_availability_columns, + var_name='Scenario', value_name='Value') + plot = (ggplot(df_for_bar_plot.reset_index(), aes(x=_x_axis_var, y='Value', fill = 'Scenario')) + + geom_bar(stat='identity', position='dodge') + + ylim(0, 1) + + labs(title = "Probability of availability across scenarios", + x=_x_axis_var, + y='Probability of availability') + + theme(axis_text_x=element_text(angle=45, hjust=1)) + ) + + plot.save(filename= figurespath / _filename, dpi=300, width=10, height=8, units='in') +generate_barplot_of_scenarios(_df = df_for_plots, _x_axis_var = 'item_category', _filename = 'availability_by_category.png') +generate_barplot_of_scenarios(_df = df_for_plots, _x_axis_var = 'Facility_Level', _filename = 'availability_by_level.png') + +# Create heatmaps by Facility_Level of average availability by item_category across chosen scenarios +for level in fac_levels: + # Generate a heatmap + # Pivot the DataFrame + aggregated_df = df_for_plots.groupby(['item_category', 'Facility_Level'])[chosen_availability_columns].mean().reset_index() + aggregated_df = aggregated_df[aggregated_df.Facility_Level.isin([level])] + heatmap_data = aggregated_df.set_index('item_category').drop(columns = 'Facility_Level') + + # Calculate the aggregate row and column + aggregate_col= df_for_plots.loc[df_for_plots.Facility_Level.isin([level]), chosen_availability_columns].mean() + #overall_aggregate = 
aggregate_col.mean() + + # Add aggregate row and column + #heatmap_data['Average'] = aggregate_row + #aggregate_col['Average'] = overall_aggregate + heatmap_data.loc['Average'] = aggregate_col + + # Generate the heatmap + sns.set(font_scale=1.2) + plt.figure(figsize=(10, 8)) + sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) + + # Customize the plot + plt.title(f'Facility Level {level}') + plt.xlabel('Scenarios') + plt.ylabel(f'Disease/Public health \n program') + plt.xticks(rotation=90) + plt.yticks(rotation=0) + + plt.savefig(figurespath /f'consumable_availability_heatmap_{level}.png', dpi=300, bbox_inches='tight') + plt.close() + +# Create heatmap of average availability by Facility_Level across chosen scenarios +# Pivot the DataFrame +aggregated_df = df_for_plots.groupby(['Facility_Level'])[chosen_availability_columns].mean().reset_index() +heatmap_data = aggregated_df.set_index('Facility_Level') + +# Calculate the aggregate row and column +aggregate_col= df_for_plots[chosen_availability_columns].mean() +#overall_aggregate = aggregate_col.mean() + +# Add aggregate row and column +#heatmap_data['Average'] = aggregate_row +#aggregate_col['Average'] = overall_aggregate +heatmap_data.loc['Average'] = aggregate_col + +# Generate the heatmap +sns.set(font_scale=1.2) +plt.figure(figsize=(10, 8)) +sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) + +# Customize the plot +plt.title(f'Availability across scenarios') +plt.xlabel('Scenarios') +plt.ylabel(f'Facility Level') +plt.xticks(rotation=90) +plt.yticks(rotation=0) + +plt.savefig(figurespath /f'consumable_availability_heatmap_alllevels.png', dpi=300, bbox_inches='tight') +plt.show() +plt.close() + +# Create heatmap of average availability by Facility_Level and program for actual and 75th percentile (Costing paper) +# Actual +aggregated_df = df_for_plots.groupby(['Facility_Level', 'item_category'])['Actual'].mean().reset_index() +heatmap_data = aggregated_df.pivot(index='item_category', columns='Facility_Level', values='Actual') + +# Calculate the aggregate row and column +aggregate_col= df_for_plots.groupby('item_category')['Actual'].mean() +overall_aggregate = df_for_plots['Actual'].mean() +aggregate_row = df_for_plots.groupby('Facility_Level')['Actual'].mean() + +# Add aggregate row and column +heatmap_data['Average'] = aggregate_col +aggregate_row['Average'] = overall_aggregate +heatmap_data.loc['Average'] = aggregate_row + +# Generate the heatmap +sns.set(font_scale=1.2) +plt.figure(figsize=(10, 8)) +sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) + +# Customize the plot +plt.xlabel('Facility Level') +plt.ylabel(f'Program') +plt.xticks(rotation=90) +plt.yticks(rotation=0) + +plt.savefig(figurespath /f'heatmap_program_and_level_actual.png', dpi=300, bbox_inches='tight') +plt.show() +plt.close() + +# 75th percentile +aggregated_df = df_for_plots.groupby(['Facility_Level', 'item_category'])['75th percentile\n facility'].mean().reset_index() +heatmap_data = aggregated_df.pivot(index='item_category', columns='Facility_Level', values='75th percentile\n facility') + +# Calculate the aggregate row and column +aggregate_col= df_for_plots.groupby('item_category')['75th percentile\n facility'].mean() +overall_aggregate = df_for_plots['75th percentile\n facility'].mean() +aggregate_row = 
df_for_plots.groupby('Facility_Level')['75th percentile\n facility'].mean() + + +# Add aggregate row and column +heatmap_data['Average'] = aggregate_col +aggregate_row['Average'] = overall_aggregate +heatmap_data.loc['Average'] = aggregate_row + +# Generate the heatmap +sns.set(font_scale=1.2) +plt.figure(figsize=(10, 8)) +sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) + +# Customize the plot +plt.xlabel('Facility Level') +plt.ylabel(f'Program') +plt.xticks(rotation=90) +plt.yticks(rotation=0) + +plt.savefig(figurespath /f'heatmap_program_and_level_75perc.png', dpi=300, bbox_inches='tight') +plt.show() +plt.close() + + +# Create a heatmap of average availability by item_category and scenario +# Base scenario list +base_scenarios = [['Actual']] +# Additional scenarios to add iteratively +additional_scenarios = [ + ['Non-therapeutic \n consumables', 'Vital medicines', 'Pharmacist-\n managed'], + ['75th percentile\n facility', '90th percentile \n facility', 'Best \n facility'], + ['HIV supply \n chain', 'EPI supply \n chain', 'HIV moved to \n Govt supply chain'] +] +# Generate iteratively chosen availability columns +iteratively_chosen_availability_columns = [ + base_scenarios[0] + sum(additional_scenarios[:i], []) for i in range(len(additional_scenarios) + 1) +] + +i = 1 +for column_list in iteratively_chosen_availability_columns: + # Create heatmap of average availability by item_category across chosen scenarios + # Pivot the DataFrame + chosen_levels = ['1a', '1b'] + aggregated_df = df_for_plots[df_for_plots.Facility_Level.isin(chosen_levels)] + aggregated_df = aggregated_df.groupby(['item_category'])[column_list].mean().reset_index() + heatmap_data = aggregated_df.set_index('item_category') + + # Calculate the aggregate row and column + aggregate_col= df_for_plots.loc[df_for_plots.Facility_Level.isin(chosen_levels), column_list].mean() + #overall_aggregate = aggregate_col.mean() + + # Add aggregate row and column + heatmap_data['Perfect'] = 1 # Add a column representing the perfect scenario + aggregate_col['Perfect'] = 1 + heatmap_data.loc['Average'] = aggregate_col + + # Ensure all scenarios are represented for consistent column widths + all_scenarios = chosen_availability_columns + ['Perfect'] + heatmap_data = heatmap_data.reindex(columns=all_scenarios) + + # Update column names for x-axis labels + # Generate the heatmap + sns.set(font_scale=1.2) + plt.figure(figsize=(10, 8)) + sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) + + # Customize the plot + plt.title(f'Availability across scenarios') + plt.xlabel('Scenarios') + plt.ylabel(f'Facility Level') + plt.xticks(rotation=90) + plt.yticks(rotation=0) + + plt.savefig(figurespath /f'consumable_availability_heatmap_bycategory_iter{i}.png', dpi=300, bbox_inches='tight') + plt.close() + i = i + 1 + +# Create a barplot of average consumable availability based on the colours used in analysis_impact_of_consumable_scenarios +average_availability = df_for_plots[chosen_availability_columns].mean().reset_index() +average_availability.columns = ['scenario', 'average_availability'] +new_row = pd.DataFrame([['Perfect', 1]], columns=['scenario', 'average_availability']) # new row for perfect availability +average_availability = pd.concat([average_availability, new_row], axis=0, ignore_index=True) # Concatenate the new row with the existing DataFrame + +# Define color mapping for each scenario 
+color_mapping = {
+    'Actual': '#1f77b4',
+    'Non-therapeutic \n consumables': '#ff7f0e',
+    'Vital medicines': '#2ca02c',
+    'Pharmacist-\n managed': '#d62728',
+    '75th percentile\n facility': '#9467bd',
+    '90th percentile \n facility': '#8c564b',
+    'Best \n facility': '#e377c2',
+    'Best facility \n (including DHO)': '#7f7f7f',
+    'HIV supply \n chain': '#bcbd22',
+    'EPI supply \n chain': '#17becf',
+    'HIV moved to \n Govt supply chain': '#ff6347',
+    'Perfect': '#31a354'
+}
+# Create a color list for the bars
+colors = [color_mapping[scenario] for scenario in average_availability['scenario']]
+# Create the bar plot and capture the bars
+plt.figure(figsize=(10, 6))
+bars = plt.bar(average_availability['scenario'], average_availability['average_availability'], color=colors)
+plt.title('Average Availability by Scenario')
+plt.xlabel('Scenario')
+plt.ylabel('Average Availability')
+plt.xticks(rotation=45)
+plt.ylim(0, 1)  # Adjust based on your data range
+plt.grid(axis='y')
+# Add data labels
+for bar in bars:
+    yval = bar.get_height()
+    plt.text(bar.get_x() + bar.get_width() / 2, yval + 0.02, round(yval, 2), ha='center', va='bottom')
+
+# Save the plot
+plt.tight_layout()
+plt.savefig(figurespath / 'scenarios_average_availability.png', dpi=300, bbox_inches='tight')
+plt.close()
+
+
+# Create the directory if it doesn't exist
+roi_plots_path = Path('./outputs/horizontal_v_vertical/roi/')
+if not os.path.exists(roi_plots_path):
+    os.makedirs(roi_plots_path)
+
+# Create a combined plot of heatmaps of average availability for levels 1a and 1b under the actual, 75th percentile, HIV and EPI scenarios
+# Scenario list
+scenarios_for_roi_paper = ['Actual', '75th percentile\n facility', 'HIV supply \n chain', 'EPI supply \n chain']
+# Define facility levels
+chosen_levels = ['1a', '1b']
+
+# Create a figure with subplots for each level
+fig, axes = plt.subplots(nrows=1, ncols=len(chosen_levels), figsize=(20, 8), sharex=True, sharey=True)
+# Create a single colorbar axis
+cbar_ax = fig.add_axes([.91, .3, .02, .4])  # Position of the colorbar
+
+for ax, level in zip(axes, chosen_levels):
+    # Filter data for the current facility level
+    aggregated_df = df_for_plots[df_for_plots.Facility_Level.isin([level])]
+    aggregated_df = aggregated_df.groupby(['item_category'])[scenarios_for_roi_paper].mean().reset_index()
+    heatmap_data = aggregated_df.set_index('item_category')
+
+    # Calculate the aggregate row
+    aggregate_col = df_for_plots.loc[df_for_plots.Facility_Level.isin([level]), scenarios_for_roi_paper].mean()
+    heatmap_data.loc['Average'] = aggregate_col
+
+    # Generate the heatmap on the current subplot
+    sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', ax=ax, cbar=(ax == axes[-1]), cbar_ax=(cbar_ax if ax == axes[-1] else None))
+
+    # Set labels
+    ax.set_title(f'Level {level}')
+    ax.set_xlabel('Scenarios')
+    ax.set_ylabel('Program' if ax == axes[0] else "")
+
+cbar_ax.set_ylabel('Proportion of days consumable is available')
+# Save the combined heatmap
+plt.savefig(roi_plots_path / 'combined_consumable_availability_heatmap_1a_1b.png', dpi=300, bbox_inches='tight')
+plt.close()
+
+# Create a combined plot of heatmaps of average availability for all levels under the actual, 75th percentile, HIV and EPI scenarios
+chosen_levels = ['0', '1a', '1b', '2', '3']
+# Create a figure with subplots
+fig, axes = plt.subplots(nrows=1, ncols=len(chosen_levels), figsize=(20, 8), sharex=True, sharey=True)
+
+# Create a single colorbar axis
+cbar_ax = fig.add_axes([.91, .3, .02, .4])  # Position of the colorbar
+
+for ax, level in 
zip(axes, chosen_levels): + # Filter data for the current facility level + aggregated_df = df_for_plots[df_for_plots.Facility_Level.isin([level])] + aggregated_df = aggregated_df.groupby(['item_category'])[scenarios_for_roi_paper].mean().reset_index() + heatmap_data = aggregated_df.set_index('item_category') + + # Calculate the aggregate row + aggregate_col = df_for_plots.loc[df_for_plots.Facility_Level.isin([level]), scenarios_for_roi_paper].mean() + heatmap_data.loc['Average'] = aggregate_col + + # Generate the heatmap on the current subplot + sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', ax=ax, cbar=(ax == axes[-1]), cbar_ax=(cbar_ax if ax == axes[-1] else None)) + + # Set labels + ax.set_title(f'Level {level}') + ax.set_xlabel('Scenarios') + ax.set_ylabel('Program' if ax == axes[0] else "") + +# Adjust layout +cbar_ax.set_ylabel('Proportion of days consumable is available') +# Save the combined heatmap +plt.savefig(roi_plots_path / f'combined_consumable_availability_heatmap_all_levels.png', dpi=300, bbox_inches='tight') +plt.close() + +''' +# Create heatmap of average availability by Facility_Level just showing scenario 12 +scenario_list = [12] +chosen_availability_columns = ['available_prop'] + [f'available_prop_scenario{i}' for i in + scenario_list] +# Pivot the DataFrame +df_for_hiv_plot = df_for_plots +df_for_hiv_plot['hiv_or_other'] = np.where(df_for_hiv_plot['item_category'] == 'hiv', 'hiv', 'other programs') + +aggregated_df = df_for_hiv_plot.groupby(['Facility_Level', 'hiv_or_other'])[chosen_availability_columns].mean().reset_index() +aggregated_df = aggregated_df.rename(columns = {'available_prop': 'Actual', 'available_prop_scenario12': 'HIV moved to Govt supply chain'}) +heatmap_data = aggregated_df.pivot_table(index=['Facility_Level'], # Keep other relevant columns in the index + columns='hiv_or_other', + values=['Actual', 'HIV moved to Govt supply chain']) +# Generate the heatmap +sns.set(font_scale=1) +plt.figure(figsize=(10, 8)) +sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) + +# Customize the plot +plt.title(f'Availability across scenarios') +plt.xlabel('Scenarios') +plt.ylabel(f'Facility Level') +plt.xticks(rotation=90) +plt.yticks(rotation=0) + +plt.savefig(figurespath /f'consumable_availability_heatmap_hiv_v_other.png', dpi=300, bbox_inches='tight') +plt.show() +plt.close() + + +# Scenario on the X axis, level on the Y axis +# Scenario on the X axis, program on the Y axis +# TODO add heat maps i. heatmap by item_category across the sceanrios + + +# 2.3.2. 
Browse missingness in the availability_change_prop variable +#------------------------------------------------------ +pivot_table = pd.pivot_table(scenario_availability_df, + values=list_of_scenario_variables, + index=['item_category'], + columns=['Facility_Level'], + aggfunc=lambda x: sum(pd.isna(x))/len(x)*100) +pivot_table.to_csv(outputfilepath / "temp.csv") +print(pivot_table[('change_proportion_scenario5', '0')]) + +a = availability_dataframes[1].reset_index() +print(best_performing_facilities['1b'][5][str(75) + 'th percentile']) +print(best_performing_facilities['1b'][222][str(90) + 'th percentile']) +print(best_performing_facilities['1b'][222][str(99) + 'th percentile']) +a[a.item_code == 222][['month', 'available_prop_scenario8']] +item_chosen = 222 +fac_chosen = 110 +print(df[(df.item_code == item_chosen) & (df.Facility_ID == fac_chosen)][['month', 'available_prop']]) + +''' + diff --git a/src/tlo/methods/consumables.py b/src/tlo/methods/consumables.py index 6fb43e4a6d..414b7b7cb6 100644 --- a/src/tlo/methods/consumables.py +++ b/src/tlo/methods/consumables.py @@ -46,6 +46,9 @@ def __init__(self, 'all_medicines_and_other_available', 'all_vital_available', 'all_drug_or_vaccine_available', + 'scenario1', 'scenario2', 'scenario3', 'scenario4', + 'scenario5', 'scenario6', 'scenario7', 'scenario8', + 'scenario9', 'scenario10', 'scenario11', 'scenario12', } # Create internal items: @@ -61,7 +64,7 @@ def __init__(self, # Save all item_codes that are defined and pd.Series with probs of availability from ResourceFile self.item_codes, self._processed_consumables_data = \ - self._process_consumables_data(availability_data=availability_data) + self._process_consumables_data(availability_data=availability_data, availability=availability) # Set the availability based on the argument provided (this can be updated later after the class is initialised) self.availability = availability @@ -102,7 +105,10 @@ def _update_prob_item_codes_available(self, availability: str): item_code_designations = self._item_code_designations # Over-ride the data according to option for `availability` - if availability == 'default': + if availability in ('default', + 'scenario1', 'scenario2', 'scenario3', 'scenario4', + 'scenario5', 'scenario6', 'scenario7', 'scenario8', + 'scenario9', 'scenario10', 'scenario11', 'scenario12'): pass elif availability == 'all': self.override_availability(dict(zip(self.item_codes, repeat(1.0)))) @@ -134,16 +140,24 @@ def _update_prob_item_codes_available(self, availability: str): else: raise ValueError - def _process_consumables_data(self, availability_data: pd.DataFrame) -> Tuple[set, pd.Series]: + def _process_consumables_data(self, availability_data: pd.DataFrame, availability: str) -> Tuple[set, pd.Series]: """Helper function for processing the consumables data, passed in here as pd.DataFrame that has been read-in by the HealthSystem. Returns: (i) the set of all recognised item_codes; (ii) pd.Series of the availability of each consumable at each facility_id during each month. 
""" - return ( - set(availability_data.item_code), - availability_data.set_index(['month', 'Facility_ID', 'item_code'])['available_prop'] - ) + if availability in ('scenario1', 'scenario2', 'scenario3', 'scenario4', + 'scenario5', 'scenario6', 'scenario7', 'scenario8', + 'scenario9', 'scenario10', 'scenario11', 'scenario12'): + return ( + set(availability_data.item_code), + availability_data.set_index(['month', 'Facility_ID', 'item_code'])['available_prop_' + availability] + ) + else: + return ( + set(availability_data['item_code']), + availability_data.set_index(['month', 'Facility_ID', 'item_code'])['available_prop'] + ) def _refresh_availability_of_consumables(self, date: datetime.datetime): """Update the availability of all items based on the data for the probability of availability, given the current @@ -267,7 +281,7 @@ def _lookup_availability_of_consumables(self, def on_simulation_end(self): """Do tasks at the end of the simulation. - + Raise warnings and enter to log about item_codes not recognised. """ if len(self._not_recognised_item_codes) > 0: @@ -337,8 +351,12 @@ def check_format_of_consumables_file(df, fac_ids): """Check that we have a complete set of estimates, for every region & facility_type, as defined in the model.""" months = set(range(1, 13)) item_codes = set(df.item_code.unique()) + number_of_scenarios = 12 + + availability_columns = ['available_prop'] + [f'available_prop_scenario{i}' for i in + range(1, number_of_scenarios + 1)] - assert set(df.columns) == {'Facility_ID', 'month', 'item_code', 'available_prop'} + assert set(df.columns).issubset({'Facility_ID', 'month', 'item_code'} | set(availability_columns)) # Check that all permutations of Facility_ID, month and item_code are present pd.testing.assert_index_equal( @@ -348,8 +366,11 @@ def check_format_of_consumables_file(df, fac_ids): ) # Check that every entry for a probability is a float on [0,1] - assert (df.available_prop <= 1.0).all() and (df.available_prop >= 0.0).all() - assert not pd.isnull(df.available_prop).any() + for col in availability_columns: + if col in df.columns: # This makes sure that even when all scenarios have not been created, the ones that are + # have appropriate values + assert (df[col] <= 1.0).all() and (df[col] >= 0.0).all() + assert not pd.isnull(df[col]).any() class ConsumablesSummaryCounter: diff --git a/src/tlo/methods/healthsystem.py b/src/tlo/methods/healthsystem.py index 5c6b2022e1..8c1f6f99b1 100644 --- a/src/tlo/methods/healthsystem.py +++ b/src/tlo/methods/healthsystem.py @@ -1076,10 +1076,16 @@ def update_consumables_availability_to_represent_merging_of_levels_1b_and_2(self how='left' ) + availability_columns = ['available_prop', 'available_prop_scenario1', 'available_prop_scenario2', + 'available_prop_scenario3', 'available_prop_scenario4', 'available_prop_scenario5', + 'available_prop_scenario6', 'available_prop_scenario7', 'available_prop_scenario8', + 'available_prop_scenario9', 'available_prop_scenario10', 'available_prop_scenario11', + 'available_prop_scenario12'] + # compute the updated availability at the merged level '1b' and '2' availability_at_1b_and_2 = \ dfx.drop(dfx.index[~dfx['Facility_Level'].isin(AVAILABILITY_OF_CONSUMABLES_AT_MERGED_LEVELS_1B_AND_2)]) \ - .groupby(by=['District', 'month', 'item_code'])['available_prop'] \ + .groupby(by=['District', 'month', 'item_code'])[availability_columns] \ .mean() \ .reset_index()\ .assign(Facility_Level=LABEL_FOR_MERGED_FACILITY_LEVELS_1B_AND_2) @@ -1108,7 +1114,7 @@ def 
update_consumables_availability_to_represent_merging_of_levels_1b_and_2(self # check values the same for everything apart from the facility level '2' facilities facilities_with_any_differences = set( df_updated.loc[ - ~(df_original == df_updated).all(axis=1), + ~(df_original.sort_values(['Facility_ID', 'month', 'item_code']).reset_index(drop=True) == df_updated).all(axis=1), 'Facility_ID'] ) level2_facilities = set( diff --git a/tests/test_consumables.py b/tests/test_consumables.py index 6ddf3b3a28..50a9e6faf4 100644 --- a/tests/test_consumables.py +++ b/tests/test_consumables.py @@ -606,7 +606,11 @@ def test_consumables_availability_modes_that_depend_on_designations(seed): target_items = items_drug_or_vaccine elif availability == 'none': target_items = set() - elif availability == 'default': + elif availability in ('default', + 'scenario1', 'scenario2', 'scenario3', 'scenario4', + 'scenario5', 'scenario6', 'scenario7', 'scenario8', + 'scenario9', 'scenario10', 'scenario11', 'scenario12', + ): continue else: raise ValueError(f'Unexpected availability: {availability}')