From b632850953dc8f1401543a2ae6fb7c8976179778 Mon Sep 17 00:00:00 2001
From: sm2511
Date: Wed, 22 May 2024 17:56:57 +0100
Subject: [PATCH 001/100] add script to prepare HHFA data for regression
 analysis

---
 ...nsumables_data_for_inferential_analysis.py | 811 ++++++++++++++++++
 1 file changed, 811 insertions(+)
 create mode 100644 src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/prepare_hhfa_consumables_data_for_inferential_analysis.py

diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/prepare_hhfa_consumables_data_for_inferential_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/prepare_hhfa_consumables_data_for_inferential_analysis.py
new file mode 100644
index 0000000000..6a496519e6
--- /dev/null
+++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/prepare_hhfa_consumables_data_for_inferential_analysis.py
@@ -0,0 +1,811 @@
+"""
+This script prepares HHFA data on consumable availability for regression analysis in R.
+
+Inputs:
+1. Raw HHFA data - Q1.csv (~Dropbox/Thanzi la Onse/07 - Data/HHFA_2018-19/0 raw/2_Final data/)
+2. Cleaned variable names for HHFA data - variable_list.csv (~Dropbox/Thanzi la Onse/07 - Data/HHFA_2018-19/1 processing/)
+3. Relevant distance calculations for regression analysis - facility_distances_hhfa.csv (~Dropbox/Thanzi la Onse/07 - Data/HHFA_2018-19/2 clean/)
+4. Consumable categorisations - items_hhfa.xlsx (~Dropbox/Thanzi la Onse/07 - Data/HHFA_2018-19/1 processing/)
+
+Outputs:
+1. Cleaned consumable availability data ready for regression analysis - cleaned_hhfa_2019.csv (~Dropbox/Thanzi la Onse/07 - Data/HHFA_2018-19/2 clean/)
+
+Consumable availability is measured as the probability of a stockout at any point in time.
+"""
+
+import copy
+import datetime
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+# Set local Dropbox source
+path_to_dropbox = Path(  # <-- point to the TLO dropbox locally
+    '/Users/sm2511/Dropbox/Thanzi la Onse'
+)
+
+path_to_files_in_the_tlo_dropbox = path_to_dropbox / "07 - Data/HHFA_2018-19/"  # <-- point to HHFA data folder in dropbox
+
+# define a timestamp for script outputs
+timestamp = datetime.datetime.now().strftime("_%Y_%m_%d_%H_%M")
+
+# print the start time of the script
+print('Script Start', datetime.datetime.now().strftime('%H:%M'))
+
+## 1. DATA IMPORT ##
+#########################################################################################
+raw_hhfa = pd.read_csv(path_to_files_in_the_tlo_dropbox / '0 raw/2_Final data/Q1.csv',
+                       low_memory=False)  # import 2018-19 data
+varnames = pd.read_csv(path_to_files_in_the_tlo_dropbox / '1 processing/variable_list.csv',
+                       encoding="ISO-8859-1")  # import file with cleaned variable names
+
+# Rename HHFA columns using the variable name mapping in the loaded .csv
+old_var_name = varnames['var']
+new_var_name = varnames['new_var_name']
+
+hhfa = copy.deepcopy(raw_hhfa)
+for i in range(len(new_var_name)):
+    # skip variables without a cleaned name; missingness must be tested with
+    # pd.notna() because a `!= np.nan` comparison is always True
+    if pd.notna(new_var_name[i]):
+        hhfa.rename(columns={old_var_name[i]: new_var_name[i]},
+                    inplace=True)
+
+# Rename columns with missing data to "a" and then drop these columns since these will not be used in the analysis
+hhfa.rename({np.nan: "a"}, axis="columns", inplace=True)
+hhfa.drop(["a"], axis=1, inplace=True)
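+
+# Minimal illustration of the pd.notna() check above (a sketch, not part of
+# the pipeline): NaN compares unequal to everything, including itself.
+assert np.nan != np.nan
+assert pd.notna("fac_code") and not pd.notna(np.nan)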
+
+## 2. FEATURE CLEANING/MANIPULATION ##
+#########################################################################################
+# Clean districts #
+cond = hhfa['district'] == 'Blanytyre'
+hhfa.loc[cond, 'district'] = 'Blantyre'
+
+cond = hhfa['district'] == 'Nkhatabay'
+hhfa.loc[cond, 'district'] = 'Nkhata Bay'
+
+# Private-for-profit hospital incorrectly labelled as a District Hospital
+cond = hhfa.fac_code == 5067
+hhfa.loc[cond, 'fac_type'] = 'Other Hospital'
+
+# Clean fac_owner
+cond = hhfa.fac_owner == 'Private non profit'
+hhfa.loc[cond, 'fac_owner'] = 'NGO'
+
+# Bed count = 0 if outpatient only
+for var in ['bed_count', 'inpatient_visit_count', 'inpatient_days_count']:
+    cond = hhfa['outpatient_only'] == "Yes"
+    hhfa.loc[cond, var] = 0
+
+# Number of functional computers = 0 if no functional computers
+cond = hhfa.functional_computer == "No"
+hhfa.loc[cond, 'functional_computer_no'] = 0
+
+for var in ['functional_landline', 'fuctional_mobile', 'functional_radio', 'functional_computer']:
+    cond = hhfa[var].str.contains("Yes")
+    hhfa.loc[cond, var] = "Yes"
+
+# If a facility has a functional emergency vehicle, then it has an accessible emergency vehicle
+cond = hhfa.functional_emergency_vehicle == "Yes"
+hhfa.loc[cond, 'accessible_emergency_vehicle'] = "Yes"
+
+cond = hhfa.accessible_emergency_vehicle == "No"
+hhfa.loc[cond, 'fuel_available_today'] = "No"
+
+cond = hhfa.accessible_emergency_vehicle == "No"
+hhfa.loc[cond, 'purpose_last_vehicle_trip'] = "Not applicable"
+
+# Correct the "Don't knows" #
+for var in ['fuel_available_today']:
+    cond = hhfa[var] == "98"
+    hhfa.loc[cond, var] = "Don't know"
+
+for var in ['functional_ambulance', 'functional_car', 'functional_bicycle', 'functional_motor_cycle',
+            'functional_bike_ambulance']:
+    cond = hhfa[var] == "No"
+    var_no = var + '_no'
+    hhfa.loc[cond, var_no] = 0
+
+# Daily opening hours can't be more than 24 hours
+cond = hhfa.fac_daily_opening_hours > 24
+hhfa.loc[cond, 'fac_daily_opening_hours'] = 24
+
+# Water source within 500m if piped into facility or facility grounds
+hhfa.water_source_main = hhfa.water_source_main.str.lower()
+cond1 = hhfa['water_source_main'] == "piped into facility"
+cond2 = hhfa['water_source_main'] == "piped onto facility grounds"
+hhfa.loc[cond1 | cond2, 'water_source_main_within_500m'] = "Yes"
+
+# Edit water_source variable to reduce the number of categories
+cond_other_watersource_1 = hhfa['water_source_main'] == 'protected spring'
+cond_other_watersource_2 = hhfa['water_source_main'] == 'unprotected dug well'
+cond_other_watersource_3 = hhfa['water_source_main'] == 'unprotected spring'
+cond_other_watersource_4 = hhfa['water_source_main'] == 'tanker truck'
+cond_other_watersource_5 = hhfa['water_source_main'] == 'cart w/small tank/drum ………'
+cond_other_watersource_6 = hhfa['water_source_main'] == 'rainwater collection'
+hhfa.loc[cond_other_watersource_1 | cond_other_watersource_2 | cond_other_watersource_3 | \
+         cond_other_watersource_4 | cond_other_watersource_5 | cond_other_watersource_6, \
+         'water_source_main'] = 'other'
+
+# Convert water disruption duration to days
+cond_hours = hhfa['water_disruption_duration_units'] == "Hours"
+cond_weeks = hhfa['water_disruption_duration_units'] == "Weeks"
+cond_months = hhfa['water_disruption_duration_units'] == "Months"
+hhfa.loc[cond_hours, 'water_disruption_duration'] = hhfa['water_disruption_duration'] / 24
+hhfa.loc[cond_weeks, 'water_disruption_duration'] = hhfa['water_disruption_duration'] * 7
+hhfa.loc[cond_months, 'water_disruption_duration'] = hhfa['water_disruption_duration'] * 365 / 12
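+
+# Sanity check of the unit conversions above (a minimal illustration, not part
+# of the cleaning pipeline): 48 hours -> 2 days, 2 weeks -> 14 days, and
+# 6 months -> roughly 182.5 days.
+assert 48 / 24 == 2 and 2 * 7 == 14 and abs(6 * 365 / 12 - 182.5) < 1e-9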
+
+# If no functional toilet, then no handwashing facility
+cond = hhfa.functional_toilet == "No"
+hhfa.loc[cond, 'functional_handwashing_facility'] = "No"
+
+# No CVD services at lower level facilities
+cond = hhfa.fac_type.str.contains("Hospital")
+hhfa.loc[~cond, 'service_cvd'] = "No"
+
+# Clean "other" category for how drug orders are placed
+hhfa.drug_resupply_calculation_system_other_spec = hhfa.drug_resupply_calculation_system_other_spec.str.lower()
+
+cond_notna = hhfa['drug_resupply_calculation_system_other_spec'].notna()
+cond_both = hhfa['drug_resupply_calculation_system_other_spec'].str.contains("both") | \
+            hhfa['drug_resupply_calculation_system_other_spec'].str.contains("for essentials") | \
+            hhfa['drug_resupply_calculation_system_other_spec'].str.contains("personal of emergency orders") | \
+            hhfa['drug_resupply_calculation_system_other_spec'].str.contains("pull & push")
+hhfa.loc[cond_both & cond_notna, 'drug_resupply_calculation_system'] = "Both push and pull"
+
+cond_pull = hhfa['drug_resupply_calculation_system_other_spec'].str.contains("buy") | \
+            hhfa['drug_resupply_calculation_system_other_spec'].str.contains("cstock") | \
+            hhfa['drug_resupply_calculation_system_other_spec'].str.contains("press") | \
+            hhfa['drug_resupply_calculation_system_other_spec'].str.contains("counting") | \
+            hhfa['drug_resupply_calculation_system_other_spec'].str.contains("the facility purchases") | \
+            hhfa['drug_resupply_calculation_system_other_spec'].str.contains("bought") | \
+            hhfa['drug_resupply_calculation_system_other_spec'].str.contains("purchased") | \
+            hhfa['drug_resupply_calculation_system_other_spec'].str.contains("private pharmacy")
+
+hhfa.loc[cond_pull & cond_notna, 'drug_resupply_calculation_system'] = "Facility itself (pull distribution system)"
+
+cond_push = hhfa['drug_resupply_calculation_system_other_spec'].str.contains("from the dho")
+hhfa.loc[
+    cond_push & cond_notna, 'drug_resupply_calculation_system'] = "A higher level facility (push distribution system)"
+
+# Clean "other" category for how drugs are transported
+# drug_transport_other_spec hasn't been cleaned as this hasn't been used in the final analysis
+
+# If a facility does not report whether a particular source of drugs is used but indicates
+# that another is used, mark the first as No
+source_drugs_varlist = ['source_drugs_cmst', 'source_drugs_local_warehouse', 'source_drugs_ngo',
+                        'source_drugs_donor', 'source_drugs_pvt']
+for source1 in source_drugs_varlist:
+    cond_source_empty = hhfa[source1].isna()
+    for source2 in source_drugs_varlist:
+        cond_source_other = hhfa[source2] == "Yes"
+        hhfa.loc[cond_source_empty & cond_source_other, source1] = "No"
+
+# If a facility does not report whether a particular drug transport system is used but indicates
+# that another is used, mark the first as No
+drug_transport_varlist = ['drug_transport_local_supplier', 'drug_transport_higher_level_supplier',
+                          'drug_transport_self', 'drug_transport_other']
+for transport1 in drug_transport_varlist:
+    cond_transport_empty = hhfa[transport1].isna()
+    for transport2 in drug_transport_varlist:
+        cond_transport_other = hhfa[transport2] == "Yes"
+        hhfa.loc[cond_transport_empty & cond_transport_other, transport1] = "No"
+
+# Drop outliers
+cond = hhfa.travel_time_to_district_hq > 1000
+hhfa.loc[cond, 'travel_time_to_district_hq'] = np.nan
+
+cond = hhfa.referrals_to_other_facs == "3"
+hhfa.loc[cond, 'referrals_to_other_facs'] = np.nan
+
+# Reduce the number of categories - whether a functional refrigerator is available
+cond_yes1 = hhfa.functional_refrigerator_epi.str.contains("Yes")
+cond_yes2 = hhfa.vaccine_storage.str.contains("Yes")
+cond_notna = hhfa.functional_refrigerator_epi.notna()
+hhfa.loc[(cond_yes1 | cond_yes2) & cond_notna, 'functional_refrigerator_epi'] = "Yes"
+cond_no = hhfa.functional_refrigerator_epi.str.contains("No")
+hhfa.loc[cond_no & cond_notna, 'functional_refrigerator_epi'] = "No"
+
+cond_na = hhfa.service_epi == "No"
+hhfa.loc[cond_na, 'vaccine_storage'] = "No"  # if no EPI services, vaccine storage is not needed
+
+cond_yes = hhfa.functional_refrigerator.str.replace(" ", "") == 'AVAILABLEANDFUNCTIONAL'
+hhfa.loc[cond_yes, 'functional_refrigerator'] = "Yes"
+cond_no1 = hhfa.functional_refrigerator.str.replace(" ", "") == 'NOTAVAILABLE'
+cond_no2 = hhfa.functional_refrigerator.str.replace(" ", "") == 'AVAILABLENOTFUNCTIONAL'
+hhfa.loc[cond_no1 | cond_no2, 'functional_refrigerator'] = "No"
+
+hhfa.functional_refrigerator_diagnostics = hhfa.functional_refrigerator_diagnostics.str.lower()
+cond_yes = hhfa.functional_refrigerator_diagnostics == "available and functional"
+cond_na = hhfa.functional_refrigerator_epi.isna()
+cond_notna = hhfa.functional_refrigerator_diagnostics.notna()
+hhfa.loc[cond_yes, 'functional_refrigerator_diagnostics'] = "Yes"
+
+# if the refrigerator is (not) available as per the Diagnostics section, mirror this
+# in the EPI section where the latter is missing
+hhfa.loc[cond_yes & cond_na, 'functional_refrigerator_epi'] = "Yes"
+# note the parentheses: `|` binds more tightly than `==`
+cond_no = (hhfa.functional_refrigerator_diagnostics == "available not functional") | \
+          hhfa.functional_refrigerator_diagnostics.str.contains("not available")
+hhfa.loc[cond_no, 'functional_refrigerator_diagnostics'] = "No"
+hhfa.loc[cond_no & cond_na, 'functional_refrigerator_epi'] = "No"
+
+# Aggregate refrigerator availability
+cond_fridge_epi = hhfa.functional_refrigerator_epi == "Yes"
+cond_fridge_diag = hhfa.functional_refrigerator_diagnostics == "Yes"
+cond_vaccine_storage = hhfa.vaccine_storage == "Yes"
+hhfa.loc[cond_fridge_epi | cond_fridge_diag | cond_vaccine_storage, 'functional_refrigerator'] = "Yes"
+cond_no_fridge_epi = hhfa.functional_refrigerator_epi == "No"
+cond_no_fridge_diag = hhfa.functional_refrigerator_diagnostics == "No"
+hhfa.loc[cond_no_fridge_epi & cond_no_fridge_diag, 'functional_refrigerator'] = "No"  # both sections report no functional refrigerator
+cond_no_vaccine_storage = hhfa.vaccine_storage == "No"
+cond_fridge_na = hhfa.functional_refrigerator.isna()
+hhfa.loc[cond_no_vaccine_storage & cond_fridge_na, 'functional_refrigerator'] = "No"
+
+# convert fac_location to binary (Yes/No)
+hhfa = hhfa.rename(columns={'fac_location': 'fac_urban'})
+cond1 = hhfa.fac_urban.str.lower() == "rural"
+hhfa.loc[cond1, 'fac_urban'] = 'No'
+cond2 = hhfa.fac_urban.str.lower() == "urban"
+hhfa.loc[cond2, 'fac_urban'] = 'Yes'
+
+# Clean water disruption variable
+cond = hhfa['water_disruption_last_3mts'] == "No"
+hhfa.loc[cond, 'water_disruption_duration'] = 0
+
+# Change incharge_drug_orders to lower case
+hhfa.incharge_drug_orders = hhfa.incharge_drug_orders.str.lower()
+
+# Clean other category for incharge_drug_orders
+incharge_drug_orders_other_mapping = pd.read_csv(path_to_files_in_the_tlo_dropbox / '1 processing/incharge_drug_orders_other_mapping.csv')
+hhfa = pd.merge(hhfa, incharge_drug_orders_other_mapping[['incharge_drug_orders_other_spec',
+                                                          'cleaned_incharge_drug_orders']],
+                on='incharge_drug_orders_other_spec',
+                how='left')
+cond = hhfa.incharge_drug_orders == 'other'
+hhfa.loc[cond, 'incharge_drug_orders'] = hhfa['cleaned_incharge_drug_orders']
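+
+# A minimal, self-contained illustration of the merge-based recode used above
+# (toy values only; the real mapping lives in incharge_drug_orders_other_mapping.csv):
+_toy = pd.DataFrame({'incharge_drug_orders': ['other', 'pharmacist'],
+                     'cleaned_incharge_drug_orders': ['nurse', np.nan]})
+_cond = _toy.incharge_drug_orders == 'other'
+_toy.loc[_cond, 'incharge_drug_orders'] = _toy['cleaned_incharge_drug_orders']
+assert list(_toy.incharge_drug_orders) == ['nurse', 'pharmacist']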
+
+## 3. CREATE CONSUMABLE AVAILABILITY DATAFRAME ##
+#########################################################################################
+# --- 3.1 Extract dataframe containing consumable availability from the raw HHFA dataframe --- #
+# Rename columns using the variable name mapping in the loaded .csv
+consumables = copy.deepcopy(raw_hhfa)
+for i in range(len(varnames['var'])):
+    # if HHFA variable has been mapped to a consumable and is not an equipment
+    if pd.notna(varnames['item'][i]) and pd.isna(varnames['equipment'][i]):
+        consumables.rename(columns={varnames['var'][i]: varnames['availability_metric'][i] + '_' + varnames['item'][i]},
+                           inplace=True)
+    elif varnames['var'][i] == 'Fcode':  # keep facility code variable
+        consumables.rename(columns={varnames['var'][i]: varnames['new_var_name'][i]},
+                           inplace=True)
+
+    # Mark all other unmapped variables to be dropped
+    else:
+        consumables.rename(columns={varnames['var'][i]: 'Drop'},
+                           inplace=True)
+
+consumables = consumables.drop(columns='Drop')
+
+# Check that there are no duplicate consumable + metric entries
+assert len(consumables.columns[consumables.columns.duplicated(keep='last')]) == 0
+
+# --- 3.2 Convert availability categories to binary entries --- #
+# - 3.2.1 List of columns asking about consumable availability on the day of survey - #
+today_cols = [col for col in consumables.columns if 'today' in col]
+# Get list of availability entries
+i = 0
+for col in today_cols:
+    consumables[col] = consumables[col].str.replace(" ", "")
+    if i == 0:
+        today_categories = consumables[col].unique()
+    else:
+        today_categories = np.append(today_categories, consumables[col].unique())
+    i = 1
+
+today_categories = list(dict.fromkeys(today_categories))
+today_categories = [x for x in today_categories if pd.isnull(x) == False]  # drop nan from list
+
+# Create dictionary to map availability options to a number
+today_categories_dict = {'NEVERAVAILABLE': -1}
+for i in ['NOTAVAILABLE', 'No', 'AVAILABLENOTFUNCTIONAL', 'NOTAVAILABLETODAY', 'AVAILABLENONVALID']:
+    today_categories_dict[i] = 0
+for i in ['AVAILABLE,OBSERVED', 'AVAILABLEANDFUNCTIONAL', 'ATLEASTONEVALID']:
+    today_categories_dict[i] = 1
+for i in ['AVAILABLE,NOTOBSERVED', 'Yes', "AVAILABLEDON'TKNOWIFFUNCTIONAL", 'REPORTEDAVAILABLEBUTNOTSEEN']:
+    today_categories_dict[i] = 2
+
+# Assert that no observed entries are missing from the dictionary
+today_mapping = {k: today_categories_dict[k] for k in today_categories_dict.keys() & set(today_categories)}
+assert len(today_mapping) == len(today_categories)
+
+# - 3.2.2 List of columns asking about consumable availability during the 3 months before survey - #
+last_mth_or_3mts_cols = [col for col in consumables.columns if 'last' in col]
+# Get list of availability entries
+i = 0
+for col in last_mth_or_3mts_cols:
+    consumables[col] = consumables[col].str.replace(" ", "")
+    if i == 0:
+        last_mth_or_3mts_cols_categories = consumables[col].unique()
+    else:
+        last_mth_or_3mts_cols_categories = np.append(last_mth_or_3mts_cols_categories, consumables[col].unique())
+    i = 1
+
+last_mth_or_3mts_cols_categories = list(dict.fromkeys(last_mth_or_3mts_cols_categories))
+last_mth_or_3mts_cols_categories = [x for x in last_mth_or_3mts_cols_categories if
+                                    pd.isnull(x) == False]  # drop nan from list
+
+# Create dictionary to map availability options to a number
+last_mth_or_3mts_cols_categories_dict = {'PRODUCTNOTOFFERED': -1}
+for i in ['NOTINDICATED', 'FACILITYRECORDNOTAVAILABLE']:
+    last_mth_or_3mts_cols_categories_dict[i] = -1
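+
+# At this point every entry in the dictionary is the -1 sentinel ("product not
+# offered" or "no record"); 3.2.3 below sets the two no-record responses to
+# missing, and 3.2.4 recodes the remaining -1s to 0 (unavailable). A quick
+# illustrative check, not part of the pipeline:
+assert all(v == -1 for v in last_mth_or_3mts_cols_categories_dict.values())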
+for i in ['STOCK-OUTINTHEPAST3MONTHS', 'Yes']:
+    last_mth_or_3mts_cols_categories_dict[i] = 0
+for i in ['NOSTOCK-OUTINPAST3MONTHS', 'No']:
+    last_mth_or_3mts_cols_categories_dict[i] = 1
+
+# Assert that no observed entries are missing from the dictionary
+last_mth_or_3mts_mapping = {k: last_mth_or_3mts_cols_categories_dict[k] for k in
+                            last_mth_or_3mts_cols_categories_dict.keys() & set(last_mth_or_3mts_cols_categories)}
+assert len(last_mth_or_3mts_mapping) == len(last_mth_or_3mts_cols_categories)
+
+# - 3.2.3 Recode all availability variables - #
+consumables_num = copy.deepcopy(consumables)
+for col in today_cols:
+    consumables_num[col] = consumables[col].map(today_categories_dict).fillna(consumables[col])
+for col in last_mth_or_3mts_cols:
+    # map first, then set the two "no record" responses to missing; setting them
+    # before mapping would be undone because the map reads from `consumables`
+    consumables_num[col] = consumables[col].map(last_mth_or_3mts_cols_categories_dict).fillna(consumables[col])
+    cond1 = (consumables[col] == 'NOTINDICATED')
+    cond2 = (consumables[col] == 'FACILITYRECORDNOTAVAILABLE')
+    consumables_num.loc[cond1 | cond2, col] = np.nan
+
+# - 3.2.4 Convert numeric availability variable to binary - #
+consumables_bin = copy.deepcopy(consumables_num)
+for col in today_cols:
+    cond0 = consumables_bin[col] == -1
+    cond1 = consumables_bin[col] == 2
+    consumables_bin.loc[cond0, col] = 0
+    consumables_bin.loc[cond1, col] = 1
+for col in last_mth_or_3mts_cols:
+    cond0 = consumables_bin[col] == -1
+    consumables_bin.loc[cond0, col] = 0
+
+## 4. RESHAPE DATA AND DROP DUPLICATE CONSUMABLE ENTRIES ##
+#########################################################################################
+consumables_long = pd.melt(consumables_bin, id_vars='fac_code', value_vars=today_cols,
+                           var_name='item', value_name='value')
+
+consumables_long = consumables_long.rename(columns={'value': 'available'})
+
+# Split consumable variable name into consumable name (item) and availability metric used
+consumables_long['temp'] = consumables_long.item.str.rfind('_', start=0, end=100)
+consumables_long['metric'] = consumables_long.apply(lambda x: x['item'][:x['temp']], axis=1)
+consumables_long['item'] = consumables_long.apply(lambda x: x['item'][x['temp'] + 1:], axis=1)
+consumables_long = consumables_long.drop(columns='temp')
+
+# For items which are duplicated, keep the metric under which more facilities have reported
+# Get list of items which are duplicated
+duplicated_items = consumables_long[consumables_long.duplicated(['item', 'fac_code'], keep=False)].item.unique()
+cond1 = consumables_long.item.isin(duplicated_items)
+cond2 = consumables_long.available.notna()
+
+# Generate variable denoting number of reporting facilities
+aggregated = consumables_long[cond2].groupby(['item', 'metric']).nunique()['fac_code']
+aggregated.name = 'report_count'
+consumables_long = consumables_long.join(aggregated, on=['item', 'metric'])
+
+# Sort in descending order of reporting facilities (the result must be assigned
+# back for the sort to take effect)
+consumables_long = consumables_long.sort_values(['item', 'report_count', 'fac_code'],
+                                                axis=0, ascending=[True, False, True])
+
+# Drop duplicates based on number of reporting facilities
+# so that if there are two records of the same consumable, the one with more reported observations is preserved
+consumables_long_unique = copy.deepcopy(consumables_long)
+consumables_long_unique.drop_duplicates(subset=['item', 'fac_code'], keep='first', inplace=True)
+
+# Assert that all duplicates have been addressed
+assert len(consumables_long_unique[consumables_long_unique.duplicated(['item', 'fac_code'], keep=False)].item.unique()) == 0
+print(len(consumables_long[consumables_long.duplicated(['item', 'fac_code'], keep=False)].item.unique()),
+      "duplicate items were reduced to",
+      len(consumables_long_unique[consumables_long_unique.duplicated(['item', 'fac_code'], keep=False)].item.unique()))
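+
+# Toy illustration of the sort-then-deduplicate step above (hypothetical
+# values, not part of the pipeline): within an item + facility pair, the
+# metric with the larger report_count survives.
+_toy = pd.DataFrame({'item': ['x', 'x'], 'fac_code': [1, 1],
+                     'metric': ['today', 'last3m'], 'report_count': [5, 9]})
+_toy = _toy.sort_values(['item', 'report_count'], ascending=[True, False])
+assert _toy.drop_duplicates(['item', 'fac_code'], keep='first').metric.iloc[0] == 'last3m'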
+
+## 5. ACCOUNT FOR SUBSTITUTABLE CONSUMABLES ##
+#########################################################################################
+# --- 5.1 ART components ---
+# Component 1: AZT or TDF or d4T
+cond1 = consumables_long_unique['item'].isin(['Zidovudine (ZDV, AZT)',
+                                              'Zidovudine (ZDV, AZT) syrup (ARVs)',
+                                              'Tenofovir Disoproxil Fumarate (TDF) (ARVs)',
+                                              'Zidovudine + Lamivudine (AZT + 3TC) (ARVs)',
+                                              'Zidovudine + Lamivudine + Abacavir (AZT + 3TC + ABC) (ARVs)',
+                                              'Zidovudine + Lamivudine + Nevirapine (AZT + 3TC + NVP) (ARVs)',
+                                              'Tenofovir + Emtricitabine (TDF + FTC) (ARVs)',
+                                              'Tenofovir + Lamivudine (TDF + 3TC) (ARVs)',
+                                              'Tenofovir + Lamivudine + Efavirenz (TDF + 3TC + EFV) (ARVs)',
+                                              'Tenofovir + Emtricitabine + Efavirenz (TDF + FTC + EFV)',
+                                              'Stavudine 30 or 40 (D4T) (ARVs)',
+                                              'Stavudine syrup (ARVs)',
+                                              'Stavudine + Lamivudine (D4T + 3TC) (ARVs)',
+                                              'Stavudine + Lamivudine + Nevirapine (D4T + 3TC + NVP) (ARVs)'])
+# Component 2: 3TC or FTC
+cond2 = consumables_long_unique['item'].isin(['Lamivudine (3TC) (ARVs)',
+                                              'Emtricitabine (FTC) (ARVs)',
+                                              'Lamivudine + Abacavir (3TC + ABC)',
+                                              'Zidovudine + Lamivudine (AZT + 3TC) (ARVs)',
+                                              'Zidovudine + Lamivudine + Abacavir (AZT + 3TC + ABC) (ARVs)',
+                                              'Zidovudine + Lamivudine + Nevirapine (AZT + 3TC + NVP) (ARVs)',
+                                              'Tenofovir + Emtricitabine (TDF + FTC) (ARVs)',
+                                              'Tenofovir + Lamivudine (TDF + 3TC) (ARVs)',
+                                              'Tenofovir + Lamivudine + Efavirenz (TDF + 3TC + EFV) (ARVs)',
+                                              'Tenofovir + Emtricitabine + Efavirenz (TDF + FTC + EFV)',
+                                              'Lamivudine (3TC) syrup (ARVs)',
+                                              'Stavudine + Lamivudine (D4T + 3TC) (ARVs)',
+                                              'Stavudine + Lamivudine + Nevirapine (D4T + 3TC + NVP) (ARVs)'])
+# Component 3: third drug in the regimen (an NNRTI, abacavir, or a protease inhibitor)
+cond3 = consumables_long_unique['item'].isin(['Abacavir (ABC) (ARVs)',
+                                              'Nevirapine (NVP) (ARVs)',
+                                              'Nevirapine (NVP) syrup (ARVs)',
+                                              'Efavirenz (EFV) (ARVs)',
+                                              'Lamivudine + Abacavir (3TC + ABC)',
+                                              'Zidovudine + Lamivudine + Abacavir (AZT + 3TC + ABC) (ARVs)',
+                                              'Zidovudine + Lamivudine + Nevirapine (AZT + 3TC + NVP) (ARVs)',
+                                              'Tenofovir + Lamivudine + Efavirenz (TDF + 3TC + EFV) (ARVs)',
+                                              'Tenofovir + Emtricitabine + Efavirenz (TDF + FTC + EFV)',
+                                              'Efavirenz (EFV) syrup (ARVs)',
+                                              'Stavudine + Lamivudine + Nevirapine (D4T + 3TC + NVP) (ARVs)',
+                                              'Lopinavir (LPV) (protease inhibitors)',
+                                              'Indinavir (IDV) (protease inhibitors)',
+                                              'Nelfinavir (NFV) (protease inhibitors)',
+                                              'Saquinavir (SQV) (protease inhibitors)',
+                                              'Ritonavir (RTV) (protease inhibitors)',
+                                              'Atazanavir (ATV) (protease inhibitors)',
+                                              'Fosamprenavir (FPV) (protease inhibitors)',
+                                              'Tipranavir (TPV) (protease inhibitors)',
+                                              'Darunavir (DPV) (protease inhibitors)'])
+
+# Component 1
+art_component_1 = consumables_long_unique[cond1].copy()
+art_component_1.loc[:, 'item'] = 'art_component_1'
+art_component_1 = art_component_1.groupby(
+    ['fac_code', 'item'],
+    as_index=False).agg({'available': np.nanmax,
+                         'metric': 'first'})
+
+# Component 2
+art_component_2 = consumables_long_unique[cond2].copy()
+art_component_2['item'] = 'art_component_2'
+art_component_2 = art_component_2.groupby(
+    ['fac_code', 'item'],
+    as_index=False).agg({'available': np.nanmax,
+                         'metric': 'first'})
+
+# Component 3
+art_component_3 = consumables_long_unique[cond3].copy()
+art_component_3['item'] = 'art_component_3'
+art_component_3 = art_component_3.groupby(
+    
['fac_code', 'item'], + as_index=False).agg({'available': np.nanmax, + 'metric': 'first'}) + +# Append all datasets +art = pd.concat([art_component_1, art_component_2, art_component_3]) + +consumables_postart = pd.concat([art,consumables_long_unique[~cond1 & ~cond2 & ~cond3]]) + +print(consumables_postart.item.nunique(), + "left out of", + consumables_long_unique.item.nunique(), + "items left after accounting for ART substitutes") + +# --- 5.2 Tuberculosis treatment substitutes --- # +# Combinations used as per https://stoptb.org/assets/documents/gdf/whatis/faq-brochure.pdf = RHZE, RHZ, RH, EH, TH +# Component 1: Isoniazid +cond1 = consumables_postart['item'].isin(['Isoniazid', + 'Isoniazid + Rifampicin (2FDC)', + 'Isoniazid + Ethambutol (EH) (2FDC)', + 'Isoniazid + Rifampicin + Pyrazinamide (RHZ) (3FDC)', + 'Isoniazid + Rifampicin + Ethambutol (RHE) (3FDC)', + 'Isoniazid + Rifampicin + Pyrazinamide + Ethambutol (4FDC)']) + +# Component 2: Rifampicin +cond2 = consumables_postart['item'].isin(['Rifampicin', + 'Isoniazid + Rifampicin (2FDC)', + 'Isoniazid + Rifampicin + Pyrazinamide (RHZ) (3FDC)', + 'Isoniazid + Rifampicin + Ethambutol (RHE) (3FDC)', + 'Isoniazid + Rifampicin + Pyrazinamide + Ethambutol (4FDC)']) + +# Component 3: Pyrazinamide +cond3 = consumables_postart['item'].isin(['Pyrazinamide', + 'Isoniazid + Rifampicin + Pyrazinamide (RHZ) (3FDC)', + 'Isoniazid + Rifampicin + Pyrazinamide + Ethambutol (4FDC)']) + +# Component 4: Ethambutol +cond4 = consumables_postart['item'].isin(['Ethambutol', + 'Isoniazid + Ethambutol (EH) (2FDC)', + 'Isoniazid + Rifampicin + Ethambutol (RHE) (3FDC)', + 'Isoniazid + Rifampicin + Pyrazinamide + Ethambutol (4FDC)']) + +# Component 1 +tb_component_1 = consumables_postart[cond1].copy() +tb_component_1['item'] = 'Isoniazid' +tb_component_1 = tb_component_1.groupby( + ['fac_code', 'item'], + as_index=False).agg({'available': np.nanmax, + 'metric': 'first'}) +# Component 2 +tb_component_2 = consumables_postart[cond2].copy() +tb_component_2['item'] = 'Rifampicin' +tb_component_2 = tb_component_2.groupby( + ['fac_code', 'item'], + as_index=False).agg({'available': np.nanmax, + 'metric': 'first'}) +# Component 3 +tb_component_3 = consumables_postart[cond3].copy() +tb_component_3['item'] = 'Pyrazinamide' +tb_component_3 = tb_component_3.groupby( + ['fac_code', 'item'], + as_index=False).agg({'available': np.nanmax, + 'metric': 'first'}) +# Component 4 +tb_component_4 = consumables_postart[cond4].copy() +tb_component_4['item'] = 'Ethambutol' +tb_component_4 = tb_component_4.groupby( + ['fac_code', 'item'], + as_index=False).agg({'available': np.nanmax, + 'metric': 'first'}) + +# Append all datasets +tb = pd.concat([tb_component_1, tb_component_2, tb_component_3, tb_component_4]) + +consumables_posttb = pd.concat([tb, consumables_postart[~cond1 & ~cond2 & ~cond3 & ~cond4]]) + +print(consumables_posttb.item.nunique(), + "left out of", + consumables_long_unique.item.nunique(), + "items left after accounting for ART and TB substitutes") + +# --- 5.3 Iron and folic acid --- # +# Component 1: Iron +cond1 = consumables_posttb['item'].isin(['Iron tablet', + 'Iron and folic combined tablets']) + +# Component 2: Folic Acid +cond2 = consumables_posttb['item'].isin(['Folic acid tablet', + 'Iron and folic combined tablets']) + +# Component 1 +fefo_component_1 = consumables_posttb[cond1].copy() +fefo_component_1['item'] = 'Iron tablet' +fefo_component_1 = fefo_component_1.groupby( + ['fac_code', 'item'], + as_index=False).agg({'available': np.nanmax, + 'metric': 'first'}) 
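+
+# The aggregation pattern used here and in 5.1-5.2 marks a component as
+# available when ANY substitutable formulation is in stock, via np.nanmax
+# within facility. A toy illustration with hypothetical values:
+_toy = pd.DataFrame({'fac_code': [1, 1, 2, 2], 'item': ['c'] * 4,
+                     'available': [0, 1, 0, 0], 'metric': ['today'] * 4})
+_toy = _toy.groupby(['fac_code', 'item'], as_index=False).agg(
+    {'available': np.nanmax, 'metric': 'first'})
+assert list(_toy.available) == [1, 0]  # facility 1 has a substitute in stock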
+# Component 2 +fefo_component_2 = consumables_posttb[cond2].copy() +fefo_component_2['item'] = 'Folic acid tablet' +fefo_component_2 = fefo_component_2.groupby( + ['fac_code', 'item'], + as_index=False).agg({'available': np.nanmax, + 'metric': 'first'}) + +# Append all datasets +fefo = pd.concat([fefo_component_1, fefo_component_2]) + +consumables_postfefo = pd.concat([fefo,consumables_posttb[~cond1 & ~cond2]]) + +print(consumables_postfefo.item.nunique(), + "left out of", + consumables_long_unique.item.nunique(), + "items left after accounting for ART, TB, and FeFo substitutes") + +# --- 5.4 Other substitutes --- # +# Merge in information on which consumables are substitutable for each other +item_categories = pd.read_excel(path_to_files_in_the_tlo_dropbox / '1 processing/items_hhfa.xlsx', + index_col=0, sheet_name = "items_hhfa") + +cols = ['item', 'substitute_group'] +consumables_postfefo = pd.merge(consumables_postfefo,item_categories[cols], how = 'left', on = 'item') +cond = consumables_postfefo.substitute_group == 'manual fix' +consumables_postfefo.loc[cond, 'substitute_group'] = np.nan + +j = 0 +for i in consumables_postfefo.substitute_group.unique(): + cond = consumables_postfefo.substitute_group == i + sub_group = consumables_postfefo[cond].copy() + sub_group = sub_group.groupby(['fac_code', 'substitute_group'], as_index=False).agg({'available': np.nanmax, + 'metric': 'first', + 'item': 'first'}) + if j == 0: + sub_groups_all = sub_group + j = 1 + else: + sub_groups_all = pd.concat([sub_groups_all, sub_group]) + +# Append with the remaining data +consumables_final = consumables_postfefo[consumables_postfefo.substitute_group.isna()] +consumables_final = pd.concat([sub_groups_all, consumables_final]) + +print(consumables_final.item.nunique(), + "left out of", + consumables_long_unique.item.nunique(), + "items left after accounting for all substitutes") + +## 6. 
PREPARE DATA FOR REGRESSION ## +######################################################################################### +# Keep columns to be analysed +feature_cols = ['fac_code', 'fac_name', + # Main characteristics + 'district', 'fac_type','fac_owner', 'fac_urban', + + # types of services offered + 'outpatient_only','bed_count', 'inpatient_visit_count','inpatient_days_count', + 'service_fp','service_anc','service_pmtct','service_delivery','service_pnc','service_epi','service_imci', + 'service_hiv','service_tb','service_othersti','service_malaria','service_blood_transfusion','service_diagnostic', + 'service_cvd','service_chronic_respiratory_mgt', 'service_consumable_stock', + + # operation frequency + 'fac_weekly_opening_days','fac_daily_opening_hours', + + # utilities/facilities available + 'functional_landline','fuctional_mobile','functional_radio','functional_computer','functional_computer_no', + 'internet_access_today', + 'electricity', 'water_source_main','water_source_main_within_500m', # this is in minutes + 'functional_toilet','functional_handwashing_facility', + 'water_disruption_last_3mts','water_disruption_duration', # converted to days + + # vehicles available + 'functional_emergency_vehicle','accessible_emergency_vehicle','fuel_available_today','purpose_last_vehicle_trip', + + 'functional_ambulance', 'functional_ambulance_no', # Keep one of the two variables + 'functional_car', 'functional_car_no', + 'functional_motor_cycle', 'functional_motor_cycle_no', + 'functional_bike_ambulance', 'functional_bike_ambulance_no', + 'functional_bicycle', 'functional_bicycle_no', + + 'vaccine_storage','functional_refrigerator', + + # Drug ordering process + 'incharge_drug_orders','drug_resupply_calculation_system','drug_resupply_calculation_method','source_drugs_cmst', + 'source_drugs_local_warehouse','source_drugs_ngo','source_drugs_donor','source_drugs_pvt', + + 'drug_transport_local_supplier','drug_transport_higher_level_supplier','drug_transport_self','drug_transport_other', + + 'drug_order_fulfilment_delay','drug_order_fulfilment_freq_last_3mts', + 'transport_to_district_hq','travel_time_to_district_hq', + + # referral system + 'referral_system_from_community','referrals_to_other_facs', + + # Drug management + 'mathealth_label_and_expdate_visible', 'mathealth_expdate_fefo', + 'childhealth_label_and_expdate_visible', 'childhealth_expdate_fefo'] + +# Merge consumable availability data +merged_df1 = pd.merge(consumables_final,hhfa[feature_cols], how = 'left', on = 'fac_code') + +# Merge distances data +fac_gis = pd.read_csv(path_to_files_in_the_tlo_dropbox / "2 clean/facility_distances_hhfa.csv") +merged_df2 = pd.merge(merged_df1, fac_gis[['fac_code', 'lat', 'long', 'lat_dh', 'long_dh', 'lat_rms', 'long_rms','rms', + 'dist_todh', 'dist_torms', 'drivetime_todh', 'drivetime_torms']], how = 'left', on = 'fac_code') + +# Merge item categories +item_categories = pd.read_excel(path_to_files_in_the_tlo_dropbox / '1 processing/items_hhfa.xlsx', + index_col=0, sheet_name = "items_hhfa") + +cols = ['drug_class_rx_list', 'item', 'mode_administration', 'program', 'item_type'] +df_for_regression = pd.merge(merged_df2,item_categories[cols], how = 'left', on = 'item') + +# Add information on program and other categorisations where this is not available +cond = df_for_regression.item.str.contains('art_component') +df_for_regression.loc[cond, 'program'] = 'hiv' +df_for_regression.loc[cond, 'mode_administration'] = 'oral' +df_for_regression.loc[cond, 'drug_class_rx_list'] = 'antiretroviral' 
+df_for_regression.loc[cond, 'item_type'] = 'drug' + +# Clean program to reduce the number of categories +cond_resp = df_for_regression.program == 'respiratory illness' +df_for_regression.loc[cond_resp,'program'] = 'acute lower respiratory infections' + +cond_road = df_for_regression.program == 'road traffic injuries' +df_for_regression.loc[cond_road,'program'] = 'surgical' + +cond_fungal = df_for_regression.program == 'fungal infection' +df_for_regression.loc[cond_fungal,'program'] = 'other' + +cond_iv = df_for_regression.program == 'IV and injectables' +df_for_regression.loc[cond_iv,'program'] = 'general' + +cond_ncd1 = df_for_regression.program == 'hypertension' +cond_ncd2 = df_for_regression.program == 'diabetes' +df_for_regression.loc[cond_ncd1|cond_ncd2,'program'] = 'ncds' + +cond_vit = df_for_regression.item == 'Vitamin A (retinol) capsule' +df_for_regression.loc[cond_vit, 'program'] = 'obstetric and newborn care' + +cond_mvit = df_for_regression.item == 'Multivitamins' +df_for_regression.loc[cond_mvit, 'program'] = 'general' + +cond_rutf = df_for_regression.item == 'Ready to use therapeutic food(RUTF)' +df_for_regression.loc[cond_rutf, 'program'] = 'child health' + +# Reduce the number of categories in mode_administration +cond = df_for_regression['mode_administration'] == "implant" +df_for_regression.loc[cond, 'mode_administration'] = "other" + +# Clean facility level +df_for_regression = df_for_regression.rename(columns = {'fac_type': 'fac_type_original'}) +df_for_regression['fac_type'] = "" + +cond_mch = (df_for_regression['fac_name'].str.contains('Mzuzu Cental Hospital')) +df_for_regression.loc[cond_mch, 'fac_name'] = 'Mzuzu Central Hospital' + +cond_level0 = (df_for_regression['fac_name'].str.contains('Health Post')) | \ + (df_for_regression['fac_type_original'].str.contains('Health Post')) +cond_level1a = (df_for_regression['fac_type_original'] == 'Clinic') | (df_for_regression['fac_type_original'] == 'Health Centre') | \ + (df_for_regression['fac_type_original'].str.contains('Dispensary')) | \ + (df_for_regression['fac_type_original'].str.contains('Maternity')) +cond_level1b = (df_for_regression['fac_type_original'].str.contains('Community Hospital')) | \ + (df_for_regression['fac_type_original'] == 'Other Hospital') +cond_level2 = (df_for_regression['fac_type_original'] == 'District Hospital') +cond_level3 = df_for_regression.fac_name.str.contains("Central Hospit") +cond_level4 = df_for_regression.fac_name.str.contains("Mental Hospit") + +df_for_regression.loc[cond_level0,'fac_type'] = 'Facility_level_0' +df_for_regression.loc[cond_level1a,'fac_type'] = 'Facility_level_1a' +df_for_regression.loc[cond_level1b,'fac_type'] = 'Facility_level_1b' +df_for_regression.loc[cond_level2,'fac_type'] = 'Facility_level_2' +df_for_regression.loc[cond_level3,'fac_type'] = 'Facility_level_3' +df_for_regression.loc[cond_level4,'fac_type'] = 'Facility_level_4' + +# Sort by facility type +df_for_regression['fac_type_original'] = pd.Categorical(df_for_regression['fac_type_original'], ['Health Post', + 'Dispensary', + 'Maternity', + 'Clinic', + 'Health Centre', + 'Rural/Community Hospital', + 'Other Hospital', + 'District Hospital', + 'Central Hospital']) +df_for_regression.sort_values(['fac_name','fac_type_original'], inplace = True) + +# Convert binary variables to numeric +fac_vars_binary = ['outpatient_only', 'fac_urban', + 'service_fp','service_anc','service_pmtct','service_delivery','service_pnc','service_epi','service_imci', + 
'service_hiv','service_tb','service_othersti','service_malaria','service_blood_transfusion','service_diagnostic', + 'service_cvd', 'service_consumable_stock', + 'functional_landline','fuctional_mobile','functional_radio','functional_computer','internet_access_today', + 'electricity', 'functional_toilet','functional_handwashing_facility', + 'water_disruption_last_3mts', 'water_source_main_within_500m', + 'functional_emergency_vehicle','accessible_emergency_vehicle', + 'functional_ambulance', + 'functional_car', + 'functional_motor_cycle', + 'functional_bike_ambulance', + 'functional_bicycle', + 'vaccine_storage','functional_refrigerator', + 'source_drugs_cmst','source_drugs_local_warehouse','source_drugs_ngo','source_drugs_donor','source_drugs_pvt', + 'drug_transport_local_supplier','drug_transport_higher_level_supplier','drug_transport_self','drug_transport_other', + 'referral_system_from_community','referrals_to_other_facs', + 'mathealth_label_and_expdate_visible', 'mathealth_expdate_fefo', + 'childhealth_label_and_expdate_visible'] + +binary_dict = {'yes': 1, 'no': 0} + +df_for_regression_binvars_cleaned = copy.deepcopy(df_for_regression) +for col in fac_vars_binary: + df_for_regression_binvars_cleaned[col] = df_for_regression_binvars_cleaned[col].str.lower() + + # replace don't know as missing + cond = df_for_regression_binvars_cleaned[col] == "don't know" + df_for_regression_binvars_cleaned.loc[cond, col] = np.nan + + assert len([x for x in df_for_regression_binvars_cleaned[col].unique() if + pd.isnull(x) == False]) == 2 # verify that the variable is binary + + df_for_regression_binvars_cleaned[col] = df_for_regression_binvars_cleaned[col].map(binary_dict).fillna(df_for_regression_binvars_cleaned[col]) + +# Save dataset ready for regression analysis as a .csv file +df_for_regression_binvars_cleaned.to_csv(path_to_files_in_the_tlo_dropbox / '2 clean/cleaned_hhfa_2019.csv') From a1646a5aed18aaf14a0545e2496ace340b5d83ec Mon Sep 17 00:00:00 2001 From: sm2511 Date: Wed, 22 May 2024 19:29:03 +0100 Subject: [PATCH 002/100] add scripts to run regression analysis --- .../regression_analysis/data_setup.R | 136 +++++++++ .../feature_manipulation.R | 202 +++++++++++++ .../load_packages_and_functions.R | 270 ++++++++++++++++++ .../regression_analysis/main.R | 15 + .../regression_analysis/regression_analysis.R | 181 ++++++++++++ 5 files changed, 804 insertions(+) create mode 100644 src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/data_setup.R create mode 100644 src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/feature_manipulation.R create mode 100644 src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/load_packages_and_functions.R create mode 100644 src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/main.R create mode 100644 src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/regression_analysis.R diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/data_setup.R b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/data_setup.R new file mode 100644 index 0000000000..b2cf39eaff --- /dev/null +++ 
b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/data_setup.R
@@ -0,0 +1,136 @@
+# This script loads the data for descriptive and inferential analysis
+
+# 1. Load HHFA data
+####################
+df_orig <- read.csv(paste0(path_to_data, "cleaned_hhfa_2019.csv"))
+
+# 1.1 Assign a code to items
+#----------------------------
+df_orig <- df_orig %>%
+  group_by(item) %>%
+  mutate(item_code = cur_group_id()) %>%
+  ungroup()
+
+# 2. Filter out data to be ignored
+###############################
+second_line_arvs <- c("Didanosine (DDI) (ARVs)",
+                      "Enfuvirtide (T-20) (ARVs)",
+                      "Delavirdine (DLV) (ARVs)")
+df <- df_orig %>%
+  filter(service_consumable_stock == 1, # drop facilities which do not store consumables
+         program != 'mental health/epilepsy', # mental health commodities are rare at this level of care
+         fac_daily_opening_hours != 0, # drop facilities which report 0 opening hours per day
+         !(item %in% second_line_arvs) # drop second-line ARVs
+  )
+
+# Formatting
+df$fac_code <- as.character(df$fac_code)
+
+# 3. Add consumable classification from the Essential Medicines List #
+#####################################################################
+essential_med_mapping <- read.csv(paste0(path_to_data, "essential_medicine_list_mapping.csv"))
+names(essential_med_mapping)[names(essential_med_mapping) == 'Consumable'] <- 'item'
+names(essential_med_mapping)[names(essential_med_mapping) == 'Therapeutic.priority'] <- 'eml_therapeutic_priority'
+
+df <- merge(df, essential_med_mapping[c('item','eml_therapeutic_priority')], by = 'item')
+# Generate binary variables ("V" = vital, "E" = essential)
+df$eml_priority_v <- 0
+df[which(df$eml_therapeutic_priority == "V"),]$eml_priority_v <- 1
+df$eml_priority_e <- 0
+df[which(df$eml_therapeutic_priority == "E"),]$eml_priority_e <- 1
+
+# 4. 
Define variable lists for analysis # +######################################### +fac_exp_vars <- c(# Main characteristics + 'district', 'fac_type','fac_owner', 'fac_urban', 'rms', 'item_drug', + + # types of services offered + 'outpatient_only','bed_count', 'inpatient_visit_count','inpatient_days_count', + 'service_fp','service_anc','service_pmtct','service_delivery','service_pnc','service_epi','service_imci', + 'service_hiv','service_tb','service_othersti','service_malaria','service_blood_transfusion', + 'service_cvd', 'service_diagnostic', + + # operation frequency + 'fac_weekly_opening_days','fac_daily_opening_hours', + + # utilities/facilities available + 'functional_landline','fuctional_mobile','functional_radio','functional_computer','functional_computer_no', + 'internet_access_today', + 'electricity', 'water_source_main','water_source_main_within_500m', # this is in minutes + 'functional_toilet','functional_handwashing_facility', + 'water_disruption_last_3mts','water_disruption_duration', # converted to days + + # vehicles available + 'functional_emergency_vehicle','accessible_emergency_vehicle','purpose_last_vehicle_trip', + + 'functional_ambulance', 'functional_ambulance_no', # Keep one of the two variables + 'functional_car', 'functional_car_no', + 'functional_motor_cycle', 'functional_motor_cycle_no', + 'functional_bike_ambulance', 'functional_bike_ambulance_no', + 'functional_bicycle', 'functional_bicycle_no', + 'vaccine_storage','functional_refrigerator', + + # Drug ordering process + 'incharge_drug_orders','drug_resupply_calculation_system','drug_resupply_calculation_method','source_drugs_cmst', + 'source_drugs_local_warehouse','source_drugs_ngo','source_drugs_donor','source_drugs_pvt', + + 'drug_transport_local_supplier','drug_transport_higher_level_supplier','drug_transport_self','drug_transport_other', + + 'drug_order_fulfilment_delay','drug_order_fulfilment_freq_last_3mts', + 'transport_to_district_hq','travel_time_to_district_hq', + + # referral system + 'referral_system_from_community','referrals_to_other_facs', + + # location w.r.t reference points + 'dist_todh', 'dist_torms', 'drivetime_todh', 'drivetime_torms') + +# Continuous variables +fac_vars_numeric <- c('bed_count', 'inpatient_visit_count','inpatient_days_count', 'functional_computer_no', + 'water_disruption_duration', + 'functional_ambulance_no', 'functional_car_no', 'functional_motor_cycle_no', 'functional_bike_ambulance_no', + 'functional_bicycle_no', + 'fac_weekly_opening_days','fac_daily_opening_hours', + 'drug_order_fulfilment_delay','drug_order_fulfilment_freq_last_3mts', + 'travel_time_to_district_hq', 'dist_todh', 'dist_torms', 'drivetime_todh', 'drivetime_torms') + +# Binary variables +fac_vars_binary <- c('outpatient_only', 'fac_urban', + 'service_fp','service_anc','service_pmtct','service_delivery','service_pnc','service_epi','service_imci', + 'service_hiv','service_tb','service_othersti','service_malaria','service_blood_transfusion', + 'service_cvd', 'service_diagnostic', + 'functional_landline','fuctional_mobile','functional_radio','functional_computer','internet_access_today', + 'electricity', 'functional_toilet','functional_handwashing_facility', + 'water_disruption_last_3mts', + 'functional_emergency_vehicle','accessible_emergency_vehicle', + 'functional_ambulance', + 'functional_car', + 'functional_motor_cycle', + 'functional_bike_ambulance', + 'functional_bicycle', + 'vaccine_storage','functional_refrigerator', + 
'source_drugs_cmst','source_drugs_local_warehouse','source_drugs_ngo','source_drugs_donor','source_drugs_pvt',
+                     'drug_transport_local_supplier','drug_transport_higher_level_supplier','drug_transport_self','drug_transport_other',
+                     'referral_system_from_community','referrals_to_other_facs')
+
+item_vars_binary <- c('item_drug', 'eml_priority_v', 'eml_priority_e')
+
+# Determinants of stock-outs as per the literature
+stockout_factors_from_lit <- c('fac_type', 'fac_owner', 'fac_urban','functional_computer',
+                               'functional_emergency_vehicle', 'incharge_drug_orders',
+                               'dist_todh','dist_torms','drug_order_fulfilment_freq_last_3mts',
+                               'service_diagnostic', 'item_drug','eml_priority_v')
+# drug_resupply_calculation_system has been dropped since it's not clear whether it was accurately reported
+
+# Look at data by item
+df_not_na <- subset(df, !is.na(df$available))
+df_by_item <- df %>%
+  group_by(item) %>%
+  summarise(available = mean(available, na.rm = TRUE)) # na.rm needed: `available` has missing values
+
+df_by_fac <- df_not_na %>%
+  group_by(fac_code) %>%
+  summarise(available = mean(available),
+            drug_resupply_calculation_system = first(na.omit(drug_resupply_calculation_system)),
+            fac_owner = first(na.omit(fac_owner)),
+            functional_bike_ambulance_no = first(na.omit(functional_bike_ambulance_no)))
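+
+# Minimal sanity check (illustrative, not used downstream): `available` is a
+# 0/1 indicator at this point, so all facility-level means must lie in [0, 1].
+stopifnot(df_by_fac$available >= 0, df_by_fac$available <= 1)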
diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/feature_manipulation.R b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/feature_manipulation.R
new file mode 100644
index 0000000000..307d526e77
--- /dev/null
+++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/feature_manipulation.R
@@ -0,0 +1,202 @@
+# This script performs feature manipulation on the cleaned HHFA dataset
+
+# 1. Feature manipulation #
+##########################################
+# 1.0 Clean binary variables
+#---------------------------------------
+# Convert item_type variable to binary
+df["item_drug"] <- 0
+df[which(df$item_type == "drug"),]$item_drug <- 1
+
+# 1.1 Cap variables/remove outliers
+#----------------------------------
+df$bed_count <- edit_outliers(df$bed_count, "upper")
+df$inpatient_visit_count <- edit_outliers(df$inpatient_visit_count, "upper")
+df$drug_order_fulfilment_delay <- edit_outliers(df$drug_order_fulfilment_delay, "upper")
+df$drug_order_fulfilment_freq_last_3mts <- edit_outliers(df$drug_order_fulfilment_freq_last_3mts, "upper")
+df$water_disruption_duration <- edit_outliers(df$water_disruption_duration, "upper")
+df$dist_todh <- edit_outliers(df$dist_todh, "upper")
+df$dist_torms <- edit_outliers(df$dist_torms, "upper")
+
+df$functional_ambulance_no <- edit_outliers(df$functional_ambulance_no, "upper")
+df$functional_car_no <- edit_outliers(df$functional_car_no, "upper")
+df$functional_bicycle_no <- edit_outliers(df$functional_bicycle_no, "upper")
+df$functional_motor_cycle_no <- edit_outliers(df$functional_motor_cycle_no, "upper")
+
+
+# 1.2 Rescale continuous variables
+#----------------------------------
+df <- df %>% dplyr::rename(
+  dist_todh_orig = dist_todh,
+  dist_torms_orig = dist_torms,
+  fac_daily_opening_hours_orig = fac_daily_opening_hours,
+  bed_count_orig = bed_count,
+  inpatient_visit_count_orig = inpatient_visit_count,
+  inpatient_days_count_orig = inpatient_days_count,
+  drug_order_fulfilment_delay_orig = drug_order_fulfilment_delay,
+  drug_order_fulfilment_freq_last_3mts_orig = drug_order_fulfilment_freq_last_3mts,
+  water_disruption_duration_orig = water_disruption_duration,
+  functional_bicycle_no_orig = functional_bicycle_no,
+  functional_motor_cycle_no_orig = functional_motor_cycle_no
+)
+
+df$dist_todh <- log(df$dist_todh_orig + 1)
+df$dist_torms <- log(df$dist_torms_orig + 1)
+df$fac_daily_opening_hours <- log(df$fac_daily_opening_hours_orig)
+df$bed_count <- log(df$bed_count_orig + 1)
+df$inpatient_visit_count <- log(df$inpatient_visit_count_orig + 1)
+df$inpatient_days_count <- log(df$inpatient_days_count_orig + 1)
+df$drug_order_fulfilment_delay <- log(df$drug_order_fulfilment_delay_orig + 1)
+df$drug_order_fulfilment_freq_last_3mts <- log(df$drug_order_fulfilment_freq_last_3mts_orig + 1)
+df$water_disruption_duration <- log(df$water_disruption_duration_orig + 1)
+
+df$functional_bicycle_no <- log(df$functional_bicycle_no_orig + 1)
+df$functional_motor_cycle_no <- log(df$functional_motor_cycle_no_orig + 1)
+
+
+# 1.3 Streamline categorical variables
+#-------------------------------------
+# Incharge of drug orders
+df[which(df$incharge_drug_orders == "pharmacy technician"|df$incharge_drug_orders == "pharmacist"),]$incharge_drug_orders <-
+  'pharmacist or pharmacy technician'
+
+# Water source
+df[which(df$water_source_main == "other"|df$water_source_main == "no water source"),]$water_source_main <-
+  'no convenient water source'
+
+# Generate categorical version of dist_torms
+# (bins are closed on the right so that boundary values are not misclassified)
+df$dist_torms_cat <- ""
+df$dist_torms_cat <- ifelse((df$dist_torms_orig > 10000) & (df$dist_torms_orig <= 50000), "10-50 kms",
+                     ifelse((df$dist_torms_orig > 50000) & (df$dist_torms_orig <= 100000), "50-100 kms",
+                     ifelse((df$dist_torms_orig > 100000) & (df$dist_torms_orig <= 200000), "100-200 kms",
+                     ifelse((df$dist_torms_orig <= 10000), "0-10 kms", "> 200 kms"))
+                     ))
+
+df$dist_torms_cat <- ifelse((is.na(df$dist_torms_orig)), NaN, df$dist_torms_cat)
+df$dist_torms_cat <- factor(df$dist_torms_cat, levels = c("0-10 kms", "10-50 kms", "50-100 kms", "100-200 kms", "> 200 kms")) # specify order
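+
+# The same binning can be written more compactly with cut(); a quick
+# equivalence check on hypothetical distances (illustrative only), before
+# applying the same pattern to dist_todh below:
+stopifnot(as.character(cut(c(5000, 30000, 250000),
+                           breaks = c(-Inf, 10000, 50000, 100000, 200000, Inf),
+                           labels = c("0-10 kms", "10-50 kms", "50-100 kms",
+                                      "100-200 kms", "> 200 kms"))) ==
+            c("0-10 kms", "10-50 kms", "> 200 kms"))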
+
+# Generate categorical version of dist_todh
+# (bins closed on the right, as for dist_torms above)
+df$dist_todh_cat <- ifelse((df$dist_todh_orig > 10000) & (df$dist_todh_orig <= 25000), "10-25 kms",
+                    ifelse((df$dist_todh_orig > 25000) & (df$dist_todh_orig <= 50000), "25-50 kms",
+                    ifelse((df$dist_todh_orig > 50000) & (df$dist_todh_orig <= 75000), "50-75 kms",
+                    ifelse((df$dist_todh_orig <= 10000), "0-10 kms", "> 75 kms"))
+                    ))
+
+df$dist_todh_cat <- ifelse((is.na(df$dist_todh_orig)), NaN, df$dist_todh_cat)
+df$dist_todh_cat <- factor(df$dist_todh_cat, levels = c("0-10 kms", "10-25 kms", "25-50 kms", "50-75 kms", "> 75 kms")) # specify order
+
+# Generate categorical version of drug_order_fulfilment_freq_last_3mts
+df$drug_order_fulfilment_freq_last_3mts_cat <- as.character(df$drug_order_fulfilment_freq_last_3mts_orig)
+df$drug_order_fulfilment_freq_last_3mts_cat <- ifelse((df$drug_order_fulfilment_freq_last_3mts_orig > 3), ">= 4",
+                                                      df$drug_order_fulfilment_freq_last_3mts_cat)
+
+df$drug_order_fulfilment_freq_last_3mts_cat <- factor(df$drug_order_fulfilment_freq_last_3mts_cat, levels = c("1", "2", "3", ">= 4")) # specify order
+
+
+# Drug transport
+df$drug_transport_self <- ifelse(is.na(df$drug_transport_self), NaN, df$drug_transport_self)
+df$drug_transport_higher_level_supplier <- ifelse(is.na(df$drug_transport_higher_level_supplier), NaN, df$drug_transport_higher_level_supplier)
+df$drug_transport_local_supplier <- ifelse(is.na(df$drug_transport_local_supplier), NaN, df$drug_transport_local_supplier)
+df$drug_transport_other <- ifelse(is.na(df$drug_transport_other), NaN, df$drug_transport_other)
+
+
+# 1.4 Create a joint drug storage practice variable from the individual ones
+# for oxytocin and amoxicillin
+#------------------------------------------------------------------------
+# Joint coding: 1 = both sections compliant, 2 = mixed, 3 = neither
+df$label_and_expdate_visible <- ifelse((df$mathealth_label_and_expdate_visible == 1) &
+                                         (df$childhealth_label_and_expdate_visible == 1), 1,
+                                       ifelse((df$mathealth_label_and_expdate_visible == 0) &
+                                                (df$childhealth_label_and_expdate_visible == 0), 3, 2))
+
+df$expdate_fefo <- ifelse((df$mathealth_expdate_fefo == 1) &
+                            (df$childhealth_expdate_fefo == 1), 1,
+                          ifelse((df$mathealth_expdate_fefo == 0) &
+                                   (df$childhealth_expdate_fefo == 0), 3, 2))
+
+df$label_and_expdate_visible <- factor(df$label_and_expdate_visible)
+df$expdate_fefo <- factor(df$expdate_fefo)
+
+# 1.5 Clean classification of items into programs
+#------------------------------------------------
+df[which(df$item == "Oral hypoglycaemics"),]$program <- "ncds"
+df[which(df$program == "hygiene/antiseptic"),]$program <- "other - infection prevention"
+df[which(df$item == "Disposable latex gloves"),]$program <- "other - infection prevention"
+df[which(df$item == "Medical (surgical or procedural) masks"),]$program <- "other - infection prevention"
+df[which(df$item == "Eye protection (goggles, face shields)"),]$program <- "other - infection prevention"
+df[which(df$item == "Gowns"),]$program <- "other - infection prevention"
+df[which(df$program == "other"),]$program <- "general"
+df[which(df$item == "Simvastatin tablet or other statin"),]$program <- "ncds"
+df[which(df$item == "Cryptococcal antigen"),]$program <- "hiv"
+df[which(df$item == "Ephedrine (injection)"),]$program <- "surgical"
+df[which(df$item == "Fluconazole"),]$program <- "hiv"
"slides and cover slips"),]$item <- "Slides and cover slips" +df[which(df$item == "art_component_1"),]$item <- "Antiretroviral treatment (ART) component 1 (ZDV/AZT/TDF/D4T)" +df[which(df$item == "art_component_2"),]$item <- "Antiretroviral treatment (ART) component 2 (3TC/FTC)" +df[which(df$item == "art_component_3"),]$item <- "Antiretroviral treatment (ART) component 3 (Protease inhibitor)" +df[which(df$item == "ACT"),]$item <- "Artemisinin-based combination therapy (ACT)" +df[which(df$item == "MRDT"),]$item <- "Malaria Rapid Diagnostic Test (MRDT)" +df[which(df$item == "SP"),]$item <- "Sulfadoxine/pyrimethamine" +df[which(df$item == "glucose inhibitors"),]$item <- "Glucose inhibitor" +df[which(df$item == "ACE inhibitor"),]$item <- "Angiotensin-converting enzyme(ACE) inhibitor" + +# 1.7 Clean level of care +#------------------------------------------- +#df[which(df$item == "Facility_level_1a"),]$item = "Primary level" +#df[which(df$item == "Facility_level_1b"),]$item = "Secondary level" + + +# 2. Create final dataframes for analysis +############################################################## +# Dataframes for sub-level analysis +df_level0 <- df[df$fac_type == 'Facility_level_0',] +df_level1a <- df[df$fac_type == 'Facility_level_1a',] +df_level1b <- df[df$fac_type == 'Facility_level_1b',] +df_level2 <- df[df$fac_type == 'Facility_level_2',] +df_level3 <- df[df$fac_type == 'Facility_level_2',] + + +# Set reference level for categorical variables +df$incharge_drug_orders <- relevel(factor(df$incharge_drug_orders), ref="drug store clerk") +df$district <- relevel(factor(df$district), ref="Lilongwe") +df$item <- relevel(factor(df$item), ref="Paracetamol cap/tab") +df$fac_owner <- relevel(factor(df$fac_owner), ref="Government") +df$fac_type <- relevel(factor(df$fac_type), ref="Facility_level_1a") +df$water_source_main <- relevel(factor(df$water_source_main), ref="piped into facility") +df$program <- relevel(factor(df$program), ref="general") +df$dist_torms_cat <- relevel(factor(df$dist_torms_cat), ref = "0-10 kms") +df$dist_todh_cat <- relevel(factor(df$dist_todh_cat), ref = "0-10 kms") +df$drug_order_fulfilment_freq_last_3mts_cat <- relevel(factor(df$drug_order_fulfilment_freq_last_3mts_cat), ref = "1") + +df_all_levels <- df + +# Dataframe for primary analysis - only levels 1a and 1b +df <- df %>% + filter(fac_type != 'Facility_level_4', # Zomba Mental Hospital + fac_type != 'Facility_level_3', # Central Hospitals + fac_type != 'Facility_level_2', # District Hospitals + fac_type != 'Facility_level_0', # Health posts + ) +# fac_types 2 and 3 are always urban and always government owned +# fac_type 0 has only 3 instances of non-government ownership, never provides cvd, +# never has a vehicle + +# 3 Create facility-level dataframe with the above adjustments +############################################################### +# For the dataframe with facilities in levels 1a and 1b +new_vars <- c('dist_torms_cat', 'dist_todh_cat', 'drug_order_fulfilment_freq_last_3mts_cat' ) +fac_features <- aggregate(df[unlist(c(fac_exp_vars, new_vars))], by = list(fac_code = df$fac_code), FUN = head, 1) +availability_by_facility <- aggregate( df[,'available'], list(fac_code = df$fac_code), + FUN = mean, na.rm = TRUE) +fac_reg_df <- merge(fac_features,availability_by_facility,by="fac_code") +fac_reg_df <- na.omit(fac_reg_df) + +# For the dataframe with all facilities +fac_features_all_levels <- aggregate(df_all_levels[unlist(c(fac_exp_vars, new_vars))], list(fac_code = df_all_levels$fac_code), FUN = head, 1) 
+
+# For the dataframe with all facilities
+fac_features_all_levels <- aggregate(df_all_levels[unlist(c(fac_exp_vars, new_vars))], list(fac_code = df_all_levels$fac_code), FUN = head, 1)
+availability_by_facility_all_levels <- aggregate(df_all_levels[,'available'], list(fac_code = df_all_levels$fac_code),
+                                                 FUN = mean, na.rm = TRUE)
+fac_reg_df_all_levels <- merge(fac_features_all_levels, availability_by_facility_all_levels, by="fac_code")
+fac_reg_df_all_levels <- na.omit(fac_reg_df_all_levels)
diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/load_packages_and_functions.R b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/load_packages_and_functions.R
new file mode 100644
index 0000000000..939a3bb655
--- /dev/null
+++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/load_packages_and_functions.R
@@ -0,0 +1,270 @@
+# This script loads the necessary packages for the regression analysis
+# and creates functions used in the feature manipulation and regression analysis scripts.
+
+# 1. Load libraries #
+#####################
+if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman") # install pacman once, if absent
+pacman::p_load(magrittr,     # for %>% to work
+               estimatr,     # for lm_robust to work (clustered SE)
+
+               dplyr,
+               modeest,
+               broom.mixed,  # for tidy to work to generate conf int table
+               stringr,
+
+               # Excel packages
+               caTools,
+               readxl,
+               writexl,
+               openxlsx,     # for write.xlsx
+               readr,
+               ggcorrplot,
+
+               # Regression packages
+               nlme,         # random effects regression - lme
+               lmerTest,     # random effects regression - lmer
+               ggfortify,    # for diagnostic plots
+               glmmTMB,      # Multilevel regression model
+               MASS,         # to run stepAIC with BIC criterion
+
+               # visualisation packages
+               jtools,       # for regression visualisations
+               sjPlot,       # for plot_model
+               sjmisc,       # for plot_model
+               viridis,
+               ggpubr,
+               ggplot2,
+               cowplot,      # for plot_grid (combine multiple graphs)
+               gridExtra,    # for combining forest plots (text grob)
+               grid,
+
+               # packages for tables
+               gtsummary,    # to get p-values in summary tables
+               huxtable,     # to export regression summary tables using export_summs
+               margins,      # to calculate average marginal effects from logistic regression results
+               janitor,      # for tabyl command - get summary of categorical variable
+               aod,          # wald.test
+
+               MESS,         # for model comparison
+               car,          # to run Wald tests on regression outputs (Anova function)
+               caret,        # to run k-fold cross validation
+               effects,      # to allow for type= "eff" in plot_model
+               cvTools,      # for k-fold cross validation
+               fastDummies   # to create dummies from categorical variable
+               )
+
+# Set file paths
+path_to_local_repo <- "/Users/sm2511/PycharmProjects/TLOmodel/" # Change this if different user
+path_to_dropbox <- "/Users/sm2511/Dropbox/Thanzi la Onse/" # Change this if different user
+path_to_files_in_dropbox <- paste0(path_to_dropbox, "05 - Resources/Module-healthsystem/consumables raw files/")
+path_to_data <- paste0(path_to_dropbox, "07 - Data/HHFA_2018-19/2 clean/")
+path_to_scripts <- paste0(path_to_local_repo, "src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/")
+
+# Set paths to store outputs
+dir.create(file.path(path_to_local_repo, "outputs/", "regression_analysis"))
+path_to_outputs <- paste0(path_to_local_repo, "outputs/", "regression_analysis/")
+dir.create(file.path(path_to_outputs, "regression_results"))
+dir.create(file.path(path_to_outputs, "predictions"))
+dir.create(file.path(paste0(path_to_outputs, "predictions/", "figures")))
+dir.create(file.path(path_to_outputs, "tables"))
+dir.create(file.path(path_to_outputs, "figures"))
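+
+# dir.create() warns when a directory already exists; on re-runs a quieter, idempotent
+# helper can be used instead (a convenience sketch - ensure_dir is not used elsewhere):
+ensure_dir <- function(path) dir.create(path, showWarnings = FALSE, recursive = TRUE)
+ensure_dir(file.path(path_to_outputs, "predictions", "figures")) # no-op if it already exists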
"figures")) + +# 2. Create functions # +####################### +# 2.1 Cap variables (or remove outliers) +#--------------------------------------- +edit_outliers <- function(varname, cap_direction){ + caps <- quantile(varname, probs=c(.05, .99), na.rm = T) + if (cap_direction == "upper"){ + varname[varname > caps[2]] <- caps[2] + } + if (cap_direction == "lower"){ + varname[varname < caps[1]] <- caps[1] + } + else{ + varname[varname < caps[1]] <- caps[1] + varname[varname > caps[2]] <- caps[2] + } + return(varname) +} + +# 2.2 Standardise variable +#------------------------------ +standardise <- function(varname){ + varname = (varname - mean(varname, na.rm = TRUE))/sd(varname, na.rm = TRUE) + return(varname) +} + +# 2.3 Function for captioning and referencing images and RMD +#-------------------------------------------------------------- +fig <- local({ + i <- 0 + ref <- list() + list( + cap=function(refName, text) { + i <<- i + 1 + ref[[refName]] <<- i + paste("Figure ", i, ": ", text, sep="") + }, + ref=function(refName) { + ref[[refName]] + }) +}) + +# 2.4 Function to insert row into a dataframe +#--------------------------------------------- +insertRow <- function(existingDF, newrow, r) { + existingDF[seq(r+1,nrow(existingDF)+1),] <- existingDF[seq(r,nrow(existingDF)),] + existingDF[r,] <- newrow + return(existingDF) +} + +# 2.5 Create forest plot of model results - main regression +#------------------------------------------------------------------------------------------- +custom_forest_plot <- function(model, ylimit= 4){ + # Choose model for graphing + model_forest <- model + + # Cleaned row names for dataframes + vars_of_interest <- c("fac_typeFacility_level_1b", "fac_ownerCHAM", "fac_ownerNGO","fac_ownerPrivate for profit", + "fac_urban", + "functional_computer", + "functional_emergency_vehicle", + "service_diagnostic", + "incharge_drug_orderscenter manager/owner","incharge_drug_ordersclinical officer", + "incharge_drug_ordershsa/shsa", "incharge_drug_ordersmedical assistant", + "incharge_drug_ordersnurse", "incharge_drug_ordersother", "incharge_drug_orderspharmacist or pharmacy technician", + "incharge_drug_orderspharmacy assistant", + "dist_todh_cat10-25 kms", "dist_todh_cat25-50 kms", "dist_todh_cat50-75 kms", "dist_todh_cat> 75 kms", + "dist_torms_cat10-50 kms", "dist_torms_cat50-100 kms", "dist_torms_cat100-200 kms", "dist_torms_cat> 200 kms", + "drug_order_fulfilment_freq_last_3mts_cat2", "drug_order_fulfilment_freq_last_3mts_cat3", "drug_order_fulfilment_freq_last_3mts_cat>= 4", + "item_drug", + "eml_priority_v") + reg_table_rownames <- data.frame(term = vars_of_interest, + newnames = c("...Level 1b", + "...Christian Health Association of Malawi (CHAM)", "...Non-governmental Organisation (NGO)", "...Private for profit", + "Facility is urban", + "Functional computer available", + "Emergency vehicle available", + "Diagnostic services available", + "...Center manager/owner", "...Clinical officer", + "...Health Surveillance Assistant (HSA)/Senior HSA", "...Medical assistant", + "...Nurse", "...Other", + "...Pharmacist or Pharmacy technician", "...Pharmacist assistant", + "...10-25 kms", "...25-50 kms", "...50-75 kms", "...> 75 kms", + "...10-50 kms", "...50-100 kms", "...100-200 kms", "...> 200 kms", + "...2", "...3", "...>=4", + "Consumable is a drug", + "Consumable is classified as vital")) + + + # Create the dataframe "forestdf" for the forest plot (Odds ratio, and upper/lower bounds) + 
#----------------------------------------------------------------------------------------- + forest_matrix <- tidy(model_forest,conf.int=TRUE,exponentiate=TRUE) + forest_matrix <- forest_matrix[which(forest_matrix$term %in% vars_of_interest),] + forest_matrix[c('estimate', 'conf.low', 'conf.high')] <- lapply(forest_matrix[c('estimate', 'conf.low', 'conf.high')], round, 2) + forest_matrix[c('p.value')] <- sprintf("%.4f",unlist(lapply(forest_matrix[c('p.value')], round, 4))) + forest_matrix[which(forest_matrix[c('p.value')] == "0.0000"),][c('p.value')] <- "<0.0001" # added on 11 March 2023 + + # Change rownames + forest_matrix$order <- 1:length(vars_of_interest) + forest_matrix <- merge(reg_table_rownames,forest_matrix,by="term") + forest_matrix$term <- forest_matrix$newnames + forest_matrix <- forest_matrix[order(forest_matrix$order),] + + forestdf <- structure(list(labels = structure(1:length(vars_of_interest), .Label = forest_matrix$term, class = "factor"), + rr = forest_matrix$estimate, rrhigh = forest_matrix$conf.high, rrlow = forest_matrix$conf.low), + class = "data.frame", row.names = c(NA, -29L)) # changes from factor to character + + + # Create the dataframe "fpplot" for the data table + #----------------------------------------------------------- + fplottable <- structure(list(labels = structure(1:length(vars_of_interest), .Label = forest_matrix$term, class = "factor"), + ci = paste0(forest_matrix$estimate, " (", forest_matrix$conf.low, " - ",forest_matrix$conf.high, ")"), + p = forest_matrix$p.value), + class = "data.frame", row.names = c(NA,-29L)) + + # Add reference level rows to above dataframes + #---------------------------------------------- + for (df_name in c('fplottable', 'forestdf')){ + df_results <- get(df_name) + df_results$labels <- as.character(df_results$labels) # change format to character in order to add new rows + r_fac_type <- c("Facility level (Ref: Level 1a)", rep(NA, dim(df_results)[2]-1)) + r_fac_owner <- c("Facility owner (Ref: Government)", rep(NA, dim(df_results)[2]-1)) + r_incharge_drug_orders <- c("Person in charge of drug orders (Ref: Drug store clerk)", rep(NA, dim(df_results)[2]-1)) + r_dist_todh <- c("Distance from DHO (Ref: 0-10kms)", rep(NA, dim(df_results)[2]-1)) + r_dist_torms <- c("Distance from RMS (Ref: 0-10kms)", rep(NA, dim(df_results)[2]-1)) + r_drug_order_fulfilment_freq_last_3mts <- c("Quarterly drug order fulfillment frequency (Ref: 1)", rep(NA, dim(df_results)[2]-1)) + regressor_headings = c("Facility level (Ref: Level 1a)", "Facility owner (Ref: Government)", + "Person in charge of drug orders (Ref: Drug store clerk)", "Distance from DHO (Ref: 0-10kms)", + "Distance from RMS (Ref: 0-10kms)", "Quarterly drug order fulfillment frequency (Ref: 1)") + + a <- which(df_results$labels == "...Level 1b") + df_results <- insertRow(df_results , r_fac_type, a) + b <- which(df_results$labels == "...Christian Health Association of Malawi (CHAM)") + df_results <- insertRow(df_results , r_fac_owner, b) + c <- which(df_results$labels == "...Center manager/owner") + df_results <- insertRow(df_results , r_incharge_drug_orders, c) + d <- which(df_results$labels == "...10-25 kms") + df_results <- insertRow(df_results , r_dist_todh, d) + e <- which(df_results$labels == "...10-50 kms") + df_results <- insertRow(df_results , r_dist_torms, e) + f <- which(df_results$labels == "...2") + df_results <- insertRow(df_results , r_drug_order_fulfilment_freq_last_3mts, f) + + # Add alternating color scheme + if((dim(df_results)[1] %% 2) == 0){ + df_results$colour <- 
rep(c("white", "gray"), dim(df_results)[1]/2) + } else { + df_results$colour <- c(rep(c("white", "gray"), dim(df_results)[1]/2), "white") + } + + assign(df_name, df_results) + } + + column_headers_space1 <- c("", NA, NA, NA, "white") + forestdf <- insertRow(forestdf , column_headers_space1, 1) + forestdf <<- insertRow(forestdf , column_headers_space1, 1) + + column_headers <- c("", "Odds ratio (95% CI)", "p-value", "white") + column_headers_space2 <- c("", "__________________", "________", "white") + fplottable <- insertRow(fplottable , column_headers, 1) + fplottable <<- insertRow(fplottable , column_headers_space2, 2) + + + # Create data table for plot + #------------------------------- + # Ensure that the order of labels does not change in the table + fplottable$labels <- factor(fplottable$labels, levels = rev(unique(fplottable$labels))) + + data_table <<- ggplot(data = fplottable, aes(y = labels, fontface = ifelse(forestdf$labels %in% regressor_headings, "italic", "plain")), + family = "Times") + + geom_hline(aes(yintercept = labels, colour = colour), size = 3) + + geom_text(aes(x = 0, label = labels), hjust = 0, size = 2.5) + + geom_text(aes(x = 5, label = ci), size = 2.5) + + geom_text(aes(x = 7, label = p), hjust = 1, size = 2.5) + + scale_colour_identity() + + theme_void() + + theme(plot.margin = ggplot2::margin(5, 0, 32, 0)) + + # Create forest plot for plot + #------------------------------- + forestdf[c('rr', 'rrhigh', 'rrlow')] <- sapply(forestdf[c('rr', 'rrhigh', 'rrlow')],as.numeric) + + p <<- ggplot(forestdf, aes(x = rr, y = labels, xmin = rrlow, xmax = rrhigh)) + + geom_hline(aes(yintercept = labels, colour = colour), size = 3) + # creates the grid #4.5 + geom_pointrange(shape = 20, fill = "black") + # previous 22 + geom_vline(xintercept = 1, linetype = 3) + # vertical line at x = 1 + xlab("Odds Ratio with 95% Confidence Interval") + + ylab("") + + theme_classic() + + scale_colour_identity() + + scale_y_discrete(limits = rev(forestdf$labels)) + + scale_x_log10(limits = c(0.20, ylimit*(1.15)), + breaks = 0.25 * 2^(seq(0,log(4*ylimit)/log(2),1)), + labels = as.character(0.25 * 2^(seq(0,log(4*ylimit)/log(2),1))), expand = c(0,0), + oob=scales::squish) + + theme(axis.text.y = element_blank(), axis.title.y = element_blank()) + + theme(plot.margin = ggplot2::margin(5, 0, 5, 0)) + + theme(axis.title=element_text(size=6, face = "bold")) +} diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/main.R b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/main.R new file mode 100644 index 0000000000..9d18533b32 --- /dev/null +++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/main.R @@ -0,0 +1,15 @@ +# This script sources other scripts needed for the regression analysis and controls the workflow. 
+
+# Load Libraries and Functions
source(paste0(path_to_scripts, "load_packages_and_functions.R"))
+
+# Load Data
+source(paste0(path_to_scripts, "data_setup.R"))
+
+# Data Preparation
+source(paste0(path_to_scripts, "feature_manipulation.R"))
+
+# Regression analysis
+source(paste0(path_to_scripts, "regression_analysis.R"))
+
+# Prediction
diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/regression_analysis.R b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/regression_analysis.R
new file mode 100644
index 0000000000..c22ed31087
--- /dev/null
+++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/regression_analysis.R
@@ -0,0 +1,181 @@
+# This script performs the regression analyses
+# Step 1: Choose the subset of variables to be included in the regression model. The Lancet Global Health paper
+# describes in detail how this list was arrived at; the script that produces chosen_varlist is therefore not included in master.
+# Link to paper - https://doi.org/10.1016/S2214-109X(24)00095-0
+# Link to branch which contains the full script - https://github.com/UCL/TLOmodel/tree/consumable_scenarios/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis
+
+# Step 2: Run the chosen regression model. See the links above for a description of how the facility and item random
+# effects model was chosen for the main result.
+
+####################################################################
+# 1. Set up list of variables to be included in the regression model
+####################################################################
+# 1.1 Choose variable list for regression model
+#-----------------------------------------------
+chosen_varlist_orig <- c( "available", "fac_urban",
+                          "fac_type" , "functional_computer",
+                          "incharge_drug_orders", "functional_emergency_vehicle",
+                          "dist_todh", "dist_torms",
+                          "drug_order_fulfilment_freq_last_3mts", "service_diagnostic",
+                          "item", "functional_refrigerator",
+                          "fac_owner", "service_fp",
+                          "service_othersti", "functional_handwashing_facility",
+                          "service_hiv", "service_malaria",
+                          "source_drugs_ngo", "service_cvd",
+                          "service_tb", "bed_count",
+                          "outpatient_only", "water_source_main",
+                          "functional_toilet", "functional_landline",
+                          "fuctional_mobile", "service_imci", # NB 'fuctional_mobile' (sic) matches the column name in the source data
+                          "drug_transport_self", "source_drugs_pvt",
+                          "rms", "item_drug",
+                          "eml_priority_v")
+
+# 1.2 Setting up categorical variables for regression analysis
+#--------------------------------------------------------------
+continuous_variables <- c("dist_todh", "dist_torms", "drug_order_fulfilment_freq_last_3mts")
+continuous_variables_cat_version <- c("dist_todh_cat", "dist_torms_cat",
+                                      "drug_order_fulfilment_freq_last_3mts_cat")
+
+chosen_varlist <- chosen_varlist_orig[!(chosen_varlist_orig %in% continuous_variables)]
+chosen_varlist <- unlist(c(chosen_varlist, continuous_variables_cat_version))
+
+# 1.3 Clean chosen independent variables list to ensure convergence of random effects model
+# - remove highly correlated variables
+# - remove variables with little variability
+#-------------------------------------------------------------------------------------------
+# Set up data for regression analysis
+full_df <- df[chosen_varlist]
+full_df <- na.omit(full_df)
+
+# Check correlation between independent variables
+varlist_check_corr 
<- chosen_varlist[!(chosen_varlist %in% c('district', 'fac_type', 'incharge_drug_orders', + 'fac_owner', 'water_source_main', 'item', 'rms', + continuous_variables_cat_version))] # 'program', 'mode_administration', +correlation_final_varlist <- cor(full_df[varlist_check_corr]) +corrplot_final_varlist <- ggcorrplot(correlation_final_varlist, lab_size = 1.5, p.mat = NULL, + insig = c("pch", "blank"), pch = 1, pch.col = "black", pch.cex =1, + tl.cex =5.5, lab = TRUE) +ggsave(plot = corrplot_final_varlist, filename = paste0(path_to_outputs, "figures/correlation_final_varlist.png")) + +# Based on the above results, drop variables on account of reasons listed below +vars_low_variation <- c('service_cvd') # %% service_diagnostic was previously dropped %% +vars_highly_correlated <- c('bed_count') # highly correlated with outpatient_only +chosen_varlist <- chosen_varlist[!(chosen_varlist %in% vars_low_variation)] +chosen_varlist <- chosen_varlist[!(chosen_varlist %in% vars_highly_correlated)] + +# bed_count is highly correlated with outpatient_only +# very few instances of no lab facilities/no diagnostic services + +# Update correlation matrix after dropping the above variables +varlist_check_corr_post <- chosen_varlist[!(chosen_varlist %in% c('district', 'fac_type', 'incharge_drug_orders', + 'fac_owner', 'water_source_main', 'item','rms', + continuous_variables_cat_version))] # 'program', 'mode_administration', +correlation_final_varlist_post <- cor(full_df[varlist_check_corr_post]) +corrplot_final_varlist_post <- ggcorrplot(correlation_final_varlist_post, lab_size = 1.5, p.mat = NULL, + insig = c("pch", "blank"), pch = 1, pch.col = "black", pch.cex =1, + tl.cex =5.5, lab = TRUE) + +ggsave(plot = corrplot_final_varlist_post, filename = paste0(path_to_outputs, "figures/correlation_final_varlist_post.png")) + + +# List of binary variables +bin_exp_vars <- c('fac_urban', 'functional_emergency_vehicle' ,'functional_computer', + 'service_diagnostic' , 'functional_refrigerator', 'functional_landline', + 'fuctional_mobile', 'functional_toilet', 'functional_handwashing_facility', + 'outpatient_only', 'service_hiv', 'service_othersti', 'service_malaria', + 'service_tb', 'service_fp', 'service_imci', 'source_drugs_ngo', + 'source_drugs_pvt', 'drug_transport_self', 'item_drug', "eml_priority_v") + + +####################################################################### +# 2. 
Set up Dataframe for Regression analysis
+#######################################################################
+# Drop items with too few facilities reporting, and facilities with too few items reported
+df_not_na_by_fac <- df %>%
+  group_by(fac_code) %>%
+  summarise(available_count = sum(!is.na(available))) %>%
+  arrange(available_count)
+
+df_not_na_by_item <- df %>%
+  group_by(item) %>%
+  summarise(available_count = sum(!is.na(available))) %>%
+  arrange(available_count)
+
+# Make a list of items with fewer than 10% of facilities reporting (these are the consumables relevant to higher level
+# RMNCH services, for which only 64 facilities submitted reports)
+items_with_too_few_obs <- subset(df_not_na_by_item, df_not_na_by_item$available_count < 100)['item']
+items_with_too_few_obs <- as.list(items_with_too_few_obs)
+
+# Make a list of facilities with fewer than 10% of items reported
+print(max(df_not_na_by_fac$available_count)) # largest number of items reported by any facility
+facs_with_too_few_obs <- subset(df_not_na_by_fac, df_not_na_by_fac$available_count <= 0.1*max(df_not_na_by_fac$available_count))['fac_code']
+facs_with_too_few_obs <- as.list(facs_with_too_few_obs)
+
+df_for_regs <- subset(df, !(fac_code %in% facs_with_too_few_obs$fac_code))
+df_for_regs <- subset(df_for_regs, !(item %in% items_with_too_few_obs$item))
+
+print(paste(length(facs_with_too_few_obs$fac_code), " facilities dropped."))
+print(paste(length(items_with_too_few_obs$item), " items dropped."))
+
+# Set up dataframe for facility and item random effects model
+#------------------------------------------------------------
+# Note that this is model 4 in the Lancet GH paper. We do not include models 1-3 because these are not used
+# for the analysis of the impact of consumable availability scenarios.
+# Add variables for random effects regression (re_reg)
+chosen_varlist_for_re_reg <- unlist(c(chosen_varlist, 'program', 'fac_code', 'item', 'district', 'item_type'))
+
+df_for_re_reg <- df_for_regs[chosen_varlist_for_re_reg]
+df_for_re_reg <- na.omit(df_for_re_reg)
+
+# Sort by fac_code
+df_for_re_reg_sorted <- df_for_re_reg[order(df_for_re_reg$fac_code, df_for_re_reg$item),]
+# Create a numeric value for fac_code (clustering variable)
+df_for_re_reg_sorted$fac_id <- as.integer(factor(df_for_re_reg_sorted$fac_code, levels=unique(df_for_re_reg_sorted$fac_code)))
+df_for_re_reg_sorted <- as.data.frame(df_for_re_reg_sorted)
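+
+# Note on the random-effects specification used in the next section (lme4 syntax):
+# (1|district/fac_code) expands to (1|district) + (1|district:fac_code), i.e. a random
+# intercept for each district and for each facility nested within its district;
+# (1|program/item) does the same for items nested within programs. A minimal sketch of
+# the same structure on generic data (illustrative only, not run here):
+# glmer(y ~ x + (1|g1/g2), family = binomial(logit), data = d)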
+
+#######################################################################
+# 3. Run Regression Model
+#######################################################################
+# Facility and item random effects logistic regression (model 4 in the Lancet GH paper); service_diagnostic is included
+model_fac_item_re <- glmer(available ~ fac_type + fac_owner + fac_urban + functional_computer +
+                             functional_emergency_vehicle + service_diagnostic +
+                             incharge_drug_orders +
+                             dist_todh_cat + dist_torms_cat +
+                             drug_order_fulfilment_freq_last_3mts_cat + rms +
+                             functional_refrigerator + functional_landline + fuctional_mobile +
+                             functional_toilet + functional_handwashing_facility +
+                             water_source_main +
+                             outpatient_only +
+                             service_hiv + service_othersti +
+                             service_malaria + service_tb +
+                             service_fp + service_imci +
+                             source_drugs_ngo + source_drugs_pvt +
+                             drug_transport_self + item_drug + eml_priority_v +
+                             (1|district/fac_code) + (1|program/item),
+                           family = binomial(logit),
+                           data = df_for_re_reg_sorted,
+                           control = glmerControl(optimizer = "bobyqa",
+                                                  optCtrl=list(maxfun=1e5),
+                                                  calc.derivs = TRUE)
+)
+
+# Calculate the intra-class correlation
+icc_between_model_fac_item_re <- performance::icc(model_fac_item_re, by_group = TRUE)
+
+# Save regression results
+#-------------------------
+save(model_fac_item_re, file = paste0(path_to_outputs, "regression_results/model_fac_item_re.rdta"))
+
+# Summarise results in a table
+#--------------------------------
+t <- tbl_regression(model_fac_item_re, exponentiate = TRUE, conf.int = TRUE, pvalue_fun = ~style_sigfig(., digits = 4))
+
+tbl_merge <-
+  tbl_merge(
+    tbls = list(t),
+    tab_spanner = c("**Facility and Item RE**") #
+  ) %>% # build gtsummary table
+  as_gt() # %>% # convert to gt table
+#   gt::gtsave(   # save table as image
+#     filename = reg_results1
+#   )
From eedb2e8c60828cc1e5a63a1f3010a313e0ba0c20 Mon Sep 17 00:00:00 2001
From: sm2511
Date: Thu, 23 May 2024 11:27:06 +0100
Subject: [PATCH 003/100] move file paths to main.R

---
 .../regression_analysis/load_packages_and_functions.R | 7 -------
 .../regression_analysis/main.R                        | 7 +++++++
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/load_packages_and_functions.R b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/load_packages_and_functions.R
index 939a3bb655..2649203fbb 100644
--- a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/load_packages_and_functions.R
+++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/load_packages_and_functions.R
@@ -53,13 +53,6 @@ pacman::p_load(magrittr, # for %>% to work
 fastDummies # to create dummies from categorical variable
 )
 
-# Set file paths
-path_to_local_repo <- "/Users/sm2511/PycharmProjects/TLOmodel/" # Change this if different user
-path_to_dropbox <- "/Users/sm2511/Dropbox/Thanzi la Onse/" # Change this if different user
-path_to_files_in_dropbox <- paste0(path_to_dropbox, "05 - Resources/Module-healthsystem/consumables raw files/")
-path_to_data <- paste0(path_to_dropbox, "07 - Data/HHFA_2018-19/2 clean/")
-path_to_scripts <- paste0(path_to_local_repo, "src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/")
-
 # Set paths to store outputs
 dir.create(file.path(path_to_local_repo, "outputs/", "regression_analysis"))
 path_to_outputs <- paste0(path_to_local_repo, "outputs/", "regression_analysis/")
diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/main.R b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/main.R
index 9d18533b32..7c637a62df 100644
--- a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/main.R
+++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/main.R
@@ -1,5 +1,12 @@
 # This script sources other scripts needed for the regression analysis and controls the workflow.
 
+# Set file paths
+path_to_local_repo <- "/Users/sm2511/PycharmProjects/TLOmodel/" # Change this if different user
+path_to_dropbox <- "/Users/sm2511/Dropbox/Thanzi la Onse/" # Change this if different user
+path_to_files_in_dropbox <- paste0(path_to_dropbox, "05 - Resources/Module-healthsystem/consumables raw files/")
+path_to_data <- paste0(path_to_dropbox, "07 - Data/HHFA_2018-19/2 clean/")
+path_to_scripts <- paste0(path_to_local_repo, "src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/")
+
 # Load Libraries and Functions
 source(paste0(path_to_scripts, "load_packages_and_functions.R"))
 
From 1f2c5b78b650a6b50d66aaf8ad754dc07654c7eb Mon Sep 17 00:00:00 2001
From: sm2511
Date: Thu, 23 May 2024 16:03:40 +0100
Subject: [PATCH 004/100] Add script to generate regression-based predicted
 changes to availability

---
 .../regression_analysis/predict.R | 145 ++++++++++++++++++
 1 file changed, 145 insertions(+)
 create mode 100644 src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/predict.R

diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/predict.R b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/predict.R
new file mode 100644
index 0000000000..64f86b48c6
--- /dev/null
+++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/predict.R
@@ -0,0 +1,145 @@
+# This script loads the outputs of the final regression model
+# and generates predictions for consumable availability scenarios
+
+###########################################################
+# 1. Load regression model output
+###########################################################
+load(paste0(path_to_outputs, "regression_results/model_fac_item_re.rdta"))
+# TODO: Change the above path
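+
+# Note on prediction (a sketch of the lme4 API, for orientation): predict() on a
+# glmerMod object conditions on the fitted random effects by default; the re.form
+# argument controls this, e.g.:
+#   predict(model_fac_item_re, newdata = df, type = "response")               # includes random effects
+#   predict(model_fac_item_re, newdata = df, type = "response", re.form = NA) # fixed effects only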
+
+###########################################################
+# 2. Run predictions for policy evaluation
+###########################################################
+# 2.1 Setup
+#--------------------------------------------------------------------------------------------------------------------------------
+# Replace program strings with shorter text for plotting
+df_regress <- df_for_re_reg_sorted
+df_regress$program_plot <- df_regress$program
+rep_str <- c('acute lower respiratory infections'='alri','obstetric and newborn care'='obs&newb',
+             'other - infection prevention'='infection_prev', 'child health' = 'child')
+df_regress$program_plot <- str_replace_all(df_regress$program_plot, rep_str)
+df_regress <- df_regress[unlist(c(chosen_varlist_for_re_reg, 'program_plot'))]
+
+# Drop one item which seems to be causing an issue in the prediction
+drop_items <- c("Dipsticks for urine ketone bodies for rapid diagnostic ")
+df_regress <- df_regress %>%
+  filter(!(item %in% drop_items)
+  )
+# TODO check why this item needs to be dropped
+# TODO check why there is an extra column item.1 in the dataframe
+
+# Generate a column with regression-based predictions
+df_regress <- df_regress %>%
+  mutate(available_prob_predicted = predict(model_fac_item_re, newdata=df_regress, type = "response"))
+df_regress$fac_type_original <- df_regress$fac_type # because predictions are run on fac_type and then data collapsed on the basis of this variable
+
+# 2.2 Run predictions by changing the 5 characteristics which have the largest association with consumable availability
+#--------------------------------------------------------------------------------------------------------------------------------
+top_5_features_for_cons_availability <- list('item_drug' = 0,
+                                             'eml_priority_v' = 1,
+                                             'incharge_drug_orders' = 'pharmacist or pharmacy technician',
+                                             'fac_type' = 'Facility_level_1b',
+                                             'fac_owner' = 'CHAM')
+top_5_features_for_cons_availability_cumulative_names <- list('item_drug' = 'scen1',
+                                             'eml_priority_v' = 1,
+                                             'incharge_drug_orders' = 'pharmacist or pharmacy technician',
+                                             'fac_type' = 'Facility_level_1b',
+                                             'fac_owner' = 'CHAM') # this is for naming
+
+i <- 0
+j <- 0
+for (feature in names(top_5_features_for_cons_availability)){
+  if (i == 0){
+    print(paste0("Running predictions for ", feature, " = ", top_5_features_for_cons_availability[feature]))
+    df <- df_regress
+    df[feature] <- top_5_features_for_cons_availability[feature] # cumulatively change facility and consumable features
+    old_prediction <- df$available_prob_predicted # store prediction from the previous run
+    i <- 1
+  } else{
+    print(paste0("Running predictions for ", feature, " = ", top_5_features_for_cons_availability[feature], " (cumulative, i.e. 
in addition to previous update)")) + df[feature] <- top_5_features_for_cons_availability[feature] # cumulatively change facility and consumable features + old_prediction <- df$available_prob_new_prediction # store prediction from the previous run + } + #unique_values_of_chosen_features <- df %>% summarise_at(vars(eml_priority_v, item_drug, incharge_drug_orders, fac_type, fac_owner), ~list(unique(.))) + + # Run prediction with update set of features + new_prediction <- predict(model_fac_item_re,newdata=df, type = "response") # Predict availability when all facilities have pharmacist managing drug orders + df$available_prob_new_prediction <- new_prediction + + print(paste0("New mean probability of availability is ",mean(df$available_prob_new_prediction, na.rm = TRUE) * 100, "%")) + + # Check that the prediction improves availability from the previous case + stopifnot(mean(old_prediction, na.rm = TRUE) < + mean(new_prediction, na.rm = TRUE)) + + # Calculate proportional change in availability from baseline prediction which can be applied to LMIS data + df$availability_change_prop <- df$available_prob_new_prediction/df$available_prob_predicted + + if (j == 0){ + # Collapse data + summary_pred <- df %>% + group_by(district, fac_type_original, program_plot, item) %>% + summarise_at(vars(available, available_prob_predicted, availability_change_prop), list(mean)) + + pred_col_number <- length(colnames(summary_pred)) + colnames(summary_pred)[pred_col_number] <- paste0('change_proportion_scenario', j+1) + all_predictions_df <- summary_pred + } else{ + # Collapse data + summary_pred <- df %>% + group_by(district, fac_type_original, program_plot, item) %>% + summarise_at(vars(availability_change_prop), list(mean)) + + pred_col_number <- length(colnames(summary_pred)) + colnames(summary_pred)[pred_col_number] <- paste0('change_proportion_scenario', j+1) # names(top_5_features_for_cons_availability)[j] + + all_predictions_df <- merge(all_predictions_df, summary_pred, by = c('district', 'fac_type_original', 'program_plot', 'item')) + } + j <- j + 1 +} + +########################################################### +# 3. Export predictions +########################################################### +write.csv(all_predictions_df,paste0(path_to_outputs, "predictions/predicted_consumable_availability_regression_scenarios.csv"), row.names = TRUE) + +################################################################## +# 4. 
Plot predicted availability under 5 scenarios +################################################################ +# Plot original values +p_original <- ggplot(all_predictions_df, aes(item, district, fill= available_prob_predicted)) + + geom_tile() + + facet_wrap(~fac_type_original) + + scale_fill_viridis_c(limits = c(0, 1), option = "viridis", direction = -1) + + theme(axis.text.x = element_text(angle = 45 , vjust = 0.7, size = 1), + axis.title.x = element_text(size = 8), axis.title.y = element_text(size = 8), + axis.text.y = element_text(size = 4), + legend.position = 'none', + plot.title = element_text(color="black", size=14, face="bold", hjust = 0.5)) + + labs(title = "Probability of consumable availability - actual", + subtitle =paste0("Global average = ", round(mean(all_predictions_df$available_prob_predicted) *100, 2),"%")) + + xlab("consumable") + +# Plot predicted values +p_predict <- ggplot(all_predictions_df, aes(item, district, fill= available_prob_predicted * change_proportion_scenario1)) + + geom_tile() + + facet_wrap(~fac_type_original) + + scale_fill_viridis_c(limits = c(0, 1), option = "viridis", direction = -1) + + theme(axis.text.x = element_text(angle = 45 , vjust = 0.7, size = 1), + axis.title.x = element_text(size = 8), axis.title.y = element_text(size = 8), + axis.text.y = element_text(size = 4), + legend.position = 'none', + plot.title = element_text(color="black", size=14, face="bold", hjust = 0.5)) + + labs(title = "Probability of consumable availability - predicted \n (all items are consumables rather than drugs)", + subtitle =paste0("Global average = ", round(mean(all_predictions_df$available_prob_predicted * all_predictions_df$change_proportion_scenario1) *100, 2),"%")) + + xlab("consumable") + +figure <- ggpubr::ggarrange(p_original, p_predict, # list of plots + labels = "AUTO", # labels + common.legend = T, # COMMON LEGEND + legend = "bottom", # legend position + align = "hv", # Align them both, horizontal and vertical + nrow = 2) %>% # number of rows + ggexport(filename = paste0(path_to_outputs, "predictions/figures/pred_item_is_consumable_other_than_drug.pdf")) + +# TODO Update this figure to include all scenarios From 64b21605a9bc618fed453a86917967cb4bdfb384 Mon Sep 17 00:00:00 2001 From: sm2511 Date: Thu, 23 May 2024 16:13:47 +0100 Subject: [PATCH 005/100] add predict.R to workflow --- .../regression_analysis/main.R | 1 + 1 file changed, 1 insertion(+) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/main.R b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/main.R index 7c637a62df..641e52bf88 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/main.R +++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_hhfa/regression_analysis/main.R @@ -20,3 +20,4 @@ source(paste0(path_to_scripts, "feature_manipulation.R")) source(paste0(path_to_scripts, "regression_analysis.R")) # Prediction +source(paste0(path_to_scripts, "predict.R")) From 33c90390ab30c11f1dfe6a0a82493335dd208e61 Mon Sep 17 00:00:00 2001 From: sm2511 Date: Thu, 23 May 2024 17:20:52 +0100 Subject: [PATCH 006/100] draft script to generate consumable availability scenarios --- ...ilability_scenarios_for_impact_analysis.py | 408 ++++++++++++++++++ 1 file changed, 408 insertions(+) create mode 100644 
src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py

diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py
new file mode 100644
index 0000000000..9d37bd1eef
--- /dev/null
+++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py
@@ -0,0 +1,408 @@
+"""
+This script adds estimates of availability of consumables under different scenarios to the base Resource File:
+
+Outputs:
+* Updated version of `ResourceFile_Consumables_availability_small.csv` (estimates of consumable availability - a
+smaller file for use in the simulation) which includes new consumable availability estimates for all scenarios
+
+Inputs:
+* outputs/regression_analysis/predictions/predicted_consumable_availability_regression_scenarios.csv - This file is hosted
+locally after running consumable_resource_analyses_with_hhfa/regression_analysis/main.R
+* `ResourceFile_Consumables_availability_small.csv` - This file contains the original consumable availability estimates
+from OpenLMIS 2018 data
+* `ResourceFile_Consumables_matched.csv` - This file contains the crosswalk of HHFA consumables to consumables in the
+TLO model
+
+The output contains one row per consumable, facility and month where the data are extracted from the OpenLMIS dataset,
+and one row per consumable (availability aggregated across all facilities) where the data are extracted from the
+Harmonised Health Facility Assessment 2018/19.
+
+Consumable availability is measured as the probability of a consumable being available at any point in time.
+"""
+import calendar
+import datetime
+from collections import defaultdict
+from pathlib import Path

+import matplotlib.pyplot as plt
+from plotnine import *  # ggplot, aes, geom_point for ggplots from R
+import numpy as np
+import pandas as pd
+
+from tlo.methods.consumables import check_format_of_consumables_file
+
+# Set local Dropbox source
+path_to_dropbox = Path(  # <-- point to the TLO dropbox locally
+    '/Users/sm2511/Dropbox/Thanzi la Onse'
+)
+
+# define a timestamp for script outputs
+timestamp = datetime.datetime.now().strftime("_%Y_%m_%d_%H_%M")
+
+# print the start time of the script
+print('Script Start', datetime.datetime.now().strftime('%H:%M'))
+
+# define a pathway to the data folder (note: currently outside the TLO model directory)
+# remember to set working directory to TLOmodel/
+outputfilepath = Path("./outputs")
+resourcefilepath = Path("./resources")
+path_for_new_resourcefiles = resourcefilepath / "healthsystem/consumables"
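+
+# Optional sanity check (a defensive addition, not part of the original pipeline logic):
+# confirm the availability resource file carries the columns this script relies on below.
+_expected_cols = {'Facility_ID', 'month', 'item_code', 'available_prop'}
+_header = pd.read_csv(path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv", nrows=0)
+assert _expected_cols.issubset(_header.columns), "availability resource file is missing expected columns"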
+
+# 1. Import and clean data files
+#**********************************
+# 1.1 Import TLO model availability data
+#------------------------------------------------------
+tlo_availability_df = pd.read_csv(path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv")
+# Drop any scenario data previously included in the resourcefile
+tlo_availability_df = tlo_availability_df[['Facility_ID', 'month', 'item_code', 'available_prop']]
+
+# 1.1.1 Attach district, facility level and program to this dataset
+#----------------------------------------------------------------
+# Get TLO Facility_ID for each district and facility level
+mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv")
+districts = set(pd.read_csv(resourcefilepath / 'demography' / 'ResourceFile_Population_2010.csv')['District'])
+fac_levels = {'0', '1a', '1b', '2', '3', '4'}
+tlo_availability_df = tlo_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']],
+                                                on=['Facility_ID'], how='left')
+
+# 1.1.2 Attach programs
+programs = pd.read_csv(path_for_new_resourcefiles / "ResourceFile_Consumables_availability_and_usage.csv")[['category', 'item_code', 'module_name']]
+# TODO See if programs can be extracted from a different location as ResourceFile_Consumables_availability_and_usage.csv is now deprecated in master
+programs = programs.drop_duplicates('item_code')
+tlo_availability_df = tlo_availability_df.merge(programs, on=['item_code'], how='left')
+
+# 1.2 Import scenario data
+#------------------------------------------------------
+scenario_availability_df = pd.read_csv(outputfilepath / "regression_analysis/predictions/predicted_consumable_availability_regression_scenarios.csv")
+scenario_availability_df = scenario_availability_df.drop(['Unnamed: 0'], axis=1)
+scenario_availability_df = scenario_availability_df.rename({'item': 'item_hhfa'}, axis=1)
+
+# Get scenario data ready to be merged based on TLO model features
+# 1.2.1 Level of care
+#------------------------------------------------------
+scenario_availability_df['fac_type'] = scenario_availability_df['fac_type'].str.replace("Facility_level_", "")
+
+# 1.2.2 District
+#------------------------------------------------------
+# Do some mapping to make the Districts line up with the definition of Districts in the model
+rename_and_collapse_to_model_districts = {
+    'Mzimba South': 'Mzimba',
+    'Mzimba North': 'Mzimba',
+}
+scenario_availability_df['district_std'] = scenario_availability_df['district'].replace(rename_and_collapse_to_model_districts)
+
+# Give cities the same results as their respective regions
+copy_source_to_destination = {
+    'Mzimba': 'Mzuzu City',
+    'Lilongwe': 'Lilongwe City',
+    'Zomba': 'Zomba City',
+    'Blantyre': 'Blantyre City',
+    'Nkhata Bay': 'Likoma'  # based on anecdotal evidence, assume that Likoma experiences the same change in availability as Nkhata Bay as a result of the interventions
+}
+for source, destination in copy_source_to_destination.items():
+    new_rows = scenario_availability_df.loc[scenario_availability_df.district_std == source].copy()
+    new_rows.district_std = destination
+    scenario_availability_df = pd.concat([scenario_availability_df, new_rows], ignore_index=True)
+
+# 1.2.3 Facility_ID
+#------------------------------------------------------
+# Merge-in facility_id
+scenario_availability_facid_merge = scenario_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']],
+                                                                   left_on=['district_std', 'fac_type'],
+                                                                   
right_on=['District', 'Facility_Level'], how='left', indicator=True) +scenario_availability_facid_merge = scenario_availability_facid_merge.rename({'_merge': 'merge_facid'}, axis=1) + +# Extract list of District X Facility Level combinations for which there is no HHFA data +scenario_availability_df_test = scenario_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], + left_on=['district_std', 'fac_type'], + right_on=['District', 'Facility_Level'], how='right', indicator=True) +cond_no_1b = (scenario_availability_df_test['Facility_Level'].isin(['1b'])) & (scenario_availability_df_test['_merge'] == 'right_only') +cond_no_1a = (scenario_availability_df_test['Facility_Level'].isin(['1a'])) & (scenario_availability_df_test['_merge'] == 'right_only') +districts_with_no_scenario_data_for_1b = scenario_availability_df_test[cond_no_1b]['District'].unique() +districts_with_no_scenario_data_for_1a = scenario_availability_df_test[cond_no_1a]['District'].unique() +districts_with_no_scenario_data_for_1b_only = np.setdiff1d(districts_with_no_scenario_data_for_1b, districts_with_no_scenario_data_for_1a) + +# According to HHFA data, Balaka, Machinga, Mwanza, Ntchisi and Salima do not have level 1b facilities +# Likoma was not included in the regression because of the limited variation within the district - only 4 facilities + +# 1.2.4 Program +#------------------------------------------------------ +map_model_programs_to_hhfa = { + 'contraception': 'contraception', + 'general': 'general', + 'reproductive_health': 'obs&newb', + 'road_traffic_injuries': 'surgical', + 'epi': 'epi', + 'neonatal_health': 'obs&newb', + 'other_childhood_illnesses': 'alri', + 'malaria': 'malaria', + 'tb': 'tb', + 'hiv': 'hiv', + 'undernutrition': 'child', + 'ncds': 'ncds', + 'cardiometabolicdisorders': 'ncds', + 'cancer': 'ncds', +} +# TODO Check if the above mapping is correct +# TODO check how infection_prev should be mapped + +scenario_availability_facid_merge['category_tlo'] = scenario_availability_facid_merge['program_plot'].replace(map_model_programs_to_hhfa) + +# 1.2.5 Consumable/Item code +#------------------------------------------------------ +# Load TLO - HHFA consumable name crosswalk +consumable_crosswalk_df = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_consumables_matched.csv', encoding='ISO-8859-1') +# TODO Check that this crosswalk is complete + +# Now merge in TLO item codes +scenario_availability_facid_merge = scenario_availability_facid_merge.reset_index().drop(['index'], axis=1) +scenario_availability_facid_itemcode_merge = scenario_availability_facid_merge.merge(consumable_crosswalk_df[['item_code', 'item_hhfa', 'regression_application', 'module_name']], + on = ['item_hhfa'], how='right', indicator=True, validate = "m:m") +scenario_availability_facid_itemcode_merge = scenario_availability_facid_itemcode_merge.drop_duplicates(['Facility_ID', 'item_code']) +#scenario_availability_facid_itemcode_merge.to_csv(outputfilepath / "temp_merge_status.csv") +scenario_availability_facid_itemcode_merge = scenario_availability_facid_itemcode_merge.rename({'_merge': 'merge_itemcode'}, axis=1) +print("Number of item codes from the TLO model for which no match was found in the regression-based scenario data = ", scenario_availability_facid_itemcode_merge.merge_itemcode.value_counts()[1]) + +# Before merging the above dataframe with tlo_availability_df, we need to duplicate rows for each of the unmatched items +# to represent all facility IDs + +# Further a row needs to be added for 1b level 
under Balaka, Machinga, Mwanza, Ntchisi and Salima
+print("Number of unique facility IDs: \n TLO consumable data = ", tlo_availability_df.Facility_ID.nunique(),
+      "\n Scenario based on regression = ", scenario_availability_facid_itemcode_merge.Facility_ID.nunique(),
+      "\n Number of unique item codes: \n TLO consumable data = ", tlo_availability_df.item_code.nunique(),
+      "\n Scenario based on regression = ", scenario_availability_facid_itemcode_merge.item_code.nunique())
+
+# 1.2.6 Interpolation/Imputation where data is missing
+#------------------------------------------------------
+# 1.2.6.1 Items not matched
+#------------------------------------------------------
+# Before merging the scenario dataframe with tlo_availability_df, generate rows with all 57 relevant facility IDs for item_codes
+# which are not matched
+df = scenario_availability_facid_itemcode_merge
+df_missing = df.loc[df['Facility_ID'].isna()].reset_index()
+df_missing = df_missing.drop_duplicates('item_code')
+
+df_existing = df.loc[~df['Facility_ID'].isna()]
+
+# Create a cartesian product of unique values in columns 'item_code' and 'Facility_ID' from both dataframes
+df_new = pd.DataFrame({'Facility_ID': np.repeat(df_existing['Facility_ID'].unique(), df_missing['item_code'].nunique()),
+                       'item_code': np.tile(df_missing['item_code'].unique(), df_existing['Facility_ID'].nunique())})
+# Merge df_new with df_missing on columns 'item_code' and 'Facility_ID'
+df_new = df_new[['Facility_ID', 'item_code']].merge(df_missing.drop('Facility_ID', axis=1), on=['item_code'], how='left', validate="m:1")
+# df_new = df_new.append(df_existing)
+df_new = pd.concat([df_new, df_existing], ignore_index=True)
+
+# df_new.to_csv(outputfilepath / 'temp_scenario_file_with_adequate_rows.csv')
+scenario_final_df = df_new
+# len(scenario_final_df[~scenario_final_df['change_proportion_scenario1'].isna()])  # debugging check of non-missing rows
+
+# Now provide the change_proportion_scenario* values for rows where this data is not available
+
+# (Debugging aid) Extract the list of TLO consumables which weren't matched with the availability prediction dataframe
+# scenario_availability_facid_itemcode_merge[items_not_matched][['item_code', 'regression_application']].to_csv(outputfilepath / 'temp_items_not_matched.csv')
+
+# Get the average change_proportion_scenario* values by Facility_ID and category
+scenario_final_df = scenario_final_df.drop('module_name', axis=1).merge(programs,
+                                                                        on=['item_code'], validate="m:1",
+                                                                        how="left")
+list_of_scenario_variables = ['change_proportion_scenario1', 'change_proportion_scenario2',
+                              'change_proportion_scenario3', 'change_proportion_scenario4', 'change_proportion_scenario5']
+scenario_averages_by_program_and_facid = scenario_final_df.groupby(['Facility_ID', 'category'])[list_of_scenario_variables].mean().reset_index()
+
+# a. Some consumables which were matched with a corresponding HHFA item do not appear in the above dataset because
+# they were excluded from the regression analysis due to missing availability information on them
+# from most facilities. We will assume that their availability changes as per the average overall change
+# (e.g. 
diazepam, morphine, atropine) +items_not_matched = scenario_final_df['merge_itemcode'] == 'right_only' +scenario_final_df[items_not_matched]['regression_application'].unique() + +unmatched_items_category1 = (items_not_matched) & ((scenario_final_df['regression_application'].isna()) | \ + (scenario_final_df['regression_application'] == 'proxy')) + +# b.'assume average' +unmatched_items_category2 = (items_not_matched) & (scenario_final_df['regression_application'] == 'assume average') + +# Replace missing instances with the above average values for categories 1 and 2 +scenario_cat1_and_cat2 = scenario_final_df[(unmatched_items_category1|unmatched_items_category2)] +scenario_cat1_and_cat2 = scenario_cat1_and_cat2.drop(list_of_scenario_variables, axis = 1) + +scenario_cat1_and_cat2 = scenario_cat1_and_cat2.merge(scenario_averages_by_program_and_facid, + on = ['Facility_ID','category'], validate = "m:1", + how = "left") +for var in list_of_scenario_variables: + var_imputed = var + '_imputed' + scenario_cat1_and_cat2[var_imputed] = scenario_cat1_and_cat2[var] + scenario_final_df = scenario_final_df.merge(scenario_cat1_and_cat2[['Facility_ID','item_code', var_imputed]], + on = ['Facility_ID','item_code'], validate = "1:1", + how = "left") + scenario_final_df.loc[(unmatched_items_category1|unmatched_items_category2), var] = scenario_final_df[var_imputed] + +# c. 'not relevant to logistic regression analysis' +unmatched_items_category3 = (items_not_matched) & (scenario_final_df['regression_application'] == 'not relevant to logistic regression analysis') +# For category 3, replace availability_change_prop with 1, since we assume that the system-level intervention does not change availability +for var in list_of_scenario_variables: + scenario_final_df.loc[unmatched_items_category3,var] = 1 + +# d. any other categories of unmatched consumables +unmatched_items_category4 = ~unmatched_items_category1 & ~unmatched_items_category2 & ~unmatched_items_category3 & items_not_matched +assert(sum(unmatched_items_category4) == 0) # check that we haven't missed any consumables + +# 1.2.6.2 Inf values +#------------------------------------------------------ +# Where the values are Inf because the availability changed from 0 to X, replace with the average for the category and Facility_ID +for var in list_of_scenario_variables: + print(f"Running scenario {var}") + change_value_is_infinite = scenario_final_df[var].isin([np.inf]) + print(sum(change_value_is_infinite)) + ''' + # None of the values is infinite so we don't have to run the code below + df_inf = scenario_final_df[change_value_is_infinite].reset_index() + + average_change_across_category_and_facid = scenario_final_df[~change_value_is_infinite].groupby(['Facility_ID','category'])[var].mean().reset_index() + + df_inf_replaced = df_inf.drop(var, axis = 1).merge(average_change_across_category_and_facid, + on = ['Facility_ID','category'], + how = 'left', validate = "m:1") + scenario_final_df = pd.concat([scenario_final_df[~change_value_is_infinite], df_inf_replaced], ignore_index = True) + ''' + +#cond1 = scenario_final_df.District.isin(districts_with_no_scenario_data_for_1b_only) +#cond2 = scenario_final_df.Facility_Level == '1b' +#scenario_final_df[cond1 & cond2] + +# 2. 
Merge TLO model availability data with scenario data using crosswalk
+#*************************************************************************
+# 2.1 Merge the two datasets
+#------------------------------------------------------
+id_variables = ['item_code', 'Facility_ID', 'regression_application']
+
+new_availability_df = tlo_availability_df.merge(scenario_final_df[id_variables + list_of_scenario_variables],
+                                                how='left', on=['Facility_ID', 'item_code'], indicator=True)
+new_availability_df = new_availability_df.rename({'_merge': 'merge_scenario'}, axis=1)
+new_availability_df = new_availability_df.drop_duplicates(['Facility_ID', 'item_code', 'month'])
+
+# 2.2 Further imputation
+#------------------------------------------------------
+# 2.2.1 For level 1b in the districts where this level was not present in the regression analysis/HHFA dataset, assume
+# that the change is equal to the product of (i) the ratio of the average change across districts for level 1b to the
+# average change across districts for level 1a, and (ii) the change for each item_code for level 1a in that district
+#------------------------------------------------------
+average_change_across_districts = scenario_final_df.groupby(['Facility_Level', 'item_code'])[list_of_scenario_variables].mean().reset_index()
+
+# average_change_across_districts = scenario_final_df.groupby(['Facility_Level','item_code']).agg({'availability_change_prop': lambda x: x.mean(skipna = False)}).reset_index()
+new_colnames_1a = {col: col + '_1a' if col in list_of_scenario_variables else col for col in average_change_across_districts.columns}
+new_colnames_1b = {col: col + '_1b' if col in list_of_scenario_variables else col for col in average_change_across_districts.columns}
+average_change_across_districts_for_1a = average_change_across_districts[average_change_across_districts.Facility_Level == "1a"].rename(new_colnames_1a, axis=1).drop('Facility_Level', axis=1)
+average_change_across_districts_for_1b = average_change_across_districts[average_change_across_districts.Facility_Level == "1b"].rename(new_colnames_1b, axis=1).drop('Facility_Level', axis=1)
+ratio_of_change_across_districts_1b_to_1a = average_change_across_districts_for_1a.merge(average_change_across_districts_for_1b,
+                                                                                         how="left", on=['item_code'])
+# For each scenario, express the size of the availability improvement at level 1b relative to level 1a
+# (both values are multipliers, so subtract 1 to work with the change itself)
+for var in list_of_scenario_variables:
+    var_ratio = 'ratio_' + var
+    var_1a = var + '_1a'
+    var_1b = var + '_1b'
+    ratio_of_change_across_districts_1b_to_1a[var_ratio] = (ratio_of_change_across_districts_1b_to_1a[var_1b] - 1)/(ratio_of_change_across_districts_1b_to_1a[var_1a] - 1)
+ratio_of_change_across_districts_1b_to_1a.reset_index()
+
+# Use the above ratios for those districts with no level 1b facilities recorded in the HHFA data
+cond_1b_missing_district = new_availability_df.District.isin(districts_with_no_scenario_data_for_1b_only)
+cond_1b = new_availability_df.Facility_Level == '1b'
+cond_1a = new_availability_df.Facility_Level == '1a'
+df_missing_1b = new_availability_df[cond_1b_missing_district & cond_1b]
+df_1a = new_availability_df[cond_1b_missing_district & cond_1a]
+
+ratio_vars = ['ratio_' + item for item in list_of_scenario_variables]
+item_var = ['item_code']
+df_missing_1b_imputed = df_missing_1b.merge(ratio_of_change_across_districts_1b_to_1a[item_var + ratio_vars],
+                                            on=['item_code'],
+                                            how='left', validate="m:1")
+for var in list_of_scenario_variables:
+    # check that the values we are replacing are in fact missing
+    assert np.isnan(df_missing_1b_imputed[var].unique()).all()
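+
+# Worked illustration of the imputation below (illustrative numbers, not from the data):
+# if availability of an item at level 1a in a district improves by a factor of 1.20
+# (i.e. a change of +0.20), and the cross-district ratio of 1b to 1a improvements for
+# that item is 0.5, the imputed change for level 1b in that district is
+# 1 + (1.20 - 1) * 0.5 = 1.10.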
'item_code', 'month'] +df_missing_1b_imputed = df_missing_1b_imputed.drop(list_of_scenario_variables, axis = 1).merge(df_1a[id_vars_level1a + list_of_scenario_variables], + on = id_vars_level1a, + how = 'left', validate = "1:1", indicator = True) + +for var in list_of_scenario_variables: + df_missing_1b_imputed[var] = ((df_missing_1b_imputed[var]-1) * df_missing_1b_imputed['ratio_' + var]) + 1 + +new_availability_df_imputed = pd.concat([new_availability_df[~(cond_1b_missing_district & cond_1b)], df_missing_1b_imputed], ignore_index = True) + +# 2.2.2 For all levels other than 1a and 1b, there will be no change in consumable availability +#------------------------------------------------------ +fac_levels_not_relevant_to_regression = new_availability_df_imputed.Facility_Level.isin(['0', '2', '3', '4']) +new_availability_df_imputed.loc[fac_levels_not_relevant_to_regression, 'availability_change_prop'] = 1 + +# 2.3 Final checks +#------------------------------------------------------ +# 2.3.1 Check that the merged dataframe has the same number of unique items, facility IDs, and total +# number of rows as the original small availability resource file +#------------------------------------------------------ +assert(new_availability_df_imputed.item_code.nunique() == tlo_availability_df.item_code.nunique()) +assert(new_availability_df_imputed.Facility_ID.nunique() == tlo_availability_df.Facility_ID.nunique()) +assert(len(new_availability_df_imputed) == len(tlo_availability_df)) + +# 2.3.2. Browse missingness in the availability_change_prop variable +#------------------------------------------------------ +pivot_table = pd.pivot_table(new_availability_df_imputed, + values=list_of_scenario_variables, + index=['category'], + columns=['Facility_Level'], + aggfunc=lambda x: sum(pd.isna(x))/len(x)*100) + +print(pivot_table[('change_proportion_all_features', '1b')]) +''' +Cases which are still missing data: +1. For the 5 districts without 1b facilities in HHFA (Balaka, Machinga, Mwanza, Ntchisi, Salima), data on 54 items +for 1b is missing. +2. Chitipa 1b is missing data on 5 consumables - 176, 177,178,179,181,192, 2675 +3. 184 187 are missing for nearly all districts + +Previously, Likoma did not have data from the regression analysis - I have now used values from Nkhata bay as proxy +''' + +# 2.2.4 PLaceholder code to replace all other missing values +# TODO Check why there are still instances of missing data when regression_application is assume average or proxy +#------------------------------------------------------ +# For all other cases, assume no change +for var in list_of_scenario_variables: + missing_change_data = new_availability_df_imputed[var].isna() + new_availability_df_imputed.loc[missing_change_data, var] = new_availability_df_imputed[var].mean() + +# Where the merge_scenario == "left_only", we need to provide data on "availability_change_prop" +# new_availability_df_imputed.to_csv(outputfilepath / 'current_status_of_scenario_merge.csv') +# TODO Check why the city and district both have an instance of facility level 2 + +# 3. 
Generate scenario data on consumable availablity +#------------------------------------------------------ +# Create new consumable availability estimates for TLO model consumables using +# estimates of proportional change from the regression analysis based on HHFA data +#------------------------------------------------------ +prefix = 'change_proportion_' +list_of_scenario_suffixes = [s.replace(prefix, '') for s in list_of_scenario_variables] + +for scenario in list_of_scenario_suffixes: + new_availability_df_imputed['available_prop_scenario_' + scenario] = new_availability_df_imputed['available_prop'] * new_availability_df_imputed['change_proportion_' + scenario] + availability_greater_than_1 = new_availability_df_imputed['available_prop_scenario_' + scenario] > 1 + new_availability_df_imputed.loc[availability_greater_than_1, 'available_prop_scenario_' + scenario] = 1 + + assert(sum(new_availability_df_imputed['available_prop_scenario_' + scenario].isna()) == + sum(new_availability_df_imputed['change_proportion_' + scenario].isna())) + +# Save +#------------------------------------------------------ +final_list_of_scenario_vars = ['available_prop_scenario_' + item for item in list_of_scenario_suffixes] +old_vars = ['Facility_ID', 'month', 'item_code', 'available_prop'] +full_df_with_scenario = new_availability_df_imputed[old_vars + final_list_of_scenario_vars].reset_index().drop('index', axis = 1) +# Remove suffix for column names ending with '_cat' to mean 'categorised' +full_df_with_scenario.columns = [col.replace('_cat', '') if col.endswith('_cat') else col for col in full_df_with_scenario.columns] + +# Save updated consumable availability resource file with scenario data +full_df_with_scenario.to_csv( + path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv", + index=False +) +# TODO: What about cases which do not exist in HHFA data? 
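# A minimal, self-contained sketch of the capping step above, with hypothetical
# values: scenario availability is the baseline availability scaled by the
# estimated proportional change, truncated at 1 because availability is a
# probability. Series.clip is an equivalent alternative to the boolean-mask
# assignment used in the script.
import pandas as pd

demo = pd.DataFrame({'available_prop': [0.4, 0.8, 0.95],
                     'change_proportion_scenario1': [1.5, 1.4, 0.9]})
demo['available_prop_scenario1'] = (demo['available_prop']
                                    * demo['change_proportion_scenario1']).clip(upper=1.0)
print(demo['available_prop_scenario1'].tolist())  # second value is capped at 1.0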
+# TODO: Create a column providing the source of scenario data From 3f1fdff0a31d8a78ce3a8fbbac57483018f486e8 Mon Sep 17 00:00:00 2001 From: sm2511 Date: Tue, 28 May 2024 16:40:34 +0100 Subject: [PATCH 007/100] update consumable RF to include TLO-HHFA consumable mapping --- .../consumables/ResourceFile_consumables_matched.csv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv b/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv index 7754d65118..d7798357fc 100644 --- a/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv +++ b/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fbfe91222d3a2a32ed44a4be711b30c5323276a71df802f6c9249eb4c21f8d43 -size 90158 +oid sha256:66b68e216701f841382e3add63305d55e1598862076789352ac7d799100e6d1a +size 110047 From 1e4631507f0b03f5e2e3927b02021edaf4e5cfdb Mon Sep 17 00:00:00 2001 From: sm2511 Date: Wed, 29 May 2024 19:49:40 +0100 Subject: [PATCH 008/100] updates to imputation --- ...ilability_scenarios_for_impact_analysis.py | 112 ++++++++++++------ 1 file changed, 76 insertions(+), 36 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index 9d37bd1eef..fab35617f6 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -69,6 +69,7 @@ programs = pd.read_csv(path_for_new_resourcefiles / "ResourceFile_Consumables_availability_and_usage.csv")[['category', 'item_code', 'module_name']] # TODO See if programs can be extracted from a different location as ResourceFile_Consumables_availability_and_usage.csv is now deprecated in master programs = programs.drop_duplicates('item_code') +# manually add category for the two consumables for which it is missing tlo_availability_df = tlo_availability_df.merge(programs, on = ['item_code'], how = 'left') # 1.2 Import scenario data @@ -77,10 +78,10 @@ scenario_availability_df = scenario_availability_df.drop(['Unnamed: 0'], axis=1) scenario_availability_df = scenario_availability_df.rename({'item': 'item_hhfa'}, axis=1) -# Get Scenario data ready to be merged based on TLO model features +# Prepare scenario data to be merged to TLO model availability based on TLO model features # 1.2.1 Level of care #------------------------------------------------------ -scenario_availability_df['fac_type'] = scenario_availability_df['fac_type'].str.replace("Facility_level_", "") +scenario_availability_df['fac_type'] = scenario_availability_df['fac_type_original'].str.replace("Facility_level_", "") # 1.2.2 District #------------------------------------------------------ @@ -89,7 +90,8 @@ 'Mzimba South': 'Mzimba', 'Mzimba North': 'Mzimba', } -scenario_availability_df['district_std'] = scenario_availability_df['district'].replace(rename_and_collapse_to_model_districts) +scenario_availability_df = scenario_availability_df.rename({'district': 'district_original'}, axis=1) 
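# A minimal sketch, with hypothetical values, of the replace() call below: the
# dictionary collapses sub-district spellings onto model districts, and any
# name not in the dictionary passes through unchanged.
import pandas as pd

districts = pd.Series(['Mzimba South', 'Mzimba North', 'Blantyre'])
collapsed = districts.replace({'Mzimba South': 'Mzimba', 'Mzimba North': 'Mzimba'})
print(collapsed.tolist())  # ['Mzimba', 'Mzimba', 'Blantyre']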
+scenario_availability_df['district'] = scenario_availability_df['district_original'].replace(rename_and_collapse_to_model_districts) # Cities to get same results as their respective regions copy_source_to_destination = { @@ -100,21 +102,21 @@ 'Nkhata Bay': 'Likoma' # based on anecdotal evidence, assume that they experience the same change in avaiability as a result of interventions based on regression results } for source, destination in copy_source_to_destination.items(): - new_rows = scenario_availability_df.loc[scenario_availability_df.district_std == source].copy() - new_rows.district_std = destination + new_rows = scenario_availability_df.loc[scenario_availability_df.district == source].copy() # standardised district names + new_rows.district = destination scenario_availability_df = pd.concat([scenario_availability_df, new_rows], ignore_index = True) # 1.2.3 Facility_ID # #------------------------------------------------------ # Merge-in facility_id scenario_availability_facid_merge = scenario_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], - left_on=['district_std', 'fac_type'], + left_on=['district', 'fac_type'], right_on=['District', 'Facility_Level'], how='left', indicator=True) scenario_availability_facid_merge = scenario_availability_facid_merge.rename({'_merge': 'merge_facid'}, axis=1) # Extract list of District X Facility Level combinations for which there is no HHFA data scenario_availability_df_test = scenario_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], - left_on=['district_std', 'fac_type'], + left_on=['district', 'fac_type'], right_on=['District', 'Facility_Level'], how='right', indicator=True) cond_no_1b = (scenario_availability_df_test['Facility_Level'].isin(['1b'])) & (scenario_availability_df_test['_merge'] == 'right_only') cond_no_1a = (scenario_availability_df_test['Facility_Level'].isin(['1a'])) & (scenario_availability_df_test['_merge'] == 'right_only') @@ -151,8 +153,11 @@ # 1.2.5 Consumable/Item code #------------------------------------------------------ # Load TLO - HHFA consumable name crosswalk -consumable_crosswalk_df = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_consumables_matched.csv', encoding='ISO-8859-1') +consumable_crosswalk_df = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_consumables_matched.csv', encoding='ISO-8859-1')[['module_name', 'item_code', 'consumable_name_tlo', +'item_code_hhfa', 'item_hhfa', 'regression_application', 'notes_on_regression_application']] # TODO Check that this crosswalk is complete +# TODO is module_name used? +# TODO add new consumables Rifapentine to this? 
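# A minimal sketch, with hypothetical frames, of the indicator-merge pattern
# used throughout this section: how='right' keeps every crosswalk row even
# without an HHFA match, and the _merge column flags the 'right_only' rows
# that will need imputation downstream.
import pandas as pd

hhfa = pd.DataFrame({'item_hhfa': ['ors sachet'], 'available': [0.6]})
crosswalk = pd.DataFrame({'item_hhfa': ['ors sachet', 'rifapentine'],
                          'item_code': [5, 9]})
merged = hhfa.merge(crosswalk, on='item_hhfa', how='right', indicator=True)
print(merged.loc[merged['_merge'] == 'right_only', 'item_code'].tolist())  # [9]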
# Now merge in TLO item codes scenario_availability_facid_merge = scenario_availability_facid_merge.reset_index().drop(['index'], axis=1) @@ -167,48 +172,60 @@ # to represent all facility IDs # Further a row needs to be added for 1b level under Balaka, Machinga, Mwanza, Ntchisi and Salima -print("Number of unique facility IDs: \n TLO consumable data = ", tlo_availability_df.Facility_ID.nunique(), - "\n Scenario based on regression = ", scenario_availability_facid_itemcode_merge.Facility_ID.nunique(), - "\n Number of unique item codes: \n TLO consumable data = ", tlo_availability_df.item_code.nunique(), - "\n Scenario based on regression = ", scenario_availability_facid_itemcode_merge.item_code.nunique()) +print("Number of unique facility IDs: \n - TLO consumable data = ", tlo_availability_df.Facility_ID.nunique(), + "\n - Scenario based on regression = ", scenario_availability_facid_itemcode_merge.Facility_ID.nunique(), + "\nNumber of unique item codes: \n - TLO consumable availability data = ", tlo_availability_df.item_code.nunique(), + "\n - TLO consumable availability repository = ", consumable_crosswalk_df.item_code.nunique(), + "\n - Scenario based on regression = ", scenario_availability_facid_itemcode_merge.item_code.nunique()) # 1.2.6 Interpolation/Imputation where data is missing #------------------------------------------------------ -# 1.2.6.1 Items not matched +# 1.2.6.1 Facility IDs not matched #------------------------------------------------------ # Before merging the scenario dataframe with tlo_availability_df, generate rows with all 57 relevant facility IDs for item_codes # which are not matched df = scenario_availability_facid_itemcode_merge -df_missing = df.loc[df['Facility_ID'].isna()].reset_index() -df_missing = df_missing.drop_duplicates('item_code') +df_missing_facids = df.loc[df['Facility_ID'].isna()].reset_index() +df_missing_facids = df_missing_facids.drop_duplicates('item_code') # These item_codes don't have separate rows by +# Facility_ID because they were not found in the HHFA regression analysis -df_existing = df.loc[~df['Facility_ID'].isna()] +df_with_facids = df.loc[~df['Facility_ID'].isna()] # these are rows with Facility_ID # Create a cartesian product of unique values in columns 'item_code' and 'Facility_ID' from both dataframes -df_new = pd.DataFrame({'Facility_ID': np.repeat(df_existing['Facility_ID'].unique(), df_missing['item_code'].nunique()), - 'item_code': np.tile(df_missing['item_code'].unique(), df_existing['Facility_ID'].nunique())}) +df_with_all_facids = pd.DataFrame({'Facility_ID': np.repeat(df_with_facids['Facility_ID'].unique(), df_missing_facids['item_code'].nunique()), + 'item_code': np.tile(df_missing_facids['item_code'].unique(), df_with_facids['Facility_ID'].nunique())}) # Merge df_new with df_missing on columns 'item_code' and 'Facility_ID' -df_new = df_new[['Facility_ID', 'item_code']].merge(df_missing.drop('Facility_ID', axis = 1), on=['item_code'], how='left', validate = "m:1") +df_with_all_facids = df_with_all_facids[['Facility_ID', 'item_code']].merge(df_missing_facids.drop('Facility_ID', axis = 1), on=['item_code'], how='left', validate = "m:1") # df_new = df_new.append(df_existing) -df_new = pd.concat([df_new, df_existing], ignore_index = True) +df_with_all_facids = pd.concat([df_with_all_facids, df_with_facids], ignore_index = True) +# Now all item_codes have all Facility_IDs included in the HHFA regression analysis but no values for scenarios -#df_new.to_csv(outputfilepath / 'temp_scenario_file_with_adequate_rows.csv') 
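# A minimal sketch, with hypothetical values, of the cartesian-product
# expansion above: np.repeat and np.tile pair every Facility_ID with every
# unmatched item_code, producing one row per combination for a later m:1 merge.
import numpy as np
import pandas as pd

fac_ids = np.array([101, 102, 103])
items = np.array([7, 8])
grid = pd.DataFrame({'Facility_ID': np.repeat(fac_ids, len(items)),
                     'item_code': np.tile(items, len(fac_ids))})
print(len(grid))  # 6 rows = 3 facilities x 2 items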
-scenario_final_df = df_new -len(scenario_final_df[~scenario_final_df['change_proportion_incharge_drug_orders'].isna()]) - -# Now provide availability_change_prop for rows where this data is not available +scenario_final_df = df_with_all_facids +#len(scenario_final_df[scenario_final_df['change_proportion_scenario1'].isna()]) +# Now provide scenario data for rows where this data is not available # 1.2.6.1 Extract list of TLO consumables which weren't matched with the availability prediction dataframe #scenario_availability_facid_itemcode_merge[items_not_matched][['item_code', 'regression_application']].to_csv(outputfilepath / 'temp_items_not_matched.csv') # Get average availability_change_prop value by facility_ID and category_tlo -scenario_final_df = scenario_final_df.drop('module_name', axis = 1).merge(programs, +scenario_final_df = scenario_final_df.merge(programs[['category', 'item_code']], on = ['item_code'], validate = "m:1", how = "left") list_of_scenario_variables = ['change_proportion_scenario1', 'change_proportion_scenario2', 'change_proportion_scenario3', 'change_proportion_scenario4', 'change_proportion_scenario5'] scenario_averages_by_program_and_facid = scenario_final_df.groupby(['Facility_ID','category'])[list_of_scenario_variables].mean().reset_index() +# check that all consumables have a category assigned to them +map_items_with_missing_category_to_category= {77:'reproductive_health', +301:'alri', +63: 'neonatal_health', +258: 'cancer', +1735: 'general'} +# Update the category column based on item_code +scenario_final_df['category'] = scenario_final_df.apply(lambda row: map_items_with_missing_category_to_category[row['item_code']] + if row['item_code'] in map_items_with_missing_category_to_category + else row['category'], axis=1) + # a. Some consumables which were matched with a corresponding HHFA item do not appear in the above dataset because # they were excluded from the regression analysis due to missing availability information on them # from most facilities. We will assume that their availability changes as per the average overall change @@ -237,13 +254,20 @@ how = "left") scenario_final_df.loc[(unmatched_items_category1|unmatched_items_category2), var] = scenario_final_df[var_imputed] -# c. 'not relevant to logistic regression analysis' +# c. if category is missing, take average across items +cond_missing_category = scenario_final_df.category.isna() +cond_missing_scenario_data = scenario_final_df.change_proportion_scenario1.isna() +print("The following items don't have an appropriate category assigned for imputation - ", scenario_final_df[cond_missing_category & cond_missing_scenario_data]['item_code'].unique()) +for var in list_of_scenario_variables: + scenario_final_df.loc[cond_missing_category & cond_missing_scenario_data, var] = scenario_final_df[var].mean() + +# d. 'not relevant to logistic regression analysis' unmatched_items_category3 = (items_not_matched) & (scenario_final_df['regression_application'] == 'not relevant to logistic regression analysis') # For category 3, replace availability_change_prop with 1, since we assume that the system-level intervention does not change availability for var in list_of_scenario_variables: scenario_final_df.loc[unmatched_items_category3,var] = 1 -# d. any other categories of unmatched consumables +# e. 
any other categories of unmatched consumables unmatched_items_category4 = ~unmatched_items_category1 & ~unmatched_items_category2 & ~unmatched_items_category3 & items_not_matched assert(sum(unmatched_items_category4) == 0) # check that we haven't missed any consumables @@ -253,8 +277,8 @@ for var in list_of_scenario_variables: print(f"Running scenario {var}") change_value_is_infinite = scenario_final_df[var].isin([np.inf]) - print(sum(change_value_is_infinite)) - ''' + print("Number of Inf values changed to average = ", sum(change_value_is_infinite)) +''' # None of the values is infinite so we don't have to run the code below df_inf = scenario_final_df[change_value_is_infinite].reset_index() @@ -264,11 +288,27 @@ on = ['Facility_ID','category'], how = 'left', validate = "m:1") scenario_final_df = pd.concat([scenario_final_df[~change_value_is_infinite], df_inf_replaced], ignore_index = True) - ''' +''' -#cond1 = scenario_final_df.District.isin(districts_with_no_scenario_data_for_1b_only) -#cond2 = scenario_final_df.Facility_Level == '1b' -#scenario_final_df[cond1 & cond2] +''' +# 1.2.6.3 Assume that the availability at level 1b in districts (Balaka, Salima, Ntchisi, Mwanza, Machinga) changes in the same proportion as the average across all +# districts at level 1b +cond1 = ~scenario_final_df.District.isin(districts_with_no_scenario_data_for_1b_only) +cond2 = scenario_final_df.Facility_Level == '1b' +scenario_averages_by_item_for_level1b = scenario_final_df[cond1 & cond2].groupby(['item_code'])[list_of_scenario_variables].mean().reset_index() +dfs = [] +for dist in districts_with_no_scenario_data_for_1b_only: + df_copy = scenario_averages_by_item_for_level1b.copy() + df_copy['district'] = dist + dfs.append(df_copy) + +result_df = pd.concat(dfs, ignore_index=True) +result_df['Facility_Level'] = '1b' +result_df = result_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], left_on = ['district', 'Facility_Level'], right_on = ['District', 'Facility_Level'], how = 'left') +scenario_final_df = pd.concat([scenario_final_df, result_df], ignore_index=True) + +''' +# TODO There are still some items missing for some facility IDs # 2. Merge TLO model availability data with scenario data using crosswalk #************************************************************************* @@ -351,8 +391,8 @@ index=['category'], columns=['Facility_Level'], aggfunc=lambda x: sum(pd.isna(x))/len(x)*100) - -print(pivot_table[('change_proportion_all_features', '1b')]) +pivot_table.to_csv(outputfilepath / "temp.csv") +print(pivot_table[('change_proportion_scenario5', '1b')]) ''' Cases which are still missing data: 1. 
For the 5 districts without 1b facilities in HHFA (Balaka, Machinga, Mwanza, Ntchisi, Salima), data on 54 items From 61754953bad2dd12aa14ce5c5655dbcf70fcb753 Mon Sep 17 00:00:00 2001 From: sm2511 Date: Thu, 30 May 2024 11:20:02 +0100 Subject: [PATCH 009/100] check that the dataframe contains all districts --- ...ate_consumable_availability_scenarios_for_impact_analysis.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index fab35617f6..f6a200e02a 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -106,6 +106,8 @@ new_rows.district = destination scenario_availability_df = pd.concat([scenario_availability_df, new_rows], ignore_index = True) +assert sorted(set(districts)) == sorted(set(pd.unique(scenario_availability_df.district))) + # 1.2.3 Facility_ID # #------------------------------------------------------ # Merge-in facility_id From e9a2f51027afed6c908cf22c554d4bacfecb9082 Mon Sep 17 00:00:00 2001 From: sm2511 Date: Thu, 30 May 2024 15:14:07 +0100 Subject: [PATCH 010/100] temporary changes to revert --- ...ilability_scenarios_for_impact_analysis.py | 154 +++++++++++++++++- 1 file changed, 145 insertions(+), 9 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index f6a200e02a..7c533fe444 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -148,7 +148,7 @@ 'cancer': 'ncds', } # TODO Check if the above mapping is correct -# TODO check how infection_prev should be mapped +# TODO collapse infection_prev and general in the HHFA-based predicted dataframe scenario_availability_facid_merge['category_tlo'] = scenario_availability_facid_merge['program_plot'].replace(map_model_programs_to_hhfa) @@ -162,7 +162,7 @@ # TODO add new consumables Rifapentine to this? 
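# A minimal sketch, with hypothetical values, of the de-duplication applied
# after the m:m merge below: drop_duplicates on the key pair keeps the first
# row per Facility_ID x item_code combination.
import pandas as pd

df = pd.DataFrame({'Facility_ID': [101, 101, 102],
                   'item_code': [5, 5, 5],
                   'available': [0.6, 0.7, 0.8]})
print(len(df.drop_duplicates(['Facility_ID', 'item_code'])))  # 2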
# Now merge in TLO item codes -scenario_availability_facid_merge = scenario_availability_facid_merge.reset_index().drop(['index'], axis=1) +scenario_availability_facid_merge = scenario_availability_facid_merge.reset_index(drop = True) scenario_availability_facid_itemcode_merge = scenario_availability_facid_merge.merge(consumable_crosswalk_df[['item_code', 'item_hhfa', 'regression_application', 'module_name']], on = ['item_hhfa'], how='right', indicator=True, validate = "m:m") scenario_availability_facid_itemcode_merge = scenario_availability_facid_itemcode_merge.drop_duplicates(['Facility_ID', 'item_code']) @@ -184,10 +184,15 @@ #------------------------------------------------------ # 1.2.6.1 Facility IDs not matched #------------------------------------------------------ -# Before merging the scenario dataframe with tlo_availability_df, generate rows with all 57 relevant facility IDs for item_codes +# Before merging the scenario dataframe with tlo_availability_df, generate rows with all 59 relevant facility IDs for item_codes # which are not matched -df = scenario_availability_facid_itemcode_merge -df_missing_facids = df.loc[df['Facility_ID'].isna()].reset_index() +df = scenario_availability_facid_itemcode_merge[['District', 'Facility_Level', 'Facility_ID', + 'category_tlo', 'item_code', + 'available', 'available_prob_predicted', 'change_proportion_scenario1', + 'change_proportion_scenario2', 'change_proportion_scenario3', + 'change_proportion_scenario4', 'change_proportion_scenario5', 'regression_application', + 'merge_itemcode']] +df_missing_facids = df.loc[df['Facility_ID'].isna()].reset_index(drop = True) df_missing_facids = df_missing_facids.drop_duplicates('item_code') # These item_codes don't have separate rows by # Facility_ID because they were not found in the HHFA regression analysis @@ -310,6 +315,39 @@ scenario_final_df = pd.concat([scenario_final_df, result_df], ignore_index=True) ''' +# 1.2.6.3 For level 1b for the districts where this level was not present in the regression analysis/HHFA dataset, assume +# that the change is equal to the product of the (ratio of average change across districts for level 1b to +# average change across districts for level 1a) and change for each item_code for level 1a for that district +#------------------------------------------------------------------------------------------------------------ +average_change_across_districts = scenario_final_df.groupby(['Facility_Level','item_code'])[list_of_scenario_variables].mean().reset_index() + +# Generate the ratio of the proportional changes to availability of level 1b to 1a in the districts for which level 1b data is available +new_colnames_1a = {col: col + '_1a' if col in list_of_scenario_variables else col for col in average_change_across_districts.columns} +new_colnames_1b = {col: col + '_1b' if col in list_of_scenario_variables else col for col in average_change_across_districts.columns} +average_change_across_districts_for_1a = average_change_across_districts[average_change_across_districts.Facility_Level == "1a"].rename(new_colnames_1a, axis = 1).drop('Facility_Level', axis = 1) +average_change_across_districts_for_1b = average_change_across_districts[average_change_across_districts.Facility_Level == "1b"].rename(new_colnames_1b, axis = 1).drop('Facility_Level', axis = 1) +ratio_of_change_across_districts_1b_to_1a = average_change_across_districts_for_1a.merge(average_change_across_districts_for_1b, + how = "left", on = ['item_code']) +for var in list_of_scenario_variables: + var_ratio = 
'ratio_' + var + var_1a = var + '_1a' + var_1b = var + '_1b' + ratio_of_change_across_districts_1b_to_1a[var_ratio] = (ratio_of_change_across_districts_1b_to_1a[var_1b])/(ratio_of_change_across_districts_1b_to_1a[var_1a]) +ratio_of_change_across_districts_1b_to_1a.reset_index(drop = True) +# TODO check if this ratio should be of the proportions minus 1 + +# Use the above for those districts with no level 1b facilities recorded in the HHFA data +cond_1b_missing_district = scenario_final_df.District.isin(districts_with_no_scenario_data_for_1b_only) +cond_1b = scenario_final_df.Facility_Level == '1b' +cond_1a = scenario_final_df.Facility_Level == '1a' +df_missing_1b = scenario_final_df[cond_1b_missing_district & cond_1b] +df_1a = scenario_final_df[cond_1b_missing_district & cond_1a] + + +scenario_final_df + + + # TODO There are still some items missing for some facility IDs # 2. Merge TLO model availability data with scenario data using crosswalk @@ -344,7 +382,7 @@ var_1a = var + '_1a' var_1b = var + '_1b' ratio_of_change_across_districts_1b_to_1a[var_ratio] = (ratio_of_change_across_districts_1b_to_1a[var_1b]-1)/(ratio_of_change_across_districts_1b_to_1a[var_1a] - 1) -ratio_of_change_across_districts_1b_to_1a.reset_index() +ratio_of_change_across_districts_1b_to_1a.reset_index(drop = True) # Use the above for those districts no level 1b facilities recorded in the HHFA data cond_1b_missing_district = new_availability_df.District.isin(districts_with_no_scenario_data_for_1b_only) @@ -373,19 +411,117 @@ new_availability_df_imputed = pd.concat([new_availability_df[~(cond_1b_missing_district & cond_1b)], df_missing_1b_imputed], ignore_index = True) # 2.2.2 For all levels other than 1a and 1b, there will be no change in consumable availability -#------------------------------------------------------ +#------------------------------------------------------------------------------------------------------------ fac_levels_not_relevant_to_regression = new_availability_df_imputed.Facility_Level.isin(['0', '2', '3', '4']) -new_availability_df_imputed.loc[fac_levels_not_relevant_to_regression, 'availability_change_prop'] = 1 + +for var in list_of_scenario_variables: + new_availability_df_imputed.loc[fac_levels_not_relevant_to_regression, var] = 1 # 2.3 Final checks #------------------------------------------------------ # 2.3.1 Check that the merged dataframe has the same number of unique items, facility IDs, and total # number of rows as the original small availability resource file -#------------------------------------------------------ +#--------------------------------------------------------------------------------------------------------- assert(new_availability_df_imputed.item_code.nunique() == tlo_availability_df.item_code.nunique()) assert(new_availability_df_imputed.Facility_ID.nunique() == tlo_availability_df.Facility_ID.nunique()) assert(len(new_availability_df_imputed) == len(tlo_availability_df)) +# 2.3.2 Construct dataset that conforms to the principles expected by the simulation: i.e. that there is an entry for every +# facility_id and for every month for every item_code. 
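# A minimal sketch, with hypothetical values, of the construction described
# above and implemented below: a complete Facility_ID x month x item_code
# index is built, and observed values are slotted in with combine_first,
# leaving NaN wherever data are missing.
import pandas as pd

index = pd.MultiIndex.from_product([[101, 102], range(1, 3), [7]],
                                   names=['Facility_ID', 'month', 'item_code'])
full = pd.DataFrame(index=index, columns=['available_prop']).astype(float)
observed = pd.DataFrame({'Facility_ID': [101], 'month': [1],
                         'item_code': [7], 'available_prop': [0.8]})
full = full.combine_first(observed.set_index(['Facility_ID', 'month', 'item_code']))
print(full['available_prop'].notna().sum())  # 1 of 4 cells observed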
+#-----------------------------------------------------------------------------------------------------------------------
+# Generate the dataframe that has the desired size and shape
+fac_ids = set(mfl.loc[mfl.Facility_Level != '5'].Facility_ID)
+item_codes = set(tlo_availability_df.item_code.unique())
+months = range(1, 13)
+all_availability_columns = ['available_prop', 'change_proportion_scenario1', 'change_proportion_scenario2',
+                            'change_proportion_scenario3', 'change_proportion_scenario4',
+                            'change_proportion_scenario5']
+
+# Create a MultiIndex from the product of fac_ids, months, and item_codes
+index = pd.MultiIndex.from_product([fac_ids, months, item_codes], names=['Facility_ID', 'month', 'item_code'])
+
+# Initialize a DataFrame with the MultiIndex and columns, filled with NaN
+full_set = pd.DataFrame(index=index, columns=all_availability_columns)
+full_set = full_set.astype(float)  # Ensure all columns are float type and filled with NaN
+
+# Insert the data, where it is available.
+full_set = full_set.combine_first(new_availability_df_imputed.set_index(['Facility_ID', 'month', 'item_code'])[all_availability_columns])
+
+# Fill in the blanks with rules for interpolation.
+
+facilities_by_level = defaultdict(set)
+for ix, row in mfl.iterrows():
+    facilities_by_level[row['Facility_Level']].add(row['Facility_ID'])
+
+
+def get_other_facilities_of_same_level(_fac_id):
+    """Return a set of facility_id for other facilities that are of the same level as that provided."""
+    for v in facilities_by_level.values():
+        if _fac_id in v:
+            return v - {_fac_id}
+
+
+def interpolate_missing_with_mean(_ser):
+    """Return a series in which any values that are null are replaced with the mean of the non-missing."""
+    if pd.isnull(_ser).all():
+        raise ValueError
+    return _ser.fillna(_ser.mean())
+
+
+# Create new dataset that includes the interpolations (The operation is not done "in place", because the logic is based
+# on what results are missing before the interpolations in other facilities).
+full_set_interpolated = full_set * np.nan
+
+for fac in fac_ids:
+    for item in item_codes:
+
+        print(f"Now doing: fac={fac}, item={item}")
+
+        # Get records of the availability of this item in this facility.
+        _monthly_records = full_set.loc[(fac, slice(None), item)].copy()
+
+        if pd.notnull(_monthly_records).any():
+            # If there is at least one record of this item at this facility, then interpolate the missing months from
+            # the months for which there are data on this item in this facility. (If none are missing, this has no effect).
+            _monthly_records = interpolate_missing_with_mean(_monthly_records)
+
+        else:
+            # If there is no record of this item at this facility, check to see if it's available at other facilities
+            # of the same level
+            facilities = list(get_other_facilities_of_same_level(fac))
+            recorded_at_other_facilities_of_same_level = pd.notnull(
+                full_set.loc[(facilities, slice(None), item)]
+            ).any()
+
+            if recorded_at_other_facilities_of_same_level:
+                # If it is recorded at other facilities of the same level, find the average availability of the item at other
+                # facilities of the same level.
+                facilities = list(get_other_facilities_of_same_level(fac))
+                _monthly_records = interpolate_missing_with_mean(
+                    full_set.loc[(facilities, slice(None), item)].groupby(level=1).mean()
+                )
+
+            else:
+                # If it is not recorded at other facilities of the same level, then assume it is never available at the
+                # facility.
+                _monthly_records = _monthly_records.fillna(0.0)
+
+        # Insert values (including corrections) into the resulting dataset.
+        full_set_interpolated.loc[(fac, slice(None), item)] = _monthly_records.values
+
+# Check that there are no missing values
+assert not pd.isnull(full_set_interpolated).any().any()
+
+# --- Check that the exported file has the properties required of it by the model code. --- #
+check_format_of_consumables_file(df=full_set_interpolated.reset_index(), fac_ids=fac_ids)
+
+# %%
+# Save
+full_set_interpolated.reset_index().to_csv(
+    path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv",
+    index=False
+)
+
 # 2.3.2. Browse missingness in the availability_change_prop variable
 #------------------------------------------------------
 pivot_table = pd.pivot_table(new_availability_df_imputed,

From 56dc3712a7b0db5c22308cd74d0d35c83f1d0bc8 Mon Sep 17 00:00:00 2001
From: sm2511
Date: Fri, 31 May 2024 11:43:18 +0100
Subject: [PATCH 011/100] update 'regression_application' column - some CMD and
 Reproductive health consumables were classified as "not relevant to
 regression analysis" and were therefore assumed to see no change in
 availability. This classification has been removed. We now assume that their
 availability changes in proportion to the average increase in the odds of
 availability across all consumables.

---
 .../consumables/ResourceFile_consumables_matched.csv         | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv b/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv
index d7798357fc..5142a8b737 100644
--- a/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv
+++ b/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:66b68e216701f841382e3add63305d55e1598862076789352ac7d799100e6d1a
-size 110047
+oid sha256:0ba68fa949f61962d0bc8459fada5dd8977d8c8822b18d73d57ce869f9f90d73
+size 109485

From e3fc74003dd34e72216dc736911eabc671a9e7c3 Mon Sep 17 00:00:00 2001
From: sm2511
Date: Mon, 3 Jun 2024 16:51:45 +0100
Subject: [PATCH 012/100] update interpolation methodology

---
 ...rceFile_Consumables_availability_small.csv |   4 +-
 ...ilability_scenarios_for_impact_analysis.py | 442 ++++++------
 2 files changed, 155 insertions(+), 291 deletions(-)

diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv
index 54453cbc2f..25fa724ffc 100644
--- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv
+++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:69a5143c0b7307c7bb48726aa73d6c2f61de2a69aeb445eec87494cf9d4a1041
-size 6087331
+oid sha256:394fa8aff768551c805a081d7f005124504b6270cd73fcbeef5fb094298da9a0
+size 21701688

diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py
index 7c533fe444..e93c11d324 100644
---
a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -91,7 +91,7 @@ 'Mzimba North': 'Mzimba', } scenario_availability_df = scenario_availability_df.rename({'district': 'district_original'}, axis=1) -scenario_availability_df['district'] = scenario_availability_df['district_original'].replace(rename_and_collapse_to_model_districts) +scenario_availability_df['District'] = scenario_availability_df['district_original'].replace(rename_and_collapse_to_model_districts) # Cities to get same results as their respective regions copy_source_to_destination = { @@ -102,23 +102,23 @@ 'Nkhata Bay': 'Likoma' # based on anecdotal evidence, assume that they experience the same change in avaiability as a result of interventions based on regression results } for source, destination in copy_source_to_destination.items(): - new_rows = scenario_availability_df.loc[scenario_availability_df.district == source].copy() # standardised district names - new_rows.district = destination + new_rows = scenario_availability_df.loc[scenario_availability_df.District == source].copy() # standardised district names + new_rows.District = destination scenario_availability_df = pd.concat([scenario_availability_df, new_rows], ignore_index = True) -assert sorted(set(districts)) == sorted(set(pd.unique(scenario_availability_df.district))) +assert sorted(set(districts)) == sorted(set(pd.unique(scenario_availability_df.District))) # 1.2.3 Facility_ID # #------------------------------------------------------ # Merge-in facility_id -scenario_availability_facid_merge = scenario_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], - left_on=['district', 'fac_type'], +scenario_availability_df = scenario_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], + left_on=['District', 'fac_type'], right_on=['District', 'Facility_Level'], how='left', indicator=True) -scenario_availability_facid_merge = scenario_availability_facid_merge.rename({'_merge': 'merge_facid'}, axis=1) +scenario_availability_df = scenario_availability_df.rename({'_merge': 'merge_facid'}, axis=1) # Extract list of District X Facility Level combinations for which there is no HHFA data scenario_availability_df_test = scenario_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], - left_on=['district', 'fac_type'], + left_on=['District', 'Facility_Level'], right_on=['District', 'Facility_Level'], how='right', indicator=True) cond_no_1b = (scenario_availability_df_test['Facility_Level'].isin(['1b'])) & (scenario_availability_df_test['_merge'] == 'right_only') cond_no_1a = (scenario_availability_df_test['Facility_Level'].isin(['1a'])) & (scenario_availability_df_test['_merge'] == 'right_only') @@ -150,9 +150,9 @@ # TODO Check if the above mapping is correct # TODO collapse infection_prev and general in the HHFA-based predicted dataframe -scenario_availability_facid_merge['category_tlo'] = scenario_availability_facid_merge['program_plot'].replace(map_model_programs_to_hhfa) +scenario_availability_df['category_tlo'] = scenario_availability_df['program_plot'].replace(map_model_programs_to_hhfa) -# 1.2.5 Consumable/Item code +# 1.2.5 Consumable/Item code and Category #------------------------------------------------------ # Load TLO - HHFA consumable name 
crosswalk consumable_crosswalk_df = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_consumables_matched.csv', encoding='ISO-8859-1')[['module_name', 'item_code', 'consumable_name_tlo', @@ -162,65 +162,31 @@ # TODO add new consumables Rifapentine to this? # Now merge in TLO item codes -scenario_availability_facid_merge = scenario_availability_facid_merge.reset_index(drop = True) -scenario_availability_facid_itemcode_merge = scenario_availability_facid_merge.merge(consumable_crosswalk_df[['item_code', 'item_hhfa', 'regression_application', 'module_name']], +scenario_availability_df = scenario_availability_df.reset_index(drop = True) +scenario_availability_df = scenario_availability_df.merge(consumable_crosswalk_df[['item_code', 'item_hhfa', 'regression_application', 'module_name']], on = ['item_hhfa'], how='right', indicator=True, validate = "m:m") -scenario_availability_facid_itemcode_merge = scenario_availability_facid_itemcode_merge.drop_duplicates(['Facility_ID', 'item_code']) -#scenario_availability_facid_itemcode_merge.to_csv(outputfilepath / "temp_merge_status.csv") -scenario_availability_facid_itemcode_merge = scenario_availability_facid_itemcode_merge.rename({'_merge': 'merge_itemcode'}, axis=1) -print("Number of item codes from the TLO model for which no match was found in the regression-based scenario data = ", scenario_availability_facid_itemcode_merge.merge_itemcode.value_counts()[1]) +scenario_availability_df = scenario_availability_df.drop_duplicates(['Facility_ID', 'item_code']) +scenario_availability_df = scenario_availability_df.rename({'_merge': 'merge_itemcode'}, axis=1) +print("Number of item codes from the TLO model for which no match was found in the regression-based scenario data = ", scenario_availability_df.merge_itemcode.value_counts()[1]) -# Before merging the above dataframe with tlo_availability_df, we need to duplicate rows for each of the unmatched items -# to represent all facility IDs +# Before merging the above dataframe with tlo_availability_df, and apply a general interpolation rule to fill any gaps, +# we need to make sure that any specific interpolation rules are applied to the scenario dataframe # Further a row needs to be added for 1b level under Balaka, Machinga, Mwanza, Ntchisi and Salima print("Number of unique facility IDs: \n - TLO consumable data = ", tlo_availability_df.Facility_ID.nunique(), - "\n - Scenario based on regression = ", scenario_availability_facid_itemcode_merge.Facility_ID.nunique(), + "\n - Scenario based on regression = ", scenario_availability_df.Facility_ID.nunique(), "\nNumber of unique item codes: \n - TLO consumable availability data = ", tlo_availability_df.item_code.nunique(), "\n - TLO consumable availability repository = ", consumable_crosswalk_df.item_code.nunique(), - "\n - Scenario based on regression = ", scenario_availability_facid_itemcode_merge.item_code.nunique()) + "\n - Scenario based on regression = ", scenario_availability_df.item_code.nunique()) -# 1.2.6 Interpolation/Imputation where data is missing -#------------------------------------------------------ -# 1.2.6.1 Facility IDs not matched -#------------------------------------------------------ -# Before merging the scenario dataframe with tlo_availability_df, generate rows with all 59 relevant facility IDs for item_codes -# which are not matched -df = scenario_availability_facid_itemcode_merge[['District', 'Facility_Level', 'Facility_ID', - 'category_tlo', 'item_code', - 'available', 'available_prob_predicted', 'change_proportion_scenario1', - 
'change_proportion_scenario2', 'change_proportion_scenario3', - 'change_proportion_scenario4', 'change_proportion_scenario5', 'regression_application', - 'merge_itemcode']] -df_missing_facids = df.loc[df['Facility_ID'].isna()].reset_index(drop = True) -df_missing_facids = df_missing_facids.drop_duplicates('item_code') # These item_codes don't have separate rows by -# Facility_ID because they were not found in the HHFA regression analysis - -df_with_facids = df.loc[~df['Facility_ID'].isna()] # these are rows with Facility_ID - -# Create a cartesian product of unique values in columns 'item_code' and 'Facility_ID' from both dataframes -df_with_all_facids = pd.DataFrame({'Facility_ID': np.repeat(df_with_facids['Facility_ID'].unique(), df_missing_facids['item_code'].nunique()), - 'item_code': np.tile(df_missing_facids['item_code'].unique(), df_with_facids['Facility_ID'].nunique())}) -# Merge df_new with df_missing on columns 'item_code' and 'Facility_ID' -df_with_all_facids = df_with_all_facids[['Facility_ID', 'item_code']].merge(df_missing_facids.drop('Facility_ID', axis = 1), on=['item_code'], how='left', validate = "m:1") -# df_new = df_new.append(df_existing) -df_with_all_facids = pd.concat([df_with_all_facids, df_with_facids], ignore_index = True) -# Now all item_codes have all Facility_IDs included in the HHFA regression analysis but no values for scenarios - -scenario_final_df = df_with_all_facids -#len(scenario_final_df[scenario_final_df['change_proportion_scenario1'].isna()]) - -# Now provide scenario data for rows where this data is not available -# 1.2.6.1 Extract list of TLO consumables which weren't matched with the availability prediction dataframe -#scenario_availability_facid_itemcode_merge[items_not_matched][['item_code', 'regression_application']].to_csv(outputfilepath / 'temp_items_not_matched.csv') +# Extract list of TLO consumables which weren't matched with the availability prediction dataframe +items_not_matched = scenario_availability_df['merge_itemcode'] == 'right_only' +scenario_availability_df[items_not_matched][['item_code', 'regression_application']].to_csv(outputfilepath / 'temp_items_not_matched.csv') # Get average availability_change_prop value by facility_ID and category_tlo -scenario_final_df = scenario_final_df.merge(programs[['category', 'item_code']], +scenario_availability_df = scenario_availability_df.merge(programs[['category', 'item_code']], on = ['item_code'], validate = "m:1", how = "left") -list_of_scenario_variables = ['change_proportion_scenario1', 'change_proportion_scenario2', - 'change_proportion_scenario3', 'change_proportion_scenario4', 'change_proportion_scenario5'] -scenario_averages_by_program_and_facid = scenario_final_df.groupby(['Facility_ID','category'])[list_of_scenario_variables].mean().reset_index() # check that all consumables have a category assigned to them map_items_with_missing_category_to_category= {77:'reproductive_health', @@ -228,98 +194,27 @@ 63: 'neonatal_health', 258: 'cancer', 1735: 'general'} + # Update the category column based on item_code -scenario_final_df['category'] = scenario_final_df.apply(lambda row: map_items_with_missing_category_to_category[row['item_code']] +scenario_availability_df['category'] = scenario_availability_df.apply(lambda row: map_items_with_missing_category_to_category[row['item_code']] if row['item_code'] in map_items_with_missing_category_to_category else row['category'], axis=1) -# a. 
Some consumables which were matched with a corresponding HHFA item do not appear in the above dataset because -# they were excluded from the regression analysis due to missing availability information on them -# from most facilities. We will assume that their availability changes as per the average overall change -# (eg. diazepam, morphine, atropine) -items_not_matched = scenario_final_df['merge_itemcode'] == 'right_only' -scenario_final_df[items_not_matched]['regression_application'].unique() - -unmatched_items_category1 = (items_not_matched) & ((scenario_final_df['regression_application'].isna()) | \ - (scenario_final_df['regression_application'] == 'proxy')) - -# b.'assume average' -unmatched_items_category2 = (items_not_matched) & (scenario_final_df['regression_application'] == 'assume average') - -# Replace missing instances with the above average values for categories 1 and 2 -scenario_cat1_and_cat2 = scenario_final_df[(unmatched_items_category1|unmatched_items_category2)] -scenario_cat1_and_cat2 = scenario_cat1_and_cat2.drop(list_of_scenario_variables, axis = 1) - -scenario_cat1_and_cat2 = scenario_cat1_and_cat2.merge(scenario_averages_by_program_and_facid, - on = ['Facility_ID','category'], validate = "m:1", - how = "left") -for var in list_of_scenario_variables: - var_imputed = var + '_imputed' - scenario_cat1_and_cat2[var_imputed] = scenario_cat1_and_cat2[var] - scenario_final_df = scenario_final_df.merge(scenario_cat1_and_cat2[['Facility_ID','item_code', var_imputed]], - on = ['Facility_ID','item_code'], validate = "1:1", - how = "left") - scenario_final_df.loc[(unmatched_items_category1|unmatched_items_category2), var] = scenario_final_df[var_imputed] - -# c. if category is missing, take average across items -cond_missing_category = scenario_final_df.category.isna() -cond_missing_scenario_data = scenario_final_df.change_proportion_scenario1.isna() -print("The following items don't have an appropriate category assigned for imputation - ", scenario_final_df[cond_missing_category & cond_missing_scenario_data]['item_code'].unique()) -for var in list_of_scenario_variables: - scenario_final_df.loc[cond_missing_category & cond_missing_scenario_data, var] = scenario_final_df[var].mean() - -# d. 'not relevant to logistic regression analysis' -unmatched_items_category3 = (items_not_matched) & (scenario_final_df['regression_application'] == 'not relevant to logistic regression analysis') -# For category 3, replace availability_change_prop with 1, since we assume that the system-level intervention does not change availability -for var in list_of_scenario_variables: - scenario_final_df.loc[unmatched_items_category3,var] = 1 - -# e. 
any other categories of unmatched consumables -unmatched_items_category4 = ~unmatched_items_category1 & ~unmatched_items_category2 & ~unmatched_items_category3 & items_not_matched -assert(sum(unmatched_items_category4) == 0) # check that we haven't missed any consumables - -# 1.2.6.2 Inf values +# 1.3 Initial interpolation #------------------------------------------------------ -# Where the values are Inf because the availability changed from 0 to X, replace with the average for the category and Facility_ID +# 1.3.1 Items not relevant to the regression analysis +items_not_relevant_to_regression = (items_not_matched) & (scenario_availability_df['regression_application'] == 'not relevant to logistic regression analysis') +# For category 3, replace availability_change_prop with 1, since we assume that the system-level intervention does not change availability +list_of_scenario_variables = ['change_proportion_scenario1', 'change_proportion_scenario2', + 'change_proportion_scenario3', 'change_proportion_scenario4', 'change_proportion_scenario5'] for var in list_of_scenario_variables: - print(f"Running scenario {var}") - change_value_is_infinite = scenario_final_df[var].isin([np.inf]) - print("Number of Inf values changed to average = ", sum(change_value_is_infinite)) -''' - # None of the values is infinite so we don't have to run the code below - df_inf = scenario_final_df[change_value_is_infinite].reset_index() - - average_change_across_category_and_facid = scenario_final_df[~change_value_is_infinite].groupby(['Facility_ID','category'])[var].mean().reset_index() - - df_inf_replaced = df_inf.drop(var, axis = 1).merge(average_change_across_category_and_facid, - on = ['Facility_ID','category'], - how = 'left', validate = "m:1") - scenario_final_df = pd.concat([scenario_final_df[~change_value_is_infinite], df_inf_replaced], ignore_index = True) -''' - -''' -# 1.2.6.3 Assume that the availability at level 1b in districts (Balaka, Salima, Ntchisi, Mwanza, Machinga) changes in the same proportion as the average across all -# districts at level 1b -cond1 = ~scenario_final_df.District.isin(districts_with_no_scenario_data_for_1b_only) -cond2 = scenario_final_df.Facility_Level == '1b' -scenario_averages_by_item_for_level1b = scenario_final_df[cond1 & cond2].groupby(['item_code'])[list_of_scenario_variables].mean().reset_index() -dfs = [] -for dist in districts_with_no_scenario_data_for_1b_only: - df_copy = scenario_averages_by_item_for_level1b.copy() - df_copy['district'] = dist - dfs.append(df_copy) - -result_df = pd.concat(dfs, ignore_index=True) -result_df['Facility_Level'] = '1b' -result_df = result_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], left_on = ['district', 'Facility_Level'], right_on = ['District', 'Facility_Level'], how = 'left') -scenario_final_df = pd.concat([scenario_final_df, result_df], ignore_index=True) + scenario_availability_df.loc[items_not_relevant_to_regression,var] = 1 -''' -# 1.2.6.3 For level 1b for the districts where this level was not present in the regression analysis/HHFA dataset, assume +# 1.3.2 For level 1b for the districts where this level was not present in the regression analysis/HHFA dataset, assume # that the change is equal to the product of the (ratio of average change across districts for level 1b to # average change across districts for level 1a) and change for each item_code for level 1a for that district #------------------------------------------------------------------------------------------------------------ 
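# A minimal numeric sketch, with hypothetical values, of the imputation below:
# the level-1b change for a district with no 1b data is taken to be that
# district's level-1a change scaled by the cross-district ratio of mean
# changes at 1b versus 1a (see the TODO below on whether the ratio should
# instead be taken on the change minus 1).
avg_change_1a, avg_change_1b = 1.20, 1.50  # means across districts with 1b data
ratio_1b_to_1a = avg_change_1b / avg_change_1a  # = 1.25
change_1a_in_district = 1.10  # observed change at level 1a in this district
imputed_1b = change_1a_in_district * ratio_1b_to_1a
print(imputed_1b)  # ~1.375 (up to float rounding)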
-average_change_across_districts = scenario_final_df.groupby(['Facility_Level','item_code'])[list_of_scenario_variables].mean().reset_index() +average_change_across_districts = scenario_availability_df.groupby(['Facility_Level','item_code'])[list_of_scenario_variables].mean().reset_index() # Generate the ratio of the proportional changes to availability of level 1b to 1a in the districts for which level 1b data is available new_colnames_1a = {col: col + '_1a' if col in list_of_scenario_variables else col for col in average_change_across_districts.columns} @@ -337,94 +232,62 @@ # TODO check if this ratio should be of the proportions minus 1 # Use the above for those districts with no level 1b facilities recorded in the HHFA data -cond_1b_missing_district = scenario_final_df.District.isin(districts_with_no_scenario_data_for_1b_only) -cond_1b = scenario_final_df.Facility_Level == '1b' -cond_1a = scenario_final_df.Facility_Level == '1a' -df_missing_1b = scenario_final_df[cond_1b_missing_district & cond_1b] -df_1a = scenario_final_df[cond_1b_missing_district & cond_1a] +scenario_availability_df = scenario_availability_df.reset_index(drop = True) +cond_1b_missing_districts = scenario_availability_df.District.isin(districts_with_no_scenario_data_for_1b_only) +cond_1a = scenario_availability_df.Facility_Level == '1a' +cond_1b = scenario_availability_df.Facility_Level == '1b' +df_1a = scenario_availability_df[cond_1b_missing_districts & cond_1a] +ratio_vars = ['ratio_' + item for item in list_of_scenario_variables] -scenario_final_df +item_var = ['item_code'] +df_missing_1b_imputed = df_1a.merge(ratio_of_change_across_districts_1b_to_1a[item_var + ratio_vars], + on = ['item_code'], + how = 'left', validate = "m:1") +for var in list_of_scenario_variables: + df_missing_1b_imputed[var] = df_missing_1b_imputed[var] * df_missing_1b_imputed['ratio_' + var] +df_missing_1b_imputed.Facility_Level = '1b' # Update facility level to 1 +# Replace Facility_IDs +df_missing_1b_imputed = df_missing_1b_imputed.drop('Facility_ID', axis = 1).merge(mfl[['District', 'Facility_Level', 'Facility_ID']], + on =['District', 'Facility_Level'], + how = 'left') -# TODO There are still some items missing for some facility IDs +df_without_districts_with_no_1b_facilities = scenario_availability_df[~(cond_1b_missing_districts & cond_1b)] +scenario_availability_df = pd.concat([df_without_districts_with_no_1b_facilities, df_missing_1b_imputed], ignore_index = True) # 2. 
Merge TLO model availability data with scenario data using crosswalk #************************************************************************* # 2.1 Merge the two datasets #------------------------------------------------------ -id_variables = ['item_code','Facility_ID','regression_application'] +id_variables = ['item_code','Facility_ID'] -new_availability_df = tlo_availability_df.merge(scenario_final_df[id_variables + list_of_scenario_variables], +full_scenario_df = tlo_availability_df.merge(scenario_availability_df[id_variables + list_of_scenario_variables], how='left', on=['Facility_ID', 'item_code'], indicator = True) -new_availability_df = new_availability_df.rename({'_merge': 'merge_scenario'}, axis=1) -new_availability_df = new_availability_df.drop_duplicates(['Facility_ID', 'item_code', 'month']) +full_scenario_df = full_scenario_df.rename({'_merge': 'merge_scenario'}, axis=1) +full_scenario_df = full_scenario_df.drop_duplicates(['Facility_ID', 'item_code', 'month']) + +# Check that level 1b values are currently imputed +# full_scenario_df[full_scenario_df.District == 'Balaka'].groupby(['District', 'Facility_Level'])['change_proportion_scenario1'].mean() # 2.2 Further imputation #------------------------------------------------------ -# 2.2.1 For level 1b for the districts where this level was not present in the regression analysis/HHFA dataset, assume -# that the change is equal to the product of the (ratio of average change across districts for level 1b to -# average change across districts for level 1a) and change for each item_code for level 1a for that district -#------------------------------------------------------ -average_change_across_districts = scenario_final_df.groupby(['Facility_Level','item_code'])[list_of_scenario_variables].mean().reset_index() - -#average_change_across_districts = scenario_final_df.groupby(['Facility_Level','item_code']).agg({'availability_change_prop': lambda x: x.mean(skipna = False)}).reset_index() -new_colnames_1a = {col: col + '_1a' if col in list_of_scenario_variables else col for col in average_change_across_districts.columns} -new_colnames_1b = {col: col + '_1b' if col in list_of_scenario_variables else col for col in average_change_across_districts.columns} -average_change_across_districts_for_1a = average_change_across_districts[average_change_across_districts.Facility_Level == "1a"].rename(new_colnames_1a, axis = 1).drop('Facility_Level', axis = 1) -average_change_across_districts_for_1b = average_change_across_districts[average_change_across_districts.Facility_Level == "1b"].rename(new_colnames_1b, axis = 1).drop('Facility_Level', axis = 1) -ratio_of_change_across_districts_1b_to_1a = average_change_across_districts_for_1a.merge(average_change_across_districts_for_1b, - how = "left", on = ['item_code']) -# START HERE -for var in list_of_scenario_variables: - var_ratio = 'ratio_' + var - var_1a = var + '_1a' - var_1b = var + '_1b' - ratio_of_change_across_districts_1b_to_1a[var_ratio] = (ratio_of_change_across_districts_1b_to_1a[var_1b]-1)/(ratio_of_change_across_districts_1b_to_1a[var_1a] - 1) -ratio_of_change_across_districts_1b_to_1a.reset_index(drop = True) - -# Use the above for those districts no level 1b facilities recorded in the HHFA data -cond_1b_missing_district = new_availability_df.District.isin(districts_with_no_scenario_data_for_1b_only) -cond_1b = new_availability_df.Facility_Level == '1b' -cond_1a = new_availability_df.Facility_Level == '1a' -df_missing_1b = new_availability_df[cond_1b_missing_district & cond_1b] 
-df_1a = new_availability_df[cond_1b_missing_district & cond_1a] - -ratio_vars = ['ratio_' + item for item in list_of_scenario_variables] -item_var = ['item_code'] -df_missing_1b_imputed = df_missing_1b.merge(ratio_of_change_across_districts_1b_to_1a[item_var + ratio_vars], - on = ['item_code'], - how = 'left', validate = "m:1") -for var in list_of_scenario_variables: - # check that the values we are replacing are in fact missing - assert np.isnan(df_missing_1b_imputed[var].unique()).all() - -id_vars_level1a = ['District', 'item_code', 'month'] -df_missing_1b_imputed = df_missing_1b_imputed.drop(list_of_scenario_variables, axis = 1).merge(df_1a[id_vars_level1a + list_of_scenario_variables], - on = id_vars_level1a, - how = 'left', validate = "1:1", indicator = True) - -for var in list_of_scenario_variables: - df_missing_1b_imputed[var] = ((df_missing_1b_imputed[var]-1) * df_missing_1b_imputed['ratio_' + var]) + 1 - -new_availability_df_imputed = pd.concat([new_availability_df[~(cond_1b_missing_district & cond_1b)], df_missing_1b_imputed], ignore_index = True) - -# 2.2.2 For all levels other than 1a and 1b, there will be no change in consumable availability +# 2.2.1 For all levels other than 1a and 1b, there will be no change in consumable availability #------------------------------------------------------------------------------------------------------------ -fac_levels_not_relevant_to_regression = new_availability_df_imputed.Facility_Level.isin(['0', '2', '3', '4']) +fac_levels_not_relevant_to_regression = full_scenario_df.Facility_Level.isin(['0', '2', '3', '4']) for var in list_of_scenario_variables: - new_availability_df_imputed.loc[fac_levels_not_relevant_to_regression, var] = 1 + full_scenario_df.loc[fac_levels_not_relevant_to_regression, var] = 1 # 2.3 Final checks #------------------------------------------------------ # 2.3.1 Check that the merged dataframe has the same number of unique items, facility IDs, and total # number of rows as the original small availability resource file #--------------------------------------------------------------------------------------------------------- -assert(new_availability_df_imputed.item_code.nunique() == tlo_availability_df.item_code.nunique()) -assert(new_availability_df_imputed.Facility_ID.nunique() == tlo_availability_df.Facility_ID.nunique()) -assert(len(new_availability_df_imputed) == len(tlo_availability_df)) +assert(full_scenario_df.item_code.nunique() == tlo_availability_df.item_code.nunique()) +assert(full_scenario_df.Facility_ID.nunique() == tlo_availability_df.Facility_ID.nunique()) +assert(len(full_scenario_df) == len(tlo_availability_df)) # 2.3.2 Construct dataset that conforms to the principles expected by the simulation: i.e. that there is an entry for every # facility_id and for every month for every item_code. 
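The level-1b imputation applied above reduces to a small, reusable pattern. A minimal sketch on toy numbers (the column name `change_proportion_scenario1` matches the script, but the values and the single-item frame are illustrative only; the minus-1 variant shown is the one the script's own TODO asks about):

import pandas as pd

# Mean proportional change by facility level for one item, averaged across
# districts where both levels are observed (illustrative values).
mean_change = pd.DataFrame({
    'Facility_Level': ['1a', '1b'],
    'item_code': [1, 1],
    'change_proportion_scenario1': [1.10, 1.25],
})
wide = mean_change.pivot(index='item_code', columns='Facility_Level',
                         values='change_proportion_scenario1')

# Ratio of the excess change (proportion minus 1) at level 1b relative to 1a.
ratio_1b_to_1a = (wide['1b'] - 1) / (wide['1a'] - 1)  # 0.25 / 0.10 = 2.5

# For a district with no level 1b facilities, scale the district's observed
# level 1a change by this ratio to impute its level 1b change.
change_1a = 1.08
imputed_1b = (change_1a - 1) * ratio_1b_to_1a.loc[1] + 1
print(round(imputed_1b, 2))  # 1.2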
@@ -433,9 +296,7 @@ fac_ids = set(mfl.loc[mfl.Facility_Level != '5'].Facility_ID) item_codes = set(tlo_availability_df.item_code.unique()) months = range(1, 13) -all_availability_columns = ['available_prop', 'change_proportion_scenario1', 'change_proportion_scenario2', - 'change_proportion_scenario3', 'change_proportion_scenario4', - 'change_proportion_scenario5'] +all_availability_columns = ['available_prop'] + list_of_scenario_variables # Create a MultiIndex from the product of fac_ids, months, and item_codes index = pd.MultiIndex.from_product([fac_ids, months, item_codes], names=['Facility_ID', 'month', 'item_code']) @@ -445,14 +306,16 @@ full_set = full_set.astype(float) # Ensure all columns are float type and filled with NaN # Insert the data, where it is available. -full_set = full_set.combine_first(new_availability_df_imputed.set_index(['Facility_ID', 'month', 'item_code'])[all_availability_columns]) +full_set = full_set.combine_first(full_scenario_df.set_index(['Facility_ID', 'month', 'item_code'])[all_availability_columns]) # Fill in the blanks with rules for interpolation. - facilities_by_level = defaultdict(set) for ix, row in mfl.iterrows(): facilities_by_level[row['Facility_Level']].add(row['Facility_ID']) +items_by_category = defaultdict(set) +for ix, row in programs.iterrows(): + items_by_category[row['category']].add(row['item_code']) def get_other_facilities_of_same_level(_fac_id): """Return a set of facility_id for other facilities that are of the same level as that provided.""" @@ -460,98 +323,90 @@ def get_other_facilities_of_same_level(_fac_id): if _fac_id in v: return v - {_fac_id} - +def get_other_items_of_same_category(_item_code): + """Return a set of item_codes for other items that are in the same category/program as that provided.""" + for v in items_by_category.values(): + if _item_code in v: + return v - {_item_code} def interpolate_missing_with_mean(_ser): """Return a series in which any values that are null are replaced with the mean of the non-missing.""" if pd.isnull(_ser).all(): raise ValueError return _ser.fillna(_ser.mean()) - # Create new dataset that include the interpolations (The operation is not done "in place", because the logic is based # on what results are missing before the interpolations in other facilities). full_set_interpolated = full_set * np.nan +full_set_interpolated.available_prop = full_set.available_prop for fac in fac_ids: for item in item_codes: + for col in list_of_scenario_variables: + print(f"Now doing: fac={fac}, item={item}, column={col}") - print(f"Now doing: fac={fac}, item={item}") - - # Get records of the availability of this item in this facility. - _monthly_records = full_set.loc[(fac, slice(None), item)].copy() + # Get records of the availability of this item in this facility. + _monthly_records = full_set.loc[(fac, slice(None), item), col].copy() - if pd.notnull(_monthly_records).any(): - # If there is at least one record of this item at this facility, then interpolate the missing months from - # the months for there are data on this item in this facility. (If none are missing, this has no effect). 
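Before the branch-by-branch changes continue below, the scaffold-and-fill pattern driving this loop can be shown in isolation. A minimal sketch with toy IDs (two facilities, three months, one item; values illustrative): build the complete MultiIndex scaffold, lay the observed records onto it with `combine_first`, and mean-fill the remaining months for a facility that has at least one record.

import pandas as pd

fac_ids, months, item_codes = [1, 2], range(1, 4), [10]
index = pd.MultiIndex.from_product([fac_ids, months, item_codes],
                                   names=['Facility_ID', 'month', 'item_code'])
full_set = pd.DataFrame(index=index, columns=['available_prop'], dtype=float)

# Observed data: facility 1 has records for months 1 and 3 only.
observed = pd.DataFrame({'Facility_ID': [1, 1], 'month': [1, 3],
                         'item_code': [10, 10], 'available_prop': [0.6, 0.8]})
full_set = full_set.combine_first(
    observed.set_index(['Facility_ID', 'month', 'item_code'])
)

# Facility 1 has at least one record, so its missing month 2 is filled with
# the mean of the observed months.
ser = full_set.loc[(1, slice(None), 10), 'available_prop']
full_set.loc[(1, slice(None), 10), 'available_prop'] = ser.fillna(ser.mean()).values
print(full_set.loc[(1, 2, 10), 'available_prop'])  # 0.7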
-            _monthly_records = interpolate_missing_with_mean(_monthly_records)
-
-        else:
-            # If there is no record of this item at this facility, check to see if it's available at other facilities
-            # of the same level
-            facilities = list(get_other_facilities_of_same_level(fac))
-            recorded_at_other_facilities_of_same_level = pd.notnull(
-                full_set.loc[(facilities, slice(None), item)]
-            ).any()
-
-            if recorded_at_other_facilities_of_same_level:
-                # If it recorded at other facilities of same level, find the average availability of the item at other
-                # facilities of the same level.
-                facilities = list(get_other_facilities_of_same_level(fac))
-                _monthly_records = interpolate_missing_with_mean(
-                    full_set.loc[(facilities, slice(None), item)].groupby(level=1).mean()
-                )
+            if pd.notnull(_monthly_records).any():
+                # If there is at least one record of this item at this facility, then interpolate the missing months
+                # from the months for which there are data on this item in this facility. (If none are missing, this
+                # has no effect).
+                _monthly_records = interpolate_missing_with_mean(_monthly_records)

             else:
-                # If it is not recorded at other facilities of same level, then assume it is never available at the
-                # facility.
-                _monthly_records = _monthly_records.fillna(0.0)
+                # If there is no record of this item at this facility, check to see if it's available at other
+                # facilities of the same level; failing that, check whether other items of the same category
+                # are available at this facility.
+                facilities = list(get_other_facilities_of_same_level(fac))

-            # Insert values (including corrections) into the resulting dataset.
-            full_set_interpolated.loc[(fac, slice(None), item)] = _monthly_records.values
+                other_items = get_other_items_of_same_category(item)
+                items = list(other_items) if other_items else other_items
+
+                recorded_at_other_facilities_of_same_level = pd.notnull(
+                    full_set.loc[(facilities, slice(None), item), col]
+                ).any()
+
+                if not items:
+                    category_recorded_at_other_facilities_of_same_level = False
+                else:
+                    category_recorded_at_other_facilities_of_same_level = pd.notnull(
+                        full_set.loc[(fac, slice(None), items), col]
+                    ).any()
+
+                if recorded_at_other_facilities_of_same_level:
+                    # If the item is recorded at other facilities of the same level, use its average
+                    # availability at those facilities.
+                    print("Data for facility ", fac, " extrapolated from other facilities within level - ", facilities)
+                    facilities = list(get_other_facilities_of_same_level(fac))
+                    _monthly_records = interpolate_missing_with_mean(
+                        full_set.loc[(facilities, slice(None), item), col].groupby(level=1).mean()
+                    )
+
+                elif category_recorded_at_other_facilities_of_same_level:
+                    # Otherwise, if other items of the same category are recorded at this facility, use their
+                    # average availability at this facility.
+                    print("Data for item ", item, " extrapolated from other items within category - ", items)
+                    _monthly_records = interpolate_missing_with_mean(
+                        full_set.loc[(fac, slice(None), items), col].groupby(level=1).mean()
+                    )
+
+                else:
+                    # If there is no usable record at other facilities of this level, nor for other items of
+                    # this category, assume no change for this scenario column (a change proportion of 1).
+                    print("No interpolation worked")
+                    _monthly_records = _monthly_records.fillna(1.0)
+# TODO this should be available_prop
+
+            # Insert values (including corrections) into the resulting dataset.
+ full_set_interpolated.loc[(fac, slice(None), item), col] = _monthly_records.values + # temporary code + assert full_set_interpolated.loc[(fac, slice(None), item), col].mean() >= 0 # Check that there are not missing values assert not pd.isnull(full_set_interpolated).any().any() # --- Check that the exported file has the properties required of it by the model code. --- # -check_format_of_consumables_file(df=full_set_interpolated.reset_index(), fac_ids=fac_ids) - -# %% -# Save -full_set_interpolated.reset_index().to_csv( - path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv", - index=False -) - -# 2.3.2. Browse missingness in the availability_change_prop variable -#------------------------------------------------------ -pivot_table = pd.pivot_table(new_availability_df_imputed, - values=list_of_scenario_variables, - index=['category'], - columns=['Facility_Level'], - aggfunc=lambda x: sum(pd.isna(x))/len(x)*100) -pivot_table.to_csv(outputfilepath / "temp.csv") -print(pivot_table[('change_proportion_scenario5', '1b')]) -''' -Cases which are still missing data: -1. For the 5 districts without 1b facilities in HHFA (Balaka, Machinga, Mwanza, Ntchisi, Salima), data on 54 items -for 1b is missing. -2. Chitipa 1b is missing data on 5 consumables - 176, 177,178,179,181,192, 2675 -3. 184 187 are missing for nearly all districts - -Previously, Likoma did not have data from the regression analysis - I have now used values from Nkhata bay as proxy -''' - -# 2.2.4 PLaceholder code to replace all other missing values -# TODO Check why there are still instances of missing data when regression_application is assume average or proxy -#------------------------------------------------------ -# For all other cases, assume no change -for var in list_of_scenario_variables: - missing_change_data = new_availability_df_imputed[var].isna() - new_availability_df_imputed.loc[missing_change_data, var] = new_availability_df_imputed[var].mean() - -# Where the merge_scenario == "left_only", we need to provide data on "availability_change_prop" -# new_availability_df_imputed.to_csv(outputfilepath / 'current_status_of_scenario_merge.csv') -# TODO Check why the city and district both have an instance of facility level 2 +#check_format_of_consumables_file(df=full_set_interpolated.reset_index(), fac_ids=fac_ids) # 3. 
Generate scenario data on consumable availablity #------------------------------------------------------ @@ -562,25 +417,34 @@ def interpolate_missing_with_mean(_ser): list_of_scenario_suffixes = [s.replace(prefix, '') for s in list_of_scenario_variables] for scenario in list_of_scenario_suffixes: - new_availability_df_imputed['available_prop_scenario_' + scenario] = new_availability_df_imputed['available_prop'] * new_availability_df_imputed['change_proportion_' + scenario] - availability_greater_than_1 = new_availability_df_imputed['available_prop_scenario_' + scenario] > 1 - new_availability_df_imputed.loc[availability_greater_than_1, 'available_prop_scenario_' + scenario] = 1 + full_set_interpolated['available_prop_' + scenario] = full_set_interpolated['available_prop'] * full_set_interpolated['change_proportion_' + scenario] + availability_greater_than_1 = full_set_interpolated['available_prop_' + scenario] > 1 + full_set_interpolated.loc[availability_greater_than_1, 'available_prop_' + scenario] = 1 - assert(sum(new_availability_df_imputed['available_prop_scenario_' + scenario].isna()) == - sum(new_availability_df_imputed['change_proportion_' + scenario].isna())) + assert(sum(full_set_interpolated['available_prop_' + scenario].isna()) == + sum(full_set_interpolated['change_proportion_' + scenario].isna())) -# Save +# Save dataframe #------------------------------------------------------ -final_list_of_scenario_vars = ['available_prop_scenario_' + item for item in list_of_scenario_suffixes] +final_list_of_scenario_vars = ['available_prop_' + item for item in list_of_scenario_suffixes] old_vars = ['Facility_ID', 'month', 'item_code', 'available_prop'] -full_df_with_scenario = new_availability_df_imputed[old_vars + final_list_of_scenario_vars].reset_index().drop('index', axis = 1) -# Remove suffix for column names ending with '_cat' to mean 'categorised' -full_df_with_scenario.columns = [col.replace('_cat', '') if col.endswith('_cat') else col for col in full_df_with_scenario.columns] +full_df_with_scenario = full_set_interpolated[old_vars + final_list_of_scenario_vars].reset_index().drop('index', axis = 1) # Save updated consumable availability resource file with scenario data full_df_with_scenario.to_csv( path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv", index=False ) -# TODO: What about cases which do not exist in HHFA data? # TODO: Create a column providing the source of scenario data + +''' +# 2.3.2. 
Browse missingness in the availability_change_prop variable
+#------------------------------------------------------
+pivot_table = pd.pivot_table(scenario_availability_df,
+                   values=list_of_scenario_variables,
+                   index=['category'],
+                   columns=['Facility_Level'],
+                   aggfunc=lambda x: sum(pd.isna(x))/len(x)*100)
+pivot_table.to_csv(outputfilepath / "temp.csv")
+print(pivot_table[('change_proportion_scenario5', '0')])
+'''

From 3e287a89b6186b6e49161efdb1218aab992bf4dd Mon Sep 17 00:00:00 2001
From: sm2511
Date: Tue, 4 Jun 2024 18:30:46 +0100
Subject: [PATCH 013/100] add scenarios 6-8 - this equates the probability of
 availability to that of the 75th, 90th and 99th percentile facility (in
 terms of the mean value of `available_prop` within the corresponding
 level); only the availability of facilities at levels 1a and 1b is updated

---
 ...rceFile_Consumables_availability_small.csv |  4 +-
 ...ilability_scenarios_for_impact_analysis.py | 85 ++++++++++++++++++-
 2 files changed, 84 insertions(+), 5 deletions(-)

diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv
index 25fa724ffc..acc6db2e7b 100644
--- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv
+++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:394fa8aff768551c805a081d7f005124504b6270cd73fcbeef5fb094298da9a0
-size 21701688
+oid sha256:a64491e6e2eb2884628e52c796391725b4738a74ad3dbf2e12eb20057dd52322
+size 30981143
diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py
index e93c11d324..28d163cf73 100644
--- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py
+++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py
@@ -408,8 +408,8 @@ def interpolate_missing_with_mean(_ser):
 # --- Check that the exported file has the properties required of it by the model code. --- #
 #check_format_of_consumables_file(df=full_set_interpolated.reset_index(), fac_ids=fac_ids)

-# 3. Generate scenario data on consumable availablity
-#------------------------------------------------------
+# 3. Generate regression-based scenario data on consumable availablity
+#*************************************************************************
 # Create new consumable availability estimates for TLO model consumables using
 # estimates of proportional change from the regression analysis based on HHFA data
 #------------------------------------------------------
@@ -424,11 +424,89 @@ def interpolate_missing_with_mean(_ser):
 assert(sum(full_set_interpolated['available_prop_' + scenario].isna()) ==
        sum(full_set_interpolated['change_proportion_' + scenario].isna()))

+# 3.
Generate best performing facility-based scenario data on consumable availablity +#********************************************************************************************* +df = full_df_with_scenario.copy() #.set_index(['Facility_ID', 'month', 'item_code']) + +#new_scenarios = ['available_prop_scenario6','available_prop_scenario7', 'available_prop_scenario8'] +#df[new_scenarios] = np.nan + +# Create a dictionary to store facilities at the 75th, 90th and 99th percentile in terms of consumable availability +# List of facility levels to process +facility_levels = ['1a', '1b'] +best_performing_facilities = {} # Dictionary to store the best performing facilities for each level +target_percentiles = [75, 90, 99] + +# Populate the dictionary +for level in facility_levels: + # Create an empty dictionary for the current level + best_performing_facilities[level] = {} + + # Get the mean availability by Facility for the current level + mean_consumable_availability = pd.DataFrame(df[df.Facility_ID.isin(facilities_by_level[level])] + .groupby('Facility_ID')['available_prop'].mean()).reset_index() + + # Calculate the percentile rank of each row for 'available_prop' + mean_consumable_availability['percentile_rank'] = mean_consumable_availability['available_prop'].rank(pct=True) * 100 + + # Find the row which is closest to the nth percentile rank for each target percentile + for target_perc in target_percentiles: + # Calculate the difference to target percentile + mean_consumable_availability['diff_to_target_' + str(target_perc)] = np.abs( + mean_consumable_availability['percentile_rank'] - target_perc) + + # Find the row with the minimum difference to the target percentile + closest_row = mean_consumable_availability.loc[ + mean_consumable_availability['diff_to_target_' + str(target_perc)].idxmin()] + + # Store the Facility_ID of the closest row in the dictionary for the current level + best_performing_facilities[level][str(target_perc) + 'th percentile'] = closest_row['Facility_ID'] + +print("Reference facilities at each level: ", best_performing_facilities) + +# Obtain the updated availability estimates for level 1a for scenarios 6-8 +id_variables = ['item_code', 'month'] +updated_availability_1a_scenario6 = df[df['Facility_ID']== best_performing_facilities['1a']['75th percentile']][id_variables + ['available_prop']].rename(columns = {'available_prop': 'available_prop_scenario6'}) +updated_availability_1a_scenario7 = df[df['Facility_ID']== best_performing_facilities['1a']['90th percentile']][id_variables + ['available_prop']].rename(columns = {'available_prop': 'available_prop_scenario7'}) +updated_availability_1a_scenario8 = df[df['Facility_ID']== best_performing_facilities['1a']['99th percentile']][id_variables + ['available_prop']].rename(columns = {'available_prop': 'available_prop_scenario8'}) +updated_availability_1a = updated_availability_1a_scenario6.merge(updated_availability_1a_scenario7, on = ['item_code', 'month'], + how = 'inner', validate = "1:1") +updated_availability_1a = updated_availability_1a.merge(updated_availability_1a_scenario8, on = ['item_code', 'month'], + how = 'inner', validate = "1:1") + +# Obtain the updated availability estimates for level 1b for scenarios 6-8 +updated_availability_1b_scenario6 = df[df['Facility_ID']== best_performing_facilities['1b']['75th percentile']][id_variables + ['available_prop']].rename(columns = {'available_prop': 'available_prop_scenario6'}) +updated_availability_1b_scenario7 = df[df['Facility_ID']== best_performing_facilities['1b']['90th 
percentile']][id_variables + ['available_prop']].rename(columns = {'available_prop': 'available_prop_scenario7'}) +updated_availability_1b_scenario8 = df[df['Facility_ID']== best_performing_facilities['1b']['99th percentile']][id_variables + ['available_prop']].rename(columns = {'available_prop': 'available_prop_scenario8'}) +updated_availability_1b = updated_availability_1b_scenario6.merge(updated_availability_1b_scenario7, on = ['item_code', 'month'], + how = 'inner', validate = "1:1") +updated_availability_1b = updated_availability_1b.merge(updated_availability_1b_scenario8, on = ['item_code', 'month'], + how = 'inner', validate = "1:1") + +# Merge the above scenario data to the full availability scenario dataframe +# Scenario 6-8 availability data for level 1a +df_new_1a = df[df['Facility_ID'].isin(facilities_by_level['1a'])].merge(updated_availability_1a,on = ['item_code', 'month'], + how = 'left', + validate = "m:1") +# Scenario 6-8 availability data for level 1b +df_new_1b = df[df['Facility_ID'].isin(facilities_by_level['1b'])].merge(updated_availability_1b,on = ['item_code', 'month'], + how = 'left', + validate = "m:1") +# Scenario 6-8 availability data for other levels +df_new_otherlevels = df[~df['Facility_ID'].isin(facilities_by_level['1a']|facilities_by_level['1b'])] +new_scenario_columns = ['available_prop_scenario6', 'available_prop_scenario7', 'available_prop_scenario8'] +for col in new_scenario_columns: + df_new_otherlevels[col] = df_new_otherlevels['available_prop'] + +# Append the above dataframes +df_new = pd.concat([df_new_1a, df_new_1b, df_new_otherlevels], ignore_index = True) + # Save dataframe #------------------------------------------------------ +list_of_scenario_suffixes = list_of_scenario_suffixes + ['scenario6', 'scenario7', 'scenario8'] final_list_of_scenario_vars = ['available_prop_' + item for item in list_of_scenario_suffixes] old_vars = ['Facility_ID', 'month', 'item_code', 'available_prop'] -full_df_with_scenario = full_set_interpolated[old_vars + final_list_of_scenario_vars].reset_index().drop('index', axis = 1) +full_df_with_scenario = df_new[old_vars + final_list_of_scenario_vars].reset_index().drop('index', axis = 1) # Save updated consumable availability resource file with scenario data full_df_with_scenario.to_csv( @@ -448,3 +526,4 @@ def interpolate_missing_with_mean(_ser): pivot_table.to_csv(outputfilepath / "temp.csv") print(pivot_table[('change_proportion_scenario5', '0')]) ''' + From ed7882832d38e4a3427c67ce021c98d710529d85 Mon Sep 17 00:00:00 2001 From: sm2511 Date: Tue, 4 Jun 2024 18:42:01 +0100 Subject: [PATCH 014/100] update test for consumables availability ResourceFile - {'Facility_ID', 'month', 'item_code', 'available_prop'} only need to be a subset of all the columns because there may be additional scenario columns --- ...sumable_availability_scenarios_for_impact_analysis.py | 9 +++------ src/tlo/methods/consumables.py | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index 28d163cf73..4f3d9b14c8 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ 
b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -402,12 +402,6 @@ def interpolate_missing_with_mean(_ser): # temporary code assert full_set_interpolated.loc[(fac, slice(None), item), col].mean() >= 0 -# Check that there are not missing values -assert not pd.isnull(full_set_interpolated).any().any() - -# --- Check that the exported file has the properties required of it by the model code. --- # -#check_format_of_consumables_file(df=full_set_interpolated.reset_index(), fac_ids=fac_ids) - # 3. Generate regression-based scenario data on consumable availablity #************************************************************************* # Create new consumable availability estimates for TLO model consumables using @@ -508,6 +502,9 @@ def interpolate_missing_with_mean(_ser): old_vars = ['Facility_ID', 'month', 'item_code', 'available_prop'] full_df_with_scenario = df_new[old_vars + final_list_of_scenario_vars].reset_index().drop('index', axis = 1) +# --- Check that the exported file has the properties required of it by the model code. --- # +check_format_of_consumables_file(df=full_df_with_scenario.reset_index(), fac_ids=fac_ids) + # Save updated consumable availability resource file with scenario data full_df_with_scenario.to_csv( path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv", diff --git a/src/tlo/methods/consumables.py b/src/tlo/methods/consumables.py index bf5a00c7ed..759c9e8efc 100644 --- a/src/tlo/methods/consumables.py +++ b/src/tlo/methods/consumables.py @@ -251,7 +251,7 @@ def check_format_of_consumables_file(df, fac_ids): months = set(range(1, 13)) item_codes = set(df.item_code.unique()) - assert set(df.columns) == {'Facility_ID', 'month', 'item_code', 'available_prop'} + assert {'Facility_ID', 'month', 'item_code', 'available_prop'}.issubset(set(df.columns)) # Check that all permutations of Facility_ID, month and item_code are present pd.testing.assert_index_equal( From 31f6ead22ee8540c9e06856aaf8e02ccea60f352 Mon Sep 17 00:00:00 2001 From: sm2511 Date: Wed, 5 Jun 2024 09:43:25 +0100 Subject: [PATCH 015/100] update test for consumables availability ResourceFile - specify all scenarios --- ...consumable_availability_scenarios_for_impact_analysis.py | 2 +- src/tlo/methods/consumables.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index 4f3d9b14c8..dd1cf4427d 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -503,7 +503,7 @@ def interpolate_missing_with_mean(_ser): full_df_with_scenario = df_new[old_vars + final_list_of_scenario_vars].reset_index().drop('index', axis = 1) # --- Check that the exported file has the properties required of it by the model code. 
--- # -check_format_of_consumables_file(df=full_df_with_scenario.reset_index(), fac_ids=fac_ids) +check_format_of_consumables_file(df=full_df_with_scenario, fac_ids=fac_ids) # Save updated consumable availability resource file with scenario data full_df_with_scenario.to_csv( diff --git a/src/tlo/methods/consumables.py b/src/tlo/methods/consumables.py index 759c9e8efc..78d1ba441f 100644 --- a/src/tlo/methods/consumables.py +++ b/src/tlo/methods/consumables.py @@ -251,7 +251,11 @@ def check_format_of_consumables_file(df, fac_ids): months = set(range(1, 13)) item_codes = set(df.item_code.unique()) - assert {'Facility_ID', 'month', 'item_code', 'available_prop'}.issubset(set(df.columns)) + availability_columns = ['available_prop', 'available_prop_scenario1', 'available_prop_scenario2', + 'available_prop_scenario3', 'available_prop_scenario4', 'available_prop_scenario5', + 'available_prop_scenario6', 'available_prop_scenario7', 'available_prop_scenario8'] + + assert set(df.columns) == {'Facility_ID', 'month', 'item_code'} | set(availability_columns) # Check that all permutations of Facility_ID, month and item_code are present pd.testing.assert_index_equal( From 008952702529c22866f14b3d9689ebf900cabe4c Mon Sep 17 00:00:00 2001 From: sm2511 Date: Wed, 5 Jun 2024 11:02:40 +0100 Subject: [PATCH 016/100] add script to generate barplots to visualise change in available_prop under scenarios --- ...ilability_scenarios_for_impact_analysis.py | 36 ++++++++++++++++--- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index dd1cf4427d..d411d5f642 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -23,6 +23,7 @@ import datetime from collections import defaultdict from pathlib import Path +import os import matplotlib.pyplot as plt from plotnine import * # ggplot, aes, geom_point for ggplots from R @@ -418,12 +419,9 @@ def interpolate_missing_with_mean(_ser): assert(sum(full_set_interpolated['available_prop_' + scenario].isna()) == sum(full_set_interpolated['change_proportion_' + scenario].isna())) -# 3. Generate best performing facility-based scenario data on consumable availablity +# 4. Generate best performing facility-based scenario data on consumable availablity #********************************************************************************************* -df = full_df_with_scenario.copy() #.set_index(['Facility_ID', 'month', 'item_code']) - -#new_scenarios = ['available_prop_scenario6','available_prop_scenario7', 'available_prop_scenario8'] -#df[new_scenarios] = np.nan +df = full_df_with_scenario.copy() # Create a dictionary to store facilities at the 75th, 90th and 99th percentile in terms of consumable availability # List of facility levels to process @@ -512,6 +510,34 @@ def interpolate_missing_with_mean(_ser): ) # TODO: Create a column providing the source of scenario data +# 8. 
Plot new availability estimates by scenario +#********************************************************************************************* +# Creating the line plot with ggplot +df_for_plots = full_df_with_scenario.merge(programs[['category', 'item_code']], on = 'item_code', how = "left", validate = "m:1") +df_for_plots = df_for_plots.merge(mfl[['Facility_ID', 'Facility_Level']], on = 'Facility_ID', how = 'left', validate = "m:1") +def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): + df_for_line_plot = _df.groupby([_x_axis_var])[['available_prop'] + final_list_of_scenario_vars].mean() + df_for_line_plot = df_for_line_plot.reset_index().melt(id_vars=[_x_axis_var], value_vars=['available_prop'] + final_list_of_scenario_vars, + var_name='Scenario', value_name='Value') + plot = (ggplot(df_for_line_plot.reset_index(), aes(x=_x_axis_var, y='Value', fill = 'Scenario')) + + geom_bar(stat='identity', position='dodge') + + ylim(0, 1) + + labs(title = "Probability of availability across scenarios", + x=_x_axis_var, + y='Probability of availability') + ) + # Create the directory if it doesn't exist + directory = outputfilepath / 'consumable_scenario_analysis' + if not os.path.exists(directory): + os.makedirs(directory) + plot.save(filename= directory / _filename, dpi=300, width=10, height=8, units='in') +generate_barplot_of_scenarios(_df = df_for_plots, _x_axis_var = 'category', _filename = 'availability_by_category.png') +generate_barplot_of_scenarios(_df = df_for_plots, _x_axis_var = 'Facility_Level', _filename = 'availability_by_level.png') + +# Scenario on the X axis, level on the Y axis +# Scenario on the X axis, program on the Y axis +# TODO add heat maps + ''' # 2.3.2. Browse missingness in the availability_change_prop variable #------------------------------------------------------ From e3211fdacce24e72c7116ebcb0cc086db3861911 Mon Sep 17 00:00:00 2001 From: sm2511 Date: Thu, 6 Jun 2024 15:32:05 +0100 Subject: [PATCH 017/100] update the method of generating scenarios 6-8 - previously for each level of care, one best performing facility was chosen based on the average consumable availability across all item_codes; Now a different best performing facility is chosen for each level for each item_code, giving a greater improvement in available_prop --- ...rceFile_Consumables_availability_small.csv | 4 +- ...ilability_scenarios_for_impact_analysis.py | 100 +++++++++++------- 2 files changed, 63 insertions(+), 41 deletions(-) diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv index acc6db2e7b..eac30c2910 100644 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a64491e6e2eb2884628e52c796391725b4738a74ad3dbf2e12eb20057dd52322 -size 30981143 +oid sha256:32bee3daecd1958cd8c29b7881f6349f037967166b9318b937f1edbd50a7f31b +size 30603767 diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index d411d5f642..75c53b520b 100644 --- 
a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -421,12 +421,10 @@ def interpolate_missing_with_mean(_ser): # 4. Generate best performing facility-based scenario data on consumable availablity #********************************************************************************************* -df = full_df_with_scenario.copy() +df = full_set_interpolated.reset_index().copy() -# Create a dictionary to store facilities at the 75th, 90th and 99th percentile in terms of consumable availability -# List of facility levels to process +# Try updating the avaiability to represent the 75th percentile by consumable facility_levels = ['1a', '1b'] -best_performing_facilities = {} # Dictionary to store the best performing facilities for each level target_percentiles = [75, 90, 99] # Populate the dictionary @@ -434,54 +432,68 @@ def interpolate_missing_with_mean(_ser): # Create an empty dictionary for the current level best_performing_facilities[level] = {} - # Get the mean availability by Facility for the current level - mean_consumable_availability = pd.DataFrame(df[df.Facility_ID.isin(facilities_by_level[level])] - .groupby('Facility_ID')['available_prop'].mean()).reset_index() + for item in item_codes: + best_performing_facilities[level][item] = {} + # Get the mean availability by Facility for the current level + mean_consumable_availability = pd.DataFrame(df[(df.Facility_ID.isin(facilities_by_level[level])) & (df.item_code == item)] + .groupby('Facility_ID')['available_prop'].mean()).reset_index() - # Calculate the percentile rank of each row for 'available_prop' - mean_consumable_availability['percentile_rank'] = mean_consumable_availability['available_prop'].rank(pct=True) * 100 + # Calculate the percentile rank of each row for 'available_prop' + mean_consumable_availability['percentile_rank'] = mean_consumable_availability['available_prop'].rank(pct=True) * 100 - # Find the row which is closest to the nth percentile rank for each target percentile - for target_perc in target_percentiles: - # Calculate the difference to target percentile - mean_consumable_availability['diff_to_target_' + str(target_perc)] = np.abs( - mean_consumable_availability['percentile_rank'] - target_perc) + # Find the row which is closest to the nth percentile rank for each target percentile + for target_perc in target_percentiles: + # Calculate the difference to target percentile + mean_consumable_availability['diff_to_target_' + str(target_perc)] = np.abs( + mean_consumable_availability['percentile_rank'] - target_perc) - # Find the row with the minimum difference to the target percentile - closest_row = mean_consumable_availability.loc[ - mean_consumable_availability['diff_to_target_' + str(target_perc)].idxmin()] + # Find the row with the minimum difference to the target percentile + closest_row = mean_consumable_availability.loc[ + mean_consumable_availability['diff_to_target_' + str(target_perc)].idxmin()] - # Store the Facility_ID of the closest row in the dictionary for the current level - best_performing_facilities[level][str(target_perc) + 'th percentile'] = closest_row['Facility_ID'] + # Store the Facility_ID of the closest row in the dictionary for the current level + best_performing_facilities[level][item][str(target_perc) + 'th percentile'] = 
closest_row['Facility_ID'] -print("Reference facilities at each level: ", best_performing_facilities) +print("Reference facilities at each level for each item: ", best_performing_facilities) +# TODO Flip the nesting order above for percentile to go before item? # Obtain the updated availability estimates for level 1a for scenarios 6-8 -id_variables = ['item_code', 'month'] -updated_availability_1a_scenario6 = df[df['Facility_ID']== best_performing_facilities['1a']['75th percentile']][id_variables + ['available_prop']].rename(columns = {'available_prop': 'available_prop_scenario6'}) -updated_availability_1a_scenario7 = df[df['Facility_ID']== best_performing_facilities['1a']['90th percentile']][id_variables + ['available_prop']].rename(columns = {'available_prop': 'available_prop_scenario7'}) -updated_availability_1a_scenario8 = df[df['Facility_ID']== best_performing_facilities['1a']['99th percentile']][id_variables + ['available_prop']].rename(columns = {'available_prop': 'available_prop_scenario8'}) -updated_availability_1a = updated_availability_1a_scenario6.merge(updated_availability_1a_scenario7, on = ['item_code', 'month'], - how = 'inner', validate = "1:1") -updated_availability_1a = updated_availability_1a.merge(updated_availability_1a_scenario8, on = ['item_code', 'month'], - how = 'inner', validate = "1:1") - -# Obtain the updated availability estimates for level 1b for scenarios 6-8 -updated_availability_1b_scenario6 = df[df['Facility_ID']== best_performing_facilities['1b']['75th percentile']][id_variables + ['available_prop']].rename(columns = {'available_prop': 'available_prop_scenario6'}) -updated_availability_1b_scenario7 = df[df['Facility_ID']== best_performing_facilities['1b']['90th percentile']][id_variables + ['available_prop']].rename(columns = {'available_prop': 'available_prop_scenario7'}) -updated_availability_1b_scenario8 = df[df['Facility_ID']== best_performing_facilities['1b']['99th percentile']][id_variables + ['available_prop']].rename(columns = {'available_prop': 'available_prop_scenario8'}) -updated_availability_1b = updated_availability_1b_scenario6.merge(updated_availability_1b_scenario7, on = ['item_code', 'month'], - how = 'inner', validate = "1:1") -updated_availability_1b = updated_availability_1b.merge(updated_availability_1b_scenario8, on = ['item_code', 'month'], - how = 'inner', validate = "1:1") +updated_availability_1a = df[['item_code', 'month']].drop_duplicates() +updated_availability_1b = df[['item_code', 'month']].drop_duplicates() +temporary_df = pd.DataFrame([]) +availability_dataframes = [updated_availability_1a, updated_availability_1b] + +i = 6 # start scenario counter +j = 0 # start level counter +for level in facility_levels: + for target_perc in target_percentiles: + for item in item_codes: + + print("Running level ", level, "; Running scenario ", str(i), "; Running item ", item) + reference_facility = df['Facility_ID'] == best_performing_facilities[level][item][str(target_perc) + 'th percentile'] + current_item = df['item_code'] == item + availability_at_reference_facility = df[reference_facility & current_item][['item_code', 'month', 'available_prop']] + + if temporary_df.empty: + temporary_df = availability_at_reference_facility + else: + temporary_df = pd.concat([temporary_df,availability_at_reference_facility], ignore_index = True) + + column_name = 'available_prop_scenario' + str(i) + temporary_df = temporary_df.rename(columns = {'available_prop': column_name }) + availability_dataframes[j] = 
availability_dataframes[j].merge(temporary_df, on = ['item_code', 'month'], how = 'left', validate = '1:1') + temporary_df = pd.DataFrame([]) + i = i + 1 + i = 6 # restart scenario counter + j = j + 1 # move to the next level # Merge the above scenario data to the full availability scenario dataframe # Scenario 6-8 availability data for level 1a -df_new_1a = df[df['Facility_ID'].isin(facilities_by_level['1a'])].merge(updated_availability_1a,on = ['item_code', 'month'], +df_new_1a = df[df['Facility_ID'].isin(facilities_by_level['1a'])].merge(availability_dataframes[0],on = ['item_code', 'month'], how = 'left', validate = "m:1") # Scenario 6-8 availability data for level 1b -df_new_1b = df[df['Facility_ID'].isin(facilities_by_level['1b'])].merge(updated_availability_1b,on = ['item_code', 'month'], +df_new_1b = df[df['Facility_ID'].isin(facilities_by_level['1b'])].merge(availability_dataframes[1],on = ['item_code', 'month'], how = 'left', validate = "m:1") # Scenario 6-8 availability data for other levels @@ -548,5 +560,15 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): aggfunc=lambda x: sum(pd.isna(x))/len(x)*100) pivot_table.to_csv(outputfilepath / "temp.csv") print(pivot_table[('change_proportion_scenario5', '0')]) + +a = availability_dataframes[1].reset_index() +print(best_performing_facilities['1b'][5][str(75) + 'th percentile']) +print(best_performing_facilities['1b'][222][str(90) + 'th percentile']) +print(best_performing_facilities['1b'][222][str(99) + 'th percentile']) +a[a.item_code == 222][['month', 'available_prop_scenario8']] +item_chosen = 222 +fac_chosen = 110 +print(df[(df.item_code == item_chosen) & (df.Facility_ID == fac_chosen)][['month', 'available_prop']]) + ''' From 19aa36e8b6c30e794c51084b5569d61b2fa57ed6 Mon Sep 17 00:00:00 2001 From: sm2511 Date: Mon, 10 Jun 2024 16:00:32 +0100 Subject: [PATCH 018/100] minor figure edit --- ...rate_consumable_availability_scenarios_for_impact_analysis.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index 75c53b520b..7b5185b399 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -537,6 +537,7 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): + labs(title = "Probability of availability across scenarios", x=_x_axis_var, y='Probability of availability') + + theme(axis_text_x=element_text(angle=45, hjust=1)) ) # Create the directory if it doesn't exist directory = outputfilepath / 'consumable_scenario_analysis' From 0135a23b10661da6902ec6e7ff500bb984ce3876 Mon Sep 17 00:00:00 2001 From: sm2511 Date: Mon, 10 Jun 2024 16:48:53 +0100 Subject: [PATCH 019/100] allow simulation to import availability estimates based on new scenarios --- src/tlo/methods/consumables.py | 25 +++++++++++++++++++------ tests/test_consumables.py | 5 ++++- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/tlo/methods/consumables.py b/src/tlo/methods/consumables.py index e8e117e2f7..3de1376197 100644 --- 
a/src/tlo/methods/consumables.py +++ b/src/tlo/methods/consumables.py @@ -46,6 +46,8 @@ def __init__(self, 'all_medicines_and_other_available', 'all_vital_available', 'all_drug_or_vaccine_available', + 'scenario1', 'scenario2', 'scenario3', 'scenario4', + 'scenario5', 'scenario6', 'scenario7', 'scenario8', } # Create internal items: @@ -102,7 +104,9 @@ def _update_prob_item_codes_available(self, availability: str): item_code_designations = self._item_code_designations # Over-ride the data according to option for `availability` - if availability == 'default': + if availability in ('default', + 'scenario1', 'scenario2', 'scenario3', 'scenario4', + 'scenario5', 'scenario6', 'scenario7', 'scenario8'): pass elif availability == 'all': self.override_availability(dict(zip(self.item_codes, repeat(1.0)))) @@ -134,16 +138,25 @@ def _update_prob_item_codes_available(self, availability: str): else: raise ValueError - def _process_consumables_data(self, availability_data: pd.DataFrame) -> Tuple[set, pd.Series]: + def _process_consumables_data(self, availability_data: pd.DataFrame, availability: str) -> Tuple[set, pd.Series]: """Helper function for processing the consumables data, passed in here as pd.DataFrame that has been read-in by the HealthSystem. Returns: (i) the set of all recognised item_codes; (ii) pd.Series of the availability of each consumable at each facility_id during each month. """ - return ( - set(availability_data.item_code), - availability_data.set_index(['month', 'Facility_ID', 'item_code'])['available_prop'] - ) + if availability == 'default': + return ( + set(availability_data.item_code), + availability_data.set_index(['month', 'Facility_ID', 'item_code'])['available_prop'] + ) + elif availability in ('scenario1', 'scenario2', 'scenario3', 'scenario4', + 'scenario5', 'scenario6', 'scenario7', 'scenario8'): + return ( + set(availability_data.item_code), + availability_data.set_index(['month', 'Facility_ID', 'item_code'])['available_prop_' + availability] + ) + else: + pass def _refresh_availability_of_consumables(self, date: datetime.datetime): """Update the availability of all items based on the data for the probability of availability, given the current diff --git a/tests/test_consumables.py b/tests/test_consumables.py index 6eee6dac38..f3cdc65c5d 100644 --- a/tests/test_consumables.py +++ b/tests/test_consumables.py @@ -606,7 +606,10 @@ def test_consumables_availability_modes_that_depend_on_designations(seed): target_items = items_drug_or_vaccine elif availability == 'none': target_items = set() - elif availability == 'default': + elif availability in ('default', + 'scenario1', 'scenario2', 'scenario3', 'scenario4', + 'scenario5', 'scenario6', 'scenario7', 'scenario8', + ): continue else: raise ValueError(f'Unexpected availability: {availability}') From f9c7df11dea1b2654ce37f7960d41f940809ec07 Mon Sep 17 00:00:00 2001 From: sm2511 Date: Mon, 10 Jun 2024 18:24:00 +0100 Subject: [PATCH 020/100] update health system parameter --- ...scenario_impact_of_consumable_scenarios.py | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py diff --git a/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py new file mode 100644 index 0000000000..a4f18a401b --- /dev/null +++ b/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py @@ -0,0 +1,63 @@ +""" +This file defines a batch 
run to calculate the health effect of updated consumable availability estimates
+as a result of a supply chain intervention. The following scenarios are considered:
+1. 'scenario1' - all facilities set to 1b,
+2. 'scenario2' - all facility ownership set to CHAM,
+3. 'scenario3' - all facilities have functional computers,
+4. 'scenario4' - all facility drug stocks are managed by pharmacists or pharmacist technicians,
+5. 'scenario5' - all facilities have a functional emergency vehicle,
+6. 'scenario6' - all facilities provide diagnostic services,
+7. 'scenario7' - all facilities are within 10 km of the relevant DHO,
+8. 'scenario8' - all facilities are within 10 km of the relevant Regional Medical Store (warehouse).
+The batch runs are for a large population for a long time with all disease modules and full use of HSIs.
+Run on the batch system using:
+```tlo batch-submit src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py```
+or locally using:
+    ```tlo scenario-run src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py```
+"""
+
+from pathlib import Path
+from typing import Dict
+
+from tlo import Date, logging
+from tlo.analysis.utils import get_parameters_for_status_quo, mix_scenarios
+from tlo.methods.fullmodel import fullmodel
+from tlo.scenario import BaseScenario
+
+class ImpactOfConsumablesScenarios(BaseScenario):
+    def __init__(self):
+        super().__init__()
+        self.seed = 99
+        self.start_date = Date(2010, 1, 1)
+        self.end_date = Date(2019, 12, 31)
+        self.pop_size = 50  # <- small size for testing; use a large population for the final simulation
+        self.number_of_draws = 2  # <- 2 while testing; 10 when all scenarios are run
+        self.runs_per_draw = 2  # <- repeated this many times
+
+    def log_configuration(self):
+        return {
+            'filename': 'impact_of_consumables_scenarios',
+            'directory': './outputs',  # <- (specified only for local running)
+            'custom_levels': {
+                '*': logging.WARNING,
+                'tlo.methods.demography': logging.INFO,
+                'tlo.methods.healthburden': logging.INFO,
+                "tlo.methods.healthsystem.summary": logging.INFO,
+            }
+        }
+
+    def modules(self):
+        return fullmodel(resourcefilepath=self.resources)
+
+    def draw_parameters(self, draw_number, rng):
+        return {
+            'HealthSystem': {
+                'cons_availability': ['default', 'scenario1'][draw_number]  # , 'scenario2', 'scenario3', 'scenario4', 'scenario5', 'scenario6', 'scenario7', 'scenario8', 'all'
+            }
+        }
+
+
+if __name__ == '__main__':
+    from tlo.cli import scenario_run
+
+    scenario_run([__file__])

From 2e5ed3f828e0c1ccd2b76daa90b1e4b74d18abd6 Mon Sep 17 00:00:00 2001
From: sm2511
Date: Mon, 10 Jun 2024 18:41:40 +0100
Subject: [PATCH 021/100] revert to consumable RFs before master was merged in

---
 .../ResourceFile_Consumables_availability_small.csv  | 4 ++--
 .../consumables/ResourceFile_consumables_matched.csv | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv
index 05826c5426..533c9a0821 100644
--- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv
+++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:41b885bd2a36c406538b345a6affc6ab95e731bc80d218c9e9cba5b6106ee62e
-size 1280001
+oid sha256:0b53caa5f750cf9810c901aeeb993470eceea8ecc4119f5183f996607c8e252e
+size 6389877
diff --git
a/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv b/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv
index 980adbfba2..d7798357fc 100644
--- a/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv
+++ b/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:084aa9b6e011321538e907e0860a57b6392e9b16fd848bfb982374365e5fa286
-size 90079
+oid sha256:66b68e216701f841382e3add63305d55e1598862076789352ac7d799100e6d1a
+size 110047

From 84fdb7394c7e111e4ca56879298c475e992e116f Mon Sep 17 00:00:00 2001
From: sm2511
Date: Mon, 10 Jun 2024 18:50:44 +0100
Subject: [PATCH 022/100] update RF to include availability under the 8
 realistic improved availability scenarios

---
 .../ResourceFile_Consumables_availability_small.csv           | 4 ++--
 ...e_consumable_availability_scenarios_for_impact_analysis.py | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv
index 533c9a0821..d20472a441 100644
--- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv
+++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0b53caa5f750cf9810c901aeeb993470eceea8ecc4119f5183f996607c8e252e
-size 6389877
+oid sha256:9d45afdfc7f250bd308f1d3442dfa0b1b2f825e163faa05877e4536099cbfe81
+size 30603766
diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py
index 7b5185b399..bc05f733e0 100644
--- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py
+++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py
@@ -427,6 +427,7 @@ def interpolate_missing_with_mean(_ser):
 facility_levels = ['1a', '1b']
 target_percentiles = [75, 90, 99]
+best_performing_facilities = {}
 # Populate the dictionary
 for level in facility_levels:
     # Create an empty dictionary for the current level

From 35393979b4bd795540947bd4ea06fa913bc145b6 Mon Sep 17 00:00:00 2001
From: sm2511
Date: Mon, 10 Jun 2024 19:01:55 +0100
Subject: [PATCH 023/100] add the full list of scenario availability columns
 to consumable and health systems methods

---
 src/tlo/methods/consumables.py  | 5 +++--
 src/tlo/methods/healthsystem.py | 6 +++++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/tlo/methods/consumables.py b/src/tlo/methods/consumables.py
index 3de1376197..ee164d16e6 100644
--- a/src/tlo/methods/consumables.py
+++ b/src/tlo/methods/consumables.py
@@ -355,8 +355,9 @@ def check_format_of_consumables_file(df, fac_ids):
     )

     # Check that every entry for a probability is a float on [0,1]
-    assert (df.available_prop <= 1.0).all() and (df.available_prop >= 0.0).all()
-    assert not pd.isnull(df.available_prop).any()
+    for col in availability_columns:
+        assert (df[col] <= 1.0).all() and (df[col] >= 0.0).all()
+        assert not pd.isnull(df[col]).any()

 class
ConsumablesSummaryCounter: diff --git a/src/tlo/methods/healthsystem.py b/src/tlo/methods/healthsystem.py index 8099346ddf..bdfa22a8af 100644 --- a/src/tlo/methods/healthsystem.py +++ b/src/tlo/methods/healthsystem.py @@ -1046,10 +1046,14 @@ def update_consumables_availability_to_represent_merging_of_levels_1b_and_2(self how='left' ) + availability_columns = ['available_prop', 'available_prop_scenario1', 'available_prop_scenario2', + 'available_prop_scenario3', 'available_prop_scenario4', 'available_prop_scenario5', + 'available_prop_scenario6', 'available_prop_scenario7', 'available_prop_scenario8'] + # compute the updated availability at the merged level '1b' and '2' availability_at_1b_and_2 = \ dfx.drop(dfx.index[~dfx['Facility_Level'].isin(AVAILABILITY_OF_CONSUMABLES_AT_MERGED_LEVELS_1B_AND_2)]) \ - .groupby(by=['District', 'month', 'item_code'])['available_prop'] \ + .groupby(by=['District', 'month', 'item_code'])[availability_columns] \ .mean() \ .reset_index()\ .assign(Facility_Level=LABEL_FOR_MERGED_FACILITY_LEVELS_1B_AND_2) From 7bf2b0733925913853e18aef51c87d99ec6085eb Mon Sep 17 00:00:00 2001 From: sm2511 Date: Mon, 10 Jun 2024 19:29:05 +0100 Subject: [PATCH 024/100] update assertion to ensure that only consumable availabilility at level 2 changes when levels 1b and 2 are collapsed --- src/tlo/methods/healthsystem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tlo/methods/healthsystem.py b/src/tlo/methods/healthsystem.py index bdfa22a8af..9c4ff479e6 100644 --- a/src/tlo/methods/healthsystem.py +++ b/src/tlo/methods/healthsystem.py @@ -1082,7 +1082,7 @@ def update_consumables_availability_to_represent_merging_of_levels_1b_and_2(self # check values the same for everything apart from the facility level '2' facilities facilities_with_any_differences = set( df_updated.loc[ - ~(df_original == df_updated).all(axis=1), + ~(df_original.sort_values(['Facility_ID', 'month', 'item_code']).reset_index(drop=True) == df_updated).all(axis=1), 'Facility_ID'] ) level2_facilities = set( From 6cbe0ee2dd05bdee8045ba2f1fb67911f6e410bc Mon Sep 17 00:00:00 2001 From: sm2511 Date: Mon, 10 Jun 2024 19:31:16 +0100 Subject: [PATCH 025/100] update helper function to load consuamable data --- src/tlo/methods/consumables.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tlo/methods/consumables.py b/src/tlo/methods/consumables.py index ee164d16e6..623d06275d 100644 --- a/src/tlo/methods/consumables.py +++ b/src/tlo/methods/consumables.py @@ -144,16 +144,16 @@ def _process_consumables_data(self, availability_data: pd.DataFrame, availabilit Returns: (i) the set of all recognised item_codes; (ii) pd.Series of the availability of each consumable at each facility_id during each month. 
""" - if availability == 'default': + if cons_availability == 'default': return ( set(availability_data.item_code), availability_data.set_index(['month', 'Facility_ID', 'item_code'])['available_prop'] ) - elif availability in ('scenario1', 'scenario2', 'scenario3', 'scenario4', + elif cons_availability in ('scenario1', 'scenario2', 'scenario3', 'scenario4', 'scenario5', 'scenario6', 'scenario7', 'scenario8'): return ( set(availability_data.item_code), - availability_data.set_index(['month', 'Facility_ID', 'item_code'])['available_prop_' + availability] + availability_data.set_index(['month', 'Facility_ID', 'item_code'])['available_prop_' + cons_availability] ) else: pass From 553d0cf1555e773eabf06b1b6b312463e8ebbca7 Mon Sep 17 00:00:00 2001 From: sm2511 Date: Mon, 10 Jun 2024 19:31:49 +0100 Subject: [PATCH 026/100] Revert "update helper function to load consuamable data" This reverts commit 6cbe0ee2dd05bdee8045ba2f1fb67911f6e410bc. --- src/tlo/methods/consumables.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tlo/methods/consumables.py b/src/tlo/methods/consumables.py index 623d06275d..ee164d16e6 100644 --- a/src/tlo/methods/consumables.py +++ b/src/tlo/methods/consumables.py @@ -144,16 +144,16 @@ def _process_consumables_data(self, availability_data: pd.DataFrame, availabilit Returns: (i) the set of all recognised item_codes; (ii) pd.Series of the availability of each consumable at each facility_id during each month. """ - if cons_availability == 'default': + if availability == 'default': return ( set(availability_data.item_code), availability_data.set_index(['month', 'Facility_ID', 'item_code'])['available_prop'] ) - elif cons_availability in ('scenario1', 'scenario2', 'scenario3', 'scenario4', + elif availability in ('scenario1', 'scenario2', 'scenario3', 'scenario4', 'scenario5', 'scenario6', 'scenario7', 'scenario8'): return ( set(availability_data.item_code), - availability_data.set_index(['month', 'Facility_ID', 'item_code'])['available_prop_' + cons_availability] + availability_data.set_index(['month', 'Facility_ID', 'item_code'])['available_prop_' + availability] ) else: pass From d4fd581dc5fdff23eecec53aef89a8777a8e828a Mon Sep 17 00:00:00 2001 From: sm2511 Date: Mon, 10 Jun 2024 19:32:43 +0100 Subject: [PATCH 027/100] update helper function to load consuamable data --- src/tlo/methods/consumables.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/tlo/methods/consumables.py b/src/tlo/methods/consumables.py index ee164d16e6..32d509c977 100644 --- a/src/tlo/methods/consumables.py +++ b/src/tlo/methods/consumables.py @@ -138,22 +138,22 @@ def _update_prob_item_codes_available(self, availability: str): else: raise ValueError - def _process_consumables_data(self, availability_data: pd.DataFrame, availability: str) -> Tuple[set, pd.Series]: + def _process_consumables_data(self, availability_data: pd.DataFrame, cons_availability: str) -> Tuple[set, pd.Series]: """Helper function for processing the consumables data, passed in here as pd.DataFrame that has been read-in by the HealthSystem. Returns: (i) the set of all recognised item_codes; (ii) pd.Series of the availability of each consumable at each facility_id during each month. 
""" - if availability == 'default': + if cons_availability == 'default': return ( set(availability_data.item_code), availability_data.set_index(['month', 'Facility_ID', 'item_code'])['available_prop'] ) - elif availability in ('scenario1', 'scenario2', 'scenario3', 'scenario4', + elif cons_availability in ('scenario1', 'scenario2', 'scenario3', 'scenario4', 'scenario5', 'scenario6', 'scenario7', 'scenario8'): return ( set(availability_data.item_code), - availability_data.set_index(['month', 'Facility_ID', 'item_code'])['available_prop_' + availability] + availability_data.set_index(['month', 'Facility_ID', 'item_code'])['available_prop_' + cons_availability] ) else: pass From 242503a45ee0671ecc43575a6af031e02db59ea4 Mon Sep 17 00:00:00 2001 From: sm2511 Date: Mon, 10 Jun 2024 19:50:53 +0100 Subject: [PATCH 028/100] update _process_consumables_data --- src/tlo/methods/consumables.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/tlo/methods/consumables.py b/src/tlo/methods/consumables.py index 32d509c977..ee164d16e6 100644 --- a/src/tlo/methods/consumables.py +++ b/src/tlo/methods/consumables.py @@ -138,22 +138,22 @@ def _update_prob_item_codes_available(self, availability: str): else: raise ValueError - def _process_consumables_data(self, availability_data: pd.DataFrame, cons_availability: str) -> Tuple[set, pd.Series]: + def _process_consumables_data(self, availability_data: pd.DataFrame, availability: str) -> Tuple[set, pd.Series]: """Helper function for processing the consumables data, passed in here as pd.DataFrame that has been read-in by the HealthSystem. Returns: (i) the set of all recognised item_codes; (ii) pd.Series of the availability of each consumable at each facility_id during each month. """ - if cons_availability == 'default': + if availability == 'default': return ( set(availability_data.item_code), availability_data.set_index(['month', 'Facility_ID', 'item_code'])['available_prop'] ) - elif cons_availability in ('scenario1', 'scenario2', 'scenario3', 'scenario4', + elif availability in ('scenario1', 'scenario2', 'scenario3', 'scenario4', 'scenario5', 'scenario6', 'scenario7', 'scenario8'): return ( set(availability_data.item_code), - availability_data.set_index(['month', 'Facility_ID', 'item_code'])['available_prop_' + cons_availability] + availability_data.set_index(['month', 'Facility_ID', 'item_code'])['available_prop_' + availability] ) else: pass From 82cb1b0eca559aedf27cf12c56f083d619562e2d Mon Sep 17 00:00:00 2001 From: sm2511 Date: Mon, 10 Jun 2024 20:00:22 +0100 Subject: [PATCH 029/100] update _process_consumables_data --- src/tlo/methods/consumables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tlo/methods/consumables.py b/src/tlo/methods/consumables.py index ee164d16e6..1809e847d6 100644 --- a/src/tlo/methods/consumables.py +++ b/src/tlo/methods/consumables.py @@ -138,7 +138,7 @@ def _update_prob_item_codes_available(self, availability: str): else: raise ValueError - def _process_consumables_data(self, availability_data: pd.DataFrame, availability: str) -> Tuple[set, pd.Series]: + def _process_consumables_data(self, availability_data: pd.DataFrame) -> Tuple[set, pd.Series]: """Helper function for processing the consumables data, passed in here as pd.DataFrame that has been read-in by the HealthSystem. 
Returns: (i) the set of all recognised item_codes; (ii) pd.Series of the availability of From df28a999cdb52ed03513f70f78c2afcd080b3712 Mon Sep 17 00:00:00 2001 From: sm2511 Date: Mon, 10 Jun 2024 20:51:36 +0100 Subject: [PATCH 030/100] correct _process_consumables_data --- src/tlo/methods/consumables.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/tlo/methods/consumables.py b/src/tlo/methods/consumables.py index 1809e847d6..933264b00d 100644 --- a/src/tlo/methods/consumables.py +++ b/src/tlo/methods/consumables.py @@ -63,7 +63,7 @@ def __init__(self, # Save all item_codes that are defined and pd.Series with probs of availability from ResourceFile self.item_codes, self._processed_consumables_data = \ - self._process_consumables_data(availability_data=availability_data) + self._process_consumables_data(availability_data=availability_data, availability=availability) # Set the availability based on the argument provided (this can be updated later after the class is initialised) self.availability = availability @@ -138,7 +138,7 @@ def _update_prob_item_codes_available(self, availability: str): else: raise ValueError - def _process_consumables_data(self, availability_data: pd.DataFrame) -> Tuple[set, pd.Series]: + def _process_consumables_data(self, availability_data: pd.DataFrame, availability: str) -> Tuple[set, pd.Series]: """Helper function for processing the consumables data, passed in here as pd.DataFrame that has been read-in by the HealthSystem. Returns: (i) the set of all recognised item_codes; (ii) pd.Series of the availability of @@ -146,7 +146,7 @@ def _process_consumables_data(self, availability_data: pd.DataFrame) -> Tuple[se """ if availability == 'default': return ( - set(availability_data.item_code), + set(availability_data['item_code']), availability_data.set_index(['month', 'Facility_ID', 'item_code'])['available_prop'] ) elif availability in ('scenario1', 'scenario2', 'scenario3', 'scenario4', @@ -156,7 +156,7 @@ def _process_consumables_data(self, availability_data: pd.DataFrame) -> Tuple[se availability_data.set_index(['month', 'Facility_ID', 'item_code'])['available_prop_' + availability] ) else: - pass + raise ValueError(f"Unknown availability parameter") def _refresh_availability_of_consumables(self, date: datetime.datetime): """Update the availability of all items based on the data for the probability of availability, given the current From ee9616edddcbaf04c9b678660ee716192c8d04e4 Mon Sep 17 00:00:00 2001 From: sm2511 Date: Mon, 10 Jun 2024 20:56:36 +0100 Subject: [PATCH 031/100] update duration of simulation for local run --- .../scenario_impact_of_consumable_scenarios.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py index a4f18a401b..baf930513a 100644 --- a/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py +++ b/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py @@ -29,7 +29,7 @@ def __init__(self): super().__init__() self.seed = 99 self.start_date = Date(2010, 1, 1) - self.end_date = Date(2019, 12, 31) + self.end_date = Date(2011, 12, 31) self.pop_size = 50 # large population size for final simulation self.number_of_draws = 2 # <- 10 scenarios (10) self.runs_per_draw = 2 # <- repeated this many times From 8d2a79f591963c2adc4348a7d74cdfd46c8747d2 Mon Sep 17 00:00:00 2001 From: sm2511 Date: Tue, 11 Jun 2024 12:27:35 
+0100 Subject: [PATCH 032/100] update _process_consumables_data --- src/tlo/methods/consumables.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/tlo/methods/consumables.py b/src/tlo/methods/consumables.py index 933264b00d..64fb46a00a 100644 --- a/src/tlo/methods/consumables.py +++ b/src/tlo/methods/consumables.py @@ -144,19 +144,17 @@ def _process_consumables_data(self, availability_data: pd.DataFrame, availabilit Returns: (i) the set of all recognised item_codes; (ii) pd.Series of the availability of each consumable at each facility_id during each month. """ - if availability == 'default': - return ( - set(availability_data['item_code']), - availability_data.set_index(['month', 'Facility_ID', 'item_code'])['available_prop'] - ) - elif availability in ('scenario1', 'scenario2', 'scenario3', 'scenario4', + if availability in ('scenario1', 'scenario2', 'scenario3', 'scenario4', 'scenario5', 'scenario6', 'scenario7', 'scenario8'): return ( set(availability_data.item_code), availability_data.set_index(['month', 'Facility_ID', 'item_code'])['available_prop_' + availability] ) else: - raise ValueError(f"Unknown availability parameter") + return ( + set(availability_data['item_code']), + availability_data.set_index(['month', 'Facility_ID', 'item_code'])['available_prop'] + ) def _refresh_availability_of_consumables(self, date: datetime.datetime): """Update the availability of all items based on the data for the probability of availability, given the current From b78d5a1c84611fa721f1a5accca8986ab89694c9 Mon Sep 17 00:00:00 2001 From: sm2511 Date: Tue, 11 Jun 2024 21:37:48 +0100 Subject: [PATCH 033/100] update scenario for large run --- ...scenario_impact_of_consumable_scenarios.py | 33 +++++++++++-------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py index baf930513a..bfa555f01f 100644 --- a/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py +++ b/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py @@ -1,14 +1,17 @@ """ This file defines a batch run to calculate the health effect of updated consumable availability estimates as a result of a supply chain intervention. The following scenarios are considered: -1. 'scenario1' - all facilities set to 1b, -2. 'scenario2' - all facility ownership set to CHAM, -3. 'scenario3' - all facilities have functional computers, -4. 'scenario4' - all facility drug stocks are managed by pharmacists or pharmacist technicians, -5. 'scenario5' - all facilities have a functional emergency vehicle, -6. 'scenario6' - all facilities provide diagnostic services, -7. 'scenario7' - all facilities are within 10 kms from the relevant DHO, -8. 'scenario8' - all facilities are within 10 kms from the relevant Regional medical Store (Warehouse), +1. 'default' - this is the benchmark scenario with 2018 levels of consumable availability +2. 'scenario1' : All items perform as well as consumables other than drugs/diagnostic tests +3. 'scenario2' : 1 + All items perform as well as consumables classified as 'Vital' in the Essential Medicines List +4. 'scenario3' : 2 + All facilities perform as well as those in which consumables stock is managed by pharmacists +5. 'scenario4' : 3 + Level 1a facilities perform as well as level 1b +6. 'scenario5' : 4 + All facilities perform as well as CHAM facilities +7. 
'scenario6' : All facilities have the same probability of consumable availability as the 75th percentile best performing facility for each individual item
+8. 'scenario7' : All facilities have the same probability of consumable availability as the 90th percentile best performing facility for each individual item
+9. 'scenario8' : All facilities have the same probability of consumable availability as the 99th percentile best performing facility for each individual item
+10. 'all': all consumables are always available - provides the theoretical maximum health gains which can be made through improving consumable supply
+
 The batch runs are for a large population for a long time with all disease modules and full use of HSIs.
 Run on the batch system using:
 ```tlo batch-submit src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py```
@@ -29,10 +32,10 @@ def __init__(self):
         super().__init__()
         self.seed = 99
         self.start_date = Date(2010, 1, 1)
-        self.end_date = Date(2011, 12, 31)
-        self.pop_size = 50 # large population size for final simulation
-        self.number_of_draws = 2 # <- 10 scenarios (10)
-        self.runs_per_draw = 2 # <- repeated this many times
+        self.end_date = Date(2019, 12, 31)
+        self.pop_size = 100_000 # large population size for final simulation
+        self.number_of_draws = 10 # <- 10 scenarios (10)
+        self.runs_per_draw = 5 # <- repeated this many times

     def log_configuration(self):
         return {
@@ -52,11 +55,13 @@ def modules(self):
     def draw_parameters(self, draw_number, rng):
         return {
             'HealthSystem': {
-                'cons_availability': ['default', 'scenario1'][draw_number] # , 'scenario2', 'scenario3', 'scenario4', 'scenario5', 'scenario6', 'scenario7', 'scenario8', 'all'
+                'cons_availability': ['default',
+                                      'scenario1', 'scenario2', 'scenario3', 'scenario4',
+                                      'scenario5', 'scenario6', 'scenario7', 'scenario8',
+                                      'all'][draw_number]
             }
         }

-
 if __name__ == '__main__':
     from tlo.cli import scenario_run

From be63ffc6a76908a6546f7ca91e5266fa150f7fe2 Mon Sep 17 00:00:00 2001
From: sm2511
Date: Thu, 13 Jun 2024 13:49:01 +0100
Subject: [PATCH 034/100] add scenario analysis script - graph for Total DALYs
 by scenario - graph for DALYs by cause

---
 ...analysis_impact_of_consumable_scenarios.py | 238 ++++++++++++++++++
 1 file changed, 238 insertions(+)
 create mode 100644 src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py

diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py
new file mode 100644
index 0000000000..3d0929968b
--- /dev/null
+++ b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py
@@ -0,0 +1,238 @@
+"""This file uses the results of running `impact_of_cons_availability_intervention.py`
+to extract summary results for the manuscript - "Rethinking economic evaluation of
+system level interventions."
+I plan to run the simulation for a short period of 5 years (2020 - 2025) because
+holding the consumable availability constant in the short run would be more justifiable
+than holding it constant for a long period.
+""" + +import argparse +from pathlib import Path +import textwrap +from typing import Tuple + +import numpy as np +import pandas as pd +from matplotlib import pyplot as plt +from matplotlib.ticker import FuncFormatter +from collections import Counter, defaultdict + + +from tlo.analysis.utils import ( + CAUSE_OF_DEATH_OR_DALY_LABEL_TO_COLOR_MAP, + extract_results, + get_color_cause_of_death_or_daly_label, + make_age_grp_lookup, + order_of_cause_of_death_or_daly_label, + summarize, +) +import pickle + +from tlo import Date +from tlo.analysis.utils import ( + extract_params, + extract_results, + get_scenario_info, + get_scenario_outputs, + load_pickled_dataframes, + make_age_grp_lookup, + make_age_grp_types, + make_calendar_period_lookup, + make_calendar_period_type, + summarize, + write_log_to_excel, + parse_log_file, + COARSE_APPT_TYPE_TO_COLOR_MAP, + SHORT_TREATMENT_ID_TO_COLOR_MAP, + _standardize_short_treatment_id, + bin_hsi_event_details, + compute_mean_across_runs, + get_coarse_appt_type, + get_color_short_treatment_id, + order_of_short_treatment_ids, + plot_stacked_bar_chart, + squarify_neat, + unflatten_flattened_multi_index_in_logging, +) + +outputspath = Path('./outputs/') +resourcefilepath = Path("./resources") + +PREFIX_ON_FILENAME = '3' + +# Declare period for which the results will be generated (defined inclusively) + +TARGET_PERIOD = (Date(2010, 1, 1), Date(2011, 12, 31)) + +make_graph_file_name = lambda stub: output_folder / f"{stub.replace('*', '_star_')}.png" # noqa: E731 + +_, age_grp_lookup = make_age_grp_lookup() + +def target_period() -> str: + """Returns the target period as a string of the form YYYY-YYYY""" + return "-".join(str(t.year) for t in TARGET_PERIOD) + +def drop_outside_period(_df): + """Return a dataframe which only includes for which the date is within the limits defined by TARGET_PERIOD""" + return _df.drop(index=_df.index[~_df['date'].between(*TARGET_PERIOD)]) + + +# %% Gathering basic information + +# Find results_folder associated with a given batch_file and get most recent +results_folder = get_scenario_outputs('impact_of_consumable_scenarios.py', outputspath) +#results_folder = Path(outputspath/ 'impact_of_consumables_availability_intervention-2023-05-09T210307Z/') +results_folder = Path(outputspath / 'impact_of_consumables_scenarios-2024-06-10T201342Z/') + +# look at one log (so can decide what to extract) +log = load_pickled_dataframes(results_folder) + +# get basic information about the results +info = get_scenario_info(results_folder) + +# 1) Extract the parameters that have varied over the set of simulations +params = extract_params(results_folder) + + +# %% Extracting results from run + +# 1. 
DALYs averted +#----------------------------------------- +# 1.1 Difference in total DALYs accrued +def extract_total_dalys(results_folder): + + def extract_dalys_total(df: pd.DataFrame) -> pd.Series: + return pd.Series({"Total": df.drop(['date', 'sex', 'age_range', 'year'], axis = 1).sum().sum()}) + + return extract_results( + results_folder, + module="tlo.methods.healthburden", + key="dalys_stacked", + custom_generate_series=extract_dalys_total, + do_scaling=True + ) + +total_dalys_accrued = extract_total_dalys(results_folder) +dalys_summarized = summarize(total_dalys_accrued) +dalys_summarized = dalys_summarized.unstack() + +fig, ax = plt.subplots() + +# Arrays to store the values for plotting +central_vals = [] +lower_vals = [] +upper_vals = [] + +# Extract values for each parameter +for i, _p in enumerate(params['value']): + central_val = dalys_summarized[(i, 'mean')].values[0] + lower_val = dalys_summarized[(i, 'lower')].values[0] + upper_val = dalys_summarized[(i, 'upper')].values[0] + + central_vals.append(central_val) + lower_vals.append(lower_val) + upper_vals.append(upper_val) + +# Generate the plot +scenarios = params['value'] #range(len(params)) # X-axis values representing time periods +colors = plt.cm.viridis(np.linspace(0, 1, len(params['value']))) # Generate different colors for each bar + +for i in range(len(scenarios)): + ax.bar(scenarios[i], central_vals[i], color=colors[i], label=scenarios[i]) + ax.errorbar(scenarios[i], central_vals[i], yerr=[[central_vals[i] - lower_vals[i]], [upper_vals[i] - central_vals[i]]], fmt='o', color='black') + +plt.xticks(scenarios, params['value'], rotation=45) +ax.set_xlabel('Scenarios') +ax.set_ylabel('Total DALYs accrued (in millions)') + +# Format y-axis ticks to display in millions +formatter = FuncFormatter(lambda x, _: '{:,.0f}'.format(x / 1000000)) +ax.yaxis.set_major_formatter(formatter) + +#ax.set_ylim((0, 50)) +ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) +fig.tight_layout() +plt.show() + +# DALYs by disease area/intervention - for comparison of the magnitude of impact created by consumables interventions +def _extract_dalys_by_disease(_df: pd.DataFrame) -> pd.Series: + """Construct a series with index disease and value of the total of DALYS (stacked) from the + `dalys_stacked` key logged in `tlo.methods.healthburden`. + N.B. This limits the time period of interest to 2010-2019""" + _, calperiodlookup = make_calendar_period_lookup() + + return _df.loc[(_df['year'] >=2009) & (_df['year'] < 2012)]\ + .drop(columns=['date', 'sex', 'age_range', 'year'])\ + .sum(axis=0) + +dalys_extracted_by_disease = extract_results( + results_folder, + module="tlo.methods.healthburden", + key="dalys_stacked", + custom_generate_series=_extract_dalys_by_disease, + do_scaling=True +) + +dalys_by_disease_summarized = summarize(dalys_extracted_by_disease) +dalys_by_disease_summarized = dalys_by_disease_summarized.unstack() + +# Figure - Focus on top 5 diseases across the 10 scenarios? 
or do a dot plot
+# Assuming dalys_by_disease_summarized is your MultiIndex Series
+# Convert it to a DataFrame for easier manipulation
+dalys_by_disease_summarized_df = dalys_by_disease_summarized.reset_index()
+dalys_by_disease_summarized_df = dalys_by_disease_summarized_df.rename(columns = {'level_2': 'disease', 0: 'DALYs'})
+
+# Pivot the DataFrame to get 'draw' as columns, 'disease' as index, and 'DALYs' as values
+pivot_df = dalys_by_disease_summarized_df.pivot_table(
+    index='disease',
+    columns=['draw', 'stat'],
+    values='DALYs'
+)
+pivot_df = pivot_df.sort_values(by=(0, 'mean'), ascending=False)
+pivot_df = pivot_df[0:10] # Keep only top 10 conditions
+
+# Plotting
+fig, ax = plt.subplots(figsize=(12, 8))
+
+# Get the list of diseases for the x-axis
+diseases = pivot_df.index
+
+# Plot each draw with its confidence interval
+for draw in pivot_df.columns.levels[0]:
+    central_vals = pivot_df[(draw, 'mean')]
+    lower_vals = pivot_df[(draw, 'lower')]
+    upper_vals = pivot_df[(draw, 'upper')]
+
+    ax.plot(diseases, central_vals, label=f'Draw {draw}') # TODO update label to name of scenario
+    ax.fill_between(diseases, lower_vals, upper_vals, alpha=0.3)
+
+# Customize plot
+ax.set_xlabel('Cause of DALYs (Top 10)')
+ax.set_ylabel('Total DALYs accrued (in millions)')
+
+# Format y-axis ticks to display in millions
+formatter = FuncFormatter(lambda x, _: '{:,.0f}'.format(x / 1000000))
+ax.yaxis.set_major_formatter(formatter)
+
+ax.set_title('DALYs by Cause')
+ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1))
+
+# Rotate x-axis labels for better readability
+plt.xticks(rotation=45, ha='right')
+
+fig.tight_layout()
+plt.show()
+
+# TODO update the plot above so that only three scenarios are represented
+
+# 2. Mechanisms of impact
+#-----------------------------------------
+# Number of units of item which were needed but not made available for the top 25 items
+# TODO ideally this should count the number of treatment IDs but this needs the detailed health system logger
+
+# Cost of consumables?
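+# A minimal, commented-out sketch of one way the cost question above could be
+# approached, kept hedged because no consumable price data is loaded in this
+# script: the price file and the `item_counts` series below are hypothetical
+# placeholders, not existing resources or variables.
+# unit_prices = pd.read_csv(
+#     resourcefilepath / 'healthsystem' / 'consumables' / 'ResourceFile_Consumables_Prices.csv'  # hypothetical file
+# ).set_index('item_code')['unit_price']  # hypothetical columns
+# item_counts = ...  # counts of items dispensed, e.g. tallied from the 'Consumables' key of the summary log
+# total_consumable_cost = item_counts.mul(unit_prices, fill_value=0).sum()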
+ +# TODO Justify the focus on levels 1a and 1b - where do HSIs occur?; at what level is there most misallocation within districts +# TODO get graphs of percentage of successful HSIs under different scenarios for levels 1a and 1b + + From 37d9b5fdfe8ebeab31b69b8c9ec5cc14db5ae401 Mon Sep 17 00:00:00 2001 From: sm2511 Date: Fri, 14 Jun 2024 20:22:19 +0100 Subject: [PATCH 035/100] add figures showing mechanisms of impact --- ...analysis_impact_of_consumable_scenarios.py | 346 +++++++++++++++--- 1 file changed, 294 insertions(+), 52 deletions(-) diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py index 3d0929968b..265bdc5af5 100644 --- a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py +++ b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py @@ -56,13 +56,13 @@ ) outputspath = Path('./outputs/') +figurespath = Path(outputspath / 'impact_of_consumable_scenarios') +figurespath.mkdir(parents=True, exist_ok=True) # create directory if it doesn't exist resourcefilepath = Path("./resources") -PREFIX_ON_FILENAME = '3' - # Declare period for which the results will be generated (defined inclusively) -TARGET_PERIOD = (Date(2010, 1, 1), Date(2011, 12, 31)) +TARGET_PERIOD = (Date(2010, 1, 1), Date(2019, 12, 31)) make_graph_file_name = lambda stub: output_folder / f"{stub.replace('*', '_star_')}.png" # noqa: E731 @@ -77,12 +77,107 @@ def drop_outside_period(_df): return _df.drop(index=_df.index[~_df['date'].between(*TARGET_PERIOD)]) +def create_line_plot_absolute_figure(_df, _plt_var, _index_var, keep_rows, metric, _plt_name): + pivot_df = _df.pivot_table( + index=_index_var, + columns=['draw', 'stat'], + values=_plt_var + ) + pivot_df = pivot_df.sort_values(by=(0, 'mean'), ascending=False) + pivot_df = pivot_df[0:keep_rows] # Keep only top X conditions + + # Define Scnearios and colours + scenarios = params['value'] # range(len(params)) # X-axis values representing time periods + colors = plt.get_cmap('tab10')(np.linspace(0, 1, len(params['value']))) # Generate different colors for each bar + #colors = plt.cm.viridis(np.linspace(0, 1, len(params['value']))) # Generate different colors for each bar + + # Plotting + fig, ax = plt.subplots(figsize=(12, 8)) + + # Get the list of labels for the x-axis + x_axis_label_names = pivot_df.index + + # Plot each draw with its confidence interval + for draw in pivot_df.columns.levels[0]: + central_vals = pivot_df[(draw, 'mean')] + lower_vals = pivot_df[(draw, 'lower')] + upper_vals = pivot_df[(draw, 'upper')] + + ax.plot(x_axis_label_names, central_vals, label= scenarios[draw], color = colors[draw]) # TODO update label to name of scenario + ax.fill_between(x_axis_label_names, lower_vals, upper_vals, alpha=0.3, color = colors[draw]) + + # Customize plot + ax.set_ylabel(f'{metric}') + + # Format y-axis ticks to display in millions + formatter = FuncFormatter(lambda x, _: '{:,.0f}'.format(x / 1000000)) + ax.yaxis.set_major_formatter(formatter) + + #ax.set_title('DALYs by Cause') + ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1)) + + # Rotate x-axis labels for better readability + plt.xticks(rotation=45, ha='right') + + fig.tight_layout() + fig.savefig(figurespath / _plt_name) + fig.show() + plt.close(fig) + +def create_line_plot_percentage_of_baseline(_df, _plt_var, _index_var, keep_rows, metric, _plt_name): + pivot_df = _df.pivot_table( + index=_index_var, + columns=['draw', 'stat'], + values=_plt_var 
+    )
+    pivot_df = pivot_df.sort_values(by=(0, 'mean'), ascending=False)
+    pivot_df = pivot_df[0:keep_rows] # Keep only top X conditions
+
+    # Define Scenarios and colours
+    scenarios = params['value'] # range(len(params)) # X-axis values representing time periods
+    colors = plt.get_cmap('tab10')(np.linspace(0, 1, len(params['value']))) # Generate different colors for each bar
+    #colors = plt.cm.viridis(np.linspace(0, 1, len(params['value']))) # Generate different colors for each bar
+
+    # Plotting
+    fig, ax = plt.subplots(figsize=(12, 8))
+
+    # Get the list of labels for the x-axis
+    x_axis_label_names = pivot_df.index
+
+    # Plot each draw with its confidence interval
+    for draw in pivot_df.columns.levels[0]:
+        central_vals = pivot_df[(draw, 'mean')]
+        lower_vals = pivot_df[(draw, 'lower')]
+        upper_vals = pivot_df[(draw, 'upper')]
+
+        ax.plot(x_axis_label_names, central_vals, label= scenarios[draw], color = colors[draw]) # TODO update label to name of scenario
+        ax.fill_between(x_axis_label_names, lower_vals, upper_vals, alpha=0.3, color = colors[draw])
+
+    # Customize plot
+    ax.set_ylabel(f'{metric}')
+
+    # Format y-axis ticks to display in millions
+    formatter = FuncFormatter(lambda x, _: '{:,.0f}'.format(x / 1000000))
+    ax.yaxis.set_major_formatter(formatter)
+
+    #ax.set_title('DALYs by Cause')
+    ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1))
+
+    # Rotate x-axis labels for better readability
+    plt.xticks(rotation=45, ha='right')
+
+    fig.tight_layout()
+    fig.savefig(figurespath / _plt_name)
+    fig.show()
+    plt.close(fig)
+
+def create_line_plot_percentage_of_baseline(_df, _plt_var, _index_var, keep_rows, metric, _plt_name):
+    pivot_df = _df.pivot_table(
+        index=_index_var,
+        columns=['draw', 'stat'],
+        values=_plt_var
+    )
+    pivot_df = pivot_df.sort_values(by=(0, 'mean'), ascending=False)
+    pivot_df = pivot_df[0:keep_rows] # Keep only top X conditions
+
+    # Define Scenarios and colours
+    scenarios = params['value'] # range(len(params)) # X-axis values representing time periods
+    colors = plt.get_cmap('tab10')(np.linspace(0, 1, len(params['value']))) # Generate different colors for each bar
+
+    # Plotting
+    fig, ax = plt.subplots(figsize=(12, 8))
+
+    # Get the list of labels for the x-axis
+    x_axis_label_names = pivot_df.index
+
+    # Plot each draw with its confidence interval
+    for draw in pivot_df.columns.levels[0]:
+        if scenarios[draw] == 'default': # Because this is a comparative graph and 'default' is the baseline for comparison
+            pass
+        else:
+            central_vals = (1 - pivot_df[(draw, 'mean')]/pivot_df[(0, 'mean')]) * 100  # this shows the % reduction in DALYs compared to baseline
+            lower_vals = (1 - pivot_df[(draw, 'lower')]/pivot_df[(0, 'lower')]) * 100
+            upper_vals = (1 - pivot_df[(draw, 'upper')]/pivot_df[(0, 'upper')]) * 100
+
+            ax.plot(x_axis_label_names, central_vals, label= scenarios[draw], color = colors[draw]) # TODO update label to name of scenario
+            ax.fill_between(x_axis_label_names, lower_vals, upper_vals, alpha=0.3, color = colors[draw])
+
+    # Customize plot
+    ax.set_ylabel(f'Percentage reduction in {metric} compared to baseline')
+
+    # Formatting y-axis as percentages
+    ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0f}%'.format(y)))
+
+    #ax.set_title('DALYs by Cause')
+    ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1))
+
+    # Rotate x-axis labels for better readability
+    plt.xticks(rotation=45, ha='right')
+
+    fig.tight_layout()
+    fig.savefig(figurespath / _plt_name)
+    fig.show()
+    plt.close(fig)
+
+
 # %% Gathering basic information

 # Find results_folder associated with a given batch_file and get most recent
-results_folder = get_scenario_outputs('impact_of_consumable_scenarios.py', outputspath)
-#results_folder = Path(outputspath/ 'impact_of_consumables_availability_intervention-2023-05-09T210307Z/')
-results_folder = Path(outputspath / 'impact_of_consumables_scenarios-2024-06-10T201342Z/')
+#results_folder = get_scenario_outputs('impact_of_consumable_scenarios.py', outputspath)
+results_folder = Path(outputspath / 'sakshi.mohan@york.ac.uk/impact_of_consumables_scenarios-2024-06-11T204007Z/')

 # look at one log (so can decide what to extract)
 log = load_pickled_dataframes(results_folder)

 # get basic information about the results
 info = get_scenario_info(results_folder)

 # 1) Extract the parameters that have varied over the set of simulations
 params = extract_params(results_folder)

@@ -112,9 +207,8 @@ def extract_dalys_total(df: pd.DataFrame) -> pd.Series:
         do_scaling=True
     )

-total_dalys_accrued = extract_total_dalys(results_folder)
-dalys_summarized = summarize(total_dalys_accrued)
-dalys_summarized = dalys_summarized.unstack()
+total_dalys_accrued = summarize(extract_total_dalys(results_folder))
+total_dalys_accrued = total_dalys_accrued.unstack()

 fig, ax = plt.subplots()

@@ -125,9 +219,9 @@ def extract_dalys_total(df: pd.DataFrame) -> pd.Series:
 # Extract values for each parameter
 for i, _p in enumerate(params['value']):
-    central_val = dalys_summarized[(i, 'mean')].values[0]
-    lower_val = dalys_summarized[(i, 'lower')].values[0]
-    upper_val = dalys_summarized[(i, 'upper')].values[0]
+    central_val = total_dalys_accrued[(i, 'mean')].values[0]
+    lower_val = total_dalys_accrued[(i, 'lower')].values[0]
+    upper_val = total_dalys_accrued[(i, 'upper')].values[0]

     central_vals.append(central_val)
     lower_vals.append(lower_val)
     upper_vals.append(upper_val)
@@ -180,59 +274,207 @@ def _extract_dalys_by_disease(_df: pd.DataFrame) -> pd.Series:
 # Assuming dalys_by_disease_summarized is your MultiIndex Series
 #
Convert it to a DataFrame for easier manipulation dalys_by_disease_summarized_df = dalys_by_disease_summarized.reset_index() -dalys_by_disease_summarized_df = dalys_by_disease_summarized_df.rename(columns = {'level_2': 'disease', 0: 'DALYs'}) +dalys_by_disease_summarized_df = dalys_by_disease_summarized_df.rename(columns = {'level_2': 'cause', 0: 'DALYs'}) + +create_line_plot_absolute_figure(_df = dalys_by_disease_summarized_df, + _plt_var = "DALYs", + _index_var = "cause", + keep_rows = 10, # keep top 10 causes + metric='DALYs accrued', + _plt_name = 'DALYs_by_cause.png') + +create_line_plot_percentage_of_baseline(_df = dalys_by_disease_summarized_df, + _plt_var = "DALYs", + _index_var = "cause", + keep_rows = 10, # keep top 10 causes + metric = 'DALYs accrued', + _plt_name = 'DALYs_by_cause_percentage.png') -# Pivot the DataFrame to get 'draw' as columns, 'disease' as index, and 'DALYs' as values -pivot_df = dalys_by_disease_summarized_df.pivot_table( - index='disease', - columns=['draw', 'stat'], - values='DALYs' +# 2. Mechanisms of impact +#----------------------------------------- +# Number of units of item which were needed but not made available for the top 25 items +# TODO ideally this should count the number of treatment IDs but this needs the detailed health system logger +def consumables_availability_figure(results_folder: Path, output_folder: Path, resourcefilepath: Path): + """ 'Figure 3': Usage of consumables in the HealthSystem""" + make_graph_file_name = lambda stub: output_folder / f"Fig3_consumables_availability_figure.png" # noqa: E731 + + def get_counts_of_items_requested(_df): + _df = drop_outside_period(_df) + + counts_of_available = defaultdict(int) + counts_of_not_available = defaultdict(int) + + for _, row in _df.iterrows(): + for item, num in row['Item_Available'].items(): + counts_of_available[item] += num + for item, num in row['Item_NotAvailable'].items(): # eval(row['Item_NotAvailable']) + counts_of_not_available[item] += num + + return pd.concat( + {'Available': pd.Series(counts_of_available), 'Not_Available': pd.Series(counts_of_not_available)}, + axis=1 + ).fillna(0).astype(int).stack() + + cons_req = summarize( + extract_results( + results_folder, + module='tlo.methods.healthsystem.summary', + key='Consumables', + custom_generate_series=get_counts_of_items_requested, + do_scaling=True + ), + only_mean=True, + collapse_columns=True + ) + + cons = cons_req.unstack() + cons_names = pd.read_csv( + resourcefilepath / 'healthsystem' / 'consumables' / 'ResourceFile_Consumables_Items_and_Packages.csv' + )[['Item_Code', 'Items']].set_index('Item_Code').drop_duplicates() + cons_names.index = cons_names.index.astype(str) + cons = cons.merge(cons_names, left_index=True, right_index=True, how='left').set_index('Items') #.astype(int) + cons = cons.assign(total=cons.sum(1)).sort_values('total').drop(columns='total') + + cons.columns = pd.MultiIndex.from_tuples(cons.columns, names=['draw', 'stat', 'var']) + cons_not_available = cons.loc[:, cons.columns.get_level_values(2) == 'Not_Available'] + cons_not_available.mean = cons_not_available.loc[:, cons_not_available.columns.get_level_values(1) == 'mean'] + cons_available = cons.loc[:, cons.columns.get_level_values(2) == 'Available'] + + cons_not_available = cons_not_available.unstack().reset_index() + cons_not_available = cons_not_available.rename(columns={0: 'qty_not_available'}) + + create_line_plot_absolute_figure(_df=cons_not_available, + _plt_var="qty_not_available", + _index_var="Items", + keep_rows=25, # keep top 25 
demanded consumables
+                                     metric="consumable demand not met",
+                                     _plt_name='consumables_demand_not_met.png')
+
+    create_line_plot_percentage_of_baseline(_df=cons_not_available,
+                                            _plt_var="qty_not_available",
+                                            _index_var="Items",
+                                            keep_rows=25, # keep top 25 demanded consumables
+                                            metric = "consumable demand not met",
+                                            _plt_name='consumables_demand_not_met_percentage.png')
+
+consumables_availability_figure(results_folder, outputspath, resourcefilepath)
+
+# HSI affected by missing consumables
+# We need healthsystem logger for this
+
+# HSIs taking place by level in the default scenario
+def get_counts_of_hsis(_df):
+    _df = drop_outside_period(_df)
+
+    # Initialize an empty dictionary to store the total counts
+    total_hsi_count = {}
+
+    for date, appointment_dict in _df['Number_By_Appt_Type_Code_And_Level'].items():
+        for level, appointments_at_level in appointment_dict.items():
+            if level not in total_hsi_count:
+                total_hsi_count[level] = {}
+            for appointment_type, count in appointments_at_level.items():
+                if appointment_type in total_hsi_count[level]:
+                    total_hsi_count[level][appointment_type] += count
+                else:
+                    total_hsi_count[level][appointment_type] = count
+
+    total_hsi_count_series = pd.Series(total_hsi_count)
+    for level in ['0', '1a', '1b', '2', '3', '4']:
+        appointments_at_level = pd.Series(total_hsi_count_series[total_hsi_count_series.index == level].values[0], dtype='int')
+        # Create a list of tuples with the original index and the current level
+        new_index_tuples = [(idx, level) for idx in appointments_at_level.index]
+        # Create the new MultiIndex
+        new_index = pd.MultiIndex.from_tuples(new_index_tuples, names=['Appointment', 'Level'])
+        # Reindex the Series with the new MultiIndex
+        appointments_at_level_multiindex = appointments_at_level.copy()
+        appointments_at_level_multiindex.index = new_index
+        if level == '0':
+            appointments_all_levels = appointments_at_level_multiindex
+        else:
+            appointments_all_levels = pd.concat([appointments_all_levels, appointments_at_level_multiindex], axis = 0)
+
+    return pd.Series(appointments_all_levels).fillna(0).astype(int)
+
+hsi_count = summarize(
+    extract_results(
+        results_folder,
+        module='tlo.methods.healthsystem.summary',
+        key='HSI_Event',
+        custom_generate_series=get_counts_of_hsis,
+        do_scaling=True
+    ),
+    only_mean=True,
+    collapse_columns=True
+)
+
+hsi = hsi_count.assign(baseline_values=hsi_count[(0, 'mean')]).sort_values('baseline_values').drop(columns='baseline_values')
+hsi.columns = pd.MultiIndex.from_tuples(hsi.columns, names=['draw', 'stat'])
+#hsi = hsi.unstack().reset_index()
+hsi_stacked = hsi.stack().stack().reset_index()
+hsi_stacked = hsi_stacked.rename(columns={0: 'hsis_requested'})
+
+create_line_plot_absolute_figure(_df=hsi_stacked,
+                                 _plt_var="hsis_requested",
+                                 _index_var="Level",
+                                 keep_rows=6, # show all levels
+                                 metric = 'HSIs',
+                                 _plt_name='hsis_requested.png')
+
+create_line_plot_percentage_of_baseline(_df=hsi_stacked,
+                                        _plt_var="hsis_requested",
+                                        _index_var="Level",
+                                        keep_rows=6, # show all levels
+                                        metric = 'HSIs',
+                                        _plt_name='hsis_requested_percentage.png')
+
+
+# Appointments by treatment ID
+def get_counts_of_treatments(_df):
+    _df = drop_outside_period(_df)
+
+    counts_of_treatments = defaultdict(int)
+
+    for _, row in _df.iterrows():
+        for item, num in row['TREATMENT_ID'].items():
+            counts_of_treatments[item] += num
+
+    return pd.Series(counts_of_treatments).fillna(0).astype(int)
+
+
+count_of_treatments_delivered = summarize(
+    extract_results(
+        results_folder,
+        module='tlo.methods.healthsystem.summary',
+        key='HSI_Event',
+        custom_generate_series=get_counts_of_treatments,
+        do_scaling=True
+    ),
+    only_mean=True,
+    collapse_columns=True
+)
+count_of_treatments_delivered = count_of_treatments_delivered.assign(baseline_values=count_of_treatments_delivered[(0, 'mean')]).sort_values('baseline_values').drop(columns='baseline_values')
+treatments_delivered = count_of_treatments_delivered.unstack().reset_index()
+treatments_delivered = treatments_delivered.rename(columns={'level_2': 'Treatment_ID', 0: 'qty_delivered'})
+
+create_line_plot_absolute_figure(_df=treatments_delivered,
+                                 _plt_var="qty_delivered",
+                                 _index_var="Treatment_ID",
+                                 keep_rows=20, # keep top 20 treatment IDs
+                                 metric = 'Number of HSIs requested by Treatment ID',
+                                 _plt_name='treatments_delivered.png')
+
+create_line_plot_percentage_of_baseline(_df=treatments_delivered,
+                                        _plt_var="qty_delivered",
+                                        _index_var="Treatment_ID",
+                                        keep_rows=20, # keep top 20 treatment IDs
+                                        metric = 'number of HSIs requested by Treatment ID',
+                                        _plt_name='treatments_delivered_percentage.png')

# Cost of consumables?
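+# A rough, commented-out costing sketch for the unmet demand tallied inside
+# consumables_availability_figure() above; it assumes that function is changed
+# to `return cons_not_available`, and that a unit-price table keyed by item
+# name exists - both the return value and the price file are hypothetical.
+# cons_not_available = consumables_availability_figure(results_folder, outputspath, resourcefilepath)  # would need a `return`
+# unit_prices = pd.read_csv('consumable_unit_prices.csv')  # hypothetical input with columns ['Items', 'unit_price']
+# costed = cons_not_available.merge(unit_prices, on='Items', how='left')
+# costed['cost_of_unmet_demand'] = costed['qty_not_available'] * costed['unit_price']
+# print(costed.groupby('draw')['cost_of_unmet_demand'].sum())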
+
+# TODO Justify the focus on levels 1a and 1b - where do HSIs occur?; at what level is there most misallocation within districts
+# TODO get graphs of percentage of successful HSIs under different scenarios for levels 1a and 1b
+# TODO is there a way to link consumables directly to DALYs (how many DALYs are lost due to stockouts of specific consumables)
+# TODO why are there no appointments at level 1b

From 30548abb1387c03c67e9a631a348b99bb076de67 Mon Sep 17 00:00:00 2001
From: sm2511
Date: Sat, 15 Jun 2024 15:47:52 +0100
Subject: [PATCH 036/100] update figure legend to show cleaner scenario names

---
 ...analysis_impact_of_consumable_scenarios.py | 32 +++++++++++--------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py
index 265bdc5af5..68b00468f0 100644
--- a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py
+++ b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py
@@ -87,7 +87,7 @@ def create_line_plot_absolute_figure(_df, _plt_var, _index_var, keep_rows, metri
     pivot_df = pivot_df[0:keep_rows] # Keep only top X conditions

     # Define Scenarios and colours
-    scenarios = params['value'] # range(len(params)) # X-axis values representing time periods
+    scenarios = params['name_of_scenario'] # range(len(params)) # X-axis values representing time periods
     colors = plt.get_cmap('tab10')(np.linspace(0, 1, len(params['value']))) # Generate different colors for each bar
     #colors = plt.cm.viridis(np.linspace(0, 1, len(params['value']))) # Generate different colors for each bar

@@ -110,8 +110,8 @@ def create_line_plot_absolute_figure(_df, _plt_var, _index_var, keep_rows, metri
     ax.set_ylabel(f'{metric}')

     # Format y-axis ticks to display in millions
-    formatter = FuncFormatter(lambda x, _: '{:,.0f}'.format(x / 1000000))
-    ax.yaxis.set_major_formatter(formatter)
+    #formatter = FuncFormatter(lambda x, _: '{:,.0f}'.format(x / 1000000))
+    #ax.yaxis.set_major_formatter(formatter)

     #ax.set_title('DALYs by Cause')
     ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1))
@@ -134,7 +134,7 @@ def create_line_plot_percentage_of_baseline(_df, _plt_var, _index_var, keep_rows
     pivot_df = pivot_df[0:keep_rows] # Keep only top X conditions

     # Define Scenarios and colours
-    scenarios = params['value'] # range(len(params)) # X-axis values representing time periods
+    scenarios = params['name_of_scenario'] # range(len(params)) # X-axis values representing time periods
     colors = plt.get_cmap('tab10')(np.linspace(0, 1, len(params['value']))) # Generate different colors for each bar

     # Plotting
@@ -145,18 +145,18 @@ def create_line_plot_percentage_of_baseline(_df, _plt_var, _index_var, keep_rows
     # Plot each draw with its confidence interval
     for draw in pivot_df.columns.levels[0]:
-        if scenarios[draw] == 'default': # Because this is a comparative graph and 'default' is the baseline for comparison
+        if scenarios[draw] == 'Actual': # Because this is a comparative graph and 'default' is the baseline for comparison
             pass
         else:
-            central_vals = (1 - pivot_df[(draw, 'mean')]/pivot_df[(0, 'mean')]) * 100  # this shows the % reduction in DALYs compared to baseline
-            lower_vals = (1 - pivot_df[(draw, 'lower')]/pivot_df[(0, 'lower')]) * 100
-            upper_vals = (1 - pivot_df[(draw, 'upper')]/pivot_df[(0, 'upper')]) * 100
+            central_vals = pivot_df[(draw, 'mean')]/pivot_df[(0, 'mean')] * 100  # this shows DALYs as a percentage of the baseline ('Actual') scenario
+            lower_vals = pivot_df[(draw, 'lower')]/pivot_df[(0, 'lower')] * 100
+            upper_vals = pivot_df[(draw, 'upper')]/pivot_df[(0, 'upper')] * 100

             ax.plot(x_axis_label_names, central_vals, label= scenarios[draw], color = colors[draw]) # TODO update label to name of scenario
             ax.fill_between(x_axis_label_names, lower_vals, upper_vals, alpha=0.3, color = colors[draw])

     # Customize plot
-    ax.set_ylabel(f'Percentage reduction in {metric} compared to baseline')
+    ax.set_ylabel(f"{metric} (percentage of 'Actual')")

     # Formatting y-axis as percentages
     ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0f}%'.format(y)))
@@ -187,6 +187,12 @@ def create_line_plot_percentage_of_baseline(_df, _plt_var, _index_var, keep_rows
 # 1) Extract the parameters that have varied over the set of simulations
 params = extract_params(results_folder)
+params_dict = {'default': 'Actual', 'scenario1': 'Scenario 1', 'scenario2': 'Scenario 2',
+               'scenario3': 'Scenario 3', 'scenario4': 'Scenario 4', 'scenario5': 'Scenario 5',
+               'scenario6': 'Scenario 6', 'scenario7': 'Scenario 7', 'scenario8': 'Scenario 8',
+               'all': 'Perfect'}
+params_dict_df = pd.DataFrame.from_dict(params_dict, orient='index', columns=['name_of_scenario']).reset_index().rename(columns = {'index': 'value'})
+params = params.merge(params_dict_df, on = 'value', how = 'left', validate = '1:1')

 # %% Extracting results from run

@@ -228,14 +234,14 @@ def extract_dalys_total(df: pd.DataFrame) -> pd.Series:
     upper_vals.append(upper_val)

 # Generate the plot
-scenarios = params['value'] #range(len(params)) # X-axis values representing time periods
-colors = plt.cm.viridis(np.linspace(0, 1, len(params['value']))) # Generate different colors for each bar
+scenarios = params['name_of_scenario'] #range(len(params)) # X-axis values representing time periods
+colors = plt.get_cmap('tab10')(np.linspace(0, 1, len(params['value']))) # Generate different colors for each bar

 for i in range(len(scenarios)):
     ax.bar(scenarios[i], central_vals[i], color=colors[i], label=scenarios[i])
     ax.errorbar(scenarios[i], central_vals[i], yerr=[[central_vals[i] - lower_vals[i]], [upper_vals[i] - central_vals[i]]], fmt='o', color='black')

-plt.xticks(scenarios, params['value'], rotation=45)
+plt.xticks(scenarios, params['name_of_scenario'], rotation=45)
 ax.set_xlabel('Scenarios')
 ax.set_ylabel('Total DALYs accrued (in millions)')

@@ -244,7 +250,7 @@ def extract_dalys_total(df: pd.DataFrame) -> pd.Series:
 ax.yaxis.set_major_formatter(formatter)

 #ax.set_ylim((0, 50))
-ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
+#ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
 fig.tight_layout()
 plt.show()

From 4d7ae4272a16612b2240ace93bc7721d5f9b3384 Mon Sep 17 00:00:00 2001
From: sm2511
Date: Sat, 15 Jun 2024 15:51:34 +0100
Subject: [PATCH 037/100] minor script cleaning

---
 .../analysis_impact_of_consumable_scenarios.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py
index 68b00468f0..8acab90c98 100644
--- a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py
+++ b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py
@@ -197,9 +197,9 @@

 # %% Extracting results from run

-# 1. DALYs averted
+# 1.
DALYs accrued
 #-----------------------------------------
-# 1.1 Difference in total DALYs accrued
+# 1.1 Total DALYs accrued
 def extract_total_dalys(results_folder):

     def extract_dalys_total(df: pd.DataFrame) -> pd.Series:
@@ -254,7 +254,7 @@ def extract_dalys_total(df: pd.DataFrame) -> pd.Series:
 fig.tight_layout()
 plt.show()

-# DALYs by disease area/intervention - for comparison of the magnitude of impact created by consumables interventions
+# 1.2 DALYs by disease area/intervention - for comparison of the magnitude of impact created by consumables interventions
 def _extract_dalys_by_disease(_df: pd.DataFrame) -> pd.Series:
     """Construct a series with index disease and value of the total of DALYS (stacked) from the
     `dalys_stacked` key logged in `tlo.methods.healthburden`.
@@ -296,7 +296,7 @@ def _extract_dalys_by_disease(_df: pd.DataFrame) -> pd.Series:
                                         metric = 'DALYs accrued',
                                         _plt_name = 'DALYs_by_cause_percentage.png')

-# 2. Mechanisms of impact
+# 2. Consumable demand not met
 #-----------------------------------------
 # Number of units of item which were needed but not made available for the top 25 items
 # TODO ideally this should count the number of treatment IDs but this needs the detailed health system logger
@@ -368,6 +368,8 @@ def get_counts_of_items_requested(_df):
 # HSI affected by missing consumables
 # We need healthsystem logger for this

+# 3. Number of Health System Interactions
+#-----------------------------------------
 # HSIs taking place by level in the default scenario
 def get_counts_of_hsis(_df):
     _df = drop_outside_period(_df)
@@ -426,7 +438,9 @@ def get_counts_of_hsis(_df):
                                         metric = 'HSIs',
                                         _plt_name='hsis_requested.png')

-# Appointments by treatment ID
+
+# 4. Number of Services delivered
+#-----------------------------------------
 def get_counts_of_treatments(_df):
     _df = drop_outside_period(_df)

@@ -484,3 +488,4 @@ def get_counts_of_treatments(_df):
 # TODO get graphs of percentage of successful HSIs under different scenarios for levels 1a and 1b
 # TODO is there a way to link consumables directly to DALYs (how many DALYs are lost due to stockouts of specific consumables)
 # TODO why are there no appointments at level 1b
+

From db710c7bf5345523919033c3663e2c16390c8405 Mon Sep 17 00:00:00 2001
From: sm2511
Date: Sat, 15 Jun 2024 16:47:02 +0100
Subject: [PATCH 038/100] add heatmap summarising consumable availability in
 the RF

---
 ...analysis_impact_of_consumable_scenarios.py | 53 ++++++++++++++++++-
 1 file changed, 51 insertions(+), 2 deletions(-)

diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py
index 8acab90c98..ad4b2b0d55 100644
--- a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py
+++ b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py
@@ -16,7 +16,7 @@
 from matplotlib import pyplot as plt
 from matplotlib.ticker import FuncFormatter
 from collections import Counter, defaultdict
-
+import seaborn as sns

 from tlo.analysis.utils import (
     CAUSE_OF_DEATH_OR_DALY_LABEL_TO_COLOR_MAP,
@@ def _extract_dalys_by_disease(_df:
pd.DataFrame) -> pd.Series: @@ -484,6 +486,53 @@ def get_counts_of_treatments(_df): # Cost of consumables? +# %% Summarizing input resourcefile data + +# 1. Consumable availability by category and level +#-------------------------------------------------- +tlo_availability_df = pd.read_csv(resourcefilepath / 'healthsystem'/ 'consumables' / "ResourceFile_Consumables_availability_small.csv") + +# Attach district, facility level, program to this dataset +mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv") +districts = set(pd.read_csv(resourcefilepath / 'demography' / 'ResourceFile_Population_2010.csv')['District']) +fac_levels = {'0', '1a', '1b', '2', '3', '4'} +tlo_availability_df = tlo_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], + on = ['Facility_ID'], how='left') +# Attach programs +programs = pd.read_csv(resourcefilepath / 'healthsystem'/ 'consumables' / "ResourceFile_Consumables_availability_and_usage.csv")[['category', 'item_code', 'module_name']] +programs = programs.drop_duplicates('item_code') +tlo_availability_df = tlo_availability_df.merge(programs, on = ['item_code'], how = 'left') + +# Generate a heatmap +# Pivot the DataFrame +aggregated_df = tlo_availability_df.groupby(['category', 'Facility_Level'])['available_prop'].mean().reset_index() +heatmap_data = aggregated_df.pivot("category", "Facility_Level", "available_prop") + +# Calculate the aggregate row and column +aggregate_col= tlo_availability_df.groupby('Facility_Level')['available_prop'].mean() +aggregate_row = tlo_availability_df.groupby('category')['available_prop'].mean() +overall_aggregate = tlo_availability_df['available_prop'].mean() + +# Add aggregate row and column +heatmap_data['Average'] = aggregate_row +aggregate_col['Average'] = overall_aggregate +heatmap_data.loc['Average'] = aggregate_col + +# Generate the heatmap +plt.figure(figsize=(10, 8)) +sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) + +# Customize the plot +#plt.title('Consumable availability by Facility Level and Category') +plt.xlabel('Facility Level') +plt.ylabel('Category') +plt.xticks(rotation=45) +plt.yticks(rotation=0) + +plt.savefig(figurespath /'consumable_availability_heatmap.png', dpi=300, bbox_inches='tight') +plt.show() +plt.close() + # TODO Justify the focus on levels 1a and 1b - where do HSIs occur?; at what level is there most misallocation within districts # TODO get graphs of percentage of successful HSIs under different scenarios for levels 1a and 1b # TODO is there a way to link consumables directly to DALYs (how many DALYs are lost due to stockouts of specific consumables) From 1c68e6b96a3ab5907885334cb3656a4018fc3a29 Mon Sep 17 00:00:00 2001 From: sm2511 Date: Sat, 22 Jun 2024 18:44:43 +0100 Subject: [PATCH 039/100] remove line plots for mechanisms of impact --- ...analysis_impact_of_consumable_scenarios.py | 265 +++++++----------- 1 file changed, 98 insertions(+), 167 deletions(-) diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py index ad4b2b0d55..09a3726f58 100644 --- a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py +++ b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py @@ -17,6 +17,7 @@ from matplotlib.ticker import FuncFormatter from collections import Counter, defaultdict import seaborn 
as sns
+import squarify

 from tlo.analysis.utils import (
     CAUSE_OF_DEATH_OR_DALY_LABEL_TO_COLOR_MAP,
@@ -76,103 +77,6 @@ def drop_outside_period(_df):
     """Return a dataframe which only includes for which the date is within the limits defined by TARGET_PERIOD"""
     return _df.drop(index=_df.index[~_df['date'].between(*TARGET_PERIOD)])

-
-def create_line_plot_absolute_figure(_df, _plt_var, _index_var, keep_rows, metric, _plt_name):
-    pivot_df = _df.pivot_table(
-        index=_index_var,
-        columns=['draw', 'stat'],
-        values=_plt_var
-    )
-    pivot_df = pivot_df.sort_values(by=(0, 'mean'), ascending=False)
-    pivot_df = pivot_df[0:keep_rows] # Keep only top X conditions
-
-    # Define Scenarios and colours
-    scenarios = params['name_of_scenario'] # range(len(params)) # X-axis values representing time periods
-    colors = plt.get_cmap('tab10')(np.linspace(0, 1, len(params['value']))) # Generate different colors for each bar
-    #colors = plt.cm.viridis(np.linspace(0, 1, len(params['value']))) # Generate different colors for each bar
-
-    # Plotting
-    fig, ax = plt.subplots(figsize=(12, 8))
-
-    # Get the list of labels for the x-axis
-    x_axis_label_names = pivot_df.index
-
-    # Plot each draw with its confidence interval
-    for draw in pivot_df.columns.levels[0]:
-        central_vals = pivot_df[(draw, 'mean')]
-        lower_vals = pivot_df[(draw, 'lower')]
-        upper_vals = pivot_df[(draw, 'upper')]
-
-        ax.plot(x_axis_label_names, central_vals, label= scenarios[draw], color = colors[draw]) # TODO update label to name of scenario
-        ax.fill_between(x_axis_label_names, lower_vals, upper_vals, alpha=0.3, color = colors[draw])
-
-    # Customize plot
-    ax.set_ylabel(f'{metric}')
-
-    # Format y-axis ticks to display in millions
-    #formatter = FuncFormatter(lambda x, _: '{:,.0f}'.format(x / 1000000))
-    #ax.yaxis.set_major_formatter(formatter)
-
-    #ax.set_title('DALYs by Cause')
-    ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1))
-
-    # Rotate x-axis labels for better readability
-    plt.xticks(rotation=45, ha='right')
-
-    fig.tight_layout()
-    fig.savefig(figurespath / _plt_name)
-    fig.show()
-    plt.close(fig)
-
-def create_line_plot_percentage_of_baseline(_df, _plt_var, _index_var, keep_rows, metric, _plt_name):
-    pivot_df = _df.pivot_table(
-        index=_index_var,
-        columns=['draw', 'stat'],
-        values=_plt_var
-    )
-    pivot_df = pivot_df.sort_values(by=(0, 'mean'), ascending=False)
-    pivot_df = pivot_df[0:keep_rows] # Keep only top X conditions
-
-    # Define Scenarios and colours
-    scenarios = params['name_of_scenario'] # range(len(params)) # X-axis values representing time periods
-    colors = plt.get_cmap('tab10')(np.linspace(0, 1, len(params['value']))) # Generate different colors for each bar
-
-    # Plotting
-    fig, ax = plt.subplots(figsize=(12, 8))
-
-    # Get the list of labels for the x-axis
-    x_axis_label_names = pivot_df.index
-
-    # Plot each draw with its confidence interval
-    for draw in pivot_df.columns.levels[0]:
-        if scenarios[draw] == 'Actual': # Because this is a comparative graph and 'default' is the baseline for comparison
-            pass
-        else:
-            central_vals = pivot_df[(draw, 'mean')]/pivot_df[(0, 'mean')] * 100  # this shows DALYs as a percentage of the baseline ('Actual') scenario
-            lower_vals = pivot_df[(draw, 'lower')]/pivot_df[(0, 'lower')] * 100
-            upper_vals = pivot_df[(draw, 'upper')]/pivot_df[(0, 'upper')] * 100
-
-            ax.plot(x_axis_label_names, central_vals, label= scenarios[draw], color = colors[draw]) # TODO update label to name of scenario
-            ax.fill_between(x_axis_label_names, lower_vals, upper_vals, alpha=0.3, color = colors[draw])
-
-    #
Customize plot - ax.set_ylabel(f"{metric} (percentage of 'Actual')") - - # Formatting y-axis as percentages - ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0f}%'.format(y))) - - #ax.set_title('DALYs by Cause') - ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1)) - - # Rotate x-axis labels for better readability - plt.xticks(rotation=45, ha='right') - - fig.tight_layout() - fig.savefig(figurespath / _plt_name) - fig.show() - plt.close(fig) - - # %% Gathering basic information # Find results_folder associated with a given batch_file and get most recent @@ -215,6 +119,18 @@ def extract_dalys_total(df: pd.DataFrame) -> pd.Series: total_dalys_accrued = summarize(extract_total_dalys(results_folder)) total_dalys_accrued = total_dalys_accrued.unstack() +print(total_dalys_accrued) + +''' +total_dalys_accrued = total_dalys_accrued.reset_index() +total_dalys_accrued = total_dalys_accrued.rename(columns = {'level_2': 'var', 0: 'DALYs'}) + +create_bar_plot_dalys_averted(_df = total_dalys_accrued, + _plt_var = "DALYs", + _index_var = "cause", + metric='DALYs accrued', + _plt_name = 'DALYs_averted.png') +''' fig, ax = plt.subplots() @@ -278,26 +194,56 @@ def _extract_dalys_by_disease(_df: pd.DataFrame) -> pd.Series: dalys_by_disease_summarized = summarize(dalys_extracted_by_disease) dalys_by_disease_summarized = dalys_by_disease_summarized.unstack() +for disease in ['AIDS', 'Lower respiratory infections', 'Neonatal Disorders', 'Malaria', 'TB (non-AIDS)']: + dalys_accrued = dalys_by_disease_summarized.xs(disease, level=2) + fig, ax = plt.subplots() + + # Arrays to store the values for plotting + central_vals = [] + lower_vals = [] + upper_vals = [] + + # Extract values for each parameter + for i, _p in enumerate(params['value']): + central_val = dalys_accrued[(i, 'mean')] + lower_val = dalys_accrued[(i, 'lower')] + upper_val = dalys_accrued[(i, 'upper')] + + central_vals.append(central_val) + lower_vals.append(lower_val) + upper_vals.append(upper_val) + + # Generate the plot + scenarios = params['name_of_scenario'] #range(len(params)) # X-axis values representing time periods + colors = plt.get_cmap('tab10')(np.linspace(0, 1, len(params['value']))) # Generate different colors for each bar + + for i in range(len(scenarios)): + ax.bar(scenarios[i], central_vals[i], color=colors[i], label=scenarios[i]) + ax.errorbar(scenarios[i], central_vals[i], yerr=[[central_vals[i] - lower_vals[i]], [upper_vals[i] - central_vals[i]]], fmt='o', color='black') + + plt.xticks(scenarios, params['name_of_scenario'], rotation=45) + ax.set_xlabel('Scenarios') + ax.set_ylabel('Total DALYs accrued (in millions)') + ax.set_title(disease) + + # Format y-axis ticks to display in millions + formatter = FuncFormatter(lambda x, _: '{:,.0f}'.format(x / 1000000)) + ax.yaxis.set_major_formatter(formatter) + + #ax.set_ylim((0, 50)) + #ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) + fig.tight_layout() + fig.savefig(figurespath / f'main_result_DALYs_{disease}.png') + fig.show() + plt.close(fig) + + # Figure - Focus on top 5 diseases across the 10 scenarios? 
0r do a dot plot # Assuming dalys_by_disease_summarized is your MultiIndex Series # Convert it to a DataFrame for easier manipulation dalys_by_disease_summarized_df = dalys_by_disease_summarized.reset_index() dalys_by_disease_summarized_df = dalys_by_disease_summarized_df.rename(columns = {'level_2': 'cause', 0: 'DALYs'}) -create_line_plot_absolute_figure(_df = dalys_by_disease_summarized_df, - _plt_var = "DALYs", - _index_var = "cause", - keep_rows = 10, # keep top 10 causes - metric='DALYs accrued', - _plt_name = 'DALYs_by_cause.png') - -create_line_plot_percentage_of_baseline(_df = dalys_by_disease_summarized_df, - _plt_var = "DALYs", - _index_var = "cause", - keep_rows = 10, # keep top 10 causes - metric = 'DALYs accrued', - _plt_name = 'DALYs_by_cause_percentage.png') - # 2. Consumable demand not met #----------------------------------------- # Number of units of item which were needed but not made available for the top 25 items @@ -351,22 +297,10 @@ def get_counts_of_items_requested(_df): cons_not_available = cons_not_available.unstack().reset_index() cons_not_available = cons_not_available.rename(columns={0: 'qty_not_available'}) - create_line_plot_absolute_figure(_df=cons_not_available, - _plt_var="qty_not_available", - _index_var="Items", - keep_rows=25, # keep top 25 demanded consumables - metric="consumable demand not met", - _plt_name='consumables_demand_not_met.png') - - create_line_plot_percentage_of_baseline(_df=cons_not_available, - _plt_var="qty_not_available", - _index_var="Items", - keep_rows=25, # keep top 25 demanded consumables - metric = "consumable demand not met", - _plt_name='consumables_demand_not_met_percentage.png') - consumables_availability_figure(results_folder, outputspath, resourcefilepath) +# TODO use squarify_plot to represent which consumables are most used in the system (by short Treatment_ID?) (not quantity but frequency) + # HSI affected by missing consumables # We need healthsystem logger for this @@ -426,63 +360,60 @@ def get_counts_of_hsis(_df): hsi_stacked = hsi.stack().stack().reset_index() hsi_stacked = hsi_stacked.rename(columns={0: 'hsis_requested'}) -create_line_plot_absolute_figure(_df=hsi_stacked, - _plt_var="hsis_requested", - _index_var="Level", - keep_rows=6, # show all levels - metric = 'HSIs', - _plt_name='hsis_requested.png') - -create_line_plot_percentage_of_baseline(_df=hsi_stacked, - _plt_var="hsis_requested", - _index_var="Level", - keep_rows=6, # show all levels - metric = 'HSIs', - _plt_name='hsis_requested.png') +# 4.1 Number of Services delivered by long Treatment_ID +#------------------------------------------------------ +def get_counts_of_hsi_by_treatment_id(_df): + """Get the counts of the short TREATMENT_IDs occurring""" + _counts_by_treatment_id = _df \ + .loc[pd.to_datetime(_df['date']).between(*TARGET_PERIOD), 'TREATMENT_ID'] \ + .apply(pd.Series) \ + .sum() \ + .astype(int) + return _counts_by_treatment_id.groupby(level=0).sum() -# 4. 
Number of Services delivered -#----------------------------------------- -def get_counts_of_treatments(_df): - _df = drop_outside_period(_df) +counts_of_hsi_by_treatment_id = summarize( + extract_results( + results_folder, + module='tlo.methods.healthsystem.summary', + key='HSI_Event', + custom_generate_series=get_counts_of_hsi_by_treatment_id, + do_scaling=True + ), + only_mean=True, + collapse_columns=True, +) - counts_of_treatments = defaultdict(int) +counts_of_hsi_by_treatment_id = counts_of_hsi_by_treatment_id.assign(baseline_values=counts_of_hsi_by_treatment_id[(0, 'mean')]).sort_values('baseline_values').drop(columns='baseline_values') +hsi_by_treatment_id = counts_of_hsi_by_treatment_id.unstack().reset_index() +hsi_by_treatment_id = hsi_by_treatment_id.rename(columns={'level_2': 'Treatment_ID', 0: 'qty_of_HSIs'}) - for _, row in _df.iterrows(): - for item, num in row['TREATMENT_ID'].items(): - counts_of_treatments[item] += num +# hsi[(0,'mean')].sum()/counts_of_hsi_by_treatment_id[(0,'mean')].sum() - return pd.Series(counts_of_treatments).fillna(0).astype(int) +# 4.2 Number of Services delivered by short Treatment ID +#-------------------------------------------------------- +def get_counts_of_hsi_by_short_treatment_id(_df): + """Get the counts of the short TREATMENT_IDs occurring (shortened, up to first underscore)""" + _counts_by_treatment_id = get_counts_of_hsi_by_treatment_id(_df) + _short_treatment_id = _counts_by_treatment_id.index.map(lambda x: x.split('_')[0] + "*") + return _counts_by_treatment_id.groupby(by=_short_treatment_id).sum() -count_of_treatments_delivered = summarize( +counts_of_hsi_by_treatment_id_short = summarize( extract_results( results_folder, module='tlo.methods.healthsystem.summary', key='HSI_Event', - custom_generate_series=get_counts_of_treatments, + custom_generate_series=get_counts_of_hsi_by_short_treatment_id, do_scaling=True ), only_mean=True, - collapse_columns=True + collapse_columns=True, ) -count_of_treatments_delivered = count_of_treatments_delivered.assign(baseline_values=count_of_treatments_delivered[(0, 'mean')]).sort_values('baseline_values').drop(columns='baseline_values') -treatments_delivered = count_of_treatments_delivered.unstack().reset_index() -treatments_delivered = treatments_delivered.rename(columns={'level_2': 'Treatment_ID', 0: 'qty_delivered'}) - -create_line_plot_absolute_figure(_df=treatments_delivered, - _plt_var="qty_delivered", - _index_var="Treatment_ID", - keep_rows=20, # show all levels - metric = 'Number of HSIs requested by Treatment ID', - _plt_name='treatments_delivered.png') - -create_line_plot_percentage_of_baseline(_df=treatments_delivered, - _plt_var="qty_delivered", - _index_var="Treatment_ID", - keep_rows=20, # show all levels - metric = 'number of HSIs requested by Treatment ID', - _plt_name='treatments_delivered_percentage.png') + +hsi_by_short_treatment_id = counts_of_hsi_by_treatment_id_short.unstack().reset_index() +hsi_by_short_treatment_id = hsi_by_short_treatment_id.rename(columns = {'level_2': 'Short_Treatment_ID', 0: 'qty_of_HSIs'}) + # Cost of consumables? 
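A minimal sketch of the treemap flagged in the squarify TODO added in this patch (illustrative only: the labels and counts below are made up; in the script the frequencies would come from a table such as `hsi_by_short_treatment_id`):

import matplotlib.pyplot as plt
import squarify  # the same package this patch imports at the top of the script

# Hypothetical counts of HSIs by short TREATMENT_ID
freq = {'Hiv*': 5200, 'Malaria*': 3100, 'Tb*': 1400, 'Epi*': 900, 'FirstAttendance*': 650}

plt.figure(figsize=(8, 5))
squarify.plot(sizes=list(freq.values()),
              label=[f"{k}\n{v:,}" for k, v in freq.items()],
              alpha=0.8)  # rectangle areas are proportional to frequency
plt.axis('off')
plt.title('Frequency of HSIs by short TREATMENT_ID (illustrative)')
plt.show()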
From 998c5ae6ffc6b4bb3e5c8ea8d9d9bf85e4de88f2 Mon Sep 17 00:00:00 2001 From: sm2511 Date: Sat, 22 Jun 2024 19:19:23 +0100 Subject: [PATCH 040/100] update total DALYs averted and total DALYs accrued figures --- ...analysis_impact_of_consumable_scenarios.py | 201 +++++++++++++----- 1 file changed, 142 insertions(+), 59 deletions(-) diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py index 09a3726f58..723309e36e 100644 --- a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py +++ b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py @@ -77,6 +77,73 @@ def drop_outside_period(_df): """Return a dataframe which only includes for which the date is within the limits defined by TARGET_PERIOD""" return _df.drop(index=_df.index[~_df['date'].between(*TARGET_PERIOD)]) +def do_bar_plot_with_ci(_df, annotations=None, xticklabels_horizontal_and_wrapped=False): + """Make a vertical bar plot for each row of _df, using the columns to identify the height of the bar and the + extent of the error bar.""" + yerr = np.array([ + (_df['mean'] - _df['lower']).values, + (_df['upper'] - _df['mean']).values, + ]) + + xticks = {(i + 0.5): k for i, k in enumerate(_df.index)} + colors = plt.get_cmap('tab10')(np.linspace(0, 1, len(params['value']))) # Generate different colors for each bar + + fig, ax = plt.subplots() + ax.bar( + xticks.keys(), + _df['mean'].values, + yerr=yerr, + alpha=1, + color = colors, + ecolor='black', + capsize=10, + label=xticks.values() + ) + if annotations: + for xpos, ypos, text in zip(xticks.keys(), _df['upper'].values, annotations): + ax.text(xpos, ypos * 1.05, text, horizontalalignment='center', fontsize = 9) + ax.set_xticks(list(xticks.keys())) + if not xticklabels_horizontal_and_wrapped: + # xticklabels will be vertical and not wrapped + ax.set_xticklabels(list(xticks.values()), rotation=90) + else: + wrapped_labs = ["\n".join(textwrap.wrap(_lab, 20)) for _lab in xticks.values()] + ax.set_xticklabels(wrapped_labs) + ax.grid(axis="y") + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + fig.tight_layout() + + return fig, ax + +def get_num_dalys(_df): + """Return total number of DALYS (Stacked) by label (total within the TARGET_PERIOD). + Throw error if not a record for every year in the TARGET PERIOD (to guard against inadvertently using + results from runs that crashed mid-way through the simulation. + """ + years_needed = [i.year for i in TARGET_PERIOD] + assert set(_df.year.unique()).issuperset(years_needed), "Some years are not recorded." + return pd.Series( + data=_df + .loc[_df.year.between(*years_needed)] + .drop(columns=['date', 'sex', 'age_range', 'year']) + .sum().sum() + ) + +def find_difference_relative_to_comparison(_ser: pd.Series, + comparison: str, + scaled: bool = False, + drop_comparison: bool = True, + ): + """Find the difference in the values in a pd.Series with a multi-index, between the draws (level 0) + within the runs (level 1), relative to where draw = `comparison`. 
+ The comparison is `X - COMPARISON`.""" + return _ser \ + .unstack(level=0) \ + .apply(lambda x: (x - x[comparison]) / (x[comparison] if scaled else 1.0), axis=1) \ + .drop(columns=([comparison] if drop_comparison else [])) \ + .stack() + # %% Gathering basic information # Find results_folder associated with a given batch_file and get most recent @@ -97,78 +164,94 @@ def drop_outside_period(_df): 'all': 'Perfect'} params_dict_df = pd.DataFrame.from_dict(params_dict, orient='index', columns=['name_of_scenario']).reset_index().rename(columns = {'index': 'value'}) params = params.merge(params_dict_df, on = 'value', how = 'left', validate = '1:1') - +scenarios = params['name_of_scenario'] #range(len(params)) # X-axis values representing time periods +drop_scenarios = ['Scenario 4', 'Scenario 5'] # Drops scenarios which are no longer considered important for comparison # %% Extracting results from run -# 1. DALYs accrued +# 1. DALYs accrued and averted #----------------------------------------- # 1.1 Total DALYs accrued -def extract_total_dalys(results_folder): - - def extract_dalys_total(df: pd.DataFrame) -> pd.Series: - return pd.Series({"Total": df.drop(['date', 'sex', 'age_range', 'year'], axis = 1).sum().sum()}) - - return extract_results( +# Get total DALYs accrued +num_dalys = extract_results( results_folder, - module="tlo.methods.healthburden", - key="dalys_stacked", - custom_generate_series=extract_dalys_total, + module='tlo.methods.healthburden', + key='dalys_stacked', + custom_generate_series=get_num_dalys, do_scaling=True ) -total_dalys_accrued = summarize(extract_total_dalys(results_folder)) -total_dalys_accrued = total_dalys_accrued.unstack() -print(total_dalys_accrued) - -''' -total_dalys_accrued = total_dalys_accrued.reset_index() -total_dalys_accrued = total_dalys_accrued.rename(columns = {'level_2': 'var', 0: 'DALYs'}) - -create_bar_plot_dalys_averted(_df = total_dalys_accrued, - _plt_var = "DALYs", - _index_var = "cause", - metric='DALYs accrued', - _plt_name = 'DALYs_averted.png') -''' +# %% Charts of total numbers of deaths / DALYS +num_dalys_summarized = summarize(num_dalys).loc[0].unstack() +num_dalys_summarized['scenario'] = scenarios.to_list() +num_dalys_summarized = num_dalys_summarized.set_index('scenario') + +# Plot DALYS averted (with xtickabels horizontal and wrapped) +name_of_plot = f'Total DALYs accrued, {target_period()}' +chosen_num_dalys_summarized = num_dalys_summarized[~num_dalys_summarized.index.isin(drop_scenarios)] +fig, ax = do_bar_plot_with_ci( + (chosen_num_dalys_summarized / 1e6).clip(lower=0.0), + annotations=[ + f"{round(row['mean']/1e6, 1)} \n ({round(row['lower']/1e6, 1)}-{round(row['upper']/1e6, 1)})" + for _, row in chosen_num_dalys_summarized.clip(lower=0.0).iterrows() + ], + xticklabels_horizontal_and_wrapped=False, +) +ax.set_title(name_of_plot) +ax.set_ylim(0, 120) +ax.set_yticks(np.arange(0, 120, 10)) +ax.set_ylabel('Total DALYs accrued \n(Millions)') +fig.tight_layout() +fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '')) +fig.show() +plt.close(fig) fig, ax = plt.subplots() -# Arrays to store the values for plotting -central_vals = [] -lower_vals = [] -upper_vals = [] - -# Extract values for each parameter -for i, _p in enumerate(params['value']): - central_val = total_dalys_accrued[(i, 'mean')].values[0] - lower_val = total_dalys_accrued[(i, 'lower')].values[0] - upper_val = total_dalys_accrued[(i, 'upper')].values[0] - - central_vals.append(central_val) - lower_vals.append(lower_val) - upper_vals.append(upper_val) 
- -# Generate the plot -scenarios = params['name_of_scenario'] #range(len(params)) # X-axis values representing time periods -colors = plt.get_cmap('tab10')(np.linspace(0, 1, len(params['value']))) # Generate different colors for each bar - -for i in range(len(scenarios)): - ax.bar(scenarios[i], central_vals[i], color=colors[i], label=scenarios[i]) - ax.errorbar(scenarios[i], central_vals[i], yerr=[[central_vals[i] - lower_vals[i]], [upper_vals[i] - central_vals[i]]], fmt='o', color='black') - -plt.xticks(scenarios, params['name_of_scenario'], rotation=45) -ax.set_xlabel('Scenarios') -ax.set_ylabel('Total DALYs accrued (in millions)') - -# Format y-axis ticks to display in millions -formatter = FuncFormatter(lambda x, _: '{:,.0f}'.format(x / 1000000)) -ax.yaxis.set_major_formatter(formatter) - -#ax.set_ylim((0, 50)) -#ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) +# 1.2 Total DALYs averted +# Get absolute DALYs averted +num_dalys_averted = summarize( + -1.0 * + pd.DataFrame( + find_difference_relative_to_comparison( + num_dalys.loc[0], + comparison= 0) # sets the comparator to 0 which is the Status Quo scenario + ).T + ).iloc[0].unstack() +num_dalys_averted['scenario'] = scenarios.to_list()[1:10] +num_dalys_averted = num_dalys_averted.set_index('scenario') + +# Get percentage DALYs averted +pc_dalys_averted = 100.0 * summarize( + -1.0 * + pd.DataFrame( + find_difference_relative_to_comparison( + num_dalys.loc[0], + comparison= 0, # sets the comparator to 0 which is the Status Quo scenario + scaled=True) + ).T +).iloc[0].unstack() +pc_dalys_averted['scenario'] = scenarios.to_list()[1:10] +pc_dalys_averted = pc_dalys_averted.set_index('scenario') + +# Plot DALYS averted (with xtickabels horizontal and wrapped) +name_of_plot = f'Additional DALYs Averted vs Actual, {target_period()}' +chosen_num_dalys_averted = num_dalys_averted[~num_dalys_averted.index.isin(drop_scenarios)] +chosen_pc_dalys_averted = pc_dalys_averted[~pc_dalys_averted.index.isin(drop_scenarios)] +fig, ax = do_bar_plot_with_ci( + (chosen_num_dalys_averted / 1e6).clip(lower=0.0), + annotations=[ + f"{round(row['mean'], 1)} % \n ({round(row['lower'], 1)}-{round(row['upper'], 1)}) %" + for _, row in chosen_pc_dalys_averted.clip(lower=0.0).iterrows() + ], + xticklabels_horizontal_and_wrapped=False, +) +ax.set_title(name_of_plot) +ax.set_ylim(0, 16) +ax.set_yticks(np.arange(0, 18, 2)) +ax.set_ylabel('Additional DALYS Averted \n(Millions)') fig.tight_layout() -fig.savefig(figurespath / 'main_result_DALYs.png') +fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '')) fig.show() plt.close(fig) From c47d771298cbf91bb3dec6e907ef92a7d0dd23be Mon Sep 17 00:00:00 2001 From: sm2511 Date: Sat, 22 Jun 2024 19:46:55 +0100 Subject: [PATCH 041/100] plot DALYs accrued by cause --- ...analysis_impact_of_consumable_scenarios.py | 59 +++++++++++++++++-- 1 file changed, 55 insertions(+), 4 deletions(-) diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py index 723309e36e..a0ae1b9f41 100644 --- a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py +++ b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py @@ -130,6 +130,20 @@ def get_num_dalys(_df): .sum().sum() ) +def get_num_dalys_by_cause(_df): + """Return total number of DALYS (Stacked) by label (total within the TARGET_PERIOD). 
+ Throw error if not a record for every year in the TARGET PERIOD (to guard against inadvertently using + results from runs that crashed mid-way through the simulation. + """ + years_needed = [i.year for i in TARGET_PERIOD] + assert set(_df.year.unique()).issuperset(years_needed), "Some years are not recorded." + return pd.Series( + data=_df + .loc[_df.year.between(*years_needed)] + .drop(columns=['date', 'sex', 'age_range', 'year']) + .sum() + ) + def find_difference_relative_to_comparison(_ser: pd.Series, comparison: str, scaled: bool = False, @@ -181,12 +195,12 @@ def find_difference_relative_to_comparison(_ser: pd.Series, do_scaling=True ) -# %% Charts of total numbers of deaths / DALYS +# %% Chart of total number of DALYS num_dalys_summarized = summarize(num_dalys).loc[0].unstack() num_dalys_summarized['scenario'] = scenarios.to_list() num_dalys_summarized = num_dalys_summarized.set_index('scenario') -# Plot DALYS averted (with xtickabels horizontal and wrapped) +# Plot DALYS accrued (with xtickabels horizontal and wrapped) name_of_plot = f'Total DALYs accrued, {target_period()}' chosen_num_dalys_summarized = num_dalys_summarized[~num_dalys_summarized.index.isin(drop_scenarios)] fig, ax = do_bar_plot_with_ci( @@ -206,8 +220,6 @@ def find_difference_relative_to_comparison(_ser: pd.Series, fig.show() plt.close(fig) -fig, ax = plt.subplots() - # 1.2 Total DALYs averted # Get absolute DALYs averted num_dalys_averted = summarize( @@ -234,6 +246,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, pc_dalys_averted['scenario'] = scenarios.to_list()[1:10] pc_dalys_averted = pc_dalys_averted.set_index('scenario') +# %% Chart of number of DALYs averted # Plot DALYS averted (with xtickabels horizontal and wrapped) name_of_plot = f'Additional DALYs Averted vs Actual, {target_period()}' chosen_num_dalys_averted = num_dalys_averted[~num_dalys_averted.index.isin(drop_scenarios)] @@ -256,6 +269,44 @@ def find_difference_relative_to_comparison(_ser: pd.Series, plt.close(fig) # 1.2 DALYs by disease area/intervention - for comparison of the magnitude of impact created by consumables interventions +num_dalys_by_cause = extract_results( + results_folder, + module='tlo.methods.healthburden', + key='dalys_stacked', + custom_generate_series=get_num_dalys_by_cause, + do_scaling=True + ) +num_dalys_by_cause_summarized = summarize(num_dalys_by_cause).unstack(level = 0) +num_dalys_by_cause_summarized = num_dalys_by_cause_summarized.reset_index() +num_dalys_by_cause_summarized = num_dalys_by_cause_summarized.rename(columns = {'level_2':'cause', 0: 'DALYs_accrued'}) +num_dalys_by_cause_summarized = num_dalys_by_cause_summarized.pivot(index=['draw','cause'], columns='stat', values='DALYs_accrued') + +# Get top 10 causes until status quo +num_dalys_by_cause_status_quo = num_dalys_by_cause_summarized[num_dalys_by_cause_summarized.index.get_level_values(0) == 0] +num_dalys_by_cause_status_quo = num_dalys_by_cause_status_quo.sort_values('mean', ascending = False) +num_dalys_by_cause_status_quo =num_dalys_by_cause_status_quo[0:10] + +for cause in num_dalys_by_cause_status_quo.index.get_level_values(1).unique(): + name_of_plot = f'Total DALYs accrued by {cause}, {target_period()}' + chosen_num_dalys_by_cause_summarized = num_dalys_by_cause_summarized[~num_dalys_by_cause_summarized.index.get_level_values(0).isin([4,5])] + chosen_num_dalys_by_cause_summarized = chosen_num_dalys_by_cause_summarized[chosen_num_dalys_by_cause_summarized.index.get_level_values(1) == cause] + fig, ax = do_bar_plot_with_ci( + 
(chosen_num_dalys_by_cause_summarized / 1e6).clip(lower=0.0),
+        annotations=[
+            f"{round(row['mean'] / 1e6, 1)} \n ({round(row['lower'] / 1e6, 1)}-{round(row['upper'] / 1e6, 1)})"
+            for _, row in chosen_num_dalys_by_cause_summarized.clip(lower=0.0).iterrows()
+        ],
+        xticklabels_horizontal_and_wrapped=False,
+    )
+    ax.set_title(name_of_plot)
+    ax.set_ylim(0, 30)
+    ax.set_yticks(np.arange(0, 30, 5))
+    ax.set_ylabel(f'Total DALYs accrued by {cause} \n(Millions)')
+    fig.tight_layout()
+    fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '').replace('/', '_'))
+    fig.show()
+    plt.close(fig)
+
 def _extract_dalys_by_disease(_df: pd.DataFrame) -> pd.Series:
     """Construct a series with index disease and value of the total of DALYS (stacked) from the
     `dalys_stacked` key logged in `tlo.methods.healthburden`.

From 847d4aed149d77c78b56de39f3408e0796a4b3f0 Mon Sep 17 00:00:00 2001
From: sm2511
Date: Sun, 23 Jun 2024 13:48:41 +0100
Subject: [PATCH 042/100] plot DALYs averted by cause

---
 ...analysis_impact_of_consumable_scenarios.py | 139 ++++++++----------
 1 file changed, 65 insertions(+), 74 deletions(-)

diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py
index a0ae1b9f41..f3c94c543f 100644
--- a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py
+++ b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py
@@ -285,8 +285,62 @@ def find_difference_relative_to_comparison(_ser: pd.Series,
 num_dalys_by_cause_status_quo = num_dalys_by_cause_summarized[num_dalys_by_cause_summarized.index.get_level_values(0) == 0]
 num_dalys_by_cause_status_quo = num_dalys_by_cause_status_quo.sort_values('mean', ascending = False)
 num_dalys_by_cause_status_quo =num_dalys_by_cause_status_quo[0:10]
+top_10_causes_of_dalys = num_dalys_by_cause_status_quo.index.get_level_values(1).unique()
+
+# Get DALYs averted by cause and plot bar charts
+for cause in top_10_causes_of_dalys:
+    num_dalys_by_cause_pivoted = num_dalys_by_cause[num_dalys_by_cause.index == cause].unstack().reset_index().drop(columns = ['level_2']).set_index(['draw', 'run'])
+    num_dalys_averted_by_cause = summarize(
+        -1.0 *
+        pd.DataFrame(
+            find_difference_relative_to_comparison(
+                num_dalys_by_cause_pivoted.squeeze(),
+                comparison= 0) # sets the comparator to 0 which is the Status Quo scenario
+        ).T
+    ).iloc[0].unstack()
+    num_dalys_averted_by_cause['scenario'] = scenarios.to_list()[1:10]
+    num_dalys_averted_by_cause = num_dalys_averted_by_cause.set_index('scenario')
+
+    # Get percentage DALYs averted
+    pc_dalys_averted_by_cause = 100.0 * summarize(
+        -1.0 *
+        pd.DataFrame(
+            find_difference_relative_to_comparison(
+                num_dalys_by_cause_pivoted.squeeze(),
+                comparison= 0, # sets the comparator to 0 which is the Status Quo scenario
+                scaled=True)
+        ).T
+    ).iloc[0].unstack()
+    pc_dalys_averted_by_cause['scenario'] = scenarios.to_list()[1:10]
+    pc_dalys_averted_by_cause = pc_dalys_averted_by_cause.set_index('scenario')
+
+    # Create a plot of DALYs averted by cause
+    chosen_num_dalys_averted_by_cause = num_dalys_averted_by_cause[~num_dalys_averted_by_cause.index.isin(drop_scenarios)]
+    chosen_pc_dalys_averted_by_cause = pc_dalys_averted_by_cause[~pc_dalys_averted_by_cause.index.isin(drop_scenarios)]
+    name_of_plot = f'Total DALYs averted by cause ({cause}), {target_period()}'
+    fig, ax = do_bar_plot_with_ci(
+        (chosen_num_dalys_averted_by_cause / 1e6).clip(lower=0.0),
+        annotations=[
+
f"{round(row['mean'], 1)} % \n ({round(row['lower'], 1)}-{round(row['upper'], 1)}) %" + for _, row in chosen_pc_dalys_averted_by_cause.clip(lower=0.0).iterrows() + ], + xticklabels_horizontal_and_wrapped=False, + ) + if chosen_num_dalys_averted_by_cause.upper.max()/1e6 > 2: + y_limit = 8.5 + else: + y_limit = 2.5 + ax.set_title(name_of_plot) + ax.set_ylim(0, y_limit) + ax.set_yticks(np.arange(0, y_limit, 0.5)) + ax.set_ylabel(f'Total DALYs averted \n(Millions)') + fig.tight_layout() + fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '').replace('/', '_')) + fig.show() + plt.close(fig) -for cause in num_dalys_by_cause_status_quo.index.get_level_values(1).unique(): +# PLot DALYs accrued by cause +for cause in top_10_causes_of_dalys: name_of_plot = f'Total DALYs accrued by {cause}, {target_period()}' chosen_num_dalys_by_cause_summarized = num_dalys_by_cause_summarized[~num_dalys_by_cause_summarized.index.get_level_values(0).isin([4,5])] chosen_num_dalys_by_cause_summarized = chosen_num_dalys_by_cause_summarized[chosen_num_dalys_by_cause_summarized.index.get_level_values(1) == cause] @@ -299,84 +353,21 @@ def find_difference_relative_to_comparison(_ser: pd.Series, xticklabels_horizontal_and_wrapped=False, ) ax.set_title(name_of_plot) - ax.set_ylim(0, 30) - ax.set_yticks(np.arange(0, 30, 5)) - ax.set_ylabel(f'Total DALYs accrued by {cause} \n(Millions)') + if chosen_num_dalys_by_cause_summarized.upper.max()/1e6 > 5: + y_limit = 30 + y_tick_gap = 5 + else: + y_limit = 5 + y_tick_gap = 1 + ax.set_ylim(0, y_limit) + ax.set_yticks(np.arange(0, y_limit, y_tick_gap)) + ax.set_ylabel(f'Total DALYs accrued \n(Millions)') fig.tight_layout() fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '').replace('/', '_')) fig.show() plt.close(fig) -def _extract_dalys_by_disease(_df: pd.DataFrame) -> pd.Series: - """Construct a series with index disease and value of the total of DALYS (stacked) from the - `dalys_stacked` key logged in `tlo.methods.healthburden`. - N.B. 
This limits the time period of interest to 2010-2019""" - _, calperiodlookup = make_calendar_period_lookup() - - return _df.loc[(_df['year'] >=2009) & (_df['year'] < 2012)]\ - .drop(columns=['date', 'sex', 'age_range', 'year'])\ - .sum(axis=0) - -dalys_extracted_by_disease = extract_results( - results_folder, - module="tlo.methods.healthburden", - key="dalys_stacked", - custom_generate_series=_extract_dalys_by_disease, - do_scaling=True -) - -dalys_by_disease_summarized = summarize(dalys_extracted_by_disease) -dalys_by_disease_summarized = dalys_by_disease_summarized.unstack() - -for disease in ['AIDS', 'Lower respiratory infections', 'Neonatal Disorders', 'Malaria', 'TB (non-AIDS)']: - dalys_accrued = dalys_by_disease_summarized.xs(disease, level=2) - fig, ax = plt.subplots() - - # Arrays to store the values for plotting - central_vals = [] - lower_vals = [] - upper_vals = [] - - # Extract values for each parameter - for i, _p in enumerate(params['value']): - central_val = dalys_accrued[(i, 'mean')] - lower_val = dalys_accrued[(i, 'lower')] - upper_val = dalys_accrued[(i, 'upper')] - - central_vals.append(central_val) - lower_vals.append(lower_val) - upper_vals.append(upper_val) - - # Generate the plot - scenarios = params['name_of_scenario'] #range(len(params)) # X-axis values representing time periods - colors = plt.get_cmap('tab10')(np.linspace(0, 1, len(params['value']))) # Generate different colors for each bar - - for i in range(len(scenarios)): - ax.bar(scenarios[i], central_vals[i], color=colors[i], label=scenarios[i]) - ax.errorbar(scenarios[i], central_vals[i], yerr=[[central_vals[i] - lower_vals[i]], [upper_vals[i] - central_vals[i]]], fmt='o', color='black') - - plt.xticks(scenarios, params['name_of_scenario'], rotation=45) - ax.set_xlabel('Scenarios') - ax.set_ylabel('Total DALYs accrued (in millions)') - ax.set_title(disease) - - # Format y-axis ticks to display in millions - formatter = FuncFormatter(lambda x, _: '{:,.0f}'.format(x / 1000000)) - ax.yaxis.set_major_formatter(formatter) - - #ax.set_ylim((0, 50)) - #ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) - fig.tight_layout() - fig.savefig(figurespath / f'main_result_DALYs_{disease}.png') - fig.show() - plt.close(fig) - - -# Figure - Focus on top 5 diseases across the 10 scenarios? 0r do a dot plot -# Assuming dalys_by_disease_summarized is your MultiIndex Series -# Convert it to a DataFrame for easier manipulation -dalys_by_disease_summarized_df = dalys_by_disease_summarized.reset_index() -dalys_by_disease_summarized_df = dalys_by_disease_summarized_df.rename(columns = {'level_2': 'cause', 0: 'DALYs'}) +# TODO Fix xticklabels in the plots above # 2. 
Consumable demand not met #----------------------------------------- From fc41b9a2de7995b833513f14107ab7428580431d Mon Sep 17 00:00:00 2001 From: sm2511 Date: Sun, 23 Jun 2024 15:53:58 +0100 Subject: [PATCH 043/100] plot DALYs averted per person year --- ...analysis_impact_of_consumable_scenarios.py | 148 +++++++++++++++++- 1 file changed, 146 insertions(+), 2 deletions(-) diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py index f3c94c543f..87db34602d 100644 --- a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py +++ b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py @@ -144,6 +144,90 @@ def get_num_dalys_by_cause(_df): .sum() ) +def get_num_dalys_per_person_year(_df): + """Return total number of DALYS (Stacked) by label (total within the TARGET_PERIOD). + Throw error if not a record for every year in the TARGET PERIOD (to guard against inadvertently using + results from runs that crashed mid-way through the simulation. + """ + years_needed = [i.year for i in TARGET_PERIOD] + assert set(_df.year.unique()).issuperset(years_needed), "Some years are not recorded." + return pd.Series( + data=_df + .loc[_df.year.between(*years_needed)] + .drop(columns=['date', 'sex', 'age_range']) + .groupby('year').sum().sum(axis = 1) + ) +def extract_results_by_person_year(results_folder: Path, + module: str, + key: str, + column: str = None, + index: str = None, + custom_generate_series=None, + ) -> pd.DataFrame: + """Utility function to unpack results. + + Produces a dataframe from extracting information from a log with the column multi-index for the draw/run. + + If the column to be extracted exists in the log, the name of the `column` is provided as `column`. If the resulting + dataframe should be based on another column that exists in the log, this can be provided as 'index'. + + If instead, some work must be done to generate a new column from log, then a function can be provided to do this as + `custom_generate_series`. + + Optionally, with `do_scaling=True`, each element is multiplied by the scaling_factor recorded in the simulation. + + Note that if runs in the batch have failed (such that logs have not been generated), these are dropped silently. + """ + + def get_population_size(_draw, _run): + """Helper function to get the multiplier from the simulation. + Note that if the scaling factor cannot be found a `KeyError` is thrown.""" + return load_pickled_dataframes( + results_folder, _draw, _run, 'tlo.methods.demography' + )['tlo.methods.demography']['population']['total'] + + if custom_generate_series is None: + # If there is no `custom_generate_series` provided, it implies that function required selects the specified + # column from the dataframe. 
+ assert column is not None, "Must specify which column to extract" + else: + assert index is None, "Cannot specify an index if using custom_generate_series" + assert column is None, "Cannot specify a column if using custom_generate_series" + + def generate_series(dataframe: pd.DataFrame) -> pd.Series: + if custom_generate_series is None: + if index is not None: + return dataframe.set_index(index)[column] + else: + return dataframe.reset_index(drop=True)[column] + else: + return custom_generate_series(dataframe) + + # get number of draws and numbers of runs + info = get_scenario_info(results_folder) + + # Collect results from each draw/run + res = dict() + for draw in range(info['number_of_draws']): + for run in range(info['runs_per_draw']): + + draw_run = (draw, run) + + try: + df: pd.DataFrame = load_pickled_dataframes(results_folder, draw, run, module)[module][key] + output_from_eval: pd.Series = generate_series(df) + assert pd.Series == type(output_from_eval), 'Custom command does not generate a pd.Series' + res[draw_run] = output_from_eval.reset_index().drop(columns = ['year']).T / get_population_size(draw, run) + res[draw_run] = res[draw_run].sum(axis =1) + except KeyError: + # Some logs could not be found - probably because this run failed. + res[draw_run] = None + + # Use pd.concat to compile results (skips dict items where the values is None) + _concat = pd.concat(res, axis=1) + _concat.columns.names = ['draw', 'run'] # name the levels of the columns multi-index + return _concat + def find_difference_relative_to_comparison(_ser: pd.Series, comparison: str, scaled: bool = False, @@ -184,8 +268,9 @@ def find_difference_relative_to_comparison(_ser: pd.Series, # %% Extracting results from run # 1. DALYs accrued and averted -#----------------------------------------- +################################### # 1.1 Total DALYs accrued +#------------------------- # Get total DALYs accrued num_dalys = extract_results( results_folder, @@ -221,6 +306,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, plt.close(fig) # 1.2 Total DALYs averted +#------------------------ # Get absolute DALYs averted num_dalys_averted = summarize( -1.0 * @@ -369,6 +455,65 @@ def find_difference_relative_to_comparison(_ser: pd.Series, # TODO Fix xticklabels in the plots above +# Plot health worker capacity utilised against DALYs accrued +# TODO add plot showing health workers capacity utilised +# log['tlo.methods.healthsystem.summary']['Capacity_By_OfficerType_And_FacilityLevel']['OfficerType=Pharmacy|FacilityLevel=2'] + +# 1.3 Total DALYs averted per person +#---------------------------------------- +num_dalys_per_person_year = extract_results_by_person_year( + results_folder, + module='tlo.methods.healthburden', + key='dalys_stacked', + custom_generate_series=get_num_dalys_per_person_year, + ) + +num_dalys_averted_per_person_year = summarize( + -1.0 * + pd.DataFrame( + find_difference_relative_to_comparison( + num_dalys_per_person_year.loc[0], + comparison= 0) # sets the comparator to 0 which is the Status Quo scenario + ).T + ).iloc[0].unstack() +num_dalys_averted_per_person_year['scenario'] = scenarios.to_list()[1:10] +num_dalys_averted_per_person_year = num_dalys_averted_per_person_year.set_index('scenario') + +# Get percentage DALYs averted +pct_dalys_averted_per_person_year = 100.0 * summarize( + -1.0 * + pd.DataFrame( + find_difference_relative_to_comparison( + num_dalys_per_person_year.loc[0], + comparison= 0, # sets the comparator to 0 which is the Status Quo scenario + scaled=True) + ).T 
+).iloc[0].unstack() +pct_dalys_averted_per_person_year['scenario'] = scenarios.to_list()[1:10] +pct_dalys_averted_per_person_year = pct_dalys_averted_per_person_year.set_index('scenario') + +# %% Chart of number of DALYs averted +# Plot DALYS averted (with xtickabels horizontal and wrapped) +name_of_plot = f'Additional DALYs Averted Per Person-Year vs Status Quo, {target_period()}' +chosen_num_dalys_averted_per_person_year = num_dalys_averted_per_person_year[~num_dalys_averted_per_person_year.index.isin(drop_scenarios)] +chosen_pct_dalys_averted_per_person_year = pct_dalys_averted_per_person_year[~pct_dalys_averted_per_person_year.index.isin(drop_scenarios)] +fig, ax = do_bar_plot_with_ci( + (chosen_num_dalys_averted_per_person_year).clip(lower=0.0), + annotations=[ + f"{round(row['mean'], 1)} % \n ({round(row['lower'], 1)}-{round(row['upper'], 1)}) %" + for _, row in chosen_pct_dalys_averted_per_person_year.clip(lower=0.0).iterrows() + ], + xticklabels_horizontal_and_wrapped=False, +) +ax.set_title(name_of_plot) +ax.set_ylim(0, 1.5) +ax.set_yticks(np.arange(0, 1.5, 0.2)) +ax.set_ylabel('Additional DALYs averted per person-year') +fig.tight_layout() +fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '')) +fig.show() +plt.close(fig) + # 2. Consumable demand not met #----------------------------------------- # Number of units of item which were needed but not made available for the top 25 items @@ -539,7 +684,6 @@ def get_counts_of_hsi_by_short_treatment_id(_df): hsi_by_short_treatment_id = counts_of_hsi_by_treatment_id_short.unstack().reset_index() hsi_by_short_treatment_id = hsi_by_short_treatment_id.rename(columns = {'level_2': 'Short_Treatment_ID', 0: 'qty_of_HSIs'}) - # Cost of consumables? # %% Summarizing input resourcefile data From 230f1a391e6798ae037c482333494ac72f168a1e Mon Sep 17 00:00:00 2001 From: sm2511 Date: Sun, 23 Jun 2024 16:59:43 +0100 Subject: [PATCH 044/100] plot DALYs averted per person year by cause --- ...analysis_impact_of_consumable_scenarios.py | 157 +++++++++++++++++- 1 file changed, 156 insertions(+), 1 deletion(-) diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py index 87db34602d..60fc88292b 100644 --- a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py +++ b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py @@ -157,6 +157,20 @@ def get_num_dalys_per_person_year(_df): .drop(columns=['date', 'sex', 'age_range']) .groupby('year').sum().sum(axis = 1) ) + +def get_num_dalys_per_person_year_by_cause(_df): + """Return total number of DALYS (Stacked) by label (total within the TARGET_PERIOD). + Throw error if not a record for every year in the TARGET PERIOD (to guard against inadvertently using + results from runs that crashed mid-way through the simulation. + """ + years_needed = [i.year for i in TARGET_PERIOD] + assert set(_df.year.unique()).issuperset(years_needed), "Some years are not recorded." 
+ return pd.Series( + data=_df + .loc[_df.year.between(*years_needed)] + .drop(columns=['date', 'sex', 'age_range']) + .groupby('year').sum().unstack() + ) def extract_results_by_person_year(results_folder: Path, module: str, key: str, @@ -217,6 +231,79 @@ def generate_series(dataframe: pd.DataFrame) -> pd.Series: df: pd.DataFrame = load_pickled_dataframes(results_folder, draw, run, module)[module][key] output_from_eval: pd.Series = generate_series(df) assert pd.Series == type(output_from_eval), 'Custom command does not generate a pd.Series' + res[draw_run] = output_from_eval.reset_index().drop(columns = ['year']).T # / get_population_size(draw, run) + res[draw_run] = res[draw_run].sum(axis =1) + except KeyError: + # Some logs could not be found - probably because this run failed. + res[draw_run] = None + + # Use pd.concat to compile results (skips dict items where the values is None) + _concat = pd.concat(res, axis=1) + _concat.columns.names = ['draw', 'run'] # name the levels of the columns multi-index + return _concat + +def extract_results_by_person_year_by_cause(results_folder: Path, + module: str, + key: str, + column: str = None, + index: str = None, + custom_generate_series=None, + cause: str = None, + ) -> pd.DataFrame: + """Utility function to unpack results. + + Produces a dataframe from extracting information from a log with the column multi-index for the draw/run. + + If the column to be extracted exists in the log, the name of the `column` is provided as `column`. If the resulting + dataframe should be based on another column that exists in the log, this can be provided as 'index'. + + If instead, some work must be done to generate a new column from log, then a function can be provided to do this as + `custom_generate_series`. + + Optionally, with `do_scaling=True`, each element is multiplied by the scaling_factor recorded in the simulation. + + Note that if runs in the batch have failed (such that logs have not been generated), these are dropped silently. + """ + + def get_population_size(_draw, _run): + """Helper function to get the multiplier from the simulation. + Note that if the scaling factor cannot be found a `KeyError` is thrown.""" + return load_pickled_dataframes( + results_folder, _draw, _run, 'tlo.methods.demography' + )['tlo.methods.demography']['population']['total'] + + if custom_generate_series is None: + # If there is no `custom_generate_series` provided, it implies that function required selects the specified + # column from the dataframe. 
+ assert column is not None, "Must specify which column to extract" + else: + assert index is None, "Cannot specify an index if using custom_generate_series" + assert column is None, "Cannot specify a column if using custom_generate_series" + + def generate_series(dataframe: pd.DataFrame) -> pd.Series: + if custom_generate_series is None: + if index is not None: + return dataframe.set_index(index)[column] + else: + return dataframe.reset_index(drop=True)[column] + else: + return custom_generate_series(dataframe) + + # get number of draws and numbers of runs + info = get_scenario_info(results_folder) + + # Collect results from each draw/run + res = dict() + for draw in range(info['number_of_draws']): + for run in range(info['runs_per_draw']): + + draw_run = (draw, run) + + try: + df: pd.DataFrame = load_pickled_dataframes(results_folder, draw, run, module)[module][key] + output_from_eval: pd.Series = generate_series(df) + assert pd.Series == type(output_from_eval), 'Custom command does not generate a pd.Series' + output_from_eval = output_from_eval[output_from_eval.index.get_level_values(0) == cause].droplevel(0) res[draw_run] = output_from_eval.reset_index().drop(columns = ['year']).T / get_population_size(draw, run) res[draw_run] = res[draw_run].sum(axis =1) except KeyError: @@ -459,7 +546,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, # TODO add plot showing health workers capacity utilised # log['tlo.methods.healthsystem.summary']['Capacity_By_OfficerType_And_FacilityLevel']['OfficerType=Pharmacy|FacilityLevel=2'] -# 1.3 Total DALYs averted per person +# 1.3 Total DALYs averted per person-year #---------------------------------------- num_dalys_per_person_year = extract_results_by_person_year( results_folder, @@ -514,6 +601,74 @@ def find_difference_relative_to_comparison(_ser: pd.Series, fig.show() plt.close(fig) +# 1.4 Total DALYs averted per person-year by cause +#------------------------------------------------- +for cause in top_10_causes_of_dalys: + num_dalys_per_person_year_by_cause = extract_results_by_person_year_by_cause( + results_folder, + module='tlo.methods.healthburden', + key='dalys_stacked', + custom_generate_series=get_num_dalys_per_person_year_by_cause, + cause = cause, + ) + + num_dalys_per_person_year_by_cause_pivoted = num_dalys_per_person_year_by_cause.unstack().reset_index().drop( + columns=['level_2']).set_index(['draw', 'run']) + num_dalys_averted_per_person_year_by_cause = summarize( + -1.0 * + pd.DataFrame( + find_difference_relative_to_comparison( + num_dalys_per_person_year_by_cause.squeeze(), + comparison=0) # sets the comparator to 0 which is the Status Quo scenario + ).T + ).iloc[0].unstack() + num_dalys_averted_per_person_year_by_cause['scenario'] = scenarios.to_list()[1:10] + num_dalys_averted_per_person_year_by_cause = num_dalys_averted_per_person_year_by_cause.set_index('scenario') + + # Get percentage DALYs averted + pct_dalys_averted_per_person_year_by_cause = 100.0 * summarize( + -1.0 * + pd.DataFrame( + find_difference_relative_to_comparison( + num_dalys_per_person_year_by_cause.squeeze(), + comparison=0, # sets the comparator to 0 which is the Status Quo scenario + scaled=True) + ).T + ).iloc[0].unstack() + pct_dalys_averted_per_person_year_by_cause['scenario'] = scenarios.to_list()[1:10] + pct_dalys_averted_per_person_year_by_cause = pct_dalys_averted_per_person_year_by_cause.set_index('scenario') + + # Create a plot of DALYs averted by cause + chosen_num_dalys_averted_per_person_year_by_cause = 
num_dalys_averted_per_person_year_by_cause[ + ~num_dalys_averted_per_person_year_by_cause.index.isin(drop_scenarios)] + chosen_pct_dalys_averted_per_person_year_by_cause = pct_dalys_averted_per_person_year_by_cause[~pct_dalys_averted_per_person_year_by_cause.index.isin(drop_scenarios)] + name_of_plot = f'Total DALYs averted per person year by cause - \n ({cause}), {target_period()}' + fig, ax = do_bar_plot_with_ci( + (chosen_num_dalys_averted_per_person_year_by_cause).clip(lower=0.0), + annotations=[ + f"{round(row['mean'], 1)} % \n ({round(row['lower'], 1)}-{round(row['upper'], 1)}) %" + for _, row in pct_dalys_averted_per_person_year_by_cause.clip(lower=0.0).iterrows() + ], + xticklabels_horizontal_and_wrapped=False, + ) + if chosen_num_dalys_averted_per_person_year_by_cause.upper.max() > 0.4: + y_limit = 0.55 + y_tick_gap = 0.1 + elif chosen_num_dalys_averted_per_person_year_by_cause.upper.max() > 0.18: + y_limit = 0.2 + y_tick_gap = 0.025 + else: + y_limit = 0.15 + y_tick_gap = 0.025 + ax.set_title(name_of_plot) + ax.set_ylim(0, y_limit) + ax.set_yticks(np.arange(0, y_limit, y_tick_gap)) + ax.set_ylabel(f'Total DALYs averted per person year') + fig.tight_layout() + fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '').replace('/', '_')) + fig.show() + plt.close(fig) + # 2. Consumable demand not met #----------------------------------------- # Number of units of item which were needed but not made available for the top 25 items From 52d54caf2463d89fb331546ff1d5fc1ded7890ef Mon Sep 17 00:00:00 2001 From: sm2511 Date: Sun, 23 Jun 2024 18:14:48 +0100 Subject: [PATCH 045/100] update scenario names to be more clear - also update figure formatting to accommodate this --- ...analysis_impact_of_consumable_scenarios.py | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py index 60fc88292b..5a58cf48ff 100644 --- a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py +++ b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py @@ -105,7 +105,8 @@ def do_bar_plot_with_ci(_df, annotations=None, xticklabels_horizontal_and_wrappe ax.set_xticks(list(xticks.keys())) if not xticklabels_horizontal_and_wrapped: # xticklabels will be vertical and not wrapped - ax.set_xticklabels(list(xticks.values()), rotation=90) + wrapped_labs = ["\n".join(textwrap.wrap(_lab, 20)) for _lab in xticks.values()] + ax.set_xticklabels(wrapped_labs, rotation=45, ha = 'right') else: wrapped_labs = ["\n".join(textwrap.wrap(_lab, 20)) for _lab in xticks.values()] ax.set_xticklabels(wrapped_labs) @@ -343,14 +344,14 @@ def find_difference_relative_to_comparison(_ser: pd.Series, # 1) Extract the parameters that have varied over the set of simulations params = extract_params(results_folder) -params_dict = {'default': 'Actual', 'scenario1': 'Scenario 1', 'scenario2': 'Scenario 2', - 'scenario3': 'Scenario 3', 'scenario4': 'Scenario 4', 'scenario5': 'Scenario 5', - 'scenario6': 'Scenario 6', 'scenario7': 'Scenario 7', 'scenario8': 'Scenario 8', +params_dict = {'default': 'Status Quo', 'scenario1': 'Non-drug/diagnostic consumables', 'scenario2': 'Vital drugs', + 'scenario3': 'Pharmacist managed', 'scenario4': 'Level 1b', 'scenario5': 'CHAM', + 'scenario6': '75th percentile facility', 'scenario7': '90th percentile facility', 'scenario8': 'Best facility', 'all': 'Perfect'} params_dict_df = 
pd.DataFrame.from_dict(params_dict, orient='index', columns=['name_of_scenario']).reset_index().rename(columns = {'index': 'value'}) params = params.merge(params_dict_df, on = 'value', how = 'left', validate = '1:1') scenarios = params['name_of_scenario'] #range(len(params)) # X-axis values representing time periods -drop_scenarios = ['Scenario 4', 'Scenario 5'] # Drops scenarios which are no longer considered important for comparison +drop_scenarios = ['Level 1b', 'CHAM'] # Drops scenarios which are no longer considered important for comparison # %% Extracting results from run @@ -421,7 +422,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, # %% Chart of number of DALYs averted # Plot DALYS averted (with xtickabels horizontal and wrapped) -name_of_plot = f'Additional DALYs Averted vs Actual, {target_period()}' +name_of_plot = f'Additional DALYs Averted vs Status Quo, {target_period()}' chosen_num_dalys_averted = num_dalys_averted[~num_dalys_averted.index.isin(drop_scenarios)] chosen_pc_dalys_averted = pc_dalys_averted[~pc_dalys_averted.index.isin(drop_scenarios)] fig, ax = do_bar_plot_with_ci( @@ -433,7 +434,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, xticklabels_horizontal_and_wrapped=False, ) ax.set_title(name_of_plot) -ax.set_ylim(0, 16) +ax.set_ylim(0, 20) ax.set_yticks(np.arange(0, 18, 2)) ax.set_ylabel('Additional DALYS Averted \n(Millions)') fig.tight_layout() @@ -490,7 +491,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, # Create a plot of DALYs averted by cause chosen_num_dalys_averted_by_cause = num_dalys_averted_by_cause[~num_dalys_averted_by_cause.index.isin(drop_scenarios)] chosen_pc_dalys_averted_by_cause = pc_dalys_averted_by_cause[~pc_dalys_averted_by_cause.index.isin(drop_scenarios)] - name_of_plot = f'Total DALYs averted by cause ({cause}), {target_period()}' + name_of_plot = f'DALYs averted vs Status Quo by cause - \n ({cause}), {target_period()}' fig, ax = do_bar_plot_with_ci( (chosen_num_dalys_averted_by_cause / 1e6).clip(lower=0.0), annotations=[ @@ -506,7 +507,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, ax.set_title(name_of_plot) ax.set_ylim(0, y_limit) ax.set_yticks(np.arange(0, y_limit, 0.5)) - ax.set_ylabel(f'Total DALYs averted \n(Millions)') + ax.set_ylabel(f'DALYs averted \n(Millions)') fig.tight_layout() fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '').replace('/', '_')) fig.show() @@ -514,7 +515,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, # PLot DALYs accrued by cause for cause in top_10_causes_of_dalys: - name_of_plot = f'Total DALYs accrued by {cause}, {target_period()}' + name_of_plot = f'Total DALYs accrued by cause - \n {cause}, {target_period()}' chosen_num_dalys_by_cause_summarized = num_dalys_by_cause_summarized[~num_dalys_by_cause_summarized.index.get_level_values(0).isin([4,5])] chosen_num_dalys_by_cause_summarized = chosen_num_dalys_by_cause_summarized[chosen_num_dalys_by_cause_summarized.index.get_level_values(1) == cause] fig, ax = do_bar_plot_with_ci( @@ -581,7 +582,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, # %% Chart of number of DALYs averted # Plot DALYS averted (with xtickabels horizontal and wrapped) -name_of_plot = f'Additional DALYs Averted Per Person-Year vs Status Quo, {target_period()}' +name_of_plot = f'Additional DALYs Averted Per Person-Year vs Status Quo, \n {target_period()}' chosen_num_dalys_averted_per_person_year = 
num_dalys_averted_per_person_year[~num_dalys_averted_per_person_year.index.isin(drop_scenarios)] chosen_pct_dalys_averted_per_person_year = pct_dalys_averted_per_person_year[~pct_dalys_averted_per_person_year.index.isin(drop_scenarios)] fig, ax = do_bar_plot_with_ci( @@ -669,6 +670,9 @@ def find_difference_relative_to_comparison(_ser: pd.Series, fig.show() plt.close(fig) +# 2. Health work time spent v DALYs accrued +############################################# + # 2. Consumable demand not met #----------------------------------------- # Number of units of item which were needed but not made available for the top 25 items From bd75d63e4238e55f2c0b0562885cefdf1f2b8800 Mon Sep 17 00:00:00 2001 From: sm2511 Date: Sun, 23 Jun 2024 18:19:15 +0100 Subject: [PATCH 046/100] fix error in calculation of DALYs per person-year --- .../analysis_impact_of_consumable_scenarios.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py index 5a58cf48ff..b9d332f428 100644 --- a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py +++ b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py @@ -232,7 +232,7 @@ def generate_series(dataframe: pd.DataFrame) -> pd.Series: df: pd.DataFrame = load_pickled_dataframes(results_folder, draw, run, module)[module][key] output_from_eval: pd.Series = generate_series(df) assert pd.Series == type(output_from_eval), 'Custom command does not generate a pd.Series' - res[draw_run] = output_from_eval.reset_index().drop(columns = ['year']).T # / get_population_size(draw, run) + res[draw_run] = output_from_eval.reset_index().drop(columns = ['year']).T / get_population_size(draw, run) res[draw_run] = res[draw_run].sum(axis =1) except KeyError: # Some logs could not be found - probably because this run failed. 
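To see why the one-line fix above matters, a toy example (made-up numbers) of the per-person-year calculation: each year's DALYs must be divided by that year's population before summing across years, so that population growth over the period does not bias the per-person total.

import pandas as pd

# Hypothetical totals for two years
dalys_by_year = pd.Series({2010: 9.0e6, 2011: 9.5e6})
population_by_year = pd.Series({2010: 15.0e6, 2011: 15.5e6})

# As in the corrected line: divide year by year, then sum across years
dalys_per_person = (dalys_by_year / population_by_year).sum()
print(round(dalys_per_person, 3))  # 1.213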
@@ -491,7 +491,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, # Create a plot of DALYs averted by cause chosen_num_dalys_averted_by_cause = num_dalys_averted_by_cause[~num_dalys_averted_by_cause.index.isin(drop_scenarios)] chosen_pc_dalys_averted_by_cause = pc_dalys_averted_by_cause[~pc_dalys_averted_by_cause.index.isin(drop_scenarios)] - name_of_plot = f'DALYs averted vs Status Quo by cause - \n ({cause}), {target_period()}' + name_of_plot = f'Additional DALYs averted vs Status Quo by cause - \n ({cause}), {target_period()}' fig, ax = do_bar_plot_with_ci( (chosen_num_dalys_averted_by_cause / 1e6).clip(lower=0.0), annotations=[ @@ -507,7 +507,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, ax.set_title(name_of_plot) ax.set_ylim(0, y_limit) ax.set_yticks(np.arange(0, y_limit, 0.5)) - ax.set_ylabel(f'DALYs averted \n(Millions)') + ax.set_ylabel(f'Additional DALYs averted \n(Millions)') fig.tight_layout() fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '').replace('/', '_')) fig.show() @@ -643,7 +643,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, chosen_num_dalys_averted_per_person_year_by_cause = num_dalys_averted_per_person_year_by_cause[ ~num_dalys_averted_per_person_year_by_cause.index.isin(drop_scenarios)] chosen_pct_dalys_averted_per_person_year_by_cause = pct_dalys_averted_per_person_year_by_cause[~pct_dalys_averted_per_person_year_by_cause.index.isin(drop_scenarios)] - name_of_plot = f'Total DALYs averted per person year by cause - \n ({cause}), {target_period()}' + name_of_plot = f'Additional DALYs averted per person year by cause - \n ({cause}), {target_period()}' fig, ax = do_bar_plot_with_ci( (chosen_num_dalys_averted_per_person_year_by_cause).clip(lower=0.0), annotations=[ @@ -664,7 +664,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, ax.set_title(name_of_plot) ax.set_ylim(0, y_limit) ax.set_yticks(np.arange(0, y_limit, y_tick_gap)) - ax.set_ylabel(f'Total DALYs averted per person year') + ax.set_ylabel(f'Additional DALYs averted per person year') fig.tight_layout() fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '').replace('/', '_')) fig.show() From 30682a2ed9fb532d5794a4b9db814c6d03e8c62a Mon Sep 17 00:00:00 2001 From: sm2511 Date: Sun, 23 Jun 2024 18:28:20 +0100 Subject: [PATCH 047/100] change plot labels from person-year to person - we're plotting DALYs averted per person over 10 years in total --- .../analysis_impact_of_consumable_scenarios.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py index b9d332f428..0eebb307b7 100644 --- a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py +++ b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py @@ -547,7 +547,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, # TODO add plot showing health workers capacity utilised # log['tlo.methods.healthsystem.summary']['Capacity_By_OfficerType_And_FacilityLevel']['OfficerType=Pharmacy|FacilityLevel=2'] -# 1.3 Total DALYs averted per person-year +# 1.3 Total DALYs averted per person #---------------------------------------- num_dalys_per_person_year = extract_results_by_person_year( results_folder, @@ -582,7 +582,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, # %% Chart of number of DALYs averted # Plot DALYs averted (with xticklabels horizontal and wrapped) -name_of_plot = f'Additional DALYs Averted Per Person-Year vs Status Quo, \n {target_period()}' +name_of_plot = f'Additional DALYs Averted Per Person vs Status Quo, \n {target_period()}' chosen_num_dalys_averted_per_person_year = num_dalys_averted_per_person_year[~num_dalys_averted_per_person_year.index.isin(drop_scenarios)] chosen_pct_dalys_averted_per_person_year = pct_dalys_averted_per_person_year[~pct_dalys_averted_per_person_year.index.isin(drop_scenarios)] fig, ax = do_bar_plot_with_ci( @@ -596,13 +596,13 @@ def find_difference_relative_to_comparison(_ser: pd.Series, ax.set_title(name_of_plot) ax.set_ylim(0, 1.5) ax.set_yticks(np.arange(0, 1.5, 0.2)) -ax.set_ylabel('Additional DALYs averted per person-year') +ax.set_ylabel('Additional DALYs averted per person') fig.tight_layout() fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '')) fig.show() plt.close(fig) -# 1.4 Total DALYs averted per person-year by cause +# 1.4 Total DALYs averted per person by cause #------------------------------------------------- for cause in top_10_causes_of_dalys: num_dalys_per_person_year_by_cause = extract_results_by_person_year_by_cause( @@ -643,7 +643,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, chosen_num_dalys_averted_per_person_year_by_cause = num_dalys_averted_per_person_year_by_cause[ ~num_dalys_averted_per_person_year_by_cause.index.isin(drop_scenarios)] chosen_pct_dalys_averted_per_person_year_by_cause = pct_dalys_averted_per_person_year_by_cause[~pct_dalys_averted_per_person_year_by_cause.index.isin(drop_scenarios)] - name_of_plot = f'Additional DALYs averted per person year by cause - \n ({cause}), {target_period()}' + name_of_plot = f'Additional DALYs averted per person by cause - \n ({cause}), {target_period()}' fig, ax = do_bar_plot_with_ci( (chosen_num_dalys_averted_per_person_year_by_cause).clip(lower=0.0), annotations=[ @@ -664,7 +664,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, ax.set_title(name_of_plot) ax.set_ylim(0, y_limit) ax.set_yticks(np.arange(0, y_limit, y_tick_gap)) - ax.set_ylabel(f'Additional DALYs averted per person year') + ax.set_ylabel(f'Additional DALYs averted per person') fig.tight_layout() fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '').replace('/', '_')) fig.show() From a96d78e986d7a0fb70ad6a47c063674dd71c0737 Mon Sep 17 00:00:00 2001 From: sm2511 Date: Sun, 23 Jun 2024 19:55:46 +0100 Subject: [PATCH 048/100] add bar plots of capacity utilised by cadre and level --- ...analysis_impact_of_consumable_scenarios.py | 68 +++++++++++++++++-- 1 file changed, 64 insertions(+), 4 deletions(-) diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py index 0eebb307b7..f922b65e56 100644 --- a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py +++ b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py @@ -543,10 +543,6 @@ def find_difference_relative_to_comparison(_ser: pd.Series, # TODO Fix xticklabels in the plots above -# Plot health worker capacity utilised against DALYs accrued -# TODO add plot showing health workers capacity utilised -# log['tlo.methods.healthsystem.summary']['Capacity_By_OfficerType_And_FacilityLevel']['OfficerType=Pharmacy|FacilityLevel=2'] - # 1.3 Total DALYs averted per person #---------------------------------------- num_dalys_per_person_year =
extract_results_by_person_year( @@ -672,6 +668,70 @@ def find_difference_relative_to_comparison(_ser: pd.Series, # 2. Health worker time spent vs DALYs accrued ############################################# +# DALYs averted per person on the Y-axis; Capacity of cadre used at levels 1a, 1b, and 2 on the X-axis +# log['tlo.methods.healthsystem.summary']['Capacity_By_OfficerType_And_FacilityLevel']['OfficerType=Pharmacy|FacilityLevel=2'] +def get_capacity_used_by_cadre_and_level(_df): + """Return mean capacity used by officer type and facility level (average within the TARGET_PERIOD), + after dropping the facility levels and officer types excluded below. + Throw error if not a record for every year in the TARGET PERIOD (to guard against inadvertently using + results from runs that crashed mid-way through the simulation). + """ + years_needed = [i.year for i in TARGET_PERIOD] + _df['year'] = _df.date.dt.year + #assert set(_df.year.unique()).issuperset(years_needed), "Some years are not recorded." + string_for_cols_to_drop1 = 'FacilityLevel=0|FacilityLevel=3|FacilityLevel=4|FacilityLevel=5' + string_for_cols_to_drop2 = 'OfficerType=DCSA|OfficerType=Dental|OfficerType=Laboratory|OfficerType=Mental|OfficerType=Nutrition|OfficerType=Radiography' + cols_to_drop1 = _df.columns[_df.columns.str.contains(string_for_cols_to_drop1)] + cols_to_drop2 = _df.columns[_df.columns.str.contains(string_for_cols_to_drop2)] + cols_to_drop = [*cols_to_drop1, *cols_to_drop2, 'year'] + return pd.Series( + data=_df + .loc[_df.year.between(*years_needed)] + .drop(columns= cols_to_drop) + .mean() + ) + +capacity_used = summarize(extract_results( + results_folder, + module='tlo.methods.healthsystem.summary', + key='Capacity_By_OfficerType_And_FacilityLevel', + custom_generate_series=get_capacity_used_by_cadre_and_level, + do_scaling = False, + )) + +#chosen_capacity_used.unstack().reset_index().drop(columns = ['level_2']).pivot(columns ='stat', index = 'draw') +for cadre_level in capacity_used.index: + print(cadre_level) + name_of_plot = f'Capacity used - \n {cadre_level}, {target_period()}' + scenarios_to_drop = capacity_used.columns[capacity_used.columns.get_level_values(0).isin([4,5])] + chosen_capacity_used = capacity_used.drop(columns = scenarios_to_drop) + chosen_capacity_used = chosen_capacity_used[chosen_capacity_used.index == cadre_level] + chosen_capacity_used = chosen_capacity_used.unstack().reset_index().drop(columns = ['level_2']).pivot(columns ='stat', index = 'draw').droplevel(0,axis = 1) + chosen_capacity_used['scenario'] = [*scenarios.to_list()[0:4],*scenarios.to_list()[6:10]] + #TODO fix above code to be automated + chosen_capacity_used = chosen_capacity_used.set_index('scenario') + fig, ax = do_bar_plot_with_ci( + (chosen_capacity_used), + annotations=[ + f"{round(row['mean'], 2)} \n ({round(row['lower'], 2)}-{round(row['upper'], 2)})" + for _, row in chosen_capacity_used.iterrows() + ], + xticklabels_horizontal_and_wrapped=False, + ) + ax.set_title(name_of_plot) + if chosen_capacity_used.upper.max() > 3: + y_limit = 3.5 + y_tick_gap = 0.5 + else: + y_limit = 2 + y_tick_gap = 0.25 + ax.set_ylim(0, y_limit) + ax.set_yticks(np.arange(0, y_limit, y_tick_gap)) + ax.set_ylabel(f'Capacity used \n (Proportion of capacity available)') + fig.tight_layout() + fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '').replace('/', '_')) + fig.show() + plt.close(fig) + # 2.
Consumable demand not met #----------------------------------------- From 10097340c5b0bdb97205051ee6493e57da3b482c Mon Sep 17 00:00:00 2001 From: sm2511 Date: Mon, 24 Jun 2024 18:38:29 +0100 Subject: [PATCH 049/100] update names of scenarios --- .../analysis_impact_of_consumable_scenarios.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py index f922b65e56..060d26ade6 100644 --- a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py +++ b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py @@ -344,8 +344,8 @@ def find_difference_relative_to_comparison(_ser: pd.Series, # 1) Extract the parameters that have varied over the set of simulations params = extract_params(results_folder) -params_dict = {'default': 'Status Quo', 'scenario1': 'Non-drug/diagnostic consumables', 'scenario2': 'Vital drugs', - 'scenario3': 'Pharmacist managed', 'scenario4': 'Level 1b', 'scenario5': 'CHAM', +params_dict = {'default': 'Status Quo', 'scenario1': 'General consumables', 'scenario2': 'Vital medicines', + 'scenario3': 'Pharmacist-managed', 'scenario4': 'Level 1b', 'scenario5': 'CHAM', 'scenario6': '75th percentile facility', 'scenario7': '90th percentile facility', 'scenario8': 'Best facility', 'all': 'Perfect'} params_dict_df = pd.DataFrame.from_dict(params_dict, orient='index', columns=['name_of_scenario']).reset_index().rename(columns = {'index': 'value'}) @@ -502,14 +502,16 @@ def find_difference_relative_to_comparison(_ser: pd.Series, ) if chosen_num_dalys_averted_by_cause.upper.max()/1e6 > 2: y_limit = 8.5 + y_tick_gaps = 1 else: y_limit = 2.5 + y_tick_gaps = 0.5 ax.set_title(name_of_plot) ax.set_ylim(0, y_limit) - ax.set_yticks(np.arange(0, y_limit, 0.5)) + ax.set_yticks(np.arange(0, y_limit, y_tick_gaps)) ax.set_ylabel(f'Additional DALYs averted \n(Millions)') fig.tight_layout() - fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '').replace('/', '_')) + fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '').replace('/', '_').replace('\n', '')) fig.show() plt.close(fig) @@ -537,7 +539,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, ax.set_yticks(np.arange(0, y_limit, y_tick_gap)) ax.set_ylabel(f'Total DALYs accrued \n(Millions)') fig.tight_layout() - fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '').replace('/', '_')) + fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '').replace('/', '_').replace('\n', '')) fig.show() plt.close(fig) @@ -594,7 +596,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, ax.set_yticks(np.arange(0, 1.5, 0.2)) ax.set_ylabel('Additional DALYs averted per person') fig.tight_layout() -fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '')) +fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '').replace('\n', '')) fig.show() plt.close(fig) @@ -662,7 +664,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, ax.set_yticks(np.arange(0, y_limit, y_tick_gap)) ax.set_ylabel(f'Additional DALYs averted per person') fig.tight_layout() - fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '').replace('/', '_')) + fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '').replace('/', '_').replace('\n', '')) fig.show() plt.close(fig) @@ -728,7 +730,7 @@ def 
get_capacity_used_by_cadre_and_level(_df): ax.set_yticks(np.arange(0, y_limit, y_tick_gap)) ax.set_ylabel(f'Capacity used \n (Proportion of capacity available)') fig.tight_layout() - fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '').replace('/', '_')) + fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '').replace('/', '_').replace('\n', '_')) fig.show() plt.close(fig) From 030083ae12c9be0b8f69988107ab9aea0b901ea8 Mon Sep 17 00:00:00 2001 From: sm2511 Date: Mon, 24 Jun 2024 19:03:52 +0100 Subject: [PATCH 050/100] update heatmap to only include two levels of care --- ...analysis_impact_of_consumable_scenarios.py | 108 +++++++++--------- 1 file changed, 56 insertions(+), 52 deletions(-) diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py index 060d26ade6..546a9074d3 100644 --- a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py +++ b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py @@ -735,6 +735,62 @@ def get_capacity_used_by_cadre_and_level(_df): plt.close(fig) +# %% Summarizing input resourcefile data + +# 1. Consumable availability by category and level +#-------------------------------------------------- +tlo_availability_df = pd.read_csv(resourcefilepath / 'healthsystem'/ 'consumables' / "ResourceFile_Consumables_availability_small.csv") + +# Attach district, facility level, program to this dataset +mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv") +districts = set(pd.read_csv(resourcefilepath / 'demography' / 'ResourceFile_Population_2010.csv')['District']) +fac_levels = {'0', '1a', '1b', '2', '3', '4'} +tlo_availability_df = tlo_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], + on = ['Facility_ID'], how='left') +# Attach programs +programs = pd.read_csv(resourcefilepath / 'healthsystem'/ 'consumables' / "ResourceFile_Consumables_availability_and_usage.csv")[['category', 'item_code', 'module_name']] +programs = programs.drop_duplicates('item_code') +tlo_availability_df = tlo_availability_df.merge(programs, on = ['item_code'], how = 'left') + +# Generate a heatmap +# Pivot the DataFrame +aggregated_df = tlo_availability_df.groupby(['category', 'Facility_Level'])['available_prop'].mean().reset_index() +aggregated_df = aggregated_df[aggregated_df.Facility_Level.isin(['1a', '2'])] +aggregated_df.loc[aggregated_df.Facility_Level == '1a','Facility_Level'] = 'Health centres' +aggregated_df.loc[aggregated_df.Facility_Level == '2','Facility_Level'] = 'Hospitals' +heatmap_data = aggregated_df.pivot("category", "Facility_Level", "available_prop") + +# Calculate the aggregate row and column +aggregate_col= aggregated_df.groupby('Facility_Level')['available_prop'].mean() +aggregate_row = aggregated_df.groupby('category')['available_prop'].mean() +overall_aggregate = tlo_availability_df['available_prop'].mean() + +# Add aggregate row and column +heatmap_data['Average'] = aggregate_row +aggregate_col['Average'] = overall_aggregate +heatmap_data.loc['Average'] = aggregate_col + +# Generate the heatmap +sns.set(font_scale=1.5) +plt.figure(figsize=(10, 8)) +sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) + +# Customize the plot +#plt.title('Consumable availability by Facility Level and Category') +plt.xlabel('Facility Level') 
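# A hedged aside on the pivot call earlier in this added block: newer pandas
# releases (2.0 onwards) make the arguments of DataFrame.pivot keyword-only, so
# the positional form aggregated_df.pivot("category", "Facility_Level", "available_prop")
# raises a TypeError there. An equivalent spelling that works across versions,
# assuming the same column names, would be:
# heatmap_data = aggregated_df.pivot(index="category", columns="Facility_Level", values="available_prop")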
+plt.ylabel(f'Disease/Public health \n program') +plt.xticks(rotation=45) +plt.yticks(rotation=0) + +plt.savefig(figurespath /'consumable_availability_heatmap_1a_2.png', dpi=300, bbox_inches='tight') +plt.show() +plt.close() + +# TODO Justify the focus on levels 1a and 1b - where do HSIs occur?; at what level is there most misallocation within districts +# TODO get graphs of percentage of successful HSIs under different scenarios for levels 1a and 1b +# TODO is there a way to link consumables directly to DALYs (how many DALYs are lost due to stockouts of specific consumables) +# TODO why are there no appointments at level 1b + # 2. Consumable demand not met #----------------------------------------- # Number of units of item which were needed but not made available for the top 25 items @@ -907,55 +963,3 @@ def get_counts_of_hsi_by_short_treatment_id(_df): # Cost of consumables? -# %% Summarizing input resourcefile data - -# 1. Consumable availability by category and level -#-------------------------------------------------- -tlo_availability_df = pd.read_csv(resourcefilepath / 'healthsystem'/ 'consumables' / "ResourceFile_Consumables_availability_small.csv") - -# Attach district, facility level, program to this dataset -mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv") -districts = set(pd.read_csv(resourcefilepath / 'demography' / 'ResourceFile_Population_2010.csv')['District']) -fac_levels = {'0', '1a', '1b', '2', '3', '4'} -tlo_availability_df = tlo_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], - on = ['Facility_ID'], how='left') -# Attach programs -programs = pd.read_csv(resourcefilepath / 'healthsystem'/ 'consumables' / "ResourceFile_Consumables_availability_and_usage.csv")[['category', 'item_code', 'module_name']] -programs = programs.drop_duplicates('item_code') -tlo_availability_df = tlo_availability_df.merge(programs, on = ['item_code'], how = 'left') - -# Generate a heatmap -# Pivot the DataFrame -aggregated_df = tlo_availability_df.groupby(['category', 'Facility_Level'])['available_prop'].mean().reset_index() -heatmap_data = aggregated_df.pivot("category", "Facility_Level", "available_prop") - -# Calculate the aggregate row and column -aggregate_col= tlo_availability_df.groupby('Facility_Level')['available_prop'].mean() -aggregate_row = tlo_availability_df.groupby('category')['available_prop'].mean() -overall_aggregate = tlo_availability_df['available_prop'].mean() - -# Add aggregate row and column -heatmap_data['Average'] = aggregate_row -aggregate_col['Average'] = overall_aggregate -heatmap_data.loc['Average'] = aggregate_col - -# Generate the heatmap -plt.figure(figsize=(10, 8)) -sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) - -# Customize the plot -#plt.title('Consumable availability by Facility Level and Category') -plt.xlabel('Facility Level') -plt.ylabel('Category') -plt.xticks(rotation=45) -plt.yticks(rotation=0) - -plt.savefig(figurespath /'consumable_availability_heatmap.png', dpi=300, bbox_inches='tight') -plt.show() -plt.close() - -# TODO Justify the focus on levels 1a and 1b - where do HSIs occur?; at what level is there most misallocation within districts -# TODO get graphs of percentage of successful HSIs under different scenarios for levels 1a and 1b -# TODO is there a way to link consumables directly to DALYs (how many DALYs are lost due to stockouts of specific consumables) -# TODO why are 
there no appointments at level 1b - From 0b671c92e5f2398c96d99021d0fbd926365acc79 Mon Sep 17 00:00:00 2001 From: sm2511 Date: Tue, 25 Jun 2024 08:57:50 +0100 Subject: [PATCH 051/100] fix average to only include 1a and 1b in the heatmap --- .../analysis_impact_of_consumable_scenarios.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py index 546a9074d3..83e9b7871d 100644 --- a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py +++ b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py @@ -755,15 +755,15 @@ def get_capacity_used_by_cadre_and_level(_df): # Generate a heatmap # Pivot the DataFrame aggregated_df = tlo_availability_df.groupby(['category', 'Facility_Level'])['available_prop'].mean().reset_index() -aggregated_df = aggregated_df[aggregated_df.Facility_Level.isin(['1a', '2'])] +aggregated_df = aggregated_df[aggregated_df.Facility_Level.isin(['1a', '1b'])] aggregated_df.loc[aggregated_df.Facility_Level == '1a','Facility_Level'] = 'Health centres' -aggregated_df.loc[aggregated_df.Facility_Level == '2','Facility_Level'] = 'Hospitals' +aggregated_df.loc[aggregated_df.Facility_Level == '1b','Facility_Level'] = 'Hospitals' heatmap_data = aggregated_df.pivot("category", "Facility_Level", "available_prop") # Calculate the aggregate row and column aggregate_col= aggregated_df.groupby('Facility_Level')['available_prop'].mean() aggregate_row = aggregated_df.groupby('category')['available_prop'].mean() -overall_aggregate = tlo_availability_df['available_prop'].mean() +overall_aggregate = aggregated_df['available_prop'].mean() # Add aggregate row and column heatmap_data['Average'] = aggregate_row From 2558e5f6d54c777f12c90496b33fa1042ebe04a8 Mon Sep 17 00:00:00 2001 From: sm2511 Date: Fri, 28 Jun 2024 23:07:08 +0200 Subject: [PATCH 052/100] update 'status quo' to 'actual' --- ...analysis_impact_of_consumable_scenarios.py | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py index 83e9b7871d..2b67f25d9a 100644 --- a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py +++ b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py @@ -344,7 +344,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, # 1) Extract the parameters that have varied over the set of simulations params = extract_params(results_folder) -params_dict = {'default': 'Status Quo', 'scenario1': 'General consumables', 'scenario2': 'Vital medicines', +params_dict = {'default': 'Actual', 'scenario1': 'General consumables', 'scenario2': 'Vital medicines', 'scenario3': 'Pharmacist-managed', 'scenario4': 'Level 1b', 'scenario5': 'CHAM', 'scenario6': '75th percentile facility', 'scenario7': '90th percentile facility', 'scenario8': 'Best facility', 'all': 'Perfect'} @@ -401,7 +401,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, pd.DataFrame( find_difference_relative_to_comparison( num_dalys.loc[0], - comparison= 0) # sets the comparator to 0 which is the Status Quo scenario + comparison= 0) # sets the comparator to 0 which is the Actual scenario ).T ).iloc[0].unstack() num_dalys_averted['scenario'] = scenarios.to_list()[1:10] @@ -413,7 +413,7 @@ def 
find_difference_relative_to_comparison(_ser: pd.Series, pd.DataFrame( find_difference_relative_to_comparison( num_dalys.loc[0], - comparison= 0, # sets the comparator to 0 which is the Status Quo scenario + comparison= 0, # sets the comparator to 0 which is the Actual scenario scaled=True) ).T ).iloc[0].unstack() @@ -422,7 +422,7 @@ find_difference_relative_to_comparison(_ser: pd.Series, # %% Chart of number of DALYs averted # Plot DALYs averted (with xticklabels horizontal and wrapped) -name_of_plot = f'Additional DALYs Averted vs Status Quo, {target_period()}' +name_of_plot = f'Additional DALYs Averted vs Actual, {target_period()}' chosen_num_dalys_averted = num_dalys_averted[~num_dalys_averted.index.isin(drop_scenarios)] chosen_pc_dalys_averted = pc_dalys_averted[~pc_dalys_averted.index.isin(drop_scenarios)] fig, ax = do_bar_plot_with_ci( @@ -455,11 +455,11 @@ def find_difference_relative_to_comparison(_ser: pd.Series, num_dalys_by_cause_summarized = num_dalys_by_cause_summarized.rename(columns = {'level_2':'cause', 0: 'DALYs_accrued'}) num_dalys_by_cause_summarized = num_dalys_by_cause_summarized.pivot(index=['draw','cause'], columns='stat', values='DALYs_accrued') -# Get top 10 causes until status quo -num_dalys_by_cause_status_quo = num_dalys_by_cause_summarized[num_dalys_by_cause_summarized.index.get_level_values(0) == 0] -num_dalys_by_cause_status_quo = num_dalys_by_cause_status_quo.sort_values('mean', ascending = False) -num_dalys_by_cause_status_quo =num_dalys_by_cause_status_quo[0:10] -top_10_causes_of_dalys = num_dalys_by_cause_status_quo.index.get_level_values(1).unique() +# Get top 10 causes under the Actual scenario +num_dalys_by_cause_actual = num_dalys_by_cause_summarized[num_dalys_by_cause_summarized.index.get_level_values(0) == 0] +num_dalys_by_cause_actual = num_dalys_by_cause_actual.sort_values('mean', ascending = False) +num_dalys_by_cause_actual = num_dalys_by_cause_actual[0:10] +top_10_causes_of_dalys = num_dalys_by_cause_actual.index.get_level_values(1).unique() # Get DALYs averted by cause and plot bar charts for cause in top_10_causes_of_dalys: @@ -469,7 +469,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, pd.DataFrame( find_difference_relative_to_comparison( num_dalys_by_cause_pivoted.squeeze(), - comparison= 0) # sets the comparator to 0 which is the Status Quo scenario + comparison= 0) # sets the comparator to 0 which is the Actual scenario ).T ).iloc[0].unstack() num_dalys_averted_by_cause['scenario'] = scenarios.to_list()[1:10] @@ -481,7 +481,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, pd.DataFrame( find_difference_relative_to_comparison( num_dalys_by_cause_pivoted.squeeze(), - comparison= 0, # sets the comparator to 0 which is the Status Quo scenario + comparison= 0, # sets the comparator to 0 which is the Actual scenario scaled=True) ).T ).iloc[0].unstack() @@ -491,7 +491,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, # Create a plot of DALYs averted by cause chosen_num_dalys_averted_by_cause = num_dalys_averted_by_cause[~num_dalys_averted_by_cause.index.isin(drop_scenarios)] chosen_pc_dalys_averted_by_cause = pc_dalys_averted_by_cause[~pc_dalys_averted_by_cause.index.isin(drop_scenarios)] - name_of_plot = f'Additional DALYs averted vs Status Quo by cause - \n ({cause}), {target_period()}' + name_of_plot = f'Additional DALYs averted vs Actual by cause - \n ({cause}), {target_period()}' fig, ax = do_bar_plot_with_ci( (chosen_num_dalys_averted_by_cause / 1e6).clip(lower=0.0), annotations=[ @@ -559,7 +559,7 @@ def
find_difference_relative_to_comparison(_ser: pd.Series, pd.DataFrame( find_difference_relative_to_comparison( num_dalys_per_person_year.loc[0], - comparison= 0) # sets the comparator to 0 which is the Status Quo scenario + comparison= 0) # sets the comparator to 0 which is the Actual scenario ).T ).iloc[0].unstack() num_dalys_averted_per_person_year['scenario'] = scenarios.to_list()[1:10] @@ -571,7 +571,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, pd.DataFrame( find_difference_relative_to_comparison( num_dalys_per_person_year.loc[0], - comparison= 0, # sets the comparator to 0 which is the Status Quo scenario + comparison= 0, # sets the comparator to 0 which is the Actual scenario scaled=True) ).T ).iloc[0].unstack() @@ -580,7 +580,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, # %% Chart of number of DALYs averted # Plot DALYs averted (with xticklabels horizontal and wrapped) -name_of_plot = f'Additional DALYs Averted Per Person vs Status Quo, \n {target_period()}' +name_of_plot = f'Additional DALYs Averted Per Person vs Actual, \n {target_period()}' chosen_num_dalys_averted_per_person_year = num_dalys_averted_per_person_year[~num_dalys_averted_per_person_year.index.isin(drop_scenarios)] chosen_pct_dalys_averted_per_person_year = pct_dalys_averted_per_person_year[~pct_dalys_averted_per_person_year.index.isin(drop_scenarios)] fig, ax = do_bar_plot_with_ci( @@ -618,7 +618,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, pd.DataFrame( find_difference_relative_to_comparison( num_dalys_per_person_year_by_cause.squeeze(), - comparison=0) # sets the comparator to 0 which is the Status Quo scenario + comparison=0) # sets the comparator to 0 which is the Actual scenario ).T ).iloc[0].unstack() num_dalys_averted_per_person_year_by_cause['scenario'] = scenarios.to_list()[1:10] @@ -630,7 +630,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, pd.DataFrame( find_difference_relative_to_comparison( num_dalys_per_person_year_by_cause.squeeze(), - comparison=0, # sets the comparator to 0 which is the Status Quo scenario + comparison=0, # sets the comparator to 0 which is the Actual scenario scaled=True) ).T ).iloc[0].unstack() From 18046f4ea259b8cfd7a634c36a23684ce0a8bda2 Mon Sep 17 00:00:00 2001 From: sm2511 Date: Wed, 10 Jul 2024 10:42:48 +0100 Subject: [PATCH 053/100] get heatmaps for all scenarios --- ...analysis_impact_of_consumable_scenarios.py | 75 ++++++++++--------- 1 file changed, 41 insertions(+), 34 deletions(-) diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py index 2b67f25d9a..fbdd03691d 100644 --- a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py +++ b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py @@ -751,40 +751,47 @@ def get_capacity_used_by_cadre_and_level(_df): programs = pd.read_csv(resourcefilepath / 'healthsystem'/ 'consumables' / "ResourceFile_Consumables_availability_and_usage.csv")[['category', 'item_code', 'module_name']] programs = programs.drop_duplicates('item_code') tlo_availability_df = tlo_availability_df.merge(programs, on = ['item_code'], how = 'left') -
-aggregated_df.loc[aggregated_df.Facility_Level == '1a','Facility_Level'] = 'Health centres' -aggregated_df.loc[aggregated_df.Facility_Level == '1b','Facility_Level'] = 'Hospitals' -heatmap_data = aggregated_df.pivot("category", "Facility_Level", "available_prop") - -# Calculate the aggregate row and column -aggregate_col= aggregated_df.groupby('Facility_Level')['available_prop'].mean() -aggregate_row = aggregated_df.groupby('category')['available_prop'].mean() -overall_aggregate = aggregated_df['available_prop'].mean() - -# Add aggregate row and column -heatmap_data['Average'] = aggregate_row -aggregate_col['Average'] = overall_aggregate -heatmap_data.loc['Average'] = aggregate_col - -# Generate the heatmap -sns.set(font_scale=1.5) -plt.figure(figsize=(10, 8)) -sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) - -# Customize the plot -#plt.title('Consumable availability by Facility Level and Category') -plt.xlabel('Facility Level') -plt.ylabel(f'Disease/Public health \n program') -plt.xticks(rotation=45) -plt.yticks(rotation=0) - -plt.savefig(figurespath /'consumable_availability_heatmap_1a_2.png', dpi=300, bbox_inches='tight') -plt.show() -plt.close() +availability_scenarios = ['available_prop', + 'available_prop_scenario1', 'available_prop_scenario2', + 'available_prop_scenario3', 'available_prop_scenario4', + 'available_prop_scenario5', 'available_prop_scenario6', + 'available_prop_scenario7', 'available_prop_scenario8'] +i = 0 +for avail_scenario in availability_scenarios: + # Generate a heatmap + # Pivot the DataFrame + aggregated_df = tlo_availability_df.groupby(['category', 'Facility_Level'])[avail_scenario].mean().reset_index() + aggregated_df = aggregated_df[aggregated_df.Facility_Level.isin(['1a', '1b'])] + aggregated_df.loc[aggregated_df.Facility_Level == '1a','Facility_Level'] = 'Health centres' + aggregated_df.loc[aggregated_df.Facility_Level == '1b','Facility_Level'] = 'Hospitals' + heatmap_data = aggregated_df.pivot("category", "Facility_Level", avail_scenario) + + # Calculate the aggregate row and column + aggregate_col= aggregated_df.groupby('Facility_Level')[avail_scenario].mean() + aggregate_row = aggregated_df.groupby('category')[avail_scenario].mean() + overall_aggregate = aggregated_df[avail_scenario].mean() + + # Add aggregate row and column + heatmap_data['Average'] = aggregate_row + aggregate_col['Average'] = overall_aggregate + heatmap_data.loc['Average'] = aggregate_col + + # Generate the heatmap + sns.set(font_scale=1.5) + plt.figure(figsize=(10, 8)) + sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) + + # Customize the plot + plt.title(scenarios[i]) + plt.xlabel('Facility Level') + plt.ylabel(f'Disease/Public health \n program') + plt.xticks(rotation=45) + plt.yticks(rotation=0) + + plt.savefig(figurespath /f'consumable_availability_heatmap_1a_2_{avail_scenario}.png', dpi=300, bbox_inches='tight') + plt.show() + plt.close() + i = i + 1 # TODO Justify the focus on levels 1a and 1b - where do HSIs occur?; at what level is there most misallocation within districts # TODO get graphs of percentage of successful HSIs under different scenarios for levels 1a and 1b From 8dad43ab7e59b5704da43ab74be5361d7e5baa5a Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Wed, 4 Sep 2024 19:16:33 +0100 Subject: [PATCH 054/100] update paths from dropbox to sharepoint - note that the location of the files in sharepoint has also 
been changed --- .../consumables_availability_estimation.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py index 3615afd400..a76d87a9d3 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py @@ -30,14 +30,12 @@ from tlo.methods.consumables import check_format_of_consumables_file -# Set local Dropbox source -path_to_dropbox = Path( # <-- point to the TLO dropbox locally - '/Users/sm2511/Dropbox/Thanzi la Onse' - # '/Users/sejjj49/Dropbox/Thanzi la Onse' - # 'C:/Users/tmangal/Dropbox/Thanzi la Onse' +# Set local shared folder source +path_to_share = Path( # <-- point to the shared folder + '/Users/sm2511/Library/CloudStorage/OneDrive-SharedLibraries-ImperialCollegeLondon/TLOModel - WP - Documents/' ) -path_to_files_in_the_tlo_dropbox = path_to_dropbox / "05 - Resources/Module-healthsystem/consumables raw files/" +path_to_files_in_the_tlo_shared_drive = path_to_share / "07 - Data/Consumables data/" # define a timestamp for script outputs timestamp = datetime.datetime.now().strftime("_%Y_%m_%d_%H_%M") @@ -68,7 +66,7 @@ def change_colnames(df, NameChangeList): # Change column names ######################################################################################### # Import 2018 data -lmis_df = pd.read_csv(path_to_files_in_the_tlo_dropbox / 'ResourceFile_LMIS_2018.csv', low_memory=False) +lmis_df = pd.read_csv(path_to_files_in_the_tlo_shared_drive / 'OpenLMIS/2018/ResourceFile_LMIS_2018.csv', low_memory=False) # 1. 
BASIC CLEANING ## # Rename columns @@ -515,7 +513,7 @@ def custom_agg_stkout(x): unmatched_consumables = unmatched_consumables[unmatched_consumables['item_y'].isna()] # ** Extract stock availability data from HHFA and clean data ** -hhfa_df = pd.read_excel(path_to_files_in_the_tlo_dropbox / 'ResourceFile_hhfa_consumables.xlsx', sheet_name='hhfa_data') +hhfa_df = pd.read_excel(path_to_files_in_the_tlo_shared_drive / 'ResourceFile_hhfa_consumables.xlsx', sheet_name='hhfa_data') # Use the ratio of availability rates between levels 1b on one hand and levels 2 and 3 on the other to extrapolate # availability rates for levels 2 and 3 from the HHFA data @@ -541,7 +539,7 @@ def custom_agg_stkout(x): hhfa_df.loc[cond, var] = 1 # Add further assumptions on consumable availability from other sources -assumptions_df = pd.read_excel(open(path_to_files_in_the_tlo_dropbox / 'ResourceFile_hhfa_consumables.xlsx', 'rb'), +assumptions_df = pd.read_excel(open(path_to_files_in_the_tlo_shared_drive / 'ResourceFile_hhfa_consumables.xlsx', 'rb'), sheet_name='availability_assumptions') assumptions_df = assumptions_df[['item_code', 'available_prop_Facility_level_0', 'available_prop_Facility_level_1a', 'available_prop_Facility_level_1b', From d62e086d6f9c2769a69aebc9c35c326b2cc946a3 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Wed, 4 Sep 2024 19:17:10 +0100 Subject: [PATCH 055/100] Clean code for assigning consumable category --- .../consumables_availability_estimation.py | 70 +++++++++++-------- 1 file changed, 41 insertions(+), 29 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py index a76d87a9d3..e8df3319a4 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py @@ -604,35 +604,47 @@ def custom_agg_stkout(x): stkout_df = pd.concat([stkout_df, hhfa_fac0], axis=0, ignore_index=True) # --- 6.4 Generate new category variable for analysis --- # -stkout_df['category'] = stkout_df['module_name'].str.lower() -cond_RH = (stkout_df['category'].str.contains('care_of_women_during_pregnancy')) | \ - (stkout_df['category'].str.contains('labour')) -cond_newborn = (stkout_df['category'].str.contains('newborn')) -cond_childhood = (stkout_df['category'] == 'acute lower respiratory infections') | \ - (stkout_df['category'] == 'measles') | \ - (stkout_df['category'] == 'diarrhoea') -cond_rti = stkout_df['category'] == 'road traffic injuries' -cond_cancer = stkout_df['category'].str.contains('cancer') -cond_ncds = (stkout_df['category'] == 'epilepsy') | \ - (stkout_df['category'] == 'depression') -stkout_df.loc[cond_RH, 'category'] = 'reproductive_health' -stkout_df.loc[cond_cancer, 'category'] = 'cancer' -stkout_df.loc[cond_newborn, 'category'] = 'neonatal_health' -stkout_df.loc[cond_childhood, 'category'] = 'other_childhood_illnesses' -stkout_df.loc[cond_rti, 'category'] = 'road_traffic_injuries' -stkout_df.loc[cond_ncds, 'category'] = 'ncds' - -cond_condom = stkout_df['item_code'] == 2 -stkout_df.loc[cond_condom, 'category'] = 'contraception' - -# Create a general consumables category -general_cons_list = [300, 33, 57, 58, 141, 5, 6, 10, 21, 23, 127, 
24, 80, 93, 144, 149, 154, 40, 67, 73, 76, - 82, 101, 103, 88, 126, 135, 71, 98, 171, 133, 134, 244, 247] -diagnostics_cons_list = [41, 50, 128, 216, 2008, 47, 190, 191, 196, 206, 207, 163, 175, 184, - 187] # for now these have not been applied because most diagnostics are program specific - -cond_general = stkout_df['item_code'].isin(general_cons_list) -stkout_df.loc[cond_general, 'category'] = 'general' +def recategorize_modules_into_consumable_categories(_df): + _df['category'] = _df['module_name'].str.lower() + cond_RH = (_df['category'].str.contains('care_of_women_during_pregnancy')) | \ + (_df['category'].str.contains('labour')) + cond_newborn = (_df['category'].str.contains('newborn')) + cond_newborn[cond_newborn.isna()] = False + cond_childhood = (_df['category'] == 'acute lower respiratory infections') | \ + (_df['category'] == 'measles') | \ + (_df['category'] == 'diarrhoea') + cond_rti = _df['category'] == 'road traffic injuries' + cond_cancer = _df['category'].str.contains('cancer') + cond_cancer[cond_cancer.isna()] = False + cond_ncds = (_df['category'] == 'epilepsy') | \ + (_df['category'] == 'depression') + _df.loc[cond_RH, 'category'] = 'reproductive_health' + _df.loc[cond_cancer, 'category'] = 'cancer' + _df.loc[cond_newborn, 'category'] = 'neonatal_health' + _df.loc[cond_childhood, 'category'] = 'other_childhood_illnesses' + _df.loc[cond_rti, 'category'] = 'road_traffic_injuries' + _df.loc[cond_ncds, 'category'] = 'ncds' + cond_condom = _df['item_code'] == 2 + _df.loc[cond_condom, 'category'] = 'contraception' + + # Create a general consumables category + general_cons_list = [300, 33, 57, 58, 141, 5, 6, 10, 21, 23, 127, 24, 80, 93, 144, 149, 154, 40, 67, 73, 76, + 82, 101, 103, 88, 126, 135, 71, 98, 171, 133, 134, 244, 247, 49, 112, 1933, 1960] + cond_general = _df['item_code'].isin(general_cons_list) + _df.loc[cond_general, 'category'] = 'general' + + # Fill gaps in categories + dict_for_missing_categories = {292: 'acute lower respiratory infections', 293: 'acute lower respiratory infections', + 307: 'reproductive_health', 2019: 'reproductive_health', + 2678: 'tb', 1171: 'other_childhood_illnesses', 1237: 'cancer', 1239: 'cancer'} + # Use map to create a new series from item_code to fill missing values in category + mapped_categories = _df['item_code'].map(dict_for_missing_categories) + # Use fillna on the 'category' column to fill missing values using the mapped_categories + _df['category'] = _df['category'].fillna(mapped_categories) + + return _df + +stkout_df = recategorize_modules_into_consumable_categories(stkout_df) # --- 6.5 Replace district/fac_name/month entries where missing --- # for var in ['district', 'fac_name', 'month']: From fd106ae3fbc4327b043b7152d6dba881b391bb5c Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Thu, 5 Sep 2024 12:05:07 +0100 Subject: [PATCH 056/100] Rename item category column and extract into consumable availability RF --- ...rceFile_Consumables_availability_small.csv | 4 +- .../consumables_availability_estimation.py | 46 ++++++++++--------- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv index 25249531b2..ef72473491 100644 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:c358a643e4def0e574b75f89f83d77f9c3366f668422e005150f4d69ebe8d7a7 -size 6169152 +oid sha256:5c495020b77bf7fcb715ca4253759497c6a7ea84040ef8b262025634d5062263 +size 10070558 diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py index e8df3319a4..db5b2816f6 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py @@ -605,33 +605,33 @@ def custom_agg_stkout(x): # --- 6.4 Generate new category variable for analysis --- # def recategorize_modules_into_consumable_categories(_df): - _df['category'] = _df['module_name'].str.lower() - cond_RH = (_df['category'].str.contains('care_of_women_during_pregnancy')) | \ - (_df['category'].str.contains('labour')) - cond_newborn = (_df['category'].str.contains('newborn')) + _df['item_category'] = _df['module_name'].str.lower() + cond_RH = (_df['item_category'].str.contains('care_of_women_during_pregnancy')) | \ + (_df['item_category'].str.contains('labour')) + cond_newborn = (_df['item_category'].str.contains('newborn')) cond_newborn[cond_newborn.isna()] = False - cond_childhood = (_df['category'] == 'acute lower respiratory infections') | \ - (_df['category'] == 'measles') | \ - (_df['category'] == 'diarrhoea') - cond_rti = _df['category'] == 'road traffic injuries' - cond_cancer = _df['category'].str.contains('cancer') + cond_childhood = (_df['item_category'] == 'acute lower respiratory infections') | \ + (_df['item_category'] == 'measles') | \ + (_df['item_category'] == 'diarrhoea') + cond_rti = _df['item_category'] == 'road traffic injuries' + cond_cancer = _df['item_category'].str.contains('cancer') cond_cancer[cond_cancer.isna()] = False - cond_ncds = (_df['category'] == 'epilepsy') | \ - (_df['category'] == 'depression') - _df.loc[cond_RH, 'category'] = 'reproductive_health' - _df.loc[cond_cancer, 'category'] = 'cancer' - _df.loc[cond_newborn, 'category'] = 'neonatal_health' - _df.loc[cond_childhood, 'category'] = 'other_childhood_illnesses' - _df.loc[cond_rti, 'category'] = 'road_traffic_injuries' - _df.loc[cond_ncds, 'category'] = 'ncds' + cond_ncds = (_df['item_category'] == 'epilepsy') | \ + (_df['item_category'] == 'depression') + _df.loc[cond_RH, 'item_category'] = 'reproductive_health' + _df.loc[cond_cancer, 'item_category'] = 'cancer' + _df.loc[cond_newborn, 'item_category'] = 'neonatal_health' + _df.loc[cond_childhood, 'item_category'] = 'other_childhood_illnesses' + _df.loc[cond_rti, 'item_category'] = 'road_traffic_injuries' + _df.loc[cond_ncds, 'item_category'] = 'ncds' cond_condom = _df['item_code'] == 2 - _df.loc[cond_condom, 'category'] = 'contraception' + _df.loc[cond_condom, 'item_category'] = 'contraception' # Create a general consumables category general_cons_list = [300, 33, 57, 58, 141, 5, 6, 10, 21, 23, 127, 24, 80, 93, 144, 149, 154, 40, 67, 73, 76, 82, 101, 103, 88, 126, 135, 71, 98, 171, 133, 134, 244, 247, 49, 112, 1933, 1960] cond_general = _df['item_code'].isin(general_cons_list) - _df.loc[cond_general, 'category'] = 'general' + _df.loc[cond_general, 'item_category'] = 'general' # Fill gaps in categories 
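# (A hedged mini-example of the map/fillna idiom this block relies on, with toy
#  values: for codes = pd.Series([292, 2, 999]) and cats = pd.Series(['tb', None, None]),
#  cats.fillna(codes.map({292: 'alri', 999: 'cancer'})) gives ['tb', NaN, 'cancer'];
#  existing categories are kept, mapped codes are filled, unmapped codes stay missing.)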
dict_for_missing_categories = {292: 'acute lower respiratory infections', 293: 'acute lower respiratory infections', @@ -639,12 +639,13 @@ def recategorize_modules_into_consumable_categories(_df): 2678: 'tb', 1171: 'other_childhood_illnesses', 1237: 'cancer', 1239: 'cancer'} # Use map to create a new series from item_code to fill missing values in category mapped_categories = _df['item_code'].map(dict_for_missing_categories) - # Use fillna on the 'category' column to fill missing values using the mapped_categories - _df['category'] = _df['category'].fillna(mapped_categories) + # Use fillna on the 'item_category' column to fill missing values using the mapped_categories + _df['item_category'] = _df['item_category'].fillna(mapped_categories) return _df stkout_df = recategorize_modules_into_consumable_categories(stkout_df) +item_code_category_mapping = stkout_df[['item_category', 'item_code']].drop_duplicates() # --- 6.5 Replace district/fac_name/month entries where missing --- # for var in ['district', 'fac_name', 'month']: @@ -835,9 +836,10 @@ def interpolate_missing_with_mean(_ser): # --- Check that the exported file has the properties required of it by the model code. --- # check_format_of_consumables_file(df=full_set_interpolated.reset_index(), fac_ids=fac_ids) +full_set_interpolated = full_set_interpolated.reset_index().merge(item_code_category_mapping, on = 'item_code', how = 'left', validate = 'm:1') # %% # Save -full_set_interpolated.reset_index().to_csv( +full_set_interpolated.to_csv( path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv", index=False ) From 4c2644c0add5fd2567f7070a60da7d05cddf61d9 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Thu, 5 Sep 2024 12:08:30 +0100 Subject: [PATCH 057/100] Add 'item_category' to the function to check format of consumable availability RF --- .../consumables_availability_estimation.py | 5 +++-- src/tlo/methods/consumables.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py index db5b2816f6..bd410488b1 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py @@ -833,10 +833,11 @@ def interpolate_missing_with_mean(_ser): # Check that there are not missing values assert not pd.isnull(full_set_interpolated).any().any() +full_set_interpolated = full_set_interpolated.reset_index().merge(item_code_category_mapping, on = 'item_code', how = 'left', validate = 'm:1') + # --- Check that the exported file has the properties required of it by the model code. 
--- # -check_format_of_consumables_file(df=full_set_interpolated.reset_index(), fac_ids=fac_ids) +check_format_of_consumables_file(df=full_set_interpolated, fac_ids=fac_ids) -full_set_interpolated = full_set_interpolated.reset_index().merge(item_code_category_mapping, on = 'item_code', how = 'left', validate = 'm:1') # %% # Save full_set_interpolated.to_csv( diff --git a/src/tlo/methods/consumables.py b/src/tlo/methods/consumables.py index 674035ad98..1be37c3267 100644 --- a/src/tlo/methods/consumables.py +++ b/src/tlo/methods/consumables.py @@ -266,7 +266,7 @@ def _lookup_availability_of_consumables(self, def on_simulation_end(self): """Do tasks at the end of the simulation. - + Raise warnings and enter to log about item_codes not recognised. """ if self._not_recognised_item_codes: @@ -339,7 +339,7 @@ def check_format_of_consumables_file(df, fac_ids): months = set(range(1, 13)) item_codes = set(df.item_code.unique()) - assert set(df.columns) == {'Facility_ID', 'month', 'item_code', 'available_prop'} + assert set(df.columns) == {'Facility_ID', 'month', 'item_code', 'item_category', 'available_prop'} # Check that all permutations of Facility_ID, month and item_code are present pd.testing.assert_index_equal( From b01904f0e2e7e9f14b651d7a0ec9763892ddfe39 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Thu, 5 Sep 2024 12:13:08 +0100 Subject: [PATCH 058/100] remove reindex() line because this is performed earlier in the script --- .../consumables_availability_estimation.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py index bd410488b1..b779e571df 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py @@ -862,7 +862,7 @@ def interpolate_missing_with_mean(_ser): hhfa_comparison_df = hhfa_comparison_df.rename({'fac_type_tlo': 'Facility_Level'}, axis=1) # ii. 
Collapse final model availability data by facility level -final_availability_df = full_set_interpolated.reset_index() +final_availability_df = full_set_interpolated mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv") final_availability_df = pd.merge(final_availability_df, mfl[['District', 'Facility_Level', 'Facility_ID']], how="left", on=['Facility_ID'], @@ -884,7 +884,6 @@ def interpolate_missing_with_mean(_ser): size = 10 comparison_df['consumable_labels'] = comparison_df['consumable_name_tlo'].str[:10] - # Define function to draw calibration plots at different levels of disaggregation def comparison_plot(level_of_disaggregation, group_by_var, colour): comparison_df_agg = comparison_df.groupby([group_by_var], From 4ab53e343ad0aee4c375dfdbf43036e82687c70c Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Thu, 5 Sep 2024 12:54:46 +0100 Subject: [PATCH 059/100] removed these scripts which are not being used in the TLO model --- .../clean_fac_locations.py | 360 ------------------ .../descriptive_stats.py | 63 --- 2 files changed, 423 deletions(-) delete mode 100644 src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/clean_fac_locations.py delete mode 100644 src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/descriptive_stats.py diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/clean_fac_locations.py b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/clean_fac_locations.py deleted file mode 100644 index 3dcd4fe56e..0000000000 --- a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/clean_fac_locations.py +++ /dev/null @@ -1,360 +0,0 @@ -""" -This script generates GIS data on facilities: - -Outputs: -* ResourceFile_Facility_locations.csv -* facility_distances.csv - -The following variables are added to the dataset generated by consumables_avaialbility_estimation.py: -1. facility GIS coordinates -2. Distance and drive time to corresponding District Health office -3. Distance and drive time to corresponding Regional Medical Store (warehouse) - -Inputs: -Dropbox location - ~05 - Resources/Module-healthsystem/consumables raw files/gis_data/LMISFacilityLocations_raw.xlsx - -NB. The comment of this file are commented-out because the script requires dependencies that are not included in the -TLO framework at the time of writing. 
-""" - - -""" -import datetime -from pathlib import Path - -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -import requests -import googlemaps as gmaps -import requests -from matplotlib.lines import Line2D - - -# Path to TLO directory -outputfilepath = Path("./outputs") -resourcefilepath = Path("./resources") -path_for_new_resourcefiles = resourcefilepath / "healthsystem/consumables" - -# Set local Dropbox source -path_to_dropbox = Path( # <-- point to the TLO dropbox locally - 'C:/Users/sm2511/Dropbox/Thanzi la Onse' - # '/Users/tbh03/Dropbox (SPH Imperial College)/Thanzi la Onse Theme 1 SHARE' -) - -path_to_files_in_the_tlo_dropbox = path_to_dropbox / "05 - Resources/Module-healthsystem/consumables raw files/" - -# define a timestamp for script outputs -timestamp = datetime.datetime.now().strftime("_%Y_%m_%d_%H_%M") - -# print the start time of the script -print('Script Start', datetime.datetime.now().strftime('%H:%M')) - -# Use googlemaps package to obtain GIS coordinates using facility names -GCODE_URL = 'https://maps.googleapis.com/maps/api/geocode/json?' -GCODE_KEY = '' # PLaceholder to enter googlemaps API -# gmaps = gmaps.Client(key=GCODE_KEY) - -# 1. Clean Master Health Facility Registry (MHFR) data -###################################################################### -# Clean locations for facilities for which GIS data was not available on incorrect in the MHFR -# --- 1.1 Load and set up data --- # -fac_gis = pd.read_excel(open(path_to_files_in_the_tlo_dropbox / 'gis_data/LMISFacilityLocations_raw.xlsx', - 'rb'), sheet_name='final_gis_data') -fac_gis = fac_gis.rename( - columns={'LMIS Facility List': 'fac_name', 'OWNERSHIP': 'fac_owner', 'TYPE': 'fac_type', 'STATUS': 'fac_status', - 'ZONE': 'zone', 'DISTRICT': 'district', 'DATE OPENED': 'open_date', 'LATITUDE': 'lat', - 'LONGITUDE': 'long'}) - -# Create a new column providing source of GIS data -fac_gis['gis_source'] = "" - -# Store unique district names -districts = fac_gis['district'].unique() - -# Preserve rows with missing or incorrect location data in order to derive GIS data using googlemaps API -cond1 = fac_gis['lat'] > -8.5 -cond2 = fac_gis['lat'] < -17.5 -cond3 = fac_gis['long'] > 36.5 -cond4 = fac_gis['long'] < 32.5 -conda = cond1 | cond2 | cond3 | cond4 # outside Malawi's boundaries -fac_gis_noloc = fac_gis[fac_gis.lat.isna() | conda] -fac_gis_noloc = fac_gis_noloc.reset_index() -fac_gis_noloc = fac_gis_noloc.drop(columns='index') - -# Edit data source -cond_originalmhfr = fac_gis.lat.notna() & ~conda -fac_gis.loc[cond_originalmhfr, 'gis_source'] = 'Master Health Facility Registry' -cond_manual = fac_gis['manual_entry'].notna() -fac_gis.loc[cond_manual, 'gis_source'] = 'Manual google search' - -fac_gis_clean = fac_gis[~conda & fac_gis.lat.notna()] # save clean portion of raw data to be appended later - - -# --- 1.2 Geocode facilities with missing data --- # -# Define a function to geocode locations based on names -def reverse_gcode(location): - location = str(location).replace(' ', '+') - nav_req = 'address={}&key={}'.format(location, GCODE_KEY) - request = GCODE_URL + nav_req - result = requests.get(request) - data = result.json() - status = data['status'] - - geo_location = {} - if str(status) == "OK": - sizeofjson = len(data['results'][0]['address_components']) - for i in range(sizeofjson): - sizeoftype = len(data['results'][0]['address_components'][i]['types']) - if sizeoftype == 3: - geo_location[data['results'][0]['address_components'][i]['types'][2]] = \ - 
data['results'][0]['address_components'][i]['long_name'] - - else: - if data['results'][0]['address_components'][i]['types'][0] == 'administrative_area_level_1': - geo_location['state'] = data['results'][0]['address_components'][i]['long_name'] - - elif data['results'][0]['address_components'][i]['types'][0] == 'administrative_area_level_2': - geo_location['city'] = data['results'][0]['address_components'][i]['long_name'] - geo_location['town'] = geo_location['city'] - - else: - geo_location[data['results'][0]['address_components'][i]['types'][0]] = \ - data['results'][0]['address_components'][i]['long_name'] - - formatted_address = data['results'][0]['formatted_address'] - geo_location['lat'] = data['results'][0]['geometry']['location']['lat'] - geo_location['lang'] = data['results'][0]['geometry']['location']['lng'] - geo_location['formatted_address'] = formatted_address - - return geo_location - - -# Extract latitude, longitude and city based on facility name -for i in range(len(fac_gis_noloc)): - try: - try: - try: - print("Processing facility", fac_gis_noloc['fac_name'][i]) - geo_info = reverse_gcode(fac_gis_noloc['fac_name'][i] + 'Malawi') - fac_gis_noloc['lat'][i] = geo_info['lat'] - fac_gis_noloc['long'][i] = geo_info['lang'] - fac_gis_noloc['gis_source'][i] = 'Google maps geolocation' - fac_gis_noloc['district'][i] = geo_info['city'] - except ValueError: - pass - except TypeError: - pass - except KeyError: - pass - -# Drop incorrect GIS coordinates from the above generated dataset -conda = fac_gis_noloc.district.isin(districts) # districts not from Malawi -cond1 = fac_gis_noloc['lat'] > -8.5 -cond2 = fac_gis_noloc['lat'] < -17.5 -cond3 = fac_gis_noloc['long'] > 36.5 -cond4 = fac_gis_noloc['long'] < 32.5 -condb = cond1 | cond2 | cond3 | cond4 # outside Malawi's boundaries -fac_gis_noloc.loc[~conda | condb, 'lat'] = np.nan -fac_gis_noloc.loc[~conda | condb, 'long'] = np.nan -fac_gis_noloc.loc[~conda | condb, 'district'] = np.nan - -cond = fac_gis_noloc.gis_source.isna() -fac_gis_noloc.loc[cond, 'lat'] = np.nan -fac_gis_noloc.loc[cond, 'long'] = np.nan - -# Append newly generated GIS information to the raw data -fac_gis = fac_gis_noloc.append(fac_gis_clean) - -# Drop incorrect GIS coordinates based on later comparison with district data from LMIS -list_of_incorrect_locations = ['Bilal Clinic', 'Biliwiri Health Centre', 'Chilonga Health care Health Centre', - 'Diamphwi Health Centre', 'Matope Health Centre (CHAM)', 'Nambazo Health Centre', - 'Nkhwayi Health Centre', 'Nsambe Health Centre (CHAM)', 'Padley Pio Health Centre', - 'Phanga Health Centre', 'Somba Clinic', "St. Martin's Molere Health Centre CHAM", - 'Ngapani Clinic', 'Mulungu Alinafe Clinic', 'Mdeza Health Centre', - 'Matandani Health Centre (CHAM)', - 'Sunrise Clinic', 'Sucoma Clinic'] -mapped_to_malawi = fac_gis.lat == -13.254308 -cond = fac_gis.fac_name.isin(list_of_incorrect_locations) | mapped_to_malawi -fac_gis.loc[cond, 'lat'] = np.nan -fac_gis.loc[cond, 'long'] = np.nan -fac_gis.loc[cond, 'gis_source'] = np.nan -fac_gis.loc[cond, 'district'] = np.nan - -# 2. 
Clean data using information from LMIS # -##################################################################################################### -# --- 2.1 Load and set up LMIS data --- # -stkout_df = pd.read_csv(path_for_new_resourcefiles / "ResourceFile_Consumables_availability_and_usage.csv", - low_memory=False) - -# Drop rows which can't be used in regression analysis -regsubset_cond1 = stkout_df['data_source'] == 'original_lmis_data' -regsubset_cond2 = stkout_df[ - 'fac_type_tlo'] == 'Facility_level_0' # since only one facility from Mchinji reported in OpenLMIS -stkout_df_reg = stkout_df[regsubset_cond1 & ~regsubset_cond2] - -# Clean some district names to match with master health facility registry -rename_districts = { - 'Nkhota Kota': 'Nkhotakota', - 'Nkhata bay': 'Nkhata Bay' -} -stkout_df['district'] = stkout_df['district'].replace(rename_districts) - -# Keep only relevant columns -lmis_district = stkout_df[['fac_name', 'fac_type_tlo', 'district']] -lmis_district = lmis_district.drop_duplicates() - -# --- 2.2 Clean district column and assign relevant DHO to each facility --- # -# Manual fixes before assigning DHO -# Master Health facility registry did not differentiate between Mzimba North and Mzimba South --> get this data -# and any other district discrepancies from LMIS -fac_gis = fac_gis.rename(columns={'district': 'district_mhfr'}) -fac_gis = pd.merge(fac_gis, lmis_district, how='left', on='fac_name') - -list_mhfr_district_is_correct = ['Chididi Health Centre', 'Chikowa Health Centre', - 'Chileka Health Centre'] -cond_mhfr_district_is_correct = fac_gis.fac_name.isin(list_mhfr_district_is_correct) -cond_lmis_district_missing = fac_gis.district.isna() -fac_gis.loc[cond_mhfr_district_is_correct | cond_lmis_district_missing, 'district'] = fac_gis.district_mhfr -fac_gis = fac_gis.drop(columns=['zone', 'district_mhfr', 'open_date', 'manual_entry']) - -# --- 1.3 Extract final file with GIS locations into .csv --- # -fac_gis = fac_gis[fac_gis['lat'].notna()] # Keep rows with GIS locations -fac_gis.to_csv(path_for_new_resourcefiles / "ResourceFile_Facility_locations.csv") - -# Locate the corresponding DHO for each facility -cond1 = fac_gis['fac_name'].str.contains('DHO') -cond2 = fac_gis['fac_name'].str.contains('istrict') -# Create columns indicating the coordinates of the corresponding DHO for each facility -dho_df = fac_gis[cond1 | cond2].reset_index() -# Rename columns -dho_df = dho_df.rename(columns={'lat': 'lat_dh', 'long': 'long_dh'}) - -# Merge main GIS dataframe with corresponding DHO -fac_gis = pd.merge(fac_gis, dho_df[['district', 'lat_dh', 'long_dh']], how='left', on='district') - -# --- 2.3 Assign relevant CMST Regional Medical Store to each facility --- # -# Create columns indicating the coordinates of the corresponding CMST warehouse (regional medical store) for each -# facility -fac_gis['lat_rms'] = np.nan -fac_gis['long_rms'] = np.nan -fac_gis['rms'] = np.nan - -# RMS Center (-13.980394, 33.783521) -cond_center1 = fac_gis['district'].isin(['Kasungu', 'Ntchisi', 'Dowa', 'Mchinji', 'Lilongwe', 'Ntcheu', - 'Dedza', 'Nkhotakota', 'Salima']) -cond_center2 = fac_gis['fac_name'].str.contains('Kamuzu Central Hospital') -fac_gis.loc[cond_center1 | cond_center2, 'lat_rms'] = -13.980394 -fac_gis.loc[cond_center1 | cond_center2, 'long_rms'] = 33.783521 -fac_gis.loc[cond_center1 | cond_center2, 'rms'] = 'RMS Center' - -# RMS North (-11.425590, 33.997467) -cond_north1 = fac_gis['district'].isin(['Nkhata Bay', 'Rumphi', 'Chitipa', 'Likoma', 'Karonga', - 'Mzimba North', 'Mzimba 
South']) -cond_north2 = fac_gis['fac_name'].str.contains('Mzuzu Central Hospital') -fac_gis.loc[cond_north1 | cond_north2, 'lat_rms'] = -11.425590 -fac_gis.loc[cond_north1 | cond_north2, 'long_rms'] = 33.997467 -fac_gis.loc[cond_north1 | cond_north2, 'rms'] = 'RMS North' - -# RMS South (-15.804544, 35.021192) -cond_south1 = fac_gis['district'].isin(['Blantyre', 'Balaka', 'Machinga', 'Zomba', 'Mangochi', 'Thyolo', 'Nsanje', - 'Chikwawa', 'Mwanza', 'Neno', 'Mulanje', 'Phalombe', 'Chiradzulu']) -cond_south2 = fac_gis['fac_name'].str.contains('Queen Elizabeth Central') -cond_south3 = fac_gis['fac_name'].str.contains('Zomba Central') -cond_south4 = fac_gis['fac_name'].str.contains('Zomba Mental') -fac_gis.loc[cond_south1 | cond_south2 | cond_south3 | cond_south4, 'lat_rms'] = -15.804544 -fac_gis.loc[cond_south1 | cond_south2 | cond_south3 | cond_south4, 'long_rms'] = 35.021192 -fac_gis.loc[cond_south1 | cond_south2 | cond_south3 | cond_south4, 'rms'] = 'RMS South' -fac_gis['district'].unique() - -# 3. Generate data on distance and travel time between facilities and DHO/RMS # -##################################################################################################### -# --- 3.1 Distance and travel time of each facility from the corresponding DHO --- # -fac_gis['dist_todh'] = np.nan -fac_gis['drivetime_todh'] = np.nan -for i in range(len(fac_gis)): - try: - # print("Processing facility", i) - latfac = fac_gis['lat'][i] - longfac = fac_gis['long'][i] - latdho = fac_gis['lat_dh'][i] - longdho = fac_gis['long_dh'][i] - origin = (latdho, longdho) - dest = (latfac, longfac) - - fac_gis['dist_todh'][i] = \ - gmaps.distance_matrix(origin, dest, mode='driving')['rows'][0]['elements'][0]['distance']['value'] - fac_gis['drivetime_todh'][i] = \ - gmaps.distance_matrix(origin, dest, mode='driving')['rows'][0]['elements'][0]['duration']['value'] - except: - pass - -# --- 3.2 Distance and travel time of each facility from the corresponding RMS --- # -fac_gis['dist_torms'] = np.nan -fac_gis['drivetime_torms'] = np.nan -for i in range(len(fac_gis)): - try: - # print("Processing facility", i) - latfac = fac_gis['lat'][i] - longfac = fac_gis['long'][i] - latdho = fac_gis['lat_rms'][i] - longdho = fac_gis['long_rms'][i] - origin = (latdho, longdho) - dest = (latfac, longfac) - - fac_gis['dist_torms'][i] = \ - gmaps.distance_matrix(origin, dest, mode='driving')['rows'][0]['elements'][0]['distance']['value'] - fac_gis['drivetime_torms'][i] = \ - gmaps.distance_matrix(origin, dest, mode='driving')['rows'][0]['elements'][0]['duration']['value'] - except: - pass - -# Update distance values from DH to 0 for levels 2 and above -cond1 = fac_gis['fac_type_tlo'] == 'Facility_level_2' -cond2 = fac_gis['fac_type_tlo'] == 'Facility_level_3' -cond3 = fac_gis['fac_type_tlo'] == 'Facility_level_4' -fac_gis.loc[cond1 | cond2 | cond3, 'dist_todh'] = 0 -fac_gis.loc[cond1 | cond2 | cond3, 'drivetime_todh'] = 0 - -# 4. Save data to be merge into Consumable availabilty dataset for regression analysis # -##################################################################################################### -# Keep only necessary columns and save as .csv -fac_gis = fac_gis[['district', 'rms', 'lat', 'long', 'lat_dh', 'long_dh', 'lat_rms', 'long_rms', - 'dist_torms', 'drivetime_torms', 'dist_todh', 'drivetime_todh', 'fac_name', 'gis_source']] - -# - 1.2.5 Export distances file to dropbox - # -fac_gis.to_csv(path_to_files_in_the_tlo_dropbox / 'gis_data/facility_distances.csv') - -# 5. 
Descriptive graphs # -##################################################################################################### -groups = fac_gis.groupby('district') - -# Scatterplot of distance and drive time to DHO -fig, ax = plt.subplots() -ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling -for name, group in groups: - ax.plot(group.dist_todh / 1000, group.drivetime_todh, marker='o', linestyle='', ms=5, label=name) -# Shrink current axis by 20% to fit legend -box = ax.get_position() -ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) -# Put a legend to the right of the current axis -ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) -plt.xlabel("Distance (kilometers)", fontsize=12) -plt.ylabel("Drive time (minutes)", fontsize=12) -plt.savefig('C:/Users/sm2511/OneDrive - University of York/Desktop/faclocation_wrtdh_new.png') - -# Scatterplot of distance and drive time to RMS -groups = fac_gis.groupby('rms') -fig, ax = plt.subplots() -ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling -for name, group in groups: - ax.plot(group.dist_torms / 1000, group.drivetime_torms, marker='o', linestyle='', ms=5, label=name) -# Shrink current axis by 20% to fit legend -box = ax.get_position() -ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) -# Put a legend to the right of the current axis -ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) -plt.xlabel("Distance (kilometers)", fontsize=12) -plt.ylabel("Drive time (minutes)", fontsize=12) -plt.savefig('C:/Users/sm2511/OneDrive - University of York/Desktop/faclocation_wrtrms.png') -""" diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/descriptive_stats.py b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/descriptive_stats.py deleted file mode 100644 index fc5c775bce..0000000000 --- a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/descriptive_stats.py +++ /dev/null @@ -1,63 +0,0 @@ -""" -This script generates the consumables availability dataset for regression analysis using the outputs of - -consumables_availability_estimation.py and clean_fac_locations.py - -and generates descriptive figures and tables. -""" -import datetime -from pathlib import Path - -import pandas as pd - -# import numpy as np -# import calendar -# import copy -# import matplotlib.pyplot as plt -# from matplotlib.lines import Line2D -# from matplotlib import pyplot # for figures -# import seaborn as sns -# import math - -# Path to TLO directory -outputfilepath = Path("./outputs") -resourcefilepath = Path("./resources") -path_for_new_resourcefiles = resourcefilepath / "healthsystem/consumables" - -# Set local Dropbox source -path_to_dropbox = Path( # <-- point to the TLO dropbox locally - 'C:/Users/sm2511/Dropbox/Thanzi la Onse' -) - -path_to_files_in_the_tlo_dropbox = path_to_dropbox / "05 - Resources/Module-healthsystem/consumables raw files/" - -# define a timestamp for script outputs -timestamp = datetime.datetime.now().strftime("_%Y_%m_%d_%H_%M") - -# print the start time of the script -print('Script Start', datetime.datetime.now().strftime('%H:%M')) - -# 1. 
DATA IMPORT AND CLEANING # -######################################################################################### -# --- 1.1 Import consumables availability data --- # -stkout_df = pd.read_csv(path_for_new_resourcefiles / "ResourceFile_Consumables_availability_and_usage.csv", - low_memory=False) - -# Drop rows which can't be used in regression analysis -regsubset_cond1 = stkout_df['data_source'] == 'original_lmis_data' -regsubset_cond2 = stkout_df[ - 'fac_type_tlo'] == 'Facility_level_0' # since only one facility from Mchinji reported in OpenLMIS -stkout_df_reg = stkout_df[regsubset_cond1 & ~regsubset_cond2] - -# Clean some district names to match with master health facility registry -rename_districts = { - 'Nkhota Kota': 'Nkhotakota', - 'Nkhata bay': 'Nkhata Bay' -} -stkout_df['district'] = stkout_df['district'].replace(rename_districts) - -# --- 1.2 Import GIS data --- # -fac_gis = pd.read_csv(path_to_files_in_the_tlo_dropbox / "gis_data/facility_distances.csv") - -# --- 1.3 Merge cleaned LMIS data with GIS data --- # -consumables_df = pd.merge(stkout_df.drop(columns=['district', 'Unnamed: 0']), fac_gis.drop(columns=['Unnamed: 0']), - how='left', on='fac_name') -consumables_df.to_csv(path_to_files_in_the_tlo_dropbox / 'consumables_df.csv') From 09d8bb2b47da5dfc1377d649c1a23a2764e0940a Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Thu, 5 Sep 2024 14:20:29 +0100 Subject: [PATCH 060/100] Drop columns not needed from RF_items_and_packages - the units per case are now hard coded in the script - the unit cost is in ResourceFile_costing.xlsx --- ...rceFile_Consumables_Items_and_Packages.csv | 4 +- ...ate_consumables_item_codes_and_packages.py | 72 ++++++------------- 2 files changed, 22 insertions(+), 54 deletions(-) diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_Items_and_Packages.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_Items_and_Packages.csv index 0ee403abb0..f3589757c7 100644 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_Items_and_Packages.csv +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_Items_and_Packages.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4106c2e3ae068d40b115857885b673bec3e1114be5183c0a4ae0366560e2a5c9 -size 249391 +oid sha256:596a1bc8d570f341da180fea6db1836c181f6a2a984a9c7f9b4990b78df8e689 +size 215244 diff --git a/src/scripts/data_file_processing/healthsystem/consumables/processing_data_from_one_health/generate_consumables_item_codes_and_packages.py b/src/scripts/data_file_processing/healthsystem/consumables/processing_data_from_one_health/generate_consumables_item_codes_and_packages.py index 7ca04f763f..db6af01154 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/processing_data_from_one_health/generate_consumables_item_codes_and_packages.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/processing_data_from_one_health/generate_consumables_item_codes_and_packages.py @@ -19,22 +19,20 @@ import numpy as np import pandas as pd -# Set local Dropbox source -path_to_dropbox = Path( # <-- point to the TLO dropbox locally - # '/Users/tbh03/Dropbox (SPH Imperial College)/Thanzi la Onse Theme 1 SHARE' - '/Users/sm2511/Dropbox/Thanzi La Onse') +# Set local shared folder source +path_to_share = Path( # <-- point to the shared folder + '/Users/sm2511/Library/CloudStorage/OneDrive-SharedLibraries-ImperialCollegeLondon/TLOModel - WP - Documents/' +) resourcefilepath = Path("./resources") path_for_new_resourcefiles = 
resourcefilepath / "healthsystem/consumables" # EHP Consumables list -path_to_files_in_the_tlo_dropbox = path_to_dropbox / "05 - Resources/Module-healthsystem/consumables raw files/" - -workingfile_ehp_consumables = path_to_dropbox / \ +workingfile_ehp_consumables = path_to_share / \ "05 - Resources/Module-healthsystem/From Matthias Arnold/ORIGINAL_Intervention input.xlsx" -workingfile_one_health = path_to_dropbox / \ +workingfile_one_health = path_to_share / \ "07 - Data/OneHealth projection files/OneHealth commodities.xlsx" @@ -157,7 +155,8 @@ wb = wb.merge(intv_codes, on='Intervention_Pkg', how='left', indicator=True) assert (wb['_merge'] == 'both').all() -wb = wb.drop(columns='_merge') +wb = wb.drop(columns=['_merge','Expected_Units_Per_Case', + 'Unit_Cost']) # Assign a unique code for each individual consumable item unique_items = pd.unique(wb['Items']) @@ -172,9 +171,7 @@ 'Intervention_Pkg', 'Intervention_Pkg_Code', 'Items', - 'Item_Code', - 'Expected_Units_Per_Case', - 'Unit_Cost']] + 'Item_Code']] assert not pd.isnull(wb).any().any() @@ -215,7 +212,7 @@ only_in_oh['Intervention_Pkg'] = 'Misc' only_in_oh['Intervention_Pkg_Code'] = -99 only_in_oh['Item_Code'] = np.arange(1000, 1000 + len(only_in_oh)) -only_in_oh['Expected_Units_Per_Case'] = 1.0 +only_in_oh = only_in_oh.drop(columns = 'Unit_Cost') assert set(only_in_oh.columns) == set(wb.columns) @@ -249,8 +246,6 @@ def add_record(df: pd.DataFrame, record: Dict): assert set(df.columns) == set(record.keys()) return pd.concat([df, pd.DataFrame.from_records([record])], ignore_index=True) - - cons = add_record( cons, { @@ -258,10 +253,7 @@ def add_record(df: pd.DataFrame, record: Dict): 'Intervention_Pkg': "Misc", 'Intervention_Pkg_Code': -99, 'Items': "Forceps, obstetric", - 'Item_Code': 2669, - 'Expected_Units_Per_Case': 1.0, - 'Unit_Cost': 1.0 - } + 'Item_Code': 2669} ) cons = add_record( @@ -271,10 +263,7 @@ def add_record(df: pd.DataFrame, record: Dict): 'Intervention_Pkg': "Misc", 'Intervention_Pkg_Code': -99, 'Items': "Vacuum, obstetric", - 'Item_Code': 2670, - 'Expected_Units_Per_Case': 1.0, - 'Unit_Cost': 1.0 - }, + 'Item_Code': 2670}, ) cons = add_record( @@ -284,10 +273,7 @@ def add_record(df: pd.DataFrame, record: Dict): 'Intervention_Pkg': "Misc", 'Intervention_Pkg_Code': -99, 'Items': "First-line ART regimen: adult", - 'Item_Code': 2671, - 'Expected_Units_Per_Case': 1.0, - 'Unit_Cost': 1.0 - }, + 'Item_Code': 2671}, ) cons = add_record( @@ -297,10 +283,7 @@ def add_record(df: pd.DataFrame, record: Dict): 'Intervention_Pkg': "Misc", 'Intervention_Pkg_Code': -99, 'Items': "First line ART regimen: older child", - 'Item_Code': 2672, - 'Expected_Units_Per_Case': 1.0, - 'Unit_Cost': 1.0 - }, + 'Item_Code': 2672}, ) cons = add_record( @@ -310,10 +293,7 @@ def add_record(df: pd.DataFrame, record: Dict): 'Intervention_Pkg': "Misc", 'Intervention_Pkg_Code': -99, 'Items': "First line ART regimen: young child", - 'Item_Code': 2673, - 'Expected_Units_Per_Case': 1.0, - 'Unit_Cost': 1.0 - }, + 'Item_Code': 2673}, ) cons = add_record( @@ -323,10 +303,7 @@ def add_record(df: pd.DataFrame, record: Dict): 'Intervention_Pkg': "Misc", 'Intervention_Pkg_Code': -99, 'Items': "Pre-exposure prophlaxis for HIV", - 'Item_Code': 2674, - 'Expected_Units_Per_Case': 1.0, - 'Unit_Cost': 1.0 - }, + 'Item_Code': 2674}, ) cons = add_record( @@ -336,10 +313,7 @@ def add_record(df: pd.DataFrame, record: Dict): 'Intervention_Pkg': "Isoniazid preventative therapy for HIV+ no TB", 'Intervention_Pkg_Code': 82, 'Items': "Isoniazid/Rifapentine", - 'Item_Code': 2678, 
- 'Expected_Units_Per_Case': 1.0, - 'Unit_Cost': 1.0 - }, + 'Item_Code': 2678}, ) cons = add_record( @@ -349,9 +323,7 @@ def add_record(df: pd.DataFrame, record: Dict): 'Intervention_Pkg': "Misc", 'Intervention_Pkg_Code': -99, 'Items': "Cystoscope", - 'Item_Code': 285, - 'Expected_Units_Per_Case': 1.0, - 'Unit_Cost': np.nan}, + 'Item_Code': 285}, ) cons = add_record( @@ -360,9 +332,7 @@ def add_record(df: pd.DataFrame, record: Dict): 'Intervention_Pkg': "Misc", 'Intervention_Pkg_Code': -99, 'Items': "Endoscope", - 'Item_Code': 280, - 'Expected_Units_Per_Case': 1.0, - 'Unit_Cost': np.nan}, + 'Item_Code': 280}, ) cons = add_record( @@ -371,9 +341,7 @@ def add_record(df: pd.DataFrame, record: Dict): 'Intervention_Pkg': "Misc", 'Intervention_Pkg_Code': -99, 'Items': "Prostate specific antigen test", - 'Item_Code': 281, - 'Expected_Units_Per_Case': 1.0, - 'Unit_Cost': np.nan}, + 'Item_Code': 281}, ) From 15aeb98b95c591081fd44e89504d67f0bb5aaaf9 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Thu, 5 Sep 2024 16:11:16 +0100 Subject: [PATCH 061/100] load 'item_category' directly from the small consumables RF - this is due to changes in PR #1458 --- ...e_availability_scenarios_for_impact_analysis.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index bc05f733e0..ae8f3ec405 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -32,11 +32,6 @@ from tlo.methods.consumables import check_format_of_consumables_file -# Set local Dropbox source -path_to_dropbox = Path( # <-- point to the TLO dropbox locally - '/Users/sm2511/Dropbox/Thanzi la Onse' -) - # define a timestamp for script outputs timestamp = datetime.datetime.now().strftime("_%Y_%m_%d_%H_%M") @@ -55,7 +50,7 @@ #------------------------------------------------------ tlo_availability_df = pd.read_csv(path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv") # Drop any scenario data previously included in the resourcefile -tlo_availability_df = tlo_availability_df[['Facility_ID', 'month', 'item_code', 'available_prop']] +tlo_availability_df = tlo_availability_df[['Facility_ID', 'month', 'item_category', 'item_code', 'available_prop']] # 1.1.1 Attach district, facility level, program to this dataset #---------------------------------------------------------------- @@ -66,13 +61,6 @@ tlo_availability_df = tlo_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], on = ['Facility_ID'], how='left') -# 1.1.2 Attach programs -programs = pd.read_csv(path_for_new_resourcefiles / "ResourceFile_Consumables_availability_and_usage.csv")[['category', 'item_code', 'module_name']] -# TODO See if programs can be extracted from a different location as ResourceFile_Consumables_availability_and_usage.csv is now deprecated in master -programs = programs.drop_duplicates('item_code') -# manually add category for the two consumables for which it is missing -tlo_availability_df = 
tlo_availability_df.merge(programs, on = ['item_code'], how = 'left') - # 1.2 Import scenario data #------------------------------------------------------ scenario_availability_df = pd.read_csv(outputfilepath / "regression_analysis/predictions/predicted_consumable_availability_regression_scenarios.csv") From e51384a6fc0c1214ffc256b6d0a32f5163eb398e Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Thu, 5 Sep 2024 17:45:15 +0100 Subject: [PATCH 062/100] update df name for readability --- ...ilability_scenarios_for_impact_analysis.py | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index ae8f3ec405..557a79c576 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -106,20 +106,22 @@ scenario_availability_df = scenario_availability_df.rename({'_merge': 'merge_facid'}, axis=1) # Extract list of District X Facility Level combinations for which there is no HHFA data -scenario_availability_df_test = scenario_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], +df_to_check_prediction_completeness = scenario_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], left_on=['District', 'Facility_Level'], right_on=['District', 'Facility_Level'], how='right', indicator=True) -cond_no_1b = (scenario_availability_df_test['Facility_Level'].isin(['1b'])) & (scenario_availability_df_test['_merge'] == 'right_only') -cond_no_1a = (scenario_availability_df_test['Facility_Level'].isin(['1a'])) & (scenario_availability_df_test['_merge'] == 'right_only') -districts_with_no_scenario_data_for_1b = scenario_availability_df_test[cond_no_1b]['District'].unique() -districts_with_no_scenario_data_for_1a = scenario_availability_df_test[cond_no_1a]['District'].unique() +cond_no_1b = (df_to_check_prediction_completeness['Facility_Level'].isin(['1b'])) & (df_to_check_prediction_completeness['_merge'] == 'right_only') +cond_no_1a = (df_to_check_prediction_completeness['Facility_Level'].isin(['1a'])) & (df_to_check_prediction_completeness['_merge'] == 'right_only') +districts_with_no_scenario_data_for_1b = df_to_check_prediction_completeness[cond_no_1b]['District'].unique() +districts_with_no_scenario_data_for_1a = df_to_check_prediction_completeness[cond_no_1a]['District'].unique() districts_with_no_scenario_data_for_1b_only = np.setdiff1d(districts_with_no_scenario_data_for_1b, districts_with_no_scenario_data_for_1a) # According to HHFA data, Balaka, Machinga, Mwanza, Ntchisi and Salima do not have level 1b facilities -# Likoma was not included in the regression because of the limited variation within the district - only 4 facilities +# Likoma was not included in the regression because of the limited variation within the district - only 4 facilities - we have assumed that the change of consumable +# availability in Likoma is equal to that predicted for Nkhata Bay # 1.2.4 Program #------------------------------------------------------ 
+scenario_availability_df.loc[scenario_availability_df.program_plot == 'infection_prev', 'program_plot'] = 'general' # there is no separate infection_prevention category in the TLO availability data map_model_programs_to_hhfa = { 'contraception': 'contraception', 'general': 'general', @@ -136,23 +138,23 @@ 'cardiometabolicdisorders': 'ncds', 'cancer': 'ncds', } -# TODO Check if the above mapping is correct -# TODO collapse infection_prev and general in the HHFA-based predicted dataframe - -scenario_availability_df['category_tlo'] = scenario_availability_df['program_plot'].replace(map_model_programs_to_hhfa) +scenario_availability_df['category_tlo'] = scenario_availability_df['program_plot'].replace(map_model_programs_to_hhfa) # TODO this does not work # 1.2.5 Consumable/Item code and Category #------------------------------------------------------ # Load TLO - HHFA consumable name crosswalk consumable_crosswalk_df = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_consumables_matched.csv', encoding='ISO-8859-1')[['module_name', 'item_code', 'consumable_name_tlo', 'item_code_hhfa', 'item_hhfa', 'regression_application', 'notes_on_regression_application']] + +# Keep only item_codes in the availability dataframe +consumable_crosswalk_df = consumable_crosswalk_df.merge(tlo_availability_df[['item_code']], how = 'right', on = 'item_code') # TODO Check that this crosswalk is complete # TODO is module_name used? # TODO add new consumables Rifapentine to this? # Now merge in TLO item codes scenario_availability_df = scenario_availability_df.reset_index(drop = True) -scenario_availability_df = scenario_availability_df.merge(consumable_crosswalk_df[['item_code', 'item_hhfa', 'regression_application', 'module_name']], +scenario_availability_df = scenario_availability_df.merge(consumable_crosswalk_df[['item_code', 'item_hhfa', 'regression_application', 'module_name', 'consumable_name_tlo']], on = ['item_hhfa'], how='right', indicator=True, validate = "m:m") scenario_availability_df = scenario_availability_df.drop_duplicates(['Facility_ID', 'item_code']) scenario_availability_df = scenario_availability_df.rename({'_merge': 'merge_itemcode'}, axis=1) From 21fd8a8e12bc24ea43c6374d675ec67f146ed415 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Fri, 6 Sep 2024 16:44:35 +0100 Subject: [PATCH 063/100] add HHFA proxies for two more TLO model consumables + update column names and script for clarity --- .../ResourceFile_consumables_matched.csv | 4 +- ...ilability_scenarios_for_impact_analysis.py | 75 ++++++++----------- 2 files changed, 34 insertions(+), 45 deletions(-) diff --git a/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv b/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv index d7798357fc..2193096eed 100644 --- a/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv +++ b/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:66b68e216701f841382e3add63305d55e1598862076789352ac7d799100e6d1a -size 110047 +oid sha256:feed2010d40e0a6f553ebae084543745ed2177640c47d278d186884346c7649a +size 122886 diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index 557a79c576..5aef416f34 100644 
--- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -51,8 +51,9 @@ tlo_availability_df = pd.read_csv(path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv") # Drop any scenario data previously included in the resourcefile tlo_availability_df = tlo_availability_df[['Facility_ID', 'month', 'item_category', 'item_code', 'available_prop']] +program_item_mapping = tlo_availability_df[['item_category', 'item_code']].drop_duplicates() -# 1.1.1 Attach district, facility level, program to this dataset +# 1.1.1 Attach district and facility level to this dataset #---------------------------------------------------------------- # Get TLO Facility_ID for each district and facility level mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv") @@ -74,7 +75,7 @@ # 1.2.2 District #------------------------------------------------------ -# Do some mapping to make the Districts line-up with the definition of Districts in the model +# Do some mapping to make the Districts in the scenario file line-up with the definition of Districts in the model rename_and_collapse_to_model_districts = { 'Mzimba South': 'Mzimba', 'Mzimba North': 'Mzimba', @@ -143,19 +144,18 @@ # 1.2.5 Consumable/Item code and Category #------------------------------------------------------ # Load TLO - HHFA consumable name crosswalk -consumable_crosswalk_df = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_consumables_matched.csv', encoding='ISO-8859-1')[['module_name', 'item_code', 'consumable_name_tlo', -'item_code_hhfa', 'item_hhfa', 'regression_application', 'notes_on_regression_application']] +consumable_crosswalk_df = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_consumables_matched.csv', encoding='ISO-8859-1')[['item_code', 'consumable_name_tlo', +'item_hhfa_for_scenario_generation', 'hhfa_mapping_rationale_for_scenario_generation']] # Keep only item_codes in the availability dataframe consumable_crosswalk_df = consumable_crosswalk_df.merge(tlo_availability_df[['item_code']], how = 'right', on = 'item_code') -# TODO Check that this crosswalk is complete # TODO is module_name used? # TODO add new consumables Rifapentine to this? 
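# A minimal sketch of the indicator-merge audit used just below, with made-up item codes
# (tlo_items/crosswalk are illustrative names, not objects from this script): after a
# right-merge with indicator=True, rows tagged 'right_only' in the _merge column are the
# TLO item codes for which the crosswalk offers no HHFA proxy.
import pandas as pd
tlo_items = pd.DataFrame({'item_code': [1, 2, 3]})
crosswalk = pd.DataFrame({'item_hhfa': ['x', 'y'], 'item_code': [1, 2]})
audit = crosswalk.merge(tlo_items, on='item_code', how='right', indicator=True)
print((audit['_merge'] == 'right_only').sum())  # -> 1: item_code 3 has no HHFA proxy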
# Now merge in TLO item codes scenario_availability_df = scenario_availability_df.reset_index(drop = True) -scenario_availability_df = scenario_availability_df.merge(consumable_crosswalk_df[['item_code', 'item_hhfa', 'regression_application', 'module_name', 'consumable_name_tlo']], - on = ['item_hhfa'], how='right', indicator=True, validate = "m:m") +scenario_availability_df = scenario_availability_df.merge(consumable_crosswalk_df[['item_code', 'item_hhfa_for_scenario_generation', 'hhfa_mapping_rationale_for_scenario_generation', 'consumable_name_tlo']], + left_on = ['item_hhfa'], right_on = ['item_hhfa_for_scenario_generation'], how='right', indicator=True, validate = "m:m") scenario_availability_df = scenario_availability_df.drop_duplicates(['Facility_ID', 'item_code']) scenario_availability_df = scenario_availability_df.rename({'_merge': 'merge_itemcode'}, axis=1) print("Number of item codes from the TLO model for which no match was found in the regression-based scenario data = ", scenario_availability_df.merge_itemcode.value_counts()[1]) @@ -172,29 +172,16 @@ # Extract list of TLO consumables which weren't matched with the availability prediction dataframe items_not_matched = scenario_availability_df['merge_itemcode'] == 'right_only' -scenario_availability_df[items_not_matched][['item_code', 'regression_application']].to_csv(outputfilepath / 'temp_items_not_matched.csv') # Get average availability_change_prop value by facility_ID and category_tlo -scenario_availability_df = scenario_availability_df.merge(programs[['category', 'item_code']], +scenario_availability_df = scenario_availability_df.merge(program_item_mapping, on = ['item_code'], validate = "m:1", how = "left") -# check that all consumables have a category assigned to them -map_items_with_missing_category_to_category= {77:'reproductive_health', -301:'alri', -63: 'neonatal_health', -258: 'cancer', -1735: 'general'} - -# Update the category column based on item_code -scenario_availability_df['category'] = scenario_availability_df.apply(lambda row: map_items_with_missing_category_to_category[row['item_code']] - if row['item_code'] in map_items_with_missing_category_to_category - else row['category'], axis=1) - # 1.3 Initial interpolation #------------------------------------------------------ # 1.3.1 Items not relevant to the regression analysis -items_not_relevant_to_regression = (items_not_matched) & (scenario_availability_df['regression_application'] == 'not relevant to logistic regression analysis') +items_not_relevant_to_regression = (items_not_matched) & (scenario_availability_df['hhfa_mapping_rationale_for_scenario_generation'] == 'not relevant to logistic regression analysis') # For category 3, replace availability_change_prop with 1, since we assume that the system-level intervention does not change availability list_of_scenario_variables = ['change_proportion_scenario1', 'change_proportion_scenario2', 'change_proportion_scenario3', 'change_proportion_scenario4', 'change_proportion_scenario5'] @@ -222,30 +209,34 @@ ratio_of_change_across_districts_1b_to_1a.reset_index(drop = True) # TODO check if this ratio should be of the proportions minus 1 -# Use the above for those districts with no level 1b facilities recorded in the HHFA data +# For districts with no level 1b data in the HHFA, use the ratio of change in level 1b facilities to level 1a facilities to generate the expected proportional change in availability +# for level 1b facilities in those districts scenario_availability_df = 
scenario_availability_df.reset_index(drop = True)
-cond_1b_missing_districts = scenario_availability_df.District.isin(districts_with_no_scenario_data_for_1b_only)
+cond_districts_with_1b_missing = scenario_availability_df.District.isin(districts_with_no_scenario_data_for_1b_only)
 cond_1a = scenario_availability_df.Facility_Level == '1a'
 cond_1b = scenario_availability_df.Facility_Level == '1b'
-df_1a = scenario_availability_df[cond_1b_missing_districts & cond_1a]
+df_1a = scenario_availability_df[cond_districts_with_1b_missing & cond_1a]

-ratio_vars = ['ratio_' + item for item in list_of_scenario_variables]
+ratio_vars = ['ratio_' + item for item in list_of_scenario_variables] # create columns to represent the ratio of change in 1b facilities to level 1a facilities
 item_var = ['item_code']
+
+# First merge the dataframe with changes at level 1a with the ratio of 1b to 1a
 df_missing_1b_imputed = df_1a.merge(ratio_of_change_across_districts_1b_to_1a[item_var + ratio_vars],
-                                    on = ['item_code'],
+                                    on = item_var,
                                     how = 'left', validate = "m:1")

+# Then multiply the ratio of 1b to 1a with the change at level 1a to get the expected change at level 1b
 for var in list_of_scenario_variables:
     df_missing_1b_imputed[var] = df_missing_1b_imputed[var] * df_missing_1b_imputed['ratio_' + var]
-
+# Update columns so the dataframe in fact refers to level 1b facilities
 df_missing_1b_imputed.Facility_Level = '1b' # Update facility level to 1

 # Replace Facility_IDs
 df_missing_1b_imputed = df_missing_1b_imputed.drop('Facility_ID', axis = 1).merge(mfl[['District', 'Facility_Level', 'Facility_ID']],
                                                                                   on =['District', 'Facility_Level'],
                                                                                   how = 'left')
-
-df_without_districts_with_no_1b_facilities = scenario_availability_df[~(cond_1b_missing_districts & cond_1b)]
+# Append the new imputed level 1b dataframe to the original dataframe
+df_without_districts_with_no_1b_facilities = scenario_availability_df[~(cond_districts_with_1b_missing & cond_1b)]
 scenario_availability_df = pd.concat([df_without_districts_with_no_1b_facilities, df_missing_1b_imputed], ignore_index = True)

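# A worked toy example of the ratio-based imputation above, with made-up numbers: if a
# scenario changes level 1a availability of an item in a district by a factor of 1.20, and
# the average ratio of the level 1b change to the level 1a change for that item (taken
# across districts that report both levels) is 0.90, the imputed level 1b change is
# 1.20 * 0.90 = 1.08. All names and values below are illustrative only.
import pandas as pd
change_1a = pd.Series({'item_A': 1.20, 'item_B': 1.05})
ratio_1b_to_1a = pd.Series({'item_A': 0.90, 'item_B': 1.10})
print(change_1a * ratio_1b_to_1a)  # item_A 1.080, item_B 1.155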
 # 2. Merge TLO model availability data with scenario data using crosswalk
@@ -260,7 +251,7 @@
 full_scenario_df = full_scenario_df.drop_duplicates(['Facility_ID', 'item_code', 'month'])

 # Check that level 1b values are currently imputed
-# full_scenario_df[full_scenario_df.District == 'Balaka'].groupby(['District', 'Facility_Level'])['change_proportion_scenario1'].mean()
+#full_scenario_df[full_scenario_df.District == 'Balaka'].groupby(['District', 'Facility_Level'])['change_proportion_scenario1'].mean()

 # 2.2 Further imputation
 #------------------------------------------------------
@@ -276,9 +267,9 @@
 # 2.3.1 Check that the merged dataframe has the same number of unique items, facility IDs, and total
 # number of rows as the original small availability resource file
 #---------------------------------------------------------------------------------------------------------
-assert(full_scenario_df.item_code.nunique() == tlo_availability_df.item_code.nunique())
-assert(full_scenario_df.Facility_ID.nunique() == tlo_availability_df.Facility_ID.nunique())
-assert(len(full_scenario_df) == len(tlo_availability_df))
+assert(full_scenario_df.item_code.nunique() == tlo_availability_df.item_code.nunique()) # the number of items in the new dataframe is the same as in the original availability dataframe
+assert(full_scenario_df.Facility_ID.nunique() == tlo_availability_df.Facility_ID.nunique()) # the number of Facility IDs in the new dataframe is the same as in the original availability dataframe
+assert(len(full_scenario_df) == len(tlo_availability_df)) # the number of rows in the new dataframe is the same as in the original availability dataframe

 # 2.3.2 Construct dataset that conforms to the principles expected by the simulation: i.e. that there is an entry for every
 # facility_id and for every month for every item_code.
@@ -305,8 +296,8 @@
 facilities_by_level[row['Facility_Level']].add(row['Facility_ID'])

 items_by_category = defaultdict(set)
-for ix, row in programs.iterrows():
-    items_by_category[row['category']].add(row['item_code'])
+for ix, row in program_item_mapping.iterrows():
+    items_by_category[row['item_category']].add(row['item_code'])

 def get_other_facilities_of_same_level(_fac_id):
     """Return a set of facility_id for other facilities that are of the same level as that provided."""
@@ -382,11 +373,9 @@ def interpolate_missing_with_mean(_ser):
             )

         else:
-            # If it is not recorded at other facilities of same level, then assume it is never available at the
-            # facility.
+            # If it is not recorded at other facilities of same level, then assume that there is no change
            print("No interpolation worked")
            _monthly_records = _monthly_records.fillna(1.0)
-# TODO this should be available_prop

 # Insert values (including corrections) into the resulting dataset.
 full_set_interpolated.loc[(fac, slice(None), item), col] = _monthly_records.values
@@ -407,7 +396,7 @@ def interpolate_missing_with_mean(_ser):
     full_set_interpolated.loc[availability_greater_than_1, 'available_prop_' + scenario] = 1

     assert(sum(full_set_interpolated['available_prop_' + scenario].isna()) ==
-           sum(full_set_interpolated['change_proportion_' + scenario].isna()))
+           sum(full_set_interpolated['change_proportion_' + scenario].isna())) # make sure that there is an entry for every row in which there was previously data

 # 4.
Generate best performing facility-based scenario data on consumable availablity #********************************************************************************************* @@ -516,7 +505,7 @@ def interpolate_missing_with_mean(_ser): # 8. Plot new availability estimates by scenario #********************************************************************************************* # Creating the line plot with ggplot -df_for_plots = full_df_with_scenario.merge(programs[['category', 'item_code']], on = 'item_code', how = "left", validate = "m:1") +df_for_plots = full_df_with_scenario.merge(programs[['item_category', 'item_code']], on = 'item_code', how = "left", validate = "m:1") df_for_plots = df_for_plots.merge(mfl[['Facility_ID', 'Facility_Level']], on = 'Facility_ID', how = 'left', validate = "m:1") def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): df_for_line_plot = _df.groupby([_x_axis_var])[['available_prop'] + final_list_of_scenario_vars].mean() @@ -535,7 +524,7 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): if not os.path.exists(directory): os.makedirs(directory) plot.save(filename= directory / _filename, dpi=300, width=10, height=8, units='in') -generate_barplot_of_scenarios(_df = df_for_plots, _x_axis_var = 'category', _filename = 'availability_by_category.png') +generate_barplot_of_scenarios(_df = df_for_plots, _x_axis_var = 'item_category', _filename = 'availability_by_category.png') generate_barplot_of_scenarios(_df = df_for_plots, _x_axis_var = 'Facility_Level', _filename = 'availability_by_level.png') # Scenario on the X axis, level on the Y axis @@ -547,7 +536,7 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): #------------------------------------------------------ pivot_table = pd.pivot_table(scenario_availability_df, values=list_of_scenario_variables, - index=['category'], + index=['item_category'], columns=['Facility_Level'], aggfunc=lambda x: sum(pd.isna(x))/len(x)*100) pivot_table.to_csv(outputfilepath / "temp.csv") From 1b63b3072b8b5b525f09801a3e82d2172e5dcb6f Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Fri, 6 Sep 2024 19:57:46 +0100 Subject: [PATCH 064/100] add scenario 9 (level 2 increased to 99th percentile) --- ...rceFile_Consumables_availability_small.csv | 4 +- ...ilability_scenarios_for_impact_analysis.py | 52 ++++++++++++++----- src/tlo/methods/consumables.py | 3 +- 3 files changed, 44 insertions(+), 15 deletions(-) diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv index ef72473491..0ca97619ac 100644 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c495020b77bf7fcb715ca4253759497c6a7ea84040ef8b262025634d5062263 -size 10070558 +oid sha256:e1f9d8d7f72e3d1a960a3e16e5207f6baf7d57347784023d6c3a880293199a69 +size 37424822 diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index 5aef416f34..8434d5b293 100644 --- 
a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -398,12 +398,12 @@ def interpolate_missing_with_mean(_ser): assert(sum(full_set_interpolated['available_prop_' + scenario].isna()) == sum(full_set_interpolated['change_proportion_' + scenario].isna())) # make sure that there is an entry for every row in which there was previously data -# 4. Generate best performing facility-based scenario data on consumable availablity -#********************************************************************************************* +# 4. Generate best performing facility-based scenario data on consumable availability +#*************************************************************************************** df = full_set_interpolated.reset_index().copy() # Try updating the avaiability to represent the 75th percentile by consumable -facility_levels = ['1a', '1b'] +facility_levels = ['1a', '1b', '2'] target_percentiles = [75, 90, 99] best_performing_facilities = {} @@ -440,8 +440,9 @@ def interpolate_missing_with_mean(_ser): # Obtain the updated availability estimates for level 1a for scenarios 6-8 updated_availability_1a = df[['item_code', 'month']].drop_duplicates() updated_availability_1b = df[['item_code', 'month']].drop_duplicates() +updated_availability_2 = df[['item_code', 'month']].drop_duplicates() temporary_df = pd.DataFrame([]) -availability_dataframes = [updated_availability_1a, updated_availability_1b] +availability_dataframes = [updated_availability_1a, updated_availability_1b, updated_availability_2] i = 6 # start scenario counter j = 0 # start level counter @@ -468,29 +469,57 @@ def interpolate_missing_with_mean(_ser): j = j + 1 # move to the next level # Merge the above scenario data to the full availability scenario dataframe -# Scenario 6-8 availability data for level 1a +# 75, 90 and 99th percentile availability data for level 1a df_new_1a = df[df['Facility_ID'].isin(facilities_by_level['1a'])].merge(availability_dataframes[0],on = ['item_code', 'month'], how = 'left', validate = "m:1") -# Scenario 6-8 availability data for level 1b +# 75, 90 and 99th percentile availability data for level 1b df_new_1b = df[df['Facility_ID'].isin(facilities_by_level['1b'])].merge(availability_dataframes[1],on = ['item_code', 'month'], how = 'left', validate = "m:1") +# 75, 90 and 99th percentile availability data for level 2 +df_new_2 = df[df['Facility_ID'].isin(facilities_by_level['2'])].merge(availability_dataframes[2],on = ['item_code', 'month'], + how = 'left', + validate = "m:1") + +# Generate scenarios 6-8 +#------------------------ +# scenario 6: only levels 1a and 1b changed to availability at 75th percentile for the corresponding level +# scenario 7: only levels 1a and 1b changed to availability at 90th percentile for the corresponding level +# scenario 8: only levels 1a and 1b changed to availability at 99th percentile for the corresponding level # Scenario 6-8 availability data for other levels df_new_otherlevels = df[~df['Facility_ID'].isin(facilities_by_level['1a']|facilities_by_level['1b'])] new_scenario_columns = ['available_prop_scenario6', 'available_prop_scenario7', 'available_prop_scenario8'] for col in new_scenario_columns: df_new_otherlevels[col] = df_new_otherlevels['available_prop'] +# Append the above dataframes 
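# (Note: df_new_2, the level-2 percentile frame built above, is deliberately left out of the
# scenario 6-8 concat below; scenarios 6-8 only alter levels 1a and 1b, and df_new_2 is drawn
# on further down for scenario 9, which also raises level 2 to its 99th-percentile availability.)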
+df_new_scenarios6to8 = pd.concat([df_new_1a, df_new_1b, df_new_otherlevels], ignore_index = True) + +# Generate scenario 9 +#------------------------ +# scenario 9: levels 1a, 1b and 2 changed to availability at 99th percentile for the corresponding level +df_new_otherlevels = df_new_scenarios6to8[~df_new_scenarios6to8['Facility_ID'].isin(facilities_by_level['1a']|facilities_by_level['1b']|facilities_by_level['2'])].reset_index(drop = True) +df_new_1a_scenario9 = df_new_scenarios6to8[df_new_scenarios6to8['Facility_ID'].isin(facilities_by_level['1a'])].reset_index(drop = True) +df_new_1b_scenario9 = df_new_scenarios6to8[df_new_scenarios6to8['Facility_ID'].isin(facilities_by_level['1b'])].reset_index(drop = True) +df_new_2_scenario9 = df_new_2[df_new_2['Facility_ID'].isin(facilities_by_level['2'])].reset_index(drop = True) +new_scenario_columns = ['available_prop_scenario9'] +for col in new_scenario_columns: + df_new_otherlevels[col] = df_new_otherlevels['available_prop'] + df_new_1a_scenario9[col] = df_new_1a_scenario9['available_prop_scenario8'] + df_new_1b_scenario9[col] = df_new_1b_scenario9['available_prop_scenario8'] + df_new_2_scenario9[col] = df_new_2_scenario9['available_prop_scenario8'] # Append the above dataframes -df_new = pd.concat([df_new_1a, df_new_1b, df_new_otherlevels], ignore_index = True) +df_new_scenarios9 = pd.concat([df_new_1a_scenario9, df_new_1b_scenario9, df_new_2_scenario9, df_new_otherlevels], ignore_index = True) # Save dataframe #------------------------------------------------------ -list_of_scenario_suffixes = list_of_scenario_suffixes + ['scenario6', 'scenario7', 'scenario8'] +list_of_scenario_suffixes = list_of_scenario_suffixes + ['scenario6', 'scenario7', 'scenario8', 'scenario9'] final_list_of_scenario_vars = ['available_prop_' + item for item in list_of_scenario_suffixes] -old_vars = ['Facility_ID', 'month', 'item_code', 'available_prop'] -full_df_with_scenario = df_new[old_vars + final_list_of_scenario_vars].reset_index().drop('index', axis = 1) +old_vars = ['Facility_ID', 'month', 'item_code', 'available_prop', 'item_category'] +full_df_with_scenario = df_new_scenarios6to8[old_vars + [col for col in final_list_of_scenario_vars if col != 'available_prop_scenario9']].reset_index().drop('index', axis = 1) +full_df_with_scenario = full_df_with_scenario.merge(df_new_scenarios9[old_vars + ['available_prop_scenario9']], on = old_vars, how = 'left', validate = "1:1") +full_df_with_scenario = full_df_with_scenario.merge(program_item_mapping, on = 'item_code', validate = 'm:1', how = 'left') # --- Check that the exported file has the properties required of it by the model code. --- # check_format_of_consumables_file(df=full_df_with_scenario, fac_ids=fac_ids) @@ -505,8 +534,7 @@ def interpolate_missing_with_mean(_ser): # 8. 
Plot new availability estimates by scenario #********************************************************************************************* # Creating the line plot with ggplot -df_for_plots = full_df_with_scenario.merge(programs[['item_category', 'item_code']], on = 'item_code', how = "left", validate = "m:1") -df_for_plots = df_for_plots.merge(mfl[['Facility_ID', 'Facility_Level']], on = 'Facility_ID', how = 'left', validate = "m:1") +df_for_plots = full_df_with_scenario.merge(mfl[['Facility_ID', 'Facility_Level']], on = 'Facility_ID', how = 'left', validate = "m:1") def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): df_for_line_plot = _df.groupby([_x_axis_var])[['available_prop'] + final_list_of_scenario_vars].mean() df_for_line_plot = df_for_line_plot.reset_index().melt(id_vars=[_x_axis_var], value_vars=['available_prop'] + final_list_of_scenario_vars, diff --git a/src/tlo/methods/consumables.py b/src/tlo/methods/consumables.py index ed19694cc2..9834a2d486 100644 --- a/src/tlo/methods/consumables.py +++ b/src/tlo/methods/consumables.py @@ -352,7 +352,8 @@ def check_format_of_consumables_file(df, fac_ids): availability_columns = ['available_prop', 'available_prop_scenario1', 'available_prop_scenario2', 'available_prop_scenario3', 'available_prop_scenario4', 'available_prop_scenario5', - 'available_prop_scenario6', 'available_prop_scenario7', 'available_prop_scenario8'] + 'available_prop_scenario6', 'available_prop_scenario7', 'available_prop_scenario8', + 'available_prop_scenario9'] assert set(df.columns) == {'Facility_ID', 'month', 'item_code', 'item_category'} | set(availability_columns) From 6488940b29b2c9151e34f3fc312129d1d4f6fd2a Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Tue, 10 Sep 2024 17:37:51 +0100 Subject: [PATCH 065/100] add parallel supply chain scenarios - scenario 10 - availability increased to that of HIV - scenario 11 - availability increased to that of EPI - scenario 12 - availability of HIV decreased to that of other --- ...rceFile_Consumables_availability_small.csv | 4 +- ...ilability_scenarios_for_impact_analysis.py | 59 ++++++++++++++++--- src/tlo/methods/consumables.py | 7 +-- 3 files changed, 57 insertions(+), 13 deletions(-) diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv index 0ca97619ac..b0f7d9ea22 100644 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e1f9d8d7f72e3d1a960a3e16e5207f6baf7d57347784023d6c3a880293199a69 -size 37424822 +oid sha256:530071e40f2b123037506eff673e036adac51bc124f579d942b8820896d17f43 +size 49336553 diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index 8434d5b293..61e321d05f 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -435,7 +435,6 @@ 
def interpolate_missing_with_mean(_ser):
         best_performing_facilities[level][item][str(target_perc) + 'th percentile'] = closest_row['Facility_ID']
 print("Reference facilities at each level for each item: ", best_performing_facilities)
-# TODO Flip the nesting order above for percentile to go before item?
 # Obtain the updated availability estimates for level 1a for scenarios 6-8
 updated_availability_1a = df[['item_code', 'month']].drop_duplicates()

+# 6. Generate scenarios based on the performance of vertical programs
+#***************************************************************************************
+cond_levels1a1b = (tlo_availability_df.Facility_Level == '1a') |(tlo_availability_df.Facility_Level == '1b')
+cond_hiv = tlo_availability_df.item_category == 'hiv'
+cond_epi = tlo_availability_df.item_category == 'epi'
+#cond_not_hivorepi = (tlo_availability_df.item_category != 'hiv') & (tlo_availability_df.item_category != 'epi')
+nonhivepi_availability_df = tlo_availability_df[(~cond_hiv & ~cond_epi) & cond_levels1a1b]
+hivepi_availability_df = tlo_availability_df[(cond_hiv| cond_epi) & cond_levels1a1b]
+irrelevant_levels_availability_df = tlo_availability_df[~cond_levels1a1b]
+
+hiv_availability_df = tlo_availability_df[cond_hiv & cond_levels1a1b].groupby(['Facility_ID', 'month', 'item_category'])['available_prop'].mean().reset_index()
+hiv_availability_df = hiv_availability_df.rename(columns = {'available_prop': 'available_prop_scenario10'})
+hivepi_availability_df['available_prop_scenario10'] = hivepi_availability_df['available_prop']
+irrelevant_levels_availability_df['available_prop_scenario10'] = irrelevant_levels_availability_df['available_prop']
+minimum_scenario_varlist = ['Facility_ID', 'month', 'item_code', 'available_prop_scenario10']
+hiv_scenario_df = nonhivepi_availability_df.merge(hiv_availability_df, on = ['Facility_ID', 'month'] , how = 'left', validate = 'm:1')
+hiv_scenario_df = pd.concat([hiv_scenario_df[minimum_scenario_varlist], hivepi_availability_df[minimum_scenario_varlist], irrelevant_levels_availability_df[minimum_scenario_varlist]], ignore_index = True)
+
+epi_availability_df = tlo_availability_df[cond_epi].groupby(['Facility_ID', 'month', 'item_category'])['available_prop'].mean().reset_index()
+epi_availability_df = epi_availability_df.rename(columns = {'available_prop': 'available_prop_scenario11'})
+hivepi_availability_df['available_prop_scenario11'] = hivepi_availability_df['available_prop']
+irrelevant_levels_availability_df['available_prop_scenario11'] = irrelevant_levels_availability_df['available_prop']
+minimum_scenario_varlist = ['Facility_ID', 'month', 'item_code', 'available_prop_scenario11']
+epi_scenario_df = nonhivepi_availability_df.merge(epi_availability_df, on = ['Facility_ID', 'month'] , how = 'left', validate = 'm:1')
+epi_scenario_df = pd.concat([epi_scenario_df[minimum_scenario_varlist], hivepi_availability_df[minimum_scenario_varlist], irrelevant_levels_availability_df[minimum_scenario_varlist]], ignore_index = True)
+
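# A compact cross-check of the scenario-10 construction above, on a toy frame (all names
# and numbers illustrative; in the script this is restricted to levels 1a/1b): each
# facility-month's mean availability across 'hiv' items becomes the scenario-10
# availability of every non-HIV/EPI item at that facility-month.
import pandas as pd
toy = pd.DataFrame({'Facility_ID': [1, 1], 'month': [1, 1],
                    'item_category': ['hiv', 'ncds'], 'available_prop': [0.9, 0.4]})
hiv_mean = (toy[toy.item_category == 'hiv']
            .groupby(['Facility_ID', 'month'])['available_prop'].mean()
            .rename('available_prop_scenario10').reset_index())
print(toy.merge(hiv_mean, on=['Facility_ID', 'month'], how='left'))
# the 'ncds' row inherits 0.9 as its scenario-10 availability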
+# 7. Generate a scenario to represent HIV availability falling to that of other programs
+#***************************************************************************************
+nonhivepi_availability_average = tlo_availability_df[(~cond_hiv & ~cond_epi)].groupby(['Facility_ID', 'month'])['available_prop'].mean().reset_index()
+nonhiv_availability_df = tlo_availability_df[~cond_hiv]
+non_vertical_hiv_availability_df = tlo_availability_df[cond_hiv]
+nonhivepi_availability_average = nonhivepi_availability_average.rename(columns = {'available_prop':'available_prop_scenario12'})
+nonhiv_availability_df['available_prop_scenario12'] = nonhiv_availability_df['available_prop']
+non_vertical_hiv_availability_df = non_vertical_hiv_availability_df.merge(nonhivepi_availability_average, on = ['Facility_ID', 'month'], how = 'left', validate = 'm:1')
+minimum_scenario_varlist = ['Facility_ID', 'month', 'item_code', 'available_prop_scenario12']
+non_vertical_hiv_scenario_df = pd.concat([non_vertical_hiv_availability_df[minimum_scenario_varlist], nonhiv_availability_df[minimum_scenario_varlist]], ignore_index = True)
+
+# Add scenarios 6 to 12 to the original dataframe
 #------------------------------------------------------
-list_of_scenario_suffixes = list_of_scenario_suffixes + ['scenario6', 'scenario7', 'scenario8', 'scenario9']
-final_list_of_scenario_vars = ['available_prop_' + item for item in list_of_scenario_suffixes]
-old_vars = ['Facility_ID', 'month', 'item_code', 'available_prop', 'item_category']
-full_df_with_scenario = df_new_scenarios6to8[old_vars + [col for col in final_list_of_scenario_vars if col != 'available_prop_scenario9']].reset_index().drop('index', axis = 1)
-full_df_with_scenario = full_df_with_scenario.merge(df_new_scenarios9[old_vars + ['available_prop_scenario9']], on = old_vars, how = 'left', validate = "1:1")
+list_of_scenario_suffixes_first_stage = list_of_scenario_suffixes + ['scenario6', 'scenario7', 'scenario8', 'scenario9']
+list_of_scenario_variables_first_stage = ['available_prop_' + item for item in list_of_scenario_suffixes_first_stage]
+old_vars = ['Facility_ID', 'month', 'item_code']
+full_df_with_scenario = df_new_scenarios6to8[old_vars + ['available_prop'] + [col for col in list_of_scenario_variables_first_stage if col != 'available_prop_scenario9']].reset_index().drop('index', axis = 1)
+full_df_with_scenario = full_df_with_scenario.merge(df_new_scenarios9[old_vars + ['available_prop_scenario9']], on = old_vars, how = 'left', validate = "1:1")
+
+list_of_scenario_suffixes_second_stage = list_of_scenario_suffixes_first_stage + ['scenario10', 'scenario11', 'scenario12']
+final_list_of_scenario_vars = ['available_prop_' + item for item in list_of_scenario_suffixes_second_stage]
+full_df_with_scenario = full_df_with_scenario.merge(hiv_scenario_df[old_vars + ['available_prop_scenario10']], on = old_vars, how = 'left', validate = "1:1")
+full_df_with_scenario = full_df_with_scenario.merge(epi_scenario_df[old_vars + ['available_prop_scenario11']], on = old_vars, how = 'left', validate = "1:1")
+full_df_with_scenario = full_df_with_scenario.merge(non_vertical_hiv_scenario_df[old_vars + ['available_prop_scenario12']], on = old_vars, how = 'left', validate = "1:1")
+
 full_df_with_scenario = full_df_with_scenario.merge(program_item_mapping, on = 'item_code', validate = 'm:1', how = 'left')

 # --- Check that the exported file has the properties required of it by the model code.
--- #
@@ -530,6 +574,7 @@ def interpolate_missing_with_mean(_ser):
     index=False
 )
 # TODO: Create a column providing the source of scenario data
+# TODO: 3 more scenarios where availability is equated to HIV and EPI, HIV availability is equated to general (excluding EPI)
 
 # 8. Plot new availability estimates by scenario
 #*********************************************************************************************
@@ -557,7 +602,7 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename):
 
 # Scenario on the X axis, level on the Y axis
 # Scenario on the X axis, program on the Y axis
-# TODO add heat maps
+# TODO add heat maps i. heatmap by item_category across the scenarios
 
 '''
 # 2.3.2. Browse missingness in the availability_change_prop variable
diff --git a/src/tlo/methods/consumables.py b/src/tlo/methods/consumables.py
index 9834a2d486..ef8065a1d1 100644
--- a/src/tlo/methods/consumables.py
+++ b/src/tlo/methods/consumables.py
@@ -349,11 +349,10 @@ def check_format_of_consumables_file(df, fac_ids):
     """Check that we have a complete set of estimates, for every region & facility_type, as defined in the model."""
     months = set(range(1, 13))
     item_codes = set(df.item_code.unique())
+    number_of_scenarios = 12
 
-    availability_columns = ['available_prop', 'available_prop_scenario1', 'available_prop_scenario2',
-                            'available_prop_scenario3', 'available_prop_scenario4', 'available_prop_scenario5',
-                            'available_prop_scenario6', 'available_prop_scenario7', 'available_prop_scenario8',
-                            'available_prop_scenario9']
+    availability_columns = ['available_prop'] + [f'available_prop_scenario{i}' for i in
+                                                 range(1, number_of_scenarios + 1)]
 
     assert set(df.columns) == {'Facility_ID', 'month', 'item_code', 'item_category'} | set(availability_columns)

From b4dedea863badfc41a9754ed22a71b2f3f4e22f5 Mon Sep 17 00:00:00 2001
From: sakshimohan
Date: Tue, 10 Sep 2024 20:14:35 +0100
Subject: [PATCH 066/100] add heatmaps to represent scenarios

---
 ...ilability_scenarios_for_impact_analysis.py | 86 +++++++++++++++++--
 1 file changed, 80 insertions(+), 6 deletions(-)

diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py
index 61e321d05f..e32a61ea38 100644
--- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py
+++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py
@@ -27,6 +27,7 @@
 import matplotlib.pyplot as plt
 from plotnine import * # ggplot, aes, geom_point for ggplots from R
+import seaborn as sns
 import numpy as np
 import pandas as pd
 
@@ -574,10 +575,14 @@ def interpolate_missing_with_mean(_ser):
     index=False
 )
 # TODO: Create a column providing the source of scenario data
-# TODO: 3 more scenarios where availability is equated to HIV and EPI, HIV availability is equated to general (excluding EPI)
 
 # 8. 
Plot new availability estimates by scenario #********************************************************************************************* +# Create the directory if it doesn't exist +figurespath = outputfilepath / 'consumable_scenario_analysis' +if not os.path.exists(figurespath): + os.makedirs(figurespath) + # Creating the line plot with ggplot df_for_plots = full_df_with_scenario.merge(mfl[['Facility_ID', 'Facility_Level']], on = 'Facility_ID', how = 'left', validate = "m:1") def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): @@ -592,14 +597,83 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): y='Probability of availability') + theme(axis_text_x=element_text(angle=45, hjust=1)) ) - # Create the directory if it doesn't exist - directory = outputfilepath / 'consumable_scenario_analysis' - if not os.path.exists(directory): - os.makedirs(directory) - plot.save(filename= directory / _filename, dpi=300, width=10, height=8, units='in') + + plot.save(filename= figurespath / _filename, dpi=300, width=10, height=8, units='in') generate_barplot_of_scenarios(_df = df_for_plots, _x_axis_var = 'item_category', _filename = 'availability_by_category.png') generate_barplot_of_scenarios(_df = df_for_plots, _x_axis_var = 'Facility_Level', _filename = 'availability_by_level.png') +# Create heatmaps by Facility_Level of average availability by item_category across chosen scenarios +chosen_scenarios_for_heatmap = +number_of_scenarios = 12 +availability_columns = ['available_prop'] + [f'available_prop_scenario{i}' for i in + range(1, number_of_scenarios + 1)] + +for level in fac_levels: + # Generate a heatmap + # Pivot the DataFrame + aggregated_df = df_for_plots.groupby(['item_category', 'Facility_Level'])[availability_columns].mean().reset_index() + aggregated_df = aggregated_df[aggregated_df.Facility_Level.isin([level])] + heatmap_data = aggregated_df.set_index('item_category').drop(columns = 'Facility_Level') + + # Calculate the aggregate row and column + aggregate_col= aggregated_df[availability_columns].mean() + #overall_aggregate = aggregate_col.mean() + + # Add aggregate row and column + #heatmap_data['Average'] = aggregate_row + #aggregate_col['Average'] = overall_aggregate + heatmap_data.loc['Average'] = aggregate_col + + # Generate the heatmap + sns.set(font_scale=0.5) + plt.figure(figsize=(10, 8)) + sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) + + # Customize the plot + plt.title(f'Facility Level {level}') + plt.xlabel('Scenarios') + plt.ylabel(f'Disease/Public health \n program') + plt.xticks(rotation=90) + plt.yticks(rotation=0) + + plt.savefig(figurespath /f'consumable_availability_heatmap_{level}.png', dpi=300, bbox_inches='tight') + plt.show() + plt.close() + +# Create heatmap of average availability by Facility_Level across chosen scenarios +scenario_list = [1,2,3,6,7,8,9,10,11] +chosen_availability_columns = ['available_prop'] + [f'available_prop_scenario{i}' for i in + scenario_list] +# Pivot the DataFrame +aggregated_df = df_for_plots.groupby(['Facility_Level'])[chosen_availability_columns].mean().reset_index() +heatmap_data = aggregated_df.set_index('Facility_Level') + +# Calculate the aggregate row and column +aggregate_col= aggregated_df[chosen_availability_columns].mean() +#overall_aggregate = aggregate_col.mean() + +# Add aggregate row and column +#heatmap_data['Average'] = aggregate_row +#aggregate_col['Average'] = overall_aggregate +heatmap_data.loc['Average'] = 
aggregate_col + +# Generate the heatmap +sns.set(font_scale=0.5) +plt.figure(figsize=(10, 8)) +sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) + +# Customize the plot +plt.title(f'Availability across scenarios') +plt.xlabel('Scenarios') +plt.ylabel(f'Facility Level') +plt.xticks(rotation=90) +plt.yticks(rotation=0) + +plt.savefig(figurespath /f'consumable_availability_heatmap_alllevels.png', dpi=300, bbox_inches='tight') +plt.show() +plt.close() + + # Scenario on the X axis, level on the Y axis # Scenario on the X axis, program on the Y axis # TODO add heat maps i. heatmap by item_category across the sceanrios From a733a89b18ef37d5ea8ed2cd6980bf422206642a Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Tue, 10 Sep 2024 20:14:48 +0100 Subject: [PATCH 067/100] add scenarios to simulation file --- .../scenario_impact_of_consumable_scenarios.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py index bfa555f01f..a3bd700903 100644 --- a/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py +++ b/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py @@ -33,9 +33,9 @@ def __init__(self): self.seed = 99 self.start_date = Date(2010, 1, 1) self.end_date = Date(2019, 12, 31) - self.pop_size = 100_000 # large population size for final simulation - self.number_of_draws = 10 # <- 10 scenarios (10) - self.runs_per_draw = 5 # <- repeated this many times + self.pop_size = 5000 # large population size for final simulation - 100,000 + self.number_of_draws = 12 # <- 10 scenarios (10) + self.runs_per_draw = 3 # <- repeated this many times def log_configuration(self): return { @@ -56,8 +56,10 @@ def draw_parameters(self, draw_number, rng): return { 'HealthSystem': { 'cons_availability': ['default', - 'scenario1', 'scenario2', 'scenario3', 'scenario4', - 'scenario5', 'scenario6', 'scenario7', 'scenario8', + 'scenario1', 'scenario2', 'scenario3', + 'scenario6', 'scenario7', 'scenario8', + 'scenario9', 'scenario10', 'scenario11', + 'scenario12', 'all'][draw_number] } } From 2eda2d44662768ef956586695ed6bd83edb92f8c Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Tue, 10 Sep 2024 20:17:10 +0100 Subject: [PATCH 068/100] add new scenarios to consumables.py --- src/tlo/methods/consumables.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/tlo/methods/consumables.py b/src/tlo/methods/consumables.py index ef8065a1d1..00c065868b 100644 --- a/src/tlo/methods/consumables.py +++ b/src/tlo/methods/consumables.py @@ -48,6 +48,7 @@ def __init__(self, 'all_drug_or_vaccine_available', 'scenario1', 'scenario2', 'scenario3', 'scenario4', 'scenario5', 'scenario6', 'scenario7', 'scenario8', + 'scenario9', 'scenario10', 'scenario11', 'scenario12', } # Create internal items: @@ -106,7 +107,8 @@ def _update_prob_item_codes_available(self, availability: str): # Over-ride the data according to option for `availability` if availability in ('default', 'scenario1', 'scenario2', 'scenario3', 'scenario4', - 'scenario5', 'scenario6', 'scenario7', 'scenario8'): + 'scenario5', 'scenario6', 'scenario7', 'scenario8', + 'scenario9', 'scenario10', 'scenario11', 'scenario12'): pass elif availability == 'all': self.override_availability(dict(zip(self.item_codes, repeat(1.0)))) @@ -145,7 +147,8 @@ def 
_process_consumables_data(self, availability_data: pd.DataFrame, availabilit each consumable at each facility_id during each month. """ if availability in ('scenario1', 'scenario2', 'scenario3', 'scenario4', - 'scenario5', 'scenario6', 'scenario7', 'scenario8'): + 'scenario5', 'scenario6', 'scenario7', 'scenario8', + 'scenario9', 'scenario10', 'scenario11', 'scenario12'): return ( set(availability_data.item_code), availability_data.set_index(['month', 'Facility_ID', 'item_code'])['available_prop_' + availability] From 5f9b9c41f56e63c35c320ea85e87db19e33f3ef2 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Tue, 10 Sep 2024 20:19:42 +0100 Subject: [PATCH 069/100] add new scenarios to healthsystem.py --- src/tlo/methods/healthsystem.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/tlo/methods/healthsystem.py b/src/tlo/methods/healthsystem.py index ff12848efa..c3740df110 100644 --- a/src/tlo/methods/healthsystem.py +++ b/src/tlo/methods/healthsystem.py @@ -1048,7 +1048,9 @@ def update_consumables_availability_to_represent_merging_of_levels_1b_and_2(self availability_columns = ['available_prop', 'available_prop_scenario1', 'available_prop_scenario2', 'available_prop_scenario3', 'available_prop_scenario4', 'available_prop_scenario5', - 'available_prop_scenario6', 'available_prop_scenario7', 'available_prop_scenario8'] + 'available_prop_scenario6', 'available_prop_scenario7', 'available_prop_scenario8', + 'available_prop_scenario9', 'available_prop_scenario10', 'available_prop_scenario11', + 'available_prop_scenario12'] # compute the updated availability at the merged level '1b' and '2' availability_at_1b_and_2 = \ From 106e3221044123fd76c61dc1afb3019dd2926e4d Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Tue, 10 Sep 2024 20:22:19 +0100 Subject: [PATCH 070/100] [TEMPORARY] remove item_category --- ...ate_consumable_availability_scenarios_for_impact_analysis.py | 2 +- src/tlo/methods/consumables.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index e32a61ea38..b034015427 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -564,7 +564,7 @@ def interpolate_missing_with_mean(_ser): full_df_with_scenario = full_df_with_scenario.merge(epi_scenario_df[old_vars + ['available_prop_scenario11']], on = old_vars, how = 'left', validate = "1:1") full_df_with_scenario = full_df_with_scenario.merge(non_vertical_hiv_scenario_df[old_vars + ['available_prop_scenario12']], on = old_vars, how = 'left', validate = "1:1") -full_df_with_scenario = full_df_with_scenario.merge(program_item_mapping, on = 'item_code', validate = 'm:1', how = 'left') +#full_df_with_scenario = full_df_with_scenario.merge(program_item_mapping, on = 'item_code', validate = 'm:1', how = 'left') # --- Check that the exported file has the properties required of it by the model code. 
--- # check_format_of_consumables_file(df=full_df_with_scenario, fac_ids=fac_ids) diff --git a/src/tlo/methods/consumables.py b/src/tlo/methods/consumables.py index 00c065868b..8e6636ba07 100644 --- a/src/tlo/methods/consumables.py +++ b/src/tlo/methods/consumables.py @@ -357,7 +357,7 @@ def check_format_of_consumables_file(df, fac_ids): availability_columns = ['available_prop'] + [f'available_prop_scenario{i}' for i in range(1, number_of_scenarios + 1)] - assert set(df.columns) == {'Facility_ID', 'month', 'item_code', 'item_category'} | set(availability_columns) + assert set(df.columns) == {'Facility_ID', 'month', 'item_code'} | set(availability_columns) # Check that all permutations of Facility_ID, month and item_code are present pd.testing.assert_index_equal( From c8b1431b190fd9a0b5c9ffeeb6ca9cc25269a207 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Tue, 10 Sep 2024 20:30:36 +0100 Subject: [PATCH 071/100] Revert "Drop columns not needed from RF_items_and_packages" This reverts commit 09d8bb2b47da5dfc1377d649c1a23a2764e0940a. --- ...rceFile_Consumables_Items_and_Packages.csv | 4 +- ...ate_consumables_item_codes_and_packages.py | 72 +++++++++++++------ 2 files changed, 54 insertions(+), 22 deletions(-) diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_Items_and_Packages.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_Items_and_Packages.csv index f3589757c7..0ee403abb0 100644 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_Items_and_Packages.csv +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_Items_and_Packages.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:596a1bc8d570f341da180fea6db1836c181f6a2a984a9c7f9b4990b78df8e689 -size 215244 +oid sha256:4106c2e3ae068d40b115857885b673bec3e1114be5183c0a4ae0366560e2a5c9 +size 249391 diff --git a/src/scripts/data_file_processing/healthsystem/consumables/processing_data_from_one_health/generate_consumables_item_codes_and_packages.py b/src/scripts/data_file_processing/healthsystem/consumables/processing_data_from_one_health/generate_consumables_item_codes_and_packages.py index db6af01154..7ca04f763f 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/processing_data_from_one_health/generate_consumables_item_codes_and_packages.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/processing_data_from_one_health/generate_consumables_item_codes_and_packages.py @@ -19,20 +19,22 @@ import numpy as np import pandas as pd -# Set local shared folder source -path_to_share = Path( # <-- point to the shared folder - '/Users/sm2511/Library/CloudStorage/OneDrive-SharedLibraries-ImperialCollegeLondon/TLOModel - WP - Documents/' -) +# Set local Dropbox source +path_to_dropbox = Path( # <-- point to the TLO dropbox locally + # '/Users/tbh03/Dropbox (SPH Imperial College)/Thanzi la Onse Theme 1 SHARE' + '/Users/sm2511/Dropbox/Thanzi La Onse') resourcefilepath = Path("./resources") path_for_new_resourcefiles = resourcefilepath / "healthsystem/consumables" # EHP Consumables list -workingfile_ehp_consumables = path_to_share / \ +path_to_files_in_the_tlo_dropbox = path_to_dropbox / "05 - Resources/Module-healthsystem/consumables raw files/" + +workingfile_ehp_consumables = path_to_dropbox / \ "05 - Resources/Module-healthsystem/From Matthias Arnold/ORIGINAL_Intervention input.xlsx" -workingfile_one_health = path_to_share / \ +workingfile_one_health = path_to_dropbox / \ "07 - Data/OneHealth projection files/OneHealth commodities.xlsx" 
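For orientation before the hunks below: this revert reinstates the 'Expected_Units_Per_Case' and 'Unit_Cost' fields on every record, and the file's add_record helper enforces that each appended row carries exactly the dataframe's column set. The following is a minimal, self-contained sketch of that pattern, not the production code: the real dataframe may carry additional columns, and the 'Example item'/9999 entry is a hypothetical record used only for illustration.

from typing import Dict

import numpy as np
import pandas as pd


def add_record(df: pd.DataFrame, record: Dict) -> pd.DataFrame:
    """Append one consumable record, insisting on a complete, matching schema."""
    assert set(df.columns) == set(record.keys())
    return pd.concat([df, pd.DataFrame.from_records([record])], ignore_index=True)


# Seed with one record mirroring an entry restored in this revert
cons = pd.DataFrame([{
    'Intervention_Pkg': 'Misc', 'Intervention_Pkg_Code': -99,
    'Items': 'Cystoscope', 'Item_Code': 285,
    'Expected_Units_Per_Case': 1.0, 'Unit_Cost': np.nan,  # np.nan where no cost is known
}])

# A record missing 'Unit_Cost' would now fail the assert rather than
# silently introducing a ragged column set.
cons = add_record(cons, {
    'Intervention_Pkg': 'Misc', 'Intervention_Pkg_Code': -99,
    'Items': 'Example item', 'Item_Code': 9999,  # hypothetical values for illustration
    'Expected_Units_Per_Case': 1.0, 'Unit_Cost': 1.0,
})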
@@ -155,8 +157,7 @@ wb = wb.merge(intv_codes, on='Intervention_Pkg', how='left', indicator=True) assert (wb['_merge'] == 'both').all() -wb = wb.drop(columns=['_merge','Expected_Units_Per_Case', - 'Unit_Cost']) +wb = wb.drop(columns='_merge') # Assign a unique code for each individual consumable item unique_items = pd.unique(wb['Items']) @@ -171,7 +172,9 @@ 'Intervention_Pkg', 'Intervention_Pkg_Code', 'Items', - 'Item_Code']] + 'Item_Code', + 'Expected_Units_Per_Case', + 'Unit_Cost']] assert not pd.isnull(wb).any().any() @@ -212,7 +215,7 @@ only_in_oh['Intervention_Pkg'] = 'Misc' only_in_oh['Intervention_Pkg_Code'] = -99 only_in_oh['Item_Code'] = np.arange(1000, 1000 + len(only_in_oh)) -only_in_oh = only_in_oh.drop(columns = 'Unit_Cost') +only_in_oh['Expected_Units_Per_Case'] = 1.0 assert set(only_in_oh.columns) == set(wb.columns) @@ -246,6 +249,8 @@ def add_record(df: pd.DataFrame, record: Dict): assert set(df.columns) == set(record.keys()) return pd.concat([df, pd.DataFrame.from_records([record])], ignore_index=True) + + cons = add_record( cons, { @@ -253,7 +258,10 @@ def add_record(df: pd.DataFrame, record: Dict): 'Intervention_Pkg': "Misc", 'Intervention_Pkg_Code': -99, 'Items': "Forceps, obstetric", - 'Item_Code': 2669} + 'Item_Code': 2669, + 'Expected_Units_Per_Case': 1.0, + 'Unit_Cost': 1.0 + } ) cons = add_record( @@ -263,7 +271,10 @@ def add_record(df: pd.DataFrame, record: Dict): 'Intervention_Pkg': "Misc", 'Intervention_Pkg_Code': -99, 'Items': "Vacuum, obstetric", - 'Item_Code': 2670}, + 'Item_Code': 2670, + 'Expected_Units_Per_Case': 1.0, + 'Unit_Cost': 1.0 + }, ) cons = add_record( @@ -273,7 +284,10 @@ def add_record(df: pd.DataFrame, record: Dict): 'Intervention_Pkg': "Misc", 'Intervention_Pkg_Code': -99, 'Items': "First-line ART regimen: adult", - 'Item_Code': 2671}, + 'Item_Code': 2671, + 'Expected_Units_Per_Case': 1.0, + 'Unit_Cost': 1.0 + }, ) cons = add_record( @@ -283,7 +297,10 @@ def add_record(df: pd.DataFrame, record: Dict): 'Intervention_Pkg': "Misc", 'Intervention_Pkg_Code': -99, 'Items': "First line ART regimen: older child", - 'Item_Code': 2672}, + 'Item_Code': 2672, + 'Expected_Units_Per_Case': 1.0, + 'Unit_Cost': 1.0 + }, ) cons = add_record( @@ -293,7 +310,10 @@ def add_record(df: pd.DataFrame, record: Dict): 'Intervention_Pkg': "Misc", 'Intervention_Pkg_Code': -99, 'Items': "First line ART regimen: young child", - 'Item_Code': 2673}, + 'Item_Code': 2673, + 'Expected_Units_Per_Case': 1.0, + 'Unit_Cost': 1.0 + }, ) cons = add_record( @@ -303,7 +323,10 @@ def add_record(df: pd.DataFrame, record: Dict): 'Intervention_Pkg': "Misc", 'Intervention_Pkg_Code': -99, 'Items': "Pre-exposure prophlaxis for HIV", - 'Item_Code': 2674}, + 'Item_Code': 2674, + 'Expected_Units_Per_Case': 1.0, + 'Unit_Cost': 1.0 + }, ) cons = add_record( @@ -313,7 +336,10 @@ def add_record(df: pd.DataFrame, record: Dict): 'Intervention_Pkg': "Isoniazid preventative therapy for HIV+ no TB", 'Intervention_Pkg_Code': 82, 'Items': "Isoniazid/Rifapentine", - 'Item_Code': 2678}, + 'Item_Code': 2678, + 'Expected_Units_Per_Case': 1.0, + 'Unit_Cost': 1.0 + }, ) cons = add_record( @@ -323,7 +349,9 @@ def add_record(df: pd.DataFrame, record: Dict): 'Intervention_Pkg': "Misc", 'Intervention_Pkg_Code': -99, 'Items': "Cystoscope", - 'Item_Code': 285}, + 'Item_Code': 285, + 'Expected_Units_Per_Case': 1.0, + 'Unit_Cost': np.nan}, ) cons = add_record( @@ -332,7 +360,9 @@ def add_record(df: pd.DataFrame, record: Dict): 'Intervention_Pkg': "Misc", 'Intervention_Pkg_Code': -99, 'Items': "Endoscope", - 'Item_Code': 
280}, + 'Item_Code': 280, + 'Expected_Units_Per_Case': 1.0, + 'Unit_Cost': np.nan}, ) cons = add_record( @@ -341,7 +371,9 @@ def add_record(df: pd.DataFrame, record: Dict): 'Intervention_Pkg': "Misc", 'Intervention_Pkg_Code': -99, 'Items': "Prostate specific antigen test", - 'Item_Code': 281}, + 'Item_Code': 281, + 'Expected_Units_Per_Case': 1.0, + 'Unit_Cost': np.nan}, ) From c76624a9adf9ab0a6d8db7a1374c25e47f087f3c Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Wed, 11 Sep 2024 19:38:08 +0100 Subject: [PATCH 072/100] update scenario for local run --- .../scenario_impact_of_consumable_scenarios.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py index a3bd700903..742fb3a794 100644 --- a/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py +++ b/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py @@ -32,10 +32,10 @@ def __init__(self): super().__init__() self.seed = 99 self.start_date = Date(2010, 1, 1) - self.end_date = Date(2019, 12, 31) - self.pop_size = 5000 # large population size for final simulation - 100,000 - self.number_of_draws = 12 # <- 10 scenarios (10) - self.runs_per_draw = 3 # <- repeated this many times + self.end_date = Date(2012, 12, 31) + self.pop_size = 1000 # large population size for final simulation - 100,000 + self.number_of_draws = 12 # <- 12 scenarios + self.runs_per_draw = 1 # <- repeated this many times def log_configuration(self): return { From 208fc4e74125866ef3c38c90ae433f9d8ec30a3e Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Thu, 12 Sep 2024 16:16:59 +0100 Subject: [PATCH 073/100] Remove item_category from RF_small and move to RF_designations + update check format to accommodate versions of the RF without all availability columns This makes sure that the `check_format_of_consumables_file` function works for the dataframe created by consumable_availability_estimation.py and not just generate_consumable_availability_scenarios_for_impact_analysis.py --- .../ResourceFile_Consumables_Item_Designations.csv | 4 ++-- .../ResourceFile_Consumables_availability_small.csv | 4 ++-- .../consumables_availability_estimation.py | 9 ++++++++- src/tlo/methods/consumables.py | 8 +++++--- 4 files changed, 17 insertions(+), 8 deletions(-) diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_Item_Designations.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_Item_Designations.csv index ace9042583..15f536d3f4 100644 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_Item_Designations.csv +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_Item_Designations.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c953a6136e61fe55181e477e6c1425ec010ac55b180351f0f466da2c2eb0d379 -size 67854 +oid sha256:79aafe10ade632b4eb487fefa572093a63c39b5cb56322c31619958242e2db8c +size 72269 diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv index b0f7d9ea22..2e745530ac 100644 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:530071e40f2b123037506eff673e036adac51bc124f579d942b8820896d17f43 -size 49336553 +oid sha256:746e8bbdb48854349998b671536d9eed86862d824f50de28fd0242c006993c55 +size 6087715 diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py index b779e571df..9f0c71f22c 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py @@ -647,6 +647,12 @@ def recategorize_modules_into_consumable_categories(_df): stkout_df = recategorize_modules_into_consumable_categories(stkout_df) item_code_category_mapping = stkout_df[['item_category', 'item_code']].drop_duplicates() +# Add item_category to ResourceFile_Consumables_Item_Designations +item_designations = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_Consumables_Item_Designations.csv') +item_designations = item_designations.drop(columns = 'item_category') +item_designations = item_designations.merge(item_code_category_mapping, left_on = 'Item_Code', right_on = 'item_code', how = 'left', validate = '1:1') +item_designations.drop(columns = 'item_code').to_csv(path_for_new_resourcefiles / 'ResourceFile_Consumables_Item_Designations.csv', index = False) + # --- 6.5 Replace district/fac_name/month entries where missing --- # for var in ['district', 'fac_name', 'month']: cond = stkout_df[var].isna() @@ -833,7 +839,8 @@ def interpolate_missing_with_mean(_ser): # Check that there are not missing values assert not pd.isnull(full_set_interpolated).any().any() -full_set_interpolated = full_set_interpolated.reset_index().merge(item_code_category_mapping, on = 'item_code', how = 'left', validate = 'm:1') +full_set_interpolated = full_set_interpolated.reset_index() +#full_set_interpolated = full_set_interpolated.reset_index().merge(item_code_category_mapping, on = 'item_code', how = 'left', validate = 'm:1') # --- Check that the exported file has the properties required of it by the model code. 
--- # check_format_of_consumables_file(df=full_set_interpolated, fac_ids=fac_ids) diff --git a/src/tlo/methods/consumables.py b/src/tlo/methods/consumables.py index 8e6636ba07..6c943f79c5 100644 --- a/src/tlo/methods/consumables.py +++ b/src/tlo/methods/consumables.py @@ -357,7 +357,7 @@ def check_format_of_consumables_file(df, fac_ids): availability_columns = ['available_prop'] + [f'available_prop_scenario{i}' for i in range(1, number_of_scenarios + 1)] - assert set(df.columns) == {'Facility_ID', 'month', 'item_code'} | set(availability_columns) + assert set(df.columns).issubset({'Facility_ID', 'month', 'item_code'} | set(availability_columns)) # Check that all permutations of Facility_ID, month and item_code are present pd.testing.assert_index_equal( @@ -368,8 +368,10 @@ def check_format_of_consumables_file(df, fac_ids): # Check that every entry for a probability is a float on [0,1] for col in availability_columns: - assert (df[col] <= 1.0).all() and (df[col] >= 0.0).all() - assert not pd.isnull(df[col]).any() + if col in df.columns: # This makes sure that even when all scenarios have not been created, the ones that are + # have appropriate values + assert (df[col] <= 1.0).all() and (df[col] >= 0.0).all() + assert not pd.isnull(df[col]).any() class ConsumablesSummaryCounter: From d1b7d78d7484e95bafebe74613ad1c24ab7e6e3d Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Thu, 12 Sep 2024 16:55:17 +0100 Subject: [PATCH 074/100] add item_category from RF_designations --- ...rceFile_Consumables_availability_small.csv | 4 +- ...ilability_scenarios_for_impact_analysis.py | 45 ++++++++++++++++--- 2 files changed, 42 insertions(+), 7 deletions(-) diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv index 2e745530ac..bb889f0ffd 100644 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:746e8bbdb48854349998b671536d9eed86862d824f50de28fd0242c006993c55 -size 6087715 +oid sha256:6a50172b5823f13d291e88c6de28d9487bcf890940aabaa03031938ba4cb38a5 +size 44835162 diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index b034015427..2430af8e23 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -51,10 +51,13 @@ #------------------------------------------------------ tlo_availability_df = pd.read_csv(path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv") # Drop any scenario data previously included in the resourcefile -tlo_availability_df = tlo_availability_df[['Facility_ID', 'month', 'item_category', 'item_code', 'available_prop']] -program_item_mapping = tlo_availability_df[['item_category', 'item_code']].drop_duplicates() +tlo_availability_df = tlo_availability_df[['Facility_ID', 'month','item_code', 
'available_prop']] -# 1.1.1 Attach district and facility level to this dataset +# Import item_category +program_item_mapping = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_Consumables_Item_Designations.csv')[['Item_Code', 'item_category']] +program_item_mapping = program_item_mapping.rename(columns ={'Item_Code': 'item_code'})[program_item_mapping.item_category.notna()] + +# 1.1.1 Attach district, facility level and item_category to this dataset #---------------------------------------------------------------- # Get TLO Facility_ID for each district and facility level mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv") @@ -63,6 +66,9 @@ tlo_availability_df = tlo_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], on = ['Facility_ID'], how='left') +tlo_availability_df = tlo_availability_df.merge(program_item_mapping, + on = ['item_code'], how='left') + # 1.2 Import scenario data #------------------------------------------------------ scenario_availability_df = pd.read_csv(outputfilepath / "regression_analysis/predictions/predicted_consumable_availability_regression_scenarios.csv") @@ -546,7 +552,7 @@ def interpolate_missing_with_mean(_ser): non_vertical_hiv_availability_df = tlo_availability_df[cond_hiv] nonhivepi_availability_average = nonhivepi_availability_average.rename(columns = {'available_prop':'available_prop_scenario12'}) nonhiv_availability_df['available_prop_scenario12'] = nonhiv_availability_df['available_prop'] -non_vertical_hiv_availability_df = non_vertical_hiv_availability_df.merge(nonhiv_availability_average, on = ['Facility_ID', 'month'], how = 'left', validate = 'm:1') +non_vertical_hiv_availability_df = non_vertical_hiv_availability_df.merge(nonhivepi_availability_average, on = ['Facility_ID', 'month'], how = 'left', validate = 'm:1') minimum_scenario_varlist = ['Facility_ID', 'month', 'item_code', 'available_prop_scenario12'] non_vertical_hiv_scenario_df = pd.concat([non_vertical_hiv_availability_df[minimum_scenario_varlist], nonhiv_availability_df[minimum_scenario_varlist]], ignore_index = True) @@ -585,6 +591,7 @@ def interpolate_missing_with_mean(_ser): # Creating the line plot with ggplot df_for_plots = full_df_with_scenario.merge(mfl[['Facility_ID', 'Facility_Level']], on = 'Facility_ID', how = 'left', validate = "m:1") +df_for_plots = df_for_plots.merge(program_item_mapping, on = 'item_code', how = 'left', validate = "m:1") def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): df_for_line_plot = _df.groupby([_x_axis_var])[['available_prop'] + final_list_of_scenario_vars].mean() df_for_line_plot = df_for_line_plot.reset_index().melt(id_vars=[_x_axis_var], value_vars=['available_prop'] + final_list_of_scenario_vars, @@ -603,7 +610,6 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): generate_barplot_of_scenarios(_df = df_for_plots, _x_axis_var = 'Facility_Level', _filename = 'availability_by_level.png') # Create heatmaps by Facility_Level of average availability by item_category across chosen scenarios -chosen_scenarios_for_heatmap = number_of_scenarios = 12 availability_columns = ['available_prop'] + [f'available_prop_scenario{i}' for i in range(1, number_of_scenarios + 1)] @@ -673,6 +679,35 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): plt.show() plt.close() +# Create heatmap of average availability by Facility_Level just showing scenario 12 +scenario_list = [12] +chosen_availability_columns = ['available_prop'] + 
[f'available_prop_scenario{i}' for i in + scenario_list] +# Pivot the DataFrame +df_for_hiv_plot = df_for_plots +df_for_hiv_plot['hiv_or_other'] = np.where(df_for_hiv_plot['item_category'] == 'hiv', 'hiv', 'other programs') + +aggregated_df = df_for_hiv_plot.groupby(['Facility_Level', 'hiv_or_other'])[chosen_availability_columns].mean().reset_index() +aggregated_df = aggregated_df.rename(columns = {'available_prop': 'Actual', 'available_prop_scenario12': 'HIV moved to Govt supply chain'}) +heatmap_data = aggregated_df.pivot_table(index=['Facility_Level'], # Keep other relevant columns in the index + columns='hiv_or_other', + values=['Actual', 'HIV moved to Govt supply chain']) +# Generate the heatmap +sns.set(font_scale=1) +plt.figure(figsize=(10, 8)) +sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) + +# Customize the plot +plt.title(f'Availability across scenarios') +plt.xlabel('Scenarios') +plt.ylabel(f'Facility Level') +plt.xticks(rotation=90) +plt.yticks(rotation=0) + +plt.savefig(figurespath /f'consumable_availability_heatmap_hiv_v_other.png', dpi=300, bbox_inches='tight') +plt.show() +plt.close() + # Scenario on the X axis, level on the Y axis # Scenario on the X axis, program on the Y axis From 7aae87e06deaf0f31b8e617aaf3cde528f325716 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Thu, 12 Sep 2024 16:56:16 +0100 Subject: [PATCH 075/100] update scenario file for local run --- .../scenario_impact_of_consumable_scenarios.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py index 742fb3a794..2736d8befe 100644 --- a/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py +++ b/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py @@ -35,7 +35,7 @@ def __init__(self): self.end_date = Date(2012, 12, 31) self.pop_size = 1000 # large population size for final simulation - 100,000 self.number_of_draws = 12 # <- 12 scenarios - self.runs_per_draw = 1 # <- repeated this many times + self.runs_per_draw = 2 # <- repeated this many times def log_configuration(self): return { From 3989c44089af23e12c6ce83757f01c0dd43e88c4 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Thu, 12 Sep 2024 17:39:27 +0100 Subject: [PATCH 076/100] add description of scenarios --- ...scenario_impact_of_consumable_scenarios.py | 22 ++++++++++--------- ...ilability_scenarios_for_impact_analysis.py | 16 +++++++++++++- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py index 2736d8befe..85e4af52b7 100644 --- a/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py +++ b/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py @@ -1,16 +1,18 @@ """ This file defines a batch run to calculate the health effect of updated consumable availability estimates as a result of a supply chain intervention. The following scenarios are considered: -1. 'default' - this is the benchmark scenario with 2018 levels of consumable availability -2. 'scenario1' : All items perform as well as consumables other than drugs/diagnostic tests -3. 'scenario2' : 1 + All items perform as well as consumables classified as 'Vital' in the Essential Medicines List -4. 
'scenario3' : 2 + All facilities perform as well as those in which consumables stock is managed by pharmacists
-5. 'scenario4' : 3 + Level 1a facilities perform as well as level 1b
-6. 'scenario5' : 4 + All facilities perform as well as CHAM facilities
-7. 'scenario6' : All facilities have the same probability of consumable availability as the 75th percentile best performing facility for each individual item
-8. 'scenario7' : All facilities have the same probability of consumable availability as the 90th percentile best performing facility for each individual item
-9. 'scenario8' : All facilities have the same probability of consumable availability as the 99th percentile best performing facility for each individual item
-10. 'all': all consumable are always available - provides the theoretical maximum health gains which can be made through improving consumable supply
+1. 'default' : this is the benchmark scenario with 2018 levels of consumable availability
+2. 'scenario1' : [Level 1a + 1b] All items perform as well as consumables other than drugs/diagnostic tests
+3. 'scenario2' : [Level 1a + 1b] 1 + All items perform as well as consumables classified as 'Vital' in the Essential Medicines List
+4. 'scenario3' : [Level 1a + 1b] 2 + All facilities perform as well as those in which consumables stock is managed by pharmacists
+5. 'scenario6' : [Level 1a + 1b] All facilities have the same probability of consumable availability as the 75th percentile best performing facility for each individual item
+6. 'scenario7' : [Level 1a + 1b] All facilities have the same probability of consumable availability as the 90th percentile best performing facility for each individual item
+7. 'scenario8' : [Level 1a + 1b] All facilities have the same probability of consumable availability as the 99th percentile best performing facility for each individual item
+8. 'scenario9' : [Level 1a + 1b + 2] All facilities have the same probability of consumable availability as the 99th percentile best performing facility for each individual item
+9. 'scenario10' : [Level 1a + 1b] All programs perform as well as HIV
+10. 'scenario11' : [Level 1a + 1b] All programs perform as well as EPI
+11. 'scenario12' : [Level 1a + 1b] HIV performs as well as other programs (excluding EPI)
+12. 'all': all consumables are always available - provides the theoretical maximum health gains which can be made through improving consumable supply
 The batch runs are for a large population for a long time with all disease modules and full use of HSIs. 
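As context for the scenario list above: in this scenario file, each batch draw simply indexes a fixed list of availability-scenario names, so number_of_draws must stay equal to the length of that list. Below is a minimal sketch of that mapping, using the twelve options defined at this point in the series; it is illustrative only, since the real class method also receives rng and returns the override inside the scenario object.

scenario_names = ['default',
                  'scenario1', 'scenario2', 'scenario3',
                  'scenario6', 'scenario7', 'scenario8',
                  'scenario9', 'scenario10', 'scenario11',
                  'scenario12',
                  'all']


def draw_parameters(draw_number: int) -> dict:
    # Mirrors the HealthSystem override pattern used by this scenario class
    return {'HealthSystem': {'cons_availability': scenario_names[draw_number]}}


assert len(scenario_names) == 12  # must match number_of_draws at this commit
assert draw_parameters(0)['HealthSystem']['cons_availability'] == 'default'
assert draw_parameters(11)['HealthSystem']['cons_availability'] == 'all'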
Run on the batch system using: diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index 2430af8e23..c620762fce 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -3,7 +3,21 @@ Outputs: * Updated version of ResourceFile_Consumables_availability_small.csv (estimate of consumable available - smaller file for use in the - simulation) which includes new consumable availability estimates for all scenarios + simulation) which includes new consumable availability estimates for all scenarios. The following scenarios are generated +1. 'default' : this is the benchmark scenario with 2018 levels of consumable availability +2. 'scenario1' : [Level 1a + 1b] All items perform as well as consumables other than drugs/diagnostic tests +3. 'scenario2' : [Level 1a + 1b] 1 + All items perform as well as consumables classified as 'Vital' in the Essential Medicines List +4. 'scenario3' : [Level 1a + 1b] 2 + All facilities perform as well as those in which consumables stock is managed by pharmacists +5. 'scenario4' : [Level 1a + 1b] 3 + Level 1a facilities perform as well as level 1b +6. 'scenario5' : [Level 1a + 1b] 4 + All facilities perform as well as CHAM facilities +7. 'scenario6' : [Level 1a + 1b] All facilities have the same probability of consumable availability as the 75th percentile best performing facility for each individual item +8. 'scenario7' : [Level 1a + 1b] All facilities have the same probability of consumable availability as the 90th percentile best performing facility for each individual item +9. 'scenario8' : [Level 1a + 1b] All facilities have the same probability of consumable availability as the 99th percentile best performing facility for each individual item +10. 'scenario9' : [Level 1a + 1b + 2] All facilities have the same probability of consumable availability as the 99th percentile best performing facility for each individual item +11. 'scenario10' : [Level 1a + 1b] All programs perform as well as HIV +12. 'scenario11' : [Level 1a + 1b] All programs perform as well as EPI +13. 'scenario12' : [Level 1a + 1b] HIV performs as well as other programs (excluding EPI) +14. 
'all': all consumable are always available - provides the theoretical maximum health gains which can be made through improving consumable supply Inputs: * outputs/regression_analysis/predictions/predicted_consumable_availability_regression_scenarios.csv - This file is hosted From fedc2fcd55984885ad21da868cf57e2131f4aa60 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Thu, 12 Sep 2024 18:27:28 +0100 Subject: [PATCH 077/100] minor update to correct direction of mapping --- ..._consumable_availability_scenarios_for_impact_analysis.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index c620762fce..20b383821a 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -160,7 +160,10 @@ 'cardiometabolicdisorders': 'ncds', 'cancer': 'ncds', } -scenario_availability_df['category_tlo'] = scenario_availability_df['program_plot'].replace(map_model_programs_to_hhfa) # TODO this does not work +# Reverse the map_model_programs_to_hhfa dictionary +hhfa_to_model_programs = {v: k for k, v in map_model_programs_to_hhfa.items()} + +scenario_availability_df['category_tlo'] = scenario_availability_df['program_plot'].replace(hhfa_to_model_programs) # TODO this might not be relevant # 1.2.5 Consumable/Item code and Category #------------------------------------------------------ From 82cc7f23d1efbc1e42cc8187aaaec16fe42e0190 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Thu, 12 Sep 2024 20:21:57 +0100 Subject: [PATCH 078/100] update scenario to submit to Azure --- .../scenario_impact_of_consumable_scenarios.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py index 85e4af52b7..5a5acd6d14 100644 --- a/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py +++ b/src/scripts/consumables_analyses/scenario_impact_of_consumable_scenarios.py @@ -11,7 +11,6 @@ 8. 'scenario9' : [Level 1a + 1b + 2] All facilities have the same probability of consumable availability as the 99th percentile best performing facility for each individual item 9. 'scenario10' : [Level 1a + 1b] All programs perform as well as HIV 10. 'scenario11' : [Level 1a + 1b] All programs perform as well as EPI -11. 'scenario12' : [Level 1a + 1b] HIV performs as well as other programs (excluding EPI) 12. 'all': all consumable are always available - provides the theoretical maximum health gains which can be made through improving consumable supply The batch runs are for a large population for a long time with all disease modules and full use of HSIs. 
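A note on the dictionary reversal in PATCH 077 above: pandas' Series.replace maps existing values to new ones, so the HHFA program labels must be the dictionary keys, which is why the original call had no effect. The toy example below, built only from the two entries visible in that hunk, also shows a caveat worth flagging: reversing a many-to-one mapping keeps just the last model category per HHFA label, a plausible reason the TODO was retained.

import pandas as pd

# Excerpt of the mapping in the script (model category -> HHFA program label)
map_model_programs_to_hhfa = {'cardiometabolicdisorders': 'ncds', 'cancer': 'ncds'}

# Reversal: for duplicated values the last key wins, so 'ncds' maps to 'cancer' only
hhfa_to_model_programs = {v: k for k, v in map_model_programs_to_hhfa.items()}

s = pd.Series(['ncds', 'other'])
print(s.replace(map_model_programs_to_hhfa).tolist())  # ['ncds', 'other'] - keys never match
print(s.replace(hhfa_to_model_programs).tolist())      # ['cancer', 'other'] - keys now match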
@@ -34,10 +33,10 @@ def __init__(self): super().__init__() self.seed = 99 self.start_date = Date(2010, 1, 1) - self.end_date = Date(2012, 12, 31) - self.pop_size = 1000 # large population size for final simulation - 100,000 - self.number_of_draws = 12 # <- 12 scenarios - self.runs_per_draw = 2 # <- repeated this many times + self.end_date = Date(2019, 12, 31) + self.pop_size = 100_000 # large population size for final simulation - 100,000 + self.number_of_draws = 11 # <- 11 scenarios + self.runs_per_draw = 10 # <- repeated this many times def log_configuration(self): return { @@ -61,7 +60,6 @@ def draw_parameters(self, draw_number, rng): 'scenario1', 'scenario2', 'scenario3', 'scenario6', 'scenario7', 'scenario8', 'scenario9', 'scenario10', 'scenario11', - 'scenario12', 'all'][draw_number] } } From c9bffa501d3c908b60c4e7b6c92ca8fe5ce5541d Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Tue, 17 Sep 2024 10:44:52 +0300 Subject: [PATCH 079/100] minor changes to figures + allow new results to be loaded --- ...analysis_impact_of_consumable_scenarios.py | 60 ++++++++++++------- 1 file changed, 37 insertions(+), 23 deletions(-) diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py index fbdd03691d..89c67dd349 100644 --- a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py +++ b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py @@ -56,14 +56,14 @@ unflatten_flattened_multi_index_in_logging, ) -outputspath = Path('./outputs/') -figurespath = Path(outputspath / 'impact_of_consumable_scenarios') +outputspath = Path('./outputs') +figurespath = Path(outputspath / 'impact_of_consumable_scenarios_12Sep2024') figurespath.mkdir(parents=True, exist_ok=True) # create directory if it doesn't exist resourcefilepath = Path("./resources") # Declare period for which the results will be generated (defined inclusively) -TARGET_PERIOD = (Date(2010, 1, 1), Date(2019, 12, 31)) +TARGET_PERIOD = (Date(2015, 1, 1), Date(2019, 12, 31)) make_graph_file_name = lambda stub: output_folder / f"{stub.replace('*', '_star_')}.png" # noqa: E731 @@ -86,7 +86,7 @@ def do_bar_plot_with_ci(_df, annotations=None, xticklabels_horizontal_and_wrappe ]) xticks = {(i + 0.5): k for i, k in enumerate(_df.index)} - colors = plt.get_cmap('tab10')(np.linspace(0, 1, len(params['value']))) # Generate different colors for each bar + colors = plt.get_cmap('tab20')(np.linspace(0, 1, len(params['value']))) # Generate different colors for each bar fig, ax = plt.subplots() ax.bar( @@ -110,6 +110,11 @@ def do_bar_plot_with_ci(_df, annotations=None, xticklabels_horizontal_and_wrappe else: wrapped_labs = ["\n".join(textwrap.wrap(_lab, 20)) for _lab in xticks.values()] ax.set_xticklabels(wrapped_labs) + + # Set font size for y-tick labels + ax.tick_params(axis='y', labelsize=10) + ax.tick_params(axis='x', labelsize=10) + ax.grid(axis="y") ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) @@ -334,7 +339,8 @@ def find_difference_relative_to_comparison(_ser: pd.Series, # Find results_folder associated with a given batch_file and get most recent #results_folder = get_scenario_outputs('impact_of_consumable_scenarios.py', outputspath) -results_folder = Path(outputspath / 'sakshi.mohan@york.ac.uk/impact_of_consumables_scenarios-2024-06-11T204007Z/') +results_folder = Path(outputspath / 'sakshi.mohan@york.ac.uk/impact_of_consumables_scenarios-2024-09-12T192454Z/') 
+#results_folder = Path(outputspath / 'impact_of_consumables_scenarios-2024-09-12T155640Z/') # look at one log (so can decide what to extract) log = load_pickled_dataframes(results_folder) @@ -347,11 +353,12 @@ def find_difference_relative_to_comparison(_ser: pd.Series, params_dict = {'default': 'Actual', 'scenario1': 'General consumables', 'scenario2': 'Vital medicines', 'scenario3': 'Pharmacist-managed', 'scenario4': 'Level 1b', 'scenario5': 'CHAM', 'scenario6': '75th percentile facility', 'scenario7': '90th percentile facility', 'scenario8': 'Best facility', - 'all': 'Perfect'} + 'scenario9': 'Best facility (including DHO)','scenario10': 'HIV supply chain','scenario11': 'EPI supply chain', + 'scenario12': 'HIV moved to Govt supply chain', 'all': 'Perfect'} params_dict_df = pd.DataFrame.from_dict(params_dict, orient='index', columns=['name_of_scenario']).reset_index().rename(columns = {'index': 'value'}) params = params.merge(params_dict_df, on = 'value', how = 'left', validate = '1:1') scenarios = params['name_of_scenario'] #range(len(params)) # X-axis values representing time periods -drop_scenarios = ['Level 1b', 'CHAM'] # Drops scenarios which are no longer considered important for comparison +drop_scenarios = ['Level 1b', 'CHAM', 'HIV moved to Govt supply chain'] # Drops scenarios which are no longer considered important for comparison # %% Extracting results from run @@ -404,7 +411,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, comparison= 0) # sets the comparator to 0 which is the Actual scenario ).T ).iloc[0].unstack() -num_dalys_averted['scenario'] = scenarios.to_list()[1:10] +num_dalys_averted['scenario'] = scenarios.to_list()[1:12] num_dalys_averted = num_dalys_averted.set_index('scenario') # Get percentage DALYs averted @@ -417,7 +424,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, scaled=True) ).T ).iloc[0].unstack() -pc_dalys_averted['scenario'] = scenarios.to_list()[1:10] +pc_dalys_averted['scenario'] = scenarios.to_list()[1:12] pc_dalys_averted = pc_dalys_averted.set_index('scenario') # %% Chart of number of DALYs averted @@ -428,7 +435,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, fig, ax = do_bar_plot_with_ci( (chosen_num_dalys_averted / 1e6).clip(lower=0.0), annotations=[ - f"{round(row['mean'], 1)} % \n ({round(row['lower'], 1)}-{round(row['upper'], 1)}) %" + f"{round(row['mean'], 1)} % \n ({round(row['lower'], 1)}- \n {round(row['upper'], 1)}) %" for _, row in chosen_pc_dalys_averted.clip(lower=0.0).iterrows() ], xticklabels_horizontal_and_wrapped=False, @@ -472,7 +479,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, comparison= 0) # sets the comparator to 0 which is the Actual scenario ).T ).iloc[0].unstack() - num_dalys_averted_by_cause['scenario'] = scenarios.to_list()[1:10] + num_dalys_averted_by_cause['scenario'] = scenarios.to_list()[1:12] num_dalys_averted_by_cause = num_dalys_averted_by_cause.set_index('scenario') # Get percentage DALYs averted @@ -485,7 +492,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, scaled=True) ).T ).iloc[0].unstack() - pc_dalys_averted_by_cause['scenario'] = scenarios.to_list()[1:10] + pc_dalys_averted_by_cause['scenario'] = scenarios.to_list()[1:12] pc_dalys_averted_by_cause = pc_dalys_averted_by_cause.set_index('scenario') # Create a plot of DALYs averted by cause @@ -512,9 +519,10 @@ def find_difference_relative_to_comparison(_ser: pd.Series, ax.set_ylabel(f'Additional DALYs averted \n(Millions)') fig.tight_layout() fig.savefig(figurespath / 
name_of_plot.replace(' ', '_').replace(',', '').replace('/', '_').replace('\n', '')) - fig.show() + #fig.show() plt.close(fig) +''' # PLot DALYs accrued by cause for cause in top_10_causes_of_dalys: name_of_plot = f'Total DALYs accrued by cause - \n {cause}, {target_period()}' @@ -544,6 +552,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, plt.close(fig) # TODO Fix xticklabels in the plots above +''' # 1.3 Total DALYs averted per person #---------------------------------------- @@ -562,7 +571,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, comparison= 0) # sets the comparator to 0 which is the Actual scenario ).T ).iloc[0].unstack() -num_dalys_averted_per_person_year['scenario'] = scenarios.to_list()[1:10] +num_dalys_averted_per_person_year['scenario'] = scenarios.to_list()[1:12] num_dalys_averted_per_person_year = num_dalys_averted_per_person_year.set_index('scenario') # Get percentage DALYs averted @@ -575,7 +584,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, scaled=True) ).T ).iloc[0].unstack() -pct_dalys_averted_per_person_year['scenario'] = scenarios.to_list()[1:10] +pct_dalys_averted_per_person_year['scenario'] = scenarios.to_list()[1:12] pct_dalys_averted_per_person_year = pct_dalys_averted_per_person_year.set_index('scenario') # %% Chart of number of DALYs averted @@ -621,7 +630,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, comparison=0) # sets the comparator to 0 which is the Actual scenario ).T ).iloc[0].unstack() - num_dalys_averted_per_person_year_by_cause['scenario'] = scenarios.to_list()[1:10] + num_dalys_averted_per_person_year_by_cause['scenario'] = scenarios.to_list()[1:12] num_dalys_averted_per_person_year_by_cause = num_dalys_averted_per_person_year_by_cause.set_index('scenario') # Get percentage DALYs averted @@ -634,7 +643,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, scaled=True) ).T ).iloc[0].unstack() - pct_dalys_averted_per_person_year_by_cause['scenario'] = scenarios.to_list()[1:10] + pct_dalys_averted_per_person_year_by_cause['scenario'] = scenarios.to_list()[1:12] pct_dalys_averted_per_person_year_by_cause = pct_dalys_averted_per_person_year_by_cause.set_index('scenario') # Create a plot of DALYs averted by cause @@ -665,7 +674,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, ax.set_ylabel(f'Additional DALYs averted per person') fig.tight_layout() fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '').replace('/', '_').replace('\n', '')) - fig.show() + #fig.show() plt.close(fig) # 2. 
Health work time spent v DALYs accrued @@ -704,11 +713,11 @@ def get_capacity_used_by_cadre_and_level(_df): for cadre_level in capacity_used.index: print(cadre_level) name_of_plot = f'Capacity used - \n {cadre_level}, {target_period()}' - scenarios_to_drop = capacity_used.columns[capacity_used.columns.get_level_values(0).isin([4,5])] + scenarios_to_drop = capacity_used.columns[capacity_used.columns.get_level_values(0).isin([10])] chosen_capacity_used = capacity_used.drop(columns = scenarios_to_drop) chosen_capacity_used = chosen_capacity_used[chosen_capacity_used.index == cadre_level] chosen_capacity_used = chosen_capacity_used.unstack().reset_index().drop(columns = ['level_2']).pivot(columns ='stat', index = 'draw').droplevel(0,axis = 1) - chosen_capacity_used['scenario'] = [*scenarios.to_list()[0:4],*scenarios.to_list()[6:10]] + chosen_capacity_used['scenario'] = [*scenarios.to_list()[0:10], scenarios.to_list()[11]] # [*scenarios.to_list()[0:4],*scenarios.to_list()[6:10]] #TODO fix above code to be automated chosen_capacity_used = chosen_capacity_used.set_index('scenario') fig, ax = do_bar_plot_with_ci( @@ -748,9 +757,14 @@ def get_capacity_used_by_cadre_and_level(_df): tlo_availability_df = tlo_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], on = ['Facility_ID'], how='left') # Attach programs -programs = pd.read_csv(resourcefilepath / 'healthsystem'/ 'consumables' / "ResourceFile_Consumables_availability_and_usage.csv")[['category', 'item_code', 'module_name']] -programs = programs.drop_duplicates('item_code') -tlo_availability_df = tlo_availability_df.merge(programs, on = ['item_code'], how = 'left') +program_item_mapping = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_Consumables_Item_Designations.csv')[['Item_Code', 'item_category']] +program_item_mapping = program_item_mapping.rename(columns ={'Item_Code': 'item_code'})[program_item_mapping.item_category.notna()] +tlo_availability_df = tlo_availability_df.merge(program_item_mapping,on = ['item_code'], how='left') + +# First a heatmap of current availability +fac_levels = {'0': 'Health Post', '1a': 'Health Centers', '1b': 'Rural/Community Hospitals', '2': 'District Hospitals', '3': 'Central Hospitals', '4': 'Mental Hospital'} +df_for_plots = tlo_availability_df + availability_scenarios = ['available_prop', 'available_prop_scenario1', 'available_prop_scenario2', 'available_prop_scenario3', 'available_prop_scenario4', From ecaf310d7b9b6e951a7306dabf8e04f2390c7c98 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Tue, 17 Sep 2024 10:44:59 +0300 Subject: [PATCH 080/100] minor changes to figures --- ...ilability_scenarios_for_impact_analysis.py | 48 ++++++++++++++++++- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index 20b383821a..9a7ad55313 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -581,7 +581,7 @@ def interpolate_missing_with_mean(_ser): full_df_with_scenario = df_new_scenarios6to8[old_vars + 
['available_prop'] + [col for col in list_of_scenario_variables_first_stage if col != 'available_prop_scenario9']].reset_index().drop('index', axis = 1) full_df_with_scenario = full_df_with_scenario.merge(df_new_scenarios9[old_vars + ['available_prop_scenario9']], on = old_vars, how = 'left', validate = "1:1") -list_of_scenario_suffixes_second_stage = list_of_scenario_suffixes_first_stage + ['scenario10', 'scenario11', 'scenario12'] +list_of_scenario_suffixes_second_stage = list_of_scenario_suffixes_first_stage + ['c', 'scenario11', 'scenario12'] final_list_of_scenario_vars = ['available_prop_' + item for item in list_of_scenario_suffixes_second_stage] full_df_with_scenario = full_df_with_scenario.merge(hiv_scenario_df[old_vars + ['available_prop_scenario10']], on = old_vars, how = 'left', validate = "1:1") full_df_with_scenario = full_df_with_scenario.merge(epi_scenario_df[old_vars + ['available_prop_scenario11']], on = old_vars, how = 'left', validate = "1:1") @@ -601,6 +601,8 @@ def interpolate_missing_with_mean(_ser): # 8. Plot new availability estimates by scenario #********************************************************************************************* +full_df_with_scenario = pd.read_csv(path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv") + # Create the directory if it doesn't exist figurespath = outputfilepath / 'consumable_scenario_analysis' if not os.path.exists(figurespath): @@ -696,6 +698,48 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): plt.show() plt.close() +# Create heatmap of average availability by item_category across chosen scenarios +scenario_list = [1,2,3,6,7,8,10,11] +chosen_availability_columns = ['available_prop'] + [f'available_prop_scenario{i}' for i in + scenario_list] +scenario_names_dict = {'available_prop': 'Actual', 'available_prop_scenario1': 'General consumables', 'available_prop_scenario2': 'Vital medicines', + 'available_prop_scenario3': 'Pharmacist-managed', 'available_prop_scenario4': 'Level 1b', 'available_prop_scenario5': 'CHAM', + 'available_prop_scenario6': '75th percentile facility', 'available_prop_scenario7': '90th percentile facility', 'available_prop_scenario8': 'Best facility', + 'available_prop_scenario9': 'Best facility (including DHO)','available_prop_scenario10': 'HIV supply chain', 'available_prop_scenario11': 'EPI supply chain', + 'available_prop_scenario12': 'HIV moved to Govt supply chain'} + +# Pivot the DataFrame +aggregated_df = df_for_plots.groupby(['item_category'])[chosen_availability_columns].mean().reset_index() +heatmap_data = aggregated_df.set_index('item_category') + +# Calculate the aggregate row and column +aggregate_col= aggregated_df[chosen_availability_columns].mean() +#overall_aggregate = aggregate_col.mean() + +# Add aggregate row and column +heatmap_data['Perfect'] = 1 # Add a column representing the perfect scenario +aggregate_col['Perfect'] = 1 +heatmap_data.loc['Perfect'] = aggregate_col + +# Update column names for x-axis labels +heatmap_data = heatmap_data.rename(columns = scenario_names_dict) +# Generate the heatmap +sns.set(font_scale=1.2) +plt.figure(figsize=(10, 8)) +sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) + +# Customize the plot +plt.title(f'Availability across scenarios') +plt.xlabel('Scenarios') +plt.ylabel(f'Facility Level') +plt.xticks(rotation=90) +plt.yticks(rotation=0) + +plt.savefig(figurespath /f'consumable_availability_heatmap_bycategory_withoutDHO.png', 
dpi=300, bbox_inches='tight')
+plt.show()
+plt.close()
+
+'''
 # Create heatmap of average availability by Facility_Level just showing scenario 12
 scenario_list = [12]
 chosen_availability_columns = ['available_prop'] + [f'available_prop_scenario{i}' for i in
@@ -730,7 +774,7 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename):
 # Scenario on the X axis, program on the Y axis
 # TODO add heat maps i. heatmap by item_category across the scenarios
-'''
+
 # 2.3.2. Browse missingness in the availability_change_prop variable
 #------------------------------------------------------
 pivot_table = pd.pivot_table(scenario_availability_df,

From c26245289b6999c41c9e4e112bf4ca69e0df3d4a Mon Sep 17 00:00:00 2001
From: sakshimohan
Date: Tue, 17 Sep 2024 10:56:04 +0300
Subject: [PATCH 081/100] Explicitly specify colours for each scenario

---
 ...analysis_impact_of_consumable_scenarios.py | 33 ++++++++++++++-----
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py
index 89c67dd349..8ae7549056 100644
--- a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py
+++ b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py
@@ -79,14 +79,32 @@ def drop_outside_period(_df):
 def do_bar_plot_with_ci(_df, annotations=None, xticklabels_horizontal_and_wrapped=False):
     """Make a vertical bar plot for each row of _df, using the columns to identify the height of the bar and the
-    extent of the error bar."""
+    extent of the error bar."""
+
     yerr = np.array([
         (_df['mean'] - _df['lower']).values,
         (_df['upper'] - _df['mean']).values,
     ])

     xticks = {(i + 0.5): k for i, k in enumerate(_df.index)}
-    colors = plt.get_cmap('tab20')(np.linspace(0, 1, len(params['value']))) # Generate different colors for each bar
+
+    # Define color mapping based on index values
+    color_mapping = {
+        'Actual': '#1f77b4',
+        'General consumables':'#ff7f0e',
+        'Vital medicines': '#2ca02c',
+        'Pharmacist-managed':'#d62728',
+        '75th percentile facility':'#9467bd',
+        '90th percentile facility':'#8c564b',
+        'Best facility': '#e377c2',
+        'Best facility (including DHO)': '#7f7f7f',
+        'HIV supply chain': '#bcbd22',
+        'EPI supply chain': '#17becf',
+        'Perfect':'#31a354'
+    }
+
+    colors = [_df.index[i] in color_mapping for i in range(len(_df.index))]
+    color_values = [color_mapping.get(idx, 'default_color') for idx in _df.index]

     fig, ax = plt.subplots()
     ax.bar(
@@ -94,22 +112,22 @@ def do_bar_plot_with_ci(_df, annotations=None, xticklabels_horizontal_and_wrappe
         _df['mean'].values,
         yerr=yerr,
         alpha=1,
-        color = colors,
+        color=color_values,
         ecolor='black',
         capsize=10,
         label=xticks.values()
     )
     if annotations:
         for xpos, ypos, text in zip(xticks.keys(), _df['upper'].values, annotations):
-            ax.text(xpos, ypos * 1.05, text, horizontalalignment='center', fontsize = 9)
+            ax.text(xpos, ypos * 1.05, text, horizontalalignment='center', fontsize=9)
+
     ax.set_xticks(list(xticks.keys()))
     if not xticklabels_horizontal_and_wrapped:
-        # xticklabels will be vertical and not wrapped
         wrapped_labs = ["\n".join(textwrap.wrap(_lab, 20)) for _lab in xticks.values()]
-        ax.set_xticklabels(wrapped_labs, rotation=45, ha = 'right')
+        ax.set_xticklabels(wrapped_labs, rotation=45, ha='right', fontsize=10)
     else:
         wrapped_labs = ["\n".join(textwrap.wrap(_lab, 20)) for _lab in xticks.values()]
-        ax.set_xticklabels(wrapped_labs)
+        ax.set_xticklabels(wrapped_labs, fontsize=10)
+
+    # Set font
size for y-tick labels ax.tick_params(axis='y', labelsize=10) @@ -983,4 +1001,3 @@ def get_counts_of_hsi_by_short_treatment_id(_df): hsi_by_short_treatment_id = hsi_by_short_treatment_id.rename(columns = {'level_2': 'Short_Treatment_ID', 0: 'qty_of_HSIs'}) # Cost of consumables? - From 3cd31b6255c6a803f5a76ac4c0c54ab75ac45ba6 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Tue, 17 Sep 2024 10:58:24 +0300 Subject: [PATCH 082/100] drop the DHO scenario --- .../analysis_impact_of_consumable_scenarios.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py index 8ae7549056..796102c9e5 100644 --- a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py +++ b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py @@ -376,7 +376,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, params_dict_df = pd.DataFrame.from_dict(params_dict, orient='index', columns=['name_of_scenario']).reset_index().rename(columns = {'index': 'value'}) params = params.merge(params_dict_df, on = 'value', how = 'left', validate = '1:1') scenarios = params['name_of_scenario'] #range(len(params)) # X-axis values representing time periods -drop_scenarios = ['Level 1b', 'CHAM', 'HIV moved to Govt supply chain'] # Drops scenarios which are no longer considered important for comparison +drop_scenarios = ['Level 1b', 'CHAM', 'Best facility (including DHO)', 'HIV moved to Govt supply chain'] # Drops scenarios which are no longer considered important for comparison # %% Extracting results from run From f22d845655c8f06e43015258d6664e58f9dfab61 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Tue, 17 Sep 2024 11:08:05 +0300 Subject: [PATCH 083/100] rename columns for df_for_plot and streamline the choice of scenarios for all plots --- ...ilability_scenarios_for_impact_analysis.py | 39 ++++++++----------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index 9a7ad55313..46f2d00022 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -608,9 +608,21 @@ def interpolate_missing_with_mean(_ser): if not os.path.exists(figurespath): os.makedirs(figurespath) -# Creating the line plot with ggplot +# Prepare the availability dataframe for descriptive plots df_for_plots = full_df_with_scenario.merge(mfl[['Facility_ID', 'Facility_Level']], on = 'Facility_ID', how = 'left', validate = "m:1") df_for_plots = df_for_plots.merge(program_item_mapping, on = 'item_code', how = 'left', validate = "m:1") +scenario_list = [1,2,3,6,7,8,10,11] +chosen_availability_columns = ['available_prop'] + [f'available_prop_scenario{i}' for i in + scenario_list] +scenario_names_dict = {'available_prop': 'Actual', 'available_prop_scenario1': 'General consumables', 'available_prop_scenario2': 'Vital medicines', + 'available_prop_scenario3': 
'Pharmacist-managed', 'available_prop_scenario4': 'Level 1b', 'available_prop_scenario5': 'CHAM', + 'available_prop_scenario6': '75th percentile facility', 'available_prop_scenario7': '90th percentile facility', 'available_prop_scenario8': 'Best facility', + 'available_prop_scenario9': 'Best facility (including DHO)','available_prop_scenario10': 'HIV supply chain', 'available_prop_scenario11': 'EPI supply chain', + 'available_prop_scenario12': 'HIV moved to Govt supply chain'} +# recreate the chosen columns list based on the mapping above +chosen_availability_columns = [scenario_names_dict[col] for col in chosen_availability_columns] +df_for_plots = df_for_plots.rename(columns = scenario_names_dict) + def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): df_for_line_plot = _df.groupby([_x_axis_var])[['available_prop'] + final_list_of_scenario_vars].mean() df_for_line_plot = df_for_line_plot.reset_index().melt(id_vars=[_x_axis_var], value_vars=['available_prop'] + final_list_of_scenario_vars, @@ -629,19 +641,15 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): generate_barplot_of_scenarios(_df = df_for_plots, _x_axis_var = 'Facility_Level', _filename = 'availability_by_level.png') # Create heatmaps by Facility_Level of average availability by item_category across chosen scenarios -number_of_scenarios = 12 -availability_columns = ['available_prop'] + [f'available_prop_scenario{i}' for i in - range(1, number_of_scenarios + 1)] - for level in fac_levels: # Generate a heatmap # Pivot the DataFrame - aggregated_df = df_for_plots.groupby(['item_category', 'Facility_Level'])[availability_columns].mean().reset_index() + aggregated_df = df_for_plots.groupby(['item_category', 'Facility_Level'])[chosen_availability_columns].mean().reset_index() aggregated_df = aggregated_df[aggregated_df.Facility_Level.isin([level])] heatmap_data = aggregated_df.set_index('item_category').drop(columns = 'Facility_Level') # Calculate the aggregate row and column - aggregate_col= aggregated_df[availability_columns].mean() + aggregate_col= aggregated_df[chosen_availability_columns].mean() #overall_aggregate = aggregate_col.mean() # Add aggregate row and column @@ -650,7 +658,7 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): heatmap_data.loc['Average'] = aggregate_col # Generate the heatmap - sns.set(font_scale=0.5) + sns.set(font_scale=1.2) plt.figure(figsize=(10, 8)) sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) @@ -666,9 +674,6 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): plt.close() # Create heatmap of average availability by Facility_Level across chosen scenarios -scenario_list = [1,2,3,6,7,8,9,10,11] -chosen_availability_columns = ['available_prop'] + [f'available_prop_scenario{i}' for i in - scenario_list] # Pivot the DataFrame aggregated_df = df_for_plots.groupby(['Facility_Level'])[chosen_availability_columns].mean().reset_index() heatmap_data = aggregated_df.set_index('Facility_Level') @@ -683,7 +688,7 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): heatmap_data.loc['Average'] = aggregate_col # Generate the heatmap -sns.set(font_scale=0.5) +sns.set(font_scale=1.2) plt.figure(figsize=(10, 8)) sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) @@ -699,15 +704,6 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): plt.close() # Create heatmap of average 
availability by item_category across chosen scenarios -scenario_list = [1,2,3,6,7,8,10,11] -chosen_availability_columns = ['available_prop'] + [f'available_prop_scenario{i}' for i in - scenario_list] -scenario_names_dict = {'available_prop': 'Actual', 'available_prop_scenario1': 'General consumables', 'available_prop_scenario2': 'Vital medicines', - 'available_prop_scenario3': 'Pharmacist-managed', 'available_prop_scenario4': 'Level 1b', 'available_prop_scenario5': 'CHAM', - 'available_prop_scenario6': '75th percentile facility', 'available_prop_scenario7': '90th percentile facility', 'available_prop_scenario8': 'Best facility', - 'available_prop_scenario9': 'Best facility (including DHO)','available_prop_scenario10': 'HIV supply chain', 'available_prop_scenario11': 'EPI supply chain', - 'available_prop_scenario12': 'HIV moved to Govt supply chain'} - # Pivot the DataFrame aggregated_df = df_for_plots.groupby(['item_category'])[chosen_availability_columns].mean().reset_index() heatmap_data = aggregated_df.set_index('item_category') @@ -722,7 +718,6 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): heatmap_data.loc['Perfect'] = aggregate_col # Update column names for x-axis labels -heatmap_data = heatmap_data.rename(columns = scenario_names_dict) # Generate the heatmap sns.set(font_scale=1.2) plt.figure(figsize=(10, 8)) From ec338b882daba9b94aa69ea1f899984b1b5cf83a Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Thu, 19 Sep 2024 12:32:21 +0300 Subject: [PATCH 084/100] update figures for think tank presentation --- ...analysis_impact_of_consumable_scenarios.py | 57 +++++---- ...ilability_scenarios_for_impact_analysis.py | 120 +++++++++++++----- 2 files changed, 124 insertions(+), 53 deletions(-) diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py index 796102c9e5..1af57aa4fc 100644 --- a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py +++ b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py @@ -119,7 +119,7 @@ def do_bar_plot_with_ci(_df, annotations=None, xticklabels_horizontal_and_wrappe ) if annotations: for xpos, ypos, text in zip(xticks.keys(), _df['upper'].values, annotations): - ax.text(xpos, ypos * 1.05, text, horizontalalignment='center', fontsize=9) + ax.text(xpos, ypos * 1.05, text, horizontalalignment='center', fontsize=11) ax.set_xticks(list(xticks.keys())) if not xticklabels_horizontal_and_wrapped: @@ -130,8 +130,8 @@ def do_bar_plot_with_ci(_df, annotations=None, xticklabels_horizontal_and_wrappe ax.set_xticklabels(wrapped_labs, fontsize=10) # Set font size for y-tick labels - ax.tick_params(axis='y', labelsize=10) - ax.tick_params(axis='x', labelsize=10) + ax.tick_params(axis='y', labelsize=12) + ax.tick_params(axis='x', labelsize=11) ax.grid(axis="y") ax.spines['top'].set_visible(False) @@ -397,6 +397,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, num_dalys_summarized = summarize(num_dalys).loc[0].unstack() num_dalys_summarized['scenario'] = scenarios.to_list() num_dalys_summarized = num_dalys_summarized.set_index('scenario') +num_dalys_summarized.to_csv(figurespath/ 'num_dalys_summarized.csv') # Plot DALYS accrued (with xtickabels horizontal and wrapped) name_of_plot = f'Total DALYs accrued, {target_period()}' @@ -479,6 +480,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, num_dalys_by_cause_summarized = num_dalys_by_cause_summarized.reset_index() 
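# An illustrative aside, not part of the commit: the rename/pivot just below turns the
# long (draw, stat, cause) summary into one row per draw and cause, with
# 'lower'/'mean'/'upper' as columns. A self-contained toy equivalent of the same
# pandas calls, using invented numbers:
#
#     import pandas as pd
#     long = pd.DataFrame({
#         'draw': [0, 0, 0, 0, 0, 0],
#         'stat': ['lower', 'mean', 'upper'] * 2,
#         'cause': ['AIDS'] * 3 + ['Malaria'] * 3,
#         'DALYs_accrued': [1.0, 1.2, 1.4, 2.0, 2.3, 2.6],
#     })
#     wide = long.pivot(index=['draw', 'cause'], columns='stat', values='DALYs_accrued')
#     # wide now has columns ['lower', 'mean', 'upper'], indexed by (draw, cause)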
num_dalys_by_cause_summarized = num_dalys_by_cause_summarized.rename(columns = {'level_2':'cause', 0: 'DALYs_accrued'}) num_dalys_by_cause_summarized = num_dalys_by_cause_summarized.pivot(index=['draw','cause'], columns='stat', values='DALYs_accrued') +num_dalys_by_cause_summarized.to_csv(figurespath / 'num_dalys_by_cause_summarized.csv') # Get top 10 causes until Actual num_dalys_by_cause_actual = num_dalys_by_cause_summarized[num_dalys_by_cause_summarized.index.get_level_values(0) == 0] @@ -613,7 +615,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, fig, ax = do_bar_plot_with_ci( (chosen_num_dalys_averted_per_person_year).clip(lower=0.0), annotations=[ - f"{round(row['mean'], 1)} % \n ({round(row['lower'], 1)}-{round(row['upper'], 1)}) %" + f"{round(row['mean'], 1)} % \n ({round(row['lower'], 1)}- \n {round(row['upper'], 1)}) %" for _, row in chosen_pct_dalys_averted_per_person_year.clip(lower=0.0).iterrows() ], xticklabels_horizontal_and_wrapped=False, @@ -780,28 +782,37 @@ def get_capacity_used_by_cadre_and_level(_df): tlo_availability_df = tlo_availability_df.merge(program_item_mapping,on = ['item_code'], how='left') # First a heatmap of current availability -fac_levels = {'0': 'Health Post', '1a': 'Health Centers', '1b': 'Rural/Community Hospitals', '2': 'District Hospitals', '3': 'Central Hospitals', '4': 'Mental Hospital'} -df_for_plots = tlo_availability_df - -availability_scenarios = ['available_prop', - 'available_prop_scenario1', 'available_prop_scenario2', - 'available_prop_scenario3', 'available_prop_scenario4', - 'available_prop_scenario5', 'available_prop_scenario6', - 'available_prop_scenario7', 'available_prop_scenario8'] +fac_levels = {'0': 'Health Post', '1a': 'Health Centers', '1b': 'Rural/Community \n Hospitals', '2': 'District Hospitals', '3': 'Central Hospitals', '4': 'Mental Hospital'} +chosen_fac_levels_for_plot = ['0', '1a', '1b', '2', '3', '4'] +correct_order_of_levels = ['Health Post', 'Health Centers', 'Rural/Community \n Hospitals', 'District Hospitals', 'Central Hospitals','Mental Hospital'] +df_for_plots = tlo_availability_df[tlo_availability_df.Facility_Level.isin(chosen_fac_levels_for_plot)] +df_for_plots['Facility_Level'] = df_for_plots['Facility_Level'].map(fac_levels) + +scenario_list = [1,2,3,6,7,8,10,11] +chosen_availability_columns = ['available_prop'] + [f'available_prop_scenario{i}' for i in + scenario_list] +scenario_names_dict = {'available_prop': 'Actual', 'available_prop_scenario1': 'General consumables', 'available_prop_scenario2': 'Vital medicines', + 'available_prop_scenario3': 'Pharmacist- managed', 'available_prop_scenario4': 'Level 1b', 'available_prop_scenario5': 'CHAM', + 'available_prop_scenario6': '75th percentile facility', 'available_prop_scenario7': '90th percentile facility', 'available_prop_scenario8': 'Best facility', + 'available_prop_scenario9': 'Best facility (including DHO)','available_prop_scenario10': 'HIV supply chain', 'available_prop_scenario11': 'EPI supply chain', + 'available_prop_scenario12': 'HIV moved to Govt supply chain'} +# recreate the chosen columns list based on the mapping above +chosen_availability_columns = [scenario_names_dict[col] for col in chosen_availability_columns] +df_for_plots = df_for_plots.rename(columns = scenario_names_dict) + i = 0 -for avail_scenario in availability_scenarios: +for avail_scenario in chosen_availability_columns: # Generate a heatmap # Pivot the DataFrame - aggregated_df = tlo_availability_df.groupby(['category', 
'Facility_Level'])[avail_scenario].mean().reset_index() - aggregated_df = aggregated_df[aggregated_df.Facility_Level.isin(['1a', '1b'])] - aggregated_df.loc[aggregated_df.Facility_Level == '1a','Facility_Level'] = 'Health centres' - aggregated_df.loc[aggregated_df.Facility_Level == '1b','Facility_Level'] = 'Hospitals' - heatmap_data = aggregated_df.pivot("category", "Facility_Level", avail_scenario) + aggregated_df = df_for_plots.groupby(['item_category', 'Facility_Level'])[avail_scenario].mean().reset_index() + heatmap_data = aggregated_df.pivot("item_category", "Facility_Level", avail_scenario) + heatmap_data = heatmap_data[correct_order_of_levels] # Maintain the order # Calculate the aggregate row and column aggregate_col= aggregated_df.groupby('Facility_Level')[avail_scenario].mean() - aggregate_row = aggregated_df.groupby('category')[avail_scenario].mean() - overall_aggregate = aggregated_df[avail_scenario].mean() + aggregate_col = aggregate_col[correct_order_of_levels] + aggregate_row = aggregated_df.groupby('item_category')[avail_scenario].mean() + overall_aggregate = df_for_plots[avail_scenario].mean() # Add aggregate row and column heatmap_data['Average'] = aggregate_row @@ -817,11 +828,11 @@ def get_capacity_used_by_cadre_and_level(_df): plt.title(scenarios[i]) plt.xlabel('Facility Level') plt.ylabel(f'Disease/Public health \n program') - plt.xticks(rotation=45) + plt.xticks(rotation=90) plt.yticks(rotation=0) - plt.savefig(figurespath /f'consumable_availability_heatmap_1a_2_{avail_scenario}.png', dpi=300, bbox_inches='tight') - plt.show() + plt.savefig(figurespath /f'consumable_availability_heatmap_{avail_scenario}.png', dpi=300, bbox_inches='tight') + #plt.show() plt.close() i = i + 1 diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index 46f2d00022..323b6651af 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -614,11 +614,11 @@ def interpolate_missing_with_mean(_ser): scenario_list = [1,2,3,6,7,8,10,11] chosen_availability_columns = ['available_prop'] + [f'available_prop_scenario{i}' for i in scenario_list] -scenario_names_dict = {'available_prop': 'Actual', 'available_prop_scenario1': 'General consumables', 'available_prop_scenario2': 'Vital medicines', - 'available_prop_scenario3': 'Pharmacist-managed', 'available_prop_scenario4': 'Level 1b', 'available_prop_scenario5': 'CHAM', - 'available_prop_scenario6': '75th percentile facility', 'available_prop_scenario7': '90th percentile facility', 'available_prop_scenario8': 'Best facility', - 'available_prop_scenario9': 'Best facility (including DHO)','available_prop_scenario10': 'HIV supply chain', 'available_prop_scenario11': 'EPI supply chain', - 'available_prop_scenario12': 'HIV moved to Govt supply chain'} +scenario_names_dict = {'available_prop': 'Actual', 'available_prop_scenario1': 'General \n consumables', 'available_prop_scenario2': 'Vital medicines', + 'available_prop_scenario3': 'Pharmacist-\n managed', 'available_prop_scenario4': 'Level 1b', 
'available_prop_scenario5': 'CHAM', + 'available_prop_scenario6': '75th percentile\n facility', 'available_prop_scenario7': '90th percentile \n facility', 'available_prop_scenario8': 'Best \n facility', + 'available_prop_scenario9': 'Best facility \n (including DHO)','available_prop_scenario10': 'HIV supply \n chain', 'available_prop_scenario11': 'EPI supply \n chain', + 'available_prop_scenario12': 'HIV moved to \n Govt supply chain'} # recreate the chosen columns list based on the mapping above chosen_availability_columns = [scenario_names_dict[col] for col in chosen_availability_columns] df_for_plots = df_for_plots.rename(columns = scenario_names_dict) @@ -703,36 +703,96 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): plt.show() plt.close() -# Create heatmap of average availability by item_category across chosen scenarios -# Pivot the DataFrame -aggregated_df = df_for_plots.groupby(['item_category'])[chosen_availability_columns].mean().reset_index() -heatmap_data = aggregated_df.set_index('item_category') +# Create a heatmap of average availability by item_category and scenario +# Base scenario list +base_scenarios = [['Actual']] +# Additional scenarios to add iteratively +additional_scenarios = [ + ['General \n consumables', 'Vital medicines', 'Pharmacist-\n managed'], + ['75th percentile\n facility', '90th percentile \n facility', 'Best \n facility'], + ['HIV supply \n chain', 'EPI supply \n chain'] +] +# Generate iteratively chosen availability columns +iteratively_chosen_availability_columns = [ + base_scenarios[0] + sum(additional_scenarios[:i], []) for i in range(len(additional_scenarios) + 1) +] + +i = 1 +for column_list in iteratively_chosen_availability_columns: + # Create heatmap of average availability by item_category across chosen scenarios + # Pivot the DataFrame + chosen_levels = ['1a', '1b'] + aggregated_df = df_for_plots[df_for_plots.Facility_Level.isin(chosen_levels)] + aggregated_df = aggregated_df.groupby(['item_category'])[column_list].mean().reset_index() + heatmap_data = aggregated_df.set_index('item_category') -# Calculate the aggregate row and column -aggregate_col= aggregated_df[chosen_availability_columns].mean() -#overall_aggregate = aggregate_col.mean() + # Calculate the aggregate row and column + aggregate_col= aggregated_df[column_list].mean() + #overall_aggregate = aggregate_col.mean() -# Add aggregate row and column -heatmap_data['Perfect'] = 1 # Add a column representing the perfect scenario -aggregate_col['Perfect'] = 1 -heatmap_data.loc['Perfect'] = aggregate_col + # Add aggregate row and column + heatmap_data['Perfect'] = 1 # Add a column representing the perfect scenario + aggregate_col['Perfect'] = 1 + heatmap_data.loc['Average'] = aggregate_col -# Update column names for x-axis labels -# Generate the heatmap -sns.set(font_scale=1.2) -plt.figure(figsize=(10, 8)) -sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) + # Ensure all scenarios are represented for consistent column widths + all_scenarios = chosen_availability_columns + ['Perfect'] + heatmap_data = heatmap_data.reindex(columns=all_scenarios) -# Customize the plot -plt.title(f'Availability across scenarios') -plt.xlabel('Scenarios') -plt.ylabel(f'Facility Level') -plt.xticks(rotation=90) -plt.yticks(rotation=0) + # Update column names for x-axis labels + # Generate the heatmap + sns.set(font_scale=1.2) + plt.figure(figsize=(10, 8)) + sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', 
cbar_kws={'label': 'Proportion of days on which consumable is available'}) -plt.savefig(figurespath /f'consumable_availability_heatmap_bycategory_withoutDHO.png', dpi=300, bbox_inches='tight') -plt.show() -plt.close() + # Customize the plot + plt.title(f'Availability across scenarios') + plt.xlabel('Scenarios') + plt.ylabel(f'Facility Level') + plt.xticks(rotation=90) + plt.yticks(rotation=0) + + plt.savefig(figurespath /f'consumable_availability_heatmap_bycategory_iter{i}.png', dpi=300, bbox_inches='tight') + plt.close() + i = i + 1 + +# Create a barplot of average consumable availability based on the colours used in analysis_impact_of_consumable_scenarios +average_availability = df_for_plots[chosen_availability_columns].mean().reset_index() +average_availability.columns = ['scenario', 'average_availability'] +# Define color mapping for each scenario +color_mapping = { + 'Actual': '#1f77b4', + 'General \n consumables': '#ff7f0e', + 'Vital medicines': '#2ca02c', + 'Pharmacist-\n managed': '#d62728', + '75th percentile\n facility': '#9467bd', + '90th percentile \n facility': '#8c564b', + 'Best \n facility': '#e377c2', + 'Best facility \n (including DHO)': '#7f7f7f', + 'HIV supply \n chain': '#bcbd22', + 'EPI supply \n chain': '#17becf', + 'Perfect': '#31a354' +} +# Create a color list for the bars +colors = [color_mapping[scenario] for scenario in average_availability['scenario']] +# Create the bar plot and capture the bars +plt.figure(figsize=(10, 6)) +bars = plt.bar(average_availability['scenario'], average_availability['average_availability'], color=colors) +plt.title('Average Availability by Scenario') +plt.xlabel('Scenario') +plt.ylabel('Average Availability') +plt.xticks(rotation=45) +plt.ylim(0, 1) # Adjust based on your data range +plt.grid(axis='y') +# Add data labels +for bar in bars: + yval = bar.get_height() + plt.text(bar.get_x() + bar.get_width() / 2, yval + 0.02, round(yval, 2), ha='center', va='bottom') + +# Save the plot +plt.tight_layout() +plt.tight_layout() +plt.savefig(figurespath / 'scenarios_average_availability.png', dpi=300, bbox_inches='tight') ''' # Create heatmap of average availability by Facility_Level just showing scenario 12 From 05156d26904180d26df8ee3cddb4bee9da96a279 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Thu, 26 Sep 2024 17:52:44 +0530 Subject: [PATCH 085/100] Update scenarios 10-12 to apply max conditions as necessary ..and avoid instances of consumable availability falling below govt. supply chain levels when updating the constraint. 
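In miniature, the rule being applied is: take the reference supply chain's mean
availability at each facility level as a benchmark, then take a row-wise max so that
items in scope never fall below it. A toy sketch under the same column names (the
data values here are invented, not HHFA/TLO data):

    import pandas as pd

    df = pd.DataFrame({
        'Facility_Level': ['1a', '1a', '1b', '1b'],
        'item_category':  ['hiv', 'malaria', 'hiv', 'malaria'],
        'available_prop': [0.95, 0.40, 0.90, 0.55],
    })

    # Benchmark: mean availability of the HIV supply chain at each level
    hiv_benchmark = df[df.item_category == 'hiv'].groupby('Facility_Level')['available_prop'].mean()

    # Scenario 10 logic: lift non-HIV/EPI items at levels 1a/1b up to the benchmark
    df['available_prop_scenario10'] = df.apply(
        lambda row: max(row['available_prop'], hiv_benchmark[row['Facility_Level']])
        if row['Facility_Level'] in ['1a', '1b'] and row['item_category'] not in ['hiv', 'epi']
        else row['available_prop'], axis=1)

    # malaria at level 1a rises from 0.40 to 0.95; the hiv rows are unchanged

Scenario 12 applies the same pattern in reverse, using min() against the
non-vertical-programme benchmark so HIV availability falls to that level.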
--- ...rceFile_Consumables_availability_small.csv | 4 +- ...ilability_scenarios_for_impact_analysis.py | 80 +++++++++---------- 2 files changed, 40 insertions(+), 44 deletions(-) diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv index bb889f0ffd..b4a9e4726a 100644 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a50172b5823f13d291e88c6de28d9487bcf890940aabaa03031938ba4cb38a5 -size 44835162 +oid sha256:af36185b5c1abb87eae521f007467209ddb430eafca48432251e3e9afcdc3493 +size 44333025 diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index 323b6651af..b7280d679b 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -537,41 +537,40 @@ def interpolate_missing_with_mean(_ser): # 6. Generate scenarios based on the performance of vertical programs #*************************************************************************************** -cond_levels1a1b = (tlo_availability_df.Facility_Level == '1a') |(tlo_availability_df.Facility_Level == '1b') -cond_hiv = tlo_availability_df.item_category == 'hiv' -cond_epi = tlo_availability_df.item_category == 'epi' -#cond_not_hivorepi = (tlo_availability_df.item_category != 'hiv') & (tlo_availability_df.item_category != 'epi') -nonhivepi_availability_df = tlo_availability_df[(~cond_hiv & ~cond_epi) & cond_levels1a1b] -hivepi_availability_df = tlo_availability_df[(cond_hiv| cond_epi) & cond_levels1a1b] -irrelevant_levels_availability_df = tlo_availability_df[~cond_levels1a1b] - -hiv_availability_df = tlo_availability_df[cond_hiv & cond_levels1a1b].groupby(['Facility_ID', 'month', 'item_category'])['available_prop'].mean().reset_index() -hiv_availability_df = hiv_availability_df.rename(columns = {'available_prop': 'available_prop_scenario10'}) -hivepi_availability_df['available_prop_scenario10'] = hivepi_availability_df['available_prop'] -irrelevant_levels_availability_df['available_prop_scenario10'] = irrelevant_levels_availability_df['available_prop'] -minimum_scenario_varlist = ['Facility_ID', 'month', 'item_code', 'available_prop_scenario10'] -hiv_scenario_df = nonhivepi_availability_df.merge(hiv_availability_df, on = ['Facility_ID', 'month'] , how = 'left', validate = 'm:1') -hiv_scenario_df = pd.concat([hiv_scenario_df[minimum_scenario_varlist], hivepi_availability_df[minimum_scenario_varlist], irrelevant_levels_availability_df[minimum_scenario_varlist]], ignore_index = True) - -epi_availability_df = tlo_availability_df[cond_epi].groupby(['Facility_ID', 'month', 'item_category'])['available_prop'].mean().reset_index() -epi_availability_df = epi_availability_df.rename(columns = {'available_prop': 'available_prop_scenario11'}) -hivepi_availability_df['available_prop_scenario11'] = 
hivepi_availability_df['available_prop']
-irrelevant_levels_availability_df['available_prop_scenario11'] = irrelevant_levels_availability_df['available_prop']
-epi_scenario_df = nonhivepi_availability_df.merge(epi_availability_df, on = ['Facility_ID', 'month'] , how = 'left', validate = 'm:1')
-minimum_scenario_varlist = ['Facility_ID', 'month', 'item_code', 'available_prop_scenario11']
-epi_scenario_df = nonhivepi_availability_df.merge(epi_availability_df, on = ['Facility_ID', 'month'] , how = 'left', validate = 'm:1')
-epi_scenario_df = pd.concat([epi_scenario_df[minimum_scenario_varlist], hivepi_availability_df[minimum_scenario_varlist], irrelevant_levels_availability_df[minimum_scenario_varlist]], ignore_index = True)
-
-# 7. Generate a scenario to represent HIV availability falling to that of other programs
-#***************************************************************************************
-nonhivepi_availability_average = tlo_availability_df[(~cond_hiv & ~cond_epi)].groupby(['Facility_ID', 'month'])['available_prop'].mean().reset_index()
-nonhiv_availability_df = tlo_availability_df[~cond_hiv]
-non_vertical_hiv_availability_df = tlo_availability_df[cond_hiv]
-nonhivepi_availability_average = nonhivepi_availability_average.rename(columns = {'available_prop':'available_prop_scenario12'})
-nonhiv_availability_df['available_prop_scenario12'] = nonhiv_availability_df['available_prop']
-non_vertical_hiv_availability_df = non_vertical_hiv_availability_df.merge(nonhivepi_availability_average, on = ['Facility_ID', 'month'], how = 'left', validate = 'm:1')
-minimum_scenario_varlist = ['Facility_ID', 'month', 'item_code', 'available_prop_scenario12']
-non_vertical_hiv_scenario_df = pd.concat([non_vertical_hiv_availability_df[minimum_scenario_varlist], nonhiv_availability_df[minimum_scenario_varlist]], ignore_index = True)
+df_scenarios_10to12 = tlo_availability_df
+cond_levels1a1b = (df_scenarios_10to12.Facility_Level == '1a') |(df_scenarios_10to12.Facility_Level == '1b')
+cond_hiv = df_scenarios_10to12.item_category == 'hiv'
+cond_epi = df_scenarios_10to12.item_category == 'epi'
+cond_cancer = df_scenarios_10to12.item_category == 'cancer'
+
+# Calculate the average available_prop for each Facility_Level for HIV, EPI and other programs
+avg_available_prop_hiv = df_scenarios_10to12[cond_hiv & cond_levels1a1b].groupby('Facility_Level')['available_prop'].mean()
+avg_available_prop_epi = df_scenarios_10to12[cond_epi & cond_levels1a1b].groupby('Facility_Level')['available_prop'].mean()
+avg_available_prop_other = df_scenarios_10to12[~cond_epi & ~cond_hiv & ~ cond_cancer & cond_levels1a1b].groupby('Facility_Level')['available_prop'].mean()
+
+# Create new scenario variables
+df_scenarios_10to12['available_prop_scenario10'] = df_scenarios_10to12['available_prop']
+df_scenarios_10to12['available_prop_scenario11'] = df_scenarios_10to12['available_prop']
+df_scenarios_10to12['available_prop_scenario12'] = df_scenarios_10to12['available_prop']
+
+# Update scenario availability based on the average availability by level of the parallel supply chains
+df_scenarios_10to12['available_prop_scenario10'] = df_scenarios_10to12.apply(
+    lambda row: max(row['available_prop_scenario10'], avg_available_prop_hiv[row['Facility_Level']])
+    if row['Facility_Level'] in ['1a', '1b'] and row['item_category'] not in ['hiv', 'epi']
+    else row['available_prop_scenario10'], axis=1
+)
+
+df_scenarios_10to12['available_prop_scenario11'] = df_scenarios_10to12.apply(
+    lambda row: max(row['available_prop_scenario11'],
avg_available_prop_epi[row['Facility_Level']]) + if row['Facility_Level'] in ['1a', '1b'] and row['item_category'] not in ['hiv', 'epi'] + else row['available_prop_scenario11'], axis=1 +) + +df_scenarios_10to12['available_prop_scenario12'] = df_scenarios_10to12.apply( + lambda row: min(row['available_prop_scenario12'], avg_available_prop_other[row['Facility_Level']]) + if row['Facility_Level'] in ['1a', '1b'] and row['item_category'] in ['hiv'] + else row['available_prop_scenario12'], axis=1 +) # Add scenarios 6 to 11 to the original dataframe #------------------------------------------------------ @@ -581,11 +580,9 @@ def interpolate_missing_with_mean(_ser): full_df_with_scenario = df_new_scenarios6to8[old_vars + ['available_prop'] + [col for col in list_of_scenario_variables_first_stage if col != 'available_prop_scenario9']].reset_index().drop('index', axis = 1) full_df_with_scenario = full_df_with_scenario.merge(df_new_scenarios9[old_vars + ['available_prop_scenario9']], on = old_vars, how = 'left', validate = "1:1") -list_of_scenario_suffixes_second_stage = list_of_scenario_suffixes_first_stage + ['c', 'scenario11', 'scenario12'] +list_of_scenario_suffixes_second_stage = list_of_scenario_suffixes_first_stage + ['scenario10', 'scenario11', 'scenario12'] final_list_of_scenario_vars = ['available_prop_' + item for item in list_of_scenario_suffixes_second_stage] -full_df_with_scenario = full_df_with_scenario.merge(hiv_scenario_df[old_vars + ['available_prop_scenario10']], on = old_vars, how = 'left', validate = "1:1") -full_df_with_scenario = full_df_with_scenario.merge(epi_scenario_df[old_vars + ['available_prop_scenario11']], on = old_vars, how = 'left', validate = "1:1") -full_df_with_scenario = full_df_with_scenario.merge(non_vertical_hiv_scenario_df[old_vars + ['available_prop_scenario12']], on = old_vars, how = 'left', validate = "1:1") +full_df_with_scenario = full_df_with_scenario.merge(df_scenarios_10to12[old_vars + ['available_prop_scenario10', 'available_prop_scenario11', 'available_prop_scenario12']], on = old_vars, how = 'left', validate = "1:1") #full_df_with_scenario = full_df_with_scenario.merge(program_item_mapping, on = 'item_code', validate = 'm:1', how = 'left') @@ -611,7 +608,7 @@ def interpolate_missing_with_mean(_ser): # Prepare the availability dataframe for descriptive plots df_for_plots = full_df_with_scenario.merge(mfl[['Facility_ID', 'Facility_Level']], on = 'Facility_ID', how = 'left', validate = "m:1") df_for_plots = df_for_plots.merge(program_item_mapping, on = 'item_code', how = 'left', validate = "m:1") -scenario_list = [1,2,3,6,7,8,10,11] +scenario_list = [1,2,3,6,7,8,10,11,12] chosen_availability_columns = ['available_prop'] + [f'available_prop_scenario{i}' for i in scenario_list] scenario_names_dict = {'available_prop': 'Actual', 'available_prop_scenario1': 'General \n consumables', 'available_prop_scenario2': 'Vital medicines', @@ -670,7 +667,6 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): plt.yticks(rotation=0) plt.savefig(figurespath /f'consumable_availability_heatmap_{level}.png', dpi=300, bbox_inches='tight') - plt.show() plt.close() # Create heatmap of average availability by Facility_Level across chosen scenarios @@ -710,7 +706,7 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): additional_scenarios = [ ['General \n consumables', 'Vital medicines', 'Pharmacist-\n managed'], ['75th percentile\n facility', '90th percentile \n facility', 'Best \n facility'], - ['HIV supply \n chain', 'EPI supply \n chain'] + 
['HIV supply \n chain', 'EPI supply \n chain', 'HIV moved to \n Govt supply chain'] ] # Generate iteratively chosen availability columns iteratively_chosen_availability_columns = [ From 52f12bf76c74e5ea72bc35d16ff2053e3c91cacd Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Thu, 26 Sep 2024 18:01:37 +0530 Subject: [PATCH 086/100] update bar plot --- ...le_availability_scenarios_for_impact_analysis.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index b7280d679b..1a773d2e41 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -611,7 +611,7 @@ def interpolate_missing_with_mean(_ser): scenario_list = [1,2,3,6,7,8,10,11,12] chosen_availability_columns = ['available_prop'] + [f'available_prop_scenario{i}' for i in scenario_list] -scenario_names_dict = {'available_prop': 'Actual', 'available_prop_scenario1': 'General \n consumables', 'available_prop_scenario2': 'Vital medicines', +scenario_names_dict = {'available_prop': 'Actual', 'available_prop_scenario1': 'Non-therapeutic \n consumables', 'available_prop_scenario2': 'Vital medicines', 'available_prop_scenario3': 'Pharmacist-\n managed', 'available_prop_scenario4': 'Level 1b', 'available_prop_scenario5': 'CHAM', 'available_prop_scenario6': '75th percentile\n facility', 'available_prop_scenario7': '90th percentile \n facility', 'available_prop_scenario8': 'Best \n facility', 'available_prop_scenario9': 'Best facility \n (including DHO)','available_prop_scenario10': 'HIV supply \n chain', 'available_prop_scenario11': 'EPI supply \n chain', @@ -620,11 +620,12 @@ def interpolate_missing_with_mean(_ser): chosen_availability_columns = [scenario_names_dict[col] for col in chosen_availability_columns] df_for_plots = df_for_plots.rename(columns = scenario_names_dict) +# Generate a bar plot of average availability under each scenario by item_category and Facility_Level def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): - df_for_line_plot = _df.groupby([_x_axis_var])[['available_prop'] + final_list_of_scenario_vars].mean() - df_for_line_plot = df_for_line_plot.reset_index().melt(id_vars=[_x_axis_var], value_vars=['available_prop'] + final_list_of_scenario_vars, + df_for_bar_plot = _df.groupby([_x_axis_var])[chosen_availability_columns].mean() + df_for_bar_plot = df_for_bar_plot.reset_index().melt(id_vars=[_x_axis_var], value_vars=chosen_availability_columns, var_name='Scenario', value_name='Value') - plot = (ggplot(df_for_line_plot.reset_index(), aes(x=_x_axis_var, y='Value', fill = 'Scenario')) + plot = (ggplot(df_for_bar_plot.reset_index(), aes(x=_x_axis_var, y='Value', fill = 'Scenario')) + geom_bar(stat='identity', position='dodge') + ylim(0, 1) + labs(title = "Probability of availability across scenarios", @@ -704,7 +705,7 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): base_scenarios = [['Actual']] # Additional scenarios to add iteratively additional_scenarios = [ - ['General \n consumables', 
'Vital medicines', 'Pharmacist-\n managed'], + ['Non-therapeutic \n consumables', 'Vital medicines', 'Pharmacist-\n managed'], ['75th percentile\n facility', '90th percentile \n facility', 'Best \n facility'], ['HIV supply \n chain', 'EPI supply \n chain', 'HIV moved to \n Govt supply chain'] ] @@ -758,7 +759,7 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): # Define color mapping for each scenario color_mapping = { 'Actual': '#1f77b4', - 'General \n consumables': '#ff7f0e', + 'Non-therapeutic \n consumables': '#ff7f0e', 'Vital medicines': '#2ca02c', 'Pharmacist-\n managed': '#d62728', '75th percentile\n facility': '#9467bd', From 280c753ec5e84ee3e878a8f287fb02dd08faee90 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Thu, 26 Sep 2024 18:13:39 +0530 Subject: [PATCH 087/100] update bar plot --- ...e_consumable_availability_scenarios_for_impact_analysis.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index 1a773d2e41..38b215cc35 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -756,6 +756,9 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): # Create a barplot of average consumable availability based on the colours used in analysis_impact_of_consumable_scenarios average_availability = df_for_plots[chosen_availability_columns].mean().reset_index() average_availability.columns = ['scenario', 'average_availability'] +new_row = pd.DataFrame([['Perfect', 1]], columns=['scenario', 'average_availability']) # new row for perfect availability +average_availability = pd.concat([average_availability, new_row], axis=0, ignore_index=True) # Concatenate the new row with the existing DataFrame + # Define color mapping for each scenario color_mapping = { 'Actual': '#1f77b4', @@ -768,6 +771,7 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): 'Best facility \n (including DHO)': '#7f7f7f', 'HIV supply \n chain': '#bcbd22', 'EPI supply \n chain': '#17becf', + 'HIV moved to \n Govt supply chain':'#ff6347', 'Perfect': '#31a354' } # Create a color list for the bars From a018c3a48beec8e57df285567e411d4b5a7a5784 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Sun, 20 Oct 2024 15:14:08 +0100 Subject: [PATCH 088/100] update scenarios 6-9 so that availability is max(original availability, new availability based on target percentile) --- ...sourceFile_Consumables_availability_small.csv | 4 ++-- ...availability_scenarios_for_impact_analysis.py | 16 +++++++++++++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv index b4a9e4726a..5616f34f21 100644 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:af36185b5c1abb87eae521f007467209ddb430eafca48432251e3e9afcdc3493 -size 44333025 +oid sha256:c1a25a157f94eae1ec5e43e18f6d8eb91f0f46fda5a25d9eced8a1c824758cad +size 44304883 diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index 38b215cc35..bf4fbd3737 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -500,6 +500,14 @@ def interpolate_missing_with_mean(_ser): df_new_1b = df[df['Facility_ID'].isin(facilities_by_level['1b'])].merge(availability_dataframes[1],on = ['item_code', 'month'], how = 'left', validate = "m:1") + +# For scenarios 6-8, replace the new availability probabilities by the max(original availability, availability at target percentile) +for scen in [6,7,8]: + df_new_1a['available_prop_scenario' + str(scen)] = df_new_1a.apply( + lambda row: max(row['available_prop_scenario' + str(scen) ], row['available_prop']), axis=1) + df_new_1b['available_prop_scenario' + str(scen)] = df_new_1b.apply( + lambda row: max(row['available_prop_scenario' + str(scen) ], row['available_prop']), axis=1) + # 75, 90 and 99th percentile availability data for level 2 df_new_2 = df[df['Facility_ID'].isin(facilities_by_level['2'])].merge(availability_dataframes[2],on = ['item_code', 'month'], how = 'left', @@ -531,7 +539,8 @@ def interpolate_missing_with_mean(_ser): df_new_otherlevels[col] = df_new_otherlevels['available_prop'] df_new_1a_scenario9[col] = df_new_1a_scenario9['available_prop_scenario8'] df_new_1b_scenario9[col] = df_new_1b_scenario9['available_prop_scenario8'] - df_new_2_scenario9[col] = df_new_2_scenario9['available_prop_scenario8'] + df_new_2_scenario9[col] = df_new_2_scenario9.apply(lambda row: max(row['available_prop_scenario8'], row['available_prop']), axis=1) + # Append the above dataframes df_new_scenarios9 = pd.concat([df_new_1a_scenario9, df_new_1b_scenario9, df_new_2_scenario9, df_new_otherlevels], ignore_index = True) @@ -586,6 +595,11 @@ def interpolate_missing_with_mean(_ser): #full_df_with_scenario = full_df_with_scenario.merge(program_item_mapping, on = 'item_code', validate = 'm:1', how = 'left') +# --- Check that the scenarios 6-11 always have higher prob of availability than baseline --- # +for scen in range(6,12): + assert sum(full_df_with_scenario['available_prop_scenario' + str(scen)] < full_df_with_scenario['available_prop']) == 0 + assert sum(full_df_with_scenario['available_prop_scenario' + str(scen)] >= full_df_with_scenario['available_prop']) == len(full_df_with_scenario) + # --- Check that the exported file has the properties required of it by the model code. 
--- # check_format_of_consumables_file(df=full_df_with_scenario, fac_ids=fac_ids) From 0f55c9cee7a0deb07017f3324342a49ff77e603c Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Sun, 20 Oct 2024 18:11:57 +0100 Subject: [PATCH 089/100] add scenarios 9-12 to test_consumables.py --- tests/test_consumables.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_consumables.py b/tests/test_consumables.py index fb11abe13c..50a9e6faf4 100644 --- a/tests/test_consumables.py +++ b/tests/test_consumables.py @@ -609,6 +609,7 @@ def test_consumables_availability_modes_that_depend_on_designations(seed): elif availability in ('default', 'scenario1', 'scenario2', 'scenario3', 'scenario4', 'scenario5', 'scenario6', 'scenario7', 'scenario8', + 'scenario9', 'scenario10', 'scenario11', 'scenario12', ): continue else: From 693518ded393dd9e8d6d4c020865aeba6b0a3721 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Fri, 15 Nov 2024 11:43:49 +0000 Subject: [PATCH 090/100] correct num_dalys plots to account for median rather than mean values --- ...analysis_impact_of_consumable_scenarios.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py index 1af57aa4fc..029810c4e1 100644 --- a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py +++ b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py @@ -57,7 +57,7 @@ ) outputspath = Path('./outputs') -figurespath = Path(outputspath / 'impact_of_consumable_scenarios_12Sep2024') +figurespath = Path(outputspath / 'impact_of_consumable_scenarios_results') figurespath.mkdir(parents=True, exist_ok=True) # create directory if it doesn't exist resourcefilepath = Path("./resources") @@ -82,8 +82,8 @@ def do_bar_plot_with_ci(_df, annotations=None, xticklabels_horizontal_and_wrappe extent of the error bar.""" yerr = np.array([ - (_df['mean'] - _df['lower']).values, - (_df['upper'] - _df['mean']).values, + (_df['median'] - _df['lower']).values, + (_df['upper'] - _df['median']).values, ]) xticks = {(i + 0.5): k for i, k in enumerate(_df.index)} @@ -91,7 +91,7 @@ def do_bar_plot_with_ci(_df, annotations=None, xticklabels_horizontal_and_wrappe # Define color mapping based on index values color_mapping = { 'Actual': '#1f77b4', - 'General consumables':'#ff7f0e', + 'Non-therapeutic consumables':'#ff7f0e', 'Vital medicines': '#2ca02c', 'Pharmacist-managed':'#d62728', '75th percentile facility':'#9467bd', @@ -104,12 +104,12 @@ def do_bar_plot_with_ci(_df, annotations=None, xticklabels_horizontal_and_wrappe } colors = [_df.index[i] in color_mapping for i in range(len(_df.index))] - color_values = [color_mapping.get(idx, 'default_color') for idx in _df.index] + color_values = [color_mapping.get(idx, '#cccccc') for idx in _df.index] fig, ax = plt.subplots() ax.bar( xticks.keys(), - _df['mean'].values, + _df['median'].values, yerr=yerr, alpha=1, color=color_values, @@ -119,7 +119,7 @@ def do_bar_plot_with_ci(_df, annotations=None, xticklabels_horizontal_and_wrappe ) if annotations: for xpos, ypos, text in zip(xticks.keys(), _df['upper'].values, annotations): - ax.text(xpos, ypos * 1.05, text, horizontalalignment='center', fontsize=11) + ax.text(xpos, ypos * 1.05, text, horizontalalignment='center', fontsize=10) ax.set_xticks(list(xticks.keys())) if not xticklabels_horizontal_and_wrapped: @@ -130,8 +130,8 @@ def do_bar_plot_with_ci(_df, annotations=None, 
xticklabels_horizontal_and_wrappe ax.set_xticklabels(wrapped_labs, fontsize=10) # Set font size for y-tick labels - ax.tick_params(axis='y', labelsize=12) - ax.tick_params(axis='x', labelsize=11) + ax.tick_params(axis='y', labelsize=10) + ax.tick_params(axis='x', labelsize=10) ax.grid(axis="y") ax.spines['top'].set_visible(False) @@ -368,7 +368,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, # 1) Extract the parameters that have varied over the set of simulations params = extract_params(results_folder) -params_dict = {'default': 'Actual', 'scenario1': 'General consumables', 'scenario2': 'Vital medicines', +params_dict = {'default': 'Actual', 'scenario1': 'Non-therapeutic consumables', 'scenario2': 'Vital medicines', 'scenario3': 'Pharmacist-managed', 'scenario4': 'Level 1b', 'scenario5': 'CHAM', 'scenario6': '75th percentile facility', 'scenario7': '90th percentile facility', 'scenario8': 'Best facility', 'scenario9': 'Best facility (including DHO)','scenario10': 'HIV supply chain','scenario11': 'EPI supply chain', @@ -405,7 +405,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, fig, ax = do_bar_plot_with_ci( (chosen_num_dalys_summarized / 1e6).clip(lower=0.0), annotations=[ - f"{round(row['mean']/1e6, 1)} \n ({round(row['lower']/1e6, 1)}-{round(row['upper']/1e6, 1)})" + f"{round(row['median']/1e6, 1)} \n ({round(row['lower']/1e6, 1)}-{round(row['upper']/1e6, 1)})" for _, row in chosen_num_dalys_summarized.clip(lower=0.0).iterrows() ], xticklabels_horizontal_and_wrapped=False, @@ -448,13 +448,13 @@ def find_difference_relative_to_comparison(_ser: pd.Series, # %% Chart of number of DALYs averted # Plot DALYS averted (with xtickabels horizontal and wrapped) -name_of_plot = f'Additional DALYs Averted vs Actual, {target_period()}' +name_of_plot = f'Health impact of improved consumable availability\n {target_period()}' chosen_num_dalys_averted = num_dalys_averted[~num_dalys_averted.index.isin(drop_scenarios)] chosen_pc_dalys_averted = pc_dalys_averted[~pc_dalys_averted.index.isin(drop_scenarios)] fig, ax = do_bar_plot_with_ci( (chosen_num_dalys_averted / 1e6).clip(lower=0.0), annotations=[ - f"{round(row['mean'], 1)} % \n ({round(row['lower'], 1)}- \n {round(row['upper'], 1)}) %" + f"{round(row['median'], 1)} % \n ({round(row['lower'], 1)}- \n {round(row['upper'], 1)}) %" for _, row in chosen_pc_dalys_averted.clip(lower=0.0).iterrows() ], xticklabels_horizontal_and_wrapped=False, @@ -464,7 +464,7 @@ def find_difference_relative_to_comparison(_ser: pd.Series, ax.set_yticks(np.arange(0, 18, 2)) ax.set_ylabel('Additional DALYS Averted \n(Millions)') fig.tight_layout() -fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '')) +fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '').replace('\n', '')) fig.show() plt.close(fig) From 9c4b4b4d76d0bf329969db2304fdfd8fea047f1e Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Fri, 15 Nov 2024 12:40:24 +0000 Subject: [PATCH 091/100] update DALYs averted plot to add heatmap of average consumable availability on top --- ...analysis_impact_of_consumable_scenarios.py | 115 ++++++++++++++++-- 1 file changed, 108 insertions(+), 7 deletions(-) diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py index 029810c4e1..88d0237ced 100644 --- a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py +++ 
b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py @@ -14,6 +14,7 @@ import numpy as np import pandas as pd from matplotlib import pyplot as plt +import matplotlib.colors as mcolors from matplotlib.ticker import FuncFormatter from collections import Counter, defaultdict import seaborn as sns @@ -140,6 +141,103 @@ def do_bar_plot_with_ci(_df, annotations=None, xticklabels_horizontal_and_wrappe return fig, ax +def do_bar_plot_with_ci_and_heatmap(_df, annotations=None, xticklabels_horizontal_and_wrapped=False, heatmap_values=None, plt_title = 'unnamed_figure'): + """Create a bar plot with CI and a heatmap above it.""" + yerr = np.array([ + (_df['median'] - _df['lower']).values, + (_df['upper'] - _df['median']).values, + ]) + + xticks = {(i + 0.5): k for i, k in enumerate(_df.index)} + + # Define color mapping based on index values + color_mapping = { + 'Actual': '#1f77b4', + 'Non-therapeutic consumables':'#ff7f0e', + 'Vital medicines': '#2ca02c', + 'Pharmacist-managed':'#d62728', + '75th percentile facility':'#9467bd', + '90th percentile facility':'#8c564b', + 'Best facility': '#e377c2', + 'Best facility (including DHO)': '#7f7f7f', + 'HIV supply chain': '#bcbd22', + 'EPI supply chain': '#17becf', + 'Perfect':'#31a354' + } + + color_values = [color_mapping.get(idx, '#cccccc') for idx in _df.index] + + # Create a figure with two axes + fig, (heatmap_ax, ax) = plt.subplots( + nrows=2, ncols=1, gridspec_kw={"height_ratios": [0.3, 2]}, figsize=(10, 7) + ) + + # Heatmap axis + if heatmap_values: + cmap = plt.cm.YlGn + norm = mcolors.Normalize(vmin=min(heatmap_values), vmax=max(heatmap_values)) + heatmap_colors = [cmap(norm(value)) for value in heatmap_values] + + heatmap_ax.bar( + xticks.keys(), + [1] * len(heatmap_values), # Constant height for heatmap bars + color=heatmap_colors, + align='center', + width=0.8 + ) + + # Add data labels to heatmap bars + for xpos, value in zip(xticks.keys(), heatmap_values): + heatmap_ax.text( + xpos, 0.5, f"{value:.2f}", color='black', ha='center', va='center', fontsize= 12, weight='bold' + ) + + heatmap_ax.set_xticks(list(xticks.keys())) + heatmap_ax.set_xticklabels([]) + heatmap_ax.set_yticks([]) + heatmap_ax.set_ylabel('Average consumable \n availability under \n each scenario \n (Baseline = 0.52)', fontsize=10, rotation=0, labelpad=20) + heatmap_ax.spines['top'].set_visible(False) + heatmap_ax.spines['right'].set_visible(False) + heatmap_ax.spines['left'].set_visible(False) + heatmap_ax.spines['bottom'].set_visible(False) + + # Bar plot axis + ax.bar( + xticks.keys(), + _df['median'].values, + yerr=yerr, + alpha=1, + color=color_values, + ecolor='black', + capsize=10 + ) + if annotations: + for xpos, ypos, text in zip(xticks.keys(), _df['upper'].values, annotations): + ax.text(xpos, ypos * 1.05, text, horizontalalignment='center', fontsize=10) + + ax.set_xticks(list(xticks.keys())) + if not xticklabels_horizontal_and_wrapped: + wrapped_labs = ["\n".join(textwrap.wrap(_lab, 20)) for _lab in xticks.values()] + ax.set_xticklabels(wrapped_labs, rotation=45, ha='right', fontsize=10) + else: + wrapped_labs = ["\n".join(textwrap.wrap(_lab, 20)) for _lab in xticks.values()] + ax.set_xticklabels(wrapped_labs, fontsize=10) + + # Set font size for y-tick labels + ax.tick_params(axis='y', labelsize=10) + ax.tick_params(axis='x', labelsize=10) + + ax.grid(axis="y") + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + + # Add global title + fig.suptitle(plt_title, fontsize=16, fontweight='bold') + + fig.tight_layout() + 
+ return fig, (heatmap_ax, ax)
+
 def get_num_dalys(_df):
     """Return total number of DALYS (Stacked) by label (total within the TARGET_PERIOD).
     Throw error if not a record for every year in the TARGET PERIOD (to guard against inadvertently using
@@ -448,20 +546,23 @@ def find_difference_relative_to_comparison(_ser: pd.Series,

 # %% Chart of number of DALYs averted

 # Plot DALYS averted (with xticklabels horizontal and wrapped)
-name_of_plot = f'Health impact of improved consumable availability\n {target_period()}'
+average_availability_under_scenarios = [0.59, 0.59, 0.6, 0.57, 0.63, 0.7, 0.79, 0.91, 1]
+name_of_plot = f'Health impact of improved consumable availability\n at level 1 health facilities, {target_period()}'
 chosen_num_dalys_averted = num_dalys_averted[~num_dalys_averted.index.isin(drop_scenarios)]
 chosen_pc_dalys_averted = pc_dalys_averted[~pc_dalys_averted.index.isin(drop_scenarios)]
-fig, ax = do_bar_plot_with_ci(
-    (chosen_num_dalys_averted / 1e6).clip(lower=0.0),
+fig, (heatmap_ax, ax) = do_bar_plot_with_ci_and_heatmap(
+    (chosen_num_dalys_averted / 1e6),
     annotations=[
         f"{round(row['median'], 1)} % \n ({round(row['lower'], 1)}- \n {round(row['upper'], 1)}) %"
-        for _, row in chosen_pc_dalys_averted.clip(lower=0.0).iterrows()
+        for _, row in chosen_pc_dalys_averted.iterrows()
     ],
     xticklabels_horizontal_and_wrapped=False,
+    heatmap_values=average_availability_under_scenarios,
+    plt_title = name_of_plot
 )
-ax.set_title(name_of_plot)
-ax.set_ylim(0, 20)
-ax.set_yticks(np.arange(0, 18, 2))
+#ax.set_title(name_of_plot)
+ax.set_ylim(0, 14)
+ax.set_yticks(np.arange(0, 14, 2))
 ax.set_ylabel('Additional DALYS Averted \n(Millions)')
 fig.tight_layout()
 fig.savefig(figurespath / name_of_plot.replace(' ', '_').replace(',', '').replace('\n', ''))

From 96ae730607bffb0c82d02e2b39d6e8216f865ec2 Mon Sep 17 00:00:00 2001
From: sakshimohan
Date: Fri, 15 Nov 2024 14:31:08 +0000
Subject: [PATCH 092/100] correct resourcefilepath

---
 .../analysis_impact_of_consumable_scenarios.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py
index 88d0237ced..5b3ee33ba7 100644
--- a/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py
+++ b/src/scripts/consumables_analyses/analysis_impact_of_consumable_scenarios.py
@@ -878,7 +878,7 @@ def get_capacity_used_by_cadre_and_level(_df):
 tlo_availability_df = tlo_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], on = ['Facility_ID'], how='left')

 # Attach programs
-program_item_mapping = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_Consumables_Item_Designations.csv')[['Item_Code', 'item_category']]
+program_item_mapping = pd.read_csv(resourcefilepath / 'healthsystem'/ 'consumables' / 'ResourceFile_Consumables_Item_Designations.csv')[['Item_Code', 'item_category']]
 program_item_mapping = program_item_mapping.rename(columns ={'Item_Code': 'item_code'})[program_item_mapping.item_category.notna()]
 tlo_availability_df = tlo_availability_df.merge(program_item_mapping,on = ['item_code'], how='left')
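A usage sketch for the do_bar_plot_with_ci_and_heatmap helper introduced in PATCH 091 above. The numbers and index labels are made up, and the helper is assumed to be in scope (in the patch it is defined at module level in the analysis script):

import pandas as pd

# Dummy DALYs-averted summary (in millions) in the ['median', 'lower', 'upper']
# format the helper expects; index labels match keys of its colour mapping.
summary = pd.DataFrame(
    {'median': [2.1, 4.8], 'lower': [1.5, 3.9], 'upper': [2.9, 5.6]},
    index=['Vital medicines', 'Perfect'],
)

fig, (heatmap_ax, ax) = do_bar_plot_with_ci_and_heatmap(
    summary,
    annotations=[f"{row['median']:.1f}" for _, row in summary.iterrows()],
    heatmap_values=[0.60, 1.00],  # average availability shown in the strip above the bars
    plt_title='Illustrative DALYs averted',
)
fig.savefig('illustrative_dalys_averted.png')

From cdaa1b2fbcecd65b7b0c1da2ec993d51068ee995 Mon Sep 17 00:00:00 2001
From: sakshimohan
Date: Wed, 15 Jan 2025 12:57:56 +0000
Subject: [PATCH 093/100] add items 1237, 1239, 2678, and 1124 to consumables
 RFs (include Fansidar and Rifapentine)

---
 .../ResourceFile_Consumables_availability_small.csv | 4 ++--
 .../consumables/ResourceFile_consumables_matched.csv | 4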
++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv index 5616f34f21..a7d56903cd 100644 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c1a25a157f94eae1ec5e43e18f6d8eb91f0f46fda5a25d9eced8a1c824758cad -size 44304883 +oid sha256:fa713b2e93717d064af3a042846e3a7c7e95c4adc3d22db0c057ce9d8fb93a95 +size 45082689 diff --git a/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv b/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv index 2193096eed..018d9fb7bc 100644 --- a/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv +++ b/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:feed2010d40e0a6f553ebae084543745ed2177640c47d278d186884346c7649a -size 122886 +oid sha256:0e60c650da1f02de667c5bfef8ecbf28c740ad4535a45e31e4f7496ee473b4c0 +size 123790 From 8eae35a2f47a8b2138014b86c56c8849e57f1927 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Wed, 15 Jan 2025 17:56:33 +0000 Subject: [PATCH 094/100] added item code 75 (Gauze) --- .../ResourceFile_Consumables_availability_small.csv | 4 ++-- .../consumables/ResourceFile_consumables_matched.csv | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv index a7d56903cd..12d8b4dfc3 100644 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa713b2e93717d064af3a042846e3a7c7e95c4adc3d22db0c057ce9d8fb93a95 -size 45082689 +oid sha256:01dda79d4ae913b782819f97fa909b86157956a04fa9a5ecc7238308e141fdc1 +size 45303490 diff --git a/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv b/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv index 018d9fb7bc..3c1b4d92a0 100644 --- a/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv +++ b/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e60c650da1f02de667c5bfef8ecbf28c740ad4535a45e31e4f7496ee473b4c0 -size 123790 +oid sha256:380de2593127b48c6a43f14be6c55bf5c4f78a609b843e4cb66b8066ef7f9399 +size 124792 From a2815275d2c0daf1106d48481fc2fef4df63589a Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Thu, 16 Jan 2025 18:18:55 +0000 Subject: [PATCH 095/100] add detailed heatmaps for actual and 75th percentile scenarios for costing paper --- ...ilability_scenarios_for_impact_analysis.py | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index 
bf4fbd3737..b8a39f3c6c 100644
--- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py
+++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py
@@ -714,6 +714,66 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename):
     plt.show()
     plt.close()

+# Create heatmap of average availability by Facility_Level and program for actual and 75th percentile (Costing paper)
+# Actual
+aggregated_df = df_for_plots.groupby(['Facility_Level', 'item_category'])['Actual'].mean().reset_index()
+heatmap_data = aggregated_df.pivot(index='item_category', columns='Facility_Level', values='Actual')
+
+# Calculate the aggregate row and column
+aggregate_col= heatmap_data.mean(axis=1)
+overall_aggregate = aggregate_col.mean()
+aggregate_row = heatmap_data.mean(axis=0)
+
+# Add aggregate row and column
+heatmap_data['Average'] = aggregate_col
+aggregate_row['Average'] = overall_aggregate
+heatmap_data.loc['Average'] = aggregate_row
+
+# Generate the heatmap
+sns.set(font_scale=1.2)
+plt.figure(figsize=(10, 8))
+sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'})
+
+# Customize the plot
+plt.xlabel('Scenarios')
+plt.ylabel(f'Facility Level')
+plt.xticks(rotation=90)
+plt.yticks(rotation=0)
+
+plt.savefig(figurespath /f'heatmap_program_and_level_actual.png', dpi=300, bbox_inches='tight')
+plt.show()
+plt.close()
+
+# 75th percentile
+aggregated_df = df_for_plots.groupby(['Facility_Level', 'item_category'])['75th percentile\n facility'].mean().reset_index()
+heatmap_data = aggregated_df.pivot(index='item_category', columns='Facility_Level', values='75th percentile\n facility')
+
+# Calculate the aggregate row and column
+aggregate_col= heatmap_data.mean(axis=1)
+overall_aggregate = aggregate_col.mean()
+aggregate_row = heatmap_data.mean(axis=0)
+
+# Add aggregate row and column
+heatmap_data['Average'] = aggregate_col
+aggregate_row['Average'] = overall_aggregate
+heatmap_data.loc['Average'] = aggregate_row
+
+# Generate the heatmap
+sns.set(font_scale=1.2)
+plt.figure(figsize=(10, 8))
+sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'})
+
+# Customize the plot
+plt.xlabel('Scenarios')
+plt.ylabel(f'Facility Level')
+plt.xticks(rotation=90)
+plt.yticks(rotation=0)
+
+plt.savefig(figurespath /f'heatmap_program_and_level_75perc.png', dpi=300, bbox_inches='tight')
+plt.show()
+plt.close()
+
+
 # Create a heatmap of average availability by item_category and scenario
 # Base scenario list
 base_scenarios = [['Actual']]
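The 'Average' row and column in the two heatmaps of PATCH 095 above are assembled by hand. A hedged alternative sketch using pandas' built-in margins (assuming `df_for_plots` with one row per facility-item observation, as in the script); a useful property of pivot_table margins is that they are averaged over the raw rows rather than over the cell means, which is the behaviour PATCH 099 below later corrects the hand-rolled version to have:

import pandas as pd

# Same layout as the manual heatmap_data construction, in a single call.
heatmap_data = pd.pivot_table(
    df_for_plots,
    values='Actual',
    index='item_category',
    columns='Facility_Level',
    aggfunc='mean',
    margins=True,           # aggregate row/column computed from the raw rows
    margins_name='Average',
)

From b90dd1332ebe96301c5c49ad7b6db0276a212721 Mon Sep 17 00:00:00 2001
From: sakshimohan
Date: Fri, 17 Jan 2025 14:32:32 +0000
Subject: [PATCH 096/100] Assume that 50% of the expenditure reported under
 'Vehicles - Fuel and Maintenance' under 'Programme management and
 administration' is for central and local administrative bodies.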
--- resources/costing/ResourceFile_Costing.xlsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resources/costing/ResourceFile_Costing.xlsx b/resources/costing/ResourceFile_Costing.xlsx index 9f06132aaa..f18d7b0401 100644 --- a/resources/costing/ResourceFile_Costing.xlsx +++ b/resources/costing/ResourceFile_Costing.xlsx @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2afa3649672e10c9741b26dc70fa0f4496af4fccfafbf8b7b70f3b90b291a4fb -size 1007463 +oid sha256:8e989e777c346ca067cc4e22b49e3733291195c3703decd4fdb96956f1edfe6f +size 4303214 From 08acc9bd45deb98142b1b53a41e846ff4235e0b5 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Fri, 17 Jan 2025 14:47:06 +0000 Subject: [PATCH 097/100] Revert "Assume that 50% of the expenditure reported under 'Vehicles - Fuel and Maintenance' under 'Programme management and administration' is for central and local administrative bodies." This reverts commit b90dd1332ebe96301c5c49ad7b6db0276a212721. --- resources/costing/ResourceFile_Costing.xlsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resources/costing/ResourceFile_Costing.xlsx b/resources/costing/ResourceFile_Costing.xlsx index f18d7b0401..9f06132aaa 100644 --- a/resources/costing/ResourceFile_Costing.xlsx +++ b/resources/costing/ResourceFile_Costing.xlsx @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e989e777c346ca067cc4e22b49e3733291195c3703decd4fdb96956f1edfe6f -size 4303214 +oid sha256:2afa3649672e10c9741b26dc70fa0f4496af4fccfafbf8b7b70f3b90b291a4fb +size 1007463 From aeee9ee593ecda259aba560f6118d00992fb161a Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Tue, 21 Jan 2025 08:55:43 +0000 Subject: [PATCH 098/100] edit axis titles --- ...nsumable_availability_scenarios_for_impact_analysis.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index b8a39f3c6c..4fc85bef00 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -735,8 +735,8 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) # Customize the plot -plt.xlabel('Scenarios') -plt.ylabel(f'Facility Level') +plt.xlabel('Facility Level') +plt.ylabel(f'Program') plt.xticks(rotation=90) plt.yticks(rotation=0) @@ -764,8 +764,8 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) # Customize the plot -plt.xlabel('Scenarios') -plt.ylabel(f'Facility Level') +plt.xlabel('Facility Level') +plt.ylabel(f'Program') plt.xticks(rotation=90) plt.yticks(rotation=0) From c1cebe81d1ec8a2ae2f1224818c9b8fcf46bb1ed Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Fri, 24 Jan 2025 18:20:02 +0000 Subject: [PATCH 099/100] update heatmap figures so that average availability across rows is 
based on the mean from the full dataframe rather than the summarised dataframe --- ...ilability_scenarios_for_impact_analysis.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index 4fc85bef00..19465dba59 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -661,7 +661,7 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): heatmap_data = aggregated_df.set_index('item_category').drop(columns = 'Facility_Level') # Calculate the aggregate row and column - aggregate_col= aggregated_df[chosen_availability_columns].mean() + aggregate_col= df_for_plots.loc[df_for_plots.Facility_Level.isin([level]), chosen_availability_columns].mean() #overall_aggregate = aggregate_col.mean() # Add aggregate row and column @@ -690,7 +690,7 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): heatmap_data = aggregated_df.set_index('Facility_Level') # Calculate the aggregate row and column -aggregate_col= aggregated_df[chosen_availability_columns].mean() +aggregate_col= df_for_plots[chosen_availability_columns].mean() #overall_aggregate = aggregate_col.mean() # Add aggregate row and column @@ -720,9 +720,9 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): heatmap_data = aggregated_df.pivot(index='item_category', columns='Facility_Level', values='Actual') # Calculate the aggregate row and column -aggregate_col= heatmap_data.mean(axis=1) -overall_aggregate = aggregate_col.mean() -aggregate_row = heatmap_data.mean(axis=0) +aggregate_col= df_for_plots.groupby('item_category')['Actual'].mean() +overall_aggregate = df_for_plots['Actual'].mean() +aggregate_row = df_for_plots.groupby('Facility_Level')['Actual'].mean() # Add aggregate row and column heatmap_data['Average'] = aggregate_col @@ -749,9 +749,10 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): heatmap_data = aggregated_df.pivot(index='item_category', columns='Facility_Level', values='75th percentile\n facility') # Calculate the aggregate row and column -aggregate_col= heatmap_data.mean(axis=1) -overall_aggregate = aggregate_col.mean() -aggregate_row = heatmap_data.mean(axis=0) +aggregate_col= df_for_plots.groupby('item_category')['75th percentile\n facility'].mean() +overall_aggregate = df_for_plots['75th percentile\n facility'].mean() +aggregate_row = df_for_plots.groupby('Facility_Level')['75th percentile\n facility'].mean() + # Add aggregate row and column heatmap_data['Average'] = aggregate_col @@ -798,7 +799,7 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): heatmap_data = aggregated_df.set_index('item_category') # Calculate the aggregate row and column - aggregate_col= aggregated_df[column_list].mean() + aggregate_col= df_for_plots.loc[df_for_plots.Facility_Level.isin(chosen_levels), column_list].mean() #overall_aggregate = aggregate_col.mean() # Add aggregate row and column From 7d15cb61f127b7ed2c367b4971f6891bf86e95ac Mon Sep 17 00:00:00 2001 From: 
sakshimohan
Date: Mon, 10 Mar 2025 11:06:40 +0000
Subject: [PATCH 100/100] add heatmaps for ROI paper

---
 ...ilability_scenarios_for_impact_analysis.py | 72 +++++++++++++++++++
 1 file changed, 72 insertions(+)

diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py
index 19465dba59..1d6a8dee84 100644
--- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py
+++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py
@@ -870,6 +870,78 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename):
 plt.tight_layout()
 plt.savefig(figurespath / 'scenarios_average_availability.png', dpi=300, bbox_inches='tight')
+
+# Create the directory if it doesn't exist
+# (a Path rather than a str, so that the "/" operator in the plt.savefig calls
+# below works; assumes pathlib.Path is already imported, as for figurespath)
+roi_plots_path = Path('./outputs/horizontal_v_vertical/roi/')
+roi_plots_path.mkdir(parents=True, exist_ok=True)
+
+# Create a combined plot of heatmaps of average availability for levels 1a and 1b under actual, 75th percentile, HIV and EPI scenarios
+# Scenario list
+scenarios_for_roi_paper = ['Actual', '75th percentile\n facility', 'HIV supply \n chain', 'EPI supply \n chain']
+# Define facility levels
+chosen_levels = ['1a', '1b']
+
+# Create a figure with subplots for each level
+fig, axes = plt.subplots(nrows=1, ncols=len(chosen_levels), figsize=(20, 8), sharex=True, sharey=True)
+# Create a single colorbar axis
+cbar_ax = fig.add_axes([.91, .3, .02, .4]) # Position of the colorbar
+
+for ax, level in zip(axes, chosen_levels):
+    # Filter data for the current facility level
+    aggregated_df = df_for_plots[df_for_plots.Facility_Level.isin([level])]
+    aggregated_df = aggregated_df.groupby(['item_category'])[scenarios_for_roi_paper].mean().reset_index()
+    heatmap_data = aggregated_df.set_index('item_category')
+
+    # Calculate the aggregate row
+    aggregate_col = df_for_plots.loc[df_for_plots.Facility_Level.isin([level]), scenarios_for_roi_paper].mean()
+    heatmap_data.loc['Average'] = aggregate_col
+
+    # Generate the heatmap on the current subplot
+    sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', ax=ax, cbar=(ax == axes[-1]), cbar_ax=(cbar_ax if ax == axes[-1] else None))
+
+    # Set labels
+    ax.set_title(f'Level {level}')
+    ax.set_xlabel('Scenarios')
+    ax.set_ylabel('Program' if ax == axes[0] else "")
+
+cbar_ax.set_ylabel('Proportion of days consumable is available')
+# Save the combined heatmap
+plt.savefig(roi_plots_path / f'combined_consumable_availability_heatmap_1a_1b.png', dpi=300, bbox_inches='tight')
+plt.close()
+
+# Create a combined plot of heatmaps of average availability for all levels under actual, 75th percentile, HIV and EPI scenarios
+chosen_levels = ['0', '1a', '1b', '2', '3']
+# Create a figure with subplots
+fig, axes = plt.subplots(nrows=1, ncols=len(chosen_levels), figsize=(20, 8), sharex=True, sharey=True)
+
+# Create a single colorbar axis
+cbar_ax = fig.add_axes([.91, .3, .02, .4]) # Position of the colorbar
+
+for ax, level in zip(axes, chosen_levels):
+    # Filter data for the current facility level
+    aggregated_df = df_for_plots[df_for_plots.Facility_Level.isin([level])]
+    aggregated_df =
aggregated_df.groupby(['item_category'])[scenarios_for_roi_paper].mean().reset_index() + heatmap_data = aggregated_df.set_index('item_category') + + # Calculate the aggregate row + aggregate_col = df_for_plots.loc[df_for_plots.Facility_Level.isin([level]), scenarios_for_roi_paper].mean() + heatmap_data.loc['Average'] = aggregate_col + + # Generate the heatmap on the current subplot + sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', ax=ax, cbar=(ax == axes[-1]), cbar_ax=(cbar_ax if ax == axes[-1] else None)) + + # Set labels + ax.set_title(f'Level {level}') + ax.set_xlabel('Scenarios') + ax.set_ylabel('Program' if ax == axes[0] else "") + +# Adjust layout +cbar_ax.set_ylabel('Proportion of days consumable is available') +# Save the combined heatmap +plt.savefig(roi_plots_path / f'combined_consumable_availability_heatmap_all_levels.png', dpi=300, bbox_inches='tight') +plt.close() + ''' # Create heatmap of average availability by Facility_Level just showing scenario 12 scenario_list = [12]
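One hedged refinement to the PATCH 100 figures (not in the patch itself): each sns.heatmap call rescales its colours to that panel's own min/max, so the single shared colourbar is only strictly valid for the last panel drawn. Pinning a common scale makes the panels and the colourbar comparable; since availability is a proportion, [0, 1] is the natural range. A drop-in replacement for the sns.heatmap call inside the loops above:

    sns.heatmap(
        heatmap_data, annot=True, cmap='RdYlGn',
        vmin=0, vmax=1,  # common colour scale across panels, so the shared colourbar applies to all
        ax=ax,
        cbar=(ax == axes[-1]),
        cbar_ax=(cbar_ax if ax == axes[-1] else None),
    )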