From 7ec23cb85bea96f5190cea220878fff7fb2ae789 Mon Sep 17 00:00:00 2001 From: Graham Stark Date: Mon, 9 Dec 2024 23:44:08 +0000 Subject: [PATCH] ladata --- src/FRSHouseholdGetter.jl | 210 ++++++++++++++++--------- src/LocalLevelCalculations.jl | 78 +-------- src/WeightingData.jl | 76 ++++++++- test/local_level_calculations_tests.jl | 29 +++- 4 files changed, 238 insertions(+), 155 deletions(-) diff --git a/src/FRSHouseholdGetter.jl b/src/FRSHouseholdGetter.jl index f7af5749..7c21a6d9 100644 --- a/src/FRSHouseholdGetter.jl +++ b/src/FRSHouseholdGetter.jl @@ -66,7 +66,7 @@ module FRSHouseholdGetter pseqs :: Vector{Int} end - struct HHWrapper + mutable struct HHWrapper hhlds :: Vector{Household{Float64}} # weight :: Vector{Float64} dimensions :: Vector{Int} @@ -96,7 +96,18 @@ module FRSHouseholdGetter zeros(0)) function backup() - BACKUP_HOUSEHOLDS.hhlds = deepcopy( MODEL_HOUSEHOLDS.hhlds ) + #= BACKUP_HOUSEHOLDS.hhlds = similar( MODEL_HOUSEHOLDS.hhlds ) + if length( BACKUP_HOUSEHOLDS.hhlds ) == 0 + for i in eachindex( MODEL_HOUSEHOLDS.hhlds ) + push!(BACKUP_HOUSEHOLDS.hhlds, deepcopy( MODEL_HOUSEHOLDS.hhlds[i])) + end + else + for i in eachindex( MODEL_HOUSEHOLDS.hhlds ) + BACKUP_HOUSEHOLDS.hhlds[i] = deepcopy( MODEL_HOUSEHOLDS.hhlds[i]) + end + end + =# + BACKUP_HOUSEHOLDS.hhlds = deepcopy(MODEL_HOUSEHOLDS.hhlds) BACKUP_HOUSEHOLDS.dimensions = deepcopy( MODEL_HOUSEHOLDS.dimensions ) BACKUP_HOUSEHOLDS.hh_map = deepcopy( MODEL_HOUSEHOLDS.hh_map ) BACKUP_HOUSEHOLDS.pers_map = deepcopy( MODEL_HOUSEHOLDS.pers_map ) @@ -105,10 +116,15 @@ module FRSHouseholdGetter end function restore() - MODEL_HOUSEHOLDS.hhlds = deepcopy( BACKUP_HOUSEHOLDS.hhlds ) + #= + for i in eachindex( BACKUP_HOUSEHOLDS.hhlds ) + MODEL_HOUSEHOLDS.hhlds[i] = deepcopy( BACKUP_HOUSEHOLDS.hhlds[i]) + end + =# + MODEL_HOUSEHOLDS.hhlds = deepcopy( BACKUP_HOUSEHOLDS.hhlds ) MODEL_HOUSEHOLDS.dimensions = deepcopy( BACKUP_HOUSEHOLDS.dimensions ) - MODEL_HOUSEHOLDS.hh_map = deepcopy( BACKUP_HOUSEHOLDS.hh_map ) - MODEL_HOUSEHOLDS.pers_map = deepcopy( BACKUP_HOUSEHOLDS.pers_map ) + MODEL_HOUSEHOLDS.hh_map = deepcopy( BACKUP_HOUSEHOLDS.hh_map ) + MODEL_HOUSEHOLDS.pers_map = deepcopy( BACKUP_HOUSEHOLDS.pers_map ) MODEL_HOUSEHOLDS.data_years = deepcopy( BACKUP_HOUSEHOLDS.data_years ) MODEL_HOUSEHOLDS.interview_years = deepcopy( BACKUP_HOUSEHOLDS.interview_years ) end @@ -193,19 +209,126 @@ module FRSHouseholdGetter return size(sk)[1] == 0 # not in skiplist end - function set_local_weights_and_incomes( settings::Settings, reset::Bool) + + @with_kw mutable struct PopAndWage + popn=zeros(100_000) + wage=zeros(100_000) + mean = 0.0 + median = 0.0 + ratio = 1.0 + sdev = 0.0 + n=0 + end + + function addone!( pw :: PopAndWage, wage::Real, weight::Real ) + pw.n += 1 + pw.popn[pw.n] = weight + pw.wage[pw.n] = wage + end + + function summarise!( pw :: PopAndWage; + sex :: Union{Sex,Nothing}, + is_ft :: Union{Bool,Nothing}, + ccode :: Symbol ) + if pw.n > 0 + pw.wage = pw.wage[1:pw.n] + pw.popn = pw.popn[1:pw.n] + pw.mean = mean( pw.wage, Weights(pw.popn)) + pw.median = median( pw.wage, Weights(pw.popn)) + pw.sdev = std( pw.wage, Weights(pw.popn)) + pw.wage = zeros(0) + pw.popn = zeros(0) + wd = WeightingData.get_earnings_data(; sex = sex, is_ft = is_ft, council=ccode ) + @show pw.mean + @show wd.Mean + if ! ismissing( wd.Mean ) + pw.ratio = wd.Mean/pw.mean + end + @show pw.ratio + end + end + + """ + Make dictionary of wages from the model to match to the LA NOMIS data from WeightingData.jl+ + and use this to adjust average wages to match the ratio between the two. + """ + function fixup_earnings_data!( + settings :: Settings )::Dict + popns = Dict{String,PopAndWage}() + for sex in [Male,Female,nothing] + for is_ft in [true,false,nothing] + key = WeightingData.make_wage_key(; sex=sex, is_ft=is_ft) + popns[key] = PopAndWage() + end + end + # @show settings.num_households + for hno in 1:settings.num_households + hh = get_household(hno) + for (pid,ad) in hh.people + # @show ad + if ad.employment_status in [Full_time_Employee, + Part_time_Employee] + key = WeightingData.make_wage_key(; sex=ad.sex, is_ft=ad.employment_status == Full_time_Employee ) + addone!( popns[key], ad.income[wages], hh.weight ) + allsexk = WeightingData.make_wage_key(; is_ft=ad.employment_status == Full_time_Employee ) + addone!( popns[allsexk], ad.income[wages], hh.weight ) + allk = WeightingData.make_wage_key(; ) + addone!( popns[allk], ad.income[wages], hh.weight ) + # full time workers both sexes + allftk = WeightingData.make_wage_key(; sex = ad.sex ) + addone!( popns[allftk], ad.income[wages], hh.weight ) + end + end + end + for sex in [Male,Female] + for is_ft in [true,false] + key = WeightingData.make_wage_key(; sex=sex, is_ft=is_ft) + summarise!(popns[key]; sex = sex, is_ft = is_ft, ccode=settings.ccode ) + println( "summarising $key ") + end + end + for hno in 1:settings.num_households + hh = get_household(hno) + for (pid,ad) in hh.people + ratio = 1.0 + if ad.employment_status in [Full_time_Employee, + Full_time_Self_Employed] + key = WeightingData.make_wage_key(; sex=ad.sex, is_ft=true ) + ratio = popns[key].ratio + elseif ad.employment_status in [Part_time_Employee, + Part_time_Self_Employed] + key = WeightingData.make_wage_key(; sex=ad.sex, is_ft=false ) + ratio = popns[key].ratio + end + if haskey( ad.income, wages ) + # @show ad.income[wages] + # @show ratio + ad.income[ wages ] *= ratio + end + if haskey( ad.income, self_employment_income ) + ad.income[ self_employment_income ] *= ratio + end + end + end + return popns + end + + function set_local_weights_and_incomes!( settings::Settings; reset::Bool) # wsizes = WeightingData.init_local_weights( settings; reset=reset ) for hno in eachindex(MODEL_HOUSEHOLDS.hhlds) MODEL_HOUSEHOLDS.hhlds[hno].council = settings.ccode # ?? FIXME fixup nhs area?? WeightingData.set_weight!( MODEL_HOUSEHOLDS.hhlds[hno], settings ) end + ratios = fixup_earnings_data!( settings ) + # .. something to summarise the ratios if very large end """ Initialise the dataset. If this has already been done, do nothing unless `reset` is true. - return (number of households available, num people loaded inc. kids, num hhls in dataset (should always = item[1])) + return (number of households available, num people loaded inc. kids, num hhls in dataset (should always = item[1]), and a valuw + that's nothing if not a local run, else the funny ratios dict) """ function initialise( settings :: Settings; @@ -286,17 +409,17 @@ module FRSHouseholdGetter REG_DATA = create_regression_dataframe( hh_dataset, people_dataset ) # Save a copy of the dataset before we maybe mess with council weights # and incomes. - backup() # weights: national or local - if settings.do_local_run && (settings.ccode != :"") - set_local_weights_and_incomes( settings, reset ) - else - WeightingData.init_national_weights( settings, reset=reset ) - for hno in eachindex(MODEL_HOUSEHOLDS.hhlds) - WeightingData.set_weight!( MODEL_HOUSEHOLDS.hhlds[hno], settings ) - end + lsummary = nothing + # if settings.do_local_run && (settings.ccode != :"") + # lsummary = set_local_weights_and_incomes!( settings, reset ) + # else + WeightingData.init_national_weights( settings, reset=reset ) + for hno in eachindex(MODEL_HOUSEHOLDS.hhlds) + WeightingData.set_weight!( MODEL_HOUSEHOLDS.hhlds[hno], settings ) end fill_in_deciles!(settings) + backup() return (MODEL_HOUSEHOLDS.dimensions...,) end @@ -320,63 +443,6 @@ module FRSHouseholdGetter function get_regression_dataset()::DataFrame return REG_DATA end - - @with_kw mutable struct PopAndWage - popn=zeros(100_000) - wage=zeros(100_000) - n=0 - end - - function addone!( pw :: PopAndWage, wage::Real, weight::Real ) - pw.n += 1 - pw.popn[pw.n] = weight - pw.wage[pw.n] = wage - end - - function truncate!( pw :: PopAndWage ) - if pw.n > 0 - pw.wage = pw.wage[1:pw.n] - pw.popn = pw.popn[1:pw.n] - end - end - - """ - A dictionary of wages from the model to match to the LA NOMIS data from WeightingData.jl+ - """ - function get_mean_earnings( - settings :: Settings )::Dict - popns = Dict{String,PopAndWage}() - for sex in [Male,Female,nothing] - for is_ft in [true,false,nothing] - key = WeightingData.make_wage_key(; sex=sex, is_ft=is_ft) - popns[key] = PopAndWage() - end - end - for hno in 1:settings.num_households - hh = get_household(hno) - for (pid,ad) in hh.people - if ad.employment_status in [Full_time_Employee, - Part_time_Employee] - key = WeightingData.make_wage_key(; sex=ad.sex, is_ft=ad.employment_status == Full_time_Employee ) - addone!( popns[key], ad.income[wages], hh.weight ) - allsexk = WeightingData.make_wage_key(; is_ft=ad.employment_status == Full_time_Employee ) - addone!( popns[allsexk], ad.income[wages], hh.weight ) - allk = WeightingData.make_wage_key(; ) - addone!( popns[allk], ad.income[wages], hh.weight ) - # full time workers both sexes - allftk = WeightingData.make_wage_key(; sex = ad.sex ) - addone!( popns[allftk], ad.income[wages], hh.weight ) - end - end - end - for sex in [Male,Female,nothing] - for is_ft in [true,false,nothing] - key = WeightingData.make_wage_key(; sex=sex, is_ft=is_ft) - truncate!(popns[key]) - end - end - return popns - end """ A vector of the data years in the actual data e.g.2014,2020 .. diff --git a/src/LocalLevelCalculations.jl b/src/LocalLevelCalculations.jl index 3f67c5c2..b2687117 100644 --- a/src/LocalLevelCalculations.jl +++ b/src/LocalLevelCalculations.jl @@ -14,6 +14,7 @@ using LazyArtifacts using ScottishTaxBenefitModel using .Definitions +using .WeightingData using .ModelHousehold: BenefitUnit, @@ -38,9 +39,6 @@ using .Results: export LA_BRMA_MAP, - LA_CODES, - LA_NAMES, - LA_NAMES_TO_CCODES, apply_rent_restrictions, apply_size_criteria, calc_council_tax, @@ -48,80 +46,6 @@ export lookup, make_la_to_brma_map - # la codes in order - const LA_CODES = [ - :S12000033, - :S12000034, - :S12000041, - :S12000035, - :S12000036, - :S12000005, - :S12000006, - :S12000042, - :S12000008, - :S12000045, - :S12000010, - :S12000011, - :S12000014, - :S12000047, - :S12000049, - :S12000017, - :S12000018, - :S12000019, - :S12000020, - :S12000013, - :S12000021, - :S12000050, - :S12000023, - :S12000048, - :S12000038, - :S12000026, - :S12000027, - :S12000028, - :S12000029, - :S12000030, - :S12000039, - :S12000040 - ] - - const LA_NAMES = Dict( - :S12000033 => "Aberdeen City", - :S12000034 => "Aberdeenshire", - :S12000041 => "Angus", - :S12000035 => "Argyll and Bute", - :S12000036 => "City of Edinburgh", - :S12000005 => "Clackmannanshire", - :S12000006 => "Dumfries and Galloway", - :S12000042 => "Dundee City", - :S12000008 => "East Ayrshire", - :S12000045 => "East Dunbartonshire", - :S12000010 => "East Lothian", - :S12000011 => "East Renfrewshire", - :S12000014 => "Falkirk", - :S12000047 => "Fife", - :S12000049 => "Glasgow City", - :S12000017 => "Highland", - :S12000018 => "Inverclyde", - :S12000019 => "Midlothian", - :S12000020 => "Moray", - :S12000013 => "Na h-Eileanan Siar", - :S12000021 => "North Ayrshire", - :S12000050 => "North Lanarkshire", - :S12000023 => "Orkney Islands", - :S12000048 => "Perth and Kinross", - :S12000038 => "Renfrewshire", - :S12000026 => "Scottish Borders", - :S12000027 => "Shetland Islands", - :S12000028 => "South Ayrshire", - :S12000029 => "South Lanarkshire", - :S12000030 => "Stirling", - :S12000039 => "West Dunbartonshire", - :S12000040 => "West Lothian" - ) - # reverse lookup - const LA_NAMES_TO_CCODES = Dict( values(LA_NAMES) .=> keys(LA_NAMES)) - - # FIXME hard code this in function make_la_to_brma_map() lacsv = CSV.File( joinpath( artifact"augdata", "la_to_brma_approx_mappings.csv" )) |> DataFrame diff --git a/src/WeightingData.jl b/src/WeightingData.jl index 4cd7d57f..5a501332 100644 --- a/src/WeightingData.jl +++ b/src/WeightingData.jl @@ -15,12 +15,83 @@ using .ModelHousehold using .Definitions using .Weighting -export init, get_weight +export init, get_weight,LA_CODES,LA_NAMES mutable struct WS weights::DataFrame end +# la codes in order +const LA_CODES = [ + :S12000033, + :S12000034, + :S12000041, + :S12000035, + :S12000036, + :S12000005, + :S12000006, + :S12000042, + :S12000008, + :S12000045, + :S12000010, + :S12000011, + :S12000014, + :S12000047, + :S12000049, + :S12000017, + :S12000018, + :S12000019, + :S12000020, + :S12000013, + :S12000021, + :S12000050, + :S12000023, + :S12000048, + :S12000038, + :S12000026, + :S12000027, + :S12000028, + :S12000029, + :S12000030, + :S12000039, + :S12000040] + +const LA_NAMES = Dict( + :S12000033 => "Aberdeen City", + :S12000034 => "Aberdeenshire", + :S12000041 => "Angus", + :S12000035 => "Argyll and Bute", + :S12000036 => "City of Edinburgh", + :S12000005 => "Clackmannanshire", + :S12000006 => "Dumfries and Galloway", + :S12000042 => "Dundee City", + :S12000008 => "East Ayrshire", + :S12000045 => "East Dunbartonshire", + :S12000010 => "East Lothian", + :S12000011 => "East Renfrewshire", + :S12000014 => "Falkirk", + :S12000047 => "Fife", + :S12000049 => "Glasgow City", + :S12000017 => "Highland", + :S12000018 => "Inverclyde", + :S12000019 => "Midlothian", + :S12000020 => "Moray", + :S12000013 => "Na h-Eileanan Siar", + :S12000021 => "North Ayrshire", + :S12000050 => "North Lanarkshire", + :S12000023 => "Orkney Islands", + :S12000048 => "Perth and Kinross", + :S12000038 => "Renfrewshire", + :S12000026 => "Scottish Borders", + :S12000027 => "Shetland Islands", + :S12000028 => "South Ayrshire", + :S12000029 => "South Lanarkshire", + :S12000030 => "Stirling", + :S12000039 => "West Dunbartonshire", + :S12000040 => "West Lothian") + # reverse lookup +const LA_NAMES_TO_CCODES = Dict( values(LA_NAMES) .=> keys(LA_NAMES)) + const WEIGHTS = WS(DataFrame()) const WEIGHTS_LA = WS(DataFrame()) const NULL_CC = :"" @@ -138,7 +209,6 @@ function get_earnings_data(; sex :: Union{Sex,Nothing}=nothing, is_ft :: Union{Bool,Nothing}=nothing, paytype = "Weekly pay - gross", - # field :: Symbol = :Mean, council :: Symbol )::DataFrameRow if length( NOMIS_WAGE_DATA ) == 0 load_wage_data() @@ -189,7 +259,7 @@ function set_weight!( hh :: Household, settings::Settings ) end hno = findfirst( (WEIGHTS_LA.weights.hid .== hh.hid).&(WEIGHTS_LA.weights.data_year.==hh.data_year)) hh.weight = WEIGHTS_LA.weights[hno,hh.council] - elseif scottish=settings.target_nation == N_Scotland #FIXME parameterise this + elseif settings.target_nation == N_Scotland #FIXME parameterise this if size( WEIGHTS.weights )==(0,0) init_national_weights( settings, reset=true ) end diff --git a/test/local_level_calculations_tests.jl b/test/local_level_calculations_tests.jl index a3e4ee3f..da522991 100644 --- a/test/local_level_calculations_tests.jl +++ b/test/local_level_calculations_tests.jl @@ -10,12 +10,14 @@ using .FRSHouseholdGetter using .GeneralTaxComponents: WEEKS_PER_YEAR using .RunSettings: Settings using .LocalLevelCalculations: apply_size_criteria, apply_rent_restrictions, - make_la_to_brma_map, LA_BRMA_MAP, lookup, apply_rent_restrictions, calc_council_tax, - LA_NAMES, LA_CODES + make_la_to_brma_map, LA_BRMA_MAP, lookup, apply_rent_restrictions, calc_council_tax + +using .WeightingData: LA_NAMES, LA_CODES using .STBParameters using .Intermediate: make_intermediate, MTIntermediate using .ExampleHelpers +using CSV,DataFrames ## FIXME don't need both lmt = LegacyMeansTestedBenefitSystem{Float64}() @@ -23,7 +25,7 @@ sys = get_system( year=2019, scotland=true ) settings = Settings() rc = @timed begin - num_households,total_num_people,nhh2 = FRSHouseholdGetter.initialise( Settings() ) + settings.num_households,settings.num_people,nhh2 = FRSHouseholdGetter.initialise( Settings(), reset=true ) end @@ -416,4 +418,25 @@ end ct = calc_council_tax( hh, intermed.hhint, sys.loctax.ct ) @test hh.ct_band == Band_B @test ct ≈ 1_078.00/WEEKS_PER_YEAR # glasgow 2020/1 CT band b per week +end + +@testset "Local reweighing" begin + n = length(LA_CODES) + # settings = Settings() + settings.do_local_run = true + + # rc = @timed begin + # settings.num_households,settings.total_num_people,nhh2 = FRSHouseholdGetter.initialise( + # settings, reset=true ) + # end + for i in 1:n + reset = i==1 + settings.ccode = LA_CODES[i] + FRSHouseholdGetter.restore() + @show settings.ccode + dict = FRSHouseholdGetter.set_local_weights_and_incomes!( settings, reset=false ) + # println(dict) + + i+= 1 + end end \ No newline at end of file