From 35dd3e3097168dff8f3c39e357871a0273916a82 Mon Sep 17 00:00:00 2001
From: Graham Stark
Date: Thu, 7 Sep 2023 16:32:37 +0100
Subject: [PATCH] Start on updating Example code for VAT and Wealth

---
Review note: the hunks below were edited by hand after the commit was made, so
the post-image blob ids on the `index` lines are stale, and context-line
indentation was reconstructed; use `git apply --ignore-whitespace` if needed.

 matching/lcf_frs_matching.jl | 108 ++++++++++++++++++++++++++++++++++-
 src/ConsumptionData.jl       |  18 ++++--
 src/Definitions.jl           |  30 ++++++++++
 3 files changed, 150 insertions(+), 6 deletions(-)

diff --git a/matching/lcf_frs_matching.jl b/matching/lcf_frs_matching.jl
index 1ffb4d3a..a24d5101 100644
--- a/matching/lcf_frs_matching.jl
+++ b/matching/lcf_frs_matching.jl
@@ -78,6 +78,11 @@ function make_lfs_subset( lfs :: DataFrame ) :: DataFrame
         datayear = lcf.datayear,
         month = lcf.a055,
         year= lcf.year,
+        a121 = lcf.a121,
+        gorx = lcf.gorx,
+        a065p = lcf.a065p,
+        a062 = lcf.a062,
+
         any_wages = lcf.any_wages,
         any_pension_income = lcf.any_pension_income,
         any_selfemp = lcf.any_selfemp,
@@ -1487,6 +1492,40 @@ function frs_age_hrp( hhagegr4 :: Int ) :: Vector{Int}
     out
 end
 
+"""
+Map an age in years to a 5-year band number: 1 for under 20, 2 for 20-24,
+and so on up to 12 for 70-74 and 13 for 75 or over.
+"""
+function model_age_hrp( age :: Int )
+    return if age < 20
+        1
+    elseif age < 25
+        2
+    elseif age < 30
+        3
+    elseif age < 35
+        4
+    elseif age < 40
+        5
+    elseif age < 45
+        6
+    elseif age < 50
+        7
+    elseif age < 55
+        8
+    elseif age < 60
+        9
+    elseif age < 65
+        10
+    elseif age < 70
+        11
+    elseif age < 75
+        12
+    else
+        13
+    end
+end
+
 #=
 Value = 3.0 Label = 15 but under 20 yrs
 Value = 4.0 Label = 20 but under 25 yrs
@@ -1597,6 +1636,62 @@ function frs_lcf_match_row( frs :: DataFrameRow, lcf :: DataFrameRow ) :: Tuple
     return t,incdiff
 end
 
+"""
+Score how closely the LCF row `lcf` matches the example household `hh`.
+Returns `(t, incdiff)`: `t` is the accumulated score, including `10.0*incdiff`,
+and `incdiff` is the result of `compare_income` on the two incomes.
+"""
+function example_lcf_match( hh :: Household, lcf :: DataFrameRow ) :: Tuple
+    t = 0.0
+    hrp = get_head( hh )
+    t += score( lcf_tenuremap( lcf.a121 ), model_tenuremap( hh.tenure ))
+    # NOTE(review): was the undefined name `model_region`; assumes `Household` has a `region` field - confirm.
+    t += score( lcf_regionmap( lcf.gorx ), model_regionmap( hh.region ))
+    # !!! both next missing in 2020 LCF
+    # t += score( lcf_accmap( lcf.a116 ), frs_accmap( frs.typeacc ))
+    # t += score( rooms( lcf.a111p, 998 ), rooms( frs.bedroom6, 999 ))
+    # NOTE(review): was `frs_age_hrp( frs.hhagegr4 )`, but there is no `frs` row in this function;
+    # confirm that `score` accepts what `model_age_hrp` returns.
+    t += score( lcf_age_hrp( lcf.a065p ), model_age_hrp( hrp.age ))
+    # FIXME: `frs` is not defined in this function; needs a household-based composition map before re-enabling.
+    # t += score( lcf_composition_map( lcf.a062 ), frs_composition_map( frs.hhcomps ))
+    any_wages = false
+    any_selfemp = false
+    any_pension_income = false
+    has_female_adult = false
+    income = 0.0
+    for (pid,pers) in hh.people
+        if get(pers.income,wages,0) > 0
+            any_wages = true
+        end
+        if get(pers.income,self_employment_income,0) > 0
+            any_selfemp = true
+        end
+        if (get(pers.income,private_pensions,0) > 0) || pers.age >= 66
+            any_pension_income = true
+        end
+        if (! pers.is_standard_child) && (pers.sex == Female )
+            has_female_adult = true
+        end
+        income += sum( pers.income, start=wages, stop=alimony_and_child_support_received ) # FIXME
+    end
+    t += lcf.any_wages == any_wages ? 1 : 0
+    t += lcf.any_pension_income == any_pension_income ? 1 : 0
+    t += lcf.any_selfemp == any_selfemp ? 1 : 0
+    # parenthesised on purpose: `a == b == c` is a chained comparison, i.e. `(a == b) && (b == c)`
+    t += lcf.hrp_unemployed == (hrp.employment_status == Unemployed) ? 1 : 0
+    t += lcf.hrp_non_white == (hrp.ethnic_group != White) ? 1 : 0
+    # t += lcf.datayear == frs.datayear ? 0.5 : 0 # - a little on same year FIXME use date range
+    # t += lcf.any_disabled == frs.any_disabled ? 1 : 0 -- not possible in LCF??
+    t += Int(lcf.has_female_adult) == Int(has_female_adult) ? 1 : 0
+    t += score( lcf.num_children, num_children(hh) )
+    t += score( lcf.num_people, num_people(hh) )
+    # fixme should we include this at all?
+    incdiff = compare_income( lcf.income, income )
+    t += 10.0*incdiff
+    return t,incdiff
+end
+
 islessscore( l1::LCFLocation, l2::LCFLocation ) = l1.score < l2.score
 islessincdiff( l1::LCFLocation, l2::LCFLocation ) = l1.incdiff < l2.incdiff
 
@@ -1604,7 +1699,7 @@ islessincdiff( l1::LCFLocation, l2::LCFLocation ) = l1.incdiff < l2.incdiff
 Match one row in the FRS (recip) with all possible lcf matches (donor). Intended to be general but isn't really any more.
 FIXME: pass in a saving function so we're not tied to case/datayear.
 """
-function match_recip_row( recip :: DataFrameRow, donor :: DataFrame, matcher :: Function ) :: Vector{LCFLocation}
+function match_recip_row( recip, donor :: DataFrame, matcher :: Function ) :: Vector{LCFLocation}
     drows, dcols = size(donor)
     i = 0
     similar = Vector{LCFLocation}( undef, drows )
@@ -1620,6 +1715,8 @@ function match_recip_row( recip :: DataFrameRow, donor :: DataFrame, matcher ::
     return similar
 end
 
+
+
 """
 Create a dataframe for storing all the matches.
 This has the FRS record and then 20 lcf records, with case,year,income and matching score for each.
@@ -1673,6 +1770,15 @@ function map_all( recip :: DataFrame, donor :: DataFrame, matcher :: Function ):
     return df
 end
 
+"""
+Match the single household `example` against the rows of `donor` using `matcher`
+and return the first element of the vector that `match_recip_row` produces.
+"""
+function map_example( example :: Household, donor :: DataFrame, matcher::Function )::LCFLocation
+    matches = match_recip_row( example, donor, matcher )
+    return matches[1]
+end
+
 """
 print out our lcf and frs records
 """
diff --git a/src/ConsumptionData.jl b/src/ConsumptionData.jl
index c0060f7c..32c7463f 100644
--- a/src/ConsumptionData.jl
+++ b/src/ConsumptionData.jl
@@ -165,6 +165,18 @@ end
 
 const DEFAULT_STANDARD_RATE = default_standard_rate()
 
+"""
+Set `hh.expenditure` and `hh.factor_costs` to the first rows of `EXPENDITURE_DATASET`
+and `FACTOR_COST_DATASET` whose `case` and `datayear` columns equal the arguments.
+"""
+function find_consumption_for_hh!( hh :: Household, case :: Int, datayear :: Int )
+    # println( "find_consumption_for_hh! matching to case $case datayear $datayear")
+    hh.expenditure = EXPENDITURE_DATASET[(EXPENDITURE_DATASET.case .== case).&(EXPENDITURE_DATASET.datayear.==datayear),:][1,:]
+    hh.factor_costs = FACTOR_COST_DATASET[(FACTOR_COST_DATASET.case .== case).&(FACTOR_COST_DATASET.datayear.==datayear),:][1,:]
+    @assert ! isnothing( hh.expenditure )
+    @assert ! isnothing( hh.factor_costs )
+end
+
 """
 Match in the lcf data using the lookup table constructed in 'matching/lcf_frs_matching.jl'
 'which' best, 2nd best etc match (<=20)
@@ -177,11 +189,7 @@ function find_consumption_for_hh!( hh :: Household, settings :: Settings, which
     lcf_datayear_sym = Symbol( "lcf_datayear_$(which)")
     case = match[lcf_case_sym]
     datayear = match[lcf_datayear_sym]
-    # println( "find_consumption_for_hh! matching to case $case datayear $datayear")
-    hh.expenditure = EXPENDITURE_DATASET[(EXPENDITURE_DATASET.case .== case).&(EXPENDITURE_DATASET.datayear.==datayear),:][1,:]
-    hh.factor_costs = FACTOR_COST_DATASET[(FACTOR_COST_DATASET.case .== case).&(FACTOR_COST_DATASET.datayear.==datayear),:][1,:]
-    @assert ! isnothing( hh.expenditure )
-    @assert ! isnothing( hh.factor_costs )
+    find_consumption_for_hh!( hh, case, datayear )
 end
 
 # FIXME FIXME CHAOTIC EVIL this is the diff between actual 157bn and crude modelled VAT receipts of 102mb. 2022
diff --git a/src/Definitions.jl b/src/Definitions.jl
index fe58c140..866337dc 100644
--- a/src/Definitions.jl
+++ b/src/Definitions.jl
@@ -6,6 +6,7 @@ using ScottishTaxBenefitModel
 using ScottishTaxBenefitModel.Utils
 using Parameters
 using JSON3
+import Base.sum
 
 export
     Employment_Status, # mapped from empstat
@@ -1228,6 +1229,35 @@ end
 
 Incomes_Dict = Dict{Incomes_Type,T} where T<:Real
 Incomes_Set = Set{Incomes_Type}
+
+export sum
+
+"""
+Sum the values of `i` for the keys that are also in `which`; `zero(T)` if there are none.
+"""
+function Base.sum( i :: Incomes_Dict{T}, which :: Incomes_Set ) :: T where T <: Real
+    total = zero(T)
+    for k in intersect( which, keys(i))
+        total += i[k]
+    end
+    return total
+end
+
+"""
+The set of all `Incomes_Type` values `k` with `start <= k <= stop`.
+"""
+function ran( start :: Incomes_Type, stop :: Incomes_Type ) :: Incomes_Set
+    return Incomes_Set( k for k in instances(Incomes_Type) if start <= k <= stop )
+end
+
+"""
+Sum the values of `i` whose keys lie between `start` and `stop`, inclusive.
+"""
+function Base.sum( i :: Incomes_Dict{T}; start :: Incomes_Type, stop :: Incomes_Type ) :: T where T <: Real
+    return sum( i, ran( start, stop ))
+end
+
+
 const Expenses = Incomes_Set([
     permanent_health_insurance,
     health_insurance,