Start on updating Example code for VAT and Wealth

grahamstark · Sep 7, 2023 · 35dd3e3 · 35dd3e3
1 parent e0337ac
commit 35dd3e3
Show file tree

Hide file tree

Showing 4 changed files with 138 additions and 7 deletions.
diff --git a/matching/lcf_frs_matching.jl b/matching/lcf_frs_matching.jl
@@ -78,6 +78,11 @@ function make_lfs_subset( lfs :: DataFrame ) :: DataFrame
         datayear = lcf.datayear, 
         month = lcf.a055, 
         year= lcf.year,
+        a121 = lcf.a121,
+        gorx = lcf.gorx,
+        a065p  = lcf.a065p,
+        a062 = lcf.a062,
+
         any_wages = lcf.any_wages,
         any_pension_income = lcf.any_pension_income,
         any_selfemp = lcf.any_selfemp,
@@ -1487,6 +1492,36 @@ function frs_age_hrp( hhagegr4 :: Int ) :: Vector{Int}
     out
 end
 
+"""
+Map an age in years to a five-year age-band code: 1 for ages under 20, 2 for 20-24,
+and so on in five-year steps up to 12 for 70-74, with 13 for ages of 75 and over.
+"""
+function model_age_hrp( age :: Int )
+    # Lower bounds of bands 2 to 13. The number of bounds at or below `age`
+    # is how many bands to step up from band 1.
+    lower_bounds = 20:5:75
+    return searchsortedlast( lower_bounds, age ) + 1
+end
+
 #=
 	Value = 3.0	Label =  15 but under 20 yrs
 	Value = 4.0	Label =  20 but under 25 yrs
@@ -1597,14 +1632,62 @@ function frs_lcf_match_row( frs :: DataFrameRow, lcf :: DataFrameRow ) :: Tuple
     return t,incdiff
 end
 
+"""
+Score how closely one LCF record (`lcf`) matches a model household (`hh`).
+
+Returns a tuple `(t, incdiff)`: `t` is the accumulated matching score and `incdiff` is
+whatever `compare_income` returns for the LCF income and the household income summed
+over all people in `hh`. `incdiff` is also added into `t` with a weight of 10.
+"""
+function example_lcf_match( hh :: Household, lcf :: DataFrameRow ) :: Tuple
+    # The head of household supplies the age, employment and ethnicity terms below.
+    hrp = get_head( hh )
+    t = 0.0
+    t += score( lcf_tenuremap( lcf.a121 ), model_tenuremap( hh.tenure ))
+    # Previously used `model_region`, which is not defined in this function.
+    # NOTE(review): assumes `Household` has a `region` field alongside `tenure` - confirm.
+    t += score( lcf_regionmap( lcf.gorx ), model_regionmap( hh.region ))
+    # !!! both next missing in 2020 LCF
+    # t += score( lcf_accmap( lcf.a116 ), frs_accmap( frs.typeacc ))
+    # t += score( rooms( lcf.a111p, 998 ), rooms( frs.bedroom6, 999 ))
+    # Previously read `frs.hhagegr4`, but there is no `frs` row in this function. The band
+    # code is derived from the head's age instead.
+    # NOTE(review): presumably `model_age_hrp` yields the coding `frs_age_hrp` expects - confirm.
+    t += score( lcf_age_hrp(  lcf.a065p ), frs_age_hrp( model_age_hrp( hrp.age )))
+    # FIXME: household composition term disabled. It read `frs.hhcomps`, and no `frs` row
+    # exists here; a model-side composition code is needed before it can be restored.
+    # t += score( lcf_composition_map( lcf.a062 ), frs_composition_map( frs.hhcomps ))
+    any_wages = false
+    any_selfemp = false
+    any_pension_income = false
+    has_female_adult = false
+    income = 0.0
+    for (pid,pers) in hh.people
+        if get(pers.income,wages,0) > 0
+            any_wages = true
+        end
+        if get(pers.income,self_employment_income,0) > 0
+            any_selfemp = true
+        end
+        # Anyone aged 66 or over is counted as having pension income, whatever is recorded.
+        if (get(pers.income,private_pensions,0) > 0) || pers.age >= 66
+            any_pension_income = true
+        end
+        if (! pers.is_standard_child) && (pers.sex == Female )
+            has_female_adult = true
+        end
+        income += sum( pers.income, start=wages, stop=alimony_and_child_support_received ) # FIXME
+    end
+    t += lcf.any_wages == any_wages ? 1 : 0
+    t += lcf.any_pension_income == any_pension_income ? 1 : 0
+    t += lcf.any_selfemp == any_selfemp ? 1 : 0
+    # Parenthesised on purpose: `a == b == c` is a chained comparison in Julia, i.e.
+    # `(a == b) && (b == c)`, which would compare the LCF flag with the enum value itself.
+    t += lcf.hrp_unemployed == (hrp.employment_status == Unemployed) ? 1 : 0
+    t += lcf.hrp_non_white == (hrp.ethnic_group != White) ? 1 : 0
+    # t += lcf.datayear == frs.datayear ? 0.5 : 0 # - a little on same year FIXME use date range
+    # t += lcf.any_disabled == frs.any_disabled ? 1 : 0 -- not possible in LCF??
+    t += Int(lcf.has_female_adult) == Int(has_female_adult) ? 1 : 0
+    t += score( lcf.num_children, num_children(hh) )
+    t += score( lcf.num_people, num_people(hh) )
+    # fixme should we include this at all?
+    incdiff = compare_income( lcf.income, income )
+    t += 10.0*incdiff
+    return t,incdiff
+end
+
 # Ordering predicate: true when the first location's `score` is strictly below the second's.
 function islessscore( l1::LCFLocation, l2::LCFLocation )
     return l1.score < l2.score
 end
 
 # Ordering predicate: true when the first location's `incdiff` is strictly below the second's.
 function islessincdiff( l1::LCFLocation, l2::LCFLocation )
     return l1.incdiff < l2.incdiff
 end
 
 """
 Match one row in the FRS (recip) with all possible lcf matches (donor). Intended to be general
 but isn't really any more. FIXME: pass in a saving function so we're not tied to case/datayear.
 """
-function match_recip_row( recip :: DataFrameRow, donor :: DataFrame, matcher :: Function ) :: Vector{LCFLocation}
+function match_recip_row( recip, donor :: DataFrame, matcher :: Function ) :: Vector{LCFLocation}
     drows, dcols = size(donor)
     i = 0
     similar = Vector{LCFLocation}( undef, drows )
@@ -1620,6 +1703,8 @@ function match_recip_row( recip :: DataFrameRow, donor :: DataFrame, matcher ::
     return similar
 end
 
+
+
 """
 Create a dataframe for storing all the matches. 
 This has the FRS record and then 20 lcf records, with case,year,income and matching score for each.
@@ -1673,6 +1758,11 @@ function map_all( recip :: DataFrame, donor :: DataFrame, matcher :: Function ):
     return df
 end
 
+"""
+Match a single example household against the rows of `donor` using `matcher`, and
+return the first element of the resulting vector of matches.
+"""
+function map_example( example :: Household, donor :: DataFrame, matcher::Function )::LCFLocation
+    # Previously called `map_recip_row`, which is not defined anywhere visible; the row
+    # matcher declared in this file is `match_recip_row`.
+    matches = match_recip_row( example, donor, matcher )
+    # NOTE(review): presumably the matches come back ordered best-first - confirm.
+    return matches[1]
+end
+
 """
 print out our lcf and frs records
 """

diff --git a/src/ConsumptionData.jl b/src/ConsumptionData.jl
@@ -165,6 +165,18 @@ end
 
 const DEFAULT_STANDARD_RATE = default_standard_rate()
 
+"""
+Attach to `hh` the expenditure and factor-cost records for LCF case `case` in data year
+`datayear`: the first row of `EXPENDITURE_DATASET` and of `FACTOR_COST_DATASET` whose
+`case` and `datayear` columns both match.
+"""
+function find_consumption_for_hh!( hh :: Household, case :: Int, datayear :: Int )
+    # The third parameter was previously named `dataset` while the body read `datayear`,
+    # which left `datayear` undefined inside this method.
+    # println( "find_consumption_for_hh! matching to case $case datayear $datayear")
+    hh.expenditure = EXPENDITURE_DATASET[(EXPENDITURE_DATASET.case .== case).&(EXPENDITURE_DATASET.datayear.==datayear),:][1,:]
+    hh.factor_costs = FACTOR_COST_DATASET[(FACTOR_COST_DATASET.case .== case).&(FACTOR_COST_DATASET.datayear.==datayear),:][1,:]
+    @assert ! isnothing( hh.expenditure )
+    @assert ! isnothing( hh.factor_costs )
+end
+
 """
 Match in the lcf data using the lookup table constructed in 'matching/lcf_frs_matching.jl'
 'which' best, 2nd best etc match (<=20)
@@ -177,11 +189,7 @@ function find_consumption_for_hh!( hh :: Household, settings :: Settings, which
     lcf_datayear_sym = Symbol( "lcf_datayear_$(which)")
     case = match[lcf_case_sym]
     datayear = match[lcf_datayear_sym]
-    # println( "find_consumption_for_hh! matching to case $case datayear $datayear")
-    hh.expenditure = EXPENDITURE_DATASET[(EXPENDITURE_DATASET.case .== case).&(EXPENDITURE_DATASET.datayear.==datayear),:][1,:]
-    hh.factor_costs = FACTOR_COST_DATASET[(FACTOR_COST_DATASET.case .== case).&(FACTOR_COST_DATASET.datayear.==datayear),:][1,:]
-    @assert ! isnothing( hh.expenditure )
-    @assert ! isnothing( hh.factor_costs )
+    find_consumption_for_hh!( hh, case, datayear )
 end
 
 # FIXME FIXME CHAOTIC EVIL this is the diff between actual 157bn and crude modelled VAT receipts of 102mb. 2022

diff --git a/src/Definitions.jl b/src/Definitions.jl
@@ -6,6 +6,7 @@ using ScottishTaxBenefitModel
 using ScottishTaxBenefitModel.Utils
 using Parameters
 using JSON3
+import Base.sum
 
 export 
    Employment_Status,  # mapped from empstat
@@ -1228,6 +1229,38 @@ end
 Incomes_Dict = Dict{Incomes_Type,T} where T<:Real
 Incomes_Set = Set{Incomes_Type}
 
+
+export sum 
+
+"""
+Sum the entries of the incomes dictionary `i` whose keys are members of `which`.
+Members of `which` that have no entry in `i` contribute nothing to the total.
+"""
+function Base.sum( i :: Incomes_Dict{T}, which :: Incomes_Set ) :: T where T <: Number
+   # One accumulator throughout: the original initialised `z` but added into, and
+   # returned, an undefined `v`.
+   total = zero(T)
+   # `intersect` keeps only the requested keys that are actually present in `i`.
+   for k in intersect( which, keys(i))
+      total += i[k]
+   end
+   return total
+end
+
+"""
+Collect the `Incomes_Type` values from `start` through `stop` (inclusive) into an
+`Incomes_Set`. Values are visited in `instances(Incomes_Type)` order, and collection
+ends as soon as `stop` has been added. If `stop` is never reached after `start`,
+every value from `start` onwards is included.
+"""
+function ran( start :: Incomes_Type, stop :: Incomes_Type ) :: Incomes_Set
+   s = Incomes_Set()
+   for k in instances(Incomes_Type)
+       # Previously compared against `from` and `to`, neither of which is defined here;
+       # the parameters are `start` and `stop`.
+       if k >= start
+           push!(s,k)
+           if k == stop
+               break
+           end
+       end
+   end
+   return s
+end
+
+"""
+Sum the entries of the incomes dictionary `i` for every income type from `start`
+through `stop`, using the set built by `ran( start, stop )`.
+"""
+function Base.sum( i :: Incomes_Dict{T}; start :: Incomes_Type, stop :: Incomes_Type ) :: T where T <: Number
+   # Previously called `rand( start, stop )`; the set builder declared in this file is `ran`.
+   wanted = ran( start, stop )
+   return sum( i, wanted )
+end
+
+
 const Expenses = Incomes_Set([
    permanent_health_insurance,
    health_insurance,

diff --git a/src/Utils.jl b/src/Utils.jl
@@ -95,7 +95,7 @@ start_col and end_col and other fields just copied
 diff is difference df2 - df1, Frames should have identical other cols.
 """
 function df_diff( df1, df2 :: DataFrame, start_col::Int, end_col :: Int  ) :: DataFrame
-   @argcheck size( df1 ) == size( df2 )
+   argch@eck size( df1 ) == size( df2 )
    ## maybe check that the non diffed fields are all the same too.. 
    d = copy(df1)
    d[:,start_col:end_col] = df2[:,start_col:end_col] .- df1[:,start_col:end_col]