Skip to content

Commit

Permalink
Initial version of LA weights as module
Browse files Browse the repository at this point in the history
  • Loading branch information
grahamstark committed Dec 17, 2024
1 parent 0cedda0 commit 32b90a7
Show file tree
Hide file tree
Showing 2 changed files with 183 additions and 120 deletions.
230 changes: 137 additions & 93 deletions src/LocalWeightGeneration.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ module LocalWeightGeneration
# weights and adjusting incomes.
#

using CSV, DataFrames
using ScottishTaxBenefitModel
using .ModelHousehold
using .Definitions
Expand All @@ -14,38 +13,13 @@ using .Weighting
using SurveyDataWeighting
using CSV
using StatsBase
using PrettyTables
using DataFrames
using LinearAlgebra

include( joinpath(SRC_DIR,"targets","scotland-localities-2024.jl") )

"""
    summarise_dfs(data::DataFrame, targets::DataFrameRow, household_total::Number) -> DataFrame

Build an 11-row summary table with one column for every name in `targets`.

Each column of the result describes the column of the same name in `data`:

| row | content                                                     |
|-----|-------------------------------------------------------------|
| 1   | max                                                         |
| 2   | (never filled; stays `0.0`)                                 |
| 3   | mean                                                        |
| 4   | median                                                      |
| 5   | nmiss                                                       |
| 6   | min                                                         |
| 7   | nobs                                                        |
| 8   | q25                                                         |
| 9   | q75                                                         |
| 10  | sd                                                          |
| 11  | `targets[n]` divided by the uniformly-weighted column total |

The uniform weight is `household_total / nrow(data)` for every row. Rows 1-10
are unweighted: `summarystats` is called on the raw column only.
"""
function summarise_dfs( data :: DataFrame, targets::DataFrameRow, household_total :: Number )::DataFrame
    nms = Symbol.(names(targets))
    nrows = size( data, 1 )
    # Uniform starting weights that gross the sample up to `household_total`.
    initial_weights = Weights( fill( household_total / nrows, nrows ))
    d = DataFrame()
    for n in nms
        col = data[!,n]
        v = summarystats( col )
        # `d[!,n] = ...` creates the column; plain `d[n] = ...` is not valid DataFrames indexing.
        d[!,n] = zeros(11)
        d[1,n] = v.max
        # row 2 is deliberately left at zero, matching the existing row layout
        d[3,n] = v.mean
        d[4,n] = v.median
        d[5,n] = v.nmiss
        d[6,n] = v.min
        d[7,n] = v.nobs
        d[8,n] = v.q25
        d[9,n] = v.q75
        d[10,n] = v.sd
        # ratio of the target to what the uniform weights currently produce
        d[11,n] = targets[n] / sum( col, initial_weights )
    end
    return d
end
export weight_to_la, INCLUDE_ALL, create_model_dataset

const INCLUDE_OCCUP = 1
const INCLUDE_HOUSING = 2
Expand All @@ -57,24 +31,24 @@ const INCLUDE_INDUSTRY = 7
const INCLUDE_HH_SIZE = 8

# The full set of target-category switches.
# The members are passed as ONE vector: `Set{Integer}(a, b, c, ...)` with several
# positional arguments is not a valid constructor call.
# The element type stays `Integer` because `weight_to_la` declares its
# `included_categories` argument as `Set{Integer}`.
const INCLUDE_ALL = Set{Integer}([
    INCLUDE_OCCUP,
    INCLUDE_HOUSING,
    INCLUDE_BEDROOMS,
    INCLUDE_CT,
    INCLUDE_HCOMP,
    INCLUDE_EMPLOYMENT,
    INCLUDE_INDUSTRY,
    INCLUDE_HH_SIZE ])

"""
    summarise_dfs(data::DataFrame, targets::DataFrameRow, initial_weights::Vector) -> DataFrame

Summarise every column of `data` and compare its initially-weighted total with
the matching entry of `targets`.

The result has one column per column of `data` and 11 rows:

| row | content                                          |
|-----|--------------------------------------------------|
| 1   | max                                              |
| 2   | (never filled; stays `0.0`)                      |
| 3   | mean                                             |
| 4   | median                                           |
| 5   | nmiss                                            |
| 6   | min                                              |
| 7   | nobs                                             |
| 8   | q25                                              |
| 9   | q75                                              |
| 10  | sd                                               |
| 11  | `(target - initial total) / target`              |

Rows 1-10 are unweighted. Columns are returned ordered by the absolute value of
row 11, largest first, so the targets furthest from the initial weighting come
first.

# Arguments
- `data`: one row per household, one column per target.
- `targets`: target totals, indexable by the column names of `data`.
- `initial_weights`: one starting weight per row of `data`.
"""
function summarise_dfs( data :: DataFrame, targets::DataFrameRow, initial_weights::Vector )::DataFrame
    nrows, ncols = size( data )
    @show nrows ncols
    d = DataFrame()
    for n in names( data )
        col = data[!,n]
        v = summarystats( col )
        d[!,n] = zeros(11)
        d[1,n] = v.max
        # row 2 is deliberately left at zero, matching the existing row layout
        d[3,n] = v.mean
        d[4,n] = v.median
        d[5,n] = v.nmiss
        d[6,n] = v.min
        d[7,n] = v.nobs
        d[8,n] = v.q25
        d[9,n] = v.q75
        d[10,n] = v.sd
        # Weight every row by its own weight rather than assuming all weights
        # equal `initial_weights[1]`; identical for uniform weights and safe
        # for empty input.
        initial_total = sum( col .* initial_weights )
        d[11,n] = (targets[n] - initial_total) / targets[n]
    end
    # order the columns by how far the initial weighting is from its target
    poss = sortperm( abs.( Vector( d[11,:] )), rev=true )
    @show poss
    return d[!,poss]
end

"""
    weight_to_la(settings, model_data, household_total, all_council_data, included_categories)

Reweight the households in `model_data` so their weighted totals match the
targets built for one council, and return the weight vector.

# Arguments
- `settings`: supplies `weight_type`, `lower_multiple` and `upper_multiple`,
  which are passed on to `do_reweighting`.
- `model_data`: one row per household; the columns named by
  `make_target_list_2024` are selected from it.
- `household_total`: total the uniform initial weights sum to.
- `all_council_data`: the council's row, passed to `make_target_list_2024`.
- `included_categories`: which target categories to use, e.g. `INCLUDE_ALL`.

# Returns
The weights produced by `do_reweighting`, one per row of `model_data`.

# Throws
- `AssertionError` if the weighted totals do not approximately equal the
  targets, or if a constrained weighting method produced a weight outside
  `initial_weight * lower_multiple ... initial_weight * upper_multiple`.
"""
function weight_to_la(
    settings :: Settings,
    model_data :: DataFrame,
    household_total :: Real,
    all_council_data :: DataFrameRow,
    included_categories :: Set{Integer} )

    nhhlds = size( model_data, 1 )
    targets, tnames = make_target_list_2024(
        all_council_data, included_categories )
    data = select( model_data, tnames )
    # uniform starting weights that sum to `household_total`
    initial_weights = ones( nhhlds )*household_total/nhhlds
    # show at most the first 20 weights; a fixed 1:20 slice fails on small datasets
    @show initial_weights[1:min(20,nhhlds)] household_total nhhlds
    pt = summarise_dfs( data, targets, initial_weights )
    pretty_table( pt )
    @show near_collinear_cols( data )
    mdata = Matrix( data )
    vtargets = Vector( targets )
    weights = do_reweighting(
        data = mdata,
        initial_weights = initial_weights,
        target_populations = vtargets,
        functiontype = settings.weight_type,
        lower_multiple = settings.lower_multiple,
        upper_multiple = settings.upper_multiple,
        tol = 0.000001 )
    weighted_popn = (weights'*mdata)'
    println( "weighted_popn = $weighted_popn" )
    # the reweighted data must reproduce the targets
    @assert weighted_popn ≈ vtargets "weighted totals do not match targets"
    if settings.weight_type in [constrained_chi_square, d_and_s_constrained ]
        # check the constrained methods keep things inside ll and ul
        for r in 1:nhhlds
            @assert weights[r] <= initial_weights[r]*settings.upper_multiple "weight $r above upper bound"
            @assert weights[r] >= initial_weights[r]*settings.lower_multiple "weight $r below lower bound"
        end
    end
    return weights
end

Expand All @@ -123,11 +117,14 @@ function create_model_dataset(
make_target_row!( df[hno,:], hh )
end
# println(m)
nr,nc = size(df)
for c in 1:nc
col = df[!,c]
nr,nc = size(df)
nm = names(df)
i = 0
for col in eachcol(df)
i += 1
# col = df[!,c]
if eltype(col) <: Number
@assert sum(col) != 0 "all zero column $c"
@assert sum(col) != 0 "all zero column $(nm[i])"
end
end
# no row all zero
Expand All @@ -139,12 +136,58 @@ function create_model_dataset(
s += c
end
end
@assert c != 0 "all zero row $r"
@assert s != 0 "all zero row $r"
end
return df
end

#=
using Revise
using ScottishTaxBenefitModel
using .ModelHousehold
using .Definitions
using .FRSHouseholdGetter
using .RunSettings
using .Weighting
using .LocalWeightGeneration
using SurveyDataWeighting
using CSV
using StatsBase
using PrettyTables
using DataFrames
EXCLUDE_CT = Set{Integer}(
[LocalWeightGeneration.INCLUDE_OCCUP,
LocalWeightGeneration.INCLUDE_HOUSING,
LocalWeightGeneration.INCLUDE_BEDROOMS,
# INCLUDE_CT,
LocalWeightGeneration.INCLUDE_HCOMP,
LocalWeightGeneration.INCLUDE_EMPLOYMENT,
LocalWeightGeneration.INCLUDE_INDUSTRY])
# LocalWeightGeneration.INCLUDE_HH_SIZE] )
merged_census_files = LocalWeightGeneration.load_census_2024()
settings = Settings()
settings.lower_multiple = 0.01
settings.upper_multiple = 100.0
settings.num_households, settings.num_people = FRSHouseholdGetter.initialise( settings )
df = LocalWeightGeneration.create_model_dataset(
settings,
LocalWeightGeneration.initialise_model_dataframe_scotland_la,
LocalWeightGeneration.make_model_dataframe_row! )
targets = merged_census_files[merged_census_files.authority_code.==:S12000039,:][1,:]
weight_to_la(
settings,
df,
targets.total_hhlds,
targets,
EXCLUDE_CT)
=#

function create_la_weights(
settings :: Settings,
Expand All @@ -157,9 +200,12 @@ function create_la_weights(
settings.num_households,
initialise_target_dataframe_scotland_la,
make_model_dataframe_row! )



errors = []
const wides = Set([:S12000013] ) # h-Eileanan Siar""Angus", "East Lothian", "East Renfrewshire", "Renfrewshire", "East Dunbartonshire", "North Ayrshire", "West Dunbartonshire", "Shetland Islands", "Orkney Islands", "Inverclyde", "Midlothian", "Argyll and Bute", "East Ayrshire", "Dundee City", "Na h-Eileanan Siar", "South Lanarkshire", "Clackmannanshire", "West Lothian", "Falkirk", "Moray", "South Ayrshire", "City of Edinburgh", "Aberdeenshire", "North Lanarkshire"])
const verywides = Set([:S12000010, :S12000019, :S12000011, :S12000035, :S12000045] )
wides = Set([:S12000013] ) # h-Eileanan Siar""Angus", "East Lothian", "East Renfrewshire", "Renfrewshire", "East Dunbartonshire", "North Ayrshire", "West Dunbartonshire", "Shetland Islands", "Orkney Islands", "Inverclyde", "Midlothian", "Argyll and Bute", "East Ayrshire", "Dundee City", "Na h-Eileanan Siar", "South Lanarkshire", "Clackmannanshire", "West Lothian", "Falkirk", "Moray", "South Ayrshire", "City of Edinburgh", "Aberdeenshire", "North Lanarkshire"])
verywides = Set([:S12000010, :S12000019, :S12000011, :S12000035, :S12000045] )
#"East Lothian", "Midlothian", "East Renfrewshire", "Argyll and Bute", "East Dunbartonshire"])
s = Set()
settings.lower_multiple = 0.01
Expand All @@ -180,42 +226,40 @@ function create_la_weights(
for code in allfs.authority_code
println( "on $code")

council_data = all_councils_census[
all_councils_census.authority_code .== council,:][1,:]


try
# FIXME messing with globals for empl, hhsize, which break some authorities
if code in verywides
INCLUDE_EMPLOYMENT = false
INCLUDE_HH_SIZE = false
elseif code in wides
INCLUDE_EMPLOYMENT = true
INCLUDE_HH_SIZE = true
settings.lower_multiple = 0.001
settings.upper_multiple = 100.0
else
INCLUDE_HH_SIZE = true
INCLUDE_EMPLOYMENT = true
council_data = all_councils_census[
all_councils_census.authority_code .== council,:][1,:]
try
# FIXME messing with globals for empl, hhsize, which break some authorities
if code in verywides
# INCLUDE_EMPLOYMENT = false
# INCLUDE_HH_SIZE = false
elseif code in wides
# INCLUDE_EMPLOYMENT = true
# INCLUDE_HH_SIZE = true
# settings.lower_multiple = 0.001
# settings.upper_multiple = 100.0
else
# INCLUDE_HH_SIZE = true
# INCLUDE_EMPLOYMENT = true
end
w = weight_to_la( settings, allfs, code, settings.num_households )
println("OK")
outweights[!,code] = w
catch e
println( "error $e")
push!( errors, (; e, code ))
push!(s, code )
end
w = weight_to_la( settings, allfs, code, settings.num_households )
println("OK")
outweights[!,code] = w
catch e
println( "error $e")
push!( errors, (; e, code ))
push!(s, code )
end

end
end

println( errors )
println(s)
# println( errors )
# println(s)
end

CSV.write( joinpath( DDIR, "la-frs-weights-scotland-2024.tab"), outweights; delim='\t')
# CSV.write( joinpath( DDIR, "la-frs-weights-scotland-2024.tab"), outweights; delim='\t')

weights = CSV.File( joinpath( DDIR, "la-frs-weights-scotland-2024.tab") ) |> DataFrame
# weights = CSV.File( joinpath( DDIR, "la-frs-weights-scotland-2024.tab") ) |> DataFrame



Loading

0 comments on commit 32b90a7

Please sign in to comment.