-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
3669553
commit be03ca6
Showing
27 changed files
with
4,965 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
|
||
[Ann] Easiest AutoEncoder on Earth: `m=AutoEncoder(); fit!(m,x); latent_x = predict(m,x); xest=inverse_predict(m,x_latent)` | ||
|
||
[ANN] A easy to use AutoEncoder to reduce the dimensionality of the data | ||
|
||
(I hope I'm not breaking the self-promotion rule ;-) ) | ||
Hello, I am pleased to announce one of the easiest to use [AutoEncoder](https://sylvaticus.github.io/BetaML.jl/dev/Utils.html#BetaML.Utils.AutoEncoder) models in the world. | ||
|
||
No need to implement a neural network yourself, just use | ||
- `mod = AutoEncoder([optional stuff])` to create the model | ||
- `fit!(mod,x)` to fit the model to some tabular data (dims in cols) | ||
- x_latent = predict(mod)` or `x_latent = predict(mod,otherx)` to get the data encoded in latent space (usually with many less dimensions than the original) | ||
- x_decoded = inverse_predict(mod,x_latent)` to get the decoded values. | ||
|
||
The user can still specify the number of dimensions in the latent space, the number of neurons in the inner layers, or the full specification of encoding/decoding layers and NN training options, but this remains completely optional as some heuristics are applied. The `autotune' method also allows to further simplify these choices. | ||
|
||
`AutoEncoder` is part of the v0.10.4 of [BetaML Machine Learning Toolkit](https://github.com/sylvaticus/BetaML.jl), an open source set of machine learning models and utilities, and a wrapper should soon be available as part of the MLJ library. | ||
Although developed in the Julia language, it can be easily accessed from R, Python or any other language with a Julia binding, as specified [here](https://sylvaticus.github.io/BetaML.jl/stable/tutorials/Betaml_tutorial_getting_started.html#using_betaml_from_other_languages). | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
# Scratchpad | ||
|
||
x = [0.12 0.31 0.29 3.21 0.21; | ||
0.44 1.21 1.18 13.54 0.85 | ||
0.22 0.61 0.58 6.43 0.42; | ||
0.35 0.93 0.91 10.04 0.71; | ||
0.51 1.47 1.46 16.12 0.99; | ||
0.35 0.93 0.91 10.04 0.71; | ||
0.51 1.47 1.46 16.12 0.99; | ||
0.22 0.61 0.58 6.43 0.42; | ||
0.12 0.31 0.29 3.21 0.21; | ||
0.44 1.21 1.18 13.54 0.85]; | ||
m = AutoEncoder(encoded_size=2,layers_size=15,epochs=400,autotune=false,rng=copy(TESTRNG)) | ||
x_reduced = fit!(m,x) | ||
x̂ = inverse_predict(m,x_reduced) | ||
x̂sum = sum(x̂) | ||
|
||
x = vcat(rand(copy(TESTRNG),0:0.001:0.6,30,5), rand(copy(TESTRNG),0.4:0.001:1,30,5)) | ||
m = AutoEncoder(rng=copy(TESTRNG), verbosity=NONE) | ||
x_reduced = fit!(m,x) | ||
x̂ = inverse_predict(m,x_reduced) | ||
x̂sum = sum(x̂) | ||
|
||
l2loss_by_cv2(AutoEncoder(rng=copy(TESTRNG), verbosity=NONE),(x,),rng=copy(TESTRNG)) | ||
|
||
m = AutoEncoder(rng=copy(TESTRNG), verbosity=NONE) | ||
sampler = KFold(nsplits=5,nrepeats=1,rng=copy(TESTRNG)) | ||
(μ,σ) = cross_validation([x],sampler) do trainData,valData,rng | ||
(xtrain,) = trainData; (xval,) = valData | ||
fit!(m,xtrain) | ||
x̂val_red = predict(m,xval) | ||
x̂val = inverse_predict(m,x̂val_red) | ||
ϵ = norm(xval .- x̂val)/size(xval,1) | ||
println(ϵ) # different | ||
reset!(m) | ||
return ismissing(ϵ) ? Inf : ϵ | ||
end | ||
|
||
function l2loss_by_cv2(m,data;nsplits=5,nrepeats=1,rng=Random.GLOBAL_RNG) | ||
x= data[1] | ||
sampler = KFold(nsplits=nsplits,nrepeats=nrepeats,rng=rng) | ||
(μ,σ) = cross_validation([x],sampler) do trainData,valData,rng | ||
(xtrain,) = trainData; (xval,) = valData | ||
fit!(m,xtrain) | ||
x̂val = inverse_predict(m,xval) | ||
ϵ = norm(xval .- x̂val)/size(xval,1) | ||
reset!(m) | ||
return ismissing(ϵ) ? Inf : ϵ | ||
end | ||
return μ | ||
end | ||
|
||
|
||
|
||
using Random, Pipe, HTTP, CSV, DataFrames, Plots, BetaML | ||
import Distributions: Normal, quantile | ||
Random.seed!(123) | ||
|
||
# We download the Boston house prices dataset from interet and split it into x and y | ||
dataURL = "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data" | ||
data = @pipe HTTP.get(dataURL).body |> CSV.File(_, delim=' ', header=false, ignorerepeated=true) |> DataFrame | ||
|
||
data = CSV.File(joinpath("docs","src","tutorials","Feature importance", "data","housing.data"), delim=' ', header=false, ignorerepeated=true) |> DataFrame | ||
|
||
var_names = [ | ||
"CRIM", # per capita crime rate by town | ||
"ZN", # proportion of residential land zoned for lots over 25,000 sq.ft. | ||
"INDUS", # proportion of non-retail business acres per town | ||
"CHAS", # Charles River dummy variable (= 1 if tract bounds river; 0 otherwise) | ||
"NOX", # nitric oxides concentration (parts per 10 million) | ||
"RM", # average number of rooms per dwelling | ||
"AGE", # proportion of owner-occupied units built prior to 1940 | ||
"DIS", # weighted distances to five Boston employment centres | ||
"RAD", # index of accessibility to radial highways | ||
"TAX", # full-value property-tax rate per $10,000 | ||
"PTRATIO", # pupil-teacher ratio by town | ||
"B", # 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town | ||
"LSTAT", # % lower status of the population | ||
] | ||
y_name = "MEDV" # Median value of owner-occupied homes in $1000's | ||
|
||
# Our features are a set of 13 explanatory variables, while the label that we want to estimate is the average housing prices: | ||
x = Matrix(data[:,1:13]) | ||
y = data[:,14] | ||
|
||
# We use a Random Forest model as regressor and we compute the variable importance for this model : | ||
fr = FeatureRanker(model=RandomForestEstimator(),nsplits=3,nrepeats=2,recursive=false, ignore_dims_keyword="ignore_dims") | ||
rank = fit!(fr,x,y) | ||
|
||
loss_by_col = info(fr)["loss_by_col"] | ||
sobol_by_col = info(fr)["sobol_by_col"] | ||
loss_by_col_sd = info(fr)["loss_by_col_sd"] | ||
sobol_by_col_sd = info(fr)["sobol_by_col_sd"] | ||
loss_fullmodel = info(fr)["loss_all_cols"] | ||
loss_fullmodel_sd = info(fr)["loss_all_cols_sd"] | ||
ntrials_per_metric = info(fr)["ntrials_per_metric"] | ||
|
||
# Finally we can plot the variable importance, first using the loss metric ("mda") and then the sobol one: | ||
bar(var_names[sortperm(loss_by_col)], loss_by_col[sortperm(loss_by_col)],label="Loss by var", permute=(:x,:y), yerror=quantile(Normal(1,0),0.975) .* (loss_by_col_sd[sortperm(loss_by_col)]./sqrt(ntrials_per_metric)), yrange=[0,0.5]) | ||
vline!([loss_fullmodel], label="Loss with all vars",linewidth=2) | ||
vline!([loss_fullmodel-quantile(Normal(1,0),0.975) * loss_fullmodel_sd/sqrt(ntrials_per_metric), | ||
loss_fullmodel+quantile(Normal(1,0),0.975) * loss_fullmodel_sd/sqrt(ntrials_per_metric), | ||
], label=nothing,linecolor=:black,linestyle=:dot,linewidth=1) | ||
savefig("loss_by_var.png") | ||
#- | ||
bar(var_names[sortperm(sobol_by_col)],sobol_by_col[sortperm(sobol_by_col)],label="Sobol index by col", permute=(:x,:y), yerror=quantile(Normal(1,0),0.975) .* (sobol_by_col_sd[sortperm(sobol_by_col)]./sqrt(ntrials_per_metric)), yrange=[0,0.4]) | ||
savefig("sobol_ny_var.png") | ||
# As we can see, the two analyses agree on the most important variables, showing that the size of the house (number of rooms), the percentage of low-income population in the neighbourhood and, to a lesser extent, the distance to employment centres are the most important explanatory variables of house price in the Boston area. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
|
||
using Random, LinearAlgebra, Plots | ||
Random.seed!(123) | ||
# Syntetic data generation | ||
# x1: high importance, x2: little importance, x3: mixed effects with x1, x4: highly correlated with x1 but no effects on Y, x5 and x6: no effects on Y | ||
TEMPRNG = copy(Random.GLOBAL_RNG) | ||
N = 2000 | ||
D = 6 | ||
nAttempts = 30 | ||
xa = rand(TEMPRNG,0:0.0001:10,N,3) | ||
xb = (xa[:,1] .* 0.5 .* rand(TEMPRNG,0.7:0.001:1.3)) .+ 10 | ||
xc = rand(TEMPRNG,0:0.0001:10,N,D-4) | ||
x = hcat(xa,xb,xc) | ||
y = [10*r[1]-r[2]+0.1*r[3]*r[1] for r in eachrow(x) ] | ||
((xtrain,xtest),(ytrain,ytest)) = BetaML.partition([x,y],[0.8,0.2],rng=TEMPRNG) | ||
|
||
|
||
# full cols model: | ||
m = RandomForestEstimator(n_trees=100,rng=TEMPRNG) | ||
m = DecisionTreeEstimator(rng=TEMPRNG) | ||
m = NeuralNetworkEstimator(verbosity=NONE,rng=TEMPRNG) | ||
fit!(m,xtrain,ytrain) | ||
ŷtrain = predict(m,xtrain) | ||
loss = norm(ytrain-ŷtrain)/length(ytrain) # this is good | ||
|
||
ŷtest = predict(m,xtest) | ||
loss = norm(ytest-ŷtest)/length(ytest) # this is good | ||
|
||
loss_by_cols = zeros(D) | ||
sobol_by_cols = zeros(D) | ||
loss_by_cols2 = zeros(D) | ||
sobol_by_cols2 = zeros(D) | ||
diffest_bycols = zeros(D) | ||
loss_by_cols_test = zeros(D) | ||
sobol_by_cols_test = zeros(D) | ||
loss_by_cols2_test = zeros(D) | ||
sobol_by_cols2_test = zeros(D) | ||
diffest_bycols_test = zeros(D) | ||
for a in 1:nAttempts | ||
println("Running attempt $a...") | ||
for d in 1:D | ||
println("- doing modelling without dimension $d ....") | ||
xd_train = hcat(xtrain[:,1:d-1],shuffle(TEMPRNG,xtrain[:,d]),xtrain[:,d+1:end]) | ||
xd_test = hcat(xtest[:,1:d-1],shuffle(TEMPRNG,xtest[:,d]),xtest[:,d+1:end]) | ||
#md = RandomForestEstimator(n_trees=100,rng=TEMPRNG) | ||
#md = DecisionTreeEstimator(rng=TEMPRNG) | ||
md = NeuralNetworkEstimator(verbosity=NONE,rng=TEMPRNG) | ||
fit!(md,xd_train,ytrain) | ||
ŷdtrain = predict(md,xd_train) | ||
#ŷdtrain2 = predict(m,xtrain,ignore_dims=d) | ||
ŷdtest = predict(md,xd_test) | ||
#ŷdtest2 = predict(m,xtest,ignore_dims=d) | ||
if a == 1 | ||
loss_by_cols[d] = norm(ytrain-ŷdtrain)/length(ytrain) | ||
sobol_by_cols[d] = sobol_index(ŷtrain,ŷdtrain) | ||
#loss_by_cols2[d] = norm(ytrain-ŷdtrain2)/length(ytrain) | ||
#sobol_by_cols2[d] = sobol_index(ŷtrain,ŷdtrain2) | ||
#diffest_bycols[d] = norm(ŷdtrain-ŷdtrain2)/length(ytrain) | ||
loss_by_cols_test[d] = norm(ytest-ŷdtest)/length(ytest) | ||
sobol_by_cols_test[d] = sobol_index(ŷtest,ŷdtest) | ||
#loss_by_cols2_test[d] = norm(ytest-ŷdtest2)/length(ytest) | ||
#sobol_by_cols2_test[d] = sobol_index(ŷtest,ŷdtest2) | ||
#diffest_bycols_test[d] = norm(ŷdtest-ŷdtest2)/length(ytest) | ||
else | ||
loss_by_cols[d] = online_mean(norm(ytrain-ŷdtrain)/length(ytrain); mean=loss_by_cols[d],n=a-1) | ||
sobol_by_cols[d] = online_mean(sobol_index(ŷtrain,ŷdtrain) ; mean=sobol_by_cols[d],n=a-1) | ||
#loss_by_cols2[d] = online_mean(norm(ytrain-ŷdtrain2)/length(ytrain); mean=loss_by_cols2[d],n=a-1) | ||
#sobol_by_cols2[d] = online_mean(sobol_index(ŷtrain,ŷdtrain2) ; mean=sobol_by_cols2[d],n=a-1) | ||
#diffest_bycols[d] = online_mean(norm(ŷdtrain-ŷdtrain2)/length(ytrain); mean=diffest_bycols[d],n=a-1) | ||
loss_by_cols_test[d] = online_mean(norm(ytest-ŷdtest)/length(ytest); mean=loss_by_cols_test[d],n=a-1) | ||
sobol_by_cols_test[d] = online_mean(sobol_index(ŷtest,ŷdtest) ; mean=sobol_by_cols_test[d],n=a-1) | ||
#loss_by_cols2_test[d] = online_mean(norm(ytest-ŷdtest2)/length(ytest); mean=loss_by_cols2_test[d],n=a-1) | ||
#sobol_by_cols2_test[d] = online_mean(sobol_index(ŷtest,ŷdtest2) ; mean=sobol_by_cols2_test[d],n=a-1) | ||
#diffest_bycols_test[d] = online_mean(norm(ŷdtest-ŷdtest2)/length(ytest); mean=diffest_bycols_test[d],n=a-1) | ||
end | ||
end | ||
end | ||
# Expected order: ~ [{5,6,4},{3,2},1] good | ||
# ~ [{5,6},{4,3,2,1}] still good but don't see corelation | ||
bar(string.(sortperm(loss_by_cols)),loss_by_cols[sortperm(loss_by_cols)],label="loss_by_cols train") | ||
bar(string.(sortperm(sobol_by_cols)),sobol_by_cols[sortperm(sobol_by_cols)],label="sobol_by_cols train") | ||
bar(string.(sortperm(loss_by_cols2)),loss_by_cols2[sortperm(loss_by_cols2)],label="loss_by_cols2 train") | ||
bar(string.(sortperm(sobol_by_cols2)),sobol_by_cols2[sortperm(sobol_by_cols2)],label="sobol_by_cols2 train") | ||
bar(string.(sortperm(loss_by_cols_test)),loss_by_cols_test[sortperm(loss_by_cols_test)],label="loss_by_cols test") | ||
bar(string.(sortperm(sobol_by_cols_test)),sobol_by_cols_test[sortperm(sobol_by_cols_test)],label="sobol_by_cols test") | ||
bar(string.(sortperm(loss_by_cols2_test)),loss_by_cols2_test[sortperm(loss_by_cols2_test)],label="loss_by_cols2 test") | ||
bar(string.(sortperm(sobol_by_cols2_test)),sobol_by_cols2_test[sortperm(sobol_by_cols2_test)],label="sobol_by_cols2 test") | ||
|
||
|
||
|
||
|
||
d = 5 | ||
xd_train = hcat(xtrain[:,1:d-1],shuffle(xtrain[:,d]),xtrain[:,d+1:end]) | ||
md = RandomForestEstimator(n_trees=50) | ||
fit!(md,xd_train,ytrain) | ||
ŷdtrain = predict(md,xd_train) | ||
loss_d = norm(ytrain-ŷdtrain)/length(ytrain) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
|
||
# Syntetic data generation | ||
# x1: high importance, x2: little importance, x3: mixed effects with x1, x4: highly correlated with x1 but no effects on Y, x5 and x6: no effects on Y | ||
using Random | ||
TESTRNG = Random.GLOBAL_RNG | ||
N = 2000 | ||
D = 6 | ||
nAttempts = 10 | ||
xa = rand(copy(TESTRNG),0:0.0001:10,N,3) | ||
xb = (xa[:,1] .* 0.5 .* rand(0.8:0.001:1.2)) .+ 10 | ||
xc = rand(copy(TESTRNG),0:0.0001:10,N,D-4) | ||
x = hcat(xa,xb,xc) | ||
y = [10*r[1]-r[2]+0.05*r[3]*r[1] for r in eachrow(x) ] | ||
((xtrain,xtest),(ytrain,ytest)) = BetaML.partition([x,y],[0.8,0.2],rng=copy(TESTRNG)) | ||
|
||
|
||
# full cols model: | ||
m = RandomForestEstimator(n_trees=50) | ||
fit!(m,xtrain,ytrain) | ||
ŷtrain = predict(m,xtrain) | ||
loss = norm(ytrain-ŷtrain)/length(ytrain) # this is good | ||
|
||
ŷtest = predict(m,xtest) | ||
loss = norm(ytest-ŷtest)/length(ytest) # this is good | ||
|
||
loss_by_cols = zeros(D) | ||
sobol_by_cols = zeros(D) | ||
loss_by_cols2 = zeros(D) | ||
sobol_by_cols2 = zeros(D) | ||
diffest_bycols = zeros(D) | ||
for a in 1:nAttempts | ||
println("Running attempt $a...") | ||
for d in 1:D | ||
println("- doing modelling without dimension $d ....") | ||
xd_train = hcat(xtrain[:,1:d-1],shuffle(xtrain[:,d]),xtrain[:,d+1:end]) | ||
xd_test = hcat(xtest[:,1:d-1],shuffle(xtest[:,d]),xtest[:,d+1:end]) | ||
md = RandomForestEstimator(n_trees=50) | ||
fit!(md,xd_train,ytrain) | ||
ŷdtrain = predict(md,xd_train) | ||
ŷdtrain2 = predict(m,xtrain,ignore_dims=d) | ||
ŷdtest = predict(md,xd_test) | ||
ŷdtest2 = predict(m,xtest,ignore_dims=d) | ||
if a == 1 | ||
#= | ||
loss_by_cols[d] = norm(ytest-ŷdtest)/length(ytest) | ||
sobol_by_cols[d] = sobol_index(ŷtest,ŷdtest) | ||
loss_by_cols2[d] = norm(ytest-ŷdtest2)/length(ytest) | ||
sobol_by_cols2[d] = sobol_index(ŷtest,ŷdtest2) | ||
diffest_bycols[d] = norm(ŷdtest-ŷdtest2)/length(ytest) | ||
=# | ||
loss_by_cols[d] = norm(ytrain-ŷdtrain)/length(ytrain) | ||
sobol_by_cols[d] = sobol_index(ŷtrain,ŷdtrain) | ||
loss_by_cols2[d] = norm(ytrain-ŷdtrain2)/length(ytrain) | ||
sobol_by_cols2[d] = sobol_index(ŷtrain,ŷdtrain2) | ||
diffest_bycols[d] = norm(ŷdtrain-ŷdtrain2)/length(ytrain) | ||
else | ||
#= | ||
loss_by_cols[d] = online_mean(norm(ytest-ŷdtest)/length(ytest); mean=loss_by_cols[d],n=a-1) | ||
sobol_by_cols[d] = online_mean(sobol_index(ŷtest,ŷdtest) ; mean=sobol_by_cols[d],n=a-1) | ||
loss_by_cols2[d] = online_mean(norm(ytest-ŷdtest2)/length(ytest); mean=loss_by_cols2[d],n=a-1) | ||
sobol_by_cols2[d] = online_mean(sobol_index(ŷtest,ŷdtest2) ; mean=sobol_by_cols2[d],n=a-1) | ||
diffest_bycols[d] = online_mean(norm(ŷdtest-ŷdtest2)/length(ytest); mean=diffest_bycols[d],n=a-1) | ||
=# | ||
loss_by_cols[d] = online_mean(norm(ytrain-ŷdtrain)/length(ytrain); mean=loss_by_cols[d],n=a-1) | ||
sobol_by_cols[d] = online_mean(sobol_index(ŷtrain,ŷdtrain) ; mean=sobol_by_cols[d],n=a-1) | ||
loss_by_cols2[d] = online_mean(norm(ytrain-ŷdtrain2)/length(ytrain); mean=loss_by_cols2[d],n=a-1) | ||
sobol_by_cols2[d] = online_mean(sobol_index(ŷtrain,ŷdtrain2) ; mean=sobol_by_cols2[d],n=a-1) | ||
diffest_bycols[d] = online_mean(norm(ŷdtrain-ŷdtrain2)/length(ytrain); mean=diffest_bycols[d],n=a-1) | ||
end | ||
end | ||
end | ||
# Expected order: ~ [5,6,4,3,2,1] | ||
|
||
d = 5 | ||
xd_train = hcat(xtrain[:,1:d-1],shuffle(xtrain[:,d]),xtrain[:,d+1:end]) | ||
md = RandomForestEstimator(n_trees=50) | ||
fit!(md,xd_train,ytrain) | ||
ŷdtrain = predict(md,xd_train) | ||
loss_d = norm(ytrain-ŷdtrain)/length(ytrain) |
Oops, something went wrong.