changed readme

sylvaticus · May 24, 2024 · be03ca6 · be03ca6
1 parent 3669553
commit be03ca6
Show file tree

Hide file tree

Showing 27 changed files with 4,965 additions and 4 deletions.
diff --git a/README.md b/README.md
@@ -167,23 +167,23 @@ Neural Networks |  [Flux.jl](https://fluxml.ai/), [Knet](https://github.com/deni
 Decision Trees | [DecisionTree.jl](https://github.com/bensadeghi/DecisionTree.jl)
 Clustering | [Clustering.jl](https://github.com/JuliaStats/Clustering.jl), [GaussianMixtures.jl](https://github.com/davidavdav/GaussianMixtures.jl)
 Missing imputation | [Impute.jl](https://github.com/invenia/Impute.jl), [Mice.jl](https://github.com/tom-metherell/Mice.jl)
-
+Variable importance | [ShapML.jl](https://github.com/nredell/ShapML.jl)
 
 
 ## TODO
 
 ### Short term
 
 - Implement autotuning of `GaussianMixtureClusterer` using  `BIC` or `AIC`
-- Add Silhouette method to check cluster validity
+- <del>Add Silhouette method to check cluster validity</del>
 - Implement PAM and/or variants for kmedoids
 
 ### Mid/Long term
 
 - Add RNN support and improve convolutional layers speed
 - Reinforcement learning (Markov decision processes)
 - Standardize data sampling in training
-- Convert to GPU
+- Add GPU
 
 ## Contribute
 

diff --git a/announce_autoencoder.txt b/announce_autoencoder.txt
@@ -0,0 +1,21 @@
+
+[Ann] Easiest AutoEncoder on Earth: `m=AutoEncoder(); fit!(m,x); latent_x = predict(m,x); xest=inverse_predict(m,x_latent)`
+
+[ANN] A easy to use AutoEncoder to reduce the dimensionality of the data
+
+(I hope I'm not breaking the self-promotion rule ;-) )
+Hello, I am pleased to announce one of the easiest to use [AutoEncoder](https://sylvaticus.github.io/BetaML.jl/dev/Utils.html#BetaML.Utils.AutoEncoder) models in the world.
+
+No need to implement a neural network yourself, just use
+- `mod = AutoEncoder([optional stuff])` to create the model
+- `fit!(mod,x)` to fit the model to some tabular data (dims in cols)
+- x_latent = predict(mod)` or `x_latent = predict(mod,otherx)` to get the data encoded in latent space (usually with many less dimensions than the original)
+- x_decoded = inverse_predict(mod,x_latent)` to get the decoded values.
+
+The user can still specify the number of dimensions in the latent space, the number of neurons in the inner layers, or the full specification of encoding/decoding layers and NN training options, but this remains completely optional as some heuristics are applied. The `autotune' method also allows to further simplify these choices.
+
+`AutoEncoder` is part of the v0.10.4 of [BetaML Machine Learning Toolkit](https://github.com/sylvaticus/BetaML.jl), an open source set of machine learning models and utilities, and a wrapper should soon be available as part of the MLJ library.
+Although developed in the Julia language, it can be easily accessed from R, Python or any other language with a Julia binding, as specified [here](https://sylvaticus.github.io/BetaML.jl/stable/tutorials/Betaml_tutorial_getting_started.html#using_betaml_from_other_languages).
+
+
+
diff --git a/cancellable/# Scratchpad.jl b/cancellable/# Scratchpad.jl
@@ -0,0 +1,108 @@
+# Scratchpad
+
+x = [0.12 0.31 0.29 3.21 0.21;
+     0.44 1.21 1.18 13.54 0.85
+     0.22 0.61 0.58 6.43 0.42;
+     0.35 0.93 0.91 10.04 0.71;
+     0.51 1.47 1.46 16.12 0.99;
+     0.35 0.93 0.91 10.04 0.71;
+     0.51 1.47 1.46 16.12 0.99;
+     0.22 0.61 0.58 6.43 0.42;
+     0.12 0.31 0.29 3.21 0.21;
+     0.44 1.21 1.18 13.54 0.85];
+m    = AutoEncoder(encoded_size=2,layers_size=15,epochs=400,autotune=false,rng=copy(TESTRNG)) 
+x_reduced = fit!(m,x)
+x̂ = inverse_predict(m,x_reduced)
+x̂sum = sum(x̂)
+
+x = vcat(rand(copy(TESTRNG),0:0.001:0.6,30,5), rand(copy(TESTRNG),0.4:0.001:1,30,5))
+m = AutoEncoder(rng=copy(TESTRNG), verbosity=NONE)
+x_reduced = fit!(m,x)
+x̂ = inverse_predict(m,x_reduced)
+x̂sum = sum(x̂)
+
+l2loss_by_cv2(AutoEncoder(rng=copy(TESTRNG), verbosity=NONE),(x,),rng=copy(TESTRNG))
+
+m = AutoEncoder(rng=copy(TESTRNG), verbosity=NONE)
+sampler = KFold(nsplits=5,nrepeats=1,rng=copy(TESTRNG))
+(μ,σ) = cross_validation([x],sampler) do trainData,valData,rng
+    (xtrain,) = trainData; (xval,) = valData
+    fit!(m,xtrain)
+    x̂val_red = predict(m,xval)
+    x̂val     = inverse_predict(m,x̂val_red)
+    ϵ        = norm(xval .- x̂val)/size(xval,1) 
+    println(ϵ) # different
+    reset!(m)
+    return ismissing(ϵ) ? Inf : ϵ 
+end
+
+function l2loss_by_cv2(m,data;nsplits=5,nrepeats=1,rng=Random.GLOBAL_RNG)
+    x= data[1]
+    sampler = KFold(nsplits=nsplits,nrepeats=nrepeats,rng=rng)
+    (μ,σ) = cross_validation([x],sampler) do trainData,valData,rng
+        (xtrain,) = trainData; (xval,) = valData
+        fit!(m,xtrain)
+        x̂val     = inverse_predict(m,xval)
+        ϵ        = norm(xval .- x̂val)/size(xval,1) 
+        reset!(m)
+        return ismissing(ϵ) ? Inf : ϵ 
+    end
+    return μ
+end
+
+
+
+using Random, Pipe, HTTP, CSV, DataFrames, Plots, BetaML
+import Distributions: Normal, quantile
+Random.seed!(123)
+
+# We download the Boston house prices dataset from interet and split it into x and y
+dataURL = "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data"
+data    = @pipe HTTP.get(dataURL).body |> CSV.File(_, delim=' ', header=false, ignorerepeated=true) |> DataFrame
+
+data = CSV.File(joinpath("docs","src","tutorials","Feature importance", "data","housing.data"), delim=' ', header=false, ignorerepeated=true) |> DataFrame
+
+var_names = [
+  "CRIM",    # per capita crime rate by town
+  "ZN",      # proportion of residential land zoned for lots over 25,000 sq.ft.
+  "INDUS",   # proportion of non-retail business acres per town
+  "CHAS",    # Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
+  "NOX",     # nitric oxides concentration (parts per 10 million)
+  "RM",      # average number of rooms per dwelling
+  "AGE",     # proportion of owner-occupied units built prior to 1940
+  "DIS",     # weighted distances to five Boston employment centres
+  "RAD",     # index of accessibility to radial highways
+  "TAX",     # full-value property-tax rate per $10,000
+  "PTRATIO", # pupil-teacher ratio by town
+  "B",       # 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
+  "LSTAT",   # % lower status of the population
+]
+y_name = "MEDV" # Median value of owner-occupied homes in $1000's
+
+# Our features are a set of 13 explanatory variables, while the label that we want to estimate is the average housing prices:
+x = Matrix(data[:,1:13])
+y = data[:,14]
+
+# We use a Random Forest model as regressor and we compute the variable importance for this model :
+fr = FeatureRanker(model=RandomForestEstimator(),nsplits=3,nrepeats=2,recursive=false, ignore_dims_keyword="ignore_dims")
+rank = fit!(fr,x,y)
+
+loss_by_col        = info(fr)["loss_by_col"]
+sobol_by_col       = info(fr)["sobol_by_col"]
+loss_by_col_sd     = info(fr)["loss_by_col_sd"]
+sobol_by_col_sd    = info(fr)["sobol_by_col_sd"]
+loss_fullmodel     = info(fr)["loss_all_cols"]
+loss_fullmodel_sd  = info(fr)["loss_all_cols_sd"]
+ntrials_per_metric = info(fr)["ntrials_per_metric"]
+
+# Finally we can plot the variable importance, first using the loss metric ("mda") and then the sobol one:
+bar(var_names[sortperm(loss_by_col)], loss_by_col[sortperm(loss_by_col)],label="Loss by var", permute=(:x,:y), yerror=quantile(Normal(1,0),0.975) .* (loss_by_col_sd[sortperm(loss_by_col)]./sqrt(ntrials_per_metric)), yrange=[0,0.5])
+vline!([loss_fullmodel], label="Loss with all vars",linewidth=2)
+vline!([loss_fullmodel-quantile(Normal(1,0),0.975) * loss_fullmodel_sd/sqrt(ntrials_per_metric),
+        loss_fullmodel+quantile(Normal(1,0),0.975) * loss_fullmodel_sd/sqrt(ntrials_per_metric),
+], label=nothing,linecolor=:black,linestyle=:dot,linewidth=1)
+savefig("loss_by_var.png")
+#-
+bar(var_names[sortperm(sobol_by_col)],sobol_by_col[sortperm(sobol_by_col)],label="Sobol index by col", permute=(:x,:y), yerror=quantile(Normal(1,0),0.975) .* (sobol_by_col_sd[sortperm(sobol_by_col)]./sqrt(ntrials_per_metric)), yrange=[0,0.4])
+savefig("sobol_ny_var.png")
+# As we can see, the two analyses agree on the most important variables, showing that the size of the house (number of rooms), the percentage of low-income population in the neighbourhood and, to a lesser extent, the distance to employment centres are the most important explanatory variables of house price in the Boston area.
diff --git a/cancellable/Untitled-2.jl b/cancellable/Untitled-2.jl
@@ -0,0 +1,97 @@
+
+using Random, LinearAlgebra, Plots
+Random.seed!(123)
+# Syntetic data generation
+# x1: high importance, x2: little importance, x3: mixed effects with x1, x4: highly correlated with x1 but no effects on Y, x5 and x6: no effects on Y 
+TEMPRNG = copy(Random.GLOBAL_RNG)
+N     = 2000
+D     = 6
+nAttempts = 30
+xa    = rand(TEMPRNG,0:0.0001:10,N,3)
+xb    = (xa[:,1] .* 0.5 .* rand(TEMPRNG,0.7:0.001:1.3)) .+ 10 
+xc    = rand(TEMPRNG,0:0.0001:10,N,D-4)
+x     = hcat(xa,xb,xc)  
+y     = [10*r[1]-r[2]+0.1*r[3]*r[1] for r in eachrow(x) ]
+((xtrain,xtest),(ytrain,ytest)) = BetaML.partition([x,y],[0.8,0.2],rng=TEMPRNG)
+
+
+# full cols model:
+m = RandomForestEstimator(n_trees=100,rng=TEMPRNG)
+m = DecisionTreeEstimator(rng=TEMPRNG)
+m = NeuralNetworkEstimator(verbosity=NONE,rng=TEMPRNG)
+fit!(m,xtrain,ytrain)
+ŷtrain = predict(m,xtrain)
+loss = norm(ytrain-ŷtrain)/length(ytrain) # this is good
+
+ŷtest = predict(m,xtest)
+loss = norm(ytest-ŷtest)/length(ytest) # this is good
+
+loss_by_cols  = zeros(D)
+sobol_by_cols = zeros(D)
+loss_by_cols2  = zeros(D)
+sobol_by_cols2 = zeros(D)
+diffest_bycols = zeros(D)
+loss_by_cols_test  = zeros(D)
+sobol_by_cols_test = zeros(D)
+loss_by_cols2_test  = zeros(D)
+sobol_by_cols2_test = zeros(D)
+diffest_bycols_test = zeros(D)
+for a in 1:nAttempts
+    println("Running attempt $a...")
+    for d in 1:D
+        println("- doing modelling without dimension $d ....")
+        xd_train = hcat(xtrain[:,1:d-1],shuffle(TEMPRNG,xtrain[:,d]),xtrain[:,d+1:end])
+        xd_test = hcat(xtest[:,1:d-1],shuffle(TEMPRNG,xtest[:,d]),xtest[:,d+1:end])  
+        #md = RandomForestEstimator(n_trees=100,rng=TEMPRNG)
+        #md = DecisionTreeEstimator(rng=TEMPRNG)
+        md = NeuralNetworkEstimator(verbosity=NONE,rng=TEMPRNG)
+        fit!(md,xd_train,ytrain)
+        ŷdtrain           = predict(md,xd_train)
+        #ŷdtrain2          = predict(m,xtrain,ignore_dims=d)
+        ŷdtest            = predict(md,xd_test)
+        #ŷdtest2           = predict(m,xtest,ignore_dims=d)  
+        if a == 1
+            loss_by_cols[d]   = norm(ytrain-ŷdtrain)/length(ytrain)
+            sobol_by_cols[d]  = sobol_index(ŷtrain,ŷdtrain) 
+            #loss_by_cols2[d]  = norm(ytrain-ŷdtrain2)/length(ytrain)
+            #sobol_by_cols2[d] = sobol_index(ŷtrain,ŷdtrain2) 
+            #diffest_bycols[d] = norm(ŷdtrain-ŷdtrain2)/length(ytrain)
+            loss_by_cols_test[d]   = norm(ytest-ŷdtest)/length(ytest)
+            sobol_by_cols_test[d]  = sobol_index(ŷtest,ŷdtest) 
+            #loss_by_cols2_test[d]  = norm(ytest-ŷdtest2)/length(ytest)
+            #sobol_by_cols2_test[d] = sobol_index(ŷtest,ŷdtest2) 
+            #diffest_bycols_test[d] = norm(ŷdtest-ŷdtest2)/length(ytest)
+        else
+            loss_by_cols[d]   = online_mean(norm(ytrain-ŷdtrain)/length(ytrain); mean=loss_by_cols[d],n=a-1)
+            sobol_by_cols[d]  = online_mean(sobol_index(ŷtrain,ŷdtrain) ; mean=sobol_by_cols[d],n=a-1)
+            #loss_by_cols2[d]  = online_mean(norm(ytrain-ŷdtrain2)/length(ytrain); mean=loss_by_cols2[d],n=a-1)
+            #sobol_by_cols2[d] = online_mean(sobol_index(ŷtrain,ŷdtrain2) ; mean=sobol_by_cols2[d],n=a-1)
+            #diffest_bycols[d] = online_mean(norm(ŷdtrain-ŷdtrain2)/length(ytrain); mean=diffest_bycols[d],n=a-1)
+            loss_by_cols_test[d]   = online_mean(norm(ytest-ŷdtest)/length(ytest); mean=loss_by_cols_test[d],n=a-1)
+            sobol_by_cols_test[d]  = online_mean(sobol_index(ŷtest,ŷdtest) ; mean=sobol_by_cols_test[d],n=a-1)
+            #loss_by_cols2_test[d]  = online_mean(norm(ytest-ŷdtest2)/length(ytest); mean=loss_by_cols2_test[d],n=a-1)
+            #sobol_by_cols2_test[d] = online_mean(sobol_index(ŷtest,ŷdtest2) ; mean=sobol_by_cols2_test[d],n=a-1)
+            #diffest_bycols_test[d] = online_mean(norm(ŷdtest-ŷdtest2)/length(ytest); mean=diffest_bycols_test[d],n=a-1)
+        end
+    end
+end
+# Expected order: ~ [{5,6,4},{3,2},1] good
+#                 ~ [{5,6},{4,3,2,1}] still good but don't see corelation
+bar(string.(sortperm(loss_by_cols)),loss_by_cols[sortperm(loss_by_cols)],label="loss_by_cols train")
+bar(string.(sortperm(sobol_by_cols)),sobol_by_cols[sortperm(sobol_by_cols)],label="sobol_by_cols train")
+bar(string.(sortperm(loss_by_cols2)),loss_by_cols2[sortperm(loss_by_cols2)],label="loss_by_cols2 train")
+bar(string.(sortperm(sobol_by_cols2)),sobol_by_cols2[sortperm(sobol_by_cols2)],label="sobol_by_cols2 train")
+bar(string.(sortperm(loss_by_cols_test)),loss_by_cols_test[sortperm(loss_by_cols_test)],label="loss_by_cols test")
+bar(string.(sortperm(sobol_by_cols_test)),sobol_by_cols_test[sortperm(sobol_by_cols_test)],label="sobol_by_cols test")
+bar(string.(sortperm(loss_by_cols2_test)),loss_by_cols2_test[sortperm(loss_by_cols2_test)],label="loss_by_cols2 test")
+bar(string.(sortperm(sobol_by_cols2_test)),sobol_by_cols2_test[sortperm(sobol_by_cols2_test)],label="sobol_by_cols2 test")
+
+
+
+
+d = 5
+xd_train = hcat(xtrain[:,1:d-1],shuffle(xtrain[:,d]),xtrain[:,d+1:end])
+md = RandomForestEstimator(n_trees=50)
+fit!(md,xd_train,ytrain)
+ŷdtrain            = predict(md,xd_train)
+loss_d = norm(ytrain-ŷdtrain)/length(ytrain)
diff --git a/cancellable/Untitled-3.jl b/cancellable/Untitled-3.jl
@@ -0,0 +1,79 @@
+
+# Syntetic data generation
+# x1: high importance, x2: little importance, x3: mixed effects with x1, x4: highly correlated with x1 but no effects on Y, x5 and x6: no effects on Y 
+using Random
+TESTRNG = Random.GLOBAL_RNG
+N     = 2000
+D     = 6
+nAttempts = 10
+xa    = rand(copy(TESTRNG),0:0.0001:10,N,3)
+xb    = (xa[:,1] .* 0.5 .* rand(0.8:0.001:1.2)) .+ 10 
+xc    = rand(copy(TESTRNG),0:0.0001:10,N,D-4)
+x     = hcat(xa,xb,xc)  
+y     = [10*r[1]-r[2]+0.05*r[3]*r[1] for r in eachrow(x) ]
+((xtrain,xtest),(ytrain,ytest)) = BetaML.partition([x,y],[0.8,0.2],rng=copy(TESTRNG))
+
+
+# full cols model:
+m = RandomForestEstimator(n_trees=50)
+fit!(m,xtrain,ytrain)
+ŷtrain = predict(m,xtrain)
+loss = norm(ytrain-ŷtrain)/length(ytrain) # this is good
+
+ŷtest = predict(m,xtest)
+loss = norm(ytest-ŷtest)/length(ytest) # this is good
+
+loss_by_cols  = zeros(D)
+sobol_by_cols = zeros(D)
+loss_by_cols2  = zeros(D)
+sobol_by_cols2 = zeros(D)
+diffest_bycols = zeros(D)
+for a in 1:nAttempts
+    println("Running attempt $a...")
+    for d in 1:D
+        println("- doing modelling without dimension $d ....")
+        xd_train = hcat(xtrain[:,1:d-1],shuffle(xtrain[:,d]),xtrain[:,d+1:end])
+        xd_test = hcat(xtest[:,1:d-1],shuffle(xtest[:,d]),xtest[:,d+1:end])  
+        md = RandomForestEstimator(n_trees=50)
+        fit!(md,xd_train,ytrain)
+        ŷdtrain            = predict(md,xd_train)
+        ŷdtrain2           = predict(m,xtrain,ignore_dims=d)
+        ŷdtest            = predict(md,xd_test)
+        ŷdtest2           = predict(m,xtest,ignore_dims=d)  
+        if a == 1
+            #=
+            loss_by_cols[d]   = norm(ytest-ŷdtest)/length(ytest)
+            sobol_by_cols[d]  = sobol_index(ŷtest,ŷdtest) 
+            loss_by_cols2[d]  = norm(ytest-ŷdtest2)/length(ytest)
+            sobol_by_cols2[d] = sobol_index(ŷtest,ŷdtest2) 
+            diffest_bycols[d] = norm(ŷdtest-ŷdtest2)/length(ytest)
+            =#
+            loss_by_cols[d]   = norm(ytrain-ŷdtrain)/length(ytrain)
+            sobol_by_cols[d]  = sobol_index(ŷtrain,ŷdtrain) 
+            loss_by_cols2[d]  = norm(ytrain-ŷdtrain2)/length(ytrain)
+            sobol_by_cols2[d] = sobol_index(ŷtrain,ŷdtrain2) 
+            diffest_bycols[d] = norm(ŷdtrain-ŷdtrain2)/length(ytrain)
+        else
+            #=
+            loss_by_cols[d]   = online_mean(norm(ytest-ŷdtest)/length(ytest); mean=loss_by_cols[d],n=a-1)
+            sobol_by_cols[d]  = online_mean(sobol_index(ŷtest,ŷdtest) ; mean=sobol_by_cols[d],n=a-1)
+            loss_by_cols2[d]  = online_mean(norm(ytest-ŷdtest2)/length(ytest); mean=loss_by_cols2[d],n=a-1)
+            sobol_by_cols2[d] = online_mean(sobol_index(ŷtest,ŷdtest2) ; mean=sobol_by_cols2[d],n=a-1)
+            diffest_bycols[d] = online_mean(norm(ŷdtest-ŷdtest2)/length(ytest); mean=diffest_bycols[d],n=a-1)
+            =#
+            loss_by_cols[d]   = online_mean(norm(ytrain-ŷdtrain)/length(ytrain); mean=loss_by_cols[d],n=a-1)
+            sobol_by_cols[d]  = online_mean(sobol_index(ŷtrain,ŷdtrain) ; mean=sobol_by_cols[d],n=a-1)
+            loss_by_cols2[d]  = online_mean(norm(ytrain-ŷdtrain2)/length(ytrain); mean=loss_by_cols2[d],n=a-1)
+            sobol_by_cols2[d] = online_mean(sobol_index(ŷtrain,ŷdtrain2) ; mean=sobol_by_cols2[d],n=a-1)
+            diffest_bycols[d] = online_mean(norm(ŷdtrain-ŷdtrain2)/length(ytrain); mean=diffest_bycols[d],n=a-1)
+        end
+    end
+end
+# Expected order: ~ [5,6,4,3,2,1]
+
+d = 5
+xd_train = hcat(xtrain[:,1:d-1],shuffle(xtrain[:,d]),xtrain[:,d+1:end])
+md = RandomForestEstimator(n_trees=50)
+fit!(md,xd_train,ytrain)
+ŷdtrain            = predict(md,xd_train)
+loss_d = norm(ytrain-ŷdtrain)/length(ytrain)