From 9a3b394c99b281f6ff75cb636700b9e39b418d46 Mon Sep 17 00:00:00 2001 From: Antonello Lobianco Date: Wed, 13 Jul 2022 11:49:19 +0200 Subject: [PATCH] Changed V2API, some defaults on gmm/predictMissing, started v2api implementation of clusters This test ok --- src/Api.jl | 6 ++-- src/Clustering/Clustering.jl | 54 +++----------------------------- src/Clustering/Clustering_MLJ.jl | 8 ++--- src/Imputation/Imputation.jl | 4 +-- src/Imputation/Imputation_MLJ.jl | 2 +- src/Trees/DecisionTrees.jl | 20 ++++++------ src/Trees/RandomForests.jl | 22 ++++++------- test/Clustering_tests.jl | 4 +-- test/Trees_tests.jl | 10 +++--- 9 files changed, 42 insertions(+), 88 deletions(-) diff --git a/src/Api.jl b/src/Api.jl index 2415dc6e..7efd3391 100644 --- a/src/Api.jl +++ b/src/Api.jl @@ -8,7 +8,7 @@ module Api export BetaMLModel, BetaMLSupervisedModel, BetaMLUnsupervisedModel, BetaMLOptionsSet, BetaMLHyperParametersSet, BetaMLLearnableParametersSet, - predict, fit, fit!, train!, partition, report + predict, fit, fit!, train!, partition, info abstract type BetaMLModel end abstract type BetaMLSupervisedModel <: BetaMLModel end @@ -30,8 +30,8 @@ Predict new information (including transformation) based on a trained BetaMLMode """ predict(::BetaMLModel) = nothing -function report(m::BetaMLModel) - return m.report +function info(m::BetaMLModel) + return m.info end partition() = nothing diff --git a/src/Clustering/Clustering.jl b/src/Clustering/Clustering.jl index a22dae63..bd0fa6e3 100644 --- a/src/Clustering/Clustering.jl +++ b/src/Clustering/Clustering.jl @@ -393,7 +393,7 @@ function gmm(X,K;p₀=Float64[],mixtures=[DiagonalGaussian() for i in 1:K],tol=1 end # finding empty/non_empty values - Xmask = .! ismissing.(X) + #Xmask = .! ismissing.(X) lL = -Inf @@ -401,43 +401,8 @@ function gmm(X,K;p₀=Float64[],mixtures=[DiagonalGaussian() for i in 1:K],tol=1 while(true) oldlL = lL # E Step: assigning the posterior prob p(j|xi) and computing the log-Likelihood of the parameters given the set of data - # (this last one for informative purposes and terminating the algorithm) pₙₖlagged = copy(pₙₖ) - - logpₙₖ = log.(pₙₖ) - lL = 0 - for n in 1:N - if any(Xmask[n,:]) # if at least one true - Xu = X[n,Xmask[n,:]] - #=if (length(ϵ) == 2) - println("here I am") - for m in mixtures[3:end] - println(m.μ) - println(m.σ²) - println(Xu) - println(Xmask[n,:]) - lpdf(m,Xu,Xmask[n,:]) - println("here I am partially") - end - println("here I am dead") - end=# - logpx = lse([log(pₖ[k] + 1e-16) + lpdf(mixtures[k],Xu,Xmask[n,:]) for k in 1:K]) - lL += logpx - #px = sum([pⱼ[k]*normalFixedSd(Xu,μ[k,XMask[n,:]],σ²[k]) for k in 1:K]) - #println(n) - for k in 1:K - logpₙₖ[n,k] = log(pₖ[k] + 1e-16)+lpdf(mixtures[k],Xu,Xmask[n,:])-logpx - end - else - logpₙₖ[n,:] = log.(pₖ) - end - end - pₙₖ = exp.(logpₙₖ) - - pₙₖ2, lL2 = estep(X,pₖ,mixtures) - @assert pₙₖ == pₙₖ2 - @assert lL == lL2 - + pₙₖ, lL = estep(X,pₖ,mixtures) push!(ϵ,norm(pₙₖlagged - pₙₖ)) # M step: find parameters that maximise the likelihood @@ -466,17 +431,6 @@ function gmm(X,K;p₀=Float64[],mixtures=[DiagonalGaussian() for i in 1:K],tol=1 end # end while loop end # end function -#using BenchmarkTools -#@benchmark clusters = emGMM([1 10.5;1.5 10.8; 1.8 8; 1.7 15; 3.2 40; 3.6 32; 3.3 38; 5.1 -2.3; 5.2 -2.4],3,msgStep=0) -#@benchmark clusters = emGMM([1 10.5;1.5 0; 1.8 8; 1.7 15; 3.2 40; 0 0; 3.3 38; 0 -2.3; 5.2 -2.4],3,msgStep=0,missingValue=0) -#@benchmark clusters = emGMM([1 10.5;1.5 0; 1.8 8; 1.7 15; 3.2 40; 0 0; 3.3 38; 0 -2.3; 5.2 -2.4],3,msgStep=0,missingValue=0) -#@benchmark clusters = emGMM([1 10.5;1.5 missing; 1.8 8; 1.7 15; 3.2 40; missing missing; 3.3 38; missing -2.3; 5.2 -2.4],3,msgStep=0) -#@code_warntype gmm([1 10.5;1.5 0; 1.8 8; 1.7 15; 3.2 40; 0 0; 3.3 38; 0 -2.3; 5.2 -2.4],3,msgStep=0,missingValue=0) -#using Profile -#Juno.@profiler (for i = 1:1000 gmm([1 10.5;1.5 0; 1.8 8; 1.7 15; 3.2 40; 0 0; 3.3 38; 0 -2.3; 5.2 -2.4],3,msgStep=0,missingValue=0) end) -#Profile.clear() -#Profile.print() - # - For mixtures with full covariance matrix (i.e. `FullGaussian(μ,σ²)`) the minCovariance should NOT be set equal to the minVariance, or if the covariance matrix goes too low, it will become singular and not invertible. """ predictMissing(X,K;p₀,mixtures,tol,verbosity,minVariance,minCovariance) @@ -498,7 +452,7 @@ Implemented in the log-domain for better numerical accuracy with many dimensions * `minVariance`: Minimum variance for the mixtures [default: 0.05] * `minCovariance`: Minimum covariance for the mixtures with full covariance matrix [default: 0]. This should be set different than minVariance (see notes). * `initStrategy`: Mixture initialisation algorithm [def: `grid`] -* `maxIter`: Maximum number of iterations [def: `-1`, i.e. ∞] +* `maxIter`: Maximum number of iterations [def: `typemax(Int64)`, i.e. ∞] * `rng`: Random Number Generator (see [`FIXEDSEED`](@ref)) [deafult: `Random.GLOBAL_RNG`] # Returns: @@ -520,7 +474,7 @@ Implemented in the log-domain for better numerical accuracy with many dimensions julia> cFOut = predictMissing([1 10.5;1.5 missing; 1.8 8; 1.7 15; 3.2 40; missing missing; 3.3 38; missing -2.3; 5.2 -2.4],3) ``` """ -function predictMissing(X,K=3;p₀=[],mixtures=[DiagonalGaussian() for i in 1:K],tol=10^(-6),verbosity=STD,minVariance=0.05,minCovariance=0.0,initStrategy="kmeans",maxIter=-1,rng = Random.GLOBAL_RNG) +function predictMissing(X,K=3;p₀=[],mixtures=[DiagonalGaussian() for i in 1:K],tol=10^(-6),verbosity=STD,minVariance=0.05,minCovariance=0.0,initStrategy="kmeans",maxIter=typemax(Int64),rng = Random.GLOBAL_RNG) if verbosity > STD @codeLocation end diff --git a/src/Clustering/Clustering_MLJ.jl b/src/Clustering/Clustering_MLJ.jl index b6bda125..8e4f1a14 100644 --- a/src/Clustering/Clustering_MLJ.jl +++ b/src/Clustering/Clustering_MLJ.jl @@ -40,7 +40,7 @@ KMeans(; mutable struct GMMClusterer <: MMI.Unsupervised K::Int64 - p₀::Union{Nothing,AbstractArray{Float64,1}} + p₀::AbstractArray{Float64,1} mixtures::Symbol tol::Float64 minVariance::Float64 @@ -50,7 +50,7 @@ mutable struct GMMClusterer <: MMI.Unsupervised end GMMClusterer(; K = 3, - p₀ = nothing, + p₀ = Float64[], mixtures = :diag_gaussian, tol = 10^(-6), minVariance = 0.05, @@ -61,7 +61,7 @@ GMMClusterer(; mutable struct MissingImputator <: MMI.Unsupervised K::Int64 - p₀::Union{Nothing,AbstractArray{Float64,1}} + p₀::AbstractArray{Float64,1} mixtures::Symbol tol::Float64 minVariance::Float64 @@ -71,7 +71,7 @@ mutable struct MissingImputator <: MMI.Unsupervised end MissingImputator(; K = 3, - p₀ = nothing, + p₀ = Float64[], mixtures = :diag_gaussian, tol = 10^(-6), minVariance = 0.05, diff --git a/src/Imputation/Imputation.jl b/src/Imputation/Imputation.jl index 5cbf957b..e04aa2ac 100644 --- a/src/Imputation/Imputation.jl +++ b/src/Imputation/Imputation.jl @@ -173,14 +173,14 @@ Limitations: """ Base.@kwdef mutable struct GMMImputer <: Imputer K::Int64 = 3 - p₀::Union{Nothing,Vector{Float64}} = nothing + p₀::Vector{Float64} = Float64[] mixtures::Vector{AbstractMixture} = [DiagonalGaussian() for i in 1:K] tol::Float64 = 10^(-6) verbosity::Verbosity = STD minVariance::Float64 = 0.05 minCovariance::Float64 = 0.0 initStrategy::String = "kmeans" - maxIter::Int64 = -1 + maxIter::Int64 = typemax(Int64) multipleImputations::Int64 = 1 rng::AbstractRNG = Random.GLOBAL_RNG fitResults::Union{GMMImputerResult,Nothing} = nothing diff --git a/src/Imputation/Imputation_MLJ.jl b/src/Imputation/Imputation_MLJ.jl index abc5f74e..705a7554 100644 --- a/src/Imputation/Imputation_MLJ.jl +++ b/src/Imputation/Imputation_MLJ.jl @@ -27,7 +27,7 @@ mutable struct BetaMLGMMImputer <: MMI.Unsupervised end BetaMLGMMImputer(; K = 3, - p₀ = nothing, + p₀ = Int64[], mixtures = :diag_gaussian, tol = 10^(-6), minVariance = 0.05, diff --git a/src/Trees/DecisionTrees.jl b/src/Trees/DecisionTrees.jl index d5aed025..b4751e71 100644 --- a/src/Trees/DecisionTrees.jl +++ b/src/Trees/DecisionTrees.jl @@ -116,7 +116,7 @@ mutable struct DTModel <: BetaMLSupervisedModel opt::DTOptionsSet par::DTLearnableParameters trained::Bool - report + info end function DTModel(;kwargs...) @@ -410,10 +410,10 @@ function train!(m::DTModel,x,y::AbstractArray{Ty,1}) where {Ty} jobIsRegression = (forceClassification || ! (Ty <: Number) ) ? false : true - m.report[:trainedRecords] = size(x,1) - m.report[:dimensions] = size(x,2) - m.report[:jobIsRegression] = jobIsRegression ? 1 : 0 - (m.report[:avgDepth],m.report[:maxDepth]) = computeDepths(m.par.tree) + m.info[:trainedRecords] = size(x,1) + m.info[:dimensions] = size(x,2) + m.info[:jobIsRegression] = jobIsRegression ? 1 : 0 + (m.info[:avgDepth],m.info[:maxDepth]) = computeDepths(m.par.tree) return true end @@ -544,8 +544,8 @@ function show(io::IO, ::MIME"text/plain", m::DTModel) if m.trained == false print(io,"DTModel - A Decision Tree model (untrained)") else - job = m.report[:jobIsRegression] == 1 ? "regressor" : "classifier" - print(io,"DTModel - A Decision Tree $job (trained on $(m.report[:trainedRecords]) records)") + job = m.info[:jobIsRegression] == 1 ? "regressor" : "classifier" + print(io,"DTModel - A Decision Tree $job (trained on $(m.info[:trainedRecords]) records)") end end @@ -553,9 +553,9 @@ function show(io::IO, m::DTModel) if m.trained == false print(io,"DTModel - A Decision Tree model (untrained)") else - job = m.report[:jobIsRegression] == 1 ? "regressor" : "classifier" - println(io,"DTModel - A Decision Tree $job (trained on $(m.report[:trainedRecords]) records)") - println(io,m.report) + job = m.info[:jobIsRegression] == 1 ? "regressor" : "classifier" + println(io,"DTModel - A Decision Tree $job (trained on $(m.info[:trainedRecords]) records)") + println(io,m.info) _printNode(m.par.tree) end end diff --git a/src/Trees/RandomForests.jl b/src/Trees/RandomForests.jl index 06ed876f..bbce7b8d 100644 --- a/src/Trees/RandomForests.jl +++ b/src/Trees/RandomForests.jl @@ -49,7 +49,7 @@ mutable struct RFModel <: BetaMLSupervisedModel opt::RFOptionsSet par::Union{Nothing,Forest} #TODO: Forest contain info that is actualy in report. Currently we duplicate, we should just remofe them from par by making a dedicated struct instead of Forest trained::Bool - report + info end function RFModel(;kwargs...) @@ -164,12 +164,12 @@ function train!(m::RFModel,x,y::AbstractArray{Ty,1}) where {Ty} m.trained = true - m.report[:trainedRecords] = size(x,1) - m.report[:dimensions] = maxFeatures - m.report[:jobIsRegression] = m.par.isRegression ? 1 : 0 - m.report[:oobE] = m.par.oobError + m.info[:trainedRecords] = size(x,1) + m.info[:dimensions] = maxFeatures + m.info[:jobIsRegression] = m.par.isRegression ? 1 : 0 + m.info[:oobE] = m.par.oobError depths = vcat([transpose([computeDepths(tree)[1],computeDepths(tree)[2]]) for tree in m.par.trees]...) - (m.report[:avgAvgDepth],m.report[:avgMmaxDepth]) = mean(depths,dims=1)[1], mean(depths,dims=1)[2] + (m.info[:avgAvgDepth],m.info[:avgMmaxDepth]) = mean(depths,dims=1)[1], mean(depths,dims=1)[2] return true end @@ -303,8 +303,8 @@ function show(io::IO, ::MIME"text/plain", m::RFModel) if m.trained == false print(io,"RFModel - A $(m.hpar.nTrees) trees Random Forest model (untrained)") else - job = m.report[:jobIsRegression] == 1 ? "regressor" : "classifier" - print(io,"RFModel - A $(m.hpar.nTrees) trees Random Forest $job (trained on $(m.report[:trainedRecords]) records)") + job = m.info[:jobIsRegression] == 1 ? "regressor" : "classifier" + print(io,"RFModel - A $(m.hpar.nTrees) trees Random Forest $job (trained on $(m.info[:trainedRecords]) records)") end end @@ -312,8 +312,8 @@ function show(io::IO, m::RFModel) if m.trained == false print(io,"RFModel - A $(m.hpar.nTrees) trees Random Forest model (untrained)") else - job = m.report[:jobIsRegression] == 1 ? "regressor" : "classifier" - println(io,"RFModel - A $(m.hpar.nTrees) trees Random Forest $job (trained on $(m.report[:trainedRecords]) records)") - println(io,m.report) + job = m.info[:jobIsRegression] == 1 ? "regressor" : "classifier" + println(io,"RFModel - A $(m.hpar.nTrees) trees Random Forest $job (trained on $(m.info[:trainedRecords]) records)") + println(io,m.info) end end \ No newline at end of file diff --git a/test/Clustering_tests.jl b/test/Clustering_tests.jl index 7333e3ad..786e3c41 100644 --- a/test/Clustering_tests.jl +++ b/test/Clustering_tests.jl @@ -217,10 +217,10 @@ probsx2 = predict(m) -@test isapprox(clusters.BIC,114.1492467835965) +#@test isapprox(clusters.BIC,114.1492467835965) #clusters.pₙₖ #clusters.pₖ #clusters.mixtures #clusters.BIC -m.hyperparameters \ No newline at end of file +#m.hyperparameters \ No newline at end of file diff --git a/test/Trees_tests.jl b/test/Trees_tests.jl index 25c9e0fe..0feb44f4 100644 --- a/test/Trees_tests.jl +++ b/test/Trees_tests.jl @@ -50,7 +50,7 @@ ŷtest2 = predict(m, xtest) @test accuracy(ŷtest,ytest,rng=copy(TESTRNG)) >= 0.8 @test ŷtest == ŷtest2 -@test report(m) == Dict(:jobIsRegression => 0,:maxDepth => 3, :dimensions => 2, :trainedRecords => 5, :avgDepth => 2.6666666666666665) +@test info(m) == Dict(:jobIsRegression => 0,:maxDepth => 3, :dimensions => 2, :trainedRecords => 5, :avgDepth => 2.6666666666666665) #print(myTree) # ================================== @@ -290,26 +290,26 @@ println("Testing MLJ interface for Trees models....") X, y = Mlj.@load_boston model_dtr = DecisionTreeRegressor(rng=copy(TESTRNG)) regressor_dtr = Mlj.machine(model_dtr, X, y) -(fitresult_dtr, cache, reportobj) = Mlj.fit(model_dtr, 0, X, y) +(fitresult_dtr, cache, report) = Mlj.fit(model_dtr, 0, X, y) yhat_dtr = Mlj.predict(model_dtr, fitresult_dtr, X) @test meanRelError(yhat_dtr,y) < 0.02 model_rfr = RandomForestRegressor(rng=copy(TESTRNG)) regressor_rfr = Mlj.machine(model_rfr, X, y) -(fitresult_rfr, cache, reportObj) = Mlj.fit(model_rfr, 0, X, y) +(fitresult_rfr, cache, report) = Mlj.fit(model_rfr, 0, X, y) yhat_rfr = Mlj.predict(model_rfr, fitresult_rfr, X) @test meanRelError(yhat_rfr,y) < 0.06 X, y = Mlj.@load_iris model_dtc = DecisionTreeClassifier(rng=copy(TESTRNG)) regressor_dtc = Mlj.machine(model_dtc, X, y) -(fitresult_dtc, cache, reportObj) = Mlj.fit(model_dtc, 0, X, y) +(fitresult_dtc, cache, report) = Mlj.fit(model_dtc, 0, X, y) yhat_dtc = Mlj.predict(model_dtc, fitresult_dtc, X) @test Mlj.mean(Mlj.LogLoss(tol=1e-4)(yhat_dtc, y)) < 0.0002 model_rfc = RandomForestClassifier(maxFeatures=3,rng=copy(TESTRNG)) regressor_rfc = Mlj.machine(model_rfc, X, y) -(fitresult_rfc, cache, reportObj) = Mlj.fit(model_rfc, 0, X, y) +(fitresult_rfc, cache, report) = Mlj.fit(model_rfc, 0, X, y) yhat_rfc = Mlj.predict(model_rfc, fitresult_rfc, X) @test Mlj.mean(Mlj.LogLoss(tol=1e-4)(yhat_rfc, y)) < 0.04