
Commit 9a3b394
Changed V2API, some defaults on gmm/predictMissing, started v2api implementation of clusters

This test ok
sylvaticus committed Jul 13, 2022
1 parent eb56d29 commit 9a3b394
Showing 9 changed files with 42 additions and 88 deletions.
6 changes: 3 additions & 3 deletions src/Api.jl
@@ -8,7 +8,7 @@ module Api

export BetaMLModel, BetaMLSupervisedModel, BetaMLUnsupervisedModel,
BetaMLOptionsSet, BetaMLHyperParametersSet, BetaMLLearnableParametersSet,
predict, fit, fit!, train!, partition, report
predict, fit, fit!, train!, partition, info

abstract type BetaMLModel end
abstract type BetaMLSupervisedModel <: BetaMLModel end
@@ -30,8 +30,8 @@ Predict new information (including transformation) based on a trained BetaMLMode
"""
predict(::BetaMLModel) = nothing

function report(m::BetaMLModel)
return m.report
function info(m::BetaMLModel)
return m.info
end

partition() = nothing
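For reference, the renamed accessor in use. A minimal sketch, assuming `DTModel`, `train!` and `info` are all reachable from the top-level `BetaML` namespace at this commit; the data and the reported values are purely illustrative:

```julia
using BetaML                      # exact import path at this commit is an assumption

x = [1.0 10.5; 1.5 10.8; 1.8 8.0; 1.7 15.0; 3.2 40.0]
y = ["a", "a", "a", "b", "b"]

m = DTModel()                     # hyperparameters left at their defaults
train!(m, x, y)                   # training also fills m.info
info(m)                           # was report(m); returns a Dict such as
                                  # Dict(:trainedRecords => 5, :dimensions => 2, ...)
```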
54 changes: 4 additions & 50 deletions src/Clustering/Clustering.jl
@@ -393,51 +393,16 @@ function gmm(X,K;p₀=Float64[],mixtures=[DiagonalGaussian() for i in 1:K],tol=1
end

# finding empty/non_empty values
Xmask = .! ismissing.(X)
#Xmask = .! ismissing.(X)


lL = -Inf
iter = 1
while(true)
oldlL = lL
# E Step: assigning the posterior prob p(j|xi) and computing the log-Likelihood of the parameters given the set of data
# (this last one for informative purposes and terminating the algorithm)
pₙₖlagged = copy(pₙₖ)

logpₙₖ = log.(pₙₖ)
lL = 0
for n in 1:N
if any(Xmask[n,:]) # if at least one true
Xu = X[n,Xmask[n,:]]
#=if (length(ϵ) == 2)
println("here I am")
for m in mixtures[3:end]
println(m.μ)
println(m.σ²)
println(Xu)
println(Xmask[n,:])
lpdf(m,Xu,Xmask[n,:])
println("here I am partially")
end
println("here I am dead")
end=#
logpx = lse([log(pₖ[k] + 1e-16) + lpdf(mixtures[k],Xu,Xmask[n,:]) for k in 1:K])
lL += logpx
#px = sum([pⱼ[k]*normalFixedSd(Xu,μ[k,XMask[n,:]],σ²[k]) for k in 1:K])
#println(n)
for k in 1:K
logpₙₖ[n,k] = log(pₖ[k] + 1e-16)+lpdf(mixtures[k],Xu,Xmask[n,:])-logpx
end
else
logpₙₖ[n,:] = log.(pₖ)
end
end
pₙₖ = exp.(logpₙₖ)

pₙₖ2, lL2 = estep(X,pₖ,mixtures)
@assert pₙₖ == pₙₖ2
@assert lL == lL2

pₙₖ, lL = estep(X,pₖ,mixtures)
push!(ϵ,norm(pₙₖlagged - pₙₖ))

# M step: find parameters that maximise the likelihood
@@ -466,17 +431,6 @@
end # end while loop
end # end function

#using BenchmarkTools
#@benchmark clusters = emGMM([1 10.5;1.5 10.8; 1.8 8; 1.7 15; 3.2 40; 3.6 32; 3.3 38; 5.1 -2.3; 5.2 -2.4],3,msgStep=0)
#@benchmark clusters = emGMM([1 10.5;1.5 0; 1.8 8; 1.7 15; 3.2 40; 0 0; 3.3 38; 0 -2.3; 5.2 -2.4],3,msgStep=0,missingValue=0)
#@benchmark clusters = emGMM([1 10.5;1.5 0; 1.8 8; 1.7 15; 3.2 40; 0 0; 3.3 38; 0 -2.3; 5.2 -2.4],3,msgStep=0,missingValue=0)
#@benchmark clusters = emGMM([1 10.5;1.5 missing; 1.8 8; 1.7 15; 3.2 40; missing missing; 3.3 38; missing -2.3; 5.2 -2.4],3,msgStep=0)
#@code_warntype gmm([1 10.5;1.5 0; 1.8 8; 1.7 15; 3.2 40; 0 0; 3.3 38; 0 -2.3; 5.2 -2.4],3,msgStep=0,missingValue=0)
#using Profile
#Juno.@profiler (for i = 1:1000 gmm([1 10.5;1.5 0; 1.8 8; 1.7 15; 3.2 40; 0 0; 3.3 38; 0 -2.3; 5.2 -2.4],3,msgStep=0,missingValue=0) end)
#Profile.clear()
#Profile.print()

# - For mixtures with full covariance matrix (i.e. `FullGaussian(μ,σ²)`) the minCovariance should NOT be set equal to the minVariance, or if the covariance matrix goes too low, it will become singular and not invertible.
"""
predictMissing(X,K;p₀,mixtures,tol,verbosity,minVariance,minCovariance)
@@ -498,7 +452,7 @@ Implemented in the log-domain for better numerical accuracy with many dimensions
* `minVariance`: Minimum variance for the mixtures [default: 0.05]
* `minCovariance`: Minimum covariance for the mixtures with full covariance matrix [default: 0]. This should be set different than minVariance (see notes).
* `initStrategy`: Mixture initialisation algorithm [def: `grid`]
* `maxIter`: Maximum number of iterations [def: `-1`, i.e. ∞]
* `maxIter`: Maximum number of iterations [def: `typemax(Int64)`, i.e. ∞]
* `rng`: Random Number Generator (see [`FIXEDSEED`](@ref)) [default: `Random.GLOBAL_RNG`]
# Returns:
@@ -520,7 +474,7 @@ Implemented in the log-domain for better numerical accuracy with many dimensions
julia> cFOut = predictMissing([1 10.5;1.5 missing; 1.8 8; 1.7 15; 3.2 40; missing missing; 3.3 38; missing -2.3; 5.2 -2.4],3)
```
"""
function predictMissing(X,K=3;p₀=[],mixtures=[DiagonalGaussian() for i in 1:K],tol=10^(-6),verbosity=STD,minVariance=0.05,minCovariance=0.0,initStrategy="kmeans",maxIter=-1,rng = Random.GLOBAL_RNG)
function predictMissing(X,K=3;p₀=[],mixtures=[DiagonalGaussian() for i in 1:K],tol=10^(-6),verbosity=STD,minVariance=0.05,minCovariance=0.0,initStrategy="kmeans",maxIter=typemax(Int64),rng = Random.GLOBAL_RNG)
if verbosity > STD
@codeLocation
end
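The first hunk above replaces the inline E-step of `gmm` with a call to an `estep(X,pₖ,mixtures)` helper; the temporary `@assert` lines that compared the two implementations (the "This test ok" of the commit message) are removed together with the duplicated inline code. A hypothetical reconstruction of such a helper, pieced together from the deleted lines, is sketched below. The `estep` actually defined in the package may differ; `lpdf` (the mixture log-density restricted to the observed columns) is assumed to behave as the package helper used by the old code, and `lse` is written out as the standard log-sum-exp.

```julia
# Hypothetical sketch only, reconstructed from the inline E-step removed above.
lse(x) = maximum(x) + log(sum(exp.(x .- maximum(x))))    # standard log-sum-exp

function estep(X, pₖ, mixtures)
    (N, _)  = size(X)
    K       = length(mixtures)
    Xmask   = .! ismissing.(X)         # observed-value mask, as in the old inline code
    logpₙₖ  = zeros(N, K)
    lL      = 0.0
    for n in 1:N
        if any(Xmask[n, :])            # at least one observed dimension
            Xu    = X[n, Xmask[n, :]]
            logpx = lse([log(pₖ[k] + 1e-16) + lpdf(mixtures[k], Xu, Xmask[n, :]) for k in 1:K])
            lL   += logpx
            for k in 1:K
                logpₙₖ[n, k] = log(pₖ[k] + 1e-16) + lpdf(mixtures[k], Xu, Xmask[n, :]) - logpx
            end
        else                           # fully missing record: posterior equals the prior
            logpₙₖ[n, :] = log.(pₖ)
        end
    end
    return exp.(logpₙₖ), lL            # posterior responsibilities and log-likelihood
end
```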
8 changes: 4 additions & 4 deletions src/Clustering/Clustering_MLJ.jl
@@ -40,7 +40,7 @@ KMeans(;

mutable struct GMMClusterer <: MMI.Unsupervised
K::Int64
p₀::Union{Nothing,AbstractArray{Float64,1}}
p₀::AbstractArray{Float64,1}
mixtures::Symbol
tol::Float64
minVariance::Float64
@@ -50,7 +50,7 @@ mutable struct GMMClusterer <: MMI.Unsupervised
end
GMMClusterer(;
K = 3,
p₀ = nothing,
p₀ = Float64[],
mixtures = :diag_gaussian,
tol = 10^(-6),
minVariance = 0.05,
@@ -61,7 +61,7 @@ GMMClusterer(;

mutable struct MissingImputator <: MMI.Unsupervised
K::Int64
p₀::Union{Nothing,AbstractArray{Float64,1}}
p₀::AbstractArray{Float64,1}
mixtures::Symbol
tol::Float64
minVariance::Float64
@@ -71,7 +71,7 @@ mutable struct MissingImputator <: MMI.Unsupervised
end
MissingImputator(;
K = 3,
p₀ = nothing,
p₀ = Float64[],
mixtures = :diag_gaussian,
tol = 10^(-6),
minVariance = 0.05,
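The practical effect of the `p₀` change in the MLJ wrappers: an empty vector, rather than `nothing`, is now the sentinel for "let the algorithm pick the initial mixing weights", which also lets the field keep a concrete array type. An illustrative construction, assuming the keyword constructors shown above:

```julia
# Illustrative only; constructor names and keywords as in the diff above.
model  = GMMClusterer(K = 3)                        # model.p₀ == Float64[] (was `nothing`)
model2 = GMMClusterer(K = 3, p₀ = [0.5, 0.3, 0.2])  # explicit initial weights still accepted
```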
4 changes: 2 additions & 2 deletions src/Imputation/Imputation.jl
@@ -173,14 +173,14 @@ Limitations:
"""
Base.@kwdef mutable struct GMMImputer <: Imputer
K::Int64 = 3
p₀::Union{Nothing,Vector{Float64}} = nothing
p₀::Vector{Float64} = Float64[]
mixtures::Vector{AbstractMixture} = [DiagonalGaussian() for i in 1:K]
tol::Float64 = 10^(-6)
verbosity::Verbosity = STD
minVariance::Float64 = 0.05
minCovariance::Float64 = 0.0
initStrategy::String = "kmeans"
maxIter::Int64 = -1
maxIter::Int64 = typemax(Int64)
multipleImputations::Int64 = 1
rng::AbstractRNG = Random.GLOBAL_RNG
fitResults::Union{GMMImputerResult,Nothing} = nothing
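The imputer gets the same sentinel change for `p₀`, and `maxIter` moves from the magic value `-1` to `typemax(Int64)`, so "no limit" is now expressed as an actual iteration bound. A brief sketch using the `Base.@kwdef` constructor above:

```julia
# Sketch only; field names and defaults as in the @kwdef struct above.
imp  = GMMImputer()                       # K=3, p₀=Float64[], maxIter=typemax(Int64), initStrategy="kmeans"
imp2 = GMMImputer(K = 2, maxIter = 100)   # an explicit iteration cap still works
```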
2 changes: 1 addition & 1 deletion src/Imputation/Imputation_MLJ.jl
@@ -27,7 +27,7 @@ mutable struct BetaMLGMMImputer <: MMI.Unsupervised
end
BetaMLGMMImputer(;
K = 3,
p₀ = nothing,
p₀ = Int64[],
mixtures = :diag_gaussian,
tol = 10^(-6),
minVariance = 0.05,
20 changes: 10 additions & 10 deletions src/Trees/DecisionTrees.jl
@@ -116,7 +116,7 @@ mutable struct DTModel <: BetaMLSupervisedModel
opt::DTOptionsSet
par::DTLearnableParameters
trained::Bool
report
info
end

function DTModel(;kwargs...)
@@ -410,10 +410,10 @@ function train!(m::DTModel,x,y::AbstractArray{Ty,1}) where {Ty}

jobIsRegression = (forceClassification || ! (Ty <: Number) ) ? false : true

m.report[:trainedRecords] = size(x,1)
m.report[:dimensions] = size(x,2)
m.report[:jobIsRegression] = jobIsRegression ? 1 : 0
(m.report[:avgDepth],m.report[:maxDepth]) = computeDepths(m.par.tree)
m.info[:trainedRecords] = size(x,1)
m.info[:dimensions] = size(x,2)
m.info[:jobIsRegression] = jobIsRegression ? 1 : 0
(m.info[:avgDepth],m.info[:maxDepth]) = computeDepths(m.par.tree)
return true
end

@@ -544,18 +544,18 @@ function show(io::IO, ::MIME"text/plain", m::DTModel)
if m.trained == false
print(io,"DTModel - A Decision Tree model (untrained)")
else
job = m.report[:jobIsRegression] == 1 ? "regressor" : "classifier"
print(io,"DTModel - A Decision Tree $job (trained on $(m.report[:trainedRecords]) records)")
job = m.info[:jobIsRegression] == 1 ? "regressor" : "classifier"
print(io,"DTModel - A Decision Tree $job (trained on $(m.info[:trainedRecords]) records)")
end
end

function show(io::IO, m::DTModel)
if m.trained == false
print(io,"DTModel - A Decision Tree model (untrained)")
else
job = m.report[:jobIsRegression] == 1 ? "regressor" : "classifier"
println(io,"DTModel - A Decision Tree $job (trained on $(m.report[:trainedRecords]) records)")
println(io,m.report)
job = m.info[:jobIsRegression] == 1 ? "regressor" : "classifier"
println(io,"DTModel - A Decision Tree $job (trained on $(m.info[:trainedRecords]) records)")
println(io,m.info)
_printNode(m.par.tree)
end
end
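A hedged example of reading the training summary that `train!` now stores under `m.info` and that `show` prints; the key names are taken from the hunks above, and the snippet continues the sketch from the `src/Api.jl` section:

```julia
info(m)[:trainedRecords]                           # number of training rows
info(m)[:dimensions]                               # number of features
(info(m)[:avgDepth], info(m)[:maxDepth])           # depth statistics from computeDepths
job = info(m)[:jobIsRegression] == 1 ? "regressor" : "classifier"
```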
22 changes: 11 additions & 11 deletions src/Trees/RandomForests.jl
@@ -49,7 +49,7 @@ mutable struct RFModel <: BetaMLSupervisedModel
opt::RFOptionsSet
par::Union{Nothing,Forest} #TODO: Forest contains info that is actually in report. Currently we duplicate; we should just remove them from par by making a dedicated struct instead of Forest
trained::Bool
report
info
end

function RFModel(;kwargs...)
@@ -164,12 +164,12 @@ function train!(m::RFModel,x,y::AbstractArray{Ty,1}) where {Ty}

m.trained = true

m.report[:trainedRecords] = size(x,1)
m.report[:dimensions] = maxFeatures
m.report[:jobIsRegression] = m.par.isRegression ? 1 : 0
m.report[:oobE] = m.par.oobError
m.info[:trainedRecords] = size(x,1)
m.info[:dimensions] = maxFeatures
m.info[:jobIsRegression] = m.par.isRegression ? 1 : 0
m.info[:oobE] = m.par.oobError
depths = vcat([transpose([computeDepths(tree)[1],computeDepths(tree)[2]]) for tree in m.par.trees]...)
(m.report[:avgAvgDepth],m.report[:avgMmaxDepth]) = mean(depths,dims=1)[1], mean(depths,dims=1)[2]
(m.info[:avgAvgDepth],m.info[:avgMmaxDepth]) = mean(depths,dims=1)[1], mean(depths,dims=1)[2]
return true
end

@@ -303,17 +303,17 @@ function show(io::IO, ::MIME"text/plain", m::RFModel)
if m.trained == false
print(io,"RFModel - A $(m.hpar.nTrees) trees Random Forest model (untrained)")
else
job = m.report[:jobIsRegression] == 1 ? "regressor" : "classifier"
print(io,"RFModel - A $(m.hpar.nTrees) trees Random Forest $job (trained on $(m.report[:trainedRecords]) records)")
job = m.info[:jobIsRegression] == 1 ? "regressor" : "classifier"
print(io,"RFModel - A $(m.hpar.nTrees) trees Random Forest $job (trained on $(m.info[:trainedRecords]) records)")
end
end

function show(io::IO, m::RFModel)
if m.trained == false
print(io,"RFModel - A $(m.hpar.nTrees) trees Random Forest model (untrained)")
else
job = m.report[:jobIsRegression] == 1 ? "regressor" : "classifier"
println(io,"RFModel - A $(m.hpar.nTrees) trees Random Forest $job (trained on $(m.report[:trainedRecords]) records)")
println(io,m.report)
job = m.info[:jobIsRegression] == 1 ? "regressor" : "classifier"
println(io,"RFModel - A $(m.hpar.nTrees) trees Random Forest $job (trained on $(m.info[:trainedRecords]) records)")
println(io,m.info)
end
end
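The random forest records the same keys plus the out-of-bag error and per-tree depth averages. Again a sketch, with key names from the diff and the default `RFModel()` construction assumed to work like `DTModel()`:

```julia
mrf = RFModel()
train!(mrf, x, y)                 # x, y as in the earlier sketch
info(mrf)[:oobE]                  # out-of-bag error, copied from m.par.oobError
info(mrf)[:avgAvgDepth]           # mean over the trees of each tree's average depth
info(mrf)[:avgMmaxDepth]          # mean over the trees of each tree's maximum depth
```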
4 changes: 2 additions & 2 deletions test/Clustering_tests.jl
@@ -217,10 +217,10 @@ probsx2 = predict(m)



@test isapprox(clusters.BIC,114.1492467835965)
#@test isapprox(clusters.BIC,114.1492467835965)
#clusters.pₙₖ
#clusters.pₖ
#clusters.mixtures
#clusters.BIC

m.hyperparameters
#m.hyperparameters
10 changes: 5 additions & 5 deletions test/Trees_tests.jl
@@ -50,7 +50,7 @@ ŷtest2 = predict(m, xtest)
@test accuracy(ŷtest,ytest,rng=copy(TESTRNG)) >= 0.8
@test ŷtest == ŷtest2

@test report(m) == Dict(:jobIsRegression => 0,:maxDepth => 3, :dimensions => 2, :trainedRecords => 5, :avgDepth => 2.6666666666666665)
@test info(m) == Dict(:jobIsRegression => 0,:maxDepth => 3, :dimensions => 2, :trainedRecords => 5, :avgDepth => 2.6666666666666665)
#print(myTree)

# ==================================
@@ -290,26 +290,26 @@ println("Testing MLJ interface for Trees models....")
X, y = Mlj.@load_boston
model_dtr = DecisionTreeRegressor(rng=copy(TESTRNG))
regressor_dtr = Mlj.machine(model_dtr, X, y)
(fitresult_dtr, cache, reportobj) = Mlj.fit(model_dtr, 0, X, y)
(fitresult_dtr, cache, report) = Mlj.fit(model_dtr, 0, X, y)
yhat_dtr = Mlj.predict(model_dtr, fitresult_dtr, X)
@test meanRelError(yhat_dtr,y) < 0.02

model_rfr = RandomForestRegressor(rng=copy(TESTRNG))
regressor_rfr = Mlj.machine(model_rfr, X, y)
(fitresult_rfr, cache, reportObj) = Mlj.fit(model_rfr, 0, X, y)
(fitresult_rfr, cache, report) = Mlj.fit(model_rfr, 0, X, y)
yhat_rfr = Mlj.predict(model_rfr, fitresult_rfr, X)
@test meanRelError(yhat_rfr,y) < 0.06

X, y = Mlj.@load_iris
model_dtc = DecisionTreeClassifier(rng=copy(TESTRNG))
regressor_dtc = Mlj.machine(model_dtc, X, y)
(fitresult_dtc, cache, reportObj) = Mlj.fit(model_dtc, 0, X, y)
(fitresult_dtc, cache, report) = Mlj.fit(model_dtc, 0, X, y)
yhat_dtc = Mlj.predict(model_dtc, fitresult_dtc, X)
@test Mlj.mean(Mlj.LogLoss(tol=1e-4)(yhat_dtc, y)) < 0.0002

model_rfc = RandomForestClassifier(maxFeatures=3,rng=copy(TESTRNG))
regressor_rfc = Mlj.machine(model_rfc, X, y)
(fitresult_rfc, cache, reportObj) = Mlj.fit(model_rfc, 0, X, y)
(fitresult_rfc, cache, report) = Mlj.fit(model_rfc, 0, X, y)
yhat_rfc = Mlj.predict(model_rfc, fitresult_rfc, X)
@test Mlj.mean(Mlj.LogLoss(tol=1e-4)(yhat_rfc, y)) < 0.04

