Changed V2 implementation and API, but problems with predictMissing remain
sylvaticus committed Jul 12, 2022
1 parent 81e1be3 commit eb56d29
Showing 6 changed files with 240 additions and 78 deletions.
6 changes: 3 additions & 3 deletions src/Api.jl
@@ -8,7 +8,7 @@ module Api

export BetaMLModel, BetaMLSupervisedModel, BetaMLUnsupervisedModel,
BetaMLOptionsSet, BetaMLHyperParametersSet, BetaMLLearnableParametersSet,
-       predict, fit, fit!, train!, partition, info
+       predict, fit, fit!, train!, partition, report

abstract type BetaMLModel end
abstract type BetaMLSupervisedModel <: BetaMLModel end
@@ -30,8 +30,8 @@ Predict new information (including transformation) based on a trained BetaMLModel
"""
predict(::BetaMLModel) = nothing

-function info(m::BetaMLModel)
-    return m.info
+function report(m::BetaMLModel)
+    return m.report
end

partition() = nothing
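For reference, a minimal sketch of the renamed accessor in use (the model instance is hypothetical; any concrete BetaMLModel with a `report` field works the same way):

    m = GMMClusterModel()    # a concrete model, defined later in this commit
    train!(m, rand(30,2))    # fit on some illustrative data
    report(m)                # returns the model's report Dict (formerly info(m))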
145 changes: 136 additions & 9 deletions src/Clustering/Clustering.jl
@@ -38,7 +38,7 @@ using ForceImport
@force using ..Api
@force using ..Utils

-export initRepresentatives, kmeans, kmedoids, gmm, predictMissing, AbstractMixture
+export initRepresentatives, kmeans, kmedoids, gmm, predictMissing, AbstractMixture, GMMClusterModel

abstract type AbstractMixture end
include("Mixtures.jl")
@@ -276,6 +276,36 @@ function kmedoids(X,K;dist=(x,y) -> norm(x-y),initStrategy="grid",Z₀=nothing,r
end


"""
estep(X,pₖ,mixtures)
E-step: assign the posterior probabilities p(j|xᵢ) and compute the log-likelihood of the parameters given the data (the latter only for informative purposes and for terminating the algorithm)
"""
function estep(X,pₖ,mixtures)
    (N,D)  = size(X)
    K      = length(mixtures)
    Xmask  = .! ismissing.(X)
    logpₙₖ = zeros(N,K)
    lL     = 0
    for n in 1:N
        if any(Xmask[n,:]) # at least one dimension of the record is observed
            Xu    = X[n,Xmask[n,:]]
            logpx = lse([log(pₖ[k] + 1e-16) + lpdf(mixtures[k],Xu,Xmask[n,:]) for k in 1:K])
            lL   += logpx
            for k in 1:K
                logpₙₖ[n,k] = log(pₖ[k] + 1e-16) + lpdf(mixtures[k],Xu,Xmask[n,:]) - logpx
            end
        else # the record is fully missing: the posterior is just the prior
            logpₙₖ[n,:] = log.(pₖ)
        end
    end
    pₙₖ = exp.(logpₙₖ)
    return (pₙₖ,lL)
end
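A hedged sketch of exercising the new helper on its own; it assumes the mixtures and priors come from a previous `gmm` run (so their parameters are already estimated) and that `gmm`'s returned named tuple exposes `pₖ` and `mixtures`, as `train!` below relies on:

    out = gmm([1 10.5; 1.5 0; 1.8 8; 1.7 15; 3.2 40; 0 0], 2, verbosity=NONE)
    pₙₖ, lL = estep([1 10.5; 1.5 0; 1.8 8], out.pₖ, out.mixtures)   # (N x K) posteriors and log-likelihood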



## The gmm algorithm (Lecture/segment 16.5 of https://www.edx.org/course/machine-learning-with-python-from-linear-models-to)

# no longer true with the numerical trick implemented
@@ -291,14 +321,14 @@ Implemented in the log-domain for better numerical accuracy with many dimensions
# Parameters:
* `X` : The (n x d) matrix of data to clusterise
* `K` : Number of clusters wanted
-* `p₀` : Initial probabilities of the categorical distribution (K x 1) [default: `nothing`]
+* `p₀` : Initial probabilities of the categorical distribution (K x 1) [default: `[]`]
* `mixtures`: An array (of length K) of the mixture to employ (see notes) [def: `[DiagonalGaussian() for i in 1:K]`]
* `tol`: Tolerance to stop the algorithm [default: 10^(-6)]
* `verbosity`: A verbosity parameter regulating the information messages frequency [def: `STD`]
* `minVariance`: Minimum variance for the mixtures [default: 0.05]
* `minCovariance`: Minimum covariance for the mixtures with full covariance matrix [default: 0]. This should be set differently from `minVariance` (see notes).
* `initStrategy`: Mixture initialisation algorithm [def: `kmeans`]
-* `maxIter`: Maximum number of iterations [def: `-1`, i.e. ∞]
+* `maxIter`: Maximum number of iterations [def: `typemax(Int64)`, i.e. ∞]
* `rng`: Random Number Generator (see [`FIXEDSEED`](@ref)) [default: `Random.GLOBAL_RNG`]
# Returns:
@@ -326,7 +356,7 @@ Implemented in the log-domain for better numerical accuracy with many dimensions
julia> clusters = gmm([1 10.5;1.5 0; 1.8 8; 1.7 15; 3.2 40; 0 0; 3.3 38; 0 -2.3; 5.2 -2.4],3,verbosity=HIGH)
```
"""
-function gmm(X,K;p₀=nothing,mixtures=[DiagonalGaussian() for i in 1:K],tol=10^(-6),verbosity=STD,minVariance=0.05,minCovariance=0.0,initStrategy="kmeans",maxIter=-1,rng = Random.GLOBAL_RNG)
+function gmm(X,K;p₀=Float64[],mixtures=[DiagonalGaussian() for i in 1:K],tol=10^(-6),verbosity=STD,minVariance=0.05,minCovariance=0.0,initStrategy="kmeans",maxIter=typemax(Int64),rng = Random.GLOBAL_RNG)
if verbosity > STD
@codeLocation
end
@@ -338,7 +368,7 @@ function gmm(X,K;p₀=nothing,mixtures=[DiagonalGaussian() for i in 1:K],tol=10^
# ---------
X = makeMatrix(X)
(N,D) = size(X)
-    pₖ = isnothing(p₀) ? fill(1/K,K) : p₀
+    pₖ = isempty(p₀)   ? fill(1/K,K) : p₀

# no longer true with the numerical trick implemented
#if (minVariance == minCovariance)
@@ -364,7 +394,7 @@ function gmm(X,K;p₀=nothing,mixtures=[DiagonalGaussian() for i in 1:K],tol=10^

# finding empty/non_empty values
Xmask = .! ismissing.(X)
-    #XdimCount = sum(Xmask, dims=2)


lL = -Inf
iter = 1
@@ -373,6 +403,7 @@ function gmm(X,K;p₀=nothing,mixtures=[DiagonalGaussian() for i in 1:K],tol=10^
# E Step: assigning the posterior prob p(j|xi) and computing the log-Likelihood of the parameters given the set of data
# (this last one for informative purposes and terminating the algorithm)
pₙₖlagged = copy(pₙₖ)

logpₙₖ = log.(pₙₖ)
lL = 0
for n in 1:N
@@ -402,6 +433,10 @@ function gmm(X,K;p₀=nothing,mixtures=[DiagonalGaussian() for i in 1:K],tol=10^
end
end
pₙₖ = exp.(logpₙₖ)

        # Sanity check: the refactored estep helper must reproduce the inline E-step exactly
        pₙₖ2, lL2 = estep(X,pₖ,mixtures)
        @assert pₙₖ == pₙₖ2
        @assert lL  == lL2

push!(ϵ,norm(pₙₖlagged - pₙₖ))

@@ -418,7 +453,7 @@ function gmm(X,K;p₀=nothing,mixtures=[DiagonalGaussian() for i in 1:K],tol=10^
end

        # Closing conditions. Note that the logLikelihood is the one computed before the update of μ and σ
-       if ((lL - oldlL) <= (tol * abs(lL))) || (maxIter > 0 && iter == maxIter)
+       if ((lL - oldlL) <= (tol * abs(lL))) || (iter >= maxIter)
npars = npar(mixtures) + (K-1)
#BIC = lL - (1/2) * npars * log(N)
BICv = bic(lL,npars,N)
@@ -456,7 +491,7 @@ Implemented in the log-domain for better numerical accuracy with many dimensions
# Parameters:
* `X` : A (N x D) matrix of data with missing entries to fill according to a GMM model
* `K` : Number of mixtures (latent classes) to consider [def: 3]
-* `p₀` : Initial probabilities of the categorical distribution (K x 1) [default: `nothing`]
+* `p₀` : Initial probabilities of the categorical distribution (K x 1) [default: `[]`]
* `mixtures`: An array (of length K) of the mixture to employ (see notes) [def: `[DiagonalGaussian() for i in 1:K]`]
* `tol`: Tolerance to stop the algorithm [default: 10^(-6)]
* `verbosity`: A verbosity parameter regulating the information messages frequency [def: `STD`]
@@ -485,7 +520,7 @@ Implemented in the log-domain for better numerical accuracy with many dimensions
julia> cFOut = predictMissing([1 10.5;1.5 missing; 1.8 8; 1.7 15; 3.2 40; missing missing; 3.3 38; missing -2.3; 5.2 -2.4],3)
```
"""
-function predictMissing(X,K=3;p₀=nothing,mixtures=[DiagonalGaussian() for i in 1:K],tol=10^(-6),verbosity=STD,minVariance=0.05,minCovariance=0.0,initStrategy="kmeans",maxIter=-1,rng = Random.GLOBAL_RNG)
+function predictMissing(X,K=3;p₀=[],mixtures=[DiagonalGaussian() for i in 1:K],tol=10^(-6),verbosity=STD,minVariance=0.05,minCovariance=0.0,initStrategy="kmeans",maxIter=typemax(Int64),rng = Random.GLOBAL_RNG)
if verbosity > STD
@codeLocation
end
@@ -514,6 +549,98 @@ function predictMissing(X,K=3;p₀=nothing,mixtures=[DiagonalGaussian() for i in
end



# API v2..
Base.@kwdef mutable struct GMMClusterHyperParametersSet <: BetaMLHyperParametersSet
    nClasses::Int64                   = 3
    probMixtures::Vector{Float64}     = []
    mixtures::Vector{AbstractMixture} = [DiagonalGaussian() for i in 1:nClasses]
    tol::Float64                      = 10^(-6)
    minVariance::Float64              = 0.05
    minCovariance::Float64            = 0.0
    initStrategy::String              = "kmeans"
    maxIter::Int64                    = typemax(Int64)
end

Base.@kwdef mutable struct GMMClusterOptionsSet <: BetaMLOptionsSet
    verbosity::Verbosity = STD
    rng                  = Random.GLOBAL_RNG
end


Base.@kwdef mutable struct GMMClusterLearnableParameters <: BetaMLLearnableParametersSet
    mixtures::Vector{AbstractMixture}           = []
    probMixtures::Vector{Float64}               = []
    probRecords::Union{Nothing,Matrix{Float64}} = nothing
end


mutable struct GMMClusterModel <: BetaMLUnsupervisedModel
    hpar::GMMClusterHyperParametersSet
    opt::GMMClusterOptionsSet
    par::GMMClusterLearnableParameters
    trained::Bool
    report
end

function GMMClusterModel(;kwargs...)
    m = GMMClusterModel(GMMClusterHyperParametersSet(),GMMClusterOptionsSet(),GMMClusterLearnableParameters(),false,Dict{Symbol,Any}())
    thisobjfields = fieldnames(typeof(m))
    # Route each keyword argument to the sub-object (hpar, opt, ...) owning a field with that name
    for (kw,kwv) in kwargs
        for f in thisobjfields
            fobj = getproperty(m,f)
            if kw in fieldnames(typeof(fobj))
                setproperty!(fobj,kw,kwv)
            end
        end
    end
    return m
end
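With the routing above, hyper-parameters and options can be mixed in a single call; a sketch:

    m = GMMClusterModel(nClasses=2, tol=1e-5, verbosity=HIGH)   # nClasses/tol → hpar, verbosity → opt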

function train!(m::GMMClusterModel,x)

    # Parameter aliases...
    K             = m.hpar.nClasses
    p₀            = m.hpar.probMixtures
    mixtures      = m.hpar.mixtures
    tol           = m.hpar.tol
    minVariance   = m.hpar.minVariance
    minCovariance = m.hpar.minCovariance
    initStrategy  = m.hpar.initStrategy
    maxIter       = m.hpar.maxIter
    verbosity     = m.opt.verbosity
    rng           = m.opt.rng

    if m.trained
        verbosity >= STD && @warn "Continuing training of a pre-trained model"
        gmmOut = gmm(x,K;p₀=m.par.probMixtures,mixtures=m.par.mixtures,tol=tol,verbosity=verbosity,minVariance=minVariance,minCovariance=minCovariance,initStrategy="given",maxIter=maxIter,rng = rng)
    else
        gmmOut = gmm(x,K;p₀=p₀,mixtures=mixtures,tol=tol,verbosity=verbosity,minVariance=minVariance,minCovariance=minCovariance,initStrategy=initStrategy,maxIter=maxIter,rng = rng)
    end
    m.par = GMMClusterLearnableParameters(mixtures = gmmOut.mixtures, probMixtures=makeColVector(gmmOut.pₖ), probRecords = gmmOut.pₙₖ)

    m.report[:error] = gmmOut.ϵ
    m.report[:lL]    = gmmOut.lL
    m.report[:BIC]   = gmmOut.BIC
    m.report[:AIC]   = gmmOut.AIC
    m.trained = true
    return true
end

function predict(m::GMMClusterModel)
    return m.par.probRecords
end

function predict(m::GMMClusterModel,X)
    X            = makeMatrix(X)
    mixtures     = m.par.mixtures
    probMixtures = m.par.probMixtures
    # A single E-step over the fitted mixtures yields the posterior
    # probabilities of the new records (rather than the training ones)
    (probRecords, lL) = estep(X,probMixtures,mixtures)
    return probRecords
end
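Putting the new v2 pieces together, a usage sketch on illustrative data (`report(m)` relies on the Api.jl accessor above):

    m = GMMClusterModel(nClasses=3, verbosity=NONE)
    train!(m, [1 10.5; 1.5 0; 1.8 8; 1.7 15; 3.2 40; 0 0; 3.3 38; 0 -2.3; 5.2 -2.4])
    probs = predict(m)    # (N x K) posterior probabilities of the training records
    report(m)[:BIC]       # fit statistics stored by train!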


# MLJ interface
include("Clustering_MLJ.jl")

50 changes: 25 additions & 25 deletions src/Trees/DecisionTrees.jl
@@ -112,11 +112,11 @@ end


mutable struct DTModel <: BetaMLSupervisedModel
-    hyperparameters::DTHyperParametersSet
-    options::DTOptionsSet
-    learnableparameters::DTLearnableParameters
+    hpar::DTHyperParametersSet
+    opt::DTOptionsSet
+    par::DTLearnableParameters
    trained::Bool
-    info
+    report
end

function DTModel(;kwargs...)
@@ -394,26 +394,26 @@ function train!(m::DTModel,x,y::AbstractArray{Ty,1}) where {Ty}
end

    # Setting default parameters that depend on the data...
-    maxDepth           = m.hyperparameters.maxDepth == nothing ? size(x,1) : m.hyperparameters.maxDepth
-    maxFeatures        = m.hyperparameters.maxFeatures == nothing ? size(x,2) : m.hyperparameters.maxFeatures
-    splittingCriterion = m.hyperparameters.splittingCriterion == nothing ? ( (Ty <: Number && !m.hyperparameters.forceClassification) ? variance : gini) : m.hyperparameters.splittingCriterion
+    maxDepth           = m.hpar.maxDepth == nothing ? size(x,1) : m.hpar.maxDepth
+    maxFeatures        = m.hpar.maxFeatures == nothing ? size(x,2) : m.hpar.maxFeatures
+    splittingCriterion = m.hpar.splittingCriterion == nothing ? ( (Ty <: Number && !m.hpar.forceClassification) ? variance : gini) : m.hpar.splittingCriterion
    # Setting shortcuts to other hyperparameters/options....
-    minGain             = m.hyperparameters.minGain
-    minRecords          = m.hyperparameters.minRecords
-    forceClassification = m.hyperparameters.forceClassification
-    rng                 = m.options.rng
-    verbosity           = m.options.verbosity
+    minGain             = m.hpar.minGain
+    minRecords          = m.hpar.minRecords
+    forceClassification = m.hpar.forceClassification
+    rng                 = m.opt.rng
+    verbosity           = m.opt.verbosity

-    m.learnableparameters.tree = buildTree(x, y; maxDepth = maxDepth, minGain=minGain, minRecords=minRecords, maxFeatures=maxFeatures, forceClassification=forceClassification, splittingCriterion = splittingCriterion, mCols=nothing, rng = rng)
+    m.par.tree = buildTree(x, y; maxDepth = maxDepth, minGain=minGain, minRecords=minRecords, maxFeatures=maxFeatures, forceClassification=forceClassification, splittingCriterion = splittingCriterion, mCols=nothing, rng = rng)

m.trained = true

jobIsRegression = (forceClassification || ! (Ty <: Number) ) ? false : true

-    m.info[:trainedRecords] = size(x,1)
-    m.info[:dimensions] = size(x,2)
-    m.info[:jobIsRegression] = jobIsRegression ? 1 : 0
-    (m.info[:avgDepth],m.info[:maxDepth]) = computeDepths(m.learnableparameters.tree)
+    m.report[:trainedRecords] = size(x,1)
+    m.report[:dimensions] = size(x,2)
+    m.report[:jobIsRegression] = jobIsRegression ? 1 : 0
+    (m.report[:avgDepth],m.report[:maxDepth]) = computeDepths(m.par.tree)
return true
end
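In the same spirit, a sketch of the renamed DTModel API on illustrative data:

    m = DTModel(maxDepth=3)
    train!(m, [1.0 2.0; 3.0 4.0; 5.0 6.0], ["a","b","a"])
    ŷ = predict(m, [1.0 2.0; 3.0 4.0])   # per-record class probabilities
    report(m)[:jobIsRegression]          # 0, i.e. trained as a classifier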

@@ -466,14 +466,14 @@ end

# API V2...
function predict(m::DTModel,x)
-    return predictSingle.(Ref(m.learnableparameters.tree),eachrow(x),rng=m.options.rng)
+    return predictSingle.(Ref(m.par.tree),eachrow(x),rng=m.opt.rng)
end

# ------------------------------------------------------------------------------
# OTHER (MODEL OPTIONAL PARTS, INFO, VISUALISATION,...)

function reset!(m::DTModel)
-    m.learnableparameters = DTLearnableParameters()
+    m.par = DTLearnableParameters()
m.trained = false
    # note that `report` is NOT reset
end
@@ -544,18 +544,18 @@ function show(io::IO, ::MIME"text/plain", m::DTModel)
if m.trained == false
print(io,"DTModel - A Decision Tree model (untrained)")
else
-        job = m.info[:jobIsRegression] == 1 ? "regressor" : "classifier"
-        print(io,"DTModel - A Decision Tree $job (trained on $(m.info[:trainedRecords]) records)")
+        job = m.report[:jobIsRegression] == 1 ? "regressor" : "classifier"
+        print(io,"DTModel - A Decision Tree $job (trained on $(m.report[:trainedRecords]) records)")
end
end

function show(io::IO, m::DTModel)
if m.trained == false
print(io,"DTModel - A Decision Tree model (untrained)")
else
-        job = m.info[:jobIsRegression] == 1 ? "regressor" : "classifier"
-        println(io,"DTModel - A Decision Tree $job (trained on $(m.info[:trainedRecords]) records)")
-        println(io,m.info)
-        _printNode(m.learnableparameters.tree)
+        job = m.report[:jobIsRegression] == 1 ? "regressor" : "classifier"
+        println(io,"DTModel - A Decision Tree $job (trained on $(m.report[:trainedRecords]) records)")
+        println(io,m.report)
+        _printNode(m.par.tree)
end
end