Changed V2 implementation and API, but problems with predictMissing remain
sylvaticus committed Jul 12, 2022
1 parent 81e1be3 commit eb56d29
Showing 6 changed files with 240 additions and 78 deletions.
6 changes: 3 additions & 3 deletions src/Api.jl
@@ -8,7 +8,7 @@ module Api

export BetaMLModel, BetaMLSupervisedModel, BetaMLUnsupervisedModel,
BetaMLOptionsSet, BetaMLHyperParametersSet, BetaMLLearnableParametersSet,
-       predict, fit, fit!, train!, partition, info
+       predict, fit, fit!, train!, partition, report

abstract type BetaMLModel end
abstract type BetaMLSupervisedModel <: BetaMLModel end
@@ -30,8 +30,8 @@ Predict new information (including transformation) based on a trained BetaMLModel
"""
predict(::BetaMLModel) = nothing

-function info(m::BetaMLModel)
-    return m.info
+function report(m::BetaMLModel)
+    return m.report
end

partition() = nothing
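For reference, a minimal sketch of the renamed accessor in use (the model instance is hypothetical; any concrete BetaMLModel with a `report` field works the same way):

    m = GMMClusterModel()    # a concrete model, defined later in this commit
    train!(m, rand(30,2))    # fit on some illustrative data
    report(m)                # returns the model's report Dict (formerly info(m))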
145 changes: 136 additions & 9 deletions src/Clustering/Clustering.jl
@@ -38,7 +38,7 @@ using ForceImport
@force using ..Api
@force using ..Utils

-export initRepresentatives, kmeans, kmedoids, gmm, predictMissing, AbstractMixture
+export initRepresentatives, kmeans, kmedoids, gmm, predictMissing, AbstractMixture, GMMClusterModel

abstract type AbstractMixture end
include("Mixtures.jl")
@@ -276,6 +276,36 @@ function kmedoids(X,K;dist=(x,y) -> norm(x-y),initStrategy="grid",Z₀=nothing,r
end


"""
estep(X,pₖ,mixtures)
E-step: assign the posterior probabilities p(j|xᵢ) and compute the log-likelihood of the parameters given the data (the latter only for informative purposes and for terminating the algorithm)
"""
function estep(X,pₖ,mixtures)
    (N,D)  = size(X)
    K      = length(mixtures)
    Xmask  = .! ismissing.(X)
    logpₙₖ = zeros(N,K)
    lL     = 0
    for n in 1:N
        if any(Xmask[n,:]) # at least one dimension of the record is observed
            Xu    = X[n,Xmask[n,:]]
            logpx = lse([log(pₖ[k] + 1e-16) + lpdf(mixtures[k],Xu,Xmask[n,:]) for k in 1:K])
            lL   += logpx
            for k in 1:K
                logpₙₖ[n,k] = log(pₖ[k] + 1e-16) + lpdf(mixtures[k],Xu,Xmask[n,:]) - logpx
            end
        else # the record is fully missing: the posterior is just the prior
            logpₙₖ[n,:] = log.(pₖ)
        end
    end
    pₙₖ = exp.(logpₙₖ)
    return (pₙₖ,lL)
end
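A hedged sketch of exercising the new helper on its own; it assumes the mixtures and priors come from a previous `gmm` run (so their parameters are already estimated) and that `gmm`'s returned named tuple exposes `pₖ` and `mixtures`, as `train!` below relies on:

    out = gmm([1 10.5; 1.5 0; 1.8 8; 1.7 15; 3.2 40; 0 0], 2, verbosity=NONE)
    pₙₖ, lL = estep([1 10.5; 1.5 0; 1.8 8], out.pₖ, out.mixtures)   # (N x K) posteriors and log-likelihood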



## The gmm algorithm (Lecture/segment 16.5 of https://www.edx.org/course/machine-learning-with-python-from-linear-models-to)

# no longer true with the numerical trick implemented
@@ -291,14 +321,14 @@ Implemented in the log-domain for better numerical accuracy with many dimensions
# Parameters:
* `X` : The (n x d) matrix of data to clusterise
* `K` : Number of clusters wanted
-* `p₀` : Initial probabilities of the categorical distribution (K x 1) [default: `nothing`]
+* `p₀` : Initial probabilities of the categorical distribution (K x 1) [default: `[]`]
* `mixtures`: An array (of length K) of the mixture to employ (see notes) [def: `[DiagonalGaussian() for i in 1:K]`]
* `tol`: Tolerance to stop the algorithm [default: 10^(-6)]
* `verbosity`: A verbosity parameter regulating the information messages frequency [def: `STD`]
* `minVariance`: Minimum variance for the mixtures [default: 0.05]
* `minCovariance`: Minimum covariance for the mixtures with full covariance matrix [default: 0]. This should be set differently from `minVariance` (see notes).
* `initStrategy`: Mixture initialisation algorithm [def: `kmeans`]
-* `maxIter`: Maximum number of iterations [def: `-1`, i.e. ∞]
+* `maxIter`: Maximum number of iterations [def: `typemax(Int64)`, i.e. ∞]
* `rng`: Random Number Generator (see [`FIXEDSEED`](@ref)) [default: `Random.GLOBAL_RNG`]
# Returns:
@@ -326,7 +356,7 @@ Implemented in the log-domain for better numerical accuracy with many dimensions
julia> clusters = gmm([1 10.5;1.5 0; 1.8 8; 1.7 15; 3.2 40; 0 0; 3.3 38; 0 -2.3; 5.2 -2.4],3,verbosity=HIGH)
```
"""
-function gmm(X,K;p₀=nothing,mixtures=[DiagonalGaussian() for i in 1:K],tol=10^(-6),verbosity=STD,minVariance=0.05,minCovariance=0.0,initStrategy="kmeans",maxIter=-1,rng = Random.GLOBAL_RNG)
+function gmm(X,K;p₀=Float64[],mixtures=[DiagonalGaussian() for i in 1:K],tol=10^(-6),verbosity=STD,minVariance=0.05,minCovariance=0.0,initStrategy="kmeans",maxIter=typemax(Int64),rng = Random.GLOBAL_RNG)
if verbosity > STD
@codeLocation
end
@@ -338,7 +368,7 @@ function gmm(X,K;p₀=nothing,mixtures=[DiagonalGaussian() for i in 1:K],tol=10^
# ---------
X = makeMatrix(X)
(N,D) = size(X)
-    pₖ = isnothing(p₀) ? fill(1/K,K) : p₀
+    pₖ = isempty(p₀)   ? fill(1/K,K) : p₀

# no longer true with the numerical trick implemented
#if (minVariance == minCovariance)
@@ -364,7 +394,7 @@ function gmm(X,K;p₀=nothing,mixtures=[DiagonalGaussian() for i in 1:K],tol=10^

# finding empty/non_empty values
Xmask = .! ismissing.(X)
-    #XdimCount = sum(Xmask, dims=2)


lL = -Inf
iter = 1
@@ -373,6 +403,7 @@ function gmm(X,K;p₀=nothing,mixtures=[DiagonalGaussian() for i in 1:K],tol=10^
# E Step: assigning the posterior prob p(j|xi) and computing the log-Likelihood of the parameters given the set of data
# (this last one for informative purposes and terminating the algorithm)
pₙₖlagged = copy(pₙₖ)

logpₙₖ = log.(pₙₖ)
lL = 0
for n in 1:N
@@ -402,6 +433,10 @@ function gmm(X,K;p₀=nothing,mixtures=[DiagonalGaussian() for i in 1:K],tol=10^
end
end
pₙₖ = exp.(logpₙₖ)

        # Sanity check: the refactored estep helper must reproduce the inline E-step exactly
        pₙₖ2, lL2 = estep(X,pₖ,mixtures)
        @assert pₙₖ == pₙₖ2
        @assert lL  == lL2

push!(ϵ,norm(pₙₖlagged - pₙₖ))

@@ -418,7 +453,7 @@ function gmm(X,K;p₀=nothing,mixtures=[DiagonalGaussian() for i in 1:K],tol=10^
end

        # Closing conditions. Note that the logLikelihood is the one computed before the update of μ and σ
-       if ((lL - oldlL) <= (tol * abs(lL))) || (maxIter > 0 && iter == maxIter)
+       if ((lL - oldlL) <= (tol * abs(lL))) || (iter >= maxIter)
npars = npar(mixtures) + (K-1)
#BIC = lL - (1/2) * npars * log(N)
BICv = bic(lL,npars,N)
@@ -456,7 +491,7 @@ Implemented in the log-domain for better numerical accuracy with many dimensions
# Parameters:
* `X` : A (N x D) matrix of data with missing entries to fill according to a GMM model
* `K` : Number of mixtures (latent classes) to consider [def: 3]
-* `p₀` : Initial probabilities of the categorical distribution (K x 1) [default: `nothing`]
+* `p₀` : Initial probabilities of the categorical distribution (K x 1) [default: `[]`]
* `mixtures`: An array (of length K) of the mixture to employ (see notes) [def: `[DiagonalGaussian() for i in 1:K]`]
* `tol`: Tolerance to stop the algorithm [default: 10^(-6)]
* `verbosity`: A verbosity parameter regulating the information messages frequency [def: `STD`]
@@ -485,7 +520,7 @@ Implemented in the log-domain for better numerical accuracy with many dimensions
julia> cFOut = predictMissing([1 10.5;1.5 missing; 1.8 8; 1.7 15; 3.2 40; missing missing; 3.3 38; missing -2.3; 5.2 -2.4],3)
```
"""
-function predictMissing(X,K=3;p₀=nothing,mixtures=[DiagonalGaussian() for i in 1:K],tol=10^(-6),verbosity=STD,minVariance=0.05,minCovariance=0.0,initStrategy="kmeans",maxIter=-1,rng = Random.GLOBAL_RNG)
+function predictMissing(X,K=3;p₀=[],mixtures=[DiagonalGaussian() for i in 1:K],tol=10^(-6),verbosity=STD,minVariance=0.05,minCovariance=0.0,initStrategy="kmeans",maxIter=typemax(Int64),rng = Random.GLOBAL_RNG)
if verbosity > STD
@codeLocation
end
@@ -514,6 +549,98 @@ function predictMissing(X,K=3;p₀=nothing,mixtures=[DiagonalGaussian() for i in
end



# API v2..
Base.@kwdef mutable struct GMMClusterHyperParametersSet <: BetaMLHyperParametersSet
    nClasses::Int64                   = 3
    probMixtures::Vector{Float64}     = []
    mixtures::Vector{AbstractMixture} = [DiagonalGaussian() for i in 1:nClasses]
    tol::Float64                      = 10^(-6)
    minVariance::Float64              = 0.05
    minCovariance::Float64            = 0.0
    initStrategy::String              = "kmeans"
    maxIter::Int64                    = typemax(Int64)
end

Base.@kwdef mutable struct GMMClusterOptionsSet <: BetaMLOptionsSet
    verbosity::Verbosity = STD
    rng                  = Random.GLOBAL_RNG
end


Base.@kwdef mutable struct GMMClusterLearnableParameters <: BetaMLLearnableParametersSet
    mixtures::Vector{AbstractMixture}           = []
    probMixtures::Vector{Float64}               = []
    probRecords::Union{Nothing,Matrix{Float64}} = nothing
end


mutable struct GMMClusterModel <: BetaMLUnsupervisedModel
    hpar::GMMClusterHyperParametersSet
    opt::GMMClusterOptionsSet
    par::GMMClusterLearnableParameters
    trained::Bool
    report
end

function GMMClusterModel(;kwargs...)
    m = GMMClusterModel(GMMClusterHyperParametersSet(),GMMClusterOptionsSet(),GMMClusterLearnableParameters(),false,Dict{Symbol,Any}())
    thisobjfields = fieldnames(typeof(m))
    # Route each keyword argument to the sub-object (hpar, opt, ...) owning a field with that name
    for (kw,kwv) in kwargs
        for f in thisobjfields
            fobj = getproperty(m,f)
            if kw in fieldnames(typeof(fobj))
                setproperty!(fobj,kw,kwv)
            end
        end
    end
    return m
end
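With the routing above, hyper-parameters and options can be mixed in a single call; a sketch:

    m = GMMClusterModel(nClasses=2, tol=1e-5, verbosity=HIGH)   # nClasses/tol → hpar, verbosity → opt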

function train!(m::GMMClusterModel,x)

    # Parameter aliases...
    K             = m.hpar.nClasses
    p₀            = m.hpar.probMixtures
    mixtures      = m.hpar.mixtures
    tol           = m.hpar.tol
    minVariance   = m.hpar.minVariance
    minCovariance = m.hpar.minCovariance
    initStrategy  = m.hpar.initStrategy
    maxIter       = m.hpar.maxIter
    verbosity     = m.opt.verbosity
    rng           = m.opt.rng

    if m.trained
        verbosity >= STD && @warn "Continuing training of a pre-trained model"
        gmmOut = gmm(x,K;p₀=m.par.probMixtures,mixtures=m.par.mixtures,tol=tol,verbosity=verbosity,minVariance=minVariance,minCovariance=minCovariance,initStrategy="given",maxIter=maxIter,rng = rng)
    else
        gmmOut = gmm(x,K;p₀=p₀,mixtures=mixtures,tol=tol,verbosity=verbosity,minVariance=minVariance,minCovariance=minCovariance,initStrategy=initStrategy,maxIter=maxIter,rng = rng)
    end
    m.par = GMMClusterLearnableParameters(mixtures = gmmOut.mixtures, probMixtures=makeColVector(gmmOut.pₖ), probRecords = gmmOut.pₙₖ)

    m.report[:error] = gmmOut.ϵ
    m.report[:lL]    = gmmOut.lL
    m.report[:BIC]   = gmmOut.BIC
    m.report[:AIC]   = gmmOut.AIC
    m.trained = true
    return true
end

function predict(m::GMMClusterModel)
    return m.par.probRecords
end

function predict(m::GMMClusterModel,X)
    X            = makeMatrix(X)
    mixtures     = m.par.mixtures
    probMixtures = m.par.probMixtures
    # A single E-step over the fitted mixtures yields the posterior
    # probabilities of the new records (rather than the training ones)
    (probRecords, lL) = estep(X,probMixtures,mixtures)
    return probRecords
end
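Putting the new v2 pieces together, a usage sketch on illustrative data (`report(m)` relies on the Api.jl accessor above):

    m = GMMClusterModel(nClasses=3, verbosity=NONE)
    train!(m, [1 10.5; 1.5 0; 1.8 8; 1.7 15; 3.2 40; 0 0; 3.3 38; 0 -2.3; 5.2 -2.4])
    probs = predict(m)    # (N x K) posterior probabilities of the training records
    report(m)[:BIC]       # fit statistics stored by train!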


# MLJ interface
include("Clustering_MLJ.jl")

50 changes: 25 additions & 25 deletions src/Trees/DecisionTrees.jl
@@ -112,11 +112,11 @@ end


mutable struct DTModel <: BetaMLSupervisedModel
-    hyperparameters::DTHyperParametersSet
-    options::DTOptionsSet
-    learnableparameters::DTLearnableParameters
+    hpar::DTHyperParametersSet
+    opt::DTOptionsSet
+    par::DTLearnableParameters
    trained::Bool
-    info
+    report
end

function DTModel(;kwargs...)
@@ -394,26 +394,26 @@ function train!(m::DTModel,x,y::AbstractArray{Ty,1}) where {Ty}
end

    # Setting default parameters that depend on the data...
-    maxDepth           = m.hyperparameters.maxDepth == nothing ? size(x,1) : m.hyperparameters.maxDepth
-    maxFeatures        = m.hyperparameters.maxFeatures == nothing ? size(x,2) : m.hyperparameters.maxFeatures
-    splittingCriterion = m.hyperparameters.splittingCriterion == nothing ? ( (Ty <: Number && !m.hyperparameters.forceClassification) ? variance : gini) : m.hyperparameters.splittingCriterion
+    maxDepth           = m.hpar.maxDepth == nothing ? size(x,1) : m.hpar.maxDepth
+    maxFeatures        = m.hpar.maxFeatures == nothing ? size(x,2) : m.hpar.maxFeatures
+    splittingCriterion = m.hpar.splittingCriterion == nothing ? ( (Ty <: Number && !m.hpar.forceClassification) ? variance : gini) : m.hpar.splittingCriterion
    # Setting shortcuts to other hyperparameters/options....
-    minGain             = m.hyperparameters.minGain
-    minRecords          = m.hyperparameters.minRecords
-    forceClassification = m.hyperparameters.forceClassification
-    rng                 = m.options.rng
-    verbosity           = m.options.verbosity
+    minGain             = m.hpar.minGain
+    minRecords          = m.hpar.minRecords
+    forceClassification = m.hpar.forceClassification
+    rng                 = m.opt.rng
+    verbosity           = m.opt.verbosity

-    m.learnableparameters.tree = buildTree(x, y; maxDepth = maxDepth, minGain=minGain, minRecords=minRecords, maxFeatures=maxFeatures, forceClassification=forceClassification, splittingCriterion = splittingCriterion, mCols=nothing, rng = rng)
+    m.par.tree = buildTree(x, y; maxDepth = maxDepth, minGain=minGain, minRecords=minRecords, maxFeatures=maxFeatures, forceClassification=forceClassification, splittingCriterion = splittingCriterion, mCols=nothing, rng = rng)

m.trained = true

jobIsRegression = (forceClassification || ! (Ty <: Number) ) ? false : true

-    m.info[:trainedRecords] = size(x,1)
-    m.info[:dimensions] = size(x,2)
-    m.info[:jobIsRegression] = jobIsRegression ? 1 : 0
-    (m.info[:avgDepth],m.info[:maxDepth]) = computeDepths(m.learnableparameters.tree)
+    m.report[:trainedRecords] = size(x,1)
+    m.report[:dimensions] = size(x,2)
+    m.report[:jobIsRegression] = jobIsRegression ? 1 : 0
+    (m.report[:avgDepth],m.report[:maxDepth]) = computeDepths(m.par.tree)
return true
end
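In the same spirit, a sketch of the renamed DTModel API on illustrative data:

    m = DTModel(maxDepth=3)
    train!(m, [1.0 2.0; 3.0 4.0; 5.0 6.0], ["a","b","a"])
    ŷ = predict(m, [1.0 2.0; 3.0 4.0])   # per-record class probabilities
    report(m)[:jobIsRegression]          # 0, i.e. trained as a classifier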

@@ -466,14 +466,14 @@ end

# API V2...
function predict(m::DTModel,x)
-    return predictSingle.(Ref(m.learnableparameters.tree),eachrow(x),rng=m.options.rng)
+    return predictSingle.(Ref(m.par.tree),eachrow(x),rng=m.opt.rng)
end

# ------------------------------------------------------------------------------
# OTHER (MODEL OPTIONAL PARTS, INFO, VISUALISATION,...)

function reset!(m::DTModel)
-    m.learnableparameters = DTLearnableParameters()
+    m.par = DTLearnableParameters()
m.trained = false
    # note that `report` is NOT reset
end
@@ -544,18 +544,18 @@ function show(io::IO, ::MIME"text/plain", m::DTModel)
if m.trained == false
print(io,"DTModel - A Decision Tree model (untrained)")
else
-        job = m.info[:jobIsRegression] == 1 ? "regressor" : "classifier"
-        print(io,"DTModel - A Decision Tree $job (trained on $(m.info[:trainedRecords]) records)")
+        job = m.report[:jobIsRegression] == 1 ? "regressor" : "classifier"
+        print(io,"DTModel - A Decision Tree $job (trained on $(m.report[:trainedRecords]) records)")
end
end

function show(io::IO, m::DTModel)
if m.trained == false
print(io,"DTModel - A Decision Tree model (untrained)")
else
-        job = m.info[:jobIsRegression] == 1 ? "regressor" : "classifier"
-        println(io,"DTModel - A Decision Tree $job (trained on $(m.info[:trainedRecords]) records)")
-        println(io,m.info)
-        _printNode(m.learnableparameters.tree)
+        job = m.report[:jobIsRegression] == 1 ? "regressor" : "classifier"
+        println(io,"DTModel - A Decision Tree $job (trained on $(m.report[:trainedRecords]) records)")
+        println(io,m.report)
+        _printNode(m.par.tree)
end
end