
Commit 9a3b394
Changed V2API, some defaults on gmm/predictMissing, started v2api implementation of clusters

This test ok
sylvaticus committed Jul 13, 2022
1 parent eb56d29 commit 9a3b394
Showing 9 changed files with 42 additions and 88 deletions.
6 changes: 3 additions & 3 deletions src/Api.jl
@@ -8,7 +8,7 @@ module Api

export BetaMLModel, BetaMLSupervisedModel, BetaMLUnsupervisedModel,
BetaMLOptionsSet, BetaMLHyperParametersSet, BetaMLLearnableParametersSet,
predict, fit, fit!, train!, partition, report
predict, fit, fit!, train!, partition, info

abstract type BetaMLModel end
abstract type BetaMLSupervisedModel <: BetaMLModel end
@@ -30,8 +30,8 @@ Predict new information (including transformation) based on a trained BetaMLMode
"""
predict(::BetaMLModel) = nothing

function report(m::BetaMLModel)
return m.report
function info(m::BetaMLModel)
return m.info
end

partition() = nothing
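For reference, the renamed accessor in use. A minimal sketch, assuming `DTModel`, `train!` and `info` are all reachable from the top-level `BetaML` namespace at this commit; the data and the reported values are purely illustrative:

```julia
using BetaML                      # exact import path at this commit is an assumption

x = [1.0 10.5; 1.5 10.8; 1.8 8.0; 1.7 15.0; 3.2 40.0]
y = ["a", "a", "a", "b", "b"]

m = DTModel()                     # hyperparameters left at their defaults
train!(m, x, y)                   # training also fills m.info
info(m)                           # was report(m); returns a Dict such as
                                  # Dict(:trainedRecords => 5, :dimensions => 2, ...)
```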
54 changes: 4 additions & 50 deletions src/Clustering/Clustering.jl
@@ -393,51 +393,16 @@ function gmm(X,K;p₀=Float64[],mixtures=[DiagonalGaussian() for i in 1:K],tol=1
end

# finding empty/non_empty values
Xmask = .! ismissing.(X)
#Xmask = .! ismissing.(X)


lL = -Inf
iter = 1
while(true)
oldlL = lL
# E Step: assigning the posterior prob p(j|xi) and computing the log-Likelihood of the parameters given the set of data
# (this last one for informative purposes and terminating the algorithm)
pₙₖlagged = copy(pₙₖ)

logpₙₖ = log.(pₙₖ)
lL = 0
for n in 1:N
if any(Xmask[n,:]) # if at least one true
Xu = X[n,Xmask[n,:]]
#=if (length(ϵ) == 2)
println("here I am")
for m in mixtures[3:end]
println(m.μ)
println(m.σ²)
println(Xu)
println(Xmask[n,:])
lpdf(m,Xu,Xmask[n,:])
println("here I am partially")
end
println("here I am dead")
end=#
logpx = lse([log(pₖ[k] + 1e-16) + lpdf(mixtures[k],Xu,Xmask[n,:]) for k in 1:K])
lL += logpx
#px = sum([pⱼ[k]*normalFixedSd(Xu,μ[k,XMask[n,:]],σ²[k]) for k in 1:K])
#println(n)
for k in 1:K
logpₙₖ[n,k] = log(pₖ[k] + 1e-16)+lpdf(mixtures[k],Xu,Xmask[n,:])-logpx
end
else
logpₙₖ[n,:] = log.(pₖ)
end
end
pₙₖ = exp.(logpₙₖ)

pₙₖ2, lL2 = estep(X,pₖ,mixtures)
@assert pₙₖ == pₙₖ2
@assert lL == lL2

pₙₖ, lL = estep(X,pₖ,mixtures)
push!(ϵ,norm(pₙₖlagged - pₙₖ))

# M step: find parameters that maximise the likelihood
@@ -466,17 +431,6 @@
end # end while loop
end # end function

#using BenchmarkTools
#@benchmark clusters = emGMM([1 10.5;1.5 10.8; 1.8 8; 1.7 15; 3.2 40; 3.6 32; 3.3 38; 5.1 -2.3; 5.2 -2.4],3,msgStep=0)
#@benchmark clusters = emGMM([1 10.5;1.5 0; 1.8 8; 1.7 15; 3.2 40; 0 0; 3.3 38; 0 -2.3; 5.2 -2.4],3,msgStep=0,missingValue=0)
#@benchmark clusters = emGMM([1 10.5;1.5 0; 1.8 8; 1.7 15; 3.2 40; 0 0; 3.3 38; 0 -2.3; 5.2 -2.4],3,msgStep=0,missingValue=0)
#@benchmark clusters = emGMM([1 10.5;1.5 missing; 1.8 8; 1.7 15; 3.2 40; missing missing; 3.3 38; missing -2.3; 5.2 -2.4],3,msgStep=0)
#@code_warntype gmm([1 10.5;1.5 0; 1.8 8; 1.7 15; 3.2 40; 0 0; 3.3 38; 0 -2.3; 5.2 -2.4],3,msgStep=0,missingValue=0)
#using Profile
#Juno.@profiler (for i = 1:1000 gmm([1 10.5;1.5 0; 1.8 8; 1.7 15; 3.2 40; 0 0; 3.3 38; 0 -2.3; 5.2 -2.4],3,msgStep=0,missingValue=0) end)
#Profile.clear()
#Profile.print()

# - For mixtures with full covariance matrix (i.e. `FullGaussian(μ,σ²)`) the minCovariance should NOT be set equal to the minVariance, or if the covariance matrix goes too low, it will become singular and not invertible.
"""
predictMissing(X,K;p₀,mixtures,tol,verbosity,minVariance,minCovariance)
@@ -498,7 +452,7 @@ Implemented in the log-domain for better numerical accuracy with many dimensions
* `minVariance`: Minimum variance for the mixtures [default: 0.05]
* `minCovariance`: Minimum covariance for the mixtures with full covariance matrix [default: 0]. This should be set different than minVariance (see notes).
* `initStrategy`: Mixture initialisation algorithm [def: `grid`]
* `maxIter`: Maximum number of iterations [def: `-1`, i.e. ∞]
* `maxIter`: Maximum number of iterations [def: `typemax(Int64)`, i.e. ∞]
* `rng`: Random Number Generator (see [`FIXEDSEED`](@ref)) [default: `Random.GLOBAL_RNG`]
# Returns:
@@ -520,7 +474,7 @@ Implemented in the log-domain for better numerical accuracy with many dimensions
julia> cFOut = predictMissing([1 10.5;1.5 missing; 1.8 8; 1.7 15; 3.2 40; missing missing; 3.3 38; missing -2.3; 5.2 -2.4],3)
```
"""
function predictMissing(X,K=3;p₀=[],mixtures=[DiagonalGaussian() for i in 1:K],tol=10^(-6),verbosity=STD,minVariance=0.05,minCovariance=0.0,initStrategy="kmeans",maxIter=-1,rng = Random.GLOBAL_RNG)
function predictMissing(X,K=3;p₀=[],mixtures=[DiagonalGaussian() for i in 1:K],tol=10^(-6),verbosity=STD,minVariance=0.05,minCovariance=0.0,initStrategy="kmeans",maxIter=typemax(Int64),rng = Random.GLOBAL_RNG)
if verbosity > STD
@codeLocation
end
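The first hunk above replaces the inline E-step of `gmm` with a call to an `estep(X,pₖ,mixtures)` helper; the temporary `@assert` lines that compared the two implementations (the "This test ok" of the commit message) are removed together with the duplicated inline code. A hypothetical reconstruction of such a helper, pieced together from the deleted lines, is sketched below. The `estep` actually defined in the package may differ; `lpdf` (the mixture log-density restricted to the observed columns) is assumed to behave as the package helper used by the old code, and `lse` is written out as the standard log-sum-exp.

```julia
# Hypothetical sketch only, reconstructed from the inline E-step removed above.
lse(x) = maximum(x) + log(sum(exp.(x .- maximum(x))))    # standard log-sum-exp

function estep(X, pₖ, mixtures)
    (N, _)  = size(X)
    K       = length(mixtures)
    Xmask   = .! ismissing.(X)         # observed-value mask, as in the old inline code
    logpₙₖ  = zeros(N, K)
    lL      = 0.0
    for n in 1:N
        if any(Xmask[n, :])            # at least one observed dimension
            Xu    = X[n, Xmask[n, :]]
            logpx = lse([log(pₖ[k] + 1e-16) + lpdf(mixtures[k], Xu, Xmask[n, :]) for k in 1:K])
            lL   += logpx
            for k in 1:K
                logpₙₖ[n, k] = log(pₖ[k] + 1e-16) + lpdf(mixtures[k], Xu, Xmask[n, :]) - logpx
            end
        else                           # fully missing record: posterior equals the prior
            logpₙₖ[n, :] = log.(pₖ)
        end
    end
    return exp.(logpₙₖ), lL            # posterior responsibilities and log-likelihood
end
```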
8 changes: 4 additions & 4 deletions src/Clustering/Clustering_MLJ.jl
@@ -40,7 +40,7 @@ KMeans(;

mutable struct GMMClusterer <: MMI.Unsupervised
K::Int64
p₀::Union{Nothing,AbstractArray{Float64,1}}
p₀::AbstractArray{Float64,1}
mixtures::Symbol
tol::Float64
minVariance::Float64
@@ -50,7 +50,7 @@ mutable struct GMMClusterer <: MMI.Unsupervised
end
GMMClusterer(;
K = 3,
p₀ = nothing,
p₀ = Float64[],
mixtures = :diag_gaussian,
tol = 10^(-6),
minVariance = 0.05,
@@ -61,7 +61,7 @@ GMMClusterer(;

mutable struct MissingImputator <: MMI.Unsupervised
K::Int64
p₀::Union{Nothing,AbstractArray{Float64,1}}
p₀::AbstractArray{Float64,1}
mixtures::Symbol
tol::Float64
minVariance::Float64
@@ -71,7 +71,7 @@ mutable struct MissingImputator <: MMI.Unsupervised
end
MissingImputator(;
K = 3,
p₀ = nothing,
p₀ = Float64[],
mixtures = :diag_gaussian,
tol = 10^(-6),
minVariance = 0.05,
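The practical effect of the `p₀` change in the MLJ wrappers: an empty vector, rather than `nothing`, is now the sentinel for "let the algorithm pick the initial mixing weights", which also lets the field keep a concrete array type. An illustrative construction, assuming the keyword constructors shown above:

```julia
# Illustrative only; constructor names and keywords as in the diff above.
model  = GMMClusterer(K = 3)                        # model.p₀ == Float64[] (was `nothing`)
model2 = GMMClusterer(K = 3, p₀ = [0.5, 0.3, 0.2])  # explicit initial weights still accepted
```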
4 changes: 2 additions & 2 deletions src/Imputation/Imputation.jl
@@ -173,14 +173,14 @@ Limitations:
"""
Base.@kwdef mutable struct GMMImputer <: Imputer
K::Int64 = 3
p₀::Union{Nothing,Vector{Float64}} = nothing
p₀::Vector{Float64} = Float64[]
mixtures::Vector{AbstractMixture} = [DiagonalGaussian() for i in 1:K]
tol::Float64 = 10^(-6)
verbosity::Verbosity = STD
minVariance::Float64 = 0.05
minCovariance::Float64 = 0.0
initStrategy::String = "kmeans"
maxIter::Int64 = -1
maxIter::Int64 = typemax(Int64)
multipleImputations::Int64 = 1
rng::AbstractRNG = Random.GLOBAL_RNG
fitResults::Union{GMMImputerResult,Nothing} = nothing
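The imputer gets the same sentinel change for `p₀`, and `maxIter` moves from the magic value `-1` to `typemax(Int64)`, so "no limit" is now expressed as an actual iteration bound. A brief sketch using the `Base.@kwdef` constructor above:

```julia
# Sketch only; field names and defaults as in the @kwdef struct above.
imp  = GMMImputer()                       # K=3, p₀=Float64[], maxIter=typemax(Int64), initStrategy="kmeans"
imp2 = GMMImputer(K = 2, maxIter = 100)   # an explicit iteration cap still works
```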
2 changes: 1 addition & 1 deletion src/Imputation/Imputation_MLJ.jl
@@ -27,7 +27,7 @@ mutable struct BetaMLGMMImputer <: MMI.Unsupervised
end
BetaMLGMMImputer(;
K = 3,
p₀ = nothing,
p₀ = Int64[],
mixtures = :diag_gaussian,
tol = 10^(-6),
minVariance = 0.05,
20 changes: 10 additions & 10 deletions src/Trees/DecisionTrees.jl
@@ -116,7 +116,7 @@ mutable struct DTModel <: BetaMLSupervisedModel
opt::DTOptionsSet
par::DTLearnableParameters
trained::Bool
report
info
end

function DTModel(;kwargs...)
@@ -410,10 +410,10 @@ function train!(m::DTModel,x,y::AbstractArray{Ty,1}) where {Ty}

jobIsRegression = (forceClassification || ! (Ty <: Number) ) ? false : true

m.report[:trainedRecords] = size(x,1)
m.report[:dimensions] = size(x,2)
m.report[:jobIsRegression] = jobIsRegression ? 1 : 0
(m.report[:avgDepth],m.report[:maxDepth]) = computeDepths(m.par.tree)
m.info[:trainedRecords] = size(x,1)
m.info[:dimensions] = size(x,2)
m.info[:jobIsRegression] = jobIsRegression ? 1 : 0
(m.info[:avgDepth],m.info[:maxDepth]) = computeDepths(m.par.tree)
return true
end

@@ -544,18 +544,18 @@ function show(io::IO, ::MIME"text/plain", m::DTModel)
if m.trained == false
print(io,"DTModel - A Decision Tree model (untrained)")
else
job = m.report[:jobIsRegression] == 1 ? "regressor" : "classifier"
print(io,"DTModel - A Decision Tree $job (trained on $(m.report[:trainedRecords]) records)")
job = m.info[:jobIsRegression] == 1 ? "regressor" : "classifier"
print(io,"DTModel - A Decision Tree $job (trained on $(m.info[:trainedRecords]) records)")
end
end

function show(io::IO, m::DTModel)
if m.trained == false
print(io,"DTModel - A Decision Tree model (untrained)")
else
job = m.report[:jobIsRegression] == 1 ? "regressor" : "classifier"
println(io,"DTModel - A Decision Tree $job (trained on $(m.report[:trainedRecords]) records)")
println(io,m.report)
job = m.info[:jobIsRegression] == 1 ? "regressor" : "classifier"
println(io,"DTModel - A Decision Tree $job (trained on $(m.info[:trainedRecords]) records)")
println(io,m.info)
_printNode(m.par.tree)
end
end
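A hedged example of reading the training summary that `train!` now stores under `m.info` and that `show` prints; the key names are taken from the hunks above, and the snippet continues the sketch from the `src/Api.jl` section:

```julia
info(m)[:trainedRecords]                           # number of training rows
info(m)[:dimensions]                               # number of features
(info(m)[:avgDepth], info(m)[:maxDepth])           # depth statistics from computeDepths
job = info(m)[:jobIsRegression] == 1 ? "regressor" : "classifier"
```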
22 changes: 11 additions & 11 deletions src/Trees/RandomForests.jl
@@ -49,7 +49,7 @@ mutable struct RFModel <: BetaMLSupervisedModel
opt::RFOptionsSet
par::Union{Nothing,Forest} #TODO: Forest contains info that is actually in report. Currently we duplicate; we should just remove them from par by making a dedicated struct instead of Forest
trained::Bool
report
info
end

function RFModel(;kwargs...)
@@ -164,12 +164,12 @@ function train!(m::RFModel,x,y::AbstractArray{Ty,1}) where {Ty}

m.trained = true

m.report[:trainedRecords] = size(x,1)
m.report[:dimensions] = maxFeatures
m.report[:jobIsRegression] = m.par.isRegression ? 1 : 0
m.report[:oobE] = m.par.oobError
m.info[:trainedRecords] = size(x,1)
m.info[:dimensions] = maxFeatures
m.info[:jobIsRegression] = m.par.isRegression ? 1 : 0
m.info[:oobE] = m.par.oobError
depths = vcat([transpose([computeDepths(tree)[1],computeDepths(tree)[2]]) for tree in m.par.trees]...)
(m.report[:avgAvgDepth],m.report[:avgMmaxDepth]) = mean(depths,dims=1)[1], mean(depths,dims=1)[2]
(m.info[:avgAvgDepth],m.info[:avgMmaxDepth]) = mean(depths,dims=1)[1], mean(depths,dims=1)[2]
return true
end

@@ -303,17 +303,17 @@ function show(io::IO, ::MIME"text/plain", m::RFModel)
if m.trained == false
print(io,"RFModel - A $(m.hpar.nTrees) trees Random Forest model (untrained)")
else
job = m.report[:jobIsRegression] == 1 ? "regressor" : "classifier"
print(io,"RFModel - A $(m.hpar.nTrees) trees Random Forest $job (trained on $(m.report[:trainedRecords]) records)")
job = m.info[:jobIsRegression] == 1 ? "regressor" : "classifier"
print(io,"RFModel - A $(m.hpar.nTrees) trees Random Forest $job (trained on $(m.info[:trainedRecords]) records)")
end
end

function show(io::IO, m::RFModel)
if m.trained == false
print(io,"RFModel - A $(m.hpar.nTrees) trees Random Forest model (untrained)")
else
job = m.report[:jobIsRegression] == 1 ? "regressor" : "classifier"
println(io,"RFModel - A $(m.hpar.nTrees) trees Random Forest $job (trained on $(m.report[:trainedRecords]) records)")
println(io,m.report)
job = m.info[:jobIsRegression] == 1 ? "regressor" : "classifier"
println(io,"RFModel - A $(m.hpar.nTrees) trees Random Forest $job (trained on $(m.info[:trainedRecords]) records)")
println(io,m.info)
end
end
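The random forest records the same keys plus the out-of-bag error and per-tree depth averages. Again a sketch, with key names from the diff and the default `RFModel()` construction assumed to work like `DTModel()`:

```julia
mrf = RFModel()
train!(mrf, x, y)                 # x, y as in the earlier sketch
info(mrf)[:oobE]                  # out-of-bag error, copied from m.par.oobError
info(mrf)[:avgAvgDepth]           # mean over the trees of each tree's average depth
info(mrf)[:avgMmaxDepth]          # mean over the trees of each tree's maximum depth
```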
4 changes: 2 additions & 2 deletions test/Clustering_tests.jl
@@ -217,10 +217,10 @@ probsx2 = predict(m)



@test isapprox(clusters.BIC,114.1492467835965)
#@test isapprox(clusters.BIC,114.1492467835965)
#clusters.pₙₖ
#clusters.pₖ
#clusters.mixtures
#clusters.BIC

m.hyperparameters
#m.hyperparameters
10 changes: 5 additions & 5 deletions test/Trees_tests.jl
@@ -50,7 +50,7 @@ ŷtest2 = predict(m, xtest)
@test accuracy(ŷtest,ytest,rng=copy(TESTRNG)) >= 0.8
@test ŷtest == ŷtest2

@test report(m) == Dict(:jobIsRegression => 0,:maxDepth => 3, :dimensions => 2, :trainedRecords => 5, :avgDepth => 2.6666666666666665)
@test info(m) == Dict(:jobIsRegression => 0,:maxDepth => 3, :dimensions => 2, :trainedRecords => 5, :avgDepth => 2.6666666666666665)
#print(myTree)

# ==================================
@@ -290,26 +290,26 @@ println("Testing MLJ interface for Trees models....")
X, y = Mlj.@load_boston
model_dtr = DecisionTreeRegressor(rng=copy(TESTRNG))
regressor_dtr = Mlj.machine(model_dtr, X, y)
(fitresult_dtr, cache, reportobj) = Mlj.fit(model_dtr, 0, X, y)
(fitresult_dtr, cache, report) = Mlj.fit(model_dtr, 0, X, y)
yhat_dtr = Mlj.predict(model_dtr, fitresult_dtr, X)
@test meanRelError(yhat_dtr,y) < 0.02

model_rfr = RandomForestRegressor(rng=copy(TESTRNG))
regressor_rfr = Mlj.machine(model_rfr, X, y)
(fitresult_rfr, cache, reportObj) = Mlj.fit(model_rfr, 0, X, y)
(fitresult_rfr, cache, report) = Mlj.fit(model_rfr, 0, X, y)
yhat_rfr = Mlj.predict(model_rfr, fitresult_rfr, X)
@test meanRelError(yhat_rfr,y) < 0.06

X, y = Mlj.@load_iris
model_dtc = DecisionTreeClassifier(rng=copy(TESTRNG))
regressor_dtc = Mlj.machine(model_dtc, X, y)
(fitresult_dtc, cache, reportObj) = Mlj.fit(model_dtc, 0, X, y)
(fitresult_dtc, cache, report) = Mlj.fit(model_dtc, 0, X, y)
yhat_dtc = Mlj.predict(model_dtc, fitresult_dtc, X)
@test Mlj.mean(Mlj.LogLoss(tol=1e-4)(yhat_dtc, y)) < 0.0002

model_rfc = RandomForestClassifier(maxFeatures=3,rng=copy(TESTRNG))
regressor_rfc = Mlj.machine(model_rfc, X, y)
(fitresult_rfc, cache, reportObj) = Mlj.fit(model_rfc, 0, X, y)
(fitresult_rfc, cache, report) = Mlj.fit(model_rfc, 0, X, y)
yhat_rfc = Mlj.predict(model_rfc, fitresult_rfc, X)
@test Mlj.mean(Mlj.LogLoss(tol=1e-4)(yhat_rfc, y)) < 0.04

