Commit
Merge pull request #18 from JuliaAI/bugfixes
fix issue #17
ablaom authored Jun 2, 2022
2 parents 70e29d9 + 2cc89da commit 4f13cc3
Showing 3 changed files with 36 additions and 12 deletions.
1 change: 1 addition & 0 deletions Project.toml
@@ -5,6 +5,7 @@ version = "0.2.0"

[deps]
MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
+SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
XGBoost = "009559a3-9522-5dbb-924b-0b6ed2b22bb9"

28 changes: 18 additions & 10 deletions src/MLJXGBoostInterface.jl
@@ -12,6 +12,7 @@ const MMI = MLJModelInterface
import Tables: schema

import XGBoost
+import SparseArrays

# helper for feature importances:
# XGBoost used "f
@@ -223,15 +224,20 @@ function MMI.clean!(model::XGBoostRegressor)
return warning
end

+# For `XGBoost.DMatrix(Xmatrix, y)`, `Xmatrix` must be either a Julia `Array` or
+# a `SparseMatrixCSC`, while `y` must be a `Vector`.
+_to_array(x::Union{Array, SparseArrays.SparseMatrixCSC}) = x
+_to_array(x::AbstractArray) = copyto!(similar(Array{eltype(x)}, axes(x)), x)

function MMI.fit(model::XGBoostRegressor
, verbosity::Int
, X
, y)

silent =
verbosity > 0 ? false : true
-Xmatrix = MMI.matrix(X)
-dm = XGBoost.DMatrix(Xmatrix,label=y)
+Xmatrix = _to_array(MMI.matrix(X))
+dm = XGBoost.DMatrix(Xmatrix, label=_to_array(y))

objective =
model.objective in ["linear", "gamma", "tweedie"] ?
@@ -259,7 +265,7 @@ end
function MMI.predict(model::XGBoostRegressor
, fitresult
, Xnew)
-Xmatrix = MMI.matrix(Xnew)
+Xmatrix = _to_array(MMI.matrix(Xnew))
return XGBoost.predict(fitresult, Xmatrix)
end

@@ -417,8 +423,8 @@ function MMI.fit(model::XGBoostCount

silent = verbosity > 0 ? false : true

-Xmatrix = MMI.matrix(X)
-dm = XGBoost.DMatrix(Xmatrix,label=y)
+Xmatrix = _to_array(MMI.matrix(X))
+dm = XGBoost.DMatrix(Xmatrix, label=_to_array(y))

seed =
model.seed == -1 ? generate_seed() : model.seed
@@ -439,7 +445,7 @@ end
function MMI.predict(model::XGBoostCount
, fitresult
, Xnew)
-Xmatrix = MMI.matrix(Xnew)
+Xmatrix = _to_array(MMI.matrix(Xnew))
return XGBoost.predict(fitresult, Xmatrix)
end

@@ -594,7 +600,7 @@ function MMI.fit(model::XGBoostClassifier
, verbosity::Int #> must be here even if unsupported in pkg
, X
, y)
-Xmatrix = MMI.matrix(X)
+Xmatrix = _to_array(MMI.matrix(X))

a_target_element = y[1] # a CategoricalValue or CategoricalString
num_class = length(MMI.classes(a_target_element))
@@ -608,14 +614,16 @@
eval_metric = "mlogloss"
end

-y_plain = MMI.int(y) .- 1 # integer relabeling should start at 0
+y_plain_ = MMI.int(y) .- 1 # integer relabeling should start at 0

if(num_class==2)
objective="binary:logistic"
-y_plain = convert(Array{Bool}, y_plain)
+y_plain_ = convert(Array{Bool}, y_plain_)
else
objective="multi:softprob"
end

+y_plain = _to_array(y_plain_)

silent =
verbosity > 0 ? false : true
@@ -655,7 +663,7 @@ function MMI.predict(model::XGBoostClassifier
decode = MMI.decoder(a_target_element)
classes = MMI.classes(a_target_element)

-Xmatrix = MMI.matrix(Xnew)
+Xmatrix = _to_array(MMI.matrix(Xnew))
XGBpredictions = XGBoost.predict(result, Xmatrix)

nlevels = length(classes)
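
Why these helpers matter: `XGBoost.DMatrix` accepts only a plain `Array` or a `SparseMatrixCSC`, so the patch routes every feature matrix and label vector through `_to_array`, which materializes any other `AbstractArray` (such as the `SubArray` behind issue #17) into a plain `Array` and passes already-acceptable inputs through untouched. A minimal, self-contained sketch of the dispatch (variable names are illustrative):

import SparseArrays

_to_array(x::Union{Array, SparseArrays.SparseMatrixCSC}) = x
_to_array(x::AbstractArray) = copyto!(similar(Array{eltype(x)}, axes(x)), x)

y = rand(10)
yview = @view y[1:5]           # a SubArray, not an Array
_to_array(y) === y             # true: already a plain Array, no copy made
_to_array(yview) isa Vector    # true: the view is copied into a fresh Vector

Note the `===` in the first check: inputs that `DMatrix` already accepts are returned as-is, so the fast path costs nothing.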
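
The classifier hunk also touches target relabeling: `MMI.int` produces 1-based integer codes for categorical values, while XGBoost expects class labels starting at 0 (and `Bool` labels for the binary `binary:logistic` objective), hence the `.- 1` and the `convert` to `Array{Bool}`. A rough illustration, assuming `MLJBase` is loaded to supply the implementation behind `int`:

using MLJBase, CategoricalArrays

yc = categorical(["a", "b", "a"])
codes = MLJBase.int(yc)   # [1, 2, 1] (1-based integer codes)
codes .- 1                # [0, 1, 0], the 0-based labels XGBoost expects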
19 changes: 17 additions & 2 deletions test/runtests.jl
@@ -63,9 +63,16 @@ Xtable = table(X)
α = 0.1
β = [-0.3, 0.2, -0.1]
λ = exp.(α .+ X * β)
-ycount = [rand(rng, Poisson(λᵢ)) for λᵢ ∈ λ]
+ycount_ = [rand(rng, Poisson(λᵢ)) for λᵢ ∈ λ]
+ycount = @view(ycount_[:]) # intention is to simulate issue #17

fitresultC, cacheC, reportC = MLJBase.fit(count_regressor, 0, Xtable, ycount);
+fitresultC_, cacheC_, reportC_ = MLJBase.fit(count_regressor, 0, Xtable, ycount_);
+# `cacheC` and `reportC` should be the same for both fits, but the
+# `fitresultC`s may differ, as they may hold different pointers to the same
+# information.
+@test cacheC == cacheC_
+@test reportC == reportC_
cpred = predict(count_regressor, fitresultC, Xtable);

importances = reportC.feature_importances
@@ -102,6 +109,15 @@ y = identity.(ycat) # make plain Vector with categ. elements
train, test = partition(eachindex(y), 0.6)
fitresult, cache, report = MLJBase.fit(plain_classifier, 0,
selectrows(X, train), y[train];)
+fitresult_, cache_, report_ = MLJBase.fit(
+    plain_classifier, 0, selectrows(X, train), @view(y[train]);
+) # mimic issue #17
+# `cache` and `report` should be the same for both fits, but the
+# `fitresult`s may differ, as they may hold different pointers to the same
+# information.
+@test cache == cache_
+@test report == report_

yhat = mode.(predict(plain_classifier, fitresult, selectrows(X, test)))
misclassification_rate = sum(yhat .!= y[test])/length(test)
@test misclassification_rate < 0.01
@@ -128,7 +144,6 @@ restored_fitresult = MLJBase.restore(plain_classifier,
@test predict_mode(plain_classifier, restored_fitresult, selectrows(X, test)) ==
yhat


## MACHINE INTEGRATION

# count regressor (`count_regressor`, `Xtable` and `ycount`
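
The pattern in both test blocks is the same: fit once with plain arrays, once with views, and check that `cache` and `report` agree. For reference, the failure mode behind issue #17 can be exercised outside the test suite with a sketch along these lines (the table and target are illustrative; `XGBoostRegressor` is the regressor type this package provides, and before this patch `XGBoost.DMatrix` would reject the `SubArray` produced by `@view`):

using MLJBase, MLJXGBoostInterface

X = (x1 = rand(100), x2 = rand(100))   # any Tables.jl-compatible table
y = rand(100)

model = XGBoostRegressor()
fit_plain = MLJBase.fit(model, 0, X, y)            # always worked
fit_view  = MLJBase.fit(model, 0, X, @view y[:])   # failed before this fix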
