Merge pull request #536 from alan-turing-institute/dev

For a 0.18.1 release
JuliaAI · Apr 19, 2021 · 773508f · 773508f
2 parents 8c87e4c + 7781275
commit 773508f
Show file tree

Hide file tree

Showing 8 changed files with 61 additions and 101 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "MLJBase"
 uuid = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
 authors = ["Anthony D. Blaom <[email protected]>"]
-version = "0.18.0"
+version = "0.18.1"
 
 [deps]
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
@@ -33,15 +33,15 @@ ComputationalResources = "^0.3"
 Distributions = "0.22, 0.23, 0.24"
 InvertedIndices = "^1"
 LossFunctions = "0.5, 0.6"
-MLJModelInterface = "^0.4.1"
+MLJModelInterface = "^0.4.1, 1.0"
 MLJScientificTypes = "^0.4.1"
 Missings = "^0.4"
 OrderedCollections = "^1.1"
 Parameters = "^0.12"
 PrettyTables = "^0.8,^0.9,^0.10,^0.11"
 ProgressMeter = "^1.3"
+StatisticalTraits = "^0.1.1, 1.0"
 StatsBase = "^0.32,^0.33"
-StatisticalTraits = "^0.1.1"
 Tables = "^0.2,^1.0"
 julia = "1"
 

diff --git a/src/composition/models/pipelines.jl b/src/composition/models/pipelines.jl
@@ -128,7 +128,7 @@ not_unsupervised_alert(v) =
 pipe_argument_error(v) =
     pipe_alert("Encountered `$v` where a "*
                "model instance, model type, function, "*
-               "or key-word assignement was expected. ")
+               "or key-word assignment was expected. ")
 
 function super_type(prediction_type::Symbol)
     if prediction_type == :deterministic

diff --git a/src/univariate_finite/methods.jl b/src/univariate_finite/methods.jl
@@ -64,15 +64,13 @@ CategoricalArrays.isordered(u::UnivariateFiniteArray) = isordered(classes(u))
 
 ## DISPLAY
 
+_round_prob(p) = p
+_round_prob(p::Union{Float32,Float64}) = round(p, sigdigits=3)
+
 function Base.show(stream::IO, d::UnivariateFinite)
     pairs = Distributions.params(d).probs
-    x1, p1 = pairs[1]
-    str = "UnivariateFinite{$(d.scitype)}($x1=>$(round(p1, sigdigits=3))"
-    for pair in pairs[2:end]
-        str *= ", $(pair[1])=>$(round(pair[2], sigdigits=3))"
-    end
-    str *= ")"
-    print(stream, str)
+    arg_str = join(["$(pr[1])=>$(_round_prob(pr[2]))" for pr in pairs], ", ")
+    print(stream, "UnivariateFinite{$(d.scitype)}($arg_str)")
 end
 
 show_prefix(u::UnivariateFiniteArray{S,V,R,P,1}) where {S,V,R,P} =
@@ -142,7 +140,7 @@ function Base.isapprox(d1::UnivariateFiniteArray,
     for c in support1
         c in support2 || return false
         isapprox(pdf.(d1, c), pdf.(d2, c); kwargs...) ||
-            return false 
+            return false
     end
     return true
 end
@@ -270,10 +268,11 @@ end
 """
     _cumulative(d::UnivariateFinite)
 
-Return the cumulative probability vector `[0, ..., 1]` for the
-distribution `d`, using only classes in the support of `d`, ordered
-according to the categorical elements used at instantiation of
-`d`. Used only to implement random sampling from `d`.
+Return the cumulative probability vector `C` for the distribution `d`,
+using only classes in the support of `d`, ordered according to the
+categorical elements used at instantiation of `d`. Used only to
+implement random sampling from `d`. We have `C[1] == 0` and `C[end] ==
+1`, assuming the probabilities have been normalized.
 
 """
 function _cumulative(d::UnivariateFinite{S,V,R,P}) where {S,V,R,P<:Real}
@@ -283,8 +282,7 @@ function _cumulative(d::UnivariateFinite{S,V,R,P}) where {S,V,R,P<:Real}
     K = length(p)
     p_cumulative = Array{P}(undef, K + 1)
     p_cumulative[1] = zero(P)
-    p_cumulative[K + 1] = one(P)
-    for i in 2:K
+    for i in 2:K + 1
         p_cumulative[i] = p_cumulative[i-1] + p[i-1]
     end
     return p_cumulative
@@ -294,13 +292,12 @@ end
 _rand(rng, p_cumulative, R)
 
 Randomly sample the distribution with discrete support `R(1):R(n)`
-which has cumulative probability vector `p_cumulative=[0, ..., 1]` (of
-length `n+1`). Does not check the first and last elements of
-`p_cumulative` but does not use them either.
+which has cumulative probability vector `p_cumulative` (see
+[`_cumulative`](@ref)).
 
 """
 function _rand(rng, p_cumulative, R)
-    real_sample = rand(rng)
+    real_sample = rand(rng)*p_cumulative[end]
     K = R(length(p_cumulative))
     index = K
     for i in R(2):R(K)

diff --git a/src/univariate_finite/types.jl b/src/univariate_finite/types.jl
@@ -7,11 +7,10 @@ const UnivariateFiniteSuper = Dist.Distribution{Dist.Univariate,NonEuclidean}
 # V - type of class labels (eg, Char in `categorical(['a', 'b'])`)
 # P - raw probability type
 # S - scitype of samples
-# L - raw type of labels, eg `Symbol` or `String`
 
 # Note that the keys of `prob_given_ref` need not exhaust all the
 # refs of all classes but will be ordered (LittleDicts preserve order)
-struct UnivariateFinite{S,V,R,P<:Real} <: UnivariateFiniteSuper
+struct UnivariateFinite{S,V,R,P} <: UnivariateFiniteSuper
     scitype::Type{S}
     decoder::CategoricalDecoder{V,R}
     prob_given_ref::LittleDict{R,P,Vector{R}, Vector{P}}
@@ -35,9 +34,11 @@ UnivariateFinite(a...; kwargs...) = MMI.UnivariateFinite(a...; kwargs...)
 
 ## CHECKS AND ERROR MESSAGES
 
-const Prob{P} = Union{P, AbstractArray{P}} where P <: Real
+# Checks that scalar probabilities lie in [0, 1], and that vector
+# probabilities sum to one, have now been dropped, except where
+# `augment=true` is specified.
 
-prob_error = ArgumentError("Probabilities must have `Real` type. ")
+const Prob{P} = Union{P, AbstractArray{P}} where P
 
 _err_01() = throw(DomainError("Probabilities must be in [0,1]."))
 _err_sum_1() = throw(DomainError(
@@ -65,7 +66,7 @@ function _check_pool(pool)
 end
 _check_probs_01(probs) =
     all(0 .<= probs .<= 1) || _err_01()
-_check_probs_sum(probs::Vector{<:Prob{P}}) where P<:Real =
+_check_probs_sum(probs::Vector{<:Prob{P}}) where P =
     all(x -> x≈one(P), sum(probs)) || _err_sum_1()
 _check_probs(probs) = (_check_probs_01(probs); _check_probs_sum(probs))
 _check_augmentable(support, probs) = _check_probs_01(probs) &&
@@ -124,8 +125,8 @@ function MMI.UnivariateFinite(
     # this constructor ignores kwargs
 
     probs = values(prob_given_class) |> collect
-    _check_probs_01.(probs)
-    _check_probs_sum(probs)
+#    _check_probs_01.(probs)
+#    _check_probs_sum(probs)
 
     # retrieve decoder and classes from element
     class1         = first(keys(prob_given_class))
@@ -134,20 +135,20 @@ function MMI.UnivariateFinite(
 
     # `LittleDict`s preserve order of keys, which we need for rand():
 
-    support  = keys(prob_given_class) |> collect |> sort
+    _support  = keys(prob_given_class) |> collect |> sort
 
-    issubset(support, parent_classes) ||
+    issubset(_support, parent_classes) ||
         error("Categorical elements are not from the same pool. ")
 
     pairs = [int(c) => prob_given_class[c]
-                for c in support]
+                for c in _support]
 
     probs1 = first(values(prob_given_class))
     S = scitype(class1)
-    if probs1 isa Real
-        return UnivariateFinite(S, parent_decoder, LittleDict(pairs...))
-    else
+    if  probs1 isa AbstractArray
         return UnivariateFiniteArray(S, parent_decoder, LittleDict(pairs...))
+    else
+        return UnivariateFinite(S, parent_decoder, LittleDict(pairs...))
     end
 end
 
@@ -188,14 +189,15 @@ end
 ## CONSTRUCTORS - FROM ARRAYS
 
 # example: _get_on_last(A, 4) = A[:, :, 4] if A has 3 dims:
-_get_on_last(probs::AbstractArray{<:Any,N}, i) where N = probs[fill(:,N-1)..., i]
+_get_on_last(probs::AbstractArray{<:Any,N}, i) where N =
+    probs[fill(:,N-1)..., i]
 
 # 1. Univariate Finite from a vector of classes or raw labels and
 # array of probs; first, a dispatcher:
 function MMI.UnivariateFinite(
     ::FI,
     support::AbstractVector,
-    probs::Union{AbstractArray,Real};
+    probs;
     kwargs...)
 
     if support isa AbstractArray{<:CategoricalValue}
@@ -215,13 +217,12 @@ function MMI.UnivariateFinite(
                              kwargs...)
 end
 
-# The core method, ultimately called by 1.0, 1.1, 1.2, 1.3 below, or
-# directly from the dispatcher 1. above
+# The core method, ultimately called by 1.0, 1.1, 1.2, 1.3 below.
 function _UnivariateFinite(support::AbstractVector{CategoricalValue{V,R}},
                            probs::AbstractArray{P},
                            N;
                            augment=false,
-                           kwargs...) where {V,R,P<:Real}
+                           kwargs...) where {V,R,P}
 
     unique(support) == support ||
         error("Non-unique vector of classes specified")
@@ -246,15 +247,15 @@ function _UnivariateFinite(support::AbstractVector{CategoricalValue{V,R}},
 end
 
 # 1.0 support does not consist of categorical elements:
-function _UnivariateFinite(support::AbstractVector{L},
-                           probs::AbstractArray{P},
+function _UnivariateFinite(support,
+                           probs::AbstractArray,
                            N;
                            augment=false,
                            pool=nothing,
-                           ordered=false) where {L,P<:Real}
+                           ordered=false)
 
-    # If we got here, then L<:CategoricalValue is not true, ie L is a
-    # raw label type
+    # If we got here, then the vector `support` is not
+    # `AbstractVector{<:CategoricalValue}`
 
     if pool === nothing || ismissing(pool)
         if pool === nothing
@@ -315,13 +316,13 @@ _UnivariateFinite(::Val{true},
 # 1.3 corner case, probs a scalar:
 _UnivariateFinite(::Val{true},
                   support::AbstractVector,
-                  probs::Real;
+                  probs;
                   kwargs...) =
                       UnivariateFinite(support, [probs,]; kwargs...)[1]
 
 # 2. probability only; unspecified support:
 function MMI.UnivariateFinite(::FI,
-                              probs::AbstractArray{<:Real,N};
+                              probs::AbstractArray{<:Any,N};
                               pool=nothing,
                               augment=false,
                               kwargs...) where N

diff --git a/test/openml.jl b/test/openml.jl
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -106,6 +106,3 @@ end
     @test include("hyperparam/one_dimensional_range_methods.jl")
 end
 
-# @testset "openml" begin
-#     @test include("openml.jl")
-# end
diff --git a/test/univariate_finite/arrays.jl b/test/univariate_finite/arrays.jl
@@ -17,14 +17,14 @@ c   = 3
     probs  = rand(rng, n)
     supp = ["class1", "class2"]
 
-    @test_throws DomainError UnivariateFinite(supp, probs, pool=missing)
+    # @test_throws DomainError UnivariateFinite(supp, probs, pool=missing)
     u = UnivariateFinite(supp, probs, pool=missing, augment=true)
     @test length(u) == n
     @test size(u) == (n,)
     @test pdf.(u, "class2") ≈ probs
 
     # autosupport:
-    @test_throws DomainError UnivariateFinite(probs, pool=missing)
+    # @test_throws DomainError UnivariateFinite(probs, pool=missing)
     u = UnivariateFinite(probs, pool=missing, augment=true)
     @test length(u) == n
     @test size(u) == (n,)

diff --git a/test/univariate_finite/methods.jl b/test/univariate_finite/methods.jl
@@ -14,7 +14,7 @@ V = categorical(collect("asqfasqffqsaaaa"))
 a, s, q, f = v[1], v[2], v[3], v[4]
 A, S, Q, F = V[1], V[2], V[3], V[4]
 
-@testset "UnivariateFinite constructor" begin
+@testset "set 1" begin
 
     # ordered (OrderedFactor)
     dict = Dict(s=>0.1, q=> 0.2, f=> 0.7)
@@ -53,13 +53,22 @@ A, S, Q, F = V[1], V[2], V[3], V[4]
     @test samples == [rand(rng, d) for i in 1:N]
 
     N = 10000
-    samples = rand(rng, d, N);
+    samples = rand(StableRNG(123), d, N);
     @test Set(samples) == Set(support(d))
     freq = Distributions.countmap(samples)
     @test isapprox(freq[f]/N, 0.7, atol=0.05)
     @test isapprox(freq[s]/N, 0.1, atol=0.05)
     @test isapprox(freq[q]/N, 0.2, atol=0.05)
 
+    # test unnormalized case gives same answer:
+    dd = UnivariateFinite(support(d), [70, 20, 10])
+    samples = rand(StableRNG(123), dd, N);
+    @test Set(samples) == Set(support(d))
+    ffreq = Distributions.countmap(samples)
+    @test isapprox(freq[f]/N, ffreq[f]/N)
+    @test isapprox(freq[s]/N, ffreq[s]/N)
+    @test isapprox(freq[q]/N, ffreq[q]/N)
+
     #
     # unordered (Multiclass):
     dict = Dict(S=>0.1, Q=> 0.2, F=> 0.7)
@@ -133,9 +142,9 @@ end
     @test pdf(d, 'q') ≈ 0.2
     @test_logs((:warn, r"No "),
                UnivariateFinite(['f', 'q', 's'],  [0.7, 0.2, 0.1]))
-    @test_throws(MethodError,
-                 UnivariateFinite(['f', 'q', 's'],  ["junk", 0.2, 0.1],
-                                  pool=missing))
+    # @test_throws(MethodError,
+    #              UnivariateFinite(['f', 'q', 's'],  ["junk", 0.2, 0.1],
+    #                               pool=missing))
     d = UnivariateFinite(['f', 'q', 's'],  [0.7, 0.2, 0.1], pool=missing)
     @test pdf(d, 'f') ≈ 0.7
     @test pdf(d, 's') ≈ 0.1