diff --git a/Project.toml b/Project.toml index be6425bc..6e7a0e71 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "MLJBase" uuid = "a7f614a8-145f-11e9-1d2a-a57a1082229d" authors = ["Anthony D. Blaom "] -version = "0.5.0" +version = "0.5.1" [deps] CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" @@ -27,9 +27,10 @@ julia = "1" [extras] CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9" [targets] -test = ["CSV", "DataFrames", "LossFunctions", "Test", "TypedTables"] +test = ["CSV", "DataFrames", "Distances", "LossFunctions", "Test", "TypedTables"] diff --git a/src/MLJBase.jl b/src/MLJBase.jl index b4e9282e..c986f2d5 100644 --- a/src/MLJBase.jl +++ b/src/MLJBase.jl @@ -20,6 +20,8 @@ export selectrows, selectcols, select, nrows # data.jl export table, levels_seen, matrix, container_type # data.jl export partition, unpack # data.jl export @set_defaults # utilities.jl +export @mlj_model # mlj_model_macro.jl +export metadata_model, metadata_pkg # metadata_utilities export HANDLE_GIVEN_ID, @more, @constant # show.jl export color_on, color_off # show.jl export UnivariateFinite, average # distributions.jl @@ -50,6 +52,7 @@ export pdf, mode, median, mean, shuffle!, categorical, shuffle, levels, levels! 
export std import Base.== +import Base: @__doc__ using Tables using OrderedCollections # already a dependency of StatsBase @@ -83,7 +86,7 @@ const DEFAULT_SHOW_DEPTH = 0 include("utilities.jl") -## BASE TYPES +## BASE TYPES abstract type MLJType end include("equality.jl") # equality for MLJType objects @@ -116,7 +119,7 @@ abstract type UnsupervisedNetwork <: Unsupervised end ## THE MODEL INTERFACE # every model interface must implement a `fit` method of the form -# `fit(model, verbosity::Integer, training_args...) -> fitresult, cache, report` +# `fit(model, verbosity::Integer, training_args...) -> fitresult, cache, report` # or, one the simplified versions # `fit(model, training_args...) -> fitresult` # `fit(model, X, ys...) -> fitresult` @@ -169,14 +172,14 @@ clean!(model::Model) = "" ## TRAITS -""" +""" info(object) List the traits of an object, such as a model or a performance measure. """ -info(object) = info(object, Val(ScientificTypes.trait(object))) +info(object) = info(object, Val(ScientificTypes.trait(object))) include("model_traits.jl") @@ -199,6 +202,12 @@ include("datasets.jl") # importing CSV will also load datasets_requires.jl include("tasks.jl") include("measures.jl") +# mlj model macro to help define models +include("mlj_model_macro.jl") + +# metadata utils +include("metadata_utilities.jl") + # __init__() function: include("init.jl") diff --git a/src/data.jl b/src/data.jl index e2fcb23d..bdeac020 100644 --- a/src/data.jl +++ b/src/data.jl @@ -45,12 +45,12 @@ function partition(rows::AbstractVector{Int}, fractions...; shuffle::Bool=false, end """ - t1, t2, ...., tk = unnpack(table, c1, c2, ... ck; wrap_singles=false) + t1, t2, ...., tk = unpack(table, t1, t2, ... tk; wrap_singles=false) Split any Tables.jl compatible `table` into smaller tables (or vectors) `t1, t2, ..., tk` by making selections *without replacement* -from the column names defined by the conditionals `c1`, `c2`, ..., -`ck`. 
A *conditional* is any object `c` such that `c(name)` is `true` +from the column names defined by the tests `t1`, `t2`, ..., +`tk`. A *test* is any object `t` such that `t(name)` is `true` or `false` for each column `name::Symbol` of `table`. Whenever a returned table contains a single column, it is converted to @@ -59,7 +59,7 @@ a vector unless `wrap_singles=true`. Scientific type conversions can be optionally specified (note semicolon): - unpack(table, c...; wrap_singles=false, col1=>scitype1, col2=>scitype2, ... ) + unpack(table, t...; wrap_singles=false, col1=>scitype1, col2=>scitype2, ... ) ### Example @@ -82,7 +82,7 @@ julia> Z ``` """ -function unpack(X, conditionals...; wrap_singles=false, pairs...) +function unpack(X, tests...; wrap_singles=false, pairs...) if isempty(pairs) Xfixed = X @@ -94,7 +94,7 @@ function unpack(X, conditionals...; wrap_singles=false, pairs...) names_left = schema(Xfixed).names |> collect history = "" counter = 1 - for c in conditionals + for c in tests names = filter(c, names_left) filter!(!in(names), names_left) history *= "selection $counter: $names\n remaining: $names_left\n" diff --git a/src/distributions.jl b/src/distributions.jl index df353cdb..fdd7bc03 100644 --- a/src/distributions.jl +++ b/src/distributions.jl @@ -2,9 +2,11 @@ const Dist = Distributions - ## EQUALITY OF DISTRIBUTIONS (minor type piracy) +# TODO: We should get rid of this. I think it is used only in +# MLJModels/test. + function ==(d1::D, d2::D) where D<:Dist.Sampleable ret = true for fld in fieldnames(D) @@ -108,8 +110,10 @@ end function UnivariateFinite(classes::AbstractVector{L}, p::AbstractVector{<:Real}) where L - L <: CategoricalElement || error("classes must have CategoricalValue or "* - "CategoricalString type.") + L <: CategoricalElement || + error("`classes` must have type `AbstractVector{T}` where "* + "`T <: Union{CategoricalValue,CategoricalString}. "* + "Perhaps you have `T=Any`? 
"""
    isapprox(d1::UnivariateFinite, d2::UnivariateFinite; kwargs...)

Return `true` if and only if `d1` and `d2` have the same set of classes and
the probabilities they assign to each class are approximately equal. Return
`false` otherwise.

The keyword arguments `kwargs` are passed through to each call of `isapprox`
on a pair of probabilities.
"""
function Base.isapprox(d1::UnivariateFinite, d2::UnivariateFinite; kwargs...)
    classes1 = classes(d1)
    classes2 = classes(d2)

    # The class sets must agree in BOTH directions; checking only that the
    # classes of `d1` occur in `d2` would accept a `d2` with extra classes.
    all(in(classes2), classes1) || return false
    all(in(classes1), classes2) || return false

    # Every class is now known to both distributions, so `pdf` is safe to call.
    return all(c -> isapprox(pdf(d1, c), pdf(d2, c); kwargs...), classes1)
end

"""
    docstring_ext(T; descr="")

Return the standard docstring text for the model type `T`: the description
`descr`, followed by one line naming the package (with its URL), one line
showing how to load the model, and one line showing how to get its
documentation.
"""
function docstring_ext(T; descr::String="")
    pkg = MLJBase.package_name(T)
    url = MLJBase.package_url(T)
    model = MLJBase.name(T)
    return string(
        descr,
        "\n→ based on [$pkg]($url)",
        "\n→ do `@load $model` to use the model",
        "\n→ do `?$model` for documentation.",
    )
end
"""
    metadata_pkg(T; name="unknown", uuid="unknown", url="unknown",
                 julia=missing, license="unknown", is_wrapper=false)

Define the package-level trait methods (`package_name`, `package_uuid`,
`package_url`, `is_pure_julia`, `package_license`, `is_wrapper`) for the type
`T` and its subtypes, each returning the corresponding keyword value.

The methods are evaluated in the module in which this function is defined.
"""
function metadata_pkg(T; name::String="unknown", uuid::String="unknown", url::String="unknown",
                      julia::Union{Missing,Bool}=missing, license::String="unknown",
                      is_wrapper::Bool=false)
    # The keyword values are spliced in as literals; the names on the left are
    # plain symbols and so refer to module-level functions, not to the keywords.
    return @eval begin
        package_name(::Type{<:$T}) = $name
        package_uuid(::Type{<:$T}) = $uuid
        package_url(::Type{<:$T}) = $url
        is_pure_julia(::Type{<:$T}) = $julia
        package_license(::Type{<:$T}) = $license
        is_wrapper(::Type{<:$T}) = $is_wrapper
    end
end

"""
    metadata_model(T; input=Unknown, target=Unknown, output=Unknown,
                   weights=false, descr="", path="")

Define the model-level trait methods (`input_scitype`, `output_scitype`,
`target_scitype`, `supports_weights`, `docstring`, `load_path`) for the type
`T` and its subtypes. Complements [`metadata_pkg`](@ref).

If `path` is empty, the load path is built from `package_name(T)` and
`name(T)`, so `metadata_pkg` should normally be called first.
"""
function metadata_model(T; input=Unknown, target=Unknown,
                        output=Unknown, weights::Bool=false,
                        descr::String="", path::String="")
    # fall back to the conventional location when no explicit path is given
    load = isempty(path) ? "MLJModels.$(package_name(T))_.$(name(T))" : path
    return @eval begin
        input_scitype(::Type{<:$T}) = $input
        output_scitype(::Type{<:$T}) = $output
        target_scitype(::Type{<:$T}) = $target
        supports_weights(::Type{<:$T}) = $weights
        docstring(::Type{<:$T}) = docstring_ext($T, descr=$descr)
        load_path(::Type{<:$T}) = $load
    end
end
"""
    _process_model_def(ex)

Take an expression defining a model (`mutable struct Model ... end`) and
unpack the key elements needed to generate its keyword constructor and
cleaner.

Each field line may be written as `name`, `name::Type`, `name = default`,
`name::Type = default`, or either of the last two with a trailing constraint,
`default::(constraint)`.

Returns the tuple `(ex, modelname, params, defaults, constraints)` where:

- `ex` is the input expression, mutated in place so that every
  `field = default` line is reduced to its left-hand side (`name` or
  `name::Type`), making it a valid struct body;
- `modelname` is the name of the struct (without any supertype);
- `params` is the vector of field names, in declaration order;
- `defaults` maps each field name to its default value (or expression);
- `constraints` maps field names to their constraint expression, for those
  fields that declare one.
"""
function _process_model_def(ex)
    defaults = Dict{Symbol,Any}()
    constraints = Dict{Symbol,Any}()
    # `struct Foo` stores the bare name, `struct Foo <: Bar` stores `Foo <: Bar`
    modelname = ex.args[2] isa Symbol ? ex.args[2] : ex.args[2].args[1]
    params = Symbol[]
    fields = ex.args[3].args   # aliases the struct body, so writes mutate `ex`

    # inspect all lines which may define parameters, retrieve their names,
    # default values and constraints on values that can be given to them
    for i in eachindex(fields)
        line = fields[i]
        line isa LineNumberNode && continue

        # line without information (e.g. just a name "a")
        if line isa Symbol
            push!(params, line)
            defaults[line] = missing
            continue
        end

        # A meaningful line looks like `lhs = rhs` or `name::Type`, where
        # `lhs` is `name` or `name::Type` and `rhs` is `value` or
        # `value::constraint`.
        # 1. decompose the first argument (name and type)
        if line.args[1] isa Symbol        # `a = ...` or `a::Int`
            param = line.args[1]
            type = length(line.args) > 1 ? line.args[2] : :Any
        else                              # `a::Int = ...`
            param, type = line.args[1].args[1:2]
        end
        push!(params, param)

        # 2. decompose the default (value and constraint)
        if line.head == :(=)
            default = line.args[2]
            # Only `value::constraint` carries a constraint. Other expression
            # defaults (`[1, 2]`, `max(1, 2)`, ...) also have several
            # arguments and must be kept whole, hence the check on the head.
            if default isa Expr && default.head == :(::) && length(default.args) == 2
                constraints[param] = default.args[2]
                default = default.args[1]
            end
            defaults[param] = default
            fields[i] = line.args[1]   # name or name::Type (for the struct body)
        else
            # Simple heuristics when no default value is given for the field
            # but an "obvious" one can be provided implicitly.
            # NOTE: `eval` resolves the annotation in the module where this
            # function is defined, so a field type that is only visible in
            # the module calling the macro will not resolve here.
            eff_type = eval(type)
            if eff_type <: Number
                defaults[param] = zero(eff_type)
            elseif eff_type <: AbstractString
                defaults[param] = ""
            elseif eff_type == Any        # e.g. Any or no type given
                defaults[param] = missing
            elseif eff_type >: Nothing    # e.g. Union{Nothing, ...}
                defaults[param] = nothing
            elseif eff_type >: Missing    # e.g. Union{Missing, ...} (unlikely)
                defaults[param] = missing
            else
                @error "A default value for parameter '$param' (type '$type') must be given"
            end
        end
    end
    return ex, modelname, params, defaults, constraints
end
"""
    _unpack!(ex, rep)

Replace, in place, every occurrence of the placeholder symbols `_` and `arg`
in the expression `ex` by `rep`, and return `ex`. Anything that is not an
`Expr` is returned unchanged.

This turns a constraint written after a default value into an executable
condition. For instance, given the field definition

    alpha::Int = 0.5::(arg > 0.0)

the constraint `(arg > 0.0)` becomes `(alpha > 0.0)` when `rep` is `:alpha`.
"""
function _unpack!(ex::Expr, rep)
    slots = ex.args
    for idx in eachindex(slots)
        # substitute the placeholder, then descend into whatever now
        # occupies the slot
        slots[idx] in (:_, :arg) && (slots[idx] = rep)
        _unpack!(slots[idx], rep)
    end
    return ex
end
_unpack!(ex, _) = ex # leaves (symbols, literals, ...) need no further processing


"""
    _model_constructor(modelname, params, defaults)

Build the expression of the keyword constructor for a model: one keyword per
name in `params`, with the default taken from `defaults`. The generated
constructor instantiates the model with `new`, calls `clean!` on it, issues a
warning if `clean!` returns a non-empty message, and returns the model.
"""
function _model_constructor(modelname, params, defaults)
    keywords = [Expr(:kw, p, defaults[p]) for p in params]
    signature = Expr(:call, modelname, Expr(:parameters, keywords...))
    body = Expr(:block,
        Expr(:(=), :model, Expr(:call, :new, params...)),
        :(message = clean!(model)),
        :(isempty(message) || @warn message),
        :(return model),
    )
    return Expr(:function, signature, body)
end
"""
    _model_cleaner(modelname, defaults, constraints)

Build the expression of the `clean!` method for a model. For each entry of
`constraints` the generated method checks the condition on the model's field;
if it fails, the field is reset to its default and a message is appended to
the warning string that the method returns.
"""
function _model_cleaner(modelname, defaults, constraints)
    checks = Expr[]
    for (param, constr) in constraints
        # Rewrite the placeholder FIRST: `_unpack!` mutates `constr`, and the
        # message below is meant to show the rewritten condition, e.g.
        #     alpha::Real = 0.0::(arg > 0.0)
        # is reported as `model.alpha > 0.0`.
        condition = _unpack!(constr, :(model.$param))
        # action if the constraint is violated:
        # add a message and use the default for the parameter
        fallback = Expr(:block,
            :(warning *= $("Constraint `$constr` failed; using default: $param=$(defaults[param]).")),
            :(model.$param = $(defaults[param])),
        )
        push!(checks, Expr(:if, Expr(:call, :!, condition), fallback))
    end
    body = Expr(:block, :(warning = ""), checks..., :(return warning))
    return Expr(:function, :(clean!(model::$modelname)), body)
end

"""
    @mlj_model mutable struct Model ... end

Macro to help define MLJ models with defaults and constraints on their
parameters; it can be seen as a tweaked version of the `@with_kw` macro from
`Parameters`. It generates the struct with a keyword constructor, a `clean!`
method enforcing the constraints, and an `export` of the model name.
"""
macro mlj_model(ex)
    ex, modelname, params, defaults, constraints = _process_model_def(ex)
    # attach the keyword constructor to the struct body as an inner constructor
    push!(ex.args[3].args, _model_constructor(modelname, params, defaults))
    cleaner = _model_cleaner(modelname, defaults, constraints)
    generated = quote
        Base.@__doc__ $ex
        export $modelname
        $ex
        $cleaner
    end
    return esc(generated)
end
end @set_defaults Foo(1,2) - + julia> Foo() Foo(1, 2) @@ -76,12 +77,9 @@ function set_defaults_(mod, T_ex, values) eachindex(values)] program = quote - $T_ex(; $(equality_pair_exs...)) = + $T_ex(; $(equality_pair_exs...)) = $T_ex($(fields...)) end mod.eval(program) end - - - diff --git a/test/distributions.jl b/test/distributions.jl index 5887f898..49d91207 100644 --- a/test/distributions.jl +++ b/test/distributions.jl @@ -10,7 +10,7 @@ import Random.seed! seed!(1234) -## UNIVARIATE NOMINAL +## UNIVARIATE FINITE v = categorical(collect("asqfasqffqsaaaa"), ordered=true) a, s, q, f = v[1], v[2], v[3], v[4] @@ -68,6 +68,17 @@ d = Distributions.fit(UnivariateFinite, v) @test_throws ArgumentError pdf(d, 'j') @test_throws ArgumentError pdf(d, j) +@testset "approx for UnivariateFinite" begin + y = categorical(["yes", "no", "maybe"]) + yes = y[1] + no = y[2] + maybe = y[3] + @test(UnivariateFinite([yes, no, maybe], [0.1, 0.2, 0.7]) ≈ + UnivariateFinite([maybe, yes, no], [0.7, 0.1, 0.2])) + @test(!(UnivariateFinite([yes, no, maybe], [0.1, 0.2, 0.7]) ≈ + UnivariateFinite([maybe, yes, no], [0.7, 0.2, 0.1]))) +end + # arithmetic v = categorical(collect("abc")) a, b, c = v[1], v[2], v[3] diff --git a/test/metadata_utilities.jl b/test/metadata_utilities.jl new file mode 100644 index 00000000..9c117f22 --- /dev/null +++ b/test/metadata_utilities.jl @@ -0,0 +1,38 @@ +module TestMetadataUtils + +using MLJBase, Test + +@mlj_model mutable struct FooRegressor <: Deterministic + a::Int = 0::(_ ≥ 0) + b +end +metadata_pkg(FooRegressor, + name="FooRegressor", + uuid="10745b16-79ce-11e8-11f9-7d13ad32a3b2", + url="http://existentialcomics.com/", + julia=true, + license="MIT", + is_wrapper=false + ) +metadata_model(FooRegressor, + input=MLJBase.Table(Continuous), + target=AbstractVector{Continuous}, + descr="La di da") + +infos = info_dict(FooRegressor) + +@test infos[:input_scitype] == MLJBase.Table(Continuous) +@test infos[:target_scitype] == AbstractVector{Continuous} +@test 
infos[:is_pure_julia] +@test !infos[:is_wrapper] +@test infos[:docstring] == raw"""La di da + → based on [FooRegressor](http://existentialcomics.com/) + → do `@load FooRegressor` to use the model + → do `?FooRegressor` for documentation.""" +@test infos[:name] == "FooRegressor" + +@test infos[:is_supervised] +@test infos[:prediction_type] == :deterministic + +end +true diff --git a/test/mlj_model_macro.jl b/test/mlj_model_macro.jl new file mode 100644 index 00000000..b3d7b41e --- /dev/null +++ b/test/mlj_model_macro.jl @@ -0,0 +1,92 @@ +module TestMacroMLJ + +using MLJBase, Test, Distances + +# No type, no default +@mlj_model mutable struct A1 + a +end +a = A1() +@test ismissing(a.a) +a.a = 5 +@test a.a == 5 + +# No type, with default +@mlj_model mutable struct A1b + a = 5 +end +a = A1b() +@test a.a == 5 +a.a = "hello" +@test a.a == "hello" + +# If a type is given but no default value is given, then the macro tries to fill +# a default value; either 0 if it's a Number type, or an empty string and otherwise fails. 
+@mlj_model mutable struct A1c + a::Int +end +a = A1c() +@test a.a == 0 +a = A1c(a=7) +@test a.a == 7 +@test_throws InexactError A1c(a=5.3) +@test_throws MethodError A1c(a="hello") + +# Type is given and default is given +@mlj_model mutable struct A1d + a::Int = 5 +end +a = A1d() +@test a.a == 5 +a = A1d(a=7) +@test a.a == 7 + +# No type is given but a default and constraint +@mlj_model mutable struct A1e + a = 5::(_ > 0) +end +a = A1e() +@test a.a == 5 +a = A1e(a=7) +@test a.a == 7 +@test @test_logs (:warn, "Constraint `model.a > 0` failed; using default: a=5.") A1e(a=-1).a==5 +a = A1e(a=7.5) +@test a.a == 7.5 + +# Type is given with default and constraint +@mlj_model mutable struct A1f + a::Int = 5::(_ > 0) +end +a = A1f() +@test a.a == 5 +a = A1f(a=7) +@test a.a == 7 +@test_throws InexactError A1f(a=7.5) +@test @test_logs (:warn, "Constraint `model.a > 0` failed; using default: a=5.") A1f(a=-1).a==5 + +abstract type FooBar end +@mlj_model mutable struct B1a <: FooBar + a::Symbol = :auto::(_ in (:auto, :semi)) +end +b = B1a() +@test b.a == :auto +b = B1a(a=:semi) +@test b.a == :semi +@test @test_logs (:warn, "Constraint `model.a in (:auto, :semi)` failed; using default: a=:auto.") B1a(a=:autos).a == :auto +@test_throws MethodError B1a(b="blah") + +# == dependence on other types +@mlj_model mutable struct B1b + a::SemiMetric = Euclidean()::(_ isa Metric) +end +@test B1b().a isa Euclidean +@test @test_logs (:warn, "Constraint `model.a isa Metric` failed; using default: a=Euclidean().") B1b(a=BhattacharyyaDist()).a isa Euclidean + +@mlj_model mutable struct B1c + a::SemiMetric = Euclidean() +end +@test B1c().a isa Euclidean + + +end +true diff --git a/test/runtests.jl b/test/runtests.jl index a10ff422..486c971d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,9 +3,7 @@ # this test code be wrapped in a module. Any new module name will do - # eg, `module TestDatasets` for code testing `datasets.jl`. 
-# using Revise -using MLJBase -using Test +using MLJBase, Test @testset "equality" begin @test include("equality.jl") @@ -47,3 +45,10 @@ end @test include("loss_functions_interface.jl") end +@testset "@mlj_model" begin + @test include("mlj_model_macro.jl") +end + +@testset "metadatautils" begin + @test include("metadata_utilities.jl") +end