diff --git a/Project.toml b/Project.toml index 792ffb6a..80203bc0 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "MLJBase" uuid = "a7f614a8-145f-11e9-1d2a-a57a1082229d" authors = ["Anthony D. Blaom "] -version = "1.5.0" +version = "1.6" [deps] CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" @@ -47,7 +47,7 @@ DelimitedFiles = "1" Distributions = "0.25.3" InvertedIndices = "1" LearnAPI = "0.1" -MLJModelInterface = "1.10" +MLJModelInterface = "1.11" Missings = "0.4, 1" OrderedCollections = "1.1" Parameters = "0.12" @@ -58,7 +58,7 @@ Reexport = "1.2" ScientificTypes = "3" StatisticalMeasures = "0.1.1" StatisticalMeasuresBase = "0.1.1" -StatisticalTraits = "3.3" +StatisticalTraits = "3.4" Statistics = "1" StatsBase = "0.32, 0.33, 0.34" Tables = "0.2, 1.0" diff --git a/src/composition/learning_networks/nodes.jl b/src/composition/learning_networks/nodes.jl index c4508448..bbcd0351 100644 --- a/src/composition/learning_networks/nodes.jl +++ b/src/composition/learning_networks/nodes.jl @@ -277,7 +277,7 @@ function _formula(stream, X::Node, depth, indent) if X.machine !== nothing print(stream, crind(indent + length(operation_name) - anti)) printstyled(IOContext(stream, :color=>SHOW_COLOR[]), -# handle(X.machine), + #handle(X.machine), X.machine, bold=SHOW_COLOR[]) n_args == 0 || print(stream, ", ") diff --git a/src/composition/learning_networks/signatures.jl b/src/composition/learning_networks/signatures.jl index 7ffadb7c..84c88b27 100644 --- a/src/composition/learning_networks/signatures.jl +++ b/src/composition/learning_networks/signatures.jl @@ -307,7 +307,8 @@ See also [`MLJBase.Signature`](@ref). 
""" fitted_params_supplement(signature::Signature) = call_and_copy(fitted_params_nodes(signature)) -""" report(signature; supplement=true) +""" + report(signature; supplement=true) **Private method.** diff --git a/src/composition/models/pipelines.jl b/src/composition/models/pipelines.jl index 0ea85297..4ba1e65a 100644 --- a/src/composition/models/pipelines.jl +++ b/src/composition/models/pipelines.jl @@ -182,7 +182,7 @@ or what `transform` returns if it is `Unsupervised`. Names for the component fields are automatically generated unless explicitly specified, as in -``` +```julia Pipeline(encoder=ContinuousEncoder(drop_last=false), stand=Standardizer()) ``` @@ -225,6 +225,15 @@ implements it (some clustering models). Similarly, calling `transform` on a supervised pipeline calls `transform` on the supervised component. +### Transformers that need a target in training + +Some transformers that have type `Unsupervised` (so that the output of `transform` is +propagated in pipelines) may require a target variable for training. An example are +so-called target encoders (which transform categorical input features, based on some +target observations). Provided they appear before any `Supervised` component in the +pipelines, such models are supported. Of course a target must be provided whenever +training such a pipeline, whether or not it contains a `Supervised` component. + ### Optional key-word arguments - `prediction_type` - @@ -444,9 +453,13 @@ function extend(front::Front{Pred}, ::Static, name, cache, args...) Front(transform(mach, active(front)), front.transform, Pred()) end -function extend(front::Front{Trans}, component::Unsupervised, name, cache, args...) +function extend(front::Front{Trans}, component::Unsupervised, name, cache, ::Any, sources...) 
a = active(front) - mach = machine(name, a; cache=cache) + if target_in_fit(component) + mach = machine(name, a, first(sources); cache=cache) + else + mach = machine(name, a; cache=cache) + end Front(predict(mach, a), transform(mach, a), Trans()) end @@ -598,6 +611,7 @@ function MMI.iteration_parameter(pipe::SupervisedPipeline) end MMI.target_scitype(p::SupervisedPipeline) = target_scitype(supervised_component(p)) +MMI.target_in_fit(p::SomePipeline) = any(target_in_fit, components(p)) MMI.package_name(::Type{<:SomePipeline}) = "MLJBase" MMI.load_path(::Type{<:SomePipeline}) = "MLJBase.Pipeline" diff --git a/src/data/data.jl b/src/data/data.jl index d3242807..af6bf859 100644 --- a/src/data/data.jl +++ b/src/data/data.jl @@ -401,12 +401,18 @@ _isnan(x::Number) = isnan(x) skipnan(x) = Iterators.filter(!_isnan, x) +isinvalid(x) = ismissing(x) || _isnan(x) + """ skipinvalid(itr) Return an iterator over the elements in `itr` skipping `missing` and `NaN` values. Behaviour is similar to [`skipmissing`](@ref). +""" +skipinvalid(v) = v |> skipmissing |> skipnan + +""" skipinvalid(A, B) For vectors `A` and `B` of the same length, return a tuple of vectors @@ -417,10 +423,6 @@ always returns a vector. Does not remove `Missing` from the element types if present in the original iterators. """ -skipinvalid(v) = v |> skipmissing |> skipnan - -isinvalid(x) = ismissing(x) || _isnan(x) - function skipinvalid(yhat, y) mask = .!(isinvalid.(yhat) .| isinvalid.(y)) return yhat[mask], y[mask] diff --git a/src/data/datasets.jl b/src/data/datasets.jl index ba4d88db..929a41f9 100644 --- a/src/data/datasets.jl +++ b/src/data/datasets.jl @@ -199,7 +199,7 @@ function load_smarket() end """Load a well-known sunspot time series (table with one column). 
-[https://www.sws.bom.gov.au/Educational/2/3/6]](https://www.sws.bom.gov.au/Educational/2/3/6) + """ load_sunspots() = load_dataset("sunspots.csv", COERCE_SUNSPOTS) @@ -250,9 +250,10 @@ macro load_crabs() end end -""" Load S&P Stock Market dataset, as used in (An Introduction to -Statistical Learning with applications in -R)[https://rdrr.io/cran/ISLR/man/Smarket.html](https://rdrr.io/cran/ISLR/man/Smarket.html), +""" +Load S&P Stock Market dataset, as used in +[An Introduction to Statistical Learning with applications in +R](https://rdrr.io/cran/ISLR/man/Smarket.html), by Witten et al (2013), Springer-Verlag, New York.""" macro load_smarket() quote diff --git a/src/data/datasets_synthetic.jl b/src/data/datasets_synthetic.jl index d1a8830e..eb442a9f 100644 --- a/src/data/datasets_synthetic.jl +++ b/src/data/datasets_synthetic.jl @@ -21,12 +21,12 @@ Internal function to finalize the `make_*` functions. function finalize_Xy(X, y, shuffle, as_table, eltype, rng; clf::Bool=true) # Shuffle the rows if required if shuffle - X, y = shuffle_rows(X, y; rng=rng) - end - if eltype != Float64 - X = convert.(eltype, X) - end - # return as matrix if as_table=false + X, y = shuffle_rows(X, y; rng=rng) + end + if eltype != Float64 + X = convert.(eltype, X) + end + # return as matrix if as_table=false as_table || return X, y clf && return MLJBase.table(X), categorical(y) if length(size(y)) > 1 @@ -172,7 +172,6 @@ membership to the smaller or larger circle, respectively. * `noise=0`: standard deviation of the Gaussian noise added to the data, * `factor=0.8`: ratio of the smaller radius over the larger one, - $(EXTRA_KW_MAKE*EXTRA_CLASSIFICATION) ### Example @@ -318,7 +317,12 @@ Make portion `s` of vector `θ` exactly 0. """ sparsify!(rng, θ, s) = (θ .*= (rand(rng, length(θ)) .< s)) -"""Add outliers to portion s of vector.""" +""" + outlify!(rng, y, s) + +Add outliers to portion `s` of vector `y`. 
+ +""" outlify!(rng, y, s) = (n = length(y); y .+= 20 * randn(rng, n) .* (rand(rng, n) .< s)) @@ -329,7 +333,7 @@ const SIGMOID_32 = log(Float32(1)/eps(Float32) - Float32(1)) sigmoid(x) Return the sigmoid computed in a numerically stable way: -``σ(x) = 1/(1+exp(-x))`` +``σ(x) = 1/(1+\\exp(-x))`` """ function sigmoid(x::Float64) diff --git a/src/hyperparam/one_dimensional_range_methods.jl b/src/hyperparam/one_dimensional_range_methods.jl index dc82d03a..b187252e 100644 --- a/src/hyperparam/one_dimensional_range_methods.jl +++ b/src/hyperparam/one_dimensional_range_methods.jl @@ -66,31 +66,31 @@ In the first case iteration is over all `values` stored in the range iteration is over approximately `n` ordered values, generated as follows: -(i) First, exactly `n` values are generated between `U` and `L`, with a -spacing determined by `r.scale` (uniform if `scale=:linear`) where `U` -and `L` are given by the following table: - -| `r.lower` | `r.upper` | `L` | `U` | -|-------------|------------|---------------------|---------------------| -| finite | finite | `r.lower` | `r.upper` | -| `-Inf` | finite | `r.upper - 2r.unit` | `r.upper` | -| finite | `Inf` | `r.lower` | `r.lower + 2r.unit` | -| `-Inf` | `Inf` | `r.origin - r.unit` | `r.origin + r.unit` | - -(ii) If a callable `f` is provided as `scale`, then a uniform spacing -is always applied in (i) but `f` is broadcast over the results. (Unlike -ordinary scales, this alters the effective range of values generated, -instead of just altering the spacing.) - -(iii) If `r` is a discrete numeric range (`r isa NumericRange{<:Integer}`) -then the values are additionally rounded, with any duplicate values -removed. Otherwise all the values are used (and there are exacltly `n` -of them). 
- -(iv) Finally, if a random number generator `rng` is specified, then the values are -returned in random order (sampling without replacement), and otherwise -they are returned in numeric order, or in the order provided to the -range constructor, in the case of a `NominalRange`. +1. First, exactly `n` values are generated between `U` and `L`, with a + spacing determined by `r.scale` (uniform if `scale=:linear`) where `U` + and `L` are given by the following table: + + | `r.lower` | `r.upper` | `L` | `U` | + |-------------|------------|---------------------|---------------------| + | finite | finite | `r.lower` | `r.upper` | + | `-Inf` | finite | `r.upper - 2r.unit` | `r.upper` | + | finite | `Inf` | `r.lower` | `r.lower + 2r.unit` | + | `-Inf` | `Inf` | `r.origin - r.unit` | `r.origin + r.unit` | + +2. If a callable `f` is provided as `scale`, then a uniform spacing + is always applied in (1) but `f` is broadcast over the results. (Unlike + ordinary scales, this alters the effective range of values generated, + instead of just altering the spacing.) + +3. If `r` is a discrete numeric range (`r isa NumericRange{<:Integer}`) + then the values are additionally rounded, with any duplicate values + removed. Otherwise all the values are used (and there are exactly `n` + of them). + +4. Finally, if a random number generator `rng` is specified, then the values are + returned in random order (sampling without replacement), and otherwise + they are returned in numeric order, or in the order provided to the + range constructor, in the case of a `NominalRange`. """ iterator(rng::AbstractRNG, r::ParamRange, args...) = diff --git a/src/machines.jl b/src/machines.jl index 8f5aa438..a7473642 100644 --- a/src/machines.jl +++ b/src/machines.jl @@ -529,7 +529,7 @@ err_missing_model(model) = ErrorException( ) """ - last_model(mach::Machine) + last_model(mach::Machine) Return the last model used to train the machine `mach`. This is a bona fide model, even if `mach.model` is a symbol. 
@@ -572,31 +572,31 @@ the true model given by `getproperty(composite, model)`. See also [`machine`](@r For the action to be a no-operation, either `mach.frozen == true` or or none of the following apply: -- (i) `mach` has never been trained (`mach.state == 0`). +1. `mach` has never been trained (`mach.state == 0`). -- (ii) `force == true`. +2. `force == true`. -- (iii) The `state` of some other machine on which `mach` depends has - changed since the last time `mach` was trained (ie, the last time - `mach.state` was last incremented). +3. The `state` of some other machine on which `mach` depends has + changed since the last time `mach` was trained (ie, the last time + `mach.state` was last incremented). -- (iv) The specified `rows` have changed since the last retraining and - `mach.model` does not have `Static` type. +4. The specified `rows` have changed since the last retraining and + `mach.model` does not have `Static` type. -- (v) `mach.model` is a model and different from the last model used for training, but has - the same type. +5. `mach.model` is a model and different from the last model used for training, but has + the same type. -- (vi) `mach.model` is a model but has a type different from the last model used for - training. +6. `mach.model` is a model but has a type different from the last model used for + training. -- (vii) `mach.model` is a symbol and `(composite, mach.model)` is different from the last - model used for training, but has the same type. +7. `mach.model` is a symbol and `(composite, mach.model)` is different from the last + model used for training, but has the same type. -- (viii) `mach.model` is a symbol and `(composite, mach.model)` has a different type from - the last model used for training. +8. `mach.model` is a symbol and `(composite, mach.model)` has a different type from + the last model used for training. -In any of the cases (i) - (iv), (vi), or (viii), `mach` is trained ab initio. 
If (v) or -(vii) is true, then a training update is applied. +In any of the cases (1) - (4), (6), or (8), `mach` is trained ab initio. +If (5) or (7) is true, then a training update is applied. To freeze or unfreeze `mach`, use `freeze!(mach)` or `thaw!(mach)`. @@ -658,7 +658,7 @@ function fit_only!( rows === nothing && (rows = (:)) rows_is_new = !isdefined(mach, :old_rows) || rows != mach.old_rows - condition_iv = rows_is_new && !(mach.model isa Static) + condition_4 = rows_is_new && !(mach.model isa Static) upstream_has_changed = mach.old_upstream_state != upstream_state @@ -672,16 +672,16 @@ function fit_only!( # build or update cached `resampled_data` if necessary (`mach.data` is already defined # above if needed here): - if cache_data && (!data_is_valid || condition_iv) + if cache_data && (!data_is_valid || condition_4) mach.resampled_data = selectrows(model, rows, mach.data...) end # `fit`, `update`, or return untouched: - if mach.state == 0 || # condition (i) - force == true || # condition (ii) - upstream_has_changed || # condition (iii) - condition_iv || # condition (iv) - modeltype_changed # conditions (vi) or (vii) + if mach.state == 0 || # condition (1) + force == true || # condition (2) + upstream_has_changed || # condition (3) + condition_4 || # condition (4) + modeltype_changed # conditions (6) or (8) isdefined(mach, :report) || (mach.report = LittleDict{Symbol,Any}()) @@ -709,7 +709,7 @@ function fit_only!( rethrow() end - elseif model != mach.old_model # condition (v) + elseif model != mach.old_model # conditions (5) or (7) # update the model: fitlog(mach, :update, verbosity) @@ -1044,9 +1044,10 @@ To serialise using a different format, see [`serializable`](@ref). Machines are deserialized using the `machine` constructor as shown in the example below. -> The implementation of `save` for machines changed in MLJ 0.18 -> (MLJBase 0.20). You can only restore a machine saved using older -> versions of MLJ using an older version. +!!! 
note + The implementation of `save` for machines changed in MLJ 0.18 + (MLJBase 0.20). You can only restore a machine saved using older + versions of MLJ using an older version. ### Example @@ -1073,8 +1074,7 @@ predict(predict_only_mach, X) general purpose serialization formats, can allow for arbitrary code execution during loading. This means it is possible for someone to use a JLS file that looks like a serialized MLJ machine as a - [Trojan - horse](https://en.wikipedia.org/wiki/Trojan_horse_(computing)). + [Trojan horse](https://en.wikipedia.org/wiki/Trojan_horse_(computing)). See also [`serializable`](@ref), [`machine`](@ref). diff --git a/src/resampling.jl b/src/resampling.jl index dd317092..7f1eb970 100644 --- a/src/resampling.jl +++ b/src/resampling.jl @@ -536,8 +536,8 @@ and the corresponding estimates, aggregated over all train/test pairs, are recor When displayed, a `PerformanceEvaluation` object includes a value under the heading `1.96*SE`, derived from the standard error of the `per_fold` entries. This value is suitable for constructing a formal 95% confidence interval for the given -`measurement`. Such intervals should be interpreted with caution. See, for example, Bates -et al. [(2021)](https://arxiv.org/abs/2104.00673). +`measurement`. Such intervals should be interpreted with caution. See, for example, [Bates +et al. (2021)](https://arxiv.org/abs/2104.00673). ### Fields @@ -752,15 +752,15 @@ Base.show(io::IO, e::CompactPerformanceEvaluation) = ## USER CONTROL OF DEFAULT LOGGING const DOC_DEFAULT_LOGGER = - """ +""" - The default logger is used in calls to [`evaluate!`](@ref) and [`evaluate`](@ref), and - in the constructors `TunedModel` and `IteratedModel`, unless the `logger` keyword is - explicitly specified. +The default logger is used in calls to [`evaluate!`](@ref) and [`evaluate`](@ref), and +in the constructors `TunedModel` and `IteratedModel`, unless the `logger` keyword is +explicitly specified. - !!! note +!!! 
note - Prior to MLJ v0.20.7 (and MLJBase 1.5) the default logger was always `nothing`. + Prior to MLJ v0.20.7 (and MLJBase 1.5) the default logger was always `nothing`. """ @@ -772,8 +772,7 @@ tracking platforms, such as [MLflow](https://mlflow.org/docs/latest/index.html). $DOC_DEFAULT_LOGGER - When MLJBase is first loaded, the default logger is `nothing`. To reset the logger, see - beow. +When MLJBase is first loaded, the default logger is `nothing`. """ default_logger() = DEFAULT_LOGGER[] @@ -786,11 +785,11 @@ Reset the default logger. # Example Suppose an [MLflow](https://mlflow.org/docs/latest/index.html) tracking service is running -on a local server at `http://127.0.0.1:500`. Then every in every `evaluate` call in which -`logger` is not specified, as in the example below, the peformance evaluation is -automatically logged to the service. +on a local server at `http://127.0.0.1:5000`. Then in every `evaluate` call in which +`logger` is not specified, the performance evaluation is +automatically logged to the service, as here: -```julia-repl +```julia using MLJ logger = MLJFlow.Logger("http://127.0.0.1:5000/api") default_logger(logger) @@ -798,6 +797,7 @@ default_logger(logger) X, y = make_moons() model = ConstantClassifier() evaluate(model, X, y, measures=[log_loss, accuracy)]) +``` """ function default_logger(logger) @@ -1073,8 +1073,8 @@ instance of one of these, then a vector of tuples of the form `(train_rows, test is expected. For example, setting ```julia -resampling = [((1:100), (101:200)), - ((101:200), (1:100))] +resampling = [(1:100, 101:200), + (101:200, 1:100)] ``` gives two-fold cross-validation using the first 200 rows of data. diff --git a/src/show.jl b/src/show.jl index 9a9616af..7a5d0310 100644 --- a/src/show.jl +++ b/src/show.jl @@ -34,8 +34,10 @@ MLJBase.HANDLE_GIVEN_ID[objectid(value)] = :x Registered objects get displayed using the variable name to which it was bound in calls to `show(x)`, etc. 
-WARNING: As with any `const` declaration, binding `x` to new value of -the same type is not prevented and the registration will not be updated. +!!! warning + + As with any `const` declaration, binding `x` to a new value of + the same type is not prevented and the registration will not be updated. """ macro constant(ex) @@ -50,14 +52,22 @@ macro constant(ex) end end -"""to display abbreviated versions of integers""" +""" + abbreviated(n) + +Display abbreviated versions of integers. +""" function abbreviated(n) as_string = string(n) return "@"*as_string[end-2:end] end -"""return abbreviated object id (as string) or it's registered handle -(as string) if this exists""" +""" + handle(X) + +Return the abbreviated object id (as string), or its registered handle +(as string) if this exists. +""" function handle(X) id = objectid(X) if id in keys(HANDLE_GIVEN_ID) @@ -347,7 +357,7 @@ function _recursive_show(stream::IO, object::MLJType, current_depth, depth) print(stream, "#"^current_depth, " ") show(stream, object) println(stream, ": ") -# println(stream) + # println(stream) if isempty(fields) println(stream) return @@ -358,10 +368,10 @@ function _recursive_show(stream::IO, object::MLJType, current_depth, depth) print(stream, fld_string) if isdefined(object, fld) _show(stream, getproperty(object, fld)) - # println(stream) + # println(stream) else println(stream, "(undefined)") - # println(stream) + # println(stream) end end println(stream) diff --git a/src/utilities.jl b/src/utilities.jl index 3dcf31a6..ccb50208 100644 --- a/src/utilities.jl +++ b/src/utilities.jl @@ -301,7 +301,7 @@ column cycle fastest, those in the last clolumn slowest. ```julia-repl julia> iterators = ([1, 2], ["a","b"], ["x", "y", "z"]); julia> MLJTuning.unwind(iterators...) -12×3 Array{Any,2}: +12×3 Matrix{Any}: 1 "a" "x" 2 "a" "x" 1 "b" "x" @@ -347,7 +347,7 @@ Split an `AbstractRange` into `n` subranges of approximately equal length. 
### Example ```julia-repl julia> collect(chunks(1:5, 2)) -2-element Array{UnitRange{Int64},1}: +2-element Vector{UnitRange{Int64}}: 1:3 4:5 ``` diff --git a/test/composition/models/pipelines.jl b/test/composition/models/pipelines.jl index e213cdc2..c90143a7 100644 --- a/test/composition/models/pipelines.jl +++ b/test/composition/models/pipelines.jl @@ -544,6 +544,7 @@ end # inverse transform: p = Pipeline(UnivariateBoxCoxTransformer, UnivariateStandardizer) + @test !target_in_fit(p) xtrain = rand(rng, 10) mach = machine(p, xtrain) fit!(mach, verbosity=0) @@ -702,6 +703,40 @@ end @test Set(features) == Set(keys(X)) end +struct SupervisedTransformer <: Unsupervised end + +MLJBase.fit(::SupervisedTransformer, verbosity, X, y) = (mean(y), nothing, nothing) +MLJBase.transform(::SupervisedTransformer, fitresult, X) = + fitresult*MLJBase.matrix(X) |> MLJBase.table +MLJBase.target_in_fit(::Type{<:SupervisedTransformer}) = true + +struct DummyTransformer <: Unsupervised end +MLJBase.fit(::DummyTransformer, verbosity, X) = (nothing, nothing, nothing) +MLJBase.transform(::DummyTransformer, fitresult, X) = X + +@testset "supervised transformers in a pipeline" begin + X = MLJBase.table((a=fill(10.0, 3),)) + y = fill(2, 3) + pipe = SupervisedTransformer() |> DeterministicConstantRegressor() + @test target_in_fit(pipe) + mach = machine(pipe, X, y) + fit!(mach, verbosity=0) + @test predict(mach, X) == fill(2.0, 3) + + pipe2 = DummyTransformer |> pipe + @test target_in_fit(pipe2) + mach = machine(pipe2, X, y) + fit!(mach, verbosity=0) + @test predict(mach, X) == fill(2.0, 3) + + pipe3 = DummyTransformer |> SupervisedTransformer |> DummyTransformer + @test target_in_fit(pipe3) + mach = machine(pipe3, X, y) + fit!(mach, verbosity=0) + @test transform(mach, X).x1 == fill(20.0, 3) +end + + end # module true