Merge pull request #977 from JuliaAI/dev
For a 1.3 release
ablaom authored May 6, 2024
2 parents 5046989 + af10ff2 commit d5f3398
Showing 20 changed files with 533 additions and 305 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -49,7 +49,7 @@ jobs:
         env:
           JULIA_NUM_THREADS: 2
       - uses: julia-actions/julia-processcoverage@v1
-      - uses: codecov/codecov-action@v1
+      - uses: codecov/codecov-action@v3
         with:
           file: lcov.info
   docs:
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
 name = "MLJBase"
 uuid = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
 authors = ["Anthony D. Blaom <[email protected]>"]
-version = "1.2.1"
+version = "1.3"

 [deps]
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
4 changes: 2 additions & 2 deletions src/MLJBase.jl
@@ -291,8 +291,8 @@ export machines, sources, Stack,
 export TransformedTargetModel

 # resampling.jl:
-export ResamplingStrategy, Holdout, CV, StratifiedCV, TimeSeriesCV,
-    evaluate!, Resampler, PerformanceEvaluation
+export ResamplingStrategy, InSample, Holdout, CV, StratifiedCV, TimeSeriesCV,
+    evaluate!, Resampler, PerformanceEvaluation, CompactPerformanceEvaluation

 # `MLJType` and the abstract `Model` subtypes are exported from within
 # src/composition/abstract_types.jl
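The new exports hint at this release's headline additions. Here is a hedged sketch of how they might be used (not part of the diff: it assumes `InSample()` scores on the training data itself and that `evaluate` accepts a `compact=true` option returning the new `CompactPerformanceEvaluation`; `MyRegressor` is a hypothetical model from another package):

```julia
using MLJBase
using StatisticalMeasures  # assumed provider of `rms` in the MLJ 1.x stack

X, y = make_regression(100, 3)  # synthetic regression data (MLJBase utility)
model = MyRegressor()           # hypothetical model instance

# in-sample evaluation with a compact result -- both new in this release:
e = evaluate(model, X, y, resampling=InSample(), measure=rms, compact=true)
```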
32 changes: 17 additions & 15 deletions src/composition/learning_networks/nodes.jl
@@ -27,9 +27,9 @@ See also [`node`](@ref), [`Source`](@ref), [`origins`](@ref),
 [`sources`](@ref), [`fit!`](@ref).
 """
-struct Node{T<:Union{Machine, Nothing}} <: AbstractNode
+struct Node{T<:Union{Machine, Nothing},Oper} <: AbstractNode

-    operation  # eg, `predict` or a static operation, such as `exp`
+    operation::Oper  # eg, `predict` or a static operation, such as `exp`
     machine::T  # is `nothing` for static operations

     # nodes called to get args for `operation(model, ...) ` or
@@ -43,9 +43,11 @@ struct Node{T<:Union{Machine, Nothing}} <: AbstractNode
     # order consistent with extended graph, excluding self
     nodes::Vector{AbstractNode}

-    function Node(operation,
-                  machine::T,
-                  args::AbstractNode...) where T<:Union{Machine, Nothing}
+    function Node(
+        operation::Oper,
+        machine::T,
+        args::AbstractNode...,
+    ) where {T<:Union{Machine, Nothing}, Oper}

         # check the number of arguments:
         # if machine === nothing && isempty(args)
@@ -70,7 +72,7 @@ struct Node{T<:Union{Machine, Nothing}} <: AbstractNode
             vcat(nodes_, (nodes(n) for n in machine.args)...) |> unique
         end

-        return new{T}(operation, machine, args, origins_, nodes_)
+        return new{T,Oper}(operation, machine, args, origins_, nodes_)
     end
 end

@@ -407,14 +409,14 @@ of nodes, sources and other arguments.
 ### Examples

-```
-X = source(π)
-W = @node sin(X)
+```julia-repl
+julia> X = source(π)
+julia> W = @node sin(X)

 julia> W()
 0

-X = source(1:10)
-Y = @node selectrows(X, 3:4)
+julia> X = source(1:10)
+julia> Y = @node selectrows(X, 3:4)

 julia> Y()
 3:4
@@ -423,10 +425,10 @@ julia> Y(["one", "two", "three", "four"])
  "three"
  "four"

-X1 = source(4)
-X2 = source(5)
-add(a, b, c) = a + b + c
-N = @node add(X1, 1, X2)
+julia> X1 = source(4)
+julia> X2 = source(5)
+julia> add(a, b, c) = a + b + c
+julia> N = @node add(X1, 1, X2)

 julia> N()
 10
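An aside on the new `Oper` type parameter above (an illustrative sketch, not from the diff): storing the operation in the node's type means it is statically known when the node is called. For example:

```julia
using MLJBase

X = source(rand(3))
N = node(sum, X)  # `sum` is a static operation, so the machine slot is `nothing`
N()               # calls `sum` on the source's data
typeof(N)         # under the new definition: Node{Nothing, typeof(sum)}
```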
18 changes: 9 additions & 9 deletions src/composition/learning_networks/signatures.jl
@@ -8,10 +8,10 @@

 **Private method.**

-Return a dictionary of machines, keyed on model, for the all machines in the completed
-learning network for which `node` is the greatest lower bound. Only machines bound to
-symbolic models are included. Values are always vectors, even if they contain only a
-single machine.
+Return a dictionary of machines, keyed on model, for all machines in the
+completed learning network for which `node` is the greatest lower bound. Only
+machines bound to symbolic models are included. Values are always vectors,
+even if they contain only a single machine.

 """
 function machines_given_model(node::AbstractNode)
@@ -35,14 +35,14 @@ attempt_scalarize(v) = length(v) == 1 ? v[1] : v

 **Private method.**

-Given a dictionary of machine vectors, keyed on model names (symbols), broadcast `f` over
-each vector, and make the result, in the returned named tuple, the value associated with
-the corresponding model name as key.
+Given a dictionary of machine vectors, keyed on model names (symbols), broadcast
+`f` over each vector, and make the result, in the returned named tuple, the
+value associated with the corresponding model name as key.

 Singleton vector values are scalarized, unless `scalarize = false`.

-If a value in the computed named tuple is `nothing`, or a vector of `nothing`s, then the
-entry is dropped from the tuple, unless `drop_nothings=false`.
+If a value in the computed named tuple is `nothing`, or a vector of `nothing`s,
+then the entry is dropped from the tuple, unless `drop_nothings=false`.

 """
 function tuple_keyed_on_model(f, machines_given_model; scalarize=true, drop_nothings=true)
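To make the `tuple_keyed_on_model` contract concrete, here is a hedged stand-in using plain Julia and dummy values (not the real implementation; `uppercase` stands in for something like `report`):

```julia
machines_given_model = Dict(:ridge => ["m1"], :knn => ["m2a", "m2b"])
f = uppercase

nt = (; (name => (length(v) == 1 ? f(v[1]) : map(f, v))
         for (name, v) in machines_given_model)...)
# nt == (ridge = "M1", knn = ["M2A", "M2B"]) -- the singleton is scalarized
```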
17 changes: 8 additions & 9 deletions src/composition/models/stacking.jl
Expand Up @@ -337,12 +337,12 @@ internal_stack_report(
) = NamedTuple{}()

"""
internal_stack_report(
m::Stack,
verbosity::Int,
y::AbstractNode,
folds_evaluations::Vararg{AbstractNode},
)
internal_stack_report(
m::Stack,
verbosity::Int,
y::AbstractNode,
folds_evaluations::Vararg{AbstractNode},
)
When measure/measures is provided, the folds_evaluation will have been filled by
`store_for_evaluation`. This function is not doing any heavy work (not constructing nodes
@@ -518,7 +518,7 @@ function oos_set(m::Stack{modelnames}, Xs::Source, ys::Source, tt_pairs) where m
 end

 #######################################
-################# Prefit #################
+################# Prefit ##############
 #######################################

 function prefit(m::Stack{modelnames}, verbosity::Int, X, y) where modelnames
@@ -564,8 +564,7 @@ const DOC_STACK =
     Stack(; metalearner=nothing, name1=model1, name2=model2, ..., keyword_options...)

 Implements the two-layer generalized stack algorithm introduced by
-[Wolpert
-(1992)](https://www.sciencedirect.com/science/article/abs/pii/S0893608005800231)
+[Wolpert (1992)](https://www.sciencedirect.com/science/article/abs/pii/S0893608005800231)
 and generalized by [Van der Laan et al
 (2007)](https://biostats.bepress.com/ucbbiostat/paper222/). Returns an
 instance of type `ProbabilisticStack` or `DeterministicStack`,
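For orientation, a hedged sketch of assembling a stack with the documented constructor (the base learners and metalearner are assumptions here, coming from other MLJ model packages such as MLJLinearModels and DecisionTree):

```julia
using MLJBase

stack = Stack(;
    metalearner = LinearRegressor(),        # layer-2 combiner (hypothetical instance)
    ridge       = RidgeRegressor(),         # layer-1 base learner
    tree        = DecisionTreeRegressor(),  # layer-1 base learner
    resampling  = CV(nfolds=3),             # folds for out-of-sample layer-1 predictions
)
# mach = machine(stack, X, y) |> fit!
```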
6 changes: 3 additions & 3 deletions src/composition/models/transformed_target_model.jl
@@ -61,7 +61,7 @@ const ERR_MODEL_UNSPECIFIED = ArgumentError(
     "Expecting atomic model as argument. None specified. "
 )
 const ERR_TRANSFORMER_UNSPECIFIED = ArgumentError(
-    "You must specify `transformer=...`. ."
+    "You must specify `transformer=...`."
 )
 const ERR_TOO_MANY_ARGUMENTS = ArgumentError(
     "At most one non-keyword argument, a model, allowed. "
@@ -123,7 +123,7 @@ y -> mode.(y))`.
 A model that normalizes the target before applying ridge regression,
 with predictions returned on the original scale:

-```
+```julia
 @load RidgeRegressor pkg=MLJLinearModels
 model = RidgeRegressor()
 tmodel = TransformedTargetModel(model, transformer=Standardizer())
@@ -132,7 +132,7 @@ tmodel = TransformedTargetModel(model, transformer=Standardizer())
 A model that applies a static `log` transformation to the data, again
 returning predictions to the original scale:

-```
+```julia
 tmodel2 = TransformedTargetModel(model, transformer=y->log.(y), inverse=z->exp.(z))
 ```
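Continuing the docstring example with a hedged sketch of fitting the wrapper (assumes regression data `X`, `y` are in scope):

```julia
mach = machine(tmodel, X, y)  # `tmodel` as constructed in the example above
fit!(mach, verbosity=0)       # target is standardized before `model` is fit
yhat = predict(mach, X)       # predictions are inverse-transformed automatically
```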
49 changes: 30 additions & 19 deletions src/data/data.jl
Expand Up @@ -104,23 +104,28 @@ corresponding `fractions` of `length(nrows(X))`, where valid fractions
are floats between 0 and 1 whose sum is less than one. The last
fraction is not provided, as it is inferred from the preceding ones.
For "synchronized" partitioning of multiple objects, use the
`multi=true` option described below.
For synchronized partitioning of multiple objects, use the
`multi=true` option.
julia> partition(1:1000, 0.8)
([1,...,800], [801,...,1000])
```julia-repl
julia> partition(1:1000, 0.8)
([1,...,800], [801,...,1000])
julia> partition(1:1000, 0.2, 0.7)
([1,...,200], [201,...,900], [901,...,1000])
julia> partition(1:1000, 0.2, 0.7)
([1,...,200], [201,...,900], [901,...,1000])
julia> partition(reshape(1:10, 5, 2), 0.2, 0.4)
([1 6], [2 7; 3 8], [4 9; 5 10])
julia> partition(reshape(1:10, 5, 2), 0.2, 0.4)
([1 6], [2 7; 3 8], [4 9; 5 10])
X, y = make_blobs() # a table and vector
Xtrain, Xtest = partition(X, 0.8, stratify=y)
julia> X, y = make_blobs() # a table and vector
julia> Xtrain, Xtest = partition(X, 0.8, stratify=y)
```
(Xtrain, Xtest), (ytrain, ytest) = partition((X, y), 0.8, rng=123, multi=true)
Here's an example of synchronized partitioning of multiple objects:
```julia-repl
julia> (Xtrain, Xtest), (ytrain, ytest) = partition((X, y), 0.8, rng=123, multi=true)
```
## Keywords
@@ -209,7 +214,7 @@ Returns a tuple of tables/vectors with length one greater than the
 number of supplied predicates, with the last component including all
 previously unselected columns.

-```
+```julia-repl
 julia> table = DataFrame(x=[1,2], y=['a', 'b'], z=[10.0, 20.0], w=["A", "B"])
 2×4 DataFrame
  Row │ x      y     z        w
@@ -218,7 +223,7 @@ julia> table = DataFrame(x=[1,2], y=['a', 'b'], z=[10.0, 20.0], w=["A", "B"])
    1 │     1  a        10.0  A
    2 │     2  b        20.0  B

-Z, XY, W = unpack(table, ==(:z), !=(:w))
+julia> Z, XY, W = unpack(table, ==(:z), !=(:w));

 julia> Z
 2-element Vector{Float64}:
  10.0
@@ -300,9 +305,11 @@ The method is curried, so that `restrict(folds, i)` is the operator
 on data defined by `restrict(folds, i)(X) = restrict(X, folds, i)`.

 ### Example

-    folds = ([1, 2], [3, 4, 5], [6,])
-    restrict([:x1, :x2, :x3, :x4, :x5, :x6], folds, 2) # [:x3, :x4, :x5]
+```julia
+folds = ([1, 2], [3, 4, 5], [6,])
+restrict([:x1, :x2, :x3, :x4, :x5, :x6], folds, 2) # [:x3, :x4, :x5]
+```

See also [`corestrict`](@ref)
@@ -322,7 +329,9 @@ all elements of `folds`. Here `folds` is a vector or tuple of integer
 vectors, typically representing row indices or a vector, matrix or
 table.

-    complement(([1,2], [3,], [4, 5]), 2) # [1 ,2, 4, 5]
+```julia
+complement(([1,2], [3,], [4, 5]), 2) # [1, 2, 4, 5]
+```

 """
 complement(f, i) = reduce(vcat, collect(f)[Not(i)])
@@ -345,8 +354,10 @@ on data defined by `corestrict(folds, i)(X) = corestrict(X, folds, i)`.

 ### Example

-    folds = ([1, 2], [3, 4, 5], [6,])
-    corestrict([:x1, :x2, :x3, :x4, :x5, :x6], folds, 2) # [:x1, :x2, :x6]
+```julia
+folds = ([1, 2], [3, 4, 5], [6,])
+corestrict([:x1, :x2, :x3, :x4, :x5, :x6], folds, 2) # [:x1, :x2, :x6]
+```
"""
corestrict(f::NTuple{N}, i) where N = FoldComplementRestrictor{i,N}(f)
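The curried forms mentioned in both docstrings compose nicely; a short sketch:

```julia
folds = ([1, 2], [3, 4, 5], [6])
restrict(folds, 2)([:x1, :x2, :x3, :x4, :x5, :x6])    # [:x3, :x4, :x5]
corestrict(folds, 2)([:x1, :x2, :x3, :x4, :x5, :x6])  # [:x1, :x2, :x6]
```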
2 changes: 1 addition & 1 deletion src/data/datasets.jl
@@ -158,7 +158,7 @@ const COERCE_SUNSPOTS = (
     (:sunspot_number=>Continuous),)

 """
-        load_dataset(fpath, coercions)
+    load_dataset(fpath, coercions)

 Load one of the standard datasets (Boston, etc.), assuming the file is a
 comma-separated file with a header.