From afb88226acec3154a646e04aa18fefd1081ca3ae Mon Sep 17 00:00:00 2001 From: Thibaut Lienart Date: Fri, 1 Nov 2019 13:21:33 +0100 Subject: [PATCH 1/2] more arrows --- src/MLJ.jl | 5 +++ src/arrows.jl | 24 ++++++++++ src/operations.jl | 20 --------- test/arrows.jl | 109 +++++++++++++++++++++++++++++++++++++++++++++ test/operations.jl | 45 ------------------- test/runtests.jl | 6 ++- 6 files changed, 142 insertions(+), 67 deletions(-) create mode 100644 src/arrows.jl create mode 100644 test/arrows.jl delete mode 100644 test/operations.jl diff --git a/src/MLJ.jl b/src/MLJ.jl index 24213abb5..d21bc0ebb 100644 --- a/src/MLJ.jl +++ b/src/MLJ.jl @@ -127,6 +127,11 @@ include("networks.jl") # for building learning networks include("composites.jl") # composite models & exporting learning networks include("pipelines.jl") # pipelines (exported linear learning networks) include("operations.jl") # syntactic sugar for operations (predict, etc) + +if VERSION ≥ v"1.3.0-" + include("arrows.jl") +end + include("resampling.jl") # resampling strategies and model evaluation include("parameters.jl") # hyperparameter ranges and grid generation include("tuning.jl") diff --git a/src/arrows.jl b/src/arrows.jl new file mode 100644 index 000000000..e3d5a614e --- /dev/null +++ b/src/arrows.jl @@ -0,0 +1,24 @@ +# Syntactic sugar for arrow syntax +# we need version ≥ 1.3 in order to make use of multiple dispatch +# over abstract types + + +# This allows implicit: data |> machine +(mach::AbstractMachine{<:Unsupervised})(data) = transform(mach, data) +(mach::AbstractMachine{<:Supervised})(data) = predict(mach, data) +(mach::AbstractMachine)(data::AbstractMatrix) = data |> table |> mach + +# This allows implicit: data |> Unsupervised +(m::Unsupervised)(data::AbstractNode) = data |> machine(m, data) +(m::Unsupervised)(data) = source(data) |> m +(m::Unsupervised)(data::AbstractMatrix) = data |> table |> m + +# This allows implicit: data |> Supervised +(m::Supervised)(data::NTuple{2,AbstractNode}) = data[1] |> machine(m, data...) +(m::Supervised)(data::Tuple{AbstractNode,Any}) = (@show "hello"; (data[1], source(data[2], kind=:target)) |> m) +(m::Supervised)(data::Tuple) = (source(data[1]), data[2]) |> m +(m::Supervised)(data::Tuple{AbstractMatrix,Any}) = (data[1] |> table, data[2]) |> m + +# This allows implicit: data |> inverse_transform(node) +inverse_transform(node::Node{<:NodalMachine{<:Unsupervised}}) = + data -> inverse_transform(node.machine, data) diff --git a/src/operations.jl b/src/operations.jl index 817431261..525ba0eeb 100644 --- a/src/operations.jl +++ b/src/operations.jl @@ -85,25 +85,5 @@ function fitted_params(machine::AbstractMachine) end end - -# Syntactic sugar for pipe syntax -# we need version ≥ 1.3 in order to make use of multiple dispatch -# over abstract types -if VERSION ≥ v"1.3.0-" - - (mach::AbstractMachine{<:Unsupervised})(data) = transform(mach, data) - (mach::AbstractMachine{<:Supervised})(data) = predict(mach, data) - - (m::Unsupervised)(data::AbstractNode) = data |> machine(m, data) - (m::Unsupervised)(data) = source(data) |> m - - (m::Supervised)(data::NTuple{2,AbstractNode}) = data[1] |> machine(m, data...) - (m::Supervised)(data::Tuple) = source.(data) |> m - - inverse_transform(node::Node{<:NodalMachine{<:Unsupervised}}) = - data->inverse_transform(node.machine, data) -end # version ≥ 1.3 - -# Syntactic sugar to directly access hyperparameters getindex(n::Node{<:NodalMachine{<:Model}}, s::Symbol) = getproperty(n.machine.model, s) setindex!(n::Node{<:NodalMachine{<:Model}}, v, s::Symbol) = setproperty!(n.machine.model, s, v) diff --git a/test/arrows.jl b/test/arrows.jl new file mode 100644 index 000000000..89ed8d40b --- /dev/null +++ b/test/arrows.jl @@ -0,0 +1,109 @@ +module TestArrows + +using MLJ +using MLJBase +using Test +using Random + +@testset "|> syntax for pipelines" begin + Random.seed!(142) + @load RidgeRegressor pkg="MultivariateStats" + @load KNNRegressor pkg="NearestNeighbors" + X = MLJBase.table(randn(500, 5)) + y = abs.(randn(500)) + train, test = partition(eachindex(y), 0.7) + + # Feeding data directly to a supervised model + knn = KNNRegressor(K=10) + ŷ = (X, y) |> knn + fit!(ŷ, rows=train) + + # Describing a full pipeline using |> syntax. + Xs, ys = source.((X, y)) + + # "first layer" + W = Xs |> Standardizer() + z = ys |> UnivariateBoxCoxTransformer() + # "second layer" + ẑ = (W, z) |> RidgeRegressor(lambda=0.1) + # "output layer" + ŷ = ẑ |> inverse_transform(z) + + fit!(ŷ, rows=train) + + @test isapprox(rms(ŷ(rows=test), ys(rows=test)), 0.627123, rtol=1e-4) + + # shortcut to get and set hyperparameters of a node + ẑ[:lambda] = 5.0 + fit!(ŷ, rows=train) + @test isapprox(rms(ŷ(rows=test), ys(rows=test)), 0.62699, rtol=1e-4) +end + +@testset "Auto-source" begin + @load PCA + @load RidgeRegressor pkg="MultivariateStats" + Random.seed!(5615151) + + X = MLJBase.table(randn(500, 5)) + y = abs.(randn(500)) + + pca = X |> Standardizer() |> PCA(maxoutdim=2) + fit!(pca) + + W = pca() + sch = schema(W) + @test sch.names == (:x1, :x2) + @test sch.scitypes == (Continuous, Continuous) + @test sch.nrows == 500 + + pipe = (pca, y) |> RidgeRegressor() + fit!(pipe) + + ŷ = pipe() + @test ŷ isa Vector{Float64} + @test length(ŷ) == 500 +end + +@testset "Auto-table" begin + @load PCA + @load RidgeRegressor pkg="MultivariateStats" + Random.seed!(5615151) + + X = randn(500, 5) + y = abs.(randn(500)) + + pca = X |> Standardizer() |> PCA(maxoutdim=2) + pipe = (pca, y) |> RidgeRegressor() + fit!(pipe) + + ŷ = pipe() + @test ŷ isa Vector{Float64} + @test length(ŷ) == 500 +end + +@testset "Stacking" begin + @load PCA + @load RidgeRegressor pkg=MultivariateStats + @load DecisionTreeRegressor pkg=DecisionTree + Random.seed!(5615151) + + X = randn(500, 5) + y = abs.(randn(500)) + + W = X |> Standardizer() |> PCA(maxoutdim=3) + z = y |> UnivariateBoxCoxTransformer() + ẑ₁ = (W, z) |> RidgeRegressor() + ẑ₂ = (W, z) |> DecisionTreeRegressor() + R = hcat(ẑ₁, ẑ₂) + ẑ = (R, z) |> DecisionTreeRegressor() + ŷ = ẑ |> inverse_transform(z) + + fit!(ŷ) + + p̂ = ŷ() + @test p̂ isa Vector{Float64} + @test length(p̂) == 500 +end + +end +true diff --git a/test/operations.jl b/test/operations.jl deleted file mode 100644 index ba92509a5..000000000 --- a/test/operations.jl +++ /dev/null @@ -1,45 +0,0 @@ -module TestOperations - -using MLJ -using MLJBase -using Test -using Random - -if VERSION ≥ v"1.3.0-" - @testset "|> syntax for pipelines" begin - Random.seed!(142) - @load RidgeRegressor pkg="MultivariateStats" - @load KNNRegressor pkg="NearestNeighbors" - X = MLJBase.table(randn(500, 5)) - y = abs.(randn(500)) - train, test = partition(eachindex(y), 0.7) - - # Feeding data directly to a supervised model - knn = KNNRegressor(K=10) - ŷ = (X, y) |> knn - fit!(ŷ, rows=train) - - # Describing a full pipeline using |> syntax. - Xs, ys = source.((X, y)) - - # "first layer" - W = Xs |> Standardizer() - z = ys |> UnivariateBoxCoxTransformer() - # "second layer" - ẑ = (W, z) |> RidgeRegressor(lambda=0.1) - # "output layer" - ŷ = ẑ |> inverse_transform(z) - - fit!(ŷ, rows=train) - - @test isapprox(rms(ŷ(rows=test), ys(rows=test)), 0.627123, rtol=1e-4) - - # shortcut to get and set hyperparameters of a node - ẑ[:lambda] = 5.0 - fit!(ŷ, rows=train) - @test isapprox(rms(ŷ(rows=test), ys(rows=test)), 0.62699, rtol=1e-4) - end -end # version - -end -true diff --git a/test/runtests.jl b/test/runtests.jl index 640db9e40..b220fd0bb 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -26,8 +26,10 @@ end @test include("networks.jl") end -@testset "operations" begin - @test include("operations.jl") +if VERSION ≥ v"1.3.0-" + @testset "arrows" begin + @test include("arrows.jl") + end end @testset "composites" begin From 12fb9fc1fee0599c6d1b8c856750be2348a8a272 Mon Sep 17 00:00:00 2001 From: Thibaut Lienart Date: Fri, 1 Nov 2019 13:25:31 +0100 Subject: [PATCH 2/2] removing a debugging show --- src/arrows.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arrows.jl b/src/arrows.jl index e3d5a614e..fca0c1aaa 100644 --- a/src/arrows.jl +++ b/src/arrows.jl @@ -15,7 +15,7 @@ # This allows implicit: data |> Supervised (m::Supervised)(data::NTuple{2,AbstractNode}) = data[1] |> machine(m, data...) -(m::Supervised)(data::Tuple{AbstractNode,Any}) = (@show "hello"; (data[1], source(data[2], kind=:target)) |> m) +(m::Supervised)(data::Tuple{AbstractNode,Any}) = (data[1], source(data[2], kind=:target)) |> m (m::Supervised)(data::Tuple) = (source(data[1]), data[2]) |> m (m::Supervised)(data::Tuple{AbstractMatrix,Any}) = (data[1] |> table, data[2]) |> m