Merge pull request #28 from alan-turing-institute/tablescitype

Replace src/scitypes.jl with ScientificTypes.jl package
JuliaAI · Aug 19, 2019 · f36dcd0 · f36dcd0
2 parents cbdf183 + aea2fd7
commit f36dcd0
Show file tree

Hide file tree

Showing 18 changed files with 3,311 additions and 3,464 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,15 +1,16 @@
 name = "MLJBase"
 uuid = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
 authors = ["Anthony D. Blaom <[email protected]>"]
-version = "0.3.0"
+version = "0.4.0"
 
 [deps]
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
-ColorTypes = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
+OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
+ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
@@ -19,6 +20,7 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 CSV = "0.5"
 CategoricalArrays = "<0.5.3"
 Requires = "^0.5.2"
+ScientificTypes = "0.1.2"
 Tables = "<0.1.19, >= 0.2"
 julia = "1"
 

diff --git a/data/ames.csv b/data/ames.csv
diff --git a/data/reduced_ames.csv b/data/reduced_ames.csv
diff --git a/src/MLJBase.jl b/src/MLJBase.jl
@@ -9,37 +9,43 @@ export fit, update, clean!
 export predict, predict_mean, predict_mode, fitted_params
 export transform, inverse_transform, se, evaluate, best
 export load_path, package_url, package_name, package_uuid
-export input_scitype_union, input_is_multivariate
-export target_scitype_union, target_quantity
-export is_pure_julia, is_wrapper
+export input_scitype, supports_weights
+export target_scitype, target_quantity            
+export is_pure_julia, is_wrapper                                 
 
 export params                                        # parameters.jl
 export reconstruct, int, decoder, classes            # data.jl
-export selectrows, selectcols, select, nrows, schema # data.jl
+export selectrows, selectcols, select, nrows         # data.jl
 export table, levels_seen, matrix, container_type    # data.jl
 export partition, @set_defaults                      # utilities.jl
-export Found, Continuous, Finite, Infinite           # sgcitypes.jl
-export OrderedFactor, Unknown                        # scitypes.jl
-export Count, Multiclass, Binary                     # scitypes.jl
-export scitype, scitype_union, scitypes              # scitypes.jl
 export HANDLE_GIVEN_ID, @more, @constant             # show.jl
 export color_on, color_off                           # show.jl
-export UnivariateFinite, average                    # distributions.jl
+export UnivariateFinite, average                     # distributions.jl
 export SupervisedTask, UnsupervisedTask, MLJTask     # tasks.jl
 export X_and_y, X_, y_, nrows, nfeatures             # tasks.jl
 export info                                          # info.jl
 
 # methods from other packages to be re-exported:
 export pdf, mean, mode
 
+# re-export of ScientificTypes (`Table` not exported):
+export trait
+export Scientific, Found, Unknown, Finite, Infinite
+export OrderedFactor, Multiclass, Count, Continuous
+export Binary, ColorImage, GrayImage
+export scitype, scitype_union, coerce, schema
+
 import Base.==
 
 using Tables
+using OrderedCollections # already a dependency of StatsBase
 import Distributions
 import Distributions: pdf, mode
 using CategoricalArrays
+using OrderedCollections
 import CategoricalArrays
-import ColorTypes
+using ScientificTypes
+import ScientificTypes: trait
 
 # to be extended:
 import StatsBase: fit, predict, fit!
@@ -60,13 +66,15 @@ const COLUMN_WIDTH = 24
 const DEFAULT_SHOW_DEPTH = 0
 
 include("utilities.jl")
-include("scitypes.jl")
 
 
-## ABSTRACT TYPES
+## BASE TYPES 
 
-# overarching MLJ type:
 abstract type MLJType end
+include("equality.jl") # equality for MLJType objects
+
+
+## ABSTRACT MODEL TYPES
 
 # for storing hyperparameters:
 abstract type Model <: MLJType end
@@ -86,16 +94,13 @@ abstract type ProbabilisticNetwork <: Probabilistic end
 abstract type DeterministicNetwork <: Deterministic end
 abstract type UnsupervisedNetwork <: Unsupervised end
 
-include("equality.jl")
-
 
 ## THE MODEL INTERFACE
 
 # every model interface must implement a `fit` method of the form
-# `fit(model, verbosity, X, y) -> fitresult, cache, report` or
-# `fit(model, verbosity, X, ys...) -> fitresult, cache, report` (multivariate case)
+# `fit(model, verbosity::Integer, training_args...) -> fitresult, cache, report` 
 # or, one of the simplified versions
-# `fit(model, X, y) -> fitresult`
+# `fit(model, training_args...) -> fitresult`
 # `fit(model, X, ys...) -> fitresult`
 fit(model::Model, verbosity::Integer, args...) = fit(model, args...), nothing, nothing
 
@@ -129,29 +134,27 @@ function best end
 clean!(model::Model) = ""
 
 # fallback trait declarations:
-target_scitype_union(::Type{<:Supervised}) =
-    Union{Found,NTuple{N,Found}} where N # a Tuple type in multivariate case
-output_scitype_union(::Type{<:Unsupervised}) =
-    Union{Missing,Found}
-output_is_multivariate(::Type{<:Unsupervised}) = true
-input_scitype_union(::Type{<:Model}) = Union{Missing,Found}
-input_is_multivariate(::Type{<:Model}) = true
-is_pure_julia(::Type{<:Model}) = false
-package_name(::Type{<:Model}) = "unknown"
-load_path(M::Type{<:Model}) = "unknown"
-package_uuid(::Type{<:Model}) = "unknown"
-package_url(::Type{<:Model}) = "unknown"
-is_wrapper(::Type{<:Model}) = false
-is_wrapper(m::Model) = is_wrapper(typeof(m))
-
-target_scitype_union(model::Model) = target_scitype_union(typeof(model))
-input_scitype_union(model::Model) = input_scitype_union(typeof(model))
-input_is_multivariate(model::Model) = input_is_multivariate(typeof(model))
+input_scitype(::Any) = Unknown
+output_scitype(::Any) = Unknown
+target_scitype(::Any) = Unknown
+is_pure_julia(::Any) = false
+package_name(::Any) = "unknown"
+package_license(::Any) = "unknown"
+load_path(::Any) = "unknown"
+package_uuid(::Any) = "unknown"
+package_url(::Any) = "unknown"
+is_wrapper(::Any) = false
+supports_weights(::Any) = false
+
+input_scitype(model::Model) = input_scitype(typeof(model))
+output_scitype(model::Model) = output_scitype(typeof(model))
+target_scitype(model::Model) = target_scitype(typeof(model))
 is_pure_julia(model::Model) = is_pure_julia(typeof(model))
 package_name(model::Model) = package_name(typeof(model))
 load_path(model::Model) = load_path(typeof(model))
 package_uuid(model::Model) = package_uuid(typeof(model))
 package_url(model::Model) = package_url(typeof(model))
+is_wrapper(m::Model) = is_wrapper(typeof(m))
 
 # probabilistic supervised models may also overload one or more of
 # `predict_mode`, `predict_median` and `predict_mean` defined below.

diff --git a/src/data.jl b/src/data.jl
@@ -1,35 +1,4 @@
-## CATEGORICAL ARRAY DECODER UTILITY
-
-# """
-#     reconstruct(A)
-
-# For reconstructing categorical arrays from their elements alone. Here
-# `A` is of type `AbstractArray{T}` where `T` is a subtype of
-# `CategoricalString` or `CategoricalValue`. The function `reconstruct` has
-# the property that `reconstruct(broadcast(identity, C)) == C`, whenever `C`
-# is a `CategoricalArray`. In other words, `reconstruct` is a left-inverse
-# for the function `C -> broadcast(identity, C)` that strips a
-# CategoricalArray of its "categorical wrapper".
-
-# Does not handle missing values.
-
-# """
-# function reconstruct(A::AbstractArray{<:CategoricalValue{T},N}) where {T,N}
-#     firstnonmissing = findfirst(x->!ismissing(x), A)
-#     isnothing(firstnonmissing) && error("No non-missing values encountered. ")
-#     pool = A[firstnonmissing].pool
-#     refs = broadcast(x -> x.level, A)
-#     return CategoricalArray{T,N}(refs, pool)
-# end
-# function reconstruct(A::AbstractArray{<:CategoricalString,N}) where {T,N}
-#     firstnonmissing = findfirst(x->!ismissing(x), A)
-#     isnothing(firstnonmissing) && error("No non-missing values encountered. ")
-#     pool = A[firstnonmissing].pool
-#     refs = broadcast(x -> x.level, A)
-#     return CategoricalArray{String,N}(refs, pool)
-# end
-
-CategoricalElement = Union{CategoricalValue,CategoricalString}
+CategoricalElement{U} = Union{CategoricalValue{<:Any,U},CategoricalString{U}}
 
 """
     classes(x)
@@ -61,7 +30,8 @@ function classes(x::CategoricalElement)
     return [p.valindex[p.invindex[v]] for v in p.levels]
 end
 
-raw(x::CategoricalElement) = x.pool.index[x.level] # a method just for testing
+# a method just for testing:
+raw(x::CategoricalElement) = x.pool.index[x.level]
 
 """
    int(x)
@@ -95,8 +65,11 @@ Broadcasted versions of `int`.
 See also: [`decoder`](@ref).
 """
 int(x::CategoricalElement) = x.pool.order[x.pool.invindex[x]]
-int(X::CategoricalArray) = broadcast(r -> X.pool.order[r], X.refs)
-int(V::Array{<:CategoricalElement}) = broadcast(int, V)
+int(A::AbstractArray{<:CategoricalElement}) = broadcast(int, A)
+# workaround for CategoricalArrays issue
+# https://github.com/JuliaData/CategoricalArrays.jl/issues/199:
+# function int(X::CategoricalArray)
+
 
 struct CategoricalDecoder{T,R} # <: MLJType
     pool::CategoricalPool{T,R}
@@ -144,35 +117,10 @@ decoder(element::CategoricalElement) =
 
 ## TABULAR DATA
 
-const istable = Tables.istable
-
 # hack for detecting JuliaDB.NDSparse tables without loading as dependency:
 isndsparse(X) = isdefined(X, :data_buffer)
 
 
-"""
-    container_type(X)
-
-Return `:table`, `:sparse`, or `:other`, according to whether `X` is a
-supported table format, a supported sparse table format, or something
-else.
-
-The first two formats, together abstract vectors, support the
-`MLJBase` accessor methods `selectrows`, `selectcols`, `select`,
-`nrows`, `schema`, and `union_scitypes`.
-
-"""
-function container_type(X)
-    if istable(X)
-        return :table
-    elseif isndsparse(X)
-        return :sparse
-    else
-        return :other
-    end
-end
-
-
 ## UTILITY FOR CONVERTING BETWEEN TABULAR DATA AND MATRICES
 
 """
@@ -186,7 +134,7 @@ returned. The integer relabelling of column names follows the
 lexicographic ordering (as indicated by `schema(X).names`).
 
 """
-matrix(X) = matrix(Val(container_type(X)), X)
+matrix(X) = matrix(Val(ScientificTypes.trait(X)), X)
 # Fallback for containers with trait `:other`: throw an `ArgumentError` *instance*
 # (throwing the bare type is not caught by `e isa ArgumentError` and has no message).
 matrix(::Val{:other}, X) = throw(ArgumentError(
     "`matrix` is not supported for objects of type $(typeof(X))."))
 matrix(::Val{:other}, X::AbstractMatrix) = X
 
@@ -229,7 +177,7 @@ named tuple of columns of `X`, with `keys(cols) = names`.
 
 """
 function table(cols::NamedTuple; prototype=cols)
-    istable(prototype) || error("prototype is not tabular.")
+    Tables.istable(prototype) || error("prototype is not tabular.")
     return Tables.materializer(prototype)(cols)
 end
 function table(X::AbstractMatrix; names=nothing, prototype=nothing)
@@ -246,7 +194,6 @@ end
 
 ## UNIFIED API FOR ACCESSING TABLES, MATRICES AND VECTORS
 
-
 """
     selectrows(X, r)
 
@@ -256,7 +203,7 @@ table of the preferred sink type of `typeof(X)`, even a single row is
 selected.
 
 """
-selectrows(X, r) = selectrows(Val(container_type(X)), X, r)
+selectrows(X, r) = selectrows(Val(ScientificTypes.trait(X)), X, r)
 # Fallback for containers with trait `:other`: throw an `ArgumentError` *instance*
 # (throwing the bare type is not caught by `e isa ArgumentError` and has no message).
 selectrows(::Val{:other}, X, r) = throw(ArgumentError(
     "`selectrows` is not supported for objects of type $(typeof(X))."))
 
 """
@@ -269,7 +216,7 @@ object returned is a table of the preferred sink type of
 or `CategoricalVector` is returned.
 
 """
-selectcols(X, c) = selectcols(Val(container_type(X)), X, c)
+selectcols(X, c) = selectcols(Val(ScientificTypes.trait(X)), X, c)
 # Fallback for containers with trait `:other`: throw an `ArgumentError` *instance*
 # (throwing the bare type is not caught by `e isa ArgumentError` and has no message).
 selectcols(::Val{:other}, X, c) = throw(ArgumentError(
     "`selectcols` is not supported for objects of type $(typeof(X))."))
 
 """
@@ -282,26 +229,16 @@ Select element of a table or sparse table at row `r` and column
 See also: [`selectrows`](@ref), [`selectcols`](@ref).
 
 """
-select(X, r, c) = select(Val(container_type(X)), X, r, c)
+select(X, r, c) = select(Val(ScientificTypes.trait(X)), X, r, c)
 # Fallback for containers with trait `:other`: throw an `ArgumentError` *instance*
 # (throwing the bare type is not caught by `e isa ArgumentError` and has no message).
 select(::Val{:other}, X, r, c) = throw(ArgumentError(
     "`select` is not supported for objects of type $(typeof(X))."))
 
-"""
-    schema(X)
-
-Returns a struct with properties `names`, `types`
-with the obvious meanings. Here `X` is any table or sparse table.
-
-"""
-schema(X) = schema(Val(container_type(X)), X)
-schema(::Val{:other}, X) = throw(ArgumentError)
-
 """
     nrows(X)
 
 Return the number of rows in a table, sparse table, or abstract vector.
 
 """
-nrows(X) = nrows(Val(container_type(X)), X)
+nrows(X) = nrows(Val(ScientificTypes.trait(X)), X)
 # Fallback for containers with trait `:other`: throw an `ArgumentError` *instance*
 # (throwing the bare type is not caught by `e isa ArgumentError` and has no message).
 nrows(::Val{:other}, X) = throw(ArgumentError(
     "`nrows` is not supported for objects of type $(typeof(X))."))
 
 
@@ -359,24 +296,7 @@ select(::Val{:table}, X, r::Integer, c) = selectcols(selectrows(X, r), c)
 select(::Val{:table}, X, r, c::Symbol) = selectcols(X, c)[r]
 select(::Val{:table}, X, r, c) = selectcols(selectrows(X, r), c)
 
-function schema(::Val{:table}, X)
-    istable(X) || throw(ArgumentError)
-    if !Tables.columnaccess(X)
-        return Tables.schema(Tables.rows(X))
-    else
-        return Tables.schema(Tables.columns(X))
-    end
-end
-
-function nrows(::Val{:table}, X)
-    if !Tables.columnaccess(X)
-        return length(collect(X))
-    else
-        cols = Tables.columntable(X)
-        !isempty(cols) || return 0
-        return length(cols[1])
-    end
-end
+nrows(::Val{:table}, X) = schema(X).nrows
 
 
 ## ACCESSORS FOR ABSTRACT VECTORS
@@ -385,7 +305,7 @@ selectrows(::Val{:other}, v::AbstractVector, r) = v[r]
 nrows(::Val{:other}, v::AbstractVector) = length(v)
 selectrows(::Val{:other}, v::CategoricalVector, r) = @inbounds v[r]
 
-
+## to be replaced (not used anywhere):
 ## ACCESSORS FOR JULIA NDSPARSE ARRAYS (N=2)
 
 nrows(::Val{:sparse}, X) = maximum([r[1] for r in keys(X)])
@@ -408,8 +328,3 @@ select(::Val{:sparse}, X, r::Integer, c::AbstractVector{Symbol}) = X[r,sort(c)]
 select(::Val{:sparse}, X, r::Integer, ::Colon) = X[r,:]
 select(::Val{:sparse}, X, r, c) = X[r,sort(c)]
 
-function schema(::Val{:sparse}, X)
-    names = sort(unique([r[2] for r in keys(X)]))
-    types = [eltype(selectcols(X, name)) for name in names]
-    return Tables.Schema(names, types)
-end
diff --git a/src/datasets.jl b/src/datasets.jl
@@ -21,7 +21,6 @@ having six numerical and six categorical features."""
 function load_reduced_ames()
     df = CSV.read(joinpath(datadir, "reduced_ames.csv"), copycols=true,
                   categorical=true)
-    df[:target] = exp.(df[:target])
     # TODO: uncomment following after julia #29501 is resolved
 #    df.OverallQual = categorical(df.OverallQual, ordered=true)
 #    df[:GarageCars] = categorical(df[:GarageCars], ordered=true)
@@ -36,7 +35,6 @@ end
 function load_ames()
     df = CSV.read(joinpath(datadir, "ames.csv"), copycols=true,
                   categorical=true)
-    df[:target] = exp.(df[:target])
     return SupervisedTask(verbosity=0, data=df,
                           target=:target,
                           ignore=[:Id,],