Skip to content

Commit

Permalink
Merge pull request #28 from alan-turing-institute/tablescitype
Browse files Browse the repository at this point in the history
Replace src/scitypes.jl with ScientificTypes.jl package
  • Loading branch information
ablaom authored Aug 19, 2019
2 parents cbdf183 + aea2fd7 commit f36dcd0
Show file tree
Hide file tree
Showing 18 changed files with 3,311 additions and 3,464 deletions.
6 changes: 4 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
name = "MLJBase"
uuid = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
authors = ["Anthony D. Blaom <[email protected]>"]
version = "0.3.0"
version = "0.4.0"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
ColorTypes = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Expand All @@ -19,6 +20,7 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
CSV = "0.5"
CategoricalArrays = "<0.5.3"
Requires = "^0.5.2"
ScientificTypes = "0.1.2"
Tables = "<0.1.19, >= 0.2"
julia = "1"

Expand Down
2,914 changes: 1,457 additions & 1,457 deletions data/ames.csv

Large diffs are not rendered by default.

2,912 changes: 1,456 additions & 1,456 deletions data/reduced_ames.csv

Large diffs are not rendered by default.

75 changes: 39 additions & 36 deletions src/MLJBase.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,37 +9,43 @@ export fit, update, clean!
export predict, predict_mean, predict_mode, fitted_params
export transform, inverse_transform, se, evaluate, best
export load_path, package_url, package_name, package_uuid
export input_scitype_union, input_is_multivariate
export target_scitype_union, target_quantity
export is_pure_julia, is_wrapper
export input_scitype, supports_weights
export target_scitype, target_quantity
export is_pure_julia, is_wrapper

export params # parameters.jl
export reconstruct, int, decoder, classes # data.jl
export selectrows, selectcols, select, nrows, schema # data.jl
export selectrows, selectcols, select, nrows # data.jl
export table, levels_seen, matrix, container_type # data.jl
export partition, @set_defaults # utilities.jl
export Found, Continuous, Finite, Infinite # sgcitypes.jl
export OrderedFactor, Unknown # scitypes.jl
export Count, Multiclass, Binary # scitypes.jl
export scitype, scitype_union, scitypes # scitypes.jl
export HANDLE_GIVEN_ID, @more, @constant # show.jl
export color_on, color_off # show.jl
export UnivariateFinite, average # distributions.jl
export UnivariateFinite, average # distributions.jl
export SupervisedTask, UnsupervisedTask, MLJTask # tasks.jl
export X_and_y, X_, y_, nrows, nfeatures # tasks.jl
export info # info.jl

# methods from other packages to be re-exported:
export pdf, mean, mode

# re-export of ScientificTypes (`Table` not exported):
export trait
export Scientific, Found, Unknown, Finite, Infinite
export OrderedFactor, Multiclass, Count, Continuous
export Binary, ColorImage, GrayImage
export scitype, scitype_union, coerce, schema

import Base.==

using Tables
using OrderedCollections # already a dependency of StatsBase
import Distributions
import Distributions: pdf, mode
using CategoricalArrays
using OrderedCollections
import CategoricalArrays
import ColorTypes
using ScientificTypes
import ScientificTypes: trait

# to be extended:
import StatsBase: fit, predict, fit!
Expand All @@ -60,13 +66,15 @@ const COLUMN_WIDTH = 24
const DEFAULT_SHOW_DEPTH = 0

include("utilities.jl")
include("scitypes.jl")


## ABSTRACT TYPES
## BASE TYPES

# overarching MLJ type:
abstract type MLJType end
include("equality.jl") # equality for MLJType objects


## ABSTRACT MODEL TYPES

# for storing hyperparameters:
abstract type Model <: MLJType end
Expand All @@ -86,16 +94,13 @@ abstract type ProbabilisticNetwork <: Probabilistic end
abstract type DeterministicNetwork <: Deterministic end
abstract type UnsupervisedNetwork <: Unsupervised end

include("equality.jl")


## THE MODEL INTERFACE

# every model interface must implement a `fit` method of the form
# `fit(model, verbosity, X, y) -> fitresult, cache, report` or
# `fit(model, verbosity, X, ys...) -> fitresult, cache, report` (multivariate case)
# `fit(model, verbosity::Integer, training_args...) -> fitresult, cache, report`
# or one of the simplified versions
# `fit(model, X, y) -> fitresult`
# `fit(model, training_args...) -> fitresult`
# `fit(model, X, ys...) -> fitresult`
fit(model::Model, verbosity::Integer, args...) = fit(model, args...), nothing, nothing

Expand Down Expand Up @@ -129,29 +134,27 @@ function best end
clean!(model::Model) = ""

# fallback trait declarations:
target_scitype_union(::Type{<:Supervised}) =
Union{Found,NTuple{N,Found}} where N # a Tuple type in multivariate case
output_scitype_union(::Type{<:Unsupervised}) =
Union{Missing,Found}
output_is_multivariate(::Type{<:Unsupervised}) = true
input_scitype_union(::Type{<:Model}) = Union{Missing,Found}
input_is_multivariate(::Type{<:Model}) = true
is_pure_julia(::Type{<:Model}) = false
package_name(::Type{<:Model}) = "unknown"
load_path(M::Type{<:Model}) = "unknown"
package_uuid(::Type{<:Model}) = "unknown"
package_url(::Type{<:Model}) = "unknown"
is_wrapper(::Type{<:Model}) = false
is_wrapper(m::Model) = is_wrapper(typeof(m))

target_scitype_union(model::Model) = target_scitype_union(typeof(model))
input_scitype_union(model::Model) = input_scitype_union(typeof(model))
input_is_multivariate(model::Model) = input_is_multivariate(typeof(model))
# Catch-all trait values, returned when no more specific method has been
# defined for the argument: scientific-type traits fall back to `Unknown`,
# string-valued traits to "unknown", and boolean traits to `false`.
input_scitype(::Any) = Unknown
output_scitype(::Any) = Unknown
target_scitype(::Any) = Unknown
is_pure_julia(::Any) = false
package_name(::Any) = "unknown"
package_license(::Any) = "unknown" # was misspelt "unkown"; now matches sibling traits
load_path(::Any) = "unknown"
package_uuid(::Any) = "unknown"
package_url(::Any) = "unknown"
is_wrapper(::Any) = false
supports_weights(::Any) = false

input_scitype(model::Model) = input_scitype(typeof(model))
output_scitype(model::Model) = output_scitype(typeof(model))
target_scitype(model::Model) = target_scitype(typeof(model))
is_pure_julia(model::Model) = is_pure_julia(typeof(model))
package_name(model::Model) = package_name(typeof(model))
load_path(model::Model) = load_path(typeof(model))
package_uuid(model::Model) = package_uuid(typeof(model))
package_url(model::Model) = package_url(typeof(model))
is_wrapper(m::Model) = is_wrapper(typeof(m))

# probabilistic supervised models may also overload one or more of
# `predict_mode`, `predict_median` and `predict_mean` defined below.
Expand Down
117 changes: 16 additions & 101 deletions src/data.jl
Original file line number Diff line number Diff line change
@@ -1,35 +1,4 @@
## CATEGORICAL ARRAY DECODER UTILITY

# """
# reconstruct(A)

# For reconstructing categorical arrays from their elements alone. Here
# `A` is of type `AbstractArray{T}` where `T` is a subtype of
# `CategoricalString` or `CategoricalValue`. The function `reconstruct` has
# the property that `reconstruct(broadcast(identity, C)) == C`, whenever `C`
# is a `CategoricalArray`. In other words, `reconstruct` is a left-inverse
# for the function `C -> broadcast(identity, C)` that strips a
# CategoricalArray of its "categorical wrapper".

# Does not handle missing values.

# """
# function reconstruct(A::AbstractArray{<:CategoricalValue{T},N}) where {T,N}
# firstnonmissing = findfirst(x->!ismissing(x), A)
# isnothing(firstnonmissing) && error("No non-missing values encountered. ")
# pool = A[firstnonmissing].pool
# refs = broadcast(x -> x.level, A)
# return CategoricalArray{T,N}(refs, pool)
# end
# function reconstruct(A::AbstractArray{<:CategoricalString,N}) where {T,N}
# firstnonmissing = findfirst(x->!ismissing(x), A)
# isnothing(firstnonmissing) && error("No non-missing values encountered. ")
# pool = A[firstnonmissing].pool
# refs = broadcast(x -> x.level, A)
# return CategoricalArray{String,N}(refs, pool)
# end

CategoricalElement = Union{CategoricalValue,CategoricalString}
CategoricalElement{U} = Union{CategoricalValue{<:Any,U},CategoricalString{U}}

"""
classes(x)
Expand Down Expand Up @@ -61,7 +30,8 @@ function classes(x::CategoricalElement)
return [p.valindex[p.invindex[v]] for v in p.levels]
end

raw(x::CategoricalElement) = x.pool.index[x.level] # a method just for testing
# a method just for testing:
raw(x::CategoricalElement) = x.pool.index[x.level]

"""
int(x)
Expand Down Expand Up @@ -95,8 +65,11 @@ Broadcasted versions of `int`.
See also: [`decoder`](@ref).
"""
int(x::CategoricalElement) = x.pool.order[x.pool.invindex[x]]
int(X::CategoricalArray) = broadcast(r -> X.pool.order[r], X.refs)
int(V::Array{<:CategoricalElement}) = broadcast(int, V)
int(A::AbstractArray{<:CategoricalElement}) = broadcast(int, A)
# workaround for CategoricalArrays issue
# https://github.com/JuliaData/CategoricalArrays.jl/issues/199:
# function int(X::CategoricalArray)


struct CategoricalDecoder{T,R} # <: MLJType
pool::CategoricalPool{T,R}
Expand Down Expand Up @@ -144,35 +117,10 @@ decoder(element::CategoricalElement) =

## TABULAR DATA

const istable = Tables.istable

# hack for detecting JuliaDB.NDSparse tables without loading as dependency:
isndsparse(X) = isdefined(X, :data_buffer)


"""
container_type(X)
Return `:table`, `:sparse`, or `:other`, according to whether `X` is a
supported table format, a supported sparse table format, or something
else.
The first two formats, together abstract vectors, support the
`MLJBase` accessor methods `selectrows`, `selectcols`, `select`,
`nrows`, `schema`, and `union_scitypes`.
"""
function container_type(X)
if istable(X)
return :table
elseif isndsparse(X)
return :sparse
else
return :other
end
end


## UTILITY FOR CONVERTING BETWEEN TABULAR DATA AND MATRICES

"""
Expand All @@ -186,7 +134,7 @@ returned. The integer relabelling of column names follows the
lexicographic ordering (as indicated by `schema(X).names`).
"""
matrix(X) = matrix(Val(container_type(X)), X)
matrix(X) = matrix(Val(ScientificTypes.trait(X)), X)
# Fallback for objects whose trait is `:other` and that have no more specific
# method. The exception must be instantiated: `throw(ArgumentError)` would
# throw the type object itself, which `catch e; e isa ArgumentError` misses.
matrix(::Val{:other}, X) =
    throw(ArgumentError("`matrix` does not support objects of type $(typeof(X))."))
matrix(::Val{:other}, X::AbstractMatrix) = X

Expand Down Expand Up @@ -229,7 +177,7 @@ named tuple of columns of `X`, with `keys(cols) = names`.
"""
function table(cols::NamedTuple; prototype=cols)
istable(prototype) || error("prototype is not tabular.")
Tables.istable(prototype) || error("prototype is not tabular.")
return Tables.materializer(prototype)(cols)
end
function table(X::AbstractMatrix; names=nothing, prototype=nothing)
Expand All @@ -246,7 +194,6 @@ end

## UNIFIED API FOR ACCESSING TABLES, MATRICES AND VECTORS


"""
selectrows(X, r)
Expand All @@ -256,7 +203,7 @@ table of the preferred sink type of `typeof(X)`, even a single row is
selected.
"""
selectrows(X, r) = selectrows(Val(container_type(X)), X, r)
selectrows(X, r) = selectrows(Val(ScientificTypes.trait(X)), X, r)
# Fallback for objects whose trait is `:other` and that have no more specific
# method. The exception must be instantiated: `throw(ArgumentError)` would
# throw the type object itself rather than an `ArgumentError` instance.
selectrows(::Val{:other}, X, r) =
    throw(ArgumentError("`selectrows` does not support objects of type $(typeof(X))."))

"""
Expand All @@ -269,7 +216,7 @@ object returned is a table of the preferred sink type of
or `CategoricalVector` is returned.
"""
selectcols(X, c) = selectcols(Val(container_type(X)), X, c)
selectcols(X, c) = selectcols(Val(ScientificTypes.trait(X)), X, c)
# Fallback for objects whose trait is `:other` and that have no more specific
# method. The exception must be instantiated: `throw(ArgumentError)` would
# throw the type object itself rather than an `ArgumentError` instance.
selectcols(::Val{:other}, X, c) =
    throw(ArgumentError("`selectcols` does not support objects of type $(typeof(X))."))

"""
Expand All @@ -282,26 +229,16 @@ Select element of a table or sparse table at row `r` and column
See also: [`selectrows`](@ref), [`selectcols`](@ref).
"""
select(X, r, c) = select(Val(container_type(X)), X, r, c)
select(X, r, c) = select(Val(ScientificTypes.trait(X)), X, r, c)
# Fallback for objects whose trait is `:other` and that have no more specific
# method. The exception must be instantiated: `throw(ArgumentError)` would
# throw the type object itself rather than an `ArgumentError` instance.
select(::Val{:other}, X, r, c) =
    throw(ArgumentError("`select` does not support objects of type $(typeof(X))."))

"""
schema(X)
Returns a struct with properties `names`, `types`
with the obvious meanings. Here `X` is any table or sparse table.
"""
schema(X) = schema(Val(container_type(X)), X)
schema(::Val{:other}, X) = throw(ArgumentError)

"""
nrows(X)
Return the number of rows in a table, sparse table, or abstract vector.
"""
nrows(X) = nrows(Val(container_type(X)), X)
nrows(X) = nrows(Val(ScientificTypes.trait(X)), X)
# Fallback for objects whose trait is `:other` and that have no more specific
# method. The exception must be instantiated: `throw(ArgumentError)` would
# throw the type object itself rather than an `ArgumentError` instance.
nrows(::Val{:other}, X) =
    throw(ArgumentError("`nrows` does not support objects of type $(typeof(X))."))


Expand Down Expand Up @@ -359,24 +296,7 @@ select(::Val{:table}, X, r::Integer, c) = selectcols(selectrows(X, r), c)
select(::Val{:table}, X, r, c::Symbol) = selectcols(X, c)[r]
select(::Val{:table}, X, r, c) = selectcols(selectrows(X, r), c)

function schema(::Val{:table}, X)
istable(X) || throw(ArgumentError)
if !Tables.columnaccess(X)
return Tables.schema(Tables.rows(X))
else
return Tables.schema(Tables.columns(X))
end
end

function nrows(::Val{:table}, X)
if !Tables.columnaccess(X)
return length(collect(X))
else
cols = Tables.columntable(X)
!isempty(cols) || return 0
return length(cols[1])
end
end
nrows(::Val{:table}, X) = schema(X).nrows


## ACCESSORS FOR ABSTRACT VECTORS
Expand All @@ -385,7 +305,7 @@ selectrows(::Val{:other}, v::AbstractVector, r) = v[r]
nrows(::Val{:other}, v::AbstractVector) = length(v)
selectrows(::Val{:other}, v::CategoricalVector, r) = @inbounds v[r]


## to be replaced (not used anywhere):
## ACCESSORS FOR JULIA NDSPARSE ARRAYS (N=2)

nrows(::Val{:sparse}, X) = maximum([r[1] for r in keys(X)])
Expand All @@ -408,8 +328,3 @@ select(::Val{:sparse}, X, r::Integer, c::AbstractVector{Symbol}) = X[r,sort(c)]
select(::Val{:sparse}, X, r::Integer, ::Colon) = X[r,:]
select(::Val{:sparse}, X, r, c) = X[r,sort(c)]

function schema(::Val{:sparse}, X)
names = sort(unique([r[2] for r in keys(X)]))
types = [eltype(selectcols(X, name)) for name in names]
return Tables.Schema(names, types)
end
2 changes: 0 additions & 2 deletions src/datasets.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ having six numerical and six categorical features."""
function load_reduced_ames()
df = CSV.read(joinpath(datadir, "reduced_ames.csv"), copycols=true,
categorical=true)
df[:target] = exp.(df[:target])
# TODO: uncomment following after julia #29501 is resolved
# df.OverallQual = categorical(df.OverallQual, ordered=true)
# df[:GarageCars] = categorical(df[:GarageCars], ordered=true)
Expand All @@ -36,7 +35,6 @@ end
function load_ames()
df = CSV.read(joinpath(datadir, "ames.csv"), copycols=true,
categorical=true)
df[:target] = exp.(df[:target])
return SupervisedTask(verbosity=0, data=df,
target=:target,
ignore=[:Id,],
Expand Down
Loading

0 comments on commit f36dcd0

Please sign in to comment.