Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

do not require DataFrames.jl when output doesn't contain DataFrames #185

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 13 additions & 10 deletions src/datasets/misc/boston_housing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -77,21 +77,24 @@ end
function BostonHousing(; as_df = true, dir = nothing)
@assert dir === nothing "custom `dir` is not supported at the moment."
path = joinpath(@__DIR__, "..", "..", "..", "data", "boston_housing.csv")
df = read_csv(path)
features = df[!, DataFrames.Not(:MEDV)]
targets = df[!, [:MEDV]]
t = read_csv(path, CSV.File)
colnames = Tables.columnnames(t)
features = table_to_matrix(t, select = colnames[1:end-1])
targets = table_to_matrix(t, select = colnames[end:end])


metadata = Dict{String, Any}()
metadata["path"] = path
metadata["feature_names"] = names(features)
metadata["target_names"] = names(targets)
metadata["n_observations"] = size(targets, 1)
metadata["feature_names"] = colnames[1:end-1]
metadata["target_names"] = colnames[end:end]
metadata["n_observations"] = size(features, 1)
metadata["description"] = BOSTONHOUSING_DESCR

if !as_df
features = df_to_matrix(features)
targets = df_to_matrix(targets)
df = nothing
df = nothing
if as_df
df = table_to_df(t, names = colnames)
features = matrix_to_df(features, names = colnames[1:end-1])
targets = matrix_to_df(targets, names = colnames[end:end])
end

return BostonHousing(metadata, features, targets, df)
Expand Down
29 changes: 15 additions & 14 deletions src/datasets/misc/iris.jl
Original file line number Diff line number Diff line change
Expand Up @@ -79,22 +79,23 @@ end

function Iris(; dir = nothing, as_df = true)
path = datafile("Iris", "iris.data", dir)
df = read_csv(path, header=0)
DataFrames.rename!(df, ["sepallength", "sepalwidth", "petallength", "petalwidth", "class"])

features = df[!, DataFrames.Not(:class)]
targets = df[!, [:class]]

t = read_csv(path, CSV.File, header=0)
colnames = Tables.columnnames(t)
truecolnames = ["sepallength", "sepalwidth", "petallength", "petalwidth", "class"]
features = table_to_matrix(t, select = colnames[1:end-1])
targets = table_to_matrix(t, select = colnames[end:end])
metadata = Dict{String, Any}()
metadata["path"] = path
metadata["n_observations"] = size(df, 1)
metadata["feature_names"] = names(features)
metadata["target_names"] = names(targets)

if !as_df
features = df_to_matrix(features)
targets = df_to_matrix(targets)
df = nothing
metadata["n_observations"] = size(features, 1)
metadata["feature_names"] = truecolnames[1:end-1]
metadata["target_names"] = truecolnames[end:end]

df = nothing
if as_df
df = table_to_df(t, names = truecolnames)
features = matrix_to_df(features, names = truecolnames[1:end-1])
targets = matrix_to_df(targets, names = truecolnames[end:end])
end

return Iris(metadata, features, targets, df)
Expand Down
25 changes: 14 additions & 11 deletions src/datasets/misc/titanic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -61,22 +61,25 @@ end
function Titanic(; as_df = true, dir = nothing)
@assert dir === nothing "custom `dir` is not supported at the moment."
path = joinpath(@__DIR__, "..", "..", "..", "data", "titanic.csv")
df = read_csv(path)

features = df[!, DataFrames.Not(:Survived)]
targets = df[!, [:Survived]]
t = read_csv(path, CSV.File)
colnames = Tables.columnnames(t)
ncols = length(colnames)
# :Survived is the second column
features = table_to_matrix(t, select = colnames[[1; 3:ncols]])
targets = table_to_matrix(t, select = colnames[2:2])

metadata = Dict{String, Any}()
metadata["path"] = path
metadata["feature_names"] = names(features)
metadata["target_names"] = names(targets)
metadata["n_observations"] = size(df, 1)
metadata["feature_names"] = colnames[[1; 3:ncols]]
metadata["target_names"] = colnames[2:2]
metadata["n_observations"] = size(features, 1)
metadata["description"] = TITANIC_DESCR

if !as_df
features = df_to_matrix(features)
targets = df_to_matrix(targets)
df = nothing
df = nothing
if as_df
df = table_to_df(t, names = colnames)
features = matrix_to_df(features, names = colnames[[1; 3:ncols]])
targets = matrix_to_df(targets, names = colnames[2:2])
end

return Titanic(metadata, features, targets, df)
Expand Down
11 changes: 5 additions & 6 deletions src/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,12 @@ function read_csv(path; kws...)
return read_csv_asdf(path; kws...)
end

# function read_csv(path, sink::Type{<:AbstractMatrix{T}}; delim=nothing, kws...) where T
# x = delim === nothing ? readdlm(path, T; kws...) : readdlm(path, delim, T; kws...)
# return x
# end

"""
    read_csv(path, sink::Type{<:AbstractMatrix}; kws...)

Read the CSV file at `path` and return its columns as a matrix of type `sink`.

The file is parsed through the `CSV.File` method of `read_csv` and its columns are
concatenated with `table_to_matrix`, so no `DataFrame` is built on this path.
All keyword arguments `kws` are forwarded to the `CSV.File` reader.

# Returns
The concatenated columns converted to the requested matrix type `sink`.
"""
function read_csv(path, sink::Type{A}; kws...) where A <: AbstractMatrix
    table = read_csv(path, CSV.File; kws...)
    # Honour the requested sink type. `convert` is a no-op when the concatenated
    # result already is an `A`, and it also accepts abstract sinks such as
    # `AbstractMatrix`, which have no constructor to call.
    return convert(A, table_to_matrix(table))
end

"""
    read_csv(path, sink::Type{CSV.File}; kws...)

Open the file at `path` as a `CSV.File`, forwarding every keyword argument in `kws`
to the `CSV.File` constructor unchanged.
"""
read_csv(path, sink::Type{CSV.File}; kws...) = CSV.File(path; kws...)

function read_csv_asdf(path; kws...)
Expand Down
25 changes: 25 additions & 0 deletions src/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,31 @@ function restrict_array_type(res::AbstractArray)
end
end

"""
    table_to_matrix(t; select = nothing)

Horizontally concatenate columns of the table `t` into a single array.

# Keywords
- `select`: iterable of column names to extract, in the order they should appear in
  the output. When `nothing` (the default), every column reported by
  `Tables.columnnames` is used.

# Returns
The result of `hcat` applied to the chosen columns, one output column per name.
"""
function table_to_matrix(t; select = nothing)
    # `Tables.columnnames` and `Tables.getcolumn` are specified for the object returned
    # by `Tables.columns`, not for arbitrary table sources. Going through it keeps the
    # column-oriented case unchanged and makes row-oriented sources work as well.
    cols = Tables.columns(t)
    cnames = select === nothing ? Tables.columnnames(cols) : select
    return hcat((Tables.getcolumn(cols, n) for n in cnames)...)
end

"""
    table_to_df(t; names = nothing)

Build a `DataFrames.DataFrame` from the table `t`.

When `names` is given, the columns of the new data frame are renamed in place to
`names` before it is returned; otherwise the names taken from `t` are kept.
"""
function table_to_df(t; names = nothing)
    frame = DataFrames.DataFrame(t)
    # Renaming is optional: skip it entirely when the caller supplied no names.
    names === nothing || DataFrames.rename!(frame, names)
    return frame
end

"""
    matrix_to_df(a::AbstractMatrix; names = nothing)

Wrap the matrix `a` in a `DataFrames.DataFrame` with auto-generated column names.

When `names` is given, the columns are renamed in place to `names` before the data
frame is returned.
"""
function matrix_to_df(a::AbstractMatrix; names = nothing)
    frame = DataFrames.DataFrame(a, :auto)
    # Nothing to rename: hand back the frame with its auto-generated names.
    names === nothing && return frame
    DataFrames.rename!(frame, names)
    return frame
end

function df_to_matrix(df)
x = Matrix(df)
if size(x, 2) == 1
Expand Down
5 changes: 2 additions & 3 deletions test/test_utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,11 @@ function test_inmemory_supervised_table_dataset(d::D;
@test size(d.features) == (n_obs, n_features)
@test size(d.targets) == (n_obs, n_targets)

# check that dataframe shares the same storage of features and targets
for c in names(d.dataframe)
if c in names(d.targets)
@test d.dataframe[!, c] === d.targets[!,c]
@test d.dataframe[!, c] == d.targets[!,c]
else
@test d.dataframe[!, c] === d.features[!,c]
@test d.dataframe[!, c] == d.features[!,c]
end
end

Expand Down