From b3e2fc4c8eb785895720efcba823ba58ea602d32 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Thu, 27 Oct 2022 06:47:39 +0200 Subject: [PATCH 1/4] do not require DataFrame when output is not DataFrame --- src/datasets/misc/iris.jl | 29 +++++++++++++++-------------- src/io.jl | 11 +++++------ src/utils.jl | 25 +++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 20 deletions(-) diff --git a/src/datasets/misc/iris.jl b/src/datasets/misc/iris.jl index 0a5cd09a..60fcd836 100644 --- a/src/datasets/misc/iris.jl +++ b/src/datasets/misc/iris.jl @@ -79,22 +79,23 @@ end function Iris(; dir = nothing, as_df = true) path = datafile("Iris", "iris.data", dir) - df = read_csv(path, header=0) - DataFrames.rename!(df, ["sepallength", "sepalwidth", "petallength", "petalwidth", "class"]) - - features = df[!, DataFrames.Not(:class)] - targets = df[!, [:class]] - + t = read_csv(path, CSV.File, header=0) + colnames = Tables.columnnames(t) + truecolnames = ["sepallength", "sepalwidth", "petallength", "petalwidth", "class"] + features = table_to_matrix(t, select = colnames[1:4]) + targets = table_to_matrix(t, select = colnames[5:5]) + metadata = Dict{String, Any}() metadata["path"] = path - metadata["n_observations"] = size(df, 1) - metadata["feature_names"] = names(features) - metadata["target_names"] = names(targets) - - if !as_df - features = df_to_matrix(features) - targets = df_to_matrix(targets) - df = nothing + metadata["n_observations"] = size(features, 1) + metadata["feature_names"] = truecolnames[1:4] + metadata["target_names"] = truecolnames[5:5] + + df = nothing + if as_df + df = table_to_df(t, names = truecolnames) + features = matrix_to_df(features, names = truecolnames[1:4]) + targets = matrix_to_df(targets, names = truecolnames[5:5]) end return Iris(metadata, features, targets, df) diff --git a/src/io.jl b/src/io.jl index 168323b4..b4d0e467 100644 --- a/src/io.jl +++ b/src/io.jl @@ -3,13 +3,12 @@ function read_csv(path; kws...) 
return read_csv_asdf(path; kws...) end -# function read_csv(path, sink::Type{<:AbstractMatrix{T}}; delim=nothing, kws...) where T -# x = delim === nothing ? readdlm(path, T; kws...) : readdlm(path, delim, T; kws...) -# return x -# end - function read_csv(path, sink::Type{A}; kws...) where A <: AbstractMatrix - return A(read_csv(path; kws...)) + return table_to_matrix(read_csv(path, CSV.File; kws...)) +end + +function read_csv(path, sink::Type{CSV.File}; kws...) + return CSV.File(path; kws...) end function read_csv_asdf(path; kws...) diff --git a/src/utils.jl b/src/utils.jl index ada82445..f8d72bb1 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -21,6 +21,31 @@ function restrict_array_type(res::AbstractArray) end end +function table_to_matrix(t; select = nothing) + if select === nothing + cnames = Tables.columnnames(cols) + else + cnames = select + end + return hcat((Tables.getcolumn(t, n) for n in cnames)...) +end + +function table_to_df(t; names = nothing) + df = DataFrames.DataFrame(t) + if names !== nothing + DataFrames.rename!(df, names) + end + return df +end + +function matrix_to_df(a::AbstractMatrix; names = nothing) + df = DataFrames.DataFrame(a, :auto) + if names !== nothing + DataFrames.rename!(df, names) + end + return df +end + function df_to_matrix(df) x = Matrix(df) if size(x, 2) == 1 From a83184b489f6f77a29775039791890f0ce60e74b Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Thu, 27 Oct 2022 07:01:06 +0200 Subject: [PATCH 2/4] also boston housing and titanic --- src/datasets/misc/boston_housing.jl | 23 +++++++++++++---------- src/datasets/misc/iris.jl | 12 ++++++------ src/datasets/misc/titanic.jl | 25 ++++++++++++++----------- 3 files changed, 33 insertions(+), 27 deletions(-) diff --git a/src/datasets/misc/boston_housing.jl b/src/datasets/misc/boston_housing.jl index 53cc1f4d..a85ec31e 100644 --- a/src/datasets/misc/boston_housing.jl +++ b/src/datasets/misc/boston_housing.jl @@ -77,21 +77,24 @@ end function BostonHousing(; as_df = true, dir = 
nothing) @assert dir === nothing "custom `dir` is not supported at the moment." path = joinpath(@__DIR__, "..", "..", "..", "data", "boston_housing.csv") - df = read_csv(path) - features = df[!, DataFrames.Not(:MEDV)] - targets = df[!, [:MEDV]] + t = read_csv(path, CSV.File) + colnames = Tables.columnnames(t) + features = table_to_matrix(t, select = colnames[1:end-1]) + targets = table_to_matrix(t, select = colnames[end:end]) + metadata = Dict{String, Any}() metadata["path"] = path - metadata["feature_names"] = names(features) - metadata["target_names"] = names(targets) - metadata["n_observations"] = size(targets, 1) + metadata["feature_names"] = colnames[1:end-1] + metadata["target_names"] = colnames[end:end] + metadata["n_observations"] = size(features, 1) metadata["description"] = BOSTONHOUSING_DESCR - if !as_df - features = df_to_matrix(features) - targets = df_to_matrix(targets) - df = nothing + df = nothing + if as_df + df = table_to_df(t, names = colnames) + features = matrix_to_df(features, names = colnames[1:end-1]) + targets = matrix_to_df(targets, names = colnames[end:end]) end return BostonHousing(metadata, features, targets, df) diff --git a/src/datasets/misc/iris.jl b/src/datasets/misc/iris.jl index 60fcd836..53162451 100644 --- a/src/datasets/misc/iris.jl +++ b/src/datasets/misc/iris.jl @@ -82,20 +82,20 @@ function Iris(; dir = nothing, as_df = true) t = read_csv(path, CSV.File, header=0) colnames = Tables.columnnames(t) truecolnames = ["sepallength", "sepalwidth", "petallength", "petalwidth", "class"] - features = table_to_matrix(t, select = colnames[1:4]) - targets = table_to_matrix(t, select = colnames[5:5]) + features = table_to_matrix(t, select = colnames[1:end-1]) + targets = table_to_matrix(t, select = colnames[end:end]) metadata = Dict{String, Any}() metadata["path"] = path metadata["n_observations"] = size(features, 1) - metadata["feature_names"] = truecolnames[1:4] - metadata["target_names"] = truecolnames[5:5] + metadata["feature_names"] = 
truecolnames[1:end-1] + metadata["target_names"] = truecolnames[end:end] df = nothing if as_df df = table_to_df(t, names = truecolnames) - features = matrix_to_df(features, names = truecolnames[1:4]) - targets = matrix_to_df(targets, names = truecolnames[5:5]) + features = matrix_to_df(features, names = truecolnames[1:end-1]) + targets = matrix_to_df(targets, names = truecolnames[end:end]) end return Iris(metadata, features, targets, df) diff --git a/src/datasets/misc/titanic.jl b/src/datasets/misc/titanic.jl index 2bf4a04a..53fc0d5b 100644 --- a/src/datasets/misc/titanic.jl +++ b/src/datasets/misc/titanic.jl @@ -61,22 +61,25 @@ end function Titanic(; as_df = true, dir = nothing) @assert dir === nothing "custom `dir` is not supported at the moment." path = joinpath(@__DIR__, "..", "..", "..", "data", "titanic.csv") - df = read_csv(path) - - features = df[!, DataFrames.Not(:Survived)] - targets = df[!, [:Survived]] + t = read_csv(path, CSV.File) + colnames = Tables.columnnames(t) + ncols = length(colnames) + # :Survived is the second column + features = table_to_matrix(t, select = colnames[[1; 3:ncols]]) + targets = table_to_matrix(t, select = colnames[2:2]) metadata = Dict{String, Any}() metadata["path"] = path - metadata["feature_names"] = names(features) - metadata["target_names"] = names(targets) - metadata["n_observations"] = size(df, 1) + metadata["feature_names"] = colnames[[1; 3:ncols]] + metadata["target_names"] = colnames[2:2] + metadata["n_observations"] = size(features, 1) metadata["description"] = TITANIC_DESCR - if !as_df - features = df_to_matrix(features) - targets = df_to_matrix(targets) - df = nothing + df = nothing + if as_df + df = table_to_df(t, names = colnames) + features = matrix_to_df(features, names = colnames[[1; 3:ncols]]) + targets = matrix_to_df(targets, names = colnames[2:2]) end return Titanic(metadata, features, targets, df) From 4baf0a6ed2e736a1207939d4aebc1e2ef3e672db Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Thu, 27 Oct
2022 07:03:00 +0200 Subject: [PATCH 3/4] fix --- src/utils.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils.jl b/src/utils.jl index f8d72bb1..e5e4ca30 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -23,7 +23,7 @@ end function table_to_matrix(t; select = nothing) if select === nothing - cnames = Tables.columnnames(cols) + cnames = Tables.columnnames(t) else cnames = select end From 93d14fef513c664cc73cf1a7be6d9c1a17af1769 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Thu, 27 Oct 2022 13:12:34 +0200 Subject: [PATCH 4/4] relax test --- test/test_utils.jl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/test_utils.jl b/test/test_utils.jl index 60277c11..e26123e0 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -29,12 +29,11 @@ function test_inmemory_supervised_table_dataset(d::D; @test size(d.features) == (n_obs, n_features) @test size(d.targets) == (n_obs, n_targets) - # check that dataframe shares the same storage of features and targets for c in names(d.dataframe) if c in names(d.targets) - @test d.dataframe[!, c] === d.targets[!,c] + @test d.dataframe[!, c] == d.targets[!,c] else - @test d.dataframe[!, c] === d.features[!,c] + @test d.dataframe[!, c] == d.features[!,c] end end