Skip to content

Commit

Permalink
Compat: DataFrames and CategoricalArrays (#72)
Browse files Browse the repository at this point in the history
  • Loading branch information
greimel committed May 13, 2020
1 parent 5a04c97 commit 030b0fb
Show file tree
Hide file tree
Showing 7 changed files with 85 additions and 14 deletions.
7 changes: 4 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "RData"
uuid = "df47a6cb-8c03-5eed-afd8-b6050d6c41da"
version = "0.7.1"
version = "0.7.2"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
Expand All @@ -10,11 +10,12 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
TimeZones = "f269a46b-ccf7-5d73-abea-4c690281aa53"
Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"

[compat]
CategoricalArrays = "0.5, 0.6, 0.7, 0.8"
CategoricalArrays = "0.8"
CodecZlib = "0.4, 0.5, 0.6, 0.7"
DataFrames = "0.19, 0.20"
DataFrames = "0.21"
FileIO = "1.0.5"
Requires = "1.0.0"
TimeZones = "0.7, 0.8, 0.9, 0.10, 1.0"
Expand Down
46 changes: 46 additions & 0 deletions src/DictoVec.jl
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,49 @@ function Base.convert(::Type{Dict{RString,Any}}, dv::DictoVec)
end
return res
end

# RESERVED_WORDS, identifier, makeidentifier were originally part of DataFrames v0.20
"""
    RESERVED_WORDS

Set of Julia keyword strings. `identifier` checks candidate names against this
set and prefixes an underscore to any name found in it.
"""
const RESERVED_WORDS = Set{String}([
    "local", "global", "export", "let", "for", "struct", "while",
    "const", "continue", "import", "function", "if", "else", "try",
    "begin", "break", "catch", "return", "using", "baremodule",
    "macro", "finally", "module", "elseif", "end", "quote", "do",
])

"""
    identifier(s::AbstractString) -> Symbol

Convert `s` into a `Symbol` usable as a Julia identifier.

The string is first passed through `Unicode.normalize`. If the result is not
accepted by `Base.isidentifier`, it is rewritten with `makeidentifier`. A name
that is a member of `RESERVED_WORDS` gets a leading underscore.
"""
function identifier(s::AbstractString)
    name = Unicode.normalize(s)
    # Only rewrite names that Julia would not accept as-is.
    Base.isidentifier(name) || (name = makeidentifier(name))
    # Keywords cannot be used directly, so prefix them with an underscore.
    name in RESERVED_WORDS && (name = "_" * name)
    return Symbol(name)
end

"""
    makeidentifier(s::AbstractString) -> String

Build an identifier-like string from `s`.

- An empty `s` yields `"x"`.
- If the first character may start an identifier it is kept; if it is only
  valid inside an identifier it is kept with an `'x'` prepended; otherwise it
  is replaced by `'_'`.
- Each following character is kept when it is a valid identifier character
  other than `'_'`. Every other character (including `'_'` itself) becomes
  `'_'`, and consecutive underscores are collapsed into one.
"""
function makeidentifier(s::AbstractString)
    isempty(s) && return "x"

    # One spare byte covers the 'x' that may be prepended to the first character.
    out = IOBuffer(zeros(UInt8, sizeof(s)+1), write=true)

    head, tail = Iterators.peel(s)

    # `lastunder` records whether the most recently written character was '_'.
    if Base.is_id_start_char(head)
        write(out, head)
        lastunder = head == '_'
    elseif Base.is_id_char(head)
        write(out, 'x', head)
        lastunder = false
    else
        write(out, '_')
        lastunder = true
    end

    for ch in tail
        if ch != '_' && Base.is_id_char(ch)
            write(out, ch)
            lastunder = false
        elseif !lastunder
            # Replace an invalid character (or '_') unless one was just written.
            write(out, '_')
            lastunder = true
        end
    end

    return String(take!(out))
end
3 changes: 1 addition & 2 deletions src/RData.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
module RData

using DataFrames, CategoricalArrays, FileIO, TimeZones
import DataFrames: identifier
using DataFrames, CategoricalArrays, FileIO, TimeZones, Unicode

export
sexp2julia,
Expand Down
24 changes: 20 additions & 4 deletions src/convert.jl
Original file line number Diff line number Diff line change
Expand Up @@ -95,13 +95,29 @@ end
function jlvec(::Type{CategoricalArray}, ri::RVEC, force_missing::Bool=true)
@assert isfactor(ri)

rlevels = getattr(ri, "levels")
rlevels0 = getattr(ri, "levels")
sz0 = length(rlevels0)
# CategoricalArrays#v0.8 does not allow duplicate levels
rlevels = unique(rlevels0)
sz = length(rlevels)
REFTYPE = sz <= typemax(UInt8) ? UInt8 :
sz <= typemax(UInt16) ? UInt16 :
sz <= typemax(UInt32) ? UInt32 :
hasduplicates = sz0 != sz

REFTYPE = sz0 <= typemax(UInt8) ? UInt8 :
sz0 <= typemax(UInt16) ? UInt16 :
sz0 <= typemax(UInt32) ? UInt32 :
UInt64
refs = na2zero(REFTYPE, ri.data)

if hasduplicates
# map refs with dups to unique refs
ref_map = REFTYPE.(indexin(rlevels0, rlevels))
@inbounds for i in eachindex(refs)
ref = refs[i]
refs[i] = ref == 0 ? 0 : ref_map[ref]
end
@warn "Dropped duplicate factor levels"
end

anyna = any(iszero, refs)
pool = CategoricalPool{String, REFTYPE}(rlevels, inherits(ri, "ordered"))
if force_missing || anyna
Expand Down
17 changes: 13 additions & 4 deletions test/RDA.jl
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ using RData
int = Int32[1, 2],
logi = [true, false],
chr = ["ab", "c"],
factor = categorical(["ab", "c"], true),
factor = categorical(["ab", "c"], compress=true),
cplx = [1.1+0.5im, 1.0im])
rdf = sexp2julia(load(joinpath(rdata_path, "types.rda"), convert=false)["df"])
@test eltype.(eachcol(rdf)) == eltype.(eachcol(df))
Expand All @@ -62,7 +62,7 @@ using RData
int = Union{Int32, Missing}[1, 2],
logi = Union{Bool, Missing}[true, false],
chr = Union{String, Missing}["ab", "c"],
factor = categorical(Union{String, Missing}["ab", "c"], true),
factor = categorical(Union{String, Missing}["ab", "c"], compress=true),
cplx = Union{ComplexF64, Missing}[1.1+0.5im, 1.0im])

df[2, :] .= Ref(missing)
Expand All @@ -75,10 +75,10 @@ using RData

@testset "Column names conversion" begin
rda_names = names(sexp2julia(load(joinpath(rdata_path, "names.rda"), convert=false)["df"]))
expected_names = [:_end, :x!, :x1, :_B_C_, :x, :x_1]
expected_names = ["_end", "x!", "x1", "_B_C_", "x", "x_1"]
@test rda_names == expected_names
rda_names = names(sexp2julia(load(joinpath(rdata_path, "names_ascii.rda"), convert=false)["df"]))
@test rda_names == [:_end, :x!, :x1, :_B_C_, :x, :x_1]
@test rda_names == expected_names
end

@testset "Reading RDA with complex types (environments, closures etc)" begin
Expand Down Expand Up @@ -122,4 +122,13 @@ end # for ver in ...
@test size(altrep_conv_rda["nonnilpairlist"]) == (0, 10)
end

@testset "Duplicate levels in factor (version=3)" begin
dup_cat = sexp2julia(load(joinpath("data_v3", "dup_levels.rda"), convert=false)["dup_levels"])
@test dup_cat[1] == "Paced"
@test dup_cat[2] == "Inferior"
@test dup_cat[end] == "Anterior"
@test levels(dup_cat) ==
["Inferior", "Anterior", "LBBB", "Missing", "NoSTUp", "OtherSTUp", "Paced"]
end

end # module TestRDA
2 changes: 1 addition & 1 deletion test/RDS.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ using TimeZones
int = Int32[1, 2],
logi = [true, false],
chr = ["ab", "c"],
factor = categorical(["ab", "c"], true),
factor = categorical(["ab", "c"], compress=true),
cplx = ComplexF64[1.1+0.5im, 1.0im])
rdf = sexp2julia(load(joinpath(rdata_path, "types.rds"), convert=false))
@test rdf isa DataFrame
Expand Down
Binary file added test/data_v3/dup_levels.rda
Binary file not shown.

2 comments on commit 030b0fb

@nalimilan
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/14665

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.7.2 -m "<description of version>" 030b0fb5aae7a4323109b4f8e2397c201a7207d5
git push origin v0.7.2

Please sign in to comment.