Compat: DataFrames and CategoricalArrays (#72)

JuliaData · nalimilan committed on May 13, 2020 · 030b0fb
1 parent 5a04c97
commit 030b0fb
Show file tree

Hide file tree

Showing 7 changed files with 85 additions and 14 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,6 +1,6 @@
 name = "RData"
 uuid = "df47a6cb-8c03-5eed-afd8-b6050d6c41da"
-version = "0.7.1"
+version = "0.7.2"
 
 [deps]
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
@@ -10,11 +10,12 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 TimeZones = "f269a46b-ccf7-5d73-abea-4c690281aa53"
+Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
 
 [compat]
-CategoricalArrays = "0.5, 0.6, 0.7, 0.8"
+CategoricalArrays = "0.8"
 CodecZlib = "0.4, 0.5, 0.6, 0.7"
-DataFrames = "0.19, 0.20"
+DataFrames = "0.21"
 FileIO = "1.0.5"
 Requires = "1.0.0"
 TimeZones = "0.7, 0.8, 0.9, 0.10, 1.0"

diff --git a/src/DictoVec.jl b/src/DictoVec.jl
@@ -125,3 +125,49 @@ function Base.convert(::Type{Dict{RString,Any}}, dv::DictoVec)
     end
     return res
 end
+
+# RESERVED_WORDS, identifier and makeidentifier were originally part of DataFrames v0.20; they are kept here so that RData can turn arbitrary strings into identifier-safe names itself.
+const RESERVED_WORDS = Set(["local", "global", "export", "let",  # names that identifier() prefixes with "_"
+    "for", "struct", "while", "const", "continue", "import",
+    "function", "if", "else", "try", "begin", "break", "catch",
+    "return", "using", "baremodule", "macro", "finally",
+    "module", "elseif", "end", "quote", "do"])
+
+function identifier(s::AbstractString)  # convert an arbitrary string into a Symbol that is usable as an identifier
+    s = Unicode.normalize(s)  # normalize first so that the checks below operate on the normalized text
+    if !Base.isidentifier(s)
+        s = makeidentifier(s)  # rewrite strings that are not syntactically valid identifiers
+    end
+    Symbol(in(s, RESERVED_WORDS) ? "_"*s : s)  # reserved words get a leading underscore, e.g. "end" becomes :_end
+end
+
+function makeidentifier(s::AbstractString)  # build an identifier-like String from the characters of `s`
+    (iresult = iterate(s)) === nothing && return "x"  # empty input: fall back to the name "x"
+
+    res = IOBuffer(zeros(UInt8, sizeof(s)+1), write=true)  # presumably +1 byte because the first character may gain an 'x' prefix
+
+    (c, i) = iresult
+    under = if Base.is_id_start_char(c)  # `under` records whether the last character written was '_'
+        write(res, c)
+        c == '_'
+    elseif Base.is_id_char(c)
+        write(res, 'x', c)  # allowed inside an identifier but not as its first character: prefix it with 'x'
+        false
+    else
+        write(res, '_')  # not an identifier character at all: replace it with '_'
+        true
+    end
+
+    while (iresult = iterate(s, i)) !== nothing
+        (c, i) = iresult
+        if c != '_' && Base.is_id_char(c)
+            write(res, c)
+            under = false
+        elseif !under  # '_' or a non-identifier character: write a single '_' per consecutive run
+            write(res, '_')
+            under = true
+        end
+    end
+
+    return String(take!(res))
+end
diff --git a/src/RData.jl b/src/RData.jl
@@ -1,7 +1,6 @@
 module RData
 
-using DataFrames, CategoricalArrays, FileIO, TimeZones
-import DataFrames: identifier
+using DataFrames, CategoricalArrays, FileIO, TimeZones, Unicode
 
 export
     sexp2julia,

diff --git a/src/convert.jl b/src/convert.jl
@@ -95,13 +95,29 @@ end
 function jlvec(::Type{CategoricalArray}, ri::RVEC, force_missing::Bool=true)
     @assert isfactor(ri)
 
-    rlevels = getattr(ri, "levels")
+    rlevels0 = getattr(ri, "levels")
+    sz0 = length(rlevels0)
+    # CategoricalArrays#v0.8 does not allow duplicate levels
+    rlevels = unique(rlevels0)
     sz = length(rlevels)
-    REFTYPE = sz <= typemax(UInt8)  ? UInt8 :
-              sz <= typemax(UInt16) ? UInt16 :
-              sz <= typemax(UInt32) ? UInt32 :
+    hasduplicates = sz0 != sz
+
+    REFTYPE = sz0 <= typemax(UInt8)  ? UInt8 :
+              sz0 <= typemax(UInt16) ? UInt16 :
+              sz0 <= typemax(UInt32) ? UInt32 :
                                       UInt64
     refs = na2zero(REFTYPE, ri.data)
+
+    if hasduplicates
+        # map refs with dups to unique refs
+        ref_map = REFTYPE.(indexin(rlevels0, rlevels))
+        @inbounds for i in eachindex(refs)
+            ref = refs[i]
+            refs[i] = ref == 0 ? 0 : ref_map[ref]
+        end
+        @warn "Dropped duplicate factor levels"
+    end
+
     anyna = any(iszero, refs)
     pool = CategoricalPool{String, REFTYPE}(rlevels, inherits(ri, "ordered"))
     if force_missing || anyna

diff --git a/test/RDA.jl b/test/RDA.jl
@@ -47,7 +47,7 @@ using RData
                        int = Int32[1, 2],
                        logi = [true, false],
                        chr = ["ab", "c"],
-                       factor = categorical(["ab", "c"], true),
+                       factor = categorical(["ab", "c"], compress=true),
                        cplx = [1.1+0.5im, 1.0im])
         rdf = sexp2julia(load(joinpath(rdata_path, "types.rda"), convert=false)["df"])
         @test eltype.(eachcol(rdf)) == eltype.(eachcol(df))
@@ -62,7 +62,7 @@ using RData
                        int = Union{Int32, Missing}[1, 2],
                        logi = Union{Bool, Missing}[true, false],
                        chr = Union{String, Missing}["ab", "c"],
-                       factor = categorical(Union{String, Missing}["ab", "c"], true),
+                       factor = categorical(Union{String, Missing}["ab", "c"], compress=true),
                        cplx = Union{ComplexF64, Missing}[1.1+0.5im, 1.0im])
 
         df[2, :] .= Ref(missing)
@@ -75,10 +75,10 @@ using RData
 
     @testset "Column names conversion" begin
         rda_names = names(sexp2julia(load(joinpath(rdata_path, "names.rda"), convert=false)["df"]))
-        expected_names = [:_end, :x!, :x1, :_B_C_, :x, :x_1]
+        expected_names = ["_end", "x!", "x1", "_B_C_", "x", "x_1"]
         @test rda_names == expected_names
         rda_names = names(sexp2julia(load(joinpath(rdata_path, "names_ascii.rda"), convert=false)["df"]))
-        @test rda_names == [:_end, :x!, :x1, :_B_C_, :x, :x_1]
+        @test rda_names == expected_names
     end
 
     @testset "Reading RDA with complex types (environments, closures etc)" begin
@@ -122,4 +122,13 @@ end # for ver in ...
     @test size(altrep_conv_rda["nonnilpairlist"]) == (0, 10)
 end
 
+@testset "Duplicate levels in factor (version=3)" begin  # checks element values and levels of the factor stored in dup_levels.rda
+    dup_cat = sexp2julia(load(joinpath("data_v3", "dup_levels.rda"), convert=false)["dup_levels"])  # NOTE(review): path is relative to the working directory; confirm the tests are run from test/
+    @test dup_cat[1] == "Paced"
+    @test dup_cat[2] == "Inferior"
+    @test dup_cat[end] == "Anterior"
+    @test levels(dup_cat) ==
+        ["Inferior", "Anterior", "LBBB", "Missing", "NoSTUp", "OtherSTUp", "Paced"]
+end
+
 end # module TestRDA
diff --git a/test/RDS.jl b/test/RDS.jl
@@ -13,7 +13,7 @@ using TimeZones
                        int = Int32[1, 2],
                        logi = [true, false],
                        chr = ["ab", "c"],
-                       factor = categorical(["ab", "c"], true),
+                       factor = categorical(["ab", "c"], compress=true),
                        cplx = ComplexF64[1.1+0.5im, 1.0im])
         rdf = sexp2julia(load(joinpath(rdata_path, "types.rds"), convert=false))
         @test rdf isa DataFrame

diff --git a/test/data_v3/dup_levels.rda b/test/data_v3/dup_levels.rda