From 1db6279eccced2dab8760d191f0fe5cbea9f779b Mon Sep 17 00:00:00 2001 From: tiemvanderdeure Date: Fri, 1 Nov 2024 15:18:42 +0100 Subject: [PATCH 01/17] make recode! type stable --- src/recode.jl | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/recode.jl b/src/recode.jl index 282d4fb6..4ae262b0 100644 --- a/src/recode.jl +++ b/src/recode.jl @@ -57,13 +57,11 @@ function recode!(dest::AbstractArray{T}, src::AbstractArray, default::Any, pairs throw(DimensionMismatch("dest and src must be of the same length (got $(length(dest)) and $(length(src)))")) end - opt_pairs = map(optimize_pair, pairs) - @inbounds for i in eachindex(dest, src) x = src[i] - for j in 1:length(opt_pairs) - p = opt_pairs[j] + for j in 1:length(pairs) + p = optimize_pair(pairs[j]) # we use isequal and recode_in because we cannot really distinguish scalars from collections if x ≅ p.first || recode_in(x, p.first) dest[i] = p.second @@ -101,9 +99,7 @@ function recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pa throw(DimensionMismatch("dest and src must be of the same length (got $(length(dest)) and $(length(src)))")) end - opt_pairs = map(optimize_pair, pairs) - - vals = T[p.second for p in opt_pairs] + vals = T[p.second for p in pairs] default !== nothing && push!(vals, default) levels!(dest.pool, filter!(!ismissing, unique(vals))) @@ -117,8 +113,8 @@ function recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pa @inbounds for i in eachindex(drefs, src) x = src[i] - for j in 1:length(opt_pairs) - p = opt_pairs[j] + for j in 1:length(pairs) + p = optimize_pair(pairs[j]) # we use isequal and recode_in because we cannot really distinguish scalars from collections if x ≅ p.first || recode_in(x, p.first) drefs[i] = dupvals ? pairmap[j] : j From ef10966e0092d8f86b6528e28e7a8736d9031a59 Mon Sep 17 00:00:00 2001 From: tiemvanderdeure Date: Wed, 20 Nov 2024 15:46:06 +0100 Subject: [PATCH 02/17] move optimize_pair away from main function --- src/recode.jl | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/src/recode.jl b/src/recode.jl index 4ae262b0..2b5624aa 100644 --- a/src/recode.jl +++ b/src/recode.jl @@ -52,16 +52,21 @@ A user defined type could override this method to define an appropriate test fun optimize_pair(pair::Pair) = pair optimize_pair(pair::Pair{<:AbstractArray}) = Set(pair.first) => pair.second -function recode!(dest::AbstractArray{T}, src::AbstractArray, default::Any, pairs::Pair...) where {T} +function recode!(dest::AbstractArray, src::AbstractArray, default::Any, pairs::Pair...) if length(dest) != length(src) throw(DimensionMismatch("dest and src must be of the same length (got $(length(dest)) and $(length(src)))")) end + opt_pairs = map(optimize_pair, pairs) + + _recode!(dest, src, default, opt_pairs...) +end + +function _recode!(dest::AbstractArray{T}, src::AbstractArray, default::Any, pairs::Pair...) where {T} @inbounds for i in eachindex(dest, src) x = src[i] - for j in 1:length(pairs) - p = optimize_pair(pairs[j]) + for p in pairs # we use isequal and recode_in because we cannot really distinguish scalars from collections if x ≅ p.first || recode_in(x, p.first) dest[i] = p.second @@ -94,11 +99,7 @@ function recode!(dest::AbstractArray{T}, src::AbstractArray, default::Any, pairs dest end -function recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pairs::Pair...) where {T} - if length(dest) != length(src) - throw(DimensionMismatch("dest and src must be of the same length (got $(length(dest)) and $(length(src)))")) - end - +function _recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pairs::Pair...) where {T} vals = T[p.second for p in pairs] default !== nothing && push!(vals, default) @@ -110,11 +111,12 @@ function recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pa drefs = dest.refs pairmap = [ismissing(v) ? 0 : get(dest.pool, v) for v in vals] defaultref = default === nothing || ismissing(default) ? 0 : get(dest.pool, default) + @inbounds for i in eachindex(drefs, src) x = src[i] - for j in 1:length(pairs) - p = optimize_pair(pairs[j]) + for j in eachindex(pairs) + p = pairs[j] # we use isequal and recode_in because we cannot really distinguish scalars from collections if x ≅ p.first || recode_in(x, p.first) drefs[i] = dupvals ? pairmap[j] : j @@ -164,13 +166,8 @@ function recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pa dest end -function recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray, +function _recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray, default::Any, pairs::Pair...) where {T, N, R<:Integer} - if length(dest) != length(src) - throw(DimensionMismatch("dest and src must be of the same length " * - "(got $(length(dest)) and $(length(src)))")) - end - vals = T[p.second for p in pairs] if default === nothing srclevels = levels(src) From 1688fb23db5c8a6d744d447d6db543cc11c3d9c7 Mon Sep 17 00:00:00 2001 From: tiemvanderdeure Date: Wed, 20 Nov 2024 16:00:45 +0100 Subject: [PATCH 03/17] improve type stability of recode_in --- src/recode.jl | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/recode.jl b/src/recode.jl index 2b5624aa..80c61714 100644 --- a/src/recode.jl +++ b/src/recode.jl @@ -46,8 +46,10 @@ The default method is to test if any element in the `collection` `isequal` to A user defined type could override this method to define an appropriate test function. """ @inline recode_in(x, ::Missing) = false +@inline recode_in(::Missing, ::Missing) = true @inline recode_in(x, collection::Set) = x in collection -@inline recode_in(x, collection) = any(x ≅ y for y in collection) +@inline recode_in(x, collection) = x ≅ collection || any(x ≅ y for y in collection) +@inline recode_in(x::T, y::T) where T = x === y optimize_pair(pair::Pair) = pair optimize_pair(pair::Pair{<:AbstractArray}) = Set(pair.first) => pair.second @@ -68,7 +70,7 @@ function _recode!(dest::AbstractArray{T}, src::AbstractArray, default::Any, pair for p in pairs # we use isequal and recode_in because we cannot really distinguish scalars from collections - if x ≅ p.first || recode_in(x, p.first) + if recode_in(x, p.first) dest[i] = p.second @goto nextitem end @@ -118,7 +120,7 @@ function _recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, p for j in eachindex(pairs) p = pairs[j] # we use isequal and recode_in because we cannot really distinguish scalars from collections - if x ≅ p.first || recode_in(x, p.first) + if recode_in(x, p.first) drefs[i] = dupvals ? pairmap[j] : j @goto nextitem end @@ -226,7 +228,7 @@ function _recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray, @inbounds for (i, l) in enumerate(srclevels) for j in 1:length(pairs) p = pairs[j] - if l ≅ p.first || recode_in(l, p.first) + if recode_in(l, p.first) levelsmap[i+1] = pairmap[j] @goto nextitem end From b412ce4e45fc48aed6842a2e350527462169d3e7 Mon Sep 17 00:00:00 2001 From: tiemvanderdeure Date: Wed, 20 Nov 2024 16:22:19 +0100 Subject: [PATCH 04/17] un-inline recode_in --- src/recode.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/recode.jl b/src/recode.jl index 80c61714..c72b8788 100644 --- a/src/recode.jl +++ b/src/recode.jl @@ -45,11 +45,11 @@ The default method is to test if any element in the `collection` `isequal` to `x`. For `Set`s `in` is used as it is faster than the default method and equivalent to it. A user defined type could override this method to define an appropriate test function. """ -@inline recode_in(x, ::Missing) = false -@inline recode_in(::Missing, ::Missing) = true -@inline recode_in(x, collection::Set) = x in collection -@inline recode_in(x, collection) = x ≅ collection || any(x ≅ y for y in collection) -@inline recode_in(x::T, y::T) where T = x === y +recode_in(x, ::Missing) = false +recode_in(::Missing, ::Missing) = true +recode_in(x, collection::Set) = x in collection +recode_in(x, collection) = x ≅ collection || any(x ≅ y for y in collection) +recode_in(x::T, y::T) where T = x === y optimize_pair(pair::Pair) = pair optimize_pair(pair::Pair{<:AbstractArray}) = Set(pair.first) => pair.second From 0581607316b5260ca05310417e6f46903cadd00d Mon Sep 17 00:00:00 2001 From: tiemvanderdeure Date: Fri, 22 Nov 2024 14:16:42 +0100 Subject: [PATCH 05/17] pass first and last pairs as tuples --- src/recode.jl | 52 +++++++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/src/recode.jl b/src/recode.jl index c72b8788..1b6aeae9 100644 --- a/src/recode.jl +++ b/src/recode.jl @@ -45,11 +45,8 @@ The default method is to test if any element in the `collection` `isequal` to `x`. For `Set`s `in` is used as it is faster than the default method and equivalent to it. A user defined type could override this method to define an appropriate test function. """ -recode_in(x, ::Missing) = false -recode_in(::Missing, ::Missing) = true -recode_in(x, collection::Set) = x in collection -recode_in(x, collection) = x ≅ collection || any(x ≅ y for y in collection) -recode_in(x::T, y::T) where T = x === y +@inline recode_in(x, collection) = any(x ≅ y for y in collection) +@inline recode_in(x, ::Missing) = false optimize_pair(pair::Pair) = pair optimize_pair(pair::Pair{<:AbstractArray}) = Set(pair.first) => pair.second @@ -59,21 +56,25 @@ function recode!(dest::AbstractArray, src::AbstractArray, default::Any, pairs::P throw(DimensionMismatch("dest and src must be of the same length (got $(length(dest)) and $(length(src)))")) end - opt_pairs = map(optimize_pair, pairs) + opt_pairs = optimize_pair.(pairs) - _recode!(dest, src, default, opt_pairs...) + if dest isa CategoricalArray && src isa CategoricalArray + # in this case, we don't need to do much for type stability + _recode!(dest, src, default, opt_pairs...) + else + # in these cases, this is only type stable if we pass the pairs as tuples + _recode!(dest, src, default, first.(opt_pairs), last.(opt_pairs)) + end end -function _recode!(dest::AbstractArray{T}, src::AbstractArray, default::Any, pairs::Pair...) where {T} +function _recode!(dest::AbstractArray{T}, src::AbstractArray, default, recode_from::Tuple, recode_to::Tuple) where {T} @inbounds for i in eachindex(dest, src) x = src[i] - for p in pairs - # we use isequal and recode_in because we cannot really distinguish scalars from collections - if recode_in(x, p.first) - dest[i] = p.second - @goto nextitem - end + j = findfirst(y -> isequal(x, y) || recode_in(x,y), recode_from) + if !isnothing(j) + dest[i] = recode_to[j] + @goto nextitem end # Value not in any of the pairs @@ -101,9 +102,9 @@ function _recode!(dest::AbstractArray{T}, src::AbstractArray, default::Any, pair dest end -function _recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pairs::Pair...) where {T} - vals = T[p.second for p in pairs] - default !== nothing && push!(vals, default) +function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, default::Any, recode_from::Tuple, recode_to::Tuple) where {T, R} + vals = convert.(T, recode_to) + vals = default === nothing ? vals : (vals..., default) levels!(dest.pool, filter!(!ismissing, unique(vals))) # In the absence of duplicated recoded values, we do not need to lookup the reference @@ -111,19 +112,16 @@ function _recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, p dupvals = length(vals) != length(levels(dest.pool)) drefs = dest.refs - pairmap = [ismissing(v) ? 0 : get(dest.pool, v) for v in vals] - defaultref = default === nothing || ismissing(default) ? 0 : get(dest.pool, default) + pairmap = [ismissing(v) ? zero(R) : get(dest.pool, v) for v in vals] + defaultref = default === nothing ? nothing : ismissing(default) ? 0 : get(dest.pool, default) @inbounds for i in eachindex(drefs, src) x = src[i] - for j in eachindex(pairs) - p = pairs[j] - # we use isequal and recode_in because we cannot really distinguish scalars from collections - if recode_in(x, p.first) - drefs[i] = dupvals ? pairmap[j] : j - @goto nextitem - end + j = findfirst(y -> isequal(x, y) || recode_in(x,y), recode_from) + if !isnothing(j) + drefs[i] = dupvals ? pairmap[j] : j + @goto nextitem end # Value not in any of the pairs @@ -228,7 +226,7 @@ function _recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray, @inbounds for (i, l) in enumerate(srclevels) for j in 1:length(pairs) p = pairs[j] - if recode_in(l, p.first) + if l ≅ p.first ||recode_in(l, p.first) levelsmap[i+1] = pairmap[j] @goto nextitem end From 06b9265fba770033dc539adcb352e54e57c7c591 Mon Sep 17 00:00:00 2001 From: tiemvanderdeure Date: Fri, 22 Nov 2024 14:55:47 +0100 Subject: [PATCH 06/17] recode_in is always false for identical types --- src/recode.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/recode.jl b/src/recode.jl index 1b6aeae9..86817d59 100644 --- a/src/recode.jl +++ b/src/recode.jl @@ -47,6 +47,8 @@ A user defined type could override this method to define an appropriate test fun """ @inline recode_in(x, collection) = any(x ≅ y for y in collection) @inline recode_in(x, ::Missing) = false +@inline recode_in(::T, ::T) where T = false +@inline recode_in(::Missing, ::Missing) where T = false optimize_pair(pair::Pair) = pair optimize_pair(pair::Pair{<:AbstractArray}) = Set(pair.first) => pair.second From 74123a6ca69c0939bcb207306105197e7054b5a5 Mon Sep 17 00:00:00 2001 From: tiemvanderdeure Date: Thu, 28 Nov 2024 11:20:59 +0100 Subject: [PATCH 07/17] revert changes to recode_in, inline findfirst --- src/recode.jl | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/src/recode.jl b/src/recode.jl index 86817d59..15d8506e 100644 --- a/src/recode.jl +++ b/src/recode.jl @@ -45,10 +45,9 @@ The default method is to test if any element in the `collection` `isequal` to `x`. For `Set`s `in` is used as it is faster than the default method and equivalent to it. A user defined type could override this method to define an appropriate test function. """ -@inline recode_in(x, collection) = any(x ≅ y for y in collection) @inline recode_in(x, ::Missing) = false -@inline recode_in(::T, ::T) where T = false -@inline recode_in(::Missing, ::Missing) where T = false +@inline recode_in(x, collection::Set) = x in collection +@inline recode_in(x, collection) = any(x ≅ y for y in collection) optimize_pair(pair::Pair) = pair optimize_pair(pair::Pair{<:AbstractArray}) = Set(pair.first) => pair.second @@ -60,20 +59,17 @@ function recode!(dest::AbstractArray, src::AbstractArray, default::Any, pairs::P opt_pairs = optimize_pair.(pairs) - if dest isa CategoricalArray && src isa CategoricalArray - # in this case, we don't need to do much for type stability - _recode!(dest, src, default, opt_pairs...) - else - # in these cases, this is only type stable if we pass the pairs as tuples - _recode!(dest, src, default, first.(opt_pairs), last.(opt_pairs)) - end + _recode!(dest, src, default, opt_pairs...) end -function _recode!(dest::AbstractArray{T}, src::AbstractArray, default, recode_from::Tuple, recode_to::Tuple) where {T} +function _recode!(dest::AbstractArray{T}, src::AbstractArray, default, pairs...) where {T} + recode_to = last.(pairs) + recode_from = first.(pairs) + @inbounds for i in eachindex(dest, src) x = src[i] - j = findfirst(y -> isequal(x, y) || recode_in(x,y), recode_from) + j = @inline findfirst(y -> isequal(x, y) || recode_in(x,y), recode_from) if !isnothing(j) dest[i] = recode_to[j] @goto nextitem @@ -104,7 +100,10 @@ function _recode!(dest::AbstractArray{T}, src::AbstractArray, default, recode_fr dest end -function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, default::Any, recode_from::Tuple, recode_to::Tuple) where {T, R} +function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, default::Any, pairs...) where {T, R} + recode_to = last.(pairs) + recode_from = first.(pairs) + vals = convert.(T, recode_to) vals = default === nothing ? vals : (vals..., default) @@ -120,7 +119,7 @@ function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, defau @inbounds for i in eachindex(drefs, src) x = src[i] - j = findfirst(y -> isequal(x, y) || recode_in(x,y), recode_from) + j = @inline findfirst(y -> isequal(x, y) || recode_in(x,y), recode_from) if !isnothing(j) drefs[i] = dupvals ? pairmap[j] : j @goto nextitem From 15a09dbb40daaa4300388100cd568f534672c1f0 Mon Sep 17 00:00:00 2001 From: tiemvanderdeure Date: Thu, 28 Nov 2024 11:33:57 +0100 Subject: [PATCH 08/17] do not splat pairs passed to `_recode!` --- src/recode.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/recode.jl b/src/recode.jl index 15d8506e..0e0e113a 100644 --- a/src/recode.jl +++ b/src/recode.jl @@ -59,10 +59,10 @@ function recode!(dest::AbstractArray, src::AbstractArray, default::Any, pairs::P opt_pairs = optimize_pair.(pairs) - _recode!(dest, src, default, opt_pairs...) + _recode!(dest, src, default, opt_pairs) end -function _recode!(dest::AbstractArray{T}, src::AbstractArray, default, pairs...) where {T} +function _recode!(dest::AbstractArray{T}, src::AbstractArray, default, pairs) where {T} recode_to = last.(pairs) recode_from = first.(pairs) @@ -100,7 +100,7 @@ function _recode!(dest::AbstractArray{T}, src::AbstractArray, default, pairs...) dest end -function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, default::Any, pairs...) where {T, R} +function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, default::Any, pairs) where {T, R} recode_to = last.(pairs) recode_from = first.(pairs) @@ -168,7 +168,7 @@ function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, defau end function _recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray, - default::Any, pairs::Pair...) where {T, N, R<:Integer} + default::Any, pairs::Tuple) where {T, N, R<:Integer} vals = T[p.second for p in pairs] if default === nothing srclevels = levels(src) From c419cf5afca6c2fab7e38d07d400e4ed4e518c43 Mon Sep 17 00:00:00 2001 From: tiemvanderdeure Date: Thu, 28 Nov 2024 11:44:54 +0100 Subject: [PATCH 09/17] require julia 1.6 --- .github/workflows/ci.yml | 2 +- Project.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index aaeda107..71e6a6d2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,7 +12,7 @@ jobs: fail-fast: false matrix: version: - - '1.0' + - '1.6' - '1' # automatically expands to the latest stable 1.x release of Julia - 'nightly' os: diff --git a/Project.toml b/Project.toml index 5846e340..0e4b8c1d 100644 --- a/Project.toml +++ b/Project.toml @@ -32,7 +32,7 @@ RecipesBase = "1.1" Requires = "1" SentinelArrays = "1" StructTypes = "1" -julia = "1" +julia = "1.6" [extras] Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" From 8e2db47610ad38bcf170b0db3fd8251746630074 Mon Sep 17 00:00:00 2001 From: tiemvanderdeure Date: Thu, 28 Nov 2024 12:02:44 +0100 Subject: [PATCH 10/17] Revert "require julia 1.6" --- .github/workflows/ci.yml | 2 +- Project.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 71e6a6d2..aaeda107 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,7 +12,7 @@ jobs: fail-fast: false matrix: version: - - '1.6' + - '1.0' - '1' # automatically expands to the latest stable 1.x release of Julia - 'nightly' os: diff --git a/Project.toml b/Project.toml index 0e4b8c1d..5846e340 100644 --- a/Project.toml +++ b/Project.toml @@ -32,7 +32,7 @@ RecipesBase = "1.1" Requires = "1" SentinelArrays = "1" StructTypes = "1" -julia = "1.6" +julia = "1" [extras] Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" From 2a446daecce8840ce9db2d08d430c96a4a0194d9 Mon Sep 17 00:00:00 2001 From: tiemvanderdeure Date: Thu, 28 Nov 2024 12:04:17 +0100 Subject: [PATCH 11/17] define inlined findfirstrecode --- src/recode.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/recode.jl b/src/recode.jl index 0e0e113a..ce788430 100644 --- a/src/recode.jl +++ b/src/recode.jl @@ -49,6 +49,8 @@ A user defined type could override this method to define an appropriate test fun @inline recode_in(x, collection::Set) = x in collection @inline recode_in(x, collection) = any(x ≅ y for y in collection) +@inline findfirstrecode(x, recode_from) = findfirst(y -> isequal(x, y) || recode_in(x,y), recode_from) + optimize_pair(pair::Pair) = pair optimize_pair(pair::Pair{<:AbstractArray}) = Set(pair.first) => pair.second @@ -69,7 +71,7 @@ function _recode!(dest::AbstractArray{T}, src::AbstractArray, default, pairs) wh @inbounds for i in eachindex(dest, src) x = src[i] - j = @inline findfirst(y -> isequal(x, y) || recode_in(x,y), recode_from) + j = findfirstrecode(x, recode_from) if !isnothing(j) dest[i] = recode_to[j] @goto nextitem @@ -119,7 +121,7 @@ function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, defau @inbounds for i in eachindex(drefs, src) x = src[i] - j = @inline findfirst(y -> isequal(x, y) || recode_in(x,y), recode_from) + j = findfirstrecode(x, recode_from) if !isnothing(j) drefs[i] = dupvals ? pairmap[j] : j @goto nextitem From 445469e2bc9bc1e963c99e36142ede1b0fd45fbd Mon Sep 17 00:00:00 2001 From: tiemvanderdeure Date: Mon, 30 Dec 2024 13:16:35 +0100 Subject: [PATCH 12/17] use `Compat.@inline` --- Project.toml | 2 ++ src/recode.jl | 7 +++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Project.toml b/Project.toml index 5846e340..b14f7c8d 100644 --- a/Project.toml +++ b/Project.toml @@ -3,6 +3,7 @@ uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597" version = "0.10.8" [deps] +Compat = "34da2185-b29b-5c13-b0c7-acf172513d20" DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" Future = "9fa8497b-333b-5362-9e8d-4d0656e87820" Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" @@ -24,6 +25,7 @@ CategoricalArraysSentinelArraysExt = "SentinelArrays" CategoricalArraysStructTypesExt = "StructTypes" [compat] +Compat = "3.37" DataAPI = "1.6" JSON = "0.15, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21" JSON3 = "1.1.2" diff --git a/src/recode.jl b/src/recode.jl index ce788430..ac0a9669 100644 --- a/src/recode.jl +++ b/src/recode.jl @@ -1,3 +1,4 @@ +import Compat const ≅ = isequal """ @@ -49,8 +50,6 @@ A user defined type could override this method to define an appropriate test fun @inline recode_in(x, collection::Set) = x in collection @inline recode_in(x, collection) = any(x ≅ y for y in collection) -@inline findfirstrecode(x, recode_from) = findfirst(y -> isequal(x, y) || recode_in(x,y), recode_from) - optimize_pair(pair::Pair) = pair optimize_pair(pair::Pair{<:AbstractArray}) = Set(pair.first) => pair.second @@ -71,7 +70,7 @@ function _recode!(dest::AbstractArray{T}, src::AbstractArray, default, pairs) wh @inbounds for i in eachindex(dest, src) x = src[i] - j = findfirstrecode(x, recode_from) + j = Compat.@inline findfirst(y -> isequal(x, y) || recode_in(x,y), recode_from) if !isnothing(j) dest[i] = recode_to[j] @goto nextitem @@ -121,7 +120,7 @@ function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, defau @inbounds for i in eachindex(drefs, src) x = src[i] - j = findfirstrecode(x, recode_from) + j = Compat.@inline findfirst(y -> isequal(x, y) || recode_in(x,y), recode_from) if !isnothing(j) drefs[i] = dupvals ? pairmap[j] : j @goto nextitem From 91de2033ba37b58b58afef6da83e194f022e2b32 Mon Sep 17 00:00:00 2001 From: tiemvanderdeure Date: Tue, 31 Dec 2024 10:44:24 +0100 Subject: [PATCH 13/17] avoid `isnothing` --- src/recode.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/recode.jl b/src/recode.jl index ac0a9669..4d4b7461 100644 --- a/src/recode.jl +++ b/src/recode.jl @@ -71,7 +71,7 @@ function _recode!(dest::AbstractArray{T}, src::AbstractArray, default, pairs) wh x = src[i] j = Compat.@inline findfirst(y -> isequal(x, y) || recode_in(x,y), recode_from) - if !isnothing(j) + if j !== nothing dest[i] = recode_to[j] @goto nextitem end @@ -121,7 +121,7 @@ function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, defau x = src[i] j = Compat.@inline findfirst(y -> isequal(x, y) || recode_in(x,y), recode_from) - if !isnothing(j) + if j !== nothing drefs[i] = dupvals ? pairmap[j] : j @goto nextitem end From 539a3ba912e6de5d81408b3145d437ed025d6bf8 Mon Sep 17 00:00:00 2001 From: tiemvanderdeure Date: Thu, 2 Jan 2025 14:06:25 +0100 Subject: [PATCH 14/17] allow compat v4 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index b14f7c8d..8a2500bb 100644 --- a/Project.toml +++ b/Project.toml @@ -25,7 +25,7 @@ CategoricalArraysSentinelArraysExt = "SentinelArrays" CategoricalArraysStructTypesExt = "StructTypes" [compat] -Compat = "3.37" +Compat = "3.37, 4" DataAPI = "1.6" JSON = "0.15, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21" JSON3 = "1.1.2" From 25af087d3c0663b93cbe94aa601ab844010016fe Mon Sep 17 00:00:00 2001 From: tiemvanderdeure Date: Thu, 2 Jan 2025 14:10:58 +0100 Subject: [PATCH 15/17] various minor changes --- src/CategoricalArrays.jl | 1 + src/recode.jl | 47 +++++++++++++++++++--------------------- 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/src/CategoricalArrays.jl b/src/CategoricalArrays.jl index 214a5d17..a28cba94 100644 --- a/src/CategoricalArrays.jl +++ b/src/CategoricalArrays.jl @@ -14,6 +14,7 @@ module CategoricalArrays using DataAPI using Missings using Printf + import Compat # JuliaLang/julia#36810 if VERSION < v"1.5.2" diff --git a/src/recode.jl b/src/recode.jl index 4d4b7461..225cbd8c 100644 --- a/src/recode.jl +++ b/src/recode.jl @@ -63,21 +63,23 @@ function recode!(dest::AbstractArray, src::AbstractArray, default::Any, pairs::P _recode!(dest, src, default, opt_pairs) end -function _recode!(dest::AbstractArray{T}, src::AbstractArray, default, pairs) where {T} +function _recode!(dest::AbstractArray{T}, src::AbstractArray, default, pairs::NTuple{<:Any, Pair}) where {T} recode_to = last.(pairs) recode_from = first.(pairs) @inbounds for i in eachindex(dest, src) x = src[i] + # @inline is needed for type stability and Compat for compatibility before julia v1.8 + # we use isequal and recode_in because we cannot really + # distinguish scalars from collections j = Compat.@inline findfirst(y -> isequal(x, y) || recode_in(x,y), recode_from) + + # Value in one of the pairs if j !== nothing dest[i] = recode_to[j] - @goto nextitem - end - # Value not in any of the pairs - if ismissing(x) + elseif ismissing(x) eltype(dest) >: Missing || throw(MissingException("missing value found, but dest does not support them: " * "recode them to a supported value")) @@ -94,19 +96,17 @@ function _recode!(dest::AbstractArray{T}, src::AbstractArray, default, pairs) wh else dest[i] = default end - - @label nextitem end dest end -function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, default::Any, pairs) where {T, R} - recode_to = last.(pairs) +function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, default::Any, + pairs::NTuple{<:Any, Pair}) where {T, R} recode_from = first.(pairs) + vals = T[p.second for p in pairs] - vals = convert.(T, recode_to) - vals = default === nothing ? vals : (vals..., default) + default !== nothing && push!(vals, default) levels!(dest.pool, filter!(!ismissing, unique(vals))) # In the absence of duplicated recoded values, we do not need to lookup the reference @@ -115,19 +115,17 @@ function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, defau drefs = dest.refs pairmap = [ismissing(v) ? zero(R) : get(dest.pool, v) for v in vals] - defaultref = default === nothing ? nothing : ismissing(default) ? 0 : get(dest.pool, default) + defaultref = default === nothing || ismissing(default) ? zero(R) : get(dest.pool, default) @inbounds for i in eachindex(drefs, src) x = src[i] - j = Compat.@inline findfirst(y -> isequal(x, y) || recode_in(x,y), recode_from) + # we use isequal and recode_in because we cannot really + # distinguish scalars from collections + j = Compat.@inline findfirst(y -> isequal(x, y) || recode_in(x, y), recode_from) if j !== nothing drefs[i] = dupvals ? pairmap[j] : j - @goto nextitem - end - - # Value not in any of the pairs - if ismissing(x) + elseif ismissing(x) eltype(dest) >: Missing || throw(MissingException("missing value found, but dest does not support them: " * "recode them to a supported value")) @@ -144,8 +142,6 @@ function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, defau else drefs[i] = defaultref end - - @label nextitem end # Put existing levels first, and sort them if possible @@ -169,19 +165,20 @@ function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, defau end function _recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray, - default::Any, pairs::Tuple) where {T, N, R<:Integer} + default::Any, pairs::NTuple{<:Any, Pair}) where {T, N, R<:Integer} + recode_from = first.(pairs) vals = T[p.second for p in pairs] + if default === nothing srclevels = levels(src) # Remove recoded levels as they won't appear in result - firsts = (p.first for p in pairs) keptlevels = Vector{T}(undef, 0) sizehint!(keptlevels, length(srclevels)) for l in srclevels - if !(any(x -> x ≅ l, firsts) || - any(f -> recode_in(l, f), firsts)) + if !(any(x -> x ≅ l, recode_from) || + any(f -> recode_in(l, f), recode_from)) try push!(keptlevels, l) catch err @@ -228,7 +225,7 @@ function _recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray, @inbounds for (i, l) in enumerate(srclevels) for j in 1:length(pairs) p = pairs[j] - if l ≅ p.first ||recode_in(l, p.first) + if l ≅ p.first || recode_in(l, p.first) levelsmap[i+1] = pairmap[j] @goto nextitem end From b08a4ba91bb3a91ba875486e673d13f6552cdd19 Mon Sep 17 00:00:00 2001 From: Tiem van der Deure Date: Fri, 3 Jan 2025 11:46:24 +0100 Subject: [PATCH 16/17] drop second import Compat Co-authored-by: Milan Bouchet-Valat --- src/recode.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/recode.jl b/src/recode.jl index 225cbd8c..9341536c 100644 --- a/src/recode.jl +++ b/src/recode.jl @@ -1,4 +1,3 @@ -import Compat const ≅ = isequal """ From 04695faef233f7cd6c66955ff3b8fca5ef96180e Mon Sep 17 00:00:00 2001 From: tiemvanderdeure Date: Fri, 3 Jan 2025 11:52:10 +0100 Subject: [PATCH 17/17] whitespace and comments --- src/recode.jl | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/recode.jl b/src/recode.jl index 9341536c..141f9967 100644 --- a/src/recode.jl +++ b/src/recode.jl @@ -62,7 +62,8 @@ function recode!(dest::AbstractArray, src::AbstractArray, default::Any, pairs::P _recode!(dest, src, default, opt_pairs) end -function _recode!(dest::AbstractArray{T}, src::AbstractArray, default, pairs::NTuple{<:Any, Pair}) where {T} +function _recode!(dest::AbstractArray{T}, src::AbstractArray, default, + pairs::NTuple{<:Any, Pair}) where {T} recode_to = last.(pairs) recode_from = first.(pairs) @@ -101,7 +102,7 @@ function _recode!(dest::AbstractArray{T}, src::AbstractArray, default, pairs::NT end function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, default::Any, - pairs::NTuple{<:Any, Pair}) where {T, R} + pairs::NTuple{<:Any, Pair}) where {T, R} recode_from = first.(pairs) vals = T[p.second for p in pairs] @@ -119,11 +120,15 @@ function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, defau @inbounds for i in eachindex(drefs, src) x = src[i] + # @inline is needed for type stability and Compat for compatibility before julia v1.8 # we use isequal and recode_in because we cannot really # distinguish scalars from collections j = Compat.@inline findfirst(y -> isequal(x, y) || recode_in(x, y), recode_from) + + # Value in one of the pairs if j !== nothing drefs[i] = dupvals ? pairmap[j] : j + # Value not in any of the pairs elseif ismissing(x) eltype(dest) >: Missing || throw(MissingException("missing value found, but dest does not support them: " * @@ -164,7 +169,7 @@ function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, defau end function _recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray, - default::Any, pairs::NTuple{<:Any, Pair}) where {T, N, R<:Integer} + default::Any, pairs::NTuple{<:Any, Pair}) where {T, N, R<:Integer} recode_from = first.(pairs) vals = T[p.second for p in pairs]