Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

make recode! type stable #407

Merged
merged 17 commits into from
Jan 3, 2025
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ CategoricalArraysSentinelArraysExt = "SentinelArrays"
CategoricalArraysStructTypesExt = "StructTypes"

[compat]
Compat = "3.37"
Compat = "3.37, 4"
DataAPI = "1.6"
JSON = "0.15, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21"
JSON3 = "1.1.2"
Expand Down
1 change: 1 addition & 0 deletions src/CategoricalArrays.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ module CategoricalArrays
using DataAPI
using Missings
using Printf
import Compat

# JuliaLang/julia#36810
if VERSION < v"1.5.2"
Expand Down
47 changes: 22 additions & 25 deletions src/recode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -63,21 +63,23 @@ function recode!(dest::AbstractArray, src::AbstractArray, default::Any, pairs::P
_recode!(dest, src, default, opt_pairs)
end

function _recode!(dest::AbstractArray{T}, src::AbstractArray, default, pairs) where {T}
function _recode!(dest::AbstractArray{T}, src::AbstractArray, default, pairs::NTuple{<:Any, Pair}) where {T}
tiemvanderdeure marked this conversation as resolved.
Show resolved Hide resolved
recode_to = last.(pairs)
recode_from = first.(pairs)

@inbounds for i in eachindex(dest, src)
x = src[i]

# @inline is needed for type stability; Compat provides it on Julia versions before v1.8
# we use isequal and recode_in because we cannot really
# distinguish scalars from collections
j = Compat.@inline findfirst(y -> isequal(x, y) || recode_in(x,y), recode_from)
tiemvanderdeure marked this conversation as resolved.
Show resolved Hide resolved

# Value in one of the pairs
if j !== nothing
dest[i] = recode_to[j]
@goto nextitem
end

# Value not in any of the pairs
if ismissing(x)
elseif ismissing(x)
eltype(dest) >: Missing ||
throw(MissingException("missing value found, but dest does not support them: " *
"recode them to a supported value"))
Expand All @@ -94,19 +96,17 @@ function _recode!(dest::AbstractArray{T}, src::AbstractArray, default, pairs) wh
else
dest[i] = default
end

@label nextitem
end

dest
end

function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, default::Any, pairs) where {T, R}
recode_to = last.(pairs)
function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, default::Any,
pairs::NTuple{<:Any, Pair}) where {T, R}
tiemvanderdeure marked this conversation as resolved.
Show resolved Hide resolved
recode_from = first.(pairs)
vals = T[p.second for p in pairs]
tiemvanderdeure marked this conversation as resolved.
Show resolved Hide resolved

vals = convert.(T, recode_to)
vals = default === nothing ? vals : (vals..., default)
default !== nothing && push!(vals, default)

levels!(dest.pool, filter!(!ismissing, unique(vals)))
# In the absence of duplicated recoded values, we do not need to look up the reference
Expand All @@ -115,19 +115,17 @@ function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, defau

drefs = dest.refs
pairmap = [ismissing(v) ? zero(R) : get(dest.pool, v) for v in vals]
defaultref = default === nothing ? nothing : ismissing(default) ? 0 : get(dest.pool, default)
defaultref = default === nothing || ismissing(default) ? zero(R) : get(dest.pool, default)

@inbounds for i in eachindex(drefs, src)
x = src[i]

j = Compat.@inline findfirst(y -> isequal(x, y) || recode_in(x,y), recode_from)
# we use isequal and recode_in because we cannot really
tiemvanderdeure marked this conversation as resolved.
Show resolved Hide resolved
# distinguish scalars from collections
j = Compat.@inline findfirst(y -> isequal(x, y) || recode_in(x, y), recode_from)
if j !== nothing
drefs[i] = dupvals ? pairmap[j] : j
@goto nextitem
end

# Value not in any of the pairs
if ismissing(x)
elseif ismissing(x)
eltype(dest) >: Missing ||
throw(MissingException("missing value found, but dest does not support them: " *
"recode them to a supported value"))
Expand All @@ -144,8 +142,6 @@ function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, defau
else
drefs[i] = defaultref
end

@label nextitem
end

# Put existing levels first, and sort them if possible
Expand All @@ -169,19 +165,20 @@ function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, defau
end

function _recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray,
default::Any, pairs::Tuple) where {T, N, R<:Integer}
default::Any, pairs::NTuple{<:Any, Pair}) where {T, N, R<:Integer}
recode_from = first.(pairs)
vals = T[p.second for p in pairs]

if default === nothing
srclevels = levels(src)
tiemvanderdeure marked this conversation as resolved.
Show resolved Hide resolved

# Remove recoded levels as they won't appear in result
firsts = (p.first for p in pairs)
keptlevels = Vector{T}(undef, 0)
sizehint!(keptlevels, length(srclevels))

for l in srclevels
if !(any(x -> x ≅ l, firsts) ||
any(f -> recode_in(l, f), firsts))
if !(any(x -> x ≅ l, recode_from) ||
any(f -> recode_in(l, f), recode_from))
try
push!(keptlevels, l)
catch err
Expand Down Expand Up @@ -228,7 +225,7 @@ function _recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray,
@inbounds for (i, l) in enumerate(srclevels)
for j in 1:length(pairs)
p = pairs[j]
if l ≅ p.first ||recode_in(l, p.first)
if l ≅ p.first || recode_in(l, p.first)
levelsmap[i+1] = pairmap[j]
@goto nextitem
end
Expand Down
Loading