-
Notifications
You must be signed in to change notification settings - Fork 35
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
improve performance of recode! for array dest #355
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,6 +33,8 @@ recode!(dest::AbstractArray, src::AbstractArray, pairs::Pair...) = | |
# To fix ambiguity | ||
recode!(dest::CategoricalArray, src::AbstractArray, pairs::Pair...) = | ||
recode!(dest, src, nothing, pairs...) | ||
recode!(dest::AbstractArray, src::CategoricalArray, pairs::Pair...) = | ||
recode!(dest, src, nothing, pairs...) | ||
recode!(dest::CategoricalArray, src::CategoricalArray, pairs::Pair...) = | ||
recode!(dest, src, nothing, pairs...) | ||
|
||
|
@@ -52,22 +54,47 @@ A user defined type could override this method to define an appropriate test fun | |
optimize_pair(pair::Pair) = pair | ||
optimize_pair(pair::Pair{<:AbstractArray}) = Set(pair.first) => pair.second | ||
|
||
function missing_check(value) | ||
ismissing(value) && throw(MissingException("missing value found, but dest does not support them: " * | ||
"recode them to a supported value")) | ||
value | ||
end | ||
|
||
function recode!(dest::AbstractArray{T}, src::CategoricalArray, default::Any, pairs::Pair...) where {T} | ||
if length(dest) != length(src) | ||
throw(DimensionMismatch("dest and src must be of the same length (got $(length(dest)) and $(length(src)))")) | ||
end | ||
|
||
pairs = map(pairs) do p | ||
p.first => convert(T, p.second) | ||
end | ||
recoded = recode(src, default, pairs...) | ||
if T >: Missing | ||
dest .= unwrap.(recoded) | ||
else | ||
dest .= missing_check.(unwrap.(recoded)) | ||
end | ||
dest | ||
end | ||
|
||
function recode!(dest::AbstractArray{T}, src::AbstractArray, default::Any, pairs::Pair...) where {T} | ||
if length(dest) != length(src) | ||
throw(DimensionMismatch("dest and src must be of the same length (got $(length(dest)) and $(length(src)))")) | ||
end | ||
|
||
opt_pairs = map(optimize_pair, pairs) | ||
|
||
@inbounds for i in eachindex(dest, src) | ||
x = src[i] | ||
map!(dest, src) do x | ||
|
||
for j in 1:length(opt_pairs) | ||
p = opt_pairs[j] | ||
# we use isequal and recode_in because we cannot really distinguish scalars from collections | ||
if x ≅ p.first || recode_in(x, p.first) | ||
dest[i] = p.second | ||
@goto nextitem | ||
# we use isequal and recode_in because we cannot really distinguish scalars from collections | ||
for p in opt_pairs | ||
if x ≅ p.first | ||
return p.second | ||
end | ||
end | ||
for p in opt_pairs | ||
if recode_in(x, p.first) | ||
return p.second | ||
end | ||
Comment on lines
+90
to
98
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This could change the behavior in case of overlap between pairs. Why did you change this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since this was a while a go I'll need to take some time with it and rerun my (micro-)benchmarks to be sure but unless my memory fails me There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK. Maybe better do this in a separate PR since it's a bit more tricky. |
||
end | ||
|
||
|
@@ -76,21 +103,19 @@ function recode!(dest::AbstractArray{T}, src::AbstractArray, default::Any, pairs | |
eltype(dest) >: Missing || | ||
throw(MissingException("missing value found, but dest does not support them: " * | ||
"recode them to a supported value")) | ||
dest[i] = missing | ||
return missing | ||
elseif default isa Nothing | ||
try | ||
dest[i] = x | ||
return convert(T, x) | ||
catch err | ||
isa(err, MethodError) || rethrow(err) | ||
throw(ArgumentError("cannot `convert` value $(repr(x)) (of type $(typeof(x))) to type of recoded levels ($T). " * | ||
"This will happen with recode() when not all original levels are recoded " * | ||
"(i.e. some are preserved) and their type is incompatible with that of recoded levels.")) | ||
end | ||
else | ||
dest[i] = default | ||
return default | ||
end | ||
|
||
@label nextitem | ||
nalimilan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
end | ||
|
||
dest | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Rather than doing this, to avoid making a copy and two passes over the data, we should call
recode
onlevels(src)
, and then do something like:The actual implementation needs to be a bit more complex so that the first entry in
newlevels
ismissing
(to handle the case whensrc.refs
is 0).There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
By
copy
do you mean copy of thesrc.levels
? In this implementation no copy of the actual array (orrefs
) is made which is the main reason why it is so much faster (as outlined in my StackOverflow answer) is because all the actual copying of therefs
happens only once at the last linedest .= unwrap.(recoded)
therecoded
variable shares the refs withsrc
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
recode(src, default, pairs...)
allocates a new vector, right? That's relatively fast, but it's even better to avoid it.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You're right, I remembered the details wrong. In the StackOverflow example I did:
which is similar to what you're suggesting. However, in this PR we initialize the CategoricalArray that will be put in the
recoded
variable with something likeCategoricalArray{S, N, R}(undef, size(a))
so therefs
are not shared between src and recoded.EDIT: Like I noted on SO using
levels!
does not work in the general case