Skip to content

Commit

Permalink
Make cut close last interval on the right (#409)
Browse files Browse the repository at this point in the history
This is much more useful, though slightly breaking.
  • Loading branch information
nalimilan authored Dec 27, 2024
1 parent d61d911 commit bbac8dc
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 29 deletions.
12 changes: 5 additions & 7 deletions src/extras.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ function fill_refs!(refs::AbstractArray, X::AbstractArray,

if ismissing(x)
refs[i] = 0
elseif extend === true && x == upper
elseif x == upper
refs[i] = n-1
elseif extend !== true && !(lower <= x < upper)
elseif extend !== true && !(lower <= x <= upper)
extend === missing ||
throw(ArgumentError("value $x (at index $i) does not fall inside the breaks: " *
"adapt them manually, or pass extend=true or extend=missing"))
Expand Down Expand Up @@ -41,17 +41,15 @@ Cut a numeric array into intervals at values `breaks`
and return an ordered `CategoricalArray` indicating
the interval into which each entry falls. Intervals are of the form `[lower, upper)`,
i.e. the lower bound is included and the upper bound is excluded, except
if `extend=true` the last interval, which is then closed on both ends,
i.e. `[lower, upper]`.
the last interval, which is closed on both ends, i.e. `[lower, upper]`.
If `x` accepts missing values (i.e. `eltype(x) >: Missing`) the returned array will
also accept them.
# Keyword arguments
* `extend::Union{Bool, Missing}=false`: when `false`, an error is raised if some values
in `x` fall outside of the breaks; when `true`, breaks are automatically added to include
all values in `x`, and the upper bound is included in the last interval; when `missing`,
values outside of the breaks generate `missing` entries.
all values in `x`; when `missing`, values outside of the breaks generate `missing` entries.
* `labels::Union{AbstractVector, Function}`: a vector of strings, characters
or numbers giving the names to use for
the intervals; or a function `f(from, to, i; leftclosed, rightclosed)` that generates
Expand Down Expand Up @@ -200,7 +198,7 @@ function _cut(x::AbstractArray{T, N}, breaks::AbstractVector,
end
levs[end] = labels(from[end], to[end], n-1,
leftclosed=breaks[end-1] != breaks[end],
rightclosed=coalesce(extend, false))
rightclosed=true)
else
length(labels) == n-1 ||
throw(ArgumentError("labels must be of length $(n-1), but got length $(length(labels))"))
Expand Down
52 changes: 31 additions & 21 deletions test/15_extras.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,27 +6,37 @@ const ≅ = isequal

@testset "cut($(Union{Int, T})[...])" for T in (Union{}, Missing)
x = @inferred cut(Vector{Union{Int, T}}([2, 3, 5]), [1, 3, 6])
@test x == ["[1, 3)", "[3, 6)", "[3, 6)"]
@test x == ["[1, 3)", "[3, 6]", "[3, 6]"]
@test isa(x, CategoricalVector{Union{String, T}})
@test isordered(x)
@test levels(x) == ["[1, 3)", "[3, 6)"]
@test levels(x) == ["[1, 3)", "[3, 6]"]

@test cut(Vector{Union{T, Int}}([2, 3, 5]), [2, 5], extend=false) ==
["[2, 5]", "[2, 5]", "[2, 5]"]

err = @test_throws ArgumentError cut(Vector{Union{T, Int}}([2, 3, 5]), [3, 6])
@test err.value.msg == "value 2 (at index 1) does not fall inside the breaks: adapt them manually, or pass extend=true or extend=missing"


err = @test_throws ArgumentError cut(Vector{Union{T, Int}}([2, 3, 5]), [2, 5])
@test err.value.msg == "value 5 (at index 3) does not fall inside the breaks: adapt them manually, or pass extend=true or extend=missing"

if T === Missing
x = @inferred cut(Vector{Union{T, Int}}([2, 3, 5]), [2, 5], extend=missing)
else
x = cut(Vector{Union{T, Int}}([2, 3, 5]), [2, 5], extend=missing)
end
@test x ≅ ["[2, 5)", "[2, 5)", missing]
@test x ≅ ["[2, 5]", "[2, 5]", "[2, 5]"]
@test isa(x, CategoricalVector{Union{String, Missing}})
@test isordered(x)
@test levels(x) == ["[2, 5)"]
@test levels(x) == ["[2, 5]"]

if T === Missing
x = @inferred cut(Vector{Union{T, Int}}([2, 3, 6]), [2, 5], extend=missing)
else
x = cut(Vector{Union{T, Int}}([2, 3, 6]), [2, 5], extend=missing)
end
@test x ≅ ["[2, 5]", "[2, 5]", missing]
@test isa(x, CategoricalVector{Union{String, Missing}})
@test isordered(x)
@test levels(x) == ["[2, 5]"]

x = @inferred cut(Vector{Union{T, Int}}([2, 3, 5]), [3, 6], extend=true)
@test x == ["[2, 3)", "[3, 6]", "[3, 6]"]
Expand All @@ -40,10 +50,10 @@ const ≅ = isequal
@test levels(x) == ["[2, 3)", "[3, 6]"]

x = @inferred cut(Vector{Union{T, Int}}([1, 2, 4]), [1, 3, 6])
@test x == ["[1, 3)", "[1, 3)", "[3, 6)"]
@test x == ["[1, 3)", "[1, 3)", "[3, 6]"]
@test isa(x, CategoricalVector{Union{String, T}})
@test isordered(x)
@test levels(x) == ["[1, 3)", "[3, 6)"]
@test levels(x) == ["[1, 3)", "[3, 6]"]

x = @inferred cut(Vector{Union{T, Int}}([1, 2, 4]), [3, 6], extend=true)
@test x == ["[1, 3)", "[1, 3)", "[3, 6]"]
Expand All @@ -67,10 +77,10 @@ const ≅ = isequal
breaks = [18, 25, 35, 60, 100]
x = @inferred cut(Vector{Union{T, Int}}(ages), breaks)
@test x == ["[18, 25)", "[18, 25)", "[25, 35)", "[25, 35)", "[18, 25)", "[18, 25)",
"[35, 60)", "[25, 35)", "[60, 100)", "[35, 60)", "[35, 60)", "[25, 35)"]
"[35, 60)", "[25, 35)", "[60, 100]", "[35, 60)", "[35, 60)", "[25, 35)"]
@test isa(x, CategoricalVector{Union{String, T}})
@test isordered(x)
@test levels(x) == ["[18, 25)", "[25, 35)", "[35, 60)", "[60, 100)"]
@test levels(x) == ["[18, 25)", "[25, 35)", "[35, 60)", "[60, 100]"]

breaks = [1, 6, 3] # Unsorted breaks
labels = ["b", "a"] # Differs from lexical ordering
Expand All @@ -83,10 +93,10 @@ const ≅ = isequal
@test levels(x) == ["b", "a"]

x = @inferred cut(Matrix{Union{Float64, T}}([-1.1 3.0; 1.456 10.394]), [-2.134, 3.0, 12.5])
@test x == ["[-2.134, 3.0)" "[3.0, 12.5)"; "[-2.134, 3.0)" "[3.0, 12.5)"]
@test x == ["[-2.134, 3.0)" "[3.0, 12.5]"; "[-2.134, 3.0)" "[3.0, 12.5]"]
@test isa(x, CategoricalMatrix{Union{String, T}})
@test isordered(x)
@test levels(x) == ["[-2.134, 3.0)", "[3.0, 12.5)"]
@test levels(x) == ["[-2.134, 3.0)", "[3.0, 12.5]"]

labels = 0:2:8
x = @inferred cut(Vector{Union{T, Int}}(1:8), 0:2:10, labels=labels)
Expand Down Expand Up @@ -179,7 +189,7 @@ end
@test_throws ArgumentError cut(1:10, [1, 5, 5, 11])
y = cut(1:10, [1, 5, 5, 11], allowempty=true)
@test y == cut(1:10, [1, 5, 11])
@test levels(y) == ["[1, 5)", "(5, 5)", "[5, 11)"]
@test levels(y) == ["[1, 5)", "(5, 5)", "[5, 11]"]

@test_throws ArgumentError cut(1:10, [1, 5, 5, 5, 11])
@test_throws ArgumentError cut(1:10, [1, 5, 5, 11],
Expand All @@ -191,29 +201,29 @@ end

@test_throws ArgumentError cut(1:10, [1, 5, 5, 11], labels=string.(1:3))
y = cut(1:10, [1, 5, 5, 11], allowempty=true, labels=string.(1:3))
@test y == recode(cut(1:10, [1, 5, 11]), "[1, 5)" => "1", "[5, 11)" => "3")
@test y == recode(cut(1:10, [1, 5, 11]), "[1, 5)" => "1", "[5, 11]" => "3")
@test levels(y) == string.(1:3)

@test_throws ArgumentError cut(1:10, [1, 5, 5, 5, 11], labels=string.(1:4))
y = cut(1:10, [1, 5, 5, 5, 11], allowempty=true, labels=string.(1:4))
@test y == recode(cut(1:10, [1, 5, 11]), "[1, 5)" => "1", "[5, 11)" => "4")
@test y == recode(cut(1:10, [1, 5, 11]), "[1, 5)" => "1", "[5, 11]" => "4")
@test levels(y) == string.(1:4)

@test_throws ArgumentError cut(1:10, [1, 5, 5, 5, 5, 11], labels=string.(1:5))
y = cut(1:10, [1, 5, 5, 5, 5, 11], allowempty=true, labels=string.(1:5))
@test y == recode(cut(1:10, [1, 5, 11]), "[1, 5)" => "1", "[5, 11)" => "5")
@test y == recode(cut(1:10, [1, 5, 11]), "[1, 5)" => "1", "[5, 11]" => "5")
@test levels(y) == string.(1:5)

@test_throws ArgumentError cut(1:10, [1, 3, 3, 5, 5, 11], labels=string.(1:5))
y = cut(1:10, [1, 3, 3, 5, 5, 11], allowempty=true, labels=string.(1:5))
@test y == recode(cut(1:10, [1, 3, 5, 11]),
"[1, 3)" => "1", "[3, 5)" => "3", "[5, 11)" => "5")
"[1, 3)" => "1", "[3, 5)" => "3", "[5, 11]" => "5")
@test levels(y) == string.(1:5)

@test_throws ArgumentError cut(1:10, [1, 3, 3, 3, 5, 5, 5, 11], labels=string.(1:7))
y = cut(1:10, [1, 3, 3, 3, 5, 5, 5, 11], allowempty=true, labels=string.(1:7))
@test y == recode(cut(1:10, [1, 3, 5, 11]),
"[1, 3)" => "1", "[3, 5)" => "4", "[5, 11)" => "7")
"[1, 3)" => "1", "[3, 5)" => "4", "[5, 11]" => "7")
@test levels(y) == string.(1:7)

@test_throws ArgumentError cut(1:10, [1, 3, 5, 5, 11],
Expand Down Expand Up @@ -255,9 +265,9 @@ end
end

@testset "cut with extend=missing" begin
x = @inferred cut([-0.0, 0.0, 1.0, 2.0, 3.0, 4.0], [-0.0, 0.0, 3.0],
x = @inferred cut([-0.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0], [-0.0, 0.0, 3.0],
labels=[-0.0, 0.0], extend=missing)
@test x ≅ [-0.0, 0.0, 0.0, 0.0, missing, missing]
@test x ≅ [-0.0, 0.0, 0.0, 0.0, 0.0, missing, missing]
@test x isa CategoricalArray{Union{Missing, Float64},1,UInt32}
@test isordered(x)
@test levels(x) == [-0.0, 0.0]
Expand Down
2 changes: 1 addition & 1 deletion test/17_deprecated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ const ≅ = isequal
@test x ≅ ["a", missing, missing]

x = cut([1, missing, 100], [1, 2], allow_missing=true)
@test x ≅ ["[1, 2)", missing, missing]
@test x ≅ ["[1, 2]", missing, missing]
end

end

0 comments on commit bbac8dc

Please sign in to comment.