Skip to content

Commit

Permalink
[GR-19220] Fix an issue where Regexp.union was improperly negotiating…
Browse files Browse the repository at this point in the history
… the result encoding (#3296)

PullRequest: truffleruby/4034
(cherry picked from commit 6642143)
  • Loading branch information
eregon committed Nov 22, 2023
1 parent d74667c commit 152a876
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 45 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ Bug fixes:

* Fix `rb_enc_left_char_head()` so it is not always `ArgumentError` (#3267, @eregon).
* Fix `IO.copy_stream` with a `Tempfile` destination (#3280, @eregon).
* Fix `Regexp.union` negotiating the wrong result encoding (#3287, @nirvdrum, @simonlevasseur).

# 23.1.0

Expand Down
51 changes: 37 additions & 14 deletions spec/ruby/core/regexp/union_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,27 @@
Regexp.union("\u00A9".encode("ISO-8859-1"), "a".encode("UTF-8")).encoding.should == Encoding::ISO_8859_1
end

it "returns ASCII-8BIT if the regexp encodings are ASCII-8BIT and at least one has non-ASCII characters" do
us_ascii_implicit, us_ascii_explicit, binary = /abc/, /[\x00-\x7f]/n, /[\x80-\xBF]/n
us_ascii_implicit.encoding.should == Encoding::US_ASCII
us_ascii_explicit.encoding.should == Encoding::US_ASCII
binary.encoding.should == Encoding::BINARY

Regexp.union(us_ascii_implicit, us_ascii_explicit, binary).encoding.should == Encoding::BINARY
Regexp.union(us_ascii_implicit, binary, us_ascii_explicit).encoding.should == Encoding::BINARY
Regexp.union(us_ascii_explicit, us_ascii_implicit, binary).encoding.should == Encoding::BINARY
Regexp.union(us_ascii_explicit, binary, us_ascii_implicit).encoding.should == Encoding::BINARY
Regexp.union(binary, us_ascii_implicit, us_ascii_explicit).encoding.should == Encoding::BINARY
Regexp.union(binary, us_ascii_explicit, us_ascii_implicit).encoding.should == Encoding::BINARY
end

it "return US-ASCII if all patterns are ASCII-only" do
Regexp.union(/abc/e, /def/e).encoding.should == Encoding::US_ASCII
Regexp.union(/abc/n, /def/n).encoding.should == Encoding::US_ASCII
Regexp.union(/abc/s, /def/s).encoding.should == Encoding::US_ASCII
Regexp.union(/abc/u, /def/u).encoding.should == Encoding::US_ASCII
end

it "returns a Regexp with UTF-8 if one part is UTF-8" do
Regexp.union(/probl[éeè]me/i, /help/i).encoding.should == Encoding::UTF_8
end
Expand All @@ -54,83 +75,83 @@
it "raises ArgumentError if the arguments include conflicting ASCII-incompatible Strings" do
-> {
Regexp.union("a".encode("UTF-16LE"), "b".encode("UTF-16BE"))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and UTF-16BE')
end

it "raises ArgumentError if the arguments include conflicting ASCII-incompatible Regexps" do
-> {
Regexp.union(Regexp.new("a".encode("UTF-16LE")),
Regexp.new("b".encode("UTF-16BE")))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and UTF-16BE')
end

it "raises ArgumentError if the arguments include conflicting fixed encoding Regexps" do
-> {
Regexp.union(Regexp.new("a".encode("UTF-8"), Regexp::FIXEDENCODING),
Regexp.new("b".encode("US-ASCII"), Regexp::FIXEDENCODING))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-8 and US-ASCII')
end

it "raises ArgumentError if the arguments include a fixed encoding Regexp and a String containing non-ASCII-compatible characters in a different encoding" do
-> {
Regexp.union(Regexp.new("a".encode("UTF-8"), Regexp::FIXEDENCODING),
"\u00A9".encode("ISO-8859-1"))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-8 and ISO-8859-1')
end

it "raises ArgumentError if the arguments include a String containing non-ASCII-compatible characters and a fixed encoding Regexp in a different encoding" do
-> {
Regexp.union("\u00A9".encode("ISO-8859-1"),
Regexp.new("a".encode("UTF-8"), Regexp::FIXEDENCODING))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, 'incompatible encodings: ISO-8859-1 and UTF-8')
end

it "raises ArgumentError if the arguments include an ASCII-incompatible String and an ASCII-only String" do
-> {
Regexp.union("a".encode("UTF-16LE"), "b".encode("UTF-8"))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/)
end

it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and an ASCII-only String" do
-> {
Regexp.union(Regexp.new("a".encode("UTF-16LE")), "b".encode("UTF-8"))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/)
end

it "raises ArgumentError if the arguments include an ASCII-incompatible String and an ASCII-only Regexp" do
-> {
Regexp.union("a".encode("UTF-16LE"), Regexp.new("b".encode("UTF-8")))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/)
end

it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and an ASCII-only Regexp" do
-> {
Regexp.union(Regexp.new("a".encode("UTF-16LE")), Regexp.new("b".encode("UTF-8")))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/)
end

it "raises ArgumentError if the arguments include an ASCII-incompatible String and a String containing non-ASCII-compatible characters in a different encoding" do
-> {
Regexp.union("a".encode("UTF-16LE"), "\u00A9".encode("ISO-8859-1"))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1')
end

it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and a String containing non-ASCII-compatible characters in a different encoding" do
-> {
Regexp.union(Regexp.new("a".encode("UTF-16LE")), "\u00A9".encode("ISO-8859-1"))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1')
end

it "raises ArgumentError if the arguments include an ASCII-incompatible String and a Regexp containing non-ASCII-compatible characters in a different encoding" do
-> {
Regexp.union("a".encode("UTF-16LE"), Regexp.new("\u00A9".encode("ISO-8859-1")))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1')
end

it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and a Regexp containing non-ASCII-compatible characters in a different encoding" do
-> {
Regexp.union(Regexp.new("a".encode("UTF-16LE")), Regexp.new("\u00A9".encode("ISO-8859-1")))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1')
end

it "uses to_str to convert arguments (if not Regexp)" do
Expand All @@ -154,6 +175,8 @@
not_supported_on :opal do
Regexp.union([/dogs/, /cats/i]).should == /(?-mix:dogs)|(?i-mx:cats)/
end
->{Regexp.union(["skiing", "sledding"], [/dogs/, /cats/i])}.should raise_error(TypeError)
-> {
Regexp.union(["skiing", "sledding"], [/dogs/, /cats/i])
}.should raise_error(TypeError, 'no implicit conversion of Array into String')
end
end
65 changes: 34 additions & 31 deletions src/main/ruby/truffleruby/core/regexp.rb
Original file line number Diff line number Diff line change
Expand Up @@ -55,22 +55,27 @@ def self.try_convert(obj)
Truffle::Type.try_convert obj, Regexp, :to_regexp
end

def self.convert(pattern)
return pattern if Primitive.is_a?(pattern, Regexp)
if Primitive.is_a?(pattern, Array)
union(*pattern)
else
Regexp.quote(pattern.to_s)
end
end
def self.negotiate_union_encoding(*patterns)
compatible_enc = nil

patterns.each do |pattern|
converted = Primitive.is_a?(pattern, Regexp) ? pattern : Regexp.quote(pattern)

enc = converted.encoding

if Primitive.nil?(compatible_enc)
compatible_enc = enc
else
if test = Primitive.encoding_compatible?(enc, compatible_enc)
compatible_enc = test
else
raise ArgumentError, "incompatible encodings: #{compatible_enc} and #{enc}"
end

def self.compatible?(*patterns)
encodings = patterns.map { |r| convert(r).encoding }
last_enc = encodings.pop
encodings.each do |encoding|
raise ArgumentError, "incompatible encodings: #{encoding} and #{last_enc}" unless Primitive.encoding_compatible?(last_enc, encoding)
last_enc = encoding
end
end

compatible_enc
end

def self.last_match(index = nil)
Expand All @@ -96,37 +101,35 @@ def self.last_match(index = nil)
def self.union(*patterns)
case patterns.size
when 0
return %r/(?!)/
%r/(?!)/
when 1
pattern = patterns.first
case pattern
when Array
return union(*pattern)
union(*pattern)
else
converted = Truffle::Type.rb_check_convert_type(pattern, Regexp, :to_regexp)
if Primitive.nil? converted
return Regexp.new(Regexp.quote(pattern))
Regexp.new(Regexp.quote(pattern))
else
return converted
converted
end
end
else
compatible?(*patterns)
enc = convert(patterns.first).encoding
end
patterns = patterns.map do |pat|
if Primitive.is_a?(pat, Regexp)
pat
else
StringValue(pat)
end
end

sep = '|'.encode(enc)
str = ''.encode(enc)
enc = negotiate_union_encoding(*patterns)
sep = '|'.encode(enc)
str = ''.encode(enc)

patterns = patterns.map do |pat|
if Primitive.is_a?(pat, Regexp)
pat
else
StringValue(pat)
end
Truffle::RegexpOperations.union(str, sep, *patterns)
end

Truffle::RegexpOperations.union(str, sep, *patterns)
end
Truffle::Graal.always_split(method(:union))

Expand Down

0 comments on commit 152a876

Please sign in to comment.