From 85f64dd1ddb79a8502953b3717bede136bb88cd4 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Sat, 4 Nov 2023 20:43:11 -0400 Subject: [PATCH 01/11] initial port of Base.Unicode --- Project.toml | 6 ++- src/JuliaSyntax.jl | 1 + src/literal_parsing.jl | 43 +---------------- src/source_files.jl | 2 +- src/tokenize.jl | 27 +++-------- src/unicode.jl | 107 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 123 insertions(+), 63 deletions(-) create mode 100644 src/unicode.jl diff --git a/Project.toml b/Project.toml index 6ffbaa40..3dc96fae 100644 --- a/Project.toml +++ b/Project.toml @@ -3,10 +3,12 @@ uuid = "70703baa-626e-46a2-a12c-08ffd08c73b4" authors = ["Claire Foster and contributors"] version = "0.4.6" +[deps] +utf8proc_jll = "00992c89-a35c-5347-9984-e6609dacc59a" + [compat] julia = "1.0" - -[deps] +utf8proc_jll = "2.9" # for Unicode 15.1 [extras] Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" diff --git a/src/JuliaSyntax.jl b/src/JuliaSyntax.jl index 3f1ad27a..8a8f6140 100644 --- a/src/JuliaSyntax.jl +++ b/src/JuliaSyntax.jl @@ -20,6 +20,7 @@ export SyntaxNode # Helper utilities include("utils.jl") +include("unicode.jl") include("kinds.jl") diff --git a/src/literal_parsing.jl b/src/literal_parsing.jl index a027985a..81ab5b32 100644 --- a/src/literal_parsing.jl +++ b/src/literal_parsing.jl @@ -329,47 +329,9 @@ function unescape_julia_string(io::IO, txtbuf::Vector{UInt8}, end #------------------------------------------------------------------------------- -# Unicode normalization. As of Julia 1.8, this is part of Base and the Unicode -# stdlib under the name `Unicode.julia_chartransform`. See -# https://github.com/JuliaLang/julia/pull/42561 -# -# To allow use on older Julia versions and to workaround the bug -# https://github.com/JuliaLang/julia/issues/45716 -# we reproduce a specialized version of that logic here. - -# static wrapper around user callback function -function utf8proc_custom_func(codepoint::UInt32, ::Ptr{Cvoid})::UInt32 - (codepoint == 0x025B ? 0x03B5 : # 'ɛ' => 'ε' - codepoint == 0x00B5 ? 0x03BC : # 'µ' => 'μ' - codepoint == 0x00B7 ? 0x22C5 : # '·' => '⋅' - codepoint == 0x0387 ? 0x22C5 : # '·' => '⋅' - codepoint == 0x2212 ? 0x002D : # '−' (\minus) => '-' - codepoint == 0x210F ? 0x0127 : # 'ℏ' (\hslash) => 'ħ' \hbar - codepoint) -end - -function utf8proc_decompose(str, options, buffer, nwords) - ret = ccall(:utf8proc_decompose_custom, Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint, Ptr{Cvoid}, Ptr{Cvoid}), - str, sizeof(str), buffer, nwords, options, - @cfunction(utf8proc_custom_func, UInt32, (UInt32, Ptr{Cvoid})), C_NULL) - ret < 0 && Base.Unicode.utf8proc_error(ret) - return ret -end - -function utf8proc_map(str::Union{String,SubString{String}}, options::Integer) - nwords = utf8proc_decompose(str, options, C_NULL, 0) - buffer = Base.StringVector(nwords*4) - nwords = utf8proc_decompose(str, options, buffer, nwords) - nbytes = ccall(:utf8proc_reencode, Int, (Ptr{UInt8}, Int, Cint), buffer, nwords, options) - nbytes < 0 && Base.Unicode.utf8proc_error(nbytes) - return String(resize!(buffer, nbytes)) -end - -function normalize_identifier(str) - flags = Base.Unicode.UTF8PROC_STABLE | Base.Unicode.UTF8PROC_COMPOSE - return isascii(str) ? str : utf8proc_map(str, flags) -end +# Unicode normalization. +using .Unicode: normalize_identifier #------------------------------------------------------------------------------- function parse_julia_literal(txtbuf::Vector{UInt8}, head::SyntaxHead, srcrange) @@ -451,4 +413,3 @@ function parse_julia_literal(txtbuf::Vector{UInt8}, head::SyntaxHead, srcrange) ErrorVal() end end - diff --git a/src/source_files.jl b/src/source_files.jl index 06cae008..a0871347 100644 --- a/src/source_files.jl +++ b/src/source_files.jl @@ -145,7 +145,7 @@ function _print_marker_line(io, prefix_str, str, underline, singleline, color, # Getting exactly the same width of whitespace as `str` is tricky. # Especially for mixtures of tabs and spaces. # tabs are zero width according to textwidth - indent = join(isspace(c) ? c : repeat(' ', textwidth(c)) for c in prefix_str) + indent = join(Unicode.isspace(c) ? c : repeat(' ', textwidth(c)) for c in prefix_str) # Assume tabs are 4 wide rather than 0. (fixme: implement tab alignment?) w = textwidth(str) + 4*count(c->c=='\t', str) diff --git a/src/tokenize.jl b/src/tokenize.jl index 9c19c040..7db1366d 100644 --- a/src/tokenize.jl +++ b/src/tokenize.jl @@ -4,26 +4,15 @@ export tokenize, untokenize using ..JuliaSyntax: JuliaSyntax, Kind, @K_str, @KSet_str -import ..JuliaSyntax: kind, +import ..JuliaSyntax: kind, Unicode, is_literal, is_error, is_contextual_keyword, is_word_operator #------------------------------------------------------------------------------- # Character-based predicates for tokenization -import Base.Unicode const EOF_CHAR = typemax(Char) -function is_identifier_char(c::Char) - c == EOF_CHAR && return false - isvalid(c) || return false - return Base.is_id_char(c) -end - -function is_identifier_start_char(c::Char) - c == EOF_CHAR && return false - isvalid(c) || return false - return Base.is_id_start_char(c) -end +using .Unicode: is_identifier_char, is_identifier_start_char function is_invisible_char(c::Char) # These are the chars considered invisible by the reference parser. @@ -72,7 +61,7 @@ end readchar(io::IO) = eof(io) ? EOF_CHAR : read(io, Char) # Some unicode operators are normalized by the tokenizer into their equivalent -# kinds. See also normalize_identifier() +# kinds. See also Unicode.normalize_identifier() const _ops_with_unicode_aliases = [ # \minus '−' is normalized into K"-", '−' => K"-" @@ -137,10 +126,10 @@ end if (u < 0xa1 || u > 0x10ffff) return false end - cat = Base.Unicode.category_code(u) - if (cat == Base.Unicode.UTF8PROC_CATEGORY_MN || - cat == Base.Unicode.UTF8PROC_CATEGORY_MC || - cat == Base.Unicode.UTF8PROC_CATEGORY_ME) + cat = Unicode.category_code(u) + if (cat == Unicode.UTF8PROC_CATEGORY_MN || + cat == Unicode.UTF8PROC_CATEGORY_MC || + cat == Unicode.UTF8PROC_CATEGORY_ME) return true end # Additional allowed cases @@ -226,7 +215,7 @@ end @inline ishex(c::Char) = isdigit(c) || ('a' <= c <= 'f') || ('A' <= c <= 'F') @inline isbinary(c::Char) = c == '0' || c == '1' @inline isoctal(c::Char) = '0' ≤ c ≤ '7' -@inline iswhitespace(c::Char) = (isvalid(c) && Base.isspace(c)) || c === '\ufeff' +@inline iswhitespace(c::Char) = (isvalid(c) && Unicode.isspace(c)) || c === '\ufeff' struct StringState triplestr::Bool diff --git a/src/unicode.jl b/src/unicode.jl new file mode 100644 index 00000000..ec6f3806 --- /dev/null +++ b/src/unicode.jl @@ -0,0 +1,107 @@ +# this is a mirror of some of Base.Unicode and related functions, +# but using the utf8proc_jll version tied to JuliaSyntax, so that +# the supported Unicode version is the same across Julia versions. + +module Unicode + +export category_code, normalize_identifier, is_identifier_char, is_identifier_start_char + +using utf8proc_jll: libutf8proc + +# these constants have been stable across all utf8proc versions, +# so no need to redefine them: +import Base.Unicode: UTF8PROC_CATEGORY_CN, UTF8PROC_CATEGORY_LU, UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT, UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO, UTF8PROC_CATEGORY_MN, UTF8PROC_CATEGORY_MC, UTF8PROC_CATEGORY_ME, UTF8PROC_CATEGORY_ND, UTF8PROC_CATEGORY_NL, UTF8PROC_CATEGORY_NO, UTF8PROC_CATEGORY_PC, UTF8PROC_CATEGORY_PD, UTF8PROC_CATEGORY_PS, UTF8PROC_CATEGORY_PE, UTF8PROC_CATEGORY_PI, UTF8PROC_CATEGORY_PF, UTF8PROC_CATEGORY_PO, UTF8PROC_CATEGORY_SM, UTF8PROC_CATEGORY_SC, UTF8PROC_CATEGORY_SK, UTF8PROC_CATEGORY_SO, UTF8PROC_CATEGORY_ZS, UTF8PROC_CATEGORY_ZL, UTF8PROC_CATEGORY_ZP, UTF8PROC_CATEGORY_CC, UTF8PROC_CATEGORY_CF, UTF8PROC_CATEGORY_CS, UTF8PROC_CATEGORY_CO + +using Base: ismalformed + +##################################################################### +# functions copied almost as-is from Base.Unicode, with the only change +# being that they now ccall into utf8proc_jll. + +utf8proc_error(result) = error(unsafe_string(ccall((:utf8proc_errmsg,libutf8proc), Cstring, (Cssize_t,), result))) + +# Stateful grapheme break required by Unicode-9 rules: the string +# must be processed in sequence, with state initialized to Ref{Int32}(0). +# Requires utf8proc v2.0 or later. +function isgraphemebreak!(state::Ref{Int32}, c1::AbstractChar, c2::AbstractChar) + if ismalformed(c1) || ismalformed(c2) + state[] = 0 + return true + end + ccall((:utf8proc_grapheme_break_stateful,libutf8proc), Bool, + (UInt32, UInt32, Ref{Int32}), c1, c2, state) +end + +# returns UTF8PROC_CATEGORY code in 0:30 giving Unicode category +function category_code(c::AbstractChar) + !ismalformed(c) ? category_code(UInt32(c)) : Cint(31) +end + +function category_code(x::Integer) + x ≤ 0x10ffff ? ccall((:utf8proc_category,libutf8proc), Cint, (UInt32,), x) : Cint(30) +end + +@inline isspace(c::AbstractChar) = + c == ' ' || '\t' <= c <= '\r' || c == '\u85' || + '\ua0' <= c && category_code(c) == UTF8PROC_CATEGORY_ZS + +##################################################################### +# Julia identifier normalization, closely based on functions +# from Base.Unicode except that we hard-code the Julia +# chartransform (working around JuliaLang/julia#45716) + +# Julia's custom character normalization mapping, based on +# julia/src/flisp/julia_charmap.h: +function julia_custom_func(codepoint::UInt32, ::Ptr{Cvoid})::UInt32 + (codepoint < 0x007f ? codepoint : # optimize for ASCII common case + codepoint == 0x025B ? 0x03B5 : # 'ɛ' => 'ε' + codepoint == 0x00B5 ? 0x03BC : # 'µ' => 'μ' + codepoint == 0x00B7 ? 0x22C5 : # '·' => '⋅' + codepoint == 0x0387 ? 0x22C5 : # '·' => '⋅' + codepoint == 0x2212 ? 0x002D : # '−' (\minus) => '-' + codepoint == 0x210F ? 0x0127 : # 'ℏ' (\hslash) => 'ħ' \hbar + codepoint) +end + +function utf8proc_decompose_julia(str, options, buffer, nwords) + ret = ccall((:utf8proc_decompose_custom,libutf8proc), Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint, Ptr{Cvoid}, Ptr{Cvoid}), + str, sizeof(str), buffer, nwords, options, + @cfunction(julia_custom_func, UInt32, (UInt32, Ptr{Cvoid})), C_NULL) + ret < 0 && utf8proc_error(ret) + return ret +end + +function utf8proc_map_julia(str::Union{String,SubString{String}}, options::Integer, chartransform=identity) + nwords = utf8proc_decompose_julia(str, options, C_NULL, 0) + buffer = Base.StringVector(nwords*4) + nwords = utf8proc_decompose_julia(str, options, buffer, nwords) + nbytes = ccall((:utf8proc_reencode,libutf8proc), Int, (Ptr{UInt8}, Int, Cint), buffer, nwords, options) + nbytes < 0 && utf8proc_error(nbytes) + return String(resize!(buffer, nbytes)) +end + +function normalize_identifier(str) + # note that the values of UTF8PROC_x constants have not changed + # over many utf8proc versions, so we can use them from Base.Unicode + flags = Base.Unicode.UTF8PROC_STABLE | Base.Unicode.UTF8PROC_COMPOSE + return isascii(str) ? str : utf8proc_map_julia(str, flags) +end + +##################################################################### +# Julia identifier parsing predicates + +function is_identifier_char(c::Char) + # c == EOF_CHAR && return false # covered by isvalid check + isvalid(c) || return false + return Base.is_id_char(c) +end + +function is_identifier_start_char(c::Char) + # c == EOF_CHAR && return false # covered by isvalid check + isvalid(c) || return false + return Base.is_id_start_char(c) +end + +##################################################################### + +end From 22e3c6af7886e53a53ddbf315c31fcb9e9a3b3c4 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Sat, 4 Nov 2023 21:03:52 -0400 Subject: [PATCH 02/11] port Base.is_id_char and Base.is_id_start_char --- src/unicode.jl | 102 +++++++++++++++++++++++++++++++++++++++++++---- test/tokenize.jl | 2 - 2 files changed, 94 insertions(+), 10 deletions(-) diff --git a/src/unicode.jl b/src/unicode.jl index ec6f3806..677c84e5 100644 --- a/src/unicode.jl +++ b/src/unicode.jl @@ -37,8 +37,11 @@ function category_code(c::AbstractChar) !ismalformed(c) ? category_code(UInt32(c)) : Cint(31) end +# doesn't check validity of x: +@inline _category_code(x::Integer) = ccall((:utf8proc_category,libutf8proc), Cint, (UInt32,), x) + function category_code(x::Integer) - x ≤ 0x10ffff ? ccall((:utf8proc_category,libutf8proc), Cint, (UInt32,), x) : Cint(30) + x ≤ 0x10ffff ? _category_code(x) : Cint(30) end @inline isspace(c::AbstractChar) = @@ -90,16 +93,99 @@ end ##################################################################### # Julia identifier parsing predicates -function is_identifier_char(c::Char) - # c == EOF_CHAR && return false # covered by isvalid check - isvalid(c) || return false - return Base.is_id_char(c) +# port of is_wc_cat_id_start from julia/src/flisp/julia_extensions.c +function _is_identifier_start_char(c::UInt32, cat::Integer) + return (cat == UTF8PROC_CATEGORY_LU || cat == UTF8PROC_CATEGORY_LL || + cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LM || + cat == UTF8PROC_CATEGORY_LO || cat == UTF8PROC_CATEGORY_NL || + cat == UTF8PROC_CATEGORY_SC || # allow currency symbols + # other symbols, but not arrows or replacement characters + (cat == UTF8PROC_CATEGORY_SO && !(c >= 0x2190 && c <= 0x21FF) && + c != 0xfffc && c != 0xfffd && + c != 0x233f && # notslash + c != 0x00a6) || # broken bar + + # math symbol (category Sm) whitelist + (c >= 0x2140 && c <= 0x2a1c && + ((c >= 0x2140 && c <= 0x2144) || # ⅀, ⅁, ⅂, ⅃, ⅄ + c == 0x223f || c == 0x22be || c == 0x22bf || # ∿, ⊾, ⊿ + c == 0x22a4 || c == 0x22a5 || # ⊤ ⊥ + + (c >= 0x2200 && c <= 0x2233 && + (c == 0x2202 || c == 0x2205 || c == 0x2206 || # ∂, ∅, ∆ + c == 0x2207 || c == 0x220e || c == 0x220f || # ∇, ∎, ∏ + c == 0x2200 || c == 0x2203 || c == 0x2204 || # ∀, ∃, ∄ + c == 0x2210 || c == 0x2211 || # ∐, ∑ + c == 0x221e || c == 0x221f || # ∞, ∟ + c >= 0x222b)) || # ∫, ∬, ∭, ∮, ∯, ∰, ∱, ∲, ∳ + + (c >= 0x22c0 && c <= 0x22c3) || # N-ary big ops: ⋀, ⋁, ⋂, ⋃ + (c >= 0x25F8 && c <= 0x25ff) || # ◸, ◹, ◺, ◻, ◼, ◽, ◾, ◿ + + (c >= 0x266f && + (c == 0x266f || c == 0x27d8 || c == 0x27d9 || # ♯, ⟘, ⟙ + (c >= 0x27c0 && c <= 0x27c1) || # ⟀, ⟁ + (c >= 0x29b0 && c <= 0x29b4) || # ⦰, ⦱, ⦲, ⦳, ⦴ + (c >= 0x2a00 && c <= 0x2a06) || # ⨀, ⨁, ⨂, ⨃, ⨄, ⨅, ⨆ + (c >= 0x2a09 && c <= 0x2a16) || # ⨉, ⨊, ⨋, ⨌, ⨍, ⨎, ⨏, ⨐, ⨑, ⨒, ⨓, ⨔, ⨕, ⨖ + c == 0x2a1b || c == 0x2a1c)))) || # ⨛, ⨜ + + (c >= 0x1d6c1 && # variants of \nabla and \partial + (c == 0x1d6c1 || c == 0x1d6db || + c == 0x1d6fb || c == 0x1d715 || + c == 0x1d735 || c == 0x1d74f || + c == 0x1d76f || c == 0x1d789 || + c == 0x1d7a9 || c == 0x1d7c3)) || + + # super- and subscript +-=() + (c >= 0x207a && c <= 0x207e) || + (c >= 0x208a && c <= 0x208e) || + + # angle symbols + (c >= 0x2220 && c <= 0x2222) || # ∠, ∡, ∢ + (c >= 0x299b && c <= 0x29af) || # ⦛, ⦜, ⦝, ⦞, ⦟, ⦠, ⦡, ⦢, ⦣, ⦤, ⦥, ⦦, ⦧, ⦨, ⦩, ⦪, ⦫, ⦬, ⦭, ⦮, ⦯ + + # Other_ID_Start + c == 0x2118 || c == 0x212E || # ℘, ℮ + (c >= 0x309B && c <= 0x309C) || # katakana-hiragana sound marks + + # bold-digits and double-struck digits + (c >= 0x1D7CE && c <= 0x1D7E1)) # 𝟎 through 𝟗 (inclusive), 𝟘 through 𝟡 (inclusive) end +# from jl_id_start_char in julia/src/flisp/julia_extensions.c function is_identifier_start_char(c::Char) - # c == EOF_CHAR && return false # covered by isvalid check - isvalid(c) || return false - return Base.is_id_start_char(c) + if (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_' + return true + end + if c < Char(0xA1) || !isvalid(c) + return false + end + x = UInt32(c) + return _is_identifier_start_char(x, _category_code(x)) +end + +# from jl_id_char in julia/src/flisp/julia_extensions.c +function is_identifier_char(c::Char) + if (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_' || + (c >= '0' && c <= '9') || c == '!' + return true + end + if c < Char(0xA1) || !isvalid(c) + return false + end + x = UInt32(c) + cat = _category_code(x) + _is_identifier_start_char(x, cat) && return true + if (cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_MC || + cat == UTF8PROC_CATEGORY_ND || cat == UTF8PROC_CATEGORY_PC || + cat == UTF8PROC_CATEGORY_SK || cat == UTF8PROC_CATEGORY_ME || + cat == UTF8PROC_CATEGORY_NO || + # primes (single, double, triple, their reverses, and quadruple) + (x >= 0x2032 && x <= 0x2037) || (x == 0x2057)) + return true + end + return false end ##################################################################### diff --git a/test/tokenize.jl b/test/tokenize.jl index 26ab044a..afebd643 100644 --- a/test/tokenize.jl +++ b/test/tokenize.jl @@ -1154,5 +1154,3 @@ end @test strtok("a &&̄ b") == ["a", " ", "&&", "̄", " ", "b", ""] @test strtok("a .&&₁ b") == ["a", " ", ".&&", "₁", " ", "b", ""] end - -end From e34e93d289c9d37115148eff1e96245aa4264aa4 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Sat, 4 Nov 2023 21:17:51 -0400 Subject: [PATCH 03/11] optimize ascii case --- src/unicode.jl | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/unicode.jl b/src/unicode.jl index 677c84e5..4b7a83dd 100644 --- a/src/unicode.jl +++ b/src/unicode.jl @@ -155,8 +155,9 @@ end # from jl_id_start_char in julia/src/flisp/julia_extensions.c function is_identifier_start_char(c::Char) - if (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_' - return true + if isascii(c) + a = c % UInt8 + return (a >= UInt8('A') && a <= UInt8('Z')) || (a >= UInt8('a') && a <= UInt8('z')) || a == UInt8('_') end if c < Char(0xA1) || !isvalid(c) return false @@ -167,9 +168,10 @@ end # from jl_id_char in julia/src/flisp/julia_extensions.c function is_identifier_char(c::Char) - if (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_' || - (c >= '0' && c <= '9') || c == '!' - return true + if isascii(c) + a = c % UInt8 + return (a >= UInt8('A') && a <= UInt8('Z')) || (a >= UInt8('a') && a <= UInt8('z')) || + a == UInt8('_') || (a >= UInt8('0') && a <= UInt8('9')) || a == UInt8('!') end if c < Char(0xA1) || !isvalid(c) return false From 2676e1ad5be5a1f2efe93520d2997fec59aba943 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Sat, 4 Nov 2023 21:20:11 -0400 Subject: [PATCH 04/11] revert accidental change to test --- test/tokenize.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/tokenize.jl b/test/tokenize.jl index afebd643..26ab044a 100644 --- a/test/tokenize.jl +++ b/test/tokenize.jl @@ -1154,3 +1154,5 @@ end @test strtok("a &&̄ b") == ["a", " ", "&&", "̄", " ", "b", ""] @test strtok("a .&&₁ b") == ["a", " ", ".&&", "₁", " ", "b", ""] end + +end From c6101d097de39cb5a0dbbbca3c2d39f5aee951c8 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Sat, 4 Nov 2023 21:22:30 -0400 Subject: [PATCH 05/11] =?UTF-8?q?rm=20fixme=20for=20=F0=9F=8F=B3=EF=B8=8F?= =?UTF-8?q?=E2=80=8D=F0=9F=8C=88=20=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/tokenize.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/tokenize.jl b/test/tokenize.jl index 26ab044a..52169562 100644 --- a/test/tokenize.jl +++ b/test/tokenize.jl @@ -44,8 +44,7 @@ end end # testset @testset "tokenize unicode" begin - # FIXME: rm VERSION check once we implement our own is_identifier_char - emoji = VERSION < v"1.5" ? "😄" : "\U1F3F3\UFE0F\U200D\U1F308" # 🏳️‍🌈 requires newer Unicode + emoji = "\U1F3F3\UFE0F\U200D\U1F308" # == "🏳️‍🌈" str = "𝘋 =2"*emoji for s in [str, IOBuffer(str)] l = tokenize(s) From 5a69b0fa965feaef7cbab2fa073d128c418b91b8 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Sat, 4 Nov 2023 21:25:10 -0400 Subject: [PATCH 06/11] rm obsolete arg --- src/unicode.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unicode.jl b/src/unicode.jl index 4b7a83dd..64982f77 100644 --- a/src/unicode.jl +++ b/src/unicode.jl @@ -74,7 +74,7 @@ function utf8proc_decompose_julia(str, options, buffer, nwords) return ret end -function utf8proc_map_julia(str::Union{String,SubString{String}}, options::Integer, chartransform=identity) +function utf8proc_map_julia(str::Union{String,SubString{String}}, options::Integer) nwords = utf8proc_decompose_julia(str, options, C_NULL, 0) buffer = Base.StringVector(nwords*4) nwords = utf8proc_decompose_julia(str, options, buffer, nwords) From 7fcbdd038b642428bb25b1cc51473b7907543c6e Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Sat, 4 Nov 2023 21:58:07 -0400 Subject: [PATCH 07/11] slight consolidation/optimization of common isascii(c), c % UInt8 codepath --- src/tokenize.jl | 4 ++-- src/unicode.jl | 16 ++++++++++++---- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/tokenize.jl b/src/tokenize.jl index 7db1366d..c8d36bed 100644 --- a/src/tokenize.jl +++ b/src/tokenize.jl @@ -1282,9 +1282,9 @@ function lex_identifier(l::Lexer, c) graphemestate_peek = Ref(zero(Int32)) while true pc, ppc = dpeekchar(l) - ascii = ascii && isascii(pc) + pc_byte = Unicode.ascii_byte(pc) + ascii = ascii && pc_byte != 0xff if ascii # fast path - pc_byte = pc % UInt8 @inbounds if (pc_byte == UInt8('!') && ppc == '=') || !ascii_is_identifier_char[pc_byte+1] break end diff --git a/src/unicode.jl b/src/unicode.jl index 64982f77..e14b2109 100644 --- a/src/unicode.jl +++ b/src/unicode.jl @@ -153,10 +153,18 @@ function _is_identifier_start_char(c::UInt32, cat::Integer) (c >= 0x1D7CE && c <= 0x1D7E1)) # 𝟎 through 𝟗 (inclusive), 𝟘 through 𝟡 (inclusive) end +# utility function to return the ASCII byte if isascii(c), +# and otherwise (for ASCII or invalid chars) return 0xff, +# based on the isascii source code. +@inline function ascii_byte(c::Char) + x = bswap(reinterpret(UInt32, c)) + return x < 0x80 ? x % UInt8 : 0xff +end + # from jl_id_start_char in julia/src/flisp/julia_extensions.c function is_identifier_start_char(c::Char) - if isascii(c) - a = c % UInt8 + a = ascii_byte(c) + if a != 0xff return (a >= UInt8('A') && a <= UInt8('Z')) || (a >= UInt8('a') && a <= UInt8('z')) || a == UInt8('_') end if c < Char(0xA1) || !isvalid(c) @@ -168,8 +176,8 @@ end # from jl_id_char in julia/src/flisp/julia_extensions.c function is_identifier_char(c::Char) - if isascii(c) - a = c % UInt8 + a = ascii_byte(c) + if a != 0xff return (a >= UInt8('A') && a <= UInt8('Z')) || (a >= UInt8('a') && a <= UInt8('z')) || a == UInt8('_') || (a >= UInt8('0') && a <= UInt8('9')) || a == UInt8('!') end From 3ebeaf89d6c526bde509f72e0feb36080433e11c Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Sat, 4 Nov 2023 22:02:26 -0400 Subject: [PATCH 08/11] cleanup using u8_str macro --- src/unicode.jl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/unicode.jl b/src/unicode.jl index e14b2109..97cc280b 100644 --- a/src/unicode.jl +++ b/src/unicode.jl @@ -8,6 +8,8 @@ export category_code, normalize_identifier, is_identifier_char, is_identifier_st using utf8proc_jll: libutf8proc +import ..JuliaSyntax: @u8_str + # these constants have been stable across all utf8proc versions, # so no need to redefine them: import Base.Unicode: UTF8PROC_CATEGORY_CN, UTF8PROC_CATEGORY_LU, UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT, UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO, UTF8PROC_CATEGORY_MN, UTF8PROC_CATEGORY_MC, UTF8PROC_CATEGORY_ME, UTF8PROC_CATEGORY_ND, UTF8PROC_CATEGORY_NL, UTF8PROC_CATEGORY_NO, UTF8PROC_CATEGORY_PC, UTF8PROC_CATEGORY_PD, UTF8PROC_CATEGORY_PS, UTF8PROC_CATEGORY_PE, UTF8PROC_CATEGORY_PI, UTF8PROC_CATEGORY_PF, UTF8PROC_CATEGORY_PO, UTF8PROC_CATEGORY_SM, UTF8PROC_CATEGORY_SC, UTF8PROC_CATEGORY_SK, UTF8PROC_CATEGORY_SO, UTF8PROC_CATEGORY_ZS, UTF8PROC_CATEGORY_ZL, UTF8PROC_CATEGORY_ZP, UTF8PROC_CATEGORY_CC, UTF8PROC_CATEGORY_CF, UTF8PROC_CATEGORY_CS, UTF8PROC_CATEGORY_CO @@ -165,7 +167,7 @@ end function is_identifier_start_char(c::Char) a = ascii_byte(c) if a != 0xff - return (a >= UInt8('A') && a <= UInt8('Z')) || (a >= UInt8('a') && a <= UInt8('z')) || a == UInt8('_') + return (a >= u8"A" && a <= u8"Z") || (a >= u8"a" && a <= u8"z") || a == u8"_" end if c < Char(0xA1) || !isvalid(c) return false @@ -178,8 +180,8 @@ end function is_identifier_char(c::Char) a = ascii_byte(c) if a != 0xff - return (a >= UInt8('A') && a <= UInt8('Z')) || (a >= UInt8('a') && a <= UInt8('z')) || - a == UInt8('_') || (a >= UInt8('0') && a <= UInt8('9')) || a == UInt8('!') + return (a >= u8"A" && a <= u8"Z") || (a >= u8"a" && a <= u8"z") || + a == u8"_" || (a >= u8"0" && a <= u8"9") || a == u8"!" end if c < Char(0xA1) || !isvalid(c) return false From 8f55073fefc7b400a0789502bae049a2c47c3c69 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Sat, 4 Nov 2023 22:09:09 -0400 Subject: [PATCH 09/11] pin utf8proc_jll to a specific minor version --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 3dc96fae..05b01bcf 100644 --- a/Project.toml +++ b/Project.toml @@ -8,7 +8,7 @@ utf8proc_jll = "00992c89-a35c-5347-9984-e6609dacc59a" [compat] julia = "1.0" -utf8proc_jll = "2.9" # for Unicode 15.1 +utf8proc_jll = "~2.9" # = 2.9.x for Unicode 15.1 [extras] Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" From 1506668d001aa263cf6b2c39c6ec8708c8ee093e Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Sat, 4 Nov 2023 22:11:34 -0400 Subject: [PATCH 10/11] clarify comments --- src/unicode.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/unicode.jl b/src/unicode.jl index 97cc280b..8edaacbd 100644 --- a/src/unicode.jl +++ b/src/unicode.jl @@ -156,7 +156,7 @@ function _is_identifier_start_char(c::UInt32, cat::Integer) end # utility function to return the ASCII byte if isascii(c), -# and otherwise (for ASCII or invalid chars) return 0xff, +# and otherwise (for non-ASCII or invalid chars) return 0xff, # based on the isascii source code. @inline function ascii_byte(c::Char) x = bswap(reinterpret(UInt32, c)) @@ -166,7 +166,7 @@ end # from jl_id_start_char in julia/src/flisp/julia_extensions.c function is_identifier_start_char(c::Char) a = ascii_byte(c) - if a != 0xff + if a != 0xff # ascii fast path return (a >= u8"A" && a <= u8"Z") || (a >= u8"a" && a <= u8"z") || a == u8"_" end if c < Char(0xA1) || !isvalid(c) @@ -179,7 +179,7 @@ end # from jl_id_char in julia/src/flisp/julia_extensions.c function is_identifier_char(c::Char) a = ascii_byte(c) - if a != 0xff + if a != 0xff # ascii fast path return (a >= u8"A" && a <= u8"Z") || (a >= u8"a" && a <= u8"z") || a == u8"_" || (a >= u8"0" && a <= u8"9") || a == u8"!" end From c820724ceea2da1e4d31ea916e1939e75359fbe9 Mon Sep 17 00:00:00 2001 From: Claire Foster Date: Tue, 23 Jan 2024 16:03:13 +1000 Subject: [PATCH 11/11] Use UnicodeNext for unicode functionality Here I've put tokenization-related functionality like `is_id_start_char()` in the Tokenize module. --- Project.toml | 3 +- src/JuliaSyntax.jl | 3 +- src/literal_parsing.jl | 19 +++- src/source_files.jl | 2 +- src/tokenize.jl | 162 +++++++++++++++++++++++++++----- src/unicode.jl | 205 ----------------------------------------- 6 files changed, 160 insertions(+), 234 deletions(-) delete mode 100644 src/unicode.jl diff --git a/Project.toml b/Project.toml index 05b01bcf..bed57ee1 100644 --- a/Project.toml +++ b/Project.toml @@ -4,11 +4,10 @@ authors = ["Claire Foster and contributors"] version = "0.4.6" [deps] -utf8proc_jll = "00992c89-a35c-5347-9984-e6609dacc59a" +UnicodeNext = "7b9d9d2f-29eb-4111-b31d-f1cfc33d1412" [compat] julia = "1.0" -utf8proc_jll = "~2.9" # = 2.9.x for Unicode 15.1 [extras] Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" diff --git a/src/JuliaSyntax.jl b/src/JuliaSyntax.jl index 8a8f6140..bd1fe5e0 100644 --- a/src/JuliaSyntax.jl +++ b/src/JuliaSyntax.jl @@ -1,5 +1,7 @@ module JuliaSyntax +using UnicodeNext + # Conservative list of exports - only export the most common/useful things # here. @@ -20,7 +22,6 @@ export SyntaxNode # Helper utilities include("utils.jl") -include("unicode.jl") include("kinds.jl") diff --git a/src/literal_parsing.jl b/src/literal_parsing.jl index 81ab5b32..3277380e 100644 --- a/src/literal_parsing.jl +++ b/src/literal_parsing.jl @@ -331,7 +331,24 @@ end #------------------------------------------------------------------------------- # Unicode normalization. -using .Unicode: normalize_identifier +function normalize_identifier(c::Char) + if c <= '~' + return c # ASCII common case + end + return c == '\u025B' ? '\u03B5' : # 'ɛ' => 'ε' + c == '\u00B5' ? '\u03BC' : # 'µ' => 'μ' + c == '\u00B7' ? '\u22C5' : # '·' => '⋅' + c == '\u0387' ? '\u22C5' : # '·' => '⋅' + c == '\u2212' ? '\u002D' : # '−' (\minus) => '-' + c == '\u210F' ? '\u0127' : # 'ℏ' (\hslash) => 'ħ' \hbar + c +end + +function normalize_identifier(str::AbstractString) + isascii(str) ? str : + UnicodeNext.normalize(str, stable=true, compose=true, + chartransform=normalize_identifier) +end #------------------------------------------------------------------------------- function parse_julia_literal(txtbuf::Vector{UInt8}, head::SyntaxHead, srcrange) diff --git a/src/source_files.jl b/src/source_files.jl index a0871347..5b37c395 100644 --- a/src/source_files.jl +++ b/src/source_files.jl @@ -145,7 +145,7 @@ function _print_marker_line(io, prefix_str, str, underline, singleline, color, # Getting exactly the same width of whitespace as `str` is tricky. # Especially for mixtures of tabs and spaces. # tabs are zero width according to textwidth - indent = join(Unicode.isspace(c) ? c : repeat(' ', textwidth(c)) for c in prefix_str) + indent = join(UnicodeNext.isspace(c) ? c : repeat(' ', textwidth(c)) for c in prefix_str) # Assume tabs are 4 wide rather than 0. (fixme: implement tab alignment?) w = textwidth(str) + 4*count(c->c=='\t', str) diff --git a/src/tokenize.jl b/src/tokenize.jl index c8d36bed..37445a34 100644 --- a/src/tokenize.jl +++ b/src/tokenize.jl @@ -2,9 +2,9 @@ module Tokenize export tokenize, untokenize -using ..JuliaSyntax: JuliaSyntax, Kind, @K_str, @KSet_str +using ..JuliaSyntax: JuliaSyntax, Kind, @K_str, @KSet_str, @u8_str -import ..JuliaSyntax: kind, Unicode, +import ..JuliaSyntax: kind, is_literal, is_error, is_contextual_keyword, is_word_operator #------------------------------------------------------------------------------- @@ -12,7 +12,119 @@ import ..JuliaSyntax: kind, Unicode, const EOF_CHAR = typemax(Char) -using .Unicode: is_identifier_char, is_identifier_start_char +# Julia identifier parsing predicates + +using UnicodeNext + +import UnicodeNext: CATEGORY_CS, CATEGORY_LL, CATEGORY_LM, CATEGORY_LO, + CATEGORY_LT, CATEGORY_LU, CATEGORY_MC, CATEGORY_ME, CATEGORY_MN, + CATEGORY_ND, CATEGORY_NL, CATEGORY_NO, CATEGORY_PC, CATEGORY_PD, + CATEGORY_PO, CATEGORY_SC, CATEGORY_SK, CATEGORY_SO, CATEGORY_ZS + +# port of is_wc_cat_id_start from julia/src/flisp/julia_extensions.c +function _is_identifier_start_char(c::UInt32, cat::Integer) + return (cat == CATEGORY_LU || cat == CATEGORY_LL || + cat == CATEGORY_LT || cat == CATEGORY_LM || + cat == CATEGORY_LO || cat == CATEGORY_NL || + cat == CATEGORY_SC || # allow currency symbols + # other symbols, but not arrows or replacement characters + (cat == CATEGORY_SO && !(c >= 0x2190 && c <= 0x21FF) && + c != 0xfffc && c != 0xfffd && + c != 0x233f && # notslash + c != 0x00a6) || # broken bar + + # math symbol (category Sm) whitelist + (c >= 0x2140 && c <= 0x2a1c && + ((c >= 0x2140 && c <= 0x2144) || # ⅀, ⅁, ⅂, ⅃, ⅄ + c == 0x223f || c == 0x22be || c == 0x22bf || # ∿, ⊾, ⊿ + c == 0x22a4 || c == 0x22a5 || # ⊤ ⊥ + + (c >= 0x2200 && c <= 0x2233 && + (c == 0x2202 || c == 0x2205 || c == 0x2206 || # ∂, ∅, ∆ + c == 0x2207 || c == 0x220e || c == 0x220f || # ∇, ∎, ∏ + c == 0x2200 || c == 0x2203 || c == 0x2204 || # ∀, ∃, ∄ + c == 0x2210 || c == 0x2211 || # ∐, ∑ + c == 0x221e || c == 0x221f || # ∞, ∟ + c >= 0x222b)) || # ∫, ∬, ∭, ∮, ∯, ∰, ∱, ∲, ∳ + + (c >= 0x22c0 && c <= 0x22c3) || # N-ary big ops: ⋀, ⋁, ⋂, ⋃ + (c >= 0x25F8 && c <= 0x25ff) || # ◸, ◹, ◺, ◻, ◼, ◽, ◾, ◿ + + (c >= 0x266f && + (c == 0x266f || c == 0x27d8 || c == 0x27d9 || # ♯, ⟘, ⟙ + (c >= 0x27c0 && c <= 0x27c1) || # ⟀, ⟁ + (c >= 0x29b0 && c <= 0x29b4) || # ⦰, ⦱, ⦲, ⦳, ⦴ + (c >= 0x2a00 && c <= 0x2a06) || # ⨀, ⨁, ⨂, ⨃, ⨄, ⨅, ⨆ + (c >= 0x2a09 && c <= 0x2a16) || # ⨉, ⨊, ⨋, ⨌, ⨍, ⨎, ⨏, ⨐, ⨑, ⨒, ⨓, ⨔, ⨕, ⨖ + c == 0x2a1b || c == 0x2a1c)))) || # ⨛, ⨜ + + (c >= 0x1d6c1 && # variants of \nabla and \partial + (c == 0x1d6c1 || c == 0x1d6db || + c == 0x1d6fb || c == 0x1d715 || + c == 0x1d735 || c == 0x1d74f || + c == 0x1d76f || c == 0x1d789 || + c == 0x1d7a9 || c == 0x1d7c3)) || + + # super- and subscript +-=() + (c >= 0x207a && c <= 0x207e) || + (c >= 0x208a && c <= 0x208e) || + + # angle symbols + (c >= 0x2220 && c <= 0x2222) || # ∠, ∡, ∢ + (c >= 0x299b && c <= 0x29af) || # ⦛, ⦜, ⦝, ⦞, ⦟, ⦠, ⦡, ⦢, ⦣, ⦤, ⦥, ⦦, ⦧, ⦨, ⦩, ⦪, ⦫, ⦬, ⦭, ⦮, ⦯ + + # Other_ID_Start + c == 0x2118 || c == 0x212E || # ℘, ℮ + (c >= 0x309B && c <= 0x309C) || # katakana-hiragana sound marks + + # bold-digits and double-struck digits + (c >= 0x1D7CE && c <= 0x1D7E1)) # 𝟎 through 𝟗 (inclusive), 𝟘 through 𝟡 (inclusive) +end + +# utility function to return the ASCII byte if isascii(c), +# and otherwise (for non-ASCII or invalid chars) return 0xff, +# based on the isascii source code. +@inline function _ascii_byte(c::Char) + x = bswap(reinterpret(UInt32, c)) + return x < 0x80 ? x % UInt8 : 0xff +end + +# from jl_id_start_char in julia/src/flisp/julia_extensions.c +function is_identifier_start_char(c::Char) + a = _ascii_byte(c) + if a != 0xff # ascii fast path + return (a >= u8"A" && a <= u8"Z") || (a >= u8"a" && a <= u8"z") || a == u8"_" + end + if c < Char(0xA1) || !isvalid(c) + return false + end + x = UInt32(c) + return _is_identifier_start_char(x, UnicodeNext.category_code(x)) +end + +# from jl_id_char in julia/src/flisp/julia_extensions.c +function is_identifier_char(c::Char) + a = _ascii_byte(c) + if a != 0xff # ascii fast path + return (a >= u8"A" && a <= u8"Z") || (a >= u8"a" && a <= u8"z") || + a == u8"_" || (a >= u8"0" && a <= u8"9") || a == u8"!" + end + if c < Char(0xA1) || !isvalid(c) + return false + end + x = UInt32(c) + cat = UnicodeNext.category_code(x) + _is_identifier_start_char(x, cat) && return true + if (cat == CATEGORY_MN || cat == CATEGORY_MC || + cat == CATEGORY_ND || cat == CATEGORY_PC || + cat == CATEGORY_SK || cat == CATEGORY_ME || + cat == CATEGORY_NO || + # primes (single, double, triple, their reverses, and quadruple) + (x >= 0x2032 && x <= 0x2037) || (x == 0x2057)) + return true + end + return false +end function is_invisible_char(c::Char) # These are the chars considered invisible by the reference parser. @@ -33,15 +145,15 @@ end # Chars that we will never allow to be part of a valid non-operator identifier function is_never_id_char(ch::Char) isvalid(ch) || return true - cat = Unicode.category_code(ch) + cat = UnicodeNext.category_code(ch) c = UInt32(ch) return ( # spaces and control characters: - (cat >= Unicode.UTF8PROC_CATEGORY_ZS && cat <= Unicode.UTF8PROC_CATEGORY_CS) || + (cat >= CATEGORY_ZS && cat <= CATEGORY_CS) || # ASCII and Latin1 non-connector punctuation (c < 0xff && - cat >= Unicode.UTF8PROC_CATEGORY_PD && cat <= Unicode.UTF8PROC_CATEGORY_PO) || + cat >= CATEGORY_PD && cat <= CATEGORY_PO) || c == UInt32('`') || @@ -61,7 +173,7 @@ end readchar(io::IO) = eof(io) ? EOF_CHAR : read(io, Char) # Some unicode operators are normalized by the tokenizer into their equivalent -# kinds. See also Unicode.normalize_identifier() +# kinds. See also normalize_identifier() const _ops_with_unicode_aliases = [ # \minus '−' is normalized into K"-", '−' => K"-" @@ -126,10 +238,10 @@ end if (u < 0xa1 || u > 0x10ffff) return false end - cat = Unicode.category_code(u) - if (cat == Unicode.UTF8PROC_CATEGORY_MN || - cat == Unicode.UTF8PROC_CATEGORY_MC || - cat == Unicode.UTF8PROC_CATEGORY_ME) + cat = UnicodeNext.category_code(u) + if (cat == CATEGORY_MN || + cat == CATEGORY_MC || + cat == CATEGORY_ME) return true end # Additional allowed cases @@ -215,7 +327,7 @@ end @inline ishex(c::Char) = isdigit(c) || ('a' <= c <= 'f') || ('A' <= c <= 'F') @inline isbinary(c::Char) = c == '0' || c == '1' @inline isoctal(c::Char) = '0' ≤ c ≤ '7' -@inline iswhitespace(c::Char) = (isvalid(c) && Unicode.isspace(c)) || c === '\ufeff' +@inline iswhitespace(c::Char) = (isvalid(c) && UnicodeNext.isspace(c)) || c === '\ufeff' struct StringState triplestr::Bool @@ -1278,25 +1390,27 @@ function lex_identifier(l::Lexer, c) h = simple_hash(c, UInt64(0)) n = 1 ascii = isascii(c) - graphemestate = Ref(Int32(ascii)) # all ASCII id chars are UTF8PROC_BOUNDCLASS_OTHER - graphemestate_peek = Ref(zero(Int32)) + graphemestate = UnicodeNext.GraphemeState(c) while true pc, ppc = dpeekchar(l) - pc_byte = Unicode.ascii_byte(pc) + pc_byte = _ascii_byte(pc) ascii = ascii && pc_byte != 0xff if ascii # fast path @inbounds if (pc_byte == UInt8('!') && ppc == '=') || !ascii_is_identifier_char[pc_byte+1] break end - elseif Unicode.isgraphemebreak!(graphemestate, c, pc) - if (pc == '!' && ppc == '=') || !is_identifier_char(pc) - break - end - elseif pc in ('\u200c','\u200d') # ZWNJ/ZWJ control characters - # ZWJ/ZWNJ only within grapheme sequences, not at end - graphemestate_peek[] = graphemestate[] - if Unicode.isgraphemebreak!(graphemestate_peek, pc, ppc) - break + else + graphemestate, isbreak = UnicodeNext.isgraphemebreak(graphemestate, pc) + if isbreak + if ((pc == '!' && ppc == '=') || !is_identifier_char(pc)) + break + end + elseif pc in ('\u200c','\u200d') # ZWNJ/ZWJ control characters + # ZWJ/ZWNJ only within grapheme sequences, not at end + _, isbreak_peek = UnicodeNext.isgraphemebreak(graphemestate, ppc) + if isbreak_peek + break + end end end c = readchar(l) diff --git a/src/unicode.jl b/src/unicode.jl deleted file mode 100644 index 8edaacbd..00000000 --- a/src/unicode.jl +++ /dev/null @@ -1,205 +0,0 @@ -# this is a mirror of some of Base.Unicode and related functions, -# but using the utf8proc_jll version tied to JuliaSyntax, so that -# the supported Unicode version is the same across Julia versions. - -module Unicode - -export category_code, normalize_identifier, is_identifier_char, is_identifier_start_char - -using utf8proc_jll: libutf8proc - -import ..JuliaSyntax: @u8_str - -# these constants have been stable across all utf8proc versions, -# so no need to redefine them: -import Base.Unicode: UTF8PROC_CATEGORY_CN, UTF8PROC_CATEGORY_LU, UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT, UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO, UTF8PROC_CATEGORY_MN, UTF8PROC_CATEGORY_MC, UTF8PROC_CATEGORY_ME, UTF8PROC_CATEGORY_ND, UTF8PROC_CATEGORY_NL, UTF8PROC_CATEGORY_NO, UTF8PROC_CATEGORY_PC, UTF8PROC_CATEGORY_PD, UTF8PROC_CATEGORY_PS, UTF8PROC_CATEGORY_PE, UTF8PROC_CATEGORY_PI, UTF8PROC_CATEGORY_PF, UTF8PROC_CATEGORY_PO, UTF8PROC_CATEGORY_SM, UTF8PROC_CATEGORY_SC, UTF8PROC_CATEGORY_SK, UTF8PROC_CATEGORY_SO, UTF8PROC_CATEGORY_ZS, UTF8PROC_CATEGORY_ZL, UTF8PROC_CATEGORY_ZP, UTF8PROC_CATEGORY_CC, UTF8PROC_CATEGORY_CF, UTF8PROC_CATEGORY_CS, UTF8PROC_CATEGORY_CO - -using Base: ismalformed - -##################################################################### -# functions copied almost as-is from Base.Unicode, with the only change -# being that they now ccall into utf8proc_jll. - -utf8proc_error(result) = error(unsafe_string(ccall((:utf8proc_errmsg,libutf8proc), Cstring, (Cssize_t,), result))) - -# Stateful grapheme break required by Unicode-9 rules: the string -# must be processed in sequence, with state initialized to Ref{Int32}(0). -# Requires utf8proc v2.0 or later. -function isgraphemebreak!(state::Ref{Int32}, c1::AbstractChar, c2::AbstractChar) - if ismalformed(c1) || ismalformed(c2) - state[] = 0 - return true - end - ccall((:utf8proc_grapheme_break_stateful,libutf8proc), Bool, - (UInt32, UInt32, Ref{Int32}), c1, c2, state) -end - -# returns UTF8PROC_CATEGORY code in 0:30 giving Unicode category -function category_code(c::AbstractChar) - !ismalformed(c) ? category_code(UInt32(c)) : Cint(31) -end - -# doesn't check validity of x: -@inline _category_code(x::Integer) = ccall((:utf8proc_category,libutf8proc), Cint, (UInt32,), x) - -function category_code(x::Integer) - x ≤ 0x10ffff ? _category_code(x) : Cint(30) -end - -@inline isspace(c::AbstractChar) = - c == ' ' || '\t' <= c <= '\r' || c == '\u85' || - '\ua0' <= c && category_code(c) == UTF8PROC_CATEGORY_ZS - -##################################################################### -# Julia identifier normalization, closely based on functions -# from Base.Unicode except that we hard-code the Julia -# chartransform (working around JuliaLang/julia#45716) - -# Julia's custom character normalization mapping, based on -# julia/src/flisp/julia_charmap.h: -function julia_custom_func(codepoint::UInt32, ::Ptr{Cvoid})::UInt32 - (codepoint < 0x007f ? codepoint : # optimize for ASCII common case - codepoint == 0x025B ? 0x03B5 : # 'ɛ' => 'ε' - codepoint == 0x00B5 ? 0x03BC : # 'µ' => 'μ' - codepoint == 0x00B7 ? 0x22C5 : # '·' => '⋅' - codepoint == 0x0387 ? 0x22C5 : # '·' => '⋅' - codepoint == 0x2212 ? 0x002D : # '−' (\minus) => '-' - codepoint == 0x210F ? 0x0127 : # 'ℏ' (\hslash) => 'ħ' \hbar - codepoint) -end - -function utf8proc_decompose_julia(str, options, buffer, nwords) - ret = ccall((:utf8proc_decompose_custom,libutf8proc), Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint, Ptr{Cvoid}, Ptr{Cvoid}), - str, sizeof(str), buffer, nwords, options, - @cfunction(julia_custom_func, UInt32, (UInt32, Ptr{Cvoid})), C_NULL) - ret < 0 && utf8proc_error(ret) - return ret -end - -function utf8proc_map_julia(str::Union{String,SubString{String}}, options::Integer) - nwords = utf8proc_decompose_julia(str, options, C_NULL, 0) - buffer = Base.StringVector(nwords*4) - nwords = utf8proc_decompose_julia(str, options, buffer, nwords) - nbytes = ccall((:utf8proc_reencode,libutf8proc), Int, (Ptr{UInt8}, Int, Cint), buffer, nwords, options) - nbytes < 0 && utf8proc_error(nbytes) - return String(resize!(buffer, nbytes)) -end - -function normalize_identifier(str) - # note that the values of UTF8PROC_x constants have not changed - # over many utf8proc versions, so we can use them from Base.Unicode - flags = Base.Unicode.UTF8PROC_STABLE | Base.Unicode.UTF8PROC_COMPOSE - return isascii(str) ? str : utf8proc_map_julia(str, flags) -end - -##################################################################### -# Julia identifier parsing predicates - -# port of is_wc_cat_id_start from julia/src/flisp/julia_extensions.c -function _is_identifier_start_char(c::UInt32, cat::Integer) - return (cat == UTF8PROC_CATEGORY_LU || cat == UTF8PROC_CATEGORY_LL || - cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LM || - cat == UTF8PROC_CATEGORY_LO || cat == UTF8PROC_CATEGORY_NL || - cat == UTF8PROC_CATEGORY_SC || # allow currency symbols - # other symbols, but not arrows or replacement characters - (cat == UTF8PROC_CATEGORY_SO && !(c >= 0x2190 && c <= 0x21FF) && - c != 0xfffc && c != 0xfffd && - c != 0x233f && # notslash - c != 0x00a6) || # broken bar - - # math symbol (category Sm) whitelist - (c >= 0x2140 && c <= 0x2a1c && - ((c >= 0x2140 && c <= 0x2144) || # ⅀, ⅁, ⅂, ⅃, ⅄ - c == 0x223f || c == 0x22be || c == 0x22bf || # ∿, ⊾, ⊿ - c == 0x22a4 || c == 0x22a5 || # ⊤ ⊥ - - (c >= 0x2200 && c <= 0x2233 && - (c == 0x2202 || c == 0x2205 || c == 0x2206 || # ∂, ∅, ∆ - c == 0x2207 || c == 0x220e || c == 0x220f || # ∇, ∎, ∏ - c == 0x2200 || c == 0x2203 || c == 0x2204 || # ∀, ∃, ∄ - c == 0x2210 || c == 0x2211 || # ∐, ∑ - c == 0x221e || c == 0x221f || # ∞, ∟ - c >= 0x222b)) || # ∫, ∬, ∭, ∮, ∯, ∰, ∱, ∲, ∳ - - (c >= 0x22c0 && c <= 0x22c3) || # N-ary big ops: ⋀, ⋁, ⋂, ⋃ - (c >= 0x25F8 && c <= 0x25ff) || # ◸, ◹, ◺, ◻, ◼, ◽, ◾, ◿ - - (c >= 0x266f && - (c == 0x266f || c == 0x27d8 || c == 0x27d9 || # ♯, ⟘, ⟙ - (c >= 0x27c0 && c <= 0x27c1) || # ⟀, ⟁ - (c >= 0x29b0 && c <= 0x29b4) || # ⦰, ⦱, ⦲, ⦳, ⦴ - (c >= 0x2a00 && c <= 0x2a06) || # ⨀, ⨁, ⨂, ⨃, ⨄, ⨅, ⨆ - (c >= 0x2a09 && c <= 0x2a16) || # ⨉, ⨊, ⨋, ⨌, ⨍, ⨎, ⨏, ⨐, ⨑, ⨒, ⨓, ⨔, ⨕, ⨖ - c == 0x2a1b || c == 0x2a1c)))) || # ⨛, ⨜ - - (c >= 0x1d6c1 && # variants of \nabla and \partial - (c == 0x1d6c1 || c == 0x1d6db || - c == 0x1d6fb || c == 0x1d715 || - c == 0x1d735 || c == 0x1d74f || - c == 0x1d76f || c == 0x1d789 || - c == 0x1d7a9 || c == 0x1d7c3)) || - - # super- and subscript +-=() - (c >= 0x207a && c <= 0x207e) || - (c >= 0x208a && c <= 0x208e) || - - # angle symbols - (c >= 0x2220 && c <= 0x2222) || # ∠, ∡, ∢ - (c >= 0x299b && c <= 0x29af) || # ⦛, ⦜, ⦝, ⦞, ⦟, ⦠, ⦡, ⦢, ⦣, ⦤, ⦥, ⦦, ⦧, ⦨, ⦩, ⦪, ⦫, ⦬, ⦭, ⦮, ⦯ - - # Other_ID_Start - c == 0x2118 || c == 0x212E || # ℘, ℮ - (c >= 0x309B && c <= 0x309C) || # katakana-hiragana sound marks - - # bold-digits and double-struck digits - (c >= 0x1D7CE && c <= 0x1D7E1)) # 𝟎 through 𝟗 (inclusive), 𝟘 through 𝟡 (inclusive) -end - -# utility function to return the ASCII byte if isascii(c), -# and otherwise (for non-ASCII or invalid chars) return 0xff, -# based on the isascii source code. -@inline function ascii_byte(c::Char) - x = bswap(reinterpret(UInt32, c)) - return x < 0x80 ? x % UInt8 : 0xff -end - -# from jl_id_start_char in julia/src/flisp/julia_extensions.c -function is_identifier_start_char(c::Char) - a = ascii_byte(c) - if a != 0xff # ascii fast path - return (a >= u8"A" && a <= u8"Z") || (a >= u8"a" && a <= u8"z") || a == u8"_" - end - if c < Char(0xA1) || !isvalid(c) - return false - end - x = UInt32(c) - return _is_identifier_start_char(x, _category_code(x)) -end - -# from jl_id_char in julia/src/flisp/julia_extensions.c -function is_identifier_char(c::Char) - a = ascii_byte(c) - if a != 0xff # ascii fast path - return (a >= u8"A" && a <= u8"Z") || (a >= u8"a" && a <= u8"z") || - a == u8"_" || (a >= u8"0" && a <= u8"9") || a == u8"!" - end - if c < Char(0xA1) || !isvalid(c) - return false - end - x = UInt32(c) - cat = _category_code(x) - _is_identifier_start_char(x, cat) && return true - if (cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_MC || - cat == UTF8PROC_CATEGORY_ND || cat == UTF8PROC_CATEGORY_PC || - cat == UTF8PROC_CATEGORY_SK || cat == UTF8PROC_CATEGORY_ME || - cat == UTF8PROC_CATEGORY_NO || - # primes (single, double, triple, their reverses, and quadruple) - (x >= 0x2032 && x <= 0x2037) || (x == 0x2057)) - return true - end - return false -end - -##################################################################### - -end