From 85f64dd1ddb79a8502953b3717bede136bb88cd4 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Sat, 4 Nov 2023 20:43:11 -0400
Subject: [PATCH 01/11] initial port of Base.Unicode

---
 Project.toml           |   6 ++-
 src/JuliaSyntax.jl     |   1 +
 src/literal_parsing.jl |  43 +----------------
 src/source_files.jl    |   2 +-
 src/tokenize.jl        |  27 +++--------
 src/unicode.jl         | 107 +++++++++++++++++++++++++++++++++++++++++
 6 files changed, 123 insertions(+), 63 deletions(-)
 create mode 100644 src/unicode.jl

diff --git a/Project.toml b/Project.toml
index 6ffbaa40..3dc96fae 100644
--- a/Project.toml
+++ b/Project.toml
@@ -3,10 +3,12 @@ uuid = "70703baa-626e-46a2-a12c-08ffd08c73b4"
 authors = ["Claire Foster <aka.c42f@gmail.com> and contributors"]
 version = "0.4.6"
 
+[deps]
+utf8proc_jll = "00992c89-a35c-5347-9984-e6609dacc59a"
+
 [compat]
 julia = "1.0"
-
-[deps]
+utf8proc_jll = "2.9" # for Unicode 15.1
 
 [extras]
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
diff --git a/src/JuliaSyntax.jl b/src/JuliaSyntax.jl
index 3f1ad27a..8a8f6140 100644
--- a/src/JuliaSyntax.jl
+++ b/src/JuliaSyntax.jl
@@ -20,6 +20,7 @@ export SyntaxNode
 
 # Helper utilities
 include("utils.jl")
+include("unicode.jl")
 
 include("kinds.jl")
 
diff --git a/src/literal_parsing.jl b/src/literal_parsing.jl
index a027985a..81ab5b32 100644
--- a/src/literal_parsing.jl
+++ b/src/literal_parsing.jl
@@ -329,47 +329,9 @@ function unescape_julia_string(io::IO, txtbuf::Vector{UInt8},
 end
 
 #-------------------------------------------------------------------------------
-# Unicode normalization. As of Julia 1.8, this is part of Base and the Unicode
-# stdlib under the name `Unicode.julia_chartransform`. See
-# https://github.com/JuliaLang/julia/pull/42561
-#
-# To allow use on older Julia versions and to workaround the bug
-# https://github.com/JuliaLang/julia/issues/45716
-# we reproduce a specialized version of that logic here.
-
-# static wrapper around user callback function
-function utf8proc_custom_func(codepoint::UInt32, ::Ptr{Cvoid})::UInt32
-    (codepoint == 0x025B ? 0x03B5 :  # 'ɛ' => 'ε'
-    codepoint == 0x00B5 ? 0x03BC :   # 'µ' => 'μ'
-    codepoint == 0x00B7 ? 0x22C5 :   # '·' => '⋅'
-    codepoint == 0x0387 ? 0x22C5 :   # '·' => '⋅'
-    codepoint == 0x2212 ? 0x002D :   # '−' (\minus) => '-'
-    codepoint == 0x210F ? 0x0127 :   # 'ℏ' (\hslash) => 'ħ' \hbar
-    codepoint)
-end
-
-function utf8proc_decompose(str, options, buffer, nwords)
-    ret = ccall(:utf8proc_decompose_custom, Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint, Ptr{Cvoid}, Ptr{Cvoid}),
-                str, sizeof(str), buffer, nwords, options,
-                @cfunction(utf8proc_custom_func, UInt32, (UInt32, Ptr{Cvoid})), C_NULL)
-    ret < 0 && Base.Unicode.utf8proc_error(ret)
-    return ret
-end
-
-function utf8proc_map(str::Union{String,SubString{String}}, options::Integer)
-    nwords = utf8proc_decompose(str, options, C_NULL, 0)
-    buffer = Base.StringVector(nwords*4)
-    nwords = utf8proc_decompose(str, options, buffer, nwords)
-    nbytes = ccall(:utf8proc_reencode, Int, (Ptr{UInt8}, Int, Cint), buffer, nwords, options)
-    nbytes < 0 && Base.Unicode.utf8proc_error(nbytes)
-    return String(resize!(buffer, nbytes))
-end
-
-function normalize_identifier(str)
-    flags = Base.Unicode.UTF8PROC_STABLE | Base.Unicode.UTF8PROC_COMPOSE
-    return isascii(str) ? str : utf8proc_map(str, flags)
-end
+# Unicode normalization.
 
+using .Unicode: normalize_identifier
 
 #-------------------------------------------------------------------------------
 function parse_julia_literal(txtbuf::Vector{UInt8}, head::SyntaxHead, srcrange)
@@ -451,4 +413,3 @@ function parse_julia_literal(txtbuf::Vector{UInt8}, head::SyntaxHead, srcrange)
         ErrorVal()
     end
 end
-
diff --git a/src/source_files.jl b/src/source_files.jl
index 06cae008..a0871347 100644
--- a/src/source_files.jl
+++ b/src/source_files.jl
@@ -145,7 +145,7 @@ function _print_marker_line(io, prefix_str, str, underline, singleline, color,
     # Getting exactly the same width of whitespace as `str` is tricky.
     # Especially for mixtures of tabs and spaces.
     # tabs are zero width according to textwidth
-    indent = join(isspace(c) ? c : repeat(' ', textwidth(c)) for c in prefix_str)
+    indent = join(Unicode.isspace(c) ? c : repeat(' ', textwidth(c)) for c in prefix_str)
 
     # Assume tabs are 4 wide rather than 0. (fixme: implement tab alignment?)
     w = textwidth(str) + 4*count(c->c=='\t', str)
diff --git a/src/tokenize.jl b/src/tokenize.jl
index 9c19c040..7db1366d 100644
--- a/src/tokenize.jl
+++ b/src/tokenize.jl
@@ -4,26 +4,15 @@ export tokenize, untokenize
 
 using ..JuliaSyntax: JuliaSyntax, Kind, @K_str, @KSet_str
 
-import ..JuliaSyntax: kind,
+import ..JuliaSyntax: kind, Unicode,
     is_literal, is_error, is_contextual_keyword, is_word_operator
 
 #-------------------------------------------------------------------------------
 # Character-based predicates for tokenization
-import Base.Unicode
 
 const EOF_CHAR = typemax(Char)
 
-function is_identifier_char(c::Char)
-    c == EOF_CHAR && return false
-    isvalid(c) || return false
-    return Base.is_id_char(c)
-end
-
-function is_identifier_start_char(c::Char)
-    c == EOF_CHAR && return false
-    isvalid(c) || return false
-    return Base.is_id_start_char(c)
-end
+using .Unicode: is_identifier_char, is_identifier_start_char
 
 function is_invisible_char(c::Char)
     # These are the chars considered invisible by the reference parser.
@@ -72,7 +61,7 @@ end
 readchar(io::IO) = eof(io) ? EOF_CHAR : read(io, Char)
 
 # Some unicode operators are normalized by the tokenizer into their equivalent
-# kinds. See also normalize_identifier()
+# kinds. See also Unicode.normalize_identifier()
 const _ops_with_unicode_aliases = [
     # \minus '−' is normalized into K"-",
     '−' => K"-"
@@ -137,10 +126,10 @@ end
     if (u < 0xa1 || u > 0x10ffff)
         return false
     end
-    cat = Base.Unicode.category_code(u)
-    if (cat == Base.Unicode.UTF8PROC_CATEGORY_MN ||
-        cat == Base.Unicode.UTF8PROC_CATEGORY_MC ||
-        cat == Base.Unicode.UTF8PROC_CATEGORY_ME)
+    cat = Unicode.category_code(u)
+    if (cat == Unicode.UTF8PROC_CATEGORY_MN ||
+        cat == Unicode.UTF8PROC_CATEGORY_MC ||
+        cat == Unicode.UTF8PROC_CATEGORY_ME)
         return true
     end
     # Additional allowed cases
@@ -226,7 +215,7 @@ end
 @inline ishex(c::Char) = isdigit(c) || ('a' <= c <= 'f') || ('A' <= c <= 'F')
 @inline isbinary(c::Char) = c == '0' || c == '1'
 @inline isoctal(c::Char) =  '0' ≤ c ≤ '7'
-@inline iswhitespace(c::Char) = (isvalid(c) && Base.isspace(c)) || c === '\ufeff'
+@inline iswhitespace(c::Char) = (isvalid(c) && Unicode.isspace(c)) || c === '\ufeff'
 
 struct StringState
     triplestr::Bool
diff --git a/src/unicode.jl b/src/unicode.jl
new file mode 100644
index 00000000..ec6f3806
--- /dev/null
+++ b/src/unicode.jl
@@ -0,0 +1,107 @@
+# this is a mirror of some of Base.Unicode and related functions,
+# but using the utf8proc_jll version tied to JuliaSyntax, so that
+# the supported Unicode version is the same across Julia versions.
+
+module Unicode
+
+export category_code, normalize_identifier, is_identifier_char, is_identifier_start_char
+
+using utf8proc_jll: libutf8proc
+
+# these constants have been stable across all utf8proc versions,
+# so no need to redefine them:
+import Base.Unicode: UTF8PROC_CATEGORY_CN, UTF8PROC_CATEGORY_LU, UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT, UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO, UTF8PROC_CATEGORY_MN, UTF8PROC_CATEGORY_MC, UTF8PROC_CATEGORY_ME, UTF8PROC_CATEGORY_ND, UTF8PROC_CATEGORY_NL, UTF8PROC_CATEGORY_NO, UTF8PROC_CATEGORY_PC, UTF8PROC_CATEGORY_PD, UTF8PROC_CATEGORY_PS, UTF8PROC_CATEGORY_PE, UTF8PROC_CATEGORY_PI, UTF8PROC_CATEGORY_PF, UTF8PROC_CATEGORY_PO, UTF8PROC_CATEGORY_SM, UTF8PROC_CATEGORY_SC, UTF8PROC_CATEGORY_SK, UTF8PROC_CATEGORY_SO, UTF8PROC_CATEGORY_ZS, UTF8PROC_CATEGORY_ZL, UTF8PROC_CATEGORY_ZP, UTF8PROC_CATEGORY_CC, UTF8PROC_CATEGORY_CF, UTF8PROC_CATEGORY_CS, UTF8PROC_CATEGORY_CO
+
+using Base: ismalformed
+
+#####################################################################
+# functions copied almost as-is from Base.Unicode, with the only change
+# being that they now ccall into utf8proc_jll.
+
+utf8proc_error(result) = error(unsafe_string(ccall((:utf8proc_errmsg,libutf8proc), Cstring, (Cssize_t,), result)))
+
+# Stateful grapheme break required by Unicode-9 rules: the string
+# must be processed in sequence, with state initialized to Ref{Int32}(0).
+# Requires utf8proc v2.0 or later.
+function isgraphemebreak!(state::Ref{Int32}, c1::AbstractChar, c2::AbstractChar)
+    if ismalformed(c1) || ismalformed(c2)
+        state[] = 0
+        return true
+    end
+    ccall((:utf8proc_grapheme_break_stateful,libutf8proc), Bool,
+          (UInt32, UInt32, Ref{Int32}), c1, c2, state)
+end
+
+# returns UTF8PROC_CATEGORY code in 0:30 giving Unicode category
+function category_code(c::AbstractChar)
+    !ismalformed(c) ? category_code(UInt32(c)) : Cint(31)
+end
+
+function category_code(x::Integer)
+    x ≤ 0x10ffff ? ccall((:utf8proc_category,libutf8proc), Cint, (UInt32,), x) : Cint(30)
+end
+
+@inline isspace(c::AbstractChar) =
+    c == ' ' || '\t' <= c <= '\r' || c == '\u85' ||
+    '\ua0' <= c && category_code(c) == UTF8PROC_CATEGORY_ZS
+
+#####################################################################
+# Julia identifier normalization, closely based on functions
+# from Base.Unicode except that we hard-code the Julia
+# chartransform (working around JuliaLang/julia#45716)
+
+# Julia's custom character normalization mapping, based on
+# julia/src/flisp/julia_charmap.h:
+function julia_custom_func(codepoint::UInt32, ::Ptr{Cvoid})::UInt32
+    (codepoint < 0x007f ? codepoint : # optimize for ASCII common case
+    codepoint == 0x025B ? 0x03B5 :  # 'ɛ' => 'ε'
+    codepoint == 0x00B5 ? 0x03BC :   # 'µ' => 'μ'
+    codepoint == 0x00B7 ? 0x22C5 :   # '·' => '⋅'
+    codepoint == 0x0387 ? 0x22C5 :   # '·' => '⋅'
+    codepoint == 0x2212 ? 0x002D :   # '−' (\minus) => '-'
+    codepoint == 0x210F ? 0x0127 :   # 'ℏ' (\hslash) => 'ħ' \hbar
+    codepoint)
+end
+
+function utf8proc_decompose_julia(str, options, buffer, nwords)
+    ret = ccall((:utf8proc_decompose_custom,libutf8proc), Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint, Ptr{Cvoid}, Ptr{Cvoid}),
+                str, sizeof(str), buffer, nwords, options,
+                @cfunction(julia_custom_func, UInt32, (UInt32, Ptr{Cvoid})), C_NULL)
+    ret < 0 && utf8proc_error(ret)
+    return ret
+end
+
+function utf8proc_map_julia(str::Union{String,SubString{String}}, options::Integer, chartransform=identity)
+    nwords = utf8proc_decompose_julia(str, options, C_NULL, 0)
+    buffer = Base.StringVector(nwords*4)
+    nwords = utf8proc_decompose_julia(str, options, buffer, nwords)
+    nbytes = ccall((:utf8proc_reencode,libutf8proc), Int, (Ptr{UInt8}, Int, Cint), buffer, nwords, options)
+    nbytes < 0 && utf8proc_error(nbytes)
+    return String(resize!(buffer, nbytes))
+end
+
+function normalize_identifier(str)
+    # note that the values of UTF8PROC_x constants have not changed
+    # over many utf8proc versions, so we can use them from Base.Unicode
+    flags = Base.Unicode.UTF8PROC_STABLE | Base.Unicode.UTF8PROC_COMPOSE
+    return isascii(str) ? str : utf8proc_map_julia(str, flags)
+end
+
+#####################################################################
+# Julia identifier parsing predicates
+
+function is_identifier_char(c::Char)
+    # c == EOF_CHAR && return false # covered by isvalid check
+    isvalid(c) || return false
+    return Base.is_id_char(c)
+end
+
+function is_identifier_start_char(c::Char)
+    # c == EOF_CHAR && return false # covered by isvalid check
+    isvalid(c) || return false
+    return Base.is_id_start_char(c)
+end
+
+#####################################################################
+
+end

From 22e3c6af7886e53a53ddbf315c31fcb9e9a3b3c4 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Sat, 4 Nov 2023 21:03:52 -0400
Subject: [PATCH 02/11] port Base.is_id_char and Base.is_id_start_char

---
 src/unicode.jl   | 102 +++++++++++++++++++++++++++++++++++++++++++----
 test/tokenize.jl |   2 -
 2 files changed, 94 insertions(+), 10 deletions(-)

diff --git a/src/unicode.jl b/src/unicode.jl
index ec6f3806..677c84e5 100644
--- a/src/unicode.jl
+++ b/src/unicode.jl
@@ -37,8 +37,11 @@ function category_code(c::AbstractChar)
     !ismalformed(c) ? category_code(UInt32(c)) : Cint(31)
 end
 
+# doesn't check validity of x:
+@inline _category_code(x::Integer) = ccall((:utf8proc_category,libutf8proc), Cint, (UInt32,), x)
+
 function category_code(x::Integer)
-    x ≤ 0x10ffff ? ccall((:utf8proc_category,libutf8proc), Cint, (UInt32,), x) : Cint(30)
+    x ≤ 0x10ffff ? _category_code(x) : Cint(30)
 end
 
 @inline isspace(c::AbstractChar) =
@@ -90,16 +93,99 @@ end
 #####################################################################
 # Julia identifier parsing predicates
 
-function is_identifier_char(c::Char)
-    # c == EOF_CHAR && return false # covered by isvalid check
-    isvalid(c) || return false
-    return Base.is_id_char(c)
+# port of is_wc_cat_id_start from julia/src/flisp/julia_extensions.c
+function _is_identifier_start_char(c::UInt32, cat::Integer)
+    return (cat == UTF8PROC_CATEGORY_LU || cat == UTF8PROC_CATEGORY_LL ||
+            cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LM ||
+            cat == UTF8PROC_CATEGORY_LO || cat == UTF8PROC_CATEGORY_NL ||
+            cat == UTF8PROC_CATEGORY_SC ||  # allow currency symbols
+            # other symbols, but not arrows or replacement characters
+            (cat == UTF8PROC_CATEGORY_SO && !(c >= 0x2190 && c <= 0x21FF) &&
+             c != 0xfffc && c != 0xfffd &&
+             c != 0x233f &&  # notslash
+             c != 0x00a6) || # broken bar
+
+            # math symbol (category Sm) whitelist
+            (c >= 0x2140 && c <= 0x2a1c &&
+             ((c >= 0x2140 && c <= 0x2144) || # ⅀, ⅁, ⅂, ⅃, ⅄
+              c == 0x223f || c == 0x22be || c == 0x22bf || # ∿, ⊾, ⊿
+              c == 0x22a4 || c == 0x22a5 ||   # ⊤ ⊥
+
+              (c >= 0x2200 && c <= 0x2233 &&
+               (c == 0x2202 || c == 0x2205 || c == 0x2206 || # ∂, ∅, ∆
+                c == 0x2207 || c == 0x220e || c == 0x220f || # ∇, ∎, ∏
+                c == 0x2200 || c == 0x2203 || c == 0x2204 || # ∀, ∃, ∄
+                c == 0x2210 || c == 0x2211 || # ∐, ∑
+                c == 0x221e || c == 0x221f || # ∞, ∟
+                c >= 0x222b)) || # ∫, ∬, ∭, ∮, ∯, ∰, ∱, ∲, ∳
+
+              (c >= 0x22c0 && c <= 0x22c3) ||  # N-ary big ops: ⋀, ⋁, ⋂, ⋃
+              (c >= 0x25F8 && c <= 0x25ff) ||  # ◸, ◹, ◺, ◻, ◼, ◽, ◾, ◿
+
+              (c >= 0x266f &&
+               (c == 0x266f || c == 0x27d8 || c == 0x27d9 || # ♯, ⟘, ⟙
+                (c >= 0x27c0 && c <= 0x27c1) ||  # ⟀, ⟁
+                (c >= 0x29b0 && c <= 0x29b4) ||  # ⦰, ⦱, ⦲, ⦳, ⦴
+                (c >= 0x2a00 && c <= 0x2a06) ||  # ⨀, ⨁, ⨂, ⨃, ⨄, ⨅, ⨆
+                (c >= 0x2a09 && c <= 0x2a16) ||  # ⨉, ⨊, ⨋, ⨌, ⨍, ⨎, ⨏, ⨐, ⨑, ⨒, ⨓, ⨔, ⨕, ⨖
+                c == 0x2a1b || c == 0x2a1c)))) || # ⨛, ⨜
+
+            (c >= 0x1d6c1 && # variants of \nabla and \partial
+             (c == 0x1d6c1 || c == 0x1d6db ||
+              c == 0x1d6fb || c == 0x1d715 ||
+              c == 0x1d735 || c == 0x1d74f ||
+              c == 0x1d76f || c == 0x1d789 ||
+              c == 0x1d7a9 || c == 0x1d7c3)) ||
+
+            # super- and subscript +-=()
+            (c >= 0x207a && c <= 0x207e) ||
+            (c >= 0x208a && c <= 0x208e) ||
+
+            # angle symbols
+            (c >= 0x2220 && c <= 0x2222) || # ∠, ∡, ∢
+            (c >= 0x299b && c <= 0x29af) || # ⦛, ⦜, ⦝, ⦞, ⦟, ⦠, ⦡, ⦢, ⦣, ⦤, ⦥, ⦦, ⦧, ⦨, ⦩, ⦪, ⦫, ⦬, ⦭, ⦮, ⦯
+
+            # Other_ID_Start
+            c == 0x2118 || c == 0x212E || # ℘, ℮
+            (c >= 0x309B && c <= 0x309C) || # katakana-hiragana sound marks
+
+            # bold-digits and double-struck digits
+            (c >= 0x1D7CE && c <= 0x1D7E1)) # 𝟎 through 𝟗 (inclusive), 𝟘 through 𝟡 (inclusive)
 end
 
+# from jl_id_start_char in julia/src/flisp/julia_extensions.c
 function is_identifier_start_char(c::Char)
-    # c == EOF_CHAR && return false # covered by isvalid check
-    isvalid(c) || return false
-    return Base.is_id_start_char(c)
+    if (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_'
+        return true
+    end
+    if c < Char(0xA1) || !isvalid(c)
+        return false
+    end
+    x = UInt32(c)
+    return _is_identifier_start_char(x, _category_code(x))
+end
+
+# from jl_id_char in julia/src/flisp/julia_extensions.c
+function is_identifier_char(c::Char)
+    if (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_' ||
+       (c >= '0' && c <= '9') || c == '!'
+        return true
+    end
+    if c < Char(0xA1) || !isvalid(c)
+        return false
+    end
+    x = UInt32(c)
+    cat = _category_code(x)
+    _is_identifier_start_char(x, cat) && return true
+    if (cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_MC ||
+        cat == UTF8PROC_CATEGORY_ND || cat == UTF8PROC_CATEGORY_PC ||
+        cat == UTF8PROC_CATEGORY_SK || cat == UTF8PROC_CATEGORY_ME ||
+        cat == UTF8PROC_CATEGORY_NO ||
+        # primes (single, double, triple, their reverses, and quadruple)
+        (x >= 0x2032 && x <= 0x2037) || (x == 0x2057))
+        return true
+    end
+    return false
 end
 
 #####################################################################
diff --git a/test/tokenize.jl b/test/tokenize.jl
index 26ab044a..afebd643 100644
--- a/test/tokenize.jl
+++ b/test/tokenize.jl
@@ -1154,5 +1154,3 @@ end
     @test strtok("a &&̄ b")   ==  ["a", " ", "&&", "̄", " ", "b", ""]
     @test strtok("a .&&₁ b") ==  ["a", " ", ".&&", "₁", " ", "b", ""]
 end
-
-end

From e34e93d289c9d37115148eff1e96245aa4264aa4 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Sat, 4 Nov 2023 21:17:51 -0400
Subject: [PATCH 03/11] optimize ascii case

---
 src/unicode.jl | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/unicode.jl b/src/unicode.jl
index 677c84e5..4b7a83dd 100644
--- a/src/unicode.jl
+++ b/src/unicode.jl
@@ -155,8 +155,9 @@ end
 
 # from jl_id_start_char in julia/src/flisp/julia_extensions.c
 function is_identifier_start_char(c::Char)
-    if (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_'
-        return true
+    if isascii(c)
+        a = c % UInt8
+        return (a >= UInt8('A') && a <= UInt8('Z')) || (a >= UInt8('a') && a <= UInt8('z')) || a == UInt8('_')
     end
     if c < Char(0xA1) || !isvalid(c)
         return false
@@ -167,9 +168,10 @@ end
 
 # from jl_id_char in julia/src/flisp/julia_extensions.c
 function is_identifier_char(c::Char)
-    if (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_' ||
-       (c >= '0' && c <= '9') || c == '!'
-        return true
+    if isascii(c)
+        a = c % UInt8
+        return (a >= UInt8('A') && a <= UInt8('Z')) || (a >= UInt8('a') && a <= UInt8('z')) ||
+               a == UInt8('_') || (a >= UInt8('0') && a <= UInt8('9')) || a == UInt8('!')
     end
     if c < Char(0xA1) || !isvalid(c)
         return false

From 2676e1ad5be5a1f2efe93520d2997fec59aba943 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Sat, 4 Nov 2023 21:20:11 -0400
Subject: [PATCH 04/11] revert accidental change to test

---
 test/tokenize.jl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/tokenize.jl b/test/tokenize.jl
index afebd643..26ab044a 100644
--- a/test/tokenize.jl
+++ b/test/tokenize.jl
@@ -1154,3 +1154,5 @@ end
     @test strtok("a &&̄ b")   ==  ["a", " ", "&&", "̄", " ", "b", ""]
     @test strtok("a .&&₁ b") ==  ["a", " ", ".&&", "₁", " ", "b", ""]
 end
+
+end

From c6101d097de39cb5a0dbbbca3c2d39f5aee951c8 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Sat, 4 Nov 2023 21:22:30 -0400
Subject: [PATCH 05/11] =?UTF-8?q?rm=20fixme=20for=20=F0=9F=8F=B3=EF=B8=8F?=
 =?UTF-8?q?=E2=80=8D=F0=9F=8C=88=20=20test?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 test/tokenize.jl | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/test/tokenize.jl b/test/tokenize.jl
index 26ab044a..52169562 100644
--- a/test/tokenize.jl
+++ b/test/tokenize.jl
@@ -44,8 +44,7 @@ end
 end # testset
 
 @testset "tokenize unicode" begin
-    # FIXME: rm VERSION check once we implement our own is_identifier_char
-    emoji = VERSION < v"1.5" ? "😄" : "\U1F3F3\UFE0F\U200D\U1F308" # 🏳️‍🌈 requires newer Unicode
+    emoji = "\U1F3F3\UFE0F\U200D\U1F308" # == "🏳️‍🌈"
     str = "𝘋 =2"*emoji
     for s in [str, IOBuffer(str)]
         l = tokenize(s)

From 5a69b0fa965feaef7cbab2fa073d128c418b91b8 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Sat, 4 Nov 2023 21:25:10 -0400
Subject: [PATCH 06/11] rm obsolete arg

---
 src/unicode.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/unicode.jl b/src/unicode.jl
index 4b7a83dd..64982f77 100644
--- a/src/unicode.jl
+++ b/src/unicode.jl
@@ -74,7 +74,7 @@ function utf8proc_decompose_julia(str, options, buffer, nwords)
     return ret
 end
 
-function utf8proc_map_julia(str::Union{String,SubString{String}}, options::Integer, chartransform=identity)
+function utf8proc_map_julia(str::Union{String,SubString{String}}, options::Integer)
     nwords = utf8proc_decompose_julia(str, options, C_NULL, 0)
     buffer = Base.StringVector(nwords*4)
     nwords = utf8proc_decompose_julia(str, options, buffer, nwords)

From 7fcbdd038b642428bb25b1cc51473b7907543c6e Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Sat, 4 Nov 2023 21:58:07 -0400
Subject: [PATCH 07/11] slight consolidation/optimization of common isascii(c),
 c % UInt8 codepath

---
 src/tokenize.jl |  4 ++--
 src/unicode.jl  | 16 ++++++++++++----
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/tokenize.jl b/src/tokenize.jl
index 7db1366d..c8d36bed 100644
--- a/src/tokenize.jl
+++ b/src/tokenize.jl
@@ -1282,9 +1282,9 @@ function lex_identifier(l::Lexer, c)
     graphemestate_peek = Ref(zero(Int32))
     while true
         pc, ppc = dpeekchar(l)
-        ascii = ascii && isascii(pc)
+        pc_byte = Unicode.ascii_byte(pc)
+        ascii = ascii && pc_byte != 0xff
         if ascii # fast path
-            pc_byte = pc % UInt8
             @inbounds if (pc_byte == UInt8('!') && ppc == '=') || !ascii_is_identifier_char[pc_byte+1]
                 break
             end
diff --git a/src/unicode.jl b/src/unicode.jl
index 64982f77..e14b2109 100644
--- a/src/unicode.jl
+++ b/src/unicode.jl
@@ -153,10 +153,18 @@ function _is_identifier_start_char(c::UInt32, cat::Integer)
             (c >= 0x1D7CE && c <= 0x1D7E1)) # 𝟎 through 𝟗 (inclusive), 𝟘 through 𝟡 (inclusive)
 end
 
+# utility function to return the ASCII byte if isascii(c),
+# and otherwise (for ASCII or invalid chars) return 0xff,
+# based on the isascii source code.
+@inline function ascii_byte(c::Char)
+    x = bswap(reinterpret(UInt32, c))
+    return x < 0x80 ? x % UInt8 : 0xff
+end
+
 # from jl_id_start_char in julia/src/flisp/julia_extensions.c
 function is_identifier_start_char(c::Char)
-    if isascii(c)
-        a = c % UInt8
+    a = ascii_byte(c)
+    if a != 0xff
         return (a >= UInt8('A') && a <= UInt8('Z')) || (a >= UInt8('a') && a <= UInt8('z')) || a == UInt8('_')
     end
     if c < Char(0xA1) || !isvalid(c)
@@ -168,8 +176,8 @@ end
 
 # from jl_id_char in julia/src/flisp/julia_extensions.c
 function is_identifier_char(c::Char)
-    if isascii(c)
-        a = c % UInt8
+    a = ascii_byte(c)
+    if a != 0xff
         return (a >= UInt8('A') && a <= UInt8('Z')) || (a >= UInt8('a') && a <= UInt8('z')) ||
                a == UInt8('_') || (a >= UInt8('0') && a <= UInt8('9')) || a == UInt8('!')
     end

From 3ebeaf89d6c526bde509f72e0feb36080433e11c Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Sat, 4 Nov 2023 22:02:26 -0400
Subject: [PATCH 08/11] cleanup using u8_str macro

---
 src/unicode.jl | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/unicode.jl b/src/unicode.jl
index e14b2109..97cc280b 100644
--- a/src/unicode.jl
+++ b/src/unicode.jl
@@ -8,6 +8,8 @@ export category_code, normalize_identifier, is_identifier_char, is_identifier_st
 
 using utf8proc_jll: libutf8proc
 
+import ..JuliaSyntax: @u8_str
+
 # these constants have been stable across all utf8proc versions,
 # so no need to redefine them:
 import Base.Unicode: UTF8PROC_CATEGORY_CN, UTF8PROC_CATEGORY_LU, UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT, UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO, UTF8PROC_CATEGORY_MN, UTF8PROC_CATEGORY_MC, UTF8PROC_CATEGORY_ME, UTF8PROC_CATEGORY_ND, UTF8PROC_CATEGORY_NL, UTF8PROC_CATEGORY_NO, UTF8PROC_CATEGORY_PC, UTF8PROC_CATEGORY_PD, UTF8PROC_CATEGORY_PS, UTF8PROC_CATEGORY_PE, UTF8PROC_CATEGORY_PI, UTF8PROC_CATEGORY_PF, UTF8PROC_CATEGORY_PO, UTF8PROC_CATEGORY_SM, UTF8PROC_CATEGORY_SC, UTF8PROC_CATEGORY_SK, UTF8PROC_CATEGORY_SO, UTF8PROC_CATEGORY_ZS, UTF8PROC_CATEGORY_ZL, UTF8PROC_CATEGORY_ZP, UTF8PROC_CATEGORY_CC, UTF8PROC_CATEGORY_CF, UTF8PROC_CATEGORY_CS, UTF8PROC_CATEGORY_CO
@@ -165,7 +167,7 @@ end
 function is_identifier_start_char(c::Char)
     a = ascii_byte(c)
     if a != 0xff
-        return (a >= UInt8('A') && a <= UInt8('Z')) || (a >= UInt8('a') && a <= UInt8('z')) || a == UInt8('_')
+        return (a >= u8"A" && a <= u8"Z") || (a >= u8"a" && a <= u8"z") || a == u8"_"
     end
     if c < Char(0xA1) || !isvalid(c)
         return false
@@ -178,8 +180,8 @@ end
 function is_identifier_char(c::Char)
     a = ascii_byte(c)
     if a != 0xff
-        return (a >= UInt8('A') && a <= UInt8('Z')) || (a >= UInt8('a') && a <= UInt8('z')) ||
-               a == UInt8('_') || (a >= UInt8('0') && a <= UInt8('9')) || a == UInt8('!')
+        return (a >= u8"A" && a <= u8"Z") || (a >= u8"a" && a <= u8"z") ||
+               a == u8"_" || (a >= u8"0" && a <= u8"9") || a == u8"!"
     end
     if c < Char(0xA1) || !isvalid(c)
         return false

From 8f55073fefc7b400a0789502bae049a2c47c3c69 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Sat, 4 Nov 2023 22:09:09 -0400
Subject: [PATCH 09/11] pin utf8proc_jll to a specific minor version

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 3dc96fae..05b01bcf 100644
--- a/Project.toml
+++ b/Project.toml
@@ -8,7 +8,7 @@ utf8proc_jll = "00992c89-a35c-5347-9984-e6609dacc59a"
 
 [compat]
 julia = "1.0"
-utf8proc_jll = "2.9" # for Unicode 15.1
+utf8proc_jll = "~2.9" # = 2.9.x for Unicode 15.1
 
 [extras]
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"

From 1506668d001aa263cf6b2c39c6ec8708c8ee093e Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Sat, 4 Nov 2023 22:11:34 -0400
Subject: [PATCH 10/11] clarify comments

---
 src/unicode.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/unicode.jl b/src/unicode.jl
index 97cc280b..8edaacbd 100644
--- a/src/unicode.jl
+++ b/src/unicode.jl
@@ -156,7 +156,7 @@ function _is_identifier_start_char(c::UInt32, cat::Integer)
 end
 
 # utility function to return the ASCII byte if isascii(c),
-# and otherwise (for ASCII or invalid chars) return 0xff,
+# and otherwise (for non-ASCII or invalid chars) return 0xff,
 # based on the isascii source code.
 @inline function ascii_byte(c::Char)
     x = bswap(reinterpret(UInt32, c))
@@ -166,7 +166,7 @@ end
 # from jl_id_start_char in julia/src/flisp/julia_extensions.c
 function is_identifier_start_char(c::Char)
     a = ascii_byte(c)
-    if a != 0xff
+    if a != 0xff # ascii fast path
         return (a >= u8"A" && a <= u8"Z") || (a >= u8"a" && a <= u8"z") || a == u8"_"
     end
     if c < Char(0xA1) || !isvalid(c)
@@ -179,7 +179,7 @@ end
 # from jl_id_char in julia/src/flisp/julia_extensions.c
 function is_identifier_char(c::Char)
     a = ascii_byte(c)
-    if a != 0xff
+    if a != 0xff # ascii fast path
         return (a >= u8"A" && a <= u8"Z") || (a >= u8"a" && a <= u8"z") ||
                a == u8"_" || (a >= u8"0" && a <= u8"9") || a == u8"!"
     end

From c820724ceea2da1e4d31ea916e1939e75359fbe9 Mon Sep 17 00:00:00 2001
From: Claire Foster <aka.c42f@gmail.com>
Date: Tue, 23 Jan 2024 16:03:13 +1000
Subject: [PATCH 11/11] Use UnicodeNext for unicode functionality

Here I've put tokenization-related functionality like
`is_identifier_start_char()` in the Tokenize module.
---
 Project.toml           |   3 +-
 src/JuliaSyntax.jl     |   3 +-
 src/literal_parsing.jl |  19 +++-
 src/source_files.jl    |   2 +-
 src/tokenize.jl        | 162 +++++++++++++++++++++++++++-----
 src/unicode.jl         | 205 -----------------------------------------
 6 files changed, 160 insertions(+), 234 deletions(-)
 delete mode 100644 src/unicode.jl

diff --git a/Project.toml b/Project.toml
index 05b01bcf..bed57ee1 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,11 +4,10 @@ authors = ["Claire Foster <aka.c42f@gmail.com> and contributors"]
 version = "0.4.6"
 
 [deps]
-utf8proc_jll = "00992c89-a35c-5347-9984-e6609dacc59a"
+UnicodeNext = "7b9d9d2f-29eb-4111-b31d-f1cfc33d1412"
 
 [compat]
 julia = "1.0"
-utf8proc_jll = "~2.9" # = 2.9.x for Unicode 15.1
 
 [extras]
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
diff --git a/src/JuliaSyntax.jl b/src/JuliaSyntax.jl
index 8a8f6140..bd1fe5e0 100644
--- a/src/JuliaSyntax.jl
+++ b/src/JuliaSyntax.jl
@@ -1,5 +1,7 @@
 module JuliaSyntax
 
+using UnicodeNext
+
 # Conservative list of exports - only export the most common/useful things
 # here.
 
@@ -20,7 +22,6 @@ export SyntaxNode
 
 # Helper utilities
 include("utils.jl")
-include("unicode.jl")
 
 include("kinds.jl")
 
diff --git a/src/literal_parsing.jl b/src/literal_parsing.jl
index 81ab5b32..3277380e 100644
--- a/src/literal_parsing.jl
+++ b/src/literal_parsing.jl
@@ -331,7 +331,24 @@ end
 #-------------------------------------------------------------------------------
 # Unicode normalization.
 
-using .Unicode: normalize_identifier
+function normalize_identifier(c::Char) # per-character mapping; the string method passes this function as `chartransform`
+    if c <= '~' # '~' is U+007E; every character remapped below lies above it
+        return c # ASCII common case
+    end
+    return c == '\u025B' ? '\u03B5' : # 'ɛ' (Latin open e) => 'ε' (Greek epsilon)
+           c == '\u00B5' ? '\u03BC' : # 'µ' (micro sign) => 'μ' (Greek mu)
+           c == '\u00B7' ? '\u22C5' : # '·' (U+00B7 middle dot) => '⋅' (dot operator)
+           c == '\u0387' ? '\u22C5' : # '·' (U+0387 Greek ano teleia) => '⋅' (dot operator)
+           c == '\u2212' ? '\u002D' : # '−' (\minus) => '-' (ASCII hyphen-minus)
+           c == '\u210F' ? '\u0127' : # 'ℏ' (\hslash) => 'ħ' \hbar
+           c # any other character is returned unchanged
+end
+
+function normalize_identifier(str::AbstractString) # normalize a whole identifier string
+    isascii(str) ? str : # pure-ASCII input is returned as-is, skipping the normalize call
+        UnicodeNext.normalize(str, stable=true, compose=true, # NOTE(review): Base's Unicode.normalize calls chartransform
+                              chartransform=normalize_identifier) # with Integer code points; confirm UnicodeNext passes Char
+end
 
 #-------------------------------------------------------------------------------
 function parse_julia_literal(txtbuf::Vector{UInt8}, head::SyntaxHead, srcrange)
diff --git a/src/source_files.jl b/src/source_files.jl
index a0871347..5b37c395 100644
--- a/src/source_files.jl
+++ b/src/source_files.jl
@@ -145,7 +145,7 @@ function _print_marker_line(io, prefix_str, str, underline, singleline, color,
     # Getting exactly the same width of whitespace as `str` is tricky.
     # Especially for mixtures of tabs and spaces.
     # tabs are zero width according to textwidth
-    indent = join(Unicode.isspace(c) ? c : repeat(' ', textwidth(c)) for c in prefix_str)
+    indent = join(UnicodeNext.isspace(c) ? c : repeat(' ', textwidth(c)) for c in prefix_str)
 
     # Assume tabs are 4 wide rather than 0. (fixme: implement tab alignment?)
     w = textwidth(str) + 4*count(c->c=='\t', str)
diff --git a/src/tokenize.jl b/src/tokenize.jl
index c8d36bed..37445a34 100644
--- a/src/tokenize.jl
+++ b/src/tokenize.jl
@@ -2,9 +2,9 @@ module Tokenize
 
 export tokenize, untokenize
 
-using ..JuliaSyntax: JuliaSyntax, Kind, @K_str, @KSet_str
+using ..JuliaSyntax: JuliaSyntax, Kind, @K_str, @KSet_str, @u8_str
 
-import ..JuliaSyntax: kind, Unicode,
+import ..JuliaSyntax: kind,
     is_literal, is_error, is_contextual_keyword, is_word_operator
 
 #-------------------------------------------------------------------------------
@@ -12,7 +12,119 @@ import ..JuliaSyntax: kind, Unicode,
 
 const EOF_CHAR = typemax(Char)
 
-using .Unicode: is_identifier_char, is_identifier_start_char
+# Julia identifier parsing predicates
+
+using UnicodeNext
+
+import UnicodeNext: CATEGORY_CS, CATEGORY_LL, CATEGORY_LM, CATEGORY_LO,
+    CATEGORY_LT, CATEGORY_LU, CATEGORY_MC, CATEGORY_ME, CATEGORY_MN,
+    CATEGORY_ND, CATEGORY_NL, CATEGORY_NO, CATEGORY_PC, CATEGORY_PD,
+    CATEGORY_PO, CATEGORY_SC, CATEGORY_SK, CATEGORY_SO, CATEGORY_ZS
+
+# port of is_wc_cat_id_start from julia/src/flisp/julia_extensions.c (c: code point, cat: its category code)
+function _is_identifier_start_char(c::UInt32, cat::Integer)
+    return (cat == CATEGORY_LU || cat == CATEGORY_LL || # letter categories, plus NL below
+            cat == CATEGORY_LT || cat == CATEGORY_LM ||
+            cat == CATEGORY_LO || cat == CATEGORY_NL ||
+            cat == CATEGORY_SC ||  # allow currency symbols
+            # other symbols, but not arrows or replacement characters
+            (cat == CATEGORY_SO && !(c >= 0x2190 && c <= 0x21FF) &&
+             c != 0xfffc && c != 0xfffd &&
+             c != 0x233f &&  # notslash
+             c != 0x00a6) || # broken bar
+
+            # math symbol (category Sm) whitelist; the outer range test bounds every case listed inside
+            (c >= 0x2140 && c <= 0x2a1c &&
+             ((c >= 0x2140 && c <= 0x2144) || # ⅀, ⅁, ⅂, ⅃, ⅄
+              c == 0x223f || c == 0x22be || c == 0x22bf || # ∿, ⊾, ⊿
+              c == 0x22a4 || c == 0x22a5 ||   # ⊤ ⊥
+
+              (c >= 0x2200 && c <= 0x2233 &&
+               (c == 0x2202 || c == 0x2205 || c == 0x2206 || # ∂, ∅, ∆
+                c == 0x2207 || c == 0x220e || c == 0x220f || # ∇, ∎, ∏
+                c == 0x2200 || c == 0x2203 || c == 0x2204 || # ∀, ∃, ∄
+                c == 0x2210 || c == 0x2211 || # ∐, ∑
+                c == 0x221e || c == 0x221f || # ∞, ∟
+                c >= 0x222b)) || # ∫, ∬, ∭, ∮, ∯, ∰, ∱, ∲, ∳
+
+              (c >= 0x22c0 && c <= 0x22c3) ||  # N-ary big ops: ⋀, ⋁, ⋂, ⋃
+              (c >= 0x25F8 && c <= 0x25ff) ||  # ◸, ◹, ◺, ◻, ◼, ◽, ◾, ◿
+
+              (c >= 0x266f && # remaining whitelist entries, all at or above ♯ (U+266F)
+               (c == 0x266f || c == 0x27d8 || c == 0x27d9 || # ♯, ⟘, ⟙
+                (c >= 0x27c0 && c <= 0x27c1) ||  # ⟀, ⟁
+                (c >= 0x29b0 && c <= 0x29b4) ||  # ⦰, ⦱, ⦲, ⦳, ⦴
+                (c >= 0x2a00 && c <= 0x2a06) ||  # ⨀, ⨁, ⨂, ⨃, ⨄, ⨅, ⨆
+                (c >= 0x2a09 && c <= 0x2a16) ||  # ⨉, ⨊, ⨋, ⨌, ⨍, ⨎, ⨏, ⨐, ⨑, ⨒, ⨓, ⨔, ⨕, ⨖
+                c == 0x2a1b || c == 0x2a1c)))) || # ⨛, ⨜
+
+            (c >= 0x1d6c1 && # variants of \nabla and \partial
+             (c == 0x1d6c1 || c == 0x1d6db ||
+              c == 0x1d6fb || c == 0x1d715 ||
+              c == 0x1d735 || c == 0x1d74f ||
+              c == 0x1d76f || c == 0x1d789 ||
+              c == 0x1d7a9 || c == 0x1d7c3)) ||
+
+            # super- and subscript +-=()
+            (c >= 0x207a && c <= 0x207e) ||
+            (c >= 0x208a && c <= 0x208e) ||
+
+            # angle symbols
+            (c >= 0x2220 && c <= 0x2222) || # ∠, ∡, ∢
+            (c >= 0x299b && c <= 0x29af) || # ⦛, ⦜, ⦝, ⦞, ⦟, ⦠, ⦡, ⦢, ⦣, ⦤, ⦥, ⦦, ⦧, ⦨, ⦩, ⦪, ⦫, ⦬, ⦭, ⦮, ⦯
+
+            # Other_ID_Start
+            c == 0x2118 || c == 0x212E || # ℘, ℮
+            (c >= 0x309B && c <= 0x309C) || # katakana-hiragana sound marks
+
+            # bold-digits and double-struck digits
+            (c >= 0x1D7CE && c <= 0x1D7E1)) # 𝟎 through 𝟗 (inclusive), 𝟘 through 𝟡 (inclusive)
+end
+
+# utility function to return the ASCII byte if isascii(c),
+# and otherwise (for non-ASCII or invalid chars) return 0xff,
+# based on the isascii source code.
+@inline function _ascii_byte(c::Char)
+    x = bswap(reinterpret(UInt32, c)) # raw Char bits, byte-swapped so the first encoded byte is the low byte
+    return x < 0x80 ? x % UInt8 : 0xff # 0xff is the "not ASCII" sentinel; real ASCII bytes are all < 0x80
+end
+
+# from jl_id_start_char in julia/src/flisp/julia_extensions.c
+function is_identifier_start_char(c::Char)
+    a = _ascii_byte(c)
+    if a != 0xff # ascii fast path: only letters and '_' may start an identifier
+        return (a >= u8"A" && a <= u8"Z") || (a >= u8"a" && a <= u8"z") || a == u8"_"
+    end
+    if c < Char(0xA1) || !isvalid(c) # non-ASCII below U+00A1, or an invalid char: never allowed
+        return false
+    end
+    x = UInt32(c) # c passed isvalid above, so it has a code point
+    return _is_identifier_start_char(x, UnicodeNext.category_code(x))
+end
+
+# from jl_id_char in julia/src/flisp/julia_extensions.c
+function is_identifier_char(c::Char)
+    a = _ascii_byte(c)
+    if a != 0xff # ascii fast path: letters, digits, '_' and '!'
+        return (a >= u8"A" && a <= u8"Z") || (a >= u8"a" && a <= u8"z") ||
+                a == u8"_" || (a >= u8"0" && a <= u8"9") || a == u8"!"
+    end
+    if c < Char(0xA1) || !isvalid(c) # non-ASCII below U+00A1, or an invalid char: never allowed
+        return false
+    end
+    x = UInt32(c) # c passed isvalid above, so it has a code point
+    cat = UnicodeNext.category_code(x)
+    _is_identifier_start_char(x, cat) && return true # any valid start char is also a valid continuation
+    if (cat == CATEGORY_MN || cat == CATEGORY_MC || # extra categories accepted only after the first char
+        cat == CATEGORY_ND || cat == CATEGORY_PC ||
+        cat == CATEGORY_SK || cat == CATEGORY_ME ||
+        cat == CATEGORY_NO ||
+        # primes (single, double, triple, their reverses, and quadruple)
+        (x >= 0x2032 && x <= 0x2037) || (x == 0x2057))
+        return true
+    end
+    return false
+end
 
 function is_invisible_char(c::Char)
     # These are the chars considered invisible by the reference parser.
@@ -33,15 +145,15 @@ end
 # Chars that we will never allow to be part of a valid non-operator identifier
 function is_never_id_char(ch::Char)
     isvalid(ch) || return true
-    cat = Unicode.category_code(ch)
+    cat = UnicodeNext.category_code(ch)
     c = UInt32(ch)
     return (
         # spaces and control characters:
-        (cat >= Unicode.UTF8PROC_CATEGORY_ZS && cat <= Unicode.UTF8PROC_CATEGORY_CS) ||
+        (cat >= CATEGORY_ZS && cat <= CATEGORY_CS) ||
 
         # ASCII and Latin1 non-connector punctuation
         (c < 0xff &&
-         cat >= Unicode.UTF8PROC_CATEGORY_PD && cat <= Unicode.UTF8PROC_CATEGORY_PO) ||
+         cat >= CATEGORY_PD && cat <= CATEGORY_PO) ||
 
         c == UInt32('`') ||
 
@@ -61,7 +173,7 @@ end
 readchar(io::IO) = eof(io) ? EOF_CHAR : read(io, Char)
 
 # Some unicode operators are normalized by the tokenizer into their equivalent
-# kinds. See also Unicode.normalize_identifier()
+# kinds. See also normalize_identifier()
 const _ops_with_unicode_aliases = [
     # \minus '−' is normalized into K"-",
     '−' => K"-"
@@ -126,10 +238,10 @@ end
     if (u < 0xa1 || u > 0x10ffff)
         return false
     end
-    cat = Unicode.category_code(u)
-    if (cat == Unicode.UTF8PROC_CATEGORY_MN ||
-        cat == Unicode.UTF8PROC_CATEGORY_MC ||
-        cat == Unicode.UTF8PROC_CATEGORY_ME)
+    cat = UnicodeNext.category_code(u)
+    if (cat == CATEGORY_MN ||
+        cat == CATEGORY_MC ||
+        cat == CATEGORY_ME)
         return true
     end
     # Additional allowed cases
@@ -215,7 +327,7 @@ end
 @inline ishex(c::Char) = isdigit(c) || ('a' <= c <= 'f') || ('A' <= c <= 'F')
 @inline isbinary(c::Char) = c == '0' || c == '1'
 @inline isoctal(c::Char) =  '0' ≤ c ≤ '7'
-@inline iswhitespace(c::Char) = (isvalid(c) && Unicode.isspace(c)) || c === '\ufeff'
+@inline iswhitespace(c::Char) = (isvalid(c) && UnicodeNext.isspace(c)) || c === '\ufeff' # U+FEFF (byte order mark) also counts as whitespace
 
 struct StringState
     triplestr::Bool
@@ -1278,25 +1390,27 @@ function lex_identifier(l::Lexer, c)
     h = simple_hash(c, UInt64(0))
     n = 1
     ascii = isascii(c)
-    graphemestate = Ref(Int32(ascii)) # all ASCII id chars are UTF8PROC_BOUNDCLASS_OTHER
-    graphemestate_peek = Ref(zero(Int32))
+    graphemestate = UnicodeNext.GraphemeState(c)
     while true
         pc, ppc = dpeekchar(l)
-        pc_byte = Unicode.ascii_byte(pc)
+        pc_byte = _ascii_byte(pc)
         ascii = ascii && pc_byte != 0xff
         if ascii # fast path
             @inbounds if (pc_byte == UInt8('!') && ppc == '=') || !ascii_is_identifier_char[pc_byte+1]
                 break
             end
-        elseif Unicode.isgraphemebreak!(graphemestate, c, pc)
-            if (pc == '!' && ppc == '=') || !is_identifier_char(pc)
-                break
-            end
-        elseif pc in ('\u200c','\u200d') # ZWNJ/ZWJ control characters
-            # ZWJ/ZWNJ only within grapheme sequences, not at end
-            graphemestate_peek[] = graphemestate[]
-            if Unicode.isgraphemebreak!(graphemestate_peek, pc, ppc)
-                break
+        else
+            graphemestate, isbreak = UnicodeNext.isgraphemebreak(graphemestate, pc)
+            if isbreak
+                if ((pc == '!' && ppc == '=') || !is_identifier_char(pc))
+                    break
+                end
+            elseif pc in ('\u200c','\u200d') # ZWNJ/ZWJ control characters
+                # ZWJ/ZWNJ only within grapheme sequences, not at end
+                _, isbreak_peek = UnicodeNext.isgraphemebreak(graphemestate, ppc)
+                if isbreak_peek
+                    break
+                end
             end
         end
         c = readchar(l)
diff --git a/src/unicode.jl b/src/unicode.jl
deleted file mode 100644
index 8edaacbd..00000000
--- a/src/unicode.jl
+++ /dev/null
@@ -1,205 +0,0 @@
-# this is a mirror of some of Base.Unicode and related functions,
-# but using the utf8proc_jll version tied to JuliaSyntax, so that
-# the supported Unicode version is the same across Julia versions.
-
-module Unicode
-
-export category_code, normalize_identifier, is_identifier_char, is_identifier_start_char
-
-using utf8proc_jll: libutf8proc
-
-import ..JuliaSyntax: @u8_str
-
-# these constants have been stable across all utf8proc versions,
-# so no need to redefine them:
-import Base.Unicode: UTF8PROC_CATEGORY_CN, UTF8PROC_CATEGORY_LU, UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT, UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO, UTF8PROC_CATEGORY_MN, UTF8PROC_CATEGORY_MC, UTF8PROC_CATEGORY_ME, UTF8PROC_CATEGORY_ND, UTF8PROC_CATEGORY_NL, UTF8PROC_CATEGORY_NO, UTF8PROC_CATEGORY_PC, UTF8PROC_CATEGORY_PD, UTF8PROC_CATEGORY_PS, UTF8PROC_CATEGORY_PE, UTF8PROC_CATEGORY_PI, UTF8PROC_CATEGORY_PF, UTF8PROC_CATEGORY_PO, UTF8PROC_CATEGORY_SM, UTF8PROC_CATEGORY_SC, UTF8PROC_CATEGORY_SK, UTF8PROC_CATEGORY_SO, UTF8PROC_CATEGORY_ZS, UTF8PROC_CATEGORY_ZL, UTF8PROC_CATEGORY_ZP, UTF8PROC_CATEGORY_CC, UTF8PROC_CATEGORY_CF, UTF8PROC_CATEGORY_CS, UTF8PROC_CATEGORY_CO
-
-using Base: ismalformed
-
-#####################################################################
-# functions copied almost as-is from Base.Unicode, with the only change
-# being that they now ccall into utf8proc_jll.
-
-utf8proc_error(result) = error(unsafe_string(ccall((:utf8proc_errmsg,libutf8proc), Cstring, (Cssize_t,), result)))
-
-# Stateful grapheme break required by Unicode-9 rules: the string
-# must be processed in sequence, with state initialized to Ref{Int32}(0).
-# Requires utf8proc v2.0 or later.
-function isgraphemebreak!(state::Ref{Int32}, c1::AbstractChar, c2::AbstractChar)
-    if ismalformed(c1) || ismalformed(c2)
-        state[] = 0
-        return true
-    end
-    ccall((:utf8proc_grapheme_break_stateful,libutf8proc), Bool,
-          (UInt32, UInt32, Ref{Int32}), c1, c2, state)
-end
-
-# returns UTF8PROC_CATEGORY code in 0:30 giving Unicode category
-function category_code(c::AbstractChar)
-    !ismalformed(c) ? category_code(UInt32(c)) : Cint(31)
-end
-
-# doesn't check validity of x:
-@inline _category_code(x::Integer) = ccall((:utf8proc_category,libutf8proc), Cint, (UInt32,), x)
-
-function category_code(x::Integer)
-    x ≤ 0x10ffff ? _category_code(x) : Cint(30)
-end
-
-@inline isspace(c::AbstractChar) =
-    c == ' ' || '\t' <= c <= '\r' || c == '\u85' ||
-    '\ua0' <= c && category_code(c) == UTF8PROC_CATEGORY_ZS
-
-#####################################################################
-# Julia identifier normalization, closely based on functions
-# from Base.Unicode except that we hard-code the Julia
-# chartransform (working around JuliaLang/julia#45716)
-
-# Julia's custom character normalization mapping, based on
-# julia/src/flisp/julia_charmap.h:
-function julia_custom_func(codepoint::UInt32, ::Ptr{Cvoid})::UInt32
-    (codepoint < 0x007f ? codepoint : # optimize for ASCII common case
-    codepoint == 0x025B ? 0x03B5 :  # 'ɛ' => 'ε'
-    codepoint == 0x00B5 ? 0x03BC :   # 'µ' => 'μ'
-    codepoint == 0x00B7 ? 0x22C5 :   # '·' => '⋅'
-    codepoint == 0x0387 ? 0x22C5 :   # '·' => '⋅'
-    codepoint == 0x2212 ? 0x002D :   # '−' (\minus) => '-'
-    codepoint == 0x210F ? 0x0127 :   # 'ℏ' (\hslash) => 'ħ' \hbar
-    codepoint)
-end
-
-function utf8proc_decompose_julia(str, options, buffer, nwords)
-    ret = ccall((:utf8proc_decompose_custom,libutf8proc), Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint, Ptr{Cvoid}, Ptr{Cvoid}),
-                str, sizeof(str), buffer, nwords, options,
-                @cfunction(julia_custom_func, UInt32, (UInt32, Ptr{Cvoid})), C_NULL)
-    ret < 0 && utf8proc_error(ret)
-    return ret
-end
-
-function utf8proc_map_julia(str::Union{String,SubString{String}}, options::Integer)
-    nwords = utf8proc_decompose_julia(str, options, C_NULL, 0)
-    buffer = Base.StringVector(nwords*4)
-    nwords = utf8proc_decompose_julia(str, options, buffer, nwords)
-    nbytes = ccall((:utf8proc_reencode,libutf8proc), Int, (Ptr{UInt8}, Int, Cint), buffer, nwords, options)
-    nbytes < 0 && utf8proc_error(nbytes)
-    return String(resize!(buffer, nbytes))
-end
-
-function normalize_identifier(str)
-    # note that the values of UTF8PROC_x constants have not changed
-    # over many utf8proc versions, so we can use them from Base.Unicode
-    flags = Base.Unicode.UTF8PROC_STABLE | Base.Unicode.UTF8PROC_COMPOSE
-    return isascii(str) ? str : utf8proc_map_julia(str, flags)
-end
-
-#####################################################################
-# Julia identifier parsing predicates
-
-# port of is_wc_cat_id_start from julia/src/flisp/julia_extensions.c
-function _is_identifier_start_char(c::UInt32, cat::Integer)
-    return (cat == UTF8PROC_CATEGORY_LU || cat == UTF8PROC_CATEGORY_LL ||
-            cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LM ||
-            cat == UTF8PROC_CATEGORY_LO || cat == UTF8PROC_CATEGORY_NL ||
-            cat == UTF8PROC_CATEGORY_SC ||  # allow currency symbols
-            # other symbols, but not arrows or replacement characters
-            (cat == UTF8PROC_CATEGORY_SO && !(c >= 0x2190 && c <= 0x21FF) &&
-             c != 0xfffc && c != 0xfffd &&
-             c != 0x233f &&  # notslash
-             c != 0x00a6) || # broken bar
-
-            # math symbol (category Sm) whitelist
-            (c >= 0x2140 && c <= 0x2a1c &&
-             ((c >= 0x2140 && c <= 0x2144) || # ⅀, ⅁, ⅂, ⅃, ⅄
-              c == 0x223f || c == 0x22be || c == 0x22bf || # ∿, ⊾, ⊿
-              c == 0x22a4 || c == 0x22a5 ||   # ⊤ ⊥
-
-              (c >= 0x2200 && c <= 0x2233 &&
-               (c == 0x2202 || c == 0x2205 || c == 0x2206 || # ∂, ∅, ∆
-                c == 0x2207 || c == 0x220e || c == 0x220f || # ∇, ∎, ∏
-                c == 0x2200 || c == 0x2203 || c == 0x2204 || # ∀, ∃, ∄
-                c == 0x2210 || c == 0x2211 || # ∐, ∑
-                c == 0x221e || c == 0x221f || # ∞, ∟
-                c >= 0x222b)) || # ∫, ∬, ∭, ∮, ∯, ∰, ∱, ∲, ∳
-
-              (c >= 0x22c0 && c <= 0x22c3) ||  # N-ary big ops: ⋀, ⋁, ⋂, ⋃
-              (c >= 0x25F8 && c <= 0x25ff) ||  # ◸, ◹, ◺, ◻, ◼, ◽, ◾, ◿
-
-              (c >= 0x266f &&
-               (c == 0x266f || c == 0x27d8 || c == 0x27d9 || # ♯, ⟘, ⟙
-                (c >= 0x27c0 && c <= 0x27c1) ||  # ⟀, ⟁
-                (c >= 0x29b0 && c <= 0x29b4) ||  # ⦰, ⦱, ⦲, ⦳, ⦴
-                (c >= 0x2a00 && c <= 0x2a06) ||  # ⨀, ⨁, ⨂, ⨃, ⨄, ⨅, ⨆
-                (c >= 0x2a09 && c <= 0x2a16) ||  # ⨉, ⨊, ⨋, ⨌, ⨍, ⨎, ⨏, ⨐, ⨑, ⨒, ⨓, ⨔, ⨕, ⨖
-                c == 0x2a1b || c == 0x2a1c)))) || # ⨛, ⨜
-
-            (c >= 0x1d6c1 && # variants of \nabla and \partial
-             (c == 0x1d6c1 || c == 0x1d6db ||
-              c == 0x1d6fb || c == 0x1d715 ||
-              c == 0x1d735 || c == 0x1d74f ||
-              c == 0x1d76f || c == 0x1d789 ||
-              c == 0x1d7a9 || c == 0x1d7c3)) ||
-
-            # super- and subscript +-=()
-            (c >= 0x207a && c <= 0x207e) ||
-            (c >= 0x208a && c <= 0x208e) ||
-
-            # angle symbols
-            (c >= 0x2220 && c <= 0x2222) || # ∠, ∡, ∢
-            (c >= 0x299b && c <= 0x29af) || # ⦛, ⦜, ⦝, ⦞, ⦟, ⦠, ⦡, ⦢, ⦣, ⦤, ⦥, ⦦, ⦧, ⦨, ⦩, ⦪, ⦫, ⦬, ⦭, ⦮, ⦯
-
-            # Other_ID_Start
-            c == 0x2118 || c == 0x212E || # ℘, ℮
-            (c >= 0x309B && c <= 0x309C) || # katakana-hiragana sound marks
-
-            # bold-digits and double-struck digits
-            (c >= 0x1D7CE && c <= 0x1D7E1)) # 𝟎 through 𝟗 (inclusive), 𝟘 through 𝟡 (inclusive)
-end
-
-# utility function to return the ASCII byte if isascii(c),
-# and otherwise (for non-ASCII or invalid chars) return 0xff,
-# based on the isascii source code.
-@inline function ascii_byte(c::Char)
-    x = bswap(reinterpret(UInt32, c))
-    return x < 0x80 ? x % UInt8 : 0xff
-end
-
-# from jl_id_start_char in julia/src/flisp/julia_extensions.c
-function is_identifier_start_char(c::Char)
-    a = ascii_byte(c)
-    if a != 0xff # ascii fast path
-        return (a >= u8"A" && a <= u8"Z") || (a >= u8"a" && a <= u8"z") || a == u8"_"
-    end
-    if c < Char(0xA1) || !isvalid(c)
-        return false
-    end
-    x = UInt32(c)
-    return _is_identifier_start_char(x, _category_code(x))
-end
-
-# from jl_id_char in julia/src/flisp/julia_extensions.c
-function is_identifier_char(c::Char)
-    a = ascii_byte(c)
-    if a != 0xff # ascii fast path
-        return (a >= u8"A" && a <= u8"Z") || (a >= u8"a" && a <= u8"z") ||
-               a == u8"_" || (a >= u8"0" && a <= u8"9") || a == u8"!"
-    end
-    if c < Char(0xA1) || !isvalid(c)
-        return false
-    end
-    x = UInt32(c)
-    cat = _category_code(x)
-    _is_identifier_start_char(x, cat) && return true
-    if (cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_MC ||
-        cat == UTF8PROC_CATEGORY_ND || cat == UTF8PROC_CATEGORY_PC ||
-        cat == UTF8PROC_CATEGORY_SK || cat == UTF8PROC_CATEGORY_ME ||
-        cat == UTF8PROC_CATEGORY_NO ||
-        # primes (single, double, triple, their reverses, and quadruple)
-        (x >= 0x2032 && x <= 0x2037) || (x == 0x2057))
-        return true
-    end
-    return false
-end
-
-#####################################################################
-
-end