Skip to content
5 changes: 3 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@ uuid = "70703baa-626e-46a2-a12c-08ffd08c73b4"
authors = ["Claire Foster <[email protected]> and contributors"]
version = "0.4.6"

[deps]
UnicodeNext = "7b9d9d2f-29eb-4111-b31d-f1cfc33d1412"

[compat]
julia = "1.0"

[deps]

[extras]
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Expand Down
2 changes: 2 additions & 0 deletions src/JuliaSyntax.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
module JuliaSyntax

using UnicodeNext

# Conservative list of exports - only export the most common/useful things
# here.

Expand Down
54 changes: 16 additions & 38 deletions src/literal_parsing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -329,48 +329,27 @@ function unescape_julia_string(io::IO, txtbuf::Vector{UInt8},
end

#-------------------------------------------------------------------------------
# Unicode normalization. As of Julia 1.8, this is part of Base and the Unicode
# stdlib under the name `Unicode.julia_chartransform`. See
# https://github.com/JuliaLang/julia/pull/42561
#
# To allow use on older Julia versions and to workaround the bug
# https://github.com/JuliaLang/julia/issues/45716
# we reproduce a specialized version of that logic here.
# Unicode normalization.

# static wrapper around user callback function
"""
    utf8proc_custom_func(codepoint::UInt32, ::Ptr{Cvoid})::UInt32

Per-code-point transform used as a C callback: map a handful of look-alike
code points to their canonical replacement and return every other code point
unchanged. The second (pointer) argument is ignored.
"""
function utf8proc_custom_func(codepoint::UInt32, ::Ptr{Cvoid})::UInt32
    if codepoint == 0x025B      # 'ɛ' => 'ε'
        return 0x03B5
    elseif codepoint == 0x00B5  # 'µ' => 'μ'
        return 0x03BC
    elseif codepoint == 0x00B7  # '·' => '⋅'
        return 0x22C5
    elseif codepoint == 0x0387  # '·' => '⋅'
        return 0x22C5
    elseif codepoint == 0x2212  # '−' (\minus) => '-'
        return 0x002D
    elseif codepoint == 0x210F  # 'ℏ' (\hslash) => 'ħ' \hbar
        return 0x0127
    end
    # Anything not listed above passes through untouched.
    return codepoint
end

"""
    utf8proc_decompose(str, options, buffer, nwords)

Call the C function `utf8proc_decompose_custom` on `str`, passing
`utf8proc_custom_func` as the per-code-point callback (with a null user-data
pointer), and return the integer the C function produced.

# Arguments
- `str`: input text; its pointer and `sizeof(str)` are handed to the C call.
- `options`: option flags forwarded as a `Cint`.
- `buffer`: destination pointer/array, or `C_NULL`.
- `nwords`: capacity forwarded alongside `buffer`.

A negative result is passed to `Base.Unicode.utf8proc_error`.
"""
function utf8proc_decompose(str, options, buffer, nwords)
ret = ccall(:utf8proc_decompose_custom, Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint, Ptr{Cvoid}, Ptr{Cvoid}),
str, sizeof(str), buffer, nwords, options,
@cfunction(utf8proc_custom_func, UInt32, (UInt32, Ptr{Cvoid})), C_NULL)
# Negative values signal failure (presumably `utf8proc_error` throws — TODO confirm).
ret < 0 && Base.Unicode.utf8proc_error(ret)
# NOTE(review): presumably the number of code points required/written — confirm
# against the utf8proc documentation.
return ret
end

function utf8proc_map(str::Union{String,SubString{String}}, options::Integer)
nwords = utf8proc_decompose(str, options, C_NULL, 0)
buffer = Base.StringVector(nwords*4)
nwords = utf8proc_decompose(str, options, buffer, nwords)
nbytes = ccall(:utf8proc_reencode, Int, (Ptr{UInt8}, Int, Cint), buffer, nwords, options)
nbytes < 0 && Base.Unicode.utf8proc_error(nbytes)
return String(resize!(buffer, nbytes))
"""
    normalize_identifier(c::Char)

Map a single character to the form used for identifiers: a few look-alike
characters are replaced by their canonical counterpart, everything else is
returned unchanged.
"""
function normalize_identifier(c::Char)
    # ASCII common case: nothing at or below '~' is ever remapped.
    c <= '~' && return c

    if c == '\u025B'        # 'ɛ' => 'ε'
        return '\u03B5'
    elseif c == '\u00B5'    # 'µ' => 'μ'
        return '\u03BC'
    elseif c == '\u00B7'    # '·' => '⋅'
        return '\u22C5'
    elseif c == '\u0387'    # '·' => '⋅'
        return '\u22C5'
    elseif c == '\u2212'    # '−' (\minus) => '-'
        return '\u002D'
    elseif c == '\u210F'    # 'ℏ' (\hslash) => 'ħ' \hbar
        return '\u0127'
    end
    return c
end

function normalize_identifier(str)
flags = Base.Unicode.UTF8PROC_STABLE | Base.Unicode.UTF8PROC_COMPOSE
return isascii(str) ? str : utf8proc_map(str, flags)
"""
    normalize_identifier(str::AbstractString)

Return the normalized form of `str`. ASCII-only input is returned as is;
anything else goes through `UnicodeNext.normalize` with `stable=true`,
`compose=true` and the per-character `normalize_identifier` as `chartransform`.
"""
function normalize_identifier(str::AbstractString)
    # Fast path: pure ASCII text is handed back without calling the normalizer.
    if isascii(str)
        return str
    end
    return UnicodeNext.normalize(str; stable=true, compose=true,
                                 chartransform=normalize_identifier)
end


#-------------------------------------------------------------------------------
function parse_julia_literal(txtbuf::Vector{UInt8}, head::SyntaxHead, srcrange)
k = kind(head)
Expand Down Expand Up @@ -451,4 +430,3 @@ function parse_julia_literal(txtbuf::Vector{UInt8}, head::SyntaxHead, srcrange)
ErrorVal()
end
end

2 changes: 1 addition & 1 deletion src/source_files.jl
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ function _print_marker_line(io, prefix_str, str, underline, singleline, color,
# Getting exactly the same width of whitespace as `str` is tricky.
# Especially for mixtures of tabs and spaces.
# tabs are zero width according to textwidth
indent = join(isspace(c) ? c : repeat(' ', textwidth(c)) for c in prefix_str)
indent = join(UnicodeNext.isspace(c) ? c : repeat(' ', textwidth(c)) for c in prefix_str)

# Assume tabs are 4 wide rather than 0. (fixme: implement tab alignment?)
w = textwidth(str) + 4*count(c->c=='\t', str)
Expand Down
163 changes: 133 additions & 30 deletions src/tokenize.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,128 @@ module Tokenize

export tokenize, untokenize

using ..JuliaSyntax: JuliaSyntax, Kind, @K_str, @KSet_str
using ..JuliaSyntax: JuliaSyntax, Kind, @K_str, @KSet_str, @u8_str

import ..JuliaSyntax: kind,
is_literal, is_error, is_contextual_keyword, is_word_operator

#-------------------------------------------------------------------------------
# Character-based predicates for tokenization
import Base.Unicode

const EOF_CHAR = typemax(Char)

function is_identifier_char(c::Char)
c == EOF_CHAR && return false
isvalid(c) || return false
return Base.is_id_char(c)
# Julia identifier parsing predicates

using UnicodeNext

import UnicodeNext: CATEGORY_CS, CATEGORY_LL, CATEGORY_LM, CATEGORY_LO,
CATEGORY_LT, CATEGORY_LU, CATEGORY_MC, CATEGORY_ME, CATEGORY_MN,
CATEGORY_ND, CATEGORY_NL, CATEGORY_NO, CATEGORY_PC, CATEGORY_PD,
CATEGORY_PO, CATEGORY_SC, CATEGORY_SK, CATEGORY_SO, CATEGORY_ZS

# port of is_wc_cat_id_start from julia/src/flisp/julia_extensions.c
"""
    _is_identifier_start_char(c::UInt32, cat::Integer)

Decide from a code point `c` and its category code `cat` whether the character
is accepted by the identifier-start rule. The result is `true` when `cat` is
one of the letter categories (`LU`, `LL`, `LT`, `LM`, `LO`), `NL` or `SC`;
when `cat` is `SO` and `c` is not one of the excluded code points; or when `c`
falls in one of the explicit code point whitelists below (which are tested
regardless of `cat`).

Callers in this file pass `UInt32(ch)` for `c` and
`UnicodeNext.category_code(UInt32(ch))` for `cat`.
"""
function _is_identifier_start_char(c::UInt32, cat::Integer)
return (cat == CATEGORY_LU || cat == CATEGORY_LL ||
cat == CATEGORY_LT || cat == CATEGORY_LM ||
cat == CATEGORY_LO || cat == CATEGORY_NL ||
cat == CATEGORY_SC || # allow currency symbols
# other symbols, but not arrows or replacement characters
(cat == CATEGORY_SO && !(c >= 0x2190 && c <= 0x21FF) &&
c != 0xfffc && c != 0xfffd &&
c != 0x233f && # notslash
c != 0x00a6) || # broken bar

# math symbol (category Sm) whitelist
# The outer range test is a cheap pre-filter: every code point listed inside
# this group lies within 0x2140..0x2a1c.
(c >= 0x2140 && c <= 0x2a1c &&
((c >= 0x2140 && c <= 0x2144) || # ⅀, ⅁, ⅂, ⅃, ⅄
c == 0x223f || c == 0x22be || c == 0x22bf || # ∿, ⊾, ⊿
c == 0x22a4 || c == 0x22a5 || # ⊤ ⊥

(c >= 0x2200 && c <= 0x2233 &&
(c == 0x2202 || c == 0x2205 || c == 0x2206 || # ∂, ∅, ∆
c == 0x2207 || c == 0x220e || c == 0x220f || # ∇, ∎, ∏
c == 0x2200 || c == 0x2203 || c == 0x2204 || # ∀, ∃, ∄
c == 0x2210 || c == 0x2211 || # ∐, ∑
c == 0x221e || c == 0x221f || # ∞, ∟
c >= 0x222b)) || # ∫, ∬, ∭, ∮, ∯, ∰, ∱, ∲, ∳

(c >= 0x22c0 && c <= 0x22c3) || # N-ary big ops: ⋀, ⋁, ⋂, ⋃
(c >= 0x25F8 && c <= 0x25ff) || # ◸, ◹, ◺, ◻, ◼, ◽, ◾, ◿

(c >= 0x266f &&
(c == 0x266f || c == 0x27d8 || c == 0x27d9 || # ♯, ⟘, ⟙
(c >= 0x27c0 && c <= 0x27c1) || # ⟀, ⟁
(c >= 0x29b0 && c <= 0x29b4) || # ⦰, ⦱, ⦲, ⦳, ⦴
(c >= 0x2a00 && c <= 0x2a06) || # ⨀, ⨁, ⨂, ⨃, ⨄, ⨅, ⨆
(c >= 0x2a09 && c <= 0x2a16) || # ⨉, ⨊, ⨋, ⨌, ⨍, ⨎, ⨏, ⨐, ⨑, ⨒, ⨓, ⨔, ⨕, ⨖
c == 0x2a1b || c == 0x2a1c)))) || # ⨛, ⨜

# The lower-bound test short-circuits the equality chain for all smaller code points.
(c >= 0x1d6c1 && # variants of \nabla and \partial
(c == 0x1d6c1 || c == 0x1d6db ||
c == 0x1d6fb || c == 0x1d715 ||
c == 0x1d735 || c == 0x1d74f ||
c == 0x1d76f || c == 0x1d789 ||
c == 0x1d7a9 || c == 0x1d7c3)) ||

# super- and subscript +-=()
(c >= 0x207a && c <= 0x207e) ||
(c >= 0x208a && c <= 0x208e) ||

# angle symbols
(c >= 0x2220 && c <= 0x2222) || # ∠, ∡, ∢
(c >= 0x299b && c <= 0x29af) || # ⦛, ⦜, ⦝, ⦞, ⦟, ⦠, ⦡, ⦢, ⦣, ⦤, ⦥, ⦦, ⦧, ⦨, ⦩, ⦪, ⦫, ⦬, ⦭, ⦮, ⦯

# Other_ID_Start
c == 0x2118 || c == 0x212E || # ℘, ℮
(c >= 0x309B && c <= 0x309C) || # katakana-hiragana sound marks

# bold-digits and double-struck digits
(c >= 0x1D7CE && c <= 0x1D7E1)) # 𝟎 through 𝟗 (inclusive), 𝟘 through 𝟡 (inclusive)
end

# utility function to return the ASCII byte if isascii(c),
# and otherwise (for non-ASCII or invalid chars) return 0xff,
# based on the isascii source code.
@inline function _ascii_byte(c::Char)
    # Byte-swap the raw Char bits so that a one-byte (ASCII) character ends up
    # as a plain integer below 0x80; every other bit pattern lands at or above it.
    bits = bswap(reinterpret(UInt32, c))
    if bits < 0x80
        return bits % UInt8
    end
    # Sentinel for "not ASCII" (also covers invalid characters).
    return 0xff
end

# from jl_id_start_char in julia/src/flisp/julia_extensions.c
"""
    is_identifier_start_char(c::Char)

Return whether `c` is accepted as the first character of an identifier.
`EOF_CHAR` and invalid characters are always rejected.
"""
function is_identifier_start_char(c::Char)
# NOTE(review): the next three statements end in an unconditional `return`
# (delegating to `Base.is_id_start_char`), so everything after them in this
# function is unreachable as written. They look like leftovers of an earlier
# implementation sitting next to its replacement — TODO confirm which version
# is intended and delete the other.
c == EOF_CHAR && return false
isvalid(c) || return false
return Base.is_id_start_char(c)
# `_ascii_byte` yields 0xff for anything that is not a single ASCII byte.
a = _ascii_byte(c)
if a != 0xff # ascii fast path
return (a >= u8"A" && a <= u8"Z") || (a >= u8"a" && a <= u8"z") || a == u8"_"
end
# Non-ASCII characters below U+00A1 and invalid characters are rejected outright.
if c < Char(0xA1) || !isvalid(c)
return false
end
# Everything else is decided from the code point and its category code.
x = UInt32(c)
return _is_identifier_start_char(x, UnicodeNext.category_code(x))
end

# from jl_id_char in julia/src/flisp/julia_extensions.c
"""
    is_identifier_char(c::Char)

Return whether `c` is accepted inside an identifier (not necessarily as its
first character). ASCII letters, digits, `_` and `!` are accepted directly;
other characters are decided from their code point and category code.
"""
function is_identifier_char(c::Char)
    byte = _ascii_byte(c)
    if byte != 0xff # ascii fast path
        is_letter = (byte >= u8"A" && byte <= u8"Z") || (byte >= u8"a" && byte <= u8"z")
        is_digit = byte >= u8"0" && byte <= u8"9"
        return is_letter || byte == u8"_" || is_digit || byte == u8"!"
    end

    # Non-ASCII characters below U+00A1 and invalid characters never qualify.
    (c < Char(0xA1) || !isvalid(c)) && return false

    code = UInt32(c)
    cat = UnicodeNext.category_code(code)
    # Anything that may start an identifier may also continue one; on top of
    # that a few extra categories and the prime characters are accepted.
    return _is_identifier_start_char(code, cat) ||
        cat == CATEGORY_MN || cat == CATEGORY_MC ||
        cat == CATEGORY_ND || cat == CATEGORY_PC ||
        cat == CATEGORY_SK || cat == CATEGORY_ME ||
        cat == CATEGORY_NO ||
        # primes (single, double, triple, their reverses, and quadruple)
        (code >= 0x2032 && code <= 0x2037) || code == 0x2057
end

function is_invisible_char(c::Char)
Expand All @@ -44,15 +145,15 @@ end
# Chars that we will never allow to be part of a valid non-operator identifier
function is_never_id_char(ch::Char)
isvalid(ch) || return true
cat = Unicode.category_code(ch)
cat = UnicodeNext.category_code(ch)
c = UInt32(ch)
return (
# spaces and control characters:
(cat >= Unicode.UTF8PROC_CATEGORY_ZS && cat <= Unicode.UTF8PROC_CATEGORY_CS) ||
(cat >= CATEGORY_ZS && cat <= CATEGORY_CS) ||

# ASCII and Latin1 non-connector punctuation
(c < 0xff &&
cat >= Unicode.UTF8PROC_CATEGORY_PD && cat <= Unicode.UTF8PROC_CATEGORY_PO) ||
cat >= CATEGORY_PD && cat <= CATEGORY_PO) ||

c == UInt32('`') ||

Expand Down Expand Up @@ -137,10 +238,10 @@ end
if (u < 0xa1 || u > 0x10ffff)
return false
end
cat = Base.Unicode.category_code(u)
if (cat == Base.Unicode.UTF8PROC_CATEGORY_MN ||
cat == Base.Unicode.UTF8PROC_CATEGORY_MC ||
cat == Base.Unicode.UTF8PROC_CATEGORY_ME)
cat = UnicodeNext.category_code(u)
if (cat == CATEGORY_MN ||
cat == CATEGORY_MC ||
cat == CATEGORY_ME)
return true
end
# Additional allowed cases
Expand Down Expand Up @@ -226,7 +327,7 @@ end
# Hexadecimal digit: 0-9, a-f or A-F.
@inline function ishex(c::Char)
    return isdigit(c) || ('a' <= c <= 'f') || ('A' <= c <= 'F')
end

# Binary digit: '0' or '1'.
@inline function isbinary(c::Char)
    return c in ('0', '1')
end

# Octal digit: '0' through '7'.
@inline function isoctal(c::Char)
    return c >= '0' && c <= '7'
end
# Whitespace predicate for the tokenizer: a valid character that `isspace`
# accepts, or U+FEFF, which is accepted explicitly.
# NOTE(review): the same method signature is defined twice in a row; the second
# definition (using `UnicodeNext.isspace`) replaces the first (using
# `Base.isspace`). Presumably only one of them is meant to remain — confirm.
@inline iswhitespace(c::Char) = (isvalid(c) && Base.isspace(c)) || c === '\ufeff'
@inline iswhitespace(c::Char) = (isvalid(c) && UnicodeNext.isspace(c)) || c === '\ufeff'

struct StringState
triplestr::Bool
Expand Down Expand Up @@ -1289,25 +1390,27 @@ function lex_identifier(l::Lexer, c)
h = simple_hash(c, UInt64(0))
n = 1
ascii = isascii(c)
graphemestate = Ref(Int32(ascii)) # all ASCII id chars are UTF8PROC_BOUNDCLASS_OTHER
graphemestate_peek = Ref(zero(Int32))
graphemestate = UnicodeNext.GraphemeState(c)
while true
pc, ppc = dpeekchar(l)
ascii = ascii && isascii(pc)
pc_byte = _ascii_byte(pc)
ascii = ascii && pc_byte != 0xff
if ascii # fast path
pc_byte = pc % UInt8
@inbounds if (pc_byte == UInt8('!') && ppc == '=') || !ascii_is_identifier_char[pc_byte+1]
break
end
elseif Unicode.isgraphemebreak!(graphemestate, c, pc)
if (pc == '!' && ppc == '=') || !is_identifier_char(pc)
break
end
elseif pc in ('\u200c','\u200d') # ZWNJ/ZWJ control characters
# ZWJ/ZWNJ only within grapheme sequences, not at end
graphemestate_peek[] = graphemestate[]
if Unicode.isgraphemebreak!(graphemestate_peek, pc, ppc)
break
else
graphemestate, isbreak = UnicodeNext.isgraphemebreak(graphemestate, pc)
if isbreak
if ((pc == '!' && ppc == '=') || !is_identifier_char(pc))
break
end
elseif pc in ('\u200c','\u200d') # ZWNJ/ZWJ control characters
# ZWJ/ZWNJ only within grapheme sequences, not at end
_, isbreak_peek = UnicodeNext.isgraphemebreak(graphemestate, ppc)
if isbreak_peek
break
end
end
end
c = readchar(l)
Expand Down
3 changes: 1 addition & 2 deletions test/tokenize.jl
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,7 @@ end
end # testset

@testset "tokenize unicode" begin
# FIXME: rm VERSION check once we implement our own is_identifier_char
emoji = VERSION < v"1.5" ? "😄" : "\U1F3F3\UFE0F\U200D\U1F308" # 🏳️‍🌈 requires newer Unicode
emoji = "\U1F3F3\UFE0F\U200D\U1F308" # == "🏳️‍🌈"
str = "𝘋 =2"*emoji
for s in [str, IOBuffer(str)]
l = tokenize(s)
Expand Down