Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Experimental support for unicode identifiers. #1407

Draft
wants to merge 20 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@ def runPerformanceTests(String testsPath, String stancFlags = ""){
cd cmdstan; make clean-all;
"""

if (stancFlags?.trim()) {
sh "cd performance-tests-cmdstan/cmdstan && echo 'STANCFLAGS= $stancFlags' >> make/local"
}
// if (stancFlags?.trim()) {
sh "cd performance-tests-cmdstan/cmdstan && echo 'STANCFLAGS= --allow-unicode $stancFlags' >> make/local"
// }

sh """
cd performance-tests-cmdstan/cmdstan
Expand All @@ -67,7 +67,7 @@ def cleanCheckout() {
userRemoteConfigs: scm.userRemoteConfigs,
])
}

sh 'git clean -xffd'
}

Expand Down Expand Up @@ -101,7 +101,7 @@ pipeline {
GIT_AUTHOR_EMAIL = '[email protected]'
GIT_COMMITTER_NAME = 'Stan Jenkins'
GIT_COMMITTER_EMAIL = '[email protected]'
MULTIARCH_DOCKER_TAG = 'multiarch-ocaml-4.14-v2'
MULTIARCH_DOCKER_TAG = 'multiarch-unicode'
}
stages {
stage('Verify changes') {
Expand Down
4 changes: 4 additions & 0 deletions dune-project
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@
(= 0.9.0))
(yojson
(= 2.1.0))
(uucp
(= 15.1.0))
(uunf
(= 15.1.0))
(ocamlformat
(and
:with-test
Expand Down
16 changes: 4 additions & 12 deletions scripts/docker/debian-windows/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -23,25 +23,17 @@ WORKDIR /home/jenkins
ENV DEBIAN_FRONTEND noninteractive
ENV OPAMROOT /usr/local/opam

#Copy our script and install ocaml + init
# Copy our script and install ocaml + init
COPY ./scripts/install_opam.sh ./
RUN printf "\n" | bash -x install_opam.sh

#Copy our script and install ocaml + init
# Copy our script and install ocaml + init
COPY ./scripts/install_ocaml.sh ./
RUN printf "\n" | bash -x install_ocaml.sh "stanc"

#Copy our script and install build dependencies
# Copy our script and install build dependencies
COPY ./scripts/install_build_deps_windows.sh ./
RUN bash -x install_build_deps_windows.sh

#Copy our script and install dev dependencies
COPY ./scripts/install_dev_deps.sh ./
RUN bash -x install_dev_deps.sh

# Install Javascript dev environment
COPY ./scripts/install_js_deps.sh ./
RUN opam update; bash -x install_js_deps.sh

#Specify our entrypoint
# Specify our entrypoint
ENTRYPOINT [ "opam", "config", "exec", "--" ]
2 changes: 1 addition & 1 deletion scripts/docker/debian/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ RUN opam update; bash -x install_build_deps.sh
COPY ./scripts/install_dev_deps.sh ./
RUN opam update; bash -x install_dev_deps.sh

# Install Javascript dev environment (js_of_ocaml 5.5.2)
# Install Javascript dev environment
COPY ./scripts/install_js_deps.sh ./
RUN opam update; bash -x install_js_deps.sh

Expand Down
3 changes: 2 additions & 1 deletion scripts/docker/multiarch/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,5 @@ RUN eval $(opam env) && opam install -y menhir.20230608
RUN eval $(opam env) && opam install -y ppx_deriving.5.2.1
RUN eval $(opam env) && opam install -y fmt.0.9.0
RUN eval $(opam env) && opam install -y yojson.2.1.0
RUN eval $(opam env)
RUN eval $(opam env) && opam install -y uucp.15.1.0 uunf.15.1.0
RUN eval $(opam env)
8 changes: 4 additions & 4 deletions scripts/docker/static/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ RUN CONTAINERS_COMMON_VER=$(curl -s https://dl-cdn.alpinelinux.org/alpine/latest
#Switch back to the normal user
USER jenkins

#Init opam, create and switch to 4.14.0, update shell environment
# Init opam, create and switch to 4.14.1, update shell environment
RUN opam init --disable-sandboxing --bare -y
RUN opam switch create 4.14.0
RUN opam switch 4.14.0
RUN opam switch create 4.14.1
RUN opam switch 4.14.1
RUN eval $(opam env)

RUN opam repo add internet https://opam.ocaml.org
Expand All @@ -45,4 +45,4 @@ RUN opam update; bash -x install_build_deps.sh
RUN opam install odoc -y

#Specify our entrypoint
ENTRYPOINT [ "opam", "config", "exec", "--" ]
ENTRYPOINT [ "opam", "config", "exec", "--" ]
2 changes: 1 addition & 1 deletion scripts/install_build_deps.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@ eval $(opam env)

opam pin -y core v0.16.1 --no-action

opam install -y dune core.v0.16.1 menhir.20230608 ppx_deriving.5.2.1 fmt.0.9.0 yojson.2.1.0
opam install -y dune core.v0.16.1 menhir.20230608 ppx_deriving.5.2.1 fmt.0.9.0 yojson.2.1.0 uucp.15.1.0 uunf.15.1.0

eval $(opam env)
5 changes: 1 addition & 4 deletions scripts/install_build_deps_windows.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,10 @@ eval $(opam env)
opam repository add windows http://github.com/ocaml-cross/opam-cross-windows.git
opam update windows

# Request the compiler to be built with flambda optimizers
opam install -y conf-flambda-windows

# Install the compiler
opam install -y "ocaml-windows64=4.14.1"

# Install dependencies
opam install -y core.v0.16.1 core-windows.v0.16.1 menhir.20230608 menhir-windows.20230608 ppx_deriving.5.2.1 ppx_deriving-windows.5.2.1 fmt.0.9.0 fmt-windows.0.9.0 yojson.2.1.0 yojson-windows.2.1.0
opam install -y core.v0.16.1 core-windows.v0.16.1 menhir.20230608 menhir-windows.20230608 ppx_deriving.5.2.1 ppx_deriving-windows.5.2.1 fmt.0.9.0 fmt-windows.0.9.0 yojson.2.1.0 yojson-windows.2.1.0 uucp.15.1.0 uucp-windows.15.1.0 uunf.15.1.0 uunf-windows.15.1.0

eval $(opam env)
107 changes: 107 additions & 0 deletions src/common/Unicode.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
(** [pp_uchar ppf u] prints the code point [u] on [ppf]. Code points below 128
    are printed as the corresponding ASCII character, escaped with
    [Char.escaped]; every other code point is printed in [U+XXXX] hexadecimal
    notation (at least four digits). *)
let pp_uchar ppf u =
  let code = Uchar.to_int u in
  if code >= 128 then Fmt.pf ppf "U+%04X" code
  else code |> Char.chr |> Char.escaped |> Fmt.string ppf

(** [is_ascii s] is [true] iff every byte of [s] is below 128, i.e. [s]
    consists solely of ASCII characters. The empty string is ASCII.

    The string is scanned in place; no copy of [s] is made. *)
let is_ascii s = String.for_all (fun c -> Char.code c < 128) s

(** [normalize s] is the UTF-8 encoded string [s] converted to Unicode
    normalization form NFKC by [Uunf_string.normalize_utf_8]. *)
let normalize s = Uunf_string.normalize_utf_8 `NFKC s

(** [foldi_uchars ~f acc str] folds [f] over the characters decoded from the
    UTF-8 string [str], from left to right.

    [f] is called as [f acc pos uchar], where [pos] is the byte offset in
    [str] at which the character starts and [uchar] is the decoded character.
    The final accumulator is returned; for the empty string this is [acc].

    Decoding is delegated to [String.get_utf_8_uchar], so the treatment of
    malformed byte sequences is whatever the standard library decoder does. *)
let foldi_uchars ~f acc str =
  let len = String.length str in
  let rec loop pos acc =
    (* Integer comparison rather than physical equality; [>=] also stops the
       loop should [pos] ever step past the end of [str]. *)
    if pos >= len then acc
    else
      let decode = String.get_utf_8_uchar str pos in
      let acc = f acc pos (Uchar.utf_decode_uchar decode) in
      loop (pos + Uchar.utf_decode_length decode) acc in
  loop 0 acc

(** [iteri_uchars ~f str] calls [f pos c] for each character [c] decoded from
    the UTF-8 string [str], where [pos] is the byte offset at which [c]
    starts.

    As a sanity check, every visited character is re-encoded into a buffer;
    if the re-encoded text differs from [str] an internal compiler error is
    raised. *)
let iteri_uchars ~f str =
  let reencoded = Buffer.create (String.length str) in
  let visit () pos c =
    f pos c;
    Buffer.add_utf_8_uchar reencoded c in
  foldi_uchars ~f:visit () str;
  (* The names [str] and [s_after] appear in the error message below. *)
  let s_after = Buffer.contents reencoded in
  if not (String.equal str s_after) then
    Core.(
      ICE.internal_compiler_error
        [%message
          "Failed to round-trip unicode string!"
            (str : string)
            (s_after : string)])

(* WIP:

While not strictly necessary, there are some additional restrictions that
are worth implementing, both for validation and to prevent visually
identical strings from being distinct identifiers.
A good summary can be found here: https://perl11.org/blog/unicode-identifiers.html

Most of these only matter if the user is assumed to be malicious,
so they may not be important for an initial version in Stan.
*)

(** [confusable x y] compares the "skeletons" of [x] and [y] with
    [String.compare] and returns that integer; [0] means the two skeletons
    are identical.

    A skeleton is computed by normalizing to NFD, dropping every character
    for which [Uucp.Gen.is_default_ignorable] holds, and normalizing to NFD
    again. The prototype-replacement step is not implemented yet (see TODO).

    Defined in https://www.unicode.org/reports/tr39/#Confusable_Detection *)
let confusable x y =
  let nfd s = Uunf_string.normalize_utf_8 `NFD s in
  let skeleton s =
    let keep buf _ c =
      (* TODO!! replace with prototype - need data? *)
      if not (Uucp.Gen.is_default_ignorable c) then
        Buffer.add_utf_8_uchar buf c;
      buf in
    let decomposed = nfd s in
    decomposed
    |> foldi_uchars ~f:keep (Buffer.create (String.length decomposed))
    |> Buffer.contents |> nfd in
  String.compare (skeleton x) (skeleton y)

(* Sets of Unicode script tags, built with [Set.Make] over [Uucp.Script]. *)
module ScriptSet = Set.Make (Uucp.Script)

(** The set containing every script tag listed below; used as the
    "matches all scripts" value by [extended] and as the starting point of
    the intersection in [restriction_level].

    copied from UUCP's definition of [Uucp.Script.t]

    NOTE(review): this list is maintained by hand, so it presumably has to be
    revisited whenever the uucp version is changed - confirm on upgrade. *)
let all =
ScriptSet.of_list
[ `Adlm; `Aghb; `Ahom; `Arab; `Armi; `Armn; `Avst; `Bali; `Bamu; `Bass; `Batk
; `Beng; `Bhks; `Bopo; `Brah; `Brai; `Bugi; `Buhd; `Cakm; `Cans; `Cari
; `Cham; `Cher; `Chrs; `Copt; `Cpmn; `Cprt; `Cyrl; `Deva; `Diak; `Dogr
; `Dsrt; `Dupl; `Egyp; `Elba; `Elym; `Ethi; `Geor; `Glag; `Gong; `Gonm
; `Goth; `Gran; `Grek; `Gujr; `Guru; `Hang; `Hani; `Hano; `Hatr; `Hebr
; `Hira; `Hluw; `Hmng; `Hmnp; `Hrkt; `Hung; `Ital; `Java; `Kali; `Kana
; `Kawi; `Khar; `Khmr; `Khoj; `Knda; `Kthi; `Kits; `Lana; `Laoo; `Latn
; `Lepc; `Limb; `Lina; `Linb; `Lisu; `Lyci; `Lydi; `Mahj; `Maka; `Mand
; `Mani; `Marc; `Medf; `Mend; `Merc; `Mero; `Mlym; `Modi; `Mong; `Mroo
; `Mtei; `Mult; `Mymr; `Nagm; `Nand; `Narb; `Nbat; `Newa; `Nkoo; `Nshu
; `Ogam; `Olck; `Orkh; `Orya; `Osge; `Osma; `Ougr; `Palm; `Pauc; `Perm
; `Phag; `Phli; `Phlp; `Phnx; `Plrd; `Prti; `Qaai; `Rjng; `Rohg; `Runr
; `Samr; `Sarb; `Saur; `Sgnw; `Shaw; `Shrd; `Sidd; `Sind; `Sinh; `Sogd
; `Sogo; `Sora; `Soyo; `Sund; `Sylo; `Syrc; `Tagb; `Takr; `Tale; `Talu
; `Taml; `Tang; `Tavt; `Telu; `Tfng; `Tglg; `Thaa; `Thai; `Tibt; `Tirh
; `Tnsa; `Toto; `Ugar; `Vaii; `Vith; `Wara; `Wcho; `Xpeo; `Xsux; `Yezi
; `Yiii; `Zanb; `Zinh; `Zyyy; `Zzzz ]

(** [extended s] is [all] when [s] contains the [`Zyyy] or the [`Zinh] script
    tag, and [s] itself otherwise. *)
let extended s =
  let matches_every_script =
    List.exists (fun script -> ScriptSet.mem script s) [`Zyyy; `Zinh] in
  if matches_every_script then all else s

(** [restriction_level x] classifies the UTF-8 string [x].

    For every character of [x] the set of its script extensions
    ([Uucp.Script.script_extensions]) is computed and widened with
    [extended]; these sets are intersected, starting from [all]. The result
    is [`Single] when the intersection is non-empty (in particular for the
    empty string) and [`Unrestricted] otherwise.

    The intersection is accumulated directly while folding over the
    characters, so no intermediate list of sets is built.

    Defined in
    https://www.unicode.org/reports/tr39/#Restriction_Level_Detection *)
let restriction_level x =
  let narrow candidates _ c =
    Uucp.Script.script_extensions c
    |> ScriptSet.of_list |> extended
    |> ScriptSet.inter candidates in
  let resolved = foldi_uchars ~f:narrow all x in
  if not (ScriptSet.is_empty resolved) then `Single
  else `Unrestricted (* TODO implement levels 3-5 *)
2 changes: 1 addition & 1 deletion src/common/dune
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
(library
(name common)
(public_name stanc.common)
(libraries core str fmt)
(libraries core str fmt uunf uucp)
(instrumentation
(backend bisect_ppx))
(inline_tests)
Expand Down
6 changes: 3 additions & 3 deletions src/frontend/Errors.ml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ module Str = Re.Str

(** Our type of syntax error information *)
type syntax_error =
| Lexing of Middle.Location.t
| Lexing of string * Middle.Location.t
| UnexpectedEOF of Middle.Location.t
| Include of string * Middle.Location.t
| Parsing of string * Middle.Location_span.t
Expand Down Expand Up @@ -47,12 +47,12 @@ let pp_syntax_error ?printed_filename ?code ppf = function
(Middle.Location_span.to_string ?printed_filename loc_span)
(pp_context_with_message ?code)
(message, loc_span.begin_loc)
| Lexing loc ->
| Lexing (message, loc) ->
Fmt.pf ppf "Syntax error in %s, lexing error:@,%a@."
(Middle.Location.to_string ?printed_filename
{loc with col_num= loc.col_num - 1})
(pp_context_with_message ?code)
("Invalid character found.", loc)
(message, loc)
| UnexpectedEOF loc ->
Fmt.pf ppf "Syntax error in %s, lexing error:@,%a@."
(Middle.Location.to_string ?printed_filename
Expand Down
2 changes: 1 addition & 1 deletion src/frontend/Errors.mli
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

(** Our type of syntax error information *)
type syntax_error =
| Lexing of Middle.Location.t
| Lexing of string * Middle.Location.t
| UnexpectedEOF of Middle.Location.t
| Include of string * Middle.Location.t
| Parsing of string * Middle.Location_span.t
Expand Down
45 changes: 45 additions & 0 deletions src/frontend/Identifiers.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
open Common.Unicode

(** Switch for the experimental Unicode identifier support. It starts out as
    [false]; while it is [false], [validate_utf8_id] rejects every non-ASCII
    identifier with a lexing error. *)
let allow_unicode = ref false

(** [error ~loc msg] raises [Errors.SyntaxError] carrying a [Errors.Lexing]
    error with message [msg], located at the position [loc] as converted by
    [Preprocessor.location_of_position]. It never returns. *)
let error ~loc msg =
  let location = Preprocessor.location_of_position loc in
  let lexing_error = Errors.Lexing (msg, location) in
  raise (Errors.SyntaxError lexing_error)

(** [validate_ascii_id ~loc id] logs [id] to the lexer debug log and returns
    it unchanged when its first byte is an ASCII letter ([A-Z] or [a-z]).

    Only the first character is inspected here. An identifier that is empty
    or starts with anything else is reported through [error] as a lexing
    error at [loc]; the empty case is checked explicitly so that it does not
    escape as an out-of-bounds [Invalid_argument]. *)
let validate_ascii_id ~loc id =
  Debugging.lexer_logger ("ascii id: " ^ id);
  let starts_with_letter =
    String.length id > 0
    && match id.[0] with 'A' .. 'Z' | 'a' .. 'z' -> true | _ -> false in
  if starts_with_letter then id else error ~loc "Invalid character found."

(* Validation based on the
Unicode Standard Annex #31: Unicode Identifiers and Syntax
https://www.unicode.org/reports/tr31 *)

(** [validate_utf8_id ~loc id] validates a non-ASCII identifier and returns
    its NFKC-normalized form.

    A lexing error is raised through [error] at [loc] when:
    - [allow_unicode] is not set;
    - [id] is not valid UTF-8;
    - the first character of the normalized identifier is not XID_Start, or
      any later character is not XID_Continue.

    The offset mentioned in the last error message is the byte offset of the
    offending character within the normalized identifier. *)
let validate_utf8_id ~loc id =
  if not !allow_unicode then
    error ~loc
      "Unicode identifiers are not supported without the (experimental) \
       allow-unicode flag";
  if not (String.is_valid_utf_8 id) then
    error ~loc "Identifier is not valid UTF-8 string";
  Debugging.lexer_logger ("unicode id: " ^ id);
  (* normalize to NFKC as recommended *)
  let id = normalize id in
  let f pos uchar =
    (* [pos] is an int: compare it structurally, not by physical equality. *)
    if pos = 0 then (
      if not (Uucp.Id.is_xid_start uchar) then
        error ~loc (Fmt.str "Invalid character: '%a'" pp_uchar uchar))
    else if not (Uucp.Id.is_xid_continue uchar) then
      error ~loc
        (Fmt.str "Invalid character in identifier at offset %d: '%a'" pos
           pp_uchar uchar) in
  iteri_uchars ~f id;
  id

(** [validate loc id] checks the identifier [id] lexed at position [loc] and
    returns the identifier to use. Pure-ASCII identifiers are handled by
    [validate_ascii_id], all others by [validate_utf8_id]; both raise a
    lexing error on invalid input. *)
let validate loc id =
  match is_ascii id with
  | true -> validate_ascii_id ~loc id
  | false -> validate_utf8_id ~loc id
2 changes: 2 additions & 0 deletions src/frontend/Identifiers.mli
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
(** Switch for the experimental Unicode identifier support; non-ASCII
    identifiers are rejected while it holds [false]. *)
val allow_unicode : bool ref

(** [validate pos id] checks the identifier [id] found at lexer position
    [pos] and returns the identifier to use (for non-ASCII identifiers, the
    normalized form). Invalid identifiers are reported by raising a syntax
    error. *)
val validate : Lexing.position -> string -> string
10 changes: 8 additions & 2 deletions src/frontend/dune
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
(library
(name frontend)
(public_name stanc.frontend)
(libraries core re menhirLib fmt middle common yojson)
(libraries core re menhirLib fmt middle common yojson uucp)
(instrumentation
(backend bisect_ppx))
(inline_tests)
(preprocess
(pps ppx_jane ppx_deriving.fold ppx_deriving.map)))

(ocamllex lexer)
(rule
(target lexer.ml)
(deps lexer.mll)
(action
(chdir
%{workspace_root}
(run %{bin:ocamllex} -ml -o %{target} %{deps}))))

(rule
(targets parsing_errors.ml)
Expand Down
Loading