forked from dbuenzli/uuseg
-
Notifications
You must be signed in to change notification settings - Fork 1
/
usegtrip.ml
179 lines (152 loc) · 6.06 KB
/
usegtrip.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
(*---------------------------------------------------------------------------
Copyright (c) 2014 The uuseg programmers. All rights reserved.
SPDX-License-Identifier: ISC
---------------------------------------------------------------------------*)
let str = Printf.sprintf
let pp = Format.fprintf
let pp_pos ppf d = pp ppf "%d.%d:(%d,%06X) "
(Uutf.decoder_line d) (Uutf.decoder_col d) (Uutf.decoder_count d)
(Uutf.decoder_byte_count d)
let pp_malformed ppf bs =
let l = String.length bs in
pp ppf "@[malformed bytes @[(";
if l > 0 then pp ppf "%02X" (Char.code (bs.[0]));
for i = 1 to l - 1 do pp ppf "@ %02X" (Char.code (bs.[i])) done;
pp ppf ")@]@]"
let exec = Filename.basename Sys.executable_name
let log f = Format.eprintf ("%s: " ^^ f ^^ "@?") exec
let input_malformed = ref false
let log_malformed inf d bs =
input_malformed := true;
log "%s:%a: %a@." inf pp_pos d pp_malformed bs
let u_rep = `Uchar Uutf.u_rep
(* Output *)
let uchar_ascii delim ppf =
let last_was_u = ref false in
function
| `Uchar u ->
if !last_was_u then (Format.pp_print_char ppf ' ');
last_was_u := true; pp ppf "U+%04X" (Uchar.to_int u)
| `Boundary ->
last_was_u := false; pp ppf "%s" delim
| `End -> ()
let uchar_encoder enc delim =
let enc = match enc with
| `ISO_8859_1 | `US_ASCII -> `UTF_8
| #Uutf.encoding as enc -> enc
in
let delim =
let add acc _ = function
| `Uchar _ as u -> u :: acc
| `Malformed bs ->
log "delimiter: %a" pp_malformed bs; u_rep :: acc
in
List.rev (Uutf.String.fold_utf_8 add [] delim)
in
let e = Uutf.encoder enc (`Channel stdout) in
function
| `Uchar _ | `End as v -> ignore (Uutf.encode e v)
| `Boundary -> List.iter (fun u -> ignore (Uutf.encode e u)) delim
let out_fun delim ascii oe =
if ascii then uchar_ascii delim Format.std_formatter else
uchar_encoder oe delim
(* Trip *)
let segment boundary inf d first_dec out =
let segmenter = Uuseg.create boundary in
let rec add v = match Uuseg.add segmenter v with
| `Uchar _ | `Boundary as v -> out v; add `Await
| `Await | `End -> ()
in
let rec loop d = function
| `Uchar _ as v -> add v; loop d (Uutf.decode d)
| `End as v -> add v; out `End
| `Malformed bs -> log_malformed inf d bs; add u_rep; loop d (Uutf.decode d)
| `Await -> assert false
in
if Uutf.decoder_removed_bom d then add (`Uchar Uutf.u_bom);
loop d first_dec
let trip seg inf enc delim ascii =
try
let ic = if inf = "-" then stdin else open_in inf in
let d = Uutf.decoder ?encoding:enc (`Channel ic) in
let first_dec = Uutf.decode d in (* guess encoding if needed. *)
let out = out_fun delim ascii (Uutf.decoder_encoding d) in
segment seg inf d first_dec out;
if inf <> "-" then close_in ic;
flush stdout
with Sys_error e -> log "%s@." e; exit 1
(* Version *)
let unicode_version () = Format.printf "%s@." Uuseg.unicode_version
(* Cmd *)
let do_cmd cmd seg inf enc delim ascii = match cmd with
| `Unicode_version -> unicode_version ()
| `Trip -> trip seg inf enc delim ascii
(* Cmdline interface *)
open Cmdliner
let cmd =
let doc = "Output supported Unicode version." in
let unicode_version = `Unicode_version, Arg.info ["unicode-version"] ~doc in
Arg.(value & vflag `Trip [unicode_version])
let seg_docs = "SEGMENTATION"
let seg =
let docs = seg_docs in
let doc = "Line break opportunities boundaries." in
let line = `Line_break, Arg.info ["l"; "line"] ~doc ~docs in
let doc = "Grapheme cluster boundaries." in
let gc = `Grapheme_cluster, Arg.info ["g"; "grapheme-cluster"] ~doc ~docs in
let doc = "Word boundaries (default)." in
let w = `Word, Arg.info ["w"; "word"] ~doc ~docs in
let doc = "Sentence boundaries." in
let s = `Sentence, Arg.info ["s"; "sentence"] ~doc ~docs in
Arg.(value & vflag `Word [line; gc; w; s])
let file =
let doc = "The input file. Reads from stdin if unspecified." in
Arg.(value & pos 0 string "-" & info [] ~doc ~docv:"FILE")
let enc =
let enc = [ "UTF-8", `UTF_8; "UTF-16", `UTF_16; "UTF-16LE", `UTF_16LE;
"UTF-16BE", `UTF_16BE; "ASCII", `US_ASCII; "latin1", `ISO_8859_1 ]
in
let doc = str "Input encoding, must %s. If unspecified the encoding is \
guessed. The output encoding is the same as the input \
encoding except for ASCII and latin1 where UTF-8 is output."
(Arg.doc_alts_enum enc)
in
Arg.(value & opt (some (enum enc)) None & info [ "e"; "encoding" ] ~doc)
let ascii =
let doc = "Output the input text as space (U+0020) separated Unicode
scalar values written in the US-ASCII charset."
in
Arg.(value & flag & info ["a"; "ascii"] ~doc)
let delim =
let doc = "The UTF-8 encoded delimiter used to denote boundaries." in
Arg.(value & opt string "|" & Arg.info [ "d"; "delimiter" ] ~doc ~docv:"SEP")
let cmd =
let doc = "segment Unicode text" in
let man = [
`S "DESCRIPTION";
`P "$(tname) inputs Unicode text from stdin and rewrites it
to stdout with segment boundaries as determined according
the locale independent specifications of UAX 29 and UAX 14.
Boundaries are represented by the UTF-8 encoded delimiter string
specified with the option $(b,-d) (defaults to `|').";
`P "Invalid byte sequences in the input are reported on stderr and
replaced by the Unicode replacement character (U+FFFD) in the output.";
`S seg_docs;
`S "OPTIONS";
`S "EXIT STATUS";
`P "$(tname) exits with one of the following values:";
`I ("0", "no error occured");
`I ("1", "a command line parsing error occured");
`I ("2", "the input text was malformed");
`S "BUGS";
`P "This program is distributed with the Uuseg OCaml library.
See http://erratique.ch/software/uuseg for contact
information."; ]
in
Cmd.v (Cmd.info "usegtrip" ~version:"%%VERSION%%" ~doc ~man)
Term.(const do_cmd $ cmd $ seg $ file $ enc $ delim $ ascii)
let main () = match Cmd.eval cmd with
| 0 -> if !input_malformed then exit 2 else exit 0
| c when c = Cmd.Exit.cli_error -> exit 1
| c -> exit c
let () = if !Sys.interactive then () else main ()