-
Notifications
You must be signed in to change notification settings - Fork 37
/
characters.lisp
354 lines (304 loc) · 14.8 KB
/
characters.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
(in-package #:ichiran/characters)
(defparameter *sokuon-characters* '(:sokuon "っッ"))
(defparameter *iteration-characters* '(:iter "ゝヽ" :iter-v "ゞヾ"))
(defparameter *modifier-characters* '(:+a "ぁァ" :+i "ぃィ" :+u "ぅゥ" :+e "ぇェ" :+o "ぉォ"
:+ya "ゃャ" :+yu "ゅュ" :+yo "ょョ" :+wa "ゎヮ"
:long-vowel "ー"))
(defparameter *kana-characters*
'(:a "あア" :i "いイ" :u "うウ" :e "えエ" :o "おオ"
:ka "かカ" :ki "きキ" :ku "くク" :ke "けケ" :ko "こコ"
:sa "さサ" :shi "しシ" :su "すス" :se "せセ" :so "そソ"
:ta "たタ" :chi "ちチ" :tsu "つツ" :te "てテ" :to "とト"
:na "なナ" :ni "にニ" :nu "ぬヌ" :ne "ねネ" :no "のノ"
:ha "はハ" :hi "ひヒ" :fu "ふフ" :he "へヘ" :ho "ほホ"
:ma "まマ" :mi "みミ" :mu "むム" :me "めメ" :mo "もモ"
:ya "やヤ" :yu "ゆユ" :yo "よヨ"
:ra "らラ" :ri "りリ" :ru "るル" :re "れレ" :ro "ろロ"
:wa "わワ" :wi "ゐヰ" :we "ゑヱ" :wo "をヲ"
:n "んン"
:ga "がガ" :gi "ぎギ" :gu "ぐグ" :ge "げゲ" :go "ごゴ"
:za "ざザ" :ji "じジ" :zu "ずズ" :ze "ぜゼ" :zo "ぞゾ"
:da "だダ" :dji "ぢヂ" :dzu "づヅ" :de "でデ" :do "どド"
:ba "ばバ" :bi "びビ" :bu "ぶブ" :be "べベ" :bo "ぼボ"
:pa "ぱパ" :pi "ぴピ" :pu "ぷプ" :pe "ぺペ" :po "ぽポ"
:vu "ゔヴ"
))
(defparameter *all-characters* (append *sokuon-characters*
*iteration-characters*
*modifier-characters*
*kana-characters*))
(defparameter *char-class-hash*
(let ((hash (make-hash-table)))
(loop for (class chars) on *all-characters* by #'cddr
do (loop for char across chars
do (setf (gethash char hash) class)))
hash))
(defmacro hash-from-list (var list &key test)
(alexandria:with-gensyms (hash key val)
`(defparameter ,var
(let ((,hash (make-hash-table ,@(when test `(:test ,test)))))
(loop for (,key ,val) on ,list by #'cddr
do (setf (gethash ,key ,hash) ,val))
,hash))))
(hash-from-list *dakuten-hash*
'(:ka :ga :ki :gi :ku :gu :ke :ge :ko :go
:sa :za :shi :ji :su :zu :se :ze :so :zo
:ta :da :chi :dji :tsu :dzu :te :de :to :do
:ha :ba :hi :bi :fu :bu :he :be :ho :bo
:u :vu))
(hash-from-list *handakuten-hash*
'(:ha :pa :hi :pi :fu :pu :he :pe :ho :po))
(hash-from-list *undakuten-hash*
'(:ga :ka :gi :ki :gu :ku :ge :ke :go :ko
:za :sa :ji :shi :zu :su :ze :se :zo :so
:da :ta :dji :chi :dzu :tsu :de :te :do :to
:ba :ha :bi :hi :bu :fu :be :he :bo :ho
:pa :ha :pi :hi :pu :fu :pe :he :po :ho
:vu :u))
(defun voice-char (cc)
"Returns a voiced form of character class, or the same character class"
(gethash cc *dakuten-hash* cc))
(defparameter *punctuation-marks*
'("【" " [" "】" "] "
"、" ", " "," ", "
"。" ". " "・・・" "... " "・" " " " " " "
"「" " \"" "」" "\" " "゛" "\""
"『" " «" "』" "» "
"〜" " - " ":" ": " "!" "! " "?" "? " ";" "; "))
(defun dakuten-join (dakuten-hash char)
(loop for (cc . ccd) in (alexandria:hash-table-alist dakuten-hash)
for kc = (getf *kana-characters* cc)
for kcd = (getf *kana-characters* ccd)
for offset = (- (length kc) (length kcd))
if (> offset 0) do (setf kc (subseq kc offset))
nconcing (loop for idx from 0 below (length kc)
nconcing (list (coerce (list (char kc idx) char) 'string)
(coerce (list (char kcd idx)) 'string)))))
(defparameter *dakuten-join*
(append (dakuten-join *dakuten-hash* #\゛) (dakuten-join *handakuten-hash* #\゜)))
(defparameter *abnormal-chars*
(concatenate 'string
"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ#$%&()*+/〈=〉?@[]^_‘{|}~"
"・ヲァィゥェォャュョッーアイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワン゙゚"))
(defparameter *normal-chars*
(concatenate 'string "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ#$%&()*+/<=>?@[]^_`{|}~"
"・ヲァィゥェォャュョッーアイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワン゛゜"))
(defparameter *katakana-regex* "[ァ-ヺヽヾー]")
(defparameter *katakana-uniq-regex* "[ァ-ヺヽヾ]")
(defparameter *hiragana-regex* "[ぁ-ゔゝゞー]")
(defparameter *kanji-regex* "[々ヶ〆一-龯]")
(defparameter *kanji-char-regex* "[一-龯]")
(defparameter *nonword-regex* "[^々ヶ〆一-龯ァ-ヺヽヾぁ-ゔゝゞー〇]")
(defparameter *numeric-regex* "[0-90-9〇一二三四五六七八九零壱弐参拾十百千万億兆京]")
(defparameter *num-word-regex* "[0-90-9〇々ヶ〆一-龯ァ-ヺヽヾぁ-ゔゝゞー]")
(defparameter *word-regex* "[々ヶ〆一-龯ァ-ヺヽヾぁ-ゔゝゞー〇]")
(defparameter *digit-regex* "[0-90-9〇]")
(defparameter *decimal-point-regex* "[.,]")
(defparameter *basic-split-regex*
(format nil "((?:(?<!~a|~a)~a+|~a)~a*~a|~a)"
*decimal-point-regex* *digit-regex* *digit-regex*
*word-regex* *num-word-regex* *word-regex* *word-regex*))
(defparameter *char-class-regex-mapping*
`((:katakana ,*katakana-regex*)
(:katakana-uniq ,*katakana-uniq-regex*)
(:hiragana ,*hiragana-regex*)
(:kanji ,*kanji-regex*)
(:kanji-char ,*kanji-char-regex*)
(:kana ,(format nil "(~a|~a)" *katakana-regex* *hiragana-regex*))
(:traditional ,(format nil "(~a|~a)" *hiragana-regex* *kanji-regex*))
(:nonword ,*nonword-regex*)
(:number ,*numeric-regex*)))
(deftype char-class () '(member :katakana :katakana-uniq
:hiragana :kanji :kanji-char
:kana :traditional :nonword :number))
(defparameter *char-scanners*
(mapcar (lambda (pair) (cons (car pair) (ppcre:create-scanner (format nil "^~a+$" (cadr pair)))))
*char-class-regex-mapping*))
(defparameter *char-scanners-inner*
(mapcar (lambda (pair)
(cons (car pair) (ppcre:create-scanner `(:greedy-repetition 1 nil (:regex ,@(cdr pair))))))
*char-class-regex-mapping*))
(defun test-word (word char-class)
(declare (type char-class char-class))
(let ((regex (cdr (assoc char-class *char-scanners*))))
(ppcre:scan regex word)))
(defun count-char-class (word char-class)
(declare (type char-class char-class))
(let ((cnt 0)
(regex (cadr (assoc char-class *char-class-regex-mapping*))))
(ppcre:do-matches (s e regex word cnt)
(incf cnt))))
(defun collect-char-class (word char-class)
(declare (type char-class char-class))
(let ((regex (cadr (assoc char-class *char-class-regex-mapping*)))
result)
(ppcre:do-matches-as-strings (s regex word (nreverse result))
(push s result))))
(defun sequential-kanji-positions (word &optional (offset 0))
(let (positions)
(ppcre:do-matches (s e "(?=[々一-龯][々一-龯])" word)
(push (+ s 1 offset) positions))
(nreverse positions)))
(defun kanji-mask (word)
"SQL LIKE mask for word"
(let ((regex (ppcre:create-scanner `(:greedy-repetition 1 nil (:regex ,*kanji-regex*)))))
(ppcre:regex-replace-all regex word "%")))
(defun kanji-regex (word)
(ppcre:create-scanner
`(:sequence
:start-anchor
,@(loop for char across (kanji-mask word)
if (char= char #\%)
collect '(:greedy-repetition 1 nil :everything)
else collect char)
:end-anchor)))
(defun kanji-match (word reading)
(ppcre:scan (kanji-regex word) reading))
(defun kanji-cross-match (word reading new-word)
(let* ((m (mismatch word new-word))
(r-cut (+ m (- (length reading) (length word)))))
(when (and (> m 0) (<= 0 r-cut (length reading)))
(let ((reading-head (subseq reading 0 r-cut)))
(concatenate 'string reading-head (subseq new-word m))))))
(defun simplify-ngrams (str map)
(let* ((alist (loop for (from to) on map by #'cddr collect (cons from to)))
(scanner (ppcre:create-scanner (cons :alternation (mapcar #'car alist)))))
(ppcre:regex-replace-all scanner str
(lambda (match &rest rest)
(declare (ignore rest))
(cdr (assoc match alist :test #'equal)))
:simple-calls t)))
(defun to-normal-char (char)
(let ((pos (position char *abnormal-chars*)))
(when pos
(char *normal-chars* pos))))
(defun normalize (str)
(loop for i from 0 below (length str)
for char = (char str i)
for normal-char = (to-normal-char char)
if normal-char do (setf (char str i) normal-char))
(setf str (simplify-ngrams str (append *punctuation-marks* *dakuten-join*))))
(defun split-by-regex (regex str)
(remove-if (lambda (seg) (= (length seg) 0))
(ppcre:split regex str :with-registers-p t)))
(defun basic-split (str)
"splits string into segments of japanese and misc characters"
(let* ((split1 (split-by-regex *basic-split-regex* str)))
(loop for segment in split1
for misc = (test-word segment :nonword) then (not misc)
collect (cons (if misc :misc :word) segment))))
(defun mora-length (str)
"like length but doesn't count modifier characters"
(count-if-not (lambda (char)
(find char "っッぁァぃィぅゥぇェぉォゃャゅュょョー"))
str))
(defun as-hiragana (str)
"convert katakana to hiragana"
(map 'string
(lambda (char)
(let* ((char (or (to-normal-char char) char))
(class (gethash char *char-class-hash*)))
(if class
(char (getf *all-characters* class) 0)
char)))
str))
(defun as-katakana (str)
"convert hiragana to katakana"
(map 'string
(lambda (char)
(let* ((char (or (to-normal-char char) char))
(class (gethash char *char-class-hash*)))
(if class
(alexandria:last-elt (getf *all-characters* class))
char)))
str))
(defun consecutive-char-groups (char-class str &key (start 0) (end (length str)))
(let ((regex (cdr (assoc char-class *char-scanners-inner*)))
result)
(ppcre:do-matches (s e regex str (nreverse result)
:start start :end end)
(push (cons s e) result))))
(defun kanji-prefix (word)
(or
(let ((regex (format nil "^.*~a" *kanji-regex*)))
(ppcre:scan-to-strings regex word))
""))
(defun unrendaku (txt &key fresh)
(if fresh (setf txt (copy-seq txt)))
(if (zerop (length txt)) txt
(let* ((first-char (char txt 0))
(cc (gethash first-char *char-class-hash*))
(unvoiced (gethash cc *undakuten-hash*)))
(unless unvoiced (return-from unrendaku txt))
(let* ((pos (position first-char (getf *kana-characters* cc)))
(new-char (char (getf *kana-characters* unvoiced) pos)))
(setf (char txt 0) new-char)
txt))))
(defun rendaku (txt &key fresh handakuten)
(if fresh (setf txt (copy-seq txt)))
(if (zerop (length txt)) txt
(let* ((first-char (char txt 0))
(cc (gethash first-char *char-class-hash*))
(use-hash (if handakuten *handakuten-hash* *dakuten-hash*))
(voiced (gethash cc use-hash)))
(unless voiced (return-from rendaku txt))
(let* ((pos (position first-char (getf *kana-characters* cc)))
(new-char (char (getf *kana-characters* voiced) pos)))
(setf (char txt 0) new-char)
txt))))
(defun geminate (txt &key fresh)
(if fresh (setf txt (copy-seq txt)))
(if (zerop (length txt)) txt
(progn (setf (char txt (1- (length txt))) #\っ) txt)))
(defun destem (word stem &optional (char-class :kana))
"Remove `stem` characters of char-class + whatever else gets in the way from the end of `word`"
(declare (type char-class char-class))
(when (= stem 0) (return-from destem word))
(let ((regex (cadr (assoc char-class *char-class-regex-mapping*)))
pos)
(ppcre:do-matches (s e regex word) (push s pos))
(let ((tail (nthcdr (1- stem) pos)))
(if tail (subseq word 0 (car tail)) ""))))
(defun match-diff (s1 s2 &aux (l1 (length s1)) (l2 (length s2)))
"Match strings s1 and s2 optimally. Similar to ichiran/kanji:match-readings, but works with any strings."
(cond
((zerop l1))
((zerop l2))
(t (let ((m (mismatch s1 s2)))
(cond
((not m) (values (list s1) l1))
((or (= l1 1) (= l2 1)) (values (list (list s1 s2)) 0))
((= m 0)
(let ((best-match nil)
(best-match-value nil))
(loop for c1 across s1
for i from 0
unless (zerop i)
do (loop for c2 across s2
for j from 0
if (and (not (zerop j)) (char= c1 c2))
do (multiple-value-bind (match value) (match-diff (subseq s1 i) (subseq s2 j))
(when (and match (or (not best-match-value) (> value best-match-value)))
(setf best-match (cons (list (subseq s1 0 i) (subseq s2 0 j)) match)
best-match-value value)))))
(when best-match
(values best-match best-match-value))))
((= m l1)
(values (list (subseq s1 0 (1- l1)) (list (subseq s1 (1- l1)) (subseq s2 (1- l1)))) (1- l1)))
((= m l2)
(values (list (subseq s2 0 (1- l2)) (list (subseq s1 (1- l2)) (subseq s2 (1- l2)))) (1- l2)))
(t
(multiple-value-bind (match value) (match-diff (subseq s1 m) (subseq s2 m))
(when match
(values (cons (subseq s1 0 m) match) (+ value m))))))))))
(defun safe-subseq (sequence start &optional end)
(let ((len (length sequence)))
(when (and (<= 0 start len)
(or (not end) (<= start end len)))
(subseq sequence start end))))
(defun join (separator list &key key)
(with-output-to-string (out)
(loop for (obj . more) on list
for string = (if key (funcall key obj) obj)
do (princ string out)
if more do (princ separator out))))