-
Notifications
You must be signed in to change notification settings - Fork 22
/
parser.lisp
433 lines (391 loc) · 16.1 KB
/
parser.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
(in-package :cl-csv)
(defclass read-dispatch-table ()
((parse-stream :accessor parse-stream :initarg :parse-stream :initform nil)
(buffer
:accessor buffer
:initarg :buffer
:initform (make-array *buffer-size*
:element-type 'character
:initial-element #\null
:adjustable t :fill-pointer 0))
(entries :initform nil :initarg :entries :accessor entries )
(line-idx :initform 0 :initarg :line-idx :accessor line-idx)
(character-line-idx
:initform 0 :initarg :character-line-idx :accessor character-line-idx)
(character-idx :initform 0 :initarg :character-idx :accessor character-idx)
)
(:documentation " A stream parser that collects characters
and when a certain delimiter is matched will call a certain function.
These delimiter / function pairs are read-dispatch table entries
It contains all the state for the parse process
See: csv-reader "))
(defmethod print-object ((o read-dispatch-table) s)
"Print the auto-print-items for this instance."
(print-unreadable-object (o s :type t :identity t)
(iter (for slt in '(line-idx character-line-idx character-idx))
(for v = (funcall slt o))
(when v (format s "~A:~S " slt v)))
(ignore-errors (format s "~S" (string (buffer o))))))
(defclass read-dispatch-table-entry ()
((delimiter :type (or (vector (or boolean character)) null)
:accessor delimiter :initarg :delimiter :initform nil)
(didx :type fixnum :initform -1 :accessor didx :initarg :didx)
(dlen :type fixnum :initform 0 :accessor dlen :initarg :dlen)
(dlen-1 :type fixnum :initform -1 :accessor dlen-1 :initarg :dlen-1)
(dispatch :type (or function null) :initform nil :accessor dispatch :initarg :dispatch)
)
(:documentation "When a certain delimiter is matched it will call a certain function
T matches anything
create these with make-table-entry"))
(defun make-table-entry (delimiter dispatch
&key (class 'read-dispatch-table-entry))
"Creates a table entry ensuring everything has the correct
types and values"
(let* ((d (if (eql delimiter t)
#(t)
(etypecase delimiter
(null (return-from make-table-entry nil))
(character (vector delimiter))
(string delimiter)
((or simple-vector vector simple-array array) delimiter)
(list (apply #'vector delimiter)))))
(te (make-instance
class
:delimiter d
:dispatch dispatch
:dlen (length d)
:dlen-1 (- (length d) 1))))
te))
(defun reset-table-entry (te)
"resets the entry state when it doesnt match"
(setf (didx te) -1))
(defmethod check-table-entry (table entry c)
"Given the next character in a stream check if the table entry matches
reset if it matches fully or doesnt match"
(declare (ignore table))
(incf (didx entry))
(let ((de-char (aref (delimiter entry) (didx entry))))
(cond
((or (eql t de-char)
(char= c de-char))
(when (eql (didx entry) (dlen-1 entry))
(reset-table-entry entry)
t))
(t
(reset-table-entry entry)
nil))))
;; holds the states that the state machine events will drive
(defclass csv-reader (read-dispatch-table)
((rows :initform
(make-array 10
:element-type 'list
:initial-element ()
:adjustable t :fill-pointer 0)
:accessor rows :initarg rows)
(line-data :initform
(make-array 10
:element-type 'string
:initial-element ""
:adjustable t :fill-pointer 0)
:accessor line-data)
(reading-quoted? :initform () :accessor reading-quoted?)
(after-quoted? :initform () :accessor after-quoted?)
(row-fn :initform () :accessor row-fn :initarg :row-fn)
(map-fn :initform () :accessor map-fn :initarg :map-fn)
(data-map-fn :initform 'map-empty-string-to-nil :accessor data-map-fn :initarg :data-map-fn)
(skip-row? :initform nil :initarg :skip-row? :accessor skip-row?)
)
(:documentation "the state of the csv reader, which is also is a read table"))
(defun map-empty-string-to-nil (data &key csv-reader &allow-other-keys)
(if (and
(stringp data) ;; other filters /mapping could have happend
(zerop (length data))
(or (and (after-quoted? csv-reader)
*quoted-empty-string-is-nil*)
(and (not (after-quoted? csv-reader))
*unquoted-empty-string-is-nil*)))
nil
data))
(defun last-item (buff &key (n 1))
(let ((idx (- (fill-pointer buff) n)))
(if (plusp idx)
(aref buff idx)
nil)))
(defun (setf last-item) (new buff)
(let ((idx (- (fill-pointer buff) 1)))
(setf (aref buff idx) new)))
(defun %next-char (reader)
(incf (character-idx reader))
(incf (character-line-idx reader))
(read-char (parse-stream reader) nil nil))
(defmethod reading-quoted-or-escaped (csv-reader c &key table-entry)
"Method to handle reading a quote or a pair of quotes"
(declare (ignorable table-entry))
(assert (and (eql *escape-mode* :quote)
(or (null *quote-escape*)
(%escape-is-double-quote))))
(cond
;; we finished reading quoted and got more
((after-quoted? csv-reader)
(csv-parse-error
"we are reading non quoted csv data and found a quote at~%~A~%~A"
csv-reader c))
;; not reading quoted so start to
((not (reading-quoted? csv-reader))
(setf (reading-quoted? csv-reader) t))
;; if we are reading quoted, read a quote and the next char is a quote
;; store the quote and skip a char
((and (reading-quoted? csv-reader)
(equal *quote* (peek-char nil (parse-stream csv-reader) nil nil)))
(%next-char csv-reader)
(vector-push-extend c (buffer csv-reader)))
((reading-quoted? csv-reader)
(setf (after-quoted? csv-reader) t
(reading-quoted? csv-reader) nil))
)
t)
(defmethod reading-quoted (csv-reader c &key table-entry)
"Method to handle reading a quote
NB: this interacts wierdly with escape-mode :quote "
(declare (ignorable table-entry))
(when (and (not (reading-quoted? csv-reader))
(plusp (fill-pointer (buffer csv-reader))))
;; TODO: this could probably be removed and just let that fly
(csv-parse-error "we are reading non quoted csv data and found a quote at~%~A~%~A"
csv-reader c))
(setf (reading-quoted? csv-reader) (not (reading-quoted? csv-reader)))
(when (not (reading-quoted? csv-reader))
(setf (after-quoted? csv-reader) t))
t)
(defmethod reading-following-escaped (csv-reader c &key table-entry)
"We read an escape sequence and need to handle storing
the escaped character"
(declare (ignorable table-entry))
(ecase *escape-mode*
(:following
;; replace the previous character with
;; the char escaped
(setf (last-item (buffer csv-reader)) c)
))
t)
(defmethod reading-escaped (csv-reader c &key table-entry)
"We read an escape sequence and need to handle storing
the escaped character"
(drop-delimiter-chars csv-reader table-entry)
(vector-push-extend *quote* (buffer csv-reader))
t)
(defun %trim-datum (csv-reader &aux (b (buffer csv-reader)))
(when (and (not (after-quoted? csv-reader))
(not (reading-quoted? csv-reader))
*trim-outer-whitespace*)
(iter
(while (and (white-space? (last-item b))
(plusp (fill-pointer b))))
(decf (fill-pointer b))))
b
)
(defun collect-datum (csv-reader
&aux (data-map-fn (data-map-fn csv-reader)))
(let ((d (copy-seq (%trim-datum csv-reader))))
(setf d (csv-data-read d :csv-reader csv-reader))
(when data-map-fn
(setf d (funcall data-map-fn d :csv-reader csv-reader)))
(vector-push-extend d (line-data csv-reader)))
(setf (fill-pointer (buffer csv-reader)) 0
(reading-quoted? csv-reader) nil
(after-quoted? csv-reader) nil))
(defun collect-row-data (csv-reader
&aux (map-fn (map-fn csv-reader))
(row-fn (row-fn csv-reader)))
(collect-datum csv-reader)
(let ((row (coerce (line-data csv-reader) 'list)))
(setf (fill-pointer (line-data csv-reader)) 0)
(setf row (csv-row-read row :csv-reader csv-reader))
(if (skip-row? csv-reader)
(setf (skip-row? csv-reader) nil) ;; we skipped
(progn
(when map-fn
(setf row (funcall map-fn row)))
(if row-fn
(funcall row-fn row)
(vector-push-extend row (rows csv-reader)))))))
(defun drop-delimiter-chars (table entry)
"This backs up the buffer till the delimiter is not in it
we call this without having adding the character we just got
that dispatched"
(iter (for i from (- (length (delimiter entry)) 2) downto 0)
(while (eql (last-item (buffer table))
(aref (delimiter entry) i)))
(decf (fill-pointer (buffer table)))
))
(defmethod reading-separator (csv-reader c &key table-entry)
"We got the data separator character which will be handled
differently based on if we are in quoted data or not"
(cond
((not (reading-quoted? csv-reader))
;; rewind to before delim
(drop-delimiter-chars csv-reader table-entry)
(collect-datum csv-reader)
t)
(t (vector-push-extend c (buffer csv-reader)))))
(defmethod reading-newline (csv-reader c &key table-entry)
"We got the newline character which will be handled
differently based on if we are in quoted data or not"
(declare (ignorable table-entry))
(incf (line-idx csv-reader))
(setf (character-line-idx csv-reader) 0)
(cond
((not (reading-quoted? csv-reader))
;; rewind to before delim
(drop-delimiter-chars csv-reader table-entry)
(collect-row-data csv-reader)
t)
(t (vector-push-extend c (buffer csv-reader))
t )))
(defun reading-character (csv-reader c &key table-entry)
"We read a random character that was not otherwise dispatched on"
(declare (ignorable table-entry))
(cond
((and *trim-outer-whitespace*
(not (reading-quoted? csv-reader))
(white-space? c)
(or (zerop (fill-pointer (buffer csv-reader)))
(after-quoted? csv-reader)))
;; skip storing whitespace before or after
T)
((after-quoted? csv-reader)
(cond
((not (white-space? c))
(csv-parse-error "non whitespace after quoted data ~a ~a"
csv-reader c))
((not *trim-outer-whitespace*)
(csv-parse-error "whitespace after quoted data thats not supposed to be trimmed ~a ~a"
csv-reader c))
(t (vector-push-extend c (buffer csv-reader)))))
(t
;; store the character
(vector-push-extend c (buffer csv-reader))
t)))
(defun %escape-is-double-quote
(&aux
(x (typecase *quote*
((or sequence string) (concatenate 'vector *quote* *quote*))
(character (vector *quote* *quote*)))))
(typecase *quote-escape*
((or null character) nil)
(sequence
(equalp x *quote-escape*))
))
(defun make-default-csv-reader ()
"Creates the default csv dispatch table
This can usually be fully changed simply by tweaking the special variables
defined in vars. You will need to reinstantiate this object when you change those variables
(which is what happens by default)"
(make-instance
'csv-reader
:entries
(apply
#'vector
(alexandria:flatten
(list
(when *quote*
(cond
;; Escape is two adjacent quotes in quoted data
((and (eql *escape-mode* :quote)
(or (null *quote-escape*)
(%escape-is-double-quote)))
(make-table-entry *quote* #'reading-quoted-or-escaped))
;; escape is a literal sequence of chars that mean quote
((eql *escape-mode* :quote)
(list (make-table-entry *quote-escape* #'reading-escaped)
(make-table-entry *quote* #'reading-quoted)))
;; escape is a backslash followed by the char being escaped
((eql *escape-mode* :following)
(list
(make-table-entry
(etypecase *quote-escape*
(null (vector #\\ t))
(string (apply #'vector (append (coerce *quote-escape* 'list) '(t))))
(character (vector *quote-escape* t)))
#'reading-following-escaped)
(make-table-entry *quote* #'reading-quoted)))))
(make-table-entry *separator* #'reading-separator)
(if (member *read-newline* '(t nil) :test #'equalp)
(list
(make-table-entry (vector #\return #\newline) #'reading-newline)
(make-table-entry (vector #\return #\return) #'reading-newline)
(make-table-entry (vector #\newline) #'reading-newline))
(make-table-entry *read-newline* #'reading-newline))
(make-table-entry t #'reading-character)
)))))
(defun check-and-distpatch (table c)
"Check all the entries in a read-dispatch-table to find a match
if it matches, call the function with the table character and entry"
(iter (for entry in-vector (entries table))
(when (typep entry 'read-dispatch-table-entry)
(when (check-table-entry table entry c)
(funcall (dispatch entry) table c :table-entry entry)
(return t)))))
(defun read-with-dispatch-table (table stream &aux (read-cnt 0))
"A generic function for processing all the characters of a stream until
a match arises and collecting that data as it goes"
(iter
(setf (parse-stream table) stream)
(for c = (%next-char table))
(while c)
(incf read-cnt)
(cond
((check-and-distpatch table c)
t ;; it handles what to do with the data etc
)
(t
;; didnt dispatch so store
(vector-push-extend c (buffer table)))))
(when (zerop read-cnt)
(error (make-condition 'end-of-file :stream stream)))
(when (reading-quoted? table)
(restart-case
(csv-parse-error "End of file while collecting quoted item: ~A" table)
(finish-item ())))
(when (or (plusp (fill-pointer (buffer table)))
(plusp (fill-pointer (line-data table))))
(collect-row-data table)))
(defun read-csv-with-reader (stream-or-string
&key csv-reader
(row-fn nil row-fn-p)
(map-fn nil map-fn-p)
(data-map-fn nil data-map-fn-p)
skip-first-p
&allow-other-keys)
"Read a whole csv from the input"
(unless csv-reader
(setf csv-reader (make-default-csv-reader)))
(when row-fn-p (setf (row-fn csv-reader) row-fn))
(when map-fn-p (setf (map-fn csv-reader) map-fn))
(when data-map-fn-p (setf (data-map-fn csv-reader) data-map-fn))
(setf (skip-row? csv-reader) skip-first-p)
(handler-case
(with-csv-input-stream (in-stream stream-or-string)
(read-with-dispatch-table csv-reader in-stream)
(coerce (rows csv-reader) 'list))
(end-of-file (c)
;; this is signaled if we read an empty csv
;; it needs to happend for read-csv-row to work
;; with iterate, but doesnt need to happen in this fn
(declare (ignore c))
(coerce (rows csv-reader) 'list))))
(defun read-csv-row-with-reader (stream-or-string
&key csv-reader
(map-fn nil map-fn-p)
(data-map-fn nil data-map-fn-p)
&allow-other-keys)
"Read a row of csv from the input"
(flet ((return-row (it)
(return-from read-csv-row-with-reader it)))
(unless csv-reader
(setf csv-reader (make-default-csv-reader)))
(when map-fn-p (setf (map-fn csv-reader) map-fn))
(when data-map-fn-p (setf (data-map-fn csv-reader) data-map-fn))
(setf (row-fn csv-reader) #'return-row)
(with-csv-input-stream (in-stream stream-or-string)
(read-with-dispatch-table csv-reader in-stream))))