Skip to content

Commit

Permalink
Working on escaping to fix regressions found in pgloader
Browse files Browse the repository at this point in the history
 * allow quote to be nil and just read quotes directly as chars
 * there were some bugs with reading two quotes as the escape
 * added tests for each case

re #27
  • Loading branch information
bobbysmith007 committed Feb 26, 2018
1 parent d2a60c8 commit 2101afa
Show file tree
Hide file tree
Showing 5 changed files with 156 additions and 67 deletions.
6 changes: 3 additions & 3 deletions cl-csv.asd
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
:author "Russ Tyndall ([email protected]), Acceleration.net"
:description "Facilities for reading and writing CSV format files"
:license "BSD"
:version "1.0.4"
:version "1.0.5"
:serial t
:components ((:file "packages")
(:file "vars")
Expand All @@ -25,7 +25,7 @@
:description "Tests for a library providing a cl-csv class, and useful
functionality around this"
:license "BSD"
:version "1.0.4"
:version "1.0.5"
:components ((:module :tests
:serial t
:components ((:file "csv")
Expand All @@ -43,7 +43,7 @@
:description "Tests for a library providing a cl-csv class, and useful
functionality around this"
:license "BSD"
:version "1.0.4"
:version "1.0.5"
:components ((:file "csv-old")
(:module :tests
:serial t
Expand Down
2 changes: 2 additions & 0 deletions csv-old.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
((:newline *read-newline*) *read-newline*)
((:escape-mode *escape-mode*) *escape-mode*)
&aux
(*quote-escape* (or *quote-escape*
#?"${ *quote* }${ *quote* }"))
(*separator* (etypecase *separator*
(string (if (= 1 (length *separator*))
(schar *separator* 0)
Expand Down
192 changes: 132 additions & 60 deletions parser.lisp
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
(in-package :cl-csv)

(defclass read-dispatch-table ()
((buffer
((parse-stream :accessor parse-stream :initarg :parse-stream :initform nil)
(buffer
:accessor buffer
:initarg :buffer
:initform (make-array *buffer-size*
Expand Down Expand Up @@ -41,18 +42,20 @@ See: csv-reader "))
T matches anything
create these with make-table-entry"))

(defun make-table-entry (delimiter dispatch)
(defun make-table-entry (delimiter dispatch
&key (class 'read-dispatch-table-entry))
"Creates a table entry ensuring everything has the correct
types and values"
(let* ((d (if (eql delimiter t)
#(t)
(etypecase delimiter
(null (return-from make-table-entry nil))
(character (vector delimiter))
(string delimiter)
((or simple-vector vector simple-array array) delimiter)
(list (apply #'vector delimiter)))))
(te (make-instance
'read-dispatch-table-entry
class
:delimiter d
:dispatch dispatch
:dlen (length d)
Expand All @@ -63,19 +66,20 @@ See: csv-reader "))
"resets the entry state when it doesnt match"
(setf (didx te) -1))

(defun check-table-entry (te c)
(defmethod check-table-entry (table entry c)
"Given the next character in a stream check if the table entry matches
reset if it matches fully or doesnt match"
(incf (didx te))
(let ((de-char (aref (delimiter te) (didx te))))
(declare (ignore table))
(incf (didx entry))
(let ((de-char (aref (delimiter entry) (didx entry))))
(cond
((or (eql t de-char)
(char= c de-char))
(when (eql (didx te) (dlen-1 te))
(reset-table-entry te)
(when (eql (didx entry) (dlen-1 entry))
(reset-table-entry entry)
t))
(t
(reset-table-entry te)
(reset-table-entry entry)
nil))))

;; holds the states that the state machine events will drive
Expand All @@ -93,11 +97,12 @@ See: csv-reader "))
:adjustable t :fill-pointer 0)
:accessor line-data)
(reading-quoted? :initform () :accessor reading-quoted?)
(after-quoted? :initform () :accessor after-quoted?)
(after-quoted? :initform () :accessor after-quoted?)
(row-fn :initform () :accessor row-fn :initarg :row-fn)
(map-fn :initform () :accessor map-fn :initarg :map-fn)
(data-map-fn :initform 'map-empty-string-to-nil :accessor data-map-fn :initarg :data-map-fn)
(skip-row? :initform nil :initarg :skip-row? :accessor skip-row?))
(skip-row? :initform nil :initarg :skip-row? :accessor skip-row?)
)
(:documentation "the state of the csv reader, which is also is a read table"))

(defun map-empty-string-to-nil (data &key csv-reader &allow-other-keys)
Expand All @@ -112,65 +117,103 @@ See: csv-reader "))
data))


(defun last-item (buff &key (n 1))
(let ((idx (- (fill-pointer buff) n)))
(if (plusp idx)
(aref buff idx)
nil)))

(defun (setf last-item) (new buff)
(let ((idx (- (fill-pointer buff) 1)))
(setf (aref buff idx) new)))

(defun %next-char (reader)
(incf (character-idx reader))
(incf (character-line-idx reader))
(read-char (parse-stream reader) nil nil))

(defmethod reading-quoted-or-escaped (csv-reader c &key table-entry)
"Method to handle reading a quote or a pair of quotes"
(declare (ignorable table-entry))
(assert (and (eql *escape-mode* :quote)
(or (null *quote-escape*)
(%escape-is-double-quote))))
(cond
;; we finished reading quoted and got more
((after-quoted? csv-reader)
(csv-parse-error
"we are reading non quoted csv data and found a quote at~%~A~%~A"
csv-reader c))

;; not reading quoted so start to
((not (reading-quoted? csv-reader))
(setf (reading-quoted? csv-reader) t))

;; if we are reading quoted, read a quote and the next char is a quote
;; store the quote and skip a char

((and (reading-quoted? csv-reader)
(equal *quote* (peek-char nil (parse-stream csv-reader) nil nil)))
(%next-char csv-reader)
(vector-push-extend c (buffer csv-reader)))

((reading-quoted? csv-reader)
(setf (after-quoted? csv-reader) t
(reading-quoted? csv-reader) nil))
)
t)

(defmethod reading-quoted (csv-reader c &key table-entry)
"Method to handle reading a quote
NB: this interacts wierdly with escape-mode :quote "
(declare (ignorable table-entry))
;; skips collection of quote characters


(when (and (not (reading-quoted? csv-reader))
(plusp (fill-pointer (buffer csv-reader))))
;; TODO: this could probably be removed and just let that fly
(csv-parse-error "we are reading non quoted csv data and found a quote at~%~A~%~A"
csv-reader c))
(setf (reading-quoted? csv-reader)
(not (reading-quoted? csv-reader)))

(setf (reading-quoted? csv-reader) (not (reading-quoted? csv-reader)))

(when (not (reading-quoted? csv-reader))
(setf (after-quoted? csv-reader) t))
t)

(defun last-item (buff)
(let ((idx (- (fill-pointer buff) 1)))
(if (plusp idx)
(aref buff idx)
nil)))

(defun (setf last-item) (new buff)
(let ((idx (- (fill-pointer buff) 1)))
(setf (aref buff idx) new)))

(defmethod reading-escaped (csv-reader c &key table-entry)
(defmethod reading-following-escaped (csv-reader c &key table-entry)
"We read an escape sequence and need to handle storing
the escaped character"
(declare (ignorable table-entry))
(case *escape-mode*
(:quote
(cond
;; read an empty data item, let it ride
((zerop (fill-pointer (buffer csv-reader)))
(setf (reading-quoted? csv-reader) nil
(after-quoted? csv-reader) t))
(t
;; we got an escape instead of an end quote
(setf (reading-quoted? csv-reader) t
(after-quoted? csv-reader) nil)
(vector-push-extend c (buffer csv-reader)))))
(ecase *escape-mode*
(:following
;; replace the previous character with
;; the char escaped
(setf (last-item (buffer csv-reader)) c)
))
t)

(defun collect-datum (csv-reader
&aux (data-map-fn (data-map-fn csv-reader)))
(defmethod reading-escaped (csv-reader c &key table-entry)
"We read an escape sequence and need to handle storing
the escaped character"
(drop-delimiter-chars csv-reader table-entry)
(vector-push-extend *quote* (buffer csv-reader))
t)

(defun %trim-datum (csv-reader &aux (b (buffer csv-reader)))
(when (and (not (after-quoted? csv-reader))
(not (reading-quoted? csv-reader))
*trim-outer-whitespace*)
(iter
(while (and (white-space? (last-item (buffer csv-reader)))
(plusp (fill-pointer (buffer csv-reader)))))
(decf (fill-pointer (buffer csv-reader)))))
(while (and (white-space? (last-item b))
(plusp (fill-pointer b))))
(decf (fill-pointer b))))
b
)

(let ((d (copy-seq (string (buffer csv-reader)))))
(defun collect-datum (csv-reader
&aux (data-map-fn (data-map-fn csv-reader)))
(let ((d (copy-seq (%trim-datum csv-reader))))
(setf d (csv-data-read d :csv-reader csv-reader))
(when data-map-fn
(setf d (funcall data-map-fn d :csv-reader csv-reader)))
Expand Down Expand Up @@ -255,6 +298,17 @@ See: csv-reader "))
(vector-push-extend c (buffer csv-reader))
t)))

(defun %escape-is-double-quote
(&aux
(x (typecase *quote*
((or sequence string) (concatenate 'vector *quote* *quote*))
(character (vector *quote* *quote*)))))
(typecase *quote-escape*
((or null character) nil)
(sequence
(equalp x *quote-escape*))
))

(defun make-default-csv-reader ()
"Creates the default csv dispatch table
This can usually be fully changed simply by tweaking the special variables
Expand All @@ -267,10 +321,28 @@ See: csv-reader "))
#'vector
(alexandria:flatten
(list
(if (equal *escape-mode* :following)
(make-table-entry (vector *quote-escape* t) #'reading-escaped)
(make-table-entry (vector *quote* *quote*) #'reading-escaped))
(make-table-entry *quote* #'reading-quoted)
(when *quote*
(cond
;; Escape is two adjacent quotes in quoted data
((and (eql *escape-mode* :quote)
(or (null *quote-escape*)
(%escape-is-double-quote)))
(make-table-entry *quote* #'reading-quoted-or-escaped))
;; escape is a literal sequence of chars that mean quote
((eql *escape-mode* :quote)
(list (make-table-entry *quote-escape* #'reading-escaped)
(make-table-entry *quote* #'reading-quoted)))
;; escape is a backslash followed by the char being escaped
((eql *escape-mode* :following)
(list
(make-table-entry
(etypecase *quote-escape*
(null (vector #\\ t))
(string (apply #'vector (append (coerce *quote-escape* 'list) '(t))))
(character (vector *quote-escape* t)))
#'reading-following-escaped)
(make-table-entry *quote* #'reading-quoted)))))

(make-table-entry *separator* #'reading-separator)

(if (member *read-newline* '(t nil) :test #'equalp)
Expand All @@ -289,25 +361,25 @@ See: csv-reader "))
if it matches, call the function with the table character and entry"
(iter (for entry in-vector (entries table))
(when (typep entry 'read-dispatch-table-entry)
(when (check-table-entry entry c)
(when (check-table-entry table entry c)
(funcall (dispatch entry) table c :table-entry entry)
(return t)))))

(defun read-with-dispatch-table (table stream &aux (read-cnt 0))
"A generic function for processing all the characters of a stream until
a match arises and collecting that data as it goes"
(iter (for c = (read-char stream nil nil))
(while c)
(incf read-cnt)
(incf (character-line-idx table))
(incf (character-idx table))
(cond
((check-and-distpatch table c)
t ;; it handles what to do with the data etc
)
(t
;; didnt dispatch so store
(vector-push-extend c (buffer table)))))
(iter
(setf (parse-stream table) stream)
(for c = (%next-char table))
(while c)
(incf read-cnt)
(cond
((check-and-distpatch table c)
t ;; it handles what to do with the data etc
)
(t
;; didnt dispatch so store
(vector-push-extend c (buffer table)))))
(when (zerop read-cnt)
(error (make-condition 'end-of-file :stream stream)))
(when (reading-quoted? table)
Expand Down
20 changes: 17 additions & 3 deletions tests/csv.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -459,9 +459,10 @@ multiline" (nth 3 (first data)) ))
))

(define-test backslash-escapes (:tags '(backslash escapes parsing))
(lisp-unit2:assert-error
'csv-parse-error
(cl-csv:read-csv +test-backslash-escapes+ :escape #?|\"|))
(let ((res
(cl-csv:read-csv-row +test-backslash-escapes+ :escape #?|\"|)))
(assert-equal "\"\\id\"" (first res)
"we removed the quotes, but not the rest of the escapes"))
(let ((results (cl-csv:read-csv
+test-backslash-escapes+
:escape #\\ :escape-mode :following )))
Expand Down Expand Up @@ -525,3 +526,16 @@ order by t1.fspace, t1.fad_target_classid) TO STDOUT DELIMITER ',' NULL 'null' C
(iter (for r in +test-next-delim-csv-rows+)
(for s in rows)
(assert-equal r s))))

(define-test escape-regression (:tags '(issue-27 parsing))
(assert-equal
'("2918" "\"?\"" "" "\"" "test")
(cl-csv:read-csv-row
#?|2918,"""?""","","""","test"|
:separator #\, :quote #\" :escape "\"\"" :escape-mode :quote)
))

(define-test unquoted-data (:tags '(issue-26 parsing))
(cl-csv:read-csv-row
"2918,this is a \"no quote\" test, does it work?"
:separator #\, :quote nil))
Expand Down
3 changes: 2 additions & 1 deletion vars.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@
"When writing what should the newline convention be ")

(defvar *always-quote* nil "Default setting for always quoting")
(defvar *quote-escape* #?"${ *quote* }${ *quote* }" "Default setting for escaping quotes")
(defvar *quote-escape* nil
"Default setting for escaping quotes - by default this is a vector of #(*quote* *quote*)")
(defvar *unquoted-empty-string-is-nil* nil
"Should unquoted empty string values, be nil or \"\".")

Expand Down

0 comments on commit 2101afa

Please sign in to comment.