Lexer: escape chars
ivanjermakov committed Jun 13, 2023
1 parent da05b4d commit 7592b0b
Showing 2 changed files with 120 additions and 55 deletions.
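
The change teaches the lexer to recognize escape sequences in char and string literals and to report unterminated literals with dedicated token kinds. The updated tests below exercise inputs such as (illustrative calls only; the exact token assertions are in lexer.spec.ts):

tokenize(`'\\n'`)                 // char token for an escaped newline
tokenize(`'\\''`)                 // char token for an escaped quote
tokenize(`"escape\\n \\r \\\\"`)  // string token containing escape sequences
tokenize(`'h`)                    // reported as an unterminated-char token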
77 changes: 67 additions & 10 deletions src/lexer/lexer.spec.ts
@@ -72,18 +72,75 @@ let main = (): Unit {
})
})

-    it('tokenize string literal', () => {
-        expect(tokenize(`"string 123 ok"`)).toEqual([
-            { kind: 'string', value: `"string 123 ok"`, location: { start: 0, end: 14 } },
-            { kind: 'eof', value: '', location: { start: 15, end: 15 } }
-        ])
-    })
+    describe('tokenize char', () => {
+        it('plain', () => {
+            expect(tokenize(`'?'`)).toEqual([
+                { kind: 'char', value: `'?'`, location: { start: 0, end: 2 } },
+                { kind: 'eof', value: '', location: { start: 3, end: 3 } }
+            ])
+        })
+
+        it('escape', () => {
+            expect(tokenize(`'\\n''\\r''\\\\'`)).toEqual([
+                { kind: 'char', location: { end: 3, start: 0 }, value: '\'\\n\'' },
+                { kind: 'char', location: { end: 7, start: 4 }, value: '\'\\r\'' },
+                { kind: 'char', location: { end: 11, start: 8 }, value: '\'\\\\\'' },
+                { kind: 'eof', location: { end: 12, start: 12 }, value: '' }
+            ])
+        })
+
+        it('escape char', () => {
+            expect(tokenize(`'\\''`)).toEqual([
+                { kind: 'char', value: `'\\''`, location: { start: 0, end: 3 } },
+                { kind: 'eof', value: '', location: { start: 4, end: 4 } }
+            ])
+        })
+
+        it('unterminated', () => {
+            expect(tokenize(`'h`)).toEqual([
+                { kind: 'unterminated-char', value: `'h`, location: { start: 0, end: 2 } },
+                { kind: 'eof', value: '', location: { start: 3, end: 3 } }
+            ])
+        })
+    })
 
-    it('tokenize char literal', () => {
-        expect(tokenize(`'?'`)).toEqual([
-            { kind: 'char', value: `'?'`, location: { start: 0, end: 2 } },
-            { kind: 'eof', value: '', location: { start: 3, end: 3 } }
-        ])
-    })
+    describe('tokenize string', () => {
+        it('plain', () => {
+            expect(tokenize(`"string 123 ok"`)).toEqual([
+                { kind: 'string', value: `"string 123 ok"`, location: { start: 0, end: 14 } },
+                { kind: 'eof', value: '', location: { start: 15, end: 15 } }
+            ])
+        })
+
+        it('escape', () => {
+            expect(tokenize(`"escape\\n \\r \\\\"`)).toEqual([
+                { kind: 'string', value: `"escape\\n \\r \\\\"`, location: { start: 0, end: 15 } },
+                { kind: 'eof', value: '', location: { start: 16, end: 16 } }
+            ])
+        })
+
+        it('escape string', () => {
+            expect(tokenize(`"\\""`)).toEqual([
+                { kind: 'string', value: `"\\""`, location: { start: 0, end: 3 } },
+                { kind: 'eof', value: '', location: { start: 4, end: 4 } }
+            ])
+        })
+
+        it('quotes', () => {
+            expect(tokenize(`"quotes '\`\\""`)).toEqual([
+                { kind: 'string', value: `"quotes '\`\\""`, location: { start: 0, end: 12 } },
+                { kind: 'eof', value: '', location: { start: 13, end: 13 } }
+            ])
+        })
+
+        it('unterminated', () => {
+            expect(tokenize(`"string 123 ok\n`)).toEqual([
+                { kind: 'unterminated-string', value: `"string 123 ok`, location: { start: 0, end: 13 } },
+                { kind: 'newline', value: `\n`, location: { start: 14, end: 14 } },
+                { kind: 'eof', value: '', location: { start: 15, end: 15 } }
+            ])
+        })
+    })

it('tokenize expression', () => {
98 changes: 53 additions & 45 deletions src/lexer/lexer.ts
@@ -113,6 +113,9 @@ export const constTokenKindMap: Map<TokenKind, string> = new Map([

const intRegex = /^\d+/
const floatRegex = /^((\d+(\.\d*)?e[+-]?\d+)|(\d+\.\d*)|(\d*\.\d+))/
+const singleCharRegex = /(([^\\\n\r])|(\\[abtnvfr\\'"])|(\\u{[0-9a-fA-F]{1,4}}))/
+const charRegex = new RegExp(`^'((\\')|` + singleCharRegex.source + `)'`)
+const stringRegex = new RegExp(`^"((\\")|` + singleCharRegex.source + `)+"`)

/**
* Independent tokens are automatically advanced by parser by default
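
As a rough sanity check of the regexes added in this hunk (a standalone snippet, not code from the commit), they can be exercised directly; the expected results mirror cases from lexer.spec.ts:

// The three definitions are duplicated from the diff above so this snippet runs on its own.
const singleCharRegex = /(([^\\\n\r])|(\\[abtnvfr\\'"])|(\\u{[0-9a-fA-F]{1,4}}))/
const charRegex = new RegExp(`^'((\\')|` + singleCharRegex.source + `)'`)
const stringRegex = new RegExp(`^"((\\")|` + singleCharRegex.source + `)+"`)

console.log(charRegex.test(`'?'`))                    // true:  plain char
console.log(charRegex.test(`'\\n'`))                  // true:  escaped newline
console.log(charRegex.test(`'h`))                     // false: unterminated, falls back to the unterminated-char path
console.log(stringRegex.test(`"escape\\n \\r \\\\"`)) // true:  string with escape sequences
console.log(stringRegex.test(`"\\""`))                // true:  escaped quote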
@@ -264,66 +267,71 @@ const parseInt = (chars: string[], tokens: ParseToken[], pos: { pos: number }):
     return true
 }
 
-/**
- * TODO: escape characters
- * TODO: UTF characters
- *
- * @param chars
- * @param tokens
- * @param pos
- */
 const parseCharLiteral = (chars: string[], tokens: ParseToken[], pos: { pos: number }): boolean => {
     const quote = `'`
-    if (chars[pos.pos] === quote) {
-        const start = pos.pos
-        pos.pos++
-        const charLiteral: string[] = []
-        while (chars[pos.pos] !== quote) {
-            if (isNewline(chars[pos.pos]) || pos.pos === chars.length) {
-                pos.pos++
-                tokens.push(createToken('unterminated-char', quote + charLiteral.join(''), pos, start))
-                return true
-            }
-            charLiteral.push(chars[pos.pos])
-            pos.pos++
-        }
-        pos.pos++
-        // TODO: verify literal
-        tokens.push(createToken('char', quote + charLiteral.join('') + quote, pos, start))
-        return true
-    }
-    return false
-}
+    if (chars[pos.pos] !== quote) return false
+
+    const leftCode = chars.slice(pos.pos).join('')
+    const match = leftCode.match(charRegex)
+    if (match) {
+        const start = pos.pos
+        const char = match[0]
+        pos.pos += char.length
+        tokens.push(createToken('char', char, pos, start))
+    } else {
+        parseUnterminatedChar(chars, tokens, pos)
+    }
+
+    return true
+}
+
+const parseUnterminatedChar = (chars: string[], tokens: ParseToken[], pos: { pos: number }): void => {
+    const quote = `'`
+    const start = pos.pos
+    pos.pos++
+    const char: string[] = []
+    while (chars[pos.pos] !== quote) {
+        if (isNewline(chars[pos.pos]) || pos.pos === chars.length) {
+            pos.pos++
+            tokens.push(createToken('unterminated-char', quote + char.join(''), pos, start))
+            return
+        }
+        char.push(chars[pos.pos])
+        pos.pos++
+    }
+}
 
-/**
- * TODO: escape characters
- * TODO: UTF characters
- *
- * @param chars
- * @param tokens
- * @param pos
- */
 const parseStringLiteral = (chars: string[], tokens: ParseToken[], pos: { pos: number }): boolean => {
     const quote = '"'
-    if (chars[pos.pos] === quote) {
-        const start = pos.pos
-        pos.pos++
-        const stringLiteral: string[] = []
-        while (chars[pos.pos] !== quote) {
-            if (isNewline(chars[pos.pos]) || pos.pos === chars.length) {
-                pos.pos++
-                tokens.push(createToken('unterminated-string', quote + stringLiteral.join(''), pos, start))
-                return true
-            }
-            stringLiteral.push(chars[pos.pos])
-            pos.pos++
-        }
-        pos.pos++
-        // TODO: verify literal
-        tokens.push(createToken('string', quote + stringLiteral.join('') + quote, pos, start))
-        return true
-    }
-    return false
-}
+    if (chars[pos.pos] !== quote) return false
+
+    const leftCode = chars.slice(pos.pos).join('')
+    const match = leftCode.match(stringRegex)
+    if (match) {
+        const start = pos.pos
+        const str = match[0]
+        pos.pos += str.length
+        tokens.push(createToken('string', str, pos, start))
+    } else {
+        parseUnterminatedString(chars, tokens, pos)
+    }
+
+    return true
+}
+
+const parseUnterminatedString = (chars: string[], tokens: ParseToken[], pos: { pos: number }): void => {
+    const quote = '"'
+    const start = pos.pos
+    pos.pos++
+    const str: string[] = []
+    while (chars[pos.pos] !== quote) {
+        if (isNewline(chars[pos.pos]) || pos.pos === chars.length) {
+            tokens.push(createToken('unterminated-string', quote + str.join(''), pos, start))
+            return
+        }
+        str.push(chars[pos.pos])
+        pos.pos++
+    }
+}

const createToken = (
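
For orientation, both new literal parsers follow the same approach: try an anchored regex at the current position, emit a token for the full match and advance by its length, and otherwise fall back to an 'unterminated' token. A minimal self-contained sketch of that approach follows; Tok and lexChar are simplified stand-ins, not the repository's actual types or API:

// Minimal sketch of the match-then-advance technique used by parseCharLiteral above.
// Token shape is reduced to the fields the spec file asserts on.
interface Tok { kind: string; value: string; location: { start: number; end: number } }

const singleChar = /(([^\\\n\r])|(\\[abtnvfr\\'"])|(\\u{[0-9a-fA-F]{1,4}}))/
const charRe = new RegExp(`^'((\\')|` + singleChar.source + `)'`)

const lexChar = (code: string, pos: number, tokens: Tok[]): number => {
    const match = code.slice(pos).match(charRe)
    if (match) {
        // regex matched a complete char literal: emit it and jump past it
        const value = match[0]
        tokens.push({ kind: 'char', value, location: { start: pos, end: pos + value.length - 1 } })
        return pos + value.length
    }
    // fallback: consume input until a quote, newline, or end of input and report it as unterminated
    let end = pos + 1
    while (end < code.length && code[end] !== `'` && code[end] !== '\n') end++
    tokens.push({ kind: 'unterminated-char', value: code.slice(pos, end), location: { start: pos, end } })
    return end
}

// usage: lexing `'\n'` (quote, backslash, n, quote) yields one char token spanning positions 0..3
const tokens: Tok[] = []
lexChar(`'\\n'`, 0, tokens)
console.log(tokens)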
