Lexer: escape chars
ivanjermakov committed Jun 13, 2023
1 parent da05b4d commit 7592b0b
Showing 2 changed files with 120 additions and 55 deletions.
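
The change teaches the lexer to recognize escape sequences in char and string literals and to report unterminated literals with dedicated token kinds. The updated tests below exercise inputs such as (illustrative calls only; the exact token assertions are in lexer.spec.ts):

tokenize(`'\\n'`)                 // char token for an escaped newline
tokenize(`'\\''`)                 // char token for an escaped quote
tokenize(`"escape\\n \\r \\\\"`)  // string token containing escape sequences
tokenize(`'h`)                    // reported as an unterminated-char token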
77 changes: 67 additions & 10 deletions src/lexer/lexer.spec.ts
@@ -72,18 +72,75 @@ let main = (): Unit {
})
})

-    it('tokenize string literal', () => {
-        expect(tokenize(`"string 123 ok"`)).toEqual([
-            { kind: 'string', value: `"string 123 ok"`, location: { start: 0, end: 14 } },
-            { kind: 'eof', value: '', location: { start: 15, end: 15 } }
-        ])
-    })
+    describe('tokenize char', () => {
+        it('plain', () => {
+            expect(tokenize(`'?'`)).toEqual([
+                { kind: 'char', value: `'?'`, location: { start: 0, end: 2 } },
+                { kind: 'eof', value: '', location: { start: 3, end: 3 } }
+            ])
+        })
+
+        it('escape', () => {
+            expect(tokenize(`'\\n''\\r''\\\\'`)).toEqual([
+                { kind: 'char', location: { end: 3, start: 0 }, value: '\'\\n\'' },
+                { kind: 'char', location: { end: 7, start: 4 }, value: '\'\\r\'' },
+                { kind: 'char', location: { end: 11, start: 8 }, value: '\'\\\\\'' },
+                { kind: 'eof', location: { end: 12, start: 12 }, value: '' }
+            ])
+        })
+
+        it('escape char', () => {
+            expect(tokenize(`'\\''`)).toEqual([
+                { kind: 'char', value: `'\\''`, location: { start: 0, end: 3 } },
+                { kind: 'eof', value: '', location: { start: 4, end: 4 } }
+            ])
+        })
+
+        it('unterminated', () => {
+            expect(tokenize(`'h`)).toEqual([
+                { kind: 'unterminated-char', value: `'h`, location: { start: 0, end: 2 } },
+                { kind: 'eof', value: '', location: { start: 3, end: 3 } }
+            ])
+        })
+    })
 
-    it('tokenize char literal', () => {
-        expect(tokenize(`'?'`)).toEqual([
-            { kind: 'char', value: `'?'`, location: { start: 0, end: 2 } },
-            { kind: 'eof', value: '', location: { start: 3, end: 3 } }
-        ])
-    })
+    describe('tokenize string', () => {
+        it('plain', () => {
+            expect(tokenize(`"string 123 ok"`)).toEqual([
+                { kind: 'string', value: `"string 123 ok"`, location: { start: 0, end: 14 } },
+                { kind: 'eof', value: '', location: { start: 15, end: 15 } }
+            ])
+        })
+
+        it('escape', () => {
+            expect(tokenize(`"escape\\n \\r \\\\"`)).toEqual([
+                { kind: 'string', value: `"escape\\n \\r \\\\"`, location: { start: 0, end: 15 } },
+                { kind: 'eof', value: '', location: { start: 16, end: 16 } }
+            ])
+        })
+
+        it('escape string', () => {
+            expect(tokenize(`"\\""`)).toEqual([
+                { kind: 'string', value: `"\\""`, location: { start: 0, end: 3 } },
+                { kind: 'eof', value: '', location: { start: 4, end: 4 } }
+            ])
+        })
+
+        it('quotes', () => {
+            expect(tokenize(`"quotes '\`\\""`)).toEqual([
+                { kind: 'string', value: `"quotes '\`\\""`, location: { start: 0, end: 12 } },
+                { kind: 'eof', value: '', location: { start: 13, end: 13 } }
+            ])
+        })
+
+        it('unterminated', () => {
+            expect(tokenize(`"string 123 ok\n`)).toEqual([
+                { kind: 'unterminated-string', value: `"string 123 ok`, location: { start: 0, end: 13 } },
+                { kind: 'newline', value: `\n`, location: { start: 14, end: 14 } },
+                { kind: 'eof', value: '', location: { start: 15, end: 15 } }
+            ])
+        })
+    })

it('tokenize expression', () => {
98 changes: 53 additions & 45 deletions src/lexer/lexer.ts
@@ -113,6 +113,9 @@ export const constTokenKindMap: Map<TokenKind, string> = new Map([

const intRegex = /^\d+/
const floatRegex = /^((\d+(\.\d*)?e[+-]?\d+)|(\d+\.\d*)|(\d*\.\d+))/
+const singleCharRegex = /(([^\\\n\r])|(\\[abtnvfr\\'"])|(\\u{[0-9a-fA-F]{1,4}}))/
+const charRegex = new RegExp(`^'((\\')|` + singleCharRegex.source + `)'`)
+const stringRegex = new RegExp(`^"((\\")|` + singleCharRegex.source + `)+"`)

/**
* Independent tokens are automatically advanced by parser by default
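
As a rough sanity check of the regexes added in this hunk (a standalone snippet, not code from the commit), they can be exercised directly; the expected results mirror cases from lexer.spec.ts:

// The three definitions are duplicated from the diff above so this snippet runs on its own.
const singleCharRegex = /(([^\\\n\r])|(\\[abtnvfr\\'"])|(\\u{[0-9a-fA-F]{1,4}}))/
const charRegex = new RegExp(`^'((\\')|` + singleCharRegex.source + `)'`)
const stringRegex = new RegExp(`^"((\\")|` + singleCharRegex.source + `)+"`)

console.log(charRegex.test(`'?'`))                    // true:  plain char
console.log(charRegex.test(`'\\n'`))                  // true:  escaped newline
console.log(charRegex.test(`'h`))                     // false: unterminated, falls back to the unterminated-char path
console.log(stringRegex.test(`"escape\\n \\r \\\\"`)) // true:  string with escape sequences
console.log(stringRegex.test(`"\\""`))                // true:  escaped quote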
@@ -264,66 +267,71 @@ const parseInt = (chars: string[], tokens: ParseToken[], pos: { pos: number }):
     return true
 }
 
-/**
- * TODO: escape characters
- * TODO: UTF characters
- *
- * @param chars
- * @param tokens
- * @param pos
- */
 const parseCharLiteral = (chars: string[], tokens: ParseToken[], pos: { pos: number }): boolean => {
     const quote = `'`
-    if (chars[pos.pos] === quote) {
-        const start = pos.pos
-        pos.pos++
-        const charLiteral: string[] = []
-        while (chars[pos.pos] !== quote) {
-            if (isNewline(chars[pos.pos]) || pos.pos === chars.length) {
-                pos.pos++
-                tokens.push(createToken('unterminated-char', quote + charLiteral.join(''), pos, start))
-                return true
-            }
-            charLiteral.push(chars[pos.pos])
-            pos.pos++
-        }
-        pos.pos++
-        // TODO: verify literal
-        tokens.push(createToken('char', quote + charLiteral.join('') + quote, pos, start))
-        return true
-    }
-    return false
-}
+    if (chars[pos.pos] !== quote) return false
+
+    const leftCode = chars.slice(pos.pos).join('')
+    const match = leftCode.match(charRegex)
+    if (match) {
+        const start = pos.pos
+        const char = match[0]
+        pos.pos += char.length
+        tokens.push(createToken('char', char, pos, start))
+    } else {
+        parseUnterminatedChar(chars, tokens, pos)
+    }
+
+    return true
+}
+
+const parseUnterminatedChar = (chars: string[], tokens: ParseToken[], pos: { pos: number }): void => {
+    const quote = `'`
+    const start = pos.pos
+    pos.pos++
+    const char: string[] = []
+    while (chars[pos.pos] !== quote) {
+        if (isNewline(chars[pos.pos]) || pos.pos === chars.length) {
+            pos.pos++
+            tokens.push(createToken('unterminated-char', quote + char.join(''), pos, start))
+            return
+        }
+        char.push(chars[pos.pos])
+        pos.pos++
+    }
+}
 
-/**
- * TODO: escape characters
- * TODO: UTF characters
- *
- * @param chars
- * @param tokens
- * @param pos
- */
 const parseStringLiteral = (chars: string[], tokens: ParseToken[], pos: { pos: number }): boolean => {
     const quote = '"'
-    if (chars[pos.pos] === quote) {
-        const start = pos.pos
-        pos.pos++
-        const stringLiteral: string[] = []
-        while (chars[pos.pos] !== quote) {
-            if (isNewline(chars[pos.pos]) || pos.pos === chars.length) {
-                pos.pos++
-                tokens.push(createToken('unterminated-string', quote + stringLiteral.join(''), pos, start))
-                return true
-            }
-            stringLiteral.push(chars[pos.pos])
-            pos.pos++
-        }
-        pos.pos++
-        // TODO: verify literal
-        tokens.push(createToken('string', quote + stringLiteral.join('') + quote, pos, start))
-        return true
-    }
-    return false
-}
+    if (chars[pos.pos] !== quote) return false
+
+    const leftCode = chars.slice(pos.pos).join('')
+    const match = leftCode.match(stringRegex)
+    if (match) {
+        const start = pos.pos
+        const str = match[0]
+        pos.pos += str.length
+        tokens.push(createToken('string', str, pos, start))
+    } else {
+        parseUnterminatedString(chars, tokens, pos)
+    }
+
+    return true
+}
+
+const parseUnterminatedString = (chars: string[], tokens: ParseToken[], pos: { pos: number }): void => {
+    const quote = '"'
+    const start = pos.pos
+    pos.pos++
+    const str: string[] = []
+    while (chars[pos.pos] !== quote) {
+        if (isNewline(chars[pos.pos]) || pos.pos === chars.length) {
+            tokens.push(createToken('unterminated-string', quote + str.join(''), pos, start))
+            return
+        }
+        str.push(chars[pos.pos])
+        pos.pos++
+    }
+}

const createToken = (
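
For orientation, both new literal parsers follow the same approach: try an anchored regex at the current position, emit a token for the full match and advance by its length, and otherwise fall back to an 'unterminated' token. A minimal self-contained sketch of that approach follows; Tok and lexChar are simplified stand-ins, not the repository's actual types or API:

// Minimal sketch of the match-then-advance technique used by parseCharLiteral above.
// Token shape is reduced to the fields the spec file asserts on.
interface Tok { kind: string; value: string; location: { start: number; end: number } }

const singleChar = /(([^\\\n\r])|(\\[abtnvfr\\'"])|(\\u{[0-9a-fA-F]{1,4}}))/
const charRe = new RegExp(`^'((\\')|` + singleChar.source + `)'`)

const lexChar = (code: string, pos: number, tokens: Tok[]): number => {
    const match = code.slice(pos).match(charRe)
    if (match) {
        // regex matched a complete char literal: emit it and jump past it
        const value = match[0]
        tokens.push({ kind: 'char', value, location: { start: pos, end: pos + value.length - 1 } })
        return pos + value.length
    }
    // fallback: consume input until a quote, newline, or end of input and report it as unterminated
    let end = pos + 1
    while (end < code.length && code[end] !== `'` && code[end] !== '\n') end++
    tokens.push({ kind: 'unterminated-char', value: code.slice(pos, end), location: { start: pos, end } })
    return end
}

// usage: lexing `'\n'` (quote, backslash, n, quote) yields one char token spanning positions 0..3
const tokens: Tok[] = []
lexChar(`'\\n'`, 0, tokens)
console.log(tokens)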
