diff --git a/src/error.ts b/src/error.ts index 924222ac..ac6b445d 100644 --- a/src/error.ts +++ b/src/error.ts @@ -1,17 +1,20 @@ import { TokenName } from './parser/parser' import { indexToLocation, LocationRange, prettyIndex, prettyLocation } from './location' import { Source } from './source' +import { LexerToken } from './lexer/lexer' export interface SyntaxErrorInfo { - tokenChain: TokenName[], expected: TokenName[], got: TokenName, location: LocationRange } +export const prettyLexerError = (token: LexerToken): string => { + return `lexer error: unknown token \`${token.value}\`` +} + export const prettySyntaxError = (error: SyntaxErrorInfo): string => { - const chain = error.tokenChain.filter(n => !n.endsWith('_')).slice(-2).join('/') - return `syntax error: expected \`${error.expected}\`, got \`${error.got}\`, while parsing \`${chain}\`` + return `syntax error: expected \`${error.expected}\`, got \`${error.got}\`` } export const prettySourceMessage = (message: string, index: number, source: Source): string => { diff --git a/src/index.ts b/src/index.ts index 296fab3d..a16dbc17 100644 --- a/src/index.ts +++ b/src/index.ts @@ -2,7 +2,8 @@ import { compactToken, flattenToken, parse } from './parser/parser' import { tokenize } from './lexer/lexer' import { readFileSync } from 'fs' import { join, resolve } from 'path' -import { prettySourceMessage, prettySyntaxError } from './error' +import { prettyLexerError, prettySourceMessage, prettySyntaxError } from './error' + const version = JSON.parse(readFileSync(join(__dirname, '..', 'package.json')).toString()).version @@ -16,10 +17,14 @@ if (!path) { console.log(usage) process.exit() } - const source = { str: readFileSync(resolve(path)).toString(), filename: path } -const token = parse(tokenize(source.str)) +const tokens = tokenize(source.str) +if ('name' in tokens) { + console.error(prettySourceMessage(prettyLexerError(tokens), tokens.location.start, source)) + process.exit(1) +} +const token = parse(tokens) if ('expected' in token) { console.error(prettySourceMessage(prettySyntaxError(token), token.location.start, source)) process.exit(1) diff --git a/src/lexer/lexer.spec.ts b/src/lexer/lexer.spec.ts index fd031fe3..c43c5195 100644 --- a/src/lexer/lexer.spec.ts +++ b/src/lexer/lexer.spec.ts @@ -1,13 +1,23 @@ -import { tokenize } from './lexer' +import { LexerToken, tokenize } from './lexer' import { expect } from '@jest/globals' +import { prettyLexerError } from '../error' describe('lexer', () => { + + const testTokenize = (code: string): LexerToken[] => { + const tokens = tokenize(code) + if ('name' in tokens) { + throw Error(prettyLexerError(tokens)) + } + return tokens + } + it('tokenize basic', () => { const code = `\ let main = (): Unit { print(4) }` - const tokens = tokenize(code) + const tokens = testTokenize(code) expect(tokens.map(t => [t.name, t.value])).toEqual([ ['let-keyword_', 'let'], ['identifier', 'main'], @@ -29,28 +39,28 @@ let main = (): Unit { }) it('tokenize number literal simple', () => { - expect(tokenize('14')).toEqual([ + expect(testTokenize('14')).toEqual([ { name: 'number', value: '14', location: { start: 0, end: 1 } }, { name: 'eof', value: '', location: { start: 2, end: 2 } } ]) }) it('tokenize string literal', () => { - expect(tokenize(`"string 123 \n ok"`)).toEqual([ + expect(testTokenize(`"string 123 \n ok"`)).toEqual([ { name: 'string', value: `"string 123 \n ok"`, location: { start: 0, end: 16 } }, { name: 'eof', value: '', location: { start: 17, end: 17 } } ]) }) it('tokenize char literal', () => { - expect(tokenize(`'?'`)).toEqual([ + expect(testTokenize(`'?'`)).toEqual([ { name: 'char', value: `'?'`, location: { start: 0, end: 2 } }, { name: 'eof', value: '', location: { start: 3, end: 3 } } ]) }) it('tokenize expression', () => { - const tokens = tokenize(`1+call("str").ok() / (12 - a())`) + const tokens = testTokenize(`1+call("str").ok() / (12 - a())`) expect(tokens.map(t => [t.name, t.value])).toEqual([ ['number', '1'], ['plus', '+'], diff --git a/src/lexer/lexer.ts b/src/lexer/lexer.ts index 2a9da2cc..75ed9341 100644 --- a/src/lexer/lexer.ts +++ b/src/lexer/lexer.ts @@ -40,7 +40,6 @@ export const lexerTokenNames = [ 'colon_', 'comma_', 'equals_', - 'semicolon_', // dynamic 'identifier', @@ -102,7 +101,7 @@ export const isWhitespace = (char: string): boolean => char === ' ' || char === export const isNewline = (char: string): boolean => char === '\n' || char === '\r' -export const tokenize = (code: String): LexerToken[] => { +export const tokenize = (code: String): LexerToken[] | LexerToken => { const pos = { pos: 0 } const chars = code.split('') const tokens: LexerToken[] = [] @@ -127,11 +126,11 @@ export const tokenize = (code: String): LexerToken[] => { if (parseStringLiteral(chars, tokens, pos)) { continue } - throw Error(`unknown token \`${chars[pos.pos]}\``) + return createToken('unknown', chars[pos.pos], pos) } pos.pos++ - tokens.push(createToken('eof', '', pos)) + tokens.push(createToken('eof', '', pos, pos.pos - 1)) return tokens } @@ -244,7 +243,7 @@ const parseStringLiteral = (chars: string[], tokens: LexerToken[], pos: { pos: n const createToken = ( name: LexerTokenName, value: string, pos: { pos: number }, - start: number = pos.pos - 1 + start: number = pos.pos ): LexerToken => { return { name, value, location: { start, end: pos.pos - 1 } } } diff --git a/src/parser/locate.ts b/src/parser/locate.ts new file mode 100644 index 00000000..d333011f --- /dev/null +++ b/src/parser/locate.ts @@ -0,0 +1,50 @@ +import { LexerTokenName } from '../lexer/lexer' +import { ParseBranch, ParserTokenName, rules, TokenName, Transform } from './parser' + +export const firstTokens = (token: TokenName): Set => { + const rule = rules.get(token) + if (rule) { + return new Set(rule.branches.flatMap(b => [...transformFirstTokens({ + name: token, + branch: b + })])) + } else { + return new Set([token]) + } +} + +const transformFirstTokens = (transform: Transform, index: number = 0): Set => { + if (index >= transform.branch.length) { + return new Set() + } + const tokens: Set = new Set() + const t = transform.branch[index] + const rule = rules.get(t) + if (rule) { + if (canMatchEmpty(rule.name)) { + transformFirstTokens(transform, index + 1).forEach(t => tokens.add(t)) + } + firstTokens(rule.name).forEach(t => tokens.add(t)) + } else { + tokens.add(t) + } + return tokens +} + +const canMatchEmpty = (token: TokenName): boolean => { + const rule = rules.get(token) + if (rule) { + return rule.branches.some(b => branchCanMatchEmpty(b)) + } else { + return token === 'e' + } +} + +const branchCanMatchEmpty = (branch: ParseBranch): boolean => { + for (let t of branch) { + if (!canMatchEmpty(t)) { + return false + } + } + return true +} diff --git a/src/parser/parser.spec.ts b/src/parser/parser.spec.ts index be72bd21..a86a3c16 100644 --- a/src/parser/parser.spec.ts +++ b/src/parser/parser.spec.ts @@ -1,12 +1,15 @@ import { tokenize } from '../lexer/lexer' import { expect } from '@jest/globals' import { compactToken, flattenToken, parse, ParserTokenName, Token } from './parser' -import { prettySyntaxError } from '../error' +import { prettyLexerError, prettySyntaxError } from '../error' describe('parser', () => { const parseToken = (source: string, root: ParserTokenName = 'program'): Token => { const tokens = tokenize(source) + if ('name' in tokens) { + throw Error(prettyLexerError(tokens)) + } const token = parse(tokens, root) if ('expected' in token) { throw Error(prettySyntaxError(token)) diff --git a/src/parser/parser.ts b/src/parser/parser.ts index 744d8bd0..4f91b52a 100644 --- a/src/parser/parser.ts +++ b/src/parser/parser.ts @@ -3,6 +3,7 @@ import { readFileSync } from 'fs' import { join } from 'path' import { SyntaxErrorInfo } from '../error' import { LocationRange } from '../location' +import { firstTokens } from './locate' export const parserTokenNames = [ 'program', @@ -53,25 +54,29 @@ export type ParseBranch = TokenName[] const rawRules = JSON.parse(readFileSync(join(__dirname, '..', 'grammar.json')).toString()).rules export const rules: Map = new Map(rawRules.map((r: Rule) => [r.name, r])) +export const firstTokensMap: Map> = + new Map([...rules.keys()].map((n) => [n, firstTokens(n)])) + export const parse = (tokens: LexerToken[], node: TokenName = 'program'): Token | SyntaxErrorInfo => { const token = parseToken(tokens, node) if (token === true) { - return { tokenChain: [node], expected: [node], got: tokens[0].name, location: tokens[0].location } + return { expected: [node], got: tokens[0].name, location: tokens[0].location } } if ('expected' in token) { return token } + console.dir(compactToken(flattenToken(token)), { depth: null, colors: true, compact: true }) if (tokenSize(token) === tokens.length - 1) { return token } else { - return { tokenChain: [node], expected: ['eof'], got: tokens[0].name, location: tokens[0].location } + const lastParsed = tokens[tokenSize(token)] + return { expected: ['eof'], got: lastParsed.name, location: lastParsed.location } } } export const parseToken = (tokens: LexerToken[], node: TokenName = 'program', - index: number = 0, - tokenChain: TokenName[] = [node] + index: number = 0 ): Token | SyntaxErrorInfo | true => { const rule = rules.get(node)! if (rule) { @@ -79,7 +84,7 @@ export const parseToken = (tokens: LexerToken[], for (const branch of rule.branches) { if (isEmptyBranch(branch)) return true const transform = { name: node, branch } - const branchToken = parseTransform(transform, tokens, index, tokenChain) + const branchToken = parseTransform(transform, tokens, index) if ('name' in branchToken) { return branchToken } else { @@ -91,7 +96,6 @@ export const parseToken = (tokens: LexerToken[], return syntaxError! } else { const error: SyntaxErrorInfo = { - tokenChain, expected: [node], got: tokens[index].name, location: tokens[index].location @@ -103,11 +107,10 @@ export const parseToken = (tokens: LexerToken[], const parseTransform = (transform: Transform, tokens: LexerToken[], index: number, - tokenChain: TokenName[] ): Token | SyntaxErrorInfo => { const nodes = [] for (const branchTokenName of transform.branch) { - const branchToken = parseToken(tokens, branchTokenName, index, [...tokenChain, transform.name]) + const branchToken = parseToken(tokens, branchTokenName, index) if (branchToken === true) continue if ('expected' in branchToken) return branchToken nodes.push(branchToken)