Skip to content

Commit

Permalink
fix: capture groups with quantifiers are not repeated
Browse files Browse the repository at this point in the history
fixes: 31
  • Loading branch information
ColinEberhardt committed Feb 9, 2021
1 parent bead49e commit 84027b8
Show file tree
Hide file tree
Showing 8 changed files with 170 additions and 46 deletions.
129 changes: 110 additions & 19 deletions assembly/__spec_tests__/generated.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -373,17 +373,41 @@ it("line: 51 - matches ^(b+?|a){1,2}?c against 'bc'", () => {
expect(match.matches[0]).toBe("bc".substring(0, 2));
expect(match.matches[1]).toBe("bc".substring(0, 1));
});
xit("line: 52 - issues with repeated capture groups", () => {});
xit("line: 53 - issues with repeated capture groups", () => {});
xit("line: 54 - issues with repeated capture groups", () => {});
xit("line: 55 - issues with repeated capture groups", () => {});
// Cases 52-56: lazy range repetition of a capture group, pattern ^(b+?|a){1,2}?c.
// matches[0] is the overall match, matches[1] the value reported for group 1.
it("line: 52 - matches ^(b+?|a){1,2}?c against 'bbc'", () => {
  const subject = "bbc";
  const result = exec("^(b+?|a){1,2}?c", subject, "s");
  expect(result.matches[0]).toBe(subject.substring(0, 3));
  expect(result.matches[1]).toBe(subject.substring(1, 2));
});
it("line: 53 - matches ^(b+?|a){1,2}?c against 'bbbc'", () => {
  const subject = "bbbc";
  const result = exec("^(b+?|a){1,2}?c", subject, "s");
  expect(result.matches[0]).toBe(subject.substring(0, 4));
  expect(result.matches[1]).toBe(subject.substring(1, 3));
});
it("line: 54 - matches ^(b+?|a){1,2}?c against 'bac'", () => {
  const subject = "bac";
  const result = exec("^(b+?|a){1,2}?c", subject, "s");
  expect(result.matches[0]).toBe(subject.substring(0, 3));
  expect(result.matches[1]).toBe(subject.substring(1, 2));
});
it("line: 55 - matches ^(b+?|a){1,2}?c against 'bbac'", () => {
  const subject = "bbac";
  const result = exec("^(b+?|a){1,2}?c", subject, "s");
  expect(result.matches[0]).toBe(subject.substring(0, 4));
  expect(result.matches[1]).toBe(subject.substring(2, 3));
});
it("line: 56 - matches ^(b+?|a){1,2}?c against 'aac'", () => {
  const subject = "aac";
  const result = exec("^(b+?|a){1,2}?c", subject, "s");
  expect(result.matches[0]).toBe(subject.substring(0, 3));
  expect(result.matches[1]).toBe(subject.substring(1, 2));
});
xit("line: 57 - issues with repeated capture groups", () => {});
xit("line: 58 - issues with repeated capture groups", () => {});
// Cases 57-59: same lazy pattern ^(b+?|a){1,2}?c against longer subjects,
// followed by one subject that must not match.
it("line: 57 - matches ^(b+?|a){1,2}?c against 'abbbbbbbbbbbc'", () => {
  const subject = "abbbbbbbbbbbc";
  const result = exec("^(b+?|a){1,2}?c", subject, "s");
  expect(result.matches[0]).toBe(subject.substring(0, 13));
  expect(result.matches[1]).toBe(subject.substring(1, 12));
});
it("line: 58 - matches ^(b+?|a){1,2}?c against 'bbbbbbbbbbbac'", () => {
  const subject = "bbbbbbbbbbbac";
  const result = exec("^(b+?|a){1,2}?c", subject, "s");
  expect(result.matches[0]).toBe(subject.substring(0, 13));
  expect(result.matches[1]).toBe(subject.substring(11, 12));
});
it("line: 59 - matches ^(b+?|a){1,2}?c against 'aaac'", () => {
  const rejected = ["aaac"];
  expectNotMatch("^(b+?|a){1,2}?c", rejected);
});
Expand All @@ -400,12 +424,36 @@ it("line: 62 - matches ^(b+|a){1,2}c against 'bbc'", () => {
expect(match.matches[0]).toBe("bbc".substring(0, 3));
expect(match.matches[1]).toBe("bbc".substring(0, 2));
});
xit("line: 63 - issues with repeated capture groups", () => {});
xit("line: 64 - issues with repeated capture groups", () => {});
xit("line: 65 - issues with repeated capture groups", () => {});
xit("line: 66 - issues with repeated capture groups", () => {});
xit("line: 67 - issues with repeated capture groups", () => {});
xit("line: 68 - issues with repeated capture groups", () => {});
// Cases 63-69: greedy range repetition of a capture group, pattern ^(b+|a){1,2}c.
// matches[0] is the overall match, matches[1] the value reported for group 1.
it("line: 63 - matches ^(b+|a){1,2}c against 'bbbc'", () => {
  const subject = "bbbc";
  const result = exec("^(b+|a){1,2}c", subject, "s");
  expect(result.matches[0]).toBe(subject.substring(0, 4));
  expect(result.matches[1]).toBe(subject.substring(0, 3));
});
it("line: 64 - matches ^(b+|a){1,2}c against 'bac'", () => {
  const subject = "bac";
  const result = exec("^(b+|a){1,2}c", subject, "s");
  expect(result.matches[0]).toBe(subject.substring(0, 3));
  expect(result.matches[1]).toBe(subject.substring(1, 2));
});
it("line: 65 - matches ^(b+|a){1,2}c against 'bbac'", () => {
  const subject = "bbac";
  const result = exec("^(b+|a){1,2}c", subject, "s");
  expect(result.matches[0]).toBe(subject.substring(0, 4));
  expect(result.matches[1]).toBe(subject.substring(2, 3));
});
it("line: 66 - matches ^(b+|a){1,2}c against 'aac'", () => {
  const subject = "aac";
  const result = exec("^(b+|a){1,2}c", subject, "s");
  expect(result.matches[0]).toBe(subject.substring(0, 3));
  expect(result.matches[1]).toBe(subject.substring(1, 2));
});
it("line: 67 - matches ^(b+|a){1,2}c against 'abbbbbbbbbbbc'", () => {
  const subject = "abbbbbbbbbbbc";
  const result = exec("^(b+|a){1,2}c", subject, "s");
  expect(result.matches[0]).toBe(subject.substring(0, 13));
  expect(result.matches[1]).toBe(subject.substring(1, 12));
});
it("line: 68 - matches ^(b+|a){1,2}c against 'bbbbbbbbbbbac'", () => {
  const subject = "bbbbbbbbbbbac";
  const result = exec("^(b+|a){1,2}c", subject, "s");
  expect(result.matches[0]).toBe(subject.substring(0, 13));
  expect(result.matches[1]).toBe(subject.substring(11, 12));
});
it("line: 69 - matches ^(b+|a){1,2}c against 'aaac'", () => {
  const rejected = ["aaac"];
  expectNotMatch("^(b+|a){1,2}c", rejected);
});
Expand All @@ -417,8 +465,16 @@ it("line: 71 - matches ^(b+|a){1,2}?bc against 'bbc'", () => {
expect(match.matches[0]).toBe("bbc".substring(0, 3));
expect(match.matches[1]).toBe("bbc".substring(0, 1));
});
xit("line: 72 - issues with repeated capture groups", () => {});
xit("line: 73 - issues with repeated capture groups", () => {});
// Cases 72-73: lazy range repetition where the group alternatives overlap,
// pattern ^(b*|ba){1,2}?bc.
it("line: 72 - matches ^(b*|ba){1,2}?bc against 'babc'", () => {
  const subject = "babc";
  const result = exec("^(b*|ba){1,2}?bc", subject, "s");
  expect(result.matches[0]).toBe(subject.substring(0, 4));
  expect(result.matches[1]).toBe(subject.substring(0, 2));
});
it("line: 73 - matches ^(b*|ba){1,2}?bc against 'bbabc'", () => {
  const subject = "bbabc";
  const result = exec("^(b*|ba){1,2}?bc", subject, "s");
  expect(result.matches[0]).toBe(subject.substring(0, 5));
  expect(result.matches[1]).toBe(subject.substring(1, 3));
});
it("line: 74 - matches ^(b*|ba){1,2}?bc against 'bababc'", () => {
const match = exec("^(b*|ba){1,2}?bc", "bababc", "s");
expect(match.matches[0]).toBe("bababc".substring(0, 6));
Expand All @@ -435,7 +491,11 @@ it("line: 77 - matches ^(ba|b*){1,2}?bc against 'babc'", () => {
expect(match.matches[0]).toBe("babc".substring(0, 4));
expect(match.matches[1]).toBe("babc".substring(0, 2));
});
xit("line: 78 - issues with repeated capture groups", () => {});
// Case 78: pattern ^(ba|b*){1,2}?bc, the alternatives of case 73 in reverse order.
it("line: 78 - matches ^(ba|b*){1,2}?bc against 'bbabc'", () => {
  const subject = "bbabc";
  const result = exec("^(ba|b*){1,2}?bc", subject, "s");
  expect(result.matches[0]).toBe(subject.substring(0, 5));
  expect(result.matches[1]).toBe(subject.substring(1, 3));
});
it("line: 79 - matches ^(ba|b*){1,2}?bc against 'bababc'", () => {
const match = exec("^(ba|b*){1,2}?bc", "bababc", "s");
expect(match.matches[0]).toBe("bababc".substring(0, 6));
Expand Down Expand Up @@ -1199,8 +1259,32 @@ it("line: 261 - matches ^From +([^ ]+) +[a-zA-Z][a-zA-Z][a-zA-Z] +[a-zA-Z][a-zA-
"From abcd Mon Sep 01 12:33:02 1997".substring(5, 9)
);
});
xit("line: 262 - issues with repeated capture groups", () => {});
xit("line: 263 - issues with repeated capture groups", () => {});
// Case 262: a group repeated exactly twice via {2}.
// The subject has TWO spaces between "abcd" and "Mon". The offsets asserted below
// (overall match 0..27, group 15..19 == "Sep ") only hold with that spacing; with a
// single space the overall match would be 26 characters long.
it("line: 262 - matches ^From\\s+\\S+\\s+([a-zA-Z]{3}\\s+){2}\\d{1,2}\\s+\\d\\d:\\d\\d against 'From abcd  Mon Sep 01 12:33:02 1997'", () => {
  const match = exec(
    "^From\\s+\\S+\\s+([a-zA-Z]{3}\\s+){2}\\d{1,2}\\s+\\d\\d:\\d\\d",
    "From abcd  Mon Sep 01 12:33:02 1997",
    "s"
  );
  expect(match.matches[0]).toBe(
    "From abcd  Mon Sep 01 12:33:02 1997".substring(0, 27)
  );
  expect(match.matches[1]).toBe(
    "From abcd  Mon Sep 01 12:33:02 1997".substring(15, 19)
  );
});
// Case 263: a group repeated exactly twice via {2}, with a single-digit day.
// The subject has TWO spaces between "abcd" and "Mon" and TWO spaces between "Sep"
// and "1". The offsets asserted below (overall match 0..27, group 15..20 == "Sep  ")
// only hold with that spacing.
it("line: 263 - matches ^From\\s+\\S+\\s+([a-zA-Z]{3}\\s+){2}\\d{1,2}\\s+\\d\\d:\\d\\d against 'From abcd  Mon Sep  1 12:33:02 1997'", () => {
  const match = exec(
    "^From\\s+\\S+\\s+([a-zA-Z]{3}\\s+){2}\\d{1,2}\\s+\\d\\d:\\d\\d",
    "From abcd  Mon Sep  1 12:33:02 1997",
    "s"
  );
  expect(match.matches[0]).toBe(
    "From abcd  Mon Sep  1 12:33:02 1997".substring(0, 27)
  );
  expect(match.matches[1]).toBe(
    "From abcd  Mon Sep  1 12:33:02 1997".substring(15, 20)
  );
});
it("line: 264 - matches ^From\\s+\\S+\\s+([a-zA-Z]{3}\\s+){2}\\d{1,2}\\s+\\d\\d:\\d\\d against 'From abcd Sep 01 12:33:02 1997'", () => {
expectNotMatch(
"^From\\s+\\S+\\s+([a-zA-Z]{3}\\s+){2}\\d{1,2}\\s+\\d\\d:\\d\\d",
Expand Down Expand Up @@ -2089,8 +2173,15 @@ it("line: 1390 - matches ^[abc]{12} against 'abcabcabcabc'", () => {
const match = exec("^[abc]{12}", "abcabcabcabc", "s");
expect(match.matches[0]).toBe("abcabcabcabc".substring(0, 12));
});
xit("line: 1391 - issues with repeated capture groups", () => {});
xit("line: 1392 - issues with repeated capture groups", () => {});
// Cases 1391-1392: exact {12} repetition, first of a character class (no capture
// group), then of a capturing alternation.
it("line: 1391 - matches ^[a-c]{12} against 'abcabcabcabc'", () => {
  const subject = "abcabcabcabc";
  const result = exec("^[a-c]{12}", subject, "s");
  expect(result.matches[0]).toBe(subject.substring(0, 12));
});
it("line: 1392 - matches ^(a|b|c){12} against 'abcabcabcabc '", () => {
  const subject = "abcabcabcabc ";
  const result = exec("^(a|b|c){12}", subject, "s");
  expect(result.matches[0]).toBe(subject.substring(0, 12));
  expect(result.matches[1]).toBe(subject.substring(11, 12));
});
it("line: 1393 - matches ^[abcdefghijklmnopqrstuvwxy0123456789] against 'n'", () => {
const match = exec("^[abcdefghijklmnopqrstuvwxy0123456789]", "n", "s");
expect(match.matches[0]).toBe("n".substring(0, 1));
Expand Down
6 changes: 6 additions & 0 deletions assembly/__tests__/capture-group.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,9 @@ it("repeated capture groups should return the last match", () => {
expect(match.matches[0]).toBe("ac");
expect(match.matches[1]).toBe("c");
});

// A group under an exact {2} repetition: matches[1] is expected to be the second
// character of the input, not the first.
it("range repitition capture groups should return the last match", () => {
  const result = exec("([a-c]){2}", "ac");
  const wholeMatch = result.matches[0];
  const groupValue = result.matches[1];
  expect(wholeMatch).toBe("ac");
  expect(groupValue).toBe("c");
});
8 changes: 4 additions & 4 deletions assembly/nfa/nfa.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ export class GroupStartMarkerState extends State {
// captures from the path through the NFA that reaches the end are flagged
flagged: bool = false;

constructor(next: State) {
constructor(next: State, public id: i32) {
super();
this.transitions.push(next);
}
Expand Down Expand Up @@ -162,10 +162,10 @@ function oneOrMore(nfa: Automata, greedy: bool): Automata {
return new Automata(start, end);
}

function group(nfa: Automata): Automata {
function group(nfa: Automata, id: i32): Automata {
// groups are implemented by wrapping the automata with
// a pair of markers that record matches
const startMarker = new GroupStartMarkerState(nfa.start);
const startMarker = new GroupStartMarkerState(nfa.start, id);
const end = new State();
const endMarker = new GroupEndMarkerState(end, startMarker);
nfa.end.transitions.push(endMarker);
Expand Down Expand Up @@ -236,7 +236,7 @@ class AutomataFactor {
);
case NodeType.Group: {
const node = expression as GroupNode;
return group(this.automataForNode(node.expression));
return group(this.automataForNode(node.expression), node.id);
}
case NodeType.Assertion:
return Automata.fromEpsilon();
Expand Down
9 changes: 7 additions & 2 deletions assembly/parser/node.ts
Original file line number Diff line number Diff line change
Expand Up @@ -206,17 +206,22 @@ export class AlternationNode extends Node {
}
}

let _id = 0;

export class GroupNode extends Node {
constructor(public expression: Node) {
constructor(public expression: Node, public id: i32 = -1) {
super(NodeType.Group);
if (id == -1) {
this.id = _id++;
}
}

children(): Node[] {
return [this.expression];
}

clone(): Node {
return new GroupNode(this.expression.clone());
return new GroupNode(this.expression.clone(), this.id);
}

replace(node: Node, replacement: Node): void {
Expand Down
7 changes: 5 additions & 2 deletions assembly/parser/walker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,11 @@ export function expandRepetitions(visitor: NodeVisitor): void {
// create multiple clones
const clones = new Array<Node>(from);
// a{4} => aaaa
for (let i = 0; i < from; i++) {
clones[i] = expression.clone();
if (from > 0) {
clones[0] = expression;
for (let i = 1; i < from; i++) {
clones[i] = expression.clone();
}
}

if (rangeRepNode.to == -1) {
Expand Down
35 changes: 32 additions & 3 deletions assembly/regexp.ts
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,30 @@ export class Flags {
}
}

/**
 * Collapses the group start markers into one capture value per capture group.
 *
 * Capture groups are implemented as GroupStart / GroupEnd states that record
 * (capture) the part of the string being matched that lies between them.
 * Repeated capture groups, via range repetitions (e.g. {2,3}), share the same
 * `id`, so several markers can describe the same group. Only one value is
 * reported for each run of markers with equal ids.
 *
 * Unflagged markers always contribute "", so the result does not depend on the
 * caller having blanked their `capture` beforehand.
 *
 * NOTE(review): markers with the same id are assumed to be adjacent in
 * `groupMarkers` - confirm against the code that collects them.
 *
 * @param groupMarkers start markers of the groups, in reporting order
 * @returns for each run of equal ids, the capture of the last flagged marker in
 *   that run, or "" when no marker in the run is flagged
 */
function filterCaptures(groupMarkers: GroupStartMarkerState[]): string[] {
  const values: string[] = [];
  let currentId = 0;
  for (let i = 0; i < groupMarkers.length; i++) {
    const gm = groupMarkers[i];
    if (i == 0 || gm.id != currentId) {
      // first marker of a new group: open a slot for it
      currentId = gm.id;
      values.push(gm.flagged ? gm.capture : "");
    } else if (gm.flagged) {
      // a later repetition of the same group took part in the match: it wins
      values[values.length - 1] = gm.capture;
    }
  }
  return values;
}

export class RegExp {
lastIndex: i32 = 0;
private flags: Flags;
Expand Down Expand Up @@ -143,15 +167,20 @@ export class RegExp {
this.nfa.start,
str.substr(matchIndex)
);

// we have found a match
if (matchStr != null) {
// remove any non-flagged captures
groupMarkers.forEach((gm) => {
gm.capture = gm.flagged ? gm.capture : "";
});

const match = new Match(
[matchStr!].concat(
groupMarkers.map<string>((m) => (m.flagged ? m.capture : ""))
),
[matchStr!].concat(filterCaptures(groupMarkers)),
matchIndex,
str
);

// return this match (checking end of input condition)
const matchEndIndex = match.index + match.matches[0].length;
if (!this.endOfInput || (this.endOfInput && matchEndIndex == len)) {
Expand Down
13 changes: 0 additions & 13 deletions spec/test-generator.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,6 @@ const knownIssues = {
...range(487, 494),
...range(1077, 1082),
],
"issues with repeated capture groups": [
262,
263,
...range(63, 68),
1391,
1392,
...range(52, 55),
57,
58,
72,
73,
78,
],
"lazy quantifiers should still yield the longest overall regex match": [
...range(141, 143),
1288,
Expand Down
9 changes: 6 additions & 3 deletions ts/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@ globalAny.log = console.log;

import { RegExp } from "../assembly/regexp";

const regexObj = new RegExp("ba{0}b");
const match = regexObj.exec("bb");
const regexObj = new RegExp("^(a){1,3}");
const match = regexObj.exec("abc");
console.log(JSON.stringify(match, null, 2));

console.log(match);
const regexObj2 = new RegExp("(a|b)c|a(b|c)");
const match2 = regexObj2.exec("ab");
console.log(JSON.stringify(match2, null, 2));

0 comments on commit 84027b8

Please sign in to comment.