Skip to content

Commit

Permalink
Merge pull request #275 from lake-of-fire/master
Browse files Browse the repository at this point in the history
Optimizations
  • Loading branch information
scinfu authored Jul 26, 2024
2 parents 1c537b7 + 86f4189 commit 91cc703
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 55 deletions.
97 changes: 57 additions & 40 deletions Sources/CharacterReader.swift
Original file line number Diff line number Diff line change
Expand Up @@ -43,18 +43,18 @@ public final class CharacterReader {
return CharacterReader.EOF
}
let val = input[pos]
pos = input.index(after: pos)
input.formIndex(after: &pos)
return val
}

public func unconsume() {
guard pos > input.startIndex else { return }
pos = input.index(before: pos)
input.formIndex(before: &pos)
}

public func advance() {
guard pos < input.endIndex else { return }
pos = input.index(after: pos)
input.formIndex(after: &pos)
}

public func markPos() {
Expand All @@ -65,12 +65,12 @@ public final class CharacterReader {
pos = mark
}

public func consumeAsString() -> String {
guard pos < input.endIndex else { return "" }
let str = String(input[pos])
pos = input.index(after: pos)
return str
}
// public func consumeAsString() -> String {
// guard pos < input.endIndex else { return "" }
// let str = String(input[pos])
// input.formIndex(after: &pos)
// return str
// }

/**
* Locate the next occurrence of a Unicode scalar
Expand Down Expand Up @@ -100,7 +100,7 @@ public final class CharacterReader {
var current = firstCharIx
// Then manually match subsequent scalars
for scalar in targetScalars.dropFirst() {
current = input.index(after: current)
input.formIndex(after: &current)
guard current < input.endIndex else { return nil }
if input[current] != scalar {
start = input.index(after: firstCharIx)
Expand Down Expand Up @@ -130,36 +130,47 @@ public final class CharacterReader {
return consumed
}

public func consumeToAny(_ chars: UnicodeScalar...) -> String {
return consumeToAny(chars)
}
// public func consumeToAny(_ chars: UnicodeScalar...) -> String {
// return consumeToAny(Set(chars))
// }

public func consumeToAny(_ chars: [UnicodeScalar]) -> String {
// public func consumeToAny(_ chars: Set<UnicodeScalar>) -> String {
// let endIndex = input.endIndex
// let start = pos
// while pos < endIndex {
// if chars.contains(input[pos]) {
// break
// }
// input.formIndex(after: &pos)
// }
// return cacheString(start, pos)
// }

public func consumeToAny(_ chars: Set<UnicodeScalar>) -> String {
let start = pos
while pos < input.endIndex {
if chars.contains(input[pos]) {
break
}
pos = input.index(after: pos)
if let nextIndex = input[pos...].firstIndex(where: { chars.contains($0) }) {
pos = nextIndex
} else {
pos = input.endIndex
}
return cacheString(start, pos)
}

public func consumeToAnySorted(_ chars: UnicodeScalar...) -> String {
return consumeToAny(chars)
}

public func consumeToAnySorted(_ chars: [UnicodeScalar]) -> String {
// public func consumeToAnySorted(_ chars: UnicodeScalar...) -> String {
// return consumeToAny(chars)
// }

public func consumeToAnySorted(_ chars: Set<UnicodeScalar>) -> String {
return consumeToAny(chars)
}

static let dataTerminators: [UnicodeScalar] = [.Ampersand, .LessThan, TokeniserStateVars.nullScalr]
static let dataTerminators: Set<UnicodeScalar> = Set([.Ampersand, .LessThan, TokeniserStateVars.nullScalr])
// read to &, <, or null
public func consumeData() -> String {
return consumeToAny(CharacterReader.dataTerminators)
}

static let tagNameTerminators: [UnicodeScalar] = [.BackslashT, .BackslashN, .BackslashR, .BackslashF, .Space, .Slash, .GreaterThan, TokeniserStateVars.nullScalr]
static let tagNameTerminators: Set<UnicodeScalar> = Set([.BackslashT, .BackslashN, .BackslashR, .BackslashF, .Space, .Slash, .GreaterThan, TokeniserStateVars.nullScalr])
// read to '\t', '\n', '\r', '\f', ' ', '/', '>', or nullChar
public func consumeTagName() -> String {
return consumeToAny(CharacterReader.tagNameTerminators)
Expand All @@ -173,10 +184,11 @@ public final class CharacterReader {

public func consumeLetterSequence() -> String {
let start = pos
while pos < input.endIndex {
let endIndex = input.endIndex
while pos < endIndex {
let c = input[pos]
if ((c >= "A" && c <= "Z") || (c >= "a" && c <= "z") || c.isMemberOfCharacterSet(CharacterSet.letters)) {
pos = input.index(after: pos)
input.formIndex(after: &pos)
} else {
break
}
Expand All @@ -186,18 +198,19 @@ public final class CharacterReader {

public func consumeLetterThenDigitSequence() -> String {
let start = pos
while pos < input.endIndex {
let endIndex = input.endIndex
while pos < endIndex {
let c = input[pos]
if ((c >= "A" && c <= "Z") || (c >= "a" && c <= "z") || c.isMemberOfCharacterSet(CharacterSet.letters)) {
pos = input.index(after: pos)
input.formIndex(after: &pos)
} else {
break
}
}
while pos < input.endIndex {
while pos < endIndex {
let c = input[pos]
if (c >= "0" && c <= "9") {
pos = input.index(after: pos)
input.formIndex(after: &pos)
} else {
break
}
Expand All @@ -207,10 +220,11 @@ public final class CharacterReader {

public func consumeHexSequence() -> String {
let start = pos
while pos < input.endIndex {
let endIndex = input.endIndex
while pos < endIndex {
let c = input[pos]
if ((c >= "0" && c <= "9") || (c >= "A" && c <= "F") || (c >= "a" && c <= "f")) {
pos = input.index(after: pos)
input.formIndex(after: &pos)
} else {
break
}
Expand All @@ -220,10 +234,11 @@ public final class CharacterReader {

public func consumeDigitSequence() -> String {
let start = pos
while pos < input.endIndex {
let endIndex = input.endIndex
while pos < endIndex {
let c = input[pos]
if (c >= "0" && c <= "9") {
pos = input.index(after: pos)
input.formIndex(after: &pos)
} else {
break
}
Expand All @@ -239,14 +254,16 @@ public final class CharacterReader {
public func matches(_ seq: String, ignoreCase: Bool = false, consume: Bool = false) -> Bool {
var current = pos
let scalars = seq.unicodeScalars
let endIndex = input.endIndex
for scalar in scalars {
guard current < input.endIndex else { return false }
guard current < endIndex else { return false }
let c = input[current]
if ignoreCase {
guard input[current].uppercase == scalar.uppercase else { return false }
guard c.uppercase == scalar.uppercase else { return false }
} else {
guard input[current] == scalar else { return false }
guard c == scalar else { return false }
}
current = input.index(after: current)
input.formIndex(after: &current)
}
if consume {
pos = current
Expand Down
4 changes: 2 additions & 2 deletions Sources/Entities.swift
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ public class Entities {
return left.value != right.value
}

private static let codeDelims: [UnicodeScalar] = [",", ";"]
private static let codeDelims: Set<UnicodeScalar> = Set([",", ";"])

init(string: String, size: Int, id: Int) {

Expand Down Expand Up @@ -102,7 +102,7 @@ public class Entities {
var matches: [String] = []
while ix < entitiesByCodepoint.endIndex && entitiesByCodepoint[ix].scalar == codepoint {
matches.append(entitiesByCodepoint[ix].name)
ix = entitiesByCodepoint.index(after: ix)
entitiesByCodepoint.formIndex(after: &ix)
}
return matches.isEmpty ? nil : matches.sorted().last!
}
Expand Down
31 changes: 18 additions & 13 deletions Sources/TokeniserState.swift
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,15 @@ protocol TokeniserStateProtocol {
public class TokeniserStateVars {
public static let nullScalr: UnicodeScalar = "\u{0000}"

static let attributeSingleValueCharsSorted = ["'", UnicodeScalar.Ampersand, nullScalr].sorted()
static let attributeDoubleValueCharsSorted = ["\"", UnicodeScalar.Ampersand, nullScalr].sorted()
static let attributeNameCharsSorted = [UnicodeScalar.BackslashT, "\n", "\r", UnicodeScalar.BackslashF, " ", "/", "=", ">", nullScalr, "\"", "'", UnicodeScalar.LessThan].sorted()
static let attributeValueUnquoted = [UnicodeScalar.BackslashT, "\n", "\r", UnicodeScalar.BackslashF, " ", UnicodeScalar.Ampersand, ">", nullScalr, "\"", "'", UnicodeScalar.LessThan, "=", "`"].sorted()
// Stop characters for a single-quoted attribute value: "'", ampersand, NUL.
static let attributeSingleValueChars = Set(["'", UnicodeScalar.Ampersand, nullScalr])
// Stop characters for a double-quoted attribute value: "\"", ampersand, NUL.
static let attributeDoubleValueChars = Set(["\"", UnicodeScalar.Ampersand, nullScalr])
// Stop characters that end an attribute name run (whitespace, "/", "=", ">", NUL, quotes, "<").
static let attributeNameChars = Set([UnicodeScalar.BackslashT, "\n", "\r", UnicodeScalar.BackslashF, " ", "/", "=", ">", nullScalr, "\"", "'", UnicodeScalar.LessThan])
// Stop characters for an unquoted attribute value (whitespace, ampersand, ">", NUL, quotes, "<", "=", "`").
static let attributeValueUnquoted = Set([UnicodeScalar.BackslashT, "\n", "\r", UnicodeScalar.BackslashF, " ", UnicodeScalar.Ampersand, ">", nullScalr, "\"", "'", UnicodeScalar.LessThan, "=", "`"])

// Stop set for plain data runs: ampersand, "<", NUL. NOTE: does not contain "-".
static let dataDefaultStopChars: Set<UnicodeScalar> = [UnicodeScalar.Ampersand, UnicodeScalar.LessThan, TokeniserStateVars.nullScalr]
// Stop set for comment text: "-", NUL.
static let commentDefaultStopChars: Set<UnicodeScalar> = ["-", TokeniserStateVars.nullScalr]
// Stop set containing only "<" and NUL.
static let readDataDefaultStopChars: Set<UnicodeScalar> = [UnicodeScalar.LessThan, TokeniserStateVars.nullScalr]


static let replacementChar: UnicodeScalar = Tokeniser.replacementChar
static let replacementStr: String = String(Tokeniser.replacementChar)
Expand Down Expand Up @@ -94,7 +99,7 @@ enum TokeniserState: TokeniserStateProtocol {
case BogusDoctype
case CdataSection

internal func read(_ t: Tokeniser, _ r: CharacterReader)throws {
internal func read(_ t: Tokeniser, _ r: CharacterReader) throws {
switch self {
case .Data:
switch (r.current()) {
Expand Down Expand Up @@ -137,7 +142,7 @@ enum TokeniserState: TokeniserStateProtocol {
try t.emit(Token.EOF())
break
default:
let data = r.consumeToAny(UnicodeScalar.Ampersand, UnicodeScalar.LessThan, TokeniserStateVars.nullScalr)
let data = r.consumeToAny(TokeniserStateVars.dataDefaultStopChars)
t.emit(data)
break
}
Expand Down Expand Up @@ -417,7 +422,7 @@ enum TokeniserState: TokeniserStateProtocol {
t.emit(TokeniserStateVars.replacementChar)
break
default:
let data = r.consumeToAny("-", UnicodeScalar.LessThan, TokeniserStateVars.nullScalr)
// Stop at "-", "<" or NUL, exactly as before this change. `dataDefaultStopChars`
// is ampersand/"<"/NUL: it would run past "-" and would stop on "&", where
// consumeToAny returns an empty string without moving the reader position.
// NOTE(review): consider hoisting this set into a static constant on
// TokeniserStateVars so it is not rebuilt on every call.
let data = r.consumeToAny(["-", UnicodeScalar.LessThan, TokeniserStateVars.nullScalr])
t.emit(data)
}
break
Expand Down Expand Up @@ -528,7 +533,7 @@ enum TokeniserState: TokeniserStateProtocol {
t.transition(.Data)
break
default:
let data = r.consumeToAny("-", UnicodeScalar.LessThan, TokeniserStateVars.nullScalr)
// Stop at "-", "<" or NUL, exactly as before this change. `dataDefaultStopChars`
// is ampersand/"<"/NUL: it would run past "-" and would stop on "&", where
// consumeToAny returns an empty string without moving the reader position.
// NOTE(review): consider hoisting this set into a static constant on
// TokeniserStateVars so it is not rebuilt on every call.
let data = r.consumeToAny(["-", UnicodeScalar.LessThan, TokeniserStateVars.nullScalr])
t.emit(data)
}
break
Expand Down Expand Up @@ -633,7 +638,7 @@ enum TokeniserState: TokeniserStateProtocol {
}
break
case .AttributeName:
let name = r.consumeToAnySorted(TokeniserStateVars.attributeNameCharsSorted)
let name = r.consumeToAnySorted(TokeniserStateVars.attributeNameChars)
t.tagPending.appendAttributeName(name)

let c = r.consume()
Expand Down Expand Up @@ -764,7 +769,7 @@ enum TokeniserState: TokeniserStateProtocol {
}
break
case .AttributeValue_doubleQuoted:
let value = r.consumeToAny(TokeniserStateVars.attributeDoubleValueCharsSorted)
let value = r.consumeToAny(TokeniserStateVars.attributeDoubleValueChars)
if (value.count > 0) {
t.tagPending.appendAttributeValue(value)
} else {
Expand Down Expand Up @@ -798,7 +803,7 @@ enum TokeniserState: TokeniserStateProtocol {
}
break
case .AttributeValue_singleQuoted:
let value = r.consumeToAny(TokeniserStateVars.attributeSingleValueCharsSorted)
let value = r.consumeToAny(TokeniserStateVars.attributeSingleValueChars)
if (value.count > 0) {
t.tagPending.appendAttributeValue(value)
} else {
Expand Down Expand Up @@ -1008,7 +1013,7 @@ enum TokeniserState: TokeniserStateProtocol {
t.transition(.Data)
break
default:
t.commentPending.data.append(r.consumeToAny("-", TokeniserStateVars.nullScalr))
t.commentPending.data.append(r.consumeToAny(TokeniserStateVars.commentDefaultStopChars))
}
break
case .CommentEndDash:
Expand Down Expand Up @@ -1592,7 +1597,7 @@ enum TokeniserState: TokeniserStateProtocol {
try t.emit(Token.EOF())
break
default:
let data = r.consumeToAny(UnicodeScalar.LessThan, TokeniserStateVars.nullScalr)
let data = r.consumeToAny(TokeniserStateVars.readDataDefaultStopChars)
t.emit(data)
break
}
Expand Down

0 comments on commit 91cc703

Please sign in to comment.