
Commit f90a08d

Merge pull request #17 from LCSB-BioCore/mk-doc-lex
document the scanning&lexing use
2 parents 1f28460 + dcbc9c5 commit f90a08d

8 files changed: +184, -11 lines changed

Project.toml
Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 name = "PikaParser"
 uuid = "3bbf5609-3e7b-44cd-8549-7c69f321e792"
 authors = ["The developers of PikaParser.jl"]
-version = "0.5.1"
+version = "0.5.2"
 
 [deps]
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"

docs/make.jl
Lines changed: 2 additions & 1 deletion

@@ -1,6 +1,7 @@
 using Documenter, Literate, PikaParser
 
-examples = filter(x -> endswith(x, ".jl"), readdir(joinpath(@__DIR__, "src"), join = true))
+examples =
+    sort(filter(x -> endswith(x, ".jl"), readdir(joinpath(@__DIR__, "src"), join = true)))
 
 for example in examples
     Literate.markdown(

docs/src/json.jl
Lines changed: 7 additions & 1 deletion

@@ -14,6 +14,8 @@
 # to remove unnecessary spaces)
 # - support for numbers is very ad-hoc, `Float64`-only
 # - the escape sequences allowed in strings are rather incomplete
+#
+# ## Preparing the grammar
 
 import PikaParser as P
 
@@ -43,6 +45,8 @@ rules = Dict(
     :json => P.first(:obj, :array, :string, :number, :t, :f, :null),
 );
 
+# ## Making the "fold" function
+#
 # To manage the folding easily, we keep the fold functions in a data structure
 # with the same order as `rules`:
 folds = Dict(
@@ -69,12 +73,14 @@ default_fold(v, subvals) = isempty(subvals) ? nothing : subvals[1]
 
 g = P.make_grammar([:json], P.flatten(rules, Char));
 
+# ## Parsing JSON
+#
 # Let's parse a simple JSONish string that demonstrates most of the rules:
 input = """{"something":123,"other":false,"refs":[1,-2.345,[],{},true,false,null,[1,2,3,"haha"],{"is\\"Finished\\"":true}]}""";
 
 p = P.parse(g, input);
 
-# Let's build a Julia JSON-like structure:
+# From the result we can build a Julia JSON-like structure:
 result = P.traverse_match(
     p,
     P.find_match_at!(p, :json, 1),
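
For inputs that might not be valid JSON, it helps that `find_match_at!` returns zero when no match of the given rule starts at the position (the Scheme examples below rely on this). A minimal sketch of a guard, with `mid` as a purely illustrative name:

    mid = P.find_match_at!(p, :json, 1)  # a match id, or 0 if :json does not match at position 1
    mid == 0 && error("the input did not parse as :json")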

docs/src/scheme.jl
Lines changed: 10 additions & 2 deletions

@@ -12,6 +12,8 @@
 # We choose not to implement any of the Scheme data types except numbers and
 # identifiers; also all top-level expressions must be parenthesized "command"
 # S-expressions.
+#
+# ## Implementing the grammar
 
 import PikaParser as P
 
@@ -39,6 +41,8 @@ rules = Dict(
 # spaces. This way prevents unnecessary checking (and redundant matching) of
 # the tokens, and buildup of uninteresting entries in the memo table.
 
+# ## Parsing input
+#
 # Let's test the grammar on a piece of source code that contains lots of
 # whitespace and some errors.
 
@@ -67,6 +71,8 @@ fold_scheme(m, p, s) =
     m.rule == :insexpr ? Expr(:call, :S, s...) :
     m.rule == :sexpr ? s[2] : m.rule == :top ? s[2] : length(s) > 0 ? s[1] : nothing;
 
+# ## Recovering from errors and showing partial parses
+#
 # We can run through all `top` matches, tracking the position where we would
 # expect the next match:
 
@@ -81,12 +87,14 @@ while next_pos <= lastindex(p.input)
         pos = nextind(p.input, pos)
     end
     pos > next_pos && # if we skipped something, report it
-        @error "Got parsing problems" p.input[next_pos:prevind(p.input, pos)]
+        println(
+            "Got problems understanding this: $(p.input[next_pos:prevind(p.input, pos)])",
+        )
     if mid == 0
         break # if we skipped all the way to the end, quit
     else # we have an actual match, print it.
         value = P.traverse_match(p, mid, fold = fold_scheme)
-        @info "Got a toplevel value" value
+        println("Got a good value: $value")
         m = p.matches[mid] # skip the whole match and continue
         next_pos = nextind(p.input, m.last)
     end

docs/src/scheme_lex.jl
Lines changed: 144 additions & 0 deletions (new file)

# # Example: Faster parsing with lexers

# One disadvantage of pika-style parsers is the large amount of redundant
# intermediate matches that are produced in the right-to-left parsing process.
# These generally pollute the match table and cause inefficiency.
#
# PikaParser supports greedily pre-lexing the parser input using the terminals
# in the grammar, which lets you produce much more precise terminal matches,
# thus a more compact match table and, as a result, a much **faster** and more
# robust parser.
#
# In this example, we simply rewrite the Scheme grammar from [the Scheme
# tutorial](scheme.md) to use [`PikaParser.scan`](@ref) (which allows you to
# match many interesting kinds of tokens quickly) and then
# [`PikaParser.parse_lex`](@ref) (which runs the greedy lexing and uses the
# result for more efficient parsing).
#
# As the main change, we removed the "simple" matches of `:digit` and `:letter`
# from the grammar, and replaced them with manual matchers of whole tokens.
#
# ## Writing scanners
#
# First, let's make a very useful helper function that lets us convert any
# `Char`-matching function into a scanner. This neatens the grammar code later.
#
# When constructing the scanner functions, remember that it is important to use
# the overloaded indexing functions (`nextind`, `prevind`, `firstindex`,
# `lastindex`) instead of manually computing the integer indexes. Consider what
# happens with Unicode strings if you try to get an index like `"kůň"[3]`!
# Compute indexes manually only if you are *perfectly* certain that the input
# indexing is flat.

takewhile1(f) = (input) -> begin
    isempty(input) && return 0
    for i in eachindex(input)
        if !f(input[i])
            return prevind(input, i)
        end
    end
    return lastindex(input)
end;
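
For instance, applying the helper to `isdigit` gives a scanner that matches a maximal run of digits at the start of its input; the return values below follow directly from the definition above:

    takewhile1(isdigit)("123 456")   # == 3, the index of the last digit of "123"
    takewhile1(isdigit)("abc")       # == 0, no digit at the very start
    takewhile1(isletter)("kůň 1")    # == 4: String indexes are byte-based, hence not contiguous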
# The situation for matching `:ident` is a little more complicated -- we need a
# different match on the first letter and there are extra characters to think
# about. So we just make a specialized function for that:

function take_ident(input)
    isempty(input) && return 0
    i = firstindex(input)
    isletter(input[i]) || return 0
    i = nextind(input, i)
    while i <= lastindex(input)
        c = input[i]
        if !(isletter(c) || isdigit(c) || c == '-')
            return prevind(input, i)
        end
        i = nextind(input, i)
    end
    return lastindex(input)
end;
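
Again, a few values computed from the definition: identifiers must start with a letter and may continue with letters, digits, and dashes:

    take_ident("id3nt-1 rest")  # == 7, matching all of "id3nt-1"
    take_ident("1d3n7")         # == 0, a digit cannot start an identifier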
# ## Using scanners in a grammar
#
# The grammar becomes slightly simpler than in the original version:

import PikaParser as P

rules = Dict(
    :ws => P.first(:spaces => P.scan(takewhile1(isspace)), P.epsilon),
    :popen => P.seq(P.token('('), :ws),
    :pclose => P.seq(P.token(')'), :ws),
    :sexpr => P.seq(:popen, :insexpr => P.many(:scheme), :pclose),
    :scheme => P.seq(
        :basic => P.first(
            :number => P.seq(P.scan(takewhile1(isdigit)), P.not_followed_by(:ident)),
            :ident => P.scan(take_ident),
            :sexpr,
        ),
        :ws,
    ),
    :top => P.seq(:ws, :sexpr), # support leading blanks
);
# ## Using the scanners for lexing the input
#
# Let's try the lexing on the same input as in the Scheme example:

input = """
(plus 1 2 3)
(minus 1 2(plus 3 2) ) woohoo extra parenthesis here )
(complex
id3nt1f13r5 αβγδ भरत kůň)
(invalid 1d3n7)
(something
1
2
valid)
(straight (out (missing(parenthesis error))
(apply (make-function) (make-data))
""";

grammar = P.make_grammar([:top], P.flatten(rules, Char));

P.lex(grammar, input)

# The result is a vector of possible terminals that can be matched at the given
# input positions. As a minor victory, you may see that no terminals are
# matched inside the initial `plus` token.
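
To eyeball that, one might print just the positions where the lexer pre-matched something; a minimal sketch relying only on the documented return type of `lex` (the name `lexemes` is illustrative):

    lexemes = P.lex(grammar, input)  # Vector of (rule, last-index) pairs per position
    for (pos, terminals) in enumerate(lexemes)
        isempty(terminals) || println(pos, " => ", terminals)
    end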
#
# Now, the lexed input could be used via the argument `fast_match` of
# [`PikaParser.parse`](@ref), but usually it is much simpler to have the
# combined function [`PikaParser.parse_lex`](@ref) do everything:

p = P.parse_lex(grammar, input);

# The rest is now essentially the same as in the [previous Scheme example](scheme.md):

fold_scheme(m, p, s) =
    m.rule == :number ? parse(Int, m.view) :
    m.rule == :ident ? Symbol(m.view) :
    m.rule == :insexpr ? Expr(:call, :S, s...) :
    m.rule == :sexpr ? s[2] : m.rule == :top ? s[2] : length(s) > 0 ? s[1] : nothing;

next_pos = 1
while next_pos <= lastindex(p.input)
    global next_pos
    pos = next_pos
    mid = 0
    while pos <= lastindex(p.input) # try to find a match
        mid = P.find_match_at!(p, :top, pos)
        mid != 0 && break
        pos = nextind(p.input, pos)
    end
    pos > next_pos && # if we skipped something, report it
        println("Problems with: $(p.input[next_pos:prevind(p.input, pos)])")
    if mid == 0
        break # if we skipped all the way to the end, quit
    else # we have an actual match, print it.
        value = P.traverse_match(p, mid, fold = fold_scheme)
        println("Parsed OK: $value")
        m = p.matches[mid] # skip the whole match and continue
        next_pos = nextind(p.input, m.last)
    end
end

src/frontend.jl
Lines changed: 2 additions & 2 deletions

@@ -17,8 +17,8 @@ Build a [`Scan`](@ref) clause. Translate to strongly typed grammar with [`flatten`](@ref).
 
 # Example
 
-    # rule to match a pair of equal tokens
-    scan(m -> m[1] == m[2] ? 2 : -1)
+    # a rule to match any pair of equal tokens
+    scan(m -> (length(m) >= 2 && m[1] == m[2]) ? 2 : 0)
 """
 scan(f::Function) = Scan{Any,Any}(f)
 
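
Such a scanner slots into a grammar like any other terminal; a minimal sketch assuming `Char` input, with the rule name `:pair` chosen purely for illustration:

    import PikaParser as P
    rules = Dict(:pair => P.scan(m -> (length(m) >= 2 && m[1] == m[2]) ? 2 : 0))
    g = P.make_grammar([:pair], P.flatten(rules, Char))
    p = P.parse(g, "aab")
    P.find_match_at!(p, :pair, 1)  # nonzero: the scanner matches "aa" at position 1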

src/parse.jl
Lines changed: 15 additions & 2 deletions

@@ -203,8 +203,21 @@
 """
 $(TYPEDSIGNATURES)
 
-Greedily find terminals in the input sequence, while avoiding any attempts at
-parsing terminals where another terminal was already parsed successfully.
+Greedily find terminals in the input sequence. For performance and uniqueness
+purposes, terminals are only looked for at stream indexes that follow the final
+indexes of terminals found previously. That allows the lexing process to skip
+many redundant matches that could never be found by the grammar.
+
+As the main outcome, this prevents the typical pika-parser behavior when
+matching sequences using [`many`](@ref), where e.g. an identifier like `abcd`
+also produces redundant (and often invalid) matches for `bcd`, `cd` and `d`.
+Collaterally, greedy lexing also creates fewer tokens in the match table, which
+results in faster parsing.
+
+To produce good terminal matches quickly, use [`scan`](@ref).
+
+In typical use, this function is best called indirectly via
+[`parse_lex`](@ref).
 """
 function lex(g::Grammar{G,T}, input::I)::Vector{Vector{Tuple{G,Int}}} where {G,T,I}
     q = PikaQueue(lastindex(input))
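
The effect on the match table can be observed directly; a hedged sketch, reusing the `grammar` and `input` from the lexer tutorial above and the `matches` field shown in the examples:

    p_plain = P.parse(grammar, input);
    p_lexed = P.parse_lex(grammar, input);
    (length(p_plain.matches), length(p_lexed.matches))  # the pre-lexed table is typically much smaller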

src/structs.jl
Lines changed: 3 additions & 2 deletions

@@ -51,8 +51,9 @@ $(TYPEDEF)
 A single terminal, possibly made out of multiple input tokens.
 
 Given the input stream view, the `match` function scans the input forward and
-returns the position of the last item of the terminal starting at the beginning
-of the stream. In case there's no match, it returns a zero.
+returns the position of the last item of the matched terminal (which is assumed
+to start at the beginning of the stream view). In case there's no match, it
+returns zero.
 
 # Fields
 $(TYPEDFIELDS)
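
A minimal sketch of a match function that follows this contract, assuming a `Char` stream view and the illustrative name `scan_let`: it matches the fixed keyword `let` and returns the index of its last character, or zero when the view does not start with it:

    function scan_let(input)
        i = firstindex(input)
        for c in "let"
            (i <= lastindex(input) && input[i] == c) || return 0
            i = nextind(input, i)
        end
        return prevind(input, i)  # index of the last matched item
    end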
