
Commit

v0.2.4 use 'stringi' instead of 'stringr' for lighter dependency
coolbutuseless committed Aug 30, 2021
1 parent a01563c commit 0ec55c7
Showing 13 changed files with 140 additions and 61 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,7 +1,7 @@
Package: flexo
Type: Package
Title: Simple Tools for Lexing/Parsing Text Data
Version: 0.2.1
Version: 0.2.4
Author: mikefc
Maintainer: mikefc <[email protected]>
Description: Simple tools for lexing/parsing text data.
@@ -12,7 +12,7 @@ Encoding: UTF-8
LazyData: true
RoxygenNote: 7.1.1
Imports:
stringr,
stringi,
R6
Suggests:
knitr,
2 changes: 1 addition & 1 deletion LICENSE
@@ -1,4 +1,4 @@
Copyright (c) 2018-2020 [email protected]
Copyright (c) 2018-2021 [email protected]

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
4 changes: 2 additions & 2 deletions NAMESPACE
@@ -2,6 +2,6 @@

export(TokenStream)
export(lex)
export(regex)
export(re)
import(R6)
import(stringr)
import(stringi)
17 changes: 17 additions & 0 deletions NEWS.md
@@ -1,6 +1,23 @@
NEWS
============

v0.2.4 2021-08-30
------------------

Switch to `stringi` instead of `stringr`

v0.2.3 2020-12-12
------------------

* Bug-fixing of 'consume_until'
* Improved print statement for TokenStream
* renamed `regex` to `re` to avoid clash with `stringr`

v0.2.2 2020-12-10
------------------

* Stricter checks on 'regex_idx' validity

v0.2.1 2020-12-08
------------------

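As the v0.2.3 entry notes, the exported pattern list is named `re` (the NAMESPACE diff above shows `export(re)` replacing `export(regex)`), so the bundled patterns are reached as `flexo::re$...`. A quick usage sketch under that renaming — the input string here is purely illustrative:

``` r
library(flexo)

tokens <- lex(
  "ping 192.168.0.1",
  regexes = c(
    ip         = flexo::re$ipaddress,  # bundled IP-address pattern
    word       = "\\w+",
    whitespace = "\\s+"
  )
)
tokens  # named character vector of tokens: word, whitespace, ip
```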
32 changes: 19 additions & 13 deletions R/TokenStream.R
@@ -320,11 +320,11 @@ TokenStream <- R6::R6Class(

if (!inclusive) {
idx <- idx - 1L
if (length(idx) > 0 && idx == 0) return(character(0))
if (length(idx) > 0 && !is.na(idx) && idx == self$position - 1L) return(character(0))
}

if (length(idx) == 0L || is.na(idx)) {
message("End not found. Returning all")
# message("End not found. Returning all")
n <- length(self$named_values) - self$position + 1L
} else {
n <- idx - self$position + 1L
@@ -359,9 +359,19 @@
#' @param n number of elements to print
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print = function(n = 5) {
cat("Position ", self$position, "/", length(self$named_values), ". ",
"Next ", n, " elements:\n", sep = "")
print(self$named_values[self$position + seq(n) - 1L])
if (self$end_of_stream()) {
print("End of stream")
} else {
cat("Position ", self$position, "/", length(self$named_values), ".\n", sep = "")

n2 <- length(self$named_values) - self$position + 1L
n <- min(n2, n)

if (n > 0) {
cat("Next", n, "elements:\n")
print(self$named_values[self$position + seq(n) - 1L])
}
}
}

)
@@ -375,16 +385,12 @@ if (FALSE) {

stream <- TokenStream$new(named_values)

stream$consume(2)
stream$position

stream$consume_until(value = 4)
stream$consume_until(name = 'one', inclusive = FALSE)
stream

stream$reset()
stream$position

stream$consume_while(value = 1:4)
stream$position
stream$consume_until(name = 'two', inclusive = FALSE)
stream
}


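The reworked `print()` above clamps the preview to the tokens actually remaining and reports exhaustion explicitly, instead of always indexing `n` elements past the current position. A minimal sketch of the new behaviour (the `named_values` vector is illustrative, mirroring the demo block in the diff):

``` r
library(flexo)

named_values <- c(one = 1, two = 2, three = 3, four = 4, five = 5)
stream <- TokenStream$new(named_values)

stream$consume_until(name = 'four', inclusive = TRUE)
stream  # shows position 5/5 and only the single remaining element

stream$consume_until(name = 'five', inclusive = TRUE)
stream  # now prints "End of stream" rather than reading past the end
```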
74 changes: 45 additions & 29 deletions R/lex.R
@@ -1,7 +1,7 @@



#-----------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' Break a string into labelled tokens based upon a set of patterns
#'
#' @param text a single character string
@@ -20,70 +20,87 @@
#' regular expression.
#'
#' @examples
#' lex("hello there 123.45", regexes=c(number=regex$number, word="(\\w+)", whitespace="(\\s+)"))
#' lex("hello there 123.45", regexes=c(number=re$number, word="(\\w+)", whitespace="(\\s+)"))
#'
#' @import stringr
#' @import stringi
#' @export
#-----------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
lex <- function(text, regexes, verbose=FALSE) {

#---------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# disallow multiple capture groups in a single pattern.
# i.e. regexes = c("(a|b)", "(c)|(d)")
#---------------------------------------------------------------------------
captured_groups <- stringr::str_match_all(regexes, "(?<!\\\\)\\([^?]")
n_captured_groups <- vapply(captured_groups, FUN = nrow, integer(1))
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
captured_groups <- stringi::stri_match_all(regexes, regex = "(?<!\\\\)\\([^?]")

n_captured_groups <- vapply(captured_groups, FUN = function(x) {
if (anyNA(x)) {
0L
} else {
nrow(x)
}
}, integer(1))
if (any(n_captured_groups > 1)) {
stop("Regexes can define at most only a single capture group. Patterns which need fixing",
deparse(regexes[n_captured_groups > 1]))
}

#---------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Any regex that has 0 capture groups has its whole regex become the
# capture group
#---------------------------------------------------------------------------
idx <-n_captured_groups == 0
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
idx <- n_captured_groups == 0
regexes[idx] <- paste0("(", regexes[idx], ")")

#---------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Insert a default pattern to match anything missed by the provided regexes
#---------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
regexes <- c(regexes, .missing="(.)")
regex_labels <- names(regexes)

#---------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# All regexes must be named
#---------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
stopifnot(!anyNA(regex_labels))
stopifnot(!any(regex_labels == ''))

#---------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Combine all the patterns into a single regex
#---------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
regex <- paste(regexes, collapse='|')

#---------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Match all regex against the text
#---------------------------------------------------------------------------
token_matching <- stringr::str_match_all(text, regex)[[1]]
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
token_matching <- stringi::stri_match_all(text, regex = regex)[[1]]

if (verbose) {
colnames(token_matching) <- c("all", regex_labels)
print(token_matching)
}

#---------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Extract the actual token and the regex which matched the token
#---------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
regex_idx <- apply(token_matching[, -1, drop=FALSE], 1, function(x) { which(!is.na(x))})
tokens <- apply(token_matching[, -1, drop=FALSE], 1, function(x) {x[which(!is.na(x))]})
names(tokens) <- regex_labels[regex_idx]

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# if 'regex_idx' is a list, then a location was matched by multiple regexes
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if (is.list(regex_idx)) {
lens <- lengths(regex_idx)
idx <- which(lens > 1)
stop("lex issues at the following locations within 'text': ", deparse(idx))
}


#---------------------------------------------------------------------------
names(tokens) <- regex_labels[regex_idx]

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# If any tokens were captured by the '.missing' regex, then show
# a warning message
#---------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if (verbose && any(names(tokens) == '.missing')) {
not_captured <- sort(unique(tokens[names(tokens) == '.missing']))
warning("The following characters were not captured: ", deparse(not_captured))
@@ -94,15 +111,14 @@ lex <- function(text, regexes, verbose=FALSE) {



#-----------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' Regexes to match common elements
#' @export
#-----------------------------------------------------------------------------
regex <- list(
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
re <- list(
number = '[+\\-]?(?:0|[1-9]\\d*)(?:\\.\\d*)?(?:[eE][+\\-]?\\d+)?',
email = '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}',
ipaddress = '(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'
)
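The substantive API change in `lex()` is swapping `stringr::str_match_all()` for `stringi::stri_match_all()`: stringi takes the pattern through its `regex` argument and, for an input with no match, returns a one-row matrix of `NA`s where stringr returns a zero-row matrix — which is what the new `anyNA()` guard in the capture-group count accounts for. A rough sketch of the difference, with an illustrative pattern and input:

``` r
library(stringr)
library(stringi)

pat <- "(\\d+)"  # illustrative single-capture-group pattern

# stringr: no match gives a 0-row matrix, so nrow() alone sufficed
stringr::str_match_all("abc", pat)[[1]]

# stringi: no match gives a 1-row matrix of NAs,
# hence lex() now checks anyNA(x) before calling nrow(x)
stringi::stri_match_all("abc", regex = pat)[[1]]
```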



5 changes: 2 additions & 3 deletions README.Rmd
@@ -52,11 +52,10 @@ image_read("man/figures/logo.png")



# flexo: Simple Lex/Parse Tools in R <img src="man/figures/logo.png" align="right" height=300 title="An homage to the old logo for the Berlin Hilton"/>
# flexo: Simple Lex/Parse Tools in R <img src="man/figures/logo.png" align="right" height=300 />

<!-- badges: start -->
![](https://img.shields.io/badge/cool-useless-green.svg)
[![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://www.tidyverse.org/lifecycle/#experimental)
[![R build status](https://github.com/coolbutuseless/flexo/workflows/R-CMD-check/badge.svg)](https://github.com/coolbutuseless/flexo/actions)
<!-- badges: end -->

@@ -186,7 +185,7 @@ game_regexes <- c(
whitespace = "\\s+",
sep = "\\|",
mark = "X|O",
order = flexo::regex$number
order = flexo::re$number
)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
8 changes: 3 additions & 5 deletions README.md
@@ -1,13 +1,11 @@

<!-- README.md is generated from README.Rmd. Please edit that file -->

# flexo: Simple Lex/Parse Tools in R <img src="man/figures/logo.png" align="right" height=300 title="An homage to the old logo for the Berlin Hilton"/>
# flexo: Simple Lex/Parse Tools in R <img src="man/figures/logo.png" align="right" height=300 />

<!-- badges: start -->

![](https://img.shields.io/badge/cool-useless-green.svg) [![Lifecycle:
experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://www.tidyverse.org/lifecycle/#experimental)
[![R build
![](https://img.shields.io/badge/cool-useless-green.svg) [![R build
status](https://github.com/coolbutuseless/flexo/workflows/R-CMD-check/badge.svg)](https://github.com/coolbutuseless/flexo/actions)
<!-- badges: end -->

@@ -147,7 +145,7 @@ game_regexes <- c(
whitespace = "\\s+",
sep = "\\|",
mark = "X|O",
order = flexo::regex$number
order = flexo::re$number
)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2 changes: 1 addition & 1 deletion man/lex.Rd


6 changes: 3 additions & 3 deletions man/regex.Rd → man/re.Rd


43 changes: 43 additions & 0 deletions tests/testthat/test-consume_until.R
@@ -0,0 +1,43 @@



test_that("consume_until works", {


named_values <- c(one = 1, two = 2, three = 3, four = 4, five = 5)
stream <- TokenStream$new(named_values)

jnk <- stream$consume_until(name = 'one', inclusive = FALSE)
expect_equivalent(stream$read(1), 1)

jnk <- stream$consume_until(name = 'one', inclusive = FALSE)
expect_equivalent(stream$read(1), 1)

jnk <- stream$consume_until(name = 'two', inclusive = FALSE)
expect_equivalent(stream$read(1), 2)

jnk <- stream$consume_until(name = 'two', inclusive = FALSE)
expect_equivalent(stream$read(1), 2)





stream <- TokenStream$new(named_values)

jnk <- stream$consume_until(name = 'one', inclusive = TRUE)
expect_equivalent(stream$read(1), 2)

jnk <- stream$consume_until(name = 'one', inclusive = FALSE)
expect_true(stream$end_of_stream())

stream$reset()

jnk <- stream$consume_until(name = 'two', inclusive = TRUE)
expect_equivalent(stream$read(1), 3)

jnk <- stream$consume_until(name = 'two', inclusive = TRUE)
expect_true(stream$end_of_stream())


})
2 changes: 1 addition & 1 deletion vignettes/Scrabble.Rmd
@@ -77,7 +77,7 @@ gcg_regexes <- c(
whitespace = '\\s+',
player = '>(.*?):', # start of each line with a `>`
location = '[a-o]\\d+|\\d+[a-o]|--|-', # Number first for horizontal words. -/-- for specials
number = flexo::regex$number,
number = flexo::re$number,
symbol = '[-+\\w\\./\\?\\(?:\\)]+',
comma = ","
)
2 changes: 1 addition & 1 deletion vignettes/parse_obj.Rmd
@@ -80,7 +80,7 @@ Use `lex()` to turn the text into tokens
```{r}
obj_regexes <- c(
comment = '(#.*?)\n', # assume comments take up the whole line
number = flexo::regex$number, # matches most numeric values
number = flexo::re$number, # matches most numeric values
symbol = '\\w+',
newline = '\n',
whitespace = '\\s+'
