
Commit

v0.2.4 use 'stringi' instead of 'stringr' for lighter dependency
coolbutuseless committed Aug 30, 2021
1 parent a01563c commit 0ec55c7
Showing 13 changed files with 140 additions and 61 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,7 +1,7 @@
Package: flexo
Type: Package
Title: Simple Tools for Lexing/Parsing Text Data
Version: 0.2.1
Version: 0.2.4
Author: mikefc
Maintainer: mikefc <[email protected]>
Description: Simple tools for lexing/parsing text data.
@@ -12,7 +12,7 @@ Encoding: UTF-8
LazyData: true
RoxygenNote: 7.1.1
Imports:
stringr,
stringi,
R6
Suggests:
knitr,
2 changes: 1 addition & 1 deletion LICENSE
@@ -1,4 +1,4 @@
Copyright (c) 2018-2020 [email protected]
Copyright (c) 2018-2021 [email protected]

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
4 changes: 2 additions & 2 deletions NAMESPACE
@@ -2,6 +2,6 @@

export(TokenStream)
export(lex)
export(regex)
export(re)
import(R6)
import(stringr)
import(stringi)
17 changes: 17 additions & 0 deletions NEWS.md
@@ -1,6 +1,23 @@
NEWS
============

v0.2.4 2021-08-30
------------------

Switch to `stringi` instead of `stringr`

v0.2.3 2020-12-12
------------------

* Bug-fixing of 'consume_until'
* Improved print statement for TokenStream
* renamed `regex` to `re` to avoid clash with `stringr`

v0.2.2 2020-12-10
------------------

* Stricter checks on 'regex_idx' validity

v0.2.1 2020-12-08
------------------

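As the v0.2.3 entry notes, the exported pattern list is named `re` (the NAMESPACE diff above shows `export(re)` replacing `export(regex)`), so the bundled patterns are reached as `flexo::re$...`. A quick usage sketch under that renaming — the input string here is purely illustrative:

``` r
library(flexo)

tokens <- lex(
  "ping 192.168.0.1",
  regexes = c(
    ip         = flexo::re$ipaddress,  # bundled IP-address pattern
    word       = "\\w+",
    whitespace = "\\s+"
  )
)
tokens  # named character vector of tokens: word, whitespace, ip
```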
32 changes: 19 additions & 13 deletions R/TokenStream.R
@@ -320,11 +320,11 @@ TokenStream <- R6::R6Class(

if (!inclusive) {
idx <- idx - 1L
if (length(idx) > 0 && idx == 0) return(character(0))
if (length(idx) > 0 && !is.na(idx) && idx == self$position - 1L) return(character(0))
}

if (length(idx) == 0L || is.na(idx)) {
message("End not found. Returning all")
# message("End not found. Returning all")
n <- length(self$named_values) - self$position + 1L
} else {
n <- idx - self$position + 1L
@@ -359,9 +359,19 @@
#' @param n number of elements to print
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print = function(n = 5) {
cat("Position ", self$position, "/", length(self$named_values), ". ",
"Next ", n, " elements:\n", sep = "")
print(self$named_values[self$position + seq(n) - 1L])
if (self$end_of_stream()) {
print("End of stream")
} else {
cat("Position ", self$position, "/", length(self$named_values), ".\n", sep = "")

n2 <- length(self$named_values) - self$position + 1L
n <- min(n2, n)

if (n > 0) {
cat("Next", n, "elements:\n")
print(self$named_values[self$position + seq(n) - 1L])
}
}
}

)
@@ -375,16 +385,12 @@ if (FALSE) {

stream <- TokenStream$new(named_values)

stream$consume(2)
stream$position

stream$consume_until(value = 4)
stream$consume_until(name = 'one', inclusive = FALSE)
stream

stream$reset()
stream$position

stream$consume_while(value = 1:4)
stream$position
stream$consume_until(name = 'two', inclusive = FALSE)
stream
}


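The reworked `print()` above clamps the preview to the tokens actually remaining and reports exhaustion explicitly, instead of always indexing `n` elements past the current position. A minimal sketch of the new behaviour (the `named_values` vector is illustrative, mirroring the demo block in the diff):

``` r
library(flexo)

named_values <- c(one = 1, two = 2, three = 3, four = 4, five = 5)
stream <- TokenStream$new(named_values)

stream$consume_until(name = 'four', inclusive = TRUE)
stream  # shows position 5/5 and only the single remaining element

stream$consume_until(name = 'five', inclusive = TRUE)
stream  # now prints "End of stream" rather than reading past the end
```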
74 changes: 45 additions & 29 deletions R/lex.R
@@ -1,7 +1,7 @@



#-----------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' Break a string into labelled tokens based upon a set of patterns
#'
#' @param text a single character string
@@ -20,70 +20,87 @@
#' regular expression.
#'
#' @examples
#' lex("hello there 123.45", regexes=c(number=regex$number, word="(\\w+)", whitespace="(\\s+)"))
#' lex("hello there 123.45", regexes=c(number=re$number, word="(\\w+)", whitespace="(\\s+)"))
#'
#' @import stringr
#' @import stringi
#' @export
#-----------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
lex <- function(text, regexes, verbose=FALSE) {

#---------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# disallow multiple capture groups in a single pattern.
# i.e. regexes = c("(a|b)", "(c)|(d)")
#---------------------------------------------------------------------------
captured_groups <- stringr::str_match_all(regexes, "(?<!\\\\)\\([^?]")
n_captured_groups <- vapply(captured_groups, FUN = nrow, integer(1))
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
captured_groups <- stringi::stri_match_all(regexes, regex = "(?<!\\\\)\\([^?]")

n_captured_groups <- vapply(captured_groups, FUN = function(x) {
if (anyNA(x)) {
0L
} else {
nrow(x)
}
}, integer(1))
if (any(n_captured_groups > 1)) {
stop("Regexes can define at most only a single capture group. Patterns which need fixing",
deparse(regexes[n_captured_groups > 1]))
}

#---------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Any regex that has 0 capture groups has its whole regex become the
# capture group
#---------------------------------------------------------------------------
idx <-n_captured_groups == 0
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
idx <- n_captured_groups == 0
regexes[idx] <- paste0("(", regexes[idx], ")")

#---------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Insert a default pattern to match anything missed by the provided regexes
#---------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
regexes <- c(regexes, .missing="(.)")
regex_labels <- names(regexes)

#---------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# All regexes must be named
#---------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
stopifnot(!anyNA(regex_labels))
stopifnot(!any(regex_labels == ''))

#---------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Combine all the patterns into a single regex
#---------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
regex <- paste(regexes, collapse='|')

#---------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Match all regex against the text
#---------------------------------------------------------------------------
token_matching <- stringr::str_match_all(text, regex)[[1]]
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
token_matching <- stringi::stri_match_all(text, regex = regex)[[1]]

if (verbose) {
colnames(token_matching) <- c("all", regex_labels)
print(token_matching)
}

#---------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Extract the actual token and the regex which matched the token
#---------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
regex_idx <- apply(token_matching[, -1, drop=FALSE], 1, function(x) { which(!is.na(x))})
tokens <- apply(token_matching[, -1, drop=FALSE], 1, function(x) {x[which(!is.na(x))]})
names(tokens) <- regex_labels[regex_idx]

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# if 'regex_idx' is a list, then a location was matched by multiple regexes
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if (is.list(regex_idx)) {
lens <- lengths(regex_idx)
idx <- which(lens > 1)
stop("lex issues at the following locations within 'text': ", deparse(idx))
}


#---------------------------------------------------------------------------
names(tokens) <- regex_labels[regex_idx]

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# If any tokens were captured by the '.missing' regex, then show
# a warning message
#---------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if (verbose && any(names(tokens) == '.missing')) {
not_captured <- sort(unique(tokens[names(tokens) == '.missing']))
warning("The following characters were not captured: ", deparse(not_captured))
@@ -94,15 +111,14 @@ lex <- function(text, regexes, verbose=FALSE) {



#-----------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' Regexes to match common elements
#' @export
#-----------------------------------------------------------------------------
regex <- list(
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
re <- list(
number = '[+\\-]?(?:0|[1-9]\\d*)(?:\\.\\d*)?(?:[eE][+\\-]?\\d+)?',
email = '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}',
ipaddress = '(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'
)
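The substantive API change in `lex()` is swapping `stringr::str_match_all()` for `stringi::stri_match_all()`: stringi takes the pattern through its `regex` argument and, for an input with no match, returns a one-row matrix of `NA`s where stringr returns a zero-row matrix — which is what the new `anyNA()` guard in the capture-group count accounts for. A rough sketch of the difference, with an illustrative pattern and input:

``` r
library(stringr)
library(stringi)

pat <- "(\\d+)"  # illustrative single-capture-group pattern

# stringr: no match gives a 0-row matrix, so nrow() alone sufficed
stringr::str_match_all("abc", pat)[[1]]

# stringi: no match gives a 1-row matrix of NAs,
# hence lex() now checks anyNA(x) before calling nrow(x)
stringi::stri_match_all("abc", regex = pat)[[1]]
```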



5 changes: 2 additions & 3 deletions README.Rmd
@@ -52,11 +52,10 @@ image_read("man/figures/logo.png")



# flexo: Simple Lex/Parse Tools in R <img src="man/figures/logo.png" align="right" height=300 title="An homage to the old logo for the Berlin Hilton"/>
# flexo: Simple Lex/Parse Tools in R <img src="man/figures/logo.png" align="right" height=300 />

<!-- badges: start -->
![](https://img.shields.io/badge/cool-useless-green.svg)
[![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://www.tidyverse.org/lifecycle/#experimental)
[![R build status](https://github.com/coolbutuseless/flexo/workflows/R-CMD-check/badge.svg)](https://github.com/coolbutuseless/flexo/actions)
<!-- badges: end -->

@@ -186,7 +185,7 @@ game_regexes <- c(
whitespace = "\\s+",
sep = "\\|",
mark = "X|O",
order = flexo::regex$number
order = flexo::re$number
)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
8 changes: 3 additions & 5 deletions README.md
@@ -1,13 +1,11 @@

<!-- README.md is generated from README.Rmd. Please edit that file -->

# flexo: Simple Lex/Parse Tools in R <img src="man/figures/logo.png" align="right" height=300 title="An homage to the old logo for the Berlin Hilton"/>
# flexo: Simple Lex/Parse Tools in R <img src="man/figures/logo.png" align="right" height=300 />

<!-- badges: start -->

![](https://img.shields.io/badge/cool-useless-green.svg) [![Lifecycle:
experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://www.tidyverse.org/lifecycle/#experimental)
[![R build
![](https://img.shields.io/badge/cool-useless-green.svg) [![R build
status](https://github.com/coolbutuseless/flexo/workflows/R-CMD-check/badge.svg)](https://github.com/coolbutuseless/flexo/actions)
<!-- badges: end -->

@@ -147,7 +145,7 @@ game_regexes <- c(
whitespace = "\\s+",
sep = "\\|",
mark = "X|O",
order = flexo::regex$number
order = flexo::re$number
)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2 changes: 1 addition & 1 deletion man/lex.Rd


6 changes: 3 additions & 3 deletions man/regex.Rd → man/re.Rd


43 changes: 43 additions & 0 deletions tests/testthat/test-consume_until.R
@@ -0,0 +1,43 @@



test_that("consume_until works", {


named_values <- c(one = 1, two = 2, three = 3, four = 4, five = 5)
stream <- TokenStream$new(named_values)

jnk <- stream$consume_until(name = 'one', inclusive = FALSE)
expect_equivalent(stream$read(1), 1)

jnk <- stream$consume_until(name = 'one', inclusive = FALSE)
expect_equivalent(stream$read(1), 1)

jnk <- stream$consume_until(name = 'two', inclusive = FALSE)
expect_equivalent(stream$read(1), 2)

jnk <- stream$consume_until(name = 'two', inclusive = FALSE)
expect_equivalent(stream$read(1), 2)





stream <- TokenStream$new(named_values)

jnk <- stream$consume_until(name = 'one', inclusive = TRUE)
expect_equivalent(stream$read(1), 2)

jnk <- stream$consume_until(name = 'one', inclusive = FALSE)
expect_true(stream$end_of_stream())

stream$reset()

jnk <- stream$consume_until(name = 'two', inclusive = TRUE)
expect_equivalent(stream$read(1), 3)

jnk <- stream$consume_until(name = 'two', inclusive = TRUE)
expect_true(stream$end_of_stream())


})
2 changes: 1 addition & 1 deletion vignettes/Scrabble.Rmd
@@ -77,7 +77,7 @@ gcg_regexes <- c(
whitespace = '\\s+',
player = '>(.*?):', # start of each line with a `>`
location = '[a-o]\\d+|\\d+[a-o]|--|-', # Number first for horizontal words. -/-- for specials
number = flexo::regex$number,
number = flexo::re$number,
symbol = '[-+\\w\\./\\?\\(?:\\)]+',
comma = ","
)
2 changes: 1 addition & 1 deletion vignettes/parse_obj.Rmd
@@ -80,7 +80,7 @@ Use `lex()` to turn the text into tokens
```{r}
obj_regexes <- c(
comment = '(#.*?)\n', # assume comments take up the whole line
number = flexo::regex$number, # matches most numeric values
number = flexo::re$number, # matches most numeric values
symbol = '\\w+',
newline = '\n',
whitespace = '\\s+'
