Skip to content

Commit

Permalink
wip worditer deregex
Browse files Browse the repository at this point in the history
  • Loading branch information
commonquail committed Sep 23, 2023
1 parent 040d51e commit b8a4fb8
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 28 deletions.
7 changes: 0 additions & 7 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ regex = "1.6.0"
unicode-segmentation = "1.8.0"
clap = "2.34.0"
encoding = "0.2.33"
lazy_static = "1.4.0"

[dev-dependencies]
pretty_assertions = "1.2.1"
Expand Down
3 changes: 0 additions & 3 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
#[macro_use]
extern crate lazy_static;

use clap::crate_description;
use clap::crate_name;
use clap::crate_version;
Expand Down
61 changes: 44 additions & 17 deletions src/worditer.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use regex::Regex;
use std::borrow::Cow;

/// An iterator over "words" in some text. A word is generally a sequence of non-whitespace
Expand All @@ -19,20 +18,6 @@ pub(crate) struct WordIter<'text> {
naive_words: core::str::Split<'text, char>,
}

// Pattern for a "footnote reference" word: the whole word must be one or more
// back-to-back `[...]` tokens (each with at least one character that is not
// `]` between the brackets), optionally followed by punctuation.
// `Regex::new(..).unwrap()` panics if the pattern fails to compile.
lazy_static! {
static ref FOOTNOTE_REFERENCE: Regex = Regex::new(
r"(?x)
^
# One or more unseparated '[any]' tokens...
(?:\[[^]]+\])+
# ... optionally immediately followed by any sort of punctuation, for
# example in case this is the end of the sentence.
(?:\p{Punctuation}*)
$"
)
.unwrap();
}

impl<'text> WordIter<'text> {
pub fn new(text: &'text str, comment_char: char) -> Self {
WordIter {
Expand All @@ -44,8 +29,42 @@ impl<'text> WordIter<'text> {

/// Returns `true` when `word` satisfies any of the checks below: it starts
/// with the configured comment character, it consists solely of ASCII
/// punctuation, it matches `FOOTNOTE_REFERENCE`, or `describe_word` classifies
/// it as `FootnoteRefUnseen` or `FootnoteRefValid`.
///
/// NOTE(review): this listing was taken from a diff view without +/- markers;
/// the `is_ascii_punctuation` and `FOOTNOTE_REFERENCE` clauses may be the
/// lines the change removes in favour of `describe_word` — confirm against
/// the real file.
fn is_non_breaking_word(&self, word: &str) -> bool {
word.starts_with(self.comment_char)
|| word.chars().all(|c| c.is_ascii_punctuation())
|| FOOTNOTE_REFERENCE.is_match(word)
// Only the two "nothing pending" end states count as non-breaking; a word
// that ends inside an unclosed `[` or with trailing non-punctuation does not.
|| match WordIter::describe_word(word) {
WordJoinerState::FootnoteRefUnseen => true,
WordJoinerState::FootnoteRefOpen => false,
WordJoinerState::FootnoteRefNamed => false,
WordJoinerState::FootnoteRefValid => true,
WordJoinerState::TrailingNonPunctuation => false,
}
}

/// Scans `word` once, character by character, and reports how it relates to
/// `[...]` footnote-reference syntax.
///
/// Returns the state the scan ends in:
/// - `FootnoteRefUnseen`: nothing but ASCII punctuation and/or empty `[]`
///   pairs were recognised (this includes the empty word).
/// - `FootnoteRefValid`: the last bracket group seen was a closed, non-empty
///   `[...]`, followed at most by ASCII punctuation.
/// - `FootnoteRefOpen` / `FootnoteRefNamed`: the word ends inside an unclosed
///   `[` and contains at least one non-punctuation character.
/// - `TrailingNonPunctuation`: a non-punctuation character occurred outside
///   of brackets.
///
/// A word made up solely of ASCII punctuation is always reported as
/// `FootnoteRefUnseen`, even when it contains an unclosed `[` (for example
/// `[` or `([`), so that such words are classified like any other
/// punctuation-only word.
///
/// NOTE(review): punctuation is tested with `char::is_ascii_punctuation`, so
/// non-ASCII punctuation after a reference counts as non-punctuation — confirm
/// that this is intended.
fn describe_word(word: &str) -> WordJoinerState {
    let mut state = WordJoinerState::FootnoteRefUnseen;
    // Stays true only while every character seen so far is ASCII punctuation.
    let mut only_punctuation = true;

    for c in word.chars() {
        let is_punctuation = c.is_ascii_punctuation();
        only_punctuation &= is_punctuation;

        match state {
            WordJoinerState::FootnoteRefUnseen | WordJoinerState::FootnoteRefValid => {
                if c == '[' {
                    state = WordJoinerState::FootnoteRefOpen;
                } else if !is_punctuation {
                    state = WordJoinerState::TrailingNonPunctuation;
                }
            }
            WordJoinerState::FootnoteRefOpen => {
                // `[]` carries no name, so it is treated as plain punctuation.
                state = if c == ']' {
                    WordJoinerState::FootnoteRefUnseen
                } else {
                    WordJoinerState::FootnoteRefNamed
                };
            }
            WordJoinerState::FootnoteRefNamed => {
                if c == ']' {
                    state = WordJoinerState::FootnoteRefValid;
                }
            }
            // No character can leave this state, so the rest of the word
            // cannot change the result.
            WordJoinerState::TrailingNonPunctuation => break,
        }
    }

    // An unclosed `[` in a punctuation-only word is just more punctuation.
    let unterminated = matches!(
        state,
        WordJoinerState::FootnoteRefOpen | WordJoinerState::FootnoteRefNamed
    );
    if unterminated && only_punctuation {
        WordJoinerState::FootnoteRefUnseen
    } else {
        state
    }
}
}

Expand Down Expand Up @@ -84,6 +103,14 @@ impl<'text> Iterator for WordIter<'text> {

// Marker impl: declares that once `next()` has returned `None` it keeps
// returning `None`. NOTE(review): `next()` is outside this view — confirm it
// upholds that contract.
impl std::iter::FusedIterator for WordIter<'_> {}

/// Scanner states used by `WordIter::describe_word` while walking the
/// characters of a single word.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum WordJoinerState {
    /// Start state: no footnote reference is currently recognised. Also
    /// re-entered after an empty `[]` pair.
    FootnoteRefUnseen,
    /// The previous character was a `[` that opened a bracket group.
    FootnoteRefOpen,
    /// Inside a bracket group that already holds at least one character.
    FootnoteRefNamed,
    /// A non-empty bracket group has been closed with `]`; only ASCII
    /// punctuation, if anything, has followed it.
    FootnoteRefValid,
    /// A character that is not ASCII punctuation appeared outside of a
    /// bracket group. No transition leaves this state.
    TrailingNonPunctuation,
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down

0 comments on commit b8a4fb8

Please sign in to comment.