Skip to content

Commit f9d96c3

Browse files
committed
Added matcher lint
1 parent 2d03f8e commit f9d96c3

File tree

8 files changed

+250
-34
lines changed

8 files changed

+250
-34
lines changed

demo.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
Harper is a language checker for artists. it can detect
2-
improper capitalization and mispelled words. There are some cases,
2+
improper capitalization and misspelled words. There are some cases,
33
where the the standard grammar checkers don't cut it.
44

55
That's where Harper comes in handy.
6+
7+
kid regards, Elijah

harper-core/src/document.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,11 @@ impl Document {
156156
)
157157
}
158158

159+
/// Defensively attempt to grab a specific token.
///
/// Returns a copy of the token stored at `index`, or `None` when no token
/// exists at that position, so callers can probe positions without indexing
/// directly into the token list.
160+
pub fn get_token(&self, index: usize) -> Option<Token> {
161+
self.tokens.get(index).copied()
162+
}
163+
159164
/// Iterate over a copy of every token held by the document, borrowing the
/// document for as long as the iterator lives.
pub fn tokens(&self) -> impl Iterator<Item = Token> + '_ {
160165
self.tokens.iter().copied()
161166
}

harper-core/src/linting/lint_set.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ use super::{spaces::Spaces, Linter};
44
use paste::paste;
55

66
use super::{
7-
long_sentences::LongSentences, repeated_words::RepeatedWords,
7+
long_sentences::LongSentences, matcher::Matcher, repeated_words::RepeatedWords,
88
sentence_capitalization::SentenceCapitalization, spell_check::SpellCheck,
99
unclosed_quotes::UnclosedQuotes, wrong_quotes::WrongQuotes,
1010
};
@@ -40,6 +40,7 @@ impl LintSet {
4040
.add_unclosed_quotes()
4141
.add_sentence_capitalization()
4242
.add_spell_check(dictionary)
43+
.add_matcher()
4344
.add_spaces();
4445
self
4546
}
@@ -99,5 +100,6 @@ create_simple_builder_methods!(
99100
WrongQuotes,
100101
LongSentences,
101102
RepeatedWords,
102-
Spaces
103+
Spaces,
104+
Matcher
103105
);

harper-core/src/linting/matcher.rs

Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
use crate::{
2+
spell::DictWord, Document, Lint, LintKind, Linter, Punctuation, Span, Suggestion, Token,
3+
TokenKind,
4+
};
5+
6+
/// One element of a rule pattern: a token kind plus, for words, the exact
/// characters that must appear.
///
/// Equality is derived, so a candidate token matches a pattern element only
/// when both `kind` and `content` are equal.
#[derive(Debug, PartialEq, PartialOrd, Clone)]
7+
struct PatternToken {
8+
/// The kind of token this element stands for.
kind: TokenKind,
9+
/// The word's characters when the element is a word; `None` otherwise.
content: Option<DictWord>,
10+
}
11+
12+
impl PatternToken {
13+
/// Build the pattern element that describes `token`, so it can be compared
/// against the elements of a rule.
///
/// Word tokens carry their text, read from `document` through the token's
/// span; every other kind of token keeps only its kind and gets `content: None`.
fn from_token(token: Token, document: &Document) -> Self {
14+
if token.kind.is_word() {
15+
Self {
16+
kind: token.kind,
17+
content: Some(document.get_span_content(token.span).into()),
18+
}
19+
} else {
20+
Self {
21+
kind: token.kind,
22+
content: None,
23+
}
24+
}
25+
}
26+
}
27+
28+
/// Expands a string literal into a collection of its `char`s.
///
/// The concrete collection type is inferred at the use site.
macro_rules! vecword {
29+
($lit:literal) => {
30+
$lit.chars().collect()
31+
};
32+
}
33+
34+
/// Shorthand for building single pattern tokens and whole rule tables.
///
/// * `pt!("word")` expands to a word token with exactly that text.
/// * `pt!(Hyphen)` expands to a hyphen punctuation token.
/// * `pt!(Space)` expands to a token of kind `TokenKind::Space(1)`.
/// * `pt!("a","b" => "ab", ...)` expands to a `Vec<Rule>`; each rule's pattern
///   is the listed words separated by `Space` tokens, and its replacement is
///   the text after `=>`.
macro_rules! pt {
35+
// A word that must match character-for-character.
($str:literal) => {
36+
PatternToken {
37+
kind: TokenKind::Word,
38+
content: Some($str.chars().collect()),
39+
}
40+
};
41+
// A hyphen; punctuation carries no content.
(Hyphen) => {
42+
PatternToken {
43+
kind: TokenKind::Punctuation(Punctuation::Hyphen),
44+
content: None,
45+
}
46+
};
47+
// A space token whose kind is exactly `Space(1)`.
(Space) => {
48+
PatternToken {
49+
kind: TokenKind::Space(1),
50+
content: None,
51+
}
52+
};
53+
// A comma-separated table of `words... => replacement` rules.
( $($($str:literal),* => $repl:literal),*) => {
54+
vec![
55+
$(
56+
{
57+
let mut rule = Rule {
58+
pattern: vec![$(
59+
pt!($str),
60+
pt!(Space),
61+
)*],
62+
replace_with: $repl.chars().collect()
63+
};
64+
65+
// The repetition above emits a Space after every word; drop the
// trailing one so the pattern ends on the last word.
if rule.pattern.len() > 0{
66+
rule.pattern.pop();
67+
}
68+
69+
rule
70+
},
71+
)*
72+
]
73+
};
74+
}
75+
76+
/// A single curated correction: a token sequence to look for and the text to
/// suggest in its place.
struct Rule {
77+
/// The exact sequence of tokens that triggers the rule.
pattern: Vec<PatternToken>,
78+
/// The characters offered as the replacement for the matched span.
replace_with: Vec<char>,
79+
}
80+
81+
/// A linter that uses a variety of curated pattern matches to find and fix common
82+
/// grammatical issues.
83+
pub struct Matcher {
84+
/// Every rule that is tried at each token position while linting.
triggers: Vec<Rule>,
85+
}
86+
87+
impl Matcher {
88+
/// Create a matcher loaded with the built-in rule table.
///
/// Most rules are written with the `pt!` table syntax, where the words on
/// the left of `=>` must appear separated by single-space tokens and the
/// text on the right is the suggested replacement. Words are compared
/// exactly, so each capitalization variant needs its own entry.
pub fn new() -> Self {
89+
let mut triggers = pt! {
90+
"There","fore" => "Therefore",
91+
"south","America" => "South America",
92+
"South","america" => "South America",
93+
"south","america" => "South America",
94+
"North","america" => "North America",
95+
"north","America" => "North America",
96+
"north","america" => "North America",
97+
"fatal","outcome" => "death",
98+
"geiger","counter" => "Geiger counter",
99+
"veterans","day" => "Veterans Day",
100+
"presidents","day" => "Presidents' Day",
101+
"president's","day" => "Presidents' Day",
102+
"valentines","day" => "Valentine's Day",
103+
// NOTE(review): every element built by this table is a Word token; if
// the lexer does not classify "2" as a word this rule can never
// match — confirm.
"world","war","2" => "World War II",
104+
"World","war","ii" => "World War II",
105+
"world","War","ii" => "World War II",
106+
"World","War","Ii" => "World War II",
107+
"World","War","iI" => "World War II",
108+
"black","sea" => "Black Sea",
109+
"I","a","m" => "I am",
110+
"We","a","re" => "We are",
111+
"The","re" => "There",
112+
"my","french" => "my French",
113+
"It","cam" => "It can",
114+
"can","be","seem" => "can be seen",
115+
"mu","house" => "my house",
116+
"kid","regards" => "kind regards",
117+
"miss","understand" => "misunderstand",
118+
"miss","use" => "misuse",
119+
"miss","used" => "misused",
120+
"bee","there" => "been there",
121+
"want","be" => "won't be",
122+
"more","then" => "more than",
123+
"gong","to" => "going to",
124+
"then","others" => "than others",
125+
"then","before" => "than before",
126+
"then","last","week" => "than last week",
127+
"then","her" => "than her",
128+
"then","hers" => "than hers",
129+
"then","him" => "than him",
130+
"then","his" => "than his"
131+
};
132+
133+
// The table syntax always joins words with Space tokens, so a pattern
// containing a hyphen has to be assembled by hand.
// NOTE(review): the replacement text reads the same as the pattern
// ("break", hyphen, "up") — confirm the intended suggestion.
triggers.push(Rule {
134+
pattern: vec![pt!("break"), pt!(Hyphen), pt!("up")],
135+
replace_with: vecword!("break-up"),
136+
});
137+
138+
Self { triggers }
139+
}
140+
}
141+
142+
/// `Default` defers to [`Matcher::new`], yielding the built-in rule table.
impl Default for Matcher {
143+
fn default() -> Self {
144+
Self::new()
145+
}
146+
}
147+
148+
impl Linter for Matcher {
149+
/// Try every rule at every token position of `document` and report each
/// place where a rule's whole pattern matches.
///
/// Each reported lint spans from the start of the first matched token to
/// the end of the last one, has kind `LintKind::Miscellaneous`, and carries
/// a single `ReplaceWith` suggestion holding the rule's replacement text.
/// Positions are not skipped after a match, so every rule that matches at
/// any position produces its own lint.
fn lint(&mut self, document: &Document) -> Vec<Lint> {
150+
let mut lints = Vec::new();
151+
152+
// Only the position is needed here; tokens are re-fetched by index below.
for (index, _) in document.tokens().enumerate() {
153+
for trigger in &self.triggers {
154+
// Tokens matched so far for this rule at this starting position.
let mut match_tokens = Vec::new();
155+
156+
for (p_index, pattern) in trigger.pattern.iter().enumerate() {
157+
// Running off the end of the document means the rule cannot match here.
let Some(token) = document.get_token(index + p_index) else {
158+
break;
159+
};
160+
161+
let t_pattern = PatternToken::from_token(token, document);
162+
163+
// Stop at the first token that differs from the pattern element.
if t_pattern != *pattern {
164+
break;
165+
}
166+
167+
match_tokens.push(token);
168+
}
169+
170+
// A full match means every pattern element was consumed; the emptiness
// check also makes the `unwrap` calls below safe.
if match_tokens.len() == trigger.pattern.len() && !match_tokens.is_empty() {
171+
let span = Span::new(
172+
match_tokens.first().unwrap().span.start,
173+
match_tokens.last().unwrap().span.end,
174+
);
175+
176+
lints.push(Lint {
177+
span,
178+
lint_kind: LintKind::Miscellaneous,
179+
suggestions: vec![Suggestion::ReplaceWith(trigger.replace_with.to_owned())],
180+
message: format!(
181+
"Did you mean “{}”?",
182+
trigger.replace_with.iter().collect::<String>()
183+
),
184+
})
185+
}
186+
}
187+
}
188+
189+
lints
190+
}
191+
}
192+
193+
// Unit tests for the curated pattern matcher.
#[cfg(test)]
194+
mod tests {
195+
use crate::{Document, Linter};
196+
197+
use super::Matcher;
198+
199+
// "There fore." should produce exactly one lint from the built-in rules.
#[test]
200+
fn matches_therefore() {
201+
let document = Document::new_plain_english("There fore.");
202+
let mut matcher = Matcher::new();
203+
let lints = matcher.lint(&document);
204+
assert!(lints.len() == 1)
205+
}
206+
}

harper-core/src/linting/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
mod lint;
22
mod lint_set;
33
mod long_sentences;
4+
mod matcher;
45
mod repeated_words;
56
mod sentence_capitalization;
67
mod spaces;

harper-core/src/linting/repeated_words.rs

Lines changed: 25 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
use hashbrown::HashSet;
2+
use smallvec::smallvec;
23

34
use crate::{
5+
spell::DictWord,
46
token::{Token, TokenKind, TokenStringExt},
57
Document, Span, Suggestion,
68
};
@@ -10,38 +12,35 @@ use super::{Lint, LintKind, Linter};
1012
#[derive(Debug, Clone)]
1113
pub struct RepeatedWords {
1214
/// The set of words that can be considered for repetition checking.
13-
set: HashSet<Vec<char>>,
15+
set: HashSet<DictWord>,
1416
}
1517

1618
impl RepeatedWords {
1719
pub fn new() -> Self {
1820
let mut set = HashSet::new();
1921

20-
set.insert(vec!['t', 'h', 'e']);
21-
set.insert(vec!['T', 'h', 'e']);
22-
set.insert(vec!['a']);
23-
set.insert(vec!['A']);
24-
set.insert(vec!['a', 'n']);
25-
set.insert(vec!['A', 'n']);
26-
set.insert(vec!['i', 's']);
27-
set.insert(vec!['I', 's']);
28-
set.insert(vec!['w', 'i', 'l', 'l']);
29-
set.insert(vec!['W', 'i', 'l', 'l']);
30-
set.insert(vec!['l', 'i', 'k', 'e']);
31-
set.insert(vec!['L', 'i', 'k', 'e']);
32-
set.insert(vec!['t', 'h', 'a', 't']);
33-
set.insert(vec!['T', 'h', 'a', 't']);
34-
set.insert(vec!['w', 'h', 'a', 't']);
35-
set.insert(vec!['W', 'h', 'a', 't']);
36-
set.insert(vec!['w', 'h', 'i', 'c', 'h']);
37-
set.insert(vec!['W', 'h', 'i', 'c', 'h']);
38-
set.insert(vec!['b', 'e']);
39-
set.insert(vec!['B', 'e']);
40-
set.insert(vec!['a', 'n', 'd']);
41-
set.insert(vec!['A', 'n', 'd']);
42-
set.insert(vec!['I']);
43-
set.insert(vec!['a', 't']);
44-
set.insert(vec!['A', 't']);
22+
macro_rules! add_set {
23+
($lit:literal) => {
24+
set.insert($lit.chars().collect());
25+
};
26+
($($lit:literal),*) => {
27+
$(
28+
add_set!($lit);
29+
)*
30+
}
31+
}
32+
33+
add_set!(
34+
"the", "be", "to", "of", "and", "a", "in", "that", "have", "I", "it", "for", "not",
35+
"on", "with", "he", "as", "you", "do", "at", "this", "but", "his", "by", "from",
36+
"they", "we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would",
37+
"there", "their", "what", "so", "up", "out", "if", "about", "who", "get", "which",
38+
"go", "me", "when", "make", "can", "like", "time", "no", "just", "him", "know", "take",
39+
"people", "into", "year", "your", "good", "some", "could", "them", "see", "other",
40+
"than", "then", "now", "look", "only", "come", "its", "over", "think", "also", "back",
41+
"after", "use", "two", "how", "our", "work", "first", "well", "way", "even", "new",
42+
"want", "because", "any", "these", "give", "day", "most", "us"
43+
);
4544

4645
Self { set }
4746
}

harper-core/src/spell/mod.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ pub use self::dictionary::Dictionary;
66
mod dictionary;
77
mod hunspell;
88

9-
type DictWord = SmallVec<[char; 6]>;
9+
/// A word from a dictionary or other similar structure.
10+
pub type DictWord = SmallVec<[char; 6]>;
1011

1112
/// Suggest a correct spelling for a given misspelled word.
1213
/// [`misspelled_word`] is assumed to be quite small (n < 100).

harper-core/src/token.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,13 @@ impl Token {
2323
}
2424

2525
/// A [`Token`] that holds its content as a fat [`Vec<char>`] rather than as a [`Span`].
26-
#[derive(Debug, Clone, Serialize, Deserialize)]
26+
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, PartialOrd)]
2727
pub struct FatToken {
2828
pub content: Vec<char>,
2929
pub kind: TokenKind,
3030
}
3131

32-
#[derive(Debug, Is, Clone, Copy, Serialize, Deserialize, PartialEq, Default)]
32+
#[derive(Debug, Is, Clone, Copy, Serialize, Deserialize, PartialEq, Default, PartialOrd)]
3333
#[serde(tag = "kind", content = "value")]
3434
pub enum TokenKind {
3535
#[default]
@@ -59,7 +59,7 @@ impl TokenKind {
5959
}
6060
}
6161

62-
#[derive(Debug, Is, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
62+
#[derive(Debug, Is, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd)]
6363
#[serde(tag = "kind")]
6464
pub enum Punctuation {
6565
/// .
@@ -104,7 +104,7 @@ pub enum Punctuation {
104104
Equal,
105105
}
106106

107-
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
107+
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, PartialOrd)]
108108
pub struct Quote {
109109
/// The location of the matching quote, if it exists.
110110
pub twin_loc: Option<usize>,

0 commit comments

Comments
 (0)