Skip to content

Commit 9227913

Browse files
committed
fix(Automattic#228): add stricter guards to ensure the text being linted is a complete sentence
1 parent 2314e6d commit 9227913

File tree

3 files changed

+107
-21
lines changed

3 files changed

+107
-21
lines changed

harper-core/src/document.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,7 @@ impl TokenStringExt for Document {
500500
create_fns_on_doc!(ellipsis);
501501
create_fns_on_doc!(unlintable);
502502
create_fns_on_doc!(sentence_terminator);
503+
create_fns_on_doc!(paragraph_break);
503504
create_fns_on_doc!(chunk_terminator);
504505
create_fns_on_doc!(punctuation);
505506
create_fns_on_doc!(likely_homograph);
@@ -528,6 +529,10 @@ impl TokenStringExt for Document {
528529
self.tokens.iter_chunks()
529530
}
530531

    /// Get an iterator over the paragraphs of this document, by delegating
    /// to the equivalent [`TokenStringExt::iter_paragraphs`] on the
    /// document's underlying token buffer.
    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }
531536
fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
532537
self.tokens.iter_sentences()
533538
}

harper-core/src/linting/sentence_capitalization.rs

Lines changed: 72 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ use itertools::Itertools;
33
use super::lint::Suggestion;
44
use super::{Lint, LintKind, Linter};
55
use crate::document::Document;
6-
use crate::TokenStringExt;
6+
use crate::{Token, TokenKind, TokenStringExt};
77

88
#[derive(Debug, Clone, Copy, Default)]
99
pub struct SentenceCapitalization;
@@ -14,26 +14,45 @@ impl Linter for SentenceCapitalization {
1414
fn lint(&mut self, document: &Document) -> Vec<Lint> {
1515
let mut lints = Vec::new();
1616

17-
for sentence in document.iter_sentences() {
18-
if let Some(first_word) = sentence.first_non_whitespace() {
19-
if !first_word.kind.is_word() {
17+
for paragraph in document.iter_paragraphs() {
18+
// Allows short, label-like comments in code.
19+
if paragraph.iter_sentences().count() == 1 {
20+
let only_sentence = paragraph.iter_sentences().next().unwrap();
21+
22+
if !only_sentence
23+
.iter_chunks()
24+
.map(|c| c.iter_words().count())
25+
.any(|c| c > 5)
26+
{
2027
continue;
2128
}
29+
}
30+
31+
for sentence in paragraph.iter_sentences() {
32+
if !is_full_sentence(sentence) {
33+
continue;
34+
}
35+
36+
if let Some(first_word) = sentence.first_non_whitespace() {
37+
if !first_word.kind.is_word() {
38+
continue;
39+
}
2240

23-
let letters = document.get_span_content(first_word.span);
24-
25-
if let Some(first_letter) = letters.first() {
26-
if first_letter.is_alphabetic() && !first_letter.is_uppercase() {
27-
lints.push(Lint {
28-
span: first_word.span.with_len(1),
29-
lint_kind: LintKind::Capitalization,
30-
suggestions: vec![Suggestion::ReplaceWith(
31-
first_letter.to_uppercase().collect_vec(),
32-
)],
33-
priority: 31,
34-
message: "This sentence does not start with a capital letter"
35-
.to_string(),
36-
})
41+
let letters = document.get_span_content(first_word.span);
42+
43+
if let Some(first_letter) = letters.first() {
44+
if first_letter.is_alphabetic() && !first_letter.is_uppercase() {
45+
lints.push(Lint {
46+
span: first_word.span.with_len(1),
47+
lint_kind: LintKind::Capitalization,
48+
suggestions: vec![Suggestion::ReplaceWith(
49+
first_letter.to_uppercase().collect_vec(),
50+
)],
51+
priority: 31,
52+
message: "This sentence does not start with a capital letter"
53+
.to_string(),
54+
})
55+
}
3756
}
3857
}
3958
}
@@ -43,25 +62,52 @@ impl Linter for SentenceCapitalization {
4362
}
4463
}
4564

65+
fn is_full_sentence(toks: &[Token]) -> bool {
66+
let mut has_noun = false;
67+
let mut has_verb = false;
68+
69+
for tok in toks {
70+
if let TokenKind::Word(metadata) = tok.kind {
71+
if metadata.is_noun() {
72+
has_noun = true;
73+
}
74+
75+
if metadata.is_verb() {
76+
has_verb = true;
77+
}
78+
}
79+
}
80+
81+
has_noun && has_verb
82+
}
83+
4684
#[cfg(test)]
4785
mod tests {
4886
use super::super::tests::assert_lint_count;
4987
use super::SentenceCapitalization;
5088

5189
#[test]
5290
fn catches_basic() {
53-
assert_lint_count("there is no way.", SentenceCapitalization, 1)
91+
assert_lint_count(
92+
"there is no way she is not guilty.",
93+
SentenceCapitalization,
94+
1,
95+
)
5496
}
5597

5698
#[test]
5799
fn no_period() {
58-
assert_lint_count("there is no way", SentenceCapitalization, 1)
100+
assert_lint_count(
101+
"there is no way she is not guilty",
102+
SentenceCapitalization,
103+
1,
104+
)
59105
}
60106

61107
#[test]
62108
fn two_sentence() {
63109
assert_lint_count(
64-
"i have complete conviction. she is guilty",
110+
"i have complete conviction in this. she is absolutely guilty",
65111
SentenceCapitalization,
66112
2,
67113
)
@@ -111,4 +157,9 @@ mod tests {
111157
1,
112158
)
113159
}
160+
161+
#[test]
162+
fn issue_228_allows_labels() {
163+
assert_lint_count("python lsp (fork of pyright)", SentenceCapitalization, 0)
164+
}
114165
}

harper-core/src/token.rs

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -403,6 +403,7 @@ pub trait TokenStringExt {
403403
create_decl_for!(ellipsis);
404404
create_decl_for!(unlintable);
405405
create_decl_for!(sentence_terminator);
406+
create_decl_for!(paragraph_break);
406407
create_decl_for!(chunk_terminator);
407408
create_decl_for!(punctuation);
408409
create_decl_for!(likely_homograph);
@@ -420,6 +421,10 @@ pub trait TokenStringExt {
420421
/// ```
421422
fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_;
422423

424+
/// Get an iterator over token slices that represent the individual
425+
/// paragraphs in a document.
426+
fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_;
427+
423428
/// Get an iterator over token slices that represent the individual
424429
/// sentences in a document.
425430
fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_;
@@ -437,6 +442,7 @@ impl TokenStringExt for [Token] {
437442
create_fns_for!(ellipsis);
438443
create_fns_for!(unlintable);
439444
create_fns_for!(sentence_terminator);
445+
create_fns_for!(paragraph_break);
440446
create_fns_for!(chunk_terminator);
441447
create_fns_for!(likely_homograph);
442448

@@ -501,6 +507,30 @@ impl TokenStringExt for [Token] {
501507
first_chunk.into_iter().chain(rest).chain(last)
502508
}
503509

510+
fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
511+
let first_pg = self
512+
.iter_paragraph_break_indices()
513+
.next()
514+
.map(|first_term| &self[0..=first_term]);
515+
516+
let rest = self
517+
.iter_paragraph_break_indices()
518+
.tuple_windows()
519+
.map(move |(a, b)| &self[a + 1..=b]);
520+
521+
let last_pg = if let Some(last_i) = self.last_paragraph_break_index() {
522+
if last_i + 1 < self.len() {
523+
Some(&self[last_i + 1..])
524+
} else {
525+
None
526+
}
527+
} else {
528+
Some(self)
529+
};
530+
531+
first_pg.into_iter().chain(rest).chain(last_pg)
532+
}
533+
504534
fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
505535
let first_sentence = self
506536
.iter_sentence_terminator_indices()

0 commit comments

Comments
 (0)