Commit b3b6346
Added Markdown support
1 parent bdba195

11 files changed: +167 -31 lines

Cargo.lock

Lines changed: 37 additions & 0 deletions
Some generated files are not rendered by default.

english_words.txt

Lines changed: 0 additions & 2 deletions
@@ -365403,8 +365403,6 @@ whuther
 whutter
 whuttering
 whuz
-wi
-wy
 wyandot
 wyandotte
 wibble

harper-core/Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -8,6 +8,7 @@ ahash = "0.8.7"
 is-macro = "0.3.0"
 itertools = "0.11.0"
 once_cell = "1.19.0"
+pulldown-cmark = "0.9.3"
 serde = { version = "1.0.190", features = ["derive"] }
 smallvec = "1.12.0"

harper-core/src/document.rs

Lines changed: 26 additions & 8 deletions
@@ -5,6 +5,7 @@ use itertools::Itertools;
 use crate::{
     lex_to_end,
     linting::Suggestion,
+    parsing::lex_to_end_md,
     span::Span,
     FatToken,
     Punctuation::{self},
@@ -14,15 +15,21 @@ use crate::{
 pub struct Document {
     source: Vec<char>,
     tokens: Vec<Token>,
+    markdown: bool,
 }
 
 impl Document {
     // Lexes and parses text to produce a document.
-    pub fn new(text: &str) -> Self {
+    //
+    // Choosing to parse with markdown may have a performance penalty
+    pub fn new(text: &str, markdown: bool) -> Self {
         let source: Vec<_> = text.chars().collect();
-        let tokens = lex_to_end(&source);
 
-        let mut doc = Self { source, tokens };
+        let mut doc = Self {
+            source,
+            tokens: Vec::new(),
+            markdown,
+        };
         doc.parse();
 
         doc
@@ -32,6 +39,12 @@ impl Document {
     ///
     /// Should be run after every change to the underlying [`Self::source`].
     fn parse(&mut self) {
+        if self.markdown {
+            self.tokens = lex_to_end_md(&self.source);
+        } else {
+            self.tokens = lex_to_end(&self.source);
+        }
+
         self.match_quotes();
     }
 
@@ -182,21 +195,26 @@ mod tests {
     use crate::Token;
 
     impl Document {
-        fn from_raw_parts(source: Vec<char>, tokens: Vec<Token>) -> Self {
-            Self { source, tokens }
+        fn from_raw_parts(source: Vec<char>, tokens: Vec<Token>, markdown: bool) -> Self {
+            Self {
+                source,
+                tokens,
+                markdown,
+            }
         }
     }
 
     #[test]
-    fn parses_sentances_correctly() {
+    fn parses_sentences_correctly() {
         let text = "There were three little pigs. They built three little homes.";
-        let document = Document::new(text);
+        let document = Document::new(text, false);
 
         let mut sentence_strs = vec![];
 
         for sentence in document.sentences() {
             sentence_strs.push(
-                Document::from_raw_parts(document.source.clone(), sentence.to_vec()).to_string(),
+                Document::from_raw_parts(document.source.clone(), sentence.to_vec(), false)
+                    .to_string(),
             );
         }

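The boolean flag on Document::new is the only API change downstream crates see. A minimal sketch of the two call styles, assuming only that harper_core re-exports Document at the crate root (as harper-serve's imports below show):

use harper_core::Document;

fn main() {
    // Plain-text mode: Markdown markup such as `**` is lexed literally.
    let _plain = Document::new("**three** little pigs", false);

    // Markdown mode: pulldown-cmark filters the markup out before lexing,
    // at a possible performance cost (per the doc comment on `new`).
    let _markdown = Document::new("**three** little pigs", true);
}

Both values expose the same token and sentence API afterwards; only the lexing pass differs.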
harper-core/src/parsing/lexer.rs

Lines changed: 75 additions & 4 deletions
@@ -13,6 +13,52 @@ pub struct FoundToken {
     pub token: TokenKind,
 }
 
+/// Same as [`lex_to_end`], but with additional infrastructure to intelligently ignore Markdown.
+pub fn lex_to_end_md(source: &[char]) -> Vec<Token> {
+    let source_str: String = source.iter().collect();
+    let md_parser = pulldown_cmark::Parser::new(&source_str);
+
+    let mut tokens = Vec::new();
+
+    let mut traversed_bytes = 0;
+    let mut traversed_chars = 0;
+
+    // NOTE: the range spits out __byte__ indices, not char indices.
+    // This is why we keep track above.
+    for (event, range) in md_parser.into_offset_iter() {
+        if let pulldown_cmark::Event::Text(text) = event {
+            traversed_chars += source_str[traversed_bytes..range.start].chars().count();
+            traversed_bytes = range.start;
+
+            let mut new_tokens = lex_to_end_str(text);
+
+            new_tokens
+                .iter_mut()
+                .for_each(|token| token.span.offset(traversed_chars));
+
+            for token in new_tokens.iter() {
+                dbg!(token.span);
+            }
+
+            tokens.append(&mut new_tokens);
+        }
+    }
+
+    tokens
+}
+
+/// Same as [`lex_to_end_str`], but with additional infrastructure to intelligently ignore Markdown.
+///
+/// Yes, I am aware this implementation is doubly redundant, but I prefer to have a consistent API.
+/// If its an issue, we can use a different markdown parser.
+pub fn lex_to_end_md_str(source: impl AsRef<str>) -> Vec<Token> {
+    let r = source.as_ref();
+
+    let chars: Vec<_> = r.chars().collect();
+
+    lex_to_end_md(&chars)
+}
+
 pub fn lex_to_end_str(source: impl AsRef<str>) -> Vec<Token> {
     let r = source.as_ref();
 
@@ -200,26 +246,34 @@ fn lex_quote(source: &[char]) -> Option<FoundToken> {
 
 #[cfg(test)]
 mod tests {
+    use super::{lex_to_end_md_str, lex_to_end_str};
     use crate::{
-        lex_to_end_str, Punctuation,
+        Punctuation,
         TokenKind::{self, *},
     };
 
-    fn assert_tokens_eq(test_str: impl AsRef<str>, expected: &[TokenKind]) {
+    fn assert_tokens_eq_plain(test_str: impl AsRef<str>, expected: &[TokenKind]) {
         let tokens = lex_to_end_str(test_str);
         let kinds: Vec<_> = tokens.into_iter().map(|v| v.kind).collect();
 
         assert_eq!(&kinds, expected)
     }
 
+    fn assert_tokens_eq_md(test_str: impl AsRef<str>, expected: &[TokenKind]) {
+        let tokens = lex_to_end_md_str(test_str);
+        let kinds: Vec<_> = tokens.into_iter().map(|v| v.kind).collect();
+
+        assert_eq!(&kinds, expected)
+    }
+
     #[test]
     fn single_letter() {
-        assert_tokens_eq("a", &[Word])
+        assert_tokens_eq_plain("a", &[Word])
     }
 
     #[test]
     fn sentence() {
-        assert_tokens_eq(
+        assert_tokens_eq_plain(
             "hello world, my friend",
             &[
                 Word,
@@ -233,4 +287,21 @@ mod tests {
             ],
         )
     }
+
+    #[test]
+    fn sentence_md() {
+        assert_tokens_eq_md(
+            "__hello__ world, [my]() friend",
+            &[
+                Word,
+                Space(1),
+                Word,
+                Punctuation(Punctuation::Comma),
+                Space(1),
+                Word,
+                Space(1),
+                Word,
+            ],
+        );
+    }
 }

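The subtle part of lex_to_end_md above is the index bookkeeping: pulldown-cmark's into_offset_iter() reports byte ranges into the source string, while Token spans are char indices into the Vec<char> source. A standalone sketch of that conversion, with a hypothetical input containing a multi-byte character:

use pulldown_cmark::{Event, Parser};

fn main() {
    // "née" holds a two-byte `é`, so byte and char offsets diverge.
    let source = "*née* and more";

    let mut traversed_bytes = 0;
    let mut traversed_chars = 0;

    for (event, range) in Parser::new(source).into_offset_iter() {
        if let Event::Text(text) = event {
            // Count the chars in the bytes skipped since the last text
            // event to turn the byte offset into a char offset.
            traversed_chars += source[traversed_bytes..range.start].chars().count();
            traversed_bytes = range.start;

            println!("text {:?} starts at char {}", text, traversed_chars);
        }
    }
}

Running this prints char offsets 1 and 5 for the two text events, which is exactly the shift lex_to_end_md applies to each token via Span::offset.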
harper-core/src/parsing/mod.rs

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 mod lexer;
 mod token;
 
-pub use lexer::{lex_to_end, lex_to_end_str};
+pub use lexer::{lex_to_end, lex_to_end_md, lex_to_end_md_str, lex_to_end_str};
 pub use token::{FatToken, Punctuation, Quote, Token, TokenKind, TokenStringExt};

harper-core/src/span.rs

Lines changed: 7 additions & 0 deletions
@@ -27,6 +27,7 @@ impl Span {
     pub fn get_content<'a>(&self, source: &'a [char]) -> &'a [char] {
         if cfg!(debug_assertions) {
             assert!(self.start < self.end);
+            assert!(self.start < source.len());
             assert!(self.end <= source.len());
         }
 
@@ -46,6 +47,12 @@ impl Span {
         cloned.set_len(length);
         cloned
     }
+
+    // Add an amount to both [`Self::start`] and [`Self::end`]
+    pub fn offset(&mut self, by: usize) {
+        self.start += by;
+        self.end += by;
+    }
 }
 
 #[cfg(test)]

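Span::offset is what re-anchors tokens lexed from an isolated Markdown text fragment back into the coordinates of the whole document. A minimal sketch, assuming Span exposes a new(start, end) constructor (not shown in this diff):

use harper_core::Span;

fn main() {
    // A token lexed from a fragment covers chars 0..5 of that fragment.
    let mut span = Span::new(0, 5);

    // The fragment begins at char 12 of the full document, so shift
    // the span into document coordinates.
    span.offset(12);
    // `span` now covers chars 12..17 of the full source.
}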
harper-ls/src/diagnostics.rs

Lines changed: 1 addition & 1 deletion
@@ -77,7 +77,7 @@ fn open_url(url: &Url) -> Result<String> {
 
 #[cached::proc_macro::cached]
 fn lint_string(text: String) -> Vec<Lint> {
-    let document = Document::new(&text);
+    let document = Document::new(&text, true);
     let dictionary = Dictionary::new();
     all_linters(&document, dictionary)
 }

harper-serve/src/main.rs

Lines changed: 7 additions & 5 deletions
@@ -3,7 +3,7 @@
 use harper_core::{all_linters, Dictionary, Document, FatToken, Lint, Span, Suggestion};
 use std::net::SocketAddr;
 use tokio::time::Instant;
-use tracing::{debug, info, Level};
+use tracing::{info, Level};
 use tracing_subscriber::FmtSubscriber;
 
 use axum::{
@@ -12,7 +12,7 @@ use axum::{
     http::StatusCode,
     middleware::{self, Next},
     response::Response,
-    routing::{get, post},
+    routing::post,
     Json, Router,
 };
 use serde::{Deserialize, Serialize};
@@ -41,6 +41,8 @@ async fn main() {
 }
 
 async fn timing_middleware(request: Request<Body>, next: Next<Body>) -> Response {
+    info!("Handling request at endpoint: {}", request.uri().path());
+
     let uri = request.uri().clone();
 
     let start = Instant::now();
@@ -69,7 +71,7 @@ async fn root() -> &'static str {
 async fn parse_text(Json(payload): Json<ParseRequest>) -> (StatusCode, Json<ParseResponse>) {
     let text = payload.text;
 
-    let document = Document::new(&text);
+    let document = Document::new(&text, true);
     let tokens: Vec<_> = document.fat_tokens().collect();
 
     (StatusCode::ACCEPTED, Json(ParseResponse { tokens }))
@@ -89,7 +91,7 @@ async fn lint(Json(payload): Json<LintRequest>) -> (StatusCode, Json<LintRespons
     let text = payload.text;
 
     let dictionary = Dictionary::new();
-    let document = Document::new(&text);
+    let document = Document::new(&text, true);
 
     let lints = all_linters(&document, dictionary);
 
@@ -110,7 +112,7 @@ async fn apply_suggestion(
     Json(payload): Json<ApplySuggestionRequest>,
 ) -> (StatusCode, Json<ApplySuggestionResponse>) {
     let text = payload.text;
-    let mut document = Document::new(&text);
+    let mut document = Document::new(&text, true);
     document.apply_suggestion(&payload.suggestion, payload.span);
 
     (
