@@ -13,6 +13,52 @@ pub struct FoundToken {
     pub token: TokenKind,
 }
 
+/// Same as [`lex_to_end`], but with additional infrastructure to intelligently ignore Markdown.
+pub fn lex_to_end_md(source: &[char]) -> Vec<Token> {
+    let source_str: String = source.iter().collect();
+    let md_parser = pulldown_cmark::Parser::new(&source_str);
+
+    let mut tokens = Vec::new();
+
+    let mut traversed_bytes = 0;
+    let mut traversed_chars = 0;
+
+    // NOTE: the range spits out __byte__ indices, not char indices.
+    // This is why we keep track of both above.
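+    // (A multi-byte example: 'é' is one char but two bytes in UTF-8, so the
+    // two offsets diverge as soon as the source contains non-ASCII text.)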
+    for (event, range) in md_parser.into_offset_iter() {
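+        // Only plain text events get lexed; the surrounding Markdown syntax
+        // (emphasis markers, link URLs, etc.) never reaches the lexer.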
+        if let pulldown_cmark::Event::Text(text) = event {
+            traversed_chars += source_str[traversed_bytes..range.start].chars().count();
+            traversed_bytes = range.start;
+
+            let mut new_tokens = lex_to_end_str(text);
+
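+            // Spans from `lex_to_end_str` are relative to this text event;
+            // shift them so they index into the original source.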
+            new_tokens
+                .iter_mut()
+                .for_each(|token| token.span.offset(traversed_chars));
+
+            tokens.append(&mut new_tokens);
+        }
+    }
+
+    tokens
+}
+
+/// Same as [`lex_to_end_str`], but with additional infrastructure to intelligently ignore Markdown.
+///
+/// Yes, I am aware this implementation is doubly redundant, but I prefer to have a consistent API.
+/// If it's an issue, we can use a different Markdown parser.
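+///
+/// A rough sketch of the intent, mirroring the `sentence_md` test below (not a
+/// compiled doctest, since the crate-root paths here are assumptions):
+///
+/// ```ignore
+/// let kinds: Vec<_> = lex_to_end_md_str("__hello__ world")
+///     .into_iter()
+///     .map(|token| token.kind)
+///     .collect();
+/// // The `__` emphasis markers produce no tokens; only the text is lexed.
+/// assert_eq!(kinds, vec![TokenKind::Word, TokenKind::Space(1), TokenKind::Word]);
+/// ```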
+pub fn lex_to_end_md_str(source: impl AsRef<str>) -> Vec<Token> {
+    let r = source.as_ref();
+
+    let chars: Vec<_> = r.chars().collect();
+
+    lex_to_end_md(&chars)
+}
+
 pub fn lex_to_end_str(source: impl AsRef<str>) -> Vec<Token> {
     let r = source.as_ref();
 
@@ -200,26 +246,34 @@ fn lex_quote(source: &[char]) -> Option<FoundToken> {
 
 #[cfg(test)]
 mod tests {
+    use super::{lex_to_end_md_str, lex_to_end_str};
     use crate::{
-        lex_to_end_str, Punctuation,
+        Punctuation,
         TokenKind::{self, *},
     };
 
-    fn assert_tokens_eq(test_str: impl AsRef<str>, expected: &[TokenKind]) {
+    fn assert_tokens_eq_plain(test_str: impl AsRef<str>, expected: &[TokenKind]) {
         let tokens = lex_to_end_str(test_str);
         let kinds: Vec<_> = tokens.into_iter().map(|v| v.kind).collect();
 
         assert_eq!(&kinds, expected)
     }
 
+    fn assert_tokens_eq_md(test_str: impl AsRef<str>, expected: &[TokenKind]) {
+        let tokens = lex_to_end_md_str(test_str);
+        let kinds: Vec<_> = tokens.into_iter().map(|v| v.kind).collect();
+
+        assert_eq!(&kinds, expected)
+    }
+
     #[test]
     fn single_letter() {
-        assert_tokens_eq("a", &[Word])
+        assert_tokens_eq_plain("a", &[Word])
     }
 
     #[test]
     fn sentence() {
-        assert_tokens_eq(
+        assert_tokens_eq_plain(
             "hello world, my friend",
             &[
                 Word,
@@ -233,4 +287,21 @@ mod tests {
             ],
         )
     }
+
+    #[test]
+    fn sentence_md() {
+        assert_tokens_eq_md(
+            "__hello__ world, [my]() friend",
+            &[
+                Word,
+                Space(1),
+                Word,
+                Punctuation(Punctuation::Comma),
+                Space(1),
+                Word,
+                Space(1),
+                Word,
+            ],
+        );
+    }
 }