Skip to content

Commit cbb8a46

Browse files
committed
Implement text_direction_codepoint lint
Detects Unicode BiDi control codepoints in Cargo.toml files to mitigate Trojan Source attacks (CVE-2021-42574). Features: - Detects 9 Unicode BiDi control codepoints - Groups multiple codepoints on same line into single error (like rustc) - Moves workspace check into lint function for cleaner architecture - Uses rustc-style diagnostics with codepoint info in label annotation - Includes explanatory note about BiDi codepoint risks - Deny-by-default lint level
1 parent d1bcb39 commit cbb8a46

12 files changed

Lines changed: 537 additions & 21 deletions

File tree

.DS_Store

8 KB
Binary file not shown.

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ thiserror = "2.0.17"
109109
time = { version = "0.3.44", features = ["parsing", "formatting", "serde"] }
110110
toml = { version = "0.9.10", default-features = false }
111111
toml_edit = { version = "0.24.0", features = ["serde"] }
112+
toml_parser = "1.0.6"
112113
tracing = { version = "0.1.44", default-features = false, features = ["std"] } # be compatible with rustc_log: https://github.com/rust-lang/rust/blob/e51e98dde6a/compiler/rustc_log/Cargo.toml#L9
113114
tracing-chrome = "0.7.2"
114115
tracing-subscriber = { version = "0.3.22", features = ["env-filter"] }
@@ -214,6 +215,7 @@ thiserror.workspace = true
214215
time.workspace = true
215216
toml = { workspace = true, features = ["std", "serde", "parse", "display", "preserve_order"] }
216217
toml_edit.workspace = true
218+
toml_parser.workspace = true
217219
tracing = { workspace = true, features = ["attributes"] }
218220
tracing-subscriber.workspace = true
219221
unicase.workspace = true

benches/.DS_Store

6 KB
Binary file not shown.

crates/.DS_Store

10 KB
Binary file not shown.

credential/.DS_Store

8 KB
Binary file not shown.

src/.DS_Store

6 KB
Binary file not shown.

src/cargo/core/workspace.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ use crate::lints::analyze_cargo_lints_table;
2525
use crate::lints::rules::blanket_hint_mostly_unused;
2626
use crate::lints::rules::check_im_a_teapot;
2727
use crate::lints::rules::implicit_minimum_version_req;
28+
use crate::lints::rules::text_direction_codepoint;
2829
use crate::ops;
2930
use crate::sources::{CRATES_IO_INDEX, CRATES_IO_REGISTRY, PathSource, SourceConfigMap};
3031
use crate::util::context::{FeatureUnification, Value};
@@ -1313,6 +1314,13 @@ impl<'gctx> Workspace<'gctx> {
13131314
&mut run_error_count,
13141315
self.gctx,
13151316
)?;
1317+
text_direction_codepoint(
1318+
pkg.into(),
1319+
&path,
1320+
&cargo_lints,
1321+
&mut run_error_count,
1322+
self.gctx,
1323+
)?;
13161324

13171325
if run_error_count > 0 {
13181326
let plural = if run_error_count == 1 { "" } else { "s" };
@@ -1370,6 +1378,13 @@ impl<'gctx> Workspace<'gctx> {
13701378
&mut run_error_count,
13711379
self.gctx,
13721380
)?;
1381+
text_direction_codepoint(
1382+
self.root_maybe().into(),
1383+
self.root_manifest(),
1384+
&cargo_lints,
1385+
&mut run_error_count,
1386+
self.gctx,
1387+
)?;
13731388
}
13741389

13751390
// This is a short term hack to allow `blanket_hint_mostly_unused`

src/cargo/lints/rules/mod.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,19 @@
11
mod blanket_hint_mostly_unused;
22
mod im_a_teapot;
33
mod implicit_minimum_version_req;
4+
mod text_direction_codepoint;
45
mod unknown_lints;
56

67
pub use blanket_hint_mostly_unused::blanket_hint_mostly_unused;
78
pub use im_a_teapot::check_im_a_teapot;
89
pub use implicit_minimum_version_req::implicit_minimum_version_req;
10+
pub use text_direction_codepoint::text_direction_codepoint;
911
pub use unknown_lints::output_unknown_lints;
1012

1113
pub const LINTS: &[crate::lints::Lint] = &[
1214
blanket_hint_mostly_unused::LINT,
1315
implicit_minimum_version_req::LINT,
1416
im_a_teapot::LINT,
17+
text_direction_codepoint::LINT,
1518
unknown_lints::LINT,
1619
];
Lines changed: 282 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,282 @@
1+
use std::path::Path;
2+
3+
use annotate_snippets::AnnotationKind;
4+
use annotate_snippets::Group;
5+
use annotate_snippets::Level;
6+
use annotate_snippets::Snippet;
7+
use cargo_util_schemas::manifest::TomlToolLints;
8+
use toml_parser::Source;
9+
use toml_parser::Span;
10+
use toml_parser::parser::EventReceiver;
11+
12+
use crate::CargoResult;
13+
use crate::GlobalContext;
14+
use crate::core::MaybePackage;
15+
use crate::lints::Lint;
16+
use crate::lints::LintLevel;
17+
use crate::lints::ManifestFor;
18+
use crate::lints::rel_cwd_manifest_path;
19+
20+
const UNICODE_BIDI_CODEPOINTS: &[(char, &str)] = &[
21+
('\u{202A}', "LEFT-TO-RIGHT EMBEDDING"),
22+
('\u{202B}', "RIGHT-TO-LEFT EMBEDDING"),
23+
('\u{202C}', "POP DIRECTIONAL FORMATTING"),
24+
('\u{202D}', "LEFT-TO-RIGHT OVERRIDE"),
25+
('\u{202E}', "RIGHT-TO-LEFT OVERRIDE"),
26+
('\u{2066}', "LEFT-TO-RIGHT ISOLATE"),
27+
('\u{2067}', "RIGHT-TO-LEFT ISOLATE"),
28+
('\u{2068}', "FIRST STRONG ISOLATE"),
29+
('\u{2069}', "POP DIRECTIONAL ISOLATE"),
30+
];
31+
32+
pub const LINT: Lint = Lint {
33+
name: "text_direction_codepoint",
34+
desc: "unicode codepoint changing visible direction of text present in manifest",
35+
groups: &[],
36+
default_level: LintLevel::Deny,
37+
edition_lint_opts: None,
38+
feature_gate: None,
39+
docs: None,
40+
};
41+
42+
/// Paths where BiDi codepoints are allowed (legitimate RTL content).
43+
const ALLOWED_BIDI_PATHS: &[&[&str]] = &[
44+
&["package", "description"],
45+
&["package", "metadata"],
46+
&["workspace", "metadata"],
47+
];
48+
49+
fn is_allowed_bidi_path(path: &[String]) -> bool {
50+
ALLOWED_BIDI_PATHS.iter().any(|allowed| {
51+
if path.len() < allowed.len() {
52+
return false;
53+
}
54+
path.iter()
55+
.zip(allowed.iter())
56+
.all(|(a, b)| a.as_str() == *b)
57+
})
58+
}
59+
60+
struct Finding {
61+
byte_idx: usize,
62+
ch: char,
63+
name: &'static str,
64+
in_allowed_value: bool,
65+
}
66+
67+
struct BiDiCollector<'a> {
68+
contents: &'a str,
69+
findings: &'a mut Vec<Finding>,
70+
key_path: Vec<String>,
71+
table_stack: Vec<Vec<String>>,
72+
}
73+
74+
impl<'a> BiDiCollector<'a> {
75+
fn new(contents: &'a str, findings: &'a mut Vec<Finding>) -> Self {
76+
Self {
77+
contents,
78+
findings,
79+
key_path: Vec::new(),
80+
table_stack: Vec::new(),
81+
}
82+
}
83+
84+
fn check_span_for_bidi(&mut self, span: Span, in_value: bool) {
85+
let text = &self.contents[span.start()..span.end()];
86+
for (offset, ch) in text.char_indices() {
87+
if let Some((_, name)) = UNICODE_BIDI_CODEPOINTS.iter().find(|(c, _)| *c == ch) {
88+
let in_allowed_value = in_value && is_allowed_bidi_path(&self.key_path);
89+
self.findings.push(Finding {
90+
byte_idx: span.start() + offset,
91+
ch,
92+
name,
93+
in_allowed_value,
94+
});
95+
}
96+
}
97+
}
98+
99+
fn current_path(&self) -> Vec<String> {
100+
let mut path = self.table_stack.last().cloned().unwrap_or_default();
101+
path.extend(self.key_path.clone());
102+
path
103+
}
104+
}
105+
106+
impl EventReceiver for BiDiCollector<'_> {
107+
fn std_table_open(&mut self, _span: Span, _error: &mut dyn toml_parser::ErrorSink) {
108+
self.table_stack.push(self.key_path.clone());
109+
self.key_path.clear();
110+
}
111+
112+
fn std_table_close(&mut self, _span: Span, _error: &mut dyn toml_parser::ErrorSink) {
113+
if let Some(last) = self.table_stack.last_mut() {
114+
*last = self.key_path.clone();
115+
}
116+
self.key_path.clear();
117+
}
118+
119+
fn array_table_open(&mut self, _span: Span, _error: &mut dyn toml_parser::ErrorSink) {
120+
self.table_stack.push(self.key_path.clone());
121+
self.key_path.clear();
122+
}
123+
124+
fn array_table_close(&mut self, _span: Span, _error: &mut dyn toml_parser::ErrorSink) {
125+
if let Some(last) = self.table_stack.last_mut() {
126+
*last = self.key_path.clone();
127+
}
128+
self.key_path.clear();
129+
}
130+
131+
fn simple_key(
132+
&mut self,
133+
span: Span,
134+
_kind: Option<toml_parser::decoder::Encoding>,
135+
_error: &mut dyn toml_parser::ErrorSink,
136+
) {
137+
self.check_span_for_bidi(span, false);
138+
139+
let key_text = &self.contents[span.start()..span.end()];
140+
let key = if (key_text.starts_with('"') && key_text.ends_with('"'))
141+
|| (key_text.starts_with('\'') && key_text.ends_with('\''))
142+
{
143+
key_text[1..key_text.len() - 1].to_string()
144+
} else {
145+
key_text.to_string()
146+
};
147+
self.key_path.push(key);
148+
}
149+
150+
fn key_sep(&mut self, _span: Span, _error: &mut dyn toml_parser::ErrorSink) {}
151+
152+
fn key_val_sep(&mut self, _span: Span, _error: &mut dyn toml_parser::ErrorSink) {}
153+
154+
fn scalar(
155+
&mut self,
156+
span: Span,
157+
_kind: Option<toml_parser::decoder::Encoding>,
158+
_error: &mut dyn toml_parser::ErrorSink,
159+
) {
160+
let full_path = self.current_path();
161+
let saved_key_path = std::mem::replace(&mut self.key_path, full_path);
162+
self.check_span_for_bidi(span, true);
163+
self.key_path = saved_key_path;
164+
self.key_path.clear();
165+
}
166+
167+
fn comment(&mut self, span: Span, _error: &mut dyn toml_parser::ErrorSink) {
168+
self.check_span_for_bidi(span, false);
169+
}
170+
}
171+
172+
pub fn text_direction_codepoint(
173+
manifest: ManifestFor<'_>,
174+
manifest_path: &Path,
175+
cargo_lints: &TomlToolLints,
176+
error_count: &mut usize,
177+
gctx: &GlobalContext,
178+
) -> CargoResult<()> {
179+
let (lint_level, reason) = manifest.lint_level(cargo_lints, LINT);
180+
181+
if lint_level == LintLevel::Allow {
182+
return Ok(());
183+
}
184+
185+
if matches!(&manifest, ManifestFor::Workspace(MaybePackage::Package(_))) {
186+
return Ok(());
187+
}
188+
189+
let contents = manifest.contents();
190+
191+
let has_bidi = contents.chars().any(|ch| {
192+
UNICODE_BIDI_CODEPOINTS
193+
.iter()
194+
.any(|(bidi_ch, _)| *bidi_ch == ch)
195+
});
196+
197+
if !has_bidi {
198+
return Ok(());
199+
}
200+
201+
let mut findings = Vec::new();
202+
{
203+
let source = Source::new(contents);
204+
let tokens = source.lex().into_vec();
205+
let mut collector = BiDiCollector::new(contents, &mut findings);
206+
let mut errors = Vec::new();
207+
toml_parser::parser::parse_document(&tokens, &mut collector, &mut errors);
208+
}
209+
210+
let disallowed_findings: Vec<_> = findings
211+
.into_iter()
212+
.filter(|f| !f.in_allowed_value)
213+
.collect();
214+
215+
if disallowed_findings.is_empty() {
216+
return Ok(());
217+
}
218+
219+
let manifest_path_str = rel_cwd_manifest_path(manifest_path, gctx);
220+
221+
let mut line_map = Vec::new();
222+
line_map.push(0);
223+
for (idx, ch) in contents.char_indices() {
224+
if ch == '\n' {
225+
line_map.push(idx + 1);
226+
}
227+
}
228+
229+
let mut findings_by_line: std::collections::BTreeMap<usize, Vec<_>> =
230+
std::collections::BTreeMap::new();
231+
for finding in disallowed_findings {
232+
let line_num = line_map
233+
.iter()
234+
.rposition(|&line_start| line_start <= finding.byte_idx)
235+
.map(|i| i + 1)
236+
.unwrap_or(1);
237+
findings_by_line
238+
.entry(line_num)
239+
.or_insert_with(Vec::new)
240+
.push(finding);
241+
}
242+
243+
let level = lint_level.to_diagnostic_level();
244+
let emitted_source = LINT.emitted_source(lint_level, reason);
245+
246+
for (line_idx, line_findings) in findings_by_line.values().enumerate() {
247+
if lint_level.is_error() {
248+
*error_count += 1;
249+
}
250+
251+
let title = LINT.desc.to_string();
252+
253+
let labels: Vec<String> = line_findings
254+
.iter()
255+
.map(|f| format!(r#"`\u{{{:04X}}}` ({})"#, f.ch as u32, f.name))
256+
.collect();
257+
258+
let mut snippet = Snippet::source(contents).path(&manifest_path_str);
259+
for (finding, label) in line_findings.iter().zip(labels.iter()) {
260+
let span = finding.byte_idx..(finding.byte_idx + finding.ch.len_utf8());
261+
snippet = snippet.annotation(AnnotationKind::Primary.span(span).label(label));
262+
}
263+
264+
let mut group = Group::with_title(level.clone().primary_title(&title)).element(snippet);
265+
266+
if line_idx == 0 {
267+
group = group.element(Level::NOTE.message(&emitted_source));
268+
group = group.element(Level::NOTE.message(
269+
"these kinds of unicode codepoints change the way text flows on screen, \
270+
but can cause confusion because they change the order of characters",
271+
));
272+
}
273+
group = group.element(Level::HELP.message(
274+
"if their presence wasn't intentional, you can remove them, \
275+
or use their escape sequence (e.g., \\u{202E}) in double-quoted strings",
276+
));
277+
278+
gctx.shell().print_report(&[group], lint_level.force())?;
279+
}
280+
281+
Ok(())
282+
}

0 commit comments

Comments
 (0)