Skip to content

Commit c82e875

Browse files
authored
[crashtracker] Continue symbolicating after errors (#991)
1 parent b79a04f commit c82e875

8 files changed

Lines changed: 625 additions & 18 deletions

File tree

bin_tests/tests/crashtracker_bin_test.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,7 @@ fn assert_telemetry_message(crash_telemetry: &[u8], crash_typ: &str) {
329329

330330
let base_expected_tags: std::collections::HashSet<&str> =
331331
std::collections::HashSet::from_iter([
332-
"data_schema_version:1.2",
332+
"data_schema_version:1.3",
333333
"incomplete:false",
334334
"is_crash:true",
335335
"profiler_collecting_sample:1",

crashtracker/src/crash_info/error_data.rs

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,27 +19,47 @@ pub struct ErrorData {
1919
#[cfg(unix)]
2020
impl ErrorData {
2121
pub fn normalize_ips(&mut self, pid: u32) -> anyhow::Result<()> {
22+
let mut errors = 0;
2223
let normalizer = blazesym::normalize::Normalizer::new();
2324
let pid = pid.into();
24-
// TODO, should we continue after error or just exit?
25-
self.stack.normalize_ips(&normalizer, pid)?;
25+
self.stack
26+
.normalize_ips(&normalizer, pid)
27+
.unwrap_or_else(|_| errors += 1);
28+
2629
for thread in &mut self.threads {
27-
thread.stack.normalize_ips(&normalizer, pid)?;
30+
thread
31+
.stack
32+
.normalize_ips(&normalizer, pid)
33+
.unwrap_or_else(|_| errors += 1);
2834
}
35+
anyhow::ensure!(
36+
errors == 0,
37+
"Failed to normalize ips, see frame comments for details"
38+
);
2939
Ok(())
3040
}
3141

3242
pub fn resolve_names(&mut self, pid: u32) -> anyhow::Result<()> {
43+
let mut errors = 0;
3344
let mut process = blazesym::symbolize::Process::new(pid.into());
3445
// https://github.com/libbpf/blazesym/issues/518
3546
process.map_files = false;
3647
let src = blazesym::symbolize::Source::Process(process);
3748
let symbolizer = blazesym::symbolize::Symbolizer::new();
38-
self.stack.resolve_names(&src, &symbolizer)?;
49+
self.stack
50+
.resolve_names(&src, &symbolizer)
51+
.unwrap_or_else(|_| errors += 1);
3952

4053
for thread in &mut self.threads {
41-
thread.stack.resolve_names(&src, &symbolizer)?;
54+
thread
55+
.stack
56+
.resolve_names(&src, &symbolizer)
57+
.unwrap_or_else(|_| errors += 1);
4258
}
59+
anyhow::ensure!(
60+
errors == 0,
61+
"Failed to resolve names, see frame comments for details"
62+
);
4363
Ok(())
4464
}
4565
}

crashtracker/src/crash_info/mod.rs

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ pub struct CrashInfo {
6262

6363
impl CrashInfo {
6464
pub fn current_schema_version() -> String {
65-
"1.2".to_string()
65+
"1.3".to_string()
6666
}
6767
}
6868

@@ -123,13 +123,23 @@ impl CrashInfo {
123123

124124
#[cfg(test)]
125125
mod tests {
126+
use schemars::schema::RootSchema;
127+
use std::fs;
128+
126129
use super::*;
127-
#[ignore]
128130
#[test]
129-
/// Utility function to print the schema.
130-
fn print_schema() {
131+
fn test_schema_matches_rfc() {
132+
let rfc_schema_filename = concat!(
133+
env!("CARGO_MANIFEST_DIR"),
134+
"/../docs/RFCs/artifacts/0008-crashtracker-schema.json"
135+
);
136+
let rfc_schema_json = fs::read_to_string(rfc_schema_filename).expect("File to exist");
137+
let rfc_schema: RootSchema = serde_json::from_str(&rfc_schema_json).expect("Valid json");
131138
let schema = schemars::schema_for!(CrashInfo);
132-
println!("{}", serde_json::to_string_pretty(&schema).unwrap());
139+
140+
assert_eq!(rfc_schema, schema);
141+
// If it doesn't match, you can use this command to generate a new schema json
142+
// println!("{}", serde_json::to_string_pretty(&schema).unwrap());
133143
}
134144

135145
impl test_utils::TestInstance for CrashInfo {

crashtracker/src/crash_info/stacktrace.rs

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -74,18 +74,26 @@ impl StackTrace {
7474
#[cfg(unix)]
7575
impl StackTrace {
7676
pub fn normalize_ips(&mut self, normalizer: &Normalizer, pid: Pid) -> anyhow::Result<()> {
77+
let mut errors = 0;
7778
for frame in &mut self.frames {
78-
// TODO: Should this keep going on failure, and report at the end?
79-
frame.normalize_ip(normalizer, pid)?;
79+
frame.normalize_ip(normalizer, pid).unwrap_or_else(|e| {
80+
frame.comments.push(e.to_string());
81+
errors += 1;
82+
});
8083
}
84+
anyhow::ensure!(errors == 0);
8185
Ok(())
8286
}
8387

8488
pub fn resolve_names(&mut self, src: &Source, symbolizer: &Symbolizer) -> anyhow::Result<()> {
89+
let mut errors = 0;
8590
for frame in &mut self.frames {
86-
// TODO: Should this keep going on failure, and report at the end?
87-
frame.resolve_names(src, symbolizer)?;
91+
frame.resolve_names(src, symbolizer).unwrap_or_else(|e| {
92+
frame.comments.push(e.to_string());
93+
errors += 1;
94+
});
8895
}
96+
anyhow::ensure!(errors == 0);
8997
Ok(())
9098
}
9199
}
@@ -129,6 +137,10 @@ pub struct StackFrame {
129137
pub function: Option<String>,
130138
#[serde(default, skip_serializing_if = "Option::is_none")]
131139
pub line: Option<u32>,
140+
141+
// Additional Info
142+
#[serde(default, skip_serializing_if = "Vec::is_empty")]
143+
pub comments: Vec<String>,
132144
}
133145

134146
impl StackFrame {
@@ -242,6 +254,8 @@ impl super::test_utils::TestInstance for StackFrame {
242254
let file = Some(format!("banana{seed}.rs"));
243255
let function = Some(format!("Bar::baz{seed}"));
244256
let line = Some((2 * seed + 1) as u32);
257+
258+
let comments = vec![format!("This is a comment on frame {seed}")];
245259
Self {
246260
ip,
247261
module_base_address,
@@ -256,6 +270,7 @@ impl super::test_utils::TestInstance for StackFrame {
256270
file,
257271
function,
258272
line,
273+
comments,
259274
}
260275
}
261276
}

crashtracker/src/crash_info/telemetry.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@ mod tests {
278278
assert_eq!(
279279
HashSet::from_iter([
280280
"collecting_sample:1",
281-
"data_schema_version:1.2",
281+
"data_schema_version:1.3",
282282
"incomplete:true",
283283
"is_crash:true",
284284
"not_profiling:0",

crashtracker/src/receiver/entry_points.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,8 +133,12 @@ fn resolve_frames(
133133
.as_ref()
134134
.context("Unable to resolve frames: No PID specified")?
135135
.pid;
136-
crash_info.resolve_names(pid)?;
137-
crash_info.normalize_ips(pid)?;
136+
let rval1 = crash_info.resolve_names(pid);
137+
let rval2 = crash_info.normalize_ips(pid);
138+
anyhow::ensure!(
139+
rval1.is_ok() && rval2.is_ok(),
140+
"resolve_names: {rval1:?}\tnormalize_ips: {rval2:?}"
141+
);
138142
}
139143
Ok(())
140144
}
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# RFC 0008: Crashtracker Structured Log Format (Version 1.3). Adds stackframe comments.
2+
3+
The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in [IETF RFC 2119](https://datatracker.ietf.org/doc/html/rfc2119).
4+
5+
## Summary
6+
7+
This document describes version 1.3 of the crashinfo data format.
8+
9+
## Motivation
10+
11+
The `libdatadog` crashtracker detects program crashes.
12+
It automatically collects information relevant to the characterizing and debugging the crash, including stack-traces, the crash-type (e.g. SIGSIGV, SIGBUS, etc) crash, the library version, etc.
13+
In some cases, a crashtracker collector may have information related to stackframes that does not fit in the current schema.
14+
For example, if a stackframe failed to symbolicate, the crashtracker implementation may wish to record the reason for the failure to allow debugging.
15+
16+
## Proposed format
17+
18+
The format is an extension of the [1.0 json schema](0005-crashtracker-structured-log-format.md), with the following changes.
19+
The updated schema is given in Appendix A.
20+
Any field not listed as "Required" is optional.
21+
Consumers MUST accept json with elided optional fields.
22+
23+
### Fields
24+
25+
- `data_schema_version`: **[required]** \*\*[UPDATED]
26+
A string containing the semver ID of the crashtracker data schema ("1.3" for the current version).
27+
28+
### Stackframes
29+
30+
A stackframe consists of all the fields specified in [1.0 json schema](0005-crashtracker-structured-log-format.md), with the additional
31+
32+
- `comments`: **[optional]** **[NEW]**
33+
An array of Strings, containing comments about the given stackframe.
34+
35+
## Appendix A: Json Schema
36+
37+
[Available here](artifacts/0006-crashtracker-schema.json)

0 commit comments

Comments
 (0)