Skip to content

Commit

Permalink
feat: telemetry improvements (#1393)
Browse files Browse the repository at this point in the history
  • Loading branch information
meskill authored Mar 15, 2024
1 parent c9849cd commit 1cb6fbb
Show file tree
Hide file tree
Showing 36 changed files with 270 additions and 145 deletions.
14 changes: 14 additions & 0 deletions .github/contributing.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,20 @@ Thank you for considering contributing to **Tailcall**! This document outlines t
cargo test
```

## Telemetry

Tailcall implements high observability standards by following the [OpenTelemetry](https://opentelemetry.io) specification. This implementation relies on the following crates:

- [rust-opentelemetry](https://docs.rs/opentelemetry/latest/opentelemetry/index.html) and related crates to implement support for collecting and exporting data
- [tracing](https://docs.rs/tracing/latest/tracing/index.html) and [tracing-opentelemetry](https://docs.rs/tracing-opentelemetry/latest/tracing_opentelemetry/index.html) to define logs and traces; thanks to the integration with opentelemetry, that data is automatically forwarded to the opentelemetry crates. Such a wrapper for telemetry allows using a well-established library like tracing that works well for different use cases and can serve as a simple logging system without involving opentelemetry when it's not required
When implementing any functionality that requires observability, consider the following points:
- Add traces for any significant amount of work that represents a single operation. This will make it easier to investigate problems and slowdowns later.
- For naming spans refer to the [opentelemetry specs](https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/api.md#span) and make the name as specific as possible but without involving high cardinality for possible values.
- Due to limitations of tracing libraries, span names can only be defined as static strings. This can be worked around by specifying an additional field with the special name `otel.name` (for details, refer to the `tracing-opentelemetry` docs).
- The naming of the attributes should follow the opentelemetry's [semantic convention](https://opentelemetry.io/docs/concepts/semantic-conventions/). Existing constants can be obtained with the [opentelemetry_semantic_conventions](https://docs.rs/opentelemetry-semantic-conventions/latest/opentelemetry_semantic_conventions/index.html) crate.

## Benchmarks Comparison

### Criterion Benchmarks
Expand Down
4 changes: 2 additions & 2 deletions autogen/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@ use schemars::schema::{RootSchema, Schema};
use schemars::Map;
use serde_json::{json, Value};
use tailcall::scalar::CUSTOM_SCALARS;
use tailcall::tracing::default_crate_tracing;
use tailcall::tracing::default_tracing_for_name;
use tailcall::{cli, config};

static JSON_SCHEMA_FILE: &str = "../generated/.tailcallrc.schema.json";

#[tokio::main]
async fn main() {
tracing::subscriber::set_global_default(default_crate_tracing("autogen")).unwrap();
tracing::subscriber::set_global_default(default_tracing_for_name("autogen")).unwrap();
let args: Vec<String> = env::args().collect();
let arg = args.get(1);

Expand Down
10 changes: 3 additions & 7 deletions aws-lambda/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use std::str::FromStr as _;
use std::sync::Arc;

use dotenvy::dotenv;
Expand All @@ -9,6 +8,7 @@ use tailcall::async_graphql_hyper::GraphQLRequest;
use tailcall::blueprint::Blueprint;
use tailcall::config::reader::ConfigReader;
use tailcall::http::{handle_request, AppContext};
use tailcall::tracing::get_log_level;

mod http;
mod runtime;
Expand All @@ -17,16 +17,12 @@ mod runtime;
async fn main() -> Result<(), Error> {
let _ = dotenv();

let trace: tracing::Level = std::env::var("TC_LOG_LEVEL")
.ok()
.or_else(|| std::env::var("TAILCALL_LOG_LEVEL").ok())
.as_ref()
.and_then(|x| tracing::Level::from_str(x).ok())
let level: tracing::Level = get_log_level()
// log everything by default since logs can be filtered by level in CloudWatch.
.unwrap_or(tracing::Level::TRACE);

tracing_subscriber::fmt()
.with_max_level(trace)
.with_max_level(level)
// disable printing the name of the module in every log line.
.with_target(false)
// disabling time is handy because CloudWatch will add the ingestion time.
Expand Down
3 changes: 2 additions & 1 deletion examples/telemetry-otlp.graphql
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@ schema
]
}
}
requestHeaders: ["user-id"]
) {
query: Query
}

type Query {
posts: [Post] @http(path: "/posts")
posts: [Post] @http(path: "/posts") @cache(maxAge: 3000)
user(id: Int!): User @http(path: "/users/{{args.id}}")
}

Expand Down
12 changes: 12 additions & 0 deletions generated/.tailcallrc.graphql
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,12 @@ of their applications.
"""
directive @telemetry(
export: TelemetryExporter
"""
The list of headers that will be sent as additional attributes to telemetry exporters
Be careful about **leaking sensitive information** from requests when enabling the
headers that may contain sensitive data
"""
requestHeaders: [String!]
) on FIELD_DEFINITION

"""
Expand Down Expand Up @@ -750,6 +756,12 @@ of their applications.
"""
input Telemetry {
export: TelemetryExporter
"""
The list of headers that will be sent as additional attributes to telemetry exporters
Be careful about **leaking sensitive information** from requests when enabling the
headers that may contain sensitive data
"""
requestHeaders: [String!]
}
input TelemetryExporter {
stdout: StdoutExporter
Expand Down
23 changes: 15 additions & 8 deletions generated/.tailcallrc.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,6 @@
"$ref": "#/definitions/Link"
}
},
"opentelemetry": {
"description": "Enable [opentelemetry](https://opentelemetry.io) support",
"allOf": [
{
"$ref": "#/definitions/Telemetry"
}
]
},
"schema": {
"description": "Specifies the entry points for query and mutation in the generated GraphQL schema.",
"allOf": [
Expand All @@ -38,6 +30,14 @@
}
]
},
"telemetry": {
"description": "Enable [opentelemetry](https://opentelemetry.io) support",
"allOf": [
{
"$ref": "#/definitions/Telemetry"
}
]
},
"types": {
"description": "A map of all the types in the schema.",
"default": {},
Expand Down Expand Up @@ -1722,6 +1722,13 @@
"type": "null"
}
]
},
"requestHeaders": {
"description": "The list of headers that will be sent as additional attributes to telemetry exporters Be careful about **leaking sensitive information** from requests when enabling the headers that may contain sensitive data",
"type": "array",
"items": {
"type": "string"
}
}
}
},
Expand Down
2 changes: 1 addition & 1 deletion src/blueprint/blueprint.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ pub struct Blueprint {
pub schema: SchemaDefinition,
pub server: Server,
pub upstream: Upstream,
pub opentelemetry: Telemetry,
pub telemetry: Telemetry,
}

#[derive(Clone, Debug)]
Expand Down
4 changes: 2 additions & 2 deletions src/blueprint/from_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ pub fn config_blueprint<'a>() -> TryFold<'a, ConfigModule, Blueprint, String> {
});

let opentelemetry = to_opentelemetry().transform::<Blueprint>(
|opentelemetry, blueprint| blueprint.opentelemetry(opentelemetry),
|blueprint| blueprint.opentelemetry,
|opentelemetry, blueprint| blueprint.telemetry(opentelemetry),
|blueprint| blueprint.telemetry,
);

server
Expand Down
4 changes: 2 additions & 2 deletions src/blueprint/into_schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ fn to_type(def: &Definition) -> dynamic::Type {
}
Some(expr) => {
let span = tracing::info_span!(
"field::resolver",
name = ctx.path_node.map(|p| p.to_string()).unwrap_or(field_name.clone()), graphql.returnType = %type_ref
"field_resolver",
otel.name = ctx.path_node.map(|p| p.to_string()).unwrap_or(field_name.clone()), graphql.returnType = %type_ref
);
let expr = expr.to_owned();
FieldFuture::new(
Expand Down
8 changes: 6 additions & 2 deletions src/blueprint/telemetry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ pub enum TelemetryExporter {
#[derive(Debug, Default, Clone)]
pub struct Telemetry {
pub export: Option<TelemetryExporter>,
pub request_headers: Vec<String>,
}

fn to_url(url: &str) -> Valid<Url, String> {
Expand All @@ -50,7 +51,7 @@ fn to_headers(headers: Vec<KeyValue>) -> Valid<HeaderMap, String> {

pub fn to_opentelemetry<'a>() -> TryFold<'a, ConfigModule, Telemetry, String> {
TryFoldConfig::<Telemetry>::new(|config, up| {
if let Some(export) = config.opentelemetry.export.as_ref() {
if let Some(export) = config.telemetry.export.as_ref() {
let export = match export {
config::TelemetryExporter::Stdout(config) => {
Valid::succeed(TelemetryExporter::Stdout(config.clone()))
Expand All @@ -67,7 +68,10 @@ pub fn to_opentelemetry<'a>() -> TryFold<'a, ConfigModule, Telemetry, String> {
};

export
.map(|export| Telemetry { export: Some(export) })
.map(|export| Telemetry {
export: Some(export),
request_headers: config.telemetry.request_headers.clone(),
})
.trace(config::Telemetry::trace_name().as_str())
} else {
Valid::succeed(up)
Expand Down
2 changes: 1 addition & 1 deletion src/cli/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ fn cache_metrics(runtime: &TargetRuntime) -> Result<()> {
let meter = opentelemetry::global::meter("cache");
let cache = runtime.cache.clone();
let counter = meter
.f64_observable_counter("hit_rate")
.f64_observable_gauge("cache.hit_rate")
.with_description("Cache hit rate ratio")
.init();

Expand Down
5 changes: 1 addition & 4 deletions src/cli/server/server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,7 @@ impl Server {
self.config_module.extensions.endpoints,
));

init_opentelemetry(
blueprint.opentelemetry.clone(),
&server_config.app_ctx.runtime,
)?;
init_opentelemetry(blueprint.telemetry.clone(), &server_config.app_ctx.runtime)?;

match blueprint.server.http.clone() {
Http::HTTP2 { cert, key } => {
Expand Down
2 changes: 1 addition & 1 deletion src/cli/server/server_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ impl ServerConfig {

let mut extensions = vec![];

if let Some(TelemetryExporter::Apollo(apollo)) = blueprint.opentelemetry.export.as_ref() {
if let Some(TelemetryExporter::Apollo(apollo)) = blueprint.telemetry.export.as_ref() {
let (graph_id, variant) = apollo.graph_ref.split_once('@').unwrap();
extensions.push(SchemaExtension::new(ApolloTracing::new(
apollo.api_key.clone(),
Expand Down
18 changes: 12 additions & 6 deletions src/cli/telemetry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ use super::metrics::init_metrics;
use crate::blueprint::telemetry::{OtlpExporter, Telemetry, TelemetryExporter};
use crate::cli::CLIError;
use crate::runtime::TargetRuntime;
use crate::tracing::{default_tailcall_tracing, tailcall_filter_target};
use crate::tracing::{default_tracing_tailcall, get_log_level, tailcall_filter_target};

static RESOURCE: Lazy<Resource> = Lazy::new(|| {
Resource::default().merge(&Resource::new(vec![
Expand Down Expand Up @@ -93,7 +93,10 @@ fn set_trace_provider(
TelemetryExporter::Apollo(_) => return Ok(None),
};
let tracer = provider.tracer("tracing");
let telemetry = tracing_opentelemetry::layer().with_tracer(tracer);
let telemetry = tracing_opentelemetry::layer()
.with_location(false)
.with_threads(false)
.with_tracer(tracer);

global::set_tracer_provider(provider);

Expand Down Expand Up @@ -202,7 +205,7 @@ pub fn init_opentelemetry(config: Telemetry, runtime: &TargetRuntime) -> anyhow:
| global::Error::Metric(MetricsError::Other(_))
| global::Error::Log(LogError::Other(_)),
) {
tracing::subscriber::with_default(default_tailcall_tracing(), || {
tracing::subscriber::with_default(default_tracing_tailcall(), || {
let cli = crate::cli::CLIError::new("Open Telemetry Error")
.caused_by(vec![CLIError::new(error.to_string().as_str())])
.trace(vec!["schema".to_string(), "@telemetry".to_string()]);
Expand All @@ -216,7 +219,7 @@ pub fn init_opentelemetry(config: Telemetry, runtime: &TargetRuntime) -> anyhow:
set_meter_provider(export)?;

let subscriber = tracing_subscriber::registry()
.with(trace_layer.with_filter(LevelFilter::INFO))
.with(trace_layer)
.with(
log_layer.with_filter(dynamic_filter_fn(|_metatada, context| {
// ignore logs that are generated inside tracing::Span since they will be logged
Expand All @@ -225,13 +228,16 @@ pub fn init_opentelemetry(config: Telemetry, runtime: &TargetRuntime) -> anyhow:
context.lookup_current().is_none()
})),
)
.with(tailcall_filter_target());
.with(tailcall_filter_target())
.with(LevelFilter::from_level(
get_log_level().unwrap_or(tracing::Level::INFO),
));

init_metrics(runtime)?;

set_tracing_subscriber(subscriber);
} else {
set_tracing_subscriber(default_tailcall_tracing());
set_tracing_subscriber(default_tracing_tailcall());
}

Ok(())
Expand Down
14 changes: 3 additions & 11 deletions src/config/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ pub struct Config {
pub links: Vec<Link>,
#[serde(default, skip_serializing_if = "is_default")]
/// Enable [opentelemetry](https://opentelemetry.io) support
pub opentelemetry: Telemetry,
pub telemetry: Telemetry,
}

impl Config {
Expand Down Expand Up @@ -177,17 +177,9 @@ impl Config {
let schema = self.schema.merge_right(other.schema.clone());
let upstream = self.upstream.merge_right(other.upstream.clone());
let links = merge_links(self.links, other.links.clone());
let opentelemetry = self.opentelemetry.merge_right(other.opentelemetry.clone());
let telemetry = self.telemetry.merge_right(other.telemetry.clone());

Self {
server,
upstream,
types,
schema,
unions,
links,
opentelemetry,
}
Self { server, upstream, types, schema, unions, links, telemetry }
}
}

Expand Down
8 changes: 4 additions & 4 deletions src/config/from_document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,16 +45,16 @@ pub fn from_document(doc: ServiceDocument) -> Valid<Config, String> {
.fuse(unions)
.fuse(schema)
.fuse(links(sd))
.fuse(opentelemetry(sd))
.fuse(telemetry(sd))
.map(
|(server, upstream, types, unions, schema, links, opentelemetry)| Config {
|(server, upstream, types, unions, schema, links, telemetry)| Config {
server,
upstream,
types,
unions,
schema,
links,
opentelemetry,
telemetry,
},
)
})
Expand Down Expand Up @@ -117,7 +117,7 @@ fn links(schema_definition: &SchemaDefinition) -> Valid<Vec<Link>, String> {
process_schema_multiple_directives(schema_definition, config::Link::directive_name().as_str())
}

fn opentelemetry(schema_definition: &SchemaDefinition) -> Valid<Telemetry, String> {
fn telemetry(schema_definition: &SchemaDefinition) -> Valid<Telemetry, String> {
process_schema_directives(
schema_definition,
config::telemetry::Telemetry::directive_name().as_str(),
Expand Down
4 changes: 2 additions & 2 deletions src/config/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ impl ConfigReader {

fn update_opentelemetry(&self, config_module: &mut ConfigModule) -> anyhow::Result<()> {
let server = &mut config_module.config.server;
let opentelemetry = &mut config_module.config.opentelemetry;
let telemetry = &mut config_module.config.telemetry;

let reader_ctx = ConfigReaderContext {
env: self.runtime.env.clone(),
Expand All @@ -307,7 +307,7 @@ impl ConfigReader {
.collect(),
};

opentelemetry.render_mustache(&reader_ctx)?;
telemetry.render_mustache(&reader_ctx)?;

Ok(())
}
Expand Down
Loading

1 comment on commit 1cb6fbb

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Running 30s test @ http://localhost:8000/graphql

4 threads and 100 connections

Thread Stats Avg Stdev Max +/- Stdev
Latency 6.87ms 3.09ms 86.88ms 71.74%
Req/Sec 3.68k 301.11 6.79k 88.44%

440504 requests in 30.10s, 2.21GB read

Requests/sec: 14635.69

Transfer/sec: 75.12MB

Please sign in to comment.