Skip to content

Commit 7bf722d

Browse files
authored
feat: OpenTelemetry + X-Ray tracing (#103)
1 parent bcff38f commit 7bf722d

File tree

10 files changed

+237
-60
lines changed

10 files changed

+237
-60
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,3 +105,5 @@ cdk.out/
105105
node_modules
106106
cdk.context.json
107107
*.nc
108+
109+
.test-deploy-env

infrastructure/aws/cdk/app.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ def __init__(
4343
id: str,
4444
memory: int = 1024,
4545
timeout: int = 30,
46-
runtime: aws_lambda.Runtime = aws_lambda.Runtime.PYTHON_3_12,
4746
concurrent: Optional[int] = None,
4847
permissions: Optional[List[iam.PolicyStatement]] = None,
4948
environment: Optional[Dict] = None,
@@ -138,12 +137,19 @@ def __init__(
138137
**environment,
139138
"TITILER_MULTIDIM_ROOT_PATH": app_settings.root_path,
140139
"TITILER_MULTIDIM_CACHE_HOST": redis_cluster.attr_redis_endpoint_address,
140+
"OTEL_METRICS_EXPORTER": "none", # Disable metrics - only using traces
141+
"OTEL_PYTHON_DISABLED_INSTRUMENTATIONS": "aws-lambda,requests,urllib3,aiohttp-client", # Disable aws-lambda auto-instrumentation (handled by otel_wrapper.py)
142+
"OTEL_PROPAGATORS": "tracecontext,baggage,xray",
143+
"OPENTELEMETRY_COLLECTOR_CONFIG_URI": "/opt/collector-config/config.yaml",
144+
# AWS_LAMBDA_LOG_FORMAT not set - using custom JSON formatter in handler.py
145+
"AWS_LAMBDA_EXEC_WRAPPER": "/opt/otel-instrument", # Enable OTEL wrapper to avoid circular import
141146
},
142147
log_retention=logs.RetentionDays.ONE_WEEK,
143148
vpc=vpc,
144149
vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC),
145150
allow_public_subnet=True,
146151
role=veda_reader_role,
152+
tracing=aws_lambda.Tracing.ACTIVE,
147153
)
148154

149155
for perm in permissions:
@@ -207,6 +213,17 @@ def __init__(
207213
)
208214
)
209215

216+
# Add X-Ray permissions for tracing
217+
perms.append(
218+
iam.PolicyStatement(
219+
actions=[
220+
"xray:PutTraceSegments",
221+
"xray:PutTelemetryRecords",
222+
],
223+
resources=["*"],
224+
)
225+
)
226+
210227

211228
lambda_stack = LambdaStack(
212229
app,
Lines changed: 25 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,40 @@
11
ARG PYTHON_VERSION=3.12
22

3-
# Build stage - includes all build tools and dependencies
4-
FROM public.ecr.aws/lambda/python:${PYTHON_VERSION} AS builder
3+
# Stage 1: OTEL
# Download the AWS OpenTelemetry Python instrumentation layer and unpack it
# into /opt-builder so the final stage can copy it to /opt.
# ref: https://github.com/athewsey/opentelemetry-lambda-container/blob/98069d5eb6d812ccd28d5c80e2f9d6c8a8c76fb9/python-example/lambda-function/Dockerfile
FROM public.ecr.aws/lambda/python:${PYTHON_VERSION} AS otel-builder
RUN <<EOF
# A heredoc runs as one shell script; stop at the first failing command so a
# failed install or download is reported at the step that actually failed.
set -e
dnf install -y unzip wget
wget https://github.com/aws-observability/aws-otel-python-instrumentation/releases/download/v0.12.1/layer.zip -O /tmp/layer.zip
mkdir -p /opt-builder
unzip /tmp/layer.zip -d /opt-builder/
EOF
513

6-
# Copy uv for faster dependency management
14+
# Stage 2: titiler-multidim application and dependencies
15+
FROM public.ecr.aws/lambda/python:${PYTHON_VERSION} AS builder
716
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
817

9-
# Install system dependencies needed for compilation
1018
RUN dnf install -y gcc-c++ && dnf clean all
1119

12-
# Set working directory for build
1320
WORKDIR /build
1421

15-
# Copy dependency files first for better caching
1622
COPY README.md uv.lock .python-version pyproject.toml ./
1723
COPY src/titiler/ ./src/titiler/
1824

19-
# Install dependencies to temporary directory with Lambda-specific optimizations
20-
RUN uv export --locked --no-editable --no-dev --extra lambda --format requirements.txt -o requirements.txt && \
21-
uv pip install \
25+
RUN <<EOF
26+
uv export --locked --no-editable --no-dev --extra lambda --format requirements.txt -o requirements.txt
27+
uv pip install \
2228
--compile-bytecode \
2329
--no-binary pydantic \
2430
--target /deps \
2531
--no-cache-dir \
2632
--disable-pip-version-check \
2733
-r requirements.txt
34+
EOF
2835

2936
# Aggressive cleanup to minimize size and optimize for Lambda container
37+
# Clean up app dependencies in /deps
3038
WORKDIR /deps
3139
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
3240
RUN <<EOF
@@ -46,34 +54,27 @@ find . -name 'README*' -delete
4654
find . -name '*.md' -delete
4755
# Strip debug symbols from shared libraries (preserve numpy.libs)
4856
find . -type f -name '*.so*' -not -path "*/numpy.libs/*" -exec strip --strip-unneeded {} \; 2>/dev/null || true
49-
# Create a manifest file for debugging
50-
du -sh . > /tmp/package_size.txt
5157
EOF
5258

53-
# Final runtime stage - minimal Lambda image optimized for container runtime
59+
# Stage 3: Final runtime stage - minimal Lambda image optimized for container runtime
5460
FROM public.ecr.aws/lambda/python:${PYTHON_VERSION}
5561

56-
# Set Lambda-specific environment variables for optimal performance
57-
ENV PYTHONPATH=${LAMBDA_RUNTIME_DIR} \
58-
PYTHONUNBUFFERED=1 \
59-
PYTHONDONTWRITEBYTECODE=1 \
60-
AWS_LWA_ENABLE_COMPRESSION=true
61-
62-
# Copy only the cleaned dependencies from builder stage
63-
# Copy required system library
64-
COPY --from=builder /deps /usr/lib64/libexpat.so.1 ${LAMBDA_RUNTIME_DIR}/
62+
ENV PYTHONUNBUFFERED=1 \
63+
PYTHONDONTWRITEBYTECODE=1
6564

66-
# Copy application handler
65+
COPY --from=otel-builder /opt-builder/ /opt/
66+
COPY infrastructure/aws/lambda/collector-config.yaml /opt/collector-config/config.yaml
67+
COPY --from=builder /deps ${LAMBDA_RUNTIME_DIR}/
68+
COPY --from=builder /usr/lib64/libexpat.so.1 ${LAMBDA_RUNTIME_DIR}/
6769
COPY infrastructure/aws/lambda/handler.py ${LAMBDA_RUNTIME_DIR}/
6870

69-
# Ensure handler is executable and optimize permissions
7071
RUN <<EOF
# Abort the build on the first failing command; without this only the exit
# status of the last command in the heredoc decides whether the step passes,
# so a failed chmod or handler pre-compilation would go unnoticed.
set -e
chmod 644 "${LAMBDA_RUNTIME_DIR}"/handler.py
# The OTEL layer content under /opt must be readable/executable by the runtime user
chmod -R 755 /opt/
# Pre-compile the handler for faster cold starts
python -c "import py_compile; py_compile.compile('${LAMBDA_RUNTIME_DIR}/handler.py', doraise=True)"
# Create cache directories with proper permissions
mkdir -p /tmp/.cache && chmod 777 /tmp/.cache
EOF
7779

78-
# Set the Lambda handler
7980
CMD ["handler.lambda_handler"]
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
extensions:
2+
# AWS Proxy extension - local proxy listening on 127.0.0.1:2000 that signs incoming requests and forwards them to the X-Ray API
3+
# The awsxray exporter below sends its requests to this local endpoint instead of calling the X-Ray service endpoint directly
4+
awsproxy:
5+
endpoint: 127.0.0.1:2000
6+
7+
receivers:
8+
otlp:
9+
protocols:
10+
grpc:
11+
endpoint: localhost:4317
12+
http:
13+
endpoint: localhost:4318
14+
15+
processors:
16+
batch:
17+
timeout: 1s
18+
send_batch_size: 50
19+
20+
exporters:
21+
# Export traces to AWS X-Ray through the local endpoint at 127.0.0.1:2000 (served by the awsproxy extension)
22+
# The awsproxy extension relays these requests to the X-Ray API - NOTE(review): confirm the function has a network route to the X-Ray endpoint
23+
awsxray:
24+
endpoint: http://127.0.0.1:2000
25+
local_mode: true # Use local X-Ray daemon instead of direct API calls
26+
index_all_attributes: true
27+
28+
# Debug exporter to see traces in CloudWatch logs
29+
debug:
30+
verbosity: detailed
31+
32+
service:
33+
extensions: [awsproxy]
34+
pipelines:
35+
traces:
36+
receivers: [otlp]
37+
processors: [batch]
38+
exporters: [debug, awsxray]
Lines changed: 133 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,141 @@
1-
"""AWS Lambda handler optimized for container runtime."""
1+
"""AWS Lambda handler optimized for container runtime with OTEL instrumentation."""
22

3+
import json
34
import logging
5+
import os
46
import warnings
7+
from datetime import datetime, timezone
58
from typing import Any, Dict
69

710
from mangum import Mangum
811

912
from titiler.multidim.main import app
1013

14+
15+
def otel_trace_id_to_xray_format(otel_trace_id: str) -> str:
    """
    Render an OpenTelemetry trace ID in the X-Ray trace ID layout.

    OTEL format: 32 hex chars (e.g., "68eeb2ec45b07caf760899f308d34ab6")
    X-Ray format: "1-{first 8 chars}-{remaining 24 chars}"
    (e.g., "1-68eeb2ec-45b07caf760899f308d34ab6")

    Only the length is checked: any value that is not exactly 32 characters
    long is handed back unchanged.
    """
    # Guard clause: leave anything that is not a full-length ID untouched.
    if len(otel_trace_id) != 32:
        return otel_trace_id

    # X-Ray expects the leading 8 characters (the epoch part when an
    # X-Ray-compatible ID generator is in use) separated from the rest.
    epoch_part = otel_trace_id[:8]
    unique_part = otel_trace_id[8:]
    return "1-{}-{}".format(epoch_part, unique_part)
27+
28+
29+
class XRayJsonFormatter(logging.Formatter):
    """
    Custom JSON formatter that includes X-Ray trace ID for log correlation.

    This formatter outputs logs as JSON and includes:
    - Standard log fields (timestamp, level, message, logger)
    - X-Ray trace ID (from Lambda's environment, else converted from OTEL format)
    - OTEL trace context fields (if present on the record)
    - Any extra fields passed via logger.info("msg", extra={...})

    Extra values that are not JSON-serializable are rendered with ``str()``
    so that an unusual ``extra`` payload can never make a log call fail.
    """

    # Standard LogRecord attributes that shouldn't be duplicated in the output
    RESERVED_ATTRS = {
        "name",
        "msg",
        "args",
        "created",
        "filename",
        "funcName",
        "levelname",
        "levelno",
        "lineno",
        "module",
        "msecs",
        "message",
        "pathname",
        "process",
        "processName",
        "relativeCreated",
        "thread",
        "threadName",
        "exc_info",
        "exc_text",
        "stack_info",
        "taskName",
    }

    @staticmethod
    def _lambda_trace_root() -> str:
        """Return the ``Root=`` value of ``_X_AMZN_TRACE_ID``, or "" if absent."""
        trace_header = os.environ.get("_X_AMZN_TRACE_ID", "")
        for part in trace_header.split(";"):
            if part.startswith("Root="):
                return part.split("=", 1)[1]
        return ""

    def format(self, record: logging.LogRecord) -> str:  # noqa: C901
        """Format log record as a JSON string with X-Ray trace ID.

        Args:
            record: The log record to serialize.

        Returns:
            A JSON document containing the standard fields, trace context and
            any extra attributes attached to the record.
        """
        # Build base log object with standard fields
        log_object = {
            "timestamp": datetime.fromtimestamp(record.created, tz=timezone.utc)
            .isoformat()
            .replace("+00:00", "Z"),
            "level": record.levelname,
            "message": record.getMessage(),
            "logger": record.name,
        }

        # Method 1: trace ID from Lambda's X-Ray environment variable (preferred)
        xray_trace_id = self._lambda_trace_root()

        # Method 2: convert the OTEL trace ID if available (fallback)
        if not xray_trace_id and hasattr(record, "otelTraceID"):
            xray_trace_id = otel_trace_id_to_xray_format(record.otelTraceID)

        if xray_trace_id:
            log_object["xray_trace_id"] = xray_trace_id

        # Add exception info if present
        if record.exc_info:
            log_object["exception"] = self.formatException(record.exc_info)

        # Add OTEL fields if present
        for attr in (
            "otelSpanID",
            "otelTraceID",
            "otelTraceSampled",
            "otelServiceName",
        ):
            if hasattr(record, attr):
                log_object[attr] = getattr(record, attr)

        # Add AWS request ID if available
        if hasattr(record, "aws_request_id"):
            log_object["requestId"] = record.aws_request_id

        # Add any extra fields from record.__dict__ that aren't standard
        for key, value in record.__dict__.items():
            if key not in self.RESERVED_ATTRS and key not in log_object:
                log_object[key] = value

        # default=str: extras may hold arbitrary objects (datetimes, exceptions,
        # sets, ...); stringify them instead of raising TypeError and losing
        # the log line.
        return json.dumps(log_object, default=str)
120+
121+
122+
# Configure root logger with custom JSON formatter that includes X-Ray trace ID
root_logger = logging.getLogger()
root_logger.setLevel(logging.WARNING)

# Remove any existing handlers so every record is emitted exactly once, as JSON.
# Iterate over a copy because removeHandler() mutates the handler list.
for log_handler in root_logger.handlers[:]:
    root_logger.removeHandler(log_handler)

# Add StreamHandler with our custom JSON formatter
json_handler = logging.StreamHandler()
json_handler.setFormatter(XRayJsonFormatter())
root_logger.addHandler(json_handler)

# Set titiler loggers to INFO level
logging.getLogger("titiler").setLevel(logging.INFO)

# Keep specific loggers at ERROR/WARNING levels
logging.getLogger("mangum.lifespan").setLevel(logging.ERROR)
logging.getLogger("mangum.http").setLevel(logging.ERROR)
logging.getLogger("botocore").setLevel(logging.WARNING)
@@ -16,15 +144,8 @@
16144
warnings.filterwarnings("ignore", category=UserWarning)
17145
warnings.filterwarnings("ignore", category=FutureWarning)
18146

19-
20-
# Pre-import commonly used modules for faster cold starts
21-
try:
22-
import numpy # noqa: F401
23-
import pandas # noqa: F401
24-
import rioxarray # noqa: F401
25-
import xarray # noqa: F401
26-
except ImportError:
27-
pass
147+
# LoggingInstrumentor().instrument(set_logging_format=False)
148+
# FastAPIInstrumentor.instrument_app(app)
28149

29150
handler = Mangum(
30151
app,
@@ -40,10 +161,5 @@
40161

41162

42163
def lambda_handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
    """Lambda handler with container-specific optimizations and OTEL tracing.

    Passes the incoming event and context straight to the module-level
    Mangum ``handler`` and returns whatever it produces.
    """
    response = handler(event, context)
    return response

infrastructure/aws/lambda/requirements-lambda.txt

Lines changed: 0 additions & 15 deletions
This file was deleted.

package-lock.json

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ server = [
5151
]
5252
lambda = [
5353
"mangum==0.19.0",
54-
"aiobotocore>=2.24.0,<2.24.2"
54+
"aiobotocore>=2.24.0,<2.24.2",
5555
]
5656

5757
[dependency-groups]

0 commit comments

Comments
 (0)