Skip to content

Commit

Permalink
feature: allow configuration for Go x/trace.FlightRecorder
Browse files Browse the repository at this point in the history
Signed-off-by: Sandor Szücs <[email protected]>
  • Loading branch information
szuecs committed Mar 19, 2024
1 parent 1efdf1e commit e9d5125
Show file tree
Hide file tree
Showing 5 changed files with 240 additions and 145 deletions.
103 changes: 59 additions & 44 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,50 +69,55 @@ type Config struct {
CompressEncodings *listFlag `yaml:"compress-encodings"`

// logging, metrics, profiling, tracing:
EnablePrometheusMetrics bool `yaml:"enable-prometheus-metrics"`
OpenTracing string `yaml:"opentracing"`
OpenTracingInitialSpan string `yaml:"opentracing-initial-span"`
OpenTracingExcludedProxyTags string `yaml:"opentracing-excluded-proxy-tags"`
OpenTracingDisableFilterSpans bool `yaml:"opentracing-disable-filter-spans"`
OpentracingLogFilterLifecycleEvents bool `yaml:"opentracing-log-filter-lifecycle-events"`
OpentracingLogStreamEvents bool `yaml:"opentracing-log-stream-events"`
OpentracingBackendNameTag bool `yaml:"opentracing-backend-name-tag"`
MetricsListener string `yaml:"metrics-listener"`
MetricsPrefix string `yaml:"metrics-prefix"`
EnableProfile bool `yaml:"enable-profile"`
BlockProfileRate int `yaml:"block-profile-rate"`
MutexProfileFraction int `yaml:"mutex-profile-fraction"`
MemProfileRate int `yaml:"memory-profile-rate"`
DebugGcMetrics bool `yaml:"debug-gc-metrics"`
RuntimeMetrics bool `yaml:"runtime-metrics"`
ServeRouteMetrics bool `yaml:"serve-route-metrics"`
ServeRouteCounter bool `yaml:"serve-route-counter"`
ServeHostMetrics bool `yaml:"serve-host-metrics"`
ServeHostCounter bool `yaml:"serve-host-counter"`
ServeMethodMetric bool `yaml:"serve-method-metric"`
ServeStatusCodeMetric bool `yaml:"serve-status-code-metric"`
BackendHostMetrics bool `yaml:"backend-host-metrics"`
AllFiltersMetrics bool `yaml:"all-filters-metrics"`
CombinedResponseMetrics bool `yaml:"combined-response-metrics"`
RouteResponseMetrics bool `yaml:"route-response-metrics"`
RouteBackendErrorCounters bool `yaml:"route-backend-error-counters"`
RouteStreamErrorCounters bool `yaml:"route-stream-error-counters"`
RouteBackendMetrics bool `yaml:"route-backend-metrics"`
RouteCreationMetrics bool `yaml:"route-creation-metrics"`
MetricsUseExpDecaySample bool `yaml:"metrics-exp-decay-sample"`
HistogramMetricBucketsString string `yaml:"histogram-metric-buckets"`
HistogramMetricBuckets []float64 `yaml:"-"`
DisableMetricsCompat bool `yaml:"disable-metrics-compat"`
ApplicationLog string `yaml:"application-log"`
ApplicationLogLevel log.Level `yaml:"-"`
ApplicationLogLevelString string `yaml:"application-log-level"`
ApplicationLogPrefix string `yaml:"application-log-prefix"`
ApplicationLogJSONEnabled bool `yaml:"application-log-json-enabled"`
AccessLog string `yaml:"access-log"`
AccessLogDisabled bool `yaml:"access-log-disabled"`
AccessLogJSONEnabled bool `yaml:"access-log-json-enabled"`
AccessLogStripQuery bool `yaml:"access-log-strip-query"`
SuppressRouteUpdateLogs bool `yaml:"suppress-route-update-logs"`
EnablePrometheusMetrics bool `yaml:"enable-prometheus-metrics"`
OpenTracing string `yaml:"opentracing"`
OpenTracingInitialSpan string `yaml:"opentracing-initial-span"`
OpenTracingExcludedProxyTags string `yaml:"opentracing-excluded-proxy-tags"`
OpenTracingDisableFilterSpans bool `yaml:"opentracing-disable-filter-spans"`
OpentracingLogFilterLifecycleEvents bool `yaml:"opentracing-log-filter-lifecycle-events"`
OpentracingLogStreamEvents bool `yaml:"opentracing-log-stream-events"`
OpentracingBackendNameTag bool `yaml:"opentracing-backend-name-tag"`
MetricsListener string `yaml:"metrics-listener"`
MetricsPrefix string `yaml:"metrics-prefix"`
EnableProfile bool `yaml:"enable-profile"`
BlockProfileRate int `yaml:"block-profile-rate"`
MutexProfileFraction int `yaml:"mutex-profile-fraction"`
MemProfileRate int `yaml:"memory-profile-rate"`
EnableFlightRecorder bool `yaml:"enable-flight-recorder"`
FlightRecorderSize int `yaml:"flight-recorder-size"`
FlightRecorderPeriod time.Duration `yaml:"flight-recorder-period"`
FlightRecorderProxyTookTooLong time.Duration `yaml:"flight-recorder-proxy-took-too-long"`
FlightRecorderTargetURL string `yaml:"flight-recorder-target-url"`
DebugGcMetrics bool `yaml:"debug-gc-metrics"`
RuntimeMetrics bool `yaml:"runtime-metrics"`
ServeRouteMetrics bool `yaml:"serve-route-metrics"`
ServeRouteCounter bool `yaml:"serve-route-counter"`
ServeHostMetrics bool `yaml:"serve-host-metrics"`
ServeHostCounter bool `yaml:"serve-host-counter"`
ServeMethodMetric bool `yaml:"serve-method-metric"`
ServeStatusCodeMetric bool `yaml:"serve-status-code-metric"`
BackendHostMetrics bool `yaml:"backend-host-metrics"`
AllFiltersMetrics bool `yaml:"all-filters-metrics"`
CombinedResponseMetrics bool `yaml:"combined-response-metrics"`
RouteResponseMetrics bool `yaml:"route-response-metrics"`
RouteBackendErrorCounters bool `yaml:"route-backend-error-counters"`
RouteStreamErrorCounters bool `yaml:"route-stream-error-counters"`
RouteBackendMetrics bool `yaml:"route-backend-metrics"`
RouteCreationMetrics bool `yaml:"route-creation-metrics"`
MetricsUseExpDecaySample bool `yaml:"metrics-exp-decay-sample"`
HistogramMetricBucketsString string `yaml:"histogram-metric-buckets"`
HistogramMetricBuckets []float64 `yaml:"-"`
DisableMetricsCompat bool `yaml:"disable-metrics-compat"`
ApplicationLog string `yaml:"application-log"`
ApplicationLogLevel log.Level `yaml:"-"`
ApplicationLogLevelString string `yaml:"application-log-level"`
ApplicationLogPrefix string `yaml:"application-log-prefix"`
ApplicationLogJSONEnabled bool `yaml:"application-log-json-enabled"`
AccessLog string `yaml:"access-log"`
AccessLogDisabled bool `yaml:"access-log-disabled"`
AccessLogJSONEnabled bool `yaml:"access-log-json-enabled"`
AccessLogStripQuery bool `yaml:"access-log-strip-query"`
SuppressRouteUpdateLogs bool `yaml:"suppress-route-update-logs"`

// route sources:
EtcdUrls string `yaml:"etcd-urls"`
Expand Down Expand Up @@ -378,6 +383,11 @@ func NewConfig() *Config {
flag.IntVar(&cfg.BlockProfileRate, "block-profile-rate", 0, "block profile sample rate, see runtime.SetBlockProfileRate")
flag.IntVar(&cfg.MutexProfileFraction, "mutex-profile-fraction", 0, "mutex profile fraction rate, see runtime.SetMutexProfileFraction")
flag.IntVar(&cfg.MemProfileRate, "memory-profile-rate", 0, "memory profile rate, see runtime.SetMemProfileRate, keeps default 512 kB")
flag.BoolVar(&cfg.EnableFlightRecorder, "enable-flight-recorder", false, "enable flightrecorder Go tracer")
flag.IntVar(&cfg.FlightRecorderSize, "flight-recorder-size", 0, "max flight-recorder trace data size")
flag.DurationVar(&cfg.FlightRecorderPeriod, "flight-recorder-period", 0, "sets the approximate time duration that the flight recorder's circular buffer represents.")
flag.DurationVar(&cfg.FlightRecorderProxyTookTooLong, "flight-recorder-proxy-took-too-long", 0, "sets the threshold, if proxy took longer than that the flight recorder will write out a trace.")
flag.StringVar(&cfg.FlightRecorderTargetURL, "flight-recorder-target-url", "", "sets the flight recorder target URL that is used to write out the trace to.")
flag.BoolVar(&cfg.DebugGcMetrics, "debug-gc-metrics", false, "enables reporting of the Go garbage collector statistics exported in debug.GCStats")
flag.BoolVar(&cfg.RuntimeMetrics, "runtime-metrics", true, "enables reporting of the Go runtime statistics exported in runtime and specifically runtime.MemStats")
flag.BoolVar(&cfg.ServeRouteMetrics, "serve-route-metrics", false, "enables reporting total serve time metrics for each route")
Expand Down Expand Up @@ -745,6 +755,11 @@ func (c *Config) ToOptions() skipper.Options {
EnableProfile: c.EnableProfile,
BlockProfileRate: c.BlockProfileRate,
MutexProfileFraction: c.MutexProfileFraction,
EnableFlightRecorder: c.EnableFlightRecorder,
FlightRecorderSize: c.FlightRecorderSize,
FlightRecorderPeriod: c.FlightRecorderPeriod,
FlightRecorderProxyTookTooLong: c.FlightRecorderProxyTookTooLong,
FlightRecorderTargetURL: c.FlightRecorderTargetURL,
EnableDebugGcMetrics: c.DebugGcMetrics,
EnableRuntimeMetrics: c.RuntimeMetrics,
EnableServeRouteMetrics: c.ServeRouteMetrics,
Expand Down
1 change: 1 addition & 0 deletions filters/builtin/builtin.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@ func Filters() []filters.Spec {
diag.NewNormalResponseLatency(),
diag.NewHistogramRequestLatency(),
diag.NewHistogramResponseLatency(),
diag.NewTrace(),

Check failure on line 192 in filters/builtin/builtin.go

View workflow job for this annotation

GitHub Actions / tests

undefined: diag.NewTrace

Check failure on line 192 in filters/builtin/builtin.go

View workflow job for this annotation

GitHub Actions / tests

undefined: diag.NewTrace

Check failure on line 192 in filters/builtin/builtin.go

View workflow job for this annotation

GitHub Actions / tests

undefined: diag.NewTrace

Check failure on line 192 in filters/builtin/builtin.go

View workflow job for this annotation

GitHub Actions / tests

undefined: diag.NewTrace

Check failure on line 192 in filters/builtin/builtin.go

View workflow job for this annotation

GitHub Actions / tests

undefined: diag.NewTrace

Check failure on line 192 in filters/builtin/builtin.go

View workflow job for this annotation

GitHub Actions / tests

undefined: diag.NewTrace

Check failure on line 192 in filters/builtin/builtin.go

View workflow job for this annotation

GitHub Actions / tests

undefined: diag.NewTrace

Check failure on line 192 in filters/builtin/builtin.go

View workflow job for this annotation

GitHub Actions / tests

undefined: diag.NewTrace

Check failure on line 192 in filters/builtin/builtin.go

View workflow job for this annotation

GitHub Actions / Analyze (go)

undefined: diag.NewTrace

Check failure on line 192 in filters/builtin/builtin.go

View workflow job for this annotation

GitHub Actions / check-race

undefined: diag.NewTrace
tee.NewTee(),
tee.NewTeeDeprecated(),
tee.NewTeeNoFollow(),
Expand Down
1 change: 1 addition & 0 deletions filters/filters.go
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,7 @@ const (
NormalResponseLatencyName = "normalResponseLatency"
HistogramRequestLatencyName = "histogramRequestLatency"
HistogramResponseLatencyName = "histogramResponseLatency"
TraceName = "trace"
LogBodyName = "logBody"
LogHeaderName = "logHeader"
TeeName = "tee"
Expand Down
192 changes: 116 additions & 76 deletions proxy/proxy.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ import (
"runtime"
"strconv"
"strings"
"sync"
"time"
"unicode/utf8"

Expand Down Expand Up @@ -318,6 +317,16 @@ type Params struct {

// PassiveHealthCheck defines the parameters for the healthy endpoints checker.
PassiveHealthCheck *PassiveHealthCheck

// FlightRecorder is a started instance of https://pkg.go.dev/golang.org/x/exp/trace#FlightRecorder
FlightRecorder *trace.FlightRecorder

// FlightRecorderTargetURL is the target URL the trace is written
// to. Supported URL schemes are "file", "http" and "https".
FlightRecorderTargetURL string

// FlightRecorderProxyTookTooLong is the threshold for writing out a trace:
// a trace is written when proxying a request took longer than this duration.
FlightRecorderProxyTookTooLong time.Duration
}

type (
Expand Down Expand Up @@ -387,34 +396,34 @@ type PriorityRoute interface {
// Proxy instances implement Skipper proxying functionality. For
// initializing, see the WithParams constructor and Params.
type Proxy struct {
	experimentalUpgrade      bool
	experimentalUpgradeAudit bool
	accessLogDisabled        bool
	maxLoops                 int
	defaultHTTPStatus        int
	routing                  *routing.Routing
	registry                 *routing.EndpointRegistry
	fadein                   *fadeIn
	heathlyEndpoints         *healthyEndpoints
	roundTripper             http.RoundTripper
	priorityRoutes           []PriorityRoute
	flags                    Flags
	metrics                  metrics.Metrics
	quit                     chan struct{}
	flushInterval            time.Duration
	breakers                 *circuit.Registry
	limiters                 *ratelimit.Registry
	log                      logging.Logger
	tracing                  *proxyTracing
	upgradeAuditLogOut       io.Writer
	upgradeAuditLogErr       io.Writer
	auditLogHook             chan struct{}
	clientTLS                *tls.Config
	hostname                 string
	onPanicSometimes         rate.Sometimes

	// flightRecorder is taken from Params.FlightRecorder; it is nil when
	// no recorder was passed or when the target URL failed to parse.
	flightRecorder *trace.FlightRecorder

	// flightRecorderURL is the parsed Params.FlightRecorderTargetURL and
	// selects where a recorded trace is written to.
	flightRecorderURL *url.URL

	// flightRecorderProxyTookTooLong is the default duration threshold
	// above which a trace is written out for a request.
	flightRecorderProxyTookTooLong time.Duration
}

// proxyError is used to wrap errors during proxying and to indicate
Expand Down Expand Up @@ -801,13 +810,15 @@ func WithParams(p Params) *Proxy {
endpointRegistry: p.EndpointRegistry,
}
}
// TODO(sszuecs): expose an option to start it
fr := trace.NewFlightRecorder()
//fr.SetPeriod(d)
//fr.SetSize(bytes int)
err := fr.Start()
if err != nil {
println("Failed to start FlightRecorder:", err.Error())

var frURL *url.URL
if p.FlightRecorder != nil {
var err error
frURL, err = url.Parse(p.FlightRecorderTargetURL)
if err != nil {
p.FlightRecorder.Stop()
p.FlightRecorder = nil
}
}

return &Proxy{
Expand All @@ -817,53 +828,82 @@ func WithParams(p Params) *Proxy {
rnd: rand.New(loadbalancer.NewLockedSource()),
endpointRegistry: p.EndpointRegistry,
},
heathlyEndpoints: healthyEndpointsChooser,
roundTripper: p.CustomHttpRoundTripperWrap(tr),
priorityRoutes: p.PriorityRoutes,
flags: p.Flags,
metrics: m,
quit: quit,
flushInterval: p.FlushInterval,
experimentalUpgrade: p.ExperimentalUpgrade,
experimentalUpgradeAudit: p.ExperimentalUpgradeAudit,
maxLoops: p.MaxLoopbacks,
breakers: p.CircuitBreakers,
limiters: p.RateLimiters,
log: &logging.DefaultLog{},
defaultHTTPStatus: defaultHTTPStatus,
tracing: newProxyTracing(p.OpenTracing),
accessLogDisabled: p.AccessLogDisabled,
upgradeAuditLogOut: os.Stdout,
upgradeAuditLogErr: os.Stderr,
clientTLS: tr.TLSClientConfig,
hostname: hostname,
onPanicSometimes: rate.Sometimes{First: 3, Interval: 1 * time.Minute},
flightRecorder: fr,
traceOnce: sync.Once{},
tooLong: 250 * time.Millisecond,
heathlyEndpoints: healthyEndpointsChooser,
roundTripper: p.CustomHttpRoundTripperWrap(tr),
priorityRoutes: p.PriorityRoutes,
flags: p.Flags,
metrics: m,
quit: quit,
flushInterval: p.FlushInterval,
experimentalUpgrade: p.ExperimentalUpgrade,
experimentalUpgradeAudit: p.ExperimentalUpgradeAudit,
maxLoops: p.MaxLoopbacks,
breakers: p.CircuitBreakers,
limiters: p.RateLimiters,
log: &logging.DefaultLog{},
defaultHTTPStatus: defaultHTTPStatus,
tracing: newProxyTracing(p.OpenTracing),
accessLogDisabled: p.AccessLogDisabled,
upgradeAuditLogOut: os.Stdout,
upgradeAuditLogErr: os.Stderr,
clientTLS: tr.TLSClientConfig,
hostname: hostname,
onPanicSometimes: rate.Sometimes{First: 3, Interval: 1 * time.Minute},
flightRecorder: p.FlightRecorder,
flightRecorderURL: frURL,
flightRecorderProxyTookTooLong: p.FlightRecorderProxyTookTooLong,
}
}

func (p *Proxy) writeTraceIfTooSlow(ctx *context) {
p.log.Infof("write trace if too slow: %s > %s", time.Since(ctx.startServe), p.tooLong)
if time.Since(ctx.startServe) > p.tooLong {
p.log.Info("too slow")
// Do it only once for simplicitly, but you can take more than one.
p.traceOnce.Do(func() {
p.log.Info("write trace because we were too slow")
// Grab the snapshot.
var b bytes.Buffer
_, err := p.flightRecorder.WriteTo(&b)
if err != nil {
p.log.Errorf("Failed to write flightrecorder data: %v", err)
if p.flightRecorder == nil || p.flightRecorderURL == nil {
return
}

d := p.flightRecorderProxyTookTooLong
if e, ok := ctx.StateBag()[filters.TraceName]; ok {
d = e.(time.Duration)
}
if d < 1*time.Microsecond {
return
}

p.log.Infof("write trace if too slow: %s > %s", time.Since(ctx.startServe), d)
if time.Since(ctx.startServe) > d {
var b bytes.Buffer
_, err := p.flightRecorder.WriteTo(&b)
if err != nil {
p.log.Errorf("Failed to write flightrecorder data: %v", err)
return
}

switch p.flightRecorderURL.Scheme {
case "file":
if err := os.WriteFile(p.flightRecorderURL.Path, b.Bytes(), 0o644); err != nil {
p.log.Errorf("Failed to write file trace.out: %v", err)
return
} else {
p.log.Infof("FlightRecorder wrote %d bytes to trace file %q", b.Len(), p.flightRecorderURL.Path)
}
// Write it to a file.
if err := os.WriteFile("trace.out", b.Bytes(), 0o755); err != nil {
p.log.Errorf("Failed to write trace.out: %v", err)
return
case "http", "https":
req, err := http.NewRequest("PUT", p.flightRecorderURL.String(), &b)
if err != nil {
p.log.Errorf("Failed to create request to %q to send a trace: %v", p.flightRecorderURL.String(), err)
}
})

rsp, err := http.DefaultClient.Do(req)
if err != nil {
p.log.Errorf("Failed to write trace to %q: %v", p.flightRecorderURL.String(), err)
}
switch rsp.StatusCode {
case 200, 201, 204:
p.log.Infof("Successful send of a trace to %q", p.flightRecorderURL.String())
default:
p.log.Errorf("Failed to get successful response from %s: (%d) %s", p.flightRecorderURL.String(), rsp.StatusCode, rsp.Status)
}
default:
p.log.Errorf("Failed to write trace, unknown FlightRecorderURL %q", p.flightRecorderURL.Scheme)
}
}
}

Expand Down
Loading

0 comments on commit e9d5125

Please sign in to comment.