
Open telemetry impact the performance in case of spans and logging creations are happening concurrently #2538

Open
gopinathr143 opened this issue Nov 19, 2024 · 4 comments
Assignees
Labels
bug Something isn't working priority:p3 Bugs which cause problems in user apps not related to correctness (performance, memory use, etc)

Comments

gopinathr143 commented Nov 19, 2024

What version of OpenTelemetry are you using?

"@opentelemetry/api": "^1.9.0",
"@opentelemetry/exporter-trace-otlp-grpc": "^0.55.0",
"@opentelemetry/instrumentation": "^0.55.0",
"@opentelemetry/instrumentation-express": "^0.44.0",
"@opentelemetry/instrumentation-http": "^0.55.0",
"@opentelemetry/instrumentation-winston": "^0.41.0",
"@opentelemetry/propagator-b3": "^1.28.0",
"@opentelemetry/resources": "^1.28.0",
"@opentelemetry/sdk-node": "^0.55.0",
"@opentelemetry/sdk-trace-base": "^1.28.0",
"@opentelemetry/sdk-trace-node": "^1.28.0",
"@opentelemetry/semantic-conventions": "^1.27.0",
"@opentelemetry/winston-transport": "^0.7.0",
"express": "^4.21.1",
"winston": "^3.17.0"

What version of Node are you using?

Node.js v20.17.0

What did you do?

Please find below the code for a Node service with OpenTelemetry.

index.js

const { initTracing } = require("./tracing");
const config = {
    service_name: "SIMPLE-NODE-API",
    tracing: {
        enableDebug: false,
        url: "http://localhost:4317",
        urls_to_be_skipped: ["/ignore"],
        concurrencyLimit: 50,
        timeoutMillis: 3000
    },
    batchSpanProcessorConfig: {
        maxQueueSize: 2048,
        maxExportBatchSize: 256,
        scheduledDelayMillis: 200,
        exportTimeoutMillis: 6000
    }
}

initTracing(config);
const express = require('express');
const {logger} = require("./logger");
const app = express();
const port = 3002;


app.use(express.json());

const asyncOperation = (id, delay) => {
    return new Promise((resolve) => {
        logger.info(`Operation${id} started`);
        setTimeout(() => {
            logger.info(`Operation${id} completed`);
            resolve(`Result from operation ${id}`);
        }, delay);
    });
};

app.get('/api/simple-express/v1/healthz', async (req, res) => {
   try{
       logger.info(`healthz api started`);
       const syncResults = 'Synchronous operation result';

       const asyncTasks = [
           asyncOperation(1, 100),
           asyncOperation(2, 200),
           asyncOperation(3, 100),
       ]

       const asyncResults = await Promise.all(asyncTasks);

       const finalResults ={
           syncResults,
           asyncResults
       };
       logger.info(`healthz api completed`);
       res.status(200).json(finalResults);
   } catch (err) {
       res.status(500).json({error: 'API Failed'});
   }
});

app.listen(port, () => {
    logger.info(`Simple node API running at port ${port}`);
});

logger/index.js

const {
    OpenTelemetryTransportV3,
} = require("@opentelemetry/winston-transport");
const winston = require("winston");

module.exports = {
    logger: winston.createLogger({
        level: "info",
        format: winston.format.combine(
            winston.format.timestamp(),
            winston.format.json(),
        ),
        transports: [
            new OpenTelemetryTransportV3(),
            new winston.transports.Console(),
        ],
    }),
};

tracing/index.js

const { BatchSpanProcessor, AlwaysOnSampler } = require("@opentelemetry/sdk-trace-base");
const { NodeTracerProvider } = require("@opentelemetry/sdk-trace-node");
const { Resource } = require("@opentelemetry/resources");
const { ATTR_SERVICE_NAME } = require("@opentelemetry/semantic-conventions");
const { diag, DiagConsoleLogger, DiagLogLevel } = require("@opentelemetry/api");
const { registerInstrumentations } = require("@opentelemetry/instrumentation");
const { OTLPTraceExporter } = require("@opentelemetry/exporter-trace-otlp-grpc");
const { B3Propagator, B3InjectEncoding } = require("@opentelemetry/propagator-b3");
const { HttpInstrumentation } = require("@opentelemetry/instrumentation-http");
const { WinstonInstrumentation } = require("@opentelemetry/instrumentation-winston");
const { ExpressInstrumentation } = require('@opentelemetry/instrumentation-express');

module.exports = {
  initTracing: (config) => {
    if (config?.tracing?.enableDebug) {
      diag.setLogger(new DiagConsoleLogger(), DiagLogLevel.ALL);
    }
    const exporter = new OTLPTraceExporter(config.tracing);

    const provider = new NodeTracerProvider({
      sampler: new AlwaysOnSampler(),
      resource: new Resource({
        [ATTR_SERVICE_NAME]: config.service_name
      })
    });
    provider.addSpanProcessor(new BatchSpanProcessor(exporter, config?.batchSpanProcessorConfig));
    provider.register({
      propagator: new B3Propagator({
        injectEncoding: B3InjectEncoding.MULTI_HEADER
      })
    });

    registerInstrumentations({
      instrumentations: [
        new ExpressInstrumentation(),
        new HttpInstrumentation({
          ignoreIncomingRequestHook: function (request) {
            if (config.tracing.urls_to_be_skipped.includes(request.url)) {
              return true;
            }
          },
          applyCustomAttributesOnSpan: function (span, request, response) {
            span.updateName(request.url);
          },
          ignoreOutgoingRequestHook: () => true
        }),
        new WinstonInstrumentation({
          enabled: true,
          logHook: (span, record) => {
            span.addEvent("log", record);
          }
        })
      ],
      tracerProvider: provider
    });

    return provider;
  }
};

package.json

{
  "name": "simple-node-api",
  "version": "1.0.0",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "description": "",
  "dependencies": {
    "@opentelemetry/api": "^1.9.0",
    "@opentelemetry/exporter-trace-otlp-grpc": "^0.55.0",
    "@opentelemetry/instrumentation": "^0.55.0",
    "@opentelemetry/instrumentation-express": "^0.44.0",
    "@opentelemetry/instrumentation-http": "^0.55.0",
    "@opentelemetry/instrumentation-winston": "^0.41.0",
    "@opentelemetry/propagator-b3": "^1.28.0",
    "@opentelemetry/resources": "^1.28.0",
    "@opentelemetry/sdk-node": "^0.55.0",
    "@opentelemetry/sdk-trace-base": "^1.28.0",
    "@opentelemetry/sdk-trace-node": "^1.28.0",
    "@opentelemetry/semantic-conventions": "^1.27.0",
    "@opentelemetry/winston-transport": "^0.7.0",
    "express": "^4.21.1",
    "winston": "^3.17.0"
  }
}

The code above has a significant performance impact. Please find the Apache Benchmark report below:

Server Hostname:        127.0.0.1
Server Port:            3002

Document Path:          /api/simple-express/v1/healthz
Document Length:        141 bytes

Concurrency Level:      100
Time taken for tests:   25.213 seconds
Complete requests:      10000
Failed requests:        0
Total transferred:      3490000 bytes
HTML transferred:       1410000 bytes
Requests per second:    396.62 [#/sec] (mean)
Time per request:       252.128 [ms] (mean)
Time per request:       2.521 [ms] (mean, across all concurrent requests)
Transfer rate:          135.18 [Kbytes/sec] received

Connection Times (ms)
              min  mean[+/-sd] median   max
Connect:        1   18  16.0     14      89
Processing:   202  230  59.5    216     707
Waiting:      202  227  53.0    215     705
Total:        203  248  64.9    230     755

Percentage of the requests served within a certain time (ms)
  50%    230
  66%    240
  75%    249
  80%    256
  90%    285
  95%    325
  98%    558
  99%    664
 100%    755 (longest request)

Apache Benchmark report without OpenTelemetry:

Server Hostname:        127.0.0.1
Server Port:            3001

Document Path:          /api/simple-express/v1/healthz
Document Length:        141 bytes

Concurrency Level:      100
Time taken for tests:   23.161 seconds
Complete requests:      10000
Failed requests:        0
Total transferred:      3490000 bytes
HTML transferred:       1410000 bytes
Requests per second:    431.75 [#/sec] (mean)
Time per request:       231.614 [ms] (mean)
Time per request:       2.316 [ms] (mean, across all concurrent requests)
Transfer rate:          147.15 [Kbytes/sec] received

Connection Times (ms)
              min  mean[+/-sd] median   max
Connect:        1   13  12.5     10     100
Processing:   203  215   9.1    213     285
Waiting:      202  213   8.4    211     283
Total:        204  228  20.1    223     350

Percentage of the requests served within a certain time (ms)
  50%    223
  66%    227
  75%    230
  80%    232
  90%    240
  95%    254
  98%    317
  99%    338
 100%    350 (longest request)

What did you expect to see?

OpenTelemetry should not significantly impact performance when spans and logs are created concurrently.

What did you see instead?

OpenTelemetry noticeably degrades performance when logs and spans are created concurrently: throughput drops from ~432 to ~397 requests per second and the 98th/99th percentile latencies roughly double (see the benchmark reports above).

Additional context

Code without OpenTelemetry (baseline)

index.js

const express = require('express');
const app = express();
const port = 3001

app.use(express.json());

const asyncOperation = (id, delay) => {
    return new Promise((resolve) => {
        setTimeout(() => {
            resolve(`Result from operation ${id}`);
        }, delay);
    });
};

app.get('/api/simple-express/v1/healthz', async (req, res) => {
   try{
       const syncResults = 'Synchronous operation result';

       const asyncTasks = [
           asyncOperation(1, 100),
           asyncOperation(2, 200),
           asyncOperation(3, 100),
       ]

       const asyncResults = await Promise.all(asyncTasks);

       const finalResults ={
           syncResults,
           asyncResults
       };

       res.status(200).json(finalResults);
   } catch (err) {
       res.status(500).json({error: 'API Failed'});
   }
});

app.listen(port, () => {
    console.log(`Simple node API running at port ${port}`);
});

package.json

{
  "name": "simple-node-api",
  "version": "1.0.0",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "description": "",
  "dependencies": {
    "express": "^4.21.1"
  }
}

@gopinathr143 gopinathr143 added the bug Something isn't working label Nov 19, 2024
@gopinathr143 gopinathr143 changed the title Open telemetry impact the performance in case of spans and loggings are happening concurrently Open telemetry impact the performance in case of spans and logging creations are happening concurrently Nov 19, 2024
@dyladan dyladan self-assigned this Nov 20, 2024
@dyladan dyladan added the triage label Nov 20, 2024
gopinathr143 (Author) commented:

Hi Team,

Any updates on this issue? It would help us a lot if it is resolved.

dyladan (Member) commented Dec 4, 2024

Sorry, I meant to reply to this when I self-assigned it. It looks like what is mostly affected is the worst-case latency.

[screenshot: latency comparison]

It seems most likely to me that there is some sort of contention in the export pipeline, garbage collection, or some other component. In the issue you mention this only happens when you are exporting logs and traces concurrently? Have you confirmed that this does not happen if you are only exporting traces or only exporting logs? I see in your examples you only have the trace gRPC exporter.
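
For what it's worth, one low-effort way to narrow this down with the setup above could be to run the benchmark once with the Winston bridge disabled (traces only) and once with it enabled. A rough sketch against tracing/index.js follows; the OTEL_DISABLE_WINSTON environment variable is a hypothetical toggle, and the HTTP hooks from the original config are omitted for brevity:

const instrumentations = [
  new ExpressInstrumentation(),
  new HttpInstrumentation(),
];
// Traces-only run: OTEL_DISABLE_WINSTON=1 node index.js
if (!process.env.OTEL_DISABLE_WINSTON) {
  instrumentations.push(new WinstonInstrumentation({
    // Attaches every log record to the active span as an event.
    logHook: (span, record) => span.addEvent("log", record),
  }));
}
registerInstrumentations({ instrumentations, tracerProvider: provider });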

gopinathr143 (Author) commented:

I am exporting traces along with logs. I am adding logs to the span using addEvent in the logHook of the Winston instrumentation.

I tried using the OTLP log exporter first, but I was unable to push the logs to the collector, so I used logHook to add logs to the span as events.
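
For reference, the Winston transport only emits records to whatever global LoggerProvider is registered; if none is set up, nothing reaches the collector, which could be why the logs never arrived. A minimal logs pipeline would look roughly like this (sketch only; it assumes the @opentelemetry/api-logs, @opentelemetry/sdk-logs and @opentelemetry/exporter-logs-otlp-grpc packages, which are not in the dependency list above):

const { logs } = require("@opentelemetry/api-logs");
const { LoggerProvider, BatchLogRecordProcessor } = require("@opentelemetry/sdk-logs");
const { OTLPLogExporter } = require("@opentelemetry/exporter-logs-otlp-grpc");

// Register a global LoggerProvider so OpenTelemetryTransportV3 has a pipeline to write into.
const loggerProvider = new LoggerProvider();
loggerProvider.addLogRecordProcessor(
  new BatchLogRecordProcessor(new OTLPLogExporter({ url: "http://localhost:4317" }))
);
logs.setGlobalLoggerProvider(loggerProvider);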

@pichlermarc pichlermarc added priority:p3 Bugs which cause problems in user apps not related to correctness (performance, memory use, etc) and removed triage labels Dec 11, 2024
blumamir (Member) commented:

Hi @gopinathr143,

This isn’t a direct solution to the issue you raised, but I wanted to share some insights from the performance testing and research we conducted at Odigos.

Instrumentation inherently introduces runtime overhead. This is because it requires creating memory allocations for telemetry data, batching this data in memory for export, encoding it, and transmitting it over the network. While it's impossible to eliminate this overhead entirely, for most requests the performance impact tends to be minimal.

However, we observed that this overhead can significantly impact tail latencies (e.g., the 99.9th percentile). This is primarily due to garbage collection and the CPU cycles consumed by the telemetry exporter when transmitting data outside the process boundary. In our tests (conducted with a Go service), the overall request-latency impact was relatively low but could increase by up to 4x for tail requests when comparing no-instrumentation to OpenTelemetry-instrumented applications.

For Node.js, the impact might be even greater due to its single-threaded event loop. When OpenTelemetry performs tasks like OTLP encoding and exporting telemetry data, it can delay your application’s business logic, further increasing latency.
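
One way to observe this effect directly in the benchmarked service is to watch event-loop delay while the load test runs; stalls caused by OTLP encoding, export work, and GC show up as spikes. A rough sketch using Node's built-in perf_hooks (not part of the original repro):

const { monitorEventLoopDelay } = require("perf_hooks");

// Histogram of how long the event loop was blocked, sampled every 20 ms.
const loopDelay = monitorEventLoopDelay({ resolution: 20 });
loopDelay.enable();

setInterval(() => {
  // Histogram values are in nanoseconds; convert to milliseconds for readability.
  console.log(
    `event loop delay p50=${(loopDelay.percentile(50) / 1e6).toFixed(1)}ms ` +
    `p99=${(loopDelay.percentile(99) / 1e6).toFixed(1)}ms ` +
    `max=${(loopDelay.max / 1e6).toFixed(1)}ms`
  );
  loopDelay.reset();
}, 5000).unref();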

You can read more about our findings and performance measurements here
