Skip to content

Commit 310ff93

Browse files
fs-eireCopilot
andauthored
[node] Fix logging mutex crash at exit on macOS (#26445)
### Description This change fixes a bug that causes crash on macOS (and also potentially other platforms using libc) at `OrtReleaseEnv`. Instead of using static variables, now they are function local static so that compiler can handle the destruction order correctly. ### Motivation and Context Fixes #24579 --------- Co-authored-by: Copilot <[email protected]>
1 parent ec9b8fe commit 310ff93

File tree

12 files changed

+240
-55
lines changed

12 files changed

+240
-55
lines changed

js/node/src/inference_session_wrap.cc

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include "common.h"
77
#include "inference_session_wrap.h"
88
#include "ort_instance_data.h"
9+
#include "ort_singleton_data.h"
910
#include "run_options_helper.h"
1011
#include "session_options_helper.h"
1112
#include "tensor_helper.h"
@@ -76,7 +77,7 @@ Napi::Value InferenceSessionWrap::LoadModel(const Napi::CallbackInfo& info) {
7677
Napi::String value = info[0].As<Napi::String>();
7778

7879
ParseSessionOptions(info[1].As<Napi::Object>(), sessionOptions);
79-
this->session_.reset(new Ort::Session(*OrtInstanceData::OrtEnv(),
80+
this->session_.reset(new Ort::Session(OrtSingletonData::Env(),
8081
#ifdef _WIN32
8182
reinterpret_cast<const wchar_t*>(value.Utf16Value().c_str()),
8283
#else
@@ -91,7 +92,7 @@ Napi::Value InferenceSessionWrap::LoadModel(const Napi::CallbackInfo& info) {
9192
int64_t bytesLength = info[2].As<Napi::Number>().Int64Value();
9293

9394
ParseSessionOptions(info[3].As<Napi::Object>(), sessionOptions);
94-
this->session_.reset(new Ort::Session(*OrtInstanceData::OrtEnv(),
95+
this->session_.reset(new Ort::Session(OrtSingletonData::Env(),
9596
reinterpret_cast<char*>(buffer) + bytesOffset, bytesLength,
9697
sessionOptions));
9798
} else {
@@ -211,7 +212,7 @@ Napi::Value InferenceSessionWrap::Run(const Napi::CallbackInfo& info) {
211212
ParseRunOptions(info[2].As<Napi::Object>(), runOptions);
212213
}
213214
if (preferredOutputLocations_.size() == 0) {
214-
session_->Run(runOptions == nullptr ? *OrtInstanceData::OrtDefaultRunOptions() : runOptions,
215+
session_->Run(runOptions == nullptr ? OrtSingletonData::DefaultRunOptions() : runOptions,
215216
inputIndex == 0 ? nullptr : &inputNames_cstr[0], inputIndex == 0 ? nullptr : &inputValues[0],
216217
inputIndex, outputIndex == 0 ? nullptr : &outputNames_cstr[0],
217218
outputIndex == 0 ? nullptr : &outputValues[0], outputIndex);
@@ -240,7 +241,7 @@ Napi::Value InferenceSessionWrap::Run(const Napi::CallbackInfo& info) {
240241
}
241242
}
242243

243-
session_->Run(runOptions == nullptr ? *OrtInstanceData::OrtDefaultRunOptions() : runOptions, *ioBinding_);
244+
session_->Run(runOptions == nullptr ? OrtSingletonData::DefaultRunOptions() : runOptions, *ioBinding_);
244245

245246
auto outputs = ioBinding_->GetOutputValues();
246247
ORT_NAPI_THROW_ERROR_IF(outputs.size() != outputIndex, env, "Output count mismatch.");

js/node/src/ort_instance_data.cc

Lines changed: 3 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -6,27 +6,10 @@
66

77
#include "common.h"
88
#include "ort_instance_data.h"
9+
#include "ort_singleton_data.h"
910
#include "onnxruntime_cxx_api.h"
1011

11-
std::unique_ptr<Ort::Env> OrtInstanceData::ortEnv;
12-
std::unique_ptr<Ort::RunOptions> OrtInstanceData::ortDefaultRunOptions;
13-
std::mutex OrtInstanceData::ortEnvMutex;
14-
std::atomic<uint64_t> OrtInstanceData::ortEnvRefCount;
15-
std::atomic<bool> OrtInstanceData::ortEnvDestroyed;
16-
1712
OrtInstanceData::OrtInstanceData() {
18-
++ortEnvRefCount;
19-
}
20-
21-
OrtInstanceData::~OrtInstanceData() {
22-
if (--ortEnvRefCount == 0) {
23-
std::lock_guard<std::mutex> lock(ortEnvMutex);
24-
if (ortEnv) {
25-
ortDefaultRunOptions.reset(nullptr);
26-
ortEnv.reset();
27-
ortEnvDestroyed = true;
28-
}
29-
}
3013
}
3114

3215
void OrtInstanceData::Create(Napi::Env env, Napi::Function inferenceSessionWrapperFunction) {
@@ -42,14 +25,8 @@ void OrtInstanceData::InitOrt(Napi::Env env, int log_level, Napi::Function tenso
4225

4326
data->ortTensorConstructor = Napi::Persistent(tensorConstructor);
4427

45-
if (!ortEnv) {
46-
std::lock_guard<std::mutex> lock(ortEnvMutex);
47-
if (!ortEnv) {
48-
ORT_NAPI_THROW_ERROR_IF(ortEnvDestroyed, env, "OrtEnv already destroyed.");
49-
ortEnv.reset(new Ort::Env{OrtLoggingLevel(log_level), "onnxruntime-node"});
50-
ortDefaultRunOptions.reset(new Ort::RunOptions{});
51-
}
52-
}
28+
// Only the first time call to OrtSingletonData::GetOrCreateOrtObjects() will create the Ort::Env
29+
OrtSingletonData::GetOrCreateOrtObjects(log_level);
5330
}
5431

5532
const Napi::FunctionReference& OrtInstanceData::TensorConstructor(Napi::Env env) {

js/node/src/ort_instance_data.h

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,6 @@
88

99
/**
1010
* The OrtInstanceData class is designed to manage the lifecycle of necessary instance data, including:
11-
* - The Ort::Env singleton instance.
12-
* This is a global singleton that is shared across all InferenceSessionWrap instances. It is created when the first
13-
* time `InferenceSession.initOrtOnce()` is called. It is destroyed when the last active NAPI Env is destroyed.
14-
* Once destroyed, it cannot be created again.
15-
*
1611
* - The Object reference of the InferenceSessionWrap class and the Tensor constructor.
1712
* This is a per-env data that has the same lifecycle as the Napi::Env. If there are worker threads, each thread will
1813
* have its own handle to the InferenceSessionWrap class and the Tensor constructor.
@@ -27,24 +22,11 @@ struct OrtInstanceData {
2722
static void InitOrt(Napi::Env env, int log_level, Napi::Function tensorConstructor);
2823
// Get the Tensor constructor reference for the Napi::Env
2924
static const Napi::FunctionReference& TensorConstructor(Napi::Env env);
30-
// Get the global Ort::Env
31-
static const Ort::Env* OrtEnv() { return ortEnv.get(); }
32-
// Get the default Ort::RunOptions
33-
static Ort::RunOptions* OrtDefaultRunOptions() { return ortDefaultRunOptions.get(); }
34-
35-
~OrtInstanceData();
3625

3726
private:
3827
OrtInstanceData();
3928

4029
// per env persistent constructors
4130
Napi::FunctionReference wrappedSessionConstructor;
4231
Napi::FunctionReference ortTensorConstructor;
43-
44-
// ORT env (global singleton)
45-
static std::unique_ptr<Ort::Env> ortEnv;
46-
static std::unique_ptr<Ort::RunOptions> ortDefaultRunOptions;
47-
static std::mutex ortEnvMutex;
48-
static std::atomic<uint64_t> ortEnvRefCount;
49-
static std::atomic<bool> ortEnvDestroyed;
5032
};

js/node/src/ort_singleton_data.cc

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// Licensed under the MIT License.
3+
4+
#include "ort_singleton_data.h"
5+
6+
OrtSingletonData::OrtObjects::OrtObjects(int log_level)
7+
: env{OrtLoggingLevel(log_level), "onnxruntime-node"},
8+
default_run_options{} {
9+
}
10+
11+
OrtSingletonData::OrtObjects& OrtSingletonData::GetOrCreateOrtObjects(int log_level) {
12+
static OrtObjects ort_objects(log_level);
13+
return ort_objects;
14+
}
15+
16+
const Ort::Env& OrtSingletonData::Env() {
17+
return GetOrCreateOrtObjects().env;
18+
}
19+
20+
const Ort::RunOptions& OrtSingletonData::DefaultRunOptions() {
21+
return GetOrCreateOrtObjects().default_run_options;
22+
}

js/node/src/ort_singleton_data.h

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// Licensed under the MIT License.
3+
4+
#pragma once
5+
6+
#include <napi.h>
7+
#include "onnxruntime_cxx_api.h"
8+
9+
/**
10+
* The OrtSingletonData class is designed to manage the lifecycle of necessary singleton instance data, including:
11+
*
12+
* - The Ort::Env singleton instance.
13+
* This is a global singleton that is shared across all InferenceSessionWrap instances. It is created when the first
14+
* time `InferenceSession.initOrtOnce()` is called.
15+
*
16+
* - The Ort::RunOptions singleton instance.
17+
* This is an empty default RunOptions instance. It is created once to allow reuse across all session inference runs.
18+
*
19+
* The OrtSingletonData class uses the "Meyers Singleton" pattern to ensure thread-safe lazy initialization, as well as
20+
* proper destruction order at program exit.
21+
*/
22+
struct OrtSingletonData {
23+
struct OrtObjects {
24+
Ort::Env env;
25+
Ort::RunOptions default_run_options;
26+
27+
private:
28+
// The following pattern ensures that OrtObjects can only be created by OrtSingletonData
29+
OrtObjects(int log_level);
30+
friend struct OrtSingletonData;
31+
};
32+
33+
static OrtObjects& GetOrCreateOrtObjects(int log_level = ORT_LOGGING_LEVEL_WARNING);
34+
35+
// Get the global Ort::Env
36+
static const Ort::Env& Env();
37+
38+
// Get the default Ort::RunOptions
39+
static const Ort::RunOptions& DefaultRunOptions();
40+
};

js/node/test/e2e/inference-session-run.ts renamed to js/node/test/api/inference-session-run.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import * as path from 'path';
66

77
import { assertTensorEqual, SQUEEZENET_INPUT0_DATA, SQUEEZENET_OUTPUT0_DATA, TEST_DATA_ROOT } from '../test-utils';
88

9-
describe('E2E Tests - InferenceSession.run()', async () => {
9+
describe('API Tests - InferenceSession.run()', async () => {
1010
let session: InferenceSession;
1111
const input0 = new Tensor('float32', SQUEEZENET_INPUT0_DATA, [1, 3, 224, 224]);
1212
const expectedOutput0 = new Tensor('float32', SQUEEZENET_OUTPUT0_DATA, [1, 1000, 1, 1]);

js/node/test/e2e/simple-e2e-tests.ts renamed to js/node/test/api/simple-api-tests.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ const MODEL_TEST_TYPES_CASES: Array<{
8787
},
8888
];
8989

90-
describe('E2E Tests - simple E2E tests', () => {
90+
describe('API Tests - simple API tests', () => {
9191
MODEL_TEST_TYPES_CASES.forEach((testCase) => {
9292
it(`${testCase.model}`, async () => {
9393
const session = await InferenceSession.create(testCase.model);

js/node/test/e2e/worker-test.ts renamed to js/node/test/api/worker-test.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ import { assertTensorEqual, SQUEEZENET_INPUT0_DATA, SQUEEZENET_OUTPUT0_DATA, TES
77
import * as path from 'path';
88

99
if (isMainThread) {
10-
describe('E2E Tests - worker test', () => {
10+
describe('API Tests - worker test', () => {
1111
it('should run in worker', (done) => {
1212
const worker = new Worker(__filename, {
1313
stdout: true,

js/node/test/standalone/README.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Process Exit Tests
2+
3+
These tests verify that the ORT Node.js binding handles various process exit scenarios without crashing, specifically addressing the mutex crash issue reported in [#24579](https://github.com/microsoft/onnxruntime/issues/24579).
4+
5+
## What This Tests
6+
7+
- **Normal process exit** - Verifies clean shutdown without mutex crashes
8+
- **`process.exit()` calls** - Tests the primary crash scenario that was fixed
9+
- **Uncaught exceptions** - Ensures crashes don't occur during unexpected exits
10+
- **Session cleanup** - Tests both explicit `session.release()` and automatic cleanup
11+
- **Stability** - Multiple runs to ensure consistent behavior
12+
13+
## How It Works
14+
15+
Each test runs in a separate Node.js process to isolate the test environment. Tests use command-line flags to control behavior:
16+
17+
- `--process-exit`: Triggers `process.exit(0)`
18+
- `--throw-exception`: Throws an uncaught exception
19+
- `--release`: Calls `session.release()` before exit
20+
21+
## Expected Result
22+
23+
All tests should pass without `mutex lock failed` or `std::__1::system_error` messages in stderr.

js/node/test/standalone/index.ts

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// Licensed under the MIT License.
3+
4+
import { spawn } from 'child_process';
5+
import * as assert from 'assert';
6+
import * as path from 'path';
7+
8+
describe('Standalone Process Tests', () => {
9+
// Helper function to run test script in a separate process
10+
const runTest = async (args: string[] = []): Promise<{ code: number; stdout: string; stderr: string }> =>
11+
new Promise((resolve, reject) => {
12+
// Use the compiled main.js file from the lib directory
13+
const testFile = path.join(__dirname, './main.js');
14+
15+
const child = spawn('node', [testFile, ...args], { stdio: 'pipe' });
16+
17+
let stdout = '';
18+
let stderr = '';
19+
20+
child.stdout.on('data', (data) => (stdout += data.toString()));
21+
child.stderr.on('data', (data) => (stderr += data.toString()));
22+
23+
child.on('close', (code) => {
24+
resolve({ code: code || 0, stdout, stderr });
25+
});
26+
27+
child.on('error', reject);
28+
});
29+
30+
// Helper function to check basic success criteria
31+
const assertSuccess = (result: { code: number; stdout: string; stderr: string }) => {
32+
assert.strictEqual(result.code, 0);
33+
assert.ok(result.stdout.includes('SUCCESS: Inference completed'));
34+
assert.ok(!result.stderr.includes('mutex lock failed'));
35+
};
36+
37+
// Helper function to check that no mutex crashes occurred
38+
const assertNoMutexErrors = (stderr: string) => {
39+
assert.ok(!stderr.includes('mutex lock failed'));
40+
assert.ok(!stderr.includes('std::__1::system_error'));
41+
};
42+
43+
it('should handle normal process exit', async () => {
44+
const result = await runTest([]);
45+
assertSuccess(result);
46+
});
47+
48+
it('should handle process.exit() call', async () => {
49+
const result = await runTest(['--process-exit']);
50+
assertSuccess(result);
51+
});
52+
53+
it('should handle uncaught exceptions', async () => {
54+
const result = await runTest(['--throw-exception']);
55+
56+
assert.notStrictEqual(result.code, 0);
57+
assert.ok(result.stdout.includes('SUCCESS: Inference completed'));
58+
assert.ok(result.stderr.includes('Test exception'));
59+
assertNoMutexErrors(result.stderr);
60+
});
61+
62+
it('should handle multiple process exits consistently', async () => {
63+
for (let i = 0; i < 3; i++) {
64+
const result = await runTest(['--process-exit']);
65+
assertSuccess(result);
66+
}
67+
});
68+
69+
it('should handle session.release() before normal exit', async () => {
70+
const result = await runTest(['--release']);
71+
assertSuccess(result);
72+
assert.ok(result.stdout.includes('Session released'));
73+
});
74+
75+
it('should handle session.release() before process.exit()', async () => {
76+
const result = await runTest(['--release', '--process-exit']);
77+
assertSuccess(result);
78+
assert.ok(result.stdout.includes('Session released'));
79+
});
80+
81+
it('should handle no session.release() before process.exit()', async () => {
82+
const result = await runTest(['--process-exit']);
83+
assertSuccess(result);
84+
assert.ok(result.stdout.includes('Session NOT released'));
85+
});
86+
});

0 commit comments

Comments
 (0)