diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake index f3afaf7033fd1..b6910795391b1 100644 --- a/cmake/onnxruntime_webassembly.cmake +++ b/cmake/onnxruntime_webassembly.cmake @@ -443,7 +443,8 @@ jsepDownload:_pp_") "SHELL:-s ASSERTIONS=0" "SHELL:-s SAFE_HEAP=0" "SHELL:-s STACK_OVERFLOW_CHECK=0" - --closure 1 + ## comment out closure compiler so that it's easier to debug + # --closure 1 ) endif() diff --git a/cmake/patches/dawn/dawn.patch b/cmake/patches/dawn/dawn.patch index 6f5a50530a88a..b260552ab80f8 100644 --- a/cmake/patches/dawn/dawn.patch +++ b/cmake/patches/dawn/dawn.patch @@ -34,8 +34,74 @@ index efd6491cd6..8ebc5d28b6 100644 emcmake cmake -GNinja -DDAWN_EMSCRIPTEN_TOOLCHAIN="path/to/emscripten" ../.. ninja +diff --git a/third_party/emdawnwebgpu/library_webgpu.js b/third_party/emdawnwebgpu/library_webgpu.js +index 5862ce4045..45df259bb7 100644 +--- a/third_party/emdawnwebgpu/library_webgpu.js ++++ b/third_party/emdawnwebgpu/library_webgpu.js +@@ -811,6 +811,61 @@ var LibraryWebGPU = { + {{{ runtimeKeepalivePush() }}} + WebGPU.Internals.futureInsert(futureId, adapter.requestDevice(desc).then((device) => { + {{{ runtimeKeepalivePop() }}} ++ ++ if (globalThis["WEBGPU_STAT"]) { ++ // a set that caches all active buffers ++ const buffers = WebGPU.Internals.buffers ??= new Set(); ++ // key is buffer usage, value is total size of buffers with that usage ++ const buffersTotalSize = WebGPU.Internals.buffersTotalSize ??= new Map(); ++ ++ WebGPU.Internals.buffersCreated ??= 0; ++ WebGPU.Internals.buffersDestroyed ??= 0; ++ WebGPU.Internals.buffersUploads ??= 0; ++ WebGPU.Internals.buffersExternalUploads ??= 0; ++ WebGPU.Internals.buffersDownloads ??= 0; ++ WebGPU.Internals.buffersExternalDownloads ??= 0; ++ ++ // create a proxy so that we can monitor buffer usages ++ device = new Proxy(device, { ++ // when call device.createBuffer(), the returned buffer should be added into buffers ++ get: (target, prop, _receiver) => { ++ if (prop === 'createBuffer') { ++ return (desc) => { ++ const buffer = target.createBuffer(desc); ++ const originalDestroy = buffer.destroy.bind(buffer); ++ buffer.destroy = () => { ++ const previousTotal = buffersTotalSize.get(buffer.usage); ++ buffersTotalSize.set(buffer.usage, previousTotal - buffer.size); ++ buffers.delete(buffer); ++ WebGPU.Internals.buffersDestroyed++; ++ originalDestroy(); ++ }; ++ ++ if (buffer.usage === (GPUBufferUsage.MAP_WRITE | GPUBufferUsage.COPY_SRC)) { ++ WebGPU.Internals.buffersUploads++; ++ } ++ if (buffer.usage === (GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ)) { ++ WebGPU.Internals.buffersDownloads++; ++ } ++ ++ buffers.add(buffer); ++ const previousTotal = buffersTotalSize.get(buffer.usage) ?? 0; ++ buffersTotalSize.set(buffer.usage, previousTotal + buffer.size); ++ WebGPU.Internals.buffersCreated++; ++ return buffer; ++ }; ++ } ++ const propertyValue = Reflect.get(target, prop); ++ if (typeof propertyValue === 'function') { ++ return propertyValue.bind(target); ++ } else { ++ return propertyValue; ++ } ++ }, ++ set: (target, prop, value, _receiver) => Reflect.set(target, prop, value), ++ }); ++ } ++ + WebGPU.Internals.jsObjectInsert(queuePtr, device.queue); + WebGPU.Internals.jsObjectInsert(devicePtr, device); + diff --git a/third_party/emdawnwebgpu/webgpu.cpp b/third_party/emdawnwebgpu/webgpu.cpp -index ca52b1237b..b11462fb87 100644 +index ca52b1237b..a30ca583c3 100644 --- a/third_party/emdawnwebgpu/webgpu.cpp +++ b/third_party/emdawnwebgpu/webgpu.cpp @@ -131,7 +131,6 @@ class RefCounted : NonMovable { @@ -62,7 +128,14 @@ index ca52b1237b..b11462fb87 100644 void Destroy(); const void* GetConstMappedRange(size_t offset, size_t size); -@@ -1168,7 +1169,11 @@ WGPUBuffer emwgpuCreateBuffer(const EventSource* source, +@@ -1164,11 +1165,17 @@ WGPUAdapter emwgpuCreateAdapter(const EventSource* source) { + + WGPUBuffer emwgpuCreateBuffer(const EventSource* source, + bool mappedAtCreation = false) { +- return new WGPUBufferImpl(source, mappedAtCreation); ++ auto x = new WGPUBufferImpl(source, mappedAtCreation); ++ // printf(" #C++: emwgpuCreateBuffer %p\n", x); ++ return x; } WGPUDevice emwgpuCreateDevice(const EventSource* source, WGPUQueue queue) { @@ -75,7 +148,7 @@ index ca52b1237b..b11462fb87 100644 } WGPUQueue emwgpuCreateQueue(const EventSource* source) { -@@ -1284,6 +1289,10 @@ WGPUBufferImpl::WGPUBufferImpl(const EventSource* source, bool mappedAtCreation) +@@ -1284,6 +1291,10 @@ WGPUBufferImpl::WGPUBufferImpl(const EventSource* source, bool mappedAtCreation) } } @@ -86,7 +159,7 @@ index ca52b1237b..b11462fb87 100644 void WGPUBufferImpl::Destroy() { emwgpuBufferDestroy(this); AbortPendingMap("Buffer was destroyed before mapping was resolved."); -@@ -1504,6 +1513,7 @@ WGPUFuture WGPUShaderModuleImpl::GetCompilationInfo( +@@ -1504,6 +1515,7 @@ WGPUFuture WGPUShaderModuleImpl::GetCompilationInfo( void wgpu##Name##Release(WGPU##Name o) { \ if (o->Release()) { \ delete o; \ @@ -94,3 +167,19 @@ index ca52b1237b..b11462fb87 100644 } \ } WGPU_OBJECTS(DEFINE_WGPU_DEFAULT_ADDREF_RELEASE) +@@ -1587,6 +1599,7 @@ WGPUFuture wgpuAdapterRequestDevice( + // ---------------------------------------------------------------------------- + + void wgpuBufferDestroy(WGPUBuffer buffer) { ++ // printf(" #C++: wgpuBufferDestroy %p\n", buffer); + buffer->Destroy(); + } + +@@ -1639,6 +1652,7 @@ void wgpuBufferUnmap(WGPUBuffer buffer) { + WGPUBuffer wgpuDeviceCreateBuffer(WGPUDevice device, + const WGPUBufferDescriptor* descriptor) { + WGPUBuffer buffer = new WGPUBufferImpl(device, descriptor->mappedAtCreation); ++ // printf(" #C++: wgpuDeviceCreateBuffer %p\n", buffer); + emwgpuDeviceCreateBuffer(device, descriptor, buffer); + return buffer; + } diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts index a0010df4643a4..3c04b500e0ab4 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -231,6 +231,16 @@ export class WebGpuBackend { private queryTimeBase?: bigint; queryType: TimestampQuery; + buffers = new Set(); + buffersTotalSize = new Map(); + + buffersCreated = 0; + buffersDestroyed = 0; + buffersUploads = 0; + buffersExternalUploads = 0; + buffersDownloads = 0; + buffersExternalDownloads = 0; + env: Env; sessionStatus: SessionState = 'default'; /** @@ -280,6 +290,67 @@ export class WebGpuBackend { } this.device = await adapter.requestDevice(deviceDescriptor); + + // @ts-expect-error Element implicitly has an 'any' type because type 'typeof globalThis' has no index signature.ts(7017) + if (globalThis.WEBGPU_STAT) { + const buffers = this.buffers; + const buffersTotalSize = this.buffersTotalSize; + + const buffersUploadsIncrement = () => { + this.buffersUploads++; + }; + const buffersDownloadsIncrement = () => { + this.buffersDownloads++; + }; + const buffersCreatedIncrement = () => { + this.buffersCreated++; + }; + const buffersDestroyedIncrement = () => { + this.buffersDestroyed++; + }; + + this.device = new Proxy(this.device, { + // when call device.createBuffer(), the returned buffer should be added into buffers + get: (target, prop, _receiver) => { + if (prop === 'createBuffer') { + return (desc: GPUBufferDescriptor) => { + const buffer = target.createBuffer(desc); + const originalDestroy = buffer.destroy.bind(buffer); + buffer.destroy = () => { + const previousTotal = buffersTotalSize.get(buffer.usage); + buffersTotalSize.set(buffer.usage, previousTotal - buffer.size); + buffers.delete(buffer); + buffersDestroyedIncrement(); + originalDestroy(); + }; + + // eslint-disable-next-line no-bitwise + if (buffer.usage === (GPUBufferUsage.MAP_WRITE | GPUBufferUsage.COPY_SRC)) { + buffersUploadsIncrement(); + } + // eslint-disable-next-line no-bitwise + if (buffer.usage === (GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ)) { + buffersDownloadsIncrement(); + } + + buffers.add(buffer); + const previousTotal = buffersTotalSize.get(buffer.usage) ?? 0; + buffersTotalSize.set(buffer.usage, previousTotal + buffer.size); + buffersCreatedIncrement(); + return buffer; + }; + } + const propertyValue = Reflect.get(target, prop); + if (typeof propertyValue === 'function') { + return propertyValue.bind(target); + } else { + return propertyValue; + } + }, + set: (target, prop, value, _receiver) => Reflect.set(target, prop, value), + }); + } + this.deviceInfo = new DeviceInfoImpl(this.device); this.adapterInfo = new AdapterInfoImpl(adapter.info || (await adapter.requestAdapterInfo())); this.gpuDataManager = createGpuDataManager(this); @@ -844,6 +915,7 @@ export class WebGpuBackend { ): () => Promise { return async () => { const data = await downloadGpuData(this, gpuBuffer, size); + this.buffersExternalDownloads++; return createView(data.buffer, type); }; } diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts index dbcf80adf3552..4bfc7925043fe 100644 --- a/js/web/lib/wasm/wasm-core-impl.ts +++ b/js/web/lib/wasm/wasm-core-impl.ts @@ -260,6 +260,8 @@ export const createSession = async ( let modelDataOffset: number, modelDataLength: number; const wasm = getInstance(); + wasm.webgpuStat?.('createSession_start'); + if (Array.isArray(modelData)) { // if model data is an array, it must be a 2-elements tuple containing the pointer and size of the model data [modelDataOffset, modelDataLength] = modelData; @@ -327,6 +329,7 @@ export const createSession = async ( } wasm.jsepOnCreateSession?.(); + wasm.webgpuStat?.('createSession_end'); // clear current MLContext after session creation if (wasm.currentContext) { @@ -436,6 +439,7 @@ export const createSession = async ( export const releaseSession = (sessionId: number): void => { const wasm = getInstance(); + wasm.webgpuStat?.('releaseSession_start'); const session = activeSessions.get(sessionId); if (!session) { throw new Error(`cannot release session. invalid session id: ${sessionId}`); @@ -462,6 +466,8 @@ export const releaseSession = (sessionId: number): void => { checkLastError("Can't release session."); } activeSessions.delete(sessionId); + + wasm.webgpuStat?.('releaseSession_end'); }; export const prepareInputOutputTensor = async ( @@ -633,6 +639,8 @@ export const run = async ( const outputValuesOffset = wasm.stackAlloc(outputCount * ptrSize); const outputNamesOffset = wasm.stackAlloc(outputCount * ptrSize); + wasm.webgpuStat?.('run_start'); + try { [runOptionsHandle, runOptionsAllocs] = setRunOptions(options); @@ -722,6 +730,7 @@ export const run = async ( } wasm.jsepOnRunStart?.(sessionHandle); + //wasm.webgpuStat?.('run_beforeAPI'); let errorCode: number; if (!BUILD_DEFS.DISABLE_JSEP && ioBindingState) { @@ -745,6 +754,8 @@ export const run = async ( ); } + //wasm.webgpuStat?.('run_afterAPI'); + if (errorCode !== 0) { checkLastError('failed to call OrtRun().'); } @@ -926,6 +937,8 @@ export const run = async ( false, ]); } + wasm.webgpuStat?.('run_end'); + return output; } finally { wasm.stackRestore(beforeRunStack); diff --git a/js/web/lib/wasm/wasm-types.ts b/js/web/lib/wasm/wasm-types.ts index 9b2ec71fd351d..7e94fec52c374 100644 --- a/js/web/lib/wasm/wasm-types.ts +++ b/js/web/lib/wasm/wasm-types.ts @@ -294,6 +294,7 @@ export declare namespace WebGpu { webgpuUnregisterBuffer(buffer: GPUBuffer): void; webgpuGetBuffer(bufferHandle: number): GPUBuffer; webgpuCreateDownloader(gpuBuffer: GPUBuffer, size: number, sessionHandle: number): () => Promise; + webgpuStat(label?: string): void; } } diff --git a/js/web/package.json b/js/web/package.json index 5defe05e78c1f..6651c05bce5be 100644 --- a/js/web/package.json +++ b/js/web/package.json @@ -72,7 +72,7 @@ "import": "./dist/ort.node.min.mjs", "require": "./dist/ort.node.min.js" }, - "import": "./dist/ort.bundle.min.mjs", + "import": "./dist/ort.bundle.mjs", "require": "./dist/ort.min.js" }, "./all": { diff --git a/js/web/script/build.ts b/js/web/script/build.ts index fd9224a2dcf8b..afbdcc5924836 100644 --- a/js/web/script/build.ts +++ b/js/web/script/build.ts @@ -46,14 +46,17 @@ const DEBUG = process.env.npm_config_debug || args.debug; // boolean|'verbose'|' /** * --webgpu-ep - * --no-webgpu-ep (default) + * --no-webgpu-ep * * Enable or disable the use of WebGPU EP. If enabled, the WebGPU EP will be used. If disabled, the WebGPU backend will * be used with JSEP. * + * The default value is not set. If not set, onnxruntime-web will determine whether to use WebGPU EP or JSEP based on + * the environment (globalThis.WEBGPU_EP). + * * (temporary) This flag is used to test the WebGPU EP integration. It will be removed in the future. */ -const USE_WEBGPU_EP = process.env.npm_config_webgpu_ep ?? args['webgpu-ep'] ?? true; +const USE_WEBGPU_EP = process.env.npm_config_webgpu_ep ?? args['webgpu-ep']; /** * Root folder of the source code: `/js/` @@ -69,7 +72,7 @@ const DEFAULT_DEFINE = { 'BUILD_DEFS.DISABLE_WASM': 'false', 'BUILD_DEFS.DISABLE_WASM_PROXY': 'false', 'BUILD_DEFS.ENABLE_BUNDLE_WASM_JS': 'false', - 'BUILD_DEFS.USE_WEBGPU_EP': JSON.stringify(!!USE_WEBGPU_EP), + 'BUILD_DEFS.USE_WEBGPU_EP': USE_WEBGPU_EP === undefined ? 'globalThis.WEBGPU_EP' : JSON.stringify(!!USE_WEBGPU_EP), 'BUILD_DEFS.IS_ESM': 'false', 'BUILD_DEFS.ESM_IMPORT_META_URL': 'undefined', @@ -601,6 +604,12 @@ async function main() { outputName: 'ort', define: { ...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGL': 'true' }, }); + // ort.bundle.mjs + await buildOrt({ + outputName: 'ort.bundle', + format: 'esm', + define: { ...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGL': 'true', 'BUILD_DEFS.ENABLE_BUNDLE_WASM_JS': 'true' }, + }); // ort.bundle.min.mjs await buildOrt({ isProduction: true, diff --git a/onnxruntime/wasm/post-webgpu.js b/onnxruntime/wasm/post-webgpu.js index e7631a97c34c6..b84dfd733af57 100644 --- a/onnxruntime/wasm/post-webgpu.js +++ b/onnxruntime/wasm/post-webgpu.js @@ -202,6 +202,8 @@ Module["webgpuInit"] = (setDefaultDevice) => { await gpuReadBuffer.mapAsync(GPUMapMode.READ); + WebGPU.Internals.buffersExternalDownloads++; + const arrayBuffer = gpuReadBuffer.getMappedRange(); return arrayBuffer.slice(0, originalSize); } finally { @@ -258,6 +260,23 @@ Module["webgpuInit"] = (setDefaultDevice) => { size ); webgpuCurrentDevice.queue.submit([commandEncoder.finish()]); + WebGPU.Internals.buffersExternalUploads++; gpuBufferForUploading.destroy(); }; + + Module["webgpuStat"] = (label) => { + if (globalThis["WEBGPU_STAT"]) { + console.log( + `[${label}] BufferCount: ${ + WebGPU.Internals.buffers?.size ?? 0 + }, Created: ${WebGPU.Internals.buffersCreated ?? 0}, Destroyed: ${ + WebGPU.Internals.buffersDestroyed ?? 0 + } Uploads: ${WebGPU.Internals.buffersUploads ?? 0}, Downloads: ${ + WebGPU.Internals.buffersDownloads ?? 0 + }, ExtUploads: ${ + WebGPU.Internals.buffersExternalUploads ?? 0 + }, ExtDownloads: ${WebGPU.Internals.buffersExternalDownloads ?? 0}` + ); + } + }; }; diff --git a/onnxruntime/wasm/pre-jsep.js b/onnxruntime/wasm/pre-jsep.js index a35ab129280c4..04507e5defae9 100644 --- a/onnxruntime/wasm/pre-jsep.js +++ b/onnxruntime/wasm/pre-jsep.js @@ -93,6 +93,15 @@ Module["jsepInit"] = (name, params) => { Module.jsepUploadExternalBuffer = (dataId, buffer) => { backend["upload"](dataId, buffer); + backend["buffersExternalUploads"]++; + }; + + Module["webgpuStat"] = (label) => { + if (globalThis["WEBGPU_STAT"]) { + console.log( + `[${label}] BufferCount: ${backend["buffers"].size}, Created: ${backend["buffersCreated"]}, Destroyed: ${backend["buffersDestroyed"]}, Uploads: ${backend["buffersUploads"]}, Downloads: ${backend["buffersDownloads"]}, ExtUploads: ${backend["buffersExternalUploads"]}, ExtDownloads: ${backend["buffersExternalDownloads"]}` + ); + } }; } else if (name === "webnn") { // Functions called from EM_ASM need to be assigned in a way that can be minified.