From 1b6b2361c5580e6887b96dd48fbd4691d0702294 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Fri, 7 Feb 2025 12:08:43 -0800 Subject: [PATCH 1/2] enable WebGPU EP in WebAssembly build --- .../external/onnxruntime_external_deps.cmake | 7 + cmake/onnxruntime_webassembly.cmake | 37 +- cmake/patches/dawn/dawn.patch | 73 +++ cmake/patches/emscripten/webgpu-externs.js | 577 ++++++++++++++++++ js/build_webgpu.bat | 79 +++ js/web/lib/build-def.d.ts | 7 + js/web/lib/wasm/jsep/init.ts | 136 +++-- js/web/lib/wasm/session-options.ts | 116 ++-- js/web/lib/wasm/wasm-core-impl.ts | 97 ++- js/web/lib/wasm/wasm-types.ts | 68 ++- js/web/script/build.ts | 17 +- .../core/framework/external_data_loader.cc | 7 +- .../core/framework/external_data_loader.h | 2 +- .../providers/webgpu/external_data_loader.cc | 40 ++ .../providers/webgpu/external_data_loader.h | 30 + onnxruntime/core/providers/webgpu/program.cc | 20 + onnxruntime/core/providers/webgpu/program.h | 1 + .../core/providers/webgpu/webgpu_context.cc | 53 +- .../webgpu/webgpu_execution_provider.cc | 7 + .../webgpu/webgpu_execution_provider.h | 3 + .../webgpu/webgpu_provider_factory.cc | 6 + onnxruntime/wasm/api.cc | 26 +- onnxruntime/wasm/api.h | 24 +- onnxruntime/wasm/js_post_js.js | 2 +- onnxruntime/wasm/js_post_js_64.js | 2 +- onnxruntime/wasm/post-webgpu.js | 263 ++++++++ onnxruntime/wasm/pre-async.js | 142 +++++ onnxruntime/wasm/pre-jsep.js | 308 ++++------ onnxruntime/wasm/pre.js | 15 +- tools/ci_build/build.py | 7 +- 30 files changed, 1783 insertions(+), 389 deletions(-) create mode 100644 cmake/patches/emscripten/webgpu-externs.js create mode 100644 js/build_webgpu.bat create mode 100644 onnxruntime/core/providers/webgpu/external_data_loader.cc create mode 100644 onnxruntime/core/providers/webgpu/external_data_loader.h create mode 100644 onnxruntime/wasm/post-webgpu.js create mode 100644 onnxruntime/wasm/pre-async.js diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 1b1e11c9772f9..7717caf54945b 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -637,6 +637,13 @@ if (onnxruntime_USE_WEBGPU) set(DAWN_BUILD_TESTS OFF CACHE BOOL "" FORCE) if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten") set(DAWN_EMSCRIPTEN_TOOLCHAIN "${REPO_ROOT}/cmake/external/emsdk/upstream/emscripten" CACHE STRING "" FORCE) + + # Update a few files in Emscripten + # + # The following files should be updated in Emscripten. We are waiting for the next Emscripten release to include + # these changes. For now, we apply the changes manually. 
+ # - ${DAWN_EMSCRIPTEN_TOOLCHAIN}/src/closure-externs/webgpu-externs.js + execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${PROJECT_SOURCE_DIR}/patches/emscripten/webgpu-externs.js" "${DAWN_EMSCRIPTEN_TOOLCHAIN}/src/closure-externs/webgpu-externs.js") else() if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY) set(DAWN_BUILD_MONOLITHIC_LIBRARY ON CACHE BOOL "" FORCE) diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake index 8106e46ccf580..f3afaf7033fd1 100644 --- a/cmake/onnxruntime_webassembly.cmake +++ b/cmake/onnxruntime_webassembly.cmake @@ -211,10 +211,14 @@ else() target_link_libraries(onnxruntime_webassembly PRIVATE tensorboard) endif() + set(onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/pre.js") + + set(EXPORTED_FUNCTIONS "_malloc,_free") if (onnxruntime_USE_JSEP) - set(EXPORTED_FUNCTIONS "_malloc,_free,_JsepOutput,_JsepGetNodeName") - else() - set(EXPORTED_FUNCTIONS "_malloc,_free") + string(APPEND EXPORTED_FUNCTIONS ",_JsepOutput,_JsepGetNodeName") + endif() + if (onnxruntime_USE_WEBGPU) + string(APPEND EXPORTED_FUNCTIONS ",_wgpuBufferRelease,_wgpuCreateInstance") endif() if (onnxruntime_ENABLE_WEBASSEMBLY_MEMORY64) @@ -312,13 +316,15 @@ else() target_compile_options(noexcep_operators PRIVATE ${SMEMORY_FLAG} -Wno-experimental) endif() target_link_options(onnxruntime_webassembly PRIVATE - --post-js "${ONNXRUNTIME_ROOT}/wasm/js_post_js_64.js" + "SHELL:--post-js \"${ONNXRUNTIME_ROOT}/wasm/js_post_js_64.js\"" ) + list(APPEND onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/js_post_js_64.js") else () set(MAXIMUM_MEMORY "4294967296") target_link_options(onnxruntime_webassembly PRIVATE - --post-js "${ONNXRUNTIME_ROOT}/wasm/js_post_js.js" + "SHELL:--post-js \"${ONNXRUNTIME_ROOT}/wasm/js_post_js.js\"" ) + list(APPEND onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/js_post_js.js") endif () target_link_options(onnxruntime_webassembly PRIVATE @@ -372,7 +378,6 @@ jsepDownload:_pp_") "SHELL:-s SIGNATURE_CONVERSIONS='${SIGNATURE_CONVERSIONS}'" ) endif () - set_target_properties(onnxruntime_webassembly PROPERTIES LINK_DEPENDS ${ONNXRUNTIME_ROOT}/wasm/pre.js) if (onnxruntime_USE_JSEP) # NOTE: "-s ASYNCIFY=1" is required for JSEP to work with WebGPU @@ -382,10 +387,8 @@ jsepDownload:_pp_") target_compile_definitions(onnxruntime_webassembly PRIVATE USE_JSEP=1) target_link_options(onnxruntime_webassembly PRIVATE "SHELL:--pre-js \"${ONNXRUNTIME_ROOT}/wasm/pre-jsep.js\"" - "SHELL:-s ASYNCIFY=1" - "SHELL:-s ASYNCIFY_STACK_SIZE=65536" ) - set_target_properties(onnxruntime_webassembly PROPERTIES LINK_DEPENDS ${ONNXRUNTIME_ROOT}/wasm/pre-jsep.js) + list(APPEND onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/pre-jsep.js") if (onnxruntime_ENABLE_WEBASSEMBLY_MEMORY64) target_link_options(onnxruntime_webassembly PRIVATE @@ -397,6 +400,20 @@ jsepDownload:_pp_") if (onnxruntime_USE_WEBGPU) target_compile_definitions(onnxruntime_webassembly PRIVATE USE_WEBGPU=1) + target_link_options(onnxruntime_webassembly PRIVATE + "SHELL:--post-js \"${ONNXRUNTIME_ROOT}/wasm/post-webgpu.js\"" + ) + list(APPEND onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/post-webgpu.js") + endif() + + if (onnxruntime_USE_JSEP OR onnxruntime_USE_WEBGPU OR onnxruntime_USE_WEBNN) + # if any of the above is enabled, we need to use the asyncify library + target_link_options(onnxruntime_webassembly PRIVATE + "SHELL:--pre-js \"${ONNXRUNTIME_ROOT}/wasm/pre-async.js\"" + "SHELL:-s ASYNCIFY=1" + "SHELL:-s ASYNCIFY_STACK_SIZE=65536" + ) + list(APPEND 
onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/pre-async.js") endif() if (onnxruntime_EMSCRIPTEN_SETTINGS) @@ -458,6 +475,8 @@ jsepDownload:_pp_") ) endif() + set_target_properties(onnxruntime_webassembly PROPERTIES LINK_DEPENDS "${onnxruntime_webassembly_script_deps}") + set(target_name_list ort) if (onnxruntime_ENABLE_TRAINING_APIS) diff --git a/cmake/patches/dawn/dawn.patch b/cmake/patches/dawn/dawn.patch index 2f85d5ab473b5..ac4c42bc15fce 100644 --- a/cmake/patches/dawn/dawn.patch +++ b/cmake/patches/dawn/dawn.patch @@ -34,3 +34,76 @@ index 6e8ae37593..633af91eef 100644 -q "${EM_BUILD_GEN_DIR}/struct_info_webgpu.json" "-I=${EM_BUILD_GEN_DIR}/include" +diff --git a/src/emdawnwebgpu/README.md b/src/emdawnwebgpu/README.md +index efd6491cd6..8ebc5d28b6 100644 +--- a/src/emdawnwebgpu/README.md ++++ b/src/emdawnwebgpu/README.md +@@ -56,7 +56,7 @@ Set up the build directory using emcmake + mkdir out/cmake-wasm + cd out/cmake-wasm + +-# Make sure the path is to the source checkout of Emscripten, not emsdk's release. ++# If using Emscripten v4.0.2 or lower, make sure the path is to the source checkout of Emscripten, not emsdk's release. + emcmake cmake -GNinja -DDAWN_EMSCRIPTEN_TOOLCHAIN="path/to/emscripten" ../.. + + ninja +diff --git a/third_party/emdawnwebgpu/webgpu.cpp b/third_party/emdawnwebgpu/webgpu.cpp +index ca52b1237b..b11462fb87 100644 +--- a/third_party/emdawnwebgpu/webgpu.cpp ++++ b/third_party/emdawnwebgpu/webgpu.cpp +@@ -131,7 +131,6 @@ class RefCounted : NonMovable { + bool Release() { + if (mRefCount.fetch_sub(1u, std::memory_order_release) == 1u) { + std::atomic_thread_fence(std::memory_order_acquire); +- emwgpuDelete(this); + return true; + } + return false; +@@ -234,6 +233,7 @@ class Ref { + static void Release(T value) { + if (value != nullptr && value->RefCounted::Release()) { + delete value; ++ emwgpuDelete(value); + } + } + +@@ -642,6 +642,7 @@ struct WGPUBufferImpl final : public EventSource, + public RefCountedWithExternalCount { + public: + WGPUBufferImpl(const EventSource* source, bool mappedAtCreation); ++ ~WGPUBufferImpl(); + + void Destroy(); + const void* GetConstMappedRange(size_t offset, size_t size); +@@ -1168,7 +1169,11 @@ WGPUBuffer emwgpuCreateBuffer(const EventSource* source, + } + + WGPUDevice emwgpuCreateDevice(const EventSource* source, WGPUQueue queue) { +- return new WGPUDeviceImpl(source, queue); ++ // This function is only called from JS via `importJsDevice()`, which ++ // needs to increment the external ref count to fix the behavior. 
++ WGPUDeviceImpl* device = new WGPUDeviceImpl(source, queue); ++ device->AddExternalRef(); ++ return device; + } + + WGPUQueue emwgpuCreateQueue(const EventSource* source) { +@@ -1284,6 +1289,10 @@ WGPUBufferImpl::WGPUBufferImpl(const EventSource* source, bool mappedAtCreation) + } + } + ++WGPUBufferImpl::~WGPUBufferImpl() { ++ Destroy(); ++} ++ + void WGPUBufferImpl::Destroy() { + emwgpuBufferDestroy(this); + AbortPendingMap("Buffer was destroyed before mapping was resolved."); +@@ -1504,6 +1513,7 @@ WGPUFuture WGPUShaderModuleImpl::GetCompilationInfo( + void wgpu##Name##Release(WGPU##Name o) { \ + if (o->Release()) { \ + delete o; \ ++ emwgpuDelete(o); \ + } \ + } + WGPU_OBJECTS(DEFINE_WGPU_DEFAULT_ADDREF_RELEASE) diff --git a/cmake/patches/emscripten/webgpu-externs.js b/cmake/patches/emscripten/webgpu-externs.js new file mode 100644 index 0000000000000..9dc1a6943ed51 --- /dev/null +++ b/cmake/patches/emscripten/webgpu-externs.js @@ -0,0 +1,577 @@ +/* + * WebGPU globals + * Generated using https://github.com/kainino0x/webidl-to-closure-externs + * against the spec's WebIDL: https://gpuweb.github.io/gpuweb/webgpu.idl + */ + +/** @type {?GPU} */ +Navigator.prototype.gpu; + +/** @type {?GPU} */ +WorkerNavigator.prototype.gpu; + +const GPUBufferUsage = {}; +/** @type {number} */ +GPUBufferUsage.MAP_READ; +/** @type {number} */ +GPUBufferUsage.MAP_WRITE; +/** @type {number} */ +GPUBufferUsage.COPY_SRC; +/** @type {number} */ +GPUBufferUsage.COPY_DST; +/** @type {number} */ +GPUBufferUsage.INDEX; +/** @type {number} */ +GPUBufferUsage.VERTEX; +/** @type {number} */ +GPUBufferUsage.UNIFORM; +/** @type {number} */ +GPUBufferUsage.STORAGE; +/** @type {number} */ +GPUBufferUsage.INDIRECT; +/** @type {number} */ +GPUBufferUsage.QUERY_RESOLVE; + +const GPUMapMode = {}; +/** @type {number} */ +GPUMapMode.READ; +/** @type {number} */ +GPUMapMode.WRITE; + +const GPUTextureUsage = {}; +/** @type {number} */ +GPUTextureUsage.COPY_SRC; +/** @type {number} */ +GPUTextureUsage.COPY_DST; +/** @type {number} */ +GPUTextureUsage.TEXTURE_BINDING; +/** @type {number} */ +GPUTextureUsage.STORAGE_BINDING; +/** @type {number} */ +GPUTextureUsage.RENDER_ATTACHMENT; + +const GPUShaderStage = {}; +/** @type {number} */ +GPUShaderStage.VERTEX; +/** @type {number} */ +GPUShaderStage.FRAGMENT; +/** @type {number} */ +GPUShaderStage.COMPUTE; + +const GPUColorWrite = {}; +/** @type {number} */ +GPUColorWrite.RED; +/** @type {number} */ +GPUColorWrite.GREEN; +/** @type {number} */ +GPUColorWrite.BLUE; +/** @type {number} */ +GPUColorWrite.ALPHA; +/** @type {number} */ +GPUColorWrite.ALL; + +/** @constructor */ +function GPUSupportedLimits() {} +/** @type {number} */ +GPUSupportedLimits.prototype.maxTextureDimension1D; +/** @type {number} */ +GPUSupportedLimits.prototype.maxTextureDimension2D; +/** @type {number} */ +GPUSupportedLimits.prototype.maxTextureDimension3D; +/** @type {number} */ +GPUSupportedLimits.prototype.maxTextureArrayLayers; +/** @type {number} */ +GPUSupportedLimits.prototype.maxBindGroups; +/** @type {number} */ +GPUSupportedLimits.prototype.maxBindGroupsPlusVertexBuffers; +/** @type {number} */ +GPUSupportedLimits.prototype.maxBindingsPerBindGroup; +/** @type {number} */ +GPUSupportedLimits.prototype.maxDynamicUniformBuffersPerPipelineLayout; +/** @type {number} */ +GPUSupportedLimits.prototype.maxDynamicStorageBuffersPerPipelineLayout; +/** @type {number} */ +GPUSupportedLimits.prototype.maxSampledTexturesPerShaderStage; +/** @type {number} */ +GPUSupportedLimits.prototype.maxSamplersPerShaderStage; 
+/** @type {number} */ +GPUSupportedLimits.prototype.maxStorageBuffersPerShaderStage; +/** @type {number} */ +GPUSupportedLimits.prototype.maxStorageTexturesPerShaderStage; +/** @type {number} */ +GPUSupportedLimits.prototype.maxUniformBuffersPerShaderStage; +/** @type {number} */ +GPUSupportedLimits.prototype.maxUniformBufferBindingSize; +/** @type {number} */ +GPUSupportedLimits.prototype.maxStorageBufferBindingSize; +/** @type {number} */ +GPUSupportedLimits.prototype.minUniformBufferOffsetAlignment; +/** @type {number} */ +GPUSupportedLimits.prototype.minStorageBufferOffsetAlignment; +/** @type {number} */ +GPUSupportedLimits.prototype.maxVertexBuffers; +/** @type {number} */ +GPUSupportedLimits.prototype.maxBufferSize; +/** @type {number} */ +GPUSupportedLimits.prototype.maxVertexAttributes; +/** @type {number} */ +GPUSupportedLimits.prototype.maxVertexBufferArrayStride; +/** @type {number} */ +GPUSupportedLimits.prototype.maxInterStageShaderComponents; +/** @type {number} */ +GPUSupportedLimits.prototype.maxInterStageShaderVariables; +/** @type {number} */ +GPUSupportedLimits.prototype.maxColorAttachments; +/** @type {number} */ +GPUSupportedLimits.prototype.maxColorAttachmentBytesPerSample; +/** @type {number} */ +GPUSupportedLimits.prototype.maxComputeWorkgroupStorageSize; +/** @type {number} */ +GPUSupportedLimits.prototype.maxComputeInvocationsPerWorkgroup; +/** @type {number} */ +GPUSupportedLimits.prototype.maxComputeWorkgroupSizeX; +/** @type {number} */ +GPUSupportedLimits.prototype.maxComputeWorkgroupSizeY; +/** @type {number} */ +GPUSupportedLimits.prototype.maxComputeWorkgroupSizeZ; +/** @type {number} */ +GPUSupportedLimits.prototype.maxComputeWorkgroupsPerDimension; + +/** @constructor */ +function GPUSupportedFeatures() {} +/** @type {number} */ +GPUSupportedFeatures.prototype.size; +/** @return {!Iterable} */ +GPUSupportedFeatures.prototype.entries = function() {}; +/** @return {!Iterable} */ +GPUSupportedFeatures.prototype.keys = function() {}; +/** @return {!Iterable} */ +GPUSupportedFeatures.prototype.values = function() {}; +/** @return {undefined} */ +GPUSupportedFeatures.prototype.forEach = function() {}; +/** @return {boolean} */ +GPUSupportedFeatures.prototype.has = function() {}; + +/** @constructor */ +function WGSLLanguageFeatures() {} +/** @type {number} */ +WGSLLanguageFeatures.prototype.size; +/** @return {!Iterable} */ +WGSLLanguageFeatures.prototype.entries = function() {}; +/** @return {!Iterable} */ +WGSLLanguageFeatures.prototype.keys = function() {}; +/** @return {!Iterable} */ +WGSLLanguageFeatures.prototype.values = function() {}; +/** @return {undefined} */ +WGSLLanguageFeatures.prototype.forEach = function() {}; +/** @return {boolean} */ +WGSLLanguageFeatures.prototype.has = function() {}; + +/** @constructor */ +function GPUAdapterInfo() {} +/** @type {string} */ +GPUAdapterInfo.prototype.vendor; +/** @type {string} */ +GPUAdapterInfo.prototype.architecture; +/** @type {string} */ +GPUAdapterInfo.prototype.device; +/** @type {string} */ +GPUAdapterInfo.prototype.description; + +/** @constructor */ +function GPU() {} +/** @return {!Promise} */ +GPU.prototype.requestAdapter = function() {}; +/** @return {string} */ +GPU.prototype.getPreferredCanvasFormat = function() {}; +/** @type {!WGSLLanguageFeatures} */ +GPU.prototype.wgslLanguageFeatures; + +/** @constructor */ +function GPUAdapter() {} +/** @type {!GPUSupportedFeatures} */ +GPUAdapter.prototype.features; +/** @type {!GPUSupportedLimits} */ +GPUAdapter.prototype.limits; +/** @type {boolean} 
*/ +GPUAdapter.prototype.isFallbackAdapter; +/** @return {!Promise} */ +GPUAdapter.prototype.requestDevice = function() {}; +/** @return {!Promise} */ +GPUAdapter.prototype.requestAdapterInfo = function() {}; +/** @type {!GPUAdapterInfo} */ +GPUAdapter.prototype.info; + +/** @constructor */ +function GPUDevice() {} +/** @type {string} */ +GPUDevice.prototype.label; +/** @type {!GPUSupportedFeatures} */ +GPUDevice.prototype.features; +/** @type {!GPUSupportedLimits} */ +GPUDevice.prototype.limits; +/** @type {!GPUQueue} */ +GPUDevice.prototype.queue; +/** @return {undefined} */ +GPUDevice.prototype.destroy = function() {}; +/** @return {!GPUBuffer} */ +GPUDevice.prototype.createBuffer = function() {}; +/** @return {!GPUTexture} */ +GPUDevice.prototype.createTexture = function() {}; +/** @return {!GPUSampler} */ +GPUDevice.prototype.createSampler = function() {}; +/** @return {!GPUExternalTexture} */ +GPUDevice.prototype.importExternalTexture = function() {}; +/** @return {!GPUBindGroupLayout} */ +GPUDevice.prototype.createBindGroupLayout = function() {}; +/** @return {!GPUPipelineLayout} */ +GPUDevice.prototype.createPipelineLayout = function() {}; +/** @return {!GPUBindGroup} */ +GPUDevice.prototype.createBindGroup = function() {}; +/** @return {!GPUShaderModule} */ +GPUDevice.prototype.createShaderModule = function() {}; +/** @return {!GPUComputePipeline} */ +GPUDevice.prototype.createComputePipeline = function() {}; +/** @return {!GPURenderPipeline} */ +GPUDevice.prototype.createRenderPipeline = function() {}; +/** @return {!Promise} */ +GPUDevice.prototype.createComputePipelineAsync = function() {}; +/** @return {!Promise} */ +GPUDevice.prototype.createRenderPipelineAsync = function() {}; +/** @return {!GPUCommandEncoder} */ +GPUDevice.prototype.createCommandEncoder = function() {}; +/** @return {!GPURenderBundleEncoder} */ +GPUDevice.prototype.createRenderBundleEncoder = function() {}; +/** @return {!GPUQuerySet} */ +GPUDevice.prototype.createQuerySet = function() {}; +/** @type {!Promise} */ +GPUDevice.prototype.lost; +/** @return {undefined} */ +GPUDevice.prototype.pushErrorScope = function() {}; +/** @return {!Promise} */ +GPUDevice.prototype.popErrorScope = function() {}; +/** @type {!Function} */ +GPUDevice.prototype.onuncapturederror; +/** @type {!GPUAdapterInfo} */ +GPUDevice.prototype.adapterInfo; + +/** @constructor */ +function GPUBuffer() {} +/** @type {string} */ +GPUBuffer.prototype.label; +/** @type {number} */ +GPUBuffer.prototype.size; +/** @type {number} */ +GPUBuffer.prototype.usage; +/** @type {string} */ +GPUBuffer.prototype.mapState; +/** @return {!Promise} */ +GPUBuffer.prototype.mapAsync = function() {}; +/** @return {!ArrayBuffer} */ +GPUBuffer.prototype.getMappedRange = function() {}; +/** @return {undefined} */ +GPUBuffer.prototype.unmap = function() {}; +/** @return {undefined} */ +GPUBuffer.prototype.destroy = function() {}; + +/** @constructor */ +function GPUTexture() {} +/** @type {string} */ +GPUTexture.prototype.label; +/** @return {!GPUTextureView} */ +GPUTexture.prototype.createView = function() {}; +/** @return {undefined} */ +GPUTexture.prototype.destroy = function() {}; +/** @type {number} */ +GPUTexture.prototype.width; +/** @type {number} */ +GPUTexture.prototype.height; +/** @type {number} */ +GPUTexture.prototype.depthOrArrayLayers; +/** @type {number} */ +GPUTexture.prototype.mipLevelCount; +/** @type {number} */ +GPUTexture.prototype.sampleCount; +/** @type {string} */ +GPUTexture.prototype.dimension; +/** @type {string} */ 
+GPUTexture.prototype.format; +/** @type {number} */ +GPUTexture.prototype.usage; + +/** @constructor */ +function GPUTextureView() {} +/** @type {string} */ +GPUTextureView.prototype.label; + +/** @constructor */ +function GPUExternalTexture() {} +/** @type {string} */ +GPUExternalTexture.prototype.label; + +/** @constructor */ +function GPUSampler() {} +/** @type {string} */ +GPUSampler.prototype.label; + +/** @constructor */ +function GPUBindGroupLayout() {} +/** @type {string} */ +GPUBindGroupLayout.prototype.label; + +/** @constructor */ +function GPUBindGroup() {} +/** @type {string} */ +GPUBindGroup.prototype.label; + +/** @constructor */ +function GPUPipelineLayout() {} +/** @type {string} */ +GPUPipelineLayout.prototype.label; + +/** @constructor */ +function GPUShaderModule() {} +/** @type {string} */ +GPUShaderModule.prototype.label; +/** @return {!Promise} */ +GPUShaderModule.prototype.getCompilationInfo = function() {}; + +/** @constructor */ +function GPUCompilationMessage() {} +/** @type {string} */ +GPUCompilationMessage.prototype.message; +/** @type {string} */ +GPUCompilationMessage.prototype.type; +/** @type {number} */ +GPUCompilationMessage.prototype.lineNum; +/** @type {number} */ +GPUCompilationMessage.prototype.linePos; +/** @type {number} */ +GPUCompilationMessage.prototype.offset; +/** @type {number} */ +GPUCompilationMessage.prototype.length; + +/** @constructor */ +function GPUCompilationInfo() {} +/** @type {!Array} */ +GPUCompilationInfo.prototype.messages; + +/** @constructor */ +function GPUPipelineError() {} +/** @type {string} */ +GPUPipelineError.prototype.reason; + +/** @constructor */ +function GPUComputePipeline() {} +/** @type {string} */ +GPUComputePipeline.prototype.label; +/** @return {!GPUBindGroupLayout} */ +GPUComputePipeline.prototype.getBindGroupLayout = function() {}; + +/** @constructor */ +function GPURenderPipeline() {} +/** @type {string} */ +GPURenderPipeline.prototype.label; +/** @return {!GPUBindGroupLayout} */ +GPURenderPipeline.prototype.getBindGroupLayout = function() {}; + +/** @constructor */ +function GPUCommandBuffer() {} +/** @type {string} */ +GPUCommandBuffer.prototype.label; + +/** @constructor */ +function GPUCommandEncoder() {} +/** @type {string} */ +GPUCommandEncoder.prototype.label; +/** @return {undefined} */ +GPUCommandEncoder.prototype.pushDebugGroup = function() {}; +/** @return {undefined} */ +GPUCommandEncoder.prototype.popDebugGroup = function() {}; +/** @return {undefined} */ +GPUCommandEncoder.prototype.insertDebugMarker = function() {}; +/** @return {!GPURenderPassEncoder} */ +GPUCommandEncoder.prototype.beginRenderPass = function() {}; +/** @return {!GPUComputePassEncoder} */ +GPUCommandEncoder.prototype.beginComputePass = function() {}; +/** @return {undefined} */ +GPUCommandEncoder.prototype.copyBufferToBuffer = function() {}; +/** @return {undefined} */ +GPUCommandEncoder.prototype.copyBufferToTexture = function() {}; +/** @return {undefined} */ +GPUCommandEncoder.prototype.copyTextureToBuffer = function() {}; +/** @return {undefined} */ +GPUCommandEncoder.prototype.copyTextureToTexture = function() {}; +/** @return {undefined} */ +GPUCommandEncoder.prototype.clearBuffer = function() {}; +/** @return {undefined} */ +GPUCommandEncoder.prototype.resolveQuerySet = function() {}; +/** @return {!GPUCommandBuffer} */ +GPUCommandEncoder.prototype.finish = function() {}; + +/** @constructor */ +function GPUComputePassEncoder() {} +/** @type {string} */ +GPUComputePassEncoder.prototype.label; +/** @return 
{undefined} */ +GPUComputePassEncoder.prototype.pushDebugGroup = function() {}; +/** @return {undefined} */ +GPUComputePassEncoder.prototype.popDebugGroup = function() {}; +/** @return {undefined} */ +GPUComputePassEncoder.prototype.insertDebugMarker = function() {}; +/** @return {undefined} */ +GPUComputePassEncoder.prototype.setBindGroup = function() {}; +/** @return {undefined} */ +GPUComputePassEncoder.prototype.setBindGroup = function() {}; +/** @return {undefined} */ +GPUComputePassEncoder.prototype.setPipeline = function() {}; +/** @return {undefined} */ +GPUComputePassEncoder.prototype.dispatchWorkgroups = function() {}; +/** @return {undefined} */ +GPUComputePassEncoder.prototype.dispatchWorkgroupsIndirect = function() {}; +/** @return {undefined} */ +GPUComputePassEncoder.prototype.end = function() {}; + +/** @constructor */ +function GPURenderPassEncoder() {} +/** @type {string} */ +GPURenderPassEncoder.prototype.label; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.pushDebugGroup = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.popDebugGroup = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.insertDebugMarker = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.setBindGroup = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.setBindGroup = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.setPipeline = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.setIndexBuffer = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.setVertexBuffer = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.draw = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.drawIndexed = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.drawIndirect = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.drawIndexedIndirect = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.setViewport = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.setScissorRect = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.setBlendConstant = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.setStencilReference = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.beginOcclusionQuery = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.endOcclusionQuery = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.executeBundles = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.end = function() {}; + +/** @constructor */ +function GPURenderBundle() {} +/** @type {string} */ +GPURenderBundle.prototype.label; + +/** @constructor */ +function GPURenderBundleEncoder() {} +/** @type {string} */ +GPURenderBundleEncoder.prototype.label; +/** @return {undefined} */ +GPURenderBundleEncoder.prototype.pushDebugGroup = function() {}; +/** @return {undefined} */ +GPURenderBundleEncoder.prototype.popDebugGroup = function() {}; +/** @return {undefined} */ +GPURenderBundleEncoder.prototype.insertDebugMarker = function() {}; +/** @return {undefined} */ +GPURenderBundleEncoder.prototype.setBindGroup = function() {}; +/** @return {undefined} */ +GPURenderBundleEncoder.prototype.setBindGroup = function() {}; +/** @return {undefined} */ +GPURenderBundleEncoder.prototype.setPipeline = 
function() {}; +/** @return {undefined} */ +GPURenderBundleEncoder.prototype.setIndexBuffer = function() {}; +/** @return {undefined} */ +GPURenderBundleEncoder.prototype.setVertexBuffer = function() {}; +/** @return {undefined} */ +GPURenderBundleEncoder.prototype.draw = function() {}; +/** @return {undefined} */ +GPURenderBundleEncoder.prototype.drawIndexed = function() {}; +/** @return {undefined} */ +GPURenderBundleEncoder.prototype.drawIndirect = function() {}; +/** @return {undefined} */ +GPURenderBundleEncoder.prototype.drawIndexedIndirect = function() {}; +/** @return {!GPURenderBundle} */ +GPURenderBundleEncoder.prototype.finish = function() {}; + +/** @constructor */ +function GPUQueue() {} +/** @type {string} */ +GPUQueue.prototype.label; +/** @return {undefined} */ +GPUQueue.prototype.submit = function() {}; +/** @return {!Promise} */ +GPUQueue.prototype.onSubmittedWorkDone = function() {}; +/** @return {undefined} */ +GPUQueue.prototype.writeBuffer = function() {}; +/** @return {undefined} */ +GPUQueue.prototype.writeTexture = function() {}; +/** @return {undefined} */ +GPUQueue.prototype.copyExternalImageToTexture = function() {}; + +/** @constructor */ +function GPUQuerySet() {} +/** @type {string} */ +GPUQuerySet.prototype.label; +/** @return {undefined} */ +GPUQuerySet.prototype.destroy = function() {}; +/** @type {string} */ +GPUQuerySet.prototype.type; +/** @type {number} */ +GPUQuerySet.prototype.count; + +/** @constructor */ +function GPUCanvasContext() {} +/** @type {!HTMLCanvasElement|!OffscreenCanvas} */ +GPUCanvasContext.prototype.canvas; +/** @return {undefined} */ +GPUCanvasContext.prototype.configure = function() {}; +/** @return {undefined} */ +GPUCanvasContext.prototype.unconfigure = function() {}; +/** @return {!GPUTexture} */ +GPUCanvasContext.prototype.getCurrentTexture = function() {}; + +/** @constructor */ +function GPUDeviceLostInfo() {} +/** @type {string} */ +GPUDeviceLostInfo.prototype.reason; +/** @type {string} */ +GPUDeviceLostInfo.prototype.message; + +/** @constructor */ +function GPUError() {} +/** @type {string} */ +GPUError.prototype.message; + +/** @constructor */ +function GPUValidationError() {} + +/** @constructor */ +function GPUOutOfMemoryError() {} + +/** @constructor */ +function GPUInternalError() {} + +/** @constructor */ +function GPUUncapturedErrorEvent() {} +/** @type {!GPUError} */ +GPUUncapturedErrorEvent.prototype.error; diff --git a/js/build_webgpu.bat b/js/build_webgpu.bat new file mode 100644 index 0000000000000..95413509e701d --- /dev/null +++ b/js/build_webgpu.bat @@ -0,0 +1,79 @@ +@echo off + +rem build_webgpu.bat --- build onnxruntime-web with WebGPU EP +rem +rem Usage: +rem build_webgpu.bat config [clean] +rem +rem Options: +rem config Build configuration, "d" or "r" +rem clean Perform a clean build, "clean" or empty + +setlocal enabledelayedexpansion + +set ROOT=%~dp0..\ +set BUILD_DIR=%ROOT%build_webgpu + +:arg1 +if ["%~1"]==["d"] ( + set CONFIG=Debug + set CONFIG_EXTRA_FLAG= + @rem --enable_wasm_profiling --wasm_run_tests_in_browser + @rem --cmake_extra_defines onnxruntime_ENABLE_WEBASSEMBLY_OUTPUT_OPTIMIZED_MODEL=1 + @rem --enable_wasm_debug_info + goto :arg2 +) +if ["%~1"]==["r"] ( + set CONFIG=Release + set CONFIG_EXTRA_FLAG= + @rem --enable_wasm_api_exception_catching --disable_rtti + goto :arg2 +) +echo Invalid configuration "%~1", must be "d"(Debug) or "r"(Release) +exit /b 1 + +:arg2 +if ["%~2"]==["clean"] ( + goto :clean +) +if not exist "%ROOT%js\web\dist" ( + goto :npm_ci +) + +goto :build_wasm + +:clean 
+if exist "%BUILD_DIR%" ( + rd /s /q %BUILD_DIR% +) + +pushd %ROOT% +git submodule sync --recursive +git submodule update --init --recursive +popd + +:npm_ci +pushd %ROOT%js +call npm ci +popd +pushd %ROOT%js\common +call npm ci +popd +pushd %ROOT%js\web +call npm ci +call npm run pull:wasm +popd + +:build_wasm + +set PATH=C:\Program Files\Git\usr\bin;%PATH% + +call %ROOT%build.bat --config %CONFIG% %CONFIG_EXTRA_FLAG% --skip_submodule_sync --build_wasm --target onnxruntime_webassembly --skip_tests^ + --enable_wasm_simd --enable_wasm_threads --use_jsep --use_webnn --use_webgpu --build_dir %BUILD_DIR% + +IF NOT "%ERRORLEVEL%" == "0" ( + exit /b %ERRORLEVEL% +) + +copy /Y %BUILD_DIR%\%CONFIG%\ort-wasm-simd-threaded.jsep.wasm %ROOT%js\web\dist\ +copy /Y %BUILD_DIR%\%CONFIG%\ort-wasm-simd-threaded.jsep.mjs %ROOT%js\web\dist\ diff --git a/js/web/lib/build-def.d.ts b/js/web/lib/build-def.d.ts index 59f64a3179605..83a52ebaefe05 100644 --- a/js/web/lib/build-def.d.ts +++ b/js/web/lib/build-def.d.ts @@ -40,6 +40,13 @@ interface BuildDefinitions { */ readonly ENABLE_BUNDLE_WASM_JS: boolean; + /** + * defines whether to use WebGPU EP instead of JSEP for WebGPU backend. + * + * This flag requires the corresponding WebAssembly artifact to be built with `--use_webgpu` flag. + */ + readonly USE_WEBGPU_EP: boolean; + // #endregion // #region Build definitions for ESM diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index b4071eae51c8f..fe9576b87ad72 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -1,17 +1,17 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import { Env } from 'onnxruntime-common'; +import type { Env } from 'onnxruntime-common'; import { calculateTensorSizeInBytes, DataType } from '../wasm-common'; import type { OrtWasmModule } from '../wasm-types'; -import { WebGpuBackend } from './backend-webgpu'; +import type { WebGpuBackend } from './backend-webgpu'; import { LOG_DEBUG } from './log'; -import { TensorView } from './tensor-view'; +import type { TensorView } from './tensor-view'; import { ShapeUtil } from './util'; -import { +import type { AdapterInfo, ComputeContext, ComputeContextInputsOutputsMapping, @@ -205,79 +205,83 @@ export const init = async ( } if (name === 'webgpu') { - const backend = new WebGpuBackend(); - await backend.initialize(env, gpuAdapter!); + if (!BUILD_DEFS.USE_WEBGPU_EP) { + // eslint-disable-next-line @typescript-eslint/no-require-imports, @typescript-eslint/no-var-requires + const webGpuBackendImpl = require('./backend-webgpu').WebGpuBackend; + const backend = new webGpuBackendImpl(); + await backend.initialize(env, gpuAdapter!); - jsepInit('webgpu', [ - // backend - backend, + jsepInit('webgpu', [ + // backend + backend, + + // jsepAlloc() + (size: number) => backend.alloc(Number(size)), - // jsepAlloc() - (size: number) => backend.alloc(Number(size)), + // jsepFree() + (ptr: number) => backend.free(ptr), - // jsepFree() - (ptr: number) => backend.free(ptr), + // jsepCopy(src, dst, size, isSourceGpu) + (src: number, dst: number, size: number, isSourceGpu = false) => { + if (isSourceGpu) { + LOG_DEBUG( + 'verbose', + () => `[WebGPU] jsepCopyGpuToGpu: src=${Number(src)}, dst=${Number(dst)}, size=${Number(size)}`, + ); + backend.memcpy(Number(src), Number(dst)); + } else { + LOG_DEBUG( + 'verbose', + () => + `[WebGPU] jsepCopyCpuToGpu: dataOffset=${Number(src)}, gpuDataId=${Number(dst)}, size=${Number(size)}`, + ); + const data = 
module.HEAPU8.subarray(Number(src >>> 0), Number(src >>> 0) + Number(size)); + backend.upload(Number(dst), data); + } + }, - // jsepCopy(src, dst, size, isSourceGpu) - (src: number, dst: number, size: number, isSourceGpu = false) => { - if (isSourceGpu) { + // jsepCopyAsync(src, dst, size) + async (gpuDataId: number, dataOffset: number, size: number): Promise => { LOG_DEBUG( 'verbose', - () => `[WebGPU] jsepCopyGpuToGpu: src=${Number(src)}, dst=${Number(dst)}, size=${Number(size)}`, + () => `[WebGPU] jsepCopyGpuToCpu: gpuDataId=${gpuDataId}, dataOffset=${dataOffset}, size=${size}`, ); - backend.memcpy(Number(src), Number(dst)); - } else { - LOG_DEBUG( - 'verbose', - () => - `[WebGPU] jsepCopyCpuToGpu: dataOffset=${Number(src)}, gpuDataId=${Number(dst)}, size=${Number(size)}`, - ); - const data = module.HEAPU8.subarray(Number(src >>> 0), Number(src >>> 0) + Number(size)); - backend.upload(Number(dst), data); - } - }, - // jsepCopyAsync(src, dst, size) - async (gpuDataId: number, dataOffset: number, size: number): Promise => { - LOG_DEBUG( - 'verbose', - () => `[WebGPU] jsepCopyGpuToCpu: gpuDataId=${gpuDataId}, dataOffset=${dataOffset}, size=${size}`, - ); - - await backend.download(Number(gpuDataId), () => - module.HEAPU8.subarray(Number(dataOffset) >>> 0, Number(dataOffset + size) >>> 0), - ); - }, + await backend.download(Number(gpuDataId), () => + module.HEAPU8.subarray(Number(dataOffset) >>> 0, Number(dataOffset + size) >>> 0), + ); + }, - // jsepCreateKernel - (kernelType: string, kernelId: number, attribute: unknown) => - backend.createKernel( - kernelType, - Number(kernelId), - attribute, - module.UTF8ToString(module._JsepGetNodeName!(Number(kernelId))), - ), + // jsepCreateKernel + (kernelType: string, kernelId: number, attribute: unknown) => + backend.createKernel( + kernelType, + Number(kernelId), + attribute, + module.UTF8ToString(module._JsepGetNodeName!(Number(kernelId))), + ), - // jsepReleaseKernel - (kernel: number) => backend.releaseKernel(kernel), + // jsepReleaseKernel + (kernel: number) => backend.releaseKernel(kernel), - // jsepRun - (kernel: number, contextDataOffset: number, sessionHandle: number, errors: Array>) => { - LOG_DEBUG( - 'verbose', - () => - `[WebGPU] jsepRun: sessionHandle=${sessionHandle}, kernel=${kernel}, contextDataOffset=${contextDataOffset}`, - ); - const context = new ComputeContextImpl(module, backend, Number(contextDataOffset)); - return backend.computeKernel(Number(kernel), context, errors); - }, - // jsepCaptureBegin - () => backend.captureBegin(), - // jsepCaptureEnd - () => backend.captureEnd(), - // jsepReplay - () => backend.replay(), - ]); + // jsepRun + (kernel: number, contextDataOffset: number, sessionHandle: number, errors: Array>) => { + LOG_DEBUG( + 'verbose', + () => + `[WebGPU] jsepRun: sessionHandle=${sessionHandle}, kernel=${kernel}, contextDataOffset=${contextDataOffset}`, + ); + const context = new ComputeContextImpl(module, backend, Number(contextDataOffset)); + return backend.computeKernel(Number(kernel), context, errors); + }, + // jsepCaptureBegin + () => backend.captureBegin(), + // jsepCaptureEnd + () => backend.captureEnd(), + // jsepReplay + () => backend.replay(), + ]); + } } else { const backend = new WebNNBackend(env); jsepInit('webnn', [ diff --git a/js/web/lib/wasm/session-options.ts b/js/web/lib/wasm/session-options.ts index 17e564247863d..89a4484e5a1c4 100644 --- a/js/web/lib/wasm/session-options.ts +++ b/js/web/lib/wasm/session-options.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. 
All rights reserved. // Licensed under the MIT License. -import { InferenceSession } from 'onnxruntime-common'; +import type { InferenceSession } from 'onnxruntime-common'; import { getInstance } from './wasm-factory'; import { allocWasmString, checkLastError, iterateExtraOptions } from './wasm-utils'; @@ -54,13 +54,28 @@ const appendDefaultOptions = (options: InferenceSession.SessionOptions): void => } }; -const setExecutionProviders = ( +const appendSessionConfig = (sessionOptionsHandle: number, key: string, value: string, allocs: number[]): void => { + const keyDataOffset = allocWasmString(key, allocs); + const valueDataOffset = allocWasmString(value, allocs); + if (getInstance()._OrtAddSessionConfigEntry(sessionOptionsHandle, keyDataOffset, valueDataOffset) !== 0) { + checkLastError(`Can't set a session config entry: ${key} - ${value}.`); + } +}; + +const appendEpOption = (epOptions: Array<[number, number]>, key: string, value: string, allocs: number[]): void => { + const keyDataOffset = allocWasmString(key, allocs); + const valueDataOffset = allocWasmString(value, allocs); + epOptions.push([keyDataOffset, valueDataOffset]); +}; + +const setExecutionProviders = async ( sessionOptionsHandle: number, executionProviders: readonly InferenceSession.ExecutionProviderConfig[], allocs: number[], -): void => { +): Promise => { for (const ep of executionProviders) { let epName = typeof ep === 'string' ? ep : ep.name; + const epOptions: Array<[number, number]> = []; // check EP name switch (epName) { @@ -71,26 +86,44 @@ const setExecutionProviders = ( // const context = (webnnOptions as InferenceSession.WebNNOptionsWithMLContext)?.context; const deviceType = (webnnOptions as InferenceSession.WebNNContextOptions)?.deviceType; if (deviceType) { - const keyDataOffset = allocWasmString('deviceType', allocs); - const valueDataOffset = allocWasmString(deviceType, allocs); - if (getInstance()._OrtAddSessionConfigEntry(sessionOptionsHandle, keyDataOffset, valueDataOffset) !== 0) { - checkLastError(`Can't set a session config entry: 'deviceType' - ${deviceType}.`); - } + appendSessionConfig(sessionOptionsHandle, 'deviceType', deviceType, allocs); } } break; case 'webgpu': - epName = 'JS'; - if (typeof ep !== 'string') { - const webgpuOptions = ep as InferenceSession.WebGpuExecutionProviderOption; - if (webgpuOptions?.preferredLayout) { - if (webgpuOptions.preferredLayout !== 'NCHW' && webgpuOptions.preferredLayout !== 'NHWC') { - throw new Error(`preferredLayout must be either 'NCHW' or 'NHWC': ${webgpuOptions.preferredLayout}`); + if (BUILD_DEFS.USE_WEBGPU_EP) { + epName = 'WebGPU'; + let customDevice: GPUDevice | undefined; + + if (typeof ep !== 'string') { + const customOptions = ep as unknown as { device: GPUDevice }; + if (customOptions.device) { + if (typeof GPUDevice !== 'undefined' && customOptions.device instanceof GPUDevice) { + customDevice = customOptions.device; + } else { + throw new Error('Invalid GPU device set in WebGPU EP options.'); + } } - const keyDataOffset = allocWasmString('preferredLayout', allocs); - const valueDataOffset = allocWasmString(webgpuOptions.preferredLayout, allocs); - if (getInstance()._OrtAddSessionConfigEntry(sessionOptionsHandle, keyDataOffset, valueDataOffset) !== 0) { - checkLastError(`Can't set a session config entry: 'preferredLayout' - ${webgpuOptions.preferredLayout}.`); + + // TODO: handle more options + } + + const info = getInstance().webgpuRegisterDevice!(customDevice); + if (info) { + const [deviceId, instanceHandle, deviceHandle] = info; + 
appendEpOption(epOptions, 'deviceId', deviceId.toString(), allocs); + appendEpOption(epOptions, 'webgpuInstance', instanceHandle.toString(), allocs); + appendEpOption(epOptions, 'webgpuDevice', deviceHandle.toString(), allocs); + } + } else { + epName = 'JS'; + if (typeof ep !== 'string') { + const webgpuOptions = ep as InferenceSession.WebGpuExecutionProviderOption; + if (webgpuOptions?.preferredLayout) { + if (webgpuOptions.preferredLayout !== 'NCHW' && webgpuOptions.preferredLayout !== 'NHWC') { + throw new Error(`preferredLayout must be either 'NCHW' or 'NHWC': ${webgpuOptions.preferredLayout}`); + } + appendSessionConfig(sessionOptionsHandle, 'preferredLayout', webgpuOptions.preferredLayout, allocs); } } } @@ -103,13 +136,34 @@ const setExecutionProviders = ( } const epNameDataOffset = allocWasmString(epName, allocs); - if (getInstance()._OrtAppendExecutionProvider(sessionOptionsHandle, epNameDataOffset) !== 0) { + const epOptionsCount = epOptions.length; + let keysOffset = 0; + let valuesOffset = 0; + if (epOptionsCount > 0) { + keysOffset = getInstance()._malloc(epOptionsCount * getInstance().PTR_SIZE); + allocs.push(keysOffset); + valuesOffset = getInstance()._malloc(epOptionsCount * getInstance().PTR_SIZE); + allocs.push(valuesOffset); + for (let i = 0; i < epOptionsCount; i++) { + getInstance().setValue(keysOffset + i * getInstance().PTR_SIZE, epOptions[i][0], '*'); + getInstance().setValue(valuesOffset + i * getInstance().PTR_SIZE, epOptions[i][1], '*'); + } + } + if ( + (await getInstance()._OrtAppendExecutionProvider( + sessionOptionsHandle, + epNameDataOffset, + keysOffset, + valuesOffset, + epOptionsCount, + )) !== 0 + ) { checkLastError(`Can't append execution provider: ${epName}.`); } } }; -export const setSessionOptions = (options?: InferenceSession.SessionOptions): [number, number[]] => { +export const setSessionOptions = async (options?: InferenceSession.SessionOptions): Promise<[number, number[]]> => { const wasm = getInstance(); let sessionOptionsHandle = 0; const allocs: number[] = []; @@ -155,20 +209,19 @@ export const setSessionOptions = (options?: InferenceSession.SessionOptions): [n } if (sessionOptions.executionProviders) { - setExecutionProviders(sessionOptionsHandle, sessionOptions.executionProviders, allocs); + await setExecutionProviders(sessionOptionsHandle, sessionOptions.executionProviders, allocs); } if (sessionOptions.enableGraphCapture !== undefined) { if (typeof sessionOptions.enableGraphCapture !== 'boolean') { throw new Error(`enableGraphCapture must be a boolean value: ${sessionOptions.enableGraphCapture}`); } - const keyDataOffset = allocWasmString('enableGraphCapture', allocs); - const valueDataOffset = allocWasmString(sessionOptions.enableGraphCapture.toString(), allocs); - if (wasm._OrtAddSessionConfigEntry(sessionOptionsHandle, keyDataOffset, valueDataOffset) !== 0) { - checkLastError( - `Can't set a session config entry: 'enableGraphCapture' - ${sessionOptions.enableGraphCapture}.`, - ); - } + appendSessionConfig( + sessionOptionsHandle, + 'enableGraphCapture', + sessionOptions.enableGraphCapture.toString(), + allocs, + ); } if (sessionOptions.freeDimensionOverrides) { @@ -188,12 +241,7 @@ export const setSessionOptions = (options?: InferenceSession.SessionOptions): [n if (sessionOptions.extra !== undefined) { iterateExtraOptions(sessionOptions.extra, '', new WeakSet>(), (key, value) => { - const keyDataOffset = allocWasmString(key, allocs); - const valueDataOffset = allocWasmString(value, allocs); - - if 
(wasm._OrtAddSessionConfigEntry(sessionOptionsHandle, keyDataOffset, valueDataOffset) !== 0) { - checkLastError(`Can't set a session config entry: ${key} - ${value}.`); - } + appendSessionConfig(sessionOptionsHandle, key, value, allocs); }); } diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts index 4bccfa76fdda3..dbcf80adf3552 100644 --- a/js/web/lib/wasm/wasm-core-impl.ts +++ b/js/web/lib/wasm/wasm-core-impl.ts @@ -102,11 +102,20 @@ export const initRuntime = async (env: Env): Promise => { * @param epName */ export const initEp = async (env: Env, epName: string): Promise => { + // initialize ASYNCIFY support + getInstance().asyncInit?.(); + + if (epName === 'webgpu' && BUILD_DEFS.USE_WEBGPU_EP) { + getInstance().webgpuInit!((device) => { + env.webgpu.device = device; + }); + } + if (!BUILD_DEFS.DISABLE_JSEP) { // eslint-disable-next-line @typescript-eslint/no-require-imports, @typescript-eslint/no-var-requires const initJsep = require('./jsep/init').init; - if (epName === 'webgpu') { + if (epName === 'webgpu' && !BUILD_DEFS.USE_WEBGPU_EP) { // perform WebGPU availability check if (typeof navigator === 'undefined' || !navigator.gpu) { throw new Error('WebGPU is not supported in current environment'); @@ -270,7 +279,7 @@ export const createSession = async ( const outputNamesUTF8Encoded = []; try { - [sessionOptionsHandle, allocs] = setSessionOptions(options); + [sessionOptionsHandle, allocs] = await setSessionOptions(options); if (options?.externalData && wasm.mountExternalData) { const loadingPromises = []; @@ -278,7 +287,7 @@ export const createSession = async ( const path = typeof file === 'string' ? file : file.path; loadingPromises.push( loadFile(typeof file === 'string' ? file : file.data).then((data) => { - wasm.mountExternalData!(path, data); + wasm.mountExternalData(path, data); }), ); } @@ -312,6 +321,7 @@ export const createSession = async ( } sessionHandle = await wasm._OrtCreateSession(modelDataOffset, modelDataLength, sessionOptionsHandle); + wasm.webgpuOnCreateSession?.(sessionHandle); if (sessionHandle === 0) { checkLastError("Can't create a session."); } @@ -444,6 +454,7 @@ export const releaseSession = (sessionId: number): void => { } wasm.jsepOnReleaseSession?.(sessionId); + wasm.webgpuOnReleaseSession?.(sessionId); inputNamesUTF8Encoded.forEach((buf) => wasm._OrtFree(buf)); outputNamesUTF8Encoded.forEach((buf) => wasm._OrtFree(buf)); @@ -491,11 +502,20 @@ export const prepareInputOutputTensor = async ( const gpuBuffer = tensor[2].gpuBuffer; dataByteLength = calculateTensorSizeInBytes(tensorDataTypeStringToEnum(dataType), dims)!; - const registerBuffer = wasm.jsepRegisterBuffer; - if (!registerBuffer) { - throw new Error('Tensor location "gpu-buffer" is not supported without using WebGPU.'); + if (BUILD_DEFS.USE_WEBGPU_EP) { + const registerBuffer = wasm.webgpuRegisterBuffer; + if (!registerBuffer) { + throw new Error('Tensor location "gpu-buffer" is not supported without using WebGPU.'); + } + + rawData = registerBuffer(gpuBuffer, sessionId); + } else { + const registerBuffer = wasm.jsepRegisterBuffer; + if (!registerBuffer) { + throw new Error('Tensor location "gpu-buffer" is not supported without using WebGPU.'); + } + rawData = registerBuffer(sessionId, index, gpuBuffer, dataByteLength); } - rawData = registerBuffer(sessionId, index, gpuBuffer, dataByteLength); } else if (location === 'ml-tensor') { const mlTensor = tensor[2].mlTensor as MLTensor; dataByteLength = calculateTensorSizeInBytes(tensorDataTypeStringToEnum(dataType), dims)!; 
@@ -791,7 +811,7 @@ export const run = async ( // If a certain output's preferred location is GPU but the tensor is empty, we still need to create a CPU // tensor for it. There is no mapping GPU buffer for an empty tensor. if (preferredLocation === 'gpu-buffer' && size > 0) { - const getBuffer = wasm.jsepGetBuffer; + const getBuffer = BUILD_DEFS.USE_WEBGPU_EP ? wasm.webgpuGetBuffer : wasm.jsepGetBuffer; if (!getBuffer) { throw new Error('preferredLocation "gpu-buffer" is not supported without using WebGPU.'); } @@ -804,20 +824,43 @@ export const run = async ( // do not release the tensor right now. it will be released when user calls tensor.dispose(). keepOutputTensor = true; - output.push([ - type, - dims, - { - gpuBuffer, - download: wasm.jsepCreateDownloader!(gpuBuffer, bufferSize, type), - dispose: () => { - if (wasm._OrtReleaseTensor(tensor) !== 0) { - checkLastError("Can't release tensor."); - } + if (BUILD_DEFS.USE_WEBGPU_EP) { + wasm.webgpuRegisterBuffer!(gpuBuffer, sessionId, dataOffset); + const downloadDataFunction = wasm.webgpuCreateDownloader!(gpuBuffer, bufferSize, sessionId); + output.push([ + type, + dims, + { + gpuBuffer, + download: async () => { + const arrayBuffer = await downloadDataFunction(); + const data = new (tensorTypeToTypedArrayConstructor(type!))(arrayBuffer); + return data as Tensor.DataTypeMap[Tensor.GpuBufferDataTypes]; + }, + dispose: () => { + if (wasm._OrtReleaseTensor(tensor) !== 0) { + checkLastError("Can't release tensor."); + } + }, }, - }, - 'gpu-buffer', - ]); + 'gpu-buffer', + ]); + } else { + output.push([ + type, + dims, + { + gpuBuffer, + download: wasm.jsepCreateDownloader!(gpuBuffer, bufferSize, type), + dispose: () => { + if (wasm._OrtReleaseTensor(tensor) !== 0) { + checkLastError("Can't release tensor."); + } + }, + }, + 'gpu-buffer', + ]); + } } else if (preferredLocation === 'ml-tensor' && size > 0) { const ensureTensor = wasm.jsepEnsureTensor; if (!ensureTensor) { @@ -887,6 +930,18 @@ export const run = async ( } finally { wasm.stackRestore(beforeRunStack); + if (BUILD_DEFS.USE_WEBGPU_EP) { + inputTensors.forEach((t) => { + if (t && t[3] === 'gpu-buffer') { + wasm.webgpuUnregisterBuffer!(t[2].gpuBuffer); + } + }); + outputTensors.forEach((t) => { + if (t && t[3] === 'gpu-buffer') { + wasm.webgpuUnregisterBuffer!(t[2].gpuBuffer); + } + }); + } inputTensorHandles.forEach((v) => wasm._OrtReleaseTensor(v)); outputTensorHandles.forEach((v) => wasm._OrtReleaseTensor(v)); inputOutputAllocs.forEach((p) => wasm._free(p)); diff --git a/js/web/lib/wasm/wasm-types.ts b/js/web/lib/wasm/wasm-types.ts index b4871e145f4d7..9b2ec71fd351d 100644 --- a/js/web/lib/wasm/wasm-types.ts +++ b/js/web/lib/wasm/wasm-types.ts @@ -41,18 +41,6 @@ export declare namespace JSEP { type DownloadTensorFunction = (tensorId: number, dstBuffer: ArrayBufferView | ArrayBuffer) => Promise; export interface Module extends WebGpuModule, WebNnModule { - /** - * Mount the external data file to an internal map, which will be used during session initialization. - * - * @param externalDataFilePath - specify the relative path of the external data file. - * @param externalDataFileData - specify the content data. - */ - mountExternalData(externalDataFilePath: string, externalDataFileData: Uint8Array): void; - /** - * Unmount all external data files from the internal map. - */ - unmountExternalData(): void; - /** * This is the entry of JSEP initialization. This function is called once when initializing ONNX Runtime per * backend. This function initializes Asyncify support. 
If name is 'webgpu', also initializes WebGPU backend and @@ -294,6 +282,21 @@ export declare namespace JSEP { } } +export declare namespace WebGpu { + export interface Module { + webgpuInit(setDefaultDevice: (device: GPUDevice) => void): void; + webgpuRegisterDevice( + device?: GPUDevice, + ): undefined | [deviceId: number, instanceHandle: number, deviceHandle: number]; + webgpuOnCreateSession(sessionHandle: number): void; + webgpuOnReleaseSession(sessionHandle: number): void; + webgpuRegisterBuffer(buffer: GPUBuffer, sessionHandle: number, bufferHandle?: number): number; + webgpuUnregisterBuffer(buffer: GPUBuffer): void; + webgpuGetBuffer(bufferHandle: number): GPUBuffer; + webgpuCreateDownloader(gpuBuffer: GPUBuffer, size: number, sessionHandle: number): () => Promise; + } +} + export interface OrtInferenceAPIs { _OrtInit(numThreads: number, loggingLevel: number): number; @@ -358,7 +361,13 @@ export interface OrtInferenceAPIs { logVerbosityLevel: number, optimizedModelFilePath: number, ): number; - _OrtAppendExecutionProvider(sessionOptionsHandle: number, name: number): number; + _OrtAppendExecutionProvider( + sessionOptionsHandle: number, + name: number, + providerOptionsKeys: number, + providerOptionsValues: number, + numKeys: number, + ): Promise; _OrtAddFreeDimensionOverride(sessionOptionsHandle: number, name: number, dim: number): number; _OrtAddSessionConfigEntry(sessionOptionsHandle: number, configKey: number, configValue: number): number; _OrtReleaseSessionOptions(sessionOptionsHandle: number): number; @@ -373,8 +382,11 @@ export interface OrtInferenceAPIs { /** * The interface of the WebAssembly module for ONNX Runtime, compiled from C++ source code by Emscripten. */ -export interface OrtWasmModule extends EmscriptenModule, OrtInferenceAPIs, Partial { - PTR_SIZE: number; +export interface OrtWasmModule + extends EmscriptenModule, + OrtInferenceAPIs, + Partial, + Partial { // #region emscripten functions stackSave(): number; stackRestore(stack: number): void; @@ -387,7 +399,31 @@ export interface OrtWasmModule extends EmscriptenModule, OrtInferenceAPIs, Parti stringToUTF8(str: string, offset: number, maxBytes: number): void; // #endregion + // #region ORT shared + + readonly PTR_SIZE: 4 | 8; + + /** + * Mount the external data file to an internal map, which will be used during session initialization. + * + * @param externalDataFilePath - specify the relative path of the external data file. + * @param externalDataFileData - specify the content data. + */ + mountExternalData(externalDataFilePath: string, externalDataFileData: Uint8Array): void; + /** + * Unmount all external data files from the internal map. + */ + unmountExternalData(): void; + + /** + * This function patches the WebAssembly module to support Asyncify. This function should be called at least once + * before any ORT API is called. + */ + asyncInit?(): void; + + // #endregion + // #region config - numThreads?: number; + readonly numThreads?: number; // #endregion } diff --git a/js/web/script/build.ts b/js/web/script/build.ts index 6006de62b41b6..fd9224a2dcf8b 100644 --- a/js/web/script/build.ts +++ b/js/web/script/build.ts @@ -27,7 +27,8 @@ const args = minimist(process.argv.slice(2)); * --bundle-mode=node * Build a single ort-web bundle for nodejs. 
*/ -const BUNDLE_MODE: 'prod' | 'dev' | 'perf' | 'node' = args['bundle-mode'] || 'prod'; +const BUNDLE_MODE: 'prod' | 'dev' | 'perf' | 'node' = + process.env.npm_config_bundle_mode || args['bundle-mode'] || 'prod'; /** * --debug @@ -41,7 +42,18 @@ const BUNDLE_MODE: 'prod' | 'dev' | 'perf' | 'node' = args['bundle-mode'] || 'pr * Enable debug mode. In this mode, esbuild metafile feature will be enabled. Full bundle analysis will be saved to a * file as JSON. */ -const DEBUG = args.debug; // boolean|'verbose'|'save' +const DEBUG = process.env.npm_config_debug || args.debug; // boolean|'verbose'|'save' + +/** + * --webgpu-ep + * --no-webgpu-ep (default) + * + * Enable or disable the use of WebGPU EP. If enabled, the WebGPU EP will be used. If disabled, the WebGPU backend will + * be used with JSEP. + * + * (temporary) This flag is used to test the WebGPU EP integration. It will be removed in the future. + */ +const USE_WEBGPU_EP = process.env.npm_config_webgpu_ep ?? args['webgpu-ep'] ?? true; /** * Root folder of the source code: `/js/` @@ -57,6 +69,7 @@ const DEFAULT_DEFINE = { 'BUILD_DEFS.DISABLE_WASM': 'false', 'BUILD_DEFS.DISABLE_WASM_PROXY': 'false', 'BUILD_DEFS.ENABLE_BUNDLE_WASM_JS': 'false', + 'BUILD_DEFS.USE_WEBGPU_EP': JSON.stringify(!!USE_WEBGPU_EP), 'BUILD_DEFS.IS_ESM': 'false', 'BUILD_DEFS.ESM_IMPORT_META_URL': 'undefined', diff --git a/onnxruntime/core/framework/external_data_loader.cc b/onnxruntime/core/framework/external_data_loader.cc index fe73a55735631..c577805e69cc4 100644 --- a/onnxruntime/core/framework/external_data_loader.cc +++ b/onnxruntime/core/framework/external_data_loader.cc @@ -60,7 +60,12 @@ common::Status LoadWebAssemblyExternalData(const Env& env, break; case 1: // Load external data to GPU. - Module.jsepUploadExternalBuffer(dataIdOrBuffer, data); + // TODO: use a unified interface for upload external buffer. + if (Module.webgpuUploadExternalBuffer) { + Module.webgpuUploadExternalBuffer(dataIdOrBuffer, data); + } else { + Module.jsepUploadExternalBuffer(dataIdOrBuffer, data); + } break; default: return 4; // Unknown error occurred in memory copy. diff --git a/onnxruntime/core/framework/external_data_loader.h b/onnxruntime/core/framework/external_data_loader.h index 117da7d0a4afa..90d48ca800797 100644 --- a/onnxruntime/core/framework/external_data_loader.h +++ b/onnxruntime/core/framework/external_data_loader.h @@ -42,7 +42,7 @@ class IExternalDataLoader { enum class ExternalDataLoadType { CPU = 0, -#if defined(USE_JSEP) +#if defined(USE_JSEP) || defined(USE_WEBGPU) WEBGPU_BUFFER = 1, #endif }; diff --git a/onnxruntime/core/providers/webgpu/external_data_loader.cc b/onnxruntime/core/providers/webgpu/external_data_loader.cc new file mode 100644 index 0000000000000..6da9598b146f5 --- /dev/null +++ b/onnxruntime/core/providers/webgpu/external_data_loader.cc @@ -0,0 +1,40 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#if defined(__wasm__) + +#include + +#include "core/framework/tensor.h" +#include "core/providers/webgpu/external_data_loader.h" + +namespace onnxruntime { +namespace webgpu { + +bool ExternalDataLoader::CanLoad(const OrtMemoryInfo& target_memory_info) const { + return target_memory_info.device.Type() == OrtDevice::CPU || + (target_memory_info.device.Type() == OrtDevice::GPU && target_memory_info.name == WEBGPU_BUFFER); +} + +common::Status ExternalDataLoader::LoadTensor(const Env& env, + const std::filesystem::path& data_file_path, + FileOffsetType data_offset, + SafeInt data_length, + Tensor& tensor) const { + ExternalDataLoadType load_type; + if (tensor.Location().device.Type() == OrtDevice::CPU) { + load_type = ExternalDataLoadType::CPU; + } else if (tensor.Location().device.Type() == OrtDevice::GPU && + tensor.Location().name == WEBGPU_BUFFER) { + load_type = ExternalDataLoadType::WEBGPU_BUFFER; + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported tensor location: ", tensor.Location().ToString()); + } + + return LoadWebAssemblyExternalData(env, data_file_path, data_offset, data_length, load_type, tensor.MutableDataRaw()); +} + +} // namespace webgpu +} // namespace onnxruntime + +#endif diff --git a/onnxruntime/core/providers/webgpu/external_data_loader.h b/onnxruntime/core/providers/webgpu/external_data_loader.h new file mode 100644 index 0000000000000..7ced4e930bf7a --- /dev/null +++ b/onnxruntime/core/providers/webgpu/external_data_loader.h @@ -0,0 +1,30 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#if defined(__wasm__) + +#include "core/framework/external_data_loader.h" + +namespace onnxruntime { +namespace webgpu { + +class ExternalDataLoader : public IExternalDataLoader { + public: + ExternalDataLoader() {}; + ~ExternalDataLoader() {}; + + bool CanLoad(const OrtMemoryInfo& target_memory_info) const override; + + common::Status LoadTensor(const Env& env, + const std::filesystem::path& data_file_path, + FileOffsetType data_offset, + SafeInt data_length, + Tensor& tensor) const override; +}; + +} // namespace webgpu +} // namespace onnxruntime + +#endif diff --git a/onnxruntime/core/providers/webgpu/program.cc b/onnxruntime/core/providers/webgpu/program.cc index d1d4c242c4697..976b7927ac3dd 100644 --- a/onnxruntime/core/providers/webgpu/program.cc +++ b/onnxruntime/core/providers/webgpu/program.cc @@ -206,6 +206,26 @@ ProgramVariableDataType ToProgramVariableDataType(int32_t element_type, int comp } } +std::ostream& operator<<(std::ostream& os, ValidationMode mode) { + switch (mode) { + case ValidationMode::Disabled: + os << "Disabled"; + break; + case ValidationMode::WGPUOnly: + os << "WGPUOnly"; + break; + case ValidationMode::Basic: + os << "Basic"; + break; + case ValidationMode::Full: + os << "Full"; + break; + default: + os << "Unknown(" << static_cast(mode) << ")"; + } + return os; +} + namespace { TensorShape GetReducedShape(const TensorShape& shape, int component /* > 1 */) { ORT_ENFORCE(shape.NumDimensions() > 0 && shape.GetDims()[shape.NumDimensions() - 1] % component == 0, diff --git a/onnxruntime/core/providers/webgpu/program.h b/onnxruntime/core/providers/webgpu/program.h index 7bfd9e8800099..95fef36144025 100644 --- a/onnxruntime/core/providers/webgpu/program.h +++ b/onnxruntime/core/providers/webgpu/program.h @@ -237,6 +237,7 @@ enum class ValidationMode { Basic, Full }; +std::ostream& operator<<(std::ostream& os, ValidationMode mode); namespace details { class 
ProgramWrapper; diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc index 163dd691b7f16..e8e93a9cb6a8f 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc @@ -134,6 +134,8 @@ void WebGpuContext::Initialize(const WebGpuBufferCacheConfig& buffer_cache_confi ORT_ENFORCE(device_ != nullptr, "Failed to get a WebGPU device."); } + LOGS_DEFAULT(VERBOSE) << "WebGPU EP Context is created for: Instance=" << instance_.Get() << ", Device=" << device_.Get() << "."; + // cache adapter info ORT_ENFORCE(Device().GetAdapterInfo(&adapter_info_)); // cache device limits @@ -708,45 +710,46 @@ WebGpuContext& WebGpuContextFactory::CreateContext(const WebGpuContextConfig& co WGPUInstance instance = config.instance; WGPUDevice device = config.device; - if (context_id == 0) { - // context ID is preserved for the default context. User cannot use context ID 0 as a custom context. - ORT_ENFORCE(instance == nullptr && device == nullptr, - "WebGPU EP default context (contextId=0) must not have custom WebGPU instance or device."); - - std::call_once(init_default_flag_, [ + std::call_once(init_default_flag_, [ #if !defined(__wasm__) - dawn_proc_table = config.dawn_proc_table + dawn_proc_table = config.dawn_proc_table #endif - ]() { - // Step.1 - setup dawn proc table (only for non-WASM build) + ]() { + // Step.1 - setup dawn proc table (only for non-WASM build) #if !defined(__wasm__) - const DawnProcTable* dawn_procs = reinterpret_cast(dawn_proc_table); + const DawnProcTable* dawn_procs = reinterpret_cast(dawn_proc_table); #if defined(BUILD_DAWN_MONOLITHIC_LIBRARY) - ORT_ENFORCE(dawn_procs == nullptr, "setting DawnProcTable is not allowed when dynamically linked to webgpu_dawn."); + ORT_ENFORCE(dawn_procs == nullptr, "setting DawnProcTable is not allowed when dynamically linked to webgpu_dawn."); #else #if !defined(USE_EXTERNAL_DAWN) - if (dawn_procs == nullptr) { - dawn_procs = &dawn::native::GetProcs(); - } + if (dawn_procs == nullptr) { + dawn_procs = &dawn::native::GetProcs(); + } #else - ORT_ENFORCE(dawn_procs != nullptr, "DawnProcTable must be provided."); + ORT_ENFORCE(dawn_procs != nullptr, "DawnProcTable must be provided."); #endif - dawnProcSetProcs(dawn_procs); + dawnProcSetProcs(dawn_procs); #endif #endif - // Step.2 - Create wgpu::Instance + // Step.2 - Create wgpu::Instance #if !defined(__wasm__) - wgpu::InstanceDescriptor instance_desc{}; - instance_desc.capabilities.timedWaitAnyEnable = true; - default_instance_ = wgpu::CreateInstance(&instance_desc); + wgpu::InstanceDescriptor instance_desc{}; + instance_desc.capabilities.timedWaitAnyEnable = true; + default_instance_ = wgpu::CreateInstance(&instance_desc); #else - default_instance_ = wgpu::CreateInstance(nullptr); + default_instance_ = wgpu::CreateInstance(nullptr); #endif - ORT_ENFORCE(default_instance_ != nullptr, "Failed to create wgpu::Instance."); - }); + ORT_ENFORCE(default_instance_ != nullptr, "Failed to create wgpu::Instance."); + }); + + if (context_id == 0) { + // context ID is preserved for the default context. User cannot use context ID 0 as a custom context. + ORT_ENFORCE(instance == nullptr && device == nullptr, + "WebGPU EP default context (contextId=0) must not have custom WebGPU instance or device."); + instance = default_instance_.Get(); } else { // for context ID > 0, user must provide custom WebGPU instance and device. 
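The wasm external-data path above reduces to a simple contract on the JS side: the loader hands the glue code a GPU buffer and the raw bytes from the external data file, and the glue copies the bytes into that buffer. Below is a minimal sketch of that upload step using a direct queue write; the actual glue added later in this patch (post-webgpu.js) instead stages through a mapped upload buffer padded to a 16-byte multiple, and `uploadExternalWeights` is an illustrative name, not part of the patch.

```ts
// Minimal sketch, assuming the destination buffer was created with COPY_DST usage.
function uploadExternalWeights(device: GPUDevice, gpuBuffer: GPUBuffer, data: Uint8Array): void {
  // writeBuffer requires a 4-byte-aligned size, so copy into a padded staging array first.
  const padded = new Uint8Array(Math.ceil(data.byteLength / 4) * 4);
  padded.set(data);
  device.queue.writeBuffer(gpuBuffer, 0, padded);
}
```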
@@ -800,5 +803,9 @@ void CleanupWebGpuContexts() { WebGpuContextFactory::Cleanup(); } +WGPUDevice GetDevice(int context_id) { + return WebGpuContextFactory::GetContext(context_id).Device().Get(); +} + } // namespace webgpu } // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index 87383fe197477..b5a663fb7c455 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -23,6 +23,7 @@ #include "core/providers/webgpu/webgpu_context.h" #include "core/providers/webgpu/data_transfer.h" +#include "core/providers/webgpu/external_data_loader.h" #include "core/providers/webgpu/webgpu_profiler.h" namespace onnxruntime { @@ -821,6 +822,12 @@ std::unique_ptr WebGpuExecutionProvider::GetDataTran return std::make_unique(context_); } +#if defined(__wasm__) +std::unique_ptr WebGpuExecutionProvider::GetExternalDataLoader() const { + return std::make_unique(); +} +#endif + WebGpuExecutionProvider::~WebGpuExecutionProvider() { WebGpuContextFactory::ReleaseContext(context_id_); } diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h index 7a0ade97aa3df..dc25636821651 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h @@ -49,6 +49,9 @@ class WebGpuExecutionProvider : public IExecutionProvider { std::shared_ptr GetKernelRegistry() const override; std::unique_ptr GetDataTransfer() const override; +#if defined(__wasm__) + std::unique_ptr GetExternalDataLoader() const override; +#endif DataLayout GetPreferredLayout() const override { return preferred_data_layout_; } diff --git a/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc index 60c61b2ca5665..1d779152f91f3 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc @@ -151,6 +151,12 @@ std::shared_ptr WebGpuProviderFactoryCreator::Create( validation_mode, }; + LOGS_DEFAULT(VERBOSE) << "WebGPU EP Device ID: " << context_id; + LOGS_DEFAULT(VERBOSE) << "WebGPU EP WGPUInstance: " << webgpu_instance; + LOGS_DEFAULT(VERBOSE) << "WebGPU EP WGPUDevice: " << webgpu_device; + LOGS_DEFAULT(VERBOSE) << "WebGPU EP DawnProcTable: " << dawn_proc_table; + LOGS_DEFAULT(VERBOSE) << "WebGPU EP ValidationMode: " << validation_mode; + // // STEP.3 - prepare parameters for WebGPU context initialization. 
// diff --git a/onnxruntime/wasm/api.cc b/onnxruntime/wasm/api.cc index 7adfc6a2b2ccb..1ad35b51bb1c1 100644 --- a/onnxruntime/wasm/api.cc +++ b/onnxruntime/wasm/api.cc @@ -8,6 +8,14 @@ #include "core/session/onnxruntime_cxx_api.h" #include "api.h" +#ifdef USE_WEBGPU +namespace onnxruntime { +namespace webgpu { +WGPUDevice GetDevice(int); +} +} // namespace onnxruntime +#endif + #include #include #include @@ -164,8 +172,12 @@ OrtSessionOptions* OrtCreateSessionOptions(size_t graph_optimization_level, return UNREGISTER_AUTO_RELEASE(session_options); } -int OrtAppendExecutionProvider(ort_session_options_handle_t session_options, const char* name) { - return CHECK_STATUS(SessionOptionsAppendExecutionProvider, session_options, name, nullptr, nullptr, 0); +int OrtAppendExecutionProvider(ort_session_options_handle_t session_options, + const char* name, + const char* const* provider_options_keys, + const char* const* provider_options_values, + size_t num_keys) { + return CHECK_STATUS(SessionOptionsAppendExecutionProvider, session_options, name, provider_options_keys, provider_options_values, num_keys); } int OrtAddFreeDimensionOverride(ort_session_options_handle_t session_options, @@ -507,6 +519,16 @@ char* OrtEndProfiling(ort_session_handle_t session) { : nullptr; } +// WebGPU API Section + +#ifdef USE_WEBGPU + +WGPUDevice OrtGetWebGpuDevice(int device_id) { + return onnxruntime::webgpu::GetDevice(device_id); +} + +#endif + // Training API Section #ifdef ENABLE_TRAINING_APIS diff --git a/onnxruntime/wasm/api.h b/onnxruntime/wasm/api.h index f44c515d98f6b..9ff1eb55ecedc 100644 --- a/onnxruntime/wasm/api.h +++ b/onnxruntime/wasm/api.h @@ -10,6 +10,10 @@ #include +#ifdef USE_WEBGPU +#include +#endif + #include struct OrtSession; @@ -85,7 +89,10 @@ ort_session_options_handle_t EMSCRIPTEN_KEEPALIVE OrtCreateSessionOptions(size_t * @returns ORT error code. If not zero, call OrtGetLastError() to get detailed error message. */ int EMSCRIPTEN_KEEPALIVE OrtAppendExecutionProvider(ort_session_options_handle_t session_options, - const char* name); + const char* name, + const char* const* provider_options_keys, + const char* const* provider_options_values, + size_t num_keys); /** * add a free dimension override for one dimension of a session's input. @@ -294,6 +301,21 @@ int EMSCRIPTEN_KEEPALIVE OrtRun(ort_session_handle_t session, */ char* EMSCRIPTEN_KEEPALIVE OrtEndProfiling(ort_session_handle_t session); +// WebGPU API Section + +#ifdef USE_WEBGPU + +/** + * get the GPU Device by device ID. + * + * This function is only available after the GPU Device is initialized in WebGpuContextFactory. + * + * @returns a WGPUDevice handle. + */ +WGPUDevice EMSCRIPTEN_KEEPALIVE OrtGetWebGpuDevice(int device_id); + +#endif + // Training API Section #ifdef ENABLE_TRAINING_APIS diff --git a/onnxruntime/wasm/js_post_js.js b/onnxruntime/wasm/js_post_js.js index b77d82fbd7d10..be5a4d3c7415a 100644 --- a/onnxruntime/wasm/js_post_js.js +++ b/onnxruntime/wasm/js_post_js.js @@ -2,6 +2,6 @@ // Licensed under the MIT License. -'use strict'; +"use strict"; Module["PTR_SIZE"] = 4; diff --git a/onnxruntime/wasm/js_post_js_64.js b/onnxruntime/wasm/js_post_js_64.js index b140df927ebbd..b16383b746b8a 100644 --- a/onnxruntime/wasm/js_post_js_64.js +++ b/onnxruntime/wasm/js_post_js_64.js @@ -2,6 +2,6 @@ // Licensed under the MIT License. 
-'use strict'; +"use strict"; Module["PTR_SIZE"] = 8; diff --git a/onnxruntime/wasm/post-webgpu.js b/onnxruntime/wasm/post-webgpu.js new file mode 100644 index 0000000000000..e7631a97c34c6 --- /dev/null +++ b/onnxruntime/wasm/post-webgpu.js @@ -0,0 +1,263 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +"use strict"; + +// +// This file contains the post-run code for the ORT WebAssembly module. The code in this file will be injected into the +// final module using Emscripten's `--post-js` option. +// +// This file will only be used in build with flag `--use_webgpu`. + +/** + * This function is called only once when initializing the WebGPU backend. + * + * @param {(gpuDevice: GPUDevice) => void} setDefaultDevice A callback function to set the default device. + */ +Module["webgpuInit"] = (setDefaultDevice) => { + /** + * a map from GPUDevice to [deviceId, instanceHandle, deviceHandle] + * + * only stores custom devices (ie. devices created by the user, not the default device created by ORT) + * + * key is the GPUDevice object. + * + * value is a tuple of 3 elements: + * - deviceId: a unique ID for the device. Must be positive integer. + * - instanceHandle: the instance handle(pointer) of the device. + * - deviceHandle: the device handle(pointer) of the device. + * + * @type {WeakMap} + */ + const webgpuActiveDevices = new WeakMap(); + /** + * a number that is used to assign a unique ID to the next custom device. + */ + let webgpuNextDeviceId = 1; + /** + * a function to set the default device. + * + * @type {(gpuDevice: GPUDevice) => void} + */ + const webgpuSetDefaultDevice = setDefaultDevice; + /** + * the current device that is being used to create a WebGPU EP inference session. + * + * the value of this variable is only valid during the creation of a WebGPU EP inference session. + * + * @type {GPUDevice|undefined} + */ + let webgpuCurrentDevice = undefined; + /** + * the current device ID that is being used to create a WebGPU EP inference session. + * + * the value of this variable is only valid during the creation of a WebGPU EP inference session. + * + * @type {number|undefined} + */ + let webgpuCurrentDeviceId = undefined; + + /** + * This function is called only when a custom device is used, during preparation of session options. + * + * @param {GPUDevice} device the user provided device object. + * @returns {undefined|[number, number, number]} a tuple of device id, instance handle, and device handle. + */ + Module["webgpuRegisterDevice"] = (device) => { + if (webgpuCurrentDeviceId !== undefined) { + throw new Error("another WebGPU EP inference session is being created."); + } + + if (device) { + let deviceInfo = webgpuActiveDevices.get(device); + if (!deviceInfo) { + const instanceHandle = _wgpuCreateInstance(0); + const deviceHandle = WebGPU.importJsDevice(device, instanceHandle); + deviceInfo = [webgpuNextDeviceId++, instanceHandle, deviceHandle]; + webgpuActiveDevices.set(device, deviceInfo); + } + + // The current device ID is a temporary storage for the device ID to be used in the session that is being created. + // + // Soon after `webgpuRegisterDevice` (this function) is called, `webgpuOnCreateSession` will be called so that the + // value of `webgpuCurrentDeviceId` is used and reset then. 
+ webgpuCurrentDevice = device; + webgpuCurrentDeviceId = deviceInfo[0]; + return deviceInfo; + } else { + webgpuCurrentDevice = undefined; + webgpuCurrentDeviceId = 0; + return undefined; + } + }; + + const webgpuActiveSessions = new Map(); + Module["webgpuOnCreateSession"] = (sessionHandle) => { + if (webgpuCurrentDeviceId === undefined) { + // do nothing if webgpuCurrentDeviceId is undefined. + // this means no WebGPU EP is being created. + return; + } + + const deviceId = webgpuCurrentDeviceId; + webgpuCurrentDeviceId = undefined; + + if (sessionHandle) { + // when session created successfully + const deviceHandle = _OrtGetWebGpuDevice(deviceId); + webgpuActiveSessions.set(sessionHandle, deviceHandle); + + if (deviceId === 0) { + const device = webgpuCurrentDevice ?? WebGPU.getJsObject(deviceHandle); + webgpuSetDefaultDevice(device); + } + } + webgpuCurrentDevice = undefined; + }; + + Module["webgpuOnReleaseSession"] = (sessionHandle) => { + webgpuActiveSessions.delete(sessionHandle); + }; + + const gpuBufferMetadataSymbol = Symbol("gpuBufferMetadata"); + + Module["webgpuRegisterBuffer"] = (buffer, sessionHandle, bufferHandle) => { + const metadata = buffer[gpuBufferMetadataSymbol]; + if (bufferHandle) { + // This is a buffer that was created by ORT. Metadata is [bufferHandle, NaN] + + buffer[gpuBufferMetadataSymbol] = [bufferHandle, NaN]; + return bufferHandle; + } else { + // This is a buffer that was created by the user. Metadata is [bufferHandle, refCount] + + if (metadata) { + metadata[1]++; + return metadata[0]; + } + + const deviceHandle = webgpuActiveSessions.get(sessionHandle); + if (deviceHandle === undefined) { + throw new Error( + "Invalid session handle passed to webgpuRegisterBuffer" + ); + } + + const bufferHandle = WebGPU.importJsBuffer(buffer, deviceHandle); + buffer[gpuBufferMetadataSymbol] = [bufferHandle, 1]; + return bufferHandle; + } + }; + + Module["webgpuUnregisterBuffer"] = (buffer) => { + const metadata = buffer[gpuBufferMetadataSymbol]; + if (!metadata) { + throw new Error("Buffer is not registered"); + } + metadata[1]--; + // For buffers created by ORT, metadata[1] will always be NaN. This function will not release the buffer. + // Instead, the buffer will be released when user calls `Tensor.dispose()` in JavaScript. + if (metadata[1] === 0) { + _wgpuBufferRelease(metadata[0]); + delete buffer[gpuBufferMetadataSymbol]; + } + }; + + Module["webgpuGetBuffer"] = (bufferHandle) => { + return WebGPU.getJsObject(bufferHandle); + }; + + Module["webgpuCreateDownloader"] = (gpuBuffer, bufferSize, sessionHandle) => { + const deviceHandle = webgpuActiveSessions.get(sessionHandle); + if (deviceHandle === undefined) { + throw new Error("Invalid session handle passed to webgpuRegisterBuffer"); + } + + const buffer = gpuBuffer; + const device = WebGPU.getJsObject(deviceHandle); + const originalSize = bufferSize; + const size = Math.ceil(Number(originalSize) / 16) * 16; + + return async () => { + // prettier-ignore + // + // the line above is used to force prettier to skip formatting the next statement. + // this is because prettier will remove the quotes around the property names, but we need to keep them + // because otherwise closure compiler may rename them and break the code. 
+ const gpuReadBufferDescriptor = { + "size": size, + "usage": 9 /* GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ */, + }; + const gpuReadBuffer = device.createBuffer(gpuReadBufferDescriptor); + try { + const commandEncoder = device.createCommandEncoder(); + commandEncoder.copyBufferToBuffer( + buffer /* source buffer */, + 0 /* source offset */, + gpuReadBuffer /* destination buffer */, + 0 /* destination offset */, + size /* size */ + ); + device.queue.submit([commandEncoder.finish()]); + + await gpuReadBuffer.mapAsync(GPUMapMode.READ); + + const arrayBuffer = gpuReadBuffer.getMappedRange(); + return arrayBuffer.slice(0, originalSize); + } finally { + gpuReadBuffer.destroy(); + } + }; + }; + + // Setup a callback function for loading external buffers (model weights). + Module.webgpuUploadExternalBuffer = (bufferHandle, data) => { + const srcArrayBuffer = data.buffer; + const srcOffset = data.byteOffset; + const srcLength = data.byteLength; + const size = Math.ceil(Number(srcLength) / 16) * 16; + + const gpuBuffer = WebGPU.getJsObject(bufferHandle); + + // get current device + if (!webgpuCurrentDevice) { + const deviceHandle = _OrtGetWebGpuDevice(webgpuCurrentDeviceId); + webgpuCurrentDevice = WebGPU.getJsObject(deviceHandle); + } + + // create gpu buffer + + // prettier-ignore + // + // the line above is used to force prettier to skip formatting the next statement. + // this is because prettier will remove the quotes around the property names, but we need to keep them + // because otherwise closure compiler may rename them and break the code. + const gpuBufferForUploadingDescriptor = { + "mappedAtCreation": true, + "size": size, + "usage": 6 /* GPUBufferUsage.MAP_WRITE | GPUBufferUsage.COPY_SRC */, + }; + const gpuBufferForUploading = webgpuCurrentDevice.createBuffer( + gpuBufferForUploadingDescriptor + ); + + // copy (upload) data + const arrayBuffer = gpuBufferForUploading.getMappedRange(); + new Uint8Array(arrayBuffer).set( + new Uint8Array(srcArrayBuffer, srcOffset, srcLength) + ); + gpuBufferForUploading.unmap(); + + // GPU copy + const commandEncoder = webgpuCurrentDevice.createCommandEncoder(); + commandEncoder.copyBufferToBuffer( + gpuBufferForUploading, + 0, + gpuBuffer, + 0, + size + ); + webgpuCurrentDevice.queue.submit([commandEncoder.finish()]); + gpuBufferForUploading.destroy(); + }; +}; diff --git a/onnxruntime/wasm/pre-async.js b/onnxruntime/wasm/pre-async.js new file mode 100644 index 0000000000000..a1e66d854d296 --- /dev/null +++ b/onnxruntime/wasm/pre-async.js @@ -0,0 +1,142 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +"use strict"; + +// +// This file contains the pre-run code for the ORT WebAssembly module. The code in this file will be injected into the +// final module using Emscripten's `--pre-js` option. +// +// This file will only be used in build with flag `-s ASYNCIFY=1`. + +/** + * initialize for asyncify support. + */ +let initAsyncImpl = () => { + // This is a simplified version of cwrap() with options.async === true (-sASYNCIFY=1) + // It removes some overhead in cwarp() and ccall() that we don't need. + // + // Currently in ASYNCIFY build, we only use this for the following functions: + // - OrtCreateSession() + // - OrtRun() + // - OrtRunWithBinding() + // - OrtBindInput() + // + // Note: about parameters "getFunc" and "setFunc": + // - Emscripten has different behaviors for Debug and Release builds for generating exported function wrapper. 
+ // + // - In Debug build, it will generate a wrapper function for each exported function. For example, it generates a + // wrapper for OrtRun() like this (minified): + // ``` + // var _OrtRun = Module["_OrtRun"] = createExportWrapper("OrtRun"); + // ``` + // + // - In Release build, it will generate a lazy loading wrapper for each exported function. For example, it generates + // a wrapper for OrtRun() like this (minified): + // ``` + // d._OrtRun = (a, b, c, e, f, h, l, q) => (d._OrtRun = J.ka)(a, b, c, e, f, h, l, q); + // ``` + // + // The behaviors of these two wrappers are different. The debug build will assign `Module["_OrtRun"]` only once + // because `createExportWrapper()` does not reset `Module["_OrtRun"]` inside. The release build, however, will + // reset d._OrtRun to J.ka the first time it is called. + // + // The difference is important because we need to design the async wrapper in a way that it can handle both cases. + // + // Now, let's look at how the async wrapper is designed to work for both cases: + // + // - Debug build: + // 1. When the WebAssembly module is being loaded, `Module["_OrtRun"]` is assigned to `createExportWrapper("OrtRun")`. + // 2. The first time `Module["asyncInit"]` is called, `Module["_OrtRun"]` is re-assigned to a new async + // wrapper function. + // Value of `Module["_OrtRun"]` will not be changed again. + // + // - Release build: + // 1. When the WebAssembly module is being loaded, `Module["_OrtRun"]` is assigned to a lazy loading wrapper function. + // 2. The first time `Module["asyncInit"]` is called, `Module["_OrtRun"]` is re-assigned to a new async + // wrapper function. + // 3. The first time `Module["_OrtRun"]` is called, the async wrapper will be called. It will call into this + // function: + // ``` + // (a, b, c, e, f, h, l, q) => (d._OrtRun = J.ka)(a, b, c, e, f, h, l, q); + // ``` + // This function will assign d._OrtRun (i.e. the minimized `Module["_OrtRun"]`) to the real function (J.ka). + // 4. Since d._OrtRun is re-assigned, we need to update the async wrapper to re-assign its stored + // function to the updated value (J.ka), and re-assign the value of `d._OrtRun` back to the async wrapper. + // Value of `Module["_OrtRun"]` will not be changed again. + // + // The value of `Module["_OrtRun"]` will need to be assigned 2 times for the debug build and 4 times for the release + // build. + // + // This is why we need the `getFunc` and `setFunc` parameters. They are used to get the current value of an + // exported function and set the new value of an exported function. + // + const wrapAsync = (func, getFunc, setFunc) => { + return (...args) => { + // cache the async data before calling the function. + const previousAsync = Asyncify.currData; + + const previousFunc = getFunc?.(); + const ret = func(...args); + const newFunc = getFunc?.(); + if (previousFunc !== newFunc) { + // The exported function has been updated. + // Set the sync function reference to the new function. + func = newFunc; + // Set the exported function back to the async wrapper. + setFunc(previousFunc); + // Remove getFunc and setFunc. They are no longer needed. + setFunc = null; + getFunc = null; + } + + // If the async data has been changed, it means that the function started an async operation. + if (Asyncify.currData != previousAsync) { + // returns the promise + return Asyncify.whenDone(); + } + // the function is synchronous. returns the result. 
+ return ret; + }; + }; + + // replace the original functions with asyncified versions + Module["_OrtAppendExecutionProvider"] = wrapAsync( + Module["_OrtAppendExecutionProvider"], + () => Module["_OrtAppendExecutionProvider"], + (v) => (Module["_OrtAppendExecutionProvider"] = v) + ); + Module["_OrtCreateSession"] = wrapAsync( + Module["_OrtCreateSession"], + () => Module["_OrtCreateSession"], + (v) => (Module["_OrtCreateSession"] = v) + ); + Module["_OrtRun"] = wrapAsync( + Module["_OrtRun"], + () => Module["_OrtRun"], + (v) => (Module["_OrtRun"] = v) + ); + Module["_OrtRunWithBinding"] = wrapAsync( + Module["_OrtRunWithBinding"], + () => Module["_OrtRunWithBinding"], + (v) => (Module["_OrtRunWithBinding"] = v) + ); + Module["_OrtBindInput"] = wrapAsync( + Module["_OrtBindInput"], + () => Module["_OrtBindInput"], + (v) => (Module["_OrtBindInput"] = v) + ); + + // If JSEP is enabled, wrap OrtRun() and OrtRunWithBinding() with asyncify. + if (typeof jsepRunAsync !== "undefined") { + Module["_OrtRun"] = jsepRunAsync(Module["_OrtRun"]); + Module["_OrtRunWithBinding"] = jsepRunAsync(Module["_OrtRunWithBinding"]); + } + + // remove this function to make sure it is called only once. + initAsyncImpl = undefined; +}; + +Module["asyncInit"] = () => { + initAsyncImpl?.(); +}; diff --git a/onnxruntime/wasm/pre-jsep.js b/onnxruntime/wasm/pre-jsep.js index 0c83e71a921cb..a35ab129280c4 100644 --- a/onnxruntime/wasm/pre-jsep.js +++ b/onnxruntime/wasm/pre-jsep.js @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -'use strict'; +"use strict"; // // This file contains the pre-run code for the ORT WebAssembly module. The code in this file will be injected into the @@ -9,247 +9,151 @@ // // This file will only be used in build with flag `--use_jsep`. - -/** - * initialize JSEP for asyncify support. - */ -let jsepInitAsync = () => { - // This is a simplified version of cwrap() with options.async === true (-sASYNCIFY=1) - // It removes some overhead in cwarp() and ccall() that we don't need. - // - // Currently in JSEP build, we only use this for the following functions: - // - OrtRun() - // - OrtRunWithBinding() - // - OrtBindInput() - // - // Note: about parameters "getFunc" and "setFunc": - // - Emscripten has different behaviors for Debug and Release builds for generating exported function wrapper. - // - // - In Debug build, it will generate a wrapper function for each exported function. For example, it generates a - // wrapper for OrtRun() like this (minified): - // ``` - // var _OrtRun = Module["_OrtRun"] = createExportWrapper("OrtRun"); - // ``` - // - // - In Release build, it will generate a lazy loading wrapper for each exported function. For example, it generates - // a wrapper for OrtRun() like this (minified): - // ``` - // d._OrtRun = (a, b, c, e, f, h, l, q) => (d._OrtRun = J.ka)(a, b, c, e, f, h, l, q); - // ``` - // - // The behavior of these two wrappers are different. The debug build will assign `Module["_OrtRun"]` only once - // because `createExportWrapper()` does not reset `Module["_OrtRun"]` inside. The release build, however, will - // reset d._OrtRun to J.ka when the first time it is called. - // - // The difference is important because we need to design the async wrapper in a way that it can handle both cases. - // - // Now, let's look at how the async wrapper is designed to work for both cases: - // - // - Debug build: - // 1. 
When Web assembly is being loaded, `Module["_OrtRun"]` is assigned to `createExportWrapper("OrtRun")`. - // 2. When the first time `Module["jsepInit"]` is called, `Module["_OrtRun"]` is re-assigned to a new async - // wrapper function. - // Value of `Module["_OrtRun"]` will not be changed again. - // - // - Release build: - // 1. When Web assembly is being loaded, `Module["_OrtRun"]` is assigned to a lazy loading wrapper function. - // 2. When the first time `Module["jsepInit"]` is called, `Module["_OrtRun"]` is re-assigned to a new async - // wrapper function. - // 3. When the first time `Module["_OrtRun"]` is called, the async wrapper will be called. It will call into this - // function: - // ``` - // (a, b, c, e, f, h, l, q) => (d._OrtRun = J.ka)(a, b, c, e, f, h, l, q); - // ``` - // This function will assign d._OrtRun (ie. the minimized `Module["_OrtRun"]`) to the real function (J.ka). - // 4. Since d._OrtRun is re-assigned, we need to update the async wrapper to re-assign its stored - // function to the updated value (J.ka), and re-assign the value of `d._OrtRun` back to the async wrapper. - // Value of `Module["_OrtRun"]` will not be changed again. - // - // The value of `Module["_OrtRun"]` will need to be assigned for 2 times for debug build and 4 times for release - // build. - // - // This is why we need this `getFunc` and `setFunc` parameters. They are used to get the current value of an - // exported function and set the new value of an exported function. - // - const jsepWrapAsync = (func, getFunc, setFunc) => { - return (...args) => { - // cache the async data before calling the function. - const previousAsync = Asyncify.currData; - - const previousFunc = getFunc?.(); - const ret = func(...args); - const newFunc = getFunc?.(); - if (previousFunc !== newFunc) { - // The exported function has been updated. - // Set the sync function reference to the new function. - func = newFunc; - // Set the exported function back to the async wrapper. - setFunc(previousFunc); - // Remove getFunc and setFunc. They are no longer needed. - setFunc = null; - getFunc = null; +// This is a wrapper for OrtRun() and OrtRunWithBinding() to ensure that Promises are handled correctly. +const jsepRunAsync = (runAsyncFunc) => { + return async (...args) => { + try { + // Module.jsepSessionState should be null, unless we are in the middle of a session. + // If it is not null, it means that the previous session has not finished yet. + if (Module.jsepSessionState) { + throw new Error("Session already started"); } + const state = (Module.jsepSessionState = { + sessionHandle: args[0], + errors: [], + }); - // If the async data has been changed, it means that the function started an async operation. - if (Asyncify.currData != previousAsync) { - // returns the promise - return Asyncify.whenDone(); - } - // the function is synchronous. returns the result. - return ret; - }; - }; - - // This is a wrapper for OrtRun() and OrtRunWithBinding() to ensure that Promises are handled correctly. - const runAsync = (runAsyncFunc) => { - return async (...args) => { - try { - // Module.jsepSessionState should be null, unless we are in the middle of a session. - // If it is not null, it means that the previous session has not finished yet. 
- if (Module.jsepSessionState) { - throw new Error('Session already started'); - } - const state = Module.jsepSessionState = {sessionHandle: args[0], errors: []}; + // Run the acyncified function: OrtRun() or OrtRunWithBinding() + const ret = await runAsyncFunc(...args); - // Run the acyncified function: OrtRun() or OrtRunWithBinding() - const ret = await runAsyncFunc(...args); - - // Check if the session is still valid. this object should be the same as the one we set above. - if (Module.jsepSessionState !== state) { - throw new Error('Session mismatch'); - } + // Check if the session is still valid. this object should be the same as the one we set above. + if (Module.jsepSessionState !== state) { + throw new Error("Session mismatch"); + } - // Flush the backend. This will submit all pending commands to the GPU. - Module.jsepBackend?.['flush'](); + // Flush the backend. This will submit all pending commands to the GPU. + Module.jsepBackend?.["flush"](); - // Await all pending promises. This includes GPU validation promises for diagnostic purposes. - const errorPromises = state.errors; - if (errorPromises.length > 0) { - let errors = await Promise.all(errorPromises); - errors = errors.filter(e => e); - if (errors.length > 0) { - throw new Error(errors.join('\n')); - } + // Await all pending promises. This includes GPU validation promises for diagnostic purposes. + const errorPromises = state.errors; + if (errorPromises.length > 0) { + let errors = await Promise.all(errorPromises); + errors = errors.filter((e) => e); + if (errors.length > 0) { + throw new Error(errors.join("\n")); } - - return ret; - } finally { - Module.jsepSessionState = null; } - }; - }; - // replace the original functions with asyncified versions - Module['_OrtCreateSession'] = jsepWrapAsync( - Module['_OrtCreateSession'], - () => Module['_OrtCreateSession'], - v => Module['_OrtCreateSession'] = v); - Module['_OrtRun'] = runAsync(jsepWrapAsync( - Module['_OrtRun'], - () => Module['_OrtRun'], - v => Module['_OrtRun'] = v)); - Module['_OrtRunWithBinding'] = runAsync(jsepWrapAsync( - Module['_OrtRunWithBinding'], - () => Module['_OrtRunWithBinding'], - v => Module['_OrtRunWithBinding'] = v)); - Module['_OrtBindInput'] = jsepWrapAsync( - Module['_OrtBindInput'], - () => Module['_OrtBindInput'], - v => Module['_OrtBindInput'] = v); - - // remove this function to make sure it is called only once. - jsepInitAsync = undefined; + return ret; + } finally { + Module.jsepSessionState = null; + } + }; }; - /** - * initialize JSEP for WebGPU. + * initialize JSEP for WebGPU and WebNN. 
*/ -Module['jsepInit'] = (name, params) => { - jsepInitAsync?.(); - - if (name === 'webgpu') { - [Module.jsepBackend, - Module.jsepAlloc, - Module.jsepFree, - Module.jsepCopy, - Module.jsepCopyAsync, - Module.jsepCreateKernel, - Module.jsepReleaseKernel, - Module.jsepRunKernel, - Module.jsepCaptureBegin, - Module.jsepCaptureEnd, - Module.jsepReplay] = params; +Module["jsepInit"] = (name, params) => { + if (name === "webgpu") { + [ + Module.jsepBackend, + Module.jsepAlloc, + Module.jsepFree, + Module.jsepCopy, + Module.jsepCopyAsync, + Module.jsepCreateKernel, + Module.jsepReleaseKernel, + Module.jsepRunKernel, + Module.jsepCaptureBegin, + Module.jsepCaptureEnd, + Module.jsepReplay, + ] = params; // expose webgpu backend functions const backend = Module.jsepBackend; - Module['jsepRegisterBuffer'] = (sessionId, index, buffer, size) => { - return backend['registerBuffer'](sessionId, index, buffer, size); + Module["jsepRegisterBuffer"] = (sessionId, index, buffer, size) => { + return backend["registerBuffer"](sessionId, index, buffer, size); }; - Module['jsepGetBuffer'] = (dataId) => { - return backend['getBuffer'](dataId); + Module["jsepGetBuffer"] = (dataId) => { + return backend["getBuffer"](dataId); }; - Module['jsepCreateDownloader'] = (gpuBuffer, size, type) => { - return backend['createDownloader'](gpuBuffer, size, type); + Module["jsepCreateDownloader"] = (gpuBuffer, size, type) => { + return backend["createDownloader"](gpuBuffer, size, type); }; - Module['jsepOnCreateSession'] = sessionId => { - backend['onCreateSession'](sessionId); + Module["jsepOnCreateSession"] = (sessionId) => { + backend["onCreateSession"](sessionId); }; - Module['jsepOnReleaseSession'] = sessionId => { - backend['onReleaseSession'](sessionId); + Module["jsepOnReleaseSession"] = (sessionId) => { + backend["onReleaseSession"](sessionId); }; - Module['jsepOnRunStart'] = sessionId => { - return backend['onRunStart'](sessionId); + Module["jsepOnRunStart"] = (sessionId) => { + return backend["onRunStart"](sessionId); }; Module.jsepUploadExternalBuffer = (dataId, buffer) => { - backend['upload'](dataId, buffer); + backend["upload"](dataId, buffer); }; - } else if (name === 'webnn') { + } else if (name === "webnn") { // Functions called from EM_ASM need to be assigned in a way that can be minified. // Functions called via emscripten::val::module_property need to be assigned by name so that the minifier doesn't // change the name. - [Module.jsepBackend, - Module.jsepReserveTensorId, - Module.jsepReleaseTensorId, - Module['jsepEnsureTensor'], - Module.jsepUploadTensor, - Module['jsepDownloadTensor'], + [ + Module.jsepBackend, + Module.jsepReserveTensorId, + Module.jsepReleaseTensorId, + Module["jsepEnsureTensor"], + Module.jsepUploadTensor, + Module["jsepDownloadTensor"], ] = params; // This function is called from both JS and an EM_ASM block, it needs both a minifiable name and an explicit name. - Module['jsepReleaseTensorId'] = Module.jsepReleaseTensorId; - Module['jsepUploadTensor'] = Module.jsepUploadTensor; + Module["jsepReleaseTensorId"] = Module.jsepReleaseTensorId; + Module["jsepUploadTensor"] = Module.jsepUploadTensor; // Functions called from JS also need to have explicit names. 
const backend = Module.jsepBackend; - Module['jsepOnRunStart'] = sessionId => { - return backend['onRunStart'](sessionId); + Module["jsepOnRunStart"] = (sessionId) => { + return backend["onRunStart"](sessionId); }; - Module['jsepOnRunEnd'] = backend['onRunEnd'].bind(backend); - Module['jsepRegisterMLContext'] = (sessionId, mlContext) => { - backend['registerMLContext'](sessionId, mlContext); + Module["jsepOnRunEnd"] = backend["onRunEnd"].bind(backend); + Module["jsepRegisterMLContext"] = (sessionId, mlContext) => { + backend["registerMLContext"](sessionId, mlContext); }; - Module['jsepOnReleaseSession'] = sessionId => { - backend['onReleaseSession'](sessionId); + Module["jsepOnReleaseSession"] = (sessionId) => { + backend["onReleaseSession"](sessionId); }; - Module['jsepCreateMLTensorDownloader'] = (tensorId, type) => { - return backend['createMLTensorDownloader'](tensorId, type); - } - Module['jsepRegisterMLTensor'] = (sessionId, tensor, dataType, shape) => { - return backend['registerMLTensor'](sessionId, tensor, dataType, shape); + Module["jsepCreateMLTensorDownloader"] = (tensorId, type) => { + return backend["createMLTensorDownloader"](tensorId, type); + }; + Module["jsepRegisterMLTensor"] = (sessionId, tensor, dataType, shape) => { + return backend["registerMLTensor"](sessionId, tensor, dataType, shape); }; - Module['jsepCreateMLContext'] = (optionsOrGpuDevice) => { - return backend['createMLContext'](optionsOrGpuDevice); + Module["jsepCreateMLContext"] = (optionsOrGpuDevice) => { + return backend["createMLContext"](optionsOrGpuDevice); }; - Module['jsepRegisterMLConstant'] = (externalFilePath, dataOffset, dataLength, builder, desc) => { - return backend['registerMLConstant']( - externalFilePath, dataOffset, dataLength, builder, desc, Module.MountedFiles); + Module["jsepRegisterMLConstant"] = ( + externalFilePath, + dataOffset, + dataLength, + builder, + desc + ) => { + return backend["registerMLConstant"]( + externalFilePath, + dataOffset, + dataLength, + builder, + desc, + Module.MountedFiles + ); }; - Module['jsepRegisterGraphInput'] = backend['registerGraphInput'].bind(backend); - Module['jsepIsGraphInput'] = backend['isGraphInput'].bind(backend); + Module["jsepRegisterGraphInput"] = + backend["registerGraphInput"].bind(backend); + Module["jsepIsGraphInput"] = backend["isGraphInput"].bind(backend); - Module['jsepCreateTemporaryTensor'] = backend['createTemporaryTensor'].bind(backend); + Module["jsepCreateTemporaryTensor"] = + backend["createTemporaryTensor"].bind(backend); } }; diff --git a/onnxruntime/wasm/pre.js b/onnxruntime/wasm/pre.js index 9b5f3ce545b78..6da28fc355899 100644 --- a/onnxruntime/wasm/pre.js +++ b/onnxruntime/wasm/pre.js @@ -1,21 +1,20 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -'use strict'; +"use strict"; // // This file contains the pre-run code for the ORT WebAssembly module. The code in this file will be injected into the // final module using Emscripten's `--pre-js` option. - /** * Mount external data files of a model to an internal map, which will be used during session initialization. 
* * @param {string} externalDataFilesPath * @param {Uint8Array} externalDataFilesData */ -Module['mountExternalData'] = (externalDataFilePath, externalDataFileData) => { - if (externalDataFilePath.startsWith('./')) { +Module["mountExternalData"] = (externalDataFilePath, externalDataFileData) => { + if (externalDataFilePath.startsWith("./")) { externalDataFilePath = externalDataFilePath.substring(2); } const files = Module.MountedFiles || (Module.MountedFiles = new Map()); @@ -25,7 +24,7 @@ Module['mountExternalData'] = (externalDataFilePath, externalDataFileData) => { /** * Unmount external data files of a model. */ -Module['unmountExternalData'] = () => { +Module["unmountExternalData"] = () => { delete Module.MountedFiles; }; @@ -48,5 +47,7 @@ Module['unmountExternalData'] = () => { * * @suppress {checkVars} */ -var SharedArrayBuffer = globalThis.SharedArrayBuffer ?? - new WebAssembly.Memory({'initial': 0, 'maximum': 0, 'shared': true}).buffer.constructor; +var SharedArrayBuffer = + globalThis.SharedArrayBuffer ?? + new WebAssembly.Memory({ initial: 0, maximum: 0, shared: true }).buffer + .constructor; diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index ecdffbe5fd6a0..78e4f2ce9adef 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1592,8 +1592,11 @@ def generate_build_tree( raise BuildError("WebNN is only available for WebAssembly build.") cmake_args += ["-Donnxruntime_USE_WEBNN=ON"] - if args.use_jsep and args.use_webgpu: - raise BuildError("JSEP (--use_jsep) and WebGPU (--use_webgpu) cannot be enabled at the same time.") + # TODO: currently we allows building with both --use_jsep and --use_webgpu in this working branch. + # This situation is temporary. Eventually, those two flags will be mutually exclusive. + # + # if args.use_jsep and args.use_webgpu: + # raise BuildError("JSEP (--use_jsep) and WebGPU (--use_webgpu) cannot be enabled at the same time.") if args.use_external_dawn and not args.use_webgpu: raise BuildError("External Dawn (--use_external_dawn) must be enabled with WebGPU (--use_webgpu).") From 085bf324d5e7f7bcf59505421c376893e657d7d6 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 13 Feb 2025 19:56:45 -0800 Subject: [PATCH 2/2] easier debugging for integration --- cmake/onnxruntime_webassembly.cmake | 3 +- cmake/patches/dawn/dawn.patch | 97 ++++++++++++++++++++++++-- js/web/lib/wasm/jsep/backend-webgpu.ts | 72 +++++++++++++++++++ js/web/lib/wasm/wasm-core-impl.ts | 13 ++++ js/web/lib/wasm/wasm-types.ts | 1 + js/web/package.json | 2 +- js/web/script/build.ts | 15 +++- onnxruntime/wasm/post-webgpu.js | 19 +++++ onnxruntime/wasm/pre-jsep.js | 9 +++ 9 files changed, 222 insertions(+), 9 deletions(-) diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake index f3afaf7033fd1..b6910795391b1 100644 --- a/cmake/onnxruntime_webassembly.cmake +++ b/cmake/onnxruntime_webassembly.cmake @@ -443,7 +443,8 @@ jsepDownload:_pp_") "SHELL:-s ASSERTIONS=0" "SHELL:-s SAFE_HEAP=0" "SHELL:-s STACK_OVERFLOW_CHECK=0" - --closure 1 + ## comment out closure compiler so that it's easier to debug + # --closure 1 ) endif() diff --git a/cmake/patches/dawn/dawn.patch b/cmake/patches/dawn/dawn.patch index ac4c42bc15fce..4c96eca093607 100644 --- a/cmake/patches/dawn/dawn.patch +++ b/cmake/patches/dawn/dawn.patch @@ -47,8 +47,74 @@ index efd6491cd6..8ebc5d28b6 100644 emcmake cmake -GNinja -DDAWN_EMSCRIPTEN_TOOLCHAIN="path/to/emscripten" ../.. 
ninja +diff --git a/third_party/emdawnwebgpu/library_webgpu.js b/third_party/emdawnwebgpu/library_webgpu.js +index 5862ce4045..45df259bb7 100644 +--- a/third_party/emdawnwebgpu/library_webgpu.js ++++ b/third_party/emdawnwebgpu/library_webgpu.js +@@ -811,6 +811,61 @@ var LibraryWebGPU = { + {{{ runtimeKeepalivePush() }}} + WebGPU.Internals.futureInsert(futureId, adapter.requestDevice(desc).then((device) => { + {{{ runtimeKeepalivePop() }}} ++ ++ if (globalThis["WEBGPU_STAT"]) { ++ // a set that caches all active buffers ++ const buffers = WebGPU.Internals.buffers ??= new Set(); ++ // key is buffer usage, value is total size of buffers with that usage ++ const buffersTotalSize = WebGPU.Internals.buffersTotalSize ??= new Map(); ++ ++ WebGPU.Internals.buffersCreated ??= 0; ++ WebGPU.Internals.buffersDestroyed ??= 0; ++ WebGPU.Internals.buffersUploads ??= 0; ++ WebGPU.Internals.buffersExternalUploads ??= 0; ++ WebGPU.Internals.buffersDownloads ??= 0; ++ WebGPU.Internals.buffersExternalDownloads ??= 0; ++ ++ // create a proxy so that we can monitor buffer usages ++ device = new Proxy(device, { ++ // when call device.createBuffer(), the returned buffer should be added into buffers ++ get: (target, prop, _receiver) => { ++ if (prop === 'createBuffer') { ++ return (desc) => { ++ const buffer = target.createBuffer(desc); ++ const originalDestroy = buffer.destroy.bind(buffer); ++ buffer.destroy = () => { ++ const previousTotal = buffersTotalSize.get(buffer.usage); ++ buffersTotalSize.set(buffer.usage, previousTotal - buffer.size); ++ buffers.delete(buffer); ++ WebGPU.Internals.buffersDestroyed++; ++ originalDestroy(); ++ }; ++ ++ if (buffer.usage === (GPUBufferUsage.MAP_WRITE | GPUBufferUsage.COPY_SRC)) { ++ WebGPU.Internals.buffersUploads++; ++ } ++ if (buffer.usage === (GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ)) { ++ WebGPU.Internals.buffersDownloads++; ++ } ++ ++ buffers.add(buffer); ++ const previousTotal = buffersTotalSize.get(buffer.usage) ?? 
0; ++ buffersTotalSize.set(buffer.usage, previousTotal + buffer.size); ++ WebGPU.Internals.buffersCreated++; ++ return buffer; ++ }; ++ } ++ const propertyValue = Reflect.get(target, prop); ++ if (typeof propertyValue === 'function') { ++ return propertyValue.bind(target); ++ } else { ++ return propertyValue; ++ } ++ }, ++ set: (target, prop, value, _receiver) => Reflect.set(target, prop, value), ++ }); ++ } ++ + WebGPU.Internals.jsObjectInsert(queuePtr, device.queue); + WebGPU.Internals.jsObjectInsert(devicePtr, device); + diff --git a/third_party/emdawnwebgpu/webgpu.cpp b/third_party/emdawnwebgpu/webgpu.cpp -index ca52b1237b..b11462fb87 100644 +index ca52b1237b..a30ca583c3 100644 --- a/third_party/emdawnwebgpu/webgpu.cpp +++ b/third_party/emdawnwebgpu/webgpu.cpp @@ -131,7 +131,6 @@ class RefCounted : NonMovable { @@ -75,7 +141,14 @@ index ca52b1237b..b11462fb87 100644 void Destroy(); const void* GetConstMappedRange(size_t offset, size_t size); -@@ -1168,7 +1169,11 @@ WGPUBuffer emwgpuCreateBuffer(const EventSource* source, +@@ -1164,11 +1165,17 @@ WGPUAdapter emwgpuCreateAdapter(const EventSource* source) { + + WGPUBuffer emwgpuCreateBuffer(const EventSource* source, + bool mappedAtCreation = false) { +- return new WGPUBufferImpl(source, mappedAtCreation); ++ auto x = new WGPUBufferImpl(source, mappedAtCreation); ++ // printf(" #C++: emwgpuCreateBuffer %p\n", x); ++ return x; } WGPUDevice emwgpuCreateDevice(const EventSource* source, WGPUQueue queue) { @@ -88,7 +161,7 @@ index ca52b1237b..b11462fb87 100644 } WGPUQueue emwgpuCreateQueue(const EventSource* source) { -@@ -1284,6 +1289,10 @@ WGPUBufferImpl::WGPUBufferImpl(const EventSource* source, bool mappedAtCreation) +@@ -1284,6 +1291,10 @@ WGPUBufferImpl::WGPUBufferImpl(const EventSource* source, bool mappedAtCreation) } } @@ -99,7 +172,7 @@ index ca52b1237b..b11462fb87 100644 void WGPUBufferImpl::Destroy() { emwgpuBufferDestroy(this); AbortPendingMap("Buffer was destroyed before mapping was resolved."); -@@ -1504,6 +1513,7 @@ WGPUFuture WGPUShaderModuleImpl::GetCompilationInfo( +@@ -1504,6 +1515,7 @@ WGPUFuture WGPUShaderModuleImpl::GetCompilationInfo( void wgpu##Name##Release(WGPU##Name o) { \ if (o->Release()) { \ delete o; \ @@ -107,3 +180,19 @@ index ca52b1237b..b11462fb87 100644 } \ } WGPU_OBJECTS(DEFINE_WGPU_DEFAULT_ADDREF_RELEASE) +@@ -1587,6 +1599,7 @@ WGPUFuture wgpuAdapterRequestDevice( + // ---------------------------------------------------------------------------- + + void wgpuBufferDestroy(WGPUBuffer buffer) { ++ // printf(" #C++: wgpuBufferDestroy %p\n", buffer); + buffer->Destroy(); + } + +@@ -1639,6 +1652,7 @@ void wgpuBufferUnmap(WGPUBuffer buffer) { + WGPUBuffer wgpuDeviceCreateBuffer(WGPUDevice device, + const WGPUBufferDescriptor* descriptor) { + WGPUBuffer buffer = new WGPUBufferImpl(device, descriptor->mappedAtCreation); ++ // printf(" #C++: wgpuDeviceCreateBuffer %p\n", buffer); + emwgpuDeviceCreateBuffer(device, descriptor, buffer); + return buffer; + } diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts index a0010df4643a4..3c04b500e0ab4 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -231,6 +231,16 @@ export class WebGpuBackend { private queryTimeBase?: bigint; queryType: TimestampQuery; + buffers = new Set(); + buffersTotalSize = new Map(); + + buffersCreated = 0; + buffersDestroyed = 0; + buffersUploads = 0; + buffersExternalUploads = 0; + buffersDownloads = 0; + buffersExternalDownloads = 0; + env: Env; 
sessionStatus: SessionState = 'default'; /** @@ -280,6 +290,67 @@ export class WebGpuBackend { } this.device = await adapter.requestDevice(deviceDescriptor); + + // @ts-expect-error Element implicitly has an 'any' type because type 'typeof globalThis' has no index signature.ts(7017) + if (globalThis.WEBGPU_STAT) { + const buffers = this.buffers; + const buffersTotalSize = this.buffersTotalSize; + + const buffersUploadsIncrement = () => { + this.buffersUploads++; + }; + const buffersDownloadsIncrement = () => { + this.buffersDownloads++; + }; + const buffersCreatedIncrement = () => { + this.buffersCreated++; + }; + const buffersDestroyedIncrement = () => { + this.buffersDestroyed++; + }; + + this.device = new Proxy(this.device, { + // when call device.createBuffer(), the returned buffer should be added into buffers + get: (target, prop, _receiver) => { + if (prop === 'createBuffer') { + return (desc: GPUBufferDescriptor) => { + const buffer = target.createBuffer(desc); + const originalDestroy = buffer.destroy.bind(buffer); + buffer.destroy = () => { + const previousTotal = buffersTotalSize.get(buffer.usage); + buffersTotalSize.set(buffer.usage, previousTotal - buffer.size); + buffers.delete(buffer); + buffersDestroyedIncrement(); + originalDestroy(); + }; + + // eslint-disable-next-line no-bitwise + if (buffer.usage === (GPUBufferUsage.MAP_WRITE | GPUBufferUsage.COPY_SRC)) { + buffersUploadsIncrement(); + } + // eslint-disable-next-line no-bitwise + if (buffer.usage === (GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ)) { + buffersDownloadsIncrement(); + } + + buffers.add(buffer); + const previousTotal = buffersTotalSize.get(buffer.usage) ?? 0; + buffersTotalSize.set(buffer.usage, previousTotal + buffer.size); + buffersCreatedIncrement(); + return buffer; + }; + } + const propertyValue = Reflect.get(target, prop); + if (typeof propertyValue === 'function') { + return propertyValue.bind(target); + } else { + return propertyValue; + } + }, + set: (target, prop, value, _receiver) => Reflect.set(target, prop, value), + }); + } + this.deviceInfo = new DeviceInfoImpl(this.device); this.adapterInfo = new AdapterInfoImpl(adapter.info || (await adapter.requestAdapterInfo())); this.gpuDataManager = createGpuDataManager(this); @@ -844,6 +915,7 @@ export class WebGpuBackend { ): () => Promise { return async () => { const data = await downloadGpuData(this, gpuBuffer, size); + this.buffersExternalDownloads++; return createView(data.buffer, type); }; } diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts index dbcf80adf3552..4bfc7925043fe 100644 --- a/js/web/lib/wasm/wasm-core-impl.ts +++ b/js/web/lib/wasm/wasm-core-impl.ts @@ -260,6 +260,8 @@ export const createSession = async ( let modelDataOffset: number, modelDataLength: number; const wasm = getInstance(); + wasm.webgpuStat?.('createSession_start'); + if (Array.isArray(modelData)) { // if model data is an array, it must be a 2-elements tuple containing the pointer and size of the model data [modelDataOffset, modelDataLength] = modelData; @@ -327,6 +329,7 @@ export const createSession = async ( } wasm.jsepOnCreateSession?.(); + wasm.webgpuStat?.('createSession_end'); // clear current MLContext after session creation if (wasm.currentContext) { @@ -436,6 +439,7 @@ export const createSession = async ( export const releaseSession = (sessionId: number): void => { const wasm = getInstance(); + wasm.webgpuStat?.('releaseSession_start'); const session = activeSessions.get(sessionId); if (!session) { throw new Error(`cannot 
release session. invalid session id: ${sessionId}`); @@ -462,6 +466,8 @@ export const releaseSession = (sessionId: number): void => { checkLastError("Can't release session."); } activeSessions.delete(sessionId); + + wasm.webgpuStat?.('releaseSession_end'); }; export const prepareInputOutputTensor = async ( @@ -633,6 +639,8 @@ export const run = async ( const outputValuesOffset = wasm.stackAlloc(outputCount * ptrSize); const outputNamesOffset = wasm.stackAlloc(outputCount * ptrSize); + wasm.webgpuStat?.('run_start'); + try { [runOptionsHandle, runOptionsAllocs] = setRunOptions(options); @@ -722,6 +730,7 @@ export const run = async ( } wasm.jsepOnRunStart?.(sessionHandle); + //wasm.webgpuStat?.('run_beforeAPI'); let errorCode: number; if (!BUILD_DEFS.DISABLE_JSEP && ioBindingState) { @@ -745,6 +754,8 @@ export const run = async ( ); } + //wasm.webgpuStat?.('run_afterAPI'); + if (errorCode !== 0) { checkLastError('failed to call OrtRun().'); } @@ -926,6 +937,8 @@ export const run = async ( false, ]); } + wasm.webgpuStat?.('run_end'); + return output; } finally { wasm.stackRestore(beforeRunStack); diff --git a/js/web/lib/wasm/wasm-types.ts b/js/web/lib/wasm/wasm-types.ts index 9b2ec71fd351d..7e94fec52c374 100644 --- a/js/web/lib/wasm/wasm-types.ts +++ b/js/web/lib/wasm/wasm-types.ts @@ -294,6 +294,7 @@ export declare namespace WebGpu { webgpuUnregisterBuffer(buffer: GPUBuffer): void; webgpuGetBuffer(bufferHandle: number): GPUBuffer; webgpuCreateDownloader(gpuBuffer: GPUBuffer, size: number, sessionHandle: number): () => Promise; + webgpuStat(label?: string): void; } } diff --git a/js/web/package.json b/js/web/package.json index 5defe05e78c1f..6651c05bce5be 100644 --- a/js/web/package.json +++ b/js/web/package.json @@ -72,7 +72,7 @@ "import": "./dist/ort.node.min.mjs", "require": "./dist/ort.node.min.js" }, - "import": "./dist/ort.bundle.min.mjs", + "import": "./dist/ort.bundle.mjs", "require": "./dist/ort.min.js" }, "./all": { diff --git a/js/web/script/build.ts b/js/web/script/build.ts index fd9224a2dcf8b..afbdcc5924836 100644 --- a/js/web/script/build.ts +++ b/js/web/script/build.ts @@ -46,14 +46,17 @@ const DEBUG = process.env.npm_config_debug || args.debug; // boolean|'verbose'|' /** * --webgpu-ep - * --no-webgpu-ep (default) + * --no-webgpu-ep * * Enable or disable the use of WebGPU EP. If enabled, the WebGPU EP will be used. If disabled, the WebGPU backend will * be used with JSEP. * + * The default value is not set. If not set, onnxruntime-web will determine whether to use WebGPU EP or JSEP based on + * the environment (globalThis.WEBGPU_EP). + * * (temporary) This flag is used to test the WebGPU EP integration. It will be removed in the future. */ -const USE_WEBGPU_EP = process.env.npm_config_webgpu_ep ?? args['webgpu-ep'] ?? true; +const USE_WEBGPU_EP = process.env.npm_config_webgpu_ep ?? args['webgpu-ep']; /** * Root folder of the source code: `/js/` @@ -69,7 +72,7 @@ const DEFAULT_DEFINE = { 'BUILD_DEFS.DISABLE_WASM': 'false', 'BUILD_DEFS.DISABLE_WASM_PROXY': 'false', 'BUILD_DEFS.ENABLE_BUNDLE_WASM_JS': 'false', - 'BUILD_DEFS.USE_WEBGPU_EP': JSON.stringify(!!USE_WEBGPU_EP), + 'BUILD_DEFS.USE_WEBGPU_EP': USE_WEBGPU_EP === undefined ? 
'globalThis.WEBGPU_EP' : JSON.stringify(!!USE_WEBGPU_EP), 'BUILD_DEFS.IS_ESM': 'false', 'BUILD_DEFS.ESM_IMPORT_META_URL': 'undefined', @@ -601,6 +604,12 @@ async function main() { outputName: 'ort', define: { ...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGL': 'true' }, }); + // ort.bundle.mjs + await buildOrt({ + outputName: 'ort.bundle', + format: 'esm', + define: { ...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGL': 'true', 'BUILD_DEFS.ENABLE_BUNDLE_WASM_JS': 'true' }, + }); // ort.bundle.min.mjs await buildOrt({ isProduction: true, diff --git a/onnxruntime/wasm/post-webgpu.js b/onnxruntime/wasm/post-webgpu.js index e7631a97c34c6..b84dfd733af57 100644 --- a/onnxruntime/wasm/post-webgpu.js +++ b/onnxruntime/wasm/post-webgpu.js @@ -202,6 +202,8 @@ Module["webgpuInit"] = (setDefaultDevice) => { await gpuReadBuffer.mapAsync(GPUMapMode.READ); + WebGPU.Internals.buffersExternalDownloads++; + const arrayBuffer = gpuReadBuffer.getMappedRange(); return arrayBuffer.slice(0, originalSize); } finally { @@ -258,6 +260,23 @@ Module["webgpuInit"] = (setDefaultDevice) => { size ); webgpuCurrentDevice.queue.submit([commandEncoder.finish()]); + WebGPU.Internals.buffersExternalUploads++; gpuBufferForUploading.destroy(); }; + + Module["webgpuStat"] = (label) => { + if (globalThis["WEBGPU_STAT"]) { + console.log( + `[${label}] BufferCount: ${ + WebGPU.Internals.buffers?.size ?? 0 + }, Created: ${WebGPU.Internals.buffersCreated ?? 0}, Destroyed: ${ + WebGPU.Internals.buffersDestroyed ?? 0 + } Uploads: ${WebGPU.Internals.buffersUploads ?? 0}, Downloads: ${ + WebGPU.Internals.buffersDownloads ?? 0 + }, ExtUploads: ${ + WebGPU.Internals.buffersExternalUploads ?? 0 + }, ExtDownloads: ${WebGPU.Internals.buffersExternalDownloads ?? 0}` + ); + } + }; }; diff --git a/onnxruntime/wasm/pre-jsep.js b/onnxruntime/wasm/pre-jsep.js index a35ab129280c4..04507e5defae9 100644 --- a/onnxruntime/wasm/pre-jsep.js +++ b/onnxruntime/wasm/pre-jsep.js @@ -93,6 +93,15 @@ Module["jsepInit"] = (name, params) => { Module.jsepUploadExternalBuffer = (dataId, buffer) => { backend["upload"](dataId, buffer); + backend["buffersExternalUploads"]++; + }; + + Module["webgpuStat"] = (label) => { + if (globalThis["WEBGPU_STAT"]) { + console.log( + `[${label}] BufferCount: ${backend["buffers"].size}, Created: ${backend["buffersCreated"]}, Destroyed: ${backend["buffersDestroyed"]}, Uploads: ${backend["buffersUploads"]}, Downloads: ${backend["buffersDownloads"]}, ExtUploads: ${backend["buffersExternalUploads"]}, ExtDownloads: ${backend["buffersExternalDownloads"]}` + ); + } }; } else if (name === "webnn") { // Functions called from EM_ASM need to be assigned in a way that can be minified.
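Taken together, the statistics hooks in this second commit are driven by a single global flag. A sketch of how an application embedding ort-web might turn them on follows; the model path and feeds are placeholders, while `WEBGPU_STAT` and the logged counters come from this patch.

```ts
import * as ort from 'onnxruntime-web';

// Enable the buffer statistics before the first session is created, because the
// counting Proxy is only installed when the GPUDevice is created.
(globalThis as Record<string, unknown>).WEBGPU_STAT = true;

// 'model.onnx' is a placeholder path. With the flag set, webgpuStat() logs
// BufferCount / Created / Destroyed / Uploads / Downloads / ExtUploads / ExtDownloads
// around createSession, run and releaseSession via the wasm-core-impl.ts hooks above.
const session = await ort.InferenceSession.create('model.onnx', {
  executionProviders: ['webgpu'],
});
const results = await session.run({ /* feeds for the model's inputs */ });
```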