From 1b6b2361c5580e6887b96dd48fbd4691d0702294 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Fri, 7 Feb 2025 12:08:43 -0800 Subject: [PATCH 1/2] enable WebGPU EP in WebAssembly build --- .../external/onnxruntime_external_deps.cmake | 7 + cmake/onnxruntime_webassembly.cmake | 37 +- cmake/patches/dawn/dawn.patch | 73 +++ cmake/patches/emscripten/webgpu-externs.js | 577 ++++++++++++++++++ js/build_webgpu.bat | 79 +++ js/web/lib/build-def.d.ts | 7 + js/web/lib/wasm/jsep/init.ts | 136 +++-- js/web/lib/wasm/session-options.ts | 116 ++-- js/web/lib/wasm/wasm-core-impl.ts | 97 ++- js/web/lib/wasm/wasm-types.ts | 68 ++- js/web/script/build.ts | 17 +- .../core/framework/external_data_loader.cc | 7 +- .../core/framework/external_data_loader.h | 2 +- .../providers/webgpu/external_data_loader.cc | 40 ++ .../providers/webgpu/external_data_loader.h | 30 + onnxruntime/core/providers/webgpu/program.cc | 20 + onnxruntime/core/providers/webgpu/program.h | 1 + .../core/providers/webgpu/webgpu_context.cc | 53 +- .../webgpu/webgpu_execution_provider.cc | 7 + .../webgpu/webgpu_execution_provider.h | 3 + .../webgpu/webgpu_provider_factory.cc | 6 + onnxruntime/wasm/api.cc | 26 +- onnxruntime/wasm/api.h | 24 +- onnxruntime/wasm/js_post_js.js | 2 +- onnxruntime/wasm/js_post_js_64.js | 2 +- onnxruntime/wasm/post-webgpu.js | 263 ++++++++ onnxruntime/wasm/pre-async.js | 142 +++++ onnxruntime/wasm/pre-jsep.js | 308 ++++------ onnxruntime/wasm/pre.js | 15 +- tools/ci_build/build.py | 7 +- 30 files changed, 1783 insertions(+), 389 deletions(-) create mode 100644 cmake/patches/emscripten/webgpu-externs.js create mode 100644 js/build_webgpu.bat create mode 100644 onnxruntime/core/providers/webgpu/external_data_loader.cc create mode 100644 onnxruntime/core/providers/webgpu/external_data_loader.h create mode 100644 onnxruntime/wasm/post-webgpu.js create mode 100644 onnxruntime/wasm/pre-async.js diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 1b1e11c9772f9..7717caf54945b 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -637,6 +637,13 @@ if (onnxruntime_USE_WEBGPU) set(DAWN_BUILD_TESTS OFF CACHE BOOL "" FORCE) if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten") set(DAWN_EMSCRIPTEN_TOOLCHAIN "${REPO_ROOT}/cmake/external/emsdk/upstream/emscripten" CACHE STRING "" FORCE) + + # Update a few files in Emscripten + # + # The following files should be updated in Emscripten. We are waiting for the next Emscripten release to include + # these changes. For now, we apply the changes manually. 
+ # - ${DAWN_EMSCRIPTEN_TOOLCHAIN}/src/closure-externs/webgpu-externs.js + execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${PROJECT_SOURCE_DIR}/patches/emscripten/webgpu-externs.js" "${DAWN_EMSCRIPTEN_TOOLCHAIN}/src/closure-externs/webgpu-externs.js") else() if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY) set(DAWN_BUILD_MONOLITHIC_LIBRARY ON CACHE BOOL "" FORCE) diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake index 8106e46ccf580..f3afaf7033fd1 100644 --- a/cmake/onnxruntime_webassembly.cmake +++ b/cmake/onnxruntime_webassembly.cmake @@ -211,10 +211,14 @@ else() target_link_libraries(onnxruntime_webassembly PRIVATE tensorboard) endif() + set(onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/pre.js") + + set(EXPORTED_FUNCTIONS "_malloc,_free") if (onnxruntime_USE_JSEP) - set(EXPORTED_FUNCTIONS "_malloc,_free,_JsepOutput,_JsepGetNodeName") - else() - set(EXPORTED_FUNCTIONS "_malloc,_free") + string(APPEND EXPORTED_FUNCTIONS ",_JsepOutput,_JsepGetNodeName") + endif() + if (onnxruntime_USE_WEBGPU) + string(APPEND EXPORTED_FUNCTIONS ",_wgpuBufferRelease,_wgpuCreateInstance") endif() if (onnxruntime_ENABLE_WEBASSEMBLY_MEMORY64) @@ -312,13 +316,15 @@ else() target_compile_options(noexcep_operators PRIVATE ${SMEMORY_FLAG} -Wno-experimental) endif() target_link_options(onnxruntime_webassembly PRIVATE - --post-js "${ONNXRUNTIME_ROOT}/wasm/js_post_js_64.js" + "SHELL:--post-js \"${ONNXRUNTIME_ROOT}/wasm/js_post_js_64.js\"" ) + list(APPEND onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/js_post_js_64.js") else () set(MAXIMUM_MEMORY "4294967296") target_link_options(onnxruntime_webassembly PRIVATE - --post-js "${ONNXRUNTIME_ROOT}/wasm/js_post_js.js" + "SHELL:--post-js \"${ONNXRUNTIME_ROOT}/wasm/js_post_js.js\"" ) + list(APPEND onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/js_post_js.js") endif () target_link_options(onnxruntime_webassembly PRIVATE @@ -372,7 +378,6 @@ jsepDownload:_pp_") "SHELL:-s SIGNATURE_CONVERSIONS='${SIGNATURE_CONVERSIONS}'" ) endif () - set_target_properties(onnxruntime_webassembly PROPERTIES LINK_DEPENDS ${ONNXRUNTIME_ROOT}/wasm/pre.js) if (onnxruntime_USE_JSEP) # NOTE: "-s ASYNCIFY=1" is required for JSEP to work with WebGPU @@ -382,10 +387,8 @@ jsepDownload:_pp_") target_compile_definitions(onnxruntime_webassembly PRIVATE USE_JSEP=1) target_link_options(onnxruntime_webassembly PRIVATE "SHELL:--pre-js \"${ONNXRUNTIME_ROOT}/wasm/pre-jsep.js\"" - "SHELL:-s ASYNCIFY=1" - "SHELL:-s ASYNCIFY_STACK_SIZE=65536" ) - set_target_properties(onnxruntime_webassembly PROPERTIES LINK_DEPENDS ${ONNXRUNTIME_ROOT}/wasm/pre-jsep.js) + list(APPEND onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/pre-jsep.js") if (onnxruntime_ENABLE_WEBASSEMBLY_MEMORY64) target_link_options(onnxruntime_webassembly PRIVATE @@ -397,6 +400,20 @@ jsepDownload:_pp_") if (onnxruntime_USE_WEBGPU) target_compile_definitions(onnxruntime_webassembly PRIVATE USE_WEBGPU=1) + target_link_options(onnxruntime_webassembly PRIVATE + "SHELL:--post-js \"${ONNXRUNTIME_ROOT}/wasm/post-webgpu.js\"" + ) + list(APPEND onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/post-webgpu.js") + endif() + + if (onnxruntime_USE_JSEP OR onnxruntime_USE_WEBGPU OR onnxruntime_USE_WEBNN) + # if any of the above is enabled, we need to use the asyncify library + target_link_options(onnxruntime_webassembly PRIVATE + "SHELL:--pre-js \"${ONNXRUNTIME_ROOT}/wasm/pre-async.js\"" + "SHELL:-s ASYNCIFY=1" + "SHELL:-s ASYNCIFY_STACK_SIZE=65536" + ) + list(APPEND 
onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/pre-async.js") endif() if (onnxruntime_EMSCRIPTEN_SETTINGS) @@ -458,6 +475,8 @@ jsepDownload:_pp_") ) endif() + set_target_properties(onnxruntime_webassembly PROPERTIES LINK_DEPENDS "${onnxruntime_webassembly_script_deps}") + set(target_name_list ort) if (onnxruntime_ENABLE_TRAINING_APIS) diff --git a/cmake/patches/dawn/dawn.patch b/cmake/patches/dawn/dawn.patch index 2f85d5ab473b5..ac4c42bc15fce 100644 --- a/cmake/patches/dawn/dawn.patch +++ b/cmake/patches/dawn/dawn.patch @@ -34,3 +34,76 @@ index 6e8ae37593..633af91eef 100644 -q "${EM_BUILD_GEN_DIR}/struct_info_webgpu.json" "-I=${EM_BUILD_GEN_DIR}/include" +diff --git a/src/emdawnwebgpu/README.md b/src/emdawnwebgpu/README.md +index efd6491cd6..8ebc5d28b6 100644 +--- a/src/emdawnwebgpu/README.md ++++ b/src/emdawnwebgpu/README.md +@@ -56,7 +56,7 @@ Set up the build directory using emcmake + mkdir out/cmake-wasm + cd out/cmake-wasm + +-# Make sure the path is to the source checkout of Emscripten, not emsdk's release. ++# If using Emscripten v4.0.2 or lower, make sure the path is to the source checkout of Emscripten, not emsdk's release. + emcmake cmake -GNinja -DDAWN_EMSCRIPTEN_TOOLCHAIN="path/to/emscripten" ../.. + + ninja +diff --git a/third_party/emdawnwebgpu/webgpu.cpp b/third_party/emdawnwebgpu/webgpu.cpp +index ca52b1237b..b11462fb87 100644 +--- a/third_party/emdawnwebgpu/webgpu.cpp ++++ b/third_party/emdawnwebgpu/webgpu.cpp +@@ -131,7 +131,6 @@ class RefCounted : NonMovable { + bool Release() { + if (mRefCount.fetch_sub(1u, std::memory_order_release) == 1u) { + std::atomic_thread_fence(std::memory_order_acquire); +- emwgpuDelete(this); + return true; + } + return false; +@@ -234,6 +233,7 @@ class Ref { + static void Release(T value) { + if (value != nullptr && value->RefCounted::Release()) { + delete value; ++ emwgpuDelete(value); + } + } + +@@ -642,6 +642,7 @@ struct WGPUBufferImpl final : public EventSource, + public RefCountedWithExternalCount { + public: + WGPUBufferImpl(const EventSource* source, bool mappedAtCreation); ++ ~WGPUBufferImpl(); + + void Destroy(); + const void* GetConstMappedRange(size_t offset, size_t size); +@@ -1168,7 +1169,11 @@ WGPUBuffer emwgpuCreateBuffer(const EventSource* source, + } + + WGPUDevice emwgpuCreateDevice(const EventSource* source, WGPUQueue queue) { +- return new WGPUDeviceImpl(source, queue); ++ // This function is only called from JS via `importJsDevice()`, which ++ // needs to increment the external ref count to fix the behavior. 
++ WGPUDeviceImpl* device = new WGPUDeviceImpl(source, queue); ++ device->AddExternalRef(); ++ return device; + } + + WGPUQueue emwgpuCreateQueue(const EventSource* source) { +@@ -1284,6 +1289,10 @@ WGPUBufferImpl::WGPUBufferImpl(const EventSource* source, bool mappedAtCreation) + } + } + ++WGPUBufferImpl::~WGPUBufferImpl() { ++ Destroy(); ++} ++ + void WGPUBufferImpl::Destroy() { + emwgpuBufferDestroy(this); + AbortPendingMap("Buffer was destroyed before mapping was resolved."); +@@ -1504,6 +1513,7 @@ WGPUFuture WGPUShaderModuleImpl::GetCompilationInfo( + void wgpu##Name##Release(WGPU##Name o) { \ + if (o->Release()) { \ + delete o; \ ++ emwgpuDelete(o); \ + } \ + } + WGPU_OBJECTS(DEFINE_WGPU_DEFAULT_ADDREF_RELEASE) diff --git a/cmake/patches/emscripten/webgpu-externs.js b/cmake/patches/emscripten/webgpu-externs.js new file mode 100644 index 0000000000000..9dc1a6943ed51 --- /dev/null +++ b/cmake/patches/emscripten/webgpu-externs.js @@ -0,0 +1,577 @@ +/* + * WebGPU globals + * Generated using https://github.com/kainino0x/webidl-to-closure-externs + * against the spec's WebIDL: https://gpuweb.github.io/gpuweb/webgpu.idl + */ + +/** @type {?GPU} */ +Navigator.prototype.gpu; + +/** @type {?GPU} */ +WorkerNavigator.prototype.gpu; + +const GPUBufferUsage = {}; +/** @type {number} */ +GPUBufferUsage.MAP_READ; +/** @type {number} */ +GPUBufferUsage.MAP_WRITE; +/** @type {number} */ +GPUBufferUsage.COPY_SRC; +/** @type {number} */ +GPUBufferUsage.COPY_DST; +/** @type {number} */ +GPUBufferUsage.INDEX; +/** @type {number} */ +GPUBufferUsage.VERTEX; +/** @type {number} */ +GPUBufferUsage.UNIFORM; +/** @type {number} */ +GPUBufferUsage.STORAGE; +/** @type {number} */ +GPUBufferUsage.INDIRECT; +/** @type {number} */ +GPUBufferUsage.QUERY_RESOLVE; + +const GPUMapMode = {}; +/** @type {number} */ +GPUMapMode.READ; +/** @type {number} */ +GPUMapMode.WRITE; + +const GPUTextureUsage = {}; +/** @type {number} */ +GPUTextureUsage.COPY_SRC; +/** @type {number} */ +GPUTextureUsage.COPY_DST; +/** @type {number} */ +GPUTextureUsage.TEXTURE_BINDING; +/** @type {number} */ +GPUTextureUsage.STORAGE_BINDING; +/** @type {number} */ +GPUTextureUsage.RENDER_ATTACHMENT; + +const GPUShaderStage = {}; +/** @type {number} */ +GPUShaderStage.VERTEX; +/** @type {number} */ +GPUShaderStage.FRAGMENT; +/** @type {number} */ +GPUShaderStage.COMPUTE; + +const GPUColorWrite = {}; +/** @type {number} */ +GPUColorWrite.RED; +/** @type {number} */ +GPUColorWrite.GREEN; +/** @type {number} */ +GPUColorWrite.BLUE; +/** @type {number} */ +GPUColorWrite.ALPHA; +/** @type {number} */ +GPUColorWrite.ALL; + +/** @constructor */ +function GPUSupportedLimits() {} +/** @type {number} */ +GPUSupportedLimits.prototype.maxTextureDimension1D; +/** @type {number} */ +GPUSupportedLimits.prototype.maxTextureDimension2D; +/** @type {number} */ +GPUSupportedLimits.prototype.maxTextureDimension3D; +/** @type {number} */ +GPUSupportedLimits.prototype.maxTextureArrayLayers; +/** @type {number} */ +GPUSupportedLimits.prototype.maxBindGroups; +/** @type {number} */ +GPUSupportedLimits.prototype.maxBindGroupsPlusVertexBuffers; +/** @type {number} */ +GPUSupportedLimits.prototype.maxBindingsPerBindGroup; +/** @type {number} */ +GPUSupportedLimits.prototype.maxDynamicUniformBuffersPerPipelineLayout; +/** @type {number} */ +GPUSupportedLimits.prototype.maxDynamicStorageBuffersPerPipelineLayout; +/** @type {number} */ +GPUSupportedLimits.prototype.maxSampledTexturesPerShaderStage; +/** @type {number} */ +GPUSupportedLimits.prototype.maxSamplersPerShaderStage; 
+/** @type {number} */ +GPUSupportedLimits.prototype.maxStorageBuffersPerShaderStage; +/** @type {number} */ +GPUSupportedLimits.prototype.maxStorageTexturesPerShaderStage; +/** @type {number} */ +GPUSupportedLimits.prototype.maxUniformBuffersPerShaderStage; +/** @type {number} */ +GPUSupportedLimits.prototype.maxUniformBufferBindingSize; +/** @type {number} */ +GPUSupportedLimits.prototype.maxStorageBufferBindingSize; +/** @type {number} */ +GPUSupportedLimits.prototype.minUniformBufferOffsetAlignment; +/** @type {number} */ +GPUSupportedLimits.prototype.minStorageBufferOffsetAlignment; +/** @type {number} */ +GPUSupportedLimits.prototype.maxVertexBuffers; +/** @type {number} */ +GPUSupportedLimits.prototype.maxBufferSize; +/** @type {number} */ +GPUSupportedLimits.prototype.maxVertexAttributes; +/** @type {number} */ +GPUSupportedLimits.prototype.maxVertexBufferArrayStride; +/** @type {number} */ +GPUSupportedLimits.prototype.maxInterStageShaderComponents; +/** @type {number} */ +GPUSupportedLimits.prototype.maxInterStageShaderVariables; +/** @type {number} */ +GPUSupportedLimits.prototype.maxColorAttachments; +/** @type {number} */ +GPUSupportedLimits.prototype.maxColorAttachmentBytesPerSample; +/** @type {number} */ +GPUSupportedLimits.prototype.maxComputeWorkgroupStorageSize; +/** @type {number} */ +GPUSupportedLimits.prototype.maxComputeInvocationsPerWorkgroup; +/** @type {number} */ +GPUSupportedLimits.prototype.maxComputeWorkgroupSizeX; +/** @type {number} */ +GPUSupportedLimits.prototype.maxComputeWorkgroupSizeY; +/** @type {number} */ +GPUSupportedLimits.prototype.maxComputeWorkgroupSizeZ; +/** @type {number} */ +GPUSupportedLimits.prototype.maxComputeWorkgroupsPerDimension; + +/** @constructor */ +function GPUSupportedFeatures() {} +/** @type {number} */ +GPUSupportedFeatures.prototype.size; +/** @return {!Iterable} */ +GPUSupportedFeatures.prototype.entries = function() {}; +/** @return {!Iterable} */ +GPUSupportedFeatures.prototype.keys = function() {}; +/** @return {!Iterable} */ +GPUSupportedFeatures.prototype.values = function() {}; +/** @return {undefined} */ +GPUSupportedFeatures.prototype.forEach = function() {}; +/** @return {boolean} */ +GPUSupportedFeatures.prototype.has = function() {}; + +/** @constructor */ +function WGSLLanguageFeatures() {} +/** @type {number} */ +WGSLLanguageFeatures.prototype.size; +/** @return {!Iterable} */ +WGSLLanguageFeatures.prototype.entries = function() {}; +/** @return {!Iterable} */ +WGSLLanguageFeatures.prototype.keys = function() {}; +/** @return {!Iterable} */ +WGSLLanguageFeatures.prototype.values = function() {}; +/** @return {undefined} */ +WGSLLanguageFeatures.prototype.forEach = function() {}; +/** @return {boolean} */ +WGSLLanguageFeatures.prototype.has = function() {}; + +/** @constructor */ +function GPUAdapterInfo() {} +/** @type {string} */ +GPUAdapterInfo.prototype.vendor; +/** @type {string} */ +GPUAdapterInfo.prototype.architecture; +/** @type {string} */ +GPUAdapterInfo.prototype.device; +/** @type {string} */ +GPUAdapterInfo.prototype.description; + +/** @constructor */ +function GPU() {} +/** @return {!Promise} */ +GPU.prototype.requestAdapter = function() {}; +/** @return {string} */ +GPU.prototype.getPreferredCanvasFormat = function() {}; +/** @type {!WGSLLanguageFeatures} */ +GPU.prototype.wgslLanguageFeatures; + +/** @constructor */ +function GPUAdapter() {} +/** @type {!GPUSupportedFeatures} */ +GPUAdapter.prototype.features; +/** @type {!GPUSupportedLimits} */ +GPUAdapter.prototype.limits; +/** @type {boolean} 
*/ +GPUAdapter.prototype.isFallbackAdapter; +/** @return {!Promise} */ +GPUAdapter.prototype.requestDevice = function() {}; +/** @return {!Promise} */ +GPUAdapter.prototype.requestAdapterInfo = function() {}; +/** @type {!GPUAdapterInfo} */ +GPUAdapter.prototype.info; + +/** @constructor */ +function GPUDevice() {} +/** @type {string} */ +GPUDevice.prototype.label; +/** @type {!GPUSupportedFeatures} */ +GPUDevice.prototype.features; +/** @type {!GPUSupportedLimits} */ +GPUDevice.prototype.limits; +/** @type {!GPUQueue} */ +GPUDevice.prototype.queue; +/** @return {undefined} */ +GPUDevice.prototype.destroy = function() {}; +/** @return {!GPUBuffer} */ +GPUDevice.prototype.createBuffer = function() {}; +/** @return {!GPUTexture} */ +GPUDevice.prototype.createTexture = function() {}; +/** @return {!GPUSampler} */ +GPUDevice.prototype.createSampler = function() {}; +/** @return {!GPUExternalTexture} */ +GPUDevice.prototype.importExternalTexture = function() {}; +/** @return {!GPUBindGroupLayout} */ +GPUDevice.prototype.createBindGroupLayout = function() {}; +/** @return {!GPUPipelineLayout} */ +GPUDevice.prototype.createPipelineLayout = function() {}; +/** @return {!GPUBindGroup} */ +GPUDevice.prototype.createBindGroup = function() {}; +/** @return {!GPUShaderModule} */ +GPUDevice.prototype.createShaderModule = function() {}; +/** @return {!GPUComputePipeline} */ +GPUDevice.prototype.createComputePipeline = function() {}; +/** @return {!GPURenderPipeline} */ +GPUDevice.prototype.createRenderPipeline = function() {}; +/** @return {!Promise} */ +GPUDevice.prototype.createComputePipelineAsync = function() {}; +/** @return {!Promise} */ +GPUDevice.prototype.createRenderPipelineAsync = function() {}; +/** @return {!GPUCommandEncoder} */ +GPUDevice.prototype.createCommandEncoder = function() {}; +/** @return {!GPURenderBundleEncoder} */ +GPUDevice.prototype.createRenderBundleEncoder = function() {}; +/** @return {!GPUQuerySet} */ +GPUDevice.prototype.createQuerySet = function() {}; +/** @type {!Promise} */ +GPUDevice.prototype.lost; +/** @return {undefined} */ +GPUDevice.prototype.pushErrorScope = function() {}; +/** @return {!Promise} */ +GPUDevice.prototype.popErrorScope = function() {}; +/** @type {!Function} */ +GPUDevice.prototype.onuncapturederror; +/** @type {!GPUAdapterInfo} */ +GPUDevice.prototype.adapterInfo; + +/** @constructor */ +function GPUBuffer() {} +/** @type {string} */ +GPUBuffer.prototype.label; +/** @type {number} */ +GPUBuffer.prototype.size; +/** @type {number} */ +GPUBuffer.prototype.usage; +/** @type {string} */ +GPUBuffer.prototype.mapState; +/** @return {!Promise} */ +GPUBuffer.prototype.mapAsync = function() {}; +/** @return {!ArrayBuffer} */ +GPUBuffer.prototype.getMappedRange = function() {}; +/** @return {undefined} */ +GPUBuffer.prototype.unmap = function() {}; +/** @return {undefined} */ +GPUBuffer.prototype.destroy = function() {}; + +/** @constructor */ +function GPUTexture() {} +/** @type {string} */ +GPUTexture.prototype.label; +/** @return {!GPUTextureView} */ +GPUTexture.prototype.createView = function() {}; +/** @return {undefined} */ +GPUTexture.prototype.destroy = function() {}; +/** @type {number} */ +GPUTexture.prototype.width; +/** @type {number} */ +GPUTexture.prototype.height; +/** @type {number} */ +GPUTexture.prototype.depthOrArrayLayers; +/** @type {number} */ +GPUTexture.prototype.mipLevelCount; +/** @type {number} */ +GPUTexture.prototype.sampleCount; +/** @type {string} */ +GPUTexture.prototype.dimension; +/** @type {string} */ 
+GPUTexture.prototype.format; +/** @type {number} */ +GPUTexture.prototype.usage; + +/** @constructor */ +function GPUTextureView() {} +/** @type {string} */ +GPUTextureView.prototype.label; + +/** @constructor */ +function GPUExternalTexture() {} +/** @type {string} */ +GPUExternalTexture.prototype.label; + +/** @constructor */ +function GPUSampler() {} +/** @type {string} */ +GPUSampler.prototype.label; + +/** @constructor */ +function GPUBindGroupLayout() {} +/** @type {string} */ +GPUBindGroupLayout.prototype.label; + +/** @constructor */ +function GPUBindGroup() {} +/** @type {string} */ +GPUBindGroup.prototype.label; + +/** @constructor */ +function GPUPipelineLayout() {} +/** @type {string} */ +GPUPipelineLayout.prototype.label; + +/** @constructor */ +function GPUShaderModule() {} +/** @type {string} */ +GPUShaderModule.prototype.label; +/** @return {!Promise} */ +GPUShaderModule.prototype.getCompilationInfo = function() {}; + +/** @constructor */ +function GPUCompilationMessage() {} +/** @type {string} */ +GPUCompilationMessage.prototype.message; +/** @type {string} */ +GPUCompilationMessage.prototype.type; +/** @type {number} */ +GPUCompilationMessage.prototype.lineNum; +/** @type {number} */ +GPUCompilationMessage.prototype.linePos; +/** @type {number} */ +GPUCompilationMessage.prototype.offset; +/** @type {number} */ +GPUCompilationMessage.prototype.length; + +/** @constructor */ +function GPUCompilationInfo() {} +/** @type {!Array} */ +GPUCompilationInfo.prototype.messages; + +/** @constructor */ +function GPUPipelineError() {} +/** @type {string} */ +GPUPipelineError.prototype.reason; + +/** @constructor */ +function GPUComputePipeline() {} +/** @type {string} */ +GPUComputePipeline.prototype.label; +/** @return {!GPUBindGroupLayout} */ +GPUComputePipeline.prototype.getBindGroupLayout = function() {}; + +/** @constructor */ +function GPURenderPipeline() {} +/** @type {string} */ +GPURenderPipeline.prototype.label; +/** @return {!GPUBindGroupLayout} */ +GPURenderPipeline.prototype.getBindGroupLayout = function() {}; + +/** @constructor */ +function GPUCommandBuffer() {} +/** @type {string} */ +GPUCommandBuffer.prototype.label; + +/** @constructor */ +function GPUCommandEncoder() {} +/** @type {string} */ +GPUCommandEncoder.prototype.label; +/** @return {undefined} */ +GPUCommandEncoder.prototype.pushDebugGroup = function() {}; +/** @return {undefined} */ +GPUCommandEncoder.prototype.popDebugGroup = function() {}; +/** @return {undefined} */ +GPUCommandEncoder.prototype.insertDebugMarker = function() {}; +/** @return {!GPURenderPassEncoder} */ +GPUCommandEncoder.prototype.beginRenderPass = function() {}; +/** @return {!GPUComputePassEncoder} */ +GPUCommandEncoder.prototype.beginComputePass = function() {}; +/** @return {undefined} */ +GPUCommandEncoder.prototype.copyBufferToBuffer = function() {}; +/** @return {undefined} */ +GPUCommandEncoder.prototype.copyBufferToTexture = function() {}; +/** @return {undefined} */ +GPUCommandEncoder.prototype.copyTextureToBuffer = function() {}; +/** @return {undefined} */ +GPUCommandEncoder.prototype.copyTextureToTexture = function() {}; +/** @return {undefined} */ +GPUCommandEncoder.prototype.clearBuffer = function() {}; +/** @return {undefined} */ +GPUCommandEncoder.prototype.resolveQuerySet = function() {}; +/** @return {!GPUCommandBuffer} */ +GPUCommandEncoder.prototype.finish = function() {}; + +/** @constructor */ +function GPUComputePassEncoder() {} +/** @type {string} */ +GPUComputePassEncoder.prototype.label; +/** @return 
{undefined} */ +GPUComputePassEncoder.prototype.pushDebugGroup = function() {}; +/** @return {undefined} */ +GPUComputePassEncoder.prototype.popDebugGroup = function() {}; +/** @return {undefined} */ +GPUComputePassEncoder.prototype.insertDebugMarker = function() {}; +/** @return {undefined} */ +GPUComputePassEncoder.prototype.setBindGroup = function() {}; +/** @return {undefined} */ +GPUComputePassEncoder.prototype.setBindGroup = function() {}; +/** @return {undefined} */ +GPUComputePassEncoder.prototype.setPipeline = function() {}; +/** @return {undefined} */ +GPUComputePassEncoder.prototype.dispatchWorkgroups = function() {}; +/** @return {undefined} */ +GPUComputePassEncoder.prototype.dispatchWorkgroupsIndirect = function() {}; +/** @return {undefined} */ +GPUComputePassEncoder.prototype.end = function() {}; + +/** @constructor */ +function GPURenderPassEncoder() {} +/** @type {string} */ +GPURenderPassEncoder.prototype.label; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.pushDebugGroup = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.popDebugGroup = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.insertDebugMarker = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.setBindGroup = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.setBindGroup = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.setPipeline = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.setIndexBuffer = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.setVertexBuffer = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.draw = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.drawIndexed = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.drawIndirect = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.drawIndexedIndirect = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.setViewport = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.setScissorRect = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.setBlendConstant = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.setStencilReference = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.beginOcclusionQuery = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.endOcclusionQuery = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.executeBundles = function() {}; +/** @return {undefined} */ +GPURenderPassEncoder.prototype.end = function() {}; + +/** @constructor */ +function GPURenderBundle() {} +/** @type {string} */ +GPURenderBundle.prototype.label; + +/** @constructor */ +function GPURenderBundleEncoder() {} +/** @type {string} */ +GPURenderBundleEncoder.prototype.label; +/** @return {undefined} */ +GPURenderBundleEncoder.prototype.pushDebugGroup = function() {}; +/** @return {undefined} */ +GPURenderBundleEncoder.prototype.popDebugGroup = function() {}; +/** @return {undefined} */ +GPURenderBundleEncoder.prototype.insertDebugMarker = function() {}; +/** @return {undefined} */ +GPURenderBundleEncoder.prototype.setBindGroup = function() {}; +/** @return {undefined} */ +GPURenderBundleEncoder.prototype.setBindGroup = function() {}; +/** @return {undefined} */ +GPURenderBundleEncoder.prototype.setPipeline = 
function() {}; +/** @return {undefined} */ +GPURenderBundleEncoder.prototype.setIndexBuffer = function() {}; +/** @return {undefined} */ +GPURenderBundleEncoder.prototype.setVertexBuffer = function() {}; +/** @return {undefined} */ +GPURenderBundleEncoder.prototype.draw = function() {}; +/** @return {undefined} */ +GPURenderBundleEncoder.prototype.drawIndexed = function() {}; +/** @return {undefined} */ +GPURenderBundleEncoder.prototype.drawIndirect = function() {}; +/** @return {undefined} */ +GPURenderBundleEncoder.prototype.drawIndexedIndirect = function() {}; +/** @return {!GPURenderBundle} */ +GPURenderBundleEncoder.prototype.finish = function() {}; + +/** @constructor */ +function GPUQueue() {} +/** @type {string} */ +GPUQueue.prototype.label; +/** @return {undefined} */ +GPUQueue.prototype.submit = function() {}; +/** @return {!Promise} */ +GPUQueue.prototype.onSubmittedWorkDone = function() {}; +/** @return {undefined} */ +GPUQueue.prototype.writeBuffer = function() {}; +/** @return {undefined} */ +GPUQueue.prototype.writeTexture = function() {}; +/** @return {undefined} */ +GPUQueue.prototype.copyExternalImageToTexture = function() {}; + +/** @constructor */ +function GPUQuerySet() {} +/** @type {string} */ +GPUQuerySet.prototype.label; +/** @return {undefined} */ +GPUQuerySet.prototype.destroy = function() {}; +/** @type {string} */ +GPUQuerySet.prototype.type; +/** @type {number} */ +GPUQuerySet.prototype.count; + +/** @constructor */ +function GPUCanvasContext() {} +/** @type {!HTMLCanvasElement|!OffscreenCanvas} */ +GPUCanvasContext.prototype.canvas; +/** @return {undefined} */ +GPUCanvasContext.prototype.configure = function() {}; +/** @return {undefined} */ +GPUCanvasContext.prototype.unconfigure = function() {}; +/** @return {!GPUTexture} */ +GPUCanvasContext.prototype.getCurrentTexture = function() {}; + +/** @constructor */ +function GPUDeviceLostInfo() {} +/** @type {string} */ +GPUDeviceLostInfo.prototype.reason; +/** @type {string} */ +GPUDeviceLostInfo.prototype.message; + +/** @constructor */ +function GPUError() {} +/** @type {string} */ +GPUError.prototype.message; + +/** @constructor */ +function GPUValidationError() {} + +/** @constructor */ +function GPUOutOfMemoryError() {} + +/** @constructor */ +function GPUInternalError() {} + +/** @constructor */ +function GPUUncapturedErrorEvent() {} +/** @type {!GPUError} */ +GPUUncapturedErrorEvent.prototype.error; diff --git a/js/build_webgpu.bat b/js/build_webgpu.bat new file mode 100644 index 0000000000000..95413509e701d --- /dev/null +++ b/js/build_webgpu.bat @@ -0,0 +1,79 @@ +@echo off + +rem build_webgpu.bat --- build onnxruntime-web with WebGPU EP +rem +rem Usage: +rem build_webgpu.bat config [clean] +rem +rem Options: +rem config Build configuration, "d" or "r" +rem clean Perform a clean build, "clean" or empty + +setlocal enabledelayedexpansion + +set ROOT=%~dp0..\ +set BUILD_DIR=%ROOT%build_webgpu + +:arg1 +if ["%~1"]==["d"] ( + set CONFIG=Debug + set CONFIG_EXTRA_FLAG= + @rem --enable_wasm_profiling --wasm_run_tests_in_browser + @rem --cmake_extra_defines onnxruntime_ENABLE_WEBASSEMBLY_OUTPUT_OPTIMIZED_MODEL=1 + @rem --enable_wasm_debug_info + goto :arg2 +) +if ["%~1"]==["r"] ( + set CONFIG=Release + set CONFIG_EXTRA_FLAG= + @rem --enable_wasm_api_exception_catching --disable_rtti + goto :arg2 +) +echo Invalid configuration "%~1", must be "d"(Debug) or "r"(Release) +exit /b 1 + +:arg2 +if ["%~2"]==["clean"] ( + goto :clean +) +if not exist "%ROOT%js\web\dist" ( + goto :npm_ci +) + +goto :build_wasm + +:clean 
+if exist "%BUILD_DIR%" ( + rd /s /q %BUILD_DIR% +) + +pushd %ROOT% +git submodule sync --recursive +git submodule update --init --recursive +popd + +:npm_ci +pushd %ROOT%js +call npm ci +popd +pushd %ROOT%js\common +call npm ci +popd +pushd %ROOT%js\web +call npm ci +call npm run pull:wasm +popd + +:build_wasm + +set PATH=C:\Program Files\Git\usr\bin;%PATH% + +call %ROOT%build.bat --config %CONFIG% %CONFIG_EXTRA_FLAG% --skip_submodule_sync --build_wasm --target onnxruntime_webassembly --skip_tests^ + --enable_wasm_simd --enable_wasm_threads --use_jsep --use_webnn --use_webgpu --build_dir %BUILD_DIR% + +IF NOT "%ERRORLEVEL%" == "0" ( + exit /b %ERRORLEVEL% +) + +copy /Y %BUILD_DIR%\%CONFIG%\ort-wasm-simd-threaded.jsep.wasm %ROOT%js\web\dist\ +copy /Y %BUILD_DIR%\%CONFIG%\ort-wasm-simd-threaded.jsep.mjs %ROOT%js\web\dist\ diff --git a/js/web/lib/build-def.d.ts b/js/web/lib/build-def.d.ts index 59f64a3179605..83a52ebaefe05 100644 --- a/js/web/lib/build-def.d.ts +++ b/js/web/lib/build-def.d.ts @@ -40,6 +40,13 @@ interface BuildDefinitions { */ readonly ENABLE_BUNDLE_WASM_JS: boolean; + /** + * defines whether to use WebGPU EP instead of JSEP for WebGPU backend. + * + * This flag requires the corresponding WebAssembly artifact to be built with `--use_webgpu` flag. + */ + readonly USE_WEBGPU_EP: boolean; + // #endregion // #region Build definitions for ESM diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index b4071eae51c8f..fe9576b87ad72 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -1,17 +1,17 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import { Env } from 'onnxruntime-common'; +import type { Env } from 'onnxruntime-common'; import { calculateTensorSizeInBytes, DataType } from '../wasm-common'; import type { OrtWasmModule } from '../wasm-types'; -import { WebGpuBackend } from './backend-webgpu'; +import type { WebGpuBackend } from './backend-webgpu'; import { LOG_DEBUG } from './log'; -import { TensorView } from './tensor-view'; +import type { TensorView } from './tensor-view'; import { ShapeUtil } from './util'; -import { +import type { AdapterInfo, ComputeContext, ComputeContextInputsOutputsMapping, @@ -205,79 +205,83 @@ export const init = async ( } if (name === 'webgpu') { - const backend = new WebGpuBackend(); - await backend.initialize(env, gpuAdapter!); + if (!BUILD_DEFS.USE_WEBGPU_EP) { + // eslint-disable-next-line @typescript-eslint/no-require-imports, @typescript-eslint/no-var-requires + const webGpuBackendImpl = require('./backend-webgpu').WebGpuBackend; + const backend = new webGpuBackendImpl(); + await backend.initialize(env, gpuAdapter!); - jsepInit('webgpu', [ - // backend - backend, + jsepInit('webgpu', [ + // backend + backend, + + // jsepAlloc() + (size: number) => backend.alloc(Number(size)), - // jsepAlloc() - (size: number) => backend.alloc(Number(size)), + // jsepFree() + (ptr: number) => backend.free(ptr), - // jsepFree() - (ptr: number) => backend.free(ptr), + // jsepCopy(src, dst, size, isSourceGpu) + (src: number, dst: number, size: number, isSourceGpu = false) => { + if (isSourceGpu) { + LOG_DEBUG( + 'verbose', + () => `[WebGPU] jsepCopyGpuToGpu: src=${Number(src)}, dst=${Number(dst)}, size=${Number(size)}`, + ); + backend.memcpy(Number(src), Number(dst)); + } else { + LOG_DEBUG( + 'verbose', + () => + `[WebGPU] jsepCopyCpuToGpu: dataOffset=${Number(src)}, gpuDataId=${Number(dst)}, size=${Number(size)}`, + ); + const data = 
module.HEAPU8.subarray(Number(src >>> 0), Number(src >>> 0) + Number(size)); + backend.upload(Number(dst), data); + } + }, - // jsepCopy(src, dst, size, isSourceGpu) - (src: number, dst: number, size: number, isSourceGpu = false) => { - if (isSourceGpu) { + // jsepCopyAsync(src, dst, size) + async (gpuDataId: number, dataOffset: number, size: number): Promise => { LOG_DEBUG( 'verbose', - () => `[WebGPU] jsepCopyGpuToGpu: src=${Number(src)}, dst=${Number(dst)}, size=${Number(size)}`, + () => `[WebGPU] jsepCopyGpuToCpu: gpuDataId=${gpuDataId}, dataOffset=${dataOffset}, size=${size}`, ); - backend.memcpy(Number(src), Number(dst)); - } else { - LOG_DEBUG( - 'verbose', - () => - `[WebGPU] jsepCopyCpuToGpu: dataOffset=${Number(src)}, gpuDataId=${Number(dst)}, size=${Number(size)}`, - ); - const data = module.HEAPU8.subarray(Number(src >>> 0), Number(src >>> 0) + Number(size)); - backend.upload(Number(dst), data); - } - }, - // jsepCopyAsync(src, dst, size) - async (gpuDataId: number, dataOffset: number, size: number): Promise => { - LOG_DEBUG( - 'verbose', - () => `[WebGPU] jsepCopyGpuToCpu: gpuDataId=${gpuDataId}, dataOffset=${dataOffset}, size=${size}`, - ); - - await backend.download(Number(gpuDataId), () => - module.HEAPU8.subarray(Number(dataOffset) >>> 0, Number(dataOffset + size) >>> 0), - ); - }, + await backend.download(Number(gpuDataId), () => + module.HEAPU8.subarray(Number(dataOffset) >>> 0, Number(dataOffset + size) >>> 0), + ); + }, - // jsepCreateKernel - (kernelType: string, kernelId: number, attribute: unknown) => - backend.createKernel( - kernelType, - Number(kernelId), - attribute, - module.UTF8ToString(module._JsepGetNodeName!(Number(kernelId))), - ), + // jsepCreateKernel + (kernelType: string, kernelId: number, attribute: unknown) => + backend.createKernel( + kernelType, + Number(kernelId), + attribute, + module.UTF8ToString(module._JsepGetNodeName!(Number(kernelId))), + ), - // jsepReleaseKernel - (kernel: number) => backend.releaseKernel(kernel), + // jsepReleaseKernel + (kernel: number) => backend.releaseKernel(kernel), - // jsepRun - (kernel: number, contextDataOffset: number, sessionHandle: number, errors: Array>) => { - LOG_DEBUG( - 'verbose', - () => - `[WebGPU] jsepRun: sessionHandle=${sessionHandle}, kernel=${kernel}, contextDataOffset=${contextDataOffset}`, - ); - const context = new ComputeContextImpl(module, backend, Number(contextDataOffset)); - return backend.computeKernel(Number(kernel), context, errors); - }, - // jsepCaptureBegin - () => backend.captureBegin(), - // jsepCaptureEnd - () => backend.captureEnd(), - // jsepReplay - () => backend.replay(), - ]); + // jsepRun + (kernel: number, contextDataOffset: number, sessionHandle: number, errors: Array>) => { + LOG_DEBUG( + 'verbose', + () => + `[WebGPU] jsepRun: sessionHandle=${sessionHandle}, kernel=${kernel}, contextDataOffset=${contextDataOffset}`, + ); + const context = new ComputeContextImpl(module, backend, Number(contextDataOffset)); + return backend.computeKernel(Number(kernel), context, errors); + }, + // jsepCaptureBegin + () => backend.captureBegin(), + // jsepCaptureEnd + () => backend.captureEnd(), + // jsepReplay + () => backend.replay(), + ]); + } } else { const backend = new WebNNBackend(env); jsepInit('webnn', [ diff --git a/js/web/lib/wasm/session-options.ts b/js/web/lib/wasm/session-options.ts index 17e564247863d..89a4484e5a1c4 100644 --- a/js/web/lib/wasm/session-options.ts +++ b/js/web/lib/wasm/session-options.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. 
All rights reserved. // Licensed under the MIT License. -import { InferenceSession } from 'onnxruntime-common'; +import type { InferenceSession } from 'onnxruntime-common'; import { getInstance } from './wasm-factory'; import { allocWasmString, checkLastError, iterateExtraOptions } from './wasm-utils'; @@ -54,13 +54,28 @@ const appendDefaultOptions = (options: InferenceSession.SessionOptions): void => } }; -const setExecutionProviders = ( +const appendSessionConfig = (sessionOptionsHandle: number, key: string, value: string, allocs: number[]): void => { + const keyDataOffset = allocWasmString(key, allocs); + const valueDataOffset = allocWasmString(value, allocs); + if (getInstance()._OrtAddSessionConfigEntry(sessionOptionsHandle, keyDataOffset, valueDataOffset) !== 0) { + checkLastError(`Can't set a session config entry: ${key} - ${value}.`); + } +}; + +const appendEpOption = (epOptions: Array<[number, number]>, key: string, value: string, allocs: number[]): void => { + const keyDataOffset = allocWasmString(key, allocs); + const valueDataOffset = allocWasmString(value, allocs); + epOptions.push([keyDataOffset, valueDataOffset]); +}; + +const setExecutionProviders = async ( sessionOptionsHandle: number, executionProviders: readonly InferenceSession.ExecutionProviderConfig[], allocs: number[], -): void => { +): Promise => { for (const ep of executionProviders) { let epName = typeof ep === 'string' ? ep : ep.name; + const epOptions: Array<[number, number]> = []; // check EP name switch (epName) { @@ -71,26 +86,44 @@ const setExecutionProviders = ( // const context = (webnnOptions as InferenceSession.WebNNOptionsWithMLContext)?.context; const deviceType = (webnnOptions as InferenceSession.WebNNContextOptions)?.deviceType; if (deviceType) { - const keyDataOffset = allocWasmString('deviceType', allocs); - const valueDataOffset = allocWasmString(deviceType, allocs); - if (getInstance()._OrtAddSessionConfigEntry(sessionOptionsHandle, keyDataOffset, valueDataOffset) !== 0) { - checkLastError(`Can't set a session config entry: 'deviceType' - ${deviceType}.`); - } + appendSessionConfig(sessionOptionsHandle, 'deviceType', deviceType, allocs); } } break; case 'webgpu': - epName = 'JS'; - if (typeof ep !== 'string') { - const webgpuOptions = ep as InferenceSession.WebGpuExecutionProviderOption; - if (webgpuOptions?.preferredLayout) { - if (webgpuOptions.preferredLayout !== 'NCHW' && webgpuOptions.preferredLayout !== 'NHWC') { - throw new Error(`preferredLayout must be either 'NCHW' or 'NHWC': ${webgpuOptions.preferredLayout}`); + if (BUILD_DEFS.USE_WEBGPU_EP) { + epName = 'WebGPU'; + let customDevice: GPUDevice | undefined; + + if (typeof ep !== 'string') { + const customOptions = ep as unknown as { device: GPUDevice }; + if (customOptions.device) { + if (typeof GPUDevice !== 'undefined' && customOptions.device instanceof GPUDevice) { + customDevice = customOptions.device; + } else { + throw new Error('Invalid GPU device set in WebGPU EP options.'); + } } - const keyDataOffset = allocWasmString('preferredLayout', allocs); - const valueDataOffset = allocWasmString(webgpuOptions.preferredLayout, allocs); - if (getInstance()._OrtAddSessionConfigEntry(sessionOptionsHandle, keyDataOffset, valueDataOffset) !== 0) { - checkLastError(`Can't set a session config entry: 'preferredLayout' - ${webgpuOptions.preferredLayout}.`); + + // TODO: handle more options + } + + const info = getInstance().webgpuRegisterDevice!(customDevice); + if (info) { + const [deviceId, instanceHandle, deviceHandle] = info; + 
appendEpOption(epOptions, 'deviceId', deviceId.toString(), allocs); + appendEpOption(epOptions, 'webgpuInstance', instanceHandle.toString(), allocs); + appendEpOption(epOptions, 'webgpuDevice', deviceHandle.toString(), allocs); + } + } else { + epName = 'JS'; + if (typeof ep !== 'string') { + const webgpuOptions = ep as InferenceSession.WebGpuExecutionProviderOption; + if (webgpuOptions?.preferredLayout) { + if (webgpuOptions.preferredLayout !== 'NCHW' && webgpuOptions.preferredLayout !== 'NHWC') { + throw new Error(`preferredLayout must be either 'NCHW' or 'NHWC': ${webgpuOptions.preferredLayout}`); + } + appendSessionConfig(sessionOptionsHandle, 'preferredLayout', webgpuOptions.preferredLayout, allocs); } } } @@ -103,13 +136,34 @@ const setExecutionProviders = ( } const epNameDataOffset = allocWasmString(epName, allocs); - if (getInstance()._OrtAppendExecutionProvider(sessionOptionsHandle, epNameDataOffset) !== 0) { + const epOptionsCount = epOptions.length; + let keysOffset = 0; + let valuesOffset = 0; + if (epOptionsCount > 0) { + keysOffset = getInstance()._malloc(epOptionsCount * getInstance().PTR_SIZE); + allocs.push(keysOffset); + valuesOffset = getInstance()._malloc(epOptionsCount * getInstance().PTR_SIZE); + allocs.push(valuesOffset); + for (let i = 0; i < epOptionsCount; i++) { + getInstance().setValue(keysOffset + i * getInstance().PTR_SIZE, epOptions[i][0], '*'); + getInstance().setValue(valuesOffset + i * getInstance().PTR_SIZE, epOptions[i][1], '*'); + } + } + if ( + (await getInstance()._OrtAppendExecutionProvider( + sessionOptionsHandle, + epNameDataOffset, + keysOffset, + valuesOffset, + epOptionsCount, + )) !== 0 + ) { checkLastError(`Can't append execution provider: ${epName}.`); } } }; -export const setSessionOptions = (options?: InferenceSession.SessionOptions): [number, number[]] => { +export const setSessionOptions = async (options?: InferenceSession.SessionOptions): Promise<[number, number[]]> => { const wasm = getInstance(); let sessionOptionsHandle = 0; const allocs: number[] = []; @@ -155,20 +209,19 @@ export const setSessionOptions = (options?: InferenceSession.SessionOptions): [n } if (sessionOptions.executionProviders) { - setExecutionProviders(sessionOptionsHandle, sessionOptions.executionProviders, allocs); + await setExecutionProviders(sessionOptionsHandle, sessionOptions.executionProviders, allocs); } if (sessionOptions.enableGraphCapture !== undefined) { if (typeof sessionOptions.enableGraphCapture !== 'boolean') { throw new Error(`enableGraphCapture must be a boolean value: ${sessionOptions.enableGraphCapture}`); } - const keyDataOffset = allocWasmString('enableGraphCapture', allocs); - const valueDataOffset = allocWasmString(sessionOptions.enableGraphCapture.toString(), allocs); - if (wasm._OrtAddSessionConfigEntry(sessionOptionsHandle, keyDataOffset, valueDataOffset) !== 0) { - checkLastError( - `Can't set a session config entry: 'enableGraphCapture' - ${sessionOptions.enableGraphCapture}.`, - ); - } + appendSessionConfig( + sessionOptionsHandle, + 'enableGraphCapture', + sessionOptions.enableGraphCapture.toString(), + allocs, + ); } if (sessionOptions.freeDimensionOverrides) { @@ -188,12 +241,7 @@ export const setSessionOptions = (options?: InferenceSession.SessionOptions): [n if (sessionOptions.extra !== undefined) { iterateExtraOptions(sessionOptions.extra, '', new WeakSet>(), (key, value) => { - const keyDataOffset = allocWasmString(key, allocs); - const valueDataOffset = allocWasmString(value, allocs); - - if 
(wasm._OrtAddSessionConfigEntry(sessionOptionsHandle, keyDataOffset, valueDataOffset) !== 0) { - checkLastError(`Can't set a session config entry: ${key} - ${value}.`); - } + appendSessionConfig(sessionOptionsHandle, key, value, allocs); }); } diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts index 4bccfa76fdda3..dbcf80adf3552 100644 --- a/js/web/lib/wasm/wasm-core-impl.ts +++ b/js/web/lib/wasm/wasm-core-impl.ts @@ -102,11 +102,20 @@ export const initRuntime = async (env: Env): Promise => { * @param epName */ export const initEp = async (env: Env, epName: string): Promise => { + // initialize ASYNCIFY support + getInstance().asyncInit?.(); + + if (epName === 'webgpu' && BUILD_DEFS.USE_WEBGPU_EP) { + getInstance().webgpuInit!((device) => { + env.webgpu.device = device; + }); + } + if (!BUILD_DEFS.DISABLE_JSEP) { // eslint-disable-next-line @typescript-eslint/no-require-imports, @typescript-eslint/no-var-requires const initJsep = require('./jsep/init').init; - if (epName === 'webgpu') { + if (epName === 'webgpu' && !BUILD_DEFS.USE_WEBGPU_EP) { // perform WebGPU availability check if (typeof navigator === 'undefined' || !navigator.gpu) { throw new Error('WebGPU is not supported in current environment'); @@ -270,7 +279,7 @@ export const createSession = async ( const outputNamesUTF8Encoded = []; try { - [sessionOptionsHandle, allocs] = setSessionOptions(options); + [sessionOptionsHandle, allocs] = await setSessionOptions(options); if (options?.externalData && wasm.mountExternalData) { const loadingPromises = []; @@ -278,7 +287,7 @@ export const createSession = async ( const path = typeof file === 'string' ? file : file.path; loadingPromises.push( loadFile(typeof file === 'string' ? file : file.data).then((data) => { - wasm.mountExternalData!(path, data); + wasm.mountExternalData(path, data); }), ); } @@ -312,6 +321,7 @@ export const createSession = async ( } sessionHandle = await wasm._OrtCreateSession(modelDataOffset, modelDataLength, sessionOptionsHandle); + wasm.webgpuOnCreateSession?.(sessionHandle); if (sessionHandle === 0) { checkLastError("Can't create a session."); } @@ -444,6 +454,7 @@ export const releaseSession = (sessionId: number): void => { } wasm.jsepOnReleaseSession?.(sessionId); + wasm.webgpuOnReleaseSession?.(sessionId); inputNamesUTF8Encoded.forEach((buf) => wasm._OrtFree(buf)); outputNamesUTF8Encoded.forEach((buf) => wasm._OrtFree(buf)); @@ -491,11 +502,20 @@ export const prepareInputOutputTensor = async ( const gpuBuffer = tensor[2].gpuBuffer; dataByteLength = calculateTensorSizeInBytes(tensorDataTypeStringToEnum(dataType), dims)!; - const registerBuffer = wasm.jsepRegisterBuffer; - if (!registerBuffer) { - throw new Error('Tensor location "gpu-buffer" is not supported without using WebGPU.'); + if (BUILD_DEFS.USE_WEBGPU_EP) { + const registerBuffer = wasm.webgpuRegisterBuffer; + if (!registerBuffer) { + throw new Error('Tensor location "gpu-buffer" is not supported without using WebGPU.'); + } + + rawData = registerBuffer(gpuBuffer, sessionId); + } else { + const registerBuffer = wasm.jsepRegisterBuffer; + if (!registerBuffer) { + throw new Error('Tensor location "gpu-buffer" is not supported without using WebGPU.'); + } + rawData = registerBuffer(sessionId, index, gpuBuffer, dataByteLength); } - rawData = registerBuffer(sessionId, index, gpuBuffer, dataByteLength); } else if (location === 'ml-tensor') { const mlTensor = tensor[2].mlTensor as MLTensor; dataByteLength = calculateTensorSizeInBytes(tensorDataTypeStringToEnum(dataType), dims)!; 
@@ -791,7 +811,7 @@ export const run = async ( // If a certain output's preferred location is GPU but the tensor is empty, we still need to create a CPU // tensor for it. There is no mapping GPU buffer for an empty tensor. if (preferredLocation === 'gpu-buffer' && size > 0) { - const getBuffer = wasm.jsepGetBuffer; + const getBuffer = BUILD_DEFS.USE_WEBGPU_EP ? wasm.webgpuGetBuffer : wasm.jsepGetBuffer; if (!getBuffer) { throw new Error('preferredLocation "gpu-buffer" is not supported without using WebGPU.'); } @@ -804,20 +824,43 @@ export const run = async ( // do not release the tensor right now. it will be released when user calls tensor.dispose(). keepOutputTensor = true; - output.push([ - type, - dims, - { - gpuBuffer, - download: wasm.jsepCreateDownloader!(gpuBuffer, bufferSize, type), - dispose: () => { - if (wasm._OrtReleaseTensor(tensor) !== 0) { - checkLastError("Can't release tensor."); - } + if (BUILD_DEFS.USE_WEBGPU_EP) { + wasm.webgpuRegisterBuffer!(gpuBuffer, sessionId, dataOffset); + const downloadDataFunction = wasm.webgpuCreateDownloader!(gpuBuffer, bufferSize, sessionId); + output.push([ + type, + dims, + { + gpuBuffer, + download: async () => { + const arrayBuffer = await downloadDataFunction(); + const data = new (tensorTypeToTypedArrayConstructor(type!))(arrayBuffer); + return data as Tensor.DataTypeMap[Tensor.GpuBufferDataTypes]; + }, + dispose: () => { + if (wasm._OrtReleaseTensor(tensor) !== 0) { + checkLastError("Can't release tensor."); + } + }, }, - }, - 'gpu-buffer', - ]); + 'gpu-buffer', + ]); + } else { + output.push([ + type, + dims, + { + gpuBuffer, + download: wasm.jsepCreateDownloader!(gpuBuffer, bufferSize, type), + dispose: () => { + if (wasm._OrtReleaseTensor(tensor) !== 0) { + checkLastError("Can't release tensor."); + } + }, + }, + 'gpu-buffer', + ]); + } } else if (preferredLocation === 'ml-tensor' && size > 0) { const ensureTensor = wasm.jsepEnsureTensor; if (!ensureTensor) { @@ -887,6 +930,18 @@ export const run = async ( } finally { wasm.stackRestore(beforeRunStack); + if (BUILD_DEFS.USE_WEBGPU_EP) { + inputTensors.forEach((t) => { + if (t && t[3] === 'gpu-buffer') { + wasm.webgpuUnregisterBuffer!(t[2].gpuBuffer); + } + }); + outputTensors.forEach((t) => { + if (t && t[3] === 'gpu-buffer') { + wasm.webgpuUnregisterBuffer!(t[2].gpuBuffer); + } + }); + } inputTensorHandles.forEach((v) => wasm._OrtReleaseTensor(v)); outputTensorHandles.forEach((v) => wasm._OrtReleaseTensor(v)); inputOutputAllocs.forEach((p) => wasm._free(p)); diff --git a/js/web/lib/wasm/wasm-types.ts b/js/web/lib/wasm/wasm-types.ts index b4871e145f4d7..9b2ec71fd351d 100644 --- a/js/web/lib/wasm/wasm-types.ts +++ b/js/web/lib/wasm/wasm-types.ts @@ -41,18 +41,6 @@ export declare namespace JSEP { type DownloadTensorFunction = (tensorId: number, dstBuffer: ArrayBufferView | ArrayBuffer) => Promise; export interface Module extends WebGpuModule, WebNnModule { - /** - * Mount the external data file to an internal map, which will be used during session initialization. - * - * @param externalDataFilePath - specify the relative path of the external data file. - * @param externalDataFileData - specify the content data. - */ - mountExternalData(externalDataFilePath: string, externalDataFileData: Uint8Array): void; - /** - * Unmount all external data files from the internal map. - */ - unmountExternalData(): void; - /** * This is the entry of JSEP initialization. This function is called once when initializing ONNX Runtime per * backend. This function initializes Asyncify support. 
If name is 'webgpu', also initializes WebGPU backend and @@ -294,6 +282,21 @@ export declare namespace JSEP { } } +export declare namespace WebGpu { + export interface Module { + webgpuInit(setDefaultDevice: (device: GPUDevice) => void): void; + webgpuRegisterDevice( + device?: GPUDevice, + ): undefined | [deviceId: number, instanceHandle: number, deviceHandle: number]; + webgpuOnCreateSession(sessionHandle: number): void; + webgpuOnReleaseSession(sessionHandle: number): void; + webgpuRegisterBuffer(buffer: GPUBuffer, sessionHandle: number, bufferHandle?: number): number; + webgpuUnregisterBuffer(buffer: GPUBuffer): void; + webgpuGetBuffer(bufferHandle: number): GPUBuffer; + webgpuCreateDownloader(gpuBuffer: GPUBuffer, size: number, sessionHandle: number): () => Promise; + } +} + export interface OrtInferenceAPIs { _OrtInit(numThreads: number, loggingLevel: number): number; @@ -358,7 +361,13 @@ export interface OrtInferenceAPIs { logVerbosityLevel: number, optimizedModelFilePath: number, ): number; - _OrtAppendExecutionProvider(sessionOptionsHandle: number, name: number): number; + _OrtAppendExecutionProvider( + sessionOptionsHandle: number, + name: number, + providerOptionsKeys: number, + providerOptionsValues: number, + numKeys: number, + ): Promise; _OrtAddFreeDimensionOverride(sessionOptionsHandle: number, name: number, dim: number): number; _OrtAddSessionConfigEntry(sessionOptionsHandle: number, configKey: number, configValue: number): number; _OrtReleaseSessionOptions(sessionOptionsHandle: number): number; @@ -373,8 +382,11 @@ export interface OrtInferenceAPIs { /** * The interface of the WebAssembly module for ONNX Runtime, compiled from C++ source code by Emscripten. */ -export interface OrtWasmModule extends EmscriptenModule, OrtInferenceAPIs, Partial { - PTR_SIZE: number; +export interface OrtWasmModule + extends EmscriptenModule, + OrtInferenceAPIs, + Partial, + Partial { // #region emscripten functions stackSave(): number; stackRestore(stack: number): void; @@ -387,7 +399,31 @@ export interface OrtWasmModule extends EmscriptenModule, OrtInferenceAPIs, Parti stringToUTF8(str: string, offset: number, maxBytes: number): void; // #endregion + // #region ORT shared + + readonly PTR_SIZE: 4 | 8; + + /** + * Mount the external data file to an internal map, which will be used during session initialization. + * + * @param externalDataFilePath - specify the relative path of the external data file. + * @param externalDataFileData - specify the content data. + */ + mountExternalData(externalDataFilePath: string, externalDataFileData: Uint8Array): void; + /** + * Unmount all external data files from the internal map. + */ + unmountExternalData(): void; + + /** + * This function patches the WebAssembly module to support Asyncify. This function should be called at least once + * before any ORT API is called. + */ + asyncInit?(): void; + + // #endregion + // #region config - numThreads?: number; + readonly numThreads?: number; // #endregion } diff --git a/js/web/script/build.ts b/js/web/script/build.ts index 6006de62b41b6..fd9224a2dcf8b 100644 --- a/js/web/script/build.ts +++ b/js/web/script/build.ts @@ -27,7 +27,8 @@ const args = minimist(process.argv.slice(2)); * --bundle-mode=node * Build a single ort-web bundle for nodejs. 
*/ -const BUNDLE_MODE: 'prod' | 'dev' | 'perf' | 'node' = args['bundle-mode'] || 'prod'; +const BUNDLE_MODE: 'prod' | 'dev' | 'perf' | 'node' = + process.env.npm_config_bundle_mode || args['bundle-mode'] || 'prod'; /** * --debug @@ -41,7 +42,18 @@ const BUNDLE_MODE: 'prod' | 'dev' | 'perf' | 'node' = args['bundle-mode'] || 'pr * Enable debug mode. In this mode, esbuild metafile feature will be enabled. Full bundle analysis will be saved to a * file as JSON. */ -const DEBUG = args.debug; // boolean|'verbose'|'save' +const DEBUG = process.env.npm_config_debug || args.debug; // boolean|'verbose'|'save' + +/** + * --webgpu-ep + * --no-webgpu-ep (default) + * + * Enable or disable the use of WebGPU EP. If enabled, the WebGPU EP will be used. If disabled, the WebGPU backend will + * be used with JSEP. + * + * (temporary) This flag is used to test the WebGPU EP integration. It will be removed in the future. + */ +const USE_WEBGPU_EP = process.env.npm_config_webgpu_ep ?? args['webgpu-ep'] ?? true; /** * Root folder of the source code: `/js/` @@ -57,6 +69,7 @@ const DEFAULT_DEFINE = { 'BUILD_DEFS.DISABLE_WASM': 'false', 'BUILD_DEFS.DISABLE_WASM_PROXY': 'false', 'BUILD_DEFS.ENABLE_BUNDLE_WASM_JS': 'false', + 'BUILD_DEFS.USE_WEBGPU_EP': JSON.stringify(!!USE_WEBGPU_EP), 'BUILD_DEFS.IS_ESM': 'false', 'BUILD_DEFS.ESM_IMPORT_META_URL': 'undefined', diff --git a/onnxruntime/core/framework/external_data_loader.cc b/onnxruntime/core/framework/external_data_loader.cc index fe73a55735631..c577805e69cc4 100644 --- a/onnxruntime/core/framework/external_data_loader.cc +++ b/onnxruntime/core/framework/external_data_loader.cc @@ -60,7 +60,12 @@ common::Status LoadWebAssemblyExternalData(const Env& env, break; case 1: // Load external data to GPU. - Module.jsepUploadExternalBuffer(dataIdOrBuffer, data); + // TODO: use a unified interface for upload external buffer. + if (Module.webgpuUploadExternalBuffer) { + Module.webgpuUploadExternalBuffer(dataIdOrBuffer, data); + } else { + Module.jsepUploadExternalBuffer(dataIdOrBuffer, data); + } break; default: return 4; // Unknown error occurred in memory copy. diff --git a/onnxruntime/core/framework/external_data_loader.h b/onnxruntime/core/framework/external_data_loader.h index 117da7d0a4afa..90d48ca800797 100644 --- a/onnxruntime/core/framework/external_data_loader.h +++ b/onnxruntime/core/framework/external_data_loader.h @@ -42,7 +42,7 @@ class IExternalDataLoader { enum class ExternalDataLoadType { CPU = 0, -#if defined(USE_JSEP) +#if defined(USE_JSEP) || defined(USE_WEBGPU) WEBGPU_BUFFER = 1, #endif }; diff --git a/onnxruntime/core/providers/webgpu/external_data_loader.cc b/onnxruntime/core/providers/webgpu/external_data_loader.cc new file mode 100644 index 0000000000000..6da9598b146f5 --- /dev/null +++ b/onnxruntime/core/providers/webgpu/external_data_loader.cc @@ -0,0 +1,40 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#if defined(__wasm__) + +#include + +#include "core/framework/tensor.h" +#include "core/providers/webgpu/external_data_loader.h" + +namespace onnxruntime { +namespace webgpu { + +bool ExternalDataLoader::CanLoad(const OrtMemoryInfo& target_memory_info) const { + return target_memory_info.device.Type() == OrtDevice::CPU || + (target_memory_info.device.Type() == OrtDevice::GPU && target_memory_info.name == WEBGPU_BUFFER); +} + +common::Status ExternalDataLoader::LoadTensor(const Env& env, + const std::filesystem::path& data_file_path, + FileOffsetType data_offset, + SafeInt data_length, + Tensor& tensor) const { + ExternalDataLoadType load_type; + if (tensor.Location().device.Type() == OrtDevice::CPU) { + load_type = ExternalDataLoadType::CPU; + } else if (tensor.Location().device.Type() == OrtDevice::GPU && + tensor.Location().name == WEBGPU_BUFFER) { + load_type = ExternalDataLoadType::WEBGPU_BUFFER; + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported tensor location: ", tensor.Location().ToString()); + } + + return LoadWebAssemblyExternalData(env, data_file_path, data_offset, data_length, load_type, tensor.MutableDataRaw()); +} + +} // namespace webgpu +} // namespace onnxruntime + +#endif diff --git a/onnxruntime/core/providers/webgpu/external_data_loader.h b/onnxruntime/core/providers/webgpu/external_data_loader.h new file mode 100644 index 0000000000000..7ced4e930bf7a --- /dev/null +++ b/onnxruntime/core/providers/webgpu/external_data_loader.h @@ -0,0 +1,30 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#if defined(__wasm__) + +#include "core/framework/external_data_loader.h" + +namespace onnxruntime { +namespace webgpu { + +class ExternalDataLoader : public IExternalDataLoader { + public: + ExternalDataLoader() {}; + ~ExternalDataLoader() {}; + + bool CanLoad(const OrtMemoryInfo& target_memory_info) const override; + + common::Status LoadTensor(const Env& env, + const std::filesystem::path& data_file_path, + FileOffsetType data_offset, + SafeInt data_length, + Tensor& tensor) const override; +}; + +} // namespace webgpu +} // namespace onnxruntime + +#endif diff --git a/onnxruntime/core/providers/webgpu/program.cc b/onnxruntime/core/providers/webgpu/program.cc index d1d4c242c4697..976b7927ac3dd 100644 --- a/onnxruntime/core/providers/webgpu/program.cc +++ b/onnxruntime/core/providers/webgpu/program.cc @@ -206,6 +206,26 @@ ProgramVariableDataType ToProgramVariableDataType(int32_t element_type, int comp } } +std::ostream& operator<<(std::ostream& os, ValidationMode mode) { + switch (mode) { + case ValidationMode::Disabled: + os << "Disabled"; + break; + case ValidationMode::WGPUOnly: + os << "WGPUOnly"; + break; + case ValidationMode::Basic: + os << "Basic"; + break; + case ValidationMode::Full: + os << "Full"; + break; + default: + os << "Unknown(" << static_cast(mode) << ")"; + } + return os; +} + namespace { TensorShape GetReducedShape(const TensorShape& shape, int component /* > 1 */) { ORT_ENFORCE(shape.NumDimensions() > 0 && shape.GetDims()[shape.NumDimensions() - 1] % component == 0, diff --git a/onnxruntime/core/providers/webgpu/program.h b/onnxruntime/core/providers/webgpu/program.h index 7bfd9e8800099..95fef36144025 100644 --- a/onnxruntime/core/providers/webgpu/program.h +++ b/onnxruntime/core/providers/webgpu/program.h @@ -237,6 +237,7 @@ enum class ValidationMode { Basic, Full }; +std::ostream& operator<<(std::ostream& os, ValidationMode mode); namespace details { class 
ProgramWrapper; diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc index 163dd691b7f16..e8e93a9cb6a8f 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc @@ -134,6 +134,8 @@ void WebGpuContext::Initialize(const WebGpuBufferCacheConfig& buffer_cache_confi ORT_ENFORCE(device_ != nullptr, "Failed to get a WebGPU device."); } + LOGS_DEFAULT(VERBOSE) << "WebGPU EP Context is created for: Instance=" << instance_.Get() << ", Device=" << device_.Get() << "."; + // cache adapter info ORT_ENFORCE(Device().GetAdapterInfo(&adapter_info_)); // cache device limits @@ -708,45 +710,46 @@ WebGpuContext& WebGpuContextFactory::CreateContext(const WebGpuContextConfig& co WGPUInstance instance = config.instance; WGPUDevice device = config.device; - if (context_id == 0) { - // context ID is preserved for the default context. User cannot use context ID 0 as a custom context. - ORT_ENFORCE(instance == nullptr && device == nullptr, - "WebGPU EP default context (contextId=0) must not have custom WebGPU instance or device."); - - std::call_once(init_default_flag_, [ + std::call_once(init_default_flag_, [ #if !defined(__wasm__) - dawn_proc_table = config.dawn_proc_table + dawn_proc_table = config.dawn_proc_table #endif - ]() { - // Step.1 - setup dawn proc table (only for non-WASM build) + ]() { + // Step.1 - setup dawn proc table (only for non-WASM build) #if !defined(__wasm__) - const DawnProcTable* dawn_procs = reinterpret_cast(dawn_proc_table); + const DawnProcTable* dawn_procs = reinterpret_cast(dawn_proc_table); #if defined(BUILD_DAWN_MONOLITHIC_LIBRARY) - ORT_ENFORCE(dawn_procs == nullptr, "setting DawnProcTable is not allowed when dynamically linked to webgpu_dawn."); + ORT_ENFORCE(dawn_procs == nullptr, "setting DawnProcTable is not allowed when dynamically linked to webgpu_dawn."); #else #if !defined(USE_EXTERNAL_DAWN) - if (dawn_procs == nullptr) { - dawn_procs = &dawn::native::GetProcs(); - } + if (dawn_procs == nullptr) { + dawn_procs = &dawn::native::GetProcs(); + } #else - ORT_ENFORCE(dawn_procs != nullptr, "DawnProcTable must be provided."); + ORT_ENFORCE(dawn_procs != nullptr, "DawnProcTable must be provided."); #endif - dawnProcSetProcs(dawn_procs); + dawnProcSetProcs(dawn_procs); #endif #endif - // Step.2 - Create wgpu::Instance + // Step.2 - Create wgpu::Instance #if !defined(__wasm__) - wgpu::InstanceDescriptor instance_desc{}; - instance_desc.capabilities.timedWaitAnyEnable = true; - default_instance_ = wgpu::CreateInstance(&instance_desc); + wgpu::InstanceDescriptor instance_desc{}; + instance_desc.capabilities.timedWaitAnyEnable = true; + default_instance_ = wgpu::CreateInstance(&instance_desc); #else - default_instance_ = wgpu::CreateInstance(nullptr); + default_instance_ = wgpu::CreateInstance(nullptr); #endif - ORT_ENFORCE(default_instance_ != nullptr, "Failed to create wgpu::Instance."); - }); + ORT_ENFORCE(default_instance_ != nullptr, "Failed to create wgpu::Instance."); + }); + + if (context_id == 0) { + // context ID is preserved for the default context. User cannot use context ID 0 as a custom context. + ORT_ENFORCE(instance == nullptr && device == nullptr, + "WebGPU EP default context (contextId=0) must not have custom WebGPU instance or device."); + instance = default_instance_.Get(); } else { // for context ID > 0, user must provide custom WebGPU instance and device. 
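The wasm external-data path above reduces to a simple contract on the JS side: the loader hands the glue code a GPU buffer and the raw bytes from the external data file, and the glue copies the bytes into that buffer. Below is a minimal sketch of that upload step using a direct queue write; the actual glue added later in this patch (post-webgpu.js) instead stages through a mapped upload buffer padded to a 16-byte multiple, and `uploadExternalWeights` is an illustrative name, not part of the patch.

```ts
// Minimal sketch, assuming the destination buffer was created with COPY_DST usage.
function uploadExternalWeights(device: GPUDevice, gpuBuffer: GPUBuffer, data: Uint8Array): void {
  // writeBuffer requires a 4-byte-aligned size, so copy into a padded staging array first.
  const padded = new Uint8Array(Math.ceil(data.byteLength / 4) * 4);
  padded.set(data);
  device.queue.writeBuffer(gpuBuffer, 0, padded);
}
```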
@@ -800,5 +803,9 @@ void CleanupWebGpuContexts() { WebGpuContextFactory::Cleanup(); } +WGPUDevice GetDevice(int context_id) { + return WebGpuContextFactory::GetContext(context_id).Device().Get(); +} + } // namespace webgpu } // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index 87383fe197477..b5a663fb7c455 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -23,6 +23,7 @@ #include "core/providers/webgpu/webgpu_context.h" #include "core/providers/webgpu/data_transfer.h" +#include "core/providers/webgpu/external_data_loader.h" #include "core/providers/webgpu/webgpu_profiler.h" namespace onnxruntime { @@ -821,6 +822,12 @@ std::unique_ptr WebGpuExecutionProvider::GetDataTran return std::make_unique(context_); } +#if defined(__wasm__) +std::unique_ptr WebGpuExecutionProvider::GetExternalDataLoader() const { + return std::make_unique(); +} +#endif + WebGpuExecutionProvider::~WebGpuExecutionProvider() { WebGpuContextFactory::ReleaseContext(context_id_); } diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h index 7a0ade97aa3df..dc25636821651 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h @@ -49,6 +49,9 @@ class WebGpuExecutionProvider : public IExecutionProvider { std::shared_ptr GetKernelRegistry() const override; std::unique_ptr GetDataTransfer() const override; +#if defined(__wasm__) + std::unique_ptr GetExternalDataLoader() const override; +#endif DataLayout GetPreferredLayout() const override { return preferred_data_layout_; } diff --git a/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc index 60c61b2ca5665..1d779152f91f3 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc @@ -151,6 +151,12 @@ std::shared_ptr WebGpuProviderFactoryCreator::Create( validation_mode, }; + LOGS_DEFAULT(VERBOSE) << "WebGPU EP Device ID: " << context_id; + LOGS_DEFAULT(VERBOSE) << "WebGPU EP WGPUInstance: " << webgpu_instance; + LOGS_DEFAULT(VERBOSE) << "WebGPU EP WGPUDevice: " << webgpu_device; + LOGS_DEFAULT(VERBOSE) << "WebGPU EP DawnProcTable: " << dawn_proc_table; + LOGS_DEFAULT(VERBOSE) << "WebGPU EP ValidationMode: " << validation_mode; + // // STEP.3 - prepare parameters for WebGPU context initialization. 
// diff --git a/onnxruntime/wasm/api.cc b/onnxruntime/wasm/api.cc index 7adfc6a2b2ccb..1ad35b51bb1c1 100644 --- a/onnxruntime/wasm/api.cc +++ b/onnxruntime/wasm/api.cc @@ -8,6 +8,14 @@ #include "core/session/onnxruntime_cxx_api.h" #include "api.h" +#ifdef USE_WEBGPU +namespace onnxruntime { +namespace webgpu { +WGPUDevice GetDevice(int); +} +} // namespace onnxruntime +#endif + #include #include #include @@ -164,8 +172,12 @@ OrtSessionOptions* OrtCreateSessionOptions(size_t graph_optimization_level, return UNREGISTER_AUTO_RELEASE(session_options); } -int OrtAppendExecutionProvider(ort_session_options_handle_t session_options, const char* name) { - return CHECK_STATUS(SessionOptionsAppendExecutionProvider, session_options, name, nullptr, nullptr, 0); +int OrtAppendExecutionProvider(ort_session_options_handle_t session_options, + const char* name, + const char* const* provider_options_keys, + const char* const* provider_options_values, + size_t num_keys) { + return CHECK_STATUS(SessionOptionsAppendExecutionProvider, session_options, name, provider_options_keys, provider_options_values, num_keys); } int OrtAddFreeDimensionOverride(ort_session_options_handle_t session_options, @@ -507,6 +519,16 @@ char* OrtEndProfiling(ort_session_handle_t session) { : nullptr; } +// WebGPU API Section + +#ifdef USE_WEBGPU + +WGPUDevice OrtGetWebGpuDevice(int device_id) { + return onnxruntime::webgpu::GetDevice(device_id); +} + +#endif + // Training API Section #ifdef ENABLE_TRAINING_APIS diff --git a/onnxruntime/wasm/api.h b/onnxruntime/wasm/api.h index f44c515d98f6b..9ff1eb55ecedc 100644 --- a/onnxruntime/wasm/api.h +++ b/onnxruntime/wasm/api.h @@ -10,6 +10,10 @@ #include +#ifdef USE_WEBGPU +#include +#endif + #include struct OrtSession; @@ -85,7 +89,10 @@ ort_session_options_handle_t EMSCRIPTEN_KEEPALIVE OrtCreateSessionOptions(size_t * @returns ORT error code. If not zero, call OrtGetLastError() to get detailed error message. */ int EMSCRIPTEN_KEEPALIVE OrtAppendExecutionProvider(ort_session_options_handle_t session_options, - const char* name); + const char* name, + const char* const* provider_options_keys, + const char* const* provider_options_values, + size_t num_keys); /** * add a free dimension override for one dimension of a session's input. @@ -294,6 +301,21 @@ int EMSCRIPTEN_KEEPALIVE OrtRun(ort_session_handle_t session, */ char* EMSCRIPTEN_KEEPALIVE OrtEndProfiling(ort_session_handle_t session); +// WebGPU API Section + +#ifdef USE_WEBGPU + +/** + * get the GPU Device by device ID. + * + * This function is only available after the GPU Device is initialized in WebGpuContextFactory. + * + * @returns a WGPUDevice handle. + */ +WGPUDevice EMSCRIPTEN_KEEPALIVE OrtGetWebGpuDevice(int device_id); + +#endif + // Training API Section #ifdef ENABLE_TRAINING_APIS diff --git a/onnxruntime/wasm/js_post_js.js b/onnxruntime/wasm/js_post_js.js index b77d82fbd7d10..be5a4d3c7415a 100644 --- a/onnxruntime/wasm/js_post_js.js +++ b/onnxruntime/wasm/js_post_js.js @@ -2,6 +2,6 @@ // Licensed under the MIT License. -'use strict'; +"use strict"; Module["PTR_SIZE"] = 4; diff --git a/onnxruntime/wasm/js_post_js_64.js b/onnxruntime/wasm/js_post_js_64.js index b140df927ebbd..b16383b746b8a 100644 --- a/onnxruntime/wasm/js_post_js_64.js +++ b/onnxruntime/wasm/js_post_js_64.js @@ -2,6 +2,6 @@ // Licensed under the MIT License. 
-'use strict'; +"use strict"; Module["PTR_SIZE"] = 8; diff --git a/onnxruntime/wasm/post-webgpu.js b/onnxruntime/wasm/post-webgpu.js new file mode 100644 index 0000000000000..e7631a97c34c6 --- /dev/null +++ b/onnxruntime/wasm/post-webgpu.js @@ -0,0 +1,263 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +"use strict"; + +// +// This file contains the post-run code for the ORT WebAssembly module. The code in this file will be injected into the +// final module using Emscripten's `--post-js` option. +// +// This file will only be used in build with flag `--use_webgpu`. + +/** + * This function is called only once when initializing the WebGPU backend. + * + * @param {(gpuDevice: GPUDevice) => void} setDefaultDevice A callback function to set the default device. + */ +Module["webgpuInit"] = (setDefaultDevice) => { + /** + * a map from GPUDevice to [deviceId, instanceHandle, deviceHandle] + * + * only stores custom devices (ie. devices created by the user, not the default device created by ORT) + * + * key is the GPUDevice object. + * + * value is a tuple of 3 elements: + * - deviceId: a unique ID for the device. Must be positive integer. + * - instanceHandle: the instance handle(pointer) of the device. + * - deviceHandle: the device handle(pointer) of the device. + * + * @type {WeakMap} + */ + const webgpuActiveDevices = new WeakMap(); + /** + * a number that is used to assign a unique ID to the next custom device. + */ + let webgpuNextDeviceId = 1; + /** + * a function to set the default device. + * + * @type {(gpuDevice: GPUDevice) => void} + */ + const webgpuSetDefaultDevice = setDefaultDevice; + /** + * the current device that is being used to create a WebGPU EP inference session. + * + * the value of this variable is only valid during the creation of a WebGPU EP inference session. + * + * @type {GPUDevice|undefined} + */ + let webgpuCurrentDevice = undefined; + /** + * the current device ID that is being used to create a WebGPU EP inference session. + * + * the value of this variable is only valid during the creation of a WebGPU EP inference session. + * + * @type {number|undefined} + */ + let webgpuCurrentDeviceId = undefined; + + /** + * This function is called only when a custom device is used, during preparation of session options. + * + * @param {GPUDevice} device the user provided device object. + * @returns {undefined|[number, number, number]} a tuple of device id, instance handle, and device handle. + */ + Module["webgpuRegisterDevice"] = (device) => { + if (webgpuCurrentDeviceId !== undefined) { + throw new Error("another WebGPU EP inference session is being created."); + } + + if (device) { + let deviceInfo = webgpuActiveDevices.get(device); + if (!deviceInfo) { + const instanceHandle = _wgpuCreateInstance(0); + const deviceHandle = WebGPU.importJsDevice(device, instanceHandle); + deviceInfo = [webgpuNextDeviceId++, instanceHandle, deviceHandle]; + webgpuActiveDevices.set(device, deviceInfo); + } + + // The current device ID is a temporary storage for the device ID to be used in the session that is being created. + // + // Soon after `webgpuRegisterDevice` (this function) is called, `webgpuOnCreateSession` will be called so that the + // value of `webgpuCurrentDeviceId` is used and reset then. 
+ webgpuCurrentDevice = device; + webgpuCurrentDeviceId = deviceInfo[0]; + return deviceInfo; + } else { + webgpuCurrentDevice = undefined; + webgpuCurrentDeviceId = 0; + return undefined; + } + }; + + const webgpuActiveSessions = new Map(); + Module["webgpuOnCreateSession"] = (sessionHandle) => { + if (webgpuCurrentDeviceId === undefined) { + // do nothing if webgpuCurrentDeviceId is undefined. + // this means no WebGPU EP is being created. + return; + } + + const deviceId = webgpuCurrentDeviceId; + webgpuCurrentDeviceId = undefined; + + if (sessionHandle) { + // when session created successfully + const deviceHandle = _OrtGetWebGpuDevice(deviceId); + webgpuActiveSessions.set(sessionHandle, deviceHandle); + + if (deviceId === 0) { + const device = webgpuCurrentDevice ?? WebGPU.getJsObject(deviceHandle); + webgpuSetDefaultDevice(device); + } + } + webgpuCurrentDevice = undefined; + }; + + Module["webgpuOnReleaseSession"] = (sessionHandle) => { + webgpuActiveSessions.delete(sessionHandle); + }; + + const gpuBufferMetadataSymbol = Symbol("gpuBufferMetadata"); + + Module["webgpuRegisterBuffer"] = (buffer, sessionHandle, bufferHandle) => { + const metadata = buffer[gpuBufferMetadataSymbol]; + if (bufferHandle) { + // This is a buffer that was created by ORT. Metadata is [bufferHandle, NaN] + + buffer[gpuBufferMetadataSymbol] = [bufferHandle, NaN]; + return bufferHandle; + } else { + // This is a buffer that was created by the user. Metadata is [bufferHandle, refCount] + + if (metadata) { + metadata[1]++; + return metadata[0]; + } + + const deviceHandle = webgpuActiveSessions.get(sessionHandle); + if (deviceHandle === undefined) { + throw new Error( + "Invalid session handle passed to webgpuRegisterBuffer" + ); + } + + const bufferHandle = WebGPU.importJsBuffer(buffer, deviceHandle); + buffer[gpuBufferMetadataSymbol] = [bufferHandle, 1]; + return bufferHandle; + } + }; + + Module["webgpuUnregisterBuffer"] = (buffer) => { + const metadata = buffer[gpuBufferMetadataSymbol]; + if (!metadata) { + throw new Error("Buffer is not registered"); + } + metadata[1]--; + // For buffers created by ORT, metadata[1] will always be NaN. This function will not release the buffer. + // Instead, the buffer will be released when user calls `Tensor.dispose()` in JavaScript. + if (metadata[1] === 0) { + _wgpuBufferRelease(metadata[0]); + delete buffer[gpuBufferMetadataSymbol]; + } + }; + + Module["webgpuGetBuffer"] = (bufferHandle) => { + return WebGPU.getJsObject(bufferHandle); + }; + + Module["webgpuCreateDownloader"] = (gpuBuffer, bufferSize, sessionHandle) => { + const deviceHandle = webgpuActiveSessions.get(sessionHandle); + if (deviceHandle === undefined) { + throw new Error("Invalid session handle passed to webgpuRegisterBuffer"); + } + + const buffer = gpuBuffer; + const device = WebGPU.getJsObject(deviceHandle); + const originalSize = bufferSize; + const size = Math.ceil(Number(originalSize) / 16) * 16; + + return async () => { + // prettier-ignore + // + // the line above is used to force prettier to skip formatting the next statement. + // this is because prettier will remove the quotes around the property names, but we need to keep them + // because otherwise closure compiler may rename them and break the code. 
+ const gpuReadBufferDescriptor = { + "size": size, + "usage": 9 /* GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ */, + }; + const gpuReadBuffer = device.createBuffer(gpuReadBufferDescriptor); + try { + const commandEncoder = device.createCommandEncoder(); + commandEncoder.copyBufferToBuffer( + buffer /* source buffer */, + 0 /* source offset */, + gpuReadBuffer /* destination buffer */, + 0 /* destination offset */, + size /* size */ + ); + device.queue.submit([commandEncoder.finish()]); + + await gpuReadBuffer.mapAsync(GPUMapMode.READ); + + const arrayBuffer = gpuReadBuffer.getMappedRange(); + return arrayBuffer.slice(0, originalSize); + } finally { + gpuReadBuffer.destroy(); + } + }; + }; + + // Setup a callback function for loading external buffers (model weights). + Module.webgpuUploadExternalBuffer = (bufferHandle, data) => { + const srcArrayBuffer = data.buffer; + const srcOffset = data.byteOffset; + const srcLength = data.byteLength; + const size = Math.ceil(Number(srcLength) / 16) * 16; + + const gpuBuffer = WebGPU.getJsObject(bufferHandle); + + // get current device + if (!webgpuCurrentDevice) { + const deviceHandle = _OrtGetWebGpuDevice(webgpuCurrentDeviceId); + webgpuCurrentDevice = WebGPU.getJsObject(deviceHandle); + } + + // create gpu buffer + + // prettier-ignore + // + // the line above is used to force prettier to skip formatting the next statement. + // this is because prettier will remove the quotes around the property names, but we need to keep them + // because otherwise closure compiler may rename them and break the code. + const gpuBufferForUploadingDescriptor = { + "mappedAtCreation": true, + "size": size, + "usage": 6 /* GPUBufferUsage.MAP_WRITE | GPUBufferUsage.COPY_SRC */, + }; + const gpuBufferForUploading = webgpuCurrentDevice.createBuffer( + gpuBufferForUploadingDescriptor + ); + + // copy (upload) data + const arrayBuffer = gpuBufferForUploading.getMappedRange(); + new Uint8Array(arrayBuffer).set( + new Uint8Array(srcArrayBuffer, srcOffset, srcLength) + ); + gpuBufferForUploading.unmap(); + + // GPU copy + const commandEncoder = webgpuCurrentDevice.createCommandEncoder(); + commandEncoder.copyBufferToBuffer( + gpuBufferForUploading, + 0, + gpuBuffer, + 0, + size + ); + webgpuCurrentDevice.queue.submit([commandEncoder.finish()]); + gpuBufferForUploading.destroy(); + }; +}; diff --git a/onnxruntime/wasm/pre-async.js b/onnxruntime/wasm/pre-async.js new file mode 100644 index 0000000000000..a1e66d854d296 --- /dev/null +++ b/onnxruntime/wasm/pre-async.js @@ -0,0 +1,142 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +"use strict"; + +// +// This file contains the pre-run code for the ORT WebAssembly module. The code in this file will be injected into the +// final module using Emscripten's `--pre-js` option. +// +// This file will only be used in build with flag `-s ASYNCIFY=1`. + +/** + * initialize for asyncify support. + */ +let initAsyncImpl = () => { + // This is a simplified version of cwrap() with options.async === true (-sASYNCIFY=1) + // It removes some overhead in cwarp() and ccall() that we don't need. + // + // Currently in ASYNCIFY build, we only use this for the following functions: + // - OrtCreateSession() + // - OrtRun() + // - OrtRunWithBinding() + // - OrtBindInput() + // + // Note: about parameters "getFunc" and "setFunc": + // - Emscripten has different behaviors for Debug and Release builds for generating exported function wrapper. 
+ // + // - In Debug build, it will generate a wrapper function for each exported function. For example, it generates a + // wrapper for OrtRun() like this (minified): + // ``` + // var _OrtRun = Module["_OrtRun"] = createExportWrapper("OrtRun"); + // ``` + // + // - In Release build, it will generate a lazy loading wrapper for each exported function. For example, it generates + // a wrapper for OrtRun() like this (minified): + // ``` + // d._OrtRun = (a, b, c, e, f, h, l, q) => (d._OrtRun = J.ka)(a, b, c, e, f, h, l, q); + // ``` + // + // The behaviors of these two wrappers are different. The debug build will assign `Module["_OrtRun"]` only once + // because `createExportWrapper()` does not reset `Module["_OrtRun"]` inside. The release build, however, will + // reset d._OrtRun to J.ka the first time it is called. + // + // The difference is important because we need to design the async wrapper in a way that it can handle both cases. + // + // Now, let's look at how the async wrapper is designed to work for both cases: + // + // - Debug build: + // 1. When the WebAssembly module is being loaded, `Module["_OrtRun"]` is assigned to `createExportWrapper("OrtRun")`. + // 2. The first time `Module["asyncInit"]` is called, `Module["_OrtRun"]` is re-assigned to a new async + // wrapper function. + // Value of `Module["_OrtRun"]` will not be changed again. + // + // - Release build: + // 1. When the WebAssembly module is being loaded, `Module["_OrtRun"]` is assigned to a lazy loading wrapper function. + // 2. The first time `Module["asyncInit"]` is called, `Module["_OrtRun"]` is re-assigned to a new async + // wrapper function. + // 3. The first time `Module["_OrtRun"]` is called, the async wrapper will be called. It will call into this + // function: + // ``` + // (a, b, c, e, f, h, l, q) => (d._OrtRun = J.ka)(a, b, c, e, f, h, l, q); + // ``` + // This function will assign d._OrtRun (i.e. the minimized `Module["_OrtRun"]`) to the real function (J.ka). + // 4. Since d._OrtRun is re-assigned, we need to update the async wrapper to re-assign its stored + // function to the updated value (J.ka), and re-assign the value of `d._OrtRun` back to the async wrapper. + // Value of `Module["_OrtRun"]` will not be changed again. + // + // The value of `Module["_OrtRun"]` will need to be assigned 2 times for the debug build and 4 times for the release + // build. + // + // This is why we need the `getFunc` and `setFunc` parameters. They are used to get the current value of an + // exported function and set the new value of an exported function. + // + const wrapAsync = (func, getFunc, setFunc) => { + return (...args) => { + // cache the async data before calling the function. + const previousAsync = Asyncify.currData; + + const previousFunc = getFunc?.(); + const ret = func(...args); + const newFunc = getFunc?.(); + if (previousFunc !== newFunc) { + // The exported function has been updated. + // Set the sync function reference to the new function. + func = newFunc; + // Set the exported function back to the async wrapper. + setFunc(previousFunc); + // Remove getFunc and setFunc. They are no longer needed. + setFunc = null; + getFunc = null; + } + + // If the async data has been changed, it means that the function started an async operation. + if (Asyncify.currData != previousAsync) { + // returns the promise + return Asyncify.whenDone(); + } + // the function is synchronous. returns the result. 
+ return ret; + }; + }; + + // replace the original functions with asyncified versions + Module["_OrtAppendExecutionProvider"] = wrapAsync( + Module["_OrtAppendExecutionProvider"], + () => Module["_OrtAppendExecutionProvider"], + (v) => (Module["_OrtAppendExecutionProvider"] = v) + ); + Module["_OrtCreateSession"] = wrapAsync( + Module["_OrtCreateSession"], + () => Module["_OrtCreateSession"], + (v) => (Module["_OrtCreateSession"] = v) + ); + Module["_OrtRun"] = wrapAsync( + Module["_OrtRun"], + () => Module["_OrtRun"], + (v) => (Module["_OrtRun"] = v) + ); + Module["_OrtRunWithBinding"] = wrapAsync( + Module["_OrtRunWithBinding"], + () => Module["_OrtRunWithBinding"], + (v) => (Module["_OrtRunWithBinding"] = v) + ); + Module["_OrtBindInput"] = wrapAsync( + Module["_OrtBindInput"], + () => Module["_OrtBindInput"], + (v) => (Module["_OrtBindInput"] = v) + ); + + // If JSEP is enabled, wrap OrtRun() and OrtRunWithBinding() with asyncify. + if (typeof jsepRunAsync !== "undefined") { + Module["_OrtRun"] = jsepRunAsync(Module["_OrtRun"]); + Module["_OrtRunWithBinding"] = jsepRunAsync(Module["_OrtRunWithBinding"]); + } + + // remove this function to make sure it is called only once. + initAsyncImpl = undefined; +}; + +Module["asyncInit"] = () => { + initAsyncImpl?.(); +}; diff --git a/onnxruntime/wasm/pre-jsep.js b/onnxruntime/wasm/pre-jsep.js index 0c83e71a921cb..a35ab129280c4 100644 --- a/onnxruntime/wasm/pre-jsep.js +++ b/onnxruntime/wasm/pre-jsep.js @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -'use strict'; +"use strict"; // // This file contains the pre-run code for the ORT WebAssembly module. The code in this file will be injected into the @@ -9,247 +9,151 @@ // // This file will only be used in build with flag `--use_jsep`. - -/** - * initialize JSEP for asyncify support. - */ -let jsepInitAsync = () => { - // This is a simplified version of cwrap() with options.async === true (-sASYNCIFY=1) - // It removes some overhead in cwarp() and ccall() that we don't need. - // - // Currently in JSEP build, we only use this for the following functions: - // - OrtRun() - // - OrtRunWithBinding() - // - OrtBindInput() - // - // Note: about parameters "getFunc" and "setFunc": - // - Emscripten has different behaviors for Debug and Release builds for generating exported function wrapper. - // - // - In Debug build, it will generate a wrapper function for each exported function. For example, it generates a - // wrapper for OrtRun() like this (minified): - // ``` - // var _OrtRun = Module["_OrtRun"] = createExportWrapper("OrtRun"); - // ``` - // - // - In Release build, it will generate a lazy loading wrapper for each exported function. For example, it generates - // a wrapper for OrtRun() like this (minified): - // ``` - // d._OrtRun = (a, b, c, e, f, h, l, q) => (d._OrtRun = J.ka)(a, b, c, e, f, h, l, q); - // ``` - // - // The behavior of these two wrappers are different. The debug build will assign `Module["_OrtRun"]` only once - // because `createExportWrapper()` does not reset `Module["_OrtRun"]` inside. The release build, however, will - // reset d._OrtRun to J.ka when the first time it is called. - // - // The difference is important because we need to design the async wrapper in a way that it can handle both cases. - // - // Now, let's look at how the async wrapper is designed to work for both cases: - // - // - Debug build: - // 1. 
When Web assembly is being loaded, `Module["_OrtRun"]` is assigned to `createExportWrapper("OrtRun")`. - // 2. When the first time `Module["jsepInit"]` is called, `Module["_OrtRun"]` is re-assigned to a new async - // wrapper function. - // Value of `Module["_OrtRun"]` will not be changed again. - // - // - Release build: - // 1. When Web assembly is being loaded, `Module["_OrtRun"]` is assigned to a lazy loading wrapper function. - // 2. When the first time `Module["jsepInit"]` is called, `Module["_OrtRun"]` is re-assigned to a new async - // wrapper function. - // 3. When the first time `Module["_OrtRun"]` is called, the async wrapper will be called. It will call into this - // function: - // ``` - // (a, b, c, e, f, h, l, q) => (d._OrtRun = J.ka)(a, b, c, e, f, h, l, q); - // ``` - // This function will assign d._OrtRun (ie. the minimized `Module["_OrtRun"]`) to the real function (J.ka). - // 4. Since d._OrtRun is re-assigned, we need to update the async wrapper to re-assign its stored - // function to the updated value (J.ka), and re-assign the value of `d._OrtRun` back to the async wrapper. - // Value of `Module["_OrtRun"]` will not be changed again. - // - // The value of `Module["_OrtRun"]` will need to be assigned for 2 times for debug build and 4 times for release - // build. - // - // This is why we need this `getFunc` and `setFunc` parameters. They are used to get the current value of an - // exported function and set the new value of an exported function. - // - const jsepWrapAsync = (func, getFunc, setFunc) => { - return (...args) => { - // cache the async data before calling the function. - const previousAsync = Asyncify.currData; - - const previousFunc = getFunc?.(); - const ret = func(...args); - const newFunc = getFunc?.(); - if (previousFunc !== newFunc) { - // The exported function has been updated. - // Set the sync function reference to the new function. - func = newFunc; - // Set the exported function back to the async wrapper. - setFunc(previousFunc); - // Remove getFunc and setFunc. They are no longer needed. - setFunc = null; - getFunc = null; +// This is a wrapper for OrtRun() and OrtRunWithBinding() to ensure that Promises are handled correctly. +const jsepRunAsync = (runAsyncFunc) => { + return async (...args) => { + try { + // Module.jsepSessionState should be null, unless we are in the middle of a session. + // If it is not null, it means that the previous session has not finished yet. + if (Module.jsepSessionState) { + throw new Error("Session already started"); } + const state = (Module.jsepSessionState = { + sessionHandle: args[0], + errors: [], + }); - // If the async data has been changed, it means that the function started an async operation. - if (Asyncify.currData != previousAsync) { - // returns the promise - return Asyncify.whenDone(); - } - // the function is synchronous. returns the result. - return ret; - }; - }; - - // This is a wrapper for OrtRun() and OrtRunWithBinding() to ensure that Promises are handled correctly. - const runAsync = (runAsyncFunc) => { - return async (...args) => { - try { - // Module.jsepSessionState should be null, unless we are in the middle of a session. - // If it is not null, it means that the previous session has not finished yet. 
- if (Module.jsepSessionState) { - throw new Error('Session already started'); - } - const state = Module.jsepSessionState = {sessionHandle: args[0], errors: []}; + // Run the acyncified function: OrtRun() or OrtRunWithBinding() + const ret = await runAsyncFunc(...args); - // Run the acyncified function: OrtRun() or OrtRunWithBinding() - const ret = await runAsyncFunc(...args); - - // Check if the session is still valid. this object should be the same as the one we set above. - if (Module.jsepSessionState !== state) { - throw new Error('Session mismatch'); - } + // Check if the session is still valid. this object should be the same as the one we set above. + if (Module.jsepSessionState !== state) { + throw new Error("Session mismatch"); + } - // Flush the backend. This will submit all pending commands to the GPU. - Module.jsepBackend?.['flush'](); + // Flush the backend. This will submit all pending commands to the GPU. + Module.jsepBackend?.["flush"](); - // Await all pending promises. This includes GPU validation promises for diagnostic purposes. - const errorPromises = state.errors; - if (errorPromises.length > 0) { - let errors = await Promise.all(errorPromises); - errors = errors.filter(e => e); - if (errors.length > 0) { - throw new Error(errors.join('\n')); - } + // Await all pending promises. This includes GPU validation promises for diagnostic purposes. + const errorPromises = state.errors; + if (errorPromises.length > 0) { + let errors = await Promise.all(errorPromises); + errors = errors.filter((e) => e); + if (errors.length > 0) { + throw new Error(errors.join("\n")); } - - return ret; - } finally { - Module.jsepSessionState = null; } - }; - }; - // replace the original functions with asyncified versions - Module['_OrtCreateSession'] = jsepWrapAsync( - Module['_OrtCreateSession'], - () => Module['_OrtCreateSession'], - v => Module['_OrtCreateSession'] = v); - Module['_OrtRun'] = runAsync(jsepWrapAsync( - Module['_OrtRun'], - () => Module['_OrtRun'], - v => Module['_OrtRun'] = v)); - Module['_OrtRunWithBinding'] = runAsync(jsepWrapAsync( - Module['_OrtRunWithBinding'], - () => Module['_OrtRunWithBinding'], - v => Module['_OrtRunWithBinding'] = v)); - Module['_OrtBindInput'] = jsepWrapAsync( - Module['_OrtBindInput'], - () => Module['_OrtBindInput'], - v => Module['_OrtBindInput'] = v); - - // remove this function to make sure it is called only once. - jsepInitAsync = undefined; + return ret; + } finally { + Module.jsepSessionState = null; + } + }; }; - /** - * initialize JSEP for WebGPU. + * initialize JSEP for WebGPU and WebNN. 
*/ -Module['jsepInit'] = (name, params) => { - jsepInitAsync?.(); - - if (name === 'webgpu') { - [Module.jsepBackend, - Module.jsepAlloc, - Module.jsepFree, - Module.jsepCopy, - Module.jsepCopyAsync, - Module.jsepCreateKernel, - Module.jsepReleaseKernel, - Module.jsepRunKernel, - Module.jsepCaptureBegin, - Module.jsepCaptureEnd, - Module.jsepReplay] = params; +Module["jsepInit"] = (name, params) => { + if (name === "webgpu") { + [ + Module.jsepBackend, + Module.jsepAlloc, + Module.jsepFree, + Module.jsepCopy, + Module.jsepCopyAsync, + Module.jsepCreateKernel, + Module.jsepReleaseKernel, + Module.jsepRunKernel, + Module.jsepCaptureBegin, + Module.jsepCaptureEnd, + Module.jsepReplay, + ] = params; // expose webgpu backend functions const backend = Module.jsepBackend; - Module['jsepRegisterBuffer'] = (sessionId, index, buffer, size) => { - return backend['registerBuffer'](sessionId, index, buffer, size); + Module["jsepRegisterBuffer"] = (sessionId, index, buffer, size) => { + return backend["registerBuffer"](sessionId, index, buffer, size); }; - Module['jsepGetBuffer'] = (dataId) => { - return backend['getBuffer'](dataId); + Module["jsepGetBuffer"] = (dataId) => { + return backend["getBuffer"](dataId); }; - Module['jsepCreateDownloader'] = (gpuBuffer, size, type) => { - return backend['createDownloader'](gpuBuffer, size, type); + Module["jsepCreateDownloader"] = (gpuBuffer, size, type) => { + return backend["createDownloader"](gpuBuffer, size, type); }; - Module['jsepOnCreateSession'] = sessionId => { - backend['onCreateSession'](sessionId); + Module["jsepOnCreateSession"] = (sessionId) => { + backend["onCreateSession"](sessionId); }; - Module['jsepOnReleaseSession'] = sessionId => { - backend['onReleaseSession'](sessionId); + Module["jsepOnReleaseSession"] = (sessionId) => { + backend["onReleaseSession"](sessionId); }; - Module['jsepOnRunStart'] = sessionId => { - return backend['onRunStart'](sessionId); + Module["jsepOnRunStart"] = (sessionId) => { + return backend["onRunStart"](sessionId); }; Module.jsepUploadExternalBuffer = (dataId, buffer) => { - backend['upload'](dataId, buffer); + backend["upload"](dataId, buffer); }; - } else if (name === 'webnn') { + } else if (name === "webnn") { // Functions called from EM_ASM need to be assigned in a way that can be minified. // Functions called via emscripten::val::module_property need to be assigned by name so that the minifier doesn't // change the name. - [Module.jsepBackend, - Module.jsepReserveTensorId, - Module.jsepReleaseTensorId, - Module['jsepEnsureTensor'], - Module.jsepUploadTensor, - Module['jsepDownloadTensor'], + [ + Module.jsepBackend, + Module.jsepReserveTensorId, + Module.jsepReleaseTensorId, + Module["jsepEnsureTensor"], + Module.jsepUploadTensor, + Module["jsepDownloadTensor"], ] = params; // This function is called from both JS and an EM_ASM block, it needs both a minifiable name and an explicit name. - Module['jsepReleaseTensorId'] = Module.jsepReleaseTensorId; - Module['jsepUploadTensor'] = Module.jsepUploadTensor; + Module["jsepReleaseTensorId"] = Module.jsepReleaseTensorId; + Module["jsepUploadTensor"] = Module.jsepUploadTensor; // Functions called from JS also need to have explicit names. 
const backend = Module.jsepBackend; - Module['jsepOnRunStart'] = sessionId => { - return backend['onRunStart'](sessionId); + Module["jsepOnRunStart"] = (sessionId) => { + return backend["onRunStart"](sessionId); }; - Module['jsepOnRunEnd'] = backend['onRunEnd'].bind(backend); - Module['jsepRegisterMLContext'] = (sessionId, mlContext) => { - backend['registerMLContext'](sessionId, mlContext); + Module["jsepOnRunEnd"] = backend["onRunEnd"].bind(backend); + Module["jsepRegisterMLContext"] = (sessionId, mlContext) => { + backend["registerMLContext"](sessionId, mlContext); }; - Module['jsepOnReleaseSession'] = sessionId => { - backend['onReleaseSession'](sessionId); + Module["jsepOnReleaseSession"] = (sessionId) => { + backend["onReleaseSession"](sessionId); }; - Module['jsepCreateMLTensorDownloader'] = (tensorId, type) => { - return backend['createMLTensorDownloader'](tensorId, type); - } - Module['jsepRegisterMLTensor'] = (sessionId, tensor, dataType, shape) => { - return backend['registerMLTensor'](sessionId, tensor, dataType, shape); + Module["jsepCreateMLTensorDownloader"] = (tensorId, type) => { + return backend["createMLTensorDownloader"](tensorId, type); + }; + Module["jsepRegisterMLTensor"] = (sessionId, tensor, dataType, shape) => { + return backend["registerMLTensor"](sessionId, tensor, dataType, shape); }; - Module['jsepCreateMLContext'] = (optionsOrGpuDevice) => { - return backend['createMLContext'](optionsOrGpuDevice); + Module["jsepCreateMLContext"] = (optionsOrGpuDevice) => { + return backend["createMLContext"](optionsOrGpuDevice); }; - Module['jsepRegisterMLConstant'] = (externalFilePath, dataOffset, dataLength, builder, desc) => { - return backend['registerMLConstant']( - externalFilePath, dataOffset, dataLength, builder, desc, Module.MountedFiles); + Module["jsepRegisterMLConstant"] = ( + externalFilePath, + dataOffset, + dataLength, + builder, + desc + ) => { + return backend["registerMLConstant"]( + externalFilePath, + dataOffset, + dataLength, + builder, + desc, + Module.MountedFiles + ); }; - Module['jsepRegisterGraphInput'] = backend['registerGraphInput'].bind(backend); - Module['jsepIsGraphInput'] = backend['isGraphInput'].bind(backend); + Module["jsepRegisterGraphInput"] = + backend["registerGraphInput"].bind(backend); + Module["jsepIsGraphInput"] = backend["isGraphInput"].bind(backend); - Module['jsepCreateTemporaryTensor'] = backend['createTemporaryTensor'].bind(backend); + Module["jsepCreateTemporaryTensor"] = + backend["createTemporaryTensor"].bind(backend); } }; diff --git a/onnxruntime/wasm/pre.js b/onnxruntime/wasm/pre.js index 9b5f3ce545b78..6da28fc355899 100644 --- a/onnxruntime/wasm/pre.js +++ b/onnxruntime/wasm/pre.js @@ -1,21 +1,20 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -'use strict'; +"use strict"; // // This file contains the pre-run code for the ORT WebAssembly module. The code in this file will be injected into the // final module using Emscripten's `--pre-js` option. - /** * Mount external data files of a model to an internal map, which will be used during session initialization. 
* * @param {string} externalDataFilesPath * @param {Uint8Array} externalDataFilesData */ -Module['mountExternalData'] = (externalDataFilePath, externalDataFileData) => { - if (externalDataFilePath.startsWith('./')) { +Module["mountExternalData"] = (externalDataFilePath, externalDataFileData) => { + if (externalDataFilePath.startsWith("./")) { externalDataFilePath = externalDataFilePath.substring(2); } const files = Module.MountedFiles || (Module.MountedFiles = new Map()); @@ -25,7 +24,7 @@ Module['mountExternalData'] = (externalDataFilePath, externalDataFileData) => { /** * Unmount external data files of a model. */ -Module['unmountExternalData'] = () => { +Module["unmountExternalData"] = () => { delete Module.MountedFiles; }; @@ -48,5 +47,7 @@ Module['unmountExternalData'] = () => { * * @suppress {checkVars} */ -var SharedArrayBuffer = globalThis.SharedArrayBuffer ?? - new WebAssembly.Memory({'initial': 0, 'maximum': 0, 'shared': true}).buffer.constructor; +var SharedArrayBuffer = + globalThis.SharedArrayBuffer ?? + new WebAssembly.Memory({ initial: 0, maximum: 0, shared: true }).buffer + .constructor; diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index ecdffbe5fd6a0..78e4f2ce9adef 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1592,8 +1592,11 @@ def generate_build_tree( raise BuildError("WebNN is only available for WebAssembly build.") cmake_args += ["-Donnxruntime_USE_WEBNN=ON"] - if args.use_jsep and args.use_webgpu: - raise BuildError("JSEP (--use_jsep) and WebGPU (--use_webgpu) cannot be enabled at the same time.") + # TODO: currently we allows building with both --use_jsep and --use_webgpu in this working branch. + # This situation is temporary. Eventually, those two flags will be mutually exclusive. + # + # if args.use_jsep and args.use_webgpu: + # raise BuildError("JSEP (--use_jsep) and WebGPU (--use_webgpu) cannot be enabled at the same time.") if args.use_external_dawn and not args.use_webgpu: raise BuildError("External Dawn (--use_external_dawn) must be enabled with WebGPU (--use_webgpu).") From 085bf324d5e7f7bcf59505421c376893e657d7d6 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 13 Feb 2025 19:56:45 -0800 Subject: [PATCH 2/2] easier debugging for integration --- cmake/onnxruntime_webassembly.cmake | 3 +- cmake/patches/dawn/dawn.patch | 97 ++++++++++++++++++++++++-- js/web/lib/wasm/jsep/backend-webgpu.ts | 72 +++++++++++++++++++ js/web/lib/wasm/wasm-core-impl.ts | 13 ++++ js/web/lib/wasm/wasm-types.ts | 1 + js/web/package.json | 2 +- js/web/script/build.ts | 15 +++- onnxruntime/wasm/post-webgpu.js | 19 +++++ onnxruntime/wasm/pre-jsep.js | 9 +++ 9 files changed, 222 insertions(+), 9 deletions(-) diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake index f3afaf7033fd1..b6910795391b1 100644 --- a/cmake/onnxruntime_webassembly.cmake +++ b/cmake/onnxruntime_webassembly.cmake @@ -443,7 +443,8 @@ jsepDownload:_pp_") "SHELL:-s ASSERTIONS=0" "SHELL:-s SAFE_HEAP=0" "SHELL:-s STACK_OVERFLOW_CHECK=0" - --closure 1 + ## comment out closure compiler so that it's easier to debug + # --closure 1 ) endif() diff --git a/cmake/patches/dawn/dawn.patch b/cmake/patches/dawn/dawn.patch index ac4c42bc15fce..4c96eca093607 100644 --- a/cmake/patches/dawn/dawn.patch +++ b/cmake/patches/dawn/dawn.patch @@ -47,8 +47,74 @@ index efd6491cd6..8ebc5d28b6 100644 emcmake cmake -GNinja -DDAWN_EMSCRIPTEN_TOOLCHAIN="path/to/emscripten" ../.. 
ninja +diff --git a/third_party/emdawnwebgpu/library_webgpu.js b/third_party/emdawnwebgpu/library_webgpu.js +index 5862ce4045..45df259bb7 100644 +--- a/third_party/emdawnwebgpu/library_webgpu.js ++++ b/third_party/emdawnwebgpu/library_webgpu.js +@@ -811,6 +811,61 @@ var LibraryWebGPU = { + {{{ runtimeKeepalivePush() }}} + WebGPU.Internals.futureInsert(futureId, adapter.requestDevice(desc).then((device) => { + {{{ runtimeKeepalivePop() }}} ++ ++ if (globalThis["WEBGPU_STAT"]) { ++ // a set that caches all active buffers ++ const buffers = WebGPU.Internals.buffers ??= new Set(); ++ // key is buffer usage, value is total size of buffers with that usage ++ const buffersTotalSize = WebGPU.Internals.buffersTotalSize ??= new Map(); ++ ++ WebGPU.Internals.buffersCreated ??= 0; ++ WebGPU.Internals.buffersDestroyed ??= 0; ++ WebGPU.Internals.buffersUploads ??= 0; ++ WebGPU.Internals.buffersExternalUploads ??= 0; ++ WebGPU.Internals.buffersDownloads ??= 0; ++ WebGPU.Internals.buffersExternalDownloads ??= 0; ++ ++ // create a proxy so that we can monitor buffer usages ++ device = new Proxy(device, { ++ // when call device.createBuffer(), the returned buffer should be added into buffers ++ get: (target, prop, _receiver) => { ++ if (prop === 'createBuffer') { ++ return (desc) => { ++ const buffer = target.createBuffer(desc); ++ const originalDestroy = buffer.destroy.bind(buffer); ++ buffer.destroy = () => { ++ const previousTotal = buffersTotalSize.get(buffer.usage); ++ buffersTotalSize.set(buffer.usage, previousTotal - buffer.size); ++ buffers.delete(buffer); ++ WebGPU.Internals.buffersDestroyed++; ++ originalDestroy(); ++ }; ++ ++ if (buffer.usage === (GPUBufferUsage.MAP_WRITE | GPUBufferUsage.COPY_SRC)) { ++ WebGPU.Internals.buffersUploads++; ++ } ++ if (buffer.usage === (GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ)) { ++ WebGPU.Internals.buffersDownloads++; ++ } ++ ++ buffers.add(buffer); ++ const previousTotal = buffersTotalSize.get(buffer.usage) ?? 
0; ++ buffersTotalSize.set(buffer.usage, previousTotal + buffer.size); ++ WebGPU.Internals.buffersCreated++; ++ return buffer; ++ }; ++ } ++ const propertyValue = Reflect.get(target, prop); ++ if (typeof propertyValue === 'function') { ++ return propertyValue.bind(target); ++ } else { ++ return propertyValue; ++ } ++ }, ++ set: (target, prop, value, _receiver) => Reflect.set(target, prop, value), ++ }); ++ } ++ + WebGPU.Internals.jsObjectInsert(queuePtr, device.queue); + WebGPU.Internals.jsObjectInsert(devicePtr, device); + diff --git a/third_party/emdawnwebgpu/webgpu.cpp b/third_party/emdawnwebgpu/webgpu.cpp -index ca52b1237b..b11462fb87 100644 +index ca52b1237b..a30ca583c3 100644 --- a/third_party/emdawnwebgpu/webgpu.cpp +++ b/third_party/emdawnwebgpu/webgpu.cpp @@ -131,7 +131,6 @@ class RefCounted : NonMovable { @@ -75,7 +141,14 @@ index ca52b1237b..b11462fb87 100644 void Destroy(); const void* GetConstMappedRange(size_t offset, size_t size); -@@ -1168,7 +1169,11 @@ WGPUBuffer emwgpuCreateBuffer(const EventSource* source, +@@ -1164,11 +1165,17 @@ WGPUAdapter emwgpuCreateAdapter(const EventSource* source) { + + WGPUBuffer emwgpuCreateBuffer(const EventSource* source, + bool mappedAtCreation = false) { +- return new WGPUBufferImpl(source, mappedAtCreation); ++ auto x = new WGPUBufferImpl(source, mappedAtCreation); ++ // printf(" #C++: emwgpuCreateBuffer %p\n", x); ++ return x; } WGPUDevice emwgpuCreateDevice(const EventSource* source, WGPUQueue queue) { @@ -88,7 +161,7 @@ index ca52b1237b..b11462fb87 100644 } WGPUQueue emwgpuCreateQueue(const EventSource* source) { -@@ -1284,6 +1289,10 @@ WGPUBufferImpl::WGPUBufferImpl(const EventSource* source, bool mappedAtCreation) +@@ -1284,6 +1291,10 @@ WGPUBufferImpl::WGPUBufferImpl(const EventSource* source, bool mappedAtCreation) } } @@ -99,7 +172,7 @@ index ca52b1237b..b11462fb87 100644 void WGPUBufferImpl::Destroy() { emwgpuBufferDestroy(this); AbortPendingMap("Buffer was destroyed before mapping was resolved."); -@@ -1504,6 +1513,7 @@ WGPUFuture WGPUShaderModuleImpl::GetCompilationInfo( +@@ -1504,6 +1515,7 @@ WGPUFuture WGPUShaderModuleImpl::GetCompilationInfo( void wgpu##Name##Release(WGPU##Name o) { \ if (o->Release()) { \ delete o; \ @@ -107,3 +180,19 @@ index ca52b1237b..b11462fb87 100644 } \ } WGPU_OBJECTS(DEFINE_WGPU_DEFAULT_ADDREF_RELEASE) +@@ -1587,6 +1599,7 @@ WGPUFuture wgpuAdapterRequestDevice( + // ---------------------------------------------------------------------------- + + void wgpuBufferDestroy(WGPUBuffer buffer) { ++ // printf(" #C++: wgpuBufferDestroy %p\n", buffer); + buffer->Destroy(); + } + +@@ -1639,6 +1652,7 @@ void wgpuBufferUnmap(WGPUBuffer buffer) { + WGPUBuffer wgpuDeviceCreateBuffer(WGPUDevice device, + const WGPUBufferDescriptor* descriptor) { + WGPUBuffer buffer = new WGPUBufferImpl(device, descriptor->mappedAtCreation); ++ // printf(" #C++: wgpuDeviceCreateBuffer %p\n", buffer); + emwgpuDeviceCreateBuffer(device, descriptor, buffer); + return buffer; + } diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts index a0010df4643a4..3c04b500e0ab4 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -231,6 +231,16 @@ export class WebGpuBackend { private queryTimeBase?: bigint; queryType: TimestampQuery; + buffers = new Set(); + buffersTotalSize = new Map(); + + buffersCreated = 0; + buffersDestroyed = 0; + buffersUploads = 0; + buffersExternalUploads = 0; + buffersDownloads = 0; + buffersExternalDownloads = 0; + env: Env; 
sessionStatus: SessionState = 'default'; /** @@ -280,6 +290,67 @@ export class WebGpuBackend { } this.device = await adapter.requestDevice(deviceDescriptor); + + // @ts-expect-error Element implicitly has an 'any' type because type 'typeof globalThis' has no index signature.ts(7017) + if (globalThis.WEBGPU_STAT) { + const buffers = this.buffers; + const buffersTotalSize = this.buffersTotalSize; + + const buffersUploadsIncrement = () => { + this.buffersUploads++; + }; + const buffersDownloadsIncrement = () => { + this.buffersDownloads++; + }; + const buffersCreatedIncrement = () => { + this.buffersCreated++; + }; + const buffersDestroyedIncrement = () => { + this.buffersDestroyed++; + }; + + this.device = new Proxy(this.device, { + // when call device.createBuffer(), the returned buffer should be added into buffers + get: (target, prop, _receiver) => { + if (prop === 'createBuffer') { + return (desc: GPUBufferDescriptor) => { + const buffer = target.createBuffer(desc); + const originalDestroy = buffer.destroy.bind(buffer); + buffer.destroy = () => { + const previousTotal = buffersTotalSize.get(buffer.usage); + buffersTotalSize.set(buffer.usage, previousTotal - buffer.size); + buffers.delete(buffer); + buffersDestroyedIncrement(); + originalDestroy(); + }; + + // eslint-disable-next-line no-bitwise + if (buffer.usage === (GPUBufferUsage.MAP_WRITE | GPUBufferUsage.COPY_SRC)) { + buffersUploadsIncrement(); + } + // eslint-disable-next-line no-bitwise + if (buffer.usage === (GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ)) { + buffersDownloadsIncrement(); + } + + buffers.add(buffer); + const previousTotal = buffersTotalSize.get(buffer.usage) ?? 0; + buffersTotalSize.set(buffer.usage, previousTotal + buffer.size); + buffersCreatedIncrement(); + return buffer; + }; + } + const propertyValue = Reflect.get(target, prop); + if (typeof propertyValue === 'function') { + return propertyValue.bind(target); + } else { + return propertyValue; + } + }, + set: (target, prop, value, _receiver) => Reflect.set(target, prop, value), + }); + } + this.deviceInfo = new DeviceInfoImpl(this.device); this.adapterInfo = new AdapterInfoImpl(adapter.info || (await adapter.requestAdapterInfo())); this.gpuDataManager = createGpuDataManager(this); @@ -844,6 +915,7 @@ export class WebGpuBackend { ): () => Promise { return async () => { const data = await downloadGpuData(this, gpuBuffer, size); + this.buffersExternalDownloads++; return createView(data.buffer, type); }; } diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts index dbcf80adf3552..4bfc7925043fe 100644 --- a/js/web/lib/wasm/wasm-core-impl.ts +++ b/js/web/lib/wasm/wasm-core-impl.ts @@ -260,6 +260,8 @@ export const createSession = async ( let modelDataOffset: number, modelDataLength: number; const wasm = getInstance(); + wasm.webgpuStat?.('createSession_start'); + if (Array.isArray(modelData)) { // if model data is an array, it must be a 2-elements tuple containing the pointer and size of the model data [modelDataOffset, modelDataLength] = modelData; @@ -327,6 +329,7 @@ export const createSession = async ( } wasm.jsepOnCreateSession?.(); + wasm.webgpuStat?.('createSession_end'); // clear current MLContext after session creation if (wasm.currentContext) { @@ -436,6 +439,7 @@ export const createSession = async ( export const releaseSession = (sessionId: number): void => { const wasm = getInstance(); + wasm.webgpuStat?.('releaseSession_start'); const session = activeSessions.get(sessionId); if (!session) { throw new Error(`cannot 
release session. invalid session id: ${sessionId}`); @@ -462,6 +466,8 @@ export const releaseSession = (sessionId: number): void => { checkLastError("Can't release session."); } activeSessions.delete(sessionId); + + wasm.webgpuStat?.('releaseSession_end'); }; export const prepareInputOutputTensor = async ( @@ -633,6 +639,8 @@ export const run = async ( const outputValuesOffset = wasm.stackAlloc(outputCount * ptrSize); const outputNamesOffset = wasm.stackAlloc(outputCount * ptrSize); + wasm.webgpuStat?.('run_start'); + try { [runOptionsHandle, runOptionsAllocs] = setRunOptions(options); @@ -722,6 +730,7 @@ export const run = async ( } wasm.jsepOnRunStart?.(sessionHandle); + //wasm.webgpuStat?.('run_beforeAPI'); let errorCode: number; if (!BUILD_DEFS.DISABLE_JSEP && ioBindingState) { @@ -745,6 +754,8 @@ export const run = async ( ); } + //wasm.webgpuStat?.('run_afterAPI'); + if (errorCode !== 0) { checkLastError('failed to call OrtRun().'); } @@ -926,6 +937,8 @@ export const run = async ( false, ]); } + wasm.webgpuStat?.('run_end'); + return output; } finally { wasm.stackRestore(beforeRunStack); diff --git a/js/web/lib/wasm/wasm-types.ts b/js/web/lib/wasm/wasm-types.ts index 9b2ec71fd351d..7e94fec52c374 100644 --- a/js/web/lib/wasm/wasm-types.ts +++ b/js/web/lib/wasm/wasm-types.ts @@ -294,6 +294,7 @@ export declare namespace WebGpu { webgpuUnregisterBuffer(buffer: GPUBuffer): void; webgpuGetBuffer(bufferHandle: number): GPUBuffer; webgpuCreateDownloader(gpuBuffer: GPUBuffer, size: number, sessionHandle: number): () => Promise; + webgpuStat(label?: string): void; } } diff --git a/js/web/package.json b/js/web/package.json index 5defe05e78c1f..6651c05bce5be 100644 --- a/js/web/package.json +++ b/js/web/package.json @@ -72,7 +72,7 @@ "import": "./dist/ort.node.min.mjs", "require": "./dist/ort.node.min.js" }, - "import": "./dist/ort.bundle.min.mjs", + "import": "./dist/ort.bundle.mjs", "require": "./dist/ort.min.js" }, "./all": { diff --git a/js/web/script/build.ts b/js/web/script/build.ts index fd9224a2dcf8b..afbdcc5924836 100644 --- a/js/web/script/build.ts +++ b/js/web/script/build.ts @@ -46,14 +46,17 @@ const DEBUG = process.env.npm_config_debug || args.debug; // boolean|'verbose'|' /** * --webgpu-ep - * --no-webgpu-ep (default) + * --no-webgpu-ep * * Enable or disable the use of WebGPU EP. If enabled, the WebGPU EP will be used. If disabled, the WebGPU backend will * be used with JSEP. * + * The default value is not set. If not set, onnxruntime-web will determine whether to use WebGPU EP or JSEP based on + * the environment (globalThis.WEBGPU_EP). + * * (temporary) This flag is used to test the WebGPU EP integration. It will be removed in the future. */ -const USE_WEBGPU_EP = process.env.npm_config_webgpu_ep ?? args['webgpu-ep'] ?? true; +const USE_WEBGPU_EP = process.env.npm_config_webgpu_ep ?? args['webgpu-ep']; /** * Root folder of the source code: `/js/` @@ -69,7 +72,7 @@ const DEFAULT_DEFINE = { 'BUILD_DEFS.DISABLE_WASM': 'false', 'BUILD_DEFS.DISABLE_WASM_PROXY': 'false', 'BUILD_DEFS.ENABLE_BUNDLE_WASM_JS': 'false', - 'BUILD_DEFS.USE_WEBGPU_EP': JSON.stringify(!!USE_WEBGPU_EP), + 'BUILD_DEFS.USE_WEBGPU_EP': USE_WEBGPU_EP === undefined ? 
'globalThis.WEBGPU_EP' : JSON.stringify(!!USE_WEBGPU_EP), 'BUILD_DEFS.IS_ESM': 'false', 'BUILD_DEFS.ESM_IMPORT_META_URL': 'undefined', @@ -601,6 +604,12 @@ async function main() { outputName: 'ort', define: { ...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGL': 'true' }, }); + // ort.bundle.mjs + await buildOrt({ + outputName: 'ort.bundle', + format: 'esm', + define: { ...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGL': 'true', 'BUILD_DEFS.ENABLE_BUNDLE_WASM_JS': 'true' }, + }); // ort.bundle.min.mjs await buildOrt({ isProduction: true, diff --git a/onnxruntime/wasm/post-webgpu.js b/onnxruntime/wasm/post-webgpu.js index e7631a97c34c6..b84dfd733af57 100644 --- a/onnxruntime/wasm/post-webgpu.js +++ b/onnxruntime/wasm/post-webgpu.js @@ -202,6 +202,8 @@ Module["webgpuInit"] = (setDefaultDevice) => { await gpuReadBuffer.mapAsync(GPUMapMode.READ); + WebGPU.Internals.buffersExternalDownloads++; + const arrayBuffer = gpuReadBuffer.getMappedRange(); return arrayBuffer.slice(0, originalSize); } finally { @@ -258,6 +260,23 @@ Module["webgpuInit"] = (setDefaultDevice) => { size ); webgpuCurrentDevice.queue.submit([commandEncoder.finish()]); + WebGPU.Internals.buffersExternalUploads++; gpuBufferForUploading.destroy(); }; + + Module["webgpuStat"] = (label) => { + if (globalThis["WEBGPU_STAT"]) { + console.log( + `[${label}] BufferCount: ${ + WebGPU.Internals.buffers?.size ?? 0 + }, Created: ${WebGPU.Internals.buffersCreated ?? 0}, Destroyed: ${ + WebGPU.Internals.buffersDestroyed ?? 0 + } Uploads: ${WebGPU.Internals.buffersUploads ?? 0}, Downloads: ${ + WebGPU.Internals.buffersDownloads ?? 0 + }, ExtUploads: ${ + WebGPU.Internals.buffersExternalUploads ?? 0 + }, ExtDownloads: ${WebGPU.Internals.buffersExternalDownloads ?? 0}` + ); + } + }; }; diff --git a/onnxruntime/wasm/pre-jsep.js b/onnxruntime/wasm/pre-jsep.js index a35ab129280c4..04507e5defae9 100644 --- a/onnxruntime/wasm/pre-jsep.js +++ b/onnxruntime/wasm/pre-jsep.js @@ -93,6 +93,15 @@ Module["jsepInit"] = (name, params) => { Module.jsepUploadExternalBuffer = (dataId, buffer) => { backend["upload"](dataId, buffer); + backend["buffersExternalUploads"]++; + }; + + Module["webgpuStat"] = (label) => { + if (globalThis["WEBGPU_STAT"]) { + console.log( + `[${label}] BufferCount: ${backend["buffers"].size}, Created: ${backend["buffersCreated"]}, Destroyed: ${backend["buffersDestroyed"]}, Uploads: ${backend["buffersUploads"]}, Downloads: ${backend["buffersDownloads"]}, ExtUploads: ${backend["buffersExternalUploads"]}, ExtDownloads: ${backend["buffersExternalDownloads"]}` + ); + } }; } else if (name === "webnn") { // Functions called from EM_ASM need to be assigned in a way that can be minified.
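Taken together, the statistics hooks in this second commit are driven by a single global flag. A sketch of how an application embedding ort-web might turn them on follows; the model path and feeds are placeholders, while `WEBGPU_STAT` and the logged counters come from this patch.

```ts
import * as ort from 'onnxruntime-web';

// Enable the buffer statistics before the first session is created, because the
// counting Proxy is only installed when the GPUDevice is created.
(globalThis as Record<string, unknown>).WEBGPU_STAT = true;

// 'model.onnx' is a placeholder path. With the flag set, webgpuStat() logs
// BufferCount / Created / Destroyed / Uploads / Downloads / ExtUploads / ExtDownloads
// around createSession, run and releaseSession via the wasm-core-impl.ts hooks above.
const session = await ort.InferenceSession.create('model.onnx', {
  executionProviders: ['webgpu'],
});
const results = await session.run({ /* feeds for the model's inputs */ });
```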