diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake
index f3afaf7033fd1..b6910795391b1 100644
--- a/cmake/onnxruntime_webassembly.cmake
+++ b/cmake/onnxruntime_webassembly.cmake
@@ -443,7 +443,8 @@ jsepDownload:_pp_")
       "SHELL:-s ASSERTIONS=0"
       "SHELL:-s SAFE_HEAP=0"
       "SHELL:-s STACK_OVERFLOW_CHECK=0"
-      --closure 1
+      ## comment out closure compiler so that it's easier to debug
+      # --closure 1
     )
   endif()
 
diff --git a/cmake/patches/dawn/dawn.patch b/cmake/patches/dawn/dawn.patch
index 6f5a50530a88a..b260552ab80f8 100644
--- a/cmake/patches/dawn/dawn.patch
+++ b/cmake/patches/dawn/dawn.patch
@@ -34,8 +34,74 @@ index efd6491cd6..8ebc5d28b6 100644
  emcmake cmake -GNinja -DDAWN_EMSCRIPTEN_TOOLCHAIN="path/to/emscripten" ../..
  
  ninja
+diff --git a/third_party/emdawnwebgpu/library_webgpu.js b/third_party/emdawnwebgpu/library_webgpu.js
+index 5862ce4045..45df259bb7 100644
+--- a/third_party/emdawnwebgpu/library_webgpu.js
++++ b/third_party/emdawnwebgpu/library_webgpu.js
+@@ -811,6 +811,61 @@ var LibraryWebGPU = {
+     {{{ runtimeKeepalivePush() }}}
+     WebGPU.Internals.futureInsert(futureId, adapter.requestDevice(desc).then((device) => {
+       {{{ runtimeKeepalivePop() }}}
++
++      if (globalThis["WEBGPU_STAT"]) {
++        // a set that caches all active buffers
++        const buffers = WebGPU.Internals.buffers ??= new Set();
++        // key is buffer usage, value is total size of buffers with that usage
++        const buffersTotalSize = WebGPU.Internals.buffersTotalSize ??= new Map();
++
++        WebGPU.Internals.buffersCreated ??= 0;
++        WebGPU.Internals.buffersDestroyed ??= 0;
++        WebGPU.Internals.buffersUploads ??= 0;
++        WebGPU.Internals.buffersExternalUploads ??= 0;
++        WebGPU.Internals.buffersDownloads ??= 0;
++        WebGPU.Internals.buffersExternalDownloads ??= 0;
++
++        // create a proxy so that we can monitor buffer usages
++        device = new Proxy(device, {
++          // when device.createBuffer() is called, the returned buffer is added to buffers
++          get: (target, prop, _receiver) => {
++            if (prop === 'createBuffer') {
++              return (desc) => {
++                const buffer = target.createBuffer(desc);
++                const originalDestroy = buffer.destroy.bind(buffer);
++                buffer.destroy = () => {
++                  if (buffers.delete(buffer)) {
++                    buffersTotalSize.set(buffer.usage, buffersTotalSize.get(buffer.usage) - buffer.size);
++                    WebGPU.Internals.buffersDestroyed++; // repeated destroy() calls are counted once
++                  }
++                  originalDestroy();
++                };
++
++                if (buffer.usage === (GPUBufferUsage.MAP_WRITE | GPUBufferUsage.COPY_SRC)) {
++                  WebGPU.Internals.buffersUploads++;
++                }
++                if (buffer.usage === (GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ)) {
++                  WebGPU.Internals.buffersDownloads++;
++                }
++
++                buffers.add(buffer);
++                const previousTotal = buffersTotalSize.get(buffer.usage) ?? 0;
++                buffersTotalSize.set(buffer.usage, previousTotal + buffer.size);
++                WebGPU.Internals.buffersCreated++;
++                return buffer;
++              };
++            }
++            const propertyValue = Reflect.get(target, prop);
++            if (typeof propertyValue === 'function') {
++              return propertyValue.bind(target);
++            } else {
++              return propertyValue;
++            }
++          },
++          set: (target, prop, value, _receiver) => Reflect.set(target, prop, value),
++        });
++      }
++
+       WebGPU.Internals.jsObjectInsert(queuePtr, device.queue);
+       WebGPU.Internals.jsObjectInsert(devicePtr, device);
+ 
 diff --git a/third_party/emdawnwebgpu/webgpu.cpp b/third_party/emdawnwebgpu/webgpu.cpp
-index ca52b1237b..b11462fb87 100644
+index ca52b1237b..a30ca583c3 100644
 --- a/third_party/emdawnwebgpu/webgpu.cpp
 +++ b/third_party/emdawnwebgpu/webgpu.cpp
 @@ -131,7 +131,6 @@ class RefCounted : NonMovable {
@@ -62,7 +128,14 @@ index ca52b1237b..b11462fb87 100644
  
    void Destroy();
    const void* GetConstMappedRange(size_t offset, size_t size);
-@@ -1168,7 +1169,11 @@ WGPUBuffer emwgpuCreateBuffer(const EventSource* source,
+@@ -1164,11 +1165,17 @@ WGPUAdapter emwgpuCreateAdapter(const EventSource* source) {
+ 
+ WGPUBuffer emwgpuCreateBuffer(const EventSource* source,
+                               bool mappedAtCreation = false) {
+-  return new WGPUBufferImpl(source, mappedAtCreation);
++  auto x = new WGPUBufferImpl(source, mappedAtCreation);
++  // printf(" #C++: emwgpuCreateBuffer %p\n", x);
++  return x;
  }
  
  WGPUDevice emwgpuCreateDevice(const EventSource* source, WGPUQueue queue) {
@@ -75,7 +148,7 @@ index ca52b1237b..b11462fb87 100644
  }
  
  WGPUQueue emwgpuCreateQueue(const EventSource* source) {
-@@ -1284,6 +1289,10 @@ WGPUBufferImpl::WGPUBufferImpl(const EventSource* source, bool mappedAtCreation)
+@@ -1284,6 +1291,10 @@ WGPUBufferImpl::WGPUBufferImpl(const EventSource* source, bool mappedAtCreation)
    }
  }
  
@@ -86,7 +159,7 @@ index ca52b1237b..b11462fb87 100644
  void WGPUBufferImpl::Destroy() {
    emwgpuBufferDestroy(this);
    AbortPendingMap("Buffer was destroyed before mapping was resolved.");
-@@ -1504,6 +1513,7 @@ WGPUFuture WGPUShaderModuleImpl::GetCompilationInfo(
+@@ -1504,6 +1515,7 @@ WGPUFuture WGPUShaderModuleImpl::GetCompilationInfo(
    void wgpu##Name##Release(WGPU##Name o) {       \
      if (o->Release()) {                          \
        delete o;                                  \
@@ -94,3 +167,19 @@ index ca52b1237b..b11462fb87 100644
      }                                            \
    }
  WGPU_OBJECTS(DEFINE_WGPU_DEFAULT_ADDREF_RELEASE)
+@@ -1587,6 +1599,7 @@ WGPUFuture wgpuAdapterRequestDevice(
+ // ----------------------------------------------------------------------------
+ 
+ void wgpuBufferDestroy(WGPUBuffer buffer) {
++  // printf(" #C++: wgpuBufferDestroy %p\n", buffer);
+   buffer->Destroy();
+ }
+ 
+@@ -1639,6 +1652,7 @@ void wgpuBufferUnmap(WGPUBuffer buffer) {
+ WGPUBuffer wgpuDeviceCreateBuffer(WGPUDevice device,
+                                   const WGPUBufferDescriptor* descriptor) {
+   WGPUBuffer buffer = new WGPUBufferImpl(device, descriptor->mappedAtCreation);
++  // printf(" #C++: wgpuDeviceCreateBuffer %p\n", buffer);
+   emwgpuDeviceCreateBuffer(device, descriptor, buffer);
+   return buffer;
+ }
diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts
index a0010df4643a4..3c04b500e0ab4 100644
--- a/js/web/lib/wasm/jsep/backend-webgpu.ts
+++ b/js/web/lib/wasm/jsep/backend-webgpu.ts
@@ -231,6 +231,16 @@ export class WebGpuBackend {
   private queryTimeBase?: bigint;
   queryType: TimestampQuery;
 
+  buffers = new Set();
+  buffersTotalSize = new Map();
+
+  buffersCreated = 0;
+  buffersDestroyed = 0;
+  buffersUploads = 0;
+  buffersExternalUploads = 0;
+  buffersDownloads = 0;
+  buffersExternalDownloads = 0;
+
   env: Env;
   sessionStatus: SessionState = 'default';
   /**
@@ -280,6 +290,67 @@ export class WebGpuBackend {
     }
 
     this.device = await adapter.requestDevice(deviceDescriptor);
+
+    // @ts-expect-error Element implicitly has an 'any' type because type 'typeof globalThis' has no index signature.ts(7017)
+    if (globalThis.WEBGPU_STAT) {
+      const buffers = this.buffers;
+      const buffersTotalSize = this.buffersTotalSize;
+
+      const buffersUploadsIncrement = () => {
+        this.buffersUploads++;
+      };
+      const buffersDownloadsIncrement = () => {
+        this.buffersDownloads++;
+      };
+      const buffersCreatedIncrement = () => {
+        this.buffersCreated++;
+      };
+      const buffersDestroyedIncrement = () => {
+        this.buffersDestroyed++;
+      };
+
+      this.device = new Proxy(this.device, {
+        // when device.createBuffer() is called, the returned buffer is added to buffers
+        get: (target, prop, _receiver) => {
+          if (prop === 'createBuffer') {
+            return (desc: GPUBufferDescriptor) => {
+              const buffer = target.createBuffer(desc);
+              const originalDestroy = buffer.destroy.bind(buffer);
+              buffer.destroy = () => {
+                if (buffers.delete(buffer)) {
+                  buffersTotalSize.set(buffer.usage, buffersTotalSize.get(buffer.usage) - buffer.size);
+                  buffersDestroyedIncrement(); // repeated destroy() calls are counted once
+                }
+                originalDestroy();
+              };
+
+              // eslint-disable-next-line no-bitwise
+              if (buffer.usage === (GPUBufferUsage.MAP_WRITE | GPUBufferUsage.COPY_SRC)) {
+                buffersUploadsIncrement();
+              }
+              // eslint-disable-next-line no-bitwise
+              if (buffer.usage === (GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ)) {
+                buffersDownloadsIncrement();
+              }
+
+              buffers.add(buffer);
+              const previousTotal = buffersTotalSize.get(buffer.usage) ?? 0;
+              buffersTotalSize.set(buffer.usage, previousTotal + buffer.size);
+              buffersCreatedIncrement();
+              return buffer;
+            };
+          }
+          const propertyValue = Reflect.get(target, prop);
+          if (typeof propertyValue === 'function') {
+            return propertyValue.bind(target);
+          } else {
+            return propertyValue;
+          }
+        },
+        set: (target, prop, value, _receiver) => Reflect.set(target, prop, value),
+      });
+    }
+
     this.deviceInfo = new DeviceInfoImpl(this.device);
     this.adapterInfo = new AdapterInfoImpl(adapter.info || (await adapter.requestAdapterInfo()));
     this.gpuDataManager = createGpuDataManager(this);
@@ -844,6 +915,7 @@ export class WebGpuBackend {
   ): () => Promise<Tensor.DataType> {
     return async () => {
       const data = await downloadGpuData(this, gpuBuffer, size);
+      this.buffersExternalDownloads++;
       return createView(data.buffer, type);
     };
   }
diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts
index dbcf80adf3552..4bfc7925043fe 100644
--- a/js/web/lib/wasm/wasm-core-impl.ts
+++ b/js/web/lib/wasm/wasm-core-impl.ts
@@ -260,6 +260,8 @@ export const createSession = async (
   let modelDataOffset: number, modelDataLength: number;
   const wasm = getInstance();
 
+  wasm.webgpuStat?.('createSession_start');
+
   if (Array.isArray(modelData)) {
     // if model data is an array, it must be a 2-elements tuple containing the pointer and size of the model data
     [modelDataOffset, modelDataLength] = modelData;
@@ -327,6 +329,7 @@ export const createSession = async (
     }
 
     wasm.jsepOnCreateSession?.();
+    wasm.webgpuStat?.('createSession_end');
 
     // clear current MLContext after session creation
     if (wasm.currentContext) {
@@ -436,6 +439,7 @@ export const createSession = async (
 
 export const releaseSession = (sessionId: number): void => {
   const wasm = getInstance();
+  wasm.webgpuStat?.('releaseSession_start');
   const session = activeSessions.get(sessionId);
   if (!session) {
     throw new Error(`cannot release session. invalid session id: ${sessionId}`);
@@ -462,6 +466,8 @@ export const releaseSession = (sessionId: number): void => {
     checkLastError("Can't release session.");
   }
   activeSessions.delete(sessionId);
+
+  wasm.webgpuStat?.('releaseSession_end');
 };
 
 export const prepareInputOutputTensor = async (
@@ -633,6 +639,8 @@ export const run = async (
   const outputValuesOffset = wasm.stackAlloc(outputCount * ptrSize);
   const outputNamesOffset = wasm.stackAlloc(outputCount * ptrSize);
 
+  wasm.webgpuStat?.('run_start');
+
   try {
     [runOptionsHandle, runOptionsAllocs] = setRunOptions(options);
 
@@ -722,6 +730,7 @@ export const run = async (
     }
 
     wasm.jsepOnRunStart?.(sessionHandle);
+    //wasm.webgpuStat?.('run_beforeAPI');
 
     let errorCode: number;
     if (!BUILD_DEFS.DISABLE_JSEP && ioBindingState) {
@@ -745,6 +754,8 @@ export const run = async (
       );
     }
 
+    //wasm.webgpuStat?.('run_afterAPI');
+
     if (errorCode !== 0) {
       checkLastError('failed to call OrtRun().');
     }
@@ -926,6 +937,8 @@ export const run = async (
         false,
       ]);
     }
+    wasm.webgpuStat?.('run_end');
+
     return output;
   } finally {
     wasm.stackRestore(beforeRunStack);
diff --git a/js/web/lib/wasm/wasm-types.ts b/js/web/lib/wasm/wasm-types.ts
index 9b2ec71fd351d..7e94fec52c374 100644
--- a/js/web/lib/wasm/wasm-types.ts
+++ b/js/web/lib/wasm/wasm-types.ts
@@ -294,6 +294,7 @@ export declare namespace WebGpu {
     webgpuUnregisterBuffer(buffer: GPUBuffer): void;
     webgpuGetBuffer(bufferHandle: number): GPUBuffer;
     webgpuCreateDownloader(gpuBuffer: GPUBuffer, size: number, sessionHandle: number): () => Promise<ArrayBuffer>;
+    webgpuStat(label?: string): void;
   }
 }
 
diff --git a/js/web/package.json b/js/web/package.json
index 5defe05e78c1f..6651c05bce5be 100644
--- a/js/web/package.json
+++ b/js/web/package.json
@@ -72,7 +72,7 @@
         "import": "./dist/ort.node.min.mjs",
         "require": "./dist/ort.node.min.js"
       },
-      "import": "./dist/ort.bundle.min.mjs",
+      "import": "./dist/ort.bundle.mjs",
       "require": "./dist/ort.min.js"
     },
     "./all": {
diff --git a/js/web/script/build.ts b/js/web/script/build.ts
index fd9224a2dcf8b..afbdcc5924836 100644
--- a/js/web/script/build.ts
+++ b/js/web/script/build.ts
@@ -46,14 +46,17 @@ const DEBUG = process.env.npm_config_debug || args.debug; // boolean|'verbose'|'
 
 /**
  * --webgpu-ep
- * --no-webgpu-ep (default)
+ * --no-webgpu-ep
  *
  * Enable or disable the use of WebGPU EP. If enabled, the WebGPU EP will be used. If disabled, the WebGPU backend will
  * be used with JSEP.
  *
+ * The default value is not set. If not set, onnxruntime-web will determine whether to use WebGPU EP or JSEP based on
+ * the environment (globalThis.WEBGPU_EP).
+ *
  * (temporary) This flag is used to test the WebGPU EP integration. It will be removed in the future.
  */
-const USE_WEBGPU_EP = process.env.npm_config_webgpu_ep ?? args['webgpu-ep'] ?? true;
+const USE_WEBGPU_EP = process.env.npm_config_webgpu_ep ?? args['webgpu-ep'];
 
 /**
  * Root folder of the source code: `<ORT_ROOT>/js/`
@@ -69,7 +72,7 @@ const DEFAULT_DEFINE = {
   'BUILD_DEFS.DISABLE_WASM': 'false',
   'BUILD_DEFS.DISABLE_WASM_PROXY': 'false',
   'BUILD_DEFS.ENABLE_BUNDLE_WASM_JS': 'false',
-  'BUILD_DEFS.USE_WEBGPU_EP': JSON.stringify(!!USE_WEBGPU_EP),
+  'BUILD_DEFS.USE_WEBGPU_EP': USE_WEBGPU_EP === undefined ? 'globalThis.WEBGPU_EP' : JSON.stringify(!!USE_WEBGPU_EP),
 
   'BUILD_DEFS.IS_ESM': 'false',
   'BUILD_DEFS.ESM_IMPORT_META_URL': 'undefined',
@@ -601,6 +604,12 @@ async function main() {
       outputName: 'ort',
       define: { ...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGL': 'true' },
     });
+    // ort.bundle.mjs
+    await buildOrt({
+      outputName: 'ort.bundle',
+      format: 'esm',
+      define: { ...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGL': 'true', 'BUILD_DEFS.ENABLE_BUNDLE_WASM_JS': 'true' },
+    });
     // ort.bundle.min.mjs
     await buildOrt({
       isProduction: true,
diff --git a/onnxruntime/wasm/post-webgpu.js b/onnxruntime/wasm/post-webgpu.js
index e7631a97c34c6..b84dfd733af57 100644
--- a/onnxruntime/wasm/post-webgpu.js
+++ b/onnxruntime/wasm/post-webgpu.js
@@ -202,6 +202,8 @@ Module["webgpuInit"] = (setDefaultDevice) => {
 
         await gpuReadBuffer.mapAsync(GPUMapMode.READ);
 
+        // the counter is only initialized when WEBGPU_STAT was set at device creation; avoid `undefined++` (NaN)
+        WebGPU.Internals.buffersExternalDownloads = (WebGPU.Internals.buffersExternalDownloads ?? 0) + 1;
         const arrayBuffer = gpuReadBuffer.getMappedRange();
         return arrayBuffer.slice(0, originalSize);
       } finally {
@@ -258,6 +260,23 @@ Module["webgpuInit"] = (setDefaultDevice) => {
       size
     );
     webgpuCurrentDevice.queue.submit([commandEncoder.finish()]);
+    WebGPU.Internals.buffersExternalUploads = (WebGPU.Internals.buffersExternalUploads ?? 0) + 1; // no NaN if unset
     gpuBufferForUploading.destroy();
   };
+
+  // Logs the WebGPU buffer statistics, prefixed with `label`, when globalThis.WEBGPU_STAT is set.
+  Module["webgpuStat"] = (label) => {
+    if (globalThis["WEBGPU_STAT"]) {
+      const stat = WebGPU.Internals;
+      console.log(
+        `[${label}] BufferCount: ${stat.buffers?.size ?? 0}` +
+          `, Created: ${stat.buffersCreated ?? 0}` +
+          `, Destroyed: ${stat.buffersDestroyed ?? 0}` +
+          `, Uploads: ${stat.buffersUploads ?? 0}` +
+          `, Downloads: ${stat.buffersDownloads ?? 0}` +
+          `, ExtUploads: ${stat.buffersExternalUploads ?? 0}` +
+          `, ExtDownloads: ${stat.buffersExternalDownloads ?? 0}`
+      );
+    }
+  };
 };
diff --git a/onnxruntime/wasm/pre-jsep.js b/onnxruntime/wasm/pre-jsep.js
index a35ab129280c4..04507e5defae9 100644
--- a/onnxruntime/wasm/pre-jsep.js
+++ b/onnxruntime/wasm/pre-jsep.js
@@ -93,6 +93,15 @@ Module["jsepInit"] = (name, params) => {
 
     Module.jsepUploadExternalBuffer = (dataId, buffer) => {
       backend["upload"](dataId, buffer);
+      backend["buffersExternalUploads"]++;
+    };
+
+    Module["webgpuStat"] = (label) => {
+      if (globalThis["WEBGPU_STAT"]) {
+        console.log(
+          `[${label}] BufferCount: ${backend["buffers"].size}, Created: ${backend["buffersCreated"]}, Destroyed: ${backend["buffersDestroyed"]}, Uploads: ${backend["buffersUploads"]}, Downloads: ${backend["buffersDownloads"]}, ExtUploads: ${backend["buffersExternalUploads"]}, ExtDownloads: ${backend["buffersExternalDownloads"]}`
+        );
+      }
     };
   } else if (name === "webnn") {
     // Functions called from EM_ASM need to be assigned in a way that can be minified.