
Commit 9c1fe5b

Merge branch 'pytorch:main' into next

2 parents: 40858e8 + aa2c569

36 files changed: +532 -187 lines

.ci/scripts/gather_benchmark_configs.py

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@
     "apple_iphone_15": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/3b5acd2e-92e2-4778-b651-7726bafe129d",
     "apple_iphone_15+ios_18": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/12c8b15c-8d03-4e07-950d-0a627e7595b4",
     "samsung_galaxy_s22": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa",
+    "samsung_galaxy_s22_private": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/ea6b049d-1508-4233-9a56-5d9eacbe1078",
     "samsung_galaxy_s24": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db",
     "google_pixel_8_pro": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a",
     "google_pixel_3_private_rooted": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98d23ca8-ea9e-4fb7-b725-d402017b198d",

.github/scripts/extract_benchmark_results.py

Lines changed: 5 additions & 1 deletion
@@ -349,7 +349,10 @@ def transform(
     # Overwrite the device name here with the job name as it has more information about
     # the device, i.e. Samsung Galaxy S22 5G instead of just Samsung
     for r in benchmark_results:
-        r["deviceInfo"]["device"] = job_name
+        is_private_device = job_report.get("is_private_instance", False)
+        r["deviceInfo"]["device"] = (
+            f"{job_name} (private)" if is_private_device else job_name
+        )
 
     # From https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
     return [
@@ -363,6 +366,7 @@ def transform(
                     "benchmark_config": json.dumps(benchmark_config),
                     "job_conclusion": "SUCCESS",
                     "job_arn": job_report.get("arn", ""),
+                    "instance_arn": job_report.get("instance_arn", ""),
                 },
             },
             "model": {

.github/workflows/android-perf-private-device-experiment.yml

Lines changed: 3 additions & 3 deletions
@@ -23,7 +23,7 @@ on:
         description: Target devices to run benchmark
         required: false
         type: string
-        default: google_pixel_3_private_rooted
+        default: samsung_galaxy_s22_private
       benchmark_configs:
         description: The list of configs used the benchmark
         required: false
@@ -39,7 +39,7 @@ on:
         description: Target devices to run benchmark
         required: false
         type: string
-        default: google_pixel_3_private_rooted
+        default: samsung_galaxy_s22_private
       benchmark_configs:
         description: The list of configs used the benchmark
         required: false
@@ -58,5 +58,5 @@ jobs:
       contents: read
     with:
       models: ${{ inputs.models || 'mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' }}
-      devices: google_pixel_3_private_rooted
+      devices: samsung_galaxy_s22_private
       benchmark_configs: ${{ inputs.benchmark_configs }}

.github/workflows/apple-perf-private-device-experiment.yml

Lines changed: 7 additions & 9 deletions
@@ -1,18 +1,16 @@
 name: apple-perf (private devices)
 
 on:
-  # TODO (huydhn): Disable the schedule run until we land the change to add device pool and device name
-  # to separate between public and private iOS devices
-  # schedule:
-  #   - cron: 0 0,4,8,12,16,20 * * *
+  schedule:
+    - cron: 0 0,4,8,12,16,20 * * *
   pull_request:
     paths:
       - .github/workflows/apple-perf-private-device-experiment.yml
-  # push:
-  #   branches:
-  #     - main
-  #   paths:
-  #     - .github/workflows/apple-perf-private-device-experiment.yml
+  push:
+    branches:
+      - main
+    paths:
+      - .github/workflows/apple-perf-private-device-experiment.yml
   # Note: GitHub has an upper limit of 10 inputs
   workflow_dispatch:
     inputs:
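The re-enabled cron expression `0 0,4,8,12,16,20 * * *` fires at minute 0 of every fourth hour, UTC. A small sanity check of that reading; this mimics, not reuses, GitHub's scheduler:

```python
# Hedged sketch: checking which UTC times match "0 0,4,8,12,16,20 * * *".
# Illustrative only; GitHub's scheduler is the source of truth.
from datetime import datetime, timezone

def matches_schedule(t: datetime) -> bool:
    # Minute field is 0; hour field is the explicit list 0,4,8,12,16,20.
    return t.minute == 0 and t.hour in (0, 4, 8, 12, 16, 20)

print(matches_schedule(datetime(2025, 1, 1, 4, 0, tzinfo=timezone.utc)))   # True
print(matches_schedule(datetime(2025, 1, 1, 5, 30, tzinfo=timezone.utc)))  # False
```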

.lintrunner.toml

Lines changed: 15 additions & 15 deletions
@@ -10,7 +10,7 @@ exclude_patterns = [
     'exir/serde/**',
 ]
 command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -19,7 +19,7 @@ command = [
     '@{{PATHSFILE}}'
 ]
 init_command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -41,7 +41,7 @@ exclude_patterns = [
     'exir/serde/**',
 ]
 command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -50,7 +50,7 @@ command = [
     '@{{PATHSFILE}}'
 ]
 init_command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -83,7 +83,7 @@ exclude_patterns = [
     'runtime/core/portable_type/c10/**',
 ]
 command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -94,7 +94,7 @@ command = [
     '@{{PATHSFILE}}'
 ]
 init_command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -116,7 +116,7 @@ exclude_patterns = [
     '**/third-party/**',
 ]
 command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -126,7 +126,7 @@ command = [
     '@{{PATHSFILE}}',
 ]
 init_command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -150,7 +150,7 @@ exclude_patterns = [
     '**/third-party/**',
 ]
 command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -191,7 +191,7 @@ exclude_patterns = [
     'extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h',
 ]
 command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -226,7 +226,7 @@ exclude_patterns = [
     'util/**',
 ]
 command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -275,7 +275,7 @@ exclude_patterns = [
     'util/**',
 ]
 command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -325,7 +325,7 @@ exclude_patterns = [
     'backends/arm/test/**',
 ]
 command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -337,7 +337,7 @@ command = [
     '@{{PATHSFILE}}'
 ]
 init_command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -356,7 +356,7 @@ exclude_patterns = [
     '.lintrunner.toml',
 ]
 command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',

README.md

Lines changed: 2 additions & 2 deletions
@@ -49,8 +49,8 @@ Key value propositions of ExecuTorch are:
 ## Getting Started
 To get started you can:
 
-- Visit the [Step by Step Tutorial](https://pytorch.org/executorch/main/index) to get things running locally and deploy a model to a device
-- Use this [Colab Notebook](https://pytorch.org/executorch/main/getting-started-setup#quick-setup-colab-jupyter-notebook-prototype) to start playing around right away
+- Visit the [Step by Step Tutorial](https://pytorch.org/executorch/stable/getting-started.html) to get things running locally and deploy a model to a device
+- Use this [Colab Notebook](https://colab.research.google.com/drive/1qpxrXC3YdJQzly3mRg-4ayYiOjC6rue3?usp=sharing) to start playing around right away
 - Jump straight into LLM use cases by following specific instructions for [Llama](examples/models/llama/README.md) and [Llava](examples/models/llava/README.md)
 
 ## Feedback and Engagement

backends/arm/operators/op_avg_pool2d.py

Lines changed: 24 additions & 10 deletions
@@ -85,8 +85,12 @@ def define_node(
     ) -> None:
         import tosa_tools.v0_80.serializer.tosa_serializer as ts  # type: ignore
 
-        input_tensor = inputs[0]
-        assert input_tensor.dtype == ts.DType.INT8
+        supported_dtypes = [ts.DType.INT8]
+        if inputs[0].dtype not in supported_dtypes:
+            raise TypeError(
+                f"IO data type needs to be one of {supported_dtypes}, got "
+                f'"{inputs[0].dtype}"'
+            )
 
         accumulator_type = ts.DType.INT32
 
@@ -118,9 +122,12 @@ def define_node(
     ) -> None:
         import tosa_tools.v0_80.serializer.tosa_serializer as ts  # type: ignore
 
-        assert (
-            inputs[0].dtype == ts.DType.INT8 or inputs[0].dtype == ts.DType.FP32
-        ), "Only FP32 and INT8 supported"
+        supported_dtypes = [ts.DType.INT8, ts.DType.FP32]
+        if inputs[0].dtype not in supported_dtypes:
+            raise TypeError(
+                f"IO data type needs to be one of {supported_dtypes}, got "
+                f'"{inputs[0].dtype}"'
+            )
 
         if inputs[0].dtype == ts.DType.INT8:
             super().define_node(node, tosa_graph, inputs, output)
@@ -205,8 +212,12 @@ def define_node(
     ) -> None:
         import serializer.tosa_serializer as ts  # type: ignore
 
-        input_tensor = inputs[0]
-        assert input_tensor.dtype == ts.DType.INT8
+        supported_dtypes = [ts.DType.INT8]
+        if inputs[0].dtype not in supported_dtypes:
+            raise TypeError(
+                f"IO data type needs to be one of {supported_dtypes}, got "
+                f'"{inputs[0].dtype}"'
+            )
 
         accumulator_type = ts.DType.INT32
 
@@ -241,9 +252,12 @@ def define_node(
     ) -> None:
         import serializer.tosa_serializer as ts  # type: ignore
 
-        assert (
-            inputs[0].dtype == ts.DType.INT8 or inputs[0].dtype == ts.DType.FP32
-        ), "Only FP32 and INT8 supported"
+        supported_dtypes = [ts.DType.INT8, ts.DType.FP32]
+        if inputs[0].dtype not in supported_dtypes:
+            raise TypeError(
+                f"IO data type needs to be one of {supported_dtypes}, got "
+                f'"{inputs[0].dtype}"'
+            )
 
         if inputs[0].dtype == ts.DType.INT8:
             super().define_node(node, tosa_graph, inputs, output)
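The pattern replacing the asserts is worth noting: `assert` statements vanish under `python -O`, while an explicit `raise TypeError` always fires and names the offending dtype in its message. A standalone sketch of the pattern, using a stand-in `Enum` rather than `tosa_serializer`'s real `DType`:

```python
# Hedged sketch of the assert-to-raise pattern used above. DType here is a
# stand-in Enum for illustration, not tosa_serializer's real type.
from enum import Enum

class DType(Enum):
    INT8 = "INT8"
    FP32 = "FP32"
    INT32 = "INT32"

def check_dtype(dtype: DType, supported_dtypes: list[DType]) -> None:
    # Unlike assert, this check survives `python -O` and reports the bad input.
    if dtype not in supported_dtypes:
        raise TypeError(
            f"IO data type needs to be one of {supported_dtypes}, got "
            f'"{dtype}"'
        )

check_dtype(DType.INT8, [DType.INT8, DType.FP32])  # passes silently
# check_dtype(DType.INT32, [DType.INT8])           # would raise TypeError
```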

backends/cadence/aot/tests/test_fusion_ops_passes.py

Lines changed: 0 additions & 1 deletion
@@ -328,7 +328,6 @@ def forward(self, x):
         model = M()
         graph_module = export_to_edge(model, (inputs,)).exported_program().graph_module
         graph_module = FuseQuantDequantToRequantizePass()(graph_module).graph_module
-        graph_module.print_readable()
 
         self.check_op_counts(
             graph_module,

backends/cadence/aot/tests/test_memory_passes.py

Lines changed: 0 additions & 2 deletions
@@ -711,7 +711,6 @@ def forward(self, x) -> torch.Tensor:
             .exported_program()
             .graph_module
         )
-        graph_module.print_readable()
         self.assertEqual(count_node(graph_module, torch.ops.aten._cat_nop.out), 1)
         self.assertEqual(
             count_node(graph_module, torch.ops.aten._slice_copy_nop.Tensor_out), 0
@@ -741,7 +740,6 @@ def forward(self, x) -> torch.Tensor:
             .exported_program()
             .graph_module
         )
-        graph_module.print_readable()
         self.assertEqual(count_node(graph_module, torch.ops.aten._cat_nop.out), 2)
         self.assertEqual(count_node(graph_module, torch.ops.aten.cat.out), 0)
         self.verify_nop_memory_alloc(graph_module)

backends/cadence/aot/tests/test_remove_ops_passes.py

Lines changed: 0 additions & 2 deletions
@@ -100,7 +100,6 @@ def forward(self, t: torch.Tensor):
         p = RemoveNopAddOpPass()
 
         graph_after_passes = cast(PassResult, p(graph_module)).graph_module
-        graph_module.print_readable()
         self.assertEqual(
             count_node(graph_after_passes, exir_ops.edge.aten.add.Tensor),
             0,
@@ -140,7 +139,6 @@ def forward(self, t: torch.Tensor):
         p = RemoveNopMulOpPass()
 
         graph_after_passes = cast(PassResult, p(graph_module)).graph_module
-        graph_module.print_readable()
         self.assertEqual(
             count_node(graph_after_passes, exir_ops.edge.aten.mul.Tensor),
             0,
