Commit e804065

Qualcomm AI Engine Direct - enable operator max_pool3d by decomposition (#15897)

Summary: Enable the max_pool3d operator by applying max_pool2d twice.

Test plan:
python backends/qualcomm/tests/test_qnn_delegate.py TestQNNFloatingPointOperator.test_qnn_backend_max_pool3d -b build-android -H HOST -s DEVICE -m CHIPID
python backends/qualcomm/tests/test_qnn_delegate.py TestQNNQuantizedOperator.test_qnn_backend_max_pool3d -b build-android -H HOST -s DEVICE -m CHIPID
1 parent 0d61efc commit e804065
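
The decomposition works because max pooling is separable across spatial dimensions: the maximum over a kD x kH x kW window equals the maximum over (D, H) followed by the maximum over W. Below is a minimal sketch of that equivalence (not part of the commit; the shapes and pooling parameters are illustrative, and it assumes dilation == 1, which is the only case the decomposed MaxPool2d modules handle):

import torch
import torch.nn.functional as F

x = torch.randn(1, 7, 21, 35, 28)  # (N, C, D, H, W), same shape the tests use
kernel, stride, padding = (3, 3, 3), (1, 1, 1), (1, 1, 1)

# Reference result from a single 3D max pool.
ref = F.max_pool3d(x, kernel, stride, padding)

N, C, D, H, W = x.shape
# Move W into the channel position so max_pool2d pools over (D, H).
t = x.permute(0, 1, 4, 2, 3).reshape(N * C, W, D, H)
t = F.max_pool2d(t, kernel[:2], stride[:2], padding[:2])
# Bring W back to the last dimension and pool over it with a (1, kW) kernel.
t = t.permute(0, 2, 3, 1)
t = F.max_pool2d(t, (1, kernel[2]), (1, stride[2]), (0, padding[2]))
out = t.reshape(N, C, *t.shape[1:])

assert torch.equal(out, ref)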

File tree

6 files changed: +195 −0 lines changed


backends/qualcomm/_passes/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -22,6 +22,7 @@
 from .decompose_floor_divide import DecomposeFloorDivide
 from .decompose_glu import DecomposeGlu
 from .decompose_linalg_vector_norm import DecomposeLinalgVectorNorm
+from .decompose_maxpool3d import DecomposeMaxPool3d
 from .decompose_minmaxdim import DecomposeMinMaxDim
 from .decompose_roll import DecomposeRoll
 from .decompose_silu import DecomposeSilu
@@ -68,6 +69,7 @@
     DecomposeFloorDivide,
     DecomposeGlu,
     DecomposeLinalgVectorNorm,
+    DecomposeMaxPool3d,
     DecomposeMinMaxDim,
     DecomposeRoll,
     DecomposeSilu,
backends/qualcomm/_passes/decompose_maxpool3d.py

Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
# Copyright (c) Qualcomm Innovation Center, Inc.
# All rights reserved
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import warnings
from typing import cast, List

import torch
import torch.nn as nn
from executorch.exir import to_edge
from executorch.exir.pass_base import ExportPass, PassResult

from .utils import merge_decomposed_graph


class ModelMaxPool3D(torch.nn.Module):
    def __init__(
        self, filter_size, stride, padding, dilation, return_indices, ceil_mode
    ):
        super().__init__()

        self.pool2d_hw = nn.MaxPool2d(
            kernel_size=[1, filter_size[2]],  # (H, W) part
            stride=[1, stride[2]],
            padding=[0, padding[2]],
            return_indices=return_indices,
            ceil_mode=ceil_mode,
        )
        self.pool2d_dh = nn.MaxPool2d(
            kernel_size=filter_size[:2],  # (D, H) part
            stride=stride[:2],
            padding=padding[:2],
            return_indices=return_indices,
            ceil_mode=ceil_mode,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        N, C, D, H, W = x.shape
        x_ = x.permute(0, 1, 4, 2, 3)
        x1_1d = x_.reshape(N * C, W, D, H)
        # first pool over (D, H)
        out_pool1d_0 = self.pool2d_dh(x1_1d)
        D_out = out_pool1d_0.shape[2]
        # NC, W, D, H -> NC, D, H, W
        x1b = out_pool1d_0.permute(0, 2, 3, 1)
        # second pool over (H, W)
        out4d = self.pool2d_hw(x1b)
        H_out2 = out4d.shape[2]
        W_out = out4d.shape[3]
        out = out4d.reshape(N, C, D_out, H_out2, W_out)
        return out


class DecomposeMaxPool3d(ExportPass):
    # max_pool3d is not yet supported by QNN.
    # Decompose: input -> permute -> reshape -> max_pool2d -> permute -> max_pool2d -> reshape -> output

    def __init__(self, quantization_capture=False) -> None:
        super().__init__()
        self.quantization_capture = quantization_capture

    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
        graph = graph_module.graph
        for node in graph.nodes:
            if node.op == "call_function" and "max_pool3d" in str(node.target):
                # kernel info
                filter_size = cast(List[int], node.args[1])
                if len(filter_size) == 1:
                    filter_size *= 3

                num_args = len(node.args)

                # stride info
                stride = filter_size
                if num_args > 2:
                    stride = cast(List[int], node.args[2])
                    if len(stride) == 1:
                        stride *= 3

                # padding info
                padding = [0, 0, 0]
                if num_args > 3:
                    padding = cast(List[int], node.args[3])
                    if len(padding) == 1:
                        padding *= 3

                # dilation info
                dilation = [1, 1, 1]
                if num_args > 4:
                    dilation = cast(List[int], node.args[4])
                    if len(dilation) == 1:
                        dilation *= 3

                ceil_mode = node.args[5] if num_args > 5 else False
                return_indices = node.args[6] if num_args > 6 else False
                if return_indices:
                    warnings.warn(
                        "[QNN Delegate Op Builder]: The case return_indices=True is not supported, fallback",
                        stacklevel=1,
                    )
                    return

                model = ModelMaxPool3D(
                    filter_size, stride, padding, dilation, return_indices, ceil_mode
                )
                if self.quantization_capture:
                    decomposed_module = torch.export.export(
                        model, (node.args[0].meta["val"],), strict=True
                    ).module()
                else:
                    edge_mgr = to_edge(
                        torch.export.export(
                            model, (node.args[0].meta["val"],), strict=True
                        )
                    )
                    decomposed_module = edge_mgr.exported_program()

                with graph.inserting_before(node):
                    # remap is used to map original node values to new node values,
                    # which ensures that references to nodes are correctly updated in the new graph
                    remap = {"x": node.args[0]}
                    merge_decomposed_graph(
                        remap=remap,
                        target_node=node,
                        target_graph=graph,
                        decomposed_graph_module=decomposed_module,
                    )
                    graph.erase_node(node)

        graph.eliminate_dead_code()
        graph_module.recompile()
        return PassResult(graph_module, True)
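
For reference, a rough sketch of exercising the new pass in isolation (not from the commit; the module M and the direct invocation below are illustrative assumptions — in practice the pass is scheduled through QnnPassManager, as shown in the qnn_pass_manager.py diff that follows):

import torch
from executorch.backends.qualcomm._passes import DecomposeMaxPool3d

class M(torch.nn.Module):  # illustrative module, not part of the commit
    def forward(self, x):
        return torch.nn.functional.max_pool3d(x, kernel_size=3, stride=1, padding=1)

ep = torch.export.export(M(), (torch.randn(1, 7, 21, 35, 28),), strict=True)
# ExportPass instances are callable on a GraphModule and return a PassResult;
# quantization_capture=True keeps the decomposition in the ATen dialect.
result = DecomposeMaxPool3d(quantization_capture=True)(ep.graph_module)
# After the pass, the graph should contain max_pool2d/permute/reshape nodes
# instead of the original max_pool3d call.
result.graph_module.print_readable()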

backends/qualcomm/_passes/qnn_pass_manager.py

Lines changed: 3 additions & 0 deletions
@@ -27,6 +27,7 @@
     DecomposeFloorDivide,
     DecomposeGlu,
     DecomposeLinalgVectorNorm,
+    DecomposeMaxPool3d,
     DecomposeMinMaxDim,
     DecomposeRoll,
     DecomposeSilu,
@@ -98,6 +99,7 @@ def get_capture_program_passes():
         (FoldQDQ, True),
         (I64toI32, True),
         (LayoutTransform, True),
+        (DecomposeMaxPool3d, True),
         (RecomposePixelUnshuffle, True),
         (RecomposeRmsNorm, True),
         (Remove0DTensor, True),
@@ -201,6 +203,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(ReplaceArangeArgs())
         self.add_pass(DecomposeBinaryAlpha())
         self.add_pass(DecomposeCDist())
+        self.add_pass(DecomposeMaxPool3d(quantization_capture=True))
         self.add_pass(DecomposeScaledDotProductAttention())
         self.add_pass(DecomposeRoll())
         self.add_pass(DecomposeSilu())

backends/qualcomm/_passes/utils.py

Lines changed: 2 additions & 0 deletions
@@ -69,6 +69,7 @@ def get_passes_dependency_for_capture_program():
         DecomposeAny,
         DecomposeColIm,
         DecomposeLinalgVectorNorm,
+        DecomposeMaxPool3d,
         ExpandBroadcastTensorShape,
         FixedLinearKeepDim,
         FoldQDQ,
@@ -93,6 +94,7 @@ def get_passes_dependency_for_capture_program():
         DecomposeAny: [RemoveRedundancy],
         DecomposeColIm: [FoldQDQ],
         DecomposeLinalgVectorNorm: [RemoveRedundancy],
+        DecomposeMaxPool3d: [RemoveRedundancy],
         ExpandBroadcastTensorShape: [FoldQDQ],
         FixedLinearKeepDim: [FoldQDQ],
         FoldQDQ: [AnnotateQuantAttrs, AnnotateStack, AnnotateUnbind],

backends/qualcomm/tests/models.py

Lines changed: 18 additions & 0 deletions
@@ -1466,6 +1466,24 @@ def forward(self, x):
         return self.max_pool2d(x)


+class MaxPool3d(torch.nn.Module):
+    def __init__(
+        self, kernel_size, stride, padding, dilation, ceil_mode, return_indices
+    ):
+        super().__init__()
+        self.max_pool3d = torch.nn.MaxPool3d(
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            return_indices=return_indices,
+            ceil_mode=ceil_mode,
+        )
+
+    def forward(self, x):
+        return self.max_pool3d(x)
+
+
 class Mean(torch.nn.Module):
     def __init__(
         self,
backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 37 additions & 0 deletions
@@ -1406,6 +1406,24 @@ def test_qnn_backend_max_pool2d(self):
         sample_input = (torch.randn(4, 3, 24, 24),)
         self.lower_module_and_test_output(module, sample_input)

+    def test_qnn_backend_max_pool3d(self):
+        # NOTE: The pad should be at most half of effective kernel size.
+        modules = [
+            MaxPool3d((3), (1), (1), (1), False, False),  # noqa: F405
+            MaxPool3d((7), (1), (3), (1), False, False),  # noqa: F405
+            MaxPool3d((7), (1), (3), (1), True, False),  # noqa: F405
+            MaxPool3d(  # noqa: F405
+                (7, 7, 7), (1, 1, 1), (3, 3, 3), (1, 1, 1), True, False
+            ),  # noqa: F405
+            MaxPool3d(  # noqa: F405
+                (7, 9, 13), (1, 1, 1), (3, 4, 6), (1, 1, 1), False, False
+            ),  # noqa: F405
+        ]
+        sample_input = (torch.randn(1, 7, 21, 35, 28),)
+        for i, module in enumerate(modules):
+            with self.subTest(i=i):
+                self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_mean(self):
         test_comb = [
             # Reduce over last two dims, keepdim=True
@@ -3636,6 +3654,25 @@ def test_qnn_backend_max_pool2d(self):
         module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(module, sample_input)

+    def test_qnn_backend_max_pool3d(self):
+        # NOTE: The pad should be at most half of effective kernel size.
+        modules = [
+            MaxPool3d((3), (1), (1), (1), False, False),  # noqa: F405
+            MaxPool3d((7), (1), (3), (1), False, False),  # noqa: F405
+            MaxPool3d((7), (1), (3), (1), True, False),  # noqa: F405
+            MaxPool3d(  # noqa: F405
+                (7, 7, 7), (1, 1, 1), (3, 3, 3), (1, 1, 1), True, False
+            ),  # noqa: F405
+            MaxPool3d(  # noqa: F405
+                (7, 9, 13), (1, 1, 1), (3, 4, 6), (1, 1, 1), False, False
+            ),  # noqa: F405
+        ]
+        sample_input = (torch.randn(1, 7, 21, 35, 28),)
+        for i, module in enumerate(modules):
+            with self.subTest(i=i):
+                module = self.get_qdq_module(module, sample_input)
+                self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_mean(self):
         test_comb = [
             # Reduce over last two dims, keepdim=True