InfiniTensor
diff --git a/operatorspy/tests/avg_pool.py
Lines changed: 22 additions & 5 deletions b/operatorspy/tests/avg_pool.py
Lines changed: 22 additions & 5 deletions
diff --git a/operatorspy/tests/conv.py
Lines changed: 43 additions & 28 deletions b/operatorspy/tests/conv.py
Lines changed: 43 additions & 28 deletions
diff --git a/operatorspy/tests/max_pool.py
Lines changed: 17 additions & 3 deletions b/operatorspy/tests/max_pool.py
Lines changed: 17 additions & 3 deletions
diff --git a/src/ops/add/musa/add_musa.mu
Lines changed: 3 additions & 3 deletions b/src/ops/add/musa/add_musa.mu
Lines changed: 3 additions & 3 deletions
diff --git a/src/ops/conv/musa/conv_musa.cc
Lines changed: 120 additions & 0 deletions b/src/ops/conv/musa/conv_musa.cc
Lines changed: 120 additions & 0 deletions
diff --git a/src/ops/conv/musa/conv_musa.h
Lines changed: 45 additions & 0 deletions b/src/ops/conv/musa/conv_musa.h
Lines changed: 45 additions & 0 deletions
@@ -89,8 +89,8 @@ def test(
         f"Testing AvgPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
     )
 
-    x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
-    y = torch.rand(inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype).to(torch_device)
+    x = torch.ones(x_shape, dtype=tensor_dtype).to(torch_device)
+    y = torch.zeros(inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype).to(torch_device)
 
     for i in range(NUM_PRERUN if PROFILE else 1):
         ans = pool(x, k_shape, padding, strides)
@@ -152,6 +152,10 @@ def test(
         elapsed = (time.time() - start_time) / NUM_ITERATIONS
         print(f"    lib time: {elapsed :6f}")
 
+
+    print(x)
+    print(y)
+    print(ans)
     assert torch.allclose(y, ans, atol=0, rtol=1e-3)
     check_error(lib.infiniopDestroyAvgPoolDescriptor(descriptor))
 
@@ -184,12 +188,23 @@ def test_bang(lib, test_cases):
         test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
     destroy_handle(lib, handle)
 
+def test_musa(lib, test_cases):
+    """Run every AvgPool test case on a MUSA device using float32 tensors.
+
+    Args:
+        lib: Operator library object forwarded to ``create_handle``,
+            ``test`` and ``destroy_handle``.
+        test_cases: Iterable of ``(x_shape, kernel_shape, padding, strides)``
+            tuples.
+    """
+    # Imported for its side effect only; presumably this registers the
+    # "musa" torch device used below -- TODO confirm.
+    import torch_musa  # noqa: F401
+
+    handle = create_handle(lib, DeviceEnum.DEVICE_MUSA)
+    try:
+        for x_shape, kernel_shape, padding, strides in test_cases:
+            # NOTE: only float32 is exercised here; the float16 run is disabled.
+            test(lib, handle, "musa", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
+    finally:
+        # Release the handle even when a case fails its assertion.
+        destroy_handle(lib, handle)
+
 
 if __name__ == "__main__":
     test_cases = [
         # x_shape, kernel_shape, padding, strides
-        ((1, 1, 10), (3,), (1,), (1,)),
-        ((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)),
+        # ((1, 1, 10), (3,), (1,), (1,)),
+        ((1, 1, 2, 2), (2, 2), (1, 1), (1, 1)),
+        ((32, 4, 224, 224), (3, 3), (1, 1), (2, 2)),
         ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)),
     ]
     args = get_args()
@@ -230,6 +245,8 @@ def test_bang(lib, test_cases):
         test_cuda(lib, test_cases)
     if args.bang:
         test_bang(lib, test_cases)
-    if not (args.cpu or args.cuda or args.bang):
+    if args.musa:
+        test_musa(lib, test_cases)
+    if not (args.cpu or args.cuda or args.bang or args.musa):
         test_cpu(lib, test_cases)
     print("\033[92mTest passed!\033[0m")
@@ -39,22 +39,25 @@ class ConvDescriptor(Structure):
 
 
 def conv(x, w, stride, padding, dilation):
-    match len(x.shape) - 2:
-        case 1:
-            return F.conv1d(
-                x, w, stride=stride, padding=padding, dilation=dilation
-            )
-        case 2:
-            return F.conv2d(
-                x, w, stride=stride, padding=padding, dilation=dilation
-            )
-        case 3:
-            return F.conv3d(
-                x, w, stride=stride, padding=padding, dilation=dilation
-            )
-        case _:
-            print("Error: Pytorch -> Unsupported tensor dimension")
-            return None
+    """Compute the PyTorch reference convolution for ``x`` and ``w``.
+
+    The number of dimensions of ``x`` minus two selects ``F.conv1d``,
+    ``F.conv2d`` or ``F.conv3d``.
+
+    Args:
+        x: Input tensor.
+        w: Weight tensor.
+        stride: Forwarded as the ``stride`` keyword.
+        padding: Forwarded as the ``padding`` keyword.
+        dilation: Forwarded as the ``dilation`` keyword.
+
+    Returns:
+        The convolution result, or ``None`` (after printing an error) when
+        the rank of ``x`` maps to none of the three functions.
+    """
+    conv_func_map = {
+        1: F.conv1d,
+        2: F.conv2d,
+        3: F.conv3d,
+    }
+
+    conv_func = conv_func_map.get(len(x.shape) - 2)
+    if conv_func is None:
+        print("Error: Pytorch -> Unsupported tensor dimension")
+        return None
+
+    ans = conv_func(x, w, stride=stride, padding=padding, dilation=dilation)
+    if PROFILE:
+        # Wait on the backend that holds x (torch.cuda, torch.musa, ...)
+        # instead of always assuming CUDA, so profiling timings are valid
+        # on every device this test is run against.
+        backend = getattr(torch, x.device.type, None)
+        synchronize = getattr(backend, "synchronize", None)
+        if callable(synchronize):
+            synchronize()
+    return ans
 
 
 # infer the shape of the output given the inputs for a N-ary convolution
@@ -206,18 +209,28 @@ def test_bang(lib, test_cases):
         test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
     destroy_handle(lib, handle)
 
+def test_musa(lib, test_cases):
+    """Run every convolution test case on a MUSA device using float32 tensors.
+
+    Args:
+        lib: Operator library object forwarded to ``create_handle``,
+            ``test`` and ``destroy_handle``.
+        test_cases: Iterable of ``(x_shape, w_shape, pads, strides,
+            dilations, x_strides)`` tuples.
+    """
+    # Imported for its side effect only; presumably this registers the
+    # "musa" torch device used below -- TODO confirm.
+    import torch_musa  # noqa: F401
+
+    handle = create_handle(lib, DeviceEnum.DEVICE_MUSA)
+    try:
+        for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
+            # NOTE: only float32 is exercised here; the float16 run is disabled.
+            test(lib, handle, "musa", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
+    finally:
+        # Release the handle even when a case raises.
+        destroy_handle(lib, handle)
+
 
 if __name__ == "__main__":
     test_cases = [
         # x_shape, w_shape, pads, strides, dilations, x_strides
-        (
-            (32, 3, 4),
-            (32, 3, 5),
-            (1,),
-            (1,),
-            (1,),
-            None,
-        ),
+        # (
+        #     (32, 3, 4),
+        #     (32, 3, 5),
+        #     (1,),
+        #     (1,),
+        #     (1,),
+        #     None,
+        # ),
         (
             (1, 3, 4, 4),
             (2, 3, 3, 3),
@@ -228,9 +241,9 @@ def test_bang(lib, test_cases):
         ),
         (
             (32, 3, 128, 128),
-            (64, 3, 5, 5),
-            (2, 2),
-            (2, 2),
+            (1, 3, 3, 3),
+            (1, 1),
+            (1, 1),
             (1, 1),
             None,
         ),
@@ -286,6 +299,8 @@ def test_bang(lib, test_cases):
         test_cuda(lib, test_cases)
     if args.bang:
         test_bang(lib, test_cases)
-    if not (args.cpu or args.cuda or args.bang):
+    if args.musa:
+        test_musa(lib, test_cases)
+    if not (args.cpu or args.cuda or args.bang or args.musa):
         test_cpu(lib, test_cases)
     print("\033[92mTest passed!\033[0m")
@@ -88,7 +88,7 @@ def test(
 
     x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
     y = torch.rand(inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype).to(torch_device)
-
+    
     for i in range(NUM_PRERUN if PROFILE else 1):
         ans = pool(x, k_shape, padding, strides)
     if PROFILE:
@@ -148,7 +148,9 @@ def test(
             )
         elapsed = (time.time() - start_time) / NUM_ITERATIONS
         print(f"    lib time: {elapsed :6f}")
-
+    print(x)
+    print(y)
+    print(ans)
     assert torch.allclose(y, ans, atol=0, rtol=1e-3)
     check_error(lib.infiniopDestroyMaxPoolDescriptor(descriptor))
 
@@ -181,6 +183,16 @@ def test_bang(lib, test_cases):
         test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
     destroy_handle(lib, handle)
 
+def test_musa(lib, test_cases):
+    """Run every MaxPool test case on a MUSA device using float32 tensors.
+
+    Args:
+        lib: Operator library object forwarded to ``create_handle``,
+            ``test`` and ``destroy_handle``.
+        test_cases: Iterable of ``(x_shape, kernel_shape, padding, strides)``
+            tuples.
+    """
+    # Imported for its side effect only; presumably this registers the
+    # "musa" torch device used below -- TODO confirm.
+    import torch_musa  # noqa: F401
+
+    handle = create_handle(lib, DeviceEnum.DEVICE_MUSA)
+    try:
+        for x_shape, kernel_shape, padding, strides in test_cases:
+            # NOTE: only float32 is exercised here; the float16 run is disabled.
+            test(lib, handle, "musa", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
+    finally:
+        # Release the handle even when a case fails its assertion.
+        destroy_handle(lib, handle)
+
 
 if __name__ == "__main__":
     test_cases = [
@@ -227,6 +239,8 @@ def test_bang(lib, test_cases):
         test_cuda(lib, test_cases)
     if args.bang:
         test_bang(lib, test_cases)
-    if not (args.cpu or args.cuda or args.bang):
+    if args.musa:
+        test_musa(lib, test_cases)
+    if not (args.cpu or args.cuda or args.bang or args.musa):
         test_cpu(lib, test_cases)
     print("\033[92mTest passed!\033[0m")
@@ -69,7 +69,7 @@ __global__ void add(
 }
 
 template<typename Tdata, typename BTdata>
-void _add_nv_gpu(AddMusaDescriptor_t desc, Tdata *c, Tdata const *a, Tdata const *b, uint64_t data_size, uint64_t pack_size, uint64_t offset, void *stream) {
+void _add_mt_gpu(AddMusaDescriptor_t desc, Tdata *c, Tdata const *a, Tdata const *b, uint64_t data_size, uint64_t pack_size, uint64_t offset, void *stream) {
     if (data_size == 0) {
         return;
     }
@@ -92,13 +92,13 @@ infiniopStatus_t add_mt_gpu(AddMusaDescriptor_t desc, void *c, void const *a, vo
     const auto a_vec = reinterpret_cast<const Tdata *>(a);
     const auto b_vec = reinterpret_cast<const Tdata *>(b);
     const auto c_vec = reinterpret_cast<Tdata *>(c);
-    _add_nv_gpu<Tdata, TIdata>(desc, c_vec, a_vec, b_vec, data_size, pack_size, 0, stream);
+    _add_mt_gpu<Tdata, TIdata>(desc, c_vec, a_vec, b_vec, data_size, pack_size, 0, stream);
 
     const auto remainder = desc->c_data_size % pack_size;
     const auto a_ = reinterpret_cast<const TIdata *>(a);
     const auto b_ = reinterpret_cast<const TIdata *>(b);
     const auto c_ = reinterpret_cast<TIdata *>(c);
-    _add_nv_gpu<TIdata, TIdata>(desc, c_, a_, b_, remainder, 1, data_size * pack_size, stream);
+    _add_mt_gpu<TIdata, TIdata>(desc, c_, a_, b_, remainder, 1, data_size * pack_size, stream);
     return STATUS_SUCCESS;
 }
 
 
@@ -0,0 +1,120 @@
+#include "conv_musa.h"
+#include "../../../devices/musa/common_musa.h"
+#include "../../utils.h"
+#include <vector>
+
+// Creates a muDNN-backed convolution descriptor for y = conv(x, w).
+//
+// Checks that y, x and w have the same rank (at least 3), that their leading
+// dimensions agree, and that all three share a dtype of F16 or F32. On
+// success *desc_ptr receives a heap-allocated descriptor that owns the muDNN
+// tensor and convolution objects created here.
+//
+// pads and dilations are read as uint64_t arrays and strides as an int64_t
+// array, each with (ndim - 2) entries. The parameter n is not used by this
+// backend.
+//
+// Returns STATUS_BAD_TENSOR_SHAPE or STATUS_BAD_TENSOR_DTYPE when validation
+// fails, STATUS_SUCCESS otherwise.
+infiniopStatus_t musaCreateConvDescriptor(MusaHandle_t handle,
+                                          ConvMusaDescriptor_t *desc_ptr,
+                                          infiniopTensorDescriptor_t y,
+                                          infiniopTensorDescriptor_t x,
+                                          infiniopTensorDescriptor_t w,
+                                          void const *pads,
+                                          void const *strides,
+                                          void const *dilations,
+                                          uint64_t n) {
+    (void) n;
+
+    uint64_t ndim = y->ndim;
+    if (ndim < 3 || ndim != x->ndim || ndim != w->ndim) {
+        return STATUS_BAD_TENSOR_SHAPE;
+    }
+    if (x->shape[0] != y->shape[0] || w->shape[0] != y->shape[1] || x->shape[1] != w->shape[1]) {
+        return STATUS_BAD_TENSOR_SHAPE;
+    }
+    if (y->dt != F16 && y->dt != F32) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+    if (y->dt != x->dt || y->dt != w->dt) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    // Tensors are described with at least 4 dimensions; the extra trailing
+    // entries keep their defaults (extent 1, pad 0, stride 1, dilation 1).
+    const uint64_t new_ndim = ndim < 4 ? 4 : ndim;
+    const uint64_t spatial_ndim = ndim - 2;
+
+    // Scratch arrays handed to muDNN below; they are released automatically
+    // when this function returns.
+    std::vector<int> pad(new_ndim, 0);
+    std::vector<int> stride(new_ndim, 1);
+    std::vector<int> dilation(new_ndim, 1);
+    std::vector<int64_t> x_shape(new_ndim, 1);
+    std::vector<int64_t> w_shape(new_ndim, 1);
+    std::vector<int64_t> y_shape(new_ndim, 1);
+
+    auto pads_ = reinterpret_cast<uint64_t const *>(pads);
+    auto strides_ = reinterpret_cast<int64_t const *>(strides);
+    auto dilations_ = reinterpret_cast<uint64_t const *>(dilations);
+    for (uint64_t i = 0; i < spatial_ndim; ++i) {
+        pad[i] = static_cast<int>(pads_[i]);
+        stride[i] = static_cast<int>(strides_[i]);
+        dilation[i] = static_cast<int>(dilations_[i]);
+    }
+    for (uint64_t i = 0; i < ndim; ++i) {
+        x_shape[i] = static_cast<int64_t>(x->shape[i]);
+        w_shape[i] = static_cast<int64_t>(w->shape[i]);
+        y_shape[i] = static_cast<int64_t>(y->shape[i]);
+    }
+
+    musa::dnn::Tensor *x_tensor = new musa::dnn::Tensor();
+    musa::dnn::Tensor *y_tensor = new musa::dnn::Tensor();
+    musa::dnn::Tensor *w_tensor = new musa::dnn::Tensor();
+
+    // The dtype was validated above, so anything that is not F16 is F32.
+    const auto tensor_type = y->dt == F16 ? musa::dnn::Tensor::Type::HALF
+                                          : musa::dnn::Tensor::Type::FLOAT;
+    x_tensor->SetType(tensor_type);
+    y_tensor->SetType(tensor_type);
+    w_tensor->SetType(tensor_type);
+
+    x_tensor->SetFormat(musa::dnn::Tensor::Format::NCHW);
+    y_tensor->SetFormat(musa::dnn::Tensor::Format::NCHW);
+    w_tensor->SetFormat(musa::dnn::Tensor::Format::NCHW);
+
+    x_tensor->SetNdInfo((int) new_ndim, x_shape.data());
+    y_tensor->SetNdInfo((int) new_ndim, y_shape.data());
+    w_tensor->SetNdInfo((int) new_ndim, w_shape.data());
+
+    musa::dnn::Convolution *conv_operator = new musa::dnn::Convolution();
+    conv_operator->SetNdInfo((int) new_ndim - 2, pad.data(), stride.data(), dilation.data());
+
+    musa::dnn::Convolution::Algorithm algo = musa::dnn::Convolution::Algorithm::DIRECT;
+    // TODO: the forward workspace size is not queried yet, so the size
+    // stored in the descriptor stays 0.
+    size_t workspace_size = 0;
+
+    use_mudnn(handle->mudnn_handles_t, handle->device_id, nullptr, [&](musa::dnn::Handle *mudnn_handle) {
+        const auto status = conv_operator->GetRecommendForwardAlgorithm(*mudnn_handle, algo, *y_tensor, *x_tensor, *w_tensor);
+        if (status != musa::dnn::Status::SUCCESS) {
+            // Best effort: fall back to the default algorithm when muDNN
+            // cannot recommend one.
+            algo = musa::dnn::Convolution::Algorithm::DIRECT;
+        }
+    });
+    const float alpha = 1.0f;
+    const float beta = 0.0f;
+
+    *desc_ptr = new ConvMusaDescriptor{
+        DevMtGpu,
+        y->dt,
+        handle->device_id,
+        handle->mudnn_handles_t,
+        x_tensor,
+        w_tensor,
+        y_tensor,
+        conv_operator,
+        algo,
+        alpha,
+        beta,
+        workspace_size};
+
+    return STATUS_SUCCESS;
+}
+
+// Writes the workspace size stored in the descriptor to *size and returns
+// STATUS_SUCCESS. Neither pointer is checked for null.
+infiniopStatus_t musaGetConvWorkspaceSize(ConvMusaDescriptor_t desc, uint64_t *size) {
+    *size = desc->workspace_size;
+    return STATUS_SUCCESS;
+}
+
+// Destroys a descriptor created by musaCreateConvDescriptor, together with
+// the muDNN tensor and convolution objects it points to. Always returns
+// STATUS_SUCCESS. desc must not be null.
+infiniopStatus_t musaDestroyConvDescriptor(ConvMusaDescriptor_t desc) {
+    // These objects are allocated with `new` in musaCreateConvDescriptor and
+    // are not released anywhere else in this file, so free them here to
+    // avoid leaking them on every descriptor.
+    delete desc->x_tensor;
+    delete desc->w_tensor;
+    delete desc->y_tensor;
+    delete desc->conv_operator;
+
+    desc->mudnn_handles_t = nullptr;
+    delete desc;
+    return STATUS_SUCCESS;
+}
@@ -0,0 +1,45 @@
+#ifndef __MUSA_CONV_H__
+#define __MUSA_CONV_H__
+
+#include "../../../devices/musa/common_musa.h"
+#include "../../../devices/musa/musa_handle.h"
+#include "operators.h"
+#include <mudnn.h>
+
+// State kept between creating a MUSA convolution descriptor and running or
+// destroying it. Instances are built by musaCreateConvDescriptor.
+struct ConvMusaDescriptor {
+    // Device kind tag (set to DevMtGpu by musaCreateConvDescriptor).
+    Device device;
+    // Element type shared by x, w and y (copied from y's descriptor).
+    DT dtype;
+    // Device id copied from the MUSA handle.
+    int device_id;
+    // muDNN handle pool shared with the MUSA handle.
+    std::shared_ptr<Pool<musa::dnn::Handle>> mudnn_handles_t;
+    // muDNN tensor descriptions; allocated with `new` by the create function.
+    musa::dnn::Tensor* x_tensor;
+    musa::dnn::Tensor* w_tensor;
+    musa::dnn::Tensor* y_tensor;
+    // muDNN convolution object configured with pads, strides and dilations.
+    musa::dnn::Convolution* conv_operator;
+    // Forward algorithm selected when the descriptor was created.
+    musa::dnn::Convolution::Algorithm algo;
+    // Scaling factors; the create function sets alpha = 1 and beta = 0.
+    const float alpha;
+    const float beta;
+    // Value reported by musaGetConvWorkspaceSize.
+    uint64_t workspace_size;
+};
+
+typedef struct ConvMusaDescriptor *ConvMusaDescriptor_t;
+
+infiniopStatus_t musaCreateConvDescriptor(MusaHandle_t,
+                                          ConvMusaDescriptor_t *,
+                                          infiniopTensorDescriptor_t y,
+                                          infiniopTensorDescriptor_t x,
+                                          infiniopTensorDescriptor_t w,
+                                          void const *pads,
+                                          void const *strides,
+                                          void const *dilations,
+                                          uint64_t n);
+
+infiniopStatus_t musaGetConvWorkspaceSize(ConvMusaDescriptor_t desc, uint64_t *size);
+
+infiniopStatus_t musaConv(ConvMusaDescriptor_t desc,
+                          void *workspace, uint64_t workspace_size,
+                          void *y, void const *x, void const *w,
+                          void *stream);
+
+infiniopStatus_t musaDestroyConvDescriptor(ConvMusaDescriptor_t desc);
+
+#endif