
Commit 2258c1e

Chamberlain0w0 authored and YdrMaster committed

fix: resolve some warnings and move the sync operations out of the operators
1 parent 8bde8c1 commit 2258c1e
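The same pattern repeats across every kernel below: the lambda a kernel's lower() returns now only enqueues work on the CNNL queue, and whoever consumes the outputs (in this commit, the tests) synchronizes explicitly. A minimal sketch of the before/after shape; cnnlSomeOp is a stand-in for whichever CNNL call a given kernel makes, not a real API:

// Before this commit: every routine drained the queue itself.
auto routineBefore = [d](Resources &res, void *workspace,
                         void const *const *inputs, void *const *outputs) {
    CNNL_ASSERT(cnnlSomeOp(/* ... */));// enqueue the op
    BANG_ASSERT(cnrtQueueSync(res.fetchOrStore<CnnlContext>()->queue));// block here
};

// After: the routine returns as soon as the work is enqueued;
// the caller syncs once, right before it reads the outputs back.
auto routineAfter = [d](Resources &res, void *workspace,
                        void const *const *inputs, void *const *outputs) {
    CNNL_ASSERT(cnnlSomeOp(/* ... */));// enqueue only
};
// ...at the call site, e.g. in the tests:
kernel::bang::sync();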

File tree

35 files changed: +67 -65 lines changed

src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc

Lines changed: 2 additions & 3 deletions

@@ -128,8 +128,8 @@ namespace refactor::kernel {
     auto y = outputs[0];

     void *xTrans = workspace;
-    void *yTrans = xTrans + xTransSize;
-    void *cursor = yTrans + xTransSize;
+    void *yTrans = reinterpret_cast<uint8_t *>(xTrans) + xTransSize;
+    void *cursor = reinterpret_cast<uint8_t *>(yTrans) + xTransSize;

     // transpose NCHW input to NHWC
     CNNL_ASSERT(cnnlTranspose_v2(handle, d->NCHW2NHWC, d->inDesc, x,
@@ -147,7 +147,6 @@ namespace refactor::kernel {
     CNNL_ASSERT(cnnlTranspose_v2(handle, d->NHWC2NCHW, d->inDescTrans, yTrans,
                                  d->inDesc, y, cursor, workspaceSize));

-    BANG_ASSERT(cnrtQueueSync(res.fetchOrStore<CnnlContext>()->queue));
 };

 return {std::move(routine), totalWorkspaceSize};
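The reinterpret_cast<uint8_t *> edits here (and in the conv and pool kernels below) fix one family of the warnings: arithmetic on void * is a GNU extension, not standard C++, so the workspace is now carved up through a byte-typed pointer. A self-contained sketch of the pattern, with an illustrative size rather than the commit's real values:

#include <cstddef>
#include <cstdint>

void *partition(void *workspace) {
    constexpr std::size_t xTransSize = 1024;// illustrative size, not from the commit
    void *xTrans = workspace;
    // void *yTrans = xTrans + xTransSize;  // non-standard: arithmetic on void*
    void *yTrans = reinterpret_cast<std::uint8_t *>(xTrans) + xTransSize;// well-defined byte offset
    return yTrans;
}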

src/04kernel/src/kernels/cast/cnnl_kernel.cc

Lines changed: 0 additions & 1 deletion

@@ -65,7 +65,6 @@ namespace refactor::kernel {
 return [d = std::move(d)](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) {
     CNNL_ASSERT(cnnlCastDataType(res.fetchOrStore<CnnlContext>()->handle,
                                  d->inDesc, inputs[0], d->cast, d->outDesc, outputs[0]));
-    // BANG_ASSERT(cnrtQueueSync(res.fetchOrStore<CnnlContext>()->queue));
 };
 }

src/04kernel/src/kernels/clip/cnnl_kernel.cc

Lines changed: 0 additions & 1 deletion

@@ -57,7 +57,6 @@ namespace refactor::kernel {
         CNNL_POINTER_MODE_DEVICE, d->t,
         inputs[0], inputs[1], hasMax ? inputs[2] : nullptr,
         d->t, outputs[0]));
-    BANG_ASSERT(cnrtQueueSync(res.fetchOrStore<CnnlContext>()->queue));
 };
 }

src/04kernel/src/kernels/concat/cnnl_kernel.cc

Lines changed: 2 additions & 2 deletions

@@ -52,7 +52,7 @@ namespace refactor::kernel {
     }
     ~Descriptors() noexcept(false) {
         CNNL_ASSERT(cnnlDestroyTensorDescriptor(in));
-        for (auto i = 0; i < out.size(); i++) {
+        for (size_t i = 0; i < out.size(); i++) {
             CNNL_ASSERT(cnnlDestroyTensorDescriptor(out[i]));
         }
     }
@@ -62,7 +62,7 @@ namespace refactor::kernel {
 };
 auto d = std::make_shared<Descriptors>(info.num, info.dataType != DT::F64);
 setCnnlTensor(d->in, info.dataType, slice(info.inDim.data(), info.inDim.size()));
-for (auto i = 0; i < info.outDims.size(); i++) {
+for (size_t i = 0; i < info.outDims.size(); i++) {
     setCnnlTensor(d->out[i], info.dataType, slice(info.outDims[i].data(), info.outDims[i].size()));
 }
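The loop-index changes in this file (and in the slice and split kernels and the gather test below) fix the other warning family, -Wsign-compare: auto i = 0 deduces int, which is then compared against the unsigned size_t that std::vector::size() returns. A minimal sketch of the warning and the fix:

#include <cstddef>
#include <vector>

void zeroAll(std::vector<int> &out) {
    // for (auto i = 0; i < out.size(); i++)  // warns: signed int vs unsigned size_t
    for (std::size_t i = 0; i < out.size(); i++) {// index type matches size()
        out[i] = 0;
    }
}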

src/04kernel/src/kernels/conv/cnnl_kernel.cc

Lines changed: 3 additions & 3 deletions

@@ -209,9 +209,9 @@ namespace refactor::kernel {
     // }

     void *xTrans = workspace;
-    void *wTrans = xTrans + xTransSize;
-    void *yTrans = wTrans + wTransSize;
-    void *opWorkspace = yTrans + yTransSize;
+    void *wTrans = reinterpret_cast<uint8_t *>(xTrans) + xTransSize;
+    void *yTrans = reinterpret_cast<uint8_t *>(wTrans) + wTransSize;
+    void *opWorkspace = reinterpret_cast<uint8_t *>(yTrans) + yTransSize;

     // transpose NCHW input to NHWC
     CNNL_ASSERT(cnnlTranspose_v2(handle, d->NCHW2NHWC, d->x, x,

src/04kernel/src/kernels/expand/cnnl_kernel.cc

Lines changed: 0 additions & 1 deletion

@@ -60,7 +60,6 @@ namespace refactor::kernel {
 return [d = std::move(d)](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) {
     CNNL_ASSERT(cnnlExpand(res.fetchOrStore<CnnlContext>()->handle,
                            d->inDesc, inputs[0], d->outDesc, outputs[0]));
-    // BANG_ASSERT(cnrtQueueSync(res.fetchOrStore<CnnlContext>()->queue));
 };
 }
 #endif

src/04kernel/src/kernels/gather/cnnl_kernel.cc

Lines changed: 0 additions & 1 deletion

@@ -79,7 +79,6 @@ namespace refactor::kernel {
         d->inDesc, inputs[0], reinterpret_cast<const int *>(workspace),
         d->indexDesc, reinterpret_cast<const int *>(inputs[1]),
         d->outDesc, outputs[0]));
-    BANG_ASSERT(cnrtQueueSync(res.fetchOrStore<CnnlContext>()->queue));
 };

 return {std::move(routine), workspaceSize};

src/04kernel/src/kernels/mat_mul/cnnl_kernel.cc

Lines changed: 0 additions & 1 deletion

@@ -141,7 +141,6 @@ namespace refactor::kernel {
         workspace, algoWorkspaceSize));
     }

-    BANG_ASSERT(cnrtQueueSync(res.fetchOrStore<CnnlContext>()->queue));
 };

 return {std::move(routine), algoWorkspaceSize};

src/04kernel/src/kernels/pool/cnnl_kernel.cc

Lines changed: 2 additions & 2 deletions

@@ -130,7 +130,7 @@ namespace refactor::kernel {
     auto handle = res.fetchOrStore<CnnlContext>()->handle;

     void *extraInputDev = workspace;
-    void *poolWorkSpace = workspace + extraInputSize;
+    void *poolWorkSpace = reinterpret_cast<uint8_t *>(workspace) + extraInputSize;

     void *extraInputHost = malloc(extraInputSize);
     CNNL_ASSERT(cnnlInitPoolingExtraInput(handle, d->pooling, d->x, d->y, extraInputHost));
@@ -145,7 +145,7 @@ namespace refactor::kernel {
         &b, extraInputDev, d->y, outputs[0],
         poolWorkSpace, workspaceSize));

-    BANG_ASSERT(cnrtQueueSync(res.fetchOrStore<CnnlContext>()->queue));
+    res.fetchOrStore<CnnlContext>()->queueSync();

     free(extraInputHost);
 };
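Pooling is the one kernel that keeps a synchronization, now through the context's queueSync() wrapper rather than a raw cnrtQueueSync. The likely reason, inferred from the surrounding code rather than stated in the commit: the routine frees the host-side extraInputHost staging buffer before returning, so it must not return while enqueued work may still depend on data staged from that buffer. A sketch of the constraint; the async helpers are illustrative stand-ins, not real CNRT calls:

#include <cstddef>
#include <cstdlib>

// Stand-ins for the real CNNL/CNRT calls; the names are illustrative only.
void copyToDeviceAsync(void *dst, void const *src, std::size_t n);
void enqueuePoolingAsync(void *extraInputDev);
void queueSync();

void runPooling(void *extraInputDev, std::size_t extraInputSize) {
    void *extraInputHost = std::malloc(extraInputSize);// host staging buffer
    copyToDeviceAsync(extraInputDev, extraInputHost, extraInputSize);// enqueued, not yet done
    enqueuePoolingAsync(extraInputDev);
    queueSync();              // drain the queue while extraInputHost is still alive
    std::free(extraInputHost);// safe only after the sync
}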

src/04kernel/src/kernels/simple_binary/binary_cnnl.cc

Lines changed: 0 additions & 1 deletion

@@ -180,7 +180,6 @@ namespace refactor::kernel {
         workspace, workspaceSize));
     }

-    BANG_ASSERT(cnrtQueueSync(res.fetchOrStore<CnnlContext>()->queue));
 };

 return {std::move(routine), workspaceSize};

src/04kernel/src/kernels/slice/cnnl_kernel.cc

Lines changed: 1 addition & 1 deletion

@@ -64,7 +64,7 @@ namespace refactor::kernel {
 CNNL_ASSERT(cnnlSetTensorDescriptor(d->in, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), info.inDim.size(), info.inDim.data()));
 CNNL_ASSERT(cnnlSetTensorDescriptor(d->out, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), info.outDim.size(), info.outDim.data()));
 std::vector<int> begin, end, stride;
-for (auto i = 0; i < info.dims.size(); i++) {
+for (size_t i = 0; i < info.dims.size(); i++) {
     // [begin, end), end is not inclued
     begin.push_back(info.dims[i].start);
     auto sign = info.dims[i].step > 0 ? 1 : -1;

src/04kernel/src/kernels/softmax/cnnl_kernel.cc

Lines changed: 0 additions & 1 deletion

@@ -80,7 +80,6 @@ namespace refactor::kernel {
         CNNL_COMPUTATION_ULTRAHIGH_PRECISION,
         &a, d->t, inputs[0],
         &b, d->t, outputs[0]));
-    res.fetchOrStore<CnnlContext>()->queueSync();
 };
 }

src/04kernel/src/kernels/split/cnnl_kernel.cc

Lines changed: 2 additions & 2 deletions

@@ -69,7 +69,7 @@ namespace refactor::kernel {
     }
     ~Descriptors() noexcept(false) {
         CNNL_ASSERT(cnnlDestroyTensorDescriptor(in));
-        for (auto i = 0; i < out.size(); i++) {
+        for (size_t i = 0; i < out.size(); i++) {
             CNNL_ASSERT(cnnlDestroyTensorDescriptor(out[i]));
         }
     }
@@ -81,7 +81,7 @@ namespace refactor::kernel {
 // setCnnlTensor(d->in, info.dataType, slice(info.inDim.data(), info.inDim.size()));
 CNNL_ASSERT(cnnlSetTensorDescriptor(d->in, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), info.inDim.size(), info.inDim.data()));

-for (auto i = 0; i < info.outDims.size(); i++) {
+for (size_t i = 0; i < info.outDims.size(); i++) {
     // setCnnlTensor(d->out[i], info.dataType, slice(info.outDims[i].data(), info.outDims[i].size()));
     CNNL_ASSERT(cnnlSetTensorDescriptor(d->out[i], CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), info.outDims[i].size(), info.outDims[i].data()));
 }

src/04kernel/src/kernels/where/cnnl_kernel.cc

Lines changed: 0 additions & 1 deletion

@@ -102,7 +102,6 @@ namespace refactor::kernel {
         d->y, inputs[2], workspace, workspaceSize,
         d->ans, outputs[0]));

-    res.fetchOrStore<CnnlContext>()->queueSync();
 };

 return {std::move(routine), workspaceSize};

src/04kernel/src/utilities/bang/cnrt_functions.cc

Lines changed: 2 additions & 2 deletions

@@ -4,7 +4,7 @@
 #include <cnrt.h>
 #include <cstdio>

-namespace refactor::kernel::cnnl {
+namespace refactor::kernel::bang {

     int currentDevice() {
         int device;
@@ -22,6 +22,6 @@ namespace refactor::kernel::cnnl {
                              CNRT_MEM_TRANS_DIR_DEV2HOST));
     }

-}// namespace refactor::kernel::cnnl
+}// namespace refactor::kernel::bang

 #endif

src/04kernel/src/utilities/bang/cnrt_functions.h

Lines changed: 2 additions & 2 deletions

@@ -3,14 +3,14 @@

 #include "common.h"

-namespace refactor::kernel::cnnl {
+namespace refactor::kernel::bang {

     int currentDevice();

     void sync();

     void copyOut(void *dst, const void *src, size_t size);

-}// namespace refactor::kernel::cnnl
+}// namespace refactor::kernel::bang

 #endif// KERNEL_CNRT_FUNCTIONS_H
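With the namespace renamed to match the directory, kernel::bang::sync() (declared above; its body is untouched by this commit and not shown here) becomes the helper every test below calls between running a routine and copying results off the device. The pattern, sketched with names taken from the test files:

void const *inputs[]{*mluIn};
void *outputs[]{*mluOut};
routine(res, *workspace, inputs, outputs);// enqueue only; returns immediately
kernel::bang::sync();                     // wait for the MLU queue to drain
// only now is it safe to copyToHost() the outputs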

src/04kernel/test/kernels/batch_normalization/test_cnnl.cpp

Lines changed: 2 additions & 0 deletions

@@ -2,6 +2,7 @@

 #include "../../../src/kernels/batch_normalization/cnnl_kernel.hh"
 #include "../../../src/kernels/batch_normalization/cpu_kernel.hh"
+#include "../src/utilities/bang/cnrt_functions.h"
 #include "hardware/device_manager.h"
 #include <gtest/gtest.h>

@@ -57,6 +58,7 @@ TEST(kernel, BatchNormalizationCnnl) {
     void const *inputs[]{*mluIn, *mluScale, *mluBias, *mluMean, *mluVar};
     void *outputs[]{*mluOut};
     rMlu(res, *workspace, inputs, outputs);
+    kernel::bang::sync();
 }
 // take output data
 std::vector<float> result(outTensor->elementsSize());

src/04kernel/test/kernels/cast/test_cnnl.cpp

Lines changed: 3 additions & 1 deletion

@@ -1,7 +1,8 @@
 #ifdef USE_BANG

-#include "../../../src/kernels/cast/cpu_kernel.hh"
 #include "../../../src/kernels/cast/cnnl_kernel.hh"
+#include "../../../src/kernels/cast/cpu_kernel.hh"
+#include "../src/utilities/bang/cnrt_functions.h"
 #include "hardware/device_manager.h"
 #include <gtest/gtest.h>
 #include <numeric>
@@ -34,6 +35,7 @@ TEST(kernel, CastCnnl) {
     void const *inputs[]{*xMlu};
     void *outputs[]{*yMlu};
     routine(res, nullptr, inputs, outputs);
+    kernel::bang::sync();
 }
 {
     void const *inputs[]{x_.data()};

src/04kernel/test/kernels/clip/test_cnnl.cpp

Lines changed: 3 additions & 1 deletion

@@ -1,7 +1,8 @@
 #ifdef USE_BANG

-#include "../../../src/kernels/clip/cpu_kernel.hh"
 #include "../../../src/kernels/clip/cnnl_kernel.hh"
+#include "../../../src/kernels/clip/cpu_kernel.hh"
+#include "../src/utilities/bang/cnrt_functions.h"
 #include "hardware/device_manager.h"
 #include <gtest/gtest.h>
 #include <numeric>
@@ -36,6 +37,7 @@ TEST(kernel, ClipCnnl) {
     void const *inputs[]{*mluMem, *mluMin, *mluMax};
     void *outputs[]{*mluMem};
     routine(res, nullptr, inputs, outputs);
+    kernel::bang::sync();
 }
 {
     void const *inputs[]{value.data(), &min, &max};

src/04kernel/test/kernels/concat/test_cnnl.cpp

Lines changed: 3 additions & 1 deletion

@@ -1,7 +1,8 @@
 #ifdef USE_BANG

-#include "../../../src/kernels/concat/cpu_kernel.hh"
 #include "../../../src/kernels/concat/cnnl_kernel.hh"
+#include "../../../src/kernels/concat/cpu_kernel.hh"
+#include "../src/utilities/bang/cnrt_functions.h"
 #include "hardware/device_manager.h"
 #include <gtest/gtest.h>
 #include <numeric>
@@ -65,6 +66,7 @@ TEST(kernel, ConcatCnnl) {
     void const *inputs[]{*mluIns[0], *mluIns[1], *mluIns[2], *mluIns[3]};
     void *outputs[]{*mluOut};
     routine(res, *workspace, inputs, outputs);
+    kernel::bang::sync();
 }
 {
     void const *inputs[]{cpuIns[0].data(), cpuIns[1].data(), cpuIns[2].data(), cpuIns[3].data()};

src/04kernel/test/kernels/conv/test_cnnl.cpp

Lines changed: 2 additions & 9 deletions

@@ -1,6 +1,7 @@
 #ifdef USE_BANG

 #include "../../../src/kernels/conv/cnnl_kernel.hh"
+#include "../src/utilities/bang/cnrt_functions.h"
 #include "hardware/device_manager.h"
 #include <gtest/gtest.h>
 #include <numeric>
@@ -39,15 +40,7 @@ void testConvCnnl(int rank, const int64_t *pads, const int64_t *strides, const i
     void const *inputs[]{*xMlu, *wMlu};
     void *outputs[]{*yMlu};
     routine(res, *workspace, inputs, outputs);
-
-    xMlu->copyToHost(xData.data(), xTensor->bytesSize());
-    wMlu->copyToHost(wData.data(), wTensor->bytesSize());
-    // fmt::println("{}", vec2str(xData));
-    // fmt::println("{}", vec2str(wData));
-
-    // std::vector<float> ws(workspaceSize);
-    // workspace->copyToHost(ws.data(), workspaceSize);
-    // fmt::println("{}", vec2str(ws));
+    kernel::bang::sync();

     // take output data
     std::vector<float> result(yTensor->elementsSize());

src/04kernel/test/kernels/expand/test_cnnl.cpp

Lines changed: 2 additions & 0 deletions

@@ -2,6 +2,7 @@

 #include "../../../src/kernels/expand/cnnl_kernel.hh"
 #include "../../../src/kernels/expand/cpu_kernel.hh"
+#include "../src/utilities/bang/cnrt_functions.h"
 #include "hardware/device_manager.h"
 #include <gtest/gtest.h>
 #include <numeric>
@@ -36,6 +37,7 @@ TEST(kernel, ExpandCnnl) {
     void const *inputs[]{*mluIn};
     void *outputs[]{*mluOut};
     routine(res, nullptr, inputs, outputs);
+    kernel::bang::sync();
 }
 {
     void const *inputs[]{data.data()};

src/04kernel/test/kernels/gather/test_gather_cnnl.cpp

Lines changed: 5 additions & 1 deletion

@@ -2,6 +2,7 @@

 #include "../src/kernels/gather/cnnl_kernel.hh"
 #include "../src/kernels/gather/cpu_kernel.hh"
+#include "../src/utilities/bang/cnrt_functions.h"
 #include "hardware/device_manager.h"
 #include <gtest/gtest.h>

@@ -39,6 +40,7 @@ TEST(kernel, GatherCnnl) {
     void const *inputs[]{*aMLU, *bMLU};
     void *outputs[]{*cMLU};
     cnnlRoutine(res, *workspace, inputs, outputs);
+    kernel::bang::sync();
 }
 {
     void const *inputs[]{a.data(), b.data()};
@@ -81,6 +83,7 @@ TEST(kernel, GatherCnnl) {
     void const *inputs[]{*aMLU, *bMLU};
     void *outputs[]{*cMLU};
     cnnlRoutine(res, *workspace, inputs, outputs);
+    kernel::bang::sync();
 }
 {
     void const *inputs[]{a.data(), b.data()};
@@ -110,7 +113,7 @@ TEST(kernel, GatherCnnl) {
     auto cpuRoutine = cpuKernel->lower(res).routine;
     // Init inputs and outputs
     std::vector<float> a;
-    for (auto i = 0; i < data->elementsSize(); i++) {
+    for (size_t i = 0; i < data->elementsSize(); i++) {
         a.push_back(i + 0.1f);
     }
     std::vector<int64_t> b(indices->elementsSize(), 0);
@@ -126,6 +129,7 @@ TEST(kernel, GatherCnnl) {
     void const *inputs[]{*aMLU, *bMLU};
     void *outputs[]{*cMLU};
     cnnlRoutine(res, *workspace, inputs, outputs);
+    kernel::bang::sync();
 }
 {
     void const *inputs[]{a.data(), b.data()};

src/04kernel/test/kernels/mat_mul/test_cnnl.cpp

Lines changed: 5 additions & 0 deletions

@@ -2,6 +2,7 @@

 #include "../src/kernels/mat_mul/cnnl_kernel.hh"
 #include "../src/kernels/mat_mul/cpu_kernel.hh"
+#include "../src/utilities/bang/cnrt_functions.h"
 #include "hardware/device_manager.h"
 #include <gtest/gtest.h>

@@ -48,6 +49,7 @@ TEST(kernel, MatMulCnnl_OnlyBias) {
     void const *inputs[]{*ma, *mb, *mc};
     void *outputs[]{*my};
     routine(res, *workspace, inputs, outputs);
+    kernel::bang::sync();
     // take output data
     std::vector<float> result(Y->elementsSize());
     my->copyToHost(result.data(), Y->bytesSize());
@@ -91,6 +93,7 @@ TEST(kernel, MatMulCnnl_Broadcast) {
     void const *inputs[]{*ma, *mb, *mc};
     void *outputs[]{*my};
     mluRoutine(res, *workspace, inputs, outputs);
+    kernel::bang::sync();
 }
 {
     void const *inputs[]{dataA.data(), dataB.data(), dataC.data()};
@@ -135,6 +138,7 @@ TEST(kernel, MatMulCnnl_TransABNoBias) {
     void const *inputs[]{*ma, *mb};
     void *outputs[]{*my};
     mluRoutine(res, *workspace, inputs, outputs);
+    kernel::bang::sync();
 }
 {
     void const *inputs[]{dataA.data(), dataB.data()};
@@ -189,6 +193,7 @@ TEST(kernel, MatMulCnnl_Large) {
     void const *inputs[]{*ma, *mb, *mc};
     void *outputs[]{*my};
     mluRoutine(res, *workspace, inputs, outputs);
+    kernel::bang::sync();
 }
 {
     void const *inputs[]{dataA.data(), dataB.data(), dataC.data()};
