format

hipudding · Jul 8, 2024 · e9a550c · e9a550c
1 parent 34f957f
commit e9a550c
Show file tree

Hide file tree

Showing 18 changed files with 309 additions and 315 deletions.
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
@@ -766,7 +766,7 @@ if (LLAMA_CANN)
 
         # Set header and libs
         if(LLAMA_CANN)
-            set(CANN_INCLUDE_DIRS 
+            set(CANN_INCLUDE_DIRS
                 ${CANN_INSTALL_DIR}/include
                 ${CANN_INSTALL_DIR}/include/aclnn
                 ${CANN_INSTALL_DIR}/acllib/include

diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp
@@ -864,7 +864,7 @@ GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute(
             fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__,
                     node->name, ggml_op_name(node->op));
         }
-        // if not synchronize, aclrtSynchronizeStream in 
+        // if not synchronize, aclrtSynchronizeStream in
         // ggml_backend_cann_synchronize() will raise error.
         ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
         GGML_ASSERT(ok);

diff --git a/ggml/src/ggml-cann/acl_ops.h b/ggml/src/ggml-cann/acl_ops.h
@@ -69,4 +69,4 @@ struct OpCaller {
     OpCaller& run(aclrtStream stream = nullptr);
 };
 
-#endif  // CANN_ACL_OPS
+#endif  // CANN_ACL_OPS
diff --git a/ggml/src/ggml-cann/acl_tensor.cpp b/ggml/src/ggml-cann/acl_tensor.cpp
@@ -59,7 +59,7 @@ aclTensor* create_acl_tensor(const ggml_tensor* tensor, int64_t* bcast_ne,
 
     aclTensor* acl_tensor = aclCreateTensor(
         acl_ne, dims, type_mapping(tensor->type), acl_stride,
-        offset / ggml_element_size(tensor), format, &acl_storage_ne, 1, 
+        offset / ggml_element_size(tensor), format, &acl_storage_ne, 1,
         tensor->data);
 
     return acl_tensor;
@@ -79,8 +79,8 @@ aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype,
     std::reverse(tmp_ne, tmp_ne + dims);
     std::reverse(tmp_stride, tmp_stride + dims);
 
-    aclTensor* acl_tensor = aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, 
-                                            offset / type_size, format, tmp_ne, 
+    aclTensor* acl_tensor = aclCreateTensor(tmp_ne, dims, dtype, tmp_stride,
+                                            offset / type_size, format, tmp_ne,
                                             dims, data_ptr);
 
     return acl_tensor;

diff --git a/ggml/src/ggml-cann/acl_tensor.h b/ggml/src/ggml-cann/acl_tensor.h
@@ -35,4 +35,4 @@ int64_t get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1,
 
 #define BCAST_PARAM(src) bcast_ne_##src, bcast_nb_##src, bcast_dims
 
-#endif  // CANN_ACL_TENSOR_H
+#endif  // CANN_ACL_TENSOR_H
diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
diff --git a/ggml/src/ggml-cann/aclnn_ops.h b/ggml/src/ggml-cann/aclnn_ops.h
@@ -74,7 +74,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 
 void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 
-void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx, 
+void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
                                   ggml_tensor* dst);
 
 template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
@@ -179,4 +179,4 @@ void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ACL_CHECK(aclDestroyTensor(acl_dst));
 }
 
-#endif  // CANN_ACLNN_OPS
+#endif  // CANN_ACLNN_OPS
diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h
@@ -142,4 +142,4 @@ struct ggml_backend_cann_context {
     aclrtStream stream() { return stream(0); }
 };
 
-#endif  // CANN_COMMON_H
+#endif  // CANN_COMMON_H
diff --git a/ggml/src/ggml-cann/kernels/ascendc_kernels.h b/ggml/src/ggml-cann/kernels/ascendc_kernels.h
@@ -16,4 +16,4 @@
 #include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h"
 #include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h"
 
-#endif  // ASCENDC_KERNELS_H
+#endif  // ASCENDC_KERNELS_H
diff --git a/ggml/src/ggml-cann/kernels/dup.cpp b/ggml/src/ggml-cann/kernels/dup.cpp
@@ -10,9 +10,9 @@ template <typename SRC_T, typename DST_T>
 class DupByRows {
    public:
     __aicore__ inline DupByRows() {}
-    __aicore__ inline void init(GM_ADDR src, GM_ADDR dst, int64_t *input_ne_ub, 
+    __aicore__ inline void init(GM_ADDR src, GM_ADDR dst, int64_t *input_ne_ub,
                                 size_t *input_nb_ub) {
-        /* Dup by rows when src is contigous on first dimension and dst is 
+        /* Dup by rows when src is contigous on first dimension and dst is
         contiguous, each kernel process one row.
         */
 
@@ -23,49 +23,49 @@ class DupByRows {
         // param
         num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
         num_elem = input_ne_ub[0];
-        
-        // index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3) 
+
+        // index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3)
         idx_ne3 = op_block_idx / (input_ne_ub[1] * input_ne_ub[2]);
-        idx_ne2 = (op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2])) 
+        idx_ne2 = (op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]))
                   / (input_ne_ub[1]);
-        idx_ne1 = op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]) 
+        idx_ne1 = op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2])
                 - idx_ne2 * input_ne_ub[1];
 
         // src may not contiguous in dim [1,2,3], so stride decited by ne&nb
         src_stride = input_nb_ub[3] * idx_ne3 + input_nb_ub[2] * idx_ne2
                      + input_nb_ub[1] * idx_ne1;
-        
+
         // dst is contiguous
-        dst_stride = (idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]) + 
-                      idx_ne2 * input_ne_ub[1] + 
+        dst_stride = (idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]) +
+                      idx_ne2 * input_ne_ub[1] +
                       idx_ne1) * (input_ne_ub[0] * sizeof(DST_T));
-        
-        src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src + 
+
+        src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src +
                                                                 src_stride));
-        dst_gm.SetGlobalBuffer(reinterpret_cast<__gm__ DST_T *>(dst + 
+        dst_gm.SetGlobalBuffer(reinterpret_cast<__gm__ DST_T *>(dst +
                                                                 dst_stride));
 
-        pipe.InitBuffer(src_queue, BUFFER_NUM, (sizeof(SRC_T) * num_elem + 
+        pipe.InitBuffer(src_queue, BUFFER_NUM, (sizeof(SRC_T) * num_elem +
                                                 32 - 1) / 32 * 32);
-        pipe.InitBuffer(dst_queue, BUFFER_NUM, (sizeof(DST_T) * num_elem + 
+        pipe.InitBuffer(dst_queue, BUFFER_NUM, (sizeof(DST_T) * num_elem +
                                                 32 - 1) / 32 * 32);
     }
 
     __aicore__ inline void copy_in() {
         LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
-        
+
         DataCopyExtParams dataCopyParams;
         dataCopyParams.blockCount = 1;
         dataCopyParams.blockLen = num_elem * sizeof(SRC_T);
         DataCopyPadExtParams<SRC_T> padParams;
         DataCopyPad(src_local, src_gm, dataCopyParams, padParams);
-        
+
         src_queue.EnQue(src_local);
     }
 
     __aicore__ inline void copy_out() {
         LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
-        
+
         DataCopyExtParams dataCopyParams;
         dataCopyParams.blockCount = 1;
         dataCopyParams.blockLen = num_elem * sizeof(DST_T);
@@ -77,13 +77,13 @@ class DupByRows {
     __aicore__ inline void dup() {
         // main process, copy one row data from src to dst.
         copy_in();
-        
+
         LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
         LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
-        
+
         int32_t BLOCK_NUM = 32 / sizeof(DST_T);
-        DataCopy(dst_local, src_local, (num_elem + BLOCK_NUM - 1) 
-                                        / BLOCK_NUM * BLOCK_NUM);    
+        DataCopy(dst_local, src_local, (num_elem + BLOCK_NUM - 1)
+                                        / BLOCK_NUM * BLOCK_NUM);
         dst_queue.EnQue<DST_T>(dst_local);
 
         src_queue.FreeTensor(src_local);
@@ -94,19 +94,19 @@ class DupByRows {
         // main process, copy one row data from src to dst.
         // cast dtype from src to dst.
         copy_in();
-        
+
         LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
         LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
-        
-        Cast(dst_local, src_local, RoundMode::CAST_NONE, num_elem); 
+
+        Cast(dst_local, src_local, RoundMode::CAST_NONE, num_elem);
         dst_queue.EnQue<DST_T>(dst_local);
 
         src_queue.FreeTensor(src_local);
         copy_out();
     }
 
    private:
-  
+
     TPipe pipe;
     GlobalTensor<SRC_T> src_gm;
     GlobalTensor<DST_T> dst_gm;
@@ -118,7 +118,7 @@ class DupByRows {
     int64_t idx_ne1;
     int64_t src_stride;
     int64_t dst_stride;
-    
+
     TQue<QuePosition::VECIN, BUFFER_NUM> src_queue;
     TQue<QuePosition::VECOUT, BUFFER_NUM> dst_queue;
 };
@@ -152,7 +152,7 @@ extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16(
 
     DupByRows<half, half> op;
     op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
-    op.dup(); 
+    op.dup();
 }
 
 extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32(
@@ -174,7 +174,7 @@ extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32(
 
     DupByRows<float_t, float_t> op;
     op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
-    op.dup(); 
+    op.dup();
 }
 
 extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16(
@@ -197,7 +197,7 @@ extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16(
 
     DupByRows<float_t, half> op;
     op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
-    op.dup_with_cast(); 
+    op.dup_with_cast();
 }
 
 extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32(
@@ -221,5 +221,5 @@ extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32(
 
     DupByRows<half, float_t> op;
     op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
-    op.dup_with_cast(); 
-}
+    op.dup_with_cast();
+}
diff --git a/ggml/src/ggml-cann/kernels/get_row_f16.cpp b/ggml/src/ggml-cann/kernels/get_row_f16.cpp
@@ -60,35 +60,35 @@ class GET_ROW_F16 {
 
     __aicore__ inline void copy_in(uint32_t offset, size_t len) {
         LocalTensor<half> input_local = input_queue.AllocTensor<half>();
-        size_t tail = len % 32; 
+        size_t tail = len % 32;
         len = len & ~31;
         DataCopy(input_local, input_gm[offset], len);
         if(tail != 0) {
             DataCopyExtParams dataCopyParams;
             dataCopyParams.blockCount = 1;
             dataCopyParams.blockLen = tail * sizeof(half);
             DataCopyPadExtParams<half> padParams;
-            DataCopyPad(input_local[len], input_gm[offset + len], 
+            DataCopyPad(input_local[len], input_gm[offset + len],
                         dataCopyParams, padParams);
         }
         input_queue.EnQue(input_local);
     }
 
     __aicore__ inline void copy_out(uint32_t offset, size_t len) {
         LocalTensor<float> output_local = output_queue.DeQue<float>();
-        size_t tail = len % 32; 
+        size_t tail = len % 32;
         len = len & ~31;
         DataCopy(output_gm[offset], output_local, len);
         if(tail != 0) {
             DataCopyExtParams dataCopyParams;
             dataCopyParams.blockCount = 1;
             dataCopyParams.blockLen = tail * sizeof(float);
-            DataCopyPad(output_gm[offset + len], output_local[len], 
+            DataCopyPad(output_gm[offset + len], output_local[len],
                         dataCopyParams);
         }
         output_queue.FreeTensor(output_local);
     }
-    
+
     __aicore__ inline void calculate_row(int64_t idx) {
         const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
         const int64_t indices_ne1_idx =
@@ -114,8 +114,8 @@ class GET_ROW_F16 {
         copy_in(input_offset, input_ne[0]);
         LocalTensor<half> input_local = input_queue.DeQue<half>();
         LocalTensor<float> output_local = output_queue.AllocTensor<float>();
-        
-        Cast(output_local, input_local, RoundMode::CAST_NONE, 
+
+        Cast(output_local, input_local, RoundMode::CAST_NONE,
              local_buffer_elems);
         output_queue.EnQue(output_local);
         copy_out(output_offset, input_ne[0]);
@@ -183,4 +183,4 @@ extern "C" __global__ __aicore__ void ascendc_get_row_f16(
     op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
             indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
     op.calculate();
-}
+}
diff --git a/ggml/src/ggml-cann/kernels/get_row_f32.cpp b/ggml/src/ggml-cann/kernels/get_row_f32.cpp
@@ -55,35 +55,35 @@ class GET_ROW_F32 {
 
     __aicore__ inline void copy_in(uint32_t offset, size_t len) {
         LocalTensor<float> input_local = input_queue.AllocTensor<float>();
-        size_t tail = len % 32; 
+        size_t tail = len % 32;
         len = len & ~31;
         DataCopy(input_local, input_gm[offset], len);
         if(tail != 0) {
             DataCopyExtParams dataCopyParams;
             dataCopyParams.blockCount = 1;
             dataCopyParams.blockLen = tail * sizeof(float);
             DataCopyPadExtParams<float> padParams;
-            DataCopyPad(input_local[len], input_gm[offset + len], 
+            DataCopyPad(input_local[len], input_gm[offset + len],
                         dataCopyParams, padParams);
         }
         input_queue.EnQue(input_local);
     }
 
     __aicore__ inline void copy_out(uint32_t offset, size_t len) {
         LocalTensor<float> output_local = output_queue.DeQue<float>();
-        size_t tail = len % 32; 
+        size_t tail = len % 32;
         len = len & ~31;
         DataCopy(output_gm[offset], output_local, len);
         if(tail != 0) {
             DataCopyExtParams dataCopyParams;
             dataCopyParams.blockCount = 1;
             dataCopyParams.blockLen = tail * sizeof(float);
-            DataCopyPad(output_gm[offset + len], output_local[len], 
+            DataCopyPad(output_gm[offset + len], output_local[len],
                         dataCopyParams);
         }
         output_queue.FreeTensor(output_local);
     }
-    
+
     __aicore__ inline void calculate_row(int64_t idx) {
         const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
         const int64_t indices_ne1_idx =
@@ -109,7 +109,7 @@ class GET_ROW_F32 {
         copy_in(input_offset, input_ne[0]);
         LocalTensor<float> input_local = input_queue.DeQue<float>();
         LocalTensor<float> output_local = output_queue.AllocTensor<float>();
-        
+
         DataCopy(output_local, input_local, local_buffer_elems);
         output_queue.EnQue(output_local);
         copy_out(output_offset, input_ne[0]);
@@ -177,4 +177,4 @@ extern "C" __global__ __aicore__ void ascendc_get_row_f32(
     op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
             indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
     op.calculate();
-}
+}
diff --git a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
@@ -190,4 +190,4 @@ extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
     op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
             indices_nb_ub, output_ne_ub, output_nb_ub);
     op.calculate();
-}
+}
diff --git a/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp b/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp
@@ -188,4 +188,4 @@ extern "C" __global__ __aicore__ void ascendc_get_row_q8_0(
     op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
             indices_nb_ub, output_ne_ub, output_nb_ub);
     op.calculate();
-} 
+}
Original file line number	Diff line number	Diff line change
Expand Up		@@ -35,4 +35,4 @@ int64_t get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1,

		#define BCAST_PARAM(src) bcast_ne_##src, bcast_nb_##src, bcast_dims

		#endif // CANN_ACL_TENSOR_H
		#endif // CANN_ACL_TENSOR_H