[SYCLomatic] Add a test case to migrate the thread_group structure and 3 relevant APIs. (#452)

ShengchenJ · web-flow · commit 8cf37bacfcd3 · 2023-08-29T15:08:41.000+08:00
Signed-off-by: Chen, Sheng S <sheng.s.chen@intel.com>
diff --git a/features/feature_case/cooperative_groups/cooperative_groups_thread_group.cu b/features/feature_case/cooperative_groups/cooperative_groups_thread_group.cu
@@ -0,0 +1,88 @@
+// ====------ cooperative_groups_thread_group.cu --------- *- CUDA -* ----===////
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//
+// ===----------------------------------------------------------------------===//
+
+#include <cooperative_groups.h>
+#include <cstdio>
+#include <stdlib.h>
+namespace cg = cooperative_groups;
+
+// Tree-reduces `val` over the threads of group `g` through the scratch buffer
+// `input` and reports the result on the group's rank-0 thread.
+//
+//   g      group whose threads all call this function together (the function
+//          uses thread_rank(), size() and sync() on it).
+//   input  scratch buffer indexed by g.thread_rank(); needs at least g.size()
+//          ints. Groups that run concurrently need disjoint regions, because
+//          g.sync() only orders the threads of `g`.
+//   val    this thread's contribution.
+//
+// Returns the accumulated value on rank 0 and -1 on every other rank. For a
+// power-of-two group size the rank-0 value is the sum of all contributions.
+__device__ int testThreadGroup(cg::thread_group g, int *input, int val) {
+
+  int thread_index = g.thread_rank();
+  for (int i = g.size() / 2; i > 0; i /= 2) {
+    // Publish the current partial sum, then wait until every slot is written.
+    input[thread_index] = val;
+    g.sync();
+
+    // The lower half folds in its partner from the upper half. The index
+    // thread_index + i stays below 2 * i <= g.size(), so it is in bounds.
+    if (thread_index < i) {
+      val += input[thread_index + i];
+    }
+    // Keep the next iteration's writes from overtaking this iteration's reads.
+    g.sync();
+  }
+  if (thread_index == 0) {
+    return val;
+  } else {
+    return -1;
+  }
+}
+
+// Runs testThreadGroup over the whole thread block, then over 16-wide and
+// 32-wide tiles of it, and stores the verdict in *ret: 1 when the block's
+// rank-0 thread saw 512, 32 and 64, otherwise -1 (which wraps to the largest
+// unsigned value because *ret is unsigned).
+//
+// Launch layout: the expected value 512 for the block-wide pass only holds for
+// a block of 256 threads. The static scratch buffer holds 1024 ints.
+__global__ void kernelFunc(unsigned int *ret) {
+  *ret = 0;
+  auto block = cg::this_thread_block();
+  int value = 2;
+  __shared__ int workspace[1024];
+  // Result intentionally discarded.
+  // NOTE(review): presumably kept so the thread_index() call itself is
+  // exercised by the migration tool - confirm.
+  block.thread_index();
+  auto threadBlockGroup = cg::this_thread_block();
+  int ret1, ret2, ret3;
+  ret1 = testThreadGroup(threadBlockGroup, workspace, value);
+  if (threadBlockGroup.thread_rank() == 0) {
+    printf("value1 is %d\n", ret1);
+  }
+
+  cg::thread_block_tile<16> tilePartition16 =
+      cg::tiled_partition<16>(threadBlockGroup);
+  // A tile's sync() only orders the threads of that tile, so tiles must not
+  // share scratch slots. Block rank minus tile rank is the block rank of the
+  // tile's first thread, which gives every tile its own slice of workspace.
+  int tileBase16 = static_cast<int>(threadBlockGroup.thread_rank()) -
+                   static_cast<int>(tilePartition16.thread_rank());
+  ret2 = testThreadGroup(tilePartition16, workspace + tileBase16, value);
+  if (threadBlockGroup.thread_rank() == 0) {
+    printf("value2 is %d\n", ret2);
+  }
+
+  cg::thread_block_tile<32> tilePartition32 =
+      cg::tiled_partition<32>(threadBlockGroup);
+  // Same per-tile slicing as above, now with 32-thread tiles.
+  int tileBase32 = static_cast<int>(threadBlockGroup.thread_rank()) -
+                   static_cast<int>(tilePartition32.thread_rank());
+  ret3 = testThreadGroup(tilePartition32, workspace + tileBase32, value);
+  if (threadBlockGroup.thread_rank() == 0) {
+    printf("value3 is %d\n", ret3);
+  }
+  // Only the block's rank-0 thread holds meaningful results for all three
+  // passes, so it alone writes the verdict.
+  if (threadBlockGroup.thread_rank() == 0) {
+    if (ret1 == 512 && ret2 == 32 && ret3 == 64) {
+      *ret = 1;
+    } else {
+      *ret = -1;
+    }
+  }
+}
+
+// Host driver: launches kernelFunc on one block of 256 threads, copies the
+// verdict word back and maps it to the exit code (0 on success, -1 otherwise).
+int main() {
+  bool checker4 = false;
+  unsigned int *ret_result = nullptr;
+  // Zero-initialised so that a failed copy can never be read as a pass.
+  unsigned int host[1] = {0};
+  if (cudaMalloc(&ret_result, sizeof(unsigned int)) != cudaSuccess) {
+    printf("thread_group migration is run failed\n ");
+    return -1;
+  }
+  kernelFunc<<<1, 256>>>(ret_result);
+  // Blocking copy on the default stream: it waits for the kernel, and its
+  // status is the first point where a failure becomes visible to this code.
+  cudaError_t copyStatus =
+      cudaMemcpy(host, ret_result, sizeof(unsigned int), cudaMemcpyDeviceToHost);
+  cudaFree(ret_result);
+  // host[0] is unsigned, so print it with %u.
+  printf("host value is %u \n ", host[0]);
+  if (copyStatus == cudaSuccess && host[0] == 1) {
+    printf(" thread_group migration is run success \n");
+    checker4 = true;
+  } else {
+    printf("thread_group migration is run failed\n ");
+  }
+
+  if (checker4)
+    return 0;
+  return -1;
+}
diff --git a/features/features.xml b/features/features.xml
@@ -268,6 +268,7 @@
     <test testName="cublas_v1_runable" configFile="config/TEMPLATE_cuBlas_11.xml" splitGroup="double"/>
     <test testName="complex" configFile="config/TEMPLATE_complex.xml" splitGroup="double"/>
     <test testName="cooperative_groups" configFile="config/TEMPLATE_cooperative_groups.xml" />
+    <test testName="cooperative_groups_thread_group" configFile="config/TEMPLATE_cooperative_groups.xml" />
     <test testName="ccl-test" configFile="config/TEMPLATE_ccl_api.xml" />
     <test testName="ccl-test2" configFile="config/TEMPLATE_ccl_api.xml" />
     <test testName="cooperative_groups_reduce" configFile="config/TEMPLATE_cooperative_groups_reduce.xml" />
diff --git a/features/test_feature.py b/features/test_feature.py
@@ -50,7 +50,7 @@
               'thrust_tabulate', 'thrust_for_each_n', 'device_info', 'defaultStream', 'cudnn-rnn', 'feature_profiling',
               'thrust_raw_reference_cast', 'thrust_partition_copy', 'thrust_stable_partition_copy',
               'thrust_stable_partition', 'thrust_remove', 'cub_device_segmented_sort_pairs', 'thrust_find_if_not',
-              'thrust_find_if', 'thrust_mismatch', 'thrust_replace_copy', 'thrust_reverse', 'cooperative_groups_reduce',
+              'thrust_find_if', 'thrust_mismatch', 'thrust_replace_copy', 'thrust_reverse', 'cooperative_groups_reduce', 'cooperative_groups_thread_group',
               'remove_unnecessary_wait', 'thrust_equal_range', 'thrust_transform_inclusive_scan', 'thrust_uninitialized_copy_n', 'thrust_uninitialized_copy',
               'thrust_random_type', 'thrust_scatter_if', 'thrust_all_of', 'thrust_none_of', 'thrust_is_partitioned',
               'thrust_is_sorted_until', 'thrust_set_intersection', 'thrust_set_union_by_key', 'thrust_set_union',
@@ -76,7 +76,7 @@ def migrate_test():
             src.append(os.path.abspath(os.path.join(dirpath, filename)))
 
     nd_range_bar_exper = ['grid_sync']
-    logical_group_exper = ['cooperative_groups']
+    logical_group_exper = ['cooperative_groups', 'cooperative_groups_thread_group']
     experimental_bfloat16_tests = ['math-experimental-bf16', 'math-experimental-bf162']
 
     math_extension_tests = ['math-ext-double', 'math-ext-float', 'math-ext-half', 'math-ext-half-after11', 'math-ext-half2', 'math-ext-half2-after11', 'math-ext-simd']