@@ -33,13 +33,47 @@ static endFunction endReduceCallee = NULL;
33
33
34
34
void kokkosp_request_tool_settings (const uint32_t ,
35
35
Kokkos_Tools_ToolSettings* settings) {
36
- if (0 == tool_globFence) {
37
- settings->requires_global_fencing = false ;
36
+ settings->requires_global_fencing = false ;
37
+ }
38
+
39
// Function table of the Kokkos Tools programming interface. It is filled in
// by kokkosp_provide_tool_programming_interface, and its `fence` entry is the
// callback invoke_ktools_fence uses to issue a tool-induced fence.
Kokkos::Tools::Experimental::ToolProgrammingInterface tpi_funcs;
41
+
42
/// Extracts the device-number field from a packed device ID.
///
/// The low `num_instance_bits` (17) bits are shifted out and the next
/// `num_device_bits` (7) bits are returned; any higher bits are discarded.
///
/// @param devid_in  packed device ID as handed to the tool callbacks.
/// @return the 7-bit field located at bit positions 17..23 of `devid_in`.
uint32_t getDeviceID(uint32_t devid_in) {
  // Field widths are compile-time constants; keeping them unsigned avoids
  // mixing signed shift counts with unsigned operands.
  constexpr uint32_t num_device_bits   = 7;
  constexpr uint32_t num_instance_bits = 17;
  constexpr uint32_t device_mask = (uint32_t{1} << num_device_bits) - 1u;

  return (devid_in >> num_instance_bits) & device_mask;
}
48
+
49
+ void invoke_ktools_fence (uint32_t devID) {
50
+ if (tpi_funcs.fence != nullptr ) {
51
+ tpi_funcs.fence (devID);
52
+ if (tool_verbosity > 1 ) {
53
+ printf (
54
+ " KokkosP: Sampler utility sucessfully invoked "
55
+ " tool-induced fence on device %d\n " ,
56
+ getDeviceID (devID));
57
+ }
38
58
} else {
39
- settings->requires_global_fencing = true ;
59
+ printf (
60
+ " KokkosP: FATAL: Kokkos Tools Programming Interface's tool-invoked "
61
+ " Fence is NULL!\n " );
62
+ exit (-1 );
40
63
}
41
64
}
42
65
66
+ void kokkosp_provide_tool_programming_interface (
67
+ uint32_t num_funcs, Kokkos_Tools_ToolProgrammingInterface* funcsFromTPI) {
68
+ if (!num_funcs) {
69
+ if (tool_verbosity > 0 )
70
+ printf (
71
+ " KokkosP: Note: Number of functions in Tools Programming Interface "
72
+ " is 0!\n " );
73
+ }
74
+ tpi_funcs = *funcsFromTPI;
75
+ }
76
+
43
77
void kokkosp_init_library (const int loadSeq, const uint64_t interfaceVer,
44
78
const uint32_t devInfoCount, void * deviceInfo) {
45
79
const char * tool_verbose_str = getenv (" KOKKOS_TOOLS_SAMPLER_VERBOSE" );
@@ -164,6 +198,9 @@ void kokkosp_begin_parallel_for(const char* name, const uint32_t devID,
164
198
printf (" KokkosP: sample %llu calling child-begin function...\n " ,
165
199
(unsigned long long )(*kID ));
166
200
}
201
+ if (tool_globFence) {
202
+ invoke_ktools_fence (0 );
203
+ }
167
204
if (NULL != beginForCallee) {
168
205
uint64_t nestedkID = 0 ;
169
206
(*beginForCallee)(name, devID, &nestedkID);
@@ -180,6 +217,9 @@ void kokkosp_end_parallel_for(const uint64_t kID) {
180
217
printf (" KokkosP: sample %llu calling child-end function...\n " ,
181
218
(unsigned long long )(kID ));
182
219
}
220
+ if (tool_globFence) {
221
+ invoke_ktools_fence (0 );
222
+ }
183
223
(*endForCallee)(retrievedNestedkID);
184
224
infokIDSample.erase (kID );
185
225
}
@@ -198,6 +238,9 @@ void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID,
198
238
}
199
239
if (NULL != beginScanCallee) {
200
240
uint64_t nestedkID = 0 ;
241
+ if (tool_globFence) {
242
+ invoke_ktools_fence (0 );
243
+ }
201
244
(*beginScanCallee)(name, devID, &nestedkID);
202
245
infokIDSample.insert ({*kID , nestedkID});
203
246
}
@@ -212,6 +255,9 @@ void kokkosp_end_parallel_scan(const uint64_t kID) {
212
255
printf (" KokkosP: sample %llu calling child-end function...\n " ,
213
256
(unsigned long long )(kID ));
214
257
}
258
+ if (tool_globFence) {
259
+ invoke_ktools_fence (0 );
260
+ }
215
261
(*endScanCallee)(retrievedNestedkID);
216
262
infokIDSample.erase (kID );
217
263
}
@@ -228,9 +274,11 @@ void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID,
228
274
printf (" KokkosP: sample %llu calling child-begin function...\n " ,
229
275
(unsigned long long )(*kID ));
230
276
}
231
-
232
277
if (NULL != beginReduceCallee) {
233
278
uint64_t nestedkID = 0 ;
279
+ if (tool_globFence) {
280
+ invoke_ktools_fence (0 );
281
+ }
234
282
(*beginReduceCallee)(name, devID, &nestedkID);
235
283
infokIDSample.insert ({*kID , nestedkID});
236
284
}
@@ -245,6 +293,9 @@ void kokkosp_end_parallel_reduce(const uint64_t kID) {
245
293
printf (" KokkosP: sample %llu calling child-end function...\n " ,
246
294
(unsigned long long )(kID ));
247
295
}
296
+ if (tool_globFence) {
297
+ invoke_ktools_fence (0 );
298
+ }
248
299
(*endScanCallee)(retrievedNestedkID);
249
300
infokIDSample.erase (kID );
250
301
}
@@ -257,8 +308,9 @@ void kokkosp_end_parallel_reduce(const uint64_t kID) {
257
308
extern " C" {
258
309
259
310
namespace impl = KokkosTools::Sampler;
260
-
261
311
EXPOSE_TOOL_SETTINGS (impl::kokkosp_request_tool_settings)
312
+ EXPOSE_PROVIDE_TOOL_PROGRAMMING_INTERFACE(
313
+ impl::kokkosp_provide_tool_programming_interface)
262
314
EXPOSE_INIT(impl::kokkosp_init_library)
263
315
EXPOSE_FINALIZE(impl::kokkosp_finalize_library)
264
316
EXPOSE_BEGIN_PARALLEL_FOR(impl::kokkosp_begin_parallel_for)
0 commit comments