Skip to content

Commit 4f24b81

Browse files
committed
Output only maximum memory per device
1 parent 0bf0979 commit 4f24b81

File tree

2 files changed

+28
-6
lines changed

2 files changed

+28
-6
lines changed

clients/common/utility.cpp

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -352,10 +352,14 @@ void rocblas_local_handle::rocblas_stream_end_capture()
352352
#endif
353353
}
354354

355-
void rocblas_parallel_initialize_thread(int id)
355+
void rocblas_parallel_initialize_thread(int id, size_t& memory_used)
356356
{
357+
size_t before_init, after_init, total_memory;
357358
CHECK_HIP_ERROR(hipSetDevice(id));
359+
CHECK_HIP_ERROR(hipMemGetInfo(&before_init, &total_memory));
358360
rocblas_initialize();
361+
CHECK_HIP_ERROR(hipMemGetInfo(&after_init, &total_memory));
362+
memory_used = before_init - after_init;
359363
}
360364

361365
/*!
@@ -369,17 +373,26 @@ void rocblas_parallel_initialize_thread(int id)
369373
*/
370374
void rocblas_parallel_initialize(int parallel_devices)
371375
{
372-
auto thread = std::make_unique<std::thread[]>(parallel_devices);
376+
auto thread = std::make_unique<std::thread[]>(parallel_devices);
377+
std::vector<size_t> init_memory(parallel_devices);
373378

374379
// Store the start timepoint of rocblas initialize
375380
auto start_time = std::chrono::steady_clock::now();
376381

377382
if(parallel_devices == 1)
383+
{
384+
size_t before_init, after_init, total_memory;
385+
CHECK_HIP_ERROR(hipMemGetInfo(&before_init, &total_memory));
378386
rocblas_initialize();
387+
CHECK_HIP_ERROR(hipMemGetInfo(&after_init, &total_memory));
388+
init_memory[0] = before_init - after_init;
389+
}
379390
else
380391
{
392+
381393
for(int id = 0; id < parallel_devices; ++id)
382-
thread[id] = std::thread(rocblas_parallel_initialize_thread, id);
394+
thread[id]
395+
= std::thread(rocblas_parallel_initialize_thread, id, std::ref(init_memory[id]));
383396
for(int id = 0; id < parallel_devices; ++id)
384397
thread[id].join();
385398
}
@@ -410,4 +423,15 @@ void rocblas_parallel_initialize(int parallel_devices)
410423
rocblas_cerr << "\nrocBLAS info: average time to initialize each device exceeded the max "
411424
"duration of "
412425
<< max_duration << " milliseconds. Check CPU's load metrics." << std::endl;
426+
427+
constexpr static float max_memory = 1.0;
428+
auto max_library_size
429+
= *std::max_element(std::begin(init_memory), std::end(init_memory)) * 1.0e-9;
430+
431+
rocblas_cout << "\nrocBLAS info: maximum library size per device is " << max_library_size
432+
<< " GB." << std::endl;
433+
if(max_library_size > max_memory)
434+
rocblas_cerr << "\nrocBLAS info: max kernel library size " << max_library_size
435+
<< " GB exceeds the max recommended memory " << max_memory
436+
<< " GB. Check library logic file sizes." << std::endl;
413437
}

library/src/tensile_host.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -701,14 +701,12 @@ namespace
701701
int g = glob(dir.c_str(), GLOB_NOSORT, nullptr, &glob_result);
702702
if(!g)
703703
{
704-
const char* experimental = getenv("ROCBLAS_TENSILE_EXPERIMENTAL_SELECTION");
705704
for(size_t i = 0; i < glob_result.gl_pathc; ++i)
706705
{
707706
std::string cofile = glob_result.gl_pathv[i];
708707
if(!skip_xnack.empty() && cofile.find(skip_xnack) != std::string::npos)
709708
continue;
710-
if((experimental == nullptr || experimental[0] == '\0')
711-
&& cofile.find("Experimental") != std::string::npos)
709+
if(cofile.find("Experimental") != std::string::npos)
712710
continue;
713711
adapter.loadCodeObjectFile(cofile);
714712
}

0 commit comments

Comments
 (0)