diff --git a/CMakeLists.txt b/CMakeLists.txt index 695d4fc..3f3b996 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,6 +40,11 @@ if(HPXMP_WITH_OMPT) src/ompt-internal.h) endif() +set(HPXMP_WITH_OMP_50_ENABLED OFF CACHE BOOL "enable openmp 5.0 features in hpxmp") +if(HPXMP_WITH_OMP_50_ENABLED) + add_definitions(-DHPXMP_HAVE_OMP_50_ENABLED) +endif() + #decide whether debug or release build set(RELEASE_BUILD FALSE) string(TOLOWER "${CMAKE_BUILD_TYPE}" libhpxmp_build_type_lowercase) @@ -74,7 +79,7 @@ if(NOT HPX_WITH_HPXMP) target_link_libraries(hpxmp PRIVATE -L${dir}) endforeach() - target_link_libraries(hpxmp PRIVATE ${HPX_LIBRARIES}) + target_link_libraries(hpxmp PRIVATE ${HPX_LIBRARIES} hpx_preprocessor) else() diff --git a/README.md b/README.md index 196f44d..d80a942 100644 --- a/README.md +++ b/README.md @@ -55,4 +55,7 @@ Please refer to [Usage Examples](doc/usecases.md) for simple use examples. Please refer to [dones and todos](doc/done-todo.md) for information about what has been implemented in hpxMP and what needs to be done. +# Credit +Please cite as +Jeremy Kemp, Tianyi Zhang, Bryce Adelstein, Shahrzad Shirzad, Hartmut Kaiser, Parsa Amini, & Bibek Wagle. (2018, November 1). hpxMP (Version 0.1.0). Zenodo. http://doi.org/10.5281/zenodo.2662481 diff --git a/doc/done-todo.md b/doc/done-todo.md index a4b3aab..03919cd 100644 --- a/doc/done-todo.md +++ b/doc/done-todo.md @@ -63,10 +63,14 @@ The following pragmas and clauses are tested in hpxMP. *#pragma omp task untied +*#pragma omp taskgroup task_reduction(+:sum) + +*#pragma omp task in_reduction(+:sum) + *#pragma omp taskwait **Note**: Directives not implemented in hpxMP: -cancellation, taskgroup, threadprivate, copyprivate, and copyin +cancellation, threadprivate, copyprivate, and copyin # Compiler Support hpxMP works wich clang/gcc, @@ -84,5 +88,6 @@ A list of callbacks to be supported can be found [here](ompt-priorities.txt). **Note**: Currently Mandatory Events are under control, but they needs to be reviewed and improved. Currently, OMPT does not work with gcc. -Other events are not implemented yet. +Other events are not implemented yet. 
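Note for readers of the done-todo.md hunk above: the newly listed OpenMP 5.0 task-reduction pragmas are only active when hpxMP is configured with `-DHPXMP_WITH_OMP_50_ENABLED=ON`, which defines `HPXMP_HAVE_OMP_50_ENABLED` per the CMakeLists.txt hunk. A minimal usage sketch, essentially the same program as the `tests/openmp/unit/task_in_reduction.cpp` test added later in this patch:

```cpp
// Minimal taskgroup/task reduction example (mirrors the new
// task_in_reduction.cpp unit test added by this patch).
#include <cstdio>
#include <vector>

int main()
{
    int sum = 0;
    std::vector<int> a(100, 1);
#pragma omp parallel
#pragma omp single
    {
#pragma omp taskgroup task_reduction(+ : sum)
        {
#pragma omp task in_reduction(+ : sum)
            for (int i = 0; i < 50; i++) sum += a[i];
#pragma omp task in_reduction(+ : sum)
            for (int i = 50; i < 100; i++) sum += a[i];
        }
    }
    std::printf("sum = %d\n", sum); // expected: 100
    return (sum == 100) ? 0 : 1;
}
```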
+ diff --git a/src/gcc_hpxMP.cpp b/src/gcc_hpxMP.cpp index da84b2a..1f342d2 100644 --- a/src/gcc_hpxMP.cpp +++ b/src/gcc_hpxMP.cpp @@ -698,7 +698,7 @@ xexpand(KMP_API_NAME_GOMP_PARALLEL)(void (*task)(void *), void *data, unsigned n #endif start_backend(); //__kmpc_push_num_threads - omp_task_data * my_data = hpx_backend->get_task_data(); + auto my_data = hpx_backend->get_task_data(); my_data->set_threads_requested(num_threads); __kmp_GOMP_fork_call(task,(microtask_t )__kmp_GOMP_microtask_wrapper, 2, task, data); @@ -714,7 +714,7 @@ xexpand(KMP_API_NAME_GOMP_PARALLEL_SECTIONS)(void (*task) (void *), void *data, //from gomp parallel start_backend(); //from __kmpc_push_num_threads - omp_task_data * my_data = hpx_backend->get_task_data(); + auto my_data = hpx_backend->get_task_data(); my_data->set_threads_requested(num_threads); __kmp_GOMP_fork_call(task, @@ -730,7 +730,7 @@ xexpand(KMP_API_NAME_GOMP_PARALLEL_SECTIONS)(void (*task) (void *), void *data, long ub, long str, long chunk_sz, unsigned flags) \ { \ start_backend(); \ - omp_task_data *my_data = hpx_backend->get_task_data(); \ + auto my_data = hpx_backend->get_task_data(); \ my_data->set_threads_requested(num_threads); \ __kmp_GOMP_fork_call(task, \ (microtask_t) __kmp_GOMP_parallel_microtask_wrapper, 9, task, \ diff --git a/src/hpx_runtime.cpp b/src/hpx_runtime.cpp index 94658de..807f559 100644 --- a/src/hpx_runtime.cpp +++ b/src/hpx_runtime.cpp @@ -30,15 +30,15 @@ extern boost::shared_ptr hpx_backend; void wait_for_startup(std::mutex& startup_mtx, std::condition_variable& cond, bool& running) { -#if defined DEBUG - cout << "HPX OpenMP runtime has started" << endl; -#endif { // Let the main thread know that we're done. //std::scoped_lock lk(startup_mtx); std::lock_guard lk(startup_mtx); running = true; cond.notify_all(); } +#if defined DEBUG + std::cerr << "HPX OpenMP runtime has started" << endl; +#endif } void fini_runtime() @@ -46,8 +46,10 @@ void fini_runtime() #if defined DEBUG cout << "Stopping HPX OpenMP runtime" << endl; #endif + std::this_thread::sleep_for(std::chrono::milliseconds(50)); //this should only be done if this runtime started hpx - hpx::get_runtime().stop(); + hpx::threads::run_as_hpx_thread([]() { hpx::finalize(); }); + hpx::stop(); #if defined DEBUG cout << "Stopped" << endl; #endif @@ -162,7 +164,7 @@ bool hpx_runtime::set_thread_data_check() { return false; } -omp_task_data* hpx_runtime::get_task_data() +intrusive_ptr hpx_runtime::get_task_data() { omp_task_data *data; if(hpx::threads::get_self_ptr()) { @@ -171,10 +173,12 @@ omp_task_data* hpx_runtime::get_task_data() std::cerr<<"trying to get null hpx thread data\n"; data = initial_thread.get(); } - } else { + } + else { data = initial_thread.get(); } - return data; + intrusive_ptr data_ptr(data); + return data_ptr; } double hpx_runtime::get_time() { @@ -209,82 +213,63 @@ void hpx_runtime::barrier_wait(){ //hpx::this_thread::sleep_for(std::chrono::milliseconds(100)); hpx::this_thread::yield(); } -#else - int count = 1; - int max_count = 100000; - while(team->num_tasks > 0) { - if(count == 1) { - hpx::this_thread::yield(); - } else { - int sleep_time = count; - if(count > max_count) - sleep_time = max_count; - hpx::this_thread::sleep_for(std::chrono::microseconds(sleep_time)); - } - count = count * 2; - //hpx::this_thread::yield(); - } #endif if(team->num_threads > 1) { team->globalBarrier.wait(); } + //wait for all child tasks to be done + team->teamTaskLatch.wait(); } //TODO: Does the spec say that outstanding tasks need to end before this begins? 
bool hpx_runtime::start_taskgroup() { - auto *task = get_task_data(); + auto task = get_task_data(); +#if HPXMP_HAVE_OMP_50_ENABLED + intrusive_ptr tg_new(new kmp_taskgroup_t()); + tg_new->reduce_num_data = 0; + task->td_taskgroup = tg_new; +#endif task->in_taskgroup = true; #ifdef OMP_COMPLIANT //FIXME: why is this local_thread_num? shouldn't it be team->num_threads //task->tg_exec.reset(new local_priority_queue_executor(task->local_thread_num)); task->tg_exec.reset(new local_priority_queue_executor(task->team->num_threads)); #else - task->tg_num_tasks.reset(new atomic{0}); + task->taskgroupLatch.reset(new hpxmp_latch(1)); #endif return true; } void hpx_runtime::end_taskgroup() { - auto *task = get_task_data(); + auto task = get_task_data(); #ifdef OMP_COMPLIANT task->tg_exec.reset(); #else - while( *(task->tg_num_tasks) > 0 ) { - //hpx::this_thread::sleep_for(std::chrono::milliseconds(100)); - hpx::this_thread::yield(); - } - task->tg_num_tasks.reset(); + task->taskgroupLatch->count_down_and_wait(); #endif task->in_taskgroup = false; + +#if HPXMP_HAVE_OMP_50_ENABLED + auto taskgroup = task->td_taskgroup; + if (taskgroup->reduce_data != NULL) // need to reduce? + __kmp_task_reduction_fini(nullptr,taskgroup); +#endif } void hpx_runtime::task_wait() { - auto *task = get_task_data(); - //TODO: Is this just an optimization? IT seems unnecessary. - //if(task->df_map.size() > 0) { - // task->last_df_task.wait(); - //} - //int count = 0; - //int max_count = 10; - while( *(task->num_child_tasks) > 0 ) { - //int sleep_time = 10*count; - //if(count > max_count) - // sleep_time = 10*max_count; - //hpx::this_thread::sleep_for(std::chrono::milliseconds(sleep_time)); - hpx::this_thread::yield(); - } + auto task = get_task_data(); + intrusive_ptr task_ptr(task); + task_ptr->taskLatch.wait(); } -void task_setup( int gtid, kmp_task_t *task, omp_icv icv, - shared_ptr> parent_task_counter, - parallel_region *team) +void task_setup( int gtid, intrusive_ptr kmp_task_ptr, intrusive_ptr parent_task_ptr) { - auto task_func = task->routine; - omp_task_data task_data(gtid, team, icv); - set_thread_data( get_self_id(), reinterpret_cast(&task_data)); + auto task_func = kmp_task_ptr->routine; + intrusive_ptr current_task_ptr(new omp_task_data(gtid, parent_task_ptr->team, parent_task_ptr->icv)); + set_thread_data( get_self_id(), reinterpret_cast(current_task_ptr.get())); #if HPXMP_HAVE_OMPT ompt_data_t *my_task_data = &hpx_backend->get_task_data()->task_data; if (ompt_enabled.ompt_callback_task_create) @@ -302,18 +287,16 @@ void task_setup( int gtid, kmp_task_t *task, omp_icv icv, prior_task_data, status, my_task_data); } #endif - -if(! task->gcc) - task_func(gtid, task); + // actually running the taskfunctions +if(! 
kmp_task_ptr->gcc) + task_func(gtid, kmp_task_ptr.get()); else - ((void (*)(void *))(*(task->routine)))(task->shareds); - - *(parent_task_counter) -= 1; + ((void (*)(void *))(*(kmp_task_ptr->routine)))(kmp_task_ptr->shareds); #ifndef OMP_COMPLIANT - team->num_tasks--; + //count down number of tasks under team + current_task_ptr->team->teamTaskLatch.count_down(1); #endif - if(task->part_id ==0) - delete[] (char*)task; + #if HPXMP_HAVE_OMPT ompt_task_status_t status_fin = ompt_task_complete; /* let OMPT know that we're returning to the callee task */ @@ -323,8 +306,11 @@ else my_task_data, status_fin, prior_task_data); } #endif -// make sure nothing is accessing this thread data after task_data got destroyed - set_thread_data( get_self_id(), reinterpret_cast(nullptr)); + //if task is in taskgroup, count down taskgroup latch as this task is done + if(parent_task_ptr->in_taskgroup) + parent_task_ptr->taskgroupLatch->count_down(1); + //tell parent I am done + parent_task_ptr->taskLatch.count_down(1); } #ifdef OMP_COMPLIANT @@ -346,11 +332,10 @@ void tg_task_setup( int gtid, kmp_task_t *task, omp_icv icv, //shared_ptr is used for these counters, because the parent/calling task may terminate at any time, //causing its omp_task_data to be deallocated. -void hpx_runtime::create_task( kmp_routine_entry_t task_func, int gtid, kmp_task_t *thunk) +void hpx_runtime::create_task( kmp_routine_entry_t task_func, int gtid, intrusive_ptr kmp_task_ptr) { - auto *current_task = get_task_data(); - - if(current_task->team->num_threads > 0) { + auto current_task_ptr = get_task_data(); + if(current_task_ptr->team->num_threads > 0) { #ifdef OMP_COMPLIANT if(current_task->in_taskgroup) { hpx::apply( *(current_task->tg_exec), tg_task_setup, gtid, thunk, current_task->icv, @@ -361,17 +346,19 @@ void hpx_runtime::create_task( kmp_routine_entry_t task_func, int gtid, kmp_task current_task->num_child_tasks, current_task->team ); } #else - //TODO: add taskgroups in non compliant version - *(current_task->num_child_tasks) += 1; - current_task->team->num_tasks++; + //this is waited in taskwait, wait for all tasks before taskwait created to be done + // create_task function is not supposed to wait anything + current_task_ptr->taskLatch.count_up(1); + //count up number of tasks in this team + current_task_ptr->team->teamTaskLatch.count_up(1); + //count up number of task in taskgroup if we are under taskgroup construct + if(current_task_ptr->in_taskgroup) + current_task_ptr->taskgroupLatch->count_up(1); //this fixes hpx::apply changes in hpx backend hpx::applier::register_thread_nullary( - std::bind(&task_setup, gtid, thunk, current_task->icv, - current_task->num_child_tasks, current_task->team), + std::bind(&task_setup, gtid, kmp_task_ptr, current_task_ptr), "omp_explicit_task", hpx::threads::pending, true, hpx::threads::thread_priority_normal); -// hpx::apply(task_setup, gtid, thunk, current_task->icv, -// current_task->num_child_tasks, current_task->team ); #endif } // else { @@ -380,12 +367,10 @@ void hpx_runtime::create_task( kmp_routine_entry_t task_func, int gtid, kmp_task // } } -void df_task_wrapper( int gtid, kmp_task_t *task, omp_icv icv, - shared_ptr> task_counter, - parallel_region *team, - vector> deps) +//deps will notify when_all function +void df_task_wrapper( int gtid, kmp_task_t *task, intrusive_ptr parent_task_ptr, vector> deps) { - task_setup( gtid, task, icv, task_counter, team); + task_setup( gtid, task, parent_task_ptr); } #ifdef OMP_COMPLIANT @@ -406,34 +391,32 @@ void hpx_runtime::create_df_task( int 
gtid, kmp_task_t *thunk, int ndeps, kmp_depend_info_t *dep_list, int ndeps_noalias, kmp_depend_info_t *noalias_dep_list ) { - auto task = get_task_data(); - auto team = task->team; - if(team->num_threads == 1 ) { - create_task(thunk->routine, gtid, thunk); - } + auto current_task_ptr = get_task_data(); + auto team = current_task_ptr->team; vector> dep_futures; dep_futures.reserve( ndeps + ndeps_noalias); //Populating a vector of futures that the task depends on for(int i = 0; i < ndeps;i++) { - if(task->df_map.count( dep_list[i].base_addr) > 0) { - dep_futures.push_back(task->df_map[dep_list[i].base_addr]); + if(current_task_ptr->df_map.count( dep_list[i].base_addr) > 0) { + dep_futures.push_back(current_task_ptr->df_map[dep_list[i].base_addr]); } } for(int i = 0; i < ndeps_noalias;i++) { - if(task->df_map.count( noalias_dep_list[i].base_addr) > 0) { - dep_futures.push_back(task->df_map[noalias_dep_list[i].base_addr]); + if(current_task_ptr->df_map.count( noalias_dep_list[i].base_addr) > 0) { + dep_futures.push_back(current_task_ptr->df_map[noalias_dep_list[i].base_addr]); } } shared_future new_task; - if(task->in_taskgroup) { + if(current_task_ptr->in_taskgroup) { + current_task_ptr->taskgroupLatch->count_up(1); } else { - *(task->num_child_tasks) += 1; + current_task_ptr->taskLatch.count_up(1); } #ifndef OMP_COMPLIANT - team->num_tasks++; + team->teamTaskLatch.count_up(1); #endif if(dep_futures.size() == 0) { #ifdef OMP_COMPLIANT @@ -445,8 +428,7 @@ void hpx_runtime::create_df_task( int gtid, kmp_task_t *thunk, task->num_child_tasks, team); } #else - new_task = hpx::async( task_setup, gtid, thunk, task->icv, - task->num_child_tasks, team); + new_task = hpx::async(task_setup, gtid, thunk, current_task_ptr); #endif } else { @@ -466,19 +448,17 @@ void hpx_runtime::create_df_task( int gtid, kmp_task_t *thunk, team, hpx::when_all(dep_futures) ); } #else - new_task = dataflow( unwrapping(df_task_wrapper), gtid, thunk, task->icv, - task->num_child_tasks, - team, hpx::when_all(dep_futures) ); + new_task = dataflow( unwrapping(df_task_wrapper), gtid, thunk, current_task_ptr, hpx::when_all(dep_futures)); #endif } for(int i = 0 ; i < ndeps; i++) { if(dep_list[i].flags.out) { - task->df_map[dep_list[i].base_addr] = new_task; + current_task_ptr->df_map[dep_list[i].base_addr] = new_task; } } for(int i = 0 ; i < ndeps_noalias; i++) { if(noalias_dep_list[i].flags.out) { - task->df_map[noalias_dep_list[i].base_addr] = new_task; + current_task_ptr->df_map[noalias_dep_list[i].base_addr] = new_task; } } //task->last_df_task = new_task; @@ -557,7 +537,7 @@ void hpx_runtime::create_future_task( int gtid, kmp_task_t *thunk, *(input_futures[0]), *(input_futures[1]), *(input_futures[2]) ); } else { - cout << "too many dependencies for now" << endl; + std::cerr << "too many dependencies for now" << endl; } } #endif @@ -566,14 +546,12 @@ void hpx_runtime::create_future_task( int gtid, kmp_task_t *thunk, void thread_setup( invoke_func kmp_invoke, microtask_t thread_func, int argc, void **argv, int tid, - parallel_region *team, omp_task_data *parent, - mutex_type& barrier_mtx, - hpx::lcos::local::condition_variable_any& cond, - int& running_threads ) + parallel_region *team, intrusive_ptr parent, + hpxmp_latch& threadLatch) { - omp_task_data task_data(tid, team, parent); + intrusive_ptr task_data_ptr(new omp_task_data(tid, team, parent.get())); - set_thread_data( get_self_id(), reinterpret_cast(&task_data)); + set_thread_data( get_self_id(), reinterpret_cast(task_data_ptr.get())); if(argc == 0) { //note: kmp_invoke 
segfaults iff argc == 0 thread_func(&tid, &tid); @@ -607,42 +585,16 @@ void thread_setup( invoke_func kmp_invoke, microtask_t thread_func, } #endif } - int count = 0; - int max_count = 10; - while (*(task_data.num_child_tasks) > 0 ) { - if(count == 0) { - hpx::this_thread::yield(); - } else { - int sleep_time = 10*count; - if(count > max_count) - sleep_time = 10*max_count; - hpx::this_thread::sleep_for(std::chrono::milliseconds(sleep_time)); - } - count++; - } - - //This keeps the task_data on this stack allocated. When is that needed? - // if tasks are created without a barrier or taskwait, they could still - // reference their parents metadata(task_data above). - //This combined with the waiting on child tasks above fufills the requirements - // of an OpenMP barrier. - std::unique_lock lk(barrier_mtx); - if(--running_threads == 0) { - //hpx::lcos::local::spinlock::scoped_lock lk(barrier_mtx); - //std::unique_lock lk(barrier_mtx); - hpx::util::ignore_while_checking > il(&lk); - cond.notify_all(); - } + threadLatch.count_down(1); } // This is the only place where get_thread can't be called, since // that data is not initialized for the new hpx threads yet. void fork_worker( invoke_func kmp_invoke, microtask_t thread_func, int argc, void **argv, - omp_task_data *parent) + intrusive_ptr parent) { parallel_region team(parent->team, parent->threads_requested); - #if HPXMP_HAVE_OMPT //TODO:HOW TO FIND OUT INVOKER ompt_invoker_t a = ompt_invoker_runtime; @@ -659,47 +611,20 @@ void fork_worker( invoke_func kmp_invoke, microtask_t thread_func, #ifdef OMP_COMPLIANT team.exec.reset(new local_priority_queue_executor(parent->threads_requested)); #endif - hpx::lcos::local::condition_variable_any cond; - mutex_type barrier_mtx; int running_threads = parent->threads_requested; + hpxmp_latch threadLatch(running_threads+1); for( int i = 0; i < parent->threads_requested; i++ ) { hpx::applier::register_thread_nullary( std::bind( &thread_setup, kmp_invoke, thread_func, argc, argv, i, &team, parent, - boost::ref(barrier_mtx), boost::ref(cond), boost::ref(running_threads) ), + boost::ref(threadLatch)), "omp_implicit_task", hpx::threads::pending, true, hpx::threads::thread_priority_low, i ); //true, hpx::threads::thread_priority_normal, i ); } - { - //hpx::lcos::local::spinlock::scoped_lock lk(barrier_mtx); - std::unique_lock lk(barrier_mtx); - while( running_threads > 0 ) { - cond.wait(lk); - } - } - //The executor containing the tasks will be destroyed as this call goes out - //of scope, which will wait on all tasks contained in it. So, nothing needs - //to be done here for it. - - //I shouldn't need this. Tasks should be done before the thread exit. - //FIXME: Remove this once the rest of the cond vars are in. -#ifndef OMP_COMPLIANT - int count = 0; - int max_count = 10; - while(team.num_tasks > 0) { - if(count == 0) { - hpx::this_thread::yield(); - } else { - int sleep_time = 10*count; - if(count > max_count) - sleep_time = 10*max_count; - hpx::this_thread::sleep_for(std::chrono::milliseconds(sleep_time)); - } - count++; - } -#endif - + threadLatch.count_down_and_wait(); + // wait for all the tasks in the team to finish + team.teamTaskLatch.wait(); #if HPXMP_HAVE_OMPT if (ompt_enabled.ompt_callback_parallel_end) { ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( @@ -713,15 +638,15 @@ void fork_worker( invoke_func kmp_invoke, microtask_t thread_func, //TODO: according to the spec, the current thread should be thread 0 of the new team, and execute the new work. 
void hpx_runtime::fork(invoke_func kmp_invoke, microtask_t thread_func, int argc, void** argv) { - omp_task_data *current_task = get_task_data(); + auto current_task_ptr = get_task_data(); if( hpx::threads::get_self_ptr() ) { - fork_worker(kmp_invoke, thread_func, argc, argv, current_task); + fork_worker(kmp_invoke, thread_func, argc, argv, current_task_ptr); } else { - //this handles the sync for hox threads. + //this handles the sync for hpx threads. hpx::threads::run_as_hpx_thread(&fork_worker,kmp_invoke, thread_func, argc, argv, - current_task); + current_task_ptr); } - current_task->set_threads_requested(current_task->icv.nthreads ); + current_task_ptr->set_threads_requested(current_task_ptr->icv.nthreads ); } diff --git a/src/hpx_runtime.h b/src/hpx_runtime.h index 3f5ccb8..557a1a1 100644 --- a/src/hpx_runtime.h +++ b/src/hpx_runtime.h @@ -27,6 +27,7 @@ #include #include #include +#include //#include //#include @@ -41,10 +42,12 @@ using std::atomic; using boost::shared_ptr; using hpx::threads::executors::local_priority_queue_executor; using hpx::lcos::local::barrier; +using hpx::lcos::local::latch; using hpx::lcos::shared_future; using hpx::lcos::future; using std::vector; using hpx::util::high_resolution_timer; +using boost::intrusive_ptr; typedef void (*microtask_t)( int *gtid, int *npr, ... ); @@ -64,15 +67,27 @@ typedef int (* kmp_routine_entry_t)( int, void * ); typedef std::map> depends_map; -typedef struct kmp_task { +struct kmp_task_t { void * shareds; kmp_routine_entry_t routine; int part_id; bool gcc; + atomic pointer_counter; #if OMP_40_ENABLED kmp_routine_entry_t destructors; #endif -} kmp_task_t; +}; + +inline void intrusive_ptr_add_ref(kmp_task_t *x) +{ + ++x->pointer_counter; +} + +inline void intrusive_ptr_release(kmp_task_t *x) +{ + if (x->pointer_counter == 0) + delete x; +} typedef struct kmp_depend_info { @@ -84,6 +99,58 @@ typedef struct kmp_depend_info { } flags; } kmp_depend_info_t; +#if HPXMP_HAVE_OMP_50_ENABLED +struct kmp_task_red_flags_t { + unsigned lazy_priv : 1; // hint: (1) use lazy allocation (big objects) + unsigned reserved31 : 31; +}; + +// internal structure for reduction data item related info +struct kmp_task_red_data_t { + void *reduce_shar; // shared reduction item + size_t reduce_size; // size of data item + void *reduce_priv; // thread specific data + void *reduce_pend; // end of private data for comparison op + void *reduce_init; // data initialization routine + void *reduce_fini; // data finalization routine + void *reduce_comb; // data combiner routine + kmp_task_red_flags_t flags; // flags for additional info from compiler +}; + +// structure sent us by compiler - one per reduction item +struct kmp_task_red_input_t { + void *reduce_shar; // shared reduction item + size_t reduce_size; // size of data item + void *reduce_init; // data initialization routine + void *reduce_fini; // data finalization routine + void *reduce_comb; // data combiner routine + kmp_task_red_flags_t flags; // flags for additional info from compiler +}; + +struct kmp_taskgroup_t { + std::atomic count; // number of allocated and incomplete tasks + std::atomic + cancel_request; // request for cancellation of this taskgroup + kmp_taskgroup_t* parent; // parent taskgroup + // Block of data to perform task reduction + shared_ptr> reduce_data; // reduction related info + int reduce_num_data; // number of data items to reduce + atomic pointer_counter; +}; + +inline void intrusive_ptr_add_ref(kmp_taskgroup_t *x) +{ + ++x->pointer_counter; +} + +inline void 
intrusive_ptr_release(kmp_taskgroup_t *x) +{ + if (x->pointer_counter == 0) + delete x; +} + +#endif + class loop_data { public: loop_data(int NT, int L, int U, int S, int C, int sched) @@ -123,13 +190,26 @@ class loop_data { std::vector iter_count; }; +//temp solution for cout_up does not allow starting from 0 in HPX +class hpxmp_latch: public latch { +public: + using latch::latch; + void count_up(std::ptrdiff_t n) + { + HPX_ASSERT(n >= 0); + + std::ptrdiff_t old_count = + counter_.fetch_add(n, std::memory_order_acq_rel); + } +}; + //Does this need to keep track of the parallel region it is nested in, // the omp_task_data of the parent thread, or both? //template struct parallel_region { parallel_region( int N ) : num_threads(N), globalBarrier(N), - depth(0), reduce_data(N) + depth(0), reduce_data(N), teamTaskLatch(0) {}; parallel_region( parallel_region *parent, int threads_requested ) : parallel_region(threads_requested) @@ -147,13 +227,13 @@ struct parallel_region { mutex_type thread_mtx{}; mutex_type single_mtx{}; int depth; - atomic num_tasks{0}; atomic single_counter{0}; atomic current_single_thread{-1}; void *copyprivate_data; vector reduce_data; vector loop_list; mutex_type loop_mtx; + hpxmp_latch teamTaskLatch; #if (HPXMP_HAVE_OMPT) ompt_data_t parent_data = ompt_data_none; ompt_data_t parallel_data = ompt_data_none; @@ -171,8 +251,8 @@ class omp_task_data { public: //This constructor should only be used once for the implicit task omp_task_data( parallel_region *T, omp_device_icv *global, int init_num_threads) - : team(T), num_child_tasks(new atomic{0}) - { + : team(T),taskLatch(0) + { local_thread_num = 0; icv.device = global; icv.nthreads = init_num_threads; @@ -191,7 +271,7 @@ class omp_task_data { //This is for explicit tasks omp_task_data(int tid, parallel_region *T, omp_icv icv_vars) - : local_thread_num(tid), team(T), num_child_tasks(new atomic{0}), icv(icv_vars) + : local_thread_num(tid), team(T), icv(icv_vars),taskLatch(0) { threads_requested = icv.nthreads; icv_vars.device = icv.device; @@ -218,10 +298,11 @@ class omp_task_data { parallel_region *team; //mutex_type thread_mutex; //hpx::lcos::local::condition_variable_any thread_cond; - shared_ptr> num_child_tasks; int single_counter{0}; int loop_num{0}; bool in_taskgroup{false}; + hpxmp_latch taskLatch; + atomic pointer_counter{0}; //shared_future last_df_task; #if HPXMP_HAVE_OMPT ompt_data_t task_data = ompt_data_none; @@ -230,13 +311,27 @@ class omp_task_data { #ifdef OMP_COMPLIANT shared_ptr tg_exec; #else - shared_ptr> tg_num_tasks; + shared_ptr taskgroupLatch; #endif omp_icv icv; depends_map df_map; +#if HPXMP_HAVE_OMP_50_ENABLED + intrusive_ptr td_taskgroup; +#endif }; +inline void intrusive_ptr_add_ref(omp_task_data *x) +{ + ++x->pointer_counter; +} + +inline void intrusive_ptr_release(omp_task_data *x) +{ + if (--x->pointer_counter == 0) + delete x; +} + struct raw_data { void *data; size_t size; @@ -248,7 +343,7 @@ class hpx_runtime { void fork(invoke_func kmp_invoke, microtask_t thread_func, int argc, void** argv); parallel_region* get_team(); bool set_thread_data_check(); - omp_task_data* get_task_data(); + intrusive_ptr get_task_data(); int get_thread_num(); int get_num_threads(); int get_num_procs(); @@ -256,7 +351,7 @@ class hpx_runtime { void barrier_wait(); void create_task( omp_task_func taskfunc, void *frame_pointer, void *firstprivates, int is_tied, int blocks_parent); - void create_task( kmp_routine_entry_t taskfunc, int gtid, kmp_task_t *task); + void create_task( kmp_routine_entry_t taskfunc, int 
gtid, intrusive_ptr task); void create_df_task( int gtid, kmp_task_t *thunk, int ndeps, kmp_depend_info_t *dep_list, int ndeps_noalias, kmp_depend_info_t *noalias_dep_list ); @@ -277,7 +372,7 @@ class hpx_runtime { private: shared_ptr implicit_region; - shared_ptr initial_thread; + intrusive_ptr initial_thread; int num_procs; shared_ptr walltime; bool external_hpx; diff --git a/src/intel_hpxMP.cpp b/src/intel_hpxMP.cpp index 01c6c8b..5d7bd6f 100644 --- a/src/intel_hpxMP.cpp +++ b/src/intel_hpxMP.cpp @@ -92,12 +92,13 @@ __kmpc_omp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags, //kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *) & flags; //TODO: do I need to do something with these flags? int task_size = sizeof_kmp_task_t + (-sizeof_kmp_task_t%8); - + //can be sure that no deletion of task happens here, no need of intrusive ptr kmp_task_t *task = (kmp_task_t*)new char[task_size + sizeof_shareds]; //This gets deleted at the end of task_setup task->routine = task_entry; task->gcc = false; + task->pointer_counter = 0; if( sizeof_shareds == 0 ) { task->shareds = NULL; } else { @@ -112,7 +113,8 @@ int __kmpc_omp_task( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task){ std::cout<<"__kmpc_omp_task"<create_task(new_task->routine, gtid, new_task); + intrusive_ptr new_task_ptr(new_task); + hpx_backend->create_task(new_task_ptr->routine, gtid, new_task_ptr); return 1; } @@ -202,6 +204,146 @@ kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part ){ return 0; } +#if HPXMP_HAVE_OMP_50_ENABLED +// Task Reduction implementation +void *__kmpc_task_reduction_init(int gtid, int num, void *data) { + auto thread = hpx_backend->get_task_data(); + intrusive_ptr tg = thread->td_taskgroup; + int nth = thread->team->num_threads; + kmp_task_red_input_t *input = (kmp_task_red_input_t *)data; + + if (nth == 1) { + return (void *)tg.get(); + } + shared_ptr> arr(new(vector)); + arr->reserve(num); + for (int i = 0; i < num; ++i) { + void (*f_init)(void *) = (void (*)(void *))(input[i].reduce_init); + size_t size = input[i].reduce_size - 1; + // round the size up to cache line per thread-specific item + size += 64 - size % 64; + (*arr)[i].reduce_shar = input[i].reduce_shar; + (*arr)[i].reduce_size = size; + (*arr)[i].reduce_init = input[i].reduce_init; + (*arr)[i].reduce_fini = input[i].reduce_fini; + (*arr)[i].reduce_comb = input[i].reduce_comb; + (*arr)[i].flags = input[i].flags; + if (!input[i].flags.lazy_priv) { + // allocate cache-line aligned block and fill it with zeros + (*arr)[i].reduce_priv = new char[nth * size]; + (*arr)[i].reduce_pend = (char *)((*arr)[i].reduce_priv) + nth * size; + if (f_init != NULL) { + // initialize thread-specific items + for (int j = 0; j < nth; ++j) { + f_init((char *)((*arr)[i].reduce_priv) + j * size); + } + } + } else { + // only allocate space for pointers now, + // objects will be lazily allocated/initialized once requested + (*arr)[i].reduce_priv = new char[nth * sizeof(void *)]; + } + } + tg->reduce_data = arr; + tg->reduce_num_data = num; + return (void *)tg.get(); +} + +/*! 
+@ingroup TASKING +@param gtid Global thread ID +@param tskgrp The taskgroup ID (optional) +@param data Shared location of the item +@return The pointer to per-thread data + +Get thread-specific location of data item +*/ +void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) { + auto thread = hpx_backend->get_task_data(); + kmp_int32 nth = thread->team->num_threads; + if (nth == 1) + return data; // nothing to do + + intrusive_ptr tg = (kmp_taskgroup_t*)tskgrp; + if (tg == NULL) + tg = thread->td_taskgroup; + shared_ptr> arr = tg->reduce_data; + kmp_int32 num = tg->reduce_num_data; + kmp_int32 tid = gtid; + + while (tg != NULL) { + for (int i = 0; i < num; ++i) { + if (!(*arr)[i].flags.lazy_priv) { + if (data == (*arr)[i].reduce_shar || + (data >= (*arr)[i].reduce_priv && data < (*arr)[i].reduce_pend)) + return (char *)((*arr)[i].reduce_priv) + tid * (*arr)[i].reduce_size; + } else { + // check shared location first + void **p_priv = (void **)((*arr)[i].reduce_priv); + if (data == (*arr)[i].reduce_shar) + goto found; + // check if we get some thread specific location as parameter + for (int j = 0; j < nth; ++j) + if (data == p_priv[j]) + goto found; + continue; // not found, continue search + found: + if (p_priv[tid] == NULL) { + // allocate thread specific object lazily + void (*f_init)(void *) = (void (*)(void *))((*arr)[i].reduce_init); + p_priv[tid] = new char[(*arr)[i].reduce_size]; + if (f_init != NULL) { + f_init(p_priv[tid]); + } + } + return p_priv[tid]; + } + } + tg = tg->parent; + arr = tg->reduce_data; + num = tg->reduce_num_data; + } + return NULL; // ERROR, this line never executed +} + +// Finalize task reduction. +// Called from __kmpc_end_taskgroup() +void __kmp_task_reduction_fini(void *thr, intrusive_ptr tg) { + auto th = hpx_backend->get_task_data(); + kmp_int32 nth = th->team->num_threads; + shared_ptr> arr = tg->reduce_data; + kmp_int32 num = tg->reduce_num_data; + for (int i = 0; i < num; ++i) { + void *sh_data = (*arr)[i].reduce_shar; + void (*f_fini)(void *) = (void (*)(void *))((*arr)[i].reduce_fini); + void (*f_comb)(void *, void *) = + (void (*)(void *, void *))((*arr)[i].reduce_comb); + if (!(*arr)[i].flags.lazy_priv) { + void *pr_data = (*arr)[i].reduce_priv; + size_t size = (*arr)[i].reduce_size; + for (int j = 0; j < nth; ++j) { + void *priv_data = (char *)pr_data + j * size; + f_comb(sh_data, priv_data); // combine results + if (f_fini) + f_fini(priv_data); // finalize if needed + } + } else { + void **pr_data = (void **)((*arr)[i].reduce_priv); + for (int j = 0; j < nth; ++j) { + if (pr_data[j] != NULL) { + f_comb(sh_data, pr_data[j]); // combine results + if (f_fini) + f_fini(pr_data[j]); // finalize if needed + delete(pr_data[j]); + } + } + } + } + tg->reduce_data = NULL; + tg->reduce_num_data = 0; +} +#endif + void __kmpc_taskgroup( ident_t* loc, int gtid ) { if( hpx_backend->start_taskgroup() ) { #if defined DEBUG && defined HPXMP_HAVE_TRACE @@ -221,8 +363,9 @@ void __kmpc_taskgroup( ident_t* loc, int gtid ) { } } #endif - cout << "Warning, taskgroup failed to start" << endl; } + else + cout << "Warning, taskgroup failed to start" << endl; //hpx_backend->get_task_data()->num_taskgroup_tasks.reset(new atomic{0}); } @@ -300,7 +443,7 @@ __kmpc_push_num_threads( ident_t *loc, #endif start_backend(); start_backend(); - omp_task_data *data = hpx_backend->get_task_data(); + auto data = hpx_backend->get_task_data(); data->set_threads_requested( num_threads ); } @@ -371,7 +514,7 @@ int __kmpc_single(ident_t *loc, int tid){ if(!hpx_backend 
|| !hpx::threads::get_self_ptr() ) { return 1; } - auto *task = hpx_backend->get_task_data(); + auto task = hpx_backend->get_task_data(); auto *team = task->team; int do_work = 0; diff --git a/src/intel_hpxMP.h b/src/intel_hpxMP.h index eab0fae..a6bad4e 100644 --- a/src/intel_hpxMP.h +++ b/src/intel_hpxMP.h @@ -265,3 +265,9 @@ extern "C" void omp_unset_nest_lock(omp_lock_t **lock); extern "C" int omp_test_lock(omp_lock_t **lock); extern "C" int omp_test_nest_lock(omp_lock_t **lock); +#if HPXMP_HAVE_OMP_50_ENABLED +extern "C" void *__kmpc_task_reduction_init(int gtid, int num, void *data); +extern "C" void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data); +void __kmp_task_reduction_fini(void *thr, intrusive_ptr tg); +#endif + diff --git a/src/loop_schedule.cpp b/src/loop_schedule.cpp index 7c01054..951b1e3 100644 --- a/src/loop_schedule.cpp +++ b/src/loop_schedule.cpp @@ -56,6 +56,9 @@ __kmpc_for_static_init_4( ident_t *loc, int32_t gtid, int32_t schedtype, int32_t *p_last_iter,int32_t *p_lower, int32_t *p_upper, int32_t *p_stride, int32_t incr, int32_t chunk ) { + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_for_static_init_4"<( gtid, schedtype, p_last_iter, p_lower, p_upper, p_stride, incr, chunk ); } @@ -65,6 +68,9 @@ __kmpc_for_static_init_4u( ident_t *loc, int32_t gtid, int32_t schedtype, int32_t *p_last_iter, uint32_t *p_lower, uint32_t *p_upper, int32_t *p_stride, int32_t incr, int32_t chunk ) { + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_for_static_init_4u"<( gtid, schedtype, p_last_iter, p_lower, p_upper, p_stride, incr, chunk ); } @@ -75,6 +81,9 @@ __kmpc_for_static_init_8( ident_t *loc, int32_t gtid, int64_t *p_lower, int64_t *p_upper, int64_t *p_stride, int64_t incr, int64_t chunk ) { + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_for_static_init_8"<( gtid, schedtype, p_last_iter, p_lower, p_upper, p_stride, incr, chunk ); } @@ -85,6 +94,9 @@ __kmpc_for_static_init_8u( ident_t *loc, int32_t gtid, uint64_t *p_lower, uint64_t *p_upper, int64_t *p_stride, int64_t incr, int64_t chunk ) { + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_for_static_init_8u"<( gtid, schedtype, p_last_iter, p_lower, p_upper, p_stride, incr, chunk ); } @@ -92,6 +104,9 @@ __kmpc_for_static_init_8u( ident_t *loc, int32_t gtid, void __kmpc_for_static_fini( ident_t *loc, int32_t gtid ){ //Only seems to do internal tracking in intel runtime + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_for_static_fini"<( gtid, schedule, lb, ub, st, chunk ); } @@ -138,6 +156,9 @@ void __kmpc_dispatch_init_4u( ident_t *loc, int32_t gtid, enum sched_type schedule, uint32_t lb, uint32_t ub, int32_t st, int32_t chunk ) { + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_dispatch_init_4u"<( gtid, schedule, lb, ub, st, chunk ); } @@ -145,6 +166,9 @@ void __kmpc_dispatch_init_8( ident_t *loc, int32_t gtid, enum sched_type schedule, int64_t lb, int64_t ub, int64_t st, int64_t chunk ) { + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_dispatch_init_8"<( gtid, schedule, lb, ub, st, chunk ); } @@ -152,6 +176,9 @@ void __kmpc_dispatch_init_8u( ident_t *loc, int32_t gtid, enum sched_type schedule, uint64_t lb, uint64_t ub, int64_t st, int64_t chunk ) { + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_dispatch_init_8u"<( gtid, schedule, lb, ub, st, chunk ); } @@ -258,40 +285,67 @@ int kmp_next( int gtid, int *p_last, T *p_lower, T *p_upper, D 
*p_stride ) { int __kmpc_dispatch_next_4( ident_t *loc, int32_t gtid, int32_t *p_last, int32_t *p_lb, int32_t *p_ub, int32_t *p_st ){ + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_dispatch_next_4"<(gtid, p_last, p_lb, p_ub, p_st); } int __kmpc_dispatch_next_4u( ident_t *loc, int32_t gtid, int32_t *p_last, uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st ){ + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_dispatch_next_4u"<(gtid, p_last, p_lb, p_ub, p_st); } int __kmpc_dispatch_next_8( ident_t *loc, int32_t gtid, int32_t *p_last, int64_t *p_lb, int64_t *p_ub, int64_t *p_st ){ + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_dispatch_next_8"<(gtid, p_last, p_lb, p_ub, p_st); } int __kmpc_dispatch_next_8u( ident_t *loc, int32_t gtid, int32_t *p_last, uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st ){ + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_dispatch_next_8u"<(gtid, p_last, p_lb, p_ub, p_st); } void __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid ){ + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_dispatch_fini_4"<get_task_data()->loop_num - 1; auto loop_sched = &(hpx_backend->get_team()->loop_list[ current_loop ]); while( loop_sched->ordered_count < loop_sched->first_iter[global_tid] || @@ -301,6 +355,9 @@ void __kmpc_ordered(ident_t *, kmp_int32 global_tid ) { } void __kmpc_end_ordered(ident_t *, kmp_int32 global_tid ) { + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_end_ordered"<get_task_data()->loop_num - 1; auto loop_sched = &(hpx_backend->get_team()->loop_list[ current_loop ]); loop_sched->ordered_count++; diff --git a/src/ompt_hpx-general.cpp b/src/ompt_hpx-general.cpp index adedfcc..7699c14 100644 --- a/src/ompt_hpx-general.cpp +++ b/src/ompt_hpx-general.cpp @@ -302,7 +302,7 @@ ompt_data_t *__ompt_get_thread_data_internal() { ompt_data_t* __ompt_get_parallel_data_internal() { - omp_task_data* omp_task = hpx_backend->get_task_data(); + auto omp_task = hpx_backend->get_task_data(); return &omp_task->team->parallel_data; } diff --git a/tests/openmp/regressions/CMakeLists.txt b/tests/openmp/regressions/CMakeLists.txt index 9675ee4..42c367d 100644 --- a/tests/openmp/regressions/CMakeLists.txt +++ b/tests/openmp/regressions/CMakeLists.txt @@ -7,7 +7,6 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") set(tests get_thread_num parallel_multi - task_no_parallel task_tree_simple task_untied ) diff --git a/tests/openmp/regressions/parallel_multi.cpp b/tests/openmp/regressions/parallel_multi.cpp index 3f5716e..f77bbbf 100644 --- a/tests/openmp/regressions/parallel_multi.cpp +++ b/tests/openmp/regressions/parallel_multi.cpp @@ -13,7 +13,7 @@ int main(){ for (int j = 0; j < 1000; j++) { #pragma omp parallel { - printf(" \n"); + int t = 0; } } } diff --git a/tests/openmp/regressions/task_no_parallel.cpp b/tests/openmp/regressions/task_no_parallel.cpp deleted file mode 100644 index 9164d6b..0000000 --- a/tests/openmp/regressions/task_no_parallel.cpp +++ /dev/null @@ -1,13 +0,0 @@ -// Copyright (c) 2018 Tianyi Zhang -// -// Distributed under the Boost Software License, Version 1.0. 
(See accompanying -// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - -// Demonstrating #9: terminating with uncaught exception - -#include -int main (int argc, char *argv[]) -{ -#pragma omp task - printf("This task is not in parallel regin\n"); -} diff --git a/tests/openmp/unit/CMakeLists.txt b/tests/openmp/unit/CMakeLists.txt index 61c8494..fe9131a 100644 --- a/tests/openmp/unit/CMakeLists.txt +++ b/tests/openmp/unit/CMakeLists.txt @@ -7,7 +7,6 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") set(tests app_lu app_old_fib - app_omp_fib app_vla atomic barrier @@ -32,15 +31,18 @@ set(tests #single_copyprivate_2 #failure sometime #single_copyprivate_1var #failure sometime single_nowait - task_depend - taskgroup #to be implemented, not checking correctness + taskgroup task_fp task_tree taskwait taskwait_2 #threadprivate #failure sometime ) - +if(HPXMP_WITH_OMP_50_ENABLED) + set(tests_omp50 + task_in_reduction + ) +endif() enable_testing() macro(do_test name) @@ -57,8 +59,38 @@ macro(do_test name) ) endmacro(do_test) +macro(do_test_enhanced name) + add_test(${name} ${name}) + set_target_properties(${name} PROPERTIES + CXX_STANDARD 11 + CXX_STANDARD_REQUIRED ON + CXX_EXTENSIONS OFF + ) + #maybe make the omp_num_threads configurable later + set_tests_properties(${name} PROPERTIES + ENVIRONMENT "LD_PRELOAD=${PROJECT_BINARY_DIR}/libhpxmp.so;OMP_NUM_THREADS=2" + TIMEOUT 60 + ) +endmacro(do_test_enhanced) + + + foreach(test ${tests}) set(sources ${test}.cpp) add_executable(${test} ${sources}) do_test(${test}) endforeach() + +foreach(test ${tests_enhanced}) + set(sources ${test}.cpp) + add_executable(${test} ${sources}) + do_test_enhanced(${test}) +endforeach() + +if(HPXMP_WITH_OMP_50_ENABLED) + foreach(test ${tests_omp50}) + set(sources ${test}.cpp) + add_executable(${test} ${sources}) + do_test_enhanced(${test}) + endforeach() +endif() diff --git a/tests/openmp/unit/app_omp_fib.cpp b/tests/openmp/unit/app_omp_fib.cpp index 94b5dda..d26f8ef 100644 --- a/tests/openmp/unit/app_omp_fib.cpp +++ b/tests/openmp/unit/app_omp_fib.cpp @@ -8,62 +8,61 @@ #include #include #include +#include long fib1(int k); long fib2(int k); //int num_tasks = 0; -int cutoff = 26; +int cutoff = 15; int main(int argc, char* argv[]) { - struct timeval t1; - struct timeval t2; - - int input; - long s, u; - long f; - double m; - - if (argc != 2 && argc != 3) - { - //fprintf(stderr, "Usage: ./fib \n"); - input = 24; - cutoff = 0; - //return 1; - } - else - { - input = atoi(argv[1]); - if (argc == 3) - { - cutoff = atoi(argv[2]); + for(int i=2; i<=20; i++) { + omp_set_num_threads(i/2); + struct timeval t1; + struct timeval t2; + + int input; + long s, u; + long f; + double m; + + if (argc != 2 && argc != 3) { + //fprintf(stderr, "Usage: ./fib \n"); + input = 24; + //return 1; + } else { + input = atoi(argv[1]); + if (argc == 3) { + cutoff = atoi(argv[2]); + } } - } - gettimeofday(&t1, NULL); + gettimeofday(&t1, NULL); #pragma omp parallel - { -#pragma omp master { -#pragma omp task shared(f) +#pragma omp master { - f = fib1(input); +#pragma omp task shared(f) + { + f = fib1(input); + } } } - } - gettimeofday(&t2, NULL); - printf("fib(%d) = %d\n", input, f); + gettimeofday(&t2, NULL); + printf("fib(%d) = %d\n", input, f); - s = t2.tv_sec - t1.tv_sec; - u = t2.tv_usec - t1.tv_usec; - m = (s * 1000 + u / 1000.0) + 0.5; - printf("cutoff = %d\n", cutoff); - printf("time = %.2lfms\n", m); - if (f != 46368) - return 1; + s = t2.tv_sec - t1.tv_sec; + u = t2.tv_usec - t1.tv_usec; + m = (s * 1000 + u / 1000.0) + 
0.5; + printf("cutoff = %d\n", cutoff); + printf("time = %.2lfms\n", m); + if (f != 46368) + return 1; + } return 0; } diff --git a/tests/openmp/unit/sections_2.cpp b/tests/openmp/unit/sections_2.cpp index 5af585b..a09e577 100644 --- a/tests/openmp/unit/sections_2.cpp +++ b/tests/openmp/unit/sections_2.cpp @@ -12,18 +12,12 @@ int main() { std::atomic position(-1); int result[4]; - + int ret = 0; position++; result[position] = 1; printf("The "); #pragma omp parallel sections { -#pragma omp section - { - position++; - result[position] = 2; - printf("STE||AR "); - } #pragma omp section { position++; @@ -36,9 +30,10 @@ int main() printf("is awesome\n "); printf("\n"); - for(int i = 0; i < 3; i++){ + for(int i = 0; i < 2; i++){ + printf("%d ",result[i]); if(result[i] > result[i+1]) - return 1; + ret = 1; } - return 0; + return ret; } diff --git a/tests/openmp/unit/task_depend.cpp b/tests/openmp/unit/task_depend.cpp index c20c816..8a0759e 100644 --- a/tests/openmp/unit/task_depend.cpp +++ b/tests/openmp/unit/task_depend.cpp @@ -5,6 +5,7 @@ // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) #include +#include int foo() { @@ -20,24 +21,24 @@ int main(int argc, char **argv) { int x = 5, y = 1, z = 0; printf("&x = %p\n", &x); + for(int i=2; i<=20; i++) { + omp_set_num_threads(i/2); #pragma omp parallel - { -#pragma omp single { +#pragma omp single + { #pragma omp task depend(out : x) - x = foo(); + x = foo(); #pragma omp task depend(in : x) depend(out : y) - y = bar(x); + y = bar(x); #pragma omp task depend(in : y) - z = bar(y); + z = bar(y); + } } + if (x != 42 || y != 53 || z != 64) + return 1; } - printf("x = %d\n", x); - printf("y = %d\n", y); - printf("z = %d\n", z); - if (x != 42 || y != 53 || z != 64) - return 1; return 0; } diff --git a/tests/openmp/unit/task_in_reduction.cpp b/tests/openmp/unit/task_in_reduction.cpp new file mode 100644 index 0000000..7edbc52 --- /dev/null +++ b/tests/openmp/unit/task_in_reduction.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2018 Tianyi Zhang +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include + +int main(int argc, char **argv) +{ + int sum = 0; + std::vector a(100,1); +#pragma omp parallel +#pragma omp single + { + #pragma omp taskgroup task_reduction(+:sum) + { + #pragma omp task in_reduction(+:sum) + for (int i=0; i<50; i++) sum += a[i]; + #pragma omp task in_reduction(+:sum) + for (int i=50; i<100; i++) sum += a[i]; + } + std::cout< #include -//not checking correctness as taskgroup is not implemented properly +//this code is not working with release under both openmp and hpxmp int main(int argc, char *argv[]) { - int result[5]; + std::vector result(5,0); std::atomic position(-1); #pragma omp parallel { @@ -40,15 +40,15 @@ int main(int argc, char *argv[]) { } } } - printf("is fun to watch "); + printf(" fun to watch "); position++; result[position] = 3; } } // End of parallel region - printf("\n"); + for(int i = 0; i < 3; i++){ if(result[i] > result[i+1]) - return 0; + return 1; } return 0; } diff --git a/tests/openmp/unit/taskgroup_.cpp b/tests/openmp/unit/taskgroup_.cpp new file mode 100644 index 0000000..0acdf48 --- /dev/null +++ b/tests/openmp/unit/taskgroup_.cpp @@ -0,0 +1,67 @@ +// Copyright (c) 2018 Tianyi Zhang +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include + +//this code is not working with release under both openmp and hpxmp +int main(int argc, char *argv[]) { + std::vector result(5,0); + std::atomic position(-1); +#pragma omp parallel + { +#pragma omp single + { + printf("A "); + position++; + result[position] = 1; +#pragma omp taskgroup + { +#pragma omp task + { + printf("race "); + position++; + result[position] = 2; + } +#pragma omp task + { + printf("car "); + position++; + result[position] = 2; +#pragma omp task + { + printf("is"); + position++; + result[position] = 2; + } + } + } + printf(" fun to watch "); + position++; + result[position] = 3; +#pragma omp taskgroup + { +#pragma omp task + { + printf(", good "); + position++; + result[position] = 4; + } + } + printf("job ! "); + position++; + result[position] = 5; + } + } // End of parallel region + printf("\n"); + + for(int i = 0; i < 5; i++){ + if(result[i] > result[i+1]) + return 1; + } + return 0; +}
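For context on the `__kmpc_task_reduction_*` entry points added in src/intel_hpxMP.cpp, here is a rough, hand-written approximation of what a compiler emits for `taskgroup task_reduction(+:sum)` with one `in_reduction` task. Only the two `extern "C"` signatures and the `kmp_task_red_input_t` layout come from this patch; the helper names (`sum_init`, `sum_comb`, `enter_taskgroup`) and the lowering itself are illustrative, not actual clang output:

```cpp
// Hypothetical, hand-written stand-in for compiler-generated code; only the
// runtime entry points and the input-record layout come from this patch.
#include <cstddef>

extern "C" void* __kmpc_task_reduction_init(int gtid, int num, void* data);
extern "C" void* __kmpc_task_reduction_get_th_data(int gtid, void* tskgrp, void* data);

struct kmp_task_red_flags_t { unsigned lazy_priv : 1; unsigned reserved31 : 31; };
struct kmp_task_red_input_t {    // one record per reduction item (see hpx_runtime.h)
    void*  reduce_shar;          // shared variable, here &sum
    size_t reduce_size;          // size of the item
    void*  reduce_init;          // initializer for per-thread copies
    void*  reduce_fini;          // finalizer (unused for int)
    void*  reduce_comb;          // combiner: shared += private
    kmp_task_red_flags_t flags;
};

static void sum_init(void* priv)             { *static_cast<int*>(priv) = 0; }
static void sum_comb(void* shar, void* priv) { *static_cast<int*>(shar) += *static_cast<int*>(priv); }

int sum = 0;

void enter_taskgroup(int gtid)   // roughly what runs at "#pragma omp taskgroup task_reduction(+:sum)"
{
    kmp_task_red_input_t item = {};
    item.reduce_shar = &sum;
    item.reduce_size = sizeof(int);
    item.reduce_init = reinterpret_cast<void*>(&sum_init);
    item.reduce_comb = reinterpret_cast<void*>(&sum_comb);

    // registers the item; the returned handle is the taskgroup used by in_reduction tasks
    void* tg = __kmpc_task_reduction_init(gtid, /*num items*/ 1, &item);

    // inside a "task in_reduction(+:sum)" body, accesses to sum are redirected to the
    // thread-specific copy; __kmp_task_reduction_fini later combines the copies
    int* my_sum = static_cast<int*>(__kmpc_task_reduction_get_th_data(gtid, tg, &sum));
    *my_sum += 1;
}
```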