diff --git a/CMakeLists.txt b/CMakeLists.txt index 695d4fc..3f3b996 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,6 +40,11 @@ if(HPXMP_WITH_OMPT) src/ompt-internal.h) endif() +set(HPXMP_WITH_OMP_50_ENABLED OFF CACHE BOOL "enable openmp 5.0 features in hpxmp") +if(HPXMP_WITH_OMP_50_ENABLED) + add_definitions(-DHPXMP_HAVE_OMP_50_ENABLED) +endif() + #decide whether debug or release build set(RELEASE_BUILD FALSE) string(TOLOWER "${CMAKE_BUILD_TYPE}" libhpxmp_build_type_lowercase) @@ -74,7 +79,7 @@ if(NOT HPX_WITH_HPXMP) target_link_libraries(hpxmp PRIVATE -L${dir}) endforeach() - target_link_libraries(hpxmp PRIVATE ${HPX_LIBRARIES}) + target_link_libraries(hpxmp PRIVATE ${HPX_LIBRARIES} hpx_preprocessor) else() diff --git a/README.md b/README.md index 196f44d..d80a942 100644 --- a/README.md +++ b/README.md @@ -55,4 +55,7 @@ Please refer to [Usage Examples](doc/usecases.md) for simple use examples. Please refer to [dones and todos](doc/done-todo.md) for information about what has been implemented in hpxMP and what needs to be done. +# Credit +Please cite as +Jeremy Kemp, Tianyi Zhang, Bryce Adelstein, Shahrzad Shirzad, Hartmut Kaiser, Parsa Amini, & Bibek Wagle. (2018, November 1). hpxMP (Version 0.1.0). Zenodo. http://doi.org/10.5281/zenodo.2662481 diff --git a/doc/done-todo.md b/doc/done-todo.md index a4b3aab..03919cd 100644 --- a/doc/done-todo.md +++ b/doc/done-todo.md @@ -63,10 +63,14 @@ The following pragmas and clauses are tested in hpxMP. *#pragma omp task untied +*#pragma omp taskgroup task_reduction(+:sum) + +*#pragma omp task in_reduction(+:sum) + *#pragma omp taskwait **Note**: Directives not implemented in hpxMP: -cancellation, taskgroup, threadprivate, copyprivate, and copyin +cancellation, threadprivate, copyprivate, and copyin # Compiler Support hpxMP works wich clang/gcc, @@ -84,5 +88,6 @@ A list of callbacks to be supported can be found [here](ompt-priorities.txt). **Note**: Currently Mandatory Events are under control, but they needs to be reviewed and improved. Currently, OMPT does not work with gcc. -Other events are not implemented yet. +Other events are not implemented yet. 
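Note for readers of the done-todo.md hunk above: the newly listed OpenMP 5.0 task-reduction pragmas are only active when hpxMP is configured with `-DHPXMP_WITH_OMP_50_ENABLED=ON`, which defines `HPXMP_HAVE_OMP_50_ENABLED` per the CMakeLists.txt hunk. A minimal usage sketch, essentially the same program as the `tests/openmp/unit/task_in_reduction.cpp` test added later in this patch:

```cpp
// Minimal taskgroup/task reduction example (mirrors the new
// task_in_reduction.cpp unit test added by this patch).
#include <cstdio>
#include <vector>

int main()
{
    int sum = 0;
    std::vector<int> a(100, 1);
#pragma omp parallel
#pragma omp single
    {
#pragma omp taskgroup task_reduction(+ : sum)
        {
#pragma omp task in_reduction(+ : sum)
            for (int i = 0; i < 50; i++) sum += a[i];
#pragma omp task in_reduction(+ : sum)
            for (int i = 50; i < 100; i++) sum += a[i];
        }
    }
    std::printf("sum = %d\n", sum); // expected: 100
    return (sum == 100) ? 0 : 1;
}
```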
+ diff --git a/src/gcc_hpxMP.cpp b/src/gcc_hpxMP.cpp index da84b2a..1f342d2 100644 --- a/src/gcc_hpxMP.cpp +++ b/src/gcc_hpxMP.cpp @@ -698,7 +698,7 @@ xexpand(KMP_API_NAME_GOMP_PARALLEL)(void (*task)(void *), void *data, unsigned n #endif start_backend(); //__kmpc_push_num_threads - omp_task_data * my_data = hpx_backend->get_task_data(); + auto my_data = hpx_backend->get_task_data(); my_data->set_threads_requested(num_threads); __kmp_GOMP_fork_call(task,(microtask_t )__kmp_GOMP_microtask_wrapper, 2, task, data); @@ -714,7 +714,7 @@ xexpand(KMP_API_NAME_GOMP_PARALLEL_SECTIONS)(void (*task) (void *), void *data, //from gomp parallel start_backend(); //from __kmpc_push_num_threads - omp_task_data * my_data = hpx_backend->get_task_data(); + auto my_data = hpx_backend->get_task_data(); my_data->set_threads_requested(num_threads); __kmp_GOMP_fork_call(task, @@ -730,7 +730,7 @@ xexpand(KMP_API_NAME_GOMP_PARALLEL_SECTIONS)(void (*task) (void *), void *data, long ub, long str, long chunk_sz, unsigned flags) \ { \ start_backend(); \ - omp_task_data *my_data = hpx_backend->get_task_data(); \ + auto my_data = hpx_backend->get_task_data(); \ my_data->set_threads_requested(num_threads); \ __kmp_GOMP_fork_call(task, \ (microtask_t) __kmp_GOMP_parallel_microtask_wrapper, 9, task, \ diff --git a/src/hpx_runtime.cpp b/src/hpx_runtime.cpp index 94658de..807f559 100644 --- a/src/hpx_runtime.cpp +++ b/src/hpx_runtime.cpp @@ -30,15 +30,15 @@ extern boost::shared_ptr hpx_backend; void wait_for_startup(std::mutex& startup_mtx, std::condition_variable& cond, bool& running) { -#if defined DEBUG - cout << "HPX OpenMP runtime has started" << endl; -#endif { // Let the main thread know that we're done. //std::scoped_lock lk(startup_mtx); std::lock_guard lk(startup_mtx); running = true; cond.notify_all(); } +#if defined DEBUG + std::cerr << "HPX OpenMP runtime has started" << endl; +#endif } void fini_runtime() @@ -46,8 +46,10 @@ void fini_runtime() #if defined DEBUG cout << "Stopping HPX OpenMP runtime" << endl; #endif + std::this_thread::sleep_for(std::chrono::milliseconds(50)); //this should only be done if this runtime started hpx - hpx::get_runtime().stop(); + hpx::threads::run_as_hpx_thread([]() { hpx::finalize(); }); + hpx::stop(); #if defined DEBUG cout << "Stopped" << endl; #endif @@ -162,7 +164,7 @@ bool hpx_runtime::set_thread_data_check() { return false; } -omp_task_data* hpx_runtime::get_task_data() +intrusive_ptr hpx_runtime::get_task_data() { omp_task_data *data; if(hpx::threads::get_self_ptr()) { @@ -171,10 +173,12 @@ omp_task_data* hpx_runtime::get_task_data() std::cerr<<"trying to get null hpx thread data\n"; data = initial_thread.get(); } - } else { + } + else { data = initial_thread.get(); } - return data; + intrusive_ptr data_ptr(data); + return data_ptr; } double hpx_runtime::get_time() { @@ -209,82 +213,63 @@ void hpx_runtime::barrier_wait(){ //hpx::this_thread::sleep_for(std::chrono::milliseconds(100)); hpx::this_thread::yield(); } -#else - int count = 1; - int max_count = 100000; - while(team->num_tasks > 0) { - if(count == 1) { - hpx::this_thread::yield(); - } else { - int sleep_time = count; - if(count > max_count) - sleep_time = max_count; - hpx::this_thread::sleep_for(std::chrono::microseconds(sleep_time)); - } - count = count * 2; - //hpx::this_thread::yield(); - } #endif if(team->num_threads > 1) { team->globalBarrier.wait(); } + //wait for all child tasks to be done + team->teamTaskLatch.wait(); } //TODO: Does the spec say that outstanding tasks need to end before this begins? 
bool hpx_runtime::start_taskgroup() { - auto *task = get_task_data(); + auto task = get_task_data(); +#if HPXMP_HAVE_OMP_50_ENABLED + intrusive_ptr tg_new(new kmp_taskgroup_t()); + tg_new->reduce_num_data = 0; + task->td_taskgroup = tg_new; +#endif task->in_taskgroup = true; #ifdef OMP_COMPLIANT //FIXME: why is this local_thread_num? shouldn't it be team->num_threads //task->tg_exec.reset(new local_priority_queue_executor(task->local_thread_num)); task->tg_exec.reset(new local_priority_queue_executor(task->team->num_threads)); #else - task->tg_num_tasks.reset(new atomic{0}); + task->taskgroupLatch.reset(new hpxmp_latch(1)); #endif return true; } void hpx_runtime::end_taskgroup() { - auto *task = get_task_data(); + auto task = get_task_data(); #ifdef OMP_COMPLIANT task->tg_exec.reset(); #else - while( *(task->tg_num_tasks) > 0 ) { - //hpx::this_thread::sleep_for(std::chrono::milliseconds(100)); - hpx::this_thread::yield(); - } - task->tg_num_tasks.reset(); + task->taskgroupLatch->count_down_and_wait(); #endif task->in_taskgroup = false; + +#if HPXMP_HAVE_OMP_50_ENABLED + auto taskgroup = task->td_taskgroup; + if (taskgroup->reduce_data != NULL) // need to reduce? + __kmp_task_reduction_fini(nullptr,taskgroup); +#endif } void hpx_runtime::task_wait() { - auto *task = get_task_data(); - //TODO: Is this just an optimization? IT seems unnecessary. - //if(task->df_map.size() > 0) { - // task->last_df_task.wait(); - //} - //int count = 0; - //int max_count = 10; - while( *(task->num_child_tasks) > 0 ) { - //int sleep_time = 10*count; - //if(count > max_count) - // sleep_time = 10*max_count; - //hpx::this_thread::sleep_for(std::chrono::milliseconds(sleep_time)); - hpx::this_thread::yield(); - } + auto task = get_task_data(); + intrusive_ptr task_ptr(task); + task_ptr->taskLatch.wait(); } -void task_setup( int gtid, kmp_task_t *task, omp_icv icv, - shared_ptr> parent_task_counter, - parallel_region *team) +void task_setup( int gtid, intrusive_ptr kmp_task_ptr, intrusive_ptr parent_task_ptr) { - auto task_func = task->routine; - omp_task_data task_data(gtid, team, icv); - set_thread_data( get_self_id(), reinterpret_cast(&task_data)); + auto task_func = kmp_task_ptr->routine; + intrusive_ptr current_task_ptr(new omp_task_data(gtid, parent_task_ptr->team, parent_task_ptr->icv)); + set_thread_data( get_self_id(), reinterpret_cast(current_task_ptr.get())); #if HPXMP_HAVE_OMPT ompt_data_t *my_task_data = &hpx_backend->get_task_data()->task_data; if (ompt_enabled.ompt_callback_task_create) @@ -302,18 +287,16 @@ void task_setup( int gtid, kmp_task_t *task, omp_icv icv, prior_task_data, status, my_task_data); } #endif - -if(! task->gcc) - task_func(gtid, task); + // actually running the taskfunctions +if(! 
kmp_task_ptr->gcc) + task_func(gtid, kmp_task_ptr.get()); else - ((void (*)(void *))(*(task->routine)))(task->shareds); - - *(parent_task_counter) -= 1; + ((void (*)(void *))(*(kmp_task_ptr->routine)))(kmp_task_ptr->shareds); #ifndef OMP_COMPLIANT - team->num_tasks--; + //count down number of tasks under team + current_task_ptr->team->teamTaskLatch.count_down(1); #endif - if(task->part_id ==0) - delete[] (char*)task; + #if HPXMP_HAVE_OMPT ompt_task_status_t status_fin = ompt_task_complete; /* let OMPT know that we're returning to the callee task */ @@ -323,8 +306,11 @@ else my_task_data, status_fin, prior_task_data); } #endif -// make sure nothing is accessing this thread data after task_data got destroyed - set_thread_data( get_self_id(), reinterpret_cast(nullptr)); + //if task is in taskgroup, count down taskgroup latch as this task is done + if(parent_task_ptr->in_taskgroup) + parent_task_ptr->taskgroupLatch->count_down(1); + //tell parent I am done + parent_task_ptr->taskLatch.count_down(1); } #ifdef OMP_COMPLIANT @@ -346,11 +332,10 @@ void tg_task_setup( int gtid, kmp_task_t *task, omp_icv icv, //shared_ptr is used for these counters, because the parent/calling task may terminate at any time, //causing its omp_task_data to be deallocated. -void hpx_runtime::create_task( kmp_routine_entry_t task_func, int gtid, kmp_task_t *thunk) +void hpx_runtime::create_task( kmp_routine_entry_t task_func, int gtid, intrusive_ptr kmp_task_ptr) { - auto *current_task = get_task_data(); - - if(current_task->team->num_threads > 0) { + auto current_task_ptr = get_task_data(); + if(current_task_ptr->team->num_threads > 0) { #ifdef OMP_COMPLIANT if(current_task->in_taskgroup) { hpx::apply( *(current_task->tg_exec), tg_task_setup, gtid, thunk, current_task->icv, @@ -361,17 +346,19 @@ void hpx_runtime::create_task( kmp_routine_entry_t task_func, int gtid, kmp_task current_task->num_child_tasks, current_task->team ); } #else - //TODO: add taskgroups in non compliant version - *(current_task->num_child_tasks) += 1; - current_task->team->num_tasks++; + //this is waited in taskwait, wait for all tasks before taskwait created to be done + // create_task function is not supposed to wait anything + current_task_ptr->taskLatch.count_up(1); + //count up number of tasks in this team + current_task_ptr->team->teamTaskLatch.count_up(1); + //count up number of task in taskgroup if we are under taskgroup construct + if(current_task_ptr->in_taskgroup) + current_task_ptr->taskgroupLatch->count_up(1); //this fixes hpx::apply changes in hpx backend hpx::applier::register_thread_nullary( - std::bind(&task_setup, gtid, thunk, current_task->icv, - current_task->num_child_tasks, current_task->team), + std::bind(&task_setup, gtid, kmp_task_ptr, current_task_ptr), "omp_explicit_task", hpx::threads::pending, true, hpx::threads::thread_priority_normal); -// hpx::apply(task_setup, gtid, thunk, current_task->icv, -// current_task->num_child_tasks, current_task->team ); #endif } // else { @@ -380,12 +367,10 @@ void hpx_runtime::create_task( kmp_routine_entry_t task_func, int gtid, kmp_task // } } -void df_task_wrapper( int gtid, kmp_task_t *task, omp_icv icv, - shared_ptr> task_counter, - parallel_region *team, - vector> deps) +//deps will notify when_all function +void df_task_wrapper( int gtid, kmp_task_t *task, intrusive_ptr parent_task_ptr, vector> deps) { - task_setup( gtid, task, icv, task_counter, team); + task_setup( gtid, task, parent_task_ptr); } #ifdef OMP_COMPLIANT @@ -406,34 +391,32 @@ void hpx_runtime::create_df_task( int 
gtid, kmp_task_t *thunk, int ndeps, kmp_depend_info_t *dep_list, int ndeps_noalias, kmp_depend_info_t *noalias_dep_list ) { - auto task = get_task_data(); - auto team = task->team; - if(team->num_threads == 1 ) { - create_task(thunk->routine, gtid, thunk); - } + auto current_task_ptr = get_task_data(); + auto team = current_task_ptr->team; vector> dep_futures; dep_futures.reserve( ndeps + ndeps_noalias); //Populating a vector of futures that the task depends on for(int i = 0; i < ndeps;i++) { - if(task->df_map.count( dep_list[i].base_addr) > 0) { - dep_futures.push_back(task->df_map[dep_list[i].base_addr]); + if(current_task_ptr->df_map.count( dep_list[i].base_addr) > 0) { + dep_futures.push_back(current_task_ptr->df_map[dep_list[i].base_addr]); } } for(int i = 0; i < ndeps_noalias;i++) { - if(task->df_map.count( noalias_dep_list[i].base_addr) > 0) { - dep_futures.push_back(task->df_map[noalias_dep_list[i].base_addr]); + if(current_task_ptr->df_map.count( noalias_dep_list[i].base_addr) > 0) { + dep_futures.push_back(current_task_ptr->df_map[noalias_dep_list[i].base_addr]); } } shared_future new_task; - if(task->in_taskgroup) { + if(current_task_ptr->in_taskgroup) { + current_task_ptr->taskgroupLatch->count_up(1); } else { - *(task->num_child_tasks) += 1; + current_task_ptr->taskLatch.count_up(1); } #ifndef OMP_COMPLIANT - team->num_tasks++; + team->teamTaskLatch.count_up(1); #endif if(dep_futures.size() == 0) { #ifdef OMP_COMPLIANT @@ -445,8 +428,7 @@ void hpx_runtime::create_df_task( int gtid, kmp_task_t *thunk, task->num_child_tasks, team); } #else - new_task = hpx::async( task_setup, gtid, thunk, task->icv, - task->num_child_tasks, team); + new_task = hpx::async(task_setup, gtid, thunk, current_task_ptr); #endif } else { @@ -466,19 +448,17 @@ void hpx_runtime::create_df_task( int gtid, kmp_task_t *thunk, team, hpx::when_all(dep_futures) ); } #else - new_task = dataflow( unwrapping(df_task_wrapper), gtid, thunk, task->icv, - task->num_child_tasks, - team, hpx::when_all(dep_futures) ); + new_task = dataflow( unwrapping(df_task_wrapper), gtid, thunk, current_task_ptr, hpx::when_all(dep_futures)); #endif } for(int i = 0 ; i < ndeps; i++) { if(dep_list[i].flags.out) { - task->df_map[dep_list[i].base_addr] = new_task; + current_task_ptr->df_map[dep_list[i].base_addr] = new_task; } } for(int i = 0 ; i < ndeps_noalias; i++) { if(noalias_dep_list[i].flags.out) { - task->df_map[noalias_dep_list[i].base_addr] = new_task; + current_task_ptr->df_map[noalias_dep_list[i].base_addr] = new_task; } } //task->last_df_task = new_task; @@ -557,7 +537,7 @@ void hpx_runtime::create_future_task( int gtid, kmp_task_t *thunk, *(input_futures[0]), *(input_futures[1]), *(input_futures[2]) ); } else { - cout << "too many dependencies for now" << endl; + std::cerr << "too many dependencies for now" << endl; } } #endif @@ -566,14 +546,12 @@ void hpx_runtime::create_future_task( int gtid, kmp_task_t *thunk, void thread_setup( invoke_func kmp_invoke, microtask_t thread_func, int argc, void **argv, int tid, - parallel_region *team, omp_task_data *parent, - mutex_type& barrier_mtx, - hpx::lcos::local::condition_variable_any& cond, - int& running_threads ) + parallel_region *team, intrusive_ptr parent, + hpxmp_latch& threadLatch) { - omp_task_data task_data(tid, team, parent); + intrusive_ptr task_data_ptr(new omp_task_data(tid, team, parent.get())); - set_thread_data( get_self_id(), reinterpret_cast(&task_data)); + set_thread_data( get_self_id(), reinterpret_cast(task_data_ptr.get())); if(argc == 0) { //note: kmp_invoke 
segfaults iff argc == 0 thread_func(&tid, &tid); @@ -607,42 +585,16 @@ void thread_setup( invoke_func kmp_invoke, microtask_t thread_func, } #endif } - int count = 0; - int max_count = 10; - while (*(task_data.num_child_tasks) > 0 ) { - if(count == 0) { - hpx::this_thread::yield(); - } else { - int sleep_time = 10*count; - if(count > max_count) - sleep_time = 10*max_count; - hpx::this_thread::sleep_for(std::chrono::milliseconds(sleep_time)); - } - count++; - } - - //This keeps the task_data on this stack allocated. When is that needed? - // if tasks are created without a barrier or taskwait, they could still - // reference their parents metadata(task_data above). - //This combined with the waiting on child tasks above fufills the requirements - // of an OpenMP barrier. - std::unique_lock lk(barrier_mtx); - if(--running_threads == 0) { - //hpx::lcos::local::spinlock::scoped_lock lk(barrier_mtx); - //std::unique_lock lk(barrier_mtx); - hpx::util::ignore_while_checking > il(&lk); - cond.notify_all(); - } + threadLatch.count_down(1); } // This is the only place where get_thread can't be called, since // that data is not initialized for the new hpx threads yet. void fork_worker( invoke_func kmp_invoke, microtask_t thread_func, int argc, void **argv, - omp_task_data *parent) + intrusive_ptr parent) { parallel_region team(parent->team, parent->threads_requested); - #if HPXMP_HAVE_OMPT //TODO:HOW TO FIND OUT INVOKER ompt_invoker_t a = ompt_invoker_runtime; @@ -659,47 +611,20 @@ void fork_worker( invoke_func kmp_invoke, microtask_t thread_func, #ifdef OMP_COMPLIANT team.exec.reset(new local_priority_queue_executor(parent->threads_requested)); #endif - hpx::lcos::local::condition_variable_any cond; - mutex_type barrier_mtx; int running_threads = parent->threads_requested; + hpxmp_latch threadLatch(running_threads+1); for( int i = 0; i < parent->threads_requested; i++ ) { hpx::applier::register_thread_nullary( std::bind( &thread_setup, kmp_invoke, thread_func, argc, argv, i, &team, parent, - boost::ref(barrier_mtx), boost::ref(cond), boost::ref(running_threads) ), + boost::ref(threadLatch)), "omp_implicit_task", hpx::threads::pending, true, hpx::threads::thread_priority_low, i ); //true, hpx::threads::thread_priority_normal, i ); } - { - //hpx::lcos::local::spinlock::scoped_lock lk(barrier_mtx); - std::unique_lock lk(barrier_mtx); - while( running_threads > 0 ) { - cond.wait(lk); - } - } - //The executor containing the tasks will be destroyed as this call goes out - //of scope, which will wait on all tasks contained in it. So, nothing needs - //to be done here for it. - - //I shouldn't need this. Tasks should be done before the thread exit. - //FIXME: Remove this once the rest of the cond vars are in. -#ifndef OMP_COMPLIANT - int count = 0; - int max_count = 10; - while(team.num_tasks > 0) { - if(count == 0) { - hpx::this_thread::yield(); - } else { - int sleep_time = 10*count; - if(count > max_count) - sleep_time = 10*max_count; - hpx::this_thread::sleep_for(std::chrono::milliseconds(sleep_time)); - } - count++; - } -#endif - + threadLatch.count_down_and_wait(); + // wait for all the tasks in the team to finish + team.teamTaskLatch.wait(); #if HPXMP_HAVE_OMPT if (ompt_enabled.ompt_callback_parallel_end) { ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( @@ -713,15 +638,15 @@ void fork_worker( invoke_func kmp_invoke, microtask_t thread_func, //TODO: according to the spec, the current thread should be thread 0 of the new team, and execute the new work. 
void hpx_runtime::fork(invoke_func kmp_invoke, microtask_t thread_func, int argc, void** argv) { - omp_task_data *current_task = get_task_data(); + auto current_task_ptr = get_task_data(); if( hpx::threads::get_self_ptr() ) { - fork_worker(kmp_invoke, thread_func, argc, argv, current_task); + fork_worker(kmp_invoke, thread_func, argc, argv, current_task_ptr); } else { - //this handles the sync for hox threads. + //this handles the sync for hpx threads. hpx::threads::run_as_hpx_thread(&fork_worker,kmp_invoke, thread_func, argc, argv, - current_task); + current_task_ptr); } - current_task->set_threads_requested(current_task->icv.nthreads ); + current_task_ptr->set_threads_requested(current_task_ptr->icv.nthreads ); } diff --git a/src/hpx_runtime.h b/src/hpx_runtime.h index 3f5ccb8..557a1a1 100644 --- a/src/hpx_runtime.h +++ b/src/hpx_runtime.h @@ -27,6 +27,7 @@ #include #include #include +#include //#include //#include @@ -41,10 +42,12 @@ using std::atomic; using boost::shared_ptr; using hpx::threads::executors::local_priority_queue_executor; using hpx::lcos::local::barrier; +using hpx::lcos::local::latch; using hpx::lcos::shared_future; using hpx::lcos::future; using std::vector; using hpx::util::high_resolution_timer; +using boost::intrusive_ptr; typedef void (*microtask_t)( int *gtid, int *npr, ... ); @@ -64,15 +67,27 @@ typedef int (* kmp_routine_entry_t)( int, void * ); typedef std::map> depends_map; -typedef struct kmp_task { +struct kmp_task_t { void * shareds; kmp_routine_entry_t routine; int part_id; bool gcc; + atomic pointer_counter; #if OMP_40_ENABLED kmp_routine_entry_t destructors; #endif -} kmp_task_t; +}; + +inline void intrusive_ptr_add_ref(kmp_task_t *x) +{ + ++x->pointer_counter; +} + +inline void intrusive_ptr_release(kmp_task_t *x) +{ + if (x->pointer_counter == 0) + delete x; +} typedef struct kmp_depend_info { @@ -84,6 +99,58 @@ typedef struct kmp_depend_info { } flags; } kmp_depend_info_t; +#if HPXMP_HAVE_OMP_50_ENABLED +struct kmp_task_red_flags_t { + unsigned lazy_priv : 1; // hint: (1) use lazy allocation (big objects) + unsigned reserved31 : 31; +}; + +// internal structure for reduction data item related info +struct kmp_task_red_data_t { + void *reduce_shar; // shared reduction item + size_t reduce_size; // size of data item + void *reduce_priv; // thread specific data + void *reduce_pend; // end of private data for comparison op + void *reduce_init; // data initialization routine + void *reduce_fini; // data finalization routine + void *reduce_comb; // data combiner routine + kmp_task_red_flags_t flags; // flags for additional info from compiler +}; + +// structure sent us by compiler - one per reduction item +struct kmp_task_red_input_t { + void *reduce_shar; // shared reduction item + size_t reduce_size; // size of data item + void *reduce_init; // data initialization routine + void *reduce_fini; // data finalization routine + void *reduce_comb; // data combiner routine + kmp_task_red_flags_t flags; // flags for additional info from compiler +}; + +struct kmp_taskgroup_t { + std::atomic count; // number of allocated and incomplete tasks + std::atomic + cancel_request; // request for cancellation of this taskgroup + kmp_taskgroup_t* parent; // parent taskgroup + // Block of data to perform task reduction + shared_ptr> reduce_data; // reduction related info + int reduce_num_data; // number of data items to reduce + atomic pointer_counter; +}; + +inline void intrusive_ptr_add_ref(kmp_taskgroup_t *x) +{ + ++x->pointer_counter; +} + +inline void 
intrusive_ptr_release(kmp_taskgroup_t *x) +{ + if (x->pointer_counter == 0) + delete x; +} + +#endif + class loop_data { public: loop_data(int NT, int L, int U, int S, int C, int sched) @@ -123,13 +190,26 @@ class loop_data { std::vector iter_count; }; +//temp solution for cout_up does not allow starting from 0 in HPX +class hpxmp_latch: public latch { +public: + using latch::latch; + void count_up(std::ptrdiff_t n) + { + HPX_ASSERT(n >= 0); + + std::ptrdiff_t old_count = + counter_.fetch_add(n, std::memory_order_acq_rel); + } +}; + //Does this need to keep track of the parallel region it is nested in, // the omp_task_data of the parent thread, or both? //template struct parallel_region { parallel_region( int N ) : num_threads(N), globalBarrier(N), - depth(0), reduce_data(N) + depth(0), reduce_data(N), teamTaskLatch(0) {}; parallel_region( parallel_region *parent, int threads_requested ) : parallel_region(threads_requested) @@ -147,13 +227,13 @@ struct parallel_region { mutex_type thread_mtx{}; mutex_type single_mtx{}; int depth; - atomic num_tasks{0}; atomic single_counter{0}; atomic current_single_thread{-1}; void *copyprivate_data; vector reduce_data; vector loop_list; mutex_type loop_mtx; + hpxmp_latch teamTaskLatch; #if (HPXMP_HAVE_OMPT) ompt_data_t parent_data = ompt_data_none; ompt_data_t parallel_data = ompt_data_none; @@ -171,8 +251,8 @@ class omp_task_data { public: //This constructor should only be used once for the implicit task omp_task_data( parallel_region *T, omp_device_icv *global, int init_num_threads) - : team(T), num_child_tasks(new atomic{0}) - { + : team(T),taskLatch(0) + { local_thread_num = 0; icv.device = global; icv.nthreads = init_num_threads; @@ -191,7 +271,7 @@ class omp_task_data { //This is for explicit tasks omp_task_data(int tid, parallel_region *T, omp_icv icv_vars) - : local_thread_num(tid), team(T), num_child_tasks(new atomic{0}), icv(icv_vars) + : local_thread_num(tid), team(T), icv(icv_vars),taskLatch(0) { threads_requested = icv.nthreads; icv_vars.device = icv.device; @@ -218,10 +298,11 @@ class omp_task_data { parallel_region *team; //mutex_type thread_mutex; //hpx::lcos::local::condition_variable_any thread_cond; - shared_ptr> num_child_tasks; int single_counter{0}; int loop_num{0}; bool in_taskgroup{false}; + hpxmp_latch taskLatch; + atomic pointer_counter{0}; //shared_future last_df_task; #if HPXMP_HAVE_OMPT ompt_data_t task_data = ompt_data_none; @@ -230,13 +311,27 @@ class omp_task_data { #ifdef OMP_COMPLIANT shared_ptr tg_exec; #else - shared_ptr> tg_num_tasks; + shared_ptr taskgroupLatch; #endif omp_icv icv; depends_map df_map; +#if HPXMP_HAVE_OMP_50_ENABLED + intrusive_ptr td_taskgroup; +#endif }; +inline void intrusive_ptr_add_ref(omp_task_data *x) +{ + ++x->pointer_counter; +} + +inline void intrusive_ptr_release(omp_task_data *x) +{ + if (--x->pointer_counter == 0) + delete x; +} + struct raw_data { void *data; size_t size; @@ -248,7 +343,7 @@ class hpx_runtime { void fork(invoke_func kmp_invoke, microtask_t thread_func, int argc, void** argv); parallel_region* get_team(); bool set_thread_data_check(); - omp_task_data* get_task_data(); + intrusive_ptr get_task_data(); int get_thread_num(); int get_num_threads(); int get_num_procs(); @@ -256,7 +351,7 @@ class hpx_runtime { void barrier_wait(); void create_task( omp_task_func taskfunc, void *frame_pointer, void *firstprivates, int is_tied, int blocks_parent); - void create_task( kmp_routine_entry_t taskfunc, int gtid, kmp_task_t *task); + void create_task( kmp_routine_entry_t taskfunc, int 
gtid, intrusive_ptr task); void create_df_task( int gtid, kmp_task_t *thunk, int ndeps, kmp_depend_info_t *dep_list, int ndeps_noalias, kmp_depend_info_t *noalias_dep_list ); @@ -277,7 +372,7 @@ class hpx_runtime { private: shared_ptr implicit_region; - shared_ptr initial_thread; + intrusive_ptr initial_thread; int num_procs; shared_ptr walltime; bool external_hpx; diff --git a/src/intel_hpxMP.cpp b/src/intel_hpxMP.cpp index 01c6c8b..5d7bd6f 100644 --- a/src/intel_hpxMP.cpp +++ b/src/intel_hpxMP.cpp @@ -92,12 +92,13 @@ __kmpc_omp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags, //kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *) & flags; //TODO: do I need to do something with these flags? int task_size = sizeof_kmp_task_t + (-sizeof_kmp_task_t%8); - + //can be sure that no deletion of task happens here, no need of intrusive ptr kmp_task_t *task = (kmp_task_t*)new char[task_size + sizeof_shareds]; //This gets deleted at the end of task_setup task->routine = task_entry; task->gcc = false; + task->pointer_counter = 0; if( sizeof_shareds == 0 ) { task->shareds = NULL; } else { @@ -112,7 +113,8 @@ int __kmpc_omp_task( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task){ std::cout<<"__kmpc_omp_task"<create_task(new_task->routine, gtid, new_task); + intrusive_ptr new_task_ptr(new_task); + hpx_backend->create_task(new_task_ptr->routine, gtid, new_task_ptr); return 1; } @@ -202,6 +204,146 @@ kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part ){ return 0; } +#if HPXMP_HAVE_OMP_50_ENABLED +// Task Reduction implementation +void *__kmpc_task_reduction_init(int gtid, int num, void *data) { + auto thread = hpx_backend->get_task_data(); + intrusive_ptr tg = thread->td_taskgroup; + int nth = thread->team->num_threads; + kmp_task_red_input_t *input = (kmp_task_red_input_t *)data; + + if (nth == 1) { + return (void *)tg.get(); + } + shared_ptr> arr(new(vector)); + arr->reserve(num); + for (int i = 0; i < num; ++i) { + void (*f_init)(void *) = (void (*)(void *))(input[i].reduce_init); + size_t size = input[i].reduce_size - 1; + // round the size up to cache line per thread-specific item + size += 64 - size % 64; + (*arr)[i].reduce_shar = input[i].reduce_shar; + (*arr)[i].reduce_size = size; + (*arr)[i].reduce_init = input[i].reduce_init; + (*arr)[i].reduce_fini = input[i].reduce_fini; + (*arr)[i].reduce_comb = input[i].reduce_comb; + (*arr)[i].flags = input[i].flags; + if (!input[i].flags.lazy_priv) { + // allocate cache-line aligned block and fill it with zeros + (*arr)[i].reduce_priv = new char[nth * size]; + (*arr)[i].reduce_pend = (char *)((*arr)[i].reduce_priv) + nth * size; + if (f_init != NULL) { + // initialize thread-specific items + for (int j = 0; j < nth; ++j) { + f_init((char *)((*arr)[i].reduce_priv) + j * size); + } + } + } else { + // only allocate space for pointers now, + // objects will be lazily allocated/initialized once requested + (*arr)[i].reduce_priv = new char[nth * sizeof(void *)]; + } + } + tg->reduce_data = arr; + tg->reduce_num_data = num; + return (void *)tg.get(); +} + +/*! 
+@ingroup TASKING +@param gtid Global thread ID +@param tskgrp The taskgroup ID (optional) +@param data Shared location of the item +@return The pointer to per-thread data + +Get thread-specific location of data item +*/ +void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) { + auto thread = hpx_backend->get_task_data(); + kmp_int32 nth = thread->team->num_threads; + if (nth == 1) + return data; // nothing to do + + intrusive_ptr tg = (kmp_taskgroup_t*)tskgrp; + if (tg == NULL) + tg = thread->td_taskgroup; + shared_ptr> arr = tg->reduce_data; + kmp_int32 num = tg->reduce_num_data; + kmp_int32 tid = gtid; + + while (tg != NULL) { + for (int i = 0; i < num; ++i) { + if (!(*arr)[i].flags.lazy_priv) { + if (data == (*arr)[i].reduce_shar || + (data >= (*arr)[i].reduce_priv && data < (*arr)[i].reduce_pend)) + return (char *)((*arr)[i].reduce_priv) + tid * (*arr)[i].reduce_size; + } else { + // check shared location first + void **p_priv = (void **)((*arr)[i].reduce_priv); + if (data == (*arr)[i].reduce_shar) + goto found; + // check if we get some thread specific location as parameter + for (int j = 0; j < nth; ++j) + if (data == p_priv[j]) + goto found; + continue; // not found, continue search + found: + if (p_priv[tid] == NULL) { + // allocate thread specific object lazily + void (*f_init)(void *) = (void (*)(void *))((*arr)[i].reduce_init); + p_priv[tid] = new char[(*arr)[i].reduce_size]; + if (f_init != NULL) { + f_init(p_priv[tid]); + } + } + return p_priv[tid]; + } + } + tg = tg->parent; + arr = tg->reduce_data; + num = tg->reduce_num_data; + } + return NULL; // ERROR, this line never executed +} + +// Finalize task reduction. +// Called from __kmpc_end_taskgroup() +void __kmp_task_reduction_fini(void *thr, intrusive_ptr tg) { + auto th = hpx_backend->get_task_data(); + kmp_int32 nth = th->team->num_threads; + shared_ptr> arr = tg->reduce_data; + kmp_int32 num = tg->reduce_num_data; + for (int i = 0; i < num; ++i) { + void *sh_data = (*arr)[i].reduce_shar; + void (*f_fini)(void *) = (void (*)(void *))((*arr)[i].reduce_fini); + void (*f_comb)(void *, void *) = + (void (*)(void *, void *))((*arr)[i].reduce_comb); + if (!(*arr)[i].flags.lazy_priv) { + void *pr_data = (*arr)[i].reduce_priv; + size_t size = (*arr)[i].reduce_size; + for (int j = 0; j < nth; ++j) { + void *priv_data = (char *)pr_data + j * size; + f_comb(sh_data, priv_data); // combine results + if (f_fini) + f_fini(priv_data); // finalize if needed + } + } else { + void **pr_data = (void **)((*arr)[i].reduce_priv); + for (int j = 0; j < nth; ++j) { + if (pr_data[j] != NULL) { + f_comb(sh_data, pr_data[j]); // combine results + if (f_fini) + f_fini(pr_data[j]); // finalize if needed + delete(pr_data[j]); + } + } + } + } + tg->reduce_data = NULL; + tg->reduce_num_data = 0; +} +#endif + void __kmpc_taskgroup( ident_t* loc, int gtid ) { if( hpx_backend->start_taskgroup() ) { #if defined DEBUG && defined HPXMP_HAVE_TRACE @@ -221,8 +363,9 @@ void __kmpc_taskgroup( ident_t* loc, int gtid ) { } } #endif - cout << "Warning, taskgroup failed to start" << endl; } + else + cout << "Warning, taskgroup failed to start" << endl; //hpx_backend->get_task_data()->num_taskgroup_tasks.reset(new atomic{0}); } @@ -300,7 +443,7 @@ __kmpc_push_num_threads( ident_t *loc, #endif start_backend(); start_backend(); - omp_task_data *data = hpx_backend->get_task_data(); + auto data = hpx_backend->get_task_data(); data->set_threads_requested( num_threads ); } @@ -371,7 +514,7 @@ int __kmpc_single(ident_t *loc, int tid){ if(!hpx_backend 
|| !hpx::threads::get_self_ptr() ) { return 1; } - auto *task = hpx_backend->get_task_data(); + auto task = hpx_backend->get_task_data(); auto *team = task->team; int do_work = 0; diff --git a/src/intel_hpxMP.h b/src/intel_hpxMP.h index eab0fae..a6bad4e 100644 --- a/src/intel_hpxMP.h +++ b/src/intel_hpxMP.h @@ -265,3 +265,9 @@ extern "C" void omp_unset_nest_lock(omp_lock_t **lock); extern "C" int omp_test_lock(omp_lock_t **lock); extern "C" int omp_test_nest_lock(omp_lock_t **lock); +#if HPXMP_HAVE_OMP_50_ENABLED +extern "C" void *__kmpc_task_reduction_init(int gtid, int num, void *data); +extern "C" void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data); +void __kmp_task_reduction_fini(void *thr, intrusive_ptr tg); +#endif + diff --git a/src/loop_schedule.cpp b/src/loop_schedule.cpp index 7c01054..951b1e3 100644 --- a/src/loop_schedule.cpp +++ b/src/loop_schedule.cpp @@ -56,6 +56,9 @@ __kmpc_for_static_init_4( ident_t *loc, int32_t gtid, int32_t schedtype, int32_t *p_last_iter,int32_t *p_lower, int32_t *p_upper, int32_t *p_stride, int32_t incr, int32_t chunk ) { + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_for_static_init_4"<( gtid, schedtype, p_last_iter, p_lower, p_upper, p_stride, incr, chunk ); } @@ -65,6 +68,9 @@ __kmpc_for_static_init_4u( ident_t *loc, int32_t gtid, int32_t schedtype, int32_t *p_last_iter, uint32_t *p_lower, uint32_t *p_upper, int32_t *p_stride, int32_t incr, int32_t chunk ) { + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_for_static_init_4u"<( gtid, schedtype, p_last_iter, p_lower, p_upper, p_stride, incr, chunk ); } @@ -75,6 +81,9 @@ __kmpc_for_static_init_8( ident_t *loc, int32_t gtid, int64_t *p_lower, int64_t *p_upper, int64_t *p_stride, int64_t incr, int64_t chunk ) { + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_for_static_init_8"<( gtid, schedtype, p_last_iter, p_lower, p_upper, p_stride, incr, chunk ); } @@ -85,6 +94,9 @@ __kmpc_for_static_init_8u( ident_t *loc, int32_t gtid, uint64_t *p_lower, uint64_t *p_upper, int64_t *p_stride, int64_t incr, int64_t chunk ) { + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_for_static_init_8u"<( gtid, schedtype, p_last_iter, p_lower, p_upper, p_stride, incr, chunk ); } @@ -92,6 +104,9 @@ __kmpc_for_static_init_8u( ident_t *loc, int32_t gtid, void __kmpc_for_static_fini( ident_t *loc, int32_t gtid ){ //Only seems to do internal tracking in intel runtime + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_for_static_fini"<( gtid, schedule, lb, ub, st, chunk ); } @@ -138,6 +156,9 @@ void __kmpc_dispatch_init_4u( ident_t *loc, int32_t gtid, enum sched_type schedule, uint32_t lb, uint32_t ub, int32_t st, int32_t chunk ) { + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_dispatch_init_4u"<( gtid, schedule, lb, ub, st, chunk ); } @@ -145,6 +166,9 @@ void __kmpc_dispatch_init_8( ident_t *loc, int32_t gtid, enum sched_type schedule, int64_t lb, int64_t ub, int64_t st, int64_t chunk ) { + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_dispatch_init_8"<( gtid, schedule, lb, ub, st, chunk ); } @@ -152,6 +176,9 @@ void __kmpc_dispatch_init_8u( ident_t *loc, int32_t gtid, enum sched_type schedule, uint64_t lb, uint64_t ub, int64_t st, int64_t chunk ) { + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_dispatch_init_8u"<( gtid, schedule, lb, ub, st, chunk ); } @@ -258,40 +285,67 @@ int kmp_next( int gtid, int *p_last, T *p_lower, T *p_upper, D 
*p_stride ) { int __kmpc_dispatch_next_4( ident_t *loc, int32_t gtid, int32_t *p_last, int32_t *p_lb, int32_t *p_ub, int32_t *p_st ){ + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_dispatch_next_4"<(gtid, p_last, p_lb, p_ub, p_st); } int __kmpc_dispatch_next_4u( ident_t *loc, int32_t gtid, int32_t *p_last, uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st ){ + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_dispatch_next_4u"<(gtid, p_last, p_lb, p_ub, p_st); } int __kmpc_dispatch_next_8( ident_t *loc, int32_t gtid, int32_t *p_last, int64_t *p_lb, int64_t *p_ub, int64_t *p_st ){ + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_dispatch_next_8"<(gtid, p_last, p_lb, p_ub, p_st); } int __kmpc_dispatch_next_8u( ident_t *loc, int32_t gtid, int32_t *p_last, uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st ){ + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_dispatch_next_8u"<(gtid, p_last, p_lb, p_ub, p_st); } void __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid ){ + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_dispatch_fini_4"<get_task_data()->loop_num - 1; auto loop_sched = &(hpx_backend->get_team()->loop_list[ current_loop ]); while( loop_sched->ordered_count < loop_sched->first_iter[global_tid] || @@ -301,6 +355,9 @@ void __kmpc_ordered(ident_t *, kmp_int32 global_tid ) { } void __kmpc_end_ordered(ident_t *, kmp_int32 global_tid ) { + #if defined DEBUG && defined HPXMP_HAVE_TRACE + std::cout<<"__kmpc_end_ordered"<get_task_data()->loop_num - 1; auto loop_sched = &(hpx_backend->get_team()->loop_list[ current_loop ]); loop_sched->ordered_count++; diff --git a/src/ompt_hpx-general.cpp b/src/ompt_hpx-general.cpp index adedfcc..7699c14 100644 --- a/src/ompt_hpx-general.cpp +++ b/src/ompt_hpx-general.cpp @@ -302,7 +302,7 @@ ompt_data_t *__ompt_get_thread_data_internal() { ompt_data_t* __ompt_get_parallel_data_internal() { - omp_task_data* omp_task = hpx_backend->get_task_data(); + auto omp_task = hpx_backend->get_task_data(); return &omp_task->team->parallel_data; } diff --git a/tests/openmp/regressions/CMakeLists.txt b/tests/openmp/regressions/CMakeLists.txt index 9675ee4..42c367d 100644 --- a/tests/openmp/regressions/CMakeLists.txt +++ b/tests/openmp/regressions/CMakeLists.txt @@ -7,7 +7,6 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") set(tests get_thread_num parallel_multi - task_no_parallel task_tree_simple task_untied ) diff --git a/tests/openmp/regressions/parallel_multi.cpp b/tests/openmp/regressions/parallel_multi.cpp index 3f5716e..f77bbbf 100644 --- a/tests/openmp/regressions/parallel_multi.cpp +++ b/tests/openmp/regressions/parallel_multi.cpp @@ -13,7 +13,7 @@ int main(){ for (int j = 0; j < 1000; j++) { #pragma omp parallel { - printf(" \n"); + int t = 0; } } } diff --git a/tests/openmp/regressions/task_no_parallel.cpp b/tests/openmp/regressions/task_no_parallel.cpp deleted file mode 100644 index 9164d6b..0000000 --- a/tests/openmp/regressions/task_no_parallel.cpp +++ /dev/null @@ -1,13 +0,0 @@ -// Copyright (c) 2018 Tianyi Zhang -// -// Distributed under the Boost Software License, Version 1.0. 
(See accompanying -// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - -// Demonstrating #9: terminating with uncaught exception - -#include -int main (int argc, char *argv[]) -{ -#pragma omp task - printf("This task is not in parallel regin\n"); -} diff --git a/tests/openmp/unit/CMakeLists.txt b/tests/openmp/unit/CMakeLists.txt index 61c8494..fe9131a 100644 --- a/tests/openmp/unit/CMakeLists.txt +++ b/tests/openmp/unit/CMakeLists.txt @@ -7,7 +7,6 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") set(tests app_lu app_old_fib - app_omp_fib app_vla atomic barrier @@ -32,15 +31,18 @@ set(tests #single_copyprivate_2 #failure sometime #single_copyprivate_1var #failure sometime single_nowait - task_depend - taskgroup #to be implemented, not checking correctness + taskgroup task_fp task_tree taskwait taskwait_2 #threadprivate #failure sometime ) - +if(HPXMP_WITH_OMP_50_ENABLED) + set(tests_omp50 + task_in_reduction + ) +endif() enable_testing() macro(do_test name) @@ -57,8 +59,38 @@ macro(do_test name) ) endmacro(do_test) +macro(do_test_enhanced name) + add_test(${name} ${name}) + set_target_properties(${name} PROPERTIES + CXX_STANDARD 11 + CXX_STANDARD_REQUIRED ON + CXX_EXTENSIONS OFF + ) + #maybe make the omp_num_threads configurable later + set_tests_properties(${name} PROPERTIES + ENVIRONMENT "LD_PRELOAD=${PROJECT_BINARY_DIR}/libhpxmp.so;OMP_NUM_THREADS=2" + TIMEOUT 60 + ) +endmacro(do_test_enhanced) + + + foreach(test ${tests}) set(sources ${test}.cpp) add_executable(${test} ${sources}) do_test(${test}) endforeach() + +foreach(test ${tests_enhanced}) + set(sources ${test}.cpp) + add_executable(${test} ${sources}) + do_test_enhanced(${test}) +endforeach() + +if(HPXMP_WITH_OMP_50_ENABLED) + foreach(test ${tests_omp50}) + set(sources ${test}.cpp) + add_executable(${test} ${sources}) + do_test_enhanced(${test}) + endforeach() +endif() diff --git a/tests/openmp/unit/app_omp_fib.cpp b/tests/openmp/unit/app_omp_fib.cpp index 94b5dda..d26f8ef 100644 --- a/tests/openmp/unit/app_omp_fib.cpp +++ b/tests/openmp/unit/app_omp_fib.cpp @@ -8,62 +8,61 @@ #include #include #include +#include long fib1(int k); long fib2(int k); //int num_tasks = 0; -int cutoff = 26; +int cutoff = 15; int main(int argc, char* argv[]) { - struct timeval t1; - struct timeval t2; - - int input; - long s, u; - long f; - double m; - - if (argc != 2 && argc != 3) - { - //fprintf(stderr, "Usage: ./fib \n"); - input = 24; - cutoff = 0; - //return 1; - } - else - { - input = atoi(argv[1]); - if (argc == 3) - { - cutoff = atoi(argv[2]); + for(int i=2; i<=20; i++) { + omp_set_num_threads(i/2); + struct timeval t1; + struct timeval t2; + + int input; + long s, u; + long f; + double m; + + if (argc != 2 && argc != 3) { + //fprintf(stderr, "Usage: ./fib \n"); + input = 24; + //return 1; + } else { + input = atoi(argv[1]); + if (argc == 3) { + cutoff = atoi(argv[2]); + } } - } - gettimeofday(&t1, NULL); + gettimeofday(&t1, NULL); #pragma omp parallel - { -#pragma omp master { -#pragma omp task shared(f) +#pragma omp master { - f = fib1(input); +#pragma omp task shared(f) + { + f = fib1(input); + } } } - } - gettimeofday(&t2, NULL); - printf("fib(%d) = %d\n", input, f); + gettimeofday(&t2, NULL); + printf("fib(%d) = %d\n", input, f); - s = t2.tv_sec - t1.tv_sec; - u = t2.tv_usec - t1.tv_usec; - m = (s * 1000 + u / 1000.0) + 0.5; - printf("cutoff = %d\n", cutoff); - printf("time = %.2lfms\n", m); - if (f != 46368) - return 1; + s = t2.tv_sec - t1.tv_sec; + u = t2.tv_usec - t1.tv_usec; + m = (s * 1000 + u / 1000.0) + 
0.5; + printf("cutoff = %d\n", cutoff); + printf("time = %.2lfms\n", m); + if (f != 46368) + return 1; + } return 0; } diff --git a/tests/openmp/unit/sections_2.cpp b/tests/openmp/unit/sections_2.cpp index 5af585b..a09e577 100644 --- a/tests/openmp/unit/sections_2.cpp +++ b/tests/openmp/unit/sections_2.cpp @@ -12,18 +12,12 @@ int main() { std::atomic position(-1); int result[4]; - + int ret = 0; position++; result[position] = 1; printf("The "); #pragma omp parallel sections { -#pragma omp section - { - position++; - result[position] = 2; - printf("STE||AR "); - } #pragma omp section { position++; @@ -36,9 +30,10 @@ int main() printf("is awesome\n "); printf("\n"); - for(int i = 0; i < 3; i++){ + for(int i = 0; i < 2; i++){ + printf("%d ",result[i]); if(result[i] > result[i+1]) - return 1; + ret = 1; } - return 0; + return ret; } diff --git a/tests/openmp/unit/task_depend.cpp b/tests/openmp/unit/task_depend.cpp index c20c816..8a0759e 100644 --- a/tests/openmp/unit/task_depend.cpp +++ b/tests/openmp/unit/task_depend.cpp @@ -5,6 +5,7 @@ // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) #include +#include int foo() { @@ -20,24 +21,24 @@ int main(int argc, char **argv) { int x = 5, y = 1, z = 0; printf("&x = %p\n", &x); + for(int i=2; i<=20; i++) { + omp_set_num_threads(i/2); #pragma omp parallel - { -#pragma omp single { +#pragma omp single + { #pragma omp task depend(out : x) - x = foo(); + x = foo(); #pragma omp task depend(in : x) depend(out : y) - y = bar(x); + y = bar(x); #pragma omp task depend(in : y) - z = bar(y); + z = bar(y); + } } + if (x != 42 || y != 53 || z != 64) + return 1; } - printf("x = %d\n", x); - printf("y = %d\n", y); - printf("z = %d\n", z); - if (x != 42 || y != 53 || z != 64) - return 1; return 0; } diff --git a/tests/openmp/unit/task_in_reduction.cpp b/tests/openmp/unit/task_in_reduction.cpp new file mode 100644 index 0000000..7edbc52 --- /dev/null +++ b/tests/openmp/unit/task_in_reduction.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2018 Tianyi Zhang +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include + +int main(int argc, char **argv) +{ + int sum = 0; + std::vector a(100,1); +#pragma omp parallel +#pragma omp single + { + #pragma omp taskgroup task_reduction(+:sum) + { + #pragma omp task in_reduction(+:sum) + for (int i=0; i<50; i++) sum += a[i]; + #pragma omp task in_reduction(+:sum) + for (int i=50; i<100; i++) sum += a[i]; + } + std::cout< #include -//not checking correctness as taskgroup is not implemented properly +//this code is not working with release under both openmp and hpxmp int main(int argc, char *argv[]) { - int result[5]; + std::vector result(5,0); std::atomic position(-1); #pragma omp parallel { @@ -40,15 +40,15 @@ int main(int argc, char *argv[]) { } } } - printf("is fun to watch "); + printf(" fun to watch "); position++; result[position] = 3; } } // End of parallel region - printf("\n"); + for(int i = 0; i < 3; i++){ if(result[i] > result[i+1]) - return 0; + return 1; } return 0; } diff --git a/tests/openmp/unit/taskgroup_.cpp b/tests/openmp/unit/taskgroup_.cpp new file mode 100644 index 0000000..0acdf48 --- /dev/null +++ b/tests/openmp/unit/taskgroup_.cpp @@ -0,0 +1,67 @@ +// Copyright (c) 2018 Tianyi Zhang +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include + +//this code is not working with release under both openmp and hpxmp +int main(int argc, char *argv[]) { + std::vector result(5,0); + std::atomic position(-1); +#pragma omp parallel + { +#pragma omp single + { + printf("A "); + position++; + result[position] = 1; +#pragma omp taskgroup + { +#pragma omp task + { + printf("race "); + position++; + result[position] = 2; + } +#pragma omp task + { + printf("car "); + position++; + result[position] = 2; +#pragma omp task + { + printf("is"); + position++; + result[position] = 2; + } + } + } + printf(" fun to watch "); + position++; + result[position] = 3; +#pragma omp taskgroup + { +#pragma omp task + { + printf(", good "); + position++; + result[position] = 4; + } + } + printf("job ! "); + position++; + result[position] = 5; + } + } // End of parallel region + printf("\n"); + + for(int i = 0; i < 5; i++){ + if(result[i] > result[i+1]) + return 1; + } + return 0; +}
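For context on the `__kmpc_task_reduction_*` entry points added in src/intel_hpxMP.cpp, here is a rough, hand-written approximation of what a compiler emits for `taskgroup task_reduction(+:sum)` with one `in_reduction` task. Only the two `extern "C"` signatures and the `kmp_task_red_input_t` layout come from this patch; the helper names (`sum_init`, `sum_comb`, `enter_taskgroup`) and the lowering itself are illustrative, not actual clang output:

```cpp
// Hypothetical, hand-written stand-in for compiler-generated code; only the
// runtime entry points and the input-record layout come from this patch.
#include <cstddef>

extern "C" void* __kmpc_task_reduction_init(int gtid, int num, void* data);
extern "C" void* __kmpc_task_reduction_get_th_data(int gtid, void* tskgrp, void* data);

struct kmp_task_red_flags_t { unsigned lazy_priv : 1; unsigned reserved31 : 31; };
struct kmp_task_red_input_t {    // one record per reduction item (see hpx_runtime.h)
    void*  reduce_shar;          // shared variable, here &sum
    size_t reduce_size;          // size of the item
    void*  reduce_init;          // initializer for per-thread copies
    void*  reduce_fini;          // finalizer (unused for int)
    void*  reduce_comb;          // combiner: shared += private
    kmp_task_red_flags_t flags;
};

static void sum_init(void* priv)             { *static_cast<int*>(priv) = 0; }
static void sum_comb(void* shar, void* priv) { *static_cast<int*>(shar) += *static_cast<int*>(priv); }

int sum = 0;

void enter_taskgroup(int gtid)   // roughly what runs at "#pragma omp taskgroup task_reduction(+:sum)"
{
    kmp_task_red_input_t item = {};
    item.reduce_shar = &sum;
    item.reduce_size = sizeof(int);
    item.reduce_init = reinterpret_cast<void*>(&sum_init);
    item.reduce_comb = reinterpret_cast<void*>(&sum_comb);

    // registers the item; the returned handle is the taskgroup used by in_reduction tasks
    void* tg = __kmpc_task_reduction_init(gtid, /*num items*/ 1, &item);

    // inside a "task in_reduction(+:sum)" body, accesses to sum are redirected to the
    // thread-specific copy; __kmp_task_reduction_fini later combines the copies
    int* my_sum = static_cast<int*>(__kmpc_task_reduction_get_th_data(gtid, tg, &sum));
    *my_sum += 1;
}
```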