NVIDIA · davebayer · Apr 2, 2026 · Apr 2, 2026 · Apr 2, 2026
@@ -82,8 +82,7 @@ workflows:
     - {jobs: ['test'], project: 'nvbench_helper', ctk: '13.0', cxx: ['gcc',   'clang'],   gpu: 'rtx2080'}
     - {jobs: ['test'], project: 'nvbench_helper', ctk: '13.X', cxx: ['gcc',   'clang'],   gpu: 'rtx2080'}
     # NVHPC build
-    # - {jobs: ['build'], cxx: 'nvhpc', ctk: 'nvhpc', std: 'all', project: ['libcudacxx', 'cub', 'thrust', 'cudax', 'stdpar'], cpu: ['amd64', 'arm64']}
-    - {jobs: ['build'], cxx: 'nvhpc', ctk: 'nvhpc', std: 'all', project: ['libcudacxx', 'cub', 'cudax', 'stdpar'], cpu: ['amd64', 'arm64']}
+    - {jobs: ['build'], cxx: 'nvhpc', ctk: 'nvhpc', std: 'all', project: ['libcudacxx', 'cub', 'thrust', 'cudax', 'stdpar'], cpu: ['amd64', 'arm64']}
     # clang-cuda
     - {jobs: ['build'], cudacxx: 'clang', ctk: 'clang-cuda', cxx: 'clang-cuda',  std: 'all', sm: '75;80;90;100'}
     # libc++
@@ -105,24 +104,24 @@ workflows:
     # libcudacxx - Specialized, testing default SM
     - {project: 'libcudacxx', jobs: ['test'], std: 'max', cxx: ['gcc', 'msvc'], gpu: 'rtx2080', sm: 'gpu'}
     - {project: 'libcudacxx', jobs: ['build'], std: 'max', cxx: 'clang'}
-    # - {project: 'libcudacxx', jobs: ['build'], std: 'max', ctk: 'nvhpc', cxx: 'nvhpc'}
+    - {project: 'libcudacxx', jobs: ['build'], std: 'max', ctk: 'nvhpc', cxx: 'nvhpc'}
     - {project: 'libcudacxx', jobs: ['build'], std: 'max', cudacxx: 'clang', ctk: 'clang-cuda', cxx: 'clang-cuda', sm: '70;80;90;100'}
     - {project: 'libcudacxx', jobs: ['nvrtc'], std: 'max', gpu: 'rtx2080', sm: 'gpu'}
     - {project: 'libcudacxx', jobs: ['verify_codegen']}
     # CUB - Specialized, testing default SM
     - {project: 'cub', jobs: ['test_nolid', 'test_lid0'], std: 'max', cxx: ['gcc', 'msvc'], gpu: 'rtxa6000', sm: 'gpu'}
     - {project: 'cub', jobs: ['build_nolid', 'build_lid0'], std: 'max', cxx: 'clang'}
-    # - {project: 'cub', jobs: ['build_nolid', 'build_lid0'], std: 'max', ctk: 'nvhpc', cxx: 'nvhpc'}
+    - {project: 'cub', jobs: ['build_nolid', 'build_lid0'], std: 'max', ctk: 'nvhpc', cxx: 'nvhpc'}
     - {project: 'cub', jobs: ['build_nolid', 'build_lid0'], std: 'max', cudacxx: 'clang', ctk: 'clang-cuda', cxx: 'clang-cuda', sm: '75;80;90;100'}
     # Thrust - Keep number of sm small. Kernel coverage is in CUB. This just tests dispatch / glue in lite mode:
     - {project: 'thrust', jobs: ['test'], std: 'max', cxx: ['gcc', 'msvc'], gpu: 'rtx4090', sm: 'gpu'}
     - {project: 'thrust', jobs: ['build'], std: 'max', cxx: 'clang', sm: '75;120'}
-    # - {project: 'thrust', jobs: ['build'], std: 'max', ctk: 'nvhpc', cxx: 'nvhpc', sm: '75;120'}
+    - {project: 'thrust', jobs: ['build'], std: 'max', ctk: 'nvhpc', cxx: 'nvhpc', sm: '75;120'}
     - {project: 'thrust', jobs: ['build'], std: 'max', cudacxx: 'clang', ctk: 'clang-cuda', cxx: 'clang-cuda', sm: '75;100'}
     # cudax
     - {project: 'cudax', jobs: ['test'], std: 'max', cxx: ['gcc', 'msvc'], gpu: 'rtx2080', sm: 'gpu'}
     - {project: 'cudax', jobs: ['build'], std: 'max', cxx: 'clang', sm: '75;120'}
-    # - {project: 'cudax', jobs: ['build'], std: 'max', ctk: 'nvhpc', cxx: 'nvhpc', sm: '75;120'}
+    - {project: 'cudax', jobs: ['build'], std: 'max', ctk: 'nvhpc', cxx: 'nvhpc', sm: '75;120'}
     # stdpar
     - {project: 'stdpar', jobs: ['build'], std: 'max', ctk: 'nvhpc', cxx: 'nvhpc'}
     # Python + support

@@ -5,11 +5,21 @@
 #  error "This file must be included before <immintrin.h>"
 #endif // _IMMINTRIN_H_INCLUDED
 
+#if defined(__NVCC__) && defined(__CUDACC__)
 // Forward declare builtins used by gcc 12. Clang and nvc++ define __GNUC__, too, so we need to explicitly leave them
 // out.
-#if defined(__NVCC__) && defined(__CUDACC__) && defined(__GNUC__) && !defined(__clang__) && !defined(__NVCOMPILER)
-#  if __CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ == 0 && __GNUC__ == 12
+#  if defined(__GNUC__) && !defined(__clang__) && !defined(__NVCOMPILER)
+#    if __CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ == 0 && __GNUC__ == 12
 void __builtin_ia32_ldtilecfg(const void*);
 void __builtin_ia32_sttilecfg(void*);
-#  endif // __CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ == 0 && __GNUC__ == 12
-#endif // __NVCC__ && __CUDACC__ && __GNUC__ && !__clang__ && !__NVCOMPILER
+#    endif // __CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ == 0 && __GNUC__ == 12
+#  endif // __GNUC__ && !__clang__ && !__NVCOMPILER
+
+// cudafe++ has problems with many builtins used in <avx512fp16intrin.h> and <avx512vlfp16intrin.h> when compiling with
+// nvc++ as the host compiler. Since those headers are not used by thrust or tbb, we can prevent their inclusion by
+// defining their include guard macros.
+#  if defined(__NVCOMPILER)
+#    define __AVX512FP16INTRIN_H
+#    define __AVX512VLFP16INTRIN_H
+#  endif // __NVCOMPILER
+#endif // __NVCC__ && __CUDACC__