NVIDIA
diff --git a/‎README.md
Lines changed: 2 additions & 0 deletions b/‎README.md
Lines changed: 2 additions & 0 deletions
diff --git a/‎Samples/4_CUDA_Libraries/cudaNvSciNvMedia/cuda_consumer.cu
Lines changed: 2 additions & 15 deletions b/‎Samples/4_CUDA_Libraries/cudaNvSciNvMedia/cuda_consumer.cu
Lines changed: 2 additions & 15 deletions
diff --git a/‎Samples/4_CUDA_Libraries/jitLto/Makefile
Lines changed: 3 additions & 3 deletions b/‎Samples/4_CUDA_Libraries/jitLto/Makefile
Lines changed: 3 additions & 3 deletions
diff --git a/‎Samples/4_CUDA_Libraries/jitLto/jitLto.cpp
Lines changed: 251 additions & 0 deletions b/‎Samples/4_CUDA_Libraries/jitLto/jitLto.cpp
Lines changed: 251 additions & 0 deletions
diff --git a/‎Samples/4_CUDA_Libraries/jitLto/jitLto_vs2017.vcxproj
Lines changed: 1 addition & 1 deletion b/‎Samples/4_CUDA_Libraries/jitLto/jitLto_vs2017.vcxproj
Lines changed: 1 addition & 1 deletion
diff --git a/‎Samples/4_CUDA_Libraries/jitLto/jitLto_vs2019.vcxproj
Lines changed: 1 addition & 1 deletion b/‎Samples/4_CUDA_Libraries/jitLto/jitLto_vs2019.vcxproj
Lines changed: 1 addition & 1 deletion
diff --git a/‎Samples/4_CUDA_Libraries/jitLto/jitLto_vs2022.vcxproj
Lines changed: 1 addition & 1 deletion b/‎Samples/4_CUDA_Libraries/jitLto/jitLto_vs2022.vcxproj
Lines changed: 1 addition & 1 deletion
diff --git a/‎Samples/6_Performance/LargeKernelParameter/.vscode/c_cpp_properties.json
Lines changed: 18 additions & 0 deletions b/‎Samples/6_Performance/LargeKernelParameter/.vscode/c_cpp_properties.json
Lines changed: 18 additions & 0 deletions
diff --git a/‎Samples/6_Performance/LargeKernelParameter/.vscode/extensions.json
Lines changed: 7 additions & 0 deletions b/‎Samples/6_Performance/LargeKernelParameter/.vscode/extensions.json
Lines changed: 7 additions & 0 deletions
diff --git a/‎Samples/6_Performance/LargeKernelParameter/.vscode/launch.json
Lines changed: 10 additions & 0 deletions b/‎Samples/6_Performance/LargeKernelParameter/.vscode/launch.json
Lines changed: 10 additions & 0 deletions
@@ -7,6 +7,8 @@ Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This ve
 This section describes the release notes for the CUDA Samples on GitHub only.
 
 ### CUDA 12.1
+* Added JIT LTO Sample
+* Adding Large Kernel Sample
 
 ### [older versions...](./CHANGELOG.md)
 
 
@@ -158,12 +158,6 @@ static void cudaImportNvSciImage(cudaExternalResInterop &cudaExtResObj,
   pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_Layout;
   pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlaneBitsPerPixel;
   pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlaneOffset;
-  pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlanePitch;
-  pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlaneAlignedHeight;
-
-  uint32_t planePitchs[10];
-  uint32_t planePixel[10];
-  uint32_t planeAlignedHeight[10];
 
   checkNvSciErrors(NvSciBufAttrListGetAttrs(attrlist, pairArrayOut, numAttrs));
 
@@ -183,13 +177,6 @@ static void cudaImportNvSciImage(cudaExternalResInterop &cudaExtResObj,
          cudaExtResObj.planeCount * sizeof(int32_t));
   memcpy(cudaExtResObj.planeOffset, (uint64_t *)pairArrayOut[7].value,
          cudaExtResObj.planeCount * sizeof(uint64_t));
-  memcpy(planePixel, (uint32_t *)pairArrayOut[6].value,
-         cudaExtResObj.planeCount * sizeof(uint32_t));
-  memcpy(planePitchs, (uint32_t *)pairArrayOut[8].value,
-         cudaExtResObj.planeCount * sizeof(uint32_t));
-  memcpy(planeAlignedHeight, (uint32_t *)pairArrayOut[9].value,
-         cudaExtResObj.planeCount * sizeof(uint32_t));
-  
 
   NvSciBufAttrValImageLayoutType layout =
       *(NvSciBufAttrValImageLayoutType *)pairArrayOut[5].value;
@@ -214,8 +201,8 @@ static void cudaImportNvSciImage(cudaExternalResInterop &cudaExtResObj,
   for (int i = 0; i < cudaExtResObj.planeCount; i++) {
     cudaExtent extent = {};
     memset(&extent, 0, sizeof(extent));
-    extent.width = planePitchs[i] / (planePixel[i] / 8);
-    extent.height = planeAlignedHeight[i];
+    extent.width = cudaExtResObj.imageWidth[i];
+    extent.height = cudaExtResObj.imageHeight[i];
     extent.depth = 0;
     cudaChannelFormatDesc desc;
     switch (channelCount) {
 
@@ -384,10 +384,10 @@ else
 	@echo "Sample is ready - all dependencies have been met"
 endif
 
-jitlto.o:jitlto.cpp
+jitLto.o:jitLto.cpp
 	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
 
-jitLto: jitlto.o
+jitLto: jitLto.o
 	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
 	$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
 	$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
@@ -398,7 +398,7 @@ run: build
 testrun: build
 
 clean:
-	rm -f jitLto jitlto.o
+	rm -f jitLto jitLto.o
 	rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/jitLto
 
 clobber: clean
@@ -0,0 +1,251 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <cuda.h>
+#include <nvJitLink.h>
+#include <nvrtc.h>
+#include <iostream>
+#include <cstring>
+
+#define NUM_THREADS 128
+#define NUM_BLOCKS 32
+
+#define NVRTC_SAFE_CALL(x)                                        \
+  do {                                                            \
+    nvrtcResult result = x;                                       \
+    if (result != NVRTC_SUCCESS) {                                \
+      std::cerr << "\nerror: " #x " failed with error "           \
+                << nvrtcGetErrorString(result) << '\n';           \
+      exit(1);                                                    \
+    }                                                             \
+  } while(0)
+#define CUDA_SAFE_CALL(x)                                         \
+  do {                                                            \
+    CUresult result = x;                                          \
+    if (result != CUDA_SUCCESS) {                                 \
+      const char *msg;                                            \
+      cuGetErrorName(result, &msg);                               \
+      std::cerr << "\nerror: " #x " failed with error "           \
+                << msg << '\n';                                   \
+      exit(1);                                                    \
+    }                                                             \
+  } while(0)
+#define NVJITLINK_SAFE_CALL(h,x)                                         \
+  do {                                                            \
+    nvJitLinkResult result = x;                                          \
+    if (result != NVJITLINK_SUCCESS) {                                 \
+      std::cerr << "\nerror: " #x " failed with error "           \
+                << result << '\n';                                   \
+      size_t lsize;                                               \
+      result = nvJitLinkGetErrorLogSize(h, &lsize);               \
+      if (result == NVJITLINK_SUCCESS && lsize > 0) {             \
+        char *log = (char*)malloc(lsize);                         \
+	result = nvJitLinkGetErrorLog(h, log);                    \
+	if (result == NVJITLINK_SUCCESS) {                        \
+	  std::cerr << "error log: " << log << '\n';                  \
+	  free(log);                                              \
+	}                                                         \
+      }                                                           \
+      exit(1);                                                    \
+    }                                                             \
+  } while(0)
+
+const char *lto_saxpy = "                                       \n\
+extern __device__ float compute(float a, float x, float y);     \n\
+                                                                \n\
+extern \"C\" __global__                                         \n\
+void saxpy(float a, float *x, float *y, float *out, size_t n)   \n\
+{                                                               \n\
+  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;           \n\
+  if (tid < n) {                                                \n\
+    out[tid] = compute(a, x[tid], y[tid]);                      \n\
+  }                                                             \n\
+}                                                               \n";
+
+const char *lto_compute = "                                     \n\
+__device__  float compute(float a, float x, float y) {          \n\
+  return a * x + y;                                             \n\
+}                                                               \n";
+
+// compile code into LTOIR, returning the IR and its size
+static void getLTOIR (const char *code, const char *name, 
+                      char **ltoIR, size_t *ltoIRSize)
+{
+  // Create an instance of nvrtcProgram with the code string.
+  nvrtcProgram prog;
+  NVRTC_SAFE_CALL(
+    nvrtcCreateProgram(&prog,            // prog
+                       code,             // buffer
+                       name,             // name
+                       0,                // numHeaders
+                       NULL,             // headers
+                       NULL));           // includeNames
+  
+  // specify that LTO IR should be generated for LTO operation
+  const char *opts[] = {"-dlto",
+                        "--relocatable-device-code=true"};
+  nvrtcResult compileResult = nvrtcCompileProgram(prog,  // prog
+                                                  2,     // numOptions
+                                                  opts); // options
+  // Obtain compilation log from the program.
+  size_t logSize;
+  NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize));
+  char *log = new char[logSize];
+  NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log));
+  std::cout << log << '\n';
+  delete[] log;
+  if (compileResult != NVRTC_SUCCESS) {
+    exit(1);
+  }
+  // Obtain generated LTO IR from the program.
+  NVRTC_SAFE_CALL(nvrtcGetLTOIRSize(prog, ltoIRSize));
+  *ltoIR = new char[*ltoIRSize];
+  NVRTC_SAFE_CALL(nvrtcGetLTOIR(prog, *ltoIR));
+  // Destroy the program.
+  NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));
+}
+
+int main(int argc, char *argv[])
+{
+  char *ltoIR1;
+  char *ltoIR2;
+  size_t ltoIR1Size;
+  size_t ltoIR2Size;
+  // getLTOIR uses nvrtc to get the LTOIR.
+  // We could also use nvcc offline with -dlto -fatbin
+  // to generate the IR, but using nvrtc keeps the build simpler.
+  getLTOIR(lto_saxpy, "lto_saxpy.cu", &ltoIR1, &ltoIR1Size);
+  getLTOIR(lto_compute, "lto_compute.cu", &ltoIR2, &ltoIR2Size);
+
+  CUdevice cuDevice;
+  CUcontext context;
+  CUmodule module;
+  CUfunction kernel;
+  CUDA_SAFE_CALL(cuInit(0));
+  CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0));
+  CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice));
+
+  // Dynamically determine the arch to link for
+  int major = 0;
+  int minor = 0;
+  CUDA_SAFE_CALL(cuDeviceGetAttribute(&major, 
+                   CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
+  CUDA_SAFE_CALL(cuDeviceGetAttribute(&minor, 
+                   CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
+  int arch = major*10 + minor;
+  char smbuf[16];
+  memset(smbuf,0,16);
+  sprintf(smbuf, "-arch=sm_%d", arch);
+
+  // Load the generated LTO IR and link them together
+  nvJitLinkHandle handle;
+  const char *lopts[] = {"-lto", smbuf};
+  NVJITLINK_SAFE_CALL(handle, nvJitLinkCreate(&handle, 2, lopts));
+
+  NVJITLINK_SAFE_CALL(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, 
+                        	(void *)ltoIR1, ltoIR1Size, "lto_saxpy"));
+  NVJITLINK_SAFE_CALL(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, 
+                        	(void *)ltoIR2, ltoIR2Size, "lto_compute"));
+
+  // The call to nvJitLinkComplete causes linker to link together the two
+  // LTO IR modules, do optimization on the linked LTO IR,
+  // and generate cubin from it.
+  NVJITLINK_SAFE_CALL(handle, nvJitLinkComplete(handle));
+
+  // check error log
+  size_t logSize;
+  NVJITLINK_SAFE_CALL(handle, nvJitLinkGetErrorLogSize(handle, &logSize));
+  if (logSize > 0) {
+    char *log = (char*)malloc(logSize+1);
+    NVJITLINK_SAFE_CALL(handle, nvJitLinkGetErrorLog(handle, log));
+    std::cout << "Error log: " << log << std::endl;
+    free(log);
+  }
+
+  // get linked cubin
+  size_t cubinSize;
+  NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubinSize(handle, &cubinSize));
+  void *cubin = malloc(cubinSize);
+  NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubin(handle, cubin));
+
+  NVJITLINK_SAFE_CALL(handle, nvJitLinkDestroy(&handle));
+  delete[] ltoIR1;
+  delete[] ltoIR2;
+
+  // cubin is linked, so now load it
+  CUDA_SAFE_CALL(cuModuleLoadData(&module, cubin));
+  CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "saxpy"));
+  
+  // Generate input for execution, and create output buffers.
+  size_t n = NUM_THREADS * NUM_BLOCKS;
+  size_t bufferSize = n * sizeof(float);
+  float a = 5.1f;
+  float *hX = new float[n], *hY = new float[n], *hOut = new float[n];
+  for (size_t i = 0; i < n; ++i) {
+    hX[i] = static_cast<float>(i);
+    hY[i] = static_cast<float>(i * 2);
+  }
+  CUdeviceptr dX, dY, dOut;
+  CUDA_SAFE_CALL(cuMemAlloc(&dX, bufferSize));
+  CUDA_SAFE_CALL(cuMemAlloc(&dY, bufferSize));
+  CUDA_SAFE_CALL(cuMemAlloc(&dOut, bufferSize));
+  CUDA_SAFE_CALL(cuMemcpyHtoD(dX, hX, bufferSize));
+  CUDA_SAFE_CALL(cuMemcpyHtoD(dY, hY, bufferSize));
+  // Execute SAXPY.
+  void *args[] = { &a, &dX, &dY, &dOut, &n };
+  CUDA_SAFE_CALL(
+    cuLaunchKernel(kernel,
+                   NUM_BLOCKS, 1, 1,    // grid dim
+                   NUM_THREADS, 1, 1,   // block dim
+                   0, NULL,             // shared mem and stream
+                   args, 0));           // arguments
+  CUDA_SAFE_CALL(cuCtxSynchronize());
+  // Retrieve and print output.
+  CUDA_SAFE_CALL(cuMemcpyDtoH(hOut, dOut, bufferSize));
+  
+  for (size_t i = 0; i < n; ++i) {
+    std::cout << a << " * " << hX[i] << " + " << hY[i]
+              << " = " << hOut[i] << '\n';
+  }
+  // check last value to verify
+  if (hOut[n-1] == 29074.5) {
+    std::cout << "PASSED!\n";
+  } else {
+    std::cout << "values not expected?\n";
+  }
+  // Release resources.
+  CUDA_SAFE_CALL(cuMemFree(dX));
+  CUDA_SAFE_CALL(cuMemFree(dY));
+  CUDA_SAFE_CALL(cuMemFree(dOut));
+  CUDA_SAFE_CALL(cuModuleUnload(module));
+  CUDA_SAFE_CALL(cuCtxDestroy(context));
+  free(cubin);
+  delete[] hX;
+  delete[] hY;
+  delete[] hOut;
+  return 0;
+}
@@ -102,7 +102,7 @@
     </CudaCompile>
   </ItemDefinitionGroup>
   <ItemGroup>
-    <ClCompile Include="jitlto.cpp" />
+    <ClCompile Include="jitLto.cpp" />
 
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
 
@@ -98,7 +98,7 @@
     </CudaCompile>
   </ItemDefinitionGroup>
   <ItemGroup>
-    <ClCompile Include="jitlto.cpp" />
+    <ClCompile Include="jitLto.cpp" />
 
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
 
@@ -98,7 +98,7 @@
     </CudaCompile>
   </ItemDefinitionGroup>
   <ItemGroup>
-    <ClCompile Include="jitlto.cpp" />
+    <ClCompile Include="jitLto.cpp" />
 
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
 
@@ -0,0 +1,18 @@
+{
+    "configurations": [
+        {
+            "name": "Linux",
+            "includePath": [
+                "${workspaceFolder}/**",
+                "${workspaceFolder}/../../../Common"
+            ],
+            "defines": [],
+            "compilerPath": "/usr/local/cuda/bin/nvcc",
+            "cStandard": "gnu17",
+            "cppStandard": "gnu++14",
+            "intelliSenseMode": "linux-gcc-x64",
+            "configurationProvider": "ms-vscode.makefile-tools"
+        }
+    ],
+    "version": 4
+}
@@ -0,0 +1,7 @@
+{
+    "recommendations": [
+        "nvidia.nsight-vscode-edition",
+        "ms-vscode.cpptools",
+        "ms-vscode.makefile-tools"
+    ]
+}
@@ -0,0 +1,10 @@
+{
+    "configurations": [
+        {
+            "name": "CUDA C++: Launch",
+            "type": "cuda-gdb",
+            "request": "launch",
+            "program": "${workspaceFolder}/LargeKernelParameter"
+        }
+    ]
+}