Skip to content

Commit 8004ad5

Browse files
committed
Fix #194 and add Large Kernel Parameters Sample
1 parent e612904 commit 8004ad5

20 files changed

+924
-51
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This ve
77
This section describes the release notes for the CUDA Samples on GitHub only.
88

99
### CUDA 12.1
10+
* Added JIT LTO Sample
11+
* Added Large Kernel Parameters Sample
1012

1113
### [older versions...](./CHANGELOG.md)
1214

Samples/4_CUDA_Libraries/cudaNvSciNvMedia/cuda_consumer.cu

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -158,12 +158,6 @@ static void cudaImportNvSciImage(cudaExternalResInterop &cudaExtResObj,
158158
pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_Layout;
159159
pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlaneBitsPerPixel;
160160
pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlaneOffset;
161-
pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlanePitch;
162-
pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlaneAlignedHeight;
163-
164-
uint32_t planePitchs[10];
165-
uint32_t planePixel[10];
166-
uint32_t planeAlignedHeight[10];
167161

168162
checkNvSciErrors(NvSciBufAttrListGetAttrs(attrlist, pairArrayOut, numAttrs));
169163

@@ -183,13 +177,6 @@ static void cudaImportNvSciImage(cudaExternalResInterop &cudaExtResObj,
183177
cudaExtResObj.planeCount * sizeof(int32_t));
184178
memcpy(cudaExtResObj.planeOffset, (uint64_t *)pairArrayOut[7].value,
185179
cudaExtResObj.planeCount * sizeof(uint64_t));
186-
memcpy(planePixel, (uint32_t *)pairArrayOut[6].value,
187-
cudaExtResObj.planeCount * sizeof(uint32_t));
188-
memcpy(planePitchs, (uint32_t *)pairArrayOut[8].value,
189-
cudaExtResObj.planeCount * sizeof(uint32_t));
190-
memcpy(planeAlignedHeight, (uint32_t *)pairArrayOut[9].value,
191-
cudaExtResObj.planeCount * sizeof(uint32_t));
192-
193180

194181
NvSciBufAttrValImageLayoutType layout =
195182
*(NvSciBufAttrValImageLayoutType *)pairArrayOut[5].value;
@@ -214,8 +201,8 @@ static void cudaImportNvSciImage(cudaExternalResInterop &cudaExtResObj,
214201
for (int i = 0; i < cudaExtResObj.planeCount; i++) {
215202
cudaExtent extent = {};
216203
memset(&extent, 0, sizeof(extent));
217-
extent.width = planePitchs[i] / (planePixel[i] / 8);
218-
extent.height = planeAlignedHeight[i];
204+
extent.width = cudaExtResObj.imageWidth[i];
205+
extent.height = cudaExtResObj.imageHeight[i];
219206
extent.depth = 0;
220207
cudaChannelFormatDesc desc;
221208
switch (channelCount) {

Samples/4_CUDA_Libraries/jitLto/Makefile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -384,10 +384,10 @@ else
384384
@echo "Sample is ready - all dependencies have been met"
385385
endif
386386

387-
jitlto.o:jitlto.cpp
387+
jitLto.o:jitLto.cpp
388388
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
389389

390-
jitLto: jitlto.o
390+
jitLto: jitLto.o
391391
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
392392
$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
393393
$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
@@ -398,7 +398,7 @@ run: build
398398
testrun: build
399399

400400
clean:
401-
rm -f jitLto jitlto.o
401+
rm -f jitLto jitLto.o
402402
rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/jitLto
403403

404404
clobber: clean
Lines changed: 251 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,251 @@
1+
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2+
*
3+
* Redistribution and use in source and binary forms, with or without
4+
* modification, are permitted provided that the following conditions
5+
* are met:
6+
* * Redistributions of source code must retain the above copyright
7+
* notice, this list of conditions and the following disclaimer.
8+
* * Redistributions in binary form must reproduce the above copyright
9+
* notice, this list of conditions and the following disclaimer in the
10+
* documentation and/or other materials provided with the distribution.
11+
* * Neither the name of NVIDIA CORPORATION nor the names of its
12+
* contributors may be used to endorse or promote products derived
13+
* from this software without specific prior written permission.
14+
*
15+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16+
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18+
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19+
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20+
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21+
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22+
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23+
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24+
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25+
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*/
27+
28+
#include <cuda.h>
#include <nvJitLink.h>
#include <nvrtc.h>

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
33+
34+
// Launch configuration: the sample processes NUM_THREADS * NUM_BLOCKS
// elements, one element per thread.
#define NUM_THREADS 128
#define NUM_BLOCKS 32

// Evaluates the NVRTC call `x` exactly once. On failure, prints the
// stringified call and the NVRTC error string to stderr and terminates the
// process with exit code 1.
#define NVRTC_SAFE_CALL(x)                                        \
  do {                                                            \
    nvrtcResult result = (x);                                     \
    if (result != NVRTC_SUCCESS) {                                \
      std::cerr << "\nerror: " #x " failed with error "           \
                << nvrtcGetErrorString(result) << '\n';           \
      exit(1);                                                    \
    }                                                             \
  } while(0)

// Evaluates the CUDA driver API call `x` exactly once. On failure, prints the
// stringified call and the error name to stderr and terminates the process
// with exit code 1. cuGetErrorName may leave the name pointer NULL for an
// unrecognized code, so a fallback text is used instead of streaming NULL.
#define CUDA_SAFE_CALL(x)                                         \
  do {                                                            \
    CUresult result = (x);                                        \
    if (result != CUDA_SUCCESS) {                                 \
      const char *msg = NULL;                                     \
      if (cuGetErrorName(result, &msg) != CUDA_SUCCESS ||         \
          msg == NULL) {                                          \
        msg = "unrecognized CUDA error code";                     \
      }                                                           \
      std::cerr << "\nerror: " #x " failed with error "           \
                << msg << '\n';                                   \
      exit(1);                                                    \
    }                                                             \
  } while(0)

// Evaluates the nvJitLink call `x` exactly once. On failure, prints the
// stringified call and the numeric result to stderr, then tries to fetch and
// print the error log attached to handle `h`, and terminates the process with
// exit code 1. The log buffer gets one extra, pre-terminated byte so printing
// is safe whether or not the reported size counts the trailing NUL, and the
// buffer is released on every path.
#define NVJITLINK_SAFE_CALL(h,x)                                  \
  do {                                                            \
    nvJitLinkResult result = (x);                                 \
    if (result != NVJITLINK_SUCCESS) {                            \
      std::cerr << "\nerror: " #x " failed with error "           \
                << result << '\n';                                \
      size_t lsize = 0;                                           \
      result = nvJitLinkGetErrorLogSize((h), &lsize);             \
      if (result == NVJITLINK_SUCCESS && lsize > 0) {             \
        char *log = (char*)malloc(lsize + 1);                     \
        if (log != NULL) {                                        \
          log[lsize] = '\0';                                      \
          result = nvJitLinkGetErrorLog((h), log);                \
          if (result == NVJITLINK_SUCCESS) {                      \
            std::cerr << "error log: " << log << '\n';            \
          }                                                       \
          free(log);                                              \
        }                                                         \
      }                                                           \
      exit(1);                                                    \
    }                                                             \
  } while(0)
76+
77+
// Device source for the saxpy kernel, compiled to LTO IR at run time.
// It only declares compute(); the definition comes from lto_compute and is
// resolved when the two LTO IR modules are linked together.
// The global index is widened to size_t before the multiply so that
// blockIdx.x * blockDim.x cannot wrap in 32-bit arithmetic for large grids.
const char *lto_saxpy = " \n\
extern __device__ float compute(float a, float x, float y); \n\
\n\
extern \"C\" __global__ \n\
void saxpy(float a, float *x, float *y, float *out, size_t n) \n\
{ \n\
size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x; \n\
if (tid < n) { \n\
out[tid] = compute(a, x[tid], y[tid]); \n\
} \n\
} \n";

// Device source for the compute() helper called by the saxpy kernel; it is
// compiled to its own LTO IR module.
const char *lto_compute = " \n\
__device__ float compute(float a, float x, float y) { \n\
return a * x + y; \n\
} \n";
93+
94+
// compile code into LTOIR, returning the IR and its size
95+
static void getLTOIR (const char *code, const char *name,
96+
char **ltoIR, size_t *ltoIRSize)
97+
{
98+
// Create an instance of nvrtcProgram with the code string.
99+
nvrtcProgram prog;
100+
NVRTC_SAFE_CALL(
101+
nvrtcCreateProgram(&prog, // prog
102+
code, // buffer
103+
name, // name
104+
0, // numHeaders
105+
NULL, // headers
106+
NULL)); // includeNames
107+
108+
// specify that LTO IR should be generated for LTO operation
109+
const char *opts[] = {"-dlto",
110+
"--relocatable-device-code=true"};
111+
nvrtcResult compileResult = nvrtcCompileProgram(prog, // prog
112+
2, // numOptions
113+
opts); // options
114+
// Obtain compilation log from the program.
115+
size_t logSize;
116+
NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize));
117+
char *log = new char[logSize];
118+
NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log));
119+
std::cout << log << '\n';
120+
delete[] log;
121+
if (compileResult != NVRTC_SUCCESS) {
122+
exit(1);
123+
}
124+
// Obtain generated LTO IR from the program.
125+
NVRTC_SAFE_CALL(nvrtcGetLTOIRSize(prog, ltoIRSize));
126+
*ltoIR = new char[*ltoIRSize];
127+
NVRTC_SAFE_CALL(nvrtcGetLTOIR(prog, *ltoIR));
128+
// Destroy the program.
129+
NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));
130+
}
131+
132+
int main(int argc, char *argv[])
133+
{
134+
char *ltoIR1;
135+
char *ltoIR2;
136+
size_t ltoIR1Size;
137+
size_t ltoIR2Size;
138+
// getLTOIR uses nvrtc to get the LTOIR.
139+
// We could also use nvcc offline with -dlto -fatbin
140+
// to generate the IR, but using nvrtc keeps the build simpler.
141+
getLTOIR(lto_saxpy, "lto_saxpy.cu", &ltoIR1, &ltoIR1Size);
142+
getLTOIR(lto_compute, "lto_compute.cu", &ltoIR2, &ltoIR2Size);
143+
144+
CUdevice cuDevice;
145+
CUcontext context;
146+
CUmodule module;
147+
CUfunction kernel;
148+
CUDA_SAFE_CALL(cuInit(0));
149+
CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0));
150+
CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice));
151+
152+
// Dynamically determine the arch to link for
153+
int major = 0;
154+
int minor = 0;
155+
CUDA_SAFE_CALL(cuDeviceGetAttribute(&major,
156+
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
157+
CUDA_SAFE_CALL(cuDeviceGetAttribute(&minor,
158+
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
159+
int arch = major*10 + minor;
160+
char smbuf[16];
161+
memset(smbuf,0,16);
162+
sprintf(smbuf, "-arch=sm_%d", arch);
163+
164+
// Load the generated LTO IR and link them together
165+
nvJitLinkHandle handle;
166+
const char *lopts[] = {"-lto", smbuf};
167+
NVJITLINK_SAFE_CALL(handle, nvJitLinkCreate(&handle, 2, lopts));
168+
169+
NVJITLINK_SAFE_CALL(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR,
170+
(void *)ltoIR1, ltoIR1Size, "lto_saxpy"));
171+
NVJITLINK_SAFE_CALL(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR,
172+
(void *)ltoIR2, ltoIR2Size, "lto_compute"));
173+
174+
// The call to nvJitLinkComplete causes linker to link together the two
175+
// LTO IR modules, do optimization on the linked LTO IR,
176+
// and generate cubin from it.
177+
NVJITLINK_SAFE_CALL(handle, nvJitLinkComplete(handle));
178+
179+
// check error log
180+
size_t logSize;
181+
NVJITLINK_SAFE_CALL(handle, nvJitLinkGetErrorLogSize(handle, &logSize));
182+
if (logSize > 0) {
183+
char *log = (char*)malloc(logSize+1);
184+
NVJITLINK_SAFE_CALL(handle, nvJitLinkGetErrorLog(handle, log));
185+
std::cout << "Error log: " << log << std::endl;
186+
free(log);
187+
}
188+
189+
// get linked cubin
190+
size_t cubinSize;
191+
NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubinSize(handle, &cubinSize));
192+
void *cubin = malloc(cubinSize);
193+
NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubin(handle, cubin));
194+
195+
NVJITLINK_SAFE_CALL(handle, nvJitLinkDestroy(&handle));
196+
delete[] ltoIR1;
197+
delete[] ltoIR2;
198+
199+
// cubin is linked, so now load it
200+
CUDA_SAFE_CALL(cuModuleLoadData(&module, cubin));
201+
CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "saxpy"));
202+
203+
// Generate input for execution, and create output buffers.
204+
size_t n = NUM_THREADS * NUM_BLOCKS;
205+
size_t bufferSize = n * sizeof(float);
206+
float a = 5.1f;
207+
float *hX = new float[n], *hY = new float[n], *hOut = new float[n];
208+
for (size_t i = 0; i < n; ++i) {
209+
hX[i] = static_cast<float>(i);
210+
hY[i] = static_cast<float>(i * 2);
211+
}
212+
CUdeviceptr dX, dY, dOut;
213+
CUDA_SAFE_CALL(cuMemAlloc(&dX, bufferSize));
214+
CUDA_SAFE_CALL(cuMemAlloc(&dY, bufferSize));
215+
CUDA_SAFE_CALL(cuMemAlloc(&dOut, bufferSize));
216+
CUDA_SAFE_CALL(cuMemcpyHtoD(dX, hX, bufferSize));
217+
CUDA_SAFE_CALL(cuMemcpyHtoD(dY, hY, bufferSize));
218+
// Execute SAXPY.
219+
void *args[] = { &a, &dX, &dY, &dOut, &n };
220+
CUDA_SAFE_CALL(
221+
cuLaunchKernel(kernel,
222+
NUM_BLOCKS, 1, 1, // grid dim
223+
NUM_THREADS, 1, 1, // block dim
224+
0, NULL, // shared mem and stream
225+
args, 0)); // arguments
226+
CUDA_SAFE_CALL(cuCtxSynchronize());
227+
// Retrieve and print output.
228+
CUDA_SAFE_CALL(cuMemcpyDtoH(hOut, dOut, bufferSize));
229+
230+
for (size_t i = 0; i < n; ++i) {
231+
std::cout << a << " * " << hX[i] << " + " << hY[i]
232+
<< " = " << hOut[i] << '\n';
233+
}
234+
// check last value to verify
235+
if (hOut[n-1] == 29074.5) {
236+
std::cout << "PASSED!\n";
237+
} else {
238+
std::cout << "values not expected?\n";
239+
}
240+
// Release resources.
241+
CUDA_SAFE_CALL(cuMemFree(dX));
242+
CUDA_SAFE_CALL(cuMemFree(dY));
243+
CUDA_SAFE_CALL(cuMemFree(dOut));
244+
CUDA_SAFE_CALL(cuModuleUnload(module));
245+
CUDA_SAFE_CALL(cuCtxDestroy(context));
246+
free(cubin);
247+
delete[] hX;
248+
delete[] hY;
249+
delete[] hOut;
250+
return 0;
251+
}

Samples/4_CUDA_Libraries/jitLto/jitLto_vs2017.vcxproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@
102102
</CudaCompile>
103103
</ItemDefinitionGroup>
104104
<ItemGroup>
105-
<ClCompile Include="jitlto.cpp" />
105+
<ClCompile Include="jitLto.cpp" />
106106

107107
</ItemGroup>
108108
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />

Samples/4_CUDA_Libraries/jitLto/jitLto_vs2019.vcxproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@
9898
</CudaCompile>
9999
</ItemDefinitionGroup>
100100
<ItemGroup>
101-
<ClCompile Include="jitlto.cpp" />
101+
<ClCompile Include="jitLto.cpp" />
102102

103103
</ItemGroup>
104104
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />

Samples/4_CUDA_Libraries/jitLto/jitLto_vs2022.vcxproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@
9898
</CudaCompile>
9999
</ItemDefinitionGroup>
100100
<ItemGroup>
101-
<ClCompile Include="jitlto.cpp" />
101+
<ClCompile Include="jitLto.cpp" />
102102

103103
</ItemGroup>
104104
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
{
2+
"configurations": [
3+
{
4+
"name": "Linux",
5+
"includePath": [
6+
"${workspaceFolder}/**",
7+
"${workspaceFolder}/../../../Common"
8+
],
9+
"defines": [],
10+
"compilerPath": "/usr/local/cuda/bin/nvcc",
11+
"cStandard": "gnu17",
12+
"cppStandard": "gnu++14",
13+
"intelliSenseMode": "linux-gcc-x64",
14+
"configurationProvider": "ms-vscode.makefile-tools"
15+
}
16+
],
17+
"version": 4
18+
}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
{
2+
"recommendations": [
3+
"nvidia.nsight-vscode-edition",
4+
"ms-vscode.cpptools",
5+
"ms-vscode.makefile-tools"
6+
]
7+
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{
2+
"configurations": [
3+
{
4+
"name": "CUDA C++: Launch",
5+
"type": "cuda-gdb",
6+
"request": "launch",
7+
"program": "${workspaceFolder}/LargeKernelParameter"
8+
}
9+
]
10+
}

0 commit comments

Comments
 (0)