Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CudaLaunchError : CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES #331

Open
nwils opened this issue Feb 26, 2021 · 0 comments
Open

CudaLaunchError : CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES #331

nwils opened this issue Feb 26, 2021 · 0 comments

Comments

@nwils
Copy link

nwils commented Feb 26, 2021

Hi,

I am trying to run the latest sagemaker neo compiled model on Jetson Nano. My compilation parameters are
- os: linux
- accelerator: nvidia
- arch: arm64
- compiler options: {"cuda-ver": "10.0", "trt-ver": "6.0.1", "gpu-code": "sm_53"}
neo-ai-dlr release version - 1.7
The board has jetpack4.3. My input model is an MXNet ssd mobilenet model. I am using the libdlr.so module in the compiled model.
But it produces the cuda out of resources error.

  File "/usr/local/lib/python3.8/site-packages/dlr/dlr_model.py", line 451, in run 
    self._run() 
  File "/usr/local/lib/python3.8/site-packages/dlr/dlr_model.py", line 333, in _run 
    self._check_call(self._lib.RunDLRModel(byref(self.handle))) 
  File "/usr/local/lib/python3.8/site-packages/dlr/dlr_model.py", line 160, in _check_call 
    raise DLRError(self._lib.DLRGetLastError().decode('ascii')) 
dlr.dlr_model.DLRError: TVMError:  
--------------------------------------------------------------- 
An internal invariant was violated during the execution of TVM. 
Please read TVM's error reporting guidelines. 
More details can be found here: https://discuss.tvm.ai/t/error-reporting/7793. 
--------------------------------------------------------------- 
 
  Check failed: ret == 0 (-1 vs. 0) : TVMError: CUDALaunch Error: CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES 
 grid=(1,1,1),  block=(1024,1,1) 
// func_name=fused_vision_non_max_suppression_kernel2 
// CUDA Source 
// ----------- 
// 
// Generated by NVIDIA NVVM Compiler 
// 
// Compiler Build ID: CL-24817639 
// Cuda compilation tools, release 10.0, V10.0.130 
// Based on LLVM 3.4svn 
// 
 
.version 6.3 
.target sm_53 
.address_size 64 
 
 // .globl fused_vision_get_valid_counts_kernel1 
 
.visible .entry fused_vision_get_valid_counts_kernel1( 
 .param .u64 fused_vision_get_valid_counts_kernel1_param_0, 
 .param .u64 fused_vision_get_valid_counts_kernel1_param_1, 
 .param .u64 fused_vision_get_valid_counts_kernel1_param_2 
) 
{ 
 .reg .pred  %p<2>; 
 .reg .b32  %r<6>; 
 .reg .b64  %rd<12>; 
 
 
 ld.param.u64  %rd1, [fused_vision_get_valid_counts_kernel1_param_0]; 
 ld.param.u64  %rd2, [fused_vision_get_valid_counts_kernel1_param_1]; 
 ld.param.u64  %rd3, [fused_vision_get_valid_counts_kernel1_param_2]; 
 mov.u32  %r1, %tid.x; 
 setp.gt.s32 %p1, %r1, 0; 
 @%p1 bra  BB0_2; 
 
 cvta.to.global.u64  %rd4, %rd2; 
 mul.lo.s32  %r2, %r1, 12264; 
 mul.wide.s32  %rd5, %r2, 4; 
 add.s64  %rd6, %rd4, %rd5; 
 cvta.to.global.u64  %rd7, %rd3; 
 add.s64  %rd8, %rd7, %rd5; 
 ld.global.nc.u32  %r3, [%rd8+49052]; 
 ld.global.nc.u32  %r4, [%rd6+49052]; 
 add.s32  %r5, %r3, %r4; 
 cvta.to.global.u64  %rd9, %rd1; 
 mul.wide.s32  %rd10, %r1, 4; 
 add.s64  %rd11, %rd9, %rd10; 
 st.global.u32  [%rd11], %r5; 
 
BB0_2: 
 ret; 
} 
 
 // .globl fused_vision_non_max_suppression_kernel3 
.visible .entry fused_vision_non_max_suppression_kernel3( 
 .param .u64 fused_vision_non_max_suppression_kernel3_param_0, 
 .param .u64 fused_vision_non_max_suppression_kernel3_param_1, 
 .param .u64 fused_vision_non_max_suppression_kernel3_param_2, 
 .param .u64 fused_vision_non_max_suppression_kernel3_param_3 
) 
{ 
 .reg .pred  %p<2>; 
 .reg .f32  %f<7>; 
 .reg .b32  %r<10>; 
 .reg .b64  %rd<16>; 
 
 
 ld.param.u64  %rd1, [fused_vision_non_max_suppression_kernel3_param_0]; 
 ld.param.u64  %rd2, [fused_vision_non_max_suppression_kernel3_param_1]; 
 ld.param.u64  %rd3, [fused_vision_non_max_suppression_kernel3_param_2]; 
 ld.param.u64  %rd4, [fused_vision_non_max_suppression_kernel3_param_3]; 
 mov.u32  %r1, %ctaid.x; 
 shl.b32  %r4, %r1, 10; 
 mov.u32  %r2, %tid.x; 
 add.s32  %r3, %r4, %r2; 
 setp.gt.s32 %p1, %r3, 12263; 
 @%p1 bra  BB1_2; 
 
 cvta.to.global.u64  %rd5, %rd1; 
 cvta.to.global.u64  %rd6, %rd2; 
 cvta.to.global.u64  %rd7, %rd3; 
 shl.b32  %r5, %r2, 2; 
 shl.b32  %r6, %r1, 12; 
 add.s32  %r7, %r5, %r6; 
 mul.wide.s32  %rd8, %r7, 4; 
 add.s64  %rd9, %rd6, %rd8; 
 ld.global.nc.f32  %f1, [%rd9]; 
 mul.lo.s32  %r8, %r1, 6144; 
 mad.lo.s32  %r9, %r2, 6, %r8; 
 mul.wide.s32  %rd10, %r9, 4; 
 add.s64  %rd11, %rd5, %rd10; 
 ld.global.nc.f32  %f2, [%rd9+4]; 
 ld.global.nc.f32  %f3, [%rd9+8]; 
 ld.global.nc.f32  %f4, [%rd9+12]; 
 st.global.f32  [%rd11+8], %f1; 
 st.global.f32  [%rd11+12], %f2; 
 st.global.f32  [%rd11+16], %f3; 
 st.global.f32  [%rd11+20], %f4; 
 mul.wide.s32  %rd12, %r3, 4; 
 add.s64  %rd13, %rd7, %rd12; 
 ld.global.nc.f32  %f5, [%rd13]; 
 st.global.f32  [%rd11+4], %f5; 
 cvta.to.global.u64  %rd14, %rd4; 
 add.s64  %rd15, %rd14, %rd12; 
 ld.global.nc.f32  %f6, [%rd15]; 
 st.global.f32  [%rd11], %f6; 
 
BB1_2: 
 ret; 
} 
 
 // .globl fused_vision_get_valid_counts_kernel3 
.visible .entry fused_vision_get_valid_counts_kernel3( 
 .param .u64 fused_vision_get_valid_counts_kernel3_param_0, 
 .param .u64 fused_vision_get_valid_counts_kernel3_param_1, 
 .param .u64 fused_vision_get_valid_counts_kernel3_param_2, 
 .param .u64 fused_vision_get_valid_counts_kernel3_param_3, 
 .param .u64 fused_vision_get_valid_counts_kernel3_param_4 
) 
{ 
 .reg .pred  %p<3>; 
 .reg .f32  %f<7>; 
 .reg .b32  %r<10>; 
 .reg .b64  %rd<21>; 
 
 
 ld.param.u64  %rd1, [fused_vision_get_valid_counts_kernel3_param_0]; 
 ld.param.u64  %rd2, [fused_vision_get_valid_counts_kernel3_param_1]; 
 ld.param.u64  %rd3, [fused_vision_get_valid_counts_kernel3_param_2]; 
 ld.param.u64  %rd4, [fused_vision_get_valid_counts_kernel3_param_3]; 
 ld.param.u64  %rd5, [fused_vision_get_valid_counts_kernel3_param_4]; 
 mov.u32  %r1, %ctaid.x; 
 shl.b32  %r4, %r1, 10; 
 mov.u32  %r2, %tid.x; 
 add.s32  %r3, %r4, %r2; 
 setp.gt.s32 %p1, %r3, 12263; 
 @%p1 bra  BB2_3; 
 
 cvta.to.global.u64  %rd6, %rd1; 
 mul.wide.s32  %rd7, %r3, 4; 
 add.s64  %rd8, %rd6, %rd7; 
 ld.global.nc.u32  %r5, [%rd8]; 
 setp.lt.s32 %p2, %r5, 1; 
 @%p2 bra  BB2_3; 
 
 cvta.to.global.u64  %rd9, %rd2; 
 cvta.to.global.u64  %rd10, %rd3; 
 cvta.to.global.u64  %rd11, %rd5; 
 cvta.to.global.u64  %rd12, %rd4; 
 add.s64  %rd14, %rd12, %rd7; 
 mul.lo.s32  %r6, %r1, 6144; 
 mad.lo.s32  %r7, %r2, 6, %r6; 
 ld.global.nc.u32  %r8, [%rd14]; 
 mul.lo.s32  %r9, %r8, 6; 
 mul.wide.s32  %rd15, %r7, 4; 
 add.s64  %rd16, %rd10, %rd15; 
 ld.global.nc.f32  %f1, [%rd16]; 
 mul.wide.s32  %rd17, %r9, 4; 
 add.s64  %rd18, %rd9, %rd17; 
 ld.global.nc.f32  %f2, [%rd16+4]; 
 ld.global.nc.f32  %f3, [%rd16+8]; 
 ld.global.nc.f32  %f4, [%rd16+12]; 
 ld.global.nc.f32  %f5, [%rd16+16]; 
 ld.global.nc.f32  %f6, [%rd16+20]; 
 st.global.f32  [%rd18], %f1; 
 st.global.f32  [%rd18+4], %f2; 
 st.global.f32  [%rd18+8], %f3; 
 st.global.f32  [%rd18+12], %f4; 
 st.global.f32  [%rd18+16], %f5; 
 st.global.f32  [%rd18+20], %f6; 
 mul.wide.s32  %rd19, %r8, 4; 
 add.s64  %rd20, %rd11, %rd19; 
 st.global.u32  [%rd20], %r3; 
 
BB2_3: 
 ret; 
} 
 
 // .globl fused_vision_non_max_suppression_kernel1 
.visible .entry fused_vision_non_max_suppression_kernel1( 
 .param .u64 fused_vision_non_max_suppression_kernel1_param_0, 
 .param .u64 fused_vision_non_max_suppression_kernel1_param_1, 
 .pa

I could run the models compiled with neo-ai release 1.7 or earlier. I tried to update the neo-ai-dlr runtime version but the error still exists. How can I run the new SageMaker Neo compiled models? Should I change the compiler_options parameters?
Also it would be great, if you could mention the regions that use neo-ai release 1.7.

Thank you in advance.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant