@@ -42,7 +42,10 @@ Hip_CannySobel_U16_U8_3x3_L1NORM(uint dstWidth, uint dstHeight,
42
42
{ // load 136x18 bytes into local memory using 16x16 workgroup
43
43
int loffset = ly * 136 + (lx << 3 );
44
44
int goffset = (y - 1 ) * srcImageStrideInBytes + x - 4 ;
45
- *((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
45
+ if (goffset >= 0 ) {
46
+ *((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
47
+ }
48
+
46
49
bool doExtraLoad = false ;
47
50
if (ly < 2 ) {
48
51
loffset += 16 * 136 ;
@@ -54,7 +57,7 @@ Hip_CannySobel_U16_U8_3x3_L1NORM(uint dstWidth, uint dstHeight,
54
57
goffset = (y - ly + id - 1 ) * srcImageStrideInBytes + (((x >> 3 ) - lx) << 3 ) + 124 ;
55
58
doExtraLoad = (id < 18 ) ? true : false ;
56
59
}
57
- if (doExtraLoad) {
60
+ if (doExtraLoad && goffset >= 0 ) {
58
61
*((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
59
62
}
60
63
__syncthreads ();
@@ -267,7 +270,9 @@ Hip_CannySobel_U16_U8_5x5_L1NORM(uint dstWidth, uint dstHeight,
267
270
{ // load 136x20 bytes into local memory using 16x16 workgroup
268
271
int loffset = ly * 136 + (lx << 3 );
269
272
int goffset = (y - 2 ) * srcImageStrideInBytes + x - 4 ;
270
- *((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
273
+ if (goffset >= 0 ) {
274
+ *((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
275
+ }
271
276
bool doExtraLoad = false ;
272
277
if (ly < 4 ) {
273
278
loffset += 16 * 136 ;
@@ -279,7 +284,7 @@ Hip_CannySobel_U16_U8_5x5_L1NORM(uint dstWidth, uint dstHeight,
279
284
goffset = (y - ly + id - 2 ) * srcImageStrideInBytes + (((x >> 3 ) - lx) << 3 ) + 124 ;
280
285
doExtraLoad = (id < 20 ) ? true : false ;
281
286
}
282
- if (doExtraLoad) {
287
+ if (doExtraLoad && goffset >= 0 ) {
283
288
*((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
284
289
}
285
290
__syncthreads ();
@@ -759,7 +764,9 @@ Hip_CannySobel_U16_U8_7x7_L1NORM(uint dstWidth, uint dstHeight,
759
764
{ // load 136x22 bytes into local memory using 16x16 workgroup
760
765
int loffset = ly * 136 + (lx << 3 );
761
766
int goffset = (y - 3 ) * srcImageStrideInBytes + x - 4 ;
762
- *((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
767
+ if (goffset >= 0 ) {
768
+ *((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
769
+ }
763
770
bool doExtraLoad = false ;
764
771
if (ly < 6 ) {
765
772
loffset += 16 * 136 ;
@@ -771,7 +778,7 @@ Hip_CannySobel_U16_U8_7x7_L1NORM(uint dstWidth, uint dstHeight,
771
778
goffset = (y - ly + id - 3 ) * srcImageStrideInBytes + (((x >> 3 ) - lx) << 3 ) + 124 ;
772
779
doExtraLoad = (id < 22 ) ? true : false ;
773
780
}
774
- if (doExtraLoad) {
781
+ if (doExtraLoad && goffset >= 0 ) {
775
782
*((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
776
783
}
777
784
__syncthreads ();
@@ -1646,7 +1653,9 @@ Hip_CannySobel_U16_U8_3x3_L2NORM(uint dstWidth, uint dstHeight,
1646
1653
{ // load 136x18 bytes into local memory using 16x16 workgroup
1647
1654
int loffset = ly * 136 + (lx << 3 );
1648
1655
int goffset = (y - 1 ) * srcImageStrideInBytes + x - 4 ;
1649
- *((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
1656
+ if (goffset >= 0 ) {
1657
+ *((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
1658
+ }
1650
1659
bool doExtraLoad = false ;
1651
1660
if (ly < 2 ) {
1652
1661
loffset += 16 * 136 ;
@@ -1658,7 +1667,7 @@ Hip_CannySobel_U16_U8_3x3_L2NORM(uint dstWidth, uint dstHeight,
1658
1667
goffset = (y - ly + id - 1 ) * srcImageStrideInBytes + (((x >> 3 ) - lx) << 3 ) + 124 ;
1659
1668
doExtraLoad = (id < 18 ) ? true : false ;
1660
1669
}
1661
- if (doExtraLoad) {
1670
+ if (doExtraLoad && goffset >= 0 ) {
1662
1671
*((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
1663
1672
}
1664
1673
__syncthreads ();
@@ -1871,7 +1880,9 @@ Hip_CannySobel_U16_U8_5x5_L2NORM(uint dstWidth, uint dstHeight,
1871
1880
{ // load 136x20 bytes into local memory using 16x16 workgroup
1872
1881
int loffset = ly * 136 + (lx << 3 );
1873
1882
int goffset = (y - 2 ) * srcImageStrideInBytes + x - 4 ;
1874
- *((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
1883
+ if (goffset >= 0 ) {
1884
+ *((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
1885
+ }
1875
1886
bool doExtraLoad = false ;
1876
1887
if (ly < 4 ) {
1877
1888
loffset += 16 * 136 ;
@@ -1883,7 +1894,7 @@ Hip_CannySobel_U16_U8_5x5_L2NORM(uint dstWidth, uint dstHeight,
1883
1894
goffset = (y - ly + id - 2 ) * srcImageStrideInBytes + (((x >> 3 ) - lx) << 3 ) + 124 ;
1884
1895
doExtraLoad = (id < 20 ) ? true : false ;
1885
1896
}
1886
- if (doExtraLoad) {
1897
+ if (doExtraLoad && goffset >= 0 ) {
1887
1898
*((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
1888
1899
}
1889
1900
__syncthreads ();
@@ -2361,7 +2372,9 @@ Hip_CannySobel_U16_U8_7x7_L2NORM(uint dstWidth, uint dstHeight,
2361
2372
{ // load 136x22 bytes into local memory using 16x16 workgroup
2362
2373
int loffset = ly * 136 + (lx << 3 );
2363
2374
int goffset = (y - 3 ) * srcImageStrideInBytes + x - 4 ;
2364
- *((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
2375
+ if (goffset >= 0 ) {
2376
+ *((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
2377
+ }
2365
2378
bool doExtraLoad = false ;
2366
2379
if (ly < 6 ) {
2367
2380
loffset += 16 * 136 ;
@@ -2373,7 +2386,7 @@ Hip_CannySobel_U16_U8_7x7_L2NORM(uint dstWidth, uint dstHeight,
2373
2386
goffset = (y - ly + id - 3 ) * srcImageStrideInBytes + (((x >> 3 ) - lx) << 3 ) + 124 ;
2374
2387
doExtraLoad = (id < 22 ) ? true : false ;
2375
2388
}
2376
- if (doExtraLoad) {
2389
+ if (doExtraLoad && goffset >= 0 ) {
2377
2390
*((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
2378
2391
}
2379
2392
__syncthreads ();
@@ -3263,7 +3276,7 @@ Hip_CannySuppThreshold_U8XY_U16_3x3(uint dstWidth, uint dstHeight,
3263
3276
goffset = (y - ly + id - 1 ) * srcImageStrideInBytes + ((x - lx) << 3 ) + 124 ;
3264
3277
doExtraLoad = (id < 18 ) ? true : false ;
3265
3278
}
3266
- if (doExtraLoad) {
3279
+ if (doExtraLoad && goffset >= 0 ) {
3267
3280
*((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
3268
3281
}
3269
3282
__syncthreads ();
@@ -3715,7 +3728,9 @@ Hip_HarrisSobel_HG3_U8_3x3(uint dstWidth, uint dstHeight,
3715
3728
{ // load 136x18 bytes into local memory using 16x16 workgroup
3716
3729
int loffset = ly * 136 + (lx << 3 );
3717
3730
int goffset = (y - 1 ) * srcImageStrideInBytes + x - 4 ;
3718
- *((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
3731
+ if (goffset >= 0 ) {
3732
+ *((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
3733
+ }
3719
3734
bool doExtraLoad = false ;
3720
3735
if (ly < 2 ) {
3721
3736
loffset += 16 * 136 ;
@@ -3727,7 +3742,7 @@ Hip_HarrisSobel_HG3_U8_3x3(uint dstWidth, uint dstHeight,
3727
3742
goffset = (y - ly + id - 1 ) * srcImageStrideInBytes + (((x >> 3 ) - lx) << 3 ) + 124 ;
3728
3743
doExtraLoad = (id < 18 ) ? true : false ;
3729
3744
}
3730
- if (doExtraLoad) {
3745
+ if (doExtraLoad && goffset >= 0 ) {
3731
3746
*((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
3732
3747
}
3733
3748
__syncthreads ();
@@ -3933,7 +3948,9 @@ Hip_HarrisSobel_HG3_U8_5x5(uint dstWidth, uint dstHeight,
3933
3948
{ // load 136x20 bytes into local memory using 16x16 workgroup
3934
3949
int loffset = ly * 136 + (lx << 3 );
3935
3950
int goffset = (y - 2 ) * srcImageStrideInBytes + x - 4 ;
3936
- *((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
3951
+ if (goffset >= 0 ) {
3952
+ *((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
3953
+ }
3937
3954
bool doExtraLoad = false ;
3938
3955
if (ly < 4 ) {
3939
3956
loffset += 16 * 136 ;
@@ -3945,7 +3962,7 @@ Hip_HarrisSobel_HG3_U8_5x5(uint dstWidth, uint dstHeight,
3945
3962
goffset = (y - ly + id - 2 ) * srcImageStrideInBytes + (((x >> 3 ) - lx) << 3 ) + 124 ;
3946
3963
doExtraLoad = (id < 20 ) ? true : false ;
3947
3964
}
3948
- if (doExtraLoad) {
3965
+ if (doExtraLoad && goffset >= 0 ) {
3949
3966
*((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
3950
3967
}
3951
3968
__syncthreads ();
@@ -4412,7 +4429,9 @@ Hip_HarrisSobel_HG3_U8_7x7(uint dstWidth, uint dstHeight,
4412
4429
{ // load 136x22 bytes into local memory using 16x16 workgroup
4413
4430
int loffset = ly * 136 + (lx << 3 );
4414
4431
int goffset = (y - 3 ) * srcImageStrideInBytes + x - 4 ;
4415
- *((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
4432
+ if (goffset >= 0 ) {
4433
+ *((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
4434
+ }
4416
4435
bool doExtraLoad = false ;
4417
4436
if (ly < 6 ) {
4418
4437
loffset += 16 * 136 ;
@@ -4424,7 +4443,7 @@ Hip_HarrisSobel_HG3_U8_7x7(uint dstWidth, uint dstHeight,
4424
4443
goffset = (y - ly + id - 3 ) * srcImageStrideInBytes + (((x >> 3 ) - lx) << 3 ) + 124 ;
4425
4444
doExtraLoad = (id < 22 ) ? true : false ;
4426
4445
}
4427
- if (doExtraLoad) {
4446
+ if (doExtraLoad && goffset >= 0 ) {
4428
4447
*((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
4429
4448
}
4430
4449
__syncthreads ();
@@ -6297,7 +6316,9 @@ Hip_NonMaxSupp_XY_ANY_3x3(char *pDstList, uint dstListOffset, uint capacityOfLis
6297
6316
{ // load 136x18 bytes into local memory using 16x16 workgroup
6298
6317
int loffset = ly * 136 + (lx << 3 );
6299
6318
int goffset = (gy - 1 ) * srcImageStrideInBytes + (gx << 3 ) - 4 ;
6300
- *((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
6319
+ if (goffset >= 0 ) {
6320
+ *((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
6321
+ }
6301
6322
bool doExtraLoad = false ;
6302
6323
if (ly < 2 ) {
6303
6324
loffset += 16 * 136 ;
@@ -6309,7 +6330,7 @@ Hip_NonMaxSupp_XY_ANY_3x3(char *pDstList, uint dstListOffset, uint capacityOfLis
6309
6330
goffset = (gy - ly + id - 1 ) * srcImageStrideInBytes + ((gx - lx) << 3 ) + 124 ;
6310
6331
doExtraLoad = (id < 18 ) ? true : false ;
6311
6332
}
6312
- if (doExtraLoad) {
6333
+ if (doExtraLoad && goffset >= 0 ) {
6313
6334
*((uint2 *)(&lbuf[loffset])) = *((uint2 *)(&pSrcImage[goffset]));
6314
6335
}
6315
6336
__syncthreads ();
0 commit comments