Skip to content

Commit 72d65c1

Browse files
committed
Using hipStream to improve performance
1 parent e544ca5 commit 72d65c1

File tree

3 files changed

+73
-73
lines changed

3 files changed

+73
-73
lines changed

FastChwHwcConverterROCm.hpp

Lines changed: 34 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -552,26 +552,26 @@ inline void hwc2chw_rocm(
552552
const size_t input_size = pixel_size * sizeof(uint8_t);
553553
const size_t output_size = pixel_size * sizeof(float);
554554

555-
void* rocm_input_memory = nullptr;
556-
void* rocm_output_memory = nullptr;
555+
hipDeviceptr_t rocm_input_memory = 0;
556+
hipDeviceptr_t rocm_output_memory = 0;
557557

558558
// Allocate host-pinned memory
559-
hipError_t hipRes0 = hipHostMalloc(&rocm_input_memory, input_size, 0);
560-
hipError_t hipRes1 = hipHostMalloc(&rocm_output_memory, output_size, 0);
559+
hipError_t hipRes0 = hipMallocAsync(&rocm_input_memory, input_size, rocmstream);
560+
hipError_t hipRes1 = hipMallocAsync(&rocm_output_memory, output_size, rocmstream);
561561

562562
if (hipRes0 != 0 || hipRes1 != 0) {
563-
hipHostFree(rocm_input_memory);
564-
hipHostFree(rocm_output_memory);
563+
hipFreeAsync(rocm_input_memory, rocmstream);
564+
hipFreeAsync(rocm_output_memory, rocmstream);
565565
hwc2chw<uint8_t, float>(h, w, c, src, dst, alpha);
566566
return;
567567
}
568568

569569
// Copy host memory to device memory
570-
hipError_t hipRes2 = hipMemcpyAsync(rocm_input_memory, src, input_size, hipMemcpyKind::hipMemcpyHostToDevice, rocmstream);
570+
hipError_t hipRes2 = hipMemcpyHtoDAsync(rocm_input_memory, src, input_size, rocmstream);
571571

572572
if (hipRes2 != 0) {
573-
hipHostFree(rocm_input_memory);
574-
hipHostFree(rocm_output_memory);
573+
hipFreeAsync(rocm_input_memory, rocmstream);
574+
hipFreeAsync(rocm_output_memory, rocmstream);
575575
hwc2chw<uint8_t, float>(h, w, c, src, dst, alpha);
576576
return;
577577
}
@@ -593,25 +593,25 @@ inline void hwc2chw_rocm(
593593
0, rocmstream, args, nullptr);
594594

595595
if (hipRes3 != 0) {
596-
hipHostFree(rocm_input_memory);
597-
hipHostFree(rocm_output_memory);
596+
hipFreeAsync(rocm_input_memory, rocmstream);
597+
hipFreeAsync(rocm_output_memory, rocmstream);
598598
hwc2chw<uint8_t, float>(h, w, c, src, dst, alpha);
599599
return;
600600
}
601601

602602
// Copy device memory to host memory
603-
hipError_t hipRes5 = hipMemcpyAsync(dst, rocm_output_memory, output_size, hipMemcpyKind::hipMemcpyDeviceToHost, rocmstream);
603+
hipError_t hipRes5 = hipMemcpyDtoHAsync(dst, rocm_output_memory, output_size, rocmstream);
604604

605605
if (hipRes5 != 0) {
606-
hipHostFree(rocm_input_memory);
607-
hipHostFree(rocm_output_memory);
606+
hipFreeAsync(rocm_input_memory, rocmstream);
607+
hipFreeAsync(rocm_output_memory, rocmstream);
608608
hwc2chw<uint8_t, float>(h, w, c, src, dst, alpha);
609609
return;
610610
}
611611

612612
// Free memory
613-
hipHostFree(rocm_input_memory);
614-
hipHostFree(rocm_output_memory);
613+
hipFreeAsync(rocm_input_memory, rocmstream);
614+
hipFreeAsync(rocm_output_memory, rocmstream);
615615

616616
// Stream synchronization
617617
hipError_t hipRes4 = hipStreamSynchronize(rocmstream);
@@ -645,25 +645,25 @@ inline void chw2hwc_rocm(
645645
const size_t pixel_size = h * w * c;
646646
size_t input_size = pixel_size * sizeof(float);
647647
size_t output_size = pixel_size * sizeof(uint8_t);
648-
void* rocm_input_memory = nullptr;
649-
void* rocm_output_memory = nullptr;
648+
hipDeviceptr_t rocm_input_memory = 0;
649+
hipDeviceptr_t rocm_output_memory = 0;
650650

651651
// Allocate device memory
652-
hipError_t hipRes0 = hipHostMalloc(&rocm_input_memory, input_size, 0);
653-
hipError_t hipRes1 = hipHostMalloc(&rocm_output_memory, output_size, 0);
652+
hipError_t hipRes0 = hipMallocAsync(&rocm_input_memory, input_size, rocmstream);
653+
hipError_t hipRes1 = hipMallocAsync(&rocm_output_memory, output_size, rocmstream);
654654

655655
if (hipRes0 != 0 || hipRes1 != 0) {
656-
hipHostFree(rocm_input_memory);
657-
hipHostFree(rocm_output_memory);
656+
hipFreeAsync(rocm_input_memory, rocmstream);
657+
hipFreeAsync(rocm_output_memory, rocmstream);
658658
chw2hwc<float, uint8_t>(h, w, c, src, dst, alpha); return;
659659
}
660660

661661
// Copy host memory to device memory
662-
hipError_t hipRes2 = hipMemcpyAsync(rocm_input_memory, src, input_size, hipMemcpyKind::hipMemcpyHostToDevice, rocmstream);
662+
hipError_t hipRes2 = hipMemcpyHtoDAsync(rocm_input_memory, src, input_size, rocmstream);
663663

664664
if (hipRes2 != 0) {
665-
hipHostFree(rocm_input_memory);
666-
hipHostFree(rocm_output_memory);
665+
hipFreeAsync(rocm_input_memory, rocmstream);
666+
hipFreeAsync(rocm_output_memory, rocmstream);
667667
chw2hwc<float, uint8_t>(h, w, c, src, dst, alpha); return;
668668
}
669669

@@ -686,23 +686,23 @@ inline void chw2hwc_rocm(
686686
0, rocmstream, args, nullptr);
687687

688688
if (hipRes3 != 0) {
689-
hipHostFree(rocm_input_memory);
690-
hipHostFree(rocm_output_memory);
689+
hipFreeAsync(rocm_input_memory, rocmstream);
690+
hipFreeAsync(rocm_output_memory, rocmstream);
691691
chw2hwc<float, uint8_t>(h, w, c, src, dst, alpha); return;
692692
}
693693

694694
// Copy device memory to host memory
695-
hipError_t hipRes5 = hipMemcpyAsync(dst, rocm_output_memory, output_size, hipMemcpyKind::hipMemcpyDeviceToHost, rocmstream);
695+
hipError_t hipRes5 = hipMemcpyDtoHAsync(dst, rocm_output_memory, output_size, rocmstream);
696696

697697
if (hipRes5 != 0) {
698-
hipHostFree(rocm_input_memory);
699-
hipHostFree(rocm_output_memory);
698+
hipFreeAsync(rocm_input_memory, rocmstream);
699+
hipFreeAsync(rocm_output_memory, rocmstream);
700700
chw2hwc<float, uint8_t>(h, w, c, src, dst, alpha); return;
701701
}
702702

703703
// Free memory
704-
hipHostFree(rocm_input_memory);
705-
hipHostFree(rocm_output_memory);
704+
hipFreeAsync(rocm_input_memory, rocmstream);
705+
hipFreeAsync(rocm_output_memory, rocmstream);
706706

707707
hipError_t hipRes4 = hipStreamSynchronize(rocmstream);
708708

@@ -725,7 +725,7 @@ inline void chw2hwc_rocm(
725725
*/
726726
inline void hwc2chw_rocm(
727727
const size_t h, const size_t w, const size_t c,
728-
void* src, void* dst,
728+
hipDeviceptr_t src, hipDeviceptr_t dst,
729729
const float alpha = 1.f / 255.f) {
730730

731731
const size_t pixel_size = h * w * c;
@@ -769,7 +769,7 @@ inline void hwc2chw_rocm(
769769
*/
770770
inline void chw2hwc_rocm(
771771
const size_t c, const size_t h, const size_t w,
772-
void* src, void* dst,
772+
hipDeviceptr_t src, hipDeviceptr_t dst,
773773
const uint8_t alpha = 255.0f) {
774774

775775
const unsigned int blockDimX = 32, blockDimY = 32, blockDimZ = 1;

README.md

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -338,33 +338,33 @@ The table below shows the benchmark performance timing for different image dimen
338338

339339
GPU(ROCm): AMD Radeon RX 6900 XT
340340

341-
| | | | CPU | CPU |CPU(OpenMP)|CPU(OpenMP)| CUDA | CUDA | ROCm | ROCm |
342-
|-------|--------|---------|----------|----------|-----------|-----------|----------|----------|----------|---------|
343-
| W | H | C | hwc2chw | chw2hwc | hwc2chw | chw2hwc | hwc2chw | chw2hwc | hwc2chw | chw2hwc |
344-
| 426 | 240 | 1 | 0.097ms | 0.110ms | 0.113ms | 0.030ms | 0.006ms | 0.006ms | 0.119ms | 0.052ms |
345-
| 426 | 240 | 3 | 0.331ms | 0.314ms | 0.061ms | 0.068ms | 0.008ms | 0.007ms | 0.118ms | 0.055ms |
346-
| 426 | 240 | 4 | 0.439ms | 0.415ms | 0.082ms | 0.082ms | 0.010ms | 0.010ms | 0.113ms | 0.057ms |
347-
| 640 | 360 | 1 | 0.217ms | 0.236ms | 0.048ms | 0.052ms | 0.010ms | 0.010ms | 0.111ms | 0.051ms |
348-
| 640 | 360 | 3 | 0.743ms | 0.705ms | 0.147ms | 0.140ms | 0.013ms | 0.015ms | 0.129ms | 0.055ms |
349-
| 640 | 360 | 4 | 0.881ms | 0.921ms | 0.219ms | 0.203ms | 0.015ms | 0.016ms | 0.121ms | 0.057ms |
350-
| 854 | 480 | 1 | 0.393ms | 0.415ms | 0.094ms | 0.089ms | 0.009ms | 0.008ms | 0.282ms | 0.050ms |
351-
| 854 | 480 | 3 | 1.328ms | 1.269ms | 0.250ms | 0.232ms | 0.019ms | 0.020ms | 0.325ms | 0.059ms |
352-
| 854 | 480 | 4 | 1.717ms | 1.670ms | 0.263ms | 0.262ms | 0.024ms | 0.025ms | 0.334ms | 0.065ms |
353-
| 1280 | 720 | 1 | 0.873ms | 0.937ms | 0.130ms | 0.180ms | 0.019ms | 0.016ms | 0.301ms | 0.053ms |
354-
| 1280 | 720 | 3 | 2.877ms | 2.828ms | 0.449ms | 0.457ms | 0.039ms | 0.039ms | 0.390ms | 0.062ms |
355-
| 1280 | 720 | 4 | 3.558ms | 3.848ms | 0.719ms | 0.616ms | 0.049ms | 0.051ms | 0.336ms | 0.063ms |
356-
| 1920 | 1080 | 1 | 1.949ms | 2.136ms | 0.374ms | 0.342ms | 0.036ms | 0.032ms | 0.616ms | 0.055ms |
357-
| 1920 | 1080 | 3 | 6.587ms | 6.469ms | 1.000ms | 0.672ms | 0.081ms | 0.084ms | 0.827ms | 0.065ms |
358-
| 1920 | 1080 | 4 | 8.144ms | 8.615ms | 0.832ms | 0.914ms | 0.106ms | 0.109ms | 0.697ms | 0.075ms |
359-
| 2560 | 1440 | 1 | 3.530ms | 3.800ms | 0.423ms | 0.476ms | 0.061ms | 0.056ms | 1.060ms | 0.067ms |
360-
| 2560 | 1440 | 3 | 11.470ms | 11.611ms | 1.323ms | 1.169ms | 0.141ms | 0.144ms | 1.444ms | 0.085ms |
361-
| 2560 | 1440 | 4 | 14.139ms | 15.273ms | 2.391ms | 2.567ms | 0.188ms | 0.191ms | 1.279ms | 0.087ms |
362-
| 3840 | 2160 | 1 | 7.976ms | 8.494ms | 1.103ms | 1.387ms | 0.134ms | 0.122ms | 2.317ms | 0.097ms |
363-
| 3840 | 2160 | 3 | 26.299ms | 25.824ms | 5.339ms | 4.438ms | 0.318ms | 0.329ms | 3.296ms | 0.086ms |
364-
| 3840 | 2160 | 4 | 32.941ms | 34.718ms | 5.805ms | 4.514ms | 0.421ms | 0.427ms | 2.950ms | 0.116ms |
365-
| 7680 | 4320 | 1 | 31.536ms | 34.100ms | 5.742ms | 4.976ms | 0.527ms | 0.476ms | 9.400ms | 0.215ms |
366-
| 7680 | 4320 | 3 | 102.875ms| 102.419ms| 19.261ms | 17.294ms | 1.252ms | 1.290ms | 13.089ms | 0.223ms |
367-
| 7680 | 4320 | 4 | 133.081ms| 136.308ms| 23.398ms | 18.445ms | 1.670ms | 1.688ms | 11.529ms | 0.218ms |
341+
| | | | CPU | CPU |CPU(OpenMP)|CPU(OpenMP)| CUDA | CUDA | ROCm | ROCm |
342+
|-------|-----|----|----------|----------|-----------|-----------|-----------|---------|---------|---------|
343+
| W | H | C | hwc2chw | chw2hwc | hwc2chw | chw2hwc | hwc2chw | chw2hwc | hwc2chw | chw2hwc |
344+
| 426 | 240 | 1 | 0.097ms | 0.110ms | 0.113ms | 0.030ms | 0.022ms | 0.019ms | 0.059ms | 0.053ms |
345+
| 426 | 240 | 3 | 0.331ms | 0.314ms | 0.061ms | 0.068ms | 0.022ms | 0.019ms | 0.062ms | 0.059ms |
346+
| 426 | 240 | 4 | 0.439ms | 0.415ms | 0.082ms | 0.082ms | 0.020ms | 0.019ms | 0.062ms | 0.061ms |
347+
| 640 | 360 | 1 | 0.217ms | 0.236ms | 0.048ms | 0.052ms | 0.022ms | 0.021ms | 0.062ms | 0.061ms |
348+
| 640 | 360 | 3 | 0.743ms | 0.705ms | 0.147ms | 0.140ms | 0.036ms | 0.021ms | 0.060ms | 0.059ms |
349+
| 640 | 360 | 4 | 0.881ms | 0.921ms | 0.219ms | 0.203ms | 0.025ms | 0.021ms | 0.057ms | 0.053ms |
350+
| 854 | 480 | 1 | 0.393ms | 0.415ms | 0.094ms | 0.089ms | 0.025ms | 0.024ms | 0.063ms | 0.060ms |
351+
| 854 | 480 | 3 | 1.328ms | 1.269ms | 0.250ms | 0.232ms | 0.029ms | 0.024ms | 0.052ms | 0.052ms |
352+
| 854 | 480 | 4 | 1.717ms | 1.670ms | 0.263ms | 0.262ms | 0.034ms | 0.027ms | 0.054ms | 0.051ms |
353+
| 1280 | 720 | 1 | 0.873ms | 0.937ms | 0.130ms | 0.180ms | 0.053ms | 0.040ms | 0.060ms | 0.052ms |
354+
| 1280 | 720 | 3 | 2.877ms | 2.828ms | 0.449ms | 0.457ms | 0.052ms | 0.042ms | 0.061ms | 0.056ms |
355+
| 1280 | 720 | 4 | 3.558ms | 3.848ms | 0.719ms | 0.616ms | 0.054ms | 0.045ms | 0.062ms | 0.056ms |
356+
| 1920 | 1080 | 1 | 1.949ms | 2.136ms | 0.374ms | 0.342ms | 0.081ms | 0.067ms | 0.079ms | 0.060ms |
357+
| 1920 | 1080 | 3 | 6.587ms | 6.469ms | 1.000ms | 0.672ms | 0.087ms | 0.074ms | 0.080ms | 0.064ms |
358+
| 1920 | 1080 | 4 | 8.144ms | 8.615ms | 0.832ms | 0.914ms | 0.103ms | 0.080ms | 0.077ms | 0.057ms |
359+
| 2560 | 1440 | 1 | 3.530ms | 3.800ms | 0.423ms | 0.476ms | 0.114ms | 0.116ms | 0.094ms | 0.074ms |
360+
| 2560 | 1440 | 3 | 11.470ms | 11.611ms | 1.323ms | 1.169ms | 0.142ms | 0.127ms | 0.089ms | 0.070ms |
361+
| 2560 | 1440 | 4 | 14.139ms | 15.273ms | 2.391ms | 2.567ms | 0.154ms | 0.136ms | 0.094ms | 0.075ms |
362+
| 3840 | 2160 | 1 | 7.976ms | 8.494ms | 1.103ms | 1.387ms | 0.234ms | 0.227ms | 0.129ms | 0.097ms |
363+
| 3840 | 2160 | 3 | 26.299ms | 25.824ms | 5.339ms | 4.438ms | 0.307ms | 0.253ms | 0.132ms | 0.096ms |
364+
| 3840 | 2160 | 4 | 32.941ms | 34.718ms | 5.805ms | 4.514ms | 0.323ms | 0.272ms | 0.131ms | 0.097ms |
365+
| 7680 | 4320 | 1 | 31.536ms | 34.100ms | 5.742ms | 4.976ms | 0.836ms | 0.741ms | 0.484ms | 0.214ms |
366+
| 7680 | 4320 | 3 | 102.875ms| 102.419ms| 19.261ms | 17.294ms | 1.057ms | 0.890ms | 0.621ms | 0.222ms |
367+
| 7680 | 4320 | 4 | 133.081ms| 136.308ms| 23.398ms | 18.445ms | 1.144ms | 1.013ms | 0.686ms | 0.220ms |
368368

369369
## Contact
370370
For any questions or suggestions, please open an issue or contact the me.

test/rocm_benchmark.cpp

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -42,15 +42,15 @@ int main() {
4242
//std::vector<uint8_t> out_uint8(pixel_size); // Inference output data(hwc)
4343

4444
// 2. device memory
45-
void* src_uint8 = 0;
46-
void* src_float = 0;
47-
hipHostMalloc(&src_uint8, pixel_size * sizeof(uint8_t), 0);
48-
hipHostMalloc(&src_float, pixel_size * sizeof(float), 0);
45+
hipDeviceptr_t src_uint8 = 0;
46+
hipDeviceptr_t src_float = 0;
47+
hipMalloc(&src_uint8, pixel_size * sizeof(uint8_t));
48+
hipMalloc(&src_float, pixel_size * sizeof(float));
4949

50-
void* out_float = 0;
51-
void* out_uint8 = 0;
52-
hipHostMalloc(&out_float, pixel_size * sizeof(float), 0);
53-
hipHostMalloc(&out_uint8, pixel_size * sizeof(uint8_t), 0);
50+
hipDeviceptr_t out_float = 0;
51+
hipDeviceptr_t out_uint8 = 0;
52+
hipMalloc(&out_float, pixel_size * sizeof(float));
53+
hipMalloc(&out_uint8, pixel_size * sizeof(uint8_t));
5454

5555
auto startTime = std::chrono::high_resolution_clock::now();
5656
for (size_t i = 0; i < TEST_COUNT; ++i) {
@@ -79,10 +79,10 @@ int main() {
7979
auto chw2hwcDuration = std::chrono::duration_cast<std::chrono::microseconds>(endTime - startTime) / double(TEST_COUNT);
8080

8181
// 2. device memory
82-
hipHostFree(src_uint8);
83-
hipHostFree(src_float);
84-
hipHostFree(out_float);
85-
hipHostFree(out_uint8);
82+
hipFree(src_uint8);
83+
hipFree(src_float);
84+
hipFree(out_float);
85+
hipFree(out_uint8);
8686

8787
std::cout << width << ",\t" << height << ",\t" << channel << ",\t"
8888
<< std::fixed << std::setprecision(3)

0 commit comments

Comments
 (0)