Consider the following matmul example, following the KernelAbstractions documentation:
using KernelAbstractions
using Random
# using CUDA, CUDA.CUDAKernels
using oneAPI

@kernel function matmul_kernel!(output, a, b)
    i, j = @index(Global, NTuple)
    # creating a temporary sum variable for matrix multiplication
    tmp_sum = zero(eltype(output))
    for k in 1:size(a)[2]
        tmp_sum += a[i, k] * b[k, j]
    end
    output[i, j] = tmp_sum
end

# Creating a wrapper kernel for launching with error checks
function matmul!(output, a, b)
    if size(a)[2] != size(b)[1]
        println("Matrix size mismatch!")
        return nothing
    end
    backend = KernelAbstractions.get_backend(a)
    kernel! = matmul_kernel!(backend)
    kernel!(output, a, b, ndrange = size(output))
    return
end

backend = oneAPIBackend()
# backend = CPU()
# backend = CUDABackend()

a = randn!(allocate(backend, Float32, 256, 123));
b = randn!(allocate(backend, Float32, size(a, 2), 45));
output = KernelAbstractions.zeros(backend, Float32, size(a, 1), size(b, 2));

matmul!(output, a, b)
KernelAbstractions.synchronize(backend)
@show output ≈ a * b
@show output ≈ a * b
When I run this code on my laptop I get
output ≈ a * b = false
output ≈ a * b = true
[47667] signal 11 (1): Segmentation fault
in expression starting at none:0
_ZN3NEO13DrmAllocation15makeBOsResidentEPNS_9OsContextEjPSt6vectorIPNS_12BufferObjectESaIS5_EEb at /home/mose/.julia/artifacts/df06a45fdfc25a70826c358bdd37c510002509f2/lib/libze_intel_gpu.so.1 (unknown line)
_ZN3NEO24DrmCommandStreamReceiverINS_15XeHpgCoreFamilyEE16processResidencyERKSt6vectorIPNS_18GraphicsAllocationESaIS5_EEj at /home/mose/.julia/artifacts/df06a45fdfc25a70826c358bdd37c510002509f2/lib/libze_intel_gpu.so.1 (unknown line)
_ZN3NEO24DrmCommandStreamReceiverINS_15XeHpgCoreFamilyEE13flushInternalERKNS_11BatchBufferERKSt6vectorIPNS_18GraphicsAllocationESaIS8_EE at /home/mose/.julia/artifacts/df06a45fdfc25a70826c358bdd37c510002509f2/lib/libze_intel_gpu.so.1 (unknown line)
_ZN3NEO24DrmCommandStreamReceiverINS_15XeHpgCoreFamilyEE5flushERNS_11BatchBufferERSt6vectorIPNS_18GraphicsAllocationESaIS7_EE at /home/mose/.julia/artifacts/df06a45fdfc25a70826c358bdd37c510002509f2/lib/libze_intel_gpu.so.1 (unknown line)
_ZN3NEO21CommandStreamReceiver17submitBatchBufferERNS_11BatchBufferERSt6vectorIPNS_18GraphicsAllocationESaIS5_EE at /home/mose/.julia/artifacts/df06a45fdfc25a70826c358bdd37c510002509f2/lib/libze_intel_gpu.so.1 (unknown line)
_ZN2L015CommandQueueImp17submitBatchBufferEmRSt6vectorIPN3NEO18GraphicsAllocationESaIS4_EEPvb at /home/mose/.julia/artifacts/df06a45fdfc25a70826c358bdd37c510002509f2/lib/libze_intel_gpu.so.1 (unknown line)
_ZN2L014CommandQueueHwIL14GFXCORE_FAMILY3079EE26executeCommandListsRegularERNS2_27CommandListExecutionContextEjPP25_ze_command_list_handle_tP18_ze_fence_handle_tPN3NEO12LinearStreamE at /home/mose/.julia/artifacts/df06a45fdfc25a70826c358bdd37c510002509f2/lib/libze_intel_gpu.so.1 (unknown line)
_ZN2L014CommandQueueHwIL14GFXCORE_FAMILY3079EE19executeCommandListsEjPP25_ze_command_list_handle_tP18_ze_fence_handle_tbPN3NEO12LinearStreamE at /home/mose/.julia/artifacts/df06a45fdfc25a70826c358bdd37c510002509f2/lib/libze_intel_gpu.so.1 (unknown line)
_ZN25ur_queue_handle_legacy_t_18executeCommandListENSt8__detail14_Node_iteratorISt4pairIKP25_ze_command_list_handle_t22ur_command_list_info_tELb0ELb0EEEbb at /home/mose/.julia/artifacts/4a18fb08eef19534a609707234107ea4ab173392/lib/libur_adapter_level_zero.so.0 (unknown line)
_ZN25ur_queue_handle_legacy_t_12queueReleaseEv at /home/mose/.julia/artifacts/4a18fb08eef19534a609707234107ea4ab173392/lib/libur_adapter_level_zero.so.0 (unknown line)
urQueueRelease at /home/mose/.julia/artifacts/4a18fb08eef19534a609707234107ea4ab173392/lib/./libur_loader.so.0 (unknown line)
_ZN4sycl3_V16detail10queue_implD2Ev at /home/mose/.julia/artifacts/4a18fb08eef19534a609707234107ea4ab173392/lib/libsycl.so.8 (unknown line)
_M_release at /opt/x86_64-linux-gnu/x86_64-linux-gnu/include/c++/8.1.0/bits/shared_ptr_base.h:161 [inlined]
~__shared_count at /opt/x86_64-linux-gnu/x86_64-linux-gnu/include/c++/8.1.0/bits/shared_ptr_base.h:712 [inlined]
~__shared_ptr at /opt/x86_64-linux-gnu/x86_64-linux-gnu/include/c++/8.1.0/bits/shared_ptr_base.h:1151 [inlined]
~shared_ptr at /opt/x86_64-linux-gnu/x86_64-linux-gnu/include/c++/8.1.0/bits/shared_ptr.h:103 [inlined]
~queue at /workspace/x86_64-linux-gnu-libgfortran5-cxx11/destdir/include/sycl/queue.hpp:110 [inlined]
~syclQueue_st at /workspace/srcdir/oneAPI.jl/deps/src/sycl.hpp:19 [inlined]
syclQueueDestroy at /workspace/srcdir/oneAPI.jl/deps/src/sycl.cpp:75
syclQueueDestroy at /home/mose/.julia/packages/oneAPI/CNvkW/lib/support/liboneapi_support.jl:58 [inlined]
#7 at /home/mose/.julia/packages/oneAPI/CNvkW/lib/sycl/SYCL.jl:74
unknown function (ip: 0x7f27693f9092)
run_finalizer at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/src/gc.c:299
jl_gc_run_finalizers_in_list at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/src/gc.c:389
run_finalizers at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/src/gc.c:435
ijl_atexit_hook at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/src/init.c:299
jl_repl_entrypoint at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/src/jlapi.c:1060
main at /cache/build/builder-demeter6-3/julialang/julia-release-1-dot-11/cli/loader_exe.c:58
unknown function (ip: 0x7f277059fd67)
__libc_start_main at /lib/x86_64-linux-gnu/libc.so.6 (unknown line)
unknown function (ip: 0x4010b8)
Allocations: 28785806 (Pool: 28784960; Big: 846); GC: 19
This is quite reproducible: almost every time, the first ≈ check comes out false while the second one comes out true, and then Julia segfaults at exit. There is no problem when using the CUDA backend of KernelAbstractions.
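The same comparison can also be done entirely on the host, which takes the on-device ≈ check out of the picture; a minimal sketch, assuming the arrays are small enough to copy back with Array:

output_h = Array(output)         # copy the device result to the host
output_h ≈ Array(a) * Array(b)   # reference product computed on the CPU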
CC: @vchuravy.

For the record, this is probably related to #445, and changing the code to set SYCL_PI_LEVEL_ZERO_BATCH_SIZE=1 solves the issues for me (both the inconsistent ≈ and the segfault, yay), for example replacing
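A minimal sketch of one way to set the variable, assuming it only needs to be in the process environment before oneAPI initializes the Level Zero plugin:

# Assumption: the batching setting is read from the environment at
# initialization, so it has to be set before oneAPI is loaded
ENV["SYCL_PI_LEVEL_ZERO_BATCH_SIZE"] = "1"

using oneAPI
backend = oneAPIBackend()

Exporting the variable in the shell before launching Julia (SYCL_PI_LEVEL_ZERO_BATCH_SIZE=1 julia) should work just as well.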