diff --git a/Project.toml b/Project.toml
index 428914d297..312751ff53 100644
--- a/Project.toml
+++ b/Project.toml
@@ -41,12 +41,14 @@ demumble_jll = "1e29f10c-031c-5a83-9565-69cddfc27673"
 [weakdeps]
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
+FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
 SparseMatricesCSR = "a0a7dd2c-ebf4-11e9-1f05-cf50bc540ca1"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 
 [extensions]
 ChainRulesCoreExt = "ChainRulesCore"
 EnzymeCoreExt = "EnzymeCore"
+FillArraysExt = "FillArrays"
 SparseMatricesCSRExt = "SparseMatricesCSR"
 SpecialFunctionsExt = "SpecialFunctions"
 
@@ -64,6 +66,7 @@ Crayons = "4"
 DataFrames = "1.5"
 EnzymeCore = "0.8.2"
 ExprTools = "0.1"
+FillArrays = "1"
 GPUArrays = "11.2.4"
 GPUCompiler = "1.4"
 GPUToolbox = "0.3, 1"
@@ -94,5 +97,6 @@ julia = "1.10"
 [extras]
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
+FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
 SparseMatricesCSR = "a0a7dd2c-ebf4-11e9-1f05-cf50bc540ca1"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
diff --git a/ext/FillArraysExt.jl b/ext/FillArraysExt.jl
new file mode 100644
index 0000000000..8470d47e6f
--- /dev/null
+++ b/ext/FillArraysExt.jl
@@ -0,0 +1,71 @@
+module FillArraysExt
+
+using CUDA
+using CUDA.CUSPARSE
+using LinearAlgebra
+using SparseArrays
+using FillArrays
+
+# kron between a CuSparseMatrixCOO and a Diagonal backed by an AbstractFill.
+# The diagonal stores a single constant value, so it never has to be materialized on the GPU.
+function LinearAlgebra.kron(A::CUSPARSE.CuSparseMatrixCOO{T1, Ti}, B::Diagonal{T2, <:FillArrays.AbstractFill{T2}}) where {Ti, T1, T2}
+    T = promote_type(T1, T2)
+    mA, nA = size(A)
+    mB, nB = size(B)
+    out_shape = (mA * mB, nA * nB)
+    Annz = Int64(A.nnz)
+    Bnnz = nB
+
+    fill_value = FillArrays.getindex_value(B.diag)
+
+    # An empty operand or a zero fill value (e.g. a Zeros diagonal) gives an empty result
+    if Annz == 0 || Bnnz == 0 || iszero(fill_value)
+        return CUSPARSE.CuSparseMatrixCOO(CUDA.CuVector{Ti}(undef, 0), CUDA.CuVector{Ti}(undef, 0), CUDA.CuVector{T}(undef, 0), out_shape)
+    end
+
+    row = (A.rowInd .- 1) .* mB
+    row = repeat(row, inner = Bnnz)
+    col = (A.colInd .- 1) .* nB
+    col = repeat(col, inner = Bnnz)
+    data = repeat(convert(CUDA.CuVector{T}, A.nzVal), inner = Bnnz)
+
+    row .+= CUDA.CuVector(repeat(0:nB-1, outer = Annz)) .+ 1
+    col .+= CUDA.CuVector(repeat(0:nB-1, outer = Annz)) .+ 1
+
+    # Scale by the constant diagonal value; broadcasting promotes to T
+    data .*= fill_value
+
+    CUSPARSE.sparse(row, col, data, out_shape..., fmt = :coo)
+end
+
+# kron between a Diagonal backed by an AbstractFill and a CuSparseMatrixCOO.
+function LinearAlgebra.kron(A::Diagonal{T1, <:FillArrays.AbstractFill{T1}}, B::CUSPARSE.CuSparseMatrixCOO{T2, Ti}) where {Ti, T1, T2}
+    T = promote_type(T1, T2)
+    mA, nA = size(A)
+    mB, nB = size(B)
+    out_shape = (mA * mB, nA * nB)
+    Annz = nA
+    Bnnz = Int64(B.nnz)
+
+    fill_value = FillArrays.getindex_value(A.diag)
+
+    # An empty operand or a zero fill value (e.g. a Zeros diagonal) gives an empty result
+    if Annz == 0 || Bnnz == 0 || iszero(fill_value)
+        return CUSPARSE.CuSparseMatrixCOO(CUDA.CuVector{Ti}(undef, 0), CUDA.CuVector{Ti}(undef, 0), CUDA.CuVector{T}(undef, 0), out_shape)
+    end
+
+    row = (0:nA-1) .* mB
+    row = CUDA.CuVector(repeat(row, inner = Bnnz))
+    col = (0:nA-1) .* nB
+    col = CUDA.CuVector(repeat(col, inner = Bnnz))
+    data = CUDA.fill(T(fill_value), nA * Bnnz)
+
+    row .+= repeat(B.rowInd .- 1, outer = Annz) .+ 1
+    col .+= repeat(B.colInd .- 1, outer = Annz) .+ 1
+
+    data .*= repeat(B.nzVal, outer = Annz)
+
+    CUSPARSE.sparse(row, col, data, out_shape..., fmt = :coo)
+end
+
+end # extension module
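A minimal usage sketch of the new extension path (illustrative only; it assumes a CUDA-capable device, and the names and sizes are not taken from the patch). Because the Diagonal wraps a lazy Fill, only its single stored value ever reaches the GPU:

    using CUDA, CUDA.CUSPARSE, SparseArrays, LinearAlgebra, FillArrays

    A  = sprand(Float32, 100, 80, 0.2)    # host sparse matrix
    dA = CuSparseMatrixCSR(A)             # device copy; kron routes CSR through COO
    D  = Diagonal(Fill(2.5f0, 10))        # lazy constant diagonal, never materialized
    dK = kron(dA, D)                      # dispatches to the FillArraysExt method

    @assert collect(dK) ≈ kron(A, collect(D))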
diff --git a/lib/cusparse/linalg.jl b/lib/cusparse/linalg.jl
index 4be578e284..b0ad01dd94 100644
--- a/lib/cusparse/linalg.jl
+++ b/lib/cusparse/linalg.jl
@@ -82,7 +82,8 @@ function LinearAlgebra.kron(A::CuSparseMatrixCOO{T, Ti}, B::CuSparseMatrixCOO{T,
     sparse(row, col, data, out_shape..., fmt = :coo)
 end
 
-function LinearAlgebra.kron(A::CuSparseMatrixCOO{T, Ti}, B::Diagonal) where {Ti, T}
+function LinearAlgebra.kron(A::CuSparseMatrixCOO{T1, Ti}, B::Diagonal{T2, <:CuVector{T2}}) where {Ti, T1, T2}
+    T = promote_type(T1, T2)
     mA,nA = size(A)
     mB,nB = size(B)
     out_shape = (mA * mB, nA * nB)
@@ -97,17 +98,18 @@ function LinearAlgebra.kron(A::CuSparseMatrixCOO{T, Ti}, B::Diagonal) where {Ti,
     row = repeat(row, inner = Bnnz)
     col = (A.colInd .- 1) .* nB
     col = repeat(col, inner = Bnnz)
-    data = repeat(A.nzVal, inner = Bnnz)
+    data = repeat(convert(CuVector{T}, A.nzVal), inner = Bnnz)
 
     row .+= CuVector(repeat(0:nB-1, outer = Annz)) .+ 1
     col .+= CuVector(repeat(0:nB-1, outer = Annz)) .+ 1
 
-    data .*= repeat(CUDA.ones(T, nB), outer = Annz)
+    data .*= repeat(B.diag, outer = Annz)
 
     sparse(row, col, data, out_shape..., fmt = :coo)
 end
 
-function LinearAlgebra.kron(A::Diagonal, B::CuSparseMatrixCOO{T, Ti}) where {Ti, T}
+function LinearAlgebra.kron(A::Diagonal{T1, <:CuVector{T1}}, B::CuSparseMatrixCOO{T2, Ti}) where {Ti, T1, T2}
+    T = promote_type(T1, T2)
     mA,nA = size(A)
     mB,nB = size(B)
     out_shape = (mA * mB, nA * nB)
@@ -122,7 +124,7 @@ function LinearAlgebra.kron(A::Diagonal, B::CuSparseMatrixCOO{T, Ti}) where {Ti,
     row = CuVector(repeat(row, inner = Bnnz))
     col = (0:nA-1) .* nB
     col = CuVector(repeat(col, inner = Bnnz))
-    data = repeat(CUDA.ones(T, nA), inner = Bnnz)
+    data = repeat(convert(CuVector{T}, A.diag), inner = Bnnz)
 
     row .+= repeat(B.rowInd .- 1, outer = Annz) .+ 1
     col .+= repeat(B.colInd .- 1, outer = Annz) .+ 1
diff --git a/test/libraries/cusparse/linalg.jl b/test/libraries/cusparse/linalg.jl
index b4232a8ef6..2fb4008c7d 100644
--- a/test/libraries/cusparse/linalg.jl
+++ b/test/libraries/cusparse/linalg.jl
@@ -6,7 +6,9 @@ m = 10
     A = sprand(T, m, m, 0.2)
     B = sprand(T, m, m, 0.3)
     ZA = spzeros(T, m, m)
-    C = I(div(m, 2))
+    C = Diagonal(rand(T, div(m, 2)))
+
+    dC = adapt(CuArray, C)
     @testset "type = $typ" for typ in [CuSparseMatrixCSR, CuSparseMatrixCSC]
         dA = typ(A)
         dB = typ(B)
@@ -35,12 +37,32 @@ m = 10
             end
         end
-        @testset "kronecker product with I opa = $opa" for opa in (identity, transpose, adjoint)
-            @test collect(kron(opa(dA), C)) ≈ kron(opa(A), C)
-            @test collect(kron(C, opa(dA))) ≈ kron(C, opa(A))
-            @test collect(kron(opa(dZA), C)) ≈ kron(opa(ZA), C)
-            @test collect(kron(C, opa(dZA))) ≈ kron(C, opa(ZA))
+        @testset "kronecker product with Diagonal opa = $opa" for opa in (identity, transpose, adjoint)
+            @test collect(kron(opa(dA), dC)) ≈ kron(opa(A), C)
+            @test collect(kron(dC, opa(dA))) ≈ kron(C, opa(A))
+            @test collect(kron(opa(dZA), dC)) ≈ kron(opa(ZA), C)
+            @test collect(kron(dC, opa(dZA))) ≈ kron(C, opa(ZA))
         end
     end
+
+    # Test type promotion for kron with Diagonal
+    @testset "Type promotion - T1 = $T1, T2 = $T2" for (T1, T2) in [(Float32, Float64), (Float64, ComplexF64)]
+        A_T1 = sprand(T1, m, m, 0.2)
+        dA_T1 = CuSparseMatrixCSR(A_T1)
+        dC_T2 = Diagonal(CUDA.rand(T2, div(m, 2)))
+        C_T2 = collect(dC_T2)
+
+        # Test kron(sparse_T1, diagonal_T2)
+        result_gpu = kron(dA_T1, dC_T2)
+        result_cpu = kron(A_T1, C_T2)
+        @test eltype(result_gpu) == promote_type(T1, T2)
+        @test collect(result_gpu) ≈ result_cpu
+
+        # Test kron(diagonal_T2, sparse_T1)
+        result_gpu = kron(dC_T2, dA_T1)
+        result_cpu = kron(C_T2, A_T1)
+        @test eltype(result_gpu) == promote_type(T1, T2)
+        @test collect(result_gpu) ≈ result_cpu
+    end
 end
 
 @testset "Reshape $typ (100,100) -> (20, 500) and droptol" for
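The promotion tests above pin down the intended semantics: the eltype of the GPU result follows Base's promote_type, matching the CPU behavior of SparseArrays. A host-only sketch of the invariant being asserted (illustrative, not part of the patch):

    using SparseArrays, LinearAlgebra

    A = sprand(Float32, 10, 10, 0.2)
    D = Diagonal(rand(ComplexF64, 5))
    K = kron(A, D)
    @assert eltype(K) == promote_type(Float32, ComplexF64)   # ComplexF64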
diff --git a/test/libraries/fillarrays.jl b/test/libraries/fillarrays.jl
new file mode 100644
index 0000000000..fa108e197a
--- /dev/null
+++ b/test/libraries/fillarrays.jl
@@ -0,0 +1,102 @@
+using CUDA.CUSPARSE
+using LinearAlgebra, SparseArrays
+using FillArrays
+
+@testset "FillArrays - Kronecker product" begin
+    @testset "T = $T" for T in [Float32, Float64, ComplexF32, ComplexF64]
+        m, n = 100, 80
+        A = sprand(T, m, n, 0.2)
+
+        # Test with Ones
+        @testset "Ones - size $diag_size" for diag_size in [5, 10]
+            C_ones = Diagonal(Ones{T}(diag_size))
+            dA = CuSparseMatrixCSR(A)
+
+            # Test kron(sparse, diagonal)
+            result_gpu = collect(kron(dA, C_ones))
+            result_cpu = kron(A, collect(C_ones))
+            @test result_gpu ≈ result_cpu
+
+            # Test kron(diagonal, sparse)
+            result_gpu = collect(kron(C_ones, dA))
+            result_cpu = kron(collect(C_ones), A)
+            @test result_gpu ≈ result_cpu
+        end
+
+        # Test with Fill (constant value)
+        @testset "Fill - value $val, size $diag_size" for val in [T(2.5), T(-1.0)], diag_size in [5, 10]
+            C_fill = Diagonal(Fill(val, diag_size))
+            dA = CuSparseMatrixCSR(A)
+
+            # Test kron(sparse, diagonal)
+            result_gpu = collect(kron(dA, C_fill))
+            result_cpu = kron(A, collect(C_fill))
+            @test result_gpu ≈ result_cpu
+
+            # Test kron(diagonal, sparse)
+            result_gpu = collect(kron(C_fill, dA))
+            result_cpu = kron(collect(C_fill), A)
+            @test result_gpu ≈ result_cpu
+        end
+
+        # Test with Zeros
+        @testset "Zeros - size $diag_size" for diag_size in [5, 10]
+            C_zeros = Diagonal(Zeros{T}(diag_size))
+            dA = CuSparseMatrixCSR(A)
+
+            # Test kron(sparse, diagonal)
+            result_gpu = collect(kron(dA, C_zeros))
+            result_cpu = kron(A, collect(C_zeros))
+            @test result_gpu ≈ result_cpu
+            @test nnz(kron(dA, C_zeros)) == 0
+
+            # Test kron(diagonal, sparse)
+            result_gpu = collect(kron(C_zeros, dA))
+            result_cpu = kron(collect(C_zeros), A)
+            @test result_gpu ≈ result_cpu
+            @test nnz(kron(C_zeros, dA)) == 0
+        end
+
+        # Test with transpose and adjoint wrappers
+        @testset "Transpose/Adjoint with Fill" begin
+            diag_size = 5
+            val = T(3.0)
+            C_fill = Diagonal(Fill(val, diag_size))
+
+            @testset "opa = $opa" for opa in (identity, transpose, adjoint)
+                dA = CuSparseMatrixCSR(A)
+
+                # Test kron(opa(sparse), diagonal)
+                result_gpu = collect(kron(opa(dA), C_fill))
+                result_cpu = kron(opa(A), collect(C_fill))
+                @test result_gpu ≈ result_cpu
+
+                # Test kron(diagonal, opa(sparse))
+                result_gpu = collect(kron(C_fill, opa(dA)))
+                result_cpu = kron(collect(C_fill), opa(A))
+                @test result_gpu ≈ result_cpu
+            end
+        end
+
+        # Test type promotion
+        @testset "Type promotion" begin
+            @testset "T1 = $T1, T2 = $T2" for (T1, T2) in [(Float32, Float64), (Float64, ComplexF64), (Float32, ComplexF32)]
+                A_T1 = sprand(T1, m, n, 0.2)
+                dA_T1 = CuSparseMatrixCSR(A_T1)
+                C_fill_T2 = Diagonal(Fill(T2(2.5), 5))
+
+                # Test kron(sparse_T1, diagonal_T2)
+                result_gpu = kron(dA_T1, C_fill_T2)
+                result_cpu = kron(A_T1, collect(C_fill_T2))
+                @test eltype(result_gpu) == promote_type(T1, T2)
+                @test collect(result_gpu) ≈ result_cpu
+
+                # Test kron(diagonal_T2, sparse_T1)
+                result_gpu = kron(C_fill_T2, dA_T1)
+                result_cpu = kron(collect(C_fill_T2), A_T1)
+                @test eltype(result_gpu) == promote_type(T1, T2)
+                @test collect(result_gpu) ≈ result_cpu
+            end
+        end
+    end
+end
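For reference, the index arithmetic shared by the linalg.jl methods and the new extension can be checked on the host: the A-dependent parts are expanded with repeat(..., inner = Bnnz) and the diagonal-dependent parts with repeat(..., outer = Annz), so the two interleave entry by entry. A CPU replica of the COO construction (kron_coo_host is a hypothetical helper for illustration, not part of the patch):

    using SparseArrays, LinearAlgebra

    # Host-side replica of the COO construction used by kron(A, B::Diagonal)
    function kron_coo_host(A::SparseMatrixCSC, B::Diagonal)
        Ai, Aj, Av = findnz(A)
        nB = size(B, 1)                        # Diagonal is square, so mB == nB
        Annz, Bnnz = length(Av), nB
        # block base offsets, repeated once per diagonal entry
        row = repeat((Ai .- 1) .* nB, inner = Bnnz)
        col = repeat((Aj .- 1) .* nB, inner = Bnnz)
        # within-block offsets, cycling over the diagonal for every A entry
        off = repeat(0:nB-1, outer = Annz)
        val = repeat(Av, inner = Bnnz) .* repeat(B.diag, outer = Annz)
        sparse(row .+ off .+ 1, col .+ off .+ 1, val, size(A, 1) * nB, size(A, 2) * nB)
    end

    A = sprand(Float64, 6, 4, 0.5)
    B = Diagonal(rand(3))
    @assert kron_coo_host(A, B) ≈ kron(A, B)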