Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,14 @@ demumble_jll = "1e29f10c-031c-5a83-9565-69cddfc27673"
[weakdeps]
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
SparseMatricesCSR = "a0a7dd2c-ebf4-11e9-1f05-cf50bc540ca1"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"

[extensions]
ChainRulesCoreExt = "ChainRulesCore"
EnzymeCoreExt = "EnzymeCore"
FillArraysExt = "FillArrays"
SparseMatricesCSRExt = "SparseMatricesCSR"
SpecialFunctionsExt = "SpecialFunctions"

Expand All @@ -64,6 +66,7 @@ Crayons = "4"
DataFrames = "1.5"
EnzymeCore = "0.8.2"
ExprTools = "0.1"
FillArrays = "1"
GPUArrays = "11.2.4"
GPUCompiler = "1.4"
GPUToolbox = "0.3, 1"
Expand Down Expand Up @@ -94,5 +97,6 @@ julia = "1.10"
[extras]
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
SparseMatricesCSR = "a0a7dd2c-ebf4-11e9-1f05-cf50bc540ca1"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
71 changes: 71 additions & 0 deletions ext/FillArraysExt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
module FillArraysExt

using CUDA
using CUDA.CUSPARSE
using LinearAlgebra
using SparseArrays
using FillArrays

# Build a structurally empty COO matrix of element type `T`, index type `Ti`,
# and the given `shape` (no stored entries).
function _empty_coo(::Type{T}, ::Type{Ti}, shape) where {T, Ti}
    return CUSPARSE.CuSparseMatrixCOO(CUDA.CuVector{Ti}(undef, 0),
                                      CUDA.CuVector{Ti}(undef, 0),
                                      CUDA.CuVector{T}(undef, 0), shape)
end

"""
    kron(A::CuSparseMatrixCOO, B::Diagonal{T2, <:AbstractFill{T2}})

Kronecker product of a GPU COO sparse matrix with a diagonal matrix whose
diagonal is a FillArrays constant (`Fill`, `Ones`, `Zeros`).  Since every
diagonal entry is the same scalar, the result is formed by replicating `A`'s
sparsity pattern and scaling once by that scalar — no GPU array is ever
materialized for `B`.  The result element type is
`promote_type(eltype(A), eltype(B))`.
"""
function LinearAlgebra.kron(A::CUSPARSE.CuSparseMatrixCOO{T1, Ti}, B::Diagonal{T2, <:FillArrays.AbstractFill{T2}}) where {Ti, T1, T2}
    T = promote_type(T1, T2)
    mA, nA = size(A)
    mB, nB = size(B)
    out_shape = (mA * mB, nA * nB)
    Annz = Int64(A.nnz)
    Bnnz = nB

    if Annz == 0 || Bnnz == 0
        return _empty_coo(T, Ti, out_shape)
    end

    # The single scalar shared by every diagonal entry of `B`.
    fill_value = FillArrays.getindex_value(B.diag)

    # A zero fill value (e.g. `Zeros`) would only produce explicit stored
    # zeros; return a structurally empty result instead of materializing them.
    iszero(fill_value) && return _empty_coo(T, Ti, out_shape)

    # Each nonzero (i, j, v) of A expands to the positions
    # ((i-1)*mB + k, (j-1)*nB + k) for k = 1:nB, with value v * fill_value.
    row = (A.rowInd .- 1) .* mB
    row = repeat(row, inner = Bnnz)
    col = (A.colInd .- 1) .* nB
    col = repeat(col, inner = Bnnz)
    data = repeat(convert(CUDA.CuVector{T}, A.nzVal), inner = Bnnz)

    # Diagonal offsets within each block; `B` is square, so the same offsets
    # apply to rows and columns (hoisted to avoid a second host-to-device copy).
    offsets = CUDA.CuVector(repeat(0:nB-1, outer = Annz))
    row .+= offsets .+ 1
    col .+= offsets .+ 1

    # Scale by the constant diagonal value; the kernel is skipped for `Ones`.
    isone(fill_value) || (data .*= fill_value)

    return CUSPARSE.sparse(row, col, data, out_shape..., fmt = :coo)
end

"""
    kron(A::Diagonal{T1, <:AbstractFill{T1}}, B::CuSparseMatrixCOO)

Kronecker product of a constant diagonal matrix (`Fill`, `Ones`, `Zeros`
diagonal) with a GPU COO sparse matrix — the mirror of the method above with
the roles of the factors swapped: diagonal entry k of `A` places a copy of
`B`'s pattern at block (k, k), scaled by the constant diagonal value.
"""
function LinearAlgebra.kron(A::Diagonal{T1, <:FillArrays.AbstractFill{T1}}, B::CUSPARSE.CuSparseMatrixCOO{T2, Ti}) where {Ti, T1, T2}
    T = promote_type(T1, T2)
    mA, nA = size(A)
    mB, nB = size(B)
    out_shape = (mA * mB, nA * nB)
    Annz = nA
    Bnnz = Int64(B.nnz)

    if Annz == 0 || Bnnz == 0
        return _empty_coo(T, Ti, out_shape)
    end

    # The single scalar shared by every diagonal entry of `A`.
    fill_value = FillArrays.getindex_value(A.diag)

    # See the mirror method: a zero fill value yields an empty result.
    iszero(fill_value) && return _empty_coo(T, Ti, out_shape)

    # Block origins along the diagonal of A, one copy of B's pattern each.
    row = (0:nA-1) .* mB
    row = CUDA.CuVector(repeat(row, inner = Bnnz))
    col = (0:nA-1) .* nB
    col = CUDA.CuVector(repeat(col, inner = Bnnz))

    row .+= repeat(B.rowInd .- 1, outer = Annz) .+ 1
    col .+= repeat(B.colInd .- 1, outer = Annz) .+ 1

    # Values: B's nonzeros repeated once per diagonal entry of A, scaled by
    # the constant diagonal value (one temporary fewer than fill-then-multiply;
    # scaling is skipped for `Ones`).
    data = repeat(convert(CUDA.CuVector{T}, B.nzVal), outer = Annz)
    isone(fill_value) || (data .*= fill_value)

    return CUSPARSE.sparse(row, col, data, out_shape..., fmt = :coo)
end

end # extension module
12 changes: 7 additions & 5 deletions lib/cusparse/linalg.jl
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,8 @@ function LinearAlgebra.kron(A::CuSparseMatrixCOO{T, Ti}, B::CuSparseMatrixCOO{T,
sparse(row, col, data, out_shape..., fmt = :coo)
end

function LinearAlgebra.kron(A::CuSparseMatrixCOO{T, Ti}, B::Diagonal) where {Ti, T}
function LinearAlgebra.kron(A::CuSparseMatrixCOO{T1, Ti}, B::Diagonal{T2, <:CuVector{T2}}) where {Ti, T1, T2}
T = promote_type(T1, T2)
mA,nA = size(A)
mB,nB = size(B)
out_shape = (mA * mB, nA * nB)
Expand All @@ -97,17 +98,18 @@ function LinearAlgebra.kron(A::CuSparseMatrixCOO{T, Ti}, B::Diagonal) where {Ti,
row = repeat(row, inner = Bnnz)
col = (A.colInd .- 1) .* nB
col = repeat(col, inner = Bnnz)
data = repeat(A.nzVal, inner = Bnnz)
data = repeat(convert(CuVector{T}, A.nzVal), inner = Bnnz)

row .+= CuVector(repeat(0:nB-1, outer = Annz)) .+ 1
col .+= CuVector(repeat(0:nB-1, outer = Annz)) .+ 1

data .*= repeat(CUDA.ones(T, nB), outer = Annz)
data .*= repeat(B.diag, outer = Annz)

sparse(row, col, data, out_shape..., fmt = :coo)
end

function LinearAlgebra.kron(A::Diagonal, B::CuSparseMatrixCOO{T, Ti}) where {Ti, T}
function LinearAlgebra.kron(A::Diagonal{T1, <:CuVector{T1}}, B::CuSparseMatrixCOO{T2, Ti}) where {Ti, T1, T2}
T = promote_type(T1, T2)
mA,nA = size(A)
mB,nB = size(B)
out_shape = (mA * mB, nA * nB)
Expand All @@ -122,7 +124,7 @@ function LinearAlgebra.kron(A::Diagonal, B::CuSparseMatrixCOO{T, Ti}) where {Ti,
row = CuVector(repeat(row, inner = Bnnz))
col = (0:nA-1) .* nB
col = CuVector(repeat(col, inner = Bnnz))
data = repeat(CUDA.ones(T, nA), inner = Bnnz)
data = repeat(convert(CuVector{T}, A.diag), inner = Bnnz)

row .+= repeat(B.rowInd .- 1, outer = Annz) .+ 1
col .+= repeat(B.colInd .- 1, outer = Annz) .+ 1
Expand Down
32 changes: 27 additions & 5 deletions test/libraries/cusparse/linalg.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ m = 10
A = sprand(T, m, m, 0.2)
B = sprand(T, m, m, 0.3)
ZA = spzeros(T, m, m)
C = I(div(m, 2))
C = Diagonal(rand(T, div(m, 2)))

dC = adapt(CuArray, C)
@testset "type = $typ" for typ in [CuSparseMatrixCSR, CuSparseMatrixCSC]
dA = typ(A)
dB = typ(B)
Expand Down Expand Up @@ -35,12 +37,32 @@ m = 10
end
end
@testset "kronecker product with I opa = $opa" for opa in (identity, transpose, adjoint)
@test collect(kron(opa(dA), C)) ≈ kron(opa(A), C)
@test collect(kron(C, opa(dA))) ≈ kron(C, opa(A))
@test collect(kron(opa(dZA), C)) ≈ kron(opa(ZA), C)
@test collect(kron(C, opa(dZA))) ≈ kron(C, opa(ZA))
@test collect(kron(opa(dA), dC)) ≈ kron(opa(A), C)
@test collect(kron(dC, opa(dA))) ≈ kron(C, opa(A))
@test collect(kron(opa(dZA), dC)) ≈ kron(opa(ZA), C)
@test collect(kron(dC, opa(dZA))) ≈ kron(C, opa(ZA))
end
end

# Test type promotion for kron with Diagonal
@testset "Type promotion - T1 = $T1, T2 = $T2" for (T1, T2) in [(Float32, Float64), (Float64, ComplexF64)]
A_T1 = sprand(T1, m, m, 0.2)
dA_T1 = CuSparseMatrixCSR(A_T1)
dC_T2 = Diagonal(CUDA.rand(T2, div(m, 2)))
C_T2 = collect(dC_T2)

# Test kron(sparse_T1, diagonal_T2)
result_gpu = kron(dA_T1, dC_T2)
result_cpu = kron(A_T1, C_T2)
@test eltype(result_gpu) == promote_type(T1, T2)
@test collect(result_gpu) ≈ result_cpu

# Test kron(diagonal_T2, sparse_T1)
result_gpu = kron(dC_T2, dA_T1)
result_cpu = kron(C_T2, A_T1)
@test eltype(result_gpu) == promote_type(T1, T2)
@test collect(result_gpu) ≈ result_cpu
end
end

@testset "Reshape $typ (100,100) -> (20, 500) and droptol" for
Expand Down
102 changes: 102 additions & 0 deletions test/libraries/fillarrays.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
using CUDA.CUSPARSE
using LinearAlgebra, SparseArrays
using FillArrays

# Tests for the FillArraysExt extension: kron between CUSPARSE matrices and
# Diagonal matrices with FillArrays diagonals (Ones, Fill, Zeros).  Each GPU
# result is compared against the dense CPU kron of the collected operands.
@testset "FillArrays - Kronecker product" begin
    @testset "T = $T" for T in [Float32, Float64, ComplexF32, ComplexF64]
        # Rectangular sparse operand so row/column block strides differ.
        m, n = 100, 80
        A = sprand(T, m, n, 0.2)

        # Test with Ones (fill value is the multiplicative identity)
        @testset "Ones - size $diag_size" for diag_size in [5, 10]
            C_ones = Diagonal(Ones{T}(diag_size))
            dA = CuSparseMatrixCSR(A)

            # Test kron(sparse, diagonal)
            result_gpu = collect(kron(dA, C_ones))
            result_cpu = kron(A, collect(C_ones))
            @test result_gpu ≈ result_cpu

            # Test kron(diagonal, sparse)
            result_gpu = collect(kron(C_ones, dA))
            result_cpu = kron(collect(C_ones), A)
            @test result_gpu ≈ result_cpu
        end

        # Test with Fill (constant value)
        @testset "Fill - value $val, size $diag_size" for val in [T(2.5), T(-1.0)], diag_size in [5, 10]
            C_fill = Diagonal(Fill(val, diag_size))
            dA = CuSparseMatrixCSR(A)

            # Test kron(sparse, diagonal)
            result_gpu = collect(kron(dA, C_fill))
            result_cpu = kron(A, collect(C_fill))
            @test result_gpu ≈ result_cpu

            # Test kron(diagonal, sparse)
            result_gpu = collect(kron(C_fill, dA))
            result_cpu = kron(collect(C_fill), A)
            @test result_gpu ≈ result_cpu
        end

        # Test with Zeros — the product must not store explicit zeros
        @testset "Zeros - size $diag_size" for diag_size in [5, 10]
            C_zeros = Diagonal(Zeros{T}(diag_size))
            dA = CuSparseMatrixCSR(A)

            # Test kron(sparse, diagonal)
            result_gpu = collect(kron(dA, C_zeros))
            result_cpu = kron(A, collect(C_zeros))
            @test result_gpu ≈ result_cpu
            @test nnz(kron(dA, C_zeros)) == 0

            # Test kron(diagonal, sparse)
            result_gpu = collect(kron(C_zeros, dA))
            result_cpu = kron(collect(C_zeros), A)
            @test result_gpu ≈ result_cpu
            @test nnz(kron(C_zeros, dA)) == 0
        end

        # Test with transpose and adjoint wrappers
        @testset "Transpose/Adjoint with Fill" begin
            diag_size = 5
            val = T(3.0)
            C_fill = Diagonal(Fill(val, diag_size))

            @testset "opa = $opa" for opa in (identity, transpose, adjoint)
                dA = CuSparseMatrixCSR(A)

                # Test kron(opa(sparse), diagonal)
                result_gpu = collect(kron(opa(dA), C_fill))
                result_cpu = kron(opa(A), collect(C_fill))
                @test result_gpu ≈ result_cpu

                # Test kron(diagonal, opa(sparse))
                result_gpu = collect(kron(C_fill, opa(dA)))
                result_cpu = kron(collect(C_fill), opa(A))
                @test result_gpu ≈ result_cpu
            end
        end

        # Test type promotion: mixed-eltype factors must yield promote_type
        @testset "Type promotion" begin
            @testset "T1 = $T1, T2 = $T2" for (T1, T2) in [(Float32, Float64), (Float64, ComplexF64), (Float32, ComplexF32)]
                A_T1 = sprand(T1, m, n, 0.2)
                dA_T1 = CuSparseMatrixCSR(A_T1)
                C_fill_T2 = Diagonal(Fill(T2(2.5), 5))

                # Test kron(sparse_T1, diagonal_T2)
                result_gpu = kron(dA_T1, C_fill_T2)
                result_cpu = kron(A_T1, collect(C_fill_T2))
                @test eltype(result_gpu) == promote_type(T1, T2)
                @test collect(result_gpu) ≈ result_cpu

                # Test kron(diagonal_T2, sparse_T1)
                result_gpu = kron(C_fill_T2, dA_T1)
                result_cpu = kron(collect(C_fill_T2), A_T1)
                @test eltype(result_gpu) == promote_type(T1, T2)
                @test collect(result_gpu) ≈ result_cpu
            end
        end
    end
end