Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ version = "11.4.0"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
Expand All @@ -24,6 +25,7 @@ JLD2Ext = "JLD2"

[compat]
Adapt = "4.0"
Atomix = "1"
GPUArraysCore = "= 0.2.0"
JLD2 = "0.4, 0.5, 0.6"
KernelAbstractions = "0.9.28, 0.10"
Expand Down
8 changes: 7 additions & 1 deletion lib/JLArrays/src/JLArrays.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ using GPUArrays
using Adapt
using SparseArrays, LinearAlgebra

import GPUArrays: dense_array_type
import GPUArrays: dense_array_type, GPUSparseMatrixCSC, GPUSparseMatrixCSR

import KernelAbstractions
import KernelAbstractions: Adapt, StaticArrays, Backend, Kernel, StaticSize, DynamicSize, partition, blocks, workitems, launch_config
Expand Down Expand Up @@ -150,6 +150,9 @@ mutable struct JLSparseMatrixCSC{Tv, Ti} <: GPUArrays.AbstractGPUSparseMatrixCSC
new{Tv, Ti}(colPtr, rowVal, nzVal, dims, length(nzVal))
end
end
# GPUArrays' generic CSC entry point for the JLArrays backend: forward the raw
# component vectors to the JLArrays-specific constructor.
GPUSparseMatrixCSC(colPtr::JLArray{Ti, 1}, rowVal::JLArray{Ti, 1}, nzVal::JLArray{Tv, 1},
                   dims::NTuple{2,<:Integer}) where {Tv, Ti <: Integer} =
    JLSparseMatrixCSC(colPtr, rowVal, nzVal, dims)
# Outer constructor: infer the {Tv, Ti} type parameters from the component
# arrays and delegate to the inner constructor.
JLSparseMatrixCSC(colPtr::JLArray{Ti, 1}, rowVal::JLArray{Ti, 1}, nzVal::JLArray{Tv, 1},
                  dims::NTuple{2,<:Integer}) where {Tv, Ti <: Integer} =
    JLSparseMatrixCSC{Tv, Ti}(colPtr, rowVal, nzVal, dims)
Expand Down Expand Up @@ -181,6 +184,9 @@ end
# Outer constructor: infer the {Tv, Ti} type parameters from the component
# arrays and delegate to the inner constructor.
JLSparseMatrixCSR(rowPtr::JLArray{Ti, 1}, colVal::JLArray{Ti, 1}, nzVal::JLArray{Tv, 1},
                  dims::NTuple{2,<:Integer}) where {Tv, Ti <: Integer} =
    JLSparseMatrixCSR{Tv, Ti}(rowPtr, colVal, nzVal, dims)
# GPUArrays' generic CSR entry point for the JLArrays backend: forward the raw
# component vectors to the JLArrays-specific constructor.
GPUSparseMatrixCSR(rowPtr::JLArray{Ti, 1}, colVal::JLArray{Ti, 1}, nzVal::JLArray{Tv, 1},
                   dims::NTuple{2,<:Integer}) where {Tv, Ti <: Integer} =
    JLSparseMatrixCSR(rowPtr, colVal, nzVal, dims)
function SparseArrays.SparseMatrixCSC(x::JLSparseMatrixCSR)
x_transpose = SparseMatrixCSC(size(x, 2), size(x, 1), Array(x.rowPtr), Array(x.colVal), Array(x.nzVal))
return SparseMatrixCSC(transpose(x_transpose))
Expand Down
1 change: 1 addition & 0 deletions src/GPUArrays.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ using Reexport
@reexport using GPUArraysCore

using KernelAbstractions
using Atomix

# device functionality
include("device/abstractarray.jl")
Expand Down
54 changes: 54 additions & 0 deletions src/host/sparse.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using LinearAlgebra
using LinearAlgebra: BlasFloat
export GPUSparseMatrix, GPUSparseMatrixCSC, GPUSparseMatrixCSR, GPUSparseMatrixCOO, GPUSparseMatrixBSR

abstract type AbstractGPUSparseArray{Tv, Ti, N} <: AbstractSparseArray{Tv, Ti, N} end
const AbstractGPUSparseVector{Tv, Ti} = AbstractGPUSparseArray{Tv, Ti, 1}
Expand All @@ -10,6 +11,59 @@ abstract type AbstractGPUSparseMatrixCSR{Tv, Ti} <: AbstractGPUSparseArray{Tv, T
abstract type AbstractGPUSparseMatrixCOO{Tv, Ti} <: AbstractGPUSparseArray{Tv, Ti, 2} end
abstract type AbstractGPUSparseMatrixBSR{Tv, Ti} <: AbstractGPUSparseArray{Tv, Ti, 2} end

# Generic sparse-matrix constructor stubs. Each GPU backend (CUDA, JLArrays, ...)
# adds methods that build its concrete sparse type from raw device vectors;
# the generic `GPUSparseMatrix` entry point below dispatches onto them.
function GPUSparseMatrixCOO end
function GPUSparseMatrixCSC end
function GPUSparseMatrixCSR end
function GPUSparseMatrixBSR end

"""
    compute_sparse_pointers(indices::AbstractGPUVector{T}, n::Integer) where T

Build a CSC/CSR pointer vector of length `n + 1` from a sorted vector of
1-based `indices` (column indices for CSC, row indices for CSR). Entry
`ptr[k]` is the position of the first stored value of slice `k`, and
`ptr[n + 1] == nnz + 1`.
"""
function compute_sparse_pointers(indices::AbstractGPUVector{T}, n::Integer) where T
    ptr = similar(indices, T, n + 1)

    # With no stored entries every pointer must be 1 (the cumsum of
    # [1, 0, ..., 0]); return directly instead of launching a zero-size kernel
    # that would leave ptr[1] unseeded and yield an all-zero pointer vector.
    if isempty(indices)
        return fill!(ptr, one(T))
    end

    fill!(ptr, zero(T))

    # Histogram the indices into ptr[2:end]; a single work-item seeds ptr[1]
    # to 1 so the cumsum below starts the pointers at 1.
    @kernel function count_indices(@Const(indices), ptr)
        idx = @index(Global, Linear)
        if idx == 1
            ptr[idx] = one(T)
        end
        Atomix.@atomic ptr[indices[idx] + 1] += one(T)
    end

    backend = get_backend(indices)
    kernel! = count_indices(backend)
    kernel!(indices, ptr, ndrange=length(indices))
    return cumsum(ptr)
end

"""
    GPUSparseMatrix(I, J, V, dims; format = default_sparse_format(I))

Build a GPU sparse matrix of size `dims = (m, n)` from COO triplets: row
indices `I`, column indices `J`, and values `V`. `format` selects the storage
layout (`:coo`, `:csc`, or `:csr`); other formats throw an `ArgumentError`.
"""
function GPUSparseMatrix(I::AbstractGPUVector, J::AbstractGPUVector, V::AbstractGPUVector, dims::NTuple{2, <:Integer}; format = default_sparse_format(I))
    m, n = dims
    if format == :coo
        return GPUSparseMatrix(Val(:coo), I, J, V, dims)
    elseif format == :csc
        # Sort entries column-major via a composite (column, row) key. The key
        # is promoted to Int so that J * (m + 1) cannot overflow a narrow
        # index eltype (e.g. Int32) for large matrices.
        key = Int.(J) .* (m + 1) .+ I
        perm = sortperm(key)

        ptr = compute_sparse_pointers(J[perm], n)
        return GPUSparseMatrix(Val(:csc), ptr, I[perm], V[perm], dims)
    elseif format == :csr
        # Sort entries row-major via a composite (row, column) key, promoted
        # to Int for the same overflow reason as above.
        key = Int.(I) .* (n + 1) .+ J
        perm = sortperm(key)

        ptr = compute_sparse_pointers(I[perm], m)
        return GPUSparseMatrix(Val(:csr), ptr, J[perm], V[perm], dims)
    else
        throw(ArgumentError("Conversion to sparse format $format is not implemented"))
    end
end


# Dispatch the Val(:format) tag onto the backend-provided constructor stubs
# declared above.
GPUSparseMatrix(::Val{:coo}, I, J, V, dims) = GPUSparseMatrixCOO(I, J, V, dims)
GPUSparseMatrix(::Val{:csc}, colPtr, rowVal, nzVal, dims) = GPUSparseMatrixCSC(colPtr, rowVal, nzVal, dims)
GPUSparseMatrix(::Val{:csr}, rowPtr, colVal, nzVal, dims) = GPUSparseMatrixCSR(rowPtr, colVal, nzVal, dims)
# TODO: add a :bsr method once GPUSparseMatrixBSR constructors exist:
#GPUSparseMatrix(::Val{:bsr}, ...) = GPUSparseMatrixBSR(...)


const AbstractGPUSparseVecOrMat = Union{AbstractGPUSparseVector,AbstractGPUSparseMatrix}

SparseArrays.nnz(g::T) where {T<:AbstractGPUSparseArray} = g.nnz
Expand Down
59 changes: 59 additions & 0 deletions test/testsuite/sparse.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
elseif sparse_AT <: AbstractSparseMatrix
matrix(sparse_AT, eltypes)
matrix_construction(sparse_AT, eltypes)
direct_vector_construction(sparse_AT, eltypes)
broadcasting_matrix(sparse_AT, eltypes)
mapreduce_matrix(sparse_AT, eltypes)
linalg(sparse_AT, eltypes)
Expand Down Expand Up @@ -151,6 +152,64 @@ function matrix_construction(AT, eltypes)
end
end

# Helper function to derive direct matrix formats:
# Create colptr, rowval, nzval for m x n matrix with 3 values per column
# Deterministic CSC fixture for an m × n matrix: each column gets up to three
# row indices from a fixed offset pattern (fewer when offsets collide for
# small m); the nonzero values themselves are random.
function csc_vectors(m::Int, n::Int, ::Type{ET}; I::Type{<:Integer}=Int32) where {ET}
    colptr = Vector{I}(undef, n + 1)
    rowval = I[]
    nzval = ET[]

    colptr[1] = one(I)
    for col in 1:n
        # Offsets 1/7/13 spread entries across rows; `unique` drops duplicate
        # rows produced by the modular wrap-around.
        rows = sort!(unique(mod.(col .+ (1, 7, 13), m) .+ 1))
        append!(rowval, I.(rows))
        append!(nzval, rand(ET, length(rows)))
        colptr[col + 1] = I(length(rowval) + 1)
    end
    return colptr, rowval, nzval
end
# Deterministic CSR fixture for an m × n matrix: the CSC component vectors of
# the transposed (n × m) matrix reinterpret directly as CSR of the original
# shape (rowptr, colind, nzval).
function csr_vectors(m::Int, n::Int, ::Type{ET}; I::Type{<:Integer}=Int32) where {ET}
    ptr, ind, vals = csc_vectors(n, m, ET; I=I)
    return ptr, ind, vals
end
# Construct appropriate sparse arrays
# Build a device CSC matrix from the host fixture vectors by uploading each
# component to the backend's dense array type.
function construct_sparse_matrix(AT::Type{<:GPUArrays.AbstractGPUSparseMatrixCSC}, ::Type{ET}, m::Int, n::Int; I::Type{<:Integer}=Int32) where {ET}
    colptr, rowval, nzval = csc_vectors(m, n, ET; I=I)
    upload = GPUArrays.dense_array_type(AT)
    return GPUSparseMatrixCSC(upload(colptr), upload(rowval), upload(nzval), (m, n))
end
# Build a device CSR matrix from the host fixture vectors by uploading each
# component to the backend's dense array type.
function construct_sparse_matrix(AT::Type{<:GPUArrays.AbstractGPUSparseMatrixCSR}, ::Type{ET}, m::Int, n::Int; I::Type{<:Integer}=Int32) where {ET}
    rowptr, colind, nzval = csr_vectors(m, n, ET; I=I)
    upload = GPUArrays.dense_array_type(AT)
    return GPUSparseMatrixCSR(upload(rowptr), upload(colind), upload(nzval), (m, n))
end
# Check that constructing a sparse matrix directly from raw device vectors
# yields the expected concrete type and size for every tested element type.
function direct_vector_construction(AT::Type{<:GPUArrays.AbstractGPUSparseMatrix}, eltypes)
    m, n = 25, 35
    for ET in eltypes
        mat = construct_sparse_matrix(AT, ET, m, n)
        @test mat isa AT{ET}
        @test size(mat) == (m, n)
    end
end
# Fallback for array types without direct sparse-vector construction:
# intentionally does nothing.
function direct_vector_construction(AT, eltypes)
    return nothing
end

function broadcasting_vector(AT, eltypes)
dense_AT = GPUArrays.dense_array_type(AT)
for ET in eltypes
Expand Down
Loading