Skip to content

Writing support #87

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 23 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ deps/build.log
*.jl.mem
Manifest.toml
.vscode
test/write_tests/
11 changes: 6 additions & 5 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,18 @@ uuid = "d71aba96-b539-5138-91ee-935c3ee1374c"
version = "1.1.2-DEV"

[deps]
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
DataValues = "e7dc6d0d-1eca-5fa6-8ad6-5aecde8b7ea5"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
ReadStat_jll = "a4dc8951-f1cc-5499-9034-9ec1c3e64557"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The idea of this package is that it is a low-level wrapper around the C library and does not itself take a dependency on Tables.jl, TableTraits.jl, or anything like that. The idea is also that this package typically won't be used by end users directly: in the realm of Queryverse, the end-user package really is https://github.com/queryverse/StatFiles.jl, and that is the package that, for example, brings the integration with TableTraits.jl along.

So ideally this package would continue to take a dependency on neither Tables nor TableTraits, and instead just expose relatively low-level functions to write files, staying a package with as few dependencies as possible. We can then add user-facing APIs to StatFiles.jl, or to any other package if someone wants to provide a more Tables.jl-centric experience, and those user-facing packages can share the implementation in this package.


[compat]
julia = "1.3"
DataValues = "0.4.13"
ReadStat_jll = "1.1.1"
julia = "1.3"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test"]
63 changes: 62 additions & 1 deletion src/C_interface.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ function readstat_get_var_count(metadata::Ptr{Nothing})
end

"""
    readstat_value_is_missing(value::ReadStatValue, variable::Ptr{Nothing})

Call libreadstat's `readstat_value_is_missing` for `value` and `variable` and convert
the C `int` result to a `Bool`.
"""
function readstat_value_is_missing(value::ReadStatValue, variable::Ptr{Nothing})
    return Bool(ccall((:readstat_value_is_missing, libreadstat), Cint, (ReadStatValue, Ptr{Nothing}), value, variable))
end

function readstat_variable_get_index(variable::Ptr{Nothing})
Expand Down Expand Up @@ -78,3 +78,64 @@ end
# Forward to libreadstat's `readstat_variable_get_missing_ranges_count` for the given
# variable handle and hand back the `Cint` it returns.
function readstat_variable_get_missing_ranges_count(variable::Ptr{Nothing})
    nranges = ccall(
        (:readstat_variable_get_missing_ranges_count, libreadstat),
        Cint,
        (Ptr{Nothing},),
        variable,
    )
    return nranges
end


"""
    readstat_begin_row(writer)

Call libreadstat's `readstat_begin_row` on the `writer` handle and return its status code.

The C function returns a `readstat_error_t` (an `int`-sized enum), so the return type is
declared as `Cint`. Declaring it as `Int` would read a 64-bit value out of a 32-bit
result on 64-bit platforms.
"""
function readstat_begin_row(writer)
    return ccall((:readstat_begin_row, libreadstat), Cint, (Ptr{Nothing},), writer)
end

"""
    readstat_end_row(writer)

Call libreadstat's `readstat_end_row` on the `writer` handle and return its status code.

The return type is `Cint` because the C function returns a `readstat_error_t`
(an `int`-sized enum), not a pointer-sized integer.
"""
function readstat_end_row(writer)
    return ccall((:readstat_end_row, libreadstat), Cint, (Ptr{Nothing},), writer)
end

# `readstat_begin_writing(writer, Val(format), io, row_count)`
#
# One method per output format; the `Val` argument selects which
# `readstat_begin_writing_<format>` entry point of libreadstat is called.
#
# * `io` is passed to C as an opaque user-context pointer obtained with
#   `pointer_from_objref`. That raw pointer does not keep `io` alive, so the caller
#   must keep `io` referenced for as long as the writer may use it.
# * `row_count` is a C `long` in libreadstat, hence `Clong` (not `Cint`); `ccall`
#   converts the Julia integer and throws if it does not fit.
# * The status code is a `readstat_error_t` (an `int`-sized enum), hence `Cint`.

function readstat_begin_writing(writer, filetype::Val{:dta}, io, row_count)
    return ccall((:readstat_begin_writing_dta, libreadstat), Cint, (Ptr{Nothing}, Ptr{Nothing}, Clong), writer, pointer_from_objref(io), row_count)
end

function readstat_begin_writing(writer, filetype::Val{:sav}, io, row_count)
    return ccall((:readstat_begin_writing_sav, libreadstat), Cint, (Ptr{Nothing}, Ptr{Nothing}, Clong), writer, pointer_from_objref(io), row_count)
end

function readstat_begin_writing(writer, filetype::Val{:por}, io, row_count)
    return ccall((:readstat_begin_writing_por, libreadstat), Cint, (Ptr{Nothing}, Ptr{Nothing}, Clong), writer, pointer_from_objref(io), row_count)
end

function readstat_begin_writing(writer, filetype::Val{:sas7bdat}, io, row_count)
    return ccall((:readstat_begin_writing_sas7bdat, libreadstat), Cint, (Ptr{Nothing}, Ptr{Nothing}, Clong), writer, pointer_from_objref(io), row_count)
end

function readstat_begin_writing(writer, filetype::Val{:xport}, io, row_count)
    return ccall((:readstat_begin_writing_xport, libreadstat), Cint, (Ptr{Nothing}, Ptr{Nothing}, Clong), writer, pointer_from_objref(io), row_count)
end

"""
    readstat_insert_double_value(writer, variable, value)

Pass `value` (converted to `Cdouble`) to libreadstat's `readstat_insert_double_value`
for `variable`, and return the `readstat_error_t` status as a `Cint`.
"""
function readstat_insert_double_value(writer, variable, value)
    return ccall((:readstat_insert_double_value, libreadstat), Cint, (Ptr{Nothing}, Ptr{Nothing}, Cdouble), writer, variable, value)
end

"""
    readstat_insert_float_value(writer, variable, value)

Pass `value` (converted to `Cfloat`) to libreadstat's `readstat_insert_float_value`
for `variable`, and return the `readstat_error_t` status as a `Cint`.
"""
function readstat_insert_float_value(writer, variable, value)
    return ccall((:readstat_insert_float_value, libreadstat), Cint, (Ptr{Nothing}, Ptr{Nothing}, Cfloat), writer, variable, value)
end

"""
    readstat_insert_int32_value(writer, variable, value)

Pass `value` (converted to `Int32`) to libreadstat's `readstat_insert_int32_value`
for `variable`, and return the `readstat_error_t` status as a `Cint`.
"""
function readstat_insert_int32_value(writer, variable, value)
    return ccall((:readstat_insert_int32_value, libreadstat), Cint, (Ptr{Nothing}, Ptr{Nothing}, Int32), writer, variable, value)
end

"""
    readstat_insert_int16_value(writer, variable, value)

Pass `value` (converted to `Int16`) to libreadstat's `readstat_insert_int16_value`
for `variable`, and return the `readstat_error_t` status as a `Cint`.
"""
function readstat_insert_int16_value(writer, variable, value)
    return ccall((:readstat_insert_int16_value, libreadstat), Cint, (Ptr{Nothing}, Ptr{Nothing}, Int16), writer, variable, value)
end

"""
    readstat_insert_int8_value(writer, variable, value)

Pass `value` (converted to `Int8`) to libreadstat's `readstat_insert_int8_value`
for `variable`, and return the `readstat_error_t` status as a `Cint`.

The argument type is `Int8` rather than `Cchar`: the C parameter is an `int8_t`, while
`Cchar` is unsigned on platforms where C `char` is unsigned, which would reject
negative values during conversion.
"""
function readstat_insert_int8_value(writer, variable, value)
    return ccall((:readstat_insert_int8_value, libreadstat), Cint, (Ptr{Nothing}, Ptr{Nothing}, Int8), writer, variable, value)
end

"""
    readstat_insert_string_value(writer, variable, value)

Pass `value` as a NUL-terminated C string to libreadstat's
`readstat_insert_string_value` for `variable`, and return the `readstat_error_t`
status as a `Cint`.

Because the argument is declared as `Cstring`, Julia throws if `value` contains an
embedded NUL character.
"""
function readstat_insert_string_value(writer, variable, value)
    return ccall((:readstat_insert_string_value, libreadstat), Cint, (Ptr{Nothing}, Ptr{Nothing}, Cstring), writer, variable, value)
end

"""
    readstat_insert_missing_value(writer, variable)

Call libreadstat's `readstat_insert_missing_value` for `variable` and return the
`readstat_error_t` status as a `Cint`.
"""
function readstat_insert_missing_value(writer, variable)
    return ccall((:readstat_insert_missing_value, libreadstat), Cint, (Ptr{Nothing}, Ptr{Nothing}), writer, variable)
end

"""
    readstat_add_variable(writer, name, type, width)

Call libreadstat's `readstat_add_variable` on `writer` and return the variable handle
it produces as a `Ptr{Nothing}`.

`name` is passed as a C string and `type` as a `Cint`. `width` is a `size_t` in the
C API, so it is declared as `Csize_t`; a negative width therefore raises a conversion
error instead of being passed through.
"""
function readstat_add_variable(writer, name, type, width)
    return ccall((:readstat_add_variable, libreadstat), Ptr{Nothing}, (Ptr{Nothing}, Cstring, Cint, Csize_t), writer, name, type, width)
end
89 changes: 88 additions & 1 deletion src/ReadStat.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@ using ReadStat_jll
using DataValues: DataValueVector
import DataValues
using Dates
import Tables

export ReadStatDataFrame, read_dta, read_sav, read_por, read_sas7bdat, read_xport
export ReadStatDataFrame, read_dta, read_sav, read_por, read_sas7bdat, read_xport, write_dta, write_sav, write_por, write_sas7bdat, write_xport

##############################################################################
##
Expand Down Expand Up @@ -287,10 +288,96 @@ function parse_data_file!(ds::ReadStatDataFrame, parser::Ptr{Nothing}, filename:
retval == 0 || error("Error parsing $filename: $(error_message(retval))")
end

"""
    handle_write!(data, len, ctx)

Data-writer callback handed to libreadstat. `ctx` is the pointer produced by
`pointer_from_objref(io)`; the `len` bytes starting at `data` are written to that
`io`, and `len` is returned as a `Cssize_t`.

libreadstat declares the callback as `ssize_t (*)(const void *, size_t, void *)`,
so `Writer` registers it with `Csize_t` / `Cssize_t`. `len` is typed `Integer` here so
that existing direct calls with a `Cint` keep working.
"""
function handle_write!(data::Ptr{UInt8}, len::Integer, ctx::Ptr)
    io = unsafe_pointer_to_objref(ctx) # restore io
    # Wrap the C buffer without copying; it is only valid during this call.
    actual_data = unsafe_wrap(Array{UInt8}, data, (Int(len),))
    write(io, actual_data)
    return Cssize_t(len)
end

"""
    Writer(; filelabel)

Create a libreadstat writer handle, register `handle_write!` as its data writer, set
the file label to `filelabel`, and return the raw handle (`Ptr{Nothing}`).

The handle is not managed by Julia; the visible caller releases it with
`readstat_writer_free`. Throws if libreadstat returns a NULL handle.
"""
function Writer(; filelabel)
    writer = ccall((:readstat_writer_init, libreadstat), Ptr{Nothing}, ())
    writer == C_NULL && error("readstat_writer_init returned a NULL writer handle")
    # The signature must match libreadstat's `readstat_data_writer` typedef exactly.
    write_bytes = @cfunction(handle_write!, Cssize_t, (Ptr{UInt8}, Csize_t, Ptr{Nothing}))
    # Both setters return a `readstat_error_t`, i.e. a C int.
    ccall((:readstat_set_data_writer, libreadstat), Cint, (Ptr{Nothing}, Ptr{Nothing}), writer, write_bytes)
    ccall((:readstat_writer_set_file_label, libreadstat), Cint, (Ptr{Nothing}, Cstring), writer, filelabel)
    return writer
end

"""
    write_data_file(filename::AbstractString, filetype::Val, source; kwargs...)

Open `filename` for writing and write `source` to it in the format selected by
`filetype`, forwarding `kwargs` to the `IO`-based method. Returns `nothing`.

The `do`-block form of `open` closes the file even if writing throws, so a failed
write no longer leaks the file handle.
"""
function write_data_file(filename::AbstractString, filetype::Val, source; kwargs...)
    open(filename, "w") do io
        write_data_file(filetype, io, source; kwargs...)
    end
    return nothing
end


"""
    write_data_file(filetype::Val, io::IO, source; filelabel = "")

Write the table `source` to `io` in the format selected by `filetype`.

One ReadStat variable is registered per column in the `Tables.jl` schema, then every
row is written value by value through `insert_value!`. Returns `nothing`.

Throws if the schema of `source` cannot be determined, or if a column's element type
has no `readstat_column_type_and_width` method. The native writer is released in a
`finally` block, so it is freed on those error paths as well.
"""
function write_data_file(filetype::Val, io::IO, source; filelabel = "")
    writer = Writer(; filelabel = filelabel)
    try
        rows = Tables.rows(source)
        schema = Tables.schema(rows)
        if schema === nothing
            error("Could not determine table schema for data source.")
        end

        # One variable handle per column, in schema order.
        variables = map(schema.names, schema.types) do column_name, column_type
            readstat_type, storage_width = readstat_column_type_and_width(source, column_name, nonmissingtype(column_type))
            return add_variable!(writer, column_name, readstat_type, storage_width)
            # readstat_variable_set_label(variable, String(field)) TODO: label for a variable
        end

        # `io` reaches C as a raw pointer; keep it rooted while libreadstat may call back.
        GC.@preserve io begin
            readstat_begin_writing(writer, filetype, io, length(rows))

            for row in rows
                readstat_begin_row(writer)
                Tables.eachcolumn(schema, row) do val, i, name
                    insert_value!(writer, variables[i], val)
                end
                readstat_end_row(writer)
            end

            # Returns a `readstat_error_t`, i.e. a C int.
            ccall((:readstat_end_writing, libreadstat), Cint, (Ptr{Nothing},), writer)
        end
    finally
        ccall((:readstat_writer_free, libreadstat), Cvoid, (Ptr{Nothing},), writer)
    end
    return nothing
end

# `readstat_column_type_and_width(source, colname, T)` returns the tuple
# `(readstat_type, storage_width)` used to register a column whose non-missing
# element type is `T`. Only the `String` method looks at `source` and `colname`.

# Fallback: element types without a dedicated method are rejected.
readstat_column_type_and_width(_, _, other_type) = error("Cannot handle column with element type $other_type. Is this type supported by ReadStat?")
readstat_column_type_and_width(_, _, ::Type{Float64}) = READSTAT_TYPE_DOUBLE, 0
readstat_column_type_and_width(_, _, ::Type{Float32}) = READSTAT_TYPE_FLOAT, 0
readstat_column_type_and_width(_, _, ::Type{Int32}) = READSTAT_TYPE_INT32, 0
readstat_column_type_and_width(_, _, ::Type{Int16}) = READSTAT_TYPE_INT16, 0
readstat_column_type_and_width(_, _, ::Type{Int8}) = READSTAT_TYPE_CHAR, 0

# String columns: the storage width is the largest `ncodeunits` of any non-missing
# value; `missing` counts as width 0.
function readstat_column_type_and_width(source, colname, ::Type{String})
    # Go through `Tables.columns` so row-oriented sources work too, not only sources
    # that already expose their columns by name.
    col = Tables.getcolumn(Tables.columns(source), colname)
    # `mapreduce` with `init = 0` gives width 0 for an empty column instead of throwing.
    maxlen = mapreduce(max, col; init = 0) do str
        str === missing ? 0 : ncodeunits(str)
    end
    if maxlen >= 2045 # maximum length of normal strings
        return READSTAT_TYPE_LONG_STRING, 0
    else
        return READSTAT_TYPE_STRING, maxlen
    end
end

# Register a variable on `writer` by forwarding to `readstat_add_variable`;
# `width` defaults to 0 when omitted.
function add_variable!(writer, name, type, width = 0)
    return readstat_add_variable(writer, name, type, width)
end

# `insert_value!(writer, variable, value)` dispatches on the Julia type of `value`
# to pick the matching `readstat_insert_*` wrapper.

# `missing` has no payload; only the variable handle is forwarded.
function insert_value!(writer, variable, ::Missing)
    return readstat_insert_missing_value(writer, variable)
end

function insert_value!(writer, variable, value::Float64)
    return readstat_insert_double_value(writer, variable, value)
end

function insert_value!(writer, variable, value::Float32)
    return readstat_insert_float_value(writer, variable, value)
end

function insert_value!(writer, variable, value::Int32)
    return readstat_insert_int32_value(writer, variable, value)
end

function insert_value!(writer, variable, value::Int16)
    return readstat_insert_int16_value(writer, variable, value)
end

function insert_value!(writer, variable, value::Int8)
    return readstat_insert_int8_value(writer, variable, value)
end

function insert_value!(writer, variable, value::AbstractString)
    return readstat_insert_string_value(writer, variable, value)
end

# Format-specific readers: each forwards to `read_data_file` with the matching `Val` tag.

function read_dta(filename::AbstractString)
    return read_data_file(filename, Val(:dta))
end

function read_sav(filename::AbstractString)
    return read_data_file(filename, Val(:sav))
end

function read_por(filename::AbstractString)
    return read_data_file(filename, Val(:por))
end

function read_sas7bdat(filename::AbstractString)
    return read_data_file(filename, Val(:sas7bdat))
end

function read_xport(filename::AbstractString)
    return read_data_file(filename, Val(:xport))
end

# Format-specific writers: each forwards `source` and any keyword arguments to
# `write_data_file` with the matching `Val` tag.

function write_dta(filename::AbstractString, source; kwargs...)
    return write_data_file(filename, Val(:dta), source; kwargs...)
end

function write_sav(filename::AbstractString, source; kwargs...)
    return write_data_file(filename, Val(:sav), source; kwargs...)
end

function write_por(filename::AbstractString, source; kwargs...)
    return write_data_file(filename, Val(:por), source; kwargs...)
end

function write_sas7bdat(filename::AbstractString, source; kwargs...)
    return write_data_file(filename, Val(:sas7bdat), source; kwargs...)
end

function write_xport(filename::AbstractString, source; kwargs...)
    return write_data_file(filename, Val(:xport), source; kwargs...)
end

end #module ReadStat
89 changes: 71 additions & 18 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,75 @@ using ReadStat
using DataValues
using Test

@testset "ReadStat: $ext files" for (reader, ext) in
((read_dta, "dta"),
(read_sav, "sav"),
(read_sas7bdat, "sas7bdat"),
(read_xport, "xpt"))

dtafile = joinpath(dirname(@__FILE__), "types.$ext")
rsdf = reader(dtafile)
data = rsdf.data

@test length(data) == 6
@test rsdf.headers == [:vfloat, :vdouble, :vlong, :vint, :vbyte, :vstring]
@test data[1] == DataValueArray{Float32}([3.14, 7., NA])
@test data[2] == DataValueArray{Float64}([3.14, 7., NA])
@test data[3] == DataValueArray{Int32}([2, 7, NA])
@test data[4] == DataValueArray{Int16}([2, 7, NA])
@test data[5] == DataValueArray{Int8}([2, 7., NA])
@test data[6] == DataValueArray{String}(["2", "7", ""])
# Scratch directory for files produced by the write tests; recreated empty on each run.
testdir = joinpath(@__DIR__, "write_tests")
# `force = true` makes removal a no-op when the path does not exist, which replaces
# the separate `isdir` check.
rm(testdir; force = true, recursive = true)
mkpath(testdir)

# Top-level test set. The nested test set below runs once per (reader, writer, extension)
# triple; `read_por`/`write_por` are not part of this list.
@testset "ReadStat" begin
@testset "$ext files" for (reader, writer, ext) in
((read_dta, write_dta, "dta"),
(read_sav, write_sav, "sav"),
(read_sas7bdat, write_sas7bdat, "sas7bdat"),
(read_xport, write_xport, "xpt"))

# Read the `types.$ext` fixture located next to this test file and compare headers
# and column contents against fixed expected values.
@testset "Reading" begin
dtafile = joinpath(@__DIR__, "types.$ext")
rsdf = reader(dtafile)
data = rsdf.data

@test length(data) == 6
@test rsdf.headers == [:vfloat, :vdouble, :vlong, :vint, :vbyte, :vstring]
@test data[1] == DataValueArray{Float32}([3.14, 7., NA])
@test data[2] == DataValueArray{Float64}([3.14, 7., NA])
@test data[3] == DataValueArray{Int32}([2, 7, NA])
@test data[4] == DataValueArray{Int16}([2, 7, NA])
@test data[5] == DataValueArray{Int8}([2, 7., NA])
@test data[6] == DataValueArray{String}(["2", "7", ""])
end

# Round trip: write a NamedTuple of columns (each ending in `missing`) into `testdir`,
# read the file back, and compare column count, headers, and values.
@testset "Writing" begin
data = (
vdouble = [3.14, 7., missing],
vfloat = [3.14f0, 7.f0, missing],
vint32 = [Int32(2), Int32(7), missing],
vint16 = [Int16(2), Int16(7), missing],
vint8 = [Int8(2), Int8(7), missing],
vstring = ["2", "7", missing],
)
filepath = joinpath(testdir, "testwrite.$ext")
writer(filepath, data)
rsdf = reader(filepath)
data_read = rsdf.data
@test length(data_read) == length(data)
@test rsdf.headers == collect(keys(data))

# `same_value(a, b)` compares a value read back (`a`) with the value written (`b`),
# using `==` on the unwrapped value rather than `===`.
same_value(a::DataValue, b) = a.hasvalue && get(a) == b # SAS and SPSS only support Float64 and String, so we can't test ===
same_value(a::DataValue, b::Missing) = !a.hasvalue
# missing String appears to be read back in as the empty string ""
same_value(a::DataValue{String}, b::Missing) = a.hasvalue && get(a) == ""

# Every column, and every element within it, must satisfy `same_value`.
@test all(zip(data_read, values(data))) do (col_read, col)
all(Base.splat(same_value), zip(col_read, col))
end
end

# A 2046-code-unit string is at or above the 2045 threshold that
# `readstat_column_type_and_width` uses to switch to the long-string type; the
# round trip is recorded as a known failure with `@test_broken`.
@testset "Long string" begin
data = (x = ["a" ^ 2046, missing],)
filepath = joinpath(testdir, "testwrite_longstring.$ext")
writer(filepath, data)
rsdf = reader(filepath)
data_read = rsdf.data
@test_broken get(data_read[1][1]) == "a" ^ 2046
end

# The `filelabel` keyword passed to the writer must come back as `rsdf.filelabel`.
@testset "File metadata" begin
data = (a = Int32[1, 2, 3],)
filepath = joinpath(testdir, "testwrite_file_metadata.$ext")
writer(filepath, data; filelabel = "Test label")
rsdf = reader(filepath)
@test rsdf.filelabel == "Test label"
end
end
end