diff --git a/.gitignore b/.gitignore
index 369ae03..9df679a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ deps/build.log
 *.jl.mem
 Manifest.toml
 .vscode
+test/write_tests/
diff --git a/Project.toml b/Project.toml
index 761d41d..6ac772b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -3,17 +3,19 @@ uuid = "d71aba96-b539-5138-91ee-935c3ee1374c"
 version = "1.1.2-DEV"
 
 [deps]
-Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 DataValues = "e7dc6d0d-1eca-5fa6-8ad6-5aecde8b7ea5"
+Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 ReadStat_jll = "a4dc8951-f1cc-5499-9034-9ec1c3e64557"
-
-[extras]
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
 [compat]
-julia = "1.3"
 DataValues = "0.4.13"
 ReadStat_jll = "1.1.1"
+Tables = "1"
+julia = "1.3"
+
+[extras]
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
 test = ["Test"]
diff --git a/src/C_interface.jl b/src/C_interface.jl
index 4880bcd..c486ff9 100644
--- a/src/C_interface.jl
+++ b/src/C_interface.jl
@@ -20,7 +20,7 @@ function readstat_get_var_count(metadata::Ptr{Nothing})
 end
 
 function readstat_value_is_missing(value::ReadStatValue, variable::Ptr{Nothing})
-    return Bool(ccall((:readstat_value_is_missing, libreadstat), Cint, (ReadStatValue,Ptr{Nothing}), value, variable))
+    return Bool(ccall((:readstat_value_is_missing, libreadstat), Cint, (ReadStatValue, Ptr{Nothing}), value, variable))
 end
 
 function readstat_variable_get_index(variable::Ptr{Nothing})
@@ -78,3 +78,69 @@ end
 function readstat_variable_get_missing_ranges_count(variable::Ptr{Nothing})
     return ccall((:readstat_variable_get_missing_ranges_count, libreadstat), Cint, (Ptr{Nothing},), variable)
 end
+
+# Writer bindings. ReadStat functions that return `readstat_error_t` are declared with
+# `Cint`; a result of 0 means success.
+
+function readstat_begin_row(writer)
+    return ccall((:readstat_begin_row, libreadstat), Cint, (Ptr{Nothing},), writer)
+end
+
+function readstat_end_row(writer)
+    return ccall((:readstat_end_row, libreadstat), Cint, (Ptr{Nothing},), writer)
+end
+
+# `row_count` is a C `long` in readstat.h. `io` is passed as the opaque user context, so
+# the caller must keep it alive (e.g. with `GC.@preserve`) while the writer is in use.
+function readstat_begin_writing(writer, filetype::Val{:dta}, io, row_count)
+    return ccall((:readstat_begin_writing_dta, libreadstat), Cint, (Ptr{Nothing}, Ptr{Nothing}, Clong), writer, pointer_from_objref(io), row_count)
+end
+
+function readstat_begin_writing(writer, filetype::Val{:sav}, io, row_count)
+    return ccall((:readstat_begin_writing_sav, libreadstat), Cint, (Ptr{Nothing}, Ptr{Nothing}, Clong), writer, pointer_from_objref(io), row_count)
+end
+
+function readstat_begin_writing(writer, filetype::Val{:por}, io, row_count)
+    return ccall((:readstat_begin_writing_por, libreadstat), Cint, (Ptr{Nothing}, Ptr{Nothing}, Clong), writer, pointer_from_objref(io), row_count)
+end
+
+function readstat_begin_writing(writer, filetype::Val{:sas7bdat}, io, row_count)
+    return ccall((:readstat_begin_writing_sas7bdat, libreadstat), Cint, (Ptr{Nothing}, Ptr{Nothing}, Clong), writer, pointer_from_objref(io), row_count)
+end
+
+function readstat_begin_writing(writer, filetype::Val{:xport}, io, row_count)
+    return ccall((:readstat_begin_writing_xport, libreadstat), Cint, (Ptr{Nothing}, Ptr{Nothing}, Clong), writer, pointer_from_objref(io), row_count)
+end
+
+function readstat_insert_double_value(writer, variable, value)
+    return ccall((:readstat_insert_double_value, libreadstat), Cint, (Ptr{Nothing}, Ptr{Nothing}, Cdouble), writer, variable, value)
+end
+
+function readstat_insert_float_value(writer, variable, value)
+    return ccall((:readstat_insert_float_value, libreadstat), Cint, (Ptr{Nothing}, Ptr{Nothing}, Cfloat), writer, variable, value)
+end
+
+function readstat_insert_int32_value(writer, variable, value)
+    return ccall((:readstat_insert_int32_value, libreadstat), Cint, (Ptr{Nothing}, Ptr{Nothing}, Int32), writer, variable, value)
+end
+
+function readstat_insert_int16_value(writer, variable, value)
+    return ccall((:readstat_insert_int16_value, libreadstat), Cint, (Ptr{Nothing}, Ptr{Nothing}, Int16), writer, variable, value)
+end
+
+function readstat_insert_int8_value(writer, variable, value)
+    return ccall((:readstat_insert_int8_value, libreadstat), Cint, (Ptr{Nothing}, Ptr{Nothing}, Int8), writer, variable, value)
+end
+
+function readstat_insert_string_value(writer, variable, value)
+    return ccall((:readstat_insert_string_value, libreadstat), Cint, (Ptr{Nothing}, Ptr{Nothing}, Cstring), writer, variable, value)
+end
+
+function readstat_insert_missing_value(writer, variable)
+    return ccall((:readstat_insert_missing_value, libreadstat), Cint, (Ptr{Nothing}, Ptr{Nothing}), writer, variable)
+end
+
+# `width` is the `size_t` storage width in readstat.h; returns the new variable handle.
+function readstat_add_variable(writer, name, type, width)
+    return ccall((:readstat_add_variable, libreadstat), Ptr{Nothing}, (Ptr{Nothing}, Cstring, Cint, Csize_t), writer, name, type, width)
+end
diff --git a/src/ReadStat.jl b/src/ReadStat.jl
index a0e5e1e..5de462a 100644
--- a/src/ReadStat.jl
+++ b/src/ReadStat.jl
@@ -11,8 +11,9 @@ using ReadStat_jll
 using DataValues: DataValueVector
 import DataValues
 using Dates
+import Tables
 
-export ReadStatDataFrame, read_dta, read_sav, read_por, read_sas7bdat, read_xport
+export ReadStatDataFrame, read_dta, read_sav, read_por, read_sas7bdat, read_xport, write_dta, write_sav, write_por, write_sas7bdat, write_xport
 
 ##############################################################################
 ##
@@ -287,10 +288,123 @@ function parse_data_file!(ds::ReadStatDataFrame, parser::Ptr{Nothing}, filename:
     retval == 0 || error("Error parsing $filename: $(error_message(retval))")
 end
 
+# Callback registered with ReadStat as its `readstat_data_writer`. The C prototype is
+# `ssize_t (*)(const void *data, size_t len, void *ctx)`, so the length and the result
+# are pointer-sized (`Csize_t`/`Cssize_t`) rather than `Cint`.
+function handle_write!(data::Ptr{UInt8}, len::Csize_t, ctx::Ptr{Nothing})
+    io = unsafe_pointer_to_objref(ctx)::IO # restore io
+    # Copy straight from the C buffer and report the number of bytes written.
+    return Cssize_t(unsafe_write(io, data, len))
+end
+
+# Throw if a ReadStat writer function returned a non-zero error code.
+function check_write_error(retval::Integer)
+    retval == 0 || error("Error writing data: $(error_message(Int(retval)))")
+    return nothing
+end
+
+# Create a ReadStat writer that sends its output through `handle_write!` and carries the
+# file label `filelabel`. The caller must release it with `readstat_writer_free`.
+function Writer(; filelabel)
+    writer = ccall((:readstat_writer_init, libreadstat), Ptr{Nothing}, ())
+    writer == C_NULL && error("Could not allocate a ReadStat writer.")
+    write_bytes = @cfunction(handle_write!, Cssize_t, (Ptr{UInt8}, Csize_t, Ptr{Nothing}))
+    ccall((:readstat_set_data_writer, libreadstat), Cint, (Ptr{Nothing}, Ptr{Nothing}), writer, write_bytes)
+    ccall((:readstat_writer_set_file_label, libreadstat), Cvoid, (Ptr{Nothing}, Cstring), writer, filelabel)
+    return writer
+end
+
+# Write `source` to the file `filename`; keyword arguments are passed to the `IO` method.
+function write_data_file(filename::AbstractString, filetype::Val, source; kwargs...)
+    # The do-block form closes the file even when writing throws.
+    open(filename, "w") do io
+        write_data_file(filetype, io, source; kwargs...)
+    end
+    return nothing
+end
+
+# Write the Tables.jl-compatible `source` to `io` in the format selected by `filetype`
+# (one of the `Val`s accepted by `readstat_begin_writing`), labelled with `filelabel`.
+function write_data_file(filetype::Val, io::IO, source; filelabel = "")
+    rows = Tables.rows(source)
+    schema = Tables.schema(rows)
+    if schema === nothing
+        error("Could not determine table schema for data source.")
+    end
+
+    writer = Writer(; filelabel = filelabel)
+    try
+        # `io` is handed to C as a raw pointer, so keep it rooted until writing is done.
+        GC.@preserve io begin
+            variables = map(schema.names, schema.types) do column_name, column_type
+                readstat_type, storage_width = readstat_column_type_and_width(source, column_name, nonmissingtype(column_type))
+                # readstat_variable_set_label(variable, String(field)) TODO: label for a variable
+                return add_variable!(writer, column_name, readstat_type, storage_width)
+            end
+
+            check_write_error(readstat_begin_writing(writer, filetype, io, length(rows)))
+
+            for row in rows
+                check_write_error(readstat_begin_row(writer))
+                Tables.eachcolumn(schema, row) do val, i, name
+                    check_write_error(insert_value!(writer, variables[i], val))
+                end
+                check_write_error(readstat_end_row(writer))
+            end
+
+            check_write_error(ccall((:readstat_end_writing, libreadstat), Cint, (Ptr{Nothing},), writer))
+        end
+    finally
+        # Release the native writer even if one of the steps above failed.
+        ccall((:readstat_writer_free, libreadstat), Cvoid, (Ptr{Nothing},), writer)
+    end
+    return nothing
+end
+
+# Map a column's non-missing element type to its ReadStat type and storage width.
+# Only string columns report a width: the byte length of their longest value.
+readstat_column_type_and_width(_, _, other_type) = error("Cannot handle column with element type $other_type. Is this type supported by ReadStat?")
+readstat_column_type_and_width(_, _, ::Type{Float64}) = READSTAT_TYPE_DOUBLE, 0
+readstat_column_type_and_width(_, _, ::Type{Float32}) = READSTAT_TYPE_FLOAT, 0
+readstat_column_type_and_width(_, _, ::Type{Int32}) = READSTAT_TYPE_INT32, 0
+readstat_column_type_and_width(_, _, ::Type{Int16}) = READSTAT_TYPE_INT16, 0
+readstat_column_type_and_width(_, _, ::Type{Int8}) = READSTAT_TYPE_CHAR, 0
+function readstat_column_type_and_width(source, colname, ::Type{String})
+    col = Tables.getcolumn(Tables.columns(source), colname)
+    # `init = 0` gives empty columns a width of 0 instead of throwing.
+    maxlen = mapreduce(max, col; init = 0) do str
+        str === missing ? 0 : ncodeunits(str)
+    end
+    if maxlen > 2045 # maximum length of normal strings
+        # TODO: long strings are not written correctly yet (see the "Long string" test).
+        return READSTAT_TYPE_LONG_STRING, 0
+    else
+        return READSTAT_TYPE_STRING, maxlen
+    end
+end
+
+add_variable!(writer, name, type, width = 0) = readstat_add_variable(writer, name, type, width)
+
+insert_value!(writer, variable, value::Float64) = readstat_insert_double_value(writer, variable, value)
+insert_value!(writer, variable, value::Float32) = readstat_insert_float_value(writer, variable, value)
+insert_value!(writer, variable, ::Missing) = readstat_insert_missing_value(writer, variable)
+insert_value!(writer, variable, value::Int8) = readstat_insert_int8_value(writer, variable, value)
+insert_value!(writer, variable, value::Int16) = readstat_insert_int16_value(writer, variable, value)
+insert_value!(writer, variable, value::Int32) = readstat_insert_int32_value(writer, variable, value)
+insert_value!(writer, variable, value::AbstractString) = readstat_insert_string_value(writer, variable, value)
+
 read_dta(filename::AbstractString) = read_data_file(filename, Val(:dta))
 read_sav(filename::AbstractString) = read_data_file(filename, Val(:sav))
 read_por(filename::AbstractString) = read_data_file(filename, Val(:por))
 read_sas7bdat(filename::AbstractString) = read_data_file(filename, Val(:sas7bdat))
 read_xport(filename::AbstractString) = read_data_file(filename, Val(:xport))
 
+# Each `write_*` function writes the Tables.jl-compatible `source` to `filename` in the
+# matching format; keyword arguments (currently `filelabel`) go to `write_data_file`.
+write_dta(filename::AbstractString, source; kwargs...) = write_data_file(filename, Val(:dta), source; kwargs...)
+write_sav(filename::AbstractString, source; kwargs...) = write_data_file(filename, Val(:sav), source; kwargs...)
+write_por(filename::AbstractString, source; kwargs...) = write_data_file(filename, Val(:por), source; kwargs...)
+write_sas7bdat(filename::AbstractString, source; kwargs...) = write_data_file(filename, Val(:sas7bdat), source; kwargs...)
+write_xport(filename::AbstractString, source; kwargs...) = write_data_file(filename, Val(:xport), source; kwargs...)
+
 end #module ReadStat
diff --git a/test/runtests.jl b/test/runtests.jl
index dbc10b0..09344fa 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -2,22 +2,77 @@ using ReadStat
 using DataValues
 using Test
 
-@testset "ReadStat: $ext files" for (reader, ext) in
-    ((read_dta, "dta"),
-     (read_sav, "sav"),
-     (read_sas7bdat, "sas7bdat"),
-     (read_xport, "xpt"))
-
-    dtafile = joinpath(dirname(@__FILE__), "types.$ext")
-    rsdf = reader(dtafile)
-    data = rsdf.data
-
-    @test length(data) == 6
-    @test rsdf.headers == [:vfloat, :vdouble, :vlong, :vint, :vbyte, :vstring]
-    @test data[1] == DataValueArray{Float32}([3.14, 7., NA])
-    @test data[2] == DataValueArray{Float64}([3.14, 7., NA])
-    @test data[3] == DataValueArray{Int32}([2, 7, NA])
-    @test data[4] == DataValueArray{Int16}([2, 7, NA])
-    @test data[5] == DataValueArray{Int8}([2, 7., NA])
-    @test data[6] == DataValueArray{String}(["2", "7", ""])
+testdir = joinpath(@__DIR__, "write_tests")
+if isdir(testdir)
+    rm(testdir, recursive = true)
+end
+mkdir(testdir)
+
+@testset "ReadStat" begin
+    @testset "$ext files" for (reader, writer, ext) in
+        ((read_dta, write_dta, "dta"),
+         (read_sav, write_sav, "sav"),
+         (read_sas7bdat, write_sas7bdat, "sas7bdat"),
+         (read_xport, write_xport, "xpt"))
+
+        @testset "Reading" begin
+            dtafile = joinpath(@__DIR__, "types.$ext")
+            rsdf = reader(dtafile)
+            data = rsdf.data
+
+            @test length(data) == 6
+            @test rsdf.headers == [:vfloat, :vdouble, :vlong, :vint, :vbyte, :vstring]
+            @test data[1] == DataValueArray{Float32}([3.14, 7., NA])
+            @test data[2] == DataValueArray{Float64}([3.14, 7., NA])
+            @test data[3] == DataValueArray{Int32}([2, 7, NA])
+            @test data[4] == DataValueArray{Int16}([2, 7, NA])
+            @test data[5] == DataValueArray{Int8}([2, 7., NA])
+            @test data[6] == DataValueArray{String}(["2", "7", ""])
+        end
+
+        @testset "Writing" begin
+            data = (
+                vdouble = [3.14, 7., missing],
+                vfloat = [3.14f0, 7.f0, missing],
+                vint32 = [Int32(2), Int32(7), missing],
+                vint16 = [Int16(2), Int16(7), missing],
+                vint8 = [Int8(2), Int8(7), missing],
+                vstring = ["2", "7", missing],
+            )
+            filepath = joinpath(testdir, "testwrite.$ext")
+            writer(filepath, data)
+            rsdf = reader(filepath)
+            data_read = rsdf.data
+            @test length(data_read) == length(data)
+            @test rsdf.headers == collect(keys(data))
+
+            same_value(a::DataValue, b) = a.hasvalue && get(a) == b # SAS and SPSS only support Float64 and String, so we can't test ===
+            same_value(a::DataValue, b::Missing) = !a.hasvalue
+            # missing String appears to be read back in as the empty string ""
+            same_value(a::DataValue{String}, b::Missing) = a.hasvalue && get(a) == ""
+
+            @test all(zip(data_read, values(data))) do (col_read, col)
+                all(Base.splat(same_value), zip(col_read, col))
+            end
+        end
+
+        @testset "Long string" begin
+            data = (x = ["a" ^ 2046, missing],)
+            filepath = joinpath(testdir, "testwrite_longstring.$ext")
+            # Long strings are not supported yet. The write is inside `@test_broken`
+            # because a failing writer call now raises instead of being ignored.
+            @test_broken begin
+                writer(filepath, data)
+                get(reader(filepath).data[1][1]) == "a" ^ 2046
+            end
+        end
+
+        @testset "File metadata" begin
+            data = (a = Int32[1, 2, 3],)
+            filepath = joinpath(testdir, "testwrite_file_metadata.$ext")
+            writer(filepath, data; filelabel = "Test label")
+            rsdf = reader(filepath)
+            @test rsdf.filelabel == "Test label"
+        end
+    end
 end