Merge pull request #27 from jaakkor2/compactsubtype

jaakkor2 · web-flow · commit 26390858982d · 2025-06-16T22:49:33.000+03:00
Implement reading of compact subtype character format
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "JMPReader"
 uuid = "d9f7e686-cf87-4d12-8d7a-0e9b8c9fba29"
 authors = ["Jaakko Ruohio <jaakkor2@gmail.com>"]
-version = "0.1.15"
+version = "0.1.16"
 
 [deps]
 CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
diff --git a/docs/src/dev.md b/docs/src/dev.md
@@ -11,11 +11,15 @@ For example,
 ```julia
 JMPReader.scandir(joinpath(pathof(JMPReader), "..", "..", "test"))
 ```
-reads 12 JMP-files, and
+reads 12 JMP-files,
 ```julia
 JMPReader.scandir(raw"C:\Program Files\SAS\JMPPRO\17\Samples\Data")
 ```
-reads successfully 605 JMP-files.
+successfully reads 605 JMP-files, and
+```julia
+JMPReader.scandir(raw"C:\Program Files\JMP\JMPPRO\18\Samples\Data")
+```
+successfully reads 612 JMP-files.
 
 ## Looking into the binary .jmp file
 
diff --git a/src/column.jl b/src/column.jl
@@ -132,7 +132,7 @@ function column_data(io, info, i::Int, deflatebuffer::Vector{UInt8})
     end
 
     # character
-    if dt1 in [0x02, 0x09] && dt2 in  [0x01, 0x02]
+    if dt1 in [0x02, 0x09] && dt2 in [0x01, 0x02]
 
         # constant width
         if ([dt3, dt4] == [0x00, 0x00] && dt5 > 0) ||
@@ -146,15 +146,37 @@ function column_data(io, info, i::Int, deflatebuffer::Vector{UInt8})
         # variable width
         if [dt3, dt4, dt5] == [0x00, 0x00, 0x00]
             if dt1 == 0x09 # compressed
-                widthbytes = a[9]
-                if widthbytes == 1
-                    widths = reinterpret(Int8, @view a[13 .+ (1:info.nrows)])
-                    data = a[13 + info.nrows + 1:end]
-                elseif widthbytes == 2
-                    widths = reinterpret(Int16, @view a[13 .+ (1:2*info.nrows)])
-                    data = a[13 + 2*info.nrows + 1:end]
+                if reinterpret(Int64, a[1:8])[1] == length(a) # pooled data
+                    io2 = IOBuffer(a)
+                    reclen = read(io2, Int64)
+                    foo = read(io2, 9)
+                    foo2 = read(io2, Int64)
+                    # indices to pool
+                    wb = read(io2, Int8)
+                    T = wb == 1 ? Int8 : Int16
+                    idx = [read(io2, T) for _ in 1:info.nrows]
+                    # pool
+                    npools = maximum(idx)
+                    wb = read(io2, Int8)
+                    T = wb == 1 ? Int8 : wb == 2 ? Int16 : error()
+                    pool = []
+                    for i = 1:npools
+                        n = read(io2, T)
+                        push!(pool, String(read(io2, n)))
+                    end
+                    str = [idx == 0 ? "" : pool[idx] for idx in idx]
+                    return str
                 else
-                    throw(ErrorException("Unknown `widthbytes=$widthbytes`, some offset is wrong somewhere, column i=$i"))
+                    widthbytes = a[9]
+                    if widthbytes == 1
+                        widths = reinterpret(Int8, @view a[13 .+ (1:info.nrows)])
+                        data = a[13 + info.nrows + 1:end]
+                    elseif widthbytes == 2
+                        widths = reinterpret(Int16, @view a[13 .+ (1:2*info.nrows)])
+                        data = a[13 + 2*info.nrows + 1:end]
+                    else
+                        throw(ErrorException("Unknown `widthbytes=$widthbytes`, some offset is wrong somewhere, column i=$i"))
+                    end
                 end
             else # uncompressed
                 # continue after dt1,...,dt6 were read
diff --git a/src/constants.jl b/src/constants.jl
@@ -8,9 +8,6 @@ const GZIP_SECTION_START = [0xef, 0xbe, 0xfe, 0xca] # cafebeef
 # JMP uses 1904 date system
 const JMP_STARTDATE = DateTime(1904, 1, 1)
 
-# offset for number of rows
-const OFFSET_NROWS = 368
-
 # row state
 const rowstatemarkers = [
     '•', '+', 'X', '□',
diff --git a/src/metadata.jl b/src/metadata.jl
@@ -1,5 +1,9 @@
 function metadata(io)
-    seek(io, OFFSET_NROWS)
+    seekstart(io)
+    seq = [0x07, 0x00, 0x08, 0x00, 0x00, 0x00]
+    readuntil(io, seq)
+    seek(io, position(io) - 38)
+
     nrows = read(io, Int64)
     ncols = read(io, Int32)
     foo1 = read_reals(io, Int16, 5) # ??
diff --git a/test/compact.jmp b/test/compact.jmp
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -129,4 +129,17 @@ end
     @test df.rowstate3[2].marker == '▲'
     @test df.rowstate3[3].marker == 'ꙮ'
     @test df.rowstate2[3].color == RGB{N0f8}(0.753,0.753,0.753)
+end
+
+@testset "compact subtype" begin  # checks the character columns read from the compact.jmp fixture
+    df = readjmp("compact.jmp")
+    data = ["aa", "b", "ccc", "dd", "dd"]  # expected rows; the last two are identical
+    @test df.normalsubtype == data
+    @test df.compactsubtype == data  # must equal the same expected vector as normalsubtype
+    @test df.longcompact[1] == repeat('x', 254)
+    @test df.longcompact[2] == repeat('y', 255)
+    @test isempty(df.longcompact[3])  # row 3 is expected to read back as an empty string
+    z = repeat('z', 256)  # NOTE(review): lengths 254/255/256 presumably probe a one-byte length limit — confirm
+    @test df.longcompact[4] == z
+    @test df.longcompact[5] == z  # rows 4 and 5 are expected to hold the same value
 end