diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 33810c5..2603a81 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -35,4 +35,4 @@ jobs: - name: build run: go build ./... - name: test - run: go test ./... + run: go test ./... \ No newline at end of file diff --git a/cpg.go b/cpg.go new file mode 100644 index 0000000..6ee44ae --- /dev/null +++ b/cpg.go @@ -0,0 +1,44 @@ +package shapefile + +import ( + "archive/zip" + "fmt" + "io" + "strings" + + "golang.org/x/net/html/charset" +) + +// A CPG is a .cpg file. +type CPG struct { + Charset string +} + +// ReadCPG reads a CPG from an io.Reader. +func ReadCPG(r io.Reader, _ int64) (*CPG, error) { + data, err := io.ReadAll(r) + if err != nil { + return nil, err + } + enc, name := charset.Lookup(strings.ToLower(string(data))) + if enc == nil { + return nil, fmt.Errorf("unknown charset '%s'", string(data)) + } + return &CPG{ + Charset: name, + }, nil +} + +// ReadCPGZipFile reads a CPG from a *zip.File. +func ReadCPGZipFile(zipFile *zip.File) (*CPG, error) { + readCloser, err := zipFile.Open() + if err != nil { + return nil, err + } + defer readCloser.Close() + cpg, err := ReadCPG(readCloser, int64(zipFile.UncompressedSize64)) + if err != nil { + return nil, fmt.Errorf("%s: %w", zipFile.Name, err) + } + return cpg, nil +} diff --git a/dbf.go b/dbf.go index fecab76..6191fa4 100644 --- a/dbf.go +++ b/dbf.go @@ -2,7 +2,7 @@ package shapefile // FIXME support dBase version 7 files if needed, see https://www.dbase.com/Knowledgebase/INT/db7_file_fmt.htm // FIXME work through https://www.clicketyclick.dk/databases/xbase/format/dbf.html and add any missing features -// FIXME add unmarshaller that unmarshals a record into a Go struct with `dbf:"..."` tags?s +// FIXME add unmarshaler that unmarshals a record into a Go struct with `dbf:"..."` tags? // FIXME validate logical implementation // FIXME add support for memos @@ -17,6 +17,8 @@ import ( "strings" "time" + "golang.org/x/net/html/charset" + "golang.org/x/text/encoding" "golang.org/x/text/encoding/charmap" ) @@ -27,12 +29,12 @@ const ( var ( knownFieldTypes = map[byte]struct{}{ - 'C': {}, - 'D': {}, - 'F': {}, - 'L': {}, - 'M': {}, - 'N': {}, + 'C': {}, // Character + 'D': {}, // Date + 'F': {}, // Floating point binary numeric + 'L': {}, // Logical + 'M': {}, // Memo + 'N': {}, // Numeric } knownLogicalValues = map[byte]any{ @@ -46,8 +48,6 @@ var ( 't': true, 'y': true, } - - iso8859_1Decoder = charmap.ISO8859_1.NewDecoder() ) // A DBFHeader is a DBF header. @@ -86,6 +86,7 @@ type ReadDBFOptions struct { MaxHeaderSize int MaxRecordSize int MaxRecords int + Charset string } // A DBFMemo is a DBF memo.
@@ -145,6 +146,16 @@ func ReadDBF(r io.Reader, _ int64, options *ReadDBFOptions) (*DBF, error) { return nil, errors.New("invalid total length of fields") } + var decoder *encoding.Decoder + if options != nil && options.Charset != "" { + enc, _ := charset.Lookup(options.Charset) + if enc == nil { + return nil, fmt.Errorf("unknown charset '%s'", options.Charset) + } + decoder = enc.NewDecoder() + } else { + decoder = charmap.ISO8859_1.NewDecoder() + } records := make([][]any, 0, header.Records) for i := 0; i < header.Records; i++ { recordData := make([]byte, header.RecordSize) @@ -158,7 +169,7 @@ func ReadDBF(r io.Reader, _ int64, options *ReadDBFOptions) (*DBF, error) { for _, fieldDescriptor := range fieldDescriptors { fieldData := recordData[offset : offset+fieldDescriptor.Length] offset += fieldDescriptor.Length - field, err := fieldDescriptor.ParseRecord(fieldData) + field, err := fieldDescriptor.ParseRecord(fieldData, decoder) if err != nil { return nil, fmt.Errorf("field %s: %w", fieldDescriptor.Name, err) } @@ -267,10 +278,10 @@ func (d *DBF) Record(i int) map[string]any { } // ParseRecord parses a record from data. -func (d *DBFFieldDescriptor) ParseRecord(data []byte) (any, error) { +func (d *DBFFieldDescriptor) ParseRecord(data []byte, decoder *encoding.Decoder) (any, error) { switch d.Type { case 'C': - return parseCharacter(data) + return parseCharacter(data, decoder) case 'D': return parseDate(data) case 'F': @@ -296,8 +307,11 @@ func TrimTrailingZeros(data []byte) []byte { return nil } -func parseCharacter(data []byte) (string, error) { - return iso8859_1Decoder.String(string(bytes.TrimSpace(TrimTrailingZeros(data)))) +func parseCharacter(data []byte, decoder *encoding.Decoder) (string, error) { + if decoder == nil { + return "", fmt.Errorf("decoder is nil") + } + return decoder.String(string(bytes.TrimSpace(TrimTrailingZeros(data)))) } func parseDate(data []byte) (time.Time, error) { diff --git a/dbf_test.go b/dbf_test.go index 22f5c30..70a41c8 100644 --- a/dbf_test.go +++ b/dbf_test.go @@ -11,7 +11,7 @@ import ( func FuzzReadDBF(f *testing.F) { require.NoError(f, addFuzzDataFromFS(f, os.DirFS("."), "testdata", ".dbf")) - f.Fuzz(func(t *testing.T, data []byte) { + f.Fuzz(func(_ *testing.T, data []byte) { r := bytes.NewReader(data) _, _ = ReadDBF(r, int64(len(data)), &ReadDBFOptions{ MaxHeaderSize: 4096, diff --git a/go.mod b/go.mod index e5197f2..e716e02 100644 --- a/go.mod +++ b/go.mod @@ -5,11 +5,13 @@ go 1.19 require ( github.com/stretchr/testify v1.8.1 github.com/twpayne/go-geom v1.4.4 + golang.org/x/net v0.0.0-20220722155237-a158d28d115b golang.org/x/text v0.4.0 ) require ( github.com/davecgh/go-spew v1.1.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect + golang.org/x/exp v0.0.0-20240103183307-be819d1f06fc // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index e891622..e3bda10 100644 --- a/go.sum +++ b/go.sum @@ -12,6 +12,10 @@ github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKs github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/twpayne/go-geom v1.4.4 h1:bcCPAvvNSzjmpUqR0Uqh39ClCKtPx6kZVR7EakQaVJI= github.com/twpayne/go-geom v1.4.4/go.mod h1:Kz4sX4LtdesDQgkhsMERazLlH/NiCg90s6FPaNr0KNI= +golang.org/x/exp v0.0.0-20240103183307-be819d1f06fc h1:ao2WRsKSzW6KuUY9IWPwWahcHCgR0s52IfwutMfEbdM= +golang.org/x/exp v0.0.0-20240103183307-be819d1f06fc/go.mod h1:iRJReGqOEeBhDZGkGbynYwcHlctCvnjTYIamk7uXpHI= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b 
h1:PxfKdU9lEEDYjdIzOtC4qFWgkU2rGHdKlKowJSMN9h0= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/text v0.4.0 h1:BrVqGRd7+k1DiOgtnFvAkoQEWQvBc25ouMJM6429SFg= golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= diff --git a/scanner.go b/scanner.go new file mode 100644 index 0000000..768d5f9 --- /dev/null +++ b/scanner.go @@ -0,0 +1,790 @@ +// Package shapefile reads ESRI Shapefiles. +// +// See https://support.esri.com/en/white-paper/279. +package shapefile + +// FIXME provide lazy, random access to individual records, using SHX + +import ( + "archive/zip" + "bufio" + "errors" + "fmt" + "io" + "os" + "path" + "strings" + "sync" + + "golang.org/x/exp/constraints" + "golang.org/x/net/html/charset" + "golang.org/x/text/encoding" + "golang.org/x/text/encoding/charmap" +) + +func max[T constraints.Ordered](x T, y ...T) T { //nolint: ireturn + a := x + for _, b := range y { + if a < b { + a = b + } + } + return a +} + +// A bufioReadCloser pairs a *bufio.Reader with the io.Closer of the reader it buffers. +type bufioReadCloser = struct { + *bufio.Reader + io.Closer +} + +// A Scanner reads the components of a shapefile record by record. +type Scanner struct { + scanSHP *ScannerSHP + scanDBF *ScannerDBF + scanSHX *ScannerSHX + filePRJ *PRJ + fileCPG *CPG + scanRecords int64 + estimatedRecords int64 + err error +} + +// ReadScanner reads all records from scanner and assembles them into a Shapefile. +func ReadScanner(scanner *Scanner) (*Shapefile, error) { + if scanner == nil { + return nil, nil + } + var shp *SHP + var shx *SHX + var dbf *DBF + var cpg *CPG + var prj *PRJ + + if scanner.SHPHeader() != nil { + shp = &SHP{SHxHeader: *scanner.SHPHeader()} + } + if scanner.SHxHeader() != nil { + shx = &SHX{SHxHeader: *scanner.SHxHeader()} + } + + if scanner.DBFHeader() != nil { + dbf = &DBF{DBFHeader: *scanner.DBFHeader(), FieldDescriptors: scanner.DBFFieldDescriptors()} + } + + if scanner.Projection() != "" { + prj = &PRJ{Projection: scanner.Projection()} + } + + if scanner.Charset() != "" { + cpg = &CPG{Charset: scanner.Charset()} + } + + for scanner.Next() { + recSHP, recSHX, recDBF := scanner.Scan() + if shp != nil && recSHP != nil { + shp.Records = append(shp.Records, recSHP) + } + if dbf != nil && recDBF != nil { + dbf.Records = append(dbf.Records, recDBF) + } + if shx != nil && recSHX != nil { + shx.Records = append(shx.Records, *recSHX) + } + } + + if err := scanner.Error(); err != nil && !errors.Is(err, io.EOF) { + return nil, fmt.Errorf("read scanner [%d]: %w", scanner.scanRecords, err) + } + + return &Shapefile{ + SHP: shp, + DBF: dbf, + SHX: shx, + PRJ: prj, + CPG: cpg, + }, nil +} + +// NewScannerFromBasename opens the files with the given basename and creates a Scanner. +func NewScannerFromBasename(basename string, options *ReadShapefileOptions) (*Scanner, error) { + if options == nil { + options = &ReadShapefileOptions{} + } + + readers := make(map[string]io.ReadCloser) + sizes := make(map[string]int64) + + dbfFile, dbfSize, err := openWithSize(basename + ".dbf") + switch { + case errors.Is(err, os.ErrNotExist): + // Do nothing. + case err != nil: + return nil, fmt.Errorf("%s.dbf: %w", basename, err) + default: + readers[".dbf"] = dbfFile + sizes[".dbf"] = dbfSize + } + + prjFile, prjSize, err := openWithSize(basename + ".prj") + switch { + case errors.Is(err, os.ErrNotExist): + // Do nothing.
+ case err != nil: + return nil, fmt.Errorf("%s.prj: %w", basename, err) + default: + readers[".prj"] = prjFile + sizes[".prj"] = prjSize + } + + cpgFile, cpgSize, err := openWithSize(basename + ".cpg") + switch { + case errors.Is(err, os.ErrNotExist): + // Do nothing. + case err != nil: + return nil, fmt.Errorf("%s.cpg: %w", basename, err) + default: + readers[".cpg"] = cpgFile + sizes[".cpg"] = cpgSize + } + + shxFile, shxSize, err := openWithSize(basename + ".shx") + switch { + case errors.Is(err, os.ErrNotExist): + // Do nothing. + case err != nil: + return nil, fmt.Errorf("%s.shx: %w", basename, err) + default: + readers[".shx"] = shxFile + sizes[".shx"] = shxSize + } + + shpFile, shpSize, err := openWithSize(basename + ".shp") + switch { + case errors.Is(err, os.ErrNotExist): + // Do nothing. + case err != nil: + return nil, fmt.Errorf("%s.shp: %w", basename, err) + default: + readers[".shp"] = shpFile + sizes[".shp"] = shpSize + } + + scanner, err := NewScanner(readers, sizes, options) + if err != nil { + return nil, fmt.Errorf("NewScanner: %w", err) + } + return scanner, nil +} + +// NewScannerFromZipFile opens the .zip file name and creates a Scanner. +func NewScannerFromZipFile(name string, options *ReadShapefileOptions) (*Scanner, error) { + file, err := os.Open(name) + if err != nil { + return nil, err + } + // file is deliberately not closed here: the zip.Reader reads from it lazily. + // FIXME close file when the Scanner is closed + + fileInfo, err := file.Stat() + if err != nil { + return nil, err + } + + zipReader, err := zip.NewReader(file, fileInfo.Size()) + if err != nil { + return nil, err + } + + scanner, err := NewScannerFromZipReader(zipReader, options) + if err != nil { + return nil, fmt.Errorf("%s: %w", name, err) + } + return scanner, nil +} + +// NewScannerFromZipReader creates a Scanner from a *zip.Reader. +func NewScannerFromZipReader(zipReader *zip.Reader, options *ReadShapefileOptions) (*Scanner, error) { + var dbfFiles []*zip.File + var prjFiles []*zip.File + var cpgFiles []*zip.File + var shxFiles []*zip.File + var shpFiles []*zip.File + for _, zipFile := range zipReader.File { + switch strings.ToLower(path.Ext(zipFile.Name)) { + case ".dbf": + dbfFiles = append(dbfFiles, zipFile) + case ".prj": + prjFiles = append(prjFiles, zipFile) + case ".cpg": + cpgFiles = append(cpgFiles, zipFile) + case ".shp": + shpFiles = append(shpFiles, zipFile) + case ".shx": + shxFiles = append(shxFiles, zipFile) + } + } + + readers := make(map[string]io.ReadCloser) + sizes := make(map[string]int64) + + switch len(dbfFiles) { + case 0: + // Do nothing. + case 1: + readCloser, err := dbfFiles[0].Open() + if err != nil { + return nil, err + } + readers[".dbf"] = readCloser + sizes[".dbf"] = int64(dbfFiles[0].UncompressedSize64) + default: + return nil, errors.New("too many .dbf files") + } + + switch len(prjFiles) { + case 0: + // Do nothing. + case 1: + readCloser, err := prjFiles[0].Open() + if err != nil { + return nil, err + } + readers[".prj"] = readCloser + sizes[".prj"] = int64(prjFiles[0].UncompressedSize64) + default: + return nil, errors.New("too many .prj files") + } + + switch len(cpgFiles) { + case 0: + // Do nothing. + case 1: + readCloser, err := cpgFiles[0].Open() + if err != nil { + return nil, err + } + readers[".cpg"] = readCloser + sizes[".cpg"] = int64(cpgFiles[0].UncompressedSize64) + default: + return nil, errors.New("too many .cpg files") + } + + switch len(shpFiles) { + case 0: + // Do nothing.
+ case 1: + readCloser, err := shpFiles[0].Open() + if err != nil { + return nil, err + } + readers[".shp"] = readCloser + sizes[".shp"] = int64(shpFiles[0].UncompressedSize64) + default: + return nil, errors.New("too many .shp files") + } + + switch len(shxFiles) { + case 0: + // Do nothing. + case 1: + readCloser, err := shxFiles[0].Open() + if err != nil { + return nil, err + } + readers[".shx"] = readCloser + sizes[".shx"] = int64(shxFiles[0].UncompressedSize64) + default: + return nil, errors.New("too many .shx files") + } + + scanner, err := NewScanner(readers, sizes, options) + if err != nil { + return nil, fmt.Errorf("NewScanner: %w", err) + } + return scanner, nil +} + +// NewScanner creates a new Scanner from readers and sizes, keyed by file extension. +func NewScanner( + readers map[string]io.ReadCloser, + sizes map[string]int64, + options *ReadShapefileOptions, +) (*Scanner, error) { + if options == nil { + options = &ReadShapefileOptions{} + } + + var cpg *CPG + if reader, ok := readers[".cpg"]; ok { + scanner, err := ReadCPG(reader, sizes[".cpg"]) + if err != nil { + return nil, fmt.Errorf("ReadCPG: %w", err) + } + cpg = scanner + // options is never nil here: it is defaulted above. + if options.DBF == nil { + options.DBF = &ReadDBFOptions{Charset: scanner.Charset} + } else { + options.DBF.Charset = scanner.Charset + } + } + + var prj *PRJ + if reader, ok := readers[".prj"]; ok { + scanner, err := ReadPRJ(reader, sizes[".prj"]) + if err != nil { + return nil, fmt.Errorf("ReadPRJ: %w", err) + } + prj = scanner + } + + var wg sync.WaitGroup + var scannerSHP *ScannerSHP + var scannerSHX *ScannerSHX + var scannerDBF *ScannerDBF + var estimatedSHX, estimatedDBF int64 + var errSHP, errSHX, errDBF error + + wg.Add(3) + go func() { + defer wg.Done() + if reader, ok := readers[".shp"]; ok { + scanner, err := NewScannerSHP(reader, sizes[".shp"], options.SHP) + if err != nil { + errSHP = fmt.Errorf("NewScannerSHP: %w", err) + return + } + scannerSHP = scanner + } + }() + + go func() { + defer wg.Done() + if reader, ok := readers[".dbf"]; ok { + scanner, err := NewScannerDBF(reader, options.DBF) + if err != nil { + errDBF = fmt.Errorf("NewScannerDBF: %w", err) + return + } + scannerDBF = scanner + estimatedDBF = (sizes[".dbf"] - dbfHeaderLength) / int64(scanner.header.RecordSize) + } + }() + + go func() { + defer wg.Done() + if reader, ok := readers[".shx"]; ok { + scanner, err := NewScannerSHX(reader, sizes[".shx"]) + if err != nil { + errSHX = fmt.Errorf("NewScannerSHX: %w", err) + return + } + scannerSHX = scanner + estimatedSHX = (sizes[".shx"] - headerSize) / 8 + } + }() + + wg.Wait() + if err := errors.Join(errSHP, errDBF, errSHX); err != nil { + return nil, err + } + + return &Scanner{ + scanSHP: scannerSHP, + scanSHX: scannerSHX, + scanDBF: scannerDBF, + filePRJ: prj, + fileCPG: cpg, + estimatedRecords: max(estimatedDBF, estimatedSHX), + }, nil +} + +// Scan scans the next record from each of the .shp, .dbf, and .shx files in parallel.
+func (s *Scanner) Scan() (recordSHP *SHPRecord, recordSHX *SHXRecord, recordDBF DBFRecord) { + if s.err != nil { + return nil, nil, nil + } + + var wg sync.WaitGroup + var errSHP, errSHX, errDBF error + + wg.Add(3) + go func() { + defer wg.Done() + if s.scanSHP != nil { + record, err := s.scanSHP.Scan() + if err != nil { + errSHP = fmt.Errorf("scanning SHP: %w", err) + } else { + recordSHP = record + } + } + }() + + go func() { + defer wg.Done() + if s.scanDBF != nil { + if record, err := s.scanDBF.Scan(); err != nil { + errDBF = fmt.Errorf("scanning DBF: %w", err) + } else { + recordDBF = record + } + } + }() + + go func() { + defer wg.Done() + if s.scanSHX != nil { + if record, err := s.scanSHX.Scan(); err != nil { + errSHX = fmt.Errorf("scanning SHX: %w", err) + } else { + recordSHX = record + } + } + }() + + wg.Wait() + if err := errors.Join(errSHP, errDBF, errSHX); err != nil { + s.err = err + return nil, nil, nil + } + + s.scanRecords++ + return recordSHP, recordSHX, recordDBF +} + +// Next reports whether more records might be available. +func (s *Scanner) Next() bool { + return s.err == nil +} + +// Discard discards the next n records, allowing a caller to skip ahead, e.g. to partition work across concurrent scanners. +func (s *Scanner) Discard(n int) (int, error) { + var errSHP, errSHX, errDBF error + var nSHP, nSHX, nDBF int + var wg sync.WaitGroup + wg.Add(2) + + go func() { + defer wg.Done() + if s.scanDBF != nil { + nb, err := s.scanDBF.reader.Discard(n * s.scanDBF.header.RecordSize) + if err != nil { + errDBF = err + nDBF = nb / s.scanDBF.header.RecordSize + return + } + s.scanDBF.scanRecords += n + } + }() + + go func() { + defer wg.Done() + if s.scanSHX != nil { + data, err := s.scanSHX.reader.Peek(8) + if err != nil { + errSHX = err + return + } + record := ParseSHXRecord(data) + offsetInit := record.Offset + nb, err := s.scanSHX.reader.Discard(n * 8) + if err != nil { + nSHX = nb / 8 + errSHX = err + return + } + s.scanSHX.scanRecords += n + + if s.scanSHP != nil { + data, err := s.scanSHX.reader.Peek(8) + if err != nil { + errSHX = err + return + } + record := ParseSHXRecord(data) + offsetEnd := record.Offset + nb, err := s.scanSHP.reader.Discard(offsetEnd - offsetInit) + if err != nil { + nSHP = nb / record.ContentLength + errSHP = err + return + } + s.scanSHP.scanRecords += n + } + } else if s.scanSHP != nil { + errSHP = errors.New("cannot discard .shp records without a .shx file") + return + } + }() + + wg.Wait() + if err := errors.Join(errSHP, errDBF, errSHX); err != nil { + s.err = err + return max(nSHX, nDBF, nSHP), err + } + + s.scanRecords += int64(n) + return n, nil +} + +// Close closes all of the Scanner's underlying readers. +func (s *Scanner) Close() error { + var err error + if s.scanDBF != nil { + err = errors.Join(err, s.scanDBF.reader.Close()) + } + if s.scanSHP != nil { + err = errors.Join(err, s.scanSHP.reader.Close()) + } + if s.scanSHX != nil { + err = errors.Join(err, s.scanSHX.reader.Close()) + } + return err +} + +// ScannedRecords returns the number of records scanned so far. +func (s Scanner) ScannedRecords() int64 { + return s.scanRecords +} + +// EstimatedRecords returns an estimate of the total number of records, derived from the .dbf and .shx file sizes. +func (s Scanner) EstimatedRecords() int64 { + return s.estimatedRecords +} + +// DBFHeader returns the DBF header, if any. +func (s *Scanner) DBFHeader() *DBFHeader { + if s.scanDBF != nil { + return s.scanDBF.header + } + return nil +} + +// DBFFieldDescriptors returns the DBF field descriptors, if any. +func (s *Scanner) DBFFieldDescriptors() []*DBFFieldDescriptor { + if s.scanDBF != nil { + return s.scanDBF.fieldDescriptors + } + return nil +} + +// SHPHeader returns the SHP header, if any. +func (s *Scanner) SHPHeader() *SHxHeader { + if s.scanSHP != nil { + return s.scanSHP.header + } + return nil +} + +// SHxHeader returns the SHX header, if any. +func (s *Scanner) SHxHeader() *SHxHeader { + if s.scanSHX != nil { + return s.scanSHX.header + } + return nil +} + +// Charset returns the charset read from the .cpg file, if any. +func (s Scanner) Charset() string { + if s.fileCPG != nil { + return s.fileCPG.Charset + } +
return "" } + +// Projection returns the projection read from the .prj file, if any. +func (s Scanner) Projection() string { + if s.filePRJ != nil { + return s.filePRJ.Projection + } + return "" +} + +// Error returns the first error encountered while scanning, if any. +func (s Scanner) Error() error { + return s.err +} + +// A ScannerSHP reads records from a .shp file. +type ScannerSHP struct { + reader bufioReadCloser + options *ReadSHPOptions + header *SHxHeader + scanRecords int + err error +} + +// NewScannerSHP creates a ScannerSHP that reads from reader. +func NewScannerSHP(reader io.ReadCloser, size int64, options *ReadSHPOptions) (*ScannerSHP, error) { + header, err := readSHxHeader(reader, size) + if err != nil { + return nil, err + } + return &ScannerSHP{ + reader: bufioReadCloser{bufio.NewReader(reader), reader}, + header: header, + options: options, + }, nil +} + +// Scan scans the next SHP record. +func (s *ScannerSHP) Scan() (*SHPRecord, error) { + if s.err != nil { + return nil, s.err + } + + record, err := ReadSHPRecord(s.reader, s.options) + switch { + case errors.Is(err, io.EOF): + s.err = io.EOF + return nil, s.err + case err != nil: + s.err = fmt.Errorf("record %d: %w", s.scanRecords, err) + return nil, s.err + case record.Number != s.scanRecords+1: + s.err = fmt.Errorf("record %d: invalid record number (expected %d)", s.scanRecords, record.Number) + return nil, s.err + default: + s.scanRecords++ + return record, nil + } +} + +// A ScannerSHX reads records from a .shx file. +type ScannerSHX struct { + reader bufioReadCloser + header *SHxHeader + scanRecords int + err error +} + +// NewScannerSHX creates a ScannerSHX that reads from reader. +func NewScannerSHX(reader io.ReadCloser, size int64) (*ScannerSHX, error) { + header, err := readSHxHeader(reader, size) + if err != nil { + return nil, err + } + return &ScannerSHX{ + reader: bufioReadCloser{bufio.NewReader(reader), reader}, + header: header, + }, nil +} + +// Scan scans the next SHX record. +func (s *ScannerSHX) Scan() (*SHXRecord, error) { + if s.err != nil { + return nil, s.err + } + + data := make([]byte, 8) + if err := readFull(s.reader, data); err != nil { + s.err = err + return nil, err + } + record := ParseSHXRecord(data) + s.scanRecords++ + return &record, nil +} + +// A DBFRecord is a single DBF record. +type DBFRecord = []any + +// A ScannerDBF reads records from a .dbf file. +type ScannerDBF struct { + reader bufioReadCloser + options *ReadDBFOptions + header *DBFHeader + fieldDescriptors []*DBFFieldDescriptor + decoder *encoding.Decoder + scanRecords int + err error +} + +// NewScannerDBF creates a ScannerDBF that reads from reader. +func NewScannerDBF(reader io.ReadCloser, options *ReadDBFOptions) (*ScannerDBF, error) { + headerData := make([]byte, dbfHeaderLength) + if err := readFull(reader, headerData); err != nil { + return nil, err + } + header, err := ParseDBFHeader(headerData, options) + if err != nil { + return nil, err + } + + var fieldDescriptors []*DBFFieldDescriptor + for i := 0; ; i++ { + fieldDescriptorData := make([]byte, dbfFieldDescriptorSize) + if err := readFull(reader, fieldDescriptorData[:1]); err != nil { + return nil, err + } + if fieldDescriptorData[0] == '\x0d' { + break + } + if err := readFull(reader, fieldDescriptorData[1:]); err != nil { + return nil, err + } + + name := string(TrimTrailingZeros(fieldDescriptorData[:11])) + fieldType := fieldDescriptorData[11] + if _, ok := knownFieldTypes[fieldType]; !ok { + return nil, fmt.Errorf("field %d: %d: invalid field type", i, fieldType) + } + length := int(fieldDescriptorData[16]) + workAreaID := fieldDescriptorData[20] + setFields := fieldDescriptorData[23] + + fieldDescriptor := &DBFFieldDescriptor{ + Name: name, + Type: fieldType, + Length: length, + WorkAreaID: workAreaID, + SetFields: setFields, + } + fieldDescriptors = append(fieldDescriptors, fieldDescriptor) + } + + totalLength := 0 + for _, fieldDescriptor := range fieldDescriptors { + totalLength += fieldDescriptor.Length + } + if totalLength+1 != header.RecordSize { + return nil, errors.New("invalid total length of fields") + } + + var decoder
*encoding.Decoder + if options != nil && options.Charset != "" { + enc, _ := charset.Lookup(options.Charset) + if enc == nil { + return nil, fmt.Errorf("unknown charset '%s'", options.Charset) + } + decoder = enc.NewDecoder() + } else { + decoder = charmap.ISO8859_1.NewDecoder() + } + + return &ScannerDBF{ + reader: bufioReadCloser{bufio.NewReader(reader), reader}, + options: options, + header: header, + fieldDescriptors: fieldDescriptors, + decoder: decoder, + }, nil +} + +// Scan scans the next DBF record. Deleted records are returned as a nil record with a nil error. +func (s *ScannerDBF) Scan() (DBFRecord, error) { + if s.err != nil { + return nil, s.err + } + + recordData := make([]byte, s.header.RecordSize) + if err := readFull(s.reader, recordData); err != nil { + s.err = err + return nil, s.err + } + switch recordData[0] { + case ' ': + record := make([]any, 0, len(s.fieldDescriptors)) + offset := 1 + for _, fieldDescriptor := range s.fieldDescriptors { + fieldData := recordData[offset : offset+fieldDescriptor.Length] + offset += fieldDescriptor.Length + field, err := fieldDescriptor.ParseRecord(fieldData, s.decoder) + if err != nil { + s.err = fmt.Errorf("field %s: %w", fieldDescriptor.Name, err) + return nil, s.err + } + record = append(record, field) + } + s.scanRecords++ + return record, nil + case '*': + // Deleted record. + return nil, nil + default: + s.err = fmt.Errorf("%d: invalid record flag", recordData[0]) + return nil, s.err + } +} + +// FieldDescriptors returns the DBF field descriptors. +func (s *ScannerDBF) FieldDescriptors() []*DBFFieldDescriptor { + return s.fieldDescriptors +} diff --git a/scanner_test.go b/scanner_test.go new file mode 100644 index 0000000..5d14168 --- /dev/null +++ b/scanner_test.go @@ -0,0 +1,398 @@ +package shapefile + +import ( + "math" + "path" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/twpayne/go-geom" +) + +func TestReadScanner(t *testing.T) { + for _, tc := range []struct { + skipReason string + basename string + hasDBF bool + hasPRJ bool + hasCPG bool + hasSHX bool + expectedErr string + expectedShapeType ShapeType + expectedBounds *geom.Bounds + expectedNumRecords int + expectedGeom0 geom.T + expectedDBFRecord0 []any + expectedExport any + }{ + { + basename: "line", + hasSHX: true, + expectedShapeType: ShapeTypePolyLine, + expectedBounds: geom.NewBounds(geom.XY).Set(1, 1, 5, 6), + expectedNumRecords: 1, + expectedGeom0: newGeomFromWKT(t, "MULTILINESTRING ((1 5,5 5,5 1,3 3,1 1),(3 2,2 6))"), + }, + { + basename: "linem", + expectedShapeType: ShapeTypePolyLineM, + expectedBounds: geom.NewBounds(geom.XYM).Set(1, 1, 0, 5, 6, 3), + expectedNumRecords: 1, + expectedGeom0: newGeomFromWKT(t, "MULTILINESTRING M ((1 5 0,5 5 -1E+39,5 1 3,3 3 -1E+39,1 1 0),(3 2 -1E+39,2 6 -1E+39))"), + }, + { + basename: "linez", + expectedShapeType: ShapeTypePolyLineZ, + expectedBounds: geom.NewBounds(geom.XYZM).Set(1, 1, 0, 0, 5, 9, 22, 3), + expectedNumRecords: 1, + expectedGeom0: newGeomFromWKT(t, "MULTILINESTRING ZM ((1 5 18 -1E+39,5 5 20 -1E+39,5 1 22 -1E+39,3 3 0 -1E+39,1 1 0 -1E+39),(3 2 0 -1E+39,2 6 0 -1E+39),(3 2 15 0,2 6 13 3,1 9 14 2))"), + }, + { + skipReason: "first record has number 1, not 0", + basename: "multi_polygon", + }, + { + skipReason: "multipatch is not supported", + basename: "multipatch", + expectedShapeType: ShapeTypeMultiPatch, + }, + { + basename: "multipoint", + expectedShapeType: ShapeTypeMultiPoint, + expectedBounds: geom.NewBounds(geom.XY).Set(122, 32, 124, 37), + expectedNumRecords: 1, + expectedGeom0: newGeomFromWKT(t, "MULTIPOINT ((122 37),(124 32))"), + }, + { + basename: "multipointz", + expectedShapeType: ShapeTypeMultiPointZ, +
expectedBounds: geom.NewBounds(geom.XYZM).Set(1422671.7232666016, 4188903.4295959473, 71.99445343017578, math.Inf(1), 1422672.1022949219, 4188903.7578430176, 72.00995635986328, math.Inf(-1)), + expectedNumRecords: 1, + expectedGeom0: newGeomFromWKT(t, "MULTIPOINT ZM ((1422671.7232666016 4188903.4295959473 72.00995635986328 -1E38),(1422672.1022949219 4188903.4295959473 72.0060806274414 -1E38),(1422671.9127807617 4188903.7578430176 72.00220489501953 -1E38),(1422671.9127807617 4188903.539001465 71.99445343017578 -1E38))"), + }, + { + basename: "point", + hasSHX: true, + expectedShapeType: ShapeTypePoint, + expectedBounds: geom.NewBounds(geom.XY).Set(122, 37, 122, 37), + expectedNumRecords: 1, + expectedGeom0: newGeomFromWKT(t, "POINT (122 37)"), + }, + { + skipReason: "bounds in header do not match bounds of data; record 1 has record number 0, should be 1", + basename: "pointz", + expectedShapeType: ShapeTypePointZ, + expectedBounds: geom.NewBounds(geom.XYZM).Set(1422459.0908050265, 4188942.211755641, 72.40956470558095, 0, 1422464.3681007193, 4188962.3364355816, 72.58286959604922, 0), + expectedNumRecords: 2, + expectedGeom0: newGeomFromWKT(t, "POINT ZM (1422464.3681007193 4188962.3364355816 72.40956470558095 -1e+39)"), + }, + { + basename: "polygon_hole", + hasSHX: true, + expectedShapeType: ShapeTypePolygon, + expectedBounds: geom.NewBounds(geom.XY).Set(-120, -60, 120, 60), + expectedNumRecords: 1, + expectedGeom0: newGeomFromWKT(t, "POLYGON ((-120 60,120 60,120 -60,-120 -60,-120 60),(-60 30,-60 -30,60 -30,60 30,-60 30))"), + }, + { + skipReason: "rings are not closed", + basename: "polygon", + expectedShapeType: ShapeTypePolygon, + expectedBounds: geom.NewBounds(geom.XY).Set(15, 2, 122, 37), + expectedNumRecords: 1, + expectedGeom0: newGeomFromWKT(t, "POLYGON ((122 37,117 36,115 32,118 20,113 24,122 37),(15 2,17 6,22 7,15 2),(122 37,117 36,115 32,122 37))"), + }, + { + basename: "polygonm", + expectedShapeType: ShapeTypePolygonM, + expectedBounds: geom.NewBounds(geom.XYM).Set(159374.30785312195, 5403473.287488617, 0, 160420.36722814097, 5404314.139043656, 0), + expectedNumRecords: 1, + expectedGeom0: newGeomFromWKT(t, "POLYGON M ((159814.75390576152 5404314.139043656 0,160420.36722814097 5403703.520652497 0,159374.30785312195 5403473.287488617 0,159814.753905761517 5404314.139043656 0))"), + }, + { + basename: "polygonz", + expectedShapeType: ShapeTypePolygonZ, + expectedBounds: geom.NewBounds(geom.XYZM).Set(1422691.1637959871, 4188837.293869424, 0, math.Inf(1), 1422692.1644789441, 4188838.2945523816, 0, math.Inf(-1)), + expectedNumRecords: 1, + expectedGeom0: newGeomFromWKT(t, "POLYGON ZM ((1422692.1644789441 4188837.794210903 72.46632654472523 0, 1422692.1625749937 4188837.75060327 72.46632654472523 1, 1422692.156877633 4188837.7073275167 72.46632654472523 2, 1422692.1474302218 4188837.664712999 72.46632654472523 3, 1422692.1343046608 4188837.6230840385 72.46632654472523 4, 1422692.1176008438 4188837.582757457 72.46632654472523 5, 1422692.0974458966 4188837.5440401635 72.46632654472523 6, 1422692.0739932107 4188837.5072268206 72.46632654472523 7, 1422692.047421275 4188837.4725976 72.46632654472523 8, 1422692.017932318 4188837.4404160506 72.46632654472523 9, 1422691.9857507686 4188837.4109270936 72.46632654472523 10, 1422691.951121548 4188837.384355158 72.46632654472523 11, 1422691.914308205 4188837.360902472 72.46632654472523 12, 1422691.8755909116 4188837.3407475245 72.46632654472523 13, 1422691.8352643298 4188837.3240437075 72.46632654472523 14, 1422691.7936353693 4188837.3109181467 
72.46632654472523 15, 1422691.7510208515 4188837.3014707356 72.46632654472523 16, 1422691.7077450987 4188837.295773375 72.46632654472523 17, 1422691.6641374656 4188837.293869424 72.46632654472523 18, 1422691.6205298326 4188837.295773375 72.46632654472523 19, 1422691.5772540797 4188837.3014707356 72.46632654472523 20, 1422691.534639562 4188837.3109181467 72.46632654472523 21, 1422691.4930106015 4188837.3240437075 72.46632654472523 22, 1422691.4526840197 4188837.3407475245 72.46632654472523 23, 1422691.4139667263 4188837.360902472 72.46632654472523 24, 1422691.3771533833 4188837.384355158 72.46632654472523 25, 1422691.3425241627 4188837.4109270936 72.46632654472523 26, 1422691.3103426134 4188837.4404160506 72.46632654472523 27, 1422691.2808536564 4188837.4725976 72.46632654472523 28, 1422691.2542817206 4188837.5072268206 72.46632654472523 29, 1422691.2308290347 4188837.5440401635 72.46632654472523 30, 1422691.2106740875 4188837.582757457 72.46632654472523 31, 1422691.1939702705 4188837.6230840385 72.46632654472523 32, 1422691.1808447095 4188837.664712999 72.46632654472523 33, 1422691.1713972983 4188837.7073275167 72.46632654472523 34, 1422691.1656999376 4188837.75060327 72.46632654472523 35, 1422691.1637959871 4188837.794210903 72.46632654472523 36, 1422691.1656999376 4188837.837818536 72.46632654472523 37, 1422691.1713972983 4188837.881094289 72.46632654472523 38, 1422691.1808447095 4188837.9237088067 72.46632654472523 39, 1422691.1939702705 4188837.9653377673 72.46632654472523 40, 1422691.2106740875 4188838.0056643486 72.46632654472523 41, 1422691.2308290347 4188838.0443816422 72.46632654472523 42, 1422691.2542817206 4188838.081194985 72.46632654472523 43, 1422691.2808536564 4188838.115824206 72.46632654472523 44, 1422691.3103426134 4188838.148005755 72.46632654472523 45, 1422691.3425241627 4188838.177494712 72.46632654472523 46, 1422691.3771533833 4188838.2040666477 72.46632654472523 47, 1422691.4139667263 4188838.227519334 72.46632654472523 48, 1422691.4526840197 4188838.2476742812 72.46632654472523 49, 1422691.4930106015 4188838.2643780983 72.46632654472523 50, 1422691.534639562 4188838.277503659 72.46632654472523 51, 1422691.5772540797 4188838.28695107 72.46632654472523 52, 1422691.6205298326 4188838.292648431 72.46632654472523 53, 1422691.6641374656 4188838.2945523816 72.46632654472523 54, 1422691.7077450987 4188838.292648431 72.46632654472523 55, 1422691.7510208515 4188838.28695107 72.46632654472523 56, 1422691.7936353693 4188838.277503659 72.46632654472523 57, 1422691.8352643298 4188838.2643780983 72.46632654472523 58, 1422691.8755909116 4188838.2476742812 72.46632654472523 59, 1422691.914308205 4188838.227519334 72.46632654472523 60, 1422691.951121548 4188838.2040666477 72.46632654472523 61, 1422691.9857507686 4188838.177494712 72.46632654472523 62, 1422692.017932318 4188838.148005755 72.46632654472523 63, 1422692.047421275 4188838.115824206 72.46632654472523 64, 1422692.0739932107 4188838.081194985 72.46632654472523 65, 1422692.0974458966 4188838.0443816422 72.46632654472523 66, 1422692.1176008438 4188838.0056643486 72.46632654472523 67, 1422692.1343046608 4188837.9653377673 72.46632654472523 68, 1422692.1474302218 4188837.9237088067 72.46632654472523 69, 1422692.156877633 4188837.881094289 72.46632654472523 70, 1422692.1625749937 4188837.837818536 72.46632654472523 71, 1422692.1644789441 4188837.794210903 72.46632654472523 72))"), + }, + { + basename: "poly", + hasDBF: true, + hasPRJ: true, + hasSHX: true, + expectedShapeType: ShapeTypePolygon, + expectedBounds: 
geom.NewBounds(geom.XY).Set(478315.531250, 4762880.5, 481645.312500, 4765610.5), + expectedNumRecords: 10, + expectedGeom0: newGeomFromWKT(t, "POLYGON ((479819.84375 4765180.5,479690.1875 4765259.5,479647.0 4765369.5,479730.375 4765400.5,480039.03125 4765539.5,480035.34375 4765558.5,480159.78125 4765610.5,480202.28125 4765482.0,480365.0 4765015.5,480389.6875 4764950.0,480133.96875 4764856.5,480080.28125 4764979.5,480082.96875 4765049.5,480088.8125 4765139.5,480059.90625 4765239.5,480019.71875 4765319.5,479980.21875 4765409.5,479909.875 4765370.0,479859.875 4765270.0,479819.84375 4765180.5))"), + expectedDBFRecord0: []any{215229.266, 168, "35043411"}, + expectedExport: struct { + Geometry geom.T `geom:"geometry"` + Area float32 `geom:"area"` + EAS int `geom:"eas_id"` + PRFEDEA string `geom:"prfedea"` + }{ + Geometry: newGeomFromWKT(t, "POLYGON ((479819.84375 4765180.5,479690.1875 4765259.5,479647.0 4765369.5,479730.375 4765400.5,480039.03125 4765539.5,480035.34375 4765558.5,480159.78125 4765610.5,480202.28125 4765482.0,480365.0 4765015.5,480389.6875 4764950.0,480133.96875 4764856.5,480080.28125 4764979.5,480082.96875 4765049.5,480088.8125 4765139.5,480059.90625 4765239.5,480019.71875 4765319.5,479980.21875 4765409.5,479909.875 4765370.0,479859.875 4765270.0,479819.84375 4765180.5))"), + Area: 215229.266, + EAS: 168, + PRFEDEA: "35043411", + }, + }, + } { + t.Run(tc.basename, func(t *testing.T) { + if tc.skipReason != "" { + t.Skip(tc.skipReason) + } + + t.Run("Read", func(t *testing.T) { + scanner, err := NewScannerFromBasename(path.Join("testdata", tc.basename), nil) + require.NoError(t, err) + require.NotNil(t, scanner) + shapefile, err := ReadScanner(scanner) + require.NoError(t, err) + require.NotNil(t, shapefile) + + assert.Equal(t, tc.expectedShapeType, shapefile.SHP.SHxHeader.ShapeType) + assert.Equal(t, tc.expectedBounds, shapefile.SHP.SHxHeader.Bounds) + assert.Equal(t, tc.expectedNumRecords, shapefile.NumRecords()) + assert.Equal(t, tc.expectedGeom0, shapefile.SHP.Records[0].Geom) + + if tc.hasDBF { + assert.Equal(t, tc.expectedNumRecords, len(shapefile.DBF.Records)) + assert.Equal(t, tc.expectedDBFRecord0, shapefile.DBF.Records[0]) + } else { + assert.Nil(t, shapefile.DBF) + } + + if tc.hasPRJ { + assert.NotEmpty(t, shapefile.PRJ.Projection) + } else { + assert.Nil(t, shapefile.PRJ) + } + if tc.hasCPG { + assert.NotEmpty(t, shapefile.CPG.Charset) + } else { + assert.Nil(t, shapefile.CPG) + } + + if tc.hasSHX { + assert.Equal(t, tc.expectedShapeType, shapefile.SHX.SHxHeader.ShapeType) + assert.Equal(t, tc.expectedBounds, shapefile.SHX.SHxHeader.Bounds) + assert.Equal(t, tc.expectedNumRecords, len(shapefile.SHX.Records)) + } + }) + }) + } +} + +func TestReadScannerFSAndZipFile(t *testing.T) { + for _, tc := range []struct { + filename string + basename string + expectedShapeType ShapeType + expectedBounds *geom.Bounds + expectedRecordsLen int + expectedDBFRecord0Fields map[string]any + expectedSHPRecord0 *SHPRecord + expectedExport any + }{ + { + filename: "testdata/110m-admin-0-countries.zip", + basename: "ne_110m_admin_0_countries", + expectedShapeType: ShapeTypePolygon, + expectedBounds: geom.NewBounds(geom.XY).Set( + -179.99999999999997, -90.00000000000003, + 180.00000000000014, 83.64513000000001, + ), + expectedRecordsLen: 177, + expectedDBFRecord0Fields: map[string]any{ + "ABBREV": "Afg.", + "ADM0_A3": "AFG", + "ADM0_DIF": 0., + "ADMIN": "Afghanistan", + "FIPS_10_": 0., + "FeatureCla": "Admin-0 countries", + "GDP_MD_EST": 22270., + "GEOUNIT": "Afghanistan", + "GEOU_DIF": 0., + "GU_A3":
"AFG", + "ISO_A2": "AF", + "ISO_A3": "AFG", + "ISO_N3": 4., + "LEVEL": 2., + "LabelRank": 1, + "MAP_COLOR": 7., + "NAME": "Afghanistan", + "NAME_FORMA": "Islamic State of Afghanistan", + "NAME_SORT": "Afghanistan", + "POP_EST": 28400000., + "POSTAL": "AF", + "SOVEREIGNT": "Afghanistan", + "SOV_A3": "AFG", + "SUBUNIT": "Afghanistan", + "SU_A3": "AFG", + "SU_DIF": 0., + "ScaleRank": 1, + "TERR_": "", + "TYPE": "Sovereign country", + }, + expectedSHPRecord0: &SHPRecord{ + Number: 1, + ContentLength: 1152, + ShapeType: ShapeTypePolygon, + }, + }, + { + filename: "testdata/Luftfahrthindernisse.zip", + basename: "Luftfahrthindernisse", + expectedShapeType: ShapeTypePoint, + expectedBounds: geom.NewBounds(geom.XY).Set( + 13.580271133050555, 46.621281718756464, + 16.12994444409849, 47.78517335054476, + ), + expectedRecordsLen: 1097, + expectedDBFRecord0Fields: map[string]any{ + "Art": "Windkraftanlage", + "Befeuert": "N", + "Betreiber": "Viktor Kaplan Mürz GmbH", + "GZ": "FA18E-88-1082/2002-18", + "Hoehe_Fp": 1580., + "Hoehe_Obj": 100., + "LFH_ID": 2, + "Name": "Windkraftanlage Windpark Moschkogel WKA 04", + "OBJECTID": 191, + "POINT_X": 15.74447664, + "POINT_Y": 47.56136608, + "Protnr": 17829, + "Tagkennzg": "N", + "WGS_Breite": "47 33 41,0", + "WGS_Laenge": "15 44 40,0", + "changeDate": "20210222130000", + "changeUser": "", + "createDate": "20210222130000", + "createUser": "", + }, + expectedSHPRecord0: &SHPRecord{ + Number: 1, + ContentLength: 20, + ShapeType: ShapeTypePoint, + Geom: newGeomFromWKT(t, "POINT (15.744476635247011 47.56136608020768)"), + }, + expectedExport: struct { + Geometry geom.T `geom:"geometry"` + Art string `geom:"art"` + Befeuert string `geom:"befeuert"` + Betreiber string `geom:"betreiber"` + GZ string `geom:"gz"` + HoeheFp float32 `geom:"hoehe_fp"` + HoeheObj float32 `geom:"hoehe_obj"` + LfhID int `geom:"lfh_id"` + Name string `geom:"name"` + ObjectID int `geom:"objectid"` + PointX float64 `geom:"point_x"` + PointY float64 `geom:"point_y"` + Protnr int `geom:"protnr"` + Tagkennzg string `geom:"tagkennzg"` + WGSBreite string `geom:"wgs_breite"` + WGSLaenge string `geom:"wgs_laenge"` + ChangeDate string `geom:"change_date"` + ChangeUser string `geom:"change_user"` + CreateDate string `geom:"create_date"` + CreateUser string `geom:"create_user"` + }{ + Geometry: newGeomFromWKT(t, "POINT (15.744476635247011 47.56136608020768)"), + Art: "Windkraftanlage", + Befeuert: "N", + Betreiber: "Viktor Kaplan Mürz GmbH", + GZ: "FA18E-88-1082/2002-18", + HoeheFp: 1580., + HoeheObj: 100., + LfhID: 2, + Name: "Windkraftanlage Windpark Moschkogel WKA 04", + ObjectID: 191, + PointX: 15.74447664, + PointY: 47.56136608, + Protnr: 17829, + Tagkennzg: "N", + WGSBreite: "47 33 41,0", + WGSLaenge: "15 44 40,0", + ChangeDate: "20210222130000", + ChangeUser: "", + CreateDate: "20210222130000", + CreateUser: "", + }, + }, + { + filename: "testdata/SZ.exe", + basename: "sz", + expectedShapeType: ShapeTypePolygon, + expectedBounds: geom.NewBounds(geom.XY).Set( + 5.9661102294921875, 45.829437255859375, + 10.488912582397461, 47.806938171386720, + ), + expectedRecordsLen: 26, + expectedDBFRecord0Fields: map[string]any{ + "ADMIN_NAME": "Aargau", + "CNTRY_NAME": "Switzerland", + "COLOR_MAP": "6", + "CONTINENT": "Europe", + "FIPS_ADMIN": "SZ01", + "FIPS_CNTRY": "SZ", + "GMI_ADMIN": "CHE-AAR", + "GMI_CNTRY": "CHE", + "POP_ADMIN": 524648, + "REGION": "Western Europe", + "SQKM_ADMIN": 1441.17, + "SQMI_ADMIN": 556.436, + "TYPE_ENG": "Canton", + "TYPE_LOC": "Canton(French), Cantone(Italian), Kanton(German)", + 
}, + expectedSHPRecord0: &SHPRecord{ + Number: 1, + ContentLength: 1248, + ShapeType: ShapeTypePolygon, + }, + }, + } { + t.Run(tc.filename, func(t *testing.T) { + testShapefile := func(t *testing.T, shapefile *Shapefile) { + t.Helper() + assert.Equal(t, tc.expectedShapeType, shapefile.SHP.SHxHeader.ShapeType) + assert.Equal(t, tc.expectedBounds, shapefile.SHP.SHxHeader.Bounds) + assert.Equal(t, tc.expectedRecordsLen, shapefile.NumRecords()) + + if tc.expectedDBFRecord0Fields != nil { + fields, g := shapefile.Record(0) + assert.Equal(t, tc.expectedDBFRecord0Fields, fields) + if tc.expectedSHPRecord0.Geom != nil { + assert.Equal(t, tc.expectedSHPRecord0.Geom, g) + } + } + + if tc.expectedSHPRecord0 != nil { + shpRecord0 := shapefile.SHP.Records[0] + assert.Equal(t, tc.expectedSHPRecord0.Number, shpRecord0.Number) + assert.Equal(t, tc.expectedSHPRecord0.ContentLength, shpRecord0.ContentLength) + assert.Equal(t, tc.expectedSHPRecord0.ShapeType, shpRecord0.ShapeType) + if tc.expectedSHPRecord0.Geom != nil { + assert.Equal(t, tc.expectedSHPRecord0.Geom, shpRecord0.Geom) + } + } + } + + t.Run("ReadZipFile", func(t *testing.T) { + scanner, err := NewScannerFromZipFile(tc.filename, nil) + require.NoError(t, err) + assert.NotNil(t, scanner) + shapefile, err := ReadScanner(scanner) + require.NoError(t, err) + assert.NotNil(t, shapefile) + testShapefile(t, shapefile) + }) + }) + } +} diff --git a/shapefile.go b/shapefile.go index f7e120b..68b7aad 100644 --- a/shapefile.go +++ b/shapefile.go @@ -70,6 +70,7 @@ type Shapefile struct { DBF *DBF PRJ *PRJ + CPG *CPG SHP *SHP SHX *SHX } @@ -86,6 +87,24 @@ func Read(basename string, options *ReadShapefileOptions) (*Shapefile, error) { options = &ReadShapefileOptions{} } + var cpg *CPG + cpgFile, cpgSize, err := openWithSize(basename + ".cpg") + if cpgFile != nil { + defer cpgFile.Close() + } + switch { + case errors.Is(err, fs.ErrNotExist): + // Do nothing. + case err != nil: + return nil, fmt.Errorf("%s.cpg: %w", basename, err) + default: + var err error + cpg, err = ReadCPG(cpgFile, cpgSize) + if err != nil { + return nil, err + } + } + var dbf *DBF dbfFile, dbfSize, err := openWithSize(basename + ".dbf") if dbfFile != nil { @@ -98,7 +117,18 @@ func Read(basename string, options *ReadShapefileOptions) (*Shapefile, error) { return nil, fmt.Errorf("%s.dbf: %w", basename, err) default: var err error - dbf, err = ReadDBF(dbfFile, dbfSize, options.DBF) + var readDBFOptions *ReadDBFOptions + if options != nil { + readDBFOptions = options.DBF + } + if cpg != nil { + if readDBFOptions == nil { + readDBFOptions = &ReadDBFOptions{Charset: cpg.Charset} + } else { + readDBFOptions.Charset = cpg.Charset + } + } + dbf, err = ReadDBF(dbfFile, dbfSize, readDBFOptions) if err != nil { return nil, err } @@ -167,6 +197,7 @@ return &Shapefile{ DBF: dbf, PRJ: prj, + CPG: cpg, SHP: shp, SHX: shx, }, nil @@ -174,6 +205,24 @@ // ReadFS reads a Shapefile from fsys with the given basename. func ReadFS(fsys fs.FS, basename string, options *ReadShapefileOptions) (*Shapefile, error) { + var cpg *CPG + switch cpgFile, err := fsys.Open(basename + ".cpg"); { + case errors.Is(err, fs.ErrNotExist): + // Do nothing.
+ case err != nil: + return nil, err + default: + defer cpgFile.Close() + fileInfo, err := cpgFile.Stat() + if err != nil { + return nil, err + } + cpg, err = ReadCPG(cpgFile, fileInfo.Size()) + if err != nil { + return nil, fmt.Errorf("%s.cpg: %w", basename, err) + } + } + var dbf *DBF switch dbfFile, err := fsys.Open(basename + ".dbf"); { case errors.Is(err, fs.ErrNotExist): @@ -190,6 +239,13 @@ func ReadFS(fsys fs.FS, basename string, options *ReadShapefileOptions) (*Shapef if options != nil { readDBFOptions = options.DBF } + if cpg != nil { + if readDBFOptions == nil { + readDBFOptions = &ReadDBFOptions{Charset: cpg.Charset} + } else { + readDBFOptions.Charset = cpg.Charset + } + } dbf, err = ReadDBF(dbfFile, fileInfo.Size(), readDBFOptions) if err != nil { return nil, fmt.Errorf("%s.dbf: %w", basename, err) @@ -291,6 +347,7 @@ func ReadZipFile(name string, options *ReadShapefileOptions) (*Shapefile, error) func ReadZipReader(zipReader *zip.Reader, options *ReadShapefileOptions) (*Shapefile, error) { var dbfFiles []*zip.File var prjFiles []*zip.File + var cpgFiles []*zip.File var shxFiles []*zip.File var shpFiles []*zip.File for _, zipFile := range zipReader.File { @@ -299,12 +356,27 @@ func ReadZipReader(zipReader *zip.Reader, options *ReadShapefileOptions) (*Shape dbfFiles = append(dbfFiles, zipFile) case ".prj": prjFiles = append(prjFiles, zipFile) + case ".cpg": + cpgFiles = append(cpgFiles, zipFile) case ".shp": shpFiles = append(shpFiles, zipFile) case ".shx": shxFiles = append(shxFiles, zipFile) } } + var cpg *CPG + switch len(cpgFiles) { + case 0: + // Do nothing. + case 1: + var err error + cpg, err = ReadCPGZipFile(cpgFiles[0]) + if err != nil { + return nil, err + } + default: + return nil, errors.New("too many .cpg files") + } var dbf *DBF switch len(dbfFiles) { @@ -315,6 +387,13 @@ func ReadZipReader(zipReader *zip.Reader, options *ReadShapefileOptions) (*Shape if options != nil { readDBFOptions = options.DBF } + if cpg != nil { + if readDBFOptions == nil { + readDBFOptions = &ReadDBFOptions{Charset: cpg.Charset} + } else { + readDBFOptions.Charset = cpg.Charset + } + } var err error dbf, err = ReadDBFZipFile(dbfFiles[0], readDBFOptions) if err != nil { @@ -379,6 +458,7 @@ func ReadZipReader(zipReader *zip.Reader, options *ReadShapefileOptions) (*Shape return &Shapefile{ DBF: dbf, PRJ: prj, + CPG: cpg, SHP: shp, SHX: shx, }, nil diff --git a/shapefile_test.go b/shapefile_test.go index ffd7238..20f33a5 100644 --- a/shapefile_test.go +++ b/shapefile_test.go @@ -388,7 +388,7 @@ func TestReadFSAndZipFile(t *testing.T) { func addFuzzDataFromFS(f *testing.F, fsys fs.FS, root, ext string) error { f.Helper() - return fs.WalkDir(fsys, root, func(path string, dirEntry fs.DirEntry, err error) error { + return fs.WalkDir(fsys, root, func(path string, _ fs.DirEntry, err error) error { if err != nil { return err } diff --git a/shp_test.go b/shp_test.go index f8dba37..47ed2b9 100644 --- a/shp_test.go +++ b/shp_test.go @@ -11,7 +11,7 @@ import ( func FuzzReadSHP(f *testing.F) { require.NoError(f, addFuzzDataFromFS(f, os.DirFS("."), "testdata", ".shp")) - f.Fuzz(func(t *testing.T, data []byte) { + f.Fuzz(func(_ *testing.T, data []byte) { r := bytes.NewReader(data) _, _ = ReadSHP(r, int64(len(data)), &ReadSHPOptions{ MaxParts: 128, diff --git a/shx_test.go b/shx_test.go index 556719c..9dde4aa 100644 --- a/shx_test.go +++ b/shx_test.go @@ -11,7 +11,7 @@ import ( func FuzzReadSHX(f *testing.F) { require.NoError(f, addFuzzDataFromFS(f, os.DirFS("."), "testdata", ".shx")) - f.Fuzz(func(t 
*testing.T, data []byte) { + f.Fuzz(func(_ *testing.T, data []byte) { r := bytes.NewReader(data) _, _ = ReadSHX(r, int64(len(data))) })
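
Note on usage: the diff above adds a streaming Scanner API alongside the existing whole-file Read functions. The following is a minimal sketch of how the pieces fit together, not part of the diff itself; the module import path and the testdata filename are assumptions, and error handling is abbreviated.

package main

import (
	"errors"
	"fmt"
	"io"
	"log"

	shapefile "github.com/twpayne/go-shapefile" // assumed import path
)

func main() {
	// NewScannerFromZipFile opens the archive and wires up the .shp, .shx,
	// .dbf, .prj, and .cpg entries it finds inside.
	scanner, err := shapefile.NewScannerFromZipFile("testdata/110m-admin-0-countries.zip", nil)
	if err != nil {
		log.Fatal(err)
	}
	defer scanner.Close()

	// Charset is empty if the archive contains no .cpg entry.
	fmt.Println("charset:", scanner.Charset())

	// Scan reads one SHP, SHX, and DBF record per call, in parallel.
	// Next reports whether scanning may continue; Error returns io.EOF
	// once all records have been read.
	for scanner.Next() {
		shpRecord, _, dbfRecord := scanner.Scan()
		if shpRecord != nil {
			fmt.Println(shpRecord.Number, shpRecord.Geom, dbfRecord)
		}
	}
	if err := scanner.Error(); err != nil && !errors.Is(err, io.EOF) {
		log.Fatal(err)
	}
}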
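The new ReadDBFOptions.Charset field can also be set by hand when no .cpg sidecar exists, using any name accepted by golang.org/x/net/html/charset.Lookup. A sketch under the same import-path assumption as above; note that per the Read implementation in this diff, a .cpg file, when present, overwrites this option:

	shp, err := shapefile.Read("testdata/poly", &shapefile.ReadShapefileOptions{
		DBF: &shapefile.ReadDBFOptions{Charset: "windows-1252"}, // hypothetical charset, for illustration
	})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(shp.NumRecords())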