diff --git a/cmd/parquet-cli/cmd_col_stats.go b/cmd/parquet-cli/cmd_col_stats.go index 0cab373..1e22ca5 100644 --- a/cmd/parquet-cli/cmd_col_stats.go +++ b/cmd/parquet-cli/cmd_col_stats.go @@ -9,6 +9,7 @@ import ( type colStats struct { outputOptions + Verbose bool `short:"v" optional:"" help:"Print additional information"` File string `arg:""` Columns []int `short:"c" optional:"" help:"Restrict the output to the following columns"` } @@ -19,10 +20,10 @@ func (cs *colStats) Run() error { return err } - rowStats, err := inspect.NewColStatCalculator(file, cs.Columns) + stats, err := inspect.NewColStatCalculator(file, cs.Columns, cs.Verbose) if err != nil { return err } - return output.PrintTable(os.Stdout, cs.Output, rowStats) + return output.Print(os.Stdout, stats, &output.PrintOptions{Format: cs.Output}) } diff --git a/pkg/inspect/col_stats.go b/pkg/inspect/col_stats.go index 17faad4..6279a45 100644 --- a/pkg/inspect/col_stats.go +++ b/pkg/inspect/col_stats.go @@ -11,6 +11,18 @@ import ( var ( columnStatHeader = [...]any{ + "Index", + "Name", + "Max Def", + "Max Rep", + "Size", + "Compressed size", + "Pages", + "Rows", + "Values", + "Nulls", + } + columnStatHeaderFull = [...]any{ "Index", "Name", "Max Def", @@ -33,53 +45,63 @@ var ( type ColumnStats struct { Index int `json:"index"` Name string `json:"name"` - MaxDef int `json:"maxDef"` - MaxRep int `json:"maxRep"` + MaxDef int `json:"max_def"` + MaxRep int `json:"max_rep"` Size int64 `json:"size"` - CompressedSize int64 `json:"compressedSize"` + CompressedSize int64 `json:"compressed_size"` Pages int `json:"pages"` Rows int64 `json:"rows"` - PageMinRows int64 `json:"pageMinRows"` - PageMaxRows int64 `json:"pageMaxRows"` Values int64 `json:"values"` - PageMinValues int64 `json:"pageMinValues"` - PageMaxValues int64 `json:"pageMaxValues"` Nulls int64 `json:"nulls"` - PageMinNulls int64 `json:"pageMinNulls"` - PageMaxNulls int64 `json:"pageMaxNulls"` +} - cells []any +func (rs *ColumnStats) Cells() []any { + return []any{ + rs.Index, + rs.Name, + rs.MaxDef, + rs.MaxRep, + rs.Size, + rs.CompressedSize, + rs.Pages, + rs.Rows, + rs.Values, + rs.Nulls, + } } -func (rs *ColumnStats) SerializableData() any { - return rs +type ColumnStatsFull struct { + ColumnStats + PageMinRows int64 `json:"page_min_rows"` + PageMaxRows int64 `json:"page_max_rows"` + PageMinValues int64 `json:"page_min_values"` + PageMaxValues int64 `json:"page_max_values"` + PageMinNulls int64 `json:"page_min_nulls"` + PageMaxNulls int64 `json:"page_max_nulls"` } -func (rs *ColumnStats) Cells() []any { - if rs.cells == nil { - rs.cells = []any{ - rs.Index, - rs.Name, - rs.MaxDef, - rs.MaxRep, - rs.Size, - rs.CompressedSize, - rs.Pages, - rs.Rows, - rs.PageMinRows, - rs.PageMaxRows, - rs.Values, - rs.PageMinValues, - rs.PageMaxValues, - rs.Nulls, - rs.PageMinNulls, - rs.PageMaxNulls, - } +func (rs *ColumnStatsFull) Cells() []any { + return []any{ + rs.Index, + rs.Name, + rs.MaxDef, + rs.MaxRep, + rs.Size, + rs.CompressedSize, + rs.Pages, + rs.Rows, + rs.PageMinRows, + rs.PageMaxRows, + rs.Values, + rs.PageMinValues, + rs.PageMaxValues, + rs.Nulls, + rs.PageMinNulls, + rs.PageMaxNulls, } - return rs.cells } -func NewColStatCalculator(file *parquet.File, selectedCols []int) (*ColStatCalculator, error) { +func NewColStatCalculator(file *parquet.File, selectedCols []int, verbose bool) (*ColStatCalculator, error) { all := LeafColumns(file) var columns []*parquet.Column @@ -95,16 +117,20 @@ func NewColStatCalculator(file *parquet.File, selectedCols []int) (*ColStatCalcu } } - return &ColStatCalculator{file: file, columns: columns}, nil + return &ColStatCalculator{file: file, columns: columns, verbose: verbose}, nil } type ColStatCalculator struct { file *parquet.File + verbose bool columns []*parquet.Column current int } func (cc *ColStatCalculator) Header() []any { + if cc.verbose { + return columnStatHeaderFull[:] + } return columnStatHeader[:] } @@ -115,11 +141,13 @@ func (cc *ColStatCalculator) NextRow() (output.TableRow, error) { col := cc.columns[cc.current] cc.current++ - stats := ColumnStats{ - Index: col.Index(), - Name: col.Name(), - MaxDef: col.MaxDefinitionLevel(), - MaxRep: col.MaxRepetitionLevel(), + stats := ColumnStatsFull{ + ColumnStats: ColumnStats{ + Index: col.Index(), + Name: col.Name(), + MaxDef: col.MaxDefinitionLevel(), + MaxRep: col.MaxRepetitionLevel(), + }, } for _, rg := range cc.file.RowGroups() { @@ -141,20 +169,30 @@ func (cc *ColStatCalculator) NextRow() (output.TableRow, error) { stats.Pages++ stats.Size += page.Size() stats.Rows += page.NumRows() - stats.PageMinRows = min(stats.PageMinRows, page.NumRows()) - stats.PageMaxRows = max(stats.PageMaxRows, page.NumRows()) stats.Values += page.NumValues() - stats.PageMinValues = min(stats.PageMinValues, page.NumRows()) - stats.PageMaxValues = max(stats.PageMaxValues, page.NumRows()) stats.Nulls += page.NumNulls() + stats.PageMinNulls = min(stats.PageMinNulls, page.NumNulls()) stats.PageMaxNulls = max(stats.PageMaxNulls, page.NumNulls()) + stats.PageMinValues = min(stats.PageMinValues, page.NumRows()) + stats.PageMaxValues = max(stats.PageMaxValues, page.NumRows()) + stats.PageMinRows = min(stats.PageMinRows, page.NumRows()) + stats.PageMaxRows = max(stats.PageMaxRows, page.NumRows()) + page, err = pages.ReadPage() } + if !errors.Is(err, io.EOF) { return nil, fmt.Errorf("unable to read page rom column '%s': %w", col.Name(), err) } } - return &stats, nil + if cc.verbose { + return &stats, nil + } + return &stats.ColumnStats, nil +} + +func (cc *ColStatCalculator) NextSerializable() (any, error) { + return cc.NextRow() } diff --git a/pkg/inspect/schema.go b/pkg/inspect/schema.go index 7eeb7ba..ec9c63c 100644 --- a/pkg/inspect/schema.go +++ b/pkg/inspect/schema.go @@ -10,7 +10,7 @@ import ( "github.com/parquet-go/parquet-go" ) -var headers = []any{ +var schemaHeader = [...]any{ "Index", "Name", "Optional", @@ -51,7 +51,7 @@ func (s *Schema) Text() (string, error) { } func (s *Schema) Header() []any { - return headers + return schemaHeader[:] } func (s *Schema) NextRow() (output.TableRow, error) {