Skip to content

Commit

Permalink
Improve col-stats sub-command
Browse files Browse the repository at this point in the history
  • Loading branch information
stoewer committed May 6, 2024
1 parent 049330c commit 41674bd
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 49 deletions.
5 changes: 3 additions & 2 deletions cmd/parquet-cli/cmd_col_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (

// colStats holds the command-line options of the column statistics
// sub-command; its Run method computes and prints the statistics.
type colStats struct {
// Embedded output options; Run reads the Output format from here.
outputOptions
// Verbose is forwarded to inspect.NewColStatCalculator (short flag -v).
Verbose bool `short:"v" optional:"" help:"Print additional information"`
// File is a positional argument (arg tag).
// NOTE(review): presumably the path of the parquet file to inspect — confirm against Run.
File string `arg:""`
// Columns is forwarded to inspect.NewColStatCalculator to limit which
// columns are reported (short flag -c).
Columns []int `short:"c" optional:"" help:"Restrict the output to the following columns"`
}
Expand All @@ -19,10 +20,10 @@ func (cs *colStats) Run() error {
return err
}

rowStats, err := inspect.NewColStatCalculator(file, cs.Columns)
stats, err := inspect.NewColStatCalculator(file, cs.Columns, cs.Verbose)
if err != nil {
return err
}

return output.PrintTable(os.Stdout, cs.Output, rowStats)
return output.Print(os.Stdout, stats, &output.PrintOptions{Format: cs.Output})
}
128 changes: 83 additions & 45 deletions pkg/inspect/col_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,18 @@ import (

var (
columnStatHeader = [...]any{
"Index",
"Name",
"Max Def",
"Max Rep",
"Size",
"Compressed size",
"Pages",
"Rows",
"Values",
"Nulls",
}
columnStatHeaderFull = [...]any{
"Index",
"Name",
"Max Def",
Expand All @@ -33,53 +45,63 @@ var (
type ColumnStats struct {
Index int `json:"index"`
Name string `json:"name"`
MaxDef int `json:"maxDef"`
MaxRep int `json:"maxRep"`
MaxDef int `json:"max_def"`
MaxRep int `json:"max_rep"`
Size int64 `json:"size"`
CompressedSize int64 `json:"compressedSize"`
CompressedSize int64 `json:"compressed_size"`
Pages int `json:"pages"`
Rows int64 `json:"rows"`
PageMinRows int64 `json:"pageMinRows"`
PageMaxRows int64 `json:"pageMaxRows"`
Values int64 `json:"values"`
PageMinValues int64 `json:"pageMinValues"`
PageMaxValues int64 `json:"pageMaxValues"`
Nulls int64 `json:"nulls"`
PageMinNulls int64 `json:"pageMinNulls"`
PageMaxNulls int64 `json:"pageMaxNulls"`
}

cells []any
// Cells returns the summary values of the column as one table row. The
// order matches columnStatHeader: index, name, max definition level, max
// repetition level, size, compressed size, pages, rows, values, nulls.
func (rs *ColumnStats) Cells() []any {
	row := make([]any, 0, 10)
	// Identity of the column.
	row = append(row, rs.Index, rs.Name)
	// Nesting levels.
	row = append(row, rs.MaxDef, rs.MaxRep)
	// Byte sizes.
	row = append(row, rs.Size, rs.CompressedSize)
	// Totals accumulated over all pages.
	row = append(row, rs.Pages, rs.Rows, rs.Values, rs.Nulls)
	return row
}

func (rs *ColumnStats) SerializableData() any {
return rs
// ColumnStatsFull extends ColumnStats with per-page minimum and maximum
// counts. NextRow returns a pointer to this type when the calculator is in
// verbose mode, and a pointer to the embedded ColumnStats otherwise.
type ColumnStatsFull struct {
// Embedded summary statistics; its fields are promoted onto this struct.
ColumnStats
// Per-page minimum / maximum, updated in NextRow from page.NumRows().
PageMinRows int64 `json:"page_min_rows"`
PageMaxRows int64 `json:"page_max_rows"`
// NOTE(review): NextRow also updates these two from page.NumRows() —
// confirm whether page.NumValues() was intended.
PageMinValues int64 `json:"page_min_values"`
PageMaxValues int64 `json:"page_max_values"`
// Per-page minimum / maximum, updated in NextRow from page.NumNulls().
// NOTE(review): the PageMin* fields look like they start at zero and are
// only lowered via min(...), which would keep them at 0 — verify in NextRow.
PageMinNulls int64 `json:"page_min_nulls"`
PageMaxNulls int64 `json:"page_max_nulls"`
}

func (rs *ColumnStats) Cells() []any {
if rs.cells == nil {
rs.cells = []any{
rs.Index,
rs.Name,
rs.MaxDef,
rs.MaxRep,
rs.Size,
rs.CompressedSize,
rs.Pages,
rs.Rows,
rs.PageMinRows,
rs.PageMaxRows,
rs.Values,
rs.PageMinValues,
rs.PageMaxValues,
rs.Nulls,
rs.PageMinNulls,
rs.PageMaxNulls,
}
func (rs *ColumnStatsFull) Cells() []any {
return []any{
rs.Index,
rs.Name,
rs.MaxDef,
rs.MaxRep,
rs.Size,
rs.CompressedSize,
rs.Pages,
rs.Rows,
rs.PageMinRows,
rs.PageMaxRows,
rs.Values,
rs.PageMinValues,
rs.PageMaxValues,
rs.Nulls,
rs.PageMinNulls,
rs.PageMaxNulls,
}
return rs.cells
}

func NewColStatCalculator(file *parquet.File, selectedCols []int) (*ColStatCalculator, error) {
func NewColStatCalculator(file *parquet.File, selectedCols []int, verbose bool) (*ColStatCalculator, error) {
all := LeafColumns(file)
var columns []*parquet.Column

Expand All @@ -95,16 +117,20 @@ func NewColStatCalculator(file *parquet.File, selectedCols []int) (*ColStatCalcu
}
}

return &ColStatCalculator{file: file, columns: columns}, nil
return &ColStatCalculator{file: file, columns: columns, verbose: verbose}, nil
}

// ColStatCalculator yields one statistics row per selected column of a
// parquet file. Instances are created by NewColStatCalculator.
type ColStatCalculator struct {
// file is the parquet file whose row groups NextRow iterates.
file *parquet.File
// verbose selects the full header and the ColumnStatsFull row type.
verbose bool
// columns are the columns to report, one row each.
columns []*parquet.Column
// current is the index into columns of the next column to process;
// NextRow increments it on each call.
current int
}

// Header returns the table header for the rows produced by this calculator:
// the full header in verbose mode, the short one otherwise. The result is a
// slice over the package-level header array, not a copy.
func (cc *ColStatCalculator) Header() []any {
	header := columnStatHeader[:]
	if cc.verbose {
		header = columnStatHeaderFull[:]
	}
	return header
}

Expand All @@ -115,11 +141,13 @@ func (cc *ColStatCalculator) NextRow() (output.TableRow, error) {

col := cc.columns[cc.current]
cc.current++
stats := ColumnStats{
Index: col.Index(),
Name: col.Name(),
MaxDef: col.MaxDefinitionLevel(),
MaxRep: col.MaxRepetitionLevel(),
stats := ColumnStatsFull{
ColumnStats: ColumnStats{
Index: col.Index(),
Name: col.Name(),
MaxDef: col.MaxDefinitionLevel(),
MaxRep: col.MaxRepetitionLevel(),
},
}

for _, rg := range cc.file.RowGroups() {
Expand All @@ -141,20 +169,30 @@ func (cc *ColStatCalculator) NextRow() (output.TableRow, error) {
stats.Pages++
stats.Size += page.Size()
stats.Rows += page.NumRows()
stats.PageMinRows = min(stats.PageMinRows, page.NumRows())
stats.PageMaxRows = max(stats.PageMaxRows, page.NumRows())
stats.Values += page.NumValues()
stats.PageMinValues = min(stats.PageMinValues, page.NumRows())
stats.PageMaxValues = max(stats.PageMaxValues, page.NumRows())
stats.Nulls += page.NumNulls()

stats.PageMinNulls = min(stats.PageMinNulls, page.NumNulls())
stats.PageMaxNulls = max(stats.PageMaxNulls, page.NumNulls())
stats.PageMinValues = min(stats.PageMinValues, page.NumRows())
stats.PageMaxValues = max(stats.PageMaxValues, page.NumRows())
stats.PageMinRows = min(stats.PageMinRows, page.NumRows())
stats.PageMaxRows = max(stats.PageMaxRows, page.NumRows())

page, err = pages.ReadPage()
}

if !errors.Is(err, io.EOF) {
return nil, fmt.Errorf("unable to read page rom column '%s': %w", col.Name(), err)
}
}

return &stats, nil
if cc.verbose {
return &stats, nil
}
return &stats.ColumnStats, nil
}

// NextSerializable returns the next statistics row for serialized output.
// It delegates to NextRow and passes its row and error through unchanged.
func (cc *ColStatCalculator) NextSerializable() (any, error) {
	row, err := cc.NextRow()
	return row, err
}
4 changes: 2 additions & 2 deletions pkg/inspect/schema.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import (
"github.com/parquet-go/parquet-go"
)

var headers = []any{
var schemaHeader = [...]any{
"Index",
"Name",
"Optional",
Expand Down Expand Up @@ -51,7 +51,7 @@ func (s *Schema) Text() (string, error) {
}

func (s *Schema) Header() []any {
return headers
return schemaHeader[:]
}

func (s *Schema) NextRow() (output.TableRow, error) {
Expand Down

0 comments on commit 41674bd

Please sign in to comment.