Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions arrow/array/binary.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,55 @@ func (a *Binary) MarshalJSON() ([]byte, error) {
return json.Marshal(vals)
}

// Validate performs a basic, O(1) consistency check on the array data.
// It returns an error if:
//   - The array is non-empty but has no offsets buffer
//   - The offset buffer is too small for the array length and offset
//   - The last offset is negative
//   - The last offset exceeds the data buffer length
//
// Only the last offset of the array's window is inspected; use ValidateFull
// to check every offset.
//
// This is useful for detecting corrupted data from untrusted sources (e.g.
// Arrow Flight / Flight SQL servers) before accessing values, which may
// otherwise cause a runtime panic.
func (a *Binary) Validate() error {
	if a.data.length == 0 {
		return nil
	}
	if a.data.buffers[1] == nil {
		return fmt.Errorf("arrow/array: non-empty binary array has no offsets buffer")
	}
	expNumOffsets := a.data.offset + a.data.length + 1
	if len(a.valueOffsets) < expNumOffsets {
		return fmt.Errorf("arrow/array: binary offset buffer must have at least %d values, got %d", expNumOffsets, len(a.valueOffsets))
	}
	lastIdx := expNumOffsets - 1
	lastOffset := int(a.valueOffsets[lastIdx])
	// A negative end offset would pass the upper-bound check below yet still
	// produce an invalid slice expression when values are read.
	if lastOffset < 0 {
		return fmt.Errorf("arrow/array: binary offset at index %d is negative: %d", lastIdx, lastOffset)
	}
	if lastOffset > len(a.valueBytes) {
		return fmt.Errorf("arrow/array: binary offset %d out of bounds of data buffer (length %d)", lastOffset, len(a.valueBytes))
	}
	return nil
}

// ValidateFull performs a full O(n) consistency check on the array data.
// It first runs Validate, then walks every offset in the array's window and
// reports an error if the first offset is negative or if any offset is
// smaller than the one before it.
func (a *Binary) ValidateFull() error {
	// Propagate a Validate failure; an empty array has no offsets to walk, in
	// which case err is nil here.
	if err := a.Validate(); err != nil || a.data.length == 0 {
		return err
	}

	base := a.data.offset
	window := a.valueOffsets[base : base+a.data.length+1]

	prev := window[0]
	if prev < 0 {
		return fmt.Errorf("arrow/array: binary offset at index %d is negative: %d", base, prev)
	}
	for i, cur := range window[1:] {
		if cur < prev {
			// i indexes window[1:], so the absolute position is base+i+1.
			return fmt.Errorf("arrow/array: binary offsets are not monotonically non-decreasing at index %d: %d < %d",
				base+i+1, cur, prev)
		}
		prev = cur
	}
	return nil
}

func arrayEqualBinary(left, right *Binary) bool {
for i := 0; i < left.Len(); i++ {
if left.IsNull(i) {
Expand Down Expand Up @@ -309,6 +358,55 @@ func (a *LargeBinary) MarshalJSON() ([]byte, error) {
return json.Marshal(vals)
}

// Validate performs a basic, O(1) consistency check on the array data.
// It returns an error if:
//   - The array is non-empty but has no offsets buffer
//   - The offset buffer is too small for the array length and offset
//   - The last offset is negative
//   - The last offset exceeds the data buffer length
//
// Only the last offset of the array's window is inspected; use ValidateFull
// to check every offset.
//
// This is useful for detecting corrupted data from untrusted sources (e.g.
// Arrow Flight / Flight SQL servers) before accessing values, which may
// otherwise cause a runtime panic.
func (a *LargeBinary) Validate() error {
	if a.data.length == 0 {
		return nil
	}
	if a.data.buffers[1] == nil {
		return fmt.Errorf("arrow/array: non-empty large binary array has no offsets buffer")
	}
	expNumOffsets := a.data.offset + a.data.length + 1
	if len(a.valueOffsets) < expNumOffsets {
		return fmt.Errorf("arrow/array: large binary offset buffer must have at least %d values, got %d", expNumOffsets, len(a.valueOffsets))
	}
	lastIdx := expNumOffsets - 1
	// Keep the offset as int64: converting to int first could truncate a
	// large value on 32-bit platforms and let it slip past the bounds check.
	lastOffset := int64(a.valueOffsets[lastIdx])
	// A negative end offset would pass the upper-bound check below yet still
	// produce an invalid slice expression when values are read.
	if lastOffset < 0 {
		return fmt.Errorf("arrow/array: large binary offset at index %d is negative: %d", lastIdx, lastOffset)
	}
	if lastOffset > int64(len(a.valueBytes)) {
		return fmt.Errorf("arrow/array: large binary offset %d out of bounds of data buffer (length %d)", lastOffset, len(a.valueBytes))
	}
	return nil
}

// ValidateFull performs a full O(n) consistency check on the array data.
// It first runs Validate, then walks every offset in the array's window and
// reports an error if the first offset is negative or if any offset is
// smaller than the one before it.
func (a *LargeBinary) ValidateFull() error {
	// Propagate a Validate failure; an empty array has no offsets to walk, in
	// which case err is nil here.
	if err := a.Validate(); err != nil || a.data.length == 0 {
		return err
	}

	base := a.data.offset
	window := a.valueOffsets[base : base+a.data.length+1]

	prev := window[0]
	if prev < 0 {
		return fmt.Errorf("arrow/array: large binary offset at index %d is negative: %d", base, prev)
	}
	for i, cur := range window[1:] {
		if cur < prev {
			// i indexes window[1:], so the absolute position is base+i+1.
			return fmt.Errorf("arrow/array: large binary offsets are not monotonically non-decreasing at index %d: %d < %d",
				base+i+1, cur, prev)
		}
		prev = cur
	}
	return nil
}

func arrayEqualLargeBinary(left, right *LargeBinary) bool {
for i := 0; i < left.Len(); i++ {
if left.IsNull(i) {
Expand Down
98 changes: 98 additions & 0 deletions arrow/array/string.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,55 @@ func (a *String) MarshalJSON() ([]byte, error) {
return json.Marshal(vals)
}

// Validate performs a basic, O(1) consistency check on the array data.
// It returns an error if:
//   - The array is non-empty but has no offsets buffer
//   - The offset buffer is too small for the array length and offset
//   - The last offset is negative
//   - The last offset exceeds the data buffer length
//
// Only the last offset of the array's window is inspected; use ValidateFull
// to check every offset.
//
// This is useful for detecting corrupted data from untrusted sources (e.g.
// Arrow Flight / Flight SQL servers) before accessing values, which may
// otherwise cause a runtime panic.
func (a *String) Validate() error {
	if a.data.length == 0 {
		return nil
	}
	if a.data.buffers[1] == nil {
		return fmt.Errorf("arrow/array: non-empty string array has no offsets buffer")
	}
	expNumOffsets := a.data.offset + a.data.length + 1
	if len(a.offsets) < expNumOffsets {
		return fmt.Errorf("arrow/array: string offset buffer must have at least %d values, got %d", expNumOffsets, len(a.offsets))
	}
	lastIdx := expNumOffsets - 1
	lastOffset := int(a.offsets[lastIdx])
	// A negative end offset would pass the upper-bound check below yet still
	// produce an invalid slice expression when values are read.
	if lastOffset < 0 {
		return fmt.Errorf("arrow/array: string offset at index %d is negative: %d", lastIdx, lastOffset)
	}
	if lastOffset > len(a.values) {
		return fmt.Errorf("arrow/array: string offset %d out of bounds of data buffer (length %d)", lastOffset, len(a.values))
	}
	return nil
}

// ValidateFull performs a full O(n) consistency check on the array data.
// It first runs Validate, then walks every offset in the array's window and
// reports an error if the first offset is negative or if any offset is
// smaller than the one before it.
func (a *String) ValidateFull() error {
	// Propagate a Validate failure; an empty array has no offsets to walk, in
	// which case err is nil here.
	if err := a.Validate(); err != nil || a.data.length == 0 {
		return err
	}

	base := a.data.offset
	window := a.offsets[base : base+a.data.length+1]

	prev := window[0]
	if prev < 0 {
		return fmt.Errorf("arrow/array: string offset at index %d is negative: %d", base, prev)
	}
	for i, cur := range window[1:] {
		if cur < prev {
			// i indexes window[1:], so the absolute position is base+i+1.
			return fmt.Errorf("arrow/array: string offsets are not monotonically non-decreasing at index %d: %d < %d",
				base+i+1, cur, prev)
		}
		prev = cur
	}
	return nil
}

func arrayEqualString(left, right *String) bool {
for i := 0; i < left.Len(); i++ {
if left.IsNull(i) {
Expand Down Expand Up @@ -312,6 +361,55 @@ func (a *LargeString) MarshalJSON() ([]byte, error) {
return json.Marshal(vals)
}

// Validate performs a basic, O(1) consistency check on the array data.
// It returns an error if:
//   - The array is non-empty but has no offsets buffer
//   - The offset buffer is too small for the array length and offset
//   - The last offset is negative
//   - The last offset exceeds the data buffer length
//
// Only the last offset of the array's window is inspected; use ValidateFull
// to check every offset.
//
// This is useful for detecting corrupted data from untrusted sources (e.g.
// Arrow Flight / Flight SQL servers) before accessing values, which may
// otherwise cause a runtime panic.
func (a *LargeString) Validate() error {
	if a.data.length == 0 {
		return nil
	}
	if a.data.buffers[1] == nil {
		return fmt.Errorf("arrow/array: non-empty large string array has no offsets buffer")
	}
	expNumOffsets := a.data.offset + a.data.length + 1
	if len(a.offsets) < expNumOffsets {
		return fmt.Errorf("arrow/array: large string offset buffer must have at least %d values, got %d", expNumOffsets, len(a.offsets))
	}
	lastIdx := expNumOffsets - 1
	// Keep the offset as int64: converting to int first could truncate a
	// large value on 32-bit platforms and let it slip past the bounds check.
	lastOffset := int64(a.offsets[lastIdx])
	// A negative end offset would pass the upper-bound check below yet still
	// produce an invalid slice expression when values are read.
	if lastOffset < 0 {
		return fmt.Errorf("arrow/array: large string offset at index %d is negative: %d", lastIdx, lastOffset)
	}
	if lastOffset > int64(len(a.values)) {
		return fmt.Errorf("arrow/array: large string offset %d out of bounds of data buffer (length %d)", lastOffset, len(a.values))
	}
	return nil
}

// ValidateFull performs a full O(n) consistency check on the array data.
// It first runs Validate, then walks every offset in the array's window and
// reports an error if the first offset is negative or if any offset is
// smaller than the one before it.
func (a *LargeString) ValidateFull() error {
	// Propagate a Validate failure; an empty array has no offsets to walk, in
	// which case err is nil here.
	if err := a.Validate(); err != nil || a.data.length == 0 {
		return err
	}

	base := a.data.offset
	window := a.offsets[base : base+a.data.length+1]

	prev := window[0]
	if prev < 0 {
		return fmt.Errorf("arrow/array: large string offset at index %d is negative: %d", base, prev)
	}
	for i, cur := range window[1:] {
		if cur < prev {
			// i indexes window[1:], so the absolute position is base+i+1.
			return fmt.Errorf("arrow/array: large string offsets are not monotonically non-decreasing at index %d: %d < %d",
				base+i+1, cur, prev)
		}
		prev = cur
	}
	return nil
}

func arrayEqualLargeString(left, right *LargeString) bool {
for i := 0; i < left.Len(); i++ {
if left.IsNull(i) {
Expand Down
82 changes: 82 additions & 0 deletions arrow/array/validate.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package array

import (
"fmt"

"github.com/apache/arrow-go/v18/arrow"
)

// Validator is an arrow.Array that can additionally check its own internal
// consistency. The package-level Validate and ValidateFull functions dispatch
// to these methods when an array implements this interface.
type Validator interface {
	arrow.Array

	// Validate performs a basic O(1) consistency check and returns a
	// non-nil error when an inconsistency is found.
	Validate() error

	// ValidateFull performs a thorough O(n) consistency check and returns a
	// non-nil error when an inconsistency is found.
	ValidateFull() error
}

// Validate runs the basic O(1) consistency check of arr and returns its
// result. Arrays whose type does not implement Validator are not checked and
// yield nil.
//
// Use this to detect corrupted data from untrusted sources such as Arrow Flight
// or Flight SQL servers before accessing values, which may otherwise panic.
func Validate(arr arrow.Array) error {
	checker, implemented := arr.(Validator)
	if !implemented {
		// Nothing to verify for array types without validation support.
		return nil
	}
	return checker.Validate()
}

// ValidateFull runs the thorough O(n) consistency check of arr and returns
// its result. Arrays whose type does not implement Validator are not checked
// and yield nil.
//
// Unlike Validate, this checks every element and is therefore O(n). Use this
// when receiving data from untrusted sources where subtle corruption (e.g.
// non-monotonic offsets) may not be detected by Validate alone.
func ValidateFull(arr arrow.Array) error {
	checker, implemented := arr.(Validator)
	if !implemented {
		// Nothing to verify for array types without validation support.
		return nil
	}
	return checker.ValidateFull()
}

// ValidateRecord applies Validate to the columns of rec in index order and
// stops at the first failure. The returned error wraps the column's error and
// is prefixed with the column index and the schema field name.
func ValidateRecord(rec arrow.RecordBatch) error {
	for col := 0; int64(col) < rec.NumCols(); col++ {
		err := Validate(rec.Column(col))
		if err == nil {
			continue
		}
		fieldName := rec.Schema().Field(col).Name
		return fmt.Errorf("column %d (%s): %w", col, fieldName, err)
	}
	return nil
}

// ValidateRecordFull applies ValidateFull to the columns of rec in index
// order and stops at the first failure. The returned error wraps the column's
// error and is prefixed with the column index and the schema field name.
func ValidateRecordFull(rec arrow.RecordBatch) error {
	for col := 0; int64(col) < rec.NumCols(); col++ {
		err := ValidateFull(rec.Column(col))
		if err == nil {
			continue
		}
		fieldName := rec.Schema().Field(col).Name
		return fmt.Errorf("column %d (%s): %w", col, fieldName, err)
	}
	return nil
}
Loading