Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 92 additions & 1 deletion arrow/array/dictionary.go
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,8 @@ func arrayApproxEqualDict(l, r *Dictionary, opt equalOption) bool {
// helper for building the properly typed indices of the dictionary builder
type IndexBuilder struct {
Builder
Append func(int)
Append func(int)
UnsafeAppend func(int)
}

func createIndexBuilder(mem memory.Allocator, dt arrow.FixedWidthDataType) (ret IndexBuilder, err error) {
Expand All @@ -324,34 +325,58 @@ func createIndexBuilder(mem memory.Allocator, dt arrow.FixedWidthDataType) (ret
ret.Append = func(idx int) {
ret.Builder.(*Int8Builder).Append(int8(idx))
}
ret.UnsafeAppend = func(idx int) {
ret.Builder.(*Int8Builder).UnsafeAppend(int8(idx))
}
case arrow.UINT8:
ret.Append = func(idx int) {
ret.Builder.(*Uint8Builder).Append(uint8(idx))
}
ret.UnsafeAppend = func(idx int) {
ret.Builder.(*Uint8Builder).UnsafeAppend(uint8(idx))
}
case arrow.INT16:
ret.Append = func(idx int) {
ret.Builder.(*Int16Builder).Append(int16(idx))
}
ret.UnsafeAppend = func(idx int) {
ret.Builder.(*Int16Builder).UnsafeAppend(int16(idx))
}
case arrow.UINT16:
ret.Append = func(idx int) {
ret.Builder.(*Uint16Builder).Append(uint16(idx))
}
ret.UnsafeAppend = func(idx int) {
ret.Builder.(*Uint16Builder).UnsafeAppend(uint16(idx))
}
case arrow.INT32:
ret.Append = func(idx int) {
ret.Builder.(*Int32Builder).Append(int32(idx))
}
ret.UnsafeAppend = func(idx int) {
ret.Builder.(*Int32Builder).UnsafeAppend(int32(idx))
}
case arrow.UINT32:
ret.Append = func(idx int) {
ret.Builder.(*Uint32Builder).Append(uint32(idx))
}
ret.UnsafeAppend = func(idx int) {
ret.Builder.(*Uint32Builder).UnsafeAppend(uint32(idx))
}
case arrow.INT64:
ret.Append = func(idx int) {
ret.Builder.(*Int64Builder).Append(int64(idx))
}
ret.UnsafeAppend = func(idx int) {
ret.Builder.(*Int64Builder).UnsafeAppend(int64(idx))
}
case arrow.UINT64:
ret.Append = func(idx int) {
ret.Builder.(*Uint64Builder).Append(uint64(idx))
}
ret.UnsafeAppend = func(idx int) {
ret.Builder.(*Uint64Builder).UnsafeAppend(uint64(idx))
}
default:
debug.Assert(false, "dictionary index type must be integral")
err = fmt.Errorf("dictionary index type must be integral, not %s", dt)
Expand Down Expand Up @@ -646,6 +671,14 @@ func (b *dictionaryBuilder) AppendEmptyValues(n int) {
}
}

func (b *dictionaryBuilder) UnsafeAppendBoolToBitmap(v bool) {
if !v {
b.nulls += 1
}
b.length += 1
b.idxBuilder.UnsafeAppendBoolToBitmap(v)
}

func (b *dictionaryBuilder) Reserve(n int) {
b.idxBuilder.Reserve(n)
}
Expand Down Expand Up @@ -781,6 +814,13 @@ func (b *dictionaryBuilder) insertDictBytes(val []byte) error {
return err
}

func (b *dictionaryBuilder) unsafeAppendValue(val interface{}) error {
idx, _, err := b.memoTable.GetOrInsert(val)
b.idxBuilder.UnsafeAppend(idx)
b.length += 1
return err
}

func (b *dictionaryBuilder) appendValue(val interface{}) error {
idx, _, err := b.memoTable.GetOrInsert(val)
b.idxBuilder.Append(idx)
Expand Down Expand Up @@ -990,7 +1030,39 @@ type dictBuilder[T arrow.ValueType] struct {
dictionaryBuilder
}

func (b *dictBuilder[T]) UnsafeAppend(v T) {
// SAFETY: it is safe to ignore the value returned by the calls to `unsafeAppendValue()`
// here since `UnsafeAppend()` is statically typed and the only case in which that method
// errors is when trying to insert an invalid `interface{}` into the `memoTable`.
var err error
switch val := any(v).(type) {
case arrow.Duration:
err = b.unsafeAppendValue(int64(val))
case arrow.Timestamp:
err = b.unsafeAppendValue(int64(val))
case arrow.Time32:
err = b.unsafeAppendValue(int32(val))
case arrow.Time64:
err = b.unsafeAppendValue(int64(val))
case arrow.Date32:
err = b.unsafeAppendValue(int32(val))
case arrow.Date64:
err = b.unsafeAppendValue(int64(val))
case arrow.MonthInterval:
err = b.unsafeAppendValue(int32(val))
default:
err = b.unsafeAppendValue(v)
}
debug.Assert(err == nil, "Trying to insert wrong type into memoTable even though this method is statically typed. This is an implementation bug.")
}

func (b *dictBuilder[T]) Append(v T) error {
// TODO: it is safe to ignore the value returned by the calls to `appendValue()`
// here since `Append()` is statically typed and the only case in which that
// method errors is when trying to insert an invalid `interface{}` into the `memoTable`.
//
// This would be a breaking change to the public API of `dictBuilder`, so it needs
// to happen over a major release.
switch val := any(v).(type) {
case arrow.Duration:
return b.appendValue(int64(val))
Expand Down Expand Up @@ -1058,6 +1130,12 @@ func (b *BinaryDictionaryBuilder) Append(v []byte) error {
return nil
}

// TODO: it is safe to ignore the value returned by the call to `appendBytes()`
// here since `Append()` is statically typed and the only case in which that
// method errors is when trying to insert an invalid `interface{}` into the `memoTable`.
//
// This would be a breaking change to the public API of `BinaryDictionaryBuilder`,
// so it needs to happen over a major release.
return b.appendBytes(v)
}

Expand Down Expand Up @@ -1134,6 +1212,13 @@ type fixedSizeDictionaryBuilder[T fsbType] struct {
}

func (b *fixedSizeDictionaryBuilder[T]) Append(v T) error {
// TODO: it is safe to ignore the value returned by the calls to `appendValue()`
// and `appendBytes()` here since `Append()` is statically typed and the only
// case in which these method error is when trying to insert an invalid
// `interface{}` into the `memoTable`.
//
// This would be a breaking change to the public API of `fixedSizeDictionaryBuilder`,
// so it needs to happen over a major release.
if v, ok := any(v).([]byte); ok {
return b.appendBytes(v[:b.byteWidth])
}
Expand Down Expand Up @@ -1164,6 +1249,12 @@ type FixedSizeBinaryDictionaryBuilder struct {
}

func (b *FixedSizeBinaryDictionaryBuilder) Append(v []byte) error {
// TODO: it is safe to ignore the value returned by the calls to `appendValue()`
// here since `Append()` is statically typed and the only case in which that
// method errors is when trying to insert an invalid `interface{}` into the `memoTable`.
//
// This would be a breaking change to the public API of `FixedSizeBinaryDictionaryBuilder`,
// so it needs to happen over a major release.
return b.appendValue(v[:b.byteWidth])
}

Expand Down
41 changes: 41 additions & 0 deletions arrow/array/dictionary_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,47 @@ func (p *PrimitiveDictionaryTestSuite) TestDictionaryBuilderInit() {
p.True(array.Equal(expected, arr))
}

func (p *PrimitiveDictionaryTestSuite) TestDictionaryBuilderReserveAndAppend() {
expectedType := &arrow.DictionaryType{IndexType: &arrow.Int8Type{}, ValueType: p.typ}
bldr := array.NewDictionaryBuilder(p.mem, expectedType)
defer bldr.Release()

builder := reflect.ValueOf(bldr)
appendFn := builder.MethodByName("UnsafeAppend")
validFn := builder.MethodByName("UnsafeAppendBoolToBitmap")

bldr.Reserve(7)
validFn.Call([]reflect.Value{reflect.ValueOf(true)})
validFn.Call([]reflect.Value{reflect.ValueOf(false)})
appendFn.Call([]reflect.Value{reflect.ValueOf(0).Convert(p.reftyp)})
appendFn.Call([]reflect.Value{reflect.ValueOf(1).Convert(p.reftyp)})
validFn.Call([]reflect.Value{reflect.ValueOf(false)})
appendFn.Call([]reflect.Value{reflect.ValueOf(1).Convert(p.reftyp)})
appendFn.Call([]reflect.Value{reflect.ValueOf(2).Convert(p.reftyp)})

p.EqualValues(7, bldr.Len())
p.EqualValues(2, bldr.NullN())

p.EqualValues(3, bldr.DictionarySize())

arr := bldr.NewArray().(*array.Dictionary)
defer arr.Release()

p.True(arrow.TypeEqual(expectedType, arr.DataType()))
expectedDict, _, err := array.FromJSON(p.mem, expectedType.ValueType, strings.NewReader("[0, 1, 2]"))
p.NoError(err)
defer expectedDict.Release()

expectedIndices, _, err := array.FromJSON(p.mem, expectedType.IndexType, strings.NewReader("[0, null, 0, 1, null, 1, 2]"))
p.NoError(err)
defer expectedIndices.Release()

expected := array.NewDictionaryArray(expectedType, expectedIndices, expectedDict)
defer expected.Release()

p.True(array.Equal(expected, arr))
}

func (p *PrimitiveDictionaryTestSuite) TestDictionaryNewBuilder() {
valueType := p.typ
dictArr, _, err := array.FromJSON(p.mem, valueType, strings.NewReader("[1, 2]"))
Expand Down
4 changes: 4 additions & 0 deletions arrow/array/encoded.go
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,10 @@ func (b *RunEndEncodedBuilder) AppendNulls(n int) {
}
}

func (b *RunEndEncodedBuilder) UnsafeAppendBoolToBitmap(v bool) {
panic("Calling UnsafeAppendBoolToBitmap on a run-end encoded array is semantically undefined.")
}

func (b *RunEndEncodedBuilder) NullN() int {
return UnknownNullCount
}
Expand Down