diff --git a/arrow/array/dictionary.go b/arrow/array/dictionary.go index 109d2a97..8f9fe245 100644 --- a/arrow/array/dictionary.go +++ b/arrow/array/dictionary.go @@ -314,7 +314,8 @@ func arrayApproxEqualDict(l, r *Dictionary, opt equalOption) bool { // helper for building the properly typed indices of the dictionary builder type IndexBuilder struct { Builder - Append func(int) + Append func(int) + UnsafeAppend func(int) } func createIndexBuilder(mem memory.Allocator, dt arrow.FixedWidthDataType) (ret IndexBuilder, err error) { @@ -324,34 +325,58 @@ func createIndexBuilder(mem memory.Allocator, dt arrow.FixedWidthDataType) (ret ret.Append = func(idx int) { ret.Builder.(*Int8Builder).Append(int8(idx)) } + ret.UnsafeAppend = func(idx int) { + ret.Builder.(*Int8Builder).UnsafeAppend(int8(idx)) + } case arrow.UINT8: ret.Append = func(idx int) { ret.Builder.(*Uint8Builder).Append(uint8(idx)) } + ret.UnsafeAppend = func(idx int) { + ret.Builder.(*Uint8Builder).UnsafeAppend(uint8(idx)) + } case arrow.INT16: ret.Append = func(idx int) { ret.Builder.(*Int16Builder).Append(int16(idx)) } + ret.UnsafeAppend = func(idx int) { + ret.Builder.(*Int16Builder).UnsafeAppend(int16(idx)) + } case arrow.UINT16: ret.Append = func(idx int) { ret.Builder.(*Uint16Builder).Append(uint16(idx)) } + ret.UnsafeAppend = func(idx int) { + ret.Builder.(*Uint16Builder).UnsafeAppend(uint16(idx)) + } case arrow.INT32: ret.Append = func(idx int) { ret.Builder.(*Int32Builder).Append(int32(idx)) } + ret.UnsafeAppend = func(idx int) { + ret.Builder.(*Int32Builder).UnsafeAppend(int32(idx)) + } case arrow.UINT32: ret.Append = func(idx int) { ret.Builder.(*Uint32Builder).Append(uint32(idx)) } + ret.UnsafeAppend = func(idx int) { + ret.Builder.(*Uint32Builder).UnsafeAppend(uint32(idx)) + } case arrow.INT64: ret.Append = func(idx int) { ret.Builder.(*Int64Builder).Append(int64(idx)) } + ret.UnsafeAppend = func(idx int) { + ret.Builder.(*Int64Builder).UnsafeAppend(int64(idx)) + } case arrow.UINT64: ret.Append = func(idx int) { ret.Builder.(*Uint64Builder).Append(uint64(idx)) } + ret.UnsafeAppend = func(idx int) { + ret.Builder.(*Uint64Builder).UnsafeAppend(uint64(idx)) + } default: debug.Assert(false, "dictionary index type must be integral") err = fmt.Errorf("dictionary index type must be integral, not %s", dt) @@ -646,6 +671,14 @@ func (b *dictionaryBuilder) AppendEmptyValues(n int) { } } +func (b *dictionaryBuilder) UnsafeAppendBoolToBitmap(v bool) { + if !v { + b.nulls += 1 + } + b.length += 1 + b.idxBuilder.UnsafeAppendBoolToBitmap(v) +} + func (b *dictionaryBuilder) Reserve(n int) { b.idxBuilder.Reserve(n) } @@ -781,6 +814,13 @@ func (b *dictionaryBuilder) insertDictBytes(val []byte) error { return err } +func (b *dictionaryBuilder) unsafeAppendValue(val interface{}) error { + idx, _, err := b.memoTable.GetOrInsert(val) + b.idxBuilder.UnsafeAppend(idx) + b.length += 1 + return err +} + func (b *dictionaryBuilder) appendValue(val interface{}) error { idx, _, err := b.memoTable.GetOrInsert(val) b.idxBuilder.Append(idx) @@ -990,7 +1030,39 @@ type dictBuilder[T arrow.ValueType] struct { dictionaryBuilder } +func (b *dictBuilder[T]) UnsafeAppend(v T) { + // SAFETY: it is safe to ignore the value returned by the calls to `unsafeAppendValue()` + // here since `UnsafeAppend()` is statically typed and the only case in which that method + // errors is when trying to insert an invalid `interface{}` into the `memoTable`. + var err error + switch val := any(v).(type) { + case arrow.Duration: + err = b.unsafeAppendValue(int64(val)) + case arrow.Timestamp: + err = b.unsafeAppendValue(int64(val)) + case arrow.Time32: + err = b.unsafeAppendValue(int32(val)) + case arrow.Time64: + err = b.unsafeAppendValue(int64(val)) + case arrow.Date32: + err = b.unsafeAppendValue(int32(val)) + case arrow.Date64: + err = b.unsafeAppendValue(int64(val)) + case arrow.MonthInterval: + err = b.unsafeAppendValue(int32(val)) + default: + err = b.unsafeAppendValue(v) + } + debug.Assert(err == nil, "Trying to insert wrong type into memoTable even though this method is statically typed. This is an implementation bug.") +} + func (b *dictBuilder[T]) Append(v T) error { + // TODO: it is safe to ignore the value returned by the calls to `appendValue()` + // here since `Append()` is statically typed and the only case in which that + // method errors is when trying to insert an invalid `interface{}` into the `memoTable`. + // + // This would be a breaking change to the public API of `dictBuilder`, so it needs + // to happen over a major release. switch val := any(v).(type) { case arrow.Duration: return b.appendValue(int64(val)) @@ -1058,6 +1130,12 @@ func (b *BinaryDictionaryBuilder) Append(v []byte) error { return nil } + // TODO: it is safe to ignore the value returned by the call to `appendBytes()` + // here since `Append()` is statically typed and the only case in which that + // method errors is when trying to insert an invalid `interface{}` into the `memoTable`. + // + // This would be a breaking change to the public API of `BinaryDictionaryBuilder`, + // so it needs to happen over a major release. return b.appendBytes(v) } @@ -1134,6 +1212,13 @@ type fixedSizeDictionaryBuilder[T fsbType] struct { } func (b *fixedSizeDictionaryBuilder[T]) Append(v T) error { + // TODO: it is safe to ignore the value returned by the calls to `appendValue()` + // and `appendBytes()` here since `Append()` is statically typed and the only + // case in which these method error is when trying to insert an invalid + // `interface{}` into the `memoTable`. + // + // This would be a breaking change to the public API of `fixedSizeDictionaryBuilder`, + // so it needs to happen over a major release. if v, ok := any(v).([]byte); ok { return b.appendBytes(v[:b.byteWidth]) } @@ -1164,6 +1249,12 @@ type FixedSizeBinaryDictionaryBuilder struct { } func (b *FixedSizeBinaryDictionaryBuilder) Append(v []byte) error { + // TODO: it is safe to ignore the value returned by the calls to `appendValue()` + // here since `Append()` is statically typed and the only case in which that + // method errors is when trying to insert an invalid `interface{}` into the `memoTable`. + // + // This would be a breaking change to the public API of `FixedSizeBinaryDictionaryBuilder`, + // so it needs to happen over a major release. return b.appendValue(v[:b.byteWidth]) } diff --git a/arrow/array/dictionary_test.go b/arrow/array/dictionary_test.go index 9b9d3b1f..24aab674 100644 --- a/arrow/array/dictionary_test.go +++ b/arrow/array/dictionary_test.go @@ -145,6 +145,47 @@ func (p *PrimitiveDictionaryTestSuite) TestDictionaryBuilderInit() { p.True(array.Equal(expected, arr)) } +func (p *PrimitiveDictionaryTestSuite) TestDictionaryBuilderReserveAndAppend() { + expectedType := &arrow.DictionaryType{IndexType: &arrow.Int8Type{}, ValueType: p.typ} + bldr := array.NewDictionaryBuilder(p.mem, expectedType) + defer bldr.Release() + + builder := reflect.ValueOf(bldr) + appendFn := builder.MethodByName("UnsafeAppend") + validFn := builder.MethodByName("UnsafeAppendBoolToBitmap") + + bldr.Reserve(7) + validFn.Call([]reflect.Value{reflect.ValueOf(true)}) + validFn.Call([]reflect.Value{reflect.ValueOf(false)}) + appendFn.Call([]reflect.Value{reflect.ValueOf(0).Convert(p.reftyp)}) + appendFn.Call([]reflect.Value{reflect.ValueOf(1).Convert(p.reftyp)}) + validFn.Call([]reflect.Value{reflect.ValueOf(false)}) + appendFn.Call([]reflect.Value{reflect.ValueOf(1).Convert(p.reftyp)}) + appendFn.Call([]reflect.Value{reflect.ValueOf(2).Convert(p.reftyp)}) + + p.EqualValues(7, bldr.Len()) + p.EqualValues(2, bldr.NullN()) + + p.EqualValues(3, bldr.DictionarySize()) + + arr := bldr.NewArray().(*array.Dictionary) + defer arr.Release() + + p.True(arrow.TypeEqual(expectedType, arr.DataType())) + expectedDict, _, err := array.FromJSON(p.mem, expectedType.ValueType, strings.NewReader("[0, 1, 2]")) + p.NoError(err) + defer expectedDict.Release() + + expectedIndices, _, err := array.FromJSON(p.mem, expectedType.IndexType, strings.NewReader("[0, null, 0, 1, null, 1, 2]")) + p.NoError(err) + defer expectedIndices.Release() + + expected := array.NewDictionaryArray(expectedType, expectedIndices, expectedDict) + defer expected.Release() + + p.True(array.Equal(expected, arr)) +} + func (p *PrimitiveDictionaryTestSuite) TestDictionaryNewBuilder() { valueType := p.typ dictArr, _, err := array.FromJSON(p.mem, valueType, strings.NewReader("[1, 2]")) diff --git a/arrow/array/encoded.go b/arrow/array/encoded.go index 08800d4b..85432a13 100644 --- a/arrow/array/encoded.go +++ b/arrow/array/encoded.go @@ -398,6 +398,10 @@ func (b *RunEndEncodedBuilder) AppendNulls(n int) { } } +func (b *RunEndEncodedBuilder) UnsafeAppendBoolToBitmap(v bool) { + panic("Calling UnsafeAppendBoolToBitmap on a run-end encoded array is semantically undefined.") +} + func (b *RunEndEncodedBuilder) NullN() int { return UnknownNullCount }