Fix bulkinsert bug that segments are compacted after import
Signed-off-by: yhmo <[email protected]>
yhmo committed Nov 7, 2023
1 parent 765aaf3 commit 244205b
Showing 5 changed files with 39 additions and 30 deletions.
4 changes: 2 additions & 2 deletions internal/datanode/services.go
@@ -499,8 +499,8 @@ func (node *DataNode) Import(ctx context.Context, req *datapb.ImportTaskRequest)

// parse files and generate segments
segmentSize := Params.DataCoordCfg.SegmentMaxSize.GetAsInt64() * 1024 * 1024
-importWrapper := importutil.NewImportWrapper(newCtx, collectionInfo, segmentSize, node.allocator.GetIDAlloactor(),
-node.chunkManager, importResult, reportFunc)
+importWrapper := importutil.NewImportWrapper(newCtx, collectionInfo, segmentSize, Params.DataNodeCfg.BinLogMaxSize.GetAsInt64(),
+node.allocator.GetIDAlloactor(), node.chunkManager, importResult, reportFunc)
importWrapper.SetCallbackFunctions(assignSegmentFunc(node, req),
createBinLogsFunc(node, req, colInfo.GetSchema(), ts),
saveSegmentFunc(node, req, importResult, ts))
21 changes: 15 additions & 6 deletions internal/util/importutil/import_wrapper.go
@@ -39,8 +39,8 @@ const (
JSONFileExt = ".json"
NumpyFileExt = ".npy"

-// supposed size of a single block, to control a binlog file size, the max biglog file size is no more than 2*SingleBlockSize
-SingleBlockSize = 16 * 1024 * 1024 // 16MB
+// parsers read JSON/Numpy/CSV files buffer by buffer, this limitation is to define the buffer size.
+ReadBufferSize = 16 * 1024 * 1024 // 16MB

// this limitation is to avoid this OOM risk:
// simetimes system segment max size is a large number, a single segment fields data might cause OOM.
@@ -92,6 +92,7 @@ type ImportWrapper struct {
cancel context.CancelFunc // for canceling parse process
collectionInfo *CollectionInfo // collection details including schema
segmentSize int64 // maximum size of a segment(unit:byte) defined by dataCoord.segment.maxSize (milvus.yml)
+binlogSize int64 // average binlog size(unit:byte), the max biglog file size is no more than 2*binlogSize
rowIDAllocator *allocator.IDAllocator // autoid allocator
chunkManager storage.ChunkManager

@@ -107,7 +108,7 @@ type ImportWrapper struct {
progressPercent int64 // working progress percent
}

-func NewImportWrapper(ctx context.Context, collectionInfo *CollectionInfo, segmentSize int64,
+func NewImportWrapper(ctx context.Context, collectionInfo *CollectionInfo, segmentSize int64, maxBinlogSize int64,
idAlloc *allocator.IDAllocator, cm storage.ChunkManager, importResult *rootcoordpb.ImportResult,
reportFunc func(res *rootcoordpb.ImportResult) error,
) *ImportWrapper {
@@ -120,11 +121,19 @@ func NewImportWrapper(ctx context.Context, collectionInfo *CollectionInfo, segme

ctx, cancel := context.WithCancel(ctx)

+// average binlogSize is expected to be half of the maxBinlogSize
+// and avoid binlogSize to be a tiny value
+binlogSize := int64(float32(maxBinlogSize) * 0.5)
+if binlogSize < ReadBufferSize {
+binlogSize = ReadBufferSize
+}

wrapper := &ImportWrapper{
ctx: ctx,
cancel: cancel,
collectionInfo: collectionInfo,
segmentSize: segmentSize,
+binlogSize: binlogSize,
rowIDAllocator: idAlloc,
chunkManager: cm,
importResult: importResult,
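
A quick standalone sanity check of the clamp introduced above (a minimal sketch, not part of the commit; the 64MB and 8MB inputs are illustrative values, not Milvus defaults):

package main

import "fmt"

// ReadBufferSize mirrors the 16MB constant from import_wrapper.go above.
const ReadBufferSize = 16 * 1024 * 1024

// deriveBinlogSize reproduces the logic added to NewImportWrapper:
// target half of the configured max binlog size, but never drop below ReadBufferSize.
func deriveBinlogSize(maxBinlogSize int64) int64 {
	binlogSize := int64(float32(maxBinlogSize) * 0.5)
	if binlogSize < ReadBufferSize {
		binlogSize = ReadBufferSize
	}
	return binlogSize
}

func main() {
	fmt.Println(deriveBinlogSize(64 * 1024 * 1024)) // 33554432 (32MB): half of a 64MB max
	fmt.Println(deriveBinlogSize(8 * 1024 * 1024))  // 16777216 (16MB): clamped up to the read buffer size
}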
@@ -282,7 +291,7 @@ func (p *ImportWrapper) Import(filePaths []string, options ImportOptions) error
printFieldsDataInfo(fields, "import wrapper: prepare to flush binlog data", filePaths)
return p.flushFunc(fields, shardID, partitionID)
}
-parser, err := NewNumpyParser(p.ctx, p.collectionInfo, p.rowIDAllocator, SingleBlockSize,
+parser, err := NewNumpyParser(p.ctx, p.collectionInfo, p.rowIDAllocator, p.binlogSize,
p.chunkManager, flushFunc, p.updateProgressPercent)
if err != nil {
return err
@@ -384,7 +393,7 @@ func (p *ImportWrapper) doBinlogImport(filePaths []string, tsStartPoint uint64,
printFieldsDataInfo(fields, "import wrapper: prepare to flush binlog data", filePaths)
return p.flushFunc(fields, shardID, partitionID)
}
-parser, err := NewBinlogParser(p.ctx, p.collectionInfo, SingleBlockSize,
+parser, err := NewBinlogParser(p.ctx, p.collectionInfo, p.binlogSize,
p.chunkManager, flushFunc, p.updateProgressPercent, tsStartPoint, tsEndPoint)
if err != nil {
return err
@@ -433,7 +442,7 @@ func (p *ImportWrapper) parseRowBasedJSON(filePath string, onlyValidate bool) er
}
}

-consumer, err := NewJSONRowConsumer(p.ctx, p.collectionInfo, p.rowIDAllocator, SingleBlockSize, flushFunc)
+consumer, err := NewJSONRowConsumer(p.ctx, p.collectionInfo, p.rowIDAllocator, p.binlogSize, flushFunc)
if err != nil {
return err
}
38 changes: 19 additions & 19 deletions internal/util/importutil/import_wrapper_test.go
@@ -185,7 +185,7 @@ func Test_ImportWrapperNew(t *testing.T) {
ctx := context.Background()
cm, err := f.NewPersistentStorageChunkManager(ctx)
assert.NoError(t, err)
-wrapper := NewImportWrapper(ctx, nil, 1, nil, cm, nil, nil)
+wrapper := NewImportWrapper(ctx, nil, 1, ReadBufferSize, nil, cm, nil, nil)
assert.Nil(t, wrapper)

schema := &schemapb.CollectionSchema{
@@ -205,7 +205,7 @@ func Test_ImportWrapperNew(t *testing.T) {
})
collectionInfo, err := NewCollectionInfo(schema, 2, []int64{1})
assert.NoError(t, err)
-wrapper = NewImportWrapper(ctx, collectionInfo, 1, nil, cm, nil, nil)
+wrapper = NewImportWrapper(ctx, collectionInfo, 1, ReadBufferSize, nil, cm, nil, nil)
assert.NotNil(t, wrapper)

assignSegFunc := func(shardID int, partID int64) (int64, string, error) {
@@ -282,7 +282,7 @@ func Test_ImportWrapperRowBased(t *testing.T) {
assert.NoError(t, err)

t.Run("success case", func(t *testing.T) {
-wrapper := NewImportWrapper(ctx, collectionInfo, 1, idAllocator, cm, importResult, reportFunc)
+wrapper := NewImportWrapper(ctx, collectionInfo, 1, ReadBufferSize, idAllocator, cm, importResult, reportFunc)
wrapper.SetCallbackFunctions(assignSegmentFunc, flushFunc, saveSegmentFunc)
files := make([]string, 0)
files = append(files, filePath)
@@ -308,7 +308,7 @@ func Test_ImportWrapperRowBased(t *testing.T) {
assert.NoError(t, err)

importResult.State = commonpb.ImportState_ImportStarted
-wrapper := NewImportWrapper(ctx, collectionInfo, 1, idAllocator, cm, importResult, reportFunc)
+wrapper := NewImportWrapper(ctx, collectionInfo, 1, ReadBufferSize, idAllocator, cm, importResult, reportFunc)
wrapper.SetCallbackFunctions(assignSegmentFunc, flushFunc, saveSegmentFunc)
files := make([]string, 0)
files = append(files, filePath)
@@ -320,7 +320,7 @@ func Test_ImportWrapperRowBased(t *testing.T) {
t.Run("file doesn't exist", func(t *testing.T) {
files := make([]string, 0)
files = append(files, "/dummy/dummy.json")
-wrapper := NewImportWrapper(ctx, collectionInfo, 1, idAllocator, cm, importResult, reportFunc)
+wrapper := NewImportWrapper(ctx, collectionInfo, 1, ReadBufferSize, idAllocator, cm, importResult, reportFunc)
err = wrapper.Import(files, ImportOptions{OnlyValidate: true})
assert.Error(t, err)
})
@@ -362,7 +362,7 @@ func Test_ImportWrapperColumnBased_numpy(t *testing.T) {
files := createSampleNumpyFiles(t, cm)

t.Run("success case", func(t *testing.T) {
-wrapper := NewImportWrapper(ctx, collectionInfo, 1, idAllocator, cm, importResult, reportFunc)
+wrapper := NewImportWrapper(ctx, collectionInfo, 1, ReadBufferSize, idAllocator, cm, importResult, reportFunc)
wrapper.SetCallbackFunctions(assignSegmentFunc, flushFunc, saveSegmentFunc)

err = wrapper.Import(files, DefaultImportOptions())
@@ -380,7 +380,7 @@ func Test_ImportWrapperColumnBased_numpy(t *testing.T) {
files[1] = filePath

importResult.State = commonpb.ImportState_ImportStarted
-wrapper := NewImportWrapper(ctx, collectionInfo, 1, idAllocator, cm, importResult, reportFunc)
+wrapper := NewImportWrapper(ctx, collectionInfo, 1, ReadBufferSize, idAllocator, cm, importResult, reportFunc)
wrapper.SetCallbackFunctions(assignSegmentFunc, flushFunc, saveSegmentFunc)

err = wrapper.Import(files, DefaultImportOptions())
@@ -391,7 +391,7 @@ func Test_ImportWrapperColumnBased_numpy(t *testing.T) {
t.Run("file doesn't exist", func(t *testing.T) {
files := make([]string, 0)
files = append(files, "/dummy/dummy.npy")
-wrapper := NewImportWrapper(ctx, collectionInfo, 1, idAllocator, cm, importResult, reportFunc)
+wrapper := NewImportWrapper(ctx, collectionInfo, 1, ReadBufferSize, idAllocator, cm, importResult, reportFunc)
err = wrapper.Import(files, DefaultImportOptions())
assert.Error(t, err)
})
@@ -511,7 +511,7 @@ func Test_ImportWrapperRowBased_perf(t *testing.T) {
}
collectionInfo, err := NewCollectionInfo(schema, int32(shardNum), []int64{1})
assert.NoError(t, err)
-wrapper := NewImportWrapper(ctx, collectionInfo, int64(segmentSize), idAllocator, cm, importResult, reportFunc)
+wrapper := NewImportWrapper(ctx, collectionInfo, int64(segmentSize), ReadBufferSize, idAllocator, cm, importResult, reportFunc)
wrapper.SetCallbackFunctions(assignSegmentFunc, flushFunc, saveSegmentFunc)

files := make([]string, 0)
@@ -555,7 +555,7 @@ func Test_ImportWrapperFileValidation(t *testing.T) {

collectionInfo, err := NewCollectionInfo(schema, int32(shardNum), []int64{1})
assert.NoError(t, err)
-wrapper := NewImportWrapper(ctx, collectionInfo, int64(segmentSize), idAllocator, cm, nil, nil)
+wrapper := NewImportWrapper(ctx, collectionInfo, int64(segmentSize), ReadBufferSize, idAllocator, cm, nil, nil)

t.Run("unsupported file type", func(t *testing.T) {
files := []string{"uid.txt"}
@@ -605,7 +605,7 @@ func Test_ImportWrapperFileValidation(t *testing.T) {
t.Run("empty file list", func(t *testing.T) {
files := []string{}
cm.size = 0
-wrapper = NewImportWrapper(ctx, collectionInfo, int64(segmentSize), idAllocator, cm, nil, nil)
+wrapper = NewImportWrapper(ctx, collectionInfo, int64(segmentSize), ReadBufferSize, idAllocator, cm, nil, nil)
rowBased, err := wrapper.fileValidation(files)
assert.NoError(t, err)
assert.False(t, rowBased)
@@ -614,7 +614,7 @@ func Test_ImportWrapperFileValidation(t *testing.T) {
t.Run("file size exceed MaxFileSize limit", func(t *testing.T) {
files := []string{"a/1.json"}
cm.size = params.Params.CommonCfg.ImportMaxFileSize.GetAsInt64() + 1
-wrapper = NewImportWrapper(ctx, collectionInfo, int64(segmentSize), idAllocator, cm, nil, nil)
+wrapper = NewImportWrapper(ctx, collectionInfo, int64(segmentSize), ReadBufferSize, idAllocator, cm, nil, nil)
rowBased, err := wrapper.fileValidation(files)
assert.Error(t, err)
assert.True(t, rowBased)
@@ -685,7 +685,7 @@ func Test_ImportWrapperReportFailRowBased(t *testing.T) {
}
collectionInfo, err := NewCollectionInfo(sampleSchema(), 2, []int64{1})
assert.NoError(t, err)
-wrapper := NewImportWrapper(ctx, collectionInfo, 1, idAllocator, cm, importResult, reportFunc)
+wrapper := NewImportWrapper(ctx, collectionInfo, 1, ReadBufferSize, idAllocator, cm, importResult, reportFunc)
wrapper.SetCallbackFunctions(assignSegmentFunc, flushFunc, saveSegmentFunc)

files := []string{filePath}
@@ -732,7 +732,7 @@ func Test_ImportWrapperReportFailColumnBased_numpy(t *testing.T) {
}
collectionInfo, err := NewCollectionInfo(sampleSchema(), 2, []int64{1})
assert.NoError(t, err)
-wrapper := NewImportWrapper(ctx, collectionInfo, 1, idAllocator, cm, importResult, reportFunc)
+wrapper := NewImportWrapper(ctx, collectionInfo, 1, ReadBufferSize, idAllocator, cm, importResult, reportFunc)
wrapper.SetCallbackFunctions(assignSegmentFunc, flushFunc, saveSegmentFunc)

wrapper.reportImportAttempts = 2
@@ -767,7 +767,7 @@ func Test_ImportWrapperIsBinlogImport(t *testing.T) {

collectionInfo, err := NewCollectionInfo(schema, int32(shardNum), []int64{1})
assert.NoError(t, err)
-wrapper := NewImportWrapper(ctx, collectionInfo, int64(segmentSize), idAllocator, cm, nil, nil)
+wrapper := NewImportWrapper(ctx, collectionInfo, int64(segmentSize), ReadBufferSize, idAllocator, cm, nil, nil)

// empty paths
paths := []string{}
@@ -831,7 +831,7 @@ func Test_ImportWrapperDoBinlogImport(t *testing.T) {

collectionInfo, err := NewCollectionInfo(schema, int32(shardNum), []int64{1})
assert.NoError(t, err)
-wrapper := NewImportWrapper(ctx, collectionInfo, int64(segmentSize), idAllocator, cm, nil, nil)
+wrapper := NewImportWrapper(ctx, collectionInfo, int64(segmentSize), ReadBufferSize, idAllocator, cm, nil, nil)
paths := []string{
"/tmp",
"/tmp",
@@ -894,7 +894,7 @@ func Test_ImportWrapperReportPersisted(t *testing.T) {
}
collectionInfo, err := NewCollectionInfo(sampleSchema(), 2, []int64{1})
assert.NoError(t, err)
-wrapper := NewImportWrapper(ctx, collectionInfo, int64(1024), nil, nil, importResult, reportFunc)
+wrapper := NewImportWrapper(ctx, collectionInfo, int64(1024), ReadBufferSize, nil, nil, importResult, reportFunc)
assert.NotNil(t, wrapper)

rowCounter := &rowCounterTest{}
@@ -937,7 +937,7 @@ func Test_ImportWrapperUpdateProgressPercent(t *testing.T) {

collectionInfo, err := NewCollectionInfo(sampleSchema(), 2, []int64{1})
assert.NoError(t, err)
-wrapper := NewImportWrapper(ctx, collectionInfo, 1, nil, nil, nil, nil)
+wrapper := NewImportWrapper(ctx, collectionInfo, 1, ReadBufferSize, nil, nil, nil, nil)
assert.NotNil(t, wrapper)
assert.Equal(t, int64(0), wrapper.progressPercent)

@@ -976,7 +976,7 @@ func Test_ImportWrapperFlushFunc(t *testing.T) {
schema := sampleSchema()
collectionInfo, err := NewCollectionInfo(schema, 2, []int64{1})
assert.NoError(t, err)
-wrapper := NewImportWrapper(ctx, collectionInfo, 1, nil, nil, importResult, reportFunc)
+wrapper := NewImportWrapper(ctx, collectionInfo, 1, ReadBufferSize, nil, nil, importResult, reportFunc)
assert.NotNil(t, wrapper)
wrapper.SetCallbackFunctions(assignSegmentFunc, flushFunc, saveSegmentFunc)

2 changes: 1 addition & 1 deletion internal/util/importutil/json_parser.go
@@ -73,7 +73,7 @@ func adjustBufSize(parser *JSONParser, collectionSchema *schemapb.CollectionSche
// for low dimensional vector, the bufSize is a large value, read more rows each time
bufRowCount := parser.bufRowCount
for {
-if bufRowCount*sizePerRecord > SingleBlockSize {
+if bufRowCount*sizePerRecord > ReadBufferSize {
bufRowCount--
} else {
break
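
The loop above walks bufRowCount down until the buffered rows fit into ReadBufferSize, i.e. it converges to roughly ReadBufferSize / sizePerRecord. A minimal standalone sketch (the 512-byte row size is an illustrative value, not taken from the commit):

package main

import "fmt"

// ReadBufferSize mirrors the 16MB constant from import_wrapper.go above.
const ReadBufferSize = 16 * 1024 * 1024

func main() {
	sizePerRecord := 512   // illustrative bytes per row, e.g. a 128-dim float32 vector
	bufRowCount := 1 << 20 // a starting value larger than the cap
	// same bound as the decrement loop in adjustBufSize
	for bufRowCount*sizePerRecord > ReadBufferSize {
		bufRowCount--
	}
	fmt.Println(bufRowCount, ReadBufferSize/sizePerRecord) // 32768 32768
}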
4 changes: 2 additions & 2 deletions internal/util/importutil/numpy_adapter.go
@@ -563,9 +563,9 @@ func (n *NumpyAdapter) ReadString(count int) ([]string, error) {
// read string one by one is not efficient, here we read strings batch by batch, each bach size is no more than 16MB
batchRead := 1 // rows of each batch, make sure this value is equal or greater than 1
if utf {
-batchRead += SingleBlockSize / (utf8.UTFMax * maxLen)
+batchRead += ReadBufferSize / (utf8.UTFMax * maxLen)
} else {
-batchRead += SingleBlockSize / maxLen
+batchRead += ReadBufferSize / maxLen
}

log.Info("Numpy adapter: prepare to read varchar batch by batch",
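
For scale, the batch sizes above work out as follows (a minimal sketch; maxLen = 256 is an illustrative maximum string length, not a value from the commit):

package main

import (
	"fmt"
	"unicode/utf8"
)

// ReadBufferSize mirrors the 16MB constant from import_wrapper.go above.
const ReadBufferSize = 16 * 1024 * 1024

func main() {
	maxLen := 256                                       // illustrative maximum string length of the numpy column
	utfBatch := 1 + ReadBufferSize/(utf8.UTFMax*maxLen) // UTF-8: assume up to 4 bytes per rune
	rawBatch := 1 + ReadBufferSize/maxLen               // fixed-width (non-UTF) case
	fmt.Println(utfBatch, rawBatch) // 16385 65537
}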
