80
80
81
81
import static org .assertj .core .api .Assertions .assertThat ;
82
82
import static org .assertj .core .api .Assertions .tuple ;
83
+ import static sleeper .core .properties .instance .CdkDefinedInstanceProperty .BULK_IMPORT_BUCKET ;
83
84
import static sleeper .core .properties .instance .CdkDefinedInstanceProperty .CONFIG_BUCKET ;
84
85
import static sleeper .core .properties .instance .CdkDefinedInstanceProperty .DATA_BUCKET ;
85
86
import static sleeper .core .properties .instance .CommonProperty .FILE_SYSTEM ;
@@ -372,6 +373,55 @@ void shouldNotThrowExceptionIfProvidedWithDirectoryWhichContainsParquetAndNonPar
372
373
ingestFinishedStatus (summary (startTime , endTime , 200 , 200 ), 1 ))));
373
374
}
374
375
376
+ @ ParameterizedTest
377
+ @ MethodSource ("getParameters" )
378
+ void shouldDeleteJsonFileAfterImport (BulkImportJobRunner runner ) throws IOException {
379
+ // Given
380
+ // - Write some data to be imported
381
+ List <Record > records = getRecords ();
382
+ writeRecordsToFile (records , dataDir + "/import/a.parquet" );
383
+ List <String > inputFiles = new ArrayList <>();
384
+ inputFiles .add (dataDir + "/import/a.parquet" );
385
+ // - State store
386
+ StateStore stateStore = createTable (instanceProperties , tableProperties );
387
+
388
+ // When
389
+ BulkImportJob job = jobForTable (tableProperties ).id ("my-job" ).files (inputFiles ).build ();
390
+ runJob (runner , instanceProperties , job );
391
+
392
+ // Then
393
+ List <FileReference > fileReferences = stateStore .getFileReferences ();
394
+ List <Record > readRecords = new ArrayList <>();
395
+ for (FileReference fileReference : fileReferences ) {
396
+ try (ParquetRecordReader reader = new ParquetRecordReader (new Path (fileReference .getFilename ()), schema )) {
397
+ List <Record > recordsInThisFile = new ArrayList <>();
398
+ Record record = reader .read ();
399
+ while (null != record ) {
400
+ Record clonedRecord = new Record (record );
401
+ readRecords .add (clonedRecord );
402
+ recordsInThisFile .add (clonedRecord );
403
+ record = reader .read ();
404
+ }
405
+ assertThat (recordsInThisFile ).isSortedAccordingTo (new RecordComparator (getSchema ()));
406
+ }
407
+ }
408
+ assertThat (readRecords ).hasSameSizeAs (records );
409
+
410
+ List <Record > expectedRecords = new ArrayList <>(records );
411
+ sortRecords (expectedRecords );
412
+ sortRecords (readRecords );
413
+ assertThat (readRecords ).isEqualTo (expectedRecords );
414
+ IngestJob ingestJob = job .toIngestJob ();
415
+ assertThat (tracker .getAllJobs (tableProperties .get (TABLE_ID )))
416
+ .containsExactly (ingestJobStatus (ingestJob , jobRunOnTask (taskId ,
417
+ ingestAcceptedStatus (ingestJob , validationTime ),
418
+ validatedIngestStartedStatus (ingestJob , startTime ),
419
+ ingestFinishedStatus (summary (startTime , endTime , 200 , 200 ), 1 ))));
420
+
421
+ // Check json file has been deleted
422
+ assertThat (listObjectKeys (instanceProperties .get (BULK_IMPORT_BUCKET ))).isEmpty ();
423
+ }
424
+
375
425
private static List <Record > readRecords (String filename , Schema schema ) {
376
426
try (ParquetRecordReader reader = new ParquetRecordReader (new Path (filename ), schema )) {
377
427
List <Record > readRecords = new ArrayList <>();
@@ -395,8 +445,10 @@ public InstanceProperties createInstanceProperties(String dir) {
395
445
InstanceProperties instanceProperties = createTestInstanceProperties ();
396
446
instanceProperties .set (DATA_BUCKET , dir );
397
447
instanceProperties .set (FILE_SYSTEM , "file://" );
448
+ instanceProperties .set (BULK_IMPORT_BUCKET , "bulkimport" );
398
449
399
450
createBucket (instanceProperties .get (CONFIG_BUCKET ));
451
+ createBucket (instanceProperties .get (BULK_IMPORT_BUCKET ));
400
452
DynamoDBTableIndexCreator .create (dynamoClient , instanceProperties );
401
453
new TransactionLogStateStoreCreator (instanceProperties , dynamoClient ).create ();
402
454
return instanceProperties ;
0 commit comments