Skip to content

Commit

Permalink
HIVE-28167: Iceberg: Full table deletion fails when using Copy-on-write
Browse files Browse the repository at this point in the history
  • Loading branch information
SourabhBadhya committed Apr 2, 2024
1 parent 0bc624a commit 7aae580
Show file tree
Hide file tree
Showing 7 changed files with 360 additions and 20 deletions.
4 changes: 2 additions & 2 deletions common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
Original file line number Diff line number Diff line change
Expand Up @@ -2691,8 +2691,8 @@ public static enum ConfVars {
"\n" +
"If the skew information is correctly stored in the metadata, hive.optimize.skewjoin.compiletime\n" +
"would change the query plan to take care of it, and hive.optimize.skewjoin will be a no-op."),

HIVE_OPTIMIZE_REPLACE_DELETE_WITH_TRUNCATE("hive.optimize.delete.all", false,
@Deprecated
HIVE_OPTIMIZE_REPLACE_DELETE_WITH_TRUNCATE("hive.optimize.delete.all", true,
"Optimize delete the entire data from table, use truncate instead"),
HIVE_OPTIMIZE_METADATA_DELETE("hive.optimize.delete.metadata.only", true,
"Optimize delete the entire data from table, use truncate instead"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
--! qt:replace:/(\S\"removed-files-size\\\":\\\")(\d+)(\\\")/$1#Masked#$3/

set hive.vectorized.execution.enabled=true;
set hive.optimize.delete.all=true;

create table ice01 (id int, key int) Stored by Iceberg stored as ORC
TBLPROPERTIES('format-version'='2', 'iceberg.delete.skiprowdata'='false');
Expand All @@ -35,4 +34,41 @@ select count(*) from ice01;
select * from ice01;
describe formatted ice01;

drop table ice01;
drop table ice01;

-- Create a V2 table with Copy-on-write as the deletion mode.
create table ice01 (id int, key int) stored by iceberg stored as orc tblproperties ('format-version'='2', 'write.delete.mode'='copy-on-write');

insert into ice01 values (1,1),(2,1),(3,1),(4,1);
insert into ice01 values (1,2),(2,2),(3,2),(4,2);
insert into ice01 values (1,3),(2,3),(3,3),(4,3);
insert into ice01 values (1,4),(2,4),(3,4),(4,4);
insert into ice01 values (1,5),(2,5),(3,5),(4,5);

explain analyze delete from ice01;

delete from ice01;

select count(*) from ice01;
select * from ice01;
describe formatted ice01;
drop table ice01;

-- Create a V1 table.
create table ice01 (id int, key int) stored by iceberg stored as orc;

insert into ice01 values (1,1),(2,1),(3,1),(4,1);
insert into ice01 values (1,2),(2,2),(3,2),(4,2);
insert into ice01 values (1,3),(2,3),(3,3),(4,3);
insert into ice01 values (1,4),(2,4),(3,4),(4,4);
insert into ice01 values (1,5),(2,5),(3,5),(4,5);

explain analyze delete from ice01;

-- Perform delete on the V1 table
delete from ice01;

select count(*) from ice01;
select * from ice01;
describe formatted ice01;
drop table ice01;
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,302 @@ POSTHOOK: type: DROPTABLE
POSTHOOK: Input: default@ice01
POSTHOOK: Output: database:default
POSTHOOK: Output: default@ice01
PREHOOK: query: create table ice01 (id int, key int) stored by iceberg stored as orc tblproperties ('format-version'='2', 'write.delete.mode'='copy-on-write')
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@ice01
POSTHOOK: query: create table ice01 (id int, key int) stored by iceberg stored as orc tblproperties ('format-version'='2', 'write.delete.mode'='copy-on-write')
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@ice01
PREHOOK: query: insert into ice01 values (1,1),(2,1),(3,1),(4,1)
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@ice01
POSTHOOK: query: insert into ice01 values (1,1),(2,1),(3,1),(4,1)
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@ice01
PREHOOK: query: insert into ice01 values (1,2),(2,2),(3,2),(4,2)
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@ice01
POSTHOOK: query: insert into ice01 values (1,2),(2,2),(3,2),(4,2)
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@ice01
PREHOOK: query: insert into ice01 values (1,3),(2,3),(3,3),(4,3)
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@ice01
POSTHOOK: query: insert into ice01 values (1,3),(2,3),(3,3),(4,3)
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@ice01
PREHOOK: query: insert into ice01 values (1,4),(2,4),(3,4),(4,4)
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@ice01
POSTHOOK: query: insert into ice01 values (1,4),(2,4),(3,4),(4,4)
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@ice01
PREHOOK: query: insert into ice01 values (1,5),(2,5),(3,5),(4,5)
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@ice01
POSTHOOK: query: insert into ice01 values (1,5),(2,5),(3,5),(4,5)
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@ice01
PREHOOK: query: delete from ice01
PREHOOK: type: TRUNCATETABLE
PREHOOK: Output: default@ice01
POSTHOOK: query: delete from ice01
POSTHOOK: type: TRUNCATETABLE
POSTHOOK: Output: default@ice01
PREHOOK: query: explain analyze delete from ice01
PREHOOK: type: TRUNCATETABLE
PREHOOK: Output: default@ice01
POSTHOOK: query: explain analyze delete from ice01
POSTHOOK: type: TRUNCATETABLE
POSTHOOK: Output: default@ice01
STAGE DEPENDENCIES:
Stage-0 is a root stage

STAGE PLANS:
Stage: Stage-0
Truncate Table or Partition
table name: default.ice01

PREHOOK: query: delete from ice01
PREHOOK: type: TRUNCATETABLE
PREHOOK: Output: default@ice01
POSTHOOK: query: delete from ice01
POSTHOOK: type: TRUNCATETABLE
POSTHOOK: Output: default@ice01
PREHOOK: query: select count(*) from ice01
PREHOOK: type: QUERY
PREHOOK: Input: default@ice01
PREHOOK: Output: hdfs://### HDFS PATH ###
POSTHOOK: query: select count(*) from ice01
POSTHOOK: type: QUERY
POSTHOOK: Input: default@ice01
POSTHOOK: Output: hdfs://### HDFS PATH ###
0
PREHOOK: query: select * from ice01
PREHOOK: type: QUERY
PREHOOK: Input: default@ice01
PREHOOK: Output: hdfs://### HDFS PATH ###
POSTHOOK: query: select * from ice01
POSTHOOK: type: QUERY
POSTHOOK: Input: default@ice01
POSTHOOK: Output: hdfs://### HDFS PATH ###
PREHOOK: query: describe formatted ice01
PREHOOK: type: DESCTABLE
PREHOOK: Input: default@ice01
POSTHOOK: query: describe formatted ice01
POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@ice01
# col_name data_type comment
id int
key int

# Detailed Table Information
Database: default
#### A masked pattern was here ####
Retention: 0
#### A masked pattern was here ####
Table Type: EXTERNAL_TABLE
Table Parameters:
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
EXTERNAL TRUE
bucketing_version 2
current-schema {\"type\":\"struct\",\"schema-id\":0,\"fields\":[{\"id\":1,\"name\":\"id\",\"required\":false,\"type\":\"int\"},{\"id\":2,\"name\":\"key\",\"required\":false,\"type\":\"int\"}]}
current-snapshot-id #Masked#
current-snapshot-summary {\"deleted-data-files\":\"5\",\"deleted-records\":\"20\",\"removed-files-size\":\"#Masked#\",\"changed-partition-count\":\"1\",\"total-records\":\"0\",\"total-files-size\":\"#Masked#\",\"total-data-files\":\"0\",\"total-delete-files\":\"0\",\"total-position-deletes\":\"0\",\"total-equality-deletes\":\"0\"}
current-snapshot-timestamp-ms #Masked#
format-version 2
iceberg.orc.files.only true
metadata_location hdfs://### HDFS PATH ###
numFiles 0
numRows 0
parquet.compression zstd
previous_metadata_location hdfs://### HDFS PATH ###
rawDataSize 0
serialization.format 1
snapshot-count 6
storage_handler org.apache.iceberg.mr.hive.HiveIcebergStorageHandler
table_type ICEBERG
totalSize #Masked#
#### A masked pattern was here ####
uuid #Masked#
write.delete.mode copy-on-write
write.format.default orc
write.merge.mode merge-on-read
write.update.mode merge-on-read

# Storage Information
SerDe Library: org.apache.iceberg.mr.hive.HiveIcebergSerDe
InputFormat: org.apache.iceberg.mr.hive.HiveIcebergInputFormat
OutputFormat: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat
Compressed: No
Sort Columns: []
PREHOOK: query: drop table ice01
PREHOOK: type: DROPTABLE
PREHOOK: Input: default@ice01
PREHOOK: Output: database:default
PREHOOK: Output: default@ice01
POSTHOOK: query: drop table ice01
POSTHOOK: type: DROPTABLE
POSTHOOK: Input: default@ice01
POSTHOOK: Output: database:default
POSTHOOK: Output: default@ice01
PREHOOK: query: create table ice01 (id int, key int) stored by iceberg stored as orc
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@ice01
POSTHOOK: query: create table ice01 (id int, key int) stored by iceberg stored as orc
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@ice01
PREHOOK: query: insert into ice01 values (1,1),(2,1),(3,1),(4,1)
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@ice01
POSTHOOK: query: insert into ice01 values (1,1),(2,1),(3,1),(4,1)
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@ice01
PREHOOK: query: insert into ice01 values (1,2),(2,2),(3,2),(4,2)
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@ice01
POSTHOOK: query: insert into ice01 values (1,2),(2,2),(3,2),(4,2)
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@ice01
PREHOOK: query: insert into ice01 values (1,3),(2,3),(3,3),(4,3)
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@ice01
POSTHOOK: query: insert into ice01 values (1,3),(2,3),(3,3),(4,3)
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@ice01
PREHOOK: query: insert into ice01 values (1,4),(2,4),(3,4),(4,4)
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@ice01
POSTHOOK: query: insert into ice01 values (1,4),(2,4),(3,4),(4,4)
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@ice01
PREHOOK: query: insert into ice01 values (1,5),(2,5),(3,5),(4,5)
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@ice01
POSTHOOK: query: insert into ice01 values (1,5),(2,5),(3,5),(4,5)
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@ice01
PREHOOK: query: delete from ice01
PREHOOK: type: TRUNCATETABLE
PREHOOK: Output: default@ice01
POSTHOOK: query: delete from ice01
POSTHOOK: type: TRUNCATETABLE
POSTHOOK: Output: default@ice01
PREHOOK: query: explain analyze delete from ice01
PREHOOK: type: TRUNCATETABLE
PREHOOK: Output: default@ice01
POSTHOOK: query: explain analyze delete from ice01
POSTHOOK: type: TRUNCATETABLE
POSTHOOK: Output: default@ice01
STAGE DEPENDENCIES:
Stage-0 is a root stage

STAGE PLANS:
Stage: Stage-0
Truncate Table or Partition
table name: default.ice01

PREHOOK: query: delete from ice01
PREHOOK: type: TRUNCATETABLE
PREHOOK: Output: default@ice01
POSTHOOK: query: delete from ice01
POSTHOOK: type: TRUNCATETABLE
POSTHOOK: Output: default@ice01
PREHOOK: query: select count(*) from ice01
PREHOOK: type: QUERY
PREHOOK: Input: default@ice01
PREHOOK: Output: hdfs://### HDFS PATH ###
POSTHOOK: query: select count(*) from ice01
POSTHOOK: type: QUERY
POSTHOOK: Input: default@ice01
POSTHOOK: Output: hdfs://### HDFS PATH ###
0
PREHOOK: query: select * from ice01
PREHOOK: type: QUERY
PREHOOK: Input: default@ice01
PREHOOK: Output: hdfs://### HDFS PATH ###
POSTHOOK: query: select * from ice01
POSTHOOK: type: QUERY
POSTHOOK: Input: default@ice01
POSTHOOK: Output: hdfs://### HDFS PATH ###
PREHOOK: query: describe formatted ice01
PREHOOK: type: DESCTABLE
PREHOOK: Input: default@ice01
POSTHOOK: query: describe formatted ice01
POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@ice01
# col_name data_type comment
id int
key int

# Detailed Table Information
Database: default
#### A masked pattern was here ####
Retention: 0
#### A masked pattern was here ####
Table Type: EXTERNAL_TABLE
Table Parameters:
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
EXTERNAL TRUE
bucketing_version 2
current-schema {\"type\":\"struct\",\"schema-id\":0,\"fields\":[{\"id\":1,\"name\":\"id\",\"required\":false,\"type\":\"int\"},{\"id\":2,\"name\":\"key\",\"required\":false,\"type\":\"int\"}]}
current-snapshot-id #Masked#
current-snapshot-summary {\"deleted-data-files\":\"5\",\"deleted-records\":\"20\",\"removed-files-size\":\"#Masked#\",\"changed-partition-count\":\"1\",\"total-records\":\"0\",\"total-files-size\":\"#Masked#\",\"total-data-files\":\"0\",\"total-delete-files\":\"0\",\"total-position-deletes\":\"0\",\"total-equality-deletes\":\"0\"}
current-snapshot-timestamp-ms #Masked#
format-version 2
iceberg.orc.files.only true
metadata_location hdfs://### HDFS PATH ###
numFiles 0
numRows 0
parquet.compression zstd
previous_metadata_location hdfs://### HDFS PATH ###
rawDataSize 0
serialization.format 1
snapshot-count 6
storage_handler org.apache.iceberg.mr.hive.HiveIcebergStorageHandler
table_type ICEBERG
totalSize #Masked#
#### A masked pattern was here ####
uuid #Masked#
write.format.default orc

# Storage Information
SerDe Library: org.apache.iceberg.mr.hive.HiveIcebergSerDe
InputFormat: org.apache.iceberg.mr.hive.HiveIcebergInputFormat
OutputFormat: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat
Compressed: No
Sort Columns: []
PREHOOK: query: drop table ice01
PREHOOK: type: DROPTABLE
PREHOOK: Input: default@ice01
PREHOOK: Output: database:default
PREHOOK: Output: default@ice01
POSTHOOK: query: drop table ice01
POSTHOOK: type: DROPTABLE
POSTHOOK: Input: default@ice01
POSTHOOK: Output: database:default
POSTHOOK: Output: default@ice01
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,12 @@
import org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lockmgr.LockException;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.rewrite.DeleteStatement;
import org.apache.hadoop.hive.ql.parse.rewrite.RewriterFactory;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.session.SessionState;

import java.util.Collections;
import java.util.List;
Expand Down Expand Up @@ -89,6 +91,19 @@ private void genTruncatePlan(Table table, ASTNode tabNameNode) throws SemanticEx
rewrittenCtx.setEnableUnparse(false);
truncate.analyze(rewrittenTree, rewrittenCtx);

boolean hasAcidDdl = truncate.getAcidDdlDesc() != null && truncate.getAcidDdlDesc().mayNeedWriteId();
if (hasAcidDdl) {
try {
String fqTableName = truncate.getAcidDdlDesc().getFullTableName();
TableName tableName = HiveTableName.of(fqTableName);
long writeId = SessionState.get().initTxnMgr(conf)
.getTableWriteId(tableName.getDb(), tableName.getTable());
truncate.getAcidDdlDesc().setWriteId(writeId);
} catch (LockException e) {
throw new SemanticException(e);
}
}

rootTasks = truncate.getRootTasks();
outputs = truncate.getOutputs();
updateOutputs(table);
Expand Down
Loading

0 comments on commit 7aae580

Please sign in to comment.