HIVE-28266: Iceberg: Select count(*) from data_files metadata tables gives wrong result (Dmitriy Fingerman, reviewed by Butao Zhang, Denys Kuzmenko)

Closes apache#5253
difin authored and dengzhhu653 committed Sep 20, 2024
1 parent 16af516 commit 9928754
Showing 6 changed files with 192 additions and 4 deletions.
@@ -586,7 +586,7 @@ private ColumnStatistics readColStats(Table table, Path statsPath) {

   @Override
   public boolean canComputeQueryUsingStats(org.apache.hadoop.hive.ql.metadata.Table hmsTable) {
-    if (getStatsSource().equals(HiveMetaHook.ICEBERG)) {
+    if (getStatsSource().equals(HiveMetaHook.ICEBERG) && hmsTable.getMetaTable() == null) {
       Table table = getTable(hmsTable);
       if (table.currentSnapshot() != null) {
         Map<String, String> summary = table.currentSnapshot().summary();
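The one-line guard above is the core of the fix. canComputeQueryUsingStats lets Hive answer count(*) straight from table statistics, and for Iceberg those statistics come from the current snapshot summary, whose counters describe the base table. When the query targets a metadata table such as data_files, hmsTable.getMetaTable() is non-null and the shortcut must be skipped so the metadata table is actually scanned. A self-contained sketch of the failure mode, with a hypothetical class name and the summary map standing in for table.currentSnapshot().summary() (the keys are real snapshot-summary keys, visible in the desc formatted output further down):

    import java.util.Map;

    class StatsShortcutSketch {
      // Illustrative only: why the base table's snapshot summary cannot
      // answer count(*) for the data_files metadata table.
      static long wrongAnswer(Map<String, String> summary) {
        return Long.parseLong(summary.get("total-records"));     // base-table rows, e.g. 7
      }

      static long expectedAnswer(Map<String, String> summary) {
        return Long.parseLong(summary.get("total-data-files"));  // data_files rows, e.g. 2 before compaction
      }
    }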
@@ -1022,7 +1022,7 @@ public void alterTableSnapshotRefOperation(org.apache.hadoop.hive.ql.metadata.Ta

   @Override
   public boolean isValidMetadataTable(String metaTableName) {
-    return IcebergMetadataTables.isValidMetaTable(metaTableName);
+    return metaTableName != null && IcebergMetadataTables.isValidMetaTable(metaTableName);
   }

   @Override
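The added null check makes isValidMetadataTable safe to call with the raw result of Properties.getProperty, which is null for a regular table; that is what allows the simplification in the next hunk. A minimal self-contained sketch of the contract (the validator set here is a hypothetical subset, not the real IcebergMetadataTables):

    import java.util.Set;

    class MetaTableNameCheck {
      // Hypothetical subset of metadata table names, for illustration only.
      private static final Set<String> VALID = Set.of("data_files", "snapshots", "manifests");

      static boolean isValidMetadataTable(String metaTableName) {
        // Null-safe: a regular table query carries no "metaTable" property, so
        // callers may pass the raw getProperty(...) result and get false, not an NPE.
        return metaTableName != null && VALID.contains(metaTableName);
      }

      public static void main(String[] args) {
        System.out.println(isValidMetadataTable(null));          // false
        System.out.println(isValidMetadataTable("data_files"));  // true
      }
    }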
@@ -1502,7 +1502,7 @@ private String collectColumnAndReplaceDummyValues(ExprNodeDesc node, String foun
   private void fallbackToNonVectorizedModeBasedOnProperties(Properties tableProps) {
     Schema tableSchema = SchemaParser.fromJson(tableProps.getProperty(InputFormatConfig.TABLE_SCHEMA));
     if (FileFormat.AVRO.name().equalsIgnoreCase(tableProps.getProperty(TableProperties.DEFAULT_FILE_FORMAT)) ||
-        (tableProps.containsKey("metaTable") && isValidMetadataTable(tableProps.getProperty("metaTable"))) ||
+        isValidMetadataTable(tableProps.getProperty(IcebergAcidUtil.META_TABLE_PROPERTY)) ||
         hasOrcTimeInSchema(tableProps, tableSchema) ||
         !hasParquetNestedTypeWithinListOrMap(tableProps, tableSchema)) {
       conf.setBoolean(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED.varname, false);
@@ -48,6 +48,7 @@ private IcebergAcidUtil() {
   private static final Types.NestedField PARTITION_STRUCT_META_COL = null; // placeholder value in the map
   private static final Map<Types.NestedField, Integer> FILE_READ_META_COLS = Maps.newLinkedHashMap();
   private static final Map<String, Types.NestedField> VIRTUAL_COLS_TO_META_COLS = Maps.newLinkedHashMap();
+  public static final String META_TABLE_PROPERTY = "metaTable";

   static {
     FILE_READ_META_COLS.put(MetadataColumns.SPEC_ID, 0);
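The new constant replaces the "metaTable" string literal that was previously duplicated across the storage handler and the table-loading code, so the property name can no longer drift between readers and writers. Usage follows plain java.util.Properties semantics; this demo assumes the patched IcebergAcidUtil is on the classpath:

    import java.util.Properties;

    class MetaTablePropertyDemo {
      public static void main(String[] args) {
        Properties props = new Properties();
        props.setProperty(IcebergAcidUtil.META_TABLE_PROPERTY, "data_files");
        // Prints "data_files": the query targets a metadata table.
        System.out.println(props.getProperty(IcebergAcidUtil.META_TABLE_PROPERTY));
        // getProperty returns null when the key is absent (a regular table),
        // which the null-safe isValidMetadataTable above now tolerates.
      }
    }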
@@ -101,7 +101,7 @@ static Table getTable(Configuration configuration, org.apache.hadoop.hive.metast
    * @return an Iceberg table
    */
   static Table getTable(Configuration configuration, Properties properties, boolean skipCache) {
-    String metaTable = properties.getProperty("metaTable");
+    String metaTable = properties.getProperty(IcebergAcidUtil.META_TABLE_PROPERTY);
     String tableName = properties.getProperty(Catalogs.NAME);
     String location = properties.getProperty(Catalogs.LOCATION);
     if (metaTable != null) {
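With the shared constant in place, getTable keeps its existing dispatch: a non-null metaTable value resolves the corresponding metadata table on top of the base table named by Catalogs.NAME, while null (property absent) loads the base table itself.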
@@ -0,0 +1,44 @@ (new test: iceberg_major_compaction_query_metadata.q, registered in testconfiguration.properties below)
-- Mask random uuid
--! qt:replace:/(\s+uuid\s+)\S+(\s*)/$1#Masked#$2/
-- Mask a random snapshot id
--! qt:replace:/(\s+current-snapshot-id\s+)\S+(\s*)/$1#Masked#/
-- Mask current-snapshot-timestamp-ms
--! qt:replace:/(\s+current-snapshot-timestamp-ms\s+)\S+(\s*)/$1#Masked#$2/
-- Mask added file size
--! qt:replace:/(\S\"added-files-size\\\":\\\")(\d+)(\\\")/$1#Masked#$3/
-- Mask total file size
--! qt:replace:/(\S\"total-files-size\\\":\\\")(\d+)(\\\")/$1#Masked#$3/
-- Mask show compactions fields that change across runs
--! qt:replace:/(MAJOR\s+succeeded\s+)[a-zA-Z0-9\-\.\s+]+(\s+manual)/$1#Masked#$2/
--! qt:replace:/(\s+totalSize\s+)\S+(\s+)/$1#Masked#/

set hive.llap.io.enabled=true;
set hive.vectorized.execution.enabled=true;
set hive.optimize.shared.work.merge.ts.schema=true;
set iceberg.mr.schema.auto.conversion=true;

CREATE TABLE x (name VARCHAR(50), age TINYINT, num_clicks BIGINT)
stored by iceberg stored as orc
TBLPROPERTIES ('external.table.purge'='true','format-version'='2');

insert into x values
('amy', 35, 123412344),
('adxfvy', 36, 123412534),
('amsdfyy', 37, 123417234),
('asafmy', 38, 123412534);

insert into x values
('amerqwy', 39, 123441234),
('amyxzcv', 40, 123341234),
('erweramy', 45, 122341234);

select * from default.x.data_files;
select count(*) from default.x.data_files;

alter table x compact 'major' and wait;

show compactions;
desc formatted x;

select * from default.x.data_files;
select count(*) from default.x.data_files;
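The qtest drives the fix end to end: each of the two INSERT statements writes one data file, so count(*) over data_files must return 2 from a real scan (before the fix, the stats shortcut would presumably have returned the base table's row count of 7 instead), and after the major compaction rewrites the table into a single file the same query must return 1. The expected output below records both counts.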
@@ -0,0 +1,142 @@ (expected output: iceberg_major_compaction_query_metadata.q.out)
PREHOOK: query: CREATE TABLE x (name VARCHAR(50), age TINYINT, num_clicks BIGINT)
stored by iceberg stored as orc
TBLPROPERTIES ('external.table.purge'='true','format-version'='2')
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@x
POSTHOOK: query: CREATE TABLE x (name VARCHAR(50), age TINYINT, num_clicks BIGINT)
stored by iceberg stored as orc
TBLPROPERTIES ('external.table.purge'='true','format-version'='2')
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@x
PREHOOK: query: insert into x values
('amy', 35, 123412344),
('adxfvy', 36, 123412534),
('amsdfyy', 37, 123417234),
('asafmy', 38, 123412534)
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@x
POSTHOOK: query: insert into x values
('amy', 35, 123412344),
('adxfvy', 36, 123412534),
('amsdfyy', 37, 123417234),
('asafmy', 38, 123412534)
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@x
PREHOOK: query: insert into x values
('amerqwy', 39, 123441234),
('amyxzcv', 40, 123341234),
('erweramy', 45, 122341234)
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@x
POSTHOOK: query: insert into x values
('amerqwy', 39, 123441234),
('amyxzcv', 40, 123341234),
('erweramy', 45, 122341234)
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@x
PREHOOK: query: select * from default.x.data_files
PREHOOK: type: QUERY
PREHOOK: Input: default@x
#### A masked pattern was here ####
POSTHOOK: query: select * from default.x.data_files
POSTHOOK: type: QUERY
POSTHOOK: Input: default@x
#### A masked pattern was here ####
PREHOOK: query: select count(*) from default.x.data_files
PREHOOK: type: QUERY
PREHOOK: Input: default@x
#### A masked pattern was here ####
POSTHOOK: query: select count(*) from default.x.data_files
POSTHOOK: type: QUERY
POSTHOOK: Input: default@x
#### A masked pattern was here ####
2
PREHOOK: query: alter table x compact 'major' and wait
PREHOOK: type: ALTERTABLE_COMPACT
PREHOOK: Input: default@x
PREHOOK: Output: default@x
POSTHOOK: query: alter table x compact 'major' and wait
POSTHOOK: type: ALTERTABLE_COMPACT
POSTHOOK: Input: default@x
POSTHOOK: Output: default@x
PREHOOK: query: show compactions
PREHOOK: type: SHOW COMPACTIONS
POSTHOOK: query: show compactions
POSTHOOK: type: SHOW COMPACTIONS
CompactionId Database Table Partition Type State Worker host Worker Enqueue Time Start Time Duration(ms) HadoopJobId Error message Initiator host Initiator Pool name TxnId Next TxnId Commit Time Highest WriteId
1 default x --- MAJOR succeeded #Masked# manual default 0 0 0 ---
PREHOOK: query: desc formatted x
PREHOOK: type: DESCTABLE
PREHOOK: Input: default@x
POSTHOOK: query: desc formatted x
POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@x
# col_name data_type comment
name string
age int
num_clicks bigint

# Detailed Table Information
Database: default
#### A masked pattern was here ####
Retention: 0
#### A masked pattern was here ####
Table Type: EXTERNAL_TABLE
Table Parameters:
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"age\":\"true\",\"name\":\"true\",\"num_clicks\":\"true\"}}
EXTERNAL TRUE
bucketing_version 2
current-schema {\"type\":\"struct\",\"schema-id\":0,\"fields\":[{\"id\":1,\"name\":\"name\",\"required\":false,\"type\":\"string\"},{\"id\":2,\"name\":\"age\",\"required\":false,\"type\":\"int\"},{\"id\":3,\"name\":\"num_clicks\",\"required\":false,\"type\":\"long\"}]}
current-snapshot-id #Masked#
current-snapshot-summary {\"replace-partitions\":\"true\",\"added-data-files\":\"1\",\"added-records\":\"7\",\"added-files-size\":\"#Masked#\",\"changed-partition-count\":\"1\",\"total-records\":\"7\",\"total-files-size\":\"#Masked#\",\"total-data-files\":\"1\",\"total-delete-files\":\"0\",\"total-position-deletes\":\"0\",\"total-equality-deletes\":\"0\"}
current-snapshot-timestamp-ms #Masked#
external.table.purge true
format-version 2
iceberg.orc.files.only true
#### A masked pattern was here ####
numFiles 1
numRows 7
parquet.compression zstd
#### A masked pattern was here ####
rawDataSize 0
serialization.format 1
snapshot-count 4
storage_handler org.apache.iceberg.mr.hive.HiveIcebergStorageHandler
table_type ICEBERG
totalSize #Masked#
#### A masked pattern was here ####
uuid #Masked#
write.delete.mode merge-on-read
write.format.default orc
write.merge.mode merge-on-read
write.update.mode merge-on-read

# Storage Information
SerDe Library: org.apache.iceberg.mr.hive.HiveIcebergSerDe
InputFormat: org.apache.iceberg.mr.hive.HiveIcebergInputFormat
OutputFormat: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat
Compressed: No
Sort Columns: []
PREHOOK: query: select * from default.x.data_files
PREHOOK: type: QUERY
PREHOOK: Input: default@x
#### A masked pattern was here ####
POSTHOOK: query: select * from default.x.data_files
POSTHOOK: type: QUERY
POSTHOOK: Input: default@x
#### A masked pattern was here ####
PREHOOK: query: select count(*) from default.x.data_files
PREHOOK: type: QUERY
PREHOOK: Input: default@x
#### A masked pattern was here ####
POSTHOOK: query: select count(*) from default.x.data_files
POSTHOOK: type: QUERY
POSTHOOK: Input: default@x
#### A masked pattern was here ####
1
1 change: 1 addition & 0 deletions itests/src/test/resources/testconfiguration.properties
@@ -421,6 +421,7 @@ iceberg.llap.query.files=\
 iceberg.llap.query.compactor.files=\
   iceberg_major_compaction_partition_evolution.q,\
   iceberg_major_compaction_partitioned.q,\
+  iceberg_major_compaction_query_metadata.q,\
   iceberg_major_compaction_schema_evolution.q,\
   iceberg_major_compaction_unpartitioned.q,\
   iceberg_optimize_table_unpartitioned.q
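Adding iceberg_major_compaction_query_metadata.q to iceberg.llap.query.compactor.files runs the new qtest under the LLAP compactor test driver, alphabetically alongside the other Iceberg major-compaction tests.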