From 9928754f917b73b4c025628b6c844a0c3a465470 Mon Sep 17 00:00:00 2001 From: Dmitriy Fingerman Date: Wed, 22 May 2024 04:10:46 -0400 Subject: [PATCH] HIVE-28266: Iceberg: Select count(*) from data_files metadata tables gives wrong result (Dmitriy Fingerman, reviewed by Butao Zhang, Denys Kuzmenko) Closes #5253 --- .../mr/hive/HiveIcebergStorageHandler.java | 6 +- .../iceberg/mr/hive/IcebergAcidUtil.java | 1 + .../iceberg/mr/hive/IcebergTableUtil.java | 2 +- .../iceberg_major_compaction_query_metadata.q | 44 ++++++ ...berg_major_compaction_query_metadata.q.out | 142 ++++++++++++++++++ .../resources/testconfiguration.properties | 1 + 6 files changed, 192 insertions(+), 4 deletions(-) create mode 100644 iceberg/iceberg-handler/src/test/queries/positive/iceberg_major_compaction_query_metadata.q create mode 100644 iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_major_compaction_query_metadata.q.out diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java index 8c00b909115a..37d44708641f 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java @@ -586,7 +586,7 @@ private ColumnStatistics readColStats(Table table, Path statsPath) { @Override public boolean canComputeQueryUsingStats(org.apache.hadoop.hive.ql.metadata.Table hmsTable) { - if (getStatsSource().equals(HiveMetaHook.ICEBERG)) { + if (getStatsSource().equals(HiveMetaHook.ICEBERG) && hmsTable.getMetaTable() == null) { Table table = getTable(hmsTable); if (table.currentSnapshot() != null) { Map summary = table.currentSnapshot().summary(); @@ -1022,7 +1022,7 @@ public void alterTableSnapshotRefOperation(org.apache.hadoop.hive.ql.metadata.Ta @Override public boolean isValidMetadataTable(String metaTableName) { - return IcebergMetadataTables.isValidMetaTable(metaTableName); + return metaTableName != null && IcebergMetadataTables.isValidMetaTable(metaTableName); } @Override @@ -1502,7 +1502,7 @@ private String collectColumnAndReplaceDummyValues(ExprNodeDesc node, String foun private void fallbackToNonVectorizedModeBasedOnProperties(Properties tableProps) { Schema tableSchema = SchemaParser.fromJson(tableProps.getProperty(InputFormatConfig.TABLE_SCHEMA)); if (FileFormat.AVRO.name().equalsIgnoreCase(tableProps.getProperty(TableProperties.DEFAULT_FILE_FORMAT)) || - (tableProps.containsKey("metaTable") && isValidMetadataTable(tableProps.getProperty("metaTable"))) || + isValidMetadataTable(tableProps.getProperty(IcebergAcidUtil.META_TABLE_PROPERTY)) || hasOrcTimeInSchema(tableProps, tableSchema) || !hasParquetNestedTypeWithinListOrMap(tableProps, tableSchema)) { conf.setBoolean(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED.varname, false); diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergAcidUtil.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergAcidUtil.java index 43195e38846e..b4f9566ad158 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergAcidUtil.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergAcidUtil.java @@ -48,6 +48,7 @@ private IcebergAcidUtil() { private static final Types.NestedField PARTITION_STRUCT_META_COL = null; // placeholder value in the map private static final Map FILE_READ_META_COLS = Maps.newLinkedHashMap(); private static final Map VIRTUAL_COLS_TO_META_COLS = Maps.newLinkedHashMap(); + public static final String META_TABLE_PROPERTY = "metaTable"; static { FILE_READ_META_COLS.put(MetadataColumns.SPEC_ID, 0); diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java index 8f2a16b91a83..1c0835e030ca 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java @@ -101,7 +101,7 @@ static Table getTable(Configuration configuration, org.apache.hadoop.hive.metast * @return an Iceberg table */ static Table getTable(Configuration configuration, Properties properties, boolean skipCache) { - String metaTable = properties.getProperty("metaTable"); + String metaTable = properties.getProperty(IcebergAcidUtil.META_TABLE_PROPERTY); String tableName = properties.getProperty(Catalogs.NAME); String location = properties.getProperty(Catalogs.LOCATION); if (metaTable != null) { diff --git a/iceberg/iceberg-handler/src/test/queries/positive/iceberg_major_compaction_query_metadata.q b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_major_compaction_query_metadata.q new file mode 100644 index 000000000000..f843d7920d98 --- /dev/null +++ b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_major_compaction_query_metadata.q @@ -0,0 +1,44 @@ +-- Mask random uuid +--! qt:replace:/(\s+uuid\s+)\S+(\s*)/$1#Masked#$2/ +-- Mask a random snapshot id +--! qt:replace:/(\s+current-snapshot-id\s+)\S+(\s*)/$1#Masked#/ +-- Mask current-snapshot-timestamp-ms +--! qt:replace:/(\s+current-snapshot-timestamp-ms\s+)\S+(\s*)/$1#Masked#$2/ +-- Mask added file size +--! qt:replace:/(\S\"added-files-size\\\":\\\")(\d+)(\\\")/$1#Masked#$3/ +-- Mask total file size +--! qt:replace:/(\S\"total-files-size\\\":\\\")(\d+)(\\\")/$1#Masked#$3/ +-- Mask show compactions fields that change across runs +--! qt:replace:/(MAJOR\s+succeeded\s+)[a-zA-Z0-9\-\.\s+]+(\s+manual)/$1#Masked#$2/ +--! qt:replace:/(\s+totalSize\s+)\S+(\s+)/$1#Masked#/ + +set hive.llap.io.enabled=true; +set hive.vectorized.execution.enabled=true; +set hive.optimize.shared.work.merge.ts.schema=true; +set iceberg.mr.schema.auto.conversion=true; + +CREATE TABLE x (name VARCHAR(50), age TINYINT, num_clicks BIGINT) +stored by iceberg stored as orc +TBLPROPERTIES ('external.table.purge'='true','format-version'='2'); + +insert into x values +('amy', 35, 123412344), +('adxfvy', 36, 123412534), +('amsdfyy', 37, 123417234), +('asafmy', 38, 123412534); + +insert into x values +('amerqwy', 39, 123441234), +('amyxzcv', 40, 123341234), +('erweramy', 45, 122341234); + +select * from default.x.data_files; +select count(*) from default.x.data_files; + +alter table x compact 'major' and wait; + +show compactions; +desc formatted x; + +select * from default.x.data_files; +select count(*) from default.x.data_files; diff --git a/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_major_compaction_query_metadata.q.out b/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_major_compaction_query_metadata.q.out new file mode 100644 index 000000000000..4e369d322ef7 --- /dev/null +++ b/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_major_compaction_query_metadata.q.out @@ -0,0 +1,142 @@ +PREHOOK: query: CREATE TABLE x (name VARCHAR(50), age TINYINT, num_clicks BIGINT) +stored by iceberg stored as orc +TBLPROPERTIES ('external.table.purge'='true','format-version'='2') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@x +POSTHOOK: query: CREATE TABLE x (name VARCHAR(50), age TINYINT, num_clicks BIGINT) +stored by iceberg stored as orc +TBLPROPERTIES ('external.table.purge'='true','format-version'='2') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@x +PREHOOK: query: insert into x values +('amy', 35, 123412344), +('adxfvy', 36, 123412534), +('amsdfyy', 37, 123417234), +('asafmy', 38, 123412534) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@x +POSTHOOK: query: insert into x values +('amy', 35, 123412344), +('adxfvy', 36, 123412534), +('amsdfyy', 37, 123417234), +('asafmy', 38, 123412534) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@x +PREHOOK: query: insert into x values +('amerqwy', 39, 123441234), +('amyxzcv', 40, 123341234), +('erweramy', 45, 122341234) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@x +POSTHOOK: query: insert into x values +('amerqwy', 39, 123441234), +('amyxzcv', 40, 123341234), +('erweramy', 45, 122341234) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@x +PREHOOK: query: select * from default.x.data_files +PREHOOK: type: QUERY +PREHOOK: Input: default@x +#### A masked pattern was here #### +POSTHOOK: query: select * from default.x.data_files +POSTHOOK: type: QUERY +POSTHOOK: Input: default@x +#### A masked pattern was here #### +PREHOOK: query: select count(*) from default.x.data_files +PREHOOK: type: QUERY +PREHOOK: Input: default@x +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from default.x.data_files +POSTHOOK: type: QUERY +POSTHOOK: Input: default@x +#### A masked pattern was here #### +2 +PREHOOK: query: alter table x compact 'major' and wait +PREHOOK: type: ALTERTABLE_COMPACT +PREHOOK: Input: default@x +PREHOOK: Output: default@x +POSTHOOK: query: alter table x compact 'major' and wait +POSTHOOK: type: ALTERTABLE_COMPACT +POSTHOOK: Input: default@x +POSTHOOK: Output: default@x +PREHOOK: query: show compactions +PREHOOK: type: SHOW COMPACTIONS +POSTHOOK: query: show compactions +POSTHOOK: type: SHOW COMPACTIONS +CompactionId Database Table Partition Type State Worker host Worker Enqueue Time Start Time Duration(ms) HadoopJobId Error message Initiator host Initiator Pool name TxnId Next TxnId Commit Time Highest WriteId +1 default x --- MAJOR succeeded #Masked# manual default 0 0 0 --- +PREHOOK: query: desc formatted x +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@x +POSTHOOK: query: desc formatted x +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@x +# col_name data_type comment +name string +age int +num_clicks bigint + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: EXTERNAL_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"age\":\"true\",\"name\":\"true\",\"num_clicks\":\"true\"}} + EXTERNAL TRUE + bucketing_version 2 + current-schema {\"type\":\"struct\",\"schema-id\":0,\"fields\":[{\"id\":1,\"name\":\"name\",\"required\":false,\"type\":\"string\"},{\"id\":2,\"name\":\"age\",\"required\":false,\"type\":\"int\"},{\"id\":3,\"name\":\"num_clicks\",\"required\":false,\"type\":\"long\"}]} + current-snapshot-id #Masked# + current-snapshot-summary {\"replace-partitions\":\"true\",\"added-data-files\":\"1\",\"added-records\":\"7\",\"added-files-size\":\"#Masked#\",\"changed-partition-count\":\"1\",\"total-records\":\"7\",\"total-files-size\":\"#Masked#\",\"total-data-files\":\"1\",\"total-delete-files\":\"0\",\"total-position-deletes\":\"0\",\"total-equality-deletes\":\"0\"} + current-snapshot-timestamp-ms #Masked# + external.table.purge true + format-version 2 + iceberg.orc.files.only true +#### A masked pattern was here #### + numFiles 1 + numRows 7 + parquet.compression zstd +#### A masked pattern was here #### + rawDataSize 0 + serialization.format 1 + snapshot-count 4 + storage_handler org.apache.iceberg.mr.hive.HiveIcebergStorageHandler + table_type ICEBERG + totalSize #Masked# +#### A masked pattern was here #### + uuid #Masked# + write.delete.mode merge-on-read + write.format.default orc + write.merge.mode merge-on-read + write.update.mode merge-on-read + +# Storage Information +SerDe Library: org.apache.iceberg.mr.hive.HiveIcebergSerDe +InputFormat: org.apache.iceberg.mr.hive.HiveIcebergInputFormat +OutputFormat: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat +Compressed: No +Sort Columns: [] +PREHOOK: query: select * from default.x.data_files +PREHOOK: type: QUERY +PREHOOK: Input: default@x +#### A masked pattern was here #### +POSTHOOK: query: select * from default.x.data_files +POSTHOOK: type: QUERY +POSTHOOK: Input: default@x +#### A masked pattern was here #### +PREHOOK: query: select count(*) from default.x.data_files +PREHOOK: type: QUERY +PREHOOK: Input: default@x +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from default.x.data_files +POSTHOOK: type: QUERY +POSTHOOK: Input: default@x +#### A masked pattern was here #### +1 diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index c50c5a9b77e6..3489810ed547 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -421,6 +421,7 @@ iceberg.llap.query.files=\ iceberg.llap.query.compactor.files=\ iceberg_major_compaction_partition_evolution.q,\ iceberg_major_compaction_partitioned.q,\ + iceberg_major_compaction_query_metadata.q,\ iceberg_major_compaction_schema_evolution.q,\ iceberg_major_compaction_unpartitioned.q,\ iceberg_optimize_table_unpartitioned.q