diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java index 71bd7feeea6e24..620dbd5d37820b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java @@ -65,8 +65,11 @@ public abstract class BaseAnalysisTask { public static final long LIMIT_SIZE = 1024 * 1024 * 1024; // 1GB public static final double LIMIT_FACTOR = 1.2; - protected static final String FULL_ANALYZE_TEMPLATE = - "SELECT CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS `id`, " + protected static final String FULL_ANALYZE_TEMPLATE = "WITH cte1 AS (" + + "SELECT `${colName}` " + + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index}), " + + "cte2 AS (" + + "SELECT CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS `id`, " + "${catalogId} AS `catalog_id`, " + "${dbId} AS `db_id`, " + "${tblId} AS `tbl_id`, " @@ -79,9 +82,20 @@ public abstract class BaseAnalysisTask { + "SUBSTRING(CAST(MIN(`${colName}`) AS STRING), 1, 1024) AS `min`, " + "SUBSTRING(CAST(MAX(`${colName}`) AS STRING), 1, 1024) AS `max`, " + "${dataSizeFunction} AS `data_size`, " - + "NOW() AS `update_time`, " - + "null as `hot_value` " - + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index}"; + + "NOW() " + + "FROM cte1), " + + "cte3 AS (" + + "SELECT GROUP_CONCAT(CONCAT(" + + "REPLACE(REPLACE(t.`column_key`, \":\", \"\\\\:\"), \";\", \"\\\\;\"), " + + "\" :\", ROUND(t.`count` / ${rowCount2}, 2)), \" ;\") " + + "as `hot_value` " + + "FROM (" + + "SELECT ${subStringColName} as `hash_value`, " + + "MAX(`${colName}`) as `column_key`, " + + "COUNT(1) AS `count` " + + "FROM cte1 WHERE `${colName}` IS NOT NULL " + + "GROUP BY `hash_value` ORDER BY `count` DESC LIMIT ${hotValueCollectCount}) t) " + + "SELECT * FROM cte2 CROSS JOIN cte3"; protected static final String LINEAR_ANALYZE_TEMPLATE = "WITH cte1 AS (" + "SELECT `${colName}` " diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java index 72beb343956253..afeee0bcffe3c7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java @@ -21,6 +21,7 @@ import org.apache.doris.common.FeConstants; import org.apache.doris.common.NotImplementedException; import org.apache.doris.datasource.ExternalTable; +import org.apache.doris.qe.SessionVariable; import org.apache.commons.text.StringSubstitutor; @@ -59,9 +60,10 @@ protected void deleteNotExistPartitionStats(AnalysisInfo jobInfo) throws DdlExce protected void doFull() throws Exception { StringBuilder sb = new StringBuilder(); Map params = buildSqlParams(); - params.put("min", getMinFunction()); - params.put("max", getMaxFunction()); params.put("dataSizeFunction", getDataSizeFunction(col, false)); + params.put("hotValueCollectCount", String.valueOf(SessionVariable.getHotValueCollectCount())); + params.put("subStringColName", getStringTypeColName(col)); + params.put("rowCount2", "(SELECT COUNT(1) FROM cte1 WHERE `${colName}` IS NOT NULL)"); if (LOG.isDebugEnabled()) { LOG.debug("Will do full collection for column {}", col.getName()); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java index 187eff6d40a147..a79e410e973e9a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java @@ -328,7 +328,11 @@ protected void doFull() throws Exception { if (StatisticsUtil.enablePartitionAnalyze() && tbl.isPartitionedTable()) { doPartitionTable(); } else { - StringSubstitutor stringSubstitutor = new StringSubstitutor(buildSqlParams()); + Map params = buildSqlParams(); + params.put("hotValueCollectCount", String.valueOf(SessionVariable.getHotValueCollectCount())); + params.put("subStringColName", getStringTypeColName(col)); + params.put("rowCount2", "(SELECT COUNT(1) FROM cte1 WHERE `${colName}` IS NOT NULL)"); + StringSubstitutor stringSubstitutor = new StringSubstitutor(params); runQuery(stringSubstitutor.replace(FULL_ANALYZE_TEMPLATE)); } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java index e870b0b3bfd4e2..d34d32be46800a 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java @@ -24,6 +24,7 @@ import org.apache.doris.common.Pair; import org.apache.doris.datasource.CatalogIf; import org.apache.doris.datasource.hive.HMSExternalTable; +import org.apache.doris.qe.SessionVariable; import org.apache.doris.statistics.util.StatisticsUtil; import com.google.common.collect.ImmutableList; @@ -198,18 +199,35 @@ public Set getPartitionNames() { } }; + new MockUp() { + @Mock + public int getHotValueCollectCount() { + return 10; + } + }; + new MockUp() { @Mock public void runQuery(String sql) { - Assertions.assertEquals("SELECT CONCAT(30001, '-', -1, '-', 'hour') AS `id`, " + Assertions.assertEquals("WITH cte1 AS (SELECT `hour` " + + "FROM `hms`.`default`.`test` ), " + + "cte2 AS (SELECT CONCAT(30001, '-', -1, '-', 'hour') AS `id`, " + "10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, " + "-1 AS `idx_id`, 'hour' AS `col_id`, NULL AS `part_id`, " + "COUNT(1) AS `row_count`, NDV(`hour`) AS `ndv`, " + "COUNT(1) - COUNT(`hour`) AS `null_count`, " + "SUBSTRING(CAST(MIN(`hour`) AS STRING), 1, 1024) AS `min`, " + "SUBSTRING(CAST(MAX(`hour`) AS STRING), 1, 1024) AS `max`, " - + "COUNT(1) * 4 AS `data_size`, NOW() AS `update_time`, " - + "null as `hot_value` FROM `hms`.`default`.`test` ", sql); + + "COUNT(1) * 4 AS `data_size`, NOW() FROM cte1), " + + "cte3 AS (SELECT GROUP_CONCAT(CONCAT(" + + "REPLACE(REPLACE(t.`column_key`, \":\", \"\\\\:\"), \";\", \"\\\\;\"), " + + "\" :\", ROUND(t.`count` / " + + "(SELECT COUNT(1) FROM cte1 WHERE `hour` IS NOT NULL), 2)), \" ;\") " + + "as `hot_value` FROM (SELECT `hour` as `hash_value`, " + + "MAX(`hour`) as `column_key`, COUNT(1) AS `count` " + + "FROM cte1 WHERE `hour` IS NOT NULL " + + "GROUP BY `hash_value` ORDER BY `count` DESC LIMIT 10) t) " + + "SELECT * FROM cte2 CROSS JOIN cte3", sql); } }; diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java index 89a0a67d810f95..f719fbcab6eada 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java @@ -40,6 +40,7 @@ import org.apache.doris.common.FeConstants; import org.apache.doris.common.Pair; import org.apache.doris.datasource.CatalogIf; +import org.apache.doris.qe.SessionVariable; import org.apache.doris.statistics.AnalysisInfo.AnalysisMethod; import org.apache.doris.statistics.AnalysisInfo.JobType; import org.apache.doris.statistics.util.StatisticsUtil; @@ -754,4 +755,150 @@ public void testMergePartitionSql() { + "WHERE `catalog_id` = 0 AND `db_id` = 1 AND `tbl_id` = 2 AND `idx_id` = 3 AND `col_id` = 'col1'", sql); } + + @Test + public void testFullAnalyzeTemplateSql() { + Map params = new HashMap<>(); + params.put("catalogId", "0"); + params.put("dbId", "1"); + params.put("tblId", "2"); + params.put("idxId", "3"); + params.put("colId", "col1"); + params.put("colName", "col1"); + params.put("dataSizeFunction", "SUM(LENGTH(`col1`))"); + params.put("catalogName", "internal"); + params.put("dbName", "db1"); + params.put("tblName", "tbl1"); + params.put("index", ""); + params.put("hotValueCollectCount", "10"); + params.put("subStringColName", "`col1`"); + params.put("rowCount2", "(SELECT COUNT(1) FROM cte1 WHERE `col1` IS NOT NULL)"); + StringSubstitutor stringSubstitutor = new StringSubstitutor(params); + String sql = stringSubstitutor.replace(BaseAnalysisTask.FULL_ANALYZE_TEMPLATE); + Assertions.assertTrue(sql.startsWith("WITH cte1 AS (")); + Assertions.assertTrue(sql.contains("cte3 AS (SELECT GROUP_CONCAT")); + Assertions.assertTrue(sql.contains("as `hot_value`")); + Assertions.assertTrue(sql.contains("CROSS JOIN cte3")); + Assertions.assertTrue(sql.contains("LIMIT 10")); + Assertions.assertTrue(sql.contains("GROUP BY `hash_value` ORDER BY `count` DESC")); + Assertions.assertFalse(sql.contains("null as `hot_value`")); + } + + @Test + public void testDoFullHotValue(@Mocked CatalogIf catalogIf, @Mocked DatabaseIf databaseIf, + @Mocked OlapTable tableIf) throws Exception { + + new Expectations() { + { + tableIf.getId(); + result = 30001; + tableIf.getName(); + result = "testTbl"; + catalogIf.getId(); + result = 10001; + catalogIf.getName(); + result = "catalogName"; + databaseIf.getId(); + result = 20001; + databaseIf.getFullName(); + result = "testDb"; + } + }; + + new MockUp() { + @Mock + public boolean enablePartitionAnalyze() { + return false; + } + }; + + new MockUp() { + @Mock + public int getHotValueCollectCount() { + return 10; + } + }; + + new MockUp() { + @Mock + public void runQuery(String sql) { + Assertions.assertTrue(sql.startsWith("WITH cte1 AS (SELECT `testCol` " + + "FROM `catalogName`.`testDb`.`testTbl` "), sql); + Assertions.assertTrue(sql.contains("cte3 AS (SELECT GROUP_CONCAT"), sql); + Assertions.assertTrue(sql.contains("`testCol` as `hash_value`"), sql); + Assertions.assertTrue(sql.contains("LIMIT 10"), sql); + Assertions.assertTrue(sql.contains("CROSS JOIN cte3"), sql); + Assertions.assertFalse(sql.contains("null as `hot_value`"), sql); + } + }; + + OlapAnalysisTask task = new OlapAnalysisTask(); + task.col = new Column("testCol", Type.fromPrimitiveType(PrimitiveType.INT), + true, null, null, null); + task.tbl = tableIf; + AnalysisInfoBuilder builder = new AnalysisInfoBuilder(); + builder.setJobType(AnalysisInfo.JobType.MANUAL); + builder.setColName("testCol"); + task.info = builder.build(); + task.catalog = catalogIf; + task.db = databaseIf; + task.doFull(); + } + + @Test + public void testDoFullHotValueStringColumn(@Mocked CatalogIf catalogIf, @Mocked DatabaseIf databaseIf, + @Mocked OlapTable tableIf) throws Exception { + + new Expectations() { + { + tableIf.getId(); + result = 30001; + tableIf.getName(); + result = "testTbl"; + catalogIf.getId(); + result = 10001; + catalogIf.getName(); + result = "catalogName"; + databaseIf.getId(); + result = 20001; + databaseIf.getFullName(); + result = "testDb"; + } + }; + + new MockUp() { + @Mock + public boolean enablePartitionAnalyze() { + return false; + } + }; + + new MockUp() { + @Mock + public int getHotValueCollectCount() { + return 10; + } + }; + + new MockUp() { + @Mock + public void runQuery(String sql) { + Assertions.assertTrue(sql.contains( + "xxhash_64(SUBSTRING(CAST(`strCol` AS STRING), 1, 1024)) as `hash_value`"), sql); + Assertions.assertTrue(sql.contains("MAX(`strCol`) as `column_key`"), sql); + } + }; + + OlapAnalysisTask task = new OlapAnalysisTask(); + task.col = new Column("strCol", Type.fromPrimitiveType(PrimitiveType.STRING), + true, null, null, null); + task.tbl = tableIf; + AnalysisInfoBuilder builder = new AnalysisInfoBuilder(); + builder.setJobType(AnalysisInfo.JobType.MANUAL); + builder.setColName("strCol"); + task.info = builder.build(); + task.catalog = catalogIf; + task.db = databaseIf; + task.doFull(); + } } diff --git a/regression-test/suites/statistics/test_full_analyze_hot_value.groovy b/regression-test/suites/statistics/test_full_analyze_hot_value.groovy new file mode 100644 index 00000000000000..5d7463df38f0d0 --- /dev/null +++ b/regression-test/suites/statistics/test_full_analyze_hot_value.groovy @@ -0,0 +1,166 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_full_analyze_hot_value") { + + sql """drop database if exists test_full_analyze_hot_value""" + sql """create database test_full_analyze_hot_value""" + sql """use test_full_analyze_hot_value""" + sql """set global enable_auto_analyze=false""" + + // Test 1: Full analyze collects hot_value for varchar column with skewed distribution + sql """drop table if exists full_hot_skew""" + sql """CREATE TABLE full_hot_skew ( + key1 int NULL, + value1 varchar(25) NULL + )ENGINE=OLAP + DUPLICATE KEY(`key1`) + COMMENT "OLAP" + DISTRIBUTED BY HASH(`key1`) BUCKETS 2 + PROPERTIES ( + "replication_num" = "1" + ) + """ + // Insert 100 rows: value1 has 2 values, "0" and "1", each appearing 50 times + sql """insert into full_hot_skew select number, number % 2 from numbers("number"="100")""" + + sql """analyze table full_hot_skew with sync""" + def result = sql """show column stats full_hot_skew(value1)""" + logger.info("Test1 result: " + result) + assertEquals(1, result.size()) + assertEquals("100.0", result[0][2]) + // Full analyze should now collect hot_value + assertTrue(result[0][17] != "null", "Full analyze should collect hot_value, but got null") + String[] hotValues = result[0][17].split(";") + assertEquals(2, hotValues.length) + assertTrue(hotValues[0].trim() == "'1':0.5" || hotValues[0].trim() == "'0':0.5") + assertTrue(hotValues[1].trim() == "'1':0.5" || hotValues[1].trim() == "'0':0.5") + + // Verify cached stats also have hot_value + result = sql """show column cached stats full_hot_skew(value1)""" + logger.info("Test1 cached result: " + result) + assertEquals(1, result.size()) + hotValues = result[0][17].split(";") + assertEquals(2, hotValues.length) + assertTrue(hotValues[0].trim() == "'1':0.5" || hotValues[0].trim() == "'0':0.5") + assertTrue(hotValues[1].trim() == "'1':0.5" || hotValues[1].trim() == "'0':0.5") + + // Test 2: Full analyze collects hot_value for int column + result = sql """show column stats full_hot_skew(key1)""" + logger.info("Test2 result: " + result) + assertEquals(1, result.size()) + assertEquals("100.0", result[0][2]) + // key1 has 100 unique values, top 10 will each have proportion 0.01 -> ROUND to 0.01 + assertTrue(result[0][17] != "null", "Full analyze should collect hot_value for int column") + + // Test 3: Full analyze with special characters in values + sql """drop table if exists full_hot_special""" + sql """CREATE TABLE full_hot_special ( + key1 int NULL, + value1 varchar(50) NULL + )ENGINE=OLAP + DUPLICATE KEY(`key1`) + COMMENT "OLAP" + DISTRIBUTED BY HASH(`key1`) BUCKETS 2 + PROPERTIES ( + "replication_num" = "1" + ) + """ + sql """insert into full_hot_special select number, " : ;a" from numbers("number"="100")""" + + sql """analyze table full_hot_special with sync""" + result = sql """show column stats full_hot_special(value1)""" + logger.info("Test3 result: " + result) + assertEquals(1, result.size()) + // All 100 rows have the same value " : ;a", so it should appear with ratio 1.0 + assertEquals("' : ;a':1.0", result[0][17]) + + // Test 4: Full analyze then sample analyze, hot_value should be updated + sql """drop stats full_hot_skew""" + sql """analyze table full_hot_skew with sample rows 400 with sync""" + result = sql """show column stats full_hot_skew(value1)""" + logger.info("Test4 result: " + result) + assertEquals(1, result.size()) + assertTrue(result[0][17] != "null", "Sample analyze should also collect hot_value") + hotValues = result[0][17].split(";") + assertEquals(2, hotValues.length) + assertTrue(hotValues[0].trim() == "'1':0.5" || hotValues[0].trim() == "'0':0.5") + assertTrue(hotValues[1].trim() == "'1':0.5" || hotValues[1].trim() == "'0':0.5") + + // Test 5: Verify full analyze produces same hot_value as sample analyze for same data + sql """drop stats full_hot_skew""" + sql """analyze table full_hot_skew with sync""" + def fullResult = sql """show column stats full_hot_skew(value1)""" + logger.info("Test5 full result: " + fullResult) + assertEquals(1, fullResult.size()) + assertTrue(fullResult[0][17] != "null") + def fullParts = fullResult[0][17].split(";").collect { it.trim() }.sort() + + sql """drop stats full_hot_skew""" + sql """analyze table full_hot_skew with sample rows 40000 with sync""" + def sampleResult = sql """show column stats full_hot_skew(value1)""" + logger.info("Test5 sample result: " + sampleResult) + assertEquals(1, sampleResult.size()) + assertTrue(sampleResult[0][17] != "null") + // Both full and sample should produce the same hot_value entries (order may differ) + def sampleParts = sampleResult[0][17].split(";").collect { it.trim() }.sort() + assertEquals(fullParts, sampleParts) + + // Test 6: Full analyze on empty table should produce null hot_value + sql """drop table if exists full_hot_empty""" + sql """CREATE TABLE full_hot_empty ( + key1 int NULL, + value1 varchar(25) NULL + )ENGINE=OLAP + DUPLICATE KEY(`key1`) + COMMENT "OLAP" + DISTRIBUTED BY HASH(`key1`) BUCKETS 2 + PROPERTIES ( + "replication_num" = "1" + ) + """ + sql """analyze table full_hot_empty with sync""" + result = sql """show column stats full_hot_empty(value1)""" + logger.info("Test6 empty table result: " + result) + assertEquals(1, result.size()) + assertEquals("0.0", result[0][2]) + assertEquals("null", result[0][17]) + + // Test 7: Full analyze on all-NULL column should produce null hot_value + sql """drop table if exists full_hot_all_null""" + sql """CREATE TABLE full_hot_all_null ( + key1 int NULL, + value1 varchar(25) NULL + )ENGINE=OLAP + DUPLICATE KEY(`key1`) + COMMENT "OLAP" + DISTRIBUTED BY HASH(`key1`) BUCKETS 2 + PROPERTIES ( + "replication_num" = "1" + ) + """ + sql """insert into full_hot_all_null select number, null from numbers("number"="100")""" + sql """analyze table full_hot_all_null with sync""" + result = sql """show column stats full_hot_all_null(value1)""" + logger.info("Test7 all-null result: " + result) + assertEquals(1, result.size()) + assertEquals("100.0", result[0][2]) + assertEquals("100.0", result[0][4]) + assertEquals("null", result[0][17]) + + sql """drop database if exists test_full_analyze_hot_value""" +} diff --git a/regression-test/suites/statistics/test_hot_value.groovy b/regression-test/suites/statistics/test_hot_value.groovy index c4021f17555eba..77f0cc89425441 100644 --- a/regression-test/suites/statistics/test_hot_value.groovy +++ b/regression-test/suites/statistics/test_hot_value.groovy @@ -81,28 +81,31 @@ suite("test_hot_value") { wait_row_count_reported("test_hot_value", "test1", 0, 4, "10000") wait_row_count_reported("test_hot_value", "test2", 0, 4, "10000") sql """analyze table test1 with sync""" - logger.info("1. memo plan ") - explain { - sql("memo plan select * from test1") - contains "hotValues=(null)" - } def result = sql """show column stats test1(key1)""" assertEquals(1, result.size()) assertEquals("10000.0", result[0][2]) - assertEquals("null", result[0][17]) + assertTrue(result[0][17] != "null") result = sql """show column stats test1(value1)""" logger.info("0. result " + result) assertEquals(1, result.size()) assertEquals("10000.0", result[0][2]) - assertEquals("null", result[0][17]) + String[] fullHotValues = result[0][17].split(";") + logger.info("0.1 fullHotValues " + result[0][17]) + assertEquals(2, fullHotValues.length) + assertTrue(fullHotValues[0].trim() == "'1':0.5" || fullHotValues[0].trim() == "'0':0.5") + assertTrue(fullHotValues[1].trim() == "'1':0.5" || fullHotValues[1].trim() == "'0':0.5") result = sql """show column cached stats test1(key1)""" assertEquals(1, result.size()) assertEquals("10000.0", result[0][2]) + // cached stats filter hot values by threshold; key1 with 10000 unique values has ratio~0, so cached hot_value is null assertEquals("null", result[0][17]) result = sql """show column cached stats test1(value1)""" assertEquals(1, result.size()) assertEquals("10000.0", result[0][2]) - assertEquals("null", result[0][17]) + fullHotValues = result[0][17].split(";") + assertEquals(2, fullHotValues.length) + assertTrue(fullHotValues[0].trim() == "'1':0.5" || fullHotValues[0].trim() == "'0':0.5") + assertTrue(fullHotValues[1].trim() == "'1':0.5" || fullHotValues[1].trim() == "'0':0.5") sql """drop stats test1""" sql """analyze table test1 with sample rows 400 with sync""" result = sql """show column stats test1(key1)"""