Skip to content

Commit e380000

Browse files
yujun777 and Copilot committed
[improvement](statistics) Collect hot value during full statistics analysis
### What problem does this PR solve?

Issue Number: close #xxx

Problem Summary:

Previously, full statistics collection (ANALYZE TABLE ... WITH SYNC) hardcoded `null as hot_value` in FULL_ANALYZE_TEMPLATE, meaning hot values (frequent values) were only collected during sample-based analysis. This made full analysis produce less complete statistics than sample analysis.

This change rewrites FULL_ANALYZE_TEMPLATE to use a CTE-based structure (matching the LINEAR_ANALYZE_TEMPLATE and DUJ1_ANALYZE_TEMPLATE patterns) that computes hot values via GROUP BY + TOP-N aggregation. Both OlapAnalysisTask and ExternalAnalysisTask are updated to pass the required parameters (hotValueCollectCount, subStringColName, rowCount2) to the template.

### Release note

Full statistics collection now also collects hot value (frequent value) information, matching the behavior of sample-based collection.

### Check List (For Author)

- Test: Regression test (test_hot_value, test_full_analyze_hot_value) / Unit Test (OlapAnalysisTaskTest, HMSAnalysisTaskTest)
- Behavior changed: Yes - full analyze now produces hot_value instead of null
- Does this need documentation: No

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 87d592e commit e380000

7 files changed

Lines changed: 373 additions & 19 deletions

File tree

fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,11 @@ public abstract class BaseAnalysisTask {
6565
public static final long LIMIT_SIZE = 1024 * 1024 * 1024; // 1GB
6666
public static final double LIMIT_FACTOR = 1.2;
6767

68-
protected static final String FULL_ANALYZE_TEMPLATE =
69-
"SELECT CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS `id`, "
68+
protected static final String FULL_ANALYZE_TEMPLATE = "WITH cte1 AS ("
69+
+ "SELECT `${colName}` "
70+
+ "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index}), "
71+
+ "cte2 AS ("
72+
+ "SELECT CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS `id`, "
7073
+ "${catalogId} AS `catalog_id`, "
7174
+ "${dbId} AS `db_id`, "
7275
+ "${tblId} AS `tbl_id`, "
@@ -79,9 +82,20 @@ public abstract class BaseAnalysisTask {
7982
+ "SUBSTRING(CAST(MIN(`${colName}`) AS STRING), 1, 1024) AS `min`, "
8083
+ "SUBSTRING(CAST(MAX(`${colName}`) AS STRING), 1, 1024) AS `max`, "
8184
+ "${dataSizeFunction} AS `data_size`, "
82-
+ "NOW() AS `update_time`, "
83-
+ "null as `hot_value` "
84-
+ "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index}";
85+
+ "NOW() "
86+
+ "FROM cte1), "
87+
+ "cte3 AS ("
88+
+ "SELECT GROUP_CONCAT(CONCAT("
89+
+ "REPLACE(REPLACE(t.`column_key`, \":\", \"\\\\:\"), \";\", \"\\\\;\"), "
90+
+ "\" :\", ROUND(t.`count` / ${rowCount2}, 2)), \" ;\") "
91+
+ "as `hot_value` "
92+
+ "FROM ("
93+
+ "SELECT ${subStringColName} as `hash_value`, "
94+
+ "MAX(`${colName}`) as `column_key`, "
95+
+ "COUNT(1) AS `count` "
96+
+ "FROM cte1 WHERE `${colName}` IS NOT NULL "
97+
+ "GROUP BY `hash_value` ORDER BY `count` DESC LIMIT ${hotValueCollectCount}) t) "
98+
+ "SELECT * FROM cte2 CROSS JOIN cte3";
8599

86100
protected static final String LINEAR_ANALYZE_TEMPLATE = "WITH cte1 AS ("
87101
+ "SELECT `${colName}` "

fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import org.apache.doris.common.FeConstants;
2222
import org.apache.doris.common.NotImplementedException;
2323
import org.apache.doris.datasource.ExternalTable;
24+
import org.apache.doris.qe.SessionVariable;
2425

2526
import org.apache.commons.text.StringSubstitutor;
2627

@@ -59,9 +60,10 @@ protected void deleteNotExistPartitionStats(AnalysisInfo jobInfo) throws DdlExce
5960
protected void doFull() throws Exception {
6061
StringBuilder sb = new StringBuilder();
6162
Map<String, String> params = buildSqlParams();
62-
params.put("min", getMinFunction());
63-
params.put("max", getMaxFunction());
6463
params.put("dataSizeFunction", getDataSizeFunction(col, false));
64+
params.put("hotValueCollectCount", String.valueOf(SessionVariable.getHotValueCollectCount()));
65+
params.put("subStringColName", getStringTypeColName(col));
66+
params.put("rowCount2", "(SELECT COUNT(1) FROM cte1 WHERE `${colName}` IS NOT NULL)");
6567
if (LOG.isDebugEnabled()) {
6668
LOG.debug("Will do full collection for column {}", col.getName());
6769
}

fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -328,7 +328,11 @@ protected void doFull() throws Exception {
328328
if (StatisticsUtil.enablePartitionAnalyze() && tbl.isPartitionedTable()) {
329329
doPartitionTable();
330330
} else {
331-
StringSubstitutor stringSubstitutor = new StringSubstitutor(buildSqlParams());
331+
Map<String, String> params = buildSqlParams();
332+
params.put("hotValueCollectCount", String.valueOf(SessionVariable.getHotValueCollectCount()));
333+
params.put("subStringColName", getStringTypeColName(col));
334+
params.put("rowCount2", "(SELECT COUNT(1) FROM cte1 WHERE `${colName}` IS NOT NULL)");
335+
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
332336
runQuery(stringSubstitutor.replace(FULL_ANALYZE_TEMPLATE));
333337
}
334338
}

fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import org.apache.doris.common.Pair;
2525
import org.apache.doris.datasource.CatalogIf;
2626
import org.apache.doris.datasource.hive.HMSExternalTable;
27+
import org.apache.doris.qe.SessionVariable;
2728
import org.apache.doris.statistics.util.StatisticsUtil;
2829

2930
import com.google.common.collect.ImmutableList;
@@ -198,18 +199,35 @@ public Set<String> getPartitionNames() {
198199
}
199200
};
200201

202+
new MockUp<SessionVariable>() {
203+
@Mock
204+
public int getHotValueCollectCount() {
205+
return 10;
206+
}
207+
};
208+
201209
new MockUp<HMSAnalysisTask>() {
202210
@Mock
203211
public void runQuery(String sql) {
204-
Assertions.assertEquals("SELECT CONCAT(30001, '-', -1, '-', 'hour') AS `id`, "
212+
Assertions.assertEquals("WITH cte1 AS (SELECT `hour` "
213+
+ "FROM `hms`.`default`.`test` ), "
214+
+ "cte2 AS (SELECT CONCAT(30001, '-', -1, '-', 'hour') AS `id`, "
205215
+ "10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, "
206216
+ "-1 AS `idx_id`, 'hour' AS `col_id`, NULL AS `part_id`, "
207217
+ "COUNT(1) AS `row_count`, NDV(`hour`) AS `ndv`, "
208218
+ "COUNT(1) - COUNT(`hour`) AS `null_count`, "
209219
+ "SUBSTRING(CAST(MIN(`hour`) AS STRING), 1, 1024) AS `min`, "
210220
+ "SUBSTRING(CAST(MAX(`hour`) AS STRING), 1, 1024) AS `max`, "
211-
+ "COUNT(1) * 4 AS `data_size`, NOW() AS `update_time`, "
212-
+ "null as `hot_value` FROM `hms`.`default`.`test` ", sql);
221+
+ "COUNT(1) * 4 AS `data_size`, NOW() FROM cte1), "
222+
+ "cte3 AS (SELECT GROUP_CONCAT(CONCAT("
223+
+ "REPLACE(REPLACE(t.`column_key`, \":\", \"\\\\:\"), \";\", \"\\\\;\"), "
224+
+ "\" :\", ROUND(t.`count` / "
225+
+ "(SELECT COUNT(1) FROM cte1 WHERE `hour` IS NOT NULL), 2)), \" ;\") "
226+
+ "as `hot_value` FROM (SELECT `hour` as `hash_value`, "
227+
+ "MAX(`hour`) as `column_key`, COUNT(1) AS `count` "
228+
+ "FROM cte1 WHERE `hour` IS NOT NULL "
229+
+ "GROUP BY `hash_value` ORDER BY `count` DESC LIMIT 10) t) "
230+
+ "SELECT * FROM cte2 CROSS JOIN cte3", sql);
213231
}
214232
};
215233

fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
import org.apache.doris.common.FeConstants;
4141
import org.apache.doris.common.Pair;
4242
import org.apache.doris.datasource.CatalogIf;
43+
import org.apache.doris.qe.SessionVariable;
4344
import org.apache.doris.statistics.AnalysisInfo.AnalysisMethod;
4445
import org.apache.doris.statistics.AnalysisInfo.JobType;
4546
import org.apache.doris.statistics.util.StatisticsUtil;
@@ -754,4 +755,150 @@ public void testMergePartitionSql() {
754755
+ "WHERE `catalog_id` = 0 AND `db_id` = 1 AND `tbl_id` = 2 AND `idx_id` = 3 AND `col_id` = 'col1'",
755756
sql);
756757
}
758+
759+
@Test
760+
public void testFullAnalyzeTemplateSql() {
761+
Map<String, String> params = new HashMap<>();
762+
params.put("catalogId", "0");
763+
params.put("dbId", "1");
764+
params.put("tblId", "2");
765+
params.put("idxId", "3");
766+
params.put("colId", "col1");
767+
params.put("colName", "col1");
768+
params.put("dataSizeFunction", "SUM(LENGTH(`col1`))");
769+
params.put("catalogName", "internal");
770+
params.put("dbName", "db1");
771+
params.put("tblName", "tbl1");
772+
params.put("index", "");
773+
params.put("hotValueCollectCount", "10");
774+
params.put("subStringColName", "`col1`");
775+
params.put("rowCount2", "(SELECT COUNT(1) FROM cte1 WHERE `col1` IS NOT NULL)");
776+
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
777+
String sql = stringSubstitutor.replace(BaseAnalysisTask.FULL_ANALYZE_TEMPLATE);
778+
Assertions.assertTrue(sql.startsWith("WITH cte1 AS ("));
779+
Assertions.assertTrue(sql.contains("cte3 AS (SELECT GROUP_CONCAT"));
780+
Assertions.assertTrue(sql.contains("as `hot_value`"));
781+
Assertions.assertTrue(sql.contains("CROSS JOIN cte3"));
782+
Assertions.assertTrue(sql.contains("LIMIT 10"));
783+
Assertions.assertTrue(sql.contains("GROUP BY `hash_value` ORDER BY `count` DESC"));
784+
Assertions.assertFalse(sql.contains("null as `hot_value`"));
785+
}
786+
787+
@Test
788+
public void testDoFullHotValue(@Mocked CatalogIf catalogIf, @Mocked DatabaseIf databaseIf,
789+
@Mocked OlapTable tableIf) throws Exception {
790+
791+
new Expectations() {
792+
{
793+
tableIf.getId();
794+
result = 30001;
795+
tableIf.getName();
796+
result = "testTbl";
797+
catalogIf.getId();
798+
result = 10001;
799+
catalogIf.getName();
800+
result = "catalogName";
801+
databaseIf.getId();
802+
result = 20001;
803+
databaseIf.getFullName();
804+
result = "testDb";
805+
}
806+
};
807+
808+
new MockUp<StatisticsUtil>() {
809+
@Mock
810+
public boolean enablePartitionAnalyze() {
811+
return false;
812+
}
813+
};
814+
815+
new MockUp<SessionVariable>() {
816+
@Mock
817+
public int getHotValueCollectCount() {
818+
return 10;
819+
}
820+
};
821+
822+
new MockUp<OlapAnalysisTask>() {
823+
@Mock
824+
public void runQuery(String sql) {
825+
Assertions.assertTrue(sql.startsWith("WITH cte1 AS (SELECT `testCol` "
826+
+ "FROM `catalogName`.`testDb`.`testTbl` "), sql);
827+
Assertions.assertTrue(sql.contains("cte3 AS (SELECT GROUP_CONCAT"), sql);
828+
Assertions.assertTrue(sql.contains("`testCol` as `hash_value`"), sql);
829+
Assertions.assertTrue(sql.contains("LIMIT 10"), sql);
830+
Assertions.assertTrue(sql.contains("CROSS JOIN cte3"), sql);
831+
Assertions.assertFalse(sql.contains("null as `hot_value`"), sql);
832+
}
833+
};
834+
835+
OlapAnalysisTask task = new OlapAnalysisTask();
836+
task.col = new Column("testCol", Type.fromPrimitiveType(PrimitiveType.INT),
837+
true, null, null, null);
838+
task.tbl = tableIf;
839+
AnalysisInfoBuilder builder = new AnalysisInfoBuilder();
840+
builder.setJobType(AnalysisInfo.JobType.MANUAL);
841+
builder.setColName("testCol");
842+
task.info = builder.build();
843+
task.catalog = catalogIf;
844+
task.db = databaseIf;
845+
task.doFull();
846+
}
847+
848+
@Test
849+
public void testDoFullHotValueStringColumn(@Mocked CatalogIf catalogIf, @Mocked DatabaseIf databaseIf,
850+
@Mocked OlapTable tableIf) throws Exception {
851+
852+
new Expectations() {
853+
{
854+
tableIf.getId();
855+
result = 30001;
856+
tableIf.getName();
857+
result = "testTbl";
858+
catalogIf.getId();
859+
result = 10001;
860+
catalogIf.getName();
861+
result = "catalogName";
862+
databaseIf.getId();
863+
result = 20001;
864+
databaseIf.getFullName();
865+
result = "testDb";
866+
}
867+
};
868+
869+
new MockUp<StatisticsUtil>() {
870+
@Mock
871+
public boolean enablePartitionAnalyze() {
872+
return false;
873+
}
874+
};
875+
876+
new MockUp<SessionVariable>() {
877+
@Mock
878+
public int getHotValueCollectCount() {
879+
return 10;
880+
}
881+
};
882+
883+
new MockUp<OlapAnalysisTask>() {
884+
@Mock
885+
public void runQuery(String sql) {
886+
Assertions.assertTrue(sql.contains(
887+
"xxhash_64(SUBSTRING(CAST(`strCol` AS STRING), 1, 1024)) as `hash_value`"), sql);
888+
Assertions.assertTrue(sql.contains("MAX(`strCol`) as `column_key`"), sql);
889+
}
890+
};
891+
892+
OlapAnalysisTask task = new OlapAnalysisTask();
893+
task.col = new Column("strCol", Type.fromPrimitiveType(PrimitiveType.STRING),
894+
true, null, null, null);
895+
task.tbl = tableIf;
896+
AnalysisInfoBuilder builder = new AnalysisInfoBuilder();
897+
builder.setJobType(AnalysisInfo.JobType.MANUAL);
898+
builder.setColName("strCol");
899+
task.info = builder.build();
900+
task.catalog = catalogIf;
901+
task.db = databaseIf;
902+
task.doFull();
903+
}
757904
}

0 commit comments

Comments
 (0)