Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,11 @@ public abstract class BaseAnalysisTask {
public static final long LIMIT_SIZE = 1024 * 1024 * 1024; // 1GB
public static final double LIMIT_FACTOR = 1.2;

protected static final String FULL_ANALYZE_TEMPLATE =
"SELECT CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS `id`, "
protected static final String FULL_ANALYZE_TEMPLATE = "WITH cte1 AS ("
+ "SELECT `${colName}` "
+ "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index}), "
+ "cte2 AS ("
+ "SELECT CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS `id`, "
+ "${catalogId} AS `catalog_id`, "
+ "${dbId} AS `db_id`, "
+ "${tblId} AS `tbl_id`, "
Expand All @@ -79,9 +82,20 @@ public abstract class BaseAnalysisTask {
+ "SUBSTRING(CAST(MIN(`${colName}`) AS STRING), 1, 1024) AS `min`, "
+ "SUBSTRING(CAST(MAX(`${colName}`) AS STRING), 1, 1024) AS `max`, "
+ "${dataSizeFunction} AS `data_size`, "
+ "NOW() AS `update_time`, "
+ "null as `hot_value` "
+ "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index}";
+ "NOW() "
+ "FROM cte1), "
+ "cte3 AS ("
+ "SELECT GROUP_CONCAT(CONCAT("
+ "REPLACE(REPLACE(t.`column_key`, \":\", \"\\\\:\"), \";\", \"\\\\;\"), "
+ "\" :\", ROUND(t.`count` / ${rowCount2}, 2)), \" ;\") "
+ "as `hot_value` "
+ "FROM ("
+ "SELECT ${subStringColName} as `hash_value`, "
+ "MAX(`${colName}`) as `column_key`, "
+ "COUNT(1) AS `count` "
+ "FROM cte1 WHERE `${colName}` IS NOT NULL "
+ "GROUP BY `hash_value` ORDER BY `count` DESC LIMIT ${hotValueCollectCount}) t) "
+ "SELECT * FROM cte2 CROSS JOIN cte3";

protected static final String LINEAR_ANALYZE_TEMPLATE = "WITH cte1 AS ("
+ "SELECT `${colName}` "
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.NotImplementedException;
import org.apache.doris.datasource.ExternalTable;
import org.apache.doris.qe.SessionVariable;

import org.apache.commons.text.StringSubstitutor;

Expand Down Expand Up @@ -59,9 +60,10 @@ protected void deleteNotExistPartitionStats(AnalysisInfo jobInfo) throws DdlExce
protected void doFull() throws Exception {
StringBuilder sb = new StringBuilder();
Map<String, String> params = buildSqlParams();
params.put("min", getMinFunction());
params.put("max", getMaxFunction());
params.put("dataSizeFunction", getDataSizeFunction(col, false));
params.put("hotValueCollectCount", String.valueOf(SessionVariable.getHotValueCollectCount()));
params.put("subStringColName", getStringTypeColName(col));
params.put("rowCount2", "(SELECT COUNT(1) FROM cte1 WHERE `${colName}` IS NOT NULL)");
if (LOG.isDebugEnabled()) {
LOG.debug("Will do full collection for column {}", col.getName());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,11 @@ protected void doFull() throws Exception {
if (StatisticsUtil.enablePartitionAnalyze() && tbl.isPartitionedTable()) {
doPartitionTable();
} else {
StringSubstitutor stringSubstitutor = new StringSubstitutor(buildSqlParams());
Map<String, String> params = buildSqlParams();
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This only fixes the non-partition full-analyze branch. When enable_partition_analyze is true and the table is partitioned, doFull() still goes through doPartitionTable(), which eventually merges with MERGE_PARTITION_TEMPLATE in BaseAnalysisTask and that template still hardcodes null as hot_value. In that configuration, ANALYZE TABLE ... WITH SYNC will continue to produce no hot values for partitioned OLAP tables, so the behavior change described in the PR is still incomplete. The new tests all stub enablePartitionAnalyze() to false, so they won't catch this path.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We intentionally do not handle the case where enable_partition_analyze = true; partitioned tables still go through doPartitionTable().

params.put("hotValueCollectCount", String.valueOf(SessionVariable.getHotValueCollectCount()));
params.put("subStringColName", getStringTypeColName(col));
params.put("rowCount2", "(SELECT COUNT(1) FROM cte1 WHERE `${colName}` IS NOT NULL)");
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
runQuery(stringSubstitutor.replace(FULL_ANALYZE_TEMPLATE));
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import org.apache.doris.common.Pair;
import org.apache.doris.datasource.CatalogIf;
import org.apache.doris.datasource.hive.HMSExternalTable;
import org.apache.doris.qe.SessionVariable;
import org.apache.doris.statistics.util.StatisticsUtil;

import com.google.common.collect.ImmutableList;
Expand Down Expand Up @@ -198,18 +199,35 @@ public Set<String> getPartitionNames() {
}
};

new MockUp<SessionVariable>() {
@Mock
public int getHotValueCollectCount() {
return 10;
}
};

new MockUp<HMSAnalysisTask>() {
@Mock
public void runQuery(String sql) {
Assertions.assertEquals("SELECT CONCAT(30001, '-', -1, '-', 'hour') AS `id`, "
Assertions.assertEquals("WITH cte1 AS (SELECT `hour` "
+ "FROM `hms`.`default`.`test` ), "
+ "cte2 AS (SELECT CONCAT(30001, '-', -1, '-', 'hour') AS `id`, "
+ "10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, "
+ "-1 AS `idx_id`, 'hour' AS `col_id`, NULL AS `part_id`, "
+ "COUNT(1) AS `row_count`, NDV(`hour`) AS `ndv`, "
+ "COUNT(1) - COUNT(`hour`) AS `null_count`, "
+ "SUBSTRING(CAST(MIN(`hour`) AS STRING), 1, 1024) AS `min`, "
+ "SUBSTRING(CAST(MAX(`hour`) AS STRING), 1, 1024) AS `max`, "
+ "COUNT(1) * 4 AS `data_size`, NOW() AS `update_time`, "
+ "null as `hot_value` FROM `hms`.`default`.`test` ", sql);
+ "COUNT(1) * 4 AS `data_size`, NOW() FROM cte1), "
+ "cte3 AS (SELECT GROUP_CONCAT(CONCAT("
+ "REPLACE(REPLACE(t.`column_key`, \":\", \"\\\\:\"), \";\", \"\\\\;\"), "
+ "\" :\", ROUND(t.`count` / "
+ "(SELECT COUNT(1) FROM cte1 WHERE `hour` IS NOT NULL), 2)), \" ;\") "
+ "as `hot_value` FROM (SELECT `hour` as `hash_value`, "
+ "MAX(`hour`) as `column_key`, COUNT(1) AS `count` "
+ "FROM cte1 WHERE `hour` IS NOT NULL "
+ "GROUP BY `hash_value` ORDER BY `count` DESC LIMIT 10) t) "
+ "SELECT * FROM cte2 CROSS JOIN cte3", sql);
}
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.Pair;
import org.apache.doris.datasource.CatalogIf;
import org.apache.doris.qe.SessionVariable;
import org.apache.doris.statistics.AnalysisInfo.AnalysisMethod;
import org.apache.doris.statistics.AnalysisInfo.JobType;
import org.apache.doris.statistics.util.StatisticsUtil;
Expand Down Expand Up @@ -754,4 +755,150 @@ public void testMergePartitionSql() {
+ "WHERE `catalog_id` = 0 AND `db_id` = 1 AND `tbl_id` = 2 AND `idx_id` = 3 AND `col_id` = 'col1'",
sql);
}

// Renders FULL_ANALYZE_TEMPLATE with a representative parameter set and verifies
// that the generated SQL is the new CTE-based form that collects hot values
// (cte1 projects the column, cte3 aggregates hot values, cte2 CROSS JOINs them),
// and that the old hard-coded "null as `hot_value`" output is gone.
@Test
public void testFullAnalyzeTemplateSql() {
    Map<String, String> substitutions = new HashMap<>();
    substitutions.put("catalogId", "0");
    substitutions.put("dbId", "1");
    substitutions.put("tblId", "2");
    substitutions.put("idxId", "3");
    substitutions.put("colId", "col1");
    substitutions.put("colName", "col1");
    substitutions.put("dataSizeFunction", "SUM(LENGTH(`col1`))");
    substitutions.put("catalogName", "internal");
    substitutions.put("dbName", "db1");
    substitutions.put("tblName", "tbl1");
    substitutions.put("index", "");
    substitutions.put("hotValueCollectCount", "10");
    substitutions.put("subStringColName", "`col1`");
    substitutions.put("rowCount2", "(SELECT COUNT(1) FROM cte1 WHERE `col1` IS NOT NULL)");
    String rendered = new StringSubstitutor(substitutions).replace(BaseAnalysisTask.FULL_ANALYZE_TEMPLATE);
    // Query must open with the column-projection CTE.
    Assertions.assertTrue(rendered.startsWith("WITH cte1 AS ("));
    // Hot-value aggregation CTE and its join into the final projection.
    Assertions.assertTrue(rendered.contains("cte3 AS (SELECT GROUP_CONCAT"));
    Assertions.assertTrue(rendered.contains("as `hot_value`"));
    Assertions.assertTrue(rendered.contains("CROSS JOIN cte3"));
    // hotValueCollectCount parameter must drive the LIMIT.
    Assertions.assertTrue(rendered.contains("LIMIT 10"));
    Assertions.assertTrue(rendered.contains("GROUP BY `hash_value` ORDER BY `count` DESC"));
    // The pre-change template emitted a literal null hot_value; ensure it is gone.
    Assertions.assertFalse(rendered.contains("null as `hot_value`"));
}

// Verifies that OlapAnalysisTask.doFull() generates the new hot-value-collecting
// SQL (CTE form with GROUP_CONCAT / CROSS JOIN cte3) for a non-partitioned
// full analyze of an INT column, instead of the old "null as `hot_value`" query.
// The generated SQL is intercepted by mocking runQuery(), so nothing is executed.
@Test
public void testDoFullHotValue(@Mocked CatalogIf catalogIf, @Mocked DatabaseIf databaseIf,
@Mocked OlapTable tableIf) throws Exception {

// Record the ids/names that buildSqlParams() reads to fill the SQL template.
new Expectations() {
{
tableIf.getId();
result = 30001;
tableIf.getName();
result = "testTbl";
catalogIf.getId();
result = 10001;
catalogIf.getName();
result = "catalogName";
databaseIf.getId();
result = 20001;
databaseIf.getFullName();
result = "testDb";
}
};

// Force the non-partition branch of doFull(); the partitioned path
// (doPartitionTable) is not covered by this test.
new MockUp<StatisticsUtil>() {
@Mock
public boolean enablePartitionAnalyze() {
return false;
}
};

// Fix the session-variable hot-value count so the LIMIT clause is deterministic.
new MockUp<SessionVariable>() {
@Mock
public int getHotValueCollectCount() {
return 10;
}
};

// Intercept the generated SQL and assert on its shape instead of running it.
new MockUp<OlapAnalysisTask>() {
@Mock
public void runQuery(String sql) {
Assertions.assertTrue(sql.startsWith("WITH cte1 AS (SELECT `testCol` "
+ "FROM `catalogName`.`testDb`.`testTbl` "), sql);
Assertions.assertTrue(sql.contains("cte3 AS (SELECT GROUP_CONCAT"), sql);
// Non-string column: the raw column itself is used as the hash key.
Assertions.assertTrue(sql.contains("`testCol` as `hash_value`"), sql);
Assertions.assertTrue(sql.contains("LIMIT 10"), sql);
Assertions.assertTrue(sql.contains("CROSS JOIN cte3"), sql);
// The pre-change template emitted a literal null hot_value; ensure it is gone.
Assertions.assertFalse(sql.contains("null as `hot_value`"), sql);
}
};

// Wire up a minimal task over the mocked catalog/db/table and trigger doFull().
OlapAnalysisTask task = new OlapAnalysisTask();
task.col = new Column("testCol", Type.fromPrimitiveType(PrimitiveType.INT),
true, null, null, null);
task.tbl = tableIf;
AnalysisInfoBuilder builder = new AnalysisInfoBuilder();
builder.setJobType(AnalysisInfo.JobType.MANUAL);
builder.setColName("testCol");
task.info = builder.build();
task.catalog = catalogIf;
task.db = databaseIf;
task.doFull();
}

// Same setup as testDoFullHotValue, but for a STRING column: verifies that the
// hot-value CTE hashes a truncated cast of the column (xxhash_64 over a 1024-char
// SUBSTRING) as `hash_value` while keeping MAX(col) as the displayed `column_key`.
// The generated SQL is intercepted by mocking runQuery(), so nothing is executed.
@Test
public void testDoFullHotValueStringColumn(@Mocked CatalogIf catalogIf, @Mocked DatabaseIf databaseIf,
@Mocked OlapTable tableIf) throws Exception {

// Record the ids/names that buildSqlParams() reads to fill the SQL template.
new Expectations() {
{
tableIf.getId();
result = 30001;
tableIf.getName();
result = "testTbl";
catalogIf.getId();
result = 10001;
catalogIf.getName();
result = "catalogName";
databaseIf.getId();
result = 20001;
databaseIf.getFullName();
result = "testDb";
}
};

// Force the non-partition branch of doFull().
new MockUp<StatisticsUtil>() {
@Mock
public boolean enablePartitionAnalyze() {
return false;
}
};

// Fix the session-variable hot-value count so the generated SQL is deterministic.
new MockUp<SessionVariable>() {
@Mock
public int getHotValueCollectCount() {
return 10;
}
};

// Intercept the generated SQL and assert on the string-column hashing form.
new MockUp<OlapAnalysisTask>() {
@Mock
public void runQuery(String sql) {
Assertions.assertTrue(sql.contains(
"xxhash_64(SUBSTRING(CAST(`strCol` AS STRING), 1, 1024)) as `hash_value`"), sql);
Assertions.assertTrue(sql.contains("MAX(`strCol`) as `column_key`"), sql);
}
};

// Wire up a minimal task over the mocked catalog/db/table and trigger doFull().
OlapAnalysisTask task = new OlapAnalysisTask();
task.col = new Column("strCol", Type.fromPrimitiveType(PrimitiveType.STRING),
true, null, null, null);
task.tbl = tableIf;
AnalysisInfoBuilder builder = new AnalysisInfoBuilder();
builder.setJobType(AnalysisInfo.JobType.MANUAL);
builder.setColName("strCol");
task.info = builder.build();
task.catalog = catalogIf;
task.db = databaseIf;
task.doFull();
}
}
Loading
Loading