Jibing-Li commented on code in PR #26435:
URL: https://github.com/apache/doris/pull/26435#discussion_r1390553845
##########
fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java:
##########
@@ -85,46 +75,95 @@ public void doExecute() throws Exception {
* 3. insert col stats and partition stats
*/
protected void doSample() throws Exception {
- Pair<List<Long>, Long> pair = calcActualSampleTablets();
+ LOG.info(String.format("Will do sample collection for column %s",
col.getName()));
+ Pair<List<Long>, Long> pair =
calcActualSampleTablets(isPartitionColumn());
+ LOG.info(String.format("Number of tablets selected %d, rows in tablets
%d", pair.first.size(), pair.second));
List<Long> tabletIds = pair.first;
double scaleFactor = (double) tbl.getRowCount() / (double) pair.second;
// might happen if row count in fe metadata hasn't been updated yet
if (Double.isInfinite(scaleFactor) || Double.isNaN(scaleFactor)) {
+ LOG.warn("Scale factor is infinite or Nan, will set scale factor
to 1.");
scaleFactor = 1;
tabletIds = Collections.emptyList();
+ pair.second = tbl.getRowCount();
}
String tabletStr = tabletIds.stream()
.map(Object::toString)
.collect(Collectors.joining(", "));
try (AutoCloseConnectContext r =
StatisticsUtil.buildConnectContext(info.jobType.equals(JobType.SYSTEM))) {
+ // Get basic stats, including min and max.
+ ResultRow basicStats = collectBasicStat(r);
+ long rowCount = tbl.getRowCount();
+ String min =
Base64.getEncoder().encodeToString(basicStats.get(0).getBytes(StandardCharsets.UTF_8));
+ String max =
Base64.getEncoder().encodeToString(basicStats.get(1).getBytes(StandardCharsets.UTF_8));
+
+ boolean limitFlag = false;
+ long rowsToSample = pair.second;
Map<String, String> params = new HashMap<>();
params.put("internalDB", FeConstants.INTERNAL_DB_NAME);
params.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME);
params.put("catalogId", String.valueOf(catalog.getId()));
+ params.put("catalogName", catalog.getName());
params.put("dbId", String.valueOf(db.getId()));
params.put("tblId", String.valueOf(tbl.getId()));
params.put("idxId", String.valueOf(info.indexId));
params.put("colId", String.valueOf(info.colName));
- params.put("dataSizeFunction", getDataSizeFunction(col));
+ params.put("dataSizeFunction", getDataSizeFunction(col, false));
params.put("dbName", db.getFullName());
params.put("colName", info.colName);
params.put("tblName", tbl.getName());
params.put("scaleFactor", String.valueOf(scaleFactor));
- params.put("tablets", tabletStr.isEmpty() ? "" :
String.format("TABLET(%s)", tabletStr));
+ params.put("sampleHints", tabletStr.isEmpty() ? "" :
String.format("TABLET(%s)", tabletStr));
+ params.put("ndvFunction",
getNdvFunction(String.valueOf(rowCount)));
+ params.put("min", min);
+ params.put("max", max);
+ params.put("rowCount", String.valueOf(rowCount));
+ params.put("type", col.getType().toString());
+ params.put("limit", "");
+ if (needLimit()) {
+ // If the tablets to be sampled are too large, use limit to
control the rows to read, and re-calculate
+ // the scaleFactor.
+ limitFlag = true;
+ rowsToSample = Math.min(getSampleRows(), pair.second);
+ params.put("limit", "limit " + rowsToSample);
+ params.put("scaleFactor", String.valueOf(scaleFactor *
(double) pair.second / rowsToSample));
+ }
StringSubstitutor stringSubstitutor = new
StringSubstitutor(params);
- stmtExecutor = new StmtExecutor(r.connectContext,
stringSubstitutor.replace(SAMPLE_COLUMN_SQL_TEMPLATE));
- // Scalar query only return one row
- ColStatsData colStatsData = new
ColStatsData(stmtExecutor.executeInternalQuery().get(0));
- job.appendBuf(this, Collections.singletonList(colStatsData));
+ String sql;
+ // Distribution columns don't fit for DUJ1 estimator, use linear
estimator.
+ if (isDistributionColumn()) {
+ params.put("min", StatisticsUtil.quote(min));
+ params.put("max", StatisticsUtil.quote(max));
+ sql = stringSubstitutor.replace(LINEAR_ANALYZE_TEMPLATE);
+ } else {
+ params.put("dataSizeFunction", getDataSizeFunction(col, true));
+ sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE);
+ }
+ LOG.info(String.format("Sample for column [%s]. Total rows [%s],
rows to sample [%d], scale factor [%s], "
Review Comment:
How about keeping this info-level log? It's helpful for investigating
errors when they happen.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]