github-actions[bot] commented on code in PR #61114: URL: https://github.com/apache/doris/pull/61114#discussion_r2904886595
########## regression-test/suites/cloud_p0/cache/ttl/st06_warmup_ttl_type_assert.groovy: ########## @@ -0,0 +1,122 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("st06_warmup_ttl_type_assert") { + def customBeConfig = [ + enable_evict_file_cache_in_advance : false, + file_cache_enter_disk_resource_limit_mode_percent : 99 + ] + + setBeConfigTemporary(customBeConfig) { + def clusters = sql "SHOW CLUSTERS" + if (clusters.size() < 2) { + logger.info("skip st06_warmup_ttl_type_assert, need at least 2 clusters") + return + } + + def sourceCluster = clusters[0][0] + def targetCluster = clusters[1][0] + String tableName = "st06_warmup_ttl_tpl" + + sql """use @${sourceCluster};""" + def ddl = new File("""${context.file.parent}/../ddl/st06_warmup_ttl_type_assert.sql""").text + .replace("\${TABLE_NAME}", tableName) + sql ddl + + def values = (0..<200).collect { i -> "(${i}, 'warmup_tpl_${i}')" }.join(",") + sql """insert into ${tableName} values ${values}""" + qt_source_preheat """select count(*) from ${tableName}""" + + def sourceTablets = sql """show tablets from ${tableName}""" + assertTrue(sourceTablets.size() > 0, "No tablets found for table ${tableName} in source cluster ${sourceCluster}") + def sourceTabletIds = sourceTablets.collect { it[0] as Long } + + // ST-06 部分覆盖点模板:显式断言 warmup 后目标集群缓存类型为 ttl + def jobIdRows = sql """warm up cluster ${targetCluster} with table ${tableName};""" + assertTrue(!jobIdRows.isEmpty()) + def jobId = jobIdRows[0][0] + + def waitWarmUpJobFinished = { Object id, long timeoutMs = 600000L, long intervalMs = 5000L -> + long start = System.currentTimeMillis() + while (System.currentTimeMillis() - start < timeoutMs) { + def stateRows = sql """SHOW WARM UP JOB WHERE ID = ${id}""" + if (stateRows.isEmpty()) { + sleep(intervalMs) + continue + } + def state = stateRows[0][3].toString() + if ("FINISHED".equalsIgnoreCase(state)) { + return + } + if ("CANCELLED".equalsIgnoreCase(state) || "FAILED".equalsIgnoreCase(state)) { + assertTrue(false, "Warm up job failed, id=${id}, state=${state}") + } + sleep(intervalMs) + } + assertTrue(false, "Timeout waiting warm up job finished, id=${id}") + } + waitWarmUpJobFinished.call(jobId) + + sql """use @${targetCluster};""" + qt_target_query """select count(*) from ${tableName}""" + def targetTablets = sql """show tablets from ${tableName}""" + assertTrue(targetTablets.size() > 0, "No tablets found for table ${tableName} in target cluster ${targetCluster}") + def targetTabletIds = targetTablets.collect { it[0] as Long } + assertTrue(sourceTabletIds.size() == targetTabletIds.size(), + "Tablet size mismatch between source and target, source=${sourceTabletIds.size()}, target=${targetTabletIds.size()}") + + def waitForFileCacheType = { List<Long> sourceIds, List<Long> targetIds, String expectedType, long timeoutMs = 600000L, long intervalMs = 2000L -> + logger.info("waitForFileCacheType, sourceIds=${sourceIds.toString()}, targetIds=${targetIds.toString()}, expectedType=${expectedType}") + assertTrue(sourceIds.size() == targetIds.size(), + "Tablet size mismatch before waiting file cache type, source=${sourceIds.size()}, target=${targetIds.size()}") + long start = System.currentTimeMillis() + while (System.currentTimeMillis() - start < timeoutMs) { + int sourceMatched = 0 + int targetMatched = 0 + for (Long sourceTabletId in sourceIds) { + def sourceTabletIdStr = sql """select * from information_schema.file_cache_info where tablet_id = ${sourceTabletId}""" + logger.info("[source tablet] tablet_id=${sourceTabletId}, tablet_cache_info=${sourceTabletIdStr.toString()}") + def rows = sql """select type from information_schema.file_cache_info where tablet_id = ${sourceTabletId}""" + if (!rows.isEmpty()) { + def mismatch = rows.find { row -> !row[0]?.toString()?.equalsIgnoreCase(expectedType) } + if (!mismatch) { + sourceMatched++ + } + } + } + for (Long targetTabletId in targetIds) { + def targetTabletIdStr = sql """select * from information_schema.file_cache_info where tablet_id = ${targetTabletId}""" + logger.info("[target tablet] tablet_id=${targetTabletId}, tablet_cache_info=${targetTabletIdStr.toString()}") + def rows = sql """select type from information_schema.file_cache_info where tablet_id = ${targetTabletId}""" + if (!rows.isEmpty()) { + def mismatch = rows.find { row -> !row[0]?.toString()?.equalsIgnoreCase(expectedType) } + if (!mismatch) { + targetMatched++ + } + } + } + sleep(intervalMs) Review Comment: Bug: `waitForFileCacheType` **never returns on success** - it will always time out and fail. The function counts `sourceMatched` and `targetMatched` but never checks if all tablets matched and never executes an early `return`. After the `while` loop exhausts the timeout, it unconditionally calls `assertTrue(false, ...)` on line 114. Suggested fix - add an early return check after counting matches: ```groovy // After the two for loops: if (sourceMatched == sourceIds.size() && targetMatched == targetIds.size()) { return } sleep(intervalMs) ``` ########## regression-test/data/cloud_p0/ttl/st04_alter_ttl_n_to_0_runtime.out: ########## @@ -0,0 +1,4 @@ +-- This file is automatically generated. You should know what you did if you want to edit this Review Comment: Bug: Wrong directory path. This `.out` file is at `data/cloud_p0/ttl/` but the corresponding test is at `suites/cloud_p0/cache/ttl/st04_alter_ttl_n_to_0_runtime.groovy`. The Doris test framework resolves `.out` paths by mirroring the suite path, so this file should be at `data/cloud_p0/cache/ttl/st04_alter_ttl_n_to_0_runtime.out`. This applies to all 5 `st*.out` files: - `st04_alter_ttl_n_to_0_runtime.out` - `st06_warmup_ttl_type_assert.out` - `st07_qcs_consistency.out` - `st10_drop_partition_cleanup.out` - `st10_drop_table_cleanup.out` They should all be moved from `data/cloud_p0/ttl/` to `data/cloud_p0/cache/ttl/`. ########## regression-test/suites/cloud_p0/cache/ttl/st07_qcs_consistency.groovy: ########## @@ -0,0 +1,106 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("st07_qcs_consistency") { + def customBeConfig = [ + enable_evict_file_cache_in_advance : false, + file_cache_enter_disk_resource_limit_mode_percent : 99, + file_cache_background_ttl_gc_interval_ms : 1000, + file_cache_background_ttl_info_update_interval_ms : 1000, + file_cache_background_tablet_id_flush_interval_ms : 1000 + ] + + setBeConfigTemporary(customBeConfig) { + def clusters = sql "SHOW CLUSTERS" + assertTrue(!clusters.isEmpty()) + def validCluster = clusters[0][0] + sql """use @${validCluster};""" + + String tableName = "st07_qcs_tpl" + def ddl = new File("""${context.file.parent}/../ddl/st07_qcs_consistency.sql""").text + .replace("\${TABLE_NAME}", tableName) + sql ddl + + (0..<500).each { i -> Review Comment: Performance: This loop executes 500 individual INSERT statements, which is very slow and may cause unnecessary test timeouts. Other tests in this PR use batch inserts efficiently (e.g., `st04` batches 200 rows at a time). Suggested fix: ```groovy def values = (0..<500).collect { i -> "(${i}, 'qcs_tpl_${i}')" }.join(",") sql """insert into ${tableName} values ${values}""" ``` Note: the test intentionally wants multiple rowsets for compaction, so you could compromise with a few batches (e.g., 5 batches of 100) instead of 500 individual inserts. ########## regression-test/suites/cloud_p0/cache/ttl/st10_drop_partition_cleanup.groovy: ########## @@ -0,0 +1,199 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("st10_drop_partition_cleanup") { + def customBeConfig = [ + enable_evict_file_cache_in_advance : false, + file_cache_enter_disk_resource_limit_mode_percent : 99, + file_cache_background_ttl_gc_interval_ms : 1000, + file_cache_background_ttl_info_update_interval_ms : 1000, + file_cache_background_tablet_id_flush_interval_ms : 1000 + ] + def customFeConfig = [ + rehash_tablet_after_be_dead_seconds : 5 + ] + + setBeConfigTemporary(customBeConfig) { + setFeConfigTemporary(customFeConfig) { + def clusters = sql "SHOW CLUSTERS" + assertTrue(!clusters.isEmpty()) + def validCluster = clusters[0][0] + sql """use @${validCluster};""" + + String tableName = "st10_drop_part_cleanup_tpl" + def ddl = new File("""${context.file.parent}/../ddl/st10_drop_partition_cleanup.sql""").text + .replace("\${TABLE_NAME}", tableName) + sql ddl + + String[][] backends = sql """show backends""" + def backendIdToBackendIP = [:] + def backendIdToBackendHttpPort = [:] + def backendIdToBackendBrpcPort = [:] + for (String[] backend in backends) { + if (backend[9].equals("true") && backend[19].contains("${validCluster}")) { + backendIdToBackendIP.put(backend[0], backend[1]) + backendIdToBackendHttpPort.put(backend[0], backend[4]) + backendIdToBackendBrpcPort.put(backend[0], backend[5]) + } + } + assertEquals(backendIdToBackendIP.size(), 1) + + def backendId = backendIdToBackendIP.keySet()[0] + def clearUrl = backendIdToBackendIP.get(backendId) + ":" + backendIdToBackendHttpPort.get(backendId) + "/api/file_cache?op=clear&sync=true" + httpTest { + endpoint "" + uri clearUrl + op "get" + body "" + printResponse false + check { respCode, body -> + assertEquals("${respCode}".toString(), "200") + } + } + + def waitForFileCacheType = { List<Long> tabletIds, String expectedType, long timeoutMs = 120000L, long intervalMs = 2000L -> + long start = System.currentTimeMillis() + while (System.currentTimeMillis() - start < timeoutMs) { + boolean allMatch = true + for (Long tabletId in tabletIds) { + def rows = sql """select type from information_schema.file_cache_info where tablet_id = ${tabletId}""" + if (rows.isEmpty()) { + allMatch = false + break + } + def mismatch = rows.find { row -> !row[0]?.toString()?.equalsIgnoreCase(expectedType) } + if (mismatch) { + allMatch = false + break + } + } + if (allMatch) { + return + } + sleep(intervalMs) + } + assertTrue(false, "Timeout waiting for ${expectedType}, tablets=${tabletIds}") + } + + def waitDroppedTabletCacheInfoEmpty = { List<Long> tabletIds, long timeoutMs = 300000L, long intervalMs = 3000L -> + if (tabletIds.isEmpty()) { + return + } + String idList = tabletIds.join(",") + long start = System.currentTimeMillis() + while (System.currentTimeMillis() - start < timeoutMs) { + def rows = sql """select tablet_id from information_schema.file_cache_info where tablet_id in (${idList}) limit 1""" + if (rows.isEmpty()) { + return + } + sleep(intervalMs) + } + assertTrue(false, "Timeout waiting dropped tablet cache entries cleaned, tablets=${tabletIds}") + } + + def waitTabletCacheInfoNonEmpty = { List<Long> tabletIds, long timeoutMs = 120000L, long intervalMs = 2000L -> + if (tabletIds.isEmpty()) { + assertTrue(false, "tabletIds is empty") + } + String idList = tabletIds.join(",") + long start = System.currentTimeMillis() + while (System.currentTimeMillis() - start < timeoutMs) { + def rows = sql """select tablet_id from information_schema.file_cache_info where tablet_id in (${idList}) limit 1""" + if (!rows.isEmpty()) { + return + } + sleep(intervalMs) + } + assertTrue(false, "Timeout waiting tablet cache entries exist, tablets=${tabletIds}") + } + + def getBrpcMetricSum = { String metricNameSubstr -> + long sumValue = 0L + httpTest { + endpoint backendIdToBackendIP.get(backendId) + ":" + backendIdToBackendBrpcPort.get(backendId) + uri "/brpc_metrics" + op "get" + check { respCode, body -> + assertEquals("${respCode}".toString(), "200") + String out = "${body}".toString() + def lines = out.split('\n') + for (String line in lines) { + if (line.startsWith("#")) { + continue + } + if (!line.contains(metricNameSubstr)) { + continue + } + logger.info("metric line: ${line}") + def idx = line.indexOf(' ') + if (idx <= 0) { + continue + } + try { + sumValue += line.substring(idx).trim().toLong() + } catch (Exception e) { + logger.warn("ignore unparsable metric line: ${line}") + } + } + } + } + return sumValue + } + + def waitBrpcMetricLE = { String metricNameSubstr, long upperBound, long timeoutMs = 180000L, long intervalMs = 3000L -> + long start = System.currentTimeMillis() + while (System.currentTimeMillis() - start < timeoutMs) { + long cur = getBrpcMetricSum.call(metricNameSubstr) + if (cur <= upperBound) { + return + } + sleep(intervalMs) + } + long curFinal = getBrpcMetricSum.call(metricNameSubstr) + assertTrue(curFinal <= upperBound, "Metric ${metricNameSubstr} should <= ${upperBound}, actual=${curFinal}") + } + + def getPartitionTabletIds = { String tbl, String partitionName -> + def tablets = sql """show tablets from ${tbl} partition ${partitionName}""" + assertTrue(!tablets.isEmpty(), "No tablets found for partition ${partitionName}") + tablets.collect { it[0] as Long } + } + + def p1Values = (0..<120).collect { i -> "(${i}, 'p1_${i}')" }.join(",") + def p2Values = (1000..<1120).collect { i -> "(${i}, 'p2_${i}')" }.join(",") + sql """insert into ${tableName} values ${p1Values}""" + sql """insert into ${tableName} values ${p2Values}""" + qt_part_preheat """select count(*) from ${tableName}""" + sleep(5000) + + def p1Tablets = getPartitionTabletIds.call(tableName, "p1") + def p2Tablets = getPartitionTabletIds.call(tableName, "p2") + waitForFileCacheType.call((p1Tablets + p2Tablets).unique(), "ttl") + + final String ttlMgrSetMetric = "file_cache_ttl_mgr_tablet_id_set_size" + long ttlMgrSetSizeBeforeDropPartition = getBrpcMetricSum.call(ttlMgrSetMetric) + + sql """alter table ${tableName} drop partition p1 force""" + waitDroppedTabletCacheInfoEmpty.call(p1Tablets) + waitTabletCacheInfoNonEmpty.call(p2Tablets) + waitBrpcMetricLE.call(ttlMgrSetMetric, ttlMgrSetSizeBeforeDropPartition) + + qt_part_survivor """select count(*) from ${tableName} where k1 >= 1000""" Review Comment: Bug: `qt_part_survivor` is used here but its expected output is missing from the `.out` file (`st10_drop_partition_cleanup.out`). The `.out` file only contains `-- !part_preheat --` with value 240, but there is no `-- !part_survivor --` entry. You need to add the expected result for this check tag. It should be the count of rows in partition p2 (k1 >= 1000), which would be 120 rows. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
