Copilot commented on code in PR #12400:
URL: https://github.com/apache/gluten/pull/12400#discussion_r3498467561
########## backends-velox/src/test/scala/org/apache/spark/sql/execution/VeloxFileHandleCacheSuite.scala: ########## @@ -0,0 +1,250 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution + +import org.apache.gluten.config.VeloxConfig +import org.apache.gluten.execution.{BasicScanExecTransformer, VeloxWholeStageTransformerSuite} + +import org.apache.spark.SparkConf + +/** + * Test suite for Velox file handle cache behavior. + * + * Tests correctness, config propagation, and edge cases for the file handle cache which caches open + * file handles (descriptors) to avoid repeated open/close overhead. 
+ */ +class VeloxFileHandleCacheSuite extends VeloxWholeStageTransformerSuite { + override protected val resourcePath: String = "/parquet-for-read" + override protected val fileFormat: String = "parquet" + + override protected def sparkConf: SparkConf = { + super.sparkConf + .set(VeloxConfig.COLUMNAR_VELOX_FILE_HANDLE_CACHE_ENABLED.key, "true") + .set(VeloxConfig.COLUMNAR_VELOX_FILE_HANDLE_EXPIRATION_DURATION_MS.key, "600000") + .set(VeloxConfig.COLUMNAR_VELOX_NUM_CACHE_FILE_HANDLES.key, "10000") + } + + testWithSpecifiedSparkVersion( + "basic scan correctness with file handle cache enabled", + "3.5", + "3.5") { + // Verify that enabling file handle cache produces correct scan results + withTempPath { + dir => + spark + .range(10000) + .selectExpr("id", "cast(id % 7 as int) as category", "id * 1.5 as value") + .repartition(10) + .write + .parquet(dir.getCanonicalPath) + + val df = spark.read.parquet(dir.getCanonicalPath) + df.createOrReplaceTempView("t") + + runQueryAndCompare("SELECT count(*) FROM t") { + checkGlutenPlan[BasicScanExecTransformer] + } + runQueryAndCompare("SELECT sum(value) FROM t WHERE category = 3") { + checkGlutenPlan[BasicScanExecTransformer] + } + runQueryAndCompare("SELECT category, count(*) FROM t GROUP BY category") { + checkGlutenPlan[BasicScanExecTransformer] + } + } + } + + testWithSpecifiedSparkVersion( + "repeated scans produce consistent results (cache hit path)", + "3.5", + "3.5") { + // When file handles are cached, repeated scans of the same files must produce + // identical results. This exercises the cache hit path. 
+ withTempPath { + dir => + spark + .range(5000) + .selectExpr("id", "cast(id as string) as name") + .repartition(50) // 50 files to exercise many cache entries + .write + .parquet(dir.getCanonicalPath) + + val path = dir.getCanonicalPath + val expected = spark.read.parquet(path).count() + assert(expected == 5000) + + // Scan the same files multiple times - each should hit the cache + for (i <- 1 to 5) { + val count = spark.read.parquet(path).count() + assert( + count == expected, + s"Iteration $i: expected $expected rows but got $count") + } + + // Verify aggregation consistency across repeated scans + val firstSum = spark.read.parquet(path).selectExpr("sum(id)").collect()(0).getLong(0) + for (i <- 1 to 3) { + val sum = spark.read.parquet(path).selectExpr("sum(id)").collect()(0).getLong(0) + assert( + sum == firstSum, + s"Iteration $i: sum mismatch, expected $firstSum but got $sum") + } + } + } + + testWithSpecifiedSparkVersion( + "many small files do not cause errors with file handle cache", + "3.5", + "3.5") { + // Verify that scanning many small files with caching enabled does not cause + // file descriptor exhaustion or other resource-related errors. + withTempPath { + dir => + // Create 200 small parquet files + spark + .range(20000) + .selectExpr("id", "uuid() as payload") + .repartition(200) + .write + .parquet(dir.getCanonicalPath) + + val fileCount = dir.listFiles().count(_.getName.endsWith(".parquet")) + assert(fileCount >= 100, s"Expected at least 100 files, got $fileCount") Review Comment: This test intends to exercise "many small files" by writing 200 parquet parts (`repartition(200)`), but the assertion only checks `fileCount >= 100`, which could allow the test to pass without actually producing the intended number of files. Tightening the assertion makes the test better at catching regressions in file generation / partitioning. 
########## cpp/velox/utils/ConfigExtractor.cc: ########## @@ -292,6 +292,10 @@ std::shared_ptr<facebook::velox::config::ConfigBase> createHiveConnectorConfig( hiveConfMap[facebook::velox::connector::hive::HiveConfig::kEnableFileHandleCache] = conf->get<bool>(kVeloxFileHandleCacheEnabled, kVeloxFileHandleCacheEnabledDefault) ? "true" : "false"; + hiveConfMap[facebook::velox::connector::hive::HiveConfig::kNumCacheFileHandles] = + std::to_string(conf->get<int32_t>(kVeloxNumCacheFileHandles, kVeloxNumCacheFileHandlesDefault)); + hiveConfMap[facebook::velox::connector::hive::HiveConfig::kFileHandleExpirationDurationMs] = std::to_string( + conf->get<int64_t>(kVeloxFileHandleExpirationDurationMs, kVeloxFileHandleExpirationDurationMsDefault)); Review Comment: The assignment for kFileHandleExpirationDurationMs is formatted inconsistently vs the adjacent hiveConfMap entries (it keeps `= std::to_string(` on the same line, while others break after `=`). This hurts readability and is likely to be rewrapped by clang-format; consider formatting it like the surrounding assignments. ########## backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala: ########## @@ -527,10 +527,32 @@ object VeloxConfig extends ConfigRegistry { val COLUMNAR_VELOX_FILE_HANDLE_CACHE_ENABLED = buildStaticConf("spark.gluten.sql.columnar.backend.velox.fileHandleCacheEnabled") .doc( - "Disables caching if false. File handle cache should be disabled " + - "if files are mutable, i.e. file content may change while file path stays the same.") + "Enables caching of file handles to avoid repeated open/close overhead on remote " + + "filesystems. Should be disabled if files are mutable, i.e. 
file content may " + + "change while file path stays the same.") .booleanConf - .createWithDefault(false) + .createWithDefault(true) + + val COLUMNAR_VELOX_NUM_CACHE_FILE_HANDLES = + buildStaticConf("spark.gluten.sql.columnar.backend.velox.numCacheFileHandles") + .doc( + "Maximum number of entries in the file handle cache. Each entry holds an open " + + "file descriptor (local FS) or connection state (remote FS). Note that on " + + "local filesystems, high values may approach the OS file descriptor limit " + + "(ulimit -n). On remote object stores (S3, ABFS, GCS) entries are HTTP " + + "connections, not OS file descriptors.") + .intConf + .createWithDefault(10000) + Review Comment: The PR description says `spark.gluten.sql.columnar.backend.velox.numCacheFileHandles` defaults to 20000, but the code default here is 10000. Please align the PR description with the actual default (or update the defaults consistently across Scala + C++ if 20000 is intended). ########## backends-velox/src/test/scala/org/apache/spark/sql/execution/VeloxFileHandleCacheSuite.scala: ########## @@ -0,0 +1,250 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.spark.sql.execution
+
+import org.apache.gluten.config.VeloxConfig
+import org.apache.gluten.execution.{BasicScanExecTransformer, VeloxWholeStageTransformerSuite}
+
+import org.apache.spark.SparkConf
+
+/**
+ * Test suite for Velox file handle cache behavior.
+ *
+ * Tests correctness, config propagation, and edge cases for the file handle cache which caches open
+ * file handles (descriptors) to avoid repeated open/close overhead.
+ */
+class VeloxFileHandleCacheSuite extends VeloxWholeStageTransformerSuite {
+ override protected val resourcePath: String = "/parquet-for-read"
+ override protected val fileFormat: String = "parquet"
+
+ override protected def sparkConf: SparkConf = {
+ super.sparkConf
+ .set(VeloxConfig.COLUMNAR_VELOX_FILE_HANDLE_CACHE_ENABLED.key, "true")
+ .set(VeloxConfig.COLUMNAR_VELOX_FILE_HANDLE_EXPIRATION_DURATION_MS.key, "600000")
+ .set(VeloxConfig.COLUMNAR_VELOX_NUM_CACHE_FILE_HANDLES.key, "10000")
+ }

Review Comment:
The PR wires `fileHandleExpirationDurationMs` through to Velox, but this suite doesn’t currently have a test that validates TTL-based eviction actually takes effect (it only sets the config). Adding a targeted TTL test would protect against regressions where the config stops being propagated or the eviction behavior changes.

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at: [email protected]

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
