This is an automated email from the ASF dual-hosted git repository.
felixybw pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new f26ce78c3a [VL] Fix stoi issue when getting parquet write options (#11504)
f26ce78c3a is described below
commit f26ce78c3a7ae1fe829496b6a7f301df79ed7a97
Author: Rex(Hui) An <[email protected]>
AuthorDate: Fri Jan 30 12:36:35 2026 +0800
[VL] Fix stoi issue when getting parquet write options (#11504)
The parquet write option values can exceed INT_MAX, so parsing them with std::stoi throws std::out_of_range; parse them with std::stoll into int64_t instead.
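For context, std::stoi returns a 32-bit int and throws std::out_of_range when the option string encodes a value above INT_MAX, which is exactly the case the new test exercises. A minimal standalone C++ sketch (not part of this commit) of the failure and the std::stoll replacement:

#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>

int main() {
  const std::string blockSize = "3221225472"; // 3GB, above INT_MAX (2147483647)
  try {
    int parsed = std::stoi(blockSize); // throws std::out_of_range for this value
    std::cout << "stoi: " << parsed << std::endl;
  } catch (const std::out_of_range&) {
    std::cout << "stoi: out_of_range" << std::endl;
  }
  int64_t parsedWide = std::stoll(blockSize); // fits comfortably in 64 bits
  std::cout << "stoll: " << parsedWide << std::endl;
  return 0;
}

Run as-is, this prints the out_of_range branch for stoi and 3221225472 for stoll, which mirrors why the patch switches the three parses below to std::stoll.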
---
.../sql/execution/VeloxParquetWriteSuite.scala | 44 ++++++++++++++++++++++
cpp/velox/utils/VeloxWriterUtils.cc | 6 +--
2 files changed, 47 insertions(+), 3 deletions(-)
diff --git a/backends-velox/src/test/scala/org/apache/spark/sql/execution/VeloxParquetWriteSuite.scala b/backends-velox/src/test/scala/org/apache/spark/sql/execution/VeloxParquetWriteSuite.scala
index 81c9870a52..4bc20142a1 100644
--- a/backends-velox/src/test/scala/org/apache/spark/sql/execution/VeloxParquetWriteSuite.scala
+++ b/backends-velox/src/test/scala/org/apache/spark/sql/execution/VeloxParquetWriteSuite.scala
@@ -164,4 +164,48 @@ class VeloxParquetWriteSuite extends VeloxWholeStageTransformerSuite {
Assert.assertTrue(FallbackUtil.hasFallback(df.queryExecution.executedPlan))
}
}
+
+ test("test write parquet with custom block size, block rows and page size") {
+ val blockSize = 64 * 1024 * 1024L // 64MB
+ val blockRows = 10000000L // 10M
+ val pageSize = 1024 * 1024L // 1MB
+
+ withTempPath {
+ f =>
+ spark
+ .range(100)
+ .toDF("id")
+ .write
+ .format("parquet")
+ .option(GlutenConfig.PARQUET_BLOCK_SIZE, blockSize.toString)
+ .option(GlutenConfig.PARQUET_BLOCK_ROWS, blockRows.toString)
+ .option(GlutenConfig.PARQUET_DATAPAGE_SIZE, pageSize.toString)
+ .save(f.getCanonicalPath)
+
+ val parquetDf = spark.read.parquet(f.getCanonicalPath)
+ checkAnswer(parquetDf, spark.range(100).toDF("id"))
+ }
+ }
+
+ test("test write parquet with block size, block rows and page size exceeding
INT_MAX") {
+ val largeBlockSize = 3L * 1024 * 1024 * 1024 // 3GB
+ val largeBlockRows = 3L * 1000 * 1000 * 1000 // 3 billion
+ val largePageSize = 3L * 1024 * 1024 * 1024 // 3GB
+
+ withTempPath {
+ f =>
+ spark
+ .range(100)
+ .toDF("id")
+ .write
+ .format("parquet")
+ .option(GlutenConfig.PARQUET_BLOCK_SIZE, largeBlockSize.toString)
+ .option(GlutenConfig.PARQUET_BLOCK_ROWS, largeBlockRows.toString)
+ .option(GlutenConfig.PARQUET_DATAPAGE_SIZE, largePageSize.toString)
+ .save(f.getCanonicalPath)
+
+ val parquetDf = spark.read.parquet(f.getCanonicalPath)
+ checkAnswer(parquetDf, spark.range(100).toDF("id"))
+ }
+ }
}
diff --git a/cpp/velox/utils/VeloxWriterUtils.cc b/cpp/velox/utils/VeloxWriterUtils.cc
index fc4b810fb8..50e4ca601e 100644
--- a/cpp/velox/utils/VeloxWriterUtils.cc
+++ b/cpp/velox/utils/VeloxWriterUtils.cc
@@ -42,10 +42,10 @@ std::unique_ptr<WriterOptions> makeParquetWriteOption(const std::unordered_map<s
int64_t maxRowGroupBytes = 134217728; // 128MB
int64_t maxRowGroupRows = 100000000; // 100M
if (auto it = sparkConfs.find(kParquetBlockSize); it != sparkConfs.end()) {
- maxRowGroupBytes = static_cast<int64_t>(stoi(it->second));
+ maxRowGroupBytes = std::stoll(it->second);
}
if (auto it = sparkConfs.find(kParquetBlockRows); it != sparkConfs.end()) {
- maxRowGroupRows = static_cast<int64_t>(stoi(it->second));
+ maxRowGroupRows = std::stoll(it->second);
}
auto writeOption = std::make_unique<WriterOptions>();
writeOption->parquetWriteTimestampUnit = TimestampPrecision::kMicroseconds /*micro*/;
@@ -93,7 +93,7 @@ std::unique_ptr<WriterOptions> makeParquetWriteOption(const std::unordered_map<s
writeOption->arrowMemoryPool =
getDefaultMemoryManager()->getOrCreateArrowMemoryPool("VeloxParquetWrite.ArrowMemoryPool");
if (auto it = sparkConfs.find(kParquetDataPageSize); it != sparkConfs.end()) {
- auto dataPageSize = std::stoi(it->second);
+ auto dataPageSize = std::stoll(it->second);
writeOption->dataPageSize = dataPageSize;
}
if (auto it = sparkConfs.find(kParquetWriterVersion); it != sparkConfs.end()) {
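For illustration, a hedged standalone sketch of the post-fix parsing pattern (the map keys below are placeholders, not the real GlutenConfig/kParquet* constants): option values arrive as strings, and std::stoll keeps values above INT_MAX intact where std::stoi would have thrown.

#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>

int main() {
  // Values mirror the new test: 3GB block size and 3 billion block rows.
  std::unordered_map<std::string, std::string> sparkConfs = {
      {"parquet.block.size", "3221225472"},
      {"parquet.block.rows", "3000000000"}};

  int64_t maxRowGroupBytes = 134217728; // default 128MB, as in makeParquetWriteOption
  int64_t maxRowGroupRows = 100000000;  // default 100M
  if (auto it = sparkConfs.find("parquet.block.size"); it != sparkConfs.end()) {
    maxRowGroupBytes = std::stoll(it->second); // no overflow at 3GB
  }
  if (auto it = sparkConfs.find("parquet.block.rows"); it != sparkConfs.end()) {
    maxRowGroupRows = std::stoll(it->second);
  }
  std::cout << maxRowGroupBytes << " " << maxRowGroupRows << std::endl;
  return 0;
}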