This is an automated email from the ASF dual-hosted git repository.

felixybw pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git


The following commit(s) were added to refs/heads/main by this push:
     new f26ce78c3a [VL] Fix stoi issue when getting parquet write options (#11504)
f26ce78c3a is described below

commit f26ce78c3a7ae1fe829496b6a7f301df79ed7a97
Author: Rex(Hui) An <[email protected]>
AuthorDate: Fri Jan 30 12:36:35 2026 +0800

    [VL] Fix stoi issue when getting parquet write options (#11504)
    
    The parquet write option values (block size, block rows, data page
    size) can exceed INT_MAX, so parsing them with std::stoi throws
    std::out_of_range; parse them with std::stoll instead.
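    
    For reference, a minimal standalone sketch (not from this patch) of
    the failure mode: std::stoi is bounded by int, so a value above
    INT_MAX (2147483647) throws std::out_of_range, while std::stoll
    parses the full 64-bit range:
    
        #include <cstdint>
        #include <iostream>
        #include <stdexcept>
        #include <string>
        
        int main() {
          const std::string blockSize = "3221225472"; // 3GB, above INT_MAX
          try {
            std::stoi(blockSize); // int cannot hold 3GB: throws
          } catch (const std::out_of_range&) {
            std::cout << "std::stoi: out_of_range" << std::endl;
          }
          int64_t bytes = std::stoll(blockSize); // 64-bit parse succeeds
          std::cout << bytes << std::endl;
          return 0;
        }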
---
 .../sql/execution/VeloxParquetWriteSuite.scala     | 44 ++++++++++++++++++++++
 cpp/velox/utils/VeloxWriterUtils.cc                |  6 +--
 2 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/backends-velox/src/test/scala/org/apache/spark/sql/execution/VeloxParquetWriteSuite.scala b/backends-velox/src/test/scala/org/apache/spark/sql/execution/VeloxParquetWriteSuite.scala
index 81c9870a52..4bc20142a1 100644
--- a/backends-velox/src/test/scala/org/apache/spark/sql/execution/VeloxParquetWriteSuite.scala
+++ b/backends-velox/src/test/scala/org/apache/spark/sql/execution/VeloxParquetWriteSuite.scala
@@ -164,4 +164,48 @@ class VeloxParquetWriteSuite extends VeloxWholeStageTransformerSuite {
       Assert.assertTrue(FallbackUtil.hasFallback(df.queryExecution.executedPlan))
     }
   }
+
+  test("test write parquet with custom block size, block rows and page size") {
+    val blockSize = 64 * 1024 * 1024L // 64MB
+    val blockRows = 10000000L // 10M
+    val pageSize = 1024 * 1024L // 1MB
+
+    withTempPath {
+      f =>
+        spark
+          .range(100)
+          .toDF("id")
+          .write
+          .format("parquet")
+          .option(GlutenConfig.PARQUET_BLOCK_SIZE, blockSize.toString)
+          .option(GlutenConfig.PARQUET_BLOCK_ROWS, blockRows.toString)
+          .option(GlutenConfig.PARQUET_DATAPAGE_SIZE, pageSize.toString)
+          .save(f.getCanonicalPath)
+
+        val parquetDf = spark.read.parquet(f.getCanonicalPath)
+        checkAnswer(parquetDf, spark.range(100).toDF("id"))
+    }
+  }
+
+  test("test write parquet with block size, block rows and page size exceeding INT_MAX") {
+    val largeBlockSize = 3L * 1024 * 1024 * 1024 // 3GB
+    val largeBlockRows = 3L * 1000 * 1000 * 1000 // 3 billion
+    val largePageSize = 3L * 1024 * 1024 * 1024 // 3GB
+
+    withTempPath {
+      f =>
+        spark
+          .range(100)
+          .toDF("id")
+          .write
+          .format("parquet")
+          .option(GlutenConfig.PARQUET_BLOCK_SIZE, largeBlockSize.toString)
+          .option(GlutenConfig.PARQUET_BLOCK_ROWS, largeBlockRows.toString)
+          .option(GlutenConfig.PARQUET_DATAPAGE_SIZE, largePageSize.toString)
+          .save(f.getCanonicalPath)
+
+        val parquetDf = spark.read.parquet(f.getCanonicalPath)
+        checkAnswer(parquetDf, spark.range(100).toDF("id"))
+    }
+  }
 }
diff --git a/cpp/velox/utils/VeloxWriterUtils.cc b/cpp/velox/utils/VeloxWriterUtils.cc
index fc4b810fb8..50e4ca601e 100644
--- a/cpp/velox/utils/VeloxWriterUtils.cc
+++ b/cpp/velox/utils/VeloxWriterUtils.cc
@@ -42,10 +42,10 @@ std::unique_ptr<WriterOptions> makeParquetWriteOption(const std::unordered_map<s
   int64_t maxRowGroupBytes = 134217728; // 128MB
   int64_t maxRowGroupRows = 100000000; // 100M
   if (auto it = sparkConfs.find(kParquetBlockSize); it != sparkConfs.end()) {
-    maxRowGroupBytes = static_cast<int64_t>(stoi(it->second));
+    maxRowGroupBytes = std::stoll(it->second);
   }
   if (auto it = sparkConfs.find(kParquetBlockRows); it != sparkConfs.end()) {
-    maxRowGroupRows = static_cast<int64_t>(stoi(it->second));
+    maxRowGroupRows = std::stoll(it->second);
   }
   auto writeOption = std::make_unique<WriterOptions>();
   writeOption->parquetWriteTimestampUnit = TimestampPrecision::kMicroseconds /*micro*/;
@@ -93,7 +93,7 @@ std::unique_ptr<WriterOptions> makeParquetWriteOption(const std::unordered_map<s
   writeOption->arrowMemoryPool =
       getDefaultMemoryManager()->getOrCreateArrowMemoryPool("VeloxParquetWrite.ArrowMemoryPool");
   if (auto it = sparkConfs.find(kParquetDataPageSize); it != sparkConfs.end()) {
-    auto dataPageSize = std::stoi(it->second);
+    auto dataPageSize = std::stoll(it->second);
     writeOption->dataPageSize = dataPageSize;
   }
   if (auto it = sparkConfs.find(kParquetWriterVersion); it != sparkConfs.end()) {
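
The guarded lookup-and-parse pattern used in the hunks above, as a
self-contained sketch with a hypothetical key and options struct (the
real kParquetBlockSize constant and WriterOptions type live in the
Velox/Gluten sources):

    #include <cstdint>
    #include <string>
    #include <unordered_map>

    // Hypothetical stand-ins for the Gluten constants and options type.
    const std::string kBlockSizeKey = "parquet.block.size";

    struct DemoWriterOptions {
      int64_t maxRowGroupBytes = 134217728; // 128MB default, as in the patch
    };

    DemoWriterOptions makeOptions(const std::unordered_map<std::string, std::string>& confs) {
      DemoWriterOptions opts;
      // C++17 if-with-initializer, same shape as the patched code:
      // override the default only when the key is present, parsing 64-bit.
      if (auto it = confs.find(kBlockSizeKey); it != confs.end()) {
        opts.maxRowGroupBytes = std::stoll(it->second);
      }
      return opts;
    }

Note that std::stoll itself still throws std::invalid_argument on
non-numeric input and std::out_of_range above INT64_MAX, so the values
are expected to be well-formed Spark config strings.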

