(parquet-java) branch master updated: Allow bytestreamsplit available via Hadoop Configuration (#3340)

gabor Sat, 18 Oct 2025 01:57:46 -0700

This is an automated email from the ASF dual-hosted git repository.

gabor pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-java.git



The following commit(s) were added to refs/heads/master by this push:
     new 9e231dca2 Allow bytestreamsplit available via Hadoop Configuration (#3340)
9e231dca2 is described below

commit 9e231dca2ad0b5c63ba8873ae09116360ebddfee
Author: Arnav Balyan <[email protected]>
AuthorDate: Mon Oct 6 13:08:15 2025 +0530

    Allow bytestreamsplit available via Hadoop Configuration (#3340)
---
 .../apache/parquet/hadoop/ParquetOutputFormat.java | 10 +++++
 .../hadoop/TestByteStreamSplitConfiguration.java   | 52 ++++++++++++++++++++++
 2 files changed, 62 insertions(+)

diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java
index 403666868..868ae634c 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java
@@ -80,6 +80,9 @@ import org.slf4j.LoggerFactory;
  * # To enable/disable dictionary encoding
  * parquet.enable.dictionary=true # false to disable dictionary encoding
  *
+ * # To enable/disable BYTE_STREAM_SPLIT encoding
+ * parquet.enable.bytestreamsplit=false # true to enable BYTE_STREAM_SPLIT encoding
+ *
  * # To enable/disable summary metadata aggregation at the end of a MR job
  * # The default is true (enabled)
  * parquet.enable.summary-metadata=true # false to disable summary aggregation
@@ -137,6 +140,7 @@ public class ParquetOutputFormat<T> extends FileOutputFormat<Void, T> {
   public static final String WRITE_SUPPORT_CLASS = 
"parquet.write.support.class";
   public static final String DICTIONARY_PAGE_SIZE = 
"parquet.dictionary.page.size";
   public static final String ENABLE_DICTIONARY = "parquet.enable.dictionary";
+  public static final String ENABLE_BYTE_STREAM_SPLIT = 
"parquet.enable.bytestreamsplit";
   public static final String VALIDATION = "parquet.validation";
   public static final String WRITER_VERSION = "parquet.writer.version";
   public static final String MEMORY_POOL_RATIO = "parquet.memory.pool.ratio";
@@ -270,6 +274,11 @@ public class ParquetOutputFormat<T> extends FileOutputFormat<Void, T> {
     return configuration.getBoolean(ENABLE_DICTIONARY, 
ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED);
   }
 
+  public static boolean getByteStreamSplitEnabled(Configuration configuration) 
{
+    return configuration.getBoolean(
+        ENABLE_BYTE_STREAM_SPLIT, 
ParquetProperties.DEFAULT_IS_BYTE_STREAM_SPLIT_ENABLED);
+  }
+
   public static int getMinRowCountForPageSizeCheck(Configuration 
configuration) {
     return configuration.getInt(
         MIN_ROW_COUNT_FOR_PAGE_SIZE_CHECK, 
ParquetProperties.DEFAULT_MINIMUM_RECORD_COUNT_FOR_CHECK);
@@ -503,6 +512,7 @@ public class ParquetOutputFormat<T> extends FileOutputFormat<Void, T> {
         .withPageSize(getPageSize(conf))
         .withDictionaryPageSize(getDictionaryPageSize(conf))
         .withDictionaryEncoding(getEnableDictionary(conf))
+        .withByteStreamSplitEncoding(getByteStreamSplitEnabled(conf))
         .withWriterVersion(getWriterVersion(conf))
         .estimateRowCountForPageSizeCheck(getEstimatePageSizeCheck(conf))
         .withMinRowCountForPageSizeCheck(getMinRowCountForPageSizeCheck(conf))
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestByteStreamSplitConfiguration.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestByteStreamSplitConfiguration.java
new file mode 100644
index 000000000..a756d167a
--- /dev/null
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestByteStreamSplitConfiguration.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.hadoop;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.parquet.column.ParquetProperties;
+import org.junit.Test;
+
+public class TestByteStreamSplitConfiguration {
+  @Test
+  public void testDefault() throws Exception {
+    Configuration conf = new Configuration();
+    // default should be false
+    assertEquals(
+        ParquetProperties.DEFAULT_IS_BYTE_STREAM_SPLIT_ENABLED,
+        ParquetOutputFormat.getByteStreamSplitEnabled(conf));
+  }
+
+  @Test
+  public void testSetTrue() throws Exception {
+    Configuration conf = new Configuration();
+    conf.setBoolean(ParquetOutputFormat.ENABLE_BYTE_STREAM_SPLIT, true);
+    assertTrue(ParquetOutputFormat.getByteStreamSplitEnabled(conf));
+  }
+
+  @Test
+  public void testSetFalse() throws Exception {
+    Configuration conf = new Configuration();
+    conf.setBoolean(ParquetOutputFormat.ENABLE_BYTE_STREAM_SPLIT, false);
+    assertFalse(ParquetOutputFormat.getByteStreamSplitEnabled(conf));
+  }
+}

(parquet-java) branch master updated: Allow bytestreamsplit available via Hadoop Configuration (#3340)

Reply via email to