This is an automated email from the ASF dual-hosted git repository.
gabor pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-java.git
The following commit(s) were added to refs/heads/master by this push:
new 9e231dca2 Allow bytestreamsplit available via Hadoop Configuration (#3340)
9e231dca2 is described below
commit 9e231dca2ad0b5c63ba8873ae09116360ebddfee
Author: Arnav Balyan <[email protected]>
AuthorDate: Mon Oct 6 13:08:15 2025 +0530
Allow bytestreamsplit available via Hadoop Configuration (#3340)
---
.../apache/parquet/hadoop/ParquetOutputFormat.java | 10 +++++
.../hadoop/TestByteStreamSplitConfiguration.java | 52 ++++++++++++++++++++++
2 files changed, 62 insertions(+)
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java
index 403666868..868ae634c 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java
@@ -80,6 +80,9 @@ import org.slf4j.LoggerFactory;
* # To enable/disable dictionary encoding
* parquet.enable.dictionary=true # false to disable dictionary encoding
*
+ * # To enable/disable BYTE_STREAM_SPLIT encoding
+ * parquet.enable.bytestreamsplit=false # true to enable BYTE_STREAM_SPLIT encoding
+ *
* # To enable/disable summary metadata aggregation at the end of a MR job
* # The default is true (enabled)
* parquet.enable.summary-metadata=true # false to disable summary aggregation
@@ -137,6 +140,7 @@ public class ParquetOutputFormat<T> extends FileOutputFormat<Void, T> {
public static final String WRITE_SUPPORT_CLASS = "parquet.write.support.class";
public static final String DICTIONARY_PAGE_SIZE = "parquet.dictionary.page.size";
public static final String ENABLE_DICTIONARY = "parquet.enable.dictionary";
+ public static final String ENABLE_BYTE_STREAM_SPLIT = "parquet.enable.bytestreamsplit";
public static final String VALIDATION = "parquet.validation";
public static final String WRITER_VERSION = "parquet.writer.version";
public static final String MEMORY_POOL_RATIO = "parquet.memory.pool.ratio";
@@ -270,6 +274,11 @@ public class ParquetOutputFormat<T> extends FileOutputFormat<Void, T> {
return configuration.getBoolean(ENABLE_DICTIONARY, ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED);
}
+ public static boolean getByteStreamSplitEnabled(Configuration configuration) {
+ return configuration.getBoolean(
+ ENABLE_BYTE_STREAM_SPLIT, ParquetProperties.DEFAULT_IS_BYTE_STREAM_SPLIT_ENABLED);
+ }
+
public static int getMinRowCountForPageSizeCheck(Configuration configuration) {
return configuration.getInt(
MIN_ROW_COUNT_FOR_PAGE_SIZE_CHECK, ParquetProperties.DEFAULT_MINIMUM_RECORD_COUNT_FOR_CHECK);
@@ -503,6 +512,7 @@ public class ParquetOutputFormat<T> extends FileOutputFormat<Void, T> {
.withPageSize(getPageSize(conf))
.withDictionaryPageSize(getDictionaryPageSize(conf))
.withDictionaryEncoding(getEnableDictionary(conf))
+ .withByteStreamSplitEncoding(getByteStreamSplitEnabled(conf))
.withWriterVersion(getWriterVersion(conf))
.estimateRowCountForPageSizeCheck(getEstimatePageSizeCheck(conf))
.withMinRowCountForPageSizeCheck(getMinRowCountForPageSizeCheck(conf))
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestByteStreamSplitConfiguration.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestByteStreamSplitConfiguration.java
new file mode 100644
index 000000000..a756d167a
--- /dev/null
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestByteStreamSplitConfiguration.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.hadoop;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.parquet.column.ParquetProperties;
+import org.junit.Test;
+
+public class TestByteStreamSplitConfiguration {
+ @Test
+ public void testDefault() throws Exception {
+ Configuration conf = new Configuration();
+ // default should be false
+ assertEquals(
+ ParquetProperties.DEFAULT_IS_BYTE_STREAM_SPLIT_ENABLED,
+ ParquetOutputFormat.getByteStreamSplitEnabled(conf));
+ }
+
+ @Test
+ public void testSetTrue() throws Exception {
+ Configuration conf = new Configuration();
+ conf.setBoolean(ParquetOutputFormat.ENABLE_BYTE_STREAM_SPLIT, true);
+ assertTrue(ParquetOutputFormat.getByteStreamSplitEnabled(conf));
+ }
+
+ @Test
+ public void testSetFalse() throws Exception {
+ Configuration conf = new Configuration();
+ conf.setBoolean(ParquetOutputFormat.ENABLE_BYTE_STREAM_SPLIT, false);
+ assertFalse(ParquetOutputFormat.getByteStreamSplitEnabled(conf));
+ }
+}