This is an automated email from the ASF dual-hosted git repository.
djwang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry-pxf.git
The following commit(s) were added to refs/heads/main by this push:
new ab7f9026 Update parquet to 1.12.3 in PXF
ab7f9026 is described below
commit ab7f902672ec6a579657e157d210cd912b5c210f
Author: Nikolay Antonov <[email protected]>
AuthorDate: Fri Feb 6 08:26:56 2026 +0500
Update parquet to 1.12.3 in PXF
Update parquet to 1.12.3 (latest version with hadoop-client-2.x)
* Add support for ZSTD compression
* Temporarily declare LZO as not supported. It causes the following error in
both current and pre-upgrade builds:
```
ERROR: PXF server error : Class com.hadoop.compression.lzo.LzoCodec was
not found (seg1 10.11.0.131:6000 pid=2567556)
```
* Add tests to cover different types of compression
---
.../pxf/automation/features/parquet/ParquetWriteTest.java | 15 +++++++++++++++
docs/content/hdfs_parquet.html.md.erb | 4 ++--
server/build.gradle | 3 ++-
server/gradle.properties | 2 +-
server/pxf-hdfs/build.gradle | 1 +
5 files changed, 21 insertions(+), 4 deletions(-)
diff --git
a/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/parquet/ParquetWriteTest.java
b/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/parquet/ParquetWriteTest.java
index 358d2233..18218eac 100644
---
a/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/parquet/ParquetWriteTest.java
+++
b/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/parquet/ParquetWriteTest.java
@@ -195,6 +195,21 @@ public class ParquetWriteTest extends BaseWritableFeature {
runWritePrimitivesScenario("pxf_parquet_write_primitives_gzip_classname",
"pxf_parquet_read_primitives_gzip_classname",
"parquet_write_primitives_gzip_classname", new
String[]{"COMPRESSION_CODEC=org.apache.hadoop.io.compress.GzipCodec"});
}
+ @Test(groups = {"features", "gpdb", "security", "hcfs"})
+ public void parquetWritePrimitivesSnappy() throws Exception {
+ runWritePrimitivesScenario("pxf_parquet_write_primitives_snappy",
"pxf_parquet_read_primitives_snappy", "parquet_write_primitives_snappy", new
String[]{"COMPRESSION_CODEC=snappy"});
+ }
+
+ @Test(groups = {"features", "gpdb", "security", "hcfs"})
+ public void parquetWritePrimitivesUncompressed() throws Exception {
+
runWritePrimitivesScenario("pxf_parquet_write_primitives_uncompressed",
"pxf_parquet_read_primitives_uncompressed",
"parquet_write_primitives_uncompressed", new
String[]{"COMPRESSION_CODEC=uncompressed"});
+ }
+
+ @Test(groups = {"features", "gpdb", "security", "hcfs"})
+ public void parquetWritePrimitivesZStd() throws Exception {
+ runWritePrimitivesScenario("pxf_parquet_write_primitives_zstd",
"pxf_parquet_read_primitives_zstd", "parquet_write_primitives_zstd", new
String[]{"COMPRESSION_CODEC=zstd"});
+ }
+
// Numeric precision not defined, test writing data precision in [1, 38].
All the data should be written correctly.
@Test(groups = {"features", "gpdb", "security", "hcfs"})
public void parquetWriteUndefinedPrecisionNumeric() throws Exception {
diff --git a/docs/content/hdfs_parquet.html.md.erb
b/docs/content/hdfs_parquet.html.md.erb
index 26ee4817..9ad05b78 100644
--- a/docs/content/hdfs_parquet.html.md.erb
+++ b/docs/content/hdfs_parquet.html.md.erb
@@ -23,7 +23,7 @@ under the License.
Use the PXF HDFS connector to read and write Parquet-format data. This section
describes how to read and write HDFS files that are stored in Parquet format,
including how to create, query, and insert into external tables that reference
files in the HDFS data store.
-PXF supports reading or writing Parquet files compressed with these codecs:
`snappy`, `gzip`, and `lzo`.
+PXF supports reading or writing Parquet files compressed with these codecs:
`snappy`, `gzip`, and `zstd`.
PXF currently supports reading and writing primitive Parquet data types only.
@@ -182,7 +182,7 @@ The PXF `hdfs:parquet` profile supports encoding- and
compression-related write
| Write Option | Value Description |
|-------|-------------------------------------|
-| COMPRESSION_CODEC | The compression codec alias. Supported compression
codecs for writing Parquet data include: `snappy`, `gzip`, `lzo`, and
`uncompressed` . If this option is not provided, PXF compresses the data using
`snappy` compression. |
+| COMPRESSION_CODEC | The compression codec alias. Supported compression
codecs for writing Parquet data include: `snappy`, `gzip`, `zstd`, and
`uncompressed` . If this option is not provided, PXF compresses the data using
`snappy` compression. |
| ROWGROUP_SIZE | A Parquet file consists of one or more row groups, a logical
partitioning of the data into rows. `ROWGROUP_SIZE` identifies the size (in
bytes) of the row group. The default row group size is `8 * 1024 * 1024` bytes.
|
| PAGE_SIZE | A row group consists of column chunks that are divided up into
pages. `PAGE_SIZE` is the size (in bytes) of such a page. The default page size
is `1 * 1024 * 1024` bytes. |
| ENABLE\_DICTIONARY | A boolean value that specifies whether or not to enable
dictionary encoding. The default value is `true`; dictionary encoding is
enabled when PXF writes Parquet files. |
diff --git a/server/build.gradle b/server/build.gradle
index 499a0b72..a1b6275c 100644
--- a/server/build.gradle
+++ b/server/build.gradle
@@ -164,7 +164,7 @@ configure(javaProjects) {
}
// Parquet dependencies
- dependency("org.apache.parquet:parquet-format:2.7.0")
+ dependency("org.apache.parquet:parquet-format:2.11.0")
dependencySet(group:"org.apache.parquet",
version:"${parquetVersion}") {
entry("parquet-column")
entry("parquet-common")
@@ -173,6 +173,7 @@ configure(javaProjects) {
entry("parquet-hadoop")
entry("parquet-jackson")
entry("parquet-pig")
+ entry("parquet-format-structures")
}
// Thrift dependencies
diff --git a/server/gradle.properties b/server/gradle.properties
index eb6191df..42da880a 100644
--- a/server/gradle.properties
+++ b/server/gradle.properties
@@ -23,7 +23,7 @@ hiveVersion=2.3.8
hiveStorageApiVersion=2.7.3
hbaseVersion=1.3.2
junitVersion=4.11
-parquetVersion=1.11.1
+parquetVersion=1.12.3
awsJavaSdk=1.12.261
springBootVersion=2.7.18
org.gradle.daemon=true
diff --git a/server/pxf-hdfs/build.gradle b/server/pxf-hdfs/build.gradle
index 673e528e..9705fb6f 100644
--- a/server/pxf-hdfs/build.gradle
+++ b/server/pxf-hdfs/build.gradle
@@ -38,6 +38,7 @@ dependencies {
implementation("org.apache.hadoop:hadoop-hdfs") {
transitive = false }
implementation("org.apache.hadoop:hadoop-hdfs-client") {
transitive = false }
implementation("org.apache.parquet:parquet-format") {
transitive = false }
+ implementation("org.apache.parquet:parquet-format-structures") {
transitive = false }
implementation("org.apache.parquet:parquet-column") {
transitive = false }
implementation("org.apache.parquet:parquet-common") {
transitive = false }
implementation("org.apache.parquet:parquet-encoding") {
transitive = false }
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]