soumilshah1995 opened a new issue, #12169:
URL: https://github.com/apache/hudi/issues/12169
I'm experiencing an issue with the Hudi configuration for the Parquet
compression codec. Despite setting the option
"hoodie.parquet.compression.codec": "GZIP" in my Hudi write options, the
output files in my data lake do not appear to be compressed; I only see
standard .parquet files.
Configuration:
Hudi Version: 1.0.0-beta2
Spark Version: 3.4
Java Version: OpenJDK 11
```
# Hudi write options under investigation: despite both compression keys
# below being set to GZIP, the output files look like ordinary .parquet.
# NOTE(review): Parquet compression is applied inside the file (per page),
# so a GZIP-compressed file keeps the .parquet extension — presumably the
# codec must be checked via the file footer/metadata, not the filename.
hudi_options = {
# Table identity and write behavior (values supplied by the caller).
'hoodie.table.name': table_name,
'hoodie.datasource.write.table.type': table_type,
'hoodie.datasource.write.table.name': table_name,
'hoodie.datasource.write.operation': method,
'hoodie.datasource.write.recordkey.field': recordkey,
'hoodie.datasource.write.precombine.field': precombine,
"hoodie.datasource.write.partitionpath.field": partition_fields,
# Both the Hudi-level and the raw Parquet-level codec keys are set.
"hoodie.parquet.compression.codec": "GZIP",
"parquet.compression.codec": "GZIP"
}
```
# Test Code
```
# Import everything the script needs; a failure is reported rather than
# raised so the notebook-style script keeps running.
try:
    import os
    import sys
    import uuid
    import pyspark
    import datetime  # module import; the name is shadowed by the class below
    from pyspark.sql import SparkSession
    from pyspark import SparkConf, SparkContext
    from faker import Faker
    # Shadows the `datetime` module with the `datetime` class — the rest of
    # the script calls datetime.now(), which needs the class.
    from datetime import datetime
    import random
    import pandas as pd  # Import Pandas library for pretty printing
    print("Imports loaded ")
except Exception as e:
    print("error", e)
# Versions used to select the matching Hudi Spark bundle from Maven.
HUDI_VERSION = '1.0.0-beta2'
SPARK_VERSION = '3.4'

# Point Spark at a local JDK 11 (macOS Homebrew path).
os.environ["JAVA_HOME"] = "/opt/homebrew/opt/openjdk@11"

# Tell pyspark to download the Hudi bundle when the JVM starts.
# (Reconstructed as one logical string — the pasted version had the
# f-string literal broken across lines, which is a SyntaxError.)
SUBMIT_ARGS = (
    f"--packages org.apache.hudi:hudi-spark{SPARK_VERSION}"
    f"-bundle_2.12:{HUDI_VERSION} pyspark-shell"
)
os.environ["PYSPARK_SUBMIT_ARGS"] = SUBMIT_ARGS
os.environ['PYSPARK_PYTHON'] = sys.executable
# Spark session
# Spark session wired for Hudi: Kryo serialization, the Hudi SQL session
# extension, and Hive's metastore Parquet conversion disabled.
spark = (
    SparkSession.builder
    .config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    .config('spark.sql.extensions', 'org.apache.spark.sql.hudi.HoodieSparkSessionExtension')
    .config('className', 'org.apache.hudi')
    .config('spark.sql.hive.convertMetastoreParquet', 'false')
    .getOrCreate()
)
def write_to_hudi(spark_df,
                  table_name,
                  db_name,
                  method='upsert',
                  table_type='COPY_ON_WRITE',
                  recordkey='',
                  precombine='',
                  partition_fields='',
                  index_type='BLOOM',
                  curr_region='us-east-1'
                  ):
    """Append *spark_df* to a local-filesystem Hudi table.

    The table lives under a fixed local base path, partitioned by
    database and table name.  ``index_type`` and ``curr_region`` are
    accepted for interface compatibility but are not currently used.
    """
    # Reconstructed as one logical string — the pasted version had the
    # assignment split across two lines, which is a SyntaxError.
    path = (
        f"file:///Users/soumilshah/IdeaProjects/SparkProject/tem/"
        f"database={db_name}/table_name={table_name}"
    )

    hudi_options = {
        'hoodie.table.name': table_name,
        'hoodie.datasource.write.table.type': table_type,
        'hoodie.datasource.write.table.name': table_name,
        'hoodie.datasource.write.operation': method,
        'hoodie.datasource.write.recordkey.field': recordkey,
        'hoodie.datasource.write.precombine.field': precombine,
        "hoodie.datasource.write.partitionpath.field": partition_fields,
        # Clustering: 1 GiB target file size, ~600 MiB small-file limit.
        "hoodie.clustering.plan.strategy.target.file.max.bytes": "1073741824",
        "hoodie.clustering.plan.strategy.small.file.limit": "629145600",
        "hoodie.clustering.execution.strategy.class":
            "org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy",
        "hoodie.clean.automatic": "true",
        "hoodie.parquet.max.file.size": 512 * 1024 * 1024,  # 512 MiB
        "hoodie.parquet.small.file.limit": 104857600,       # 100 MiB
        # Both the Hudi and raw Parquet codec keys; compression happens
        # inside the .parquet file, so the extension does not change.
        "hoodie.parquet.compression.codec": "GZIP",
        "parquet.compression.codec": "GZIP"
    }

    spark_df.write.format("hudi"). \
        options(**hudi_options). \
        mode("append"). \
        save(path)
from pyspark.sql.types import StructType, StructField, StringType, LongType

# Two nullable string columns for the toy "messages" table.
schema = StructType(
    [
        StructField("id", StringType(), True),
        StructField("message", StringType(), True),
    ]
)
# Write four one-row batches so repeated upserts hit the same table.
for i in range(1, 5):  # Number of iterations
    print("Epoch ", str(i))

    # One-row batch keyed by the iteration number; upserting the same
    # record key each time rewrites the row with a new message.
    updated_data = [(str(i), "Batch : {} ".format(i))]
    df = spark.createDataFrame(updated_data, schema)

    # Write to Hudi (unused epoch_time computation removed — it was dead
    # code in the pasted script).
    write_to_hudi(
        spark_df=df,
        method="upsert",
        db_name="default",
        table_name="messages",
        recordkey="id",
        precombine="message"
    )
```
# Output

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]