papablus commented on issue #8019:
URL: https://github.com/apache/hudi/issues/8019#issuecomment-1440254996
Hi @soumilshah1995,
This is my Glue job (Glue 4 - Spark 3.3 - Python 3):
```python
import sys
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
from pyspark.sql.functions import col, to_timestamp, monotonically_increasing_id, to_date, when
from pyspark.sql.functions import *
from awsglue.utils import getResolvedOptions
from pyspark.sql.types import *
from datetime import datetime, date
import boto3
from functools import reduce
from pyspark.sql import Row
args = getResolvedOptions(sys.argv, ['JOB_NAME',
                                     'OUTPUT_BUCKET',
                                     'HUDI_INIT_SORT_OPTION',
                                     'HUDI_TABLE_NAME',
                                     'HUDI_DB_NAME',
                                     'CATEGORY_ID'
                                     ])
print(args)
# print("The name of the bucket is :- ", args['curated_bucket'])
job_start_ts = datetime.now()
ts_format = '%Y-%m-%d %H:%M:%S'
spark = (SparkSession.builder
         .config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
         .config('spark.sql.hive.convertMetastoreParquet', 'false')
         .config('spark.sql.catalog.spark_catalog', 'org.apache.spark.sql.hudi.catalog.HoodieCatalog')
         .config('spark.sql.extensions', 'org.apache.spark.sql.hudi.HoodieSparkSessionExtension')
         .config('spark.sql.legacy.pathOptionBehavior.enabled', 'true')
         .getOrCreate())
sc = spark.sparkContext
glueContext = GlueContext(sc)
job = Job(glueContext)
logger = glueContext.get_logger()
job.init(args['JOB_NAME'], args)
def perform_hudi_bulk_insert(
        hudi_init_sort_option,
        hudi_output_bucket,
        hudi_table_name,
        hudi_db_name,
        category_id
):
    hudi_table_type = 'COPY_ON_WRITE'
    hudi_table_name = hudi_table_name.lower() + '_' + hudi_init_sort_option.lower()
    # create the schema for the ny trips raw data file in CSV format.
    # RECORD_KEY: C
    # PRECOMBINE FIELD: F
    # PARTITION FIELD: D
    # PySpark DataFrame with explicit schema.
    df = spark.createDataFrame([
        (10000001, '110000001', 'part1', 'date(1100000001, 81, 1)', 19, 201),
        (10000001, '110000001', 'part1', 'date(1100000001, 81, 1)', 19, 202),
        (20000001, '220000001', 'part1', 'date(2200000001, 82, 1)', 29, 202),
        (20000001, '220000001', 'part1', 'date(2200000001, 82, 1)', 29, 201),
        (30000001, '330000001', 'part1', 'date(3300000001, 83, 1)', 30, 203),
        (30000001, '330000001', 'part1', 'date(3300000001, 83, 1)', 32, 203),
        (30000001, '330000001', 'part1', 'date(3300000001, 83, 1)', 34, 203),
        (30000001, '330000001', 'part1', 'date(3300000001, 83, 1)', 36, 203),
        (40000001, '440000001', 'part1', 'date(4400000001, 84, 1)', 47, 888),
        (40000001, '440000001', 'part1', 'date(4400000001, 84, 1)', 43, 888),
        (40000001, '440000001', 'part1', 'date(4400000001, 84, 1)', 45, 888),
        (40000001, '440000001', 'part1', 'date(4400000001, 84, 1)', 41, 888)
    ],
        schema='a int, c string, d string, e string, f int, g int')
    # df = spark.createDataFrame([
    # ],
    #     schema='a int, c string, d string, e string, f int, g int')
    # show table
    df.show()
    # show schema
    df.printSchema()
    RECORD_KEY = "c"
    PARTITION_FIELD = "d"
    PRECOMBINE_FIELD = "f"
    table_schema = StructType([
        StructField("a", IntegerType(), True),
        StructField("b", DoubleType(), True),
        StructField("c", StringType(), True),
        StructField("d", DateType(), True),
        StructField("e", DateType(), True),
        StructField("f", IntegerType(), True)
    ])
    # table_schema2 = '{"type" : "record","name" : "userInfo","namespace" : "my.example","fields" : [{"name" : "a", "type" : "int"},{"name" : "b", "type" : "int"},{"name" : "c", "type" : "string"},{"name" : "d", "type" : "string"},{"name" : "e", "type" : "string"},{"name" : "f", "type" : "int"}]}'
    curr_session = boto3.session.Session()
    curr_region = curr_session.region_name
    # set the hudi configuration
    hudi_part_write_config = {
        'className': 'org.apache.hudi',
        'hoodie.datasource.hive_sync.enable': 'true',
        'hoodie.datasource.hive_sync.use_jdbc': 'false',
        'hoodie.datasource.hive_sync.support_timestamp': 'false',
        'hoodie.datasource.write.operation': 'bulk_insert',
        'hoodie.datasource.write.table.type': hudi_table_type,
        'hoodie.table.name': hudi_table_name,
        'hoodie.datasource.hive_sync.table': hudi_table_name,
        'hoodie.datasource.write.recordkey.field': RECORD_KEY,
        'hoodie.datasource.write.precombine.field': PRECOMBINE_FIELD,
        'hoodie.datasource.write.partitionpath.field': 'd:SIMPLE',
        'hoodie.datasource.write.hive_style_partitioning': 'true',
        'hoodie.datasource.hive_sync.partition_extractor_class': 'org.apache.hudi.hive.MultiPartKeysValueExtractor',
        'hoodie.datasource.hive_sync.partition_fields': PARTITION_FIELD,
        'hoodie.datasource.hive_sync.database': hudi_db_name,
        'hoodie.datasource.write.keygenerator.class': 'org.apache.hudi.keygen.CustomKeyGenerator',
        'hoodie.datasource.write.keygenerator.consistent.logical.timestamp.enabled': 'true',
        'hoodie.write.concurrency.mode': 'single_writer',
        'hoodie.cleaner.policy.failed.writes': 'EAGER',
        'hoodie.combine.before.insert': 'true',
        'hoodie.cleaner.policy': 'KEEP_LATEST_COMMITS',
        'hoodie.clean.automatic': 'true',
        # 'hoodie.clean.async': 'true',
        'hoodie.cleaner.commits.retained': 4,
        'hoodie.datasource.hive_sync.mode': 'hms'
    }
    table_path = 's3://{}/{}'.format(hudi_output_bucket, hudi_table_name)
    print('The path for Hudi table where data is stored', table_path)
    # only set this hudi parameter if a bulk insert sort option other than the default is chosen
    if hudi_init_sort_option.upper() in ['PARTITION_SORT', 'NONE']:
        hudi_part_write_config['hoodie.bulkinsert.sort.mode'] = hudi_init_sort_option
    start_tm = datetime.now()
    print('The start time for the Bulk Insert :-', start_tm)
    df.write.format("hudi").options(**hudi_part_write_config).mode("append").save(table_path)
    end_tm = datetime.now()
    print('The End time for the Bulk Insert :-', end_tm)
    time_diff = end_tm - start_tm
    print('The time it took for Bulk Insert operation :-', time_diff)
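    # Optional check (not part of the original job): a Hudi snapshot read of the table
    # can confirm the bulk insert landed and expose the Hudi metadata columns.
    # spark.read.format("hudi").load(<path>) is the standard Hudi DataFrame read;
    # uncomment to use.
    # verify_df = spark.read.format("hudi").load(table_path)
    # print('Rows written :-', verify_df.count())
    # verify_df.select('_hoodie_commit_time', RECORD_KEY, PARTITION_FIELD, PRECOMBINE_FIELD).show()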
    # Create the savepoint
    # varSP = spark.sql("call create_savepoint('hudi2.live_demo_restore_global_sort', '20230124192052120')")
    # print(varSP.show())
    # Delete the savepoint
    # varSP0 = spark.sql("call delete_savepoint('hudi2.live_demo_restore_global_sort', '20230124192052120')")
    # print(varSP0.show())
    # Show the savepoints
    # varSP2 = spark.sql("call show_savepoints('hudi2.live_demo_restore_global_sort')")
    # print(varSP2.show())
    # Delete the savepoint
    # varSP0 = spark.sql("call delete_savepoint('hudi2.live_demo_restore_global_sort', '20230118154353907')")
    # print(varSP0.show())
    # Restore to the savepoint
    # varSP = spark.sql("call rollback_to_savepoint('hudi2.live_demo_restore_global_sort', '20230124192052120')")
    # print(varSP.show())
if __name__ == "__main__":
    try:
        perform_hudi_bulk_insert(args['HUDI_INIT_SORT_OPTION'],
                                 args['OUTPUT_BUCKET'],
                                 args['HUDI_TABLE_NAME'],
                                 args['HUDI_DB_NAME'],
                                 args['CATEGORY_ID']
                                 )
    except Exception as e:
        print(e)
        raise
    job.commit()
```
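For anyone trying to reproduce this, the script reads its parameters through `getResolvedOptions`, so they have to be supplied as Glue job arguments. Below is a minimal sketch of how the run could be started from boto3; the job name, bucket, database, table, and category values are placeholders I made up, not values from this issue.

```python
import boto3

# Hypothetical invocation of the Glue job above; all values are placeholders.
glue = boto3.client('glue')
response = glue.start_job_run(
    JobName='my-hudi-bulk-insert-job',   # replace with the real Glue job name
    Arguments={                          # Glue forwards these to getResolvedOptions
        '--OUTPUT_BUCKET': 'my-output-bucket',
        '--HUDI_INIT_SORT_OPTION': 'GLOBAL_SORT',
        '--HUDI_TABLE_NAME': 'live_demo_restore',
        '--HUDI_DB_NAME': 'hudi2',
        '--CATEGORY_ID': '201',
    },
)
print(response['JobRunId'])
```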