nsivabalan commented on issue #3582:
URL: https://github.com/apache/hudi/issues/3582#issuecomment-931414918


   I got it working; I'm not sure what the issue was earlier.
   Here is the script.
   
   Non-partitioned.
   ```
   
   val df = 
spark.read.format("parquet").load("/Users/nsb/Documents/personal/datasets/parquet_ny_4parts/")
   
   df.printSchema
   root
    |-- VendorID: integer (nullable = true)
    |-- tpep_pickup_datetime: string (nullable = true)
    |-- tpep_dropoff_datetime: string (nullable = true)
    |-- passenger_count: integer (nullable = true)
    |-- trip_distance: double (nullable = true)
    |-- RatecodeID: integer (nullable = true)
    |-- store_and_fwd_flag: string (nullable = true)
    |-- PULocationID: integer (nullable = true)
    |-- DOLocationID: integer (nullable = true)
    |-- payment_type: integer (nullable = true)
    |-- fare_amount: double (nullable = true)
    |-- extra: double (nullable = true)
    |-- mta_tax: double (nullable = true)
    |-- tip_amount: double (nullable = true)
    |-- tolls_amount: double (nullable = true)
    |-- improvement_surcharge: double (nullable = true)
    |-- total_amount: double (nullable = true)
    |-- congestion_surcharge: double (nullable = true)
    |-- date_col: string (nullable = true)
   
   val basePath = "/tmp/bootstrap_test"
   
   val bootstrapDF = spark.emptyDataFrame
   
   bootstrapDF.write.
         format("hudi").
         option(HoodieWriteConfig.TABLE_NAME, "bootstrap_test").
         option(DataSourceWriteOptions.OPERATION_OPT_KEY, 
DataSourceWriteOptions.BOOTSTRAP_OPERATION_OPT_VAL).
         option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, 
"tpep_pickup_datetime").
         option(HoodieBootstrapConfig.BOOTSTRAP_BASE_PATH_PROP, 
"/Users/nsb/Documents/personal/datasets/parquet_ny_4parts/").
         option(HoodieBootstrapConfig.BOOTSTRAP_KEYGEN_CLASS, 
"org.apache.hudi.keygen.NonpartitionedKeyGenerator").
         option("hoodie.bootstrap.parallelism","10").
         mode(SaveMode.Overwrite).
         save(basePath)
         
   
   var hudiDf = spark.
           read.
           format("hudi").
           load(basePath + "/*")
   
   hudiDf.registerTempTable("hudi_tbl")
   
   spark.sql("select count(distinct tpep_pickup_datetime) from hudi_tbl").show()
   spark.sql("select * from hudi_tbl limit 10").show(false)
   
   ```
   
   
   And here is what worked for the partitioned dataset.
   ```
   
    val basePath = "/tmp/bootstrap_test_partitioned"
   
   val bootstrapDF = spark.emptyDataFrame
   
   
   // worked as well. 
   
   bootstrapDF.write.
         format("hudi").
         option(HoodieWriteConfig.TABLE_NAME, "bootstrap_test").
         option(DataSourceWriteOptions.OPERATION_OPT_KEY, 
DataSourceWriteOptions.BOOTSTRAP_OPERATION_OPT_VAL).
         option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, 
"tpep_pickup_datetime").
         option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "date_col").
         option("hoodie.datasource.write.precombine.field", 
"tpep_dropoff_datetime").
         option(HoodieBootstrapConfig.BOOTSTRAP_BASE_PATH_PROP, 
"/Users/nsb/Documents/personal/datasets/parquet_ny_4parts/").
         option(HoodieBootstrapConfig.BOOTSTRAP_KEYGEN_CLASS, 
"org.apache.hudi.keygen.SimpleKeyGenerator").
         option("hoodie.bootstrap.parallelism","50").
         mode(SaveMode.Overwrite).
         save(basePath)
         
   
   
   var hudiDf1 = spark.
           read.
           format("hudi").
           load(basePath + "/*")
   
   hudiDf1.registerTempTable("hudi_tbl1")
   
   spark.sql("select count(distinct tpep_pickup_datetime) from 
hudi_tbl1").show()
   spark.sql("select * from hudi_tbl1 limit 10").show(false)
   
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to