nsivabalan edited a comment on issue #3582:
URL: https://github.com/apache/hudi/issues/3582#issuecomment-931414918


   I got it working; I'm not sure what the issue was earlier. 
   Here is the script. 
   
   Non-partitioned.
   ```
   
   val df = 
spark.read.format("parquet").load("/Users/nsb/Documents/personal/datasets/parquet_ny_4parts/")
   
   df.printSchema
   root
    |-- VendorID: integer (nullable = true)
    |-- tpep_pickup_datetime: string (nullable = true)
    |-- tpep_dropoff_datetime: string (nullable = true)
    |-- passenger_count: integer (nullable = true)
    |-- trip_distance: double (nullable = true)
    |-- RatecodeID: integer (nullable = true)
    |-- store_and_fwd_flag: string (nullable = true)
    |-- PULocationID: integer (nullable = true)
    |-- DOLocationID: integer (nullable = true)
    |-- payment_type: integer (nullable = true)
    |-- fare_amount: double (nullable = true)
    |-- extra: double (nullable = true)
    |-- mta_tax: double (nullable = true)
    |-- tip_amount: double (nullable = true)
    |-- tolls_amount: double (nullable = true)
    |-- improvement_surcharge: double (nullable = true)
    |-- total_amount: double (nullable = true)
    |-- congestion_surcharge: double (nullable = true)
    |-- date_col: string (nullable = true)
   
   val basePath = "/tmp/bootstrap_test"
   
   val bootstrapDF = spark.emptyDataFrame
   
   bootstrapDF.write.
         format("hudi").
         option(HoodieWriteConfig.TABLE_NAME, "bootstrap_test").
         option(DataSourceWriteOptions.OPERATION_OPT_KEY, 
DataSourceWriteOptions.BOOTSTRAP_OPERATION_OPT_VAL).
         option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, 
"tpep_pickup_datetime").
         option(HoodieBootstrapConfig.BOOTSTRAP_BASE_PATH_PROP, 
"/Users/nsb/Documents/personal/datasets/parquet_ny_4parts/").
         option(HoodieBootstrapConfig.BOOTSTRAP_KEYGEN_CLASS, 
"org.apache.hudi.keygen.NonpartitionedKeyGenerator").
         option("hoodie.bootstrap.parallelism","10").
         mode(SaveMode.Overwrite).
         save(basePath)
         
   
   var hudiDf = spark.
           read.
           format("hudi").
           load(basePath + "/*")
   
   hudiDf.registerTempTable("hudi_tbl")
   
   spark.sql("select count(distinct tpep_pickup_datetime) from hudi_tbl").show()
   spark.sql("select * from hudi_tbl limit 10").show(false)
   
   ```
   
   
   And here is what worked for the partitioned dataset.
   ```
   
    val basePath = "/tmp/bootstrap_test_partitioned"
   
   val bootstrapDF = spark.emptyDataFrame
   
   
   // worked as well. 
   
   bootstrapDF.write.
         format("hudi").
         option(HoodieWriteConfig.TABLE_NAME, "bootstrap_test").
         option(DataSourceWriteOptions.OPERATION_OPT_KEY, 
DataSourceWriteOptions.BOOTSTRAP_OPERATION_OPT_VAL).
         option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, 
"tpep_pickup_datetime").
         option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "date_col").
         option("hoodie.datasource.write.precombine.field", 
"tpep_dropoff_datetime").
         option(HoodieBootstrapConfig.BOOTSTRAP_BASE_PATH_PROP, 
"/Users/nsb/Documents/personal/datasets/parquet_ny_4parts/").
         option(HoodieBootstrapConfig.BOOTSTRAP_KEYGEN_CLASS, 
"org.apache.hudi.keygen.SimpleKeyGenerator").
         option("hoodie.bootstrap.parallelism","50").
         mode(SaveMode.Overwrite).
         save(basePath)
         
   
   
   var hudiDf1 = spark.
           read.
           format("hudi").
           load(basePath + "/*")
   
   hudiDf1.registerTempTable("hudi_tbl1")
   
   spark.sql("select count(distinct tpep_pickup_datetime) from 
hudi_tbl1").show()
   spark.sql("select * from hudi_tbl1 limit 10").show(false)
   
   ```
   
   An upsert following this:
   
   ```
    val df = 
spark.read.format("parquet").load("/Users/nsb/Documents/personal/datasets/parquet_ny_4parts/").limit(1000)
   import org.apache.hudi.QuickstartUtils._
   import org.apache.spark.sql.SaveMode._
   
   
   df.write.format("hudi").
     options(getQuickstartWriteConfigs).
     option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, 
"tpep_dropoff_datetime").
     option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, 
"tpep_pickup_datetime").
     option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "date_col").
     option(HoodieWriteConfig.TABLE_NAME, "bootstrap_test").
     mode(Append).
     save(basePath)
   ```
   
   Contents of .hoodie/
   ```
   ls -ltr /tmp/bootstrap_test_partitioned/.hoodie
   total 72
   drwxr-xr-x  2 nsb  wheel     64 Sep 30 11:10 archived
   -rw-r--r--  1 nsb  wheel    583 Sep 30 11:10 hoodie.properties
   drwxr-xr-x  4 nsb  wheel    128 Sep 30 11:10 metadata
   -rw-r--r--  1 nsb  wheel      0 Sep 30 11:10 00000000000001.commit.requested
   -rw-r--r--  1 nsb  wheel      0 Sep 30 11:10 00000000000001.inflight
   -rw-r--r--  1 nsb  wheel  20168 Sep 30 11:10 00000000000001.commit
   -rw-r--r--  1 nsb  wheel      0 Sep 30 11:44 20210930114415.commit.requested
   -rw-r--r--  1 nsb  wheel   3152 Sep 30 11:44 20210930114415.inflight
   -rw-r--r--  1 nsb  wheel   5791 Sep 30 11:44 20210930114415.commit
   ```
   
   
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to