Github user chenliang613 commented on the issue:
https://github.com/apache/carbondata/pull/1660
@anubhav100 @sounakr you can also use my example script to reproduce it.
This example simulates 7,500,000 rows of data and can reproduce issue 1728, and this PR
also fixes the issue. Please @sounakr double-check it again.
@anubhav100 I still have a question: why do we need to append "return true" after
"blockletDetails.get(index).addDeletedRows(blocklet.getDeletedRows());" ?
---------------------------------------------------------------------------------------
package org.apache.carbondata.examples
import java.io.File
import java.text.SimpleDateFormat
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.SparkSession
import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.util.CarbonProperties
object DataUpdateDeleteExample {

  /**
   * Reproduction script: generates 7,500,000 rows, loads them into CarbonData
   * table `t3`, deletes every row via an EXISTS subquery, and shows the row
   * count afterwards (expected 0 once the delete bug is fixed).
   */
  def main(args: Array[String]): Unit = {
    // for local files
    val rootPath = new File(this.getClass.getResource("/").getPath
      + "../../../..").getCanonicalPath
    // for hdfs files
    // val rootPath = "hdfs://hdfs-host/carbon"
    // These paths are never reassigned, so declare them as vals.
    val storeLocation = s"$rootPath/examples/spark2/target/store"
    val warehouse = s"$rootPath/examples/spark2/target/warehouse"

    import org.apache.spark.sql.CarbonSession._
    val spark = SparkSession
      .builder()
      .master("local")
      .appName("DataUpdateDeleteExample")
      .config("spark.sql.warehouse.dir", warehouse)
      .config("spark.driver.host", "localhost")
      .config("spark.sql.crossJoin.enabled", "true")
      .getOrCreateCarbonSession(storeLocation)
    spark.sparkContext.setLogLevel("WARN")

    // Specify date format based on raw data
    CarbonProperties.getInstance()
      .addProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, "yyyy-MM-dd")

    import spark.implicits._
    // Drop table in case it survived a previous run
    spark.sql("DROP TABLE IF EXISTS t3")

    // Simulate 7,500,000 rows and write them to table t3.
    // NOTE(review): SimpleDateFormat is not thread-safe; this is safe here only
    // because each Spark task deserializes its own copy of the closure (and
    // therefore of sdf) — confirm if tasks ever share one instance.
    val sdf = new SimpleDateFormat("yyyy-MM-dd")
    val df = spark.sparkContext.parallelize(1 to 7500000)
      .map { x =>
        // t3_date cycles through 2015-07-10 .. 2015-07-19
        (x, new java.sql.Date(sdf.parse("2015-07-" + (x % 10 + 10)).getTime),
          "china", "aaa" + x, "phone" + 555 * x, "ASD" + (60000 + x), 14999 + x)
      }
      .toDF("t3_id", "t3_date", "t3_country", "t3_name",
        "t3_phonetype", "t3_serialname", "t3_salary")
    df.write
      .format("carbondata")
      .option("tableName", "t3")
      .option("tempCSV", "true")
      .option("compress", "true")
      .mode(SaveMode.Overwrite)
      .save()

    // Show the freshly loaded data
    spark.sql("""
           SELECT * FROM t3 ORDER BY t3_id
           """).show()

    // Delete every row: the EXISTS subquery is true whenever t3 is non-empty
    spark.sql("delete from t3 where exists (select 1 from t3)").show()

    // Count should be 0 after the delete above
    spark.sql("""
           SELECT count(*) FROM t3
           """).show()

    // Clean up
    spark.sql("DROP TABLE IF EXISTS t3")
    spark.stop()
  }
}
---