[ 
https://issues.apache.org/jira/browse/HUDI-8553?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Y Ethan Guo updated HUDI-8553:
------------------------------
    Sprint: Hudi 1.0 Blockers+Bugs Sprint

> Spark SQL UPDATE and DELETE should write record positions
> ---------------------------------------------------------
>
>                 Key: HUDI-8553
>                 URL: https://issues.apache.org/jira/browse/HUDI-8553
>             Project: Apache Hudi
>          Issue Type: Bug
>            Reporter: Y Ethan Guo
>            Assignee: Jonathan Vexler
>            Priority: Blocker
>             Fix For: 1.0.0
>
>
> Although no read or write errors occur, Spark SQL UPDATE and DELETE do not 
> write record positions to the log files.
> {code:java}
> spark-sql (default)> CREATE TABLE testing_positions.table2 (
>                    >     ts BIGINT,
>                    >     uuid STRING,
>                    >     rider STRING,
>                    >     driver STRING,
>                    >     fare DOUBLE,
>                    >     city STRING
>                    > ) USING HUDI
>                    > LOCATION 
> 'file:///Users/ethan/Work/tmp/hudi-1.0.0-testing/positional/table2'
>                    > TBLPROPERTIES (
>                    >   type = 'mor',
>                    >   primaryKey = 'uuid',
>                    >   preCombineField = 'ts'
>                    > )
>                    > PARTITIONED BY (city);
> 24/11/16 12:03:26 WARN TableSchemaResolver: Could not find any data file 
> written for commit, so could not get schema for table 
> file:/Users/ethan/Work/tmp/hudi-1.0.0-testing/positional/table2
> Time taken: 0.4 seconds
> spark-sql (default)> INSERT INTO testing_positions.table2
>                    > VALUES
>                    > 
> (1695159649087,'334e26e9-8355-45cc-97c6-c31daf0df330','rider-A','driver-K',19.10,'san_francisco'),
>                    > 
> (1695091554788,'e96c4396-3fad-413a-a942-4cb36106d721','rider-C','driver-M',27.70
>  ,'san_francisco'),
>                    > 
> (1695046462179,'9909a8b1-2d15-4d3d-8ec9-efc48c536a00','rider-D','driver-L',33.90
>  ,'san_francisco'),
>                    > 
> (1695332066204,'1dced545-862b-4ceb-8b43-d2a568f6616b','rider-E','driver-O',93.50,'san_francisco'),
>                    > 
> (1695516137016,'e3cf430c-889d-4015-bc98-59bdce1e530c','rider-F','driver-P',34.15,'sao_paulo'
>     ),
>                    > 
> (1695376420876,'7a84095f-737f-40bc-b62f-6b69664712d2','rider-G','driver-Q',43.40
>  ,'sao_paulo'    ),
>                    > 
> (1695173887231,'3eeb61f7-c2b0-4636-99bd-5d7a5a1d2c04','rider-I','driver-S',41.06
>  ,'chennai'      ),
>                    > 
> (1695115999911,'c8abbe79-8d89-47ea-b4ce-4d224bae5bfa','rider-J','driver-T',17.85,'chennai');
> 24/11/16 12:03:26 WARN TableSchemaResolver: Could not find any data file 
> written for commit, so could not get schema for table 
> file:/Users/ethan/Work/tmp/hudi-1.0.0-testing/positional/table2
> 24/11/16 12:03:26 WARN TableSchemaResolver: Could not find any data file 
> written for commit, so could not get schema for table 
> file:/Users/ethan/Work/tmp/hudi-1.0.0-testing/positional/table2
> 24/11/16 12:03:29 WARN log: Updating partition stats fast for: table2_ro
> 24/11/16 12:03:29 WARN log: Updated size to 436166
> 24/11/16 12:03:29 WARN log: Updating partition stats fast for: table2_ro
> 24/11/16 12:03:29 WARN log: Updating partition stats fast for: table2_ro
> 24/11/16 12:03:29 WARN log: Updated size to 436185
> 24/11/16 12:03:29 WARN log: Updated size to 436386
> 24/11/16 12:03:30 WARN log: Updating partition stats fast for: table2_rt
> 24/11/16 12:03:30 WARN log: Updating partition stats fast for: table2_rt
> 24/11/16 12:03:30 WARN log: Updated size to 436166
> 24/11/16 12:03:30 WARN log: Updated size to 436386
> 24/11/16 12:03:30 WARN log: Updating partition stats fast for: table2_rt
> 24/11/16 12:03:30 WARN log: Updated size to 436185
> 24/11/16 12:03:30 WARN log: Updating partition stats fast for: table2
> 24/11/16 12:03:30 WARN log: Updated size to 436166
> 24/11/16 12:03:30 WARN log: Updating partition stats fast for: table2
> 24/11/16 12:03:30 WARN log: Updated size to 436386
> 24/11/16 12:03:30 WARN log: Updating partition stats fast for: table2
> 24/11/16 12:03:30 WARN log: Updated size to 436185
> 24/11/16 12:03:30 WARN HiveConf: HiveConf of name 
> hive.internal.ss.authz.settings.applied.marker does not exist
> 24/11/16 12:03:30 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout 
> does not exist
> 24/11/16 12:03:30 WARN HiveConf: HiveConf of name hive.stats.retries.wait 
> does not exist
> Time taken: 4.843 seconds
> spark-sql (default)> 
>                    > SET hoodie.merge.small.file.group.candidates.limit = 0;
> hoodie.merge.small.file.group.candidates.limit    0
> Time taken: 0.018 seconds, Fetched 1 row(s)
> spark-sql (default)> 
>                    > UPDATE testing_positions.table2 SET fare = 20.0 WHERE 
> rider = 'rider-A';
> 24/11/16 12:03:31 WARN SparkStringUtils: Truncated the string representation 
> of a plan since it was too large. This behavior can be adjusted by setting 
> 'spark.sql.debug.maxToStringFields'.
> 24/11/16 12:03:32 WARN HoodieFileIndex: Data skipping requires both Metadata 
> Table and at least one of Column Stats Index, Record Level Index, or 
> Functional Index to be enabled as well! (isMetadataTableEnabled = false, 
> isColumnStatsIndexEnabled = false, isRecordIndexApplicable = false, 
> isFunctionalIndexEnabled = false, isBucketIndexEnable = false, 
> isPartitionStatsIndexEnabled = false), isBloomFiltersIndexEnabled = false)
> 24/11/16 12:03:32 WARN HoodieDataBlock: There are records without valid 
> positions. Skip writing record positions to the data block header.
> 24/11/16 12:03:34 WARN HiveConf: HiveConf of name 
> hive.internal.ss.authz.settings.applied.marker does not exist
> 24/11/16 12:03:34 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout 
> does not exist
> 24/11/16 12:03:34 WARN HiveConf: HiveConf of name hive.stats.retries.wait 
> does not exist
> Time taken: 5.545 seconds
> spark-sql (default)> 
>                    > DELETE FROM testing_positions.table2 WHERE uuid = 
> 'e3cf430c-889d-4015-bc98-59bdce1e530c';
> 24/11/16 12:03:37 WARN HoodieFileIndex: Data skipping requires both Metadata 
> Table and at least one of Column Stats Index, Record Level Index, or 
> Functional Index to be enabled as well! (isMetadataTableEnabled = false, 
> isColumnStatsIndexEnabled = false, isRecordIndexApplicable = false, 
> isFunctionalIndexEnabled = false, isBucketIndexEnable = false, 
> isPartitionStatsIndexEnabled = false), isBloomFiltersIndexEnabled = false)
> 24/11/16 12:03:37 WARN HoodiePositionBasedFileGroupRecordBuffer: No record 
> position info is found when attempt to do position based merge.
> 24/11/16 12:03:37 WARN HoodiePositionBasedFileGroupRecordBuffer: Falling back 
> to key based merge for Read
> 24/11/16 12:03:38 WARN HoodieDeleteBlock: There are delete records without 
> valid positions. Skip writing record positions to the delete block header.
> 24/11/16 12:03:39 WARN HiveConf: HiveConf of name 
> hive.internal.ss.authz.settings.applied.marker does not exist
> 24/11/16 12:03:39 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout 
> does not exist
> 24/11/16 12:03:39 WARN HiveConf: HiveConf of name hive.stats.retries.wait 
> does not exist
> Time taken: 2.992 seconds
> spark-sql (default)> 
>                    > select * from testing_positions.table2;
> 24/11/16 12:03:41 WARN HoodiePositionBasedFileGroupRecordBuffer: No record 
> position info is found when attempt to do position based merge.
> 24/11/16 12:03:41 WARN HoodiePositionBasedFileGroupRecordBuffer: No record 
> position info is found when attempt to do position based merge.
> 24/11/16 12:03:41 WARN HoodiePositionBasedFileGroupRecordBuffer: Falling back 
> to key based merge for Read
> 24/11/16 12:03:41 WARN HoodiePositionBasedFileGroupRecordBuffer: Falling back 
> to key based merge for Read
> 20241116120326527    20241116120326527_0_0    
> 1dced545-862b-4ceb-8b43-d2a568f6616b    city=san_francisco    
> 1ba64ef0-bba2-469e-8ef5-696f8cdbe141-0_0-186-338_20241116120326527.parquet    
> 1695332066204    1dced545-862b-4ceb-8b43-d2a568f6616b    rider-E    driver-O    
> 93.5    san_francisco
> 20241116120326527    20241116120326527_0_1    
> e96c4396-3fad-413a-a942-4cb36106d721    city=san_francisco    
> 1ba64ef0-bba2-469e-8ef5-696f8cdbe141-0_0-186-338_20241116120326527.parquet    
> 1695091554788    e96c4396-3fad-413a-a942-4cb36106d721    rider-C    driver-M    
> 27.7    san_francisco
> 20241116120326527    20241116120326527_0_2    
> 9909a8b1-2d15-4d3d-8ec9-efc48c536a00    city=san_francisco    
> 1ba64ef0-bba2-469e-8ef5-696f8cdbe141-0_0-186-338_20241116120326527.parquet    
> 1695046462179    9909a8b1-2d15-4d3d-8ec9-efc48c536a00    rider-D    driver-L    
> 33.9    san_francisco
> 20241116120331896    20241116120331896_0_9    
> 334e26e9-8355-45cc-97c6-c31daf0df330    city=san_francisco    
> 1ba64ef0-bba2-469e-8ef5-696f8cdbe141-0    1695159649087    
> 334e26e9-8355-45cc-97c6-c31daf0df330    rider-A    driver-K    20.0    
> san_francisco
> 20241116120326527    20241116120326527_1_1    
> 7a84095f-737f-40bc-b62f-6b69664712d2    city=sao_paulo    
> ba555452-0c3c-47dc-acc0-f90823e12408-0_1-186-339_20241116120326527.parquet    
> 1695376420876    7a84095f-737f-40bc-b62f-6b69664712d2    rider-G    driver-Q  
>   43.4    sao_paulo
> 20241116120326527    20241116120326527_2_0    
> 3eeb61f7-c2b0-4636-99bd-5d7a5a1d2c04    city=chennai    
> 8dacb2f9-6901-4ab3-8139-697b51125f16-0_2-186-340_20241116120326527.parquet    
> 1695173887231    3eeb61f7-c2b0-4636-99bd-5d7a5a1d2c04    rider-I    driver-S  
>   41.06    chennai
> 20241116120326527    20241116120326527_2_1    
> c8abbe79-8d89-47ea-b4ce-4d224bae5bfa    city=chennai    
> 8dacb2f9-6901-4ab3-8139-697b51125f16-0_2-186-340_20241116120326527.parquet    
> 1695115999911    c8abbe79-8d89-47ea-b4ce-4d224bae5bfa    rider-J    driver-T  
>   17.85    chennai
> Time taken: 1.719 seconds, Fetched 7 row(s) {code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to