This is an automated email from the ASF dual-hosted git repository.

yihua pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
     new c75e2649d8 [HUDI-4556] Improve functional test coverage of column 
stats index (#6319)
c75e2649d8 is described below

commit c75e2649d852e0538aaee5df6fc94bf40f3012e5
Author: Y Ethan Guo <[email protected]>
AuthorDate: Tue Aug 9 18:47:02 2022 -0700

    [HUDI-4556] Improve functional test coverage of column stats index (#6319)
---
 .../cow-updated2-column-stats-index-table.json     |  13 ++
 .../mor-updated2-column-stats-index-table.json     |  13 ++
 ...-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json |  11 ++
 .../hudi/functional/TestColumnStatsIndex.scala     | 217 +++++++++++----------
 4 files changed, 156 insertions(+), 98 deletions(-)

diff --git 
a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-updated2-column-stats-index-table.json
 
b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-updated2-column-stats-index-table.json
new file mode 100644
index 0000000000..b5882b53fc
--- /dev/null
+++ 
b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-updated2-column-stats-index-table.json
@@ -0,0 +1,13 @@
+{"c1_maxValue":101,"c1_minValue":101,"c1_nullCount":0,"c2_maxValue":" 
999sdc","c2_minValue":" 
999sdc","c2_nullCount":0,"c3_maxValue":10.329,"c3_minValue":10.329,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.179-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":99,"c5_minValue":99,"c5_nullCount":0,"c6_maxValue":"2020-03-28","c6_minValue":"2020-03-28","c6_nullCount":0,"c7_maxValue":"SA==","c7_minValue":"SA==","c7_nullCount":0,"c8_maxValue":9,"c8_minV
 [...]
+{"c1_maxValue":568,"c1_minValue":8,"c1_nullCount":0,"c2_maxValue":" 
8sdc","c2_minValue":" 
111sdc","c2_nullCount":0,"c3_maxValue":979.272,"c3_minValue":82.111,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.193-08:00","c4_minValue":"2021-11-18T23:34:44.159-08:00","c4_nullCount":0,"c5_maxValue":58,"c5_minValue":2,"c5_nullCount":0,"c6_maxValue":"2020-11-08","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"9g==","c7_minValue":"Ag==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue
 [...]
+{"c1_maxValue":715,"c1_minValue":76,"c1_nullCount":0,"c2_maxValue":" 
76sdc","c2_minValue":" 
224sdc","c2_nullCount":0,"c3_maxValue":958.579,"c3_minValue":246.427,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.199-08:00","c4_minValue":"2021-11-18T23:34:44.166-08:00","c4_nullCount":0,"c5_maxValue":73,"c5_minValue":9,"c5_nullCount":0,"c6_maxValue":"2020-11-21","c6_minValue":"2020-01-16","c6_nullCount":0,"c7_maxValue":"+g==","c7_minValue":"LA==","c7_nullCount":0,"c8_maxValue":9,"c8_minVa
 [...]
+{"c1_maxValue":768,"c1_minValue":59,"c1_nullCount":0,"c2_maxValue":" 
768sdc","c2_minValue":" 
118sdc","c2_nullCount":0,"c3_maxValue":959.131,"c3_minValue":64.768,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.164-08:00","c4_nullCount":0,"c5_maxValue":78,"c5_minValue":7,"c5_nullCount":0,"c6_maxValue":"2020-11-20","c6_minValue":"2020-05-04","c6_nullCount":0,"c7_maxValue":"zw==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minVa
 [...]
+{"c1_maxValue":769,"c1_minValue":309,"c1_nullCount":0,"c2_maxValue":" 
769sdc","c2_minValue":" 
309sdc","c2_nullCount":0,"c3_maxValue":919.769,"c3_minValue":76.430,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.543-08:00","c4_minValue":"2021-11-19T20:40:55.521-08:00","c4_nullCount":0,"c5_maxValue":78,"c5_minValue":32,"c5_nullCount":0,"c6_maxValue":"2020-11-14","c6_minValue":"2020-01-08","c6_nullCount":0,"c7_maxValue":"uQ==","c7_minValue":"AQ==","c7_nullCount":0,"c8_maxValue":9,"c8_min
 [...]
+{"c1_maxValue":769,"c1_minValue":309,"c1_nullCount":0,"c2_maxValue":" 
985sdc","c2_minValue":" 
309sdc","c2_nullCount":0,"c3_maxValue":919.769,"c3_minValue":76.430,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.543-08:00","c4_minValue":"2021-11-18T23:34:44.180-08:00","c4_nullCount":0,"c5_maxValue":78,"c5_minValue":32,"c5_nullCount":0,"c6_maxValue":"2020-11-14","c6_minValue":"2020-01-08","c6_nullCount":0,"c7_maxValue":"uQ==","c7_minValue":"AQ==","c7_nullCount":0,"c8_maxValue":9,"c8_min
 [...]
+{"c1_maxValue":770,"c1_minValue":129,"c1_nullCount":0,"c2_maxValue":" 
770sdc","c2_minValue":" 
129sdc","c2_nullCount":0,"c3_maxValue":977.328,"c3_minValue":153.431,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.169-08:00","c4_nullCount":0,"c5_maxValue":78,"c5_minValue":14,"c5_nullCount":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-15","c6_nullCount":0,"c7_maxValue":"rw==","c7_minValue":"Ag==","c7_nullCount":0,"c8_maxValue":9,"c8_mi
 [...]
+{"c1_maxValue":932,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 
932sdc","c2_minValue":" 
0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_nullCount":0,"c5_maxValue":94,"c5_minValue":1,"c5_nullCount":0,"c6_maxValue":"2020-09-09","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"xw==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue
 [...]
+{"c1_maxValue":932,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 
987sdc","c2_minValue":" 
0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-18T23:34:44.180-08:00","c4_nullCount":0,"c5_maxValue":94,"c5_minValue":1,"c5_nullCount":0,"c6_maxValue":"2020-10-10","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"xw==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue
 [...]
+{"c1_maxValue":943,"c1_minValue":89,"c1_nullCount":0,"c2_maxValue":" 
943sdc","c2_minValue":" 
200sdc","c2_nullCount":0,"c3_maxValue":854.690,"c3_minValue":100.556,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-19T20:40:55.508-08:00","c4_nullCount":0,"c5_maxValue":95,"c5_minValue":10,"c5_nullCount":0,"c6_maxValue":"2020-10-10","c6_minValue":"2020-01-10","c6_nullCount":0,"c7_maxValue":"yA==","c7_minValue":"LA==","c7_nullCount":0,"c8_maxValue":9,"c8_min
 [...]
+{"c1_maxValue":943,"c1_minValue":89,"c1_nullCount":0,"c2_maxValue":" 
984sdc","c2_minValue":" 
200sdc","c2_nullCount":0,"c3_maxValue":977.328,"c3_minValue":64.768,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-18T23:34:44.181-08:00","c4_nullCount":0,"c5_maxValue":95,"c5_minValue":10,"c5_nullCount":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-10","c6_nullCount":0,"c7_maxValue":"yA==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minV
 [...]
+{"c1_maxValue":959,"c1_minValue":74,"c1_nullCount":0,"c2_maxValue":" 
959sdc","c2_minValue":" 
181sdc","c2_nullCount":0,"c3_maxValue":980.213,"c3_minValue":38.740,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.507-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":9,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-23","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"Kw==","c7_nullCount":0,"c8_maxValue":9,"c8_minVa
 [...]
+{"c1_maxValue":959,"c1_minValue":74,"c1_nullCount":0,"c2_maxValue":" 
989sdc","c2_minValue":" 
181sdc","c2_nullCount":0,"c3_maxValue":980.213,"c3_minValue":38.740,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":9,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-02-25","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"LA==","c7_nullCount":0,"c8_maxValue":9,"c8_minVa
 [...]
diff --git 
a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-updated2-column-stats-index-table.json
 
b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-updated2-column-stats-index-table.json
new file mode 100644
index 0000000000..0c048b5c5f
--- /dev/null
+++ 
b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-updated2-column-stats-index-table.json
@@ -0,0 +1,13 @@
+{"c1_maxValue":101,"c1_minValue":101,"c1_nullCount":0,"c2_maxValue":" 
999sdc","c2_minValue":" 
999sdc","c2_nullCount":0,"c3_maxValue":10.329,"c3_minValue":10.329,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.179-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":99,"c5_minValue":99,"c5_nullCount":0,"c6_maxValue":"2020-03-28","c6_minValue":"2020-03-28","c6_nullCount":0,"c7_maxValue":"SA==","c7_minValue":"SA==","c7_nullCount":0,"c8_maxValue":9,"c8_minV
 [...]
+{"c1_maxValue":562,"c1_minValue":323,"c1_nullCount":0,"c2_maxValue":" 
984sdc","c2_minValue":" 
980sdc","c2_nullCount":0,"c3_maxValue":977.328,"c3_minValue":64.768,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.181-08:00","c4_nullCount":0,"c5_maxValue":78,"c5_minValue":34,"c5_nullCount":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-15","c6_nullCount":0,"c7_maxValue":"SA==","c7_minValue":"qw==","c7_nullCount":0,"c8_maxValue":9,"c8_min
 [...]
+{"c1_maxValue":568,"c1_minValue":8,"c1_nullCount":0,"c2_maxValue":" 
8sdc","c2_minValue":" 
111sdc","c2_nullCount":0,"c3_maxValue":979.272,"c3_minValue":82.111,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.193-08:00","c4_minValue":"2021-11-18T23:34:44.159-08:00","c4_nullCount":0,"c5_maxValue":58,"c5_minValue":2,"c5_nullCount":0,"c6_maxValue":"2020-11-08","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"9g==","c7_minValue":"Ag==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue
 [...]
+{"c1_maxValue":619,"c1_minValue":619,"c1_nullCount":0,"c2_maxValue":" 
985sdc","c2_minValue":" 
985sdc","c2_nullCount":0,"c3_maxValue":230.320,"c3_minValue":230.320,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.180-08:00","c4_minValue":"2021-11-18T23:34:44.180-08:00","c4_nullCount":0,"c5_maxValue":33,"c5_minValue":33,"c5_nullCount":0,"c6_maxValue":"2020-02-13","c6_minValue":"2020-02-13","c6_nullCount":0,"c7_maxValue":"QA==","c7_minValue":"QA==","c7_nullCount":0,"c8_maxValue":9,"c8_mi
 [...]
+{"c1_maxValue":633,"c1_minValue":624,"c1_nullCount":0,"c2_maxValue":" 
987sdc","c2_minValue":" 
986sdc","c2_nullCount":0,"c3_maxValue":580.317,"c3_minValue":375.308,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.180-08:00","c4_minValue":"2021-11-18T23:34:44.180-08:00","c4_nullCount":0,"c5_maxValue":33,"c5_minValue":32,"c5_nullCount":0,"c6_maxValue":"2020-10-10","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"PQ==","c7_minValue":"NA==","c7_nullCount":0,"c8_maxValue":9,"c8_mi
 [...]
+{"c1_maxValue":639,"c1_minValue":555,"c1_nullCount":0,"c2_maxValue":" 
989sdc","c2_minValue":" 
982sdc","c2_nullCount":0,"c3_maxValue":904.304,"c3_minValue":153.431,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.186-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":44,"c5_minValue":31,"c5_nullCount":0,"c6_maxValue":"2020-08-25","c6_minValue":"2020-03-12","c6_nullCount":0,"c7_maxValue":"MA==","c7_minValue":"rw==","c7_nullCount":0,"c8_maxValue":9,"c8_mi
 [...]
+{"c1_maxValue":715,"c1_minValue":76,"c1_nullCount":0,"c2_maxValue":" 
76sdc","c2_minValue":" 
224sdc","c2_nullCount":0,"c3_maxValue":958.579,"c3_minValue":246.427,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.199-08:00","c4_minValue":"2021-11-18T23:34:44.166-08:00","c4_nullCount":0,"c5_maxValue":73,"c5_minValue":9,"c5_nullCount":0,"c6_maxValue":"2020-11-21","c6_minValue":"2020-01-16","c6_nullCount":0,"c7_maxValue":"+g==","c7_minValue":"LA==","c7_nullCount":0,"c8_maxValue":9,"c8_minVa
 [...]
+{"c1_maxValue":768,"c1_minValue":59,"c1_nullCount":0,"c2_maxValue":" 
768sdc","c2_minValue":" 
118sdc","c2_nullCount":0,"c3_maxValue":959.131,"c3_minValue":64.768,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.164-08:00","c4_nullCount":0,"c5_maxValue":78,"c5_minValue":7,"c5_nullCount":0,"c6_maxValue":"2020-11-20","c6_minValue":"2020-05-04","c6_nullCount":0,"c7_maxValue":"zw==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minVa
 [...]
+{"c1_maxValue":769,"c1_minValue":309,"c1_nullCount":0,"c2_maxValue":" 
769sdc","c2_minValue":" 
309sdc","c2_nullCount":0,"c3_maxValue":919.769,"c3_minValue":76.430,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.543-08:00","c4_minValue":"2021-11-19T20:40:55.521-08:00","c4_nullCount":0,"c5_maxValue":78,"c5_minValue":32,"c5_nullCount":0,"c6_maxValue":"2020-11-14","c6_minValue":"2020-01-08","c6_nullCount":0,"c7_maxValue":"uQ==","c7_minValue":"AQ==","c7_nullCount":0,"c8_maxValue":9,"c8_min
 [...]
+{"c1_maxValue":770,"c1_minValue":129,"c1_nullCount":0,"c2_maxValue":" 
770sdc","c2_minValue":" 
129sdc","c2_nullCount":0,"c3_maxValue":977.328,"c3_minValue":153.431,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.169-08:00","c4_nullCount":0,"c5_maxValue":78,"c5_minValue":14,"c5_nullCount":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-15","c6_nullCount":0,"c7_maxValue":"rw==","c7_minValue":"Ag==","c7_nullCount":0,"c8_maxValue":9,"c8_mi
 [...]
+{"c1_maxValue":932,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 
932sdc","c2_minValue":" 
0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_nullCount":0,"c5_maxValue":94,"c5_minValue":1,"c5_nullCount":0,"c6_maxValue":"2020-09-09","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"xw==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue
 [...]
+{"c1_maxValue":943,"c1_minValue":89,"c1_nullCount":0,"c2_maxValue":" 
943sdc","c2_minValue":" 
200sdc","c2_nullCount":0,"c3_maxValue":854.690,"c3_minValue":100.556,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-19T20:40:55.508-08:00","c4_nullCount":0,"c5_maxValue":95,"c5_minValue":10,"c5_nullCount":0,"c6_maxValue":"2020-10-10","c6_minValue":"2020-01-10","c6_nullCount":0,"c7_maxValue":"yA==","c7_minValue":"LA==","c7_nullCount":0,"c8_maxValue":9,"c8_min
 [...]
+{"c1_maxValue":959,"c1_minValue":74,"c1_nullCount":0,"c2_maxValue":" 
959sdc","c2_minValue":" 
181sdc","c2_nullCount":0,"c3_maxValue":980.213,"c3_minValue":38.740,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.507-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":9,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-23","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"Kw==","c7_nullCount":0,"c8_maxValue":9,"c8_minVa
 [...]
diff --git 
a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json
 
b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json
new file mode 100644
index 0000000000..27b320cc21
--- /dev/null
+++ 
b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json
@@ -0,0 +1,11 @@
+{"c1":323,"c2":" 
980sdc","c3":335.770,"c4":"2021-11-18T23:34:44.201-08:00","c5":78,"c6":"2020-01-15","c7":"Ag==","c8":9}
+{"c1":326,"c2":" 
981sdc","c3":64.768,"c4":"2021-11-18T23:34:44.201-08:00","c5":78,"c6":"2020-10-13","c7":"AA==","c8":9}
+{"c1":555,"c2":" 
982sdc","c3":153.431,"c4":"2021-11-18T23:34:44.186-08:00","c5":44,"c6":"2020-03-12","c7":"rw==","c8":9}
+{"c1":556,"c2":" 
983sdc","c3":246.427,"c4":"2021-11-18T23:34:44.186-08:00","c5":44,"c6":"2020-10-08","c7":"qw==","c8":9}
+{"c1":562,"c2":" 
984sdc","c3":977.328,"c4":"2021-11-18T23:34:44.181-08:00","c5":34,"c6":"2020-10-21","c7":"SA==","c8":9}
+{"c1":619,"c2":" 
985sdc","c3":230.320,"c4":"2021-11-18T23:34:44.180-08:00","c5":33,"c6":"2020-02-13","c7":"QA==","c8":9}
+{"c1":624,"c2":" 
986sdc","c3":580.317,"c4":"2021-11-18T23:34:44.180-08:00","c5":33,"c6":"2020-10-10","c7":"PQ==","c8":9}
+{"c1":633,"c2":" 
987sdc","c3":375.308,"c4":"2021-11-18T23:34:44.180-08:00","c5":32,"c6":"2020-01-01","c7":"NA==","c8":9}
+{"c1":638,"c2":" 
988sdc","c3":904.304,"c4":"2021-11-18T23:34:44.179-08:00","c5":32,"c6":"2020-08-25","c7":"MA==","c8":9}
+{"c1":639,"c2":" 
989sdc","c3":398.300,"c4":"2021-11-18T23:34:44.179-08:00","c5":31,"c6":"2020-04-21","c7":"LA==","c8":9}
+{"c1":101,"c2":" 
999sdc","c3":10.329,"c4":"2021-11-18T23:34:44.179-08:00","c5":99,"c6":"2020-03-28","c7":"SF==","c8":9}
diff --git 
a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala
 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala
index 822d2051cb..2c17bb8cdd 100644
--- 
a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala
+++ 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala
@@ -24,6 +24,7 @@ import 
org.apache.hudi.ColumnStatsIndexSupport.composeIndexSchema
 import org.apache.hudi.DataSourceWriteOptions.{PRECOMBINE_FIELD, 
RECORDKEY_FIELD}
 import org.apache.hudi.HoodieConversionUtils.toProperties
 import org.apache.hudi.common.config.HoodieMetadataConfig
+import org.apache.hudi.common.model.HoodieTableType
 import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient}
 import org.apache.hudi.common.util.ParquetUtils
 import org.apache.hudi.config.{HoodieStorageConfig, HoodieWriteConfig}
@@ -31,12 +32,12 @@ import 
org.apache.hudi.functional.TestColumnStatsIndex.ColumnStatsTestCase
 import org.apache.hudi.testutils.HoodieClientTestBase
 import org.apache.hudi.{ColumnStatsIndexSupport, DataSourceWriteOptions}
 import org.apache.spark.sql._
-import org.apache.spark.sql.functions.{col, lit, typedLit}
+import org.apache.spark.sql.functions.typedLit
 import org.apache.spark.sql.types._
 import org.junit.jupiter.api.Assertions.{assertEquals, assertNotNull, 
assertTrue}
 import org.junit.jupiter.api._
 import org.junit.jupiter.params.ParameterizedTest
-import org.junit.jupiter.params.provider.{Arguments, ArgumentsSource, 
MethodSource, ValueSource}
+import org.junit.jupiter.params.provider.{Arguments, MethodSource, ValueSource}
 
 import java.math.BigInteger
 import java.sql.{Date, Timestamp}
@@ -84,10 +85,11 @@ class TestColumnStatsIndex extends HoodieClientTestBase {
       HoodieMetadataConfig.ENABLE_METADATA_INDEX_COLUMN_STATS.key -> "true"
     )
 
-    val opts = Map(
+    val commonOpts = Map(
       "hoodie.insert.shuffle.parallelism" -> "4",
       "hoodie.upsert.shuffle.parallelism" -> "4",
       HoodieWriteConfig.TBL_NAME.key -> "hoodie_test",
+      DataSourceWriteOptions.TABLE_TYPE.key -> testCase.tableType.toString,
       RECORDKEY_FIELD.key -> "c1",
       PRECOMBINE_FIELD.key -> "c1",
       // NOTE: Currently only this setting is used like following by different 
MT partitions:
@@ -97,88 +99,28 @@ class TestColumnStatsIndex extends HoodieClientTestBase {
       HoodieTableConfig.POPULATE_META_FIELDS.key -> "true"
     ) ++ metadataOpts
 
-    val sourceJSONTablePath = 
getClass.getClassLoader.getResource("index/colstats/input-table-json").toString
-
-    // NOTE: Schema here is provided for validation that the input date is in 
the appropriate format
-    val inputDF = 
spark.read.schema(sourceTableSchema).json(sourceJSONTablePath)
-
-    inputDF
-      .sort("c1")
-      .repartition(4, new Column("c1"))
-      .write
-      .format("hudi")
-      .options(opts)
-      .option(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE.key, 10 * 1024)
-      .option(DataSourceWriteOptions.OPERATION.key, 
DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
-      .mode(SaveMode.Overwrite)
-      .save(basePath)
-
-    metaClient = HoodieTableMetaClient.reload(metaClient)
-
-    val metadataConfig = HoodieMetadataConfig.newBuilder()
-      .fromProperties(toProperties(metadataOpts))
-      .build()
-
-    val requestedColumns: Seq[String] = sourceTableSchema.fieldNames
-
-    val columnStatsIndex = new ColumnStatsIndexSupport(spark, 
sourceTableSchema, metadataConfig, metaClient)
-
-    val expectedColStatsSchema = 
composeIndexSchema(sourceTableSchema.fieldNames, sourceTableSchema)
-
-    columnStatsIndex.loadTransposed(requestedColumns, 
testCase.shouldReadInMemory) { transposedColStatsDF =>
-      // Match against expected column stats table
-      val expectedColStatsIndexTableDf =
-        spark.read
-          .schema(expectedColStatsSchema)
-          
.json(getClass.getClassLoader.getResource("index/colstats/column-stats-index-table.json").toString)
-
-      assertEquals(expectedColStatsIndexTableDf.schema, 
transposedColStatsDF.schema)
-      // NOTE: We have to drop the `fileName` column as it contains 
semi-random components
-      //       that we can't control in this test. Nevertheless, since we 
manually verify composition of the
-      //       ColStats Index by reading Parquet footers from individual 
Parquet files, this is not an issue
-      assertEquals(asJson(sort(expectedColStatsIndexTableDf)), 
asJson(sort(transposedColStatsDF.drop("fileName"))))
-
-      // Collect Column Stats manually (reading individual Parquet files)
-      val manualColStatsTableDF =
-        buildColumnStatsTableManually(basePath, sourceTableSchema.fieldNames, 
sourceTableSchema.fieldNames, expectedColStatsSchema)
-
-      assertEquals(asJson(sort(manualColStatsTableDF)), 
asJson(sort(transposedColStatsDF)))
-    }
-
-    // do an upsert and validate
-    val updateJSONTablePath = 
getClass.getClassLoader.getResource("index/colstats/another-input-table-json").toString
-    val updateDF = spark.read
-      .schema(sourceTableSchema)
-      .json(updateJSONTablePath)
-
-    updateDF.repartition(4)
-      .write
-      .format("hudi")
-      .options(opts)
-      .option(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE.key, 10 * 1024)
-      .option(DataSourceWriteOptions.OPERATION.key, 
DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
-      .mode(SaveMode.Append)
-      .save(basePath)
-
-    metaClient = HoodieTableMetaClient.reload(metaClient)
-
-    val updatedColumnStatsIndex = new ColumnStatsIndexSupport(spark, 
sourceTableSchema, metadataConfig, metaClient)
-
-    updatedColumnStatsIndex.loadTransposed(requestedColumns, 
testCase.shouldReadInMemory) { transposedUpdatedColStatsDF =>
-      val expectedColStatsIndexUpdatedDF =
-        spark.read
-          .schema(expectedColStatsSchema)
-          
.json(getClass.getClassLoader.getResource("index/colstats/updated-column-stats-index-table.json").toString)
-
-      assertEquals(expectedColStatsIndexUpdatedDF.schema, 
transposedUpdatedColStatsDF.schema)
-      assertEquals(asJson(sort(expectedColStatsIndexUpdatedDF)), 
asJson(sort(transposedUpdatedColStatsDF.drop("fileName"))))
-
-      // Collect Column Stats manually (reading individual Parquet files)
-      val manualUpdatedColStatsTableDF =
-        buildColumnStatsTableManually(basePath, sourceTableSchema.fieldNames, 
sourceTableSchema.fieldNames, expectedColStatsSchema)
-
-      assertEquals(asJson(sort(manualUpdatedColStatsTableDF)), 
asJson(sort(transposedUpdatedColStatsDF)))
+    doWriteAndValidateColumnStats(testCase, metadataOpts, commonOpts,
+      dataSourcePath = "index/colstats/input-table-json",
+      expectedColStatsSourcePath = 
"index/colstats/column-stats-index-table.json",
+      operation = DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL,
+      saveMode = SaveMode.Overwrite)
+
+    doWriteAndValidateColumnStats(testCase, metadataOpts, commonOpts,
+      dataSourcePath = "index/colstats/another-input-table-json",
+      expectedColStatsSourcePath = 
"index/colstats/updated-column-stats-index-table.json",
+      operation = DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL,
+      saveMode = SaveMode.Append)
+
+    val expectedColStatsSourcePath = if (testCase.tableType == 
HoodieTableType.COPY_ON_WRITE) {
+      "index/colstats/cow-updated2-column-stats-index-table.json"
+    } else {
+      "index/colstats/mor-updated2-column-stats-index-table.json"
     }
+    doWriteAndValidateColumnStats(testCase, metadataOpts, commonOpts,
+      dataSourcePath = "index/colstats/update-input-table-json",
+      expectedColStatsSourcePath = expectedColStatsSourcePath,
+      operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL,
+      saveMode = SaveMode.Append)
   }
 
   @ParameterizedTest
@@ -365,6 +307,42 @@ class TestColumnStatsIndex extends HoodieClientTestBase {
     })
   }
 
+  private def doWriteAndValidateColumnStats(testCase: ColumnStatsTestCase,
+                                            metadataOpts: Map[String, String],
+                                            hudiOpts: Map[String, String],
+                                            dataSourcePath: String,
+                                            expectedColStatsSourcePath: String,
+                                            operation: String,
+                                            saveMode: SaveMode): Unit = {
+    val sourceJSONTablePath = 
getClass.getClassLoader.getResource(dataSourcePath).toString
+
+    // NOTE: Schema here is provided for validation that the input date is in 
the appropriate format
+    val inputDF = 
spark.read.schema(sourceTableSchema).json(sourceJSONTablePath)
+
+    inputDF
+      .sort("c1")
+      .repartition(4, new Column("c1"))
+      .write
+      .format("hudi")
+      .options(hudiOpts)
+      .option(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE.key, 10 * 1024)
+      .option(DataSourceWriteOptions.OPERATION.key, operation)
+      .mode(saveMode)
+      .save(basePath)
+
+    metaClient = HoodieTableMetaClient.reload(metaClient)
+
+    // Only parquet files are supported for the validation against the 
generated column stats,
+    // constructing the column stats from parquet data files using Spark SQL 
and comparing that
+    // with column stats index. This means that the following operations are 
supported for such
+    // validation: (1) COW: all operations; (2) MOR: insert only.
+    val validateColumnStatsAgainstDataFiles =
+    (testCase.tableType == HoodieTableType.COPY_ON_WRITE
+      || operation.equals(DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL))
+    validateColumnStatsIndex(
+      testCase, metadataOpts, expectedColStatsSourcePath, 
validateColumnStatsAgainstDataFiles)
+  }
+
   private def buildColumnStatsTableManually(tablePath: String,
                                             includedCols: Seq[String],
                                             indexedCols: Seq[String],
@@ -411,6 +389,45 @@ class TestColumnStatsIndex extends HoodieClientTestBase {
     )
   }
 
+  private def validateColumnStatsIndex(testCase: ColumnStatsTestCase,
+                                       metadataOpts: Map[String, String],
+                                       expectedColStatsSourcePath: String,
+                                       validateColumnStatsAgainstDataFiles: 
Boolean): Unit = {
+    val metadataConfig = HoodieMetadataConfig.newBuilder()
+      .fromProperties(toProperties(metadataOpts))
+      .build()
+
+    val columnStatsIndex = new ColumnStatsIndexSupport(spark, 
sourceTableSchema, metadataConfig, metaClient)
+
+    val expectedColStatsSchema = 
composeIndexSchema(sourceTableSchema.fieldNames, sourceTableSchema)
+    val validationSortColumns = Seq("c1_maxValue", "c1_minValue", 
"c2_maxValue", "c2_minValue")
+
+    columnStatsIndex.loadTransposed(sourceTableSchema.fieldNames, 
testCase.shouldReadInMemory) { transposedColStatsDF =>
+      // Match against expected column stats table
+      val expectedColStatsIndexTableDf =
+        spark.read
+          .schema(expectedColStatsSchema)
+          
.json(getClass.getClassLoader.getResource(expectedColStatsSourcePath).toString)
+
+      assertEquals(expectedColStatsIndexTableDf.schema, 
transposedColStatsDF.schema)
+      // NOTE: We have to drop the `fileName` column as it contains 
semi-random components
+      //       that we can't control in this test. Nevertheless, since we 
manually verify composition of the
+      //       ColStats Index by reading Parquet footers from individual 
Parquet files, this is not an issue
+      assertEquals(asJson(sort(expectedColStatsIndexTableDf, 
validationSortColumns)),
+        asJson(sort(transposedColStatsDF.drop("fileName"), 
validationSortColumns)))
+
+      if (validateColumnStatsAgainstDataFiles) {
+        // TODO(HUDI-4557): support validation of column stats of avro log 
files
+        // Collect Column Stats manually (reading individual Parquet files)
+        val manualColStatsTableDF =
+        buildColumnStatsTableManually(basePath, sourceTableSchema.fieldNames, 
sourceTableSchema.fieldNames, expectedColStatsSchema)
+
+        assertEquals(asJson(sort(manualColStatsTableDF, 
validationSortColumns)),
+          asJson(sort(transposedColStatsDF, validationSortColumns)))
+      }
+    }
+  }
+
   private def generateRandomDataFrame(spark: SparkSession): DataFrame = {
     val sourceTableSchema =
       new StructType()
@@ -419,9 +436,9 @@ class TestColumnStatsIndex extends HoodieClientTestBase {
         // NOTE: We're testing different values for precision of the decimal 
to make sure
         //       we execute paths bearing different underlying representations 
in Parquet
         // REF: 
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#DECIMAL
-        .add("c3a", DecimalType(9,3))
-        .add("c3b", DecimalType(10,3))
-        .add("c3c", DecimalType(20,3))
+        .add("c3a", DecimalType(9, 3))
+        .add("c3b", DecimalType(10, 3))
+        .add("c3c", DecimalType(20, 3))
         .add("c4", TimestampType)
         .add("c5", ShortType)
         .add("c6", DateType)
@@ -456,24 +473,28 @@ class TestColumnStatsIndex extends HoodieClientTestBase {
       .mkString("\n")
 
   private def sort(df: DataFrame): DataFrame = {
+    sort(df, Seq("c1_maxValue", "c1_minValue"))
+  }
+
+  private def sort(df: DataFrame, sortColumns: Seq[String]): DataFrame = {
     val sortedCols = df.columns.sorted
-    // Sort dataset by the first 2 columns (to minimize non-determinism in 
case multiple files have the same
+    // Sort dataset by specified columns (to minimize non-determinism in case 
multiple files have the same
     // value of the first column)
     df.select(sortedCols.head, sortedCols.tail: _*)
-      .sort("c1_maxValue", "c1_minValue")
+      .sort(sortColumns.head, sortColumns.tail: _*)
   }
-
 }
 
 object TestColumnStatsIndex {
 
-  case class ColumnStatsTestCase(forceFullLogScan: Boolean, 
shouldReadInMemory: Boolean)
+  case class ColumnStatsTestCase(tableType: HoodieTableType, forceFullLogScan: 
Boolean, shouldReadInMemory: Boolean)
 
-  def testMetadataColumnStatsIndexParams: java.util.stream.Stream[Arguments] =
-    java.util.stream.Stream.of(
-      Arguments.arguments(ColumnStatsTestCase(forceFullLogScan = false, 
shouldReadInMemory = true)),
-      Arguments.arguments(ColumnStatsTestCase(forceFullLogScan = false, 
shouldReadInMemory = false)),
-      Arguments.arguments(ColumnStatsTestCase(forceFullLogScan = true, 
shouldReadInMemory = false)),
-      Arguments.arguments(ColumnStatsTestCase(forceFullLogScan = true, 
shouldReadInMemory = true))
-    )
+  def testMetadataColumnStatsIndexParams: java.util.stream.Stream[Arguments] = 
{
+    
java.util.stream.Stream.of(HoodieTableType.values().toStream.flatMap(tableType 
=>
+      Seq(Arguments.arguments(ColumnStatsTestCase(tableType, forceFullLogScan 
= false, shouldReadInMemory = true)),
+        Arguments.arguments(ColumnStatsTestCase(tableType, forceFullLogScan = 
false, shouldReadInMemory = false)),
+        Arguments.arguments(ColumnStatsTestCase(tableType, forceFullLogScan = 
true, shouldReadInMemory = false)),
+        Arguments.arguments(ColumnStatsTestCase(tableType, forceFullLogScan = 
true, shouldReadInMemory = true)))
+    ): _*)
+  }
 }

Reply via email to