nsivabalan commented on code in PR #12671:
URL: https://github.com/apache/hudi/pull/12671#discussion_r1928834375
##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java:
##########
@@ -212,6 +212,9 @@ private HoodieMetadataFileSystemView getMetadataView() {
if (metadataView == null ||
!metadataView.equals(metadata.getMetadataFileSystemView())) {
ValidationUtils.checkState(metadata != null, "Metadata table not
initialized");
ValidationUtils.checkState(dataMetaClient != null, "Data table meta
client not initialized");
+ if (metadataView != null) {
+ metadataView.close();
+ }
Review Comment:
we are creating a new metadataView here. So, I am just explicilty closing
the previous one
##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala:
##########
@@ -70,7 +69,7 @@ class ColumnStatIndexTestBase extends
HoodieSparkClientTestBase {
.add("c4", TimestampType)
.add("c5", ShortType)
.add("c6", DateType)
- .add("c7", BinaryType)
+ .add("c7", StringType)
Review Comment:
I had to change all test fixtures. So, just switch the type.
##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala:
##########
@@ -280,6 +327,62 @@ class ColumnStatIndexTestBase extends
HoodieSparkClientTestBase {
}
}
+ protected def validatePartitionStatsIndex(testCase: ColumnStatsTestCase,
+ metadataOpts: Map[String, String],
+ expectedColStatsSourcePath: String,
+ validatePartitionStatsManually:
Boolean,
+ latestCompletedCommit: String):
Unit = {
+ val metadataConfig = HoodieMetadataConfig.newBuilder()
+ .fromProperties(toProperties(metadataOpts))
+ .build()
+
+ val pStatsIndex = new PartitionStatsIndexSupport(spark, sourceTableSchema,
metadataConfig, metaClient)
+
+ val pIndexedColumns: Seq[String] = HoodieTableMetadataUtil
+ .getColumnsToIndex(metaClient.getTableConfig, metadataConfig,
convertScalaListToJavaList(sourceTableSchema.fieldNames))
+ .asScala.filter(colName => !colName.startsWith("_hoodie")).toSeq.sorted
+
+ val (pExpectedColStatsSchema, _) = composeIndexSchema(pIndexedColumns,
pIndexedColumns, sourceTableSchema)
+ val pValidationSortColumns = if (pIndexedColumns.contains("c5")) {
+ Seq("c1_maxValue", "c1_minValue", "c2_maxValue", "c2_minValue",
"c3_maxValue",
+ "c3_minValue", "c5_maxValue", "c5_minValue")
+ } else {
+ Seq("c1_maxValue", "c1_minValue", "c2_maxValue", "c2_minValue",
"c3_maxValue", "c3_minValue")
+ }
+
+ pStatsIndex.loadTransposed(sourceTableSchema.fieldNames,
testCase.shouldReadInMemory) { pTransposedColStatsDF =>
+ // Match against expected column stats table
+ val pExpectedColStatsIndexTableDf = {
+ spark.read
+ .schema(pExpectedColStatsSchema)
+
.json(getClass.getClassLoader.getResource(expectedColStatsSourcePath).toString)
+ }
+
+ val colsToDrop = if (testCase.tableType ==
HoodieTableType.COPY_ON_WRITE) {
+ Seq("fileName")
+ } else {
+ Seq("fileName","valueCount")
+ }
+
+ //assertEquals(expectedColStatsIndexTableDf.schema,
transposedColStatsDF.schema)
Review Comment:
these are copied over comment. I have cleaned it up
##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala:
##########
@@ -280,6 +327,62 @@ class ColumnStatIndexTestBase extends
HoodieSparkClientTestBase {
}
}
+ protected def validatePartitionStatsIndex(testCase: ColumnStatsTestCase,
+ metadataOpts: Map[String, String],
+ expectedColStatsSourcePath: String,
+ validatePartitionStatsManually:
Boolean,
Review Comment:
not used. removed
##########
hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/TestMetadataTableSupport.java:
##########
@@ -81,15 +81,17 @@ void testRecreateMDTForInsertOverwriteTableOperation() {
.setBasePath(mdtBasePath).build();
HoodieActiveTimeline timeline = mdtMetaClient.getActiveTimeline();
List<HoodieInstant> instants = timeline.getInstants();
- assertEquals(4, instants.size());
+ assertEquals(5, instants.size());
Review Comment:
ack
##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala:
##########
@@ -280,6 +327,62 @@ class ColumnStatIndexTestBase extends
HoodieSparkClientTestBase {
}
}
+ protected def validatePartitionStatsIndex(testCase: ColumnStatsTestCase,
Review Comment:
yes. TestPartitionStatsIndex.verifyFilePruning
##########
hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java:
##########
@@ -2432,7 +2432,7 @@ public static HoodieData<HoodieRecord>
convertFilesToPartitionStatsRecords(Hoodi
}
Lazy<Option<Schema>> lazyWriterSchemaOpt = writerSchemaOpt.isPresent() ?
Lazy.eagerly(writerSchemaOpt) : Lazy.lazily(() ->
tryResolveSchemaForTable(dataTableMetaClient));
final List<String> columnsToIndex =
getColumnsToIndex(dataTableMetaClient.getTableConfig(), metadataConfig,
Either.right(lazyWriterSchemaOpt),
-
dataTableMetaClient.getActiveTimeline().filterCompletedInstants().empty(),
recordTypeOpt);
+
dataTableMetaClient.getActiveTimeline().getWriteTimeline().filterCompletedInstants().empty(),
recordTypeOpt);
Review Comment:
for a table containing just 1 entry in timeline and its a completed rollback
instant, we dont' won't be able to parse the schema. hence this change.
##########
hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/column-stats-index-table.json:
##########
@@ -1,4 +1,4 @@
{"c1_maxValue":769,"c1_minValue":309,"c1_nullCount":0,"c2_maxValue":"
769sdc","c2_minValue":"
309sdc","c2_nullCount":0,"c3_maxValue":919.769,"c3_minValue":76.430,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.543-08:00","c4_minValue":"2021-11-19T20:40:55.521-08:00","c4_nullCount":0,"c5_maxValue":78,"c5_minValue":32,"c5_nullCount":0,"c6_maxValue":"2020-11-14","c6_minValue":"2020-01-08","c6_nullCount":0,"c7_maxValue":"uQ==","c7_minValue":"AQ==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":9}
{"c1_maxValue":932,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":"
932sdc","c2_minValue":"
0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_nullCount":0,"c5_maxValue":94,"c5_minValue":1,"c5_nullCount":0,"c6_maxValue":"2020-09-09","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"xw==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":8}
{"c1_maxValue":943,"c1_minValue":89,"c1_nullCount":0,"c2_maxValue":"
943sdc","c2_minValue":"
200sdc","c2_nullCount":0,"c3_maxValue":854.690,"c3_minValue":100.556,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-19T20:40:55.508-08:00","c4_nullCount":0,"c5_maxValue":95,"c5_minValue":10,"c5_nullCount":0,"c6_maxValue":"2020-10-10","c6_minValue":"2020-01-10","c6_nullCount":0,"c7_maxValue":"yA==","c7_minValue":"LA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":10}
-{"c1_maxValue":959,"c1_minValue":74,"c1_nullCount":0,"c2_maxValue":"
959sdc","c2_minValue":"
181sdc","c2_nullCount":0,"c3_maxValue":980.213,"c3_minValue":38.740,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.507-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":9,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-23","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"Kw==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":13}
\ No newline at end of file
+{"c1_maxValue":959,"c1_minValue":74,"c1_nullCount":0,"c2_maxValue":"
959sdc","c2_minValue":"
181sdc","c2_nullCount":0,"c3_maxValue":980.213,"c3_minValue":38.740,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.507-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":9,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-23","c6_nullCount":0,"c7_maxValue":"vw==","c7_minValue":"1A==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":13}
Review Comment:
HUDI-8909. I have left a comment in ColumnStatIndexTestBase L72
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]