nsivabalan commented on code in PR #12087:
URL: https://github.com/apache/hudi/pull/12087#discussion_r1797832479
##########
hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkMetadataWriterUtils.java:
##########
@@ -151,20 +157,53 @@ private static void
buildBloomFilterMetadata(HoodieTableMetaClient metaClient,
}
private static Dataset<Row> readRecordsAsRow(StoragePath[] paths, SQLContext
sqlContext,
- HoodieTableMetaClient
metaClient, Schema schema) {
- String readPathString =
- String.join(",",
Arrays.stream(paths).map(StoragePath::toString).toArray(String[]::new));
- String globPathString = String.join(",",
Arrays.stream(paths).map(StoragePath::getParent).map(StoragePath::toString).distinct().toArray(String[]::new));
- HashMap<String, String> params = new HashMap<>();
- params.put(QUERY_TYPE_CONFIG, QUERY_TYPE_SNAPSHOT);
- params.put(READ_PATHS_CONFIG, readPathString);
- // Building HoodieFileIndex needs this param to decide query path
- params.put(GLOB_PATHS_CONFIG, globPathString);
- // Let Hudi relations to fetch the schema from the table itself
- BaseRelation relation = SparkAdapterSupport$.MODULE$.sparkAdapter()
- .createRelation(sqlContext, metaClient, schema, paths, params);
-
- return dropMetaFields(sqlContext.baseRelationToDataFrame(relation));
+ HoodieTableMetaClient
metaClient, Schema schema,
+ boolean isBaseFile) {
+ List<HoodieRecord> records = isBaseFile ? getBaseFileRecords(new
HoodieBaseFile(paths[0].toString()), metaClient, schema)
+ :
getUnmergedLogFileRecords(Arrays.stream(paths).map(StoragePath::toString).collect(Collectors.toList()),
metaClient, schema);
+ return dropMetaFields(toDataset(records, schema, sqlContext));
+ }
+
+ private static List<HoodieRecord> getUnmergedLogFileRecords(List<String>
logFilePaths, HoodieTableMetaClient metaClient, Schema readerSchema) {
+ List<HoodieRecord> records = new ArrayList<>();
Review Comment:
There are some intricacies here.
I am working on a fix for col stats.
For example, this may fail for delete blocks. For now, let's go ahead, but the
fix I plan to make for col stats is required for func index as well. Something
to remember.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]