codope commented on code in PR #12087:
URL: https://github.com/apache/hudi/pull/12087#discussion_r1797991295
##########
hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkMetadataWriterUtils.java:
##########
@@ -151,20 +157,53 @@ private static void
buildBloomFilterMetadata(HoodieTableMetaClient metaClient,
}
private static Dataset<Row> readRecordsAsRow(StoragePath[] paths, SQLContext
sqlContext,
- HoodieTableMetaClient
metaClient, Schema schema) {
- String readPathString =
- String.join(",",
Arrays.stream(paths).map(StoragePath::toString).toArray(String[]::new));
- String globPathString = String.join(",",
Arrays.stream(paths).map(StoragePath::getParent).map(StoragePath::toString).distinct().toArray(String[]::new));
- HashMap<String, String> params = new HashMap<>();
- params.put(QUERY_TYPE_CONFIG, QUERY_TYPE_SNAPSHOT);
- params.put(READ_PATHS_CONFIG, readPathString);
- // Building HoodieFileIndex needs this param to decide query path
- params.put(GLOB_PATHS_CONFIG, globPathString);
- // Let Hudi relations to fetch the schema from the table itself
- BaseRelation relation = SparkAdapterSupport$.MODULE$.sparkAdapter()
- .createRelation(sqlContext, metaClient, schema, paths, params);
-
- return dropMetaFields(sqlContext.baseRelationToDataFrame(relation));
+ HoodieTableMetaClient
metaClient, Schema schema,
+ boolean isBaseFile) {
+ List<HoodieRecord> records = isBaseFile ? getBaseFileRecords(new
HoodieBaseFile(paths[0].toString()), metaClient, schema)
+ :
getUnmergedLogFileRecords(Arrays.stream(paths).map(StoragePath::toString).collect(Collectors.toList()),
metaClient, schema);
+ return dropMetaFields(toDataset(records, schema, sqlContext));
+ }
+
+ private static List<HoodieRecord> getUnmergedLogFileRecords(List<String>
logFilePaths, HoodieTableMetaClient metaClient, Schema readerSchema) {
+ List<HoodieRecord> records = new ArrayList<>();
Review Comment:
Got it. Could you please tag me in the PR once it's ready?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]