danny0405 commented on code in PR #14120:
URL: https://github.com/apache/hudi/pull/14120#discussion_r2450304906
##########
hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/avro/HoodieAvroParquetReader.java:
##########
@@ -55,31 +51,26 @@ public class HoodieAvroParquetReader extends
RecordReader<Void, ArrayWritable> {
private final ParquetRecordReader<GenericData.Record> parquetRecordReader;
private Schema baseSchema;
- public HoodieAvroParquetReader(InputSplit inputSplit, Configuration conf,
Option<InternalSchema> internalSchemaOption) throws IOException {
- // get base schema
- ParquetMetadata fileFooter =
- ParquetFileReader.readFooter(conf, ((ParquetInputSplit)
inputSplit).getPath(), ParquetMetadataConverter.NO_FILTER);
- MessageType messageType = fileFooter.getFileMetaData().getSchema();
- baseSchema = getAvroSchemaConverter(conf).convert(messageType);
-
- if (internalSchemaOption.isPresent()) {
- // do schema reconciliation in case there exists read column which is
not in the file schema.
- InternalSchema mergedInternalSchema = new InternalSchemaMerger(
- AvroInternalSchemaConverter.convert(baseSchema),
- internalSchemaOption.get(),
- true,
- true).mergeSchema();
- baseSchema = AvroInternalSchemaConverter.convert(mergedInternalSchema,
baseSchema.getFullName());
+ public HoodieAvroParquetReader(InputSplit inputSplit, Configuration conf,
Option<InternalSchema> internalSchemaOption, Option<Schema> dataSchema) throws
IOException {
+ if (dataSchema.isPresent()) {
+ baseSchema = dataSchema.get();
+ } else {
+ // get base schema
+ ParquetMetadata fileFooter =
+ ParquetFileReader.readFooter(conf, ((ParquetInputSplit)
inputSplit).getPath(), ParquetMetadataConverter.NO_FILTER);
+ MessageType messageType = fileFooter.getFileMetaData().getSchema();
+ baseSchema = getAvroSchemaConverter(conf).convert(messageType);
+
+ if (internalSchemaOption.isPresent()) {
+ // do schema reconciliation in case there exists read column which is
not in the file schema.
+ InternalSchema mergedInternalSchema = new InternalSchemaMerger(
+ AvroInternalSchemaConverter.convert(baseSchema),
+ internalSchemaOption.get(),
+ true,
+ true).mergeSchema();
+ baseSchema = AvroInternalSchemaConverter.convert(mergedInternalSchema,
baseSchema.getFullName());
+ }
}
-
- // if exists read columns, we need to filter columns.
- List<String> readColNames =
Arrays.asList(HoodieColumnProjectionUtils.getReadColumnNames(conf));
Review Comment:
So this code never worked before? It looks like it serves Hive queries.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]