yihua commented on code in PR #13223:
URL: https://github.com/apache/hudi/pull/13223#discussion_r2065538196
##########
hudi-common/src/main/java/org/apache/hudi/common/table/read/HoodieFileGroupReader.java:
##########
@@ -228,8 +234,27 @@ private ClosableIterator<T> makeBootstrapBaseFileIterator(HoodieBaseFile baseFil
     if (start != 0) {
       throw new IllegalArgumentException("Filegroup reader is doing bootstrap merge but we are not reading from the start of the base file");
     }
+    PartitionPathParser partitionPathParser = new PartitionPathParser();
+    Object[] partitionValues = partitionPathParser.getPartitionFieldVals(partitionPathFields, partitionPath, readerContext.getSchemaHandler().getTableSchema());
+    // filter out the partition values that are not required by the data schema
+    Object[] filteredPartitionValues = new Object[0];
+    Option<String[]> filteredPartitionPathFields = Option.empty();
+    if (partitionPathFields.isPresent()) {
+      Schema dataSchema = dataFileIterator.get().getRight();
+      List<String> fields = new ArrayList<>();
+      List<Object> values = new ArrayList<>();
+      for (int i = 0; i < partitionPathFields.get().length; i++) {
+        String field = partitionPathFields.get()[i];
+        if (dataSchema.getField(field) != null) {
+          fields.add(field);
+          values.add(partitionValues[i]);
+        }
+      }
+      filteredPartitionPathFields = fields.isEmpty() ? Option.empty() : Option.of(fields.toArray(new String[0]));
+      filteredPartitionValues = values.toArray(new Object[0]);
+    }
Review Comment:
nit: if only `filteredPartitionPathFields` and `filteredPartitionValues` are needed, the data iterator's schema could be passed to `PartitionPathParser` so that parsing returns only the field names and values needed for merging the bootstrap readers, e.g. as sketched below.
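A rough sketch of what that could look like (the overload below is hypothetical, not the current `PartitionPathParser` API; it assumes the existing three-argument `getPartitionFieldVals` plus Hudi's `Option` and `Pair` utilities):
```java
import java.util.ArrayList;
import java.util.List;

import org.apache.avro.Schema;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;

// Hypothetical overload on PartitionPathParser: filter against the data
// file's schema inside the parser, so the caller gets back only the
// partition fields/values it needs for the bootstrap merge.
public Pair<Option<String[]>, Object[]> getPartitionFieldVals(
    Option<String[]> partitionPathFields, String partitionPath,
    Schema tableSchema, Schema dataSchema) {
  Object[] allValues = getPartitionFieldVals(partitionPathFields, partitionPath, tableSchema);
  if (!partitionPathFields.isPresent()) {
    return Pair.of(Option.empty(), new Object[0]);
  }
  List<String> fields = new ArrayList<>();
  List<Object> values = new ArrayList<>();
  for (int i = 0; i < partitionPathFields.get().length; i++) {
    String field = partitionPathFields.get()[i];
    // keep only partition fields that also exist in the data file's schema
    if (dataSchema.getField(field) != null) {
      fields.add(field);
      values.add(allValues[i]);
    }
  }
  return Pair.of(
      fields.isEmpty() ? Option.empty() : Option.of(fields.toArray(new String[0])),
      values.toArray(new Object[0]));
}
```
This would keep the filtering logic in one place and shrink the caller in `HoodieFileGroupReader` to a single call.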
##########
hudi-client/hudi-java-client/src/test/java/org/apache/hudi/hadoop/TestHoodieFileGroupReaderOnHive.java:
##########
@@ -104,11 +106,13 @@ public void assertRecordsEqual(Schema schema, ArrayWritable expected, ArrayWrita
     ArrayWritableTestUtil.assertArrayWritableEqual(schema, expected, actual, false);
   }
-  private void setupJobconf(JobConf jobConf) {
-    Schema schema = HoodieAvroUtils.addMetadataFields(HoodieTestDataGenerator.AVRO_SCHEMA);
+  private void setupJobconf(JobConf jobConf, Schema schema) {
     List<Schema.Field> fields = schema.getFields();
     setHiveColumnNameProps(fields, jobConf, USE_FAKE_PARTITION);
-    jobConf.set("columns.types","string,string,string,string,string," + HoodieTestDataGenerator.TRIP_HIVE_COLUMN_TYPES + ",string");
+    List<TypeInfo> types = TypeInfoUtils.getTypeInfosFromTypeString(HoodieTestDataGenerator.TRIP_HIVE_COLUMN_TYPES);
+    Map<String, String> typeMappings = HoodieTestDataGenerator.AVRO_SCHEMA.getFields().stream().collect(Collectors.toMap(Schema.Field::name, field -> types.get(field.pos()).getTypeName()));
+    String columnTypes = fields.stream().map(field -> typeMappings.getOrDefault(field.name(), "string")).collect(Collectors.joining(","));
+    jobConf.set("columns.types", columnTypes + ",string");
Review Comment:
Do these configs need to match the columns in the query? Could we still test the Hive queries with meta columns in some way?
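For example (a sketch reusing this test's existing helpers; since the new mapping falls back to `string` and Hudi's meta columns are all strings, passing the metadata-augmented schema should keep that coverage):
```java
// Sketch: call the new setupJobconf with the metadata-augmented schema so the
// meta columns stay in columns.types; they map to "string" via the
// typeMappings.getOrDefault(field.name(), "string") fallback.
Schema schemaWithMetaFields = HoodieAvroUtils.addMetadataFields(HoodieTestDataGenerator.AVRO_SCHEMA);
setupJobconf(jobConf, schemaWithMetaFields);
```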
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]