vinothchandar commented on a change in pull request #956: [HUDI-298] Fix issue 
with incorrect column mapping causing bad data, during on-the-fly merge of 
Real Time tables
URL: https://github.com/apache/incubator-hudi/pull/956#discussion_r335120071
 
 

 ##########
 File path: 
hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java
 ##########
 @@ -328,13 +333,40 @@ private void init() throws IOException {
     writerSchema = addPartitionFields(writerSchema, partitioningFields);
     List<String> projectionFields = 
orderFields(jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR),
         jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR), 
partitioningFields);
+
+    Map<String, Field> schemaFieldsMap = getNameToFieldMap(writerSchema);
+    hiveSchema = constructHiveOrderedSchema(writerSchema, schemaFieldsMap);
     // TODO(vc): In the future, the reader schema should be updated based on 
log files & be able
     // to null out fields not present before
-    readerSchema = generateProjectionSchema(writerSchema, projectionFields);
+
+    readerSchema = generateProjectionSchema(writerSchema, schemaFieldsMap, 
projectionFields);
     LOG.info(String.format("About to read compacted logs %s for base split %s, 
projecting cols %s",
         split.getDeltaFilePaths(), split.getPath(), projectionFields));
   }
 
+  private Schema constructHiveOrderedSchema(Schema writerSchema, Map<String, 
Field> schemaFieldsMap) {
+    String hiveColumnString = jobConf.get("columns");
+    String[] hiveColumns = hiveColumnString.split(",");
+    List<Field> hiveSchemaFields = new ArrayList<>();
+
+    for (String columnName : hiveColumns) {
+      Field field = schemaFieldsMap.get(columnName.toLowerCase());
+
+      if (field != null) {
+        hiveSchemaFields.add(new Schema.Field(field.name(), field.schema(), 
field.doc(), field.defaultValue()));
+      } else {
+        // Hive has some extra virtual columns like 
BLOCK__OFFSET__INSIDE__FILE which do not exist in table schema.
 
 Review comment:
   :) hive

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

Reply via email to