vinothchandar commented on a change in pull request #994: [HUDI-151] Enable HiveOnSpark queries for RT tables URL: https://github.com/apache/incubator-hudi/pull/994#discussion_r344729747
########## File path: hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java ########## @@ -156,24 +158,46 @@ protected static String arrayWritableToString(ArrayWritable writable) { } /** - * Given a comma separated list of field names and positions at which they appear on Hive, return a ordered list of - * field names, that can be passed onto storage. + * Given a comma separated list of field names and positions at which they appear on Hive, return + * an ordered list of field names, that can be passed onto storage. */ private static List<String> orderFields(String fieldNameCsv, String fieldOrderCsv, List<String> partitioningFields) { - - String[] fieldOrders = fieldOrderCsv.split(","); - List<String> fieldNames = Arrays.stream(fieldNameCsv.split(",")).filter(fn -> !partitioningFields.contains(fn)) - .collect(Collectors.toList()); - + // Need to convert the following to Set first since Hive does not handle duplicate field names correctly but + // handles duplicate field orders correctly. 
+ // Fields Orders -> {@link https://github + // .com/apache/hive/blob/f37c5de6c32b9395d1b34fa3c02ed06d1bfbf6eb/serde/src/java + // /org/apache/hadoop/hive/serde2/ColumnProjectionUtils.java#L188} + // Field Names -> {@link https://github.com/apache/hive/blob/f37c5de6c32b9395d1b34fa3c02ed06d1bfbf6eb/serde/src/java + // /org/apache/hadoop/hive/serde2/ColumnProjectionUtils.java#L229} + Set<String> fieldOrdersSet = new HashSet<>(); + String[] fieldOrdersWithDups = fieldOrderCsv.split(","); + List<String> fieldOrdersList = new LinkedList<>(); + for (String fieldOrder : fieldOrdersWithDups) { + if (!fieldOrdersSet.contains(fieldOrder)) { + fieldOrdersList.add(fieldOrder); + } + fieldOrdersSet.add(fieldOrder); + } + String[] fieldOrders = fieldOrdersList.toArray(new String[fieldOrdersList.size()]); + List<String> fieldNames = Arrays.stream(fieldNameCsv.split(",")) + .filter(fn -> !partitioningFields.contains(fn)).collect(Collectors.toList()); + Set<String> fieldNamesSet = new HashSet<>(); + List<String> fieldNamesList = new LinkedList<>(); Review comment: same here. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org With regards, Apache Git Services