jonvex commented on code in PR #13628:
URL: https://github.com/apache/hudi/pull/13628#discussion_r2246206341


##########
hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java:
##########
@@ -1379,6 +1379,88 @@ public static boolean hasListOrMapField(Schema schema) {
     }
   }
 
+  public static void validateRecordsHaveSameData(Object expected, Object 
actual) {
+    validateRecordsHaveSameData(expected, actual, new LinkedList<>());
+  }
+
+  private static void validateRecordsHaveSameData(Object expected, Object 
actual, Deque<String> fieldNames) {
+    if (expected instanceof GenericRecord) {
+      if (!(actual instanceof GenericRecord)) {
+        throw new HoodieAvroSchemaException("Expected record but got " + 
actual.getClass().getName() + " for " + createFullName(fieldNames));
+      }
+      GenericRecord expectedRecord = (GenericRecord) expected;
+      GenericRecord actualRecord = (GenericRecord) actual;
+      // TODO: add this check when schema evolution has the more flexible 
schema comparison
+      // if (!Objects.equals(expectedRecord.getSchema(), 
actualRecord.getSchema())) {
+      //   throw new HoodieAvroSchemaException("Expected record schema " + 
expectedRecord.getSchema() + " but got " + actualRecord.getSchema() + " for " + 
createFullName(fieldNames));
+      // }
+      for (Schema.Field field : expectedRecord.getSchema().getFields()) {
+        fieldNames.push(field.name());
+        validateRecordsHaveSameData(expectedRecord.get(field.name()), 
actualRecord.get(field.name()), fieldNames);
+        fieldNames.pop();
+      }
+    } else if (expected instanceof Collection) {
+      if (!(actual instanceof Collection)) {
+        throw new HoodieAvroSchemaException("Expected collection but got " + 
actual.getClass().getName());
+      }
+      Collection expectedCollection = (Collection) expected;
+      Collection actualCollection = (Collection) actual;
+      if (expectedCollection.size() != actualCollection.size()) {
+        throw new HoodieAvroSchemaException("Expected collection size " + 
expectedCollection.size() + " but got " + actualCollection.size() + " for " + 
createFullName(fieldNames));
+      }
+      Iterator<?> expectedIterator = expectedCollection.iterator();
+      Iterator<?> actualIterator = actualCollection.iterator();
+      fieldNames.push("element");
+      while (expectedIterator.hasNext() && actualIterator.hasNext()) {
+        validateRecordsHaveSameData(expectedIterator.next(), 
actualIterator.next(), fieldNames);
+      }
+      fieldNames.pop();
+    } else if (expected instanceof Map) {
+      if (!(actual instanceof Map)) {
+        throw new HoodieAvroSchemaException("Expected map but got " + 
actual.getClass().getName() + " for " + createFullName(fieldNames));
+      }
+      Map expectedMap = (Map) expected;
+      Map actualMap = (Map) actual;
+      if (expectedMap.size() != actualMap.size()) {
+        throw new HoodieAvroSchemaException("Expected map size " + 
expectedMap.size() + " but got " + actualMap.size() + " for " + 
createFullName(fieldNames));
+      }
+      if (!expectedMap.keySet().equals(actualMap.keySet())) {
+        Set<String> expectedKeys = (Set<String>) 
expectedMap.keySet().stream().map(Object::toString).collect(Collectors.toSet());
+        Set<String> actualKeys = (Set<String>) 
actualMap.keySet().stream().map(Object::toString).collect(Collectors.toSet());
+        if (!expectedKeys.equals(actualKeys)) {

Review Comment:
   Spark uses UTF8 for some reason. 
https://github.com/apache/hudi/blob/fde04da8904452c27c0aa02a4a09f266eb42bfb3/hudi-spark-datasource/hudi-spark3.3.x/src/main/scala/org/apache/spark/sql/avro/AvroSerializer.scala#L160
 I tried to change this, but it caused other tests to fail. It's definitely 
something I would like to explore, but I'm trying not to put bug fixes into 
this PR.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to