liyafan82 commented on a change in pull request #8363:
URL: https://github.com/apache/arrow/pull/8363#discussion_r502369246
##########
File path:
java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java
##########
@@ -115,25 +118,28 @@ public static Field toMemoryFormat(Field field,
BufferAllocator allocator, Map<L
}
ArrowType type;
+ List<Field> fieldChildren;
if (encoding == null) {
type = field.getType();
+ fieldChildren = updatedChildren;
} else {
// re-type the field for in-memory format
type = encoding.getIndexType();
+ fieldChildren = null;
Review comment:
This seems not necessary?
##########
File path:
java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowReaderWriter.java
##########
@@ -305,6 +321,64 @@ public void testWriteReadWithDictionaries() throws
IOException {
}
}
+ @Test
+ public void testWriteReadWithStructDictionaries() throws IOException {
+ DictionaryProvider.MapDictionaryProvider provider = new
DictionaryProvider.MapDictionaryProvider();
+ provider.put(dictionary4);
+
+ final StructVector vector = newVector(StructVector.class, "D4",
MinorType.STRUCT, allocator);
+ final Map<String, List<Integer>> values = new HashMap<>();
+ // Index: 0, 2, 1, 2, 1, 0, 0
+ values.put("a", Arrays.asList(1, 3, 2, 3, 2, 1, 1));
+ values.put("b", Arrays.asList(4, 6, 5, 6, 5, 4, 4));
+ setVector(vector, values);
+ FieldVector encodedVector = (FieldVector) DictionaryEncoder.encode(vector,
dictionary4);
Review comment:
Maybe we need to verify the encoded vector?
##########
File path:
java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowReaderWriter.java
##########
@@ -305,6 +321,64 @@ public void testWriteReadWithDictionaries() throws
IOException {
}
}
+ @Test
+ public void testWriteReadWithStructDictionaries() throws IOException {
+ DictionaryProvider.MapDictionaryProvider provider = new
DictionaryProvider.MapDictionaryProvider();
+ provider.put(dictionary4);
+
+ final StructVector vector = newVector(StructVector.class, "D4",
MinorType.STRUCT, allocator);
+ final Map<String, List<Integer>> values = new HashMap<>();
+ // Index: 0, 2, 1, 2, 1, 0, 0
+ values.put("a", Arrays.asList(1, 3, 2, 3, 2, 1, 1));
+ values.put("b", Arrays.asList(4, 6, 5, 6, 5, 4, 4));
+ setVector(vector, values);
+ FieldVector encodedVector = (FieldVector) DictionaryEncoder.encode(vector,
dictionary4);
+
+ List<Field> fields = Arrays.asList(encodedVector.getField());
+ List<FieldVector> vectors = Collections2.asImmutableList(encodedVector);
+ try (VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors,
encodedVector.getValueCount());
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ ArrowFileWriter writer = new ArrowFileWriter(root, provider,
newChannel(out));) {
+
+ writer.start();
+ writer.writeBatch();
+ writer.end();
+
+ try (SeekableReadChannel channel = new SeekableReadChannel(
+ new ByteArrayReadableSeekableByteChannel(out.toByteArray()));
+ ArrowFileReader reader = new ArrowFileReader(channel, allocator)) {
+ final VectorSchemaRoot readRoot = reader.getVectorSchemaRoot();
+ final Schema readSchema = readRoot.getSchema();
+ assertEquals(root.getSchema(), readSchema);
+ assertEquals(1, reader.getDictionaryBlocks().size());
+ assertEquals(1, reader.getRecordBlocks().size());
+
+ reader.loadNextBatch();
+ assertEquals(1, readRoot.getFieldVectors().size());
+ assertEquals(1, reader.getDictionaryVectors().size());
+
+ final FieldVector readEncoded = readRoot.getVector(0);
+ assertTrue(readEncoded instanceof IntVector);
Review comment:
should we validate `readEncoded`?
##########
File path:
java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowReaderWriter.java
##########
@@ -305,6 +321,64 @@ public void testWriteReadWithDictionaries() throws
IOException {
}
}
+ @Test
+ public void testWriteReadWithStructDictionaries() throws IOException {
+ DictionaryProvider.MapDictionaryProvider provider = new
DictionaryProvider.MapDictionaryProvider();
+ provider.put(dictionary4);
+
+ final StructVector vector = newVector(StructVector.class, "D4",
MinorType.STRUCT, allocator);
+ final Map<String, List<Integer>> values = new HashMap<>();
+ // Index: 0, 2, 1, 2, 1, 0, 0
+ values.put("a", Arrays.asList(1, 3, 2, 3, 2, 1, 1));
+ values.put("b", Arrays.asList(4, 6, 5, 6, 5, 4, 4));
+ setVector(vector, values);
+ FieldVector encodedVector = (FieldVector) DictionaryEncoder.encode(vector,
dictionary4);
+
+ List<Field> fields = Arrays.asList(encodedVector.getField());
+ List<FieldVector> vectors = Collections2.asImmutableList(encodedVector);
+ try (VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors,
encodedVector.getValueCount());
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ ArrowFileWriter writer = new ArrowFileWriter(root, provider,
newChannel(out));) {
+
+ writer.start();
+ writer.writeBatch();
+ writer.end();
+
+ try (SeekableReadChannel channel = new SeekableReadChannel(
+ new ByteArrayReadableSeekableByteChannel(out.toByteArray()));
+ ArrowFileReader reader = new ArrowFileReader(channel, allocator)) {
+ final VectorSchemaRoot readRoot = reader.getVectorSchemaRoot();
+ final Schema readSchema = readRoot.getSchema();
+ assertEquals(root.getSchema(), readSchema);
+ assertEquals(1, reader.getDictionaryBlocks().size());
+ assertEquals(1, reader.getRecordBlocks().size());
+
+ reader.loadNextBatch();
+ assertEquals(1, readRoot.getFieldVectors().size());
+ assertEquals(1, reader.getDictionaryVectors().size());
+
+ final FieldVector readEncoded = readRoot.getVector(0);
+ assertTrue(readEncoded instanceof IntVector);
+ assertEquals(new FieldType(true, MinorType.INT.getType(),
dictionary4.getEncoding()),
+ readEncoded.getField().getFieldType());
+
+ final Map<Long, Dictionary> readDictionaryMap =
reader.getDictionaryVectors();
+ final Dictionary readDictionary =
+
readDictionaryMap.get(readEncoded.getField().getDictionary().getId());
+ assertNotNull(readDictionary);
Review comment:
should we validate the read dictionary vector?
##########
File path:
java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowReaderWriter.java
##########
@@ -305,6 +321,64 @@ public void testWriteReadWithDictionaries() throws
IOException {
}
}
+ @Test
+ public void testWriteReadWithStructDictionaries() throws IOException {
+ DictionaryProvider.MapDictionaryProvider provider = new
DictionaryProvider.MapDictionaryProvider();
+ provider.put(dictionary4);
+
+ final StructVector vector = newVector(StructVector.class, "D4",
MinorType.STRUCT, allocator);
+ final Map<String, List<Integer>> values = new HashMap<>();
+ // Index: 0, 2, 1, 2, 1, 0, 0
+ values.put("a", Arrays.asList(1, 3, 2, 3, 2, 1, 1));
+ values.put("b", Arrays.asList(4, 6, 5, 6, 5, 4, 4));
+ setVector(vector, values);
+ FieldVector encodedVector = (FieldVector) DictionaryEncoder.encode(vector,
dictionary4);
+
+ List<Field> fields = Arrays.asList(encodedVector.getField());
+ List<FieldVector> vectors = Collections2.asImmutableList(encodedVector);
+ try (VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors,
encodedVector.getValueCount());
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ ArrowFileWriter writer = new ArrowFileWriter(root, provider,
newChannel(out));) {
+
+ writer.start();
+ writer.writeBatch();
+ writer.end();
+
+ try (SeekableReadChannel channel = new SeekableReadChannel(
+ new ByteArrayReadableSeekableByteChannel(out.toByteArray()));
+ ArrowFileReader reader = new ArrowFileReader(channel, allocator)) {
+ final VectorSchemaRoot readRoot = reader.getVectorSchemaRoot();
+ final Schema readSchema = readRoot.getSchema();
+ assertEquals(root.getSchema(), readSchema);
+ assertEquals(1, reader.getDictionaryBlocks().size());
+ assertEquals(1, reader.getRecordBlocks().size());
+
+ reader.loadNextBatch();
+ assertEquals(1, readRoot.getFieldVectors().size());
+ assertEquals(1, reader.getDictionaryVectors().size());
+
+ final FieldVector readEncoded = readRoot.getVector(0);
+ assertTrue(readEncoded instanceof IntVector);
+ assertEquals(new FieldType(true, MinorType.INT.getType(),
dictionary4.getEncoding()),
+ readEncoded.getField().getFieldType());
+
+ final Map<Long, Dictionary> readDictionaryMap =
reader.getDictionaryVectors();
+ final Dictionary readDictionary =
+
readDictionaryMap.get(readEncoded.getField().getDictionary().getId());
+ assertNotNull(readDictionary);
+
+ final ValueVector readVector = DictionaryEncoder.decode(readEncoded,
readDictionary);
+ assertTrue(readVector instanceof StructVector);
+ assertEquals(vector.getValueCount(), readVector.getValueCount());
+ for (int i = 0; i < vector.getValueCount(); i++) {
+ assertEquals(vector.getObject(i), readVector.getObject(i));
+ }
Review comment:
Maybe we can compare the two vectors through a `RangeEqualsVisitor`
object?
##########
File path:
java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java
##########
@@ -673,4 +677,28 @@ public static void setVector(FixedSizeListVector vector,
List<Integer>... values
dataVector.setValueCount(curPos);
vector.setValueCount(values.length);
}
+
+ /**
+ * Populate values for {@link StructVector}.
+ */
+ public static void setVector(StructVector vector, Map<String, List<Integer>>
values) {
+ vector.allocateNewSafe();
+
+ int valueCount = 0;
+ for (final Entry<String, List<Integer>> entry : values.entrySet()) {
+ // Add the child
+ final IntVector child = vector.addOrGet(entry.getKey(),
+ FieldType.nullable(MinorType.INT.getType()), IntVector.class);
+
+ // Write the values to the child
+ child.allocateNew();
+ final List<Integer> v = entry.getValue();
+ for (int i = 0; i < v.size(); i++) {
+ child.set(i, v.get(i));
Review comment:
We need to consider the null values in v?
##########
File path:
java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java
##########
@@ -673,4 +677,28 @@ public static void setVector(FixedSizeListVector vector,
List<Integer>... values
dataVector.setValueCount(curPos);
vector.setValueCount(values.length);
}
+
+ /**
+ * Populate values for {@link StructVector}.
+ */
+ public static void setVector(StructVector vector, Map<String, List<Integer>>
values) {
+ vector.allocateNewSafe();
+
+ int valueCount = 0;
+ for (final Entry<String, List<Integer>> entry : values.entrySet()) {
+ // Add the child
+ final IntVector child = vector.addOrGet(entry.getKey(),
+ FieldType.nullable(MinorType.INT.getType()), IntVector.class);
+
+ // Write the values to the child
+ child.allocateNew();
+ final List<Integer> v = entry.getValue();
+ for (int i = 0; i < v.size(); i++) {
+ child.set(i, v.get(i));
+ vector.setIndexDefined(i);
+ }
Review comment:
Maybe we also need to set the value count for the child vector
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]