lidavidm commented on code in PR #38371:
URL: https://github.com/apache/arrow/pull/38371#discussion_r1474437452
##########
java/compression/src/test/java/org/apache/arrow/compression/TestArrowReaderWriterWithCompression.java:
##########
@@ -18,72 +18,247 @@ package org.apache.arrow.compression;
 
 import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
 import java.nio.channels.Channels;
+import java.nio.channels.FileChannel;
+import java.nio.channels.SeekableByteChannel;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Optional;
 
 import org.apache.arrow.memory.BufferAllocator;
 import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.vector.FieldVector;
 import org.apache.arrow.vector.GenerateSampleData;
+import org.apache.arrow.vector.VarCharVector;
 import org.apache.arrow.vector.VectorSchemaRoot;
 import org.apache.arrow.vector.compression.CompressionUtil;
 import org.apache.arrow.vector.compression.NoCompressionCodec;
+import org.apache.arrow.vector.dictionary.Dictionary;
+import org.apache.arrow.vector.dictionary.DictionaryEncoder;
+import org.apache.arrow.vector.dictionary.DictionaryProvider;
 import org.apache.arrow.vector.ipc.ArrowFileReader;
 import org.apache.arrow.vector.ipc.ArrowFileWriter;
+import org.apache.arrow.vector.ipc.ArrowStreamReader;
+import org.apache.arrow.vector.ipc.ArrowStreamWriter;
 import org.apache.arrow.vector.ipc.message.IpcOption;
 import org.apache.arrow.vector.types.pojo.ArrowType;
+import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
 import org.apache.arrow.vector.types.pojo.Field;
 import org.apache.arrow.vector.types.pojo.FieldType;
 import org.apache.arrow.vector.types.pojo.Schema;
 import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel;
+import org.junit.After;
 import org.junit.Assert;
-import org.junit.Test;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
 
 public class TestArrowReaderWriterWithCompression {
 
-  @Test
-  public void testArrowFileZstdRoundTrip() throws Exception {
-    // Prepare sample data
-    final BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE);
+  private BufferAllocator allocator;
+  private ByteArrayOutputStream out;
+  private VectorSchemaRoot root;
+
+  @BeforeEach
+  public void setup() {
+    if (allocator == null) {
+      allocator = new RootAllocator(Integer.MAX_VALUE);
+    }
+    out = new ByteArrayOutputStream();
+    root = null;
+  }
+
+  @After
+  public void tearDown() {
+    if (root != null) {
+      root.close();
+    }
+    if (allocator != null) {
+      allocator.close();
+    }
+    if (out != null) {
+      out.reset();
+    }
+
+  }
+
+  private void createAndWriteArrowFile(DictionaryProvider provider,
+      CompressionUtil.CodecType codecType) throws IOException {
     List<Field> fields = new ArrayList<>();
     fields.add(new Field("col", FieldType.notNullable(new ArrowType.Utf8()), new ArrayList<>()));
-    VectorSchemaRoot root = VectorSchemaRoot.create(new Schema(fields), allocator);
+    root = VectorSchemaRoot.create(new Schema(fields), allocator);
+
     final int rowCount = 10;
     GenerateSampleData.generateTestData(root.getVector(0), rowCount);
     root.setRowCount(rowCount);
 
-    // Write an in-memory compressed arrow file
-    ByteArrayOutputStream out = new ByteArrayOutputStream();
-    try (final ArrowFileWriter writer =
-        new ArrowFileWriter(root, null, Channels.newChannel(out), new HashMap<>(),
-            IpcOption.DEFAULT, CommonsCompressionFactory.INSTANCE, CompressionUtil.CodecType.ZSTD, Optional.of(7))) {
+    try (final ArrowFileWriter writer = new ArrowFileWriter(root, provider, Channels.newChannel(out),
+        new HashMap<>(), IpcOption.DEFAULT, CommonsCompressionFactory.INSTANCE, codecType, Optional.of(7))) {
       writer.start();
       writer.writeBatch();
       writer.end();
     }
+  }
 
-    // Read the in-memory compressed arrow file with CommonsCompressionFactory provided
+  private Dictionary createDictionary(VarCharVector dictionaryVector) {
+    setVector(dictionaryVector,
+        "foo".getBytes(StandardCharsets.UTF_8),
+        "bar".getBytes(StandardCharsets.UTF_8),
+        "baz".getBytes(StandardCharsets.UTF_8));
+
+    return new Dictionary(dictionaryVector,
+        new DictionaryEncoding(/*id=*/1L, /*ordered=*/false, /*indexType=*/null));
+  }
+
+  private VarCharVector createVarCharVector(String name, BufferAllocator allocator) {
+    VarCharVector vector = (VarCharVector) FieldType
+        .nullable(new ArrowType.Utf8()).createNewSingleVector(name, allocator, null);
+    vector.allocateNewSafe();
+    vector.set(0, "foo".getBytes(StandardCharsets.UTF_8));
+    vector.setValueCount(6);
+    return vector;
+  }
+
+  private List<Field> createFields(Dictionary dictionary, BufferAllocator allocator) {
+    VarCharVector vector = createVarCharVector("D1", allocator);
+    FieldVector encodedVector = (FieldVector) DictionaryEncoder.encode(vector, dictionary);
+    vector.close();
+
+    List<Field> fields = new ArrayList<>();
+    fields.add(new Field("col", FieldType.notNullable(new ArrowType.Utf8()), new ArrayList<>()));
+    fields.add(encodedVector.getField());
+
+    return fields;
+  }
+
+  private File writeArrowStream(VectorSchemaRoot root, DictionaryProvider provider,
+      CompressionUtil.CodecType codecType) throws IOException {
+    File tempFile = File.createTempFile("dictionary_compression", ".arrow");

Review Comment:
   Use an in-memory buffer...the IPC layer doesn't care if it's writing to a real file or not.
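To make the suggestion concrete, here is a minimal sketch of an in-memory round trip for the stream path, assuming the helper keeps roughly the shape of `writeArrowStream` above: the IPC stream is written into a `ByteArrayOutputStream` and read back from a `ByteArrayInputStream`, so no temp file, `FileOutputStream`, or `FileChannel` is needed. The class and method names below are illustrative only, and the plain `ArrowStreamWriter` constructor is used for brevity; the compression arguments shown for `ArrowFileWriter` in the diff would be threaded through in the real test.

```java
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.channels.Channels;

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.dictionary.DictionaryProvider;
import org.apache.arrow.vector.ipc.ArrowStreamReader;
import org.apache.arrow.vector.ipc.ArrowStreamWriter;

/** Illustrative sketch only: an in-memory alternative to writing a temp .arrow file. */
final class InMemoryStreamRoundTrip {

  /** Writes the root as an Arrow IPC stream into a byte[] rather than a temp file. */
  static byte[] writeStreamToMemory(VectorSchemaRoot root, DictionaryProvider provider)
      throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    // The compression-enabled writer arguments used for ArrowFileWriter in the diff
    // would be supplied here as well; the plain constructor keeps the sketch short.
    try (ArrowStreamWriter writer =
        new ArrowStreamWriter(root, provider, Channels.newChannel(out))) {
      writer.start();
      writer.writeBatch();
      writer.end();
    }
    return out.toByteArray();
  }

  /** Reads the stream back from the in-memory bytes and returns the total row count. */
  static int readStreamFromMemory(byte[] bytes, BufferAllocator allocator) throws IOException {
    int rows = 0;
    try (ArrowStreamReader reader =
        new ArrowStreamReader(new ByteArrayInputStream(bytes), allocator)) {
      while (reader.loadNextBatch()) {
        rows += reader.getVectorSchemaRoot().getRowCount();
      }
    }
    return rows;
  }
}
```

Because the reader only ever sees a stream of bytes, this exercises the same IPC code path as a reader backed by a real file, which is the point of the comment above.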