lidavidm commented on code in PR #38371:
URL: https://github.com/apache/arrow/pull/38371#discussion_r1474437452
##########
java/compression/src/test/java/org/apache/arrow/compression/TestArrowReaderWriterWithCompression.java:
##########
@@ -18,72 +18,247 @@ package org.apache.arrow.compression;
 
 import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
 import java.nio.channels.Channels;
+import java.nio.channels.FileChannel;
+import java.nio.channels.SeekableByteChannel;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Optional;
 
 import org.apache.arrow.memory.BufferAllocator;
 import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.vector.FieldVector;
 import org.apache.arrow.vector.GenerateSampleData;
+import org.apache.arrow.vector.VarCharVector;
 import org.apache.arrow.vector.VectorSchemaRoot;
 import org.apache.arrow.vector.compression.CompressionUtil;
 import org.apache.arrow.vector.compression.NoCompressionCodec;
+import org.apache.arrow.vector.dictionary.Dictionary;
+import org.apache.arrow.vector.dictionary.DictionaryEncoder;
+import org.apache.arrow.vector.dictionary.DictionaryProvider;
 import org.apache.arrow.vector.ipc.ArrowFileReader;
 import org.apache.arrow.vector.ipc.ArrowFileWriter;
+import org.apache.arrow.vector.ipc.ArrowStreamReader;
+import org.apache.arrow.vector.ipc.ArrowStreamWriter;
 import org.apache.arrow.vector.ipc.message.IpcOption;
 import org.apache.arrow.vector.types.pojo.ArrowType;
+import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
 import org.apache.arrow.vector.types.pojo.Field;
 import org.apache.arrow.vector.types.pojo.FieldType;
 import org.apache.arrow.vector.types.pojo.Schema;
 import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel;
+import org.junit.After;
 import org.junit.Assert;
-import org.junit.Test;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
 
 public class TestArrowReaderWriterWithCompression {
 
-  @Test
-  public void testArrowFileZstdRoundTrip() throws Exception {
-    // Prepare sample data
-    final BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE);
+  private BufferAllocator allocator;
+  private ByteArrayOutputStream out;
+  private VectorSchemaRoot root;
+
+  @BeforeEach
+  public void setup() {
+    if (allocator == null) {
+      allocator = new RootAllocator(Integer.MAX_VALUE);
+    }
+    out = new ByteArrayOutputStream();
+    root = null;
+  }
+
+  @After
+  public void tearDown() {
+    if (root != null) {
+      root.close();
+    }
+    if (allocator != null) {
+      allocator.close();
+    }
+    if (out != null) {
+      out.reset();
+    }
+
+  }
+
+  private void createAndWriteArrowFile(DictionaryProvider provider,
+      CompressionUtil.CodecType codecType) throws IOException {
     List<Field> fields = new ArrayList<>();
     fields.add(new Field("col", FieldType.notNullable(new ArrowType.Utf8()), new ArrayList<>()));
-    VectorSchemaRoot root = VectorSchemaRoot.create(new Schema(fields), allocator);
+    root = VectorSchemaRoot.create(new Schema(fields), allocator);
+
     final int rowCount = 10;
     GenerateSampleData.generateTestData(root.getVector(0), rowCount);
     root.setRowCount(rowCount);
 
-    // Write an in-memory compressed arrow file
-    ByteArrayOutputStream out = new ByteArrayOutputStream();
-    try (final ArrowFileWriter writer =
-        new ArrowFileWriter(root, null, Channels.newChannel(out), new HashMap<>(),
-            IpcOption.DEFAULT, CommonsCompressionFactory.INSTANCE, CompressionUtil.CodecType.ZSTD, Optional.of(7))) {
+    try (final ArrowFileWriter writer = new ArrowFileWriter(root, provider, Channels.newChannel(out),
+        new HashMap<>(), IpcOption.DEFAULT, CommonsCompressionFactory.INSTANCE, codecType, Optional.of(7))) {
       writer.start();
       writer.writeBatch();
       writer.end();
     }
+  }
 
-    // Read the in-memory compressed arrow file with CommonsCompressionFactory provided
+  private Dictionary createDictionary(VarCharVector dictionaryVector) {
+    setVector(dictionaryVector,
+        "foo".getBytes(StandardCharsets.UTF_8),
+        "bar".getBytes(StandardCharsets.UTF_8),
+        "baz".getBytes(StandardCharsets.UTF_8));
+
+    return new Dictionary(dictionaryVector,
+        new DictionaryEncoding(/*id=*/1L, /*ordered=*/false, /*indexType=*/null));
+  }
+
+  private VarCharVector createVarCharVector(String name, BufferAllocator allocator) {
+    VarCharVector vector = (VarCharVector) FieldType
+        .nullable(new ArrowType.Utf8()).createNewSingleVector(name, allocator, null);
+    vector.allocateNewSafe();
+    vector.set(0, "foo".getBytes(StandardCharsets.UTF_8));
+    vector.setValueCount(6);
+    return vector;
+  }
+
+  private List<Field> createFields(Dictionary dictionary, BufferAllocator allocator) {
+    VarCharVector vector = createVarCharVector("D1", allocator);
+    FieldVector encodedVector = (FieldVector) DictionaryEncoder.encode(vector, dictionary);
+    vector.close();
+
+    List<Field> fields = new ArrayList<>();
+    fields.add(new Field("col", FieldType.notNullable(new ArrowType.Utf8()), new ArrayList<>()));
+    fields.add(encodedVector.getField());
+
+    return fields;
+  }
+
+  private File writeArrowStream(VectorSchemaRoot root, DictionaryProvider provider,
+      CompressionUtil.CodecType codecType) throws IOException {
+    File tempFile = File.createTempFile("dictionary_compression", ".arrow");

Review Comment:
   Use an in-memory buffer...the IPC layer doesn't care if it's writing to a real file or not.
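To make the suggestion concrete, here is a minimal sketch of an in-memory round trip for the stream path, assuming the helper keeps roughly the shape of `writeArrowStream` above: the IPC stream is written into a `ByteArrayOutputStream` and read back from a `ByteArrayInputStream`, so no temp file, `FileOutputStream`, or `FileChannel` is needed. The class and method names below are illustrative only, and the plain `ArrowStreamWriter` constructor is used for brevity; the compression arguments shown for `ArrowFileWriter` in the diff would be threaded through in the real test.

```java
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.channels.Channels;

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.dictionary.DictionaryProvider;
import org.apache.arrow.vector.ipc.ArrowStreamReader;
import org.apache.arrow.vector.ipc.ArrowStreamWriter;

/** Illustrative sketch only: an in-memory alternative to writing a temp .arrow file. */
final class InMemoryStreamRoundTrip {

  /** Writes the root as an Arrow IPC stream into a byte[] rather than a temp file. */
  static byte[] writeStreamToMemory(VectorSchemaRoot root, DictionaryProvider provider)
      throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    // The compression-enabled writer arguments used for ArrowFileWriter in the diff
    // would be supplied here as well; the plain constructor keeps the sketch short.
    try (ArrowStreamWriter writer =
        new ArrowStreamWriter(root, provider, Channels.newChannel(out))) {
      writer.start();
      writer.writeBatch();
      writer.end();
    }
    return out.toByteArray();
  }

  /** Reads the stream back from the in-memory bytes and returns the total row count. */
  static int readStreamFromMemory(byte[] bytes, BufferAllocator allocator) throws IOException {
    int rows = 0;
    try (ArrowStreamReader reader =
        new ArrowStreamReader(new ByteArrayInputStream(bytes), allocator)) {
      while (reader.loadNextBatch()) {
        rows += reader.getVectorSchemaRoot().getRowCount();
      }
    }
    return rows;
  }
}
```

Because the reader only ever sees a stream of bytes, this exercises the same IPC code path as a reader backed by a real file, which is the point of the comment above.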