This is an automated email from the ASF dual-hosted git repository.
lidavidm pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-cookbook.git
The following commit(s) were added to refs/heads/main by this push:
new f8e0a56 [Java] Adding examples about Dictionary-encoded Layout (#215)
f8e0a56 is described below
commit f8e0a56bf9fdfdfbabad179e670bd01aa2142c39
Author: david dali susanibar arce <[email protected]>
AuthorDate: Wed May 25 07:19:24 2022 -0500
[Java] Adding examples about Dictionary-encoded Layout (#215)
* Adding examples about Dictionary-encoded Layout
* Apply suggestions from code review
Co-authored-by: David Li <[email protected]>
* Solving issues: variable name, null indexType
* Solving issues: variable name
* Adding dictionary id based on DictionaryEncoding and FieldType
Co-authored-by: David Li <[email protected]>
---
java/source/create.rst | 63 +++++++++++++++++++++++++-
java/source/io.rst | 119 +++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 181 insertions(+), 1 deletion(-)
diff --git a/java/source/create.rst b/java/source/create.rst
index f7680c9..33619fa 100644
--- a/java/source/create.rst
+++ b/java/source/create.rst
@@ -70,6 +70,66 @@ Array of Varchar
[one, two, three]
+Dictionary-Encoded Array of Varchar
+-----------------------------------
+
+In some scenarios, `dictionary-encoding`_ a column is a useful way to save memory.
+
+.. testcode::
+
+ import org.apache.arrow.memory.BufferAllocator;
+ import org.apache.arrow.memory.RootAllocator;
+ import org.apache.arrow.vector.FieldVector;
+ import org.apache.arrow.vector.VarCharVector;
+ import org.apache.arrow.vector.dictionary.Dictionary;
+ import org.apache.arrow.vector.dictionary.DictionaryEncoder;
+ import org.apache.arrow.vector.types.pojo.ArrowType;
+ import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
+
+ import java.nio.charset.StandardCharsets;
+
+ try (BufferAllocator root = new RootAllocator(); // example: build a dictionary vector, then encode a second vector against it
+ VarCharVector countries = new VarCharVector("country-dict", root); // holds the dictionary values
+ VarCharVector appUserCountriesUnencoded = new
VarCharVector("app-use-country-dict", root)
+ ) { // all three resources above are closed automatically when this block exits
+ countries.allocateNew(10); // one slot per value set below (indexes 0-9)
+ countries.set(0, "Andorra".getBytes(StandardCharsets.UTF_8));
+ countries.set(1, "Cuba".getBytes(StandardCharsets.UTF_8));
+ countries.set(2, "Grecia".getBytes(StandardCharsets.UTF_8));
+ countries.set(3, "Guinea".getBytes(StandardCharsets.UTF_8));
+ countries.set(4, "Islandia".getBytes(StandardCharsets.UTF_8));
+ countries.set(5, "Malta".getBytes(StandardCharsets.UTF_8));
+ countries.set(6, "Tailandia".getBytes(StandardCharsets.UTF_8));
+ countries.set(7, "Uganda".getBytes(StandardCharsets.UTF_8));
+ countries.set(8, "Yemen".getBytes(StandardCharsets.UTF_8));
+ countries.set(9, "Zambia".getBytes(StandardCharsets.UTF_8));
+ countries.setValueCount(10); // matches the ten values populated above
+
+ Dictionary countriesDictionary = new Dictionary(countries,
+ new DictionaryEncoding(/*id=*/1L, /*ordered=*/false,
/*indexType=*/new ArrowType.Int(8, true))); // pairs the values vector with encoding id 1 and an Int(8, true) index type
+ System.out.println("Dictionary: " + countriesDictionary);
+
+ appUserCountriesUnencoded.allocateNew(5); // five plain string values follow
+ appUserCountriesUnencoded.set(0,
"Andorra".getBytes(StandardCharsets.UTF_8));
+ appUserCountriesUnencoded.set(1,
"Guinea".getBytes(StandardCharsets.UTF_8));
+ appUserCountriesUnencoded.set(2,
"Islandia".getBytes(StandardCharsets.UTF_8));
+ appUserCountriesUnencoded.set(3,
"Malta".getBytes(StandardCharsets.UTF_8));
+ appUserCountriesUnencoded.set(4,
"Uganda".getBytes(StandardCharsets.UTF_8));
+ appUserCountriesUnencoded.setValueCount(5);
+ System.out.println("Unencoded data: " + appUserCountriesUnencoded);
+
+ try (FieldVector appUserCountriesDictionaryEncoded = (FieldVector)
DictionaryEncoder
+ .encode(appUserCountriesUnencoded, countriesDictionary)) { // result holds each value's position in the dictionary (see expected output: [0, 3, 4, 5, 7])
+ System.out.println("Dictionary-encoded data: " +
appUserCountriesDictionaryEncoded);
+ } // the encoded vector is a separate resource and is closed here
+ }
+
+.. testoutput::
+
+ Dictionary: Dictionary
DictionaryEncoding[id=1,ordered=false,indexType=Int(8, true)] [Andorra, Cuba,
Grecia, Guinea, Islandia, Malta, Tailandia, Uganda, Yemen, Zambia]
+ Unencoded data: [Andorra, Guinea, Islandia, Malta, Uganda]
+ Dictionary-encoded data: [0, 3, 4, 5, 7]
+
Array of List
-------------
@@ -109,4 +169,5 @@ Array of List
[[1,2,3], [10,20,30], [100,200,300], [1000,2000,3000]]
.. _`FieldVector`:
https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/FieldVector.html
-.. _`ValueVector`: https://arrow.apache.org/docs/java/vector.html
\ No newline at end of file
+.. _`ValueVector`: https://arrow.apache.org/docs/java/vector.html
+.. _`dictionary-encoding`:
https://arrow.apache.org/docs/format/Columnar.html#dictionary-encoded-layout
\ No newline at end of file
diff --git a/java/source/io.rst b/java/source/io.rst
index ffa06a3..8157c7c 100644
--- a/java/source/io.rst
+++ b/java/source/io.rst
@@ -443,3 +443,122 @@ Reading Parquet File
********************
Please check :doc:`Dataset <./dataset>`
+
+Handling Data with Dictionaries
+*******************************
+
+Reading and writing dictionary-encoded data requires separately tracking the
dictionaries.
+
+.. testcode::
+
+ import org.apache.arrow.memory.BufferAllocator;
+ import org.apache.arrow.memory.RootAllocator;
+ import org.apache.arrow.vector.FieldVector;
+ import org.apache.arrow.vector.ValueVector;
+ import org.apache.arrow.vector.VarCharVector;
+ import org.apache.arrow.vector.VectorSchemaRoot;
+ import org.apache.arrow.vector.dictionary.Dictionary;
+ import org.apache.arrow.vector.dictionary.DictionaryEncoder;
+ import org.apache.arrow.vector.dictionary.DictionaryProvider;
+ import org.apache.arrow.vector.ipc.ArrowFileReader;
+ import org.apache.arrow.vector.ipc.ArrowFileWriter;
+ import org.apache.arrow.vector.ipc.message.ArrowBlock;
+ import org.apache.arrow.vector.types.Types;
+ import org.apache.arrow.vector.types.pojo.ArrowType;
+ import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
+ import org.apache.arrow.vector.types.pojo.FieldType;
+
+ import java.io.File;
+ import java.io.FileInputStream;
+ import java.io.FileNotFoundException;
+ import java.io.FileOutputStream;
+ import java.io.IOException;
+ import java.nio.charset.StandardCharsets;
+
+ final DictionaryEncoding dictionaryEncoding = new DictionaryEncoding( // example: write dictionary-encoded data to an Arrow file and read it back
+ /*id=*/666L, /*ordered=*/false, /*indexType=*/
+ new ArrowType.Int(8, true)
+ ); // this one encoding object is reused for the field type and the Dictionary below
+ try (BufferAllocator root = new RootAllocator();
+ VarCharVector countries = new VarCharVector("country-dict", root); // holds the dictionary values
+ VarCharVector appUserCountriesUnencoded = new VarCharVector(
+ "app-use-country-dict",
+ new FieldType(true, Types.MinorType.VARCHAR.getType(),
dictionaryEncoding),
+ root)
+ ) { // allocator and both vectors are closed automatically when this block exits
+ countries.allocateNew(10); // one slot per value set below (indexes 0-9)
+ countries.set(0, "Andorra".getBytes(StandardCharsets.UTF_8));
+ countries.set(1, "Cuba".getBytes(StandardCharsets.UTF_8));
+ countries.set(2, "Grecia".getBytes(StandardCharsets.UTF_8));
+ countries.set(3, "Guinea".getBytes(StandardCharsets.UTF_8));
+ countries.set(4, "Islandia".getBytes(StandardCharsets.UTF_8));
+ countries.set(5, "Malta".getBytes(StandardCharsets.UTF_8));
+ countries.set(6, "Tailandia".getBytes(StandardCharsets.UTF_8));
+ countries.set(7, "Uganda".getBytes(StandardCharsets.UTF_8));
+ countries.set(8, "Yemen".getBytes(StandardCharsets.UTF_8));
+ countries.set(9, "Zambia".getBytes(StandardCharsets.UTF_8));
+ countries.setValueCount(10); // matches the ten values populated above
+
+ Dictionary countriesDictionary = new Dictionary(countries,
dictionaryEncoding);
+ System.out.println("Dictionary: " + countriesDictionary);
+
+ appUserCountriesUnencoded.allocateNew(5); // five plain string values follow
+ appUserCountriesUnencoded.set(0,
"Andorra".getBytes(StandardCharsets.UTF_8));
+ appUserCountriesUnencoded.set(1,
"Guinea".getBytes(StandardCharsets.UTF_8));
+ appUserCountriesUnencoded.set(2,
"Islandia".getBytes(StandardCharsets.UTF_8));
+ appUserCountriesUnencoded.set(3,
"Malta".getBytes(StandardCharsets.UTF_8));
+ appUserCountriesUnencoded.set(4,
"Uganda".getBytes(StandardCharsets.UTF_8));
+ appUserCountriesUnencoded.setValueCount(5);
+ System.out.println("Unencoded data: " + appUserCountriesUnencoded);
+
+ File file = new File("random_access_file_with_dictionary.arrow"); // relative path, resolved against the working directory
+ DictionaryProvider.MapDictionaryProvider provider = new
DictionaryProvider.MapDictionaryProvider();
+ provider.put(countriesDictionary); // the provider is handed to the ArrowFileWriter constructor below
+ try (FieldVector appUseCountryDictionaryEncoded = (FieldVector)
DictionaryEncoder
+ .encode(appUserCountriesUnencoded, countriesDictionary);
+ VectorSchemaRoot vectorSchemaRoot =
VectorSchemaRoot.of(appUseCountryDictionaryEncoded);
+ FileOutputStream fileOutputStream = new FileOutputStream(file);
+ ArrowFileWriter writer = new ArrowFileWriter(vectorSchemaRoot,
provider, fileOutputStream.getChannel())
+ ) { // encoded vector, schema root, stream and writer are all closed when this block exits
+ System.out.println("Dictionary-encoded data: "
+appUseCountryDictionaryEncoded);
+ System.out.println("Dictionary-encoded ID: "
+appUseCountryDictionaryEncoded.getField().getDictionary().getId());
+ writer.start();
+ writer.writeBatch();
+ writer.end(); // end() runs before the read-back below, which happens inside this same try block
+ System.out.println("Record batches written: " +
writer.getRecordBlocks().size() + ". Number of rows written: " +
vectorSchemaRoot.getRowCount());
+ try(
+ BufferAllocator rootAllocator = new RootAllocator(); // the reader gets its own allocator, separate from `root`
+ FileInputStream fileInputStream = new FileInputStream(file);
+ ArrowFileReader reader = new
ArrowFileReader(fileInputStream.getChannel(), rootAllocator)
+ ){
+ for (ArrowBlock arrowBlock : reader.getRecordBlocks()) {
+ reader.loadRecordBatch(arrowBlock); // load this block before the reader's schema root is queried below
+ FieldVector appUseCountryDictionaryEncodedRead =
reader.getVectorSchemaRoot().getVector("app-use-country-dict");
+ DictionaryEncoding dictionaryEncodingRead =
appUseCountryDictionaryEncodedRead.getField().getDictionary();
+ System.out.println("Dictionary-encoded ID recovered: " +
dictionaryEncodingRead.getId());
+ Dictionary appUseCountryDictionaryRead =
reader.getDictionaryVectors().get(dictionaryEncodingRead.getId()); // dictionary is looked up by the id taken from the field read back
+ System.out.println("Dictionary-encoded data recovered: " +
appUseCountryDictionaryEncodedRead);
+ System.out.println("Dictionary recovered: " +
appUseCountryDictionaryRead);
+ try (ValueVector readVector =
DictionaryEncoder.decode(appUseCountryDictionaryEncodedRead,
appUseCountryDictionaryRead)) { // decode yields the string values again (see expected output)
+ System.out.println("Decoded data: " + readVector);
+ } // the decoded vector is closed here
+ }
+ }
+ } catch (FileNotFoundException e) { // NOTE(review): subclass of IOException with identical handling; this catch is redundant
+ e.printStackTrace(); // failure is only printed, not rethrown
+ } catch (IOException e) {
+ e.printStackTrace(); // failure is only printed, not rethrown
+ }
+ }
+
+.. testoutput::
+
+ Dictionary: Dictionary
DictionaryEncoding[id=666,ordered=false,indexType=Int(8, true)] [Andorra, Cuba,
Grecia, Guinea, Islandia, Malta, Tailandia, Uganda, Yemen, Zambia]
+ Unencoded data: [Andorra, Guinea, Islandia, Malta, Uganda]
+ Dictionary-encoded data: [0, 3, 4, 5, 7]
+ Dictionary-encoded ID: 666
+ Record batches written: 1. Number of rows written: 5
+ Dictionary-encoded ID recovered: 666
+ Dictionary-encoded data recovered: [0, 3, 4, 5, 7]
+ Dictionary recovered: Dictionary
DictionaryEncoding[id=666,ordered=false,indexType=Int(8, true)] [Andorra, Cuba,
Grecia, Guinea, Islandia, Malta, Tailandia, Uganda, Yemen, Zambia]
+ Decoded data: [Andorra, Guinea, Islandia, Malta, Uganda]