[parquet-cpp] branch master updated: PARQUET-1273: Properly write dictionary values when writing in chunks

uwe Wed, 18 Apr 2018 01:10:17 -0700

This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-cpp.git



The following commit(s) were added to refs/heads/master by this push:
     new 494658a  PARQUET-1273: Properly write dictionary values when writing in chunks
494658a is described below

commit 494658a1f3f411c624dc49f191511febccd0d04c
Author: Joshua Storck <[email protected]>
AuthorDate: Wed Apr 18 10:09:14 2018 +0200

    PARQUET-1273: Properly write dictionary values when writing in chunks
    
    The error was reported here: https://issues.apache.org/jira/browse/ARROW-1938.
    
    Because dictionary types are not supported in writing yet, the code converts the dictionary column to the actual values first before writing. However, the existing code was accidentally using zero as the offset and the length of the column as the size. This resulted in writing all of the column values for each chunk of the column that was supposed to be written.
    
    The fix is to pass the offset and size when recursively calling through to WriteColumnChunk with the "flattened" data.
    
    Author: Joshua Storck <[email protected]>
    
    Closes #453 from joshuastorck/ARROW_1938 and squashes the following commits:
    
    c2af50f [Joshua Storck] Remove extraneous semicolon in unit test
    23f5722 [Joshua Storck] Ran clang-format on arrow-reader-writer-test.cc
    314b159 [Joshua Storck] Removing print statements from AssertTableEqual
    f0bc71a [Joshua Storck] Fixing bug reported in https://issues.apache.org/jira/browse/ARROW-1938, namely preventing all of the values in a dictionary column from being written to parquet for each chunk created as a result of specifying row_group_size
---
 src/parquet/arrow/arrow-reader-writer-test.cc | 63 +++++++++++++++++++++++++++
 src/parquet/arrow/writer.cc                   |  2 +-
 2 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc
index f2402df..93ecd3c 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -1759,6 +1759,69 @@ TEST(TestArrowReadWrite, TableWithDuplicateColumns) {
   CheckSimpleRoundtrip(table, table->num_rows());
 }
 
+TEST(TestArrowReadWrite, DictionaryColumnChunkedWrite) {
+  // This is a regression test for this:
+  //
+  // https://issues.apache.org/jira/browse/ARROW-1938
+  //
+  // As of the writing of this test, columns of type
+  // dictionary are written as their raw/expanded values.
+  // The regression was that the whole column was being
+  // written for each chunk.
+  using ::arrow::ArrayFromVector;
+
+  std::vector<std::string> values = {"first", "second", "third"};
+  auto type = ::arrow::utf8();
+  std::shared_ptr<Array> dict_values;
+  ArrayFromVector<::arrow::StringType, std::string>(values, &dict_values);
+
+  auto dict_type = ::arrow::dictionary(::arrow::int32(), dict_values);
+  auto f0 = field("dictionary", dict_type);
+  std::vector<std::shared_ptr<::arrow::Field>> fields;
+  fields.emplace_back(f0);
+  auto schema = ::arrow::schema(fields);
+
+  std::shared_ptr<Array> f0_values, f1_values;
+  ArrayFromVector<::arrow::Int32Type, int32_t>({0, 1, 0, 2, 1}, &f0_values);
+  ArrayFromVector<::arrow::Int32Type, int32_t>({2, 0, 1, 0, 2}, &f1_values);
+  ::arrow::ArrayVector dict_arrays = {
+      std::make_shared<::arrow::DictionaryArray>(dict_type, f0_values),
+      std::make_shared<::arrow::DictionaryArray>(dict_type, f1_values)};
+
+  std::vector<std::shared_ptr<::arrow::Column>> columns;
+  auto column = MakeColumn("column", dict_arrays, false);
+  columns.emplace_back(column);
+
+  auto table = Table::Make(schema, columns);
+
+  std::shared_ptr<Table> result;
+  DoSimpleRoundtrip(table, 1,
+                    // Just need to make sure that we make
+                    // a chunk size that is smaller than the
+                    // total number of values
+                    2, {}, &result);
+
+  std::vector<std::string> expected_values = {"first",  "second", "first", "third",
+                                              "second", "third",  "first", "second",
+                                              "first",  "third"};
+  columns.clear();
+
+  std::shared_ptr<Array> expected_array;
+  ArrayFromVector<::arrow::StringType, std::string>(expected_values, &expected_array);
+
+  // The column name gets changed on output to the name of the
+  // field, and it also turns into a nullable column
+  columns.emplace_back(MakeColumn("dictionary", expected_array, true));
+
+  fields.clear();
+  fields.emplace_back(::arrow::field("dictionary", ::arrow::utf8()));
+  schema = ::arrow::schema(fields);
+
+  auto expected_table = Table::Make(schema, columns);
+
+  AssertTablesEqual(*expected_table, *result, false);
+}
+
 TEST(TestArrowWrite, CheckChunkSize) {
   const int num_columns = 2;
   const int num_rows = 128;
diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc
index a5dae3c..14e9c6e 100644
--- a/src/parquet/arrow/writer.cc
+++ b/src/parquet/arrow/writer.cc
@@ -969,7 +969,7 @@ class FileWriter::Impl {
       ::arrow::compute::Datum cast_output;
       RETURN_NOT_OK(Cast(&ctx, cast_input, dict_type.dictionary()->type(), CastOptions(),
                          &cast_output));
-      return WriteColumnChunk(cast_output.chunked_array(), 0, data->length());
+      return WriteColumnChunk(cast_output.chunked_array(), offset, size);
     }
 
     ColumnWriter* column_writer;

-- 
To stop receiving notification emails like this one, please contact
[email protected].

[parquet-cpp] branch master updated: PARQUET-1273: Properly write dictionary values when writing in chunks

Reply via email to