This is an automated email from the ASF dual-hosted git repository.
uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-cpp.git
The following commit(s) were added to refs/heads/master by this push:
new 494658a PARQUET-1273: Properly write dictionary values when writing in chunks
494658a is described below
commit 494658a1f3f411c624dc49f191511febccd0d04c
Author: Joshua Storck <[email protected]>
AuthorDate: Wed Apr 18 10:09:14 2018 +0200
PARQUET-1273: Properly write dictionary values when writing in chunks
The error was reported here:
https://issues.apache.org/jira/browse/ARROW-1938.
Because writing dictionary types is not supported yet, the code first
converts the dictionary column to its actual values before writing.
However, the recursive call on the converted data accidentally passed zero
as the offset and the full column length as the size, so every value in the
column was written for each chunk instead of only that chunk's values.
The fix is to pass the offset and size when recursively calling through to
WriteColumnChunk with the "flattened" data.
Author: Joshua Storck <[email protected]>
Closes #453 from joshuastorck/ARROW_1938 and squashes the following commits:
c2af50f [Joshua Storck] Remove extraneous semicolon in unit test
23f5722 [Joshua Storck] Ran clang-format on arrow-reader-writer-test.cc
314b159 [Joshua Storck] Removing print statements from AssertTableEqual
f0bc71a [Joshua Storck] Fix the bug reported in
https://issues.apache.org/jira/browse/ARROW-1938: all of the values in a
dictionary column were being written to parquet for each chunk created as a
result of specifying row_group_size
---
src/parquet/arrow/arrow-reader-writer-test.cc | 63 +++++++++++++++++++++++++++
src/parquet/arrow/writer.cc | 2 +-
2 files changed, 64 insertions(+), 1 deletion(-)
diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc
index f2402df..93ecd3c 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -1759,6 +1759,69 @@ TEST(TestArrowReadWrite, TableWithDuplicateColumns) {
CheckSimpleRoundtrip(table, table->num_rows());
}
+TEST(TestArrowReadWrite, DictionaryColumnChunkedWrite) {
+ // Regression test for https://issues.apache.org/jira/browse/ARROW-1938.
+ //
+ // Builds a single dictionary column made of two 5-value chunks, passes it
+ // through DoSimpleRoundtrip with a chunk size of 2, and checks that the
+ // result is exactly the 10 decoded strings, in order.
+ // As of the writing of this test, columns of type dictionary are written
+ // as their raw/expanded values. The regression was that the whole column
+ // was being written for each chunk.
+ using ::arrow::ArrayFromVector;
+
+ std::vector<std::string> values = {"first", "second", "third"};  // dictionary entries; the int32 indices below are positions in this list
+ auto type = ::arrow::utf8();  // NOTE(review): 'type' is not referenced again in this test
+ std::shared_ptr<Array> dict_values;
+ ArrayFromVector<::arrow::StringType, std::string>(values, &dict_values);
+
+ auto dict_type = ::arrow::dictionary(::arrow::int32(), dict_values);
+ auto f0 = field("dictionary", dict_type);
+ std::vector<std::shared_ptr<::arrow::Field>> fields;
+ fields.emplace_back(f0);
+ auto schema = ::arrow::schema(fields);
+
+ std::shared_ptr<Array> f0_values, f1_values;  // index arrays for the two chunks of the one column (not two fields)
+ ArrayFromVector<::arrow::Int32Type, int32_t>({0, 1, 0, 2, 1}, &f0_values);  // first, second, first, third, second
+ ArrayFromVector<::arrow::Int32Type, int32_t>({2, 0, 1, 0, 2}, &f1_values);  // third, first, second, first, third
+ ::arrow::ArrayVector dict_arrays = {
+ std::make_shared<::arrow::DictionaryArray>(dict_type, f0_values),
+ std::make_shared<::arrow::DictionaryArray>(dict_type, f1_values)};
+
+ std::vector<std::shared_ptr<::arrow::Column>> columns;
+ auto column = MakeColumn("column", dict_arrays, false);  // NOTE(review): 'false' presumably means non-nullable (see the comment before the expected column) — confirm against MakeColumn
+ columns.emplace_back(column);
+
+ auto table = Table::Make(schema, columns);
+
+ std::shared_ptr<Table> result;
+ DoSimpleRoundtrip(table, 1,
+ // Chunk size: it only needs to be smaller than the total number of
+ // values (10) so that the column is written in more than one chunk.
+ // NOTE(review): the meaning of the other arguments (1, {}) is not visible here.
+ 2, {}, &result);
+
+ std::vector<std::string> expected_values = {"first", "second", "first",
"third",
+ "second", "third", "first",
"second",
+ "first", "third"};
+ columns.clear();  // reuse the vector to hold the expected table's column
+
+ std::shared_ptr<Array> expected_array;
+ ArrayFromVector<::arrow::StringType, std::string>(expected_values,
&expected_array);
+
+ // On output the column takes the name of the field ("dictionary" rather
+ // than "column"), and it also turns into a nullable column
+ columns.emplace_back(MakeColumn("dictionary", expected_array, true));
+
+ fields.clear();  // rebuild the schema: the expected field is plain utf8, not dictionary
+ fields.emplace_back(::arrow::field("dictionary", ::arrow::utf8()));
+ schema = ::arrow::schema(fields);
+
+ auto expected_table = Table::Make(schema, columns);
+
+ AssertTablesEqual(*expected_table, *result, false);  // NOTE(review): meaning of the trailing 'false' depends on AssertTablesEqual, which is not visible here
+}
+
TEST(TestArrowWrite, CheckChunkSize) {
const int num_columns = 2;
const int num_rows = 128;
diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc
index a5dae3c..14e9c6e 100644
--- a/src/parquet/arrow/writer.cc
+++ b/src/parquet/arrow/writer.cc
@@ -969,7 +969,7 @@ class FileWriter::Impl {
::arrow::compute::Datum cast_output;
RETURN_NOT_OK(Cast(&ctx, cast_input, dict_type.dictionary()->type(),
CastOptions(),
&cast_output));
- return WriteColumnChunk(cast_output.chunked_array(), 0, data->length());
+ return WriteColumnChunk(cast_output.chunked_array(), offset, size);
}
ColumnWriter* column_writer;
--
To stop receiving notification emails like this one, please contact
[email protected].