[ https://issues.apache.org/jira/browse/PARQUET-1273?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16442096#comment-16442096 ]
ASF GitHub Bot commented on PARQUET-1273: ----------------------------------------- xhochy closed pull request #453: PARQUET-1273: Properly write dictionary values when writing in chunks URL: https://github.com/apache/parquet-cpp/pull/453 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc index 79a393f6..92b67353 100644 --- a/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/src/parquet/arrow/arrow-reader-writer-test.cc @@ -1726,6 +1726,69 @@ TEST(TestArrowReadWrite, TableWithDuplicateColumns) { CheckSimpleRoundtrip(table, table->num_rows()); } +TEST(TestArrowReadWrite, DictionaryColumnChunkedWrite) { + // This is a regression test for this: + // + // https://issues.apache.org/jira/browse/ARROW-1938 + // + // As of the writing of this test, columns of type + // dictionary are written as their raw/expanded values. + // The regression was that the whole column was being + // written for each chunk. 
+ using ::arrow::ArrayFromVector; + + std::vector<std::string> values = {"first", "second", "third"}; + auto type = ::arrow::utf8(); + std::shared_ptr<Array> dict_values; + ArrayFromVector<::arrow::StringType, std::string>(values, &dict_values); + + auto dict_type = ::arrow::dictionary(::arrow::int32(), dict_values); + auto f0 = field("dictionary", dict_type); + std::vector<std::shared_ptr<::arrow::Field>> fields; + fields.emplace_back(f0); + auto schema = ::arrow::schema(fields); + + std::shared_ptr<Array> f0_values, f1_values; + ArrayFromVector<::arrow::Int32Type, int32_t>({0, 1, 0, 2, 1}, &f0_values); + ArrayFromVector<::arrow::Int32Type, int32_t>({2, 0, 1, 0, 2}, &f1_values); + ::arrow::ArrayVector dict_arrays = { + std::make_shared<::arrow::DictionaryArray>(dict_type, f0_values), + std::make_shared<::arrow::DictionaryArray>(dict_type, f1_values)}; + + std::vector<std::shared_ptr<::arrow::Column>> columns; + auto column = MakeColumn("column", dict_arrays, false); + columns.emplace_back(column); + + auto table = Table::Make(schema, columns); + + std::shared_ptr<Table> result; + DoSimpleRoundtrip(table, 1, + // Just need to make sure that we make + // a chunk size that is smaller than the + // total number of values + 2, {}, &result); + + std::vector<std::string> expected_values = {"first", "second", "first", "third", + "second", "third", "first", "second", + "first", "third"}; + columns.clear(); + + std::shared_ptr<Array> expected_array; + ArrayFromVector<::arrow::StringType, std::string>(expected_values, &expected_array); + + // The column name gets changed on output to the name of the + // field, and it also turns into a nullable column + columns.emplace_back(MakeColumn("dictionary", expected_array, true)); + + fields.clear(); + fields.emplace_back(::arrow::field("dictionary", ::arrow::utf8())); + schema = ::arrow::schema(fields); + + auto expected_table = Table::Make(schema, columns); + + AssertTablesEqual(*expected_table, *result, false); +} + 
TEST(TestArrowWrite, CheckChunkSize) { const int num_columns = 2; const int num_rows = 128; diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc index 5040e0cc..ce05ef0b 100644 --- a/src/parquet/arrow/writer.cc +++ b/src/parquet/arrow/writer.cc @@ -962,7 +962,7 @@ class FileWriter::Impl { ::arrow::compute::Datum cast_output; RETURN_NOT_OK(Cast(&ctx, cast_input, dict_type.dictionary()->type(), CastOptions(), &cast_output)); - return WriteColumnChunk(cast_output.chunked_array(), 0, data->length()); + return WriteColumnChunk(cast_output.chunked_array(), offset, size); } ColumnWriter* column_writer; ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [Python] Error writing to partitioned Parquet dataset > ----------------------------------------------------- > > Key: PARQUET-1273 > URL: https://issues.apache.org/jira/browse/PARQUET-1273 > Project: Parquet > Issue Type: Bug > Components: parquet-cpp > Environment: Linux (Ubuntu 16.04) > Reporter: Robert Dailey > Assignee: Joshua Storck > Priority: Major > Labels: pull-request-available > Fix For: cpp-1.5.0 > > Attachments: ARROW-1938-test-data.csv.gz, ARROW-1938.py, > pyarrow_dataset_error.png > > > I receive the following error after upgrading to pyarrow 0.8.0 when writing > to a dataset: > * ArrowIOError: Column 3 had 187374 while previous column had 10000 > The command was: > write_table_values = {'row_group_size': 10000} > pq.write_to_dataset(pa.Table.from_pandas(df, preserve_index=True), > '/logs/parsed/test', partition_cols=['Product', 'year', 'month', 'day', > 'hour'], **write_table_values) > I've also tried write_table_values = {'chunk_size': 10000} and received the > same error. > This same command works in version 0.7.1. 
I am trying to troubleshoot the problem but wanted to submit a ticket. -- This message was sent by Atlassian JIRA (v7.6.3#76005)