[
https://issues.apache.org/jira/browse/PARQUET-1274?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16442273#comment-16442273
]
ASF GitHub Bot commented on PARQUET-1274:
-----------------------------------------
xhochy closed pull request #456: PARQUET-1274: Prevent segfault that was
occurring when writing a nanosecond timestamp with arrow writer properties set
to coerce timestamps and support deprecated int96 timestamps.
URL: https://github.com/apache/parquet-cpp/pull/456
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):
diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc
b/src/parquet/arrow/arrow-reader-writer-test.cc
index 79a393f6..bf6f3022 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -1403,6 +1403,64 @@ TEST(TestArrowReadWrite, ConvertedDateTimeTypes) {
AssertTablesEqual(*ex_table, *result);
}
+// Regression for ARROW-2802
+TEST(TestArrowReadWrite, CoerceTimestampsAndSupportDeprecatedInt96) {
+ using ::arrow::Column;
+ using ::arrow::Field;
+ using ::arrow::Schema;
+ using ::arrow::Table;
+ using ::arrow::TimeUnit;
+ using ::arrow::TimestampType;
+ using ::arrow::TimestampBuilder;
+ using ::arrow::default_memory_pool;
+
+ auto timestamp_type = std::make_shared<TimestampType>(TimeUnit::NANO);
+
+ TimestampBuilder builder(timestamp_type, default_memory_pool());
+ for (std::int64_t ii = 0; ii < 10; ++ii) {
+ ASSERT_OK(builder.Append(1000000000L * ii));
+ }
+ std::shared_ptr<Array> values;
+ ASSERT_OK(builder.Finish(&values));
+
+ std::vector<std::shared_ptr<Field>> fields;
+ auto field = std::make_shared<Field>("nanos", timestamp_type);
+ fields.emplace_back(field);
+
+ auto schema = std::make_shared<Schema>(fields);
+
+ std::vector<std::shared_ptr<Column>> columns;
+ auto column = std::make_shared<Column>("nanos", values);
+ columns.emplace_back(column);
+
+ auto table = Table::Make(schema, columns);
+
+ auto arrow_writer_properties = ArrowWriterProperties::Builder()
+ .coerce_timestamps(TimeUnit::MICRO)
+ ->enable_deprecated_int96_timestamps()
+ ->build();
+
+ std::shared_ptr<Table> result;
+ DoSimpleRoundtrip(table, 1, table->num_rows(), {}, &result,
arrow_writer_properties);
+
+ ASSERT_EQ(table->num_columns(), result->num_columns());
+ ASSERT_EQ(table->num_rows(), result->num_rows());
+
+ auto actual_column = result->column(0);
+ auto data = actual_column->data();
+ auto expected_values =
+
static_cast<::arrow::NumericArray<TimestampType>*>(values.get())->raw_values();
+ for (int ii = 0; ii < data->num_chunks(); ++ii) {
+ auto chunk =
+
static_cast<::arrow::NumericArray<TimestampType>*>(data->chunk(ii).get());
+ auto values = chunk->raw_values();
+ for (int64_t jj = 0; jj < chunk->length(); ++jj, ++expected_values) {
+ // Check that the nanos have been converted to micros
+ ASSERT_EQ(*expected_values / 1000, values[jj]);
+ }
+ }
+}
+
void MakeDoubleTable(int num_columns, int num_rows, int nchunks,
std::shared_ptr<Table>* out) {
std::shared_ptr<::arrow::Column> column;
diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc
index 5040e0cc..9eca41ec 100644
--- a/src/parquet/arrow/writer.cc
+++ b/src/parquet/arrow/writer.cc
@@ -595,7 +595,14 @@ Status ArrowColumnWriter::WriteTimestamps(const Array&
values, int64_t num_level
const bool is_nanosecond = type.unit() == TimeUnit::NANO;
- if (is_nanosecond &&
ctx_->properties->support_deprecated_int96_timestamps()) {
+ // In the case where support_deprecated_int96_timestamps was specified
+ // and coerce_timestamps_enabled was specified, a nanosecond column
+ // will have a physical type of int64. In that case, we fall through
+ // to the else if below.
+ //
+ // See https://issues.apache.org/jira/browse/ARROW-2082
+ if (is_nanosecond && ctx_->properties->support_deprecated_int96_timestamps()
&&
+ !ctx_->properties->coerce_timestamps_enabled()) {
return TypedWriteBatch<Int96Type, ::arrow::TimestampType>(values,
num_levels,
def_levels,
rep_levels);
} else if (is_nanosecond ||
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> [Python] SegFault in pyarrow.parquet.write_table with specific options
> ----------------------------------------------------------------------
>
> Key: PARQUET-1274
> URL: https://issues.apache.org/jira/browse/PARQUET-1274
> Project: Parquet
> Issue Type: Bug
> Components: parquet-cpp
> Environment: tested on MacOS High Sierra with python 3.6 and Ubuntu
> Xenial (Python 3.5)
> Reporter: Clément Bouscasse
> Assignee: Joshua Storck
> Priority: Major
> Labels: pull-request-available
> Fix For: cpp-1.5.0
>
>
> I originally filed an issue in the pandas project but we've tracked it down
> to arrow itself, when called via pandas in specific circumstances:
> [https://github.com/pandas-dev/pandas/issues/19493]
> basically using
> {code:java}
> df.to_parquet('filename.parquet', flavor='spark'){code}
> gives a seg fault if `df` contains a datetime column.
> Under the covers, pandas translates this to the following call:
> {code:java}
> pq.write_table(table, 'output.parquet', flavor='spark', compression='snappy',
> coerce_timestamps='ms')
> {code}
> which gives me an instant crash.
> There is a repro on the github ticket.
>
>
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)