This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 88179b6362 GH-47924: [C++] Fix issues in CSV reader with invalid
inputs (#47925)
88179b6362 is described below
commit 88179b63620cb2780ee8889ff87a15c7003d4b12
Author: Antoine Pitrou <[email protected]>
AuthorDate: Tue Oct 28 10:43:59 2025 +0100
GH-47924: [C++] Fix issues in CSV reader with invalid inputs (#47925)
### Rationale for this change
These issues were all found by OSS-Fuzz:
* https://issues.oss-fuzz.com/issues/452079535
* https://issues.oss-fuzz.com/issues/452079536
* https://issues.oss-fuzz.com/issues/452118314
* https://issues.oss-fuzz.com/issues/452701622
* https://issues.oss-fuzz.com/issues/452912678
### Are these changes tested?
Yes, by additional fuzz regression files.
### Are there any user-facing changes?
No.
**This PR contains a "Critical Fix".** (If the changes fix either (a) a
security vulnerability, (b) a bug that caused incorrect or invalid data to be
produced, or (c) a bug that causes a crash (even when the API contract is
upheld), please provide explanation. If not, you can remove this.)
* GitHub Issue: #47924
Lead-authored-by: Antoine Pitrou <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Co-authored-by: Zehua Zou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/csv/column_builder.cc | 43 +++++++++++++++++++++--------
cpp/src/arrow/csv/fuzz.cc | 22 +++++++++++----
cpp/src/arrow/util/formatting_util_test.cc | 5 ++++
cpp/src/arrow/util/task_group.cc | 7 ++++-
cpp/src/arrow/util/time.h | 34 ++++++++++++++++-------
cpp/src/arrow/util/value_parsing.h | 44 +++++++++++++++++++++---------
cpp/src/arrow/util/value_parsing_test.cc | 15 ++++++++++
testing | 2 +-
8 files changed, 131 insertions(+), 41 deletions(-)
diff --git a/cpp/src/arrow/csv/column_builder.cc
b/cpp/src/arrow/csv/column_builder.cc
index 0b169ebd06..393df18ec4 100644
--- a/cpp/src/arrow/csv/column_builder.cc
+++ b/cpp/src/arrow/csv/column_builder.cc
@@ -19,6 +19,7 @@
#include <cstdint>
#include <memory>
#include <mutex>
+#include <optional>
#include <sstream>
#include <string>
#include <utility>
@@ -82,7 +83,7 @@ class ConcreteColumnBuilder : public ColumnBuilder {
ReserveChunksUnlocked(block_index);
}
- void ReserveChunksUnlocked(int64_t block_index) {
+ virtual void ReserveChunksUnlocked(int64_t block_index) {
// Create a null Array pointer at the back at the list.
size_t chunk_index = static_cast<size_t>(block_index);
if (chunks_.size() <= chunk_index) {
@@ -232,6 +233,7 @@ class InferringColumnBuilder : public ConcreteColumnBuilder
{
Status TryConvertChunk(int64_t chunk_index);
// This must be called unlocked!
void ScheduleConvertChunk(int64_t chunk_index);
+ void ReserveChunksUnlocked(int64_t block_index) override;
// CAUTION: ConvertOptions can grow large (if it customizes hundreds or
// thousands of columns), so avoid copying it in each InferringColumnBuilder.
@@ -243,6 +245,9 @@ class InferringColumnBuilder : public ConcreteColumnBuilder
{
// The parsers corresponding to each chunk (for reconverting)
std::vector<std::shared_ptr<BlockParser>> parsers_;
+
+ // The inference kind for which the current chunks_ were obtained
+ std::vector<std::optional<InferKind>> chunk_kinds_;
};
Status InferringColumnBuilder::Init() { return UpdateType(); }
@@ -261,7 +266,12 @@ Status InferringColumnBuilder::TryConvertChunk(int64_t
chunk_index) {
std::shared_ptr<BlockParser> parser = parsers_[chunk_index];
InferKind kind = infer_status_.kind();
- DCHECK_NE(parser, nullptr);
+ if (chunks_[chunk_index] && chunk_kinds_[chunk_index] == kind) {
+ // Already tried, nothing to do
+ return Status::OK();
+ }
+
+ DCHECK_NE(parser, nullptr) << " for chunk_index " << chunk_index;
lock.unlock();
auto maybe_array = converter->Convert(*parser, col_index_);
@@ -280,34 +290,45 @@ Status InferringColumnBuilder::TryConvertChunk(int64_t
chunk_index) {
// We won't try to reconvert anymore
parsers_[chunk_index].reset();
}
+ chunk_kinds_[chunk_index] = kind;
return SetChunkUnlocked(chunk_index, maybe_array);
}
// Conversion failed, try another type
infer_status_.LoosenType(maybe_array.status());
RETURN_NOT_OK(UpdateType());
+ kind = infer_status_.kind();
// Reconvert past finished chunks
// (unfinished chunks will notice by themselves if they need reconverting)
const auto nchunks = static_cast<int64_t>(chunks_.size());
+ std::vector<int64_t> chunks_to_reconvert;
for (int64_t i = 0; i < nchunks; ++i) {
- if (i != chunk_index && chunks_[i]) {
- // We're assuming the chunk was converted using the wrong type
- // (which should be true unless the executor reorders tasks)
+ if (i != chunk_index && chunks_[i] && chunk_kinds_[i] != kind) {
+ // That chunk was converted using the wrong type
chunks_[i].reset();
- lock.unlock();
- ScheduleConvertChunk(i);
- lock.lock();
+ chunk_kinds_[i].reset();
+ chunks_to_reconvert.push_back(i);
}
}
+ // Reconvert this chunk too
+ chunks_to_reconvert.push_back(chunk_index);
- // Reconvert this chunk
lock.unlock();
- ScheduleConvertChunk(chunk_index);
-
+ for (auto i : chunks_to_reconvert) {
+ ScheduleConvertChunk(i);
+ }
return Status::OK();
}
+void InferringColumnBuilder::ReserveChunksUnlocked(int64_t block_index) {
+ ConcreteColumnBuilder::ReserveChunksUnlocked(block_index);
+ size_t chunk_index = static_cast<size_t>(block_index);
+ if (chunk_kinds_.size() <= chunk_index) {
+ chunk_kinds_.resize(chunk_index + 1);
+ }
+}
+
void InferringColumnBuilder::Insert(int64_t block_index,
const std::shared_ptr<BlockParser>&
parser) {
// Create a slot for the new chunk and spawn a task to convert it
diff --git a/cpp/src/arrow/csv/fuzz.cc b/cpp/src/arrow/csv/fuzz.cc
index 9e500e5281..e745c2c0bd 100644
--- a/cpp/src/arrow/csv/fuzz.cc
+++ b/cpp/src/arrow/csv/fuzz.cc
@@ -17,6 +17,7 @@
#include <cstdint>
#include <memory>
+#include <optional>
#include "arrow/buffer.h"
#include "arrow/csv/reader.h"
@@ -25,10 +26,18 @@
#include "arrow/status.h"
#include "arrow/table.h"
#include "arrow/util/macros.h"
+#include "arrow/util/thread_pool.h"
namespace arrow::csv {
Status FuzzCsvReader(const uint8_t* data, int64_t size) {
+ // Since the Fuzz-allocated data is not owned, any task that outlives the
TableReader
+ // may try to read memory that has been deallocated. Hence we wait for all
pending
+ // tasks to end before leaving.
+ struct TaskGuard {
+ ~TaskGuard() { ::arrow::internal::GetCpuThreadPool()->WaitForIdle(); }
+ };
+
auto io_context = arrow::io::default_io_context();
auto read_options = ReadOptions::Defaults();
@@ -42,11 +51,14 @@ Status FuzzCsvReader(const uint8_t* data, int64_t size) {
std::make_shared<::arrow::io::BufferReader>(std::make_shared<Buffer>(data,
size));
// TODO test other reader types
- ARROW_ASSIGN_OR_RAISE(auto table_reader,
- TableReader::Make(io_context, input_stream,
read_options,
- parse_options, convert_options));
- ARROW_ASSIGN_OR_RAISE(auto table, table_reader->Read());
- RETURN_NOT_OK(table->ValidateFull());
+ {
+ ARROW_ASSIGN_OR_RAISE(auto table_reader,
+ TableReader::Make(io_context, input_stream,
read_options,
+ parse_options, convert_options));
+ TaskGuard task_guard;
+ ARROW_ASSIGN_OR_RAISE(auto table, table_reader->Read());
+ RETURN_NOT_OK(table->ValidateFull());
+ }
return Status::OK();
}
diff --git a/cpp/src/arrow/util/formatting_util_test.cc
b/cpp/src/arrow/util/formatting_util_test.cc
index 457bb4c88d..186e5a8b43 100644
--- a/cpp/src/arrow/util/formatting_util_test.cc
+++ b/cpp/src/arrow/util/formatting_util_test.cc
@@ -16,6 +16,7 @@
// under the License.
#include <cmath>
+#include <limits>
#include <locale>
#include <stdexcept>
#include <string>
@@ -561,6 +562,10 @@ TEST(Formatting, Timestamp) {
"2018-11-13 17:11:10.000000007");
AssertFormatting(formatter, -2203932304LL * 1000000000LL + 8,
"1900-02-28 12:34:56.000000008");
+ AssertFormatting(formatter, std::numeric_limits<int64_t>::min(),
+ "1677-09-21 00:12:43.145224192");
+ AssertFormatting(formatter, std::numeric_limits<int64_t>::max(),
+ "2262-04-11 23:47:16.854775807");
}
{
diff --git a/cpp/src/arrow/util/task_group.cc b/cpp/src/arrow/util/task_group.cc
index 401513028e..27cd1ba397 100644
--- a/cpp/src/arrow/util/task_group.cc
+++ b/cpp/src/arrow/util/task_group.cc
@@ -117,7 +117,12 @@ class ThreadedTaskGroup : public TaskGroup {
}
self->OneTaskDone();
};
- UpdateStatus(executor_->Spawn(std::move(callable)));
+ auto st = executor_->Spawn(std::move(callable));
+ bool spawn_successful = st.ok();
+ UpdateStatus(std::move(st));
+ if (!spawn_successful) {
+ OneTaskDone();
+ }
}
}
diff --git a/cpp/src/arrow/util/time.h b/cpp/src/arrow/util/time.h
index 981eab5967..05d3b85e05 100644
--- a/cpp/src/arrow/util/time.h
+++ b/cpp/src/arrow/util/time.h
@@ -18,10 +18,15 @@
#pragma once
#include <chrono>
+#include <cstdlib>
#include <memory>
+#include <optional>
+#include <type_traits>
#include <utility>
#include "arrow/type_fwd.h"
+#include "arrow/util/int_util_overflow.h"
+#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
@@ -66,17 +71,26 @@ VisitDuration(TimeUnit::type unit, Visitor&& visitor,
Args&&... args) {
return visitor(std::chrono::seconds{}, std::forward<Args>(args)...);
}
-/// Convert a count of seconds to the corresponding count in a different
TimeUnit
-struct CastSecondsToUnitImpl {
- template <typename Duration>
- int64_t operator()(Duration, int64_t seconds) {
- auto duration =
std::chrono::duration_cast<Duration>(std::chrono::seconds{seconds});
- return static_cast<int64_t>(duration.count());
- }
-};
+inline std::optional<int64_t> CastSecondsToUnit(TimeUnit::type unit, int64_t
seconds) {
+ auto cast_seconds_to_unit = [](auto duration,
+ int64_t seconds) -> std::optional<int64_t> {
+ constexpr auto kMultiplier =
static_cast<int64_t>(decltype(duration)::period::den);
+ int64_t out;
+ if (ARROW_PREDICT_FALSE(
+ ::arrow::internal::MultiplyWithOverflow(seconds, kMultiplier,
&out))) {
+ return {};
+ }
+ return out;
+ };
+ return VisitDuration(unit, cast_seconds_to_unit, seconds);
+}
-inline int64_t CastSecondsToUnit(TimeUnit::type unit, int64_t seconds) {
- return VisitDuration(unit, CastSecondsToUnitImpl{}, seconds);
+inline bool CastSecondsToUnit(TimeUnit::type unit, int64_t seconds, int64_t*
out) {
+ auto maybe_value = CastSecondsToUnit(unit, seconds);
+ if (ARROW_PREDICT_TRUE(maybe_value.has_value())) {
+ *out = *maybe_value;
+ }
+ return maybe_value.has_value();
}
} // namespace util
diff --git a/cpp/src/arrow/util/value_parsing.h
b/cpp/src/arrow/util/value_parsing.h
index d20c0d22b9..195cdc843a 100644
--- a/cpp/src/arrow/util/value_parsing.h
+++ b/cpp/src/arrow/util/value_parsing.h
@@ -33,6 +33,7 @@
#include "arrow/util/checked_cast.h"
#include "arrow/util/config.h"
#include "arrow/util/float16.h"
+#include "arrow/util/int_util_overflow.h"
#include "arrow/util/macros.h"
#include "arrow/util/time.h"
#include "arrow/util/visibility.h"
@@ -696,8 +697,7 @@ static inline bool ParseTimestampISO8601(const char* s,
size_t length,
}
if (length == 10) {
- *out = util::CastSecondsToUnit(unit, seconds_since_epoch.count());
- return true;
+ return util::CastSecondsToUnit(unit, seconds_since_epoch.count(), out);
}
if (ARROW_PREDICT_FALSE(s[10] != ' ') && ARROW_PREDICT_FALSE(s[10] != 'T')) {
@@ -768,12 +768,16 @@ static inline bool ParseTimestampISO8601(const char* s,
size_t length,
return false;
}
- seconds_since_epoch += seconds_since_midnight;
- seconds_since_epoch += zone_offset;
+ // Switch to plain integers to take advantage of the overflow arithmetic ops
+ auto count = (seconds_since_midnight + zone_offset).count();
+
+ if (ARROW_PREDICT_FALSE(::arrow::internal::AddWithOverflow(
+ count, seconds_since_epoch.count(), &count))) {
+ return false;
+ }
if (length <= 19) {
- *out = util::CastSecondsToUnit(unit, seconds_since_epoch.count());
- return true;
+ return util::CastSecondsToUnit(unit, count, out);
}
if (ARROW_PREDICT_FALSE(s[19] != '.')) {
@@ -786,7 +790,12 @@ static inline bool ParseTimestampISO8601(const char* s,
size_t length,
return false;
}
- *out = util::CastSecondsToUnit(unit, seconds_since_epoch.count()) +
subseconds;
+ if (ARROW_PREDICT_FALSE(!util::CastSecondsToUnit(unit, count, out))) {
+ return false;
+ }
+ if (ARROW_PREDICT_FALSE(::arrow::internal::AddWithOverflow(*out, subseconds,
out))) {
+ return false;
+ }
return true;
}
@@ -828,8 +837,7 @@ static inline bool ParseTimestampStrptime(const char* buf,
size_t length,
secs -= std::chrono::seconds(result.tm_gmtoff);
#endif
}
- *out = util::CastSecondsToUnit(unit, secs.time_since_epoch().count());
- return true;
+ return util::CastSecondsToUnit(unit, secs.time_since_epoch().count(), out);
}
template <>
@@ -892,13 +900,21 @@ struct StringConverter<TIME_TYPE,
enable_if_time<TIME_TYPE>> {
const auto unit = type.unit();
std::chrono::seconds since_midnight;
+ auto get_seconds_since_midnight = [&](value_type* out) -> bool {
+ int64_t long_out;
+ if (ARROW_PREDICT_FALSE(
+ !util::CastSecondsToUnit(unit, since_midnight.count(),
&long_out))) {
+ return false;
+ }
+ *out = static_cast<value_type>(long_out);
+ return *out == long_out;
+ };
+
if (length == 5) {
if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM(s, &since_midnight))) {
return false;
}
- *out =
- static_cast<value_type>(util::CastSecondsToUnit(unit,
since_midnight.count()));
- return true;
+ return get_seconds_since_midnight(out);
}
if (ARROW_PREDICT_FALSE(length < 8)) {
@@ -908,7 +924,9 @@ struct StringConverter<TIME_TYPE,
enable_if_time<TIME_TYPE>> {
return false;
}
- *out = static_cast<value_type>(util::CastSecondsToUnit(unit,
since_midnight.count()));
+ if (ARROW_PREDICT_FALSE(!get_seconds_since_midnight(out))) {
+ return false;
+ }
if (length == 8) {
return true;
diff --git a/cpp/src/arrow/util/value_parsing_test.cc
b/cpp/src/arrow/util/value_parsing_test.cc
index 03679b85b2..a67f1d97f1 100644
--- a/cpp/src/arrow/util/value_parsing_test.cc
+++ b/cpp/src/arrow/util/value_parsing_test.cc
@@ -16,6 +16,7 @@
// under the License.
#include <cmath>
+#include <limits>
#include <string>
#include <type_traits>
#include <vector>
@@ -814,8 +815,22 @@ TEST(StringConversion, ToTimestampDateTime_ISO8601) {
AssertConversion(type, "1900-02-28 12:34:56.123456789-01:17",
-2203932304000000000LL + 123456789LL + 4620000000000LL);
+ // The theoretical lower bound is "1677-09-21 00:12:43.145224192",
+ // but supporting it would require a bit more care in the timestamp parsing
+ // code.
+ AssertConversion(type, "1677-09-22", -9223286400000000000);
+ AssertConversion(type, "1677-09-22 00:00:00.000000000",
-9223286400000000000);
+ AssertConversion(type, "2262-04-11 23:47:16.854775806",
+ std::numeric_limits<int64_t>::max() - 1);
+ AssertConversion(type, "2262-04-11 23:47:16.854775807",
+ std::numeric_limits<int64_t>::max());
+
// Invalid subseconds
AssertConversionFails(type, "1900-02-28 12:34:56.1234567890");
+ // Out of bounds
+ AssertConversionFails(type, "3989-07-14T11:22:33.000777Z");
+ AssertConversionFails(type, "1677-09-21 00:12:43.145224191");
+ AssertConversionFails(type, "2262-04-11 23:47:16.854775808");
}
}
diff --git a/testing b/testing
index 725fd4a4b1..8da05fdd62 160000
--- a/testing
+++ b/testing
@@ -1 +1 @@
-Subproject commit 725fd4a4b12d01c53c98e80274c0b23aa8397082
+Subproject commit 8da05fdd62a7243ef77aa9757acb62e0586a4d0c