This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 88179b6362 GH-47924: [C++] Fix issues in CSV reader with invalid 
inputs (#47925)
88179b6362 is described below

commit 88179b63620cb2780ee8889ff87a15c7003d4b12
Author: Antoine Pitrou <[email protected]>
AuthorDate: Tue Oct 28 10:43:59 2025 +0100

    GH-47924: [C++] Fix issues in CSV reader with invalid inputs (#47925)
    
    ### Rationale for this change
    
    These issues were all found by OSS-Fuzz:
    * https://issues.oss-fuzz.com/issues/452079535
    * https://issues.oss-fuzz.com/issues/452079536
    * https://issues.oss-fuzz.com/issues/452118314
    * https://issues.oss-fuzz.com/issues/452701622
    * https://issues.oss-fuzz.com/issues/452912678
    
    ### Are these changes tested?
    
    Yes, by additional fuzz regression files.
    
    ### Are there any user-facing changes?
    
    No.
    
    **This PR contains a "Critical Fix".** (If the changes fix either (a) a 
security vulnerability, (b) a bug that caused incorrect or invalid data to be 
produced, or (c) a bug that causes a crash (even when the API contract is 
upheld), please provide explanation. If not, you can remove this.)
    
    * GitHub Issue: #47924
    
    Lead-authored-by: Antoine Pitrou <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Co-authored-by: Zehua Zou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/csv/column_builder.cc        | 43 +++++++++++++++++++++--------
 cpp/src/arrow/csv/fuzz.cc                  | 22 +++++++++++----
 cpp/src/arrow/util/formatting_util_test.cc |  5 ++++
 cpp/src/arrow/util/task_group.cc           |  7 ++++-
 cpp/src/arrow/util/time.h                  | 34 ++++++++++++++++-------
 cpp/src/arrow/util/value_parsing.h         | 44 +++++++++++++++++++++---------
 cpp/src/arrow/util/value_parsing_test.cc   | 15 ++++++++++
 testing                                    |  2 +-
 8 files changed, 131 insertions(+), 41 deletions(-)

diff --git a/cpp/src/arrow/csv/column_builder.cc 
b/cpp/src/arrow/csv/column_builder.cc
index 0b169ebd06..393df18ec4 100644
--- a/cpp/src/arrow/csv/column_builder.cc
+++ b/cpp/src/arrow/csv/column_builder.cc
@@ -19,6 +19,7 @@
 #include <cstdint>
 #include <memory>
 #include <mutex>
+#include <optional>
 #include <sstream>
 #include <string>
 #include <utility>
@@ -82,7 +83,7 @@ class ConcreteColumnBuilder : public ColumnBuilder {
     ReserveChunksUnlocked(block_index);
   }
 
-  void ReserveChunksUnlocked(int64_t block_index) {
+  virtual void ReserveChunksUnlocked(int64_t block_index) {
     // Create a null Array pointer at the back at the list.
     size_t chunk_index = static_cast<size_t>(block_index);
     if (chunks_.size() <= chunk_index) {
@@ -232,6 +233,7 @@ class InferringColumnBuilder : public ConcreteColumnBuilder 
{
   Status TryConvertChunk(int64_t chunk_index);
   // This must be called unlocked!
   void ScheduleConvertChunk(int64_t chunk_index);
+  void ReserveChunksUnlocked(int64_t block_index) override;
 
   // CAUTION: ConvertOptions can grow large (if it customizes hundreds or
   // thousands of columns), so avoid copying it in each InferringColumnBuilder.
@@ -243,6 +245,9 @@ class InferringColumnBuilder : public ConcreteColumnBuilder 
{
 
   // The parsers corresponding to each chunk (for reconverting)
   std::vector<std::shared_ptr<BlockParser>> parsers_;
+
+  // The inference kind for which the current chunks_ were obtained
+  std::vector<std::optional<InferKind>> chunk_kinds_;
 };
 
 Status InferringColumnBuilder::Init() { return UpdateType(); }
@@ -261,7 +266,12 @@ Status InferringColumnBuilder::TryConvertChunk(int64_t 
chunk_index) {
   std::shared_ptr<BlockParser> parser = parsers_[chunk_index];
   InferKind kind = infer_status_.kind();
 
-  DCHECK_NE(parser, nullptr);
+  if (chunks_[chunk_index] && chunk_kinds_[chunk_index] == kind) {
+    // Already tried, nothing to do
+    return Status::OK();
+  }
+
+  DCHECK_NE(parser, nullptr) << " for chunk_index " << chunk_index;
 
   lock.unlock();
   auto maybe_array = converter->Convert(*parser, col_index_);
@@ -280,34 +290,45 @@ Status InferringColumnBuilder::TryConvertChunk(int64_t 
chunk_index) {
       // We won't try to reconvert anymore
       parsers_[chunk_index].reset();
     }
+    chunk_kinds_[chunk_index] = kind;
     return SetChunkUnlocked(chunk_index, maybe_array);
   }
 
   // Conversion failed, try another type
   infer_status_.LoosenType(maybe_array.status());
   RETURN_NOT_OK(UpdateType());
+  kind = infer_status_.kind();
 
   // Reconvert past finished chunks
   // (unfinished chunks will notice by themselves if they need reconverting)
   const auto nchunks = static_cast<int64_t>(chunks_.size());
+  std::vector<int64_t> chunks_to_reconvert;
   for (int64_t i = 0; i < nchunks; ++i) {
-    if (i != chunk_index && chunks_[i]) {
-      // We're assuming the chunk was converted using the wrong type
-      // (which should be true unless the executor reorders tasks)
+    if (i != chunk_index && chunks_[i] && chunk_kinds_[i] != kind) {
+      // That chunk was converted using the wrong type
       chunks_[i].reset();
-      lock.unlock();
-      ScheduleConvertChunk(i);
-      lock.lock();
+      chunk_kinds_[i].reset();
+      chunks_to_reconvert.push_back(i);
     }
   }
+  // Reconvert this chunk too
+  chunks_to_reconvert.push_back(chunk_index);
 
-  // Reconvert this chunk
   lock.unlock();
-  ScheduleConvertChunk(chunk_index);
-
+  for (auto i : chunks_to_reconvert) {
+    ScheduleConvertChunk(i);
+  }
   return Status::OK();
 }
 
+void InferringColumnBuilder::ReserveChunksUnlocked(int64_t block_index) {
+  ConcreteColumnBuilder::ReserveChunksUnlocked(block_index);
+  size_t chunk_index = static_cast<size_t>(block_index);
+  if (chunk_kinds_.size() <= chunk_index) {
+    chunk_kinds_.resize(chunk_index + 1);
+  }
+}
+
 void InferringColumnBuilder::Insert(int64_t block_index,
                                     const std::shared_ptr<BlockParser>& 
parser) {
   // Create a slot for the new chunk and spawn a task to convert it
diff --git a/cpp/src/arrow/csv/fuzz.cc b/cpp/src/arrow/csv/fuzz.cc
index 9e500e5281..e745c2c0bd 100644
--- a/cpp/src/arrow/csv/fuzz.cc
+++ b/cpp/src/arrow/csv/fuzz.cc
@@ -17,6 +17,7 @@
 
 #include <cstdint>
 #include <memory>
+#include <optional>
 
 #include "arrow/buffer.h"
 #include "arrow/csv/reader.h"
@@ -25,10 +26,18 @@
 #include "arrow/status.h"
 #include "arrow/table.h"
 #include "arrow/util/macros.h"
+#include "arrow/util/thread_pool.h"
 
 namespace arrow::csv {
 
 Status FuzzCsvReader(const uint8_t* data, int64_t size) {
+  // Since the Fuzz-allocated data is not owned, any task that outlives the 
TableReader
+  // may try to read memory that has been deallocated. Hence we wait for all 
pending
+  // tasks to end before leaving.
+  struct TaskGuard {
+    ~TaskGuard() { ::arrow::internal::GetCpuThreadPool()->WaitForIdle(); }
+  };
+
   auto io_context = arrow::io::default_io_context();
 
   auto read_options = ReadOptions::Defaults();
@@ -42,11 +51,14 @@ Status FuzzCsvReader(const uint8_t* data, int64_t size) {
       
std::make_shared<::arrow::io::BufferReader>(std::make_shared<Buffer>(data, 
size));
 
   // TODO test other reader types
-  ARROW_ASSIGN_OR_RAISE(auto table_reader,
-                        TableReader::Make(io_context, input_stream, 
read_options,
-                                          parse_options, convert_options));
-  ARROW_ASSIGN_OR_RAISE(auto table, table_reader->Read());
-  RETURN_NOT_OK(table->ValidateFull());
+  {
+    ARROW_ASSIGN_OR_RAISE(auto table_reader,
+                          TableReader::Make(io_context, input_stream, 
read_options,
+                                            parse_options, convert_options));
+    TaskGuard task_guard;
+    ARROW_ASSIGN_OR_RAISE(auto table, table_reader->Read());
+    RETURN_NOT_OK(table->ValidateFull());
+  }
   return Status::OK();
 }
 
diff --git a/cpp/src/arrow/util/formatting_util_test.cc 
b/cpp/src/arrow/util/formatting_util_test.cc
index 457bb4c88d..186e5a8b43 100644
--- a/cpp/src/arrow/util/formatting_util_test.cc
+++ b/cpp/src/arrow/util/formatting_util_test.cc
@@ -16,6 +16,7 @@
 // under the License.
 
 #include <cmath>
+#include <limits>
 #include <locale>
 #include <stdexcept>
 #include <string>
@@ -561,6 +562,10 @@ TEST(Formatting, Timestamp) {
                      "2018-11-13 17:11:10.000000007");
     AssertFormatting(formatter, -2203932304LL * 1000000000LL + 8,
                      "1900-02-28 12:34:56.000000008");
+    AssertFormatting(formatter, std::numeric_limits<int64_t>::min(),
+                     "1677-09-21 00:12:43.145224192");
+    AssertFormatting(formatter, std::numeric_limits<int64_t>::max(),
+                     "2262-04-11 23:47:16.854775807");
   }
 
   {
diff --git a/cpp/src/arrow/util/task_group.cc b/cpp/src/arrow/util/task_group.cc
index 401513028e..27cd1ba397 100644
--- a/cpp/src/arrow/util/task_group.cc
+++ b/cpp/src/arrow/util/task_group.cc
@@ -117,7 +117,12 @@ class ThreadedTaskGroup : public TaskGroup {
         }
         self->OneTaskDone();
       };
-      UpdateStatus(executor_->Spawn(std::move(callable)));
+      auto st = executor_->Spawn(std::move(callable));
+      bool spawn_successful = st.ok();
+      UpdateStatus(std::move(st));
+      if (!spawn_successful) {
+        OneTaskDone();
+      }
     }
   }
 
diff --git a/cpp/src/arrow/util/time.h b/cpp/src/arrow/util/time.h
index 981eab5967..05d3b85e05 100644
--- a/cpp/src/arrow/util/time.h
+++ b/cpp/src/arrow/util/time.h
@@ -18,10 +18,15 @@
 #pragma once
 
 #include <chrono>
+#include <cstdlib>
 #include <memory>
+#include <optional>
+#include <type_traits>
 #include <utility>
 
 #include "arrow/type_fwd.h"
+#include "arrow/util/int_util_overflow.h"
+#include "arrow/util/macros.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
@@ -66,17 +71,26 @@ VisitDuration(TimeUnit::type unit, Visitor&& visitor, 
Args&&... args) {
   return visitor(std::chrono::seconds{}, std::forward<Args>(args)...);
 }
 
-/// Convert a count of seconds to the corresponding count in a different 
TimeUnit
-struct CastSecondsToUnitImpl {
-  template <typename Duration>
-  int64_t operator()(Duration, int64_t seconds) {
-    auto duration = 
std::chrono::duration_cast<Duration>(std::chrono::seconds{seconds});
-    return static_cast<int64_t>(duration.count());
-  }
-};
+inline std::optional<int64_t> CastSecondsToUnit(TimeUnit::type unit, int64_t 
seconds) {
+  auto cast_seconds_to_unit = [](auto duration,
+                                 int64_t seconds) -> std::optional<int64_t> {
+    constexpr auto kMultiplier = 
static_cast<int64_t>(decltype(duration)::period::den);
+    int64_t out;
+    if (ARROW_PREDICT_FALSE(
+            ::arrow::internal::MultiplyWithOverflow(seconds, kMultiplier, 
&out))) {
+      return {};
+    }
+    return out;
+  };
+  return VisitDuration(unit, cast_seconds_to_unit, seconds);
+}
 
-inline int64_t CastSecondsToUnit(TimeUnit::type unit, int64_t seconds) {
-  return VisitDuration(unit, CastSecondsToUnitImpl{}, seconds);
+inline bool CastSecondsToUnit(TimeUnit::type unit, int64_t seconds, int64_t* 
out) {
+  auto maybe_value = CastSecondsToUnit(unit, seconds);
+  if (ARROW_PREDICT_TRUE(maybe_value.has_value())) {
+    *out = *maybe_value;
+  }
+  return maybe_value.has_value();
 }
 
 }  // namespace util
diff --git a/cpp/src/arrow/util/value_parsing.h 
b/cpp/src/arrow/util/value_parsing.h
index d20c0d22b9..195cdc843a 100644
--- a/cpp/src/arrow/util/value_parsing.h
+++ b/cpp/src/arrow/util/value_parsing.h
@@ -33,6 +33,7 @@
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/config.h"
 #include "arrow/util/float16.h"
+#include "arrow/util/int_util_overflow.h"
 #include "arrow/util/macros.h"
 #include "arrow/util/time.h"
 #include "arrow/util/visibility.h"
@@ -696,8 +697,7 @@ static inline bool ParseTimestampISO8601(const char* s, 
size_t length,
   }
 
   if (length == 10) {
-    *out = util::CastSecondsToUnit(unit, seconds_since_epoch.count());
-    return true;
+    return util::CastSecondsToUnit(unit, seconds_since_epoch.count(), out);
   }
 
   if (ARROW_PREDICT_FALSE(s[10] != ' ') && ARROW_PREDICT_FALSE(s[10] != 'T')) {
@@ -768,12 +768,16 @@ static inline bool ParseTimestampISO8601(const char* s, 
size_t length,
       return false;
   }
 
-  seconds_since_epoch += seconds_since_midnight;
-  seconds_since_epoch += zone_offset;
+  // Switch to plain integers to take advantage of the overflow arithmetic ops
+  auto count = (seconds_since_midnight + zone_offset).count();
+
+  if (ARROW_PREDICT_FALSE(::arrow::internal::AddWithOverflow(
+          count, seconds_since_epoch.count(), &count))) {
+    return false;
+  }
 
   if (length <= 19) {
-    *out = util::CastSecondsToUnit(unit, seconds_since_epoch.count());
-    return true;
+    return util::CastSecondsToUnit(unit, count, out);
   }
 
   if (ARROW_PREDICT_FALSE(s[19] != '.')) {
@@ -786,7 +790,12 @@ static inline bool ParseTimestampISO8601(const char* s, 
size_t length,
     return false;
   }
 
-  *out = util::CastSecondsToUnit(unit, seconds_since_epoch.count()) + 
subseconds;
+  if (ARROW_PREDICT_FALSE(!util::CastSecondsToUnit(unit, count, out))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(::arrow::internal::AddWithOverflow(*out, subseconds, 
out))) {
+    return false;
+  }
   return true;
 }
 
@@ -828,8 +837,7 @@ static inline bool ParseTimestampStrptime(const char* buf, 
size_t length,
     secs -= std::chrono::seconds(result.tm_gmtoff);
 #endif
   }
-  *out = util::CastSecondsToUnit(unit, secs.time_since_epoch().count());
-  return true;
+  return util::CastSecondsToUnit(unit, secs.time_since_epoch().count(), out);
 }
 
 template <>
@@ -892,13 +900,21 @@ struct StringConverter<TIME_TYPE, 
enable_if_time<TIME_TYPE>> {
     const auto unit = type.unit();
     std::chrono::seconds since_midnight;
 
+    auto get_seconds_since_midnight = [&](value_type* out) -> bool {
+      int64_t long_out;
+      if (ARROW_PREDICT_FALSE(
+              !util::CastSecondsToUnit(unit, since_midnight.count(), 
&long_out))) {
+        return false;
+      }
+      *out = static_cast<value_type>(long_out);
+      return *out == long_out;
+    };
+
     if (length == 5) {
       if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM(s, &since_midnight))) {
         return false;
       }
-      *out =
-          static_cast<value_type>(util::CastSecondsToUnit(unit, 
since_midnight.count()));
-      return true;
+      return get_seconds_since_midnight(out);
     }
 
     if (ARROW_PREDICT_FALSE(length < 8)) {
@@ -908,7 +924,9 @@ struct StringConverter<TIME_TYPE, 
enable_if_time<TIME_TYPE>> {
       return false;
     }
 
-    *out = static_cast<value_type>(util::CastSecondsToUnit(unit, 
since_midnight.count()));
+    if (ARROW_PREDICT_FALSE(!get_seconds_since_midnight(out))) {
+      return false;
+    }
 
     if (length == 8) {
       return true;
diff --git a/cpp/src/arrow/util/value_parsing_test.cc 
b/cpp/src/arrow/util/value_parsing_test.cc
index 03679b85b2..a67f1d97f1 100644
--- a/cpp/src/arrow/util/value_parsing_test.cc
+++ b/cpp/src/arrow/util/value_parsing_test.cc
@@ -16,6 +16,7 @@
 // under the License.
 
 #include <cmath>
+#include <limits>
 #include <string>
 #include <type_traits>
 #include <vector>
@@ -814,8 +815,22 @@ TEST(StringConversion, ToTimestampDateTime_ISO8601) {
     AssertConversion(type, "1900-02-28 12:34:56.123456789-01:17",
                      -2203932304000000000LL + 123456789LL + 4620000000000LL);
 
+    // The theoretical lower bound is "1677-09-21 00:12:43.145224192",
+    // but supporting it would require a bit more care in the timestamp parsing
+    // code.
+    AssertConversion(type, "1677-09-22", -9223286400000000000);
+    AssertConversion(type, "1677-09-22 00:00:00.000000000", 
-9223286400000000000);
+    AssertConversion(type, "2262-04-11 23:47:16.854775806",
+                     std::numeric_limits<int64_t>::max() - 1);
+    AssertConversion(type, "2262-04-11 23:47:16.854775807",
+                     std::numeric_limits<int64_t>::max());
+
     // Invalid subseconds
     AssertConversionFails(type, "1900-02-28 12:34:56.1234567890");
+    // Out of bounds
+    AssertConversionFails(type, "3989-07-14T11:22:33.000777Z");
+    AssertConversionFails(type, "1677-09-21 00:12:43.145224191");
+    AssertConversionFails(type, "2262-04-11 23:47:16.854775808");
   }
 }
 
diff --git a/testing b/testing
index 725fd4a4b1..8da05fdd62 160000
--- a/testing
+++ b/testing
@@ -1 +1 @@
-Subproject commit 725fd4a4b12d01c53c98e80274c0b23aa8397082
+Subproject commit 8da05fdd62a7243ef77aa9757acb62e0586a4d0c

Reply via email to