This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new c6090ed5d7 GH-49069: [C++] Share Trie instances across CSV value
decoders (#49070)
c6090ed5d7 is described below
commit c6090ed5d7047b97c6cabbd8e7728183a549409c
Author: Hyukjin Kwon <[email protected]>
AuthorDate: Fri Jan 30 20:46:59 2026 +0900
GH-49069: [C++] Share Trie instances across CSV value decoders (#49070)
### Rationale for this change
The CSV converter was building identical Trie data structures (for
null/true/false values) in every decoder instance, causing duplicate memory
allocation and initialization overhead.
### What changes are included in this PR?
- Introduced `TrieCache` struct to hold shared Trie instances (null_trie,
true_trie, false_trie)
- Updated `ValueDecoder` and all decoder subclasses to accept and reference
a shared `TrieCache` instead of building their own Tries
- Updated `Converter` base class to create one `TrieCache` per converter
and pass it to all decoders
### Are these changes tested?
Yes, all existing tests. I ran a simple benchmark showing roughly 2-4%
faster converter creation, and obviously less memory usage.
### Are there any user-facing changes?
No.
* GitHub Issue: #49069
Authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
cpp/src/arrow/csv/converter.cc | 107 ++++++++++++++++++++++++-----------------
cpp/src/arrow/csv/converter.h | 3 ++
2 files changed, 66 insertions(+), 44 deletions(-)
diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc
index ec31d4b1ce..bb59d02cd2 100644
--- a/cpp/src/arrow/csv/converter.cc
+++ b/cpp/src/arrow/csv/converter.cc
@@ -105,31 +105,45 @@ Status PresizeBuilder(const BlockParser& parser,
BuilderType* builder) {
}
}
+/////////////////////////////////////////////////////////////////////////
+// Shared Tries cache to avoid rebuilding them for each decoder instance
+
+struct TrieCache {
+ Trie null_trie;
+ Trie true_trie;
+ Trie false_trie;
+
+ static Result<std::shared_ptr<TrieCache>> Make(const ConvertOptions&
options) {
+ auto cache = std::make_shared<TrieCache>();
+ RETURN_NOT_OK(InitializeTrie(options.null_values, &cache->null_trie));
+ RETURN_NOT_OK(InitializeTrie(options.true_values, &cache->true_trie));
+ RETURN_NOT_OK(InitializeTrie(options.false_values, &cache->false_trie));
+ return cache;
+ }
+};
+
/////////////////////////////////////////////////////////////////////////
// Per-type value decoders
struct ValueDecoder {
explicit ValueDecoder(const std::shared_ptr<DataType>& type,
- const ConvertOptions& options)
- : type_(type), options_(options) {}
+ const ConvertOptions& options, const TrieCache*
trie_cache)
+ : type_(type), options_(options), trie_cache_(trie_cache) {}
- Status Initialize() {
- // TODO no need to build a separate Trie for each instance
- return InitializeTrie(options_.null_values, &null_trie_);
- }
+ Status Initialize() { return Status::OK(); }
bool IsNull(const uint8_t* data, uint32_t size, bool quoted) {
if (quoted && !options_.quoted_strings_can_be_null) {
return false;
}
- return null_trie_.Find(std::string_view(reinterpret_cast<const
char*>(data), size)) >=
- 0;
+ return trie_cache_->null_trie.Find(
+ std::string_view(reinterpret_cast<const char*>(data), size)) >=
0;
}
protected:
- Trie null_trie_;
const std::shared_ptr<DataType> type_;
const ConvertOptions& options_;
+ const TrieCache* trie_cache_;
};
//
@@ -140,8 +154,9 @@ struct FixedSizeBinaryValueDecoder : public ValueDecoder {
using value_type = const uint8_t*;
explicit FixedSizeBinaryValueDecoder(const std::shared_ptr<DataType>& type,
- const ConvertOptions& options)
- : ValueDecoder(type, options),
+ const ConvertOptions& options,
+ const TrieCache* trie_cache)
+ : ValueDecoder(type, options, trie_cache),
byte_width_(checked_cast<const
FixedSizeBinaryType&>(*type).byte_width()) {}
Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type*
out) {
@@ -207,8 +222,8 @@ struct NumericValueDecoder : public ValueDecoder {
using value_type = typename T::c_type;
NumericValueDecoder(const std::shared_ptr<DataType>& type,
- const ConvertOptions& options)
- : ValueDecoder(type, options),
+ const ConvertOptions& options, const TrieCache*
trie_cache)
+ : ValueDecoder(type, options, trie_cache),
concrete_type_(checked_cast<const T&>(*type)),
string_converter_(MakeStringConverter<T>(options)) {}
@@ -236,31 +251,20 @@ struct BooleanValueDecoder : public ValueDecoder {
using ValueDecoder::ValueDecoder;
- Status Initialize() {
- // TODO no need to build separate Tries for each instance
- RETURN_NOT_OK(InitializeTrie(options_.true_values, &true_trie_));
- RETURN_NOT_OK(InitializeTrie(options_.false_values, &false_trie_));
- return ValueDecoder::Initialize();
- }
-
Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type*
out) {
// XXX should quoted values be allowed at all?
- if (false_trie_.Find(std::string_view(reinterpret_cast<const char*>(data),
size)) >=
- 0) {
+ if (trie_cache_->false_trie.Find(
+ std::string_view(reinterpret_cast<const char*>(data), size)) >= 0)
{
*out = false;
return Status::OK();
}
- if (ARROW_PREDICT_TRUE(true_trie_.Find(std::string_view(
+ if (ARROW_PREDICT_TRUE(trie_cache_->true_trie.Find(std::string_view(
reinterpret_cast<const char*>(data), size)) >=
0)) {
*out = true;
return Status::OK();
}
return GenericConversionError(type_, data, size);
}
-
- protected:
- Trie true_trie_;
- Trie false_trie_;
};
//
@@ -271,8 +275,8 @@ struct DecimalValueDecoder : public ValueDecoder {
using value_type = Decimal128;
explicit DecimalValueDecoder(const std::shared_ptr<DataType>& type,
- const ConvertOptions& options)
- : ValueDecoder(type, options),
+ const ConvertOptions& options, const TrieCache*
trie_cache)
+ : ValueDecoder(type, options, trie_cache),
decimal_type_(internal::checked_cast<const DecimalType&>(*type_)),
type_precision_(decimal_type_.precision()),
type_scale_(decimal_type_.scale()) {}
@@ -310,8 +314,10 @@ struct CustomDecimalPointValueDecoder : public
ValueDecoder {
using value_type = typename WrappedDecoder::value_type;
explicit CustomDecimalPointValueDecoder(const std::shared_ptr<DataType>&
type,
- const ConvertOptions& options)
- : ValueDecoder(type, options), wrapped_decoder_(type, options) {}
+ const ConvertOptions& options,
+ const TrieCache* trie_cache)
+ : ValueDecoder(type, options, trie_cache),
+ wrapped_decoder_(type, options, trie_cache) {}
Status Initialize() {
RETURN_NOT_OK(wrapped_decoder_.Initialize());
@@ -321,7 +327,7 @@ struct CustomDecimalPointValueDecoder : public ValueDecoder
{
mapping_[options_.decimal_point] = '.';
mapping_['.'] = options_.decimal_point; // error out on standard decimal
point
temp_.resize(30);
- return Status::OK();
+ return ValueDecoder::Initialize();
}
Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type*
out) {
@@ -357,8 +363,9 @@ struct InlineISO8601ValueDecoder : public ValueDecoder {
using value_type = int64_t;
explicit InlineISO8601ValueDecoder(const std::shared_ptr<DataType>& type,
- const ConvertOptions& options)
- : ValueDecoder(type, options),
+ const ConvertOptions& options,
+ const TrieCache* trie_cache)
+ : ValueDecoder(type, options, trie_cache),
unit_(checked_cast<const TimestampType&>(*type_).unit()),
expect_timezone_(!checked_cast<const
TimestampType&>(*type_).timezone().empty()) {
}
@@ -396,8 +403,9 @@ struct SingleParserTimestampValueDecoder : public
ValueDecoder {
using value_type = int64_t;
explicit SingleParserTimestampValueDecoder(const std::shared_ptr<DataType>&
type,
- const ConvertOptions& options)
- : ValueDecoder(type, options),
+ const ConvertOptions& options,
+ const TrieCache* trie_cache)
+ : ValueDecoder(type, options, trie_cache),
unit_(checked_cast<const TimestampType&>(*type_).unit()),
expect_timezone_(!checked_cast<const
TimestampType&>(*type_).timezone().empty()),
parser_(*options_.timestamp_parsers[0]) {}
@@ -436,8 +444,9 @@ struct MultipleParsersTimestampValueDecoder : public
ValueDecoder {
using value_type = int64_t;
explicit MultipleParsersTimestampValueDecoder(const
std::shared_ptr<DataType>& type,
- const ConvertOptions& options)
- : ValueDecoder(type, options),
+ const ConvertOptions& options,
+ const TrieCache* trie_cache)
+ : ValueDecoder(type, options, trie_cache),
unit_(checked_cast<const TimestampType&>(*type_).unit()),
expect_timezone_(!checked_cast<const
TimestampType&>(*type_).timezone().empty()),
parsers_(GetParsers(options_)) {}
@@ -477,8 +486,9 @@ struct DurationValueDecoder : public ValueDecoder {
using value_type = int64_t;
explicit DurationValueDecoder(const std::shared_ptr<DataType>& type,
- const ConvertOptions& options)
- : ValueDecoder(type, options),
+ const ConvertOptions& options,
+ const TrieCache* trie_cache)
+ : ValueDecoder(type, options, trie_cache),
concrete_type_(checked_cast<const DurationType&>(*type)),
string_converter_() {}
@@ -517,7 +527,8 @@ class NullConverter : public ConcreteConverter {
public:
NullConverter(const std::shared_ptr<DataType>& type, const ConvertOptions&
options,
MemoryPool* pool)
- : ConcreteConverter(type, options, pool), decoder_(type_, options_) {}
+ : ConcreteConverter(type, options, pool),
+ decoder_(type_, options_, static_cast<const
TrieCache*>(trie_cache_.get())) {}
Result<std::shared_ptr<Array>> Convert(const BlockParser& parser,
int32_t col_index) override {
@@ -551,7 +562,8 @@ class PrimitiveConverter : public ConcreteConverter {
public:
PrimitiveConverter(const std::shared_ptr<DataType>& type, const
ConvertOptions& options,
MemoryPool* pool)
- : ConcreteConverter(type, options, pool), decoder_(type_, options_) {}
+ : ConcreteConverter(type, options, pool),
+ decoder_(type_, options_, static_cast<const
TrieCache*>(trie_cache_.get())) {}
Result<std::shared_ptr<Array>> Convert(const BlockParser& parser,
int32_t col_index) override {
@@ -593,7 +605,8 @@ class TypedDictionaryConverter : public
ConcreteDictionaryConverter {
TypedDictionaryConverter(const std::shared_ptr<DataType>& value_type,
const ConvertOptions& options, MemoryPool* pool)
: ConcreteDictionaryConverter(value_type, options, pool),
- decoder_(value_type, options_) {}
+ decoder_(value_type, options_, static_cast<const
TrieCache*>(trie_cache_.get())) {
+ }
Result<std::shared_ptr<Array>> Convert(const BlockParser& parser,
int32_t col_index) override {
@@ -684,7 +697,13 @@ std::shared_ptr<ConverterType> MakeRealConverter(const
std::shared_ptr<DataType>
Converter::Converter(const std::shared_ptr<DataType>& type, const
ConvertOptions& options,
MemoryPool* pool)
- : options_(options), pool_(pool), type_(type) {}
+ : options_(options), pool_(pool), type_(type) {
+ // Build shared Trie cache (errors handled in Initialize())
+ auto maybe_cache = TrieCache::Make(options);
+ if (maybe_cache.ok()) {
+ trie_cache_ = std::static_pointer_cast<void>(*std::move(maybe_cache));
+ }
+}
DictionaryConverter::DictionaryConverter(const std::shared_ptr<DataType>&
value_type,
const ConvertOptions& options,
MemoryPool* pool)
diff --git a/cpp/src/arrow/csv/converter.h b/cpp/src/arrow/csv/converter.h
index 639f692f26..c6254bd7ca 100644
--- a/cpp/src/arrow/csv/converter.h
+++ b/cpp/src/arrow/csv/converter.h
@@ -57,6 +57,9 @@ class ARROW_EXPORT Converter {
const ConvertOptions& options_;
MemoryPool* pool_;
std::shared_ptr<DataType> type_;
+ // Opaque TrieCache pointer. TrieCache destructor is called via control
block.
+ // https://en.cppreference.com/w/cpp/memory/shared_ptr
+ std::shared_ptr<void> trie_cache_;
};
class ARROW_EXPORT DictionaryConverter : public Converter {