wesm commented on a change in pull request #8302:
URL: https://github.com/apache/arrow/pull/8302#discussion_r496941604
##########
File path: cpp/src/arrow/ipc/writer.cc
##########
@@ -982,13 +982,7 @@ class ARROW_EXPORT IpcFormatWriter : public
RecordBatchWriter {
RETURN_NOT_OK(CheckStarted());
- if (!wrote_dictionaries_) {
- RETURN_NOT_OK(WriteDictionaries(batch));
- wrote_dictionaries_ = true;
- }
-
- // TODO: Check for delta dictionaries. Can we scan for deltas while
computing
- // the RecordBatch payload to save time?
+ RETURN_NOT_OK(WriteDictionaries(batch));
Review comment:
I assume this will populate `last_dictionaries_` entries even when the
length of the dictionary is zero? I recall there was some discussion about
empty dictionaries (and whether they need to be written at all) at the start of
a stream
##########
File path: cpp/src/arrow/ipc/writer.cc
##########
@@ -1020,13 +1014,30 @@ class ARROW_EXPORT IpcFormatWriter : public
RecordBatchWriter {
Status WriteDictionaries(const RecordBatch& batch) {
ARROW_ASSIGN_OR_RAISE(const auto dictionaries, CollectDictionaries(batch,
mapper_));
+ // TODO: Check for delta dictionaries. Can we scan for deltas while
computing
+ // the RecordBatch payload to save time?
+
for (const auto& pair : dictionaries) {
IpcPayload payload;
int64_t dictionary_id = pair.first;
const auto& dictionary = pair.second;
+ // If a dictionary with this id was already emitted, check if it was the
same.
+ // Note we don't compare dictionaries by value because it may be costly.
+ auto* last_dictionary = &last_dictionaries_[dictionary_id];
+ {
+ const auto maybe_last_dict = last_dictionary->lock();
+ if (maybe_last_dict && maybe_last_dict->data() == dictionary->data()) {
+ // Same dictionary data by pointer => no need to emit it again
+ continue;
+ }
+ }
+
RETURN_NOT_OK(GetDictionaryPayload(dictionary_id, dictionary, options_,
&payload));
RETURN_NOT_OK(payload_writer_->WritePayload(payload));
Review comment:
This should only be allowable if we are writing a stream? Dictionary
replacements aren't allowed for IPC files
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]