R-JunmingChen commented on code in PR #37418:
URL: https://github.com/apache/arrow/pull/37418#discussion_r1356967409
##########
cpp/src/arrow/array/array_dict.cc:
##########
@@ -211,6 +212,105 @@ Result<std::shared_ptr<ArrayData>> TransposeDictIndices(
return out_data;
}
+struct CompactTransposeMapVistor {
+ const std::shared_ptr<ArrayData>& data;
+ arrow::MemoryPool* pool;
+ std::unique_ptr<Buffer> output_map;
+ std::shared_ptr<Array> out_compact_dictionary;
+
+ template <typename IndexArrowType>
+ Status CompactTransposeMapImpl() {
+ int64_t index_length = data->length;
+ int64_t dict_length = data->dictionary->length;
+ if (dict_length == 0) {
+ output_map = nullptr;
+ out_compact_dictionary = nullptr;
+ return Status::OK();
+ } else if (index_length == 0) {
+ ARROW_ASSIGN_OR_RAISE(out_compact_dictionary,
+ MakeEmptyArray(data->dictionary->type, pool));
+ ARROW_ASSIGN_OR_RAISE(output_map, AllocateBuffer(0, pool))
+ return Status::OK();
+ }
+
+ using CType = typename IndexArrowType::c_type;
+ const CType* indices_data = data->GetValues<CType>(1);
+ std::vector<bool> dict_used(dict_length, false);
+ CType dict_len = static_cast<CType>(dict_length);
+ int64_t dict_used_count = 0;
+ for (int64_t i = 0; i < index_length; i++) {
+ if (data->IsNull(i)) {
+ continue;
+ }
+
+ CType current_index = indices_data[i];
+ if (current_index < 0 || current_index >= dict_len) {
+ return Status::IndexError(
+ "Index out of bounds while compacting dictionary array: ",
current_index,
+ "(dictionary is ", dict_length, " long) at position ", i);
+ } else if (!dict_used[current_index]) {
+ dict_used[current_index] = true;
+ dict_used_count++;
+
+ if (dict_used_count == dict_length) {
Review Comment:
I've filed a follow-up issue, #38247, to handle the optimization.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]