R-JunmingChen commented on code in PR #37418:
URL: https://github.com/apache/arrow/pull/37418#discussion_r1354351584
##########
cpp/src/arrow/array/array_dict.cc:
##########
@@ -211,6 +212,105 @@ Result<std::shared_ptr<ArrayData>> TransposeDictIndices(
return out_data;
}
+struct CompactTransposeMapVistor {
+ const std::shared_ptr<ArrayData>& data;
+ arrow::MemoryPool* pool;
+ std::unique_ptr<Buffer> output_map;
+ std::shared_ptr<Array> out_compact_dictionary;
+
+ template <typename IndexArrowType>
+ Status CompactTransposeMapImpl() {
+ int64_t index_length = data->length;
+ int64_t dict_length = data->dictionary->length;
+ if (dict_length == 0) {
+ output_map = nullptr;
+ out_compact_dictionary = nullptr;
+ return Status::OK();
+ } else if (index_length == 0) {
+ ARROW_ASSIGN_OR_RAISE(out_compact_dictionary,
+ MakeEmptyArray(data->dictionary->type, pool));
+ ARROW_ASSIGN_OR_RAISE(output_map, AllocateBuffer(0, pool))
+ return Status::OK();
+ }
+
+ using CType = typename IndexArrowType::c_type;
+ const CType* indices_data = data->GetValues<CType>(1);
+ std::vector<bool> dict_used(dict_length, false);
+ CType dict_len = static_cast<CType>(dict_length);
+ int64_t dict_used_count = 0;
+ for (int64_t i = 0; i < index_length; i++) {
+ if (data->IsNull(i)) {
+ continue;
+ }
+
+ CType current_index = indices_data[i];
+ if (current_index < 0 || current_index >= dict_len) {
+ return Status::IndexError(
+ "Index out of bounds while compacting dictionary array: ",
current_index,
+ "(dictionary is ", dict_length, " long) at position ", i);
+ } else if (!dict_used[current_index]) {
+ dict_used[current_index] = true;
+ dict_used_count++;
+
+ if (dict_used_count == dict_length) {
Review Comment:
Hi @bkietz, I'm not sure how to take advantage of detecting that only a slice
of the dictionary is used. If it's relatively complex, I'd prefer to
leave it as a new issue, since we have another PR that is waiting for this PR
to be merged.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]