mapleFU commented on code in PR #42047:
URL: https://github.com/apache/arrow/pull/42047#discussion_r1632769136
##########
cpp/src/arrow/array/array_dict.cc:
##########
@@ -212,106 +213,155 @@ Result<std::shared_ptr<ArrayData>> TransposeDictIndices(
return out_data;
}
-struct CompactTransposeMapVisitor {
- const std::shared_ptr<ArrayData>& data;
- arrow::MemoryPool* pool;
- std::unique_ptr<Buffer> output_map;
- std::shared_ptr<Array> out_compact_dictionary;
-
- template <typename IndexArrowType>
- Status CompactTransposeMapImpl() {
- int64_t index_length = data->length;
- int64_t dict_length = data->dictionary->length;
- if (dict_length == 0) {
- output_map = nullptr;
- out_compact_dictionary = nullptr;
- return Status::OK();
- } else if (index_length == 0) {
- ARROW_ASSIGN_OR_RAISE(out_compact_dictionary,
- MakeEmptyArray(data->dictionary->type, pool));
- ARROW_ASSIGN_OR_RAISE(output_map, AllocateBuffer(0, pool))
- return Status::OK();
+/// \pre data.length > 0
+/// \pre data.dictionary->length > 0
+/// \pre out_dict_used_bitmap is a pointer to a zero-initialized
+/// allocation of BytesForBits(data.dictionary->length)
+///
+/// \tparam IndexCType the C type of the dictionary indices
+/// \param data the dictionary array data
+/// \param out_dict_used_bitmap a bitmap of used indices in the dictionary
+/// \return the number of used indices in the dictionary
+template <typename IndexCType>
+Result<int64_t> PopulateBitmapOfUsedIndices(const ArrayData& data,
+ uint8_t* out_dict_used_bitmap) {
+ DCHECK_EQ(data.type->id(), Type::DICTIONARY);
+ int64_t index_length = data.length;
+ int64_t dict_length = data.dictionary->length;
+ DCHECK_GT(index_length, 0);
+ DCHECK_GT(dict_length, 0);
+
+ const auto* indices_data = data.GetValues<IndexCType>(1);
+ const auto max_index = static_cast<IndexCType>(
+ std::min(static_cast<uint64_t>(std::numeric_limits<IndexCType>::max()),
+ static_cast<uint64_t>(dict_length - 1)));
+ int64_t dict_used_count = 0;
+ for (int64_t i = 0; i < index_length; i++) {
+ if (data.IsNull(i)) {
+ continue;
}
- using CType = typename IndexArrowType::c_type;
- const CType* indices_data = data->GetValues<CType>(1);
- std::vector<bool> dict_used(dict_length, false);
- CType dict_len = static_cast<CType>(dict_length);
- int64_t dict_used_count = 0;
- for (int64_t i = 0; i < index_length; i++) {
- if (data->IsNull(i)) {
- continue;
- }
-
- CType current_index = indices_data[i];
- if (current_index < 0 || current_index >= dict_len) {
- return Status::IndexError(
- "Index out of bounds while compacting dictionary array: ",
current_index,
- "(dictionary is ", dict_length, " long) at position ", i);
- }
- if (dict_used[current_index]) continue;
- dict_used[current_index] = true;
- dict_used_count++;
-
- if (dict_used_count == dict_length) {
- // The dictionary is already compact, so just return here
- output_map = nullptr;
- out_compact_dictionary = nullptr;
- return Status::OK();
- }
+ IndexCType current_index = indices_data[i];
+ if (ARROW_PREDICT_FALSE(current_index < 0 || current_index > max_index)) {
+ return Status::IndexError("Index out of bounds while compacting
dictionary array: ",
+ internal::UpcastInt(current_index), "
(dictionary is ",
+ dict_length, " long) at position ", i);
+ }
+ if (bit_util::GetBit(out_dict_used_bitmap, current_index)) {
+ continue;
}
+ bit_util::SetBit(out_dict_used_bitmap, current_index);
+ dict_used_count++;
- using BuilderType = NumericBuilder<IndexArrowType>;
- using arrow::compute::Take;
- using arrow::compute::TakeOptions;
- BuilderType dict_indices_builder(pool);
- ARROW_RETURN_NOT_OK(dict_indices_builder.Reserve(dict_used_count));
- ARROW_ASSIGN_OR_RAISE(output_map,
- AllocateBuffer(dict_length * sizeof(int32_t), pool));
- auto* output_map_raw = output_map->mutable_data_as<int32_t>();
- int32_t current_index = 0;
- for (CType i = 0; i < dict_len; i++) {
- if (dict_used[i]) {
- dict_indices_builder.UnsafeAppend(i);
- output_map_raw[i] = current_index;
- current_index++;
- } else {
- output_map_raw[i] = -1;
- }
+ if (dict_used_count == dict_length) {
+ // All bits in out_dict_used_bitmap are set
+ break;
}
- ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> compacted_dict_indices,
- dict_indices_builder.Finish());
- ARROW_ASSIGN_OR_RAISE(auto compacted_dict_res,
- Take(Datum(data->dictionary), compacted_dict_indices,
- TakeOptions::NoBoundsCheck()));
- out_compact_dictionary = compacted_dict_res.make_array();
- return Status::OK();
}
+ return dict_used_count;
+}
- template <typename Type>
- enable_if_integer<Type, Status> Visit(const Type&) {
- return CompactTransposeMapImpl<Type>();
- }
+Result<std::shared_ptr<Array>> TransposeAndMakeArrayNoInline(
+ const std::shared_ptr<ArrayData>& data,
+ const std::shared_ptr<Array>& compact_dictionary,
+ std::unique_ptr<Buffer> transpose_map, MemoryPool* pool) {
+ ARROW_ASSIGN_OR_RAISE(
+ auto transposed,
+ TransposeDictIndices(data, data->type, data->type,
compact_dictionary->data(),
+ transpose_map->data_as<int32_t>(), pool));
+ return MakeArray(std::move(transposed));
+}
- Status Visit(const DataType& type) {
- return Status::TypeError("Expected an Index Type of Int or UInt");
+template <int IndexByteWidth>
+Result<std::shared_ptr<Array>> CompactTransposeMap(const
std::shared_ptr<ArrayData>& data,
+ Type::type index_type_id,
+ arrow::MemoryPool* pool) {
+ const int64_t index_length = data->length;
+ const int64_t dict_length = data->dictionary->length;
+ const bool indices_are_signed = is_signed_integer(index_type_id);
Review Comment:
so this avoid static dispatch to signed and unsigned, and just used unsigned
buffer ( because index shouldn't be 0?)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]