ava6969 opened a new issue, #33834:
URL: https://github.com/apache/arrow/issues/33834

   ### Describe the usage question you have. Please include as many useful 
details as possible.
   
   
   So I implemented a very inefficient operation to concatenate two or more 
record batches, each of which has an array holding its index. I have been 
reading about hash operations. I need help improving my current implementation 
and making better use of Arrow functions.
   
   ```
   /// Concatenates the frames in `objs` column-wise into a single DataFrame.
   ///
   /// The row index of the result comes from
   /// `mergeIndexes(makeJoinIndexes(objs, AxisType::Columns), intersect)`.
   /// Each column of each input frame is then either reused as-is or rebuilt
   /// row by row against that index; rows whose label is reported as missing
   /// by `df.index().index(label)` (return value -1) are left null.
   ///
   /// @param intersect    Forwarded to `mergeIndexes`; presumably selects an
   ///                     intersection of the indexes rather than a union —
   ///                     TODO confirm against `mergeIndexes`.
   /// @param ignore_index When true, output fields are renamed to their
   ///                     position in the result (`std::to_string(position)`);
   ///                     otherwise the original field is kept.
   /// @param sort         When true, the merged index is sorted with default
   ///                     `arrow::compute::SortOptions` before alignment.
   /// @return DataFrame built from the combined schema, the row count, the
   ///         per-column array data and the merged index.
   /// @throws std::runtime_error if `arrow::compute::Take` fails while sorting.
   ///         `ReturnOrThrowOnFailure` presumably throws as well, and the
   ///         `*_ABORT` macros presumably abort — verify against their
   ///         definitions.
   pd::DataFrame Concatenator::concatenateColumns(
       bool intersect,
       bool ignore_index,
       bool sort)
   {
       auto newIndexes =
           mergeIndexes(makeJoinIndexes(objs, AxisType::Columns), intersect);
       // Arrow lengths are int64_t; keep that type so nothing narrows below.
       const int64_t numRows = newIndexes->length();

       if (sort)
       {
           auto sort_indices = ReturnOrThrowOnFailure(
               arrow::compute::SortIndices(
                   newIndexes,
                   arrow::compute::SortOptions{}));

           // Check the Result before using it: dereferencing a failed
           // arrow::Result is not safe.
           auto sorted = arrow::compute::Take(newIndexes, sort_indices);
           if (!sorted.ok())
           {
               throw std::runtime_error(sorted.status().ToString());
           }
           newIndexes = sorted.MoveValueUnsafe().make_array();
       }

       // index_offset[i] is the position of frame i's first column in the
       // output; the accumulated total is the output column count.
       std::vector<size_t> index_offset;
       index_offset.reserve(objs.size());
       auto newColumnLength = accumulate(
           objs.begin(),
           objs.end(),
           0UL,
           [&index_offset](size_t total, DataFrame const& df)
           {
               index_offset.push_back(total);
               return total + df.num_columns();
           });

       arrow::FieldVector fieldVector(newColumnLength);
       arrow::ArrayDataVector arrayVectors(newColumnLength);

       for (size_t i = 0UL; i < objs.size(); i++)
       {
           const auto& df = objs[i];
           auto schema = df.array()->schema();
           auto df_index = df.index();
           auto fields = schema->fields();
           auto offset = index_offset[i];
           for (size_t j = 0UL; j < fields.size(); j++)
           {
               std::shared_ptr<arrow::Field> const& columnPerDF = fields[j];
               auto col_name = columnPerDF->name();
               auto array = df.m_array->GetColumnByName(col_name);
               auto array_data = array->data();

               // NOTE(review): this compares the *column* with the merged
               // index. It looks like the intent may have been to compare the
               // frame's own index instead — confirm before changing.
               if (not array->Equals(newIndexes))
               {
                   auto null = arrow::MakeNullScalar(columnPerDF->type());

                   // Start with all nulls and overwrite the rows whose label
                   // exists in this frame's index.
                   arrow::ScalarVector scalars(numRows, null);

                   // NOTE(review): this per-row scalar lookup is the slow
                   // part. A vectorised alternative would be
                   // arrow::compute::IndexIn (labels -> positions, null when
                   // absent) followed by arrow::compute::Take, assuming
                   // df_index.index() has first-match semantics — verify.
                   for (int64_t k = 0; k < numRows; k++)
                   {
                       auto idx = newIndexes->GetScalar(k).MoveValueUnsafe();
                       auto result = df_index.index(idx);
                       if (result != -1)
                       {
                           scalars[k] =
                               array->GetScalar(result).MoveValueUnsafe();
                       }
                   }
                   ASSIGN_OR_ABORT(
                       auto builder,
                       arrow::MakeBuilder(columnPerDF->type()));

                   ABORT_NOT_OK(builder->AppendScalars(scalars));

                   ABORT_NOT_OK(builder->FinishInternal(&array_data));
               }

               auto resolved_idx = offset + j;
               fieldVector[resolved_idx] = ignore_index ?
                   arrow::field(
                       std::to_string(resolved_idx),
                       columnPerDF->type()) :
                   columnPerDF;

               arrayVectors[resolved_idx] = array_data;
           }
       }

       return { arrow::schema(fieldVector),
                numRows,
                arrayVectors,
                newIndexes };

   }
   
   /// Appends every column of `objs[1..]` after the columns of `objs[0]`.
   ///
   /// Rows are not realigned in any way: the columns are added as they are and
   /// the result reuses the index array of `objs[0]`.
   ///
   /// @param objs Frames to combine; must contain at least one element.
   /// @return DataFrame made of the widened table and `objs[0]`'s index array.
   /// @throws std::out_of_range  if `objs` is empty (from `objs.at(0)`).
   /// @throws std::runtime_error carrying the Arrow status text if
   ///         `AddColumn` fails.
   pd::DataFrame concatColumnsUnsafe(std::vector<pd::DataFrame> const& objs)
   {
       auto df = objs.at(0).array();
       auto N = df->num_columns();

       // std::size_t matches objs.size() and avoids a signed/unsigned compare.
       for (std::size_t i = 1; i < objs.size(); ++i)
       {
           const auto& frame = objs[i];
           for (auto const& field : frame.array()->schema()->fields())
           {
               // N is the next free column position; advance it per column.
               auto result =
                   df->AddColumn(N++, field, frame[field->name()].m_array);
               if (!result.ok())
               {
                   throw std::runtime_error(result.status().ToString());
               }
               df = result.MoveValueUnsafe();
           }
       }
       return { df, objs.at(0).indexArray() };
   }
   ```
   
   ### Component(s)
   
   C++


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to