Github user zuyu commented on a diff in the pull request:

    https://github.com/apache/incubator-quickstep/pull/171#discussion_r98385945
  
    --- Diff: relational_operators/HashJoinOperator.cpp ---
    @@ -504,123 +616,61 @@ void HashInnerJoinWorkOrder::execute() {
         // hash join is below a reasonable threshold so that we don't blow up
         // temporary memory requirements to an unreasonable degree.
         if (residual_predicate_ != nullptr) {
    -      std::pair<std::vector<tuple_id>, std::vector<tuple_id>> 
filtered_matches;
    +      PairOfVectors filtered_matches;
    +
           for (std::size_t i = 0; i < build_tids.size(); ++i) {
             if (residual_predicate_->matchesForJoinedTuples(*build_accessor,
                                                             build_relation_id,
                                                             build_tids[i],
                                                             *probe_accessor,
                                                             probe_relation_id,
                                                             probe_tids[i])) {
    -          filtered_matches.first.push_back(build_tids[i]);
    -          filtered_matches.second.push_back(probe_tids[i]);
    +          filtered_matches.first.emplace_back(build_tids[i]);
    +          filtered_matches.second.emplace_back(probe_tids[i]);
             }
           }
     
           build_block_entry.second = std::move(filtered_matches);
         }
     
    -    // TODO(chasseur): If all the output expressions are ScalarAttributes,
    -    // we could implement a similar fast-path to 
StorageBlock::selectSimple()
    -    // that avoids a copy.
    -    //
         // TODO(chasseur): See TODO in NestedLoopsJoinOperator.cpp about 
limiting
         // the size of materialized temporary results. In common usage, this
         // probably won't be an issue for hash-joins, but in the worst case a 
hash
         // join can still devolve into a cross-product.
    -    //
    -    // NOTE(chasseur): We could also create one big 
ColumnVectorsValueAccessor
    -    // and accumulate all the results across multiple block pairs into it
    -    // before inserting anything into output blocks, but this would require
    -    // some significant API extensions to the expressions system for a 
dubious
    -    // benefit (probably only a real performance win when there are very 
few
    -    // matching tuples in each individual inner block but very many inner
    -    // blocks with at least one match).
    -
    -    // We now create ordered value accessors for both build and probe side,
    -    // using the joined tuple TIDs. Note that we have to use this 
Lambda-based
    -    // invocation method here because the accessors don't have a virtual
    -    // function that creates such an 
OrderedTupleIdSequenceAdapterValueAccessor.
    -    std::unique_ptr<ValueAccessor> ordered_build_accessor, 
ordered_probe_accessor;
    -    InvokeOnValueAccessorNotAdapter(
    -        build_accessor.get(),
    -        [&](auto *accessor) -> void {  // NOLINT(build/c++11)
    -          ordered_build_accessor.reset(
    -              
accessor->createSharedOrderedTupleIdSequenceAdapter(build_tids));
    -        });
    -
    -    if (probe_accessor->isTupleIdSequenceAdapter()) {
    -      InvokeOnTupleIdSequenceAdapterValueAccessor(
    -        probe_accessor.get(),
    -        [&](auto *accessor) -> void {  // NOLINT(build/c++11)
    -          ordered_probe_accessor.reset(
    -            
accessor->createSharedOrderedTupleIdSequenceAdapter(probe_tids));
    -        });
    -    } else {
    -      InvokeOnValueAccessorNotAdapter(
    -        probe_accessor.get(),
    -        [&](auto *accessor) -> void {  // NOLINT(build/c++11)
    -          ordered_probe_accessor.reset(
    -            
accessor->createSharedOrderedTupleIdSequenceAdapter(probe_tids));
    -        });
    -    }
    -
     
         // We also need a temp value accessor to store results of any scalar 
expressions.
         ColumnVectorsValueAccessor temp_result;
    +    if (!non_trivial_expressions.empty()) {
    +      // The getAllValuesForJoin function below needs joined tuple IDs as a
    +      // vector of pair of (build-tuple-ID, probe-tuple-ID), and we have a 
pair
    +      // of (build-tuple-IDs-vector, probe-tuple-IDs-vector). So we'll 
have to
    +      // zip our two vectors together.
    +      VectorOfPairs zipped_joined_tuple_ids;
    +      zipped_joined_tuple_ids.reserve(build_tids.size());
    +      for (std::size_t i = 0; i < build_tids.size(); ++i) {
    +        zipped_joined_tuple_ids.emplace_back(build_tids[i], probe_tids[i]);
    +      }
     
    -    // Create a map of ValueAccessors and what attributes we want to pick 
from them
    -    std::vector<std::pair<ValueAccessor *, std::vector<attribute_id>>> 
accessor_attribute_map;
    -    const std::vector<ValueAccessor *> accessors{
    -        ordered_build_accessor.get(), ordered_probe_accessor.get(), 
&temp_result};
    -    const unsigned int build_index = 0, probe_index = 1, temp_index = 2;
    -    for (auto &accessor : accessors) {
    -      accessor_attribute_map.push_back(std::make_pair(
    -          accessor,
    -          std::vector<attribute_id>(selection_.size(), 
kInvalidCatalogId)));
    -    }
    -
    -    attribute_id dest_attr = 0;
    -    std::vector<std::pair<tuple_id, tuple_id>> zipped_joined_tuple_ids;
    -
    -    for (auto &selection_cit : selection_) {
    -      // If the Scalar (column) is not an attribute in build/probe blocks, 
then
    -      // insert it into a ColumnVectorsValueAccessor.
    -      if (selection_cit->getDataSource() != 
Scalar::ScalarDataSource::kAttribute) {
    -        // Current destination attribute maps to the column we'll create 
now.
    -        accessor_attribute_map[temp_index].second[dest_attr] = 
temp_result.getNumColumns();
    -
    -        if (temp_result.getNumColumns() == 0) {
    -          // The getAllValuesForJoin function below needs joined tuple IDs 
as
    -          // a vector of pair of (build-tuple-ID, probe-tuple-ID), and we 
have
    -          // a pair of (build-tuple-IDs-vector, probe-tuple-IDs-vector). So
    -          // we'll have to zip our two vectors together. We do this inside
    -          // the loop because most queries don't exercise this code since
    -          // they don't have scalar expressions with attributes from both
    -          // build and probe relations (other expressions would have been
    -          // pushed down to before the join).
    -          zipped_joined_tuple_ids.reserve(build_tids.size());
    -          for (std::size_t i = 0; i < build_tids.size(); ++i) {
    -            
zipped_joined_tuple_ids.push_back(std::make_pair(build_tids[i], probe_tids[i]));
    -          }
    -        }
    -        temp_result.addColumn(
    -            selection_cit
    -                ->getAllValuesForJoin(build_relation_id, 
build_accessor.get(),
    -                                      probe_relation_id, 
probe_accessor.get(),
    -                                      zipped_joined_tuple_ids));
    -      } else {
    -        auto scalar_attr = static_cast<const ScalarAttribute 
*>(selection_cit.get());
    -        const attribute_id attr_id = scalar_attr->getAttribute().getID();
    -        if (scalar_attr->getAttribute().getParent().getID() == 
build_relation_id) {
    -          accessor_attribute_map[build_index].second[dest_attr] = attr_id;
    -        } else {
    -          accessor_attribute_map[probe_index].second[dest_attr] = attr_id;
    -        }
    +      for (const Scalar *scalar : non_trivial_expressions) {
    +        
temp_result.addColumn(scalar->getAllValuesForJoin(build_relation_id,
    +                                                          
build_accessor.get(),
    +                                                          
probe_relation_id,
    +                                                          probe_accessor,
    +                                                          
zipped_joined_tuple_ids));
           }
    -      ++dest_attr;
         }
     
    +    // We now create ordered value accessors for both build and probe side,
    +    // using the joined tuple TIDs.
    --- End diff --
    
    Change from `TIDs` to `IDs`? I guess `TID` means `tuple id`.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

Reply via email to