westonpace commented on code in PR #13218:
URL: https://github.com/apache/arrow/pull/13218#discussion_r888563914


##########
cpp/src/arrow/compute/row/encode_internal.h:
##########
@@ -0,0 +1,323 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "arrow/array/data.h"
+#include "arrow/compute/exec.h"
+#include "arrow/compute/exec/util.h"
+#include "arrow/compute/light_array.h"
+#include "arrow/compute/row/row_internal.h"
+#include "arrow/memory_pool.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/bit_util.h"
+
+namespace arrow {
+namespace compute {
+
+/// Converts between Arrow's typical column representation and a row-based 
representation
+///
+/// Data is stored as a single array of rows.  Each row combines data from all 
columns.
+/// The conversion is reversible.
+///
+/// Row-oriented storage is beneficial when there is a need for random access
+/// of individual rows and at the same time all included columns are likely to
+/// be accessed together, as in the case of a hash table key.
+///
+/// Does not support nested types
+class RowTableEncoder {
+ public:
+  void Init(const std::vector<KeyColumnMetadata>& cols, LightContext* ctx,
+            int row_alignment, int string_alignment);
+
+  const RowTableMetadata& row_metadata() { return row_metadata_; }
+  // GrouperFastImpl right now needs somewhat intrusive visibility into 
RowTableEncoder
+  // This could be cleaned up at some point
+  const std::vector<KeyColumnArray>& batch_all_cols() { return 
batch_all_cols_; }
+
+  /// \brief Prepare to encode a collection of columns
+  /// \param start_row The starting row to encode
+  /// \param num_rows The number of rows to encode
+  /// \param cols The columns to encode.  The order of the columns should
+  ///             be consistent with the order used to create the 
RowTableMetadata
+  void PrepareEncodeSelected(int64_t start_row, int64_t num_rows,
+                             const std::vector<KeyColumnArray>& cols);
+  /// \brief Encode selection of prepared rows into a row table
+  /// \param rows The output row table
+  /// \param num_selected The number of rows to encode
+  /// \param selection indices of the rows to encode
+  Status EncodeSelected(RowTableImpl* rows, uint32_t num_selected,
+                        const uint16_t* selection);
+
+  /// \brief Decode a window of row oriented data into a corresponding
+  ///        window of column oriented storage.
+  /// \param start_row_input The starting row to decode
+  /// \param start_row_output An offset into the output array to write to
+  /// \param num_rows The number of rows to decode
+  /// \param rows The row table to decode from
+  /// \param cols The columns to decode into, should be sized appropriately
+  ///
+  /// The output buffers need to be correctly allocated and sized before
+  /// calling each method.  For that reason decoding is split into two 
functions.
+  /// DecodeFixedLengthBuffers processes everything except for varying length
+  /// buffers.
+  /// The output can be used to find the required varying-length buffer sizes
+  /// for the call to DecodeVaryingLengthBuffers
+  void DecodeFixedLengthBuffers(int64_t start_row_input, int64_t 
start_row_output,
+                                int64_t num_rows, const RowTableImpl& rows,
+                                std::vector<KeyColumnArray>* cols);
+
+  /// \brief Decode the varlength columns of a row table into column storage
+  /// \param start_row_input The starting row to decode
+  /// \param start_row_output An offset into the output arrays
+  /// \param num_rows The number of rows to decode
+  /// \param rows The row table to decode from
+  /// \param cols The column arrays to decode into
+  void DecodeVaryingLengthBuffers(int64_t start_row_input, int64_t 
start_row_output,
+                                  int64_t num_rows, const RowTableImpl& rows,
+                                  std::vector<KeyColumnArray>* cols);
+
+ private:
+  /// Prepare column array vectors.
+  /// Output column arrays represent a range of input column arrays
+  /// specified by starting row and number of rows.
+  /// Three vectors are generated:
+  /// - all columns
+  /// - fixed-length columns only
+  /// - varying-length columns only
+  void PrepareKeyColumnArrays(int64_t start_row, int64_t num_rows,
+                              const std::vector<KeyColumnArray>& cols_in);
+
+  LightContext* ctx_;
+
+  // Data initialized once, based on data types of key columns
+  RowTableMetadata row_metadata_;
+
+  // Data initialized for each input batch.
+  // All elements are ordered according to the order of encoded fields in a 
row.
+  std::vector<KeyColumnArray> batch_all_cols_;
+  std::vector<KeyColumnArray> batch_varbinary_cols_;
+  std::vector<uint32_t> batch_varbinary_cols_base_offsets_;
+};
+
+class EncoderInteger {

Review Comment:
   Some don't.  Any of the encoders that have an `AVX2`-implemented method do, I 
think.  So if I was going to need an internal header anyway, it seemed more 
consistent to just throw them all in.  However, I can prune this down to just 
the encoders needed if that would be better.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to