JerAguilon commented on code in PR #38380: URL: https://github.com/apache/arrow/pull/38380#discussion_r1373695176
########## cpp/src/arrow/acero/unmaterialized_table.h: ########## @@ -0,0 +1,234 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <optional> +#include <vector> +#include "arrow/array/builder_base.h" +#include "arrow/array/builder_binary.h" +#include "arrow/array/builder_primitive.h" +#include "arrow/memory_pool.h" +#include "arrow/record_batch.h" +#include "arrow/type_traits.h" +#include "arrow/util/logging.h" + +namespace arrow::acero { + +struct CompositeEntry { + RecordBatch* batch; + uint64_t start; + uint64_t end; +}; + +template <size_t MAX_COMPOSITE_TABLES> +struct UnmaterializedSlice { + // A slice is represented by a [start, end) range of rows in a collection of record + // batches, where end-start is the same length + + CompositeEntry components[MAX_COMPOSITE_TABLES]; + size_t num_components; + + inline int64_t Size() const { + if (num_components == 0) { + return 0; + } + return components[0].end - components[0].start; + } +}; + +/// A table of composite reference rows. Rows maintain pointers to the +/// constituent record batches, but the overall table retains shared_ptr +/// references to ensure memory remains resident while the table is live. +/// +/// The main reason for this is that, especially for wide tables, some operations +/// such as sorted_merge or asof_join are effectively row-oriented, rather than +/// column-oriented. Separating the join part from the columnar materialization +/// part simplifies the logic around data types and increases efficiency. +/// +/// We don't put the shared_ptr's into the rows for efficiency reasons, so the caller +/// must manually call addRecordBatchRef to maintain the lifetime of the stored +/// record batches. +template <size_t MAX_COMPOSITE_TABLES> +class UnmaterializedCompositeTable { Review Comment: Somewhat. I have made some slight API implementation changes. The sorted merge node creates true slices of N rows at a time. If the time is 1234, and the next 1000 rows are also 1234, we can greatly improve the speed and prevent unnecessary heapify calls if we just take the current row and the next 1k rows. OTOH, the asof join was just doing things row by row. So the main change is we now store a `record batch, [start, end)` range rather than a `record batch, row` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
