jorisvandenbossche commented on a change in pull request #7534:
URL: https://github.com/apache/arrow/pull/7534#discussion_r445392036
##########
File path: cpp/src/parquet/arrow/reader.cc
##########
@@ -338,22 +348,37 @@ class RowGroupRecordBatchReader : public
::arrow::RecordBatchReader {
// TODO (hatemhelal): Consider refactoring this to share logic with
ReadTable as this
// does not currently honor the use_threads option.
std::vector<std::shared_ptr<ChunkedArray>> columns(field_readers_.size());
- for (size_t i = 0; i < field_readers_.size(); ++i) {
- RETURN_NOT_OK(field_readers_[i]->NextBatch(batch_size_, &columns[i]));
- if (columns[i]->num_chunks() > 1) {
- return Status::NotImplemented("This class cannot yet iterate chunked
arrays");
+ int64_t num_rows = -1;
+
+ if (columns.empty()) {
+ num_rows = std::min(batch_size_, *row_group_remaining_size_);
Review comment:
It might be good to add a comment here explaining why this is being done.
##########
File path: python/pyarrow/tests/test_dataset.py
##########
@@ -1605,3 +1605,21 @@ def test_dataset_schema_metadata(tempdir):
assert b"pandas" in schema.metadata
# ensure it is still there in a projected schema (with column selection)
assert schema.equals(projected_schema, check_metadata=True)
+
+
+@pytest.mark.parquet
+@pytest.mark.pandas
+def test_dataset_project_only_partition_columns(tempdir):
+ # ARROW-8729
+ import pyarrow.parquet as pq
+
+ table = pa.table({'part': 'a a b b'.split(), 'col': list(range(4))})
+
+ path = str(tempdir / 'test_dataset')
+ pq.write_to_dataset(table, path, partition_cols=['part'])
+ dataset = ds.dataset(path, partitioning='hive')
+
+ all_cols = dataset.to_table(use_threads=False)
+ part_only = dataset.to_table(columns=['part'], use_threads=False)
+
+ assert all_cols.column('part') == part_only.column('part')
Review comment:
```suggestion
assert all_cols.column('part').equals(part_only.column('part'))
```
`==` is now element-wise, and the "truthiness" of a boolean chunked array is
always True, regardless of the values in it (so as long as both tables have a
"part" column, the above check would always pass even if the columns
themselves are not equal).
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org