jorisvandenbossche commented on a change in pull request #10991:
URL: https://github.com/apache/arrow/pull/10991#discussion_r717380721
##########
File path: python/pyarrow/tests/test_dataset.py
##########
@@ -2633,6 +2638,67 @@ def test_ipc_format(tempdir, dataset_reader):
assert result.equals(table)
+@pytest.mark.orc
+def test_orc_format(tempdir, dataset_reader):
+ from pyarrow import orc
+ table = pa.table({'a': pa.array([1, 2, 3], type="int8"),
+ 'b': pa.array([.1, .2, .3], type="float64")})
+
+ path = str(tempdir / 'test.orc')
+ orc.write_table(table, path)
+
+ dataset = ds.dataset(path, format=ds.OrcFileFormat())
+ result = dataset_reader.to_table(dataset)
+ result.validate(full=True)
+ assert result.equals(table)
+
+ dataset = ds.dataset(path, format="orc")
+ result = dataset_reader.to_table(dataset)
+ result.validate(full=True)
+ assert result.equals(table)
+
+ result = dataset_reader.to_table(dataset, columns=["b"])
+ result.validate(full=True)
+ assert result.equals(table.select(["b"]))
+
+ assert dataset_reader.count_rows(dataset) == 3
+ assert dataset_reader.count_rows(dataset, filter=ds.field("a") > 2) == 1
+
+
+@pytest.mark.orc
+def test_orc_scan_options(tempdir, dataset_reader):
+ from pyarrow import orc
+ table = pa.table({'a': pa.array([1, 2, 3], type="int8"),
+ 'b': pa.array([.1, .2, .3], type="float64")})
+
+ path = str(tempdir / 'test.orc')
+ orc.write_table(table, path)
+
+ dataset = ds.dataset(path, format="orc")
+ result = list(dataset_reader.to_batches(dataset))
+ assert len(result) == 1
+ assert result[0].num_rows == 3
+ assert result[0].equals(table.to_batches()[0])
+ # TODO batch_size is not yet supported
Review comment:
Ah, not yet for this follow-up. Created
https://issues.apache.org/jira/browse/ARROW-14153 and added that in the TODO
comment
##########
File path: python/pyarrow/tests/test_dataset.py
##########
@@ -2633,6 +2638,67 @@ def test_ipc_format(tempdir, dataset_reader):
assert result.equals(table)
+@pytest.mark.orc
+def test_orc_format(tempdir, dataset_reader):
+ from pyarrow import orc
+ table = pa.table({'a': pa.array([1, 2, 3], type="int8"),
+ 'b': pa.array([.1, .2, .3], type="float64")})
+
+ path = str(tempdir / 'test.orc')
+ orc.write_table(table, path)
+
+ dataset = ds.dataset(path, format=ds.OrcFileFormat())
+ result = dataset_reader.to_table(dataset)
+ result.validate(full=True)
+ assert result.equals(table)
+
+ dataset = ds.dataset(path, format="orc")
+ result = dataset_reader.to_table(dataset)
+ result.validate(full=True)
+ assert result.equals(table)
+
+ result = dataset_reader.to_table(dataset, columns=["b"])
+ result.validate(full=True)
+ assert result.equals(table.select(["b"]))
+
+ assert dataset_reader.count_rows(dataset) == 3
+ assert dataset_reader.count_rows(dataset, filter=ds.field("a") > 2) == 1
+
+
+@pytest.mark.orc
+def test_orc_scan_options(tempdir, dataset_reader):
+ from pyarrow import orc
+ table = pa.table({'a': pa.array([1, 2, 3], type="int8"),
+ 'b': pa.array([.1, .2, .3], type="float64")})
+
+ path = str(tempdir / 'test.orc')
+ orc.write_table(table, path)
+
+ dataset = ds.dataset(path, format="orc")
+ result = list(dataset_reader.to_batches(dataset))
+ assert len(result) == 1
+ assert result[0].num_rows == 3
+ assert result[0].equals(table.to_batches()[0])
+ # TODO batch_size is not yet supported
+ # result = list(dataset_reader.to_batches(dataset, batch_size=2))
+ # assert len(result) == 2
+ # assert result[0].num_rows == 2
+ # assert result[0].equals(table.slice(0, 2).to_batches()[0])
+ # assert result[1].num_rows == 1
+ # assert result[1].equals(table.slice(2, 1).to_batches()[0])
+
+
+def test_orc_format_not_supported(tempdir, dataset_reader):
Review comment:
Indeed, not necessary here.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]