This is an automated email from the ASF dual-hosted git repository.
blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new 04e9ff7d1e Python: Reduce the use of mock objects (#6438)
04e9ff7d1e is described below
commit 04e9ff7d1e9d7b7cc5811a1ebdf3ebf098e9505a
Author: Fokko Driesprong <[email protected]>
AuthorDate: Sun Dec 18 23:26:26 2022 +0100
Python: Reduce the use of mock objects (#6438)
---
python/pyiceberg/avro/file.py | 7 +-
python/pyiceberg/avro/reader.py | 17 +-
python/pyiceberg/manifest.py | 10 +-
python/pyiceberg/typedef.py | 20 ++
python/tests/avro/test_reader.py | 262 +++++++++++++--------------
python/tests/catalog/test_base.py | 24 ++-
python/tests/catalog/test_hive.py | 5 +-
python/tests/conftest.py | 55 +-----
python/tests/expressions/test_evaluator.py | 18 +-
python/tests/expressions/test_expressions.py | 96 ++++++----
python/tests/expressions/test_visitors.py | 65 ++-----
python/tests/io/test_fsspec.py | 9 +-
python/tests/io/test_io.py | 198 +++-----------------
python/tests/table/test_metadata.py | 70 +++----
python/tests/utils/test_manifest.py | 6 +-
15 files changed, 335 insertions(+), 527 deletions(-)
diff --git a/python/pyiceberg/avro/file.py b/python/pyiceberg/avro/file.py
index 68887a61ec..cc725e9db7 100644
--- a/python/pyiceberg/avro/file.py
+++ b/python/pyiceberg/avro/file.py
@@ -28,11 +28,12 @@ from typing import Optional, Type
from pyiceberg.avro.codecs import KNOWN_CODECS, Codec
from pyiceberg.avro.decoder import BinaryDecoder
-from pyiceberg.avro.reader import AvroStruct, ConstructReader, Reader
+from pyiceberg.avro.reader import ConstructReader, Reader
from pyiceberg.avro.resolver import resolve
from pyiceberg.io import InputFile, InputStream
from pyiceberg.io.memory import MemoryInputStream
from pyiceberg.schema import Schema, visit
+from pyiceberg.typedef import Record
from pyiceberg.types import (
FixedType,
MapType,
@@ -101,7 +102,7 @@ class Block:
def has_next(self) -> bool:
return self.position < self.block_records
- def __next__(self) -> AvroStruct:
+ def __next__(self) -> Record:
if self.has_next():
self.position += 1
return self.reader.read(self.block_decoder)
@@ -168,7 +169,7 @@ class AvroFile:
)
return block_records
- def __next__(self) -> AvroStruct:
+ def __next__(self) -> Record:
if self.block and self.block.has_next():
return next(self.block)
diff --git a/python/pyiceberg/avro/reader.py b/python/pyiceberg/avro/reader.py
index 6ea2b07ced..136498ed34 100644
--- a/python/pyiceberg/avro/reader.py
+++ b/python/pyiceberg/avro/reader.py
@@ -43,7 +43,7 @@ from uuid import UUID
from pyiceberg.avro.decoder import BinaryDecoder
from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType
-from pyiceberg.typedef import StructProtocol
+from pyiceberg.typedef import Record, StructProtocol
from pyiceberg.types import (
BinaryType,
BooleanType,
@@ -103,17 +103,6 @@ def _skip_map_array(decoder: BinaryDecoder, skip_entry: Callable[[], None]) -> N
block_count = decoder.read_int()
-@dataclass(frozen=True)
-class AvroStruct(StructProtocol):
- _data: List[Union[Any, StructProtocol]] = dataclassfield()
-
- def set(self, pos: int, value: Any) -> None:
- self._data[pos] = value
-
- def get(self, pos: int) -> Any:
- return self._data[pos]
-
-
class Reader(Singleton):
@abstractmethod
def read(self, decoder: BinaryDecoder) -> Any:
@@ -275,7 +264,7 @@ class OptionReader(Reader):
class StructReader(Reader):
fields: Tuple[Tuple[Optional[int], Reader], ...] = dataclassfield()
- def read(self, decoder: BinaryDecoder) -> AvroStruct:
+ def read(self, decoder: BinaryDecoder) -> Record:
result: List[Union[Any, StructProtocol]] = [None] * len(self.fields)
for (pos, field) in self.fields:
if pos is not None:
@@ -283,7 +272,7 @@ class StructReader(Reader):
else:
field.skip(decoder)
- return AvroStruct(result)
+ return Record(*result)
def skip(self, decoder: BinaryDecoder) -> None:
for _, field in self.fields:
diff --git a/python/pyiceberg/manifest.py b/python/pyiceberg/manifest.py
index 75ec0b6a14..9f7f394159 100644
--- a/python/pyiceberg/manifest.py
+++ b/python/pyiceberg/manifest.py
@@ -28,9 +28,9 @@ from typing import (
from pydantic import Field
from pyiceberg.avro.file import AvroFile
-from pyiceberg.avro.reader import AvroStruct
from pyiceberg.io import FileIO, InputFile
from pyiceberg.schema import Schema
+from pyiceberg.typedef import Record
from pyiceberg.types import (
IcebergType,
ListType,
@@ -158,14 +158,14 @@ def read_manifest_list(input_file: InputFile) -> Iterator[ManifestFile]:
@singledispatch
-def _convert_pos_to_dict(schema: Union[Schema, IcebergType], struct: AvroStruct) -> Dict[str, Any]:
+def _convert_pos_to_dict(schema: Union[Schema, IcebergType], struct: Record) -> Dict[str, Any]:
"""Converts the positions in the field names
This makes it easy to map it onto a Pydantic model. Might change later on depending on the performance
Args:
schema (Schema | IcebergType): The schema of the file
- struct (AvroStruct): The struct containing the data by positions
+ struct (Record): The struct containing the data by positions
Raises:
NotImplementedError: If attempting to handle an unknown type in the schema
@@ -174,12 +174,12 @@ def _convert_pos_to_dict(schema: Union[Schema, IcebergType], struct: AvroStruct)
@_convert_pos_to_dict.register
-def _(schema: Schema, struct: AvroStruct) -> Dict[str, Any]:
+def _(schema: Schema, struct: Record) -> Dict[str, Any]:
return _convert_pos_to_dict(schema.as_struct(), struct)
@_convert_pos_to_dict.register
-def _(struct_type: StructType, values: AvroStruct) -> Dict[str, Any]:
+def _(struct_type: StructType, values: Record) -> Dict[str, Any]:
"""Iterates over all the fields in the dict, and gets the data from the
struct"""
return (
{field.name: _convert_pos_to_dict(field.field_type, values.get(pos)) for pos, field in enumerate(struct_type.fields)}
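For context on the dispatch mechanism above: _convert_pos_to_dict is built on the standard-library functools.singledispatch, so each @_convert_pos_to_dict.register overload is selected by the runtime type of the first argument. A minimal standalone sketch of the pattern (illustrative names, not part of this commit):

    from functools import singledispatch

    @singledispatch
    def describe(obj: object) -> str:
        # Fallback, mirroring the NotImplementedError raised for unknown schema types
        raise NotImplementedError(f"Unknown type: {type(obj)}")

    @describe.register
    def _(obj: int) -> str:
        return f"int: {obj}"

    @describe.register
    def _(obj: str) -> str:
        return f"str: {obj}"

    assert describe(42) == "int: 42"
    assert describe("hi") == "str: hi"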
diff --git a/python/pyiceberg/typedef.py b/python/pyiceberg/typedef.py
index 436b1a6e79..6ad668f006 100644
--- a/python/pyiceberg/typedef.py
+++ b/python/pyiceberg/typedef.py
@@ -14,12 +14,15 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
+from __future__ import annotations
+
from abc import abstractmethod
from decimal import Decimal
from typing import (
Any,
Callable,
Dict,
+ List,
Protocol,
Tuple,
TypeVar,
@@ -78,3 +81,20 @@ class StructProtocol(Protocol): # pragma: no cover
@abstractmethod
def set(self, pos: int, value: Any) -> None:
...
+
+
+class Record(StructProtocol):
+ _data: List[Union[Any, StructProtocol]]
+
+ def __init__(self, *data: Union[Any, StructProtocol]) -> None:
+ self._data = list(data)
+
+ def set(self, pos: int, value: Any) -> None:
+ self._data[pos] = value
+
+ def get(self, pos: int) -> Any:
+ return self._data[pos]
+
+ def __eq__(self, other: Any) -> bool:
+ # For testing
+ return True if isinstance(other, Record) and other._data == self._data else False
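A minimal usage sketch of the Record class added above (assuming pyiceberg as of this commit): the varargs constructor replaces AvroStruct's single-list _data field, and the loose __eq__, flagged "For testing", compares the underlying data.

    from pyiceberg.typedef import Record

    row = Record("foo", 123, True)  # was: AvroStruct(_data=["foo", 123, True])
    assert row.get(1) == 123
    row.set(1, 456)
    assert row == Record("foo", 456, True)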
diff --git a/python/tests/avro/test_reader.py b/python/tests/avro/test_reader.py
index fc37f9f9a3..2528882333 100644
--- a/python/tests/avro/test_reader.py
+++ b/python/tests/avro/test_reader.py
@@ -21,7 +21,6 @@ import pytest
from pyiceberg.avro.file import AvroFile
from pyiceberg.avro.reader import (
- AvroStruct,
BinaryReader,
BooleanReader,
ConstructReader,
@@ -37,8 +36,10 @@ from pyiceberg.avro.reader import (
TimestamptzReader,
UUIDReader,
)
+from pyiceberg.io.pyarrow import PyArrowFileIO
from pyiceberg.manifest import _convert_pos_to_dict
from pyiceberg.schema import Schema, visit
+from pyiceberg.typedef import Record
from pyiceberg.types import (
BinaryType,
BooleanType,
@@ -60,11 +61,10 @@ from pyiceberg.types import (
TimeType,
UUIDType,
)
-from tests.io.test_io import LocalInputFile
def test_read_header(generated_manifest_entry_file: str, iceberg_manifest_entry_schema: Schema) -> None:
- with AvroFile(LocalInputFile(generated_manifest_entry_file)) as reader:
+ with AvroFile(PyArrowFileIO().new_input(generated_manifest_entry_file)) as reader:
header = reader._read_header()
assert header.magic == b"Obj\x01"
@@ -257,147 +257,141 @@ def test_read_header(generated_manifest_entry_file: str, iceberg_manifest_entry_
def test_read_manifest_entry_file(generated_manifest_entry_file: str) -> None:
- with AvroFile(LocalInputFile(generated_manifest_entry_file)) as reader:
+ with AvroFile(PyArrowFileIO().new_input(generated_manifest_entry_file)) as reader:
# Consume the generator
records = list(reader)
assert len(records) == 2, f"Expected 2 records, got {len(records)}"
- assert records[0] == AvroStruct(
- _data=[
- 1,
- 8744736658442914487,
- AvroStruct(
- _data=[
- "/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=null/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00001.parquet",
- "PARQUET",
- AvroStruct(_data=[None]),
- 19513,
- 388872,
- 67108864,
- {
- 1: 53,
- 2: 98153,
- 3: 98693,
- 4: 53,
- 5: 53,
- 6: 53,
- 7: 17425,
- 8: 18528,
- 9: 53,
- 10: 44788,
- 11: 35571,
- 12: 53,
- 13: 1243,
- 14: 2355,
- 15: 12750,
- 16: 4029,
- 17: 110,
- 18: 47194,
- 19: 2948,
- },
- {
- 1: 19513,
- 2: 19513,
- 3: 19513,
- 4: 19513,
- 5: 19513,
- 6: 19513,
- 7: 19513,
- 8: 19513,
- 9: 19513,
- 10: 19513,
- 11: 19513,
- 12: 19513,
- 13: 19513,
- 14: 19513,
- 15: 19513,
- 16: 19513,
- 17: 19513,
- 18: 19513,
- 19: 19513,
- },
- {
- 1: 19513,
- 2: 0,
- 3: 0,
- 4: 19513,
- 5: 19513,
- 6: 19513,
- 7: 0,
- 8: 0,
- 9: 19513,
- 10: 0,
- 11: 0,
- 12: 19513,
- 13: 0,
- 14: 0,
- 15: 0,
- 16: 0,
- 17: 0,
- 18: 0,
- 19: 0,
- },
- {16: 0, 17: 0, 18: 0, 19: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0},
- {
- 2: b"2020-04-01 00:00",
- 3: b"2020-04-01 00:12",
- 7: b"\x03\x00\x00\x00",
- 8: b"\x01\x00\x00\x00",
- 10: b"\xf6(\\\x8f\xc2\x05S\xc0",
- 11: b"\x00\x00\x00\x00\x00\x00\x00\x00",
- 13: b"\x00\x00\x00\x00\x00\x00\x00\x00",
- 14: b"\x00\x00\x00\x00\x00\x00\xe0\xbf",
- 15: b")\\\x8f\xc2\xf5(\x08\xc0",
- 16: b"\x00\x00\x00\x00\x00\x00\x00\x00",
- 17: b"\x00\x00\x00\x00\x00\x00\x00\x00",
- 18: b"\xf6(\\\x8f\xc2\xc5S\xc0",
- 19: b"\x00\x00\x00\x00\x00\x00\x04\xc0",
- },
- {
- 2: b"2020-04-30 23:5:",
- 3: b"2020-05-01 00:41",
- 7: b"\t\x01\x00\x00",
- 8: b"\t\x01\x00\x00",
- 10: b"\xcd\xcc\xcc\xcc\xcc,_@",
- 11: b"\x1f\x85\xebQ\\\xe2\xfe@",
- 13: b"\x00\x00\x00\x00\x00\x00\x12@",
- 14: b"\x00\x00\x00\x00\x00\x00\xe0?",
- 15: b"q=\n\xd7\xa3\xf01@",
- 16: b"\x00\x00\x00\x00\x00`B@",
- 17: b"333333\xd3?",
- 18: b"\x00\x00\x00\x00\x00\x18b@",
- 19: b"\x00\x00\x00\x00\x00\x00\x04@",
- },
- None,
- [4],
- 0,
- ]
- ),
- ]
+ assert records[0] == Record(
+ 1,
+ 8744736658442914487,
+ Record(
+ "/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=null/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00001.parquet",
+ "PARQUET",
+ Record(None),
+ 19513,
+ 388872,
+ 67108864,
+ {
+ 1: 53,
+ 2: 98153,
+ 3: 98693,
+ 4: 53,
+ 5: 53,
+ 6: 53,
+ 7: 17425,
+ 8: 18528,
+ 9: 53,
+ 10: 44788,
+ 11: 35571,
+ 12: 53,
+ 13: 1243,
+ 14: 2355,
+ 15: 12750,
+ 16: 4029,
+ 17: 110,
+ 18: 47194,
+ 19: 2948,
+ },
+ {
+ 1: 19513,
+ 2: 19513,
+ 3: 19513,
+ 4: 19513,
+ 5: 19513,
+ 6: 19513,
+ 7: 19513,
+ 8: 19513,
+ 9: 19513,
+ 10: 19513,
+ 11: 19513,
+ 12: 19513,
+ 13: 19513,
+ 14: 19513,
+ 15: 19513,
+ 16: 19513,
+ 17: 19513,
+ 18: 19513,
+ 19: 19513,
+ },
+ {
+ 1: 19513,
+ 2: 0,
+ 3: 0,
+ 4: 19513,
+ 5: 19513,
+ 6: 19513,
+ 7: 0,
+ 8: 0,
+ 9: 19513,
+ 10: 0,
+ 11: 0,
+ 12: 19513,
+ 13: 0,
+ 14: 0,
+ 15: 0,
+ 16: 0,
+ 17: 0,
+ 18: 0,
+ 19: 0,
+ },
+ {16: 0, 17: 0, 18: 0, 19: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0},
+ {
+ 2: b"2020-04-01 00:00",
+ 3: b"2020-04-01 00:12",
+ 7: b"\x03\x00\x00\x00",
+ 8: b"\x01\x00\x00\x00",
+ 10: b"\xf6(\\\x8f\xc2\x05S\xc0",
+ 11: b"\x00\x00\x00\x00\x00\x00\x00\x00",
+ 13: b"\x00\x00\x00\x00\x00\x00\x00\x00",
+ 14: b"\x00\x00\x00\x00\x00\x00\xe0\xbf",
+ 15: b")\\\x8f\xc2\xf5(\x08\xc0",
+ 16: b"\x00\x00\x00\x00\x00\x00\x00\x00",
+ 17: b"\x00\x00\x00\x00\x00\x00\x00\x00",
+ 18: b"\xf6(\\\x8f\xc2\xc5S\xc0",
+ 19: b"\x00\x00\x00\x00\x00\x00\x04\xc0",
+ },
+ {
+ 2: b"2020-04-30 23:5:",
+ 3: b"2020-05-01 00:41",
+ 7: b"\t\x01\x00\x00",
+ 8: b"\t\x01\x00\x00",
+ 10: b"\xcd\xcc\xcc\xcc\xcc,_@",
+ 11: b"\x1f\x85\xebQ\\\xe2\xfe@",
+ 13: b"\x00\x00\x00\x00\x00\x00\x12@",
+ 14: b"\x00\x00\x00\x00\x00\x00\xe0?",
+ 15: b"q=\n\xd7\xa3\xf01@",
+ 16: b"\x00\x00\x00\x00\x00`B@",
+ 17: b"333333\xd3?",
+ 18: b"\x00\x00\x00\x00\x00\x18b@",
+ 19: b"\x00\x00\x00\x00\x00\x00\x04@",
+ },
+ None,
+ [4],
+ 0,
+ ),
)
def test_read_manifest_file_file(generated_manifest_file_file: str) -> None:
- with AvroFile(LocalInputFile(generated_manifest_file_file)) as reader:
+ with AvroFile(PyArrowFileIO().new_input(generated_manifest_file_file)) as reader:
# Consume the generator
records = list(reader)
assert len(records) == 1, f"Expected 1 records, got {len(records)}"
actual = records[0]
- expected = AvroStruct(
- _data=[
- actual.get(0),
- 7989,
- 0,
- 9182715666859759686,
- 3,
- 0,
- 0,
- [AvroStruct(_data=[True, False, b"\x01\x00\x00\x00", b"\x02\x00\x00\x00"])],
- 237993,
- 0,
- 0,
- ]
+ expected = Record(
+ actual.get(0),
+ 7989,
+ 0,
+ 9182715666859759686,
+ 3,
+ 0,
+ 0,
+ [Record(True, False, b"\x01\x00\x00\x00", b"\x02\x00\x00\x00")],
+ 237993,
+ 0,
+ 0,
)
assert actual == expected
@@ -407,7 +401,7 @@ def test_null_list_convert_pos_to_dict() -> None:
Schema(
NestedField(name="field", field_id=1,
field_type=ListType(element_id=2, element=StringType(), element_required=False))
),
- AvroStruct([None]),
+ Record(None),
)
assert data["field"] is None
@@ -421,7 +415,7 @@ def test_null_dict_convert_pos_to_dict() -> None:
field_type=MapType(key_id=2, key_type=StringType(), value_id=3, value_type=StringType(), value_required=False),
)
),
- AvroStruct([None]),
+ Record(None),
)
assert data["field"] is None
@@ -438,7 +432,7 @@ def test_null_struct_convert_pos_to_dict() -> None:
required=False,
)
),
- AvroStruct([None]),
+ Record(None),
)
assert data["field"] is None
diff --git a/python/tests/catalog/test_base.py b/python/tests/catalog/test_base.py
index ab9557448e..7281ff839c 100644
--- a/python/tests/catalog/test_base.py
+++ b/python/tests/catalog/test_base.py
@@ -42,10 +42,10 @@ from pyiceberg.io import load_file_io
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionSpec
from pyiceberg.schema import Schema
from pyiceberg.table import Table
+from pyiceberg.table.metadata import TableMetadataV1
from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder
from pyiceberg.transforms import IdentityTransform
from pyiceberg.typedef import EMPTY_DICT
-from tests.table.test_metadata import EXAMPLE_TABLE_METADATA_V1
class InMemoryCatalog(Catalog):
@@ -80,7 +80,27 @@ class InMemoryCatalog(Catalog):
table = Table(
identifier=identifier,
- metadata=EXAMPLE_TABLE_METADATA_V1,
+ metadata=TableMetadataV1(
+ **{
+ "format-version": 1,
+ "table-uuid": "d20125c8-7284-442c-9aea-15fee620737c",
+ "location": "s3://bucket/test/location",
+ "last-updated-ms": 1602638573874,
+ "last-column-id": 3,
+ "schema": {
+ "type": "struct",
+ "fields": [
+ {"id": 1, "name": "x", "required": True,
"type": "long"},
+ {"id": 2, "name": "y", "required": True,
"type": "long", "doc": "comment"},
+ {"id": 3, "name": "z", "required": True,
"type": "long"},
+ ],
+ },
+ "partition-spec": [{"name": "x", "transform":
"identity", "source-id": 1, "field-id": 1000}],
+ "properties": {},
+ "current-snapshot-id": -1,
+ "snapshots": [{"snapshot-id": 1925, "timestamp-ms":
1602638573822}],
+ }
+ ),
metadata_location=f's3://warehouse/{"/".join(identifier)}/metadata/metadata.json',
io=load_file_io(),
)
diff --git a/python/tests/catalog/test_hive.py b/python/tests/catalog/test_hive.py
index f9a47e516c..1cb7cba04b 100644
--- a/python/tests/catalog/test_hive.py
+++ b/python/tests/catalog/test_hive.py
@@ -42,6 +42,7 @@ from pyiceberg.exceptions import (
NoSuchNamespaceError,
NoSuchTableError,
)
+from pyiceberg.io.pyarrow import PyArrowFileIO
from pyiceberg.partitioning import PartitionField, PartitionSpec
from pyiceberg.schema import Schema
from pyiceberg.serializers import ToOutputFile
@@ -68,7 +69,6 @@ from pyiceberg.types import (
NestedField,
StringType,
)
-from tests.conftest import LocalFileIO
HIVE_CATALOG_NAME = "hive"
HIVE_METASTORE_FAKE_URL = "thrift://unknown:9083"
@@ -78,7 +78,8 @@ HIVE_METASTORE_FAKE_URL = "thrift://unknown:9083"
def hive_table(tmp_path_factory: pytest.TempPathFactory, example_table_metadata_v2: Dict[str, Any]) -> HiveTable:
metadata_path = str(tmp_path_factory.mktemp("metadata") / f"{uuid.uuid4()}.metadata.json")
metadata = TableMetadataV2(**example_table_metadata_v2)
- ToOutputFile.table_metadata(metadata, LocalFileIO().new_output(str(metadata_path)), True)
+
+ ToOutputFile.table_metadata(metadata, PyArrowFileIO().new_output(location=str(metadata_path)), True)
return HiveTable(
tableName="new_tabl2e",
diff --git a/python/tests/conftest.py b/python/tests/conftest.py
index fffc50547f..f809cfc503 100644
--- a/python/tests/conftest.py
+++ b/python/tests/conftest.py
@@ -31,8 +31,6 @@ from typing import (
Callable,
Dict,
Generator,
- Type,
- Union,
)
from unittest.mock import MagicMock
from urllib.parse import urlparse
@@ -49,14 +47,9 @@ import pytest
from moto import mock_glue, mock_s3
from pyiceberg import schema
-from pyiceberg.io import (
- FileIO,
- InputFile,
- OutputFile,
- OutputStream,
- fsspec,
-)
+from pyiceberg.io import OutputFile, OutputStream, fsspec
from pyiceberg.io.fsspec import FsspecFileIO
+from pyiceberg.io.pyarrow import PyArrowFile, PyArrowFileIO
from pyiceberg.schema import Schema
from pyiceberg.types import (
BinaryType,
@@ -72,7 +65,6 @@ from pyiceberg.types import (
StructType,
)
from tests.catalog.test_base import InMemoryCatalog
-from tests.io.test_io import LocalInputFile
def pytest_addoption(parser: pytest.Parser) -> None:
@@ -85,21 +77,6 @@ def pytest_addoption(parser: pytest.Parser) -> None:
)
-class FooStruct:
- """An example of an object that abides by StructProtocol"""
-
- content: Dict[int, Any]
-
- def __init__(self) -> None:
- self.content = {}
-
- def get(self, pos: int) -> Any:
- return self.content[pos]
-
- def set(self, pos: int, value: Any) -> None:
- self.content[pos] = value
-
-
@pytest.fixture(scope="session")
def table_schema_simple() -> Schema:
return schema.Schema(
@@ -162,11 +139,6 @@ def table_schema_nested() -> Schema:
)
[email protected](scope="session")
-def foo_struct() -> FooStruct:
- return FooStruct()
-
-
@pytest.fixture(scope="session")
def all_avro_types() -> Dict[str, Any]:
return {
@@ -922,8 +894,8 @@ class LocalOutputFile(OutputFile):
def exists(self) -> bool:
return os.path.exists(self._path)
- def to_input_file(self) -> LocalInputFile:
- return LocalInputFile(location=self.location)
+ def to_input_file(self) -> PyArrowFile:
+ return PyArrowFileIO().new_input(location=self.location)
def create(self, overwrite: bool = False) -> OutputStream:
output_file = open(self._path, "wb" if overwrite else "xb")
@@ -932,25 +904,6 @@ class LocalOutputFile(OutputFile):
return output_file
-class LocalFileIO(FileIO):
- """A FileIO implementation for local files (for test use only)"""
-
- def new_input(self, location: str) -> LocalInputFile:
- return LocalInputFile(location=location)
-
- def new_output(self, location: str) -> LocalOutputFile:
- return LocalOutputFile(location=location)
-
- def delete(self, location: Union[str, InputFile, OutputFile]) -> None:
- location = location.location if isinstance(location, (InputFile, OutputFile)) else location
- os.remove(location)
-
-
[email protected](scope="session", autouse=True)
-def LocalFileIOFixture() -> Type[LocalFileIO]:
- return LocalFileIO
-
-
@pytest.fixture(scope="session")
def generated_manifest_entry_file(avro_schema_manifest_entry: Dict[str, Any]) -> Generator[str, None, None]:
from fastavro import parse_schema, writer
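The conftest changes above drop the hand-written LocalFileIO/LocalInputFile test doubles in favor of the real PyArrowFileIO. A minimal sketch of the round trip the tests now rely on (hypothetical /tmp path; only APIs that appear in this diff):

    from pyiceberg.io.pyarrow import PyArrowFileIO

    io = PyArrowFileIO()
    with io.new_output(location="/tmp/example.txt").create(overwrite=True) as f:
        f.write(b"foo")
    with io.new_input(location="/tmp/example.txt").open() as f:
        assert f.read() == b"foo"
    io.delete("/tmp/example.txt")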
diff --git a/python/tests/expressions/test_evaluator.py b/python/tests/expressions/test_evaluator.py
index 4d4025a268..95766b0024 100644
--- a/python/tests/expressions/test_evaluator.py
+++ b/python/tests/expressions/test_evaluator.py
@@ -14,8 +14,6 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-from typing import Any, List
-
from pyiceberg.expressions import (
AlwaysFalse,
AlwaysTrue,
@@ -37,7 +35,7 @@ from pyiceberg.expressions import (
)
from pyiceberg.expressions.visitors import expression_evaluator
from pyiceberg.schema import Schema
-from pyiceberg.typedef import StructProtocol
+from pyiceberg.typedef import Record
from pyiceberg.types import (
DoubleType,
LongType,
@@ -45,20 +43,6 @@ from pyiceberg.types import (
StringType,
)
-
-class Record(StructProtocol):
- data: List[Any]
-
- def __init__(self, *values: Any) -> None:
- self.data = list(values)
-
- def get(self, pos: int) -> Any:
- return self.data[pos]
-
- def set(self, pos: int, value: Any) -> None:
- self.data[pos] = value
-
-
SIMPLE_SCHEMA = Schema(
NestedField(id=1, name="id", field_type=LongType()), NestedField(id=2,
name="data", field_type=StringType(), required=False)
)
diff --git a/python/tests/expressions/test_expressions.py b/python/tests/expressions/test_expressions.py
index 233b80a385..7b26d45a28 100644
--- a/python/tests/expressions/test_expressions.py
+++ b/python/tests/expressions/test_expressions.py
@@ -61,6 +61,7 @@ from pyiceberg.expressions import (
from pyiceberg.expressions.literals import Literal, literal
from pyiceberg.expressions.visitors import _from_byte_buffer
from pyiceberg.schema import Accessor, Schema
+from pyiceberg.typedef import Record
from pyiceberg.types import (
DoubleType,
FloatType,
@@ -70,8 +71,29 @@ from pyiceberg.types import (
NestedField,
StringType,
)
-from tests.conftest import FooStruct
-from tests.expressions.test_visitors import ExpressionA, ExpressionB
+from pyiceberg.utils.singleton import Singleton
+
+
+class ExpressionA(BooleanExpression, Singleton):
+ def __invert__(self) -> BooleanExpression:
+ return ExpressionB()
+
+ def __repr__(self) -> str:
+ return "ExpressionA()"
+
+ def __str__(self) -> str:
+ return "testexpra"
+
+
+class ExpressionB(BooleanExpression, Singleton):
+ def __invert__(self) -> BooleanExpression:
+ return ExpressionA()
+
+ def __repr__(self) -> str:
+ return "ExpressionB()"
+
+ def __str__(self) -> str:
+ return "testexprb"
def test_isnull_inverse() -> None:
@@ -580,36 +602,38 @@ def test_invert_always() -> None:
assert ~AlwaysTrue() == AlwaysFalse()
-def test_accessor_base_class(foo_struct: FooStruct) -> None:
+def test_accessor_base_class() -> None:
"""Test retrieving a value at a position of a container using an
accessor"""
+ struct = Record(*([None] * 12))
+
uuid_value = uuid.uuid4()
- foo_struct.set(0, "foo")
- foo_struct.set(1, "bar")
- foo_struct.set(2, "baz")
- foo_struct.set(3, 1)
- foo_struct.set(4, 2)
- foo_struct.set(5, 3)
- foo_struct.set(6, 1.234)
- foo_struct.set(7, Decimal("1.234"))
- foo_struct.set(8, uuid_value)
- foo_struct.set(9, True)
- foo_struct.set(10, False)
- foo_struct.set(11, b"\x19\x04\x9e?")
-
- assert Accessor(position=0).get(foo_struct) == "foo"
- assert Accessor(position=1).get(foo_struct) == "bar"
- assert Accessor(position=2).get(foo_struct) == "baz"
- assert Accessor(position=3).get(foo_struct) == 1
- assert Accessor(position=4).get(foo_struct) == 2
- assert Accessor(position=5).get(foo_struct) == 3
- assert Accessor(position=6).get(foo_struct) == 1.234
- assert Accessor(position=7).get(foo_struct) == Decimal("1.234")
- assert Accessor(position=8).get(foo_struct) == uuid_value
- assert Accessor(position=9).get(foo_struct) is True
- assert Accessor(position=10).get(foo_struct) is False
- assert Accessor(position=11).get(foo_struct) == b"\x19\x04\x9e?"
+ struct.set(0, "foo")
+ struct.set(1, "bar")
+ struct.set(2, "baz")
+ struct.set(3, 1)
+ struct.set(4, 2)
+ struct.set(5, 3)
+ struct.set(6, 1.234)
+ struct.set(7, Decimal("1.234"))
+ struct.set(8, uuid_value)
+ struct.set(9, True)
+ struct.set(10, False)
+ struct.set(11, b"\x19\x04\x9e?")
+
+ assert Accessor(position=0).get(struct) == "foo"
+ assert Accessor(position=1).get(struct) == "bar"
+ assert Accessor(position=2).get(struct) == "baz"
+ assert Accessor(position=3).get(struct) == 1
+ assert Accessor(position=4).get(struct) == 2
+ assert Accessor(position=5).get(struct) == 3
+ assert Accessor(position=6).get(struct) == 1.234
+ assert Accessor(position=7).get(struct) == Decimal("1.234")
+ assert Accessor(position=8).get(struct) == uuid_value
+ assert Accessor(position=9).get(struct) is True
+ assert Accessor(position=10).get(struct) is False
+ assert Accessor(position=11).get(struct) == b"\x19\x04\x9e?"
@pytest.fixture
@@ -876,11 +900,13 @@ def test_less_than_or_equal() -> None:
assert less_than_or_equal == eval(repr(less_than_or_equal))
-def test_bound_reference_eval(table_schema_simple: Schema, foo_struct: FooStruct) -> None:
+def test_bound_reference_eval(table_schema_simple: Schema) -> None:
"""Test creating a BoundReference and evaluating it on a StructProtocol"""
- foo_struct.set(pos=1, value="foovalue")
- foo_struct.set(pos=2, value=123)
- foo_struct.set(pos=3, value=True)
+ struct = Record(None, None, None, None)
+
+ struct.set(pos=1, value="foovalue")
+ struct.set(pos=2, value=123)
+ struct.set(pos=3, value=True)
position1_accessor = Accessor(position=1)
position2_accessor = Accessor(position=2)
@@ -894,9 +920,9 @@ def test_bound_reference_eval(table_schema_simple: Schema, foo_struct: FooStruct
bound_ref2 = BoundReference(field=field2, accessor=position2_accessor)
bound_ref3 = BoundReference(field=field3, accessor=position3_accessor)
- assert bound_ref1.eval(foo_struct) == "foovalue"
- assert bound_ref2.eval(foo_struct) == 123
- assert bound_ref3.eval(foo_struct) is True
+ assert bound_ref1.eval(struct) == "foovalue"
+ assert bound_ref2.eval(struct) == 123
+ assert bound_ref3.eval(struct) is True
def test_non_primitive_from_byte_buffer() -> None:
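The two rewritten tests above share one idea: an Accessor (or a BoundReference wrapping one) fetches a value by position from anything implementing StructProtocol, so a plain Record can stand in for the old FooStruct mock. A condensed sketch (assuming pyiceberg as of this commit):

    from pyiceberg.schema import Accessor
    from pyiceberg.typedef import Record

    struct = Record(None, None)
    struct.set(pos=1, value="foovalue")
    assert Accessor(position=1).get(struct) == "foovalue"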
diff --git a/python/tests/expressions/test_visitors.py b/python/tests/expressions/test_visitors.py
index e179ccf05f..19915cc6b9 100644
--- a/python/tests/expressions/test_visitors.py
+++ b/python/tests/expressions/test_visitors.py
@@ -79,29 +79,6 @@ from pyiceberg.types import (
PrimitiveType,
StringType,
)
-from pyiceberg.utils.singleton import Singleton
-
-
-class ExpressionA(BooleanExpression, Singleton):
- def __invert__(self) -> BooleanExpression:
- return ExpressionB()
-
- def __repr__(self) -> str:
- return "ExpressionA()"
-
- def __str__(self) -> str:
- return "testexpra"
-
-
-class ExpressionB(BooleanExpression, Singleton):
- def __invert__(self) -> BooleanExpression:
- return ExpressionA()
-
- def __repr__(self) -> str:
- return "ExpressionB()"
-
- def __str__(self) -> str:
- return "testexprb"
class ExampleVisitor(BooleanExpressionVisitor[List[str]]):
@@ -135,33 +112,13 @@ class ExampleVisitor(BooleanExpressionVisitor[List[str]]):
return self.visit_history
def visit_unbound_predicate(self, predicate: UnboundPredicate[Any]) -> List[str]:
- self.visit_history.append("UNBOUND PREDICATE")
+ self.visit_history.append(str(predicate.__class__.__name__).upper())
return self.visit_history
def visit_bound_predicate(self, predicate: BoundPredicate[Any]) -> List[str]:
- self.visit_history.append("BOUND PREDICATE")
+ self.visit_history.append(str(predicate.__class__.__name__).upper())
return self.visit_history
- def visit_test_expression_a(self) -> List[str]:
- self.visit_history.append("ExpressionA")
- return self.visit_history
-
- def visit_test_expression_b(self) -> List[str]:
- self.visit_history.append("ExpressionB")
- return self.visit_history
-
-
[email protected](ExpressionA)
-def _(obj: ExpressionA, visitor: ExampleVisitor) -> List[str]:
- """Visit a ExpressionA with a BooleanExpressionVisitor"""
- return visitor.visit_test_expression_a()
-
-
[email protected](ExpressionB)
-def _(obj: ExpressionB, visitor: ExampleVisitor) -> List[str]:
- """Visit a ExpressionB with a BooleanExpressionVisitor"""
- return visitor.visit_test_expression_b()
-
class FooBoundBooleanExpressionVisitor(BoundBooleanExpressionVisitor[List[str]]):
"""A test implementation of a BoundBooleanExpressionVisitor
@@ -250,26 +207,26 @@ class FooBoundBooleanExpressionVisitor(BoundBooleanExpressionVisitor[List[str]])
def test_boolean_expression_visitor() -> None:
"""Test post-order traversal of boolean expression visit method"""
expr = And(
- Or(Not(ExpressionA()), Not(ExpressionB()), ExpressionA(), ExpressionB()),
- Not(ExpressionA()),
- ExpressionB(),
+ Or(Not(EqualTo("a", 1)), Not(NotEqualTo("b", 0)), EqualTo("a", 1),
NotEqualTo("b", 0)),
+ Not(EqualTo("a", 1)),
+ NotEqualTo("b", 0),
)
visitor = ExampleVisitor()
result = visit(expr, visitor=visitor)
assert result == [
- "ExpressionA",
+ "EQUALTO",
"NOT",
- "ExpressionB",
+ "NOTEQUALTO",
"NOT",
"OR",
- "ExpressionA",
+ "EQUALTO",
"OR",
- "ExpressionB",
+ "NOTEQUALTO",
"OR",
- "ExpressionA",
+ "EQUALTO",
"NOT",
"AND",
- "ExpressionB",
+ "NOTEQUALTO",
"AND",
]
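The assertion above encodes the visitor's post-order contract: operands are visited before their operator, which is why every "NOT" follows the predicate it negates and the "OR"/"AND" entries come last. A tiny standalone illustration of post-order traversal (hypothetical helper, not the pyiceberg visitor):

    def post_order(expr, out):
        op, *children = expr
        for child in children:
            if isinstance(child, tuple):
                post_order(child, out)
            else:
                out.append(child.upper())  # leaf predicate first
        out.append(op.upper())  # operator after its operands
        return out

    assert post_order(("and", ("not", "equalto"), "notequalto"), []) == ["EQUALTO", "NOT", "NOTEQUALTO", "AND"]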
diff --git a/python/tests/io/test_fsspec.py b/python/tests/io/test_fsspec.py
index 22cad25ea4..fff308b212 100644
--- a/python/tests/io/test_fsspec.py
+++ b/python/tests/io/test_fsspec.py
@@ -16,7 +16,6 @@
# under the License.
import uuid
-from typing import Generator
import pytest
from botocore.awsrequest import AWSRequest
@@ -25,12 +24,12 @@ from requests_mock import Mocker
from pyiceberg.exceptions import SignError
from pyiceberg.io import fsspec
from pyiceberg.io.fsspec import FsspecFileIO, s3v4_rest_signer
-from tests.io.test_io import LocalInputFile
+from pyiceberg.io.pyarrow import PyArrowFileIO
@pytest.mark.s3
def test_fsspec_new_input_file(fsspec_fileio: FsspecFileIO) -> None:
- """Test creating a new input file from an fsspec file-io"""
+ """Test creating a new input file from a fsspec file-io"""
filename = str(uuid.uuid4())
input_file = fsspec_fileio.new_input(f"s3://warehouse/{filename}")
@@ -193,10 +192,10 @@ def test_fsspec_converting_an_outputfile_to_an_inputfile(fsspec_fileio: FsspecFi
@pytest.mark.s3
-def test_writing_avro_file(generated_manifest_entry_file: Generator[str, None, None], fsspec_fileio: FsspecFileIO) -> None:
+def test_writing_avro_file(generated_manifest_entry_file: str, fsspec_fileio: FsspecFileIO) -> None:
 """Test that bytes match when reading a local avro file, writing it using fsspec file-io, and then reading it again"""
filename = str(uuid.uuid4())
- with LocalInputFile(generated_manifest_entry_file).open() as f:
+ with PyArrowFileIO().new_input(location=generated_manifest_entry_file).open() as f:
b1 = f.read()
with fsspec_fileio.new_output(location=f"s3://warehouse/{filename}").create() as out_f:
out_f.write(b1)
diff --git a/python/tests/io/test_io.py b/python/tests/io/test_io.py
index a83cba1af3..6cb5dbc7d3 100644
--- a/python/tests/io/test_io.py
+++ b/python/tests/io/test_io.py
@@ -17,20 +17,12 @@
import os
import tempfile
-from typing import Type, Union
-from unittest.mock import patch
-from urllib.parse import ParseResult, urlparse
import pytest
from pyiceberg.io import (
ARROW_FILE_IO,
PY_IO_IMPL,
- FileIO,
- InputFile,
- InputStream,
- OutputFile,
- OutputStream,
_import_file_io,
load_file_io,
)
@@ -38,102 +30,7 @@ from pyiceberg.io.fsspec import FsspecFileIO
from pyiceberg.io.pyarrow import PyArrowFileIO
-class LocalInputFile(InputFile):
- """An InputFile implementation for local files (for test use only)"""
-
- def __init__(self, location: str) -> None:
-
- parsed_location = urlparse(location) # Create a ParseResult from the uri
- if parsed_location.scheme and parsed_location.scheme != "file": # Validate that a uri is provided with a scheme of `file`
- raise ValueError("LocalInputFile location must have a scheme of `file`")
- elif parsed_location.netloc:
- raise ValueError(f"Network location is not allowed for LocalInputFile: {parsed_location.netloc}")
-
- super().__init__(location=location)
- self._parsed_location = parsed_location
-
- @property
- def parsed_location(self) -> ParseResult:
- """The parsed location
-
- Returns:
- ParseResult: The parsed results which has attributes `scheme`, `netloc`, `path`,
- `params`, `query`, and `fragments`.
- """
- return self._parsed_location
-
- def __len__(self) -> int:
- return os.path.getsize(self.parsed_location.path)
-
- def exists(self) -> bool:
- return os.path.exists(self.parsed_location.path)
-
- def open(self) -> InputStream:
- input_file = open(self.parsed_location.path, "rb")
- if not isinstance(input_file, InputStream):
- raise TypeError("Object returned from LocalInputFile.open() does
not match the OutputStream protocol.")
- return input_file
-
-
-class LocalOutputFile(OutputFile):
- """An OutputFile implementation for local files (for test use only)"""
-
- def __init__(self, location: str) -> None:
- parsed_location = urlparse(location) # Create a ParseResult from the uri
- if parsed_location.scheme and parsed_location.scheme != "file": # Validate that a uri is provided with a scheme of `file`
- raise ValueError("LocalOutputFile location must have a scheme of `file`")
- elif parsed_location.netloc:
- raise ValueError(f"Network location is not allowed for LocalOutputFile: {parsed_location.netloc}")
-
- super().__init__(location=location)
- self._parsed_location = parsed_location
-
- @property
- def parsed_location(self) -> ParseResult:
- """The parsed location
-
- Returns:
- ParseResult: The parsed results which has attributes `scheme`, `netloc`, `path`,
- `params`, `query`, and `fragments`.
- """
- return self._parsed_location
-
- def __len__(self) -> int:
- return os.path.getsize(self.parsed_location.path)
-
- def exists(self) -> bool:
- return os.path.exists(self.parsed_location.path)
-
- def to_input_file(self) -> LocalInputFile:
- return LocalInputFile(location=self.location)
-
- def create(self, overwrite: bool = False) -> OutputStream:
- output_file = open(self.parsed_location.path, "wb" if overwrite else "xb")
- if not issubclass(type(output_file), OutputStream):
- raise TypeError("Object returned from LocalOutputFile.create(...) does not match the OutputStream protocol.")
- return output_file
-
-
-class LocalFileIO(FileIO):
- """A FileIO implementation for local files (for test use only)"""
-
- def new_input(self, location: str) -> LocalInputFile:
- return LocalInputFile(location=location)
-
- def new_output(self, location: str) -> LocalOutputFile:
- return LocalOutputFile(location=location)
-
- def delete(self, location: Union[str, InputFile, OutputFile]) -> None:
- location = location.location if isinstance(location, (InputFile, OutputFile)) else location
- parsed_location = urlparse(location)
- try:
- os.remove(parsed_location.path)
- except FileNotFoundError as e:
- raise FileNotFoundError(f"Cannot delete file, does not exist: {parsed_location.path}") from e
-
-
[email protected]("CustomFileIO", [LocalFileIO, PyArrowFileIO])
-def test_custom_local_input_file(CustomFileIO: Type[FileIO]) -> None:
+def test_custom_local_input_file() -> None:
"""Test initializing an InputFile implementation to read a local file"""
with tempfile.TemporaryDirectory() as tmpdirname:
file_location = os.path.join(tmpdirname, "foo.txt")
@@ -145,7 +42,7 @@ def test_custom_local_input_file(CustomFileIO: Type[FileIO]) -> None:
# Instantiate the input file
absolute_file_location = os.path.abspath(file_location)
- input_file = CustomFileIO().new_input(location=f"{absolute_file_location}")
+ input_file = PyArrowFileIO().new_input(location=f"{absolute_file_location}")
# Test opening and reading the file
f = input_file.open()
@@ -154,15 +51,14 @@ def test_custom_local_input_file(CustomFileIO: Type[FileIO]) -> None:
assert len(input_file) == 3
[email protected]("CustomFileIO", [LocalFileIO, PyArrowFileIO])
-def test_custom_local_output_file(CustomFileIO: Type[FileIO]) -> None:
+def test_custom_local_output_file() -> None:
"""Test initializing an OutputFile implementation to write to a local
file"""
with tempfile.TemporaryDirectory() as tmpdirname:
file_location = os.path.join(tmpdirname, "foo.txt")
# Instantiate the output file
absolute_file_location = os.path.abspath(file_location)
- output_file = CustomFileIO().new_output(location=f"{absolute_file_location}")
+ output_file = PyArrowFileIO().new_output(location=f"{absolute_file_location}")
# Create the output file and write to it
f = output_file.create()
@@ -175,8 +71,7 @@ def test_custom_local_output_file(CustomFileIO: Type[FileIO]) -> None:
assert len(output_file) == 3
[email protected]("CustomFileIO", [LocalFileIO, PyArrowFileIO])
-def test_custom_local_output_file_with_overwrite(CustomFileIO: Type[FileIO])
-> None:
+def test_custom_local_output_file_with_overwrite() -> None:
"""Test initializing an OutputFile implementation to overwrite a local
file"""
with tempfile.TemporaryDirectory() as tmpdirname:
output_file_location = os.path.join(tmpdirname, "foo.txt")
@@ -186,7 +81,7 @@ def test_custom_local_output_file_with_overwrite(CustomFileIO: Type[FileIO]) ->
write_file.write(b"foo")
# Instantiate an output file
- output_file = CustomFileIO().new_output(location=f"{output_file_location}")
+ output_file = PyArrowFileIO().new_output(location=f"{output_file_location}")
# Confirm that a FileExistsError is raised when overwrite=False
with pytest.raises(FileExistsError):
@@ -200,8 +95,7 @@ def test_custom_local_output_file_with_overwrite(CustomFileIO: Type[FileIO]) ->
assert f.read() == b"bar"
[email protected]("CustomFileIO", [LocalFileIO, PyArrowFileIO])
-def test_custom_file_exists(CustomFileIO: Type[FileIO]) -> None:
+def test_custom_file_exists() -> None:
"""Test that the exists property returns the proper value for existing and
non-existing files"""
with tempfile.TemporaryDirectory() as tmpdirname:
file_location = os.path.join(tmpdirname, "foo.txt")
@@ -218,30 +112,29 @@ def test_custom_file_exists(CustomFileIO: Type[FileIO]) -> None:
non_existent_absolute_file_location = os.path.abspath(nonexistent_file_location)
# Create InputFile instances
- input_file = CustomFileIO().new_input(location=f"{absolute_file_location}")
- non_existent_input_file = CustomFileIO().new_input(location=f"{non_existent_absolute_file_location}")
+ input_file = PyArrowFileIO().new_input(location=f"{absolute_file_location}")
+ non_existent_input_file = PyArrowFileIO().new_input(location=f"{non_existent_absolute_file_location}")
# Test opening and reading the file
assert input_file.exists()
assert not non_existent_input_file.exists()
# Create OutputFile instances
- file = CustomFileIO().new_output(location=f"{absolute_file_location}")
- non_existent_file = CustomFileIO().new_output(location=f"{non_existent_absolute_file_location}")
+ file = PyArrowFileIO().new_output(location=f"{absolute_file_location}")
+ non_existent_file = PyArrowFileIO().new_output(location=f"{non_existent_absolute_file_location}")
# Test opening and reading the file
assert file.exists()
assert not non_existent_file.exists()
[email protected]("CustomFileIO", [LocalFileIO, PyArrowFileIO])
-def test_output_file_to_input_file(CustomFileIO: Type[FileIO]) -> None:
+def test_output_file_to_input_file() -> None:
"""Test initializing an InputFile using the `to_input_file()` method on an
OutputFile instance"""
with tempfile.TemporaryDirectory() as tmpdirname:
output_file_location = os.path.join(tmpdirname, "foo.txt")
# Create an output file instance
- output_file = CustomFileIO().new_output(location=f"{output_file_location}")
+ output_file = PyArrowFileIO().new_output(location=f"{output_file_location}")
# Create the output file and write to it
with output_file.create() as output_stream:
@@ -254,20 +147,17 @@ def test_output_file_to_input_file(CustomFileIO: Type[FileIO]) -> None:
@pytest.mark.parametrize(
- "CustomFileIO,string_uri",
+ "string_uri",
[
- (LocalFileIO, "foo/bar.parquet"),
- (LocalFileIO, "file:///foo/bar.parquet"),
- (LocalFileIO, "file:/foo/bar/baz.parquet"),
- (PyArrowFileIO, "foo/bar/baz.parquet"),
- (PyArrowFileIO, "file:/foo/bar/baz.parquet"),
- (PyArrowFileIO, "file:/foo/bar/baz.parquet"),
+ "foo/bar/baz.parquet",
+ "file:/foo/bar/baz.parquet",
+ "file:/foo/bar/baz.parquet",
],
)
-def test_custom_file_io_locations(CustomFileIO: Type[FileIO], string_uri: str) -> None:
+def test_custom_file_io_locations(string_uri: str) -> None:
"""Test that the location property is maintained as the value of the
location argument"""
# Instantiate the file-io and create a new input and output file
- file_io = CustomFileIO()
+ file_io = PyArrowFileIO()
input_file = file_io.new_input(location=string_uri)
assert input_file.location == string_uri
@@ -275,32 +165,7 @@ def test_custom_file_io_locations(CustomFileIO: Type[FileIO], string_uri: str) -
assert output_file.location == string_uri
[email protected](
- "string_uri_w_netloc",
- ["file://localhost:80/foo/bar.parquet", "file://foo/bar.parquet"],
-)
-def test_raise_on_network_location_in_input_file(string_uri_w_netloc: str) -> None:
- """Test raising a ValueError when providing a network location to a LocalInputFile"""
- with pytest.raises(ValueError) as exc_info:
- LocalInputFile(location=string_uri_w_netloc)
-
- assert ("Network location is not allowed for LocalInputFile") in
str(exc_info.value)
-
-
[email protected](
- "string_uri_w_netloc",
- ["file://localhost:80/foo/bar.parquet", "file://foo/bar.parquet"],
-)
-def test_raise_on_network_location_in_output_file(string_uri_w_netloc: str) -> None:
- """Test raising a ValueError when providing a network location to a LocalOutputFile"""
- with pytest.raises(ValueError) as exc_info:
- LocalInputFile(location=string_uri_w_netloc)
-
- assert ("Network location is not allowed for LocalInputFile") in
str(exc_info.value)
-
-
[email protected]("CustomFileIO", [LocalFileIO, PyArrowFileIO])
-def test_deleting_local_file_using_file_io(CustomFileIO: Type[FileIO]) -> None:
+def test_deleting_local_file_using_file_io() -> None:
"""Test deleting a local file using FileIO.delete(...)"""
with tempfile.TemporaryDirectory() as tmpdirname:
# Write to the temporary file
@@ -309,7 +174,7 @@ def test_deleting_local_file_using_file_io(CustomFileIO: Type[FileIO]) -> None:
f.write(b"foo")
# Instantiate the file-io
- file_io = CustomFileIO()
+ file_io = PyArrowFileIO()
# Confirm that the file initially exists
assert os.path.exists(output_file_location)
@@ -321,15 +186,14 @@ def test_deleting_local_file_using_file_io(CustomFileIO: Type[FileIO]) -> None:
assert not os.path.exists(output_file_location)
[email protected]("CustomFileIO", [LocalFileIO, PyArrowFileIO])
-def test_raise_file_not_found_error_for_fileio_delete(CustomFileIO:
Type[FileIO]) -> None:
+def test_raise_file_not_found_error_for_fileio_delete() -> None:
"""Test raising a FileNotFound error when trying to delete a non-existent
file"""
with tempfile.TemporaryDirectory() as tmpdirname:
# Write to the temporary file
output_file_location = os.path.join(tmpdirname, "foo.txt")
# Instantiate the file-io
- file_io = CustomFileIO()
+ file_io = PyArrowFileIO()
# Delete the non-existent file using the file-io implementations delete method
with pytest.raises(FileNotFoundError) as exc_info:
@@ -341,8 +205,7 @@ def test_raise_file_not_found_error_for_fileio_delete(CustomFileIO: Type[FileIO]
assert not os.path.exists(output_file_location)
[email protected]("CustomFileIO", [LocalFileIO, PyArrowFileIO])
-def test_deleting_local_file_using_file_io_input_file(CustomFileIO:
Type[FileIO]) -> None:
+def test_deleting_local_file_using_file_io_input_file() -> None:
"""Test deleting a local file by passing an InputFile instance to
FileIO.delete(...)"""
with tempfile.TemporaryDirectory() as tmpdirname:
# Write to the temporary file
@@ -351,13 +214,13 @@ def test_deleting_local_file_using_file_io_input_file(CustomFileIO: Type[FileIO]
f.write(b"foo")
# Instantiate the file-io
- file_io = CustomFileIO()
+ file_io = PyArrowFileIO()
# Confirm that the file initially exists
assert os.path.exists(file_location)
# Instantiate the custom InputFile
- input_file = CustomFileIO().new_input(location=f"{file_location}")
+ input_file = PyArrowFileIO().new_input(location=f"{file_location}")
# Delete the file using the file-io implementations delete method
file_io.delete(input_file)
@@ -366,8 +229,7 @@ def test_deleting_local_file_using_file_io_input_file(CustomFileIO: Type[FileIO]
assert not os.path.exists(file_location)
[email protected]("CustomFileIO", [LocalFileIO, PyArrowFileIO])
-def test_deleting_local_file_using_file_io_output_file(CustomFileIO:
Type[FileIO]) -> None:
+def test_deleting_local_file_using_file_io_output_file() -> None:
"""Test deleting a local file by passing an OutputFile instance to
FileIO.delete(...)"""
with tempfile.TemporaryDirectory() as tmpdirname:
# Write to the temporary file
@@ -376,13 +238,13 @@ def test_deleting_local_file_using_file_io_output_file(CustomFileIO: Type[FileIO
f.write(b"foo")
# Instantiate the file-io
- file_io = CustomFileIO()
+ file_io = PyArrowFileIO()
# Confirm that the file initially exists
assert os.path.exists(file_location)
# Instantiate the custom OutputFile
- output_file = CustomFileIO().new_output(location=f"{file_location}")
+ output_file = PyArrowFileIO().new_output(location=f"{file_location}")
# Delete the file using the file-io implementations delete method
file_io.delete(output_file)
@@ -426,14 +288,12 @@ def test_load_file_io_location_no_schema() -> None:
assert isinstance(load_file_io({"location": "/no-schema/"}), PyArrowFileIO)
[email protected]("pyiceberg.io.SCHEMA_TO_FILE_IO", {"test":
["tests.io.test_io.LocalFileIO"]})
def test_mock_warehouse_location_file_io() -> None:
# For testing the selection logic
io = load_file_io({"warehouse": "test://some-path/"})
assert io.properties["warehouse"] == "test://some-path/"
[email protected]("pyiceberg.io.SCHEMA_TO_FILE_IO", {"test":
["tests.io.test_io.LocalFileIO"]})
def test_mock_table_location_file_io() -> None:
# For testing the selection logic
io = load_file_io({}, "test://some-path/")
diff --git a/python/tests/table/test_metadata.py b/python/tests/table/test_metadata.py
index 9f46ca6737..8c464c0e5a 100644
--- a/python/tests/table/test_metadata.py
+++ b/python/tests/table/test_metadata.py
@@ -14,6 +14,7 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
+# pylint: disable=redefined-outer-name
import io
import json
@@ -49,30 +50,33 @@ from pyiceberg.types import (
StructType,
)
-EXAMPLE_TABLE_METADATA_V1 = {
- "format-version": 1,
- "table-uuid": "d20125c8-7284-442c-9aea-15fee620737c",
- "location": "s3://bucket/test/location",
- "last-updated-ms": 1602638573874,
- "last-column-id": 3,
- "schema": {
- "type": "struct",
- "fields": [
- {"id": 1, "name": "x", "required": True, "type": "long"},
- {"id": 2, "name": "y", "required": True, "type": "long", "doc":
"comment"},
- {"id": 3, "name": "z", "required": True, "type": "long"},
- ],
- },
- "partition-spec": [{"name": "x", "transform": "identity", "source-id": 1,
"field-id": 1000}],
- "properties": {},
- "current-snapshot-id": -1,
- "snapshots": [{"snapshot-id": 1925, "timestamp-ms": 1602638573822}],
-}
+
[email protected](scope="session")
+def example_table_metadata_v1() -> Dict[str, Any]:
+ return {
+ "format-version": 1,
+ "table-uuid": "d20125c8-7284-442c-9aea-15fee620737c",
+ "location": "s3://bucket/test/location",
+ "last-updated-ms": 1602638573874,
+ "last-column-id": 3,
+ "schema": {
+ "type": "struct",
+ "fields": [
+ {"id": 1, "name": "x", "required": True, "type": "long"},
+ {"id": 2, "name": "y", "required": True, "type": "long",
"doc": "comment"},
+ {"id": 3, "name": "z", "required": True, "type": "long"},
+ ],
+ },
+ "partition-spec": [{"name": "x", "transform": "identity", "source-id":
1, "field-id": 1000}],
+ "properties": {},
+ "current-snapshot-id": -1,
+ "snapshots": [{"snapshot-id": 1925, "timestamp-ms": 1602638573822}],
+ }
-def test_from_dict_v1() -> None:
+def test_from_dict_v1(example_table_metadata_v1: Dict[str, Any]) -> None:
"""Test initialization of a TableMetadata instance from a dictionary"""
- TableMetadataUtil.parse_obj(EXAMPLE_TABLE_METADATA_V1)
+ TableMetadataUtil.parse_obj(example_table_metadata_v1)
def test_from_dict_v2(example_table_metadata_v2: Dict[str, Any]) -> None:
@@ -110,9 +114,9 @@ def test_v2_metadata_parsing(example_table_metadata_v2: Dict[str, Any]) -> None:
assert table_metadata.default_sort_order_id == 3
-def test_v1_metadata_parsing_directly() -> None:
+def test_v1_metadata_parsing_directly(example_table_metadata_v1: Dict[str, Any]) -> None:
"""Test retrieving values from a TableMetadata instance of version 1"""
- table_metadata = TableMetadataV1(**EXAMPLE_TABLE_METADATA_V1)
+ table_metadata = TableMetadataV1(**example_table_metadata_v1)
assert isinstance(table_metadata, TableMetadataV1)
@@ -174,8 +178,8 @@ def test_updating_metadata(example_table_metadata_v2: Dict[str, Any]) -> None:
assert table_metadata.schemas[-1] == Schema(**new_schema)
-def test_serialize_v1() -> None:
- table_metadata = TableMetadataV1(**EXAMPLE_TABLE_METADATA_V1)
+def test_serialize_v1(example_table_metadata_v1: Dict[str, Any]) -> None:
+ table_metadata = TableMetadataV1(**example_table_metadata_v1)
table_metadata_json = table_metadata.json()
expected = """{"location": "s3://bucket/test/location", "table-uuid":
"d20125c8-7284-442c-9aea-15fee620737c", "last-updated-ms": 1602638573874,
"last-column-id": 3, "schemas": [{"type": "struct", "fields": [{"id": 1,
"name": "x", "type": "long", "required": true}, {"id": 2, "name": "y", "type":
"long", "required": true, "doc": "comment"}, {"id": 3, "name": "z", "type":
"long", "required": true}], "schema-id": 0, "identifier-field-ids": []}],
"current-schema-id": 0, "partition-specs": [...]
assert table_metadata_json == expected
@@ -187,17 +191,17 @@ def test_serialize_v2(example_table_metadata_v2: Dict[str, Any]) -> None:
assert table_metadata == expected
-def test_migrate_v1_schemas() -> None:
- table_metadata = TableMetadataV1(**EXAMPLE_TABLE_METADATA_V1)
+def test_migrate_v1_schemas(example_table_metadata_v1: Dict[str, Any]) -> None:
+ table_metadata = TableMetadataV1(**example_table_metadata_v1)
assert isinstance(table_metadata, TableMetadataV1)
assert len(table_metadata.schemas) == 1
assert table_metadata.schemas[0] == table_metadata.schema_
-def test_migrate_v1_partition_specs() -> None:
+def test_migrate_v1_partition_specs(example_table_metadata_v1: Dict[str, Any]) -> None:
# Copy the example, and add a spec
- table_metadata = TableMetadataV1(**EXAMPLE_TABLE_METADATA_V1)
+ table_metadata = TableMetadataV1(**example_table_metadata_v1)
assert isinstance(table_metadata, TableMetadataV1)
assert len(table_metadata.partition_specs) == 1
# Spec ID gets added automatically
@@ -390,7 +394,7 @@ def test_invalid_partition_spec() -> None:
assert "default-spec-id 1 can't be found" in str(exc_info.value)
-def test_v1_writing_metadata() -> None:
+def test_v1_writing_metadata(example_table_metadata_v1: Dict[str, Any]) -> None:
"""
https://iceberg.apache.org/spec/#version-2
@@ -398,14 +402,14 @@ def test_v1_writing_metadata() -> None:
- Table metadata field last-sequence-number should not be written
"""
- table_metadata = TableMetadataV1(**EXAMPLE_TABLE_METADATA_V1)
+ table_metadata = TableMetadataV1(**example_table_metadata_v1)
metadata_v1_json = table_metadata.json()
metadata_v1 = json.loads(metadata_v1_json)
assert "last-sequence-number" not in metadata_v1
-def test_v1_metadata_for_v2() -> None:
+def test_v1_metadata_for_v2(example_table_metadata_v1: Dict[str, Any]) -> None:
"""
https://iceberg.apache.org/spec/#version-2
@@ -413,7 +417,7 @@ def test_v1_metadata_for_v2() -> None:
- Table metadata field last-sequence-number must default to 0
"""
- table_metadata = TableMetadataV1(**EXAMPLE_TABLE_METADATA_V1).to_v2()
+ table_metadata = TableMetadataV1(**example_table_metadata_v1).to_v2()
assert table_metadata.last_sequence_number == 0
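The test_metadata.py changes above turn the module-level EXAMPLE_TABLE_METADATA_V1 constant into a pytest fixture, so tests receive the dict by naming a parameter after the fixture instead of importing it from another test module (which is what forced test_base.py to inline its own copy). A minimal sketch of the mechanism (illustrative names):

    import pytest

    @pytest.fixture(scope="session")
    def example_value() -> dict:
        # Built once per test session, like example_table_metadata_v1
        return {"format-version": 1}

    def test_uses_fixture(example_value: dict) -> None:
        assert example_value["format-version"] == 1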
diff --git a/python/tests/utils/test_manifest.py b/python/tests/utils/test_manifest.py
index 25ba4730dc..91c111abc1 100644
--- a/python/tests/utils/test_manifest.py
+++ b/python/tests/utils/test_manifest.py
@@ -16,6 +16,7 @@
# under the License.
from pyiceberg.io import load_file_io
+from pyiceberg.io.pyarrow import PyArrowFileIO
from pyiceberg.manifest import (
DataFile,
DataFileContent,
@@ -30,11 +31,10 @@ from pyiceberg.manifest import (
)
from pyiceberg.table import Snapshot
from pyiceberg.table.snapshots import Operation, Summary
-from tests.io.test_io import LocalInputFile
def test_read_manifest_entry(generated_manifest_entry_file: str) -> None:
- input_file = LocalInputFile(generated_manifest_entry_file)
+ input_file = PyArrowFileIO().new_input(location=generated_manifest_entry_file)
assert list(read_manifest_entry(input_file)) == [
ManifestEntry(
status=1,
@@ -268,7 +268,7 @@ def test_read_manifest_entry(generated_manifest_entry_file: str) -> None:
def test_read_manifest_list(generated_manifest_file_file: str) -> None:
- input_file = LocalInputFile(generated_manifest_file_file)
+ input_file = PyArrowFileIO().new_input(generated_manifest_file_file)
actual = list(read_manifest_list(input_file))
expected = [
ManifestFile(