ueshin commented on a change in pull request #34174:
URL: https://github.com/apache/spark/pull/34174#discussion_r728513483
##########
File path: python/pyspark/sql/types.py
##########
@@ -786,22 +833,22 @@ def fromJson(cls, json):
        UDT = getattr(m, pyClass)
        return UDT()

-    def __eq__(self, other):
+    def __eq__(self, other: Any) -> bool:
        return type(self) == type(other)

_atomic_types = [StringType, BinaryType, BooleanType, DecimalType, FloatType, DoubleType,
                 ByteType, ShortType, IntegerType, LongType, DateType, TimestampType,
                 TimestampNTZType, NullType]

-_all_atomic_types = dict((t.typeName(), t) for t in _atomic_types)
-_all_complex_types = dict((v.typeName(), v)
+_all_atomic_types = dict((t.typeName(), t) for t in _atomic_types)  # type: ignore[attr-defined]
+_all_complex_types = dict((v.typeName(), v)  # type: ignore[attr-defined]
                          for v in [ArrayType, MapType, StructType])
Review comment:
How about:
```py
_atomic_types: List[Type[DataType]] = ...
_all_atomic_types: Dict[str, Type[DataType]] = dict((t.typeName(), t) for t in _atomic_types)
_complex_types: List[Type[DataType]] = [ArrayType, MapType, StructType]
_all_complex_types: Dict[str, Type[DataType]] = dict((v.typeName(), v) for v in _complex_types)
```
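For reference, spelled out fully, a sketch of how this could look inside `types.py` (the list contents are the ones from the hunk above):
```py
from typing import Dict, List, Type

_atomic_types: List[Type[DataType]] = [
    StringType, BinaryType, BooleanType, DecimalType, FloatType, DoubleType,
    ByteType, ShortType, IntegerType, LongType, DateType, TimestampType,
    TimestampNTZType, NullType,
]
# With the element type declared as Type[DataType], mypy can resolve the
# typeName() classmethod, so the attr-defined suppressions should become
# unnecessary.
_all_atomic_types: Dict[str, Type[DataType]] = dict((t.typeName(), t) for t in _atomic_types)

_complex_types: List[Type[DataType]] = [ArrayType, MapType, StructType]
_all_complex_types: Dict[str, Type[DataType]] = dict((v.typeName(), v) for v in _complex_types)
```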
##########
File path: python/pyspark/sql/types.py
##########
@@ -1029,10 +1082,14 @@ def _int_size_to_type(size):
# Type code 'u' in Python's array is deprecated since version 3.3, and will be
# removed in version 4.0. See: https://docs.python.org/3/library/array.html
if sys.version_info[0] < 4:
-    _array_type_mappings['u'] = StringType
+    _array_type_mappings['u'] = StringType  # type: ignore[assignment]
Review comment:
Do we still need this?
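If the error comes from mypy inferring a narrower value type for `_array_type_mappings` from its numeric entries, an explicit annotation might be the cleaner fix; a minimal sketch (the annotation is an assumption on my side, not something in this diff):
```py
from typing import Dict, Type

# Assumed annotation: with the value type widened to the DataType base class,
# the StringType entry for 'u' should type-check without ignore[assignment].
_array_type_mappings: Dict[str, Type[DataType]] = {}
_array_type_mappings['u'] = StringType
```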
##########
File path: python/pyspark/sql/types.py
##########
@@ -1083,22 +1140,30 @@ def _infer_type(obj, infer_dict_as_struct=False, prefer_timestamp_ntz=False):
    raise TypeError("not supported type: %s" % type(obj))

-def _infer_schema(row, names=None, infer_dict_as_struct=False, prefer_timestamp_ntz=False):
+def _infer_schema(
+    row: Any,
+    names: Optional[List[str]] = None,
+    infer_dict_as_struct: bool = False,
+    prefer_timestamp_ntz: bool = False,
+) -> StructType:
    """Infer the schema from dict/namedtuple/object"""
+    items: Union[zip[Tuple[Any, Any]], List[Tuple[Any, Any]]]
    if isinstance(row, dict):
        items = sorted(row.items())
    elif isinstance(row, (tuple, list)):
        if hasattr(row, "__fields__"):  # Row
-            items = zip(row.__fields__, tuple(row))
+            items = zip(row.__fields__, tuple(row))  # type: ignore[union-attr]
        elif hasattr(row, "_fields"):  # namedtuple
-            items = zip(row._fields, tuple(row))
+            items = zip(row._fields, tuple(row))  # type: ignore[union-attr]
        else:
            if names is None:
-                names = ['_%d' % i for i in range(1, len(row) + 1)]
+                names = [
+                    '_%d' % i for i in range(1, len(row) + 1)]  # type: ignore[no-redef, assignment]
            elif len(names) < len(row):
-                names.extend('_%d' % i for i in range(len(names) + 1, len(row) + 1))
-            items = zip(names, row)
+                names.extend(  # type: ignore[attr-defined]
+                    '_%d' % i for i in range(len(names) + 1, len(row) + 1))
+            items = zip(names, row)  # type: ignore[arg-type, assignment]
Review comment:
ditto.
##########
File path: python/pyspark/sql/types.py
##########
@@ -1083,22 +1140,30 @@ def _infer_type(obj, infer_dict_as_struct=False, prefer_timestamp_ntz=False):
    raise TypeError("not supported type: %s" % type(obj))

-def _infer_schema(row, names=None, infer_dict_as_struct=False, prefer_timestamp_ntz=False):
+def _infer_schema(
+    row: Any,
+    names: Optional[List[str]] = None,
+    infer_dict_as_struct: bool = False,
+    prefer_timestamp_ntz: bool = False,
+) -> StructType:
    """Infer the schema from dict/namedtuple/object"""
+    items: Union[zip[Tuple[Any, Any]], List[Tuple[Any, Any]]]
    if isinstance(row, dict):
        items = sorted(row.items())
    elif isinstance(row, (tuple, list)):
        if hasattr(row, "__fields__"):  # Row
-            items = zip(row.__fields__, tuple(row))
+            items = zip(row.__fields__, tuple(row))  # type: ignore[union-attr]
        elif hasattr(row, "_fields"):  # namedtuple
-            items = zip(row._fields, tuple(row))
+            items = zip(row._fields, tuple(row))  # type: ignore[union-attr]
        else:
            if names is None:
-                names = ['_%d' % i for i in range(1, len(row) + 1)]
+                names = [
+                    '_%d' % i for i in range(1, len(row) + 1)]  # type: ignore[no-redef, assignment]
            elif len(names) < len(row):
-                names.extend('_%d' % i for i in range(len(names) + 1, len(row) + 1))
-            items = zip(names, row)
+                names.extend(  # type: ignore[attr-defined]
Review comment:
ditto.
##########
File path: python/pyspark/sql/types.py
##########
@@ -1083,22 +1140,30 @@ def _infer_type(obj, infer_dict_as_struct=False, prefer_timestamp_ntz=False):
    raise TypeError("not supported type: %s" % type(obj))

-def _infer_schema(row, names=None, infer_dict_as_struct=False, prefer_timestamp_ntz=False):
+def _infer_schema(
+    row: Any,
+    names: Optional[List[str]] = None,
+    infer_dict_as_struct: bool = False,
+    prefer_timestamp_ntz: bool = False,
+) -> StructType:
    """Infer the schema from dict/namedtuple/object"""
+    items: Union[zip[Tuple[Any, Any]], List[Tuple[Any, Any]]]
    if isinstance(row, dict):
        items = sorted(row.items())
    elif isinstance(row, (tuple, list)):
        if hasattr(row, "__fields__"):  # Row
-            items = zip(row.__fields__, tuple(row))
+            items = zip(row.__fields__, tuple(row))  # type: ignore[union-attr]
        elif hasattr(row, "_fields"):  # namedtuple
-            items = zip(row._fields, tuple(row))
+            items = zip(row._fields, tuple(row))  # type: ignore[union-attr]
        else:
            if names is None:
-                names = ['_%d' % i for i in range(1, len(row) + 1)]
+                names = [
+                    '_%d' % i for i in range(1, len(row) + 1)]  # type: ignore[no-redef, assignment]
Review comment:
Do we need the `ignore`?
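As far as I can tell, mypy narrows an `Optional[List[str]]` parameter after the `None` check, so both suppressions look removable; a standalone sketch with a hypothetical helper mirroring this branch:
```py
from typing import List, Optional

def _make_names(names: Optional[List[str]] = None, width: int = 3) -> List[str]:
    if names is None:
        # names is narrowed from Optional[List[str]] to List[str] here,
        # so no ignore[no-redef, assignment] should be required.
        names = ['_%d' % i for i in range(1, width + 1)]
    elif len(names) < width:
        # names is also List[str] in this branch, so .extend() resolves.
        names.extend('_%d' % i for i in range(len(names) + 1, width + 1))
    return names
```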
##########
File path: python/pyspark/sql/types.py
##########
@@ -1083,22 +1140,30 @@ def _infer_type(obj, infer_dict_as_struct=False, prefer_timestamp_ntz=False):
    raise TypeError("not supported type: %s" % type(obj))

-def _infer_schema(row, names=None, infer_dict_as_struct=False, prefer_timestamp_ntz=False):
+def _infer_schema(
+    row: Any,
+    names: Optional[List[str]] = None,
+    infer_dict_as_struct: bool = False,
+    prefer_timestamp_ntz: bool = False,
+) -> StructType:
    """Infer the schema from dict/namedtuple/object"""
+    items: Union[zip[Tuple[Any, Any]], List[Tuple[Any, Any]]]
Review comment:
How about `items: Iterable[Tuple[str, Any]]`?
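Both branches produce something that satisfies it, e.g. (a quick sketch):
```py
from typing import Any, Iterable, Tuple

items: Iterable[Tuple[str, Any]]
items = sorted({'b': 2, 'a': 1}.items())  # dict branch: List[Tuple[str, int]]
items = zip(['_1', '_2'], (10, 20))       # Row/namedtuple/list branches: a zip object
```
It is also simpler than the `Union` with a subscripted `zip`.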
##########
File path: python/pyspark/sql/types.py
##########
@@ -1128,7 +1193,7 @@ def _has_nulltype(dt):
    return isinstance(dt, NullType)

-def _merge_type(a, b, name=None):
+def _merge_type(a: DataType, b: DataType, name: Optional[str] = None) -> DataType:
Review comment:
Shall we define overloads?
```py
@overload
def _merge_type(a: StructType, b: StructType, name: Optional[str] = ...) -> StructType:
    ...
@overload
def _merge_type(a: ArrayType, b: ArrayType, name: Optional[str] = ...) -> ArrayType:
    ...
@overload
def _merge_type(a: MapType, b: MapType, name: Optional[str] = ...) -> MapType:
    ...
@overload
def _merge_type(a: DataType, b: DataType, name: Optional[str] = ...) -> DataType:
    ...
```
Then we can remove the changes in `session.py`.
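If we go this way, the overloads just sit in front of the one real implementation, whose signature is the one already in this diff; a sketch:
```py
from typing import Optional, overload

@overload
def _merge_type(a: StructType, b: StructType, name: Optional[str] = ...) -> StructType:
    ...
# ... the other overloads as above ...

# The non-overloaded def keeps the implementation; mypy matches call sites
# against the overloads, so _merge_type(struct_a, struct_b) is a StructType.
def _merge_type(a: DataType, b: DataType, name: Optional[str] = None) -> DataType:
    ...
```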
##########
File path: python/pyspark/sql/types.py
##########
@@ -1161,18 +1226,21 @@ def _merge_type(a, b, name=None):
        return StructType(fields)
    elif isinstance(a, ArrayType):
-        return ArrayType(_merge_type(a.elementType, b.elementType,
+        return ArrayType(_merge_type(cast(ArrayType, a).elementType,
+                                     cast(ArrayType, b).elementType,
Review comment:
nit: we don't need `cast` for `a` here.
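The `isinstance(a, ArrayType)` check already narrows `a`; a minimal sketch with a hypothetical wrapper, just to illustrate:
```py
from typing import cast

def _merge_arrays(a: DataType, b: DataType) -> DataType:
    if isinstance(a, ArrayType):
        # a is narrowed to ArrayType by the isinstance check, so
        # a.elementType type-checks directly; b is still DataType
        # and keeps its cast.
        return ArrayType(_merge_type(a.elementType,
                                     cast(ArrayType, b).elementType), True)
    return a
```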
##########
File path: python/pyspark/sql/types.py
##########
@@ -1161,18 +1226,21 @@ def _merge_type(a, b, name=None):
        return StructType(fields)
    elif isinstance(a, ArrayType):
-        return ArrayType(_merge_type(a.elementType, b.elementType,
+        return ArrayType(_merge_type(cast(ArrayType, a).elementType,
+                                     cast(ArrayType, b).elementType,
                                     name='element in array %s' % name), True)
    elif isinstance(a, MapType):
-        return MapType(_merge_type(a.keyType, b.keyType, name='key of map %s' % name),
-                       _merge_type(a.valueType, b.valueType, name='value of map %s' % name),
-                       True)
+        return MapType(
+            _merge_type(cast(MapType, a).keyType, cast(MapType, b).keyType,
+                        name='key of map %s' % name),
+            _merge_type(cast(MapType, a).valueType, cast(MapType, b).valueType,
+                        name='value of map %s' % name),
Review comment:
ditto.