Spark 1.5 is the latest version I have access to, and it's where this
problem happens. I don't see it fixed in master either, but I might be
wrong; diff attached.
https://raw.githubusercontent.com/apache/spark/branch-1.5/python/pyspark/sql/types.py
https://raw.githubusercontent.com/apache/spark/d57daf1f7732a7ac54a91fe112deeda0a254f9ef/python/pyspark/sql/types.py
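
If I'm reading the new parser in the diff right, master still fails the
same way. A quick untested check (importing the parser by its private
name from master's types.py):

    >>> from pyspark.sql.types import _parse_datatype_string
    >>> _parse_datatype_string("int")     # special-cased, works
    IntegerType
    >>> _parse_datatype_string("bigint")  # still no alias to LongType
    Traceback (most recent call last):
      ...
    ValueError: Could not parse datatype: bigint
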
--
Ruslan Dautkhanov
On Wed, Mar 16, 2016 at 4:44 PM, Reynold Xin <[email protected]> wrote:
> We probably should have the alias. Is this still a problem on the master
> branch?
>
> On Wed, Mar 16, 2016 at 9:40 AM, Ruslan Dautkhanov <[email protected]>
> wrote:
>
>> Running the following:
>>
>>> # fix schema for gaid, which should not be Double
>>> from pyspark.sql.types import *
>>> customSchema = StructType()
>>> for (col, typ) in tsp_orig.dtypes:
>>>     if col == 'Agility_GAID':
>>>         typ = 'string'
>>>     customSchema.add(col, typ, True)
>>
>>
>> Getting
>>
>> ValueError: Could not parse datatype: bigint
>>
>>
>> Looks like pyspark.sql.types doesn't know anything about bigint.
>> Should it be aliased to LongType in pyspark.sql.types?
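>>
>> For now I'm working around it with something like this untested sketch
>> (the alias dict is my own, not an existing API; it assumes bigint and
>> int are the only simpleString names that fail to parse):
>>
>>> # map Hive-style simpleString names to the typeName() spellings
>>> # that pyspark.sql.types knows (hypothetical helper)
>>> TYPE_ALIASES = {'bigint': 'long', 'int': 'integer'}
>>> for (col, typ) in tsp_orig.dtypes:
>>>     if col == 'Agility_GAID':
>>>         typ = 'string'
>>>     customSchema.add(col, TYPE_ALIASES.get(typ, typ), True)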
>>
>> Thanks
>>
>>
>> On Wed, Mar 16, 2016 at 10:18 AM, Ruslan Dautkhanov <[email protected]> wrote:
>>
>>> Hello,
>>>
>>> Looking at
>>>
>>> https://spark.apache.org/docs/1.5.1/api/python/_modules/pyspark/sql/types.html
>>>
>>> and I can't wrap my head around how to convert string data type names
>>> to actual pyspark.sql.types data types.
>>>
>>> Does pyspark.sql.types have an interface that returns StringType() for
>>> "string", IntegerType() for "integer", etc.? If it doesn't exist, it
>>> would be great to have such a mapping function.
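>>>
>>> Roughly, a minimal sketch of what I mean (the names here are
>>> illustrative, not an existing API):
>>>
>>>> from pyspark.sql.types import StringType, IntegerType, LongType, DoubleType
>>>>
>>>> # hypothetical name -> type mapping, covering a few atomic types
>>>> _NAME_TO_TYPE = {
>>>>     'string': StringType,
>>>>     'integer': IntegerType,
>>>>     'long': LongType,
>>>>     'double': DoubleType,
>>>> }
>>>>
>>>> def type_for_name(name):
>>>>     # e.g. type_for_name('string') -> StringType()
>>>>     return _NAME_TO_TYPE[name]()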
>>>
>>> Thank you.
>>>
>>>
>>> PS: I have a data frame and use its dtypes to loop through all columns
>>> to fix a few columns' data types, as a workaround for SPARK-13866.
>>>
>>>
>>> --
>>> Ruslan Dautkhanov
>>>
>>
>>
>
683a684,806
> _FIXED_DECIMAL = re.compile("decimal\\(\\s*(\\d+)\\s*,\\s*(\\d+)\\s*\\)")
>
>
> _BRACKETS = {'(': ')', '[': ']', '{': '}'}
>
>
> def _parse_basic_datatype_string(s):
>     if s in _all_atomic_types.keys():
>         return _all_atomic_types[s]()
>     elif s == "int":
>         return IntegerType()
>     elif _FIXED_DECIMAL.match(s):
>         m = _FIXED_DECIMAL.match(s)
>         return DecimalType(int(m.group(1)), int(m.group(2)))
>     else:
>         raise ValueError("Could not parse datatype: %s" % s)
>
>
> def _ignore_brackets_split(s, separator):
>     """
>     Splits the given string by given separator, but ignore separators inside brackets pairs, e.g.
>     given "a,b" and separator ",", it will return ["a", "b"], but given "a<b,c>, d", it will return
>     ["a<b,c>", "d"].
>     """
>     parts = []
>     buf = ""
>     level = 0
>     for c in s:
>         if c in _BRACKETS.keys():
>             level += 1
>             buf += c
>         elif c in _BRACKETS.values():
>             if level == 0:
>                 raise ValueError("Brackets are not correctly paired: %s" % s)
>             level -= 1
>             buf += c
>         elif c == separator and level > 0:
>             buf += c
>         elif c == separator:
>             parts.append(buf)
>             buf = ""
>         else:
>             buf += c
>
>     if len(buf) == 0:
>         raise ValueError("The %s cannot be the last char: %s" % (separator, s))
>     parts.append(buf)
>     return parts
>
>
> def _parse_struct_fields_string(s):
>     parts = _ignore_brackets_split(s, ",")
>     fields = []
>     for part in parts:
>         name_and_type = _ignore_brackets_split(part, ":")
>         if len(name_and_type) != 2:
> raise ValueError("The strcut field string format is:
> 'field_name:field_type', " +
> "but got: %s" % part)
>         field_name = name_and_type[0].strip()
>         field_type = _parse_datatype_string(name_and_type[1])
>         fields.append(StructField(field_name, field_type))
>     return StructType(fields)
>
>
> def _parse_datatype_string(s):
>     """
>     Parses the given data type string to a :class:`DataType`. The data type string format equals
>     to `DataType.simpleString`, except that top level struct type can omit
>     the `struct<>` and atomic types use `typeName()` as their format, e.g. use `byte` instead of
>     `tinyint` for ByteType. We can also use `int` as a short name for IntegerType.
>
>     >>> _parse_datatype_string("int ")
>     IntegerType
>     >>> _parse_datatype_string("a: byte, b: decimal( 16 , 8 ) ")
>     StructType(List(StructField(a,ByteType,true),StructField(b,DecimalType(16,8),true)))
>     >>> _parse_datatype_string("a: array< short>")
>     StructType(List(StructField(a,ArrayType(ShortType,true),true)))
>     >>> _parse_datatype_string(" map<string , string > ")
>     MapType(StringType,StringType,true)
>
>     >>> # Error cases
>     >>> _parse_datatype_string("blabla") # doctest: +IGNORE_EXCEPTION_DETAIL
>     Traceback (most recent call last):
>         ...
>     ValueError:...
>     >>> _parse_datatype_string("a: int,") # doctest: +IGNORE_EXCEPTION_DETAIL
>     Traceback (most recent call last):
>         ...
>     ValueError:...
>     >>> _parse_datatype_string("array<int") # doctest: +IGNORE_EXCEPTION_DETAIL
>     Traceback (most recent call last):
>         ...
>     ValueError:...
>     >>> _parse_datatype_string("map<int, boolean>>") # doctest: +IGNORE_EXCEPTION_DETAIL
>     Traceback (most recent call last):
>         ...
>     ValueError:...
>     """
>     s = s.strip()
>     if s.startswith("array<"):
>         if s[-1] != ">":
>             raise ValueError("'>' should be the last char, but got: %s" % s)
>         return ArrayType(_parse_datatype_string(s[6:-1]))
>     elif s.startswith("map<"):
>         if s[-1] != ">":
>             raise ValueError("'>' should be the last char, but got: %s" % s)
>         parts = _ignore_brackets_split(s[4:-1], ",")
>         if len(parts) != 2:
>             raise ValueError("The map type string format is: 'map<key_type,value_type>', " +
>                              "but got: %s" % s)
>         kt = _parse_datatype_string(parts[0])
>         vt = _parse_datatype_string(parts[1])
>         return MapType(kt, vt)
>     elif s.startswith("struct<"):
>         if s[-1] != ">":
>             raise ValueError("'>' should be the last char, but got: %s" % s)
>         return _parse_struct_fields_string(s[7:-1])
>     elif ":" in s:
>         return _parse_struct_fields_string(s)
>     else:
>         return _parse_basic_datatype_string(s)
>
>
733,735d855
< _FIXED_DECIMAL = re.compile("decimal\\((\\d+),(\\d+)\\)")
<
<
943,945d1062
< _BRACKETS = {'(': ')', '[': ']', '{': '}'}
<
<
1094c1211
< def _verify_type(obj, dataType):
---
> def _verify_type(obj, dataType, nullable=True):
1096,1097c1213,1217
<     Verify the type of obj against dataType, raise an exception if
<     they do not match.
---
>     Verify the type of obj against dataType, raise a TypeError if they do not match.
>
>     Also verify the value of obj against datatype, raise a ValueError if it's not within the allowed
>     range, e.g. using 128 as ByteType will overflow. Note that, Python float is not checked, so it
>     will become infinity when cast to Java float if it overflows.
1113a1234,1256
>     >>> # Check if numeric values are within the allowed range.
>     >>> _verify_type(12, ByteType())
>     >>> _verify_type(1234, ByteType()) # doctest: +IGNORE_EXCEPTION_DETAIL
>     Traceback (most recent call last):
>         ...
>     ValueError:...
>     >>> _verify_type(None, ByteType(), False) # doctest: +IGNORE_EXCEPTION_DETAIL
>     Traceback (most recent call last):
>         ...
>     ValueError:...
>     >>> _verify_type([1, None], ArrayType(ShortType(), False)) # doctest: +IGNORE_EXCEPTION_DETAIL
>     Traceback (most recent call last):
>         ...
>     ValueError:...
>     >>> _verify_type({None: 1}, MapType(StringType(), IntegerType()))
>     Traceback (most recent call last):
>         ...
>     ValueError:...
>     >>> schema = StructType().add("a", IntegerType()).add("b", StringType(), False)
>     >>> _verify_type((1, None), schema) # doctest: +IGNORE_EXCEPTION_DETAIL
>     Traceback (most recent call last):
>         ...
>     ValueError:...
1115d1257
<     # all objects are nullable
1117c1259,1262
<         return
---
>         if nullable:
>             return
>         else:
>             raise ValueError("This field is not nullable, but got None")
1130c1275
<     assert _type in _acceptable_types, "unknown datatype: %s" % dataType
---
>     assert _type in _acceptable_types, "unknown datatype: %s for object %r" % (dataType, obj)
1134c1279
<             raise TypeError("StructType can not accept object in type %s" % type(obj))
---
>             raise TypeError("StructType can not accept object %r in type %s" % (obj, type(obj)))
1138c1283
<             raise TypeError("%s can not accept object in type %s" % (dataType, type(obj)))
---
>             raise TypeError("%s can not accept object %r in type %s" % (dataType, obj, type(obj)))
1140c1285,1297
<     if isinstance(dataType, ArrayType):
---
>     if isinstance(dataType, ByteType):
>         if obj < -128 or obj > 127:
>             raise ValueError("object of ByteType out of range, got: %s" % obj)
>
>     elif isinstance(dataType, ShortType):
>         if obj < -32768 or obj > 32767:
>             raise ValueError("object of ShortType out of range, got: %s" % obj)
>
>     elif isinstance(dataType, IntegerType):
>         if obj < -2147483648 or obj > 2147483647:
>             raise ValueError("object of IntegerType out of range, got: %s" % obj)
>
>     elif isinstance(dataType, ArrayType):
1142c1299
<             _verify_type(i, dataType.elementType)
---
>             _verify_type(i, dataType.elementType, dataType.containsNull)
1146,1147c1303,1304
<             _verify_type(k, dataType.keyType)
<             _verify_type(v, dataType.valueType)
---
>             _verify_type(k, dataType.keyType, False)
>             _verify_type(v, dataType.valueType, dataType.valueContainsNull)
1154c1311
<             _verify_type(v, f.dataType)
---
>             _verify_type(v, f.dataType, f.nullable)
1178a1336,1337
>     >>> row['name'], row['age']
>     ('Alice', 11)
1245a1405,1417
>     def __getitem__(self, item):
>         if isinstance(item, (int, slice)):
>             return super(Row, self).__getitem__(item)
>         try:
>             # it will be slow when it has many fields,
>             # but this will not be used in normal cases
>             idx = self.__fields__.index(item)
>             return super(Row, self).__getitem__(idx)
>         except IndexError:
>             raise KeyError(item)
>         except ValueError:
>             raise ValueError(item)
>
1295,1296c1467,1471
<         return Timestamp(int(time.mktime(obj.timetuple())) * 1000 + obj.microsecond // 1000)
<
---
>         seconds = (calendar.timegm(obj.utctimetuple()) if obj.tzinfo
>                    else time.mktime(obj.timetuple()))
>         t = Timestamp(int(seconds) * 1000)
>         t.setNanos(obj.microsecond * 1000)
>         return t