Spark 1.5 is the latest version I have access to, and it's where this
problem happens. I don't see it fixed in master either, but I might be
wrong; diff attached.
https://raw.githubusercontent.com/apache/spark/branch-1.5/python/pyspark/sql/types.py
https://raw.githubusercontent.com/apache/spark/d57daf1f7732a7ac54a91fe112deeda0a254f9ef/python/pyspark/sql/types.py
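
If I'm reading the new parser in the diff right, master still fails the
same way. A quick untested check (importing the parser by its private
name from master's types.py):

    >>> from pyspark.sql.types import _parse_datatype_string
    >>> _parse_datatype_string("int")     # special-cased, works
    IntegerType
    >>> _parse_datatype_string("bigint")  # still no alias to LongType
    Traceback (most recent call last):
      ...
    ValueError: Could not parse datatype: bigint
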
--
Ruslan Dautkhanov
On Wed, Mar 16, 2016 at 4:44 PM, Reynold Xin <[email protected]> wrote:
> We probably should have the alias. Is this still a problem on the master
> branch?
>
> On Wed, Mar 16, 2016 at 9:40 AM, Ruslan Dautkhanov <[email protected]>
> wrote:
>
>> Running the following:
>>
>>> # fix schema for gaid, which should not be Double
>>> from pyspark.sql.types import *
>>> customSchema = StructType()
>>> for (col, typ) in tsp_orig.dtypes:
>>>     if col == 'Agility_GAID':
>>>         typ = 'string'
>>>     customSchema.add(col, typ, True)
>>
>>
>> Getting
>>
>> ValueError: Could not parse datatype: bigint
>>
>>
>> Looks like pyspark.sql.types doesn't know anything about bigint.
>> Should it be aliased to LongType in pyspark.sql.types?
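>>
>> For now I'm working around it with something like this untested sketch
>> (the alias dict is my own, not an existing API; it assumes bigint and
>> int are the only simpleString names that fail to parse):
>>
>>> # map Hive-style simpleString names to the typeName() spellings
>>> # that pyspark.sql.types knows (hypothetical helper)
>>> TYPE_ALIASES = {'bigint': 'long', 'int': 'integer'}
>>> for (col, typ) in tsp_orig.dtypes:
>>>     if col == 'Agility_GAID':
>>>         typ = 'string'
>>>     customSchema.add(col, TYPE_ALIASES.get(typ, typ), True)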
>>
>> Thanks
>>
>>
>> On Wed, Mar 16, 2016 at 10:18 AM, Ruslan Dautkhanov <[email protected]> wrote:
>>
>>> Hello,
>>>
>>> Looking at
>>>
>>> https://spark.apache.org/docs/1.5.1/api/python/_modules/pyspark/sql/types.html
>>>
>>> and I can't wrap my head around how to convert string data type names
>>> to actual pyspark.sql.types data types.
>>>
>>> Does pyspark.sql.types have an interface that returns StringType() for
>>> "string", IntegerType() for "integer", etc.? If it doesn't exist, it
>>> would be great to have such a mapping function.
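>>>
>>> Roughly, a minimal sketch of what I mean (the names here are
>>> illustrative, not an existing API):
>>>
>>>> from pyspark.sql.types import StringType, IntegerType, LongType, DoubleType
>>>>
>>>> # hypothetical name -> type mapping, covering a few atomic types
>>>> _NAME_TO_TYPE = {
>>>>     'string': StringType,
>>>>     'integer': IntegerType,
>>>>     'long': LongType,
>>>>     'double': DoubleType,
>>>> }
>>>>
>>>> def type_for_name(name):
>>>>     # e.g. type_for_name('string') -> StringType()
>>>>     return _NAME_TO_TYPE[name]()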
>>>
>>> Thank you.
>>>
>>>
>>> PS: I have a data frame and use its dtypes to loop through all columns
>>> to fix a few columns' data types, as a workaround for SPARK-13866.
>>>
>>>
>>> --
>>> Ruslan Dautkhanov
>>>
>>
>>
>
683a684,806
> _FIXED_DECIMAL = re.compile("decimal\\(\\s*(\\d+)\\s*,\\s*(\\d+)\\s*\\)")
>
>
> _BRACKETS = {'(': ')', '[': ']', '{': '}'}
>
>
> def _parse_basic_datatype_string(s):
>     if s in _all_atomic_types.keys():
>         return _all_atomic_types[s]()
>     elif s == "int":
>         return IntegerType()
>     elif _FIXED_DECIMAL.match(s):
>         m = _FIXED_DECIMAL.match(s)
>         return DecimalType(int(m.group(1)), int(m.group(2)))
>     else:
>         raise ValueError("Could not parse datatype: %s" % s)
>
>
> def _ignore_brackets_split(s, separator):
>     """
>     Splits the given string by given separator, but ignore separators inside brackets pairs, e.g.
>     given "a,b" and separator ",", it will return ["a", "b"], but given "a<b,c>, d", it will return
>     ["a<b,c>", "d"].
>     """
>     parts = []
>     buf = ""
>     level = 0
>     for c in s:
>         if c in _BRACKETS.keys():
>             level += 1
>             buf += c
>         elif c in _BRACKETS.values():
>             if level == 0:
>                 raise ValueError("Brackets are not correctly paired: %s" % s)
>             level -= 1
>             buf += c
>         elif c == separator and level > 0:
>             buf += c
>         elif c == separator:
>             parts.append(buf)
>             buf = ""
>         else:
>             buf += c
>
>     if len(buf) == 0:
>         raise ValueError("The %s cannot be the last char: %s" % (separator, s))
>     parts.append(buf)
>     return parts
>
>
> def _parse_struct_fields_string(s):
>     parts = _ignore_brackets_split(s, ",")
>     fields = []
>     for part in parts:
>         name_and_type = _ignore_brackets_split(part, ":")
>         if len(name_and_type) != 2:
> raise ValueError("The strcut field string format is:
> 'field_name:field_type', " +
> "but got: %s" % part)
>         field_name = name_and_type[0].strip()
>         field_type = _parse_datatype_string(name_and_type[1])
>         fields.append(StructField(field_name, field_type))
>     return StructType(fields)
>
>
> def _parse_datatype_string(s):
>     """
>     Parses the given data type string to a :class:`DataType`. The data type string format equals
>     to `DataType.simpleString`, except that top level struct type can omit
>     the `struct<>` and atomic types use `typeName()` as their format, e.g. use `byte` instead of
>     `tinyint` for ByteType. We can also use `int` as a short name for IntegerType.
>
>     >>> _parse_datatype_string("int ")
>     IntegerType
>     >>> _parse_datatype_string("a: byte, b: decimal( 16 , 8 ) ")
>     StructType(List(StructField(a,ByteType,true),StructField(b,DecimalType(16,8),true)))
>     >>> _parse_datatype_string("a: array< short>")
>     StructType(List(StructField(a,ArrayType(ShortType,true),true)))
>     >>> _parse_datatype_string(" map<string , string > ")
>     MapType(StringType,StringType,true)
>
>     >>> # Error cases
>     >>> _parse_datatype_string("blabla") # doctest: +IGNORE_EXCEPTION_DETAIL
>     Traceback (most recent call last):
>         ...
>     ValueError:...
>     >>> _parse_datatype_string("a: int,") # doctest: +IGNORE_EXCEPTION_DETAIL
>     Traceback (most recent call last):
>         ...
>     ValueError:...
>     >>> _parse_datatype_string("array<int") # doctest: +IGNORE_EXCEPTION_DETAIL
>     Traceback (most recent call last):
>         ...
>     ValueError:...
>     >>> _parse_datatype_string("map<int, boolean>>") # doctest: +IGNORE_EXCEPTION_DETAIL
>     Traceback (most recent call last):
>         ...
>     ValueError:...
>     """
>     s = s.strip()
>     if s.startswith("array<"):
>         if s[-1] != ">":
>             raise ValueError("'>' should be the last char, but got: %s" % s)
>         return ArrayType(_parse_datatype_string(s[6:-1]))
>     elif s.startswith("map<"):
>         if s[-1] != ">":
>             raise ValueError("'>' should be the last char, but got: %s" % s)
>         parts = _ignore_brackets_split(s[4:-1], ",")
>         if len(parts) != 2:
>             raise ValueError("The map type string format is: 'map<key_type,value_type>', " +
>                              "but got: %s" % s)
>         kt = _parse_datatype_string(parts[0])
>         vt = _parse_datatype_string(parts[1])
>         return MapType(kt, vt)
>     elif s.startswith("struct<"):
>         if s[-1] != ">":
>             raise ValueError("'>' should be the last char, but got: %s" % s)
>         return _parse_struct_fields_string(s[7:-1])
>     elif ":" in s:
>         return _parse_struct_fields_string(s)
>     else:
>         return _parse_basic_datatype_string(s)
>
>
733,735d855
< _FIXED_DECIMAL = re.compile("decimal\\((\\d+),(\\d+)\\)")
<
<
943,945d1062
< _BRACKETS = {'(': ')', '[': ']', '{': '}'}
<
<
1094c1211
< def _verify_type(obj, dataType):
---
> def _verify_type(obj, dataType, nullable=True):
1096,1097c1213,1217
<     Verify the type of obj against dataType, raise an exception if
<     they do not match.
---
>     Verify the type of obj against dataType, raise a TypeError if they do not match.
>
>     Also verify the value of obj against datatype, raise a ValueError if it's not within the allowed
>     range, e.g. using 128 as ByteType will overflow. Note that, Python float is not checked, so it
>     will become infinity when cast to Java float if it overflows.
1113a1234,1256
>     >>> # Check if numeric values are within the allowed range.
>     >>> _verify_type(12, ByteType())
>     >>> _verify_type(1234, ByteType()) # doctest: +IGNORE_EXCEPTION_DETAIL
>     Traceback (most recent call last):
>         ...
>     ValueError:...
>     >>> _verify_type(None, ByteType(), False) # doctest: +IGNORE_EXCEPTION_DETAIL
>     Traceback (most recent call last):
>         ...
>     ValueError:...
>     >>> _verify_type([1, None], ArrayType(ShortType(), False)) # doctest: +IGNORE_EXCEPTION_DETAIL
>     Traceback (most recent call last):
>         ...
>     ValueError:...
>     >>> _verify_type({None: 1}, MapType(StringType(), IntegerType()))
>     Traceback (most recent call last):
>         ...
>     ValueError:...
>     >>> schema = StructType().add("a", IntegerType()).add("b", StringType(), False)
>     >>> _verify_type((1, None), schema) # doctest: +IGNORE_EXCEPTION_DETAIL
>     Traceback (most recent call last):
>         ...
>     ValueError:...
1115d1257
<     # all objects are nullable
1117c1259,1262
<         return
---
>         if nullable:
>             return
>         else:
>             raise ValueError("This field is not nullable, but got None")
1130c1275
<     assert _type in _acceptable_types, "unknown datatype: %s" % dataType
---
>     assert _type in _acceptable_types, "unknown datatype: %s for object %r" % (dataType, obj)
1134c1279
<             raise TypeError("StructType can not accept object in type %s" % type(obj))
---
>             raise TypeError("StructType can not accept object %r in type %s" % (obj, type(obj)))
1138c1283
<             raise TypeError("%s can not accept object in type %s" % (dataType, type(obj)))
---
>             raise TypeError("%s can not accept object %r in type %s" % (dataType, obj, type(obj)))
1140c1285,1297
<     if isinstance(dataType, ArrayType):
---
>     if isinstance(dataType, ByteType):
>         if obj < -128 or obj > 127:
>             raise ValueError("object of ByteType out of range, got: %s" % obj)
>
>     elif isinstance(dataType, ShortType):
>         if obj < -32768 or obj > 32767:
>             raise ValueError("object of ShortType out of range, got: %s" % obj)
>
>     elif isinstance(dataType, IntegerType):
>         if obj < -2147483648 or obj > 2147483647:
>             raise ValueError("object of IntegerType out of range, got: %s" % obj)
>
>     elif isinstance(dataType, ArrayType):
1142c1299
<             _verify_type(i, dataType.elementType)
---
>             _verify_type(i, dataType.elementType, dataType.containsNull)
1146,1147c1303,1304
<             _verify_type(k, dataType.keyType)
<             _verify_type(v, dataType.valueType)
---
>             _verify_type(k, dataType.keyType, False)
>             _verify_type(v, dataType.valueType, dataType.valueContainsNull)
1154c1311
<             _verify_type(v, f.dataType)
---
>             _verify_type(v, f.dataType, f.nullable)
1178a1336,1337
>     >>> row['name'], row['age']
>     ('Alice', 11)
1245a1405,1417
>     def __getitem__(self, item):
>         if isinstance(item, (int, slice)):
>             return super(Row, self).__getitem__(item)
>         try:
>             # it will be slow when it has many fields,
>             # but this will not be used in normal cases
>             idx = self.__fields__.index(item)
>             return super(Row, self).__getitem__(idx)
>         except IndexError:
>             raise KeyError(item)
>         except ValueError:
>             raise ValueError(item)
>
1295,1296c1467,1471
<         return Timestamp(int(time.mktime(obj.timetuple())) * 1000 + obj.microsecond // 1000)
<
---
>         seconds = (calendar.timegm(obj.utctimetuple()) if obj.tzinfo
>                    else time.mktime(obj.timetuple()))
>         t = Timestamp(int(seconds) * 1000)
>         t.setNanos(obj.microsecond * 1000)
>         return t