[GitHub] spark pull request: [SPARK-7902] [SPARK-6289] [SPARK-8685] [SQL] [...

davies Thu, 09 Jul 2015 00:36:07 -0700

Github user davies commented on a diff in the pull request:

    https://github.com/apache/spark/pull/7301#discussion_r34229953
  
    --- Diff: python/pyspark/sql/types.py ---
    @@ -1106,159 +1104,10 @@ def _verify_type(obj, dataType):
             for v, f in zip(obj, dataType.fields):
                 _verify_type(v, f.dataType)
     
    -_cached_cls = weakref.WeakValueDictionary()
    -
    -
    -def _restore_object(dataType, obj):
    -    """ Restore object during unpickling. """
    -    # use id(dataType) as key to speed up lookup in dict
    -    # Because of batched pickling, dataType will be the
    -    # same object in most cases.
    -    k = id(dataType)
    -    cls = _cached_cls.get(k)
    -    if cls is None or cls.__datatype is not dataType:
    -        # use dataType as key to avoid create multiple class
    -        cls = _cached_cls.get(dataType)
    -        if cls is None:
    -            cls = _create_cls(dataType)
    -            _cached_cls[dataType] = cls
    -        cls.__datatype = dataType
    -        _cached_cls[k] = cls
    -    return cls(obj)
    -
    -
    -def _create_object(cls, v):
    -    """ Create an customized object with class `cls`. """
    -    # datetime.date would be deserialized as datetime.datetime
    -    # from java type, so we need to set it back.
    -    if cls is datetime.date and isinstance(v, datetime.datetime):
    -        return v.date()
    -    return cls(v) if v is not None else v
    -
    -
    -def _create_getter(dt, i):
    -    """ Create a getter for item `i` with schema """
    -    cls = _create_cls(dt)
    -
    -    def getter(self):
    -        return _create_object(cls, self[i])
    -
    -    return getter
    -
    -
    -def _has_struct_or_date(dt):
    -    """Return whether `dt` is or has StructType/DateType in it"""
    -    if isinstance(dt, StructType):
    -        return True
    -    elif isinstance(dt, ArrayType):
    -        return _has_struct_or_date(dt.elementType)
    -    elif isinstance(dt, MapType):
    -        return _has_struct_or_date(dt.keyType) or _has_struct_or_date(dt.valueType)
    -    elif isinstance(dt, DateType):
    -        return True
    -    elif isinstance(dt, UserDefinedType):
    -        return True
    -    return False
    -
    -
    -def _create_properties(fields):
    -    """Create properties according to fields"""
    -    ps = {}
    -    for i, f in enumerate(fields):
    -        name = f.name
    -        if (name.startswith("__") and name.endswith("__")
    -                or keyword.iskeyword(name)):
    -            warnings.warn("field name %s can not be accessed in Python,"
    -                          "use position to access it instead" % name)
    -        if _has_struct_or_date(f.dataType):
    -            # delay creating object until accessing it
    -            getter = _create_getter(f.dataType, i)
    -        else:
    -            getter = itemgetter(i)
    -        ps[name] = property(getter)
    -    return ps
    -
    -
    -def _create_cls(dataType):
    -    """
    -    Create an class by dataType
    -
    -    The created class is similar to namedtuple, but can have nested schema.
    -
    -    >>> schema = _parse_schema_abstract("a b c")
    -    >>> row = (1, 1.0, "str")
    -    >>> schema = _infer_schema_type(row, schema)
    -    >>> obj = _create_cls(schema)(row)
    -    >>> import pickle
    -    >>> pickle.loads(pickle.dumps(obj))
    -    Row(a=1, b=1.0, c='str')
    -
    -    >>> row = [[1], {"key": (1, 2.0)}]
    -    >>> schema = _parse_schema_abstract("a[] b{c d}")
    -    >>> schema = _infer_schema_type(row, schema)
    -    >>> obj = _create_cls(schema)(row)
    -    >>> pickle.loads(pickle.dumps(obj))
    -    Row(a=[1], b={'key': Row(c=1, d=2.0)})
    -    >>> pickle.loads(pickle.dumps(obj.a))
    -    [1]
    -    >>> pickle.loads(pickle.dumps(obj.b))
    -    {'key': Row(c=1, d=2.0)}
    -    """
    -
    -    if isinstance(dataType, ArrayType):
    -        cls = _create_cls(dataType.elementType)
    -
    -        def List(l):
    -            if l is None:
    -                return
    -            return [_create_object(cls, v) for v in l]
    -
    -        return List
    -
    -    elif isinstance(dataType, MapType):
    -        kcls = _create_cls(dataType.keyType)
    -        vcls = _create_cls(dataType.valueType)
    -
    -        def Dict(d):
    -            if d is None:
    -                return
    -            return dict((_create_object(kcls, k), _create_object(vcls, v)) for k, v in d.items())
    -
    -        return Dict
    -
    -    elif isinstance(dataType, DateType):
    -        return datetime.date
    -
    -    elif isinstance(dataType, UserDefinedType):
    -        return lambda datum: dataType.deserialize(datum)
    -
    -    elif not isinstance(dataType, StructType):
    -        # no wrapper for atomic types
    -        return lambda x: x
    -
    -    class Row(tuple):
    -
    -        """ Row in DataFrame """
    -        __datatype = dataType
    -        __fields__ = tuple(f.name for f in dataType.fields)
    -        __slots__ = ()
    -
    -        # create property for fast access
    -        locals().update(_create_properties(dataType.fields))
    -
    -        def asDict(self):
    -            """ Return as a dict """
    -            return dict((n, getattr(self, n)) for n in self.__fields__)
    -
    -        def __repr__(self):
    -            # call collect __repr__ for nested objects
    -            return ("Row(%s)" % ", ".join("%s=%r" % (n, getattr(self, n))
    -                                          for n in self.__fields__))
    -
    -        def __reduce__(self):
    -            return (_restore_object, (self.__datatype, tuple(self)))
     
    -    return Row
    +# This is used to unpickle a Row from JVM
    +def _create_row_inbound_converter(dataType):
    +    return lambda *a: dataType.deserialize(a)
    --- End diff --
    
    It's the same as:
    ```
    def foo(*args): pass
    ```



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] spark pull request: [SPARK-7902] [SPARK-6289] [SPARK-8685] [SQL] [...

Reply via email to