kszucs commented on a change in pull request #7519: URL: https://github.com/apache/arrow/pull/7519#discussion_r447738979
########## File path: python/pyarrow/scalar.pxi ########## @@ -16,1198 +16,745 @@ # under the License. -_NULL = NA = None +import collections cdef class Scalar: """ - The base class for all array elements. + The base class for scalars. """ + def __init__(self): + raise TypeError("Do not call {}'s constructor directly, use " + "pa.scalar() instead.".format(self.__class__.__name__)) -cdef class NullType(Scalar): - """ - Singleton for null array elements. - """ - # TODO rename this NullValue? + cdef void init(self, const shared_ptr[CScalar]& wrapped): + self.wrapped = wrapped - def __cinit__(self): - global NA - if NA is not None: - raise Exception('Cannot create multiple NAType instances') + @staticmethod + cdef wrap(const shared_ptr[CScalar]& wrapped): + cdef: + Scalar self + Type type_id = wrapped.get().type.get().id() + + if type_id == _Type_NA: + return _NULL + + typ = _scalar_classes[type_id] + self = typ.__new__(typ) + self.init(wrapped) + + return self + + cdef inline shared_ptr[CScalar] unwrap(self) nogil: + return self.wrapped + + @property + def type(self): + return pyarrow_wrap_data_type(self.wrapped.get().type) - self.type = null() + @property + def is_valid(self): + return self.wrapped.get().is_valid def __repr__(self): - return 'NULL' + return '<pyarrow.{}: {!r}>'.format( + self.__class__.__name__, self.as_py() + ) - def as_py(self): - """ - Return None - """ - return None + def __str__(self): + return str(self.as_py()) + + def equals(self, Scalar other): + return self.wrapped.get().Equals(other.unwrap().get()[0]) def __eq__(self, other): - return NA + try: + if not isinstance(other, Scalar): + other = scalar(other, type=self.type) + return self.equals(other) + except (TypeError, ValueError, ArrowInvalid): + return NotImplemented + + def __hash__(self): + cdef CScalarHash hasher + return hasher(self.wrapped) + + def as_py(self): + raise NotImplementedError() -_NULL = NA = NullType() +_NULL = NA = None -cdef class ArrayValue(Scalar): +cdef class NullScalar(Scalar): """ - The base class for non-null array elements. + Concrete class for null scalars. """ - def __init__(self): - raise TypeError("Do not call {}'s constructor directly, use array " - "subscription instead." - .format(self.__class__.__name__)) + def __cinit__(self): + global NA + if NA is not None: + raise Exception('Cannot create multiple NAType instances') + self.init(shared_ptr[CScalar](new CNullScalar())) - cdef void init(self, DataType type, const shared_ptr[CArray]& sp_array, - int64_t index): - self.type = type - self.index = index - self._set_array(sp_array) + def __init__(self): + pass - cdef void _set_array(self, const shared_ptr[CArray]& sp_array): - self.sp_array = sp_array + def __eq__(self, other): + return NA - def __repr__(self): - if hasattr(self, 'as_py'): - return repr(self.as_py()) - else: - return super(Scalar, self).__repr__() + def __hash__(self): + cdef CScalarHash hasher + return hasher(self.wrapped) - def __str__(self): - if hasattr(self, 'as_py'): - return str(self.as_py()) - else: - return super(Scalar, self).__str__() + def as_py(self): + """ + Return this value as a Python None. + """ + return None - def __eq__(self, other): - if hasattr(self, 'as_py'): - if isinstance(other, ArrayValue): - other = other.as_py() - return self.as_py() == other - else: - raise NotImplementedError( - "Cannot compare Arrow values that don't support as_py()") - def __hash__(self): - return hash(self.as_py()) +_NULL = NA = NullScalar() -cdef class BooleanValue(ArrayValue): +cdef class BooleanScalar(Scalar): """ - Concrete class for boolean array elements. + Concrete class for boolean scalars. """ def as_py(self): """ Return this value as a Python bool. """ - cdef CBooleanArray* ap = <CBooleanArray*> self.sp_array.get() - return ap.Value(self.index) + cdef CBooleanScalar* sp = <CBooleanScalar*> self.wrapped.get() + return sp.value if sp.is_valid else None -cdef class Int8Value(ArrayValue): +cdef class UInt8Scalar(Scalar): """ - Concrete class for int8 array elements. + Concrete class for uint8 scalars. """ def as_py(self): """ Return this value as a Python int. """ - cdef CInt8Array* ap = <CInt8Array*> self.sp_array.get() - return ap.Value(self.index) + cdef CUInt8Scalar* sp = <CUInt8Scalar*> self.wrapped.get() + return sp.value if sp.is_valid else None -cdef class UInt8Value(ArrayValue): +cdef class Int8Scalar(Scalar): """ - Concrete class for uint8 array elements. + Concrete class for int8 scalars. """ def as_py(self): """ Return this value as a Python int. """ - cdef CUInt8Array* ap = <CUInt8Array*> self.sp_array.get() - return ap.Value(self.index) + cdef CInt8Scalar* sp = <CInt8Scalar*> self.wrapped.get() + return sp.value if sp.is_valid else None -cdef class Int16Value(ArrayValue): +cdef class UInt16Scalar(Scalar): """ - Concrete class for int16 array elements. + Concrete class for uint16 scalars. """ def as_py(self): """ Return this value as a Python int. """ - cdef CInt16Array* ap = <CInt16Array*> self.sp_array.get() - return ap.Value(self.index) + cdef CUInt16Scalar* sp = <CUInt16Scalar*> self.wrapped.get() + return sp.value if sp.is_valid else None -cdef class UInt16Value(ArrayValue): +cdef class Int16Scalar(Scalar): """ - Concrete class for uint16 array elements. + Concrete class for int16 scalars. """ def as_py(self): """ Return this value as a Python int. """ - cdef CUInt16Array* ap = <CUInt16Array*> self.sp_array.get() - return ap.Value(self.index) + cdef CInt16Scalar* sp = <CInt16Scalar*> self.wrapped.get() + return sp.value if sp.is_valid else None -cdef class Int32Value(ArrayValue): +cdef class UInt32Scalar(Scalar): """ - Concrete class for int32 array elements. + Concrete class for uint32 scalars. """ def as_py(self): """ Return this value as a Python int. """ - cdef CInt32Array* ap = <CInt32Array*> self.sp_array.get() - return ap.Value(self.index) + cdef CUInt32Scalar* sp = <CUInt32Scalar*> self.wrapped.get() + return sp.value if sp.is_valid else None -cdef class UInt32Value(ArrayValue): +cdef class Int32Scalar(Scalar): """ - Concrete class for uint32 array elements. + Concrete class for int32 scalars. """ def as_py(self): """ Return this value as a Python int. """ - cdef CUInt32Array* ap = <CUInt32Array*> self.sp_array.get() - return ap.Value(self.index) + cdef CInt32Scalar* sp = <CInt32Scalar*> self.wrapped.get() + return sp.value if sp.is_valid else None -cdef class Int64Value(ArrayValue): +cdef class UInt64Scalar(Scalar): """ - Concrete class for int64 array elements. + Concrete class for uint64 scalars. """ def as_py(self): """ Return this value as a Python int. """ - cdef CInt64Array* ap = <CInt64Array*> self.sp_array.get() - return ap.Value(self.index) + cdef CUInt64Scalar* sp = <CUInt64Scalar*> self.wrapped.get() + return sp.value if sp.is_valid else None -cdef class UInt64Value(ArrayValue): +cdef class Int64Scalar(Scalar): """ - Concrete class for uint64 array elements. + Concrete class for int64 scalars. """ def as_py(self): """ Return this value as a Python int. """ - cdef CUInt64Array* ap = <CUInt64Array*> self.sp_array.get() - return ap.Value(self.index) + cdef CInt64Scalar* sp = <CInt64Scalar*> self.wrapped.get() + return sp.value if sp.is_valid else None -cdef class Date32Value(ArrayValue): +cdef class HalfFloatScalar(Scalar): """ - Concrete class for date32 array elements. + Concrete class for float scalars. """ + def __hash__(self): + cdef CScalarHash hasher + return hasher(self.wrapped) + + def __eq__(self, other): + if hasattr(self, 'as_py'): + if isinstance(other, Scalar): + other = other.as_py() + return self.as_py() == other + else: + raise NotImplementedError + def as_py(self): """ - Return this value as a Python datetime.datetime instance. + Return this value as a Python float. """ - cdef CDate32Array* ap = <CDate32Array*> self.sp_array.get() + cdef CHalfFloatScalar* sp = <CHalfFloatScalar*> self.wrapped.get() + return PyHalf_FromHalf(sp.value) if sp.is_valid else None - # Shift to seconds since epoch - return (datetime.date(1970, 1, 1) + - datetime.timedelta(days=ap.Value(self.index))) + +cdef class FloatScalar(Scalar): + """ + Concrete class for float scalars. + """ + + def as_py(self): + """ + Return this value as a Python float. + """ + cdef CFloatScalar* sp = <CFloatScalar*> self.wrapped.get() + return sp.value if sp.is_valid else None -cdef class Date64Value(ArrayValue): +cdef class DoubleScalar(Scalar): """ - Concrete class for date64 array elements. + Concrete class for double scalars. """ def as_py(self): """ - Return this value as a Python datetime.datetime instance. + Return this value as a Python float. """ - cdef CDate64Array* ap = <CDate64Array*> self.sp_array.get() - return (datetime.date(1970, 1, 1) + - datetime.timedelta( - days=ap.Value(self.index) / 86400000)) + cdef CDoubleScalar* sp = <CDoubleScalar*> self.wrapped.get() + return sp.value if sp.is_valid else None -cdef class Time32Value(ArrayValue): +cdef class Decimal128Scalar(Scalar): """ - Concrete class for time32 array elements. + Concrete class for decimal128 scalars. """ def as_py(self): """ - Return this value as a Python datetime.timedelta instance. + Return this value as a Python Decimal. """ cdef: - CTime32Array* ap = <CTime32Array*> self.sp_array.get() - CTime32Type* dtype = <CTime32Type*> ap.type().get() - - if dtype.unit() == TimeUnit_SECOND: - delta = datetime.timedelta(seconds=ap.Value(self.index)) - return (datetime.datetime(1970, 1, 1) + delta).time() + CDecimal128Scalar* sp = <CDecimal128Scalar*> self.wrapped.get() + CDecimal128Type* dtype = <CDecimal128Type*> sp.type.get() + if sp.is_valid: + return _pydecimal.Decimal( + frombytes(sp.value.ToString(dtype.scale())) + ) else: - return _box_time_milli(ap.Value(self.index)) + return None -cdef class Time64Value(ArrayValue): +cdef class Date32Scalar(Scalar): """ - Concrete class for time64 array elements. + Concrete class for date32 scalars. """ def as_py(self): """ - Return this value as a Python datetime.timedelta instance. + Return this value as a Python datetime.datetime instance. """ - cdef: - CTime64Array* ap = <CTime64Array*> self.sp_array.get() - CTime64Type* dtype = <CTime64Type*> ap.type().get() + cdef CDate32Scalar* sp = <CDate32Scalar*> self.wrapped.get() - cdef int64_t val = ap.Value(self.index) - if dtype.unit() == TimeUnit_MICRO: - return _box_time_micro(val) + if sp.is_valid: + # shift to seconds since epoch + return ( + datetime.date(1970, 1, 1) + datetime.timedelta(days=sp.value) + ) else: - return (datetime.datetime(1970, 1, 1) + - datetime.timedelta(microseconds=val / 1000)).time() - + return None -cpdef _box_time_milli(int64_t val): - delta = datetime.timedelta(milliseconds=val) - return (datetime.datetime(1970, 1, 1) + delta).time() +cdef class Date64Scalar(Scalar): + """ + Concrete class for date64 scalars. + """ -cpdef _box_time_micro(int64_t val): - return (datetime.datetime(1970, 1, 1) + - datetime.timedelta(microseconds=val)).time() + def as_py(self): + """ + Return this value as a Python datetime.datetime instance. + """ + cdef CDate64Scalar* sp = <CDate64Scalar*> self.wrapped.get() + if sp.is_valid: + return ( + datetime.date(1970, 1, 1) + + datetime.timedelta(days=sp.value / 86400000) + ) + else: + return None -cdef dict _DATETIME_CONVERSION_FUNCTIONS = {} -cdef c_bool _datetime_conversion_initialized = False +def _datetime_from_int(int64_t value, TimeUnit unit, tzinfo=None): + if unit == TimeUnit_SECOND: + delta = datetime.timedelta(seconds=value) + elif unit == TimeUnit_MILLI: + delta = datetime.timedelta(milliseconds=value) + elif unit == TimeUnit_MICRO: + delta = datetime.timedelta(microseconds=value) + else: + # TimeUnit_NANO: prefer pandas timestamps if available + if _pandas_api.have_pandas: + return _pandas_api.pd.Timestamp(value, tz=tzinfo, unit='ns') + # otherwise safely truncate to microsecond resolution datetime + if value % 1000 != 0: + raise ValueError( + "Nanosecond resolution temporal type {} is not safely " + "convertible to microseconds to convert to datetime.datetime. " + "Install pandas to return as Timestamp with nanosecond " + "support or access the .value attribute.".format(value) + ) + delta = datetime.timedelta(microseconds=value // 1000) -cdef _add_micros_maybe_localize(dt, micros, tzinfo): - import pytz - dt = dt.replace(microsecond=micros) + dt = datetime.datetime(1970, 1, 1) + delta + # adjust timezone if set to the datatype if tzinfo is not None: - if not isinstance(tzinfo, datetime.tzinfo): - tzinfo = string_to_tzinfo(tzinfo) dt = tzinfo.fromutc(dt) - return dt - - -cdef _datetime_from_seconds(int64_t v): - return datetime.datetime(1970, 1, 1) + datetime.timedelta(seconds=v) - - -def _nanoseconds_to_datetime_safe(v, tzinfo): - if v % 1000 != 0: - raise ValueError("Nanosecond timestamp {} is not safely convertible " - " to microseconds to convert to datetime.datetime." - " Install pandas to return as Timestamp with " - " nanosecond support or access the .value attribute.") - v = v // 1000 - micros = v % 1_000_000 - - dt = _datetime_from_seconds(v // 1_000_000) - return _add_micros_maybe_localize(dt, micros, tzinfo) - - -def _microseconds_to_datetime(v, tzinfo): - micros = v % 1_000_000 - dt = _datetime_from_seconds(v // 1_000_000) - return _add_micros_maybe_localize(dt, micros, tzinfo) + return dt -def _millis_to_datetime(v, tzinfo): - millis = v % 1_000 - dt = _datetime_from_seconds(v // 1000) - return _add_micros_maybe_localize(dt, millis * 1000, tzinfo) +cdef class Time32Scalar(Scalar): + """ + Concrete class for time32 scalars. + """ -def _seconds_to_datetime(v, tzinfo): - dt = _datetime_from_seconds(v) - return _add_micros_maybe_localize(dt, 0, tzinfo) + def as_py(self): + """ + Return this value as a Python datetime.timedelta instance. + """ + cdef: + CTime32Scalar* sp = <CTime32Scalar*> self.wrapped.get() + CTime32Type* dtype = <CTime32Type*> sp.type.get() + if sp.is_valid: + return _datetime_from_int(sp.value, unit=dtype.unit()).time() + else: + return None -def _datetime_conversion_functions(): - global _datetime_conversion_initialized - if _datetime_conversion_initialized: - return _DATETIME_CONVERSION_FUNCTIONS - _DATETIME_CONVERSION_FUNCTIONS.update({ - TimeUnit_SECOND: _seconds_to_datetime, - TimeUnit_MILLI: _millis_to_datetime, - TimeUnit_MICRO: _microseconds_to_datetime, - TimeUnit_NANO: _nanoseconds_to_datetime_safe - }) +cdef class Time64Scalar(Scalar): + """ + Concrete class for time64 scalars. + """ - try: - import pandas as pd - _DATETIME_CONVERSION_FUNCTIONS[TimeUnit_NANO] = ( - lambda x, tzinfo: pd.Timestamp( - x, tz=tzinfo, unit='ns', - ) - ) - except ImportError: - pass + def as_py(self): + """ + Return this value as a Python datetime.timedelta instance. + """ + cdef: + CTime64Scalar* sp = <CTime64Scalar*> self.wrapped.get() + CTime64Type* dtype = <CTime64Type*> sp.type.get() - _datetime_conversion_initialized = True - return _DATETIME_CONVERSION_FUNCTIONS + if sp.is_valid: + return _datetime_from_int(sp.value, unit=dtype.unit()).time() + else: + return None -cdef class TimestampValue(ArrayValue): +cdef class TimestampScalar(Scalar): """ - Concrete class for timestamp array elements. + Concrete class for timestamp scalars. """ @property def value(self): - cdef CTimestampArray* ap = <CTimestampArray*> self.sp_array.get() - cdef CTimestampType* dtype = <CTimestampType*> ap.type().get() - return ap.Value(self.index) + cdef CTimestampScalar* sp = <CTimestampScalar*> self.wrapped.get() + return sp.value if sp.is_valid else None def as_py(self): """ Return this value as a Pandas Timestamp instance (if available), otherwise as a Python datetime.timedelta instance. """ - cdef CTimestampArray* ap = <CTimestampArray*> self.sp_array.get() - cdef CTimestampType* dtype = <CTimestampType*> ap.type().get() + cdef: + CTimestampScalar* sp = <CTimestampScalar*> self.wrapped.get() + CTimestampType* dtype = <CTimestampType*> sp.type.get() - value = self.value + if not sp.is_valid: + return None if not dtype.timezone().empty(): tzinfo = string_to_tzinfo(frombytes(dtype.timezone())) + if not isinstance(tzinfo, datetime.tzinfo): + tzinfo = string_to_tzinfo(tzinfo) Review comment: It's a copy issue, I incrementally ported the previous helper functions. Removing the second call. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org