[
https://issues.apache.org/jira/browse/ARROW-1718?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16224467#comment-16224467
]
ASF GitHub Bot commented on ARROW-1718:
---------------------------------------
xhochy closed pull request #1258: ARROW-1718: [C++/Python] Implement casts from
timestamp to date32/64, properly handle NumPy datetime64[D] -> date32
URL: https://github.com/apache/arrow/pull/1258
This pull request was merged from a forked repository. Because GitHub hides
the original diff of a foreign (forked) pull request once it is merged, the
diff is reproduced below for the sake of provenance:
diff --git a/.travis.yml b/.travis.yml
index 039ae9520..6419548a6 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -51,12 +51,12 @@ matrix:
os: linux
group: deprecated
before_script:
- - export CC="gcc-4.9"
- - export CXX="g++-4.9"
- export ARROW_TRAVIS_USE_TOOLCHAIN=1
- export ARROW_TRAVIS_VALGRIND=1
- export ARROW_TRAVIS_PLASMA=1
- export ARROW_TRAVIS_CLANG_FORMAT=1
+ - export CC="clang-4.0"
+ - export CXX="clang++-4.0"
- $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh
- $TRAVIS_BUILD_DIR/ci/travis_lint.sh
- $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh
diff --git a/cpp/src/arrow/compute/cast.cc b/cpp/src/arrow/compute/cast.cc
index 68a2b1237..114ab9af0 100644
--- a/cpp/src/arrow/compute/cast.cc
+++ b/cpp/src/arrow/compute/cast.cc
@@ -69,11 +69,18 @@
namespace arrow {
namespace compute {
+constexpr int64_t kMillisecondsInDay = 86400000;
+
template <typename T>
-inline const T* GetValuesAs(const ArrayData& data, int i) {
+inline const T* GetValues(const ArrayData& data, int i) {
return reinterpret_cast<const T*>(data.buffers[i]->data()) + data.offset;
}
+template <typename T>
+inline T* GetMutableValues(const ArrayData* data, int i) {
+ return reinterpret_cast<T*>(data->buffers[i]->mutable_data()) + data->offset;
+}
+
namespace {
void CopyData(const Array& input, ArrayData* output) {
@@ -164,7 +171,7 @@ struct CastFunctor<T, BooleanType,
auto in_data = input.data();
internal::BitmapReader bit_reader(in_data->buffers[1]->data(),
in_data->offset,
in_data->length);
- auto out = reinterpret_cast<c_type*>(output->buffers[1]->mutable_data());
+ auto out = GetMutableValues<c_type>(output, 1);
for (int64_t i = 0; i < input.length(); ++i) {
*out++ = bit_reader.IsSet() ? kOne : kZero;
bit_reader.Next();
@@ -214,8 +221,8 @@ struct CastFunctor<O, I, typename
std::enable_if<std::is_same<BooleanType, O>::v
using in_type = typename I::c_type;
DCHECK_EQ(output->offset, 0);
- const in_type* in_data = GetValuesAs<in_type>(*input.data(), 1);
- uint8_t* out_data =
reinterpret_cast<uint8_t*>(output->buffers[1]->mutable_data());
+ const in_type* in_data = GetValues<in_type>(*input.data(), 1);
+ uint8_t* out_data = GetMutableValues<uint8_t>(output, 1);
for (int64_t i = 0; i < input.length(); ++i) {
BitUtil::SetBitTo(out_data, i, (*in_data++) != 0);
}
@@ -233,8 +240,8 @@ struct CastFunctor<O, I,
auto in_offset = input.offset();
- const in_type* in_data = GetValuesAs<in_type>(*input.data(), 1);
- auto out_data =
reinterpret_cast<out_type*>(output->buffers[1]->mutable_data());
+ const in_type* in_data = GetValues<in_type>(*input.data(), 1);
+ auto out_data = GetMutableValues<out_type>(output, 1);
if (!options.allow_int_overflow) {
constexpr in_type kMax =
static_cast<in_type>(std::numeric_limits<out_type>::max());
@@ -276,8 +283,8 @@ struct CastFunctor<O, I,
using in_type = typename I::c_type;
using out_type = typename O::c_type;
- const in_type* in_data = GetValuesAs<in_type>(*input.data(), 1);
- auto out_data =
reinterpret_cast<out_type*>(output->buffers[1]->mutable_data());
+ const in_type* in_data = GetValues<in_type>(*input.data(), 1);
+ auto out_data = GetMutableValues<out_type>(output, 1);
for (int64_t i = 0; i < input.length(); ++i) {
*out_data++ = static_cast<out_type>(*in_data++);
}
@@ -288,13 +295,16 @@ struct CastFunctor<O, I,
// From one timestamp to another
template <typename in_type, typename out_type>
-inline void ShiftTime(FunctionContext* ctx, const CastOptions& options,
- const bool is_multiply, const int64_t factor, const
Array& input,
- ArrayData* output) {
- const in_type* in_data = GetValuesAs<in_type>(*input.data(), 1);
- auto out_data =
reinterpret_cast<out_type*>(output->buffers[1]->mutable_data());
+void ShiftTime(FunctionContext* ctx, const CastOptions& options, const bool
is_multiply,
+ const int64_t factor, const Array& input, ArrayData* output) {
+ const in_type* in_data = GetValues<in_type>(*input.data(), 1);
+ auto out_data = GetMutableValues<out_type>(output, 1);
- if (is_multiply) {
+ if (factor == 1) {
+ for (int64_t i = 0; i < input.length(); i++) {
+ out_data[i] = static_cast<out_type>(in_data[i]);
+ }
+ } else if (is_multiply) {
for (int64_t i = 0; i < input.length(); i++) {
out_data[i] = static_cast<out_type>(in_data[i] * factor);
}
@@ -352,6 +362,52 @@ struct CastFunctor<TimestampType, TimestampType> {
}
};
+template <>
+struct CastFunctor<Date32Type, TimestampType> {
+ void operator()(FunctionContext* ctx, const CastOptions& options, const
Array& input,
+ ArrayData* output) {
+ const auto& in_type = static_cast<const TimestampType&>(*input.type());
+
+ static const int64_t kTimestampToDateFactors[4] = {
+ 86400LL, // SECOND
+ 86400LL * 1000LL, // MILLI
+ 86400LL * 1000LL * 1000LL, // MICRO
+ 86400LL * 1000LL * 1000LL * 1000LL, // NANO
+ };
+
+ const int64_t factor =
kTimestampToDateFactors[static_cast<int>(in_type.unit())];
+ ShiftTime<int64_t, int32_t>(ctx, options, false, factor, input, output);
+ }
+};
+
+template <>
+struct CastFunctor<Date64Type, TimestampType> {
+ void operator()(FunctionContext* ctx, const CastOptions& options, const
Array& input,
+ ArrayData* output) {
+ const auto& in_type = static_cast<const TimestampType&>(*input.type());
+
+ std::pair<bool, int64_t> conversion =
+ kTimeConversionTable[static_cast<int>(in_type.unit())]
+ [static_cast<int>(TimeUnit::MILLI)];
+
+ ShiftTime<int64_t, int64_t>(ctx, options, conversion.first,
conversion.second, input,
+ output);
+
+ // Ensure that intraday milliseconds have been zeroed out
+ auto out_data = GetMutableValues<int64_t>(output, 1);
+ for (int64_t i = 0; i < input.length(); ++i) {
+ const int64_t remainder = out_data[i] % kMillisecondsInDay;
+ if (ARROW_PREDICT_FALSE(!options.allow_time_truncate && input.IsValid(i)
&&
+ remainder > 0)) {
+ ctx->SetStatus(
+ Status::Invalid("Timestamp value had non-zero intraday
milliseconds"));
+ break;
+ }
+ out_data[i] -= remainder;
+ }
+ }
+};
+
// ----------------------------------------------------------------------
// From one time32 or time64 to another
@@ -385,8 +441,6 @@ struct CastFunctor<O, I,
// ----------------------------------------------------------------------
// Between date32 and date64
-constexpr int64_t kMillisecondsInDay = 86400000;
-
template <>
struct CastFunctor<Date64Type, Date32Type> {
void operator()(FunctionContext* ctx, const CastOptions& options, const
Array& input,
@@ -415,7 +469,7 @@ void UnpackFixedSizeBinaryDictionary(FunctionContext* ctx,
const Array& indices,
internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(),
indices.offset(),
indices.length());
- const index_c_type* in = GetValuesAs<index_c_type>(*indices.data(), 1);
+ const index_c_type* in = GetValues<index_c_type>(*indices.data(), 1);
uint8_t* out = output->buffers[1]->mutable_data();
int32_t byte_width =
@@ -479,7 +533,7 @@ Status UnpackBinaryDictionary(FunctionContext* ctx, const
Array& indices,
internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(),
indices.offset(),
indices.length());
- const index_c_type* in = GetValuesAs<index_c_type>(*indices.data(), 1);
+ const index_c_type* in = GetValues<index_c_type>(*indices.data(), 1);
for (int64_t i = 0; i < indices.length(); ++i) {
if (valid_bits_reader.IsSet()) {
int32_t length;
@@ -550,7 +604,7 @@ void UnpackPrimitiveDictionary(const Array& indices, const
c_type* dictionary,
internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(),
indices.offset(),
indices.length());
- const index_c_type* in = GetValuesAs<index_c_type>(*indices.data(), 1);
+ const index_c_type* in = GetValues<index_c_type>(*indices.data(), 1);
for (int64_t i = 0; i < indices.length(); ++i) {
if (valid_bits_reader.IsSet()) {
out[i] = dictionary[in[i]];
@@ -575,7 +629,7 @@ struct CastFunctor<T, DictionaryType,
DCHECK(values_type.Equals(*output->type))
<< "Dictionary type: " << values_type << " target type: " <<
(*output->type);
- const c_type* dictionary = GetValuesAs<c_type>(*type.dictionary()->data(),
1);
+ const c_type* dictionary = GetValues<c_type>(*type.dictionary()->data(),
1);
auto out = reinterpret_cast<c_type*>(output->buffers[1]->mutable_data());
const Array& indices = *dict_array.indices();
@@ -755,7 +809,10 @@ class CastKernel : public UnaryKernel {
FN(Time64Type, Time32Type); \
FN(Time64Type, Time64Type);
-#define TIMESTAMP_CASES(FN, IN_TYPE) FN(TimestampType, TimestampType);
+#define TIMESTAMP_CASES(FN, IN_TYPE) \
+ FN(TimestampType, TimestampType); \
+ FN(TimestampType, Date32Type); \
+ FN(TimestampType, Date64Type);
#define DICTIONARY_CASES(FN, IN_TYPE) \
FN(IN_TYPE, NullType); \
diff --git a/cpp/src/arrow/compute/compute-test.cc
b/cpp/src/arrow/compute/compute-test.cc
index 8a7ef923b..61d53c4d5 100644
--- a/cpp/src/arrow/compute/compute-test.cc
+++ b/cpp/src/arrow/compute/compute-test.cc
@@ -355,6 +355,70 @@ TEST_F(TestCast, TimestampToTimestamp) {
timestamp(TimeUnit::SECOND), options);
}
+TEST_F(TestCast, TimestampToDate32_Date64) {
+ CastOptions options;
+
+ vector<bool> is_valid = {true, true, false};
+
+ // 2000-01-01, 2000-01-02, null
+ vector<int64_t> v_nano = {946684800000000000, 946771200000000000, 0};
+ vector<int64_t> v_micro = {946684800000000, 946771200000000, 0};
+ vector<int64_t> v_milli = {946684800000, 946771200000, 0};
+ vector<int64_t> v_second = {946684800, 946771200, 0};
+ vector<int32_t> v_day = {10957, 10958, 0};
+
+ // Simple conversions
+ CheckCase<TimestampType, int64_t, Date64Type, int64_t>(
+ timestamp(TimeUnit::NANO), v_nano, is_valid, date64(), v_milli, options);
+ CheckCase<TimestampType, int64_t, Date64Type, int64_t>(
+ timestamp(TimeUnit::MICRO), v_micro, is_valid, date64(), v_milli,
options);
+ CheckCase<TimestampType, int64_t, Date64Type, int64_t>(
+ timestamp(TimeUnit::MILLI), v_milli, is_valid, date64(), v_milli,
options);
+ CheckCase<TimestampType, int64_t, Date64Type, int64_t>(
+ timestamp(TimeUnit::SECOND), v_second, is_valid, date64(), v_milli,
options);
+
+ CheckCase<TimestampType, int64_t, Date32Type, int32_t>(
+ timestamp(TimeUnit::NANO), v_nano, is_valid, date32(), v_day, options);
+ CheckCase<TimestampType, int64_t, Date32Type, int32_t>(
+ timestamp(TimeUnit::MICRO), v_micro, is_valid, date32(), v_day, options);
+ CheckCase<TimestampType, int64_t, Date32Type, int32_t>(
+ timestamp(TimeUnit::MILLI), v_milli, is_valid, date32(), v_day, options);
+ CheckCase<TimestampType, int64_t, Date32Type, int32_t>(
+ timestamp(TimeUnit::SECOND), v_second, is_valid, date32(), v_day,
options);
+
+ // Disallow truncate, failures
+ vector<int64_t> v_nano_fail = {946684800000000001, 946771200000000001, 0};
+ vector<int64_t> v_micro_fail = {946684800000001, 946771200000001, 0};
+ vector<int64_t> v_milli_fail = {946684800001, 946771200001, 0};
+ vector<int64_t> v_second_fail = {946684801, 946771201, 0};
+
+ options.allow_time_truncate = false;
+ CheckFails<TimestampType>(timestamp(TimeUnit::NANO), v_nano_fail, is_valid,
date64(),
+ options);
+ CheckFails<TimestampType>(timestamp(TimeUnit::MICRO), v_micro_fail,
is_valid, date64(),
+ options);
+ CheckFails<TimestampType>(timestamp(TimeUnit::MILLI), v_milli_fail,
is_valid, date64(),
+ options);
+ CheckFails<TimestampType>(timestamp(TimeUnit::SECOND), v_second_fail,
is_valid,
+ date64(), options);
+
+ CheckFails<TimestampType>(timestamp(TimeUnit::NANO), v_nano_fail, is_valid,
date32(),
+ options);
+ CheckFails<TimestampType>(timestamp(TimeUnit::MICRO), v_micro_fail,
is_valid, date32(),
+ options);
+ CheckFails<TimestampType>(timestamp(TimeUnit::MILLI), v_milli_fail,
is_valid, date32(),
+ options);
+ CheckFails<TimestampType>(timestamp(TimeUnit::SECOND), v_second_fail,
is_valid,
+ date32(), options);
+
+ // Make sure that nulls are excluded from the truncation checks
+ vector<int64_t> v_second_nofail = {946684800, 946771200, 1};
+ CheckCase<TimestampType, int64_t, Date64Type, int64_t>(
+ timestamp(TimeUnit::SECOND), v_second_nofail, is_valid, date64(),
v_milli, options);
+ CheckCase<TimestampType, int64_t, Date32Type, int32_t>(
+ timestamp(TimeUnit::SECOND), v_second_nofail, is_valid, date32(), v_day,
options);
+}
+
TEST_F(TestCast, TimeToTime) {
CastOptions options;
diff --git a/cpp/src/arrow/python/builtin_convert.cc
b/cpp/src/arrow/python/builtin_convert.cc
index d52627ebf..0e775a0fb 100644
--- a/cpp/src/arrow/python/builtin_convert.cc
+++ b/cpp/src/arrow/python/builtin_convert.cc
@@ -519,7 +519,26 @@ class UInt64Converter : public
TypedConverterVisitor<UInt64Builder, UInt64Conver
}
};
-class DateConverter : public TypedConverterVisitor<Date64Builder,
DateConverter> {
+class Date32Converter : public TypedConverterVisitor<Date32Builder,
Date32Converter> {
+ public:
+ inline Status AppendItem(const OwnedRef& item) {
+ int32_t t;
+ if (PyDate_Check(item.obj())) {
+ auto pydate = reinterpret_cast<PyDateTime_Date*>(item.obj());
+ t = static_cast<int32_t>(PyDate_to_s(pydate));
+ } else {
+ int64_t casted_val = static_cast<int64_t>(PyLong_AsLongLong(item.obj()));
+ RETURN_IF_PYERROR();
+ if (casted_val > std::numeric_limits<int32_t>::max()) {
+ return Status::Invalid("Integer as date32 larger than INT32_MAX");
+ }
+ t = static_cast<int32_t>(casted_val);
+ }
+ return typed_builder_->Append(t);
+ }
+};
+
+class Date64Converter : public TypedConverterVisitor<Date64Builder,
Date64Converter> {
public:
inline Status AppendItem(const OwnedRef& item) {
int64_t t;
@@ -535,7 +554,7 @@ class DateConverter : public
TypedConverterVisitor<Date64Builder, DateConverter>
};
class TimestampConverter
- : public TypedConverterVisitor<Date64Builder, TimestampConverter> {
+ : public TypedConverterVisitor<TimestampBuilder, TimestampConverter> {
public:
explicit TimestampConverter(TimeUnit::type unit) : unit_(unit) {}
@@ -717,8 +736,10 @@ std::shared_ptr<SeqConverter> GetConverter(const
std::shared_ptr<DataType>& type
return std::make_shared<UInt32Converter>();
case Type::UINT64:
return std::make_shared<UInt64Converter>();
+ case Type::DATE32:
+ return std::make_shared<Date32Converter>();
case Type::DATE64:
- return std::make_shared<DateConverter>();
+ return std::make_shared<Date64Converter>();
case Type::TIMESTAMP:
return std::make_shared<TimestampConverter>(
static_cast<const TimestampType&>(*type).unit());
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc
b/cpp/src/arrow/python/numpy_to_arrow.cc
index ead3a0481..c5aff2e4f 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -260,6 +260,7 @@ class NumPyConverter {
: pool_(pool),
type_(type),
arr_(reinterpret_cast<PyArrayObject*>(ao)),
+ dtype_(PyArray_DESCR(arr_)),
mask_(nullptr),
use_pandas_null_sentinels_(use_pandas_null_sentinels) {
if (mo != nullptr && mo != Py_None) {
@@ -431,6 +432,7 @@ class NumPyConverter {
MemoryPool* pool_;
std::shared_ptr<DataType> type_;
PyArrayObject* arr_;
+ PyArray_Descr* dtype_;
PyArrayObject* mask_;
int64_t length_;
int64_t stride_;
@@ -450,7 +452,7 @@ Status NumPyConverter::Convert() {
return Status::Invalid("only handle 1-dimensional arrays");
}
- if (PyArray_DESCR(arr_)->type_num == NPY_OBJECT) {
+ if (dtype_->type_num == NPY_OBJECT) {
return ConvertObjects();
}
@@ -462,33 +464,12 @@ Status NumPyConverter::Convert() {
return VisitTypeInline(*type_, this);
}
-template <typename T, typename T2>
-void CopyStrided(T* input_data, int64_t length, int64_t stride, T2*
output_data) {
- // Passing input_data as non-const is a concession to PyObject*
- int64_t j = 0;
- for (int64_t i = 0; i < length; ++i) {
- output_data[i] = static_cast<T2>(input_data[j]);
- j += stride;
- }
-}
-
-template <>
-void CopyStrided<PyObject*, PyObject*>(PyObject** input_data, int64_t length,
- int64_t stride, PyObject** output_data)
{
- int64_t j = 0;
- for (int64_t i = 0; i < length; ++i) {
- output_data[i] = input_data[j];
- if (output_data[i] != nullptr) {
- Py_INCREF(output_data[i]);
- }
- j += stride;
- }
-}
+namespace {
-static Status CastBuffer(const std::shared_ptr<Buffer>& input, const int64_t
length,
- const std::shared_ptr<DataType>& in_type,
- const std::shared_ptr<DataType>& out_type,
MemoryPool* pool,
- std::shared_ptr<Buffer>* out) {
+Status CastBuffer(const std::shared_ptr<Buffer>& input, const int64_t length,
+ const std::shared_ptr<DataType>& in_type,
+ const std::shared_ptr<DataType>& out_type, MemoryPool* pool,
+ std::shared_ptr<Buffer>* out) {
// Must cast
std::vector<std::shared_ptr<Buffer>> buffers = {nullptr, input};
auto tmp_data = std::make_shared<ArrayData>(in_type, length, buffers, 0);
@@ -499,6 +480,7 @@ static Status CastBuffer(const std::shared_ptr<Buffer>&
input, const int64_t len
compute::FunctionContext context(pool);
compute::CastOptions cast_options;
cast_options.allow_int_overflow = false;
+ cast_options.allow_time_truncate = false;
RETURN_NOT_OK(
compute::Cast(&context, *tmp_array, out_type, cast_options,
&casted_array));
@@ -506,29 +488,47 @@ static Status CastBuffer(const std::shared_ptr<Buffer>&
input, const int64_t len
return Status::OK();
}
+template <typename T, typename T2>
+void CopyStrided(T* input_data, int64_t length, int64_t stride, T2*
output_data) {
+ // Passing input_data as non-const is a concession to PyObject*
+ int64_t j = 0;
+ for (int64_t i = 0; i < length; ++i) {
+ output_data[i] = static_cast<T2>(input_data[j]);
+ j += stride;
+ }
+}
+
template <typename ArrowType>
-inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
+Status CopyStridedArray(PyArrayObject* arr, const int64_t length, MemoryPool*
pool,
+ std::shared_ptr<Buffer>* out) {
using traits = internal::arrow_traits<ArrowType::type_id>;
using T = typename traits::T;
+ // Strided, must copy into new contiguous memory
+ const int64_t stride = PyArray_STRIDES(arr)[0];
+ const int64_t stride_elements = stride / sizeof(T);
+
+ auto new_buffer = std::make_shared<PoolBuffer>(pool);
+ RETURN_NOT_OK(new_buffer->Resize(sizeof(T) * length));
+ CopyStrided(reinterpret_cast<T*>(PyArray_DATA(arr)), length, stride_elements,
+ reinterpret_cast<T*>(new_buffer->mutable_data()));
+ *out = new_buffer;
+ return Status::OK();
+}
+
+} // namespace
+
+template <typename ArrowType>
+inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
if (is_strided()) {
- // Strided, must copy into new contiguous memory
- const int64_t stride = PyArray_STRIDES(arr_)[0];
- const int64_t stride_elements = stride / sizeof(T);
-
- auto new_buffer = std::make_shared<PoolBuffer>(pool_);
- RETURN_NOT_OK(new_buffer->Resize(sizeof(T) * length_));
- CopyStrided(reinterpret_cast<T*>(PyArray_DATA(arr_)), length_,
stride_elements,
- reinterpret_cast<T*>(new_buffer->mutable_data()));
- *data = new_buffer;
+ RETURN_NOT_OK(CopyStridedArray<ArrowType>(arr_, length_, pool_, data));
} else {
// Can zero-copy
*data = std::make_shared<NumPyBuffer>(reinterpret_cast<PyObject*>(arr_));
}
std::shared_ptr<DataType> input_type;
- RETURN_NOT_OK(
- NumPyDtypeToArrow(reinterpret_cast<PyObject*>(PyArray_DESCR(arr_)),
&input_type));
+ RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_),
&input_type));
if (!input_type->Equals(*type_)) {
RETURN_NOT_OK(CastBuffer(*data, length_, input_type, type_, pool_, data));
@@ -538,45 +538,6 @@ inline Status
NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
}
template <>
-inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>*
data) {
- // Handle LONGLONG->INT64 and other fun things
- int type_num_compat = cast_npy_type_compat(PyArray_DESCR(arr_)->type_num);
- int type_size = NumPyTypeSize(type_num_compat);
-
- if (type_size == 4) {
- // Source and target are INT32, so can refer to the main implementation.
- return ConvertData<Int32Type>(data);
- } else if (type_size == 8) {
- // We need to scale down from int64 to int32
- auto new_buffer = std::make_shared<PoolBuffer>(pool_);
- RETURN_NOT_OK(new_buffer->Resize(sizeof(int32_t) * length_));
-
- auto input = reinterpret_cast<const int64_t*>(PyArray_DATA(arr_));
- auto output = reinterpret_cast<int32_t*>(new_buffer->mutable_data());
-
- if (is_strided()) {
- // Strided, must copy into new contiguous memory
- const int64_t stride = PyArray_STRIDES(arr_)[0];
- const int64_t stride_elements = stride / sizeof(int64_t);
- CopyStrided(input, length_, stride_elements, output);
- } else {
- // TODO(wesm): int32 overflow checks
- for (int64_t i = 0; i < length_; ++i) {
- *output++ = static_cast<int32_t>(*input++);
- }
- }
- *data = new_buffer;
- } else {
- std::stringstream ss;
- ss << "Cannot convert NumPy array of element size ";
- ss << type_size << " to a Date32 array";
- return Status::NotImplemented(ss.str());
- }
-
- return Status::OK();
-}
-
-template <>
inline Status
NumPyConverter::ConvertData<BooleanType>(std::shared_ptr<Buffer>* data) {
int64_t nbytes = BitUtil::BytesForBits(length_);
auto buffer = std::make_shared<PoolBuffer>(pool_);
@@ -597,6 +558,42 @@ inline Status
NumPyConverter::ConvertData<BooleanType>(std::shared_ptr<Buffer>*
return Status::OK();
}
+template <>
+inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>*
data) {
+ if (is_strided()) {
+ RETURN_NOT_OK(CopyStridedArray<Date32Type>(arr_, length_, pool_, data));
+ } else {
+ // Can zero-copy
+ *data = std::make_shared<NumPyBuffer>(reinterpret_cast<PyObject*>(arr_));
+ }
+
+ // If we have inbound datetime64[D] data, this needs to be downcasted
+ // separately here from int64_t to int32_t, because this data is not
+ // supported in compute::Cast
+ auto date_dtype =
reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(dtype_->c_metadata);
+ if (dtype_->type_num == NPY_DATETIME && date_dtype->meta.base == NPY_FR_D) {
+ auto date32_buffer = std::make_shared<PoolBuffer>(pool_);
+ RETURN_NOT_OK(date32_buffer->Resize(sizeof(int32_t) * length_));
+
+ auto datetime64_values = reinterpret_cast<const int64_t*>((*data)->data());
+ auto date32_values =
reinterpret_cast<int32_t*>(date32_buffer->mutable_data());
+ for (int64_t i = 0; i < length_; ++i) {
+ // TODO(wesm): How pedantic do we really want to be about checking for
int32
+ // overflow here?
+ *date32_values++ = static_cast<int32_t>(*datetime64_values++);
+ }
+ *data = date32_buffer;
+ } else {
+ std::shared_ptr<DataType> input_type;
+ RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_),
&input_type));
+ if (!input_type->Equals(*type_)) {
+ RETURN_NOT_OK(CastBuffer(*data, length_, input_type, type_, pool_,
data));
+ }
+ }
+
+ return Status::OK();
+}
+
template <typename T>
struct UnboxDate {};
diff --git a/cpp/src/arrow/python/util/datetime.h
b/cpp/src/arrow/python/util/datetime.h
index c110bc64a..e76c2e0db 100644
--- a/cpp/src/arrow/python/util/datetime.h
+++ b/cpp/src/arrow/python/util/datetime.h
@@ -235,6 +235,11 @@ static inline Status PyDateTime_from_int(int64_t val,
const TimeUnit::type unit,
return Status::OK();
}
+static inline int64_t PyDate_to_s(PyDateTime_Date* pydate) {
+ return get_days_from_date(PyDateTime_GET_YEAR(pydate),
PyDateTime_GET_MONTH(pydate),
+ PyDateTime_GET_DAY(pydate));
+}
+
static inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) {
int64_t total_seconds = 0;
total_seconds += PyDateTime_DATE_GET_SECOND(pydate);
diff --git a/python/pyarrow/tests/test_convert_builtin.py
b/python/pyarrow/tests/test_convert_builtin.py
index 414266ddb..c7a0d49b4 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -178,6 +178,25 @@ def test_date(self):
assert arr[2].as_py() == datetime.date(1970, 1, 1)
assert arr[3].as_py() == datetime.date(2040, 2, 26)
+ def test_date32(self):
+ data = [datetime.date(2000, 1, 1), None]
+ arr = pa.array(data, type=pa.date32())
+
+ data2 = [10957, None]
+ arr2 = pa.array(data2, type=pa.date32())
+
+ for x in [arr, arr2]:
+ assert len(x) == 2
+ assert x.type == pa.date32()
+ assert x.null_count == 1
+ assert x[0].as_py() == datetime.date(2000, 1, 1)
+ assert x[1] is pa.NA
+
+ # Overflow
+ data3 = [2**32, None]
+ with pytest.raises(pa.ArrowException):
+ pa.array(data3, type=pa.date32())
+
def test_timestamp(self):
data = [
datetime.datetime(2007, 7, 13, 1, 23, 34, 123456),
diff --git a/python/pyarrow/tests/test_convert_pandas.py
b/python/pyarrow/tests/test_convert_pandas.py
index 8360dae54..e3f77c944 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -522,6 +522,16 @@ def test_timestamps_with_timezone(self):
self._check_pandas_roundtrip(df)
+ def test_datetime64_to_date32(self):
+ # ARROW-1718
+ arr = pa.array([date(2017, 10, 23), None])
+ c = pa.Column.from_array("d", arr)
+ s = c.to_pandas()
+
+ arr2 = pa.Array.from_pandas(s, type=pa.date32())
+
+ assert arr2.equals(arr.cast('date32'))
+
def test_date_infer(self):
df = pd.DataFrame({
'date': [date(2000, 1, 1),
@@ -981,6 +991,7 @@ def test_numpy_datetime64_columns(self):
dtype='datetime64[s]')
self._check_array_from_pandas_roundtrip(datetime64_s)
+ def test_numpy_datetime64_day_unit(self):
datetime64_d = np.array([
'2007-07-13',
None,
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to this message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> [Python] Implement casts from timestamp to date32/date64 and support in
> Array.from_pandas
> -----------------------------------------------------------------------------------------
>
> Key: ARROW-1718
> URL: https://issues.apache.org/jira/browse/ARROW-1718
> Project: Apache Arrow
> Issue Type: New Feature
> Components: Python
> Reporter: Bryan Cutler
> Assignee: Wes McKinney
> Labels: pull-request-available
> Fix For: 0.8.0
>
>
> When calling {{Array.from_pandas}} with a pandas.Series of dates and
> specifying the desired pyarrow type, an error occurs. If the type is not
> specified then {{from_pandas}} will interpret the data as a timestamp type.
> {code}
> import pandas as pd
> import pyarrow as pa
> import datetime
> arr = pa.array([datetime.date(2017, 10, 23)])
> c = pa.Column.from_array("d", arr)
> s = c.to_pandas()
> print(s)
> # 0 2017-10-23
> # Name: d, dtype: datetime64[ns]
> result = pa.Array.from_pandas(s, type=pa.date32())
> print(result)
> """
> Traceback (most recent call last):
> File "<stdin>", line 1, in <module>
> File "pyarrow/array.pxi", line 295, in pyarrow.lib.Array.__repr__
> (/home/bryan/git/arrow/python/build/temp.linux-x86_64-2.7/lib.cxx:26221)
> File
> "/home/bryan/.local/lib/python2.7/site-packages/pyarrow-0.7.2.dev21+ng028f2cd-py2.7-linux-x86_64.egg/pyarrow/formatting.py",
> line 28, in array_format
> values.append(value_format(x, 0))
> File
> "/home/bryan/.local/lib/python2.7/site-packages/pyarrow-0.7.2.dev21+ng028f2cd-py2.7-linux-x86_64.egg/pyarrow/formatting.py",
> line 49, in value_format
> return repr(x)
> File "pyarrow/scalar.pxi", line 63, in pyarrow.lib.ArrayValue.__repr__
> (/home/bryan/git/arrow/python/build/temp.linux-x86_64-2.7/lib.cxx:19535)
> File "pyarrow/scalar.pxi", line 137, in pyarrow.lib.Date32Value.as_py
> (/home/bryan/git/arrow/python/build/temp.linux-x86_64-2.7/lib.cxx:20368)
> ValueError: year is out of range
> """
> {code}
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)