[arrow-nanoarrow] branch main updated: fix(r): Warn for possibly out of range int64 -> double conversions (#294)

paleolimbot Fri, 08 Sep 2023 09:39:09 -0700

This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git



The following commit(s) were added to refs/heads/main by this push:
     new 6ea9d4b  fix(r): Warn for possibly out of range int64 -> double 
conversions (#294)
6ea9d4b is described below

commit 6ea9d4b5724ff1c94e05bdf41c20404223392697
Author: Dewey Dunnington <[email protected]>
AuthorDate: Fri Sep 8 13:38:23 2023 -0300

    fix(r): Warn for possibly out of range int64 -> double conversions (#294)
    
    It looks like I had left this as a TODO after my initial push to get
    most conversions implemented. The idea is to warn when converting very
    large int64/uint64 values to double. This an important case to warn for
    because double is the default conversion from int64 and values that
    won't (or might not) roundtrip back to int64 are important to warn
    about. See #293 implementing a safer conversion.
    
    ``` r
    library(nanoarrow)
    
    array <- as_nanoarrow_array(2^54, schema = na_int64())
    convert_array(array, double())
    #> Warning in convert_array.default(array, double()): 1 value(s) may have 
incurred
    #> loss of precision in conversion to double()
    #> [1] 1.80144e+16
    ```
    
    <sup>Created on 2023-09-01 with [reprex
    v2.0.2](https://reprex.tidyverse.org)</sup>
---
 r/src/materialize_dbl.h               | 37 ++++++++++++++++++++++++++++++-----
 r/tests/testthat/test-convert-array.R |  8 ++++++++
 2 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/r/src/materialize_dbl.h b/r/src/materialize_dbl.h
index 3bebe47..9269d88 100644
--- a/r/src/materialize_dbl.h
+++ b/r/src/materialize_dbl.h
@@ -24,6 +24,9 @@
 #include "materialize_common.h"
 #include "nanoarrow.h"
 
+// bit64::as.integer64(2^53)
+#define MAX_DBL_AS_INTEGER 9007199254740992
+
 static inline int nanoarrow_materialize_dbl(struct RConverter* converter) {
   if (converter->src.array_view->array->dictionary != NULL) {
     return ENOTSUP;
@@ -32,6 +35,7 @@ static inline int nanoarrow_materialize_dbl(struct 
RConverter* converter) {
   struct ArrayViewSlice* src = &converter->src;
   struct VectorSlice* dst = &converter->dst;
   double* result = REAL(dst->vec_sexp);
+  int64_t n_bad_values = 0;
 
   // True for all the types supported here
   const uint8_t* is_valid = src->array_view->buffer_views[0].data.as_uint8;
@@ -65,12 +69,8 @@ static inline int nanoarrow_materialize_dbl(struct 
RConverter* converter) {
     case NANOARROW_TYPE_UINT16:
     case NANOARROW_TYPE_INT32:
     case NANOARROW_TYPE_UINT32:
-    case NANOARROW_TYPE_INT64:
-    case NANOARROW_TYPE_UINT64:
     case NANOARROW_TYPE_FLOAT:
-      // TODO: implement bounds check for int64 and uint64, but instead
-      // of setting to NA, just warn (because sequential values might not
-      // roundtrip above 2^51 ish)
+      // No need to bounds check these types
       for (R_xlen_t i = 0; i < dst->length; i++) {
         result[dst->offset + i] =
             ArrowArrayViewGetDoubleUnsafe(src->array_view, src->offset + i);
@@ -86,10 +86,37 @@ static inline int nanoarrow_materialize_dbl(struct 
RConverter* converter) {
       }
       break;
 
+    case NANOARROW_TYPE_INT64:
+    case NANOARROW_TYPE_UINT64:
+      for (R_xlen_t i = 0; i < dst->length; i++) {
+        double value = ArrowArrayViewGetDoubleUnsafe(src->array_view, 
src->offset + i);
+        if (value > MAX_DBL_AS_INTEGER || value < -MAX_DBL_AS_INTEGER) {
+          n_bad_values++;
+        }
+
+        result[dst->offset + i] = value;
+      }
+
+      // Set any nulls to NA_REAL
+      if (is_valid != NULL && src->array_view->array->null_count != 0) {
+        for (R_xlen_t i = 0; i < dst->length; i++) {
+          if (!ArrowBitGet(is_valid, raw_src_offset + i)) {
+            result[dst->offset + i] = NA_REAL;
+          }
+        }
+      }
+      break;
+
     default:
       return EINVAL;
   }
 
+  if (n_bad_values > 0) {
+    Rf_warning(
+        "%ld value(s) may have incurred loss of precision in conversion to 
double()",
+        (long)n_bad_values);
+  }
+
   return NANOARROW_OK;
 }
 
diff --git a/r/tests/testthat/test-convert-array.R 
b/r/tests/testthat/test-convert-array.R
index 592e58b..8d457e0 100644
--- a/r/tests/testthat/test-convert-array.R
+++ b/r/tests/testthat/test-convert-array.R
@@ -515,6 +515,14 @@ test_that("convert to vector works for dictionary<double> 
-> double()", {
   )
 })
 
+test_that("convert to vector warns for possibly invalid double()", {
+  array <- as_nanoarrow_array(2^54, schema = na_int64())
+  expect_warning(
+    convert_array(array, double()),
+    "1 value\\(s\\) may have incurred loss of precision in conversion to 
double()"
+  )
+})
+
 test_that("convert to vector errors for bad array to double()", {
   expect_error(
     convert_array(as_nanoarrow_array(letters), double()),

[arrow-nanoarrow] branch main updated: fix(r): Warn for possibly out of range int64 -> double conversions (#294)

Reply via email to