[GitHub] [arrow] projjal commented on a change in pull request #10023: ARROW-12378: [C++][Gandiva] Implement castVARBINARY functions

GitBox Wed, 12 May 2021 05:31:35 -0700


projjal commented on a change in pull request #10023:
URL: https://github.com/apache/arrow/pull/10023#discussion_r630996233




##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -533,89 +533,102 @@ const char* castVARCHAR_bool_int64(gdv_int64 context, 
gdv_boolean value,
   return out;
 }
 
-// Truncates the string to given length
-FORCE_INLINE
-const char* castVARCHAR_utf8_int64(gdv_int64 context, const char* data,
-                                   gdv_int32 data_len, int64_t out_len,
-                                   int32_t* out_length) {
-  int32_t len = static_cast<int32_t>(out_len);
-
-  if (len < 0) {
-    gdv_fn_context_set_error_msg(context, "Output buffer length can't be 
negative");
-    *out_length = 0;
-    return "";
-  }
-
-  if (len >= data_len || len == 0) {
-    *out_length = data_len;
-    return data;
-  }
-
-  int32_t remaining = len;
-  int32_t index = 0;
-  bool is_multibyte = false;
-  do {
-    // In utf8, MSB of a single byte unicode char is always 0,
-    // whereas for a multibyte character the MSB of each byte is 1.
-    // So for a single byte char, a bitwise-and with x80 (10000000) will be 0
-    // and it won't be 0 for bytes of a multibyte char
-    char* data_ptr = const_cast<char*>(data);
-
-    // we advance byte by byte till the 8 byte boundary then advance 8 bytes 
at a time
-    auto num_bytes = reinterpret_cast<uintptr_t>(data_ptr) & 0x07;
-    num_bytes = (8 - num_bytes) & 0x07;
-    while (num_bytes > 0) {
-      uint8_t* ptr = reinterpret_cast<uint8_t*>(data_ptr + index);
-      if ((*ptr & 0x80) != 0) {
-        is_multibyte = true;
-        break;
-      }
-      index++;
-      remaining--;
-      num_bytes--;
-    }
-    if (is_multibyte) break;
-    while (remaining >= 8) {
-      uint64_t* ptr = reinterpret_cast<uint64_t*>(data_ptr + index);
-      if ((*ptr & 0x8080808080808080) != 0) {
-        is_multibyte = true;
-        break;
-      }
-      index += 8;
-      remaining -= 8;
-    }
-    if (is_multibyte) break;
-    if (remaining >= 4) {
-      uint32_t* ptr = reinterpret_cast<uint32_t*>(data_ptr + index);
-      if ((*ptr & 0x80808080) != 0) break;
-      index += 4;
-      remaining -= 4;
-    }
-    while (remaining > 0) {
-      uint8_t* ptr = reinterpret_cast<uint8_t*>(data_ptr + index);
-      if ((*ptr & 0x80) != 0) {
-        is_multibyte = true;
-        break;
-      }
-      index++;
-      remaining--;
-    }
-    if (is_multibyte) break;
-    // reached here; all are single byte characters
-    *out_length = len;
-    return data;
-  } while (false);
-
-  // detected multibyte utf8 characters; slow path
-  int32_t byte_pos = utf8_byte_pos(context, data + index, data_len - index, 
len - index);
-  if (byte_pos < 0) {
-    *out_length = 0;
-    return "";
-  }
-
-  *out_length = index + byte_pos;
-  return data;
-}
+// Add functions for castVARBINARY for utf8 and binary
+#define CAST_FROM_STRING_AND_BINARY(OUTPUT, TYPE_NAME)                         
        \
+  FORCE_INLINE                                                                 
        \
+  const char* cast##OUTPUT##_##TYPE_NAME##_int64(gdv_int64 context, const 
char* data,  \
+                                                 gdv_int32 data_len, int64_t 
out_len,  \
+                                                 int32_t* out_length) {        
        \
+    int32_t len = static_cast<int32_t>(out_len);                               
        \
+                                                                               
        \
+    if (len < 0) {                                                             
        \
+      gdv_fn_context_set_error_msg(context, "Output buffer length can't be 
negative"); \
+      *out_length = 0;                                                         
        \
+      return "";                                                               
        \
+    }                                                                          
        \
+                                                                               
        \
+    if (len >= data_len || len == 0) {                                         
        \
+      *out_length = data_len;                                                  
        \
+      return data;                                                             
        \
+    }                                                                          
        \
+                                                                               
        \
+    int32_t remaining = len;                                                   
        \
+    int32_t index = 0;                                                         
        \
+    bool is_multibyte = false;                                                 
        \
+    do {                                                                       
        \
+      /* In utf8, MSB of a single byte unicode char is always 0, */            
        \
+      /* whereas for a multibyte character the MSB of each byte is 1. */       
        \
+      /* So for a single byte char, a bitwise-and with x80 (10000000) will be 
0 */     \
+      /* and it won't be 0 for bytes of a multibyte char */                    
        \
+      char* data_ptr = const_cast<char*>(data);                                
        \
+                                                                               
        \
+      /* advance byte by byte till the 8 byte boundary, advance 8 bytes at a 
time */   \
+      auto num_bytes = reinterpret_cast<uintptr_t>(data_ptr) & 0x07;           
        \
+      num_bytes = (8 - num_bytes) & 0x07;                                      
        \
+      while (num_bytes > 0) {                                                  
        \
+        uint8_t* ptr = reinterpret_cast<uint8_t*>(data_ptr + index);           
        \
+        if ((*ptr & 0x80) != 0) {                                              
        \
+          is_multibyte = true;                                                 
        \
+          break;                                                               
        \
+        }                                                                      
        \
+        index++;                                                               
        \
+        remaining--;                                                           
        \
+        num_bytes--;                                                           
        \
+      }                                                                        
        \
+      if (is_multibyte) break;                                                 
        \
+      while (remaining >= 8) {                                                 
        \
+        uint64_t* ptr = reinterpret_cast<uint64_t*>(data_ptr + index);         
        \
+        if ((*ptr & 0x8080808080808080) != 0) {                                
        \
+          is_multibyte = true;                                                 
        \
+          break;                                                               
        \
+        }                                                                      
        \
+        index += 8;                                                            
        \
+        remaining -= 8;                                                        
        \
+      }                                                                        
        \
+      if (is_multibyte) {                                                      
        \
+        break;                                                                 
        \
+      }                                                                        
        \
+      if (remaining >= 4) {                                                    
        \
+        uint32_t* ptr = reinterpret_cast<uint32_t*>(data_ptr + index);         
        \
+        if ((*ptr & 0x80808080) != 0) break;                                   
        \
+        index += 4;                                                            
        \
+        remaining -= 4;                                                        
        \
+      }                                                                        
        \
+      while (remaining > 0) {                                                  
        \
+        uint8_t* ptr = reinterpret_cast<uint8_t*>(data_ptr + index);           
        \
+        if ((*ptr & 0x80) != 0) {                                              
        \
+          is_multibyte = true;                                                 
        \
+          break;                                                               
        \
+        }                                                                      
        \
+        index++;                                                               
        \
+        remaining--;                                                           
        \
+      }                                                                        
        \
+      if (is_multibyte) {                                                      
        \
+        break;                                                                 
        \
+      }                                                                        
        \
+      /* reached here; all are single byte characters */                       
        \
+      *out_length = len;                                                       
        \
+      return data;                                                             
        \
+    } while (false);                                                           
        \
+                                                                               
        \
+    /* detected multibyte utf8 characters; slow path */                        
        \
+    int32_t byte_pos =                                                         
        \
+        utf8_byte_pos(context, data + index, data_len - index, len - index);   
        \
+    if (byte_pos < 0) {                                                        
        \
+      *out_length = 0;                                                         
        \
+      return "";                                                               
        \
+    }                                                                          
        \
+                                                                               
        \
+    *out_length = index + byte_pos;                                            
        \
+    return data;                                                               
        \
+  }
+
+CAST_FROM_STRING_AND_BINARY(VARCHAR, utf8)

Review comment:
       It looks like you can't reuse the same function here: the VARCHAR 
variant takes a character count, which requires UTF-8 decoding.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] projjal commented on a change in pull request #10023: ARROW-12378: [C++][Gandiva] Implement castVARBINARY functions

Reply via email to