[GitHub] [arrow] maartenbreddels commented on a change in pull request #7449: ARROW-9133: [C++] Add utf8_upper and utf8_lower

GitBox Mon, 29 Jun 2020 05:39:42 -0700


maartenbreddels commented on a change in pull request #7449:
URL: https://github.com/apache/arrow/pull/7449#discussion_r446938183




##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -68,6 +76,64 @@ TYPED_TEST(TestStringKernels, AsciiLower) {
                    this->string_type(), "[\"aaazzæÆ&\", null, \"\", \"bbb\"]");
 }
 
+TEST(TestStringKernels, Utf8Upper32bitGrowth) {
+  std::string str(0xffff, 'a');
+  arrow::StringBuilder builder;
+  // 0x7fff * 0xffff is the max a 32 bit string array can hold
+  // since the utf8_upper kernel can grow it by 3/2, the max we should accept 
is is
+  // 0x7fff * 0xffff * 2/3 = 0x5555 * 0xffff, so this should give us a 
CapacityError
+  for (int64_t i = 0; i < 0x5556; i++) {
+    ASSERT_OK(builder.Append(str));
+  }
+  std::shared_ptr<arrow::Array> array;
+  arrow::Status st = builder.Finish(&array);
+  const FunctionOptions* options = nullptr;
+  EXPECT_RAISES_WITH_MESSAGE_THAT(CapacityError,
+                                  testing::HasSubstr("Result might not fit"),
+                                  CallFunction("utf8_upper", {array}, 
options));
+}
+
+TYPED_TEST(TestStringKernels, Utf8Upper) {
+  this->CheckUnary("utf8_upper", "[\"aAazZæÆ&\", null, \"\", \"b\"]", 
this->string_type(),
+                   "[\"AAAZZÆÆ&\", null, \"\", \"B\"]");
+
+  // test varying encoding lenghts and thus changing indices/offsets
+  this->CheckUnary("utf8_upper", "[\"ɑɽⱤoW\", null, \"ıI\", \"b\"]", 
this->string_type(),
+                   "[\"ⱭⱤⱤOW\", null, \"II\", \"B\"]");
+
+  // ῦ to Υ͂ not supported
+  // this->CheckUnary("utf8_upper", "[\"ῦɐɜʞȿ\"]", this->string_type(),
+  // "[\"Υ͂ⱯꞫꞰⱾ\"]");
+
+  // test maximum buffer growth
+  this->CheckUnary("utf8_upper", "[\"ɑɑɑɑ\"]", this->string_type(), 
"[\"ⱭⱭⱭⱭ\"]");
+
+  // Test replacing invalid data by ? (ὖ == \xe1\xbd\x96)
+  this->CheckUnary("utf8_upper", "[\"ɑa\xFFɑ\", \"ɽ\xe1\xbdɽaa\"]", 
this->string_type(),
+                   "[\"ⱭA?Ɑ\", \"Ɽ?ⱤAA\"]");
+}
+
+TYPED_TEST(TestStringKernels, Utf8Lower) {
+  this->CheckUnary("utf8_lower", "[\"aAazZæÆ&\", null, \"\", \"b\"]", 
this->string_type(),
+                   "[\"aaazzææ&\", null, \"\", \"b\"]");
+
+  // test varying encoding lenghts and thus changing indices/offsets
+  this->CheckUnary("utf8_lower", "[\"ⱭɽⱤoW\", null, \"ıI\", \"B\"]", 
this->string_type(),
+                   "[\"ɑɽɽow\", null, \"ıi\", \"b\"]");
+
+  // ῦ to Υ͂ is not supported, but in principle the reverse is, but it would 
need
+  // normalization
+  // this->CheckUnary("utf8_lower", "[\"Υ͂ⱯꞫꞰⱾ\"]", this->string_type(),
+  // "[\"ῦɐɜʞȿ\"]");
+
+  // test maximum buffer growth
+  this->CheckUnary("utf8_lower", "[\"ȺȺȺȺ\"]", this->string_type(), 
"[\"ⱥⱥⱥⱥ\"]");
+
+  // Test replacing invalid data by ? (ὖ == \xe1\xbd\x96)

Review comment:
       I added a truncated encoded byte string of that, I'll clarity in the 
comment.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] maartenbreddels commented on a change in pull request #7449: ARROW-9133: [C++] Add utf8_upper and utf8_lower

Reply via email to