maartenbreddels commented on a change in pull request #7449:
URL: https://github.com/apache/arrow/pull/7449#discussion_r446938183



##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -68,6 +76,64 @@ TYPED_TEST(TestStringKernels, AsciiLower) {
                    this->string_type(), "[\"aaazzæÆ&\", null, \"\", \"bbb\"]");
 }
 
+TEST(TestStringKernels, Utf8Upper32bitGrowth) {
+  std::string str(0xffff, 'a');
+  arrow::StringBuilder builder;
+  // 0x7fff * 0xffff is the max a 32 bit string array can hold
+  // since the utf8_upper kernel can grow it by 3/2, the max we should accept 
is is
+  // 0x7fff * 0xffff * 2/3 = 0x5555 * 0xffff, so this should give us a 
CapacityError
+  for (int64_t i = 0; i < 0x5556; i++) {
+    ASSERT_OK(builder.Append(str));
+  }
+  std::shared_ptr<arrow::Array> array;
+  arrow::Status st = builder.Finish(&array);
+  const FunctionOptions* options = nullptr;
+  EXPECT_RAISES_WITH_MESSAGE_THAT(CapacityError,
+                                  testing::HasSubstr("Result might not fit"),
+                                  CallFunction("utf8_upper", {array}, 
options));
+}
+
+TYPED_TEST(TestStringKernels, Utf8Upper) {
+  this->CheckUnary("utf8_upper", "[\"aAazZæÆ&\", null, \"\", \"b\"]", 
this->string_type(),
+                   "[\"AAAZZÆÆ&\", null, \"\", \"B\"]");
+
+  // test varying encoding lenghts and thus changing indices/offsets
+  this->CheckUnary("utf8_upper", "[\"ɑɽⱤoW\", null, \"ıI\", \"b\"]", 
this->string_type(),
+                   "[\"ⱭⱤⱤOW\", null, \"II\", \"B\"]");
+
+  // ῦ to Υ͂ not supported
+  // this->CheckUnary("utf8_upper", "[\"ῦɐɜʞȿ\"]", this->string_type(),
+  // "[\"Υ͂ⱯꞫꞰⱾ\"]");
+
+  // test maximum buffer growth
+  this->CheckUnary("utf8_upper", "[\"ɑɑɑɑ\"]", this->string_type(), 
"[\"ⱭⱭⱭⱭ\"]");
+
+  // Test replacing invalid data by ? (ὖ == \xe1\xbd\x96)
+  this->CheckUnary("utf8_upper", "[\"ɑa\xFFɑ\", \"ɽ\xe1\xbdɽaa\"]", 
this->string_type(),
+                   "[\"ⱭA?Ɑ\", \"Ɽ?ⱤAA\"]");
+}
+
+TYPED_TEST(TestStringKernels, Utf8Lower) {
+  this->CheckUnary("utf8_lower", "[\"aAazZæÆ&\", null, \"\", \"b\"]", 
this->string_type(),
+                   "[\"aaazzææ&\", null, \"\", \"b\"]");
+
+  // test varying encoding lenghts and thus changing indices/offsets
+  this->CheckUnary("utf8_lower", "[\"ⱭɽⱤoW\", null, \"ıI\", \"B\"]", 
this->string_type(),
+                   "[\"ɑɽɽow\", null, \"ıi\", \"b\"]");
+
+  // ῦ to Υ͂ is not supported, but in principle the reverse is, but it would 
need
+  // normalization
+  // this->CheckUnary("utf8_lower", "[\"Υ͂ⱯꞫꞰⱾ\"]", this->string_type(),
+  // "[\"ῦɐɜʞȿ\"]");
+
+  // test maximum buffer growth
+  this->CheckUnary("utf8_lower", "[\"ȺȺȺȺ\"]", this->string_type(), 
"[\"ⱥⱥⱥⱥ\"]");
+
+  // Test replacing invalid data by ? (ὖ == \xe1\xbd\x96)

Review comment:
       I added a truncated encoded byte string of that; I'll clarify this in the 
comment.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to