Copilot commented on code in PR #49660:
URL: https://github.com/apache/arrow/pull/49660#discussion_r3054228818


##########
cpp/src/arrow/vendored/base64.cpp:
##########
@@ -93,18 +90,44 @@ std::string base64_encode(std::string_view 
string_to_encode) {
   return base64_encode(bytes_to_encode, in_len);
 }
 
-std::string base64_decode(std::string_view encoded_string) {
+Result<std::string> base64_decode(std::string_view encoded_string) {
   size_t in_len = encoded_string.size();
   int i = 0;
   int j = 0;
   int in_ = 0;
   unsigned char char_array_4[4], char_array_3[3];
   std::string ret;
 
-  while (in_len-- && ( encoded_string[in_] != '=') && 
is_base64(encoded_string[in_])) {
-    char_array_4[i++] = encoded_string[in_]; in_++;
-    if (i ==4) {
-      for (i = 0; i <4; i++)
+  if (encoded_string.size() % 4 != 0) {
+    return Status::Invalid("Invalid base64 input: length is not a multiple of 
4");
+  }
+
+  size_t padding_start = encoded_string.find('=');
+  if (padding_start != std::string_view::npos) {
+    size_t padding_count = encoded_string.size() - padding_start;
+    if (padding_count > 2) {
+      return Status::Invalid("Invalid base64 input: too many padding 
characters");
+    }
+
+    for (size_t i = padding_start; i < encoded_string.size(); ++i) {
+      if (encoded_string[i] != '=') {
+        return Status::Invalid("Invalid base64 input: padding characters must 
be at the end");
+      }
+    }
+  }
+
+  while (in_len-- && encoded_string[in_] != '=') {
+    unsigned char c = encoded_string[in_];
+
+    if (base64_chars.find(c) == std::string::npos) {
+      return Status::Invalid("Invalid base64 input: contains non-base64 byte 
at position " + std::to_string(in_));
+    }

Review Comment:
   The validation loop can exit with `i == 2` for inputs with `==` padding 
(e.g. "Zg=="). In the trailing partial-quantum handling later in this function, 
`char_array_3[1]` is computed using `char_array_4[2]` even when `i == 2`, which 
reads an uninitialized stack value (undefined behavior). Consider 
zero-initializing the remaining `char_array_4` slots before computing 
`char_array_3`, or only computing the bytes that will actually be appended 
based on `i`.



##########
cpp/src/arrow/util/string_test.cc:
##########
@@ -238,6 +239,48 @@ TEST(ToChars, FloatingPoint) {
   }
 }
 
+TEST(Base64DecodeTest, ValidInputs) {
+  ASSERT_OK_AND_ASSIGN(auto two_paddings, arrow::util::base64_decode("Zg=="));
+  EXPECT_EQ(two_paddings, "f");
+
+  ASSERT_OK_AND_ASSIGN(auto one_padding, arrow::util::base64_decode("Zm8="));
+  EXPECT_EQ(one_padding, "fo");
+
+  ASSERT_OK_AND_ASSIGN(auto no_padding, arrow::util::base64_decode("Zm9v"));
+  EXPECT_EQ(no_padding, "foo");
+
+  ASSERT_OK_AND_ASSIGN(auto single_char, arrow::util::base64_decode("TQ=="));
+  EXPECT_EQ(single_char, "M");
+}
+
+TEST(Base64DecodeTest, InvalidLength) {
+  ASSERT_RAISES(Invalid, arrow::util::base64_decode("abc"));
+  ASSERT_RAISES(Invalid, arrow::util::base64_decode("abcde"));
+}
+
+TEST(Base64DecodeTest, InvalidCharacters) {
+  ASSERT_RAISES(Invalid, arrow::util::base64_decode("ab$="));
+}

Review Comment:
   These tests assert that invalid input raises `Invalid`, but the PR 
description/user-facing notes say invalid base64 should return an empty string. 
Please make the tests and the documented public behavior agree: either update 
the implementation/headers (and these tests) to return "" on invalid input, or 
update the PR description to reflect the new error-returning API.



##########
cpp/src/arrow/vendored/base64.cpp:
##########
@@ -93,18 +90,44 @@ std::string base64_encode(std::string_view 
string_to_encode) {
   return base64_encode(bytes_to_encode, in_len);
 }
 
-std::string base64_decode(std::string_view encoded_string) {
+Result<std::string> base64_decode(std::string_view encoded_string) {
   size_t in_len = encoded_string.size();
   int i = 0;
   int j = 0;
   int in_ = 0;
   unsigned char char_array_4[4], char_array_3[3];
   std::string ret;
 
-  while (in_len-- && ( encoded_string[in_] != '=') && 
is_base64(encoded_string[in_])) {
-    char_array_4[i++] = encoded_string[in_]; in_++;
-    if (i ==4) {
-      for (i = 0; i <4; i++)
+  if (encoded_string.size() % 4 != 0) {
+    return Status::Invalid("Invalid base64 input: length is not a multiple of 
4");
+  }

Review Comment:
   The PR description says invalid input should "return an empty string", but 
the implementation now returns `Status::Invalid(...)` (and the header signature 
is `Result<std::string>`). Either update the PR description/user-facing notes 
to reflect the new error-reporting API, or adjust the implementation to match 
the documented behavior (e.g. preserve the `std::string` API and return "" on 
invalid input).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to