Re: [PR] GH-49614: [C++] Fix silent truncation in base64_decode on invalid input [arrow]

via GitHub Wed, 08 Apr 2026 06:54:19 -0700


Copilot commented on code in PR #49660:
URL: https://github.com/apache/arrow/pull/49660#discussion_r3051805401



##########
cpp/src/arrow/util/string_test.cc:
##########
@@ -232,12 +233,79 @@ TEST(ToChars, FloatingPoint) {
     // to std::to_string which may make ad hoc formatting choices, so we cannot
     // really test much about the result.
     auto result = ToChars(0.0f);
-    ASSERT_TRUE(result.starts_with("0")) << result;
+    ASSERT_TRUE(result.rfind("0", 0) == 0) << result;
     result = ToChars(0.25);
-    ASSERT_TRUE(result.starts_with("0.25")) << result;
+    ASSERT_TRUE(result.rfind("0.25", 0) == 0) << result;
   }
 }
 
+TEST(Base64DecodeTest, ValidInputs) {
+  auto r1 = arrow::util::base64_decode("Zg==");
+  ASSERT_TRUE(r1.ok());
+  EXPECT_EQ(r1.ValueOrDie(), "f");
+  auto r2 = arrow::util::base64_decode("Zm8=");
+  ASSERT_TRUE(r2.ok());
+  EXPECT_EQ(r2.ValueOrDie(), "fo");
+  auto r3 = arrow::util::base64_decode("Zm9v");
+  ASSERT_TRUE(r3.ok());
+  EXPECT_EQ(r3.ValueOrDie(), "foo");
+  auto r4 = arrow::util::base64_decode("aGVsbG8gd29ybGQ=");
+  ASSERT_TRUE(r4.ok());
+  EXPECT_EQ(r4.ValueOrDie(), "hello world");
+}
+
+TEST(Base64DecodeTest, InvalidLength) {
+  auto r1 = arrow::util::base64_decode("abc");
+  ASSERT_FALSE(r1.ok());
+  auto r2 = arrow::util::base64_decode("abcde");
+  ASSERT_FALSE(r2.ok());
+}
+
+TEST(Base64DecodeTest, InvalidCharacters) {
+  auto r1 = arrow::util::base64_decode("ab$=");
+  ASSERT_FALSE(r1.ok());
+  auto r2 = arrow::util::base64_decode("Zm9v*");
+  ASSERT_FALSE(r2.ok());
+  auto r3 = arrow::util::base64_decode("abcd$AAA");
+  ASSERT_FALSE(r3.ok());
+}
+
+TEST(Base64DecodeTest, InvalidPadding) {
+  auto r1 = arrow::util::base64_decode("ab=c");
+  ASSERT_FALSE(r1.ok());
+  auto r2 = arrow::util::base64_decode("abc===");
+  ASSERT_FALSE(r2.ok());
+  auto r3 = arrow::util::base64_decode("abcd=AAA");
+  ASSERT_FALSE(r3.ok());
+  auto r4 = arrow::util::base64_decode("Zm=9v");
+  ASSERT_FALSE(r4.ok());
+}
+
+TEST(Base64DecodeTest, EdgeCases) {
+  auto r1 = arrow::util::base64_decode("====");
+  ASSERT_FALSE(r1.ok());
+  auto r2 = arrow::util::base64_decode("TQ==");
+  ASSERT_TRUE(r2.ok());
+  EXPECT_EQ(r2.ValueOrDie(), "M");
+}
+
+TEST(Base64DecodeTest, EmptyInput) {
+  auto r = arrow::util::base64_decode("");
+  ASSERT_TRUE(r.ok());
+  EXPECT_EQ(r.ValueOrDie(), "");
+}
+
+TEST(Base64DecodeTest, NonAsciiInput) {
+  std::string input = std::string("abcd") + char(0xFF) + "==";

Review Comment:
   `NonAsciiInput` is currently guaranteed to fail the new `size % 4 == 0` 
validation (the constructed string length is 7), so it doesn't actually 
exercise the non-ASCII character validation path. Adjust the test input to a 
length that is a multiple of 4 so it fails specifically due to the non-base64 
byte (e.g., keep padding rules valid and include the 0xFF byte).
   ```suggestion
     std::string input = std::string("abc") + static_cast<char>(0xFF);
   ```



##########
cpp/src/arrow/util/base64.h:
##########
@@ -29,7 +31,7 @@ ARROW_EXPORT
 std::string base64_encode(std::string_view s);
 
 ARROW_EXPORT
-std::string base64_decode(std::string_view s);
+arrow::Result<std::string> base64_decode(std::string_view s);

Review Comment:
   `base64_decode` was changed from returning `std::string` to 
`arrow::Result<std::string>`, but there are existing call sites in the repo 
that still treat it as a `std::string` (e.g., constructing streams, assigning 
to std::string). As-is, this is an API/ABI breaking change and will not compile 
unless all callers are updated or a backwards-compatible overload/wrapper is 
kept.
   ```suggestion
   std::string base64_decode(std::string_view s);
   ```



##########
cpp/src/arrow/vendored/base64.cpp:
##########
@@ -93,18 +97,51 @@ std::string base64_encode(std::string_view 
string_to_encode) {
   return base64_encode(bytes_to_encode, in_len);
 }
 
-std::string base64_decode(std::string_view encoded_string) {
+arrow::Result<std::string> base64_decode(std::string_view encoded_string) {
   size_t in_len = encoded_string.size();
   int i = 0;
   int j = 0;
   int in_ = 0;
   unsigned char char_array_4[4], char_array_3[3];
   std::string ret;
 
-  while (in_len-- && ( encoded_string[in_] != '=') && 
is_base64(encoded_string[in_])) {
+  static const std::string base64_chars =
+               "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+               "abcdefghijklmnopqrstuvwxyz"
+               "0123456789+/";
+  
+  auto is_base64 = [](unsigned char c) -> bool {
+    return (std::isalnum(c) || (c == '+') || (c == '/'));
+  };
+
+  if (encoded_string.size() % 4 != 0) {
+    return arrow::Status::Invalid("Invalid base64 input: length is not a 
multiple of 4");
+  }
+
+  size_t padding_start = encoded_string.find('=');
+  if (padding_start != std::string::npos) {

Review Comment:
   `padding_start` is computed via `encoded_string.find('=')` on a 
`std::string_view`, but compared against `std::string::npos`. This works but is 
inconsistent and easy to misread; prefer comparing against 
`std::string_view::npos` when operating on a string_view.
   ```suggestion
     if (padding_start != std::string_view::npos) {
   ```



##########
cpp/src/arrow/vendored/base64.cpp:
##########
@@ -93,18 +97,51 @@ std::string base64_encode(std::string_view 
string_to_encode) {
   return base64_encode(bytes_to_encode, in_len);
 }
 
-std::string base64_decode(std::string_view encoded_string) {
+arrow::Result<std::string> base64_decode(std::string_view encoded_string) {
   size_t in_len = encoded_string.size();
   int i = 0;
   int j = 0;
   int in_ = 0;
   unsigned char char_array_4[4], char_array_3[3];
   std::string ret;
 
-  while (in_len-- && ( encoded_string[in_] != '=') && 
is_base64(encoded_string[in_])) {
+  static const std::string base64_chars =
+               "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+               "abcdefghijklmnopqrstuvwxyz"
+               "0123456789+/";
+  
+  auto is_base64 = [](unsigned char c) -> bool {
+    return (std::isalnum(c) || (c == '+') || (c == '/'));
+  };
+

Review Comment:
   Inside `base64_decode`, `base64_chars` and `is_base64` are redefined even 
though the same `base64_chars` and `is_base64` already exist at file scope 
above. This introduces name shadowing (potentially tripping -Wshadow) and also 
leaves the file-scope `is_base64` unused. Prefer reusing the existing helpers 
(or refactor to a single definition) to avoid warnings and duplication.
   ```suggestion
   
   ```



##########
cpp/src/arrow/util/base64.h:
##########
@@ -29,7 +31,7 @@ ARROW_EXPORT
 std::string base64_encode(std::string_view s);
 
 ARROW_EXPORT
-std::string base64_decode(std::string_view s);
+arrow::Result<std::string> base64_decode(std::string_view s);
 

Review Comment:
   This change conflicts with the PR description/user-facing behavior: the 
description says invalid input should return an empty string, but the new 
signature returns `Result<std::string>` and the implementation returns 
`Status::Invalid(...)` on malformed inputs. Please align the implementation/API 
and the stated behavior (either update the description and downstream 
expectations, or preserve the old `std::string` API that returns "" on invalid 
input).



##########
cpp/src/arrow/vendored/base64.cpp:
##########
@@ -93,18 +97,51 @@ std::string base64_encode(std::string_view 
string_to_encode) {
   return base64_encode(bytes_to_encode, in_len);
 }
 
-std::string base64_decode(std::string_view encoded_string) {
+arrow::Result<std::string> base64_decode(std::string_view encoded_string) {
   size_t in_len = encoded_string.size();
   int i = 0;
   int j = 0;
   int in_ = 0;
   unsigned char char_array_4[4], char_array_3[3];
   std::string ret;
 
-  while (in_len-- && ( encoded_string[in_] != '=') && 
is_base64(encoded_string[in_])) {
+  static const std::string base64_chars =
+               "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+               "abcdefghijklmnopqrstuvwxyz"
+               "0123456789+/";
+  
+  auto is_base64 = [](unsigned char c) -> bool {
+    return (std::isalnum(c) || (c == '+') || (c == '/'));
+  };
+
+  if (encoded_string.size() % 4 != 0) {
+    return arrow::Status::Invalid("Invalid base64 input: length is not a 
multiple of 4");
+  }
+
+  size_t padding_start = encoded_string.find('=');
+  if (padding_start != std::string::npos) {
+    for (size_t k = padding_start; k < encoded_string.size(); ++k) {
+      if (encoded_string[k] != '=') {
+        return arrow::Status::Invalid("Invalid base64 input: padding character 
'=' found at invalid position");
+      }
+    }
+
+    size_t padding_count = encoded_string.size() - padding_start;
+    if (padding_count > 2) {
+      return arrow::Status::Invalid("Invalid base64 input: too many padding 
characters");
+    }
+  }
+
+    for (char c : encoded_string) {
+      if (c != '=' && !is_base64(c)) {
+        return arrow::Status::Invalid("Invalid base64 input: contains 
non-base64 character '" + std::string(1, c) + "'");
+      }
+    }

Review Comment:
   The invalid-character error message embeds the raw byte into the Status 
string (`std::string(1, c)`). For non-printable / non-UTF8 bytes (your tests 
include 0xFF), this can produce unreadable or problematic error 
messages/logging. Consider reporting the offending byte as an escaped/hex value 
(and/or the index) instead of inserting it verbatim.
   ```suggestion
     auto format_hex_byte = [](unsigned char c) -> std::string {
       static const char hex_digits[] = "0123456789ABCDEF";
       std::string hex = "0x00";
       hex[2] = hex_digits[(c >> 4) & 0x0F];
       hex[3] = hex_digits[c & 0x0F];
       return hex;
     };
   
     for (size_t k = 0; k < encoded_string.size(); ++k) {
       unsigned char c = static_cast<unsigned char>(encoded_string[k]);
       if (c != '=' && !is_base64(c)) {
         return arrow::Status::Invalid(
             "Invalid base64 input: contains non-base64 byte " + 
format_hex_byte(c) +
             " at position " + std::to_string(k));
       }
     }
   ```



##########
cpp/src/arrow/vendored/base64.cpp:
##########
@@ -30,7 +30,11 @@
 */
 
 #include "arrow/util/base64.h"
+#include "arrow/util/logging.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
 #include <iostream>
+#include <cctype>

Review Comment:
   `arrow/util/logging.h` is newly included here but isn't used anywhere in 
this file. Please remove the unused include to avoid unnecessary dependencies 
and potential lint/Werror failures.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] GH-49614: [C++] Fix silent truncation in base64_decode on invalid input [arrow]

Reply via email to