Copilot commented on code in PR #49660:
URL: https://github.com/apache/arrow/pull/49660#discussion_r3051805401
##########
cpp/src/arrow/util/string_test.cc:
##########
@@ -232,12 +233,79 @@ TEST(ToChars, FloatingPoint) {
// to std::to_string which may make ad hoc formatting choices, so we cannot
// really test much about the result.
auto result = ToChars(0.0f);
- ASSERT_TRUE(result.starts_with("0")) << result;
+ ASSERT_TRUE(result.rfind("0", 0) == 0) << result;
result = ToChars(0.25);
- ASSERT_TRUE(result.starts_with("0.25")) << result;
+ ASSERT_TRUE(result.rfind("0.25", 0) == 0) << result;
}
}
+TEST(Base64DecodeTest, ValidInputs) {
+ auto r1 = arrow::util::base64_decode("Zg==");
+ ASSERT_TRUE(r1.ok());
+ EXPECT_EQ(r1.ValueOrDie(), "f");
+ auto r2 = arrow::util::base64_decode("Zm8=");
+ ASSERT_TRUE(r2.ok());
+ EXPECT_EQ(r2.ValueOrDie(), "fo");
+ auto r3 = arrow::util::base64_decode("Zm9v");
+ ASSERT_TRUE(r3.ok());
+ EXPECT_EQ(r3.ValueOrDie(), "foo");
+ auto r4 = arrow::util::base64_decode("aGVsbG8gd29ybGQ=");
+ ASSERT_TRUE(r4.ok());
+ EXPECT_EQ(r4.ValueOrDie(), "hello world");
+}
+
+TEST(Base64DecodeTest, InvalidLength) {
+ auto r1 = arrow::util::base64_decode("abc");
+ ASSERT_FALSE(r1.ok());
+ auto r2 = arrow::util::base64_decode("abcde");
+ ASSERT_FALSE(r2.ok());
+}
+
+TEST(Base64DecodeTest, InvalidCharacters) {
+ auto r1 = arrow::util::base64_decode("ab$=");
+ ASSERT_FALSE(r1.ok());
+ auto r2 = arrow::util::base64_decode("Zm9v*");
+ ASSERT_FALSE(r2.ok());
+ auto r3 = arrow::util::base64_decode("abcd$AAA");
+ ASSERT_FALSE(r3.ok());
+}
+
+TEST(Base64DecodeTest, InvalidPadding) {
+ auto r1 = arrow::util::base64_decode("ab=c");
+ ASSERT_FALSE(r1.ok());
+ auto r2 = arrow::util::base64_decode("abc===");
+ ASSERT_FALSE(r2.ok());
+ auto r3 = arrow::util::base64_decode("abcd=AAA");
+ ASSERT_FALSE(r3.ok());
+ auto r4 = arrow::util::base64_decode("Zm=9v");
+ ASSERT_FALSE(r4.ok());
+}
+
+TEST(Base64DecodeTest, EdgeCases) {
+ auto r1 = arrow::util::base64_decode("====");
+ ASSERT_FALSE(r1.ok());
+ auto r2 = arrow::util::base64_decode("TQ==");
+ ASSERT_TRUE(r2.ok());
+ EXPECT_EQ(r2.ValueOrDie(), "M");
+}
+
+TEST(Base64DecodeTest, EmptyInput) {
+ auto r = arrow::util::base64_decode("");
+ ASSERT_TRUE(r.ok());
+ EXPECT_EQ(r.ValueOrDie(), "");
+}
+
+TEST(Base64DecodeTest, NonAsciiInput) {
+ std::string input = std::string("abcd") + char(0xFF) + "==";
Review Comment:
`NonAsciiInput` is currently guaranteed to fail the new `size % 4 == 0`
validation (the constructed string length is 7), so it doesn't actually
exercise the non-ASCII character validation path. Adjust the test input to a
length that is a multiple of 4 so it fails specifically due to the non-base64
byte (e.g., keep padding rules valid and include the 0xFF byte).
```suggestion
std::string input = std::string("abc") + static_cast<char>(0xFF);
```
##########
cpp/src/arrow/util/base64.h:
##########
@@ -29,7 +31,7 @@ ARROW_EXPORT
std::string base64_encode(std::string_view s);
ARROW_EXPORT
-std::string base64_decode(std::string_view s);
+arrow::Result<std::string> base64_decode(std::string_view s);
Review Comment:
`base64_decode` was changed from returning `std::string` to
`arrow::Result<std::string>`, but there are existing call sites in the repo
that still treat it as a `std::string` (e.g., constructing streams, assigning
to std::string). As-is, this is an API/ABI breaking change and will not compile
unless all callers are updated or a backwards-compatible overload/wrapper is
kept.
```suggestion
std::string base64_decode(std::string_view s);
```
##########
cpp/src/arrow/vendored/base64.cpp:
##########
@@ -93,18 +97,51 @@ std::string base64_encode(std::string_view
string_to_encode) {
return base64_encode(bytes_to_encode, in_len);
}
-std::string base64_decode(std::string_view encoded_string) {
+arrow::Result<std::string> base64_decode(std::string_view encoded_string) {
size_t in_len = encoded_string.size();
int i = 0;
int j = 0;
int in_ = 0;
unsigned char char_array_4[4], char_array_3[3];
std::string ret;
- while (in_len-- && ( encoded_string[in_] != '=') &&
is_base64(encoded_string[in_])) {
+ static const std::string base64_chars =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz"
+ "0123456789+/";
+
+ auto is_base64 = [](unsigned char c) -> bool {
+ return (std::isalnum(c) || (c == '+') || (c == '/'));
+ };
+
+ if (encoded_string.size() % 4 != 0) {
+ return arrow::Status::Invalid("Invalid base64 input: length is not a
multiple of 4");
+ }
+
+ size_t padding_start = encoded_string.find('=');
+ if (padding_start != std::string::npos) {
Review Comment:
`padding_start` is computed via `encoded_string.find('=')` on a
`std::string_view`, but compared against `std::string::npos`. This works but is
inconsistent and easy to misread; prefer comparing against
`std::string_view::npos` when operating on a string_view.
```suggestion
if (padding_start != std::string_view::npos) {
```
##########
cpp/src/arrow/vendored/base64.cpp:
##########
@@ -93,18 +97,51 @@ std::string base64_encode(std::string_view
string_to_encode) {
return base64_encode(bytes_to_encode, in_len);
}
-std::string base64_decode(std::string_view encoded_string) {
+arrow::Result<std::string> base64_decode(std::string_view encoded_string) {
size_t in_len = encoded_string.size();
int i = 0;
int j = 0;
int in_ = 0;
unsigned char char_array_4[4], char_array_3[3];
std::string ret;
- while (in_len-- && ( encoded_string[in_] != '=') &&
is_base64(encoded_string[in_])) {
+ static const std::string base64_chars =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz"
+ "0123456789+/";
+
+ auto is_base64 = [](unsigned char c) -> bool {
+ return (std::isalnum(c) || (c == '+') || (c == '/'));
+ };
+
Review Comment:
Inside `base64_decode`, `base64_chars` and `is_base64` are redefined even
though the same `base64_chars` and `is_base64` already exist at file scope
above. This introduces name shadowing (potentially tripping -Wshadow) and also
leaves the file-scope `is_base64` unused. Prefer reusing the existing helpers
(or refactor to a single definition) to avoid warnings and duplication.
```suggestion
```
##########
cpp/src/arrow/util/base64.h:
##########
@@ -29,7 +31,7 @@ ARROW_EXPORT
std::string base64_encode(std::string_view s);
ARROW_EXPORT
-std::string base64_decode(std::string_view s);
+arrow::Result<std::string> base64_decode(std::string_view s);
Review Comment:
This change conflicts with the PR description/user-facing behavior: the
description says invalid input should return an empty string, but the new
signature returns `Result<std::string>` and the implementation returns
`Status::Invalid(...)` on malformed inputs. Please align the implementation/API
and the stated behavior (either update the description and downstream
expectations, or preserve the old `std::string` API that returns "" on invalid
input).
##########
cpp/src/arrow/vendored/base64.cpp:
##########
@@ -93,18 +97,51 @@ std::string base64_encode(std::string_view
string_to_encode) {
return base64_encode(bytes_to_encode, in_len);
}
-std::string base64_decode(std::string_view encoded_string) {
+arrow::Result<std::string> base64_decode(std::string_view encoded_string) {
size_t in_len = encoded_string.size();
int i = 0;
int j = 0;
int in_ = 0;
unsigned char char_array_4[4], char_array_3[3];
std::string ret;
- while (in_len-- && ( encoded_string[in_] != '=') &&
is_base64(encoded_string[in_])) {
+ static const std::string base64_chars =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz"
+ "0123456789+/";
+
+ auto is_base64 = [](unsigned char c) -> bool {
+ return (std::isalnum(c) || (c == '+') || (c == '/'));
+ };
+
+ if (encoded_string.size() % 4 != 0) {
+ return arrow::Status::Invalid("Invalid base64 input: length is not a
multiple of 4");
+ }
+
+ size_t padding_start = encoded_string.find('=');
+ if (padding_start != std::string::npos) {
+ for (size_t k = padding_start; k < encoded_string.size(); ++k) {
+ if (encoded_string[k] != '=') {
+ return arrow::Status::Invalid("Invalid base64 input: padding character
'=' found at invalid position");
+ }
+ }
+
+ size_t padding_count = encoded_string.size() - padding_start;
+ if (padding_count > 2) {
+ return arrow::Status::Invalid("Invalid base64 input: too many padding
characters");
+ }
+ }
+
+ for (char c : encoded_string) {
+ if (c != '=' && !is_base64(c)) {
+ return arrow::Status::Invalid("Invalid base64 input: contains
non-base64 character '" + std::string(1, c) + "'");
+ }
+ }
Review Comment:
The invalid-character error message embeds the raw byte into the Status
string (`std::string(1, c)`). For non-printable / non-UTF8 bytes (your tests
include 0xFF), this can produce unreadable or problematic error
messages/logging. Consider reporting the offending byte as an escaped/hex value
(and/or the index) instead of inserting it verbatim.
```suggestion
auto format_hex_byte = [](unsigned char c) -> std::string {
static const char hex_digits[] = "0123456789ABCDEF";
std::string hex = "0x00";
hex[2] = hex_digits[(c >> 4) & 0x0F];
hex[3] = hex_digits[c & 0x0F];
return hex;
};
for (size_t k = 0; k < encoded_string.size(); ++k) {
unsigned char c = static_cast<unsigned char>(encoded_string[k]);
if (c != '=' && !is_base64(c)) {
return arrow::Status::Invalid(
"Invalid base64 input: contains non-base64 byte " +
format_hex_byte(c) +
" at position " + std::to_string(k));
}
}
```
##########
cpp/src/arrow/vendored/base64.cpp:
##########
@@ -30,7 +30,11 @@
*/
#include "arrow/util/base64.h"
+#include "arrow/util/logging.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
#include <iostream>
+#include <cctype>
Review Comment:
`arrow/util/logging.h` is newly included here but isn't used anywhere in
this file. Please remove the unused include to avoid unnecessary dependencies
and potential lint/Werror failures.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]