Copilot commented on code in PR #49660:
URL: https://github.com/apache/arrow/pull/49660#discussion_r3054228818
##########
cpp/src/arrow/vendored/base64.cpp:
##########
@@ -93,18 +90,44 @@ std::string base64_encode(std::string_view
string_to_encode) {
return base64_encode(bytes_to_encode, in_len);
}
-std::string base64_decode(std::string_view encoded_string) {
+Result<std::string> base64_decode(std::string_view encoded_string) {
size_t in_len = encoded_string.size();
int i = 0;
int j = 0;
int in_ = 0;
unsigned char char_array_4[4], char_array_3[3];
std::string ret;
- while (in_len-- && ( encoded_string[in_] != '=') &&
is_base64(encoded_string[in_])) {
- char_array_4[i++] = encoded_string[in_]; in_++;
- if (i ==4) {
- for (i = 0; i <4; i++)
+ if (encoded_string.size() % 4 != 0) {
+ return Status::Invalid("Invalid base64 input: length is not a multiple of
4");
+ }
+
+ size_t padding_start = encoded_string.find('=');
+ if (padding_start != std::string_view::npos) {
+ size_t padding_count = encoded_string.size() - padding_start;
+ if (padding_count > 2) {
+ return Status::Invalid("Invalid base64 input: too many padding
characters");
+ }
+
+ for (size_t i = padding_start; i < encoded_string.size(); ++i) {
+ if (encoded_string[i] != '=') {
+ return Status::Invalid("Invalid base64 input: padding characters must
be at the end");
+ }
+ }
+ }
+
+ while (in_len-- && encoded_string[in_] != '=') {
+ unsigned char c = encoded_string[in_];
+
+ if (base64_chars.find(c) == std::string::npos) {
+ return Status::Invalid("Invalid base64 input: contains non-base64 byte
at position " + std::to_string(in_));
+ }
Review Comment:
The validation loop can exit with `i == 2` for inputs with `==` padding
(e.g. "Zg=="). In the trailing partial-quantum handling later in this function,
`char_array_3[1]` is computed using `char_array_4[2]` even when `i == 2`, which
reads an uninitialized stack value (undefined behavior). Consider
zero-initializing the remaining `char_array_4` slots before computing
`char_array_3`, or only computing the bytes that will actually be appended
based on `i`.
##########
cpp/src/arrow/util/string_test.cc:
##########
@@ -238,6 +239,48 @@ TEST(ToChars, FloatingPoint) {
}
}
+TEST(Base64DecodeTest, ValidInputs) {
+ ASSERT_OK_AND_ASSIGN(auto two_paddings, arrow::util::base64_decode("Zg=="));
+ EXPECT_EQ(two_paddings, "f");
+
+ ASSERT_OK_AND_ASSIGN(auto one_padding, arrow::util::base64_decode("Zm8="));
+ EXPECT_EQ(one_padding, "fo");
+
+ ASSERT_OK_AND_ASSIGN(auto no_padding, arrow::util::base64_decode("Zm9v"));
+ EXPECT_EQ(no_padding, "foo");
+
+ ASSERT_OK_AND_ASSIGN(auto single_char, arrow::util::base64_decode("TQ=="));
+ EXPECT_EQ(single_char, "M");
+}
+
+TEST(Base64DecodeTest, InvalidLength) {
+ ASSERT_RAISES(Invalid, arrow::util::base64_decode("abc"));
+ ASSERT_RAISES(Invalid, arrow::util::base64_decode("abcde"));
+}
+
+TEST(Base64DecodeTest, InvalidCharacters) {
+ ASSERT_RAISES(Invalid, arrow::util::base64_decode("ab$="));
+}
Review Comment:
These tests assert that invalid input raises `Invalid`, but the PR
description/user-facing notes say that decoding invalid base64 should return an
empty string. Please align the tests with the intended public behavior (either
update the implementation/headers to return "" on invalid input, or update the
PR description to reflect the new error-returning API).
##########
cpp/src/arrow/vendored/base64.cpp:
##########
@@ -93,18 +90,44 @@ std::string base64_encode(std::string_view
string_to_encode) {
return base64_encode(bytes_to_encode, in_len);
}
-std::string base64_decode(std::string_view encoded_string) {
+Result<std::string> base64_decode(std::string_view encoded_string) {
size_t in_len = encoded_string.size();
int i = 0;
int j = 0;
int in_ = 0;
unsigned char char_array_4[4], char_array_3[3];
std::string ret;
- while (in_len-- && ( encoded_string[in_] != '=') &&
is_base64(encoded_string[in_])) {
- char_array_4[i++] = encoded_string[in_]; in_++;
- if (i ==4) {
- for (i = 0; i <4; i++)
+ if (encoded_string.size() % 4 != 0) {
+ return Status::Invalid("Invalid base64 input: length is not a multiple of
4");
+ }
Review Comment:
The PR description says invalid input should "return an empty string", but the
implementation now returns `Status::Invalid(...)` (and the header signature is
`Result<std::string>`). Either update the PR description/user-facing notes to
reflect the new error-reporting API, or adjust the implementation to match the
documented behavior (e.g. preserve the `std::string` API and return "" on
invalid input).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]