kou commented on code in PR #49660:
URL: https://github.com/apache/arrow/pull/49660#discussion_r3051620110
##########
cpp/src/arrow/util/string_test.cc:
##########
@@ -232,12 +233,79 @@ TEST(ToChars, FloatingPoint) {
// to std::to_string which may make ad hoc formatting choices, so we cannot
// really test much about the result.
auto result = ToChars(0.0f);
- ASSERT_TRUE(result.starts_with("0")) << result;
+ ASSERT_TRUE(result.rfind("0", 0) == 0) << result;
result = ToChars(0.25);
- ASSERT_TRUE(result.starts_with("0.25")) << result;
+ ASSERT_TRUE(result.rfind("0.25", 0) == 0) << result;
}
}
+TEST(Base64DecodeTest, ValidInputs) {
+ auto r1 = arrow::util::base64_decode("Zg==");
+ ASSERT_TRUE(r1.ok());
+ EXPECT_EQ(r1.ValueOrDie(), "f");
+ auto r2 = arrow::util::base64_decode("Zm8=");
+ ASSERT_TRUE(r2.ok());
+ EXPECT_EQ(r2.ValueOrDie(), "fo");
+ auto r3 = arrow::util::base64_decode("Zm9v");
+ ASSERT_TRUE(r3.ok());
+ EXPECT_EQ(r3.ValueOrDie(), "foo");
+ auto r4 = arrow::util::base64_decode("aGVsbG8gd29ybGQ=");
+ ASSERT_TRUE(r4.ok());
+ EXPECT_EQ(r4.ValueOrDie(), "hello world");
Review Comment:
Why do we need this case? What is the difference with other cases?
##########
cpp/src/arrow/util/string_test.cc:
##########
@@ -232,12 +233,79 @@ TEST(ToChars, FloatingPoint) {
// to std::to_string which may make ad hoc formatting choices, so we cannot
// really test much about the result.
auto result = ToChars(0.0f);
- ASSERT_TRUE(result.starts_with("0")) << result;
+ ASSERT_TRUE(result.rfind("0", 0) == 0) << result;
result = ToChars(0.25);
- ASSERT_TRUE(result.starts_with("0.25")) << result;
+ ASSERT_TRUE(result.rfind("0.25", 0) == 0) << result;
Review Comment:
Why do we need these changes?
##########
cpp/src/arrow/util/string_test.cc:
##########
@@ -232,12 +233,79 @@ TEST(ToChars, FloatingPoint) {
// to std::to_string which may make ad hoc formatting choices, so we cannot
// really test much about the result.
auto result = ToChars(0.0f);
- ASSERT_TRUE(result.starts_with("0")) << result;
+ ASSERT_TRUE(result.rfind("0", 0) == 0) << result;
result = ToChars(0.25);
- ASSERT_TRUE(result.starts_with("0.25")) << result;
+ ASSERT_TRUE(result.rfind("0.25", 0) == 0) << result;
}
}
+TEST(Base64DecodeTest, ValidInputs) {
+ auto r1 = arrow::util::base64_decode("Zg==");
+ ASSERT_TRUE(r1.ok());
Review Comment:
Could you use `ASSERT_OK_AND_ASSIGN()`?
```suggestion
ASSERT_OK_AND_ASSIGN(auto string, arrow::util::base64_decode("Zg=="));
```
##########
cpp/src/arrow/util/string_test.cc:
##########
@@ -232,12 +233,79 @@ TEST(ToChars, FloatingPoint) {
// to std::to_string which may make ad hoc formatting choices, so we cannot
// really test much about the result.
auto result = ToChars(0.0f);
- ASSERT_TRUE(result.starts_with("0")) << result;
+ ASSERT_TRUE(result.rfind("0", 0) == 0) << result;
result = ToChars(0.25);
- ASSERT_TRUE(result.starts_with("0.25")) << result;
+ ASSERT_TRUE(result.rfind("0.25", 0) == 0) << result;
}
}
+TEST(Base64DecodeTest, ValidInputs) {
+ auto r1 = arrow::util::base64_decode("Zg==");
+ ASSERT_TRUE(r1.ok());
+ EXPECT_EQ(r1.ValueOrDie(), "f");
+ auto r2 = arrow::util::base64_decode("Zm8=");
Review Comment:
Can we reuse one variable instead of declaring multiple variables (`r1`,
`r2`, ...)?
Or could you use more meaningful variable name for each case such as
`two_paddings`, `one_padding` and `no_padding`?
##########
cpp/src/arrow/util/string_test.cc:
##########
@@ -232,12 +233,79 @@ TEST(ToChars, FloatingPoint) {
// to std::to_string which may make ad hoc formatting choices, so we cannot
// really test much about the result.
auto result = ToChars(0.0f);
- ASSERT_TRUE(result.starts_with("0")) << result;
+ ASSERT_TRUE(result.rfind("0", 0) == 0) << result;
result = ToChars(0.25);
- ASSERT_TRUE(result.starts_with("0.25")) << result;
+ ASSERT_TRUE(result.rfind("0.25", 0) == 0) << result;
}
}
+TEST(Base64DecodeTest, ValidInputs) {
+ auto r1 = arrow::util::base64_decode("Zg==");
+ ASSERT_TRUE(r1.ok());
+ EXPECT_EQ(r1.ValueOrDie(), "f");
+ auto r2 = arrow::util::base64_decode("Zm8=");
+ ASSERT_TRUE(r2.ok());
+ EXPECT_EQ(r2.ValueOrDie(), "fo");
+ auto r3 = arrow::util::base64_decode("Zm9v");
+ ASSERT_TRUE(r3.ok());
+ EXPECT_EQ(r3.ValueOrDie(), "foo");
+ auto r4 = arrow::util::base64_decode("aGVsbG8gd29ybGQ=");
+ ASSERT_TRUE(r4.ok());
+ EXPECT_EQ(r4.ValueOrDie(), "hello world");
+}
+
+TEST(Base64DecodeTest, InvalidLength) {
+ auto r1 = arrow::util::base64_decode("abc");
+ ASSERT_FALSE(r1.ok());
Review Comment:
Could you use `ASSERT_RAISES()`?
##########
cpp/src/arrow/util/base64.h:
##########
@@ -21,6 +21,8 @@
#include <string_view>
#include "arrow/util/visibility.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
Review Comment:
Can we remove this? I think that `arrow/result.h` includes `arrow/status.h`.
##########
cpp/src/arrow/util/string_test.cc:
##########
@@ -232,12 +233,79 @@ TEST(ToChars, FloatingPoint) {
// to std::to_string which may make ad hoc formatting choices, so we cannot
// really test much about the result.
auto result = ToChars(0.0f);
- ASSERT_TRUE(result.starts_with("0")) << result;
+ ASSERT_TRUE(result.rfind("0", 0) == 0) << result;
result = ToChars(0.25);
- ASSERT_TRUE(result.starts_with("0.25")) << result;
+ ASSERT_TRUE(result.rfind("0.25", 0) == 0) << result;
}
}
+TEST(Base64DecodeTest, ValidInputs) {
+ auto r1 = arrow::util::base64_decode("Zg==");
+ ASSERT_TRUE(r1.ok());
+ EXPECT_EQ(r1.ValueOrDie(), "f");
+ auto r2 = arrow::util::base64_decode("Zm8=");
+ ASSERT_TRUE(r2.ok());
+ EXPECT_EQ(r2.ValueOrDie(), "fo");
+ auto r3 = arrow::util::base64_decode("Zm9v");
+ ASSERT_TRUE(r3.ok());
+ EXPECT_EQ(r3.ValueOrDie(), "foo");
+ auto r4 = arrow::util::base64_decode("aGVsbG8gd29ybGQ=");
+ ASSERT_TRUE(r4.ok());
+ EXPECT_EQ(r4.ValueOrDie(), "hello world");
+}
+
+TEST(Base64DecodeTest, InvalidLength) {
+ auto r1 = arrow::util::base64_decode("abc");
+ ASSERT_FALSE(r1.ok());
+ auto r2 = arrow::util::base64_decode("abcde");
+ ASSERT_FALSE(r2.ok());
+}
+
+TEST(Base64DecodeTest, InvalidCharacters) {
+ auto r1 = arrow::util::base64_decode("ab$=");
+ ASSERT_FALSE(r1.ok());
+ auto r2 = arrow::util::base64_decode("Zm9v*");
+ ASSERT_FALSE(r2.ok());
+ auto r3 = arrow::util::base64_decode("abcd$AAA");
+ ASSERT_FALSE(r3.ok());
Review Comment:
What is the difference of the 1st case and the 3rd case? It seems that both
of them checks `$`.
##########
cpp/src/arrow/util/string_test.cc:
##########
@@ -232,12 +233,79 @@ TEST(ToChars, FloatingPoint) {
// to std::to_string which may make ad hoc formatting choices, so we cannot
// really test much about the result.
auto result = ToChars(0.0f);
- ASSERT_TRUE(result.starts_with("0")) << result;
+ ASSERT_TRUE(result.rfind("0", 0) == 0) << result;
result = ToChars(0.25);
- ASSERT_TRUE(result.starts_with("0.25")) << result;
+ ASSERT_TRUE(result.rfind("0.25", 0) == 0) << result;
}
}
+TEST(Base64DecodeTest, ValidInputs) {
+ auto r1 = arrow::util::base64_decode("Zg==");
+ ASSERT_TRUE(r1.ok());
+ EXPECT_EQ(r1.ValueOrDie(), "f");
+ auto r2 = arrow::util::base64_decode("Zm8=");
+ ASSERT_TRUE(r2.ok());
+ EXPECT_EQ(r2.ValueOrDie(), "fo");
+ auto r3 = arrow::util::base64_decode("Zm9v");
+ ASSERT_TRUE(r3.ok());
+ EXPECT_EQ(r3.ValueOrDie(), "foo");
+ auto r4 = arrow::util::base64_decode("aGVsbG8gd29ybGQ=");
+ ASSERT_TRUE(r4.ok());
+ EXPECT_EQ(r4.ValueOrDie(), "hello world");
+}
+
+TEST(Base64DecodeTest, InvalidLength) {
+ auto r1 = arrow::util::base64_decode("abc");
+ ASSERT_FALSE(r1.ok());
+ auto r2 = arrow::util::base64_decode("abcde");
+ ASSERT_FALSE(r2.ok());
+}
+
+TEST(Base64DecodeTest, InvalidCharacters) {
+ auto r1 = arrow::util::base64_decode("ab$=");
+ ASSERT_FALSE(r1.ok());
+ auto r2 = arrow::util::base64_decode("Zm9v*");
Review Comment:
What is the important part of this test? `*`? Is `$` and `*` difference
important here?
##########
cpp/src/arrow/util/string_test.cc:
##########
@@ -232,12 +233,79 @@ TEST(ToChars, FloatingPoint) {
// to std::to_string which may make ad hoc formatting choices, so we cannot
// really test much about the result.
auto result = ToChars(0.0f);
- ASSERT_TRUE(result.starts_with("0")) << result;
+ ASSERT_TRUE(result.rfind("0", 0) == 0) << result;
result = ToChars(0.25);
- ASSERT_TRUE(result.starts_with("0.25")) << result;
+ ASSERT_TRUE(result.rfind("0.25", 0) == 0) << result;
}
}
+TEST(Base64DecodeTest, ValidInputs) {
+ auto r1 = arrow::util::base64_decode("Zg==");
+ ASSERT_TRUE(r1.ok());
+ EXPECT_EQ(r1.ValueOrDie(), "f");
+ auto r2 = arrow::util::base64_decode("Zm8=");
+ ASSERT_TRUE(r2.ok());
+ EXPECT_EQ(r2.ValueOrDie(), "fo");
+ auto r3 = arrow::util::base64_decode("Zm9v");
+ ASSERT_TRUE(r3.ok());
+ EXPECT_EQ(r3.ValueOrDie(), "foo");
+ auto r4 = arrow::util::base64_decode("aGVsbG8gd29ybGQ=");
+ ASSERT_TRUE(r4.ok());
+ EXPECT_EQ(r4.ValueOrDie(), "hello world");
+}
+
+TEST(Base64DecodeTest, InvalidLength) {
+ auto r1 = arrow::util::base64_decode("abc");
+ ASSERT_FALSE(r1.ok());
+ auto r2 = arrow::util::base64_decode("abcde");
+ ASSERT_FALSE(r2.ok());
+}
+
+TEST(Base64DecodeTest, InvalidCharacters) {
+ auto r1 = arrow::util::base64_decode("ab$=");
+ ASSERT_FALSE(r1.ok());
+ auto r2 = arrow::util::base64_decode("Zm9v*");
+ ASSERT_FALSE(r2.ok());
+ auto r3 = arrow::util::base64_decode("abcd$AAA");
+ ASSERT_FALSE(r3.ok());
+}
+
+TEST(Base64DecodeTest, InvalidPadding) {
+ auto r1 = arrow::util::base64_decode("ab=c");
+ ASSERT_FALSE(r1.ok());
+ auto r2 = arrow::util::base64_decode("abc===");
+ ASSERT_FALSE(r2.ok());
+ auto r3 = arrow::util::base64_decode("abcd=AAA");
+ ASSERT_FALSE(r3.ok());
+ auto r4 = arrow::util::base64_decode("Zm=9v");
+ ASSERT_FALSE(r4.ok());
+}
+
+TEST(Base64DecodeTest, EdgeCases) {
+ auto r1 = arrow::util::base64_decode("====");
+ ASSERT_FALSE(r1.ok());
+ auto r2 = arrow::util::base64_decode("TQ==");
+ ASSERT_TRUE(r2.ok());
+ EXPECT_EQ(r2.ValueOrDie(), "M");
Review Comment:
Why is this an edge case?
##########
cpp/src/arrow/util/string_test.cc:
##########
@@ -232,12 +233,79 @@ TEST(ToChars, FloatingPoint) {
// to std::to_string which may make ad hoc formatting choices, so we cannot
// really test much about the result.
auto result = ToChars(0.0f);
- ASSERT_TRUE(result.starts_with("0")) << result;
+ ASSERT_TRUE(result.rfind("0", 0) == 0) << result;
result = ToChars(0.25);
- ASSERT_TRUE(result.starts_with("0.25")) << result;
+ ASSERT_TRUE(result.rfind("0.25", 0) == 0) << result;
}
}
+TEST(Base64DecodeTest, ValidInputs) {
+ auto r1 = arrow::util::base64_decode("Zg==");
+ ASSERT_TRUE(r1.ok());
+ EXPECT_EQ(r1.ValueOrDie(), "f");
+ auto r2 = arrow::util::base64_decode("Zm8=");
+ ASSERT_TRUE(r2.ok());
+ EXPECT_EQ(r2.ValueOrDie(), "fo");
+ auto r3 = arrow::util::base64_decode("Zm9v");
+ ASSERT_TRUE(r3.ok());
+ EXPECT_EQ(r3.ValueOrDie(), "foo");
+ auto r4 = arrow::util::base64_decode("aGVsbG8gd29ybGQ=");
+ ASSERT_TRUE(r4.ok());
+ EXPECT_EQ(r4.ValueOrDie(), "hello world");
+}
+
+TEST(Base64DecodeTest, InvalidLength) {
+ auto r1 = arrow::util::base64_decode("abc");
+ ASSERT_FALSE(r1.ok());
+ auto r2 = arrow::util::base64_decode("abcde");
+ ASSERT_FALSE(r2.ok());
+}
+
+TEST(Base64DecodeTest, InvalidCharacters) {
+ auto r1 = arrow::util::base64_decode("ab$=");
+ ASSERT_FALSE(r1.ok());
+ auto r2 = arrow::util::base64_decode("Zm9v*");
+ ASSERT_FALSE(r2.ok());
+ auto r3 = arrow::util::base64_decode("abcd$AAA");
+ ASSERT_FALSE(r3.ok());
+}
+
+TEST(Base64DecodeTest, InvalidPadding) {
+ auto r1 = arrow::util::base64_decode("ab=c");
+ ASSERT_FALSE(r1.ok());
+ auto r2 = arrow::util::base64_decode("abc===");
+ ASSERT_FALSE(r2.ok());
+ auto r3 = arrow::util::base64_decode("abcd=AAA");
+ ASSERT_FALSE(r3.ok());
+ auto r4 = arrow::util::base64_decode("Zm=9v");
+ ASSERT_FALSE(r4.ok());
+}
+
+TEST(Base64DecodeTest, EdgeCases) {
+ auto r1 = arrow::util::base64_decode("====");
+ ASSERT_FALSE(r1.ok());
Review Comment:
It seems that this is an invalid padding case.
##########
cpp/src/arrow/vendored/base64.cpp:
##########
@@ -93,18 +97,51 @@ std::string base64_encode(std::string_view
string_to_encode) {
return base64_encode(bytes_to_encode, in_len);
}
-std::string base64_decode(std::string_view encoded_string) {
+arrow::Result<std::string> base64_decode(std::string_view encoded_string) {
size_t in_len = encoded_string.size();
int i = 0;
int j = 0;
int in_ = 0;
unsigned char char_array_4[4], char_array_3[3];
std::string ret;
- while (in_len-- && ( encoded_string[in_] != '=') &&
is_base64(encoded_string[in_])) {
+ static const std::string base64_chars =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz"
+ "0123456789+/";
+
+ auto is_base64 = [](unsigned char c) -> bool {
+ return (std::isalnum(c) || (c == '+') || (c == '/'));
+ };
+
+ if (encoded_string.size() % 4 != 0) {
+ return arrow::Status::Invalid("Invalid base64 input: length is not a
multiple of 4");
+ }
+
+ size_t padding_start = encoded_string.find('=');
+ if (padding_start != std::string::npos) {
+ for (size_t k = padding_start; k < encoded_string.size(); ++k) {
+ if (encoded_string[k] != '=') {
+ return arrow::Status::Invalid("Invalid base64 input: padding character
'=' found at invalid position");
+ }
+ }
+
+ size_t padding_count = encoded_string.size() - padding_start;
+ if (padding_count > 2) {
+ return arrow::Status::Invalid("Invalid base64 input: too many padding
characters");
+ }
+ }
+
+ for (char c : encoded_string) {
+ if (c != '=' && !is_base64(c)) {
+ return arrow::Status::Invalid("Invalid base64 input: contains
non-base64 character '" + std::string(1, c) + "'");
+ }
+ }
Review Comment:
These validations traverses the `encoded_string` multiple times, right? Does
this have performance penalty for large input?
Can we optimize them?
##########
cpp/src/arrow/vendored/base64.cpp:
##########
@@ -93,18 +97,51 @@ std::string base64_encode(std::string_view
string_to_encode) {
return base64_encode(bytes_to_encode, in_len);
}
-std::string base64_decode(std::string_view encoded_string) {
+arrow::Result<std::string> base64_decode(std::string_view encoded_string) {
Review Comment:
We can use `Result` here because this is in `arrow::util` namespace:
```suggestion
Result<std::string> base64_decode(std::string_view encoded_string) {
```
##########
cpp/src/arrow/vendored/base64.cpp:
##########
@@ -93,18 +97,51 @@ std::string base64_encode(std::string_view
string_to_encode) {
return base64_encode(bytes_to_encode, in_len);
}
-std::string base64_decode(std::string_view encoded_string) {
+arrow::Result<std::string> base64_decode(std::string_view encoded_string) {
size_t in_len = encoded_string.size();
int i = 0;
int j = 0;
int in_ = 0;
unsigned char char_array_4[4], char_array_3[3];
std::string ret;
- while (in_len-- && ( encoded_string[in_] != '=') &&
is_base64(encoded_string[in_])) {
+ static const std::string base64_chars =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz"
+ "0123456789+/";
+
+ auto is_base64 = [](unsigned char c) -> bool {
+ return (std::isalnum(c) || (c == '+') || (c == '/'));
+ };
+
+ if (encoded_string.size() % 4 != 0) {
+ return arrow::Status::Invalid("Invalid base64 input: length is not a
multiple of 4");
+ }
+
+ size_t padding_start = encoded_string.find('=');
+ if (padding_start != std::string::npos) {
+ for (size_t k = padding_start; k < encoded_string.size(); ++k) {
+ if (encoded_string[k] != '=') {
+ return arrow::Status::Invalid("Invalid base64 input: padding character
'=' found at invalid position");
Review Comment:
```suggestion
return Status::Invalid("Invalid base64 input: padding character '='
found at invalid position");
```
##########
cpp/src/arrow/vendored/base64.cpp:
##########
@@ -93,18 +97,51 @@ std::string base64_encode(std::string_view
string_to_encode) {
return base64_encode(bytes_to_encode, in_len);
}
-std::string base64_decode(std::string_view encoded_string) {
+arrow::Result<std::string> base64_decode(std::string_view encoded_string) {
size_t in_len = encoded_string.size();
int i = 0;
int j = 0;
int in_ = 0;
unsigned char char_array_4[4], char_array_3[3];
std::string ret;
- while (in_len-- && ( encoded_string[in_] != '=') &&
is_base64(encoded_string[in_])) {
+ static const std::string base64_chars =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz"
+ "0123456789+/";
+
+ auto is_base64 = [](unsigned char c) -> bool {
+ return (std::isalnum(c) || (c == '+') || (c == '/'));
+ };
Review Comment:
Why do we need to redefine this?
##########
cpp/src/arrow/util/string_test.cc:
##########
@@ -232,12 +233,79 @@ TEST(ToChars, FloatingPoint) {
// to std::to_string which may make ad hoc formatting choices, so we cannot
// really test much about the result.
auto result = ToChars(0.0f);
- ASSERT_TRUE(result.starts_with("0")) << result;
+ ASSERT_TRUE(result.rfind("0", 0) == 0) << result;
result = ToChars(0.25);
- ASSERT_TRUE(result.starts_with("0.25")) << result;
+ ASSERT_TRUE(result.rfind("0.25", 0) == 0) << result;
}
}
+TEST(Base64DecodeTest, ValidInputs) {
+ auto r1 = arrow::util::base64_decode("Zg==");
+ ASSERT_TRUE(r1.ok());
+ EXPECT_EQ(r1.ValueOrDie(), "f");
+ auto r2 = arrow::util::base64_decode("Zm8=");
+ ASSERT_TRUE(r2.ok());
+ EXPECT_EQ(r2.ValueOrDie(), "fo");
+ auto r3 = arrow::util::base64_decode("Zm9v");
+ ASSERT_TRUE(r3.ok());
+ EXPECT_EQ(r3.ValueOrDie(), "foo");
+ auto r4 = arrow::util::base64_decode("aGVsbG8gd29ybGQ=");
+ ASSERT_TRUE(r4.ok());
+ EXPECT_EQ(r4.ValueOrDie(), "hello world");
+}
+
+TEST(Base64DecodeTest, InvalidLength) {
+ auto r1 = arrow::util::base64_decode("abc");
+ ASSERT_FALSE(r1.ok());
+ auto r2 = arrow::util::base64_decode("abcde");
+ ASSERT_FALSE(r2.ok());
+}
+
+TEST(Base64DecodeTest, InvalidCharacters) {
+ auto r1 = arrow::util::base64_decode("ab$=");
+ ASSERT_FALSE(r1.ok());
+ auto r2 = arrow::util::base64_decode("Zm9v*");
+ ASSERT_FALSE(r2.ok());
+ auto r3 = arrow::util::base64_decode("abcd$AAA");
+ ASSERT_FALSE(r3.ok());
+}
+
+TEST(Base64DecodeTest, InvalidPadding) {
+ auto r1 = arrow::util::base64_decode("ab=c");
+ ASSERT_FALSE(r1.ok());
+ auto r2 = arrow::util::base64_decode("abc===");
+ ASSERT_FALSE(r2.ok());
+ auto r3 = arrow::util::base64_decode("abcd=AAA");
+ ASSERT_FALSE(r3.ok());
Review Comment:
What is the difference of the 1st case and the 3rd case? It seems that both
of them have additional valid character(s) after padding.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]