This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new cc56f12f40 GH-45126: [C++][Parquet] Fix undefined behavior in the
FormatStatValue (#45127)
cc56f12f40 is described below
commit cc56f12f408417e1b6d6cc5fbd1ba0cb8d70400e
Author: Gang Wu <[email protected]>
AuthorDate: Tue Dec 31 11:08:22 2024 +0800
GH-45126: [C++][Parquet] Fix undefined behavior in the FormatStatValue
(#45127)
### Rationale for this change
FormatStatValue function in the parquet/types.cc employs reinterpret_cast
to cast bytes to specific data types. It is undefined behavior when the bytes
are unaligned.
### What changes are included in this PR?
Use std::memcpy to replace reinterpret_cast.
### Are these changes tested?
Pass CI.
### Are there any user-facing changes?
No.
* GitHub Issue: #45126
Authored-by: Gang Wu <[email protected]>
Signed-off-by: Gang Wu <[email protected]>
---
cpp/src/parquet/types.cc | 48 ++++++++++++++++++++++++++++++++----------------
1 file changed, 32 insertions(+), 16 deletions(-)
diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc
index 7b50ed48d0..bee75c335a 100644
--- a/cpp/src/parquet/types.cc
+++ b/cpp/src/parquet/types.cc
@@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.
+#include <array>
#include <cmath>
#include <cstdint>
#include <memory>
@@ -95,31 +96,46 @@ std::string FormatStatValue(Type::type parquet_type,
::std::string_view val) {
const char* bytes = val.data();
switch (parquet_type) {
- case Type::BOOLEAN:
- result << reinterpret_cast<const bool*>(bytes)[0];
+ case Type::BOOLEAN: {
+ bool value{};
+ std::memcpy(&value, bytes, sizeof(bool));
+ result << value;
break;
- case Type::INT32:
- result << reinterpret_cast<const int32_t*>(bytes)[0];
+ }
+ case Type::INT32: {
+ int32_t value{};
+ std::memcpy(&value, bytes, sizeof(int32_t));
+ result << value;
break;
- case Type::INT64:
- result << reinterpret_cast<const int64_t*>(bytes)[0];
+ }
+ case Type::INT64: {
+ int64_t value{};
+ std::memcpy(&value, bytes, sizeof(int64_t));
+ result << value;
break;
- case Type::DOUBLE:
- result << reinterpret_cast<const double*>(bytes)[0];
+ }
+ case Type::DOUBLE: {
+ double value{};
+ std::memcpy(&value, bytes, sizeof(double));
+ result << value;
break;
- case Type::FLOAT:
- result << reinterpret_cast<const float*>(bytes)[0];
+ }
+ case Type::FLOAT: {
+ float value{};
+ std::memcpy(&value, bytes, sizeof(float));
+ result << value;
break;
+ }
case Type::INT96: {
- auto const i32_val = reinterpret_cast<const int32_t*>(bytes);
- result << i32_val[0] << " " << i32_val[1] << " " << i32_val[2];
+ std::array<int32_t, 3> values{};
+ std::memcpy(values.data(), bytes, 3 * sizeof(int32_t));
+ result << values[0] << " " << values[1] << " " << values[2];
break;
}
- case Type::BYTE_ARRAY: {
- return std::string(val);
- }
+ case Type::BYTE_ARRAY:
case Type::FIXED_LEN_BYTE_ARRAY: {
- return std::string(val);
+ result << val;
+ break;
}
case Type::UNDEFINED:
default: