[ 
https://issues.apache.org/jira/browse/PARQUET-1357?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16565329#comment-16565329
 ] 

ASF GitHub Bot commented on PARQUET-1357:
-----------------------------------------

xhochy closed pull request #479: PARQUET-1357: FormatStatValue truncates binary 
statistics on zero character
URL: https://github.com/apache/parquet-cpp/pull/479
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b53f5980..927f7289 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -84,7 +84,7 @@ enable_testing()
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake_modules")
 set(BUILD_SUPPORT_DIR "${CMAKE_SOURCE_DIR}/build-support")
 
-set(CLANG_FORMAT_VERSION "5.0")
+set(CLANG_FORMAT_VERSION "6.0")
 find_package(ClangTools)
 if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1" OR CLANG_TIDY_FOUND)
   # Generate a Clang compile_commands.json "compilation database" file for use
diff --git a/cmake_modules/FindClangTools.cmake 
b/cmake_modules/FindClangTools.cmake
index e9221ff2..215a5cd9 100644
--- a/cmake_modules/FindClangTools.cmake
+++ b/cmake_modules/FindClangTools.cmake
@@ -30,6 +30,12 @@
 #  CLANG_FORMAT_BIN, The path to the clang format binary
 #  CLANG_TIDY_FOUND, Whether clang format was found
 
+if (DEFINED ENV{HOMEBREW_PREFIX})
+  set(HOMEBREW_PREFIX "$ENV{HOMEBREW_PREFIX}")  # take the prefix from the environment
+else()
+  set(HOMEBREW_PREFIX "/usr/local")  # fallback when the environment does not set it
+endif()
+
 find_program(CLANG_TIDY_BIN
   NAMES clang-tidy-4.0
   clang-tidy-3.9
@@ -37,7 +43,7 @@ find_program(CLANG_TIDY_BIN
   clang-tidy-3.7
   clang-tidy-3.6
   clang-tidy
-  PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin
+  PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin 
"${HOMEBREW_PREFIX}/bin"
         NO_DEFAULT_PATH
 )
 
@@ -55,7 +61,7 @@ if (CLANG_FORMAT_VERSION)
       PATHS
             ${ClangTools_PATH}
             $ENV{CLANG_TOOLS_PATH}
-            /usr/local/bin /usr/bin
+            /usr/local/bin /usr/bin "${HOMEBREW_PREFIX}/bin"
             NO_DEFAULT_PATH
     )
 
@@ -67,16 +73,26 @@ if (CLANG_FORMAT_VERSION)
         if ("${CLANG_FORMAT_MINOR_VERSION}" STREQUAL "0")
             find_program(CLANG_FORMAT_BIN
               NAMES clang-format
-              PATHS /usr/local/opt/llvm@${CLANG_FORMAT_MAJOR_VERSION}/bin
+              PATHS 
"${HOMEBREW_PREFIX}/opt/llvm@${CLANG_FORMAT_MAJOR_VERSION}/bin"
                     NO_DEFAULT_PATH
             )
         else()
             find_program(CLANG_FORMAT_BIN
               NAMES clang-format
-              PATHS /usr/local/opt/llvm@${CLANG_FORMAT_VERSION}/bin
+              PATHS "${HOMEBREW_PREFIX}/opt/llvm@${CLANG_FORMAT_VERSION}/bin"
                     NO_DEFAULT_PATH
             )
         endif()
+
+        if ("${CLANG_FORMAT_BIN}" STREQUAL "CLANG_FORMAT_BIN-NOTFOUND")
+          # binary was still not found, look into Cellar
+          file(GLOB CLANG_FORMAT_PATH 
"${HOMEBREW_PREFIX}/Cellar/llvm/${CLANG_FORMAT_VERSION}.*")
+          find_program(CLANG_FORMAT_BIN
+            NAMES clang-format
+            PATHS "${CLANG_FORMAT_PATH}/bin"
+                  NO_DEFAULT_PATH
+          )
+        endif()
     endif()
 else()
     find_program(CLANG_FORMAT_BIN
@@ -86,7 +102,7 @@ else()
       clang-format-3.7
       clang-format-3.6
       clang-format
-      PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin
+      PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin 
"${HOMEBREW_PREFIX}/bin"
             NO_DEFAULT_PATH
     )
 endif()
diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc 
b/src/parquet/arrow/arrow-reader-writer-test.cc
index 1c2f3225..8955b0ab 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -52,16 +52,16 @@ using arrow::Buffer;
 using arrow::ChunkedArray;
 using arrow::Column;
 using arrow::DataType;
+using arrow::default_memory_pool;
 using arrow::ListArray;
-using arrow::ResizableBuffer;
 using arrow::PrimitiveArray;
+using arrow::ResizableBuffer;
 using arrow::Status;
 using arrow::Table;
 using arrow::TimeUnit;
 using arrow::compute::Datum;
 using arrow::compute::DictionaryEncode;
 using arrow::compute::FunctionContext;
-using arrow::default_memory_pool;
 using arrow::io::BufferReader;
 
 using arrow::test::randint;
@@ -1453,13 +1453,13 @@ TEST(TestArrowReadWrite, ConvertedDateTimeTypes) {
 // Regression for ARROW-2802
 TEST(TestArrowReadWrite, CoerceTimestampsAndSupportDeprecatedInt96) {
   using ::arrow::Column;
+  using ::arrow::default_memory_pool;
   using ::arrow::Field;
   using ::arrow::Schema;
   using ::arrow::Table;
-  using ::arrow::TimeUnit;
   using ::arrow::TimestampBuilder;
   using ::arrow::TimestampType;
-  using ::arrow::default_memory_pool;
+  using ::arrow::TimeUnit;
 
   auto timestamp_type = std::make_shared<TimestampType>(TimeUnit::NANO);
 
diff --git a/src/parquet/arrow/arrow-schema-test.cc 
b/src/parquet/arrow/arrow-schema-test.cc
index 5c16c044..cb2b8508 100644
--- a/src/parquet/arrow/arrow-schema-test.cc
+++ b/src/parquet/arrow/arrow-schema-test.cc
@@ -62,8 +62,8 @@ class TestConvertParquetSchema : public ::testing::Test {
     for (int i = 0; i < expected_schema->num_fields(); ++i) {
       auto lhs = result_schema_->field(i);
       auto rhs = expected_schema->field(i);
-      EXPECT_TRUE(lhs->Equals(rhs)) << i << " " << lhs->ToString()
-                                    << " != " << rhs->ToString();
+      EXPECT_TRUE(lhs->Equals(rhs))
+          << i << " " << lhs->ToString() << " != " << rhs->ToString();
     }
   }
 
@@ -607,9 +607,10 @@ TEST_F(TestConvertParquetSchema, 
ParquetRepeatedNestedSchema) {
     auto inner_group_type = 
std::make_shared<::arrow::StructType>(inner_group_fields);
     auto outer_group_fields = {
         std::make_shared<Field>("leaf2", INT32, true),
-        std::make_shared<Field>("innerGroup", 
::arrow::list(std::make_shared<Field>(
-                                                  "innerGroup", 
inner_group_type, false)),
-                                false)};
+        std::make_shared<Field>(
+            "innerGroup",
+            ::arrow::list(std::make_shared<Field>("innerGroup", 
inner_group_type, false)),
+            false)};
     auto outer_group_type = 
std::make_shared<::arrow::StructType>(outer_group_fields);
 
     arrow_fields.push_back(std::make_shared<Field>("leaf1", INT32, true));
diff --git a/src/parquet/arrow/reader.cc b/src/parquet/arrow/reader.cc
index 0f671b71..c0974ca4 100644
--- a/src/parquet/arrow/reader.cc
+++ b/src/parquet/arrow/reader.cc
@@ -301,9 +301,7 @@ class PARQUET_NO_EXPORT StructImpl : public 
ColumnReader::ColumnReaderImpl {
  public:
   explicit StructImpl(const std::vector<std::shared_ptr<ColumnReaderImpl>>& 
children,
                       int16_t struct_def_level, MemoryPool* pool, const Node* 
node)
-      : children_(children),
-        struct_def_level_(struct_def_level),
-        pool_(pool) {
+      : children_(children), struct_def_level_(struct_def_level), pool_(pool) {
     InitField(node, children);
   }
 
diff --git a/src/parquet/arrow/test-util.h b/src/parquet/arrow/test-util.h
index 4db98b77..bfc78c87 100644
--- a/src/parquet/arrow/test-util.h
+++ b/src/parquet/arrow/test-util.h
@@ -402,17 +402,16 @@ Status MakeEmptyListsArray(int64_t size, 
std::shared_ptr<Array>* out_array) {
                                         &offsets_buffer));
   memset(offsets_buffer->mutable_data(), 0, offsets_nbytes);
 
-  auto value_field = ::arrow::field("item", ::arrow::float64(),
-                                    false /* nullable_values */);
+  auto value_field =
+      ::arrow::field("item", ::arrow::float64(), false /* nullable_values */);
   auto list_type = ::arrow::list(value_field);
 
   std::vector<std::shared_ptr<Buffer>> child_buffers = {nullptr /* null bitmap 
*/,
-                                                        nullptr /* values */ };
-  auto child_data = ::arrow::ArrayData::Make(value_field->type(), 0,
-                                             std::move(child_buffers));
+                                                        nullptr /* values */};
+  auto child_data =
+      ::arrow::ArrayData::Make(value_field->type(), 0, 
std::move(child_buffers));
 
-  std::vector<std::shared_ptr<Buffer>> buffers = {nullptr /* bitmap */,
-                                                  offsets_buffer };
+  std::vector<std::shared_ptr<Buffer>> buffers = {nullptr /* bitmap */, 
offsets_buffer};
   auto array_data = ::arrow::ArrayData::Make(list_type, size, 
std::move(buffers));
   array_data->child_data.push_back(child_data);
 
diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc
index f3ddda90..4bfeb370 100644
--- a/src/parquet/arrow/writer.cc
+++ b/src/parquet/arrow/writer.cc
@@ -41,8 +41,8 @@ using arrow::Int16Builder;
 using arrow::ListArray;
 using arrow::MemoryPool;
 using arrow::NumericArray;
-using arrow::ResizableBuffer;
 using arrow::PrimitiveArray;
+using arrow::ResizableBuffer;
 using arrow::Status;
 using arrow::Table;
 using arrow::TimeUnit;
@@ -216,9 +216,11 @@ class LevelBuilder {
         if (level_null_count && level_valid_bitmap == nullptr) {
           // Special case: this is a null array (all elements are null)
           RETURN_NOT_OK(def_levels_.Append(static_cast<int16_t>(def_level + 
1)));
-        } else if (nullable_level && ((level_null_count == 0) ||
-            BitUtil::GetBit(level_valid_bitmap,
-                            inner_offset + i + 
array_offsets_[recursion_level]))) {
+        } else if (nullable_level &&
+                   ((level_null_count == 0) ||
+                    BitUtil::GetBit(
+                        level_valid_bitmap,
+                        inner_offset + i + array_offsets_[recursion_level]))) {
           // Non-null element in a null level
           RETURN_NOT_OK(def_levels_.Append(static_cast<int16_t>(def_level + 
2)));
         } else {
diff --git a/src/parquet/encoding-benchmark.cc 
b/src/parquet/encoding-benchmark.cc
index 5ea8f8f5..364cdba1 100644
--- a/src/parquet/encoding-benchmark.cc
+++ b/src/parquet/encoding-benchmark.cc
@@ -20,8 +20,8 @@
 #include "parquet/encoding-internal.h"
 #include "parquet/util/memory.h"
 
-using arrow::MemoryPool;
 using arrow::default_memory_pool;
+using arrow::MemoryPool;
 
 namespace parquet {
 
diff --git a/src/parquet/encoding-test.cc b/src/parquet/encoding-test.cc
index 31bb79d0..60285ab2 100644
--- a/src/parquet/encoding-test.cc
+++ b/src/parquet/encoding-test.cc
@@ -30,8 +30,8 @@
 #include "parquet/util/memory.h"
 #include "parquet/util/test-common.h"
 
-using arrow::MemoryPool;
 using arrow::default_memory_pool;
+using arrow::MemoryPool;
 
 using std::string;
 using std::vector;
diff --git a/src/parquet/encoding.h b/src/parquet/encoding.h
index 2742937c..006f22f2 100644
--- a/src/parquet/encoding.h
+++ b/src/parquet/encoding.h
@@ -51,12 +51,12 @@ class Encoder {
   virtual void PutSpaced(const T* src, int num_values, const uint8_t* 
valid_bits,
                          int64_t valid_bits_offset) {
     std::shared_ptr<ResizableBuffer> buffer;
-    auto status = ::arrow::AllocateResizableBuffer(pool_, num_values * 
sizeof(T),
-                                                   &buffer);
+    auto status =
+        ::arrow::AllocateResizableBuffer(pool_, num_values * sizeof(T), 
&buffer);
     if (!status.ok()) {
       std::ostringstream ss;
-      ss << "AllocateResizableBuffer failed in Encoder.PutSpaced in "
-         << __FILE__ << ", on line " << __LINE__;
+      ss << "AllocateResizableBuffer failed in Encoder.PutSpaced in " << 
__FILE__
+         << ", on line " << __LINE__;
       throw ParquetException(ss.str());
     }
     int32_t num_valid_values = 0;
diff --git a/src/parquet/printer.cc b/src/parquet/printer.cc
index 88b55289..3f18a5c8 100644
--- a/src/parquet/printer.cc
+++ b/src/parquet/printer.cc
@@ -84,8 +84,8 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, 
std::list<int> selecte
         std::string min = stats->EncodeMin(), max = stats->EncodeMax();
         stream << ", Null Values: " << stats->null_count()
                << ", Distinct Values: " << stats->distinct_count() << std::endl
-               << "  Max: " << FormatStatValue(descr->physical_type(), 
max.c_str())
-               << ", Min: " << FormatStatValue(descr->physical_type(), 
min.c_str());
+               << "  Max: " << FormatStatValue(descr->physical_type(), max)
+               << ", Min: " << FormatStatValue(descr->physical_type(), min);
       } else {
         stream << "  Statistics Not Set";
       }
@@ -207,9 +207,8 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, 
std::list<int> selected
         std::string min = stats->EncodeMin(), max = stats->EncodeMax();
         stream << "\"NumNulls\": \"" << stats->null_count() << "\", "
                << "\"DistinctValues\": \"" << stats->distinct_count() << "\", "
-               << "\"Max\": \"" << FormatStatValue(descr->physical_type(), 
max.c_str())
-               << "\", "
-               << "\"Min\": \"" << FormatStatValue(descr->physical_type(), 
min.c_str())
+               << "\"Max\": \"" << FormatStatValue(descr->physical_type(), 
max) << "\", "
+               << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min)
                << "\" },";
       } else {
         stream << "\"False\",";
diff --git a/src/parquet/statistics-test.cc b/src/parquet/statistics-test.cc
index bf3d1968..943d5ccf 100644
--- a/src/parquet/statistics-test.cc
+++ b/src/parquet/statistics-test.cc
@@ -36,8 +36,8 @@
 #include "parquet/types.h"
 #include "parquet/util/memory.h"
 
-using arrow::MemoryPool;
 using arrow::default_memory_pool;
+using arrow::MemoryPool;
 
 namespace parquet {
 
@@ -194,8 +194,9 @@ bool* 
TestRowGroupStatistics<BooleanType>::GetValuesPointer(std::vector<bool>& v
 }
 
 template <typename TestType>
-typename std::vector<typename TestType::c_type> TestRowGroupStatistics<
-    TestType>::GetDeepCopy(const std::vector<typename TestType::c_type>& 
values) {
+typename std::vector<typename TestType::c_type>
+TestRowGroupStatistics<TestType>::GetDeepCopy(
+    const std::vector<typename TestType::c_type>& values) {
   return values;
 }
 
diff --git a/src/parquet/statistics.cc b/src/parquet/statistics.cc
index 5b014edc..ea7f783b 100644
--- a/src/parquet/statistics.cc
+++ b/src/parquet/statistics.cc
@@ -24,8 +24,8 @@
 #include "parquet/statistics.h"
 #include "parquet/util/memory.h"
 
-using arrow::MemoryPool;
 using arrow::default_memory_pool;
+using arrow::MemoryPool;
 
 namespace parquet {
 
diff --git a/src/parquet/types-test.cc b/src/parquet/types-test.cc
index 4e759827..6b184e38 100644
--- a/src/parquet/types-test.cc
+++ b/src/parquet/types-test.cc
@@ -62,54 +62,81 @@ TEST(TestLogicalTypeToString, LogicalTypes) {
 }
 
 TEST(TypePrinter, StatisticsTypes) {
+#if !(defined(_WIN32) || defined(__CYGWIN__))
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
   std::string smin;
   std::string smax;
   int32_t int_min = 1024;
   int32_t int_max = 2048;
   smin = std::string(reinterpret_cast<char*>(&int_min), sizeof(int32_t));
   smax = std::string(reinterpret_cast<char*>(&int_max), sizeof(int32_t));
+  ASSERT_STREQ("1024", FormatStatValue(Type::INT32, smin).c_str());
   ASSERT_STREQ("1024", FormatStatValue(Type::INT32, smin.c_str()).c_str());
+  ASSERT_STREQ("2048", FormatStatValue(Type::INT32, smax).c_str());
   ASSERT_STREQ("2048", FormatStatValue(Type::INT32, smax.c_str()).c_str());
 
   int64_t int64_min = 10240000000000;
   int64_t int64_max = 20480000000000;
   smin = std::string(reinterpret_cast<char*>(&int64_min), sizeof(int64_t));
   smax = std::string(reinterpret_cast<char*>(&int64_max), sizeof(int64_t));
+  ASSERT_STREQ("10240000000000", FormatStatValue(Type::INT64, smin).c_str());
   ASSERT_STREQ("10240000000000", FormatStatValue(Type::INT64, 
smin.c_str()).c_str());
+  ASSERT_STREQ("20480000000000", FormatStatValue(Type::INT64, smax).c_str());
   ASSERT_STREQ("20480000000000", FormatStatValue(Type::INT64, 
smax.c_str()).c_str());
 
   float float_min = 1.024f;
   float float_max = 2.048f;
   smin = std::string(reinterpret_cast<char*>(&float_min), sizeof(float));
   smax = std::string(reinterpret_cast<char*>(&float_max), sizeof(float));
+  ASSERT_STREQ("1.024", FormatStatValue(Type::FLOAT, smin).c_str());
   ASSERT_STREQ("1.024", FormatStatValue(Type::FLOAT, smin.c_str()).c_str());
+  ASSERT_STREQ("2.048", FormatStatValue(Type::FLOAT, smax).c_str());
   ASSERT_STREQ("2.048", FormatStatValue(Type::FLOAT, smax.c_str()).c_str());
 
   double double_min = 1.0245;
   double double_max = 2.0489;
   smin = std::string(reinterpret_cast<char*>(&double_min), sizeof(double));
   smax = std::string(reinterpret_cast<char*>(&double_max), sizeof(double));
+  ASSERT_STREQ("1.0245", FormatStatValue(Type::DOUBLE, smin).c_str());
   ASSERT_STREQ("1.0245", FormatStatValue(Type::DOUBLE, smin.c_str()).c_str());
+  ASSERT_STREQ("2.0489", FormatStatValue(Type::DOUBLE, smax).c_str());
   ASSERT_STREQ("2.0489", FormatStatValue(Type::DOUBLE, smax.c_str()).c_str());
 
   Int96 Int96_min = {{1024, 2048, 4096}};
   Int96 Int96_max = {{2048, 4096, 8192}};
   smin = std::string(reinterpret_cast<char*>(&Int96_min), sizeof(Int96));
   smax = std::string(reinterpret_cast<char*>(&Int96_max), sizeof(Int96));
+  ASSERT_STREQ("1024 2048 4096", FormatStatValue(Type::INT96, smin).c_str());
   ASSERT_STREQ("1024 2048 4096", FormatStatValue(Type::INT96, 
smin.c_str()).c_str());
+  ASSERT_STREQ("2048 4096 8192", FormatStatValue(Type::INT96, smax).c_str());
   ASSERT_STREQ("2048 4096 8192", FormatStatValue(Type::INT96, 
smax.c_str()).c_str());
 
   smin = std::string("abcdef");
   smax = std::string("ijklmnop");
+  ASSERT_STREQ("abcdef", FormatStatValue(Type::BYTE_ARRAY, smin).c_str());
   ASSERT_STREQ("abcdef", FormatStatValue(Type::BYTE_ARRAY, 
smin.c_str()).c_str());
+  ASSERT_STREQ("ijklmnop", FormatStatValue(Type::BYTE_ARRAY, smax).c_str());
   ASSERT_STREQ("ijklmnop", FormatStatValue(Type::BYTE_ARRAY, 
smax.c_str()).c_str());
 
+  // PARQUET-1357: FormatStatValue truncates binary statistics on zero 
character
+  smax.push_back('\0');
+  ASSERT_EQ(smax, FormatStatValue(Type::BYTE_ARRAY, smax));
+  // This fails, thus the call to FormatStatValue(.., const char*) was 
deprecated.
+  // ASSERT_EQ(smax, FormatStatValue(Type::BYTE_ARRAY, smax.c_str()));
+
   smin = std::string("abcdefgh");
   smax = std::string("ijklmnop");
+  ASSERT_STREQ("abcdefgh", FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, 
smin).c_str());
   ASSERT_STREQ("abcdefgh",
                FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, 
smin.c_str()).c_str());
+  ASSERT_STREQ("ijklmnop", FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, 
smax).c_str());
   ASSERT_STREQ("ijklmnop",
                FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, 
smax.c_str()).c_str());
+#if !(defined(_WIN32) || defined(__CYGWIN__))
+#pragma GCC diagnostic pop
+#endif
 }
 
 }  // namespace parquet
diff --git a/src/parquet/types.cc b/src/parquet/types.cc
index 79bc5d1a..31209631 100644
--- a/src/parquet/types.cc
+++ b/src/parquet/types.cc
@@ -24,6 +24,41 @@
 
 namespace parquet {
 
+std::string FormatStatValue(Type::type parquet_type, const std::string& val) {
+  std::stringstream result;
+  switch (parquet_type) {
+    case Type::BOOLEAN:
+      result << reinterpret_cast<const bool*>(val.c_str())[0];
+      break;
+    case Type::INT32:
+      result << reinterpret_cast<const int32_t*>(val.c_str())[0];
+      break;
+    case Type::INT64:
+      result << reinterpret_cast<const int64_t*>(val.c_str())[0];
+      break;
+    case Type::DOUBLE:
+      result << reinterpret_cast<const double*>(val.c_str())[0];
+      break;
+    case Type::FLOAT:
+      result << reinterpret_cast<const float*>(val.c_str())[0];
+      break;
+    case Type::INT96: {
+      auto const i32_val = reinterpret_cast<const int32_t*>(val.c_str());
+      result << i32_val[0] << " " << i32_val[1] << " " << i32_val[2];
+      break;
+    }
+    case Type::BYTE_ARRAY: {
+      return val;
+    }
+    case Type::FIXED_LEN_BYTE_ARRAY: {
+      return val;
+    }
+    default:
+      break;
+  }
+  return result.str();
+}
+
 std::string FormatStatValue(Type::type parquet_type, const char* val) {
   std::stringstream result;
   switch (parquet_type) {
diff --git a/src/parquet/types.h b/src/parquet/types.h
index 04cfc4b5..0f4cfc21 100644
--- a/src/parquet/types.h
+++ b/src/parquet/types.h
@@ -27,6 +27,7 @@
 
 #include "arrow/util/macros.h"
 
+#include "parquet/util/macros.h"
 #include "parquet/util/visibility.h"
 
 namespace parquet {
@@ -292,6 +293,11 @@ PARQUET_EXPORT std::string 
LogicalTypeToString(LogicalType::type t);
 
 PARQUET_EXPORT std::string TypeToString(Type::type t);
 
+PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type,
+                                           const std::string& val);
+
+/// \deprecated Since 1.5.0
+PARQUET_DEPRECATED("Use std::string instead of char* as input")
 PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type, const 
char* val);
 
 PARQUET_EXPORT int GetTypeByteSize(Type::type t);
diff --git a/src/parquet/util/macros.h b/src/parquet/util/macros.h
index 0d172b13..c28b2fa6 100644
--- a/src/parquet/util/macros.h
+++ b/src/parquet/util/macros.h
@@ -68,4 +68,19 @@
 #define FRIEND_TEST(test_case_name, test_name) \
   friend class test_case_name##_##test_name##_Test
 
+// clang-format off
+// [[deprecated]] is only available in C++14, use this for the time being
+// This macro takes an optional deprecation message
+#if __cplusplus <= 201103L
+# ifdef __GNUC__
+#  define PARQUET_DEPRECATED(...) __attribute__((deprecated(__VA_ARGS__)))
+# elif defined(_MSC_VER)
+#  define PARQUET_DEPRECATED(...) __declspec(deprecated(__VA_ARGS__))
+# else
+#  define PARQUET_DEPRECATED(...)
+# endif
+#else
+#  define PARQUET_DEPRECATED(...) [[deprecated(__VA_ARGS__)]]
+#endif
+
 #endif  // PARQUET_UTIL_MACROS_H
diff --git a/src/parquet/util/memory-test.cc b/src/parquet/util/memory-test.cc
index cb8c7061..bfd685db 100644
--- a/src/parquet/util/memory-test.cc
+++ b/src/parquet/util/memory-test.cc
@@ -27,8 +27,8 @@
 #include "parquet/util/memory.h"
 #include "parquet/util/test-common.h"
 
-using arrow::MemoryPool;
 using arrow::default_memory_pool;
+using arrow::MemoryPool;
 
 namespace parquet {
 
@@ -255,8 +255,8 @@ TEST(TestBufferedInputStream, Basics) {
   int64_t stream_offset = 10;
   int64_t stream_size = source_size - stream_offset;
   int64_t chunk_size = 50;
-  std::shared_ptr<ResizableBuffer> buf = AllocateBuffer(default_memory_pool(),
-                                                        source_size);
+  std::shared_ptr<ResizableBuffer> buf =
+      AllocateBuffer(default_memory_pool(), source_size);
   ASSERT_EQ(source_size, buf->size());
   for (int i = 0; i < source_size; i++) {
     buf->mutable_data()[i] = static_cast<uint8_t>(i);
diff --git a/src/parquet/util/memory.cc b/src/parquet/util/memory.cc
index df7ccc76..d9caf6e3 100644
--- a/src/parquet/util/memory.cc
+++ b/src/parquet/util/memory.cc
@@ -36,9 +36,7 @@ namespace parquet {
 
 template <class T>
 Vector<T>::Vector(int64_t size, MemoryPool* pool)
-    : buffer_(AllocateBuffer(pool, size * sizeof(T))),
-      size_(size),
-      capacity_(size) {
+    : buffer_(AllocateBuffer(pool, size * sizeof(T))), size_(size), 
capacity_(size) {
   if (size > 0) {
     data_ = reinterpret_cast<T*>(buffer_->mutable_data());
   } else {
@@ -497,8 +495,7 @@ void BufferedInputStream::Advance(int64_t num_bytes) {
 
 std::shared_ptr<ResizableBuffer> AllocateBuffer(MemoryPool* pool, int64_t 
size) {
   std::shared_ptr<ResizableBuffer> result;
-  PARQUET_THROW_NOT_OK(arrow::AllocateResizableBuffer(pool, size,
-                                                      &result));
+  PARQUET_THROW_NOT_OK(arrow::AllocateResizableBuffer(pool, size, &result));
   return result;
 }
 
diff --git a/src/parquet/util/memory.h b/src/parquet/util/memory.h
index 69dcebf4..088f86fe 100644
--- a/src/parquet/util/memory.h
+++ b/src/parquet/util/memory.h
@@ -438,7 +438,7 @@ class PARQUET_EXPORT BufferedInputStream : public 
InputStream {
 };
 
 std::shared_ptr<ResizableBuffer> PARQUET_EXPORT AllocateBuffer(
-  ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), int64_t size = 
0);
+    ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), int64_t size = 
0);
 
 }  // namespace parquet
 


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> [C++] FormatStatValue truncates binary statistics on zero character
> -------------------------------------------------------------------
>
>                 Key: PARQUET-1357
>                 URL: https://issues.apache.org/jira/browse/PARQUET-1357
>             Project: Parquet
>          Issue Type: Bug
>          Components: parquet-cpp
>    Affects Versions: cpp-1.4.0
>            Reporter: Uwe L. Korn
>            Assignee: Uwe L. Korn
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: cpp-1.5.0
>
>
> As {{FormatStatValue}} is currently called with a C-style string, we cannot 
> pass the actual binary content with its length, so the output is truncated at 
> the first zero character. Instead, change the interface to take a 
> {{std::string}}.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to