wgtmac commented on code in PR #38272:
URL: https://github.com/apache/arrow/pull/38272#discussion_r1363350532


##########
cpp/src/arrow/util/compression_test.cc:
##########
@@ -368,6 +368,41 @@ TEST_P(CodecTest, CodecRoundtrip) {
   }
 }
 
+TEST(CodecTest, CodecRoundtripGzipMembers) {
+  int sizes[] = {0, 10000, 100000};

Review Comment:
   nit: use `std::array` or `std::vector` instead of a raw array.
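   
   For instance, a minimal sketch (the `kSizes` name is illustrative, not from the PR):
   
   ```cpp
   #include <array>
   
   // Fixed-size, constexpr-friendly replacement for the raw C array.
   constexpr std::array<int, 3> kSizes = {0, 10000, 100000};
   
   // The loop body stays unchanged:
   // for (int data_half_size : kSizes) { ... }
   ```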



##########
cpp/src/arrow/util/compression_test.cc:
##########
@@ -368,6 +368,41 @@ TEST_P(CodecTest, CodecRoundtrip) {
   }
 }
 
+TEST(CodecTest, CodecRoundtripGzipMembers) {
+  int sizes[] = {0, 10000, 100000};
+
+  std::unique_ptr<Codec> c1;
+  ASSERT_OK_AND_ASSIGN(c1, Codec::Create(Compression::GZIP));
+
+  for (int data_half_size : sizes) {

Review Comment:
   ```suggestion
     for (int data_half_size : {0, 10000, 100000}) {
   ```
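   
   A braced initializer list works directly as the range expression here, so the `sizes` array can go away entirely.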



##########
cpp/src/arrow/util/compression_test.cc:
##########
@@ -368,6 +368,41 @@ TEST_P(CodecTest, CodecRoundtrip) {
   }
 }
 
+TEST(CodecTest, CodecRoundtripGzipMembers) {
+  int sizes[] = {0, 10000, 100000};
+
+  std::unique_ptr<Codec> c1;
+  ASSERT_OK_AND_ASSIGN(c1, Codec::Create(Compression::GZIP));

Review Comment:
   Let's give it a more meaningful name.



##########
cpp/src/arrow/util/compression_test.cc:
##########
@@ -368,6 +368,41 @@ TEST_P(CodecTest, CodecRoundtrip) {
   }
 }
 
+TEST(CodecTest, CodecRoundtripGzipMembers) {
+  int sizes[] = {0, 10000, 100000};
+
+  std::unique_ptr<Codec> c1;
+  ASSERT_OK_AND_ASSIGN(c1, Codec::Create(Compression::GZIP));
+
+  for (int data_half_size : sizes) {
+    int64_t actual_size_p1, actual_size_p2;
+    std::vector<uint8_t> data_half = MakeRandomData(data_half_size);
+    std::vector<uint8_t> data_full(data_half.begin(), data_half.end());
+    data_full.insert(data_full.end(), data_half.begin(), data_half.end());
+
+    int max_compressed_len_half =
+      static_cast<int>(c1->MaxCompressedLen(data_half.size(), data_half.data()));
+    std::vector<uint8_t> compressed(max_compressed_len_half * 2);
+
+    // Compress in 2 steps
+    ASSERT_OK_AND_ASSIGN(actual_size_p1, c1->Compress(data_half.size(), data_half.data(),
+                                         max_compressed_len_half, compressed.data()));
+    ASSERT_OK_AND_ASSIGN(actual_size_p2, c1->Compress(data_half.size(), data_half.data(),
+                                         max_compressed_len_half, compressed.data()+actual_size_p1));

Review Comment:
   ```suggestion
       ASSERT_OK_AND_ASSIGN(int64_t actual_size_p1, gzip_codec->Compress(data_half.size(), data_half.data(),
                                            max_compressed_len_half, compressed.data()));
       ASSERT_OK_AND_ASSIGN(int64_t actual_size_p2, gzip_codec->Compress(data_half.size(), data_half.data(),
                                            max_compressed_len_half, compressed.data() + actual_size_p1));
   ```
   
   We can simply do this: `ASSERT_OK_AND_ASSIGN` can declare its target inline, which also makes the separate `int64_t actual_size_p1, actual_size_p2;` declaration unnecessary.



##########
cpp/src/arrow/util/compression_test.cc:
##########
@@ -368,6 +368,41 @@ TEST_P(CodecTest, CodecRoundtrip) {
   }
 }
 
+TEST(CodecTest, CodecRoundtripGzipMembers) {
+  int sizes[] = {0, 10000, 100000};
+
+  std::unique_ptr<Codec> c1;
+  ASSERT_OK_AND_ASSIGN(c1, Codec::Create(Compression::GZIP));
+
+  for (int data_half_size : sizes) {
+    int64_t actual_size_p1, actual_size_p2;
+    std::vector<uint8_t> data_half = MakeRandomData(data_half_size);
+    std::vector<uint8_t> data_full(data_half.begin(), data_half.end());
+    data_full.insert(data_full.end(), data_half.begin(), data_half.end());
+
+    int max_compressed_len_half =
+      static_cast<int>(c1->MaxCompressedLen(data_half.size(), data_half.data()));
+    std::vector<uint8_t> compressed(max_compressed_len_half * 2);
+
+    // Compress in 2 steps
+    ASSERT_OK_AND_ASSIGN(actual_size_p1, c1->Compress(data_half.size(), data_half.data(),
+                                         max_compressed_len_half, compressed.data()));
+    ASSERT_OK_AND_ASSIGN(actual_size_p2, c1->Compress(data_half.size(), data_half.data(),
+                                         max_compressed_len_half, compressed.data()+actual_size_p1));
+    compressed.resize(actual_size_p1 + actual_size_p2);
+
+    // Decompress the concatenated data
+    std::vector<uint8_t> decompressed(data_half_size*2);

Review Comment:
   ```suggestion
       std::vector<uint8_t> decompressed(data_half_size * 2);
   ```



##########
cpp/src/parquet/reader_test.cc:
##########
@@ -116,6 +116,10 @@ std::string rle_dict_uncompressed_corrupt_checksum() {
   return data_file("rle-dict-uncompressed-corrupt-checksum.parquet");
 }
 
+std::string concatenated_gzip_members() {
+  return data_file("concatenated_gzip_members.parquet");

Review Comment:
   This requires uploading the file to the apache/parquet-testing repo first, then syncing the submodule directory in apache/arrow in this PR.



##########
cpp/src/arrow/util/compression_test.cc:
##########
@@ -368,6 +368,41 @@ TEST_P(CodecTest, CodecRoundtrip) {
   }
 }
 
+TEST(CodecTest, CodecRoundtripGzipMembers) {
+  int sizes[] = {0, 10000, 100000};
+
+  std::unique_ptr<Codec> c1;
+  ASSERT_OK_AND_ASSIGN(c1, Codec::Create(Compression::GZIP));

Review Comment:
   ```suggestion
     ASSERT_OK_AND_ASSIGN(auto gzip_codec, Codec::Create(Compression::GZIP));
   ```
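   
   With `auto` the deduced type is still `std::unique_ptr<Codec>`, and the name makes later call sites like `gzip_codec->Compress(...)` self-documenting.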



##########
cpp/src/arrow/util/compression_test.cc:
##########
@@ -368,6 +368,41 @@ TEST_P(CodecTest, CodecRoundtrip) {
   }
 }
 
+TEST(CodecTest, CodecRoundtripGzipMembers) {
+  int sizes[] = {0, 10000, 100000};
+
+  std::unique_ptr<Codec> c1;
+  ASSERT_OK_AND_ASSIGN(c1, Codec::Create(Compression::GZIP));
+
+  for (int data_half_size : sizes) {
+    int64_t actual_size_p1, actual_size_p2;
+    std::vector<uint8_t> data_half = MakeRandomData(data_half_size);
+    std::vector<uint8_t> data_full(data_half.begin(), data_half.end());
+    data_full.insert(data_full.end(), data_half.begin(), data_half.end());
+
+    int max_compressed_len_half =
+      static_cast<int>(c1->MaxCompressedLen(data_half.size(), data_half.data()));
+    std::vector<uint8_t> compressed(max_compressed_len_half * 2);
+
+    // Compress in 2 steps
+    ASSERT_OK_AND_ASSIGN(actual_size_p1, c1->Compress(data_half.size(), data_half.data(),
+                                         max_compressed_len_half, compressed.data()));
+    ASSERT_OK_AND_ASSIGN(actual_size_p2, c1->Compress(data_half.size(), data_half.data(),
+                                         max_compressed_len_half, compressed.data()+actual_size_p1));
+    compressed.resize(actual_size_p1 + actual_size_p2);
+
+    // Decompress the concatenated data
+    std::vector<uint8_t> decompressed(data_half_size*2);
+    int64_t actual_decompressed_size;
+    ASSERT_OK_AND_ASSIGN(actual_decompressed_size,
+                         c1->Decompress(compressed.size(), compressed.data(),
+                                        decompressed.size(), decompressed.data()));
+
+    ASSERT_EQ(data_half.size()*2, actual_decompressed_size);

Review Comment:
   ```suggestion
       ASSERT_EQ(data_half.size() * 2, actual_decompressed_size);
   ```



##########
cpp/src/arrow/util/compression_test.cc:
##########
@@ -368,6 +368,41 @@ TEST_P(CodecTest, CodecRoundtrip) {
   }
 }
 
+TEST(CodecTest, CodecRoundtripGzipMembers) {
+  int sizes[] = {0, 10000, 100000};
+
+  std::unique_ptr<Codec> c1;
+  ASSERT_OK_AND_ASSIGN(c1, Codec::Create(Compression::GZIP));
+
+  for (int data_half_size : sizes) {

Review Comment:
   Or simply inline the initializer list in the range-for, as in the suggestion above.


