HADOOP-6852. apparent bug in concatenated-bzip2 support (decoding). Contributed 
by Zsolt Venczel.


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/2bc3351e
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/2bc3351e
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/2bc3351e

Branch: refs/heads/HDFS-12996
Commit: 2bc3351eaf240ea685bcf5042d79f1554bf89e00
Parents: 92cbbfe
Author: Sean Mackrory <mackror...@apache.org>
Authored: Wed Feb 21 12:53:18 2018 -0700
Committer: Sean Mackrory <mackror...@apache.org>
Committed: Wed Feb 21 12:57:14 2018 -0700

----------------------------------------------------------------------
 .../hadoop-client-minicluster/pom.xml           |   1 +
 .../apache/hadoop/io/compress/BZip2Codec.java   |   3 +-
 .../mapred/TestConcatenatedCompressedInput.java |  84 +++++++++----------
 .../src/test/resources/testdata/concat.bz2      | Bin 0 -> 208 bytes
 .../src/test/resources/testdata/concat.gz       | Bin 0 -> 148 bytes
 .../testdata/testCompressThenConcat.txt.bz2     | Bin 0 -> 3056 bytes
 .../testdata/testCompressThenConcat.txt.gz      | Bin 0 -> 3413 bytes
 .../testdata/testConcatThenCompress.txt.bz2     | Bin 0 -> 2567 bytes
 .../testdata/testConcatThenCompress.txt.gz      | Bin 0 -> 2734 bytes
 9 files changed, 42 insertions(+), 46 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/2bc3351e/hadoop-client-modules/hadoop-client-minicluster/pom.xml
----------------------------------------------------------------------
diff --git a/hadoop-client-modules/hadoop-client-minicluster/pom.xml 
b/hadoop-client-modules/hadoop-client-minicluster/pom.xml
index 905d53a..a443648 100644
--- a/hadoop-client-modules/hadoop-client-minicluster/pom.xml
+++ b/hadoop-client-modules/hadoop-client-minicluster/pom.xml
@@ -615,6 +615,7 @@
                       <excludes>
                         <exclude>testjar/*</exclude>
                         <exclude>testshell/*</exclude>
+                        <exclude>testdata/*</exclude>
                       </excludes>
                     </filter>
                     <!-- Mockito tries to include its own unrelocated copy of 
hamcrest. :( -->

http://git-wip-us.apache.org/repos/asf/hadoop/blob/2bc3351e/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/BZip2Codec.java
----------------------------------------------------------------------
diff --git 
a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/BZip2Codec.java
 
b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/BZip2Codec.java
index 3c78cfc..99590ed 100644
--- 
a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/BZip2Codec.java
+++ 
b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/BZip2Codec.java
@@ -180,7 +180,8 @@ public class BZip2Codec implements Configurable, 
SplittableCompressionCodec {
       new DecompressorStream(in, decompressor,
                              conf.getInt(IO_FILE_BUFFER_SIZE_KEY,
                                  IO_FILE_BUFFER_SIZE_DEFAULT)) :
-      new BZip2CompressionInputStream(in);
+      new BZip2CompressionInputStream(
+              in, 0L, Long.MAX_VALUE, READ_MODE.BYBLOCK);
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/hadoop/blob/2bc3351e/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/TestConcatenatedCompressedInput.java
----------------------------------------------------------------------
diff --git 
a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/TestConcatenatedCompressedInput.java
 
b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/TestConcatenatedCompressedInput.java
index 977d083..af6b952 100644
--- 
a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/TestConcatenatedCompressedInput.java
+++ 
b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/TestConcatenatedCompressedInput.java
@@ -18,18 +18,6 @@
 
 package org.apache.hadoop.mapred;
 
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
-import java.io.ByteArrayInputStream;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.zip.Inflater;
-
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.LongWritable;
@@ -42,16 +30,26 @@ import org.apache.hadoop.io.compress.zlib.ZlibFactory;
 import org.apache.hadoop.util.LineReader;
 import org.apache.hadoop.util.ReflectionUtils;
 import org.junit.After;
-import org.junit.Ignore;
 import org.junit.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-@Ignore
+import java.io.ByteArrayInputStream;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.zip.Inflater;
+
+import static org.junit.Assert.*;
+
+/**
+ * Test class for concatenated {@link CompressionInputStream}.
+ */
 public class TestConcatenatedCompressedInput {
   private static final Logger LOG =
       LoggerFactory.getLogger(TestConcatenatedCompressedInput.class);
-  private static int MAX_LENGTH = 10000;
   private static JobConf defaultConf = new JobConf();
   private static FileSystem localFs = null;
 
@@ -85,13 +83,15 @@ public class TestConcatenatedCompressedInput {
   public void after() {
     ZlibFactory.loadNativeZLib();
   }
+
+  private static final String DEFAULT_WORK_DIR = 
"target/test-classes/testdata";
   private static Path workDir = localFs.makeQualified(new Path(
-      System.getProperty("test.build.data", "/tmp"),
+      System.getProperty("test.build.data", DEFAULT_WORK_DIR),
       "TestConcatenatedCompressedInput"));
 
   private static LineReader makeStream(String str) throws IOException {
-    return new LineReader(new ByteArrayInputStream(str.getBytes("UTF-8")),
-                          defaultConf);
+    return new LineReader(new ByteArrayInputStream(
+            str.getBytes("UTF-8")), defaultConf);
   }
 
   private static void writeFile(FileSystem fs, Path name,
@@ -190,7 +190,8 @@ public class TestConcatenatedCompressedInput {
 
     // copy prebuilt (correct!) version of concat.gz to HDFS
     final String fn = "concat" + gzip.getDefaultExtension();
-    Path fnLocal = new Path(System.getProperty("test.concat.data", "/tmp"), 
fn);
+    Path fnLocal = new Path(
+            System.getProperty("test.concat.data", DEFAULT_WORK_DIR), fn);
     Path fnHDFS  = new Path(workDir, fn);
     localFs.copyFromLocalFile(fnLocal, fnHDFS);
 
@@ -227,7 +228,7 @@ public class TestConcatenatedCompressedInput {
   @Test
   public void testPrototypeInflaterGzip() throws IOException {
     CompressionCodec gzip = new GzipCodec();  // used only for file extension
-    localFs.delete(workDir, true);            // localFs = FileSystem instance
+    localFs.delete(workDir, true); // localFs = FileSystem instance
 
     System.out.println(COLOR_BR_BLUE + "testPrototypeInflaterGzip() using " +
       "non-native/Java Inflater and manual gzip header/trailer parsing" +
@@ -235,7 +236,8 @@ public class TestConcatenatedCompressedInput {
 
     // copy prebuilt (correct!) version of concat.gz to HDFS
     final String fn = "concat" + gzip.getDefaultExtension();
-    Path fnLocal = new Path(System.getProperty("test.concat.data", "/tmp"), 
fn);
+    Path fnLocal = new Path(
+            System.getProperty("test.concat.data", DEFAULT_WORK_DIR), fn);
     Path fnHDFS  = new Path(workDir, fn);
     localFs.copyFromLocalFile(fnLocal, fnHDFS);
 
@@ -326,14 +328,16 @@ public class TestConcatenatedCompressedInput {
 
     // copy single-member test file to HDFS
     String fn1 = "testConcatThenCompress.txt" + gzip.getDefaultExtension();
-    Path fnLocal1 = new 
Path(System.getProperty("test.concat.data","/tmp"),fn1);
+    Path fnLocal1 = new Path(
+            System.getProperty("test.concat.data", DEFAULT_WORK_DIR), fn1);
     Path fnHDFS1  = new Path(workDir, fn1);
     localFs.copyFromLocalFile(fnLocal1, fnHDFS1);
 
     // copy multiple-member test file to HDFS
     // (actually in "seekable gzip" format, a la JIRA PIG-42)
     String fn2 = "testCompressThenConcat.txt" + gzip.getDefaultExtension();
-    Path fnLocal2 = new 
Path(System.getProperty("test.concat.data","/tmp"),fn2);
+    Path fnLocal2 = new Path(
+            System.getProperty("test.concat.data", DEFAULT_WORK_DIR), fn2);
     Path fnHDFS2  = new Path(workDir, fn2);
     localFs.copyFromLocalFile(fnLocal2, fnHDFS2);
 
@@ -439,7 +443,8 @@ public class TestConcatenatedCompressedInput {
     InputSplit[] splits = format.getSplits(jConf, 100);
     assertEquals("compressed splits == 2", 2, splits.length);
     FileSplit tmp = (FileSplit) splits[0];
-    if (tmp.getPath().getName().equals("testCompressThenConcat.txt.gz")) {
+    if (tmp.getPath()
+            .getName().equals("testdata/testCompressThenConcat.txt.gz")) {
       System.out.println("  (swapping)");
       splits[0] = splits[1];
       splits[1] = tmp;
@@ -481,7 +486,8 @@ public class TestConcatenatedCompressedInput {
 
     // copy prebuilt (correct!) version of concat.bz2 to HDFS
     final String fn = "concat" + bzip2.getDefaultExtension();
-    Path fnLocal = new Path(System.getProperty("test.concat.data", "/tmp"), 
fn);
+    Path fnLocal = new Path(
+            System.getProperty("test.concat.data", DEFAULT_WORK_DIR), fn);
     Path fnHDFS  = new Path(workDir, fn);
     localFs.copyFromLocalFile(fnLocal, fnHDFS);
 
@@ -531,13 +537,15 @@ public class TestConcatenatedCompressedInput {
 
     // copy single-member test file to HDFS
     String fn1 = "testConcatThenCompress.txt" + bzip2.getDefaultExtension();
-    Path fnLocal1 = new 
Path(System.getProperty("test.concat.data","/tmp"),fn1);
+    Path fnLocal1 = new Path(
+            System.getProperty("test.concat.data", DEFAULT_WORK_DIR), fn1);
     Path fnHDFS1  = new Path(workDir, fn1);
     localFs.copyFromLocalFile(fnLocal1, fnHDFS1);
 
     // copy multiple-member test file to HDFS
     String fn2 = "testCompressThenConcat.txt" + bzip2.getDefaultExtension();
-    Path fnLocal2 = new 
Path(System.getProperty("test.concat.data","/tmp"),fn2);
+    Path fnLocal2 = new Path(
+            System.getProperty("test.concat.data", DEFAULT_WORK_DIR), fn2);
     Path fnHDFS2  = new Path(workDir, fn2);
     localFs.copyFromLocalFile(fnLocal2, fnHDFS2);
 
@@ -549,21 +557,6 @@ public class TestConcatenatedCompressedInput {
     assertEquals("concat bytes available", 2567, in1.available());
     assertEquals("concat bytes available", 3056, in2.available());
 
-/*
-    // FIXME
-    // The while-loop below dies at the beginning of the 2nd concatenated
-    // member (after 17 lines successfully read) with:
-    //
-    //   java.io.IOException: bad block header
-    //   at org.apache.hadoop.io.compress.bzip2.CBZip2InputStream.initBlock(
-    //   CBZip2InputStream.java:527)
-    //
-    // It is not critical to concatenated-gzip support, HADOOP-6835, so it's
-    // simply commented out for now (and HADOOP-6852 filed).  If and when the
-    // latter issue is resolved--perhaps by fixing an error here--this code
-    // should be reenabled.  Note that the doMultipleBzip2BufferSizes() test
-    // below uses the same testCompressThenConcat.txt.bz2 file but works fine.
-
     CompressionInputStream cin2 = bzip2.createInputStream(in2);
     LineReader in = new LineReader(cin2);
     Text out = new Text();
@@ -578,7 +571,6 @@ public class TestConcatenatedCompressedInput {
                  5346, totalBytes);
     assertEquals("total uncompressed lines in concatenated test file",
                  84, lineNum);
- */
 
     // test CBZip2InputStream with lots of different input-buffer sizes
     doMultipleBzip2BufferSizes(jobConf);
@@ -645,7 +637,8 @@ public class TestConcatenatedCompressedInput {
 
   // this tests both files (testCompressThenConcat, testConcatThenCompress); 
all
   // should work with existing Java bzip2 decoder and any future native version
-  private static void doSingleBzip2BufferSize(JobConf jConf) throws 
IOException {
+  private static void doSingleBzip2BufferSize(JobConf jConf)
+          throws IOException {
     TextInputFormat format = new TextInputFormat();
     format.configure(jConf);
     format.setMinSplitSize(5500);  // work around 256-byte/22-splits issue
@@ -654,7 +647,8 @@ public class TestConcatenatedCompressedInput {
     InputSplit[] splits = format.getSplits(jConf, 100);
     assertEquals("compressed splits == 2", 2, splits.length);
     FileSplit tmp = (FileSplit) splits[0];
-    if (tmp.getPath().getName().equals("testCompressThenConcat.txt.gz")) {
+    if (tmp.getPath()
+            .getName().equals("testdata/testCompressThenConcat.txt.gz")) {
       System.out.println("  (swapping)");
       splits[0] = splits[1];
       splits[1] = tmp;

http://git-wip-us.apache.org/repos/asf/hadoop/blob/2bc3351e/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/resources/testdata/concat.bz2
----------------------------------------------------------------------
diff --git 
a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/resources/testdata/concat.bz2
 
b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/resources/testdata/concat.bz2
new file mode 100644
index 0000000..f31fb0c
Binary files /dev/null and 
b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/resources/testdata/concat.bz2
 differ

http://git-wip-us.apache.org/repos/asf/hadoop/blob/2bc3351e/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/resources/testdata/concat.gz
----------------------------------------------------------------------
diff --git 
a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/resources/testdata/concat.gz
 
b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/resources/testdata/concat.gz
new file mode 100644
index 0000000..53d5a07
Binary files /dev/null and 
b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/resources/testdata/concat.gz
 differ

http://git-wip-us.apache.org/repos/asf/hadoop/blob/2bc3351e/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/resources/testdata/testCompressThenConcat.txt.bz2
----------------------------------------------------------------------
diff --git 
a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/resources/testdata/testCompressThenConcat.txt.bz2
 
b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/resources/testdata/testCompressThenConcat.txt.bz2
new file mode 100644
index 0000000..a21c0e2
Binary files /dev/null and 
b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/resources/testdata/testCompressThenConcat.txt.bz2
 differ

http://git-wip-us.apache.org/repos/asf/hadoop/blob/2bc3351e/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/resources/testdata/testCompressThenConcat.txt.gz
----------------------------------------------------------------------
diff --git 
a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/resources/testdata/testCompressThenConcat.txt.gz
 
b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/resources/testdata/testCompressThenConcat.txt.gz
new file mode 100644
index 0000000..75e5f8c
Binary files /dev/null and 
b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/resources/testdata/testCompressThenConcat.txt.gz
 differ

http://git-wip-us.apache.org/repos/asf/hadoop/blob/2bc3351e/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/resources/testdata/testConcatThenCompress.txt.bz2
----------------------------------------------------------------------
diff --git 
a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/resources/testdata/testConcatThenCompress.txt.bz2
 
b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/resources/testdata/testConcatThenCompress.txt.bz2
new file mode 100644
index 0000000..5983e52
Binary files /dev/null and 
b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/resources/testdata/testConcatThenCompress.txt.bz2
 differ

http://git-wip-us.apache.org/repos/asf/hadoop/blob/2bc3351e/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/resources/testdata/testConcatThenCompress.txt.gz
----------------------------------------------------------------------
diff --git 
a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/resources/testdata/testConcatThenCompress.txt.gz
 
b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/resources/testdata/testConcatThenCompress.txt.gz
new file mode 100644
index 0000000..6e8eaa5
Binary files /dev/null and 
b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/resources/testdata/testConcatThenCompress.txt.gz
 differ


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-commits-h...@hadoop.apache.org

Reply via email to