[orc] branch main updated: ORC-1030: Java Tools may have missed OrcFile.MAGIC during file recovery (#941)

dongjoon Wed, 27 Oct 2021 02:52:46 -0700

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git



The following commit(s) were added to refs/heads/main by this push:
     new d3f998e  ORC-1030: Java Tools may have missed OrcFile.MAGIC during 
file recovery (#941)
d3f998e is described below

commit d3f998ecb8f92b9408f44a879463a3938e242d33
Author: Guiyanakaung <[email protected]>
AuthorDate: Wed Oct 27 17:52:33 2021 +0800

    ORC-1030: Java Tools may have missed OrcFile.MAGIC during file recovery 
(#941)
    
    ### What changes were proposed in this pull request?
    
    This pr is aimed at fixing some bugs where the Java Tools recovery file 
command did not accurately find OrcFileMAGIC.
    ```java
            while (remaining > 0) {
              int toRead = (int) Math.min(DEFAULT_BLOCK_SIZE, remaining);
              byte[] data = new byte[toRead];
              long startPos = corruptFileLen - remaining;
              fdis.readFully(startPos, data, 0, toRead);
    
              // find all MAGIC string and see if the file is readable from 
there
              int index = 0;
              long nextFooterOffset;
              byte[] magicBytes = 
OrcFile.MAGIC.getBytes(StandardCharsets.UTF_8);
              while (index != -1) {
                index = indexOf(data, magicBytes, index + 1);
                if (index != -1) {
                  nextFooterOffset = startPos + index + magicBytes.length + 1;
                  if (isReadable(corruptPath, conf, nextFooterOffset)) {
                    footerOffsets.add(nextFooterOffset);
                  }
                }
              }
    
              System.err.println("Scanning for valid footers - startPos: " + 
startPos +
                  " toRead: " + toRead + " remaining: " + remaining);
              remaining = remaining - toRead;
            }
    ```
    Example:
    First read:   data = [..........,'O', 'R']
    Second read:  data = ['C', ...........]
    The previous implementation only looked within data and did not consider 
OrcFile.MAGIC across two reads.
    
    ### Why are the changes needed?
    
    Fix bug: Java Tools may miss OrcFileMAGIC during file recovery.
    
    ### How was this patch tested?
    
    Add UT.
---
 .../src/java/org/apache/orc/tools/FileDump.java    | 29 +++++--
 .../test/org/apache/orc/tools/TestFileDump.java    | 99 +++++++++++++++++++++-
 2 files changed, 121 insertions(+), 7 deletions(-)

diff --git a/java/tools/src/java/org/apache/orc/tools/FileDump.java 
b/java/tools/src/java/org/apache/orc/tools/FileDump.java
index 63f5194..b0549c1 100644
--- a/java/tools/src/java/org/apache/orc/tools/FileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/FileDump.java
@@ -62,6 +62,7 @@ import java.util.List;
 public final class FileDump {
   public static final String UNKNOWN = "UNKNOWN";
   public static final String SEPARATOR = StringUtils.repeat("_", 120) + "\n";
+  public static final String RECOVER_READ_SIZE = "orc.recover.read.size"; // 
only for testing
   public static final int DEFAULT_BLOCK_SIZE = 256 * 1024 * 1024;
   public static final String DEFAULT_BACKUP_PATH = 
System.getProperty("java.io.tmpdir");
   public static final PathFilter HIDDEN_AND_SIDE_FILE_FILTER = new 
PathFilter() {
@@ -459,6 +460,11 @@ public final class FileDump {
   private static void recoverFiles(final List<String> corruptFiles, final 
Configuration conf,
       final String backup)
       throws IOException {
+
+    byte[] magicBytes = OrcFile.MAGIC.getBytes(StandardCharsets.UTF_8);
+    int magicLength = magicBytes.length;
+    int readSize = conf.getInt(RECOVER_READ_SIZE, DEFAULT_BLOCK_SIZE);
+
     for (String corruptFile : corruptFiles) {
       System.err.println("Recovering file " + corruptFile);
       Path corruptPath = new Path(corruptFile);
@@ -470,17 +476,30 @@ public final class FileDump {
         List<Long> footerOffsets = new ArrayList<>();
 
         // start reading the data file form top to bottom and record the valid 
footers
-        while (remaining > 0) {
-          int toRead = (int) Math.min(DEFAULT_BLOCK_SIZE, remaining);
-          byte[] data = new byte[toRead];
+        while (remaining > 0 && corruptFileLen > (2L * magicLength)) {
+          int toRead = (int) Math.min(readSize, remaining);
           long startPos = corruptFileLen - remaining;
-          fdis.readFully(startPos, data, 0, toRead);
+          byte[] data;
+          if (startPos == 0) {
+            data = new byte[toRead];
+            fdis.readFully(startPos, data, 0, toRead);
+          }
+          else {
+            // For non-first reads, we let startPos move back magicLength bytes
+            // which prevents two adjacent reads from separating OrcFile.MAGIC
+            startPos = startPos - magicLength;
+            data = new byte[toRead + magicLength];
+            fdis.readFully(startPos, data, 0, toRead + magicLength);
+          }
 
           // find all MAGIC string and see if the file is readable from there
           int index = 0;
           long nextFooterOffset;
-          byte[] magicBytes = OrcFile.MAGIC.getBytes(StandardCharsets.UTF_8);
           while (index != -1) {
+            // There are two reasons for searching from index + 1
+            // 1. to skip the OrcFile.MAGIC in the file header when the first 
match is made
+            // 2. When the match is successful, the index is moved backwards 
to search for
+            // the subsequent OrcFile.MAGIC
             index = indexOf(data, magicBytes, index + 1);
             if (index != -1) {
               nextFooterOffset = startPos + index + magicBytes.length + 1;
diff --git a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java 
b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
index fd1923a..ce916d2 100644
--- a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
+++ b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
@@ -44,10 +44,14 @@ import org.apache.orc.Writer;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 
+import java.io.BufferedInputStream;
 import java.io.BufferedReader;
 import java.io.ByteArrayOutputStream;
 import java.io.File;
+import java.io.FileInputStream;
 import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
 import java.io.PrintStream;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
@@ -57,12 +61,12 @@ import java.text.SimpleDateFormat;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
-import java.util.Locale;
 import java.util.Map;
 import java.util.Random;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import static org.apache.orc.tools.FileDump.RECOVER_READ_SIZE;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertNull;
@@ -709,6 +713,97 @@ public class TestFileDump {
     byte[] bytes = ("OO" + OrcFile.MAGIC).getBytes(StandardCharsets.UTF_8);
     byte[] pattern = OrcFile.MAGIC.getBytes(StandardCharsets.UTF_8);
 
-    assertEquals(FileDump.indexOf(bytes, pattern, 1), 2);
+    assertEquals(2, FileDump.indexOf(bytes, pattern, 1));
+  }
+
+  @Test
+  public void testRecover() throws Exception {
+    TypeDescription schema = getMyRecordType();
+    Writer writer = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf)
+            .fileSystem(fs)
+            .setSchema(schema));
+    Random r1 = new Random(1);
+    String[] words = new String[]{"It", "was", "the", "best", "of", "times,",
+        "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age",
+        "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it",
+        "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch",
+        "of", "incredulity,", "it", "was", "the", "season", "of", "Light,",
+        "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the",
+        "spring", "of", "hope,", "it", "was", "the", "winter", "of", 
"despair,",
+        "we", "had", "everything", "before", "us,", "we", "had", "nothing",
+        "before", "us,", "we", "were", "all", "going", "direct", "to",
+        "Heaven,", "we", "were", "all", "going", "direct", "the", "other",
+        "way"};
+    VectorizedRowBatch batch = schema.createRowBatch(1000);
+    for(int i=0; i < 21000; ++i) {
+      appendMyRecord(batch, r1.nextInt(), r1.nextLong(),
+          words[r1.nextInt(words.length)]);
+      if (batch.size == batch.getMaxSize()) {
+        writer.addRowBatch(batch);
+        batch.reset();
+      }
+    }
+    if (batch.size > 0) {
+      writer.addRowBatch(batch);
+    }
+    writer.close();
+
+    long fileSize = fs.getFileStatus(testFilePath).getLen();
+
+    String testFilePathStr = Path.mergePaths(
+        workDir, Path.mergePaths(new Path(Path.SEPARATOR), testFilePath))
+        .toUri().getPath();
+
+    String copyTestFilePathStr = Path.mergePaths(
+        workDir, Path.mergePaths(new Path(Path.SEPARATOR),
+                new Path("CopyTestFileDump.testDump.orc")))
+        .toUri().getPath();
+
+    String testCrcFilePathStr = Path.mergePaths(
+        workDir, Path.mergePaths(new Path(Path.SEPARATOR),
+                new Path(".TestFileDump.testDump.orc.crc")))
+        .toUri().getPath();
+
+    try {
+      Files.copy(Paths.get(testFilePathStr), Paths.get(copyTestFilePathStr));
+
+      // Append write data to make it a corrupt file
+      try (FileOutputStream output = new FileOutputStream(testFilePathStr, 
true)) {
+        output.write(new byte[1024]);
+        output.write(OrcFile.MAGIC.getBytes(StandardCharsets.UTF_8));
+        output.write(new byte[1024]);
+        output.flush();
+      }
+
+      // Clean up the crc file and append data to avoid checksum read 
exceptions
+      Files.delete(Paths.get(testCrcFilePathStr));
+
+      conf.setInt(RECOVER_READ_SIZE, (int) (fileSize - 2));
+
+      FileDump.main(conf, new String[]{"--recover", "--skip-dump",
+          testFilePath.toUri().getPath()});
+
+      assertTrue(contentEquals(testFilePathStr, copyTestFilePathStr));
+    } finally {
+      Files.delete(Paths.get(copyTestFilePathStr));
+    }
+  }
+
+  private static boolean contentEquals(String filePath, String otherFilePath) 
throws IOException {
+    try (InputStream is = new BufferedInputStream(new 
FileInputStream(filePath));
+         InputStream otherIs = new BufferedInputStream(new 
FileInputStream(otherFilePath))) {
+      int ch = is.read();
+      while (-1 != ch) {
+        int ch2 = otherIs.read();
+        if (ch != ch2) {
+          return false;
+        }
+        ch = is.read();
+      }
+
+      int ch2 = otherIs.read();
+      return ch2 == -1;
+    }
   }
 }

[orc] branch main updated: ORC-1030: Java Tools may have missed OrcFile.MAGIC during file recovery (#941)

Reply via email to