This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new d3f998e ORC-1030: Java Tools may have missed OrcFile.MAGIC during
file recovery (#941)
d3f998e is described below
commit d3f998ecb8f92b9408f44a879463a3938e242d33
Author: Guiyanakaung <[email protected]>
AuthorDate: Wed Oct 27 17:52:33 2021 +0800
ORC-1030: Java Tools may have missed OrcFile.MAGIC during file recovery
(#941)
### What changes were proposed in this pull request?
This pr is aimed at fixing some bugs where the Java Tools recovery file
command did not accurately find OrcFileMAGIC.
```java
while (remaining > 0) {
int toRead = (int) Math.min(DEFAULT_BLOCK_SIZE, remaining);
byte[] data = new byte[toRead];
long startPos = corruptFileLen - remaining;
fdis.readFully(startPos, data, 0, toRead);
// find all MAGIC string and see if the file is readable from
there
int index = 0;
long nextFooterOffset;
byte[] magicBytes =
OrcFile.MAGIC.getBytes(StandardCharsets.UTF_8);
while (index != -1) {
index = indexOf(data, magicBytes, index + 1);
if (index != -1) {
nextFooterOffset = startPos + index + magicBytes.length + 1;
if (isReadable(corruptPath, conf, nextFooterOffset)) {
footerOffsets.add(nextFooterOffset);
}
}
}
System.err.println("Scanning for valid footers - startPos: " +
startPos +
" toRead: " + toRead + " remaining: " + remaining);
remaining = remaining - toRead;
}
```
Example:
First read: data = [..........,'O', 'R']
Second read: data = ['C', ...........]
The previous implementation only looked within data and did not consider
OrcFile.MAGIC across two reads.
### Why are the changes needed?
Fix bug: Java Tools may miss OrcFileMAGIC during file recovery.
### How was this patch tested?
Add UT.
---
.../src/java/org/apache/orc/tools/FileDump.java | 29 +++++--
.../test/org/apache/orc/tools/TestFileDump.java | 99 +++++++++++++++++++++-
2 files changed, 121 insertions(+), 7 deletions(-)
diff --git a/java/tools/src/java/org/apache/orc/tools/FileDump.java
b/java/tools/src/java/org/apache/orc/tools/FileDump.java
index 63f5194..b0549c1 100644
--- a/java/tools/src/java/org/apache/orc/tools/FileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/FileDump.java
@@ -62,6 +62,7 @@ import java.util.List;
public final class FileDump {
public static final String UNKNOWN = "UNKNOWN";
public static final String SEPARATOR = StringUtils.repeat("_", 120) + "\n";
+ public static final String RECOVER_READ_SIZE = "orc.recover.read.size"; //
only for testing
public static final int DEFAULT_BLOCK_SIZE = 256 * 1024 * 1024;
public static final String DEFAULT_BACKUP_PATH =
System.getProperty("java.io.tmpdir");
public static final PathFilter HIDDEN_AND_SIDE_FILE_FILTER = new
PathFilter() {
@@ -459,6 +460,11 @@ public final class FileDump {
private static void recoverFiles(final List<String> corruptFiles, final
Configuration conf,
final String backup)
throws IOException {
+
+ byte[] magicBytes = OrcFile.MAGIC.getBytes(StandardCharsets.UTF_8);
+ int magicLength = magicBytes.length;
+ int readSize = conf.getInt(RECOVER_READ_SIZE, DEFAULT_BLOCK_SIZE);
+
for (String corruptFile : corruptFiles) {
System.err.println("Recovering file " + corruptFile);
Path corruptPath = new Path(corruptFile);
@@ -470,17 +476,30 @@ public final class FileDump {
List<Long> footerOffsets = new ArrayList<>();
// start reading the data file form top to bottom and record the valid
footers
- while (remaining > 0) {
- int toRead = (int) Math.min(DEFAULT_BLOCK_SIZE, remaining);
- byte[] data = new byte[toRead];
+ while (remaining > 0 && corruptFileLen > (2L * magicLength)) {
+ int toRead = (int) Math.min(readSize, remaining);
long startPos = corruptFileLen - remaining;
- fdis.readFully(startPos, data, 0, toRead);
+ byte[] data;
+ if (startPos == 0) {
+ data = new byte[toRead];
+ fdis.readFully(startPos, data, 0, toRead);
+ }
+ else {
+ // For non-first reads, we let startPos move back magicLength bytes
+ // which prevents two adjacent reads from separating OrcFile.MAGIC
+ startPos = startPos - magicLength;
+ data = new byte[toRead + magicLength];
+ fdis.readFully(startPos, data, 0, toRead + magicLength);
+ }
// find all MAGIC string and see if the file is readable from there
int index = 0;
long nextFooterOffset;
- byte[] magicBytes = OrcFile.MAGIC.getBytes(StandardCharsets.UTF_8);
while (index != -1) {
+ // There are two reasons for searching from index + 1
+ // 1. to skip the OrcFile.MAGIC in the file header when the first
match is made
+ // 2. When the match is successful, the index is moved backwards
to search for
+ // the subsequent OrcFile.MAGIC
index = indexOf(data, magicBytes, index + 1);
if (index != -1) {
nextFooterOffset = startPos + index + magicBytes.length + 1;
diff --git a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
index fd1923a..ce916d2 100644
--- a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
+++ b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
@@ -44,10 +44,14 @@ import org.apache.orc.Writer;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
+import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
+import java.io.FileInputStream;
import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
import java.io.PrintStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
@@ -57,12 +61,12 @@ import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
-import java.util.Locale;
import java.util.Map;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import static org.apache.orc.tools.FileDump.RECOVER_READ_SIZE;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNull;
@@ -709,6 +713,97 @@ public class TestFileDump {
byte[] bytes = ("OO" + OrcFile.MAGIC).getBytes(StandardCharsets.UTF_8);
byte[] pattern = OrcFile.MAGIC.getBytes(StandardCharsets.UTF_8);
- assertEquals(FileDump.indexOf(bytes, pattern, 1), 2);
+ assertEquals(2, FileDump.indexOf(bytes, pattern, 1));
+ }
+
+ @Test
+ public void testRecover() throws Exception {
+ TypeDescription schema = getMyRecordType();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .fileSystem(fs)
+ .setSchema(schema));
+ Random r1 = new Random(1);
+ String[] words = new String[]{"It", "was", "the", "best", "of", "times,",
+ "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age",
+ "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it",
+ "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch",
+ "of", "incredulity,", "it", "was", "the", "season", "of", "Light,",
+ "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the",
+ "spring", "of", "hope,", "it", "was", "the", "winter", "of",
"despair,",
+ "we", "had", "everything", "before", "us,", "we", "had", "nothing",
+ "before", "us,", "we", "were", "all", "going", "direct", "to",
+ "Heaven,", "we", "were", "all", "going", "direct", "the", "other",
+ "way"};
+ VectorizedRowBatch batch = schema.createRowBatch(1000);
+ for(int i=0; i < 21000; ++i) {
+ appendMyRecord(batch, r1.nextInt(), r1.nextLong(),
+ words[r1.nextInt(words.length)]);
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ }
+ if (batch.size > 0) {
+ writer.addRowBatch(batch);
+ }
+ writer.close();
+
+ long fileSize = fs.getFileStatus(testFilePath).getLen();
+
+ String testFilePathStr = Path.mergePaths(
+ workDir, Path.mergePaths(new Path(Path.SEPARATOR), testFilePath))
+ .toUri().getPath();
+
+ String copyTestFilePathStr = Path.mergePaths(
+ workDir, Path.mergePaths(new Path(Path.SEPARATOR),
+ new Path("CopyTestFileDump.testDump.orc")))
+ .toUri().getPath();
+
+ String testCrcFilePathStr = Path.mergePaths(
+ workDir, Path.mergePaths(new Path(Path.SEPARATOR),
+ new Path(".TestFileDump.testDump.orc.crc")))
+ .toUri().getPath();
+
+ try {
+ Files.copy(Paths.get(testFilePathStr), Paths.get(copyTestFilePathStr));
+
+ // Append write data to make it a corrupt file
+ try (FileOutputStream output = new FileOutputStream(testFilePathStr,
true)) {
+ output.write(new byte[1024]);
+ output.write(OrcFile.MAGIC.getBytes(StandardCharsets.UTF_8));
+ output.write(new byte[1024]);
+ output.flush();
+ }
+
+ // Clean up the crc file and append data to avoid checksum read
exceptions
+ Files.delete(Paths.get(testCrcFilePathStr));
+
+ conf.setInt(RECOVER_READ_SIZE, (int) (fileSize - 2));
+
+ FileDump.main(conf, new String[]{"--recover", "--skip-dump",
+ testFilePath.toUri().getPath()});
+
+ assertTrue(contentEquals(testFilePathStr, copyTestFilePathStr));
+ } finally {
+ Files.delete(Paths.get(copyTestFilePathStr));
+ }
+ }
+
+ private static boolean contentEquals(String filePath, String otherFilePath)
throws IOException {
+ try (InputStream is = new BufferedInputStream(new
FileInputStream(filePath));
+ InputStream otherIs = new BufferedInputStream(new
FileInputStream(otherFilePath))) {
+ int ch = is.read();
+ while (-1 != ch) {
+ int ch2 = otherIs.read();
+ if (ch != ch2) {
+ return false;
+ }
+ ch = is.read();
+ }
+
+ int ch2 = otherIs.read();
+ return ch2 == -1;
+ }
}
}