This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new 65ce439 ORC-1034: Fix the indexOf algorithm in `FileDump` (#943)
65ce439 is described below
commit 65ce439fc7cce84f12657e73659165d20a306bd9
Author: Guiyanakaung <[email protected]>
AuthorDate: Thu Oct 21 04:21:20 2021 +0800
ORC-1034: Fix the indexOf algorithm in `FileDump` (#943)
### What changes were proposed in this pull request?
The previous matching algorithm is wrong because `i` does not backtrack after a
match fails partway through the pattern. For example, with data = OOORC,
pattern = ORC, and index = 1, the algorithm returns -1 instead of 2.
This PR fixes the `indexOf` algorithm.
### Why are the changes needed?
`indexOf` is used to find the ORC file ending identifier when recovering a file,
so it is important to ensure that the method is correct.
### How was this patch tested?
Added a unit test (`testIndexOf` in `TestFileDump`).
---
.../src/java/org/apache/orc/tools/FileDump.java | 21 +++++++++------------
.../src/test/org/apache/orc/tools/TestFileDump.java | 8 ++++++++
2 files changed, 17 insertions(+), 12 deletions(-)
diff --git a/java/tools/src/java/org/apache/orc/tools/FileDump.java
b/java/tools/src/java/org/apache/orc/tools/FileDump.java
index 850743f..58f8958 100644
--- a/java/tools/src/java/org/apache/orc/tools/FileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/FileDump.java
@@ -632,25 +632,22 @@ public final class FileDump {
}
// search for byte pattern in another byte array
- private static int indexOf(final byte[] data, final byte[] pattern, final
int index) {
+ public static int indexOf(final byte[] data, final byte[] pattern, final int
index) {
if (data == null || data.length == 0 || pattern == null || pattern.length
== 0 ||
index > data.length || index < 0) {
return -1;
}
- int j = 0;
- for (int i = index; i < data.length; i++) {
- if (pattern[j] == data[i]) {
- j++;
- } else {
- j = 0;
- }
-
- if (j == pattern.length) {
- return i - pattern.length + 1;
+ for (int i = index; i < data.length - pattern.length + 1; i++) {
+ boolean found = true;
+ for (int j = 0; j < pattern.length; j++) {
+ if (data[i + j] != pattern[j]) {
+ found = false;
+ break;
+ }
}
+ if (found) return i;
}
-
return -1;
}
diff --git a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
index 5bebbef..fd1923a 100644
--- a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
+++ b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
@@ -703,4 +703,12 @@ public class TestFileDump {
assumeTrue(!System.getProperty("os.name").startsWith("Windows"));
TestFileDump.checkOutput(outputFilename, workDir + File.separator +
outputFilename);
}
+
+ @Test
+ public void testIndexOf() {
+ byte[] bytes = ("OO" + OrcFile.MAGIC).getBytes(StandardCharsets.UTF_8);
+ byte[] pattern = OrcFile.MAGIC.getBytes(StandardCharsets.UTF_8);
+
+ assertEquals(FileDump.indexOf(bytes, pattern, 1), 2);
+ }
}