This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new 65ce439  ORC-1034: Fix the indexOf algorithm in `FileDump` (#943)
65ce439 is described below

commit 65ce439fc7cce84f12657e73659165d20a306bd9
Author: Guiyanakaung <[email protected]>
AuthorDate: Thu Oct 21 04:21:20 2021 +0800

    ORC-1034: Fix the indexOf algorithm in `FileDump` (#943)
    
    ### What changes were proposed in this pull request?
    
    The existing matching algorithm is wrong because `i` does not backtrack
    after a partial match fails in the middle. For example, with data = OOORC,
    pattern = ORC, and index = 1, the algorithm returns -1 instead of 2.
    
    This PR aims to fix the `indexOf` algorithm.
    
    ### Why are the changes needed?
    
    `indexOf` is used to find the ORC file ending identifier when recovering a
    file, so it is important to ensure that the method is correct.
    
    ### How was this patch tested?
    
    Added a unit test (`testIndexOf` in `TestFileDump`).
---
 .../src/java/org/apache/orc/tools/FileDump.java     | 21 +++++++++------------
 .../src/test/org/apache/orc/tools/TestFileDump.java |  8 ++++++++
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/java/tools/src/java/org/apache/orc/tools/FileDump.java 
b/java/tools/src/java/org/apache/orc/tools/FileDump.java
index 850743f..58f8958 100644
--- a/java/tools/src/java/org/apache/orc/tools/FileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/FileDump.java
@@ -632,25 +632,22 @@ public final class FileDump {
   }
 
   // search for byte pattern in another byte array
-  private static int indexOf(final byte[] data, final byte[] pattern, final 
int index) {
+  public static int indexOf(final byte[] data, final byte[] pattern, final int 
index) {
     if (data == null || data.length == 0 || pattern == null || pattern.length 
== 0 ||
         index > data.length || index < 0) {
       return -1;
     }
 
-    int j = 0;
-    for (int i = index; i < data.length; i++) {
-      if (pattern[j] == data[i]) {
-        j++;
-      } else {
-        j = 0;
-      }
-
-      if (j == pattern.length) {
-        return i - pattern.length + 1;
+    for (int i = index; i < data.length - pattern.length + 1; i++) {
+      boolean found = true;
+      for (int j = 0; j < pattern.length; j++) {
+        if (data[i + j] != pattern[j]) {
+          found = false;
+          break;
+        }
       }
+      if (found) return i;
     }
-
     return -1;
   }
 
diff --git a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java 
b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
index 5bebbef..fd1923a 100644
--- a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
+++ b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
@@ -703,4 +703,12 @@ public class TestFileDump {
     assumeTrue(!System.getProperty("os.name").startsWith("Windows"));
     TestFileDump.checkOutput(outputFilename, workDir + File.separator + 
outputFilename);
   }
+
+  @Test
+  public void testIndexOf() {
+    byte[] bytes = ("OO" + OrcFile.MAGIC).getBytes(StandardCharsets.UTF_8);
+    byte[] pattern = OrcFile.MAGIC.getBytes(StandardCharsets.UTF_8);
+
+    assertEquals(FileDump.indexOf(bytes, pattern, 1), 2);
+  }
 }

Reply via email to