[GitHub] [orc] dongjoon-hyun commented on a change in pull request #582: ORC-697: Improve scan tool to report the location of corruption.

GitBox Mon, 14 Dec 2020 15:49:32 -0800


dongjoon-hyun commented on a change in pull request #582:
URL: https://github.com/apache/orc/pull/582#discussion_r542930490




##########
File path: java/tools/src/java/org/apache/orc/tools/ScanData.java
##########
@@ -40,41 +40,168 @@
 
   static CommandLine parseCommandLine(String[] args) throws ParseException {
     Options options = new Options()
-        .addOption("help", "h", false, "Provide help");
-    return new GnuParser().parse(options, args);
+        .addOption("s", "schema", false, "Print schema")
+        .addOption("h", "help", false, "Provide help");
+    return new DefaultParser().parse(options, args);
   }
 
+  static int calculateBestVectorSize(int indexStride) {
+    if (indexStride == 0) {
+      return 1024;
+    }
+    // how many 1024 batches do we have in an index stride?
+    int batchCount = (indexStride + 1023) / 1024;
+    return indexStride / batchCount;
+  }
+
+  static class LocationInfo {
+    final long firstRow;
+    final long followingRow;
+    final int stripeId;
+    final long row;
+
+    LocationInfo(long firstRow, long followingRow, int stripeId,
+        long row) {
+      this.firstRow = firstRow;
+      this.followingRow = followingRow;
+      this.stripeId = stripeId;
+      this.row = row;
+    }
+
+    public String toString() {
+      return String.format("row %d in stripe %d (rows %d-%d)",
+          row, stripeId, firstRow, followingRow);
+    }
+  }
+
+  /**
+   * Given a row, find the stripe that contains that row.
+   * @param reader the file reader
+   * @param row the global row number in the file
+   * @return the information about that row in the file
+   */
+  static LocationInfo findStripeInfo(Reader reader, long row) {
+    long firstRow = 0;
+    int stripeId = 0;
+    for (StripeInformation stripe: reader.getStripes()) {
+      stripeId += 1;
+      long lastRow = firstRow + stripe.getNumberOfRows();
+      if (firstRow <= row && row < lastRow) {
+        return new LocationInfo(firstRow, lastRow, stripeId, row);
+      }
+      firstRow = lastRow;
+    }
+    return new LocationInfo(reader.getNumberOfRows(),
+        reader.getNumberOfRows(), stripeId, row);
+  }
 
-  static void main(Configuration conf, String[] args
-                   ) throws IOException, JSONException, ParseException {
+  /**
+   * Given a failure point, find the first place that the ORC reader can
+   * recover.
+   * @param reader the ORC reader
+   * @param current the position of the failure
+   * @param batchSize the size of the batch that we tried to read
+   * @return the location that we should recover to
+   */
+  static LocationInfo findRecoveryPoint(Reader reader, LocationInfo current,
+                                        int batchSize) {
+    int stride = reader.getRowIndexStride();
+    long result;
+    // In the worst case, just move to the next stripe
+    if (stride == 0 ||
+        current.row + batchSize >= current.followingRow) {
+      result = current.followingRow;
+    } else {
+      long rowInStripe = current.row + batchSize - current.firstRow;
+      result = current.firstRow + (rowInStripe + stride - 1) / stride * stride;
+    }
+    return findStripeInfo(reader, result);
+  }
+
+  static boolean findBadColumns(Reader reader, LocationInfo current, int batchSize,
+      TypeDescription column, boolean[] include) {
+    include[column.getId()] = true;
+    TypeDescription schema = reader.getSchema();
+    boolean result = false;
+    if (column.getChildren() == null) {
+      int row = 0;
+      try (RecordReader rows = reader.rows(reader.options().include(include))) {
+        rows.seekToRow(current.row);
+        VectorizedRowBatch batch = schema.createRowBatch(
+            TypeDescription.RowBatchVersion.USE_DECIMAL64, 1);
+        for(row=0; row < batchSize; ++row) {
+          rows.nextBatch(batch);
+        }
+      } catch (Throwable t) {
+        System.out.printf("Column %d failed at row %d%n", column.getId(),
+            current.row + row);
+        result = true;
+      }
+    } else {
+      for(TypeDescription child: column.getChildren()) {
+        result |= findBadColumns(reader, current, batchSize, child, include);
+      }
+    }
+    include[column.getId()] = false;
+    return result;
+  }
+
+  static void main(Configuration conf, String[] args) throws ParseException {
     CommandLine cli = parseCommandLine(args);
     if (cli.hasOption('h') || cli.getArgs().length == 0) {
-      System.err.println("usage: java -jar orc-tools-*.jar scan [--help] <orc file>*");
+      System.err.println("usage: java -jar orc-tools-*.jar scan [--schema] [--help] <orc file>*");
       System.exit(1);
     } else {
+      boolean printSchema = cli.hasOption('s');
       List<String> badFiles = new ArrayList<>();
       for (String file : cli.getArgs()) {
-        try {
-          Path path = new Path(file);
-          Reader reader = FileDump.getReader(path, conf, badFiles);
-          if (reader == null) {
-            continue;
-          }
-          RecordReader rows = reader.rows();
-          VectorizedRowBatch batch = reader.getSchema().createRowBatch();
-          long batchCount = 0;
-          long rowCount = 0;
-          while (rows.nextBatch(batch)) {
-            batchCount += 1;
-            rowCount += batch.size;
+        try (Reader reader = FileDump.getReader(new Path(file), conf, badFiles)) {
+          if (reader != null) {
+            TypeDescription schema = reader.getSchema();
+            if (printSchema) {
+              System.out.println(schema.toJson());
+            }
+            VectorizedRowBatch batch = schema.createRowBatch(
+                TypeDescription.RowBatchVersion.USE_DECIMAL64,
+                calculateBestVectorSize(reader.getRowIndexStride()));
+            final int batchSize = batch.getMaxSize();
+            long badBatches = 0;
+            long currentRow = 0;
+            long goodRows = 0;
+            try (RecordReader rows = reader.rows()) {

Review comment:
       Can we handle the corrupted file more gracefully? This may be considered 
a regression in the following case.
   
   **BEFORE**
   ```
   $ orc-tools scan ../examples/corrupt/stripe_footer_bad_column_encodings.orc
   log4j:WARN No appenders could be found for logger (org.apache.hadoop.util.Shell).
   log4j:WARN Please initialize the log4j system properly.
   log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
   Processing data file ../examples/corrupt/stripe_footer_bad_column_encodings.orc [length: 780]
   Unable to dump data for file: ../examples/corrupt/stripe_footer_bad_column_encodings.orc
   ```
   
   **AFTER (this PR)**
   ```
   $ java -jar tools/target/orc-tools-1.7.0-SNAPSHOT-uber.jar scan ../examples/corrupt/stripe_footer_bad_column_encodings.orc
   log4j:WARN No appenders could be found for logger (org.apache.hadoop.util.Shell).
   log4j:WARN Please initialize the log4j system properly.
   log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
   Processing data file ../examples/corrupt/stripe_footer_bad_column_encodings.orc [length: 780]
   Unable to open file: ../examples/corrupt/stripe_footer_bad_column_encodings.orc
   java.lang.IndexOutOfBoundsException: Index: 0
        at java.util.Collections$EmptyList.get(Collections.java:4456)
        at org.apache.orc.OrcProto$StripeFooter.getColumns(OrcProto.java:14080)
        at org.apache.orc.impl.reader.StripePlanner.buildEncodings(StripePlanner.java:224)
        at org.apache.orc.impl.reader.StripePlanner.parseStripe(StripePlanner.java:126)
        at org.apache.orc.impl.RecordReaderImpl.readStripe(RecordReaderImpl.java:1117)
        at org.apache.orc.impl.RecordReaderImpl.advanceStripe(RecordReaderImpl.java:1168)
        at org.apache.orc.impl.RecordReaderImpl.advanceToNextRow(RecordReaderImpl.java:1203)
        at org.apache.orc.impl.RecordReaderImpl.<init>(RecordReaderImpl.java:268)
        at org.apache.orc.impl.ReaderImpl.rows(ReaderImpl.java:841)
        at org.apache.orc.impl.ReaderImpl.rows(ReaderImpl.java:835)
        at org.apache.orc.tools.ScanData.main(ScanData.java:171)
        at org.apache.orc.tools.Driver.main(Driver.java:126)
   ```




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] dongjoon-hyun commented on a change in pull request #582: ORC-697: Improve scan tool to report the location of corruption.

Reply via email to