[GitHub] [orc] omalley commented on a change in pull request #582: ORC-697: Improve scan tool to report the location of corruption.

GitBox Mon, 14 Dec 2020 16:27:13 -0800


omalley commented on a change in pull request #582:
URL: https://github.com/apache/orc/pull/582#discussion_r542951701




##########
File path: java/tools/src/java/org/apache/orc/tools/ScanData.java
##########
@@ -40,41 +40,168 @@
 
   static CommandLine parseCommandLine(String[] args) throws ParseException {
     Options options = new Options()
-        .addOption("help", "h", false, "Provide help");
-    return new GnuParser().parse(options, args);
+        .addOption("s", "schema", false, "Print schema")
+        .addOption("h", "help", false, "Provide help");
+    return new DefaultParser().parse(options, args);
   }
 
+  static int calculateBestVectorSize(int indexStride) {
+    if (indexStride == 0) {
+      return 1024;
+    }
+    // how many 1024 batches do we have in an index stride?
+    int batchCount = (indexStride + 1023) / 1024;
+    return indexStride / batchCount;
+  }
+
+  static class LocationInfo {
+    final long firstRow;
+    final long followingRow;
+    final int stripeId;
+    final long row;
+
+    LocationInfo(long firstRow, long followingRow, int stripeId,
+        long row) {
+      this.firstRow = firstRow;
+      this.followingRow = followingRow;
+      this.stripeId = stripeId;
+      this.row = row;
+    }
+
+    public String toString() {
+      return String.format("row %d in stripe %d (rows %d-%d)",
+          row, stripeId, firstRow, followingRow);
+    }
+  }
+
+  /**
+   * Given a row, find the stripe that contains that row.
+   * @param reader the file reader
+   * @param row the global row number in the file
+   * @return the information about that row in the file
+   */
+  static LocationInfo findStripeInfo(Reader reader, long row) {
+    long firstRow = 0;
+    int stripeId = 0;
+    for (StripeInformation stripe: reader.getStripes()) {
+      stripeId += 1;
+      long lastRow = firstRow + stripe.getNumberOfRows();
+      if (firstRow <= row && row < lastRow) {
+        return new LocationInfo(firstRow, lastRow, stripeId, row);
+      }
+      firstRow = lastRow;
+    }
+    return new LocationInfo(reader.getNumberOfRows(),
+        reader.getNumberOfRows(), stripeId, row);
+  }
 
-  static void main(Configuration conf, String[] args
-                   ) throws IOException, JSONException, ParseException {
+  /**
+   * Given a failure point, find the first place that the ORC reader can
+   * recover.
+   * @param reader the ORC reader
+   * @param current the position of the failure
+   * @param batchSize the size of the batch that we tried to read
+   * @return the location that we should recover to
+   */
+  static LocationInfo findRecoveryPoint(Reader reader, LocationInfo current,
+                                        int batchSize) {
+    int stride = reader.getRowIndexStride();
+    long result;
+    // In the worst case, just move to the next stripe
+    if (stride == 0 ||
+        current.row + batchSize >= current.followingRow) {
+      result = current.followingRow;
+    } else {
+      long rowInStripe = current.row + batchSize - current.firstRow;
+      result = current.firstRow + (rowInStripe + stride - 1) / stride * stride;
+    }
+    return findStripeInfo(reader, result);
+  }
+
+  static boolean findBadColumns(Reader reader, LocationInfo current, int 
batchSize,
+      TypeDescription column, boolean[] include) {
+    include[column.getId()] = true;
+    TypeDescription schema = reader.getSchema();
+    boolean result = false;
+    if (column.getChildren() == null) {
+      int row = 0;
+      try (RecordReader rows = reader.rows(reader.options().include(include))) 
{
+        rows.seekToRow(current.row);
+        VectorizedRowBatch batch = schema.createRowBatch(
+            TypeDescription.RowBatchVersion.USE_DECIMAL64, 1);
+        for(row=0; row < batchSize; ++row) {
+          rows.nextBatch(batch);
+        }
+      } catch (Throwable t) {
+        System.out.printf("Column %d failed at row %d%n", column.getId(),
+            current.row + row);
+        result = true;
+      }
+    } else {
+      for(TypeDescription child: column.getChildren()) {
+        result |= findBadColumns(reader, current, batchSize, child, include);
+      }
+    }
+    include[column.getId()] = false;
+    return result;
+  }
+
+  static void main(Configuration conf, String[] args) throws ParseException {
     CommandLine cli = parseCommandLine(args);
     if (cli.hasOption('h') || cli.getArgs().length == 0) {
-      System.err.println("usage: java -jar orc-tools-*.jar scan [--help] <orc 
file>*");
+      System.err.println("usage: java -jar orc-tools-*.jar scan [--schema] 
[--help] <orc file>*");
       System.exit(1);
     } else {
+      boolean printSchema = cli.hasOption('s');
       List<String> badFiles = new ArrayList<>();
       for (String file : cli.getArgs()) {
-        try {
-          Path path = new Path(file);
-          Reader reader = FileDump.getReader(path, conf, badFiles);
-          if (reader == null) {
-            continue;
-          }
-          RecordReader rows = reader.rows();
-          VectorizedRowBatch batch = reader.getSchema().createRowBatch();
-          long batchCount = 0;
-          long rowCount = 0;
-          while (rows.nextBatch(batch)) {
-            batchCount += 1;
-            rowCount += batch.size;
+        try (Reader reader = FileDump.getReader(new Path(file), conf, 
badFiles)) {
+          if (reader != null) {
+            TypeDescription schema = reader.getSchema();
+            if (printSchema) {
+              System.out.println(schema.toJson());
+            }
+            VectorizedRowBatch batch = schema.createRowBatch(
+                TypeDescription.RowBatchVersion.USE_DECIMAL64,
+                calculateBestVectorSize(reader.getRowIndexStride()));
+            final int batchSize = batch.getMaxSize();
+            long badBatches = 0;
+            long currentRow = 0;
+            long goodRows = 0;
+            try (RecordReader rows = reader.rows()) {

Review comment:
       Yes, although the Scan tool doesn't have information about where the 
problem occurs when it is in the footer. We'd need to add better exceptions in 
ReaderImpl, which is a good idea.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] omalley commented on a change in pull request #582: ORC-697: Improve scan tool to report the location of corruption.

Reply via email to