[GitHub] [orc] dongjoon-hyun commented on a change in pull request #582: ORC-697: Improve scan tool to report the location of corruption.

GitBox Mon, 14 Dec 2020 22:34:26 -0800


dongjoon-hyun commented on a change in pull request #582:
URL: https://github.com/apache/orc/pull/582#discussion_r543082167




##########
File path: java/tools/src/java/org/apache/orc/tools/ScanData.java
##########
@@ -40,41 +40,168 @@
 
   static CommandLine parseCommandLine(String[] args) throws ParseException {
     Options options = new Options()
-        .addOption("help", "h", false, "Provide help");
-    return new GnuParser().parse(options, args);
+        .addOption("s", "schema", false, "Print schema")
+        .addOption("h", "help", false, "Provide help");
+    return new DefaultParser().parse(options, args);
   }
 
+  static int calculateBestVectorSize(int indexStride) {
+    if (indexStride == 0) {
+      return 1024;
+    }
+    // how many 1024 batches do we have in an index stride?
+    int batchCount = (indexStride + 1023) / 1024;
+    return indexStride / batchCount;
+  }
+
+  static class LocationInfo {
+    final long firstRow;
+    final long followingRow;
+    final int stripeId;
+    final long row;
+
+    LocationInfo(long firstRow, long followingRow, int stripeId,
+        long row) {
+      this.firstRow = firstRow;
+      this.followingRow = followingRow;
+      this.stripeId = stripeId;
+      this.row = row;
+    }
+
+    public String toString() {
+      return String.format("row %d in stripe %d (rows %d-%d)",
+          row, stripeId, firstRow, followingRow);
+    }
+  }
+
+  /**
+   * Given a row, find the stripe that contains that row.
+   * @param reader the file reader
+   * @param row the global row number in the file
+   * @return the information about that row in the file
+   */
+  static LocationInfo findStripeInfo(Reader reader, long row) {
+    long firstRow = 0;
+    int stripeId = 0;
+    for (StripeInformation stripe: reader.getStripes()) {
+      stripeId += 1;
+      long lastRow = firstRow + stripe.getNumberOfRows();
+      if (firstRow <= row && row < lastRow) {
+        return new LocationInfo(firstRow, lastRow, stripeId, row);
+      }
+      firstRow = lastRow;
+    }
+    return new LocationInfo(reader.getNumberOfRows(),
+        reader.getNumberOfRows(), stripeId, row);
+  }
 
-  static void main(Configuration conf, String[] args
-                   ) throws IOException, JSONException, ParseException {
+  /**
+   * Given a failure point, find the first place that the ORC reader can
+   * recover.
+   * @param reader the ORC reader
+   * @param current the position of the failure
+   * @param batchSize the size of the batch that we tried to read
+   * @return the location that we should recover to
+   */
+  static LocationInfo findRecoveryPoint(Reader reader, LocationInfo current,
+                                        int batchSize) {
+    int stride = reader.getRowIndexStride();
+    long result;
+    // In the worst case, just move to the next stripe
+    if (stride == 0 ||
+        current.row + batchSize >= current.followingRow) {
+      result = current.followingRow;
+    } else {
+      long rowInStripe = current.row + batchSize - current.firstRow;
+      result = current.firstRow + (rowInStripe + stride - 1) / stride * stride;
+    }
+    return findStripeInfo(reader, result);
+  }
+
+  static boolean findBadColumns(Reader reader, LocationInfo current, int 
batchSize,
+      TypeDescription column, boolean[] include) {
+    include[column.getId()] = true;
+    TypeDescription schema = reader.getSchema();
+    boolean result = false;
+    if (column.getChildren() == null) {
+      int row = 0;
+      try (RecordReader rows = reader.rows(reader.options().include(include))) 
{
+        rows.seekToRow(current.row);
+        VectorizedRowBatch batch = schema.createRowBatch(
+            TypeDescription.RowBatchVersion.USE_DECIMAL64, 1);
+        for(row=0; row < batchSize; ++row) {
+          rows.nextBatch(batch);
+        }
+      } catch (Throwable t) {
+        System.out.printf("Column %d failed at row %d%n", column.getId(),

Review comment:
       Thanks!




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [orc] dongjoon-hyun commented on a change in pull request #582: ORC-697: Improve scan tool to report the location of corruption.

Reply via email to