[ 
https://issues.apache.org/jira/browse/DRILL-7403?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16951308#comment-16951308
 ] 

ASF GitHub Bot commented on DRILL-7403:
---------------------------------------

arina-ielchiieva commented on pull request #1871: DRILL-7403: lidate batch 
checks, vector integretity in unit tests
URL: https://github.com/apache/drill/pull/1871#discussion_r334638588
 
 

 ##########
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/validate/BatchValidator.java
 ##########
 @@ -48,50 +52,235 @@
   private static final org.slf4j.Logger logger =
       org.slf4j.LoggerFactory.getLogger(BatchValidator.class);
 
+  public static final boolean LOG_TO_STDOUT = true;
   public static final int MAX_ERRORS = 100;
 
-  private final int rowCount;
-  private final VectorAccessible batch;
-  private final List<String> errorList;
-  private int errorCount;
+  public interface ErrorReporter {
+    void error(String name, ValueVector vector, String msg);
+    void warn(String name, ValueVector vector, String msg);
+    void error(String msg);
+    int errorCount();
+  }
+
+  public abstract static class BaseErrorReporter implements ErrorReporter {
+
+    private final String opName;
+    private int errorCount;
+
+    public BaseErrorReporter(String opName) {
+      this.opName = opName;
+    }
+
+    protected boolean startError() {
+      if (errorCount == 0) {
+        logger.error("Found one or more vector errors from " + opName);
+      }
+      errorCount++;
+      if (errorCount >= MAX_ERRORS) {
+        return false;
+      }
+      return true;
+    }
+
+    @Override
+    public void error(String name, ValueVector vector, String msg) {
+      error(String.format("%s - %s: %s",
+            name, vector.getClass().getSimpleName(), msg));
+    }
+
+    @Override
+    public void warn(String name, ValueVector vector, String msg) {
+      warn(String.format("%s - %s: %s",
+            name, vector.getClass().getSimpleName(), msg));
+    }
+
+    public abstract void warn(String msg);
+
+    @Override
+    public int errorCount() { return errorCount; }
+  }
+
+  private static class StdOutReporter extends BaseErrorReporter {
+
+    public StdOutReporter(String opName) {
+      super(opName);
+    }
+
+    @Override
+    public void error(String msg) {
+      if (startError()) {
+        System.out.println(msg);
+      }
+    }
+
+    @Override
+    public void warn(String msg) {
+      System.out.println(msg);
+    }
+  }
+
+  private static class LogReporter extends BaseErrorReporter {
+
+    public LogReporter(String opName) {
+      super(opName);
+    }
+
+    @Override
+    public void error(String msg) {
+      if (startError()) {
+        logger.error(msg);
+      }
+    }
 
-  public BatchValidator(VectorAccessible batch) {
-    rowCount = batch.getRecordCount();
-    this.batch = batch;
-    errorList = null;
+    @Override
+    public void warn(String msg) {
+      logger.error(msg);
+    }
   }
 
-  public BatchValidator(VectorAccessible batch, boolean captureErrors) {
-    rowCount = batch.getRecordCount();
-    this.batch = batch;
-    if (captureErrors) {
-      errorList = new ArrayList<>();
+  private enum CheckMode { COUNTS, ALL };
+
+  private static Map<Class<? extends RecordBatch>, CheckMode> checkRules = 
buildRules();
+
+  private final ErrorReporter errorReporter;
+
+  public BatchValidator(ErrorReporter errorReporter) {
+    this.errorReporter = errorReporter;
+  }
+
+  /**
+   * At present, most operators will not pass the checks here. The following
+   * table identifies those that should be checked, and the degree of check.
+   * Over time, this table should include all operators, and thus become
+   * unnecessary.
+   */
+  private static Map<Class<? extends RecordBatch>, CheckMode> buildRules() {
+    Map<Class<? extends RecordBatch>, CheckMode> rules = new 
IdentityHashMap<>();
+    rules.put(OperatorRecordBatch.class, CheckMode.ALL);
+    return rules;
+  }
+
+  public static boolean validate(RecordBatch batch) {
+    CheckMode checkMode = checkRules.get(batch.getClass());
+
+    // If no rule, don't check this batch.
+
+    if (checkMode == null) {
+
+      // As work proceeds, might want to log those batches not checked.
+      // For now, there are too many.
+
+      return true;
+    }
+
+    // All batches that do any checks will at least check counts.
+
+    ErrorReporter reporter = errorReporter(batch);
+    int rowCount = batch.getRecordCount();
+    int valueCount = rowCount;
+    VectorContainer container = batch.getContainer();
+    if (! container.hasRecordCount()) {
+      reporter.error(String.format(
+          "%s: Container record count  not set",
+          batch.getClass().getSimpleName()));
     } else {
-      errorList = null;
+      // Row count will = container count for most operators.
+      // Row count <= container count for the filter operator.
+
+      int containerRowCount = container.getRecordCount();
+      valueCount = containerRowCount;
+      switch (batch.getSchema().getSelectionVectorMode()) {
+      case FOUR_BYTE:
+        int sv4Count = batch.getSelectionVector4().getCount();
+        if (sv4Count != rowCount) {
+          reporter.error(String.format(
+              "Mismatch between %s record count = %d, SV4 record count = %d",
+              batch.getClass().getSimpleName(),
+              rowCount, sv4Count));
+        }
+        // Don't know how to check SV4 batches
 
 Review comment:
   Maybe add todo?
 
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> Validate batch checks, vector integretity in unit tests
> -------------------------------------------------------
>
>                 Key: DRILL-7403
>                 URL: https://issues.apache.org/jira/browse/DRILL-7403
>             Project: Apache Drill
>          Issue Type: Improvement
>            Reporter: Paul Rogers
>            Assignee: Paul Rogers
>            Priority: Minor
>
> Drill provides a {{BatchValidator}} that checks vectors. It is disabled by 
> default. This enhancement adds more checks, including checks for row counts 
> (of which there are surprisingly many.)
> Since most operators will fail if the check is enabled, this enhancement also 
> adds a table to keep track of which operators pass the checks (and for which 
> checks should be enabled) and those that still need work. This allows the 
> checks to exist in the code, and to be enabled incrementally as we fix the 
> various problems.



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to