DRILL-1434: In ParquetGroupScan, compute the non-null value count of a column only if stats are available for every chunk of that column. Don't apply the ConvertCountToDirectScan rule if stats are not available.
Project: http://git-wip-us.apache.org/repos/asf/incubator-drill/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-drill/commit/3c3b3d55 Tree: http://git-wip-us.apache.org/repos/asf/incubator-drill/tree/3c3b3d55 Diff: http://git-wip-us.apache.org/repos/asf/incubator-drill/diff/3c3b3d55 Branch: refs/heads/master Commit: 3c3b3d55d0b8b71100b2df55d482e89d0f7f0f9c Parents: 8695cdf Author: Aman Sinha <asi...@maprtech.com> Authored: Sun Nov 9 18:18:47 2014 -0800 Committer: Jacques Nadeau <jacq...@apache.org> Committed: Thu Nov 13 09:17:27 2014 -0800 ---------------------------------------------------------------------- .../drill/exec/physical/base/GroupScan.java | 1 + .../physical/ConvertCountToDirectScan.java | 4 ++++ .../exec/store/parquet/ParquetGroupScan.java | 22 ++++++++++++++++++-- 3 files changed, 25 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/3c3b3d55/exec/java-exec/src/main/java/org/apache/drill/exec/physical/base/GroupScan.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/base/GroupScan.java b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/base/GroupScan.java index 2f94995..3e5e408 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/base/GroupScan.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/base/GroupScan.java @@ -34,6 +34,7 @@ import com.google.common.collect.Lists; public interface GroupScan extends Scan, HasAffinity{ public static final List<SchemaPath> ALL_COLUMNS = Lists.<SchemaPath>newArrayList(SchemaPath.getSimplePath("*")); + public static final long NO_COLUMN_STATS = -1; public abstract void applyAssignments(List<DrillbitEndpoint> endpoints) throws PhysicalOperatorSetupException; 
http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/3c3b3d55/exec/java-exec/src/main/java/org/apache/drill/exec/planner/physical/ConvertCountToDirectScan.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/physical/ConvertCountToDirectScan.java b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/physical/ConvertCountToDirectScan.java index 620cf1f..d794805 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/physical/ConvertCountToDirectScan.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/physical/ConvertCountToDirectScan.java @@ -119,6 +119,10 @@ public class ConvertCountToDirectScan extends Prule { String columnName = scan.getRowType().getFieldNames().get(index).toLowerCase(); cnt = oldGrpScan.getColumnValueCount(SchemaPath.getSimplePath(columnName)); + if (cnt == GroupScan.NO_COLUMN_STATS) { + // if column stats are not available don't apply this rule + return; + } } else { return; // do nothing. 
} http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/3c3b3d55/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java index dab20e3..7882b66 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java @@ -227,8 +227,26 @@ public class ParquetGroupScan extends AbstractFileGroupScan { valueCountInGrp = Math.max(col.getValueCount(), valueCountInGrp); SchemaPath path = SchemaPath.getSimplePath(col.getPath().toString().replace("[", "").replace("]", "").toLowerCase()); - long valueCount = columnValueCounts.containsKey(path) ? columnValueCounts.get(path) : 0; - columnValueCounts.put(path, valueCount + col.getValueCount()); + long previousCount = 0; + long currentCount = 0; + + if (! columnValueCounts.containsKey(path)) { + // create an entry for this column + columnValueCounts.put(path, previousCount /* initialize to 0 */); + } else { + previousCount = columnValueCounts.get(path); + } + + boolean statsAvail = (col.getStatistics() != null && !col.getStatistics().isEmpty()); + + if (statsAvail && previousCount != GroupScan.NO_COLUMN_STATS) { + currentCount = col.getValueCount() - col.getStatistics().getNumNulls(); // only count non-nulls + columnValueCounts.put(path, previousCount + currentCount); + } else { + // even if 1 chunk does not have stats, we cannot rely on the value count for this column + columnValueCounts.put(path, GroupScan.NO_COLUMN_STATS); + } + } String filePath = footer.getFile().toUri().getPath();