[
https://issues.apache.org/jira/browse/DRILL-5375?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15943518#comment-15943518
]
ASF GitHub Bot commented on DRILL-5375:
---------------------------------------
Github user amansinha100 commented on a diff in the pull request:
https://github.com/apache/drill/pull/794#discussion_r108208895
--- Diff:
exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/join/NestedLoopJoinTemplate.java
---
@@ -40,132 +41,133 @@
// Record count of the left batch currently being processed
private int leftRecordCount = 0;
- // List of record counts per batch in the hyper container
+ // List of record counts per batch in the hyper container
private List<Integer> rightCounts = null;
// Output batch
private NestedLoopJoinBatch outgoing = null;
- // Next right batch to process
- private int nextRightBatchToProcess = 0;
-
- // Next record in the current right batch to process
- private int nextRightRecordToProcess = 0;
-
- // Next record in the left batch to process
- private int nextLeftRecordToProcess = 0;
+ // Iteration status tracker
+ private IterationStatusTracker tracker = new IterationStatusTracker();
/**
* Method initializes necessary state and invokes the doSetup() to set
the
- * input and output value vector references
+ * input and output value vector references.
+ *
* @param context Fragment context
* @param left Current left input batch being processed
* @param rightContainer Hyper container
+ * @param rightCounts Counts for each right container
* @param outgoing Output batch
*/
- public void setupNestedLoopJoin(FragmentContext context, RecordBatch
left,
+ public void setupNestedLoopJoin(FragmentContext context,
+ RecordBatch left,
ExpandableHyperContainer rightContainer,
LinkedList<Integer> rightCounts,
NestedLoopJoinBatch outgoing) {
this.left = left;
- leftRecordCount = left.getRecordCount();
+ this.leftRecordCount = left.getRecordCount();
this.rightCounts = rightCounts;
this.outgoing = outgoing;
doSetup(context, rightContainer, left, outgoing);
}
/**
- * This method is the core of the nested loop join. For every record on
the right we go over
- * the left batch and produce the cross product output
+ * Main entry point for producing the output records. Thin wrapper
around populateOutgoingBatch(), this method
+ * controls which left batch we are processing and fetches the next left
input batch once we exhaust the current one.
+ *
+ * @param joinType join type (INNER ot LEFT)
+ * @return the number of records produced in the output batch
+ */
+ public int outputRecords(JoinRelType joinType) {
+ int outputIndex = 0;
+ while (leftRecordCount != 0) {
+ outputIndex = populateOutgoingBatch(joinType, outputIndex);
+ if (outputIndex >= NestedLoopJoinBatch.MAX_BATCH_SIZE) {
+ break;
+ }
+ // reset state and get next left batch
+ resetAndGetNextLeft();
+ }
+ return outputIndex;
+ }
+
+ /**
+ * This method is the core of the nested loop join.For each left batch
record looks for matching record
+ * from the list of right batches. Match is checked by calling {@link
#doEval(int, int, int)} method.
+ * If matching record is found both left and right records are written
into output batch,
+ * otherwise if join type is LEFT, than only left record is written,
right batch record values will be null.
+ *
+ * @param joinType join type (INNER or LEFT)
* @param outputIndex index to start emitting records at
* @return final outputIndex after producing records in the output batch
*/
- private int populateOutgoingBatch(int outputIndex) {
-
- // Total number of batches on the right side
- int totalRightBatches = rightCounts.size();
-
- // Total number of records on the left
- int localLeftRecordCount = leftRecordCount;
-
- /*
- * The below logic is the core of the NLJ. To have better performance
we copy the instance members into local
- * method variables, once we are done with the loop we need to update
the instance variables to reflect the new
- * state. To avoid code duplication of resetting the instance members
at every exit point in the loop we are using
- * 'goto'
- */
- int localNextRightBatchToProcess = nextRightBatchToProcess;
- int localNextRightRecordToProcess = nextRightRecordToProcess;
- int localNextLeftRecordToProcess = nextLeftRecordToProcess;
-
- outer: {
-
- for (; localNextRightBatchToProcess< totalRightBatches;
localNextRightBatchToProcess++) { // for every batch on the right
- int compositeIndexPart = localNextRightBatchToProcess << 16;
- int rightRecordCount =
rightCounts.get(localNextRightBatchToProcess);
-
- for (; localNextRightRecordToProcess < rightRecordCount;
localNextRightRecordToProcess++) { // for every record in this right batch
- for (; localNextLeftRecordToProcess < localLeftRecordCount;
localNextLeftRecordToProcess++) { // for every record in the left batch
-
+ private int populateOutgoingBatch(JoinRelType joinType, int outputIndex)
{
+ // copy index and match counters as local variables to speed up
processing
+ int nextRightBatchToProcess = tracker.getNextRightBatchToProcess();
+ int nextRightRecordToProcess = tracker.getNextRightRecordToProcess();
+ int nextLeftRecordToProcess = tracker.getNextLeftRecordToProcess();
+ boolean rightRecordMatched = tracker.isRightRecordMatched();
+
+ outer:
+ // for every record in the left batch
+ for (; nextLeftRecordToProcess < leftRecordCount;
nextLeftRecordToProcess++) {
+ // for every batch on the right
+ for (; nextRightBatchToProcess < rightCounts.size();
nextRightBatchToProcess++) {
+ int rightRecordCount = rightCounts.get(nextRightBatchToProcess);
--- End diff --
For keeping track of the number of records on the right that are processed
due to the O(n^2) join, we should have a non-local metric which would be a
Long. Currently, it seems the query profile for NLJoin will be incorrect.
> Nested loop join: return correct result for left join
> -----------------------------------------------------
>
> Key: DRILL-5375
> URL: https://issues.apache.org/jira/browse/DRILL-5375
> Project: Apache Drill
> Issue Type: Bug
> Affects Versions: 1.8.0
> Reporter: Arina Ielchiieva
> Assignee: Arina Ielchiieva
> Labels: doc-impacting
>
> Mini repro:
> 1. Create 2 Hive tables with data
> {code}
> CREATE TABLE t1 (
> FYQ varchar(999),
> dts varchar(999),
> dte varchar(999)
> )
> ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
> 2016-Q1,2016-06-01,2016-09-30
> 2016-Q2,2016-09-01,2016-12-31
> 2016-Q3,2017-01-01,2017-03-31
> 2016-Q4,2017-04-01,2017-06-30
> CREATE TABLE t2 (
> who varchar(999),
> event varchar(999),
> dt varchar(999)
> )
> ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
> aperson,did somthing,2017-01-06
> aperson,did somthing else,2017-01-12
> aperson,had chrsitmas,2016-12-26
> aperson,went wild,2016-01-01
> {code}
> 2. Impala Query shows correct result
> {code}
> select t2.dt, t1.fyq, t2.who, t2.event
> from t2
> left join t1 on t2.dt between t1.dts and t1.dte
> order by t2.dt;
> +------------+---------+---------+-------------------+
> | dt | fyq | who | event |
> +------------+---------+---------+-------------------+
> | 2016-01-01 | NULL | aperson | went wild |
> | 2016-12-26 | 2016-Q2 | aperson | had chrsitmas |
> | 2017-01-06 | 2016-Q3 | aperson | did somthing |
> | 2017-01-12 | 2016-Q3 | aperson | did somthing else |
> +------------+---------+---------+-------------------+
> {code}
> 3. Drill query shows wrong results:
> {code}
> alter session set planner.enable_nljoin_for_scalar_only=false;
> use hive;
> select t2.dt, t1.fyq, t2.who, t2.event
> from t2
> left join t1 on t2.dt between t1.dts and t1.dte
> order by t2.dt;
> +-------------+----------+----------+--------------------+
> | dt | fyq | who | event |
> +-------------+----------+----------+--------------------+
> | 2016-12-26 | 2016-Q2 | aperson | had chrsitmas |
> | 2017-01-06 | 2016-Q3 | aperson | did somthing |
> | 2017-01-12 | 2016-Q3 | aperson | did somthing else |
> +-------------+----------+----------+--------------------+
> 3 rows selected (2.523 seconds)
> {code}
--
This message was sent by Atlassian JIRA
(v6.3.15#6346)