This is an automated email from the ASF dual-hosted git repository.
andy pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/jena.git
The following commit(s) were added to refs/heads/main by this push:
new 0e10a18528 GH-2404: Support for multi-variable join keys
0e10a18528 is described below
commit 0e10a1852806e669d8f41b2412126e6ec9c76323
Author: Claus Stadler <[email protected]>
AuthorDate: Mon Apr 8 14:55:51 2024 +0200
GH-2404: Support for multi-variable join keys
---
.../sparql/engine/join/AbstractIterHashJoin.java | 82 +++----
.../jena/sparql/engine/join/BitSetMapper.java | 76 +++++++
.../jena/sparql/engine/join/HashProbeTable.java | 49 +++--
.../sparql/engine/join/ImmutableUniqueList.java | 197 +++++++++++++++++
.../org/apache/jena/sparql/engine/join/Join.java | 18 +-
.../apache/jena/sparql/engine/join/JoinIndex.java | 226 ++++++++++++++++++++
.../apache/jena/sparql/engine/join/JoinKey.java | 154 ++++++++-----
.../apache/jena/sparql/engine/join/JoinLib.java | 17 +-
.../sparql/engine/join/MultiHashProbeTable.java | 237 +++++++++++++++++++++
.../jena/sparql/engine/join/QueryIterHashJoin.java | 18 +-
.../sparql/engine/join/AbstractTestInnerJoin.java | 43 +++-
.../jena/sparql/engine/join/AbstractTestJoin.java | 8 +
.../jena/sparql/engine/join/TestBitSetMapper.java | 80 +++++++
.../jena/sparql/engine/join/TestJoinKey.java | 77 +++++++
.../engine/join/TestMultiHashProbeTable.java | 150 +++++++++++++
jena-benchmarks/jena-benchmarks-jmh/README.md | 7 +
.../jena/sparql/engine/benchmark/QueryTask.java | 52 +++++
.../sparql/engine/benchmark/QueryTaskBuilder.java | 48 +++++
.../engine/benchmark/QueryTaskBuilderRegistry.java | 55 +++++
.../sparql/engine/benchmark/QueryTaskReader.java | 90 ++++++++
.../sparql/engine/benchmark/QueryTaskResult.java | 58 +++++
.../engine/benchmark/QueryTaskTestUtils.java | 53 +++++
.../jena/sparql/engine/join/BenchmarkHashJoin.java | 102 +++++++++
.../jena/sparql/engine/join/QueryTask480.java | 82 +++++++
.../sparql/engine/join/QueryTaskBuilder480.java | 30 +++
.../engine/join/QueryTaskBuilderCurrent.java | 30 +++
.../jena/sparql/engine/join/QueryTaskCurrent.java | 78 +++++++
.../sparql/engine/join/TestBenchmarkHashJoin.java | 36 ++++
.../jena/sparql/engine/join/TestHashJoin.java | 43 ++++
.../resources/join/join_1column_simple_a_100k.ttl | 28 +++
.../resources/join/join_1column_simple_a_10k.ttl | 26 +++
.../resources/join/join_2columns_simple_a_10.ttl | 28 +++
.../resources/join/join_2columns_simple_a_15.ttl | 28 +++
.../resources/join/join_2columns_simple_a_20.ttl | 28 +++
.../resources/join/join_2columns_skewed_a_1.ttl | 42 ++++
.../resources/join/join_2columns_skewed_a_10.ttl | 33 +++
.../resources/join/join_matrix_skewed_a_10.ttl | 41 ++++
.../test/resources/join/join_matrix_skewed_a_3.ttl | 26 +++
38 files changed, 2343 insertions(+), 133 deletions(-)
diff --git
a/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/AbstractIterHashJoin.java
b/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/AbstractIterHashJoin.java
index 830522a156..4cde4558ad 100644
---
a/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/AbstractIterHashJoin.java
+++
b/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/AbstractIterHashJoin.java
@@ -31,7 +31,7 @@ import org.apache.jena.sparql.engine.iterator.QueryIter2 ;
import org.apache.jena.sparql.engine.iterator.QueryIterPeek ;
/** Hash join algorithm
- *
+ *
* This code materializes one input into the probe table
* then hash joins the other input from the stream side.
*/
@@ -42,47 +42,51 @@ public abstract class AbstractIterHashJoin extends
QueryIter2 {
protected long s_countResults = 0 ; // Overall result size.
protected long s_trailerResults = 0 ; // Results from the
trailer iterator.
// See also stats in the probe table.
-
+
protected final JoinKey joinKey ;
- protected final HashProbeTable hashTable ;
+ protected final MultiHashProbeTable hashTable ;
private QueryIterator iterStream ;
private Binding rowStream = null ;
private Iterator<Binding> iterCurrent ;
- private boolean yielded ; // Flag to note when
current probe causes a result.
+ private boolean yielded ; // Flag to note when
current probe causes a result.
// Hanlde any "post join" additions.
private Iterator<Binding> iterTail = null ;
-
+
enum Phase { INIT, HASH , STREAM, TRAILER, DONE }
Phase state = Phase.INIT ;
-
+
private Binding slot = null ;
- protected AbstractIterHashJoin(JoinKey joinKey, QueryIterator probeIter,
QueryIterator streamIter, ExecutionContext execCxt) {
+ protected AbstractIterHashJoin(JoinKey initialJoinKey, QueryIterator
probeIter, QueryIterator streamIter, ExecutionContext execCxt) {
super(probeIter, streamIter, execCxt) ;
-
- if ( joinKey == null ) {
+
+ if ( initialJoinKey == null ) {
+ // This block computes an initial join key from the common
variables of each iterator's first binding.
QueryIterPeek pProbe = QueryIterPeek.create(probeIter, execCxt) ;
QueryIterPeek pStream = QueryIterPeek.create(streamIter, execCxt) ;
-
+
Binding bLeft = pProbe.peek() ;
Binding bRight = pStream.peek() ;
-
+
List<Var> varsLeft = Iter.toList(bLeft.vars()) ;
List<Var> varsRight = Iter.toList(bRight.vars()) ;
- joinKey = JoinKey.createVarKey(varsLeft, varsRight) ;
+
+ initialJoinKey = JoinKey.create(varsLeft, varsRight) ;
+
probeIter = pProbe ;
streamIter = pStream ;
}
-
- this.joinKey = joinKey ;
+
+ JoinKey maxJoinKey = null;
+
+ this.joinKey = initialJoinKey ;
this.iterStream = streamIter ;
- this.hashTable = new HashProbeTable(joinKey) ;
+ this.hashTable = new MultiHashProbeTable(maxJoinKey, initialJoinKey) ;
this.iterCurrent = null ;
buildHashTable(probeIter) ;
-
}
-
+
private void buildHashTable(QueryIterator iter1) {
state = Phase.HASH ;
for (; iter1.hasNext();) {
@@ -96,7 +100,7 @@ public abstract class AbstractIterHashJoin extends
QueryIter2 {
@Override
protected boolean hasNextBinding() {
- if ( isFinished() )
+ if ( isFinished() )
return false ;
if ( slot == null ) {
slot = moveToNextBindingOrNull() ;
@@ -117,21 +121,21 @@ public abstract class AbstractIterHashJoin extends
QueryIter2 {
protected Binding moveToNextBindingOrNull() {
// iterCurrent is the iterator of entries in the
- // probe hashed table for the current stream row.
+ // probe hashed table for the current stream row.
// iterStream is the stream of incoming rows.
-
+
switch ( state ) {
case DONE : return null ;
- case HASH :
+ case HASH :
case INIT :
throw new IllegalStateException() ;
case TRAILER :
return doOneTail() ;
case STREAM :
}
-
+
for(;;) {
- // Ensure we are processing a row.
+ // Ensure we are processing a row.
while ( iterCurrent == null ) {
// Move on to the next row from the right.
if ( ! iterStream.hasNext() ) {
@@ -146,8 +150,8 @@ public abstract class AbstractIterHashJoin extends
QueryIter2 {
iterCurrent = hashTable.getCandidates(rowStream) ;
yielded = false ;
}
-
- // Emit one row using the rightRow and the current matched left
rows.
+
+ // Emit one row using the rightRow and the current matched left
rows.
if ( ! iterCurrent.hasNext() ) {
iterCurrent = null ;
if ( ! yielded ) {
@@ -162,11 +166,11 @@ public abstract class AbstractIterHashJoin extends
QueryIter2 {
// Nested loop join, only on less.
//Iterator<Binding> iter = nestedLoop(iterCurrent, rowStream) ;
-
+
Binding rowCurrentProbe = iterCurrent.next() ;
Binding r = Algebra.merge(rowCurrentProbe, rowStream) ;
Binding r2 = null ;
-
+
if (r != null)
r2 = yieldOneResult(rowCurrentProbe, rowStream, r) ;
if ( r2 == null ) {
@@ -177,9 +181,9 @@ public abstract class AbstractIterHashJoin extends
QueryIter2 {
return r2 ;
}
}
- }
-
-
+ }
+
+
private Binding doOneTail() {
// Only in TRAILING
if ( iterTail.hasNext() ) {
@@ -192,19 +196,19 @@ public abstract class AbstractIterHashJoin extends
QueryIter2 {
iterTail = null ;
return null ;
}
-
+
/**
* Signal about to return a result.
* @param rowCurrentProbe
* @param rowStream
* @param rowResult
- * @return
+ * @return
*/
protected abstract Binding yieldOneResult(Binding rowCurrentProbe, Binding
rowStream, Binding rowResult) ;
/** Signal a row that yields no matches.
* This method can return a binding (the outer join case)
- * which will then be yielded. {@code yieldOneResult} will <em>not</em>
be called.
+ * which will then be yielded. {@code yieldOneResult} will <em>not</em>
be called.
* @param rowStream
* @return
*/
@@ -216,23 +220,23 @@ public abstract class AbstractIterHashJoin extends
QueryIter2 {
* @return QueryIterator or null
*/
protected abstract QueryIterator joinFinished() ;
-
+
@Override
protected void closeSubIterator() {
if ( JoinLib.JOIN_EXPLAIN ) {
String x = String.format(
- "HashJoin: LHS=%d RHS=%d Results=%d RightMisses=%d
MaxBucket=%d NoKeyBucket=%d",
- s_countProbe, s_countScan, s_countResults,
- hashTable.s_countScanMiss, hashTable.s_maxBucketSize,
hashTable.s_noKeyBucketSize) ;
+ "HashJoin: LHS=%d RHS=%d Results=%d Table=%s",
+ s_countProbe, s_countScan, s_countResults,
+ hashTable.toString()) ;
System.out.println(x) ;
}
// In case it's a peek iterator.
iterStream.close() ;
- hashTable.clear();
+ hashTable.clear();
}
@Override
- protected void requestSubCancel()
+ protected void requestSubCancel()
{ }
}
diff --git
a/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/BitSetMapper.java
b/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/BitSetMapper.java
new file mode 100644
index 0000000000..fdb7c8da3b
--- /dev/null
+++
b/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/BitSetMapper.java
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jena.sparql.engine.join;
+
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.jena.sparql.core.Var;
+import org.apache.jena.sparql.engine.binding.Binding;
+
+/**
+ * Methods for converting collections to and from a bit set representation
+ * w.r.t. a list of reference items.
+ * <p>
+ * For example, if the list of reference items is {@code [ ?x ?y ?z ]}
+ * and the given items is the set {@code { ?y ?z }} then
+ * the resulting bit set has the value 011.
+ */
+public class BitSetMapper {
+
+ /**
+ * Create a bit set from the binding's key set.
+ *
+ * @implNote
+ * This method relies on {@link List#indexOf(Object)}.
+ * The class {@link ImmutableUniqueList} provides an index for these
lookups.
+ */
+ public static BitSet toBitSet(List<Var> referenceList, Binding binding) {
+ int n = referenceList.size();
+ BitSet result = new BitSet(n);
+ if (n < binding.size()) {
+ for (int i = 0; i < n; ++i) {
+ Var var = referenceList.get(i);
+ if (binding.contains(var)) {
+ result.set(i);
+ }
+ }
+ } else { // Iterate over the fewer row variables
+ for (Iterator<Var> it = binding.vars(); it.hasNext();) {
+ Var var = it.next();
+ int idx = referenceList.indexOf(var);
+ if (idx != -1) {
+ result.set(idx);
+ }
+ }
+ }
+ return result;
+ }
+
+ /** Map the positions of all set bits to items in the list. */
+ public static <T> List<T> toList(List<T> referenceList, BitSet key) {
+ List<T> result = new ArrayList<>(referenceList.size());
+ for (int i = key.nextSetBit(0); i >= 0; i = key.nextSetBit(i + 1)) {
+ T item = referenceList.get(i);
+ result.add(item);
+ }
+ return result;
+ }
+}
diff --git
a/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/HashProbeTable.java
b/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/HashProbeTable.java
index f4e370331c..4e050e1d3c 100644
---
a/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/HashProbeTable.java
+++
b/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/HashProbeTable.java
@@ -37,15 +37,24 @@ class HashProbeTable {
/*package*/ long s_maxMatchGroup = 0;
/*package*/ long s_countScanMiss = 0;
- private final List<Binding> noKeyBucket = new ArrayList<>();
+ private final List<Binding> noKeyBucket = new
ArrayList<>();
private final MultiValuedMap<Object, Binding> buckets;
- private final JoinKey joinKey;
+ private final JoinKey joinKey;
HashProbeTable(JoinKey joinKey) {
this.joinKey = joinKey;
buckets = MultiMapUtils.newListValuedHashMap();
}
+ public JoinKey getJoinKey() {
+ return joinKey;
+ }
+
+ public void putNoKey(Binding row) {
+ s_count++;
+ noKeyBucket.add(row);
+ }
+
public void put(Binding row) {
s_count++;
Object longHash = JoinLib.hash(joinKey, row);
@@ -56,7 +65,19 @@ class HashProbeTable {
buckets.put(longHash, row);
}
+ /** Shorthand for {@code getCandidates(row, true)}. See {@link
#getCandidates(Binding, boolean)}. */
public Iterator<Binding> getCandidates(Binding row) {
+ return getCandidates(row, true);
+ }
+
+ /**
+ * Find the rows of this table that are compatible with a given lookup
binding.
+ *
+ * @param row The lookup binding.
+ * @param appendNoBucket If true then also append the data from the
no-key-bucket (if present).
+ * @return An iterator over the rows in this table that are compatible
with the lookup binding.
+ */
+ public Iterator<Binding> getCandidates(Binding row, boolean
appendNoBucket) {
Iterator<Binding> iter = null;
Object longHash = JoinLib.hash(joinKey, row);
if ( longHash == JoinLib.noKeyHash )
@@ -71,7 +92,7 @@ class HashProbeTable {
}
}
// And the rows with no common hash key
- if ( noKeyBucket != null )
+ if ( appendNoBucket && noKeyBucket != null )
iter = Iter.concat(iter, noKeyBucket.iterator());
return iter;
}
@@ -90,21 +111,10 @@ class HashProbeTable {
// What to do with them?
}
- // Should not need these operations.
- public Collection<Binding> getNoKey$() {
- if ( noKeyBucket == null )
- return null;
+ public Collection<Binding> getNoKey() {
return noKeyBucket;
}
- public Collection<Binding> getHashMatch$(Binding row) {
- Object longHash = JoinLib.hash(joinKey, row);
- if ( longHash == JoinLib.noKeyHash )
- return noKeyBucket;
- Collection<Binding> list = buckets.get(longHash);
- return list;
- }
-
public Iterator<Binding> values() {
return Iter.concat(buckets.values().iterator(),
noKeyBucket.iterator()) ;
@@ -113,4 +123,13 @@ class HashProbeTable {
public void clear() {
buckets.clear();
}
+
+ @Override
+ public String toString() {
+ stats();
+ long hashedItems = s_count - s_noKeyBucketSize;
+ String str = String.format("JoinKey=%s Items=%d HashedItems=%d
NoKeyItems=%d Buckets=%d RightMisses=%d MaxBucket=%d",
+ joinKey, s_count, hashedItems, s_noKeyBucketSize,
s_bucketCount, s_countScanMiss, s_maxBucketSize);
+ return str;
+ }
}
diff --git
a/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/ImmutableUniqueList.java
b/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/ImmutableUniqueList.java
new file mode 100644
index 0000000000..ed2d1f0429
--- /dev/null
+++
b/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/ImmutableUniqueList.java
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jena.sparql.engine.join;
+
+import java.lang.reflect.Array;
+import java.util.AbstractList;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.LinkedHashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.lang3.ArrayUtils;
+
+/**
+ * An immutable list without duplicates.
+ *
+ * For lists of size {@value ImmutableUniqueList#INDEX_THRESHOLD} or larger,
+ * the {@link #indexOf(Object)} method will on first use index the elements
for faster access.
+ */
+public class ImmutableUniqueList<T> extends AbstractList<T> {
+ /** Threshold in the number of variables for when to use additional
indexing structures
+ * in order to improve scalability */
+ static final int INDEX_THRESHOLD = 5;
+
+ /** The builder can emit a key every time build() is called
+ * and it can be continued to be used.
+ */
+ public static final class Builder<T> {
+ private Class<T> itemClass;
+
+ /**
+ * The keys collection upgrades itself from ArrayList to
+ * LinkedHashSet upon adding a sufficient number of items.
+ */
+ private Collection<T> items;
+
+ Builder(Class<T> itemClass) {
+ super();
+ this.itemClass = itemClass;
+ }
+
+ private void alloc(int n) {
+ if (items == null) {
+ items = n < INDEX_THRESHOLD ? new ArrayList<>(INDEX_THRESHOLD)
: new LinkedHashSet<>();
+ } else if (!(items instanceof Set) && (items.size() + n >=
INDEX_THRESHOLD)) {
+ Set<T> tmp = new LinkedHashSet<>(items);
+ items = tmp;
+ }
+ }
+
+ public Builder<T> add(T item) {
+ if (!(items instanceof Set)) {
+ if ( items == null || ! items.contains(item) ) {
+ alloc(1);
+ items.add(item) ;
+ }
+ } else {
+ items.add(item);
+ }
+ return this ;
+ }
+
+ public Builder<T> addAll(Collection<T> items) {
+ alloc(items.size());
+ for (T item : items) {
+ add(item);
+ }
+ return this;
+ }
+
+ public Builder<T> addAll(T[] arr) {
+ alloc(arr.length);
+ for (T item : arr) {
+ add(item);
+ }
+ return this;
+ }
+
+ public Builder<T> remove(Object o) {
+ if (items != null) {
+ items.remove(o) ;
+ }
+ return this ;
+ }
+
+ public Builder<T> clear() {
+ items = null;
+ return this ;
+ }
+
+ public boolean isEmpty() {
+ return items == null || items.isEmpty();
+ }
+
+ @SuppressWarnings("unchecked")
+ public ImmutableUniqueList<T> build() {
+ T[] finalItems;
+ if (items == null) {
+ finalItems = (T[])Array.newInstance(itemClass, 0);
+ } else {
+ finalItems = (T[])Array.newInstance(itemClass, items.size());
+ items.toArray(finalItems);
+ }
+ return new ImmutableUniqueList<>(INDEX_THRESHOLD, finalItems);
+ }
+ }
+
+ public static <T> Builder<T> newUniqueListBuilder(Class<T> itemClass) {
+ return new Builder<>(itemClass);
+ }
+
+ public static <T> ImmutableUniqueList<T> createUniqueList(Class<T>
itemClass, Collection<T> items) {
+ return
ImmutableUniqueList.<T>newUniqueListBuilder(itemClass).addAll(items).build();
+ }
+
+ public static <T> ImmutableUniqueList<T> createUniqueList(Class<T>
itemClass, T[] items) {
+ return
ImmutableUniqueList.<T>newUniqueListBuilder(itemClass).addAll(items).build();
+ }
+
+ /** Subclasses may access the keys array but must never modify it! */
+ protected final T[] elementData;
+ protected final int indexThreshold;
+
+ /** keyToIdx mapping is initialized lazily in {@link #indexOf(Object)} */
+ private transient Map<T, Integer> elementToIndex;
+
+ protected ImmutableUniqueList(T[] elementData) {
+ this(INDEX_THRESHOLD, elementData);
+ }
+
+ protected ImmutableUniqueList(int indexThreshold, T[] elementData) {
+ super();
+ this.indexThreshold = indexThreshold;
+ this.elementData = elementData ;
+ }
+
+ @Override
+ public int size() { return elementData.length; }
+
+ public int length() { return size(); }
+
+ @Override
+ public T get(int i) { return elementData[i]; }
+
+ @Override
+ public boolean contains(Object o) { return indexOf(o) != -1; }
+
+ @Override
+ public int indexOf(Object o) {
+ int result;
+ if (elementData.length < indexThreshold) {
+ result = ArrayUtils.indexOf(elementData, o);
+ } else {
+ if (elementToIndex != null) {
+ result = elementToIndex.getOrDefault(o, -1);
+ } else {
+ // Compute the map from element to its index
+ Map<T, Integer> map = new HashMap<>();
+ for (int i = 0; i < elementData.length; ++i) {
+ T key = elementData[i];
+ map.put(key, i);
+ }
+ result = map.getOrDefault(o, -1);
+ elementToIndex = map;
+ }
+ }
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj)
+ return true;
+ if (obj == null)
+ return false;
+ if (getClass() != obj.getClass())
+ return false;
+ return super.equals(obj);
+ }
+}
diff --git
a/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/Join.java
b/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/Join.java
index c095d22849..cbe3493ebb 100644
--- a/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/Join.java
+++ b/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/Join.java
@@ -50,12 +50,26 @@ public class Join {
* @return QueryIterator
*/
public static QueryIterator join(QueryIterator left, QueryIterator right,
ExecutionContext execCxt) {
+ return join(null, left, right, execCxt);
+ }
+
+ /**
+ * Standard entry point to a join of two streams.
+ * This is not a substitution/index join.
+ * (See {@link OpExecutor} for streamed execution using substitution).
+ * @param joinKey
+ * @param left
+ * @param right
+ * @param execCxt
+ * @return QueryIterator
+ */
+ public static QueryIterator join(JoinKey joinKey, QueryIterator left,
QueryIterator right, ExecutionContext execCxt) {
if ( false )
return debug(left, right, execCxt,
- (_left, _right)->hashJoin(_left, _right, execCxt)) ;
+ (_left, _right)->hashJoin(joinKey, _left, _right,
execCxt)) ;
if ( useNestedLoopJoin )
return nestedLoopJoin(left, right, execCxt) ;
- return hashJoin(left, right, execCxt) ;
+ return hashJoin(joinKey, left, right, execCxt) ;
}
/** Standard entry point to a left join of two streams.
diff --git
a/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/JoinIndex.java
b/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/JoinIndex.java
new file mode 100644
index 0000000000..28611f2073
--- /dev/null
+++ b/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/JoinIndex.java
@@ -0,0 +1,226 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jena.sparql.engine.join;
+
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Objects;
+import java.util.Set;
+import java.util.function.Consumer;
+import java.util.stream.Collectors;
+
+import org.apache.jena.atlas.io.IndentedWriter;
+import org.apache.jena.atlas.io.Printable;
+import org.apache.jena.atlas.iterator.Iter;
+import org.apache.jena.sparql.core.Var;
+import org.apache.jena.sparql.engine.binding.Binding;
+import org.apache.jena.sparql.engine.binding.BindingFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class indexes a set of rows w.r.t. a join key, referred to as the
<i>main join key</i>.
+ * <p />
+ * Consider the main join key [?x, ?y, ?z]:
+ * <p />
+ * All rows that bind all three variables will be placed into the main table.
+ *
+ * Any rows that only bind a sub set of the variables, such as [?x, ?z] or
[?y],
+ * are placed into respective skew tables.
+ *
+ * JoinIndex instances are dynamically created by {@link MultiHashProbeTable}
based on the
+ * variables of the bindings used in lookup requests for matching rows.
+ *
+ * Internally, the lists of variables are represented as bit sets, see also
{@link BitSetMapper}.
+ */
+class JoinIndex
+ implements Iterable<Binding>, Printable
+{
+ private static final Logger logger =
LoggerFactory.getLogger(JoinIndex.class);
+
+ private JoinKey superJoinKey;
+
+ private BitSet mainJoinKeyBitSet;
+ private HashProbeTable mainTable;
+
+ /** Skew tables hold rows whose variables are a strict sub set of those of
this table. */
+ private Map<BitSet, HashProbeTable> skewTables;
+
+ /**
+ * Constructor of JoinIndex.
+ *
+ * @param superJoinKey The join key to which the bit set
representation of all involved variable lists refer to.
+ * @param mainJoinKeyBitSet The main join key as a bit set w.r.t. to the
super join key.
+ * @param mainJoinKey Optionally, as a minor optimization, the main
join key can be provided directly.
+ * It must hold that {@code mainJoinKey =
JoinKey.create(BitSetMapper.toList(superJoinKey, mainJoinKeyBitSet));}
+ */
+ public JoinIndex(JoinKey superJoinKey, BitSet mainJoinKeyBitSet, JoinKey
mainJoinKey) {
+ this.superJoinKey = Objects.requireNonNull(superJoinKey);
+ this.mainJoinKeyBitSet = mainJoinKeyBitSet;
+ if (mainJoinKey == null) {
+ mainJoinKey = JoinKey.create(BitSetMapper.toList(superJoinKey,
mainJoinKeyBitSet));
+ }
+ this.mainTable = new HashProbeTable(mainJoinKey);
+ }
+
+ public BitSet getMainJoinKeyBitSet() {
+ return mainJoinKeyBitSet;
+ }
+
+ public HashProbeTable getMainTable() {
+ return mainTable;
+ }
+
+ public Set<BitSet> getSkewKeys() {
+ return skewTables == null ? Collections.emptySet() :
skewTables.keySet();
+ }
+
+ /** Returns the skew table for the given key. Returns null if there is
none. */
+ public HashProbeTable getSkewTable(BitSet key) {
+ return skewTables == null ? null : skewTables.get(key);
+ }
+
+ public Map<BitSet, HashProbeTable> getSkewTables() {
+ return skewTables == null ? Collections.emptyMap() : skewTables;
+ }
+
+ public Map<JoinKey, HashProbeTable> getSkewTablesByJoinKey() {
+ return getSkewTables().entrySet().stream().collect(Collectors.toMap(
+ e -> JoinKey.create(BitSetMapper.toList(mainTable.getJoinKey(),
e.getKey())),
+ Entry::getValue));
+ }
+
+ /** Returns the skew table for the given key. Creates the table if needed.
Never returns null. */
+ public HashProbeTable getOrCreateSkewTable(BitSet tableKey) {
+ if (skewTables == null) {
+ // LinkedHashMap for determinism: Always traverse skew tables in
their creation order
+ skewTables = new LinkedHashMap<>();
+ }
+ HashProbeTable result = skewTables.computeIfAbsent(tableKey, key -> {
+ List<Var> vars = BitSetMapper.toList(superJoinKey, key);
+ return new HashProbeTable(JoinKey.create(vars));
+ });
+ return result;
+ }
+
+ public void put(Binding row) {
+ BitSet rawRowKey = BitSetMapper.toBitSet(superJoinKey, row);
+
+ // The next two lines are: effectiveRowKey = bitwiseAnd(mainKey,
rawRowKey)
+ BitSet effectiveRowKey = (BitSet)mainJoinKeyBitSet.clone();
+ effectiveRowKey.and(rawRowKey);
+ boolean isSameKey = effectiveRowKey.equals(mainJoinKeyBitSet);
+
+ // If there are no joining variables then append the data to the
no-key bucket of the main table
+ if (effectiveRowKey.isEmpty()) {
+ mainTable.putNoKey(row);
+ } else if (isSameKey) {
+ // Hash the row; will never end up in the no-key bucket because
that case is handled first
+ mainTable.put(row);
+ } else {
+ HashProbeTable skewTable = getOrCreateSkewTable(effectiveRowKey);
+ skewTable.put(row);
+ }
+ }
+
+ public Iterator<Binding> getCandidates(Binding row) {
+ if (logger.isTraceEnabled()) {
+ BitSet joinKeyBitSet = BitSetMapper.toBitSet(superJoinKey, row);
+ logger.trace("Lookup with " + BitSetMapper.toList(superJoinKey,
joinKeyBitSet));
+ }
+
+ Iterator<Binding> it = getMainTable().getCandidates(row); // Includes
no-key bucket
+
+ // Append data from skew tables (if they exist)
+ if (skewTables != null) {
+ for (Entry<BitSet, HashProbeTable> entry : skewTables.entrySet()) {
+ // BitSet skewKey = entry.getKey();
+ HashProbeTable skewTable = entry.getValue();
+
+ Iterator<Binding> subIt = skewTable.getCandidates(row, false);
// Excludes no-key bucket which should be empty anyway.
+ if (logger.isTraceEnabled()) {
+ subIt = printIteratorItems(subIt, "sub-iterator",
logger::trace);
+ }
+
+ it = Iter.concat(it, subIt);
+ }
+ }
+
+ if (logger.isTraceEnabled()) {
+ it = printIteratorItems(it, "Lookup result for " + row,
logger::trace);
+ }
+
+ return it;
+ }
+
+ @Override
+ public Iterator<Binding> iterator() {
+ return getCandidates(BindingFactory.empty());
+ }
+
+ public void clear() {
+ mainTable.clear();
+ if (skewTables != null) {
+ skewTables.clear();
+ }
+ }
+
+ @Override
+ public String toString() {
+ return Printable.toString(this);
+ }
+
+ @Override
+ public void output(IndentedWriter out) {
+ out.ensureStartOfLine();
+ out.println("JoinIndex " + mainTable.getJoinKey());
+ out.incIndent();
+ out.println("Main table: " + mainTable);
+ Map<BitSet, HashProbeTable> skewTables = getSkewTables();
+ if (skewTables.isEmpty()) {
+ out.println("Skew tables: none");
+ } else {
+ out.println("Skew tables");
+ skewTables.values().forEach(table -> {
+ out.incIndent();
+ out.println("|- " + table);
+ out.decIndent();
+ });
+ }
+ out.decIndent();
+ }
+
+ /** Helper function to conditionally print out the content of an iterator.
Returns another iterator over the seen data. */
+ private static <T> Iterator<T> printIteratorItems(Iterator<T> it, String
label, Consumer<String> logger) {
+ List<T> list = new ArrayList<>();
+ it.forEachRemaining(list::add);
+ if (label != null) {
+ logger.accept(label + ": " + list.size() + " items");
+ }
+ for (T item : list) {
+ logger.accept("- " + item);
+ }
+ return list.iterator();
+ }
+}
diff --git
a/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/JoinKey.java
b/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/JoinKey.java
index 2f286766ba..cb9d3f55d7 100644
--- a/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/JoinKey.java
+++ b/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/JoinKey.java
@@ -17,96 +17,144 @@
*/
package org.apache.jena.sparql.engine.join;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Set;
import org.apache.jena.sparql.core.Var ;
/** JoinKey for hash joins */
-public final class JoinKey implements Iterable<Var>
+public final class JoinKey extends ImmutableUniqueList<Var>
{
/** Key of no variables */
- private static final JoinKey emptyKey = new
JoinKey(Collections.emptyList()) ;
+ private static final JoinKey EMPTY = new JoinKey(new Var[0]);
- /** Make a JoinKey from the intersection of two sets **/
+ public static JoinKey empty() {
+ return EMPTY;
+ }
+
+ /** Make a JoinKey from the intersection of two sets **/
public static JoinKey create(Collection<Var> vars1, Collection<Var> vars2)
{
// JoinKeys and choices for keys are generally small so short loops
are best.
List<Var> intersection = new ArrayList<>() ;
for ( Var v : vars1 ) {
if ( vars2.contains(v) )
- intersection.add(v) ;
+ intersection.add(v) ;
}
- return new JoinKey(intersection) ;
+ return create(intersection) ;
}
-
- /** Make a JoinKey of single variable from the intersection of two sets
**/
+
+ /** Make a JoinKey of single variable from the intersection of two sets.
**/
+ @Deprecated
public static JoinKey createVarKey(Collection<Var> vars1, Collection<Var>
vars2) {
for ( Var v : vars1 ) {
if ( vars2.contains(v) )
return create(v) ;
}
- return emptyKey ;
+ return empty() ;
}
-
+
public static JoinKey create(Var var) {
- return new JoinKey(var) ;
+ return createUnsafe(new Var[] { var });
}
-
- /** The builder can emit a key every time build() is caller
+
+ /** The builder can emit a key every time build() is called
* and it can be continued to be used.
*/
public static final class Builder {
- private List<Var> keys = new ArrayList<>() ;
-
- public Builder() { }
-
- public boolean contains(Var var) {
- return keys.contains(var) ;
+
+ private ImmutableUniqueList.Builder<Var> delegate;
+
+ Builder() {
+ this.delegate = newUniqueListBuilder(Var.class);
}
-
+
public Builder add(Var var) {
- // We expect the keys list to be short - a Set is overkill(??)
- if ( ! contains(var) )
- keys.add(var) ;
+ delegate = delegate.add(var);
return this ;
}
-
+
+ public Builder addAll(Collection<Var> vars) {
+ delegate = delegate.addAll(vars);
+ return this;
+ }
+
+ public Builder addAll(Var[] vars) {
+ delegate = delegate.addAll(vars);
+ return this;
+ }
+
public Builder remove(Var var) {
- keys.remove(var) ;
+ delegate = delegate.remove(var);
return this ;
}
- public Builder clear() { keys.clear() ; return this ; }
+ public Builder clear() {
+ delegate = delegate.clear();
+ return this ;
+ }
public JoinKey build() {
- JoinKey joinKey = new JoinKey(new ArrayList<>(keys)) ;
- return joinKey ;
+ JoinKey result;
+ // Reuse singleton empty instance when appropriate
+ if (delegate.isEmpty()) {
+ result = empty();
+ } else {
+ ImmutableUniqueList<Var> list = delegate.build();
+ return new JoinKey(list.elementData);
+ }
+ return result;
}
}
-
- // Consider using an array.
- private final List<Var> keys ;
-
- private JoinKey(List<Var> _keys) { keys = _keys ; }
-
- private JoinKey(Var var) { keys = Collections.singletonList(var) ; }
-
- public boolean isEmpty() { return keys.isEmpty() ; }
-
- public int length() { return keys.size() ; }
-
- /** Get a single variable for this key.
- * For any one key, it always returns the same var */
- public Var getVarKey() {
- if ( keys.isEmpty() )
- return null ;
- return keys.get(0) ;
+
+ public static Builder newBuilder() {
+ return new Builder();
+ }
+
+ @SuppressWarnings("unchecked")
+ public static JoinKey create(Collection<Var> vars) {
+ return vars instanceof Set s
+ ? create(s)
+ : newBuilder().addAll(vars).build();
+ }
+
+ public static JoinKey create(String... varNames) {
+ return create(Var.varList(Arrays.asList(varNames)));
+ }
+
+ public static JoinKey create(Var[] vars) {
+ return newBuilder().addAll(vars).build();
+ }
+
+ /** Create a JoinKey directly from a Set.
+ * The set should be a {@link LinkedHashSet} because variable order
matters for JoinKeys.
+ * This method does not rely on {@link #newBuilder()}. */
+ public static JoinKey create(Set<Var> vars) {
+ Var[] arr = new Var[vars.size()];
+ arr = vars.toArray(arr);
+ return createUnsafe(arr);
+ }
+
+ /**
+ * Create a join key without coping the key array and without checking for
duplicates.
+ * The array must not be modified.
+ */
+ private static JoinKey createUnsafe(Var[] keys) {
+ return keys.length == 0 ? empty() : new JoinKey(keys);
}
-
- @Override
- public Iterator<Var> iterator() { return keys.iterator() ; }
-
- @Override
- public String toString() {
- return keys.toString() ;
+
+ private JoinKey(Var[] keys) {
+ super(keys);
+ }
+
+ /** Get a single variable for this key.
+ * For any one key, it always returns the same var */
+ public Var getVarKey() {
+ if ( elementData.length == 0 )
+ return null ;
+ return elementData[0] ;
}
}
diff --git
a/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/JoinLib.java
b/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/JoinLib.java
index 645e381251..0ea9802d4f 100644
--- a/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/JoinLib.java
+++ b/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/JoinLib.java
@@ -18,6 +18,8 @@
package org.apache.jena.sparql.engine.join;
+import java.util.Iterator;
+
import org.apache.jena.graph.Node ;
import org.apache.jena.sparql.core.Var ;
import org.apache.jena.sparql.engine.binding.Binding ;
@@ -25,7 +27,7 @@ import org.apache.jena.sparql.engine.binding.Binding ;
/** Internal operations in support of join algorithms. */
class JoinLib {
- /** Control stats output / development use */
+ /** Control stats output / development use */
static final boolean JOIN_EXPLAIN = false;
// No hash key marker.
@@ -41,11 +43,16 @@ class JoinLib {
return h;
}
- public static Object hash(JoinKey joinKey, Binding row) {
+ public static Object hash(Iterable<Var> joinKey, Binding row) {
+ return hash(joinKey.iterator(), row);
+ }
+
+ public static Object hash(Iterator<Var> vars, Binding row) {
long x = 31 ;
- boolean seenJoinKeyVar = false ;
+ boolean seenJoinKeyVar = false ;
// Neutral to order in the set.
- for ( Var v : joinKey ) {
+ while (vars.hasNext()) {
+ Var v = vars.next();
Node value = row.get(v) ;
long h = nullHashCode ;
if ( value != null ) {
@@ -54,7 +61,7 @@ class JoinLib {
} else {
// In join key, not in row.
}
-
+
x = x ^ h ;
}
if ( ! seenJoinKeyVar )
diff --git
a/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/MultiHashProbeTable.java
b/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/MultiHashProbeTable.java
new file mode 100644
index 0000000000..f921795dfc
--- /dev/null
+++
b/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/MultiHashProbeTable.java
@@ -0,0 +1,237 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jena.sparql.engine.join;
+
+import java.util.BitSet;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedHashSet;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import org.apache.jena.atlas.io.IndentedWriter;
+import org.apache.jena.atlas.io.Printable;
+import org.apache.jena.sparql.core.Var;
+import org.apache.jena.sparql.engine.binding.Binding;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Probe table that can store a set of bindings and dynamically re-indexes them
+ * in additional {@link JoinIndex} instances based on the lookup requests.
+ * The initial set of bindings is stored in an initial JoinIndex from which
further indexes are derived.
+ * <p>
+ * A unique list of seen variables is maintained which captures the set of
variables mentioned across all stored bindings.
+ * <p>
+ * A JoinIndex instance is created for each list of variables obtained from
the intersection of
+ * the seen variable list and the set of variables that appear in a lookup
request.
+ * <p>
+ * Sets of variables are represented as bit sets based on the seen variable
list. So if the variables [?x ?y ?z] have been seen,
+ * then a lookup with a binding mentioning [?x ?z] will be represented with
the bit set 101.
+ * <p>
+ * Each JoinIndex holds all bindings that were added to the
MultiHashProbeTable.
+ * A JoinIndex is partitioned into a single main table and zero or more skew
tables.
+ * Every binding that bind all variables of the intersection with a lookup
request is placed into the main table.
+ * Every binding that binds fewer variables is placed into a respective skew
table.
+ */
+class MultiHashProbeTable
+ implements Printable
+{
+ private static final Logger logger =
LoggerFactory.getLogger(MultiHashProbeTable.class);
+
+ // Enable printing of debug output
+ private static boolean isDebugOutputEnabled = false;
+
+ /**
+ * The joinKey specifies the largest set of variables for which to create
HashProbeTable indexes.
+ * If it is null then there is no restriction.
+ */
+ private final JoinKey maxJoinKey;
+
+ /** The initial index where all rows are put into. Further indexes may be
created as needed during lookups. */
+ private final JoinIndex initialIndex;
+
+ /** Tables for hash lookups. Will be created dynamically during {@link
#getCandidates(Binding)}. */
+ private final Map<BitSet, JoinIndex> indexes = new HashMap<>();
+
+ /** The set of seen variables across all rows */
+ private final Set<Var> seenVarSet = new LinkedHashSet<>();
+
+ /* The following two fields are initialized on the first call to
getCandidates */
+
+ private boolean isFinalized = false;
+
+ /** Instead of using Set>Var< we create bit sets w.r.t. seenVarSet
to represent subsets of variables */
+ private JoinKey seenVarsJoinKey;
+
+ public MultiHashProbeTable(JoinKey maxJoinKey, JoinKey initialJoinKey) {
+ this.maxJoinKey = maxJoinKey;
+
+ if (maxJoinKey != null && initialJoinKey != null &&
!maxJoinKey.containsAll(initialJoinKey)) {
+ throw new IllegalArgumentException("Variables of the initial join
key must be a sub set of the root one.");
+ }
+
+ if (initialJoinKey == null) {
+ initialJoinKey = JoinKey.empty();
+ }
+
+ // If an initial join key is given then its variables are added to
seen vars
+ // so that those correspond to the first bits of the bit keys
+ seenVarSet.addAll(initialJoinKey);
+
+ int nbits = initialJoinKey.size();
+ BitSet initialJoinKeyBitset = new BitSet(nbits);
+ initialJoinKeyBitset.flip(0, nbits);
+
+ if (logger.isTraceEnabled()) {
+ logger.trace("Initial join index configured with variables " +
initialJoinKey + " and bits " + initialJoinKeyBitset);
+ }
+
+ this.initialIndex = new JoinIndex(initialJoinKey,
initialJoinKeyBitset, initialJoinKey);
+ }
+
+ Map<JoinKey, JoinIndex> getIndexesByJoinKeys() {
+ return indexes.entrySet().stream()
+ .collect(Collectors.toMap(e -> toJoinKey(e.getKey()),
Entry::getValue));
+ }
+
+ Map<BitSet, JoinIndex> getIndexes() {
+ return indexes;
+ }
+
+ JoinKey toJoinKey(BitSet bitSet) {
+ JoinKey vars = JoinKey.create(BitSetMapper.toList(seenVarsJoinKey,
bitSet));
+ return vars;
+ }
+
+ public void put(Binding row) {
+ if (isFinalized) {
+ throw new IllegalStateException("Cannot add more bindings after a
lookup was performed.");
+ }
+ updateSeenVars(row);
+ initialIndex.put(row);
+ }
+
+ /** Update seen vars with the row's relevant variables w.r.t. an optional
rootJoinKey. */
+ private void updateSeenVars(Binding row) {
+ if (maxJoinKey == null) {
+ row.vars().forEachRemaining(seenVarSet::add);
+ } else {
+ // Iterate over the smallest set of variables
+ if (maxJoinKey.length() < row.size()) { // join key has fewer vars
than the row
+ maxJoinKey.forEach(v -> {
+ if (row.contains(v)) {
+ seenVarSet.add(v);
+ }
+ });
+ } else { // row has fewer vars than the join key
+ row.vars().forEachRemaining(v -> {
+ if (maxJoinKey.contains(v)) {
+ seenVarSet.add(v);
+ }
+ });
+ }
+ }
+ }
+
+ public Iterator<Binding> getCandidates(Binding row) {
+ if (!isFinalized) {
+ doFinalize();
+ }
+ BitSet joinKeyBitSet = BitSetMapper.toBitSet(seenVarsJoinKey, row);
+ if (isDebugOutputEnabled) {
+ if (logger.isDebugEnabled()) {
+ logger.debug("Lookup with " +
BitSetMapper.toList(seenVarsJoinKey, joinKeyBitSet));
+ }
+ }
+
+ Iterator<Binding> it;
+ if (joinKeyBitSet.isEmpty()) {
+ // Case for no joining variables: all bindings unconditionally
become candidates
+ it = initialIndex.iterator();
+ } else {
+ JoinIndex primaryIndex = getOrCreateJoinIndex(joinKeyBitSet);
+ it = primaryIndex.getCandidates(row);
+ }
+ return it;
+ }
+
+ /** Calling this method indicates that all bindings have been collected
+ * and are ready for indexing. No further updates are allowed unless
+ * {@link #clear()} is called.
+ *
+ * This method is package private so that it can be called from tests.
+ */
+ void doFinalize() {
+ // Note: We need to stick with the variable order provided in the
initial index -> don't sort!
+ // Arrays.sort(seenVars, (a, b) -> a.getName().compareTo(b.getName()));
+ seenVarsJoinKey = JoinKey.create(seenVarSet);
+ indexes.put(initialIndex.getMainJoinKeyBitSet(), initialIndex);
+ isFinalized = true;
+ }
+
+ private JoinIndex getOrCreateJoinIndex(BitSet joinKeyBitSet) {
+ JoinIndex result = indexes.computeIfAbsent(joinKeyBitSet,
this::createJoinIndex);
+ return result;
+ }
+
+ private JoinIndex createJoinIndex(BitSet joinKeyBitSet) {
+ JoinKey joinKey = JoinKey.create(BitSetMapper.toList(seenVarsJoinKey,
joinKeyBitSet));
+ JoinIndex result = new JoinIndex(seenVarsJoinKey, joinKeyBitSet,
joinKey);
+
+ if (logger.isTraceEnabled()) {
+ logger.trace("Creating join index with variables " + joinKey + "
and bits " + joinKeyBitSet);
+ }
+
+ for (Binding row : initialIndex) {
+ result.put(row);
+ }
+
+ return result;
+ }
+
+ @Override
+ public String toString() {
+ return Printable.toString(this);
+ }
+
+ @Override
+ public void output(IndentedWriter out) {
+ out.ensureStartOfLine();
+ out.println("MultiHashProbeTable");
+ out.incIndent();
+ getIndexesByJoinKeys().forEach((joinKey, index) -> {
+ index.output(out);
+ });
+ out.decIndent();
+ }
+
+ public Iterator<Binding> values() {
+ return initialIndex.iterator();
+ }
+
+ public void clear() {
+ indexes.clear();
+ initialIndex.clear();
+ seenVarSet.clear();
+ seenVarsJoinKey = null;
+ isFinalized = false;
+ }
+}
diff --git
a/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/QueryIterHashJoin.java
b/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/QueryIterHashJoin.java
index 8252466cde..a16f7a9b12 100644
---
a/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/QueryIterHashJoin.java
+++
b/jena-arq/src/main/java/org/apache/jena/sparql/engine/join/QueryIterHashJoin.java
@@ -18,13 +18,12 @@
package org.apache.jena.sparql.engine.join;
-import org.apache.jena.atlas.logging.Log ;
import org.apache.jena.sparql.engine.ExecutionContext ;
import org.apache.jena.sparql.engine.QueryIterator ;
import org.apache.jena.sparql.engine.binding.Binding ;
import org.apache.jena.sparql.engine.iterator.QueryIterNullIterator ;
-/** Hash left join.
+/** Hash left join.
* This code materializes the right into a probe table
* then hash joins from the left.
*/
@@ -33,7 +32,7 @@ import
org.apache.jena.sparql.engine.iterator.QueryIterNullIterator ;
//* then hash joins from the right.
public class QueryIterHashJoin extends AbstractIterHashJoin {
-
+
/**
* Create a hashjoin QueryIterator.
* @param joinKey Join key - if null, one is guessed by snooping the
input QueryIterators
@@ -49,11 +48,10 @@ public class QueryIterHashJoin extends AbstractIterHashJoin
{
right.close() ;
return QueryIterNullIterator.create(execCxt) ;
}
- if ( joinKey != null && joinKey.length() > 1 )
- Log.warn(QueryIterHashJoin.class, "Multivariable join key") ;
- return new QueryIterHashJoin(joinKey, left, right, execCxt) ;
+
+ return new QueryIterHashJoin(joinKey, left, right, execCxt) ;
}
-
+
/**
* Create a hashjoin QueryIterator.
* @param left
@@ -61,11 +59,11 @@ public class QueryIterHashJoin extends AbstractIterHashJoin
{
* @param execCxt
* @return QueryIterator
*/
-
+
public static QueryIterator create(QueryIterator left, QueryIterator
right, ExecutionContext execCxt) {
return create(null, left, right, execCxt) ;
}
-
+
private QueryIterHashJoin(JoinKey joinKey, QueryIterator left,
QueryIterator right, ExecutionContext execCxt) {
super(joinKey, left, right, execCxt) ;
}
@@ -79,7 +77,7 @@ public class QueryIterHashJoin extends AbstractIterHashJoin {
protected Binding noYieldedRows(Binding rowCurrentProbe) {
return null;
}
-
+
@Override
protected QueryIterator joinFinished() {
return null;
diff --git
a/jena-arq/src/test/java/org/apache/jena/sparql/engine/join/AbstractTestInnerJoin.java
b/jena-arq/src/test/java/org/apache/jena/sparql/engine/join/AbstractTestInnerJoin.java
index 5152e4e72b..dbb8c0acfe 100644
---
a/jena-arq/src/test/java/org/apache/jena/sparql/engine/join/AbstractTestInnerJoin.java
+++
b/jena-arq/src/test/java/org/apache/jena/sparql/engine/join/AbstractTestInnerJoin.java
@@ -23,9 +23,9 @@ import org.apache.jena.sparql.core.Var ;
import org.apache.jena.sparql.expr.ExprList ;
import org.junit.Test ;
-/** Tests for inner/equi joins */
+/** Tests for inner/equi joins */
public abstract class AbstractTestInnerJoin extends AbstractTestJoin {
-
+
@Override
protected void executeTest(JoinKey joinKey, Table left, Table right,
ExprList conditions, Table expectedResults) {
if ( conditions != null )
@@ -35,12 +35,12 @@ public abstract class AbstractTestInnerJoin extends
AbstractTestJoin {
executeTestJoin("1", joinKey, left, right, null, expectedResults) ;
executeTestJoin("2", joinKey, right, left, null, expectedResults) ;
}
-
+
@Test public void join_basic_1() { testJoin("a", table0(), table0(),
table0()) ; }
@Test public void join_basic_2() { testJoin("a", table1(), table0(),
table0()) ; }
@Test public void join_basic_3() { testJoin("a", tableD1(), table1(),
tableD1()) ; }
@Test public void join_basic_4() { testJoin("z", tableD1(), table1(),
tableD1()) ; }
-
+
@Test public void join_basic_5() { testJoin("a", table0(), table1(),
table0()) ; }
@Test public void join_basic_6() { testJoin("a", table1(), table0(),
table0()) ; }
@@ -71,16 +71,41 @@ public abstract class AbstractTestInnerJoin extends
AbstractTestJoin {
@Test public void join_skew_01() { testJoin("x", tableS1(), tableS2(),
tableS1J2()) ; }
@Test public void join_skew_02() { testJoin("w", tableS1(), tableS2(),
tableS1J2()) ; }
@Test public void join_skew_03() { testJoin(null, tableS1(), tableS2(),
tableS1J2()) ; }
- //@Test
- // Multiple variable join keys on skew data don't work.
- public void join_skew_04() {
+
+ // Skew tests where the order of the two bindings in tableS1 is swapped.
+ @Test public void join_skew_01b() { testJoin("x", tableS1b(), tableS2(),
tableS1J2()) ; }
+ @Test public void join_skew_02b() { testJoin("w", tableS1b(), tableS2(),
tableS1J2()) ; }
+ @Test public void join_skew_03b() { testJoin(null, tableS1b(), tableS2(),
tableS1J2()) ; }
+
+
+ @Test
+ public void join_skew_04() {
JoinKey joinKey = new JoinKey.Builder()
.add(Var.alloc("x"))
.add(Var.alloc("w"))
.build() ;
- testJoinWithKey(joinKey, tableS1(), tableS2(), tableS1J2()) ;
+ testJoinWithKey(joinKey, tableS1(), tableS2(), tableS1J2()) ;
}
-
+
+ @Test
+ public void join_skew_05() {
+ Table in = parseTableInt("""
+ (table
+ (row (?x undef) (?y 0))
+ (row (?x 0) (?y 0))
+ )""");
+
+ Table expected = parseTableInt("""
+ (table
+ (row (?x undef) (?y 0))
+ (row (?x 0) (?y 0))
+ (row (?x 0) (?y 0))
+ (row (?x 0) (?y 0))
+ )""");
+
+ testJoin(null, in, in, expected) ;
+ }
+
// Disjoint tables.
@Test public void join_disjoint_01() { testJoin("a", tableD2(), tableD8(),
tableD8x2()) ; }
@Test public void join_disjoint_02() { testJoin("z", tableD2(), tableD8(),
tableD8x2()) ; }
diff --git
a/jena-arq/src/test/java/org/apache/jena/sparql/engine/join/AbstractTestJoin.java
b/jena-arq/src/test/java/org/apache/jena/sparql/engine/join/AbstractTestJoin.java
index c4240d5d41..93edc75b53 100644
---
a/jena-arq/src/test/java/org/apache/jena/sparql/engine/join/AbstractTestJoin.java
+++
b/jena-arq/src/test/java/org/apache/jena/sparql/engine/join/AbstractTestJoin.java
@@ -225,6 +225,14 @@ public abstract class AbstractTestJoin extends Assert {
," (row (?z <http://example/z1>) (?x
<http://example/x>) (?w 'w11-1'))"
," (row (?z <http://example/z4>) (?x
<http://example/x>)))"
); }
+
+ // This is tableS1 with reversed order of bindings
+ protected static Table tableS1b() {
+ return parseTableInt("(table"
+ ," (row (?z <http://example/z4>) (?x
<http://example/x>))"
+ ," (row (?z <http://example/z1>) (?x
<http://example/x>) (?w 'w11-1')))"
+ ); }
+
protected static Table tableS2() {
return parseTableInt("(table (row (?x <http://example/x>) (?w
<http://example/z1>)))") ;
}
diff --git
a/jena-arq/src/test/java/org/apache/jena/sparql/engine/join/TestBitSetMapper.java
b/jena-arq/src/test/java/org/apache/jena/sparql/engine/join/TestBitSetMapper.java
new file mode 100644
index 0000000000..164a659b00
--- /dev/null
+++
b/jena-arq/src/test/java/org/apache/jena/sparql/engine/join/TestBitSetMapper.java
@@ -0,0 +1,80 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.sparql.engine.join;
+
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.List;
+
+import org.apache.jena.sparql.core.Var;
+import org.apache.jena.sparql.engine.binding.Binding;
+import org.apache.jena.sparql.sse.SSE;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestBitSetMapper {
+
+ /** Degenerate case: Binding with no variables. */
+ @Test
+ public void test01() {
+ testRoundtrip(Arrays.asList("x", "y", "z"), "(row )", Arrays.asList());
+ }
+
+ /** Degenerate case: Join key with no variables. */
+ @Test
+ public void test02() {
+ testRoundtrip(Arrays.asList(), "(row (?x 0) (?y 1) (?z 2))",
Arrays.asList());
+ }
+
+ /** Degenerate case: Empty intersection between variables of the join key
and the binding. */
+ @Test
+ public void test03() {
+ testRoundtrip(Arrays.asList("x", "y", "z"), "(row (?a 0) (?b 1) (?c
2))", Arrays.asList());
+ }
+
+ @Test
+ public void test04() {
+ testRoundtrip(Arrays.asList("x", "y", "z"), "(row (?x 0) (?y 1) (?z
2))", Arrays.asList("x", "y", "z"));
+ }
+
+ @Test
+ public void test05() {
+ testRoundtrip(Arrays.asList("x", "y", "z"), "(row (?y 1) (?z 2))",
Arrays.asList("y", "z"));
+ }
+
+ /** Variables that do not occur in the join key must be omitted. */
+ @Test
+ public void test06() {
+ testRoundtrip(Arrays.asList("x", "y", "z"), "(row (?y 1) (?w 2))",
Arrays.asList("y"));
+ }
+
+ /** Create a bit representation of the variables common to the join key
and the binding.
+ * Converting the bit set back to the variable list is expected to yield
the list of common variables. */
+ private static void testRoundtrip(List<String> joinKeyVarNames, String
bindingSse, List<String> expectedVarNames) {
+ JoinKey joinKey = JoinKey.create(Var.varList(joinKeyVarNames));
+ Binding binding = SSE.parseBinding(bindingSse);
+
+ BitSet bitSet = BitSetMapper.toBitSet(joinKey, binding);
+
+ List<Var> expectedVars = Var.varList(expectedVarNames);
+ List<Var> actualVars = BitSetMapper.toList(joinKey, bitSet);
+
+ Assert.assertEquals(expectedVars, actualVars);
+ }
+}
diff --git
a/jena-arq/src/test/java/org/apache/jena/sparql/engine/join/TestJoinKey.java
b/jena-arq/src/test/java/org/apache/jena/sparql/engine/join/TestJoinKey.java
new file mode 100644
index 0000000000..ae9d8e688b
--- /dev/null
+++ b/jena-arq/src/test/java/org/apache/jena/sparql/engine/join/TestJoinKey.java
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.sparql.engine.join;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.jena.sparql.core.Var;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestJoinKey {
+
+ public void testEmptyKey() {
+ Assert.assertEquals(JoinKey.create(), JoinKey.empty());
+ Assert.assertTrue(JoinKey.create() == JoinKey.empty());
+ }
+
+ /** Join key building is expected to order elements by their first
occurrence and drop duplicates. */
+ @Test
+ public void testDuplicates01() {
+ JoinKey expected = JoinKey.create("a");
+ JoinKey actual = JoinKey.create("a", "a", "a");
+ Assert.assertEquals(expected, actual);
+ }
+
+ /** Join key building is expected to order elements by their first
occurrence and drop duplicates. */
+ @Test
+ public void testDuplicates02() {
+ JoinKey expected = JoinKey.create("c", "a", "b");
+ JoinKey actual = JoinKey.create("c", "a", "a", "b", "a", "c", "b");
+ Assert.assertEquals(expected, actual);
+ }
+
+ @Test
+ public void testIndexOf_small() {
+ testIndexOf("a", "b", "c");
+ }
+
+ /** The {@link JoinKey} implementation creates an extra var-to-index
mapping
+ * when the number of variables exceeds a threshold;
+ * by default {@link ImmutableUniqueList#INDEX_THRESHOLD}.
+ * Here we check whether indexOf still works as expected. */
+ @Test
+ public void testIndexOf_large() {
+ testIndexOf("a", "b", "c", "d", "e", "f", "g");
+ }
+
+ /** Tests the join key's indexOf method.
+ * The index of each var in the join key must match the position in the
argument. */
+ private static void testIndexOf(String... varNames) {
+ List<Var> vars = Var.varList(Arrays.asList(varNames));
+ JoinKey joinKey = JoinKey.create(vars);
+ Assert.assertEquals(vars.size(), joinKey.length());
+ for (int i = 0; i < joinKey.length(); ++i) {
+ Var v = vars.get(i);
+ int index = joinKey.indexOf(v);
+ Assert.assertEquals(i, index);
+ }
+ }
+}
diff --git
a/jena-arq/src/test/java/org/apache/jena/sparql/engine/join/TestMultiHashProbeTable.java
b/jena-arq/src/test/java/org/apache/jena/sparql/engine/join/TestMultiHashProbeTable.java
new file mode 100644
index 0000000000..11f3511a01
--- /dev/null
+++
b/jena-arq/src/test/java/org/apache/jena/sparql/engine/join/TestMultiHashProbeTable.java
@@ -0,0 +1,150 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.sparql.engine.join;
+
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.jena.atlas.iterator.Iter;
+import org.apache.jena.sparql.engine.binding.Binding;
+import org.apache.jena.sparql.engine.binding.BindingFactory;
+import org.apache.jena.sparql.sse.SSE;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestMultiHashProbeTable {
+ // Binding examples taken from TestVarFinder2
+ private static Binding row_xy = SSE.parseBinding("(row (?x 1) (?y 2))");
+ private static Binding row_x = SSE.parseBinding("(row (?x 1))");
+ private static Binding row_y = SSE.parseBinding("(row (?y 2))");
+ private static Binding row0 = SSE.parseBinding("(row)");
+
+ private static Binding match_x3 = SSE.parseBinding("(row (?x 3))");
+ private static Binding match_x3y2 = SSE.parseBinding("(row (?x 3) (?y
2))");
+
+ // TODO Parameterize with different initial join keys including null
+
+ /** Ensure that further updates are rejected after the first lookup. */
+ @Test(expected = IllegalStateException.class)
+ public void test01() {
+ MultiHashProbeTable table = new MultiHashProbeTable(null, null);
+ table.getCandidates(row0);
+ table.put(row_xy);
+ }
+
+ /** Runs the lookup process repeatedly to check for any non-deterministic
behavior,
+ * such as scrambled variable orders in JoinKeys due to use of HashSet
where LinkedHashSet
+ * needed to be used. */
+ @Test
+ public void testLookupProcessRepeated() {
+ for (int i = 0; i < 1000; ++i) {
+ testLookupProcess();
+ }
+ }
+
+ /**
+ * This test adds 4 rows to a MultiHashProbeTable and performs 2 lookups.
+ * The state of the created indexes, main and skew tables is checked.
+ */
+ @Test
+ public void testLookupProcess() {
+ List<Binding> givenRowList = List.of(row_xy, row_x, row_y, row0);
+
+ // Set.of() methods may not retain order which breaks tests because
the join keys
+ // depend on which variables are seen first and JoinKey(?x ?y) and
JoinKey(?y ?x)
+ // are not equal
+ Set<Binding> givenRowSet = new LinkedHashSet<>(givenRowList);
+
+ // Sanity check of the input data - this test case does not deal with
duplicates
+ Assert.assertEquals(givenRowList.size(), givenRowSet.size());
+
+ MultiHashProbeTable table = new MultiHashProbeTable(null, null);
+ givenRowSet.forEach(table::put);
+ table.doFinalize();
+
+ // We expect only a table for the initial empty join key
+ Assert.assertEquals(Set.of(JoinKey.empty()),
table.getIndexesByJoinKeys().keySet());
+
+ // Lookup with empty row should match all 4 rows
+ // (We don't test whether implementations preserve order)
+ Set<Binding> matchedRows = Iter.toSet(table.getCandidates(row0));
+ Assert.assertEquals(givenRowSet, matchedRows);
+
+ JoinKey emptyKey = JoinKey.empty();
+ JoinKey xKey = JoinKey.create("x");
+ JoinKey yKey = JoinKey.create("y");
+ JoinKey xyKey = JoinKey.create("x", "y"); // ISSUE join key order may
get scrambled
+
+ {
+ // Lookup should match all rows without y
+ Set<Binding> actual = Iter.toSet(table.getCandidates(match_x3));
+ Assert.assertEquals(Set.of(row_y, row0), actual);
+
+ // The indexes map is not a live-view
+ Map<JoinKey, JoinIndex> indexes = table.getIndexesByJoinKeys();
+
+ // There should now be an additional index on x
+ Assert.assertEquals(Set.of(emptyKey, xKey),
+ table.getIndexesByJoinKeys().keySet());
+
+ JoinIndex xIndex = indexes.get(xKey);
+
+ // Main table should contain all bindings
+ Assert.assertEquals(givenRowSet,
+
Iter.toSet(xIndex.getMainTable().getCandidates(BindingFactory.empty())));
+
+ // There should not be a skew table
+ Assert.assertTrue(xIndex.getSkewTables().isEmpty());
+ }
+
+ {
+ // Lookup with join key (x, y) which does not match rows with x
+ Set<Binding> actual = Iter.toSet(table.getCandidates(match_x3y2));
+ Assert.assertEquals(Set.of(row_y, row0), actual);
+
+ // There should now be an additional index on y
+ Assert.assertEquals(Set.of(emptyKey, xKey, xyKey),
+ table.getIndexesByJoinKeys().keySet());
+
+ // The indexes map is not a live-view
+ Map<JoinKey, JoinIndex> indexes = table.getIndexesByJoinKeys();
+
+ // There should now be an additional index on [?x, ?y]
+ Assert.assertEquals(Set.of(emptyKey, xKey, xyKey),
+ table.getIndexesByJoinKeys().keySet());
+
+ JoinIndex xyIndex = indexes.get(xyKey);
+
+ // Main table should contain all bindings that either bind xy or
neither
+ Assert.assertEquals(Set.of(row_xy, row0),
+
Iter.toSet(xyIndex.getMainTable().getCandidates(BindingFactory.empty())));
+
+ // There should be skew tables for the bindings with only ?x and
only ?y.
+ // Beware: Skew tables are created for all cached bindings that
bind fewer variables than
+ // the lookup binding - they don't depend on the lookup binding's
values.
+ Map<JoinKey, HashProbeTable> skewTables =
xyIndex.getSkewTablesByJoinKey();
+ Assert.assertEquals(Set.of(xKey, yKey), skewTables.keySet());
+
+ Assert.assertEquals(Set.of(row_x),
Iter.toSet(skewTables.get(xKey).values()));
+ Assert.assertEquals(Set.of(row_y),
Iter.toSet(skewTables.get(yKey).values()));
+ }
+ }
+}
diff --git a/jena-benchmarks/jena-benchmarks-jmh/README.md
b/jena-benchmarks/jena-benchmarks-jmh/README.md
new file mode 100644
index 0000000000..da1cbca6c3
--- /dev/null
+++ b/jena-benchmarks/jena-benchmarks-jmh/README.md
@@ -0,0 +1,7 @@
+# jena-benchmarks-jmh
+
+This module contains benchmarks implemented using the Java Microbenchmark
Harness (jmh) and JUnit.
+
+## Troubleshooting
+
+* `Unable to find the resource: /META-INF/BenchmarkList`: If you encounter
this error while attempting to run the benchmark JUnit tests from the IDE of
your choice then it means that the jmh annotation processor has not yet been
run. The processor should run as part of the usual java compilation (via the
`maven-compiler-plugin`). You can manually force the build by running `mvn
-Pdev clean install` on the `jena-benchmarks-jmh` module from the command line.
diff --git
a/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/benchmark/QueryTask.java
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/benchmark/QueryTask.java
new file mode 100644
index 0000000000..e2c1d900c9
--- /dev/null
+++
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/benchmark/QueryTask.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jena.sparql.engine.benchmark;
+
+/** Base class for a task that executes a (SELECT) query and can optionally
validate the result set size. */
+public abstract class QueryTask {
+ protected final String queryString;
+ protected final long expectedResultSetSize;
+ protected final boolean skipExecution;
+ protected final boolean skipValidation;
+
+ public QueryTask(String queryString, long expectedResultSetSize, boolean
skipExecution, boolean skipValidation) {
+ super();
+ this.queryString = queryString;
+ this.expectedResultSetSize = expectedResultSetSize;
+ this.skipExecution = skipExecution;
+ this.skipValidation = skipValidation;
+ }
+
+ public String getQueryString() {
+ return queryString;
+ }
+
+ public long getExpectedResultSetSize() {
+ return expectedResultSetSize;
+ }
+
+ public boolean skipExecution() {
+ return skipExecution;
+ }
+
+ public boolean skipValidation() {
+ return skipValidation;
+ }
+
+ public abstract QueryTaskResult exec();
+}
diff --git
a/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/benchmark/QueryTaskBuilder.java
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/benchmark/QueryTaskBuilder.java
new file mode 100644
index 0000000000..b6cc464fee
--- /dev/null
+++
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/benchmark/QueryTaskBuilder.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jena.sparql.engine.benchmark;
+
+public abstract class QueryTaskBuilder {
+ protected String queryString;
+ protected long expectedResultSetSize;
+ protected boolean skipValidation;
+ protected boolean skipExecution;
+
+ public QueryTaskBuilder query(String queryString) {
+ this.queryString = queryString;
+ return this;
+ }
+
+ /** For select queries: Set the expected result set size. */
+ public QueryTaskBuilder expectedResultSetSize(long expectedResultSetSize) {
+ this.expectedResultSetSize = expectedResultSetSize;
+ return this;
+ }
+
+ public QueryTaskBuilder skipExecution(boolean skipExecution) {
+ this.skipExecution = skipExecution;
+ return this;
+ }
+
+ public QueryTaskBuilder skipValidation(boolean skipValidation) {
+ this.skipValidation = skipValidation;
+ return this;
+ }
+
+ public abstract QueryTask build();
+}
diff --git
a/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/benchmark/QueryTaskBuilderRegistry.java
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/benchmark/QueryTaskBuilderRegistry.java
new file mode 100644
index 0000000000..e7d9846673
--- /dev/null
+++
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/benchmark/QueryTaskBuilderRegistry.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jena.sparql.engine.benchmark;
+
+import java.util.Map;
+import java.util.NoSuchElementException;
+import java.util.Optional;
+import java.util.concurrent.ConcurrentHashMap;
+
+import org.apache.jena.atlas.lib.Creator;
+import org.apache.jena.sparql.engine.join.QueryTaskBuilder480;
+import org.apache.jena.sparql.engine.join.QueryTaskBuilderCurrent;
+
+public class QueryTaskBuilderRegistry {
+ private static final QueryTaskBuilderRegistry INSTANCE = new
QueryTaskBuilderRegistry();
+
+ static {
+ INSTANCE.put("current", () -> new QueryTaskBuilderCurrent());
+ INSTANCE.put("4.8.0", () -> new QueryTaskBuilder480());
+ }
+
+ private final Map<String, Creator<QueryTaskBuilder>> registry = new
ConcurrentHashMap<>();
+
+ private QueryTaskBuilderRegistry() {
+ }
+
+ public static QueryTaskBuilderRegistry get() {
+ return INSTANCE;
+ }
+
+ /** Returns a factory for creating fresh instances of QueryTaskBuilder for
the given name. */
+ public Creator<QueryTaskBuilder> get(String name) {
+ return Optional.ofNullable(registry.get(name))
+ .orElseThrow(() -> new NoSuchElementException("No task builder
with name " + name));
+ }
+
+ public void put(String name, Creator<QueryTaskBuilder> builderFactory) {
+ registry.put(name, builderFactory);
+ }
+}
diff --git
a/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/benchmark/QueryTaskReader.java
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/benchmark/QueryTaskReader.java
new file mode 100644
index 0000000000..03d85c7036
--- /dev/null
+++
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/benchmark/QueryTaskReader.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jena.sparql.engine.benchmark;
+
+import java.util.List;
+import java.util.Optional;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import org.apache.jena.atlas.lib.Creator;
+import org.apache.jena.rdf.model.Model;
+import org.apache.jena.rdf.model.Property;
+import org.apache.jena.rdf.model.Resource;
+import org.apache.jena.rdf.model.ResourceFactory;
+import org.apache.jena.rdf.model.Statement;
+import org.apache.jena.riot.RDFDataMgr;
+
+/** Utility methods to read RDF query task descriptions and create {@link
QueryTask} objects from them. */
+public class QueryTaskReader {
+ // Internal vocabulary for representing the query tasks. May be changed /
improved.
+ public static final String NS = "http://www.example.org/";
+ public static final Property queryString =
ResourceFactory.createProperty(NS + "queryString");
+ public static final Property expectedResultSetSize =
ResourceFactory.createProperty(NS + "expectedResultSetSize");
+
+ /**
+ * The 'skipValidation' property disables checking the expected result set
size for the specified versions
+ * A more complex solution could allow specifying expected results per
version
+ */
+ public static final Property skipValidation =
ResourceFactory.createProperty(NS + "skipValidation");
+ public static final Property skipExecution =
ResourceFactory.createProperty(NS + "skipExecution");
+
+ public static QueryTask loadOne(String location, String jenaVersion) {
+ Creator<QueryTaskBuilder> taskBuilder =
QueryTaskBuilderRegistry.get().get(jenaVersion);
+ return loadOne(location, jenaVersion, taskBuilder);
+ }
+
+ public static QueryTask loadOne(String location, String jenaVersion,
Creator<QueryTaskBuilder> taskBuilderCreator) {
+ List<QueryTask> tasks = load(location, jenaVersion,
taskBuilderCreator);
+ if (tasks.size() != 1) {
+ throw new RuntimeException("Exactly one task expected");
+ }
+ return tasks.get(0);
+ }
+
+ public static List<QueryTask> load(String location, String jenaVersion,
Creator<QueryTaskBuilder> taskBuilderCreator) {
+ Model model = RDFDataMgr.loadModel(location);
+ return load(model, jenaVersion, taskBuilderCreator);
+ }
+
+ public static List<QueryTask> load(Model model, String jenaVersion,
Creator<QueryTaskBuilder> taskBuilderCreator) {
+ List<Resource> taskDescriptions =
model.listResourcesWithProperty(queryString).toList();
+ List<QueryTask> result = taskDescriptions.stream()
+ .map(task -> configure(task, jenaVersion,
taskBuilderCreator.create()).build())
+ .collect(Collectors.toList());
+ return result;
+ }
+
+ public static QueryTaskBuilder configure(Resource taskDescription, String
jenaVersion, QueryTaskBuilder taskBuilder) {
+ String query =
taskDescription.getRequiredProperty(queryString).getString();
+ long size =
Optional.ofNullable(taskDescription.getProperty(expectedResultSetSize)).map(Statement::getLong).orElse(-1l);
+
+ Set<String> skippedExecutions =
taskDescription.listProperties(skipExecution).mapWith(Statement::getString).toSet();
+ boolean skipExecution = skippedExecutions.contains(jenaVersion);
+
+ Set<String> skippedVersions =
taskDescription.listProperties(skipValidation).mapWith(Statement::getString).toSet();
+ boolean skipValidation = skippedVersions.contains(jenaVersion);
+
+ taskBuilder
+ .query(query)
+ .expectedResultSetSize(size)
+ .skipExecution(skipExecution)
+ .skipValidation(skipValidation);
+ return taskBuilder;
+ }
+}
diff --git
a/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/benchmark/QueryTaskResult.java
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/benchmark/QueryTaskResult.java
new file mode 100644
index 0000000000..8e00d0dc09
--- /dev/null
+++
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/benchmark/QueryTaskResult.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jena.sparql.engine.benchmark;
+
+public class QueryTaskResult {
+ protected String queryString;
+ protected String originalOpString;
+ protected String optimizedOpString;
+ protected long resultSetSize;
+
+ public QueryTaskResult(String queryString, String originalOpString, String
optimizedOpString, long resultSetSize) {
+ super();
+ this.queryString = queryString;
+ this.originalOpString = originalOpString;
+ this.optimizedOpString = optimizedOpString;
+ this.resultSetSize = resultSetSize;
+ }
+
+ public String getQueryString() {
+ return queryString;
+ }
+
+ public String getOriginalOpString() {
+ return originalOpString;
+ }
+
+ public String getOptimizedOpString() {
+ return optimizedOpString;
+ }
+
+ public long getResultSetSize() {
+ return resultSetSize;
+ }
+
+ @Override
+ public String toString() {
+ return String.join("\n",
+ "Query:", queryString,
+ "Original op:", originalOpString,
+ "Optimized op:", optimizedOpString,
+ "Result count:", Long.toString(resultSetSize));
+ }
+}
diff --git
a/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/benchmark/QueryTaskTestUtils.java
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/benchmark/QueryTaskTestUtils.java
new file mode 100644
index 0000000000..97dad1f601
--- /dev/null
+++
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/benchmark/QueryTaskTestUtils.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jena.sparql.engine.benchmark;
+
+import org.junit.Assert;
+
+public class QueryTaskTestUtils {
+ /**
+ * Util method to execute a query task and assess the result.
+ * Result is asserted using junit's {@link Assert}.
+ * This method is shared between unit testing and benchmarking.
+ */
+ public static void execAndAssert(QueryTask task) {
+ if (!task.skipExecution()) {
+ QueryTaskResult data = task.exec();
+
+ long expectedResultSetSize = task.getExpectedResultSetSize();
+ long actualResultSetSize = data.getResultSetSize();
+
+ if (!task.skipValidation()) {
+ if (expectedResultSetSize >= 0) {
+ Assert.assertEquals(expectedResultSetSize,
actualResultSetSize);
+ } else {
+ // If no expected result set size was configured then
write the actual one to console.
+ System.err.println("Expected result set size not set.
Actual size was: " + actualResultSetSize);
+ }
+ }
+
+ boolean debugOutput = false;
+ if (debugOutput) {
+ System.err.println(String.join("\n",
+ "Query exec result:",
+ // "Query:", data.getOptimizedOpString(),
+ "Result count:", Long.toString(data.getResultSetSize())));
+ }
+ }
+ }
+}
diff --git
a/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/join/BenchmarkHashJoin.java
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/join/BenchmarkHashJoin.java
new file mode 100644
index 0000000000..b0a776b76d
--- /dev/null
+++
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/join/BenchmarkHashJoin.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jena.sparql.engine.join;
+
+import java.time.LocalDateTime;
+import java.time.format.DateTimeFormatter;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.jena.sparql.engine.benchmark.QueryTask;
+import org.apache.jena.sparql.engine.benchmark.QueryTaskReader;
+import org.apache.jena.sparql.engine.benchmark.QueryTaskTestUtils;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.results.format.ResultFormatType;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.RunnerException;
+import org.openjdk.jmh.runner.options.ChainedOptionsBuilder;
+import org.openjdk.jmh.runner.options.Options;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+import org.openjdk.jmh.runner.options.TimeValue;
+
+/** Benchmark implementation for hash joins. The junit runner is {@link
TestBenchmarkHashJoin}. */
+@State(Scope.Benchmark)
+public class BenchmarkHashJoin {
+ @Param({
+ "current",
+ "4.8.0"
+ })
+ public String param0_jenaVersion;
+
+ @Param({
+ "join/join_2columns_simple_a_10.ttl",
+ "join/join_2columns_simple_a_15.ttl",
+ "join/join_2columns_simple_a_20.ttl",
+ "join/join_2columns_skewed_a_1.ttl",
+ "join/join_2columns_skewed_a_10.ttl",
+ "join/join_matrix_skewed_a_10.ttl",
+ "join/join_1column_simple_a_10k.ttl",
+ "join/join_1column_simple_a_100k.ttl"
+ })
+ public String param1_queryFile;
+
+ private QueryTask task;
+
+ @Benchmark
+ public void runTask() throws Exception {
+ QueryTaskTestUtils.execAndAssert(task);
+ }
+
+ @Setup(Level.Trial)
+ public void setupTrial() throws Exception {
+ task = QueryTaskReader.loadOne(param1_queryFile, param0_jenaVersion);
+ }
+
+ public static ChainedOptionsBuilder getDefaults(Class<?> c) {
+ return new OptionsBuilder()
+ // Specify which benchmarks to run.
+ // You can be more specific if you'd like to run only one
benchmark per test.
+ .include(c.getName())
+ // Set the following options as needed
+ .mode(Mode.AverageTime)
+ .timeUnit(TimeUnit.SECONDS)
+ .warmupTime(TimeValue.NONE)
+ .warmupIterations(5)
+ .measurementIterations(5)
+ .measurementTime(TimeValue.NONE)
+ .threads(1)
+ .forks(1)
+ .shouldFailOnError(true)
+ .shouldDoGC(true)
+ //.jvmArgs("-XX:+UnlockDiagnosticVMOptions",
"-XX:+PrintInlining")
+ .jvmArgs("-Xmx4G")
+ //.addProfiler(WinPerfAsmProfiler.class)
+ .resultFormat(ResultFormatType.JSON)
+ .result(c.getSimpleName() + "_" +
LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMddHHmmss")) +
".json");
+ }
+
+ public static void main(String[] args) throws RunnerException {
+ Options opt = getDefaults(BenchmarkHashJoin.class).build();
+ new Runner(opt).run();
+ }
+}
diff --git
a/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/join/QueryTask480.java
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/join/QueryTask480.java
new file mode 100644
index 0000000000..b427b73f58
--- /dev/null
+++
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/join/QueryTask480.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jena.sparql.engine.join;
+
+import java.util.function.BiConsumer;
+
+import org.apache.jena.sparql.engine.benchmark.QueryTask;
+import org.apache.jena.sparql.engine.benchmark.QueryTaskResult;
+import org.apache.shadedJena480.query.ARQ;
+import org.apache.shadedJena480.query.ResultSet;
+import org.apache.shadedJena480.query.ResultSetFormatter;
+import org.apache.shadedJena480.sparql.ARQConstants;
+import org.apache.shadedJena480.sparql.algebra.Op;
+import org.apache.shadedJena480.sparql.algebra.optimize.Optimize;
+import org.apache.shadedJena480.sparql.algebra.optimize.Rewrite;
+import org.apache.shadedJena480.sparql.algebra.optimize.RewriteFactory;
+import org.apache.shadedJena480.sparql.core.DatasetGraphFactory;
+import org.apache.shadedJena480.sparql.exec.QueryExec;
+import org.apache.shadedJena480.sparql.util.Context;
+import org.apache.shadedJena480.sys.JenaSystem;
+
+public class QueryTask480
+ extends QueryTask
+{
+ static { JenaSystem.init(); }
+
+ public QueryTask480(String queryString, long expectedResultSetSize,
boolean skipExecution, boolean skipValidation) {
+ super(queryString, expectedResultSetSize, skipExecution,
skipValidation);
+ }
+
+ @Override
+ public QueryTaskResult exec() {
+ Op[] ops = new Op[] { null, null };
+ Context cxt = setupContext((origOp, optimizedOp) -> {
+ ops[0] = origOp;
+ ops[1] = optimizedOp;
+ });
+
+ long resultSetSize;
+ try (QueryExec qe = QueryExec.newBuilder()
+ .dataset(DatasetGraphFactory.empty())
+ .query(queryString)
+ .context(cxt)
+ .build()) {
+ ResultSet rs = ResultSet.adapt(qe.select());
+
+ // System.out.println(ResultSetFormatter.asText(rs));
+ resultSetSize = ResultSetFormatter.consume(rs);
+ }
+ return new QueryTaskResult(queryString, ops[0].toString(),
ops[1].toString(), resultSetSize);
+ }
+
+ protected static Context setupContext(BiConsumer<Op, Op> opHandler) {
+ Context cxt = ARQ.getContext().copy();
+ RewriteFactory rewriteFactory = Optimize.getFactory();
+ RewriteFactory loggingRewriteFactory = c -> {
+ Rewrite rewrite = rewriteFactory.create(c);
+ return op -> {
+ Op optimizedOp = rewrite.rewrite(op);
+ opHandler.accept(op, optimizedOp);
+ return optimizedOp;
+ };
+ };
+ cxt.set(ARQConstants.sysOptimizerFactory, loggingRewriteFactory);
+ return cxt;
+ }
+}
diff --git
a/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/join/QueryTaskBuilder480.java
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/join/QueryTaskBuilder480.java
new file mode 100644
index 0000000000..58116348a1
--- /dev/null
+++
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/join/QueryTaskBuilder480.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jena.sparql.engine.join;
+
+import org.apache.jena.sparql.engine.benchmark.QueryTask;
+import org.apache.jena.sparql.engine.benchmark.QueryTaskBuilder;
+
+public class QueryTaskBuilder480
+ extends QueryTaskBuilder
+{
+ @Override
+ public QueryTask build() {
+ return new QueryTask480(queryString, expectedResultSetSize,
skipExecution, skipValidation);
+ }
+}
diff --git
a/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/join/QueryTaskBuilderCurrent.java
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/join/QueryTaskBuilderCurrent.java
new file mode 100644
index 0000000000..e30fef9928
--- /dev/null
+++
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/join/QueryTaskBuilderCurrent.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jena.sparql.engine.join;
+
+import org.apache.jena.sparql.engine.benchmark.QueryTask;
+import org.apache.jena.sparql.engine.benchmark.QueryTaskBuilder;
+
+public class QueryTaskBuilderCurrent
+ extends QueryTaskBuilder
+{
+ @Override
+ public QueryTask build() {
+ return new QueryTaskCurrent(queryString, expectedResultSetSize,
skipExecution, skipValidation);
+ }
+}
diff --git
a/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/join/QueryTaskCurrent.java
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/join/QueryTaskCurrent.java
new file mode 100644
index 0000000000..2b05d2e873
--- /dev/null
+++
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/join/QueryTaskCurrent.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jena.sparql.engine.join;
+
+import java.util.function.BiConsumer;
+
+import org.apache.jena.query.ARQ;
+import org.apache.jena.query.ResultSet;
+import org.apache.jena.query.ResultSetFormatter;
+import org.apache.jena.sparql.ARQConstants;
+import org.apache.jena.sparql.algebra.Op;
+import org.apache.jena.sparql.algebra.optimize.Optimize;
+import org.apache.jena.sparql.algebra.optimize.Rewrite;
+import org.apache.jena.sparql.algebra.optimize.RewriteFactory;
+import org.apache.jena.sparql.core.DatasetGraphFactory;
+import org.apache.jena.sparql.engine.benchmark.QueryTask;
+import org.apache.jena.sparql.engine.benchmark.QueryTaskResult;
+import org.apache.jena.sparql.exec.QueryExec;
+import org.apache.jena.sparql.util.Context;
+
+public class QueryTaskCurrent
+ extends QueryTask
+{
+ public QueryTaskCurrent(String queryString, long expectedResultSetSize,
boolean skipExecution, boolean skipValidation) {
+ super(queryString, expectedResultSetSize, skipExecution,
skipValidation);
+ }
+
+ @Override
+ public QueryTaskResult exec() {
+ Op[] ops = new Op[] { null, null };
+ Context cxt = setupContext((origOp, optimizedOp) -> {
+ ops[0] = origOp;
+ ops[1] = optimizedOp;
+ });
+
+ long resultCount;
+ try (QueryExec qe = QueryExec.newBuilder()
+ .dataset(DatasetGraphFactory.empty())
+ .query(queryString)
+ .context(cxt)
+ .build()) {
+ ResultSet rs = ResultSet.adapt(qe.select());
+ // System.out.println(ResultSetFormatter.asText(rs));
+ resultCount = ResultSetFormatter.consume(rs);
+ }
+ return new QueryTaskResult(queryString, ops[0].toString(),
ops[1].toString(), resultCount);
+ }
+
+ protected static Context setupContext(BiConsumer<Op, Op> opHandler) {
+ Context cxt = ARQ.getContext().copy();
+ RewriteFactory rewriteFactory = Optimize.getFactory();
+ RewriteFactory loggingRewriteFactory = c -> {
+ Rewrite rewrite = rewriteFactory.create(c);
+ return op -> {
+ Op optimizedOp = rewrite.rewrite(op);
+ opHandler.accept(op, optimizedOp);
+ return optimizedOp;
+ };
+ };
+ cxt.set(ARQConstants.sysOptimizerFactory, loggingRewriteFactory);
+ return cxt;
+ }
+}
diff --git
a/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/join/TestBenchmarkHashJoin.java
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/join/TestBenchmarkHashJoin.java
new file mode 100644
index 0000000000..8c02b70132
--- /dev/null
+++
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/join/TestBenchmarkHashJoin.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jena.sparql.engine.join;
+
+import java.util.Collection;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.openjdk.jmh.results.RunResult;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.options.Options;
+
+public class TestBenchmarkHashJoin {
+ @Test
+ public void benchmark() throws Exception {
+ Options opt =
BenchmarkHashJoin.getDefaults(BenchmarkHashJoin.class).build();
+ // JMHDefaultOptions.getDefaults(BenchmarkHashJoin.class).build();
+ Collection<RunResult> runResults = new Runner(opt).run();
+ Assert.assertNotNull(runResults);
+ }
+}
diff --git
a/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/join/TestHashJoin.java
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/join/TestHashJoin.java
new file mode 100644
index 0000000000..c4ae2bae5b
--- /dev/null
+++
b/jena-benchmarks/jena-benchmarks-jmh/src/test/java/org/apache/jena/sparql/engine/join/TestHashJoin.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jena.sparql.engine.join;
+
+import org.apache.jena.sparql.engine.benchmark.QueryTask;
+import org.apache.jena.sparql.engine.benchmark.QueryTaskReader;
+import org.apache.jena.sparql.engine.benchmark.QueryTaskTestUtils;
+import org.junit.Test;
+
+public class TestHashJoin {
+ @Test
+ public void test_join_1column_simple_a_10k() {
+ QueryTask task =
QueryTaskReader.loadOne("join/join_1column_simple_a_10k.ttl", "current");
+ QueryTaskTestUtils.execAndAssert(task);
+ }
+
+ @Test
+ public void test_join_matrix_skewed_a_3() {
+ QueryTask task =
QueryTaskReader.loadOne("join/join_matrix_skewed_a_3.ttl", "current");
+ QueryTaskTestUtils.execAndAssert(task);
+ }
+
+ @Test
+ public void test_join_matrix_skewed_a_10() {
+ QueryTask task =
QueryTaskReader.loadOne("join/join_matrix_skewed_a_10.ttl", "current");
+ QueryTaskTestUtils.execAndAssert(task);
+ }
+}
diff --git
a/jena-benchmarks/jena-benchmarks-jmh/src/test/resources/join/join_1column_simple_a_100k.ttl
b/jena-benchmarks/jena-benchmarks-jmh/src/test/resources/join/join_1column_simple_a_100k.ttl
new file mode 100644
index 0000000000..66e0cb1416
--- /dev/null
+++
b/jena-benchmarks/jena-benchmarks-jmh/src/test/resources/join/join_1column_simple_a_100k.ttl
@@ -0,0 +1,28 @@
+PREFIX : <http://www.example.org/>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+
+:join_1columns_simple_a_100k
+ rdfs:label "join_1columns_simple_a_100k" ;
+ :expectedResultSetSize 100000 ;
+ :queryString
+"""
+SELECT * {
+ { SELECT ?X {
+ VALUES ?x1 { 0 1 2 3 4 5 6 7 8 9 }
+ VALUES ?x2 { 0 1 2 3 4 5 6 7 8 9 }
+ VALUES ?x3 { 0 1 2 3 4 5 6 7 8 9 }
+ VALUES ?x4 { 0 1 2 3 4 5 6 7 8 9 }
+ VALUES ?x5 { 0 1 2 3 4 5 6 7 8 9 }
+ BIND(?x1 + 10 * ?x2 + 100 * ?x3 + 1000 * ?x4 + 10000 * ?x5 AS ?X)
+ } }
+ { SELECT ?X {
+ VALUES ?x1 { 0 1 2 3 4 5 6 7 8 9 }
+ VALUES ?x2 { 0 1 2 3 4 5 6 7 8 9 }
+ VALUES ?x3 { 0 1 2 3 4 5 6 7 8 9 }
+ VALUES ?x4 { 0 1 2 3 4 5 6 7 8 9 }
+ VALUES ?x5 { 0 1 2 3 4 5 6 7 8 9 }
+ BIND(?x1 + 10 * ?x2 + 100 * ?x3 + 1000 * ?x4 + 10000 * ?x5 AS ?X)
+ } }
+}
+""" ;
+ .
diff --git
a/jena-benchmarks/jena-benchmarks-jmh/src/test/resources/join/join_1column_simple_a_10k.ttl
b/jena-benchmarks/jena-benchmarks-jmh/src/test/resources/join/join_1column_simple_a_10k.ttl
new file mode 100644
index 0000000000..1af44fa657
--- /dev/null
+++
b/jena-benchmarks/jena-benchmarks-jmh/src/test/resources/join/join_1column_simple_a_10k.ttl
@@ -0,0 +1,26 @@
+PREFIX : <http://www.example.org/>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+
+:join_1columns_simple_a_10k
+ rdfs:label "join_1columns_simple_a_10k" ;
+ :expectedResultSetSize 10000 ;
+ :queryString
+"""
+SELECT * {
+ { SELECT ?X {
+ VALUES ?x1 { 0 1 2 3 4 5 6 7 8 9 }
+ VALUES ?x2 { 0 1 2 3 4 5 6 7 8 9 }
+ VALUES ?x3 { 0 1 2 3 4 5 6 7 8 9 }
+ VALUES ?x4 { 0 1 2 3 4 5 6 7 8 9 }
+ BIND(?x1 + 10 * ?x2 + 100 * ?x3 + 1000 * ?x4 AS ?X)
+ } }
+ { SELECT ?X {
+ VALUES ?x1 { 0 1 2 3 4 5 6 7 8 9 }
+ VALUES ?x2 { 0 1 2 3 4 5 6 7 8 9 }
+ VALUES ?x3 { 0 1 2 3 4 5 6 7 8 9 }
+ VALUES ?x4 { 0 1 2 3 4 5 6 7 8 9 }
+ BIND(?x1 + 10 * ?x2 + 100 * ?x3 + 1000 * ?x4 AS ?X)
+ } }
+}
+""" ;
+ .
diff --git
a/jena-benchmarks/jena-benchmarks-jmh/src/test/resources/join/join_2columns_simple_a_10.ttl
b/jena-benchmarks/jena-benchmarks-jmh/src/test/resources/join/join_2columns_simple_a_10.ttl
new file mode 100644
index 0000000000..ac5361ebd0
--- /dev/null
+++
b/jena-benchmarks/jena-benchmarks-jmh/src/test/resources/join/join_2columns_simple_a_10.ttl
@@ -0,0 +1,28 @@
+PREFIX : <http://www.example.org/>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+
+:join_2columns_simple_a_10
+ rdfs:label "join_2columns_simple_a_10" ;
+ :expectedResultSetSize 10000 ;
+ :queryString
+"""
+SELECT * {
+ { SELECT ?X ?Y {
+ VALUES ?X_i { 0 1 2 3 4 5 6 7 8 9 }
+ VALUES ?X_j { 0 1 2 3 4 5 6 7 8 9 }
+ BIND((?X_i + (10 * ?X_j)) AS ?X)
+ VALUES ?Y_i { 0 1 2 3 4 5 6 7 8 9 }
+ VALUES ?Y_j { 0 1 2 3 4 5 6 7 8 9 }
+ BIND((?Y_i + (10 * ?Y_j)) AS ?Y)
+ } }
+ { SELECT ?X ?Y {
+ VALUES ?X_i { 0 1 2 3 4 5 6 7 8 9 }
+ VALUES ?X_j { 0 1 2 3 4 5 6 7 8 9 }
+ BIND((?X_i + (10 * ?X_j)) AS ?X)
+ VALUES ?Y_i { 0 1 2 3 4 5 6 7 8 9 }
+ VALUES ?Y_j { 0 1 2 3 4 5 6 7 8 9 }
+ BIND((?Y_i + (10 * ?Y_j)) AS ?Y)
+ } }
+}
+""" ;
+ .
diff --git
a/jena-benchmarks/jena-benchmarks-jmh/src/test/resources/join/join_2columns_simple_a_15.ttl
b/jena-benchmarks/jena-benchmarks-jmh/src/test/resources/join/join_2columns_simple_a_15.ttl
new file mode 100644
index 0000000000..ccd9a339f6
--- /dev/null
+++
b/jena-benchmarks/jena-benchmarks-jmh/src/test/resources/join/join_2columns_simple_a_15.ttl
@@ -0,0 +1,28 @@
+PREFIX : <http://www.example.org/>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+
+:join_2columns_simple_a_15
+ rdfs:label "join_2columns_simple_a_15" ;
+ :expectedResultSetSize 50625 ;
+ :queryString
+"""
+SELECT * {
+ { SELECT ?X ?Y {
+ VALUES ?X_i { 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 }
+ VALUES ?X_j { 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 }
+ BIND((?X_i + (15 * ?X_j)) AS ?X)
+ VALUES ?Y_i { 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 }
+ VALUES ?Y_j { 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 }
+ BIND((?Y_i + (15 * ?Y_j)) AS ?Y)
+ } }
+ { SELECT ?X ?Y {
+ VALUES ?X_i { 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 }
+ VALUES ?X_j { 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 }
+ BIND((?X_i + (15 * ?X_j)) AS ?X)
+ VALUES ?Y_i { 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 }
+ VALUES ?Y_j { 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 }
+ BIND((?Y_i + (15 * ?Y_j)) AS ?Y)
+ } }
+}
+""" ;
+.
diff --git
a/jena-benchmarks/jena-benchmarks-jmh/src/test/resources/join/join_2columns_simple_a_20.ttl
b/jena-benchmarks/jena-benchmarks-jmh/src/test/resources/join/join_2columns_simple_a_20.ttl
new file mode 100644
index 0000000000..7e28687bde
--- /dev/null
+++
b/jena-benchmarks/jena-benchmarks-jmh/src/test/resources/join/join_2columns_simple_a_20.ttl
@@ -0,0 +1,28 @@
+PREFIX : <http://www.example.org/>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+
+:join_2columns_simple_a_20
+ rdfs:label "join_2columns_simple_a_20" ;
+ :expectedResultSetSize 160000 ;
+ :queryString
+"""
+SELECT * {
+ { SELECT ?X ?Y {
+ VALUES ?X_i { 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 }
+ VALUES ?X_j { 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 }
+ BIND((?X_i + (20 * ?X_j)) AS ?X)
+ VALUES ?Y_i { 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 }
+ VALUES ?Y_j { 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 }
+ BIND((?Y_i + (20 * ?Y_j)) AS ?Y)
+ } }
+ { SELECT ?X ?Y {
+ VALUES ?X_i { 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 }
+ VALUES ?X_j { 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 }
+ BIND((?X_i + (20 * ?X_j)) AS ?X)
+ VALUES ?Y_i { 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 }
+ VALUES ?Y_j { 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 }
+ BIND((?Y_i + (20 * ?Y_j)) AS ?Y)
+ } }
+}
+""" ;
+ .
diff --git
a/jena-benchmarks/jena-benchmarks-jmh/src/test/resources/join/join_2columns_skewed_a_1.ttl
b/jena-benchmarks/jena-benchmarks-jmh/src/test/resources/join/join_2columns_skewed_a_1.ttl
new file mode 100644
index 0000000000..057fbcc642
--- /dev/null
+++
b/jena-benchmarks/jena-benchmarks-jmh/src/test/resources/join/join_2columns_skewed_a_1.ttl
@@ -0,0 +1,42 @@
+PREFIX : <http://www.example.org/>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+
+:join_2columns_skewed_a_1
+ rdfs:label "join_2columns_skewed_a_1" ;
+ :expectedResultSetSize 1336 ;
+ :skipValidation "4.8.0" ; # On jena 4.8.0 the actual result set size is
incorrectly 2736 ;
+ :queryString
+"""
+# SELECT (count(*) AS ?C)
+SELECT *
+WHERE
+ { { SELECT ?X ?Y
+ WHERE
+ { VALUES ?X_i { UNDEF 0 1 }
+ VALUES ?X_j { UNDEF 0 1 }
+ BIND(( ?X_i + ( 2 * ?X_j ) ) AS ?X)
+ VALUES ?Y_i { UNDEF 0 1 }
+ VALUES ?Y_j { UNDEF 0 1 }
+ BIND(( ?Y_i + ( 2 * ?Y_j ) ) AS ?Y)
+ FILTER(bound(?X) || bound(?Y))
+ }
+ }
+ { SELECT ?X ?Y
+ WHERE
+ { { SELECT ?X ?Y # ("x" AS ?RAND)
+ WHERE
+ { VALUES ?X_i { UNDEF 0 1 }
+ VALUES ?X_j { UNDEF 0 1 }
+ BIND(( ?X_i + ( 2 * ?X_j ) ) AS ?X)
+ VALUES ?Y_i { UNDEF 0 1 }
+ VALUES ?Y_j { UNDEF 0 1 }
+ BIND(( ?Y_i + ( 2 * ?Y_j ) ) AS ?Y)
+ FILTER(bound(?X) || bound(?Y))
+ }
+ }
+ # FILTER ( ?RAND < 0.95 )
+ }
+ }
+ }
+""" ;
+ .
diff --git
a/jena-benchmarks/jena-benchmarks-jmh/src/test/resources/join/join_2columns_skewed_a_10.ttl
b/jena-benchmarks/jena-benchmarks-jmh/src/test/resources/join/join_2columns_skewed_a_10.ttl
new file mode 100644
index 0000000000..fecef5ed0f
--- /dev/null
+++
b/jena-benchmarks/jena-benchmarks-jmh/src/test/resources/join/join_2columns_skewed_a_10.ttl
@@ -0,0 +1,33 @@
+PREFIX : <http://www.example.org/>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+
+:join_2columns_skewed_a_10
+ rdfs:label "join_2columns_skewed_a_10" ;
+ :expectedResultSetSize 9758200 ;
+ :skipExecution "4.8.0" ; # On jena 4.8.0 this task takes very long (more
than a minute); on "current" it should be only a few seconds
+ :skipValidation "4.8.0" ; # On jena 4.8.0 the actual result set size is
incorrectly 16020400 ;
+ :queryString
+"""
+# SELECT (count(*) AS ?C) {
+SELECT * {
+ { SELECT ?X ?Y {
+ VALUES ?X_i { UNDEF 0 1 2 3 4 5 6 7 8 9 }
+ VALUES ?X_j { UNDEF 0 1 2 3 4 5 6 7 8 9 }
+ BIND(( ?X_i + ( 10 * ?X_j ) ) AS ?X)
+ VALUES ?Y_i { UNDEF 0 1 2 3 4 5 6 7 8 9 }
+ VALUES ?Y_j { UNDEF 0 1 2 3 4 5 6 7 8 9 }
+ BIND(( ?Y_i + ( 10 * ?Y_j ) ) AS ?Y)
+ FILTER(bound(?X) || bound(?Y))
+ } }
+ { SELECT ?X ?Y {
+ VALUES ?X_i { UNDEF 0 1 2 3 4 5 6 7 8 9 }
+ VALUES ?X_j { UNDEF 0 1 2 3 4 5 6 7 8 9 }
+ BIND(( ?X_i + ( 10 * ?X_j ) ) AS ?X)
+ VALUES ?Y_i { UNDEF 0 1 2 3 4 5 6 7 8 9 }
+ VALUES ?Y_j { UNDEF 0 1 2 3 4 5 6 7 8 9 }
+ BIND(( ?Y_i + ( 10 * ?Y_j ) ) AS ?Y)
+ FILTER(bound(?X) || bound(?Y))
+ } }
+}
+""" ;
+ .
diff --git
a/jena-benchmarks/jena-benchmarks-jmh/src/test/resources/join/join_matrix_skewed_a_10.ttl
b/jena-benchmarks/jena-benchmarks-jmh/src/test/resources/join/join_matrix_skewed_a_10.ttl
new file mode 100644
index 0000000000..712e96f0ef
--- /dev/null
+++
b/jena-benchmarks/jena-benchmarks-jmh/src/test/resources/join/join_matrix_skewed_a_10.ttl
@@ -0,0 +1,41 @@
+PREFIX : <http://www.example.org/>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+
+:join_matrix_sweked_a_3
+ rdfs:label "join_matrix_sweked_a_3" ;
+ :expectedResultSetSize 100 ;
+ :queryString
+"""
+# Join of two tables where each column has an UNDEF entry
+# This test is mainly for correctness testing
+# Each row is compatible with all others; no selectivity -> no significant
performance differences expected
+SELECT *
+{
+ VALUES (?v0 ?v1 ?v2 ?v3 ?v4 ?v5 ?v6 ?v7 ?v8 ?v9) {
+ (UNDEF 1 2 3 4 5 6 7 8 9)
+ (0 UNDEF 2 3 4 5 6 7 8 9)
+ (0 1 UNDEF 3 4 5 6 7 8 9)
+ (0 1 2 UNDEF 4 5 6 7 8 9)
+ (0 1 2 3 UNDEF 5 6 7 8 9)
+ (0 1 2 3 4 UNDEF 6 7 8 9)
+ (0 1 2 3 4 5 UNDEF 7 8 9)
+ (0 1 2 3 4 5 6 UNDEF 8 9)
+ (0 1 2 3 4 5 6 7 UNDEF 9)
+ (0 1 2 3 4 5 6 7 8 UNDEF)
+ }
+ VALUES (?v0 ?v1 ?v2 ?v3 ?v4 ?v5 ?v6 ?v7 ?v8 ?v9) {
+ (UNDEF 1 2 3 4 5 6 7 8 9)
+ (0 UNDEF 2 3 4 5 6 7 8 9)
+ (0 1 UNDEF 3 4 5 6 7 8 9)
+ (0 1 2 UNDEF 4 5 6 7 8 9)
+ (0 1 2 3 UNDEF 5 6 7 8 9)
+ (0 1 2 3 4 UNDEF 6 7 8 9)
+ (0 1 2 3 4 5 UNDEF 7 8 9)
+ (0 1 2 3 4 5 6 UNDEF 8 9)
+ (0 1 2 3 4 5 6 7 UNDEF 9)
+ (0 1 2 3 4 5 6 7 8 UNDEF)
+ }
+}
+""" ;
+ .
+
diff --git
a/jena-benchmarks/jena-benchmarks-jmh/src/test/resources/join/join_matrix_skewed_a_3.ttl
b/jena-benchmarks/jena-benchmarks-jmh/src/test/resources/join/join_matrix_skewed_a_3.ttl
new file mode 100644
index 0000000000..2998a00ff1
--- /dev/null
+++
b/jena-benchmarks/jena-benchmarks-jmh/src/test/resources/join/join_matrix_skewed_a_3.ttl
@@ -0,0 +1,26 @@
+PREFIX : <http://www.example.org/>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+
+:join_matrix_sweked_a_10
+ rdfs:label "join_matrix_sweked_a_10" ;
+ :expectedResultSetSize 9 ;
+ :queryString
+"""
+# Join of two tables where each column has an UNDEF entry
+# This test is mainly for correctness testing
+# Each row is compatible with all others; no selectivity -> no significant
performance differences expected
+SELECT *
+{
+ VALUES (?v0 ?v1 ?v2) {
+ (UNDEF 1 2)
+ (0 UNDEF 2)
+ (0 1 UNDEF)
+ }
+ VALUES (?v0 ?v1 ?v2) {
+ (UNDEF 1 2)
+ (0 UNDEF 2)
+ (0 1 UNDEF)
+ }
+}
+""" ;
+ .