DRILL-5879: Improved SQL Pattern Contains Performance close apache/drill#1072
Project: http://git-wip-us.apache.org/repos/asf/drill/repo Commit: http://git-wip-us.apache.org/repos/asf/drill/commit/2420b35d Tree: http://git-wip-us.apache.org/repos/asf/drill/tree/2420b35d Diff: http://git-wip-us.apache.org/repos/asf/drill/diff/2420b35d Branch: refs/heads/master Commit: 2420b35d716a35a10be91ea82ec17bdeb392a77c Parents: 48623ea Author: Salim Achouche <sachouc...@gmail.com> Authored: Wed Dec 13 14:24:45 2017 -0800 Committer: Aman Sinha <asi...@maprtech.com> Committed: Tue Jan 23 17:37:17 2018 -0800 ---------------------------------------------------------------------- .../expr/fn/impl/SqlPatternContainsMatcher.java | 274 ++++++++++++++++++- .../exec/expr/fn/impl/TestSqlPatterns.java | 112 +++++++- 2 files changed, 363 insertions(+), 23 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/drill/blob/2420b35d/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/SqlPatternContainsMatcher.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/SqlPatternContainsMatcher.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/SqlPatternContainsMatcher.java index 04f5dac..ec95349 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/SqlPatternContainsMatcher.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/SqlPatternContainsMatcher.java @@ -19,44 +19,292 @@ package org.apache.drill.exec.expr.fn.impl; import io.netty.buffer.DrillBuf; -public class SqlPatternContainsMatcher extends AbstractSqlPatternMatcher { +/** SQL Pattern Contains implementation */ +public final class SqlPatternContainsMatcher extends AbstractSqlPatternMatcher { + private final MatcherFcn matcherFcn; public SqlPatternContainsMatcher(String patternString) { super(patternString); + + // Pattern matching is 1) a CPU intensive operation and 2) pattern and input dependent. The conclusion is + // that there is no single implementation that can do it all well. So, we use multiple implementations + // chosen based on the pattern length. + if (patternLength == 0) { + matcherFcn = new MatcherZero(); + } else if (patternLength == 1) { + matcherFcn = new MatcherOne(); + } else if (patternLength == 2) { + matcherFcn = new MatcherTwo(); + } else if (patternLength == 3) { + matcherFcn = new MatcherThree(); + } else if (patternLength < 10) { + matcherFcn = new MatcherN(); + } else { + matcherFcn = new BoyerMooreMatcher(); + } } @Override public int match(int start, int end, DrillBuf drillBuf) { + return matcherFcn.match(start, end, drillBuf); + } + + //-------------------------------------------------------------------------- + // Inner Data Structure + // -------------------------------------------------------------------------- + + /** Abstract matcher class to allow us pick the most efficient implementation */ + private abstract class MatcherFcn { + protected final byte[] patternArray; + + protected MatcherFcn() { + assert patternByteBuffer.hasArray(); + + patternArray = patternByteBuffer.array(); + } + + /** + * @return 1 if the pattern was matched; 0 otherwise + */ + protected abstract int match(int start, int end, DrillBuf drillBuf); + } + + /** Handles patterns with length zero */ + private final class MatcherZero extends MatcherFcn { - if (patternLength == 0) { // Everything should match for null pattern string + private MatcherZero() { + } + + /** {@inheritDoc} */ + @Override + protected final int match(int start, int end, DrillBuf drillBuf) { return 1; } + } + + /** Handles patterns with length one */ + private final class MatcherOne extends MatcherFcn { + final byte firstPatternByte; + + private MatcherOne() { + firstPatternByte = patternArray[0]; + } + + /** {@inheritDoc} */ + @Override + protected final int match(int start, int end, DrillBuf drillBuf) { + final int lengthToProcess = end - start; + + // simplePattern string has meta characters i.e % and _ and escape characters removed. + // so, we can just directly compare. + for (int idx = 0; idx < lengthToProcess; idx++) { + byte inputByte = drillBuf.getByte(start + idx); + + if (firstPatternByte != inputByte) { + continue; + } + return 1; + } + return 0; + } + } + + /** Handles patterns with length two */ + private final class MatcherTwo extends MatcherFcn { + final byte firstPatternByte; + final byte secondPatternByte; + + private MatcherTwo() { + firstPatternByte = patternArray[0]; + secondPatternByte = patternArray[1]; + } + + /** {@inheritDoc} */ + @Override + protected final int match(int start, int end, DrillBuf drillBuf) { + final int lengthToProcess = end - start - 1; + + // simplePattern string has meta characters i.e % and _ and escape characters removed. + // so, we can just directly compare. + for (int idx = 0; idx < lengthToProcess; idx++) { + final byte firstInByte = drillBuf.getByte(start + idx); - final int txtLength = end - start; + if (firstPatternByte != firstInByte) { + continue; + } else { + final byte secondInByte = drillBuf.getByte(start + idx + 1); - // no match if input string length is less than pattern length - if (txtLength < patternLength) { + if (secondInByte == secondPatternByte) { + return 1; + } + } + } return 0; } + } + /** Handles patterns with length three */ + private final class MatcherThree extends MatcherFcn { + final byte firstPatternByte; + final byte secondPatternByte; + final byte thirdPatternByte; - final int outerEnd = txtLength - patternLength; + private MatcherThree() { + firstPatternByte = patternArray[0]; + secondPatternByte = patternArray[1]; + thirdPatternByte = patternArray[2]; + } - outer: - for (int txtIndex = 0; txtIndex <= outerEnd; txtIndex++) { + /** {@inheritDoc} */ + @Override + protected final int match(int start, int end, DrillBuf drillBuf) { + final int lengthToProcess = end - start - 2; // simplePattern string has meta characters i.e % and _ and escape characters removed. // so, we can just directly compare. - for (int patternIndex = 0; patternIndex < patternLength; patternIndex++) { - if (patternByteBuffer.get(patternIndex) != drillBuf.getByte(start + txtIndex + patternIndex)) { - continue outer; + for (int idx = 0; idx < lengthToProcess; idx++) { + final byte inputByte = drillBuf.getByte(start + idx); + + if (firstPatternByte != inputByte) { + continue; + } else { + final byte secondInByte = drillBuf.getByte(start + idx + 1); + final byte thirdInByte = drillBuf.getByte(start + idx + 2); + + if (secondInByte == secondPatternByte && thirdInByte == thirdPatternByte) { + return 1; + } } } + return 0; + } + } - return 1; + /** Handles patterns with arbitrary length */ + private final class MatcherN extends MatcherFcn { + final byte firstPatternByte; + + private MatcherN() { + firstPatternByte = patternArray[0]; + } + + /** {@inheritDoc} */ + @Override + protected final int match(int start, int end, DrillBuf drillBuf) { + final int lengthToProcess = end - start - patternLength + 1; + int patternIndex = 0; + + // simplePattern string has meta characters i.e % and _ and escape characters removed. + // so, we can just directly compare. + for (int idx = 0; idx < lengthToProcess; idx++) { + final byte inputByte = drillBuf.getByte(start + idx); + + if (firstPatternByte == inputByte) { + for (patternIndex = 1; patternIndex < patternLength; ++patternIndex) { + final byte currInByte = drillBuf.getByte(start + idx + patternIndex); + final byte currPattByte = patternArray[patternIndex]; + + if (currInByte != currPattByte) { + break; + } + } + + if (patternIndex == patternLength) { + return 1; + } + } + } + return 0; + } + } + + /** + * Boyer-Moore matcher algorithm; excellent for large patterns and for prefix patterns which appear + * frequently in the input. + */ + private final class BoyerMooreMatcher extends MatcherFcn { + private final int[] offsetTable; + private final int[] characterTable; + + private BoyerMooreMatcher() { + super(); + + this.offsetTable = makeOffsetTable(); + this.characterTable = makeCharTable(); + } + + /** {@inheritDoc} */ + @Override + protected int match(int start, int end, DrillBuf drillBuf) { + final int inputLength = end - start; + + for (int idx1 = patternLength - 1, idx2; idx1 < inputLength;) { + for (idx2 = patternLength - 1; patternArray[idx2] == drillBuf.getByte(start + idx1); --idx1, --idx2) { + if (idx2 == 0) { + return 1; + } + } + // idx1 += pattern.length - idx2; // For naive method + idx1 += Math.max(offsetTable[patternLength - 1 - idx2], characterTable[drillBuf.getByte(start + idx1) & 0xFF]); + } + return 0; + } + + /** Build the jump table based on the mismatched character information **/ + private int[] makeCharTable() { + final int TABLE_SIZE = 256; // This implementation is based on byte comparison + int[] resultTable = new int[TABLE_SIZE]; + + for (int idx = 0; idx < resultTable.length; ++idx) { + resultTable[idx] = patternLength; + } + + for (int idx = 0; idx < patternLength - 1; ++idx) { + final int patternValue = ((int) patternArray[idx]) & 0xFF; + resultTable[patternValue] = patternLength - 1 - idx; + } + + return resultTable; + } + + /** Builds the scan offset based on which mismatch occurs. **/ + private int[] makeOffsetTable() { + int[] resultTable = new int[patternLength]; + int lastPrefixPosition = patternLength; + + for (int idx = patternLength - 1; idx >= 0; --idx) { + if (isPrefix(idx + 1)) { + lastPrefixPosition = idx + 1; + } + resultTable[patternLength - 1 - idx] = lastPrefixPosition - idx + patternLength - 1; + } + + for (int idx = 0; idx < patternLength - 1; ++idx) { + int suffixLen = suffixLength(idx); + resultTable[suffixLen] = patternLength - 1 - idx + suffixLen; + } + + return resultTable; } - return 0; + /** Checks whether needle[pos:end] is a prefix of pattern **/ + private boolean isPrefix(int pos) { + for (int idx1 = pos, idx2 = 0; idx1 < patternLength; ++idx1, ++idx2) { + if (patternArray[idx1] != patternArray[idx2]) { + return false; + } + } + return true; + } + + /** Computes the maximum length of the substring ends at "pos" and is a suffix **/ + private int suffixLength(int pos) { + int result = 0; + for (int idx1 = pos, idx2 = patternLength - 1; idx1 >= 0 && patternArray[idx1] == patternArray[idx2]; --idx1, --idx2) { + result += 1; + } + return result; + } } } http://git-wip-us.apache.org/repos/asf/drill/blob/2420b35d/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestSqlPatterns.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestSqlPatterns.java b/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestSqlPatterns.java index 2eecb54..7d85719 100644 --- a/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestSqlPatterns.java +++ b/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestSqlPatterns.java @@ -17,23 +17,25 @@ */ package org.apache.drill.exec.expr.fn.impl; -import io.netty.buffer.DrillBuf; -import org.apache.drill.common.exceptions.DrillRuntimeException; -import org.apache.drill.exec.memory.BufferAllocator; -import org.apache.drill.exec.memory.RootAllocatorFactory; -import org.junit.After; -import org.junit.Before; -import org.junit.Ignore; -import org.junit.Test; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.CharacterCodingException; import java.nio.charset.Charset; import java.nio.charset.CharsetEncoder; +import java.util.ArrayList; +import java.util.List; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import org.apache.drill.common.exceptions.DrillRuntimeException; +import org.apache.drill.exec.memory.BufferAllocator; +import org.apache.drill.exec.memory.RootAllocatorFactory; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import io.netty.buffer.DrillBuf; public class TestSqlPatterns { BufferAllocator allocator; @@ -446,10 +448,100 @@ public class TestSqlPatterns { assertEquals(1, sqlPatternComplex.match(0, byteBuffer.limit(), drillBuf)); // should match } + @Test + public void testSqlPatternContainsMUltipleMatchers() { + + final String longASCIIString = "Drill supports a variety of NoSQL databases and file systems, including HBase, MongoDB, MapR-DB, HDFS, MapR-FS, Amazon S3, Azure Blob Storage, Google Cloud Storage, Swift, " + + "NAS and local files. A single query can join data from multiple datastores. For example, you can join a user profile collection in MongoDB with a directory of event logs in Hadoop."; + final String emptyString = ""; + final String unicodeString = "¤EÃsÃW°ê»Ã®i¶T¤¤¤Ã3¼Ã®i¶TÃU2~~"; + + final List<SQLPatternTestParams> tests = new ArrayList<SQLPatternTestParams>(); + + // Tests for Matcher ZERO + tests.add(new SQLPatternTestParams(longASCIIString, "", true)); + tests.add(new SQLPatternTestParams(emptyString, "", true)); + tests.add(new SQLPatternTestParams(unicodeString, "", true)); + + // Tests for Matcher ONE + tests.add(new SQLPatternTestParams(longASCIIString, "N", true)); + tests.add(new SQLPatternTestParams(longASCIIString, "&", false)); + tests.add(new SQLPatternTestParams(emptyString, "N", false)); + + // Tests for Matcher TWO + tests.add(new SQLPatternTestParams(longASCIIString, "SQ", true)); + tests.add(new SQLPatternTestParams(longASCIIString, "eT", false)); + tests.add(new SQLPatternTestParams("A", "SQ", false)); + tests.add(new SQLPatternTestParams(emptyString, "SQ", false)); + tests.add(new SQLPatternTestParams(unicodeString, "¶", true)); + tests.add(new SQLPatternTestParams(unicodeString, "AT", false)); + + // Tests for Matcher THREE + tests.add(new SQLPatternTestParams(longASCIIString, "SQL", true)); + tests.add(new SQLPatternTestParams(longASCIIString, "cas", false)); + tests.add(new SQLPatternTestParams("S", "SQL", false)); + tests.add(new SQLPatternTestParams(emptyString, "SQL", false)); + tests.add(new SQLPatternTestParams(unicodeString, "¶T", true)); + tests.add(new SQLPatternTestParams(unicodeString, "¶A", false)); + + // Tests for Matcher for patterns of length: 3 < length < 10 + tests.add(new SQLPatternTestParams(longASCIIString, "MongoDB", true)); + tests.add(new SQLPatternTestParams(longASCIIString, "MongoDz", false)); + tests.add(new SQLPatternTestParams("Mon", "MongoDB", false)); + tests.add(new SQLPatternTestParams(emptyString, "MongoDB", false)); + tests.add(new SQLPatternTestParams(unicodeString, "®i¶", true)); + tests.add(new SQLPatternTestParams(unicodeString, "®x¶", false)); + + // Tests for Matcher for patterns of length >= 10 + tests.add(new SQLPatternTestParams(longASCIIString, "multiple datastores", true)); + tests.add(new SQLPatternTestParams(longASCIIString, "multiple datastorb", false)); + tests.add(new SQLPatternTestParams("multiple", "multiple datastores", false)); + tests.add(new SQLPatternTestParams(emptyString, "multiple datastores", false)); + tests.add(new SQLPatternTestParams(unicodeString, "¶T¤¤¤Ã3¼", true)); + tests.add(new SQLPatternTestParams(unicodeString, "¶T¤¤¤Ãz¼", false)); + + for (SQLPatternTestParams test : tests) { + setDrillBuf(test.inputString); + + RegexpUtil.SqlPatternInfo patternInfo = new RegexpUtil.SqlPatternInfo(RegexpUtil.SqlPatternType.CONTAINS, "", test.patternString); + SqlPatternMatcher sqlPatternContains = SqlPatternFactory.getSqlPatternMatcher(patternInfo); + int eval = sqlPatternContains.match(0, byteBuffer.limit(), drillBuf); + int expectedEval = test.shouldMatch ? 1 : 0; + + if (eval != expectedEval) { + System.err.format("test failed; params=%s%n", test); + } + + assertEquals(expectedEval, eval); + } + } + + @After public void cleanup() { drillBuf.close(); allocator.close(); } + + // ------------- + // Inner Classes + // ------------- + + /** Container class to hold SQL pattern test data */ + private static class SQLPatternTestParams { + private final String inputString; + private final String patternString; + private final boolean shouldMatch; + + private SQLPatternTestParams(String inputString, String patternString, boolean shouldMatch) { + this.inputString = inputString; + this.patternString = patternString; + this.shouldMatch = shouldMatch; + } + + public String toString() { + return "input=["+inputString+"], pattern=["+patternString+"], should-match=["+shouldMatch+"].."; + } + } }