[
https://issues.apache.org/jira/browse/PHOENIX-1287?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14363871#comment-14363871
]
ASF GitHub Bot commented on PHOENIX-1287:
-----------------------------------------
Github user JamesRTaylor commented on a diff in the pull request:
https://github.com/apache/phoenix/pull/46#discussion_r26525162
--- Diff:
phoenix-core/src/main/java/org/apache/phoenix/expression/util/regex/JONIRegexWrapper.java
---
@@ -0,0 +1,181 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.phoenix.expression.util.regex;
+
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.phoenix.schema.SortOrder;
+import org.jcodings.Encoding;
+import org.jcodings.specific.UTF8Encoding;
+import org.joni.Matcher;
+import org.joni.Option;
+import org.joni.Regex;
+
+import com.google.common.base.Preconditions;
+
+public class JONIRegexWrapper {
+
+ private static final Encoding PVARCHAR_ENCODING =
UTF8Encoding.INSTANCE;
+
+ static class JONIPattern extends AbstractBasePattern {
+
+ private final Regex pattern;
+ private boolean isLastMatcherStringNull;
+ private final String patternString;
+
+ /**
+  * Creates a pattern compiled with a flags value of 0.
+  *
+  * @param patternString the regular expression source; may be null, in which case
+  *        no Regex is compiled
+  */
+ JONIPattern(String patternString) {
+     this(patternString, 0);
+ }
+
+ /**
+  * Stores the original pattern text and, when it is non-null, compiles it into a joni
+  * Regex after rewriting any \Q...\E quoted sections via replacePatternQuote.
+  *
+  * @param patternString the regular expression source; may be null, in which case
+  *        the compiled pattern is left null
+  * @param flags option flags handed unchanged to the Regex constructor
+  */
+ JONIPattern(String patternString, int flags) {
+     // Keep the caller's text as given; only the compiled form uses the rewritten pattern.
+     this.patternString = patternString;
+     if (patternString != null) {
+         String unquoted = replacePatternQuote(patternString);
+         // Encode explicitly as UTF-8 so the bytes agree with PVARCHAR_ENCODING; the
+         // no-argument getBytes() uses the platform default charset, which corrupts
+         // non-ASCII patterns on JVMs whose default is not UTF-8.
+         byte[] patternBytes =
+                 unquoted.getBytes(java.nio.charset.StandardCharsets.UTF_8);
+         pattern = new Regex(patternBytes, 0, patternBytes.length,
+                 flags, PVARCHAR_ENCODING);
+     } else {
+         pattern = null;
+     }
+     isLastMatcherStringNull = false;
+ }
+
+ /**
+  * Character-driven state machine that removes \Q...\E quoting from a pattern and
+  * emits the quoted text with regex metacharacters backslash-escaped instead.
+  * Start in ZERO, feed every character of the pattern to next(), then call EOF()
+  * once to flush any pending backslash.
+  */
+ public enum ReplaceQuoteMachine {
+     /** Terminal state returned by EOF(); accepts no further input. */
+     STOP,
+     /** Outside a quote, no pending backslash. */
+     ZERO,
+     /** Outside a quote, a backslash has been read but not yet emitted. */
+     ONE_BACKSLASH,
+     /** Inside \Q...\E, no pending backslash. */
+     IN_QUOTE,
+     /** Inside \Q...\E, a backslash has been read but not yet emitted. */
+     IN_QUOTE_ONE_BACKSLASH;
+
+     /**
+      * Consumes one character of the pattern.
+      *
+      * @param sb receives the rewritten pattern text
+      * @param ch the next character of the source pattern
+      * @return the state to use for the following character
+      * @throws IllegalArgumentException if called on STOP
+      */
+     ReplaceQuoteMachine next(StringBuilder sb, char ch) {
+         switch (this) {
+         case ZERO:
+             if (ch == '\\') {
+                 return ONE_BACKSLASH;
+             }
+             sb.append(ch);
+             return ZERO;
+         case ONE_BACKSLASH:
+             if (ch == 'Q') {
+                 return IN_QUOTE;
+             }
+             // Any other escape pair is copied verbatim. The escaped character must not
+             // be re-examined: in "\\Q" the second backslash is data, so the following
+             // 'Q' does not open a quote.
+             sb.append('\\').append(ch);
+             return ZERO;
+         case IN_QUOTE:
+             // add backslashes for .^$*+?()[{| (a backslash is handled by its own state)
+             switch (ch) {
+             case '.':
+             case '^':
+             case '$':
+             case '*':
+             case '+':
+             case '?':
+             case '(':
+             case ')':
+             case '[':
+             case '{':
+             case '|':
+                 sb.append('\\').append(ch);
+                 return IN_QUOTE;
+             case '\\':
+                 return IN_QUOTE_ONE_BACKSLASH;
+             default:
+                 sb.append(ch);
+                 return IN_QUOTE;
+             }
+         case IN_QUOTE_ONE_BACKSLASH:
+             if (ch == 'E') {
+                 return ZERO;
+             }
+             // A backslash inside a quote is literal, so emit it escaped; otherwise
+             // "\Qa\b\E" would turn into the regex escape \b.
+             sb.append("\\\\");
+             return IN_QUOTE.next(sb, ch);
+         case STOP:
+         default:
+             throw new IllegalArgumentException(
+                     "next() called in state " + this);
+         }
+     }
+
+     /**
+      * Signals end of input and flushes a pending backslash, if any.
+      *
+      * @param sb receives any remaining output
+      * @return STOP
+      * @throws IllegalArgumentException if called on STOP
+      */
+     ReplaceQuoteMachine EOF(StringBuilder sb) {
+         switch (this) {
+         case ONE_BACKSLASH:
+             // Dangling backslash outside a quote: pass it through unchanged.
+             sb.append('\\');
+             return STOP;
+         case IN_QUOTE_ONE_BACKSLASH:
+             // Unterminated quote ending in a backslash: the backslash is literal.
+             sb.append("\\\\");
+             return STOP;
+         case ZERO:
+         case IN_QUOTE:
+             return STOP;
+         case STOP:
+         default:
+             throw new IllegalArgumentException(
+                     "EOF() called in state " + this);
+         }
+     }
+ }
+
+ private String replacePatternQuote(String patternString) {
+ StringBuilder sb = new StringBuilder();
+ ReplaceQuoteMachine cur = ReplaceQuoteMachine.ZERO;
+ for (int i = 0; i < patternString.length(); ++i) {
+ cur = cur.next(sb, patternString.charAt(i));
+ }
+ cur = cur.EOF(sb);
+ return sb.toString();
+ }
+
+ @Override
+ public AbstractBaseMatcher macher(ImmutableBytesWritable ptr,
SortOrder sortOrder) {
+ Preconditions.checkNotNull(ptr);
+ Preconditions.checkNotNull(sortOrder);
+ byte[] matcherSourceBytes =
Utils.immutableBytesWritableToBytes(ptr, sortOrder);
--- End diff --
I hope there's a pattern.matcher(byte[] buf, int offset) method we can use
instead below, as this will cause a copy of the underlying byte[].
If the copy is necessary, can you use ByteUtil.copyKeyBytesIfNecessary(), as it
prevents a copy unless one is required? I think this logic could be pulled
out of here and into LikeExpression.evaluate, as it should be the same in both
cases.
> Use the joni byte[] regex engine in place of j.u.regex
> ------------------------------------------------------
>
> Key: PHOENIX-1287
> URL: https://issues.apache.org/jira/browse/PHOENIX-1287
> Project: Phoenix
> Issue Type: Bug
> Reporter: James Taylor
> Assignee: Shuxiong Ye
> Labels: gsoc2015
>
> See HBASE-11907. We'd get a 2x perf benefit plus it's driven off of byte[]
> instead of strings. Thanks for the pointer, [~apurtell].
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)