Smalyshev has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/366326 )

Change subject: [WIP] Make regex() interruptible
......................................................................

[WIP] Make regex() interruptible

Bug: T169862
Change-Id: I1a89c64c1501388f085b4264c8704641ff1000b2
---
M 
blazegraph/src/main/java/org/wikidata/query/rdf/blazegraph/WikibaseContextListener.java
A 
blazegraph/src/main/java/org/wikidata/query/rdf/blazegraph/constraints/InterruptibleRegexBOp.java
A 
common/src/main/java/org/wikidata/query/rdf/common/InterruptibleCharSequence.java
M src/build/checkstyle/checkstyle.xml
M war/src/main/webapp/WEB-INF/web.xml
5 files changed, 460 insertions(+), 6 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/wikidata/query/rdf 
refs/changes/26/366326/1

diff --git 
a/blazegraph/src/main/java/org/wikidata/query/rdf/blazegraph/WikibaseContextListener.java
 
b/blazegraph/src/main/java/org/wikidata/query/rdf/blazegraph/WikibaseContextListener.java
index 3a6abe7..31d7a65 100644
--- 
a/blazegraph/src/main/java/org/wikidata/query/rdf/blazegraph/WikibaseContextListener.java
+++ 
b/blazegraph/src/main/java/org/wikidata/query/rdf/blazegraph/WikibaseContextListener.java
@@ -16,6 +16,7 @@
 import org.slf4j.LoggerFactory;
 import org.wikidata.query.rdf.blazegraph.constraints.CoordinatePartBOp;
 import org.wikidata.query.rdf.blazegraph.constraints.DecodeUriBOp;
+import org.wikidata.query.rdf.blazegraph.constraints.InterruptibleRegexBOp;
 import org.wikidata.query.rdf.blazegraph.constraints.WikibaseCornerBOp;
 import org.wikidata.query.rdf.blazegraph.constraints.WikibaseDateBOp;
 import org.wikidata.query.rdf.blazegraph.constraints.WikibaseDistanceBOp;
@@ -142,6 +143,9 @@
         FunctionRegistry.add(new URIImpl(GeoSparql.LAT_FUNCTION), 
getCoordinatePartBOpFactory(CoordinatePartBOp.Parts.LAT));
         // wikibase:decodeUri
         FunctionRegistry.add(new URIImpl(Ontology.NAMESPACE + "decodeUri"), 
getDecodeUriBOpFactory());
+
+        FunctionRegistry.remove(FunctionRegistry.REGEX);
+        FunctionRegistry.add(FunctionRegistry.REGEX, getRegexBOpFactory());
 
         addPrefixes(WikibaseUris.getURISystem());
 
@@ -313,6 +317,43 @@
     }
 
     /**
+     * Get CoordinatePartBOp factory.
+     * @return Factory to create CoordinatePartBOp
+     */
+    private static Factory getRegexBOpFactory() {
+        return new Factory() {
+            public IValueExpression<? extends IV> create(
+                    final BOpContextBase context,
+                    final GlobalAnnotations globals,
+                    Map<String, Object> scalarValues,
+                    final ValueExpressionNode... args) {
+
+                checkArgs(args, ValueExpressionNode.class,
+                        ValueExpressionNode.class);
+
+                final IValueExpression<? extends IV> var = AST2BOpUtility
+                        .toVE(context, globals, args[0]);
+
+                final IValueExpression<? extends IV> pattern = AST2BOpUtility
+                        .toVE(context, globals, args[1]);
+
+                if (args.length == 2) {
+
+                    return new InterruptibleRegexBOp(var, pattern);
+
+                } else {
+
+                    final IValueExpression<? extends IV> flags = AST2BOpUtility
+                            .toVE(context, globals, args[2]);
+
+                    return new InterruptibleRegexBOp(var, pattern, flags);
+
+                }
+            }
+        };
+    }
+
+    /**
      * Service factory that disables remote access.
      */
     private static final class DisableRemotesServiceFactory extends 
AbstractServiceFactoryBase {
diff --git 
a/blazegraph/src/main/java/org/wikidata/query/rdf/blazegraph/constraints/InterruptibleRegexBOp.java
 
b/blazegraph/src/main/java/org/wikidata/query/rdf/blazegraph/constraints/InterruptibleRegexBOp.java
new file mode 100644
index 0000000..5fc5cf7
--- /dev/null
+++ 
b/blazegraph/src/main/java/org/wikidata/query/rdf/blazegraph/constraints/InterruptibleRegexBOp.java
@@ -0,0 +1,367 @@
+/*
+Copy of the RegexBOp.java, modified to use interruptible char sequence.
+
+This is the copy of RegexBOp.java, modified to use InterruptibleCharSequence.
+
+Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.
+
+Contact:
+     SYSTAP, LLC DBA Blazegraph
+     2501 Calvert ST NW #106
+     Washington, DC 20008
+     licen...@blazegraph.com
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+*/
+package org.wikidata.query.rdf.blazegraph.constraints;
+
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.openrdf.model.Literal;
+import org.openrdf.model.Value;
+import org.openrdf.query.algebra.evaluation.util.QueryEvaluationUtil;
+import org.wikidata.query.rdf.common.InterruptibleCharSequence;
+
+import com.bigdata.bop.BOp;
+import com.bigdata.bop.IBindingSet;
+import com.bigdata.bop.IConstant;
+import com.bigdata.bop.IValueExpression;
+import com.bigdata.bop.NV;
+import com.bigdata.rdf.error.SparqlTypeErrorException;
+import com.bigdata.rdf.internal.IV;
+import com.bigdata.rdf.internal.constraints.INeedsMaterialization;
+import com.bigdata.rdf.internal.constraints.RegexBOp;
+import com.bigdata.rdf.internal.constraints.XSDBooleanIVValueExpression;
+import com.bigdata.rdf.sparql.ast.FilterNode;
+import com.bigdata.rdf.sparql.ast.QueryHints;
+
+//CHECKSTYLE:OFF
+/**
+ * SPARQL REGEX operator.
+ */
+public class InterruptibleRegexBOp extends XSDBooleanIVValueExpression
+        implements INeedsMaterialization {
+
+    /**
+     *
+     */
+    private static final long serialVersionUID = 1357420268214930143L;
+
+    private static final transient Logger log = 
LoggerFactory.getLogger(RegexBOp.class);
+
+    private static final boolean debug = log.isDebugEnabled();
+
+    private static final boolean info = log.isInfoEnabled();
+
+    /**
+     *
+     * Local member to implement {@link QueryHints.REGEX_MATCH_NON_STRING}
+     *
+     * {@link BLZG-1780}
+     *
+     */
+    private boolean matchNonString = QueryHints.DEFAULT_REGEX_MATCH_NON_STRING;
+
+       public interface Annotations extends 
XSDBooleanIVValueExpression.Annotations {
+
+
+        /**
+         * The cached regex pattern.
+         */
+        public String PATTERN = RegexBOp.class.getName()
+                + ".pattern";
+
+    }
+
+    private static Map<String,Object> anns(
+            final IValueExpression<? extends IV> pattern,
+            final IValueExpression<? extends IV> flags) {
+
+        try {
+
+            if (pattern instanceof IConstant &&
+                    (flags == null || flags instanceof IConstant)) {
+
+                final IV parg = ((IConstant<IV>) pattern).get();
+
+                final IV farg = flags != null ?
+                        ((IConstant<IV>) flags).get() : null;
+
+                if (parg.hasValue() && (farg == null || farg.hasValue())) {
+
+                    final Value pargVal = parg.getValue();
+
+                    final Value fargVal = farg != null ? farg.getValue() : 
null;
+
+                    return NV.asMap(
+                            new NV(Annotations.PATTERN,
+                                    getPattern(pargVal, fargVal)));
+
+                }
+
+            }
+
+        } catch (Exception ex) {
+
+            if (info) {
+                log.info("could not create pattern for: " + pattern + ", " + 
flags);
+            }
+
+        }
+
+        return BOp.NOANNS;
+
+    }
+
+    /**
+     * Construct a regex bop without flags.
+     */
+    @SuppressWarnings("rawtypes")
+    public InterruptibleRegexBOp(
+            final IValueExpression<? extends IV> var,
+            final IValueExpression<? extends IV> pattern) {
+
+        this(new BOp[] { var, pattern }, anns(pattern, null));
+
+    }
+
+    /**
+     * Construct a regex bop with flags.
+     */
+    @SuppressWarnings("rawtypes")
+    public InterruptibleRegexBOp(
+            final IValueExpression<? extends IV> var,
+            final IValueExpression<? extends IV> pattern,
+            final IValueExpression<? extends IV> flags) {
+
+        this(new BOp[] { var, pattern, flags }, anns(pattern, flags));
+
+    }
+
+    /**
+     * Required shallow copy constructor.
+     */
+    public InterruptibleRegexBOp(final BOp[] args, final Map<String, Object> 
anns) {
+
+        super(args, anns);
+
+        if (args.length < 2 || args[0] == null || args[1] == null)
+            throw new IllegalArgumentException();
+
+    }
+
+    /**
+     * Constructor required for {@link 
com.bigdata.bop.BOpUtility#deepCopy(FilterNode)}.
+     */
+    public InterruptibleRegexBOp(final InterruptibleRegexBOp op) {
+        super(op);
+    }
+
+    @Override
+    public Requirement getRequirement() {
+
+        return INeedsMaterialization.Requirement.SOMETIMES;
+
+    }
+
+    @Override
+    public boolean accept(final IBindingSet bs) {
+
+        final Value var = asValue(getAndCheckBound(0, bs));
+
+        @SuppressWarnings("rawtypes")
+        final IV pattern = getAndCheckBound(1, bs);
+
+        @SuppressWarnings("rawtypes")
+        final IV flags = arity() > 2 ? get(2).get(bs) : null;
+
+        if (debug) {
+            log.debug("regex var: " + var);
+            log.debug("regex pattern: " + pattern);
+            log.debug("regex flags: " + flags);
+        }
+
+        return accept(var, pattern.getValue(), flags != null ? flags.getValue()
+                : null);
+
+    }
+
+    /**
+     * Lifted directly from Sesame's EvaluationStrategyImpl.
+     *
+     * FIXME The Pattern should be cached if the pattern argument and flags are
+     * constants.
+     *
+     * @see <a href="http://sourceforge.net/apps/trac/bigdata/ticket/516";>
+     *      REGEXBOp should cache the Pattern when it is a constant </a>
+     */
+    private boolean accept(final Value arg, final Value parg, final Value 
farg) {
+
+        if (debug) {
+            log.debug("regex var: " + arg);
+            log.debug("regex pattern: " + parg);
+            log.debug("regex flags: " + farg);
+            //Fixme not sure why we weren't able pick up via properties
+                       log.debug(QueryHints.REGEX_MATCH_NON_STRING
+                                       + ": "
+                                       + 
this.getProperty(QueryHints.REGEX_MATCH_NON_STRING,
+                                                       
QueryHints.DEFAULT_REGEX_MATCH_NON_STRING));
+                       log.debug("matchNonString:  " + this.matchNonString);
+        }
+
+        //BLZG-1200 changed to isPlainLiteral
+               if (QueryEvaluationUtil.isPlainLiteral(arg)
+                               // BLZG-1780:  Query Hint to cast to string
+                               || matchNonString ) {
+
+            final String text;
+
+            if(QueryEvaluationUtil.isPlainLiteral(arg)) {
+               text = ((Literal) arg).getLabel();
+            } else { //Query Hint Override with explicit conversion
+               text = arg.stringValue();
+            }
+
+            if(debug) {
+               log.debug("regex text:  " + text);
+            }
+
+            try {
+
+                // first check for cached pattern
+                Pattern pattern = (Pattern) getProperty(Annotations.PATTERN);
+
+                if (pattern == null) {
+
+                    // resolve the pattern. NB: NOT cached.
+                    pattern = getPattern(parg, farg);
+
+                }
+
+                if (Thread.interrupted()) {
+
+                    /*
+                     * Eagerly notice if the operator is interrupted.
+                     *
+                     * Note: Regex can be a high latency operation for a large
+                     * RDF Literal. Therefore we want to check for an interrupt
+                     * before each regex test. The Pattern code itself will not
+                     * notice an interrupt....
+                     */
+                    throw new RuntimeException(new InterruptedException());
+
+                }
+
+                final boolean result = pattern.matcher(new 
InterruptibleCharSequence(text)).find();
+
+                return result;
+
+            } catch (IllegalArgumentException ex) {
+
+                throw new SparqlTypeErrorException();
+
+            }
+
+        } else {
+
+
+
+               if(debug) {
+                       log.debug("Unknown type:  " + arg);
+               }
+
+            throw new SparqlTypeErrorException();
+
+        }
+
+    }
+
+    private static Pattern getPattern(final Value parg, final Value farg)
+            throws IllegalArgumentException {
+
+        if (debug) {
+            log.debug("regex pattern: " + parg);
+            log.debug("regex flags: " + farg);
+        }
+
+        //BLZG-1200 Literals with language types are not included in REGEX
+        if (QueryEvaluationUtil.isPlainLiteral(parg)
+                && (farg == null || QueryEvaluationUtil.isPlainLiteral(farg))) 
{
+
+            final String ptn = ((Literal) parg).getLabel();
+            String flags = "";
+            if (farg != null) {
+                flags = ((Literal)farg).getLabel();
+            }
+            int f = 0;
+            for (char c : flags.toCharArray()) {
+                switch (c) {
+                    case 's':
+                        f |= Pattern.DOTALL;
+                        break;
+                    case 'm':
+                        f |= Pattern.MULTILINE;
+                        break;
+                    case 'i': {
+                    /*
+                     * The SPARQL REGEX operator is based on the XQuery REGEX
+                     * operator. That operator should be Unicode clean by
+                     * default. Therefore, when case-folding is specified, we
+                     * also need to include the UNICODE_CASE option.
+                     *
+                     * @see <a
+                     * 
href="https://sourceforge.net/apps/trac/bigdata/ticket/655";
+                     * > SPARQL REGEX operator does not perform case-folding
+                     * correctly for Unicode data </a>
+                     */
+                        f |= Pattern.CASE_INSENSITIVE;
+                        f |= Pattern.UNICODE_CASE;
+                        break;
+                    }
+                    case 'x':
+                        f |= Pattern.COMMENTS;
+                        break;
+                    case 'd':
+                        f |= Pattern.UNIX_LINES;
+                        break;
+                    case 'u': // Implicit with 'i' flag.
+//                      f |= Pattern.UNICODE_CASE;
+                        break;
+                    default:
+                        throw new IllegalArgumentException();
+                }
+            }
+            final Pattern pattern = Pattern.compile(ptn, f);
+            return pattern;
+        }
+
+        throw new IllegalArgumentException();
+
+    }
+
+    public boolean isMatchNonString() {
+               return matchNonString;
+       }
+
+       public void setMatchNonString(boolean matchNonString) {
+               this.matchNonString = matchNonString;
+       }
+
+
+}
+//CHECKSTYLE:ON
diff --git 
a/common/src/main/java/org/wikidata/query/rdf/common/InterruptibleCharSequence.java
 
b/common/src/main/java/org/wikidata/query/rdf/common/InterruptibleCharSequence.java
new file mode 100644
index 0000000..95c9004
--- /dev/null
+++ 
b/common/src/main/java/org/wikidata/query/rdf/common/InterruptibleCharSequence.java
@@ -0,0 +1,41 @@
+package org.wikidata.query.rdf.common;
+/**
+ * CharSequence that noticed thread interrupts -- as might be necessary
+ * to recover from a loose regex on unexpected challenging input.
+ *
+ * @author gojomo
+ */
+public class InterruptibleCharSequence implements CharSequence {
+    /**
+     * Parent sequence.
+     */
+    private final CharSequence inner;
+
+    public InterruptibleCharSequence(CharSequence inner) {
+        super();
+        this.inner = inner;
+    }
+
+    @Override
+    public char charAt(int index) {
+        if (Thread.interrupted()) { // clears flag if set
+            throw new RuntimeException(new InterruptedException());
+        }
+        return inner.charAt(index);
+    }
+
+    @Override
+    public int length() {
+        return inner.length();
+    }
+
+    @Override
+    public CharSequence subSequence(int start, int end) {
+        return new InterruptibleCharSequence(inner.subSequence(start, end));
+    }
+
+    @Override
+    public String toString() {
+        return inner.toString();
+    }
+}
diff --git a/src/build/checkstyle/checkstyle.xml 
b/src/build/checkstyle/checkstyle.xml
index 6b9486d..faa61e5 100644
--- a/src/build/checkstyle/checkstyle.xml
+++ b/src/build/checkstyle/checkstyle.xml
@@ -1,8 +1,8 @@
 <?xml version="1.0"?>
-<!-- Copyright (C) 2014 Wikimedia Foundation Licensed under the Apache 
License, Version 2.0 (the "License"); you may not 
-  use this file except in compliance with the License. You may obtain a copy 
of the License at http://www.apache.org/licenses/LICENSE-2.0 
-  Unless required by applicable law or agreed to in writing, software 
distributed under the License is distributed on an "AS 
-  IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
implied. See the License for the specific language 
+<!-- Copyright (C) 2014 Wikimedia Foundation Licensed under the Apache 
License, Version 2.0 (the "License"); you may not
+  use this file except in compliance with the License. You may obtain a copy 
of the License at http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software 
distributed under the License is distributed on an "AS
+  IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
implied. See the License for the specific language
   governing permissions and limitations under the License. -->
 <!DOCTYPE module PUBLIC
     "-//Puppy Crawl//DTD Check Configuration 1.2//EN"
@@ -10,12 +10,17 @@
 
 <module name="Checker">
   <module name="SuppressWarningsFilter" />
+  <module name="SuppressionCommentFilter">
+    <property name="offCommentFormat" value="CHECKSTYLE:OFF"/>
+    <property name="onCommentFormat" value="CHECKSTYLE:ON"/>
+  </module>
   <module name="SuppressionFilter">
     <!-- Suppresses some checks for tests. -->
     <property name="file" value="${checkstyle.suppressions}" />
   </module>
   <module name="TreeWalker">
     <module name="SuppressWarningsHolder" />
+    <module name="FileContentsHolder"/>
     <!-- Javadoc -->
     <module name="JavadocType">
       <property name="allowMissingParamTags" value="true" /> <!-- Sometimes 
its just overkill -->
@@ -80,7 +85,7 @@
     </module>
     <module name="TypeName" />
     <module name="ConstantName">
-      <!-- Allow "log". There might be other non-constant static finals but 
they should be rare enough we can just list them 
+      <!-- Allow "log". There might be other non-constant static finals but 
they should be rare enough we can just list them
         here when they come up. -->
       <property name="format" value="^([A-Z][A-Z0-9]*(_[A-Z0-9]+)*|log)$" />
     </module>
diff --git a/war/src/main/webapp/WEB-INF/web.xml 
b/war/src/main/webapp/WEB-INF/web.xml
index 91efc26..e04b437 100644
--- a/war/src/main/webapp/WEB-INF/web.xml
+++ b/war/src/main/webapp/WEB-INF/web.xml
@@ -53,7 +53,7 @@
   <context-param>
    <description>When non-zero, the timeout for queries 
(milliseconds).</description>
    <param-name>queryTimeout</param-name>
-   <param-value>600000</param-value>
+   <param-value>6000</param-value>
   </context-param>
   <context-param>
    <description>When non-zero, the timeout for the warmup period 
(milliseconds). The warmup period pulls in the non-leaf index pages and reduces 
the impact of sudden heavy query workloads on the disk and on GC.  The end 
points are not available during the warmup period.</description>

-- 
To view, visit https://gerrit.wikimedia.org/r/366326
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I1a89c64c1501388f085b4264c8704641ff1000b2
Gerrit-PatchSet: 1
Gerrit-Project: wikidata/query/rdf
Gerrit-Branch: master
Gerrit-Owner: Smalyshev <smalys...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to