Smalyshev has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/366326 )
Change subject: [WIP] Make regex() interruptible ...................................................................... [WIP] Make regex() interruptible Bug: T169862 Change-Id: I1a89c64c1501388f085b4264c8704641ff1000b2 --- M blazegraph/src/main/java/org/wikidata/query/rdf/blazegraph/WikibaseContextListener.java A blazegraph/src/main/java/org/wikidata/query/rdf/blazegraph/constraints/InterruptibleRegexBOp.java A common/src/main/java/org/wikidata/query/rdf/common/InterruptibleCharSequence.java M src/build/checkstyle/checkstyle.xml M war/src/main/webapp/WEB-INF/web.xml 5 files changed, 460 insertions(+), 6 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/wikidata/query/rdf refs/changes/26/366326/1 diff --git a/blazegraph/src/main/java/org/wikidata/query/rdf/blazegraph/WikibaseContextListener.java b/blazegraph/src/main/java/org/wikidata/query/rdf/blazegraph/WikibaseContextListener.java index 3a6abe7..31d7a65 100644 --- a/blazegraph/src/main/java/org/wikidata/query/rdf/blazegraph/WikibaseContextListener.java +++ b/blazegraph/src/main/java/org/wikidata/query/rdf/blazegraph/WikibaseContextListener.java @@ -16,6 +16,7 @@ import org.slf4j.LoggerFactory; import org.wikidata.query.rdf.blazegraph.constraints.CoordinatePartBOp; import org.wikidata.query.rdf.blazegraph.constraints.DecodeUriBOp; +import org.wikidata.query.rdf.blazegraph.constraints.InterruptibleRegexBOp; import org.wikidata.query.rdf.blazegraph.constraints.WikibaseCornerBOp; import org.wikidata.query.rdf.blazegraph.constraints.WikibaseDateBOp; import org.wikidata.query.rdf.blazegraph.constraints.WikibaseDistanceBOp; @@ -142,6 +143,9 @@ FunctionRegistry.add(new URIImpl(GeoSparql.LAT_FUNCTION), getCoordinatePartBOpFactory(CoordinatePartBOp.Parts.LAT)); // wikibase:decodeUri FunctionRegistry.add(new URIImpl(Ontology.NAMESPACE + "decodeUri"), getDecodeUriBOpFactory()); + + FunctionRegistry.remove(FunctionRegistry.REGEX); + FunctionRegistry.add(FunctionRegistry.REGEX, getRegexBOpFactory()); addPrefixes(WikibaseUris.getURISystem()); @@ -313,6 +317,43 @@ } /** + * Get CoordinatePartBOp factory. + * @return Factory to create CoordinatePartBOp + */ + private static Factory getRegexBOpFactory() { + return new Factory() { + public IValueExpression<? extends IV> create( + final BOpContextBase context, + final GlobalAnnotations globals, + Map<String, Object> scalarValues, + final ValueExpressionNode... args) { + + checkArgs(args, ValueExpressionNode.class, + ValueExpressionNode.class); + + final IValueExpression<? extends IV> var = AST2BOpUtility + .toVE(context, globals, args[0]); + + final IValueExpression<? extends IV> pattern = AST2BOpUtility + .toVE(context, globals, args[1]); + + if (args.length == 2) { + + return new InterruptibleRegexBOp(var, pattern); + + } else { + + final IValueExpression<? extends IV> flags = AST2BOpUtility + .toVE(context, globals, args[2]); + + return new InterruptibleRegexBOp(var, pattern, flags); + + } + } + }; + } + + /** * Service factory that disables remote access. */ private static final class DisableRemotesServiceFactory extends AbstractServiceFactoryBase { diff --git a/blazegraph/src/main/java/org/wikidata/query/rdf/blazegraph/constraints/InterruptibleRegexBOp.java b/blazegraph/src/main/java/org/wikidata/query/rdf/blazegraph/constraints/InterruptibleRegexBOp.java new file mode 100644 index 0000000..5fc5cf7 --- /dev/null +++ b/blazegraph/src/main/java/org/wikidata/query/rdf/blazegraph/constraints/InterruptibleRegexBOp.java @@ -0,0 +1,367 @@ +/* +Copy of the RegexBOp.java, modified to use interruptible char sequence. + +This is the copy of RegexBOp.java, modified to use InterruptibleCharSequence. + +Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved. + +Contact: + SYSTAP, LLC DBA Blazegraph + 2501 Calvert ST NW #106 + Washington, DC 20008 + licen...@blazegraph.com + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +*/ +package org.wikidata.query.rdf.blazegraph.constraints; + +import java.util.Map; +import java.util.regex.Pattern; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.openrdf.model.Literal; +import org.openrdf.model.Value; +import org.openrdf.query.algebra.evaluation.util.QueryEvaluationUtil; +import org.wikidata.query.rdf.common.InterruptibleCharSequence; + +import com.bigdata.bop.BOp; +import com.bigdata.bop.IBindingSet; +import com.bigdata.bop.IConstant; +import com.bigdata.bop.IValueExpression; +import com.bigdata.bop.NV; +import com.bigdata.rdf.error.SparqlTypeErrorException; +import com.bigdata.rdf.internal.IV; +import com.bigdata.rdf.internal.constraints.INeedsMaterialization; +import com.bigdata.rdf.internal.constraints.RegexBOp; +import com.bigdata.rdf.internal.constraints.XSDBooleanIVValueExpression; +import com.bigdata.rdf.sparql.ast.FilterNode; +import com.bigdata.rdf.sparql.ast.QueryHints; + +//CHECKSTYLE:OFF +/** + * SPARQL REGEX operator. + */ +public class InterruptibleRegexBOp extends XSDBooleanIVValueExpression + implements INeedsMaterialization { + + /** + * + */ + private static final long serialVersionUID = 1357420268214930143L; + + private static final transient Logger log = LoggerFactory.getLogger(RegexBOp.class); + + private static final boolean debug = log.isDebugEnabled(); + + private static final boolean info = log.isInfoEnabled(); + + /** + * + * Local member to implement {@link QueryHints.REGEX_MATCH_NON_STRING} + * + * {@link BLZG-1780} + * + */ + private boolean matchNonString = QueryHints.DEFAULT_REGEX_MATCH_NON_STRING; + + public interface Annotations extends XSDBooleanIVValueExpression.Annotations { + + + /** + * The cached regex pattern. + */ + public String PATTERN = RegexBOp.class.getName() + + ".pattern"; + + } + + private static Map<String,Object> anns( + final IValueExpression<? extends IV> pattern, + final IValueExpression<? extends IV> flags) { + + try { + + if (pattern instanceof IConstant && + (flags == null || flags instanceof IConstant)) { + + final IV parg = ((IConstant<IV>) pattern).get(); + + final IV farg = flags != null ? + ((IConstant<IV>) flags).get() : null; + + if (parg.hasValue() && (farg == null || farg.hasValue())) { + + final Value pargVal = parg.getValue(); + + final Value fargVal = farg != null ? farg.getValue() : null; + + return NV.asMap( + new NV(Annotations.PATTERN, + getPattern(pargVal, fargVal))); + + } + + } + + } catch (Exception ex) { + + if (info) { + log.info("could not create pattern for: " + pattern + ", " + flags); + } + + } + + return BOp.NOANNS; + + } + + /** + * Construct a regex bop without flags. + */ + @SuppressWarnings("rawtypes") + public InterruptibleRegexBOp( + final IValueExpression<? extends IV> var, + final IValueExpression<? extends IV> pattern) { + + this(new BOp[] { var, pattern }, anns(pattern, null)); + + } + + /** + * Construct a regex bop with flags. + */ + @SuppressWarnings("rawtypes") + public InterruptibleRegexBOp( + final IValueExpression<? extends IV> var, + final IValueExpression<? extends IV> pattern, + final IValueExpression<? extends IV> flags) { + + this(new BOp[] { var, pattern, flags }, anns(pattern, flags)); + + } + + /** + * Required shallow copy constructor. + */ + public InterruptibleRegexBOp(final BOp[] args, final Map<String, Object> anns) { + + super(args, anns); + + if (args.length < 2 || args[0] == null || args[1] == null) + throw new IllegalArgumentException(); + + } + + /** + * Constructor required for {@link com.bigdata.bop.BOpUtility#deepCopy(FilterNode)}. + */ + public InterruptibleRegexBOp(final InterruptibleRegexBOp op) { + super(op); + } + + @Override + public Requirement getRequirement() { + + return INeedsMaterialization.Requirement.SOMETIMES; + + } + + @Override + public boolean accept(final IBindingSet bs) { + + final Value var = asValue(getAndCheckBound(0, bs)); + + @SuppressWarnings("rawtypes") + final IV pattern = getAndCheckBound(1, bs); + + @SuppressWarnings("rawtypes") + final IV flags = arity() > 2 ? get(2).get(bs) : null; + + if (debug) { + log.debug("regex var: " + var); + log.debug("regex pattern: " + pattern); + log.debug("regex flags: " + flags); + } + + return accept(var, pattern.getValue(), flags != null ? flags.getValue() + : null); + + } + + /** + * Lifted directly from Sesame's EvaluationStrategyImpl. + * + * FIXME The Pattern should be cached if the pattern argument and flags are + * constants. + * + * @see <a href="http://sourceforge.net/apps/trac/bigdata/ticket/516"> + * REGEXBOp should cache the Pattern when it is a constant </a> + */ + private boolean accept(final Value arg, final Value parg, final Value farg) { + + if (debug) { + log.debug("regex var: " + arg); + log.debug("regex pattern: " + parg); + log.debug("regex flags: " + farg); + //Fixme not sure why we weren't able pick up via properties + log.debug(QueryHints.REGEX_MATCH_NON_STRING + + ": " + + this.getProperty(QueryHints.REGEX_MATCH_NON_STRING, + QueryHints.DEFAULT_REGEX_MATCH_NON_STRING)); + log.debug("matchNonString: " + this.matchNonString); + } + + //BLZG-1200 changed to isPlainLiteral + if (QueryEvaluationUtil.isPlainLiteral(arg) + // BLZG-1780: Query Hint to cast to string + || matchNonString ) { + + final String text; + + if(QueryEvaluationUtil.isPlainLiteral(arg)) { + text = ((Literal) arg).getLabel(); + } else { //Query Hint Override with explicit conversion + text = arg.stringValue(); + } + + if(debug) { + log.debug("regex text: " + text); + } + + try { + + // first check for cached pattern + Pattern pattern = (Pattern) getProperty(Annotations.PATTERN); + + if (pattern == null) { + + // resolve the pattern. NB: NOT cached. + pattern = getPattern(parg, farg); + + } + + if (Thread.interrupted()) { + + /* + * Eagerly notice if the operator is interrupted. + * + * Note: Regex can be a high latency operation for a large + * RDF Literal. Therefore we want to check for an interrupt + * before each regex test. The Pattern code itself will not + * notice an interrupt.... + */ + throw new RuntimeException(new InterruptedException()); + + } + + final boolean result = pattern.matcher(new InterruptibleCharSequence(text)).find(); + + return result; + + } catch (IllegalArgumentException ex) { + + throw new SparqlTypeErrorException(); + + } + + } else { + + + + if(debug) { + log.debug("Unknown type: " + arg); + } + + throw new SparqlTypeErrorException(); + + } + + } + + private static Pattern getPattern(final Value parg, final Value farg) + throws IllegalArgumentException { + + if (debug) { + log.debug("regex pattern: " + parg); + log.debug("regex flags: " + farg); + } + + //BLZG-1200 Literals with language types are not included in REGEX + if (QueryEvaluationUtil.isPlainLiteral(parg) + && (farg == null || QueryEvaluationUtil.isPlainLiteral(farg))) { + + final String ptn = ((Literal) parg).getLabel(); + String flags = ""; + if (farg != null) { + flags = ((Literal)farg).getLabel(); + } + int f = 0; + for (char c : flags.toCharArray()) { + switch (c) { + case 's': + f |= Pattern.DOTALL; + break; + case 'm': + f |= Pattern.MULTILINE; + break; + case 'i': { + /* + * The SPARQL REGEX operator is based on the XQuery REGEX + * operator. That operator should be Unicode clean by + * default. Therefore, when case-folding is specified, we + * also need to include the UNICODE_CASE option. + * + * @see <a + * href="https://sourceforge.net/apps/trac/bigdata/ticket/655" + * > SPARQL REGEX operator does not perform case-folding + * correctly for Unicode data </a> + */ + f |= Pattern.CASE_INSENSITIVE; + f |= Pattern.UNICODE_CASE; + break; + } + case 'x': + f |= Pattern.COMMENTS; + break; + case 'd': + f |= Pattern.UNIX_LINES; + break; + case 'u': // Implicit with 'i' flag. +// f |= Pattern.UNICODE_CASE; + break; + default: + throw new IllegalArgumentException(); + } + } + final Pattern pattern = Pattern.compile(ptn, f); + return pattern; + } + + throw new IllegalArgumentException(); + + } + + public boolean isMatchNonString() { + return matchNonString; + } + + public void setMatchNonString(boolean matchNonString) { + this.matchNonString = matchNonString; + } + + +} +//CHECKSTYLE:ON diff --git a/common/src/main/java/org/wikidata/query/rdf/common/InterruptibleCharSequence.java b/common/src/main/java/org/wikidata/query/rdf/common/InterruptibleCharSequence.java new file mode 100644 index 0000000..95c9004 --- /dev/null +++ b/common/src/main/java/org/wikidata/query/rdf/common/InterruptibleCharSequence.java @@ -0,0 +1,41 @@ +package org.wikidata.query.rdf.common; +/** + * CharSequence that noticed thread interrupts -- as might be necessary + * to recover from a loose regex on unexpected challenging input. + * + * @author gojomo + */ +public class InterruptibleCharSequence implements CharSequence { + /** + * Parent sequence. + */ + private final CharSequence inner; + + public InterruptibleCharSequence(CharSequence inner) { + super(); + this.inner = inner; + } + + @Override + public char charAt(int index) { + if (Thread.interrupted()) { // clears flag if set + throw new RuntimeException(new InterruptedException()); + } + return inner.charAt(index); + } + + @Override + public int length() { + return inner.length(); + } + + @Override + public CharSequence subSequence(int start, int end) { + return new InterruptibleCharSequence(inner.subSequence(start, end)); + } + + @Override + public String toString() { + return inner.toString(); + } +} diff --git a/src/build/checkstyle/checkstyle.xml b/src/build/checkstyle/checkstyle.xml index 6b9486d..faa61e5 100644 --- a/src/build/checkstyle/checkstyle.xml +++ b/src/build/checkstyle/checkstyle.xml @@ -1,8 +1,8 @@ <?xml version="1.0"?> -<!-- Copyright (C) 2014 Wikimedia Foundation Licensed under the Apache License, Version 2.0 (the "License"); you may not - use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS - IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language +<!-- Copyright (C) 2014 Wikimedia Foundation Licensed under the Apache License, Version 2.0 (the "License"); you may not + use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS + IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. --> <!DOCTYPE module PUBLIC "-//Puppy Crawl//DTD Check Configuration 1.2//EN" @@ -10,12 +10,17 @@ <module name="Checker"> <module name="SuppressWarningsFilter" /> + <module name="SuppressionCommentFilter"> + <property name="offCommentFormat" value="CHECKSTYLE:OFF"/> + <property name="onCommentFormat" value="CHECKSTYLE:ON"/> + </module> <module name="SuppressionFilter"> <!-- Suppresses some checks for tests. --> <property name="file" value="${checkstyle.suppressions}" /> </module> <module name="TreeWalker"> <module name="SuppressWarningsHolder" /> + <module name="FileContentsHolder"/> <!-- Javadoc --> <module name="JavadocType"> <property name="allowMissingParamTags" value="true" /> <!-- Sometimes its just overkill --> @@ -80,7 +85,7 @@ </module> <module name="TypeName" /> <module name="ConstantName"> - <!-- Allow "log". There might be other non-constant static finals but they should be rare enough we can just list them + <!-- Allow "log". There might be other non-constant static finals but they should be rare enough we can just list them here when they come up. --> <property name="format" value="^([A-Z][A-Z0-9]*(_[A-Z0-9]+)*|log)$" /> </module> diff --git a/war/src/main/webapp/WEB-INF/web.xml b/war/src/main/webapp/WEB-INF/web.xml index 91efc26..e04b437 100644 --- a/war/src/main/webapp/WEB-INF/web.xml +++ b/war/src/main/webapp/WEB-INF/web.xml @@ -53,7 +53,7 @@ <context-param> <description>When non-zero, the timeout for queries (milliseconds).</description> <param-name>queryTimeout</param-name> - <param-value>600000</param-value> + <param-value>6000</param-value> </context-param> <context-param> <description>When non-zero, the timeout for the warmup period (milliseconds). The warmup period pulls in the non-leaf index pages and reduces the impact of sudden heavy query workloads on the disk and on GC. The end points are not available during the warmup period.</description> -- To view, visit https://gerrit.wikimedia.org/r/366326 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I1a89c64c1501388f085b4264c8704641ff1000b2 Gerrit-PatchSet: 1 Gerrit-Project: wikidata/query/rdf Gerrit-Branch: master Gerrit-Owner: Smalyshev <smalys...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits