VXQUERY-32: Integrate Apache Lucene. Added two functions to VXQuery: build-index-on-collection, which creates a Lucene index for a collection, and collection-from-index, which queries a Lucene index to return a collection. Added tests.
Project: http://git-wip-us.apache.org/repos/asf/vxquery/repo Commit: http://git-wip-us.apache.org/repos/asf/vxquery/commit/1f623b16 Tree: http://git-wip-us.apache.org/repos/asf/vxquery/tree/1f623b16 Diff: http://git-wip-us.apache.org/repos/asf/vxquery/diff/1f623b16 Branch: refs/heads/master Commit: 1f623b1664fb5a86dae8b6be2497d822f80505d9 Parents: 2b59326 Author: Steven Glenn Jacobs <[email protected]> Authored: Fri May 27 12:42:47 2016 -0700 Committer: Steven Glenn Jacobs <[email protected]> Committed: Fri May 27 12:42:47 2016 -0700 ---------------------------------------------------------------------- vxquery-core/pom.xml | 31 +- .../vxquery/functions/builtin-functions.xml | 22 + .../apache/vxquery/index/IndexAttributes.java | 287 ++++++ .../vxquery/index/IndexDocumentBuilder.java | 882 +++++++++++++++++++ .../org/apache/vxquery/index/IndexElement.java | 42 + .../VXQueryCollectionOperatorDescriptor.java | 81 +- .../functions/index/CaseSensitiveAnalyzer.java | 144 +++ .../index/CaseSensitiveQueryParser.java | 42 + ...ctionFromIndexUnnestingEvaluatorFactory.java | 327 +++++++ .../IndexConstructorScalarEvaluatorFactory.java | 70 ++ .../functions/index/IndexConstructorUtil.java | 144 +++ .../runtime/functions/util/FunctionHelper.java | 7 +- .../vxquery/xmlparser/SAXContentHandler.java | 72 +- .../org/apache/vxquery/xmlparser/XMLParser.java | 2 +- .../org/apache/vxquery/xtest/VXQueryTest.java | 49 +- .../Indexing/createIndex.txt | 0 .../ExpectedTestResults/Indexing/useIndex1.txt | 2 + .../ExpectedTestResults/Indexing/useIndex2.txt | 1 + .../ExpectedTestResults/Indexing/useIndex3.txt | 1 + .../ExpectedTestResults/Indexing/useIndex4.txt | 1 + .../ExpectedTestResults/Indexing/useIndex5.txt | 3 + .../ExpectedTestResults/Indexing/useIndex6.txt | 2 + .../ExpectedTestResults/Indexing/useIndex7.txt | 3 + .../Queries/XQuery/Indexing/createIndex.xq | 20 + .../Queries/XQuery/Indexing/useIndex1.xq | 25 + .../Queries/XQuery/Indexing/useIndex2.xq | 24 + 
.../Queries/XQuery/Indexing/useIndex3.xq | 27 + .../Queries/XQuery/Indexing/useIndex4.xq | 24 + .../Queries/XQuery/Indexing/useIndex5.xq | 23 + .../Queries/XQuery/Indexing/useIndex6.xq | 23 + .../Queries/XQuery/Indexing/useIndex7.xq | 27 + .../src/test/resources/VXQueryCatalog.xml | 15 + .../src/test/resources/cat/IndexingQueries.xml | 63 ++ 33 files changed, 2380 insertions(+), 106 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/vxquery/blob/1f623b16/vxquery-core/pom.xml ---------------------------------------------------------------------- diff --git a/vxquery-core/pom.xml b/vxquery-core/pom.xml index d8f5f53..56eb45f 100644 --- a/vxquery-core/pom.xml +++ b/vxquery-core/pom.xml @@ -266,7 +266,36 @@ <artifactId>junit</artifactId> <scope>test</scope> </dependency> - + <dependency> + <artifactId>lucene-core</artifactId> + <groupId>org.apache.lucene</groupId> + <type>jar</type> + <version>5.5.1</version> + </dependency> + <dependency> + <artifactId>lucene-queryparser</artifactId> + <groupId>org.apache.lucene</groupId> + <type>jar</type> + <version>5.5.1</version> + </dependency> + <dependency> + <artifactId>lucene-analyzers-common</artifactId> + <groupId>org.apache.lucene</groupId> + <type>jar</type> + <version>5.5.1</version> + </dependency> + <dependency> + <artifactId>lucene-demo</artifactId> + <groupId>org.apache.lucene</groupId> + <type>jar</type> + <version>5.5.1</version> + </dependency> + <dependency> + <artifactId>lucene-backward-codecs</artifactId> + <groupId>org.apache.lucene</groupId> + <type>jar</type> + <version>5.5.1</version> + </dependency> <dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-mapreduce-client-core</artifactId> http://git-wip-us.apache.org/repos/asf/vxquery/blob/1f623b16/vxquery-core/src/main/java/org/apache/vxquery/functions/builtin-functions.xml ---------------------------------------------------------------------- diff --git 
a/vxquery-core/src/main/java/org/apache/vxquery/functions/builtin-functions.xml b/vxquery-core/src/main/java/org/apache/vxquery/functions/builtin-functions.xml index 3b9371d..8379ccf 100644 --- a/vxquery-core/src/main/java/org/apache/vxquery/functions/builtin-functions.xml +++ b/vxquery-core/src/main/java/org/apache/vxquery/functions/builtin-functions.xml @@ -128,6 +128,28 @@ <!-- Collection operator is added during the rewrite rules phase. --> </function> + <!-- fn:build-index-on-collection($arg as xs:string?, $indexFolder as xs:string?) as node()* --> + <function name="fn:build-index-on-collection"> + <param name="collection-folder" type="xs:string?"/> + <param name="index-folder" type="xs:string?"/> + <return type="node()*"/> + <runtime type="scalar" class="org.apache.vxquery.runtime.functions.index.IndexConstructorScalarEvaluatorFactory"/> + </function> + + <!-- fn:collection-from-index($indexfolder as xs:string?, $elementpath as xs:string?) as node()* --> + <function name="fn:collection-from-index"> + <param name="index-folder" type="xs:string?"/> + <param name="element-path" type="xs:string?"/> + <return type="node()*"/> + <runtime type="unnesting" class="org.apache.vxquery.runtime.functions.index.CollectionFromIndexUnnestingEvaluatorFactory"/> + <property type="DocumentOrder" class="org.apache.vxquery.compiler.rewriter.rules.propagationpolicies.InputPropertyPropagationPolicy"> + <argument value="0"/> + </property> + <property type="UniqueNodes" class="org.apache.vxquery.compiler.rewriter.rules.propagationpolicies.InputPropertyPropagationPolicy"> + <argument value="0"/> + </property> + </function> + <!-- fn:collection-with-tag($arg1 as xs:string?, $arg2 as xs:string?) 
as node()* --> <function name="fn:collection-with-tag"> <param name="arg1" type="xs:string?"/> http://git-wip-us.apache.org/repos/asf/vxquery/blob/1f623b16/vxquery-core/src/main/java/org/apache/vxquery/index/IndexAttributes.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/index/IndexAttributes.java b/vxquery-core/src/main/java/org/apache/vxquery/index/IndexAttributes.java new file mode 100644 index 0000000..cf8e3c0 --- /dev/null +++ b/vxquery-core/src/main/java/org/apache/vxquery/index/IndexAttributes.java @@ -0,0 +1,287 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.vxquery.index; + +import java.util.List; + +import org.xml.sax.Attributes; + +public class IndexAttributes implements Attributes { + int length; + + List<String> names; + List<String> values; + List<String> uris; + List<String> localnames; + List<String> types; + List<String> qnames; + + public IndexAttributes(List<String> n, List<String> v, List<String> u, List<String> l, List<String> t, + List<String> q) { + length = n.size(); + names = n; + values = v; + uris = u; + localnames = l; + types = t; + qnames = q; + } + + /** + * Return the number of attributes in the list. + * <p> + * Once you know the number of attributes, you can iterate through the list. + * </p> + * + * @return The number of attributes in the list. + * @see #getURI(int) + * @see #getLocalName(int) + * @see #getQName(int) + * @see #getType(int) + * @see #getValue(int) + */ + @Override + public int getLength() { + return length; + } + + /** + * Look up an attribute's Namespace URI by index. + * + * @param index + * The attribute index (zero-based). + * @return The Namespace URI, or the empty string if none + * is available, or null if the index is out of + * range. + * @see #getLength + */ + @Override + public String getURI(int index) { + return uris.get(index); + } + + /** + * Look up an attribute's local name by index. + * + * @param index + * The attribute index (zero-based). + * @return The local name, or the empty string if Namespace + * processing is not being performed, or null + * if the index is out of range. + * @see #getLength + */ + @Override + public String getLocalName(int index) { + return localnames.get(index); + } + + /** + * Look up an attribute's XML qualified (prefixed) name by index. + * + * @param index + * The attribute index (zero-based). + * @return The XML qualified name, or the empty string + * if none is available, or null if the index + * is out of range. 
+ * @see #getLength + */ + @Override + public String getQName(int index) { + return qnames.get(index); + } + + /** + * Look up an attribute's type by index. + * <p> + * The attribute type is one of the strings "CDATA", "ID", "IDREF", "IDREFS", "NMTOKEN", "NMTOKENS", "ENTITY", + * "ENTITIES", or "NOTATION" (always in upper case). + * </p> + * <p> + * If the parser has not read a declaration for the attribute, or if the parser does not report attribute types, + * then it must return the value "CDATA" as stated in the XML 1.0 Recommendation (clause 3.3.3, + * "Attribute-Value Normalization"). + * </p> + * <p> + * For an enumerated attribute that is not a notation, the parser will report the type as "NMTOKEN". + * </p> + * + * @param index + * The attribute index (zero-based). + * @return The attribute's type as a string, or null if the + * index is out of range. + * @see #getLength + */ + @Override + public String getType(int index) { + return types.get(index); + } + + /** + * Look up an attribute's value by index. + * <p> + * If the attribute value is a list of tokens (IDREFS, ENTITIES, or NMTOKENS), the tokens will be concatenated into + * a single string with each token separated by a single space. + * </p> + * + * @param index + * The attribute index (zero-based). + * @return The attribute's value as a string, or null if the + * index is out of range. + * @see #getLength + */ + @Override + public String getValue(int index) { + return values.get(index); + } + + //////////////////////////////////////////////////////////////////// + // Name-based query. + //////////////////////////////////////////////////////////////////// + + /** + * Look up the index of an attribute by Namespace name. + * + * @param uri + * The Namespace URI, or the empty string if + * the name has no Namespace URI. + * @param localName + * The attribute's local name. + * @return The index of the attribute, or -1 if it does not + * appear in the list. 
+ */ + @Override + public int getIndex(String uri, String localName) { + for (int i = 0; i < length; i++) { + if (localnames.get(i).equals(localName) && uris.get(i).equals(uri)) { + return i; + } + } + return -1; + } + + /** + * Look up the index of an attribute by XML qualified (prefixed) name. + * + * @param qName + * The qualified (prefixed) name. + * @return The index of the attribute, or -1 if it does not + * appear in the list. + */ + @Override + public int getIndex(String qName) { + for (int i = 0; i < length; i++) { + if (qnames.get(i).equals(qName)) { + return i; + } + } + return -1; + } + + /** + * Look up an attribute's type by Namespace name. + * <p> + * See {@link #getType(int) getType(int)} for a description of the possible types. + * </p> + * + * @param uri + * The Namespace URI, or the empty String if the + * name has no Namespace URI. + * @param localName + * The local name of the attribute. + * @return The attribute type as a string, or null if the + * attribute is not in the list or if Namespace + * processing is not being performed. + */ + @Override + public String getType(String uri, String localName) { + for (int i = 0; i < length; i++) { + if (localnames.get(i).equals(localName) && uris.get(i).equals(uri)) { + return types.get(i); + } + } + return null; + } + + /** + * Look up an attribute's type by XML qualified (prefixed) name. + * <p> + * See {@link #getType(int) getType(int)} for a description of the possible types. + * </p> + * + * @param qName + * The XML qualified name. + * @return The attribute type as a string, or null if the + * attribute is not in the list or if qualified names + * are not available. + */ + @Override + public String getType(String qName) { + for (int i = 0; i < length; i++) { + if (qnames.get(i).equals(qName)) { + return types.get(i); + } + } + return null; + } + + /** + * Look up an attribute's value by Namespace name. + * <p> + * See {@link #getValue(int) getValue(int)} for a description of the possible values. 
+ * </p> + * + * @param uri + * The Namespace URI, or the empty String if the + * name has no Namespace URI. + * @param localName + * The local name of the attribute. + * @return The attribute value as a string, or null if the + * attribute is not in the list. + */ + @Override + public String getValue(String uri, String localName) { + for (int i = 0; i < length; i++) { + if (localnames.get(i).equals(localName) && uris.get(i).equals(uri)) { + return values.get(i); + } + } + return null; + } + + /** + * Look up an attribute's value by XML qualified (prefixed) name. + * <p> + * See {@link #getValue(int) getValue(int)} for a description of the possible values. + * </p> + * + * @param qName + * The XML qualified name. + * @return The attribute value as a string, or null if the + * attribute is not in the list or if qualified names + * are not available. + */ + @Override + public String getValue(String qName) { + for (int i = 0; i < length; i++) { + if (qnames.get(i).equals(qName)) { + return values.get(i); + } + } + return null; + } + +} http://git-wip-us.apache.org/repos/asf/vxquery/blob/1f623b16/vxquery-core/src/main/java/org/apache/vxquery/index/IndexDocumentBuilder.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/index/IndexDocumentBuilder.java b/vxquery-core/src/main/java/org/apache/vxquery/index/IndexDocumentBuilder.java new file mode 100644 index 0000000..2884097 --- /dev/null +++ b/vxquery-core/src/main/java/org/apache/vxquery/index/IndexDocumentBuilder.java @@ -0,0 +1,882 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.vxquery.index; + +import java.io.DataOutput; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.lang3.StringUtils; +import org.apache.hyracks.data.std.api.IPointable; +import org.apache.hyracks.data.std.primitive.BooleanPointable; +import org.apache.hyracks.data.std.primitive.BytePointable; +import org.apache.hyracks.data.std.primitive.DoublePointable; +import org.apache.hyracks.data.std.primitive.FloatPointable; +import org.apache.hyracks.data.std.primitive.IntegerPointable; +import org.apache.hyracks.data.std.primitive.LongPointable; +import org.apache.hyracks.data.std.primitive.ShortPointable; +import org.apache.hyracks.data.std.primitive.UTF8StringPointable; +import org.apache.hyracks.data.std.primitive.VoidPointable; +import org.apache.hyracks.data.std.util.ArrayBackedValueStorage; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.IndexWriter; +import org.apache.vxquery.datamodel.accessors.PointablePool; +import org.apache.vxquery.datamodel.accessors.PointablePoolFactory; +import org.apache.vxquery.datamodel.accessors.SequencePointable; +import org.apache.vxquery.datamodel.accessors.TaggedValuePointable; +import org.apache.vxquery.datamodel.accessors.atomic.CodedQNamePointable; +import org.apache.vxquery.datamodel.accessors.atomic.XSBinaryPointable; +import org.apache.vxquery.datamodel.accessors.atomic.XSDatePointable; +import 
org.apache.vxquery.datamodel.accessors.atomic.XSDateTimePointable; +import org.apache.vxquery.datamodel.accessors.atomic.XSDecimalPointable; +import org.apache.vxquery.datamodel.accessors.atomic.XSDurationPointable; +import org.apache.vxquery.datamodel.accessors.atomic.XSQNamePointable; +import org.apache.vxquery.datamodel.accessors.atomic.XSTimePointable; +import org.apache.vxquery.datamodel.accessors.nodes.AttributeNodePointable; +import org.apache.vxquery.datamodel.accessors.nodes.DocumentNodePointable; +import org.apache.vxquery.datamodel.accessors.nodes.ElementNodePointable; +import org.apache.vxquery.datamodel.accessors.nodes.NodeTreePointable; +import org.apache.vxquery.datamodel.accessors.nodes.TextOrCommentNodePointable; +import org.apache.vxquery.datamodel.values.ValueTag; +import org.apache.vxquery.runtime.functions.cast.CastToStringOperation; +import org.apache.vxquery.serializer.XMLSerializer; + +public class IndexDocumentBuilder extends XMLSerializer { + private final IPointable treePointable; + + private final PointablePool pp; + private NodeTreePointable ntp; + + private final ArrayBackedValueStorage abvs = new ArrayBackedValueStorage(); + private final DataOutput dOut = abvs.getDataOutput(); + private final CastToStringOperation castToString = new CastToStringOperation(); + private final Document doc; + private final List<ComplexItem> results; + + private final byte[] bstart; + private final int sstart; + private final int lstart; + private final IndexWriter writer; + + class ComplexItem { + public final StringField sf; + public final String id; + + public ComplexItem(StringField sfin, String idin) { + sf = sfin; + id = idin; + } + } + + //TODO: Handle Processing Instructions, PrefixedNames, and Namepsace entries + public IndexDocumentBuilder(IPointable tree, IndexWriter inWriter) { + this.treePointable = tree; + writer = inWriter; + + //convert to tagged value pointable + TaggedValuePointable tvp = (TaggedValuePointable) 
TaggedValuePointable.FACTORY.createPointable(); + tvp.set(treePointable.getByteArray(), 0, treePointable.getLength()); + + //get bytes and info from doc pointer + bstart = tvp.getByteArray(); + sstart = tvp.getStartOffset(); + lstart = tvp.getLength(); + + doc = new Document(); + + results = new ArrayList<ComplexItem>(); + + pp = PointablePoolFactory.INSTANCE.createPointablePool(); + } + + //This is a wrapper to start indexing using the functions adapted from XMLSerializer + public void printStart() throws IOException { + + print(bstart, sstart, lstart, "0", ""); + for (int i = 1; i < results.size() - 1; i++) { + //TODO: Since each doc is a file, + //we can only handle files + //small enough to fit in memory + doc.add(results.get(i).sf); + } + writer.addDocument(doc); + + } + + //adapted from XMLSerializer. The following functions are used to traverse the TaggedValuePointable + //and create the index elements, then create the item for the lucene index + public void print(byte[] b, int s, int l, String deweyId, String epath) throws IOException { + TaggedValuePointable tvp = pp.takeOne(TaggedValuePointable.class); + try { + tvp.set(b, s, l); + printTaggedValuePointable(tvp, deweyId, epath); + } finally { + pp.giveBack(tvp); + } + } + + private void printTaggedValuePointable(TaggedValuePointable tvp, String deweyId, String epath) throws IOException { + byte tag = tvp.getTag(); + String type = "text"; + String[] result = { "", "" }; + switch ((int) tag) { + case ValueTag.XS_ANY_URI_TAG: + result = printString(tvp, epath); + break; + + case ValueTag.XS_BASE64_BINARY_TAG: + result = printBase64Binary(tvp, epath); + break; + + case ValueTag.XS_BOOLEAN_TAG: + result = printBoolean(tvp, epath); + break; + + case ValueTag.XS_DATE_TAG: + result = printDate(tvp, epath); + break; + + case ValueTag.XS_DATETIME_TAG: + result = printDateTime(tvp, epath); + break; + + case ValueTag.XS_DAY_TIME_DURATION_TAG: + result = printDTDuration(tvp, epath); + break; + + case 
ValueTag.XS_BYTE_TAG: + result = printByte(tvp, epath); + break; + + case ValueTag.XS_DECIMAL_TAG: + result = printDecimal(tvp, epath); + break; + + case ValueTag.XS_DOUBLE_TAG: + result = printDouble(tvp, epath); + break; + + case ValueTag.XS_DURATION_TAG: + result = printDuration(tvp, epath); + break; + + case ValueTag.XS_FLOAT_TAG: + result = printFloat(tvp, epath); + break; + + case ValueTag.XS_G_DAY_TAG: + result = printGDay(tvp, epath); + break; + + case ValueTag.XS_G_MONTH_TAG: + result = printGMonth(tvp, epath); + break; + + case ValueTag.XS_G_MONTH_DAY_TAG: + result = printGMonthDay(tvp, epath); + break; + + case ValueTag.XS_G_YEAR_TAG: + result = printGYear(tvp, epath); + break; + + case ValueTag.XS_G_YEAR_MONTH_TAG: + result = printGYearMonth(tvp, epath); + break; + + case ValueTag.XS_HEX_BINARY_TAG: + result = printHexBinary(tvp, epath); + break; + + case ValueTag.XS_INT_TAG: + case ValueTag.XS_UNSIGNED_SHORT_TAG: + result = printInt(tvp, epath); + break; + + case ValueTag.XS_INTEGER_TAG: + case ValueTag.XS_LONG_TAG: + case ValueTag.XS_NEGATIVE_INTEGER_TAG: + case ValueTag.XS_NON_POSITIVE_INTEGER_TAG: + case ValueTag.XS_NON_NEGATIVE_INTEGER_TAG: + case ValueTag.XS_POSITIVE_INTEGER_TAG: + case ValueTag.XS_UNSIGNED_INT_TAG: + case ValueTag.XS_UNSIGNED_LONG_TAG: + result = printInteger(tvp, epath); + break; + + case ValueTag.XS_NOTATION_TAG: + result = printString(tvp, epath); + break; + + case ValueTag.XS_QNAME_TAG: + result = printQName(tvp, epath); + break; + + case ValueTag.XS_SHORT_TAG: + case ValueTag.XS_UNSIGNED_BYTE_TAG: + result = printShort(tvp, epath); + break; + + case ValueTag.XS_STRING_TAG: + case ValueTag.XS_NORMALIZED_STRING_TAG: + case ValueTag.XS_TOKEN_TAG: + case ValueTag.XS_LANGUAGE_TAG: + case ValueTag.XS_NMTOKEN_TAG: + case ValueTag.XS_NAME_TAG: + case ValueTag.XS_NCNAME_TAG: + case ValueTag.XS_ID_TAG: + case ValueTag.XS_IDREF_TAG: + case ValueTag.XS_ENTITY_TAG: + result = printString(tvp, epath); + break; + + case 
ValueTag.XS_TIME_TAG: + result = printTime(tvp, epath); + break; + + case ValueTag.XS_UNTYPED_ATOMIC_TAG: + result = printString(tvp, epath); + break; + + case ValueTag.XS_YEAR_MONTH_DURATION_TAG: + result = printYMDuration(tvp, epath); + break; + + case ValueTag.ATTRIBUTE_NODE_TAG: + type = "attribute"; + printAttributeNode(tvp, deweyId, epath); + break; + + case ValueTag.TEXT_NODE_TAG: + type = "textnode"; + result = printTextNode(tvp, epath); + break; + + case ValueTag.COMMENT_NODE_TAG: + type = "comment"; + result = printCommentNode(tvp, epath); + break; + + case ValueTag.SEQUENCE_TAG: + type = "sequence"; + printSequence(tvp, deweyId, epath); + break; + + case ValueTag.NODE_TREE_TAG: + type = "tree"; + printNodeTree(tvp, deweyId, epath); + break; + + case ValueTag.ELEMENT_NODE_TAG: + type = "element"; + printElementNode(tvp, deweyId, epath); + break; + + case ValueTag.DOCUMENT_NODE_TAG: + type = "doc"; + buildIndexItem(deweyId, type, result, epath); + printDocumentNode(tvp, deweyId, epath); + break; + + default: + throw new UnsupportedOperationException("Encountered tag: " + tvp.getTag()); + } + if ((int) tag != ValueTag.DOCUMENT_NODE_TAG && (int) tag != ValueTag.SEQUENCE_TAG + && (int) tag != ValueTag.NODE_TREE_TAG && (int) tag != ValueTag.ELEMENT_NODE_TAG + && (int) tag != ValueTag.ATTRIBUTE_NODE_TAG) { + buildIndexItem(deweyId, type, result, epath); + } + + } + + private void buildIndexItem(String deweyId, String type, String[] result, String parentPath) { + //Create an Index element + IndexElement test = new IndexElement(deweyId, type, result[1]); + + String path = test.epath(); + + path = StringUtils.replace(path, parentPath, ""); + //Parser doesn't like / so paths are saved as name.name.... + String luceneParentPath = parentPath.replaceAll("/", "."); + + if (!type.equals("doc")) { + path = path.replaceFirst("/", ":"); + } else { + luceneParentPath = ""; + } + //Parser doesn't like / so paths are saved as name.name.... 
+ path = path.replaceAll("/", "."); + //Add this element to the array (they will be added in reverse order. + String fullItem = luceneParentPath + path + "." + test.type(); + + results.add(new ComplexItem(new StringField("item", fullItem, Field.Store.YES), test.id())); + } + + private String[] printDecimal(TaggedValuePointable tvp, String path) throws IOException { + String[] result = { "", path }; + XSDecimalPointable dp = pp.takeOne(XSDecimalPointable.class); + try { + tvp.getValue(dp); + abvs.reset(); + castToString.convertDecimal(dp, dOut); + result = printStringAbvs(path); + } catch (Exception e) { + throw new IOException(e); + } finally { + pp.giveBack(dp); + } + return result; + } + + private void printNodeTree(TaggedValuePointable tvp, String deweyId, String path) throws IOException { + if (ntp != null) { + throw new IllegalStateException("Nested NodeTreePointable found"); + } + ntp = pp.takeOne(NodeTreePointable.class); + TaggedValuePointable rootTVP = pp.takeOne(TaggedValuePointable.class); + try { + tvp.getValue(ntp); + ntp.getRootNode(rootTVP); + printTaggedValuePointable(rootTVP, deweyId, path); + } finally { + pp.giveBack(rootTVP); + pp.giveBack(ntp); + ntp = null; + } + } + + private String[] printCommentNode(TaggedValuePointable tvp, String path) { + String[] result = { "", path }; + TextOrCommentNodePointable tcnp = pp.takeOne(TextOrCommentNodePointable.class); + UTF8StringPointable utf8sp = pp.takeOne(UTF8StringPointable.class); + try { + tvp.getValue(tcnp); + tcnp.getValue(ntp, utf8sp); + + result = printString(utf8sp, path); + + } finally { + pp.giveBack(tcnp); + pp.giveBack(utf8sp); + } + return result; + } + + private String[] printTextNode(TaggedValuePointable tvp, String path) { + String[] result = { "", path }; + TextOrCommentNodePointable tcnp = pp.takeOne(TextOrCommentNodePointable.class); + UTF8StringPointable utf8sp = pp.takeOne(UTF8StringPointable.class); + try { + tvp.getValue(tcnp); + tcnp.getValue(ntp, utf8sp); + result = 
printString(utf8sp, path); + } finally { + pp.giveBack(tcnp); + pp.giveBack(utf8sp); + } + return result; + } + + private void printAttributeNode(TaggedValuePointable tvp, String deweyId, String path) throws IOException { + String[] result = { "", path }; + AttributeNodePointable anp = pp.takeOne(AttributeNodePointable.class); + CodedQNamePointable cqp = pp.takeOne(CodedQNamePointable.class); + UTF8StringPointable utf8sp = pp.takeOne(UTF8StringPointable.class); + TaggedValuePointable valueTVP = pp.takeOne(TaggedValuePointable.class); + try { + tvp.getValue(anp); + anp.getName(cqp); + result = printPrefixedQName(cqp, utf8sp, path); + buildIndexItem(deweyId, "attribute", result, path); + + anp.getValue(ntp, valueTVP); + + String attributeValueId = deweyId + ".0"; + printTaggedValuePointable(valueTVP, attributeValueId, result[1]); + + } finally { + pp.giveBack(valueTVP); + pp.giveBack(utf8sp); + pp.giveBack(anp); + pp.giveBack(cqp); + } + } + + private void printElementNode(TaggedValuePointable tvp, String deweyId, String path) throws IOException { + String[] result = { "", path }; + ElementNodePointable enp = pp.takeOne(ElementNodePointable.class); + CodedQNamePointable cqp = pp.takeOne(CodedQNamePointable.class); + UTF8StringPointable utf8sp = pp.takeOne(UTF8StringPointable.class); + SequencePointable seqp = pp.takeOne(SequencePointable.class); + try { + tvp.getValue(enp); + enp.getName(cqp); + result = printPrefixedQName(cqp, utf8sp, path); + buildIndexItem(deweyId, "element", result, path); + + enp.getAttributeSequence(ntp, seqp); + int numattributes = 0; + if (seqp.getByteArray() != null && seqp.getEntryCount() > 0) { + printSequence(seqp, deweyId, 0, result[1]); + numattributes = seqp.getEntryCount(); + } + + enp.getChildrenSequence(ntp, seqp); + if (seqp.getByteArray() != null) { + printSequence(seqp, deweyId, numattributes, result[1]); + } + + } finally { + pp.giveBack(seqp); + pp.giveBack(utf8sp); + pp.giveBack(cqp); + pp.giveBack(enp); + } + } + + private 
String[] printPrefixedQName(CodedQNamePointable cqp, UTF8StringPointable utf8sp, String path) { + ntp.getString(cqp.getLocalCode(), utf8sp); + return printString(utf8sp, path); + } + + private void printDocumentNode(TaggedValuePointable tvp, String deweyId, String path) throws IOException { + DocumentNodePointable dnp = pp.takeOne(DocumentNodePointable.class); + SequencePointable seqp = pp.takeOne(SequencePointable.class); + try { + tvp.getValue(dnp); + dnp.getContent(ntp, seqp); + printSequence(seqp, deweyId, 0, path); + } finally { + pp.giveBack(seqp); + pp.giveBack(dnp); + } + } + + private void printSequence(TaggedValuePointable tvp, String deweyId, String path) throws IOException { + SequencePointable seqp = pp.takeOne(SequencePointable.class); + try { + tvp.getValue(seqp); + printSequence(seqp, deweyId, 0, path); + } finally { + pp.giveBack(seqp); + } + } + + private void printSequence(SequencePointable seqp, String deweyId, int addon, String path) throws IOException { + VoidPointable vp = pp.takeOne(VoidPointable.class); + try { + int len = seqp.getEntryCount(); + for (int i = 0; i < len; ++i) { + int location = i + addon; + String childID = deweyId + "." 
+ Integer.toString(location); + seqp.getEntry(i, vp); + print(vp.getByteArray(), vp.getStartOffset(), vp.getLength(), childID, path); + } + } finally { + pp.giveBack(vp); + } + } + + private String[] printBase64Binary(TaggedValuePointable tvp, String path) throws IOException { + String[] result = { "", path }; + XSBinaryPointable bp = pp.takeOne(XSBinaryPointable.class); + try { + tvp.getValue(bp); + abvs.reset(); + castToString.convertBase64Binary(bp, dOut); + result = printStringAbvs(path); + } catch (Exception e) { + throw new IOException(e); + } finally { + pp.giveBack(bp); + } + return result; + } + + private String[] printBoolean(TaggedValuePointable tvp, String path) { + String[] result = { "", path }; + BooleanPointable bp = pp.takeOne(BooleanPointable.class); + try { + tvp.getValue(bp); + result[0] = Boolean.toString(bp.getBoolean()); + result[1] = path + "/" + result[0]; + } finally { + pp.giveBack(bp); + } + return result; + } + + private String[] printByte(TaggedValuePointable tvp, String path) { + String[] result = { "", path }; + BytePointable bp = pp.takeOne(BytePointable.class); + try { + tvp.getValue(bp); + result[0] = Byte.toString(bp.byteValue()); + result[1] = path + "/" + result[0]; + } finally { + pp.giveBack(bp); + } + return result; + } + + private String[] printDouble(TaggedValuePointable tvp, String path) throws IOException { + String[] result = { "", path }; + DoublePointable dp = pp.takeOne(DoublePointable.class); + try { + tvp.getValue(dp); + abvs.reset(); + castToString.convertDouble(dp, dOut); + result = printStringAbvs(path); + } catch (Exception e) { + throw new IOException(e); + } finally { + pp.giveBack(dp); + } + return result; + } + + private String[] printDate(TaggedValuePointable tvp, String path) throws IOException { + String[] result = { "", path }; + XSDatePointable dp = pp.takeOne(XSDatePointable.class); + try { + tvp.getValue(dp); + abvs.reset(); + castToString.convertDate(dp, dOut); + result = printStringAbvs(path); + } 
catch (Exception e) { + throw new IOException(e); + } finally { + pp.giveBack(dp); + } + return result; + } + + private String[] printDateTime(TaggedValuePointable tvp, String path) throws IOException { + String[] result = { "", path }; + XSDateTimePointable dtp = pp.takeOne(XSDateTimePointable.class); + try { + tvp.getValue(dtp); + abvs.reset(); + castToString.convertDatetime(dtp, dOut); + result = printStringAbvs(path); + } catch (Exception e) { + throw new IOException(e); + } finally { + pp.giveBack(dtp); + } + return result; + } + + private String[] printDTDuration(TaggedValuePointable tvp, String path) throws IOException { + String[] result = { "", path }; + LongPointable lp = pp.takeOne(LongPointable.class); + try { + tvp.getValue(lp); + abvs.reset(); + castToString.convertDTDuration(lp, dOut); + result = printStringAbvs(path); + } catch (Exception e) { + throw new IOException(e); + } finally { + pp.giveBack(lp); + } + return result; + } + + private String[] printDuration(TaggedValuePointable tvp, String path) throws IOException { + String[] result = { "", path }; + XSDurationPointable dp = pp.takeOne(XSDurationPointable.class); + try { + tvp.getValue(dp); + abvs.reset(); + castToString.convertDuration(dp, dOut); + result = printStringAbvs(path); + } catch (Exception e) { + throw new IOException(e); + } finally { + pp.giveBack(dp); + } + return result; + } + + private String[] printFloat(TaggedValuePointable tvp, String path) throws IOException { + String[] result = { "", path }; + FloatPointable fp = pp.takeOne(FloatPointable.class); + try { + tvp.getValue(fp); + abvs.reset(); + castToString.convertFloat(fp, dOut); + result = printStringAbvs(path); + } catch (Exception e) { + throw new IOException(e); + } finally { + pp.giveBack(fp); + } + return result; + } + + private String[] printGDay(TaggedValuePointable tvp, String path) throws IOException { + String[] result = { "", path }; + XSDatePointable dp = pp.takeOne(XSDatePointable.class); + try { + 
tvp.getValue(dp); + abvs.reset(); + castToString.convertGDay(dp, dOut); + result = printStringAbvs(path); + } catch (Exception e) { + throw new IOException(e); + } finally { + pp.giveBack(dp); + } + return result; + } + + private String[] printGMonth(TaggedValuePointable tvp, String path) throws IOException { + String[] result = { "", path }; + XSDatePointable dp = pp.takeOne(XSDatePointable.class); + try { + tvp.getValue(dp); + abvs.reset(); + castToString.convertGMonth(dp, dOut); + result = printStringAbvs(path); + } catch (Exception e) { + throw new IOException(e); + } finally { + pp.giveBack(dp); + } + return result; + } + + private String[] printGMonthDay(TaggedValuePointable tvp, String path) throws IOException { + String[] result = { "", path }; + XSDatePointable dp = pp.takeOne(XSDatePointable.class); + try { + tvp.getValue(dp); + abvs.reset(); + castToString.convertGMonthDay(dp, dOut); + result = printStringAbvs(path); + } catch (Exception e) { + throw new IOException(e); + } finally { + pp.giveBack(dp); + } + return result; + } + + private String[] printGYear(TaggedValuePointable tvp, String path) throws IOException { + String[] result = { "", path }; + XSDatePointable dp = pp.takeOne(XSDatePointable.class); + try { + tvp.getValue(dp); + abvs.reset(); + castToString.convertGYear(dp, dOut); + result = printStringAbvs(path); + } catch (Exception e) { + throw new IOException(e); + } finally { + pp.giveBack(dp); + } + return result; + } + + private String[] printGYearMonth(TaggedValuePointable tvp, String path) throws IOException { + String[] result = { "", path }; + XSDatePointable dp = pp.takeOne(XSDatePointable.class); + try { + tvp.getValue(dp); + abvs.reset(); + castToString.convertGYearMonth(dp, dOut); + result = printStringAbvs(path); + } catch (Exception e) { + throw new IOException(e); + } finally { + pp.giveBack(dp); + } + return result; + } + + private String[] printHexBinary(TaggedValuePointable tvp, String path) throws IOException { + String[] 
result = { "", path }; + XSBinaryPointable bp = pp.takeOne(XSBinaryPointable.class); + try { + tvp.getValue(bp); + abvs.reset(); + castToString.convertHexBinary(bp, dOut); + result = printStringAbvs(path); + } catch (Exception e) { + throw new IOException(e); + } finally { + pp.giveBack(bp); + } + return result; + } + + private String[] printInt(TaggedValuePointable tvp, String path) { + String[] result = { "", path }; + IntegerPointable ip = pp.takeOne(IntegerPointable.class); + try { + tvp.getValue(ip); + result[0] = Integer.toString(ip.intValue()); + result[1] = path + "/" + result[0]; + } finally { + pp.giveBack(ip); + } + return result; + } + + private String[] printInteger(TaggedValuePointable tvp, String path) { + String[] result = { "", path }; + LongPointable lp = pp.takeOne(LongPointable.class); + try { + tvp.getValue(lp); + result[0] = Long.toString(lp.longValue()); + result[1] = path + "/" + result[0]; + } finally { + pp.giveBack(lp); + } + return result; + } + + private String[] printShort(TaggedValuePointable tvp, String path) { + ShortPointable sp = pp.takeOne(ShortPointable.class); + String[] result = { "", path }; + try { + tvp.getValue(sp); + result[0] = Short.toString(sp.shortValue()); + result[1] = path + "/" + result[0]; + } finally { + pp.giveBack(sp); + } + return result; + } + + private String[] printQName(TaggedValuePointable tvp, String path) throws IOException { + XSQNamePointable dp = pp.takeOne(XSQNamePointable.class); + String[] result = { "", path }; + try { + tvp.getValue(dp); + abvs.reset(); + castToString.convertQName(dp, dOut); + result = printStringAbvs(path); + } catch (Exception e) { + throw new IOException(e); + } finally { + pp.giveBack(dp); + } + return result; + } + + private String[] printStringAbvs(String path) { + UTF8StringPointable utf8sp = pp.takeOne(UTF8StringPointable.class); + String[] result = { "", path }; + try { + utf8sp.set(abvs.getByteArray(), abvs.getStartOffset() + 1, abvs.getLength() - 1); + result = 
printString(utf8sp, path); + } finally { + pp.giveBack(utf8sp); + } + return result; + } + + private String[] printString(TaggedValuePointable tvp, String path) { + UTF8StringPointable utf8sp = pp.takeOne(UTF8StringPointable.class); + String[] result = { "", path }; + try { + tvp.getValue(utf8sp); + result = printString(utf8sp, path); + } finally { + pp.giveBack(utf8sp); + } + return result; + } + + private String[] printString(UTF8StringPointable utf8sp, String path) { + int utfLen = utf8sp.getUTFLength(); + int offset = 2; + String[] result = { "", path }; + while (utfLen > 0) { + char c = utf8sp.charAt(offset); + switch (c) { + case '<': + result[0] += "&lt;"; + break; + + case '>': + result[0] += "&gt;"; + break; + + case '&': + result[0] += "&amp;"; + break; + + case '"': + result[0] += "&quot;"; + break; + + case '\'': + result[0] += "&apos;"; + break; + + default: + result[0] += Character.toString(c); + break; + } + int cLen = UTF8StringPointable.getModifiedUTF8Len(c); + offset += cLen; + utfLen -= cLen; + + } + result[1] = path + "/" + result[0]; + return result; + } + + private String[] printTime(TaggedValuePointable tvp, String path) throws IOException { + XSTimePointable tp = pp.takeOne(XSTimePointable.class); + String[] result = { "", path }; + try { + tvp.getValue(tp); + abvs.reset(); + castToString.convertTime(tp, dOut); + result = printStringAbvs(path); + } catch (Exception e) { + throw new IOException(e); + } finally { + pp.giveBack(tp); + } + return result; + } + + private String[] printYMDuration(TaggedValuePointable tvp, String path) throws IOException { + IntegerPointable ip = pp.takeOne(IntegerPointable.class); + String[] result = { "", path }; + try { + tvp.getValue(ip); + abvs.reset(); + castToString.convertYMDuration(ip, dOut); + result = printStringAbvs(path); + } catch (Exception e) { + throw new IOException(e); + } finally { + pp.giveBack(ip); + } + return result; + } + +} \ No newline at end of file 
http://git-wip-us.apache.org/repos/asf/vxquery/blob/1f623b16/vxquery-core/src/main/java/org/apache/vxquery/index/IndexElement.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/index/IndexElement.java b/vxquery-core/src/main/java/org/apache/vxquery/index/IndexElement.java new file mode 100644 index 0000000..d2487a5 --- /dev/null +++ b/vxquery-core/src/main/java/org/apache/vxquery/index/IndexElement.java @@ -0,0 +1,42 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ +package org.apache.vxquery.index; + +public class IndexElement { + private String id; + private String type; + private String elementpath; + + public IndexElement(String id, String type, String elementpath) { + this.id = id; + this.type = type; + this.elementpath = elementpath; + } + + public String id() { + return id; + } + + public String type() { + return type; + } + + public String epath() { + return elementpath; + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/1f623b16/vxquery-core/src/main/java/org/apache/vxquery/metadata/VXQueryCollectionOperatorDescriptor.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/metadata/VXQueryCollectionOperatorDescriptor.java b/vxquery-core/src/main/java/org/apache/vxquery/metadata/VXQueryCollectionOperatorDescriptor.java index b8dca63..ef51cee 100644 --- a/vxquery-core/src/main/java/org/apache/vxquery/metadata/VXQueryCollectionOperatorDescriptor.java +++ b/vxquery-core/src/main/java/org/apache/vxquery/metadata/VXQueryCollectionOperatorDescriptor.java @@ -18,8 +18,6 @@ package org.apache.vxquery.metadata; import java.io.ByteArrayInputStream; import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; import java.io.InputStream; import java.net.InetAddress; import java.nio.ByteBuffer; @@ -31,8 +29,6 @@ import java.util.Map; import java.util.logging.Level; import java.util.logging.Logger; -import javax.xml.parsers.ParserConfigurationException; - import org.apache.commons.io.FileUtils; import org.apache.commons.io.filefilter.TrueFileFilter; import org.apache.commons.lang.StringUtils; @@ -67,7 +63,6 @@ import org.apache.vxquery.hdfs2.HDFSFunctions; import org.apache.vxquery.xmlparser.ITreeNodeIdProvider; import org.apache.vxquery.xmlparser.TreeNodeIdProvider; import org.apache.vxquery.xmlparser.XMLParser; -import org.xml.sax.SAXException; public class 
VXQueryCollectionOperatorDescriptor extends AbstractSingleActivityOperatorDescriptor { private static final long serialVersionUID = 1L; @@ -179,50 +174,34 @@ public class VXQueryCollectionOperatorDescriptor extends AbstractSingleActivityO for (int i = 0; i < size; i++) { //read split context = ctxFactory.createContext(job.getConfiguration(), i); - try { - reader = inputFormat.createRecordReader(inputSplits.get(i), context); - reader.initialize(inputSplits.get(i), context); - while (reader.nextKeyValue()) { - value = reader.getCurrentValue().toString(); - //Split value if it contains more than one item with the tag - if (StringUtils.countMatches(value, tag) > 1) { - String items[] = value.split(tag); - for (String item : items) { - if (item.length() > 0) { - item = START_TAG + tag + item; - stream = new ByteArrayInputStream( - item.getBytes(StandardCharsets.UTF_8)); - parser.parseHDFSElements(stream, writer, fta, i); - } + + reader = inputFormat.createRecordReader(inputSplits.get(i), context); + reader.initialize(inputSplits.get(i), context); + while (reader.nextKeyValue()) { + value = reader.getCurrentValue().toString(); + //Split value if it contains more than one item with the tag + if (StringUtils.countMatches(value, tag) > 1) { + String items[] = value.split(tag); + for (String item : items) { + if (item.length() > 0) { + item = START_TAG + tag + item; + stream = new ByteArrayInputStream( + item.getBytes(StandardCharsets.UTF_8)); + parser.parseHDFSElements(stream, writer, fta, i); } - } else { - value = START_TAG + value; - //create an input stream to the file currently reading and send it to parser - stream = new ByteArrayInputStream( - value.getBytes(StandardCharsets.UTF_8)); - parser.parseHDFSElements(stream, writer, fta, i); } - } - - } catch (InterruptedException e) { - if (LOGGER.isLoggable(Level.SEVERE)) { - LOGGER.severe(e.getMessage()); + } else { + value = START_TAG + value; + //create an input stream to the file currently reading and send it to 
parser + stream = new ByteArrayInputStream(value.getBytes(StandardCharsets.UTF_8)); + parser.parseHDFSElements(stream, writer, fta, i); } } - } - } catch (IOException e) { - if (LOGGER.isLoggable(Level.SEVERE)) { - LOGGER.severe(e.getMessage()); - } - } catch (ParserConfigurationException e) { - if (LOGGER.isLoggable(Level.SEVERE)) { - LOGGER.severe(e.getMessage()); - } - } catch (SAXException e) { - if (LOGGER.isLoggable(Level.SEVERE)) { - LOGGER.severe(e.getMessage()); } + + } catch (Exception e) { + throw new HyracksDataException(e); } } else { try { @@ -248,22 +227,14 @@ public class VXQueryCollectionOperatorDescriptor extends AbstractSingleActivityO throw new HyracksDataException("Invalid HDFS directory parameter (" + nodeId + ":" + directory + ") passed to collection."); } - } catch (FileNotFoundException e) { - if (LOGGER.isLoggable(Level.SEVERE)) { - LOGGER.severe(e.getMessage()); - } - } catch (IOException e) { - if (LOGGER.isLoggable(Level.SEVERE)) { - LOGGER.severe(e.getMessage()); - } + } catch (Exception e) { + throw new HyracksDataException(e); } } try { fs.close(); - } catch (IOException e) { - if (LOGGER.isLoggable(Level.SEVERE)) { - LOGGER.severe(e.getMessage()); - } + } catch (Exception e) { + throw new HyracksDataException(e); } } } http://git-wip-us.apache.org/repos/asf/vxquery/blob/1f623b16/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CaseSensitiveAnalyzer.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CaseSensitiveAnalyzer.java b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CaseSensitiveAnalyzer.java new file mode 100644 index 0000000..803aeee --- /dev/null +++ b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CaseSensitiveAnalyzer.java @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.vxquery.runtime.functions.index; + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.StopAnalyzer; +import org.apache.lucene.analysis.core.StopFilter; +import org.apache.lucene.analysis.standard.ClassicAnalyzer; +import org.apache.lucene.analysis.standard.ClassicTokenizer; +import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.standard.std40.StandardTokenizer40; +import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.analysis.util.StopwordAnalyzerBase; +import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.Version; + +/** + * Filters {@link StandardTokenizer} with {@link StandardFilter}, + * and {@link StopFilter}, using a list of + * English stop words. + * <a name="version"/> + * <p> + * You must specify the required {@link Version} + * compatibility when creating StandardAnalyzer: + * <ul> + * <li>As of 3.4, Hiragana and Han characters are no longer wrongly split + * from their combining characters. 
If you use a previous version number, + * you get the exact broken behavior for backwards compatibility. + * <li>As of 3.1, StandardTokenizer implements Unicode text segmentation, + * and StopFilter correctly handles Unicode 4.0 supplementary characters + * in stopwords. {@link ClassicTokenizer} and {@link ClassicAnalyzer} + * are the pre-3.1 implementations of StandardTokenizer and + * StandardAnalyzer. + * <li>As of 2.9, StopFilter preserves position increments + * <li>As of 2.4, Tokens incorrectly identified as acronyms + * are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>) + * </ul> + */ +public final class CaseSensitiveAnalyzer extends StopwordAnalyzerBase { + + /** Default maximum allowed token length */ + public static final int DEFAULT_MAX_TOKEN_LENGTH = 255; + + private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; + + /** + * An unmodifiable set containing some common English words that are usually not + * useful for searching. + */ + public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET; + + /** + * Builds an analyzer with the given stop words. + * + * @param stopWords + * stop words + */ + public CaseSensitiveAnalyzer(CharArraySet stopWords) { + super(stopWords); + } + + /** + * Builds an analyzer with the default stop words ({@link #STOP_WORDS_SET}). + */ + public CaseSensitiveAnalyzer() { + this(STOP_WORDS_SET); + } + + /** + * Builds an analyzer with the stop words from the given reader. + * + * @see WordlistLoader#getWordSet(Reader) + * @param stopwords + * Reader to read stop words from + */ + public CaseSensitiveAnalyzer(Reader stopwords) throws IOException { + this(loadStopwordSet(stopwords)); + } + + /** + * Set maximum allowed token length. If a token is seen + * that exceeds this length then it is discarded. This + * setting only takes effect the next time + * tokenStream is called. 
+ */ + public void setMaxTokenLength(int length) { + maxTokenLength = length; + } + + /** + * @see #setMaxTokenLength + */ + public int getMaxTokenLength() { + return maxTokenLength; + } + + @Override + protected TokenStreamComponents createComponents(final String fieldName) { + final Tokenizer src; + if (getVersion().onOrAfter(Version.LUCENE_4_7_0)) { + StandardTokenizer t = new StandardTokenizer(); + t.setMaxTokenLength(maxTokenLength); + src = t; + } else { + StandardTokenizer40 t = new StandardTokenizer40(); + t.setMaxTokenLength(maxTokenLength); + src = t; + } + TokenStream tok = new StandardFilter(src); + tok = new StopFilter(tok, stopwords); + return new TokenStreamComponents(src, tok) { + @Override + protected void setReader(final Reader reader) { + int m = CaseSensitiveAnalyzer.this.maxTokenLength; + if (src instanceof StandardTokenizer) { + ((StandardTokenizer) src).setMaxTokenLength(m); + } else { + ((StandardTokenizer40) src).setMaxTokenLength(m); + } + super.setReader(reader); + } + }; + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/1f623b16/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CaseSensitiveQueryParser.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CaseSensitiveQueryParser.java b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CaseSensitiveQueryParser.java new file mode 100644 index 0000000..7cb0a18 --- /dev/null +++ b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CaseSensitiveQueryParser.java @@ -0,0 +1,42 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. 
+* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +package org.apache.vxquery.runtime.functions.index; + +import java.io.StringReader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.index.Term; +import org.apache.lucene.queryparser.classic.FastCharStream; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.search.Query; + +public class CaseSensitiveQueryParser extends QueryParser { + + public CaseSensitiveQueryParser(String f, Analyzer a) { + super(new FastCharStream(new StringReader(""))); + init(f, a); + } + + @Override + protected Query getPrefixQuery(String field, String termStr) throws ParseException { + if (!getAllowLeadingWildcard() && termStr.startsWith("*")) + throw new ParseException("'*' not allowed as first character in PrefixQuery"); + Term t = new Term(field, termStr); + return newPrefixQuery(t); + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/1f623b16/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CollectionFromIndexUnnestingEvaluatorFactory.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CollectionFromIndexUnnestingEvaluatorFactory.java 
b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CollectionFromIndexUnnestingEvaluatorFactory.java new file mode 100644 index 0000000..cf0b203 --- /dev/null +++ b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CollectionFromIndexUnnestingEvaluatorFactory.java @@ -0,0 +1,327 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.vxquery.runtime.functions.index; + +import java.io.DataInputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.hyracks.algebricks.common.exceptions.AlgebricksException; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory; +import org.apache.hyracks.algebricks.runtime.base.IUnnestingEvaluator; +import org.apache.hyracks.api.context.IHyracksTaskContext; +import org.apache.hyracks.data.std.api.IPointable; +import org.apache.hyracks.data.std.primitive.UTF8StringPointable; +import org.apache.hyracks.data.std.util.ArrayBackedValueStorage; +import org.apache.hyracks.dataflow.common.comm.util.ByteBufferInputStream; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.FSDirectory; +import org.apache.vxquery.datamodel.accessors.TaggedValuePointable; +import org.apache.vxquery.datamodel.values.ValueTag; +import org.apache.vxquery.exceptions.ErrorCode; +import org.apache.vxquery.exceptions.SystemException; +import org.apache.vxquery.index.IndexAttributes; +import org.apache.vxquery.runtime.functions.base.AbstractTaggedValueArgumentUnnestingEvaluator; +import org.apache.vxquery.runtime.functions.base.AbstractTaggedValueArgumentUnnestingEvaluatorFactory; +import org.apache.vxquery.xmlparser.ITreeNodeIdProvider; +import org.apache.vxquery.xmlparser.SAXContentHandler; +import 
org.apache.vxquery.xmlparser.TreeNodeIdProvider; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; + +public class CollectionFromIndexUnnestingEvaluatorFactory extends AbstractTaggedValueArgumentUnnestingEvaluatorFactory { + private static final long serialVersionUID = 1L; + + public CollectionFromIndexUnnestingEvaluatorFactory(IScalarEvaluatorFactory[] args) { + super(args); + } + + @Override + protected IUnnestingEvaluator createEvaluator(IHyracksTaskContext ctx, IScalarEvaluator[] args) + throws AlgebricksException { + + return new AbstractTaggedValueArgumentUnnestingEvaluator(args) { + + private ArrayBackedValueStorage nodeAbvs = new ArrayBackedValueStorage(); + + private int indexPlace; + private int indexLength; + private String elementPath; + private String indexName; + + private UTF8StringPointable stringIndexFolder = (UTF8StringPointable) UTF8StringPointable.FACTORY + .createPointable(); + private UTF8StringPointable stringElementPath = (UTF8StringPointable) UTF8StringPointable.FACTORY + .createPointable(); + private ByteBufferInputStream bbis = new ByteBufferInputStream(); + private DataInputStream di = new DataInputStream(bbis); + + private IndexReader reader; + private IndexSearcher searcher; + private Analyzer analyzer; + private QueryParser parser; + private ScoreDoc[] hits; + private SAXContentHandler handler; + private Query query; + private Document doc; + private List<IndexableField> fields; + + @Override + public boolean step(IPointable result) throws AlgebricksException { + /* each step will create a tuple for a single xml file + * This is done using the parse function + * checkoverflow is used throughout. 
This is because memory might not be + * able to hold all of the results at once, so we return 1 million at + * a time and check when we need to get more + */ + if (indexPlace < indexLength) { + nodeAbvs.reset(); + try { + //TODO: now we get back the entire document + doc = searcher.doc(hits[indexPlace].doc); + fields = doc.getFields(); + parse(nodeAbvs); + } catch (IOException e) { + throw new AlgebricksException(e); + } + indexPlace += 1; + result.set(nodeAbvs.getByteArray(), nodeAbvs.getStartOffset(), nodeAbvs.getLength()); + return true; + } + return false; + } + + @Override + protected void init(TaggedValuePointable[] args) throws SystemException { + + int partition = ctxview.getTaskAttemptId().getTaskId().getPartition(); + ITreeNodeIdProvider nodeIdProvider = new TreeNodeIdProvider((short) partition); + handler = new SAXContentHandler(false, nodeIdProvider, true); + + nodeAbvs.reset(); + indexPlace = 0; + TaggedValuePointable tvp1 = args[0]; + TaggedValuePointable tvp2 = args[1]; + + if (tvp1.getTag() != ValueTag.XS_STRING_TAG || tvp2.getTag() != ValueTag.XS_STRING_TAG) { + throw new SystemException(ErrorCode.FORG0006); + } + tvp1.getValue(stringIndexFolder); + tvp2.getValue(stringElementPath); + //This whole block is to get the string arguments: index folder and element path + try { + // Get the list of files. + bbis.setByteBuffer(ByteBuffer.wrap( + Arrays.copyOfRange(stringIndexFolder.getByteArray(), stringIndexFolder.getStartOffset(), + stringIndexFolder.getLength() + stringIndexFolder.getStartOffset())), + 0); + indexName = di.readUTF(); + bbis.setByteBuffer(ByteBuffer.wrap( + Arrays.copyOfRange(stringElementPath.getByteArray(), stringElementPath.getStartOffset(), + stringElementPath.getLength() + stringElementPath.getStartOffset())), + 0); + elementPath = di.readUTF(); + + indexPlace = 0; + + //Create the index reader. 
+ reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexName))); + } catch (IOException e) { + throw new SystemException(ErrorCode.SYSE0001, e); + } + + searcher = new IndexSearcher(reader); + analyzer = new CaseSensitiveAnalyzer(); + + parser = new CaseSensitiveQueryParser("item", analyzer); + + String queryString = elementPath.replaceAll("/", "."); + queryString = "item:" + queryString + "*"; + + int lastslash = elementPath.lastIndexOf("/"); + elementPath = elementPath.substring(0, lastslash) + ":" + elementPath.substring(lastslash + 1); + elementPath = elementPath.replaceAll("/", ".") + ".element"; + + TopDocs results = null; + try { + query = parser.parse(queryString); + + //TODO: Right now it only returns 1000000 results + results = searcher.search(query, 1000000); + + } catch (Exception e) { + throw new SystemException(null); + } + + hits = results.scoreDocs; + System.out.println("found: " + results.totalHits); + indexPlace = 0; + indexLength = hits.length; + + } + + public void parse(ArrayBackedValueStorage abvsFileNode) throws IOException { + try { + handler.startDocument(); + + for (int i = 0; i < fields.size(); i++) { + String fieldValue = fields.get(i).stringValue(); + if (fieldValue.equals(elementPath)) { + buildElement(abvsFileNode, i); + } + } + + handler.endDocument(); + handler.writeDocument(abvsFileNode); + } catch (Exception e) { + throw new IOException(e); + } + } + + private int buildElement(ArrayBackedValueStorage abvsFileNode, int fieldNum) throws SAXException { + int whereIFinish = fieldNum; + IndexableField field = fields.get(fieldNum); + String contents = field.stringValue(); + String uri = ""; + + int firstColon = contents.indexOf(':'); + int lastDot = contents.lastIndexOf('.'); + String type = contents.substring(lastDot + 1); + String lastBit = contents.substring(firstColon + 1, lastDot); + + if (type.equals("textnode")) { + char[] charContents = lastBit.toCharArray(); + handler.characters(charContents, 0, charContents.length); + 
+ } + if (type.equals("element")) { + List<String> names = new ArrayList<String>(); + List<String> values = new ArrayList<String>(); + List<String> uris = new ArrayList<String>(); + List<String> localNames = new ArrayList<String>(); + List<String> types = new ArrayList<String>(); + List<String> qNames = new ArrayList<String>(); + whereIFinish = findAttributeChildren(whereIFinish, names, values, uris, localNames, types, qNames); + Attributes atts = new IndexAttributes(names, values, uris, localNames, types, qNames); + + handler.startElement(uri, lastBit, lastBit, atts); + + boolean noMoreChildren = false; + + while (whereIFinish + 1 < fields.size() && !noMoreChildren) { + if (isChild(fields.get(whereIFinish + 1), field)) { + whereIFinish = buildElement(abvsFileNode, whereIFinish + 1); + } else { + noMoreChildren = true; + } + } + + handler.endElement(uri, lastBit, lastBit); + + } + return whereIFinish; + } + + /*This function creates the attribute children for an element node + * + */ + int findAttributeChildren(int fieldnum, List<String> n, List<String> v, List<String> u, List<String> l, + List<String> t, List<String> q) { + int nextindex = fieldnum + 1; + boolean foundattributes = false; + if (nextindex < fields.size()) { + IndexableField nextguy; + + while (nextindex < fields.size()) { + nextguy = fields.get(nextindex); + String contents = nextguy.stringValue(); + int firstcolon = contents.indexOf(':'); + int lastdot = contents.lastIndexOf('.'); + String lastbit = contents.substring(firstcolon + 1, lastdot); + + if (isDirectChildAttribute(nextguy, fields.get(fieldnum))) { + foundattributes = true; + n.add(lastbit); + IndexableField nextnextguy = fields.get(nextindex + 1); + contents = nextnextguy.stringValue(); + firstcolon = contents.indexOf(':'); + lastdot = contents.lastIndexOf('.'); + String nextlastbit = contents.substring(firstcolon + 1, lastdot); + v.add(nextlastbit); + u.add(lastbit); + l.add(lastbit); + t.add(lastbit); + q.add(lastbit); + } else { + 
break; + } + nextindex += 2; + } + } + if (foundattributes) { + return nextindex - 1; + + } else { + return fieldnum; + } + } + + boolean isChild(IndexableField child, IndexableField adult) { + String childId = child.stringValue(); + String adultId = adult.stringValue(); + + int lastDotChild = childId.lastIndexOf('.'); + int lastDotAdult = adultId.lastIndexOf('.'); + + String childPath = childId.substring(0, lastDotChild); + String adultPath = adultId.substring(0, lastDotAdult); + adultPath = adultPath.replaceFirst(":", "."); + + return (childPath.startsWith(adultPath + ":") || childPath.startsWith(adultPath + ".")); + } + + boolean isDirectChildAttribute(IndexableField child, IndexableField adult) { + String childId = child.stringValue(); + String adultId = adult.stringValue(); + + String childPath = childId.substring(0, childId.lastIndexOf('.')); + String adultPath = adultId.substring(0, adultId.lastIndexOf('.')); + adultPath = adultPath.replaceFirst(":", "."); + String[] childSegments = child.stringValue().split("\\."); + + String childType = childSegments[childSegments.length - 1]; + + return (childPath.startsWith(adultPath + ":") && childType.equals("attribute")); + } + + }; + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/1f623b16/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/IndexConstructorScalarEvaluatorFactory.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/IndexConstructorScalarEvaluatorFactory.java b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/IndexConstructorScalarEvaluatorFactory.java new file mode 100644 index 0000000..c3776d9 --- /dev/null +++ b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/IndexConstructorScalarEvaluatorFactory.java @@ -0,0 +1,70 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* 
contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.vxquery.runtime.functions.index;

import java.io.DataInputStream;

import org.apache.hyracks.algebricks.common.exceptions.AlgebricksException;
import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
import org.apache.hyracks.api.context.IHyracksTaskContext;
import org.apache.hyracks.data.std.api.IPointable;
import org.apache.hyracks.data.std.primitive.UTF8StringPointable;
import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
import org.apache.hyracks.dataflow.common.comm.util.ByteBufferInputStream;
import org.apache.vxquery.datamodel.accessors.TaggedValuePointable;
import org.apache.vxquery.datamodel.builders.sequence.SequenceBuilder;
import org.apache.vxquery.exceptions.SystemException;
import org.apache.vxquery.runtime.functions.base.AbstractTaggedValueArgumentScalarEvaluator;
import org.apache.vxquery.runtime.functions.base.AbstractTaggedValueArgumentScalarEvaluatorFactory;
import org.apache.vxquery.xmlparser.ITreeNodeIdProvider;
import org.apache.vxquery.xmlparser.TreeNodeIdProvider;

/**
 * Factory for the scalar evaluator that builds an index; the index holds one Lucene doc per file.
 * <p>
 * The evaluator itself only allocates the scratch objects and forwards every call to
 * {@link IndexConstructorUtil#evaluate}.
 */
public class IndexConstructorScalarEvaluatorFactory extends AbstractTaggedValueArgumentScalarEvaluatorFactory {

    private static final long serialVersionUID = 1L;

    public IndexConstructorScalarEvaluatorFactory(IScalarEvaluatorFactory[] args) {
        super(args);
    }

    /**
     * Creates an evaluator whose scratch buffers, pointables and node-id provider are allocated
     * once here and reused by every {@code evaluate} call.
     *
     * @param ctx
     *            task context supplying the partition number and the node id
     * @param args
     *            argument evaluators handed to the created evaluator
     * @return evaluator delegating to {@link IndexConstructorUtil#evaluate}
     */
    @Override
    protected IScalarEvaluator createEvaluator(IHyracksTaskContext ctx, IScalarEvaluator[] args)
            throws AlgebricksException {
        // Scratch state shared across calls of the evaluator returned below.
        final ArrayBackedValueStorage resultStorage = new ArrayBackedValueStorage();
        final UTF8StringPointable stringPointable = (UTF8StringPointable) UTF8StringPointable.FACTORY
                .createPointable();
        final TaggedValuePointable nodePointable = (TaggedValuePointable) TaggedValuePointable.FACTORY
                .createPointable();
        final ByteBufferInputStream bufferStream = new ByteBufferInputStream();
        final DataInputStream dataInput = new DataInputStream(bufferStream);
        final SequenceBuilder sequenceBuilder = new SequenceBuilder();
        final ArrayBackedValueStorage fileNodeStorage = new ArrayBackedValueStorage();

        // Task-specific identifiers taken from the Hyracks context.
        final short partitionId = (short) ctx.getTaskAttemptId().getTaskId().getPartition();
        final String nodeId = ctx.getJobletContext().getApplicationContext().getNodeId();
        final ITreeNodeIdProvider treeNodeIds = new TreeNodeIdProvider(partitionId);

        return new AbstractTaggedValueArgumentScalarEvaluator(args) {

            @Override
            protected void evaluate(TaggedValuePointable[] evalArgs, IPointable result) throws SystemException {
                IndexConstructorUtil.evaluate(evalArgs, result, stringPointable, bufferStream, dataInput,
                        sequenceBuilder, resultStorage, treeNodeIds, fileNodeStorage, nodePointable, false, nodeId);
            }

        };
    }
}
