http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CaseSensitiveAnalyzer.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CaseSensitiveAnalyzer.java b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CaseSensitiveAnalyzer.java new file mode 100644 index 0000000..803aeee --- /dev/null +++ b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CaseSensitiveAnalyzer.java @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.vxquery.runtime.functions.index; + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.StopAnalyzer; +import org.apache.lucene.analysis.core.StopFilter; +import org.apache.lucene.analysis.standard.ClassicAnalyzer; +import org.apache.lucene.analysis.standard.ClassicTokenizer; +import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.standard.std40.StandardTokenizer40; +import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.analysis.util.StopwordAnalyzerBase; +import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.Version; + +/** + * Filters {@link StandardTokenizer} with {@link StandardFilter}, + * and {@link StopFilter}, using a list of + * English stop words. No lower-case filter is added to the chain, so the case of each token is preserved. + * <a name="version"/> + * <p> + * You must specify the required {@link Version} + * compatibility when creating StandardAnalyzer: + * <ul> + * <li>As of 3.4, Hiragana and Han characters are no longer wrongly split + * from their combining characters. If you use a previous version number, + * you get the exact broken behavior for backwards compatibility. + * <li>As of 3.1, StandardTokenizer implements Unicode text segmentation, + * and StopFilter correctly handles Unicode 4.0 supplementary characters + * in stopwords. {@link ClassicTokenizer} and {@link ClassicAnalyzer} + * are the pre-3.1 implementations of StandardTokenizer and + * StandardAnalyzer. 
+ * <li>As of 2.9, StopFilter preserves position increments + * <li>As of 2.4, Tokens incorrectly identified as acronyms + * are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>) + * </ul> + */ +public final class CaseSensitiveAnalyzer extends StopwordAnalyzerBase { + + /** Default maximum allowed token length */ + public static final int DEFAULT_MAX_TOKEN_LENGTH = 255; + + private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; + + /** + * An unmodifiable set containing some common English words that are usually not + * useful for searching. + */ + public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET; + + /** + * Builds an analyzer with the given stop words. + * + * @param stopWords + * stop words + */ + public CaseSensitiveAnalyzer(CharArraySet stopWords) { + super(stopWords); + } + + /** + * Builds an analyzer with the default stop words ({@link #STOP_WORDS_SET}). + */ + public CaseSensitiveAnalyzer() { + this(STOP_WORDS_SET); + } + + /** + * Builds an analyzer with the stop words from the given reader. + * + * @see WordlistLoader#getWordSet(Reader) + * @param stopwords + * Reader to read stop words from + */ + public CaseSensitiveAnalyzer(Reader stopwords) throws IOException { + this(loadStopwordSet(stopwords)); + } + + /** + * Set maximum allowed token length. If a token is seen + * that exceeds this length then it is discarded. This + * setting only takes effect the next time tokenStream + * is called. 
+ */ + public void setMaxTokenLength(int length) { + maxTokenLength = length; + } + + /** + * Returns the maximum token length currently configured on this analyzer. + * + * @see #setMaxTokenLength + */ + public int getMaxTokenLength() { + return maxTokenLength; + } + + /** + * Builds the token chain for a field: a standard tokenizer (the implementation is chosen from the analyzer's Lucene version), + * followed by {@link StandardFilter} and then {@link StopFilter} with this analyzer's stop words. + */ + @Override + protected TokenStreamComponents createComponents(final String fieldName) { + final Tokenizer src; + if (getVersion().onOrAfter(Version.LUCENE_4_7_0)) { + StandardTokenizer t = new StandardTokenizer(); + t.setMaxTokenLength(maxTokenLength); + src = t; + } else { + StandardTokenizer40 t = new StandardTokenizer40(); + t.setMaxTokenLength(maxTokenLength); + src = t; + } + TokenStream tok = new StandardFilter(src); + tok = new StopFilter(tok, stopwords); + return new TokenStreamComponents(src, tok) { + @Override + protected void setReader(final Reader reader) { + /* Re-apply the analyzer's current limit each time a reader is set, so a setMaxTokenLength call made after these components were built still reaches the tokenizer. */ + int m = CaseSensitiveAnalyzer.this.maxTokenLength; + if (src instanceof StandardTokenizer) { + ((StandardTokenizer) src).setMaxTokenLength(m); + } else { + ((StandardTokenizer40) src).setMaxTokenLength(m); + } + super.setReader(reader); + } + }; + } +} \ No newline at end of file
http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CaseSensitiveQueryParser.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CaseSensitiveQueryParser.java b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CaseSensitiveQueryParser.java new file mode 100644 index 0000000..7cb0a18 --- /dev/null +++ b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CaseSensitiveQueryParser.java @@ -0,0 +1,42 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ +package org.apache.vxquery.runtime.functions.index; + +import java.io.StringReader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.index.Term; +import org.apache.lucene.queryparser.classic.FastCharStream; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.search.Query; + +/** + * Query parser that builds prefix queries from the term text exactly as it was supplied. + */ +public class CaseSensitiveQueryParser extends QueryParser { + + /** + * Creates the parser over an empty character stream and then initialises it with the given field and analyzer. + * + * @param f + * default field for query terms + * @param a + * analyzer used to find terms in the query text + */ + public CaseSensitiveQueryParser(String f, Analyzer a) { + super(new FastCharStream(new StringReader(""))); + init(f, a); + } + + /** + * Wraps {@code termStr}, unchanged, in a prefix query on {@code field}. + * + * @throws ParseException + * if the term starts with '*' while leading wildcards are not allowed + */ + @Override + protected Query getPrefixQuery(String field, String termStr) throws ParseException { + final boolean leadingWildcardForbidden = !getAllowLeadingWildcard(); + if (leadingWildcardForbidden && termStr.startsWith("*")) { + throw new ParseException("'*' not allowed as first character in PrefixQuery"); + } + return newPrefixQuery(new Term(field, termStr)); + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CollectionFromIndexUnnestingEvaluatorFactory.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CollectionFromIndexUnnestingEvaluatorFactory.java b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CollectionFromIndexUnnestingEvaluatorFactory.java new file mode 100644 index 0000000..cf0b203 --- /dev/null +++ b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CollectionFromIndexUnnestingEvaluatorFactory.java @@ -0,0 +1,327 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.vxquery.runtime.functions.index; + +import java.io.DataInputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.hyracks.algebricks.common.exceptions.AlgebricksException; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory; +import org.apache.hyracks.algebricks.runtime.base.IUnnestingEvaluator; +import org.apache.hyracks.api.context.IHyracksTaskContext; +import org.apache.hyracks.data.std.api.IPointable; +import org.apache.hyracks.data.std.primitive.UTF8StringPointable; +import org.apache.hyracks.data.std.util.ArrayBackedValueStorage; +import org.apache.hyracks.dataflow.common.comm.util.ByteBufferInputStream; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.FSDirectory; +import 
org.apache.vxquery.datamodel.accessors.TaggedValuePointable; +import org.apache.vxquery.datamodel.values.ValueTag; +import org.apache.vxquery.exceptions.ErrorCode; +import org.apache.vxquery.exceptions.SystemException; +import org.apache.vxquery.index.IndexAttributes; +import org.apache.vxquery.runtime.functions.base.AbstractTaggedValueArgumentUnnestingEvaluator; +import org.apache.vxquery.runtime.functions.base.AbstractTaggedValueArgumentUnnestingEvaluatorFactory; +import org.apache.vxquery.xmlparser.ITreeNodeIdProvider; +import org.apache.vxquery.xmlparser.SAXContentHandler; +import org.apache.vxquery.xmlparser.TreeNodeIdProvider; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; + +/** + * Unnesting evaluator factory that searches a Lucene index for an element path and returns one rebuilt document node per hit. + */ +public class CollectionFromIndexUnnestingEvaluatorFactory extends AbstractTaggedValueArgumentUnnestingEvaluatorFactory { + private static final long serialVersionUID = 1L; + + public CollectionFromIndexUnnestingEvaluatorFactory(IScalarEvaluatorFactory[] args) { + super(args); + } + + @Override + protected IUnnestingEvaluator createEvaluator(IHyracksTaskContext ctx, IScalarEvaluator[] args) + throws AlgebricksException { + + return new AbstractTaggedValueArgumentUnnestingEvaluator(args) { + + private ArrayBackedValueStorage nodeAbvs = new ArrayBackedValueStorage(); + + private int indexPlace; + private int indexLength; + private String elementPath; + private String indexName; + + private UTF8StringPointable stringIndexFolder = (UTF8StringPointable) UTF8StringPointable.FACTORY + .createPointable(); + private UTF8StringPointable stringElementPath = (UTF8StringPointable) UTF8StringPointable.FACTORY + .createPointable(); + private ByteBufferInputStream bbis = new ByteBufferInputStream(); + private DataInputStream di = new DataInputStream(bbis); + + private IndexReader reader; + private IndexSearcher searcher; + private Analyzer analyzer; + private QueryParser parser; + private ScoreDoc[] hits; + private SAXContentHandler handler; + private Query query; + private Document 
doc; + private List<IndexableField> fields; + + @Override + public boolean step(IPointable result) throws AlgebricksException { + /* Each step emits one tuple: the stored fields of the next hit, + * rebuilt into a document node by the parse function. + * The number of hits is capped by the search limit used in init. + * Once every hit has been returned the index reader is closed + * so that its file handles are not leaked. + */ + if (indexPlace < indexLength) { + nodeAbvs.reset(); + try { + //TODO: now we get back the entire document + doc = searcher.doc(hits[indexPlace].doc); + fields = doc.getFields(); + parse(nodeAbvs); + } catch (IOException e) { + throw new AlgebricksException(e); + } + indexPlace += 1; + result.set(nodeAbvs.getByteArray(), nodeAbvs.getStartOffset(), nodeAbvs.getLength()); + return true; + } + try { + closeReader(); + } catch (IOException e) { + throw new AlgebricksException(e); + } + return false; + } + + @Override + protected void init(TaggedValuePointable[] args) throws SystemException { + + int partition = ctxview.getTaskAttemptId().getTaskId().getPartition(); + ITreeNodeIdProvider nodeIdProvider = new TreeNodeIdProvider((short) partition); + handler = new SAXContentHandler(false, nodeIdProvider, true); + + nodeAbvs.reset(); + indexPlace = 0; + TaggedValuePointable tvp1 = args[0]; + TaggedValuePointable tvp2 = args[1]; + + if (tvp1.getTag() != ValueTag.XS_STRING_TAG || tvp2.getTag() != ValueTag.XS_STRING_TAG) { + throw new SystemException(ErrorCode.FORG0006); + } + tvp1.getValue(stringIndexFolder); + tvp2.getValue(stringElementPath); + //Decode the two string arguments: the index folder and the element path + try { + // Read the index folder name. 
+ bbis.setByteBuffer(ByteBuffer.wrap( + Arrays.copyOfRange(stringIndexFolder.getByteArray(), stringIndexFolder.getStartOffset(), + stringIndexFolder.getLength() + stringIndexFolder.getStartOffset())), + 0); + indexName = di.readUTF(); + bbis.setByteBuffer(ByteBuffer.wrap( + Arrays.copyOfRange(stringElementPath.getByteArray(), stringElementPath.getStartOffset(), + stringElementPath.getLength() + stringElementPath.getStartOffset())), + 0); + elementPath = di.readUTF(); + + indexPlace = 0; + + //Release any reader left open by a previous init, then create the index reader. + closeReader(); + reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexName))); + } catch (IOException e) { + throw new SystemException(ErrorCode.SYSE0001, e); + } + + searcher = new IndexSearcher(reader); + analyzer = new CaseSensitiveAnalyzer(); + + parser = new CaseSensitiveQueryParser("item", analyzer); + + String queryString = elementPath.replaceAll("/", "."); + queryString = "item:" + queryString + "*"; + + /* NOTE(review): lastIndexOf returns -1 when elementPath contains no "/", and substring(0, -1) then throws; confirm callers always pass a path with at least one "/". */ + int lastslash = elementPath.lastIndexOf("/"); + elementPath = elementPath.substring(0, lastslash) + ":" + elementPath.substring(lastslash + 1); + elementPath = elementPath.replaceAll("/", ".") + ".element"; + + TopDocs results = null; + try { + query = parser.parse(queryString); + + //TODO: Right now it only returns 1000000 results + results = searcher.search(query, 1000000); + + } catch (Exception e) { + /* Keep the error code and the cause so that a bad query string or an index read failure can be diagnosed. */ + throw new SystemException(ErrorCode.SYSE0001, e); + } + + hits = results.scoreDocs; + indexPlace = 0; + indexLength = hits.length; + + } + + /* Closes the index reader if one is open and forgets it, so that calling this again is a no-op. */ + private void closeReader() throws IOException { + if (reader != null) { + reader.close(); + reader = null; + } + } + + /* Rebuilds the current hit into abvsFileNode: every field whose value equals elementPath is replayed, + * together with its attributes and children, into the SAX handler, which then writes the document. + * Any failure is rethrown as an IOException carrying the original cause. + */ + public void parse(ArrayBackedValueStorage abvsFileNode) throws IOException { + try { + handler.startDocument(); + + for (int i = 0; i < fields.size(); i++) { + String fieldValue = fields.get(i).stringValue(); + if (fieldValue.equals(elementPath)) { + buildElement(abvsFileNode, i); + } + } + + handler.endDocument(); + handler.writeDocument(abvsFileNode); + } catch (Exception e) { + throw new IOException(e); + } + } + + /* Replays the field at fieldNum into the SAX handler (text nodes as characters, elements with their + * attributes and nested children) and returns the index of the last field it consumed. + */ + private int buildElement(ArrayBackedValueStorage 
abvsFileNode, int fieldNum) throws SAXException { + int whereIFinish = fieldNum; + IndexableField field = fields.get(fieldNum); + String contents = field.stringValue(); + String uri = ""; + + int firstColon = contents.indexOf(':'); + int lastDot = contents.lastIndexOf('.'); + String type = contents.substring(lastDot + 1); + String lastBit = contents.substring(firstColon + 1, lastDot); + + if (type.equals("textnode")) { + char[] charContents = lastBit.toCharArray(); + handler.characters(charContents, 0, charContents.length); + + } + if (type.equals("element")) { + List<String> names = new ArrayList<String>(); + List<String> values = new ArrayList<String>(); + List<String> uris = new ArrayList<String>(); + List<String> localNames = new ArrayList<String>(); + List<String> types = new ArrayList<String>(); + List<String> qNames = new ArrayList<String>(); + whereIFinish = findAttributeChildren(whereIFinish, names, values, uris, localNames, types, qNames); + Attributes atts = new IndexAttributes(names, values, uris, localNames, types, qNames); + + handler.startElement(uri, lastBit, lastBit, atts); + + boolean noMoreChildren = false; + + while (whereIFinish + 1 < fields.size() && !noMoreChildren) { + if (isChild(fields.get(whereIFinish + 1), field)) { + whereIFinish = buildElement(abvsFileNode, whereIFinish + 1); + } else { + noMoreChildren = true; + } + } + + handler.endElement(uri, lastBit, lastBit); + + } + return whereIFinish; + } + + /* Collects the attribute fields that directly follow the element field at fieldnum into the given lists; + * for each attribute the field after it supplies the value. + * Returns the index of the last field consumed, or fieldnum when no attribute was found. + */ + int findAttributeChildren(int fieldnum, List<String> n, List<String> v, List<String> u, List<String> l, + List<String> t, List<String> q) { + int nextindex = fieldnum + 1; + boolean foundattributes = false; + if (nextindex < fields.size()) { + IndexableField nextguy; + + while (nextindex < fields.size()) { + nextguy = fields.get(nextindex); + String contents = nextguy.stringValue(); + int firstcolon = contents.indexOf(':'); + int lastdot = 
contents.lastIndexOf('.'); + String lastbit = contents.substring(firstcolon + 1, lastdot); + + if (isDirectChildAttribute(nextguy, fields.get(fieldnum))) { + foundattributes = true; + n.add(lastbit); + /* NOTE(review): assumes every attribute field is followed by its value field; fields.get(nextindex + 1) throws when the attribute is the last field. Confirm against the code that builds the index. */ + IndexableField nextnextguy = fields.get(nextindex + 1); + contents = nextnextguy.stringValue(); + firstcolon = contents.indexOf(':'); + lastdot = contents.lastIndexOf('.'); + String nextlastbit = contents.substring(firstcolon + 1, lastdot); + v.add(nextlastbit); + u.add(lastbit); + l.add(lastbit); + t.add(lastbit); + q.add(lastbit); + } else { + break; + } + nextindex += 2; + } + } + if (foundattributes) { + return nextindex - 1; + + } else { + return fieldnum; + } + } + + /* True when the child's path (its value up to the last '.') starts with the adult's path followed by ':' or '.'. */ + boolean isChild(IndexableField child, IndexableField adult) { + String childId = child.stringValue(); + String adultId = adult.stringValue(); + + int lastDotChild = childId.lastIndexOf('.'); + int lastDotAdult = adultId.lastIndexOf('.'); + + String childPath = childId.substring(0, lastDotChild); + String adultPath = adultId.substring(0, lastDotAdult); + adultPath = adultPath.replaceFirst(":", "."); + + return (childPath.startsWith(adultPath + ":") || childPath.startsWith(adultPath + ".")); + } + + /* True when the child's path starts with the adult's path followed by ':' and the child's last dot-separated segment is "attribute". */ + boolean isDirectChildAttribute(IndexableField child, IndexableField adult) { + String childId = child.stringValue(); + String adultId = adult.stringValue(); + + String childPath = childId.substring(0, childId.lastIndexOf('.')); + String adultPath = adultId.substring(0, adultId.lastIndexOf('.')); + adultPath = adultPath.replaceFirst(":", "."); + String[] childSegments = child.stringValue().split("\\."); + + String childType = childSegments[childSegments.length - 1]; + + return (childPath.startsWith(adultPath + ":") && childType.equals("attribute")); + } + + }; + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/IndexConstructorScalarEvaluatorFactory.java 
---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/IndexConstructorScalarEvaluatorFactory.java b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/IndexConstructorScalarEvaluatorFactory.java new file mode 100644 index 0000000..c3776d9 --- /dev/null +++ b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/IndexConstructorScalarEvaluatorFactory.java @@ -0,0 +1,70 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ +package org.apache.vxquery.runtime.functions.index; + +import java.io.DataInputStream; + +import org.apache.hyracks.algebricks.common.exceptions.AlgebricksException; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory; +import org.apache.hyracks.api.context.IHyracksTaskContext; +import org.apache.hyracks.data.std.api.IPointable; +import org.apache.hyracks.data.std.primitive.UTF8StringPointable; +import org.apache.hyracks.data.std.util.ArrayBackedValueStorage; +import org.apache.hyracks.dataflow.common.comm.util.ByteBufferInputStream; +import org.apache.vxquery.datamodel.accessors.TaggedValuePointable; +import org.apache.vxquery.datamodel.builders.sequence.SequenceBuilder; +import org.apache.vxquery.exceptions.SystemException; +import org.apache.vxquery.runtime.functions.base.AbstractTaggedValueArgumentScalarEvaluator; +import org.apache.vxquery.runtime.functions.base.AbstractTaggedValueArgumentScalarEvaluatorFactory; +import org.apache.vxquery.xmlparser.ITreeNodeIdProvider; +import org.apache.vxquery.xmlparser.TreeNodeIdProvider; + +/** + * Factory for the scalar evaluator that builds an index; each evaluation is delegated to {@link IndexConstructorUtil#evaluate}. + */ +public class IndexConstructorScalarEvaluatorFactory extends AbstractTaggedValueArgumentScalarEvaluatorFactory { + //Creates one Lucene doc per file + + private static final long serialVersionUID = 1L; + + public IndexConstructorScalarEvaluatorFactory(IScalarEvaluatorFactory[] args) { + super(args); + } + + /* Allocates the working buffers, pointables and node id provider once per evaluator; + * every evaluate call reuses them and passes false for the isElementPath argument. + */ + @Override + protected IScalarEvaluator createEvaluator(IHyracksTaskContext ctx, IScalarEvaluator[] args) + throws AlgebricksException { + final ArrayBackedValueStorage abvs = new ArrayBackedValueStorage(); + final UTF8StringPointable stringp = (UTF8StringPointable) UTF8StringPointable.FACTORY.createPointable(); + final TaggedValuePointable nodep = (TaggedValuePointable) TaggedValuePointable.FACTORY.createPointable(); + final ByteBufferInputStream bbis = new ByteBufferInputStream(); + final DataInputStream di = new DataInputStream(bbis); + 
final SequenceBuilder sb = new SequenceBuilder(); + final ArrayBackedValueStorage abvsFileNode = new ArrayBackedValueStorage(); + final int partition = ctx.getTaskAttemptId().getTaskId().getPartition(); + final String nodeId = ctx.getJobletContext().getApplicationContext().getNodeId(); + final ITreeNodeIdProvider nodeIdProvider = new TreeNodeIdProvider((short) partition); + + return new AbstractTaggedValueArgumentScalarEvaluator(args) { + + @Override + protected void evaluate(TaggedValuePointable[] args, IPointable result) throws SystemException { + IndexConstructorUtil.evaluate(args, result, stringp, bbis, di, sb, abvs, nodeIdProvider, abvsFileNode, + nodep, false, nodeId); + } + + }; + } +} http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/IndexConstructorUtil.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/IndexConstructorUtil.java b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/IndexConstructorUtil.java new file mode 100644 index 0000000..7191827 --- /dev/null +++ b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/IndexConstructorUtil.java @@ -0,0 +1,144 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. 
You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +package org.apache.vxquery.runtime.functions.index; + +import java.io.DataInputStream; +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.file.Paths; +import java.util.Arrays; + +import org.apache.hyracks.data.std.api.IPointable; +import org.apache.hyracks.data.std.primitive.UTF8StringPointable; +import org.apache.hyracks.data.std.util.ArrayBackedValueStorage; +import org.apache.hyracks.dataflow.common.comm.util.ByteBufferInputStream; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.vxquery.datamodel.accessors.TaggedValuePointable; +import org.apache.vxquery.datamodel.builders.sequence.SequenceBuilder; +import org.apache.vxquery.datamodel.values.ValueTag; +import org.apache.vxquery.exceptions.ErrorCode; +import org.apache.vxquery.exceptions.SystemException; +import org.apache.vxquery.index.IndexDocumentBuilder; +import org.apache.vxquery.runtime.functions.util.FunctionHelper; +import org.apache.vxquery.xmlparser.ITreeNodeIdProvider; +import org.apache.vxquery.xmlparser.XMLParser; + +/** + * Static helpers that build a Lucene index from the XML files found under a collection directory. + */ +public class IndexConstructorUtil { + /* Reads the collection folder (args[0]) and the index folder (args[1]), both of which must be strings, + * rebuilds the index in the index folder from every XML file under the collection folder, + * and sets result to an empty sequence. + * Throws SystemException FORG0006 for non-string arguments and SYSE0001 for I/O failures; + * throws RuntimeException when the collection folder does not exist. + */ + public static void evaluate(TaggedValuePointable[] args, IPointable result, UTF8StringPointable stringp, + ByteBufferInputStream bbis, DataInputStream di, SequenceBuilder sb, ArrayBackedValueStorage abvs, + ITreeNodeIdProvider 
nodeIdProvider, ArrayBackedValueStorage abvsFileNode, TaggedValuePointable nodep, + boolean isElementPath, String nodeId) throws SystemException { + String collectionFolder; + String indexFolder; + TaggedValuePointable collectionTVP = args[0]; + TaggedValuePointable indexTVP = args[1]; + + if (collectionTVP.getTag() != ValueTag.XS_STRING_TAG || indexTVP.getTag() != ValueTag.XS_STRING_TAG) { + throw new SystemException(ErrorCode.FORG0006); + } + + try { + // Get the collection folder + collectionTVP.getValue(stringp); + bbis.setByteBuffer(ByteBuffer.wrap(Arrays.copyOfRange(stringp.getByteArray(), stringp.getStartOffset(), + stringp.getLength() + stringp.getStartOffset())), 0); + collectionFolder = di.readUTF(); + + // Get the index folder + indexTVP.getValue(stringp); + bbis.setByteBuffer(ByteBuffer.wrap(Arrays.copyOfRange(stringp.getByteArray(), stringp.getStartOffset(), + stringp.getLength() + stringp.getStartOffset())), 0); + indexFolder = di.readUTF(); + } catch (IOException e) { + throw new SystemException(ErrorCode.SYSE0001, e); + } + File collectionDirectory = new File(collectionFolder); + if (!collectionDirectory.exists()) { + throw new RuntimeException("The collection directory (" + collectionFolder + ") does not exist."); + } + + try { + abvs.reset(); + sb.reset(abvs); + + Analyzer analyzer = new CaseSensitiveAnalyzer(); + IndexWriterConfig iwc = new IndexWriterConfig(analyzer); + + // Create will overwrite the index every time + iwc.setOpenMode(OpenMode.CREATE); + + /* Open the directory and the index writer in try-with-resources so that both are closed, + * and the index write lock released, even when indexing a file fails. + */ + try (Directory dir = FSDirectory.open(Paths.get(indexFolder)); + IndexWriter writer = new IndexWriter(dir, iwc)) { + + //Add files to index + indexXmlFiles(collectionDirectory, writer, isElementPath, nodep, abvsFileNode, nodeIdProvider, sb, bbis, di, + nodeId); + + //This makes write slower but search faster. 
+ writer.forceMerge(1); + } + + sb.finish(); + result.set(abvs); + } catch (IOException e) { + throw new SystemException(ErrorCode.SYSE0001, e); + } + } + + /* Walks collectionDirectory recursively, one file at a time. Each file accepted by readableXmlFile is first + * turned into an ABVS document node and then handed to an IndexDocumentBuilder that writes to the given writer. + * The isElementPath and sb arguments are only passed on to the recursive call. + * Throws IOException when the entries of a directory cannot be listed. + */ + public static void indexXmlFiles(File collectionDirectory, IndexWriter writer, boolean isElementPath, + TaggedValuePointable nodep, ArrayBackedValueStorage abvsFileNode, ITreeNodeIdProvider nodeIdProvider, + SequenceBuilder sb, ByteBufferInputStream bbis, DataInputStream di, String nodeId) + throws SystemException, IOException { + /* listFiles returns null when the path is not a directory or cannot be read; report that instead of failing with a NullPointerException. */ + File[] files = collectionDirectory.listFiles(); + if (files == null) { + throw new IOException("Unable to list the files in (" + collectionDirectory.getPath() + ")."); + } + for (File file : files) { + + if (readableXmlFile(file.getPath())) { + abvsFileNode.reset(); + // Get the document node + XMLParser parser = new XMLParser(false, nodeIdProvider, nodeId); + FunctionHelper.readInDocFromString(file.getPath(), bbis, di, abvsFileNode, parser); + + nodep.set(abvsFileNode.getByteArray(), abvsFileNode.getStartOffset(), abvsFileNode.getLength()); + + //Add the document to the index + //Creates one lucene doc per file + IndexDocumentBuilder ibuilder = new IndexDocumentBuilder(nodep, writer); + + ibuilder.printStart(); + + } else if (file.isDirectory()) { + // Consider all XML files in sub directories. 
+ indexXmlFiles(file, writer, isElementPath, nodep, abvsFileNode, nodeIdProvider, sb, bbis, di, nodeId); + } + } + } + + /* True when the path ends, ignoring case, with ".xml" or ".xml.gz". */ + public static boolean readableXmlFile(String path) { + return (path.toLowerCase().endsWith(".xml") || path.toLowerCase().endsWith(".xml.gz")); + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/util/FunctionHelper.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/util/FunctionHelper.java b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/util/FunctionHelper.java index d394bbc..b6668ba 100644 --- a/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/util/FunctionHelper.java +++ b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/util/FunctionHelper.java @@ -480,7 +480,7 @@ public class FunctionHelper { public static boolean compareTaggedValues(AbstractValueComparisonOperation aOp, TaggedValuePointable tvp1, TaggedValuePointable tvp2, DynamicContext dCtx, TypedPointables tp1, TypedPointables tp2) - throws SystemException { + throws SystemException { int tid1 = getBaseTypeForComparisons(tvp1.getTag()); int tid2 = getBaseTypeForComparisons(tvp2.getTag()); @@ -1217,6 +1217,11 @@ public class FunctionHelper { } catch (SystemException e) { throw new HyracksDataException(e); } + readInDocFromString(fName, bbis, di, abvs, parser); + } + + public static void readInDocFromString(String fName, ByteBufferInputStream bbis, DataInputStream di, + ArrayBackedValueStorage abvs, XMLParser parser) throws HyracksDataException { if (!fName.contains("hdfs:/")) { File file = new File(fName); if (file.exists()) { http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/SAXContentHandler.java ---------------------------------------------------------------------- 
diff --git a/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/SAXContentHandler.java b/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/SAXContentHandler.java index 03a125b..846c27b 100644 --- a/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/SAXContentHandler.java +++ b/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/SAXContentHandler.java @@ -51,40 +51,41 @@ import org.xml.sax.ext.LexicalHandler; public class SAXContentHandler implements ContentHandler, LexicalHandler { // XML node builders - private final AttributeNodeBuilder anb; - private final CommentNodeBuilder cnb; - private final DictionaryBuilder db; - private final DocumentNodeBuilder docb; - private final PINodeBuilder pinb; - private final TextNodeBuilder tnb; - private final UTF8StringBuilder utf8b; - private final List<ElementNodeBuilder> enbStack; - private final List<ElementNodeBuilder> freeENBList; + protected final AttributeNodeBuilder anb; + protected final CommentNodeBuilder cnb; + protected final DictionaryBuilder db; + protected final DocumentNodeBuilder docb; + protected final PINodeBuilder pinb; + protected final TextNodeBuilder tnb; + protected final UTF8StringBuilder utf8b; + protected final List<ElementNodeBuilder> enbStack; + protected final List<ElementNodeBuilder> freeENBList; + protected boolean isIndexHandler; // Frame writing variables - private IFrameFieldAppender appender; + protected IFrameFieldAppender appender; private int tupleIndex; private IFrameWriter writer; // Element writing and path step variables - private boolean skipping; + protected boolean skipping; private String[] childLocalName = null; private String[] childUri = null; private boolean[] subElement = null; private final TaggedValuePointable tvp; // Basic tracking and setting variables - private final boolean attachTypes; - private final boolean createNodeIds; + protected final boolean attachTypes; + protected final boolean createNodeIds; private int depth; - private final 
ArrayBackedValueStorage resultABVS; - private boolean pendingText; - private int nodeIdCounter; - private final ITreeNodeIdProvider nodeIdProvider; - private final ArrayBackedValueStorage tempABVS; + protected final ArrayBackedValueStorage resultABVS; + protected boolean pendingText; + protected int nodeIdCounter; + protected final ITreeNodeIdProvider nodeIdProvider; + protected final ArrayBackedValueStorage tempABVS; private final ArrayBackedValueStorage textABVS; - public SAXContentHandler(boolean attachTypes, ITreeNodeIdProvider nodeIdProvider) { + public SAXContentHandler(boolean attachTypes, ITreeNodeIdProvider nodeIdProvider, boolean isIndexHandler) { // XML node builders anb = new AttributeNodeBuilder(); cnb = new CommentNodeBuilder(); @@ -110,11 +111,16 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { this.nodeIdProvider = nodeIdProvider; tempABVS = new ArrayBackedValueStorage(); textABVS = new ArrayBackedValueStorage(); + this.isIndexHandler = isIndexHandler; + if (isIndexHandler) { + this.appender = null; + this.skipping = false; + } } public SAXContentHandler(boolean attachTypes, ITreeNodeIdProvider nodeIdProvider, IFrameFieldAppender appender, List<SequenceType> childSequenceTypes) { - this(attachTypes, nodeIdProvider); + this(attachTypes, nodeIdProvider, false); // Frame writing variables this.appender = appender; @@ -189,16 +195,21 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { return; } try { - boolean nonSkipped = foundFirstNonSkippedElement(); + boolean nonSkipped = false; + if (!isIndexHandler) { + nonSkipped = foundFirstNonSkippedElement(); + } flushText(); ElementNodeBuilder enb = enbStack.remove(enbStack.size() - 1); enb.endChildrenChunk(); endChildInParent(enb, nonSkipped); freeENB(enb); - if (nonSkipped) { - writeElement(); + if (!isIndexHandler) { + if (nonSkipped) { + writeElement(); + } + endElementChildPathStep(); } - endElementChildPathStep(); } catch (IOException e) { 
e.printStackTrace(); throw new SAXException(e); @@ -248,7 +259,7 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { @Override public void startDocument() throws SAXException { - if (subElement == null) { + if (isIndexHandler || subElement == null) { skipping = false; } db.reset(); @@ -305,7 +316,10 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { @Override public void startElement(String uri, String localName, String name, Attributes atts) throws SAXException { ++depth; - boolean start = startElementChildPathStep(uri, localName); + boolean start = false; + if (!isIndexHandler) { + start = startElementChildPathStep(uri, localName); + } if (skipping) { return; @@ -392,7 +406,7 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { } } - private void flushText() throws IOException { + protected void flushText() throws IOException { if (pendingText) { peekENBStackTop().startChild(tnb); if (createNodeIds) { @@ -471,7 +485,7 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { out.write(resultABVS.getByteArray(), resultABVS.getStartOffset(), resultABVS.getLength()); } - private ElementNodeBuilder createENB() { + protected ElementNodeBuilder createENB() { if (freeENBList.isEmpty()) { return new ElementNodeBuilder(); } @@ -482,7 +496,7 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { freeENBList.add(enb); } - private ElementNodeBuilder peekENBStackTop() { + protected ElementNodeBuilder peekENBStackTop() { return enbStack.get(enbStack.size() - 1); } http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/XMLParser.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/XMLParser.java b/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/XMLParser.java index a62a26c..34d7ba9 100644 --- 
a/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/XMLParser.java +++ b/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/XMLParser.java @@ -57,7 +57,7 @@ public class XMLParser { try { parser = XMLReaderFactory.createXMLReader(); if (appender == null) { - handler = new SAXContentHandler(attachTypes, idProvider); + handler = new SAXContentHandler(attachTypes, idProvider, false); } else { List<SequenceType> childSequenceTypes = new ArrayList<SequenceType>(); for (int typeCode : childSeq) { http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-core/src/test/java/org/apache/vxquery/datamodel/ArrayByteTest.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/test/java/org/apache/vxquery/datamodel/ArrayByteTest.java b/vxquery-core/src/test/java/org/apache/vxquery/datamodel/ArrayByteTest.java index 68af1d5..eec32b9 100644 --- a/vxquery-core/src/test/java/org/apache/vxquery/datamodel/ArrayByteTest.java +++ b/vxquery-core/src/test/java/org/apache/vxquery/datamodel/ArrayByteTest.java @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.vxquery.datamodel; import java.io.IOException; http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-xtest/src/main/java/org/apache/vxquery/xtest/MiniDFS.java ---------------------------------------------------------------------- diff --git a/vxquery-xtest/src/main/java/org/apache/vxquery/xtest/MiniDFS.java b/vxquery-xtest/src/main/java/org/apache/vxquery/xtest/MiniDFS.java index a675150..345a6b5 100644 --- a/vxquery-xtest/src/main/java/org/apache/vxquery/xtest/MiniDFS.java +++ b/vxquery-xtest/src/main/java/org/apache/vxquery/xtest/MiniDFS.java @@ -18,7 +18,6 @@ package org.apache.vxquery.xtest; import java.io.IOException; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hdfs.MiniDFSCluster; @@ -31,9 +30,8 @@ public class MiniDFS { private static final String PATH_TO_HADOOP_CONF = "src/test/resources/hadoop/conf"; private static final String DATA_PATH = "src/test/resources/TestSources/ghcnd"; - public void startHDFS() throws IOException { + public void startHDFS(String folder) throws IOException { - FileSystem lfs = FileSystem.getLocal(new Configuration()); JobConf conf = new JobConf(); conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/core-site.xml")); conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/mapred-site.xml")); @@ -41,9 +39,8 @@ public class MiniDFS { int numDataNodes = 1; int nameNodePort = 40000; - // cleanup artifacts created on the local file system - lfs.delete(new Path("build"), true); System.setProperty("hadoop.log.dir", "logs"); + System.setProperty("test.build.data", folder.concat("/")); MiniDFSCluster.Builder build = new MiniDFSCluster.Builder(conf); build.nameNodePort(nameNodePort); build.nameNodeHttpPort(nameNodePort + 34); http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-xtest/src/test/java/org/apache/vxquery/xtest/VXQueryTest.java 
---------------------------------------------------------------------- diff --git a/vxquery-xtest/src/test/java/org/apache/vxquery/xtest/VXQueryTest.java b/vxquery-xtest/src/test/java/org/apache/vxquery/xtest/VXQueryTest.java index 4d0ddc0..3679aba 100644 --- a/vxquery-xtest/src/test/java/org/apache/vxquery/xtest/VXQueryTest.java +++ b/vxquery-xtest/src/test/java/org/apache/vxquery/xtest/VXQueryTest.java @@ -1,25 +1,26 @@ /* -* Licensed to the Apache Software Foundation (ASF) under one or more -* contributor license agreements. See the NOTICE file distributed with -* this work for additional information regarding copyright ownership. -* The ASF licenses this file to You under the Apache License, Version 2.0 -* (the "License"); you may not use this file except in compliance with -* the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.vxquery.xtest; import java.io.File; import java.io.IOException; import java.util.Collection; +import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.StringUtils; import org.junit.AfterClass; import org.junit.BeforeClass; @@ -30,9 +31,10 @@ import org.junit.runners.Parameterized.Parameters; @RunWith(Parameterized.class) public class VXQueryTest extends AbstractXQueryTest { private static MiniDFS dfs; + private final static String TMP = "target/tmp"; - private static String VXQUERY_CATALOG = StringUtils.join(new String[] { "src", "test", "resources", - "VXQueryCatalog.xml" }, File.separator); + private static String VXQUERY_CATALOG = StringUtils + .join(new String[] { "src", "test", "resources", "VXQueryCatalog.xml" }, File.separator); public VXQueryTest(TestCase tc) throws Exception { super(tc); @@ -57,17 +59,28 @@ public class VXQueryTest extends AbstractXQueryTest { } @BeforeClass - public static void setupHDFS() { + public static void setup() throws IOException { + File tmp = new File(TMP); + if (tmp.exists()) { + FileUtils.deleteDirectory(tmp); + } + new File(TMP.concat("/indexFolder")).mkdirs(); + String HDFSFolder = TMP.concat("/hdfsFolder"); + new File(HDFSFolder).mkdirs(); dfs = new MiniDFS(); try { - dfs.startHDFS(); + dfs.startHDFS(HDFSFolder); } catch (IOException e) { System.err.println(e); } } @AfterClass - public static void shutdownHDFS() { + public static void shutdown() throws IOException { + File tmp = new File(TMP); + if (tmp.exists()) { + FileUtils.deleteDirectory(tmp); + } dfs.shutdownHDFS(); } http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/createIndex.txt ---------------------------------------------------------------------- diff --git a/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/createIndex.txt 
b/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/createIndex.txt new file mode 100644 index 0000000..e69de29 http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex1.txt ---------------------------------------------------------------------- diff --git a/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex1.txt b/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex1.txt new file mode 100644 index 0000000..baf9dca --- /dev/null +++ b/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex1.txt @@ -0,0 +1,2 @@ +<data><date>2003-03-03T00:00:00.000</date><dataType>TMIN</dataType><station>GHCND:AS000000003</station><value>13.75</value><attributes><attribute/><attribute/><attribute>a</attribute><attribute/></attributes></data> +<data><date>2003-03-03T00:00:00.000</date><dataType>TMAX</dataType><station>GHCND:AS000000003</station><value>33</value><attributes><attribute/><attribute/><attribute>a</attribute></attributes></data> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex2.txt ---------------------------------------------------------------------- diff --git a/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex2.txt b/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex2.txt new file mode 100644 index 0000000..ef8dde4 --- /dev/null +++ b/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex2.txt @@ -0,0 +1 @@ +<data><date>2001-01-01T00:00:00.000</date><dataType>AWND</dataType><station>GHCND:US000000001</station><value>1000</value><attributes><attribute/><attribute/><attribute>a</attribute></attributes></data> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex3.txt 
---------------------------------------------------------------------- diff --git a/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex3.txt b/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex3.txt new file mode 100644 index 0000000..d8263ee --- /dev/null +++ b/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex3.txt @@ -0,0 +1 @@ +2 \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex4.txt ---------------------------------------------------------------------- diff --git a/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex4.txt b/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex4.txt new file mode 100644 index 0000000..f30101c --- /dev/null +++ b/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex4.txt @@ -0,0 +1 @@ +3.3 \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex5.txt ---------------------------------------------------------------------- diff --git a/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex5.txt b/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex5.txt new file mode 100644 index 0000000..c84c360 --- /dev/null +++ b/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex5.txt @@ -0,0 +1,3 @@ +<data><date>2002-02-02T00:00:00.000</date><dataType>TMIN</dataType><station>GHCND:US000000002</station><value>12.5</value><attributes><attribute/><attribute/><attribute>a</attribute><attribute/></attributes></data> +<data><date>2002-02-02T00:00:00.000</date><dataType>TMAX</dataType><station>GHCND:US000000002</station><value>32</value><attributes><attribute/><attribute/><attribute>a</attribute><attribute/></attributes></data> 
+<data><date>2002-02-02T00:00:00.000</date><dataType>PRCP</dataType><station>GHCND:US000000002</station><value>20</value><attributes><attribute/><attribute/><attribute>a</attribute></attributes></data> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex6.txt ---------------------------------------------------------------------- diff --git a/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex6.txt b/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex6.txt new file mode 100644 index 0000000..9abedff --- /dev/null +++ b/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex6.txt @@ -0,0 +1,2 @@ +<station><id>GHCND:US000000001</id><displayName>Station 1</displayName><latitude>10.000</latitude><longitude>-10.000</longitude><elevation>1000.0</elevation><locationLabels><type>ST</type><id>FIPS:1</id><displayName>State 1</displayName></locationLabels><locationLabels><type>CNTY</type><id>FIPS:-9999</id><displayName>County 1</displayName></locationLabels><locationLabels><type>CNTRY</type><id>FIPS:US</id><displayName/></locationLabels></station> +<station><id>GHCND:US000000002</id><displayName>Station 2</displayName><latitude>20.000</latitude><longitude>-20.000</longitude><elevation>2000.0</elevation><locationLabels><type>ST</type><id>FIPS:1</id><displayName>State 1</displayName></locationLabels><locationLabels><type>CNTY</type><id>FIPS:-9999</id><displayName>County 2</displayName></locationLabels><locationLabels><type>CNTRY</type><id>FIPS:US</id><displayName/></locationLabels></station> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex7.txt ---------------------------------------------------------------------- diff --git a/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex7.txt 
b/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex7.txt new file mode 100644 index 0000000..c84c360 --- /dev/null +++ b/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/useIndex7.txt @@ -0,0 +1,3 @@ +<data><date>2002-02-02T00:00:00.000</date><dataType>TMIN</dataType><station>GHCND:US000000002</station><value>12.5</value><attributes><attribute/><attribute/><attribute>a</attribute><attribute/></attributes></data> +<data><date>2002-02-02T00:00:00.000</date><dataType>TMAX</dataType><station>GHCND:US000000002</station><value>32</value><attributes><attribute/><attribute/><attribute>a</attribute><attribute/></attributes></data> +<data><date>2002-02-02T00:00:00.000</date><dataType>PRCP</dataType><station>GHCND:US000000002</station><value>20</value><attributes><attribute/><attribute/><attribute>a</attribute></attributes></data> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/createIndex.xq ---------------------------------------------------------------------- diff --git a/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/createIndex.xq b/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/createIndex.xq new file mode 100644 index 0000000..f34ac4c --- /dev/null +++ b/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/createIndex.xq @@ -0,0 +1,20 @@ +(: Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. :) + +(: Build Lucene Index :) +build-index-on-collection( "src/test/resources/TestSources/ghcnd", "target/tmp/indexFolder") + \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex1.xq ---------------------------------------------------------------------- diff --git a/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex1.xq b/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex1.xq new file mode 100644 index 0000000..1635f61 --- /dev/null +++ b/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex1.xq @@ -0,0 +1,25 @@ +(: Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
:) + +(: Search Lucene Index :) +for $r in collection-from-index("target/tmp/indexFolder", "/dataCollection/data")/data +let $datetime := xs:dateTime(fn:data($r/date)) +where $r/station eq "GHCND:AS000000003" + and fn:year-from-dateTime($datetime) ge 2000 + and fn:month-from-dateTime($datetime) eq 3 + and fn:day-from-dateTime($datetime) eq 3 +return $r \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex2.xq ---------------------------------------------------------------------- diff --git a/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex2.xq b/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex2.xq new file mode 100644 index 0000000..bf19ee9 --- /dev/null +++ b/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex2.xq @@ -0,0 +1,24 @@ +(: Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. :) + +(: Search Lucene Index :) +(: Find all readings for hurricane force wind warning or extreme wind warning. :) +(: The warnings occur when the wind speed (AWND) exceeds 110 mph (49.1744 :) +(: meters per second). 
(Wind value is in tenths of a meter per second) :) +for $r in collection-from-index("target/tmp/indexFolder", "/dataCollection/data")/data +where $r/dataType eq "AWND" and xs:decimal($r/value) gt 491.744 +return $r http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex3.xq ---------------------------------------------------------------------- diff --git a/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex3.xq b/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex3.xq new file mode 100644 index 0000000..28cf019 --- /dev/null +++ b/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex3.xq @@ -0,0 +1,27 @@ +(: Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. :) + +(: Search Lucene Index :) +(: Find the annual precipitation (PRCP) for Seattle using the airport :) +(: station (US000000002) for 2002. 
:) +fn:sum( + for $r in collection-from-index("target/tmp/indexFolder", "/dataCollection/data")/data + where $r/station eq "GHCND:US000000002" + and $r/dataType eq "PRCP" + and fn:year-from-dateTime(xs:dateTime(fn:data($r/date))) eq 2002 + return $r/value +) div 10 http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex4.xq ---------------------------------------------------------------------- diff --git a/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex4.xq b/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex4.xq new file mode 100644 index 0000000..2b75cf4 --- /dev/null +++ b/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex4.xq @@ -0,0 +1,24 @@ +(: Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. :) + +(: Search Lucene Index :) +(: Find the highest recorded temperature (TMAX) in Celsius. 
:) +fn:max( + for $r in collection-from-index("target/tmp/indexFolder", "/dataCollection/data")/data + where $r/dataType eq "TMAX" + return $r/value +) div 10 http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex5.xq ---------------------------------------------------------------------- diff --git a/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex5.xq b/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex5.xq new file mode 100644 index 0000000..e83484a --- /dev/null +++ b/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex5.xq @@ -0,0 +1,23 @@ +(: Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. :) + +(: Search Lucene Index :) +(: Find all the weather readings for Washington state for a specific day :) +(: 2002-2-2. 
:) +for $r in collection-from-index("target/tmp/indexFolder", "/dataCollection/data")/data +where xs:dateTime(fn:data($r/date)) eq xs:dateTime("2002-02-02T00:00:00.000") +return $r http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex6.xq ---------------------------------------------------------------------- diff --git a/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex6.xq b/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex6.xq new file mode 100644 index 0000000..04f6672 --- /dev/null +++ b/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex6.xq @@ -0,0 +1,23 @@ +(: Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. :) + +(: Search Lucene Index :) +(: Find all the weather stations that have a state (ST) location label :) +(: whose display name is "State 1", ignoring case. 
:) +for $s in collection-from-index("target/tmp/indexFolder", "/stationCollection/station")/station +where (some $x in $s/locationLabels satisfies ($x/type eq "ST" and fn:upper-case(fn:data($x/displayName)) eq "STATE 1")) +return $s http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex7.xq ---------------------------------------------------------------------- diff --git a/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex7.xq b/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex7.xq new file mode 100644 index 0000000..e471baa --- /dev/null +++ b/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/useIndex7.xq @@ -0,0 +1,27 @@ +(: Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. :) + +(: Search Lucene Index :) +(: Find all the weather readings for Washington state for a specific day :) +(: 2002-2-2. 
:) +for $s in collection-from-index("target/tmp/indexFolder", "/stationCollection/station")/station +for $r in collection-from-index("target/tmp/indexFolder", "/dataCollection/data")/data + +where $s/id eq $r/station + and (some $x in $s/locationLabels satisfies ($x/type eq "ST" and fn:upper-case(fn:data($x/displayName)) eq "STATE 1")) + and xs:dateTime(fn:data($r/date)) eq xs:dateTime("2002-02-02T00:00:00.000") +return $r http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-xtest/src/test/resources/VXQueryCatalog.xml ---------------------------------------------------------------------- diff --git a/vxquery-xtest/src/test/resources/VXQueryCatalog.xml b/vxquery-xtest/src/test/resources/VXQueryCatalog.xml index 076341e..5e6eb62 100644 --- a/vxquery-xtest/src/test/resources/VXQueryCatalog.xml +++ b/vxquery-xtest/src/test/resources/VXQueryCatalog.xml @@ -42,6 +42,8 @@ <!ENTITY HDFSAggregateQueries SYSTEM "cat/HDFSAggregateQueries.xml"> +<!ENTITY IndexingQueries SYSTEM "cat/IndexingQueries.xml"> + <!ENTITY JsonArrayQueries SYSTEM "cat/JsonArrayQueries.xml"> <!ENTITY JsonObjectQueries SYSTEM "cat/JsonObjectQueries.xml"> @@ -208,7 +210,20 @@ <title>Aggregate HDFS Execution Tests</title> <description/> </GroupInfo> - &HDFSAggregateQueries; + &HDFSAggregateQueries; + </test-group> + </test-group> + <test-group name="IndexingQueries" featureOwner="Steven Jacobs"> + <GroupInfo> + <title>Indexing Queries</title> + <description/> + </GroupInfo> + <test-group name="IndexingTests" featureOwner="Steven Jacobs"> + <GroupInfo> + <title>Indexing Execution Tests</title> + <description/> + </GroupInfo> + &IndexingQueries; </test-group> </test-group> <test-group name="JsoniqQueries" featureOwner="Christina Pavlopoulou"> http://git-wip-us.apache.org/repos/asf/vxquery/blob/bf475170/vxquery-xtest/src/test/resources/cat/IndexingQueries.xml ---------------------------------------------------------------------- diff --git 
a/vxquery-xtest/src/test/resources/cat/IndexingQueries.xml b/vxquery-xtest/src/test/resources/cat/IndexingQueries.xml new file mode 100644 index 0000000..1f8291d --- /dev/null +++ b/vxquery-xtest/src/test/resources/cat/IndexingQueries.xml @@ -0,0 +1,63 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<test-group xmlns="http://www.w3.org/2005/02/query-test-XQTSCatalog" name="IndexingQueries" featureOwner="VXQuery"> + <GroupInfo> + <title>Indexing</title> + <description/> + </GroupInfo> + <test-case name="create-index" FilePath="Indexing/" Creator="Steven Jacobs"> + <description>Create Lucene Index from Collection.</description> + <query name="createIndex" date="2016-05-26"/> + <output-file compare="Text">createIndex.txt</output-file> + </test-case> + <test-case name="use-index-1" FilePath="Indexing/" Creator="Steven Jacobs"> + <description>Get Collection From Lucene Index</description> + <query name="useIndex1" date="2016-05-26"/> + <output-file compare="Text">useIndex1.txt</output-file> + </test-case> + <test-case name="use-index-2" FilePath="Indexing/" Creator="Steven Jacobs"> + <description>Get Collection From Lucene Index</description> + <query name="useIndex2" date="2016-05-26"/> + <output-file compare="Text">useIndex2.txt</output-file> + </test-case> + <test-case name="use-index-3" FilePath="Indexing/" Creator="Steven Jacobs"> + <description>Get Collection From Lucene Index</description> + <query name="useIndex3" date="2016-05-26"/> + <output-file compare="Text">useIndex3.txt</output-file> + </test-case> + <test-case name="use-index-4" FilePath="Indexing/" Creator="Steven Jacobs"> + <description>Get Collection From Lucene Index</description> + <query name="useIndex4" date="2016-05-26"/> + <output-file compare="Text">useIndex4.txt</output-file> + </test-case> + <test-case name="use-index-5" FilePath="Indexing/" Creator="Steven Jacobs"> + <description>Get Collection From Lucene Index</description> + <query name="useIndex5" date="2016-05-26"/> + <output-file compare="Text">useIndex5.txt</output-file> + </test-case> + <test-case name="use-index-6" FilePath="Indexing/" Creator="Steven Jacobs"> + <description>Get Collection From Lucene Index</description> + <query name="useIndex6" date="2016-05-26"/> + <output-file 
compare="Text">useIndex6.txt</output-file> + </test-case> + <test-case name="use-index-7" FilePath="Indexing/" Creator="Steven Jacobs"> + <description>Get Collection From Lucene Index</description> + <query name="useIndex7" date="2016-05-26"/> + <output-file compare="Text">useIndex7.txt</output-file> + </test-case> +</test-group>
