Repository: vxquery Updated Branches: refs/heads/master 9e4e99050 -> 303899f10
VXQUERY-198 Added Update Index Statement. Adds a metadata file for tracking status of Lucene Index. update-index uses this file to update document entries. Handles file changes, deletes, and adds. Includes query test. Author: Menaka (menakaj) Project: http://git-wip-us.apache.org/repos/asf/vxquery/repo Commit: http://git-wip-us.apache.org/repos/asf/vxquery/commit/303899f1 Tree: http://git-wip-us.apache.org/repos/asf/vxquery/tree/303899f1 Diff: http://git-wip-us.apache.org/repos/asf/vxquery/diff/303899f1 Branch: refs/heads/master Commit: 303899f107e7314f8b264e95c347c73987d2a01b Parents: 9e4e990 Author: Steven Glenn Jacobs <[email protected]> Authored: Mon Jun 27 11:42:07 2016 -0700 Committer: Steven Glenn Jacobs <[email protected]> Committed: Mon Jun 27 11:42:07 2016 -0700 ---------------------------------------------------------------------- .gitignore | 1 + .../vxquery/functions/builtin-functions.xml | 8 + .../vxquery/index/IndexDocumentBuilder.java | 7 +- ...ctionFromIndexUnnestingEvaluatorFactory.java | 2 +- .../functions/index/IndexConstructorUtil.java | 96 +++++-- .../index/IndexUpdaterEvaluatorFactory.java | 76 ++++++ .../functions/index/updateIndex/Constants.java | 26 ++ .../index/updateIndex/IndexUpdater.java | 264 +++++++++++++++++++ .../index/updateIndex/MetaFileUtil.java | 109 ++++++++ .../index/updateIndex/XmlMetadata.java | 57 ++++ .../Indexing/updateIndex.txt | 0 .../Queries/XQuery/Indexing/updateIndex.xq | 19 ++ .../src/test/resources/cat/IndexingQueries.xml | 5 + 13 files changed, 647 insertions(+), 23 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/vxquery/blob/303899f1/.gitignore ---------------------------------------------------------------------- diff --git a/.gitignore b/.gitignore index 65263b3..734a174 100644 --- a/.gitignore +++ b/.gitignore @@ -2,5 +2,6 @@ .settings .classpath .idea +.iml target /ClusterControllerService/ 
http://git-wip-us.apache.org/repos/asf/vxquery/blob/303899f1/vxquery-core/src/main/java/org/apache/vxquery/functions/builtin-functions.xml ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/functions/builtin-functions.xml b/vxquery-core/src/main/java/org/apache/vxquery/functions/builtin-functions.xml index adeef38..870ab75 100644 --- a/vxquery-core/src/main/java/org/apache/vxquery/functions/builtin-functions.xml +++ b/vxquery-core/src/main/java/org/apache/vxquery/functions/builtin-functions.xml @@ -135,6 +135,14 @@ <return type="node()*"/> <runtime type="scalar" class="org.apache.vxquery.runtime.functions.index.IndexConstructorScalarEvaluatorFactory"/> </function> + + <!-- fn:update-index($indexFolder as xs:string?) as node()* --> + <function name="fn:update-index"> + <param name="index-folder" type="xs:string?"/> + <return type="node()*"/> + <runtime type="scalar" + class="org.apache.vxquery.runtime.functions.index.IndexUpdaterEvaluatorFactory"/> + </function> <!-- fn:collection-from-index($indexfolder as xs:string?, $elementpath as xs:string?) 
as node()* --> <function name="fn:collection-from-index"> http://git-wip-us.apache.org/repos/asf/vxquery/blob/303899f1/vxquery-core/src/main/java/org/apache/vxquery/index/IndexDocumentBuilder.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/index/IndexDocumentBuilder.java b/vxquery-core/src/main/java/org/apache/vxquery/index/IndexDocumentBuilder.java index 2884097..bccd28d 100644 --- a/vxquery-core/src/main/java/org/apache/vxquery/index/IndexDocumentBuilder.java +++ b/vxquery-core/src/main/java/org/apache/vxquery/index/IndexDocumentBuilder.java @@ -56,6 +56,7 @@ import org.apache.vxquery.datamodel.accessors.nodes.NodeTreePointable; import org.apache.vxquery.datamodel.accessors.nodes.TextOrCommentNodePointable; import org.apache.vxquery.datamodel.values.ValueTag; import org.apache.vxquery.runtime.functions.cast.CastToStringOperation; +import org.apache.vxquery.runtime.functions.index.updateIndex.Constants; import org.apache.vxquery.serializer.XMLSerializer; public class IndexDocumentBuilder extends XMLSerializer { @@ -74,6 +75,7 @@ public class IndexDocumentBuilder extends XMLSerializer { private final int sstart; private final int lstart; private final IndexWriter writer; + private final String filePath; class ComplexItem { public final StringField sf; @@ -86,10 +88,12 @@ public class IndexDocumentBuilder extends XMLSerializer { } //TODO: Handle Processing Instructions, PrefixedNames, and Namepsace entries - public IndexDocumentBuilder(IPointable tree, IndexWriter inWriter) { + public IndexDocumentBuilder(IPointable tree, IndexWriter inWriter, String file) { this.treePointable = tree; writer = inWriter; + this.filePath = file; + //convert to tagged value pointable TaggedValuePointable tvp = (TaggedValuePointable) TaggedValuePointable.FACTORY.createPointable(); tvp.set(treePointable.getByteArray(), 0, treePointable.getLength()); @@ -109,6 +113,7 @@ public class IndexDocumentBuilder 
extends XMLSerializer { //This is a wrapper to start indexing using the functions adapted from XMLSerializer public void printStart() throws IOException { + doc.add(new StringField(Constants.FIELD_PATH, filePath, Field.Store.YES)); print(bstart, sstart, lstart, "0", ""); for (int i = 1; i < results.size() - 1; i++) { //TODO: Since each doc is a file, http://git-wip-us.apache.org/repos/asf/vxquery/blob/303899f1/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CollectionFromIndexUnnestingEvaluatorFactory.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CollectionFromIndexUnnestingEvaluatorFactory.java b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CollectionFromIndexUnnestingEvaluatorFactory.java index cf0b203..9bd6b92 100644 --- a/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CollectionFromIndexUnnestingEvaluatorFactory.java +++ b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/CollectionFromIndexUnnestingEvaluatorFactory.java @@ -136,7 +136,7 @@ public class CollectionFromIndexUnnestingEvaluatorFactory extends AbstractTagged } tvp1.getValue(stringIndexFolder); tvp2.getValue(stringElementPath); - //This whole loop is to get the string arguments, indefolder, elementpath, and match option + //This whole loop is to get the string arguments, indexFolder, elementPath, and match option try { // Get the list of files. 
bbis.setByteBuffer(ByteBuffer.wrap( http://git-wip-us.apache.org/repos/asf/vxquery/blob/303899f1/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/IndexConstructorUtil.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/IndexConstructorUtil.java b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/IndexConstructorUtil.java index 7191827..ed409f1 100644 --- a/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/IndexConstructorUtil.java +++ b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/IndexConstructorUtil.java @@ -16,13 +16,6 @@ */ package org.apache.vxquery.runtime.functions.index; -import java.io.DataInputStream; -import java.io.File; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.file.Paths; -import java.util.Arrays; - import org.apache.hyracks.data.std.api.IPointable; import org.apache.hyracks.data.std.primitive.UTF8StringPointable; import org.apache.hyracks.data.std.util.ArrayBackedValueStorage; @@ -39,15 +32,32 @@ import org.apache.vxquery.datamodel.values.ValueTag; import org.apache.vxquery.exceptions.ErrorCode; import org.apache.vxquery.exceptions.SystemException; import org.apache.vxquery.index.IndexDocumentBuilder; +import org.apache.vxquery.runtime.functions.index.updateIndex.Constants; +import org.apache.vxquery.runtime.functions.index.updateIndex.MetaFileUtil; +import org.apache.vxquery.runtime.functions.index.updateIndex.XmlMetadata; import org.apache.vxquery.runtime.functions.util.FunctionHelper; import org.apache.vxquery.xmlparser.ITreeNodeIdProvider; import org.apache.vxquery.xmlparser.XMLParser; +import java.io.DataInputStream; +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.file.Paths; +import java.security.NoSuchAlgorithmException; +import java.util.Arrays; +import 
java.util.concurrent.ConcurrentHashMap; + public class IndexConstructorUtil { + static boolean isMetaFilePresent = false; + static MetaFileUtil metaFileUtil; + static ConcurrentHashMap<String, XmlMetadata> metadataMap = new ConcurrentHashMap<>(); + public static void evaluate(TaggedValuePointable[] args, IPointable result, UTF8StringPointable stringp, - ByteBufferInputStream bbis, DataInputStream di, SequenceBuilder sb, ArrayBackedValueStorage abvs, - ITreeNodeIdProvider nodeIdProvider, ArrayBackedValueStorage abvsFileNode, TaggedValuePointable nodep, - boolean isElementPath, String nodeId) throws SystemException { + ByteBufferInputStream bbis, DataInputStream di, SequenceBuilder sb, + ArrayBackedValueStorage abvs, ITreeNodeIdProvider nodeIdProvider, + ArrayBackedValueStorage abvsFileNode, TaggedValuePointable nodep, + boolean isElementPath, String nodeId) throws SystemException { String collectionFolder; String indexFolder; TaggedValuePointable collectionTVP = args[0]; @@ -69,6 +79,10 @@ public class IndexConstructorUtil { bbis.setByteBuffer(ByteBuffer.wrap(Arrays.copyOfRange(stringp.getByteArray(), stringp.getStartOffset(), stringp.getLength() + stringp.getStartOffset())), 0); indexFolder = di.readUTF(); + + metaFileUtil = MetaFileUtil.create(indexFolder); + isMetaFilePresent = metaFileUtil.isMetaFilePresent(); + } catch (IOException e) { throw new SystemException(ErrorCode.SYSE0001, e); } @@ -95,6 +109,16 @@ public class IndexConstructorUtil { indexXmlFiles(collectionDirectory, writer, isElementPath, nodep, abvsFileNode, nodeIdProvider, sb, bbis, di, nodeId); + if (!isMetaFilePresent) { + // Add collection information to the map. + XmlMetadata data = new XmlMetadata(); + data.setPath(collectionFolder); + metadataMap.put(Constants.COLLECTION_ENTRY, data); + + // Write metadata map to a file. + metaFileUtil.writeMetaFile(metadataMap); + } + //This makes write slower but search faster. 
writer.forceMerge(1); @@ -111,25 +135,34 @@ public class IndexConstructorUtil { * it indexes that document node. */ public static void indexXmlFiles(File collectionDirectory, IndexWriter writer, boolean isElementPath, - TaggedValuePointable nodep, ArrayBackedValueStorage abvsFileNode, ITreeNodeIdProvider nodeIdProvider, - SequenceBuilder sb, ByteBufferInputStream bbis, DataInputStream di, String nodeId) - throws SystemException, IOException { + TaggedValuePointable nodep, ArrayBackedValueStorage abvsFileNode, + ITreeNodeIdProvider nodeIdProvider, SequenceBuilder sb, + ByteBufferInputStream bbis, DataInputStream di, String nodeId) + throws SystemException, IOException { + + for (File file : collectionDirectory.listFiles()) { if (readableXmlFile(file.getPath())) { abvsFileNode.reset(); - // Get the document node - XMLParser parser = new XMLParser(false, nodeIdProvider, nodeId); - FunctionHelper.readInDocFromString(file.getPath(), bbis, di, abvsFileNode, parser); - nodep.set(abvsFileNode.getByteArray(), abvsFileNode.getStartOffset(), abvsFileNode.getLength()); - - //Add the document to the index - //Creates one lucene doc per file - IndexDocumentBuilder ibuilder = new IndexDocumentBuilder(nodep, writer); + IndexDocumentBuilder ibuilder = getIndexBuilder(file, writer, nodep, abvsFileNode, nodeIdProvider, + bbis, di, nodeId); ibuilder.printStart(); + if (!isMetaFilePresent) { + XmlMetadata xmlMetadata = new XmlMetadata(); + xmlMetadata.setPath(file.getCanonicalPath()); + xmlMetadata.setFileName(file.getName()); + try { + xmlMetadata.setMd5(metaFileUtil.generateMD5(file)); + } catch (NoSuchAlgorithmException e) { + throw new SystemException(ErrorCode.SYSE0001, e); + } + metadataMap.put(file.getCanonicalPath(), xmlMetadata); + } + } else if (file.isDirectory()) { // Consider all XML file in sub directories. 
indexXmlFiles(file, writer, isElementPath, nodep, abvsFileNode, nodeIdProvider, sb, bbis, di, nodeId); @@ -141,4 +174,25 @@ public class IndexConstructorUtil { return (path.toLowerCase().endsWith(".xml") || path.toLowerCase().endsWith(".xml.gz")); } + + /** + * Separated from create index method so that it could be used as a helper function in IndexUpdater + */ + public static IndexDocumentBuilder getIndexBuilder(File file, IndexWriter writer, + TaggedValuePointable nodep, ArrayBackedValueStorage abvsFileNode, + ITreeNodeIdProvider nodeIdProvider, + ByteBufferInputStream bbis, DataInputStream di, String nodeId) + throws IOException { + + //Get the document node + XMLParser parser = new XMLParser(false, nodeIdProvider, nodeId); + FunctionHelper.readInDocFromString(file.getPath(), bbis, di, abvsFileNode, parser); + + nodep.set(abvsFileNode.getByteArray(), abvsFileNode.getStartOffset(), abvsFileNode.getLength()); + + //Add the document to the index + //Creates one lucene doc per file + return new IndexDocumentBuilder(nodep, writer, file.getCanonicalPath()); + } + } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/303899f1/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/IndexUpdaterEvaluatorFactory.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/IndexUpdaterEvaluatorFactory.java b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/IndexUpdaterEvaluatorFactory.java new file mode 100644 index 0000000..0231f3d --- /dev/null +++ b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/IndexUpdaterEvaluatorFactory.java @@ -0,0 +1,76 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. 
+* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +package org.apache.vxquery.runtime.functions.index; + +import org.apache.hyracks.algebricks.common.exceptions.AlgebricksException; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory; +import org.apache.hyracks.api.context.IHyracksTaskContext; +import org.apache.hyracks.data.std.api.IPointable; +import org.apache.hyracks.data.std.primitive.UTF8StringPointable; +import org.apache.hyracks.data.std.util.ArrayBackedValueStorage; +import org.apache.hyracks.dataflow.common.comm.util.ByteBufferInputStream; +import org.apache.vxquery.datamodel.accessors.TaggedValuePointable; +import org.apache.vxquery.datamodel.builders.sequence.SequenceBuilder; +import org.apache.vxquery.exceptions.SystemException; +import org.apache.vxquery.runtime.functions.base.AbstractTaggedValueArgumentScalarEvaluator; +import org.apache.vxquery.runtime.functions.base.AbstractTaggedValueArgumentScalarEvaluatorFactory; +import org.apache.vxquery.runtime.functions.index.updateIndex.IndexUpdater; +import org.apache.vxquery.xmlparser.ITreeNodeIdProvider; +import org.apache.vxquery.xmlparser.TreeNodeIdProvider; + +import java.io.DataInputStream; +import java.io.IOException; +import java.security.NoSuchAlgorithmException; + +/** + * Update the index of collection + */ +public class IndexUpdaterEvaluatorFactory extends 
AbstractTaggedValueArgumentScalarEvaluatorFactory { + public IndexUpdaterEvaluatorFactory(IScalarEvaluatorFactory[] args) { + super(args); + } + + @Override + protected IScalarEvaluator createEvaluator(IHyracksTaskContext ctx, IScalarEvaluator[] args) throws AlgebricksException { + final ArrayBackedValueStorage abvs = new ArrayBackedValueStorage(); + final UTF8StringPointable stringp = (UTF8StringPointable) UTF8StringPointable.FACTORY.createPointable(); + final TaggedValuePointable nodep = (TaggedValuePointable) TaggedValuePointable.FACTORY.createPointable(); + final ByteBufferInputStream bbis = new ByteBufferInputStream(); + final DataInputStream di = new DataInputStream(bbis); + final SequenceBuilder sb = new SequenceBuilder(); + final ArrayBackedValueStorage abvsFileNode = new ArrayBackedValueStorage(); + final int partition = ctx.getTaskAttemptId().getTaskId().getPartition(); + final String nodeId = ctx.getJobletContext().getApplicationContext().getNodeId(); + final ITreeNodeIdProvider nodeIdProvider = new TreeNodeIdProvider((short) partition); + + return new AbstractTaggedValueArgumentScalarEvaluator(args) { + + @Override + protected void evaluate(TaggedValuePointable[] args, IPointable result) throws SystemException { + IndexUpdater updater = new IndexUpdater(args, result, stringp, bbis, di, sb, abvs, nodeIdProvider, + abvsFileNode, nodep, nodeId); + try { + updater.evaluate(); + } catch (IOException | NoSuchAlgorithmException e) { + e.printStackTrace(); + } + } + + }; + } +} http://git-wip-us.apache.org/repos/asf/vxquery/blob/303899f1/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/updateIndex/Constants.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/updateIndex/Constants.java b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/updateIndex/Constants.java new file mode 100644 index 0000000..321d348 --- 
/dev/null +++ b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/updateIndex/Constants.java @@ -0,0 +1,26 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +package org.apache.vxquery.runtime.functions.index.updateIndex; + +/** + * Constants used in updating index + */ +public class Constants { + public static String FIELD_PATH = "path"; + public static String META_FILE_NAME = "metaFile.file"; + public static String COLLECTION_ENTRY = "collection"; +} http://git-wip-us.apache.org/repos/asf/vxquery/blob/303899f1/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/updateIndex/IndexUpdater.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/updateIndex/IndexUpdater.java b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/updateIndex/IndexUpdater.java new file mode 100644 index 0000000..11621a7 --- /dev/null +++ b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/updateIndex/IndexUpdater.java @@ -0,0 +1,264 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. 
See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +package org.apache.vxquery.runtime.functions.index.updateIndex; + +import org.apache.hyracks.data.std.api.IPointable; +import org.apache.hyracks.data.std.primitive.UTF8StringPointable; +import org.apache.hyracks.data.std.util.ArrayBackedValueStorage; +import org.apache.hyracks.dataflow.common.comm.util.ByteBufferInputStream; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.vxquery.datamodel.accessors.TaggedValuePointable; +import org.apache.vxquery.datamodel.builders.sequence.SequenceBuilder; +import org.apache.vxquery.datamodel.values.ValueTag; +import org.apache.vxquery.exceptions.ErrorCode; +import org.apache.vxquery.exceptions.SystemException; +import org.apache.vxquery.index.IndexDocumentBuilder; +import org.apache.vxquery.runtime.functions.index.CaseSensitiveAnalyzer; +import org.apache.vxquery.runtime.functions.index.IndexConstructorUtil; +import org.apache.vxquery.xmlparser.ITreeNodeIdProvider; + +import java.io.DataInputStream; +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import 
java.nio.file.Paths; +import java.security.NoSuchAlgorithmException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; + +/** + * Update the index if the source files are changed. + */ +public class IndexUpdater { + private MetaFileUtil metaFileUtil; + private ConcurrentHashMap<String, XmlMetadata> metadataMap; + private TaggedValuePointable[] args; + private IPointable result; + private UTF8StringPointable stringp; + private ByteBufferInputStream bbis; + private DataInputStream di; + private SequenceBuilder sb; + private ArrayBackedValueStorage abvs; + private ITreeNodeIdProvider nodeIdProvider; + private ArrayBackedValueStorage abvsFileNode; + private TaggedValuePointable nodep; + private String nodeId; + private IndexWriter indexWriter; + private Set<String> pathsFromFileList; + private Logger LOGGER = Logger.getLogger("Index Updater"); + + //TODO : Implement for paralleizing + public IndexUpdater(TaggedValuePointable[] args, IPointable result, UTF8StringPointable stringp, + ByteBufferInputStream bbis, DataInputStream di, SequenceBuilder sb, ArrayBackedValueStorage abvs, + ITreeNodeIdProvider nodeIdProvider, ArrayBackedValueStorage abvsFileNode, + TaggedValuePointable nodep, String nodeId) { + this.args = args; + this.result = result; + this.stringp = stringp; + this.bbis = bbis; + this.di = di; + this.sb = sb; + this.abvs = abvs; + this.nodeIdProvider = nodeIdProvider; + this.abvsFileNode = abvsFileNode; + this.nodep = nodep; + this.nodeId = nodeId; + this.pathsFromFileList = new HashSet<>(); + } + + public void evaluate() throws SystemException, IOException, NoSuchAlgorithmException { + String collectionFolder; + String indexFolder; + TaggedValuePointable indexTVP = args[0]; + + if (indexTVP.getTag() != ValueTag.XS_STRING_TAG) { + throw new SystemException(ErrorCode.FORG0006); + } + + XmlMetadata collectionMetadata; + try { + // Get the index folder + indexTVP.getValue(stringp); + 
bbis.setByteBuffer(ByteBuffer.wrap(Arrays.copyOfRange(stringp.getByteArray(), stringp.getStartOffset(), + stringp.getLength() + stringp.getStartOffset())), 0); + indexFolder = di.readUTF(); + + // Read the metadata file and load the metadata map into memory. + metaFileUtil = MetaFileUtil.create(indexFolder); + metadataMap = metaFileUtil.readMetaFile(); + + // Retrieve the collection folder path. + // Remove the entry for ease of the next steps. + collectionMetadata = metadataMap.remove(Constants.COLLECTION_ENTRY); + collectionFolder = collectionMetadata.getPath(); + + } catch (IOException | ClassNotFoundException e) { + throw new SystemException(ErrorCode.SYSE0001, e); + } + + File collectionDirectory = new File(collectionFolder); + if (!collectionDirectory.exists()) { + throw new RuntimeException("The collection directory (" + collectionFolder + ") does not exist."); + } + + abvs.reset(); + sb.reset(abvs); + + Directory fsdir = FSDirectory.open(Paths.get(indexFolder)); + indexWriter = new IndexWriter(fsdir, new IndexWriterConfig(new CaseSensitiveAnalyzer()). + setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)); + + //Execute update index process + updateIndex(collectionDirectory); + + //Detect deleted files and execute the delete index process. + deleteIndexOfDeletedFiles(metadataMap.keySet(), pathsFromFileList); + + // Add collection path entry back + metadataMap.put(Constants.COLLECTION_ENTRY, collectionMetadata); + + //Write the updated metadata to the file. + metaFileUtil.writeMetaFile(metadataMap); + + indexWriter.forceMerge(1); + + indexWriter.close(); + + sb.finish(); + result.set(abvs); + } + + /** + * Check the collection for changes. 
+ * If changes are detected, update the index + * + * @param collection : Collection folder path + */ + private void updateIndex(File collection) throws IOException, NoSuchAlgorithmException { + + File[] list = collection.listFiles(); + + assert list != null; + for (File file : list) { + pathsFromFileList.add(file.getCanonicalPath()); + if (IndexConstructorUtil.readableXmlFile(file.getCanonicalPath())) { + XmlMetadata data = metadataMap.get(file.getCanonicalPath()); + String md5 = metaFileUtil.generateMD5(file); + + abvsFileNode.reset(); + + IndexDocumentBuilder indexDocumentBuilder; + if (data != null) { + + // This case checks whether the file has been changed. + // If the file has changed, delete the existing document, create a new index document and add it + // to the current index. + // At the same time, update the metadata for the file. + if (!md5.equals(data.getMd5())) { + + //Update index corresponding to the xml file. + indexWriter.deleteDocuments(new Term(Constants.FIELD_PATH, file.getCanonicalPath())); + indexDocumentBuilder = IndexConstructorUtil.getIndexBuilder(file, indexWriter, + nodep, abvsFileNode, nodeIdProvider, bbis, di, nodeId); + indexDocumentBuilder.printStart(); + + if (LOGGER.isDebugEnabled()) + LOGGER.log(Level.DEBUG, "New Index is created for updated file " + file.getCanonicalPath()); + + //Update the metadata map. + XmlMetadata metadata = updateEntry(file, data); + metadataMap.replace(file.getCanonicalPath(), metadata); + + } + } else { + + // In this case, the xml file has not added to the index. (It is a newly added file) + // Therefore generate a new index for this file and add it to the existing index. 
+ indexDocumentBuilder = IndexConstructorUtil.getIndexBuilder(file, indexWriter, + nodep, abvsFileNode, nodeIdProvider, bbis, di, nodeId); + indexDocumentBuilder.printStart(); + + if (LOGGER.isDebugEnabled()) + LOGGER.log(Level.DEBUG, "New Index is created for newly added file " + file.getCanonicalPath()); + + XmlMetadata metadata = updateEntry(file, null); + metadataMap.put(file.getCanonicalPath(), metadata); + } + } else if (file.isDirectory()) { + updateIndex(file); + } + } + } + + + /** + * Update the current XmlMetadata object related to the currently reading XML file. + * + * @param file : XML file + * @param metadata : Existing metadata object + * @return : XML metadata object with updated fields. + * @throws IOException + * @throws NoSuchAlgorithmException + */ + public XmlMetadata updateEntry(File file, XmlMetadata metadata) throws IOException, NoSuchAlgorithmException { + + if (metadata == null) + metadata = new XmlMetadata(); + + metadata.setPath(file.getCanonicalPath()); + metadata.setFileName(file.getName()); + metadata.setMd5(metaFileUtil.generateMD5(file)); + return metadata; + } + + /** + * Delete the index of deleted files. + * + * @param pathsFromMap : Set of paths taken from metafile. + * @param pathsFromFileList : Set of paths taken from list of existing files. + * @throws IOException + */ + public void deleteIndexOfDeletedFiles(Set<String> pathsFromMap, Set<String> pathsFromFileList) throws IOException { + Set<String> sfm = new HashSet<>(pathsFromMap); + + // If any file has been deleted from the collection, the number of files stored in metadata is higher than + // the actual number of files. + // With set difference, the paths of deleted files are taken from the stored metadata. + // Delete the corresponding indexes of each file from the index and as well as remove the entry from the + // metadata file. 
+ + if (sfm.size() > pathsFromFileList.size()) { + sfm.removeAll(pathsFromFileList); + + for (String s : sfm) { + metadataMap.remove(s); + indexWriter.deleteDocuments(new Term(Constants.FIELD_PATH, s)); + if (LOGGER.isDebugEnabled()) + LOGGER.log(Level.DEBUG, "Index of the deleted file " + s + " was deleted from the index!"); + } + } + } + +} http://git-wip-us.apache.org/repos/asf/vxquery/blob/303899f1/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/updateIndex/MetaFileUtil.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/updateIndex/MetaFileUtil.java b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/updateIndex/MetaFileUtil.java new file mode 100644 index 0000000..97c9da7 --- /dev/null +++ b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/updateIndex/MetaFileUtil.java @@ -0,0 +1,109 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ +package org.apache.vxquery.runtime.functions.index.updateIndex; + +import org.apache.log4j.Level; +import org.apache.log4j.Logger; + +import javax.xml.bind.DatatypeConverter; +import java.io.*; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +/** + * Utility class for writing and reading the metadata file and for generating file checksums. + */ +public class MetaFileUtil { + + private File metaFile; + private Logger LOGGER = Logger.getLogger("MetadataFileUtil"); + + private MetaFileUtil(String indexFolder) { + this.metaFile = new File(indexFolder + "/" + Constants.META_FILE_NAME); + } + + public static MetaFileUtil create(String indexFolder) { + return new MetaFileUtil(indexFolder); + } + + /** + * Checks whether the metadata file exists. + * @return true if the metadata file is present + */ + public boolean isMetaFilePresent() { + return metaFile.exists(); + } + + /** + * Serialize the given map of XmlMetadata objects to the metadata file. + * If the metadata file is already present, it is deleted first. NOTE(review): the output streams are not closed when writeObject throws. 
+ * + * @param metadataMap : Map of XmlMetadata objects with String keys + * @throws IOException if the metadata file cannot be deleted or written + */ + public void writeMetaFile(ConcurrentHashMap<String, XmlMetadata> metadataMap) throws IOException { + if (this.isMetaFilePresent()) Files.delete(Paths.get(metaFile.getCanonicalPath())); + + FileOutputStream fileOutputStream = new FileOutputStream(this.metaFile); + ObjectOutputStream objectOutputStream = new ObjectOutputStream(fileOutputStream); + objectOutputStream.writeObject(metadataMap); + objectOutputStream.close(); + + if (LOGGER.isDebugEnabled()) + LOGGER.log(Level.DEBUG, "Writing metadata file completed successfully!"); + + } + + + /** + * Deserialize the map of XmlMetadata objects from the metadata file. NOTE(review): the input streams are not closed when readObject throws. + * + * @return : Map of XmlMetadata objects, copied into a new ConcurrentHashMap + * @throws IOException if the metadata file cannot be opened or read + * @throws ClassNotFoundException if the class of the serialized object cannot be found + */ + public ConcurrentHashMap<String, XmlMetadata> readMetaFile() throws IOException, ClassNotFoundException { + FileInputStream fin = new FileInputStream(this.metaFile); + ObjectInputStream ois = new ObjectInputStream(fin); + ConcurrentHashMap<String, XmlMetadata> metadataMap = new ConcurrentHashMap<>((Map<String, XmlMetadata>)ois + .readObject()) ; + ois.close(); + + return metadataMap; + + } + + /** + * Generate the MD5 checksum of the contents of a file as a hexadecimal string. The whole file is read into memory. + * + * @param file : File for which the checksum should be generated. 
+ * @return : MD5 checksum as a hexadecimal String + * @throws NoSuchAlgorithmException if no MD5 MessageDigest implementation is available + * @throws IOException if the file cannot be read + */ + public String generateMD5(File file) throws NoSuchAlgorithmException, IOException { + MessageDigest md = MessageDigest.getInstance("MD5"); + md.update(Files.readAllBytes(file.toPath())); + byte[] md5 = md.digest(); + return DatatypeConverter.printHexBinary(md5); + } + +} http://git-wip-us.apache.org/repos/asf/vxquery/blob/303899f1/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/updateIndex/XmlMetadata.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/updateIndex/XmlMetadata.java b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/updateIndex/XmlMetadata.java new file mode 100644 index 0000000..38f283f --- /dev/null +++ b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/index/updateIndex/XmlMetadata.java @@ -0,0 +1,57 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +package org.apache.vxquery.runtime.functions.index.updateIndex; + +import java.io.Serializable; + +/** + * Class to store metadata related to an XML file. 
+ * This contains + * - Path to the xml file + * - MD5 Checksum String + * - File name + */ +public class XmlMetadata implements Serializable{ /* NOTE(review): Serializable, but no explicit serialVersionUID is declared. */ + + /** Path to the xml file. */ private String path; + /** MD5 checksum string. */ private String md5; + /** File name. */ private String fileName; + + public String getPath() { + return path; + } + + public void setPath(String path) { + this.path = path; + } + + public String getMd5() { + return md5; + } + + public void setMd5(String md5) { + this.md5 = md5; + } + + public String getFileName() { + return fileName; + } + + public void setFileName(String fileName) { + this.fileName = fileName; + } +} http://git-wip-us.apache.org/repos/asf/vxquery/blob/303899f1/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/updateIndex.txt ---------------------------------------------------------------------- diff --git a/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/updateIndex.txt b/vxquery-xtest/src/test/resources/ExpectedTestResults/Indexing/updateIndex.txt new file mode 100644 index 0000000..e69de29 http://git-wip-us.apache.org/repos/asf/vxquery/blob/303899f1/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/updateIndex.xq ---------------------------------------------------------------------- diff --git a/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/updateIndex.xq b/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/updateIndex.xq new file mode 100644 index 0000000..061f1c1 --- /dev/null +++ b/vxquery-xtest/src/test/resources/Queries/XQuery/Indexing/updateIndex.xq @@ -0,0 +1,19 @@ +(: Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. :) + +(: Update Lucene Index :) +update-index("target/tmp/indexFolder") http://git-wip-us.apache.org/repos/asf/vxquery/blob/303899f1/vxquery-xtest/src/test/resources/cat/IndexingQueries.xml ---------------------------------------------------------------------- diff --git a/vxquery-xtest/src/test/resources/cat/IndexingQueries.xml b/vxquery-xtest/src/test/resources/cat/IndexingQueries.xml index c69a6b5..369dc82 100644 --- a/vxquery-xtest/src/test/resources/cat/IndexingQueries.xml +++ b/vxquery-xtest/src/test/resources/cat/IndexingQueries.xml @@ -60,4 +60,9 @@ <query name="useIndex7" date="2016-05-26"/> <output-file compare="Text">useIndex7.txt</output-file> </test-case> + <test-case name="update-index" FilePath="Indexing/" Creator="Menaka Jayawardena"> + <description>Update the existing index</description> + <query name="updateIndex" date="2016-06-24"/> + <output-file compare="Text">updateIndex.txt</output-file> + </test-case> </test-group>
