Revision: 17186
http://sourceforge.net/p/gate/code/17186
Author: valyt
Date: 2013-12-18 16:03:06 +0000 (Wed, 18 Dec 2013)
Log Message:
-----------
Starting to draft the new updateable index API.
Modified Paths:
--------------
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
Added Paths:
-----------
mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
Removed Paths:
-------------
mimir/branches/5.0/mimir-core/src/gate/mimir/Index.java
Deleted: mimir/branches/5.0/mimir-core/src/gate/mimir/Index.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/Index.java 2013-12-18
15:26:26 UTC (rev 17185)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/Index.java 2013-12-18
16:03:06 UTC (rev 17186)
@@ -1,56 +0,0 @@
-/*
- * Index.java
- *
- * Copyright (c) 2007-2013, The University of Sheffield.
- *
- * This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
- * and is free software, licenced under the GNU Lesser General Public License,
- * Version 3, June 2007 (also included with this distribution as file
- * LICENCE-LGPL3.html).
- *
- * Valentin Tablan, 30 Oct 2013
- *
- * $Id$
- */
-package gate.mimir;
-
-import java.io.File;
-
-/**
- * A Mímir index which can index document and answer queries. This class is the
- * main entry point to the Mímir API.
- *
- * A Mímir index comprises the following data elements:
- * <ul>
- * <li>A datastore used by the semantic annotation helpers</li>
- * <li>one or more sub-indexes</li>
- * <li>the zip collection containing the document textual content and
- * metadata</li>
- * </ul>
- */
-public class Index {
-
- /**
- * The {@link IndexConfig} used for this index.
- */
- protected IndexConfig indexConfig;
-
- /**
- * Create a new Index.
- * @param indexConfig the configuration for the index.
- */
- public Index(IndexConfig indexConfig) {
- super();
- this.indexConfig = indexConfig;
- }
-
- /**
- * Open and existing Index.
- * @param indexDirectory the on-disk directory containing the index to be
- * opened.
- */
- public Index(File indexDirectory ) {
- // TODO
- }
-
-}
Copied: mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java (from rev
17042, mimir/branches/5.0/mimir-core/src/gate/mimir/Index.java)
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
(rev 0)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
2013-12-18 16:03:06 UTC (rev 17186)
@@ -0,0 +1,65 @@
+/*
+ * MimirIndex.java
+ *
+ * Copyright (c) 2007-2013, The University of Sheffield.
+ *
+ * This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
+ * and is free software, licenced under the GNU Lesser General Public License,
+ * Version 3, June 2007 (also included with this distribution as file
+ * LICENCE-LGPL3.html).
+ *
+ * Valentin Tablan, 30 Oct 2013
+ *
+ * $Id$
+ */
+package gate.mimir;
+
+import gate.mimir.index.AtomicIndex;
+import gate.mimir.index.Indexer;
+import gate.mimir.search.QueryEngine;
+
+import java.io.File;
+
+/**
+ * A Mímir index which can index documents and answer queries. This class is the
+ * main entry point to the Mímir API.
+ *
+ * A Mímir index is a compound index comprising the following data elements:
+ * <ul>
+ * <li>A datastore used by the semantic annotation helpers</li>
+ * <li>one or more indirect sub-indexes</li>
+ * <li>one or more direct sub-indexes (optional)</li>
+ * <li>the zip collection containing the document textual content and
+ * metadata</li>
+ * </ul>
+ */
+public class MimirIndex {
+
+ /**
+ * The {@link IndexConfig} used for this index.
+ */
+ protected IndexConfig indexConfig;
+
+ /**
+ * Create a new Index.
+ * @param indexConfig the configuration for the index.
+ */
+ public MimirIndex(IndexConfig indexConfig) {
+ super();
+ this.indexConfig = indexConfig;
+ }
+
+ /**
+ * Open an existing Index.
+ * @param indexDirectory the on-disk directory containing the index to be
+ * opened.
+ */
+ public MimirIndex(File indexDirectory ) {
+ // TODO
+ }
+
+ protected AtomicIndex[] tokenIndexes;
+
+ protected AtomicIndex[] mentionIndexes;
+
+}
Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
2013-12-18 15:26:26 UTC (rev 17185)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
2013-12-18 16:03:06 UTC (rev 17186)
@@ -14,35 +14,145 @@
*/
package gate.mimir.index;
-import it.unimi.di.big.mg4j.index.IndexIterator;
+import gate.Document;
+import gate.mimir.MimirIndex;
import it.unimi.di.big.mg4j.index.IndexReader;
+import java.io.File;
+
/**
* An indirect index associating terms with documents. Terms can be either token
- * feature values, or semantic annotation URIs.
+ * feature values, or semantic annotation URIs. Optionally, a direct index may
+ * also be present.
+ *
+ * An atomic index manages a head index (the principal data) and a set of tail
+ * indexes (batches containing updates). Additionally, the data representing
+ * all the new documents that have been queued for indexing since the last tail
+ * was written are stored in RAM.
*/
-public interface AtomicIndex {
+public abstract class AtomicIndex {
/**
- * Adds a new posting (term - document association) to the index.
- * @param termId the term being indexed
- * @param documentId the document in which the term occurs.
- * @param position the position in the document where the term occurs.
+ * The file name (under the current directory for this atomic index) which
+ * stores the principal index.
*/
- public void addPosting(long termId, long documentId, int position);
+ public static final String HEAD_FILE_NAME = "head";
/**
- * Searches the index.
- * @param termId the term being sought
- * @return an {@link IndexIterator} that can return the results of the search.
+ * The file extension used for the temporary directory where the updated head
+ * is being built.
*/
- public IndexIterator search(long termId);
+ public static final String HEAD_NEW_EXT = ".new";
/**
- * Returns an {@link IndexReader} that can be used to search the index. The
- * returned index reader is not guaranteed to reflect any changes to the index
- * that occur after the reader has been returned.
- * @return an {@link IndexReader}
+ * The file extension used for the temporary directory where the old head
+ * index is being stored while the newly updated one is being installed.
*/
- public IndexReader getReader();
+ public static final String HEAD_OLD_EXT = ".old";
+
+ /**
+ * The prefix used for file names (under the current directory for this
+ * atomic index) for updates to the head index.
+ */
+ public static final String TAIL_FILE_NAME_PREFIX = "tail-";
+
+
+
+ /**
+ * The file name (under the current directory for this atomic index) for the
+ * directory containing the documents that have been queued for indexing, but
+ * not yet indexed.
+ */
+ public static final String DOCUMENTs_QUEUE_FILE_NAME = "queued-documents";
+
+ /**
+ * The name of this atomic index.
+ */
+ protected String name;
+
+ protected File indexDirectory;
+
+ /**
+ * The number of occurrences stored in this index.
+ */
+ protected long occurrences;
+
+ /**
+ * The number of occurrences represented in RAM and not yet written to disk.
+ */
+ protected long newOccurrences;
+
+ /**
+ * The {@link MimirIndex} that this atomic index is a member of.
+ */
+ protected MimirIndex parent;
+
+ protected boolean hasDirectIndex;
+
+ /**
+ * Gets the name of this atomic index. This is used as the file name for the
+ * directory storing the index files.
+ * @return
+ */
+ public String getName() {
+ return name;
+ }
+
+ /**
+ * Is a direct index configured for this atomic index.
+ * @return
+ */
+ public boolean hasDirectIndex(){
+ return hasDirectIndex;
+ }
+
+ public void indexDocument(Document document) {
+ //TODO
+
+ // write to documents queue
+
+ // convert to index data in RAM
+
+ }
+
+ /**
+ * Writes all the data currently stored in RAM to a new tail index.
+ */
+ public void writeNewTail() {
+ //TODO
+ // dump new tail
+
+ // merge new tail into index cluster
+
+ if(hasDirectIndex) {
+ // dump new direct tail (invert the tail just written)
+ // merge new direct tail into direct index cluster
+ }
+
+ // clear queued-documents folder
+
+ newOccurrences = 0;
+ }
+
+
+ /**
+ * Combines all the currently existing tails into the head, generating a new
+ * head index.
+ */
+ public void combineTails() {
+ // TODO
+
+ // create new head directory
+
+ // start combining the head, and each of the tails, writing out to the
+ // new head dir.
+
+
+ }
+
+ public IndexReader getIndexReader() {
+ // TODO
+ return null;
+ }
+
}
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Rapidly troubleshoot problems before they affect your business. Most IT
organizations don't have a clear picture of how application performance
affects their revenue. With AppDynamics, you get 100% visibility into your
Java,.NET, & PHP application. Start your 15-day FREE TRIAL of AppDynamics Pro!
http://pubads.g.doubleclick.net/gampad/clk?id=84349831&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs