Revision: 15948
http://gate.svn.sourceforge.net/gate/?rev=15948&view=rev
Author: valyt
Date: 2012-07-17 12:40:41 +0000 (Tue, 17 Jul 2012)
Log Message:
-----------
More work on term queries:
- Multi-document queries (And and Or) using MG4J operators
- reengineered the class hierarchy, to extract common work into a new abstract
class
- implemented the counts support using MG4J visitors implementations.
Modified Paths:
--------------
mimir/trunk/mimir-core/src/gate/mimir/search/terms/AbstractTermsQuery.java
mimir/trunk/mimir-core/src/gate/mimir/search/terms/DocumentTermsQuery.java
mimir/trunk/mimir-core/src/gate/mimir/search/terms/TermsQuery.java
Added Paths:
-----------
mimir/trunk/mimir-core/src/gate/mimir/search/terms/AbstractIndexTermsQuery.java
mimir/trunk/mimir-core/src/gate/mimir/search/terms/DocumentsAndTermsQuery.java
mimir/trunk/mimir-core/src/gate/mimir/search/terms/DocumentsOrTermsQuery.java
Added:
mimir/trunk/mimir-core/src/gate/mimir/search/terms/AbstractIndexTermsQuery.java
===================================================================
---
mimir/trunk/mimir-core/src/gate/mimir/search/terms/AbstractIndexTermsQuery.java
(rev 0)
+++
mimir/trunk/mimir-core/src/gate/mimir/search/terms/AbstractIndexTermsQuery.java
2012-07-17 12:40:41 UTC (rev 15948)
@@ -0,0 +1,171 @@
+/*
+ * AbstractIndexTermsQuery.java
+ *
+ * Copyright (c) 2007-2011, The University of Sheffield.
+ *
+ * This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
+ * and is free software, licenced under the GNU Lesser General Public License,
+ * Version 3, June 2007 (also included with this distribution as file
+ * LICENCE-LGPL3.html).
+ *
+ * Valentin Tablan, 17 Jul 2012
+ *
+ * $Id$
+ */
+package gate.mimir.search.terms;
+
+import gate.mimir.search.IndexReaderPool;
+import gate.mimir.search.QueryEngine;
+import gate.mimir.search.QueryEngine.IndexType;
+import it.unimi.dsi.big.mg4j.search.DocumentIterator;
+import it.unimi.dsi.big.mg4j.search.visitor.CounterCollectionVisitor;
+import it.unimi.dsi.big.mg4j.search.visitor.CounterSetupVisitor;
+import it.unimi.dsi.big.mg4j.search.visitor.TermCollectionVisitor;
+import it.unimi.dsi.fastutil.ints.IntArrayList;
+import it.unimi.dsi.fastutil.longs.LongArrayList;
+import it.unimi.dsi.fastutil.objects.ObjectArrayList;
+
+import java.io.IOException;
+
+/**
+ * Base class for terms queries that use an MG4J index for their search.
+ * <p>
+ * Subclasses are expected to call {@link #prepare(QueryEngine)} first, to look
+ * up the index pools, and may then pass a {@link DocumentIterator} to
+ * {@link #buildResultSet(DocumentIterator)} to collect the results.
+ */
+public abstract class AbstractIndexTermsQuery extends AbstractTermsQuery {
+
+  /**
+   * The name of the subindex in which the terms are sought. Each Mímir
+   * index includes multiple sub-indexes (some storing tokens, other storing
+   * annotations), identified by a name. For token indexes, the index name is
+   * the name of the token feature being indexed; for annotation indexes, the
+   * index name is the annotation type.
+   */
+  protected final String indexName;
+
+  /**
+   * The type of index being searched (tokens or annotations).
+   */
+  protected final IndexType indexType;
+
+  /**
+   * The direct index used for executing the query. This value is non-null
+   * only if a direct index was configured as part of the Mímir index being
+   * searched.
+   */
+  protected IndexReaderPool directIndexPool;
+
+  /**
+   * The indirect index used for executing the query.
+   */
+  protected IndexReaderPool indirectIndexPool;
+
+  /**
+   * The query engine used to execute this query.
+   */
+  protected QueryEngine engine;
+
+  /**
+   * Creates a new index-backed terms query.
+   *
+   * @param indexName the name of the sub-index in which terms are sought.
+   * @param indexType the type of the sub-index (tokens or annotations).
+   * @param stringsEnabled whether term strings are collected with the results.
+   * @param countsEnabled whether term counts are collected with the results.
+   * @param limit the maximum number of results to be returned.
+   */
+  public AbstractIndexTermsQuery(String indexName, IndexType indexType,
+      boolean stringsEnabled, boolean countsEnabled, int limit) {
+    super(stringsEnabled, countsEnabled, limit);
+    this.indexName = indexName;
+    this.indexType = indexType;
+  }
+
+  /**
+   * Stores the query engine and looks up the direct and indirect index pools
+   * for {@link #indexName}, according to {@link #indexType}.
+   *
+   * @param engine the {@link QueryEngine} used to execute the search.
+   * @throws IllegalArgumentException if the index type is not one of the
+   *         supported values.
+   */
+  protected void prepare(QueryEngine engine) {
+    this.engine = engine;
+    switch(indexType) {
+      case ANNOTATIONS:
+        directIndexPool = engine.getAnnotationDirectIndex(indexName);
+        indirectIndexPool = engine.getAnnotationIndex(indexName);
+        break;
+      case TOKENS:
+        directIndexPool = engine.getTokenDirectIndex(indexName);
+        indirectIndexPool = engine.getTokenIndex(indexName);
+        break;
+      default:
+        throw new IllegalArgumentException("Invalid index type: " +
+            indexType.toString());
+    }
+  }
+
+  /**
+   * Enumerates the supplied iterator and collects the values it returns as
+   * term IDs, together with term strings and counts when those are enabled.
+   * Collection stops once {@code limit} terms have been gathered; a negative
+   * limit is treated as {@link AbstractTermsQuery#NO_LIMIT}.
+   *
+   * @param documentIterator the iterator to enumerate; each value returned by
+   *        its {@code nextDocument()} method is recorded as a term ID.
+   * @return a {@link TermsResultSet} holding the collected term IDs, plus the
+   *         term strings and counts if enabled ({@code null} otherwise).
+   * @throws IOException if reading from the index fails.
+   */
+  protected TermsResultSet buildResultSet(DocumentIterator documentIterator)
+      throws IOException {
+    // With a negative limit the size check below would fail immediately and
+    // the result would always be empty. Treat it as "no limit" instead, so
+    // callers still passing the former NO_LIMIT value (-1) keep working.
+    final int maxTerms = limit < 0 ? NO_LIMIT : limit;
+    // prepare local data
+    LongArrayList termIds = new LongArrayList();
+    ObjectArrayList<String> termStrings = stringsEnabled ?
+        new ObjectArrayList<String>() : null;
+    IntArrayList termCounts = countsEnabled ? new IntArrayList() : null;
+    TermCollectionVisitor termCollectionVisitor = null;
+    CounterSetupVisitor counterSetupVisitor = null;
+    CounterCollectionVisitor counterCollectionVisitor = null;
+    if(countsEnabled) {
+      // set up the MG4J visitors used to obtain the per-term counts
+      termCollectionVisitor = new TermCollectionVisitor();
+      counterSetupVisitor = new CounterSetupVisitor(termCollectionVisitor);
+      counterCollectionVisitor =
+          new CounterCollectionVisitor(counterSetupVisitor);
+      termCollectionVisitor.prepare();
+      documentIterator.accept(termCollectionVisitor);
+      counterSetupVisitor.prepare();
+      documentIterator.accept(counterSetupVisitor);
+    }
+
+    long termId = documentIterator.nextDocument();
+    while(termId != DocumentIterator.END_OF_LIST && termId != -1 &&
+        termIds.size() < maxTerms) {
+      termIds.add(termId);
+      if(countsEnabled) {
+        // the count for this term is the sum of all the visitor's counters
+        counterSetupVisitor.clear();
+        documentIterator.acceptOnTruePaths(counterCollectionVisitor);
+        int count = 0;
+        for(int aCount : counterSetupVisitor.count) count += aCount;
+        termCounts.add(count);
+      }
+      if(stringsEnabled) termStrings.add(indirectIndexPool.getTerm(termId));
+      termId = documentIterator.nextDocument();
+    }
+    // construct the result
+    return new TermsResultSet(termIds.toLongArray(),
+        stringsEnabled ?
+            termStrings.toArray(new String[termStrings.size()]) : null,
+        null,
+        countsEnabled ? termCounts.toIntArray() : null);
+  }
+}
Property changes on:
mimir/trunk/mimir-core/src/gate/mimir/search/terms/AbstractIndexTermsQuery.java
___________________________________________________________________
Added: svn:mime-type
+ text/plain
Added: svn:keywords
+ Id
Added: svn:eol-style
+ native
Modified:
mimir/trunk/mimir-core/src/gate/mimir/search/terms/AbstractTermsQuery.java
===================================================================
--- mimir/trunk/mimir-core/src/gate/mimir/search/terms/AbstractTermsQuery.java
2012-07-17 01:18:52 UTC (rev 15947)
+++ mimir/trunk/mimir-core/src/gate/mimir/search/terms/AbstractTermsQuery.java
2012-07-17 12:40:41 UTC (rev 15948)
@@ -14,6 +14,15 @@
*/
package gate.mimir.search.terms;
+import java.io.IOException;
+
+import gate.mimir.search.IndexReaderPool;
+import it.unimi.dsi.big.mg4j.index.IndexIterator;
+import it.unimi.dsi.big.mg4j.search.DocumentIterator;
+import it.unimi.dsi.fastutil.ints.IntArrayList;
+import it.unimi.dsi.fastutil.longs.LongArrayList;
+import it.unimi.dsi.fastutil.objects.ObjectArrayList;
+
/**
* Base class for term queries.
*/
@@ -23,7 +32,7 @@
protected boolean countsEnabled;
- public static final int NO_LIMIT = -1;
+ public static final int NO_LIMIT = Integer.MAX_VALUE;
/**
* The maximum number of results to be returned.
*/
@@ -43,5 +52,5 @@
public AbstractTermsQuery() {
this(false, false, NO_LIMIT);
}
-
+
}
Modified:
mimir/trunk/mimir-core/src/gate/mimir/search/terms/DocumentTermsQuery.java
===================================================================
--- mimir/trunk/mimir-core/src/gate/mimir/search/terms/DocumentTermsQuery.java
2012-07-17 01:18:52 UTC (rev 15947)
+++ mimir/trunk/mimir-core/src/gate/mimir/search/terms/DocumentTermsQuery.java
2012-07-17 12:40:41 UTC (rev 15948)
@@ -14,45 +14,26 @@
*/
package gate.mimir.search.terms;
-import java.io.IOException;
-
-import it.unimi.dsi.big.mg4j.index.BitStreamIndex;
-import it.unimi.dsi.big.mg4j.index.Index;
-import it.unimi.dsi.big.mg4j.index.IndexIterator;
-import it.unimi.dsi.big.mg4j.index.IndexReader;
-import it.unimi.dsi.big.mg4j.search.DocumentIterator;
-import it.unimi.dsi.big.util.StringMap;
-import it.unimi.dsi.fastutil.ints.IntArrayList;
-import it.unimi.dsi.fastutil.longs.LongArrayList;
-import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import gate.mimir.index.mg4j.MimirDirectIndexBuilder;
import gate.mimir.search.IndexReaderPool;
import gate.mimir.search.QueryEngine;
+import gate.mimir.search.QueryEngine.IndexType;
+import it.unimi.dsi.big.mg4j.index.IndexIterator;
+import it.unimi.dsi.big.mg4j.index.IndexReader;
-import static gate.mimir.search.QueryEngine.IndexType;
+import java.io.IOException;
/**
* A {@link TermsQuery} that returns the terms occurring in a document.
*/
-public class DocumentTermsQuery extends AbstractTermsQuery {
+public class DocumentTermsQuery extends AbstractIndexTermsQuery {
/**
* The ID of the document for which the terms are being sought.
*/
protected final long documentId;
- /**
- * The name of the subindex in which the terms are sought. Each Mímir
- * index includes multiple sub-indexes (some storing tokens, other storing
- * annotations), identified by a name. For token indexes, the index name is
- * the name of the token feature being indexed; for annotation indexes, the
- * index name is the annotation type.
- */
- protected final String indexName;
-
- protected final IndexType indexType;
-
/**
* Creates a new document term query.
*
@@ -72,13 +53,11 @@
*
* @param limit the maximum number of results to be returned.
*/
- public DocumentTermsQuery(long documentId, String indexName,
+ public DocumentTermsQuery(String indexName,
IndexType indexType, boolean stringsEnabled, boolean countsEnabled,
- int limit) {
- super(stringsEnabled, countsEnabled, limit);
+ int limit, long documentId) {
+ super(indexName, indexType, stringsEnabled, countsEnabled, limit);
this.documentId = documentId;
- this.indexName = indexName;
- this.indexType = indexType;
}
/**
@@ -99,9 +78,9 @@
*
* @param indexType the type of the index being searched.
*/
- public DocumentTermsQuery(long documentId, String indexName,
- IndexType indexType) {
- this(documentId, indexName, indexType, false, true, NO_LIMIT);
+ public DocumentTermsQuery(String indexName, IndexType indexType,
+ long documentId) {
+ this(indexName, indexType, false, true, NO_LIMIT, documentId);
}
/**
@@ -123,59 +102,26 @@
*
* @param limit the maximum number of results to be returned.
*/
- public DocumentTermsQuery(long documentId, String indexName,
- IndexType indexType, int limit) {
- this(documentId, indexName, indexType, false, true, limit);
+ public DocumentTermsQuery(String indexName, IndexType indexType, int limit,
+ long documentId) {
+ this(indexName, indexType, false, true, limit, documentId);
}
+
+
/* (non-Javadoc)
- * @see gate.mimir.search.terms.TermQuery#execute()
+ * @see gate.mimir.search.terms.TermsQuery#execute(gate.mimir.search.QueryEngine)
*/
@Override
public TermsResultSet execute(QueryEngine engine) throws IOException {
- IndexReaderPool directIndexPool = null;
- IndexReaderPool indirectIndexPool = null;
- IndexReader indexReader = null;
+ prepare(engine);
+ IndexReader indexReader = null;
try{
- switch(indexType){
- case ANNOTATIONS:
- directIndexPool = engine.getAnnotationDirectIndex(indexName);
- indirectIndexPool = engine.getAnnotationIndex(indexName);
- break;
- case TOKENS:
- directIndexPool = engine.getTokenDirectIndex(indexName);
- indirectIndexPool = engine.getTokenIndex(indexName);
- break;
- default:
- throw new IllegalArgumentException("Invalid index type: " +
- indexType.toString());
- }
-
- // prepare local data
- LongArrayList termIds = new LongArrayList();
- ObjectArrayList<String> termStrings = stringsEnabled ?
- new ObjectArrayList<String>() : null;
- IntArrayList termCounts = countsEnabled ? new IntArrayList() : null;
- // start the actual search
indexReader = directIndexPool.borrowReader();
- IndexIterator results = indexReader.documents(
- MimirDirectIndexBuilder.longToTerm(documentId));
- long termId = results.nextDocument();
- while(termId != DocumentIterator.END_OF_LIST && termId != -1) {
- termIds.add(termId);
- if(countsEnabled) termCounts.add(results.count());
- if(stringsEnabled) termStrings.add(indirectIndexPool.getTerm(termId));
- termId = results.nextDocument();
- }
- // construct the result
- return new TermsResultSet(termIds.toLongArray(),
- stringsEnabled ? termStrings.toArray(new String[termStrings.size()]) : null,
- null,
- countsEnabled ? termCounts.toIntArray() : null);
+ return buildResultSet(
+ indexReader.documents(MimirDirectIndexBuilder.longToTerm(documentId)));
} finally {
- if(indexReader != null) {
- directIndexPool.returnReader(indexReader);
- }
+ if(indexReader != null) directIndexPool.returnReader(indexReader);
}
}
}
Added:
mimir/trunk/mimir-core/src/gate/mimir/search/terms/DocumentsAndTermsQuery.java
===================================================================
---
mimir/trunk/mimir-core/src/gate/mimir/search/terms/DocumentsAndTermsQuery.java
(rev 0)
+++
mimir/trunk/mimir-core/src/gate/mimir/search/terms/DocumentsAndTermsQuery.java
2012-07-17 12:40:41 UTC (rev 15948)
@@ -0,0 +1,114 @@
+/*
+ * DocumentsAndTermsQuery.java
+ *
+ * Copyright (c) 2007-2011, The University of Sheffield.
+ *
+ * This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
+ * and is free software, licenced under the GNU Lesser General Public License,
+ * Version 3, June 2007 (also included with this distribution as file
+ * LICENCE-LGPL3.html).
+ *
+ * Valentin Tablan, 17 Jul 2012
+ *
+ * $Id$
+ */
+package gate.mimir.search.terms;
+
+import gate.mimir.index.mg4j.MimirDirectIndexBuilder;
+import gate.mimir.search.QueryEngine;
+import gate.mimir.search.QueryEngine.IndexType;
+import it.unimi.dsi.big.mg4j.index.IndexIterator;
+import it.unimi.dsi.big.mg4j.index.IndexReader;
+import it.unimi.dsi.big.mg4j.search.AndDocumentIterator;
+
+import java.io.IOException;
+
+/**
+ * Find the terms that occur in <strong>all</strong> the documents in a given
+ * set.
+ */
+public class DocumentsAndTermsQuery extends AbstractIndexTermsQuery {
+
+  /**
+   * The document IDs for which the terms are sought.
+   */
+  protected long[] documentIds;
+
+  /**
+   * Creates a new query for the terms occurring in all the given documents.
+   *
+   * @param indexName the name of the sub-index in which terms are sought.
+   * @param indexType the type of the sub-index (tokens or annotations).
+   * @param stringsEnabled whether term strings are requested.
+   * @param countsEnabled whether term counts are requested.
+   * @param limit the maximum number of results to be returned.
+   * @param documentIds the IDs of the documents for which terms are sought.
+   */
+  public DocumentsAndTermsQuery(String indexName, IndexType indexType,
+      boolean stringsEnabled, boolean countsEnabled,
+      int limit, long... documentIds) {
+    super(indexName, indexType, stringsEnabled, countsEnabled, limit);
+    this.documentIds = documentIds;
+  }
+
+  /**
+   * Creates a new query with term strings and counts disabled, and no limit
+   * on the number of results.
+   *
+   * @param indexName the name of the sub-index in which terms are sought.
+   * @param indexType the type of the sub-index (tokens or annotations).
+   * @param documentIds the IDs of the documents for which terms are sought.
+   */
+  public DocumentsAndTermsQuery(String indexName, IndexType indexType,
+      long... documentIds) {
+    this(indexName, indexType, false, false, NO_LIMIT, documentIds);
+  }
+
+  /**
+   * Creates a new query with term strings and counts disabled.
+   *
+   * @param indexName the name of the sub-index in which terms are sought.
+   * @param indexType the type of the sub-index (tokens or annotations).
+   * @param limit the maximum number of results to be returned.
+   * @param documentIds the IDs of the documents for which terms are sought.
+   */
+  public DocumentsAndTermsQuery(String indexName, IndexType indexType,
+      int limit, long... documentIds) {
+    this(indexName, indexType, false, false, limit, documentIds);
+  }
+
+  /**
+   * Borrows one direct index reader per document ID, combines the resulting
+   * iterators with an {@link AndDocumentIterator} and collects the results.
+   * Borrowed readers are handed back to the pool in a finally block.
+   *
+   * @throws IllegalStateException if no direct index pool was found for the
+   *         requested sub-index.
+   * @see TermsQuery#execute(QueryEngine)
+   */
+  @Override
+  public TermsResultSet execute(QueryEngine engine) throws IOException {
+    prepare(engine);
+    // the direct index pool is only set when a direct index was configured
+    if(directIndexPool == null) {
+      throw new IllegalStateException("No direct index available for " +
+          indexType + " index \"" + indexName + "\"");
+    }
+    IndexReader[] indexReaders = new IndexReader[documentIds.length];
+    try {
+      IndexIterator[] iterators = new IndexIterator[documentIds.length];
+      for(int i = 0; i < documentIds.length; i++) {
+        indexReaders[i] = directIndexPool.borrowReader();
+        iterators[i] = indexReaders[i].documents(
+            MimirDirectIndexBuilder.longToTerm(documentIds[i]));
+      }
+      return buildResultSet(AndDocumentIterator.getInstance(iterators));
+    } finally {
+      // if borrowing failed part-way, the remaining slots are still null and
+      // must not be handed back to the pool
+      for(IndexReader reader : indexReaders) {
+        if(reader != null) directIndexPool.returnReader(reader);
+      }
+    }
+  }
+}
Property changes on:
mimir/trunk/mimir-core/src/gate/mimir/search/terms/DocumentsAndTermsQuery.java
___________________________________________________________________
Added: svn:mime-type
+ text/plain
Added: svn:keywords
+ Id
Added: svn:eol-style
+ native
Added:
mimir/trunk/mimir-core/src/gate/mimir/search/terms/DocumentsOrTermsQuery.java
===================================================================
---
mimir/trunk/mimir-core/src/gate/mimir/search/terms/DocumentsOrTermsQuery.java
(rev 0)
+++
mimir/trunk/mimir-core/src/gate/mimir/search/terms/DocumentsOrTermsQuery.java
2012-07-17 12:40:41 UTC (rev 15948)
@@ -0,0 +1,114 @@
+/*
+ * DocumentsOrTermsQuery.java
+ *
+ * Copyright (c) 2007-2011, The University of Sheffield.
+ *
+ * This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
+ * and is free software, licenced under the GNU Lesser General Public License,
+ * Version 3, June 2007 (also included with this distribution as file
+ * LICENCE-LGPL3.html).
+ *
+ * Valentin Tablan, 17 Jul 2012
+ *
+ * $Id$
+ */
+package gate.mimir.search.terms;
+
+import gate.mimir.index.mg4j.MimirDirectIndexBuilder;
+import gate.mimir.search.QueryEngine;
+import gate.mimir.search.QueryEngine.IndexType;
+import it.unimi.dsi.big.mg4j.index.IndexIterator;
+import it.unimi.dsi.big.mg4j.index.IndexReader;
+import it.unimi.dsi.big.mg4j.search.OrDocumentIterator;
+
+import java.io.IOException;
+
+/**
+ * Find the terms that occur in <strong>any</strong> of the documents in a
+ * given set.
+ */
+public class DocumentsOrTermsQuery extends AbstractIndexTermsQuery {
+
+  /**
+   * The document IDs for which the terms are sought.
+   */
+  protected long[] documentIds;
+
+  /**
+   * Creates a new query for the terms occurring in any of the given documents.
+   *
+   * @param indexName the name of the sub-index in which terms are sought.
+   * @param indexType the type of the sub-index (tokens or annotations).
+   * @param stringsEnabled whether term strings are requested.
+   * @param countsEnabled whether term counts are requested.
+   * @param limit the maximum number of results to be returned.
+   * @param documentIds the IDs of the documents for which terms are sought.
+   */
+  public DocumentsOrTermsQuery(String indexName, IndexType indexType,
+      boolean stringsEnabled, boolean countsEnabled,
+      int limit, long... documentIds) {
+    super(indexName, indexType, stringsEnabled, countsEnabled, limit);
+    this.documentIds = documentIds;
+  }
+
+  /**
+   * Creates a new query with term strings and counts disabled, and no limit
+   * on the number of results.
+   *
+   * @param indexName the name of the sub-index in which terms are sought.
+   * @param indexType the type of the sub-index (tokens or annotations).
+   * @param documentIds the IDs of the documents for which terms are sought.
+   */
+  public DocumentsOrTermsQuery(String indexName, IndexType indexType,
+      long... documentIds) {
+    this(indexName, indexType, false, false, NO_LIMIT, documentIds);
+  }
+
+  /**
+   * Creates a new query with term strings and counts disabled.
+   *
+   * @param indexName the name of the sub-index in which terms are sought.
+   * @param indexType the type of the sub-index (tokens or annotations).
+   * @param limit the maximum number of results to be returned.
+   * @param documentIds the IDs of the documents for which terms are sought.
+   */
+  public DocumentsOrTermsQuery(String indexName, IndexType indexType,
+      int limit, long... documentIds) {
+    this(indexName, indexType, false, false, limit, documentIds);
+  }
+
+  /**
+   * Borrows one direct index reader per document ID, combines the resulting
+   * iterators with an {@link OrDocumentIterator} and collects the results.
+   * Borrowed readers are handed back to the pool in a finally block.
+   *
+   * @throws IllegalStateException if no direct index pool was found for the
+   *         requested sub-index.
+   * @see TermsQuery#execute(QueryEngine)
+   */
+  @Override
+  public TermsResultSet execute(QueryEngine engine) throws IOException {
+    prepare(engine);
+    // the direct index pool is only set when a direct index was configured
+    if(directIndexPool == null) {
+      throw new IllegalStateException("No direct index available for " +
+          indexType + " index \"" + indexName + "\"");
+    }
+    IndexReader[] indexReaders = new IndexReader[documentIds.length];
+    try {
+      IndexIterator[] iterators = new IndexIterator[documentIds.length];
+      for(int i = 0; i < documentIds.length; i++) {
+        indexReaders[i] = directIndexPool.borrowReader();
+        iterators[i] = indexReaders[i].documents(
+            MimirDirectIndexBuilder.longToTerm(documentIds[i]));
+      }
+      return buildResultSet(OrDocumentIterator.getInstance(iterators));
+    } finally {
+      // if borrowing failed part-way, the remaining slots are still null and
+      // must not be handed back to the pool
+      for(IndexReader reader : indexReaders) {
+        if(reader != null) directIndexPool.returnReader(reader);
+      }
+    }
+  }
+}
Property changes on:
mimir/trunk/mimir-core/src/gate/mimir/search/terms/DocumentsOrTermsQuery.java
___________________________________________________________________
Added: svn:mime-type
+ text/plain
Added: svn:keywords
+ Id
Added: svn:eol-style
+ native
Modified: mimir/trunk/mimir-core/src/gate/mimir/search/terms/TermsQuery.java
===================================================================
--- mimir/trunk/mimir-core/src/gate/mimir/search/terms/TermsQuery.java
2012-07-17 01:18:52 UTC (rev 15947)
+++ mimir/trunk/mimir-core/src/gate/mimir/search/terms/TermsQuery.java
2012-07-17 12:40:41 UTC (rev 15948)
@@ -19,12 +19,13 @@
import gate.mimir.search.QueryEngine;
/**
- * A query that returns terms.
- * Term queries are fast, so they run synchronously.
+ * A query that returns terms. The terms returned must be sorted in ascending
+ * order of their term ID.
*/
public interface TermsQuery {
/**
* Runs the term query (in the calling thread) and returns the matched terms.
+ * The terms returned must be sorted in ascending order of their term ID.
* @return a {@link TermsResultSet} containing the matched terms.
* @param engine the {@link QueryEngine} used to execute the search.
* @throws IOException
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Live Security Virtual Conference
Exclusive live event will cover all the ways today's security and
threat landscape has changed and how IT managers can respond. Discussions
will include endpoint security, mobile security and the latest in malware
threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs