Author: shinichiro
Date: Sun Aug 2 09:29:34 2015
New Revision: 1693798
URL: http://svn.apache.org/r1693798
Log:
add multiprocess support with hdfs index
Added:
manifoldcf/branches/CONNECTORS-1219/connectors/lucene/hadoop-minicluster.xml
(with props)
Modified:
manifoldcf/branches/CONNECTORS-1219/build.xml
manifoldcf/branches/CONNECTORS-1219/connectors/lucene/build.xml
manifoldcf/branches/CONNECTORS-1219/connectors/lucene/connector/src/main/java/org/apache/manifoldcf/agents/output/lucene/LuceneClient.java
manifoldcf/branches/CONNECTORS-1219/connectors/lucene/connector/src/main/java/org/apache/manifoldcf/agents/output/lucene/LuceneClientManager.java
manifoldcf/branches/CONNECTORS-1219/connectors/lucene/connector/src/main/java/org/apache/manifoldcf/agents/output/lucene/LuceneConnector.java
manifoldcf/branches/CONNECTORS-1219/connectors/lucene/connector/src/test/java/org/apache/manifoldcf/agents/output/lucene/tests/LuceneClientTest.java
manifoldcf/branches/CONNECTORS-1219/connectors/lucene/pom.xml
Modified: manifoldcf/branches/CONNECTORS-1219/build.xml
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1219/build.xml?rev=1693798&r1=1693797&r2=1693798&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1219/build.xml (original)
+++ manifoldcf/branches/CONNECTORS-1219/build.xml Sun Aug 2 09:29:34 2015
@@ -1999,6 +1999,25 @@ Use Apache Forrest version forrest-0.9-d
<param name="artifact-name" value="lucene-sandbox"/>
<param name="artifact-type" value="jar"/>
</antcall>
+ <antcall target="download-via-maven">
+ <param name="target" value="lib"/>
+ <param name="project-path" value="org/apache/solr"/>
+ <param name="artifact-version" value="${lucene.version}"/>
+ <param name="artifact-name" value="solr-core"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven">
+ <param name="target" value="lib"/>
+ <param name="project-path"
value="com/googlecode/concurrentlinkedhashmap"/>
+ <param name="artifact-version" value="1.2"/>
+ <param name="artifact-name" value="concurrentlinkedhashmap-lru"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <mkdir dir="lib/hadoop-minicluster"/>
+ <copy file="connectors/lucene/hadoop-minicluster.xml"
toFile="lib/hadoop-minicluster/pom.xml"/>
+ <exec dir="lib/hadoop-minicluster" executable="mvn">
+ <arg line="dependency:copy-dependencies"/>
+ </exec>
</target>
<target name="make-core-deps"
depends="download-resteasy,download-jsoup,download-mockito,download-alfresco-webscript-plugin,download-alfresco-indexer-client,download-mongo-java-driver,download-jira-client,download-google-api-client,download-dropbox-client,download-solrj,download-zookeeper,download-httpcomponents,download-json,download-hsqldb,download-xerces,download-commons,download-elasticsearch-plugin,download-solr-plugins,download-sharepoint-plugins,download-jstl,download-xmlgraphics-commons,download-woodstox,download-xmlsec,download-xml-apis,download-wss4j,download-velocity,download-streambuffer,download-stax,download-servlet-api,download-xml-resolver,download-osgi,download-opensaml,download-mimepull,download-mail,download-log4j,download-junit,download-jaxws,download-glassfish,download-jaxb,download-tomcat,download-h2,download-h2-support,download-geronimo-specs,download-fop,download-postgresql,download-axis,download-saaj,download-wsdl4j,download-castor,download-jetty,downloa
d-slf4j,download-xalan,download-activation,download-avalon-framework,download-poi,download-chemistry,download-ecj,download-hadoop,download-htrace,download-protobuf,download-tika,download-jackson,download-lucene">
Modified: manifoldcf/branches/CONNECTORS-1219/connectors/lucene/build.xml
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1219/connectors/lucene/build.xml?rev=1693798&r1=1693797&r2=1693798&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1219/connectors/lucene/build.xml (original)
+++ manifoldcf/branches/CONNECTORS-1219/connectors/lucene/build.xml Sun Aug 2
09:29:34 2015
@@ -40,6 +40,22 @@
<include name="lucene-sandbox*.jar"/>
<include name="guava-*.jar"/>
<include name="gson*.jar"/>
+ <include name="solr-core*.jar"/>
+ <include name="concurrentlinkedhashmap-lru*.jar"/>
+ <include name="solr-solrj*.jar"/>
+ <include name="hadoop-common*.jar"/>
+ <include name="hadoop-annotations*.jar"/>
+ <include name="hadoop-auth*.jar"/>
+ <include name="hadoop-hdfs*.jar"/>
+ <include name="htrace-core*.jar"/>
+ <include name="protobuf-java*.jar"/>
+ </fileset>
+ </path>
+
+ <path id="connector-test-classpath">
+ <path refid="mcf-connector-build.connector-test-classpath"/>
+ <fileset dir="../../lib/hadoop-minicluster/target/dependency">
+ <include name="*.jar"/>
</fileset>
</path>
@@ -54,6 +70,15 @@
<include name="lucene-sandbox*.jar"/>
<include name="guava-*.jar"/>
<include name="gson*.jar"/>
+ <include name="solr-core*.jar"/>
+ <include name="concurrentlinkedhashmap-lru*.jar"/>
+ <include name="solr-solrj*.jar"/>
+ <include name="hadoop-common*.jar"/>
+ <include name="hadoop-annotations*.jar"/>
+ <include name="hadoop-auth*.jar"/>
+ <include name="hadoop-hdfs*.jar"/>
+ <include name="htrace-core*.jar"/>
+ <include name="protobuf-java*.jar"/>
</fileset>
</copy>
</target>
Modified:
manifoldcf/branches/CONNECTORS-1219/connectors/lucene/connector/src/main/java/org/apache/manifoldcf/agents/output/lucene/LuceneClient.java
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1219/connectors/lucene/connector/src/main/java/org/apache/manifoldcf/agents/output/lucene/LuceneClient.java?rev=1693798&r1=1693797&r2=1693798&view=diff
==============================================================================
---
manifoldcf/branches/CONNECTORS-1219/connectors/lucene/connector/src/main/java/org/apache/manifoldcf/agents/output/lucene/LuceneClient.java
(original)
+++
manifoldcf/branches/CONNECTORS-1219/connectors/lucene/connector/src/main/java/org/apache/manifoldcf/agents/output/lucene/LuceneClient.java
Sun Aug 2 09:29:34 2015
@@ -20,7 +20,6 @@ import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.Type;
-import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@@ -28,6 +27,8 @@ import java.util.Locale;
import java.util.Map;
import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
@@ -47,6 +48,14 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.NRTCachingDirectory;
+import org.apache.solr.store.blockcache.BlockCache;
+import org.apache.solr.store.blockcache.BlockDirectory;
+import org.apache.solr.store.blockcache.BlockDirectoryCache;
+import org.apache.solr.store.blockcache.BufferStore;
+import org.apache.solr.store.blockcache.Cache;
+import org.apache.solr.store.blockcache.Metrics;
+import org.apache.solr.store.hdfs.HdfsDirectory;
+import org.apache.solr.store.hdfs.HdfsLockFactory;
import com.google.common.base.Joiner;
import com.google.common.base.Objects;
@@ -60,7 +69,7 @@ import com.google.gson.reflect.TypeToken
public class LuceneClient implements Closeable {
- private final Path path;
+ private final String path;
private final Map<String,Map<String,Object>> charfiltersInfo;
private final Map<String,Map<String,Object>> tokenizersInfo;
private final Map<String,Map<String,Object>> filtersInfo;
@@ -115,7 +124,7 @@ public class LuceneClient implements Clo
}
}
- public LuceneClient(Path path) throws IOException {
+ public LuceneClient(String path) throws IOException {
this(path,
LuceneClient.defaultCharfilters(), LuceneClient.defaultTokenizers(),
LuceneClient.defaultFilters(),
LuceneClient.defaultAnalyzers(), LuceneClient.defaultFields(),
@@ -123,7 +132,7 @@ public class LuceneClient implements Clo
LuceneClient.defaultMaxDocumentLength());
}
- public LuceneClient(Path path,
+ public LuceneClient(String path,
String charfilters, String tokenizers, String filters,
String analyzers, String fields,
String idField, String contentField,
@@ -153,10 +162,9 @@ public class LuceneClient implements Clo
.setCommitOnClose(IndexWriterConfig.DEFAULT_COMMIT_ON_CLOSE)
.setRAMBufferSizeMB(IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB * 6);
- Directory fsDir = FSDirectory.open(path);
- NRTCachingDirectory cachedDir = new NRTCachingDirectory(fsDir, 4, 48);
+ Directory dir = initDirectory();
- this.writer = new IndexWriter(cachedDir, config);
+ this.writer = new IndexWriter(dir, config);
initIndex();
@@ -251,10 +259,68 @@ public class LuceneClient implements Clo
return copy;
}
+ public static boolean useHdfs(String path) {
+ return path.startsWith("hdfs:/");
+ }
+
+ private static BlockCache globalBlockCache;
+
+ private Directory initDirectory() throws IOException {
+ Directory directory;
+
+ if (!useHdfs(path)) {
+ Directory fsDir = FSDirectory.open(new File(path).toPath());
+ directory = new NRTCachingDirectory(fsDir, 4, 48);
+ } else {
+ Directory dir;
+
+ Configuration conf = new Configuration();
+ conf.setBoolean("fs.hdfs.impl.disable.cache", true);
+
+ Metrics metrics = new Metrics();
+
+ boolean blockCacheEnabled = true;
+ if (blockCacheEnabled) {
+ boolean blockCacheGlobal = true;
+ boolean blockCacheReadEnabled = true;
+
+ int numberOfBlocksPerBank = 16384;
+ int blockSize = BlockDirectory.BLOCK_SIZE;
+ int bankCount = 1;
+ boolean directAllocation = true;
+ int slabSize = numberOfBlocksPerBank * blockSize;
+ int bufferSize = 128;
+ int bufferCount = 128 * 128;
+
+ synchronized (LuceneClient.class) {
+ if (globalBlockCache == null) {
+ BufferStore.initNewBuffer(bufferSize, bufferCount, metrics);
+
+ long totalMemory = (long) bankCount * (long) numberOfBlocksPerBank
* (long) blockSize;
+ globalBlockCache = new BlockCache(metrics, directAllocation,
totalMemory, slabSize, blockSize);
+ }
+ }
+
+ Cache cache = new BlockDirectoryCache(globalBlockCache, path, metrics,
blockCacheGlobal);
+ HdfsDirectory hdfsDir = new HdfsDirectory(new Path(path),
HdfsLockFactory.INSTANCE, conf);
+ dir = new BlockDirectory(path, hdfsDir, cache, null,
blockCacheReadEnabled, false);
+ } else {
+ dir = new HdfsDirectory(new Path(path), HdfsLockFactory.INSTANCE,
conf);
+ }
+ directory = new NRTCachingDirectory(dir, 16, 192);
+ }
+ return directory;
+ }
+
private void initIndex() throws IOException {
- File dirFile = path.toFile();
- boolean indexExists = dirFile.canRead() && dirFile.list().length > 1;
- if (!indexExists) writer.commit();
+ if (!useHdfs(path)) {
+ File dirFile = new File(path);
+ boolean indexExists = dirFile.canRead() && dirFile.list().length > 1;
+ if (!indexExists) writer.commit();
+ } else {
+ writer.commit();
+ refresh();
+ }
}
public Map<String,Map<String,Object>> fieldsInfo() {
@@ -278,7 +344,7 @@ public class LuceneClient implements Clo
}
public static String createVersionString(
- Path path,
+ String path,
Map<String,Map<String,Object>> charfiltersInfo,
Map<String,Map<String,Object>> tokenizersInfo,
Map<String,Map<String,Object>> filtersInfo,
@@ -286,7 +352,7 @@ public class LuceneClient implements Clo
Map<String,Map<String,Object>> fieldsInfo,
String idField,String contentField,
Long maxDocumentLength) {
- return LuceneConfig.PARAM_PATH + ":" + path.toString() + "+"
+ return LuceneConfig.PARAM_PATH + ":" + path + "+"
+ LuceneConfig.PARAM_CHARFILTERS + ":" +
Joiner.on(",").withKeyValueSeparator("=").join(charfiltersInfo) + "+"
+ LuceneConfig.PARAM_TOKENIZERS + ":" +
Joiner.on(",").withKeyValueSeparator("=").join(tokenizersInfo) + "+"
+ LuceneConfig.PARAM_FILTERS + ":" +
Joiner.on(",").withKeyValueSeparator("=").join(filtersInfo) + "+"
@@ -371,10 +437,12 @@ public class LuceneClient implements Clo
}
public LeafReader reader() throws IOException {
+ // The caller is responsible for ensuring that the returned reader is
closed.
return
SlowCompositeReaderWrapper.wrap(DirectoryReader.open(writer.getDirectory()));
}
public IndexSearcher newSearcher() throws IOException {
+ // The caller is responsible for ensuring that returned searcher's reader
is closed.
return new IndexSearcher(DirectoryReader.open(writer.getDirectory()));
}
Modified:
manifoldcf/branches/CONNECTORS-1219/connectors/lucene/connector/src/main/java/org/apache/manifoldcf/agents/output/lucene/LuceneClientManager.java
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1219/connectors/lucene/connector/src/main/java/org/apache/manifoldcf/agents/output/lucene/LuceneClientManager.java?rev=1693798&r1=1693797&r2=1693798&view=diff
==============================================================================
---
manifoldcf/branches/CONNECTORS-1219/connectors/lucene/connector/src/main/java/org/apache/manifoldcf/agents/output/lucene/LuceneClientManager.java
(original)
+++
manifoldcf/branches/CONNECTORS-1219/connectors/lucene/connector/src/main/java/org/apache/manifoldcf/agents/output/lucene/LuceneClientManager.java
Sun Aug 2 09:29:34 2015
@@ -1,6 +1,5 @@
package org.apache.manifoldcf.agents.output.lucene;
-import java.io.File;
import java.util.Map;
import com.google.common.collect.Maps;
@@ -12,24 +11,31 @@ public class LuceneClientManager {
private LuceneClientManager() { }
public synchronized static LuceneClient getClient(
- String path,
+ String path, String processID,
String charfilters, String tokenizers, String filters,
String analyzers, String fields,
String idField, String contentField,
Long maxDocumentLength) throws Exception
{
- LuceneClient client = clients.get(path);
+ String paramPath;
+ if (!LuceneClient.useHdfs(path)) {
+ paramPath = path;
+ } else {
+ paramPath = (processID.equals("")) ? path : path + "/" + processID;
+ }
+
+ LuceneClient client = clients.get(paramPath);
if (client == null) {
- return newClient(path, charfilters, tokenizers, filters, analyzers,
fields, idField, contentField, maxDocumentLength);
+ return newClient(paramPath, charfilters, tokenizers, filters, analyzers,
fields, idField, contentField, maxDocumentLength);
}
if (client != null) {
if (!client.isOpen()) {
- return newClient(path, charfilters, tokenizers, filters, analyzers,
fields, idField, contentField, maxDocumentLength);
+ return newClient(paramPath, charfilters, tokenizers, filters,
analyzers, fields, idField, contentField, maxDocumentLength);
}
String latestVersion = LuceneClient.createVersionString(
- new File(path).toPath(),
+ paramPath,
LuceneClient.parseAsMap(charfilters),
LuceneClient.parseAsMap(tokenizers),
LuceneClient.parseAsMap(filters),
@@ -53,9 +59,16 @@ public class LuceneClientManager {
String idField, String contentField,
Long maxDocumentLength) throws Exception
{
- LuceneClient client = new LuceneClient(new File(path).toPath(),
- charfilters, tokenizers, filters, analyzers, fields,
- idField, contentField, maxDocumentLength);
+ ClassLoader cl = Thread.currentThread().getContextClassLoader();
+ LuceneClient client;
+ try {
+
Thread.currentThread().setContextClassLoader(LuceneClientManager.class.getClassLoader());
+ client = new LuceneClient(path,
+ charfilters, tokenizers, filters, analyzers, fields,
+ idField, contentField, maxDocumentLength);
+ } finally {
+ Thread.currentThread().setContextClassLoader(cl);
+ }
clients.put(path, client);
return client;
}
Modified:
manifoldcf/branches/CONNECTORS-1219/connectors/lucene/connector/src/main/java/org/apache/manifoldcf/agents/output/lucene/LuceneConnector.java
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1219/connectors/lucene/connector/src/main/java/org/apache/manifoldcf/agents/output/lucene/LuceneConnector.java?rev=1693798&r1=1693797&r2=1693798&view=diff
==============================================================================
---
manifoldcf/branches/CONNECTORS-1219/connectors/lucene/connector/src/main/java/org/apache/manifoldcf/agents/output/lucene/LuceneConnector.java
(original)
+++
manifoldcf/branches/CONNECTORS-1219/connectors/lucene/connector/src/main/java/org/apache/manifoldcf/agents/output/lucene/LuceneConnector.java
Sun Aug 2 09:29:34 2015
@@ -36,6 +36,7 @@ import org.apache.manifoldcf.core.interf
import org.apache.manifoldcf.core.interfaces.IThreadContext;
import org.apache.manifoldcf.core.interfaces.ManifoldCFException;
import org.apache.manifoldcf.core.interfaces.VersionContext;
+import org.apache.manifoldcf.core.system.ManifoldCF;
import org.apache.manifoldcf.crawler.system.Logging;
@@ -166,7 +167,7 @@ public class LuceneConnector extends org
try
{
- client = LuceneClientManager.getClient(path,
+ client = LuceneClientManager.getClient(path, ManifoldCF.getProcessID(),
charfilters, tokenizers, filters, analyzers, fields,
idField, contentField, maxDocumentLength);
}
Modified:
manifoldcf/branches/CONNECTORS-1219/connectors/lucene/connector/src/test/java/org/apache/manifoldcf/agents/output/lucene/tests/LuceneClientTest.java
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1219/connectors/lucene/connector/src/test/java/org/apache/manifoldcf/agents/output/lucene/tests/LuceneClientTest.java?rev=1693798&r1=1693797&r2=1693798&view=diff
==============================================================================
---
manifoldcf/branches/CONNECTORS-1219/connectors/lucene/connector/src/test/java/org/apache/manifoldcf/agents/output/lucene/tests/LuceneClientTest.java
(original)
+++
manifoldcf/branches/CONNECTORS-1219/connectors/lucene/connector/src/test/java/org/apache/manifoldcf/agents/output/lucene/tests/LuceneClientTest.java
Sun Aug 2 09:29:34 2015
@@ -24,7 +24,10 @@ import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.lucene.document.Document;
+import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
@@ -32,8 +35,8 @@ import org.apache.lucene.search.IndexSea
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.BytesRef;
-
import org.apache.manifoldcf.agents.interfaces.RepositoryDocument;
import org.apache.manifoldcf.agents.output.lucene.LuceneClient;
import org.apache.manifoldcf.agents.output.lucene.LuceneClientManager;
@@ -45,8 +48,11 @@ import com.google.common.base.StandardSy
import com.google.common.io.ByteSource;
import org.junit.After;
+import org.junit.AfterClass;
import org.junit.Before;
+import org.junit.BeforeClass;
import org.junit.Test;
+
import static org.junit.Assert.*;
import static org.hamcrest.CoreMatchers.*;
@@ -55,9 +61,29 @@ public class LuceneClientTest {
private static final String sep =
StandardSystemProperty.FILE_SEPARATOR.value();
private File testDir;
+ static {
+ System.setProperty("org.apache.commons.logging.Log",
"org.apache.commons.logging.impl.NoOpLog");
+ }
+ private static MiniDFSCluster hdfsCluster;
+ private static String hdfsPath;
+
private static final String ID = LuceneClient.defaultIdField();
private static final String CONTENT = LuceneClient.defaultContentField();
+ @BeforeClass
+ public static void beforeClass() throws IOException {
+ Configuration conf = new Configuration();
+ conf.setBoolean("fs.hdfs.impl.disable.cache", true);
+ MiniDFSCluster.Builder builder = new MiniDFSCluster.Builder(conf);
+ hdfsCluster = builder.build();
+ hdfsPath = "hdfs://localhost:" + hdfsCluster.getNameNodePort() +
"/HdfsTest";
+ }
+
+ @AfterClass
+ public static void afterClass() {
+ hdfsCluster.shutdown();
+ }
+
@Before
public void setUp() {
String root = getClass().getResource("/").getFile();
@@ -90,7 +116,7 @@ public class LuceneClientTest {
String path = testDir.getAbsolutePath()+sep+"tmp"+sep+"openclose-index";
File f = new File(path);
assertThat(f.exists(), is(false));
- LuceneClient client = new LuceneClient(f.toPath());
+ LuceneClient client = new LuceneClient(path);
assertThat(f.exists(), is(true));
assertThat(client.isOpen(), is(true));
client.close();
@@ -100,7 +126,7 @@ public class LuceneClientTest {
@Test
public void testInitIndexDir() throws IOException {
String path = testDir.getAbsolutePath()+sep+"tmp"+sep+"initindexdir-index";
- LuceneClient client = new LuceneClient(new File(path).toPath());
+ LuceneClient client = new LuceneClient(path);
List<String> indexDirList = Arrays.asList(new File(path).list());
assertThat(indexDirList.size(), is(2));
assertThat(indexDirList.contains("write.lock"), is(true));
@@ -108,6 +134,7 @@ public class LuceneClientTest {
IndexSearcher searcher = client.newSearcher();
assertThat(searcher.count(new MatchAllDocsQuery()), is(0));
+ searcher.getIndexReader().close();
assertThat(client.newRealtimeSearcher().count(new MatchAllDocsQuery()),
is(0));
client.close();
@@ -118,12 +145,12 @@ public class LuceneClientTest {
String path =
testDir.getAbsolutePath()+sep+"tmp"+sep+"getclientfrommanager-index";
LuceneClient client1 =
- LuceneClientManager.getClient(path, LuceneClient.defaultCharfilters(),
LuceneClient.defaultTokenizers(), LuceneClient.defaultFilters(),
LuceneClient.defaultAnalyzers(), LuceneClient.defaultFields(),
+ LuceneClientManager.getClient(path, ManifoldCF.getProcessID(),
LuceneClient.defaultCharfilters(), LuceneClient.defaultTokenizers(),
LuceneClient.defaultFilters(), LuceneClient.defaultAnalyzers(),
LuceneClient.defaultFields(),
LuceneClient.defaultIdField(), LuceneClient.defaultContentField(),
LuceneClient.defaultMaxDocumentLength());
assertThat(client1.isOpen(), is(true));
LuceneClient client2 =
- LuceneClientManager.getClient(path, LuceneClient.defaultCharfilters(),
LuceneClient.defaultTokenizers(), LuceneClient.defaultFilters(),
LuceneClient.defaultAnalyzers(), LuceneClient.defaultFields(),
+ LuceneClientManager.getClient(path, ManifoldCF.getProcessID(),
LuceneClient.defaultCharfilters(), LuceneClient.defaultTokenizers(),
LuceneClient.defaultFilters(), LuceneClient.defaultAnalyzers(),
LuceneClient.defaultFields(),
"id", "content", LuceneClient.defaultMaxDocumentLength());
assertThat(client2.isOpen(), is(true));
@@ -132,7 +159,7 @@ public class LuceneClientTest {
LuceneClient client3;
try {
client3 =
- LuceneClientManager.getClient(path, LuceneClient.defaultCharfilters(),
LuceneClient.defaultTokenizers(), LuceneClient.defaultFilters(),
LuceneClient.defaultAnalyzers(), LuceneClient.defaultFields(),
+ LuceneClientManager.getClient(path, ManifoldCF.getProcessID(),
LuceneClient.defaultCharfilters(), LuceneClient.defaultTokenizers(),
LuceneClient.defaultFilters(), LuceneClient.defaultAnalyzers(),
LuceneClient.defaultFields(),
"dummy_id", "dummy_content",
LuceneClient.defaultMaxDocumentLength());
fail("Should not get here");
} catch (Exception e) {
@@ -145,7 +172,56 @@ public class LuceneClientTest {
assertThat(client1, is(client2));
client3 =
- LuceneClientManager.getClient(path, LuceneClient.defaultCharfilters(),
LuceneClient.defaultTokenizers(), LuceneClient.defaultFilters(),
LuceneClient.defaultAnalyzers(), LuceneClient.defaultFields(),
+ LuceneClientManager.getClient(path, ManifoldCF.getProcessID(),
LuceneClient.defaultCharfilters(), LuceneClient.defaultTokenizers(),
LuceneClient.defaultFilters(), LuceneClient.defaultAnalyzers(),
LuceneClient.defaultFields(),
+ "dummy_id", "dummy_content", LuceneClient.defaultMaxDocumentLength());
+ assertThat(client3.isOpen(), is(true));
+
+ assertThat(client3, not(client1));
+ assertThat(client3, not(client2));
+ }
+
+ @Test
+ public void testGetClientFromManagerHdfs() throws Exception {
+ String path = hdfsPath+"/getclientfrommanager";
+ String processID_A = "A";
+ String processID_B = "B";
+
+ LuceneClient client1 =
+ LuceneClientManager.getClient(path, processID_A,
LuceneClient.defaultCharfilters(), LuceneClient.defaultTokenizers(),
LuceneClient.defaultFilters(), LuceneClient.defaultAnalyzers(),
LuceneClient.defaultFields(),
+ LuceneClient.defaultIdField(), LuceneClient.defaultContentField(),
LuceneClient.defaultMaxDocumentLength());
+ assertThat(client1.isOpen(), is(true));
+
+ LuceneClient client2 =
+ LuceneClientManager.getClient(path, processID_A,
LuceneClient.defaultCharfilters(), LuceneClient.defaultTokenizers(),
LuceneClient.defaultFilters(), LuceneClient.defaultAnalyzers(),
LuceneClient.defaultFields(),
+ "id", "content", LuceneClient.defaultMaxDocumentLength());
+ assertThat(client2.isOpen(), is(true));
+
+ assertThat(client1, is(client2));
+
+ LuceneClient clientB =
+ LuceneClientManager.getClient(path, processID_B,
LuceneClient.defaultCharfilters(), LuceneClient.defaultTokenizers(),
LuceneClient.defaultFilters(), LuceneClient.defaultAnalyzers(),
LuceneClient.defaultFields(),
+ LuceneClient.defaultIdField(), LuceneClient.defaultContentField(),
LuceneClient.defaultMaxDocumentLength());
+ assertThat(clientB.isOpen(), is(true));
+
+ assertThat(clientB, not(client2));
+
+ LuceneClient client3;
+ try {
+ client3 =
+ LuceneClientManager.getClient(path, processID_A,
LuceneClient.defaultCharfilters(), LuceneClient.defaultTokenizers(),
LuceneClient.defaultFilters(), LuceneClient.defaultAnalyzers(),
LuceneClient.defaultFields(),
+ "dummy_id", "dummy_content",
LuceneClient.defaultMaxDocumentLength());
+ fail("Should not get here");
+ } catch (Exception e) {
+ assert e instanceof IllegalStateException;
+ }
+
+ client1.close();
+ assertThat(client1.isOpen(), is(false));
+ assertThat(client2.isOpen(), is(false));
+ assertThat(client1, is(client2));
+
+ client3 =
+ LuceneClientManager.getClient(path, processID_A,
LuceneClient.defaultCharfilters(), LuceneClient.defaultTokenizers(),
LuceneClient.defaultFilters(), LuceneClient.defaultAnalyzers(),
LuceneClient.defaultFields(),
"dummy_id", "dummy_content", LuceneClient.defaultMaxDocumentLength());
assertThat(client3.isOpen(), is(true));
@@ -156,7 +232,7 @@ public class LuceneClientTest {
@Test
public void testAddOrReplace() throws IOException {
String path = testDir.getAbsolutePath()+sep+"tmp"+sep+"addorreplace-index";
- try (LuceneClient client = new LuceneClient(new File(path).toPath())) {
+ try (LuceneClient client = new LuceneClient(path)) {
// add
LuceneDocument doc1 = new LuceneDocument();
doc1 = LuceneDocument.addField(doc1, ID, "/repo/001",
client.fieldsInfo());
@@ -172,6 +248,7 @@ public class LuceneClientTest {
IndexSearcher searcher = client.newSearcher();
assertThat(searcher.count(new TermQuery(new Term(CONTENT, "green"))),
is(1));
assertThat(searcher.count(new TermQuery(new Term(CONTENT, "yellow"))),
is(1));
+ searcher.getIndexReader().close();
// update
LuceneDocument updateDoc = new LuceneDocument();
@@ -183,6 +260,7 @@ public class LuceneClientTest {
searcher = client.newSearcher();
assertThat(searcher.count(new TermQuery(new Term(CONTENT, "green"))),
is(0));
assertThat(searcher.count(new TermQuery(new Term(CONTENT, "yellow"))),
is(2));
+ searcher.getIndexReader().close();
// add
LuceneDocument addDoc = new LuceneDocument();
@@ -195,13 +273,14 @@ public class LuceneClientTest {
assertThat(searcher.count(new TermQuery(new Term(CONTENT, "green"))),
is(0));
assertThat(searcher.count(new TermQuery(new Term(CONTENT, "yellow"))),
is(2));
assertThat(searcher.count(new TermQuery(new Term(CONTENT, "red"))),
is(1));
+ searcher.getIndexReader().close();
}
}
@Test
public void testRemove() throws IOException {
String path = testDir.getAbsolutePath()+sep+"tmp"+sep+"remove-index";
- try (LuceneClient client = new LuceneClient(new File(path).toPath())) {
+ try (LuceneClient client = new LuceneClient(path)) {
LuceneDocument doc1 = new LuceneDocument();
doc1 = LuceneDocument.addField(doc1, ID, "/repo/001",
client.fieldsInfo());
@@ -216,19 +295,21 @@ public class LuceneClientTest {
client.optimize();
IndexSearcher searcher = client.newSearcher();
assertThat(searcher.count(new TermQuery(new Term(CONTENT, "apache"))),
is(2));
+ searcher.getIndexReader().close();
client.remove("/repo/001");
client.optimize();
searcher = client.newSearcher();
assertThat(searcher.count(new TermQuery(new Term(CONTENT, "apache"))),
is(1));
+ searcher.getIndexReader().close();
}
}
@Test
public void testDefaultSettings() throws IOException, InterruptedException {
String path =
testDir.getAbsolutePath()+sep+"tmp"+sep+"defaultsettings-index";
- try (LuceneClient client = new LuceneClient(new File(path).toPath())) {
+ try (LuceneClient client = new LuceneClient(path)) {
String content1 = "Apache ManifoldCF, Apache Lucene";
LuceneDocument doc1 = new LuceneDocument();
@@ -264,20 +345,22 @@ public class LuceneClientTest {
TopDocs hits = searcher.search(client.newQuery("id:\\/repo\\/001"), 1);
int docID = hits.scoreDocs[0].doc;
- Terms terms = client.reader().getTermVector(docID, CONTENT);
- TermsEnum te = terms.iterator();
- BytesRef br;
- while ((br = te.next()) != null) {
- if (te.seekExact(new BytesRef("apache"))) {
- assertThat(br.utf8ToString(), is("apache"));
- assertThat(te.totalTermFreq(), is(2L));
- break;
+ try (LeafReader reader = client.reader()) {
+ Terms terms = reader.getTermVector(docID, CONTENT);
+ TermsEnum te = terms.iterator();
+ BytesRef br;
+ while ((br = te.next()) != null) {
+ if (te.seekExact(new BytesRef("apache"))) {
+ assertThat(br.utf8ToString(), is("apache"));
+ assertThat(te.totalTermFreq(), is(2L));
+ break;
+ }
}
- }
- assertThat(client.reader().docFreq(new Term(CONTENT, br)), is(3));
+ assertThat(reader.docFreq(new Term(CONTENT, br)), is(3));
- assertThat(client.reader().getTermVector(docID, "content_ws"),
is(nullValue()));
- assertThat(client.reader().getTermVector(docID, "content_ngram"),
is(nullValue()));
+ assertThat(reader.getTermVector(docID, "content_ws"), is(nullValue()));
+ assertThat(reader.getTermVector(docID, "content_ngram"),
is(nullValue()));
+ }
hits = searcher.search(client.newQuery("id:\\/repo\\/003"), 1);
Document storedDocument = searcher.doc(hits.scoreDocs[0].doc);
@@ -294,6 +377,9 @@ public class LuceneClientTest {
IndexSearcher searcher2 = client.newSearcher();
assertThat(searcher2.count(client.newQuery(ID+":"+nrt)), is(0));
assertThat(client.newRealtimeSearcher().count(client.newQuery(ID+":"+nrt)),
is(1));
+
+ searcher.getIndexReader().close();
+ searcher2.getIndexReader().close();
}
}
@@ -310,7 +396,7 @@ public class LuceneClientTest {
rd.setBinary(in, b.length);
String path = testDir.getAbsolutePath()+sep+"tmp"+sep+"rd-index";
- try (LuceneClient client = new LuceneClient(new File(path).toPath())) {
+ try (LuceneClient client = new LuceneClient(path)) {
LuceneDocument doc = new LuceneDocument();
doc = LuceneDocument.addField(doc, client.idField(), documentURI,
client.fieldsInfo());
@@ -339,6 +425,92 @@ public class LuceneClientTest {
assertThat(searcher.count(client.newQuery("content:categorization")),
is(1));
assertThat(searcher.count(client.newQuery("content:tagging")), is(1));
assertThat(searcher.count(client.newQuery("content:(classification AND
lucene)")), is(1));
+ searcher.getIndexReader().close();
+ }
+ }
+
+ @Test
+ public void testHdfsSimple1() throws IOException {
+ try (LuceneClient client = new LuceneClient(hdfsPath+"/Simple1")) {
+ for (int i = 0; i < 10; i++) {
+ LuceneDocument doc = new LuceneDocument();
+ doc = LuceneDocument.addField(doc, ID, String.valueOf(i),
client.fieldsInfo());
+ doc = LuceneDocument.addField(doc, CONTENT, ByteSource.wrap("hdfs
directory?".getBytes(StandardCharsets.UTF_8)).openBufferedStream(),
client.fieldsInfo());
+ client.addOrReplace(String.valueOf(i), doc);
+ }
+ client.refresh();
+
assertThat(client.newRealtimeSearcher().count(client.newQuery("content:hdfs")),
is(10));
+
+ client.remove(String.valueOf(0));
+ client.refresh();
+
assertThat(client.newRealtimeSearcher().count(client.newQuery("content:hdfs")),
is(9));
+ }
+ }
+
+ @Test
+ public void testHdfsSimple2() throws IOException {
+ try (LuceneClient client = new LuceneClient(hdfsPath+"/Simple2")) {
+ for (int i = 0; i < 10; i++) {
+ LuceneDocument doc = new LuceneDocument();
+ doc = LuceneDocument.addField(doc, ID, String.valueOf(i),
client.fieldsInfo());
+ doc = LuceneDocument.addField(doc, CONTENT, ByteSource.wrap("hdfs
directory.".getBytes(StandardCharsets.UTF_8)).openBufferedStream(),
client.fieldsInfo());
+ client.addOrReplace(String.valueOf(i), doc);
+ }
+ client.optimize();
+ IndexSearcher searcher = client.newSearcher();
+ assertThat(searcher.count(client.newQuery("content:hdfs")), is(10));
+ searcher.getIndexReader().close();
+
+ client.remove(String.valueOf(0));
+ client.optimize();
+ IndexSearcher searcher2 = client.newSearcher();
+ assertThat(searcher2.count(client.newQuery("content:hdfs")), is(9));
+ searcher2.getIndexReader().close();
+ }
+ }
+
+ @Test
+ public void testHdfsLock() throws IOException {
+ String samePath = testDir.getAbsolutePath()+sep+"tmp"+sep+"lock";
+ try {
+ try (LuceneClient client1 = new LuceneClient(samePath);
+ LuceneClient client2 = new LuceneClient(samePath)) {
+ fail("Should not get here");
+ }
+ } catch (Exception e) {
+ assert e instanceof LockObtainFailedException;
+ }
+
+ samePath = hdfsPath+"/lock";
+ String processID_A = "/A";
+ String processID_B = "/B";
+ try {
+ try (LuceneClient client1 = new LuceneClient(samePath+processID_A);
+ LuceneClient client2 = new LuceneClient(samePath+processID_B)) {
+ for (int i = 0; i < 2; i++) {
+ LuceneDocument doc1 = new LuceneDocument();
+ doc1 = LuceneDocument.addField(doc1, ID, "A"+String.valueOf(i),
client1.fieldsInfo());
+ client1.addOrReplace("A"+String.valueOf(i), doc1);
+ }
+ client1.commit();
+ for (int i = 0; i < 3; i++) {
+ LuceneDocument doc2 = new LuceneDocument();
+ doc2 = LuceneDocument.addField(doc2, ID, "B"+String.valueOf(i),
client2.fieldsInfo());
+ client2.addOrReplace("B"+String.valueOf(i), doc2);
+ }
+ client2.commit();
+
+ IndexSearcher searcher1 = client1.newSearcher();
+ assertThat(searcher1.count(client1.newQuery("*:*")), is(2));
+ searcher1.getIndexReader().close();
+
+ IndexSearcher searcher2 = client2.newSearcher();
+ assertThat(searcher2.count(client2.newQuery("*:*")), is(3));
+ searcher2.getIndexReader().close();
+ }
+ } catch (Exception e) {
+ System.out.println(e.getMessage());
+ fail("Should not get here");
}
}
Added:
manifoldcf/branches/CONNECTORS-1219/connectors/lucene/hadoop-minicluster.xml
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1219/connectors/lucene/hadoop-minicluster.xml?rev=1693798&view=auto
==============================================================================
---
manifoldcf/branches/CONNECTORS-1219/connectors/lucene/hadoop-minicluster.xml
(added)
+++
manifoldcf/branches/CONNECTORS-1219/connectors/lucene/hadoop-minicluster.xml
Sun Aug 2 09:29:34 2015
@@ -0,0 +1,33 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
+
+ <groupId>org.apache.manifoldcf</groupId>
+ <artifactId>mcf-lucene-connector-hadoop-minicluster</artifactId>
+ <version>2.2-SNAPSHOT</version>
+ <modelVersion>4.0.0</modelVersion>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-minicluster</artifactId>
+ <version>2.6.0</version>
+ </dependency>
+ </dependencies>
+
+</project>
Propchange:
manifoldcf/branches/CONNECTORS-1219/connectors/lucene/hadoop-minicluster.xml
------------------------------------------------------------------------------
svn:eol-style = native
Propchange:
manifoldcf/branches/CONNECTORS-1219/connectors/lucene/hadoop-minicluster.xml
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Modified: manifoldcf/branches/CONNECTORS-1219/connectors/lucene/pom.xml
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1219/connectors/lucene/pom.xml?rev=1693798&r1=1693797&r2=1693798&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1219/connectors/lucene/pom.xml (original)
+++ manifoldcf/branches/CONNECTORS-1219/connectors/lucene/pom.xml Sun Aug 2
09:29:34 2015
@@ -135,6 +135,9 @@
</excludes>
<forkMode>always</forkMode>
<workingDirectory>target/test-output</workingDirectory>
+ <argLine>-XX:MaxDirectMemorySize=512m</argLine>
</configuration>
</plugin>
@@ -236,6 +239,76 @@
<artifactId>gson</artifactId>
<version>2.2.4</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.solr</groupId>
+ <artifactId>solr-core</artifactId>
+ <version>${lucene.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>*</groupId>
+ <artifactId>*</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>com.googlecode.concurrentlinkedhashmap</groupId>
+ <artifactId>concurrentlinkedhashmap-lru</artifactId>
+ <version>1.2</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.solr</groupId>
+ <artifactId>solr-solrj</artifactId>
+ <version>${lucene.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>*</groupId>
+ <artifactId>*</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-annotations</artifactId>
+ <version>${hadoop.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-auth</artifactId>
+ <version>${hadoop.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>*</groupId>
+ <artifactId>*</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-common</artifactId>
+ <version>${hadoop.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>*</groupId>
+ <artifactId>*</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-hdfs</artifactId>
+ <version>${hadoop.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>*</groupId>
+ <artifactId>*</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.htrace</groupId>
+ <artifactId>htrace-core</artifactId>
+ <version>3.0.4</version>
+ </dependency>
<!-- Testing dependencies -->
@@ -376,5 +449,12 @@
<scope>test</scope>
</dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-minicluster</artifactId>
+ <version>${hadoop.version}</version>
+ <scope>test</scope>
+ </dependency>
+
</dependencies>
</project>