Repository: nutch Updated Branches: refs/heads/master d29be63bd -> abc01175d
fix for NUTCH-2234 and NUTCH-2236. Upgrades Elasticsearch and Hadoop dependencies, which, in turn, requires updates to Guava and Lucene dependencies: - Elasticsearch 1.4.1 -> Elasticsearch 2.3.3 - Lucene 4.10.2 -> 5.5.0 - Solrj 5.4.1 -> 5.5.0 - Guava 16.0.1 -> Guava 18.0 - Hadoop 2.4.0 -> 2.7.2 Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/abc01175 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/abc01175 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/abc01175 Branch: refs/heads/master Commit: abc01175d8a1595db8d8d34a816c5f87f7474565 Parents: d29be63 Author: Joseph Naegele <[email protected]> Authored: Wed May 25 18:27:31 2016 +0000 Committer: Joseph Naegele <[email protected]> Committed: Mon Jun 27 21:16:25 2016 +0000 ---------------------------------------------------------------------- build.xml | 8 ++- default.properties | 5 +- ivy/ivy.xml | 15 +++--- src/plugin/indexer-elastic/ivy.xml | 2 +- src/plugin/indexer-elastic/plugin.xml | 52 +++++++++++++------- .../indexwriter/elastic/ElasticIndexWriter.java | 19 ++++--- src/plugin/indexer-solr/ivy.xml | 2 +- src/plugin/indexer-solr/plugin.xml | 2 +- src/plugin/parsefilter-naivebayes/ivy.xml | 4 +- src/plugin/parsefilter-naivebayes/plugin.xml | 4 +- src/plugin/scoring-similarity/build.xml | 10 +--- src/plugin/scoring-similarity/ivy.xml | 1 + src/plugin/scoring-similarity/plugin.xml | 3 +- .../similarity/util/LuceneAnalyzerUtil.java | 4 +- .../similarity/util/LuceneTokenizer.java | 21 +++++--- 15 files changed, 91 insertions(+), 61 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/build.xml ---------------------------------------------------------------------- diff --git a/build.xml b/build.xml index 5cff1ea..a1c41ed 100644 --- a/build.xml +++ b/build.xml @@ -234,8 +234,10 @@ <packageset dir="${plugins.dir}/urlnormalizer-slash/src/java"/> <link href="${javadoc.link.java}"/> - <link href="${javadoc.link.lucene}"/> <link href="${javadoc.link.hadoop}"/> + <link href="${javadoc.link.lucene.core}"/> + <link href="${javadoc.link.lucene.analyzers-common}"/> + <link href="${javadoc.link.solr-solrj}"/> <classpath refid="classpath"/> <classpath> @@ -675,8 +677,10 @@ <packageset dir="${plugins.dir}/urlnormalizer-slash/src/java"/> <link href="${javadoc.link.java}"/> - <link href="${javadoc.link.lucene}"/> <link href="${javadoc.link.hadoop}"/> + <link href="${javadoc.link.lucene.core}"/> + <link href="${javadoc.link.lucene.analyzers-common}"/> + <link href="${javadoc.link.solr-solrj}"/> <classpath refid="classpath"/> <classpath> http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/default.properties ---------------------------------------------------------------------- diff --git a/default.properties b/default.properties index c8d9212..33390f7 100644 --- a/default.properties +++ b/default.properties @@ -44,7 +44,10 @@ test.junit.output.format = plain javadoc.proxy.host=-J-DproxyHost= javadoc.proxy.port=-J-DproxyPort= javadoc.link.java=http://docs.oracle.com/javase/7/docs/api/ -javadoc.link.hadoop=http://hadoop.apache.org/docs/r2.4.0/api/ +javadoc.link.hadoop=http://hadoop.apache.org/docs/r2.7.2/api/ +javadoc.link.lucene.core=https://lucene.apache.org/core/5_5_0/core/ +javadoc.link.lucene.analyzers-common=https://lucene.apache.org/core/5_5_0/analyzers-common/ +javadoc.link.solr-solrj=https://lucene.apache.org/solr/5_5_0/solr-solrj/ javadoc.packages=org.apache.nutch.* dist.dir=./dist http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/ivy/ivy.xml ---------------------------------------------------------------------- diff --git a/ivy/ivy.xml b/ivy/ivy.xml index 027f0c1..a4e9481 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -52,7 +52,7 @@ <dependency org="com.tdunning" name="t-digest" rev="3.1" /> <!-- Hadoop Dependencies --> - <dependency org="org.apache.hadoop" name="hadoop-common" rev="2.4.0" conf="*->default"> + <dependency org="org.apache.hadoop" name="hadoop-common" rev="2.7.2" conf="*->default"> <exclude org="hsqldb" name="hsqldb" /> <exclude org="net.sf.kosmosfs" name="kfs" /> <exclude org="net.java.dev.jets3t" name="jets3t" /> @@ -60,9 +60,9 @@ <exclude org="org.mortbay.jetty" name="jsp-*" /> <exclude org="ant" name="ant" /> </dependency> - <dependency org="org.apache.hadoop" name="hadoop-hdfs" rev="2.4.0" conf="*->default"/> - <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-core" rev="2.4.0" conf="*->default"/> - <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-jobclient" rev="2.4.0" conf="*->default"/> + <dependency org="org.apache.hadoop" name="hadoop-hdfs" rev="2.7.2" conf="*->default"/> + <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-core" rev="2.7.2" conf="*->default"/> + <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-jobclient" rev="2.7.2" conf="*->default"/> <!-- End of Hadoop Dependencies --> <dependency org="org.apache.tika" name="tika-core" rev="1.12" /> @@ -72,7 +72,7 @@ <dependency org="xerces" name="xmlParserAPIs" rev="2.6.2" /> <dependency org="oro" name="oro" rev="2.0.8" /> - <dependency org="com.google.guava" name="guava" rev="16.0.1" /> + <dependency org="com.google.guava" name="guava" rev="18.0" /> <dependency org="com.github.crawler-commons" name="crawler-commons" rev="0.6" /> @@ -88,7 +88,6 @@ <dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.5.1" conf="*->default"/> <dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider" rev="2.5.1" conf="*->default"/> - <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="4.10.2" conf="*->default"></dependency> <!-- WARC artifacts needed --> <dependency org="org.netpreserve.commons" name="webarchive-commons" rev="1.1.5" conf="*->default"> <exclude module="hadoop-core"/> @@ -105,6 +104,10 @@ <dependency org="org.mortbay.jetty" name="jetty-client" rev="6.1.22" conf="test->default" /> <dependency org="org.mortbay.jetty" name="jetty" rev="6.1.22" conf="test->default" /> <dependency org="org.mortbay.jetty" name="jetty-util" rev="6.1.22" conf="test->default" /> + <dependency org="tomcat" name="jasper-runtime" rev="5.5.23" conf="test->default" /> + <dependency org="tomcat" name="jasper-compiler" rev="5.5.23" conf="test->default"> + <exclude org="ant" name="ant" /> + </dependency> <!-- end of test artifacts --> <!-- web app dependencies --> http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/indexer-elastic/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/indexer-elastic/ivy.xml b/src/plugin/indexer-elastic/ivy.xml index 6681410..f34075f 100644 --- a/src/plugin/indexer-elastic/ivy.xml +++ b/src/plugin/indexer-elastic/ivy.xml @@ -36,7 +36,7 @@ </publications> <dependencies> - <dependency org="org.elasticsearch" name="elasticsearch" rev="1.4.1" + <dependency org="org.elasticsearch" name="elasticsearch" rev="2.3.3" conf="*->default"/> </dependencies> http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/indexer-elastic/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/indexer-elastic/plugin.xml b/src/plugin/indexer-elastic/plugin.xml index 02aad85..d99a665 100644 --- a/src/plugin/indexer-elastic/plugin.xml +++ b/src/plugin/indexer-elastic/plugin.xml @@ -22,25 +22,39 @@ <library name="indexer-elastic.jar"> <export name="*" /> </library> - - <library name="elasticsearch-1.4.1.jar"/> - <library name="lucene-analyzers-common-4.10.2.jar"/> - <library name="lucene-codecs-4.10.2.jar"/> - <library name="lucene-core-4.10.2.jar"/> - <library name="lucene-grouping-4.10.2.jar"/> - <library name="lucene-highlighter-4.10.2.jar"/> - <library name="lucene-join-4.10.2.jar"/> - <library name="lucene-memory-4.10.2.jar"/> - <library name="lucene-misc-4.10.2.jar"/> - <library name="lucene-queries-4.10.2.jar"/> - <library name="lucene-queryparser-4.10.2.jar"/> - <library name="lucene-sandbox-4.10.2.jar"/> - <library name="lucene-spatial-4.10.2.jar"/> - <library name="lucene-suggest-4.10.2.jar"/> - <library name="spatial4j-0.4.1.jar"/> - <library name="antlr-runtime-3.5.jar"/> - <library name="asm-4.1"/> - <library name="asm-commons-4.1.jar"/> + <library name="elasticsearch-2.3.3.jar"/> + <library name="commons-cli-1.3.1.jar"/> + <library name="compress-lzf-1.0.2.jar"/> + <library name="guava-18.0.jar"/> + <library name="HdrHistogram-2.1.6.jar"/> + <library name="hppc-0.7.1.jar"/> + <library name="indexer-elastic.jar"/> + <library name="jackson-core-2.6.6.jar"/> + <library name="jackson-dataformat-cbor-2.6.6.jar"/> + <library name="jackson-dataformat-smile-2.6.6.jar"/> + <library name="jackson-dataformat-yaml-2.6.6.jar"/> + <library name="joda-convert-1.2.jar"/> + <library name="joda-time-2.8.2.jar"/> + <library name="jsr166e-1.1.0.jar"/> + <library name="lucene-analyzers-common-5.5.0.jar"/> + <library name="lucene-backward-codecs-5.5.0.jar"/> + <library name="lucene-core-5.5.0.jar"/> + <library name="lucene-grouping-5.5.0.jar"/> + <library name="lucene-highlighter-5.5.0.jar"/> + <library name="lucene-join-5.5.0.jar"/> + <library name="lucene-memory-5.5.0.jar"/> + <library name="lucene-misc-5.5.0.jar"/> + <library name="lucene-queries-5.5.0.jar"/> + <library name="lucene-queryparser-5.5.0.jar"/> + <library name="lucene-sandbox-5.5.0.jar"/> + <library name="lucene-spatial-5.5.0.jar"/> + <library name="lucene-spatial3d-5.5.0.jar"/> + <library name="lucene-suggest-5.5.0.jar"/> + <library name="netty-3.10.5.Final.jar"/> + <library name="securesm-1.0.jar"/> + <library name="snakeyaml-1.15.jar"/> + <library name="spatial4j-0.5.jar"/> + <library name="t-digest-3.0.jar"/> </runtime> <requires> http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java ---------------------------------------------------------------------- diff --git a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java index c1827e7..9367e41 100644 --- a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java +++ b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java @@ -21,6 +21,7 @@ import static org.elasticsearch.node.NodeBuilder.nodeBuilder; import java.io.BufferedReader; import java.io.IOException; +import java.net.InetAddress; import java.util.HashMap; import java.util.Map; @@ -38,9 +39,8 @@ import org.elasticsearch.action.delete.DeleteRequestBuilder; import org.elasticsearch.action.index.IndexRequestBuilder; import org.elasticsearch.client.Client; import org.elasticsearch.client.transport.TransportClient; -import org.elasticsearch.common.settings.ImmutableSettings; -import org.elasticsearch.common.settings.ImmutableSettings.Builder; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.settings.Settings.Builder; import org.elasticsearch.common.transport.InetSocketTransportAddress; import org.elasticsearch.node.Node; import org.slf4j.Logger; @@ -79,8 +79,7 @@ public class ElasticIndexWriter implements IndexWriter { host = job.get(ElasticConstants.HOST); port = job.getInt(ElasticConstants.PORT, 9300); - Builder settingsBuilder = ImmutableSettings.settingsBuilder().classLoader( - Settings.class.getClassLoader()); + Builder settingsBuilder = Settings.builder(); BufferedReader reader = new BufferedReader( job.getConfResourceAsReader("elasticsearch.conf")); @@ -106,8 +105,10 @@ public class ElasticIndexWriter implements IndexWriter { // Prefer TransportClient if (host != null && port > 1) { - client = new TransportClient(settings) - .addTransportAddress(new InetSocketTransportAddress(host, port)); + TransportClient transportClient = TransportClient.builder() + .settings(settings).build() + .addTransportAddress(new InetSocketTransportAddress(InetAddress.getByName(host), port)); + client = transportClient; } else if (clusterName != null) { node = nodeBuilder().settings(settings).client(true).node(); client = node.client(); @@ -141,8 +142,10 @@ public class ElasticIndexWriter implements IndexWriter { bulkLength += value.toString().length(); } } else { - source.put(fieldName, doc.getFieldValue(fieldName)); - bulkLength += doc.getFieldValue(fieldName).toString().length(); + if (doc.getFieldValue(fieldName) != null) { + source.put(fieldName, doc.getFieldValue(fieldName)); + bulkLength += doc.getFieldValue(fieldName).toString().length(); + } } } request.setSource(source); http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/indexer-solr/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/indexer-solr/ivy.xml b/src/plugin/indexer-solr/ivy.xml index 566ec78..65e97e7 100644 --- a/src/plugin/indexer-solr/ivy.xml +++ b/src/plugin/indexer-solr/ivy.xml @@ -36,7 +36,7 @@ </publications> <dependencies> - <dependency org="org.apache.solr" name="solr-solrj" rev="5.4.1"/> + <dependency org="org.apache.solr" name="solr-solrj" rev="5.5.0"/> <dependency org="org.apache.httpcomponents" name="httpcore" rev="4.4.1" conf="*->default"/> <dependency org="org.apache.httpcomponents" name="httpmime" rev="4.4.1" conf="*->default"/> </dependencies> http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/indexer-solr/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/indexer-solr/plugin.xml b/src/plugin/indexer-solr/plugin.xml index c92d3aa..0e86796 100644 --- a/src/plugin/indexer-solr/plugin.xml +++ b/src/plugin/indexer-solr/plugin.xml @@ -28,7 +28,7 @@ <library name="httpmime-4.4.1.jar"/> <library name="noggit-0.6.jar"/> <library name="slf4j-api-1.7.7.jar"/> - <library name="solr-solrj-5.4.1.jar"/> + <library name="solr-solrj-5.5.0.jar"/> <library name="stax2-api-3.1.4.jar"/> <library name="woodstox-core-asl-4.4.1.jar"/> <library name="zookeeper-3.4.6.jar"/> http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/parsefilter-naivebayes/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parsefilter-naivebayes/ivy.xml b/src/plugin/parsefilter-naivebayes/ivy.xml index eea057f..08cca2c 100644 --- a/src/plugin/parsefilter-naivebayes/ivy.xml +++ b/src/plugin/parsefilter-naivebayes/ivy.xml @@ -41,8 +41,8 @@ <dependency org="org.apache.mahout" name="mahout-core" rev="0.9" > <exclude org="org.apache.mrunit" name="mrunit"/> </dependency> - <dependency org="org.apache.lucene" name="lucene-core" rev="4.10.2" /> - <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="4.10.2" /> + <dependency org="org.apache.lucene" name="lucene-core" rev="5.5.0" /> + <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="5.5.0" /> </dependencies> http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/parsefilter-naivebayes/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parsefilter-naivebayes/plugin.xml b/src/plugin/parsefilter-naivebayes/plugin.xml index b3217a8..ac15041 100644 --- a/src/plugin/parsefilter-naivebayes/plugin.xml +++ b/src/plugin/parsefilter-naivebayes/plugin.xml @@ -31,8 +31,8 @@ <library name="guava-14.0.1.jar"/> <library name="jackson-core-asl-1.9.12.jar"/> <library name="jackson-mapper-asl-1.9.12.jar"/> - <library name="lucene-analyzers-common-4.10.2.jar"/> - <library name="lucene-core-4.10.2.jar"/> + <library name="lucene-analyzers-common-5.5.0.jar"/> + <library name="lucene-core-5.5.0.jar"/> <library name="mahout-core-0.9.jar"/> <library name="mahout-math-0.10.1.jar"/> <library name="slf4j-api-1.7.12.jar"/> http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/scoring-similarity/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-similarity/build.xml b/src/plugin/scoring-similarity/build.xml index 98abc70..66ac8f3 100644 --- a/src/plugin/scoring-similarity/build.xml +++ b/src/plugin/scoring-similarity/build.xml @@ -18,15 +18,7 @@ <project name="scoring-similarity" default="jar-core"> <import file="../build-plugin.xml"/> - <target name="deps-jar"> - <ant target="jar" inheritall="false" dir="../indexer-elastic" /> - </target> - <!-- Add compilation dependencies to classpath --> - <path id="plugin.deps"> - <fileset dir="${nutch.root}/build"> - <include name="**/indexer-elastic/*.jar" /> - </fileset> - </path> + <!-- Deploy Unit test dependencies --> <target name="deps-test"> <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/scoring-similarity/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-similarity/ivy.xml b/src/plugin/scoring-similarity/ivy.xml index 1a86d68..be0a1de 100644 --- a/src/plugin/scoring-similarity/ivy.xml +++ b/src/plugin/scoring-similarity/ivy.xml @@ -36,6 +36,7 @@ </publications> <dependencies> + <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="5.5.0" conf="*->default"/> </dependencies> </ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/scoring-similarity/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-similarity/plugin.xml b/src/plugin/scoring-similarity/plugin.xml index e3a04b2..9639c18 100644 --- a/src/plugin/scoring-similarity/plugin.xml +++ b/src/plugin/scoring-similarity/plugin.xml @@ -26,7 +26,8 @@ <library name="scoring-similarity.jar"> <export name="*"/> </library> - <library name="lucene-core-4.10.2.jar"/> + <library name="lucene-analyzers-common-5.5.0.jar"/> + <library name="lucene-core-5.5.0.jar"/> </runtime> <requires> http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java index 78b0fa9..4b519bc 100644 --- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java +++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java @@ -70,8 +70,8 @@ public class LuceneAnalyzerUtil extends Analyzer{ } @Override - protected TokenStreamComponents createComponents(String fieldName, Reader reader) { - Tokenizer source = new ClassicTokenizer(reader); + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer source = new ClassicTokenizer(); TokenStream filter = new LowerCaseFilter(source); if(stopSet != null) { filter = new StopFilter(filter, stopSet); http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java index 6f6d4d4..acb987c 100644 --- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java +++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java @@ -19,6 +19,7 @@ package org.apache.nutch.scoring.similarity.util; import java.io.StringReader; import java.util.List; +import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopFilter; @@ -113,21 +114,29 @@ public class LuceneTokenizer { return tokenStream; } - private TokenStream generateTokenStreamFromText(String content, TokenizerType tokenizer){ - switch(tokenizer){ + private TokenStream generateTokenStreamFromText(String content, TokenizerType tokenizerType){ + Tokenizer tokenizer = null; + switch(tokenizerType){ case CLASSIC: - tokenStream = new ClassicTokenizer(new StringReader(content)); + tokenizer = new ClassicTokenizer(); break; case STANDARD: - tokenStream = new StandardTokenizer(new StringReader(content)); + default: + tokenizer = new StandardTokenizer(); } + + tokenizer.setReader(new StringReader(content)); + + tokenStream = tokenizer; + return tokenStream; } private TokenStream createNGramTokenStream(String content, int mingram, int maxgram) { - tokenStream = new StandardTokenizer(new StringReader(content)); - tokenStream = new LowerCaseFilter(tokenStream); + Tokenizer tokenizer = new StandardTokenizer(); + tokenizer.setReader(new StringReader(content)); + tokenStream = new LowerCaseFilter(tokenizer); tokenStream = applyStemmer(stemFilterType); ShingleFilter shingleFilter = new ShingleFilter(tokenStream, mingram, maxgram); shingleFilter.setOutputUnigrams(false);
