Author: lewismc
Date: Thu Jul 30 21:29:42 2015
New Revision: 1693507
URL: http://svn.apache.org/r1693507
Log:
NUTCH-1785 Ability to index raw content
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/schema-solr4.xml
nutch/trunk/conf/schema.xml
nutch/trunk/ivy/ivy.xml
nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1693507&r1=1693506&r2=1693507&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Jul 30 21:29:42 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
+* NUTCH-1785 Ability to index raw content (markus, lewismc)
+
* NUTCH-2063 Add -mimeStats flag to FileDumper tool (Mike Joyce via lewismc)
* NUTCH-2021 Use protocol-selenium to Capture Screenshots of the Page as it is
Fetched (lewismc)
Modified: nutch/trunk/conf/schema-solr4.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/schema-solr4.xml?rev=1693507&r1=1693506&r2=1693507&view=diff
==============================================================================
--- nutch/trunk/conf/schema-solr4.xml (original)
+++ nutch/trunk/conf/schema-solr4.xml Thu Jul 30 21:29:42 2015
@@ -32,6 +32,8 @@
<!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
<fieldType name="string" class="solr.StrField" sortMissingLast="true"
omitNorms="true"/>
+ <fieldType name="binary" class="solr.BinaryField"/>
+
<!--
Default numeric field types. For faster range queries, consider the
tint/tfloat/tlong/tdouble types.
@@ -405,6 +407,10 @@
<!-- fields for tld plugin -->
<field name="tld" type="string" stored="false" indexed="false"/>
+
+ <!-- field containing segment's raw binary content if indexed with
-addBinaryContent -->
+ <field name="binaryContent" type="binary" stored="true" indexed="false"/>
+
</fields>
<uniqueKey>id</uniqueKey>
<defaultSearchField>text</defaultSearchField>
Modified: nutch/trunk/conf/schema.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/schema.xml?rev=1693507&r1=1693506&r2=1693507&view=diff
==============================================================================
--- nutch/trunk/conf/schema.xml (original)
+++ nutch/trunk/conf/schema.xml Thu Jul 30 21:29:42 2015
@@ -39,6 +39,7 @@
<fieldType name="date" class="solr.TrieDateField" precisionStep="0"
omitNorms="true" positionIncrementGap="0"/>
<fieldType name="location" class="solr.LatLonType"
subFieldSuffix="_coordinate"/>
+ <fieldType name="binary" class="solr.BinaryField"/>
<fieldType name="text" class="solr.TextField"
positionIncrementGap="100">
@@ -176,6 +177,9 @@
<!-- fields for tld plugin -->
<field name="tld" type="string" stored="false" indexed="false"/>
+ <!-- field containing segment's raw binary content if indexed with
-addBinaryContent -->
+ <field name="binaryContent" type="binary" stored="true"
indexed="false"/>
+
<!-- to work with Solr 4.9 and beyond that use RealTimeGetHandler -->
<field name="_version_" type="long" indexed="true" stored="true"/>
Modified: nutch/trunk/ivy/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1693507&r1=1693506&r2=1693507&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Thu Jul 30 21:29:42 2015
@@ -71,21 +71,21 @@
<dependency org="com.google.guava" name="guava" rev="11.0.2" />
<dependency org="com.google.code.crawler-commons"
name="crawler-commons"
rev="0.5" />
- <dependency org="org.apache.cxf" name="cxf" rev="3.0.4"/>
- <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxws"
rev="3.0.4"/>
- <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxrs"
rev="3.0.4"/>
- <dependency org="org.apache.cxf" name="cxf-rt-transports-http"
rev="3.0.4"/>
- <dependency org="org.apache.cxf"
name="cxf-rt-transports-http-jetty" rev="3.0.4"/>
- <dependency org="com.fasterxml.jackson.core"
name="jackson-databind" rev="2.5.1" />
- <dependency org="com.fasterxml.jackson.dataformat"
name="jackson-dataformat-cbor" rev="2.5.1" />
- <dependency org="com.fasterxml.jackson.jaxrs"
name="jackson-jaxrs-json-provider" rev="2.5.1" />
+ <dependency org="org.apache.cxf" name="cxf" rev="3.0.4"/>
+ <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxws"
rev="3.0.4"/>
+ <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxrs"
rev="3.0.4"/>
+ <dependency org="org.apache.cxf" name="cxf-rt-transports-http"
rev="3.0.4"/>
+ <dependency org="org.apache.cxf" name="cxf-rt-transports-http-jetty"
rev="3.0.4"/>
+ <dependency org="com.fasterxml.jackson.core" name="jackson-databind"
rev="2.5.1" />
+ <dependency org="com.fasterxml.jackson.dataformat"
name="jackson-dataformat-cbor" rev="2.5.1" />
+ <dependency org="com.fasterxml.jackson.jaxrs"
name="jackson-jaxrs-json-provider" rev="2.5.1" />
- <dependency org="org.apache.commons" name="commons-jexl"
rev="2.1.1" />
+ <dependency org="org.apache.commons" name="commons-jexl" rev="2.1.1" />
- <dependency org="org.apache.mahout" name="mahout-math"
rev="0.8" />
- <dependency org="org.apache.mahout" name="mahout-core"
rev="0.8" />
- <dependency org="org.apache.lucene" name="lucene-core"
rev="4.3.0" />
- <dependency org="org.apache.lucene"
name="lucene-analyzers-common" rev="4.3.0" />
+ <dependency org="org.apache.mahout" name="mahout-math" rev="0.8" />
+ <dependency org="org.apache.mahout" name="mahout-core" rev="0.8" />
+ <dependency org="org.apache.lucene" name="lucene-core" rev="4.3.0" />
+ <dependency org="org.apache.lucene" name="lucene-analyzers-common"
rev="4.3.0" />
<!--Configuration: test -->
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1693507&r1=1693506&r2=1693507&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
(original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Thu Jul
30 21:29:42 2015
@@ -22,6 +22,8 @@ import java.util.Iterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.codec.binary.StringUtils;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@@ -34,7 +36,6 @@ import org.apache.hadoop.mapred.OutputCo
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
-import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.crawl.Inlinks;
@@ -48,6 +49,7 @@ import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseText;
+import org.apache.nutch.protocol.Content;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
@@ -64,10 +66,12 @@ public class IndexerMapReduce extends Co
public static final String INDEXER_SKIP_NOTMODIFIED =
"indexer.skip.notmodified";
public static final String URL_FILTERING = "indexer.url.filters";
public static final String URL_NORMALIZING = "indexer.url.normalizers";
+ public static final String INDEXER_BINARY_AS_BASE64 =
"indexer.binary.base64";
private boolean skip = false;
private boolean delete = false;
private boolean deleteRobotsNoIndex = false;
+ private boolean base64 = false;
private IndexingFilters filters;
private ScoringFilters scfilters;
@@ -91,6 +95,7 @@ public class IndexerMapReduce extends Co
this.deleteRobotsNoIndex = job.getBoolean(INDEXER_DELETE_ROBOTS_NOINDEX,
false);
this.skip = job.getBoolean(INDEXER_SKIP_NOTMODIFIED, false);
+ this.base64 = job.getBoolean(INDEXER_BINARY_AS_BASE64, false);
normalize = job.getBoolean(URL_NORMALIZING, false);
filter = job.getBoolean(URL_FILTERING, false);
@@ -159,7 +164,7 @@ public class IndexerMapReduce extends Co
public void map(Text key, Writable value,
OutputCollector<Text, NutchWritable> output, Reporter reporter)
- throws IOException {
+ throws IOException {
String urlString = filterUrl(normalizeUrl(key.toString()));
if (urlString == null) {
@@ -173,10 +178,11 @@ public class IndexerMapReduce extends Co
public void reduce(Text key, Iterator<NutchWritable> values,
OutputCollector<Text, NutchIndexAction> output, Reporter reporter)
- throws IOException {
+ throws IOException {
Inlinks inlinks = null;
CrawlDatum dbDatum = null;
CrawlDatum fetchDatum = null;
+ Content content = null;
ParseData parseData = null;
ParseText parseText = null;
@@ -219,6 +225,8 @@ public class IndexerMapReduce extends Co
}
} else if (value instanceof ParseText) {
parseText = (ParseText) value;
+ } else if (value instanceof Content) {
+ content = (Content)value;
} else if (LOG.isWarnEnabled()) {
LOG.warn("Unrecognized type: " + value.getClass());
}
@@ -327,6 +335,18 @@ public class IndexerMapReduce extends Co
// store boost for use by explain and dedup
doc.add("boost", Float.toString(boost));
+ if (content != null) {
+ // Get the original unencoded content
+ String binary = new String(content.getContent());
+
+ // optionally encode as base64
+ if (base64) {
+ binary = Base64.encodeBase64String(StringUtils.getBytesUtf8(binary));
+ }
+
+ doc.add("binaryContent", binary);
+ }
+
reporter.incrCounter("IndexerStatus", "indexed (add/update)", 1);
NutchIndexAction action = new NutchIndexAction(doc, NutchIndexAction.ADD);
@@ -337,7 +357,7 @@ public class IndexerMapReduce extends Co
}
public static void initMRJob(Path crawlDb, Path linkDb,
- Collection<Path> segments, JobConf job) {
+ Collection<Path> segments, JobConf job, boolean addBinaryContent) {
LOG.info("IndexerMapReduce: crawldb: " + crawlDb);
@@ -352,6 +372,10 @@ public class IndexerMapReduce extends Co
CrawlDatum.PARSE_DIR_NAME));
FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
+
+ if (addBinaryContent) {
+ FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
+ }
}
FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
@@ -367,7 +391,7 @@ public class IndexerMapReduce extends Co
}
} catch (IOException e) {
LOG.warn("Failed to use linkDb ({}) for indexing: {}", linkDb,
- StringUtils.stringifyException(e));
+ org.apache.hadoop.util.StringUtils.stringifyException(e));
}
}
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java?rev=1693507&r1=1693506&r2=1693507&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java Thu Jul 30
21:29:42 2015
@@ -83,6 +83,22 @@ public class IndexingJob extends NutchTo
public void index(Path crawlDb, Path linkDb, List<Path> segments,
boolean noCommit, boolean deleteGone, String params, boolean filter,
boolean normalize) throws IOException {
+ index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
+ normalize, false);
+ }
+
+ public void index(Path crawlDb, Path linkDb, List<Path> segments,
+ boolean noCommit, boolean deleteGone, String params,
+ boolean filter, boolean normalize, boolean addBinaryContent) throws
IOException {
+ index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
+ normalize, addBinaryContent, false);
+ }
+
+ public void index(Path crawlDb, Path linkDb, List<Path> segments,
+ boolean noCommit, boolean deleteGone, String params,
+ boolean filter, boolean normalize, boolean addBinaryContent,
+ boolean base64) throws IOException {
+
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
@@ -94,11 +110,17 @@ public class IndexingJob extends NutchTo
LOG.info("Indexer: deleting gone documents: " + deleteGone);
LOG.info("Indexer: URL filtering: " + filter);
LOG.info("Indexer: URL normalizing: " + normalize);
-
+ if (addBinaryContent) {
+ if (base64) {
+ LOG.info("Indexer: adding binary content as Base64");
+ } else {
+ LOG.info("Indexer: adding binary content");
+ }
+ }
IndexWriters writers = new IndexWriters(getConf());
LOG.info(writers.describe());
- IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);
+ IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job,
addBinaryContent);
// NOW PASSED ON THE COMMAND LINE AS A HADOOP PARAM
// job.set(SolrConstants.SERVER_URL, solrUrl);
@@ -106,6 +128,7 @@ public class IndexingJob extends NutchTo
job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);
+ job.setBoolean(IndexerMapReduce.INDEXER_BINARY_AS_BASE64, base64);
if (params != null) {
job.set(IndexerMapReduce.INDEXER_PARAMS, params);
@@ -141,7 +164,8 @@ public class IndexingJob extends NutchTo
public int run(String[] args) throws Exception {
if (args.length < 2) {
System.err
- .println("Usage: Indexer <crawldb> [-linkdb <linkdb>] [-params
k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone]
[-filter] [-normalize]");
+ //.println("Usage: Indexer <crawldb> [-linkdb <linkdb>] [-params
k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone]
[-filter] [-normalize]");
+ .println("Usage: Indexer <crawldb> [-linkdb <linkdb>] [-params
k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone]
[-filter] [-normalize] [-addBinaryContent] [-base64]");
IndexWriters writers = new IndexWriters(getConf());
System.err.println(writers.describe());
return -1;
@@ -157,6 +181,8 @@ public class IndexingJob extends NutchTo
boolean deleteGone = false;
boolean filter = false;
boolean normalize = false;
+ boolean addBinaryContent = false;
+ boolean base64 = false;
for (int i = 1; i < args.length; i++) {
if (args[i].equals("-linkdb")) {
@@ -180,6 +206,10 @@ public class IndexingJob extends NutchTo
filter = true;
} else if (args[i].equals("-normalize")) {
normalize = true;
+ } else if (args[i].equals("-addBinaryContent")) {
+ addBinaryContent = true;
+ } else if (args[i].equals("-base64")) {
+ base64 = true;
} else if (args[i].equals("-params")) {
params = args[++i];
} else {
@@ -188,8 +218,7 @@ public class IndexingJob extends NutchTo
}
try {
- index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
- normalize);
+ index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
normalize, addBinaryContent, base64);
return 0;
} catch (final Exception e) {
LOG.error("Indexer: " + StringUtils.stringifyException(e));