Author: lewismc
Date: Thu Jul 30 21:29:42 2015
New Revision: 1693507
URL: http://svn.apache.org/r1693507
Log:
NUTCH-1785 Ability to index raw content
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/schema-solr4.xml
nutch/trunk/conf/schema.xml
nutch/trunk/ivy/ivy.xml
nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1693507&r1=1693506&r2=1693507&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Jul 30 21:29:42 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
+* NUTCH-1785 Ability to index raw content (markus, lewismc)
+
* NUTCH-2063 Add -mimeStats flag to FileDumper tool (Mike Joyce via lewismc)
* NUTCH-2021 Use protocol-selenium to Capture Screenshots of the Page as it is
Fetched (lewismc)
Modified: nutch/trunk/conf/schema-solr4.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/schema-solr4.xml?rev=1693507&r1=1693506&r2=1693507&view=diff
==============================================================================
--- nutch/trunk/conf/schema-solr4.xml (original)
+++ nutch/trunk/conf/schema-solr4.xml Thu Jul 30 21:29:42 2015
@@ -32,6 +32,8 @@
<!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
<fieldType name="string" class="solr.StrField" sortMissingLast="true"
omitNorms="true"/>
+ <fieldType name="binary" class="solr.BinaryField"/>
+
<!--
Default numeric field types. For faster range queries, consider the
tint/tfloat/tlong/tdouble types.
@@ -405,6 +407,10 @@
<!-- fields for tld plugin -->
<field name="tld" type="string" stored="false" indexed="false"/>
+
+ <!-- field containing segment's raw binary content if indexed with
-addBinaryContent -->
+ <field name="binaryContent" type="binary" stored="true" indexed="false"/>
+
</fields>
<uniqueKey>id</uniqueKey>
<defaultSearchField>text</defaultSearchField>
Modified: nutch/trunk/conf/schema.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/schema.xml?rev=1693507&r1=1693506&r2=1693507&view=diff
==============================================================================
--- nutch/trunk/conf/schema.xml (original)
+++ nutch/trunk/conf/schema.xml Thu Jul 30 21:29:42 2015
@@ -39,6 +39,7 @@
<fieldType name="date" class="solr.TrieDateField" precisionStep="0"
omitNorms="true" positionIncrementGap="0"/>
<fieldType name="location" class="solr.LatLonType"
subFieldSuffix="_coordinate"/>
+ <fieldType name="binary" class="solr.BinaryField"/>
<fieldType name="text" class="solr.TextField"
positionIncrementGap="100">
@@ -176,6 +177,9 @@
<!-- fields for tld plugin -->
<field name="tld" type="string" stored="false" indexed="false"/>
+ <!-- field containing segment's raw binary content if indexed with
-addBinaryContent -->
+ <field name="binaryContent" type="binary" stored="true"
indexed="false"/>
+
<!-- to work with Solr 4.9 and beyond that use RealTimeGetHandler -->
<field name="_version_" type="long" indexed="true" stored="true"/>
Modified: nutch/trunk/ivy/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1693507&r1=1693506&r2=1693507&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Thu Jul 30 21:29:42 2015
@@ -71,21 +71,21 @@
<dependency org="com.google.guava" name="guava" rev="11.0.2" />
<dependency org="com.google.code.crawler-commons"
name="crawler-commons"
rev="0.5" />
- <dependency org="org.apache.cxf" name="cxf" rev="3.0.4"/>
- <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxws"
rev="3.0.4"/>
- <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxrs"
rev="3.0.4"/>
- <dependency org="org.apache.cxf" name="cxf-rt-transports-http"
rev="3.0.4"/>
- <dependency org="org.apache.cxf"
name="cxf-rt-transports-http-jetty" rev="3.0.4"/>
- <dependency org="com.fasterxml.jackson.core"
name="jackson-databind" rev="2.5.1" />
- <dependency org="com.fasterxml.jackson.dataformat"
name="jackson-dataformat-cbor" rev="2.5.1" />
- <dependency org="com.fasterxml.jackson.jaxrs"
name="jackson-jaxrs-json-provider" rev="2.5.1" />
+ <dependency org="org.apache.cxf" name="cxf" rev="3.0.4"/>
+ <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxws"
rev="3.0.4"/>
+ <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxrs"
rev="3.0.4"/>
+ <dependency org="org.apache.cxf" name="cxf-rt-transports-http"
rev="3.0.4"/>
+ <dependency org="org.apache.cxf" name="cxf-rt-transports-http-jetty"
rev="3.0.4"/>
+ <dependency org="com.fasterxml.jackson.core" name="jackson-databind"
rev="2.5.1" />
+ <dependency org="com.fasterxml.jackson.dataformat"
name="jackson-dataformat-cbor" rev="2.5.1" />
+ <dependency org="com.fasterxml.jackson.jaxrs"
name="jackson-jaxrs-json-provider" rev="2.5.1" />
- <dependency org="org.apache.commons" name="commons-jexl"
rev="2.1.1" />
+ <dependency org="org.apache.commons" name="commons-jexl" rev="2.1.1" />
- <dependency org="org.apache.mahout" name="mahout-math"
rev="0.8" />
- <dependency org="org.apache.mahout" name="mahout-core"
rev="0.8" />
- <dependency org="org.apache.lucene" name="lucene-core"
rev="4.3.0" />
- <dependency org="org.apache.lucene"
name="lucene-analyzers-common" rev="4.3.0" />
+ <dependency org="org.apache.mahout" name="mahout-math" rev="0.8" />
+ <dependency org="org.apache.mahout" name="mahout-core" rev="0.8" />
+ <dependency org="org.apache.lucene" name="lucene-core" rev="4.3.0" />
+ <dependency org="org.apache.lucene" name="lucene-analyzers-common"
rev="4.3.0" />
<!--Configuration: test -->
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1693507&r1=1693506&r2=1693507&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
(original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Thu Jul
30 21:29:42 2015
@@ -22,6 +22,8 @@ import java.util.Iterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.codec.binary.StringUtils;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@@ -34,7 +36,6 @@ import org.apache.hadoop.mapred.OutputCo
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
-import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.crawl.Inlinks;
@@ -48,6 +49,7 @@ import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseText;
+import org.apache.nutch.protocol.Content;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
@@ -64,10 +66,12 @@ public class IndexerMapReduce extends Co
public static final String INDEXER_SKIP_NOTMODIFIED =
"indexer.skip.notmodified";
public static final String URL_FILTERING = "indexer.url.filters";
public static final String URL_NORMALIZING = "indexer.url.normalizers";
+ public static final String INDEXER_BINARY_AS_BASE64 =
"indexer.binary.base64";
private boolean skip = false;
private boolean delete = false;
private boolean deleteRobotsNoIndex = false;
+ private boolean base64 = false;
private IndexingFilters filters;
private ScoringFilters scfilters;
@@ -91,6 +95,7 @@ public class IndexerMapReduce extends Co
this.deleteRobotsNoIndex = job.getBoolean(INDEXER_DELETE_ROBOTS_NOINDEX,
false);
this.skip = job.getBoolean(INDEXER_SKIP_NOTMODIFIED, false);
+ this.base64 = job.getBoolean(INDEXER_BINARY_AS_BASE64, false);
normalize = job.getBoolean(URL_NORMALIZING, false);
filter = job.getBoolean(URL_FILTERING, false);
@@ -159,7 +164,7 @@ public class IndexerMapReduce extends Co
public void map(Text key, Writable value,
OutputCollector<Text, NutchWritable> output, Reporter reporter)
- throws IOException {
+ throws IOException {
String urlString = filterUrl(normalizeUrl(key.toString()));
if (urlString == null) {
@@ -173,10 +178,11 @@ public class IndexerMapReduce extends Co
public void reduce(Text key, Iterator<NutchWritable> values,
OutputCollector<Text, NutchIndexAction> output, Reporter reporter)
- throws IOException {
+ throws IOException {
Inlinks inlinks = null;
CrawlDatum dbDatum = null;
CrawlDatum fetchDatum = null;
+ Content content = null;
ParseData parseData = null;
ParseText parseText = null;
@@ -219,6 +225,8 @@ public class IndexerMapReduce extends Co
}
} else if (value instanceof ParseText) {
parseText = (ParseText) value;
+ } else if (value instanceof Content) {
+ content = (Content)value;
} else if (LOG.isWarnEnabled()) {
LOG.warn("Unrecognized type: " + value.getClass());
}
@@ -327,6 +335,18 @@ public class IndexerMapReduce extends Co
// store boost for use by explain and dedup
doc.add("boost", Float.toString(boost));
+ if (content != null) {
+ // Get the original unencoded content
+ String binary = new String(content.getContent());
+
+ // optionally encode as base64
+ if (base64) {
+ binary = Base64.encodeBase64String(StringUtils.getBytesUtf8(binary));
+ }
+
+ doc.add("binaryContent", binary);
+ }
+
reporter.incrCounter("IndexerStatus", "indexed (add/update)", 1);
NutchIndexAction action = new NutchIndexAction(doc, NutchIndexAction.ADD);
@@ -337,7 +357,7 @@ public class IndexerMapReduce extends Co
}
public static void initMRJob(Path crawlDb, Path linkDb,
- Collection<Path> segments, JobConf job) {
+ Collection<Path> segments, JobConf job, boolean addBinaryContent) {
LOG.info("IndexerMapReduce: crawldb: " + crawlDb);
@@ -352,6 +372,10 @@ public class IndexerMapReduce extends Co
CrawlDatum.PARSE_DIR_NAME));
FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
+
+ if (addBinaryContent) {
+ FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
+ }
}
FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
@@ -367,7 +391,7 @@ public class IndexerMapReduce extends Co
}
} catch (IOException e) {
LOG.warn("Failed to use linkDb ({}) for indexing: {}", linkDb,
- StringUtils.stringifyException(e));
+ org.apache.hadoop.util.StringUtils.stringifyException(e));
}
}
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java?rev=1693507&r1=1693506&r2=1693507&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java Thu Jul 30
21:29:42 2015
@@ -83,6 +83,22 @@ public class IndexingJob extends NutchTo
public void index(Path crawlDb, Path linkDb, List<Path> segments,
boolean noCommit, boolean deleteGone, String params, boolean filter,
boolean normalize) throws IOException {
+ index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
+ normalize, false);
+ }
+
+ public void index(Path crawlDb, Path linkDb, List<Path> segments,
+ boolean noCommit, boolean deleteGone, String params,
+ boolean filter, boolean normalize, boolean addBinaryContent) throws
IOException {
+ index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
+ normalize, addBinaryContent, false);
+ }
+
+ public void index(Path crawlDb, Path linkDb, List<Path> segments,
+ boolean noCommit, boolean deleteGone, String params,
+ boolean filter, boolean normalize, boolean addBinaryContent,
+ boolean base64) throws IOException {
+
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
@@ -94,11 +110,17 @@ public class IndexingJob extends NutchTo
LOG.info("Indexer: deleting gone documents: " + deleteGone);
LOG.info("Indexer: URL filtering: " + filter);
LOG.info("Indexer: URL normalizing: " + normalize);
-
+ if (addBinaryContent) {
+ if (base64) {
+ LOG.info("Indexer: adding binary content as Base64");
+ } else {
+ LOG.info("Indexer: adding binary content");
+ }
+ }
IndexWriters writers = new IndexWriters(getConf());
LOG.info(writers.describe());
- IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);
+ IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job,
addBinaryContent);
// NOW PASSED ON THE COMMAND LINE AS A HADOOP PARAM
// job.set(SolrConstants.SERVER_URL, solrUrl);
@@ -106,6 +128,7 @@ public class IndexingJob extends NutchTo
job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);
+ job.setBoolean(IndexerMapReduce.INDEXER_BINARY_AS_BASE64, base64);
if (params != null) {
job.set(IndexerMapReduce.INDEXER_PARAMS, params);
@@ -141,7 +164,8 @@ public class IndexingJob extends NutchTo
public int run(String[] args) throws Exception {
if (args.length < 2) {
System.err
- .println("Usage: Indexer <crawldb> [-linkdb <linkdb>] [-params
k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone]
[-filter] [-normalize]");
+ //.println("Usage: Indexer <crawldb> [-linkdb <linkdb>] [-params
k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone]
[-filter] [-normalize]");
+ .println("Usage: Indexer <crawldb> [-linkdb <linkdb>] [-params
k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone]
[-filter] [-normalize] [-addBinaryContent] [-base64]");
IndexWriters writers = new IndexWriters(getConf());
System.err.println(writers.describe());
return -1;
@@ -157,6 +181,8 @@ public class IndexingJob extends NutchTo
boolean deleteGone = false;
boolean filter = false;
boolean normalize = false;
+ boolean addBinaryContent = false;
+ boolean base64 = false;
for (int i = 1; i < args.length; i++) {
if (args[i].equals("-linkdb")) {
@@ -180,6 +206,10 @@ public class IndexingJob extends NutchTo
filter = true;
} else if (args[i].equals("-normalize")) {
normalize = true;
+ } else if (args[i].equals("-addBinaryContent")) {
+ addBinaryContent = true;
+ } else if (args[i].equals("-base64")) {
+ base64 = true;
} else if (args[i].equals("-params")) {
params = args[++i];
} else {
@@ -188,8 +218,7 @@ public class IndexingJob extends NutchTo
}
try {
- index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
- normalize);
+ index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
normalize, addBinaryContent, base64);
return 0;
} catch (final Exception e) {
LOG.error("Indexer: " + StringUtils.stringifyException(e));