Author: snagel
Date: Wed May  8 22:04:04 2013
New Revision: 1480484

URL: http://svn.apache.org/r1480484
Log:
NUTCH-956 solrindex issues

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/conf/schema-solr4.xml
    nutch/branches/2.x/conf/schema.xml
    nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1480484&r1=1480483&r2=1480484&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed May  8 22:04:04 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 2.2 - Current Development
 
+* NUTCH-956 solrindex issues: add field tld to Solr schema (Alexis via lewismc, snagel)
+
 * NUTCH-1277 Fix [fallthrough] javac warnings (tejasp)
 
 * NUTCH-1514 Phase out the deprecated configuration properties (if possible) (tejasp)

Modified: nutch/branches/2.x/conf/schema-solr4.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema-solr4.xml?rev=1480484&r1=1480483&r2=1480484&view=diff
==============================================================================
--- nutch/branches/2.x/conf/schema-solr4.xml (original)
+++ nutch/branches/2.x/conf/schema-solr4.xml Wed May  8 22:04:04 2013
@@ -346,6 +346,9 @@
 
     <!-- fields for creativecommons plugin -->
     <field name="cc" type="string" stored="true" indexed="true" multiValued="true"/>
+
+    <!-- fields for tld plugin -->    
+    <field name="tld" type="string" stored="false" indexed="false"/>
  </fields>
  <uniqueKey>id</uniqueKey>
  <defaultSearchField>text</defaultSearchField>

Modified: nutch/branches/2.x/conf/schema.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema.xml?rev=1480484&r1=1480483&r2=1480484&view=diff
==============================================================================
--- nutch/branches/2.x/conf/schema.xml (original)
+++ nutch/branches/2.x/conf/schema.xml Wed May  8 22:04:04 2013
@@ -114,6 +114,9 @@
         <!-- fields for creativecommons plugin -->
         <field name="cc" type="string" stored="true" indexed="true"
             multiValued="true"/>
+            
+        <!-- fields for tld plugin -->    
+        <field name="tld" type="string" stored="false" indexed="false"/>
     </fields>
     <uniqueKey>id</uniqueKey>
     <defaultSearchField>content</defaultSearchField>

Modified: nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1480484&r1=1480483&r2=1480484&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Wed May  8 22:04:04 2013
@@ -44,10 +44,12 @@ import org.slf4j.LoggerFactory;
 
 /**
  * Add (or reset) a few metaData properties as respective fields (if they are
- * available), so that they can be displayed by more.jsp (called by search.jsp).
+ * available), so that they can be accurately used within the search index.
  * 
- * content-type is indexed to support query by type: last-modifed is indexed to
- * support query by date:
+ * 'lastModifed' is indexed to support query by date, 'contentLength' obtains content length from the HTTP
+ * header, 'type' field is indexed to support query by type and finally the 'title' field is an attempt 
+ * to reset the title if a content-disposition hint exists. The logic is that such a presence is indicative 
+ * that the content provider wants the filename therein to be used as the title.
  * 
  * Still need to make content-length searchable!
  * 
@@ -171,7 +173,9 @@ public class MoreIndexingFilter implemen
    */
   private NutchDocument addType(NutchDocument doc, WebPage page, String url) {
     String mimeType = null;
-    Utf8 contentType = page.getFromHeaders(new Utf8(HttpHeaders.CONTENT_TYPE));
+    Utf8 contentType = page.getContentType();
+    if (contentType == null)
+       contentType = page.getFromHeaders(new Utf8(HttpHeaders.CONTENT_TYPE));
     if (contentType == null) {
       // Note by Jerome Charron on 20050415:
       // Content Type not solved by a previous plugin
@@ -194,13 +198,11 @@ public class MoreIndexingFilter implemen
       return doc;
     }
 
-    //String scontentType = mimeType.getName();
-
     doc.add("type", mimeType);
 
     // Check if we need to split the content type in sub parts
-    if ( null != contentType && conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) {
-      String[] parts = getParts(contentType.toString());
+    if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) {
+      String[] parts = getParts(mimeType);
 
       for(String part: parts) {
         doc.add("type", part);


Reply via email to