Author: snagel
Date: Wed May 8 22:04:04 2013
New Revision: 1480484

URL: http://svn.apache.org/r1480484
Log:
NUTCH-956 solrindex issues
Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/conf/schema-solr4.xml
    nutch/branches/2.x/conf/schema.xml
    nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1480484&r1=1480483&r2=1480484&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed May 8 22:04:04 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 2.2 - Current Development
 
+* NUTCH-956 solrindex issues: add field tld to Solr schema (Alexis via lewismc, snagel)
+
 * NUTCH-1277 Fix [fallthrough] javac warnings (tejasp)
 
 * NUTCH-1514 Phase out the deprecated configuration properties (if possible) (tejasp)

Modified: nutch/branches/2.x/conf/schema-solr4.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema-solr4.xml?rev=1480484&r1=1480483&r2=1480484&view=diff
==============================================================================
--- nutch/branches/2.x/conf/schema-solr4.xml (original)
+++ nutch/branches/2.x/conf/schema-solr4.xml Wed May 8 22:04:04 2013
@@ -346,6 +346,9 @@
 
         <!-- fields for creativecommons plugin -->
         <field name="cc" type="string" stored="true" indexed="true" multiValued="true"/>
+
+        <!-- fields for tld plugin -->
+        <field name="tld" type="string" stored="false" indexed="false"/>
     </fields>
     <uniqueKey>id</uniqueKey>
     <defaultSearchField>text</defaultSearchField>

Modified: nutch/branches/2.x/conf/schema.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema.xml?rev=1480484&r1=1480483&r2=1480484&view=diff
==============================================================================
--- nutch/branches/2.x/conf/schema.xml (original)
+++ nutch/branches/2.x/conf/schema.xml Wed May 8 22:04:04 2013
@@ -114,6 +114,9 @@
 
         <!-- fields for creativecommons plugin -->
         <field name="cc" type="string" stored="true" indexed="true" multiValued="true"/>
+
+        <!-- fields for tld plugin -->
+        <field name="tld" type="string" stored="false" indexed="false"/>
     </fields>
     <uniqueKey>id</uniqueKey>
     <defaultSearchField>content</defaultSearchField>

Modified: nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1480484&r1=1480483&r2=1480484&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Wed May 8 22:04:04 2013
@@ -44,10 +44,12 @@ import org.slf4j.LoggerFactory;
 
 /**
  * Add (or reset) a few metaData properties as respective fields (if they are
- * available), so that they can be displayed by more.jsp (called by search.jsp).
+ * available), so that they can be accurately used within the search index.
  *
- * content-type is indexed to support query by type: last-modifed is indexed to
- * support query by date:
+ * 'lastModifed' is indexed to support query by date, 'contentLength' obtains content length from the HTTP
+ * header, 'type' field is indexed to support query by type and finally the 'title' field is an attempt
+ * to reset the title if a content-disposition hint exists. The logic is that such a presence is indicative
+ * that the content provider wants the filename therein to be used as the title.
  *
  * Still need to make content-length searchable!
  *
@@ -171,7 +173,9 @@ public class MoreIndexingFilter implemen
    */
   private NutchDocument addType(NutchDocument doc, WebPage page, String url) {
     String mimeType = null;
-    Utf8 contentType = page.getFromHeaders(new Utf8(HttpHeaders.CONTENT_TYPE));
+    Utf8 contentType = page.getContentType();
+    if (contentType == null)
+      contentType = page.getFromHeaders(new Utf8(HttpHeaders.CONTENT_TYPE));
     if (contentType == null) {
       // Note by Jerome Charron on 20050415:
       // Content Type not solved by a previous plugin
@@ -194,13 +198,11 @@ public class MoreIndexingFilter implemen
       return doc;
     }
 
-    //String scontentType = mimeType.getName();
-
     doc.add("type", mimeType);
 
     // Check if we need to split the content type in sub parts
-    if ( null != contentType && conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) {
-      String[] parts = getParts(contentType.toString());
+    if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) {
+      String[] parts = getParts(mimeType);
 
       for(String part: parts) {
         doc.add("type", part);