Author: siren
Date: Tue Feb 17 14:28:14 2009
New Revision: 745096

URL: http://svn.apache.org/viewvc?rev=745096&view=rev
Log:
fix NUTCH-631 - thanks to Stefan Will

Added:
    lucene/nutch/trunk/src/plugin/index-more/src/test/
    lucene/nutch/trunk/src/plugin/index-more/src/test/org/
    lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/
    lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/
    lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/
    lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/plugin/build.xml
    lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=745096&r1=745095&r2=745096&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Feb 17 14:28:14 2009
@@ -339,6 +339,9 @@
      (Curtis d'Entremont, ab)
 
 127. NUTCH-683 - NUTCH-676 broke CrawlDbMerger. (dogacan)
+
+128. NUTCH-631 - MoreIndexingFilter fails with NoSuchElementException
+     (Stefan Will, siren)
      
 Release 0.9 - 2007-04-02
 

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?rev=745096&r1=745095&r2=745096&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Tue Feb 17 14:28:14 2009
@@ -93,6 +93,7 @@
   <target name="test">
     <parallel threadCount="2">
      <ant dir="creativecommons" target="test"/>
+     <ant dir="index-more" target="test"/>
      <ant dir="languageidentifier" target="test"/>
      <ant dir="lib-http" target="test"/>
      <ant dir="ontology" target="test"/>

Modified: lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=745096&r1=745095&r2=745096&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Tue Feb 17 14:28:14 2009
@@ -175,12 +175,31 @@
     return doc;
   }
 
-  // Add Content-Type and its primaryType and subType
+  /**
+   * <p>
+   * Adds the Content-Type and its primaryType and subType to the field
+   * "type" as un-stored, indexed and un-tokenized values, so that search
+   * results can be confined by contentType, by its primaryType or by its
+   * subType.
+   * </p>
+   * <p>
+   * For example, if contentType is application/vnd.ms-powerpoint, search can be
+   * done with one of the following qualifiers
+   * type:application/vnd.ms-powerpoint type:application type:vnd.ms-powerpoint
+   * all case insensitive. The query filter is implemented in
+   * {@link TypeQueryFilter}.
+   * </p>
+   * 
+   * @param doc
+   * @param data
+   * @param url
+   * @return
+   */
   private NutchDocument addType(NutchDocument doc, ParseData data, String url) {
     MimeType mimeType = null;
     String contentType = data.getMeta(Response.CONTENT_TYPE);
     if (contentType == null) {
-       // Note by Jerome Charron on 20050415:
+        // Note by Jerome Charron on 20050415:
         // Content Type not solved by a previous plugin
         // Or unable to solve it... Trying to find it
         // Should be better to use the doc content too
@@ -202,32 +221,31 @@
     }
 
     contentType = mimeType.getName();
-    String primaryType = mimeType.getSuperType().getName();
-    String subType = mimeType.getSubTypes().first().getName();
-    // leave this for future improvement
-    //MimeTypeParameterList parameterList = mimeType.getParameters()
-
-    // add contentType, primaryType and subType to field "type"
-    // as un-stored, indexed and un-tokenized, so that search results
-    // can be confined by contentType or its primaryType or its subType.
-    // For example, if contentType is application/vnd.ms-powerpoint,
-    // search can be done with one of the following qualifiers
-    // type:application/vnd.ms-powerpoint
-    // type:application
-    // type:vnd.ms-powerpoint
-    // all case insensitive.
-    // The query filter is implemented in TypeQueryFilter.java
+    
     doc.add("type", contentType);
-    doc.add("type", primaryType);
-    doc.add("type", subType);
 
-    // add its primaryType and subType to respective fields
-    doc.add("primaryType", primaryType);
-    doc.add("subType", subType);
+    String[] parts = getParts(contentType);
+
+    for(String part: parts) {
+      doc.add("type", part);
+    }
+    
+    // leave this for future improvement
+    //MimeTypeParameterList parameterList = mimeType.getParameters()
 
     return doc;
   }
 
+  
+  /**
+   * Utility method for splitting mime type into type and subtype.
+   * @param mimeType mime type name to split, e.g. "text/html"
+   * @return the substrings of mimeType around each "/" character
+   */
+  static String[] getParts(String mimeType) {
+    return mimeType.split("/");
+  }
+
   // Reset title if we see non-standard HTTP header "Content-Disposition".
   // It's a good indication that content provider wants filename therein
   // be used as the title of this url.

Added: lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java?rev=745096&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java Tue Feb 17 14:28:14 2009
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.more;
+
+import junit.framework.TestCase;
+
+public class TestMoreIndexingFilter extends TestCase {
+  // Verifies that getParts("text/html") yields exactly "text" and "html".
+  public void testGetParts() {
+    String[] parts = MoreIndexingFilter.getParts("text/html");
+    assertParts(parts, 2, "text", "html");
+
+  }
+  // Asserts that parts has count elements and matches expected index by index.
+  private void assertParts(String[] parts, int count, String... expected) {
+    assertEquals(count, parts.length);
+    for (int i = 0; i < expected.length; i++) {
+      assertEquals(expected[i], parts[i]);
+    }
+  }
+}


Reply via email to