Author: jerome
Date: Thu Oct 20 04:28:57 2005
New Revision: 326889

URL: http://svn.apache.org/viewcvs?rev=326889&view=rev
Log:
NUTCH-88 Fixes:
* Changes some log levels from severe to warning
* MimeType now offers a clean method that only keeps the primary and sub types
(removing extra parameters)
* ParserFactory uses a cleaned content-type to find the suitable parser.
* Adds some unit tests.

Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
    lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java
    lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java
    lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/TestMimeType.java

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java?rev=326889&r1=326888&r2=326889&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java Thu Oct 20 04:28:57 2005
@@ -19,7 +19,6 @@
 import java.util.Iterator;
 import java.util.List;
 import java.util.Vector;
-import java.util.logging.Level;
 import java.util.logging.Logger;
 
 import java.io.InputStream;
@@ -92,9 +91,8 @@
         parsePluginUrl = new URL(fParsePluginsFile);
         ppInputStream = parsePluginUrl.openStream();
       } catch (Exception e) {
-        LOG.log(Level.SEVERE,
-                "Unable to load parse plugins file from URL " +
-                "[" + fParsePluginsFile + "]", e);
+        LOG.warning("Unable to load parse plugins file from URL " +
+                    "[" + fParsePluginsFile + "]. Reason is [" + e + "]");
         return pList;
       }
     } else {
@@ -109,7 +107,8 @@
       parser = factory.newDocumentBuilder();
       document = parser.parse(inputSource);
     } catch (Exception e) {
-      LOG.log(Level.SEVERE, "Unable to parse [" + fParsePluginsFile + "]", e);
+      LOG.warning("Unable to parse [" + fParsePluginsFile + "]." +
+                  "Reason is [" + e + "]");
       return null;
     }
     

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java?rev=326889&r1=326888&r2=326889&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java Thu Oct 20 04:28:57 2005
@@ -64,6 +64,7 @@
     
     Parse parse = null;
     for (int i=0; i<parsers.length; i++) {
+      LOG.info("Parsing [" + content.getUrl() + "] with [" + parsers[i] + "]");
       parse = parsers[i].getParse(content);
       if ((parse != null) && (parse.getData().getStatus().isSuccess())) {
         return parse;

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java?rev=326889&r1=326888&r2=326889&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java Thu Oct 20 04:28:57 2005
@@ -29,6 +29,8 @@
 import org.apache.nutch.plugin.PluginRepository;
 import org.apache.nutch.plugin.PluginRuntimeException;
 import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.mime.MimeType;
+import org.apache.nutch.util.mime.MimeTypeException;
 
 
 /** Creates and caches {@link Parser} plugins.*/
@@ -228,9 +230,19 @@
    * @return List - List of extensions to be used for this contentType.
    *                If none, returns null.
    */
-  protected static List getExtensions(String contentType){
+  protected static List getExtensions(String contentType) {
     
-    List extensions = (List)CACHE.get(contentType);
+    // First of all, tries to clean the content-type
+    String type = null;
+    try {
+        type = MimeType.clean(contentType);
+    } catch (MimeTypeException mte) {
+        LOG.info("Could not clean the content-type [" + contentType +
+                 "], Reason is [" + mte + "]. Using its raw version...");
+        type = contentType;
+    }
+
+    List extensions = (List) CACHE.get(type);
 
     // Just compare the reference:
     // if this is the empty list, we know we will find no extension.
@@ -239,13 +251,13 @@
     }
     
     if (extensions == null) {
-      extensions = findExtensions(contentType);
+      extensions = findExtensions(type);
       if (extensions != null) {
-        CACHE.put(contentType, extensions);
+        CACHE.put(type, extensions);
       } else {
        // Put the empty extension list into cache
        // to remember we don't know any related extension.
-       CACHE.put(contentType, EMPTY_EXTENSION_LIST);
+       CACHE.put(type, EMPTY_EXTENSION_LIST);
       }
     }
     return extensions;
@@ -311,7 +323,7 @@
         // file. 
         // OR it was enabled in plugin.includes, but the plugin's plugin.xml
         // file does not claim that the plugin supports the specified mimeType
-        // in either case, LOG the appropriate error message to SEVERE level
+        // in either case, LOG the appropriate error message to WARN level
         
         if (ext == null) {
            //try to get it just by its pluginId
@@ -331,10 +343,10 @@
           
           } else{
             // plugin wasn't enabled via plugin.includes
-            LOG.severe("ParserFactory: Plugin: " + parsePluginId + 
-                       " mapped to contentType " + contentType +
-                       " via parse-plugins.xml, but not enabled via " +
-                       "plugin.includes in nutch-default.xml");                
     
+            LOG.warning("ParserFactory: Plugin: " + parsePluginId + 
+                        " mapped to contentType " + contentType +
+                        " via parse-plugins.xml, but not enabled via " +
+                        "plugin.includes in nutch-default.xml");               
      
           }
           
         } else{

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java?rev=326889&r1=326888&r2=326889&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java Thu Oct 20 04:28:57 2005
@@ -105,14 +105,27 @@
         }
                 
         // All is ok, assign values
-        this.name = primary + SEPARATOR + sub;
+        this.name = primary + SEPARATOR + clearedSub;
         this.primary = primary;
         this.sub = clearedSub;
         this.extensions = new ArrayList();
         this.magics = new ArrayList();
     }
-    
-    
+
+    /**
+     * Cleans a content-type.
+     * This method cleans a content-type by removing its optional parameters
+     * and returning only its <code>primary-type/sub-type</code>.
+     * @param type is the content-type to clean.
+     * @return the cleaned version of the specified content-type.
+     * @throws MimeTypeException if something wrong occurs during the
+     *         parsing/cleaning of the specified type.
+     */
+    public final static String clean(String type) throws MimeTypeException {
+        return (new MimeType(type)).getName();
+    }
+
+
     /**
      * Return the name of this mime-type.
      * @return the name of this mime-type.

Modified: 
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java?rev=326889&r1=326888&r2=326889&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java Thu Oct 20 04:28:57 2005
@@ -58,6 +58,8 @@
   public void testGetExtensions() throws Exception {
     Extension ext = (Extension)ParserFactory.getExtensions("text/html").get(0);
     assertEquals("parse-html", ext.getDescriptor().getPluginId());
+    ext = (Extension) ParserFactory.getExtensions("text/html; charset=ISO-8859-1").get(0);
+    assertEquals("parse-html", ext.getDescriptor().getPluginId());
     ext = (Extension)ParserFactory.getExtensions("foo/bar").get(0);
     assertEquals("parse-text", ext.getDescriptor().getPluginId());
   }
@@ -69,6 +71,13 @@
     assertEquals(1, parsers.length);
     assertEquals("org.apache.nutch.parse.html.HtmlParser",
         parsers[0].getClass().getName());
+
+    parsers = ParserFactory.getParsers("text/html; charset=ISO-8859-1", "http://foo.com");
+    assertNotNull(parsers);
+    assertEquals(1, parsers.length);
+    assertEquals("org.apache.nutch.parse.html.HtmlParser",
+        parsers[0].getClass().getName());
+
     
     parsers = ParserFactory.getParsers("application/x-javascript",
     "http://foo.com");

Modified: 
lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/TestMimeType.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/TestMimeType.java?rev=326889&r1=326888&r2=326889&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/TestMimeType.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/TestMimeType.java Thu Oct 20 04:28:57 2005
@@ -87,6 +87,31 @@
         constructorSuccess("mime/type");
     }
 
+    /** Test of <code>clean(String)</code> method. */
+    public void testClean() {
+        try {
+            assertEquals("text/html", MimeType.clean("text/html"));
+            assertEquals("text/html", MimeType.clean("text/html; charset=ISO-8859-1"));
+        } catch (Exception e) {
+            fail(e.toString());
+        }
+        cleanExceptionChecker(null);
+        cleanExceptionChecker("");
+        cleanExceptionChecker("text");
+        cleanExceptionChecker("/html");
+        cleanExceptionChecker("/text/html");
+    }
+
+    private static void cleanExceptionChecker(String type) {
+        try {
+            MimeType.clean(type);
+            fail("Must raise a MimeTypeException for [" + type + "]");
+        } catch (MimeTypeException mte) { // All is ok
+        } catch (Exception e) {
+            fail("Must raise a MimeTypeException for [" + type + "]");
+        }
+    }
+
     /** Test of <code>getPrimaryType</code> method. */
     public void testGetPrimaryType() {
     }


Reply via email to