Author: jerome
Date: Thu Oct 20 04:28:57 2005
New Revision: 326889
URL: http://svn.apache.org/viewcvs?rev=326889&view=rev
Log:
NUTCH-88 Fixes:
* Changes some log levels from severe to warning
* MimeType now offers a clean method that only keeps the primary and sub types
(removing extra parameters)
* ParserFactory uses a cleaned content-type to find the suitable parser.
* Adds some unit tests.
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java
lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/TestMimeType.java
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java?rev=326889&r1=326888&r2=326889&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
Thu Oct 20 04:28:57 2005
@@ -19,7 +19,6 @@
import java.util.Iterator;
import java.util.List;
import java.util.Vector;
-import java.util.logging.Level;
import java.util.logging.Logger;
import java.io.InputStream;
@@ -92,9 +91,8 @@
parsePluginUrl = new URL(fParsePluginsFile);
ppInputStream = parsePluginUrl.openStream();
} catch (Exception e) {
- LOG.log(Level.SEVERE,
- "Unable to load parse plugins file from URL " +
- "[" + fParsePluginsFile + "]", e);
+ LOG.warning("Unable to load parse plugins file from URL " +
+ "[" + fParsePluginsFile + "]. Reason is [" + e + "]");
return pList;
}
} else {
@@ -109,7 +107,8 @@
parser = factory.newDocumentBuilder();
document = parser.parse(inputSource);
} catch (Exception e) {
- LOG.log(Level.SEVERE, "Unable to parse [" + fParsePluginsFile + "]", e);
+ LOG.warning("Unable to parse [" + fParsePluginsFile + "]." +
+ "Reason is [" + e + "]");
return null;
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java?rev=326889&r1=326888&r2=326889&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java Thu Oct
20 04:28:57 2005
@@ -64,6 +64,7 @@
Parse parse = null;
for (int i=0; i<parsers.length; i++) {
+ LOG.info("Parsing [" + content.getUrl() + "] with [" + parsers[i] + "]");
parse = parsers[i].getParse(content);
if ((parse != null) && (parse.getData().getStatus().isSuccess())) {
return parse;
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java?rev=326889&r1=326888&r2=326889&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java Thu
Oct 20 04:28:57 2005
@@ -29,6 +29,8 @@
import org.apache.nutch.plugin.PluginRepository;
import org.apache.nutch.plugin.PluginRuntimeException;
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.mime.MimeType;
+import org.apache.nutch.util.mime.MimeTypeException;
/** Creates and caches {@link Parser} plugins.*/
@@ -228,9 +230,19 @@
* @return List - List of extensions to be used for this contentType.
* If none, returns null.
*/
- protected static List getExtensions(String contentType){
+ protected static List getExtensions(String contentType) {
- List extensions = (List)CACHE.get(contentType);
+ // First of all, tries to clean the content-type
+ String type = null;
+ try {
+ type = MimeType.clean(contentType);
+ } catch (MimeTypeException mte) {
+ LOG.info("Could not clean the content-type [" + contentType +
+ "], Reason is [" + mte + "]. Using its raw version...");
+ type = contentType;
+ }
+
+ List extensions = (List) CACHE.get(type);
// Just compare the reference:
// if this is the empty list, we know we will find no extension.
@@ -239,13 +251,13 @@
}
if (extensions == null) {
- extensions = findExtensions(contentType);
+ extensions = findExtensions(type);
if (extensions != null) {
- CACHE.put(contentType, extensions);
+ CACHE.put(type, extensions);
} else {
// Put the empty extension list into cache
// to remember we don't know any related extension.
- CACHE.put(contentType, EMPTY_EXTENSION_LIST);
+ CACHE.put(type, EMPTY_EXTENSION_LIST);
}
}
return extensions;
@@ -311,7 +323,7 @@
// file.
// OR it was enabled in plugin.includes, but the plugin's plugin.xml
// file does not claim that the plugin supports the specified mimeType
- // in either case, LOG the appropriate error message to SEVERE level
+ // in either case, LOG the appropriate error message to WARN level
if (ext == null) {
//try to get it just by its pluginId
@@ -331,10 +343,10 @@
} else{
// plugin wasn't enabled via plugin.includes
- LOG.severe("ParserFactory: Plugin: " + parsePluginId +
- " mapped to contentType " + contentType +
- " via parse-plugins.xml, but not enabled via " +
- "plugin.includes in nutch-default.xml");
+ LOG.warning("ParserFactory: Plugin: " + parsePluginId +
+ " mapped to contentType " + contentType +
+ " via parse-plugins.xml, but not enabled via " +
+ "plugin.includes in nutch-default.xml");
}
} else{
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java?rev=326889&r1=326888&r2=326889&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java Thu
Oct 20 04:28:57 2005
@@ -105,14 +105,27 @@
}
// All is ok, assign values
- this.name = primary + SEPARATOR + sub;
+ this.name = primary + SEPARATOR + clearedSub;
this.primary = primary;
this.sub = clearedSub;
this.extensions = new ArrayList();
this.magics = new ArrayList();
}
-
-
+
+ /**
+ * Cleans a content-type.
+ * This method cleans a content-type by removing its optional parameters
+ * and returning only its <code>primary-type/sub-type</code>.
+ * @param type is the content-type to clean.
+ * @return the cleaned version of the specified content-type.
+ * @throws MimeTypeException if something wrong occurs during the
+ * parsing/cleaning of the specified type.
+ */
+ public final static String clean(String type) throws MimeTypeException {
+ return (new MimeType(type)).getName();
+ }
+
+
/**
* Return the name of this mime-type.
* @return the name of this mime-type.
Modified:
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java?rev=326889&r1=326888&r2=326889&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java
(original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java
Thu Oct 20 04:28:57 2005
@@ -58,6 +58,8 @@
public void testGetExtensions() throws Exception {
Extension ext = (Extension)ParserFactory.getExtensions("text/html").get(0);
assertEquals("parse-html", ext.getDescriptor().getPluginId());
+ ext = (Extension) ParserFactory.getExtensions("text/html; charset=ISO-8859-1").get(0);
+ assertEquals("parse-html", ext.getDescriptor().getPluginId());
ext = (Extension)ParserFactory.getExtensions("foo/bar").get(0);
assertEquals("parse-text", ext.getDescriptor().getPluginId());
}
@@ -69,6 +71,13 @@
assertEquals(1, parsers.length);
assertEquals("org.apache.nutch.parse.html.HtmlParser",
parsers[0].getClass().getName());
+
+ parsers = ParserFactory.getParsers("text/html; charset=ISO-8859-1",
"http://foo.com");
+ assertNotNull(parsers);
+ assertEquals(1, parsers.length);
+ assertEquals("org.apache.nutch.parse.html.HtmlParser",
+ parsers[0].getClass().getName());
+
parsers = ParserFactory.getParsers("application/x-javascript",
"http://foo.com");
Modified:
lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/TestMimeType.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/TestMimeType.java?rev=326889&r1=326888&r2=326889&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/TestMimeType.java
(original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/TestMimeType.java
Thu Oct 20 04:28:57 2005
@@ -87,6 +87,31 @@
constructorSuccess("mime/type");
}
+ /** Test of <code>clean(String)</code> method. */
+ public void testClean() {
+ try {
+ assertEquals("text/html", MimeType.clean("text/html"));
+ assertEquals("text/html", MimeType.clean("text/html; charset=ISO-8859-1"));
+ } catch (Exception e) {
+ fail(e.toString());
+ }
+ cleanExceptionChecker(null);
+ cleanExceptionChecker("");
+ cleanExceptionChecker("text");
+ cleanExceptionChecker("/html");
+ cleanExceptionChecker("/text/html");
+ }
+
+ private static void cleanExceptionChecker(String type) {
+ try {
+ MimeType.clean(type);
+ fail("Must raise a MimeTypeException for [" + type + "]");
+ } catch (MimeTypeException mte) { // All is ok
+ } catch (Exception e) {
+ fail("Must raise a MimeTypeException for [" + type + "]");
+ }
+ }
+
/** Test of <code>getPrimaryType</code> method. */
public void testGetPrimaryType() {
}