Author: siren
Date: Tue May 15 11:29:49 2007
New Revision: 538273

URL: http://svn.apache.org/viewvc?view=rev&rev=538273
Log:
NUTCH-161 Change Plain text parser to use parser.character.encoding.default 
property for fall back encoding
spotted by KuroSaka TeruHiko

Modified:
    lucene/nutch/trunk/CHANGES.txt
    
lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=538273&r1=538272&r2=538273
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue May 15 11:29:49 2007
@@ -19,6 +19,10 @@
  
  7. NUTCH-483 - Remove redundant commons-logging jar from ontology plugin
     (siren)
+    
+ 8. NUTCH-161 - Change Plain text parser to
+    use parser.character.encoding.default property for fall back encoding
+    (KuroSaka TeruHiko, siren)
   
 
 Release 0.9 - 2007-04-02

Modified: lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java?view=diff&rev=538273&r1=538272&r2=538273
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Tue May 15 11:29:49 2007
@@ -24,35 +24,42 @@
 import org.apache.hadoop.conf.Configuration;
 
 public class TextParser implements Parser {
+
   private Configuration conf;
+  
+  /**
+   * Encoding to be used when character set isn't specified
+   * as HTTP header.
+   */
+  private String defaultEncoding;
 
+  /**
+   * Parses plain text document. This code uses configured default encoding
+   * {@code parser.character.encoding.default} if character set isn't specified
+   * as HTTP header. FIXME: implement charset detector
+   */
   public ParseResult getParse(Content content) {
 
-    // ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new
-    // Outlink[0], metadata);
-
     String encoding = StringUtil.parseCharacterEncoding(content
         .getContentType());
     String text;
-    if (encoding != null) { // found an encoding header
-      try { // try to use named encoding
-        text = new String(content.getContent(), encoding);
-      } catch (java.io.UnsupportedEncodingException e) {
-      return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
-      }
-    } else {
-      // FIXME: implement charset detector. This code causes problem when
-      // character set isn't specified in HTTP header.
-      text = new String(content.getContent()); // use default encoding
+    try {
+      text = new String(content.getContent(), encoding != null ? encoding
+          : defaultEncoding);
+    } catch (java.io.UnsupportedEncodingException e) {
+      return new ParseStatus(e)
+          .getEmptyParseResult(content.getUrl(), getConf());
     }
+    
     ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "",
         OutlinkExtractor.getOutlinks(text, getConf()), content.getMetadata());
     parseData.setConf(this.conf);
     return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
-    
   }
 
   public void setConf(Configuration conf) {
+    defaultEncoding = conf.get("parser.character.encoding.default",
+        "windows-1252");
     this.conf = conf;
   }
 



-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

Reply via email to