Author: siren Date: Tue May 15 11:29:49 2007 New Revision: 538273 URL: http://svn.apache.org/viewvc?view=rev&rev=538273 Log: NUTCH-161 Change Plain text parser to use parser.character.encoding.default property for fall back encoding spotted by KuroSaka TeruHiko
Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=538273&r1=538272&r2=538273 ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue May 15 11:29:49 2007 @@ -19,6 +19,10 @@ 7. NUTCH-483 - Remove redundant commons-logging jar from ontology plugin (siren) + + 8. NUTCH-161 - Change Plain text parser to + use parser.character.encoding.default property for fall back encoding + (KuroSaka TeruHiko, siren) Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java?view=diff&rev=538273&r1=538272&r2=538273 ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Tue May 15 11:29:49 2007 @@ -24,35 +24,42 @@ import org.apache.hadoop.conf.Configuration; public class TextParser implements Parser { + private Configuration conf; + + /** + * Encoding to be used when character set isn't specified + * as HTTP header. + */ + private String defaultEncoding; + /** + * Parses plain text document. This code uses configured default encoding + * {@code parser.character.encoding.default} if character set isn't specified + * as HTTP header.
FIXME: implement charset detector + */ public ParseResult getParse(Content content) { - // ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new - // Outlink[0], metadata); - String encoding = StringUtil.parseCharacterEncoding(content .getContentType()); String text; - if (encoding != null) { // found an encoding header - try { // try to use named encoding - text = new String(content.getContent(), encoding); - } catch (java.io.UnsupportedEncodingException e) { - return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf()); - } - } else { - // FIXME: implement charset detector. This code causes problem when - // character set isn't specified in HTTP header. - text = new String(content.getContent()); // use default encoding + try { + text = new String(content.getContent(), encoding != null ? encoding + : defaultEncoding); + } catch (java.io.UnsupportedEncodingException e) { + return new ParseStatus(e) + .getEmptyParseResult(content.getUrl(), getConf()); } + ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", OutlinkExtractor.getOutlinks(text, getConf()), content.getMetadata()); parseData.setConf(this.conf); return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData)); - } public void setConf(Configuration conf) { + defaultEncoding = conf.get("parser.character.encoding.default", + "windows-1252"); this.conf = conf; } ------------------------------------------------------------------------- This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs