Author: jerome
Date: Tue Dec 13 08:33:46 2005
New Revision: 356532
URL: http://svn.apache.org/viewcvs?rev=356532&view=rev
Log:
Remove hard-coded content-type checking in parsers
Modified:
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
Modified:
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=356532&r1=356531&r2=356532&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Tue Dec 13 08:33:46 2005
@@ -109,17 +109,12 @@
Outlink[] outlinks = new Outlink[0];
ContentProperties metadata = new ContentProperties();
- // check that contentType is one we can handle
- String contentType = content.getContentType();
- if (!"".equals(contentType) && !contentType.startsWith("text/html"))
- return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
- "Content-Type not text/html: " + contentType).getEmptyParse();
-
// parse the content
DocumentFragment root;
try {
byte[] contentInOctets = content.getContent();
InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets));
+ String contentType = content.getMetadata().getProperty("Content-Type");
String encoding = StringUtil.parseCharacterEncoding(contentType);
if (encoding!=null) {
metadata.put("OriginalCharEncoding", encoding);
Modified:
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java?rev=356532&r1=356531&r2=356532&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java Tue Dec 13 08:33:46 2005
@@ -89,15 +89,6 @@
*/
public Parse getParse(final Content content) {
- // check that contentType is one we can handle
- final String contentType = content.getContentType();
-
- if (contentType != null && !contentType.startsWith(MIME_TYPE)) {
- return new ParseStatus(ParseStatus.FAILED,
- ParseStatus.FAILED_INVALID_FORMAT, "Content-Type is not ["
- + MIME_TYPE + "] was: " + contentType).getEmptyParse();
- }
-
String plainText = null;
String title = null;
Outlink[] outlinks = null;
@@ -170,4 +161,4 @@
}
}
-}
\ No newline at end of file
+}
Modified:
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java?rev=356532&r1=356531&r2=356532&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java Tue Dec 13 08:33:46 2005
@@ -57,12 +57,6 @@
public Parse getParse(Content content) {
- // check that contentType is one we can handle
- String contentType = content.getContentType();
- if (contentType != null && !contentType.startsWith("application/msword"))
- return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
- "Content-Type not application/msword: " + contentType).getEmptyParse();
-
String text = null;
String title = null;
Properties properties = null;
Modified:
lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java?rev=356532&r1=356531&r2=356532&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java Tue Dec 13 08:33:46 2005
@@ -84,12 +84,6 @@
public Parse getParse(Content content) {
- // check that contentType is one we can handle
- String contentType = content.getContentType();
- if (contentType != null && !contentType.startsWith("application/pdf"))
- return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
- "Content-Type not application/pdf: " + contentType).getEmptyParse();
-
// in memory representation of pdf file
PDDocument pdf = null;
Modified:
lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java?rev=356532&r1=356531&r2=356532&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java Tue Dec 13 08:33:46 2005
@@ -101,15 +101,6 @@
*/
public Parse getParse(Content content) {
- // check that contentType is one we can handle
- String contentType = content.getContentType();
- if (contentType != null
- && (!contentType.startsWith("text/xml") && !contentType
- .startsWith("application/rss+xml")))
- return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
- "Content-Type not text/xml or application/rss+xml: "
- + contentType).getEmptyParse();
-
List theRSSChannels = null;
try {
Modified:
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java?rev=356532&r1=356531&r2=356532&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java Tue Dec 13 08:33:46 2005
@@ -48,13 +48,6 @@
public Parse getParse(final Content content) {
- // check that contentType is one we can handle
- final String contentType = content.getContentType();
- if (contentType != null && !contentType.startsWith("application/zip")) {
- return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
- "Content-Type not application/zip: " + contentType).getEmptyParse();
- }
-
String resultText = null;
String resultTitle = null;
Outlink[] outlinks = null;