Author: jerome
Date: Tue Sep 27 13:45:37 2005
New Revision: 292035
URL: http://svn.apache.org/viewcvs?rev=292035&view=rev
Log:
NUTCH-88, First step proposal implementation (thanks to Chris Mattmann and
Sébastien Le Callonnec)
Added:
lucene/nutch/trunk/conf/parse-plugins.dtd (with props)
lucene/nutch/trunk/conf/parse-plugins.xml (with props)
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java
(with props)
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
(with props)
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java
(with props)
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
Added: lucene/nutch/trunk/conf/parse-plugins.dtd
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.dtd?rev=292035&view=auto
==
--- lucene/nutch/trunk/conf/parse-plugins.dtd (added)
+++ lucene/nutch/trunk/conf/parse-plugins.dtd Tue Sep 27 13:45:37 2005
@@ -0,0 +1,7 @@
+<!ELEMENT parse-plugins (mimeType+)>
+<!ELEMENT mimeType (plugin+)>
+<!ATTLIST mimeType name CDATA #REQUIRED>
+
+<!ELEMENT plugin EMPTY>
+<!ATTLIST plugin id CDATA #REQUIRED>
+<!ATTLIST plugin order CDATA ''>
\ No newline at end of file
Propchange: lucene/nutch/trunk/conf/parse-plugins.dtd
--
svn:eol-style = native
Added: lucene/nutch/trunk/conf/parse-plugins.xml
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.xml?rev=292035&view=auto
==
--- lucene/nutch/trunk/conf/parse-plugins.xml (added)
+++ lucene/nutch/trunk/conf/parse-plugins.xml Tue Sep 27 13:45:37 2005
@@ -0,0 +1,207 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2005 The Apache Software Foundation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+ Author : mattmann
+ Description: This xml file represents a natural ordering for which parsing
+ plugin should get called for a particular mimeType.
+-->
+
+<parse-plugins>
+
+ <!-- by default if the mimeType is set to *, or
+ can't be determined, use parse-text -->
+ <mimeType name="*">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/java">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/msword">
+ <plugin id="parse-msword" />
+ </mimeType>
+
+ <mimeType name="application/pdf">
+ <plugin id="parse-pdf" />
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/postscript">
+ <plugin id="parse-pdf" />
+ </mimeType>
+
+ <mimeType name="application/rss+xml">
+ <plugin id="parse-rss" />
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/vnd.ms-excel">
+ <plugin id="parse-msexcel" />
+ </mimeType>
+
+ <mimeType name="application/vnd.ms-powerpoint">
+ <plugin id="parse-mspowerpoint" />
+ </mimeType>
+
+ <mimeType name="application/vnd.wap.wbxml">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/vnd.wap.wmlc">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/vnd.wap.wmlscriptc">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/xhtml+xml">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/x-bzip2">
+ <!-- try and parse it with the zip parser -->
+ <plugin id="parse-zip" />
+ </mimeType>
+
+ <mimeType name="application/x-csh">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/x-gzip">
+ <!-- try and parse it with the zip parser -->
+ <plugin id="parse-zip" />
+ </mimeType>
+
+ <mimeType name="application/x-javascript">
+ <plugin id="parse-js" />
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/x-kword">
+ <!-- try and parse it with the word parser -->
+ <plugin id="parse-msword" />
+ </mimeType>
+
+ mimeType name=application/x-kspread
+ !-- try and parse