Philipp Suter wrote:
I will have some spare cycles from the end of July until the end of August, but I would need a short explanation of where and how to integrate the flash text extractor. Furthermore, is there any document whatsoever explaining the Nutch design approach? I have never looked at the Nutch sources, and the design is very much tuned for performance, which does not make it easier to understand, but better to use :-)
First, you need to check out the complete source from SVN trunk. Then you can copy one of the existing plugins and use it as a template. I attached an Eclipse project - just put these two files in the main directory (where README.txt is), import the project into Eclipse and off you go.
-- Best regards, Andrzej Bialecki <>< ___. ___ ___ ___ _ _ __________________________________ [__ || __|__/|__||\/| Information Retrieval, Semantic Web ___|||__|| \| || | Embedded Unix, System Integration http://www.sigram.com Contact: info at sigram dot com
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Eclipse Java build path (.classpath) for the Nutch source tree.
  All paths are relative to the project root, i.e. the directory this file
  is placed in. Entries are kept in their original order.
-->
<classpath>
  <!-- Source folders: core sources first, then per-plugin sources and tests. -->
  <classpathentry kind="src" path="src/java"/>
  <classpathentry kind="src" path="src/plugin/parse-js/src/java"/>
  <classpathentry kind="src" path="src/plugin/protocol-httpclient/src/java"/>
  <classpathentry kind="src" path="src/plugin/clustering-carrot2/src/java"/>
  <classpathentry kind="src" path="src/plugin/clustering-carrot2/src/test"/>
  <classpathentry kind="src" path="src/plugin/creativecommons/src/java"/>
  <classpathentry kind="src" path="src/plugin/creativecommons/src/test"/>
  <classpathentry kind="src" path="src/plugin/index-basic/src/java"/>
  <classpathentry kind="src" path="src/plugin/index-more/src/java"/>
  <classpathentry kind="src" path="src/plugin/languageidentifier/src/java"/>
  <classpathentry kind="src" path="src/plugin/languageidentifier/src/test"/>
  <classpathentry kind="src" path="src/plugin/ontology/src/java"/>
  <classpathentry kind="src" path="src/plugin/ontology/src/test"/>
  <classpathentry kind="src" path="src/plugin/parse-ext/src/java"/>
  <classpathentry kind="src" path="src/plugin/parse-ext/src/test"/>
  <classpathentry kind="src" path="src/plugin/parse-html/src/java"/>
  <classpathentry kind="src" path="src/plugin/parse-html/src/test"/>
  <classpathentry kind="src" path="src/plugin/parse-msword/src/java"/>
  <classpathentry kind="src" path="src/plugin/parse-msword/src/test"/>
  <classpathentry kind="src" path="src/plugin/parse-pdf/src/java"/>
  <classpathentry kind="src" path="src/plugin/parse-pdf/src/test"/>
  <classpathentry kind="src" path="src/plugin/parse-text/src/java"/>
  <classpathentry kind="src" path="src/plugin/protocol-file/src/java"/>
  <classpathentry kind="src" path="src/plugin/protocol-ftp/src/java"/>
  <classpathentry kind="src" path="src/plugin/protocol-http/src/java"/>
  <classpathentry kind="src" path="src/plugin/protocol-http/src/test"/>
  <classpathentry kind="src" path="src/plugin/query-basic/src/java"/>
  <classpathentry kind="src" path="src/plugin/query-more/src/java"/>
  <classpathentry kind="src" path="src/plugin/query-site/src/java"/>
  <classpathentry kind="src" path="src/plugin/query-url/src/java"/>
  <classpathentry kind="src" path="src/plugin/urlfilter-prefix/src/java"/>
  <classpathentry kind="src" path="src/plugin/urlfilter-regex/src/java"/>
  <classpathentry kind="src" path="src/test"/>

  <!-- Jars from the top-level lib/ directory. -->
  <classpathentry kind="lib" path="lib/commons-logging-api-1.0.4.jar"/>
  <classpathentry kind="lib" path="lib/concurrent-1.3.4.jar"/>
  <classpathentry kind="lib" path="lib/jakarta-oro-2.0.7.jar"/>
  <classpathentry kind="lib" path="lib/jetty-5.1.2.jar"/>
  <classpathentry kind="lib" path="lib/junit-3.8.1.jar"/>
  <classpathentry kind="lib" path="lib/servlet-api.jar"/>
  <classpathentry kind="lib" path="lib/taglibs-i18n.jar"/>
  <classpathentry kind="lib" path="lib/xerces-2_6_2.jar"/>
  <classpathentry kind="lib" path="lib/xerces-2_6_2-apis.jar"/>

  <!-- Jars bundled with individual plugins. -->
  <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/carrot2-filter-lingo.jar"/>
  <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/carrot2-local-core.jar"/>
  <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/carrot2-snowball-stemmers.jar"/>
  <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/carrot2-util-common.jar"/>
  <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/carrot2-util-tokenizer.jar"/>
  <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/commons-collections-3.0.jar"/>
  <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/commons-pool-1.1.jar"/>
  <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/FSA.jar"/>
  <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/Jama-1.0.1-patched.jar"/>
  <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/nekohtml-0.9.2.jar"/>
  <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/violinstrings-1.0.2.jar"/>
  <classpathentry kind="lib" path="src/plugin/ontology/lib/commons-logging-1.0.3.jar"/>
  <classpathentry kind="lib" path="src/plugin/ontology/lib/icu4j_2_6_1.jar"/>
  <classpathentry kind="lib" path="src/plugin/ontology/lib/jena-2.1.jar"/>
  <classpathentry kind="lib" path="src/plugin/parse-html/lib/nekohtml-0.9.4.jar"/>
  <classpathentry kind="lib" path="src/plugin/parse-msword/lib/poi-2.1-20040508.jar"/>
  <classpathentry kind="lib" path="src/plugin/parse-msword/lib/poi-scratchpad-2.1-20040508.jar"/>
  <classpathentry kind="lib" path="src/plugin/protocol-ftp/lib/commons-net-1.2.0-dev.jar"/>

  <!-- JRE system library container supplied by Eclipse JDT. -->
  <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>

  <!-- Remaining library entries, kept in their original positions. -->
  <classpathentry kind="lib" path="lib/jaf-1.0.2.jar"/>
  <classpathentry kind="lib" path="src/plugin/parse-pdf/lib/log4j-1.2.9.jar"/>
  <classpathentry kind="lib" path="src/plugin/parse-pdf/lib/PDFBox-0.7.0.jar"/>
  <!-- NOTE(review): "conf" and "build/plugins" look like directories rather than jars; confirm they exist before importing. -->
  <classpathentry kind="lib" path="conf"/>
  <classpathentry kind="lib" path="build/plugins"/>
  <classpathentry kind="lib" path="lib/lucene-1.9-rc1-dev.jar"/>
  <classpathentry kind="lib" path="lib/lucene-misc-1.9-rc1-dev.jar"/>
  <classpathentry kind="lib" path="src/plugin/parse-html/lib/tagsoup-1.0rc3.jar"/>
  <classpathentry kind="lib" path="src/plugin/protocol-httpclient/lib/commons-codec.jar"/>
  <classpathentry kind="lib" path="src/plugin/protocol-httpclient/lib/commons-httpclient-3.0-rc2.jar"/>

  <!-- Output folder for this build path. -->
  <classpathentry kind="output" path=".eclipse"/>
</classpath>
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Eclipse project description (.project) for the "Nutch Vanilla" project.
  Declares one build command (the JDT Java builder) and one nature (Java).
-->
<projectDescription>
  <name>Nutch Vanilla</name>
  <comment></comment>
  <!-- No referenced projects. -->
  <projects>
  </projects>
  <buildSpec>
    <buildCommand>
      <name>org.eclipse.jdt.core.javabuilder</name>
      <arguments>
      </arguments>
    </buildCommand>
  </buildSpec>
  <natures>
    <nature>org.eclipse.jdt.core.javanature</nature>
  </natures>
</projectDescription>
