Added: incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java?view=auto&rev=567100
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java (added)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java Fri Aug
17 11:17:06 2007
@@ -0,0 +1,236 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.StringTokenizer;
+
+import junit.framework.TestCase;
+import org.apache.tika.config.Content;
+import org.apache.tika.config.LiusConfig;
+import org.apache.tika.exception.LiusException;
+import org.apache.tika.log.LiusLogger;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserFactory;
+import org.apache.tika.utils.Utils;
+import net.hedges.mimeinfo.MimeInfoException;
+
+/**
+ * JUnit tests that ask {@link ParserFactory} for a parser for each sample
+ * document and, for some formats, print or check the extracted content.
+ * <p>
+ * Sample documents are looked up in a <code>testFiles</code> directory that is
+ * a sibling of the first entry on <code>java.class.path</code>.
+ *
+ * @author Rida Benjelloun ([EMAIL PROTECTED])
+ */
+public class TestParsers extends TestCase {
+
+    /** Directory, next to the class directory, that holds the sample documents. */
+    private static final String TEST_FILES_DIR = "testFiles";
+
+    /** Configuration loaded in {@link #setUp()} and handed to the parser factory. */
+    private LiusConfig tc;
+
+    /** First entry of <code>java.class.path</code>; its parent is the base for all paths. */
+    private File classDir;
+
+    /** Path of the <code>config.xml</code> file used to build {@link #tc}. */
+    private String config;
+
+    /**
+     * Resolves the class directory from the class path, loads the parser
+     * configuration and points the logger at its log4j properties file.
+     */
+    public void setUp() {
+        String sep = File.separator;
+        StringTokenizer st = new StringTokenizer(
+                System.getProperty("java.class.path"), File.pathSeparator);
+
+        // Only the first class path entry is considered.
+        classDir = new File(st.nextToken());
+
+        config = classDir.getParent() + sep + "config" + sep + "config.xml";
+
+        // NOTE(review): this path uses "Config" while the one above uses
+        // "config"; on a case-sensitive file system these are different
+        // directories. Left unchanged -- confirm which spelling is intended.
+        String log4j = classDir.getParent() + sep + "Config" + sep + "log4j"
+                + sep + "log4j.properties";
+
+        tc = LiusConfig.getInstance(config);
+
+        LiusLogger.setLoggerConfigFile(log4j);
+    }
+
+    /*
+     * Disabled test, kept for reference:
+     *
+     * public void testConfig(){ TikaConfig tc =
+     * TikaConfig.getInstance("C:\\tika\\config\\tikaConfig2.xml"); ParserConfig
+     * pc = tc.getParserConfig("text/html"); assertEquals("parse-html",
+     * pc.getName()); }
+     */
+
+    /** Checks that a parser can be obtained for the sample PDF document. */
+    public void testPDFExtraction()
+            throws MimeInfoException, IOException, LiusException {
+        parserFor("testPDF.PDF");
+    }
+
+    /** Checks that a parser can be obtained for the sample plain text document. */
+    public void testTXTExtraction()
+            throws MimeInfoException, IOException, LiusException {
+        parserFor("testTXT.txt");
+    }
+
+    /** Checks that a parser can be obtained for the sample RTF document. */
+    public void testRTFExtraction()
+            throws MimeInfoException, IOException, LiusException {
+        parserFor("testRTF.rtf");
+    }
+
+    /** Checks that a parser can be obtained for the sample XML document. */
+    public void testXMLExtraction()
+            throws MimeInfoException, IOException, LiusException {
+        parserFor("testXML.xml");
+    }
+
+    /** Obtains a parser for the sample PowerPoint document and prints its text content. */
+    public void testPPTExtraction()
+            throws MimeInfoException, IOException, LiusException {
+        Parser parser = parserFor("testPPT.ppt");
+        System.out.println(parser.getStrContent());
+    }
+
+    /** Obtains a parser for the sample Word document and prints its text content. */
+    public void testWORDxtraction()
+            throws MimeInfoException, IOException, LiusException {
+        Parser parser = parserFor("testWORD.doc");
+        System.out.println(parser.getStrContent());
+    }
+
+    /** Obtains a parser for the sample Excel document and prints its contents. */
+    public void testEXCELExtraction()
+            throws MimeInfoException, IOException, LiusException {
+        printContentsInfo(parserFor("testEXCEL.xls"));
+    }
+
+    /** Obtains a parser for the sample OpenDocument text file and prints its contents. */
+    public void testOOExtraction()
+            throws MimeInfoException, IOException, LiusException {
+        printContentsInfo(parserFor("testOO2.odt"));
+    }
+
+    /**
+     * Obtains a parser for the sample HTML document, checks the value of its
+     * "title" content and prints its contents.
+     */
+    public void testHTMLExtraction()
+            throws MimeInfoException, IOException, LiusException {
+        Parser parser = parserFor("testHTML.html");
+        assertEquals("Title : Test Indexation Html",
+                parser.getContent("title").getValue());
+        printContentsInfo(parser);
+    }
+
+    /**
+     * Asks the {@link ParserFactory} for a parser for one sample document.
+     * <p>
+     * Exceptions are deliberately not caught here: a failure to obtain a
+     * parser must fail the calling test rather than being printed and
+     * ignored.
+     *
+     * @param fileName name of a file inside the sample document directory
+     * @return the parser returned by the factory, never <code>null</code>
+     * @throws MimeInfoException propagated from the parser factory
+     * @throws IOException propagated from the parser factory
+     * @throws LiusException propagated from the parser factory
+     */
+    private Parser parserFor(String fileName)
+            throws MimeInfoException, IOException, LiusException {
+        File testFile = new File(classDir.getParent() + File.separator
+                + TEST_FILES_DIR + File.separator + fileName);
+        Parser parser = ParserFactory.getParser(testFile, tc);
+        assertNotNull("No parser returned for " + testFile, parser);
+        return parser;
+    }
+
+    /**
+     * Prints the MIME type and the structured contents of a parser to
+     * standard output, followed by a separator line.
+     *
+     * @param parser the parser whose contents are printed
+     */
+    private void printContentsInfo(Parser parser) {
+        String mimeType = parser.getMimeType();
+        System.out.println("Mime : " + mimeType);
+        // The result is not used, but the call is kept in case fetching the
+        // text has side effects the following calls rely on -- TODO confirm.
+        parser.getStrContent();
+        Collection<Content> structuredContent = parser.getContents();
+        Utils.print(structuredContent);
+        System.out.println("==============");
+    }
+
+}
Propchange: incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/tika/trunk/src/test/resources/testHTML.html
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/testHTML.html?view=auto&rev=567100
==============================================================================
--- incubator/tika/trunk/src/test/resources/testHTML.html (added)
+++ incubator/tika/trunk/src/test/resources/testHTML.html Fri Aug 17 11:17:06
2007
@@ -0,0 +1,9 @@
+<html>
+ <head>
+ <title>Title : Test Indexation Html</title>
+ </head>
+ <body>
+ <h1>Test Indexation Html</h1>
+ <p>Indexation du fichier</p>
+ </body>
+</html>
\ No newline at end of file
Propchange: incubator/tika/trunk/src/test/resources/testHTML.html
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/tika/trunk/src/test/resources/testRTF.rtf
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/testRTF.rtf?view=auto&rev=567100
==============================================================================
--- incubator/tika/trunk/src/test/resources/testRTF.rtf (added)
+++ incubator/tika/trunk/src/test/resources/testRTF.rtf Fri Aug 17 11:17:06 2007
@@ -0,0 +1,17 @@
+{\rtf1\ansi\ansicpg1252\uc1\deff0\stshfdbch0\stshfloch0\stshfhich0\stshfbi0\deflang1036\deflangfe1036{\fonttbl{\f0\froman\fcharset0\fprq2{\*\panose
02020603050405020304}Times New Roman;}{\f37\froman\fcharset238\fprq2 Times New
Roman CE;}
+{\f38\froman\fcharset204\fprq2 Times New Roman
Cyr;}{\f40\froman\fcharset161\fprq2 Times New Roman
Greek;}{\f41\froman\fcharset162\fprq2 Times New Roman
Tur;}{\f42\froman\fcharset177\fprq2 Times New Roman (Hebrew);}
+{\f43\froman\fcharset178\fprq2 Times New Roman
(Arabic);}{\f44\froman\fcharset186\fprq2 Times New Roman
Baltic;}{\f45\froman\fcharset163\fprq2 Times New Roman
(Vietnamese);}}{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;
+\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;
+\red128\green128\blue128;\red192\green192\blue192;}{\stylesheet{\ql
\li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
\fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 \snext0
Normal;}{\*\cs10 \additive \ssemihidden
+Default Paragraph
Font;}{\*\ts11\tsrowd\trftsWidthB3\trpaddl108\trpaddr108\trpaddfl3\trpaddft3\trpaddfb3\trpaddfr3\trcbpat1\trcfpat1\tscellwidthfts0\tsvertalt\tsbrdrt\tsbrdrl\tsbrdrb\tsbrdrr\tsbrdrdgl\tsbrdrdgr\tsbrdrh\tsbrdrv
+\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
\fs20\lang1024\langfe1024\cgrid\langnp1024\langfenp1024 \snext11 \ssemihidden
Normal Table;}}{\*\latentstyles\lsdstimax156\lsdlockeddef0}{\*\rsidtbl
\rsid2954171\rsid10375891}
+{\*\generator Microsoft Word 11.0.6568;}{\info{\title Test d\'92indexation
Word}{\author Bibliotheque}{\operator
Bibliotheque}{\creatim\yr2006\mo5\dy18\hr12\min19}{\revtim\yr2006\mo5\dy18\hr12\min19}{\version2}{\edmins0}{\nofpages1}{\nofwords3}
+{\nofchars21}{\*\company Universite
Laval}{\nofcharsws23}{\vern24579}}\paperw11906\paperh16838\margl1417\margr1417\margt1417\margb1417
+\deftab708\widowctrl\ftnbj\aenddoc\hyphhotz425\noxlattoyen\expshrtn\noultrlspc\dntblnsbdb\nospaceforul\formshade\horzdoc\dgmargin\dghspace180\dgvspace180\dghorigin1417\dgvorigin1417\dghshow1\dgvshow1
+\jexpand\viewkind1\viewscale100\pgbrdrhead\pgbrdrfoot\splytwnine\ftnlytwnine\htmautsp\nolnhtadjtbl\useltbaln\alntblind\lytcalctblwd\lyttblrtgr\lnbrkrule\nobrkwrptbl\snaptogridincell\allowfieldendsel\wrppunct\asianbrkrule\nojkernpunct\rsidroot2954171
\fet0
+\sectd
\linex0\headery708\footery708\colsx708\endnhere\sectlinegrid360\sectdefaultcl\sftnbj
{\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang {\pntxta
.}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl3
+\pndec\pnstart1\pnindent720\pnhang {\pntxta
.}}{\*\pnseclvl4\pnlcltr\pnstart1\pnindent720\pnhang {\pntxta
)}}{\*\pnseclvl5\pndec\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta
)}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}
+{\*\pnseclvl7\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta
)}}{\*\pnseclvl8\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta
)}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta
)}}\pard\plain
+\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
\fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid2954171 Test
d\rquote indexation Word
+\par
+\par }}
\ No newline at end of file
Added: incubator/tika/trunk/src/test/resources/testTXT.txt
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/testTXT.txt?view=auto&rev=567100
==============================================================================
--- incubator/tika/trunk/src/test/resources/testTXT.txt (added)
+++ incubator/tika/trunk/src/test/resources/testTXT.txt Fri Aug 17 11:17:06 2007
@@ -0,0 +1,2 @@
+Test d'indexation de Txt
+http://www.apache.org
Propchange: incubator/tika/trunk/src/test/resources/testTXT.txt
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/tika/trunk/src/test/resources/testXML.xml
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/testXML.xml?view=auto&rev=567100
==============================================================================
--- incubator/tika/trunk/src/test/resources/testXML.xml (added)
+++ incubator/tika/trunk/src/test/resources/testXML.xml Fri Aug 17 11:17:06 2007
@@ -0,0 +1,31 @@
+<oaidc:dc xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:oaidc="http://www.openarchives.org/OAI/2.0/oai_dc/">
+
+ <dc:title>Archimède et Lius</dc:title>
+
+ <dc:creator>Rida Benjelloun</dc:creator>
+
+ <dc:subject>Java</dc:subject>
+
+ <dc:subject>XML</dc:subject>
+
+ <dc:subject>XSLT</dc:subject>
+
+ <dc:subject>JDOM</dc:subject>
+
+ <dc:subject>Indexation</dc:subject>
+
+ <dc:description>Framework d'indexation des documents XML, HTML, PDF
etc.. </dc:description>
+
+ <dc:identifier>http://www.apache.org</dc:identifier>
+
+ <dc:date>2000-12</dc:date>
+
+ <dc:type>test</dc:type>
+
+ <dc:format>application/msword</dc:format>
+
+ <dc:language>Fr</dc:language>
+
+ <dc:rights>Non restreint</dc:rights>
+
+</oaidc:dc>
\ No newline at end of file
Propchange: incubator/tika/trunk/src/test/resources/testXML.xml
------------------------------------------------------------------------------
svn:eol-style = native