Author: jukka
Date: Tue Sep 25 13:37:00 2007
New Revision: 579372
URL: http://svn.apache.org/viewvc?rev=579372&view=rev
Log:
TIKA-31 - protected Parser.parse(InputStream stream, Iterable<Content> contents)
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/src/main/java/org/apache/tika/exception/TikaException.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/utils/MSExtractor.java
incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
Modified: incubator/tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=579372&r1=579371&r2=579372&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Tue Sep 25 13:37:00 2007
@@ -46,3 +46,6 @@
21. TIKA-28 - Rename config.xml to tika-config.xml or similar (mattmann)
22. TIKA-26 - Use Map<String, Content> instead of List<Content> (jukka)
+
+23. TIKA-31 - protected Parser.parse(InputStream stream,
+ Iterable<Content> contents) (jukka & K. Bennett)
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/exception/TikaException.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/exception/TikaException.java?rev=579372&r1=579371&r2=579372&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/exception/TikaException.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/exception/TikaException.java
Tue Sep 25 13:37:00 2007
@@ -25,4 +25,8 @@
super(msg);
}
+ public TikaException(String msg, Throwable cause) {
+ super(msg, cause);
+ }
+
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java?rev=579372&r1=579371&r2=579372&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java
(original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java Tue
Sep 25 13:37:00 2007
@@ -16,16 +16,24 @@
*/
package org.apache.tika.parser;
+import java.io.IOException;
import java.io.InputStream;
+import java.util.List;
import java.util.Map;
+import org.apache.log4j.Logger;
+import org.apache.oro.text.regex.MalformedPatternException;
import org.apache.tika.config.Content;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.utils.RegexUtils;
/**
* Abstract class Parser
*/
public abstract class Parser {
+ private static final Logger logger = Logger.getLogger(Parser.class);
+
private InputStream is;
private String mimeType;
@@ -34,16 +42,14 @@
private Map<String, Content> contents;
- protected String contentStr;
+ private String contentStr;
+
+ private boolean parsed = false;
public void setInputStream(InputStream is) {
this.is = is;
}
- public InputStream getInputStream() {
- return is;
- }
-
/**
* Get document mime type
*/
@@ -89,11 +95,53 @@
* regex selection or fulltext
*/
public Map<String, Content> getContents() {
+ if (!parsed) {
+ try {
+ try {
+ contentStr = parse(is, contents.values());
+ } finally {
+ is.close();
+ }
+
+ for (Content content : contents.values()) {
+ if ("fulltext".equalsIgnoreCase(content.getTextSelect())) {
+ content.setValue(contentStr);
+ } else if ("summary".equalsIgnoreCase(content.getTextSelect())) {
+ int length = Math.min(contentStr.length(), 500);
+ String summary = contentStr.substring(0, length);
+ content.setValue(summary);
+ } else if (content.getRegexSelect() != null) {
+ String regex = content.getRegexSelect();
+ try {
+ List<String> values =
+ RegexUtils.extract(contentStr, regex);
+ if (values.size() > 0) {
+ content.setValue(values.get(0));
+ content.setValues(
+ values.toArray(new String[values.size()]));
+ }
+ } catch (MalformedPatternException e) {
+ logger.error(
+ "Invalid regular expression: " + regex, e);
+ }
+ }
+ }
+ } catch (Exception e) {
+ logger.error("Parse error: " + e.getMessage(), e);
+ contentStr = "";
+ } finally {
+ parsed = true;
+ }
+ }
return contents;
}
public void setContents(Map<String, Content> contents) {
this.contents = contents;
}
+
+ protected abstract String parse(
+ InputStream stream, Iterable<Content> contents)
+ throws IOException, TikaException;
}
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=579372&r1=579371&r2=579372&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Tue Sep 25 13:37:00 2007
@@ -16,16 +16,13 @@
*/
package org.apache.tika.parser.html;
+import java.io.IOException;
import java.io.InputStream;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
import org.apache.log4j.Logger;
-import org.apache.oro.text.regex.MalformedPatternException;
import org.apache.tika.config.Content;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.Parser;
-import org.apache.tika.utils.RegexUtils;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
@@ -40,56 +37,23 @@
static Logger logger = Logger.getRootLogger();
- private Node root = null;
-
- public Map<String, Content> getContents() {
- if (contentStr == null) {
- if (root == null)
- root = getRoot(getInputStream());
- contentStr = getTextContent(root);
- }
- Map<String, Content> ctt = super.getContents();
-
- Iterator i = ctt.values().iterator();
- while (i.hasNext()) {
- Content ct = (Content) i.next();
- if (ct.getTextSelect() != null) {
- if (ct.getTextSelect().equalsIgnoreCase("fulltext")) {
- ct.setValue(contentStr);
- } else {
- extractElementTxt((Element) root, ct);
- }
-
- }
-
- else if (ct.getRegexSelect() != null) {
- try {
- List<String> valuesLs = RegexUtils.extract(contentStr, ct
- .getRegexSelect());
- if (valuesLs.size() > 0) {
- ct.setValue(valuesLs.get(0));
- ct.setValues(valuesLs.toArray(new String[0]));
- }
- } catch (MalformedPatternException e) {
- logger.error(e.getMessage());
- }
- }
- }
-
- return ctt;
-
- }
-
- private Node getRoot(InputStream is) {
+ protected String parse(InputStream stream, Iterable<Content> contents)
+ throws IOException, TikaException {
Tidy tidy = new Tidy();
tidy.setQuiet(true);
tidy.setShowWarnings(false);
- org.w3c.dom.Document doc = tidy.parseDOM(is, null);
- return doc.getDocumentElement();
+ Node root = tidy.parseDOM(stream, null).getDocumentElement();
+ for (Content content : contents) {
+ String text = content.getTextSelect();
+ if (text != null && !text.equalsIgnoreCase("fulltext")
+ && !text.equalsIgnoreCase("summary")) {
+ extractElementTxt((Element) root, content);
+ }
+ }
+ return getTextContent(root);
}
private void extractElementTxt(Element root, Content content) {
-
NodeList children = root.getElementsByTagName(content.getTextSelect());
if (children != null) {
if (children.getLength() > 0) {
@@ -116,7 +80,6 @@
}
}
}
-
}
private String getTextContent(Node node) {
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java?rev=579372&r1=579371&r2=579372&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
Tue Sep 25 13:37:00 2007
@@ -16,62 +16,27 @@
*/
package org.apache.tika.parser.msexcel;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
+import java.io.IOException;
+import java.io.InputStream;
import org.apache.tika.config.Content;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.Parser;
-import org.apache.tika.utils.MSExtractor;
-import org.apache.tika.utils.RegexUtils;
-
-import org.apache.log4j.Logger;
-import org.apache.oro.text.regex.MalformedPatternException;
/**
* Excel parser
- *
- *
*/
public class MsExcelParser extends Parser {
- private MSExtractor extrator = new ExcelExtractor();
-
- static Logger logger = Logger.getRootLogger();
- public Map<String, Content> getContents() {
- if (contentStr == null) {
- // extrator.setContents(getParserConfig().getContents());
- try {
- contentStr = extrator.extractText(getInputStream());
- } catch (Exception e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
+ protected String parse(InputStream stream, Iterable<Content> contents)
+ throws IOException, TikaException {
+ try {
+ return new ExcelExtractor().extractText(stream);
+ } catch (IOException e) {
+ throw e;
+ } catch (Exception e) {
+ throw new TikaException("Error parsing an Excel document", e);
}
- Map<String, Content> ctt = super.getContents();
- Iterator i = ctt.values().iterator();
- while (i.hasNext()) {
- Content ct = (Content) i.next();
- if (ct.getTextSelect() != null) {
- if (ct.getTextSelect().equalsIgnoreCase("fulltext")) {
- ct.setValue(contentStr);
- }
-
- } else if (ct.getRegexSelect() != null) {
- try {
- List<String> valuesLs = RegexUtils.extract(contentStr, ct
- .getRegexSelect());
- if (valuesLs.size() > 0) {
- ct.setValue(valuesLs.get(0));
- ct.setValues(valuesLs.toArray(new String[0]));
- }
- } catch (MalformedPatternException e) {
- logger.error(e.getMessage());
- }
- }
- }
-
- return ctt;
}
}
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java?rev=579372&r1=579371&r2=579372&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
Tue Sep 25 13:37:00 2007
@@ -16,63 +16,29 @@
*/
package org.apache.tika.parser.mspowerpoint;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
+import java.io.IOException;
+import java.io.InputStream;
import org.apache.tika.config.Content;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.Parser;
-import org.apache.tika.utils.RegexUtils;
-
-import org.apache.log4j.Logger;
-import org.apache.oro.text.regex.MalformedPatternException;
/**
* Power point parser
- *
- *
*/
public class MsPowerPointParser extends Parser {
- private PPTExtractor extrator = new PPTExtractor();
-
- static Logger logger = Logger.getRootLogger();
-
- public Map<String, Content> getContents() {
- if (contentStr == null) {
- extrator.setContents(super.getContents());
- try {
- contentStr = extrator.extractText(getInputStream());
- } catch (Exception e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
+ protected String parse(InputStream stream, Iterable<Content> contents)
+ throws IOException, TikaException {
+ try {
+ PPTExtractor extrator = new PPTExtractor();
+ extrator.setContents(contents);
+ return extrator.extractText(stream);
+ } catch (IOException e) {
+ throw e;
+ } catch (Exception e) {
+ throw new TikaException("Error parsing a PowerPoint document", e);
}
- Map<String, Content> ctt = super.getContents();
- Iterator i = ctt.values().iterator();
- while (i.hasNext()) {
- Content ct = (Content) i.next();
- if (ct.getTextSelect() != null) {
- if (ct.getTextSelect().equalsIgnoreCase("fulltext")) {
- ct.setValue(contentStr);
- }
-
- } else if (ct.getRegexSelect() != null) {
- try {
- List<String> valuesLs = RegexUtils.extract(contentStr, ct
- .getRegexSelect());
- if (valuesLs.size() > 0) {
- ct.setValue(valuesLs.get(0));
- ct.setValues(valuesLs.toArray(new String[0]));
- }
- } catch (MalformedPatternException e) {
- logger.error(e.getMessage());
- }
- }
- }
-
- return ctt;
}
/*
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java?rev=579372&r1=579371&r2=579372&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
Tue Sep 25 13:37:00 2007
@@ -16,64 +16,27 @@
*/
package org.apache.tika.parser.msword;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
+import java.io.IOException;
+import java.io.InputStream;
import org.apache.tika.config.Content;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.Parser;
-import org.apache.tika.utils.MSExtractor;
-import org.apache.tika.utils.RegexUtils;
-
-import org.apache.log4j.Logger;
-import org.apache.oro.text.regex.MalformedPatternException;
/**
* Word parser
- *
- *
*/
public class MsWordParser extends Parser {
- private MSExtractor extractor = new WordExtractor();
-
- static Logger logger = Logger.getRootLogger();
-
- public Map<String, Content> getContents() {
- if (contentStr == null) {
- // extractor
- try {
- contentStr = extractor.extractText(getInputStream());
- } catch (Exception e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
+ protected String parse(InputStream stream, Iterable<Content> contents)
+ throws IOException, TikaException {
+ try {
+ return new WordExtractor().extractText(stream);
+ } catch (IOException e) {
+ throw e;
+ } catch (Exception e) {
+ throw new TikaException("Error parsing a Word document", e);
}
- Map<String, Content> ctt = super.getContents();
- Iterator i = ctt.values().iterator();
- while (i.hasNext()) {
- Content ct = (Content) i.next();
- if (ct.getTextSelect() != null) {
- if (ct.getTextSelect().equalsIgnoreCase("fulltext")) {
- ct.setValue(contentStr);
- }
-
- } else if (ct.getRegexSelect() != null) {
- try {
- List<String> valuesLs = RegexUtils.extract(contentStr, ct
- .getRegexSelect());
- if (valuesLs.size() > 0) {
- ct.setValue(valuesLs.get(0));
- ct.setValues(valuesLs.toArray(new String[0]));
- }
- } catch (MalformedPatternException e) {
- logger.error(e.getMessage());
- }
- }
- }
-
- return ctt;
-
}
}
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java?rev=579372&r1=579371&r2=579372&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
Tue Sep 25 13:37:00 2007
@@ -23,19 +23,17 @@
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
-import java.util.Iterator;
import java.util.List;
-import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.tika.config.Content;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.xml.XMLParser;
-import org.apache.tika.utils.RegexUtils;
import org.apache.log4j.Logger;
-import org.apache.oro.text.regex.MalformedPatternException;
+import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.Namespace;
@@ -43,8 +41,6 @@
/**
* OpenOffice parser
- *
- *
*/
public class OpenOfficeParser extends Parser {
static Logger logger = Logger.getRootLogger();
@@ -52,12 +48,8 @@
private final Namespace NS_DC = Namespace.getNamespace("dc",
"http://purl.org/dc/elements/1.1/");
- private XMLParser xp = new XMLParser();
-
- private org.jdom.Document xmlDoc;
-
public org.jdom.Document parse(InputStream is) {
- xmlDoc = new org.jdom.Document();
+ Document xmlDoc = new org.jdom.Document();
org.jdom.Document xmlMeta = new org.jdom.Document();
try {
List files = unzip(is);
@@ -83,34 +75,17 @@
return xmlDoc;
}
- public Map<String, Content> getContents() {
- if (xmlDoc == null)
- xmlDoc = parse(getInputStream());
- if (contentStr == null) {
- contentStr = xp.concatOccurance(xmlDoc, "//*", " ");
- }
- List<String> documentNs = xp.getAllDocumentNs(xmlDoc);
- Map<String, Content> ctt = super.getContents();
- Iterator it = ctt.values().iterator();
- while (it.hasNext()) {
- Content content = (Content) it.next();
+ protected String parse(InputStream stream, Iterable<Content> contents)
+ throws IOException, TikaException {
+ Document xmlDoc = parse(stream);
+ XMLParser xp = new XMLParser();
+ xp.getAllDocumentNs(xmlDoc);
+ for (Content content : contents) {
if (content.getXPathSelect() != null) {
xp.extractContent(xmlDoc, content);
- } else if (content.getRegexSelect() != null) {
- try {
- List<String> valuesLs = RegexUtils.extract(contentStr,
- content.getRegexSelect());
- if (valuesLs.size() > 0) {
- content.setValue(valuesLs.get(0));
- content.setValues(valuesLs.toArray(new String[0]));
- }
- } catch (MalformedPatternException e) {
- logger.error(e.getMessage());
- }
}
}
-
- return ctt;
+ return xp.concatOccurance(xmlDoc, "//*", " ");
}
public List unzip(InputStream is) {
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=579372&r1=579371&r2=579372&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
Tue Sep 25 13:37:00 2007
@@ -17,159 +17,74 @@
package org.apache.tika.parser.pdf;
import java.io.IOException;
+import java.io.InputStream;
import java.io.StringWriter;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
+import java.util.Calendar;
import org.apache.tika.config.Content;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.Parser;
-import org.apache.tika.utils.RegexUtils;
-import org.apache.log4j.Logger;
-import org.apache.oro.text.regex.MalformedPatternException;
-import org.pdfbox.exceptions.CryptographyException;
-import org.pdfbox.exceptions.InvalidPasswordException;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.util.PDFTextStripper;
/**
* PDF parser
- *
- *
*/
public class PDFParser extends Parser {
- static Logger logger = Logger.getRootLogger();
- private PDDocument pdfDocument = null;
-
- public Map<String, Content> getContents() {
- // String contents = getContent();
- if (contentStr == null) {
+ protected String parse(InputStream stream, Iterable<Content> contents)
+ throws IOException, TikaException {
+ try {
+ PDDocument pdfDocument = PDDocument.load(stream);
try {
- pdfDocument = PDDocument.load(getInputStream());
if (pdfDocument.isEncrypted()) {
pdfDocument.decrypt("");
}
- StringWriter writer = new StringWriter();
- PDFTextStripper stripper = new PDFTextStripper();
- stripper.writeText(pdfDocument, writer);
- contentStr = writer.getBuffer().toString();
- } catch (CryptographyException e) {
- logger.error(e.getMessage());
- } catch (IOException e) {
- e.printStackTrace();
- logger.error(e.getMessage());
- } catch (InvalidPasswordException e) {
- logger.error(e.getMessage());
- } finally {
- if (pdfDocument != null) {
- try {
- pdfDocument.close();
- } catch (IOException ex) {
- logger.error(ex.getMessage());
- }
- }
- }
- }
- Map<String, Content> ctt = super.getContents();
- Iterator i = ctt.values().iterator();
- while (i.hasNext()) {
- Content ct = (Content) i.next();
-
- if (ct.getTextSelect() != null) {
-
- if (ct.getTextSelect().equalsIgnoreCase("fulltext")) {
- ct.setValue(contentStr);
-
- } else {
- try {
- PDDocumentInformation metaData = pdfDocument
- .getDocumentInformation();
- if (ct.getTextSelect().equalsIgnoreCase("title")) {
- if (metaData.getTitle() != null) {
- ct.setValue(metaData.getTitle());
-
- }
- } else if (ct.getTextSelect()
- .equalsIgnoreCase("author")) {
- if (metaData.getAuthor() != null) {
- ct.setValue(metaData.getAuthor());
-
- }
- } else if (ct.getTextSelect().equalsIgnoreCase(
- "creator")) {
- if (metaData.getCreator() != null) {
- ct.setValue(metaData.getCreator());
-
- }
- } else if (ct.getTextSelect().equalsIgnoreCase(
- "keywords")) {
- if (metaData.getKeywords() != null) {
- ct.setValue(metaData.getKeywords());
-
- }
- } else if (ct.getTextSelect().equalsIgnoreCase(
- "producer")) {
- if (metaData.getProducer() != null) {
- ct.setValue(metaData.getProducer());
-
- }
- } else if (ct.getTextSelect().equalsIgnoreCase(
- "subject")) {
- if (metaData.getSubject() != null) {
- ct.setValue(metaData.getSubject());
-
- }
- } else if (ct.getTextSelect().equalsIgnoreCase(
- "trapped")) {
- if (metaData.getTrapped() != null) {
- ct.setValue(metaData.getTrapped());
-
- }
- } else if (ct.getTextSelect().equalsIgnoreCase(
- "creationDate")) {
- if (metaData.getCreationDate() != null) {
- ct.setValue(metaData.getCreationDate()
- .getTime().toString());
-
- }
- } else if (ct.getTextSelect().equalsIgnoreCase(
- "modificationDate")) {
- if (metaData.getModificationDate() != null) {
- ct.setValue(metaData.getModificationDate()
- .getTime().toString());
-
- }
- } else if (ct.getTextSelect().equalsIgnoreCase(
- "summary")) {
- int summarySize = Math
- .min(contentStr.length(), 500);
- String summary = contentStr.substring(0,
- summarySize);
- ct.setValue(summary);
+
+ PDDocumentInformation metaData =
+ pdfDocument.getDocumentInformation();
+ for (Content content : contents) {
+ String text = content.getTextSelect();
+ if ("title".equalsIgnoreCase(text)) {
+ content.setValue(metaData.getTitle());
+ } else if ("author".equalsIgnoreCase(text)) {
+ content.setValue(metaData.getAuthor());
+ } else if ("creator".equalsIgnoreCase(text)) {
+ content.setValue(metaData.getCreator());
+ } else if ("keywords".equalsIgnoreCase(text)) {
+ content.setValue(metaData.getKeywords());
+ } else if ("producer".equalsIgnoreCase(text)) {
+ content.setValue(metaData.getProducer());
+ } else if ("subject".equalsIgnoreCase(text)) {
+ content.setValue(metaData.getSubject());
+ } else if ("trapped".equalsIgnoreCase(text)) {
+ content.setValue(metaData.getTrapped());
+ } else if ("creationDate".equalsIgnoreCase(text)) {
+ Calendar calendar = metaData.getCreationDate();
+ if (calendar != null) {
+ content.setValue(calendar.getTime().toString());
+ }
+ } else if ("modificationDate".equalsIgnoreCase(text)) {
+ Calendar calendar = metaData.getModificationDate();
+ if (calendar != null) {
+ content.setValue(calendar.getTime().toString());
}
- } catch (IOException e) {
- logger.error(e.getMessage());
}
}
- } else if (ct.getRegexSelect() != null) {
- try {
- List<String> valuesLs = RegexUtils.extract(contentStr, ct
- .getRegexSelect());
- if (valuesLs.size() > 0) {
- ct.setValue(valuesLs.get(0));
- ct.setValues(valuesLs.toArray(new String[0]));
- }
- } catch (MalformedPatternException e) {
- logger.error(e.getMessage());
- }
+ StringWriter writer = new StringWriter();
+ new PDFTextStripper().writeText(pdfDocument, writer);
+ return writer.getBuffer().toString();
+ } finally {
+ pdfDocument.close();
}
+ } catch (IOException e) {
+ throw e;
+ } catch (Exception e) {
+ throw new TikaException("Error parsing a PDF document", e);
}
-
- return ctt;
}
}
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java?rev=579372&r1=579371&r2=579372&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
Tue Sep 25 13:37:00 2007
@@ -17,67 +17,31 @@
package org.apache.tika.parser.rtf;
import java.io.IOException;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
+import java.io.InputStream;
-import javax.swing.text.BadLocationException;
import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit;
import org.apache.tika.config.Content;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.Parser;
-import org.apache.tika.utils.RegexUtils;
-
-import org.apache.log4j.Logger;
-import org.apache.oro.text.regex.MalformedPatternException;
/**
* RTF parser
- *
- *
*/
public class RTFParser extends Parser {
- static Logger logger = Logger.getRootLogger();
-
- public Map<String, Content> getContents() {
- if (contentStr == null) {
- try {
- DefaultStyledDocument sd = new DefaultStyledDocument();
- RTFEditorKit kit = new RTFEditorKit();
- kit.read(getInputStream(), sd, 0);
- contentStr = sd.getText(0, sd.getLength());
- } catch (IOException e) {
- logger.error(e.getMessage());
- } catch (BadLocationException j) {
- logger.error(j.getMessage());
- }
+ protected String parse(InputStream stream, Iterable<Content> contents)
+ throws IOException, TikaException {
+ try {
+ DefaultStyledDocument sd = new DefaultStyledDocument();
+ new RTFEditorKit().read(stream, sd, 0);
+ return sd.getText(0, sd.getLength());
+ } catch (IOException e) {
+ throw e;
+ } catch (Exception e) {
+ throw new TikaException("Error parsing an RTF document", e);
}
- Map<String, Content> ctt = super.getContents();
- Iterator i = ctt.values().iterator();
- while (i.hasNext()) {
- Content ct = (Content) i.next();
- if (ct.getTextSelect() != null) {
- if (ct.getTextSelect().equalsIgnoreCase("fulltext")) {
- ct.setValue(contentStr);
- }
-
- } else if (ct.getRegexSelect() != null) {
- try {
- List<String> valuesLs = RegexUtils.extract(contentStr, ct
- .getRegexSelect());
- if (valuesLs.size() > 0) {
- ct.setValue(valuesLs.get(0));
- ct.setValues(valuesLs.toArray(new String[0]));
- }
- } catch (MalformedPatternException e) {
- logger.error(e.getMessage());
- }
- }
- }
-
- return ctt;
}
}
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=579372&r1=579371&r2=579372&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
Tue Sep 25 13:37:00 2007
@@ -17,78 +17,29 @@
package org.apache.tika.parser.txt;
import java.io.BufferedReader;
-import java.io.FileNotFoundException;
import java.io.IOException;
+import java.io.InputStream;
import java.io.InputStreamReader;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
import org.apache.tika.config.Content;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.Parser;
-import org.apache.tika.utils.RegexUtils;
-
-import org.apache.log4j.Logger;
-import org.apache.oro.text.regex.MalformedPatternException;
/**
* Text parser
- *
- *
*/
public class TXTParser extends Parser {
- static Logger logger = Logger.getRootLogger();
-
- public Map<String, Content> getContents() {
- if (contentStr == null) {
- StringBuffer sb = new StringBuffer();
- try {
- BufferedReader br = new BufferedReader(new InputStreamReader(
- getInputStream()));
- String line = null;
- while ((line = br.readLine()) != null) {
- sb.append(line);
- sb.append(" ");
- }
- } catch (FileNotFoundException ex) {
- logger.error(ex.getMessage());
- } catch (IOException ex1) {
- logger.error(ex1.getMessage());
- } finally {
- try {
- getInputStream().close();
- } catch (IOException e) {
- logger.error(e.getMessage());
- }
- }
- contentStr = sb.toString();
- }
- Map<String, Content> ctt = super.getContents();
- Iterator i = ctt.values().iterator();
- while (i.hasNext()) {
- Content ct = (Content) i.next();
- if (ct.getTextSelect() != null) {
- if (ct.getTextSelect().equalsIgnoreCase("fulltext")) {
- ct.setValue(contentStr);
- }
-
- } else if (ct.getRegexSelect() != null) {
- try {
- List<String> valuesLs = RegexUtils.extract(contentStr, ct
- .getRegexSelect());
- if (valuesLs.size() > 0) {
- ct.setValue(valuesLs.get(0));
- ct.setValues(valuesLs.toArray(new String[0]));
- }
- } catch (MalformedPatternException e) {
- logger.error(e.getMessage());
- }
- }
+ protected String parse(InputStream stream, Iterable<Content> contents)
+ throws IOException, TikaException {
+ StringBuffer sb = new StringBuffer();
+ BufferedReader br = new BufferedReader(new InputStreamReader(stream));
+ String line = null;
+ while ((line = br.readLine()) != null) {
+ sb.append(line);
+ sb.append(" ");
}
-
- return ctt;
-
+ return sb.toString();
}
}
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java?rev=579372&r1=579371&r2=579372&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
Tue Sep 25 13:37:00 2007
@@ -16,18 +16,18 @@
*/
package org.apache.tika.parser.xml;
+import java.io.IOException;
+import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
-import java.util.Map;
import org.apache.tika.config.Content;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.Parser;
-import org.apache.tika.utils.RegexUtils;
import org.apache.tika.utils.Utils;
import org.apache.log4j.Logger;
-import org.apache.oro.text.regex.MalformedPatternException;
import org.jaxen.JaxenException;
import org.jaxen.SimpleNamespaceContext;
import org.jaxen.jdom.JDOMXPath;
@@ -43,48 +43,24 @@
/**
* XML parser
- *
- *
*/
public class XMLParser extends Parser {
- static Logger logger = Logger.getRootLogger();
- private Document xmlDoc = null;
+ static Logger logger = Logger.getRootLogger();
private SimpleNamespaceContext nsc = new SimpleNamespaceContext();
- public Map<String, Content> getContents() {
- if (contentStr == null) {
- if (xmlDoc == null)
- xmlDoc = Utils.parse(getInputStream());
- contentStr = concatOccurance(xmlDoc, "//*", " ");
- }
- if (xmlDoc == null)
- xmlDoc = Utils.parse(getInputStream());
- List<String> documentNs = getAllDocumentNs(xmlDoc);
- Map<String, Content> ctt = super.getContents();
- Iterator it = ctt.values().iterator();
- if (exist(documentNs, getNamespace())) {
- while (it.hasNext()) {
- Content content = (Content) it.next();
+ protected String parse(InputStream stream, Iterable<Content> contents)
+ throws IOException, TikaException {
+ Document xmlDoc = Utils.parse(stream);
+ if (exist(getAllDocumentNs(xmlDoc), getNamespace())) {
+ for (Content content : contents) {
if (content.getXPathSelect() != null) {
extractContent(xmlDoc, content);
- } else if (content.getRegexSelect() != null) {
- try {
- List<String> valuesLs = RegexUtils.extract(contentStr,
- content.getRegexSelect());
- if (valuesLs.size() > 0) {
- content.setValue(valuesLs.get(0));
- content.setValues(valuesLs.toArray(new String[0]));
- }
- } catch (MalformedPatternException e) {
- logger.error(e.getMessage());
- }
}
}
}
-
- return ctt;
+ return concatOccurance(xmlDoc, "//*", " ");
}
public String concatOccurance(Object xmlDoc, String xpath, String concatSep) {
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/utils/MSExtractor.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/MSExtractor.java?rev=579372&r1=579371&r2=579372&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/MSExtractor.java
(original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/MSExtractor.java
Tue Sep 25 13:37:00 2007
@@ -18,7 +18,6 @@
// JDK imports
import java.io.InputStream;
-import java.util.Map;
import org.apache.tika.config.Content;
// Jakarta POI imports
@@ -42,13 +41,13 @@
private POIFSReader reader = null;
- private Map<String, Content> contents;
+ private Iterable<Content> contents;
/** Constructs a new Microsoft document extractor. */
public MSExtractor() {
}
- public void setContents(Map<String, Content> contents){
+ public void setContents(Iterable<Content> contents){
this.contents = contents;
}
@@ -94,7 +93,7 @@
try {
SummaryInformation si = (SummaryInformation) PropertySetFactory
.create(event.getStream());
- for (Content content : contents.values()) {
+ for (Content content : contents) {
if (content.getTextSelect().equalsIgnoreCase("title")) {
content.setValue(si.getTitle());
}
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java?rev=579372&r1=579371&r2=579372&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
(original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
Tue Sep 25 13:37:00 2007
@@ -261,8 +261,6 @@
}
private static String getStringContent(Parser parser) throws IOException {
- String content = parser.getStrContent();
- parser.getInputStream().close();
- return content;
+ return parser.getStrContent();
}
}
Modified: incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java?rev=579372&r1=579371&r2=579372&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
(original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java Tue Sep
25 13:37:00 2007
@@ -150,7 +150,6 @@
final String expected = "Test Indexation Html";
assertTrue("text contains '" + expected + "'",
text.contains(expected));
- parser.getInputStream().close();
}
private File getTestFile(String filename) {