svn commit: r580874 - in /incubator/tika/trunk: ./ src/main/java/org/apache/tika/parser/ src/main/java/org/apache/tika/utils/ src/test/java/org/apache/tika/

jukka Mon, 01 Oct 2007 00:22:36 -0700

Author: jukka
Date: Mon Oct  1 00:21:21 2007
New Revision: 580874

URL: http://svn.apache.org/viewvc?rev=580874&view=rev
Log:
TIKA-33 - Stateless parsers


Modified:
    incubator/tika/trunk/CHANGES.txt
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java
    incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
    incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java

Modified: incubator/tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=580874&r1=580873&r2=580874&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Mon Oct  1 00:21:21 2007
@@ -50,6 +50,7 @@
 23. TIKA-31 - protected Parser.parse(InputStream stream,
               Iterable<Content> contents) (jukka & K. Bennett)
               
-24. TIKA-36 A convenience method for getting a document's content's text 
+24. TIKA-36 - A convenience method for getting a document's content's text 
               would be helpful (K. Bennett & mattmann)
   
+25. TIKA-33 - Stateless parsers (jukka)

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java?rev=580874&r1=580873&r2=580874&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java Mon Oct  1 00:21:21 2007
@@ -34,22 +34,10 @@
 
     private static final Logger logger = Logger.getLogger(Parser.class);
 
-    private InputStream is;
-
     private String mimeType;
 
     private String namespace;
 
-    private Map<String, Content> contents;
-
-    private String contentStr;
-
-    private boolean parsed = false; 
-
-    public void setInputStream(InputStream is) {
-        this.is = is;
-    }
-
     /**
      * Get document mime type
      */
@@ -72,87 +60,39 @@
         this.namespace = namespace;
     }
 
-    /**
-     * Get the string content of the document
-     */
-    public String getStrContent() {
-        getContents();
-        return contentStr;
-    }
-
-    /**
-     * Get a content object, this object is configured from the TikaConfig Xml.
-     * It could be a document metadata, XPath selection, regex selection or
-     * fulltext
-     */
-    public Content getContent(String name) {
-        return getContents().get(name);
-    }
-
-    /**
-     * Returns the text associated with the Content named 'name',
-     * or null if such a Content does not exist.
-     *
-     * @param name name of Content the caller wants the value of
-     * @return the found Content's value, or null if not found
-     */
-    public String getContentValue(String name) {
-        Content content = getContent(name);
-
-        return content != null
-                ? content.getValue()
-                : null;
-    }
-
-    /**
-     * Get a List of contents objects, this objects are configured from the
-     * TikaConfig Xml file. It could be a document metadata, XPath selection,
-     * regex selection or fulltext
-     */
-    public Map<String, Content> getContents() {
-        if (!parsed) {
-            try {
-                try {
-                    contentStr = parse(is, contents.values());
-                } finally {
-                    is.close();
-                }
-
-                for (Content content : contents.values()) {
-                    if ("fulltext".equalsIgnoreCase(content.getTextSelect())) {
-                        content.setValue(contentStr);
-                    } else if ("summary".equalsIgnoreCase(content.getTextSelect())) {
-                        int length = Math.min(contentStr.length(), 500);
-                        String summary = contentStr.substring(0, length);
-                        content.setValue(summary);
-                    } else if (content.getRegexSelect() != null) {
-                        String regex = content.getRegexSelect();
-                        try {
-                            List<String> values =
-                                RegexUtils.extract(contentStr, regex);
-                            if (values.size() > 0) {
-                                content.setValue(values.get(0));
-                                content.setValues(
-                                        values.toArray(new String[values.size()]));
-                            }
-                        } catch (MalformedPatternException e) {
-                            logger.error(
-                                    "Invalid regular expression: " + regex, e);
+    public String getContents(InputStream stream, Map<String, Content> contents) {
+        try {
+            String contentStr = parse(stream, contents.values());
+
+            for (Content content : contents.values()) {
+                if ("fulltext".equalsIgnoreCase(content.getTextSelect())) {
+                    content.setValue(contentStr);
+                } else if ("summary".equalsIgnoreCase(content.getTextSelect())) {
+                    int length = Math.min(contentStr.length(), 500);
+                    String summary = contentStr.substring(0, length);
+                    content.setValue(summary);
+                } else if (content.getRegexSelect() != null) {
+                    String regex = content.getRegexSelect();
+                    try {
+                        List<String> values =
+                            RegexUtils.extract(contentStr, regex);
+                        if (values.size() > 0) {
+                            content.setValue(values.get(0));
+                            content.setValues(
+                                    values.toArray(new String[values.size()]));
                         }
+                    } catch (MalformedPatternException e) {
+                        logger.error(
+                                "Invalid regular expression: " + regex, e);
                     }
                 }
-            } catch (Exception e) {
-                logger.error("Parse error: " + e.getMessage(), e);
-                contentStr = "";
-            } finally {
-                parsed = true;
             }
-        }
-        return contents;
-    }
 
-    public void setContents(Map<String, Content> contents) {
-        this.contents = contents;
+            return contentStr;
+        } catch (Exception e) {
+            logger.error("Parse error: " + e.getMessage(), e);
+            return "";
+        }
     }
 
     protected abstract String parse(

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java?rev=580874&r1=580873&r2=580874&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java Mon Oct  1 00:21:21 2007
@@ -16,11 +16,8 @@
  */
 package org.apache.tika.parser;
 
-import java.io.InputStream;
-
 import org.apache.commons.lang.StringUtils;
 import org.apache.log4j.Logger;
-import org.apache.tika.config.TikaConfig;
 import org.apache.tika.config.ParserConfig;
 import org.apache.tika.exception.TikaException;
 
@@ -33,80 +30,19 @@
 
     static Logger logger = Logger.getRootLogger();
 
-
-
-    public static Parser getParser(
-            InputStream inputStream, String mimeType, TikaConfig tc)
-            throws TikaException {
-
-        // Verify that all passed parameters are (probably) valid.
-
-        if (StringUtils.isBlank(mimeType)) {
-            throw new TikaException("Mime type not specified.");
-        }
-
-        if (inputStream == null) {
-            throw new TikaException("Input stream is null.");
-        }
-
-        if (tc == null) {
-            throw new TikaException("Configuration object is null.");
-        }
-
-        ParserConfig pc = getParserConfig(mimeType, tc);
-        if (pc == null) {
-            throw new TikaException(
-                    "Could not find parser config for mime type "
-                    + mimeType + ".");
-        }
-
-        String className = pc.getParserClass();
-        Parser parser = null;
-
+    public static Parser getParser(ParserConfig config) throws TikaException {
+        String className = config.getParserClass();
         if (StringUtils.isBlank(className)) {
             throw new TikaException(
                     "Parser class name missing from ParserConfig.");
         }
-
         try {
-            logger.info("Loading parser class = " + className
-                    + " MimeType = " + mimeType);
-
-            Class<?> parserClass = Class.forName(className);
-            parser = (Parser) parserClass.newInstance();
-            parser.setMimeType(mimeType);
-            parser.setContents(pc.getContents());
-            parser.setInputStream(inputStream);
-
-        } catch (ClassNotFoundException e) {
-            logger.error(e.getMessage());
-            throw new TikaException(e.getMessage());
-        } catch (InstantiationException e) {
-            logger.error(e.getMessage());
-            throw new TikaException(e.getMessage());
-        } catch (IllegalAccessException e) {
-            logger.error(e.getMessage());
+            logger.info("Loading parser class = " + className);
+            return (Parser) Class.forName(className).newInstance();
+        } catch (Exception e) {
+            logger.error("Unable to instantiate parser: " + className, e);
             throw new TikaException(e.getMessage());
         }
-
-        return parser;
     }
 
-
-    private static ParserConfig getParserConfig(String mimeType, TikaConfig tc)
-            throws TikaException {
-
-        ParserConfig pc = tc.getParserConfig(mimeType);
-
-        if (pc == null) {
-            String message =
-                    "Could not find parser configuration for mime type "
-                    + mimeType + ".";
-
-            logger.error(message);
-            throw new TikaException(message);
-        }
-
-        return pc;
-    }
 }

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java?rev=580874&r1=580873&r2=580874&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java Mon Oct  1 00:21:21 2007
@@ -24,6 +24,7 @@
 import java.net.URL;
 
 // TIKA imports
+import org.apache.tika.config.ParserConfig;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.TikaMimeKeys;
@@ -41,54 +42,18 @@
      * receive input from a stream opened from the specified URL. NB: Close the
      * input stream when it is no longer needed!
      * 
-     * @param inputStream
-     *            stream containing document data to parse
-     * @param config
-     * @param mimeType
-     *            the document's MIME type
-     * @return a parser appropriate to this MIME type and ready to read input
-     *         from the specified document
-     * @throws TikaException
-     * @throws IOException
-     */
-    public static Parser getParser(InputStream inputStream, TikaConfig config,
-            String mimeType) throws TikaException, IOException {
-
-        if (inputStream == null) {
-            throw new TikaException("Document input stream not provided.");
-        }
-
-        return ParserFactory.getParser(inputStream, mimeType, config);
-    }
-
-    // Note that we cannot provide a method that takes an InputStream
-    // but not a MIME type, since we will not have a resource
-    // name from which to derive it.
-
-    /**
-     * Returns a parser that can handle the specified MIME type, and is set to
-     * receive input from a stream opened from the specified URL. NB: Close the
-     * input stream when it is no longer needed!
-     * 
-     * @param documentUrl
-     *            URL pointing to the document to parse
      * @param config
      * @param mimeType
      *            the document's MIME type
-     * @return a parser appropriate to this MIME type and ready to read input
-     *         from the specified document
+     * @return a parser appropriate to this MIME type
      * @throws TikaException
-     * @throws IOException
      */
-    public static Parser getParser(URL documentUrl, TikaConfig config,
-            String mimeType) throws TikaException, IOException {
-
-        if (documentUrl == null) {
-            throw new TikaException("Document URL not provided.");
-        }
-
-        return ParserFactory.getParser(documentUrl.openStream(), mimeType,
-                config);
+    public static Parser getParser(TikaConfig config, String mimeType)
+            throws TikaException {
+        ParserConfig pc  = config.getParserConfig(mimeType);
+        Parser parser = ParserFactory.getParser(pc);
+        parser.setMimeType(mimeType);
+        return parser;
     }
 
     /**
@@ -103,48 +68,12 @@
      * @return a parser appropriate to this MIME type and ready to read input
      *         from the specified document
      * @throws TikaException
-     * @throws IOException
      */
     public static Parser getParser(URL documentUrl, TikaConfig config)
-            throws TikaException, IOException {
-
-        String mimetype = config.getMimeRepository().getMimeType(documentUrl)
-                .getName();
-        return getParser(documentUrl, config, mimetype);
-    }
-
-    /**
-     * Returns a parser that can handle the specified MIME type, and is set to
-     * receive input from a stream opened from the specified URL. NB: Close the
-     * input stream when it is no longer needed!
-     * 
-     * @param documentFile
-     *            File object pointing to the document to parse
-     * @param config
-     * @param mimeType
-     *            the document's MIME type
-     * @return a parser appropriate to this MIME type and ready to read input
-     *         from the specified document
-     * @throws TikaException
-     * @throws IOException
-     */
-    public static Parser getParser(File documentFile, TikaConfig config,
-            String mimeType) throws TikaException, IOException {
-
-        if (documentFile == null) {
-            throw new TikaException("Document file not provided.");
-        }
-
-        if (!documentFile.canRead()) {
-            throw new TikaException(
-                    "Document file does not exist or is not readable.");
-        }
-
-        FileInputStream inputStream = new FileInputStream(documentFile);
-        // TODO: Do we want to wrap a BufferedInputStream, or does the
-        // file's buffering suffice?
-
-        return ParserFactory.getParser(inputStream, mimeType, config);
+            throws TikaException {
+        String mimetype =
+            config.getMimeRepository().getMimeType(documentUrl).getName();
+        return getParser(config, mimetype);
     }
 
     /**
@@ -158,14 +87,12 @@
      * @return a parser appropriate to this MIME type and ready to read input
      *         from the specified document
      * @throws TikaException
-     * @throws IOException
      */
     public static Parser getParser(File documentFile, TikaConfig config)
-            throws TikaException, IOException {
-
-        String mimetype = config.getMimeRepository().getMimeType(documentFile)
-                .getName();
-        return getParser(documentFile, config, mimetype);
+            throws TikaException {
+        String mimetype =
+            config.getMimeRepository().getMimeType(documentFile).getName();
+        return getParser(config, mimetype);
     }
 
     /**
@@ -180,12 +107,13 @@
      * @throws TikaException
      * @throws IOException
      */
-    public static String getStringContent(InputStream inputStream,
-            TikaConfig config, String mimeType) throws TikaException,
-            IOException {
-
-        Parser parser = getParser(inputStream, config, mimeType);
-        return getStringContent(parser);
+    public static String getStringContent(
+            InputStream inputStream, TikaConfig config, String mimeType)
+            throws TikaException, IOException {
+        ParserConfig pc  = config.getParserConfig(mimeType);
+        Parser parser = ParserFactory.getParser(pc);
+        parser.setMimeType(mimeType);
+        return parser.getContents(inputStream, pc.getContents());
     }
 
     /**
@@ -200,9 +128,9 @@
      */
     public static String getStringContent(URL documentUrl, TikaConfig config)
             throws TikaException, IOException {
-
-        Parser parser = getParser(documentUrl, config);
-        return getStringContent(parser);
+        String mime =
+                config.getMimeRepository().getMimeType(documentUrl).getName();
+        return getStringContent(documentUrl, config, mime);
     }
 
     /**
@@ -217,11 +145,15 @@
      * @throws TikaException
      * @throws IOException
      */
-    public static String getStringContent(URL documentUrl, TikaConfig config,
-            String mimeType) throws TikaException, IOException {
-
-        Parser parser = getParser(documentUrl, config, mimeType);
-        return getStringContent(parser);
+    public static String getStringContent(
+            URL documentUrl, TikaConfig config, String mimeType)
+            throws TikaException, IOException {
+        InputStream stream = documentUrl.openStream();
+        try {
+            return getStringContent(stream, config, mimeType);
+        } finally {
+            stream.close();
+        }
     }
 
     /**
@@ -236,11 +168,15 @@
      * @throws TikaException
      * @throws IOException
      */
-    public static String getStringContent(File documentFile, TikaConfig config,
-            String mimeType) throws TikaException, IOException {
-
-        Parser parser = getParser(documentFile, config, mimeType);
-        return getStringContent(parser);
+    public static String getStringContent(
+            File documentFile, TikaConfig config, String mimeType)
+            throws TikaException, IOException {
+        InputStream stream = new FileInputStream(documentFile);
+        try {
+            return getStringContent(stream, config, mimeType);
+        } finally {
+            stream.close();
+        }
     }
 
     /**
@@ -255,12 +191,9 @@
      */
     public static String getStringContent(File documentFile, TikaConfig config)
             throws TikaException, IOException {
-
-        Parser parser = getParser(documentFile, config);
-        return getStringContent(parser);
+        String mime =
+            config.getMimeRepository().getMimeType(documentFile).getName();
+        return getStringContent(documentFile, config, mime);
     }
 
-    private static String getStringContent(Parser parser) throws IOException {
-        return parser.getStrContent();
-    }
 }

Modified: incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java?rev=580874&r1=580873&r2=580874&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java Mon Oct  1 00:21:21 2007
@@ -17,12 +17,17 @@
 package org.apache.tika;
 
 import java.io.File;
+import java.io.FileInputStream;
 import java.io.IOException;
+import java.io.InputStream;
+import java.util.Map;
 
 import org.apache.tika.config.Content;
+import org.apache.tika.config.ParserConfig;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.log.TikaLogger;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserFactory;
 import org.apache.tika.utils.ParseUtils;
 import org.apache.tika.utils.Utils;
 import org.jdom.JDOMException;
@@ -69,12 +74,7 @@
         File file = getTestFile("testPDF.pdf");
         String s1 = ParseUtils.getStringContent(file, tc);
         String s2 = ParseUtils.getStringContent(file, tc, "application/pdf");
-
-        Parser parser = ParseUtils.getParser(file, tc);
-        String s3 = parser.getStrContent();
-
         assertEquals(s1, s2);
-        assertEquals(s1, s3);
     }
 
     public void testTXTExtraction() throws Exception {
@@ -135,25 +135,33 @@
         String s2 = ParseUtils.getStringContent(file, tc, "text/html");
         assertEquals(s1, s2);
 
-        Parser parser = ParseUtils.getParser(file, tc);
+        ParserConfig config = tc.getParserConfig("text/html");
+        Parser parser = ParserFactory.getParser(config);
         assertNotNull(parser);
         assertEquals("org.apache.tika.parser.html.HtmlParser", parser.getClass().getName());
+        parser.setMimeType("text/html");
 
-        
-        Content content = parser.getContent("title");
-        assertNotNull(content);
-        assertEquals("Title : Test Indexation Html", content.getValue());
+        Map<String, Content> contents = config.getContents();
+        assertNotNull(contents);
+        InputStream stream = new FileInputStream(file);
+        try {
+            parser.getContents(stream, contents);
+        } finally {
+            stream.close();
+        }
+        assertEquals(
+                "Title : Test Indexation Html",
+                contents.get("title").getValue());
 
         assertEquals("text/html", parser.getMimeType());
 
-        final String text = Utils.toString(parser.getContents());
+        final String text = Utils.toString(contents);
         final String expected = "Test Indexation Html";
-        assertTrue("text contains '" + expected + "'",
-                text.contains(expected));
+        assertTrue("text contains '" + expected + "'", text.contains(expected));
     }
 
     private File getTestFile(String filename) {
-      return new File(testFilesBaseDir, filename);
+        return new File(testFilesBaseDir, filename);
     }
 
 }

svn commit: r580874 - in /incubator/tika/trunk: ./ src/main/java/org/apache/tika/parser/ src/main/java/org/apache/tika/utils/ src/test/java/org/apache/tika/

Reply via email to