sr...

ridabenjelloun Mon, 01 Oct 2007 09:40:37 -0700

Author: ridabenjelloun
Date: Mon Oct  1 09:40:06 2007
New Revision: 581010

URL: http://svn.apache.org/viewvc?rev=581010&view=rev
Log:
TIKA-35 Extract MsOffice properties. I have implemented a method in the Utils class 
that allows copying an InputStream in memory.


TIKA-39 Excel parsing improvements. Patch by Sami Siren.

Modified:
    incubator/tika/trunk/CHANGES.txt
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/ExcelExtractor.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/utils/MSExtractor.java
    incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java
    incubator/tika/trunk/src/main/resources/tika-config.xml
    incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
    incubator/tika/trunk/src/test/resources/test-documents/testEXCEL.xls

Modified: incubator/tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=581010&r1=581009&r2=581010&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Mon Oct  1 09:40:06 2007
@@ -55,4 +55,8 @@
   
 25. TIKA-33 - Stateless parsers (jukka)
 
-26. TIKA-38 - TXTParser adds a space to the content it reads from a file
+26. TIKA-38 - TXTParser adds a space to the content it reads from a file (K. 
Bennett, Rida)
+
+27. TIKA-35 - Extract MsOffice properties (Rida)
+
+28. Tika-39 - Excel parsing improvements (Sami, Rida)
\ No newline at end of file

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/ExcelExtractor.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/ExcelExtractor.java?rev=581010&r1=581009&r2=581010&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/ExcelExtractor.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/ExcelExtractor.java
 Mon Oct  1 09:40:06 2007
@@ -39,10 +39,10 @@
   
   public String extractText(InputStream input) throws Exception {
     
-    String resultText = "";
+    StringBuilder resultText = new StringBuilder();
     HSSFWorkbook wb = new HSSFWorkbook(input);
     if (wb == null) {
-      return resultText;
+      return resultText.toString();
     }
     
     HSSFSheet sheet;
@@ -72,20 +72,19 @@
               } else
              */
             if (cell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
-              resultText += cell.getStringCellValue() + " ";
+              
resultText.append(cell.getRichStringCellValue().getString()).append(" ");
             } else if (cell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
               Double d = new Double(cell.getNumericCellValue());
-              resultText += d.toString() + " ";
-            }
-            /* else if(cell.getCellType() == HSSFCell.CELL_TYPE_FORMULA){
-                 resultText += cell.getCellFormula() + " ";
-               } 
-             */
+              resultText.append(d).append(" ");
+            } else if(cell.getCellType() == HSSFCell.CELL_TYPE_FORMULA){
+              resultText.append(cell.getNumericCellValue()).append(" ");
+            } 
+            
           }
         }
       }
     }
-    return resultText;
+    return resultText.toString();
   }
   
 }

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java?rev=581010&r1=581009&r2=581010&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
 Mon Oct  1 09:40:06 2007
@@ -22,21 +22,27 @@
 import org.apache.tika.config.Content;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.utils.MSExtractor;
+import org.apache.tika.utils.Utils;
 
 /**
  * Excel parser
  */
 public class MsExcelParser extends Parser {
 
-    protected String parse(InputStream stream, Iterable<Content> contents)
-            throws IOException, TikaException {
-        try {
-            return new ExcelExtractor().extractText(stream);
-        } catch (IOException e) {
-            throw e;
-        } catch (Exception e) {
-            throw new TikaException("Error parsing an Excel document", e);
-        }
-    }
+       protected String parse(InputStream stream, Iterable<Content> contents)
+                       throws IOException, TikaException {
+               try {
+                       MSExtractor extractor = new ExcelExtractor();
+                       extractor.setContents(contents);
+                       InputStream[] isa = Utils.copyInputStream(stream, 2);
+                       extractor.extractProperties(isa[0]);
+                       return extractor.extractText(isa[1]);
+               } catch (IOException e) {
+                       throw e;
+               } catch (Exception e) {
+                       throw new TikaException("Error parsing an Excel 
document", e);
+               }
+       }
 
 }

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java?rev=581010&r1=581009&r2=581010&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
 Mon Oct  1 09:40:06 2007
@@ -22,31 +22,35 @@
 import org.apache.tika.config.Content;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.utils.MSExtractor;
+import org.apache.tika.utils.Utils;
 
 /**
  * Power point parser
  */
 public class MsPowerPointParser extends Parser {
 
-    protected String parse(InputStream stream, Iterable<Content> contents)
-            throws IOException, TikaException {
-        try {
-            PPTExtractor extrator = new PPTExtractor();
-            extrator.setContents(contents);
-            return extrator.extractText(stream);
-        } catch (IOException e) {
-            throw e;
-        } catch (Exception e) {
-            throw new TikaException("Error parsing a PowerPoint document", e);
-        }
-    }
+       protected String parse(InputStream stream, Iterable<Content> contents)
+                       throws IOException, TikaException {
+               try {
+                       MSExtractor extractor = new PPTExtractor();
+                       extractor.setContents(contents);
+                       InputStream[] isa = Utils.copyInputStream(stream, 2);
+                       extractor.extractProperties(isa[0]);
+                       return extractor.extractText(isa[1]);
+               } catch (IOException e) {
+                       throw e;
+               } catch (Exception e) {
+                       throw new TikaException("Error parsing a PowerPoint 
document", e);
+               }
+       }
 
-    /*
-     * public List<Content> getContents() {
-     * extrator.setContents(getParserConfig().getContents()); try {
-     * extrator.extract(getInputStream()); } catch (Exception e) { // TODO
-     * Auto-generated catch block e.printStackTrace(); } return
-     * getParserConfig().getContents(); }
-     */
+       /*
+        * public List<Content> getContents() {
+        * extrator.setContents(getParserConfig().getContents()); try {
+        * extrator.extract(getInputStream()); } catch (Exception e) { // TODO
+        * Auto-generated catch block e.printStackTrace(); } return
+        * getParserConfig().getContents(); }
+        */
 
 }

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java?rev=581010&r1=581009&r2=581010&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
 Mon Oct  1 09:40:06 2007
@@ -22,21 +22,27 @@
 import org.apache.tika.config.Content;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.utils.MSExtractor;
+import org.apache.tika.utils.Utils;
 
 /**
  * Word parser
  */
 public class MsWordParser extends Parser {
 
-    protected String parse(InputStream stream, Iterable<Content> contents)
-            throws IOException, TikaException {
-        try {
-            return new WordExtractor().extractText(stream);
-        } catch (IOException e) {
-            throw e;
-        } catch (Exception e) {
-            throw new TikaException("Error parsing a Word document", e);
-        }
-    }
+       protected String parse(InputStream stream, Iterable<Content> contents)
+                       throws IOException, TikaException {
+               try {
+                       MSExtractor extractor = new WordExtractor();
+                       extractor.setContents(contents);
+                       InputStream[] isa = Utils.copyInputStream(stream, 2);
+                       extractor.extractProperties(isa[0]);
+                       return extractor.extractText(isa[1]);
+               } catch (IOException e) {
+                       throw e;
+               } catch (Exception e) {
+                       throw new TikaException("Error parsing a Word 
document", e);
+               }
+       }
 
 }

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/utils/MSExtractor.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/MSExtractor.java?rev=581010&r1=581009&r2=581010&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/MSExtractor.java 
(original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/MSExtractor.java 
Mon Oct  1 09:40:06 2007
@@ -19,8 +19,7 @@
 // JDK imports
 import java.io.InputStream;
 
-import org.apache.tika.config.Content;
-// Jakarta POI imports
+import org.apache.tika.config.Content; // Jakarta POI imports
 import org.apache.log4j.Logger;
 import org.apache.poi.hpsf.PropertySetFactory;
 import org.apache.poi.hpsf.SummaryInformation;
@@ -35,125 +34,137 @@
  */
 public abstract class MSExtractor {
 
-    static Logger LOG = Logger.getRootLogger();
+       static Logger LOG = Logger.getRootLogger();
 
-    private String text = null;
+       private String text = null;
 
-    private POIFSReader reader = null;
-    
-    private Iterable<Content> contents;
-
-    /** Constructs a new Microsoft document extractor. */
-    public MSExtractor() {        
-    }
-    
-    public void setContents(Iterable<Content> contents){
-        this.contents = contents;
-    }
-
-    /**
-     * Extracts properties and text from an MS Document input stream
-     */
-    public void extract(InputStream input) throws Exception {
-        // First, extract properties
-        this.reader = new POIFSReader();
-        
-        this.reader.registerListener(new PropertiesReaderListener(),
-                SummaryInformation.DEFAULT_STREAM_NAME);
-        //input.reset();
-        if (input.available() > 0) {
-            reader.read(input);
-        }
-        //input.reset();
-        this.text = extractText(input);
-    }
-
-    /**
-     * Extracts the text content from a Microsoft document input stream.
-     */
-    public abstract String extractText(InputStream input) throws Exception;
-
-    /**
-     * Get the content text of the Microsoft document.
-     * 
-     * @return the content text of the document
-     */
-    protected String getText() {
-        return this.text;
-    }
-
-    private class PropertiesReaderListener implements POIFSReaderListener {
-
-        public void processPOIFSReaderEvent(POIFSReaderEvent event) {
-            if (!event.getName().startsWith(
-                    SummaryInformation.DEFAULT_STREAM_NAME)) {
-                return;
-            }
-
-            try {
-                SummaryInformation si = (SummaryInformation) PropertySetFactory
-                        .create(event.getStream());
-                for (Content content : contents) {
-                    if (content.getTextSelect().equalsIgnoreCase("title")) {
-                        content.setValue(si.getTitle());
-                    }
-                    if (content.getTextSelect().equalsIgnoreCase("author")) {
-                        content.setValue(si.getAuthor());
-                    }
-                    else if 
(content.getTextSelect().equalsIgnoreCase("keywords")) {
-                        content.setValue(si.getKeywords());
-                    }
-                    else if 
(content.getTextSelect().equalsIgnoreCase("subject")) {
-                        content.setValue(si.getSubject());    
-                    }
-                    else if 
(content.getTextSelect().equalsIgnoreCase("lastauthor")) {
-                        content.setValue(si.getLastAuthor());    
-                    }
-                    else if 
(content.getTextSelect().equalsIgnoreCase("comments")) {
-                        content.setValue(si.getComments());    
-                    }
-                    else if 
(content.getTextSelect().equalsIgnoreCase("template")) {
-                        content.setValue(si.getTemplate());    
-                    }
-                    else if 
(content.getTextSelect().equalsIgnoreCase("applicationname")) {
-                        content.setValue(si.getApplicationName());
-                    }
-                    else if 
(content.getTextSelect().equalsIgnoreCase("revnumber")) {
-                        content.setValue(si.getRevNumber());
-                    }
-                    else if 
(content.getTextSelect().equalsIgnoreCase("creationdate")) {
-                        content.setValue(si.getCreateDateTime().toString());
-                    }
-                    else if (content.getTextSelect().equalsIgnoreCase("")) {
-                        //content.setValue(si.getCharCount());
-                    }
-                }
-
-            } catch (Exception ex) {
-            }
-
-        }
-
-    }
-
-}
-
-/*
- * setProperty(DublinCore.TITLE, si.getTitle());
- * setProperty(Office.APPLICATION_NAME, si.getApplicationName());
- * setProperty(Office.AUTHOR, si.getAuthor());
- * setProperty(Office.CHARACTER_COUNT, si.getCharCount());
- * setProperty(Office.COMMENTS, si.getComments()); setProperty(DublinCore.DATE,
- * si.getCreateDateTime()); // setProperty(Office.EDIT_TIME, si.getEditTime());
- * setProperty(HttpHeaders.LAST_MODIFIED, si.getLastSaveDateTime());
- * setProperty(Office.KEYWORDS, si.getKeywords());
- * setProperty(Office.LAST_AUTHOR, si.getLastAuthor());
- * setProperty(Office.LAST_PRINTED, si.getLastPrinted());
- * setProperty(Office.LAST_SAVED, si.getLastSaveDateTime());
- * setProperty(Office.PAGE_COUNT, si.getPageCount());
- * setProperty(Office.REVISION_NUMBER, si.getRevNumber());
- * setProperty(DublinCore.RIGHTS, si.getSecurity());
- * setProperty(DublinCore.SUBJECT, si.getSubject());
- * setProperty(Office.TEMPLATE, si.getTemplate());
- * setProperty(Office.WORD_COUNT, si.getWordCount());
- */
+       private POIFSReader reader = null;
+
+       private Iterable<Content> contents;
+
+       /** Constructs a new Microsoft document extractor. */
+       public MSExtractor() {
+       }
+
+       public void setContents(Iterable<Content> contents) {
+               this.contents = contents;
+       }
+
+       /**
+        * Extracts properties and text from an MS Document input stream
+        */
+       public void extractProperties(InputStream input) throws Exception {
+               // First, extract properties
+               this.reader = new POIFSReader();
+
+               this.reader.registerListener(new PropertiesReaderListener(),
+                               SummaryInformation.DEFAULT_STREAM_NAME);
+               // input.reset();
+               if (input.available() > 0) {
+                       reader.read(input);
+               }
+               // input.reset();
+               // this.text = extractText(input);
+       }
+
+       /**
+        * Extracts the text content from a Microsoft document input stream.
+        */
+       public abstract String extractText(InputStream input) throws Exception;
+
+       /**
+        * Get the content text of the Microsoft document.
+        * 
+        * @return the content text of the document
+        */
+       protected String getText() {
+               return this.text;
+       }
+
+       private class PropertiesReaderListener implements POIFSReaderListener {
+
+               public void processPOIFSReaderEvent(POIFSReaderEvent event) {
+                       if (!event.getName().startsWith(
+                                       
SummaryInformation.DEFAULT_STREAM_NAME)) {
+                               return;
+                       }
+
+                       try {
+                               SummaryInformation si = (SummaryInformation) 
PropertySetFactory
+                                               .create(event.getStream());
+                               for (Content content : contents) {
+                                       if 
(content.getTextSelect().equalsIgnoreCase("title")) {
+                                               if (si.getTitle() != null)
+                                                       
content.setValue(si.getTitle());
+                                       } else if 
(content.getTextSelect().equalsIgnoreCase(
+                                                       "author")) {
+                                               if (si.getAuthor() != null)
+                                                       
content.setValue(si.getAuthor());
+                                       } else if 
(content.getTextSelect().equalsIgnoreCase(
+                                                       "keywords")) {
+                                               if (si.getKeywords() != null)
+                                                       
content.setValue(si.getKeywords());
+                                       } else if 
(content.getTextSelect().equalsIgnoreCase(
+                                                       "subject")) {
+                                               if (si.getSubject() != null)
+                                                       
content.setValue(si.getSubject());
+                                       } else if 
(content.getTextSelect().equalsIgnoreCase(
+                                                       "lastauthor")) {
+                                               if (si.getLastAuthor() != null)
+                                                       
content.setValue(si.getLastAuthor());
+                                       } else if 
(content.getTextSelect().equalsIgnoreCase(
+                                                       "comments")) {
+                                               if (si.getComments() != null)
+                                                       
content.setValue(si.getComments());
+                                       } else if 
(content.getTextSelect().equalsIgnoreCase(
+                                                       "template")) {
+                                               if (si.getTemplate() != null)
+                                                       
content.setValue(si.getTemplate());
+                                       } else if 
(content.getTextSelect().equalsIgnoreCase(
+                                                       "applicationname")) {
+                                               if (si.getApplicationName() != 
null)
+                                                       
content.setValue(si.getApplicationName());
+                                       } else if 
(content.getTextSelect().equalsIgnoreCase(
+                                                       "revnumber")) {
+                                               if (si.getRevNumber() != null)
+                                                       
content.setValue(si.getRevNumber());
+                                       } else if 
(content.getTextSelect().equalsIgnoreCase(
+                                                       "creationdate")) {
+                                               if (si.getCreateDateTime() != 
null)
+                                                       
content.setValue(si.getCreateDateTime().toString());
+                                       } else if 
(content.getTextSelect().equalsIgnoreCase(
+                                                       "charcount")) {
+                                               if (si.getCharCount() > 0)
+                                                       content.setValue("" + 
si.getCharCount());
+                                       } else if 
(content.getTextSelect().equals("edittime")) {
+                                               if (si.getEditTime() > 0)
+                                                       content.setValue("" + 
si.getEditTime());
+                                       } else if 
(content.getTextSelect().equals(
+                                                       "lastsavedatetime")) {
+                                               if (si.getLastSaveDateTime() != 
null)
+                                                       
content.setValue(si.getLastSaveDateTime()
+                                                                       
.toString());
+                                       } else if 
(content.getTextSelect().equals("pagecount")) {
+                                               if (si.getPageCount() > 0)
+                                                       content.setValue("" + 
si.getPageCount());
+                                       } else if 
(content.getTextSelect().equals("security")) {
+                                               if (si.getSecurity() > 0)
+                                                       content.setValue("" + 
si.getSecurity());
+                                       } else if 
(content.getTextSelect().equals("wordcount")) {
+                                               if (si.getWordCount() > 0)
+                                                       content.setValue("" + 
si.getWordCount());
+                                       } else if 
(content.getTextSelect().equals("lastprinted")) {
+                                               if (si.getLastPrinted() != null)
+                                                       
content.setValue(si.getLastPrinted().toString());
+                                       }
+
+                               }
+
+                       } catch (Exception ex) {
+                       }
+
+               }
+
+       }
+
+}
\ No newline at end of file

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java?rev=581010&r1=581009&r2=581010&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java 
(original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java Mon Oct 
 1 09:40:06 2007
@@ -52,110 +52,154 @@
 
 public class Utils {
 
-    static Logger logger = Logger.getRootLogger();
+       static Logger logger = Logger.getRootLogger();
 
-    public static String toString(Map<String, Content> structuredContent) {
-      final StringWriter sw = new StringWriter();
-      print(structuredContent,sw);
-      return sw.toString();
-    }
-    
-    public static void print(Map<String, Content> structuredContent) {
-      print(structuredContent,new OutputStreamWriter(System.out));
-    }
-    
-    public static void print(Map<String, Content> structuredContent,Writer 
outputWriter) {
-        final PrintWriter output = new PrintWriter(outputWriter,true);
-        for (Map.Entry<String, Content> entry : structuredContent.entrySet()) {
-            Content ct = entry.getValue();
-            if (ct.getValue() != null) {
-                output.print(entry.getKey() + ": ");
-                output.println(ct.getValue());
-            } else if (ct.getValues() != null) {
-                output.print(entry.getKey() + ": ");
-                for (int j = 0; j < ct.getValues().length; j++) {
-                    if (j == 0)
-                        output.println(ct.getValues()[j]);
-                    else {
-                        output.println("\t" + ct.getValues()[j]);
-                    }
-                }
-            } else { // there are no values, but there is a Content object
-                System.out.println(
-                        "Content '" + entry.getKey() + "' has no values.");
-            }
-        }
-    }
-
-    public static Document parse(InputStream is) {
-        org.jdom.Document xmlDoc = new org.jdom.Document();
-        try {
-            SAXBuilder builder = new SAXBuilder();
-            builder.setValidation(false);
-            xmlDoc = builder.build(is);
-        } catch (JDOMException e) {
-            logger.error(e.getMessage());
-        } catch (IOException e) {
-            logger.error(e.getMessage());
-        }
-        return xmlDoc;
-    }
-
-    public static List unzip(InputStream is) {
-        List res = new ArrayList();
-        try {
-            ZipInputStream in = new ZipInputStream(is);
-            ZipEntry entry = null;
-            while ((entry = in.getNextEntry()) != null) {
-                ByteArrayOutputStream stream = new ByteArrayOutputStream();
-                byte[] buf = new byte[1024];
-                int len;
-                while ((len = in.read(buf)) > 0) {
-                    stream.write(buf, 0, len);
-                }
-                InputStream isEntry = new ByteArrayInputStream(stream
-                        .toByteArray());
-                File file = File.createTempFile("tmp", "_" + entry.getName());
-                copyInputStream(isEntry, new BufferedOutputStream(
-                        new FileOutputStream(file)));
-                res.add(file);
-            }
-            in.close();
-        } catch (IOException e) {
-            logger.error(e.getMessage());
-        }
-        return res;
-    }
-
-    private static void copyInputStream(InputStream in, OutputStream out)
-            throws IOException {
-        byte[] buffer = new byte[1024];
-        int len;
-
-        while ((len = in.read(buffer)) >= 0)
-            out.write(buffer, 0, len);
-
-        in.close();
-        out.close();
-    }
-
-    public static void saveInXmlFile(Document doc, String file) {
-        Format f = Format.getPrettyFormat().setEncoding("UTF-8");
-
-        XMLOutputter xop = new XMLOutputter(f);
-
-        try {
-
-            xop.output(doc, new FileOutputStream(file));
-
-        }
-
-        catch (IOException ex) {
-
-            logger.error(ex.getMessage());
-
-        }
-
-    }
+       public static String toString(Map<String, Content> structuredContent) {
+               final StringWriter sw = new StringWriter();
+               print(structuredContent, sw);
+               return sw.toString();
+       }
+
+       public static void print(Map<String, Content> structuredContent) {
+               print(structuredContent, new OutputStreamWriter(System.out));
+       }
+
+       public static void print(Map<String, Content> structuredContent,
+                       Writer outputWriter) {
+               final PrintWriter output = new PrintWriter(outputWriter, true);
+               for (Map.Entry<String, Content> entry : 
structuredContent.entrySet()) {
+                       Content ct = entry.getValue();
+                       if (ct.getValue() != null) {
+                               output.print(entry.getKey() + ": ");
+                               output.println(ct.getValue());
+                       } else if (ct.getValues() != null) {
+                               output.print(entry.getKey() + ": ");
+                               for (int j = 0; j < ct.getValues().length; j++) 
{
+                                       if (j == 0)
+                                               
output.println(ct.getValues()[j]);
+                                       else {
+                                               output.println("\t" + 
ct.getValues()[j]);
+                                       }
+                               }
+                       } else { // there are no values, but there is a Content 
object
+                               System.out.println("Content '" + entry.getKey()
+                                               + "' has no values.");
+                       }
+               }
+       }
+
+       public static Document parse(InputStream is) {
+               org.jdom.Document xmlDoc = new org.jdom.Document();
+               try {
+                       SAXBuilder builder = new SAXBuilder();
+                       builder.setValidation(false);
+                       xmlDoc = builder.build(is);
+               } catch (JDOMException e) {
+                       logger.error(e.getMessage());
+               } catch (IOException e) {
+                       logger.error(e.getMessage());
+               }
+               return xmlDoc;
+       }
+
+       public static List unzip(InputStream is) {
+               List res = new ArrayList();
+               try {
+                       ZipInputStream in = new ZipInputStream(is);
+                       ZipEntry entry = null;
+                       while ((entry = in.getNextEntry()) != null) {
+                               ByteArrayOutputStream stream = new 
ByteArrayOutputStream();
+                               byte[] buf = new byte[1024];
+                               int len;
+                               while ((len = in.read(buf)) > 0) {
+                                       stream.write(buf, 0, len);
+                               }
+                               InputStream isEntry = new 
ByteArrayInputStream(stream
+                                               .toByteArray());
+                               File file = File.createTempFile("tmp", "_" + 
entry.getName());
+                               saveInputStreamInFile(isEntry, new 
BufferedOutputStream(
+                                               new FileOutputStream(file)));
+                               res.add(file);
+                       }
+                       in.close();
+               } catch (IOException e) {
+                       logger.error(e.getMessage());
+               }
+               return res;
+       }
+
+       private static void saveInputStreamInFile(InputStream in, OutputStream 
out)
+                       throws IOException {
+               byte[] buffer = new byte[1024];
+               int len;
+
+               while ((len = in.read(buffer)) >= 0)
+                       out.write(buffer, 0, len);
+
+               in.close();
+               out.close();
+       }
+
+       public static void saveInXmlFile(Document doc, String file) {
+               Format f = Format.getPrettyFormat().setEncoding("UTF-8");
+
+               XMLOutputter xop = new XMLOutputter(f);
+
+               try {
+
+                       xop.output(doc, new FileOutputStream(file));
+
+               }
+
+               catch (IOException ex) {
+
+                       logger.error(ex.getMessage());
+
+               }
+       }
+
+       /**
+        * Get the contents of an <code>InputStream</code> as a
+        * <code>byte[]</code>.
+        * <p>
+        * This method buffers the input internally, so there is no need to use 
a
+        * 
+        * <code>BufferedInputStream</code>.
+        * 
+        * @param input
+        *            the <code>InputStream</code> to read from
+        * @return the requested byte array
+        * @throws NullPointerException
+        *             if the input is null
+        * 
+        * @throws IOException
+        *             if an I/O error occurs
+        */
+       public static byte[] toByteArray(InputStream input) throws IOException {
+               ByteArrayOutputStream output = new ByteArrayOutputStream();
+               copy(input, output);
+               return output.toByteArray();
+       }
+
+       public static long copy(InputStream input, OutputStream output)
+                       throws IOException {
+               byte[] buffer = new byte[1024];
+               long count = 0;
+               int n = 0;
+               while (-1 != (n = input.read(buffer))) {
+                       output.write(buffer, 0, n);
+                       count += n;
+               }
+               return count;
+       }
+       
+       public static InputStream[] copyInputStream(InputStream is, int 
nbCopies) throws IOException {      
+               InputStream[] isa = new InputStream[nbCopies];
+               byte[] content = toByteArray(is);
+        for (int i = 0; i < nbCopies; i++) {
+                       isa[i] = new ByteArrayInputStream(content);
+               }
+        return isa;
+}
 
 }

Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=581010&r1=581009&r2=581010&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Mon Oct  1 09:40:06 
2007
@@ -32,6 +32,23 @@
                 <mime>application/msword</mime>
                 <extract>
                     <content name="fullText" textSelect="fullText"/>
+                    <content name="title" textSelect="title" type="Text"/>
+                                       <content name="author" 
textSelect="author" type="Text"/>
+                                       <content name="keywords" 
textSelect="keywords" type="Text"/>
+                                       <content name="subject" 
textSelect="subject" type="Text"/>
+                                       <content name="lastauthor" 
textSelect="lastauthor" type="Text"/>
+                                       <content name="comments" 
textSelect="comments" type="Text"/>
+                                       <content name="template" 
textSelect="template" type="Text"/>
+                                       <content name="applicationname" 
textSelect="applicationname" type="Text"/>
+                                       <content name="revnumber" 
textSelect="revnumber" type="Text"/>
+                                       <content name="creationdate" 
textSelect="creationdate"/>
+                                       <content name="charcount" 
textSelect="charcount"/>
+                                       <content name="edittime" 
textSelect="edittime"/>
+                                       <content name="lastsavedatetime" 
textSelect="lastsavedatetime"/>
+                                       <content name="pagecount" 
textSelect="pagecount"/>
+                                       <content name="security" 
textSelect="security"/>
+                                       <content name="wordcount" 
textSelect="wordcount"/>
+                                       <content name="lastprinted" 
textSelect="lastprinted"/>          
                     <content name="outLinks">
                         <regexSelect>
                             <![CDATA[
@@ -47,6 +64,23 @@
                 <mime>application/vnd.ms-excel</mime>
                 <extract>
                     <content name="fullText" textSelect="fullText"/>
+                    <content name="title" textSelect="title" type="Text"/>
+                                       <content name="author" 
textSelect="author" type="Text"/>
+                                       <content name="keywords" 
textSelect="keywords" type="Text"/>
+                                       <content name="subject" 
textSelect="subject" type="Text"/>
+                                       <content name="lastauthor" 
textSelect="lastauthor" type="Text"/>
+                                       <content name="comments" 
textSelect="comments" type="Text"/>
+                                       <content name="template" 
textSelect="template" type="Text"/>
+                                       <content name="applicationname" 
textSelect="applicationname" type="Text"/>
+                                       <content name="revnumber" 
textSelect="revnumber" type="Text"/>
+                                       <content name="creationdate" 
textSelect="creationdate"/>
+                                       <content name="charcount" 
textSelect="charcount"/>
+                                       <content name="edittime" 
textSelect="edittime"/>
+                                       <content name="lastsavedatetime" 
textSelect="lastsavedatetime"/>
+                                       <content name="pagecount" 
textSelect="pagecount"/>
+                                       <content name="security" 
textSelect="security"/>
+                                       <content name="wordcount" 
textSelect="wordcount"/>
+                                       <content name="lastprinted" 
textSelect="lastprinted"/>          
                     <content name="outLinks">
                         <regexSelect>
                             <![CDATA[
@@ -62,6 +96,23 @@
                 <mime>application/vnd.ms-powerpoint</mime>
                 <extract>
                     <content name="fullText" textSelect="fullText"/>
+                    <content name="title" textSelect="title" type="Text"/>
+                                       <content name="author" 
textSelect="author" type="Text"/>
+                                       <content name="keywords" 
textSelect="keywords" type="Text"/>
+                                       <content name="subject" 
textSelect="subject" type="Text"/>
+                                       <content name="lastauthor" 
textSelect="lastauthor" type="Text"/>
+                                       <content name="comments" 
textSelect="comments" type="Text"/>
+                                       <content name="template" 
textSelect="template" type="Text"/>
+                                       <content name="applicationname" 
textSelect="applicationname" type="Text"/>
+                                       <content name="revnumber" 
textSelect="revnumber" type="Text"/>
+                                       <content name="creationdate" 
textSelect="creationdate"/>
+                                       <content name="charcount" 
textSelect="charcount"/>
+                                       <content name="edittime" 
textSelect="edittime"/>
+                                       <content name="lastsavedatetime" 
textSelect="lastsavedatetime"/>
+                                       <content name="pagecount" 
textSelect="pagecount"/>
+                                       <content name="security" 
textSelect="security"/>
+                                       <content name="wordcount" 
textSelect="wordcount"/>
+                                       <content name="lastprinted" 
textSelect="lastprinted"/>          
                     <content name="title" textSelect="title"/>
                     <content name="author" textSelect="author"/>
                     <content name="subject" textSelect="subject"/>             
       

Modified: incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java?rev=581010&r1=581009&r2=581010&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java Mon Oct  1 09:40:06 2007
@@ -22,6 +22,8 @@
 import java.io.InputStream;
 import java.util.Map;
 
+import junit.framework.TestCase;
+
 import org.apache.tika.config.Content;
 import org.apache.tika.config.ParserConfig;
 import org.apache.tika.config.TikaConfig;
@@ -32,136 +34,173 @@
 import org.apache.tika.utils.Utils;
 import org.jdom.JDOMException;
 
-import junit.framework.TestCase;
-
 /**
- * Junit test class for Tika {@link Parser}s. 
+ * Junit test class for Tika {@link Parser}s.
  */
 public class TestParsers extends TestCase {
 
-    private TikaConfig tc;
-    private File testFilesBaseDir; 
+       private TikaConfig tc;
+       private File testFilesBaseDir;
 
-    public void setUp() throws JDOMException, IOException {
-        /* FIXME the old mechanism does not work anymore when running the tests
-         * with Maven - need a resource-based one, but this means more
-         * changes to classes which rely on filenames.
-         *  
-        String sep = File.separator;
-        StringTokenizer st = new StringTokenizer(System.getProperty(
-                "java.class.path"), File.pathSeparator);
-
-        classDir = new File(st.nextToken());
-
-        config = classDir.getParent() + sep + "config" + sep + "config.xml";
-
-        String log4j = classDir.getParent() + sep + "Config" + sep + "log4j"
-                + sep + "log4j.properties";
-         */ 
-
-        // FIXME for now, fix filenames according to Maven testing layout
-        final String tikaConfigFilename = "target/classes/tika-config.xml";
-        final String log4jPropertiesFilename = 
"target/classes/log4j/log4j.properties";
-        testFilesBaseDir = new File("src/test/resources/test-documents");
-        
-        tc = new TikaConfig(tikaConfigFilename);
-
-        TikaLogger.setLoggerConfigFile(log4jPropertiesFilename);
-
-    }
-
-    public void testPDFExtraction() throws Exception {
-        File file = getTestFile("testPDF.pdf");
-        String s1 = ParseUtils.getStringContent(file, tc);
-        String s2 = ParseUtils.getStringContent(file, tc, "application/pdf");
-        assertEquals(s1, s2);
-    }
-
-    public void testTXTExtraction() throws Exception {
-        File file = getTestFile("testTXT.txt");
-        String s1 = ParseUtils.getStringContent(file, tc);
-        String s2 = ParseUtils.getStringContent(file, tc, "text/plain");
-        assertEquals(s1, s2);
-    }
-
-    public void testRTFExtraction() throws Exception {
-        File file = getTestFile("testRTF.rtf");
-        String s1 = ParseUtils.getStringContent(file, tc);
-        String s2 = ParseUtils.getStringContent(file, tc, "application/rtf");
-        assertEquals(s1, s2);
-    }
-
-    public void testXMLExtraction() throws Exception {
-        File file = getTestFile("testXML.xml");
-        String s1 = ParseUtils.getStringContent(file, tc);
-        String s2 = ParseUtils.getStringContent(file, tc, "application/xml");
-        assertEquals(s1, s2);
-    }
-
-    public void testPPTExtraction() throws Exception {
-        File file = getTestFile("testPPT.ppt");
-        String s1 = ParseUtils.getStringContent(file, tc);
-        String s2 = ParseUtils.getStringContent(file, tc,
-                "application/vnd.ms-powerpoint");
-        assertEquals(s1, s2);
-    }
-
-    public void testWORDxtraction() throws Exception {
-        File file = getTestFile("testWORD.doc");
-        String s1 = ParseUtils.getStringContent(file, tc);
-        String s2 = ParseUtils.getStringContent(file, tc, 
"application/msword");
-        assertEquals(s1, s2);
-    }
-
-    public void testEXCELExtraction() throws Exception {
-        File file = getTestFile("testEXCEL.xls");
-        String s1 = ParseUtils.getStringContent(file, tc);
-        String s2 = ParseUtils.getStringContent(file, tc,
-                "application/vnd.ms-excel");
-        assertEquals(s1, s2);
-    }
-
-    public void testOOExtraction() throws Exception {
-        File file = getTestFile("testOpenOffice2.odt");
-        String s1 = ParseUtils.getStringContent(file, tc);
-        String s2 = ParseUtils.getStringContent(file, tc,
-                "application/vnd.oasis.opendocument.text");
-        assertEquals(s1, s2);
-    }
-
-    public void testHTMLExtraction() throws Exception {
-        File file = getTestFile("testHTML.html");
-        String s1 = ParseUtils.getStringContent(file, tc);
-        String s2 = ParseUtils.getStringContent(file, tc, "text/html");
-        assertEquals(s1, s2);
-
-        ParserConfig config = tc.getParserConfig("text/html");
-        Parser parser = ParserFactory.getParser(config);
-        assertNotNull(parser);
-        assertEquals("org.apache.tika.parser.html.HtmlParser", 
parser.getClass().getName());
-        parser.setMimeType("text/html");
-
-        Map<String, Content> contents = config.getContents();
-        assertNotNull(contents);
-        InputStream stream = new FileInputStream(file);
-        try {
-            parser.getContents(stream, contents);
-        } finally {
-            stream.close();
-        }
-        assertEquals(
-                "Title : Test Indexation Html",
-                contents.get("title").getValue());
-
-        assertEquals("text/html", parser.getMimeType());
-
-        final String text = Utils.toString(contents);
-        final String expected = "Test Indexation Html";
-        assertTrue("text contains '" + expected + "'", 
text.contains(expected));
-    }
-
-    private File getTestFile(String filename) {
-        return new File(testFilesBaseDir, filename);
-    }
+       public void setUp() throws JDOMException, IOException {
+               /*
+                * FIXME the old mechanism does not work anymore when running the tests
+                * with Maven - need a resource-based one, but this means more changes
+                * to classes which rely on filenames.
+                * 
+                * String sep = File.separator; StringTokenizer st = new
+                * StringTokenizer(System.getProperty( "java.class.path"),
+                * File.pathSeparator);
+                * 
+                * classDir = new File(st.nextToken());
+                * 
+                * config = classDir.getParent() + sep + "config" + sep + 
"config.xml";
+                * 
+                * String log4j = classDir.getParent() + sep + "Config" + sep + 
"log4j" +
+                * sep + "log4j.properties";
+                */
+
+               // FIXME for now, fix filenames according to Maven testing layout
+               final String tikaConfigFilename = 
"target/classes/tika-config.xml";
+               final String log4jPropertiesFilename = 
"target/classes/log4j/log4j.properties";
+               testFilesBaseDir = new 
File("src/test/resources/test-documents");
+
+               tc = new TikaConfig(tikaConfigFilename);
+
+               TikaLogger.setLoggerConfigFile(log4jPropertiesFilename);
+
+       }
+
+       public void testPDFExtraction() throws Exception {
+               File file = getTestFile("testPDF.pdf");
+               String s1 = ParseUtils.getStringContent(file, tc);
+               String s2 = ParseUtils.getStringContent(file, tc, 
"application/pdf");
+               assertEquals(s1, s2);
+       }
+
+       public void testTXTExtraction() throws Exception {
+               File file = getTestFile("testTXT.txt");
+               String s1 = ParseUtils.getStringContent(file, tc);
+               String s2 = ParseUtils.getStringContent(file, tc, "text/plain");
+               assertEquals(s1, s2);
+       }
+
+       public void testRTFExtraction() throws Exception {
+               File file = getTestFile("testRTF.rtf");
+               String s1 = ParseUtils.getStringContent(file, tc);
+               String s2 = ParseUtils.getStringContent(file, tc, 
"application/rtf");
+               assertEquals(s1, s2);
+       }
+
+       public void testXMLExtraction() throws Exception {
+               File file = getTestFile("testXML.xml");
+               String s1 = ParseUtils.getStringContent(file, tc);
+               String s2 = ParseUtils.getStringContent(file, tc, 
"application/xml");
+               assertEquals(s1, s2);
+       }
+
+       public void testPPTExtraction() throws Exception {
+               File file = getTestFile("testPPT.ppt");
+               String s1 = ParseUtils.getStringContent(file, tc);
+               String s2 = ParseUtils.getStringContent(file, tc,
+                               "application/vnd.ms-powerpoint");
+               assertEquals(s1, s2);
+               ParserConfig config = tc
+                               
.getParserConfig("application/vnd.ms-powerpoint");
+               Parser parser = ParserFactory.getParser(config);
+               Map<String, Content> contents = config.getContents();
+               assertNotNull(contents);
+               InputStream stream = new FileInputStream(file);
+               try {
+                       parser.getContents(stream, contents);
+               } finally {
+                       stream.close();
+               }
+               assertEquals("Sample Powerpoint Slide", contents.get("title")
+                               .getValue());
+       }
+
+       public void testWORDxtraction() throws Exception {
+               File file = getTestFile("testWORD.doc");
+               String s1 = ParseUtils.getStringContent(file, tc);
+               String s2 = ParseUtils.getStringContent(file, tc, 
"application/msword");
+               assertEquals(s1, s2);
+               ParserConfig config = tc.getParserConfig("application/msword");
+               Parser parser = ParserFactory.getParser(config);
+               Map<String, Content> contents = config.getContents();
+               assertNotNull(contents);
+               InputStream stream = new FileInputStream(file);
+               try {
+                       parser.getContents(stream, contents);
+               } finally {
+                       stream.close();
+               }
+               assertEquals("Sample Word Document", 
contents.get("title").getValue());
+       }
+
+       public void testEXCELExtraction() throws Exception {
+               final String expected = "Numbers and their Squares Number 
Square 1.0 1.0 2.0 4.0 3.0 9.0 4.0 16.0 5.0 25.0 6.0 36.0 7.0 49.0 8.0 64.0 9.0 
81.0 10.0 100.0 11.0 121.0 12.0 144.0 13.0 169.0 14.0 196.0 15.0 225.0 Written 
and saved in Microsoft Excel X for Mac Service Release 1.";
+               File file = getTestFile("testEXCEL.xls");
+               String s1 = ParseUtils.getStringContent(file, tc);
+               String s2 = ParseUtils.getStringContent(file, tc,
+                               "application/vnd.ms-excel");
+               assertEquals(s1, s2);
+               assertTrue("Text does not contain '" + expected + "'", s1
+                               .contains(expected));
+               ParserConfig config = 
tc.getParserConfig("application/vnd.ms-excel");
+               Parser parser = ParserFactory.getParser(config);
+               Map<String, Content> contents = config.getContents();
+               assertNotNull(contents);
+               InputStream stream = new FileInputStream(file);
+               try {
+                       parser.getContents(stream, contents);
+               } finally {
+                       stream.close();
+               }
+               assertEquals("Simple Excel document", 
contents.get("title").getValue());
+       }
+
+       public void testOOExtraction() throws Exception {
+               File file = getTestFile("testOpenOffice2.odt");
+               String s1 = ParseUtils.getStringContent(file, tc);
+               String s2 = ParseUtils.getStringContent(file, tc,
+                               "application/vnd.oasis.opendocument.text");
+               assertEquals(s1, s2);
+       }
+
+       public void testHTMLExtraction() throws Exception {
+               File file = getTestFile("testHTML.html");
+               String s1 = ParseUtils.getStringContent(file, tc);
+               String s2 = ParseUtils.getStringContent(file, tc, "text/html");
+               assertEquals(s1, s2);
+
+               ParserConfig config = tc.getParserConfig("text/html");
+               Parser parser = ParserFactory.getParser(config);
+               assertNotNull(parser);
+               assertEquals("org.apache.tika.parser.html.HtmlParser", parser
+                               .getClass().getName());
+               parser.setMimeType("text/html");
+
+               Map<String, Content> contents = config.getContents();
+               assertNotNull(contents);
+               InputStream stream = new FileInputStream(file);
+               try {
+                       parser.getContents(stream, contents);
+               } finally {
+                       stream.close();
+               }
+               assertEquals("Title : Test Indexation Html", 
contents.get("title")
+                               .getValue());
+
+               assertEquals("text/html", parser.getMimeType());
+
+               final String text = Utils.toString(contents);
+               final String expected = "Test Indexation Html";
+               assertTrue("text contains '" + expected + "'", 
text.contains(expected));
+       }
+
+       private File getTestFile(String filename) {
+               return new File(testFilesBaseDir, filename);
+       }
 
 }

Modified: incubator/tika/trunk/src/test/resources/test-documents/testEXCEL.xls
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testEXCEL.xls?rev=581010&r1=581009&r2=581010&view=diff
==============================================================================
Binary files - no diff available.

svn commit: r581010 - in /incubator/tika/trunk: ./ src/main/java/org/apache/tika/parser/msexcel/ src/main/java/org/apache/tika/parser/mspowerpoint/ src/main/java/org/apache/tika/parser/msword/ src/main/java/org/apache/tika/utils/ src/main/resources/ sr...

Reply via email to