Author: nick
Date: Mon Dec 22 05:15:54 2014
New Revision: 1647240
URL: http://svn.apache.org/r1647240
Log:
TIKA-1490 Parser for old Excel 2-4 files
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java
Modified:
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java?rev=1647240&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java
(added)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java
Mon Dec 22 05:15:54 2014
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringReader;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.poi.hssf.extractor.OldExcelExtractor;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * A POI-powered Tika Parser for very old versions of Excel, from
+ * pre-OLE2 days, such as Excel 4.
+ */
+public class OldExcelParser extends AbstractParser {
+ private static final long serialVersionUID = 4611820730372823452L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.application("vnd.ms-excel.sheet.4"),
+ MediaType.application("vnd.ms-excel.workspace.4"),
+ MediaType.application("vnd.ms-excel.sheet.3"),
+ MediaType.application("vnd.ms-excel.workspace.3"),
+ MediaType.application("vnd.ms-excel.sheet.2")
+ )));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ /**
+ * Extracts properties and text from an MS Document input stream
+ */
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ // Open the POI provided extractor
+ OldExcelExtractor extractor = new OldExcelExtractor(stream);
+
+ // We can't do anything about metadata, as these old formats
+ // didn't have any stored with them
+
+ // Set the content type
+ // TODO Get the version and type, to set as the Content Type
+
+ // Have the text extracted and given to our Content Handler
+ parse(extractor, handler, metadata);
+ }
+
+ protected static void parse(OldExcelExtractor extractor, ContentHandler
handler,
+ Metadata metadata) throws TikaException, IOException, SAXException
{
+ // Get the whole text, as a single string
+ String text = extractor.getText();
+
+ // Split and output
+ XHTMLContentHandler xhtml =
+ new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+ String line;
+ BufferedReader reader = new BufferedReader(new StringReader(text));
+ while ((line = reader.readLine()) != null) {
+ xhtml.startElement("p");
+ xhtml.characters(line);
+ xhtml.endElement("p");
+ }
+
+ xhtml.endDocument();
+ }
+}
Modified:
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1647240&r1=1647239&r2=1647240&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
(original)
+++
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
Mon Dec 22 05:15:54 2014
@@ -35,6 +35,7 @@ org.apache.tika.parser.mail.RFC822Parser
org.apache.tika.parser.mbox.MboxParser
org.apache.tika.parser.mbox.OutlookPSTParser
org.apache.tika.parser.microsoft.OfficeParser
+org.apache.tika.parser.microsoft.OldExcelParser
org.apache.tika.parser.microsoft.TNEFParser
org.apache.tika.parser.microsoft.ooxml.OOXMLParser
org.apache.tika.parser.mp3.Mp3Parser