Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/
package org.apache.tika.parser.microsoft;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

import org.apache.poi.hssf.extractor.OldExcelExtractor;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * Tika parser for very old, pre-OLE2 Excel files (such as Excel 4),
 * delegating the actual text extraction to POI's {@link OldExcelExtractor}.
 */
public class OldExcelParser extends AbstractParser {
    private static final long serialVersionUID = 4611820730372823452L;

    /** Media types this parser announces through {@link #getSupportedTypes(ParseContext)}. */
    private static final Set<MediaType> SUPPORTED_TYPES = buildSupportedTypes();

    /**
     * Assembles the read-only set of Excel 2/3/4 sheet and workspace types.
     */
    private static Set<MediaType> buildSupportedTypes() {
        Set<MediaType> types = new HashSet<MediaType>(Arrays.asList(
                MediaType.application("vnd.ms-excel.sheet.4"),
                MediaType.application("vnd.ms-excel.workspace.4"),
                MediaType.application("vnd.ms-excel.sheet.3"),
                MediaType.application("vnd.ms-excel.workspace.3"),
                MediaType.application("vnd.ms-excel.sheet.2")
        ));
        return Collections.unmodifiableSet(types);
    }

    /**
     * Writes the text produced by the given extractor to the handler,
     * emitting one {@code <p>} element per line of text, wrapped in a
     * start/end document pair.
     *
     * @param extractor POI extractor to pull the text from
     * @param xhtml     handler receiving the document events
     */
    protected static void parse(OldExcelExtractor extractor,
                                XHTMLContentHandler xhtml) throws TikaException, IOException, SAXException {
        // POI hands the text back in one piece
        final String wholeText = extractor.getText();

        xhtml.startDocument();

        // Walk the text line by line, one paragraph each
        BufferedReader lines = new BufferedReader(new StringReader(wholeText));
        for (String row = lines.readLine(); row != null; row = lines.readLine()) {
            xhtml.startElement("p");
            xhtml.characters(row);
            xhtml.endElement("p");
        }

        xhtml.endDocument();
    }

    /**
     * Returns the fixed set of old Excel media types, regardless of context.
     */
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Extracts text from an old-format Excel input stream and passes it
     * to the given content handler as XHTML.
     */
    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        // Let POI open the stream first
        final OldExcelExtractor poiExtractor = new OldExcelExtractor(stream);

        // No metadata handling: these old formats did not store any.
        // TODO Get the version and type, to set as the Content Type

        // Hand the extracted text over to the wrapped content handler
        parse(poiExtractor, new XHTMLContentHandler(handler, metadata));
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,386 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/
package org.apache.tika.parser.microsoft;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.text.ParseException;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.poi.hmef.attribute.MAPIRtfAttribute;
import org.apache.poi.hsmf.MAPIMessage;
import org.apache.poi.hsmf.datatypes.AttachmentChunks;
import org.apache.poi.hsmf.datatypes.ByteChunk;
import org.apache.poi.hsmf.datatypes.Chunk;
import org.apache.poi.hsmf.datatypes.Chunks;
import org.apache.poi.hsmf.datatypes.MAPIProperty;
import org.apache.poi.hsmf.datatypes.PropertyValue;
import org.apache.poi.hsmf.datatypes.StringChunk;
import org.apache.poi.hsmf.datatypes.Types;
import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.util.CodePageUtil;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlEncodingDetector;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.parser.mbox.MboxParser;
import org.apache.tika.parser.rtf.RTFParser;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;

import static java.nio.charset.StandardCharsets.UTF_8;

/**
 * Outlook Message Parser.
 * <p>Wraps a POI {@link MAPIMessage} and writes its sender/recipient
 * metadata, header summary, message body and attachments to an
 * {@link XHTMLContentHandler}.</p>
 */
public class OutlookExtractor extends AbstractPOIFSExtractor {
    private static final Metadata EMPTY_METADATA = new Metadata();

    /** Name of the raw header that may carry a charset parameter. */
    private static final String CONTENT_TYPE = "Content-Type";

    /**
     * Matches a Content-Type header carrying a charset parameter; group 1
     * is the charset name. Compiled once rather than on every message.
     */
    private static final Pattern CONTENT_TYPE_CHARSET = Pattern.compile(
            "Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE);

    HtmlEncodingDetector detector = new HtmlEncodingDetector();

    private final MAPIMessage msg;

    /**
     * Creates an extractor for the message stored at the root of the
     * given filesystem.
     *
     * @throws TikaException if POI fails to open the message
     */
    public OutlookExtractor(NPOIFSFileSystem filesystem, ParseContext context) throws TikaException {
        this(filesystem.getRoot(), context);
    }

    /**
     * Creates an extractor for the message stored in the given directory.
     *
     * @throws TikaException if POI fails to open the message; the
     *                       underlying {@link IOException} is the cause
     */
    public OutlookExtractor(DirectoryNode root, ParseContext context) throws TikaException {
        super(context);

        try {
            this.msg = new MAPIMessage(root);
        } catch (IOException e) {
            throw new TikaException("Failed to parse Outlook message", e);
        }
    }

    /**
     * Populates the metadata and emits the message as XHTML: an
     * {@code h1} with the subject, a {@code dl} with the from/to/cc/bcc/
     * recipient headers, a {@code div class="message-body"} with the body
     * (preference order: html, rtf, text) and one
     * {@code div class="attachment-entry"} per attachment.
     *
     * @param xhtml    handler receiving the content events
     * @param metadata metadata object to fill in
     * @throws TikaException if POI throws {@link ChunkNotFoundException}
     *                       even though it was told to return null on
     *                       missing chunks
     */
    public void parse(XHTMLContentHandler xhtml, Metadata metadata)
            throws TikaException, SAXException, IOException {
        try {
            msg.setReturnNullOnMissingChunk(true);

            // If the message contains strings that aren't stored
            //  as Unicode, try to sort out an encoding for them
            if (msg.has7BitEncodingStrings()) {
                guess7BitEncoding(msg);
            }

            // Start with the metadata
            String subject = msg.getSubject();
            String from = msg.getDisplayFrom();

            metadata.set(TikaCoreProperties.CREATOR, from);
            metadata.set(Metadata.MESSAGE_FROM, from);
            metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
            metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
            metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());

            metadata.set(TikaCoreProperties.TITLE, subject);
            // TODO: Move to description in Tika 2.0
            metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION,
                    msg.getConversationTopic());

            try {
                for (String recipientAddress : msg.getRecipientEmailAddressList()) {
                    if (recipientAddress != null) {
                        metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
                    }
                }
            } catch (ChunkNotFoundException he) {
                // Deliberately ignored: recipient addresses are optional.
                // Will be fixed in POI 3.7 Final
            }

            // Date - try two ways to find it
            // First try via the proper chunk
            if (msg.getMessageDate() != null) {
                metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
                metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
            } else {
                try {
                    // Failing that try via the raw headers
                    String[] headers = msg.getHeaders();
                    if (headers != null && headers.length > 0) {
                        for (String header : headers) {
                            if (header.toLowerCase(Locale.ROOT).startsWith("date:")) {
                                String date = header.substring(header.indexOf(':') + 1).trim();

                                // See if we can parse it as a normal mail date
                                try {
                                    Date d = MboxParser.parseDate(date);
                                    metadata.set(TikaCoreProperties.CREATED, d);
                                    metadata.set(TikaCoreProperties.MODIFIED, d);
                                } catch (ParseException e) {
                                    // Store it as-is, and hope for the best...
                                    metadata.set(TikaCoreProperties.CREATED, date);
                                    metadata.set(TikaCoreProperties.MODIFIED, date);
                                }
                                break;
                            }
                        }
                    }
                } catch (ChunkNotFoundException he) {
                    // We can't find the date, sorry...
                }
            }


            xhtml.element("h1", subject);

            // Output the from and to details in text, as you
            //  often want them in text form for searching
            xhtml.startElement("dl");
            if (from != null) {
                header(xhtml, "From", from);
            }
            header(xhtml, "To", msg.getDisplayTo());
            header(xhtml, "Cc", msg.getDisplayCC());
            header(xhtml, "Bcc", msg.getDisplayBCC());
            try {
                header(xhtml, "Recipients", msg.getRecipientEmailAddress());
            } catch (ChunkNotFoundException e) {
                // Deliberately ignored: the recipients entry is simply left out
            }
            xhtml.endElement("dl");

            // Get the message body. Preference order is: html, rtf, text
            Chunk htmlChunk = null;
            Chunk rtfChunk = null;
            Chunk textChunk = null;
            for (Chunk chunk : msg.getMainChunks().getChunks()) {
                if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
                    htmlChunk = chunk;
                }
                if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
                    rtfChunk = chunk;
                }
                if (chunk.getChunkId() == MAPIProperty.BODY.id) {
                    textChunk = chunk;
                }
            }

            boolean doneBody = false;
            xhtml.startElement("div", "class", "message-body");
            if (htmlChunk != null) {
                byte[] data = null;
                if (htmlChunk instanceof ByteChunk) {
                    data = ((ByteChunk) htmlChunk).getValue();
                } else if (htmlChunk instanceof StringChunk) {
                    data = ((StringChunk) htmlChunk).getRawValue();
                }
                if (data != null) {
                    HtmlParser htmlParser = new HtmlParser();
                    htmlParser.parse(
                            new ByteArrayInputStream(data),
                            new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
                            new Metadata(), new ParseContext()
                    );
                    doneBody = true;
                }
            }
            // Type-check before casting, as the html branch above does, so an
            // unexpectedly typed chunk falls through to the next body format
            // instead of failing with a ClassCastException
            if (rtfChunk instanceof ByteChunk && !doneBody) {
                ByteChunk chunk = (ByteChunk) rtfChunk;
                MAPIRtfAttribute rtf = new MAPIRtfAttribute(
                        MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue()
                );
                RTFParser rtfParser = new RTFParser();
                rtfParser.parse(
                        new ByteArrayInputStream(rtf.getData()),
                        new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
                        new Metadata(), new ParseContext());
                doneBody = true;
            }
            if (textChunk instanceof StringChunk && !doneBody) {
                xhtml.element("p", ((StringChunk) textChunk).getValue());
            }
            xhtml.endElement("div");

            // Process the attachments
            for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
                xhtml.startElement("div", "class", "attachment-entry");

                // Prefer the long file name, fall back to the short one
                String filename = null;
                if (attachment.attachLongFileName != null) {
                    filename = attachment.attachLongFileName.getValue();
                } else if (attachment.attachFileName != null) {
                    filename = attachment.attachFileName.getValue();
                }
                if (filename != null && filename.length() > 0) {
                    xhtml.element("h1", filename);
                }

                if (attachment.attachData != null) {
                    handleEmbeddedResource(
                            TikaInputStream.get(attachment.attachData.getValue()),
                            filename, null,
                            null, xhtml, true
                    );
                }
                if (attachment.attachmentDirectory != null) {
                    handleEmbeddedOfficeDoc(
                            attachment.attachmentDirectory.getDirectory(),
                            xhtml
                    );
                }

                xhtml.endElement("div");
            }
        } catch (ChunkNotFoundException e) {
            throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
        }
    }

    /**
     * Emits a {@code dt}/{@code dd} pair for the given header, unless the
     * value is null or empty.
     */
    private void header(XHTMLContentHandler xhtml, String key, String value)
            throws SAXException {
        if (value != null && value.length() > 0) {
            xhtml.element("dt", key);
            xhtml.element("dd", value);
        }
    }

    /**
     * Tries to identify the correct encoding for 7-bit (non-unicode)
     * strings in the file.
     * <p>Many messages store their strings as unicode, which is
     * nice and easy. Some use one-byte encodings for their
     * strings, but don't always store the encoding anywhere
     * helpful in the file.</p>
     * <p>This method checks for codepage properties, and failing that
     * looks at the headers for the message, and uses these to
     * guess the correct encoding for your file.</p>
     * <p>The sources are tried in this order, stopping at the first one
     * that {@link #tryToSet7BitEncoding(MAPIMessage, String)} accepts:
     * the codepage properties, a charset on a Content-Type header, the
     * encoding detected from the html body, and finally a charset
     * detector run over the raw text body.</p>
     * <p>Bug #49441 has more on why this is needed</p>
     * <p>This is taken from POI (TIKA-1238)
     * as a temporary workaround to prevent unsupported encoding exceptions</p>
     */
    private void guess7BitEncoding(MAPIMessage msg) {
        Chunks mainChunks = msg.getMainChunks();
        //sanity check
        if (mainChunks == null) {
            return;
        }

        Map<MAPIProperty, List<PropertyValue>> props = mainChunks.getProperties();
        if (props != null) {
            // First choice is a codepage property
            for (MAPIProperty prop : new MAPIProperty[]{
                    MAPIProperty.MESSAGE_CODEPAGE,
                    MAPIProperty.INTERNET_CPID
            }) {
                List<PropertyValue> val = props.get(prop);
                // Only trust the property when it really holds a long value;
                // anything else is skipped instead of raising a ClassCastException
                if (val != null && val.size() > 0
                        && val.get(0) instanceof PropertyValue.LongPropertyValue) {
                    int codepage = ((PropertyValue.LongPropertyValue) val.get(0)).getValue();
                    String encoding = null;
                    try {
                        encoding = CodePageUtil.codepageToEncoding(codepage, true);
                    } catch (UnsupportedEncodingException e) {
                        // Unknown codepage: encoding stays null and is
                        // rejected by tryToSet7BitEncoding below
                    }
                    if (tryToSet7BitEncoding(msg, encoding)) {
                        return;
                    }
                }
            }
        }

        // Second choice is a charset on a content type header
        try {
            String[] headers = msg.getHeaders();
            if (headers != null && headers.length > 0) {
                // Look for a content type with a charset
                for (String header : headers) {
                    // Compare the header name ignoring case, in line with the
                    // case-insensitive pattern applied afterwards
                    if (header.regionMatches(true, 0, CONTENT_TYPE, 0, CONTENT_TYPE.length())) {
                        Matcher m = CONTENT_TYPE_CHARSET.matcher(header);
                        if (m.matches()) {
                            // Found it! Tell all the string chunks
                            String charset = m.group(1);
                            if (tryToSet7BitEncoding(msg, charset)) {
                                return;
                            }
                        }
                    }
                }
            }
        } catch (ChunkNotFoundException e) {
            // No headers available, move on to the next source
        }

        // Nothing suitable in the headers, try HTML
        // TODO: do we need to replicate this in Tika? If we wind up
        // parsing the html version of the email, this is duplicative??
        // Or do we need to reset the header strings based on the html
        // meta header if there is no other information?
        try {
            String html = msg.getHtmlBody();
            if (html != null && html.length() > 0) {
                Charset charset = null;
                try {
                    charset = detector.detect(new ByteArrayInputStream(
                            html.getBytes(UTF_8)), EMPTY_METADATA);
                } catch (IOException e) {
                    // Detection failed: charset stays null, fall through
                }
                if (charset != null && tryToSet7BitEncoding(msg, charset.name())) {
                    return;
                }
            }
        } catch (ChunkNotFoundException e) {
            // No html body available, move on to the last resort
        }

        //absolute last resort, try charset detector
        StringChunk text = mainChunks.textBodyChunk;
        if (text != null) {
            // Named differently from the html encoding detector field so
            // that the field is not shadowed
            CharsetDetector charsetDetector = new CharsetDetector();
            charsetDetector.setText(text.getRawValue());
            CharsetMatch match = charsetDetector.detect();
            if (match != null && match.getConfidence() > 35 &&
                    tryToSet7BitEncoding(msg, match.getName())) {
                return;
            }
        }
    }

    /**
     * Applies the given charset as the message's 7-bit encoding when it is
     * usable.
     *
     * @param msg         message to configure
     * @param charsetName candidate charset name, may be null
     * @return {@code true} if the encoding was set; {@code false} if the
     *         name is null, is utf-8 (ignoring case), is illegal or is
     *         not supported by this JVM
     */
    private boolean tryToSet7BitEncoding(MAPIMessage msg, String charsetName) {
        if (charsetName == null) {
            return false;
        }

        if (charsetName.equalsIgnoreCase("utf-8")) {
            return false;
        }
        try {
            if (Charset.isSupported(charsetName)) {
                msg.set7BitEncoding(charsetName);
                return true;
            }
        } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
            // Unusable charset name: report it as not set
        }
        return false;
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,436 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor
license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft;

import static org.apache.tika.mime.MediaType.application;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.DocumentNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;

/**
 * A detector that works on a POIFS OLE2 document
 * to figure out exactly what the file is.
 * This should work for all OLE2 documents, whether
 * they are ones supported by POI or not.
 */
public class POIFSContainerDetector implements Detector {

    /**
     * The OLE base file format
     */
    public static final MediaType OLE = application("x-tika-msoffice");
    /**
     * The protected OOXML base file format
     */
    public static final MediaType OOXML_PROTECTED = application("x-tika-ooxml-protected");
    /**
     * General embedded document type within an OLE2 container
     */
    public static final MediaType GENERAL_EMBEDDED = application("x-tika-msoffice-embedded");
    /**
     * An OLE10 Native embedded document within another OLE2 document
     */
    public static final MediaType OLE10_NATIVE =
            new MediaType(GENERAL_EMBEDDED, "format", "ole10_native");
    /**
     * Some other kind of embedded document, in a CompObj container within another OLE2 document
     */
    public static final MediaType COMP_OBJ =
            new MediaType(GENERAL_EMBEDDED, "format", "comp_obj");
    /**
     * Microsoft Excel
     */
    public static final MediaType XLS = application("vnd.ms-excel");
    /**
     * Microsoft Word
     */
    public static final MediaType DOC = application("msword");
    /**
     * Microsoft PowerPoint
     */
    public static final MediaType PPT = application("vnd.ms-powerpoint");
    /**
     * Microsoft Publisher
     */
    public static final MediaType PUB = application("x-mspublisher");
    /**
     * Microsoft Visio
     */
    public static final MediaType VSD = application("vnd.visio");
    /**
     * Microsoft Works
     */
    public static final MediaType WPS = application("vnd.ms-works");
    /**
     * Microsoft Works Spreadsheet 7.0
     */
    public static final MediaType XLR = application("x-tika-msworks-spreadsheet");
    /**
     * Microsoft Outlook
     */
    public static final MediaType MSG = application("vnd.ms-outlook");
    /**
     * Microsoft Project
     */
    public static final MediaType MPP = application("vnd.ms-project");
    /**
     * StarOffice Calc
     */
    public static final MediaType SDC = application("vnd.stardivision.calc");
    /**
     * StarOffice Draw
     */
    public static final MediaType SDA = application("vnd.stardivision.draw");
    /**
     * StarOffice Impress
     */
    public static final MediaType SDD = application("vnd.stardivision.impress");
    /**
     * StarOffice Writer
     */
    public static final MediaType SDW = application("vnd.stardivision.writer");
    /**
     * SolidWorks CAD file
     */
    public static final MediaType SLDWORKS = application("sldworks");
    /**
     * Hangul Word Processor (Korean)
     */
    public static final MediaType HWP = application("x-hwp-v5");
    /**
     * Serial version UID
     */
    private static final long serialVersionUID = -3028021741663605293L;
    /**
     * An ASCII String "StarImpress"
     */
    private static final byte[] STAR_IMPRESS = new byte[]{
            0x53, 0x74, 0x61, 0x72, 0x49, 0x6d, 0x70, 0x72, 0x65, 0x73, 0x73
    };
    /**
     * An ASCII String "StarDraw"
     */
    private static final byte[] STAR_DRAW = new byte[]{
            0x53, 0x74, 0x61, 0x72, 0x44, 0x72, 0x61, 0x77
    };
    /**
     * An ASCII String "Quill96" for Works Files
     */
    private static final byte[] WORKS_QUILL96 = new byte[]{
            0x51, 0x75, 0x69, 0x6c, 0x6c, 0x39, 0x36
    };
    /**
     * Regexp for matching the MPP Project Data stream
     */
    private static final Pattern mppDataMatch = Pattern.compile("\\s\\s\\s\\d+");

    /**
     * Internal detection of the specific kind of OLE2 document, based on the
     * names of the top level streams within the file.
     *
     * @param names names of the top level entries, may be null
     * @return the detected type, or {@link #OLE} if nothing more specific
     *         could be determined
     * @deprecated Use {@link #detect(Set, DirectoryEntry)} and pass the root
     * entry of the filesystem whose type is to be detected, as a
     * second argument.
     */
    @Deprecated
    protected static MediaType detect(Set<String> names) {
        return detect(names, null);
    }

    /**
     * Internal detection of the specific kind of OLE2 document, based on the
     * names of the top-level streams within the file. In some cases the
     * detection may need access to the root {@link DirectoryEntry} of that file
     * for best results. The entry can be given as a second, optional argument.
     *
     * @param names names of the top level entries, may be null
     * @param root  root entry of the filesystem, or null if not available;
     *              only consulted to inspect the CompObj stream
     * @return the detected type, or {@link #OLE} if nothing more specific
     *         could be determined
     */
    protected static MediaType detect(Set<String> names, DirectoryEntry root) {
        if (names != null) {
            if (names.contains("SwDocContentMgr") && names.contains("SwDocMgrTempStorage")) {
                return SLDWORKS;
            } else if (names.contains("StarCalcDocument")) {
                // Star Office Calc
                return SDC;
            } else if (names.contains("StarWriterDocument")) {
                return SDW;
            } else if (names.contains("StarDrawDocument3")) {
                if (root == null) {
                    /*
                     * This is either StarOfficeDraw or StarOfficeImpress, we have
                     * to consult the CompObj to distinguish them, if this method is
                     * called in "legacy mode", without the root, just return
                     * x-tika-msoffice. The one-argument method is only for backward
                     * compatibility, if someone calls old API he/she can get the
                     * old result.
                     */
                    return OLE;
                } else {
                    return processCompObjFormatType(root);
                }
            } else if (names.contains("\u0005HwpSummaryInformation")) {
                // Hangul Word Processor v5+ (previous aren't OLE2-based)
                return HWP;
            } else if (names.contains("WksSSWorkBook")) {
                // This check has to be before names.contains("Workbook")
                // Works 7.0 spreadsheet files contain both
                // we want to avoid classifying this as Excel
                return XLR;
            } else if (names.contains("Workbook") || names.contains("WORKBOOK")) {
                return XLS;
            } else if (names.contains("Book")) {
                // Excel 95 or older, we won't be able to parse this....
                return XLS;
            } else if (names.contains("EncryptedPackage") &&
                    names.contains("EncryptionInfo") &&
                    names.contains("\u0006DataSpaces")) {
                // This is a protected OOXML document, which is an OLE2 file
                //  with an Encrypted Stream which holds the OOXML data
                // Without decrypting the stream, we can't tell what kind of
                //  OOXML file we have. Return a general OOXML Protected type,
                //  and hope the name based detection can guess the rest!
                return OOXML_PROTECTED;
            } else if (names.contains("EncryptedPackage")) {
                return OLE;
            } else if (names.contains("WordDocument")) {
                return DOC;
            } else if (names.contains("Quill")) {
                return PUB;
            } else if (names.contains("PowerPoint Document")) {
                return PPT;
            } else if (names.contains("VisioDocument")) {
                return VSD;
            } else if (names.contains("\u0001Ole10Native")) {
                return OLE10_NATIVE;
            } else if (names.contains("MatOST")) {
                // this occurs on older Works Word Processor files (versions 3.0 and 4.0)
                return WPS;
            } else if (names.contains("CONTENTS") && names.contains("SPELLING")) {
                // Newer Works files
                return WPS;
            } else if (names.contains("Contents") && names.contains("\u0003ObjInfo")) {
                return COMP_OBJ;
            } else if (names.contains("CONTENTS") && names.contains("\u0001CompObj")) {
                // CompObj is a general kind of OLE2 embedding, but this may be an old Works file
                // If we have the Directory, check
                if (root != null) {
                    MediaType type = processCompObjFormatType(root);
                    // Compare by value rather than by reference
                    if (WPS.equals(type)) {
                        return WPS;
                    } else {
                        // Assume it's a general CompObj embedded resource
                        return COMP_OBJ;
                    }
                } else {
                    // Assume it's a general CompObj embedded resource
                    return COMP_OBJ;
                }
            } else if (names.contains("CONTENTS")) {
                // CONTENTS without SPELLING nor CompObj normally means some sort
                //  of embedded non-office file inside an OLE2 document
                // This is most commonly triggered on nested directories
                return OLE;
            } else if (names.contains("\u0001CompObj") &&
                    (names.contains("Props") || names.contains("Props9") || names.contains("Props12"))) {
                // Could be Project, look for common name patterns
                // (when no name matches, this falls through to the generic
                //  OLE result at the end of the method)
                for (String name : names) {
                    if (mppDataMatch.matcher(name).matches()) {
                        return MPP;
                    }
                }
            } else if (names.contains("PerfectOffice_MAIN")) {
                if (names.contains("SlideShow")) {
                    return MediaType.application("x-corelpresentations"); // .shw
                } else if (names.contains("PerfectOffice_OBJECTS")) {
                    return MediaType.application("x-quattro-pro"); // .wb?
                }
            } else if (names.contains("NativeContent_MAIN")) {
                return MediaType.application("x-quattro-pro"); // .qpw
            } else {
                for (String name : names) {
                    if (name.startsWith("__substg1.0_")) {
                        return MSG;
                    }
                }
            }
        }

        // Couldn't detect a more specific type
        return OLE;
    }

    /**
     * Is this one of the kinds of formats which uses CompObj to
     * store all of their data, eg Star Draw, Star Impress or
     * (older) Works?
     * If not, it's likely an embedded resource
     *
     * @param root directory holding the CompObj entry to inspect
     * @return {@link #SDA}, {@link #SDD} or {@link #WPS} when the matching
     *         application name is found in the CompObj stream, otherwise
     *         {@link #OLE}
     */
    private static MediaType processCompObjFormatType(DirectoryEntry root) {
        try {
            Entry e = root.getEntry("\u0001CompObj");
            if (e != null && e.isDocumentEntry()) {
                DocumentNode dn = (DocumentNode) e;
                DocumentInputStream stream = new DocumentInputStream(dn);
                byte[] bytes;
                try {
                    bytes = IOUtils.toByteArray(stream);
                } finally {
                    // Always release the stream, also when reading fails
                    stream.close();
                }
                /*
                 * This array contains a string with a normal ASCII name of the
                 * application used to create this file. We want to search for that
                 * name.
                 */
                if (arrayContains(bytes, STAR_DRAW)) {
                    return SDA;
                } else if (arrayContains(bytes, STAR_IMPRESS)) {
                    return SDD;
                } else if (arrayContains(bytes, WORKS_QUILL96)) {
                    return WPS;
                }
            }
        } catch (Exception e) {
            /*
             * "root.getEntry" can throw FileNotFoundException. The code inside
             * "if" can throw IOExceptions. Theoretically. Practically no
             * exceptions will likely ever appear.
             *
             * Swallow all of them. If any occur, we just assume that we can't
             * distinguish between Draw and Impress and return something safe:
             * x-tika-msoffice
             */
        }
        return OLE;
    }

    /**
     * Poor man's search for byte arrays, replace with some library call if
     * you know one without adding new dependencies.
     *
     * @param larger  array to search in
     * @param smaller sequence to look for
     * @return {@code true} if {@code smaller} occurs as a contiguous run
     *         inside {@code larger}; an empty {@code smaller} is treated
     *         as contained in any array
     */
    private static boolean arrayContains(byte[] larger, byte[] smaller) {
        if (smaller.length == 0) {
            // Nothing to look for; also avoids indexing into an empty array
            return true;
        }
        int largerCounter = 0;
        int smallerCounter = 0;
        while (largerCounter < larger.length) {
            if (larger[largerCounter] == smaller[smallerCounter]) {
                largerCounter++;
                smallerCounter++;
                if (smallerCounter == smaller.length) {
                    return true;
                }
            } else {
                // Mismatch: restart one position after where this attempt began
                largerCounter = largerCounter - smallerCounter + 1;
                smallerCounter = 0;
            }
        }
        return false;
    }

    /**
     * Opens the stream's backing file as a POI filesystem, registers that
     * filesystem as the stream's open container and returns the names of
     * its top level entries.
     *
     * @return the entry names, or an empty set if POI fails with an
     *         {@link IOException} or {@link RuntimeException}
     */
    private static Set<String> getTopLevelNames(TikaInputStream stream)
            throws IOException {
        // Force the document stream to a (possibly temporary) file
        // so we don't modify the current position of the stream
        File file = stream.getFile();

        try {
            NPOIFSFileSystem fs = new NPOIFSFileSystem(file, true);

            // Optimize a possible later parsing process by keeping
            // a reference to the already opened POI file system
            stream.setOpenContainer(fs);

            return getTopLevelNames(fs.getRoot());
        } catch (IOException e) {
            // Parse error in POI, so we don't know the file type
            return Collections.emptySet();
        } catch (RuntimeException e) {
            // Another problem in POI
            return Collections.emptySet();
        }
    }

    /**
     * Collects the names of all entries directly below the given directory.
     */
    private static Set<String> getTopLevelNames(DirectoryNode root) {
        Set<String> names = new HashSet<String>();
        for (Entry entry : root) {
            names.add(entry.getName());
        }
        return names;
    }

    /**
     * Detects the type of the given document.
     * <p>Returns {@link MediaType#OCTET_STREAM} for a null input or an
     * input that does not start with the OLE2 header bytes. Otherwise the
     * top level entry names are looked up, which is only possible for a
     * {@link TikaInputStream}, and passed to
     * {@link #detect(Set, DirectoryEntry)}.</p>
     *
     * @param input    document stream, may be null; the header check
     *                 relies on mark/reset
     * @param metadata document metadata, not used by this detector
     * @return the detected media type
     */
    @Override
    public MediaType detect(InputStream input, Metadata metadata)
            throws IOException {
        // Check if we have access to the document
        if (input == null) {
            return MediaType.OCTET_STREAM;
        }

        // If this is a TikaInputStream wrapping an already
        // parsed NPOIFileSystem/DirectoryNode, just get the
        // names from the root:
        TikaInputStream tis = TikaInputStream.cast(input);
        Set<String> names = null;
        if (tis != null) {
            Object container = tis.getOpenContainer();
            if (container instanceof NPOIFSFileSystem) {
                names = getTopLevelNames(((NPOIFSFileSystem) container).getRoot());
            } else if (container instanceof DirectoryNode) {
                names = getTopLevelNames((DirectoryNode) container);
            }
        }

        if (names == null) {
            // Check if the document starts with the OLE header
            input.mark(8);
            try {
                if (input.read() != 0xd0 || input.read() != 0xcf
                        || input.read() != 0x11 || input.read() != 0xe0
                        || input.read() != 0xa1 || input.read() != 0xb1
                        || input.read() != 0x1a || input.read() != 0xe1) {
                    return MediaType.OCTET_STREAM;
                }
            } finally {
                input.reset();
            }
        }

        // We can only detect the exact type when given a TikaInputStream
        if (names == null && tis != null) {
            // Look for known top level entry names to detect the document type
            names = getTopLevelNames(tis);
        }

        // Detect based on the names (as available); instanceof already
        // covers the case of no open container
        if (tis != null && tis.getOpenContainer() instanceof NPOIFSFileSystem) {
            return detect(names, ((NPOIFSFileSystem) tis.getOpenContainer()).getRoot());
        } else {
            return detect(names, null);
        }
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,260 @@ +/* + * Licensed to the Apache Software Foundation
(ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.Date; +import java.util.HashSet; +import java.util.Set; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.poi.hpsf.CustomProperties; +import org.apache.poi.hpsf.DocumentSummaryInformation; +import org.apache.poi.hpsf.MarkUnsupportedException; +import org.apache.poi.hpsf.NoPropertySetStreamException; +import org.apache.poi.hpsf.PropertySet; +import org.apache.poi.hpsf.SummaryInformation; +import org.apache.poi.hpsf.UnexpectedPropertySetTypeException; +import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.DocumentEntry; +import org.apache.poi.poifs.filesystem.DocumentInputStream; +import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.MSOffice; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; +import org.apache.tika.metadata.OfficeOpenXMLCore; +import org.apache.tika.metadata.OfficeOpenXMLExtended; +import org.apache.tika.metadata.PagedText; +import 
org.apache.tika.metadata.Property; +import org.apache.tika.metadata.TikaCoreProperties; + +/** + * Extractor for Common OLE2 (HPSF) metadata + */ +public class SummaryExtractor { + private static final Log logger = LogFactory.getLog(AbstractPOIFSExtractor.class); + + private static final String SUMMARY_INFORMATION = + SummaryInformation.DEFAULT_STREAM_NAME; + + private static final String DOCUMENT_SUMMARY_INFORMATION = + DocumentSummaryInformation.DEFAULT_STREAM_NAME; + + private final Metadata metadata; + + public SummaryExtractor(Metadata metadata) { + this.metadata = metadata; + } + + public void parseSummaries(NPOIFSFileSystem filesystem) + throws IOException, TikaException { + parseSummaries(filesystem.getRoot()); + } + + public void parseSummaries(DirectoryNode root) + throws IOException, TikaException { + parseSummaryEntryIfExists(root, SUMMARY_INFORMATION); + parseSummaryEntryIfExists(root, DOCUMENT_SUMMARY_INFORMATION); + } + + private void parseSummaryEntryIfExists( + DirectoryNode root, String entryName) + throws IOException, TikaException { + try { + DocumentEntry entry = + (DocumentEntry) root.getEntry(entryName); + PropertySet properties = + new PropertySet(new DocumentInputStream(entry)); + if (properties.isSummaryInformation()) { + parse(new SummaryInformation(properties)); + } + if (properties.isDocumentSummaryInformation()) { + parse(new DocumentSummaryInformation(properties)); + } + } catch (FileNotFoundException e) { + // entry does not exist, just skip it + } catch (NoPropertySetStreamException e) { + // no property stream, just skip it + } catch (UnexpectedPropertySetTypeException e) { + throw new TikaException("Unexpected HPSF document", e); + } catch (MarkUnsupportedException e) { + throw new TikaException("Invalid DocumentInputStream", e); + } catch (Exception e) { + logger.warn("Ignoring unexpected exception while parsing summary entry " + entryName, e); + } + } + + private void parse(SummaryInformation summary) { + 
set(TikaCoreProperties.TITLE, summary.getTitle()); + addMulti(metadata, TikaCoreProperties.CREATOR, summary.getAuthor()); + set(TikaCoreProperties.KEYWORDS, summary.getKeywords()); + // TODO Move to OO subject in Tika 2.0 + set(TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, summary.getSubject()); + set(TikaCoreProperties.MODIFIER, summary.getLastAuthor()); + set(TikaCoreProperties.COMMENTS, summary.getComments()); + set(OfficeOpenXMLExtended.TEMPLATE, summary.getTemplate()); + set(OfficeOpenXMLExtended.APPLICATION, summary.getApplicationName()); + set(OfficeOpenXMLCore.REVISION, summary.getRevNumber()); + set(TikaCoreProperties.CREATED, summary.getCreateDateTime()); + set(TikaCoreProperties.MODIFIED, summary.getLastSaveDateTime()); + set(TikaCoreProperties.PRINT_DATE, summary.getLastPrinted()); + set(Metadata.EDIT_TIME, summary.getEditTime()); + set(OfficeOpenXMLExtended.DOC_SECURITY, summary.getSecurity()); + + // New style counts + set(Office.WORD_COUNT, summary.getWordCount()); + set(Office.CHARACTER_COUNT, summary.getCharCount()); + set(Office.PAGE_COUNT, summary.getPageCount()); + if (summary.getPageCount() > 0) { + metadata.set(PagedText.N_PAGES, summary.getPageCount()); + } + + // Old style, Tika 1.0 properties + // TODO Remove these in Tika 2.0 + set(Metadata.TEMPLATE, summary.getTemplate()); + set(Metadata.APPLICATION_NAME, summary.getApplicationName()); + set(Metadata.REVISION_NUMBER, summary.getRevNumber()); + set(Metadata.SECURITY, summary.getSecurity()); + set(MSOffice.WORD_COUNT, summary.getWordCount()); + set(MSOffice.CHARACTER_COUNT, summary.getCharCount()); + set(MSOffice.PAGE_COUNT, summary.getPageCount()); + } + + private void parse(DocumentSummaryInformation summary) { + set(OfficeOpenXMLExtended.COMPANY, summary.getCompany()); + addMulti(metadata, OfficeOpenXMLExtended.MANAGER, summary.getManager()); + set(TikaCoreProperties.LANGUAGE, getLanguage(summary)); + set(OfficeOpenXMLCore.CATEGORY, summary.getCategory()); + + // New style counts 
+ set(Office.SLIDE_COUNT, summary.getSlideCount()); + if (summary.getSlideCount() > 0) { + metadata.set(PagedText.N_PAGES, summary.getSlideCount()); + } + // Old style, Tika 1.0 counts + // TODO Remove these in Tika 2.0 + set(Metadata.COMPANY, summary.getCompany()); + set(Metadata.MANAGER, summary.getManager()); + set(MSOffice.SLIDE_COUNT, summary.getSlideCount()); + set(Metadata.CATEGORY, summary.getCategory()); + + parse(summary.getCustomProperties()); + } + + private String getLanguage(DocumentSummaryInformation summary) { + CustomProperties customProperties = summary.getCustomProperties(); + if (customProperties != null) { + Object value = customProperties.get("Language"); + if (value instanceof String) { + return (String) value; + } + } + return null; + } + + /** + * Attempt to parse custom document properties and add to the collection of metadata + * + * @param customProperties + */ + private void parse(CustomProperties customProperties) { + if (customProperties != null) { + for (String name : customProperties.nameSet()) { + // Apply the custom prefix + String key = Metadata.USER_DEFINED_METADATA_NAME_PREFIX + name; + + // Get, convert and save property value + Object value = customProperties.get(name); + if (value instanceof String) { + set(key, (String) value); + } else if (value instanceof Date) { + Property prop = Property.externalDate(key); + metadata.set(prop, (Date) value); + } else if (value instanceof Boolean) { + Property prop = Property.externalBoolean(key); + metadata.set(prop, value.toString()); + } else if (value instanceof Long) { + Property prop = Property.externalInteger(key); + metadata.set(prop, ((Long) value).intValue()); + } else if (value instanceof Double) { + Property prop = Property.externalReal(key); + metadata.set(prop, (Double) value); + } else if (value instanceof Integer) { + Property prop = Property.externalInteger(key); + metadata.set(prop, ((Integer) value).intValue()); + } + } + } + } + + private void set(String name, String 
value) { + if (value != null) { + metadata.set(name, value); + } + } + + private void set(Property property, String value) { + if (value != null) { + metadata.set(property, value); + } + } + + private void set(Property property, Date value) { + if (value != null) { + metadata.set(property, value); + } + } + + private void set(Property property, int value) { + if (value > 0) { + metadata.set(property, value); + } + } + + private void set(String name, long value) { + if (value > 0) { + metadata.set(name, Long.toString(value)); + } + } + + //MS stores values that should be multiple values (e.g. dc:creator) + //as a semicolon-delimited list. We need to split + //on semicolon to add each value. + public static void addMulti(Metadata metadata, Property property, String string) { + if (string == null) { + return; + } + String[] parts = string.split(";"); + String[] current = metadata.getValues(property); + Set<String> seen = new HashSet<>(); + if (current != null) { + for (String val : current) { + seen.add(val); + } + } + for (String part : parts) { + if (! seen.contains(part)) { + metadata.add(property, part); + seen.add(part); + } + } + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +import org.apache.poi.hmef.Attachment; +import org.apache.poi.hmef.HMEFMessage; +import org.apache.poi.hmef.attribute.MAPIAttribute; +import org.apache.poi.hmef.attribute.MAPIRtfAttribute; +import org.apache.poi.hsmf.datatypes.MAPIProperty; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.EmbeddedContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * A POI-powered Tika Parser for TNEF (Transport Neutral + * Encoding Format) messages, aka winmail.dat + */ +public class TNEFParser extends AbstractParser { + private static final long serialVersionUID = 4611820730372823452L; + + private static final 
Set<MediaType> SUPPORTED_TYPES = + Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList( + MediaType.application("vnd.ms-tnef"), + MediaType.application("ms-tnef"), + MediaType.application("x-tnef") + ))); + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + /** + * Extracts properties and text from an MS Document input stream + */ + public void parse( + InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + + // We work by recursing, so get the appropriate bits + EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class); + EmbeddedDocumentExtractor embeddedExtractor; + if (ex == null) { + embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context); + } else { + embeddedExtractor = ex; + } + + // Ask POI to process the file for us + HMEFMessage msg = new HMEFMessage(stream); + + // Set the message subject if known + String subject = msg.getSubject(); + if (subject != null && subject.length() > 0) { + // TODO: Move to title in Tika 2.0 + metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE, subject); + } + + // Recurse into the message body RTF + MAPIAttribute attr = msg.getMessageMAPIAttribute(MAPIProperty.RTF_COMPRESSED); + if (attr != null && attr instanceof MAPIRtfAttribute) { + MAPIRtfAttribute rtf = (MAPIRtfAttribute) attr; + handleEmbedded( + "message.rtf", "application/rtf", + rtf.getData(), + embeddedExtractor, handler + ); + } + + // Recurse into each attachment in turn + for (Attachment attachment : msg.getAttachments()) { + String name = attachment.getLongFilename(); + if (name == null || name.length() == 0) { + name = attachment.getFilename(); + } + if (name == null || name.length() == 0) { + String ext = attachment.getExtension(); + if (ext != null) { + name = "unknown" + ext; + } + } + handleEmbedded( + name, null, attachment.getContents(), + embeddedExtractor, handler + ); 
+ } + } + + private void handleEmbedded(String name, String type, byte[] contents, + EmbeddedDocumentExtractor embeddedExtractor, ContentHandler handler) + throws IOException, SAXException, TikaException { + Metadata metadata = new Metadata(); + if (name != null) + metadata.set(Metadata.RESOURCE_NAME_KEY, name); + if (type != null) + metadata.set(Metadata.CONTENT_TYPE, type); + + if (embeddedExtractor.shouldParseEmbedded(metadata)) { + embeddedExtractor.parseEmbedded( + TikaInputStream.get(contents), + new EmbeddedContentHandler(handler), + metadata, false); + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/TextCell.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/TextCell.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/TextCell.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/TextCell.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.SAXException; + +/** + * Text cell. + */ +public class TextCell implements Cell { + + private final String text; + + public TextCell(String text) { + this.text = text; + } + + public void render(XHTMLContentHandler handler) throws SAXException { + handler.characters(text); + } + + public String toString() { + return "Text Cell: \"" + text + "\""; + } +}
