Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java Fri May 29 14:36:21 2015 @@ -60,18 +60,18 @@ class MailContentHandler implements Cont private EmbeddedDocumentExtractor extractor; private boolean inPart = false; - + MailContentHandler(XHTMLContentHandler xhtml, Metadata metadata, ParseContext context, boolean strictParsing) { this.handler = xhtml; this.metadata = metadata; this.strictParsing = strictParsing; - + // Fetch / Build an EmbeddedDocumentExtractor with which // to handle/process the parts/attachments - + // Was an EmbeddedDocumentExtractor explicitly supplied? this.extractor = context.get(EmbeddedDocumentExtractor.class); - + // If there's no EmbeddedDocumentExtractor, then try using a normal parser // This will ensure that the contents are made available to the user, so // the see the text, but without fine-grained control/extraction @@ -80,7 +80,7 @@ class MailContentHandler implements Cont // If the user gave a parser, use that, if not the default Parser parser = context.get(AutoDetectParser.class); if (parser == null) { - parser = context.get(Parser.class); + parser = context.get(Parser.class); } if (parser == null) { TikaConfig tikaConfig = context.get(TikaConfig.class); @@ -151,10 +151,10 @@ class MailContentHandler implements Cont /** * Header for the whole message or its parts - * + * * @see http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/ - * Field.html - **/ + * Field.html + */ public void field(Field field) throws MimeException { // inPart indicates whether these metadata correspond to the // whole message or its parts @@ -207,7 +207,7 @@ class MailContentHandler implements Cont } private void processAddressList(ParsedField field, String addressListType, - String metadataField) throws MimeException { + String metadataField) throws MimeException { AddressListField toField = (AddressListField) field; if (toField.isValidField()) { AddressList addressList = toField.getAddressList(); @@ -265,7 +265,7 @@ class MailContentHandler implements Cont private String stripOutFieldPrefix(Field field, String fieldname) { String temp = field.getRaw().toString(); int loc = fieldname.length(); - while (temp.charAt(loc) ==' ') { + while (temp.charAt(loc) == ' ') { loc++; } return temp.substring(loc);
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java Fri May 29 14:36:21 2015 @@ -37,7 +37,7 @@ import org.xml.sax.SAXException; /** * Uses apache-mime4j to parse emails. Each part is treated with the * corresponding parser and displayed within elements. - * <p> + * <p/> * A {@link MimeEntityConfig} object can be passed in the parsing context * to better control the parsing process. * @@ -45,7 +45,9 @@ import org.xml.sax.SAXException; */ public class RFC822Parser extends AbstractParser { - /** Serial version UID */ + /** + * Serial version UID + */ private static final long serialVersionUID = -5504243905998074168L; private static final Set<MediaType> SUPPORTED_TYPES = Collections @@ -56,7 +58,7 @@ public class RFC822Parser extends Abstra } public void parse(InputStream stream, ContentHandler handler, - Metadata metadata, ParseContext context) throws IOException, + Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { // Get the mime4j configuration, or use a default one MimeConfig config = new MimeConfig(); Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java Fri May 29 14:36:21 2015 @@ -53,162 +53,161 @@ import org.xml.sax.SAXException; */ public class MboxParser extends AbstractParser { - /** Serial version UID */ - private static final long serialVersionUID = -1762689436731160661L; + public static final String MBOX_MIME_TYPE = "application/mbox"; + public static final String MBOX_RECORD_DIVIDER = "From "; + public static final int MAIL_MAX_SIZE = 50000000; + /** + * Serial version UID + */ + private static final long serialVersionUID = -1762689436731160661L; + private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("mbox")); + private static final Pattern EMAIL_HEADER_PATTERN = Pattern.compile("([^ ]+):[ \t]*(.*)"); + private static final Pattern EMAIL_ADDRESS_PATTERN = Pattern.compile("<(.*@.*)>"); + + private static final String EMAIL_HEADER_METADATA_PREFIX = "MboxParser-"; + private static final String EMAIL_FROMLINE_METADATA = EMAIL_HEADER_METADATA_PREFIX + "from"; + private final Map<Integer, Metadata> trackingMetadata = new HashMap<Integer, Metadata>(); + private boolean tracking = false; + + public static Date parseDate(String headerContent) throws ParseException { + SimpleDateFormat dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US); + return dateFormat.parse(headerContent); + } - private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("mbox")); + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } - public static final String MBOX_MIME_TYPE = "application/mbox"; - public static final String MBOX_RECORD_DIVIDER = "From "; - public static final int MAIL_MAX_SIZE = 50000000; - - private static final Pattern EMAIL_HEADER_PATTERN = Pattern.compile("([^ ]+):[ \t]*(.*)"); - private static final Pattern EMAIL_ADDRESS_PATTERN = Pattern.compile("<(.*@.*)>"); - - private static final String EMAIL_HEADER_METADATA_PREFIX = "MboxParser-"; - private static final String EMAIL_FROMLINE_METADATA = EMAIL_HEADER_METADATA_PREFIX + "from"; - - private boolean tracking = false; - private final Map<Integer, Metadata> trackingMetadata = new HashMap<Integer, Metadata>(); - - public Set<MediaType> getSupportedTypes(ParseContext context) { - return SUPPORTED_TYPES; - } - - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) - throws IOException, TikaException, SAXException { - - EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class, - new ParsingEmbeddedDocumentExtractor(context)); - - String charsetName = "windows-1252"; - - metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE); - metadata.set(Metadata.CONTENT_ENCODING, charsetName); - - XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); - xhtml.startDocument(); - - InputStreamReader isr = new InputStreamReader(stream, charsetName); - BufferedReader reader = new BufferedReader(isr); - try { - String curLine = reader.readLine(); - int mailItem = 0; - do { - if (curLine.startsWith(MBOX_RECORD_DIVIDER)) { - Metadata mailMetadata = new Metadata(); - Queue<String> multiline = new LinkedList<String>(); - mailMetadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length())); - mailMetadata.set(Metadata.CONTENT_TYPE, "message/rfc822"); - curLine = reader.readLine(); - - ByteArrayOutputStream message = new ByteArrayOutputStream(100000); - do { - if (curLine.startsWith(" ") || curLine.startsWith("\t")) { - String latestLine = multiline.poll(); - latestLine += " " + curLine.trim(); - multiline.add(latestLine); - } else { - multiline.add(curLine); - } + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, TikaException, SAXException { - message.write(curLine.getBytes(charsetName)); - message.write(0x0A); - curLine = reader.readLine(); - } while (curLine != null && !curLine.startsWith(MBOX_RECORD_DIVIDER) && message.size() < MAIL_MAX_SIZE); - - for (String item : multiline) { - saveHeaderInMetadata(mailMetadata, item); - } - - ByteArrayInputStream messageStream = new ByteArrayInputStream(message.toByteArray()); - message = null; - - if (extractor.shouldParseEmbedded(mailMetadata)) { - extractor.parseEmbedded(messageStream, xhtml, mailMetadata, true); - } - - if (tracking) { - getTrackingMetadata().put(mailItem++, mailMetadata); - } - } else { - curLine = reader.readLine(); + EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class, + new ParsingEmbeddedDocumentExtractor(context)); + + String charsetName = "windows-1252"; + + metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE); + metadata.set(Metadata.CONTENT_ENCODING, charsetName); + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + + InputStreamReader isr = new InputStreamReader(stream, charsetName); + BufferedReader reader = new BufferedReader(isr); + try { + String curLine = reader.readLine(); + int mailItem = 0; + do { + if (curLine.startsWith(MBOX_RECORD_DIVIDER)) { + Metadata mailMetadata = new Metadata(); + Queue<String> multiline = new LinkedList<String>(); + mailMetadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length())); + mailMetadata.set(Metadata.CONTENT_TYPE, "message/rfc822"); + curLine = reader.readLine(); + + ByteArrayOutputStream message = new ByteArrayOutputStream(100000); + do { + if (curLine.startsWith(" ") || curLine.startsWith("\t")) { + String latestLine = multiline.poll(); + latestLine += " " + curLine.trim(); + multiline.add(latestLine); + } else { + multiline.add(curLine); + } + + message.write(curLine.getBytes(charsetName)); + message.write(0x0A); + curLine = reader.readLine(); + } + while (curLine != null && !curLine.startsWith(MBOX_RECORD_DIVIDER) && message.size() < MAIL_MAX_SIZE); + + for (String item : multiline) { + saveHeaderInMetadata(mailMetadata, item); + } + + ByteArrayInputStream messageStream = new ByteArrayInputStream(message.toByteArray()); + message = null; + + if (extractor.shouldParseEmbedded(mailMetadata)) { + extractor.parseEmbedded(messageStream, xhtml, mailMetadata, true); + } + + if (tracking) { + getTrackingMetadata().put(mailItem++, mailMetadata); + } + } else { + curLine = reader.readLine(); + } + + } while (curLine != null && !Thread.currentThread().isInterrupted()); + + } finally { + reader.close(); } - } while (curLine != null && !Thread.currentThread().isInterrupted()); + xhtml.endDocument(); + } + + public boolean isTracking() { + return tracking; + } + + public void setTracking(boolean tracking) { + this.tracking = tracking; + } - } finally { - reader.close(); + public Map<Integer, Metadata> getTrackingMetadata() { + return trackingMetadata; } - xhtml.endDocument(); - } + private void saveHeaderInMetadata(Metadata metadata, String curLine) { + Matcher headerMatcher = EMAIL_HEADER_PATTERN.matcher(curLine); + if (!headerMatcher.matches()) { + return; // ignore malformed header lines + } + + String headerTag = headerMatcher.group(1).toLowerCase(Locale.ROOT); + String headerContent = headerMatcher.group(2); + + if (headerTag.equalsIgnoreCase("From")) { + metadata.set(TikaCoreProperties.CREATOR, headerContent); + } else if (headerTag.equalsIgnoreCase("To") || headerTag.equalsIgnoreCase("Cc") + || headerTag.equalsIgnoreCase("Bcc")) { + Matcher address = EMAIL_ADDRESS_PATTERN.matcher(headerContent); + if (address.find()) { + metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, address.group(1)); + } else if (headerContent.indexOf('@') > -1) { + metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, headerContent); + } - public static Date parseDate(String headerContent) throws ParseException { - SimpleDateFormat dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US); - return dateFormat.parse(headerContent); - } - - public boolean isTracking() { - return tracking; - } - - public void setTracking(boolean tracking) { - this.tracking = tracking; - } - - public Map<Integer, Metadata> getTrackingMetadata() { - return trackingMetadata; - } - - private void saveHeaderInMetadata(Metadata metadata, String curLine) { - Matcher headerMatcher = EMAIL_HEADER_PATTERN.matcher(curLine); - if (!headerMatcher.matches()) { - return; // ignore malformed header lines - } - - String headerTag = headerMatcher.group(1).toLowerCase(Locale.ROOT); - String headerContent = headerMatcher.group(2); - - if (headerTag.equalsIgnoreCase("From")) { - metadata.set(TikaCoreProperties.CREATOR, headerContent); - } else if (headerTag.equalsIgnoreCase("To") || headerTag.equalsIgnoreCase("Cc") - || headerTag.equalsIgnoreCase("Bcc")) { - Matcher address = EMAIL_ADDRESS_PATTERN.matcher(headerContent); - if (address.find()) { - metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, address.group(1)); - } else if (headerContent.indexOf('@') > -1) { - metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, headerContent); - } - - String property = Metadata.MESSAGE_TO; - if (headerTag.equalsIgnoreCase("Cc")) { - property = Metadata.MESSAGE_CC; - } else if (headerTag.equalsIgnoreCase("Bcc")) { - property = Metadata.MESSAGE_BCC; - } - metadata.add(property, headerContent); - } else if (headerTag.equalsIgnoreCase("Subject")) { - metadata.add(Metadata.SUBJECT, headerContent); - } else if (headerTag.equalsIgnoreCase("Date")) { - try { - Date date = parseDate(headerContent); - metadata.set(TikaCoreProperties.CREATED, date); - } catch (ParseException e) { - // ignoring date because format was not understood - } - } else if (headerTag.equalsIgnoreCase("Message-Id")) { - metadata.set(TikaCoreProperties.IDENTIFIER, headerContent); - } else if (headerTag.equalsIgnoreCase("In-Reply-To")) { - metadata.set(TikaCoreProperties.RELATION, headerContent); - } else if (headerTag.equalsIgnoreCase("Content-Type")) { - // TODO - key off content-type in headers to - // set mapping to use for content and convert if necessary. - - metadata.add(Metadata.CONTENT_TYPE, headerContent); - metadata.set(TikaCoreProperties.FORMAT, headerContent); - } else { - metadata.add(EMAIL_HEADER_METADATA_PREFIX + headerTag, headerContent); + String property = Metadata.MESSAGE_TO; + if (headerTag.equalsIgnoreCase("Cc")) { + property = Metadata.MESSAGE_CC; + } else if (headerTag.equalsIgnoreCase("Bcc")) { + property = Metadata.MESSAGE_BCC; + } + metadata.add(property, headerContent); + } else if (headerTag.equalsIgnoreCase("Subject")) { + metadata.add(Metadata.SUBJECT, headerContent); + } else if (headerTag.equalsIgnoreCase("Date")) { + try { + Date date = parseDate(headerContent); + metadata.set(TikaCoreProperties.CREATED, date); + } catch (ParseException e) { + // ignoring date because format was not understood + } + } else if (headerTag.equalsIgnoreCase("Message-Id")) { + metadata.set(TikaCoreProperties.IDENTIFIER, headerContent); + } else if (headerTag.equalsIgnoreCase("In-Reply-To")) { + metadata.set(TikaCoreProperties.RELATION, headerContent); + } else if (headerTag.equalsIgnoreCase("Content-Type")) { + // TODO - key off content-type in headers to + // set mapping to use for content and convert if necessary. + + metadata.add(Metadata.CONTENT_TYPE, headerContent); + metadata.set(TikaCoreProperties.FORMAT, headerContent); + } else { + metadata.add(EMAIL_HEADER_METADATA_PREFIX + headerTag, headerContent); + } } - } } Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java Fri May 29 14:36:21 2015 @@ -48,159 +48,157 @@ import org.xml.sax.helpers.AttributesImp /** * @author Tran Nam Quang * @author hong-thai.nguyen - * */ public class OutlookPSTParser extends AbstractParser { - private static final long serialVersionUID = 620998217748364063L; + private static final long serialVersionUID = 620998217748364063L; - private static final MediaType MS_OUTLOOK_PST_MIMETYPE = MediaType.application("vnd.ms-outlook-pst"); - private static final Set<MediaType> SUPPORTED_TYPES = singleton(MS_OUTLOOK_PST_MIMETYPE); + private static final MediaType MS_OUTLOOK_PST_MIMETYPE = MediaType.application("vnd.ms-outlook-pst"); + private static final Set<MediaType> SUPPORTED_TYPES = singleton(MS_OUTLOOK_PST_MIMETYPE); - public Set<MediaType> getSupportedTypes(ParseContext context) { - return SUPPORTED_TYPES; - } - - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) - throws IOException, SAXException, TikaException { - - // Use the delegate parser to parse the contained document - EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class, - new ParsingEmbeddedDocumentExtractor(context)); - - metadata.set(Metadata.CONTENT_TYPE, MS_OUTLOOK_PST_MIMETYPE.toString()); - - XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); - xhtml.startDocument(); - - TikaInputStream in = TikaInputStream.get(stream); - PSTFile pstFile = null; - try { - pstFile = new PSTFile(in.getFile().getPath()); - metadata.set(Metadata.CONTENT_LENGTH, valueOf(pstFile.getFileHandle().length())); - boolean isValid = pstFile.getFileHandle().getFD().valid(); - metadata.set("isValid", valueOf(isValid)); - if (isValid) { - parseFolder(xhtml, pstFile.getRootFolder(), embeddedExtractor); - } - } catch (Exception e) { - throw new TikaException(e.getMessage(), e); - } finally { - if (pstFile != null && pstFile.getFileHandle() != null) { - try{ - pstFile.getFileHandle().close(); - } catch (IOException e) { - //swallow closing exception - } - } + private static AttributesImpl createAttribute(String attName, String attValue) { + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", attName, attName, "CDATA", attValue); + return attributes; } - xhtml.endDocument(); - } + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } - private void parseFolder(XHTMLContentHandler handler, PSTFolder pstFolder, EmbeddedDocumentExtractor embeddedExtractor) - throws Exception { - if (pstFolder.getContentCount() > 0) { - PSTMessage pstMail = (PSTMessage) pstFolder.getNextChild(); - while (pstMail != null) { - AttributesImpl attributes = new AttributesImpl(); - attributes.addAttribute("", "class", "class", "CDATA", "embedded"); - attributes.addAttribute("", "id", "id", "CDATA", pstMail.getInternetMessageId()); - handler.startElement("div", attributes); - handler.element("h1", pstMail.getSubject()); - - parserMailItem(handler, pstMail, embeddedExtractor); - parseMailAttachments(handler, pstMail, embeddedExtractor); - - handler.endElement("div"); - - pstMail = (PSTMessage) pstFolder.getNextChild(); - } - } - - if (pstFolder.hasSubfolders()) { - for (PSTFolder pstSubFolder : pstFolder.getSubFolders()) { - handler.startElement("div", createAttribute("class", "email-folder")); - handler.element("h1", pstSubFolder.getDisplayName()); - parseFolder(handler, pstSubFolder, embeddedExtractor); - handler.endElement("div"); - } - } - } - - private void parserMailItem(XHTMLContentHandler handler, PSTMessage pstMail, EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException { - Metadata mailMetadata = new Metadata(); - mailMetadata.set(Metadata.RESOURCE_NAME_KEY, pstMail.getInternetMessageId()); - mailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, pstMail.getInternetMessageId()); - mailMetadata.set(TikaCoreProperties.IDENTIFIER, pstMail.getInternetMessageId()); - mailMetadata.set(TikaCoreProperties.TITLE, pstMail.getSubject()); - mailMetadata.set(Metadata.MESSAGE_FROM, pstMail.getSenderName()); - mailMetadata.set(TikaCoreProperties.CREATOR, pstMail.getSenderName()); - mailMetadata.set(TikaCoreProperties.CREATED, pstMail.getCreationTime()); - mailMetadata.set(TikaCoreProperties.MODIFIED, pstMail.getLastModificationTime()); - mailMetadata.set(TikaCoreProperties.COMMENTS, pstMail.getComment()); - mailMetadata.set("descriptorNodeId", valueOf(pstMail.getDescriptorNodeId())); - mailMetadata.set("senderEmailAddress", pstMail.getSenderEmailAddress()); - mailMetadata.set("recipients", pstMail.getRecipientsString()); - mailMetadata.set("displayTo", pstMail.getDisplayTo()); - mailMetadata.set("displayCC", pstMail.getDisplayCC()); - mailMetadata.set("displayBCC", pstMail.getDisplayBCC()); - mailMetadata.set("importance", valueOf(pstMail.getImportance())); - mailMetadata.set("priority", valueOf(pstMail.getPriority())); - mailMetadata.set("flagged", valueOf(pstMail.isFlagged())); - - byte[] mailContent = pstMail.getBody().getBytes(IOUtils.UTF_8); - embeddedExtractor.parseEmbedded(new ByteArrayInputStream(mailContent), handler, mailMetadata, true); - } - - - private static AttributesImpl createAttribute(String attName, String attValue) { - AttributesImpl attributes = new AttributesImpl(); - attributes.addAttribute("", attName, attName, "CDATA", attValue); - return attributes; - } - - private void parseMailAttachments(XHTMLContentHandler xhtml, PSTMessage email, EmbeddedDocumentExtractor embeddedExtractor) - throws TikaException { - int numberOfAttachments = email.getNumberOfAttachments(); - for (int i = 0; i < numberOfAttachments; i++) { - File tempFile = null; - try { - PSTAttachment attach = email.getAttachment(i); - - // Get the filename; both long and short filenames can be used for attachments - String filename = attach.getLongFilename(); - if (filename.isEmpty()) { - filename = attach.getFilename(); + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + + // Use the delegate parser to parse the contained document + EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class, + new ParsingEmbeddedDocumentExtractor(context)); + + metadata.set(Metadata.CONTENT_TYPE, MS_OUTLOOK_PST_MIMETYPE.toString()); + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + + TikaInputStream in = TikaInputStream.get(stream); + PSTFile pstFile = null; + try { + pstFile = new PSTFile(in.getFile().getPath()); + metadata.set(Metadata.CONTENT_LENGTH, valueOf(pstFile.getFileHandle().length())); + boolean isValid = pstFile.getFileHandle().getFD().valid(); + metadata.set("isValid", valueOf(isValid)); + if (isValid) { + parseFolder(xhtml, pstFile.getRootFolder(), embeddedExtractor); + } + } catch (Exception e) { + throw new TikaException(e.getMessage(), e); + } finally { + if (pstFile != null && pstFile.getFileHandle() != null) { + try { + pstFile.getFileHandle().close(); + } catch (IOException e) { + //swallow closing exception + } + } } - xhtml.element("p", filename); + xhtml.endDocument(); + } - Metadata attachMeta = new Metadata(); - attachMeta.set(Metadata.RESOURCE_NAME_KEY, filename); - attachMeta.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filename); - AttributesImpl attributes = new AttributesImpl(); - attributes.addAttribute("", "class", "class", "CDATA", "embedded"); - attributes.addAttribute("", "id", "id", "CDATA", filename); - xhtml.startElement("div", attributes); - if (embeddedExtractor.shouldParseEmbedded(attachMeta)) { - TemporaryResources tmp = new TemporaryResources(); - try { - TikaInputStream tis = TikaInputStream.get(attach.getFileInputStream(), tmp); - embeddedExtractor.parseEmbedded(tis, xhtml, attachMeta, true); - } finally { - tmp.dispose(); - } + private void parseFolder(XHTMLContentHandler handler, PSTFolder pstFolder, EmbeddedDocumentExtractor embeddedExtractor) + throws Exception { + if (pstFolder.getContentCount() > 0) { + PSTMessage pstMail = (PSTMessage) pstFolder.getNextChild(); + while (pstMail != null) { + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "class", "class", "CDATA", "embedded"); + attributes.addAttribute("", "id", "id", "CDATA", pstMail.getInternetMessageId()); + handler.startElement("div", attributes); + handler.element("h1", pstMail.getSubject()); + + parserMailItem(handler, pstMail, embeddedExtractor); + parseMailAttachments(handler, pstMail, embeddedExtractor); + + handler.endElement("div"); + + pstMail = (PSTMessage) pstFolder.getNextChild(); + } + } + + if (pstFolder.hasSubfolders()) { + for (PSTFolder pstSubFolder : pstFolder.getSubFolders()) { + handler.startElement("div", createAttribute("class", "email-folder")); + handler.element("h1", pstSubFolder.getDisplayName()); + parseFolder(handler, pstSubFolder, embeddedExtractor); + handler.endElement("div"); + } } - xhtml.endElement("div"); + } - } catch (Exception e) { - throw new TikaException("Unable to unpack document stream", e); - } finally { - if (tempFile != null) - tempFile.delete(); - } + private void parserMailItem(XHTMLContentHandler handler, PSTMessage pstMail, EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException { + Metadata mailMetadata = new Metadata(); + mailMetadata.set(Metadata.RESOURCE_NAME_KEY, pstMail.getInternetMessageId()); + mailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, pstMail.getInternetMessageId()); + mailMetadata.set(TikaCoreProperties.IDENTIFIER, pstMail.getInternetMessageId()); + mailMetadata.set(TikaCoreProperties.TITLE, pstMail.getSubject()); + mailMetadata.set(Metadata.MESSAGE_FROM, pstMail.getSenderName()); + mailMetadata.set(TikaCoreProperties.CREATOR, pstMail.getSenderName()); + mailMetadata.set(TikaCoreProperties.CREATED, pstMail.getCreationTime()); + mailMetadata.set(TikaCoreProperties.MODIFIED, pstMail.getLastModificationTime()); + mailMetadata.set(TikaCoreProperties.COMMENTS, pstMail.getComment()); + mailMetadata.set("descriptorNodeId", valueOf(pstMail.getDescriptorNodeId())); + mailMetadata.set("senderEmailAddress", pstMail.getSenderEmailAddress()); + mailMetadata.set("recipients", pstMail.getRecipientsString()); + mailMetadata.set("displayTo", pstMail.getDisplayTo()); + mailMetadata.set("displayCC", pstMail.getDisplayCC()); + mailMetadata.set("displayBCC", pstMail.getDisplayBCC()); + mailMetadata.set("importance", valueOf(pstMail.getImportance())); + mailMetadata.set("priority", valueOf(pstMail.getPriority())); + mailMetadata.set("flagged", valueOf(pstMail.isFlagged())); + + byte[] mailContent = pstMail.getBody().getBytes(IOUtils.UTF_8); + embeddedExtractor.parseEmbedded(new ByteArrayInputStream(mailContent), handler, mailMetadata, true); + } + + private void parseMailAttachments(XHTMLContentHandler xhtml, PSTMessage email, EmbeddedDocumentExtractor embeddedExtractor) + throws TikaException { + int numberOfAttachments = email.getNumberOfAttachments(); + for (int i = 0; i < numberOfAttachments; i++) { + File tempFile = null; + try { + PSTAttachment attach = email.getAttachment(i); + + // Get the filename; both long and short filenames can be used for attachments + String filename = attach.getLongFilename(); + if (filename.isEmpty()) { + filename = attach.getFilename(); + } + + xhtml.element("p", filename); + + Metadata attachMeta = new Metadata(); + attachMeta.set(Metadata.RESOURCE_NAME_KEY, filename); + attachMeta.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filename); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "class", "class", "CDATA", "embedded"); + attributes.addAttribute("", "id", "id", "CDATA", filename); + xhtml.startElement("div", attributes); + if (embeddedExtractor.shouldParseEmbedded(attachMeta)) { + TemporaryResources tmp = new TemporaryResources(); + try { + TikaInputStream tis = TikaInputStream.get(attach.getFileInputStream(), tmp); + embeddedExtractor.parseEmbedded(tis, xhtml, attachMeta, true); + } finally { + tmp.dispose(); + } + } + xhtml.endElement("div"); + + } catch (Exception e) { + throw new TikaException("Unable to unpack document stream", e); + } finally { + if (tempFile != null) + tempFile.delete(); + } + } } - } } Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java Fri May 29 14:36:21 2015 @@ -34,15 +34,13 @@ public abstract class AbstractListManage //helper class that is docx/doc format agnostic protected class ParagraphLevelCounter { - Pattern LEVEL_INTERPOLATOR = Pattern.compile("%(\\d+)"); - //counts can == 0 if the format is decimal, make sure //that flag values are < 0 private final Integer NOT_SEEN_YET = -1; private final Integer FIRST_SKIPPED = -2; - private List<Integer> counts = new ArrayList<Integer>(); private final LevelTuple[] levelTuples; - + Pattern LEVEL_INTERPOLATOR = Pattern.compile("%(\\d+)"); + private List<Integer> counts = new ArrayList<Integer>(); private int lastLevel = -1; public ParagraphLevelCounter(LevelTuple[] levelTuples) { @@ -52,16 +50,17 @@ public abstract class AbstractListManage public int getNumberOfLevels() { return levelTuples.length; } + /** * Apply this to every numbered paragraph in order. * - * @param levelNumber level number that is being incremented + * @param levelNumber level number that is being incremented * @return the new formatted number string for this level */ public String incrementLevel(int levelNumber, LevelTuple[] overrideLevelTuples) { - for (int i = lastLevel+1; i < levelNumber; i++) { - if (i >= counts.size()){ + for (int i = lastLevel + 1; i < levelNumber; i++) { + if (i >= counts.size()) { int val = getStart(i, overrideLevelTuples); counts.add(i, val); } else { @@ -104,7 +103,7 @@ public abstract class AbstractListManage //short circuit bullet String numFmt = getNumFormat(level, isLegal, overrideLevelTuples); if ("bullet".equals(numFmt)) { - return BULLET+" "; + return BULLET + " "; } String lvlText = (overrideLevelTuples == null || overrideLevelTuples[level].lvlText == null) ? @@ -163,7 +162,7 @@ public abstract class AbstractListManage } else if ("ordinal".equals(numFmt)) { return ordinalize(count); } else if ("decimalZero".equals(numFmt)) { - return "0"+NumberFormatter.getNumber(count, 0); + return "0" + NumberFormatter.getNumber(count, 0); } else if ("none".equals(numFmt)) { return ""; } @@ -174,13 +173,13 @@ public abstract class AbstractListManage //this is only good for locale == English String countString = Integer.toString(count); if (countString.endsWith("1")) { - return countString+"st"; + return countString + "st"; } else if (countString.endsWith("2")) { - return countString+"nd"; + return countString + "nd"; } else if (countString.endsWith("3")) { - return countString+"rd"; + return countString + "rd"; } - return countString+"th"; + return countString + "th"; } private String getNumFormat(int lvlNum, boolean isLegal, LevelTuple[] overrideLevelTuples) { @@ -218,7 +217,7 @@ public abstract class AbstractListManage if (restart == 0) { return; } else if (restart == -1 || - startlevelNumber <= restart - 1 ) { + startlevelNumber <= restart - 1) { counts.set(levelNumber, NOT_SEEN_YET); } else { //do nothing/don't reset Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java Fri May 29 14:36:21 2015 @@ -47,56 +47,59 @@ import org.apache.tika.sax.XHTMLContentH import org.xml.sax.SAXException; abstract class AbstractPOIFSExtractor { + private static final Log logger = LogFactory.getLog(AbstractPOIFSExtractor.class); private final EmbeddedDocumentExtractor extractor; private PasswordProvider passwordProvider; private TikaConfig tikaConfig; private MimeTypes mimeTypes; private Detector detector; private Metadata metadata; - private static final Log logger = LogFactory.getLog(AbstractPOIFSExtractor.class); protected AbstractPOIFSExtractor(ParseContext context) { this(context, null); } + protected AbstractPOIFSExtractor(ParseContext context, Metadata metadata) { EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class); - if (ex==null) { + if (ex == null) { this.extractor = new ParsingEmbeddedDocumentExtractor(context); } else { this.extractor = ex; } - + this.passwordProvider = context.get(PasswordProvider.class); this.tikaConfig = context.get(TikaConfig.class); this.mimeTypes = context.get(MimeTypes.class); this.detector = context.get(Detector.class); this.metadata = metadata; } - + // Note - these cache, but avoid creating the default TikaConfig if not needed protected TikaConfig getTikaConfig() { - if (tikaConfig == null) { - tikaConfig = TikaConfig.getDefaultConfig(); - } - return tikaConfig; + if (tikaConfig == null) { + tikaConfig = TikaConfig.getDefaultConfig(); + } + return tikaConfig; } + protected Detector getDetector() { - if (detector != null) return detector; - - detector = getTikaConfig().getDetector(); - return detector; + if (detector != null) return detector; + + detector = getTikaConfig().getDetector(); + return detector; } + protected MimeTypes getMimeTypes() { - if (mimeTypes != null) return mimeTypes; - - mimeTypes = getTikaConfig().getMimeRepository(); - return mimeTypes; + if (mimeTypes != null) return mimeTypes; + + mimeTypes = getTikaConfig().getMimeRepository(); + return mimeTypes; } - + /** * Returns the password to be used for this file, or null - * if no / default password should be used + * if no / default password should be used */ protected String getPassword() { if (passwordProvider != null) { @@ -104,30 +107,30 @@ abstract class AbstractPOIFSExtractor { } return null; } - + protected void handleEmbeddedResource(TikaInputStream resource, String filename, String relationshipID, String mediaType, XHTMLContentHandler xhtml, boolean outputHtml) - throws IOException, SAXException, TikaException { - try { - Metadata metadata = new Metadata(); - if(filename != null) { - metadata.set(Metadata.TIKA_MIME_FILE, filename); - metadata.set(Metadata.RESOURCE_NAME_KEY, filename); - } - if (relationshipID != null) { - metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, relationshipID); - } - if(mediaType != null) { - metadata.set(Metadata.CONTENT_TYPE, mediaType); - } - - if (extractor.shouldParseEmbedded(metadata)) { - extractor.parseEmbedded(resource, xhtml, metadata, outputHtml); - } - } finally { - resource.close(); - } + throws IOException, SAXException, TikaException { + try { + Metadata metadata = new Metadata(); + if (filename != null) { + metadata.set(Metadata.TIKA_MIME_FILE, filename); + metadata.set(Metadata.RESOURCE_NAME_KEY, filename); + } + if (relationshipID != null) { + metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, relationshipID); + } + if (mediaType != null) { + metadata.set(Metadata.CONTENT_TYPE, mediaType); + } + + if (extractor.shouldParseEmbedded(metadata)) { + extractor.parseEmbedded(resource, xhtml, metadata, outputHtml); + } + } finally { + resource.close(); + } } /** @@ -167,7 +170,7 @@ abstract class AbstractPOIFSExtractor { if (type == POIFSDocumentType.OLE10_NATIVE) { try { // Try to un-wrap the OLE10Native record: - Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode)dir); + Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir); if (ole.getLabel() != null) { metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel()); } @@ -180,33 +183,33 @@ abstract class AbstractPOIFSExtractor { } } else if (type == POIFSDocumentType.COMP_OBJ) { try { - // Grab the contents and process - DocumentEntry contentsEntry; - try { - contentsEntry = (DocumentEntry)dir.getEntry("CONTENTS"); - } catch (FileNotFoundException ioe) { - contentsEntry = (DocumentEntry)dir.getEntry("Contents"); - } - DocumentInputStream inp = new DocumentInputStream(contentsEntry); - byte[] contents = new byte[contentsEntry.getSize()]; - inp.readFully(contents); - embedded = TikaInputStream.get(contents); - - // Try to work out what it is - MediaType mediaType = getDetector().detect(embedded, new Metadata()); - String extension = type.getExtension(); - try { - MimeType mimeType = getMimeTypes().forName(mediaType.toString()); - extension = mimeType.getExtension(); - } catch(MimeTypeException mte) { - // No details on this type are known - } - - // Record what we can do about it - metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString()); - metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension); - } catch(Exception e) { - throw new TikaException("Invalid embedded resource", e); + // Grab the contents and process + DocumentEntry contentsEntry; + try { + contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS"); + } catch (FileNotFoundException ioe) { + contentsEntry = (DocumentEntry) dir.getEntry("Contents"); + } + DocumentInputStream inp = new DocumentInputStream(contentsEntry); + byte[] contents = new byte[contentsEntry.getSize()]; + inp.readFully(contents); + embedded = TikaInputStream.get(contents); + + // Try to work out what it is + MediaType mediaType = getDetector().detect(embedded, new Metadata()); + String extension = type.getExtension(); + try { + MimeType mimeType = getMimeTypes().forName(mediaType.toString()); + extension = mimeType.getExtension(); + } catch (MimeTypeException mte) { + // No details on this type are known + } + + // Record what we can do about it + metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString()); + metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension); + } catch (Exception e) { + throw new TikaException("Invalid embedded resource", e); } } else { metadata.set(Metadata.CONTENT_TYPE, type.getType().toString()); Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Fri May 29 14:36:21 2015 @@ -16,7 +16,7 @@ */ package org.apache.tika.parser.microsoft; -import java.awt.Point; +import java.awt.*; import java.io.IOException; import java.text.NumberFormat; import java.util.ArrayList; @@ -74,11 +74,11 @@ import org.xml.sax.SAXException; /** * Excel parser implementation which uses POI's Event API * to handle the contents of a Workbook. - * <p> + * <p/> * The Event API uses a much smaller memory footprint than * <code>HSSFWorkbook</code> when processing excel files * but at the cost of more complexity. - * <p> + * <p/> * With the Event API a <i>listener</i> is registered for * specific record types and those records are created, * fired off to the listener and then discarded as the stream @@ -90,6 +90,8 @@ import org.xml.sax.SAXException; */ public class ExcelExtractor extends AbstractPOIFSExtractor { + private static final String WORKBOOK_ENTRY = "Workbook"; + private static final String BOOK_ENTRY = "Book"; /** * <code>true</code> if the HSSFListener should be registered * to listen for all records or <code>false</code> (the default) @@ -97,9 +99,6 @@ public class ExcelExtractor extends Abst * records. */ private boolean listenForAllRecords = false; - - private static final String WORKBOOK_ENTRY = "Workbook"; - private static final String BOOK_ENTRY = "Book"; public ExcelExtractor(ParseContext context, Metadata metadata) { super(context, metadata); @@ -116,14 +115,14 @@ public class ExcelExtractor extends Abst /** * Specifies whether this parser should to listen for all * records or just for the specified few. - * <p> + * <p/> * <strong>Note:</strong> Under normal operation this setting should * be <code>false</code> (the default), but you can experiment with * this setting for testing and debugging purposes. * * @param listenForAllRecords <code>true</code> if the HSSFListener - * should be registered to listen for all records or <code>false</code> - * if the listener should be configured to only receive specified records. + * should be registered to listen for all records or <code>false</code> + * if the listener should be configured to only receive specified records. */ public void setListenForAllRecords(boolean listenForAllRecords) { this.listenForAllRecords = listenForAllRecords; @@ -135,7 +134,7 @@ public class ExcelExtractor extends Abst * * @param filesystem POI file system * @throws IOException if an error occurs processing the workbook - * or writing the extracted content + * or writing the extracted content */ protected void parse( NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml, @@ -146,7 +145,7 @@ public class ExcelExtractor extends Abst protected void parse( DirectoryNode root, XHTMLContentHandler xhtml, Locale locale) throws IOException, SAXException, TikaException { - if (! root.hasEntry(WORKBOOK_ENTRY)) { + if (!root.hasEntry(WORKBOOK_ENTRY)) { if (root.hasEntry(BOOK_ENTRY)) { // Excel 5 / Excel 95 file // Records are in a different structure so needs a @@ -155,14 +154,14 @@ public class ExcelExtractor extends Abst OldExcelParser.parse(extractor, xhtml); return; } else { - // Corrupt file / very old file, just skip text extraction - return; + // Corrupt file / very old file, just skip text extraction + return; } } - + // If a password was supplied, use it, otherwise the default Biff8EncryptionKey.setCurrentUserPassword(getPassword()); - + // Have the file processed in event mode TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale, this); listener.processFile(root, isListenForAllRecords()); @@ -177,7 +176,7 @@ public class ExcelExtractor extends Abst // ignore parse errors from embedded documents } } - } + } } // ====================================================================== @@ -191,12 +190,18 @@ public class ExcelExtractor extends Abst * XHTML content handler to which the document content is rendered. */ private final XHTMLContentHandler handler; - + /** * The POIFS Extractor, used for embeded resources. */ private final AbstractPOIFSExtractor extractor; - + /** + * Format for rendering numbers in the worksheet. Currently we just + * use the platform default formatting. + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-103">TIKA-103</a> + */ + private final NumberFormat format; /** * Potential exception thrown by the content handler. When set to * non-<code>null</code>, causes all subsequent HSSF records to be @@ -204,53 +209,37 @@ public class ExcelExtractor extends Abst * {@link #throwStoredException()} is invoked. */ private Exception exception = null; - private SSTRecord sstRecord; private FormulaRecord stringFormulaRecord; - private short previousSid; - /** * Internal <code>FormatTrackingHSSFListener</code> to handle cell * formatting within the extraction. */ private FormatTrackingHSSFListener formatListener; - /** * List of worksheet names. */ private List<String> sheetNames = new ArrayList<String>(); - /** * Index of the current worksheet within the workbook. * Used to find the worksheet name in the {@link #sheetNames} list. */ private short currentSheetIndex; - /** * Content of the current worksheet, or <code>null</code> if no * worksheet is currently active. */ private SortedMap<Point, Cell> currentSheet = null; - /** * Extra text or cells that crops up, typically as part of a - * worksheet but not always. + * worksheet but not always. */ private List<Cell> extraTextCells = new ArrayList<Cell>(); - - /** - * Format for rendering numbers in the worksheet. Currently we just - * use the platform default formatting. - * - * @see <a href="https://issues.apache.org/jira/browse/TIKA-103">TIKA-103</a> - */ - private final NumberFormat format; - /** * These aren't complete when we first see them, as the - * depend on continue records that aren't always - * contiguous. Collect them for later processing. + * depend on continue records that aren't always + * contiguous. Collect them for later processing. */ private List<DrawingGroupRecord> drawingGroups = new ArrayList<DrawingGroupRecord>(); @@ -270,21 +259,21 @@ public class ExcelExtractor extends Abst /** * Entry point to listener to start the processing of a file. * - * @param filesystem POI file system. + * @param filesystem POI file system. * @param listenForAllRecords sets whether the listener is configured to listen - * for all records types or not. - * @throws IOException on any IO errors. + * for all records types or not. + * @throws IOException on any IO errors. * @throws SAXException on any SAX parsing errors. */ - public void processFile(NPOIFSFileSystem filesystem, boolean listenForAllRecords) - throws IOException, SAXException, TikaException { + public void processFile(NPOIFSFileSystem filesystem, boolean listenForAllRecords) + throws IOException, SAXException, TikaException { processFile(filesystem.getRoot(), listenForAllRecords); } - public void processFile(DirectoryNode root, boolean listenForAllRecords) - throws IOException, SAXException, TikaException { + public void processFile(DirectoryNode root, boolean listenForAllRecords) + throws IOException, SAXException, TikaException { - // Set up listener and register the records we want to process + // Set up listener and register the records we want to process HSSFRequest hssfRequest = new HSSFRequest(); if (listenForAllRecords) { hssfRequest.addListenerForAllRecords(formatListener); @@ -317,17 +306,17 @@ public class ExcelExtractor extends Abst } catch (org.apache.poi.EncryptedDocumentException e) { throw new EncryptedDocumentException(e); } - + // Output any extra text that came after all the sheets - processExtraText(); - + processExtraText(); + // Look for embeded images, now that the drawing records // have been fully matched with their continue data - for(DrawingGroupRecord dgr : drawingGroups) { - dgr.decode(); - findPictures(dgr.getEscherRecords()); + for (DrawingGroupRecord dgr : drawingGroups) { + dgr.decode(); + findPictures(dgr.getEscherRecords()); } - } + } /** * Process a HSSF record. @@ -339,7 +328,7 @@ public class ExcelExtractor extends Abst try { internalProcessRecord(record); } catch (TikaException te) { - exception = te; + exception = te; } catch (IOException ie) { exception = ie; } catch (SAXException se) { @@ -350,142 +339,142 @@ public class ExcelExtractor extends Abst public void throwStoredException() throws TikaException, SAXException, IOException { if (exception != null) { - if(exception instanceof IOException) - throw (IOException)exception; - if(exception instanceof SAXException) - throw (SAXException)exception; - if(exception instanceof TikaException) - throw (TikaException)exception; + if (exception instanceof IOException) + throw (IOException) exception; + if (exception instanceof SAXException) + throw (SAXException) exception; + if (exception instanceof TikaException) + throw (TikaException) exception; throw new TikaException(exception.getMessage()); } } private void internalProcessRecord(Record record) throws SAXException, TikaException, IOException { switch (record.getSid()) { - case BOFRecord.sid: // start of workbook, worksheet etc. records - BOFRecord bof = (BOFRecord) record; - if (bof.getType() == BOFRecord.TYPE_WORKBOOK) { - currentSheetIndex = -1; - } else if (bof.getType() == BOFRecord.TYPE_CHART) { - if(previousSid == EOFRecord.sid) { - // This is a sheet which contains only a chart - newSheet(); - } else { - // This is a chart within a normal sheet - // Handling of this is a bit hacky... - if (currentSheet != null) { - processSheet(); - currentSheetIndex--; + case BOFRecord.sid: // start of workbook, worksheet etc. records + BOFRecord bof = (BOFRecord) record; + if (bof.getType() == BOFRecord.TYPE_WORKBOOK) { + currentSheetIndex = -1; + } else if (bof.getType() == BOFRecord.TYPE_CHART) { + if (previousSid == EOFRecord.sid) { + // This is a sheet which contains only a chart newSheet(); + } else { + // This is a chart within a normal sheet + // Handling of this is a bit hacky... + if (currentSheet != null) { + processSheet(); + currentSheetIndex--; + newSheet(); + } } + } else if (bof.getType() == BOFRecord.TYPE_WORKSHEET) { + newSheet(); } - } else if (bof.getType() == BOFRecord.TYPE_WORKSHEET) { - newSheet(); - } - break; + break; - case EOFRecord.sid: // end of workbook, worksheet etc. records - if (currentSheet != null) { - processSheet(); - } - currentSheet = null; - break; + case EOFRecord.sid: // end of workbook, worksheet etc. records + if (currentSheet != null) { + processSheet(); + } + currentSheet = null; + break; - case BoundSheetRecord.sid: // Worksheet index record - BoundSheetRecord boundSheetRecord = (BoundSheetRecord) record; - sheetNames.add(boundSheetRecord.getSheetname()); - break; - - case SSTRecord.sid: // holds all the strings for LabelSSTRecords - sstRecord = (SSTRecord) record; - break; - - case FormulaRecord.sid: // Cell value from a formula - FormulaRecord formula = (FormulaRecord) record; - if (formula.hasCachedResultString()) { - // The String itself should be the next record - stringFormulaRecord = formula; - } else { - addTextCell(record, formatListener.formatNumberDateCell(formula)); - } - break; - - case StringRecord.sid: - if (previousSid == FormulaRecord.sid) { - // Cached string value of a string formula - StringRecord sr = (StringRecord) record; - addTextCell(stringFormulaRecord, sr.getString()); - } else { - // Some other string not associated with a cell, skip - } - break; + case BoundSheetRecord.sid: // Worksheet index record + BoundSheetRecord boundSheetRecord = (BoundSheetRecord) record; + sheetNames.add(boundSheetRecord.getSheetname()); + break; + + case SSTRecord.sid: // holds all the strings for LabelSSTRecords + sstRecord = (SSTRecord) record; + break; + + case FormulaRecord.sid: // Cell value from a formula + FormulaRecord formula = (FormulaRecord) record; + if (formula.hasCachedResultString()) { + // The String itself should be the next record + stringFormulaRecord = formula; + } else { + addTextCell(record, formatListener.formatNumberDateCell(formula)); + } + break; - case LabelRecord.sid: // strings stored directly in the cell - LabelRecord label = (LabelRecord) record; - addTextCell(record, label.getValue()); - break; - - case LabelSSTRecord.sid: // Ref. a string in the shared string table - LabelSSTRecord sst = (LabelSSTRecord) record; - UnicodeString unicode = sstRecord.getString(sst.getSSTIndex()); - addTextCell(record, unicode.getString()); - break; - - case NumberRecord.sid: // Contains a numeric cell value - NumberRecord number = (NumberRecord) record; - addTextCell(record, formatListener.formatNumberDateCell(number)); - break; - - case RKRecord.sid: // Excel internal number record - RKRecord rk = (RKRecord) record; - addCell(record, new NumberCell(rk.getRKNumber(), format)); - break; - - case HyperlinkRecord.sid: // holds a URL associated with a cell - if (currentSheet != null) { - HyperlinkRecord link = (HyperlinkRecord) record; - Point point = - new Point(link.getFirstColumn(), link.getFirstRow()); - Cell cell = currentSheet.get(point); - if (cell != null) { - String address = link.getAddress(); - if (address != null) { - addCell(record, new LinkedCell(cell, address)); - } else { - addCell(record, cell); + case StringRecord.sid: + if (previousSid == FormulaRecord.sid) { + // Cached string value of a string formula + StringRecord sr = (StringRecord) record; + addTextCell(stringFormulaRecord, sr.getString()); + } else { + // Some other string not associated with a cell, skip + } + break; + + case LabelRecord.sid: // strings stored directly in the cell + LabelRecord label = (LabelRecord) record; + addTextCell(record, label.getValue()); + break; + + case LabelSSTRecord.sid: // Ref. a string in the shared string table + LabelSSTRecord sst = (LabelSSTRecord) record; + UnicodeString unicode = sstRecord.getString(sst.getSSTIndex()); + addTextCell(record, unicode.getString()); + break; + + case NumberRecord.sid: // Contains a numeric cell value + NumberRecord number = (NumberRecord) record; + addTextCell(record, formatListener.formatNumberDateCell(number)); + break; + + case RKRecord.sid: // Excel internal number record + RKRecord rk = (RKRecord) record; + addCell(record, new NumberCell(rk.getRKNumber(), format)); + break; + + case HyperlinkRecord.sid: // holds a URL associated with a cell + if (currentSheet != null) { + HyperlinkRecord link = (HyperlinkRecord) record; + Point point = + new Point(link.getFirstColumn(), link.getFirstRow()); + Cell cell = currentSheet.get(point); + if (cell != null) { + String address = link.getAddress(); + if (address != null) { + addCell(record, new LinkedCell(cell, address)); + } else { + addCell(record, cell); + } } } - } - break; + break; - case TextObjectRecord.sid: - TextObjectRecord tor = (TextObjectRecord) record; - addTextCell(record, tor.getStr().getString()); - break; - - case SeriesTextRecord.sid: // Chart label or title - SeriesTextRecord str = (SeriesTextRecord) record; - addTextCell(record, str.getText()); - break; - - case DrawingGroupRecord.sid: - // Collect this now, we'll process later when all - // the continue records are in - drawingGroups.add( (DrawingGroupRecord)record ); - break; + case TextObjectRecord.sid: + TextObjectRecord tor = (TextObjectRecord) record; + addTextCell(record, tor.getStr().getString()); + break; + + case SeriesTextRecord.sid: // Chart label or title + SeriesTextRecord str = (SeriesTextRecord) record; + addTextCell(record, str.getText()); + break; + + case DrawingGroupRecord.sid: + // Collect this now, we'll process later when all + // the continue records are in + drawingGroups.add((DrawingGroupRecord) record); + break; } previousSid = record.getSid(); - + if (stringFormulaRecord != record) { - stringFormulaRecord = null; + stringFormulaRecord = null; } } private void processExtraText() throws SAXException { - if(extraTextCells.size() > 0) { - for(Cell cell : extraTextCells) { + if (extraTextCells.size() > 0) { + for (Cell cell : extraTextCells) { handler.startElement("div", "class", "outside"); cell.render(handler); handler.endElement("div"); @@ -501,7 +490,7 @@ public class ExcelExtractor extends Abst * worksheet (if any) at the position (if any) of the given record. * * @param record record that holds the cell value - * @param cell cell value (or <code>null</code>) + * @param cell cell value (or <code>null</code>) */ private void addCell(Record record, Cell cell) throws SAXException { if (cell == null) { @@ -510,7 +499,7 @@ public class ExcelExtractor extends Abst && record instanceof CellValueRecordInterface) { // Normal cell inside a worksheet CellValueRecordInterface value = - (CellValueRecordInterface) record; + (CellValueRecordInterface) record; Point point = new Point(value.getColumn(), value.getRow()); currentSheet.put(point, cell); } else { @@ -524,7 +513,7 @@ public class ExcelExtractor extends Abst * is trimmed, and ignored if <code>null</code> or empty. * * @param record record that holds the text value - * @param text text content, may be <code>null</code> + * @param text text content, may be <code>null</code> * @throws SAXException */ private void addTextCell(Record record, String text) throws SAXException { @@ -584,32 +573,32 @@ public class ExcelExtractor extends Abst // Sheet End handler.endElement("tbody"); handler.endElement("table"); - + // Finish up processExtraText(); handler.endElement("div"); } private void findPictures(List<EscherRecord> records) throws IOException, SAXException, TikaException { - for(EscherRecord escherRecord : records) { - if (escherRecord instanceof EscherBSERecord) { - EscherBlipRecord blip = ((EscherBSERecord) escherRecord).getBlipRecord(); - if (blip != null) { - HSSFPictureData picture = new HSSFPictureData(blip); - String mimeType = picture.getMimeType(); - TikaInputStream stream = TikaInputStream.get(picture.getData()); - - // Handle the embeded resource - extractor.handleEmbeddedResource( - stream, null, null, mimeType, - handler, true - ); - } - } - - // Recursive call. - findPictures(escherRecord.getChildRecords()); - } + for (EscherRecord escherRecord : records) { + if (escherRecord instanceof EscherBSERecord) { + EscherBlipRecord blip = ((EscherBSERecord) escherRecord).getBlipRecord(); + if (blip != null) { + HSSFPictureData picture = new HSSFPictureData(blip); + String mimeType = picture.getMimeType(); + TikaInputStream stream = TikaInputStream.get(picture.getData()); + + // Handle the embeded resource + extractor.handleEmbeddedResource( + stream, null, null, mimeType, + handler, true + ); + } + } + + // Recursive call. + findPictures(escherRecord.getChildRecords()); + } } }
