Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,269 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.poi.hwpf.converter.NumberFormatter; + +public abstract class AbstractListManager { + private final static String BULLET = "\u00b7"; + + protected Map<Integer, ParagraphLevelCounter> listLevelMap = new HashMap<Integer, ParagraphLevelCounter>(); + protected Map<Integer, LevelTuple[]> overrideTupleMap = new HashMap<Integer, LevelTuple[]>(); + + //helper class that is docx/doc format agnostic + protected class ParagraphLevelCounter { + + //counts can == 0 if the format is decimal, make sure + //that flag values are < 0 + private final Integer NOT_SEEN_YET = -1; + private final Integer FIRST_SKIPPED = -2; + private final LevelTuple[] levelTuples; + Pattern LEVEL_INTERPOLATOR = Pattern.compile("%(\\d+)"); + private List<Integer> counts = new ArrayList<Integer>(); + private int lastLevel = -1; + + public ParagraphLevelCounter(LevelTuple[] levelTuples) { + this.levelTuples = levelTuples; + } + + public int getNumberOfLevels() { + return levelTuples.length; + } + + /** + * Apply this to every numbered paragraph in order. + * + * @param levelNumber level number that is being incremented + * @return the new formatted number string for this level + */ + public String incrementLevel(int levelNumber, LevelTuple[] overrideLevelTuples) { + + for (int i = lastLevel + 1; i < levelNumber; i++) { + if (i >= counts.size()) { + int val = getStart(i, overrideLevelTuples); + counts.add(i, val); + } else { + int count = counts.get(i); + if (count == NOT_SEEN_YET) { + count = getStart(i, overrideLevelTuples); + counts.set(i, count); + } + } + } + + if (levelNumber < counts.size()) { + resetAfter(levelNumber, overrideLevelTuples); + int count = counts.get(levelNumber); + if (count == NOT_SEEN_YET) { + count = getStart(levelNumber, overrideLevelTuples); + } else { + count++; + } + counts.set(levelNumber, count); + lastLevel = levelNumber; + return format(levelNumber, overrideLevelTuples); + } + + counts.add(levelNumber, getStart(levelNumber, overrideLevelTuples)); + lastLevel = levelNumber; + return format(levelNumber, overrideLevelTuples); + } + + /** + * @param level which level to format + * @return the string that represents the number and the surrounding text for this paragraph + */ + private String format(int level, LevelTuple[] overrideLevelTuples) { + if (level < 0 || level >= levelTuples.length) { + //log? + return ""; + } + boolean isLegal = (overrideLevelTuples != null) ? overrideLevelTuples[level].isLegal : levelTuples[level].isLegal; + //short circuit bullet + String numFmt = getNumFormat(level, isLegal, overrideLevelTuples); + if ("bullet".equals(numFmt)) { + return BULLET + " "; + } + + String lvlText = (overrideLevelTuples == null || overrideLevelTuples[level].lvlText == null) ? + levelTuples[level].lvlText : overrideLevelTuples[level].lvlText; + StringBuilder sb = new StringBuilder(); + Matcher m = LEVEL_INTERPOLATOR.matcher(lvlText); + int last = 0; + while (m.find()) { + sb.append(lvlText.substring(last, m.start())); + String lvlString = m.group(1); + int lvlNum = -1; + try { + lvlNum = Integer.parseInt(lvlString); + } catch (NumberFormatException e) { + //swallow + } + String numString = ""; + //need to subtract 1 because, e.g. %1 is the format + //for the number at array offset 0 + numString = formatNum(lvlNum - 1, isLegal, overrideLevelTuples); + + sb.append(numString); + last = m.end(); + } + sb.append(lvlText.substring(last)); + if (sb.length() > 0) { + //TODO: add in character after number + sb.append(" "); + } + return sb.toString(); + } + + //actual level number; can return empty string if numberformatter fails + private String formatNum(int lvlNum, boolean isLegal, LevelTuple[] overrideLevelTuples) { + + int numFmtStyle = 0; + String numFmt = getNumFormat(lvlNum, isLegal, overrideLevelTuples); + + int count = getCount(lvlNum); + if (count < 0) { + count = 1; + } + if ("lowerLetter".equals(numFmt)) { + numFmtStyle = 4; + } else if ("lowerRoman".equals(numFmt)) { + numFmtStyle = 2; + } else if ("decimal".equals(numFmt)) { + numFmtStyle = 0; + } else if ("upperLetter".equals(numFmt)) { + numFmtStyle = 3; + } else if ("upperRoman".equals(numFmt)) { + numFmtStyle = 1; + } else if ("bullet".equals(numFmt)) { + return ""; + //not yet handled by NumberFormatter...TODO: add to NumberFormatter? + } else if ("ordinal".equals(numFmt)) { + return ordinalize(count); + } else if ("decimalZero".equals(numFmt)) { + return "0" + NumberFormatter.getNumber(count, 0); + } else if ("none".equals(numFmt)) { + return ""; + } + try { + return NumberFormatter.getNumber(count, numFmtStyle); + } catch (IllegalArgumentException e) { + return ""; + } + } + + private String ordinalize(int count) { + //this is only good for locale == English + String countString = Integer.toString(count); + if (countString.endsWith("1")) { + return countString + "st"; + } else if (countString.endsWith("2")) { + return countString + "nd"; + } else if (countString.endsWith("3")) { + return countString + "rd"; + } + return countString + "th"; + } + + private String getNumFormat(int lvlNum, boolean isLegal, LevelTuple[] overrideLevelTuples) { + if (lvlNum < 0 || lvlNum >= levelTuples.length) { + //log? + return "decimal"; + } + if (isLegal) { + //return decimal no matter the level if isLegal is true + return "decimal"; + } + return (overrideLevelTuples == null || overrideLevelTuples[lvlNum].numFmt == null) ? + levelTuples[lvlNum].numFmt : overrideLevelTuples[lvlNum].numFmt; + } + + private int getCount(int lvlNum) { + if (lvlNum < 0 || lvlNum >= counts.size()) { + //log? + return 1; + } + return counts.get(lvlNum); + } + + private void resetAfter(int startlevelNumber, LevelTuple[] overrideLevelTuples) { + for (int levelNumber = startlevelNumber + 1; levelNumber < counts.size(); levelNumber++) { + int cnt = counts.get(levelNumber); + if (cnt == NOT_SEEN_YET) { + //do nothing + } else if (cnt == FIRST_SKIPPED) { + //do nothing + } else if (levelTuples.length > levelNumber) { + //never reset if restarts == 0 + int restart = (overrideLevelTuples == null || overrideLevelTuples[levelNumber].restart < 0) ? + levelTuples[levelNumber].restart : overrideLevelTuples[levelNumber].restart; + if (restart == 0) { + return; + } else if (restart == -1 || + startlevelNumber <= restart - 1) { + counts.set(levelNumber, NOT_SEEN_YET); + } else { + //do nothing/don't reset + } + } else { + //reset! + counts.set(levelNumber, NOT_SEEN_YET); + } + } + } + + private int getStart(int levelNumber, LevelTuple[] overrideLevelTuples) { + if (levelNumber >= levelTuples.length) { + return 1; + } else { + return (overrideLevelTuples == null || overrideLevelTuples[levelNumber].start < 0) ? + levelTuples[levelNumber].start : overrideLevelTuples[levelNumber].start; + } + } + } + + protected class LevelTuple { + private final int start; + private final int restart; + private final String lvlText; + private final String numFmt; + private final boolean isLegal; + + public LevelTuple(String lvlText) { + this.lvlText = lvlText; + start = 1; + restart = -1; + numFmt = "decimal"; + isLegal = false; + } + + public LevelTuple(int start, int restart, String lvlText, String numFmt, boolean isLegal) { + this.start = start; + this.restart = restart; + this.lvlText = lvlText; + this.numFmt = numFmt; + this.isLegal = isLegal; + } + } +}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,234 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import java.io.FileNotFoundException; +import java.io.IOException; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.poi.poifs.filesystem.DirectoryEntry; +import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.DocumentEntry; +import org.apache.poi.poifs.filesystem.DocumentInputStream; +import org.apache.poi.poifs.filesystem.Entry; +import org.apache.poi.poifs.filesystem.Ole10Native; +import org.apache.poi.poifs.filesystem.Ole10NativeException; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.detect.Detector; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.mime.MimeType; +import org.apache.tika.mime.MimeTypeException; +import org.apache.tika.mime.MimeTypes; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.PasswordProvider; +import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType; +import org.apache.tika.parser.pkg.ZipContainerDetector; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.SAXException; + +abstract class AbstractPOIFSExtractor { + private static final Log logger = LogFactory.getLog(AbstractPOIFSExtractor.class); + private final EmbeddedDocumentExtractor extractor; + private PasswordProvider passwordProvider; + private TikaConfig tikaConfig; + private MimeTypes mimeTypes; + private Detector detector; + private Metadata metadata; + + protected AbstractPOIFSExtractor(ParseContext context) { + this(context, null); + } + + protected AbstractPOIFSExtractor(ParseContext context, Metadata metadata) { + EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class); + + if (ex == null) { + this.extractor = new ParsingEmbeddedDocumentExtractor(context); + } else { + this.extractor = ex; + } + + this.passwordProvider = context.get(PasswordProvider.class); + this.tikaConfig = context.get(TikaConfig.class); + this.mimeTypes = context.get(MimeTypes.class); + this.detector = context.get(Detector.class); + this.metadata = metadata; + } + + // Note - these cache, but avoid creating the default TikaConfig if not needed + protected TikaConfig getTikaConfig() { + if (tikaConfig == null) { + tikaConfig = TikaConfig.getDefaultConfig(); + } + return tikaConfig; + } + + protected Detector getDetector() { + if (detector != null) return detector; + + detector = getTikaConfig().getDetector(); + return detector; + } + + protected MimeTypes getMimeTypes() { + if (mimeTypes != null) return mimeTypes; + + mimeTypes = getTikaConfig().getMimeRepository(); + return mimeTypes; + } + + /** + * Returns the password to be used for this file, or null + * if no / default password should be used + */ + protected String getPassword() { + if (passwordProvider != null) { + return passwordProvider.getPassword(metadata); + } + return null; + } + + protected void handleEmbeddedResource(TikaInputStream resource, String filename, + String relationshipID, String mediaType, XHTMLContentHandler xhtml, + boolean outputHtml) + throws IOException, SAXException, TikaException { + try { + Metadata metadata = new Metadata(); + if (filename != null) { + metadata.set(Metadata.TIKA_MIME_FILE, filename); + metadata.set(Metadata.RESOURCE_NAME_KEY, filename); + } + if (relationshipID != null) { + metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, relationshipID); + } + if (mediaType != null) { + metadata.set(Metadata.CONTENT_TYPE, mediaType); + } + + if (extractor.shouldParseEmbedded(metadata)) { + extractor.parseEmbedded(resource, xhtml, metadata, outputHtml); + } + } finally { + resource.close(); + } + } + + /** + * Handle an office document that's embedded at the POIFS level + */ + protected void handleEmbeddedOfficeDoc( + DirectoryEntry dir, XHTMLContentHandler xhtml) + throws IOException, SAXException, TikaException { + + // Is it an embedded OLE2 document, or an embedded OOXML document? + + if (dir.hasEntry("Package")) { + // It's OOXML (has a ZipFile): + Entry ooxml = dir.getEntry("Package"); + + try (TikaInputStream stream = TikaInputStream.get( + new DocumentInputStream((DocumentEntry) ooxml))) { + ZipContainerDetector detector = new ZipContainerDetector(); + MediaType type = detector.detect(stream, new Metadata()); + handleEmbeddedResource(stream, null, dir.getName(), type.toString(), xhtml, true); + return; + } + } + + // It's regular OLE2: + + // What kind of document is it? + Metadata metadata = new Metadata(); + metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName()); + POIFSDocumentType type = POIFSDocumentType.detectType(dir); + TikaInputStream embedded = null; + + try { + if (type == POIFSDocumentType.OLE10_NATIVE) { + try { + // Try to un-wrap the OLE10Native record: + Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir); + if (ole.getLabel() != null) { + metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel()); + } + byte[] data = ole.getDataBuffer(); + embedded = TikaInputStream.get(data); + } catch (Ole10NativeException ex) { + // Not a valid OLE10Native record, skip it + } catch (Exception e) { + logger.warn("Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document " + dir.getName(), e); + } + } else if (type == POIFSDocumentType.COMP_OBJ) { + try { + // Grab the contents and process + DocumentEntry contentsEntry; + try { + contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS"); + } catch (FileNotFoundException ioe) { + contentsEntry = (DocumentEntry) dir.getEntry("Contents"); + } + DocumentInputStream inp = new DocumentInputStream(contentsEntry); + byte[] contents = new byte[contentsEntry.getSize()]; + inp.readFully(contents); + embedded = TikaInputStream.get(contents); + + // Try to work out what it is + MediaType mediaType = getDetector().detect(embedded, new Metadata()); + String extension = type.getExtension(); + try { + MimeType mimeType = getMimeTypes().forName(mediaType.toString()); + extension = mimeType.getExtension(); + } catch (MimeTypeException mte) { + // No details on this type are known + } + + // Record what we can do about it + metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString()); + metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension); + } catch (Exception e) { + throw new TikaException("Invalid embedded resource", e); + } + } else { + metadata.set(Metadata.CONTENT_TYPE, type.getType().toString()); + metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension()); + } + + // Should we parse it? + if (extractor.shouldParseEmbedded(metadata)) { + if (embedded == null) { + // Make a TikaInputStream that just + // passes the root directory of the + // embedded document, and is otherwise + // empty (byte[0]): + embedded = TikaInputStream.get(new byte[0]); + embedded.setOpenContainer(dir); + } + extractor.parseEmbedded(embedded, xhtml, metadata, true); + } + } finally { + if (embedded != null) { + embedded.close(); + } + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/Cell.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/Cell.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/Cell.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/Cell.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.SAXException; + +/** + * Cell of content. Classes that implement this interface are used by + * Tika parsers (currently just the MS Excel parser) to keep track of + * individual pieces of content before they are rendered to the XHTML + * SAX event stream. + */ +public interface Cell { + + /** + * Renders the content to the given XHTML SAX event stream. + * + * @param handler + * @throws SAXException + */ + void render(XHTMLContentHandler handler) throws SAXException; + +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.SAXException; + +/** + * Cell decorator. + */ +public class CellDecorator implements Cell { + + private final Cell cell; + + public CellDecorator(Cell cell) { + this.cell = cell; + } + + public void render(XHTMLContentHandler handler) throws SAXException { + cell.render(handler); + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,633 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import java.awt.*; +import java.io.IOException; +import java.text.NumberFormat; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.SortedMap; +import java.util.TreeMap; + +import org.apache.poi.ddf.EscherBSERecord; +import org.apache.poi.ddf.EscherBlipRecord; +import org.apache.poi.ddf.EscherRecord; +import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener; +import org.apache.poi.hssf.eventusermodel.HSSFEventFactory; +import org.apache.poi.hssf.eventusermodel.HSSFListener; +import org.apache.poi.hssf.eventusermodel.HSSFRequest; +import org.apache.poi.hssf.extractor.OldExcelExtractor; +import org.apache.poi.hssf.record.BOFRecord; +import org.apache.poi.hssf.record.BoundSheetRecord; +import org.apache.poi.hssf.record.CellValueRecordInterface; +import org.apache.poi.hssf.record.CountryRecord; +import org.apache.poi.hssf.record.DateWindow1904Record; +import org.apache.poi.hssf.record.DrawingGroupRecord; +import org.apache.poi.hssf.record.EOFRecord; +import org.apache.poi.hssf.record.ExtendedFormatRecord; +import org.apache.poi.hssf.record.FooterRecord; +import org.apache.poi.hssf.record.FormatRecord; +import org.apache.poi.hssf.record.FormulaRecord; +import org.apache.poi.hssf.record.HeaderRecord; +import org.apache.poi.hssf.record.HyperlinkRecord; +import org.apache.poi.hssf.record.LabelRecord; +import org.apache.poi.hssf.record.LabelSSTRecord; +import org.apache.poi.hssf.record.NumberRecord; +import org.apache.poi.hssf.record.RKRecord; +import org.apache.poi.hssf.record.Record; +import org.apache.poi.hssf.record.SSTRecord; +import org.apache.poi.hssf.record.StringRecord; +import org.apache.poi.hssf.record.TextObjectRecord; +import org.apache.poi.hssf.record.chart.SeriesTextRecord; +import org.apache.poi.hssf.record.common.UnicodeString; +import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey; +import org.apache.poi.hssf.usermodel.HSSFPictureData; +import org.apache.poi.poifs.filesystem.DirectoryEntry; +import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.DocumentInputStream; +import org.apache.poi.poifs.filesystem.Entry; +import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.tika.exception.EncryptedDocumentException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.SAXException; + +/** + * Excel parser implementation which uses POI's Event API + * to handle the contents of a Workbook. + * <p/> + * The Event API uses a much smaller memory footprint than + * <code>HSSFWorkbook</code> when processing excel files + * but at the cost of more complexity. + * <p/> + * With the Event API a <i>listener</i> is registered for + * specific record types and those records are created, + * fired off to the listener and then discarded as the stream + * is being processed. + * + * @see org.apache.poi.hssf.eventusermodel.HSSFListener + * @see <a href="http://poi.apache.org/hssf/how-to.html#event_api"> + * POI Event API How To</a> + */ +public class ExcelExtractor extends AbstractPOIFSExtractor { + + private static final String WORKBOOK_ENTRY = "Workbook"; + private static final String BOOK_ENTRY = "Book"; + /** + * <code>true</code> if the HSSFListener should be registered + * to listen for all records or <code>false</code> (the default) + * if the listener should be configured to only receive specified + * records. + */ + private boolean listenForAllRecords = false; + + public ExcelExtractor(ParseContext context, Metadata metadata) { + super(context, metadata); + } + + /** + * Returns <code>true</code> if this parser is configured to listen + * for all records instead of just the specified few. + */ + public boolean isListenForAllRecords() { + return listenForAllRecords; + } + + /** + * Specifies whether this parser should to listen for all + * records or just for the specified few. + * <p/> + * <strong>Note:</strong> Under normal operation this setting should + * be <code>false</code> (the default), but you can experiment with + * this setting for testing and debugging purposes. + * + * @param listenForAllRecords <code>true</code> if the HSSFListener + * should be registered to listen for all records or <code>false</code> + * if the listener should be configured to only receive specified records. + */ + public void setListenForAllRecords(boolean listenForAllRecords) { + this.listenForAllRecords = listenForAllRecords; + } + + /** + * Extracts text from an Excel Workbook writing the extracted content + * to the specified {@link Appendable}. + * + * @param filesystem POI file system + * @throws IOException if an error occurs processing the workbook + * or writing the extracted content + */ + protected void parse( + NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml, + Locale locale) throws IOException, SAXException, TikaException { + parse(filesystem.getRoot(), xhtml, locale); + } + + protected void parse( + DirectoryNode root, XHTMLContentHandler xhtml, + Locale locale) throws IOException, SAXException, TikaException { + if (!root.hasEntry(WORKBOOK_ENTRY)) { + if (root.hasEntry(BOOK_ENTRY)) { + // Excel 5 / Excel 95 file + // Records are in a different structure so needs a + // different parser to process them + OldExcelExtractor extractor = new OldExcelExtractor(root); + OldExcelParser.parse(extractor, xhtml); + return; + } else { + // Corrupt file / very old file, just skip text extraction + return; + } + } + + // If a password was supplied, use it, otherwise the default + Biff8EncryptionKey.setCurrentUserPassword(getPassword()); + + // Have the file processed in event mode + TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale, this); + listener.processFile(root, isListenForAllRecords()); + listener.throwStoredException(); + + for (Entry entry : root) { + if (entry.getName().startsWith("MBD") + && entry instanceof DirectoryEntry) { + try { + handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml); + } catch (TikaException e) { + // ignore parse errors from embedded documents + } + } + } + } + + // ====================================================================== + + /** + * HSSF Listener implementation which processes the HSSF records. + */ + private static class TikaHSSFListener implements HSSFListener { + + /** + * XHTML content handler to which the document content is rendered. + */ + private final XHTMLContentHandler handler; + + /** + * The POIFS Extractor, used for embeded resources. + */ + private final AbstractPOIFSExtractor extractor; + /** + * Format for rendering numbers in the worksheet. Currently we just + * use the platform default formatting. + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-103">TIKA-103</a> + */ + private final NumberFormat format; + /** + * Potential exception thrown by the content handler. When set to + * non-<code>null</code>, causes all subsequent HSSF records to be + * ignored and the stored exception to be thrown when + * {@link #throwStoredException()} is invoked. + */ + private Exception exception = null; + private SSTRecord sstRecord; + private FormulaRecord stringFormulaRecord; + private short previousSid; + /** + * Internal <code>FormatTrackingHSSFListener</code> to handle cell + * formatting within the extraction. + */ + private FormatTrackingHSSFListener formatListener; + /** + * List of worksheet names. + */ + private List<String> sheetNames = new ArrayList<String>(); + /** + * Index of the current worksheet within the workbook. + * Used to find the worksheet name in the {@link #sheetNames} list. + */ + private short currentSheetIndex; + /** + * Content of the current worksheet, or <code>null</code> if no + * worksheet is currently active. + */ + private SortedMap<Point, Cell> currentSheet = null; + /** + * Extra text or cells that crops up, typically as part of a + * worksheet but not always. + */ + private List<Cell> extraTextCells = new ArrayList<Cell>(); + /** + * These aren't complete when we first see them, as the + * depend on continue records that aren't always + * contiguous. Collect them for later processing. + */ + private List<DrawingGroupRecord> drawingGroups = new ArrayList<DrawingGroupRecord>(); + + /** + * Construct a new listener instance outputting parsed data to + * the specified XHTML content handler. + * + * @param handler Destination to write the parsed output to + */ + private TikaHSSFListener(XHTMLContentHandler handler, Locale locale, AbstractPOIFSExtractor extractor) { + this.handler = handler; + this.extractor = extractor; + this.format = NumberFormat.getInstance(locale); + this.formatListener = new FormatTrackingHSSFListener(this, locale); + } + + /** + * Entry point to listener to start the processing of a file. + * + * @param filesystem POI file system. + * @param listenForAllRecords sets whether the listener is configured to listen + * for all records types or not. + * @throws IOException on any IO errors. + * @throws SAXException on any SAX parsing errors. + */ + public void processFile(NPOIFSFileSystem filesystem, boolean listenForAllRecords) + throws IOException, SAXException, TikaException { + processFile(filesystem.getRoot(), listenForAllRecords); + } + + public void processFile(DirectoryNode root, boolean listenForAllRecords) + throws IOException, SAXException, TikaException { + + // Set up listener and register the records we want to process + HSSFRequest hssfRequest = new HSSFRequest(); + if (listenForAllRecords) { + hssfRequest.addListenerForAllRecords(formatListener); + } else { + hssfRequest.addListener(formatListener, BOFRecord.sid); + hssfRequest.addListener(formatListener, EOFRecord.sid); + hssfRequest.addListener(formatListener, DateWindow1904Record.sid); + hssfRequest.addListener(formatListener, CountryRecord.sid); + hssfRequest.addListener(formatListener, BoundSheetRecord.sid); + hssfRequest.addListener(formatListener, SSTRecord.sid); + hssfRequest.addListener(formatListener, FormulaRecord.sid); + hssfRequest.addListener(formatListener, LabelRecord.sid); + hssfRequest.addListener(formatListener, LabelSSTRecord.sid); + hssfRequest.addListener(formatListener, NumberRecord.sid); + hssfRequest.addListener(formatListener, RKRecord.sid); + hssfRequest.addListener(formatListener, StringRecord.sid); + hssfRequest.addListener(formatListener, HyperlinkRecord.sid); + hssfRequest.addListener(formatListener, TextObjectRecord.sid); + hssfRequest.addListener(formatListener, SeriesTextRecord.sid); + hssfRequest.addListener(formatListener, FormatRecord.sid); + hssfRequest.addListener(formatListener, ExtendedFormatRecord.sid); + hssfRequest.addListener(formatListener, DrawingGroupRecord.sid); + hssfRequest.addListener(formatListener, HeaderRecord.sid); + hssfRequest.addListener(formatListener, FooterRecord.sid); + } + + // Create event factory and process Workbook (fire events) + DocumentInputStream documentInputStream = root.createDocumentInputStream(WORKBOOK_ENTRY); + HSSFEventFactory eventFactory = new HSSFEventFactory(); + try { + eventFactory.processEvents(hssfRequest, documentInputStream); + } catch (org.apache.poi.EncryptedDocumentException e) { + throw new EncryptedDocumentException(e); + } + + // Output any extra text that came after all the sheets + processExtraText(); + + // Look for embeded images, now that the drawing records + // have been fully matched with their continue data + for (DrawingGroupRecord dgr : drawingGroups) { + dgr.decode(); + findPictures(dgr.getEscherRecords()); + } + } + + /** + * Process a HSSF record. + * + * @param record HSSF Record + */ + public void processRecord(Record record) { + if (exception == null) { + try { + internalProcessRecord(record); + } catch (TikaException te) { + exception = te; + } catch (IOException ie) { + exception = ie; + } catch (SAXException se) { + exception = se; + } + } + } + + public void throwStoredException() throws TikaException, SAXException, IOException { + if (exception != null) { + if (exception instanceof IOException) + throw (IOException) exception; + if (exception instanceof SAXException) + throw (SAXException) exception; + if (exception instanceof TikaException) + throw (TikaException) exception; + throw new TikaException(exception.getMessage()); + } + } + + private void internalProcessRecord(Record record) throws SAXException, TikaException, IOException { + switch (record.getSid()) { + case BOFRecord.sid: // start of workbook, worksheet etc. records + BOFRecord bof = (BOFRecord) record; + if (bof.getType() == BOFRecord.TYPE_WORKBOOK) { + currentSheetIndex = -1; + } else if (bof.getType() == BOFRecord.TYPE_CHART) { + if (previousSid == EOFRecord.sid) { + // This is a sheet which contains only a chart + newSheet(); + } else { + // This is a chart within a normal sheet + // Handling of this is a bit hacky... + if (currentSheet != null) { + processSheet(); + currentSheetIndex--; + newSheet(); + } + } + } else if (bof.getType() == BOFRecord.TYPE_WORKSHEET) { + newSheet(); + } + break; + + case EOFRecord.sid: // end of workbook, worksheet etc. records + if (currentSheet != null) { + processSheet(); + } + currentSheet = null; + break; + + case BoundSheetRecord.sid: // Worksheet index record + BoundSheetRecord boundSheetRecord = (BoundSheetRecord) record; + sheetNames.add(boundSheetRecord.getSheetname()); + break; + + case SSTRecord.sid: // holds all the strings for LabelSSTRecords + sstRecord = (SSTRecord) record; + break; + + case FormulaRecord.sid: // Cell value from a formula + FormulaRecord formula = (FormulaRecord) record; + if (formula.hasCachedResultString()) { + // The String itself should be the next record + stringFormulaRecord = formula; + } else { + addTextCell(record, formatListener.formatNumberDateCell(formula)); + } + break; + + case StringRecord.sid: + if (previousSid == FormulaRecord.sid) { + // Cached string value of a string formula + StringRecord sr = (StringRecord) record; + addTextCell(stringFormulaRecord, sr.getString()); + } else { + // Some other string not associated with a cell, skip + } + break; + + case LabelRecord.sid: // strings stored directly in the cell + LabelRecord label = (LabelRecord) record; + addTextCell(record, label.getValue()); + break; + + case LabelSSTRecord.sid: // Ref. a string in the shared string table + LabelSSTRecord sst = (LabelSSTRecord) record; + UnicodeString unicode = sstRecord.getString(sst.getSSTIndex()); + addTextCell(record, unicode.getString()); + break; + + case NumberRecord.sid: // Contains a numeric cell value + NumberRecord number = (NumberRecord) record; + addTextCell(record, formatListener.formatNumberDateCell(number)); + break; + + case RKRecord.sid: // Excel internal number record + RKRecord rk = (RKRecord) record; + addCell(record, new NumberCell(rk.getRKNumber(), format)); + break; + + case HyperlinkRecord.sid: // holds a URL associated with a cell + if (currentSheet != null) { + HyperlinkRecord link = (HyperlinkRecord) record; + Point point = + new Point(link.getFirstColumn(), link.getFirstRow()); + Cell cell = currentSheet.get(point); + if (cell != null) { + String address = link.getAddress(); + if (address != null) { + addCell(record, new LinkedCell(cell, address)); + } else { + addCell(record, cell); + } + } + } + break; + + case TextObjectRecord.sid: + TextObjectRecord tor = (TextObjectRecord) record; + addTextCell(record, tor.getStr().getString()); + break; + + case SeriesTextRecord.sid: // Chart label or title + SeriesTextRecord str = (SeriesTextRecord) record; + addTextCell(record, str.getText()); + break; + + case DrawingGroupRecord.sid: + // Collect this now, we'll process later when all + // the continue records are in + drawingGroups.add((DrawingGroupRecord) record); + break; + + case HeaderRecord.sid: + HeaderRecord headerRecord = (HeaderRecord) record; + addTextCell(record, headerRecord.getText()); + break; + + case FooterRecord.sid: + FooterRecord footerRecord = (FooterRecord) record; + addTextCell(record, footerRecord.getText()); + break; + + } + + previousSid = record.getSid(); + + if (stringFormulaRecord != record) { + stringFormulaRecord = null; + } + } + + private void processExtraText() throws SAXException { + if (extraTextCells.size() > 0) { + for (Cell cell : extraTextCells) { + handler.startElement("div", "class", "outside"); + cell.render(handler); + handler.endElement("div"); + } + + // Reset + extraTextCells.clear(); + } + } + + /** + * Adds the given cell (unless <code>null</code>) to the current + * worksheet (if any) at the position (if any) of the given record. + * + * @param record record that holds the cell value + * @param cell cell value (or <code>null</code>) + */ + private void addCell(Record record, Cell cell) throws SAXException { + if (cell == null) { + // Ignore empty cells + } else if (currentSheet != null + && record instanceof CellValueRecordInterface) { + // Normal cell inside a worksheet + CellValueRecordInterface value = + (CellValueRecordInterface) record; + Point point = new Point(value.getColumn(), value.getRow()); + currentSheet.put(point, cell); + } else { + // Cell outside the worksheets + extraTextCells.add(cell); + } + } + + /** + * Adds a text cell with the given text comment. The given text + * is trimmed, and ignored if <code>null</code> or empty. + * + * @param record record that holds the text value + * @param text text content, may be <code>null</code> + * @throws SAXException + */ + private void addTextCell(Record record, String text) throws SAXException { + if (text != null) { + text = text.trim(); + if (text.length() > 0) { + addCell(record, new TextCell(text)); + } + } + } + + private void newSheet() { + currentSheetIndex++; + currentSheet = new TreeMap<Point, Cell>(new PointComparator()); + } + + /** + * Process an excel sheet. + * + * @throws SAXException if an error occurs + */ + private void processSheet() throws SAXException { + // Sheet Start + handler.startElement("div", "class", "page"); + if (currentSheetIndex < sheetNames.size()) { + handler.element("h1", sheetNames.get(currentSheetIndex)); + } + handler.startElement("table"); + handler.startElement("tbody"); + + // Process Rows + int currentRow = 0; + int currentColumn = 0; + handler.startElement("tr"); + handler.startElement("td"); + for (Map.Entry<Point, Cell> entry : currentSheet.entrySet()) { + while (currentRow < entry.getKey().y) { + handler.endElement("td"); + handler.endElement("tr"); + handler.startElement("tr"); + handler.startElement("td"); + currentRow++; + currentColumn = 0; + } + + while (currentColumn < entry.getKey().x) { + handler.endElement("td"); + handler.startElement("td"); + currentColumn++; + } + + entry.getValue().render(handler); + } + handler.endElement("td"); + handler.endElement("tr"); + + // Sheet End + handler.endElement("tbody"); + handler.endElement("table"); + + // Finish up + processExtraText(); + handler.endElement("div"); + } + + private void findPictures(List<EscherRecord> records) throws IOException, SAXException, TikaException { + for (EscherRecord escherRecord : records) { + if (escherRecord instanceof EscherBSERecord) { + EscherBlipRecord blip = ((EscherBSERecord) escherRecord).getBlipRecord(); + if (blip != null) { + HSSFPictureData picture = new HSSFPictureData(blip); + String mimeType = picture.getMimeType(); + TikaInputStream stream = TikaInputStream.get(picture.getData()); + + // Handle the embeded resource + extractor.handleEmbeddedResource( + stream, null, null, mimeType, + handler, true + ); + } + } + + // Recursive call. + findPictures(escherRecord.getChildRecords()); + } + } + } + + /** + * Utility comparator for points. + */ + private static class PointComparator implements Comparator<Point> { + + public int compare(Point a, Point b) { + int diff = a.y - b.y; + if (diff == 0) { + diff = a.x - b.x; + } + return diff; + } + + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,366 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import java.io.IOException; +import java.util.HashSet; +import java.util.List; + +import org.apache.poi.hslf.model.Comment; +import org.apache.poi.hslf.model.HeadersFooters; +import org.apache.poi.hslf.model.OLEShape; +import org.apache.poi.hslf.usermodel.HSLFMasterSheet; +import org.apache.poi.hslf.usermodel.HSLFNotes; +import org.apache.poi.hslf.usermodel.HSLFObjectData; +import org.apache.poi.hslf.usermodel.HSLFPictureData; +import org.apache.poi.hslf.usermodel.HSLFShape; +import org.apache.poi.hslf.usermodel.HSLFSlide; +import org.apache.poi.hslf.usermodel.HSLFSlideShow; +import org.apache.poi.hslf.usermodel.HSLFTable; +import org.apache.poi.hslf.usermodel.HSLFTableCell; +import org.apache.poi.hslf.usermodel.HSLFTextParagraph; +import org.apache.poi.hslf.usermodel.HSLFTextRun; +import org.apache.poi.hslf.usermodel.HSLFTextShape; +import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +public class HSLFExtractor extends AbstractPOIFSExtractor { + public HSLFExtractor(ParseContext context) { + super(context); + } + + protected void parse( + NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml) + throws IOException, SAXException, TikaException { + parse(filesystem.getRoot(), xhtml); + } + + protected void parse( + DirectoryNode root, XHTMLContentHandler xhtml) + throws IOException, SAXException, TikaException { + HSLFSlideShow ss = new HSLFSlideShow(root); + List<HSLFSlide> _slides = ss.getSlides(); + + xhtml.startElement("div", "class", "slideShow"); + + /* Iterate over slides and extract text */ + for (HSLFSlide slide : _slides) { + xhtml.startElement("div", "class", "slide"); + + // Slide header, if present + HeadersFooters hf = slide.getHeadersFooters(); + if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != null) { + xhtml.startElement("p", "class", "slide-header"); + + xhtml.characters(hf.getHeaderText()); + + xhtml.endElement("p"); + } + + // Slide master, if present + extractMaster(xhtml, slide.getMasterSheet()); + + // Slide text + { + xhtml.startElement("div", "class", "slide-content"); + + textRunsToText(xhtml, slide.getTextParagraphs()); + + xhtml.endElement("div"); + } + + // Table text + for (HSLFShape shape : slide.getShapes()) { + if (shape instanceof HSLFTable) { + extractTableText(xhtml, (HSLFTable) shape); + } + } + + // Slide footer, if present + if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) { + xhtml.startElement("p", "class", "slide-footer"); + + xhtml.characters(hf.getFooterText()); + + xhtml.endElement("p"); + } + + // Comments, if present + StringBuilder authorStringBuilder = new StringBuilder(); + for (Comment comment : slide.getComments()) { + authorStringBuilder.setLength(0); + xhtml.startElement("p", "class", "slide-comment"); + + if (comment.getAuthor() != null) { + authorStringBuilder.append(comment.getAuthor()); + } + if (comment.getAuthorInitials() != null) { + if (authorStringBuilder.length() > 0) { + authorStringBuilder.append(" "); + } + authorStringBuilder.append("("+comment.getAuthorInitials()+")"); + } + if (authorStringBuilder.length() > 0) { + if (comment.getText() != null) { + authorStringBuilder.append(" - "); + } + xhtml.startElement("b"); + xhtml.characters(authorStringBuilder.toString()); + xhtml.endElement("b"); + } + if (comment.getText() != null) { + xhtml.characters(comment.getText()); + } + xhtml.endElement("p"); + } + + // Now any embedded resources + handleSlideEmbeddedResources(slide, xhtml); + + // TODO Find the Notes for this slide and extract inline + + // Slide complete + xhtml.endElement("div"); + } + + // All slides done + xhtml.endElement("div"); + + /* notes */ + xhtml.startElement("div", "class", "slide-notes"); + HashSet<Integer> seenNotes = new HashSet<>(); + HeadersFooters hf = ss.getNotesHeadersFooters(); + + for (HSLFSlide slide : _slides) { + HSLFNotes notes = slide.getNotes(); + if (notes == null) { + continue; + } + Integer id = notes._getSheetNumber(); + if (seenNotes.contains(id)) { + continue; + } + seenNotes.add(id); + + // Repeat the Notes header, if set + if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != null) { + xhtml.startElement("p", "class", "slide-note-header"); + xhtml.characters(hf.getHeaderText()); + xhtml.endElement("p"); + } + + // Notes text + textRunsToText(xhtml, notes.getTextParagraphs()); + + // Repeat the notes footer, if set + if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) { + xhtml.startElement("p", "class", "slide-note-footer"); + xhtml.characters(hf.getFooterText()); + xhtml.endElement("p"); + } + } + + handleSlideEmbeddedPictures(ss, xhtml); + + xhtml.endElement("div"); + } + + private void extractMaster(XHTMLContentHandler xhtml, HSLFMasterSheet master) throws SAXException { + if (master == null) { + return; + } + List<HSLFShape> shapes = master.getShapes(); + if (shapes == null || shapes.isEmpty()) { + return; + } + + xhtml.startElement("div", "class", "slide-master-content"); + for (HSLFShape shape : shapes) { + if (shape != null && !HSLFMasterSheet.isPlaceholder(shape)) { + if (shape instanceof HSLFTextShape) { + HSLFTextShape tsh = (HSLFTextShape) shape; + String text = tsh.getText(); + if (text != null) { + xhtml.element("p", text); + } + } + } + } + xhtml.endElement("div"); + } + + private void extractTableText(XHTMLContentHandler xhtml, HSLFTable shape) throws SAXException { + xhtml.startElement("table"); + for (int row = 0; row < shape.getNumberOfRows(); row++) { + xhtml.startElement("tr"); + for (int col = 0; col < shape.getNumberOfColumns(); col++) { + HSLFTableCell cell = shape.getCell(row, col); + //insert empty string for empty cell if cell is null + String txt = ""; + if (cell != null) { + txt = cell.getText(); + } + xhtml.element("td", txt); + } + xhtml.endElement("tr"); + } + xhtml.endElement("table"); + } + + private void textRunsToText(XHTMLContentHandler xhtml, List<List<HSLFTextParagraph>> paragraphsList) throws SAXException { + if (paragraphsList == null) { + return; + } + + for (List<HSLFTextParagraph> run : paragraphsList) { + // Leaving in wisdom from TIKA-712 for easy revert. + // Avoid boiler-plate text on the master slide (0 + // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE): + //if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) { + + boolean isBullet = false; + for (HSLFTextParagraph htp : run) { + boolean nextBullet = htp.isBullet(); + // TODO: identify bullet/list type + if (isBullet != nextBullet) { + isBullet = nextBullet; + if (isBullet) { + xhtml.startElement("ul"); + } else { + xhtml.endElement("ul"); + } + } + + List<HSLFTextRun> textRuns = htp.getTextRuns(); + String firstLine = removePBreak(textRuns.get(0).getRawText()); + boolean showBullet = (isBullet && (textRuns.size() > 1 || !"".equals(firstLine))); + String paraTag = showBullet ? "li" : "p"; + + xhtml.startElement(paraTag); + for (HSLFTextRun htr : textRuns) { + String line = htr.getRawText(); + if (line != null) { + boolean isfirst = true; + for (String fragment : line.split("\\u000b")) { + if (!isfirst) { + xhtml.startElement("br"); + xhtml.endElement("br"); + } + isfirst = false; + xhtml.characters(removePBreak(fragment)); + } + if (line.endsWith("\u000b")) { + xhtml.startElement("br"); + xhtml.endElement("br"); + } + } + } + xhtml.endElement(paraTag); + } + if (isBullet) { + xhtml.endElement("ul"); + } + } + } + + // remove trailing paragraph break + private static String removePBreak(String fragment) { + // the last text run of a text paragraph contains the paragraph break (\r) + // line breaks (\\u000b) can happen more often + return fragment.replaceFirst("\\r$", ""); + } + + private void handleSlideEmbeddedPictures(HSLFSlideShow slideshow, XHTMLContentHandler xhtml) + throws TikaException, SAXException, IOException { + for (HSLFPictureData pic : slideshow.getPictureData()) { + String mediaType; + + switch (pic.getType()) { + case EMF: + mediaType = "application/x-emf"; + break; + case WMF: + mediaType = "application/x-msmetafile"; + break; + case DIB: + mediaType = "image/bmp"; + break; + default: + mediaType = pic.getContentType(); + break; + } + + handleEmbeddedResource( + TikaInputStream.get(pic.getData()), null, null, + mediaType, xhtml, false); + } + } + + private void handleSlideEmbeddedResources(HSLFSlide slide, XHTMLContentHandler xhtml) + throws TikaException, SAXException, IOException { + List<HSLFShape> shapes; + try { + shapes = slide.getShapes(); + } catch (NullPointerException e) { + // Sometimes HSLF hits problems + // Please open POI bugs for any you come across! + return; + } + + for (HSLFShape shape : shapes) { + if (shape instanceof OLEShape) { + OLEShape oleShape = (OLEShape) shape; + HSLFObjectData data = null; + try { + data = oleShape.getObjectData(); + } catch (NullPointerException e) { + /* getObjectData throws NPE some times. */ + } + + if (data != null) { + String objID = Integer.toString(oleShape.getObjectID()); + + // Embedded Object: add a <div + // class="embedded" id="X"/> so consumer can see where + // in the main text each embedded document + // occurred: + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "class", "class", "CDATA", "embedded"); + attributes.addAttribute("", "id", "id", "CDATA", objID); + xhtml.startElement("div", attributes); + xhtml.endElement("div"); + + try (TikaInputStream stream = TikaInputStream.get(data.getData())) { + String mediaType = null; + if ("Excel.Chart.8".equals(oleShape.getProgID())) { + mediaType = "application/vnd.ms-excel"; + } + handleEmbeddedResource( + stream, objID, objID, + mediaType, xhtml, false); + } + } + } + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,345 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.microsoft; + + +import static java.nio.charset.StandardCharsets.UTF_8; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.math.BigDecimal; +import java.text.DateFormat; +import java.text.NumberFormat; +import java.util.Date; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import java.util.Set; + +import com.healthmarketscience.jackcess.Column; +import com.healthmarketscience.jackcess.DataType; +import com.healthmarketscience.jackcess.Database; +import com.healthmarketscience.jackcess.PropertyMap; +import com.healthmarketscience.jackcess.Row; +import com.healthmarketscience.jackcess.Table; +import com.healthmarketscience.jackcess.query.Query; +import com.healthmarketscience.jackcess.util.OleBlob; +import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.OfficeOpenXMLExtended; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.html.HtmlParser; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.SAXException; + +/** + * Internal class. Needs to be instantiated for each parse because of + * the lack of thread safety with the dateTimeFormatter + */ +class JackcessExtractor extends AbstractPOIFSExtractor { + + final static String TITLE_PROP_KEY = "Title"; + final static String AUTHOR_PROP_KEY = "Author"; + final static String COMPANY_PROP_KEY = "Company"; + + final static String TEXT_FORMAT_KEY = "TextFormat"; + final static String CURRENCY_FORMAT_KEY = "Format"; + final static byte TEXT_FORMAT = 0; + final static byte RICH_TEXT_FORMAT = 1; + final static ParseContext EMPTY_PARSE_CONTEXT = new ParseContext(); + + final NumberFormat currencyFormatter; + final DateFormat shortDateTimeFormatter; + + final HtmlParser htmlParser = new HtmlParser(); + + protected JackcessExtractor(ParseContext context, Locale locale) { + super(context); + currencyFormatter = NumberFormat.getCurrencyInstance(locale); + shortDateTimeFormatter = DateFormat.getDateInstance(DateFormat.SHORT, locale); + } + + public void parse(Database db, XHTMLContentHandler xhtml, Metadata metadata) throws IOException, SAXException, TikaException { + + + String pw = db.getDatabasePassword(); + if (pw != null) { + metadata.set(JackcessParser.MDB_PW, pw); + } + + PropertyMap dbp = db.getDatabaseProperties(); + for (PropertyMap.Property p : dbp) { + metadata.add(JackcessParser.MDB_PROPERTY_PREFIX + p.getName(), + toString(p.getValue(), p.getType())); + } + + PropertyMap up = db.getUserDefinedProperties(); + for (PropertyMap.Property p : up) { + metadata.add(JackcessParser.USER_DEFINED_PROPERTY_PREFIX+ p.getName(), + toString(p.getValue(), p.getType())); + } + + Set<String> found = new HashSet<>(); + PropertyMap summaryProperties = db.getSummaryProperties(); + if (summaryProperties != null) { + //try to get core properties + PropertyMap.Property title = summaryProperties.get(TITLE_PROP_KEY); + if (title != null) { + metadata.set(TikaCoreProperties.TITLE, toString(title.getValue(), title.getType())); + found.add(title.getName()); + } + PropertyMap.Property author = summaryProperties.get(AUTHOR_PROP_KEY); + if (author != null && author.getValue() != null) { + String authorString = toString(author.getValue(), author.getType()); + SummaryExtractor.addMulti(metadata, TikaCoreProperties.CREATOR, authorString); + found.add(author.getName()); + } + PropertyMap.Property company = summaryProperties.get(COMPANY_PROP_KEY); + if (company != null) { + metadata.set(OfficeOpenXMLExtended.COMPANY, toString(company.getValue(), company.getType())); + found.add(company.getName()); + } + + for (PropertyMap.Property p : db.getSummaryProperties()) { + if (! found.contains(p.getName())) { + metadata.add(JackcessParser.SUMMARY_PROPERTY_PREFIX + p.getName(), + toString(p.getValue(), p.getType())); + } + } + + } + + Iterator<Table> it = db.newIterable(). + setIncludeLinkedTables(false). + setIncludeSystemTables(false).iterator(); + + while (it.hasNext()) { + Table table = it.next(); + String tableName = table.getName(); + List<? extends Column> columns = table.getColumns(); + xhtml.startElement("table", "name", tableName); + addHeaders(columns, xhtml); + xhtml.startElement("tbody"); + + Row r = table.getNextRow(); + + while (r != null) { + xhtml.startElement("tr"); + for (Column c : columns) { + handleCell(r, c, xhtml); + } + xhtml.endElement("tr"); + r = table.getNextRow(); + } + xhtml.endElement("tbody"); + xhtml.endElement("table"); + } + + for (Query q : db.getQueries()) { + xhtml.startElement("div", "type", "sqlQuery"); + xhtml.characters(q.toSQLString()); + xhtml.endElement("div"); + } + } + + private void addHeaders(List<? extends Column> columns, XHTMLContentHandler xhtml) throws SAXException { + xhtml.startElement("thead"); + xhtml.startElement("tr"); + for (Column c : columns) { + xhtml.startElement("th"); + xhtml.characters(c.getName()); + xhtml.endElement("th"); + } + xhtml.endElement("tr"); + xhtml.endElement("thead"); + + } + + private void handleCell(Row r, Column c, XHTMLContentHandler handler) + throws SAXException, IOException, TikaException { + + handler.startElement("td"); + if (c.getType().equals(DataType.OLE)) { + handleOLE(r, c.getName(), handler); + } else if (c.getType().equals(DataType.BINARY)) { + Object obj = r.get(c.getName()); + if (obj != null) { + byte[] bytes = (byte[])obj; + handleEmbeddedResource( + TikaInputStream.get(bytes), + null,//filename + null,//relationshipId + null,//mediatype + handler, false); + } + } else { + Object obj = r.get(c.getName()); + String v = toString(obj, c.getType()); + if (isRichText(c)) { + BodyContentHandler h = new BodyContentHandler(); + Metadata m = new Metadata(); + m.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8"); + try { + htmlParser.parse(new ByteArrayInputStream(v.getBytes(UTF_8)), + h, + m, EMPTY_PARSE_CONTEXT); + handler.characters(h.toString()); + } catch (SAXException e) { + //if something went wrong in htmlparser, just append the characters + handler.characters(v); + } + } else { + handler.characters(v); + } + } + handler.endElement("td"); + } + + private boolean isRichText(Column c) throws IOException { + + if (c == null) { + return false; + } + + PropertyMap m = c.getProperties(); + if (m == null) { + return false; + } + if (c.getType() == null || ! c.getType().equals(DataType.MEMO)) { + return false; + } + Object b = m.getValue(TEXT_FORMAT_KEY); + if (b instanceof Byte) { + if (((Byte)b).byteValue() == RICH_TEXT_FORMAT) { + return true; + } + } + return false; + } + + private String toString(Object value, DataType type) { + if (value == null) { + return ""; + } + if (type == null) { + //this shouldn't happen + return value.toString(); + } + switch (type) { + case LONG: + return Integer.toString((Integer)value); + case TEXT: + return (String)value; + case MONEY: + //TODO: consider getting parsing "Format" field from + //field properties. + return formatCurrency(((BigDecimal)value).doubleValue(), type); + case SHORT_DATE_TIME: + return formatShortDateTime((Date)value); + case BOOLEAN: + return Boolean.toString((Boolean) value); + case MEMO: + return (String)value; + case INT: + return Short.toString((Short)value); + case DOUBLE: + return Double.toString((Double)value); + case FLOAT: + return Float.toString((Float)value); + case NUMERIC: + return value.toString(); + case BYTE: + return Byte.toString((Byte)value); + case GUID: + return value.toString(); + case COMPLEX_TYPE: //skip all these + case UNKNOWN_0D: + case UNKNOWN_11: + case UNSUPPORTED_FIXEDLEN: + case UNSUPPORTED_VARLEN: + default: + return ""; + + } + } + + private void handleOLE(Row row, String cName, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { + OleBlob blob = row.getBlob(cName); + //lifted shamelessly from Jackcess's OleBlobTest + if (blob == null) + return; + + OleBlob.Content content = blob.getContent(); + if (content == null) + return; + + switch (content.getType()) { + case LINK: + xhtml.characters(((OleBlob.LinkContent) content).getLinkPath()); + break; + case SIMPLE_PACKAGE: + OleBlob.SimplePackageContent spc = (OleBlob.SimplePackageContent) content; + + handleEmbeddedResource( + TikaInputStream.get(spc.getStream()), + spc.getFileName(),//filename + null,//relationshipId + spc.getTypeName(),//mediatype + xhtml, false); + break; + case OTHER: + OleBlob.OtherContent oc = (OleBlob.OtherContent) content; + handleEmbeddedResource( + TikaInputStream.get(oc.getStream()), + null,//filename + null,//relationshipId + oc.getTypeName(),//mediatype + xhtml, false); + break; + case COMPOUND_STORAGE: + OleBlob.CompoundContent cc = (OleBlob.CompoundContent) content; + handleCompoundContent(cc, xhtml); + break; + } + } + + private void handleCompoundContent(OleBlob.CompoundContent cc, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { + NPOIFSFileSystem nfs = new NPOIFSFileSystem(cc.getStream()); + handleEmbeddedOfficeDoc(nfs.getRoot(), xhtml); + } + + String formatCurrency(Double d, DataType type) { + if (d == null) { + return ""; + } + return currencyFormatter.format(d); + } + + String formatShortDateTime(Date d) { + if (d == null) { + return ""; + } + return shortDateTimeFormatter.format(d); + } +} +
