Fix for TIKA-1876, contributed by manalishah
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/7ebe007e Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/7ebe007e Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/7ebe007e Branch: refs/heads/master Commit: 7ebe007ec03088449f67619ef1e6cb564178b14b Parents: a13369b Author: manali <[email protected]> Authored: Fri Feb 26 18:36:02 2016 -0800 Committer: manali <[email protected]> Committed: Fri Feb 26 18:36:02 2016 -0800 ---------------------------------------------------------------------- CHANGES.txt | 2 ++ .../src/main/java/org/apache/tika/mime/MimeType.java | 1 - .../org/apache/tika/mime/tika-mimetypes.xml | 13 ++++++------- .../tika/parser/microsoft/ooxml/XWPFListManager.java | 4 ++++ .../org/apache/tika/parser/ner/NERecogniser.java | 2 -- .../apache/tika/server/RichTextContentHandler.java | 15 +++++++++++++-- .../apache/tika/server/resource/TikaResource.java | 2 +- .../tika/server/resource/UnpackerResource.java | 2 +- 8 files changed, 27 insertions(+), 14 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/7ebe007e/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index bb30540..0ffc69f 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -9,6 +9,8 @@ Release 1.13 - ??? * Upgrade to Jackson 2.7.1 (TIKA-1869). + * RichTextContentHandler moved from the Server package to Core (TIKA-1870). 
+ Release 1.12 - 01/24/2016 * Support for iFrames and element link extraction is provided in http://git-wip-us.apache.org/repos/asf/tika/blob/7ebe007e/tika-core/src/main/java/org/apache/tika/mime/MimeType.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeType.java b/tika-core/src/main/java/org/apache/tika/mime/MimeType.java index fc520cf..b4d651e 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/MimeType.java +++ b/tika-core/src/main/java/org/apache/tika/mime/MimeType.java @@ -270,7 +270,6 @@ public final class MimeType implements Comparable<MimeType>, Serializable { } } - void addMagic(Magic magic) { if (magic == null) { return; http://git-wip-us.apache.org/repos/asf/tika/blob/7ebe007e/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml ---------------------------------------------------------------------- diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index 5bb30fc..95f41e6 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -38,12 +38,6 @@ --> <mime-info> - <mime-type type="application/dicom"> - <_comment>DICOM medical imaging data</_comment> - <magic priority="50"> - <match value="DICM" type="string" offset="128"/> - </magic> - </mime-type> <mime-type type="application/activemessage"/> <mime-type type="application/andrew-inset"> <glob pattern="*.ez"/> @@ -118,7 +112,12 @@ <mime-type type="application/dec-dx"/> <mime-type type="application/dialog-info+xml"/> - + <mime-type type="application/dicom"> + <_comment>DICOM medical imaging data</_comment> + <magic priority="50"> + <match value="DICM" type="string" offset="128"/> + </magic> + </mime-type> <mime-type type="application/dita+xml"> <sub-class-of type="application/xml"/> 
http://git-wip-us.apache.org/repos/asf/tika/blob/7ebe007e/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java index 5654378..a938c2f 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java @@ -57,6 +57,10 @@ public class XWPFListManager extends AbstractListManager { * @return the formatted number or an empty string if something went wrong */ public String getFormattedNumber(final XWPFParagraph paragraph) { + if (numbering == null) { + return ""; + } + int currNumId = paragraph.getNumID().intValue(); XWPFNum xwpfNum = numbering.getNum(paragraph.getNumID()); if (xwpfNum == null) { http://git-wip-us.apache.org/repos/asf/tika/blob/7ebe007e/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java index 3bebff2..c4693eb 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java @@ -36,8 +36,6 @@ public interface NERecogniser { String DATE = "DATE"; String PERCENT = "PERCENT"; String MONEY = "MONEY"; - String FACILITY = "FACILITY"; - String GPE = "GPE"; /** * checks if this Named Entity recogniser is available for service http://git-wip-us.apache.org/repos/asf/tika/blob/7ebe007e/tika-server/src/main/java/org/apache/tika/server/RichTextContentHandler.java 
---------------------------------------------------------------------- diff --git a/tika-server/src/main/java/org/apache/tika/server/RichTextContentHandler.java b/tika-server/src/main/java/org/apache/tika/server/RichTextContentHandler.java index 81095a7..8fcc4d5 100644 --- a/tika-server/src/main/java/org/apache/tika/server/RichTextContentHandler.java +++ b/tika-server/src/main/java/org/apache/tika/server/RichTextContentHandler.java @@ -15,15 +15,26 @@ * limitations under the License. */ -package org.apache.tika.server; +package org.apache.tika.sax; import java.io.Writer; -import org.apache.tika.sax.WriteOutContentHandler; import org.xml.sax.Attributes; import org.xml.sax.SAXException; +/** + * Content handler for Rich Text, it will extract XHTML <img/> + * tag <alt/> attribute and XHTML <a/> tag <name/> + * attribute into the output. + */ public class RichTextContentHandler extends WriteOutContentHandler { + + /** + * Creates a content handler that writes XHTML body character events to + * the given writer. 
+ * + * @param writer writer + */ public RichTextContentHandler(Writer writer) { super(writer); } http://git-wip-us.apache.org/repos/asf/tika/blob/7ebe007e/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java ---------------------------------------------------------------------- diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java index d74ef74..566203a 100644 --- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java +++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java @@ -72,7 +72,7 @@ import org.apache.tika.parser.ocr.TesseractOCRConfig; import org.apache.tika.parser.pdf.PDFParserConfig; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ExpandedTitleContentHandler; -import org.apache.tika.server.RichTextContentHandler; +import org.apache.tika.sax.RichTextContentHandler; import org.apache.tika.server.TikaServerParseException; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; http://git-wip-us.apache.org/repos/asf/tika/blob/7ebe007e/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java ---------------------------------------------------------------------- diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java index cf3a0e9..8ee516e 100644 --- a/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java +++ b/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java @@ -58,7 +58,7 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.microsoft.OfficeParser; import org.apache.tika.sax.BodyContentHandler; -import org.apache.tika.server.RichTextContentHandler; +import 
org.apache.tika.sax.RichTextContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler;
