[
https://issues.apache.org/jira/browse/TIKA-2208?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15761055#comment-15761055
]
Tim Allison edited comment on TIKA-2208 at 12/19/16 12:54 PM:
--------------------------------------------------------------
Three cheers for unit tests!
It looks like we need to add vnd.ms-powerpoint.template.macroenabled.12 to
OOXMLParser's handled media types. I'll make that change shortly.
Meanwhile, you could try something like this, which runs against nearly all of
our test documents:
{noformat}
// Media types handed to ParserDecorator.withTypes for the OOXML parser:
// every entry of OOXMLParser.SUPPORTED_TYPES except the generic
// x-tika-ooxml type, plus the macro-enabled PowerPoint template type.
private static final Set<MediaType> INCLUDES = new HashSet<>();

static {
    final MediaType genericOoxml = MediaType.application("x-tika-ooxml");
    for (MediaType candidate : OOXMLParser.SUPPORTED_TYPES) {
        if (!candidate.equals(genericOoxml)) {
            INCLUDES.add(candidate);
        }
    }
    // Added by hand; the surrounding discussion notes it is missing from
    // OOXMLParser's handled media types.
    INCLUDES.add(MediaType.application("vnd.ms-powerpoint.template.macroenabled.12"));
}
// Media types handed to ParserDecorator.withoutTypes for the OOXML parser:
// an immutable set holding only the generic x-tika-ooxml type.
private static final Set<MediaType> EXCLUDES =
        Collections.singleton(MediaType.application("x-tika-ooxml"));
private static final Parser DECORATED_PARSERS[] = new Parser[] {
// documents
new org.apache.tika.parser.html.HtmlParser(),
new org.apache.tika.parser.rtf.RTFParser(),
new org.apache.tika.parser.pdf.PDFParser(),
new org.apache.tika.parser.txt.TXTParser(),
new org.apache.tika.parser.microsoft.OfficeParser(),
new org.apache.tika.parser.microsoft.OldExcelParser(),
ParserDecorator.withTypes(
ParserDecorator.withoutTypes(
new
org.apache.tika.parser.microsoft.ooxml.OOXMLParser(), EXCLUDES
), INCLUDES),
new org.apache.tika.parser.odf.OpenDocumentParser(),
new org.apache.tika.parser.iwork.IWorkPackageParser(),
new org.apache.tika.parser.xml.DcXMLParser(),
new org.apache.tika.parser.epub.EpubParser(),
};
private static final Parser STANDARD_PARSERS[] = new Parser[] {
// documents
new org.apache.tika.parser.html.HtmlParser(),
new org.apache.tika.parser.rtf.RTFParser(),
new org.apache.tika.parser.pdf.PDFParser(),
new org.apache.tika.parser.txt.TXTParser(),
new org.apache.tika.parser.microsoft.OfficeParser(),
new org.apache.tika.parser.microsoft.OldExcelParser(),
new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(),
new org.apache.tika.parser.odf.OpenDocumentParser(),
new org.apache.tika.parser.iwork.IWorkPackageParser(),
new org.apache.tika.parser.xml.DcXMLParser(),
new org.apache.tika.parser.epub.EpubParser(),
};
// Decorated configuration: an AutoDetectParser over DECORATED_PARSERS and a
// Tika facade built from that parser and its own detector.
private static final AutoDetectParser DECORATED_PARSER_INSTANCE =
        new AutoDetectParser(DECORATED_PARSERS);
private static final Tika DECORATED_TIKA =
        new Tika(DECORATED_PARSER_INSTANCE.getDetector(), DECORATED_PARSER_INSTANCE);

// Standard configuration, built the same way from STANDARD_PARSERS.
private static final AutoDetectParser STANDARD_PARSER_INSTANCE =
        new AutoDetectParser(STANDARD_PARSERS);
private static final Tika STANDARD_TIKA =
        new Tika(STANDARD_PARSER_INSTANCE.getDetector(), STANDARD_PARSER_INSTANCE);
/**
 * Parses every regular file directly under {@code /test-documents} with both
 * the decorated and the standard Tika instance and asserts that the two agree:
 * either both fail, or both succeed with identical extracted text.
 *
 * <p>Files whose name contains "VISIO" and ends in "x" or "m", and files whose
 * name contains "embeddedVsdx", are skipped (presumably because the two
 * configurations are expected to differ on them; confirm).
 *
 * @throws Exception declared for the test framework; parse failures are
 *         recorded and compared rather than propagated
 */
@Test
public void testSkipVisioOOXML() throws Exception {
    File[] candidates = getResourceAsFile("/test-documents").listFiles();
    if (candidates == null) {
        // File.listFiles() returns null when the path is not a directory or
        // cannot be read; fail with a clear message instead of a bare NPE.
        throw new AssertionError("/test-documents could not be listed as a directory");
    }
    for (File f : candidates) {
        if (f.isDirectory()) {
            continue;
        }
        String name = f.getName();
        boolean visioOoxml = name.contains("VISIO")
                && (name.endsWith("x") || name.endsWith("m"));
        if (visioOoxml || name.contains("embeddedVsdx")) {
            continue;
        }

        boolean decoratedEx = false;
        boolean standardEx = false;
        String decoratedOutput = "";
        String standardOutput = "";

        try (InputStream is = TikaInputStream.get(f)) {
            decoratedOutput = DECORATED_TIKA.parseToString(is);
        } catch (Throwable ignored) {
            // Deliberately broad: any failure, Errors included, only counts
            // as "this configuration failed"; the outcomes are compared below.
            decoratedEx = true;
        }
        try (InputStream is = TikaInputStream.get(f)) {
            standardOutput = STANDARD_TIKA.parseToString(is);
        } catch (Throwable ignored) {
            // Same deliberate catch-all as for the decorated run.
            standardEx = true;
        }

        // Both configurations must agree on success vs. failure ...
        assertEquals(name, standardEx, decoratedEx);
        // ... and, when parsing succeeded, on the extracted text.
        if (!standardEx) {
            assertEquals(name, standardOutput, decoratedOutput);
        }
    }
}
{noformat}
was (Author: [email protected]):
Three cheers for unit tests!
It looks like we need to add vnd.ms-powerpoint.template.macroenabled.12 to
OOXMLParser's handled media types. I'll make that change shortly.
Meanwhile, you could try something like this, which runs against nearly all of
our test documents:
{noformat}
// Media types handed to ParserDecorator.withTypes for the OOXML parser:
// every entry of OOXMLParser.SUPPORTED_TYPES except the generic
// x-tika-ooxml type, plus the macro-enabled PowerPoint template type.
private static final Set<MediaType> INCLUDES = new HashSet<>();

static {
    final MediaType genericOoxml = MediaType.application("x-tika-ooxml");
    for (MediaType candidate : OOXMLParser.SUPPORTED_TYPES) {
        if (!candidate.equals(genericOoxml)) {
            INCLUDES.add(candidate);
        }
    }
    // Added by hand; the surrounding discussion notes it is missing from
    // OOXMLParser's handled media types.
    INCLUDES.add(MediaType.application("vnd.ms-powerpoint.template.macroenabled.12"));
}
// Media types handed to ParserDecorator.withoutTypes for the OOXML parser:
// an immutable set holding only the generic x-tika-ooxml type.
private static final Set<MediaType> EXCLUDES =
        Collections.singleton(MediaType.application("x-tika-ooxml"));
private static final Parser DECORATED_PARSERS[] = new Parser[] {
// documents
new org.apache.tika.parser.html.HtmlParser(),
new org.apache.tika.parser.rtf.RTFParser(),
new org.apache.tika.parser.pdf.PDFParser(),
new org.apache.tika.parser.txt.TXTParser(),
new org.apache.tika.parser.microsoft.OfficeParser(),
new org.apache.tika.parser.microsoft.OldExcelParser(),
ParserDecorator.withTypes(ParserDecorator.withoutTypes(
new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(),
EXCLUDES
), INCLUDES),
new org.apache.tika.parser.odf.OpenDocumentParser(),
new org.apache.tika.parser.iwork.IWorkPackageParser(),
new org.apache.tika.parser.xml.DcXMLParser(),
new org.apache.tika.parser.epub.EpubParser(),
};
private static final Parser STANDARD_PARSERS[] = new Parser[] {
// documents
new org.apache.tika.parser.html.HtmlParser(),
new org.apache.tika.parser.rtf.RTFParser(),
new org.apache.tika.parser.pdf.PDFParser(),
new org.apache.tika.parser.txt.TXTParser(),
new org.apache.tika.parser.microsoft.OfficeParser(),
new org.apache.tika.parser.microsoft.OldExcelParser(),
new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(),
new org.apache.tika.parser.odf.OpenDocumentParser(),
new org.apache.tika.parser.iwork.IWorkPackageParser(),
new org.apache.tika.parser.xml.DcXMLParser(),
new org.apache.tika.parser.epub.EpubParser(),
};
// Decorated configuration: an AutoDetectParser over DECORATED_PARSERS and a
// Tika facade built from that parser and its own detector.
private static final AutoDetectParser DECORATED_PARSER_INSTANCE =
        new AutoDetectParser(DECORATED_PARSERS);
private static final Tika DECORATED_TIKA =
        new Tika(DECORATED_PARSER_INSTANCE.getDetector(), DECORATED_PARSER_INSTANCE);

// Standard configuration, built the same way from STANDARD_PARSERS.
private static final AutoDetectParser STANDARD_PARSER_INSTANCE =
        new AutoDetectParser(STANDARD_PARSERS);
private static final Tika STANDARD_TIKA =
        new Tika(STANDARD_PARSER_INSTANCE.getDetector(), STANDARD_PARSER_INSTANCE);
/**
 * Parses every regular file directly under {@code /test-documents} with both
 * the decorated and the standard Tika instance and asserts that the two agree:
 * either both fail, or both succeed with identical extracted text.
 *
 * <p>Files whose name contains "VISIO" and ends in "x" or "m", and files whose
 * name contains "embeddedVsdx", are skipped (presumably because the two
 * configurations are expected to differ on them; confirm).
 *
 * @throws Exception declared for the test framework; parse failures are
 *         recorded and compared rather than propagated
 */
@Test
public void testSkipVisioOOXML() throws Exception {
    File[] candidates = getResourceAsFile("/test-documents").listFiles();
    if (candidates == null) {
        // File.listFiles() returns null when the path is not a directory or
        // cannot be read; fail with a clear message instead of a bare NPE.
        throw new AssertionError("/test-documents could not be listed as a directory");
    }
    for (File f : candidates) {
        if (f.isDirectory()) {
            continue;
        }
        String name = f.getName();
        boolean visioOoxml = name.contains("VISIO")
                && (name.endsWith("x") || name.endsWith("m"));
        if (visioOoxml || name.contains("embeddedVsdx")) {
            continue;
        }

        boolean decoratedEx = false;
        boolean standardEx = false;
        String decoratedOutput = "";
        String standardOutput = "";

        try (InputStream is = TikaInputStream.get(f)) {
            decoratedOutput = DECORATED_TIKA.parseToString(is);
        } catch (Throwable ignored) {
            // Deliberately broad: any failure, Errors included, only counts
            // as "this configuration failed"; the outcomes are compared below.
            decoratedEx = true;
        }
        try (InputStream is = TikaInputStream.get(f)) {
            standardOutput = STANDARD_TIKA.parseToString(is);
        } catch (Throwable ignored) {
            // Same deliberate catch-all as for the decorated run.
            standardEx = true;
        }

        // Both configurations must agree on success vs. failure ...
        assertEquals(name, standardEx, decoratedEx);
        // ... and, when parsing succeeded, on the extracted text.
        if (!standardEx) {
            assertEquals(name, standardOutput, decoratedOutput);
        }
    }
}
{noformat}
> Catch missing libraries
> -----------------------
>
> Key: TIKA-2208
> URL: https://issues.apache.org/jira/browse/TIKA-2208
> Project: Tika
> Issue Type: Improvement
> Components: parser
> Reporter: David Pilato
>
> Hi there
> We have decided to remove support for some formats when using Tika to extract
> text and metadata.
> We defined our list of Parsers:
> {code:java}
> private static final Parser PARSERS[] = new Parser[] {
> // documents
> new org.apache.tika.parser.html.HtmlParser(),
> new org.apache.tika.parser.rtf.RTFParser(),
> new org.apache.tika.parser.pdf.PDFParser(),
> new org.apache.tika.parser.txt.TXTParser(),
> new org.apache.tika.parser.microsoft.OfficeParser(),
> new org.apache.tika.parser.microsoft.OldExcelParser(),
> new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(),
> new org.apache.tika.parser.odf.OpenDocumentParser(),
> new org.apache.tika.parser.iwork.IWorkPackageParser(),
> new org.apache.tika.parser.xml.DcXMLParser(),
> new org.apache.tika.parser.epub.EpubParser(),
> };
> private static final AutoDetectParser PARSER_INSTANCE = new
> AutoDetectParser(PARSERS);
> private static final Tika TIKA_INSTANCE = new
> Tika(PARSER_INSTANCE.getDetector(), PARSER_INSTANCE);
> {code}
> But when an MS Office Word document embeds another, unsupported document
> (like a Visio schema), a {{NoClassDefFoundError}} is raised.
> Would it be possible to catch such a case and throw a {{TikaException}}
> instead, so that it behaves as an Exception and not as a Throwable?
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)