Added: tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,54 @@ +/** + * ****************************************************************************** + * Copyright (C) 2005, International Business Machines Corporation and * + * others. All Rights Reserved. * + * ****************************************************************************** + */ +package org.apache.tika.parser.txt; + +/** + * Abstract class for recognizing a single charset. + * Part of the implementation of ICU's CharsetDetector. + * + * Each specific charset that can be recognized will have an instance + * of some subclass of this class. All interaction between the overall + * CharsetDetector and the stuff specific to an individual charset happens + * via the interface provided here. + * + * Instances of this class DO NOT have or maintain + * state pertaining to a specific match or detect operation. + * They WILL be shared by multiple instances of CharsetDetector. + * They encapsulate const charset-specific information. + * + * @internal + */ +abstract class CharsetRecognizer { + /** + * Get the IANA name of this charset. + * @return the charset name. + */ + abstract String getName(); + + /** + * Get the ISO language code for this charset. + * @return the language code, or <code>null</code> if the language cannot be determined. This base implementation always returns <code>null</code>.
+ */ + public String getLanguage() { + return null; + } + + /** + * Test the match of this charset with the input text data + * which is obtained via the CharsetDetector object. + * + * @param det The CharsetDetector, which contains the input text + * to be checked for being in this charset. + * @return Two values packed into one int: + * <br/> + * bits 0-7: the match confidence, ranging from 0-100 + * <br/> + * bits 8-15: The match reason, an enum-like value. + */ + abstract int match(CharsetDetector det); + +}
Added: tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,74 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.tika.parser.txt; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; + +import org.apache.tika.detect.EncodingDetector; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.utils.CharsetUtils; + +public class Icu4jEncodingDetector implements EncodingDetector { + + public Charset detect(InputStream input, Metadata metadata) + throws IOException { + if (input == null) { + return null; + } + + CharsetDetector detector = new CharsetDetector(); + + String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING); + String incomingType = metadata.get(Metadata.CONTENT_TYPE); + if (incomingCharset == null && incomingType != null) { + // TIKA-341: Use charset in content-type + MediaType mt = MediaType.parse(incomingType); + if (mt != null) { + incomingCharset = mt.getParameters().get("charset"); + } + } + + if (incomingCharset != null) { + String cleaned = CharsetUtils.clean(incomingCharset); + if (cleaned != null) { + detector.setDeclaredEncoding(cleaned); + } else { + // TODO: log a warning? clean() returned null, so no declared encoding is passed to the detector
+ } + } + + // TIKA-341 without enabling input filtering (stripping of tags) + // short HTML tests don't work well + detector.enableInputFilter(true); + + detector.setText(input); + + for (CharsetMatch match : detector.detectAll()) { + try { + return CharsetUtils.forName(match.getName()); + } catch (Exception e) { + // ignore: forName() rejected this candidate, so fall through to the next match + } + } + + return null; + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.tika.parser.txt; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.util.Collections; +import java.util.Set; + +import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.tika.config.ServiceLoader; +import org.apache.tika.detect.AutoDetectReader; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * Plain text parser. The text encoding of the document stream is + * automatically detected based on the byte patterns found at the + * beginning of the stream and the given document metadata, most + * notably the <code>charset</code> parameter of a + * {@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE} value. The decoded text is emitted as a single XHTML <code>p</code> element.
+ * <p/> + * This parser sets the following output metadata entries: + * <dl> + * <dt>{@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE}</dt> + * <dd><code>text/plain; charset=...</code></dd> + * </dl> + */ +public class TXTParser extends AbstractParser { + + /** + * Serial version UID + */ + private static final long serialVersionUID = -6656102320836888910L; + + private static final Set<MediaType> SUPPORTED_TYPES = + Collections.singleton(MediaType.TEXT_PLAIN); + + private static final ServiceLoader LOADER = + new ServiceLoader(TXTParser.class.getClassLoader()); + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + public void parse( + InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + // Automatically detect the character encoding; the caller's stream is wrapped in a CloseShieldInputStream so that closing the reader does not close it + try (AutoDetectReader reader = new AutoDetectReader( + new CloseShieldInputStream(stream), metadata, + context.get(ServiceLoader.class, LOADER))) { + Charset charset = reader.getCharset(); + MediaType type = new MediaType(MediaType.TEXT_PLAIN, charset); + metadata.set(Metadata.CONTENT_TYPE, type.toString()); + // also record the charset name as Content-Encoding; this use is deprecated, see TIKA-431 + metadata.set(Metadata.CONTENT_ENCODING, charset.name()); + + XHTMLContentHandler xhtml = + new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + + xhtml.startElement("p"); + char[] buffer = new char[4096]; + int n = reader.read(buffer); + while (n != -1) { + xhtml.characters(buffer, 0, n); + n = reader.read(buffer); + } + xhtml.endElement("p"); + + xhtml.endDocument(); + } + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingDetector.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingDetector.java?rev=1725014&view=auto
============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingDetector.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingDetector.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.tika.parser.txt; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; + +import org.apache.tika.detect.EncodingDetector; +import org.apache.tika.metadata.Metadata; + +public class UniversalEncodingDetector implements EncodingDetector { + + private static final int BUFSIZE = 1024; + + private static final int LOOKAHEAD = 16 * BUFSIZE; + + public Charset detect(InputStream input, Metadata metadata) + throws IOException { + if (input == null) { + return null; + } + + input.mark(LOOKAHEAD); + try { + UniversalEncodingListener listener = + new UniversalEncodingListener(metadata); + + byte[] b = new byte[BUFSIZE]; + int n = 0; + int m = input.read(b); + while (m != -1 && n < LOOKAHEAD && !listener.isDone()) { + n += m; + listener.handleData(b, 0, m); + m = input.read(b, 0, Math.min(b.length, LOOKAHEAD - n)); + } + + return listener.dataEnd(); + } catch (LinkageError e) { + return null; // juniversalchardet is not available (its classes failed to link), so no charset is reported + } finally { + input.reset(); + } + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.txt; + +import java.nio.charset.Charset; + +import org.apache.tika.detect.TextStatistics; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.utils.CharsetUtils; +import org.mozilla.universalchardet.CharsetListener; +import org.mozilla.universalchardet.Constants; +import org.mozilla.universalchardet.UniversalDetector; + +/** + * Helper class used by {@link UniversalEncodingDetector} to access the + * <code>juniversalchardet</code> detection logic. A reported windows-1252 result is refined using the metadata charset hint or byte statistics.
+ */ +class UniversalEncodingListener implements CharsetListener { + + private static final String CHARSET_ISO_8859_1 = "ISO-8859-1"; + + private static final String CHARSET_ISO_8859_15 = "ISO-8859-15"; + + private final TextStatistics statistics = new TextStatistics(); + + private final UniversalDetector detector = new UniversalDetector(this); + + private String hint = null; + + private Charset charset = null; + + public UniversalEncodingListener(Metadata metadata) { + MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE)); + if (type != null) { + hint = type.getParameters().get("charset"); + } + if (hint == null) { + hint = metadata.get(Metadata.CONTENT_ENCODING); + } + } + + public void report(String name) { + if (Constants.CHARSET_WINDOWS_1252.equals(name)) { + if (hint != null) { + // Use the encoding hint when available + name = hint; + } else if (statistics.count('\r') == 0) { + // If there are no CR(LF)s, then the encoding is more + // likely to be ISO-8859-1(5) than windows-1252 + if (statistics.count(0xa4) > 0) { // currency/euro sign + // The general currency sign is hardly ever used in + // ISO-8859-1, so it's more likely that we're dealing + // with ISO-8859-15, where the character is used for + // the euro symbol, which is more commonly used.
+ name = CHARSET_ISO_8859_15; + } else { + name = CHARSET_ISO_8859_1; + } + } + } + try { + this.charset = CharsetUtils.forName(name); + } catch (Exception e) { + // ignore: the name could not be resolved to a Charset, so the previous value of charset is kept + } + } + + public boolean isDone() { + return detector.isDone(); + } + + public void handleData(byte[] buf, int offset, int length) { + statistics.addData(buf, offset, length); + detector.handleData(buf, offset, length); + } + + public Charset dataEnd() { + detector.dataEnd(); + if (charset == null && statistics.isMostlyAscii()) { + report(Constants.CHARSET_WINDOWS_1252); + } + return charset; + } + +} \ No newline at end of file Added: tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.xml; + +import java.util.Arrays; +import java.util.List; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; +import org.xml.sax.helpers.DefaultHandler; + +/** + * Base class for SAX handlers that map SAX events into document metadata. + * + * @since Apache Tika 0.10 + */ +class AbstractMetadataHandler extends DefaultHandler { + + private final Metadata metadata; + private final Property property; + private final String name; + + protected AbstractMetadataHandler(Metadata metadata, String name) { + this.metadata = metadata; + this.property = null; + this.name = name; + } + protected AbstractMetadataHandler(Metadata metadata, Property property) { + this.metadata = metadata; + this.property = property; + this.name = property.getName(); + } + + /** + * Adds the given metadata value. The value is ignored if it is + * <code>null</code> or empty, or if the entry already contains it. If the entry holds other values, + * the given value is added alongside them, or replaces the old value when the property does not permit multiple values.
+ * + * @param value metadata value + */ + protected void addMetadata(String value) { + if (value != null && value.length() > 0) { + if (metadata.isMultiValued(name)) { + // Add the value, assuming it's not already there + List<String> previous = Arrays.asList(metadata.getValues(name)); + if (!previous.contains(value)) { + if (property != null) { + metadata.add(property, value); + } else { + metadata.add(name, value); + } + } + } else { + // Set the value, assuming it's not already there + String previous = metadata.get(name); + if (previous != null && previous.length() > 0) { + if (!previous.equals(value)) { + if (property != null) { + if (property.isMultiValuePermitted()) { + metadata.add(property, value); + } else { + // Replace the existing value if isMultiValuePermitted is false + metadata.set(property, value); + } + } else { + metadata.add(name, value); + } + } + } else { + if (property != null) { + metadata.set(property, value); + } else { + metadata.set(name, value); + } + } + } + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements.
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.xml; + +import org.apache.tika.metadata.Metadata; +import org.xml.sax.Attributes; +import org.xml.sax.helpers.DefaultHandler; + +/** + * This adds a Metadata entry for a given node. + * The textual content of the node is used as the + * value, and the Metadata name is taken from + * an attribute, with a prefix if required. Values that map to the same name are joined with ", ".
+ */ +public class AttributeDependantMetadataHandler extends DefaultHandler { + + private final Metadata metadata; + + private final String nameHoldingAttribute; + private final String namePrefix; + private String name; + + private final StringBuilder buffer = new StringBuilder(); + + public AttributeDependantMetadataHandler(Metadata metadata, String nameHoldingAttribute, String namePrefix) { + this.metadata = metadata; + this.nameHoldingAttribute = nameHoldingAttribute; + this.namePrefix = namePrefix; + } + + public void addMetadata(String value) { + if(name == null || name.length() == 0) { + // We didn't find the attribute which holds the name + return; + } + if (value.length() > 0) { + String previous = metadata.get(name); + if (previous != null && previous.length() > 0) { + value = previous + ", " + value; + } + metadata.set(name, value); + } + } + + public void endElement(String uri, String localName, String name) { + addMetadata(buffer.toString()); + buffer.setLength(0); + } + + public void startElement( + String uri, String localName, String name, Attributes attributes) { + String rawName = attributes.getValue(nameHoldingAttribute); + if (rawName != null) { + if (namePrefix == null) { + this.name = rawName; + } else { + this.name = namePrefix + rawName; + } + } + // All other attributes are ignored; this.name keeps its previous value when the attribute is absent + } + + + public void characters(char[] ch, int start, int length) { + buffer.append(ch, start, length); + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java (added) +++
tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.xml; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; + +/** + * SAX event handler that maps the contents of an XML attribute into + * a metadata field. On every start tag, each attribute whose namespace URI and local name both match is trimmed and added.
+ * + * @since Apache Tika 0.10 + */ +public class AttributeMetadataHandler extends AbstractMetadataHandler { + + private final String uri; + + private final String localName; + + public AttributeMetadataHandler( + String uri, String localName, Metadata metadata, String name) { + super(metadata, name); + this.uri = uri; + this.localName = localName; + } + public AttributeMetadataHandler( + String uri, String localName, Metadata metadata, Property property) { + super(metadata, property); + this.uri = uri; + this.localName = localName; + } + + @Override + public void startElement( + String uri, String localName, String qName, Attributes attributes) + throws SAXException { + for (int i = 0; i < attributes.getLength(); i++) { + if (attributes.getURI(i).equals(this.uri) + && attributes.getLocalName(i).equals(this.localName)) { + addMetadata(attributes.getValue(i).trim()); + } + } + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.xml; + +import org.apache.tika.metadata.DublinCore; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.TeeContentHandler; +import org.xml.sax.ContentHandler; + +/** + * Dublin Core metadata parser: besides the inherited XML content handling, elements in the Dublin Core namespace (title, subject, creator, ...) are mapped onto TikaCoreProperties entries. + */ +public class DcXMLParser extends XMLParser { + + /** Serial version UID */ + private static final long serialVersionUID = 4905318835463880819L; + + private static ContentHandler getDublinCoreHandler( + Metadata metadata, Property property, String element) { + return new ElementMetadataHandler( + DublinCore.NAMESPACE_URI_DC, element, + metadata, property); + } + + protected ContentHandler getContentHandler( + ContentHandler handler, Metadata metadata, ParseContext context) { + return new TeeContentHandler( + super.getContentHandler(handler, metadata, context), + getDublinCoreHandler(metadata, TikaCoreProperties.TITLE, "title"), + getDublinCoreHandler(metadata, TikaCoreProperties.KEYWORDS, "subject"), + getDublinCoreHandler(metadata, TikaCoreProperties.CREATOR, "creator"), + getDublinCoreHandler(metadata, TikaCoreProperties.DESCRIPTION, "description"), + getDublinCoreHandler(metadata, TikaCoreProperties.PUBLISHER, "publisher"), + getDublinCoreHandler(metadata, TikaCoreProperties.CONTRIBUTOR, "contributor"), + getDublinCoreHandler(metadata, TikaCoreProperties.CREATED, "date"), + getDublinCoreHandler(metadata, TikaCoreProperties.TYPE, "type"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.FORMAT, "format"), + getDublinCoreHandler(metadata, TikaCoreProperties.IDENTIFIER, "identifier"), + getDublinCoreHandler(metadata, TikaCoreProperties.LANGUAGE, "language"), + getDublinCoreHandler(metadata, TikaCoreProperties.RIGHTS, "rights")); + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,255 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.tika.parser.xml; + +import java.util.Arrays; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; +import org.xml.sax.Attributes; + +/** + * SAX event handler that maps the contents of an XML element into + * a metadata field. + * + * @since Apache Tika 0.10 + */ +public class ElementMetadataHandler extends AbstractMetadataHandler { + /** + * Logger for this class + */ + private static final Log logger = LogFactory + .getLog(ElementMetadataHandler.class); + + private static final String LOCAL_NAME_RDF_BAG = "Bag"; + private static final String LOCAL_NAME_RDF_LI = "li"; + private static final String URI_RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"; + + private final String uri; + + private final String localName; + + private final Metadata metadata; + + private final String name; + private Property targetProperty; + + private final boolean allowDuplicateValues; + private final boolean allowEmptyValues; + + /** + * The buffer used to capture characters when inside a bag li element. + */ + private final StringBuilder bufferBagged = new StringBuilder(); + + /** + * The buffer used to capture characters inside standard elements. + */ + private final StringBuilder bufferBagless = new StringBuilder(); + + /** + * Whether or not the value was found in a standard element structure or inside a bag. + */ + private boolean isBagless = true; + + private int matchLevel = 0; + private int parentMatchLevel = 0; + + /** + * Constructor for string metadata keys. 
+ * + * @param uri the uri of the namespace of the element + * @param localName the local name of the element + * @param metadata the Tika metadata object to populate + * @param name the Tika metadata field key + */ + public ElementMetadataHandler( + String uri, String localName, Metadata metadata, String name) { + super(metadata, name); + this.uri = uri; + this.localName = localName; + this.metadata = metadata; + this.name = name; + this.allowDuplicateValues = false; + this.allowEmptyValues = false; + if (logger.isTraceEnabled()) { + logger.trace("created simple handler for " + this.name); + } + } + + /** + * Constructor for string metadata keys which allows change of behavior + * for duplicate and empty entry values. + * + * @param uri the uri of the namespace of the element + * @param localName the local name of the element + * @param metadata the Tika metadata object to populate + * @param name the Tika metadata field key + * @param allowDuplicateValues add duplicate values to the Tika metadata + * @param allowEmptyValues add empty values to the Tika metadata + */ + public ElementMetadataHandler( + String uri, String localName, Metadata metadata, String name, boolean allowDuplicateValues, boolean allowEmptyValues) { + super(metadata, name); + this.uri = uri; + this.localName = localName; + this.metadata = metadata; + this.name = name; + this.allowDuplicateValues = allowDuplicateValues; + this.allowEmptyValues = allowEmptyValues; + if (logger.isTraceEnabled()) { + logger.trace("created simple handler for " + this.name); + } + } + + /** + * Constructor for Property metadata keys. 
+ * + * @param uri the uri of the namespace of the element + * @param localName the local name of the element + * @param metadata the Tika metadata object to populate + * @param targetProperty the Tika metadata Property key + */ + public ElementMetadataHandler( + String uri, String localName, Metadata metadata, Property targetProperty) { + super(metadata, targetProperty); + this.uri = uri; + this.localName = localName; + this.metadata = metadata; + this.targetProperty = targetProperty; + this.name = targetProperty.getName(); + this.allowDuplicateValues = false; + this.allowEmptyValues = false; + if (logger.isTraceEnabled()) { + logger.trace("created property handler for " + this.name); + } + } + + /** + * Constructor for Property metadata keys which allows change of behavior + * for duplicate and empty entry values. + * + * @param uri the uri of the namespace of the element + * @param localName the local name of the element + * @param metadata the Tika metadata object to populate + * @param targetProperty the Tika metadata Property key + * @param allowDuplicateValues add duplicate values to the Tika metadata + * @param allowEmptyValues add empty values to the Tika metadata + */ + public ElementMetadataHandler( + String uri, String localName, Metadata metadata, Property targetProperty, boolean allowDuplicateValues, boolean allowEmptyValues) { + super(metadata, targetProperty); + this.uri = uri; + this.localName = localName; + this.metadata = metadata; + this.targetProperty = targetProperty; + this.name = targetProperty.getName(); + this.allowDuplicateValues = allowDuplicateValues; + this.allowEmptyValues = allowEmptyValues; + if (logger.isTraceEnabled()) { + logger.trace("created property handler for " + this.name); + } + } + + protected boolean isMatchingParentElement(String uri, String localName) { + return (uri.equals(this.uri) && localName.equals(this.localName)); + } + + protected boolean isMatchingElement(String uri, String localName) { + // match if we're 
inside the parent element or within some bag element + return (uri.equals(this.uri) && localName.equals(this.localName)) || + (parentMatchLevel > 0 && + ((uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_BAG)) || + (uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_LI)) + ) + ); + } + + @Override + public void startElement( + String uri, String localName, String name, Attributes attributes) { + if (isMatchingElement(uri, localName)) { + matchLevel++; + } + if (isMatchingParentElement(uri, localName)) { + parentMatchLevel++; + } + } + + @Override + public void endElement(String uri, String localName, String name) { + if (isMatchingParentElement(uri, localName)) { + parentMatchLevel--; + } + if (isMatchingElement(uri, localName)) { + matchLevel--; + if (matchLevel == 2) { + // we're inside a bag li element, add the bagged buffer + addMetadata(bufferBagged.toString().trim()); + bufferBagged.setLength(0); + isBagless = false; + } + if (matchLevel == 0 && isBagless) { + String valueBagless = bufferBagless.toString(); + if (valueBagless.length() > 0 && !valueBagless.contains(LOCAL_NAME_RDF_BAG)) { + // we're in a standard element, add the bagless buffer + addMetadata(valueBagless.trim()); + bufferBagless.setLength(0); + } + isBagless = true; + } + } + } + + @Override + public void characters(char[] ch, int start, int length) { + // We need to append to both buffers since we don't if we're inside a bag until we're done + if (parentMatchLevel > 0 && matchLevel > 2) { + bufferBagged.append(ch, start, length); + } + if (parentMatchLevel > 0 && matchLevel > 0) { + bufferBagless.append(ch, start, length); + } + } + + @Override + public void ignorableWhitespace(char[] ch, int start, int length) { + characters(ch, start, length); + } + + @Override + protected void addMetadata(String value) { + if (logger.isTraceEnabled()) { + logger.trace("adding " + name + "=" + value); + } + if (targetProperty != null && targetProperty.isMultiValuePermitted()) { + if ((value != null 
&& value.length() > 0) || allowEmptyValues) { + if (value == null || value.length() == 0 && allowEmptyValues) { + value = ""; + } + String[] previous = metadata.getValues(name); + if (previous == null || !Arrays.asList(previous).contains(value) || allowDuplicateValues) { + metadata.add(targetProperty, value); + } + } + } else { + super.addMetadata(value); + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.xml; + +import org.apache.commons.codec.binary.Base64; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaMetadataKeys; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.util.Collections; +import java.util.Set; + +public class FictionBookParser extends XMLParser { + private static final long serialVersionUID = 4195954546491524374L; + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + return Collections.singleton(MediaType.application("x-fictionbook+xml")); + } + + @Override + protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, ParseContext context) { + EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class); + + if (ex == null) { + ex = new ParsingEmbeddedDocumentExtractor(context); + } + + return new BinaryElementsDataHandler(ex, handler); + } + + private static class BinaryElementsDataHandler extends DefaultHandler { + private static final String ELEMENT_BINARY = "binary"; + + private boolean binaryMode = false; + private static final String ATTRIBUTE_ID = "id"; + + private final EmbeddedDocumentExtractor partExtractor; + private final ContentHandler handler; + private final StringBuilder binaryData = new StringBuilder(); + private Metadata metadata; + private static final String ATTRIBUTE_CONTENT_TYPE = "content-type"; + + private BinaryElementsDataHandler(EmbeddedDocumentExtractor partExtractor, ContentHandler handler) { + this.partExtractor = partExtractor; + this.handler = handler; + } + + @Override + public void 
startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { + binaryMode = ELEMENT_BINARY.equals(localName); + if (binaryMode) { + binaryData.setLength(0); + metadata = new Metadata(); + + metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, attributes.getValue(ATTRIBUTE_ID)); + metadata.set(Metadata.CONTENT_TYPE, attributes.getValue(ATTRIBUTE_CONTENT_TYPE)); + } + } + + @Override + public void endElement(String uri, String localName, String qName) throws SAXException { + if (binaryMode) { + try { + partExtractor.parseEmbedded( + new ByteArrayInputStream(Base64.decodeBase64(binaryData.toString())), + handler, + metadata, + true + ); + } catch (IOException e) { + throw new SAXException("IOException in parseEmbedded", e); + } + + binaryMode = false; + binaryData.setLength(0); + } + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + if (!binaryMode) { + handler.characters(ch, start, length); + } else { + binaryData.append(ch, start, length); + } + } + + @Override + public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { + handler.ignorableWhitespace(ch, start, length); + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor 
license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.xml; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; +import org.xml.sax.Attributes; +import org.xml.sax.helpers.DefaultHandler; + +/** + * This adds Metadata entries with a specified name for + * the textual content of a node (if present), and + * all attribute values passed through the matcher + * (but not their names). 
+ * + * @deprecated Use the {@link AttributeMetadataHandler} and + * {@link ElementMetadataHandler} classes instead + */ +public class MetadataHandler extends DefaultHandler { + + private final Metadata metadata; + + private final Property property; + private final String name; + + private final StringBuilder buffer = new StringBuilder(); + + public MetadataHandler(Metadata metadata, String name) { + this.metadata = metadata; + this.property = null; + this.name = name; + } + public MetadataHandler(Metadata metadata, Property property) { + this.metadata = metadata; + this.property = property; + this.name = property.getName(); + } + + public void addMetadata(String value) { + if (value.length() > 0) { + String previous = metadata.get(name); + if (previous != null && previous.length() > 0) { + value = previous + ", " + value; + } + + if (this.property != null) { + metadata.set(property, value); + } else { + metadata.set(name, value); + } + } + } + + public void endElement(String uri, String localName, String name) { + addMetadata(buffer.toString()); + buffer.setLength(0); + } + + public void startElement( + String uri, String localName, String name, Attributes attributes) { + for (int i = 0; i < attributes.getLength(); i++) { + addMetadata(attributes.getValue(i)); + } + } + + + public void characters(char[] ch, int start, int length) { + buffer.append(ch, start, length); + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/XMLParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/XMLParser.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/XMLParser.java (added) +++ 
tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/java/org/apache/tika/parser/xml/XMLParser.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.xml; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.EmbeddedContentHandler; +import org.apache.tika.sax.OfflineContentHandler; +import org.apache.tika.sax.TaggedContentHandler; +import org.apache.tika.sax.TextContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * XML parser. 
+ */ +public class XMLParser extends AbstractParser { + + /** Serial version UID */ + private static final long serialVersionUID = -6028836725280212837L; + + private static final Set<MediaType> SUPPORTED_TYPES = + Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList( + MediaType.application("xml"), + MediaType.image("svg+xml")))); + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + public void parse( + InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + if (metadata.get(Metadata.CONTENT_TYPE) == null) { + metadata.set(Metadata.CONTENT_TYPE, "application/xml"); + } + + final XHTMLContentHandler xhtml = + new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + xhtml.startElement("p"); + + TaggedContentHandler tagged = new TaggedContentHandler(handler); + try { + context.getSAXParser().parse( + new CloseShieldInputStream(stream), + new OfflineContentHandler(new EmbeddedContentHandler( + getContentHandler(tagged, metadata, context)))); + } catch (SAXException e) { + tagged.throwIfCauseOf(e); + throw new TikaException("XML parse error", e); + } finally { + xhtml.endElement("p"); + xhtml.endDocument(); + } + } + + protected ContentHandler getContentHandler( + ContentHandler handler, Metadata metadata, ParseContext context) { + return new TextContentHandler(handler, true); + } +} Added: tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector?rev=1725014&view=auto ============================================================================== --- 
tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector Sat Jan 16 18:23:01 2016 @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +org.apache.tika.parser.txt.UniversalEncodingDetector +org.apache.tika.parser.txt.Icu4jEncodingDetector Added: tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Sat Jan 16 18:23:01 2016 @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +org.apache.tika.parser.txt.TXTParser +#org.apache.tika.parser.strings.Latin1StringsParser +#org.apache.tika.parser.strings.StringsParser +org.apache.tika.parser.xml.DcXMLParser +org.apache.tika.parser.xml.FictionBookParser +#org.apache.tika.parser.xml.XMLParser Added: tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/strings/FileConfigTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/strings/FileConfigTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/strings/FileConfigTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/strings/FileConfigTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,28 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.strings; + +import static org.junit.Assert.*; + +import org.junit.Test; + +public class FileConfigTest { + + @Test + public void testNoConfig() { + FileConfig config = new FileConfig(); + assertEquals("Invalid default filePath value", "", config.getFilePath()); + assertEquals("Invalid default mime option value", false, config.isMimetype()); + } +} Added: tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/strings/Latin1StringsParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/strings/Latin1StringsParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/strings/Latin1StringsParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/strings/Latin1StringsParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,69 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.strings; + +import static java.nio.charset.StandardCharsets.ISO_8859_1; +import static java.nio.charset.StandardCharsets.UTF_8; +import static java.nio.charset.StandardCharsets.UTF_16; +import static org.junit.Assert.assertTrue; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.InputStream; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +public class Latin1StringsParserTest { + + @Test + public void testParse() throws Exception { + + String testStr = "These are Latin1 accented scripts: \u00C2 \u00C3 \u00C9 \u00DC \u00E2 \u00E3 \u00E9 \u00FC"; + String smallStr = "ab"; + + byte[] iso8859Bytes = testStr.getBytes(ISO_8859_1); + byte[] utf8Bytes = testStr.getBytes(UTF_8); + byte[] utf16Bytes = testStr.getBytes(UTF_16); + byte[] zeros = new byte[10]; + byte[] smallString = smallStr.getBytes(ISO_8859_1); + byte[] trashBytes = { 0x00, 0x01, 0x02, 0x03, 0x1E, 0x1F, (byte) 0xFF }; + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + baos.write(iso8859Bytes); + baos.write(zeros); + baos.write(utf8Bytes); + baos.write(trashBytes); + baos.write(utf16Bytes); + baos.write(zeros); + baos.write(smallString); + + Parser parser = new Latin1StringsParser(); + ContentHandler handler = new BodyContentHandler(); + + try (InputStream stream = new ByteArrayInputStream(baos.toByteArray())) { + parser.parse(stream, handler, new Metadata(), new ParseContext()); + } + + String result = handler.toString(); + String expected = testStr + "\n" + testStr + "\n" + testStr + "\n"; + + // Test if result contains only the test string appended 3 times + assertTrue(result.equals(expected)); + } +} Added: 
tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/strings/StringsConfigTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/strings/StringsConfigTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/strings/StringsConfigTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/strings/StringsConfigTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,61 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.strings; + +import static org.junit.Assert.*; + +import java.io.File; +import java.io.InputStream; + +import org.junit.Test; + +public class StringsConfigTest { + + @Test + public void testNoConfig() { + StringsConfig config = new StringsConfig(); + assertEquals("Invalid default filePath value", "", config.getStringsPath()); + assertEquals("Invalid default encoding value", StringsEncoding.SINGLE_7_BIT, config.getEncoding()); + assertEquals("Invalid default min-len value", 4, config.getMinLength()); + assertEquals("Invalid default timeout value", 120, config.getTimeout()); + } + + @Test + public void testPartialConfig() { + InputStream stream = StringsConfigTest.class.getResourceAsStream("/test-properties/StringsConfig-partial.properties"); + + StringsConfig config = new StringsConfig(stream); + assertEquals("Invalid default stringsPath value", "", config.getStringsPath()); + assertEquals("Invalid overridden encoding value", StringsEncoding.BIGENDIAN_16_BIT, config.getEncoding()); + assertEquals("Invalid default min-len value", 4, config.getMinLength()); + assertEquals("Invalid overridden timeout value", 60, config.getTimeout()); + } + + @Test + public void testFullConfig() { + InputStream stream = StringsConfigTest.class.getResourceAsStream("/test-properties/StringsConfig-full.properties"); + + StringsConfig config = new StringsConfig(stream); + assertEquals("Invalid overridden stringsPath value", "/opt/strings" + File.separator, config.getStringsPath()); + assertEquals("Invalid overridden encoding value", StringsEncoding.BIGENDIAN_16_BIT, config.getEncoding()); + assertEquals("Invalid overridden min-len value", 3, config.getMinLength()); + assertEquals("Invalid overridden timeout value", 60, config.getTimeout()); + } + + @Test(expected=IllegalArgumentException.class) + public void testValidateEconding() { + StringsConfig config = new StringsConfig(); + config.setMinLength(0); + } +} Added: 
tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,74 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.strings; + +import static org.apache.tika.parser.strings.StringsParser.getStringsProg; +import static org.junit.Assert.*; +import static org.junit.Assume.assumeTrue; + +import java.io.InputStream; +import java.util.Arrays; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.external.ExternalParser; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +public class StringsParserTest { + public static boolean canRun() { + StringsConfig config = new StringsConfig(); + String[] checkCmd = {config.getStringsPath() + getStringsProg(), "--version"}; + boolean hasStrings = ExternalParser.check(checkCmd); + return hasStrings; + } + + @Test + public void testParse() throws Exception { + assumeTrue(canRun()); + + String resource = "/test-documents/testOCTET_header.dbase3"; + + String[] content = { "CLASSNO", "TITLE", "ITEMNO", "LISTNO", "LISTDATE" }; + + String[] met_attributes = {"min-len", "encoding", "strings:file_output"}; + + StringsConfig stringsConfig = new StringsConfig(); + FileConfig fileConfig = new FileConfig(); + + Parser parser = new StringsParser(); + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + ParseContext context = new ParseContext(); + context.set(StringsConfig.class, stringsConfig); + context.set(FileConfig.class, fileConfig); + + try (InputStream stream = StringsParserTest.class.getResourceAsStream(resource)) { + parser.parse(stream, handler, metadata, context); + } catch (Exception e) { + e.printStackTrace(); + } + + // Content + for (String word : content) { + assertTrue(handler.toString().contains(word)); + } + + // Metadata + Arrays.equals(met_attributes, metadata.names()); + } +} Added: 
tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.txt; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; + +import org.junit.Test; + +public class CharsetDetectorTest { + + @Test + public void testTagDropper() throws IOException { + try (InputStream in = CharsetDetectorTest.class.getResourceAsStream("/test-documents/resume.html")) { + CharsetDetector detector = new CharsetDetector(); + detector.enableInputFilter(true); + detector.setText(in); + CharsetMatch[] matches = detector.detectAll(); + CharsetMatch mm = null; + for (CharsetMatch m : matches) { + if (mm == null || mm.getConfidence() < m.getConfidence()) { + mm = m; + } + } + assertTrue(mm != null); + assertEquals("UTF-8", mm.getName()); + } + } + + /* https://issues.apache.org/jira/browse/TIKA-1248 + * Verify empty or null declaredEncoding doesn't cause an exception + * + */ + + @Test + public void testEmptyOrNullDeclaredCharset() throws IOException { + try (InputStream in = CharsetDetectorTest.class.getResourceAsStream("/test-documents/resume.html")) { + CharsetDetector detector = new CharsetDetector(); + Reader reader = detector.getReader(in, null); + assertTrue(reader.ready()); + + reader = detector.getReader(in, ""); + assertTrue(reader.ready()); + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java Sat 
Jan 16 18:23:01 2016 @@ -0,0 +1,295 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.txt; + +import static java.nio.charset.StandardCharsets.ISO_8859_1; +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.apache.tika.TikaTest.assertContains; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + +import java.io.ByteArrayInputStream; +import java.io.StringWriter; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.WriteOutContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; +import org.xml.sax.helpers.DefaultHandler; + +public class TXTParserTest { + + private Parser parser = new TXTParser(); + + @Test + public void testEnglishText() throws Exception { + String text = + "Hello, World! 
This is simple UTF-8 text content written" + + " in English to test autodetection of both the character" + + " encoding and the language of the input stream."; + + Metadata metadata = new Metadata(); + StringWriter writer = new StringWriter(); + parser.parse( + new ByteArrayInputStream(text.getBytes(ISO_8859_1)), + new WriteOutContentHandler(writer), + metadata, + new ParseContext()); + String content = writer.toString(); + + assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE)); + + // TIKA-501: Remove language detection from TXTParser + assertNull(metadata.get(Metadata.CONTENT_LANGUAGE)); + assertNull(metadata.get(TikaCoreProperties.LANGUAGE)); + + assertContains("Hello", content); + assertContains("World", content); + assertContains("autodetection", content); + assertContains("stream", content); + } + + @Test + public void testUTF8Text() throws Exception { + String text = "I\u00F1t\u00EBrn\u00E2ti\u00F4n\u00E0liz\u00E6ti\u00F8n"; + + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + parser.parse( + new ByteArrayInputStream(text.getBytes(UTF_8)), + handler, metadata, new ParseContext()); + assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated + + assertContains(text, handler.toString()); + } + + @Test + public void testEmptyText() throws Exception { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + parser.parse( + new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext()); + assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("\n", handler.toString()); + } + + /** + * Test for the heuristics that we use to assign an eight-bit character + * encoding to mostly ASCII sequences. 
If a more specific match can not + * be made, a string with a CR(LF) in it is most probably windows-1252, + * otherwise ISO-8859-1, except if it contains the currency/euro symbol + * (byte 0xa4) in which case it's more likely to be ISO-8859-15. + */ + @Test + public void testLatinDetectionHeuristics() throws Exception { + String windows = "test\r\n"; + String unix = "test\n"; + String euro = "test \u20ac\n"; + + Metadata metadata; + + metadata = new Metadata(); + parser.parse( + new ByteArrayInputStream(windows.getBytes("ISO-8859-15")), + new DefaultHandler(), metadata, new ParseContext()); + assertEquals( + "text/plain; charset=windows-1252", + metadata.get(Metadata.CONTENT_TYPE)); + + metadata = new Metadata(); + parser.parse( + new ByteArrayInputStream(unix.getBytes("ISO-8859-15")), + new DefaultHandler(), metadata, new ParseContext()); + assertEquals( + "text/plain; charset=ISO-8859-1", + metadata.get(Metadata.CONTENT_TYPE)); + + metadata = new Metadata(); + parser.parse( + new ByteArrayInputStream(euro.getBytes("ISO-8859-15")), + new DefaultHandler(), metadata, new ParseContext()); + assertEquals( + "text/plain; charset=ISO-8859-15", + metadata.get(Metadata.CONTENT_TYPE)); + } + + /** + * Test case for TIKA-240: Drop the BOM when extracting plain text + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-240">TIKA-240</a> + */ + @Test + public void testDropByteOrderMark() throws Exception { + assertExtractText("UTF-8 BOM", "test", new byte[]{ + (byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 't', 'e', 's', 't'}); + assertExtractText("UTF-16 BE BOM", "test", new byte[]{ + (byte) 0xFE, (byte) 0xFF, 0, 't', 0, 'e', 0, 's', 0, 't'}); + assertExtractText("UTF-16 LE BOM", "test", new byte[]{ + (byte) 0xFF, (byte) 0xFE, 't', 0, 'e', 0, 's', 0, 't', 0}); + } + + /** + * Test case for TIKA-335: using incoming charset + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a> + */ + @Test + public void testUseIncomingCharsetAsHint() throws 
Exception { + // Could be ISO 8859-1 or ISO 8859-15 or ... + // u00e1 is latin small letter a with acute + final String test2 = "the name is \u00e1ndre"; + + Metadata metadata = new Metadata(); + parser.parse( + new ByteArrayInputStream(test2.getBytes(ISO_8859_1)), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated + + metadata.set(Metadata.CONTENT_TYPE, "text/plain; charset=ISO-8859-15"); + parser.parse( + new ByteArrayInputStream(test2.getBytes(ISO_8859_1)), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated + } + + /** + * Test case for TIKA-341: using charset in content-type + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a> + */ + @Test + public void testUsingCharsetInContentTypeHeader() throws Exception { + // Could be ISO 8859-1 or ISO 8859-15 or ... 
+ // u00e1 is latin small letter a with acute + final String test2 = "the name is \u00e1ndre"; + + Metadata metadata = new Metadata(); + parser.parse( + new ByteArrayInputStream(test2.getBytes(ISO_8859_1)), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated + + metadata = new Metadata(); + metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-15"); + parser.parse( + new ByteArrayInputStream(test2.getBytes(ISO_8859_1)), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated + } + + private void assertExtractText(String msg, String expected, byte[] input) + throws Exception { + ContentHandler handler = new BodyContentHandler() { + public void ignorableWhitespace(char[] ch, int off, int len) { + // Ignore the whitespace added by XHTMLContentHandler + } + }; + Metadata metadata = new Metadata(); + parser.parse(new ByteArrayInputStream(input), handler, metadata, new ParseContext()); + assertEquals(msg, expected, handler.toString()); + } + + /** + * Test case for TIKA-339: don't override incoming language + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a> + */ + @Test + public void testRetainIncomingLanguage() throws Exception { + final String test = "Simple Content"; + + Metadata metadata = new Metadata(); + metadata.set(TikaCoreProperties.LANGUAGE, "en"); + + parser.parse( + new ByteArrayInputStream(test.getBytes(UTF_8)), + new BodyContentHandler(), metadata, new ParseContext()); + + assertEquals("en", metadata.get(TikaCoreProperties.LANGUAGE)); + } + + @Test + public void testCP866() throws Exception { + Metadata metadata = new Metadata(); + StringWriter writer = new 
StringWriter(); + parser.parse( + TXTParserTest.class.getResourceAsStream("/test-documents/russian.cp866.txt"), + new WriteOutContentHandler(writer), + metadata, + new ParseContext()); + + assertEquals("text/plain; charset=IBM866", metadata.get(Metadata.CONTENT_TYPE)); + } + + @Test + public void testEBCDIC_CP500() throws Exception { + Metadata metadata = new Metadata(); + StringWriter writer = new StringWriter(); + parser.parse( + TXTParserTest.class.getResourceAsStream("/test-documents/english.cp500.txt"), + new WriteOutContentHandler(writer), + metadata, + new ParseContext()); + + assertEquals("text/plain; charset=IBM500", metadata.get(Metadata.CONTENT_TYPE)); + + // Additional check that it isn't too eager on short blocks of text + metadata = new Metadata(); + writer = new StringWriter(); + parser.parse( + new ByteArrayInputStream("<html><body>hello world</body></html>".getBytes(ISO_8859_1)), + new WriteOutContentHandler(writer), + metadata, + new ParseContext()); + + assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE)); + } + + /** + * Test case for TIKA-771: "Hello, World!" 
in UTF-8/ASCII gets detected as IBM500 + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-771">TIKA-771</a> + */ + @Test + public void testCharsetDetectionWithShortSnipet() throws Exception { + final String text = "Hello, World!"; + + Metadata metadata = new Metadata(); + parser.parse( + new ByteArrayInputStream(text.getBytes(UTF_8)), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE)); + + // Now verify that if we tell the parser the encoding is UTF-8, that's what + // we get back (see TIKA-868) + metadata.set(Metadata.CONTENT_TYPE, "application/binary; charset=UTF-8"); + parser.parse( + new ByteArrayInputStream(text.getBytes(UTF_8)), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE)); + } + +}
