Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java?rev=777643&view=auto ============================================================================== --- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java (added) +++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java Fri May 22 18:21:59 2009 @@ -0,0 +1,55 @@ +/** +******************************************************************************* +* Copyright (C) 2005, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +*/ +package org.apache.tika.parser.txt; + +/** + * Abstract class for recognizing a single charset. + * Part of the implementation of ICU's CharsetDetector. + * + * Each specific charset that can be recognized will have an instance + * of some subclass of this class. All interaction between the overall + * CharsetDetector and the stuff specific to an individual charset happens + * via the interface provided here. + * + * Instances of CharsetRecognizer DO NOT have or maintain + * state pertaining to a specific match or detect operation. + * They WILL be shared by multiple instances of CharsetDetector. + * They encapsulate const charset-specific information. + * + * @internal + */ +abstract class CharsetRecognizer { + /** + * Get the IANA name of this charset. + * @return the charset name. + */ + abstract String getName(); + + /** + * Get the ISO language code for this charset. + * @return the language code, or <code>null</code> if the language cannot be determined. 
+ */ + public String getLanguage() + { + return null; + } + + /** + * Test the match of this charset with the input text data + * which is obtained via the CharsetDetector object. + * + * @param det The CharsetDetector, which contains the input text + * to be checked for being in this charset. + * @return Two values packed into one int (Damn java, anyhow) + * <br/> + * bits 0-7: the match confidence, ranging from 0-100 + * <br/> + * bits 8-15: The match reason, an enum-like value. + */ + abstract int match(CharsetDetector det); + +}
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=777643&r1=777642&r2=777643&view=diff ============================================================================== --- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java (original) +++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java Fri May 22 18:21:59 2009 @@ -28,9 +28,6 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import com.ibm.icu.text.CharsetDetector; -import com.ibm.icu.text.CharsetMatch; - /** * Text parser */ Modified: lucene/tika/trunk/tika-parsers/src/main/resources/META-INF/LICENSE.txt URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/resources/META-INF/LICENSE.txt?rev=777643&r1=777642&r2=777643&view=diff ============================================================================== --- lucene/tika/trunk/tika-parsers/src/main/resources/META-INF/LICENSE.txt (original) +++ lucene/tika/trunk/tika-parsers/src/main/resources/META-INF/LICENSE.txt Fri May 22 18:21:59 2009 @@ -200,3 +200,43 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + + + +APACHE TIKA SUBCOMPONENTS + +Apache Tika includes a number of subcomponents with separate copyright notices +and license terms. Your use of these subcomponents is subject to the terms and +conditions of the following licenses. + +Charset detection code from ICU4J (http://site.icu-project.org/) + + Copyright (c) 1995-2009 International Business Machines Corporation + and others + + All rights reserved. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, and/or sell copies of the Software, and to permit persons + to whom the Software is furnished to do so, provided that the above + copyright notice(s) and this permission notice appear in all copies + of the Software and that both the above copyright notice(s) and this + permission notice appear in supporting documentation. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE + BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, + OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, + ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + SOFTWARE. + + Except as contained in this notice, the name of a copyright holder shall + not be used in advertising or otherwise to promote the sale, use or other + dealings in this Software without prior written authorization of the + copyright holder. 
Modified: lucene/tika/trunk/tika-parsers/src/main/resources/META-INF/NOTICE.txt URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/resources/META-INF/NOTICE.txt?rev=777643&r1=777642&r2=777643&view=diff ============================================================================== --- lucene/tika/trunk/tika-parsers/src/main/resources/META-INF/NOTICE.txt (original) +++ lucene/tika/trunk/tika-parsers/src/main/resources/META-INF/NOTICE.txt Fri May 22 18:21:59 2009 @@ -3,3 +3,7 @@ This product includes software developed at The Apache Software Foundation (http://www.apache.org/). + +This product includes software developed by the following copyright owners: + +Copyright (c) 1995-2009 International Business Machines Corporation and others
