// Originally added as
// incubator/tika/trunk/src/main/java/org/apache/tika/utils/StringUtil.java
// (svn r578161, Fri Sep 21 08:07:58 2007), package org.apache.tika.utils.

/**
 * Copyright 2007 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// JDK imports
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;

/**
 * A collection of String processing utility methods.
 */
public class StringUtil {

    /** Lower-case hexadecimal digits, indexed by nibble value. */
    private static final char[] HEX_DIGITS = { '0', '1', '2', '3', '4', '5',
            '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };

    /** Parameter prefix searched for in a content type header. */
    private static final String CHARSET_PARAM = "charset=";

    /**
     * The following map is not an alias mapping table, but maps character
     * encodings which are often used in mislabelled documents to their
     * correct encodings. For instance, there are a lot of documents labelled
     * 'ISO-8859-1' which contain characters not covered by ISO-8859-1 but
     * covered by windows-1252. Because windows-1252 is a superset of
     * ISO-8859-1 (sharing code points for the common part), it's better to
     * treat ISO-8859-1 as synonymous with windows-1252 than to reject, as
     * invalid, documents labelled as ISO-8859-1 that have characters outside
     * ISO-8859-1.
     */
    private static final Map<String, String> ENCODING_ALIASES =
            new HashMap<String, String>();

    static {
        ENCODING_ALIASES.put("ISO-8859-1", "windows-1252");
        ENCODING_ALIASES.put("EUC-KR", "x-windows-949");
        ENCODING_ALIASES.put("x-EUC-CN", "GB18030");
        ENCODING_ALIASES.put("GBK", "GB18030");
        // ENCODING_ALIASES.put("Big5", "Big5HKSCS");
        // ENCODING_ALIASES.put("TIS620", "Cp874");
        // ENCODING_ALIASES.put("ISO-8859-11", "Cp874");
    }

    /**
     * Returns a copy of <code>s</code> padded with trailing spaces so that
     * its length is <code>length</code>. Strings already <code>length</code>
     * characters long or longer are not altered.
     *
     * @param s
     *            the string to pad; must not be null
     * @param length
     *            the minimum length of the result
     * @return the padded string
     */
    public static String rightPad(String s, int length) {
        StringBuilder sb = new StringBuilder(s);
        for (int i = length - s.length(); i > 0; i--) {
            sb.append(' ');
        }
        return sb.toString();
    }

    /**
     * Returns a copy of <code>s</code> padded with leading spaces so that
     * its length is <code>length</code>. Strings already <code>length</code>
     * characters long or longer are not altered.
     *
     * @param s
     *            the string to pad; must not be null
     * @param length
     *            the minimum length of the result
     * @return the padded string
     */
    public static String leftPad(String s, int length) {
        StringBuilder sb = new StringBuilder();
        for (int i = length - s.length(); i > 0; i--) {
            sb.append(' ');
        }
        sb.append(s);
        return sb.toString();
    }

    /**
     * Convenience call for {@link #toHexString(byte[], String, int)}, where
     * <code>sep = null; lineLen = Integer.MAX_VALUE</code>.
     *
     * @param buf
     *            input data
     * @return the hexadecimal text, or null if <code>buf</code> is null
     */
    public static String toHexString(byte[] buf) {
        return toHexString(buf, null, Integer.MAX_VALUE);
    }

    /**
     * Get a text representation of a byte[] as hexadecimal String, where each
     * pair of hexadecimal digits corresponds to consecutive bytes in the
     * array.
     *
     * @param buf
     *            input data
     * @param sep
     *            separator placed between consecutive pairs of hexadecimal
     *            digits on the same line, or null if no separation is
     *            needed. No separator is written after the last pair of a
     *            line.
     * @param lineLen
     *            break the output String into lines containing output for
     *            lineLen bytes; a value of zero or less disables line
     *            breaks.
     * @return the hexadecimal text, or null if <code>buf</code> is null
     */
    public static String toHexString(byte[] buf, String sep, int lineLen) {
        if (buf == null) {
            return null;
        }
        if (lineLen <= 0) {
            lineLen = Integer.MAX_VALUE;
        }
        StringBuilder res = new StringBuilder(buf.length * 2);
        for (int i = 0; i < buf.length; i++) {
            // Delimiters go *between* bytes: a newline once a full line of
            // lineLen bytes has been written, the separator otherwise.
            if (i > 0) {
                if (i % lineLen == 0) {
                    res.append('\n');
                } else if (sep != null) {
                    res.append(sep);
                }
            }
            int b = buf[i];
            res.append(HEX_DIGITS[(b >> 4) & 0xf]);
            res.append(HEX_DIGITS[b & 0xf]);
        }
        return res.toString();
    }

    /**
     * Convert a String containing consecutive (no inside whitespace)
     * hexadecimal digits into a corresponding byte array. If the number of
     * digits is not even, a '0' will be prepended to the String prior to
     * conversion. Leading and trailing whitespace is ignored.
     *
     * @param text
     *            input text
     * @return converted byte array, or null if unable to convert (null input
     *         or a character that is not a hexadecimal digit)
     */
    public static byte[] fromHexString(String text) {
        if (text == null) {
            return null;
        }
        String digits = text.trim();
        if (digits.length() % 2 != 0) {
            digits = "0" + digits;
        }
        byte[] res = new byte[digits.length() / 2];
        for (int i = 0; i < res.length; i++) {
            int hiNibble = charToNibble(digits.charAt(2 * i));
            int loNibble = charToNibble(digits.charAt(2 * i + 1));
            if (hiNibble == -1 || loNibble == -1) {
                return null;
            }
            res[i] = (byte) (hiNibble << 4 | loNibble);
        }
        return res;
    }

    /**
     * Returns the value (0-15) of a hexadecimal digit, accepting both upper
     * and lower case letters, or -1 if <code>c</code> is not a hexadecimal
     * digit.
     */
    private static int charToNibble(char c) {
        if (c >= '0' && c <= '9') {
            return c - '0';
        } else if (c >= 'a' && c <= 'f') {
            return 0xa + (c - 'a');
        } else if (c >= 'A' && c <= 'F') {
            return 0xA + (c - 'A');
        } else {
            return -1;
        }
    }

    /**
     * Parse the character encoding from the specified content type header. If
     * the content type is null, or there is no explicit character encoding,
     * <code>null</code> is returned. The <code>charset</code> parameter name
     * is matched case-insensitively. <br />
     * This method was copied from org.apache.catalina.util.RequestUtil, which
     * is licensed under the Apache License, Version 2.0 (the "License").
     *
     * @param contentType
     *            a content type header
     * @return the declared encoding with surrounding double quotes and
     *         whitespace removed, or null if none is declared
     */
    public static String parseCharacterEncoding(String contentType) {
        if (contentType == null) {
            return null;
        }
        int start = indexOfIgnoreCase(contentType, CHARSET_PARAM);
        if (start < 0) {
            return null;
        }
        String encoding = contentType.substring(start + CHARSET_PARAM.length());
        // The value ends at the next parameter, if there is one.
        int end = encoding.indexOf(';');
        if (end >= 0) {
            encoding = encoding.substring(0, end);
        }
        encoding = encoding.trim();
        if (encoding.length() > 2 && encoding.startsWith("\"")
                && encoding.endsWith("\"")) {
            encoding = encoding.substring(1, encoding.length() - 1);
        }
        return encoding.trim();
    }

    /**
     * Returns the index of the first occurrence of <code>needle</code> in
     * <code>haystack</code>, ignoring case, or -1 if there is none.
     */
    private static int indexOfIgnoreCase(String haystack, String needle) {
        int last = haystack.length() - needle.length();
        for (int i = 0; i <= last; i++) {
            if (haystack.regionMatches(true, i, needle, 0, needle.length())) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Checks if a string is empty (ie is null or empty).
     *
     * @param str
     *            the string to check
     * @return true if <code>str</code> is null or has zero length
     */
    public static boolean isEmpty(String str) {
        return str == null || str.length() == 0;
    }

    /**
     * Resolves an encoding name to the canonical name of the charset that
     * should be used to decode documents labelled with it, applying the
     * corrections for commonly mislabelled encodings.
     *
     * @param encoding
     *            an encoding name or alias
     * @return the canonical (possibly corrected) charset name, or null if
     *         <code>encoding</code> is null, is not a legal charset name, or
     *         is not supported by this JVM
     */
    public static String resolveEncodingAlias(String encoding) {
        if (encoding == null) {
            return null;
        }
        String canonicalName;
        try {
            if (!Charset.isSupported(encoding)) {
                return null;
            }
            canonicalName = Charset.forName(encoding).name();
        } catch (IllegalArgumentException e) {
            // Charset signals a syntactically illegal or unsupported name
            // with subclasses of IllegalArgumentException; both mean the
            // name cannot be resolved.
            return null;
        }
        String corrected = ENCODING_ALIASES.get(canonicalName);
        return corrected != null ? corrected : canonicalName;
    }

    /**
     * Command line entry point: prints the resolution of the encoding name
     * given as the single argument.
     */
    public static void main(String[] args) {
        if (args.length != 1) {
            System.out.println("Usage: StringUtil <encoding name>");
        } else {
            System.out.println(args[0] + " is resolved to "
                    + resolveEncodingAlias(args[0]));
        }
    }
}
Added: incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml?rev=578161&view=auto ============================================================================== --- incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml (added) +++ incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml Fri Sep 21 08:07:58 2007 @@ -0,0 +1,355 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + Author : Jerome Charron + Description: This xml file defines the valid mime types used by Tika. + The mime types within this file are based on the types in the mime-types.xml + file available in Apache Nutch. 
+--> + +<mime-info> + + <mime-type type="text/plain"> + <magic priority="50"> + <match value="This is TeX," type="string" offset="0" /> + <match value="This is METAFONT," type="string" offset="0" /> + </magic> + <glob pattern="*.txt" /> + <glob pattern="*.asc" /> + </mime-type> + + <mime-type type="text/html"> + <magic priority="50"> + <match value="<!DOCTYPE HTML" type="string" + offset="0:64" /> + <match value="<!doctype html" type="string" + offset="0:64" /> + <match value="<HEAD" type="string" offset="0:64" /> + <match value="<head" type="string" offset="0:64" /> + <match value="<TITLE" type="string" offset="0:64" /> + <match value="<title" type="string" offset="0:64" /> + <match value="<html" type="string" offset="0:64" /> + <match value="<HTML" type="string" offset="0:64" /> + <match value="<BODY" type="string" offset="0" /> + <match value="<body" type="string" offset="0" /> + <match value="<TITLE" type="string" offset="0" /> + <match value="<title" type="string" offset="0" /> + <match value="<!--" type="string" offset="0" /> + <match value="<h1" type="string" offset="0" /> + <match value="<H1" type="string" offset="0" /> + <match value="<!doctype HTML" type="string" offset="0" /> + <match value="<!DOCTYPE html" type="string" offset="0" /> + </magic> + <glob pattern="*.html" /> + <glob pattern="*.htm" /> + </mime-type> + + <mime-type type="application/xhtml+xml"> + <sub-class-of type="text/xml" /> + <glob pattern="*.xhtml" /> + <root-XML namespaceURI='http://www.w3.org/1999/xhtml' + localName='html' /> + </mime-type> + + <mime-type type="application/vnd.ms-powerpoint"> + <glob pattern="*.ppz" /> + <glob pattern="*.ppt" /> + <glob pattern="*.pps" /> + <glob pattern="*.pot" /> + <magic priority="50"> + <match value="0xcfd0e011" type="little32" offset="0" /> + </magic> + </mime-type> + + <mime-type type="application/vnd.ms-excel"> + <magic priority="50"> + <match value="Microsoft Excel 5.0 Worksheet" type="string" + offset="2080" /> + </magic> + <glob 
pattern="*.xls" /> + <glob pattern="*.xlc" /> + <glob pattern="*.xll" /> + <glob pattern="*.xlm" /> + <glob pattern="*.xlw" /> + <glob pattern="*.xla" /> + <glob pattern="*.xlt" /> + <glob pattern="*.xld" /> + <alias type="application/msexcel" /> + </mime-type> + + <mime-type type="application/vnd.oasis.opendocument.text"> + <glob pattern="*.odt" /> + </mime-type> + + + <mime-type type="application/zip"> + <alias type="application/x-zip-compressed" /> + <magic priority="40"> + <match value="PK\003\004" type="string" offset="0" /> + </magic> + <glob pattern="*.zip" /> + </mime-type> + + <mime-type type="application/vnd.oasis.opendocument.text"> + <glob pattern="*.oth" /> + </mime-type> + + <mime-type type="application/msword"> + <magic priority="50"> + <match value="\x31\xbe\x00\x00" type="string" offset="0" /> + <match value="PO^Q`" type="string" offset="0" /> + <match value="\376\067\0\043" type="string" offset="0" /> + <match value="\333\245-\0\0\0" type="string" offset="0" /> + <match value="Microsoft Word 6.0 Document" type="string" + offset="2080" /> + <match value="Microsoft Word document data" type="string" + offset="2112" /> + </magic> + <glob pattern="*.doc" /> + <alias type="application/vnd.ms-word" /> + </mime-type> + + <mime-type type="application/octet-stream"> + <magic priority="50"> + <match value="\037\036" type="string" offset="0" /> + <match value="017437" type="host16" offset="0" /> + <match value="0x1fff" type="host16" offset="0" /> + <match value="\377\037" type="string" offset="0" /> + <match value="0145405" type="host16" offset="0" /> + </magic> + <glob pattern="*.bin" /> + </mime-type> + + <mime-type type="application/pdf"> + <magic priority="50"> + <match value="%PDF-" type="string" offset="0" /> + </magic> + <glob pattern="*.pdf" /> + <alias type="application/x-pdf" /> + </mime-type> + + <mime-type type="application/atom+xml"> + <root-XML localName="feed" + namespaceURI="http://purl.org/atom/ns#" /> + </mime-type> + + <mime-type 
type="application/mac-binhex40"> + <glob pattern="*.hqx" /> + </mime-type> + + <mime-type type="application/mac-compactpro"> + <glob pattern="*.cpt" /> + </mime-type> + + <mime-type type="application/rtf"> + <alias type="text/rtf" /> + </mime-type> + + <mime-type type="application/rss+xml"> + <alias type="text/rss" /> + <root-XML localName="rss" /> + <root-XML namespaceURI="http://purl.org/rss/1.0/" /> + <glob pattern="*.rss" /> + </mime-type> + + <!-- added in by mattmann --> + <mime-type type="application/xml"> + <alias type="text/xml" /> + <glob pattern="*.xml" /> + </mime-type> + + <mime-type type="application/x-mif"> + <alias type="application/vnd.mif" /> + </mime-type> + + <mime-type type="application/vnd.wap.wbxml"> + <glob pattern="*.wbxml" /> + </mime-type> + + <mime-type type="application/vnd.wap.wmlc"> + <_comment>Compiled WML Document</_comment> + <glob pattern="*.wmlc" /> + </mime-type> + + <mime-type type="application/vnd.wap.wmlscriptc"> + <_comment>Compiled WML Script</_comment> + <glob pattern="*.wmlsc" /> + </mime-type> + + <mime-type type="text/vnd.wap.wmlscript"> + <_comment>WML Script</_comment> + <glob pattern="*.wmls" /> + </mime-type> + + <mime-type type="application/x-bzip"> + <alias type="application/x-bzip2" /> + </mime-type> + + <mime-type type="application/x-bzip-compressed-tar"> + <glob pattern="*.tbz" /> + <glob pattern="*.tbz2" /> + </mime-type> + + <mime-type type="application/x-cdlink"> + <_comment>Virtual CD-ROM CD Image File</_comment> + <glob pattern="*.vcd" /> + </mime-type> + + <mime-type type="application/x-director"> + <_comment>Shockwave Movie</_comment> + <glob pattern="*.dcr" /> + <glob pattern="*.dir" /> + <glob pattern="*.dxr" /> + </mime-type> + + <mime-type type="application/x-futuresplash"> + <_comment>Macromedia FutureSplash File</_comment> + <glob pattern="*.spl" /> + </mime-type> + + <mime-type type="application/x-java"> + <alias type="application/java" /> + </mime-type> + + <mime-type type="application/x-koan"> + 
<_comment>SSEYO Koan File</_comment> + <glob pattern="*.skp" /> + <glob pattern="*.skd" /> + <glob pattern="*.skt" /> + <glob pattern="*.skm" /> + </mime-type> + + <mime-type type="application/x-latex"> + <_comment>LaTeX Source Document</_comment> + <glob pattern="*.latex" /> + </mime-type> + + <!-- JC CHANGED + <mime-type type="application/x-mif"> + <_comment>FrameMaker MIF document</_comment> + <glob pattern="*.mif"/> + </mime-type> --> + + <mime-type type="application/x-ms-dos-executable"> + <alias type="application/x-dosexec;exe" /> + </mime-type> + + <mime-type type="application/ogg"> + <alias type="application/x-ogg" /> + </mime-type> + + <mime-type type="application/x-rar"> + <alias type="application/x-rar-compressed" /> + </mime-type> + + <mime-type type="application/x-shellscript"> + <alias type="application/x-sh" /> + </mime-type> + + <mime-type type="application/xhtml+xml"> + <glob pattern="*.xht" /> + </mime-type> + + <mime-type type="audio/midi"> + <glob pattern="*.kar" /> + </mime-type> + + <mime-type type="audio/x-pn-realaudio"> + <alias type="audio/x-realaudio" /> + </mime-type> + + <mime-type type="image/tiff"> + <magic priority="50"> + <match value="0x4d4d2a00" type="string" offset="0" /> + <match value="0x49492a00" type="string" offset="0" /> + </magic> + </mime-type> + + <mime-type type="message/rfc822"> + <magic priority="50"> + <match type="string" value="Relay-Version:" offset="0" /> + <match type="string" value="#! rnews" offset="0" /> + <match type="string" value="N#! 
rnews" offset="0" /> + <match type="string" value="Forward to" offset="0" /> + <match type="string" value="Pipe to" offset="0" /> + <match type="string" value="Return-Path:" offset="0" /> + <match type="string" value="From:" offset="0" /> + <match type="string" value="Message-ID:" offset="0" /> + <match type="string" value="Date:" offset="0" /> + </magic> + </mime-type> + + <mime-type type="image/vnd.wap.wbmp"> + <_comment>Wireless Bitmap File Format</_comment> + <glob pattern="*.wbmp" /> + </mime-type> + + <mime-type type="image/x-psd"> + <alias type="image/photoshop" /> + </mime-type> + + <mime-type type="image/x-xcf"> + <alias type="image/xcf" /> + <magic priority="50"> + <match type="string" value="gimp xcf " offset="0" /> + </magic> + </mime-type> + + <mime-type type="model/iges"> + <_comment> + Initial Graphics Exchange Specification Format + </_comment> + <glob pattern="*.igs" /> + <glob pattern="*.iges" /> + </mime-type> + + <mime-type type="model/mesh"> + <glob pattern="*.msh" /> + <glob pattern="*.mesh" /> + <glob pattern="*.silo" /> + </mime-type> + + <mime-type type="model/vrml"> + <glob pattern="*.vrml" /> + </mime-type> + + <mime-type type="text/x-tcl"> + <alias type="application/x-tcl" /> + </mime-type> + + <mime-type type="text/x-tex"> + <alias type="application/x-tex" /> + </mime-type> + + <mime-type type="text/x-texinfo"> + <alias type="application/x-texinfo" /> + </mime-type> + + <mime-type type="text/x-troff-me"> + <alias type="application/x-troff-me" /> + </mime-type> + + <mime-type type="video/vnd.mpegurl"> + <glob pattern="*.mxu" /> + </mime-type> + + <mime-type type="x-conference/x-cooltalk"> + <_comment>Cooltalk Audio</_comment> + <glob pattern="*.ice" /> + </mime-type> + +</mime-info> Added: incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeUtils.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeUtils.java?rev=578161&view=auto 
============================================================================== --- incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeUtils.java (added) +++ incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeUtils.java Fri Sep 21 08:07:58 2007 @@ -0,0 +1,92 @@ +//Copyright (c) 2007, California Institute of Technology. +//ALL RIGHTS RESERVED. U.S. Government sponsorship acknowledged. +// +//$Id$ + +package org.apache.tika.mime; + +//JDK imports +import java.io.File; +import java.net.MalformedURLException; +import java.net.URL; + +//Tika imports +import org.apache.tika.metadata.TikaMimeKeys; +import org.apache.tika.utils.Configuration; + +//Junit imports +import junit.framework.TestCase; + +/** + * @author mattmann + * @version $Revision$ + * + * <p> + * Test Suite for the [EMAIL PROTECTED] MimeTypes} repository. + * </p>. + */ +public class TestMimeUtils extends TestCase implements TikaMimeKeys { + + private static final String tikaMimeFile = "org/apache/tika/mime/tika-mimetypes.xml"; + + private Configuration conf; + + private static URL u; + + static { + try { + u = new URL("http://mydomain.com/x.pdf?x=y"); + } catch (MalformedURLException e) { + fail(e.getMessage()); + } + } + + private static final File f = new File("/a/b/c/x.pdf"); + + private MimeUtils utils; + + public TestMimeUtils() { + Configuration conf = new Configuration(); + conf.set(TIKA_MIME_FILE, tikaMimeFile); + utils = new MimeUtils(conf); + assertNotNull(utils); + } + + public void testLoadMimeTypes() { + assertNotNull(utils.getRepository().forName("application/octet-stream")); + assertNotNull(utils.getRepository().forName("text/x-tex")); + } + + public void testGuessMimeTypes() { + + assertEquals("application/pdf", utils.getRepository().getMimeType( + "x.pdf").getName()); + assertEquals("application/pdf", utils.getRepository().getMimeType(u) + .getName()); + assertEquals("application/pdf", utils.getRepository().getMimeType(f) + .getName()); + assertEquals("text/plain", 
utils.getRepository().getMimeType("x.txt") + .getName()); + assertEquals("text/html", utils.getRepository().getMimeType("x.htm") + .getName()); + assertEquals("text/html", utils.getRepository().getMimeType("x.html") + .getName()); + assertEquals("application/xhtml+xml", utils.getRepository() + .getMimeType("x.xhtml").getName()); + assertEquals("application/xml", utils.getRepository().getMimeType( + "x.xml").getName()); + assertEquals("application/msword", utils.getRepository().getMimeType( + "x.doc").getName()); + assertEquals("application/vnd.ms-powerpoint", utils.getRepository() + .getMimeType("x.ppt").getName()); + assertEquals("application/vnd.ms-excel", utils.getRepository() + .getMimeType("x.xls").getName()); + assertEquals("application/zip", utils.getRepository().getMimeType( + "x.zip").getName()); + assertEquals("application/vnd.oasis.opendocument.text", utils + .getRepository().getMimeType("x.odt").getName()); + assertEquals("application/octet-stream", utils.getRepository() + .getMimeType("x.xyz").getName()); + } + +}
