Author: tallison
Date: Fri Jan 23 19:55:51 2015
New Revision: 1654351
URL: http://svn.apache.org/r1654351
Log:
TIKA-1529: turn forbidden-apis back on and clean up all mentions of UTF-8
Modified:
tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
tika/trunk/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java
tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java
tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java
tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java
tika/trunk/tika-core/src/main/java/org/apache/tika/io/IOUtils.java
tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
tika/trunk/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java
tika/trunk/tika-core/src/test/java/org/apache/tika/io/TailStreamTest.java
tika/trunk/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java
tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java
tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageProfilerBuilderTest.java
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
tika/trunk/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java
tika/trunk/tika-core/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java
tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
tika/trunk/tika-parent/pom.xml
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/gdal/GDALParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mat/MatParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/video/FLVParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmBlockInfo.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmItspHeader.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmLzxState.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmLzxcControlData.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmLzxcResetTable.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestPmglHeader.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/MpegStreamTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/CSVMessageBodyWriter.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/HTMLHelper.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/JSONMessageBodyWriter.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataListMessageBodyWriter.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TextMessageBodyWriter.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/UnpackerResource.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/XMPMessageBodyWriter.java
tika/trunk/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
tika/trunk/tika-server/src/test/java/org/apache/tika/server/MetadataResourceTest.java
tika/trunk/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
tika/trunk/tika-translate/src/main/java/org/apache/tika/language/translate/GoogleTranslator.java
tika/trunk/tika-translate/src/main/java/org/apache/tika/language/translate/Lingo24Translator.java
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
(original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Fri Jan
23 19:55:51 2015
@@ -727,7 +727,7 @@ public class TikaCLI {
} else if (System.getProperty("os.name")
.toLowerCase(Locale.ROOT).startsWith("mac os x")) {
// TIKA-324: Override the default encoding on Mac OS X
- return new OutputStreamWriter(output, "UTF-8");
+ return new OutputStreamWriter(output, IOUtils.UTF_8);
} else {
return new OutputStreamWriter(output, Charset.defaultCharset());
}
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
(original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java Fri Jan
23 19:55:51 2015
@@ -459,7 +459,7 @@ public class TikaGUI extends JFrame
InputStream stream = url.openStream();
try {
StringWriter writer = new StringWriter();
- IOUtils.copy(stream, writer, "UTF-8");
+ IOUtils.copy(stream, writer, IOUtils.UTF_8.name());
JEditorPane editor =
new JEditorPane("text/plain", writer.toString());
Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
--- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
(original)
+++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Fri
Jan 23 19:55:51 2015
@@ -16,20 +16,20 @@
*/
package org.apache.tika.cli;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.PrintStream;
import java.net.URI;
-
import org.apache.commons.io.FileUtils;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
/**
* Tests the Tika's cli
*/
@@ -49,7 +49,7 @@ public class TikaCLITest {
outContent = new ByteArrayOutputStream();
resourcePrefix = testDataURI.toString();
stdout = System.out;
- System.setOut(new PrintStream(outContent, true, "UTF-8"));
+ System.setOut(new PrintStream(outContent, true, IOUtils.UTF_8.name()));
}
/**
@@ -73,7 +73,7 @@ public class TikaCLITest {
public void testListParserDetail() throws Exception{
String[] params = {"--list-parser-detail"};
TikaCLI.main(params);
-
assertTrue(outContent.toString("UTF-8").contains("application/vnd.oasis.opendocument.text-web"));
+
assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("application/vnd.oasis.opendocument.text-web"));
}
/**
@@ -98,7 +98,7 @@ public class TikaCLITest {
public void testXMLOutput() throws Exception{
String[] params = {"-x", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
- assertTrue(outContent.toString("UTF-8").contains("?xml version=\"1.0\"
encoding=\"UTF-8\"?"));
+ assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("?xml
version=\"1.0\" encoding=\"UTF-8\"?"));
}
/**
@@ -112,7 +112,7 @@ public class TikaCLITest {
TikaCLI.main(params);
assertTrue(outContent.toString("UTF-8").contains("html
xmlns=\"http://www.w3.org/1999/xhtml"));
assertTrue("Expanded <title></title> element should be present",
- outContent.toString("UTF-8").contains("<title></title>"));
+
outContent.toString(IOUtils.UTF_8.name()).contains("<title></title>"));
}
/**
@@ -124,7 +124,7 @@ public class TikaCLITest {
public void testTextOutput() throws Exception{
String[] params = {"-t", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
- assertTrue(outContent.toString("UTF-8").contains("finished off the
cake"));
+
assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("finished off the
cake"));
}
/**
@@ -135,7 +135,7 @@ public class TikaCLITest {
public void testMetadataOutput() throws Exception{
String[] params = {"-m", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
- assertTrue(outContent.toString("UTF-8").contains("text/plain"));
+
assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("text/plain"));
}
/**
@@ -147,7 +147,7 @@ public class TikaCLITest {
public void testJsonMetadataOutput() throws Exception {
String[] params = {"--json", resourcePrefix +
"testJsonMultipleInts.html"};
TikaCLI.main(params);
- String json = outContent.toString("UTF-8");
+ String json = outContent.toString(IOUtils.UTF_8.name());
//TIKA-1310
assertTrue(json.contains("\"fb:admins\":\"1,2,3,4\","));
@@ -168,7 +168,7 @@ public class TikaCLITest {
public void testJsonMetadataPrettyPrintOutput() throws Exception {
String[] params = {"--json", "-r", resourcePrefix +
"testJsonMultipleInts.html"};
TikaCLI.main(params);
- String json = outContent.toString("UTF-8");
+ String json = outContent.toString(IOUtils.UTF_8.name());
assertTrue(json.contains(" \"X-Parsed-By\": [\n" +
" \"org.apache.tika.parser.DefaultParser\",\n" +
@@ -191,7 +191,7 @@ public class TikaCLITest {
public void testLanguageOutput() throws Exception{
String[] params = {"-l", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
- assertTrue(outContent.toString("UTF-8").contains("en"));
+ assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("en"));
}
/**
@@ -203,7 +203,7 @@ public class TikaCLITest {
public void testDetectOutput() throws Exception{
String[] params = {"-d", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
- assertTrue(outContent.toString("UTF-8").contains("text/plain"));
+
assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("text/plain"));
}
/**
@@ -215,7 +215,7 @@ public class TikaCLITest {
public void testListMetModels() throws Exception{
String[] params = {"--list-met-models", resourcePrefix +
"alice.cli.test"};
TikaCLI.main(params);
- assertTrue(outContent.toString("UTF-8").contains("text/plain"));
+
assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("text/plain"));
}
/**
@@ -227,7 +227,7 @@ public class TikaCLITest {
public void testListSupportedTypes() throws Exception{
String[] params = {"--list-supported-types", resourcePrefix +
"alice.cli.test"};
TikaCLI.main(params);
- assertTrue(outContent.toString("UTF-8").contains("supertype:
application/octet-stream"));
+
assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("supertype:
application/octet-stream"));
}
/**
@@ -300,7 +300,7 @@ public class TikaCLITest {
public void testMultiValuedMetadata() throws Exception {
String[] params = {"-m", resourcePrefix +
"testMultipleSheets.numbers"};
TikaCLI.main(params);
- String content = outContent.toString("UTF-8");
+ String content = outContent.toString(IOUtils.UTF_8.name());
assertTrue(content.contains("sheetNames: Checking"));
assertTrue(content.contains("sheetNames: Secon sheet"));
assertTrue(content.contains("sheetNames: Logical Sheet 3"));
@@ -314,7 +314,7 @@ public class TikaCLITest {
new File("subdir/foo.txt").delete();
new File("subdir").delete();
TikaCLI.main(params);
- String content = outContent.toString("UTF-8");
+ String content = outContent.toString(IOUtils.UTF_8.name());
assertTrue(content.contains("Extracting 'subdir/foo.txt'"));
// clean up. TODO: These should be in target.
new File("target/subdir/foo.txt").delete();
@@ -340,7 +340,7 @@ public class TikaCLITest {
public void testConfig() throws Exception {
String[] params = new
String[]{"--config="+testDataFile.toString()+"/tika-config1.xml",
resourcePrefix+"bad_xml.xml"};
TikaCLI.main(params);
- String content = outContent.toString("UTF-8");
+ String content = outContent.toString(IOUtils.UTF_8.name());
assertTrue(content.contains("apple"));
assertTrue(content.contains("org.apache.tika.parser.html.HtmlParser"));
}
@@ -349,7 +349,7 @@ public class TikaCLITest {
public void testJsonRecursiveMetadataParserMetadataOnly() throws Exception
{
String[] params = new String[]{"-m", "-J", "-r",
resourcePrefix+"test_recursive_embedded.docx"};
TikaCLI.main(params);
- String content = outContent.toString("UTF-8");
+ String content = outContent.toString(IOUtils.UTF_8.name());
assertTrue(content.contains("[\n" +
" {\n" +
" \"Application-Name\": \"Microsoft Office Word\",\n" +
@@ -365,7 +365,7 @@ public class TikaCLITest {
public void testJsonRecursiveMetadataParserDefault() throws Exception {
String[] params = new String[]{"-J", "-r",
resourcePrefix+"test_recursive_embedded.docx"};
TikaCLI.main(params);
- String content = outContent.toString("UTF-8");
+ String content = outContent.toString(IOUtils.UTF_8.name());
assertTrue(content.contains("\"X-TIKA:content\": \"\\u003chtml
xmlns\\u003d\\\"http://www.w3.org/1999/xhtml"));
}
@@ -373,7 +373,7 @@ public class TikaCLITest {
public void testJsonRecursiveMetadataParserText() throws Exception {
String[] params = new String[]{"-J", "-r", "-t",
resourcePrefix+"test_recursive_embedded.docx"};
TikaCLI.main(params);
- String content = outContent.toString("UTF-8");
+ String content = outContent.toString(IOUtils.UTF_8.name());
assertTrue(content.contains("\\n\\nembed_4\\n"));
assertTrue(content.contains("\\n\\nembed_0"));
}
Modified:
tika/trunk/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
--- tika/trunk/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java
(original)
+++ tika/trunk/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java
Fri Jan 23 19:55:51 2015
@@ -41,6 +41,7 @@ import org.apache.tika.config.TikaConfig
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.fork.ForkParser;
+import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
@@ -101,7 +102,7 @@ public class BundleIT {
ForkParser parser = (ForkParser)
bc.getService(bc.getServiceReference(ForkParser.class.getName()));
ClassLoader classLoader = parser.getClass().getClassLoader();
String data = "<!DOCTYPE html>\n<html><body><p>test
<span>content</span></p></body></html>";
- InputStream stream = new ByteArrayInputStream(data.getBytes("UTF-8"));
+ InputStream stream = new
ByteArrayInputStream(data.getBytes(IOUtils.UTF_8));
Writer writer = new StringWriter();
ContentHandler contentHandler = new BodyContentHandler(writer);
Metadata metadata = new Metadata();
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
(original)
+++
tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
Fri Jan 23 19:55:51 2015
@@ -29,6 +29,7 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
+import org.apache.tika.io.IOUtils;
/**
* Internal utility class that Tika uses to look up service providers.
@@ -329,7 +330,7 @@ public class ServiceLoader {
InputStream stream = resource.openStream();
try {
BufferedReader reader =
- new BufferedReader(new InputStreamReader(stream, "UTF-8"));
+ new BufferedReader(new InputStreamReader(stream,
IOUtils.UTF_8));
String line = reader.readLine();
while (line != null) {
line = COMMENT.matcher(line).replaceFirst("");
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
(original)
+++
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
Fri Jan 23 19:55:51 2015
@@ -19,14 +19,13 @@ package org.apache.tika.detect;
import java.io.CharArrayWriter;
import java.io.IOException;
import java.io.InputStream;
-import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-
+import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -99,11 +98,7 @@ public class MagicDetector implements De
} else if (type.equals("stringignorecase")) {
decoded = decodeString(value.toLowerCase(Locale.ROOT), type);
} else if (type.equals("byte")) {
- try {
- decoded = tmpVal.getBytes("UTF-8");
- } catch (UnsupportedEncodingException e) {
- throw new AssertionError("UTF-8 not supported.");
- }
+ decoded = tmpVal.getBytes(IOUtils.UTF_8);
} else if (type.equals("host16") || type.equals("little16")) {
int i = Integer.parseInt(tmpVal, radix);
decoded = new byte[] { (byte) (i & 0x00FF), (byte) (i >> 8) };
@@ -399,7 +394,7 @@ public class MagicDetector implements De
flags = Pattern.CASE_INSENSITIVE;
}
- Pattern p = Pattern.compile(new String(this.pattern, "UTF-8"),
flags);
+ Pattern p = Pattern.compile(new String(this.pattern,
IOUtils.UTF_8), flags);
ByteBuffer bb = ByteBuffer.wrap(buffer);
CharBuffer result = ISO_8859_1.decode(bb);
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java
Fri Jan 23 19:55:51 2015
@@ -22,6 +22,7 @@ import java.net.URLDecoder;
import java.util.Map;
import java.util.regex.Pattern;
+import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -119,7 +120,7 @@ public class NameDetector implements Det
int percent = name.indexOf('%');
if (percent != -1) {
try {
- name = URLDecoder.decode(name, "UTF-8");
+ name = URLDecoder.decode(name, IOUtils.UTF_8.name());
} catch (UnsupportedEncodingException e) {
throw new IllegalStateException("UTF-8 not supported", e);
}
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java
(original)
+++
tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java
Fri Jan 23 19:55:51 2015
@@ -413,7 +413,7 @@ public class ExternalEmbedder implements
if (process.exitValue() != 0) {
throw new TikaException("There was an error executing the
command line" +
"\nExecutable Command:\n\n" + cmd +
- "\nExecutable Error:\n\n" +
stdErrOutputStream.toString("UTF-8"));
+ "\nExecutable Error:\n\n" +
stdErrOutputStream.toString(IOUtils.UTF_8.name()));
}
}
}
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java Fri
Jan 23 19:55:51 2015
@@ -263,7 +263,7 @@ class ForkClient {
String manifest =
"Main-Class: " + ForkServer.class.getName() + "\n";
jar.putNextEntry(new ZipEntry("META-INF/MANIFEST.MF"));
- jar.write(manifest.getBytes("UTF-8"));
+ jar.write(manifest.getBytes(IOUtils.UTF_8));
Class<?>[] bootstrap = {
ForkServer.class, ForkObjectInputStream.class,
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/io/IOUtils.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/io/IOUtils.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/io/IOUtils.java
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/io/IOUtils.java Fri Jan
23 19:55:51 2015
@@ -28,9 +28,9 @@ import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StringWriter;
-import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.nio.channels.Channel;
+import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
@@ -77,6 +77,9 @@ import java.util.List;
*/
public class IOUtils {
+ //TODO: switch to StandardCharsets when we move to Java 1.7
+ public static final Charset UTF_8 = Charset.forName("UTF-8");
+
/**
* The default buffer size to use.
*/
@@ -255,7 +258,7 @@ public class IOUtils {
*/
@Deprecated
public static byte[] toByteArray(String input) throws IOException {
- return input.getBytes("UTF-8");
+ return input.getBytes(IOUtils.UTF_8);
}
// read char[]
@@ -393,7 +396,7 @@ public class IOUtils {
*/
@Deprecated
public static String toString(byte[] input) throws IOException {
- return new String(input, "UTF-8");
+ return new String(input, IOUtils.UTF_8);
}
/**
@@ -415,7 +418,7 @@ public class IOUtils {
throws IOException {
// If no encoding is specified, default to UTF-8.
if (encoding == null) {
- return new String(input, "UTF-8");
+ return new String(input, IOUtils.UTF_8);
} else {
return new String(input, encoding);
}
@@ -437,7 +440,7 @@ public class IOUtils {
* @since Commons IO 1.1
*/
public static List<String> readLines(InputStream input) throws IOException
{
- InputStreamReader reader = new InputStreamReader(input, "UTF-8");
+ InputStreamReader reader = new InputStreamReader(input, IOUtils.UTF_8);
return readLines(reader);
}
@@ -531,13 +534,8 @@ public class IOUtils {
* @since Commons IO 1.1
*/
public static InputStream toInputStream(String input) {
- try {
- byte[] bytes = input.getBytes("UTF-8");
- return new ByteArrayInputStream(bytes);
- } catch (UnsupportedEncodingException e) {
- throw new AssertionError("UTF-8 not supported.");
- }
-
+ byte[] bytes = input.getBytes(IOUtils.UTF_8);
+ return new ByteArrayInputStream(bytes);
}
/**
@@ -554,7 +552,7 @@ public class IOUtils {
* @since Commons IO 1.1
*/
public static InputStream toInputStream(String input, String encoding)
throws IOException {
- byte[] bytes = encoding != null ? input.getBytes(encoding) :
input.getBytes("UTF-8");
+ byte[] bytes = encoding != null ? input.getBytes(encoding) :
input.getBytes(IOUtils.UTF_8);
return new ByteArrayInputStream(bytes);
}
@@ -592,7 +590,7 @@ public class IOUtils {
*/
public static void write(byte[] data, Writer output) throws IOException {
if (data != null) {
- output.write(new String(data, "UTF-8"));
+ output.write(new String(data, IOUtils.UTF_8));
}
}
@@ -660,7 +658,7 @@ public class IOUtils {
public static void write(char[] data, OutputStream output)
throws IOException {
if (data != null) {
- output.write(new String(data).getBytes("UTF-8"));
+ output.write(new String(data).getBytes(IOUtils.UTF_8));
}
}
@@ -786,7 +784,7 @@ public class IOUtils {
public static void write(String data, OutputStream output)
throws IOException {
if (data != null) {
- output.write(data.getBytes("UTF-8"));
+ output.write(data.getBytes(IOUtils.UTF_8));
}
}
@@ -855,7 +853,7 @@ public class IOUtils {
public static void write(StringBuffer data, OutputStream output)
throws IOException {
if (data != null) {
- output.write(data.toString().getBytes("UTF-8"));
+ output.write(data.toString().getBytes(IOUtils.UTF_8));
}
}
@@ -961,7 +959,7 @@ public class IOUtils {
*/
public static void copy(InputStream input, Writer output)
throws IOException {
- InputStreamReader in = new InputStreamReader(input, "UTF-8");
+ InputStreamReader in = new InputStreamReader(input, IOUtils.UTF_8);
copy(in, output);
}
@@ -1068,7 +1066,7 @@ public class IOUtils {
*/
public static void copy(Reader input, OutputStream output)
throws IOException {
- OutputStreamWriter out = new OutputStreamWriter(output, "UTF-8");
+ OutputStreamWriter out = new OutputStreamWriter(output, IOUtils.UTF_8);
copy(input, out);
// XXX Unless anyone is planning on rewriting OutputStreamWriter, we
// have to flush here.
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
(original)
+++
tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
Fri Jan 23 19:55:51 2015
@@ -25,6 +25,8 @@ import java.util.Map;
import java.util.Properties;
import java.util.Set;
+import org.apache.tika.io.IOUtils;
+
/**
* Identifier of the language that best matches a given content profile.
* The content profile is compared to generic language profiles based on
@@ -44,7 +46,6 @@ public class LanguageIdentifier {
private static final Map<String, LanguageProfile> PROFILES =
new HashMap<String, LanguageProfile>();
private static final String PROFILE_SUFFIX = ".ngp";
- private static final String PROFILE_ENCODING = "UTF-8";
private static Properties props = new Properties();
private static String errors = "";
@@ -76,7 +77,7 @@ public class LanguageIdentifier {
LanguageIdentifier.class.getResourceAsStream(language +
PROFILE_SUFFIX);
try {
BufferedReader reader =
- new BufferedReader(new InputStreamReader(stream,
PROFILE_ENCODING));
+ new BufferedReader(new InputStreamReader(stream,
IOUtils.UTF_8));
String line = reader.readLine();
while (line != null) {
if (line.length() > 0 && !line.startsWith("#")) {
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java
(original)
+++
tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java
Fri Jan 23 19:55:51 2015
@@ -33,8 +33,9 @@ import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
-import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
/**
* This class runs a ngram analysis over submitted text, results might be used
* for automatic language identification.
@@ -341,7 +342,7 @@ public class LanguageProfilerBuilder {
ngrams.clear();
ngramcounts = new int[maxLength + 1];
- BufferedReader reader = new BufferedReader(new InputStreamReader(is,
"UTF-8"));
+ BufferedReader reader = new BufferedReader(new InputStreamReader(is,
IOUtils.UTF_8));
String line = null;
while ((line = reader.readLine()) != null) {
@@ -405,7 +406,7 @@ public class LanguageProfilerBuilder {
*/
public void save(OutputStream os) throws IOException {
os.write(("# NgramProfile generated at " + new Date() +
- " for Apache Tika Language
Identification\n").getBytes("UTF-8"));
+ " for Apache Tika Language
Identification\n").getBytes(IOUtils.UTF_8));
// And then each ngram
@@ -432,7 +433,7 @@ public class LanguageProfilerBuilder {
for (int i = 0; i < list.size(); i++) {
NGramEntry e = list.get(i);
String line = e.toString() + " " + e.getCount() + "\n";
- os.write(line.getBytes("UTF-8"));
+ os.write(line.getBytes(IOUtils.UTF_8));
}
os.flush();
}
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
(original)
+++
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
Fri Jan 23 19:55:51 2015
@@ -24,7 +24,6 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
-import java.io.UnsupportedEncodingException;
import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
@@ -232,7 +231,7 @@ public class ExternalParser extends Abst
*/
private void extractOutput(InputStream stream, XHTMLContentHandler xhtml)
throws SAXException, IOException {
- Reader reader = new InputStreamReader(stream, "UTF-8");
+ Reader reader = new InputStreamReader(stream, IOUtils.UTF_8);
try {
xhtml.startDocument();
xhtml.startElement("p");
@@ -293,11 +292,7 @@ public class ExternalParser extends Abst
new Thread() {
public void run() {
BufferedReader reader;
- try {
- reader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
- } catch (UnsupportedEncodingException e) {
- throw new AssertionError("UTF-8 not supported.");
- }
+ reader = new BufferedReader(new InputStreamReader(stream, IOUtils.UTF_8));
try {
String line;
while ( (line = reader.readLine()) != null ) {
Modified:
tika/trunk/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java
(original)
+++
tika/trunk/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java
Fri Jan 23 19:55:51 2015
@@ -21,6 +21,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
+import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.junit.Test;
@@ -54,8 +55,8 @@ public class TextDetectorTest {
@Test
public void testDetectText() throws Exception {
- assertText("Hello, World!".getBytes("UTF-8"));
- assertText(" \t\r\n".getBytes("UTF-8"));
+ assertText("Hello, World!".getBytes(IOUtils.UTF_8));
+ assertText(" \t\r\n".getBytes(IOUtils.UTF_8));
assertNotText(new byte[] { -1, -2, -3, 0x09, 0x0A, 0x0C, 0x0D, 0x1B });
assertNotText(new byte[] { 0 });
assertNotText(new byte[] { 'H', 'e', 'l', 'l', 'o', 0 });
Modified:
tika/trunk/tika-core/src/test/java/org/apache/tika/io/TailStreamTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/io/TailStreamTest.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/io/TailStreamTest.java
(original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/io/TailStreamTest.java
Fri Jan 23 19:55:51 2015
@@ -23,7 +23,6 @@ import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
-import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import org.junit.Test;
@@ -69,11 +68,7 @@ public class TailStreamTest
*/
private static InputStream generateStream(int from, int length)
{
- try {
- return new ByteArrayInputStream(generateText(from, length).getBytes("UTF-8"));
- } catch (UnsupportedEncodingException e) {
- throw new AssertionError("UTF-8 not supported.");
- }
+ return new ByteArrayInputStream(generateText(from, length).getBytes(IOUtils.UTF_8));
}
/**
@@ -128,7 +123,7 @@ public class TailStreamTest
TailStream stream = new TailStream(generateStream(0, 2 * count),
count);
readStream(stream);
assertEquals("Wrong buffer", generateText(count, count), new String(
- stream.getTail(), "UTF-8"));
+ stream.getTail(), IOUtils.UTF_8));
}
/**
@@ -149,7 +144,7 @@ public class TailStreamTest
read = stream.read(buf);
}
assertEquals("Wrong buffer", generateText(count - tailSize, tailSize),
- new String(stream.getTail(), "UTF-8"));
+ new String(stream.getTail(), IOUtils.UTF_8));
stream.close();
}
@@ -169,7 +164,7 @@ public class TailStreamTest
stream.reset();
readStream(stream);
assertEquals("Wrong buffer", generateText(tailSize, tailSize),
- new String(stream.getTail(), "UTF-8"));
+ new String(stream.getTail(), IOUtils.UTF_8));
}
/**
@@ -185,7 +180,7 @@ public class TailStreamTest
byte[] buf = new byte[count];
stream.read(buf);
assertEquals("Wrong buffer", generateText(count - tailSize, tailSize),
- new String(stream.getTail(), "UTF-8"));
+ new String(stream.getTail(), IOUtils.UTF_8));
stream.close();
}
@@ -202,7 +197,7 @@ public class TailStreamTest
assertEquals("Wrong skip result", skipCount, stream.skip(skipCount));
assertEquals("Wrong buffer",
generateText(skipCount - tailSize, tailSize),
- new String(stream.getTail(), "UTF-8"));
+ new String(stream.getTail(), IOUtils.UTF_8));
stream.close();
}
@@ -216,7 +211,7 @@ public class TailStreamTest
TailStream stream = new TailStream(generateStream(0, count), 2 *
count);
assertEquals("Wrong skip result", count, stream.skip(2 * count));
assertEquals("Wrong buffer", generateText(0, count),
- new String(stream.getTail(), "UTF-8"));
+ new String(stream.getTail(), IOUtils.UTF_8));
stream.close();
}
Modified:
tika/trunk/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java
(original)
+++
tika/trunk/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java
Fri Jan 23 19:55:51 2015
@@ -16,6 +16,10 @@
*/
package org.apache.tika.io;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
@@ -27,11 +31,7 @@ import java.io.OutputStream;
import java.net.URL;
import org.apache.tika.metadata.Metadata;
-
import org.junit.Test;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
public class TikaInputStreamTest {
@@ -62,7 +62,7 @@ public class TikaInputStreamTest {
@Test
public void testStreamBased() throws IOException {
InputStream input =
- new ByteArrayInputStream("Hello, World!".getBytes("UTF-8"));
+ new ByteArrayInputStream("Hello, World!".getBytes(IOUtils.UTF_8));
InputStream stream = TikaInputStream.get(input);
File file = TikaInputStream.get(stream).getFile();
@@ -89,7 +89,7 @@ public class TikaInputStreamTest {
File file = File.createTempFile("tika-", ".tmp");
OutputStream stream = new FileOutputStream(file);
try {
- stream.write(data.getBytes("UTF-8"));
+ stream.write(data.getBytes(IOUtils.UTF_8));
} finally {
stream.close();
}
@@ -108,7 +108,7 @@ public class TikaInputStreamTest {
private String readStream(InputStream stream) throws IOException {
ByteArrayOutputStream buffer = new ByteArrayOutputStream();
IOUtils.copy(stream, buffer);
- return buffer.toString("UTF-8");
+ return buffer.toString(IOUtils.UTF_8.name());
}
@Test
Modified:
tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java
(original)
+++
tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java
Fri Jan 23 19:55:51 2015
@@ -16,16 +16,16 @@
*/
package org.apache.tika.language;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Writer;
import java.util.HashMap;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
import org.apache.tika.io.IOUtils;
import org.junit.Before;
import org.junit.Test;
@@ -139,7 +139,7 @@ public class LanguageIdentifierTest {
InputStream stream =
LanguageIdentifierTest.class.getResourceAsStream(language +
".test");
try {
- IOUtils.copy(new InputStreamReader(stream, "UTF-8"), writer);
+ IOUtils.copy(new InputStreamReader(stream, IOUtils.UTF_8), writer);
} finally {
stream.close();
}
Modified:
tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageProfilerBuilderTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageProfilerBuilderTest.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageProfilerBuilderTest.java
(original)
+++
tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageProfilerBuilderTest.java
Fri Jan 23 19:55:51 2015
@@ -17,6 +17,9 @@
package org.apache.tika.language;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
@@ -27,12 +30,10 @@ import java.io.InputStreamReader;
import java.net.URISyntaxException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
import org.junit.After;
import org.junit.Test;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
public class LanguageProfilerBuilderTest {
/* Test members */
private LanguageProfilerBuilder ngramProfile = null;
@@ -40,7 +41,6 @@ public class LanguageProfilerBuilderTest
private final String profileName =
"../tika-core/src/test/resources/org/apache/tika/language/langbuilder/"
+ LanguageProfilerBuilderTest.class.getName();
private final String corpusName = "langbuilder/welsh_corpus.txt";
- private final String encoding = "UTF-8";
private final String FILE_EXTENSION = "ngp";
private final String LANGUAGE = "welsh";
private final int maxlen = 1000;
@@ -50,7 +50,7 @@ public class LanguageProfilerBuilderTest
InputStream is =
LanguageProfilerBuilderTest.class.getResourceAsStream(corpusName);
try {
- ngramProfile = LanguageProfilerBuilder.create(profileName, is , encoding);
+ ngramProfile = LanguageProfilerBuilder.create(profileName, is , IOUtils.UTF_8.name());
} finally {
is.close();
}
@@ -82,7 +82,7 @@ public class LanguageProfilerBuilderTest
+ FILE_EXTENSION));
try {
BufferedReader reader = new BufferedReader(new InputStreamReader(
- stream, encoding));
+ stream, IOUtils.UTF_8));
String line = reader.readLine();
while (line != null) {
if (line.length() > 0 && !line.startsWith("#")) {// skips the
Modified:
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
(original)
+++
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
Fri Jan 23 19:55:51 2015
@@ -25,8 +25,8 @@ import java.io.InputStream;
import java.net.URL;
import org.apache.tika.config.TikaConfig;
+import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
-
import org.junit.Before;
import org.junit.Test;
@@ -85,7 +85,7 @@ public class MimeDetectionTest {
new ByteArrayInputStream("\ufefftest".getBytes("UTF-16BE")),
new Metadata()));
assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
- new ByteArrayInputStream("\ufefftest".getBytes("UTF-8")),
+ new ByteArrayInputStream("\ufefftest".getBytes(IOUtils.UTF_8)),
new Metadata()));
}
@@ -195,7 +195,7 @@ public class MimeDetectionTest {
@Test
public void testNotXML() throws IOException {
assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
- new ByteArrayInputStream("<!-- test -->".getBytes("UTF-8")),
+ new ByteArrayInputStream("<!-- test -->".getBytes(IOUtils.UTF_8)),
new Metadata()));
}
@@ -219,7 +219,7 @@ public class MimeDetectionTest {
*/
@Test
public void testMimeMagicClashSamePriority() throws IOException {
- byte[] helloWorld = "Hello, World!".getBytes("UTF-8");
+ byte[] helloWorld = "Hello, World!".getBytes(IOUtils.UTF_8);
MediaType helloType = MediaType.parse("hello/world-file");
MediaType helloXType = MediaType.parse("hello/x-world-hello");
Metadata metadata;
Modified:
tika/trunk/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java
(original)
+++
tika/trunk/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java
Fri Jan 23 19:55:51 2015
@@ -16,6 +16,10 @@
*/
package org.apache.tika.sax;
+import static junit.framework.Assert.assertFalse;
+import static junit.framework.Assert.assertTrue;
+import static org.junit.Assert.assertEquals;
+
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
@@ -23,6 +27,7 @@ import java.io.UnsupportedEncodingExcept
import java.util.Set;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
@@ -34,15 +39,12 @@ import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.DefaultHandler;
-import static junit.framework.Assert.assertFalse;
-import static junit.framework.Assert.assertTrue;
-import static org.junit.Assert.assertEquals;
-
/**
* Test cases for the {@link org.apache.tika.sax.BodyContentHandler} class.
*/
public class BasicContentHandlerFactoryTest {
- private static final String ENCODING = "UTF-8";
+
+ private static final String ENCODING = IOUtils.UTF_8.name();
//default max char len (at least in WriteOutContentHandler is 100k)
private static final int OVER_DEFAULT = 120000;
Modified:
tika/trunk/tika-core/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-core/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java
(original)
+++
tika/trunk/tika-core/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java
Fri Jan 23 19:55:51 2015
@@ -21,6 +21,7 @@ import static org.junit.Assert.assertEqu
import java.io.ByteArrayOutputStream;
import java.io.OutputStream;
+import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
import org.junit.Test;
@@ -45,7 +46,7 @@ public class BodyContentHandlerTest {
xhtml.element("p", "Test text");
xhtml.endDocument();
- assertEquals("Test text\n", buffer.toString("UTF-8"));
+ assertEquals("Test text\n", buffer.toString(IOUtils.UTF_8.name()));
}
}
Modified:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
(original)
+++
tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
Fri Jan 23 19:55:51 2015
@@ -16,20 +16,6 @@ package org.apache.tika.example;
* limitations under the License.
*/
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.detect.DefaultDetector;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.language.translate.DefaultTranslator;
-import org.apache.tika.language.translate.Translator;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.CompositeParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.w3c.dom.Node;
-
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
@@ -37,18 +23,35 @@ import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
+
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.io.Writer;
+import java.nio.charset.Charset;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.language.translate.DefaultTranslator;
+import org.apache.tika.language.translate.Translator;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+
/**
* This class shows how to dump a TikaConfig object to a configuration file.
@@ -187,19 +190,19 @@ public class DumpTikaConfigExample {
*/
public static void main(String[] args) throws Exception {
- String encoding = "UTF-8";
+ Charset encoding = IOUtils.UTF_8;
Writer writer = null;
if (args.length > 0) {
- writer = new OutputStreamWriter(new FileOutputStream(new File(args[0])));
+ writer = new OutputStreamWriter(new FileOutputStream(new File(args[0])), encoding);
} else {
writer = new StringWriter();
}
if (args.length > 1) {
- encoding = args[1];
+ encoding = Charset.forName(args[1]);
}
DumpTikaConfigExample ex = new DumpTikaConfigExample();
- ex.dump(TikaConfig.getDefaultConfig(), writer, encoding);
+ ex.dump(TikaConfig.getDefaultConfig(), writer, encoding.name());
writer.flush();
Modified: tika/trunk/tika-parent/pom.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parent/pom.xml?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
--- tika/trunk/tika-parent/pom.xml (original)
+++ tika/trunk/tika-parent/pom.xml Fri Jan 23 19:55:51 2015
@@ -213,6 +213,14 @@
<role>committer</role>
</roles>
</developer>
+ <developer>
+ <name>Tim Allison</name>
+ <id>tallison</id>
+ <timezone>-5</timezone>
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
</developers>
<contributors>
<contributor>
@@ -274,7 +282,6 @@
</properties>
<build>
- <pluginManagement>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
@@ -287,7 +294,7 @@
<plugin>
<groupId>de.thetaphi</groupId>
<artifactId>forbiddenapis</artifactId>
- <version>1.6.1</version>
+ <version>1.7</version>
<configuration>
<targetVersion>${maven.compiler.target}</targetVersion>
<internalRuntimeForbidden>true</internalRuntimeForbidden>
@@ -321,6 +328,36 @@
<artifactId>maven-shade-plugin</artifactId>
<version>2.3</version>
</plugin>
+ </plugins>
+
+ <pluginManagement>
+ <plugins>
+ <!--This plugin's configuration is used to store Eclipse m2e settings only. It has no influence on the Maven build itself.-->
+ <plugin>
+ <groupId>org.eclipse.m2e</groupId>
+ <artifactId>lifecycle-mapping</artifactId>
+ <version>1.0.0</version>
+ <configuration>
+ <lifecycleMappingMetadata>
+ <pluginExecutions>
+ <pluginExecution>
+ <pluginExecutionFilter>
+ <groupId>de.thetaphi</groupId>
+ <artifactId>forbiddenapis</artifactId>
+ <versionRange>[1.0,)</versionRange>
+ <goals>
+ <goal>check</goal>
+ <goal>testCheck</goal>
+ </goals>
+ </pluginExecutionFilter>
+ <action>
+ <ignore/>
+ </action>
+ </pluginExecution>
+ </pluginExecutions>
+ </lifecycleMappingMetadata>
+ </configuration>
+ </plugin>
</plugins>
</pluginManagement>
</build>
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
Fri Jan 23 19:55:51 2015
@@ -16,11 +16,11 @@
*/
package org.apache.tika.parser.chm.accessor;
-import java.io.UnsupportedEncodingException;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.List;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
import org.apache.tika.parser.chm.core.ChmCommons;
import org.apache.tika.parser.chm.core.ChmConstants;
import org.apache.tika.parser.chm.exception.ChmParsingException;
@@ -232,13 +232,10 @@ public class ChmDirectoryListingSet {
DirectoryListingEntry dle = new DirectoryListingEntry();
dle.setNameLength(strlen);
- try {
- dle.setName(new String(ChmCommons.copyOfRange(
+ dle.setName(new String(ChmCommons.copyOfRange(
dir_chunk, placeHolder,
- (placeHolder + dle.getNameLength())),
"UTF-8"));
- } catch (UnsupportedEncodingException ex) {
- dle.setName(new String(dir_chunk, placeHolder,
placeHolder + dle.getNameLength()));
- }
+ (placeHolder + dle.getNameLength())),
IOUtils.UTF_8));
+
checkControlData(dle);
checkResetTable(dle);
setPlaceHolder(placeHolder
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java
Fri Jan 23 19:55:51 2015
@@ -16,10 +16,10 @@
*/
package org.apache.tika.parser.chm.accessor;
-import java.io.UnsupportedEncodingException;
import java.math.BigInteger;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
import org.apache.tika.parser.chm.assertion.ChmAssert;
import org.apache.tika.parser.chm.core.ChmConstants;
import org.apache.tika.parser.chm.exception.ChmParsingException;
@@ -62,11 +62,7 @@ public class ChmItsfHeader implements Ch
private int currentPlace = 0;
public ChmItsfHeader() {
- try {
- signature = ChmConstants.ITSF.getBytes("UTF-8"); /* 0 (ITSF) */
- } catch (UnsupportedEncodingException e) {
- throw new AssertionError("UTF-8 not supported.");
- }
+ signature = ChmConstants.ITSF.getBytes(IOUtils.UTF_8); /* 0 (ITSF) */
}
/**
@@ -74,11 +70,7 @@ public class ChmItsfHeader implements Ch
*/
public String toString() {
StringBuilder sb = new StringBuilder();
- try {
- sb.append(new String(getSignature(), "UTF-8") + " ");
- } catch (UnsupportedEncodingException e) {
- throw new AssertionError("UTF-8 not supported.");
- }
+ sb.append(new String(getSignature(), IOUtils.UTF_8) + " ");
sb.append(getVersion() + " ");
sb.append(getHeaderLen() + " ");
sb.append(getUnknown_000c() + " ");
@@ -471,12 +463,8 @@ public class ChmItsfHeader implements Ch
chmItsfHeader.setUnknownLen(chmItsfHeader.unmarshalUint64(data,
chmItsfHeader.getUnknownLen()));
chmItsfHeader.setDirOffset(chmItsfHeader.unmarshalUint64(data,
chmItsfHeader.getDirOffset()));
chmItsfHeader.setDirLen(chmItsfHeader.unmarshalUint64(data,
chmItsfHeader.getDirLen()));
- try {
- if (!new String(chmItsfHeader.getSignature(),
"UTF-8").equals(ChmConstants.ITSF))
- throw new TikaException("seems not valid file");
- } catch (UnsupportedEncodingException e) {
- throw new AssertionError("UTF-8 not supported.");
- }
+ if (!new String(chmItsfHeader.getSignature(),
IOUtils.UTF_8).equals(ChmConstants.ITSF))
+ throw new TikaException("seems not valid file");
if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_2) {
if (chmItsfHeader.getHeaderLen() < ChmConstants.CHM_ITSF_V2_LEN)
throw new TikaException("something wrong with header");
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java
Fri Jan 23 19:55:51 2015
@@ -16,14 +16,15 @@
*/
package org.apache.tika.parser.chm.accessor;
+import java.io.UnsupportedEncodingException;
+
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
import org.apache.tika.parser.chm.assertion.ChmAssert;
import org.apache.tika.parser.chm.core.ChmCommons;
import org.apache.tika.parser.chm.core.ChmConstants;
import org.apache.tika.parser.chm.exception.ChmParsingException;
-import java.io.UnsupportedEncodingException;
-
/**
* Directory header The directory starts with a header; its format is as
* follows: 0000: char[4] 'ITSP' 0004: DWORD Version number 1 0008: DWORD
Length
@@ -68,25 +69,17 @@ public class ChmItspHeader implements Ch
private int currentPlace = 0;
public ChmItspHeader() {
- try {
- signature = ChmConstants.ITSP.getBytes("UTF-8"); /*
+ signature = ChmConstants.ITSP.getBytes(IOUtils.UTF_8); /*
* 0
*
(ITSP
* )
*/
- } catch (UnsupportedEncodingException e) {
- throw new AssertionError("UTF-8 not supported.");
- }
}
public String toString() {
StringBuilder sb = new StringBuilder();
- try {
- sb.append("[ signature:=" + new String(getSignature(), "UTF-8")
- + System.getProperty("line.separator"));
- } catch (UnsupportedEncodingException e) {
- throw new AssertionError("UTF-8 not supported.");
- }
+ sb.append("[ signature:=" + new String(getSignature(), IOUtils.UTF_8)
+ + System.getProperty("line.separator"));
sb.append("version:=\t" + getVersion()
+ System.getProperty("line.separator"));
sb.append("header_len:=\t" + getHeader_len()
@@ -544,12 +537,9 @@ public class ChmItspHeader implements Ch
ChmConstants.BYTE_ARRAY_LENGHT));
/* Checks validity of the itsp header */
- try {
- if (!new String(chmItspHeader.getSignature(),
"UTF-8").equals(ChmConstants.ITSP))
+ if (!new String(chmItspHeader.getSignature(),
IOUtils.UTF_8).equals(ChmConstants.ITSP))
throw new ChmParsingException("seems not valid signature");
- } catch (UnsupportedEncodingException e) {
- throw new AssertionError("UTF-8 not supported.");
- }
+
if (chmItspHeader.getVersion() != ChmConstants.CHM_VER_1)
throw new ChmParsingException("!=ChmConstants.CHM_VER_1");
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java
Fri Jan 23 19:55:51 2015
@@ -16,13 +16,14 @@
*/
package org.apache.tika.parser.chm.accessor;
+import java.io.UnsupportedEncodingException;
+
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
import org.apache.tika.parser.chm.assertion.ChmAssert;
import org.apache.tika.parser.chm.core.ChmConstants;
import org.apache.tika.parser.chm.exception.ChmParsingException;
-import java.io.UnsupportedEncodingException;
-
/**
*
* ::DataSpace/Storage/<SectionName>/ControlData This file contains $20 bytes
of
@@ -54,15 +55,11 @@ public class ChmLzxcControlData implemen
private int currentPlace = 0;
public ChmLzxcControlData() {
- try {
- signature = ChmConstants.LZXC.getBytes("UTF-8"); /*
+ signature = ChmConstants.LZXC.getBytes(IOUtils.UTF_8); /*
* 4
* (LZXC
* )
*/
- } catch (UnsupportedEncodingException e) {
- throw new AssertionError("UTF-8 not supported.");
- }
}
/**
@@ -257,12 +254,8 @@ public class ChmLzxcControlData implemen
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("size(unknown):=" + this.getSize() + ", ");
- try {
- sb.append("signature(Compression type identifier):="
- + new String(this.getSignature(), "UTF-8") + ", ");
- } catch (UnsupportedEncodingException e) {
- throw new AssertionError("UTF-8 not supported.");
- }
+ sb.append("signature(Compression type identifier):="
+ + new String(this.getSignature(), IOUtils.UTF_8) + ", ");
sb.append("version(Possibly numeric code for LZX):="
+ this.getVersion() + System.getProperty("line.separator"));
sb.append("resetInterval(The Huffman reset interval):="
@@ -313,14 +306,10 @@ public class ChmLzxcControlData implemen
"window size / resetInterval should be more than 1");
/* checks a signature */
- try {
- if (!new String(chmLzxcControlData.getSignature(), "UTF-8")
- .equals(ChmConstants.LZXC))
- throw new ChmParsingException(
- "the signature does not seem to be correct");
- } catch (UnsupportedEncodingException e) {
- throw new AssertionError("UTF-8 not supported.");
- }
+ if (!new String(chmLzxcControlData.getSignature(), IOUtils.UTF_8)
+ .equals(ChmConstants.LZXC))
+ throw new ChmParsingException(
+ "the signature does not seem to be correct");
}
/**
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java
Fri Jan 23 19:55:51 2015
@@ -16,10 +16,10 @@
*/
package org.apache.tika.parser.chm.accessor;
-import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
import org.apache.tika.parser.chm.assertion.ChmAssert;
import org.apache.tika.parser.chm.core.ChmCommons;
import org.apache.tika.parser.chm.core.ChmConstants;
@@ -54,11 +54,7 @@ public class ChmPmgiHeader implements Ch
private int currentPlace = 0;
public ChmPmgiHeader() {
- try {
- signature = ChmConstants.CHM_PMGI_MARKER.getBytes("UTF-8"); /* 0
(PMGI) */
- } catch (UnsupportedEncodingException e) {
- throw new AssertionError("UTF-8 not supported.");
- }
+ signature = ChmConstants.CHM_PMGI_MARKER.getBytes(IOUtils.UTF_8); /* 0
(PMGI) */
}
private int getDataRemained() {
@@ -84,12 +80,9 @@ public class ChmPmgiHeader implements Ch
ChmAssert.assertChmAccessorNotNull(chmPmgiHeader);
ChmAssert.assertPositiveInt(count);
this.setDataRemained(data.length);
- try {
index = ChmCommons.indexOf(data,
- ChmConstants.CHM_PMGI_MARKER.getBytes("UTF-8"));
- } catch (UnsupportedEncodingException e) {
- throw new AssertionError("UTF-8 not supported.");
- }
+ ChmConstants.CHM_PMGI_MARKER.getBytes(IOUtils.UTF_8));
+
if (index >= 0)
System.arraycopy(data, index, chmPmgiHeader.getSignature(), 0,
count);
else{
@@ -156,11 +149,7 @@ public class ChmPmgiHeader implements Ch
*/
public String toString() {
StringBuilder sb = new StringBuilder();
- try {
- sb.append("signature:=" + new String(getSignature(), "UTF-8") + ", ");
- } catch (UnsupportedEncodingException e) {
- throw new AssertionError("UTF-8 not supported.");
- }
+ sb.append("signature:=" + new String(getSignature(), IOUtils.UTF_8) + ", ");
sb.append("free space:=" + getFreeSpace()
+ System.getProperty("line.separator"));
return sb.toString();
@@ -177,14 +166,10 @@ public class ChmPmgiHeader implements Ch
chmPmgiHeader.setFreeSpace(chmPmgiHeader.unmarshalUInt32(data,
chmPmgiHeader.getFreeSpace()));
/* check structure */
- try {
- if (!Arrays.equals(chmPmgiHeader.getSignature(),
- ChmConstants.CHM_PMGI_MARKER.getBytes("UTF-8")))
- throw new TikaException(
- "it does not seem to be valid a PMGI signature, check ChmItsp index_root if it was -1, means no PMGI, use PMGL insted");
- } catch (UnsupportedEncodingException e) {
- throw new AssertionError("UTF-8 not supported.");
- }
+ if (!Arrays.equals(chmPmgiHeader.getSignature(),
+ ChmConstants.CHM_PMGI_MARKER.getBytes(IOUtils.UTF_8)))
+ throw new TikaException(
+ "it does not seem to be valid a PMGI signature, check ChmItsp index_root if it was -1, means no PMGI, use PMGL insted");
}
}
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java
Fri Jan 23 19:55:51 2015
@@ -16,9 +16,8 @@
*/
package org.apache.tika.parser.chm.accessor;
-import java.io.UnsupportedEncodingException;
-
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
import org.apache.tika.parser.chm.assertion.ChmAssert;
import org.apache.tika.parser.chm.core.ChmConstants;
import org.apache.tika.parser.chm.exception.ChmParsingException;
@@ -68,15 +67,11 @@ public class ChmPmglHeader implements Ch
private int currentPlace = 0;
public ChmPmglHeader() {
- try {
- signature = ChmConstants.PMGL.getBytes("UTF-8"); /*
+ signature = ChmConstants.PMGL.getBytes(IOUtils.UTF_8); /*
* 0
*
(PMGL
* )
*/
- } catch (UnsupportedEncodingException e) {
- throw new AssertionError("UTF-8 not supported.");
- }
}
private int getDataRemained() {
@@ -108,11 +103,7 @@ public class ChmPmglHeader implements Ch
public String toString() {
StringBuilder sb = new StringBuilder();
- try {
- sb.append("signatute:=" + new String(getSignature(), "UTF-8") + ", ");
- } catch (UnsupportedEncodingException e) {
- throw new AssertionError("UTF-8 not supported.");
- }
+ sb.append("signatute:=" + new String(getSignature(), IOUtils.UTF_8) + ", ");
sb.append("free space:=" + getFreeSpace() + ", ");
sb.append("unknown0008:=" + getUnknown0008() + ", ");
sb.append("prev block:=" + getBlockPrev() + ", ");
@@ -175,13 +166,9 @@ public class ChmPmglHeader implements Ch
chmPmglHeader.setBlockNext(chmPmglHeader.unmarshalInt32(data));
/* check structure */
- try {
- if (!new String(chmPmglHeader.getSignature(), "UTF-8").equals(ChmConstants.PMGL))
- throw new ChmParsingException(ChmPmglHeader.class.getName()
- + " pmgl != pmgl.signature");
- } catch (UnsupportedEncodingException e) {
- throw new AssertionError("UTF-8 not supported.");
- }
+ if (!new String(chmPmglHeader.getSignature(), IOUtils.UTF_8).equals(ChmConstants.PMGL))
+ throw new ChmParsingException(ChmPmglHeader.class.getName()
+ + " pmgl != pmgl.signature");
}
public byte[] getSignature() {
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java
Fri Jan 23 19:55:51 2015
@@ -16,12 +16,14 @@
*/
package org.apache.tika.parser.chm.core;
+import org.apache.tika.io.IOUtils;
+
public class ChmConstants {
/* Prevents instantiation */
private ChmConstants() {
}
- public static final String DEFAULT_CHARSET = "UTF-8";
+ public static final String DEFAULT_CHARSET = IOUtils.UTF_8.name();
public static final String ITSF = "ITSF";
public static final String ITSP = "ITSP";
public static final String PMGL = "PMGL";
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
Fri Jan 23 19:55:51 2015
@@ -172,7 +172,7 @@ public class ChmExtractor {
int indexOfControlData = getChmDirList().getControlDataIndex();
int indexOfResetData = ChmCommons.indexOfResetTableBlock(getData(),
- ChmConstants.LZXC.getBytes("UTF-8"));
+ ChmConstants.LZXC.getBytes(IOUtils.UTF_8));
byte[] dir_chunk = null;
if (indexOfResetData > 0)
dir_chunk = ChmCommons.copyOfRange( getData(),
indexOfResetData, indexOfResetData
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java
Fri Jan 23 19:55:51 2015
@@ -93,7 +93,7 @@ public class EpubParser extends Abstract
ZipEntry entry = zip.getNextEntry();
while (entry != null) {
if (entry.getName().equals("mimetype")) {
- String type = IOUtils.toString(zip, "UTF-8");
+ String type = IOUtils.toString(zip, IOUtils.UTF_8.name());
metadata.set(Metadata.CONTENT_TYPE, type);
} else if (entry.getName().equals("metadata.xml")) {
meta.parse(zip, new DefaultHandler(), metadata, context);
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/gdal/GDALParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/gdal/GDALParser.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/gdal/GDALParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/gdal/GDALParser.java
Fri Jan 23 19:55:51 2015
@@ -30,9 +30,8 @@ import java.util.Scanner;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-
-//Tika imports
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -41,12 +40,13 @@ import org.apache.tika.parser.AbstractPa
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
import static org.apache.tika.parser.external.ExternalParser.INPUT_FILE_TOKEN;
+//Tika imports
//SAX imports
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
/**
* Wraps execution of the <a href="http//gdal.org/">Geospatial Data Abstraction
@@ -385,7 +385,7 @@ public class GDALParser extends Abstract
private String extractOutput(InputStream stream) throws SAXException,
IOException {
StringBuffer sb = new StringBuffer();
- Reader reader = new InputStreamReader(stream, "UTF-8");
+ Reader reader = new InputStreamReader(stream, IOUtils.UTF_8);
try {
char[] buffer = new char[1024];
for (int n = reader.read(buffer); n != -1; n =
reader.read(buffer)) {
@@ -400,8 +400,8 @@ public class GDALParser extends Abstract
private void processOutput(ContentHandler handler, Metadata metadata,
String output) throws SAXException, IOException
{
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- InputStream stream = new ByteArrayInputStream(output.getBytes("UTF-8"));
- Reader reader = new InputStreamReader(stream, "UTF-8");
+ InputStream stream = new ByteArrayInputStream(output.getBytes(IOUtils.UTF_8));
+ Reader reader = new InputStreamReader(stream, IOUtils.UTF_8);
try {
xhtml.startDocument();
xhtml.startElement("p");
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
Fri Jan 23 19:55:51 2015
@@ -301,7 +301,7 @@ public class ImageMetadataExtractor {
@Override
protected SimpleDateFormat initialValue()
{
- return new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");
+ return new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.US);
}
};
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
Fri Jan 23 19:55:51 2015
@@ -22,12 +22,12 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
-import java.util.Iterator;
import java.util.List;
import org.apache.jempbox.xmp.XMPMetadata;
import org.apache.jempbox.xmp.XMPSchemaDublinCore;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.xml.sax.InputSource;
@@ -39,7 +39,7 @@ public class JempboxExtractor {
private Metadata metadata;
// The XMP spec says it must be unicode, but for most file formats it specifies "must be encoded in UTF-8"
- private static final String DEFAULT_XMP_CHARSET = "UTF-8";
+ private static final String DEFAULT_XMP_CHARSET = IOUtils.UTF_8.name();
public JempboxExtractor(Metadata metadata) {
this.metadata = metadata;
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java
Fri Jan 23 19:55:51 2015
@@ -28,6 +28,7 @@ import java.util.Set;
import java.util.TimeZone;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
@@ -161,7 +162,7 @@ public class IptcAnpaParser implements P
}
int msgsize = is.read(buf); // read in at least the full data
- String message = (new String(buf, "UTF-8")).toLowerCase(Locale.ROOT);
+ String message = (new String(buf, IOUtils.UTF_8)).toLowerCase(Locale.ROOT);
// these are not if-then-else, because we want to go from most common
// and fall through to least. this is imperfect, as these tags could
// show up in other agency stories, but i can't find a spec or any
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mat/MatParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mat/MatParser.java?rev=1654351&r1=1654350&r2=1654351&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mat/MatParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mat/MatParser.java
Fri Jan 23 19:55:51 2015
@@ -24,6 +24,7 @@ import java.util.Set;
import java.util.Map;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AbstractParser;
@@ -86,7 +87,7 @@ public class MatParser extends AbstractP
}
// Get endian indicator from header file
- String endianBytes = new String(hdr.getEndianIndicator(), "UTF-8"); // Retrieve endian bytes and convert to string
+ String endianBytes = new String(hdr.getEndianIndicator(), IOUtils.UTF_8); // Retrieve endian bytes and convert to string
String endianCode = String.valueOf(endianBytes.toCharArray()); // Convert bytes to characters to string
metadata.set("endian", endianCode);