Author: nick
Date: Thu Aug 20 10:07:52 2015
New Revision: 1696751
URL: http://svn.apache.org/r1696751
Log:
TIKA-1710 patch from Yaniv Kunda - Use Commons IO instead of the Tika Core IO
copies, and use java.nio.charset.StandardCharsets for charset constants
Modified:
tika/trunk/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java
tika/trunk/tika-example/src/main/java/org/apache/tika/example/DirListParser.java
tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java
tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java
tika/trunk/tika-example/src/main/java/org/apache/tika/example/RollbackSoftware.java
tika/trunk/tika-example/src/main/java/org/apache/tika/example/SpringExample.java
tika/trunk/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java
tika/trunk/tika-example/src/test/java/org/apache/tika/example/SimpleTextExtractorTest.java
tika/trunk/tika-example/src/test/java/org/apache/tika/example/SimpleTypeDetectorTest.java
Modified:
tika/trunk/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java?rev=1696751&r1=1696750&r2=1696751&view=diff
==============================================================================
--- tika/trunk/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java
(original)
+++ tika/trunk/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java
Thu Aug 20 10:07:52 2015
@@ -16,6 +16,7 @@
*/
package org.apache.tika.bundle;
+import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.assertFalse;
@@ -141,7 +142,7 @@ public class BundleIT {
public void testForkParser() throws Exception {
ForkParser parser = new ForkParser(Activator.class.getClassLoader(),
defaultParser);
String data = "<!DOCTYPE html>\n<html><body><p>test <span>content</span></p></body></html>";
- InputStream stream = new ByteArrayInputStream(data.getBytes("UTF-8"));
+ InputStream stream = new ByteArrayInputStream(data.getBytes(UTF_8));
Writer writer = new StringWriter();
ContentHandler contentHandler = new BodyContentHandler(writer);
Metadata metadata = new Metadata();
Modified:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/DirListParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/DirListParser.java?rev=1696751&r1=1696750&r2=1696751&view=diff
==============================================================================
---
tika/trunk/tika-example/src/main/java/org/apache/tika/example/DirListParser.java
(original)
+++
tika/trunk/tika-example/src/main/java/org/apache/tika/example/DirListParser.java
Thu Aug 20 10:07:52 2015
@@ -32,6 +32,8 @@ import org.apache.tika.sax.BodyContentHa
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* Parses the output of /bin/ls and counts the number of files and the number of
* executables using Tika.
@@ -75,8 +77,7 @@ public class DirListParser implements Pa
Metadata metadata, ParseContext context) throws
IOException,
SAXException, TikaException {
- List<String> lines = FileUtils.readLines(
- TikaInputStream.get(is).getFile(), "utf-8");
+ List<String> lines = FileUtils.readLines(TikaInputStream.get(is).getFile(), UTF_8);
for (String line : lines) {
String[] fileToks = line.split("\\s+");
if (fileToks.length < 8)
Modified:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java?rev=1696751&r1=1696750&r2=1696751&view=diff
==============================================================================
---
tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
(original)
+++
tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
Thu Aug 20 10:07:52 2015
@@ -41,7 +41,6 @@ import org.apache.tika.config.TikaConfig
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.IOUtils;
import org.apache.tika.language.translate.DefaultTranslator;
import org.apache.tika.language.translate.Translator;
import org.apache.tika.mime.MediaType;
@@ -52,6 +51,8 @@ import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* This class shows how to dump a TikaConfig object to a configuration file.
@@ -190,7 +191,7 @@ public class DumpTikaConfigExample {
*/
public static void main(String[] args) throws Exception {
- Charset encoding = IOUtils.UTF_8;
+ Charset encoding = UTF_8;
Writer writer = null;
if (args.length > 0) {
writer = new OutputStreamWriter(new FileOutputStream(new File(args[0])), encoding);
Modified:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java?rev=1696751&r1=1696750&r2=1696751&view=diff
==============================================================================
---
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java
(original)
+++
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java
Thu Aug 20 10:07:52 2015
@@ -20,12 +20,12 @@ import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
+import org.apache.commons.io.FilenameUtils;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
-import org.apache.tika.io.FilenameUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeTypeException;
Modified:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java?rev=1696751&r1=1696750&r2=1696751&view=diff
==============================================================================
---
tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java
(original)
+++
tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java
Thu Aug 20 10:07:52 2015
@@ -32,6 +32,8 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* Demonstrates how to call the different components within Tika: its
* {@link Detector} framework (aka MIME identification and repository), its
@@ -91,7 +93,7 @@ public class MyFirstTika {
+ detector.detect(stream, metadata) + "]");
LanguageIdentifier lang = new LanguageIdentifier(new LanguageProfile(
- FileUtils.readFileToString(new File(filename), "utf-8")));
+ FileUtils.readFileToString(new File(filename), UTF_8)));
System.out.println("The language of this content is: ["
+ lang.getLanguage() + "]");
Modified:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/RollbackSoftware.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/RollbackSoftware.java?rev=1696751&r1=1696750&r2=1696751&view=diff
==============================================================================
---
tika/trunk/tika-example/src/main/java/org/apache/tika/example/RollbackSoftware.java
(original)
+++
tika/trunk/tika-example/src/main/java/org/apache/tika/example/RollbackSoftware.java
Thu Aug 20 10:07:52 2015
@@ -37,6 +37,8 @@ import org.apache.tika.sax.XHTMLContentH
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* Demonstrates Tika and its ability to sense symlinks.
*/
@@ -52,7 +54,7 @@ public class RollbackSoftware {
LinkContentHandler handler = new LinkContentHandler();
Metadata met = new Metadata();
DeploymentAreaParser parser = new DeploymentAreaParser();
- parser.parse(IOUtils.toInputStream(deployArea.getAbsolutePath(), "utf-8"),
+ parser.parse(IOUtils.toInputStream(deployArea.getAbsolutePath(), UTF_8),
handler, met);
List<Link> links = handler.getLinks();
if (links.size() < 2)
@@ -110,7 +112,7 @@ public class RollbackSoftware {
Metadata metadata, ParseContext context) throws
IOException,
SAXException, TikaException {
- File deployArea = new File(IOUtils.toString(is, "utf-8"));
+ File deployArea = new File(IOUtils.toString(is, UTF_8));
File[] versions = deployArea.listFiles(new FileFilter()
{
public boolean accept(File pathname) {
Modified:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/SpringExample.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/SpringExample.java?rev=1696751&r1=1696750&r2=1696751&view=diff
==============================================================================
---
tika/trunk/tika-example/src/main/java/org/apache/tika/example/SpringExample.java
(original)
+++
tika/trunk/tika-example/src/main/java/org/apache/tika/example/SpringExample.java
Thu Aug 20 10:07:52 2015
@@ -15,6 +15,7 @@
package org.apache.tika.example;
import java.io.ByteArrayInputStream;
+
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -22,7 +23,7 @@ import org.apache.tika.sax.WriteOutConte
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
-import com.google.common.base.Charsets;
+import static java.nio.charset.StandardCharsets.UTF_8;
public class SpringExample {
@@ -30,7 +31,7 @@ public class SpringExample {
ApplicationContext context = new ClassPathXmlApplicationContext(
new String[] {
"org/apache/tika/example/spring.xml" });
Parser parser = context.getBean("tika", Parser.class);
- parser.parse(new ByteArrayInputStream("Hello, World!".getBytes(Charsets.UTF_8)),
+ parser.parse(new ByteArrayInputStream("Hello, World!".getBytes(UTF_8)),
new WriteOutContentHandler(System.out), new
Metadata(),
new ParseContext());
}
Modified:
tika/trunk/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java?rev=1696751&r1=1696750&r2=1696751&view=diff
==============================================================================
---
tika/trunk/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java
(original)
+++
tika/trunk/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java
Thu Aug 20 10:07:52 2015
@@ -18,6 +18,8 @@ package org.apache.tika.example;
*/
+import static java.nio.charset.StandardCharsets.UTF_16LE;
+import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
@@ -26,6 +28,7 @@ import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
+import java.nio.charset.Charset;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.CompositeDetector;
@@ -61,9 +64,9 @@ public class DumpTikaConfigExampleTest {
@Test
public void testDump() throws Exception {
DumpTikaConfigExample ex = new DumpTikaConfigExample();
- for (String encoding : new String[]{ "UTF-8", "UTF-16LE"}) {
- Writer writer = new OutputStreamWriter(new FileOutputStream(configFile), encoding);
- ex.dump(TikaConfig.getDefaultConfig(), writer, encoding);
+ for (Charset charset : new Charset[]{UTF_8, UTF_16LE}) {
+ Writer writer = new OutputStreamWriter(new FileOutputStream(configFile), charset);
+ ex.dump(TikaConfig.getDefaultConfig(), writer, charset.name());
writer.flush();
writer.close();
Modified:
tika/trunk/tika-example/src/test/java/org/apache/tika/example/SimpleTextExtractorTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/java/org/apache/tika/example/SimpleTextExtractorTest.java?rev=1696751&r1=1696750&r2=1696751&view=diff
==============================================================================
---
tika/trunk/tika-example/src/test/java/org/apache/tika/example/SimpleTextExtractorTest.java
(original)
+++
tika/trunk/tika-example/src/test/java/org/apache/tika/example/SimpleTextExtractorTest.java
Thu Aug 20 10:07:52 2015
@@ -14,6 +14,7 @@
package org.apache.tika.example;
+import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import java.io.ByteArrayOutputStream;
@@ -23,8 +24,6 @@ import java.io.PrintStream;
import org.apache.commons.io.FileUtils;
import org.junit.Test;
-import com.google.common.base.Charsets;
-
public class SimpleTextExtractorTest {
@Test
public void testSimpleTextExtractor() throws Exception {
@@ -33,18 +32,17 @@ public class SimpleTextExtractorTest {
+ " content written in English to test autodetection of"
+ " the character encoding of the input stream.";
ByteArrayOutputStream buffer = new ByteArrayOutputStream();
- String UTF8 = Charsets.UTF_8.name();
PrintStream out = System.out;
- System.setOut(new PrintStream(buffer, true, UTF8));
+ System.setOut(new PrintStream(buffer, true, UTF_8.name()));
File file = new File("target", "test.txt");
- FileUtils.writeStringToFile(file, message, UTF8);
+ FileUtils.writeStringToFile(file, message, UTF_8);
SimpleTextExtractor.main(new String[] { file.getPath() });
file.delete();
System.setOut(out);
- assertEquals(message, buffer.toString(UTF8).trim());
+ assertEquals(message, buffer.toString(UTF_8.name()).trim());
}
}
Modified:
tika/trunk/tika-example/src/test/java/org/apache/tika/example/SimpleTypeDetectorTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/java/org/apache/tika/example/SimpleTypeDetectorTest.java?rev=1696751&r1=1696750&r2=1696751&view=diff
==============================================================================
---
tika/trunk/tika-example/src/test/java/org/apache/tika/example/SimpleTypeDetectorTest.java
(original)
+++
tika/trunk/tika-example/src/test/java/org/apache/tika/example/SimpleTypeDetectorTest.java
Thu Aug 20 10:07:52 2015
@@ -14,12 +14,12 @@
package org.apache.tika.example;
+import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
-import com.google.common.base.Charsets;
import org.junit.Test;
@SuppressWarnings("deprecation")
@@ -30,14 +30,14 @@ public class SimpleTypeDetectorTest {
ByteArrayOutputStream buffer = new ByteArrayOutputStream();
PrintStream out = System.out;
- System.setOut(new PrintStream(buffer, true, Charsets.UTF_8.name()));
+ System.setOut(new PrintStream(buffer, true, UTF_8.name()));
SimpleTypeDetector.main(new String[] { "pom.xml" });
System.setOut(out);
assertEquals("pom.xml: application/xml",
- buffer.toString(Charsets.UTF_8.name()).trim());
+ buffer.toString(UTF_8.name()).trim());
}
}