This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch code-cleanup-for-4x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 0b5a17ae834bffb9d6c6ca83a3ca33ca71c949a4 Author: tallison <[email protected]> AuthorDate: Fri Feb 6 17:01:06 2026 -0500 cleanup for 4.x --- .../src/main/java/org/apache/tika/cli/TikaCLI.java | 13 +- .../java/org/apache/tika/mime/MimeTypesReader.java | 9 +- .../java/org/apache/tika/parser/NetworkParser.java | 170 --------------------- 3 files changed, 2 insertions(+), 190 deletions(-) diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index dfcc299520..9c12d2d804 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -32,7 +32,6 @@ import java.io.Serializable; import java.io.UnsupportedEncodingException; import java.io.Writer; import java.lang.reflect.Field; -import java.net.URI; import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; @@ -84,7 +83,6 @@ import org.apache.tika.mime.MimeTypeException; import org.apache.tika.mime.MimeTypes; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.CompositeParser; -import org.apache.tika.parser.NetworkParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; @@ -128,7 +126,6 @@ public class TikaCLI { private TikaLoader tikaLoader; private String configFilePath; private boolean recursiveJSON = false; - private URI networkURI = null; /** * Output character encoding, or <code>null</code> for platform default */ @@ -511,10 +508,6 @@ public class TikaCLI { prettyPrint = true; } else if (arg.equals("-p") || arg.equals("--port") || arg.equals("-s") || arg.equals("--server")) { throw new IllegalArgumentException("As of Tika 2.0, the server option is no longer supported in tika-app.\n" + "See https://wiki.apache.org/tika/TikaJAXRS for usage."); - } else if (arg.startsWith("-c")) { - networkURI = new URI(arg.substring("-c".length())); - } else if (arg.startsWith("--client=")) { - networkURI = new URI(arg.substring("--client=".length())); } else { pipeMode = false; configure(); @@ -879,11 +872,7 @@ public class TikaCLI { Files.deleteIfExists(tempConfig); } } - if (networkURI != null) { - parser = new NetworkParser(networkURI); - } else { - parser = tikaLoader.loadAutoDetectParser(); - } + parser = tikaLoader.loadAutoDetectParser(); // Load configs from tika-config.json and merge into existing context // (preserves EmbeddedDocumentExtractor and other items set before configure()) diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java index 76bc5c7525..72168feac4 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java +++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java @@ -26,7 +26,6 @@ import java.util.List; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.ReentrantReadWriteLock; -import javax.xml.XMLConstants; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; @@ -211,14 +210,8 @@ public class MimeTypesReader extends DefaultHandler implements MimeTypesReaderMe } private static SAXParser newSAXParser() throws TikaException { - SAXParserFactory factory = SAXParserFactory.newInstance(); + SAXParserFactory factory = XMLReaderUtils.getSAXParserFactory(); factory.setNamespaceAware(false); - try { - factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true); - } catch (ParserConfigurationException | SAXException e) { - LOG.warn("can't set secure processing feature on: " + factory.getClass() + - ". User assumes responsibility for consequences."); - } try { return factory.newSAXParser(); } catch (ParserConfigurationException | SAXException e) { diff --git a/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java b/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java deleted file mode 100644 index ff88f17c11..0000000000 --- a/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser; - -import java.io.FilterOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.net.Socket; -import java.net.URI; -import java.net.URL; -import java.net.URLConnection; -import java.util.Collections; -import java.util.Set; - -import org.apache.commons.io.IOUtils; -import org.apache.commons.io.input.CloseShieldInputStream; -import org.xml.sax.Attributes; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - -import org.apache.tika.exception.TikaException; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.mime.MediaType; -import org.apache.tika.sax.TaggedContentHandler; -import org.apache.tika.sax.TeeContentHandler; -import org.apache.tika.utils.XMLReaderUtils; - - -public class NetworkParser implements Parser { - - private final URI uri; - - private final Set<MediaType> supportedTypes; - - public NetworkParser(URI uri, Set<MediaType> supportedTypes) { - this.uri = uri; - this.supportedTypes = supportedTypes; - } - - public NetworkParser(URI uri) { - this(uri, Collections.singleton(MediaType.OCTET_STREAM)); - } - - public Set<MediaType> getSupportedTypes(ParseContext context) { - return supportedTypes; - } - - public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { - if ("telnet".equals(uri.getScheme())) { - try (Socket socket = new Socket(uri.getHost(), uri.getPort())) { - new ParsingTask(tis, new FilterOutputStream(socket.getOutputStream()) { - @Override - public void close() throws IOException { - socket.shutdownOutput(); - } - }).parse(socket.getInputStream(), handler, metadata, context); - } - } else { - URL url = uri.toURL(); - URLConnection connection = url.openConnection(); - connection.setDoOutput(true); - connection.connect(); - try (InputStream input = connection.getInputStream()) { - new ParsingTask(tis, connection.getOutputStream()) - .parse(CloseShieldInputStream.wrap(input), handler, metadata, context); - } - } - - } - - private static class ParsingTask implements Runnable { - - private final TikaInputStream input; - - private final OutputStream output; - - private volatile Exception exception = null; - - public ParsingTask(TikaInputStream input, OutputStream output) { - this.input = input; - this.output = output; - } - - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { - Thread thread = new Thread(this, "Tika network parser"); - thread.start(); - - TaggedContentHandler tagged = - new TaggedContentHandler(handler); - try { - XMLReaderUtils - .parseSAX(stream, new TeeContentHandler(tagged, new MetaHandler(metadata)), - context); - } catch (SAXException e) { - tagged.throwIfCauseOf(e); - throw new TikaException("Invalid network parser output", e); - } catch (IOException e) { - throw new TikaException("Unable to read network parser output", e); - } finally { - try { - thread.join(1000); - } catch (InterruptedException e) { - throw new TikaException("Network parser interrupted", e); - } - - if (exception != null) { - input.throwIfCauseOf(exception); - throw new TikaException("Unexpected network parser error", exception); - } - } - } - - //----------------------------------------------------------<Runnable> - - public void run() { - try { - try { - IOUtils.copy(input, output); - } finally { - output.close(); - } - } catch (Exception e) { - exception = e; - } - } - - } - - private static class MetaHandler extends DefaultHandler { - - private final Metadata metadata; - - public MetaHandler(Metadata metadata) { - this.metadata = metadata; - } - - @Override - public void startElement(String uri, String localName, String qName, Attributes attributes) - throws SAXException { - if ("http://www.w3.org/1999/xhtml".equals(uri) && "meta".equals(localName)) { - String name = attributes.getValue("", "name"); - String content = attributes.getValue("", "content"); - if (name != null && content != null) { - metadata.add(name, content); - } - } - } - - } - -}
