Repository: any23 Updated Branches: refs/heads/master f9abbec20 -> db25f0213
ANY23-341 Remove dependency on defunct commons-httpclient Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/db25f021 Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/db25f021 Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/db25f021 Branch: refs/heads/master Commit: db25f0213714f0d6c0377818c00a3aeb58436d56 Parents: f9abbec Author: Hans <[email protected]> Authored: Tue Apr 3 13:33:20 2018 -0500 Committer: Hans <[email protected]> Committed: Wed Apr 4 16:37:16 2018 -0500 ---------------------------------------------------------------------- cli/pom.xml | 4 - core/pom.xml | 4 - .../any23/extractor/html/HTMLDocument.java | 30 ++++-- .../apache/any23/http/DefaultHTTPClient.java | 106 +++++++++++-------- .../java/org/apache/any23/http/HTTPClient.java | 12 +-- .../apache/any23/source/HTTPDocumentSource.java | 11 +- .../java/org/apache/any23/util/LogUtils.java | 2 - pom.xml | 5 - .../java/org/apache/any23/servlet/Servlet.java | 4 +- 9 files changed, 95 insertions(+), 83 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/db25f021/cli/pom.xml ---------------------------------------------------------------------- diff --git a/cli/pom.xml b/cli/pom.xml index 07b7e6b..321b150 100644 --- a/cli/pom.xml +++ b/cli/pom.xml @@ -94,10 +94,6 @@ <artifactId>commons-lang</artifactId> </dependency> <dependency> - <groupId>commons-httpclient</groupId> - <artifactId>commons-httpclient</artifactId> - </dependency> - <dependency> <groupId>commons-codec</groupId> <artifactId>commons-codec</artifactId> </dependency> http://git-wip-us.apache.org/repos/asf/any23/blob/db25f021/core/pom.xml ---------------------------------------------------------------------- diff --git a/core/pom.xml b/core/pom.xml index 6fd2550..58a37ee 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -63,10 +63,6 @@ <artifactId>commons-lang</artifactId> </dependency> <dependency> - <groupId>commons-httpclient</groupId> - <artifactId>commons-httpclient</artifactId> - </dependency> - <dependency> <groupId>commons-codec</groupId> <artifactId>commons-codec</artifactId> </dependency> http://git-wip-us.apache.org/repos/asf/any23/blob/db25f021/core/src/main/java/org/apache/any23/extractor/html/HTMLDocument.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/HTMLDocument.java b/core/src/main/java/org/apache/any23/extractor/html/HTMLDocument.java index bb958c7..188e0f1 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/HTMLDocument.java +++ b/core/src/main/java/org/apache/any23/extractor/html/HTMLDocument.java @@ -24,6 +24,7 @@ import org.eclipse.rdf4j.model.IRI; import org.eclipse.rdf4j.model.impl.SimpleValueFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.w3c.dom.Document; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; @@ -375,15 +376,32 @@ public class HTMLDocument { private java.net.URI getBaseIRI() throws ExtractionException { if (baseIRI == null) { + // document.getBaseURI() returns null for document URIs with + // special characters, e.g., http://semanticweb.org/wiki/Knud_Möller + // It also does *not* take html "base" elements into account. + // (But it does take into account urls specified by the attribute "xml:base".) + + // So, for now, let's use getDocumentURI() instead. + // TODO: Make this approach better. + + Document doc = document instanceof Document ? (Document)document : document.getOwnerDocument(); + + if (doc == null) { + throw new ExtractionException("Node " + document.getNodeName() + " was not associated with a document."); + } + + String uri = doc.getDocumentURI(); + + if (uri == null) { + throw new ExtractionException("document URI is null, this should not happen"); + } + try { - if (document.getBaseURI() == null) { - log.warn("document.getBaseURI() is null, this should not happen"); - } - baseIRI = new java.net.URI(RDFUtils.fixAbsoluteIRI(document.getBaseURI())); + baseIRI = new java.net.URI(RDFUtils.fixAbsoluteIRI(uri)); } catch (IllegalArgumentException ex) { - throw new ExtractionException("Error in base IRI: " + document.getBaseURI(), ex); + throw new ExtractionException("Error in base IRI: " + uri, ex); } catch (URISyntaxException ex) { - throw new ExtractionException("Error in base IRI: " + document.getBaseURI(), ex); + throw new ExtractionException("Error in base IRI: " + uri, ex); } } return baseIRI; http://git-wip-us.apache.org/repos/asf/any23/blob/db25f021/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java b/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java index d520441..2615585 100644 --- a/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java +++ b/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java @@ -17,16 +17,24 @@ package org.apache.any23.http; -import org.apache.commons.httpclient.*; -import org.apache.commons.httpclient.methods.GetMethod; -import org.apache.commons.httpclient.params.HttpConnectionManagerParams; +import org.apache.commons.io.IOUtils; +import org.apache.http.Header; +import org.apache.http.HttpResponse; +import org.apache.http.client.HttpClient; +import org.apache.http.client.config.RequestConfig; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.protocol.HttpClientContext; +import org.apache.http.config.SocketConfig; +import org.apache.http.impl.client.HttpClients; +import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; +import org.apache.http.message.BasicHeader; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; +import java.net.URI; import java.util.ArrayList; import java.util.List; -import java.util.regex.Pattern; /** * Opens an {@link InputStream} on an HTTP IRI. Is configured @@ -37,9 +45,7 @@ import java.util.regex.Pattern; */ public class DefaultHTTPClient implements HTTPClient { - private static final Pattern ESCAPED_PATTERN = Pattern.compile("%[0-9a-f]{2}",Pattern.CASE_INSENSITIVE); - - private final MultiThreadedHttpConnectionManager manager = new MultiThreadedHttpConnectionManager(); + private final PoolingHttpClientConnectionManager manager = new PoolingHttpClientConnectionManager(); private HTTPClientConfiguration configuration; @@ -51,9 +57,6 @@ public class DefaultHTTPClient implements HTTPClient { private String contentType = null; - public static final boolean isUrlEncoded(String url) { - return ESCAPED_PATTERN.matcher(url).find(); - } /** * Creates a {@link DefaultHTTPClient} instance already initialized @@ -82,35 +85,31 @@ public class DefaultHTTPClient implements HTTPClient { * located at the URI. */ public InputStream openInputStream(String uri) throws IOException { - GetMethod method = null; + HttpGet method = null; try { ensureClientInitialized(); - String uriStr; - try { - URI uriObj = new URI(uri, isUrlEncoded(uri)); - // [scheme:][//authority][path][?query][#fragment] - uriStr = uriObj.toString(); - } catch (URIException e) { - throw new IllegalArgumentException("Invalid IRI string.", e); - } - method = new GetMethod(uriStr); - method.setFollowRedirects(true); - client.executeMethod(method); - _contentLength = method.getResponseContentLength(); - final Header contentTypeHeader = method.getResponseHeader("Content-Type"); + HttpClientContext context = HttpClientContext.create(); + method = new HttpGet(uri); + HttpResponse response = client.execute(method, context); + List<URI> locations = context.getRedirectLocations(); + + URI actualURI = locations == null || locations.isEmpty() ? method.getURI() : locations.get(locations.size() - 1); + actualDocumentIRI = actualURI.toString(); + + final Header contentTypeHeader = response.getFirstHeader("Content-Type"); contentType = contentTypeHeader == null ? null : contentTypeHeader.getValue(); - if (method.getStatusCode() != 200) { + if (response.getStatusLine().getStatusCode() != 200) { throw new IOException( - "Failed to fetch " + uri + ": " + method.getStatusCode() + " " + method.getStatusText() + "Failed to fetch " + uri + ": " + response.getStatusLine().getStatusCode() + " " + response.getStatusLine().getReasonPhrase() ); } - actualDocumentIRI = method.getURI().toString(); - byte[] response = method.getResponseBody(); - return new ByteArrayInputStream(response); + byte[] bytes = IOUtils.toByteArray(response.getEntity().getContent()); + _contentLength = bytes.length; + return new ByteArrayInputStream(bytes); } finally { if (method != null) { - method.releaseConnection(); + method.reset(); } } } @@ -143,25 +142,38 @@ public class DefaultHTTPClient implements HTTPClient { } private void ensureClientInitialized() { - if(configuration == null) throw new IllegalStateException("client must be initialized first."); - if (client != null) return; - client = new HttpClient(manager); - HttpConnectionManager connectionManager = client.getHttpConnectionManager(); - HttpConnectionManagerParams params = connectionManager.getParams(); - params.setConnectionTimeout(configuration.getDefaultTimeout()); - params.setSoTimeout(configuration.getDefaultTimeout()); - params.setMaxTotalConnections(configuration.getMaxConnections()); - - HostConfiguration hostConf = client.getHostConfiguration(); - List<Header> headers = new ArrayList<Header>(); - headers.add(new Header("User-Agent", configuration.getUserAgent())); + if (configuration == null) + throw new IllegalStateException("client must be initialized first."); + if (client != null) + return; + + RequestConfig requestConfig = RequestConfig.custom() + .setConnectTimeout(getConnectionTimeout()) + .setSocketTimeout(getSoTimeout()) + .setRedirectsEnabled(true) + .build(); + + SocketConfig socketConfig = SocketConfig.custom() + .setSoTimeout(getSoTimeout()) + .build(); + + List<Header> headers = new ArrayList<>(); + headers.add(new BasicHeader("User-Agent", configuration.getUserAgent())); if (configuration.getAcceptHeader() != null) { - headers.add(new Header("Accept", configuration.getAcceptHeader())); + headers.add(new BasicHeader("Accept", configuration.getAcceptHeader())); } - headers.add(new Header("Accept-Language", "en-us,en-gb,en,*;q=0.3")); //TODO: this must become parametric. - headers.add(new Header("Accept-Charset", "utf-8,iso-8859-1;q=0.7,*;q=0.5")); - // headers.add(new Header("Accept-Encoding", "x-gzip, gzip")); - hostConf.getParams().setParameter("http.default-headers", headers); + headers.add(new BasicHeader("Accept-Language", "en-us,en-gb,en,*;q=0.3")); //TODO: this must become parametric. + // headers.add(new BasicHeader("Accept-Encoding", "x-gzip, gzip")); + headers.add(new BasicHeader("Accept-Charset", "utf-8,iso-8859-1;q=0.7,*;q=0.5")); + + + client = HttpClients.custom() + .setConnectionManager(manager) + .setDefaultRequestConfig(requestConfig) + .setDefaultSocketConfig(socketConfig) + .setMaxConnTotal(configuration.getMaxConnections()) + .setDefaultHeaders(headers) + .build(); } } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/db25f021/core/src/main/java/org/apache/any23/http/HTTPClient.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/http/HTTPClient.java b/core/src/main/java/org/apache/any23/http/HTTPClient.java index 0bc4dbc..3f08975 100644 --- a/core/src/main/java/org/apache/any23/http/HTTPClient.java +++ b/core/src/main/java/org/apache/any23/http/HTTPClient.java @@ -33,7 +33,7 @@ public interface HTTPClient { * * @param configuration configuration for the HTTP Client. */ - public abstract void init(HTTPClientConfiguration configuration); + void init(HTTPClientConfiguration configuration); /** * Opens the input stream for the given target IRI. @@ -42,7 +42,7 @@ public interface HTTPClient { * @return input stream to access IRI content. * @throws IOException if any error occurs while reading the IRI content. */ - public abstract InputStream openInputStream(String uri) throws IOException; + InputStream openInputStream(String uri) throws IOException; /** * Release all static resources help by the instance. Call this @@ -50,7 +50,7 @@ public interface HTTPClient { * application, like for example when shutting down a servlet * context. */ - public abstract void close(); + void close(); /** * The value of the Content-Type header reported by the server. @@ -58,12 +58,12 @@ public interface HTTPClient { * * @return the content type as string. */ - public abstract String getContentType(); + String getContentType(); /** * @return content length in bytes. */ - public abstract long getContentLength(); + long getContentLength(); /** * Returns the actual IRI from which the document was fetched. @@ -73,6 +73,6 @@ public interface HTTPClient { * * @return actual document IRI. */ - public abstract String getActualDocumentIRI(); + String getActualDocumentIRI(); } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/db25f021/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java b/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java index fef124d..e9cebee 100644 --- a/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java +++ b/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java @@ -17,15 +17,13 @@ package org.apache.any23.source; -import org.apache.any23.http.DefaultHTTPClient; import org.apache.any23.http.HTTPClient; -import org.apache.commons.httpclient.URI; -import org.apache.commons.httpclient.URIException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.io.InputStream; +import java.net.URI; import java.net.URISyntaxException; /** @@ -50,13 +48,12 @@ public class HTTPDocumentSource implements DocumentSource { private String normalize(String uri) throws URISyntaxException { try { - URI normalized = new URI(uri, DefaultHTTPClient.isUrlEncoded(uri)); - normalized.normalize(); + URI normalized = new URI(uri).normalize(); return normalized.toString(); - } catch (URIException e) { + } catch (URISyntaxException e) { LOG.warn("Invalid uri: {}", uri); LOG.error("Can not convert URL", e); - throw new URISyntaxException(uri, e.getMessage()); + throw e; } } http://git-wip-us.apache.org/repos/asf/any23/blob/db25f021/core/src/main/java/org/apache/any23/util/LogUtils.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/util/LogUtils.java b/core/src/main/java/org/apache/any23/util/LogUtils.java index ef43c20..30b24ca 100644 --- a/core/src/main/java/org/apache/any23/util/LogUtils.java +++ b/core/src/main/java/org/apache/any23/util/LogUtils.java @@ -27,8 +27,6 @@ public class LogUtils { public static void setDefaultLogging() { Logger.getLogger("").setLevel(Level.WARNING); - // Suppress silly cookie warnings. - Logger.getLogger("org.apache.commons.httpclient").setLevel(Level.SEVERE); Logger.getLogger("").getHandlers()[0].setLevel(Level.ALL); } http://git-wip-us.apache.org/repos/asf/any23/blob/db25f021/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 4a62dd1..f0f809d 100644 --- a/pom.xml +++ b/pom.xml @@ -332,11 +332,6 @@ <version>2.6</version> </dependency> <dependency> - <groupId>commons-httpclient</groupId> - <artifactId>commons-httpclient</artifactId> - <version>3.1</version> - </dependency> - <dependency> <groupId>org.apache.httpcomponents</groupId> <artifactId>httpclient</artifactId> <version>${httpclient.version}</version> http://git-wip-us.apache.org/repos/asf/any23/blob/db25f021/service/src/main/java/org/apache/any23/servlet/Servlet.java ---------------------------------------------------------------------- diff --git a/service/src/main/java/org/apache/any23/servlet/Servlet.java b/service/src/main/java/org/apache/any23/servlet/Servlet.java index 154f41d..ad7c1ed 100644 --- a/service/src/main/java/org/apache/any23/servlet/Servlet.java +++ b/service/src/main/java/org/apache/any23/servlet/Servlet.java @@ -29,7 +29,6 @@ import org.apache.any23.source.ByteArrayDocumentSource; import org.apache.any23.source.DocumentSource; import org.apache.any23.source.HTTPDocumentSource; import org.apache.any23.source.StringDocumentSource; -import org.apache.commons.httpclient.URI; import org.eclipse.rdf4j.rio.RDFFormat; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -41,6 +40,7 @@ import javax.servlet.http.HttpServletResponse; import java.io.File; import java.io.IOException; +import java.net.URI; import java.net.URISyntaxException; import java.util.regex.Pattern; @@ -286,7 +286,7 @@ public class Servlet extends HttpServlet { private boolean isValidIRI(String s) { try { - URI uri = new URI(s, false); + URI uri = new URI(s); if (!"http".equals(uri.getScheme()) && !"https".equals(uri.getScheme())) { return false; }
