Yet another fix for URL processing - do not escape what is already escaped :)
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/f773f840
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/f773f840
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/f773f840

Branch: refs/heads/master
Commit: f773f840e93766265e87038688f9d36d4fe7e939
Parents: 64ae99b
Author: Eugene Dzhurinsky <[email protected]>
Authored: Wed Dec 25 18:47:40 2013 -0500
Committer: Eugene Dzhurinsky <[email protected]>
Committed: Thu May 8 23:03:22 2014 -0400

----------------------------------------------------------------------
 .../apache/any23/http/DefaultHTTPClient.java    | 34 +++++++-------------
 .../apache/any23/source/HTTPDocumentSource.java |  3 +-
 2 files changed, 13 insertions(+), 24 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/any23/blob/f773f840/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java b/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java
index 967f59f..f533040 100644
--- a/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java
+++ b/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java
@@ -17,22 +17,16 @@
 
 package org.apache.any23.http;
 
-import org.apache.commons.httpclient.Header;
-import org.apache.commons.httpclient.HostConfiguration;
-import org.apache.commons.httpclient.HttpClient;
-import org.apache.commons.httpclient.HttpConnectionManager;
-import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
+import org.apache.commons.httpclient.*;
 import org.apache.commons.httpclient.methods.GetMethod;
 import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
 
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.net.URI;
-import java.net.URISyntaxException;
-import java.net.URLEncoder;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.regex.Pattern;
 
 /**
  * Opens an {@link InputStream} on an HTTP URI. Is configured
@@ -43,6 +37,8 @@ import java.util.List;
  */
 public class DefaultHTTPClient implements HTTPClient {
 
+    private static final Pattern ESCAPED_PATTERN = Pattern.compile("%[0-9a-f]{2}",Pattern.CASE_INSENSITIVE);
+
     private final MultiThreadedHttpConnectionManager manager = new MultiThreadedHttpConnectionManager();
 
     private HTTPClientConfiguration configuration;
@@ -55,6 +51,10 @@ public class DefaultHTTPClient implements HTTPClient {
 
     private String contentType = null;
 
+    public static final boolean isUrlEncoded(String url) {
+        return ESCAPED_PATTERN.matcher(url).find();
+    }
+
     /**
      * Creates a {@link DefaultHTTPClient} instance already initialized
      *
@@ -86,22 +86,10 @@ public class DefaultHTTPClient implements HTTPClient {
         ensureClientInitialized();
         String uriStr;
         try {
-            URI uriObj = new URI(uri);
+            URI uriObj = new URI(uri, isUrlEncoded(uri));
             // [scheme:][//authority][path][?query][#fragment]
-            final String path = uriObj.getPath();
-            final String query = uriObj.getQuery();
-            final String fragment = uriObj.getFragment();
-            uriStr = String.format(
-                    "%s://%s%s%s%s%s%s",
-                    uriObj.getScheme(),
-                    uriObj.getAuthority(),
-                    path,
-                    query == null ? "" : "?",
-                    query,
-                    fragment == null ? "" : "#",
-                    fragment != null ? URLEncoder.encode(fragment, "UTF-8") : ""
-            );
+            uriStr = uriObj.toString();
-        } catch (URISyntaxException e) {
+        } catch (URIException e) {
             throw new IllegalArgumentException("Invalid URI string.", e);
         }
         method = new GetMethod(uriStr);

http://git-wip-us.apache.org/repos/asf/any23/blob/f773f840/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java b/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java
index 709bf5a..61a1b2d 100644
--- a/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java
+++ b/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java
@@ -17,6 +17,7 @@
 
 package org.apache.any23.source;
 
+import org.apache.any23.http.DefaultHTTPClient;
 import org.apache.any23.http.HTTPClient;
 import org.apache.commons.httpclient.URI;
 import org.apache.commons.httpclient.URIException;
@@ -49,7 +50,7 @@ public class HTTPDocumentSource implements DocumentSource {
 
     private String normalize(String uri) throws URISyntaxException {
         try {
-            URI normalized = new URI(uri, false);
+            URI normalized = new URI(uri, DefaultHTTPClient.isUrlEncoded(uri));
             normalized.normalize();
             return normalized.toString();
         } catch (URIException e) {
