Fix URL encoding issues
Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/4249ef32 Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/4249ef32 Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/4249ef32 Branch: refs/heads/master Commit: 4249ef3229565cd810eff2f79c1c6b06013d96a0 Parents: c224e26 Author: Eugene Dzhurinsky <[email protected]> Authored: Sun Dec 22 23:37:04 2013 -0500 Committer: Eugene Dzhurinsky <[email protected]> Committed: Thu May 8 23:03:21 2014 -0400 ---------------------------------------------------------------------- .../apache/any23/source/HTTPDocumentSource.java | 19 +++++++-- .../java/org/apache/any23/servlet/Servlet.java | 41 ++++++++++---------- 2 files changed, 36 insertions(+), 24 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/4249ef32/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java b/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java index 6ea2cc8..709bf5a 100644 --- a/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java +++ b/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java @@ -18,10 +18,13 @@ package org.apache.any23.source; import org.apache.any23.http.HTTPClient; +import org.apache.commons.httpclient.URI; +import org.apache.commons.httpclient.URIException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; import java.io.InputStream; -import java.net.URI; import java.net.URISyntaxException; /** @@ -29,6 +32,8 @@ import java.net.URISyntaxException; */ public class HTTPDocumentSource implements DocumentSource { + private static final Logger LOG = LoggerFactory.getLogger(HTTPDocumentSource.class); + private final HTTPClient client; private String uri; @@ -43,7 +48,15 @@ public class HTTPDocumentSource implements DocumentSource { } private String normalize(String uri) throws URISyntaxException { - return new URI(uri).normalize().toString(); + try { + URI normalized = new URI(uri, false); + normalized.normalize(); + return normalized.toString(); + } catch (URIException e) { + LOG.warn("Invalid uri: {}", uri); + LOG.error("Can not convert URL", e); + throw new URISyntaxException(uri, e.getMessage()); + } } private void ensureOpen() throws IOException { @@ -80,5 +93,5 @@ public class HTTPDocumentSource implements DocumentSource { public boolean isLocal() { return false; } - + } http://git-wip-us.apache.org/repos/asf/any23/blob/4249ef32/service/src/main/java/org/apache/any23/servlet/Servlet.java ---------------------------------------------------------------------- diff --git a/service/src/main/java/org/apache/any23/servlet/Servlet.java b/service/src/main/java/org/apache/any23/servlet/Servlet.java index 0a968de..31f104e 100644 --- a/service/src/main/java/org/apache/any23/servlet/Servlet.java +++ b/service/src/main/java/org/apache/any23/servlet/Servlet.java @@ -26,14 +26,16 @@ import org.apache.any23.source.ByteArrayDocumentSource; import org.apache.any23.source.DocumentSource; import org.apache.any23.source.HTTPDocumentSource; import org.apache.any23.source.StringDocumentSource; +import org.apache.commons.httpclient.URI; import org.openrdf.rio.RDFFormat; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import javax.servlet.ServletException; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import java.io.IOException; -import java.net.URI; import java.net.URISyntaxException; import java.util.regex.Pattern; @@ -48,6 +50,8 @@ import static org.apache.any23.extractor.ExtractionParameters.ValidationMode; */ public class Servlet extends HttpServlet { + private static final Logger LOG = LoggerFactory.getLogger(Servlet.class); + public static final String DEFAULT_BASE_URI = "http://any23.org/tmp/"; private static final long serialVersionUID = 8207685628715421336L; @@ -135,23 +139,17 @@ public class Servlet extends HttpServlet { MediaRangeSpec result = Any23Negotiator.getNegotiator().getBestMatch(request.getHeader("Accept")); if (result == null) { return null; - } - else if (RDFFormat.TURTLE.hasMIMEType(result.getMediaType())) { + } else if (RDFFormat.TURTLE.hasMIMEType(result.getMediaType())) { return "turtle"; - } - else if (RDFFormat.N3.hasMIMEType(result.getMediaType())) { + } else if (RDFFormat.N3.hasMIMEType(result.getMediaType())) { return "n3"; - } - else if (RDFFormat.NQUADS.hasMIMEType(result.getMediaType())) { + } else if (RDFFormat.NQUADS.hasMIMEType(result.getMediaType())) { return "nq"; - } - else if (RDFFormat.RDFXML.hasMIMEType(result.getMediaType())) { + } else if (RDFFormat.RDFXML.hasMIMEType(result.getMediaType())) { return "rdf"; - } - else if (RDFFormat.NTRIPLES.hasMIMEType(result.getMediaType())) { + } else if (RDFFormat.NTRIPLES.hasMIMEType(result.getMediaType())) { return "nt"; - } - else { + } else { return "turtle"; // shouldn't happen } } @@ -220,13 +218,14 @@ public class Servlet extends HttpServlet { } private DocumentSource createHTTPDocumentSource(WebResponder responder, String uri, boolean report) - throws IOException { + throws IOException { try { if (!isValidURI(uri)) { throw new URISyntaxException(uri, "@@@"); } return createHTTPDocumentSource(responder.getRunner().getHTTPClient(), uri); } catch (URISyntaxException ex) { + LOG.error("Invalid URI detected", ex); responder.sendError(400, "Invalid input URI " + uri, report); return null; } @@ -239,11 +238,11 @@ public class Servlet extends HttpServlet { private boolean isValidURI(String s) { try { - URI uri = new URI(s); + URI uri = new URI(s, false); if (!"http".equals(uri.getScheme()) && !"https".equals(uri.getScheme())) { return false; } - } catch (URISyntaxException e) { + } catch (Exception e) { return false; } return true; @@ -252,15 +251,15 @@ public class Servlet extends HttpServlet { private ValidationMode getValidationMode(HttpServletRequest request) { final String PARAMETER = "validation-mode"; final String validationMode = request.getParameter(PARAMETER); - if(validationMode == null) return ValidationMode.None; - if("none".equalsIgnoreCase(validationMode)) return ValidationMode.None; - if("validate".equalsIgnoreCase(validationMode)) return ValidationMode.Validate; - if("validate-fix".equalsIgnoreCase(validationMode)) return ValidationMode.ValidateAndFix; + if (validationMode == null) return ValidationMode.None; + if ("none".equalsIgnoreCase(validationMode)) return ValidationMode.None; + if ("validate".equalsIgnoreCase(validationMode)) return ValidationMode.Validate; + if ("validate-fix".equalsIgnoreCase(validationMode)) return ValidationMode.ValidateAndFix; throw new IllegalArgumentException( String.format("Invalid value '%s' for '%s' parameter.", validationMode, PARAMETER) ); } - + private ExtractionParameters getExtractionParameters(HttpServletRequest request) { final ValidationMode mode = getValidationMode(request); return new ExtractionParameters(DefaultConfiguration.singleton(), mode);
