http://git-wip-us.apache.org/repos/asf/aurora/blob/86a547b9/commons/src/main/java/com/twitter/common/net/ProxyAuthorizer.java ---------------------------------------------------------------------- diff --git a/commons/src/main/java/com/twitter/common/net/ProxyAuthorizer.java b/commons/src/main/java/com/twitter/common/net/ProxyAuthorizer.java new file mode 100644 index 0000000..151088a --- /dev/null +++ b/commons/src/main/java/com/twitter/common/net/ProxyAuthorizer.java @@ -0,0 +1,44 @@ +// ================================================================================================= +// Copyright 2011 Twitter, Inc. +// ------------------------------------------------------------------------------------------------- +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this work except in compliance with the License. +// You may obtain a copy of the License in the LICENSE file, or at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
// =================================================================================================

package com.twitter.common.net;

import org.apache.commons.codec.binary.Base64;

import java.net.HttpURLConnection;

/**
 * Adds an HTTP Basic {@code Proxy-Authorization} header, built from the credentials of the
 * {@link ProxyConfig} it was created with, to outgoing HTTP connections.
 *
 * @author William Farner
 */
public class ProxyAuthorizer {
  // Explicit charsets keep the header independent of the JVM's platform default encoding.
  // Fully qualified names are used so the file's import list does not have to change.
  private static final java.nio.charset.Charset CREDENTIALS_CHARSET =
      java.nio.charset.Charset.forName("UTF-8");
  private static final java.nio.charset.Charset BASE64_CHARSET =
      java.nio.charset.Charset.forName("US-ASCII");

  private final ProxyConfig config;

  private ProxyAuthorizer(ProxyConfig config) {
    this.config = config;
  }

  /**
   * Creates an authorizer that uses the credentials of {@code config}.
   *
   * @param config The proxy configuration to read the user name and password from.
   * @return A new authorizer backed by {@code config}.
   */
  public static ProxyAuthorizer adapt(ProxyConfig config) {
    return new ProxyAuthorizer(config);
  }

  /**
   * Sets the {@code Proxy-Authorization} request property of {@code httpCon} to
   * {@code "Basic " + base64(user + ":" + password)}.
   *
   * <p>{@link ProxyConfig} declares both credentials as nullable. When no user is configured the
   * connection is left untouched instead of sending the literal credentials {@code null:null};
   * a missing password is sent as the empty string.
   *
   * @param httpCon The connection to authorize; must not yet be connected, since request
   *     properties cannot be changed afterwards.
   */
  public void authorize(HttpURLConnection httpCon) {
    String user = config.getProxyUser();
    if (user == null) {
      // No credentials configured: nothing to authorize with.
      return;
    }
    String password = config.getProxyPassword();
    String credentials = user + ":" + (password == null ? "" : password);

    byte[] encoded = Base64.encodeBase64(credentials.getBytes(CREDENTIALS_CHARSET));
    // trim() is retained from the original as a guard against encoders that append a line break.
    httpCon.setRequestProperty("Proxy-Authorization",
        "Basic " + new String(encoded, BASE64_CHARSET).trim());
  }
}
// http://git-wip-us.apache.org/repos/asf/aurora/blob/86a547b9/commons/src/main/java/com/twitter/common/net/ProxyConfig.java
// ----------------------------------------------------------------------
// diff --git a/commons/src/main/java/com/twitter/common/net/ProxyConfig.java b/commons/src/main/java/com/twitter/common/net/ProxyConfig.java
// new file mode 100644
// index 0000000..d0748d8
// --- /dev/null
// +++ b/commons/src/main/java/com/twitter/common/net/ProxyConfig.java
// @@ -0,0 +1,33 @@
// =================================================================================================
// Copyright 2011 Twitter, Inc.
// -------------------------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this work except in compliance with the License.
// You may obtain a copy of the License in the LICENSE file, or at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =================================================================================================

package com.twitter.common.net;

import java.net.InetSocketAddress;
import javax.annotation.Nullable;

/**
 * Describes how to reach a proxy: the address to connect to plus the optional user name and
 * password to present to it.
 *
 * @author John Corwin
 */
public interface ProxyConfig {

  /**
   * @return The socket address of the proxy.
   */
  InetSocketAddress getProxyAddress();

  /**
   * @return The user name to present to the proxy; may be {@code null}.
   */
  @Nullable
  String getProxyUser();

  /**
   * @return The password to present to the proxy; may be {@code null}.
   */
  @Nullable
  String getProxyPassword();
}

// http://git-wip-us.apache.org/repos/asf/aurora/blob/86a547b9/commons/src/main/java/com/twitter/common/net/UrlHelper.java
// ----------------------------------------------------------------------
// diff --git a/commons/src/main/java/com/twitter/common/net/UrlHelper.java b/commons/src/main/java/com/twitter/common/net/UrlHelper.java
// new file mode 100644
// index 0000000..a453336
// --- /dev/null
// +++ b/commons/src/main/java/com/twitter/common/net/UrlHelper.java
// @@ -0,0 +1,159 @@
// =================================================================================================
// Copyright 2011 Twitter, Inc.
// -------------------------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this work except in compliance with the License.
// You may obtain a copy of the License in the LICENSE file, or at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =================================================================================================

package com.twitter.common.net;

import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;

import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Arrays;
import java.util.List;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers for picking URLs apart: extracting the host or path, stripping query
 * parameters, defaulting the protocol, and enumerating the domain levels of a host.
 *
 * @author William Farner
 */
public class UrlHelper {

  private static final Logger LOG = Logger.getLogger(UrlHelper.class.getName());

  /**
   * Gets the domain from {@code url}.
   *
   * @param url A url; must not be {@code null}.
   * @return The domain portion of the URL, or {@code null} if the url is invalid.
   */
  public static String getDomain(String url) {
    try {
      return getDomainChecked(url);
    } catch (URISyntaxException e) {
      LOG.finest("Malformed url: " + url);
      return null;
    }
  }

  /**
   * Gets the domain from {@code url}, and throws an exception if it's not a valid uri.
   * An {@code http://} prefix is added first when the url does not name an http(s) protocol.
   *
   * @param url A url; must not be {@code null}.
   * @throws URISyntaxException if url is not a valid {@code URI}
   * @return The domain portion of the given url, or {@code null} if the host is undefined.
   */
  public static String getDomainChecked(String url) throws URISyntaxException {
    Preconditions.checkNotNull(url);
    url = addProtocol(url);
    return new URI(url).getHost();
  }

  /**
   * Gets the path from {@code url}.
   * An {@code http://} prefix is added first when the url does not name an http(s) protocol.
   *
   * @param url A url; must not be {@code null}.
   * @return The path portion of the URL, or {@code null} if the url is invalid.
   */
  public static String getPath(String url) {
    Preconditions.checkNotNull(url);
    url = addProtocol(url);
    try {
      return new URI(url).getPath();
    } catch (URISyntaxException e) {
      LOG.info("Malformed url: " + url);
      return null;
    }
  }

  /**
   * Strips URL parameters from a url.
   * This will remove anything after and including the first question mark in the URL.
   *
   * @param url The URL to strip parameters from; must not be {@code null}.
   * @return The original URL with parameters stripped, which will be the original URL if no
   *     parameters were found.
   */
  public static String stripUrlParameters(String url) {
    Preconditions.checkNotNull(url);
    int paramStartIndex = url.indexOf("?");
    if (paramStartIndex == -1) {
      return url;
    } else {
      return url.substring(0, paramStartIndex);
    }
  }

  /**
   * Convenience method that calls {@link #stripUrlParameters(String)} for a URL.
   *
   * @param url The URL to strip parameters from.
   * @return The original URL with parameters stripped, which will be the original URL if no
   *     parameters were found.
   */
  public static String stripUrlParameters(URL url) {
    return stripUrlParameters(url.toString());
  }

  // Matches a leading http:// or https://, in any letter case.
  private static final Pattern URL_PROTOCOL_REGEX =
      Pattern.compile("^https?://", Pattern.CASE_INSENSITIVE);

  /**
   * Checks whether a URL specifies its protocol, prepending http if it does not.
   * Only {@code http://} and {@code https://} (case-insensitive) count as a specified protocol.
   *
   * @param url The URL to fix; must not be {@code null}.
   * @return The URL with the http protocol specified if no protocol was already specified.
   */
  public static String addProtocol(String url) {
    Preconditions.checkNotNull(url);
    Matcher matcher = URL_PROTOCOL_REGEX.matcher(url);
    if (!matcher.find()) {
      url = "http://" + url;
    }
    return url;
  }

  /**
   * Gets the domain levels for a host, from most to least specific. A {@code www.} level is
   * added in front when the host does not already start with one.
   * For example, sub1.sub2.domain.co.uk would return
   * [www.sub1.sub2.domain.co.uk, sub1.sub2.domain.co.uk, sub2.domain.co.uk, domain.co.uk,
   * co.uk, uk].
   *
   * @param host The host to peel subdomains off from; must not be {@code null}.
   * @return The domain levels in this host.
   */
  public static List<String> getDomainLevels(String host) {
    Preconditions.checkNotNull(host);

    // Automatically include www prefix if not present. The trailing dot matters: a host such as
    // "wwwexample.com" merely begins with the letters "www" and still needs the prefix.
    if (!host.startsWith("www.")) {
      host = "www." + host;
    }

    Joiner joiner = Joiner.on(".");
    List<String> domainParts = Lists.newLinkedList(Arrays.asList(host.split("\\.")));
    List<String> levels = Lists.newLinkedList();

    // Emit the remaining labels joined back together, then drop the left-most label and repeat.
    while (!domainParts.isEmpty()) {
      levels.add(joiner.join(domainParts));
      domainParts.remove(0);
    }

    return levels;
  }
}

// http://git-wip-us.apache.org/repos/asf/aurora/blob/86a547b9/commons/src/main/java/com/twitter/common/net/UrlResolver.java
// ----------------------------------------------------------------------
// diff --git a/commons/src/main/java/com/twitter/common/net/UrlResolver.java b/commons/src/main/java/com/twitter/common/net/UrlResolver.java
// new file mode 100644
// index 0000000..53e7ee7
// --- /dev/null
// +++ b/commons/src/main/java/com/twitter/common/net/UrlResolver.java
// @@ -0,0 +1,449 @@
// =================================================================================================
// Copyright 2011 Twitter, Inc.
// -------------------------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this work except in compliance with the License.
// You may obtain a copy of the License in the LICENSE file, or at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =================================================================================================

package com.twitter.common.net;

import java.io.IOException;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.annotation.Nullable;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Functions;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.util.concurrent.ListenableFutureTask;
import com.google.common.util.concurrent.ThreadFactoryBuilder;

import com.twitter.common.base.ExceptionalFunction;
import com.twitter.common.net.UrlResolver.ResolvedUrl.EndState;
import com.twitter.common.quantity.Amount;
import com.twitter.common.quantity.Time;
import com.twitter.common.stats.PrintableHistogram;
import com.twitter.common.util.BackoffStrategy;
import com.twitter.common.util.Clock;
import com.twitter.common.util.TruncatedBinaryBackoff;
import com.twitter.common.util.caching.Cache;
import com.twitter.common.util.caching.LRUCache;

/**
 * Class to aid in resolving URLs by following redirects, which can optionally be performed
 * asynchronously using a thread pool.
 *
 * @author William Farner
 */
public class UrlResolver {
  private static final Logger LOG = Logger.getLogger(UrlResolver.class.getName());

  private static final String TWITTER_UA = "Twitterbot/0.1";
  private static final UrlResolverUtil URL_RESOLVER =
      new UrlResolverUtil(Functions.constant(TWITTER_UA));

  // Resolver used when no explicit proxy configuration is supplied.
  private static final ExceptionalFunction<String, String, IOException> RESOLVER =
      new ExceptionalFunction<String, String, IOException>() {
        @Override public String apply(String url) throws IOException {
          return URL_RESOLVER.getEffectiveUrl(url, null);
        }
      };

  /**
   * Returns the single-hop resolve function to use: one bound to {@code proxyConfig} when it is
   * given, otherwise the shared proxy-less {@link #RESOLVER}.
   */
  private static ExceptionalFunction<String, String, IOException>
      getUrlResolver(final @Nullable ProxyConfig proxyConfig) {
    if (proxyConfig != null) {
      return new ExceptionalFunction<String, String, IOException>() {
        @Override public String apply(String url) throws IOException {
          return URL_RESOLVER.getEffectiveUrl(url, proxyConfig);
        }
      };
    } else {
      return RESOLVER;
    }
  }

  private final ExceptionalFunction<String, String, IOException> resolver;
  private final int maxRedirects;

  // Tracks the number of active tasks (threads in use). Null for a synchronous-only resolver.
  private final Semaphore poolEntrySemaphore;
  // Null for a synchronous-only resolver.
  private final Integer threadPoolSize;

  // Helps with signaling the handler. Null for a synchronous-only resolver.
  private final Executor handlerExecutor;

  // Manages the thread pool and task execution. Replaced by reset().
  private ExecutorService executor;

  // Cache to store resolved URLs.
  private final Cache<String, String> urlCache = LRUCache.<String, String>builder()
      .maxSize(10000)
      .makeSynchronized(true)
      .build();

  // Variables to track connection/request stats.
  private final AtomicInteger requestCount = new AtomicInteger(0);
  private final AtomicInteger cacheHits = new AtomicInteger(0);
  private final AtomicInteger failureCount = new AtomicInteger(0);
  // Tracks the time (in milliseconds) required to resolve URLs.
  private final PrintableHistogram urlResolutionTimesMs = new PrintableHistogram(
      1, 5, 10, 25, 50, 75, 100, 150, 200, 250, 300, 500, 750, 1000, 1500, 2000);

  private final Clock clock;
  private final BackoffStrategy backoffStrategy;

  /**
   * Creates a synchronous-only resolver (no thread pool) with injectable collaborators.
   */
  @VisibleForTesting
  UrlResolver(Clock clock, BackoffStrategy backoffStrategy,
      ExceptionalFunction<String, String, IOException> resolver, int maxRedirects) {
    this(clock, backoffStrategy, resolver, maxRedirects, null);
  }

  /**
   * Creates a new asynchronous URL resolver. A thread pool will be used to resolve URLs, and
   * resolved URLs will be announced via the handler passed to
   * {@link #resolveUrlAsync(String, ResolvedUrlHandler)}.
   *
   * @param maxRedirects The maximum number of HTTP redirects to follow.
   * @param threadPoolSize The number of threads to use for resolving URLs; must be positive.
   * @param proxyConfig The proxy settings with which to make the HTTP request, or null for the
   *    default configured proxy.
   */
  public UrlResolver(int maxRedirects, int threadPoolSize, @Nullable ProxyConfig proxyConfig) {
    this(Clock.SYSTEM_CLOCK,
        new TruncatedBinaryBackoff(Amount.of(100L, Time.MILLISECONDS), Amount.of(1L, Time.SECONDS)),
        getUrlResolver(proxyConfig), maxRedirects, threadPoolSize);
  }

  /**
   * Creates a new asynchronous URL resolver that uses the default configured proxy.
   *
   * @param maxRedirects The maximum number of HTTP redirects to follow.
   * @param threadPoolSize The number of threads to use for resolving URLs; must be positive.
   */
  public UrlResolver(int maxRedirects, int threadPoolSize) {
    this(maxRedirects, threadPoolSize, null);
  }

  private UrlResolver(Clock clock, BackoffStrategy backoffStrategy,
      ExceptionalFunction<String, String, IOException> resolver, int maxRedirects,
      @Nullable Integer threadPoolSize) {
    this.clock = clock;
    this.backoffStrategy = backoffStrategy;
    this.resolver = resolver;
    this.maxRedirects = maxRedirects;

    if (threadPoolSize != null) {
      this.threadPoolSize = threadPoolSize;
      Preconditions.checkState(threadPoolSize > 0);
      poolEntrySemaphore = new Semaphore(threadPoolSize);

      // Start up the thread pool.
      reset();

      // Executor to send notifications back to the handler.  This also needs to be
      // a daemon thread.
      handlerExecutor =
          Executors.newSingleThreadExecutor(new ThreadFactoryBuilder().setDaemon(true).build());
    } else {
      this.threadPoolSize = null;
      poolEntrySemaphore = null;
      handlerExecutor = null;
    }
  }

  /**
   * Resolves {@code url} on the thread pool and passes the finished future to {@code handler}.
   * Blocks the caller until one of the {@code threadPoolSize} permits is free.
   *
   * @param url The URL to resolve.
   * @param handler Notified, on the handler executor, once resolution has finished.
   * @return The future for the pending resolution, or {@code null} if the calling thread was
   *     interrupted while waiting for a free permit (its interrupt status is then re-set).
   * @throws NullPointerException If {@code handler} is null.
   * @throws IllegalStateException If this resolver was created without a thread pool.
   */
  public Future<ResolvedUrl> resolveUrlAsync(final String url, final ResolvedUrlHandler handler) {
    // Guava's signature is checkNotNull(reference, errorMessage); the reference goes first.
    Preconditions.checkNotNull(handler,
        "Asynchronous URL resolution cannot be performed without a valid handler.");
    Preconditions.checkState(threadPoolSize != null,
        "resolveUrlAsync() should not be called on a synchronous URL resolver.");

    try {
      poolEntrySemaphore.acquire();
    } catch (InterruptedException e) {
      LOG.log(Level.SEVERE, "Interrupted while waiting for thread to resolve URL: " + url, e);
      // Preserve interrupt status for the caller.
      Thread.currentThread().interrupt();
      return null;
    }
    final ListenableFutureTask<ResolvedUrl> future =
        ListenableFutureTask.create(
          new Callable<ResolvedUrl>() {
            @Override public ResolvedUrl call() {
              return resolveUrl(url);
            }
          });

    future.addListener(new Runnable() {
      @Override public void run() {
        try {
          handler.resolved(future);
        } finally {
          poolEntrySemaphore.release();
        }
      }
    }, handlerExecutor);

    try {
      executor.execute(future);
    } catch (RuntimeException e) {
      // The task was not accepted (e.g. the pool has been shut down), so the listener that
      // normally returns the permit will never run; give the permit back here.
      poolEntrySemaphore.release();
      throw e;
    }
    return future;
  }

  private void logThreadpoolInfo() {
    LOG.info("Shutting down thread pool, available permits: "
             + poolEntrySemaphore.availablePermits());
    LOG.info("Queued threads? " + poolEntrySemaphore.hasQueuedThreads());
    LOG.info("Queue length: " + poolEntrySemaphore.getQueueLength());
  }

  /**
   * Creates a fresh thread pool. Any previous pool must already be shut down and terminated,
   * see {@link #clearAsyncTasks(int)}.
   *
   * @throws IllegalStateException If this resolver is synchronous, or the previous pool is still
   *     running.
   */
  public void reset() {
    Preconditions.checkState(threadPoolSize != null);
    if (executor != null) {
      Preconditions.checkState(executor.isShutdown(),
          "The thread pool must be shut down before resetting.");
      Preconditions.checkState(executor.isTerminated(), "There may still be pending async tasks.");
    }

    // Create a thread pool with daemon threads, so that they may be terminated when no
    // application threads are running.
    executor = Executors.newFixedThreadPool(threadPoolSize,
        new ThreadFactoryBuilder().setDaemon(true).setNameFormat("UrlResolver[%d]").build());
  }

  /**
   * Terminates the thread pool, waiting at most {@code waitSeconds} for active threads to complete
   * and, if they do not, up to {@code waitSeconds} more after forcing a shutdown.
   * After this method is called, no more URLs may be submitted for resolution until
   * {@link #reset()} is called.
   *
   * @param waitSeconds The number of seconds to wait for active threads to complete.
   * @throws IllegalStateException If this resolver was created without a thread pool.
   */
  public void clearAsyncTasks(int waitSeconds) {
    Preconditions.checkState(threadPoolSize != null,
        "clearAsyncTasks() should not be called on a synchronous URL resolver.");

    logThreadpoolInfo();
    executor.shutdown(); // Disable new tasks from being submitted.
    try {
      // Wait a while for existing tasks to terminate
      if (!executor.awaitTermination(waitSeconds, TimeUnit.SECONDS)) {
        LOG.info("Pool did not terminate, forcing shutdown.");
        logThreadpoolInfo();
        List<Runnable> remaining = executor.shutdownNow();
        LOG.info("Tasks still running: " + remaining);
        // Wait a while for tasks to respond to being cancelled
        if (!executor.awaitTermination(waitSeconds, TimeUnit.SECONDS)) {
          LOG.warning("Pool did not terminate.");
          logThreadpoolInfo();
        }
      }
    } catch (InterruptedException e) {
      LOG.log(Level.WARNING, "Interrupted while waiting for threadpool to finish.", e);
      // (Re-)Cancel if current thread also interrupted
      executor.shutdownNow();
      // Preserve interrupt status
      Thread.currentThread().interrupt();
    }
  }

  /**
   * Resolves a URL synchronously, following at most {@code maxRedirects} hops. A hop that yields
   * no result is retried after a backoff and counts against the same limit.
   *
   * @param url The URL to resolve.
   * @return The resolved URL; its {@link ResolvedUrl#getEndState() end state} tells how
   *     resolution finished.
   * @throws RuntimeException If the thread is interrupted while backing off before a retry.
   */
  public ResolvedUrl resolveUrl(String url) {
    ResolvedUrl resolvedUrl = new ResolvedUrl();
    resolvedUrl.setStartUrl(url);

    String cached = urlCache.get(url);
    if (cached != null) {
      cacheHits.incrementAndGet();
      resolvedUrl.setNextResolve(cached);
      resolvedUrl.setEndState(EndState.CACHED);
      return resolvedUrl;
    }

    String currentUrl = url;
    long backoffMs = 0L;
    String next = null;
    for (int i = 0; i < maxRedirects; i++) {
      try {
        next = resolveOnce(currentUrl);

        // If there was a 4xx or a 5xx, we'll get a null back, so we pretend like we never advanced
        // to allow for a retry within the redirect limit.
        // TODO(John Sirois): we really need access to the return code here to do the right thing; ie:
        // retry for internal server errors but probably not for unauthorized
        if (next == null) {
          if (i < maxRedirects - 1) { // don't wait if we're about to exit the loop
            backoffMs = backoffStrategy.calculateBackoffMs(backoffMs);
            try {
              clock.waitFor(backoffMs);
            } catch (InterruptedException e) {
              Thread.currentThread().interrupt();
              throw new RuntimeException(
                  "Interrupted waiting to retry a failed resolution for: " + currentUrl, e);
            }
          }
          continue;
        }

        backoffMs = 0L;
        if (next.equals(currentUrl)) {
          // We've reached the end of the redirect chain.
          resolvedUrl.setEndState(EndState.REACHED_LANDING);
          // Remember the landing URL for the start URL and for every hop in between.
          urlCache.put(url, currentUrl);
          for (String intermediateUrl : resolvedUrl.getIntermediateUrls()) {
            urlCache.put(intermediateUrl, currentUrl);
          }
          return resolvedUrl;
        } else if (!url.equals(next)) {
          resolvedUrl.setNextResolve(next);
        }
        currentUrl = next;
      } catch (IOException e) {
        LOG.log(Level.INFO, "Failed to resolve url: " + url, e);
        resolvedUrl.setEndState(EndState.ERROR);
        return resolvedUrl;
      }
    }

    // Out of attempts: an error if the last hop failed or we never moved off the start URL,
    // otherwise the redirect limit was hit mid-chain.
    resolvedUrl.setEndState(next == null || url.equals(currentUrl) ? EndState.ERROR
        : EndState.REDIRECT_LIMIT);
    return resolvedUrl;
  }

  /**
   * Resolves a url, following at most one redirect.  Thread-safe.
   *
   * @param url The URL to resolve.
   * @return The result of following the URL through at most one redirect or null if the url could
   *     not be followed
   * @throws IOException If an error occurs while resolving the URL.
   */
  private String resolveOnce(String url) throws IOException {
    requestCount.incrementAndGet();

    String resolvedUrl = urlCache.get(url);
    if (resolvedUrl != null) {
      cacheHits.incrementAndGet();
      return resolvedUrl;
    }

    try {
      long startTimeMs = System.currentTimeMillis();
      resolvedUrl = resolver.apply(url);
      if (resolvedUrl == null) {
        return null;
      }

      urlCache.put(url, resolvedUrl);

      synchronized (urlResolutionTimesMs) {
        urlResolutionTimesMs.addValue(System.currentTimeMillis() - startTimeMs);
      }
      return resolvedUrl;
    } catch (IOException e) {
      failureCount.incrementAndGet();
      throw e;
    }
  }

  @Override
  public String toString() {
    return String.format("Cache: %s\nFailed requests: %d,\nResolution Times: %s",
        urlCache, failureCount.get(),
        urlResolutionTimesMs.toString());
  }

  /**
   * Class to wrap the result of a URL resolution.
   */
  public static class ResolvedUrl {
    /** How a resolution finished. */
    public enum EndState {
      REACHED_LANDING,
      ERROR,
      CACHED,
      REDIRECT_LIMIT
    }

    private String startUrl;
    // URLs reached after the start URL, in the order they were encountered.
    private final List<String> resolveChain;
    private EndState endState;

    public ResolvedUrl() {
      resolveChain = Lists.newArrayList();
    }

    @VisibleForTesting
    public ResolvedUrl(EndState endState, String startUrl, String... resolveChain) {
      this.endState = endState;
      this.startUrl = startUrl;
      this.resolveChain = Lists.newArrayList(resolveChain);
    }

    public String getStartUrl() {
      return startUrl;
    }

    void setStartUrl(String startUrl) {
      this.startUrl = startUrl;
    }

    /**
     * Returns the last URL resolved following a redirect chain, or null if the startUrl is a
     * landing URL.
     */
    public String getEndUrl() {
      return resolveChain.isEmpty() ? null : Iterables.getLast(resolveChain);
    }

    void setNextResolve(String endUrl) {
      this.resolveChain.add(endUrl);
    }

    /**
     * Returns any intermediate URLs encountered on the resolution chain.  If the startUrl redirects
     * directly to the endUrl or they are the same the intermediate URLs will be empty.
     */
    public Iterable<String> getIntermediateUrls() {
      return resolveChain.size() <= 1 ? ImmutableList.<String>of()
          : resolveChain.subList(0, resolveChain.size() - 1);
    }

    public EndState getEndState() {
      return endState;
    }

    void setEndState(EndState endState) {
      this.endState = endState;
    }

    @Override
    public String toString() {
      return String.format("%s -> %s [%s, %d redirects]",
          startUrl, Joiner.on(" -> ").join(resolveChain), endState, resolveChain.size());
    }
  }

  /**
   * Interface to use for notifying the caller of resolved URLs.
   */
  public interface ResolvedUrlHandler {
    /**
     * Signals that a URL has been resolved to its target.  The implementation of this method must
     * be thread safe.
     *
     * @param future The future that has finished resolving a URL.
     */
    public void resolved(Future<ResolvedUrl> future);
  }
}

// http://git-wip-us.apache.org/repos/asf/aurora/blob/86a547b9/commons/src/main/java/com/twitter/common/net/UrlResolverUtil.java
// ----------------------------------------------------------------------
// diff --git a/commons/src/main/java/com/twitter/common/net/UrlResolverUtil.java b/commons/src/main/java/com/twitter/common/net/UrlResolverUtil.java
// new file mode 100644
// index 0000000..cc63969
// --- /dev/null
// +++ b/commons/src/main/java/com/twitter/common/net/UrlResolverUtil.java
// @@ -0,0 +1,151 @@
// =================================================================================================
// Copyright 2011 Twitter, Inc.
// -------------------------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this work except in compliance with the License.
// You may obtain a copy of the License in the LICENSE file, or at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =================================================================================================

package com.twitter.common.net;

import com.google.common.base.Function;
import com.google.common.base.Functions;
import com.google.common.base.Preconditions;
import com.twitter.common.base.MorePreconditions;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.Proxy;
import java.net.Proxy.Type;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Map;
import java.util.logging.Logger;
import javax.annotation.Nullable;

/**
 * A utility that can resolve HTTP urls.
 *
 * @author John Sirois
 */
class UrlResolverUtil {

  private static final Logger LOG = Logger.getLogger(UrlResolverUtil.class.getName());

  // Default user-agent string to use for HTTP requests.
  private static final String DEFAULT_USER_AGENT = "Lynxy/6.6.6dev.8 libwww-FM/3.14159FM";

  private static final String HTTP_PREFIX = "http://";
  private static final String HTTPS_PREFIX = "https://";

  // Validates the host -> user-agent map before it is wrapped in a lookup function.
  private static Map<String, String> checkNotBlank(Map<String, String> hostToUserAgent) {
    Preconditions.checkNotNull(hostToUserAgent);
    MorePreconditions.checkNotBlank(hostToUserAgent.entrySet());
    return hostToUserAgent;
  }

  private final Function<? super URL, String> urlToUserAgent;

  /**
   * Creates a resolver that picks the user-agent by the host of the URL being requested, using
   * a default user-agent for hosts that are not in the map.
   *
   * @param hostToUserAgent Mapping from host name to the user-agent string to send to it.
   */
  UrlResolverUtil(Map<String, String> hostToUserAgent) {
    this(Functions.compose(Functions.forMap(checkNotBlank(hostToUserAgent), DEFAULT_USER_AGENT),
        new Function<URL, String>() {
          @Override public String apply(URL url) {
            return url.getHost();
          }
        }));
  }

  /**
   * Creates a resolver that asks {@code urlToUserAgent} for the user-agent of every request.
   *
   * @param urlToUserAgent Maps the URL being requested to a user-agent string; must not be null.
   */
  UrlResolverUtil(Function<? super URL, String> urlToUserAgent) {
    this.urlToUserAgent = Preconditions.checkNotNull(urlToUserAgent);
  }

  /**
   * Returns the URL that {@code url} lands on, which will be the result of a 3xx redirect,
   * or {@code url} if the url does not redirect using an HTTP 3xx response code.  If there is a
   * non-2xx or 3xx HTTP response code null is returned.
   *
   * <p>The request is always made over plain http: a leading {@code https://} is rewritten to
   * {@code http://}, and {@code http://} is prepended when the url starts with neither. The
   * returned value for a 2xx response is the url after that rewriting.
   *
   * @param url The URL to follow; must not be null.
   * @param proxyConfig The proxy to send the request through (with its credentials), or null to
   *     connect without an explicitly specified proxy.
   * @return The redirected URL, or {@code url} if {@code url} returns a 2XX response, otherwise
   *         null
   * @throws java.io.IOException If an error occurs while trying to follow the url.
   */
  String getEffectiveUrl(String url, @Nullable ProxyConfig proxyConfig) throws IOException {
    Preconditions.checkNotNull(url);
    // Don't follow https. Only the leading scheme is rewritten; an "https://" that appears later
    // in the url (e.g. inside a query parameter) must be left alone.
    if (url.startsWith(HTTPS_PREFIX)) {
      url = HTTP_PREFIX + url.substring(HTTPS_PREFIX.length());
    } else if (!url.startsWith(HTTP_PREFIX)) {
      url = HTTP_PREFIX + url;
    }

    URL urlObj = new URL(url);

    HttpURLConnection con;
    if (proxyConfig != null) {
      Proxy proxy = new Proxy(Type.HTTP, proxyConfig.getProxyAddress());
      con = (HttpURLConnection) urlObj.openConnection(proxy);
      ProxyAuthorizer.adapt(proxyConfig).authorize(con);
    } else {
      con = (HttpURLConnection) urlObj.openConnection();
    }
    try {

      // TODO(John Sirois): several commonly tweeted hosts 406 or 400 on HEADs and only work with GETs
      // fix the call chain to be able to specify retry-with-GET
      con.setRequestMethod("HEAD");

      con.setUseCaches(true);
      con.setConnectTimeout(5000);
      con.setReadTimeout(5000);
      con.setInstanceFollowRedirects(false);

      // I hate to have to do this, but some URL shorteners don't respond otherwise.
      con.setRequestProperty("User-Agent", urlToUserAgent.apply(urlObj));
      try {
        con.connect();
      } catch (StringIndexOutOfBoundsException e) {
        LOG.info("Got StringIndexOutOfBoundsException when fetching headers for " + url);
        return null;
      }

      int responseCode = con.getResponseCode();
      switch (responseCode / 100) {
        case 2:
          return url;
        case 3:
          String location = con.getHeaderField("Location");
          if (location == null) {
            if (responseCode != 304 /* not modified */) {
              LOG.info(
                  String.format("[%d] Location header was null for URL: %s", responseCode, url));
            }
            return url;
          }

          // HTTP 1.1 spec says this should be an absolute URI, but i see lots of instances where it
          // is relative, so we need to check.
          try {
            String domain = UrlHelper.getDomainChecked(location);
            if (domain == null || domain.isEmpty()) {
              // This is a relative URI. Rebuild it against the host of the request, keeping an
              // explicit port so redirects on non-default ports stay on that port.
              String hostAndPort = urlObj.getPort() == -1
                  ? urlObj.getHost()
                  : urlObj.getHost() + ":" + urlObj.getPort();
              location = HTTP_PREFIX + hostAndPort + location;
            }
          } catch (URISyntaxException e) {
            // Best effort: the unparseable location is still returned to the caller as-is.
            LOG.info("location contained an invalid URI: " + location);
          }

          return location;
        default:
          LOG.info("Failed to resolve url: " + url + " with: "
                   + responseCode + " -> " + con.getResponseMessage());
          return null;
      }
    } finally {
      con.disconnect();
    }
  }
}
