Revision: 338
Author: bslatkin
Date: Sun Feb 28 23:04:32 2010
Log: hub: use whole domain in dos regexes instead of suffix
http://code.google.com/p/pubsubhubbub/source/detail?r=338
Modified:
/trunk/hub/dos.py
/trunk/hub/dos_test.py
=======================================
--- /trunk/hub/dos.py Fri Feb 26 13:52:21 2010
+++ /trunk/hub/dos.py Sun Feb 28 23:04:32 2010
@@ -187,32 +187,19 @@
URL_DOMAIN_RE = re.compile(
r'https?://(?:'
r'([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)|' # IP address
- r'(?:([a-zA-Z0-9-]+\.)*([a-zA-Z0-9-]+\.[a-zA-Z0-9-]+))|' # Domain
+ r'((?:[a-zA-Z0-9-]+\.)+[a-zA-Z0-9-]+)|' # Domain
r'([^/]+)' # Anything else
r')(?:/.*)?') # The rest of the URL
-# Domains where the full domain should be used for any rate limiting or
-# statistics instead of just the suffix due to different developers being
-# present on different URLs.
-DOMAIN_EXCEPTIONS = frozenset([
- 'amazonaws.com',
- 'appspot.com',
- 'heroku.com',
-])
-
def get_url_domain(url):
"""Returns the domain for a URL or 'bad_url' if it's not a valid URL."""
match = URL_DOMAIN_RE.match(url)
if match:
- groups = list(match.groups())
- if groups[1] and groups[2] in DOMAIN_EXCEPTIONS:
- groups[2] = groups[1] + groups[2]
- groups[1] = None
- groups = filter(bool, groups)
+ groups = filter(bool, match.groups())
else:
- groups = []
- return (groups + ['bad_url'])[0]
+ groups = tuple()
+ return (groups + ('bad_url',))[0]
################################################################################
=======================================
--- /trunk/hub/dos_test.py Fri Feb 26 13:52:21 2010
+++ /trunk/hub/dos_test.py Sun Feb 28 23:04:32 2010
@@ -484,15 +484,15 @@
dos.get_url_domain('http://example.com/foo/bar?meep=stuff#asdf'))
# One subdomain
self.assertEquals(
- 'example.com',
+ 'www.example.com',
dos.get_url_domain('http://www.example.com/foo/bar?meep=stuff#asdf'))
# Many subdomains
self.assertEquals(
- 'example.com',
+ '1.2.3.many.sub.example.com',
dos.get_url_domain('http://1.2.3.many.sub.example.com/'))
# Domain with no trailing path
self.assertEquals(
- 'example.com',
+ 'www.example.com',
dos.get_url_domain('http://www.example.com'))
def testDomainExceptions(self):
@@ -633,10 +633,10 @@
self.domainB = 'example.com'
self.domainC = 'other.com'
self.domainD = 'meep.com'
- self.url1 = 'http://other.mydomain.com/stuff/meep'
- self.url2 = 'http://foo.example.com/some-path?a=b'
- self.url3 = 'http://bar.example.com'
- self.url4 = 'http://www.other.com/relative'
+ self.url1 = 'http://mydomain.com/stuff/meep'
+ self.url2 = 'http://example.com/some-path?a=b'
+ self.url3 = 'http://example.com'
+ self.url4 = 'http://other.com/relative'
self.url5 = 'http://meep.com/another-one'
self.all_urls = [self.url1, self.url2, self.url3, self.url4, self.url5]
@@ -1346,10 +1346,10 @@
self.domain1 = 'mydomain.com'
self.domain2 = 'example.com'
self.domain3 = 'other.com'
- self.url1 = 'http://other.mydomain.com/stuff/meep'
- self.url2 = 'http://foo.example.com/some-path?a=b'
- self.url3 = 'http://bar.example.com'
- self.url4 = 'http://www.other.com/relative'
+ self.url1 = 'http://mydomain.com/stuff/meep'
+ self.url2 = 'http://example.com/some-path?a=b'
+ self.url3 = 'http://example.com'
+ self.url4 = 'http://other.com/relative'
self.scorer = dos.UrlScorer(
period=60,
min_requests=1,