Revision: 338
Author: bslatkin
Date: Sun Feb 28 23:04:32 2010
Log: hub: use whole domain in dos regexes instead of suffix
http://code.google.com/p/pubsubhubbub/source/detail?r=338

Modified:
 /trunk/hub/dos.py
 /trunk/hub/dos_test.py

=======================================
--- /trunk/hub/dos.py   Fri Feb 26 13:52:21 2010
+++ /trunk/hub/dos.py   Sun Feb 28 23:04:32 2010
@@ -187,32 +187,19 @@
 URL_DOMAIN_RE = re.compile(
     r'https?://(?:'
     r'([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)|'  # IP address
-    r'(?:([a-zA-Z0-9-]+\.)*([a-zA-Z0-9-]+\.[a-zA-Z0-9-]+))|'  # Domain
+    r'((?:[a-zA-Z0-9-]+\.)+[a-zA-Z0-9-]+)|'  # Domain
     r'([^/]+)'  # Anyting else
     r')(?:/.*)?')  # The rest of the URL

-# Domains where the full domain should be used for any rate limiting or
-# statistics instead of just the suffix due to different developers being
-# present on different URLs.
-DOMAIN_EXCEPTIONS = frozenset([
-  'amazonaws.com',
-  'appspot.com',
-  'heroku.com',
-])
-

 def get_url_domain(url):
   """Returns the domain for a URL or 'bad_url if it's not a valid URL."""
   match = URL_DOMAIN_RE.match(url)
   if match:
-    groups = list(match.groups())
-    if groups[1] and groups[2] in DOMAIN_EXCEPTIONS:
-      groups[2] = groups[1] + groups[2]
-    groups[1] = None
-    groups = filter(bool, groups)
+    groups = filter(bool, match.groups())
   else:
-    groups = []
-  return (groups + ['bad_url'])[0]
+    groups = tuple()
+  return (groups + ('bad_url',))[0]

################################################################################

=======================================
--- /trunk/hub/dos_test.py      Fri Feb 26 13:52:21 2010
+++ /trunk/hub/dos_test.py      Sun Feb 28 23:04:32 2010
@@ -484,15 +484,15 @@
         dos.get_url_domain('http://example.com/foo/bar?meep=stuff#asdf'))
     # One subdomain
     self.assertEquals(
-        'example.com',
+        'www.example.com',
         dos.get_url_domain('http://www.example.com/foo/bar?meep=stuff#asdf'))
     # Many subdomains
     self.assertEquals(
-        'example.com',
+        '1.2.3.many.sub.example.com',
         dos.get_url_domain('http://1.2.3.many.sub.example.com/'))
     # Domain with no trailing path
     self.assertEquals(
-        'example.com',
+        'www.example.com',
         dos.get_url_domain('http://www.example.com'))

   def testDomainExceptions(self):
@@ -633,10 +633,10 @@
     self.domainB = 'example.com'
     self.domainC = 'other.com'
     self.domainD = 'meep.com'
-    self.url1 = 'http://other.mydomain.com/stuff/meep'
-    self.url2 = 'http://foo.example.com/some-path?a=b'
-    self.url3 = 'http://bar.example.com'
-    self.url4 = 'http://www.other.com/relative'
+    self.url1 = 'http://mydomain.com/stuff/meep'
+    self.url2 = 'http://example.com/some-path?a=b'
+    self.url3 = 'http://example.com'
+    self.url4 = 'http://other.com/relative'
     self.url5 = 'http://meep.com/another-one'
     self.all_urls = [self.url1, self.url2, self.url3, self.url4, self.url5]

@@ -1346,10 +1346,10 @@
     self.domain1 = 'mydomain.com'
     self.domain2 = 'example.com'
     self.domain3 = 'other.com'
-    self.url1 = 'http://other.mydomain.com/stuff/meep'
-    self.url2 = 'http://foo.example.com/some-path?a=b'
-    self.url3 = 'http://bar.example.com'
-    self.url4 = 'http://www.other.com/relative'
+    self.url1 = 'http://mydomain.com/stuff/meep'
+    self.url2 = 'http://example.com/some-path?a=b'
+    self.url3 = 'http://example.com'
+    self.url4 = 'http://other.com/relative'
     self.scorer = dos.UrlScorer(
         period=60,
         min_requests=1,

Reply via email to