Revision: 322
Author: bslatkin
Date: Wed Feb  3 09:48:25 2010
Log: remove auto-discovery code from hub
http://code.google.com/p/pubsubhubbub/source/detail?r=322

Modified:
 /trunk/hub/main.py
 /trunk/hub/main_test.py

=======================================
--- /trunk/hub/main.py  Wed Feb  3 09:37:28 2010
+++ /trunk/hub/main.py  Wed Feb  3 09:48:25 2010
@@ -224,28 +224,6 @@

 MAPPINGS_QUEUE = 'mappings'

-WEBLOGS_XMLRPC_ERROR = """<?xml version="1.0"?>
-<methodResponse><params><param><value><struct>
-<member><name>flerror</name><value><boolean>1</boolean></value></member>
-<member><name>message</name><value>%s</value></member>
-<member><name>legal</name><value>
-You agree that use of this ping service is governed
-by the Terms of Use found at pubsubhubbub.appspot.com.
-</value></member>
-</struct></value></param></params></methodResponse>
-"""
-
-WEBLOGS_XMLRPC_SUCCESS = """<?xml version="1.0"?>
-<methodResponse><params><param><value><struct>
-<member><name>flerror</name><value><boolean>0</boolean></value></member>
-<member><name>message</name><value>Thanks for the ping.</value></member>
-<member><name>legal</name><value>
-You agree that use of this ping service is governed
-by the Terms of Use found at pubsubhubbub.appspot.com.
-</value></member>
-</struct></value></param></params></methodResponse>
-"""
-
 ################################################################################
 # Helper functions

@@ -361,109 +339,6 @@
   """Returns a string containing a random challenge token."""
   return ''.join(random.choice(_VALID_CHARS) for i in xrange(128))

-
-class HtmlDiscoveryParser(sgmllib.SGMLParser):
-  """HTML parser that auto-discovers feed URLs.
-
-  Based off of Mark Pilgrim's auto-discovery script from:
-    http://diveintomark.org/archives/2002/05/31/rss_autodiscovery_in_python
-
-  Thus, this class is roughly Copyright 2002, Mark Pilgrim and is
-  under the Python license:
-    http://www.python.org/psf/license/
-
-  Feed URLs will be placed in the 'feed_urls' attribute's list.
-  """
-
-  def reset(self):
-    sgmllib.SGMLParser.reset(self)
-    self.feed_urls = []
-
-  def end_head(self, attrs):
-    self.setnomoretags()
-
-  def start_body(self, attrs):
-    self.setnomoretags()
-
-  def do_link(self, attrs):
-    attr_dict = dict(attrs)
-    if attr_dict.get('rel').lower() != 'alternate':
-      return
-    type = attr_dict.get('type')
-    if type not in ('application/atom+xml', 'application/rss+xml'):
-      return
-    href = attr_dict.get('href')
-    # This URL may be bad, but it will be validated later.
-    self.feed_urls.append(href)
-
-
-class AutoDiscoveryError(Exception):
-  """Raised when auto-discovery fails for whatever reason.
-
-  The exception detail should be set to a descriptive string that could
-  be presented to the requestor on the other side.
-  """
-
-
-def auto_discover_urls(blog_url):
-  """Auto-discovers the feed links for a URL.
-
-  Caches the discovered URLs in memcache.
-
-  Args:
-    blog_url: The feed to do auto-discovery on. May be a feed URL itself, in
-      which case this URL will be returned.
-
-  Returns:
-    A list of feed URLs. May be multiple in cases where multiple formats or
-    variants of a feed are auto-discovered.
-
-  Raises:
-    AutoDiscoveryError if auto-discovery fails for any reason.
-  """
-  key = 'auto_discover:' + blog_url
-  mapping = memcache.get(key)
-  if mapping:
-    feed_urls = mapping.split('\n')
-    logging.debug('Cache hit for auto-discovery of blog_url=%s: %s',
-                  blog_url, feed_urls)
-    return feed_urls
-
-  try:
-    result = urlfetch.fetch(blog_url)
-  except (apiproxy_errors.Error, urlfetch.Error), e:
-    logging.exception('Error fetching for discovery blog URL=%s', blog_url)
-    raise AutoDiscoveryError('Error fetching content for auto-discovery')
-
-  if result.status_code != 200:
-    logging.error('Discovery status_code=%s for blog URL=%s',
-                  result.status_code, blog_url)
-    raise AutoDiscoveryError('Auto-discovery fetch received status code %s' %
-                             result.status_code)
-
-  content_type = result.headers.get('content-type', '')
-  if 'xml' in content_type:
-    # The supplied URL is actually XML, which means it *should* be a feed.
-    feed_urls = [blog_url]
-  elif 'html' in content_type:
-    parser = HtmlDiscoveryParser()
-    try:
-      parser.feed(result.content)
-    except sgmllib.SGMLParseError:
-      logging.exception('Parsing HTML for auto-discovery '
-                        'failed for blog URL=%s', blog_url)
-      # Cache the error to prevent further, crappy load.
-      memcache.add(key, '')
-      raise AutoDiscoveryError('Could not parse HTML for auto-discovery')
-    else:
-      feed_urls = parser.feed_urls
-  else:
-    raise AutoDiscoveryError(
-        'Blog URL has bad content-type for auto-discovery: %s' % content_type)
-
-  memcache.add(key, '\n'.join(feed_urls))
-  return feed_urls
-
 ################################################################################
 # Models

@@ -1972,80 +1847,6 @@
     if error:
       self.response.out.write(error)

-
-class WeblogsPingHandler(PublishHandlerBase):
-  """Handles weblogs.com-style pings."""
-
-  # To protect gainst auto-discovery DoS attacks.
-  @dos.limit(header=None, param='url', count=5, period=1)
-  # To limit a single pinging host.
-  @dos.limit(count=20, period=1)
-  def get(self):
-    """Handles REST pings."""
-    self.response.headers['Content-Type'] = 'text/plain'
-    name = self.request.get('name')
-    url = self.request.get('url')
-
-    if not name:
-      self.response.out.write('Missing Weblogs.com REST parameter "name"')
-      self.response.set_status(400)
-      return
-    if not url:
-      self.response.out.write('Missing Weblogs.com REST parameter "url"')
-      self.response.set_status(400)
-      return
-
-    logging.debug('Weblogs.com REST ping for %s', url)
-    if not is_valid_url(url):
-      self.response.set_status(400)
-      self.response.out.write('url invalid: %s' % url)
-      return
-
-    found_urls = auto_discover_urls(url)
-    error = self.receive_publish(found_urls, 200, 'url')
-    if error:
-      self.response.out.write(error)
-
-  # To limit a single pinging host.
-  @dos.limit(count=20, period=1)
-  def post(self):
-    """Handles XML-RPC pings."""
-    try:
-      params, method = xmlrpclib.loads(self.request.body)
-    except:
-      logging.debug('Invalid XML-RPC with body:\n%s', self.request.body)
-      self.response.headers['Content-Type'] = 'text/plain'
-      self.response.out.write('Content body not valid XML-RPC')
-      self.response.set_status(400)
-      return
-
-    error = ''
-    if method != 'weblogUpdates.ping':
-      error = 'Invalid XML-RPC method: %s' % method
-    elif len(params) < 2:
-      error = 'Invalid number of XML-RPC params: %d' % len(params)
-    elif len(params) >= 4 and not is_valid_url(params[3]):
-      error = 'Invalid feed URL in extended XML-RPC ping: %s' % params[3]
-    elif not is_valid_url(params[1]):
-      error = 'Invalid blog URL in XML-RPC ping: %s' % params[1]
-    else:
-      blog_name, blog_url, unused, feed_url, unused = \
-          (params + ['', '', ''])[:5]
-      if feed_url:
-        logging.debug('Weblogs.com extended XML-RPC ping for %s', feed_url)
-        found_urls = [feed_url]
-      else:
-        logging.debug('Weblogs.com XML-RPC ping for %s', blog_url)
-        # TODO(bslatkin): figure out how to rate-limit this
-        found_urls = auto_discover_urls(url)
-      error = self.receive_publish(found_urls, 200, 'unused')
-
-    self.response.headers['Content-Type'] = 'text/xml'
-    if error:
-      self.response.out.write(WEBLOGS_XMLRPC_ERROR % error)
-    else:
-      self.response.out.write(WEBLOGS_XMLRPC_SUCCESS)
-
 ################################################################################
 # Pulling

=======================================
--- /trunk/hub/main_test.py     Wed Feb  3 09:37:28 2010
+++ /trunk/hub/main_test.py     Wed Feb  3 09:48:25 2010
@@ -109,91 +109,6 @@
            u'/07256788297315478906/label/\u30d6\u30ed\u30b0\u8846')
     self.assertEquals(good_iri, main.normalize_iri(iri))

-
-class AutoDiscoverUrlsTest(unittest.TestCase):
-  """Tests for the auto_discover_urls function."""
-
-  def setUp(self):
-    """Sets up the test harness."""
-    testutil.setup_for_testing()
-    self.url = 'http://example.com/'
-
-  def tearDown(self):
-    """Tears down the test harness."""
-    urlfetch_test_stub.instance.verify_and_reset()
-
-  def testHtmlDiscovery(self):
-    """Tests HTML discovery with multiple feed links."""
-    urlfetch_test_stub.instance.expect(
-      'GET', self.url, 200, """
-<html><head>
-<link rel="alternate" type="application/atom+xml"
-href="http://example.com/feed/1";>
-<link rel="alternate" type="application/atom+xml"
-href="http://example.com/feed/2"/>
-<link rel="alternate" type="application/rss+xml"
-href="http://example.com/feed/3";>
-<link rel="alternate" type="application/rss+xml"
-href="http://example.com/feed/4"/>
-</head>
-<body>
-meep
-</body>
-</html>
-""", response_headers={'content-type': 'text/html'})
-    self.assertEquals(
-        ['http://example.com/feed/1', 'http://example.com/feed/2',
-         'http://example.com/feed/3', 'http://example.com/feed/4'],
-        main.auto_discover_urls(self.url))
-
-  def testCacheHit(self):
-    """Tests when the result is already in memcache."""
-    memcache.set('auto_discover:' + self.url,
-        'http://example.com/feed/1\nhttp://example.com/feed/2\n'
-        'http://example.com/feed/3\nhttp://example.com/feed/4')
-    self.assertEquals(
-        ['http://example.com/feed/1', 'http://example.com/feed/2',
-         'http://example.com/feed/3', 'http://example.com/feed/4'],
-        main.auto_discover_urls(self.url))
-
-  def testFetchError(self):
-    """Tests when an exception is hit while fetching the blog URL."""
-    urlfetch_test_stub.instance.expect(
-      'GET', self.url, 200, "", urlfetch_error=True)
-    self.assertRaises(
-      main.AutoDiscoveryError, main.auto_discover_urls, self.url)
-
-  def testBadFetchResponseCode(self):
-    """Tests when the fetch response code is not 200 OK."""
-    urlfetch_test_stub.instance.expect(
-      'GET', self.url, 404, "")
-    self.assertRaises(
-      main.AutoDiscoveryError, main.auto_discover_urls, self.url)
-
-  def testBlogUrlIsFeed(self):
-    """Tests when the blog URL supplied is actually a feed."""
-    urlfetch_test_stub.instance.expect(
-      'GET', self.url, 200, "unused",
-      response_headers={'content-type': 'text/xml'})
-    self.assertEquals([self.url], main.auto_discover_urls(self.url))
-
-  def testBadContentType(self):
-    """Tests when the fetched blog URL is of a bad content-type."""
-    urlfetch_test_stub.instance.expect(
-      'GET', self.url, 200, "unused",
-      response_headers={'content-type': 'text/plain'})
-    self.assertRaises(
-      main.AutoDiscoveryError, main.auto_discover_urls, self.url)
-
-  def testHtmlParseError(self):
-    """Tests when the HTML won't parse correctly."""
-    urlfetch_test_stub.instance.expect(
-      'GET', self.url, 200, "<! --  foo -- >",
-      response_headers={'content-type': 'text/html'})
-    self.assertRaises(
-      main.AutoDiscoveryError, main.auto_discover_urls, self.url)
-    self.assertEquals('', memcache.get('auto_discover:' + self.url))
-
 ################################################################################

 class TestWorkQueueHandler(webapp.RequestHandler):

Reply via email to