Revision: 322
Author: bslatkin
Date: Wed Feb 3 09:48:25 2010
Log: remove auto-discovery code from hub
http://code.google.com/p/pubsubhubbub/source/detail?r=322
Modified:
/trunk/hub/main.py
/trunk/hub/main_test.py
=======================================
--- /trunk/hub/main.py Wed Feb 3 09:37:28 2010
+++ /trunk/hub/main.py Wed Feb 3 09:48:25 2010
@@ -224,28 +224,6 @@
MAPPINGS_QUEUE = 'mappings'
-WEBLOGS_XMLRPC_ERROR = """<?xml version="1.0"?>
-<methodResponse><params><param><value><struct>
-<member><name>flerror</name><value><boolean>1</boolean></value></member>
-<member><name>message</name><value>%s</value></member>
-<member><name>legal</name><value>
-You agree that use of this ping service is governed
-by the Terms of Use found at pubsubhubbub.appspot.com.
-</value></member>
-</struct></value></param></params></methodResponse>
-"""
-
-WEBLOGS_XMLRPC_SUCCESS = """<?xml version="1.0"?>
-<methodResponse><params><param><value><struct>
-<member><name>flerror</name><value><boolean>0</boolean></value></member>
-<member><name>message</name><value>Thanks for the ping.</value></member>
-<member><name>legal</name><value>
-You agree that use of this ping service is governed
-by the Terms of Use found at pubsubhubbub.appspot.com.
-</value></member>
-</struct></value></param></params></methodResponse>
-"""
-
################################################################################
# Helper functions
@@ -361,109 +339,6 @@
"""Returns a string containing a random challenge token."""
return ''.join(random.choice(_VALID_CHARS) for i in xrange(128))
-
-class HtmlDiscoveryParser(sgmllib.SGMLParser):
- """HTML parser that auto-discovers feed URLs.
-
- Based off of Mark Pilgrim's auto-discovery script from:
- http://diveintomark.org/archives/2002/05/31/rss_autodiscovery_in_python
-
- Thus, this class is roughly Copyright 2002, Mark Pilgrim and is
- under the Python license:
- http://www.python.org/psf/license/
-
- Feed URLs will be placed in the 'feed_urls' attribute's list.
- """
-
- def reset(self):
- sgmllib.SGMLParser.reset(self)
- self.feed_urls = []
-
- def end_head(self, attrs):
- self.setnomoretags()
-
- def start_body(self, attrs):
- self.setnomoretags()
-
- def do_link(self, attrs):
- attr_dict = dict(attrs)
- if attr_dict.get('rel').lower() != 'alternate':
- return
- type = attr_dict.get('type')
- if type not in ('application/atom+xml', 'application/rss+xml'):
- return
- href = attr_dict.get('href')
- # This URL may be bad, but it will be validated later.
- self.feed_urls.append(href)
-
-
-class AutoDiscoveryError(Exception):
- """Raised when auto-discovery fails for whatever reason.
-
- The exception detail should be set to a descriptive string that could
- be presented to the requestor on the other side.
- """
-
-
-def auto_discover_urls(blog_url):
- """Auto-discovers the feed links for a URL.
-
- Caches the discovered URLs in memcache.
-
- Args:
- blog_url: The feed to do auto-discovery on. May be a feed URL itself, in
- which case this URL will be returned.
-
- Returns:
- A list of feed URLs. May be multiple in cases where multiple formats or
- variants of a feed are auto-discovered.
-
- Raises:
- AutoDiscoveryError if auto-discovery fails for any reason.
- """
- key = 'auto_discover:' + blog_url
- mapping = memcache.get(key)
- if mapping:
- feed_urls = mapping.split('\n')
- logging.debug('Cache hit for auto-discovery of blog_url=%s: %s',
- blog_url, feed_urls)
- return feed_urls
-
- try:
- result = urlfetch.fetch(blog_url)
- except (apiproxy_errors.Error, urlfetch.Error), e:
- logging.exception('Error fetching for discovery blog URL=%s', blog_url)
- raise AutoDiscoveryError('Error fetching content for auto-discovery')
-
- if result.status_code != 200:
- logging.error('Discovery status_code=%s for blog URL=%s',
- result.status_code, blog_url)
- raise AutoDiscoveryError('Auto-discovery fetch received status code %s' %
- result.status_code)
-
- content_type = result.headers.get('content-type', '')
- if 'xml' in content_type:
- # The supplied URL is actually XML, which means it *should* be a feed.
- feed_urls = [blog_url]
- elif 'html' in content_type:
- parser = HtmlDiscoveryParser()
- try:
- parser.feed(result.content)
- except sgmllib.SGMLParseError:
- logging.exception('Parsing HTML for auto-discovery '
- 'failed for blog URL=%s', blog_url)
- # Cache the error to prevent further, crappy load.
- memcache.add(key, '')
- raise AutoDiscoveryError('Could not parse HTML for auto-discovery')
- else:
- feed_urls = parser.feed_urls
- else:
- raise AutoDiscoveryError(
- 'Blog URL has bad content-type for auto-discovery: %s' % content_type)
-
- memcache.add(key, '\n'.join(feed_urls))
- return feed_urls
-
################################################################################
# Models
@@ -1972,80 +1847,6 @@
if error:
self.response.out.write(error)
-
-class WeblogsPingHandler(PublishHandlerBase):
- """Handles weblogs.com-style pings."""
-
- # To protect gainst auto-discovery DoS attacks.
- @dos.limit(header=None, param='url', count=5, period=1)
- # To limit a single pinging host.
- @dos.limit(count=20, period=1)
- def get(self):
- """Handles REST pings."""
- self.response.headers['Content-Type'] = 'text/plain'
- name = self.request.get('name')
- url = self.request.get('url')
-
- if not name:
- self.response.out.write('Missing Weblogs.com REST parameter "name"')
- self.response.set_status(400)
- return
- if not url:
- self.response.out.write('Missing Weblogs.com REST parameter "url"')
- self.response.set_status(400)
- return
-
- logging.debug('Weblogs.com REST ping for %s', url)
- if not is_valid_url(url):
- self.response.set_status(400)
- self.response.out.write('url invalid: %s' % url)
- return
-
- found_urls = auto_discover_urls(url)
- error = self.receive_publish(found_urls, 200, 'url')
- if error:
- self.response.out.write(error)
-
- # To limit a single pinging host.
- @dos.limit(count=20, period=1)
- def post(self):
- """Handles XML-RPC pings."""
- try:
- params, method = xmlrpclib.loads(self.request.body)
- except:
- logging.debug('Invalid XML-RPC with body:\n%s', self.request.body)
- self.response.headers['Content-Type'] = 'text/plain'
- self.response.out.write('Content body not valid XML-RPC')
- self.response.set_status(400)
- return
-
- error = ''
- if method != 'weblogUpdates.ping':
- error = 'Invalid XML-RPC method: %s' % method
- elif len(params) < 2:
- error = 'Invalid number of XML-RPC params: %d' % len(params)
- elif len(params) >= 4 and not is_valid_url(params[3]):
- error = 'Invalid feed URL in extended XML-RPC ping: %s' % params[3]
- elif not is_valid_url(params[1]):
- error = 'Invalid blog URL in XML-RPC ping: %s' % params[1]
- else:
- blog_name, blog_url, unused, feed_url, unused = \
- (params + ['', '', ''])[:5]
- if feed_url:
- logging.debug('Weblogs.com extended XML-RPC ping for %s', feed_url)
- found_urls = [feed_url]
- else:
- logging.debug('Weblogs.com XML-RPC ping for %s', blog_url)
- # TODO(bslatkin): figure out how to rate-limit this
- found_urls = auto_discover_urls(url)
- error = self.receive_publish(found_urls, 200, 'unused')
-
- self.response.headers['Content-Type'] = 'text/xml'
- if error:
- self.response.out.write(WEBLOGS_XMLRPC_ERROR % error)
- else:
- self.response.out.write(WEBLOGS_XMLRPC_SUCCESS)
-
################################################################################
# Pulling
=======================================
--- /trunk/hub/main_test.py Wed Feb 3 09:37:28 2010
+++ /trunk/hub/main_test.py Wed Feb 3 09:48:25 2010
@@ -109,91 +109,6 @@
u'/07256788297315478906/label/\u30d6\u30ed\u30b0\u8846')
self.assertEquals(good_iri, main.normalize_iri(iri))
-
-class AutoDiscoverUrlsTest(unittest.TestCase):
- """Tests for the auto_discover_urls function."""
-
- def setUp(self):
- """Sets up the test harness."""
- testutil.setup_for_testing()
- self.url = 'http://example.com/'
-
- def tearDown(self):
- """Tears down the test harness."""
- urlfetch_test_stub.instance.verify_and_reset()
-
- def testHtmlDiscovery(self):
- """Tests HTML discovery with multiple feed links."""
- urlfetch_test_stub.instance.expect(
- 'GET', self.url, 200, """
-<html><head>
-<link rel="alternate" type="application/atom+xml"
-href="http://example.com/feed/1">
-<link rel="alternate" type="application/atom+xml"
-href="http://example.com/feed/2"/>
-<link rel="alternate" type="application/rss+xml"
-href="http://example.com/feed/3">
-<link rel="alternate" type="application/rss+xml"
-href="http://example.com/feed/4"/>
-</head>
-<body>
-meep
-</body>
-</html>
-""", response_headers={'content-type': 'text/html'})
- self.assertEquals(
- ['http://example.com/feed/1', 'http://example.com/feed/2',
- 'http://example.com/feed/3', 'http://example.com/feed/4'],
- main.auto_discover_urls(self.url))
-
- def testCacheHit(self):
- """Tests when the result is already in memcache."""
- memcache.set('auto_discover:' + self.url,
- 'http://example.com/feed/1\nhttp://example.com/feed/2\n'
- 'http://example.com/feed/3\nhttp://example.com/feed/4')
- self.assertEquals(
- ['http://example.com/feed/1', 'http://example.com/feed/2',
- 'http://example.com/feed/3', 'http://example.com/feed/4'],
- main.auto_discover_urls(self.url))
-
- def testFetchError(self):
- """Tests when an exception is hit while fetching the blog URL."""
- urlfetch_test_stub.instance.expect(
- 'GET', self.url, 200, "", urlfetch_error=True)
- self.assertRaises(
- main.AutoDiscoveryError, main.auto_discover_urls, self.url)
-
- def testBadFetchResponseCode(self):
- """Tests when the fetch response code is not 200 OK."""
- urlfetch_test_stub.instance.expect(
- 'GET', self.url, 404, "")
- self.assertRaises(
- main.AutoDiscoveryError, main.auto_discover_urls, self.url)
-
- def testBlogUrlIsFeed(self):
- """Tests when the blog URL supplied is actually a feed."""
- urlfetch_test_stub.instance.expect(
- 'GET', self.url, 200, "unused",
- response_headers={'content-type': 'text/xml'})
- self.assertEquals([self.url], main.auto_discover_urls(self.url))
-
- def testBadContentType(self):
- """Tests when the fetched blog URL is of a bad content-type."""
- urlfetch_test_stub.instance.expect(
- 'GET', self.url, 200, "unused",
- response_headers={'content-type': 'text/plain'})
- self.assertRaises(
- main.AutoDiscoveryError, main.auto_discover_urls, self.url)
-
- def testHtmlParseError(self):
- """Tests when the HTML won't parse correctly."""
- urlfetch_test_stub.instance.expect(
- 'GET', self.url, 200, "<! -- foo -- >",
- response_headers={'content-type': 'text/html'})
- self.assertRaises(
- main.AutoDiscoveryError, main.auto_discover_urls, self.url)
- self.assertEquals('', memcache.get('auto_discover:' + self.url))
-
################################################################################
class TestWorkQueueHandler(webapp.RequestHandler):