#!/usr/bin/python # -*- coding: utf-8 -*- """Command-line Twitter screen-scraping client.
This is approximately the simplest possible Twitter client. It depends on the Python standard library plus lxml, PyQuery, and JSON. A couple of problems remaining: - Because it doesn't store the cookie, it goes through the login process every time, which takes about 30 seconds. - Consequently, if you run this often, you may end up having to visit the Twitter web site manually, log out, and log back in to answer a CAPTCHA. This version currently just uses the nicely-structured HTML you get on the Twitter home page when you're logged in. Twitter's client hits URLs like <https://twitter.com/i/timeline?composed_count=0&include_available_features=1&include_entities=1&include_new_items_bar=true&interval=30000&latent_count=0&since_id=298424125649719297> using cookies for authentication and identification and gets JSON containing an `items_html` key, which is HTML with the same structure. The `since_id` is optional, and you can also probably use `max_id` to look into the past. From that, we can do for item in reversed(['<%s> %s' % (PyQuery(t)('.username b').text(), PyQuery(t)('.js-tweet-text').text()) for t in PyQuery(lxml.html.document_fromstring(json.load(open('/home/default/tmp/twitter.json'))['items_html']))('.tweet')]): print item Except that the PyQuery `.text()` method is broken and inserts unwanted spaces. If you forget to implement cookies, or if the server decides you might be a robot, you get a message like: > Hmm, hubo un problema con el servidor. ¿Volvemos a intentarlo? Or sometimes: > Hmm, there was a problem reaching the server. Try again? Friends at Twitter: Please don't break this program on purpose. Users have the right to run whatever software they want on their own computers, including to access your service. (I have no objection to Captchas.) """ import re import sys import urllib import urllib2 import lxml.html import json from pyquery import PyQuery def q(bytes): return PyQuery(lxml.html.document_fromstring(bytes)) usage = """Usage: %s filename filename should contain your username and password, one per line. """ try: username, password = [line.strip() for line in open(sys.argv[1]).readlines()] except: sys.stderr.write(usage % sys.argv[0]) sys.exit(1) opener = urllib2.build_opener(urllib2.HTTPCookieProcessor) hellodoc = opener.open('https://twitter.com/').read() authtok = q(hellodoc)('input[name=authenticity_token]').val() print "-:- got authtok", authtok assert authtok, hellodoc print "-:- logging in with username %r" % username login = opener.open(urllib2.Request('https://twitter.com/sessions', urllib.urlencode({ 'session[username_or_email]': username, 'session[password]': password, 'authenticity_token': authtok, }))) page = login.read() tweets = q(page)('.tweet') if not len(tweets): print login.info() print page print q(page)('.modal-body .embed-overlay-content').text() print "something went wrong, see above, maybe try logging out of and into the site with a browser" def get_text(element_list): """Slightly fixed version of the reader case of pyquery.PyQuery.text. The original code fucks up your whitespace. This code leaves it as is. """ text = [] def add_text(elem, no_tail=False): if elem.text: text.append(elem.text) for child in elem.getchildren(): add_text(child) if not no_tail and elem.tail: text.append(elem.tail) for elem in element_list: add_text(elem, no_tail=True) return ''.join(text) def htmltext(element_list): "Normalizes whitespace to a single space (as is normal in HTML)." return re.subn(r'\s+', ' ', get_text(element_list))[0] for tweet in reversed(tweets): line = ' <@%s> %s' % (htmltext(PyQuery(tweet)('.username b')), htmltext(PyQuery(tweet)('.js-tweet-text'))) # Not sure how the encoding gets horked, but it does, and here's # how to unhork it: print line.encode('latin-1').decode('utf-8', 'replace') -- To unsubscribe: http://lists.canonical.org/mailman/listinfo/kragen-hacks