This is one of the stages of the project described at http://lists.canonical.org/pipermail/kragen-tol/2005-October/000794.html.
This is currently running at http://considerate.murch-sitaker.org:8000/ and mostly seems to work OK technically --- the UI is a different problem altogether. New users universally enter the word they're looking for in the guidewords boxes. I haven't yet experimented with ways to solve this user interface problem, but it clearly needs solving. I'm also planning to use a Google-Maps-like interface to avoid having to download the entire 750K JPEG to see any part of the full-res page. I need to figure out how to put the gigabyte of page image data up on the web. #!/usr/bin/python import twisted.web.resource, twisted.internet.reactor, twisted.web.static, sys import os.path, nevow, twisted.web.error, urllib, random, string, pickle import urlparse T = nevow.tags # To do: # D put up on web # D make reduced images <1024 pixels high # [EMAIL PROTECTED]:/mnt/raid/kragen/media/oed-v$ time # for x in newenglishdict05murrmiss_jpg/*.jpg; do # convert -size 1024x1024 -resize 1024x1024 "$x" # thumbnails/"$(basename "$x")"; # echo $x done; # done # D make navigation among these reduced images possible by wrapping them in # pages # D remember the words at the top of each image # D support navigation by words # - make much smaller thumbnails somehow --- copying 100 thumbnails to # panacea only managed 40KB/s in 3:46, i.e. 226 seconds, or 2.26 # seconds each. Perhaps only the top 200 pixels or so, or 16KB each? # D keep a file that knows the words at the top of each image # - make a torrent for the images # - make a streamlined data entry UI for guidewords? 
# - remove "if it's in the book" when it's a headword
# - never redirect back to the same page


def flatten(stan):
    """Render a Nevow stan tree to a plain ``str`` of HTML."""
    return str(nevow.flat.flatten(stan))


def _normalize(word):
    """Return *word* lowercased with everything but ASCII letters removed.

    This is the sort key used for dictionary ordering, so that hyphens,
    spaces and capitalization do not affect where a word sorts.
    """
    return ''.join([letter for letter in word.lower()
                    if letter in string.ascii_letters])


def dict_precedes(a, b):
    """Return True if word *a* sorts strictly before word *b*.

    Comparison is done on the normalized (lowercase, letters-only) forms,
    so e.g. 'Hades' precedes 'Had-I-Wist'.
    """
    return _normalize(a) < _normalize(b)


class Book:
    """A scanned book: full-size page images, thumbnails and guide words.

    Pages are identified by their index into the sorted list of file names
    found in the thumbnails directory; the full-size image of a page is
    expected to have the same file name inside the images directory.

    Guide words (the first and last headword printed at the top of a page)
    are kept in memory in ``guidewords_list`` and appended to a pickle log
    so they survive restarts.
    """

    # Initial search bounds used by look_for(); the original author marked
    # them "XXX for OED vol V".  Presumably the first and last pages that
    # carry dictionary entries -- confirm before reusing for another book.
    SEARCH_LOWER_BOUND = 21
    SEARCH_UPPER_BOUND = 1298

    def __init__(self, images, thumbnails, guidewordlog):
        """Set up the book.

        images       -- path of the directory with the full-size images
        thumbnails   -- path of the directory with the reduced images
        guidewordlog -- open, seekable, read/write file object holding the
                        pickled (pageno, first, last) records
        """
        self.images = images
        self.thumbnails = thumbnails
        # os.listdir() returns names in arbitrary order; page navigation
        # and look_for() both assume that index order is page order.
        self.thumbnail_list = sorted(os.listdir(self.thumbnails))
        self.guidewords_list = [None] * len(self.thumbnail_list)
        self.guidewordlog = guidewordlog
        self.read_guidewords()

    def read_guidewords(self):
        """Replay the guide-word log into ``guidewords_list``.

        Later records for the same page override earlier ones.  Records
        whose page number does not exist in this book are skipped.
        """
        self.guidewordlog.seek(0)
        # NOTE: pickle is only acceptable here because the log is written
        # by add_guidewords() below; never point this at an untrusted file.
        try:
            while 1:
                pageno, first, last = pickle.load(self.guidewordlog)
                if self.has_page(pageno):
                    self.guidewords_list[pageno] = first, last
                else:
                    print("Ignoring guidewords for unknown page %s"
                          % (pageno,))
        except EOFError:
            pass

    def has_page(self, index):
        """Return True if *index* is a valid page index for this book."""
        return 0 <= index < len(self.thumbnail_list)

    def images_resource(self):
        """Return a static web resource serving the full-size images."""
        return twisted.web.static.File(self.images)

    def thumbnails_resource(self):
        """Return a static web resource serving the thumbnails."""
        return twisted.web.static.File(self.thumbnails)

    def viewer_resource(self):
        """Return the web resource used to browse and search the pages."""
        return ThumbnailNavigator(self)

    def thumbnail_url(self, index):
        """Return the site-relative URL of the thumbnail of page *index*."""
        return "/thumbnails/" + self.thumbnail_list[index]

    def fullsize_url(self, index):
        """Return the site-relative URL of the full image of page *index*."""
        return "/images/" + self.thumbnail_list[index]

    def add_guidewords(self, index, first, last):
        """Record the guide words of page *index* and append them to the log."""
        self.guidewords_list[index] = first, last
        print("Page %s has guidewords %s and %s" % (index, first, last))
        pickle.dump((index, first, last), self.guidewordlog)
        # Flush right away so a crash does not lose what the user typed.
        self.guidewordlog.flush()

    def get_guidewords(self, index):
        """Return the (first, last) guide words of a page, or None if unknown."""
        return self.guidewords_list[index]

    def should_be_on_page(self, index, word):
        """Return True if *word* falls within the guide words of page *index*.

        Returns False when the guide words of that page are not known.
        """
        guidewords = self.get_guidewords(index)
        if not guidewords:
            return False
        first, last = guidewords
        return (not dict_precedes(word, first)
                and not dict_precedes(last, word))

    def look_for(self, word):
        """Return the index of the page on which to look for *word* next.

        If a page with known guide words brackets the word, that page is
        returned.  Otherwise the known guide words are used to narrow an
        interval of candidate pages and its midpoint is returned, so that
        repeated calls (with guide words added in between) behave like a
        binary search.
        """
        page_count = len(self.guidewords_list)
        if not page_count:
            return 0
        # Clamp the initial bounds so that a book with fewer pages than
        # the defaults never yields an index past its last page.
        last_index = page_count - 1
        last_before = min(self.SEARCH_LOWER_BOUND, last_index)
        first_after = min(self.SEARCH_UPPER_BOUND, last_index)
        word = word.lower()
        for ii in range(page_count):
            guidewords = self.get_guidewords(ii)
            if not guidewords:
                continue
            if self.should_be_on_page(ii, word):
                return ii
            first, last = guidewords
            # Pages at or beyond the known upper bound are ignored: their
            # guide words contradict what has already been established.
            if dict_precedes(last, word) and ii < first_after:
                last_before = ii
            if dict_precedes(word, first) and ii < first_after:
                first_after = ii
        if last_before < first_after - 1:
            return (last_before + first_after) // 2
        # one of these must be wrong?
        return random.choice([last_before, first_after])


class ThumbnailPage(twisted.web.resource.Resource):
    """Web page showing one thumbnail, with navigation and guide-word entry."""

    def __init__(self, book, index):
        twisted.web.resource.Resource.__init__(self)
        self.book = book
        self.index = index

    def render_GET(self, req):
        """Render the page; with a ``q`` argument, include the search form."""
        print("Displaying page %s" % (self.index,))
        form = ''
        if 'q' in req.args:
            word = req.args['q'][0]
            # An empty query has no first letter to test.
            if not word or word[0].lower() not in 'hijk':
                warning = "(Not in this dictionary, which only covers HIJK.)"
            elif self.book.should_be_on_page(self.index, word):
                warning = "(Should be on this page if it's in this book.)"
            else:
                warning = ''
            first, last = '', ''
            guidewords = self.book.get_guidewords(self.index)
            if guidewords:
                first, last = guidewords
            form = T.form(method="POST")[
                T.p["Searching for ", T.b[word], ". ", T.b[warning]],
                "Enter the guide words at the top of the page: ",
                T.input(type="text", name="first_word", value=first),
                T.input(type="text", name="last_word", value=last),
                T.input(type="hidden", name="q", value=word),
                T.input(type="submit", value="Update"),
            ]
        return flatten(
            T.html[
                T.head[T.title['Page number ', str(self.index)]],
                T.body[
                    form,
                    T.script(type="text/javascript")[
                        "document.forms[0][0].focus()"
                    ],
                    self.nav_link(self.index - 1, "Prev", req),
                    ' ',
                    T.a(href=self.book.fullsize_url(self.index))[
                        T.img(src=self.book.thumbnail_url(self.index),
                              align="top")
                    ],
                    ' ',
                    self.nav_link(self.index + 1, "Next", req),
                ],
            ])

    def render_POST(self, req):
        """Store submitted guide words, then redirect to the next candidate.

        Without a ``q`` argument there is nothing to search for, so the
        redirect goes back to this page.
        """
        args = req.args
        if 'first_word' in args:
            first = args['first_word'][0].strip()
            last = args.get('last_word', [''])[0].strip() or first
            # Guide words without any letters would sort before every
            # word and permanently skew later searches, so drop them.
            if _normalize(first) or _normalize(last):
                self.book.add_guidewords(self.index, first, last)
        base_url = req.prePathURL()
        word = args.get('q', [''])[0]
        if word:
            recommended_page_number = self.book.look_for(word)
            newurl = (str(recommended_page_number)
                      + '?q=' + urllib.quote(word))
        else:
            newurl = str(self.index)
        newurl = urlparse.urljoin(base_url, newurl)
        req.redirect(newurl)
        return ''  # who cares about Opera 1.0?

    def page_link(self, index, req):
        """Return the relative URL of page *index*, keeping any search word."""
        if 'q' in req.args:
            return '%d?q=%s' % (index, urllib.quote(req.args['q'][0]))
        else:
            return str(index)

    def nav_link(self, index, label, req):
        """Return a link to page *index*, or plain *label* if no such page."""
        if self.book.has_page(index):
            return T.a(href=self.page_link(index, req))[label]
        return label


class ThumbnailNavigator(twisted.web.resource.Resource):
    """Parent resource of the per-page views; also shows the search box."""

    def __init__(self, book):
        twisted.web.resource.Resource.__init__(self)
        self.book = book

    def getChild(self, childname, req):
        """Return the page resource named by *childname*, or a 404 resource."""
        try:
            index = int(childname)
        except ValueError:
            return twisted.web.error.NoResource("Misspelled page number.")
        # Reject indices outside the book; a negative one would otherwise
        # silently wrap around to the end of the page list.
        if not self.book.has_page(index):
            return twisted.web.error.NoResource("No such page.")
        return ThumbnailPage(self.book, index)

    def render_GET(self, req):
        """Render the entry page with a link to page 0 and a search form."""
        return flatten(
            T.html[
                T.head[T.title["OED Volume V"]],
                T.body[
                    "See ", T.a(href="page/0")["the first page"],
                    ' or search for a word from H to K. Searching may ',
                    'require that you type in the guidewords you see on ',
                    'up to six pages before you get to the correct page: ',
                    T.form(method='POST', action='page/1')[
                        T.input(name='q', value='hawk')
                    ],
                    T.script(type="text/javascript")[
                        "document.forms[0][0].focus()"
                    ],
                ]])


# Static HTML for the site root, rendered once at import time.
indexpage = flatten(
    T.html[
        T.head[T.title["Index to OED volume V site"]],
        T.body[
            T.h1["Index to OED volume V site"],
            T.ul[
                T.li[T.a(href="page")["My prototype viewer"]],
                T.li[T.a(href="images/")["Archive's index page"]],
                T.li[T.a(href="thumbnails/")["Raw thumbnails dir"]],
            ],
            T.address[T.a(href="mailto:[EMAIL PROTECTED]")[
                "Kragen Sitaker"]],
        ],
    ]
)


def ok(a, b):
    """Assert that *a* equals *b*, showing both values on failure."""
    assert a == b, (a, b)


def test():
    """Self-test of the dictionary ordering; run once at import time."""
    assert dict_precedes('ideologically', 'idiom')
    assert dict_precedes('Ideologically', 'idiom')
    assert dict_precedes('ideologically', 'Idiom')
    assert dict_precedes('Had-I-Wist', 'Haematite')
    assert dict_precedes('Hades', 'Had-I-Wist')
    assert dict_precedes('Hackthorn', 'Haddock')
    assert dict_precedes('Haddock', 'Hades')
    assert dict_precedes('Hackthorn', 'Hades')


test()


def main(port):
    """Serve the viewer for the book in the current directory on *port*.

    Expects the directories 'newenglishdict05murrmiss_jpg' and 'thumbnails'
    in the current directory; the guide-word log 'guidewords' is created
    there if it does not exist yet.
    """
    # Imported explicitly: the module-level imports do not name
    # twisted.web.server, so Site was only reachable if some other module
    # happened to import it first.
    import twisted.web.server

    root = twisted.web.resource.Resource()
    imagebase = '.'
    images = os.path.join(imagebase, 'newenglishdict05murrmiss_jpg')
    # An explicit check rather than an assert, which "python -O" strips.
    if not os.path.exists(images):
        sys.exit("image directory not found: " + images)
    thumbnails = os.path.join(imagebase, 'thumbnails')
    guidewordlog = os.path.join(imagebase, 'guidewords')
    # Mode 'r+' requires an existing file, so create an empty log on the
    # first run.
    if not os.path.exists(guidewordlog):
        open(guidewordlog, 'ab').close()
    book = Book(images, thumbnails, open(guidewordlog, 'r+b'))
    root.putChild('', twisted.web.static.Data(indexpage, 'text/html'))
    root.putChild('images', book.images_resource())
    root.putChild('thumbnails', book.thumbnails_resource())
    root.putChild('page', book.viewer_resource())
    twisted.internet.reactor.listenTCP(port, twisted.web.server.Site(root))
    print("listening %s" % (port,))
    twisted.internet.reactor.run()


if __name__ == '__main__':
    main(int(sys.argv[1]))