This is one of the stages of the project described at [URL missing from this copy].

This is currently running at [URL missing from this copy]
and mostly seems to work OK technically --- the UI is a different
problem altogether.  New users universally enter the word they're
looking for in the guidewords boxes.  I haven't yet experimented with
ways to solve this user interface problem, but it clearly needs solving.

I'm also planning to use a Google-Maps-like interface to avoid having to
download the entire 750K JPEG to see any part of the full-res page.

I need to figure out how to put the gigabyte of page image data up on
the web.

import twisted.web.resource, twisted.internet.reactor, twisted.web.static, sys
import os.path, nevow, twisted.web.error, urllib, random, string, pickle
import urlparse
# Short alias for Nevow's tag namespace; the page-building code below
# writes T.html, T.form, T.a, etc. instead of nevow.tags.html, ...
T = nevow.tags
# To do:
# D put up on web
# D make reduced images <1024 pixels high
# [EMAIL PROTECTED]:/mnt/raid/kragen/media/oed-v$ time
# for x in newenglishdict05murrmiss_jpg/*.jpg; do
#    convert -size 1024x1024 -resize 1024x1024 "$x"
#       thumbnails/"$(basename "$x")";
#    echo $x done;
# done
# D make navigation among these reduced images possible by wrapping them in
#   pages
# D remember the words at the top of each image
# D support navigation by words
# - make much smaller thumbnails somehow --- copying 100 thumbnails to
#   panacea only managed 40KB/s in 3:46, i.e. 226 seconds, or 2.26
#   seconds each.  Perhaps only the top 200 pixels or so, or 16KB each?
# D keep a file that knows the words at the top of each image
# - make a torrent for the images
# - make a streamlined data entry UI for guidewords?
# - remove "if it's in the book" when it's a headword
# - never redirect back to the same page

def flatten(stan):
    """Serialize a Nevow tag tree with nevow.flat.flatten and return it as str."""
    rendered = nevow.flat.flatten(stan)
    return str(rendered)

def dict_precedes(a, b):
    """Return True if word `a` sorts strictly before word `b`.

    Each word is lower-cased and every character that is not an ASCII
    letter (hyphens, spaces, digits, ...) is dropped before the two are
    compared, so 'Had-I-Wist' is compared as 'hadiwist'.
    """
    def sort_key(text):
        kept = []
        for ch in text.lower():
            if ch in string.ascii_letters:
                kept.append(ch)
        return ''.join(kept)

    key_a = sort_key(a)
    key_b = sort_key(b)
    return key_a < key_b

class Book:
    """A scanned book: page images, thumbnails, and per-page guidewords.

    Pages are addressed by their index into `thumbnail_list`.  The
    guidewords (first and last headword printed at the top of a page) are
    kept in memory in `guidewords_list` and appended to `guidewordlog`, an
    open file object holding a sequence of pickled
    (index, first, last) tuples.
    """
    def __init__(self, images, thumbnails, guidewordlog):
        """Remember the image/thumbnail directories and the open log file."""
        self.images = images
        self.thumbnails = thumbnails
        # os.listdir returns names in arbitrary order; sort them so that a
        # page index always names the same scan, both for Prev/Next
        # navigation and for indices stored in the guideword log.
        self.thumbnail_list = sorted(os.listdir(self.thumbnails))
        self.guidewords_list = [None] * len(self.thumbnail_list)
        self.guidewordlog = guidewordlog
    def read_guidewords(self):
        """Replay every record in the guideword log into memory.

        Reads until end of file, which also leaves the file positioned at
        its end so that later add_guidewords calls append to the log.
        """
        # NOTE: pickle.load can execute arbitrary code; the log must only
        # ever contain records written by add_guidewords.
        try:
            while 1:
                pageno, first, last = pickle.load(self.guidewordlog)
                self.guidewords_list[pageno] = first, last
        except EOFError:
            pass  # normal termination: the whole log has been consumed
    def images_resource(self):
        """Return a static-file resource serving the full-size images."""
        return twisted.web.static.File(self.images)
    def thumbnails_resource(self):
        """Return a static-file resource serving the thumbnails."""
        return twisted.web.static.File(self.thumbnails)
    def viewer_resource(self):
        """Return the resource that serves the per-page viewer."""
        return ThumbnailNavigator(self)
    def thumbnail_url(self, index):
        """Return the site-relative URL of page `index`'s thumbnail."""
        return "/thumbnails/" + self.thumbnail_list[index]
    def fullsize_url(self, index):
        """Return the site-relative URL of page `index`'s full-size image.

        The thumbnail's file name is reused under /images/.
        """
        return "/images/" + self.thumbnail_list[index]
    def add_guidewords(self, index, first, last):
        """Record the guidewords of page `index` in memory and in the log."""
        self.guidewords_list[index] = first, last
        # Single formatted string: prints the same text under the Python 2
        # print statement and the Python 3 print function.
        print("Page %s has guidewords %s and %s" % (index, first, last))
        pickle.dump((index, first, last), self.guidewordlog)
    def get_guidewords(self, index):
        """Return the (first, last) guidewords of a page, or None if unknown."""
        return self.guidewords_list[index]
    def should_be_on_page(self, index, word):
        """Return True if `word` falls within page `index`'s guidewords.

        Returns False when the page's guidewords are not known yet.
        """
        guidewords = self.get_guidewords(index)
        if not guidewords: return False
        first, last = guidewords
        return (not dict_precedes(word, first) and
                not dict_precedes(last, word))
    def look_for(self, word):
        """Return the index of the page to look at next when seeking `word`.

        If some page with known guidewords brackets the word, that page is
        returned.  Otherwise the search narrows to the latest known page
        ending before the word and the earliest known page starting after
        it, and returns the page midway between them.  When those two
        pages are adjacent (or out of order) the guidewords contradict
        each other, so one of the two is picked at random.
        """
        last_before, first_after = 21, 1298  # XXX for OED vol V
        word = word.lower()
        for ii, guidewords in enumerate(self.guidewords_list):
            if not guidewords: continue
            if self.should_be_on_page(ii, word): return ii
            first, last = guidewords
            if dict_precedes(last, word) and ii < first_after:
                last_before = ii
            if dict_precedes(word, first) and ii < first_after:
                first_after = ii
        if last_before < first_after - 1:
            # Floor division gives the same integer on Python 2 and 3.
            return (last_before + first_after) // 2
        else:
            # one of these must be wrong?
            return random.choice([last_before, first_after])

class ThumbnailPage(twisted.web.resource.Resource):
    """Web resource showing one page of a book, addressed by page index.

    NOTE(review): this class reached us with every `self.book` reference
    and several closing-bracket lines stripped out; they have been
    restored from context.  Spots that had to be guessed are marked below.
    """
    def __init__(self, book, index):
        """Remember the book and the index of the page to display."""
        twisted.web.resource.Resource.__init__(self)
        self.book = book
        self.index = index
    def render_GET(self, req):
        """Render the page, plus a guideword entry form during a search.

        The form is shown only when the request carries a `q` argument
        (the word being searched for); it is pre-filled with the page's
        guidewords if they are already known.
        """
        print("Displaying page %s" % self.index)
        form = ''
        if 'q' in req.args:
            word = req.args['q'][0]
            # An empty search word has no first letter; treat it as
            # outside the dictionary rather than crashing on word[0].
            if not word or word[0].lower() not in 'hijk':
                warning = "(Not in this dictionary, which only covers HIJK.)"
            elif self.book.should_be_on_page(self.index, word):
                warning = "(Should be on this page if it's in this book.)"
            else: warning = ''
            first, last = '', ''
            guidewords = self.book.get_guidewords(self.index)
            if guidewords: first, last = guidewords
            form = T.form(method="POST")[
                T.p["Searching for ", T.b[word], ". ", T.b[warning]],
                "Enter the guide words at the top of the page: ",
                T.input(type="text", name="first_word", value=first),
                T.input(type="text", name="last_word", value=last),
                T.input(type="hidden", name="q", value=word),
                T.input(type="submit", value="Update"),
            ]
        # NOTE(review): the body wrapper and the element between the Prev
        # and Next links were missing; a thumbnail linking to the
        # full-size image is an assumption -- confirm against the original.
        return flatten(
            T.html[T.head[T.title['Page number ', str(self.index)]],
                T.body[form,
                    T.a(href=self.page_link(self.index-1, req))["Prev"], ' ',
                    T.a(href=self.book.fullsize_url(self.index))[
                        T.img(src=self.book.thumbnail_url(self.index))],
                    ' ', T.a(href=self.page_link(self.index+1, req))["Next"],
                ]])
    def render_POST(self, req):
        """Store submitted guidewords, then redirect to the next best page.

        If only the first guideword is filled in, it is used for both.
        """
        if 'first_word' in req.args:
            first = req.args['first_word'][0]
            last = req.args['last_word'][0] or first
            self.book.add_guidewords(self.index, first, last)
        base_url = req.prePathURL()
        word = req.args['q'][0]
        recommended_page_number = self.book.look_for(word)
        newurl = str(recommended_page_number) + '?q=' + urllib.quote(word)
        newurl = urlparse.urljoin(base_url, newurl)
        # NOTE(review): the line that used `newurl` was missing; issuing a
        # redirect is assumed from the empty response body -- confirm.
        req.redirect(newurl)
        return ''  # who cares about Opera 1.0?
    def page_link(self, index, req):
        """Return a relative link to page `index`, keeping any search word."""
        if 'q' in req.args:
            return '%d?q=%s' % (index, urllib.quote(req.args['q'][0]))
        else: return str(index)

class ThumbnailNavigator(twisted.web.resource.Resource):
    """Parent resource of the viewer: a search page plus one child per page.

    NOTE(review): the `self.book` references and the closing brackets of
    render_GET were stripped from this class and have been restored from
    context.
    """
    def __init__(self, book):
        """Remember the book whose pages are served."""
        twisted.web.resource.Resource.__init__(self)
        self.book = book
    def getChild(self, childname, req):
        """Return the page resource for a numeric child name.

        Names that do not parse as an integer get a NoResource error page.
        """
        # Only a malformed number is expected here; a bare except would
        # also hide unrelated errors.
        try: index = int(childname)
        except ValueError:
            return twisted.web.error.NoResource("Misspelled page number.")
        return ThumbnailPage(self.book, index)
    def render_GET(self, req):
        """Render the entry page: a link to page 0 and a word search form."""
        return flatten(
            T.html[T.head[T.title["OED Volume V"]],
                T.body["See ", T.a(href="page/0")["the first page"],
                    ' or search for a word from H to K.  Searching may ',
                    'require that you type in the guidewords you see on ',
                    'up to six pages before you get to the correct page: ',
                    T.form(method='POST', action='page/1')[
                        T.input(name='q', value='hawk')
                    ]]])

# Static HTML for the site root: links to the viewer and the raw
# image directories.
# NOTE(review): the tags wrapping the three links and the closing brackets
# were stripped from this expression; a T.ul of T.li items is assumed --
# confirm against the original.
indexpage = flatten(
    T.html[T.head[T.title["Index to OED volume V site"]],
        T.body[T.h1["Index to OED volume V site"],
            T.ul[
                T.li[T.a(href="page")["My prototype viewer"]],
                T.li[T.a(href="images/")["Archive's index page"]],
                T.li[T.a(href="thumbnails/")["Raw thumbnails dir"]],
            ],
            T.address[T.a(href="mailto:[EMAIL PROTECTED]")[
                "Kragen Sitaker"]],
        ]])

def ok(a, b):
    """Assert that a equals b; on failure the error carries the pair (a, b)."""
    assert a == b, (a, b)
def test():
    """Check dict_precedes on word pairs that must sort in the given order."""
    ordered_pairs = [
        ('ideologically', 'idiom'),
        ('Ideologically', 'idiom'),   # case of the first word is ignored
        ('ideologically', 'Idiom'),   # case of the second word is ignored
        ('Had-I-Wist', 'Haematite'),  # hyphens are ignored
        ('Hades', 'Had-I-Wist'),
        ('Hackthorn', 'Haddock'),
        ('Haddock', 'Hades'),
        ('Hackthorn', 'Hades'),
    ]
    for earlier, later in ordered_pairs:
        assert dict_precedes(earlier, later)

def main(port):
    """Build the site from the current directory and serve it on `port`.

    Expects ./newenglishdict05murrmiss_jpg (full-size scans), ./thumbnails
    and an existing ./guidewords log file.

    NOTE(review): the tail of this function (everything after the
    "listening" message) and any call loading the guideword log were
    missing; both are restored from context -- confirm.
    """
    # Site lives in twisted.web.server, which the file-level imports do
    # not name explicitly.
    import twisted.web.server

    root = twisted.web.resource.Resource()
    imagebase = '.'
    images = os.path.join(imagebase, 'newenglishdict05murrmiss_jpg')
    assert os.path.exists(images)
    thumbnails = os.path.join(imagebase, 'thumbnails')
    guidewordlog = os.path.join(imagebase, 'guidewords')
    # open() replaces the Python 2-only file(); binary mode because the
    # log holds pickle records.
    book = Book(images, thumbnails, open(guidewordlog, 'r+b'))
    # Load the saved guidewords; this also leaves the log positioned at
    # its end so that new records are appended instead of overwriting.
    book.read_guidewords()

    root.putChild('', twisted.web.static.Data(indexpage, 'text/html'))
    root.putChild('images', book.images_resource())
    root.putChild('thumbnails', book.thumbnails_resource())
    root.putChild('page', book.viewer_resource())

    print("listening %s" % port)
    twisted.internet.reactor.listenTCP(port, twisted.web.server.Site(root))
    twisted.internet.reactor.run()
# NOTE(review): the body of this guard was missing; taking the port from
# the first command-line argument is assumed -- confirm.
if __name__ == '__main__':
    main(int(sys.argv[1]))

Reply via email to