Browsing the OED by user-entered guidewords

Kragen Sitaker Tue, 15 Nov 2005 13:25:26 -0800

This is one of the stages of the project described at
http://lists.canonical.org/pipermail/kragen-tol/2005-October/000794.html.


This is currently running at http://considerate.murch-sitaker.org:8000/
and mostly seems to work OK technically --- the UI is a different
problem altogether.  New users universally enter the word they're
looking for in the guidewords boxes, rather than the guide words printed
at the top of the page image.  I haven't yet experimented with ways to
solve this user-interface problem, but it clearly needs solving.

I'm also planning to use a Google-Maps-like interface to avoid having to
download the entire 750K JPEG to see any part of the full-res page.

I need to figure out how to put the gigabyte of page image data up on
the web.

#!/usr/bin/python
import twisted.web.resource, twisted.internet.reactor, twisted.web.static, sys
import os.path, nevow, twisted.web.error, urllib, random, string, pickle
import urlparse
T = nevow.tags
# To do:
# D put up on web
# D make reduced images <1024 pixels high
# [EMAIL PROTECTED]:/mnt/raid/kragen/media/oed-v$ time
# for x in newenglishdict05murrmiss_jpg/*.jpg; do
#    convert -size 1024x1024 -resize 1024x1024 "$x"
#       thumbnails/"$(basename "$x")";
#    echo $x done;
# done
# D make navigation among these reduced images possible by wrapping them in
#   pages
# D remember the words at the top of each image
# D support navigation by words
# - make much smaller thumbnails somehow --- copying 100 thumbnails to
#   panacea only managed 40KB/s in 3:46, i.e. 226 seconds, or 2.26
#   seconds each.  Perhaps only the top 200 pixels or so, or 16KB each?
# D keep a file that knows the words at the top of each image
# - make a torrent for the images
# - make a streamlined data entry UI for guidewords?
# - remove "if it's in the book" when it's a headword
# - never redirect back to the same page

def flatten(stan):
    """Serialize a Nevow stan tree with nevow.flat.flatten and return a str."""
    rendered = nevow.flat.flatten(stan)
    return str(rendered)

def dict_precedes(a, b):
    """Return true if word `a` sorts strictly before word `b`.

    Both words are lowercased and stripped of everything except ASCII
    letters before being compared, so case, hyphens and spaces are
    ignored ('Hades' sorts before 'Had-I-Wist').
    """
    def sort_key(text):
        letters = []
        for ch in text.lower():
            if ch in string.ascii_letters:
                letters.append(ch)
        return ''.join(letters)
    key_a = sort_key(a)
    key_b = sort_key(b)
    return key_a < key_b

class Book:
    """A scanned dictionary volume plus the guidewords entered for its pages.

    Pages are identified by their position in `thumbnail_list`; the
    full-size image for a page is assumed to have the same file name as
    its thumbnail.  Guidewords (the first and last headword printed at
    the top of a page) are kept in memory in `guidewords_list` and
    appended to `guidewordlog` as pickled (index, first, last) tuples.
    """
    def __init__(self, images, thumbnails, guidewordlog,
                 first_body_page=21, last_body_page=1298):
        """Set up the book and replay the guideword log.

        images, thumbnails -- directories holding the full-size and the
            reduced page images.
        guidewordlog -- open, seekable, read/write file object holding
            the pickled guideword records.
        first_body_page, last_body_page -- initial lower and upper page
            bounds used by look_for().  The defaults are the values that
            were hard-coded for OED volume V; presumably they bracket
            the dictionary body of that volume -- confirm before reusing
            them for another book.
        """
        self.images = images
        self.thumbnails = thumbnails
        # os.listdir() returns names in arbitrary order.  Sort them so a
        # page index always denotes the same scan, both for Prev/Next
        # navigation and for the indices persisted in the guideword log.
        self.thumbnail_list = sorted(os.listdir(self.thumbnails))
        self.guidewords_list = [None] * len(self.thumbnail_list)
        self.guidewordlog = guidewordlog
        self.first_body_page = first_body_page
        self.last_body_page = last_body_page
        self.read_guidewords()

    def read_guidewords(self):
        """Replay every record in the guideword log into guidewords_list.

        Later records for the same page override earlier ones.  The log
        is read with pickle, so it must only ever contain data written
        by add_guidewords(); never point this at an untrusted file.
        """
        self.guidewordlog.seek(0)
        try:
            while 1:
                pageno, first, last = pickle.load(self.guidewordlog)
                self.guidewords_list[pageno] = first, last
        except EOFError:
            # Normal termination: we ran off the end of the log.
            pass

    def images_resource(self):
        """Return a static-file resource serving the full-size images."""
        return twisted.web.static.File(self.images)

    def thumbnails_resource(self):
        """Return a static-file resource serving the thumbnails."""
        return twisted.web.static.File(self.thumbnails)

    def viewer_resource(self):
        """Return the resource that serves the page-by-page viewer."""
        return ThumbnailNavigator(self)

    def thumbnail_url(self, index):
        """Return the site-relative URL of the thumbnail for page `index`."""
        return "/thumbnails/" + self.thumbnail_list[index]

    def fullsize_url(self, index):
        """Return the site-relative URL of the full-size image for `index`."""
        return "/images/" + self.thumbnail_list[index]

    def add_guidewords(self, index, first, last):
        """Record the guidewords for page `index` in memory and in the log."""
        self.guidewords_list[index] = first, last
        print("Page %s has guidewords %s and %s" % (index, first, last))
        pickle.dump((index, first, last), self.guidewordlog)
        # Flush right away so a crash does not lose user-entered data.
        self.guidewordlog.flush()

    def get_guidewords(self, index):
        """Return the (first, last) guidewords of a page, or None if unknown."""
        return self.guidewords_list[index]

    def should_be_on_page(self, index, word):
        """Return true if `word` falls within the known guidewords of a page.

        Pages whose guidewords have not been entered yet never match.
        """
        guidewords = self.get_guidewords(index)
        if not guidewords:
            return False
        first, last = guidewords
        return (not dict_precedes(word, first) and
                not dict_precedes(last, word))

    def look_for(self, word):
        """Return the index of the page most worth looking at for `word`.

        If some page's guidewords bracket the word, that page is
        returned.  Otherwise the known guidewords are used to narrow the
        range the word must lie in, and the midpoint of that range is
        returned so that repeated searches behave like a binary search.
        """
        last_before = self.first_body_page
        first_after = self.last_body_page
        for ii, guidewords in enumerate(self.guidewords_list):
            if not guidewords:
                continue
            if self.should_be_on_page(ii, word):
                return ii
            first, last = guidewords
            # Only pages before the current upper bound may tighten
            # either bound; later pages are still scanned above in case
            # one of them brackets the word.
            if ii < first_after:
                if dict_precedes(last, word):
                    last_before = ii
                if dict_precedes(word, first):
                    first_after = ii
        if last_before < first_after - 1:
            return (last_before + first_after) // 2
        else:
            # one of these must be wrong?
            return random.choice([last_before, first_after])

class ThumbnailPage(twisted.web.resource.Resource):
    """Web page for a single book page.

    GET shows the thumbnail with Prev/Next links and, while a search is
    in progress (a `q` argument is present), a form asking for the
    guidewords at the top of the page.  POST records submitted
    guidewords and redirects to the page the book recommends next.
    """
    def __init__(self, book, index):
        twisted.web.resource.Resource.__init__(self)
        self.book = book
        self.index = index

    def render_GET(self, req):
        """Render the page; include the guideword form when searching."""
        print("Displaying page %s" % (self.index,))
        form = ''
        if 'q' in req.args:
            word = req.args['q'][0]
            # An empty q= must be caught before looking at word[0],
            # which would otherwise raise IndexError.
            if not word or word[0].lower() not in 'hijk':
                warning = "(Not in this dictionary, which only covers HIJK.)"
            elif self.book.should_be_on_page(self.index, word):
                warning = "(Should be on this page if it's in this book.)"
            else: warning = ''
            first, last = '', ''
            guidewords = self.book.get_guidewords(self.index)
            if guidewords: first, last = guidewords
            form = T.form(method="POST")[
                T.p["Searching for ", T.b[word], ". ", T.b[warning]],
                "Enter the guide words at the top of the page: ",
                T.input(type="text", name="first_word", value=first),
                T.input(type="text", name="last_word", value=last),
                T.input(type="hidden", name="q", value=word),
                T.input(type="submit", value="Update"),
            ]
        return flatten(
            T.html[T.head[T.title['Page number ', str(self.index)]],
                T.body[
                    form,
                    T.script(type="text/javascript")[
                        "document.forms[0][0].focus()"
                    ],
                    T.a(href=self.page_link(self.index-1, req))["Prev"], ' ',
                    T.a(href=self.book.fullsize_url(self.index))[
                        T.img(src=self.book.thumbnail_url(self.index),
                              align="top")
                    ],
                    ' ', T.a(href=self.page_link(self.index+1, req))["Next"],
                ],
            ])

    def render_POST(self, req):
        """Record submitted guidewords, then redirect to the next page to try.

        Missing form fields are tolerated: a blank guideword box reuses
        the other box's value, entirely blank submissions are not
        recorded (an empty pair would sort before every word and skew
        later searches), and a POST without `q` simply redirects back to
        this page.
        """
        if 'first_word' in req.args:
            first = req.args['first_word'][0]
            last = req.args.get('last_word', [''])[0]
            first, last = first or last, last or first
            if first:
                self.book.add_guidewords(self.index, first, last)
        base_url = req.prePathURL()
        if 'q' in req.args:
            word = req.args['q'][0]
            recommended_page_number = self.book.look_for(word)
            newurl = str(recommended_page_number) + '?q=' + urllib.quote(word)
        else:
            newurl = str(self.index)
        newurl = urlparse.urljoin(base_url, newurl)
        req.redirect(newurl)
        return ''  # who cares about Opera 1.0?

    def page_link(self, index, req):
        """Return a relative link to page `index`, preserving any search word."""
        if 'q' in req.args:
            return '%d?q=%s' % (index, urllib.quote(req.args['q'][0]))
        else: return str(index)

class ThumbnailNavigator(twisted.web.resource.Resource):
    """Root of the viewer: a search form plus one child per page number."""
    def __init__(self, book):
        twisted.web.resource.Resource.__init__(self)
        self.book = book

    def getChild(self, childname, req):
        """Return the page resource for a numeric child name, else a 404.

        Only ValueError from int() is treated as a misspelled page
        number, so unrelated errors are no longer hidden.  Page numbers
        outside the book are rejected here rather than failing with
        IndexError (or silently wrapping around for negative numbers)
        when the page is rendered.
        """
        try:
            index = int(childname)
        except ValueError:
            return twisted.web.error.NoResource("Misspelled page number.")
        if not 0 <= index < len(self.book.thumbnail_list):
            return twisted.web.error.NoResource("No such page.")
        return ThumbnailPage(self.book, index)

    def render_GET(self, req):
        """Render the viewer's front page with the word-search form."""
        return flatten(
            T.html[T.head[T.title["OED Volume V"]],
                T.body["See ", T.a(href="page/0")["the first page"],
                    ' or search for a word from H to K.  Searching may ',
                    'require that you type in the guidewords you see on ',
                    'up to six pages before you get to the correct page: ',
                    T.form(method='POST', action='page/1')[
                        T.input(name='q', value='hawk')
                    ],
                    T.script(type="text/javascript")[
                        "document.forms[0][0].focus()"
                    ],
                ]])

# Static HTML for the site root, rendered once at import time.  It links
# to the three children that main() mounts: "page", "images" and
# "thumbnails".
# NOTE(review): the mailto: target below reads "[EMAIL PROTECTED]", which
# looks like a mail-archive redaction rather than a real address --
# confirm and restore the intended address before deploying.
indexpage = flatten(
    T.html[T.head[T.title["Index to OED volume V site"]],
        T.body[T.h1["Index to OED volume V site"],
            T.ul[
                T.li[T.a(href="page")["My prototype viewer"]],
                T.li[T.a(href="images/")["Archive's index page"]],
                T.li[T.a(href="thumbnails/")["Raw thumbnails dir"]],
            ],
            T.address[T.a(href="mailto:[EMAIL PROTECTED]")[
                "Kragen Sitaker"]],
        ],
    ]
)

def ok(a, b):
    """Assert that a equals b; on failure the error carries both values."""
    assert a == b, (a, b)
def test():
    """Self-check for dict_precedes: each pair must be in dictionary order."""
    ordered_pairs = [
        ('ideologically', 'idiom'),
        ('Ideologically', 'idiom'),
        ('ideologically', 'Idiom'),
        ('Had-I-Wist', 'Haematite'),
        ('Hades', 'Had-I-Wist'),
        ('Hackthorn', 'Haddock'),
        ('Haddock', 'Hades'),
        ('Hackthorn', 'Hades'),
    ]
    for earlier, later in ordered_pairs:
        assert dict_precedes(earlier, later)
# Run the self-check every time the module is loaded.
test()

def main(port):
    """Serve the OED viewer on TCP port `port` until the reactor stops.

    The page images, thumbnails and guideword log are looked up relative
    to the current directory.  The guideword log is created empty if it
    does not exist yet, so a first run no longer fails when opening it
    for update.
    """
    # twisted.web.server is used below but is not imported at file level;
    # import it explicitly instead of relying on another twisted module
    # having loaded it.  This must stay the first statement, because it
    # makes `twisted` a local name for the rest of the function.
    import twisted.web.server
    root = twisted.web.resource.Resource()
    imagebase = '.'
    images = os.path.join(imagebase, 'newenglishdict05murrmiss_jpg')
    assert os.path.exists(images)
    thumbnails = os.path.join(imagebase, 'thumbnails')
    guidewordlog = os.path.join(imagebase, 'guidewords')
    if not os.path.exists(guidewordlog):
        # 'r+' requires an existing file; start with an empty log.
        open(guidewordlog, 'w').close()
    book = Book(images, thumbnails, open(guidewordlog, 'r+'))

    root.putChild('', twisted.web.static.Data(indexpage, 'text/html'))
    root.putChild('images', book.images_resource())
    root.putChild('thumbnails', book.thumbnails_resource())
    root.putChild('page', book.viewer_resource())

    twisted.internet.reactor.listenTCP(port,
        twisted.web.server.Site(root))
    print("listening %s" % (port,))
    twisted.internet.reactor.run()
if __name__ == '__main__':
    main(int(sys.argv[1]))

Browsing the OED by user-entered guidewords

Reply via email to