On Oct 20, Jay Klein <[EMAIL PROTECTED]> wrote:

> Let me know if you come up with another version or change, I'll be
> glad to test it out.

OK; attached there's another try.

It should now correctly handle splitting too-large sets of data, so
try to run it without modifying 'flushEvery' and let it run even if
a TOO MANY DATA warning appears (but watch for notices of lost data).

I've tested it on my local installation (changing the max_allowed_packet
of my MySQL server from the default of 16M to 2M and 1M) and it seems
to work.
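
(For reference, I'm running it with something along the lines of:
  python imdbpy2sql.py -d /dir/with/PlainTextDataFiles/ \
         -u mysql://user:password@localhost/imdb
adjusting the URI for your setup; see the HELP string in the script.)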

If you still have problems, I think we'll have to try:
  import blackmagic
  blackmagic.debug(verbose=True)

;-)

Thanks again for the tests!
-- 
Davide Alberani <[EMAIL PROTECTED]> [PGP KeyID: 0x465BFD47]
http://erlug.linux.it/~da/
#!/usr/bin/env python
"""
imdbpy2sql.py script.

This script puts the data of the plain text data files into a
SQL database.

Copyright 2005-2006 Davide Alberani <[EMAIL PROTECTED]>
               2006 Giuseppe "Cowo" Corbelli <cowo --> lugbs.linux.it>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
"""

from __future__ import generators
import os, sys, getopt, time, re
from gzip import GzipFile
from types import UnicodeType

from sqlobject import *

from imdb.parser.sql import soundex
from imdb.parser.sql.dbschema import *
from imdb.utils import analyze_title, analyze_name, \
        build_name, build_title, normalizeName, _articles
from imdb.parser.local.movieParser import _bus, _ldk, _lit, _links_sect
from imdb.parser.local.personParser import _parseBiography
from imdb._exceptions import IMDbParserError

re_nameImdbIndex = re.compile(r'\(([IVXLCDM]+)\)')

HELP = """imdbpy2sql usage:
    %s -d /directory/with/PlainTextDataFiles/ -u URI

        # NOTE: URI is something along the lines of:
                scheme://[user[:password]@]host[:port]/database[?parameters]

                Examples:
                mysql://user:password@host/database
                postgres://user:password@host/database
                sqlite:/tmp/imdb.db
                sqlite:/C|/full/path/to/database

                See README.sqldb for more information.
""" % sys.argv[0]

# Directory containing the IMDb's Plain Text Data Files.
IMDB_PTDF_DIR = None
# URI used to connect to the database.
URI = None

# Manage arguments list.
try:
    optlist, args = getopt.getopt(sys.argv[1:], 'u:d:h',
                                                ['uri=', 'data=', 'help'])
except getopt.error, e:
    print 'Problems with the arguments.'
    print HELP
    sys.exit(2)

for opt in optlist:
    if opt[0] in ('-d', '--data'):
        IMDB_PTDF_DIR = opt[1]
    elif opt[0] in ('-u', '--uri'):
        URI = opt[1]
    elif opt[0] in ('-h', '--help'):
        print HELP
        sys.exit(0)

if IMDB_PTDF_DIR is None:
    print 'You must supply the directory with the plain text data files'
    print HELP
    sys.exit(2)

if URI is None:
    print 'You must supply the URI for the database connection'
    print HELP
    sys.exit(2)

# Connect to the database.
conn = setConnection(URI)
# Extract exceptions to trap.
OperationalError = conn.module.OperationalError
IntegrityError = conn.module.IntegrityError

connectObject = conn.getConnection()
# Cursor object.
CURS = connectObject.cursor()

# Name of the database and style of the parameters.
DB_NAME = conn.dbName
PARAM_STYLE = conn.module.paramstyle
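# E.g.: with MySQLdb, DB_NAME is 'mysql' and PARAM_STYLE is 'format'
# (paramstyle is defined by the DB-API 2.0 specification).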


def tableName(table):
    """Return a string with the name of the table in the current db."""
    return table.sqlmeta.table

def colName(table, column):
    """Return a string with the name of the column in the current db."""
    if column == 'id':
        return table.sqlmeta.idName
    return table.sqlmeta.columns[column].dbName


class RawValue(object):
    """String-like objects to store raw SQL parameters, that are not
    intended to be replaced with positional parameters, in the query."""
    def __init__(self, s, v):
        self.string = s
        self.value = v
    def __str__(self):
        return self.string
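
# E.g.: passing RawValue('roleID', 1) to createSQLstr() below causes the
# literal value 1 to be embedded in the VALUES clause, instead of a
# positional placeholder.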


def _makeConvNamed(cols):
    """Return a function to be used to convert a list of parameters
    from positional style to named style (convert from a list of
    tuples to a list of dictionaries."""
    nrCols = len(cols)
    def _converter(params):
        for paramIndex, paramSet in enumerate(params):
            d = {}
            for i in xrange(nrCols):
                d[cols[i]] = paramSet[i]
            params[paramIndex] = d
        return params
    return _converter

def createSQLstr(table, cols, command='INSERT'):
    """Given a table and a list of columns returns a sql statement
    useful to insert a set of data in the database.
    Along with the string, also a function useful to convert parameters
    from positional to named style is returned."""
    sqlstr = '%s INTO %s ' % (command, tableName(table))
    colNames = []
    values = []
    convCols = []
    count = 1
    def _valStr(s, index):
        if DB_NAME in ('mysql', 'postgres'): return '%s'
        elif PARAM_STYLE == 'format': return '%s'
        elif PARAM_STYLE == 'qmark': return '?'
        elif PARAM_STYLE == 'numeric': return ':%s' % index
        elif PARAM_STYLE == 'named': return ':%s' % s
        elif PARAM_STYLE == 'pyformat': return '%(' + s + ')s'
    for col in cols:
        if isinstance(col, RawValue):
            colNames.append(colName(table, col.string))
            values.append(str(col.value))
        elif col == 'id':
            colNames.append(table.sqlmeta.idName)
            values.append(_valStr('id', count))
            convCols.append(col)
            count += 1
        else:
            colNames.append(colName(table, col))
            values.append(_valStr(col, count))
            convCols.append(col)
            count += 1
    sqlstr += '(%s) ' % ', '.join(colNames)
    sqlstr += 'VALUES (%s)' % ', '.join(values)
    if DB_NAME not in ('mysql', 'postgres') and \
            PARAM_STYLE in ('named', 'pyformat'):
        converter = _makeConvNamed(convCols)
    else:
        # Return the list itself.
        converter = lambda x: x
    return sqlstr, converter
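
# A minimal sketch of what createSQLstr() returns, assuming a MySQL
# connection and the default table/column names of the imdb.parser.sql
# schema:
#   sqlstr, conv = createSQLstr(Title, ('id', 'title'))
#   # sqlstr would be something like:
#   #   INSERT INTO title (id, title) VALUES (%s, %s)
#   # and conv() would leave the positional parameters untouched.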

def _(s):
    """Nicely print a string to sys.stdout."""
    if not isinstance(s, UnicodeType):
        s = unicode(s, 'utf_8')
    s = s.encode(sys.stdout.encoding or 'utf_8', 'replace')
    return s

# Show the time consumed by a single function call.
CTIME = int(time.time())
BEGIN_TIME = CTIME
def t(s):
    global CTIME
    nt = int(time.time())
    print '# TIME', s, ': %d min, %s sec.' % divmod(nt-CTIME, 60)
    CTIME = nt

def title_soundex(title):
    """Return the soundex code for the given title; the (optional) ending
    article is pruned.  It assumes to receive a title without year/imdbIndex
    or kind indications, but just the title string, as the one in the
    analyze_title(title)['title'] value."""
    title = unicode(title, 'utf_8')
    # Prune non-ascii chars from the string.
    title = title.encode('ascii', 'replace')
    if not title: return None
    ts = title.split(', ')
    # Strip the ending article, if any.
    if ts[-1].lower() in _articles:
        title = ', '.join(ts[:-1])
    return soundex(title)
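
# E.g.: title_soundex('Matrix, The') prunes the ending article and returns
# the soundex code of 'Matrix' (presumably something like 'M362').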

def name_soundexes(name):
    """Return three soundex codes for the given name; the name is assumed
    to be in the 'surname, name' format, without the imdbIndex indication,
    as the one in the analyze_name(name)['name'] value.
    The first one is the soundex of the name in the canonical format.
    The second is the soundex of the name in the normal format, if different
    from the first one.
    The third is the soundex of the surname, if different from the
    other two values."""
    name = unicode(name, 'utf_8')
    # Prune non-ascii chars from the string.
    name = name.encode('ascii', 'ignore')
    if not name: return (None, None, None)
    s1 = soundex(name)
    name_normal = normalizeName(name)
    s2 = soundex(name_normal)
    if s1 == s2: s2 = None
    namesplit = name.split(', ')
    s3 = soundex(namesplit[0])
    if s3 and s3 in (s1, s2): s3 = None
    return (s1, s2, s3)
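
# E.g.: for 'Alberani, Davide' the three codes are the soundex of
# 'Alberani, Davide', of 'Davide Alberani' (the normalized form) and of
# 'Alberani' alone; duplicated codes are replaced with None.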


# Handle laserdisc keys.
for key, value in _ldk.items():
    _ldk[key] = 'LD %s' % value


# Tags to identify where the meaningful data begin/end in files.
MOVIES = 'movies.list.gz'
MOVIES_START = ('MOVIES LIST', '===========', '')
MOVIES_STOP = '--------------------------------------------------'
CAST_START = ('Name', '----')
CAST_STOP = '-----------------------------'
RAT_START = ('MOVIE RATINGS REPORT', '',
            'New  Distribution  Votes  Rank  Title')
RAT_STOP = '\n'
RAT_TOP250_START = ('note: for this top 250', '', 'New  Distribution')
RAT_BOT10_START = ('BOTTOM 10 MOVIES', '', 'New  Distribution')
TOPBOT_STOP = '\n'
AKAT_START = ('AKA TITLES LIST', '=============', '', '', '')
AKAT_IT_START = ('AKA TITLES LIST ITALIAN', '=======================', '', '')
AKAT_DE_START = ('AKA TITLES LIST GERMAN', '======================', '')
AKAT_ISO_START = ('AKA TITLES LIST ISO', '===================', '')
AKAT_HU_START = ('AKA TITLES LIST HUNGARIAN', '=========================', '')
AKAT_NO_START = ('AKA TITLES LIST NORWEGIAN', '=========================', '')
AKAN_START = ('AKA NAMES LIST', '=============', '')
AV_START = ('ALTERNATE VERSIONS LIST', '=======================', '', '')
MINHASH_STOP = '-------------------------'
GOOFS_START = ('GOOFS LIST', '==========', '')
QUOTES_START = ('QUOTES LIST', '=============')
CC_START = ('CRAZY CREDITS', '=============')
BIO_START = ('BIOGRAPHY LIST', '==============')
BUS_START = ('BUSINESS LIST', '=============', '')
BUS_STOP = '                                    ====='
CER_START = ('CERTIFICATES LIST', '=================')
COL_START = ('COLOR INFO LIST', '===============')
COU_START = ('COUNTRIES LIST', '==============')
DIS_START = ('DISTRIBUTORS LIST', '=================', '')
GEN_START = ('8: THE GENRES LIST', '==================', '')
KEY_START = ('8: THE KEYWORDS LIST', '====================', '')
LAN_START = ('LANGUAGE LIST', '=============')
LOC_START = ('LOCATIONS LIST', '==============', '')
MIS_START = ('MISCELLANEOUS COMPANY LIST', '==========================')
PRO_START = ('PRODUCTION COMPANIES LIST', '=========================', '')
RUN_START = ('RUNNING TIMES LIST', '==================')
SOU_START = ('SOUND-MIX LIST', '==============')
SFX_START = ('SFXCO COMPANIES LIST', '====================', '')
TCN_START = ('TECHNICAL LIST', '==============', '', '')
LSD_START = ('LASERDISC LIST', '==============', '------------------------')
LIT_START = ('LITERATURE LIST', '===============', '')
LIT_STOP = 'COPYING POLICY'
LINK_START = ('MOVIE LINKS LIST', '================', '')
MPAA_START = ('MPAA RATINGS REASONS LIST', '=========================')
PLOT_START = ('PLOT SUMMARIES LIST', '===================', '')
RELDATE_START = ('RELEASE DATES LIST', '==================')
SNDT_START = ('SOUNDTRACKS LIST', '================', '', '', '')
TAGL_START = ('TAG LINES LIST', '==============', '', '')
TAGL_STOP = '-----------------------------------------'
TRIV_START = ('FILM TRIVIA', '===========', '')
COMPCAST_START = ('CAST COVERAGE TRACKING LIST', '===========================')
COMPCREW_START = ('CREW COVERAGE TRACKING LIST', '===========================')
COMP_STOP = '---------------'

GzipFileRL = GzipFile.readline
class SourceFile(GzipFile):
    """Instances of this class are used to read gzipped files,
    starting from a defined line to a (optionally) given end."""
    def __init__(self, filename=None, mode=None, start=(), stop=None,
                    pwarning=1, *args, **kwds):
        filename = os.path.join(IMDB_PTDF_DIR, filename)
        try:
            GzipFile.__init__(self, filename, mode, *args, **kwds)
        except IOError, e:
            if not pwarning: raise
            print 'WARNING WARNING WARNING'
            print 'WARNING unable to read the "%s" file.' % filename
            print 'WARNING The file will be skipped, and the contained'
            print 'WARNING information will NOT be stored in the database.'
            print 'WARNING Complete error: ', e
            # re-raise the exception.
            raise
        self.start = start
        for item in start:
            itemlen = len(item)
            for line in self:
                if line[:itemlen] == item: break
        self.set_stop(stop)

    def set_stop(self, stop):
        if stop is not None:
            self.stop = stop
            self.stoplen = len(self.stop)
            self.readline = self.readline_checkEnd
        else:
            self.readline = self.readline_NOcheckEnd

    def readline_NOcheckEnd(self, size=-1):
        line = GzipFile.readline(self, size)
        return unicode(line, 'latin_1').encode('utf_8')

    def readline_checkEnd(self, size=-1):
        line = GzipFile.readline(self, size)
        if self.stop is not None and line[:self.stoplen] == self.stop: return ''
        return unicode(line, 'latin_1').encode('utf_8')

    def getByHashSections(self):
        return getSectionHash(self)

    def getByNMMVSections(self):
        return getSectionNMMV(self)


def getSectionHash(fp):
    """Return sections separated by lines starting with #"""
    curSectList = []
    curSectListApp = curSectList.append
    curTitle = ''
    joiner = ''.join
    for line in fp:
        if line and line[0] == '#':
            if curSectList and curTitle:
                yield curTitle, joiner(curSectList)
                curSectList[:] = []
                curTitle = ''
            curTitle = line[2:]
        else: curSectListApp(line)
    if curSectList and curTitle:
        yield curTitle, joiner(curSectList)
        curSectList[:] = []
        curTitle = ''
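
# E.g.: a file with lines like:
#   # Foo, The (2004)
#   - Some data.
# makes getSectionHash() yield ('Foo, The (2004)\n', '- Some data.\n')
# pairs (newlines included, since lines are accumulated as-is).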

NMMVSections = dict([(x, None) for x in ('MV: ', 'NM: ', 'OT: ', 'MOVI')])
NMMVSectionsHASK = NMMVSections.has_key
def getSectionNMMV(fp):
    """Return sections separated by lines starting with 'NM: ', 'MV: ',
    'OT: ' or 'MOVI'."""
    curSectList = []
    curSectListApp = curSectList.append
    curNMMV = ''
    joiner = ''.join
    for line in fp:
        if NMMVSectionsHASK(line[:4]):
            if curSectList and curNMMV:
                yield curNMMV, joiner(curSectList)
                curSectList[:] = []
                curNMMV = ''
            if line[:4] == 'MOVI': curNMMV = line[6:]
            else: curNMMV = line[4:]
        elif not (line and line[0] == '-'): curSectListApp(line)
    if curSectList and curNMMV:
        yield curNMMV, joiner(curSectList)
        curSectList[:] = []
        curNMMV = ''

def counter(initValue=1):
    """A counter implemented using a generator."""
    i = initValue
    while 1:
        yield i
        i += 1
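
# E.g.: c = counter(10); c.next() returns 10, then 11, and so on.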

class _BaseCache(dict):
    """Base class for Movie and Person basic information."""
    def __init__(self, d=None, flushEvery=100000):
        dict.__init__(self)
        # Flush data into the SQL database every flushEvery entries.
        self.flushEvery = flushEvery
        self._tmpDict = {}
        self._flushing = 0
        self._deferredData = {}
        self._recursionLevel = 0
        if d is not None:
            for k, v in d.iteritems(): self[k] = v

    def __setitem__(self, key, counter):
        """Every time a key is set, its value is the counter;
        every flushEvery, the temporary dictionary is
        flushed to the database, and then zeroed."""
        if counter % self.flushEvery == 0:
            self.flush()
        dict.__setitem__(self, key, counter)
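        # While a flush is in progress _tmpDict is being written to the
        # database, and must not be modified: new data is deferred and
        # flushed in a second run.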
        if not self._flushing:
            self._tmpDict[key] = counter
        else:
            # Store the same counter used in the main dictionary, so that
            # the deferred rows are created with consistent ids.
            self._deferredData[key] = counter

    def flush(self, quiet=0, _recursionLevel=0):
        """Flush to the database."""
        if self._flushing: return
        self._flushing = 1
        if _recursionLevel >= 5:
            print 'WARNING recursion level exceeded trying to flush data'
            print 'WARNING this batch of data is lost (%s).' % self.className
            self._tmpDict.clear()
            # Reset the flag, so that subsequent flushes are not skipped.
            self._flushing = 0
            return
        if self._tmpDict:
            try:
                self._toDB(quiet)
                self._tmpDict.clear()
            except OperationalError, e:
                # Dataset too large; split it in two and retry.
                ## New code: the same class instance (self) is reused,
                ## instead of creating two separate objects.
                _recursionLevel += 1
                self._flushing = 0
                firstHalf = {}
                poptmpd = self._tmpDict.popitem
                originalLength = len(self._tmpDict)
                for x in xrange(1 + originalLength/2):
                    k, v = poptmpd()
                    firstHalf[k] = v
                print ' * TOO MANY DATA (%s items in %s), recursion: %s' % \
                                                        (originalLength,
                                                        self.className,
                                                        _recursionLevel)
                print '   * SPLITTING (run 1 of 2), recursion: %s' % \
                                                        _recursionLevel
                self.flush(quiet=quiet, _recursionLevel=_recursionLevel)
                self._tmpDict = firstHalf
                print '   * SPLITTING (run 2 of 2), recursion: %s' % \
                                                        _recursionLevel
                self.flush(quiet=quiet, _recursionLevel=_recursionLevel)
                self._tmpDict.clear()
        self._flushing = 0
        # Flush also deferred data.
        if self._deferredData:
            self._tmpDict = self._deferredData
            self.flush(quiet=1)
            self._deferredData = {}
        connectObject.commit()

    def populate(self):
        """Populate the dictionary from the database."""
        raise NotImplementedError

    def _toDB(self, quiet=0):
        """Write the dictionary to the database."""
        raise NotImplementedError

    def add(self, key, miscData=None):
        """Insert a new key and return its value."""
        c = self.counter.next()
        # miscData=[('a_dict', 'value')] will set self.a_dict's c key
        # to 'value'.
        if miscData is not None:
            for d_name, data in miscData:
                getattr(self, d_name)[c] = data
        self[key] = c
        return c

    def addUnique(self, key, miscData=None):
        """Insert a new key and return its value; if the key is already
        in the dictionary, its previous  value is returned."""
        if self.has_key(key): return self[key]
        else: return self.add(key, miscData)


def fetchsome(curs, size=20000):
    """Yes, I've read the Python Cookbook! :-)"""
    while 1:
        res = curs.fetchmany(size)
        if not res: break
        for r in res: yield r

class MoviesCache(_BaseCache):
    """Manage the movies list."""
    className = 'MoviesCache'
    counter = counter()

    def __init__(self, *args, **kwds):
        _BaseCache.__init__(self, *args, **kwds)
        self.episodesYear = {}
        self.sqlstr, self.converter = createSQLstr(Title, ('id', 'title',
                                    'imdbIndex', 'kindID', 'productionYear',
                                    'phoneticCode', 'episodeOfID',
                                    'seasonNr', 'episodeNr'))

    def populate(self):
        print ' * POPULATING %s...' % self.className
        titleTbl = tableName(Title)
        movieidCol = colName(Title, 'id')
        titleCol = colName(Title, 'title')
        kindidCol = colName(Title, 'kindID')
        yearCol = colName(Title, 'productionYear')
        imdbindexCol = colName(Title, 'imdbIndex')
        episodeofidCol = colName(Title, 'episodeOfID')
        seasonNrCol = colName(Title, 'seasonNr')
        episodeNrCol = colName(Title, 'episodeNr')
        sqlPop = 'SELECT %s, %s, %s, %s, %s, %s, %s, %s FROM %s;' % (movieidCol,
                    titleCol, kindidCol, yearCol, imdbindexCol,
                    episodeofidCol, seasonNrCol, episodeNrCol, titleTbl)
        CURS.execute(sqlPop)
        _oldcacheValues = Title.sqlmeta.cacheValues
        Title.sqlmeta.cacheValues = False
        for x in fetchsome(CURS, self.flushEvery):
            mdict = {'title': unicode(x[1], 'utf_8'), 'kind': KIND_STRS[x[2]],
                    'year': x[3], 'imdbIndex': x[4]}
            if mdict['imdbIndex'] is None: del mdict['imdbIndex']
            if mdict['year'] is None: del mdict['year']
            else: mdict['year'] = str(mdict['year'])
            episodeOfID = x[5]
            if episodeOfID is not None:
                s = Title.get(episodeOfID)
                series_d = {'title': s.title,
                            'kind': str(s.kind.kind),
                            'year': s.productionYear, 'imdbIndex': s.imdbIndex}
                if series_d['imdbIndex'] is None: del series_d['imdbIndex']
                if series_d['year'] is None: del series_d['year']
                else: series_d['year'] = str(series_d['year'])
                mdict['episode of'] = series_d
            title = build_title(mdict, canonical=1, ptdf=1)
            dict.__setitem__(self, title, x[0])
        self.counter = counter(Title.select().count() + 1)
        Title.sqlmeta.cacheValues = _oldcacheValues

    def _toDB(self, quiet=0):
        if not quiet:
            print ' * FLUSHING %s...' % self.className
            sys.stdout.flush()
        l = []
        lapp = l.append
        tmpDictiter = self._tmpDict.iteritems
        for k, v in tmpDictiter():
            try:
                t = analyze_title(k)
            except IMDbParserError:
                if k and k.strip():
                    print 'WARNING %s._toDB() invalid title:' % self.className,
                    print _(k)
                continue
            tget = t.get
            episodeOf = None
            kind = tget('kind')
            if kind == 'episode':
                # Series title.
                stitle = build_title(tget('episode of'), canonical=1)
                episodeOf = self.addUnique(stitle)
                del t['episode of']
                year = self.episodesYear.get(v)
                if year is not None:
                    try: t['year'] = int(year)
                    except ValueError: pass
            title = tget('title')
            soundex = title_soundex(title)
            lapp((v, title, tget('imdbIndex'), KIND_IDS[kind],
                    tget('year'), soundex, episodeOf,
                    tget('season'), tget('episode')))
        self._runCommand(l)

    def _runCommand(self, dataList):
        CURS.executemany(self.sqlstr, self.converter(dataList))


class PersonsCache(_BaseCache):
    """Manage the persons list."""
    counter = counter()

    def __init__(self, *args, **kwds):
        _BaseCache.__init__(self, *args, **kwds)
        self.sqlstr, self.converter = createSQLstr(Name, ['id', 'name',
                                'imdbIndex', 'namePcodeCf', 'namePcodeNf',
                                'surnamePcode'])

    def populate(self):
        print ' * POPULATING PersonsCache...'
        nameTbl = tableName(Name)
        personidCol = colName(Name, 'id')
        nameCol = colName(Name, 'name')
        imdbindexCol = colName(Name, 'imdbIndex')
        CURS.execute('SELECT %s, %s, %s FROM %s;' % (personidCol, nameCol,
                                                    imdbindexCol, nameTbl))
        _oldcacheValues = Name.sqlmeta.cacheValues
        Name.sqlmeta.cacheValues = False
        for x in fetchsome(CURS, self.flushEvery):
            nd = {'name': unicode(x[1], 'utf_8')}
            if x[2]: nd['imdbIndex'] = x[2]
            name = build_name(nd, canonical=1)
            dict.__setitem__(self, name, x[0])
        self.counter = counter(Name.select().count() + 1)
        Name.sqlmeta.cacheValues = _oldcacheValues

    def _toDB(self, quiet=0):
        if not quiet:
            print ' * FLUSHING PersonsCache...'
            sys.stdout.flush()
        l = []
        lapp = l.append
        tmpDictiter = self._tmpDict.iteritems
        for k, v in tmpDictiter():
            try:
                t = analyze_name(k)
            except IMDbParserError:
                if k and k.strip():
                    print 'WARNING PersonsCache._toDB() invalid name:', _(k)
                continue
            tget = t.get
            name = tget('name')
            namePcodeCf, namePcodeNf, surnamePcode = name_soundexes(name)
            lapp((v, name, tget('imdbIndex'),
                namePcodeCf, namePcodeNf, surnamePcode))
        CURS.executemany(self.sqlstr, self.converter(l))


class SQLData(dict):
    """Variable set of information, to be stored from time to time
    to the SQL database."""
    def __init__(self, table=None, cols=None, sqlString='', converter=None,
                d={}, flushEvery=20000, counterInit=1):
        if not sqlString:
            if not (table and cols):
                raise TypeError, '"table" or "cols" unspecified'
            sqlString, converter = createSQLstr(table, cols)
        elif converter is None:
            raise TypeError, '"sqlString" or "converter" unspecified'
        dict.__init__(self)
        self.counterInit = counterInit
        self.counter = counterInit
        self.flushEvery = flushEvery
        self.sqlString = sqlString
        self.converter = converter
        self._recursionLevel = 1
        for k, v in d.items(): self[k] = v

    def __setitem__(self, key, value):
        """The value is discarded, the counter is used as the 'real' key
        and the user's 'key' is used as its values."""
        counter = self.counter
        if counter % self.flushEvery == 0:
            self.flush()
        dict.__setitem__(self, counter, key)
        self.counter += 1

    def add(self, key):
        self[key] = None

    def flush(self, _resetRecursion=1):
        if not self: return
        # XXX: it's safer to flush MoviesCache and PersonsCache, to preserve
        #      consistency of ForeignKey, but it can also slow down everything
        #      a bit...
        CACHE_MID.flush(quiet=1)
        CACHE_PID.flush(quiet=1)
        if _resetRecursion: self._recursionLevel = 1
        if self._recursionLevel >= 5:
            print 'WARNING recursion level exceeded trying to flush data'
            print 'WARNING this batch of data is lost.'
            self.clear()
            self.counter = self.counterInit
            return
        try:
            self._toDB()
            self.clear()
            self.counter = self.counterInit
        except OperationalError, e:
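            # Too much data for a single executemany() call (e.g. MySQL's
            # max_allowed_packet limit exceeded): halve flushEvery, move
            # half of the data to a new SQLData instance, and flush the
            # two halves separately.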
            print ' * TOO MANY DATA (%s items), SPLITTING (run #%d)...' % \
                    (len(self), self._recursionLevel)
            self._recursionLevel += 1
            newdata = self.__class__(sqlString=self.sqlString,
                                    converter=self.converter)
            newdata._recursionLevel = self._recursionLevel
            newflushEvery = self.flushEvery / 2
            self.flushEvery = newflushEvery
            newdata.flushEvery = newflushEvery
            popitem = self.popitem
            dsi = dict.__setitem__
            for x in xrange(len(self)/2):
                k, v = popitem()
                dsi(newdata, k, v)
            newdata.flush(_resetRecursion=0)
            del newdata
            self.flush(_resetRecursion=0)
            self.clear()
            self.counter = self.counterInit
        connectObject.commit()

    def _toDB(self):
        print ' * FLUSHING SQLData...'
        CURS.executemany(self.sqlString, self.converter(self.values()))


# Miscellaneous functions.

def unpack(line, headers, sep='\t'):
    """Given a line, split it at sep and return a dictionary with keys
    taken from the headers list.
    E.g.:
        line = '      0000000124    8805   8.4  Incredibles, The (2004)'
        headers = ('votes distribution', 'votes', 'rating', 'title')
        sep = '  '

    returns: {'votes distribution': '0000000124', 'votes': '8805',
                    'rating': '8.4', 'title': 'Incredibles, The (2004)'}
    """
    r = {}
    ls1 = filter(None, line.split(sep))
    for index, item in enumerate(ls1):
        try: name = headers[index]
        except IndexError: name = 'item%s' % index
        r[name] = item.strip()
    return r

def _parseMinusList(fdata):
    """Parse a list of lines starting with '- '."""
    rlist = []
    tmplist = []
    for line in fdata:
        if line and line[:2] == '- ':
            if tmplist: rlist.append(' '.join(tmplist))
            l = line[2:].strip()
            if l: tmplist[:] = [l]
            else: tmplist[:] = []
        else:
            l = line.strip()
            if l: tmplist.append(l)
    if tmplist: rlist.append(' '.join(tmplist))
    return rlist
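
# E.g.: the lines ['- first item', '  continued', '- second item']
# produce ['first item continued', 'second item'].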


def _parseColonList(lines, replaceKeys):
    """Parser for lists with "TAG: value" strings."""
    out = {}
    for line in lines:
        line = line.strip()
        if not line: continue
        cols = line.split(':', 1)
        if len(cols) < 2: continue
        k = cols[0]
        k = replaceKeys.get(k, k)
        v = ' '.join(cols[1:]).strip()
        if not out.has_key(k): out[k] = []
        out[k].append(v)
    return out
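
# E.g.: the line 'BT: USD 1,000,000' with replaceKeys={'BT': 'budget'}
# produces {'budget': ['USD 1,000,000']}.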


# Functions used to manage data files.

def readMovieList():
    """Read the movies.list.gz file."""
    try: mdbf = SourceFile(MOVIES, start=MOVIES_START, stop=MOVIES_STOP)
    except IOError: return
    count = 0
    for line in mdbf:
        line_d = unpack(line, ('title', 'year'))
        title = line_d['title']
        yearData = None
        # Collect 'year' column for tv series' episodes.
        if title[-1:] == '}':
            yearData = [('episodesYear', line_d['year'])]
        mid = CACHE_MID.addUnique(title, yearData)
        if count % 10000 == 0:
            print 'SCANNING movies:', _(title),
            print '(movieID: %s)' % mid
        count += 1
    CACHE_MID.flush()
    CACHE_MID.episodesYear.clear()
    mdbf.close()


def doCast(fp, roleid, rolename):
    """Populate the cast table."""
    pid = None
    count = 0
    name = ''
    roleidVal = RawValue('roleID', roleid)
    sqldata = SQLData(table=CastInfo, cols=['personID', 'movieID',
                        'personRole', 'note', 'nrOrder', roleidVal])
    if rolename == 'miscellaneous crew': sqldata.flushEvery = 10000
    for line in fp:
        if line and line[0] != '\t':
            if line[0] == '\n': continue
            sl = filter(None, line.split('\t'))
            if len(sl) != 2: continue
            name, line = sl
            pid = CACHE_PID.addUnique(name.strip())
        line = line.strip()
        ll = line.split('  ')
        title = ll[0]
        note = None
        role = None
        order = None
        for item in ll[1:]:
            if not item: continue
            if item[0] == '[':
                role = item[1:-1]
            elif item[0] == '(':
                note = item
            elif item[0] == '<':
                textor = item[1:-1]
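                # The billing position is either a plain number or an
                # 'x,y,z' triple (apparently a hierarchical position),
                # packed into a single integer below.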
                try:
                    order = long(textor)
                except ValueError:
                    os = textor.split(',')
                    if len(os) == 3:
                        try:
                            order = ((long(os[2])-1) * 1000) + \
                                    ((long(os[1])-1) * 100) + (long(os[0])-1)
                        except ValueError:
                            pass
        movieid = CACHE_MID.addUnique(title)
        sqldata.add((pid, movieid, role, note, order))
        if count % 10000 == 0:
            print 'SCANNING %s:' % rolename,
            print _(name)
        count += 1
    sqldata.flush()
    print 'CLOSING %s...' % rolename


def castLists():
    """Read files listed in the 'role' column of the 'roletypes' table."""
    for rt in RoleType.select():
        roleid = rt.id
        rolename = fname = rt.role
        if rolename == 'guest':
            continue
        fname = fname.replace(' ', '-')
        if fname == 'actress': fname = 'actresses.list.gz'
        elif fname == 'miscellaneous-crew': fname = 'miscellaneous.list.gz'
        else: fname = fname + 's.list.gz'
        print 'DOING', fname
        try:
            f = SourceFile(fname, start=CAST_START, stop=CAST_STOP)
        except IOError:
            continue
        doCast(f, roleid, rolename)
        f.close()
        t('castLists(%s)' % rolename)


def doAkaNames():
    """People's akas."""
    pid = None
    count = 0
    try: fp = SourceFile('aka-names.list.gz', start=AKAN_START)
    except IOError: return
    sqldata = SQLData(table=AkaName, cols=['personID', 'name', 'imdbIndex',
                            'namePcodeCf', 'namePcodeNf', 'surnamePcode'])
    for line in fp:
        if line and line[0] != ' ':
            if line[0] == '\n': continue
            pid = CACHE_PID.addUnique(line.strip())
        else:
            line = line.strip()
            if line[:5] == '(aka ': line = line[5:]
            if line[-1:] == ')': line = line[:-1]
            try:
                name_dict = analyze_name(line)
            except IMDbParserError:
                if line: print 'WARNING: wrong name:', _(line)
                continue
            name = name_dict.get('name')
            namePcodeCf, namePcodeNf, surnamePcode = name_soundexes(name)
            sqldata.add((pid, name, name_dict.get('imdbIndex'),
                        namePcodeCf, namePcodeNf, surnamePcode))
            if count % 10000 == 0:
                print 'SCANNING akanames:', _(line)
            count += 1
    sqldata.flush()
    fp.close()


class AkasMoviesCache(MoviesCache):
    """A MoviesCache-like class used to populate the AkaTitle table."""
    className = 'AkasMoviesCache'
    counter = counter()

    def __init__(self, *args, **kdws):
        MoviesCache.__init__(self, *args, **kdws)
        self.flushEvery = 50000
        self.notes = {}
        self.ids = {}
        self.sqlstr, self.converter = createSQLstr(AkaTitle, ('id', 'movieID',
                            'title', 'imdbIndex', 'kindID', 'productionYear',
                            'phoneticCode', 'episodeOfID', 'seasonNr',
                            'episodeNr', 'note'))

    def flush(self, *args, **kwds):
        # Preserve consistency of ForeignKey.
        CACHE_MID.flush(quiet=1)
        super(AkasMoviesCache, self).flush(*args, **kwds)

    def _runCommand(self, dataList):
        new_dataList = []
        new_dataListapp = new_dataList.append
        while dataList:
            item = dataList.pop()
            # id used to store this entry.
            the_id = item[0]
            # id of the referred title.
            original_title_id = self.ids.get(the_id)
            new_item = [the_id, original_title_id]
            new_item += item[1:]
            new_item.append(self.notes.get(the_id))
            new_dataListapp(tuple(new_item))
        new_dataList.reverse()
        CURS.executemany(self.sqlstr, self.converter(new_dataList))
CACHE_MID_AKAS = AkasMoviesCache()


def doAkaTitles():
    """Movies' akas."""
    mid = None
    count = 0
    for fname, start in (('aka-titles.list.gz',AKAT_START),
                    ('italian-aka-titles.list.gz',AKAT_IT_START),
                    ('german-aka-titles.list.gz',AKAT_DE_START),
                    ('iso-aka-titles.list.gz',AKAT_ISO_START),
                    (os.path.join('contrib','hungarian-aka-titles.list.gz'),
                        AKAT_HU_START),
                    (os.path.join('contrib','norwegian-aka-titles.list.gz'),
                        AKAT_NO_START)):
        incontrib = 0
        pwarning = 1
        if start in (AKAT_HU_START, AKAT_NO_START):
            pwarning = 0
            incontrib = 1
        try:
            fp = SourceFile(fname, start=start,
                            stop='---------------------------',
                            pwarning=pwarning)
        except IOError:
            continue
        for line in fp:
            if line and line[0] != ' ':
                if line[0] == '\n': continue
                mid = CACHE_MID.addUnique(line.strip())
            else:
                res = unpack(line.strip(), ('title', 'note'))
                note = res.get('note')
                if incontrib:
                    if res.get('note'): note += ' '
                    else: note = ''
                    if start == AKAT_HU_START: note += '(Hungary)'
                    elif start == AKAT_NO_START: note += '(Norway)'
                akat = res.get('title', '')
                if akat[:5] == '(aka ': akat = akat[5:]
                if akat[-2:] in ('))', '})'): akat = akat[:-1]
                if count % 10000 == 0:
                    print 'SCANNING %s:' % fname[:-8].replace('-', ' '),
                    print _(akat)
                append_data = [('ids', mid)]
                if note is not None:
                    append_data.append(('notes', note))
                aka_id = CACHE_MID_AKAS.add(akat, append_data)
                count += 1
        fp.close()
    CACHE_MID_AKAS.flush()
    CACHE_MID_AKAS.clear()
    CACHE_MID_AKAS.notes.clear()
    CACHE_MID_AKAS.ids.clear()


def doMovieLinks():
    """Connections between movies."""
    mid = None
    count = 0
    sqldata = SQLData(table=MovieLink,
                cols=['movieID', 'linkedMovieID', 'linkTypeID'],
                flushEvery=10000)
    try: fp = SourceFile('movie-links.list.gz', start=LINK_START)
    except IOError: return
    for line in fp:
        if line and line[0] != ' ':
            if line[0] == '\n': continue
            title = line.strip()
            mid = CACHE_MID.addUnique(title)
            if count % 10000 == 0:
                print 'SCANNING movielinks:', _(title)
        else:
            line = line.strip()
            link_txt = unicode(line, 'utf_8').encode('ascii', 'replace')
            theid = None
            for k, lenkp1, v in MOVIELINK_IDS:
                if link_txt and link_txt[0] == '(' \
                        and link_txt[1:lenkp1+1] == k:
                    theid = v
                    break
            if theid is None: continue
            totitle = line[lenkp1+2:-1].strip()
            totitleid = CACHE_MID.addUnique(totitle)
            sqldata.add((mid, totitleid, theid))
        count += 1
    sqldata.flush()
    fp.close()


def minusHashFiles(fp, funct, defaultid, descr):
    """A file with lines starting with '# ' and '- '."""
    sqldata = SQLData(table=MovieInfo,
                        cols=['movieID', 'infoTypeID', 'info'])
    sqldata.flushEvery = 2500
    if descr == 'quotes': sqldata.flushEvery = 4000
    elif descr == 'soundtracks': sqldata.flushEvery = 3000
    elif descr == 'trivia': sqldata.flushEvery = 3000
    count = 0
    for title, text in fp.getByHashSections():
        title = title.strip()
        d = funct(text.split('\n'))
        mid = CACHE_MID.addUnique(title)
        if count % 5000 == 0:
            print 'SCANNING %s:' % descr,
            print _(title)
        for data in d:
            sqldata.add((mid, defaultid, data))
        count += 1
    sqldata.flush()


def doMinusHashFiles():
    """Files with lines starting with '# ' and '- '."""
    for fname, start in [('alternate versions',AV_START),
                         ('goofs',GOOFS_START), ('crazy credits',CC_START),
                         ('quotes',QUOTES_START),
                         ('soundtracks',SNDT_START),
                         ('trivia',TRIV_START)]:
        try:
            fp = SourceFile(fname.replace(' ', '-')+'.list.gz', start=start,
                        stop=MINHASH_STOP)
        except IOError:
            continue
        funct = _parseMinusList
        if fname == 'quotes': funct = getQuotes
        index = fname
        if index == 'soundtracks': index = 'soundtrack'
        minusHashFiles(fp, funct, INFO_TYPES[index], fname)
        fp.close()


def getTaglines():
    """Movie's taglines."""
    try: fp = SourceFile('taglines.list.gz', start=TAGL_START, stop=TAGL_STOP)
    except IOError: return
    sqldata = SQLData(table=MovieInfo,
                cols=['movieID', 'infoTypeID', 'info'],
                flushEvery=10000)
    count = 0
    for title, text in fp.getByHashSections():
        title = title.strip()
        mid = CACHE_MID.addUnique(title)
        for tag in text.split('\n'):
            tag = tag.strip()
            if not tag: continue
            if count % 10000 == 0:
                print 'SCANNING taglines:', _(title)
            sqldata.add((mid, INFO_TYPES['taglines'], tag))
        count += 1
    sqldata.flush()
    fp.close()


def getQuotes(lines):
    """Movie's quotes."""
    quotes = []
    qttl = []
    for line in lines:
        if line and line[:2] == '  ' and qttl and qttl[-1] and \
                not qttl[-1].endswith('::'):
            line = line.lstrip()
            if line: qttl[-1] += ' %s' % line
        elif not line.strip():
            if qttl: quotes.append('::'.join(qttl))
            qttl[:] = []
        else:
            line = line.lstrip()
            if line: qttl.append(line)
    if qttl: quotes.append('::'.join(qttl))
    return quotes
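
# E.g.: a quote whose line continues with a two-space indent, like:
#   DUDE: First line,
#     continued here.
# is joined into a single 'DUDE: First line, continued here.' entry;
# the lines of a single quote are then joined with '::'.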


_usd = '$'
_gbp = unichr(0x00a3).encode('utf_8')
_eur = unichr(0x20ac).encode('utf_8')
def getBusiness(lines):
    """Movie's business information."""
    bd = _parseColonList(lines, _bus)
    for k in bd.keys():
        nv = []
        for v in bd[k]:
            v = v.replace('USD ',_usd).replace('GBP ',_gbp).replace('EUR',_eur)
            nv.append(v)
        bd[k] = nv
    return bd


def getLaserDisc(lines):
    """Laserdisc information."""
    d = _parseColonList(lines, _ldk)
    for k, v in d.iteritems():
        d[k] = ' '.join(v)
    return d


def getLiterature(lines):
    """Movie's literature information."""
    return _parseColonList(lines, _lit)


_mpaa = {'RE': 'mpaa'}
def getMPAA(lines):
    """Movie's mpaa information."""
    d = _parseColonList(lines, _mpaa)
    for k, v in d.iteritems():
        d[k] = ' '.join(v)
    return d


def nmmvFiles(fp, funct, fname):
    """Files with sections separated by 'MV: ' or 'NM: '."""
    count = 0
    sqlsP = (PersonInfo, ['personID', 'infoTypeID', 'info', 'note'])
    sqlsM = (MovieInfo, ['movieID', 'infoTypeID', 'info', 'note'])

    if fname == 'biographies.list.gz':
        datakind = 'person'
        sqls = sqlsP
        guestid = RoleType.select(RoleType.q.role == 'guest')[0].id
        roleid = str(guestid)
        guestdata = SQLData(table=CastInfo,
                cols=['personID', 'movieID', 'personRole', 'note',
                RawValue('roleID', roleid)], flushEvery=10000)
        akanamesdata = SQLData(table=AkaName, cols=['personID', 'name',
                'imdbIndex', 'namePcodeCf', 'namePcodeNf', 'surnamePcode'])
    else:
        datakind = 'movie'
        sqls = sqlsM
        guestdata = None
        akanamesdata = None
    sqldata = SQLData(table=sqls[0], cols=sqls[1])
    if fname == 'plot.list.gz': sqldata.flushEvery = 1000
    elif fname == 'literature.list.gz': sqldata.flushEvery = 5000
    elif fname == 'business.list.gz': sqldata.flushEvery = 10000
    elif fname == 'biographies.list.gz': sqldata.flushEvery = 5000
    _ltype = type([])
    for ton, text in fp.getByNMMVSections():
        ton = ton.strip()
        if not ton: continue
        note = None
        if datakind == 'movie':
            mopid = CACHE_MID.addUnique(ton)
        else: mopid = CACHE_PID.addUnique(ton)
        if count % 6000 == 0:
            print 'SCANNING %s:' % fname[:-8].replace('-', ' '),
            print _(ton)
        d = funct(text.split('\n'))
        for k, v in d.iteritems():
            if k != 'notable tv guest appearances':
                theid = INFO_TYPES.get(k)
                if theid is None:
                    print 'WARNING key "%s" of ToN' % k,
                    print _(ton),
                    print 'not in INFO_TYPES'
                    continue
            if type(v) is _ltype:
                for i in v:
                    if k == 'notable tv guest appearances':
                        # Put "guest" information in the cast table; these
                        # are a list of Movie object (yes, imdb.Movie.Movie)
                        # FIXME: no more used?
                        title = i.get('long imdb canonical title')
                        if not title: continue
                        movieid = CACHE_MID.addUnique(title)
                        guestdata.add((mopid, movieid, i.currentRole or None,
                                        i.notes or None))
                        continue
                    if k in ('plot', 'mini biography'):
                        s = i.split('::')
                        if len(s) == 2:
                            if note: note += ' '
                            else: note = ''
                            note += '(author: %s)' % s[0]
                            i = s[1]
                    if i: sqldata.add((mopid, theid, i, note))
                    note = None
            else:
                if v: sqldata.add((mopid, theid, v, note))
            if k in ('nick names', 'birth name') and v:
                # Put also the birth name/nick names in the list of aliases.
                if k == 'birth name': realnames = [v]
                else: realnames = v
                for realname in realnames:
                    imdbIndex = re_nameImdbIndex.findall(realname) or None
                    if imdbIndex:
                        imdbIndex = imdbIndex[0]
                        realname = re_nameImdbIndex.sub('', realname)
                    if realname:
                        # XXX: check for duplicates?
                        ##if k == 'birth name':
                        ##    realname = canonicalName(realname)
                        ##else:
                        ##    realname = normalizeName(realname)
                        namePcodeCf, namePcodeNf, surnamePcode = \
                                    name_soundexes(realname)
                        akanamesdata.add((mopid, realname, imdbIndex,
                                    namePcodeCf, namePcodeNf, surnamePcode))
        count += 1
    if guestdata is not None: guestdata.flush()
    if akanamesdata is not None: akanamesdata.flush()
    sqldata.flush()


def doNMMVFiles():
    """Files with large sections, about movies and persons."""
    for fname, start, funct in [('biographies.list.gz',BIO_START,_parseBiography),
            ('business.list.gz',BUS_START,getBusiness),
            ('laserdisc.list.gz',LSD_START,getLaserDisc),
            ('literature.list.gz',LIT_START,getLiterature),
            ('mpaa-ratings-reasons.list.gz',MPAA_START,getMPAA),
            ('plot.list.gz',PLOT_START,getPlot)]:
    ##for fname, start, funct in [('business.list.gz',BUS_START,getBusiness)]:
        try:
            fp = SourceFile(fname, start=start)
        except IOError:
            continue
        if fname == 'literature.list.gz': fp.set_stop(LIT_STOP)
        elif fname == 'business.list.gz': fp.set_stop(BUS_STOP)
        nmmvFiles(fp, funct, fname)
        fp.close()
        t('doNMMVFiles(%s)' % fname[:-8].replace('-', ' '))


def doMiscMovieInfo():
    """Files with information on a single line about movies."""
    sqldata = SQLData(table=MovieInfo,
                cols=['movieID', 'infoTypeID', 'info', 'note'])
    for dataf in (('certificates.list.gz',CER_START),
                    ('color-info.list.gz',COL_START),
                    ('countries.list.gz',COU_START),
                    ('distributors.list.gz',DIS_START),
                    ('genres.list.gz',GEN_START),
                    ('keywords.list.gz',KEY_START),
                    ('language.list.gz',LAN_START),
                    ('locations.list.gz',LOC_START),
                    ('miscellaneous-companies.list.gz',MIS_START),
                    ('production-companies.list.gz',PRO_START),
                    ('running-times.list.gz',RUN_START),
                    ('sound-mix.list.gz',SOU_START),
                    ('special-effects-companies.list.gz',SFX_START),
                    ('technical.list.gz',TCN_START),
                    ('release-dates.list.gz',RELDATE_START)):
        try:
            fp = SourceFile(dataf[0], start=dataf[1])
        except IOError:
            continue
        typeindex = dataf[0][:-8].replace('-', ' ')
        if typeindex == 'running times': typeindex = 'runtimes'
        elif typeindex == 'technical': typeindex = 'tech info'
        elif typeindex == 'language': typeindex = 'languages'
        infoid = INFO_TYPES[typeindex]
        count = 0
        if dataf[0] in ('distributors.list.gz', 'locations.list.gz',
                        'miscellaneous-companies.list.gz'):
            sqldata.flushEvery = 10000
        else:
            sqldata.flushEvery = 20000
        for line in fp:
            data = unpack(line.strip(), ('title', 'info', 'note'))
            if not data.has_key('title'): continue
            if not data.has_key('info'): continue
            title = data['title']
            mid = CACHE_MID.addUnique(title)
            note = None
            if data.has_key('note'):
                note = data['note']
            if count % 10000 == 0:
                print 'SCANNING %s:' % dataf[0][:-8].replace('-', ' '),
                print _(data['title'])
            sqldata.add((mid, infoid, data['info'], note))
            count += 1
        sqldata.flush()
        fp.close()
        t('doMiscMovieInfo(%s)' % dataf[0][:-8].replace('-', ' '))


def getRating():
    """Movie's rating."""
    try: fp = SourceFile('ratings.list.gz', start=RAT_START, stop=RAT_STOP)
    except IOError: return
    sqldata = SQLData(table=MovieInfo, cols=['movieID', 'infoTypeID', 'info'])
    count = 0
    for line in fp:
        data = unpack(line, ('votes distribution', 'votes', 'rating', 'title'),
                        sep='  ')
        if not data.has_key('title'): continue
        title = data['title'].strip()
        mid = CACHE_MID.addUnique(title)
        if count % 10000 == 0:
            print 'SCANNING rating:', _(title)
        sqldata.add((mid, INFO_TYPES['votes distribution'],
                    data.get('votes distribution')))
        sqldata.add((mid, INFO_TYPES['votes'], data.get('votes')))
        sqldata.add((mid, INFO_TYPES['rating'], data.get('rating')))
        count += 1
    sqldata.flush()
    fp.close()


def getTopBottomRating():
    """Movie's rating, scanning for top 250 and bottom 100."""
    for what in ('top 250 rank', 'bottom 10 rank'):
        if what == 'top 250 rank': st = RAT_TOP250_START
        else: st = RAT_BOT10_START
        try: fp = SourceFile('ratings.list.gz', start=st, stop=TOPBOT_STOP)
        except IOError: break
        sqldata = SQLData(table=MovieInfo,
                    cols=['movieID',
                        RawValue('infoTypeID', str(INFO_TYPES[what])), 'info'])
        count = 1
        print 'SCANNING %s...' % what
        for line in fp:
            data = unpack(line, ('votes distribution', 'votes', 'rank',
                            'title'), sep='  ')
            if not data.has_key('title'): continue
            title = data['title'].strip()
            mid = CACHE_MID.addUnique(title)
            if what == 'top 250 rank': rank = count
            else: rank = 11 - count
            sqldata.add((mid, rank))
            count += 1
        sqldata.flush()
        fp.close()


def getPlot(lines):
    """Movie's plot."""
    plotl = []
    plotlappend = plotl.append
    plotltmp = []
    plotltmpappend = plotltmp.append
    for line in lines:
        linestart = line[:4]
        if linestart == 'PL: ':
            plotltmpappend(line[4:])
        elif linestart == 'BY: ':
            plotlappend('%s::%s' % (line[4:].strip(), ' '.join(plotltmp)))
            plotltmp[:] = []
    return {'plot': plotl}
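
# E.g.: the lines ['PL: First part.', 'PL: Second part.', 'BY: A. Author']
# produce {'plot': ['A. Author::First part. Second part.']}.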


def completeCast():
    """Movie's complete cast/crew information."""
    CCKind = {}
    for x in CompCastType.select():
        CCKind[x.kind] = x.id
    for fname, start in [('complete-cast.list.gz',COMPCAST_START),
                        ('complete-crew.list.gz',COMPCREW_START)]:
        try:
            fp = SourceFile(fname, start=start, stop=COMP_STOP)
        except IOError:
            continue
        if fname == 'complete-cast.list.gz': obj = 'cast'
        else: obj = 'crew'
        subID = str(CCKind[obj])
        sqldata = SQLData(table=CompleteCast,
                cols=['movieID', RawValue('subjectID', subID),
                'statusID'])
        count = 0
        for line in fp:
            ll = [x for x in line.split('\t') if x]
            if len(ll) != 2: continue
            title = ll[0]
            mid = CACHE_MID.addUnique(title)
            if count % 10000 == 0:
                print 'SCANNING %s:' % fname[:-8].replace('-', ' '),
                print _(title)
            sqldata.add((mid, str(CCKind[ll[1].lower().strip()])))
            count += 1
        fp.close()
        sqldata.flush()


# global instances
CACHE_MID = MoviesCache()
CACHE_PID = PersonsCache()



def _cmpfunc(x, y):
    """Comparison function used to sort a list of tuples by the length
    of their first item (longest first)."""
    lx = len(x[0])
    ly = len(y[0])
    if lx > ly: return -1
    elif lx < ly: return 1
    return 0
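
# E.g.: used to sort MOVIELINK_IDS so that longer link strings are tried
# first ('spin off from' has to be matched before 'spin off').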

INFO_TYPES = {}
MOVIELINK_IDS = []
KIND_IDS = {}
KIND_STRS = {}
CCAST_TYPES = {}

def readConstants():
    """Read constants from the database."""
    global INFO_TYPES, MOVIELINK_IDS, KIND_IDS, KIND_STRS, CCAST_TYPES

    for x in InfoType.select():
        INFO_TYPES[x.info] = x.id

    for x in LinkType.select():
        MOVIELINK_IDS.append((x.link, len(x.link), x.id))
    MOVIELINK_IDS.sort(_cmpfunc)

    for x in KindType.select():
        KIND_IDS[x.kind] = x.id
        KIND_STRS[x.id] = x.kind

    for x in CompCastType.select():
        CCAST_TYPES[x.kind] = x.id


# begin the iterations...
def run():
    print 'RUNNING imdbpy2sql.py'
    # Truncate the current database.
    print 'DROPPING current database...',
    sys.stdout.flush()
    dropTables()
    print 'done!'
    print 'CREATING new tables...',
    sys.stdout.flush()
    createTables()
    readConstants()
    print 'done!'
    t('dropping and recreating the database')

    # Populate the CACHE_MID instance.
    readMovieList()
    ##CACHE_MID.populate()
    ##CACHE_PID.populate()
    t('readMovieList()')


    # actors, actresses, directors, ....
    castLists()

    doAkaNames()
    t('doAkaNames()')
    doAkaTitles()
    t('doAkaTitles()')

    doMinusHashFiles()
    t('doMinusHashFiles()')

    doNMMVFiles()

    doMiscMovieInfo()
    doMovieLinks()
    t('doMovieLinks()')

    getRating()
    t('getRating()')
    getTaglines()
    t('getTaglines()')
    getTopBottomRating()
    t('getTopBottomRating()')
    completeCast()
    t('completeCast()')

    # Flush caches.
    CACHE_MID.flush()
    CACHE_MID.clear()
    CACHE_PID.flush()
    CACHE_PID.clear()
    t('flushing caches...')

    print 'TOTAL TIME TO INSERT DATA: %d minutes, %d seconds' % \
            divmod(int(time.time())-BEGIN_TIME, 60)

    print 'building database indexes (this may take a while)'
    sys.stdout.flush()
    createIndexes()
    t('createIndexes()')

    print 'DONE! (in %d minutes, %d seconds)' % \
            divmod(int(time.time())-BEGIN_TIME, 60)


_HEARD = 0
def _kdb_handler(signum, frame):
    """Die gracefully."""
    global _HEARD
    if _HEARD:
        print "EHI!  DON'T PUSH ME!  I'VE HEARD YOU THE FIRST TIME! :-)"
        return
    print 'INTERRUPT REQUEST RECEIVED FROM USER.  FLUSHING CACHES...'
    _HEARD = 1
    # XXX: trap _every_ error?
    try: CACHE_MID.flush()
    except IntegrityError: pass
    try: CACHE_PID.flush()
    except IntegrityError: pass
    print 'DONE! (in %d minutes, %d seconds)' % \
            divmod(int(time.time())-BEGIN_TIME, 60)
    sys.exit()


if __name__ == '__main__':
    import signal
    signal.signal(signal.SIGINT, _kdb_handler)
    run()
