[sr-dev] Re: Postoji li parser za ovako nešto?

Часлав Илић Fri, 27 May 2011 01:11:09 -0700

> [: Marw :]
> Makao sam zareze iz praznih redova i redove i skripta je odradila posao
> (00-000_SR-RS.csv).


Мхмх… није ми баш јасно откуд ти редови само са зарезом, али не делује
проблематично. Ево малчице допуњена скрипта да то аутоматски игнорише.

#!/usr/bin/env python
# -*- coding: UTF-8 -*-

# Optional helper module: the script carries on without it when the import
# fails for any reason.
# NOTE(review): presumably it adjusts the import path so that pology can be
# found when it is not installed system-wide -- confirm.
try:
    import fallback_import_paths
except:
    pass

import csv
import hashlib
import locale
import optparse
import os
import sys

from pology.catalog import Catalog
from pology.fsops import str_to_unicode
from pology.msgreport import warning_on_msg
from pology.report import warning, error
from pology.split import proper_words


def main ():
    """Parse the command line and run the requested operation.

    The first free argument selects the mode:

    - "extract POTFILE TABFILE": extract messages from the PO template and
      write them to one table file, or to several chunk files when the
      words-per-chunk option is given;
    - "inject TABFILE... POFILE": read one or more table files and put the
      translations they contain into the PO file.

    Usage is printed and the process exits with status 1 when the mode or
    the paths are missing. Other problems are reported through error().
    """

    locale.setlocale(locale.LC_ALL, "")

    usage = (
        "\n"
        "  %prog extract POTFILE TABFILE [OPTIONS]\n"
        "  %prog inject TABFILE... POFILE [OPTIONS]")
    desc = (
        "Simplistically extract a POT file into a monolanguage table file. "
        "Inject translated monolanguage table file into a PO file. "
        "Encoding of the table file is expected to be UTF-8 at all times, "
        "regardless of the PO file encoding.")
    ver = (
        u"%prog x1\n"
        u"Copyright © 2011, "
        u"Chusslove Illich (Часлав Илић) <[email protected]>")

    opars = optparse.OptionParser(usage=usage, description=desc, version=ver)
    _all_formats = ["csv", "html", "xls"]
    _def_format = "csv"
    _def_dialect = "excel"
    opars.add_option(
        "-d", "--dialect", metavar="DIALECT",
        action="store", dest="dialect", default=_def_dialect,
        help="The dialect for the CSV format "
             "(known: %s; default: %s)."
             % (", ".join(sorted(csv.list_dialects())), _def_dialect))
    opars.add_option(
        "-f", "--format", metavar="FORMAT",
        action="store", dest="format", default=_def_format,
        help="Format of the table file "
             "(known: %s; default: %s)."
             % (", ".join(_all_formats), _def_format))
    opars.add_option(
        "-t", "--translator", metavar="'NAME <EMAIL>'",
        action="store", dest="translator", default=None,
        help="Name and email address of the translator, "
             "to update the PO header on injection.")
    _min_words_per_chunk = 500
    opars.add_option(
        "-w", "--words-per-chunk", metavar="NUM",
        action="store", dest="words_per_chunk", default=None,
        help="Chunk output into several table files, "
             "such that each contains at most this many words (minimum: %d). "
             "Resulting files will have suffixes -000, -001, etc. "
             "inserted before the extension."
             % _min_words_per_chunk)

    options, free_args = opars.parse_args(str_to_unicode(sys.argv[1:]))

    # NOTE(review): the code below relies on error() terminating the
    # program after reporting -- confirm against pology.report.
    maxwordcnt = None
    if options.words_per_chunk:
        try:
            maxwordcnt = int(options.words_per_chunk)
        except ValueError:
            error("Words per chunk argument must be an integer.")
        if maxwordcnt < _min_words_per_chunk:
            error("Words per chunk argument must be at least %d."
                    % _min_words_per_chunk)

    # Without any free argument there is no mode to pop.
    if not free_args:
        opars.print_usage()
        sys.exit(1)
    mode = free_args.pop(0)
    paths = free_args
    if mode == "extract":
        if len(paths) != 2:
            opars.print_usage()
            sys.exit(1)
        popath, tabpath = paths
        # Word counts are collected only when chunking needs them.
        extdata = extract(popath, bool(maxwordcnt))
        if maxwordcnt:
            extchunks = chunk(extdata, maxwordcnt, tabpath)
        else:
            extchunks = [(extdata, tabpath)]
        for cextdata, ctabpath in extchunks:
            if options.format == "csv":
                write_csv(cextdata, ctabpath, dialect=options.dialect)
            elif options.format == "html":
                # Write this chunk only, not the complete data set.
                write_html(cextdata, ctabpath)
            elif options.format == "xls":
                write_xls(cextdata, ctabpath)
            else:
                error("Unknown format '%s' for the table file."
                      % options.format)
    elif mode == "inject":
        if len(paths) < 2:
            opars.print_usage()
            sys.exit(1)
        tabpaths = paths[:-1]
        popath = paths[-1]
        if options.format == "csv":
            injdata = read_csv(tabpaths, dialect=options.dialect)
        elif options.format == "html":
            # There is no HTML reader in this script yet.
            error("Injection from HTML not implemented yet.")
        elif options.format == "xls":
            injdata = read_xls(tabpaths)
        else:
            error("Unknown format '%s' for the table file." % options.format)
        inject(injdata, popath, translator=options.translator)
    else:
        error("Unknown operation mode '%s'." % mode)


def extract (filepath, wordcnt=False):
    """Collect the messages of a PO catalog for writing into a table file.

    Obsolete messages are skipped silently. Plural messages are skipped
    with a warning.

    @param filepath: path of the PO or POT file to open
    @param wordcnt: when true, each entry also carries the number of words
        found in the msgid by proper_words()
    @return: list of (message, hash) tuples, or (message, hash, word count)
        tuples when wordcnt is true
    """

    cat = Catalog(filepath, monitored=False)
    rows = []
    for msg in cat:
        if msg.obsolete:
            continue
        if msg.msgid_plural is not None:
            warning_on_msg(
                "Extraction of plural messages not implemented yet, "
                "skipping.", msg, cat)
            continue
        row = (msg, get_msg_hex(msg))
        if wordcnt:
            words = proper_words(msg.msgid, True, cat.accelerator(), msg.format)
            row += (len(words),)
        rows.append(row)

    return rows


def write_csv (extdata, filepath, dialect="excel"):
    """Write extracted messages into a CSV table file.

    Every row holds two fields: the message hash and the UTF-8 encoded
    msgid.

    @param extdata: iterable of (message, hash) pairs
    @param filepath: path of the CSV file to create or overwrite
    @param dialect: name of the csv module dialect to write with
    """

    # The context manager closes the file even when writing a row fails.
    with open(filepath, "w") as ofl:
        data = csv.writer(ofl, dialect=dialect)
        for msg, msghex in extdata:
            data.writerow([msghex, msg.msgid.encode("utf8")])


def write_xls (extdata, filepath):
    """Write extracted messages into an XLS workbook.

    The workbook gets a single sheet named "PO-extract". Each row holds the
    message hash in the first column and the msgid in the second.

    @param extdata: iterable of (message, hash) pairs
    @param filepath: path of the XLS file to save
    """

    # xlwt is imported here so that only the XLS format depends on it.
    # Catching Exception rather than everything lets Ctrl-C and sys.exit()
    # through.
    try:
        import xlwt
    except Exception:
        error("Cannot import Python module '%s'." % "xlwt")

    book = xlwt.Workbook(encoding="UTF-8")
    sheet = book.add_sheet("PO-extract")

    for i, (msg, msghex) in enumerate(extdata):
        sheet.write(i, 0, msghex)
        sheet.write(i, 1, msg.msgid)
    book.save(filepath)


# NOTE: Does not really work properly, added just for a quick test.
def write_html (extdata, filepath):
    """Write extracted messages into an XHTML file, one paragraph each.

    Every message becomes a <p> element whose id attribute is the message
    hash and whose content is the msgid. The msgid is XML-escaped so that
    characters such as "<" and "&" cannot break the document structure.
    The file is written UTF-8 encoded.

    @param extdata: iterable of (message, hash) pairs
    @param filepath: path of the HTML file to create or overwrite
    """

    import io
    from xml.sax.saxutils import escape

    lines = [
        u"<?xml version='1.0' encoding='UTF-8'?>",
        u"<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.0 Strict//EN' 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'>",
        u"<html>",
        u"<body>",
    ]
    for msg, msghex in extdata:
        lines.append(u"<p id=\"%s\">%s</p>" % (msghex, escape(msg.msgid)))
    # The trailing empty item makes the file end with a newline.
    lines.extend([u"</body>", u"</html>", u""])

    # Text is encoded once, at the file boundary; the context manager
    # closes the file even when writing fails.
    with io.open(filepath, "w", encoding="utf-8") as ofl:
        ofl.write(u"\n".join(lines))


def inject (injdata, filepath, translator=None):
    """Put translations read from table files into a PO file.

    Each catalog message whose hash is found in the input gets its first
    msgstr replaced and is unfuzzied, provided the input text is not empty.
    The catalog is then synced. Input rows that matched no catalog message
    are reported in one warning per input file.

    @param injdata: mapping from message hash to a
        (msgstr, table file path, row number) tuple; it is not modified
    @param filepath: path of the PO file to update
    @param translator: if not None, passed as name to the catalog's
        update_header() before syncing
    """

    cat = Catalog(filepath)
    # Work on a copy so that the caller's mapping stays intact; whatever is
    # left in it after the loop matched no message in the catalog.
    unmatched = dict(injdata)
    for msg in cat:
        msghex = get_msg_hex(msg)
        msgstr, fname, rno = unmatched.get(msghex, [None] * 3)
        if msgstr is None:
            # Does not apply, input may be one chunk (-w option).
            #warning_on_msg("Not found in input file.", msg, cat)
            continue
        del unmatched[msghex]
        # An empty translation leaves the message as it is.
        if msgstr:
            msg.msgstr[0] = msgstr
            msg.unfuzzy()
    if translator is not None:
        cat.update_header(name=translator)
    cat.sync()

    extras_by_file = {}
    for msgstr, fname, rno in unmatched.values():
        extras_by_file.setdefault(fname, []).append(rno)
    for fname, rnos in sorted(extras_by_file.items()):
        warning("Some messages from input file '%s' do not exist "
                "in the PO file, at rows: %s."
                % (fname, ", ".join(map(str, sorted(rnos)))))


def read_csv (filepaths, dialect="excel"):
    """Read translated messages from CSV table files.

    The first field of a row is the message hash and the second is the
    translation; any further fields are ignored. Rows with fewer than two
    fields (including completely blank lines, for which the reader yields
    an empty list) and rows with an empty hash are skipped. When a hash
    occurs more than once, the last row wins.

    @param filepaths: paths of the CSV files to read, in order
    @param dialect: name of the csv module dialect to read with
    @return: dictionary mapping message hash to a
        (msgstr, file path, 1-based row number) tuple
    """

    injdata = {}
    for filepath in filepaths:
        # The context manager closes the file even when parsing fails.
        with open(filepath) as ifl:
            for i, row in enumerate(csv.reader(ifl, dialect=dialect)):
                # Check the length first: row[0] does not exist for a
                # blank line.
                if len(row) < 2:
                    continue
                msghex = row[0].strip()
                if not msghex:
                    continue
                msgstr = row[1]
                # Fields that arrive as byte strings hold UTF-8 data;
                # fields that are already text need no decoding.
                if isinstance(msgstr, bytes):
                    msgstr = msgstr.decode("utf8")
                injdata[msghex] = (msgstr, filepath, i + 1)

    return injdata


def read_xls (filepaths):
    """Read translated messages from the first sheet of XLS workbooks.

    The first column of a row is the message hash and the second is the
    translation. Rows whose hash cell is empty are skipped, as read_csv()
    does, so that blank rows are not later reported as unknown messages.
    Sheets with fewer than two columns contribute nothing.

    @param filepaths: paths of the XLS files to read, in order
    @return: dictionary mapping message hash to a
        (msgstr, file path, 1-based row number) tuple
    """

    # xlrd is imported here so that only the XLS format depends on it.
    # Catching Exception rather than everything lets Ctrl-C and sys.exit()
    # through.
    try:
        import xlrd
    except Exception:
        error("Cannot import Python module '%s'." % "xlrd")

    injdata = {}
    for filepath in filepaths:
        book = xlrd.open_workbook(filepath)
        sheet = book.sheet_by_index(0)
        # Without a second column there is no translation to read.
        if sheet.ncols < 2:
            continue
        for i in range(sheet.nrows):
            msghex = sheet.cell(i, 0).value
            # NOTE(review): assumes an empty cell reads as an empty
            # string -- confirm against xlrd.
            if not msghex:
                continue
            msgstr = sheet.cell(i, 1).value
            injdata[msghex] = (msgstr, filepath, i + 1)

    return injdata


def get_msg_hex (msg):
    """Return the hexadecimal MD5 digest of the message's UTF-8 encoded key.

    @param msg: object with a text attribute named key
    @return: the digest as a string of hexadecimal digits
    """

    return hashlib.md5(msg.key.encode("utf8")).hexdigest()


def chunk (extdata, maxwordcnt, basepath):
    """Split extracted messages into chunks bounded by a word count.

    Messages are taken in order. A new chunk is started whenever adding
    the next message would push the running word count of the current
    chunk above maxwordcnt. A single message with more words than
    maxwordcnt still gets a chunk of its own. Chunk files are named after
    basepath with -000, -001, etc. inserted before the extension, or
    appended when basepath has no extension.

    @param extdata: sequence of (message, hash, word count) tuples
    @param maxwordcnt: maximum number of words per chunk
    @param basepath: path from which the chunk file paths are derived
    @return: list of (chunk data, chunk path) pairs, where chunk data is a
        list of (message, hash) pairs; empty when extdata is empty
    """

    # splitext() only looks for the extension in the last path component,
    # and returns an empty extension when there is none.
    root, ext = os.path.splitext(basepath)

    extchunks = []
    cextdata = []
    csumwordcnt = 0
    for msg, msghex, wordcnt in extdata:
        # Close the current chunk before it would overflow. An empty chunk
        # is never closed, so an oversized message cannot produce an
        # empty file.
        if cextdata and csumwordcnt + wordcnt > maxwordcnt:
            ctabpath = "%s-%03d%s" % (root, len(extchunks), ext)
            extchunks.append((cextdata, ctabpath))
            cextdata = []
            csumwordcnt = 0
        cextdata.append((msg, msghex))
        csumwordcnt += wordcnt
    # Flush what remains after the last message.
    if cextdata:
        ctabpath = "%s-%03d%s" % (root, len(extchunks), ext)
        extchunks.append((cextdata, ctabpath))

    return extchunks


# Run the command-line interface only when executed as a script, so that
# importing this module has no side effects.
if __name__ == "__main__":
    main()

signature.asc
Description: This is a digitally signed message part.

[sr-dev] Re: Postoji li parser za ovako nešto?

Одговори путем е-поште