[sr-dev] Re: PO u Wordfastu (nastavak)

Часлав Илић Sun, 29 May 2011 10:05:46 -0700

> [: Marw :]
> Mozilino SUMO sučelje:
>
> https://localize.mozilla.org/sr/sumo/
>
> Mislim da će onaj regularni izraz pokriti tagove


Одговарајући регуларни израз овде требало би да буде:

  <[^>]*>|%\(\w*\)[fds]|%[fds]|\{\w*\}

Поред ознака покрива и непроменљиве типа %(foobar)s, %s, {foobar}.

Ове ПО датотеке садрже и множинске поруке; приложене су дорађене скрипте које
умеју да рукују и множинским порукама.

Множинска порука је порука овог типа:

  #: apps/sumo/helpers.py:268
  msgid "%(number)d year ago"
  msgid_plural "%(number)d years ago"
  msgstr[0] ""
  msgstr[1] ""
  msgstr[2] ""

Треба је превести овако:

  #: apps/sumo/helpers.py:268
  msgid "%(number)d year ago"
  msgid_plural "%(number)d years ago"
  msgstr[0] "пре %(number)d годину"
  msgstr[1] "пре %(number)d године"
  msgstr[2] "пре %(number)d година"

Превод са индексом [0] користи се за све бројеве (тј. смене непроменљиве
%(number)d) који се завршавају на 1, осим оне на 11; индекс [1] за све
бројеве који се завршавају на 2, 3, 4, осим оне на 12, 13, 14; индекс [2] за
све остале бројеве.

У Вордфасту ћеш видети три раздвојена сегмента за сваку множинску поруку:

  {ut1}[0]|{ut2} years ago{ut3}
  {ut1}[1]|{ut2} years ago{ut3}
  {ut1}[2]|{ut2} years ago{ut3}

Овде су {ut1} и {ut3} оне фантомске непроменљиве које умеће сам Вордфаст. Не
рачунајући њих, сваки сегмент почиње префиксом [индекс]|. Тај префикс се
изоставља у преводу:

  {ut1}пре {ut2} годину{ut3}
  {ut1}пре {ut2} године{ut3}
  {ut1}пре {ut2} година{ut3}

#!/usr/bin/env python
# -*- coding: UTF-8 -*-

# Optional import of a local helper module; any failure while importing it
# is deliberately ignored and the script continues without it.
# NOTE(review): presumably the module adjusts the import path so that the
# pology package can be found -- confirm against fallback_import_paths.
try:
    import fallback_import_paths
except:
    pass

import csv
import hashlib
import locale
import optparse
import os
import sys

from pology.catalog import Catalog
from pology.fsops import str_to_unicode
from pology.msgreport import warning_on_msg
from pology.report import warning, error
from pology.split import proper_words


def main ():
    """
    Parse the command line and run the requested operation.

    Two operation modes are supported, selected by the first free argument:
    C{extract} (POT file to table file, optionally split into chunks)
    and C{inject} (one or more table files into a PO file).
    Problems with arguments are reported through C{error()}.
    """

    locale.setlocale(locale.LC_ALL, "")

    usage = (
        "\n"
        "  %prog extract POTFILE TABFILE [OPTIONS]\n"
        "  %prog inject TABFILE... POFILE [OPTIONS]")
    desc = (
        "Simplistically extract a POT file into a monolanguage table file. "
        "Inject translated monolanguage table file into a PO file. "
        "Encoding of the table file is expected to be UTF-8 at all times, "
        "regardless of the PO file encoding.")
    ver = (
        u"%prog x1\n"
        u"Copyright © 2011, "
        u"Chusslove Illich (Часлав Илић) <[email protected]>")

    opars = optparse.OptionParser(usage=usage, description=desc, version=ver)
    _all_formats = ["csv", "html", "xls"]
    _def_format = "csv"
    _def_dialect = "excel"
    opars.add_option(
        "-d", "--dialect", metavar="DIALECT",
        action="store", dest="dialect", default=_def_dialect,
        help="The dialect for the CSV format "
             "(known: %s; default: %s)."
             % (", ".join(sorted(csv.list_dialects())), _def_dialect))
    opars.add_option(
        "-f", "--format", metavar="FORMAT",
        action="store", dest="format", default=_def_format,
        help="Format of the table file "
             "(known: %s; default: %s)."
             % (", ".join(_all_formats), _def_format))
    opars.add_option(
        "-t", "--translator", metavar="'NAME <EMAIL>'",
        action="store", dest="translator", default=None,
        help="Name and email address of the translator, "
             "to update the PO header on injection.")
    _min_words_per_chunk = 500
    opars.add_option(
        "-w", "--words-per-chunk", metavar="NUM",
        action="store", dest="words_per_chunk", default=None,
        help="Chunk output into several table files, "
             "such that each contains at most this many words (minimum: %d). "
             "Resulting files will have suffixes -000, -001, etc. "
             "inserted before the extension."
             % _min_words_per_chunk)

    options, free_args = opars.parse_args(str_to_unicode(sys.argv[1:]))

    # NOTE(review): error() presumably terminates the program; the range
    # check right after the integer conversion relies on that -- confirm.
    maxwordcnt = None
    if options.words_per_chunk:
        try:
            maxwordcnt = int(options.words_per_chunk)
        except ValueError:
            error("Words per chunk argument must be an integer.")
        if maxwordcnt < _min_words_per_chunk:
            error("Words per chunk argument must be at least %d."
                    % _min_words_per_chunk)

    # Reject an unknown format up front, before any file is touched.
    if options.format not in _all_formats:
        error("Unknown format '%s' for the table file." % options.format)

    # Without any free argument there is no operation mode to pop.
    if not free_args:
        opars.print_usage()
        sys.exit(1)

    mode = free_args.pop(0)
    paths = free_args
    if mode == "extract":
        if len(paths) != 2:
            opars.print_usage()
            sys.exit(1)
        popath, tabpath = paths
        # Word counts are only computed when chunking was requested.
        extdata = extract(popath, bool(maxwordcnt))
        if maxwordcnt:
            extchunks = chunk(extdata, maxwordcnt, tabpath)
        else:
            extchunks = [(extdata, tabpath)]
        for cextdata, ctabpath in extchunks:
            if options.format == "csv":
                write_csv(cextdata, ctabpath, dialect=options.dialect)
            elif options.format == "html":
                # Write this chunk only, not the complete data set.
                write_html(cextdata, ctabpath)
            elif options.format == "xls":
                write_xls(cextdata, ctabpath)
    elif mode == "inject":
        if len(paths) < 2:
            opars.print_usage()
            sys.exit(1)
        tabpaths = paths[:-1]
        popath = paths[-1]
        injdata = None
        if options.format == "csv":
            injdata = read_csv(tabpaths, dialect=options.dialect)
        elif options.format == "html":
            # No HTML reader exists in this script.
            error("Injection from HTML not implemented yet.")
        elif options.format == "xls":
            injdata = read_xls(tabpaths)
        if injdata is not None:
            inject(injdata, popath, translator=options.translator)
    else:
        error("Unknown operation mode '%s'." % mode)


class ExtData:
    """
    Record for one extracted text segment.

    Instances start with all fields unset (C{None}); the extraction code
    fills in the hash sum, the source text and, on request, the word count.
    """

    # Template of the prefix that marks the plural form index of a segment.
    plfmt = "[%d]|"

    def __init__ (self):
        self.hashsum, self.srctext, self.wordcnt = None, None, None


def extract (filepath, wordcnt=False):
    """
    Collect source text segments from the catalog at C{filepath}.

    Obsolete messages are skipped. A non-plural message yields one segment
    holding its msgid. A plural message yields one segment per plural form
    of the catalog, each with the hash sum suffixed by C{-INDEX} and
    the text prefixed according to C{ExtData.plfmt}; the plural msgid is
    used for every form, except that the singular msgid is used at
    the index reported for number 1 when that index is among
    the catalog's single-number plural indices.

    @param filepath: path of the catalog to read
    @param wordcnt: whether to also compute the word count of each segment
    @returns: list of C{ExtData} objects, in catalog order
    """

    cat = Catalog(filepath, monitored=False)

    nplurals = cat.nplurals()
    unityidx = cat.plural_index(1)
    if unityidx not in cat.plural_indices_single():
        unityidx = None

    extdata = []
    for msg in cat:
        if msg.obsolete:
            continue

        isplural = msg.msgid_plural is not None
        if isplural:
            forms = [msg.msgid_plural] * nplurals
            if unityidx is not None:
                forms[unityidx] = msg.msgid
        else:
            forms = [msg.msgid]

        for idx, form in enumerate(forms):
            edt = ExtData()
            edt.hashsum = get_msg_hashsum(msg)
            edt.srctext = form
            if isplural:
                edt.hashsum = "%s-%d" % (edt.hashsum, idx)
                edt.srctext = "%s%s" % (ExtData.plfmt % idx, edt.srctext)
            if wordcnt:
                # Words are counted on the bare text, without index prefix.
                edt.wordcnt = len(proper_words(form, True, cat.accelerator(),
                                               msg.format))
            extdata.append(edt)

    return extdata


def write_csv (extdata, filepath, dialect="excel"):
    """
    Write extracted segments into a CSV file, one row per segment.

    Each row has two fields: the hash sum and the UTF-8 encoded source text.

    @param extdata: sequence of objects with C{hashsum} and C{srctext}
    @param filepath: path of the CSV file to create
    @param dialect: name of the CSV dialect to write in
    """

    # The Python 2 csv module expects a file opened in binary mode;
    # in text mode the row terminators get mangled on platforms that
    # translate newlines.
    ofl = open(filepath, "wb")
    try:
        data = csv.writer(ofl, dialect=dialect)
        for edt in extdata:
            data.writerow([edt.hashsum, edt.srctext.encode("utf8")])
    finally:
        # Release the handle even if a row fails to be written.
        ofl.close()


def write_xls (extdata, filepath):
    """
    Write extracted segments into an XLS workbook, one row per segment.

    The workbook has a single sheet named C{PO-extract}, with the hash sum
    in the first column and the source text in the second.
    Needs the C{xlwt} module; its absence is reported through C{error()}.

    @param extdata: sequence of objects with C{hashsum} and C{srctext}
    @param filepath: path of the workbook file to create
    """

    try:
        import xlwt
    except ImportError:
        error("Cannot import Python module '%s'." % "xlwt")

    book = xlwt.Workbook(encoding="UTF-8")
    sheet = book.add_sheet("PO-extract")

    for row, edt in enumerate(extdata):
        sheet.write(row, 0, edt.hashsum)
        sheet.write(row, 1, edt.srctext)
    book.save(filepath)


# NOTE: Does not really work properly, added just for a quick test.
def write_html (extdata, filepath):
    """
    Write extracted segments into an XHTML file, one paragraph per segment.

    Each segment becomes a C{<p>} element whose C{id} attribute is
    the hash sum and whose content is the UTF-8 encoded source text.

    @param extdata: sequence of objects with C{hashsum} and C{srctext}
    @param filepath: path of the XHTML file to create
    """

    enclines = [
        "<?xml version='1.0' encoding='UTF-8'?>",
        "<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.0 Strict//EN' 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'>",
        "<html>",
        "<body>",
    ]
    for edt in extdata:
        # NOTE(review): the source text is inserted verbatim, so markup
        # characters in messages end up as real markup in the output.
        # Left as is, since it is unclear whether that is relied upon.
        line = "<p id=\"%s\">%s</p>" % (edt.hashsum, edt.srctext)
        enclines.append(line.encode("utf8"))
    enclines.extend(["</body>", "</html>", ""])

    ofl = open(filepath, "w")
    try:
        ofl.write("\n".join(enclines))
    finally:
        # Release the handle even if writing fails.
        ofl.close()


class InjData:
    """
    Record for one translated text segment read from a table file.

    Instances start with all fields unset (C{None}); the table readers
    fill in the hash sum, the translated text, and the path and row number
    of the table file the segment came from.
    """

    def __init__ (self):
        self.hashsum, self.trntext = None, None
        self.filepath, self.filepos = None, None


def inject (injdata, filepath, translator=None):
    """
    Inject translated segments into the PO file at C{filepath}.

    For every message, the segments are looked up by the message hash sum
    (suffixed with C{-INDEX} for each form of a plural message).
    A message is updated and unfuzzied only if all of its expected forms
    are found: one form for a non-plural message, as many as the catalog
    has plural forms for a plural message. Segments that get matched are
    removed from C{injdata}; those left over at the end are reported in
    a warning, grouped by input file.

    @param injdata: dictionary of C{InjData} objects by hash sum;
        matched entries are removed from it
    @param filepath: path of the PO file to update
    @param translator: if not C{None}, passed as name to the header update
    """

    cat = Catalog(filepath)
    nplurals = cat.nplurals()
    for msg in cat:
        addidx = msg.msgid_plural is not None
        # A non-plural message always has exactly one translation,
        # regardless of the number of plural forms in the catalog.
        if addidx:
            nforms = nplurals
        else:
            nforms = 1
        basehashsum = get_msg_hashsum(msg)
        trntexts = []
        for i in range(nforms):
            hashsum = basehashsum
            if addidx:
                hashsum += "-%d" % i
            idt = injdata.pop(hashsum, None)
            if idt is None:
                # Not a warning: input may be just one chunk (-w option).
                break
            trntext = idt.trntext
            if addidx:
                # Drop the index prefix if the translator has kept it.
                plpref = ExtData.plfmt % i
                if trntext.startswith(plpref):
                    trntext = trntext[len(plpref):]
            trntexts.append(trntext)
        if len(trntexts) != nforms:
            continue
        msg.msgstr[:] = trntexts
        msg.unfuzzy()
    if translator is not None:
        cat.update_header(name=translator)
    cat.sync()

    # Whatever remains in the input did not match any message.
    extras_by_file = {}
    for idt in injdata.values():
        extras_by_file.setdefault(idt.filepath, []).append(idt.filepos)
    for fname, fposs in sorted(extras_by_file.items()):
        warning("Some messages from input file '%s' do not exist "
                "in the PO file, at rows: %s."
                % (fname, ", ".join(map(str, sorted(fposs)))))


def read_csv (filepaths, dialect="excel"):
    """
    Read translated segments from CSV files.

    The first field of a row is the hash sum and the second the UTF-8
    encoded translated text. Rows with fewer than two fields (including
    blank lines) or with an empty hash sum are skipped. If a hash sum
    repeats, the last row read wins.

    @param filepaths: paths of the CSV files to read
    @param dialect: name of the CSV dialect to read in
    @returns: dictionary of C{InjData} objects by hash sum
    """

    injdata = {}
    for filepath in filepaths:
        # The Python 2 csv module expects a file opened in binary mode.
        ifl = open(filepath, "rb")
        try:
            data = csv.reader(ifl, dialect=dialect)
            for i, row in enumerate(data):
                # The reader yields an empty list for a blank line.
                if len(row) < 2:
                    continue
                hashsum = row[0].strip()
                if not hashsum:
                    continue
                idt = InjData()
                idt.hashsum = hashsum
                idt.trntext = row[1].decode("utf8")
                idt.filepath = filepath
                idt.filepos = i + 1
                injdata[hashsum] = idt
        finally:
            # Release the handle even if a row fails to be parsed.
            ifl.close()

    return injdata


def read_xls (filepaths):
    """
    Read translated segments from XLS workbooks.

    Only the first sheet of each workbook is read, with the hash sum taken
    from the first column and the translated text from the second.
    Sheets with fewer than two columns and rows with an empty hash sum
    are skipped, like the corresponding rows are when reading CSV.
    Needs the C{xlrd} module; its absence is reported through C{error()}.

    @param filepaths: paths of the workbook files to read
    @returns: dictionary of C{InjData} objects by hash sum
    """

    try:
        import xlrd
    except ImportError:
        error("Cannot import Python module '%s'." % "xlrd")

    injdata = {}
    for filepath in filepaths:
        book = xlrd.open_workbook(filepath)
        sheet = book.sheet_by_index(0)
        if sheet.ncols < 2:
            # No translation column to read from.
            continue
        for i in range(sheet.nrows):
            hashsum = sheet.cell(i, 0).value
            if hasattr(hashsum, "strip"):
                hashsum = hashsum.strip()
            if not hashsum:
                continue
            idt = InjData()
            idt.hashsum = hashsum
            idt.trntext = sheet.cell(i, 1).value
            idt.filepath = filepath
            idt.filepos = i + 1
            injdata[idt.hashsum] = idt

    return injdata


def get_msg_hashsum (msg):
    """
    Return the hexadecimal MD5 digest of the UTF-8 encoded message key.

    @param msg: object with the C{key} attribute holding a text string
    @returns: the digest as string of hexadecimal digits
    """

    return hashlib.md5(msg.key.encode("utf8")).hexdigest()


def chunk (extdata, maxwordcnt, basepath):
    """
    Split extracted segments into consecutive chunks by word count.

    Segments are kept in order. A new chunk is started whenever adding
    the next segment would push the word count sum of the current chunk
    over C{maxwordcnt}; a segment which alone exceeds the limit gets
    a chunk of its own. Each chunk is paired with a file path derived
    from C{basepath} by inserting C{-000}, C{-001}, etc. before
    the file name extension (or at the end if there is no extension).

    @param extdata: sequence of objects with the C{wordcnt} attribute
    @param maxwordcnt: maximum sum of word counts per chunk
    @param basepath: file path from which to derive chunk file paths
    @returns: list of (chunk segments, chunk file path) tuples;
        empty if there are no segments
    """

    # Only the last path component is examined for the extension,
    # so dots in directory names do not confuse the split.
    root, ext = os.path.splitext(basepath)

    groups = []
    current = []
    cursum = 0
    for edt in extdata:
        if current and cursum + edt.wordcnt > maxwordcnt:
            groups.append(current)
            current = []
            cursum = 0
        current.append(edt)
        cursum += edt.wordcnt
    if current:
        groups.append(current)

    extchunks = []
    for i, group in enumerate(groups):
        extchunks.append((group, "%s-%03d%s" % (root, i, ext)))

    return extchunks


if __name__ == "__main__":
    main()

#!/usr/bin/env python
# -*- coding: UTF-8 -*-

# Optional import of a local helper module; any failure while importing it
# is deliberately ignored and the script continues without it.
# NOTE(review): presumably the module adjusts the import path so that the
# pology package can be found -- confirm against fallback_import_paths.
try:
    import fallback_import_paths
except:
    pass

import locale
import optparse
import os
import re
import sys
import time

from pology.catalog import Catalog
from pology.fsops import str_to_unicode, collect_catalogs
from pology.msgreport import warning_on_msg
from pology.report import warning, error


def main ():
    """
    Parse the command line and extract the Wordfast translation memory.

    The last two free arguments are the TM file path and the TM language
    code; all free arguments before them are paths from which catalogs
    are collected. Problems with arguments are reported through
    C{error()} or by printing the usage.
    """

    locale.setlocale(locale.LC_ALL, "")

    usage = (
        "\n"
        "  %prog POPATH... TMPATH TMLANG [OPTIONS]")
    desc = (
        "Extract Wordfast translation memory from a set of PO files.\n"
        "\n"
        "POPATH... can be any combination of file and directory paths. "
        "Directories will be searched recursively for PO files. "
        "TMPATH is the path to the TM file to be created. "
        "TMLANG is the language code in Wordfast TM format "
        "(e.g. DE-DE for German-Germany, PT-BR for Portuguese-Brazil).\n"
        "\n"
        "IMPORTANT NOTE: "
        "There was no documentation available for the Wordfast TM format, "
        "and its observed properties were rather strange in some aspects. "
        "It is not unlikely that this script will do the wrong thing when "
        "converting some PO messages, or even produce invalid TM file.")
    ver = (
        u"%prog x1\n"
        u"Copyright © 2011, "
        u"Chusslove Illich (Часлав Илић) <[email protected]>")

    opars = optparse.OptionParser(usage=usage, description=desc, version=ver)
    opars.add_option(
        "-W", "--wrap-kludge",
        action="store_true", dest="wrap_kludge", default=False,
        help="If there is at least one replaceable indicator in text, "
             "wrap the whole text with two more indicators.")
    opars.add_option(
        "-r", "--repl-regex", metavar="REGEX",
        action="store", dest="repl_regex", default=None,
        help="Regular expression for determining additional replaceables "
             "(note: in Python regex dialect, not Wordfast).")

    options, free_args = opars.parse_args(str_to_unicode(sys.argv[1:]))
    # At least one PO path, the TM path and the TM language are needed.
    if len(free_args) < 3:
        opars.print_usage()
        sys.exit(1)

    rplrx = None
    if options.repl_regex is not None:
        try:
            rplrx = re.compile(options.repl_regex, re.U)
        except re.error:
            error("Invalid regular expression '%s'." % options.repl_regex)

    # Trailing arguments are taken off from the end, in reverse order.
    tmlang = free_args.pop()
    tmpath = free_args.pop()
    popaths = collect_catalogs(free_args)

    extract_wordfast_tm(popaths, tmpath, tmlang,
                        rplrx=rplrx, wrapkludge=options.wrap_kludge)


# Index prefix for plural forms.
# NOTE: Must be same as in extinj_po_tab.py.
_plfmt = "[%d]|"

def extract_wordfast_tm (popaths, tmpath, tmlang,
                         rplrx=None, wrapkludge=False):
    """
    Write a Wordfast translation memory file from a set of PO files.

    Source-translation pairs are taken from translated messages, using
    the current source text, and from fuzzy messages that record
    a previous msgid, using the previous source text. Each form of
    a plural message makes a separate pair, with both sides prefixed
    by the plural index prefix. Duplicate pairs are written once, and
    pairs are written in sorted order. The file is encoded in UTF-16,
    with tab-separated fields, one header line and one line per pair.

    @param popaths: paths of the PO files to read
    @param tmpath: path of the TM file to create
    @param tmlang: target language code to write into the TM file
    @param rplrx: compiled regular expression matching additional
        replaceables, or C{None}
    @param wrapkludge: whether to wrap texts having replaceables with
        two more replaceable indicators
    """

    # Extract source-translation pairs from PO messages.
    srcs_trns = set()
    for popath in popaths:
        cat = Catalog(popath, monitored=False)

        nplurals = cat.nplurals()
        plunityidx = cat.plural_index(1)
        if plunityidx not in cat.plural_indices_single():
            plunityidx = None

        for msg in cat:
            # Select which source texts go with the current translation.
            if msg.translated:
                srcsing = msg.msgid
                srcplur = msg.msgid_plural
            elif msg.fuzzy and msg.msgid_previous is not None:
                srcsing = msg.msgid_previous
                srcplur = msg.msgid_plural_previous
            else:
                continue

            if msg.msgid_plural is None:
                srcs = [srcsing]
                trns = [msg.msgstr[0]]
            else:
                if srcplur is None:
                    # Fuzzy plural message whose previous source had
                    # no plural text: no source to pair plural forms with.
                    continue
                srcs = [srcplur] * nplurals
                if plunityidx is not None:
                    srcs[plunityidx] = srcsing
                # Pad missing translation forms with empty strings.
                trns = [u""] * nplurals
                cpnum = min(len(msg.msgstr), nplurals)
                trns[:cpnum] = msg.msgstr[:cpnum]
                for i in range(nplurals):
                    plpref = _plfmt % i
                    srcs[i] = plpref + srcs[i]
                    trns[i] = plpref + trns[i]

            for src, trn in zip(srcs, trns):
                src = text_to_wordfast_tm(src, rplrx, wrapkludge)
                trn = text_to_wordfast_tm(trn, rplrx, wrapkludge)
                srcs_trns.add((src, trn))

    # Write the TM file.
    timestr = time.strftime("%Y%m%d~%H%M%S")
    cmdname = os.path.basename(sys.argv[0])
    userid = "EP"
    srclang = "EN-US"
    sep = "\t"

    lines = []
    hdrflds = (
        "%%%s" % timestr,
        "%%User ID,%s,%s %s" % (userid, userid, cmdname),
        "%TU=00000000",
        "%%%s" % srclang,
        "%Wordfast TM v.546/00",
        "%%%s" % tmlang,
        "%-----------",
        "", "", "", "",
    )
    lines.append(sep.join(hdrflds))
    for src, trn in sorted(srcs_trns):
        flds = (
            timestr,
            userid,
            "0",
            srclang,
            src,
            tmlang,
            trn,
            "", "", "", "",
        )
        lines.append(sep.join(flds))
    lines.append("")

    fcnt = "\n".join(lines).encode("utf16")
    # The content is already encoded, so the file must be binary:
    # newline translation of a text-mode file would break the UTF-16
    # byte pairs.
    ofl = open(tmpath, "wb")
    try:
        ofl.write(fcnt)
    finally:
        ofl.close()


# Characters which are written as entities in the TM file.
# FIXME: Likely incomplete.
_chars_ents = (
    (u"&", "&'26;"), # must be first
    (u"®", "&'AE;"),
    (u"©", "&'A9;"),
)

# Basic replaceables.
# The order matters: joined by | regex operator.
_base_rpl_rxstrs = (
    "[\t\n]+",
)
_base_rplrx = re.compile("|".join(_base_rpl_rxstrs), re.U)


def _split_on_matches (rx, text):
    """Split text at non-empty matches of the regex, dropping the matches."""

    # Unlike with the split method of the regex, capturing groups in
    # the pattern do not add anything to the result.
    segs = []
    pos = 0
    for m in rx.finditer(text):
        if m.end() == m.start():
            continue
        segs.append(text[pos:m.start()])
        pos = m.end()
    segs.append(text[pos:])
    return segs


def _escape_chars (text):
    """Replace characters listed in C{_chars_ents} with their entities."""

    for char, ent in _chars_ents:
        text = text.replace(char, ent)
    return text


def text_to_wordfast_tm (text, rplrx=None, wrapkludge=False):
    """
    Convert text into the form in which it is written into the TM file.

    The text is split at replaceables: matches of C{rplrx} if given,
    and runs of tabs and newlines. Every replaceable is substituted with
    an indicator (C{&tA;}, C{&tB;}, etc. in order of appearance), and
    special characters in the text between them are written as entities.

    @param text: the text to convert
    @param rplrx: compiled regular expression matching additional
        replaceables, or C{None}
    @param wrapkludge: whether to add one more indicator at the start and
        one at the end of text which has at least one replaceable
    @returns: the converted text
    """

    if rplrx is not None:
        segs = _split_on_matches(rplrx, text)
    else:
        segs = [text]
    segs1 = []
    for seg in segs:
        segs1.extend(_split_on_matches(_base_rplrx, seg))
    segs = segs1

    if wrapkludge:
        # If there is at least one replaceable indicator in text,
        # add two more outer indicators.
        if len(segs) > 1:
            segs.insert(0, "")
            segs.append("")

    rsegs = []
    ilast = len(segs) - 1
    for i, seg in enumerate(segs):
        # Every segment is escaped, including the final one.
        rsegs.append(_escape_chars(seg))
        if i == ilast:
            break
        if i < 26:
            rplind = "&t%s;" % chr(ord("A") + i)
        else:
            # FIXME: Not really true... or maybe yes.
            rplind = "&t{=ut%d};" % (ord("A") + i)
        rsegs.append(rplind)

    rtext = "".join(rsegs)
    return rtext


if __name__ == "__main__":
    main()

signature.asc
Description: This is a digitally signed message part.

[sr-dev] Re: PO u Wordfastu (nastavak)

Одговори путем е-поште