[sr-dev] Re: Postoji li parser za ovako nešto?

Часлав Илић Sat, 21 May 2011 10:59:07 -0700

> [: Marw :]
> Calc mi izveze kolone pod navodnicima, pa onda skripta prijavljuje grešku.


Био сам на брзину ручно нашкрабао читање/писање ЦСВ-а, јер нисам видео да
већ постоји модул за то у стандардној библиотеци, и ето проблема. Сад сам
пребацио да користи тај модул, па ће ваљда Ексел глатко да чита оно што
скрипта направи, а скрипта глатко да чита оно што Ексел избаци. Тако бар
ради са Рачуном.

С друге стране, док сам ово горе извиђао, натрапао сам и на модуле са стране
xlrd и xlwt, за непосредно писање и читање Екселових датотека. Ако те не
мрзи да их инсталираш (тј. мање те мрзи него да идеш преко Ексела), можеш да
их скинеш одавде:

http://www.python-excel.org/

Онда скрипту извршаваш овако:

  extinj_po_tab.py extract file.po file.xls -f xls
  extinj_po_tab.py inject file.xls file.po -f xls -t 'Пера Перић <[email protected]>'

(ако се формат не зада опцијом -f, подразумева се ЦСВ).

Скрипти сам променио име у extinj_po_tab.py, пошто сада рукује са више
формата. Ону претходну можеш да бришеш.

> [: Marw :]
> Ne prepoznaje ih kao "placeables", već ih tretira kao najobičniji tekst.
> [...]
> Btw, ako neko hoće da eksperimentiše sa Wordfastom ima na:

Изгледа да уметке¹ издваја делимично према формату документа који се
преводи. Нпр. увек сматра {nekidjavo} за уметак, а кад учитам ХТМЛ документ,
онда додатно узима и ХТМЛ ознаке, <b>, <i>, итд. (Али нешто што није позната
ХТМЛ ознака третира као границу преводне јединице, нпр. <emph>, тако да није
употребљиво за документацију Опенофиса.)

Међутим, при отварању Екселовог фајла, понуди кориснику да унесе регуларни
израз за издвајање уметака. Прво изаберем „Advanced Wizard“, онда испразним
„use headers?“, онда изаберем колону B за превођење, и на последњој страници
у поље „Please enter Regular Expression.“ унесем:

  <[^>]*>

Није савршено решење, али види како иде, можда буде довољно за документацију
ОО-а.

[1] Привремени израз; сматрај се задуженим да смислиш превод за ’placeable’.
Не прихвата се више од једне речи, не прихвата се ’плејсабл’.

#!/usr/bin/env python
# -*- coding: UTF-8 -*-

try:
    import fallback_import_paths
except:
    pass

import csv
import hashlib
import locale
import os
import optparse
import sys

from pology.catalog import Catalog
from pology.fsops import str_to_unicode
from pology.msgreport import warning_on_msg
from pology.report import warning, error


def main ():
    """Parse the command line and run the requested operation.

    Exactly three positional arguments are required: the operation mode
    ("extract" or "inject"), the input file path and the output file path.
    The table format (-f), the CSV dialect (-d) and the translator (-t) are
    optional. Unknown modes and formats are reported through error().
    """

    locale.setlocale(locale.LC_ALL, "")

    usage = (
        "\n"
        "  %prog extract POTFILE TABFILE [OPTIONS]\n"
        "  %prog inject TABFILE POFILE [OPTIONS]")
    desc = (
        "Simplistically extract a POT file into a monolanguage table file. "
        "Inject translated monolanguage table file into a PO file. "
        "Encoding of the table file is expected to be UTF-8 at all times, "
        "regardless of the PO file encoding.")
    ver = (
        u"%prog x1\n"
        u"Copyright © 2011, "
        u"Chusslove Illich (Часлав Илић) <[email protected]>")

    opars = optparse.OptionParser(usage=usage, description=desc, version=ver)
    _all_formats = ["csv", "html", "xls"]
    _def_format = "csv"
    opars.add_option(
        "-f", "--format", metavar="FORMAT",
        action="store", dest="format", default=_def_format,
        help="Format of the table file "
             "(known: %s; default: %s)."
             % (", ".join(_all_formats), _def_format))
    _def_dialect = "excel"
    opars.add_option(
        "-d", "--dialect", metavar="DIALECT",
        action="store", dest="dialect", default=_def_dialect,
        help="The dialect for the CSV format "
             "(known: %s; default: %s)."
             % (", ".join(sorted(csv.list_dialects())), _def_dialect))
    opars.add_option(
        "-t", "--translator", metavar="'NAME <EMAIL>'",
        action="store", dest="translator", default=None,
        help="Name and email address of the translator, "
             "to update the PO header on injection.")

    options, free_args = opars.parse_args(str_to_unicode(sys.argv[1:]))
    if len(free_args) != 3:
        opars.print_usage()
        # sys.exit() instead of the bare exit(): the latter is a convenience
        # installed by the site module and is missing under "python -S".
        sys.exit(1)

    mode, inpath, outpath = free_args
    if mode == "extract":
        extdata = extract(inpath)
        if options.format == "csv":
            write_csv(extdata, outpath, dialect=options.dialect)
        elif options.format == "html":
            write_html(extdata, outpath)
        elif options.format == "xls":
            write_xls(extdata, outpath)
        else:
            error("Unknown format '%s' for the table file." % options.format)
    elif mode == "inject":
        if options.format == "csv":
            injdata = read_csv(inpath, dialect=options.dialect)
        elif options.format == "html":
            # There is no HTML reader in this file, so report and stop
            # rather than calling a function that does not exist.
            error("Injection from HTML not implemented yet.")
            # NOTE(review): error() presumably terminates the program;
            # the return only guards against reaching inject() without data.
            return
        elif options.format == "xls":
            injdata = read_xls(inpath)
        else:
            error("Unknown format '%s' for the table file." % options.format)
            return
        inject(injdata, outpath, translator=options.translator)
    else:
        error("Unknown operation mode '%s'." % mode)


def extract (filepath):
    """Collect the non-plural messages of a PO/POT file with their hashes.

    filepath -- path of the catalog to read

    Returns a list of (message, hash) pairs in catalog order. Messages that
    have a plural msgid are skipped, with a warning issued for each.
    """

    catalog = Catalog(filepath, monitored=False)
    extracted = []
    for message in catalog:
        if message.msgid_plural is None:
            extracted.append((message, get_msg_hex(message)))
        else:
            warning_on_msg(
                "Extraction of plural messages not implemented yet, "
                "skipping.", message, catalog)

    return extracted


def write_csv (msgs_hexes, filepath, dialect="excel"):
    """Write extracted messages to a CSV table file.

    Each row has two columns: the message hash and the UTF-8 encoded msgid.

    msgs_hexes -- iterable of (message, hash) pairs
    filepath   -- path of the CSV file to create (truncated if it exists)
    dialect    -- name of the csv module dialect used for writing
    """

    # The with-block closes the file even when encoding or writing
    # one of the rows raises.
    with open(filepath, "w") as ofl:
        writer = csv.writer(ofl, dialect=dialect)
        for msg, msghex in msgs_hexes:
            writer.writerow([msghex, msg.msgid.encode("utf8")])


def write_xls (msgs_hexes, filepath):
    """Write extracted messages to an Excel workbook.

    The workbook gets a single sheet named "PO-extract"; every message takes
    one row, with the message hash in the first column and the msgid in the
    second.

    msgs_hexes -- iterable of (message, hash) pairs
    filepath   -- path of the workbook to save
    """

    # xlwt is an optional third-party module, imported only when needed.
    # Only a failed import is reported; any other exception propagates.
    try:
        import xlwt
    except ImportError:
        error("Cannot import Python module '%s'." % "xlwt")

    book = xlwt.Workbook(encoding="UTF-8")
    sheet = book.add_sheet("PO-extract")

    for rowno, (msg, msghex) in enumerate(msgs_hexes):
        sheet.write(rowno, 0, msghex)
        sheet.write(rowno, 1, msg.msgid)
    book.save(filepath)


def write_html (msgs_hexes, filepath):
    """Write extracted messages to a UTF-8 XHTML file, one <p> per message.

    Every message becomes a paragraph whose id attribute is the message hash
    and whose content is the msgid, inserted verbatim (no HTML escaping is
    applied, so markup inside the msgid stays markup in the output).

    msgs_hexes -- iterable of (message, hash) pairs
    filepath   -- path of the HTML file to create
    """

    import io

    lines = [
        "<?xml version='1.0' encoding='UTF-8'?>",
        "<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.0 Strict//EN' 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'>",
        "<html>",
        "<body>",
    ]
    for msg, msghex in msgs_hexes:
        lines.append("<p id=\"%s\">%s</p>" % (msghex, msg.msgid))
    # The trailing empty item makes the joined text end with a newline.
    lines.extend(["</body>", "</html>", ""])

    # Build the whole document as Unicode text and let the file object
    # encode it once on write, instead of encoding line by line.
    text = u"\n".join(lines)
    with io.open(filepath, "w", encoding="utf-8") as ofl:
        ofl.write(text)


def inject (trs_by_hex, filepath, translator=None):
    """Insert translations from a table into a PO file and save it.

    trs_by_hex -- dict mapping message hash to a (translation, line number)
                  pair; entries matched to a PO message are removed from it,
                  so on return it holds only the unmatched ones
    filepath   -- path of the PO file to update
    translator -- if not None, passed as the name to the catalog's
                  header update

    A warning is issued for every PO message that has no entry in the table,
    and one collective warning for table entries that match no PO message.
    Empty translations are matched but leave the message unchanged.
    """

    cat = Catalog(filepath)
    for msg in cat:
        msghex = get_msg_hex(msg)
        msgstr, _lno = trs_by_hex.get(msghex, (None, None))
        if msgstr is None:
            warning_on_msg("Not found in input file.", msg, cat)
            continue
        # Consume the matched entry; whatever remains afterwards is
        # reported below as missing from the PO file.
        trs_by_hex.pop(msghex)
        if msgstr:
            msg.msgstr[0] = msgstr
            msg.unfuzzy()
    if translator is not None:
        # NOTE(review): the whole 'NAME <EMAIL>' string is passed as the
        # name -- confirm this is what update_header() expects.
        cat.update_header(name=translator)
    cat.sync()

    extra_lnos = sorted(lno for _tr, lno in trs_by_hex.values())
    if extra_lnos:
        warning("Some messages from input file do not exist in the PO file, "
                "at lines: %s."
                % ", ".join(map(str, extra_lnos)))


def read_csv (filepath, dialect="excel"):
    """Read a translated CSV table file.

    The first column of each row is taken as the message hash and the second
    as the UTF-8 encoded translation. Blank rows are skipped, and a row
    with no second column is read as an empty translation.

    filepath -- path of the CSV file to read
    dialect  -- name of the csv module dialect used for parsing

    Returns a dict mapping message hash to a (translation, row number) pair,
    where row numbers start at 1 and count blank rows too.
    """

    trs_by_hex = {}
    # The with-block closes the file even when a row fails to decode.
    with open(filepath) as ifl:
        for i, row in enumerate(csv.reader(ifl, dialect=dialect)):
            if not row:
                # The csv reader yields an empty list for a blank line.
                continue
            msghex = row[0]
            msgstr = row[1].decode("utf8") if len(row) > 1 else u""
            trs_by_hex[msghex] = (msgstr, i + 1)

    return trs_by_hex


def read_xls (filepath):
    """Read a translated table from the first sheet of an Excel workbook.

    The first column of each row is taken as the message hash and the second
    as the translation.

    filepath -- path of the workbook to read

    Returns a dict mapping message hash to a (translation, row number) pair,
    with row numbers starting at 1.
    """

    # xlrd is an optional third-party module, imported only when needed.
    # Only a failed import is reported; any other exception propagates.
    try:
        import xlrd
    except ImportError:
        error("Cannot import Python module '%s'." % "xlrd")

    book = xlrd.open_workbook(filepath)
    sheet = book.sheet_by_index(0)

    trs_by_hex = {}
    for i in xrange(sheet.nrows):
        msghex = sheet.cell(i, 0).value
        msgstr = sheet.cell(i, 1).value
        trs_by_hex[msghex] = (msgstr, i + 1)

    return trs_by_hex


def get_msg_hex (msg):
    """Return the hexadecimal MD5 digest of the message's UTF-8 encoded key.

    msg -- object whose key attribute is the text to hash
    """

    return hashlib.md5(msg.key.encode("utf8")).hexdigest()


# Run the command-line entry point only when executed as a script,
# not when the file is imported as a module.
if __name__ == "__main__":
    main()

signature.asc
Description: This is a digitally signed message part.

[sr-dev] Re: Postoji li parser za ovako nešto?

Одговори путем е-поште