> [: Marw :] > Mozilino SUMO sučelje: > > https://localize.mozilla.org/sr/sumo/ > > Mislim da će onaj regularni izraz pokriti tagove
Одговарајући регуларни израз овде требало би да буде:
<[^>]*>|%\(\w*\)[fds]|%[fds]|\{\w*\}
Поред ознака покрива и непроменљиве типа %(foobar)s, %s, {foobar}.
Ове ПО датотеке садрже и множинске поруке; приложене су дорађене скрипте које
умеју да рукују и множинским порукама.
Множинска порука је порука овог типа:
#: apps/sumo/helpers.py:268
msgid "%(number)d year ago"
msgid_plural "%(number)d years ago"
msgstr[0] ""
msgstr[1] ""
msgstr[2] ""
Треба је превести овако:
#: apps/sumo/helpers.py:268
msgid "%(number)d year ago"
msgid_plural "%(number)d years ago"
msgstr[0] "пре %(number)d годину"
msgstr[1] "пре %(number)d године"
msgstr[2] "пре %(number)d година"
Превод са индексом [0] користи се за све бројеве (тј. смене непроменљиве
%(number)d) који се завршавају на 1, осим оних на 11; индекс [1] за све
бројеве који се завршавају на 2, 3, 4, осим оних на 12, 13, 14; индекс [2] за
све остале бројеве.
У Вордфасту ћеш видети три раздвојена сегмента за сваку множинску поруку:
{ut1}[0]|{ut2} years ago{ut3}
{ut1}[1]|{ut2} years ago{ut3}
{ut1}[2]|{ut2} years ago{ut3}
Овде су {ut1} и {ut3} оне фантомске непроменљиве које умеће сам Вордфаст. Не
рачунајући њих, сваки сегмент почиње префиксом [индекс]|. Тај префикс се
изоставља у преводу:
{ut1}пре {ut2} годину{ut3}
{ut1}пре {ut2} године{ut3}
{ut1}пре {ut2} година{ut3}
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
try:
import fallback_import_paths
except:
pass
import csv
import hashlib
import locale
import optparse
import os
import sys
from pology.catalog import Catalog
from pology.fsops import str_to_unicode
from pology.msgreport import warning_on_msg
from pology.report import warning, error
from pology.split import proper_words
def main():
    """Parse the command line and run the ``extract`` or ``inject`` operation.

    ``extract POTFILE TABFILE`` writes the messages of a PO(T) file into a
    table file (CSV, HTML or XLS), optionally split into word-limited chunks.
    ``inject TABFILE... POFILE`` reads translated table files and stores the
    translations into the PO file.

    NOTE(review): ``error()`` from ``pology.report`` is assumed to terminate
    the program; the control flow below relies on that, as the original did.
    """
    locale.setlocale(locale.LC_ALL, "")

    usage = (
        "\n"
        " %prog extract POTFILE TABFILE [OPTIONS]\n"
        " %prog inject TABFILE... POFILE [OPTIONS]")
    desc = (
        "Simplistically extract a POT file into a monolanguage table file. "
        "Inject translated monolanguage table file into a PO file. "
        "Encoding of the table file is expected to be UTF-8 at all times, "
        "regardless of the PO file encoding.")
    ver = (
        u"%prog x1\n"
        u"Copyright © 2011, "
        u"Chusslove Illich (Часлав Илић) <[email protected]>")
    opars = optparse.OptionParser(usage=usage, description=desc, version=ver)

    _all_formats = ["csv", "html", "xls"]
    _def_format = "csv"
    _def_dialect = "excel"
    opars.add_option(
        "-d", "--dialect", metavar="DIALECT",
        action="store", dest="dialect", default=_def_dialect,
        help="The dialect for the CSV format "
             "(known: %s; default: %s)."
             % (", ".join(sorted(csv.list_dialects())), _def_dialect))
    opars.add_option(
        "-f", "--format", metavar="FORMAT",
        action="store", dest="format", default=_def_format,
        help="Format of the table file "
             "(known: %s; default: %s)."
             % (", ".join(_all_formats), _def_format))
    opars.add_option(
        "-t", "--translator", metavar="'NAME <EMAIL>'",
        action="store", dest="translator", default=None,
        help="Name and email address of the translator, "
             "to update the PO header on injection.")
    _min_words_per_chunk = 500
    opars.add_option(
        "-w", "--words-per-chunk", metavar="NUM",
        action="store", dest="words_per_chunk", default=None,
        help="Chunk output into several table files, "
             "such that each contains at most this many words (minimum: %d). "
             "Resulting files will have suffixes -000, -001, etc. "
             "inserted before the extension."
             % _min_words_per_chunk)

    options, free_args = opars.parse_args(str_to_unicode(sys.argv[1:]))

    # Validate the chunk size; None means "do not chunk".
    maxwordcnt = None
    if options.words_per_chunk:
        try:
            maxwordcnt = int(options.words_per_chunk)
        except ValueError:
            error("Words per chunk argument must be an integer.")
        if maxwordcnt < _min_words_per_chunk:
            error("Words per chunk argument must be at least %d."
                  % _min_words_per_chunk)

    # The operation mode is mandatory; show usage instead of crashing
    # with IndexError when nothing was given.
    if not free_args:
        opars.print_usage()
        sys.exit(1)
    mode = free_args.pop(0)
    paths = free_args

    if mode == "extract":
        if len(paths) != 2:
            opars.print_usage()
            sys.exit(1)
        popath, tabpath = paths
        # Word counts are only needed when the output is to be chunked.
        extdata = extract(popath, bool(maxwordcnt))
        if maxwordcnt:
            extchunks = chunk(extdata, maxwordcnt, tabpath)
        else:
            extchunks = [(extdata, tabpath)]
        for cextdata, ctabpath in extchunks:
            if options.format == "csv":
                write_csv(cextdata, ctabpath, dialect=options.dialect)
            elif options.format == "html":
                # Write this chunk only (the full data set used to be
                # written into every chunk file).
                write_html(cextdata, ctabpath)
            elif options.format == "xls":
                write_xls(cextdata, ctabpath)
            else:
                error("Unknown format '%s' for the table file."
                      % options.format)

    elif mode == "inject":
        if len(paths) < 2:
            opars.print_usage()
            sys.exit(1)
        tabpaths = paths[:-1]
        popath = paths[-1]
        if options.format == "csv":
            injdata = read_csv(tabpaths, dialect=options.dialect)
        elif options.format == "html":
            error("Injection from HTML not implemented yet.")
            # NOTE(review): read_html is not defined in this file; this
            # line is only unreachable if error() exits as assumed.
            injdata = read_html(tabpaths)
        elif options.format == "xls":
            injdata = read_xls(tabpaths)
        else:
            error("Unknown format '%s' for the table file." % options.format)
        inject(injdata, popath, translator=options.translator)

    else:
        error("Unknown operation mode '%s'." % mode)
class ExtData:
    """One extracted table row: a message (or one plural form of it)."""

    # Prefix prepended to the source text of plural forms; takes the
    # plural form index.
    plfmt = "[%d]|"

    def __init__(self):
        # hashsum: row identifier, srctext: source text,
        # wordcnt: word count of the source text (set only on request).
        self.hashsum = self.srctext = self.wordcnt = None
def extract(filepath, wordcnt=False):
    """Collect table rows (``ExtData``) from the PO file at ``filepath``.

    Obsolete messages are skipped. A non-plural message yields one row.
    A plural message yields ``nplurals`` rows, each with ``-<index>``
    appended to the hash and the ``ExtData.plfmt`` prefix prepended to the
    source text. When ``wordcnt`` is true, each row also gets the number of
    words in its source text, as counted by ``proper_words``.

    Returns the list of rows in catalog order.
    """
    cat = Catalog(filepath, monitored=False)
    nplurals = cat.nplurals()

    # Plural form that gets the singular msgid as its source text.
    # NOTE(review): presumably plural_index(1) is the form used for the
    # number 1 and plural_indices_single() lists forms covering a single
    # number only -- confirm against the Pology API.
    unityidx = cat.plural_index(1)
    if unityidx not in cat.plural_indices_single():
        unityidx = None

    rows = []
    for msg in cat:
        if msg.obsolete:
            continue

        is_plural = msg.msgid_plural is not None
        if is_plural:
            srctexts = [msg.msgid_plural] * nplurals
            if unityidx is not None:
                srctexts[unityidx] = msg.msgid
        else:
            srctexts = [msg.msgid]

        basehash = get_msg_hashsum(msg)
        for idx, srctext in enumerate(srctexts):
            row = ExtData()
            if is_plural:
                row.hashsum = "%s-%d" % (basehash, idx)
                row.srctext = "%s%s" % (ExtData.plfmt % idx, srctext)
            else:
                row.hashsum = basehash
                row.srctext = srctext
            if wordcnt:
                # Count words in the bare source text, without the prefix.
                row.wordcnt = len(proper_words(srctext, True,
                                               cat.accelerator(), msg.format))
            rows.append(row)

    return rows
def write_csv(extdata, filepath, dialect="excel"):
    """Write rows as two-column CSV (hash, UTF-8 encoded source text).

    ``extdata`` is a sequence of objects with ``hashsum`` and ``srctext``
    attributes; ``dialect`` is passed to ``csv.writer``.
    """
    # Binary mode is what the Python 2 csv module expects; the with-block
    # closes the file even if writing a row fails.
    with open(filepath, "wb") as ofl:
        writer = csv.writer(ofl, dialect=dialect)
        for edt in extdata:
            writer.writerow([edt.hashsum, edt.srctext.encode("utf8")])
def write_xls(extdata, filepath):
    """Write rows into the first two columns of an XLS sheet "PO-extract".

    Column 0 gets the hash, column 1 the source text. Requires the
    ``xlwt`` module, which is imported on demand.
    """
    try:
        import xlwt
    except ImportError:
        # Only a missing module is reported here; any other failure while
        # importing propagates with its own traceback.
        error("Cannot import Python module '%s'." % "xlwt")

    book = xlwt.Workbook(encoding="UTF-8")
    sheet = book.add_sheet("PO-extract")
    for rowno, edt in enumerate(extdata):
        sheet.write(rowno, 0, edt.hashsum)
        sheet.write(rowno, 1, edt.srctext)
    book.save(filepath)
# NOTE: Does not really work properly, added just for a quick test.
def write_html(extdata, filepath):
    """Write rows as an XHTML document, one ``<p id="HASH">TEXT</p>`` per row.

    ``extdata`` is a sequence of objects with ``hashsum`` and ``srctext``
    attributes. Paragraph lines are UTF-8 encoded before writing.

    NOTE(review): the source text is inserted without HTML escaping, so
    markup or ``&`` in messages will end up as raw markup in the document.
    """
    enclines = [
        "<?xml version='1.0' encoding='UTF-8'?>",
        "<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.0 Strict//EN' 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'>",
        "<html>",
        "<body>",
    ]
    for edt in extdata:
        line = "<p id=\"%s\">%s</p>" % (edt.hashsum, edt.srctext)
        enclines.append(line.encode("utf8"))
    # The trailing empty element makes the file end with a newline.
    enclines.extend(["</body>", "</html>", ""])

    # The handle is no longer named "csv" (that shadowed the module), and
    # the joined text is written in one call.
    with open(filepath, "w") as ofl:
        ofl.write("\n".join(enclines))
class InjData:
    """One table row read back for injection."""

    def __init__(self):
        # hashsum: row identifier, trntext: translated text.
        self.hashsum = self.trntext = None
        # filepath/filepos: table file and 1-based row the entry came from.
        self.filepath = self.filepos = None
def inject(injdata, filepath, translator=None):
    """Store translations from ``injdata`` into the PO file at ``filepath``.

    ``injdata`` maps row hashes to ``InjData`` objects; entries that get used
    are removed from it. A message is updated (and unfuzzied) only when
    translations for all of its forms are present: one for a non-plural
    message, ``nplurals`` for a plural one. Messages with missing rows are
    silently left alone, since the input may be just one chunk of the
    extraction (-w option).

    If ``translator`` is given, the PO header is updated with it. After the
    catalog is synced, a warning is issued per input file listing rows whose
    hashes matched no message.
    """
    cat = Catalog(filepath)
    nplurals = cat.nplurals()

    for msg in cat:
        is_plural = msg.msgid_plural is not None
        # Number of rows extraction produced for this message. Comparing
        # against nplurals for every message (as was done before) skipped
        # all non-plural messages whenever nplurals != 1.
        nforms = nplurals if is_plural else 1
        basehash = get_msg_hashsum(msg)

        trntexts = []
        for i in range(nforms):
            hashsum = basehash
            if is_plural:
                hashsum += "-%d" % i
            idt = injdata.pop(hashsum, None)
            if idt is None:
                # Does not apply, input may be one chunk (-w option).
                #warning_on_msg("Not found in input file.", msg, cat)
                break
            trntext = idt.trntext
            if is_plural:
                # Drop the index prefix if the translator left it in place.
                plpref = ExtData.plfmt % i
                if trntext.startswith(plpref):
                    trntext = trntext[len(plpref):]
            trntexts.append(trntext)

        if not trntexts or len(trntexts) != nforms:
            continue
        msg.msgstr[:] = trntexts
        msg.unfuzzy()

    if translator is not None:
        cat.update_header(name=translator)
    cat.sync()

    # Whatever is still in injdata matched no message; report by file.
    extras_by_file = {}
    for idt in injdata.values():
        extras_by_file.setdefault(idt.filepath, []).append(idt.filepos)
    for fname, fposs in sorted(extras_by_file.items()):
        warning("Some messages from input file '%s' do not exist "
                "in the PO file, at rows: %s."
                % (fname, ", ".join(map(str, sorted(fposs)))))
def read_csv(filepaths, dialect="excel"):
    """Read translated rows from CSV files into a hash -> ``InjData`` dict.

    Each usable row has the hash in the first column and the UTF-8 encoded
    translation in the second. Rows with fewer than two columns (including
    blank lines) or with an empty hash are skipped. If a hash repeats, the
    last occurrence wins. ``filepos`` of each entry is the 1-based row number.
    """
    injdata = {}
    for filepath in filepaths:
        # Binary mode is what the Python 2 csv module expects; the
        # with-block closes the file even if a row fails to decode.
        with open(filepath, "rb") as ifl:
            for i, row in enumerate(csv.reader(ifl, dialect=dialect)):
                # Blank lines come out as empty rows; indexing row[0]
                # on them used to raise IndexError.
                if len(row) < 2:
                    continue
                hashsum = row[0].strip()
                if not hashsum:
                    continue
                idt = InjData()
                idt.hashsum = hashsum
                idt.trntext = row[1].decode("utf8")
                idt.filepath = filepath
                idt.filepos = i + 1
                injdata[hashsum] = idt
    return injdata
def read_xls(filepaths):
    """Read translated rows from XLS files into a hash -> ``InjData`` dict.

    The first sheet of each workbook is read: column 0 holds the hash,
    column 1 the translation. Sheets with fewer than two columns and rows
    with an empty hash cell are skipped, in line with ``read_csv``.
    ``filepos`` of each entry is the 1-based row number. Requires the
    ``xlrd`` module, which is imported on demand.
    """
    try:
        import xlrd
    except ImportError:
        # Only a missing module is reported here; any other failure while
        # importing propagates with its own traceback.
        error("Cannot import Python module '%s'." % "xlrd")

    injdata = {}
    for filepath in filepaths:
        book = xlrd.open_workbook(filepath)
        sheet = book.sheet_by_index(0)
        if sheet.ncols < 2:
            # No translation column; sheet.cell(i, 1) would raise.
            continue
        for i in xrange(sheet.nrows):
            hashsum = sheet.cell(i, 0).value
            if not hashsum:
                continue
            idt = InjData()
            idt.hashsum = hashsum
            idt.trntext = sheet.cell(i, 1).value
            idt.filepath = filepath
            idt.filepos = i + 1
            injdata[hashsum] = idt
    return injdata
def get_msg_hashsum(msg):
    """Return the hexadecimal MD5 digest of the UTF-8 encoded ``msg.key``."""
    return hashlib.md5(msg.key.encode("utf8")).hexdigest()
def chunk(extdata, maxwordcnt, basepath):
    """Split rows into consecutive chunks of at most ``maxwordcnt`` words.

    ``extdata`` is a sequence of objects with a ``wordcnt`` attribute.
    Rows are never reordered or split; a single row with more words than
    ``maxwordcnt`` gets a chunk of its own.

    Returns a list of ``(rows, path)`` tuples, where ``path`` is
    ``basepath`` with ``-000``, ``-001``, ... inserted before the extension.
    Empty input yields an empty list.
    """
    # splitext only looks at the file name, so a dot in a directory name
    # is not taken for an extension, and a path without any extension
    # works (the old code hit an undefined name in that case).
    root, ext = os.path.splitext(basepath)

    extchunks = []
    cextdata = []
    csumwordcnt = 0

    def close_chunk():
        ctabpath = "%s-%03d%s" % (root, len(extchunks), ext)
        extchunks.append((cextdata, ctabpath))

    for edt in extdata:
        # Start a new chunk when this row would overflow a non-empty one.
        if cextdata and csumwordcnt + edt.wordcnt > maxwordcnt:
            close_chunk()
            cextdata = []
            csumwordcnt = 0
        cextdata.append(edt)
        csumwordcnt += edt.wordcnt
    if cextdata:
        close_chunk()

    return extchunks
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
try:
import fallback_import_paths
except:
pass
import locale
import optparse
import os
import re
import sys
import time
from pology.catalog import Catalog
from pology.fsops import str_to_unicode, collect_catalogs
from pology.msgreport import warning_on_msg
from pology.report import warning, error
def main():
    """Parse the command line and extract a Wordfast TM from PO files.

    Expects at least three positional arguments: one or more PO paths,
    then the output TM path, then the TM language code.

    NOTE(review): ``error()`` from ``pology.report`` is assumed to terminate
    the program.
    """
    locale.setlocale(locale.LC_ALL, "")

    usage = (
        "\n"
        " %prog POPATH... TMPATH TMLANG [OPTIONS]")
    desc = (
        "Extract Wordfast translation memory from a set of PO files.\n"
        "\n"
        "POPATH... can be any combination of file and directory paths. "
        "Directories will be searche recursively for PO files. "
        "TMPATH is the path to the TM file to be created. "
        "TMLANG is the language code in Wordfast TM format "
        "(e.g. DE-DE for German-Germany, PT-BR for Portugese-Brazil).\n"
        "\n"
        "IMPORTANT NOTE: "
        "There was no documentation available for the Wordfast TM format, "
        "and its observed properties were rather strange in some aspects. "
        "It is not unlikely that this script will do the wrong thing when "
        "converting some PO messages, or even produce invalid TM file.")
    ver = (
        u"%prog x1\n"
        u"Copyright © 2011, "
        u"Chusslove Illich (Часлав Илић) <[email protected]>")
    opars = optparse.OptionParser(usage=usage, description=desc, version=ver)
    opars.add_option(
        "-W", "--wrap-kludge",
        action="store_true", dest="wrap_kludge", default=False,
        help="If there is at least one replaceable indicator in text, "
             "wrap the whole text with two more indicators.")
    opars.add_option(
        "-r", "--repl-regex", metavar="REGEX",
        action="store", dest="repl_regex", default=None,
        help="Regular expression for determining additional replaceables "
             "(note: in Python regex dialect, not Wordfast).")

    options, free_args = opars.parse_args(str_to_unicode(sys.argv[1:]))
    if len(free_args) < 3:
        opars.print_usage()
        sys.exit(1)

    rplrx = None
    if options.repl_regex is not None:
        try:
            rplrx = re.compile(options.repl_regex, re.U)
        except re.error:
            # Only pattern syntax errors are reported as a bad option.
            error("Invalid regular expression '%s'." % options.repl_regex)

    # The last two positionals are the language and the TM path;
    # everything before them is a PO file or directory.
    tmlang = free_args.pop()
    tmpath = free_args.pop()
    popaths = collect_catalogs(free_args)

    extract_wordfast_tm(popaths, tmpath, tmlang,
                        rplrx=rplrx, wrapkludge=options.wrap_kludge)
# Index prefix for plural forms; formatted with the plural form index and
# prepended to both the source and the translation of each form.
# NOTE: Must be same as in extinj_po_tab.py.
_plfmt = "[%d]|"
def extract_wordfast_tm(popaths, tmpath, tmlang,
                        rplrx=None, wrapkludge=False):
    """Write a Wordfast TM file built from the PO files in ``popaths``.

    Translated messages contribute (msgid, msgstr) pairs. Fuzzy messages
    that carry a previous msgid contribute (previous msgid, msgstr) pairs.
    Plural messages contribute one pair per plural form, both sides prefixed
    with ``_plfmt``. Each text is converted with ``text_to_wordfast_tm``,
    to which ``rplrx`` and ``wrapkludge`` are passed through. Duplicate pairs
    are collapsed and the rest written in sorted order.

    The output at ``tmpath`` is tab-separated, one line per pair after a
    header line, encoded as UTF-16; ``tmlang`` is recorded as the target
    language.
    """
    # Extract source-translation pairs from PO messages.
    srcs_trns = set()
    for popath in popaths:
        cat = Catalog(popath, monitored=False)
        nplurals = cat.nplurals()
        # Plural form that gets the singular text as its source.
        # NOTE(review): semantics of plural_index/plural_indices_single
        # are presumed from their names -- confirm against the Pology API.
        plunityidx = cat.plural_index(1)
        if plunityidx not in cat.plural_indices_single():
            plunityidx = None

        for msg in cat:
            # Pick the source texts matching the existing translation.
            if msg.translated:
                src_single = msg.msgid
                src_plural = msg.msgid_plural
            elif msg.fuzzy and msg.msgid_previous is not None:
                src_single = msg.msgid_previous
                src_plural = msg.msgid_plural_previous
            else:
                continue

            if msg.msgid_plural is None:
                srcs = [src_single]
                trns = [msg.msgstr[0]]
            else:
                srcs = [src_plural] * nplurals
                if plunityidx is not None:
                    srcs[plunityidx] = src_single
                if any(src is None for src in srcs):
                    # Fuzzy message without a previous plural text; the
                    # prefix concatenation below would raise TypeError.
                    continue
                # Pad or truncate translations to exactly nplurals forms.
                trns = [u""] * nplurals
                cpnum = min(len(msg.msgstr), nplurals)
                trns[:cpnum] = msg.msgstr[:cpnum]
                for i in range(nplurals):
                    plpref = _plfmt % i
                    srcs[i] = plpref + srcs[i]
                    trns[i] = plpref + trns[i]

            for src, trn in zip(srcs, trns):
                src = text_to_wordfast_tm(src, rplrx, wrapkludge)
                trn = text_to_wordfast_tm(trn, rplrx, wrapkludge)
                srcs_trns.add((src, trn))

    # Write the TM file.
    timestr = time.strftime("%Y%m%d~%H%M%S")
    cmdname = os.path.basename(sys.argv[0])
    userid = "EP"
    srclang = "EN-US"
    sep = "\t"
    lines = []
    hdrflds = (
        "%%%s" % timestr,
        "%%User ID,%s,%s %s" % (userid, userid, cmdname),
        "%TU=00000000",
        "%%%s" % srclang,
        "%Wordfast TM v.546/00",
        "%%%s" % tmlang,
        "%-----------",
        "", "", "", "",
    )
    lines.append(sep.join(hdrflds))
    for src, trn in sorted(srcs_trns):
        flds = (
            timestr,
            userid,
            "0",
            srclang,
            src,
            tmlang,
            trn,
            "", "", "", "",
        )
        lines.append(sep.join(flds))
    lines.append("")
    fcnt = "\n".join(lines).encode("utf16")
    # The content is already encoded bytes: write it in binary mode so
    # that no newline translation can corrupt the UTF-16 stream.
    with open(tmpath, "wb") as ofl:
        ofl.write(fcnt)
# Characters and the entities that replace them in TM text, applied in
# this order by text_to_wordfast_tm.
# FIXME: Likely incomplete.
_chars_ents = (
    (u"&", "&'26;"), # must be first
    (u"®", "&'AE;"),
    (u"©", "&'A9;"),
)
# Basic replaceables: every match is replaced by a replaceable indicator,
# whether or not the user supplies an additional regex.
# The order matters: joined by | regex operator.
_base_rpl_rxstrs = (
    "[\t\n]+",
)
# Single compiled pattern matching any of the basic replaceables.
_base_rplrx = re.compile("|".join(_base_rpl_rxstrs), re.U)
def text_to_wordfast_tm(text, rplrx=None, wrapkludge=False):
    """Convert ``text`` to its representation in the Wordfast TM file.

    The text is split on matches of ``rplrx`` (if given) and then on the
    basic replaceables (``_base_rplrx``). The pieces are joined back with
    replaceable indicators ``&tA;``, ``&tB;``, ... in place of the matches.
    Characters listed in ``_chars_ents`` are replaced by their entities in
    every piece. With ``wrapkludge``, text containing at least one
    replaceable gets one more indicator at each end.

    Note: per ``re.split`` semantics, capturing groups in ``rplrx`` would
    be returned as extra pieces; use non-capturing groups ``(?:...)``.
    """
    segs = [text] if rplrx is None else rplrx.split(text)
    segs = [sub for seg in segs for sub in _base_rplrx.split(seg)]

    if wrapkludge and len(segs) > 1:
        # If there is at least one replaceable indicator in text,
        # add two more outer indicators.
        segs = [""] + segs + [""]

    rsegs = []
    last = len(segs) - 1
    for i, rseg in enumerate(segs):
        # Entities are applied to every segment, the last one included
        # (it used to be appended raw, so text without replaceables was
        # never escaped at all).
        for char, ent in _chars_ents:
            rseg = rseg.replace(char, ent)
        rsegs.append(rseg)
        if i < last:
            if i < 26:
                rplind = "&t%s;" % chr(ord("A") + i)
            else:
                # FIXME: Not really true... or maybe yes.
                rplind = "&t{=ut%d};" % (ord("A") + i)
            rsegs.append(rplind)

    return "".join(rsegs)
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
signature.asc
Description: This is a digitally signed message part.

