Re: a splitting script

Benno Schulenberg Mon, 26 Jan 2026 07:34:13 -0800


Op 25-01-2026 om 17:19 schreef Egmont Koblinger:

Another random find:


9.9 tarball's hu.po line 3252.  The translation is outdated, it misses
the "+" flag.  Accordingly, it's marked as fuzzy.


This made me think: when a msgid-msgstr pair is marked as fuzzy,
then the split.py script should _not_ split any options that are
in the msgid and msgstr, because whatever is in the msgstr does
not correspond to what is in the msgid -- or at least not fully.

One can observe this problem when running ./split.py on sk.po
and then searching for "--no-dereference".  Oops.

(In hu.po at the TP there are no fuzzies, so I didn't notice the
problem there.)

So split.py has to check for the "#, fuzzy" marker, and skip the
splitting of the subsequent msgid-msgstr pair.  I've implemented
that in the attached updated script.

(But maybe it is better to split them anyway and mark every
resulting pair as fuzzy?)


--
Regards,

Benno

#!/usr/bin/env python3

import sys
import re

# Line shapes that mark the start of an option description inside a PO string
# line.  Every pattern is anchored at the opening double quote of the line.
# NOTE(review): the first presumably targets operator-style help lines and the
# second "key=VALUE" style operands -- confirm against the real catalogs.
_SPACED_OPERATOR_RE = re.compile(r'^"  \S+ -\S\S \S+  ')
_ASSIGNMENT_RE = re.compile(r'^"  [a-z]+=\S+  ')
_RELAXED_LONG_OPTION_RE = re.compile(r'^" {1,6}--')

# String lines that carry no text of their own.
_EMPTY_STRING_LINES = ('""', '"\\n"')


def _has_fuzzy_flag(line):
    """Return True if *line* is a "#," flag comment that lists "fuzzy"."""
    if not line.startswith('#,'):
        return False
    # Flags are comma separated; "fuzzy" need not be the first one.
    return any(flag.strip() == 'fuzzy' for flag in line[2:].split(','))


def _is_common_option(line):
    """Apply the option checks shared by the strict and relaxed detectors."""
    if line.startswith('"  -'):
        text = line[4:]
        if text.startswith('M '):
            return False
        if len(text) > 0 and text[0] != ' ':
            return True
    if _SPACED_OPERATOR_RE.match(line):
        return True
    return bool(_ASSIGNMENT_RE.match(line))


def _is_option(line):
    """Return True if a msgid string line starts a new option description."""
    return line.startswith('"      --') or _is_common_option(line)


def _is_option_relaxed(line):
    """Like _is_option(), but accept one to six spaces before a "--" option.

    Used for msgstr lines, where the indentation may differ from the msgid.
    """
    return bool(_RELAXED_LONG_OPTION_RE.match(line)) or _is_common_option(line)


def _read_string_lines(lines, i):
    """Collect the run of lines starting with '"' that begins at index *i*.

    Returns the collected lines and the index of the first line after them.
    """
    start = i
    while i < len(lines) and lines[i].startswith('"'):
        i += 1
    return lines[start:i], i


def _group_by_option(string_lines, starts_option):
    """Cut *string_lines* into consecutive groups.

    A new group begins at every line (other than the first) for which
    *starts_option* returns true.  An empty input yields one empty group.
    """
    bounds = [0]
    bounds.extend(j for j in range(1, len(string_lines))
                  if starts_option(string_lines[j]))
    bounds.append(len(string_lines))
    return [string_lines[a:b] for a, b in zip(bounds, bounds[1:])]


def _split_entry(msgid_lines, msgstr_lines):
    """Split one multi-line entry into per-option (msgid, msgstr) pairs.

    Returns a list of (msgid_group, msgstr_group) tuples, or None when the
    entry must be left untouched: either the msgid contains no option lines,
    or msgid and msgstr do not split into the same number of groups.
    """
    if not any(_is_option(text) for text in msgid_lines):
        return None

    first_non_empty = next(
        (j for j, text in enumerate(msgid_lines)
         if text.strip() not in _EMPTY_STRING_LINES),
        None)

    # When the msgid begins directly with an option, drop its leading empty
    # lines, and drop the same number of lines from the front of the msgstr
    # (assumes the translation mirrors those leading lines -- not verified).
    if first_non_empty is not None and _is_option(msgid_lines[first_non_empty]):
        msgid_lines = msgid_lines[first_non_empty:]
        if first_non_empty < len(msgstr_lines):
            msgstr_lines = msgstr_lines[first_non_empty:]

    msgid_groups = _group_by_option(msgid_lines, _is_option)
    msgstr_groups = _group_by_option(msgstr_lines, _is_option_relaxed)

    # Pairing lists of different length would silently discard the surplus
    # groups (for an untranslated entry: every msgid group but the first).
    if len(msgid_groups) != len(msgstr_groups):
        return None

    return list(zip(msgid_groups, msgstr_groups))


def split_po_entries(lines):
    """Split multi-line PO entries that describe several options.

    Reads the lines of a PO file and prints the result to standard output.
    An entry whose msgid and msgstr both use the multi-line form (an empty
    first string followed by string lines) and whose msgid contains option
    lines is printed as one msgid/msgstr pair per option, each followed by
    an empty line.  Everything else is printed unchanged.

    An entry is left unsplit when it is flagged fuzzy (its msgstr need not
    correspond to its msgid) or when msgid and msgstr do not split into the
    same number of groups.

    Args:
        lines: the lines of the PO file, each including its line ending.
    """
    i = 0
    fuzzy = False

    while i < len(lines):
        line = lines[i]

        if _has_fuzzy_flag(line):
            fuzzy = True

        if line.strip() != 'msgid ""':
            print(line, end='')
            # A fuzzy flag applies only to the message that follows it, so
            # clear it once that message (regular or obsolete) has started.
            # Otherwise it would leak onto a later multi-line entry.
            if line.startswith(('msgid', '#~ msgid')):
                fuzzy = False
            i += 1
            continue

        start_i = i
        entry_is_fuzzy = fuzzy
        fuzzy = False

        msgid_lines, i = _read_string_lines(lines, i + 1)

        pairs = None
        if i < len(lines) and lines[i].strip() == 'msgstr ""':
            msgstr_lines, i = _read_string_lines(lines, i + 1)
            if not entry_is_fuzzy:
                pairs = _split_entry(msgid_lines, msgstr_lines)

        if pairs is None:
            # Not splittable: reproduce the lines consumed so far verbatim.
            for original in lines[start_i:i]:
                print(original, end='')
            continue

        for msgid_group, msgstr_group in pairs:
            print('msgid ""')
            print(''.join(msgid_group), end='')
            print('msgstr ""')
            print(''.join(msgstr_group), end='')
            print()

if __name__ == '__main__':
    # Process standard input when no file name is given on the command
    # line; otherwise process the file named by the first argument.
    if len(sys.argv) <= 1:
        split_po_entries(sys.stdin.readlines())
    else:
        with open(sys.argv[1]) as po_file:
            split_po_entries(po_file.readlines())

Re: a splitting script

Reply via email to