Re: a splitting script

Pádraig Brady Tue, 27 Jan 2026 07:10:12 -0800

On 26/01/2026 15:47, [email protected] wrote:


Op 25-01-2026 om 19:30 schreef Pádraig Brady:

Oh...  Can you give two examples of commands for which option
descriptions aren't on the next line?


cat, ptx, truncate at least
as the descriptions on those are succinct enough.


For `truncate` it would be just two options that would get wrapped
when they shouldn't be -- that would still be acceptable.  But for
`cat` it would be ten and for `ptx` sixteen.  :/

Well, the script could check for "src/cat.c" and "src/ptx.c" in
the preceding line and skip the wrapping when the relevant bools
are set.  So... please implement the wrapping and I'll implement
the exceptions.

(That is: the wrapping should only happen when options are split,
not for any options that are already single.  This will not prevent
all valid translations from becoming fuzzy when msgmerged, but a
good amount.)



I've attached an updated split.py that wraps if and only if it splits, and also
automatically excludes the commands whose option descriptions don't wrap.

It does result in a lot fewer fuzzy entries:
  $ diff pl-new-orig.po pl-new.po | grep -- '-#, fuzzy' | wc -l
  233

Note sk.po has an invalid UTF-8 character which stops processing,
so I manually edited sk.po to replace the non-UTF-8 ç
in Fran.*Pinard _before_ I ran the script.

Note also af.po and gl.po should be run with
LC_ALL=en_US.iso-8859-1 or equivalent.

thanks,
Padraig

#!/usr/bin/env python3

import sys
import re

# Commands whose source files are excluded from msgid wrapping: an entry
# tagged with a location comment matching src/<command>.c is never wrapped.
_UNWRAPPED_COMMANDS = (
    'cat', 'nl', 'ptx', 'realpath', 'runcon', 'shuf', 'stdbuf',
    'stty', 'sync', 'tac', 'truncate', 'uname', 'who',
)
EXCLUDED_FILES_PATTERN = re.compile(
    r'src/(' + '|'.join(_UNWRAPPED_COMMANDS) + r')\.c'
)

def wrap_msgid_line(line):
    """Move an option's description onto its own line.

    *line* is one quoted PO string line such as
    ``"  -a, --multiple       description\\n"``.  When it holds an option
    followed by two or more spaces and a description, two quoted lines are
    returned: the option (terminated with a literal ``\\n``) and the
    description indented by nine spaces.  Any other line is returned
    unchanged as a one-element list.
    """
    stripped = line.rstrip('\n')

    # Only lines that are a complete quoted string are candidates.
    if not (stripped.startswith('"') and stripped.endswith('"')):
        return [line]

    body = stripped[1:-1]

    # Remember a trailing literal \n so it can be re-attached to the
    # description line afterwards.
    newline_suffix = '\\n' if body.endswith('\\n') else ''
    if newline_suffix:
        body = body[:-2]

    # Leading spaces, optional short option ("-X, "), then a short or long
    # option, at least two spaces, and finally the description.
    found = re.match(r'^(\s+(?:-\S,\s+)?--?[^\s]+)\s{2,}(.+)$', body)
    if found is None:
        return [line]

    opt_text, desc_text = found.groups()
    return [
        f'"{opt_text}\\n"\n',
        f'"         {desc_text}{newline_suffix}"\n',
    ]

def _is_option(line):
    """Return True if a quoted PO string line starts an option description."""
    if line.startswith('"      --'):
        return True
    if line.startswith('"  -'):
        text = line[4:]
        # Lines of the form '"  -M ...' are deliberately not options.
        if text.startswith('M '):
            return False
        if text and text[0] != ' ':
            return True
    if re.match(r'^"  \S+ -\S\S \S+  ', line):
        return True
    if re.match(r'^"  [a-z]+=\S+  ', line):
        return True
    return False


def _is_option_relaxed(line):
    """Like _is_option, but also accept 1-6 leading spaces before '--'.

    Used for msgstr lines, where the indentation may differ from the msgid.
    """
    if re.match(r'^" {1,6}--', line):
        return True
    return _is_option(line)


def _group_lines(lines, starts_option):
    """Split *lines* into consecutive groups, each beginning at an option.

    The first group always starts at index 0, whether or not that line is
    an option.  An empty *lines* yields a single empty group.
    """
    bounds = [0]
    bounds.extend(j for j in range(1, len(lines)) if starts_option(lines[j]))
    bounds.append(len(lines))
    return [lines[a:b] for a, b in zip(bounds, bounds[1:])]


def split_po_entries(lines):
    """Split multi-option PO entries into one entry per option.

    *lines* is the content of a PO file as a list of lines (with their
    line endings).  The result is printed to standard output.  An entry is
    split when it has a multi-line ``msgid ""`` / ``msgstr ""`` pair, is
    not marked fuzzy, and its msgid contains at least one option line.
    Each resulting msgid is additionally wrapped with wrap_msgid_line()
    unless one of the entry's ``#:`` location comments matches
    EXCLUDED_FILES_PATTERN.  Everything else is copied through unchanged.

    Untranslated entries (no msgstr continuation lines) are split too, each
    part getting an empty msgstr.  When a translated msgstr splits into a
    different number of parts than its msgid, the parts are paired in order,
    the surplus is dropped, and a warning is written to standard error.
    """
    i = 0
    fuzzy = False
    current_files = []
    prev_was_location = False

    while i < len(lines):
        line = lines[i]

        # Track current files from location comments (these can span
        # multiple consecutive #: lines).
        if line.startswith('#:'):
            if not prev_was_location:
                # Start of a new entry's location comments - reset
                current_files = []
            current_files.append(line)
            prev_was_location = True
        else:
            prev_was_location = False

        if "#, fuzzy" in line:
            fuzzy = True
        elif not line.strip() or (line.startswith('msgid "')
                                  and line.strip() != 'msgid ""'):
            # The fuzzy flag belongs to a single entry.  Clear it at the
            # blank line separating entries, and at a single-line msgid
            # (which is never split), so it cannot leak onto a later
            # multi-line entry.
            fuzzy = False

        if line.strip() == 'msgid ""':
            start_i = i
            i += 1
            msgid_lines = []
            while i < len(lines) and lines[i].startswith('"'):
                msgid_lines.append(lines[i])
                i += 1

            if i < len(lines) and lines[i].strip() == 'msgstr ""':
                i += 1
                msgstr_lines = []
                while i < len(lines) and lines[i].startswith('"'):
                    msgstr_lines.append(lines[i])
                    i += 1

                has_options = any(_is_option(m) for m in msgid_lines)

                # Wrapping is disabled if any tagged file is excluded.
                should_wrap = not any(
                    EXCLUDED_FILES_PATTERN.search(f) for f in current_files
                )

                if has_options and not fuzzy:
                    # Drop leading empty lines when the first real msgid
                    # line is an option, and drop the same number of
                    # leading msgstr lines when that many exist.
                    first_non_empty = next(
                        (j for j, m in enumerate(msgid_lines)
                         if m.strip() not in ('""', '"\\n"')),
                        None,
                    )
                    if (first_non_empty is not None
                            and _is_option(msgid_lines[first_non_empty])):
                        msgid_lines = msgid_lines[first_non_empty:]
                        if first_non_empty < len(msgstr_lines):
                            msgstr_lines = msgstr_lines[first_non_empty:]

                    msgid_groups = _group_lines(msgid_lines, _is_option)

                    if msgstr_lines:
                        msgstr_groups = _group_lines(msgstr_lines,
                                                     _is_option_relaxed)
                        if len(msgstr_groups) != len(msgid_groups):
                            # zip() below pairs in order and drops the
                            # surplus; make that visible to the user.
                            print(
                                f'warning: line {start_i + 1}: msgid splits '
                                f'into {len(msgid_groups)} parts but msgstr '
                                f'into {len(msgstr_groups)}; unmatched parts '
                                f'are dropped',
                                file=sys.stderr,
                            )
                    else:
                        # Untranslated entry: keep every msgid part, each
                        # with an empty msgstr, instead of only the first.
                        msgstr_groups = [[] for _ in msgid_groups]

                    for msgid_group, msgstr_group in zip(msgid_groups,
                                                         msgstr_groups):
                        # Wrap msgid lines if appropriate
                        if should_wrap:
                            wrapped_msgid = [
                                w
                                for m in msgid_group
                                for w in wrap_msgid_line(m)
                            ]
                        else:
                            wrapped_msgid = msgid_group

                        # Output msgid
                        print('msgid ' + wrapped_msgid[0], end='')
                        for wline in wrapped_msgid[1:]:
                            print(wline, end='')

                        # Output msgstr (single-line form when possible)
                        if len(msgstr_group) == 1:
                            print('msgstr ' + msgstr_group[0], end='')
                        else:
                            print('msgstr ""')
                            for mline in msgstr_group:
                                print(mline, end='')
                        print()

                    continue

            # Not split: copy the consumed lines through unchanged.
            for j in range(start_i, i):
                print(lines[j], end='')
            fuzzy = False
            continue

        print(line, end='')
        i += 1

if __name__ == '__main__':
    # Take the PO content from the file named by the first argument, or
    # from standard input when no argument is given.
    if len(sys.argv) > 1:
        with open(sys.argv[1]) as po_file:
            po_lines = po_file.readlines()
    else:
        po_lines = sys.stdin.readlines()
    split_po_entries(po_lines)

Re: a splitting script

Reply via email to