[sword-devel] Script to find a best fit v11n

DM Smith Wed, 18 Jun 2025 12:51:00 -0700

Hi,

Several people have commented on how hard it is to test an OSIS XML file against v11ns, especially since it goes off into an infinite loop. (I’ve posted a patch that fixes that.) But it is still a process of trial and error to find an appropriate v11n.

So, I’ve been iterating with ChatGPT to create a Python script to find a best-fit v11n. Since I don’t know Python, I can’t vouch for the script beyond the fact that it worked for a simple test case that had an extra chapter for Genesis and some extra verses at the end of a chapter in that book.

I offer it, as a starting place. See the attached file.

It has a --debug flag.

The first argument is expected to be the OSIS xml file.

The second argument is optional and gives the location of the include directory of svn/sword/trunk/include with all the canon*.h files. If you don’t supply the argument, it uses the web to load the canon*.h files from https://www.crosswire.org/svn/sword/trunk/include.

It will score the fitness of each of the v11ns. It gives the score as a %, but I don’t know what that means. I told it that it should prioritize book matches, then chapter matches and finally verse matches. I don’t know how well it did that scoring. I didn’t test for that.

The output is sorted by score, best first, and alphabetized among v11ns with equal scores. If more than one v11n has the same top score, all of them are listed.

In His Service,

import re
import xml.etree.ElementTree as ET
from collections import defaultdict
from pathlib import Path
import sys
import requests
import argparse


# Gates the diagnostic print() calls; rebound from the --debug flag at startup.
DEBUG = False
# Default location of the canon*.h headers when no path or URL is supplied.
REMOTE_URL = "https://www.crosswire.org/svn/sword/trunk/include/"

def parse_osis(file_path):
    """Collect the book/chapter/verse numbers declared by an OSIS XML file.

    Every element whose tag ends in ``verse`` is inspected. Its ``osisID``
    attribute is treated as a whitespace-separated list of references, so a
    linked verse such as ``osisID="Gen.1.2 Gen.1.3"`` contributes both
    verses. References that are not of the form ``Book.Chapter.Verse``, or
    whose chapter/verse part contains no digits, are ignored.

    Args:
        file_path: Path (or open binary file object) of the OSIS document.

    Returns:
        A nested ``defaultdict``: ``{book: {chapter: {verse, ...}}}`` with
        integer chapter and verse numbers.
    """
    osis_structure = defaultdict(lambda: defaultdict(set))
    # 'end' events are used so that each element can be emptied once it is
    # complete; attributes are still present at that point.
    for _event, elem in ET.iterparse(file_path, events=('end',)):
        if elem.tag.endswith('verse'):
            for ref in elem.attrib.get('osisID', '').split():
                parts = ref.split('.')
                if len(parts) != 3:
                    continue
                book, chapter, verse = parts
                try:
                    # Drop non-digit decorations before converting.
                    chapter_num = int(re.sub(r'\D', '', chapter))
                    verse_num = int(re.sub(r'\D', '', verse))
                except ValueError:
                    # No digits at all in the chapter or verse part.
                    continue
                osis_structure[book][chapter_num].add(verse_num)
        # Release the finished element's text and children to limit memory use.
        elem.clear()
    if DEBUG:
        print("Parsed OSIS structure:")
        for book in osis_structure:
            print(f"  {book}: {sorted(osis_structure[book].keys())}")
    return osis_structure

def load_canon_file(canon_path):
    """Return the text of a canon header, downloaded or read from disk.

    Args:
        canon_path: Either an ``http://`` / ``https://`` URL or a local
            file path, as a string.

    Returns:
        The file content as a string.

    Raises:
        ValueError: If a remote request answers with a status other than 200.
    """
    if canon_path.startswith(("http://", "https://")):
        if DEBUG:
            print(f"Downloading: {canon_path}")
        # A timeout keeps an unresponsive server from hanging the script.
        response = requests.get(canon_path, timeout=30)
        if response.status_code != 200:
            raise ValueError(f"Failed to download {canon_path} (status {response.status_code})")
        if DEBUG:
            print(f"Downloaded {len(response.text)} bytes from {canon_path}")
        return response.text

    if DEBUG:
        print(f"Reading local file: {canon_path}")
    with open(canon_path, encoding='utf-8') as f:
        content = f.read()
    if DEBUG:
        print(f"Loaded {len(content)} bytes from local file")
    return content

def parse_books_array(content, array_name):
    """Extract the entries of a ``struct sbook <array_name>[]`` initializer.

    Args:
        content: Text of a canon header file.
        array_name: Name of the array to look for (matched case-insensitively).

    Returns:
        A list of ``(full_name, abbreviation, osis_id, chapter_count)``
        tuples with ``chapter_count`` as an int. The list is empty when the
        array is not present. Entries with an empty string field do not
        match the entry pattern and are therefore left out.
    """
    array_re = re.compile(
        rf'struct\s+sbook\s+{re.escape(array_name)}\s*\[\s*\]\s*=\s*\{{(.*?)\}};',
        re.DOTALL | re.IGNORECASE
    )
    found = array_re.search(content)
    if found is None:
        if DEBUG:
            print(f"No {array_name} array found")
        return []

    rows = re.findall(r'\{\s*"([^"]+)"\s*,\s*"([^"]+)"\s*,\s*"([^"]+)"\s*,\s*(\d+)\s*\}', found.group(1))
    if DEBUG:
        print(f"Parsed {len(rows)} entries from {array_name}")

    books = []
    for full_name, abbreviation, osis_id, chapter_count in rows:
        books.append((full_name, abbreviation, osis_id, int(chapter_count)))
    return books

def parse_vm_array(content, v11n_name):
    """Extract the verse-maximum integers from a canon header.

    For ``"KJV"`` the arrays ``vm`` and then ``vm_kjv`` are tried; for any
    other name the array ``vm_<v11n_name>`` is used. Matching is
    case-insensitive.

    C comments inside the initializer are removed before the numbers are
    read, so digits that appear in a comment (for example a chapter count
    written after a book name) are not mistaken for verse maxima.

    Args:
        content: Text of a canon header file.
        v11n_name: Versification name derived from the header's file name.

    Returns:
        The integers of the first matching array, in order, or an empty
        list when no such array exists.
    """
    flags = re.DOTALL | re.IGNORECASE
    if v11n_name == "KJV":
        array_names = ['vm', 'vm_kjv']
    else:
        array_names = ['vm_' + re.escape(v11n_name)]

    for array_name in array_names:
        pattern = re.compile(r'int\s+' + array_name + r'\s*\[\s*\]\s*=\s*\{(.*?)\};', flags)
        vm_match = pattern.search(content)
        if vm_match:
            # Strip // line comments and /* */ block comments first.
            body = re.sub(r'//[^\n]*|/\*.*?\*/', ' ', vm_match.group(1), flags=re.DOTALL)
            vm_entries = [int(number) for number in re.findall(r'\d+', body)]
            if DEBUG:
                print(f"Parsed {len(vm_entries)} verse max entries for {v11n_name}")
            return vm_entries

    if DEBUG:
        print(f"No vm array found for {v11n_name}")
    return []

def parse_canon_file(file_path, base_ot=None, base_nt=None):
    """Build the chapter/verse layout described by one canon header.

    The versification name is taken from the file name: the text "canon"
    and any leading underscores are removed from the stem, and an empty
    remainder means "KJV".

    Args:
        file_path: Path or URL of the header, as a string.
        base_ot: Book list to use when the header defines no OT book array.
        base_nt: Book list to use when the header defines no NT book array.

    Returns:
        ``(None, {})`` for the excluded ``canon_null`` / ``canon_abbrevs``
        files, ``(name, {})`` when the header has no verse-maximum array,
        and otherwise ``(name, structure)`` where ``structure`` maps each
        OSIS book id to ``{chapter: max_verse}``.
    """
    if file_path.startswith("http"):
        fname = file_path.split("/")[-1]
    else:
        fname = Path(file_path).name

    if "canon_null" in fname or "canon_abbrevs" in fname:
        if DEBUG:
            print(f"Skipping {fname} (excluded file)")
        return None, {}

    stem_without_prefix = Path(fname).stem.replace("canon", "")
    v11n_name = stem_without_prefix.lstrip('_') or "KJV"
    content = load_canon_file(file_path)

    if DEBUG:
        print(f"Processing {fname} as versification '{v11n_name}'")

    # The KJV header uses unsuffixed array names.
    if v11n_name == "KJV":
        otbooks_name, ntbooks_name = "otbooks", "ntbooks"
    else:
        otbooks_name = f"otbooks_{v11n_name}"
        ntbooks_name = f"ntbooks_{v11n_name}"

    ot_books = parse_books_array(content, otbooks_name)
    nt_books = parse_books_array(content, ntbooks_name)

    if base_ot is not None and not ot_books:
        if DEBUG:
            print(f"Falling back to base OT books for {v11n_name}")
        ot_books = base_ot
    if base_nt is not None and not nt_books:
        if DEBUG:
            print(f"Falling back to base NT books for {v11n_name}")
        nt_books = base_nt

    vm_entries = parse_vm_array(content, v11n_name)
    if not vm_entries:
        if DEBUG:
            print(f"No vm array found in {v11n_name}")
        return v11n_name, {}

    # Hand out the verse maxima one per chapter, in book order, until the
    # list runs dry.
    structure = defaultdict(dict)
    remaining = iter(vm_entries)
    for _full, _abbrev, osis_id, chapter_count in ot_books + nt_books:
        for chapter_no in range(1, chapter_count + 1):
            max_verse = next(remaining, None)
            if max_verse is None:
                break
            structure[osis_id][chapter_no] = max_verse

    if DEBUG:
        print(f"Parsed structure for {v11n_name} with {len(structure)} books")

    return v11n_name, structure

def score_v11n(osis_structure, v11n_structure):
    """Score how well a versification accommodates the OSIS content.

    Each OSIS chapter is worth 2 points and each OSIS verse 1 point. A
    chapter earns its 2 points when the versification has that book and
    chapter; a verse earns its point when its number does not exceed the
    versification's maximum for that chapter. The maximum attainable score
    counts every chapter and verse of the OSIS file, including those of
    books the versification lacks, so the percentage is computed against
    the same total for every versification and results are comparable.

    Args:
        osis_structure: ``{book: {chapter: set_of_verses}}`` from the OSIS file.
        v11n_structure: ``{book: {chapter: max_verse}}`` for one versification.

    Returns:
        ``(fit_percent, mismatch_details)``: the score as a percentage of
        the maximum (0 when the OSIS structure is empty) and a list of
        human-readable mismatch descriptions.
    """
    score = 0
    max_score = 0
    mismatch_details = []

    for book, chapters in osis_structure.items():
        # Count the whole book up front so a missing book or chapter lowers
        # the score instead of shrinking the denominator.
        max_score += 2 * len(chapters)
        max_score += sum(len(verses) for verses in chapters.values())

        if book not in v11n_structure:
            mismatch_details.append(f'Missing book: {book}')
            continue

        v11n_chapters = v11n_structure[book]
        for ch_num, verses in chapters.items():
            if ch_num not in v11n_chapters:
                mismatch_details.append(f'{book} missing chapter {ch_num}')
                continue
            score += 2
            max_verse = v11n_chapters[ch_num]
            matching_verses = sum(1 for v in verses if v <= max_verse)
            score += matching_verses
            missed = len(verses) - matching_verses
            if missed > 0:
                mismatch_details.append(f'{book} {ch_num}: {missed} verse(s) too many')

    fit_percent = (score / max_score * 100) if max_score else 0
    return fit_percent, mismatch_details

def find_best_fit(osis_file, canon_dir):
    """Score every available versification against an OSIS file and report.

    The canon headers are discovered either from the HTML directory listing
    at a URL or by globbing a local directory. Each header is parsed and
    scored; a header that fails to parse is reported and skipped. Results
    are printed best score first, alphabetically among equal scores, each
    followed by its mismatches, and finally the name(s) sharing the top
    score.

    Args:
        osis_file: Path to the OSIS XML file.
        canon_dir: Directory path or ``http(s)`` URL holding the canon*.h
            headers, as a string.

    Returns:
        None. All output goes to stdout.
    """
    osis_structure = parse_osis(osis_file)
    results = []

    is_remote = canon_dir.startswith(("http://", "https://"))
    if is_remote:
        index_url = canon_dir.rstrip("/") + "/"
        if DEBUG:
            print(f"Fetching directory listing from {index_url}")
        # A timeout keeps an unresponsive server from hanging the script.
        index_html = requests.get(index_url, timeout=30).text
        listed = re.findall(r'href=[\'"]?(canon[^\'"]+\.h)[\'"]?', index_html)
        canon_files = [
            index_url + fname
            for fname in listed
            if not ("canon_null" in fname or "canon_abbrevs" in fname)
        ]
        # The listing pattern needs at least one character between "canon"
        # and ".h", so the plain canon.h is added explicitly.
        base_canon_path = index_url + "canon.h"
        if base_canon_path not in canon_files:
            canon_files.append(base_canon_path)
        if DEBUG:
            print(f"Found {len(canon_files)} canon files in remote directory:")
            for f in canon_files:
                print(f"  - {f}")
    else:
        canon_files = list(Path(canon_dir).glob("canon*.h"))
        kjv_path = Path(canon_dir) / "canon.h"
        if kjv_path not in canon_files:
            canon_files.append(kjv_path)
        base_canon_path = str(kjv_path)

    # Book lists from canon.h serve as the fallback for headers that do not
    # define their own book arrays.
    if DEBUG:
        print(f"Loading base canon file from {base_canon_path}")
    base_content = load_canon_file(base_canon_path)
    base_ot = parse_books_array(base_content, "otbooks")
    base_nt = parse_books_array(base_content, "ntbooks")

    for canon_file in canon_files:
        # One unreadable or malformed header should not abort the whole run.
        try:
            name, structure = parse_canon_file(str(canon_file), base_ot, base_nt)
            if name is None:
                continue
            fit, mismatches = score_v11n(osis_structure, structure)
            results.append((fit, name, mismatches))
        except Exception as e:
            print(f"Failed to parse {canon_file}: {e}")

    # Highest score first; case-insensitive name order among equal scores.
    results.sort(key=lambda r: (-r[0], r[1].lower()))
    for fit, name, mismatches in results:
        print(f"{name}: {fit:.2f}% fit")
        for m in mismatches:
            print(f"  - {m}")

    if results:
        best_score = results[0][0]
        best_names = [name for fit, name, _ in results if fit == best_score]
        print(f"\nBest match(es) ({best_score:.2f}%):")
        for name in best_names:
            print(f"  - {name}")


if __name__ == "__main__":
    # Command line: OSIS file (required), canon location (optional), --debug.
    cli = argparse.ArgumentParser(description="Find best-fit versification for an OSIS file.")
    cli.add_argument("--debug", action="store_true", help="Enable debug output")
    cli.add_argument("osis_file", help="Path to OSIS XML file")
    cli.add_argument(
        "canon_path",
        nargs="?",
        default=REMOTE_URL,
        help="Path or URL to canon headers (default: remote CrossWire URL)",
    )
    options = cli.parse_args()

    # This runs at module scope, so the assignment rebinds the module-level
    # flag that the functions above read.
    DEBUG = options.debug
    if DEBUG:
        print("Debug mode enabled\n")

    find_best_fit(options.osis_file, options.canon_path)

_______________________________________________
sword-devel mailing list: sword-devel@crosswire.org
http://crosswire.org/mailman/listinfo/sword-devel
Instructions to unsubscribe/change your settings at above page

[sword-devel] Script to find a best fit v11n

Reply via email to