Hello,

I have decided not to rely on David's very kind help
with his Windows tools, and I have written a (hopefully)
completely platform-neutral, pure Python 3 script for checking
paired characters. So far, it has been used only for fixing
https://gitlab.com/crosswire-bible-society/CzeCEP/-/issues/2 and
I am quite sure it is pretty buggy, but it could prove useful
to somebody.

For the time being, the script lives in its own repo
(https://gitlab.com/mcepl/bible-freq-counter) and is attached to
this message, but I would like to submit it to sword-utils. How
should I do that?

Blessings,

Matěj
-- 
http://matej.ceplovi.cz/blog/, @mcepl@floss.social
GPG Finger: 3C76 A027 CA45 AD70 98B5  BC1D 7920 5802 880B C9D8
 
Afraid to die alone?
Become a bus driver.
  -- alleged easter egg in notepad++
#!/usr/bin/python3

import enum
import logging
import pprint
import sys
import xml.sax

from collections import Counter

# Root logger at INFO level; the log.debug() traces in this script stay
# silent unless the level is lowered to logging.DEBUG.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger()


class QType(enum.Enum):
    """Quotation-mark family whose open/close balance is tracked."""

    SINGLE = 1
    DOUBLE = 2

class PairCheckerHandler(xml.sax.ContentHandler):
    """SAX content handler that counts and balance-checks quote characters.

    The set of quote characters is chosen from ``configuration`` by the
    ``xml:lang`` attribute of the ``osisText`` element.  Text is examined
    only after the first ``verse`` element carrying an ``sID`` attribute has
    been seen.  Every quote character found is counted in ``freq_table`` and
    moves the running open/close balance of its quote type (``QType``) up or
    down.  A balance below zero or above one, or a quotation still open when
    the document ends, is reported on stderr and terminates the program with
    exit status 1.
    """

    # Quote characters per language code: which characters open a quotation,
    # which close one, and which belong to the single / double family.
    configuration = {
        'cs': {
            'OPEN': "‚„",
            'CLOSE': "‘“",
            'SINGLE': "‚‘",
            'DOUBLE': "„“",
        }
    }

    def __init__(self):
        """Create a handler with empty counters and no language selected."""
        super().__init__()
        # Character sets; filled in once the osisText element is seen.
        self.OPEN_CH = None
        self.CLOSE_CH = None
        self.SINGLE_CH = None
        self.DOUBLE_CH = None
        self.PAIR_CH = None

        # Occurrences of each quote character.
        self.freq_table = Counter()
        # Currently open quotations, keyed by QType.
        self.balance = Counter()
        # sID of the most recently started verse (None before the first one).
        self.current_ref = None

    @staticmethod
    def _fail(message):
        """Print *message* to stderr and exit with status 1."""
        print(message, file=sys.stderr)
        sys.exit(1)

    def __process_character(self, c):
        """Count quote character *c* and update the balance of its type.

        Exits with status 1 (via ``sys.exit``) when any balance drops below
        zero or rises above one.
        """
        self.freq_table[c] += 1
        if c in self.OPEN_CH:
            if c in self.SINGLE_CH:
                self.balance[QType.SINGLE] += 1
            elif c in self.DOUBLE_CH:
                self.balance[QType.DOUBLE] += 1
            # Lazy %-formatting: the message is only built when DEBUG is on.
            log.debug("Opening character %s (balance %s)", c, self.balance)
        elif c in self.CLOSE_CH:
            if c in self.SINGLE_CH:
                self.balance[QType.SINGLE] -= 1
            elif c in self.DOUBLE_CH:
                self.balance[QType.DOUBLE] -= 1
            log.debug("Closing character %s (balance %s)", c, self.balance)

        if any(count < 0 for count in self.balance.values()):
            self._fail(
                f"Balance for character {c} is below zero in {self.current_ref}")
        elif any(count > 1 for count in self.balance.values()):
            self._fail(
                f"Balance for character {c} is over one in {self.current_ref}")

    def startElement(self, name, attrs):
        """Pick the quote set on ``osisText``; track the reference on ``verse``.

        Raises:
            KeyError: if the ``osisText`` element has no ``xml:lang``
                attribute or names a language missing from ``configuration``.
        """
        if 'osisText' in name:
            lang = attrs.get('xml:lang')
            log.debug('lang = %s', lang)
            try:
                quotes = self.configuration[lang]
            except KeyError:
                supported = ', '.join(sorted(self.configuration))
                # Still a KeyError, but one that says what went wrong.
                raise KeyError(
                    f'Unsupported or missing xml:lang {lang!r} on <{name}>; '
                    f'supported languages: {supported}'
                ) from None
            self.OPEN_CH = quotes['OPEN']
            self.CLOSE_CH = quotes['CLOSE']
            self.SINGLE_CH = quotes['SINGLE']
            self.DOUBLE_CH = quotes['DOUBLE']
            self.PAIR_CH = self.OPEN_CH + self.CLOSE_CH
            log.debug('self.PAIR_CH = %s', self.PAIR_CH)
        elif 'verse' in name:
            if 'sID' in attrs:
                log.debug('name = %s, sID = %s', name, attrs["sID"])
                self.current_ref = attrs['sID']

    def characters(self, content):
        """Feed every quote character of *content* to the balance checker.

        Text that arrives before the first verse reference is ignored.
        """
        log.debug('current_ref = %s, content: %s (%s)',
                  self.current_ref, content, type(content))
        if self.current_ref is None:
            return
        for c in content:
            if c in self.PAIR_CH:
                self.__process_character(c)

    def endDocument(self):
        """Report quotations that are still open when the document ends.

        Exits with status 1 (via ``sys.exit``) if any balance is above zero.
        """
        unclosed = [qtype.name for qtype, count in self.balance.items()
                    if count > 0]
        if unclosed:
            self._fail(
                f"Unclosed {', '.join(unclosed)} quotation at the end of the "
                f"document (last reference {self.current_ref})")


if __name__ == "__main__":
    # Every command-line argument is an XML file to check.
    parser = xml.sax.make_parser()
    for bible_file in sys.argv[1:]:
        # Use a fresh handler per file so the frequency table, the quote
        # balance and the current reference of one file do not leak into
        # the next one.
        handler = PairCheckerHandler()
        parser.setContentHandler(handler)
        parser.parse(bible_file)
        # Frequency of each quote character found in this file.
        pprint.pprint(dict(handler.freq_table))

Attachment: signature.asc
Description: PGP signature

_______________________________________________
sword-devel mailing list: sword-devel@crosswire.org
http://crosswire.org/mailman/listinfo/sword-devel
Instructions to unsubscribe/change your settings at above page

Reply via email to