Mwalker has submitted this change and it was merged.

Change subject: Parser for CLDR data -- into PHP for Twig
......................................................................


Parser for CLDR data -- into PHP for Twig

The initial L10n filter extension for twig consumes CLDR data.
This gets it into the correct format.

Change-Id: I676f8281ec823d5f08a05ad049d99a892870c960
---
A CldrParser/CldrParser.py
1 file changed, 296 insertions(+), 0 deletions(-)

Approvals:
  Adamw: Verified; Looks good to me, but someone else must approve
  Mwalker: Verified; Looks good to me, approved



diff --git a/CldrParser/CldrParser.py b/CldrParser/CldrParser.py
new file mode 100755
index 0000000..63adc9f
--- /dev/null
+++ b/CldrParser/CldrParser.py
@@ -0,0 +1,296 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+"""
+Generate a PHP file with Locale data in it from CLDR data. Run with --help for all options.
+"""
+
+from optparse import OptionParser
+import xml.etree.ElementTree as ET
+import os
+import codecs
+
+
+def parseCLDR(cldr_path):
+    """Parse CLDR XML files into currency and locale data.
+
+    cldr_path -- path to the CLDR 'common' directory; it must contain
+                 'supplemental/supplementalData.xml' and a 'main' directory
+                 holding one XML file per locale
+
+    Returns a tuple of: (
+        cldr version string ("<version number> - <generation date>"),
+        {currencyIso: {digits, rounding}},
+        {locale: {positive_currency_str, negative_currency_str, grouping: []}},
+        {locale: {group_char, decimal_char}}
+    )
+
+    In the currency strings: $1 is the symbol, $2 is the formatted number.
+    An empty grouping list means the pattern has no group separator. If
+    grouping has multiple elements they are listed in pattern order, so the
+    last element is the group next to the decimal point.
+
+    Exits via SystemExit(1) when the DEFAULT currency entry is not
+    2 digits / 0 rounding.
+    """
+
+    # === Obtain currency data information (digits, rounding)
+    currencyData = {}
+    supPath = os.path.join(cldr_path, 'supplemental', 'supplementalData.xml')
+    supData = ET.parse(supPath).getroot()
+
+    for child in supData.findall("./currencyData/fractions/*"):
+        code = child.attrib['iso4217']
+        digits = int(child.attrib['digits'])
+        rounding = int(child.attrib['rounding'])
+
+        if code == 'DEFAULT':
+            # outputData() hard-codes a '*' fallback of 2 digits and 0
+            # rounding, so EITHER value differing invalidates that fallback.
+            if digits != 2 or rounding != 0:
+                print("Assumptions did not hold on currency data default "
+                      "digits and rounding!")
+                raise SystemExit(1)
+        else:
+            currencyData[code] = {'digits': digits, 'rounding': rounding}
+
+    # --- Get the CLDR version/date
+    cldrVersion = "%s - %s" % (
+        supData.find("./version").attrib['number'],
+        supData.find("./generation").attrib['date'],
+    )
+
+    # === Now, for each locale... ===
+    localeNumericFormat = {}
+    localeSymbols = {}
+
+    mainDir = os.path.join(cldr_path, 'main')
+    for filename in os.listdir(mainDir):
+        locale, extension = os.path.splitext(filename)
+        if extension != '.xml':
+            # Anything else in the directory is not locale data
+            continue
+        data = ET.parse(os.path.join(mainDir, filename)).getroot()
+
+        # --- ... information on the symbols used ---
+        symbolPath = "./numbers/symbols[@numberSystem='latn']/"
+        decimalChar = data.find(symbolPath + "decimal")
+        groupChar = data.find(symbolPath + "group")
+
+        # Only record the locale when it defines at least one symbol; the
+        # missing one falls back to '.' (decimal) or ',' (group).
+        if decimalChar is not None or groupChar is not None:
+            localeSymbols[locale] = {
+                'decimal_char': '.' if decimalChar is None else decimalChar.text,
+                'group_char': ',' if groupChar is None else groupChar.text,
+            }
+
+        # --- ... information on number and currency formatting ---
+        patternNode = data.find(
+            "./numbers/currencyFormats[@numberSystem='latn']"
+            "/currencyFormatLength/currencyFormat/pattern"
+        )
+        if patternNode is not None and patternNode.text:
+            pattern = patternNode.text
+            parts = pattern.split(';')
+            if len(parts) == 2:
+                # "positive;negative"
+                localeNumericFormat[locale] = extractNumericLocale(
+                    locale, parts[0], parts[1])
+            else:
+                localeNumericFormat[locale] = extractNumericLocale(
+                    locale, pattern)
+
+    return cldrVersion, currencyData, localeNumericFormat, localeSymbols
+
+
+def extractNumericLocale(locale, pPattern, nPattern=None):
+    """Extract the digit grouping and the positive and negative currency
+    format strings from a CLDR currency pattern.
+
+    locale -- locale name; only used in error messages
+    pPattern -- the positive pattern, e.g. '¤ #,##,##0.00'
+    nPattern -- the negative pattern if given; when omitted the negative
+                string is the converted positive string prefixed with '-'
+
+    Returns a dict with:
+        positive_currency_str / negative_currency_str -- the patterns with
+            the currency sign replaced by $1 and the numeric part by $2
+        grouping -- list of group sizes in pattern order (the last entry is
+            the group next to the decimal point); empty when the pattern has
+            no group separator
+
+    Exits via SystemExit(1), after printing the offending pattern, when the
+    pattern does not match the layout this parser understands.
+    """
+    def bail(assumption):
+        print("Locale %s breaks the %s assumption! Pattern: %s"
+              % (locale, assumption, pPattern))
+        raise SystemExit(1)
+
+    # Replace a unicode character with something more MediaWiki
+    pPattern = pPattern.replace('¤', '$1')
+    if nPattern is not None:
+        nPattern = nPattern.replace('¤', '$1')
+
+    # These come in like ¤ #,##,##0.00 where ¤ is the currency symbol. The
+    # numeric part runs from the first digit placeholder (# or 0) through the
+    # last 0; a pattern may legitimately contain no # at all (e.g. ¤0.00).
+    end = pPattern.rfind('0')
+    if end == -1:
+        bail('numeric placeholder')
+    firstHash = pPattern.find('#')
+    firstZero = pPattern.find('0')
+    start = firstZero if firstHash == -1 else min(firstHash, firstZero)
+
+    # Find the decimal separator inside the numeric part only, so that a '.'
+    # in literal text before the number is not mistaken for it
+    decimalLoc = pPattern.find('.', start, end + 1)
+    if decimalLoc == -1:
+        # The assumption being it has one...
+        bail('decimal separator')
+
+    # The integer part may only hold digit placeholders and ',' separators
+    integerPart = pPattern[start:decimalLoc]
+    if set(integerPart) - set('#0,'):
+        bail('group separator')
+
+    # Everything after the first ',' is a group; at most two distinct group
+    # sizes (e.g. 2 and 3 for #,##,##0) are supported
+    groups = integerPart.split(',')
+    if len(groups) > 3 or '' in groups:
+        bail('grouping')
+    grouping = [len(group) for group in groups[1:]]
+
+    # Now sub out all this numeric string mess
+    numPattern = pPattern[start:end + 1]
+    pPattern = pPattern[:start] + '$2' + pPattern[end + 1:]
+
+    # and construct the nPattern
+    if nPattern is not None:
+        nPattern = nPattern.replace(numPattern, '$2')
+    else:
+        nPattern = '-' + pPattern
+
+    return {
+        'positive_currency_str': pPattern,
+        'negative_currency_str': nPattern,
+        'grouping': grouping
+    }
+
+
+def _phpQuote(value):
+    """Return value as a single-quoted PHP string literal.
+
+    Backslash and single quote are the only characters PHP treats specially
+    inside single quotes, so those are the two that get escaped.
+    """
+    return "'%s'" % str(value).replace('\\', '\\\\').replace("'", "\\'")
+
+
+def outputData(data, outfile, className, namespace=None):
+    """Generates a PHP data file
+
+    data -- the 4 element sequence returned by parseCLDR(): version string,
+            currency dict, locale number format dict, locale symbol dict.
+            The three dicts are modified in place: a '*' default entry is
+            added to (or overwritten in) each of them.
+    outfile -- path of the PHP file to write (UTF-8)
+    className -- name of the generated PHP class
+    namespace -- optional PHP namespace to declare for the class
+    """
+
+    skeleton = """<?php %(namespace)s
+/**
+ * This file was automatically generated, do not edit!
+ *
+ * input: CLDR data file, %(cldrVersion)s
+ */
+class %(className)s {
+    /** @var array array(ISO code => array(decimal digits, rounding digits)) */
+    public static $currencyData = array(
+        %(currencyData)s
+    );
+
+    /** @var array array(ISO code => array(array(grouping), positive_currency_string, negative_currency_string)) */
+    public static $localeNumberFormat = array(
+        %(numberData)s
+    );
+
+    /** @var array array(ISO code => array(decimal_char, group_char)) */
+    public static $localCharacters = array(
+        %(charData)s
+    );
+}
+"""
+    # Separator that lines entries up with the skeleton's 8 space indent
+    entrySep = ',\n        '
+
+    # Add some defaults
+    data[1]['*'] = {'digits': 2, 'rounding': 0}
+    data[2]['*'] = {
+        'grouping': [],
+        'positive_currency_str': '$1 $2',
+        'negative_currency_str': '-$1 $2',
+    }
+    data[3]['*'] = {'decimal_char': '.', 'group_char': ','}
+
+    # Currency process
+    phpCurrency = entrySep.join(
+        "%s => array(%s, %s)" % (
+            _phpQuote(currency), idict['digits'], idict['rounding'])
+        for currency, idict in sorted(data[1].items())
+    )
+
+    # Number format process; the patterns come from CLDR literal text so
+    # they need quoting just like the symbols do
+    phpLocaleNumber = entrySep.join(
+        "%s => array(array(%s), %s, %s)" % (
+            _phpQuote(locale),
+            ', '.join(str(x) for x in idict['grouping']),
+            _phpQuote(idict['positive_currency_str']),
+            _phpQuote(idict['negative_currency_str']),
+        )
+        for locale, idict in sorted(data[2].items())
+    )
+
+    # Symbol format process
+    phpLocaleSymbol = entrySep.join(
+        "%s => array(%s, %s)" % (
+            _phpQuote(locale),
+            _phpQuote(idict['decimal_char']),
+            _phpQuote(idict['group_char']),
+        )
+        for locale, idict in sorted(data[3].items())
+    )
+
+    # Other file things
+    if namespace is not None:
+        namespace = "namespace %s;" % namespace
+    else:
+        namespace = ''
+
+    # The with block closes the file even if formatting or writing fails
+    with codecs.open(outfile, 'w', 'utf-8') as f:
+        f.write(skeleton % {
+            'namespace': namespace,
+            'cldrVersion': data[0],
+            'className': className,
+            'currencyData': phpCurrency,
+            'numberData': phpLocaleNumber,
+            'charData': phpLocaleSymbol,
+        })
+
+
+def test():
+    """Run the built-in checks of extractNumericLocale.
+
+    Each case is converted and compared with its expected dict; a mismatch is
+    reported on stdout. Returns the list of actual results, in case order.
+    """
+    # (failure message, arguments for extractNumericLocale, expected result)
+    cases = (
+        (
+            "First test failed %s",
+            ('test', '¤ #,##,##0.00'),
+            {
+                'positive_currency_str': '$1 $2',
+                'negative_currency_str': '-$1 $2',
+                'grouping': [2, 3]
+            },
+        ),
+        (
+            "Second test failed %s",
+            ('test', '#,##0.00 ¤', '(#,##0.00) ¤'),
+            {
+                'positive_currency_str': '$2 $1',
+                'negative_currency_str': '($2) $1',
+                'grouping': [3]
+            },
+        ),
+        (
+            "Third test failed %s",
+            ('test', '#0.00¤'),
+            {
+                'positive_currency_str': '$2$1',
+                'negative_currency_str': '-$2$1',
+                'grouping': []
+            },
+        ),
+    )
+
+    outcomes = []
+    for failureMessage, arguments, wanted in cases:
+        actual = extractNumericLocale(*arguments)
+        outcomes.append(actual)
+        if actual != wanted:
+            print(failureMessage % actual)
+
+    return outcomes
+
+
+if __name__ == "__main__":
+    # === Extract options ===
+    parser = OptionParser(
+        usage="usage: %prog [options] <location of CLDR data>")
+    parser.add_option("-o", "--outputFile", default='CldrData.php',
+                      help='Output file name')
+    parser.add_option("-c", "--className", default='CldrData',
+                      help='Name of auto generated class')
+    parser.add_option("-n", "--namespace", default=None,
+                      help='Namespace for generated class')
+    # store_true makes this a real flag; without it optparse expects a value
+    # after -t and a bare "-t" is rejected
+    parser.add_option("-t", "--test", action="store_true", default=False,
+                      help='Run tests')
+    (options, args) = parser.parse_args()
+
+    if options.test:
+        test()
+        raise SystemExit(0)
+
+    # Exactly one, non-empty, positional argument: the CLDR location
+    if len(args) != 1 or not args[0]:
+        parser.print_help()
+        raise SystemExit(1)
+    cldr_path = args[0]
+
+    # === Check to see if CLDR exists at path ===
+    # parseCLDR gets the path with a trailing separator
+    if not cldr_path.endswith(os.path.sep):
+        cldr_path += os.path.sep
+    if not os.path.exists(os.path.join(cldr_path, 'main', 'en.xml')):
+        print(
+            "It appears that CLDR does not exist at the given path. Are you "
+            "not pointing into the 'common' directory?"
+        )
+        raise SystemExit(1)
+
+    # Run it!
+    outputData(parseCLDR(cldr_path), options.outputFile, options.className,
+               options.namespace)

-- 
To view, visit https://gerrit.wikimedia.org/r/53722
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I676f8281ec823d5f08a05ad049d99a892870c960
Gerrit-PatchSet: 2
Gerrit-Project: wikimedia/fundraising/tools
Gerrit-Branch: master
Gerrit-Owner: Mwalker <[email protected]>
Gerrit-Reviewer: Adamw <[email protected]>
Gerrit-Reviewer: Mwalker <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to