#!/usr/bin/python
# -*- coding: utf-8 -*-

import sys, codecs, commands;

__help__ = """
This script converts the output of lt-expand into the format accepted
by the Freeling indexdict program.

Input is an expanded Apertium dictionary:
  
  perro:perro<n><m><sg>
 
Output is a Freeling dictionary:

  perro perro NCMPV0
 
To convert the Apertium tagset into a PAROLE-compatible tagset, a lookup
file listing each PAROLE tag next to the corresponding Apertium tag
sequence is used. The two columns are separated by a tab:

  NCMPV0	<n><m><sg>
  
Options:
  -n   Output only proper names (by default they are skipped)
  -l   Output only multi-word entries (by default they are skipped)
  -p   Use existing lt-expand output
  -h   Displays this help message
"""

# Wrap the standard streams so the script reads and writes UTF-8
# transparently under Python 2.
sys.stdin  = codecs.getreader('utf-8')(sys.stdin);
sys.stdout = codecs.getwriter('utf-8')(sys.stdout);
sys.stderr = codecs.getwriter('utf-8')(sys.stderr);

if len(sys.argv) > 1 and sys.argv[1] == '-h':
	print 'Usage: ./dix-to-maco.py [-l|-n|-p] <dix file> <parole lookup>';	
	print '';
	print __help__;
	sys.exit(0);

if len(sys.argv) < 4:
	print 'Usage: ./dix-to-maco.py [-l|-n|-p] <dix file> <parole lookup>';	
	print '';
	sys.exit(-1);

# getstatusoutput() returns the raw wait status: a status above 256
# (exit code greater than 1, e.g. 127 for "command not found") means
# lt-expand could not be run at all.
if commands.getstatusoutput('lt-expand')[0] > 256:
	print 'Please install `lt-expand\' or add it to your $PATH';	
	sys.exit(-1);

tipoxt = sys.argv[1];
infile = sys.argv[2];
tffile = sys.argv[3];

transform = {};

# Read in the parole lookup table.

for line in file(tffile).read().split('\n'): 
	row = line.split('\t');

	if len(row) < 2: 
		continue;

	parole = row[0].strip();
	apertium = row[1].strip();

	transform[apertium] = parole;
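
# At this point transform maps Apertium tag strings to PAROLE tags,
# e.g. transform['<n><m><sg>'] == 'NCMPV0' in the example above.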

# We use a dict to map each surface (superficial) form to a dict of its
# analyses; keying on the analysis removes duplicate entries.

surface = {};

mode = 0; # "General entries"

if tipoxt == '-l':
	mode = 1;

if tipoxt == '-p':
	inlines = file(infile).read().decode('utf-8').split('\n')
else:
	inlines = commands.getoutput('lt-expand ' + infile).decode('utf-8').split('\n')
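
# Each input line looks like 'perro:perro<n><m><sg>' (surface form,
# separator, analysis); ':<:' and ':>:' mark direction-restricted entries.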

for line in inlines:
	# We skip "generate only" entries and regexes
	if line.count(':<:') > 0 or line.count('REGEXP') > 0:
		continue;

	# Are we extracting locutions or main entries?
	if mode == 1:
		if line.count(' ') < 1 and line.count('#') < 1:
			continue;
	else:
		if line.count(' ') > 0 or line.count('#') > 0:
			continue;

	row = line.replace(':>:', ':').split(':');

	# Does the surface form contain an upper-case character?
	case_flag = 0;
	for c in row[0]:
		if c.isupper():
			case_flag = 1;
			break;

	# By default skip proper names; with -n, keep only proper names.
	if tipoxt == '-n':
		if case_flag == 0:
			continue;
	else: 
		if case_flag == 1:
			continue;


	if line.count('#') > 0 and mode == 1:
		#	bapurau newydd:>:papur<n><m><pl># newydd
		#	cat bapurau newydd:>:cat papur<n><m><pl># newydd
		# 	 -> <cat bapurau>
		# we need to count the number of spaces backwards from the end
		# of the right-hand side in order to work out where to place the
		# '<>', as different paradigms may emit different numbers of spaces.
			
		# Skip these at the moment as we'll also need to work out how to 
		# only put in one entry for the non-flecting part.
		#
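		#
		# A possible sketch (untested; names are illustrative):
		#
		#   lhs, invariable = row[1].split('#', 1)
		#   n = len(invariable.strip().split(' '))
		#   words = row[0].strip().split(' ')
		#   head = ' '.join(words[:len(words) - n])
		#
		# For the example above, 'head' comes out as 'cat bapurau', the
		# inflecting part that would be wrapped in '<>'.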

		continue;

	elif line.count('#') == 0 and line.count(' ') > 0 and mode == 1:
		lemma = '';
		analysis = '';
		tags = '';
		
		if line.count('+') > 0:
			superficial = row[0].strip().replace(' ', '_');
			subreadings = row[1].strip().replace(' ', '_').split('+');

			if superficial not in surface:
				surface[superficial] = {};

			lemma = '';
			analysis = '';
			tags = '';

			for reading in subreadings:
				tag = '';
				lemma_part = '';

				# Guard against subreadings with no tag string.
				if '<' not in reading:
					continue;

				idx = reading.index('<');
				tag = reading[idx:];
				lemma_part = reading.split('<')[0];

				
				if tag in transform:
					tags = tags + transform[tag] + '+';

				lemma = lemma + lemma_part + '+';


			lemma = lemma.replace(' ', '_');
			analysis = lemma.strip('+') + ' ' + tags.strip('+');
			surface[superficial][analysis] = '';
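
			# Net effect: a reading 'lemma1<tags1>+lemma2<tags2>' is stored
			# as the single analysis 'lemma1+lemma2 PAROLE1+PAROLE2'.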

		else: 
			superficial = row[0].strip().replace(' ', '_');
			analysis = row[1].strip().replace(' ', '_');

			if superficial not in surface:
				surface[superficial] = {};

			tags = '';
		
			if '<' in analysis:
				idx = analysis.index('<');
				tags = analysis[idx:];
				lemma = analysis.split('<')[0].replace(' ', '_');
		
			if tags in transform:
				analysis = lemma + ' ' + transform[tags];
				surface[superficial][analysis] = '';

	if line.count('+') > 0 and mode == 0:

		superficial = row[0].strip().replace(' ', '_');
		subreadings = row[1].strip().split('+');

		if superficial not in surface:
			surface[superficial] = {};

		lemma = '';
		analysis = '';
		tags = '';

		for reading in subreadings:
			tag = '';
			lemma_part = '';

			# Guard against subreadings with no tag string.
			if '<' not in reading:
				continue;

			idx = reading.index('<');
			tag = reading[idx:];
			lemma_part = reading.split('<')[0];

			
			if tag in transform:
				tags = tags + transform[tag] + '+';

			lemma = lemma + lemma_part + '+';
		
		lemma = lemma.replace(' ', '_');
		analysis = lemma.strip('+') + ' ' + tags.strip('+');
		surface[superficial][analysis] = '';
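
		# Same joining as in locution mode: the '+'-separated subreadings
		# collapse into one 'lemma1+lemma2 PAROLE1+PAROLE2' analysis.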

	elif mode == 0:

		superficial = row[0].strip().replace(' ', '_');
		analysis = row[1].strip();

		if superficial not in surface:
			surface[superficial] = {};

		tags = '';
	
		if '<' in analysis:
			idx = analysis.index('<');
			tags = analysis[idx:];
			lemma = analysis.split('<')[0].replace(' ', '_');
	
		if tags in transform:
			analysis = lemma + ' ' + transform[tags];
			surface[superficial][analysis] = '';

# Print each surface form followed by all of its analyses.
for key in surface:
	analyses = '';
	for subkey in surface[key]:
		analyses = analyses + subkey + ' ';

	print key + ' ' + analyses.strip();
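
# Example output line (surface form, then one or more lemma/tag pairs):
#
#   perro perro NCMPV0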