On Fri, Jan 15, 2010 at 12:01 PM, Eknath Venkataramani < eknath.i...@gmail.com> wrote:
> I have a txt file in the following format: > [code] > "confident" => { > count => 4, > trans => { > "ashahvasahta" => 0.74918568, > "atahmavaishahvaasa" => 0.09095465, > "pahraaram\.nbha" => 0.06990729, > "mailatae" => 0.02856427, > "utanai" => 0.01929341, > "anaa" => 0.01578552, > "uthaanae" => 0.01403157, > "jaitanae" => 0.01227762, > }, > }, > "consumers" => { > count => 4, > trans => { > "upabhaokahtaa" => 0.75144362, > "upabhaokahtaaom\.n" => 0.12980166, > "sauda\�\�\�dha" => 0.11875471, > }, > }, > "a" => { > count => 1164, > trans => { > "eka" => 0.14900491, > "kaisai" => 0.08834675, > "haai" => 0.06774697, > "kaoi" => 0.05394308, > "kai" => 0.04981982, > "\(none\)" => 0.04400085, > "kaa" => 0.03726579, > "kae" => 0.03446450, > }, > }, > [/code] > > and I need to extract "confident" , "ashahvasahta" from the first > record, "consumers", "upabhaokahtaa" from the second record... > i.e. "word in english" and the "first word in the probable-translations" > > Thanks in advance > Eknath > _______________________________________________ > BangPypers mailing list > BangPypers@python.org > http://mail.python.org/mailman/listinfo/bangpypers > Since I hadn't had a chance to write a recursive descent parser, I took this opportunity to do a bit of an exercise. I have used a parser called pyparsing. 
# ---------- Begin Code ----------
# coding=utf-8
#
# Parse a Perl-style nested `key => value` listing with pyparsing and report,
# for every English word, its most probable translation.
#
# The pyparsing imports are done inside the functions that need them so the
# pure-Python helpers below stay importable even where pyparsing is absent.
import pprint
import sys

# Raw string: the sample contains backslash sequences such as `\.` and `\(`
# that are not valid Python escapes; the r-prefix keeps them byte-for-byte
# without invalid-escape warnings. The grammar skips whitespace, so the line
# layout of this literal does not affect the parse.
data = r'''
"confident" => {
    count => 4,
    trans => {
        "ashahvasahta" => 0.74918568,
        "atahmavaishahvaasa" => 0.09095465,
        "pahraaram\.nbha" => 0.06990729,
        "mailatae" => 0.02856427,
        "utanai" => 0.01929341,
        "anaa" => 0.01578552,
        "uthaanae" => 0.01403157,
        "jaitanae" => 0.01227762,
    },
},
"consumers" => {
    count => 4,
    trans => {
        "upabhaokahtaa" => 0.75144362,
        "upabhaokahtaaom\.n" => 0.12980166,
        "sauda\�\�\�dha" => 0.11875471,
    },
},
"a" => {
    count => 1164,
    trans => {
        "eka" => 0.14900491,
        "kaisai" => 0.08834675,
        "haai" => 0.06774697,
        "kaoi" => 0.05394308,
        "kai" => 0.04981982,
        "\(none\)" => 0.04400085,
        "kaa" => 0.03726579,
        "kae" => 0.03446450,
    },
}
'''


def build_grammar():
    """Build the pyparsing grammar for the listing format.

    Returns:
        A parser element matching a comma-separated list of
        ``key => value`` pairs, where a value is either a number or a
        brace-delimited nested list of pairs (a trailing comma before the
        closing brace is allowed).
    """
    from pyparsing import (Forward, Group, Literal, Optional, Word, alphas,
                           delimitedList, nums, quotedString, removeQuotes)

    dct = Forward()  # forward declaration: dictionaries nest recursively
    pair_op = Literal("=>").suppress()
    comma = Literal(",").suppress()
    beg_brace = Literal("{").suppress()
    end_brace = Literal("}").suppress()
    num = Word("0123456789.")
    # Strip the surrounding quotes so keys come out as the bare words rather
    # than as '"word"'. Work on a copy so the shared pyparsing `quotedString`
    # object is not modified for other users.
    quoted = quotedString.copy().setParseAction(removeQuotes)
    key = (Word(alphas + nums) ^ quoted).setResultsName("key")
    val = (num ^ dct).setResultsName("value")
    key_value_pair = Group(key + pair_op + val)
    key_value_pair_list = delimitedList(key_value_pair)
    dct << Group(beg_brace + key_value_pair_list + Optional(comma) + end_brace)
    return key_value_pair_list


def extract(result):
    """Convert parsed pyparsing results into plain Python data.

    Args:
        result: A ``ParseResults`` that is either a single ``key => value``
            pair (it carries the ``key``/``value`` result names) or a
            sequence of such pairs.

    Returns:
        For a single pair, a ``(key, value)`` tuple in which a nested
        dictionary value has been converted recursively; for a sequence of
        pairs, a ``dict`` built from those tuples. Numeric values are left
        as the strings the parser produced.
    """
    from pyparsing import ParseResults

    if 'key' in result.keys():
        value = result.value
        if isinstance(value, ParseResults):
            # Nested `{ ... }` block: recurse to turn it into a dict.
            value = extract(value)
        return (result.key, value)
    return dict(extract(elem) for elem in result)


def best_translation(trans):
    """Return the translation with the highest probability.

    Args:
        trans: Mapping of translated word to probability (a number or a
            numeric string).

    Returns:
        The word whose probability is strictly greatest; on ties the first
        one encountered wins. Returns ``''`` when ``trans`` is empty or no
        probability exceeds 0.0.
    """
    best_word, best_prob = '', 0.0
    for word, prob in trans.items():
        prob = float(prob)
        if prob > best_prob:
            best_word, best_prob = word, prob
    return best_word


def top_translations(extracted):
    """Map each English word to its most probable translation.

    Args:
        extracted: Mapping of English word to a record holding a ``'trans'``
            mapping, as produced by ``extract``.

    Returns:
        A ``dict`` of English word to the result of ``best_translation``.
    """
    return {
        english: best_translation(record['trans'])
        for english, record in extracted.items()
    }


def main():
    """Parse the sample data, dump it, and print the top translations."""
    parsed = build_grammar().parseString(data)
    extracted = extract(parsed)

    # print extracted data
    pprint.pprint(extracted, sys.stdout)

    # print the english word and its most probable translated word
    # (single-argument print(...) behaves the same on Python 2 and 3)
    print("\n\n============\nTranslations\n============\n")
    print(top_translations(extracted))


if __name__ == "__main__":
    main()
# ---------- End Code ----------
#
# Dhananjay
#
# --
# --------------------------------------------------------
# blog: http://blog.dhananjaynene.com
# twitter: http://twitter.com/dnene http://twitter.com/_pythonic
# _______________________________________________
# BangPypers mailing list
# BangPypers@python.org
# http://mail.python.org/mailman/listinfo/bangpypers