Hello Tutors, I've used Pyparsing to write a Noun Phrase Extractor that extracts noun phrases from a Part-of-Speech tagged file. My question is: how can I mark the phrases in place instead of extracting them? For example, in the sentence:
The DET big ADJ woman NOUN saw VERB and CONJ greeted VERB the DET green ADJ man NOUN both "The big woman" and "the green man" are noun phrases. I need the results to look like this: <NP> The big woman </NP> saw and greeted <NP>the green man </NP> # This script is far from complete. It works only with Arabic # This is a parser for NP chunks # This depends on non-vocalized texts and tags. from pyparsing import * CASE = oneOf("CASE_INDEF_ACC CASE_INDEF_GEN CASE_INDEF_NOM") DEMONSTRATIVE = oneOf("DE DEM_PRON_F DEM_PRON_FD DEM_PRON_FS DEM_PRON_MD DEM_PRON_MP DEM_PRON_MS") NOUN_SUFFIX = oneOf("NSUFF_FEM_DU_ACC NSUFF_FEM_DU_GEN NSUFF_FEM_DU_NOM NSUFF_FEM_PL NSUFF_FEM_SG NSUFF_MASC_DU_ACC NSUFF_MASC_DU_GEN NSUFF_MASC_DU_NOM NSUFF_MASC_PL_ACC NSUFF_MASC_PL_GEN NSUFF_MASC_PL_NOM") NOUN_SUFFIX_IDAFA = oneOf("NSUFF_FEM_DU_ACC_POSS NSUFF_FEM_DU_GEN_POSS NSUFF_FEM_DU_NOM_POSS NSUFF_MASC_DU_ACC_POSS NSUFF_MASC_DU_GEN_POSS NSUFF_MASC_DU_NOM_POSS NSUFF_MASC_PL_ACC_POSS NSUFF_MASC_PL_GEN_POSS NSUFF_MASC_PL_NOM_POSS") POSSESSIVE_PRONOUN = oneOf("POSS_PRON_1P POSS_PRON_1S POSS_PRON_2FS POSS_PRON_2MP POSS_PRON_2MS POSS_PRON_3D POSS_PRON_3FP POSS_PRON_3FS POSS_PRON_3MP POSS_PRON_3MS") PRONOUN = oneOf("PRON_1P PRON_1S PRON_2FS PRON_2MP PRON_2MS PRON_3D PRON_3FP PRON_3FS" "PRON_3MP PRON_3MS") lexical = Word(alphas+"$"+"<"+">"+"|"+"}"+"{") NOUN = Literal("NOUN") DET = Literal("DET") ADJ = Literal("ADJ") NOMINAL = lexical + NOUN + Optional(lexical + NOUN_SUFFIX) import sys infile = open(sys.argv[1]).read() # This is for the definite NP made up of a noun + an adjective # An example is Alrjl AlmHtrm AL = Literal("Al") DEFINITE_NOUN = AL + DET + lexical + NOUN + Optional(NOUN_SUFFIX) DEFINITE_ADJECTIVE = AL + DET + lexical + ADJ + Optional(NOUN_SUFFIX) NOUN_ADJ = DEFINITE_NOUN + ZeroOrMore(DEFINITE_ADJECTIVE) DEMON_NOUN_ADJ = Optional(DEMONSTRATIVE) + NOUN_ADJ # Now for the indefinite NP # Example: rjl mHtrm INDEFINITE_NOUN = lexical + NOUN + Optional(NOUN_SUFFIX) INDEFINITE_ADJECTIVE = 
lexical + ADJ + Optional(NOUN_SUFFIX) INDEF_NOUN_ADJ = INDEFINITE_NOUN + ZeroOrMore(DEFINITE_ADJECTIVE) pattern3 = OneOrMore(NOMINAL) + lexical + DET + NOMINAL #pattern2 = NP = pattern3 | DEMON_NOUN_ADJ | INDEF_NOUN_ADJ # get the file import sys infile = open(sys.argv[1]).read() tokens = NP.scanString(infile) for x in tokens: for i,v in enumerate(x[0]): if i%2 == 0: print v, print "\n" -- لا أعرف مظلوما تواطأ الناس علي هضمه ولا زهدوا في إنصافه كالحقيقة.....محمد الغزالي "No victim has ever been more repressed and alienated than the truth" Emad Soliman Nawfal Indiana University, Bloomington --------------------------------------------------------
_______________________________________________ Tutor maillist - Tutor@python.org http://mail.python.org/mailman/listinfo/tutor