Finally I solved the problem, with some really minor things to tweak. I guess it's true that I had two problems working with regular expressions.
# Thank you all for your help. I really learned a lot on quite a difficult
# problem. Final Code:

# For text files in a directory...
# Analyzes a randomly organized UTF8 document with EINECS, CAS, Chemical,
# and Chemical Formula into a document structured as
# EINECS|CAS|Chemical|Chemical Formula.

import os
import codecs
import re

path = "C:\\text_samples\\text\\"
path2 = "C:\\text_samples\\text\\output\\"

EINECS = re.compile(r'^\d\d\d-\d\d\d-\d$')
CAS = re.compile(r'^\d*-\d\d-\d$')
# The posted pattern read "[A-Za- z0-9]" (a line-wrap artifact). The stray
# space turns "a- " into a reversed character range, which re.compile
# rejects, so the space is removed here.
# Note: the pattern needs at least four characters and is only anchored at
# the start of the token (it is used with .match()).
FORMULA = re.compile(r'([A-Z][A-Za-z0-9]+\.?[A-Za-z0-9]+/?[A-Za-z0-9]+)')

# A record is only considered complete once it holds at least this many
# tokens (EINECS, CAS, one name word, and one more token).
_MIN_RECORD_TOKENS = 4


def _collapse_name(product):
    """Return *product* with its chemical-name tokens joined into one field.

    Records shorter than ``_MIN_RECORD_TOKENS`` are returned unchanged.
    Otherwise the first two tokens are kept as they are; if the last token
    matches ``FORMULA`` it is kept as a separate trailing field and the
    tokens in between are joined with spaces, else every token from index 2
    onwards is joined into the name field.
    """
    if len(product) < _MIN_RECORD_TOKENS:
        return product
    if FORMULA.match(product[-1]):
        return product[:2] + [' '.join(product[2:-1]), product[-1]]
    return product[:2] + [' '.join(product[2:])]


def iter_elements(tokens):
    """Group a flat token stream into records and yield them one by one.

    A new record starts whenever a token matches ``EINECS`` and the record
    collected so far already has at least ``_MIN_RECORD_TOKENS`` tokens.
    Each yielded record is a list: the first two tokens, the joined name,
    and (when the last token matches ``FORMULA``) the formula.

    The trailing record is normalised exactly like the earlier ones, and
    nothing is yielded for an empty token stream.
    """
    product = []
    for tok in tokens:
        if EINECS.match(tok) and len(product) >= _MIN_RECORD_TOKENS:
            yield _collapse_name(product)
            product = []
        product.append(tok)
    if product:
        yield _collapse_name(product)


def main(src_dir=path, dst_dir=path2):
    """Convert every file in *src_dir* and write the result to *dst_dir*.

    Each input file is read as UTF-8, split on whitespace, and written out
    under the same file name with one ``|``-separated record per line,
    terminated by ``\\r\\n``.
    """
    for text in os.listdir(src_dir):
        input_text = os.path.join(src_dir, text)
        # os.listdir also returns sub-directories (the default output
        # directory lives inside the input directory); skip anything that
        # is not a regular file.
        if not os.path.isfile(input_text):
            continue
        output_text = os.path.join(dst_dir, text)
        with codecs.open(input_text, 'r', 'utf8') as infile:
            tokens = infile.read().split()
        with codecs.open(output_text, 'w', 'utf8') as outfile:
            for element in iter_elements(tokens):
                outfile.write('|'.join(element))
                outfile.write("\r\n")


if __name__ == "__main__":
    main()

# --
# http://mail.python.org/mailman/listinfo/python-list