# parse_url11.py # devpla...@gmail.com # 2010-12 (Dec)-27 # A brute force ugly hack from a novice programmer.
# You're welcome to use the code, clean it up, make positive suggestions
# for improvement.
"""Parse a URL string into a list of tokens using a generator."""

# Delimiter tokens recognised inside a URL.  Multi-character entries
# ("//") are matched greedily: the longest entry matching at the
# current position wins, so "///" tokenizes as "//" then "/".
special_item = [";", "?", ":", "@", "=", "&", "#", ".", "/", "//"]

# Drop URLs with obviously bad formatting - NOT IMPLEMENTED.
# NOTE(review): only single characters ("|") can ever match at a token
# start; the multi-character entries ("localhost", "..", "///") are
# placeholders for a future substring check - confirm intent.
drop_item = ["|", "localhost", "..", "///"]
ignore_urls_containing = ["php", "cgi"]


def url_parser_generator(url):
    """Yield successive tokens of *url*.

    A token is either a maximal run of characters not listed in
    ``special_item``, or the longest prefix at the current position
    that appears in ``special_item``.

    Raises:
        NotImplementedError: when a character from ``drop_item`` (or,
            theoretically, ``ignore_urls_containing``) starts a token;
            drop/ignore handling is intentionally unimplemented.
    """
    n = len(url)
    i = 0
    while i < n:
        ch = url[i]
        if ch in special_item:
            # Greedily extend while the growing prefix is still a
            # known special item (turns "/" + "/" into one "//").
            j = i + 1
            while j < n and url[i:j + 1] in special_item:
                j += 1
            yield url[i:j]
            i = j
        elif ch in drop_item:
            # Fixed: the original raised NotImplemented, which is a
            # singleton (not an exception) and itself raises TypeError.
            raise NotImplementedError(
                "Processing items in the drop_item list is not "
                "implemented.", ch)
        elif ch in ignore_urls_containing:
            # Dead in practice: a single character never equals
            # "php"/"cgi".  Kept to preserve the original structure.
            raise NotImplementedError(
                "Processing items in the ignore_urls_containing list "
                "is not implemented.", ch)
        else:
            # Maximal run of ordinary (non-special) characters.
            j = i + 1
            while j < n and url[j] not in special_item:
                j += 1
            yield url[i:j]
            i = j


def parse(url):
    """Return the list of tokens produced by url_parser_generator(url)."""
    return list(url_parser_generator(url))


def test():
    """Print the token list for each enabled sample URL.

    Returns the token list of the LAST URL tested.  (The original
    returned from inside the loop after the first URL - an
    early-return bug - and shadowed the ``test`` function name with
    its tuple unpacking.)
    """
    urls = {
        0: (True, "http://docs.python.org/dev/library/stdtypes.html?highlight=partition#str.partition"),
        1: (True, "/http:///docs.python.org/dev/library/stdtypes.html?highlight=partition#str.partition"),
        2: (True, "//http:///docs.python.org/dev/library/stdtypes.html?highlight=partition#str.partition"),
        3: (True, "///http:///docs.python.org/dev/library/stdtypes.html?highlight=partition#str.partition"),
        4: (True, "/http:///docs.python.org/dev/library/stdtypes.html?highlight=partition#str.partition/"),
        5: (True, "//http:///docs.python.org/dev/library/stdtypes.html?highlight=partition#str.partition//"),
        6: (True, "///http:///docs.python.org/dev/library/stdtypes.html?highlight=partition#str.partition///"),
        7: (True, "/#/http:///#docs.python..org/dev//////library/stdtypes./html??highlight=p=partition#str.partition///"),
        8: (True, "httpdocspythonorgdevlibrarystdtypeshtmlhighlightpartitionstrpartition"),
        9: (True, "httpdocs.pythonorgdevlibrarystdtypeshtmlhighlightpartitionstrpartition"),
        10: (True, ":httpdocspythonorgdevlibrarystdtypeshtmlhighlightpartitionstrpartition"),
        11: (True, "httpdocspythonorgdevlibrarystdtypeshtmlhighlightpartitionstrpartition/"),
        12: (True, "///:;#.???"),   # only special_items
        13: (True, "///a:;#.???"),  # only 1 non special_item
        14: (True, "///:;#.???a"),  # only 1 non special_item
        15: (True, "a///:;#.???"),  # only 1 non special_item
        16: (True, "http://docs.python.php"),
        17: (True, "http://php.python.org"),
        18: (True, "http://www.localhost.com"),
    }
    result = []
    # Test various combinations of special_item characters in URLs.
    for url_num in sorted(urls):
        enabled, url = urls[url_num]
        if not enabled:  # allow individual cases to be switched off
            continue
        result = parse(url)
        print()
        print("url:", url_num, " ", url)
        print(result)
    return result


if __name__ == "__main__":
    test()