Hi... I've got a short test app that I'm playing with. The goal is to get data off the page in question.
Basically, I should be able to get a list of "tr" nodes and then iterate over/parse them. I'm missing something: I think I can get a single node, but I can't figure out how to display the contents of the node, nor how to get the list of the "tr" nodes. My test code is: -------------------------------- #!/usr/bin/python #test python script import re import libxml2dom import urllib import urllib2 import sys, string from mechanize import Browser import mechanize #import tidy import os.path import cookielib from libxml2dom import Node from libxml2dom import NodeList ######################## # # Parse pricegrabber.com ######################## # datafile tfile = open("price.dat", 'wr+') efile = open("price_err.dat", 'wr+') urlopen = urllib2.urlopen ##cj = urllib2.cookielib.LWPCookieJar() Request = urllib2.Request br = Browser() user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' values1 = {'name' : 'Michael Foord', 'location' : 'Northampton', 'language' : 'Python' } headers = { 'User-Agent' : user_agent } url ="http://www.pricegrabber.com/rating_summary.php/page=1" #======================================= if __name__ == "__main__": # main app txdata = None #---------------------------- # get the kentucky test pages #br.set_cookiejar(cj) br.set_handle_redirect(True) br.set_handle_referer(True) br.set_handle_robots(False) br.addheaders = [('User-Agent', 'Firefox')] br.open(url) #cj.save(COOKIEFILE) # resave cookies res = br.response() # this is a copy of response s = res.read() # s contains HTML not XML text d = libxml2dom.parseString(s, html=1) print "d = d",d #get the input/text dialogs #tn1 = "//[EMAIL PROTECTED]'main_content']/form[1]/input[position()=1]/@name" t1 = "/html/body/[EMAIL PROTECTED]'pgSiteContainer']/[EMAIL PROTECTED]'pgPageContent']/table[2]/tbo dy" tr = "/html/body/[EMAIL PROTECTED]'pgSiteContainer']/[EMAIL PROTECTED]'pgPageContent']/table[2]/tbo dy/tr[4]" tr_=d.xpath(tr) print "len =",tr_[1].nodeValue print "fin" 
----------------------------------------------- My issue appears to be related to the last "tbody", or tbody/tr[4]. If I leave off the tbody, I can display data, as tr_ is an array with data. With the "tbody", it appears that the tr_ array is not defined, or it has no data. However, I can use the DOM tool in Firefox to observe that the "tbody" is there. So, what am I missing? Thoughts/comments are most welcome. Also, I'm willing to send a small amount via PayPal! -bruce -- http://mail.python.org/mailman/listinfo/python-list