Re: help with link parsing?

2010-12-22 Thread Colin J. Williams

On 21-Dec-10 12:22 PM, Jon Clements wrote:

import lxml.html
from urlparse import urlsplit

doc = lxml.html.parse('http://www.google.com')
print map(urlsplit, doc.xpath('//a/@href'))

[SplitResult(scheme='http', netloc='www.google.co.uk', path='/imghp',
query='hl=en&tab=wi', fragment=''), SplitResult(scheme='http',
netloc='video.google.co.uk', path='/', query='hl=en&tab=wv',
fragment=''), SplitResult(scheme='http', netloc='maps.google.co.uk',
path='/maps', query='hl=en&tab=wl', fragment=''),
SplitResult(scheme='http', netloc='news.google.co.uk', path='/nwshp',
query='hl=en&tab=wn', fragment=''), ...]


Jon,

What version of Python was used to run this?

Colin W.
--
http://mail.python.org/mailman/listinfo/python-list


Re: help with link parsing?

2010-12-22 Thread Jon Clements
On Dec 22, 4:24 pm, Colin J. Williams cjwilliam...@gmail.com
wrote:
 On 21-Dec-10 12:22 PM, Jon Clements wrote:

  import lxml
  from urlparse import urlsplit

  doc = lxml.html.parse('http://www.google.com')
  print map(urlsplit, doc.xpath('//a/@href'))

  [SplitResult(scheme='http', netloc='www.google.co.uk', path='/imghp',
  query='hl=en&tab=wi', fragment=''), SplitResult(scheme='http',
  netloc='video.google.co.uk', path='/', query='hl=en&tab=wv',
  fragment=''), SplitResult(scheme='http', netloc='maps.google.co.uk',
  path='/maps', query='hl=en&tab=wl', fragment=''),
  SplitResult(scheme='http', netloc='news.google.co.uk', path='/nwshp',
  query='hl=en&tab=wn', fragment=''), ...]

 Jon,

 What version of Python was used to run this?

 Colin W.

2.6.5 - the lxml library is not a standard module though and needs to
be installed.
-- 
http://mail.python.org/mailman/listinfo/python-list


Re: help with link parsing?

2010-12-21 Thread Jon Clements
On Dec 20, 7:14 pm, Littlefield, Tyler ty...@tysdomain.com wrote:
 Hello all,
 I have a question. I guess this worked pre 2.6; I don't remember the
 last time I used it, but it was a while ago, and now it's failing.
 Anyone mind looking at it and telling me what's going wrong? Also, is
 there a quick way to match on a certain site? like links from google.com
 and only output those?
 #!/usr/bin/env python

 #This program is free software: you can redistribute it and/or modify it
 under the terms of the GNU General Public License as published
 #by the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 #This program is distributed in the hope that it will be useful, but
 WITHOUT ANY WARRANTY; without even the implied warranty of
 #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 General Public License for more details.
 #
 #You should have received a copy of the GNU General Public License along
 with this program. If not, see
 #<http://www.gnu.org/licenses/>.

 
 This script will parse out all the links in an html document and write
 them to a textfile.
 
 import sys,optparse
 import htmllib,formatter

 #program class declarations:
 class Links(htmllib.HTMLParser):
      def __init__(self,formatter):
          htmllib.HTMLParser.__init__(self, formatter)
          self.links=[]
      def start_a(self, attrs):
       if (len(attrs)>0):
              for a in attrs:
               if a[0]=="href":
                      self.links.append(a[1])
                      print a[1]
                      break

 def main(argv):
      if (len(argv)!=3):
          print(Error:\n+argv[0]+ input output.\nParses input
 for all links and saves them to output.)
          return 1
      lcount=0
      format=formatter.NullFormatter()
      html=Links(format)
      print Retrieving data:
      page=open(argv[1],r)
      print Feeding data to parser:
      html.feed(page.read())
      page.close()
      print Writing links:
      output=open(argv[2],w)
      for i in (html.links):
          output.write(i+\n)
          lcount+=1
      output.close()
      print(Wrote +str(lcount)+ links to +argv[2]+.);
      print(done.)

 if (__name__ == __main__):
      #we call the main function passing a list of args, and exit with
 the return code passed back.
      sys.exit(main(sys.argv))

 --

 Thanks,
 Ty

This doesn't answer your original question, but excluding the command
line handling, how's this do you?:

import lxml.html
from urlparse import urlsplit

doc = lxml.html.parse('http://www.google.com')
print map(urlsplit, doc.xpath('//a/@href'))

[SplitResult(scheme='http', netloc='www.google.co.uk', path='/imghp',
query='hl=en&tab=wi', fragment=''), SplitResult(scheme='http',
netloc='video.google.co.uk', path='/', query='hl=en&tab=wv',
fragment=''), SplitResult(scheme='http', netloc='maps.google.co.uk',
path='/maps', query='hl=en&tab=wl', fragment=''),
SplitResult(scheme='http', netloc='news.google.co.uk', path='/nwshp',
query='hl=en&tab=wn', fragment=''), ...]

Much nicer IMHO, plus the lxml.html has iterlinks() and other
convenience functions for handling HTML.

hth

Jon.

-- 
http://mail.python.org/mailman/listinfo/python-list


help with link parsing?

2010-12-20 Thread Littlefield, Tyler

Hello all,
I have a question. I guess this worked pre 2.6; I don't remember the 
last time I used it, but it was a while ago, and now it's failing. 
Anyone mind looking at it and telling me what's going wrong? Also, is 
there a quick way to match on a certain site? like links from google.com 
and only output those?

#!/usr/bin/env python

#This program is free software: you can redistribute it and/or modify it 
under the terms of the GNU General Public License as published
#by the Free Software Foundation, either version 3 of the License, or 
(at your option) any later version.


#This program is distributed in the hope that it will be useful, but 
WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 
General Public License for more details.

#
#You should have received a copy of the GNU General Public License along 
with this program. If not, see

#<http://www.gnu.org/licenses/>.


This script will parse out all the links in an html document and write 
them to a textfile.


import sys,optparse
import htmllib,formatter

#program class declarations:
class Links(htmllib.HTMLParser):
def __init__(self,formatter):
htmllib.HTMLParser.__init__(self, formatter)
self.links=[]
def start_a(self, attrs):
if (len(attrs)0):
for a in attrs:
if a[0]==href:
self.links.append(a[1])
print a[1]
break

def main(argv):
if (len(argv)!=3):
print(Error:\n+argv[0]+ input output.\nParses input 
for all links and saves them to output.)

return 1
lcount=0
format=formatter.NullFormatter()
html=Links(format)
print Retrieving data:
page=open(argv[1],r)
print Feeding data to parser:
html.feed(page.read())
page.close()
print Writing links:
output=open(argv[2],w)
for i in (html.links):
output.write(i+\n)
lcount+=1
output.close()
print(Wrote +str(lcount)+ links to +argv[2]+.);
print(done.)

if __name__ == "__main__":
    # Run main with the command-line args and exit with its return code.
    sys.exit(main(sys.argv))

--

Thanks,
Ty

--
http://mail.python.org/mailman/listinfo/python-list


Re: help with link parsing?

2010-12-20 Thread Chris Rebert
On Mon, Dec 20, 2010 at 11:14 AM, Littlefield, Tyler
ty...@tysdomain.com wrote:
 Hello all,
 I have a question. I guess this worked pre 2.6; I don't remember the last
 time I used it, but it was a while ago, and now it's failing. Anyone mind
 looking at it and telling me what's going wrong?

Please describe /exactly/ how it is failing for you, including the
full exception traceback (if any).
The script seems to work fine for me under both Python v2.6.6 and v2.7.1.

Cheers,
Chris
--
http://blog.rebertia.com
-- 
http://mail.python.org/mailman/listinfo/python-list