Initially I thought you were missing scrolling, but it looks like you are only interested in getting the content, either as text or as HTML. I am not certain, but some websites respond really well to particular clients. You can play with the client (user-agent) string and see how the site behaves.
On Mon, Oct 22, 2012 at 1:11 AM, flyer <[email protected]> wrote: > I wrote a python script using QtWebKit to get all page info including info > generated by AJAX requests. I run the following code on CentOS Server and do > the following settings: > >> >> $ Xvfb :100 -screen 0 9000x15000x24 & >> >> $ export DISPLAY=:100 > > > The following code worked, however, it could only get one-screen info of the > web page, namely, getting different amount of info according to the screen > resolution. I could only get part of the info of the webpage. > > I have tried using selenium and I can get all web info if I set large screen > resolution using Xvfb . > > Please give me some tips about how to solve the problem and any manual for > QtWebKit is also appreciated because I can't find more materials about it. > > The following is my code: > >>> #!/usr/bin/env python >>> >>> #coding: utf-8 >>> >>> >>> >>> import sys >>> >>> import time >>> >>> >>> from PySide.QtCore import QUrl, SIGNAL >>> >>> from PySide.QtGui import QApplication >>> >>> from PySide.QtWebKit import QWebPage, QWebView, QWebSettings >>> >>> from PySide.QtNetwork import QNetworkAccessManager, QNetworkRequest >>> >>> >>> reload(sys) >>> >>> sys.setdefaultencoding('utf-8') >>> >>> >>> fn_log = 'url_dd.txt' >>> >>> fp_log = open(fn_log, 'ab+') >>> >>> >>> class WebPage(QWebPage): >>> >>> >>> def __init__(self, logger=None, parent=None): >>> >>> super(WebPage, self).__init__(parent) >>> >>> >>> >>> def javaScriptConsoleMessage(self, message, lineNumber, sourceID): >>> >>> sys.stderr.write('Javascritp error at line number %d\n' % >>> (lineNumber)) >>> >>> sys.stderr.write('%s\n' % (message, )) >>> >>> sys.stderr.write('Source ID: %s\n' % (sourceID, )) >>> >>> >>> >>> class Crawler(QApplication): >>> >>> >>> >>> def __init__(self, url): >>> >>> super(Crawler, self).__init__(sys.argv) >>> >>> >>> >>> self.url = url >>> >>> self.web_view = QWebView() >>> >>> self.web_page = WebPage() >>> >>> 
self.web_view.setPage(self.web_page) >>> >>> self.web_frame = self.web_page.mainFrame() >>> >>> >>> self.network = NetworkAccessManager() >>> >>> self.web_page.setNetworkAccessManager(self.network) >>> >>> >>> >>> self.settings = self.web_page.settings().globalSettings() >>> >>> self.settings.setAttribute(QWebSettings.AutoLoadImages, False) >>> >>> self.settings.setAttribute(QWebSettings.PluginsEnabled, False) >>> >>> QWebSettings.clearMemoryCaches() >>> >>> >>> self.web_view.resize(1024, 9000) >>> >>> >>> self.connect(self.web_page, SIGNAL('loadFinished(bool)'), >>> self.loadFinished) >>> >>> >>> print 'Before loading' >>> >>> self.web_view.load(QUrl(self.url)) >>> >>> print 'After loading' >>> >>> >>> def loadFinished(self, ok): >>> >>> print 'Start loadFinished()' >>> >>> >>> print 'Start writing' >>> >>> with open('content_dd.txt', 'ab+') as fp: >>> >>> fp.write(self.web_frame.toHtml().toUtf8()) >>> >>> print 'End writing' >>> >>> >>> >>> print 'End loadFinished()' >>> >>> >>> try: >>> >>> self.quit() >>> >>> except Exception, e: >>> >>> print 'FATAL ERROR: %s' % (str(e), ) >>> >>> >>> >>> class NetworkAccessManager(QNetworkAccessManager): >>> >>> >>> >>> def __init__(self): >>> >>> super(NetworkAccessManager, self).__init__() >>> >>> # QNetworkAccessManager.__init__(self) >>> >>> self.connect(self, SIGNAL('finished (QNetworkReply *)'), >>> self.finishd) >>> >>> >>> >>> def createRequest(self, operation, request, data): >>> >>> # url = request.url().toString() >>> >>> self.setNetworkAccessible(self.Accessible) >>> >>> >>> >>> return QNetworkAccessManager.createRequest(self, operation, >>> request, data) >>> >>> >>> def finishd(self, reply): >>> >>> print 'In NetworkAccessManager finishd' >>> >>> url = str(reply.url().toString()) >>> >>> >>> >>> log = '%s: %s\n' % (time.ctime(), url) >>> >>> fp_log.write(log) >>> >>> >>> print url >>> >>> >>> >>> if __name__ == '__main__': >>> >>> # url = >>> 'http://product.dangdang.com/product.aspx?product_id=22822333' >>> >>> 
url = 'http://product.dangdang.com/product.aspx?product_id=22848707' >>> >>> crawler = Crawler(url) >>> >>> sys.exit(crawler.exec_()) >> >> > > -- > 宠辱不惊,闲看庭前花开花落;去留无意,漫随天边云卷云舒。 > > > > _______________________________________________ > PySide mailing list > [email protected] > http://lists.qt-project.org/mailman/listinfo/pyside > _______________________________________________ PySide mailing list [email protected] http://lists.qt-project.org/mailman/listinfo/pyside
