Harvesting data from websites using WebKit and PyQt4 - part 3
Check out the new site at https://rkblog.dev.
10 November 2009
Comments
In this tutorial we will make a second app that gets the final data (title and URL) for the ad URLs. The application has to load the URLs from the database, open each of them, and then update the corresponding entry.
Ad URL Parser
The GUI consists of the following widgets:
- "Start" (startButton, a pushButton widget), which starts loading the URLs
- progressBar - how many URLs done
- webView - widget for loading pages
pyuic4 parser.ui > parser.py
And create run.py with the skeleton code:
# -*- coding: utf-8 -*-
import sys
from PyQt4 import QtCore, QtGui, QtWebKit
from PyQt4.QtSql import *
from parser import Ui_parserWindow
class ParseAds(QtGui.QMainWindow):
    """
    Skeleton of the parser window: builds the UI, configures the webView
    and opens the SQLite database
    """
    def __init__(self, parent=None):
        super(ParseAds, self).__init__(parent)
        self.ui = Ui_parserWindow()
        self.ui.setupUi(self)

        # switch off images, popup windows and plugins in the webView
        web_settings = self.ui.webView.settings()
        for attribute in (QtWebKit.QWebSettings.AutoLoadImages,
                          QtWebKit.QWebSettings.JavascriptCanOpenWindows,
                          QtWebKit.QWebSettings.PluginsEnabled):
            web_settings.setAttribute(attribute, False)

        # open the SQLite database file "./ads" and report the result
        self.db = QSqlDatabase.addDatabase("QSQLITE")
        self.db.setDatabaseName("./ads")
        self.dbstatus = self.db.open()
        print('DB ok' if self.dbstatus else 'DB error')
if __name__ == "__main__":
    # create the Qt application first, then the main window
    application = QtGui.QApplication(sys.argv)
    window = ParseAds()
    window.show()
    # run the event loop and use its result as the process exit code
    exit_code = application.exec_()
    sys.exit(exit_code)
# -*- coding: utf-8 -*-
import sys
from urllib import unquote
from PyQt4 import QtCore, QtGui, QtWebKit
from PyQt4.QtSql import *
from parser import Ui_parserWindow
class FakeBrowser(QtWebKit.QWebPage):
    """
    QWebPage that reports a custom userAgent, to be set on the QWebView
    """
    # returned for every URL; the suffix marks this client as a bot
    USER_AGENT = 'Opera/9.64 (X11; Linux x86_64; U; en) Presto/2.1.1 AdsSeeker-not-a-human'

    def __init__(self, parent=None):
        QtWebKit.QWebPage.__init__(self, parent)

    def userAgentForUrl(self, url):
        """
        Return the same custom userAgent whatever the requested url is
        """
        return self.USER_AGENT
class ParseAds(QtGui.QMainWindow):
    """
    Main window of the ad URL parser

    Reads every row of ads_data with is_parsed = 0, loads its link in the
    webView one at a time and stores the title and URL of the loaded page
    back in the same row.
    """
    def __init__(self, parent=None):
        """
        Build the UI, configure the webView, open the database and queue
        the links that still have to be parsed
        """
        super(ParseAds, self).__init__(parent)
        self.ui = Ui_parserWindow()
        self.ui.setupUi(self)

        # Install the custom page BEFORE changing the settings: the view's
        # settings object belongs to its current page, so settings changed
        # before setPage() would be lost together with the replaced page.
        fb = FakeBrowser(self)
        self.ui.webView.setPage(fb)

        s = self.ui.webView.settings()
        s.setAttribute(QtWebKit.QWebSettings.AutoLoadImages, False)
        s.setAttribute(QtWebKit.QWebSettings.JavascriptCanOpenWindows, False)
        s.setAttribute(QtWebKit.QWebSettings.PluginsEnabled, False)

        QtCore.QObject.connect(self.ui.startButton, QtCore.SIGNAL("clicked()"), self.start)
        QtCore.QObject.connect(self.ui.webView, QtCore.SIGNAL("loadFinished (bool)"), self.loadFinished)
        QtCore.QObject.connect(self.ui.webView, QtCore.SIGNAL("loadProgress (int)"), self.loadProgress)

        self.db = QSqlDatabase.addDatabase("QSQLITE")
        self.db.setDatabaseName("./ads")
        self.dbstatus = self.db.open()
        if self.dbstatus:
            print('DB ok')
        else:
            print('DB error')

        # get URIs to parse
        self.URIs = []
        self.URIs_count = 0
        # entry currently being loaded; False while nothing is in progress
        self.nexturl = False
        if self.dbstatus:
            query = QSqlQuery(self.db)
            if query.exec_("SELECT id, link FROM ads_data WHERE is_parsed = 0"):
                while query.next():
                    ad_id = query.value(0).toInt()[0]
                    ad_link = query.value(1).toString()
                    self.URIs.append({'ad_id': ad_id, 'ad_link': unquote(str(ad_link))})
        self.URIs_count = len(self.URIs)
        print('ToDo %s URIs' % str(self.URIs_count))

    def loadProgress(self, progress):
        """
        Print the progress of page load
        """
        print(progress)

    def start(self):
        """
        Start loading the web pages
        """
        self.ui.startButton.setEnabled(False)
        self._loadNext()

    def loadFinished(self):
        """
        Parse loaded page, update DB entry and load next page if available
        """
        if not self.nexturl:
            # a load we did not request (before start() or after the queue
            # ran empty) - there is no DB row to update
            return

        view = self.ui.webView
        title = unicode(view.title())
        url = unicode(view.url().toString())
        print('TITLE: %s' % title)

        # Title and URL come from the loaded page, so they are bound as
        # values instead of being pasted into the SQL text: a quote in the
        # title can no longer break (or alter) the statement.
        query = QSqlQuery(self.db)
        query.prepare("UPDATE ads_data SET dest_title = ?, dest_url = ?, is_parsed = 1 WHERE id = ?")
        query.addBindValue(title)
        query.addBindValue(url)
        query.addBindValue(self.nexturl['ad_id'])
        if query.exec_():
            print(u'Update')
        else:
            print('Update Error')
            print(query.lastQuery())
            print(query.lastError().text())
        print('')

        # set the progress bar of pages loaded (integer percent)
        done = self.URIs_count - len(self.URIs)
        if self.URIs_count:
            self.ui.progressBar.setValue(done * 100 // self.URIs_count)

        # load next
        self._loadNext()

    def _loadNext(self):
        """
        Take the next queued entry and load it in the webView, if any is left
        """
        self.nexturl = self.__getNextUrl()
        if self.nexturl:
            print('LOAD: %s' % self.nexturl['ad_link'])
            self.ui.webView.load(QtCore.QUrl(self.nexturl['ad_link']))
        else:
            print('No URIs')

    def __getNextUrl(self):
        """
        return next URL to parse (taken from the end of the list),
        or False when the list is empty
        """
        if self.URIs:
            return self.URIs.pop()
        return False
if __name__ == "__main__":
    # create the Qt application first, then the parser window
    application = QtGui.QApplication(sys.argv)
    window = ParseAds()
    window.show()
    # run the event loop and use its result as the process exit code
    exit_code = application.exec_()
    sys.exit(exit_code)
- I've created a class called FakeBrowser, which inherits from QWebPage and overrides the userAgentForUrl method. In __init__ I've used my class instead of the standard QWebPage. By doing so I've changed the USER AGENT of the browser (it's good to mark this app as a bot, as no one likes fraudulent ad clicks).
- In __init__ we also fetch the URIs to load and put them in self.URIs
- Method __getNextUrl returns (last) element of that list if it exists
- Method loadFinished runs after a page has loaded. We get the final page title and URL, and update the DB row (using the ID from self.nexturl). Note: you may not get a title for every ad (the ad URL may be invalid, inactive, etc.)
sqlite> SELECT DISTINCT dest_title,dest_url FROM ads_data WHERE is_parsed = 1;
|http://s0b.bluestreak.com/ix.e?hy
Białe Szaleństwo|http://www.gutekfilm.pl/bialeszalenstwo/
Intel Polsat Racing Tour - PIT STOP www.f1.pl|http://f1.sport.pl/intel-polsat-racing-tour.456.0.html
Książka "Django. Ćwiczenia praktyczne" -- Wydawnictwo Helion, księgarnia helion.pl|http://helion.pl/ksiazki/cwdjan.htm
Oferta | PKO Bank Polski|http://www.pkobp.pl/index.php/id=oferta/grupa=4/id_product=000000000000345/podgrupa=0/section=indy/idk=2/ver=7/wid=38180040140317
Serafina|http://www.gutekfilm.pl/serafina/
Ubezpieczenia Allianz Direct: KALKULATOR OC, AC, Turystyczne | Allianz Direct|https://www.allianzdirect.pl/ubezpieczenia/start.html
megiteam.pl|http://www.megiteam.pl/
Source Code
RkBlog
Check out the new site at https://rkblog.dev.
Comment article