Harvesting data from websites using WebKit and PyQt4 - part 3

In this tutorial we will make a second app that collects the final data for the ad URLs (the destination page title and URL). The application has to load the URLs from the database and then update each entry.

Ad URL Parser

The GUI looks like this:
adsp1
  • "Start" (startButton, pushButton widget), which starts URLs loading
  • progressBar - how many URLs done
  • webView - widget for loading pages
We generate the Python class from the .ui file:
pyuic4 parser.ui > parser.py
And create run.py with the skeleton code:
# -*- coding: utf-8 -*-
import sys

from PyQt4 import QtCore, QtGui, QtWebKit
from PyQt4.QtSql import *
from parser import Ui_parserWindow

class ParseAds(QtGui.QMainWindow):
	"""
	Main window of the ad URL parser (skeleton: GUI and database setup only)
	"""
	def __init__(self, parent=None):
		"""
		Build the GUI, restrict the web view and open the SQLite database
		
		parent -- optional parent widget, handed to the QMainWindow initialiser
		"""
		# initialise the class we really inherit from (QMainWindow, not QWidget)
		QtGui.QMainWindow.__init__(self, parent)
		self.ui = Ui_parserWindow()
		self.ui.setupUi(self)
		
		# switch off images, JavaScript-opened windows and plugins in the web view
		s = self.ui.webView.settings()
		s.setAttribute(QtWebKit.QWebSettings.AutoLoadImages, False)
		s.setAttribute(QtWebKit.QWebSettings.JavascriptCanOpenWindows, False)
		s.setAttribute(QtWebKit.QWebSettings.PluginsEnabled, False)
		
		# SQLite database stored in the file "./ads"
		self.db = QSqlDatabase.addDatabase("QSQLITE")
		self.db.setDatabaseName("./ads")
		# keep the result of open() so it can be checked later
		self.dbstatus = self.db.open()
		if self.dbstatus:
			print('DB ok')
		else:
			print('DB error')
		

if __name__ == "__main__":
	# create the Qt application, show the main window and hand control
	# to the Qt event loop; its return code becomes the process exit code
	application = QtGui.QApplication(sys.argv)
	window = ParseAds()
	window.show()
	exit_code = application.exec_()
	sys.exit(exit_code)
This will start the app, but it won't do anything yet. We have to add the web page loader:
# -*- coding: utf-8 -*-
import sys
from urllib import unquote

from PyQt4 import QtCore, QtGui, QtWebKit
from PyQt4.QtSql import *
from parser import Ui_parserWindow

class FakeBrowser(QtWebKit.QWebPage):
	"""
	QWebPage that reports a fixed, custom user agent for every request
	"""
	# the user agent string handed out for all URLs
	_USER_AGENT = 'Opera/9.64 (X11; Linux x86_64; U; en) Presto/2.1.1 AdsSeeker-not-a-human'
	
	def __init__(self, parent=None):
		QtWebKit.QWebPage.__init__(self, parent)
	
	def userAgentForUrl(self, url):
		"""
		Return the custom user agent, regardless of the URL asked about
		"""
		return self._USER_AGENT

class ParseAds(QtGui.QMainWindow):
	"""
	Main window of the ad URL parser
	
	Loads every not yet parsed ad link from the ads_data table in the web
	view, one after another, and stores the title and URL of the page that
	was finally loaded back in the same row.
	"""
	def __init__(self, parent=None):
		"""
		Build the GUI, connect the signals, open the DB and read the URIs to parse
		
		parent -- optional parent widget, handed to the QMainWindow initialiser
		"""
		# initialise the class we really inherit from (QMainWindow, not QWidget)
		QtGui.QMainWindow.__init__(self, parent)
		self.ui = Ui_parserWindow()
		self.ui.setupUi(self)
		
		# Install the custom page first: webView.settings() returns the settings
		# of the current page, so configuring them before setPage() would change
		# the page that is about to be replaced.
		fb = FakeBrowser(self)
		self.ui.webView.setPage(fb)
		s = self.ui.webView.settings()
		s.setAttribute(QtWebKit.QWebSettings.AutoLoadImages, False)
		s.setAttribute(QtWebKit.QWebSettings.JavascriptCanOpenWindows, False)
		s.setAttribute(QtWebKit.QWebSettings.PluginsEnabled, False)
		
		QtCore.QObject.connect(self.ui.startButton,QtCore.SIGNAL("clicked()"), self.start)
		QtCore.QObject.connect(self.ui.webView,QtCore.SIGNAL("loadFinished (bool)"), self.loadFinished)
		QtCore.QObject.connect(self.ui.webView,QtCore.SIGNAL("loadProgress (int)"), self.loadProgress)
		
		self.db = QSqlDatabase.addDatabase("QSQLITE")
		self.db.setDatabaseName("./ads")
		self.dbstatus = self.db.open()
		if self.dbstatus:
			print('DB ok')
		else:
			print('DB error')
		
		# entry that is currently being loaded; False while nothing is loading
		self.nexturl = False
		
		# get URIs to parse
		self.URIs = []
		self.URIs_count = 0
		if self.dbstatus:
			query = QSqlQuery(self.db)
			if query.exec_("SELECT id, link FROM ads_data WHERE is_parsed = 0"):
				while query.next():
					ad_id = query.value(0).toInt()[0]
					ad_link = query.value(1).toString()
					# NOTE(review): str() on the QString presumably fails for
					# non-ASCII links - confirm against the stored data
					self.URIs.append({'ad_id': ad_id, 'ad_link': unquote(str(ad_link))})
			self.URIs_count = len(self.URIs)
			print('ToDo %s URIs' % str(self.URIs_count))
	
	def loadProgress(self, progress):
		"""
		Print the progress of page load
		"""
		print(progress)
	
	def start(self):
		"""
		Start loading the web pages
		"""
		self.ui.startButton.setEnabled(False)
		self.__loadNextUrl()
	
	def loadFinished(self):
		"""
		Parse loaded page, update DB entry and load next page if available
		"""
		# a finished load that does not belong to one of our entries
		# has no row to update
		if not self.nexturl:
			return
		
		view = self.ui.webView
		title = unicode(view.title())
		url = unicode(view.url().toString())
		print('TITLE: %s' % title)
		
		# The title and URL come from an arbitrary web page, so they are bound
		# as values instead of being pasted into the SQL text (an apostrophe
		# in a title would otherwise break the statement).
		query = QSqlQuery(self.db)
		query.prepare("UPDATE ads_data SET dest_title = ?, dest_url = ?, is_parsed = 1 WHERE id = ?")
		query.addBindValue(title)
		query.addBindValue(url)
		query.addBindValue(self.nexturl['ad_id'])
		if query.exec_():
			print(u'Update')
		else:
			print('Update Error')
			print(query.lastQuery())
			print(query.lastError().text())
			print('')
		
		# set the progress bar of pages loaded (whole percent, setValue expects an int)
		done = self.URIs_count - len(self.URIs)
		if self.URIs_count:
			self.ui.progressBar.setValue(done * 100 // self.URIs_count)
		
		# load next
		self.__loadNextUrl()
	
	def __loadNextUrl(self):
		"""
		Take the next entry from the list and load its link in the web view
		"""
		self.nexturl = self.__getNextUrl()
		if self.nexturl:
			print('LOAD: %s' % self.nexturl['ad_link'])
			self.ui.webView.load(QtCore.QUrl(self.nexturl['ad_link']))
		else:
			print('No URIs')
	
	def __getNextUrl(self):
		"""
		Remove and return the last entry of self.URIs, or False if none is left
		"""
		if self.URIs:
			return self.URIs.pop()
		return False

if __name__ == "__main__":
	# create the Qt application, show the main window and hand control
	# to the Qt event loop; its return code becomes the process exit code
	application = QtGui.QApplication(sys.argv)
	window = ParseAds()
	window.show()
	exit_code = application.exec_()
	sys.exit(exit_code)
We have some new things here:
  • I've created a class called FakeBrowser which inherits from QWebPage and overrides the userAgentForUrl method. In __init__ I've used my class instead of the standard QWebPage. By doing so I've changed the USER AGENT of the browser (it's good to mark this app as a bot, as no one likes fake ad clicks).
  • In __init__ we get also URIs to load and we put them in self.URIs
  • Method __getNextUrl removes and returns the last element of that list, or returns False when the list is empty
  • Method loadFinished runs after page is loaded. We get the final page title, URL and update the DB row (using ID from self.nexturl). Note: you may not get all titles (ad url invalid, inactive etc.)
Once the app has loaded every new URL from the DB, we will have the full data for a report. Example:
sqlite> SELECT DISTINCT dest_title,dest_url FROM ads_data WHERE is_parsed = 1;
|http://s0b.bluestreak.com/ix.e?hy
Białe Szaleństwo|http://www.gutekfilm.pl/bialeszalenstwo/
Intel Polsat Racing Tour - PIT STOP www.f1.pl|http://f1.sport.pl/intel-polsat-racing-tour.456.0.html
Książka "Django. Ćwiczenia praktyczne" -- Wydawnictwo Helion, księgarnia helion.pl|http://helion.pl/ksiazki/cwdjan.htm
Oferta   |   PKO Bank Polski|http://www.pkobp.pl/index.php/id=oferta/grupa=4/id_product=000000000000345/podgrupa=0/section=indy/idk=2/ver=7/wid=38180040140317
Serafina|http://www.gutekfilm.pl/serafina/
Ubezpieczenia Allianz Direct: KALKULATOR OC, AC, Turystyczne | Allianz Direct|https://www.allianzdirect.pl/ubezpieczenia/start.html
megiteam.pl|http://www.megiteam.pl/

Source Code

RkBlog

PyQt and GUI, 10 November 2009, Piotr Maliński

Comment article
RkBlog main page Search RSS Contact