RkBlog

Hardware, programming and astronomy tutorials and reviews.

Harvesting data from websites using WebKit and PyQt4 - part 3

Gathering final data for ad urls (final page title and URL) using PyQt4 with QtSql module.

In this tutorial we will make a second app that gets the final data (page title and URL) for the ad URLs. The application has to load the URLs from the database, open each of them, and then update the matching database entry.

Ad URL Parser

The GUI looks like this:
adsp1
We make the Python class:
pyuic4 parser.ui > parser.py
And create run.py with the skeleton code:
# -*- coding: utf-8 -*-
import sys

from PyQt4 import QtCore, QtGui, QtWebKit
from PyQt4.QtSql import *
from parser import Ui_parserWindow

class ParseAds(QtGui.QMainWindow):
	def __init__(self, parent=None):
		QtGui.QWidget.__init__(self, parent)
		self.ui = Ui_parserWindow()
		self.ui.setupUi(self)
		
		s = self.ui.webView.settings()
		s.setAttribute(QtWebKit.QWebSettings.AutoLoadImages, False)
		s.setAttribute(QtWebKit.QWebSettings.JavascriptCanOpenWindows, False)
		s.setAttribute(QtWebKit.QWebSettings.PluginsEnabled, False)
		
		self.db = QSqlDatabase.addDatabase("QSQLITE")
		self.db.setDatabaseName("./ads")
		self.dbstatus = self.db.open()
		if self.dbstatus:
			print 'DB ok'
		else:
			print 'DB error'
		

if __name__ == "__main__":
	# create the Qt application, show the parser window and run the event loop;
	# the event loop result becomes the process exit code
	application = QtGui.QApplication(sys.argv)
	window = ParseAds()
	window.show()
	exit_code = application.exec_()
	sys.exit(exit_code)
It will start the app, but it won't do anything yet. We have to add a web page loader:
# -*- coding: utf-8 -*-
import sys
from urllib import unquote

from PyQt4 import QtCore, QtGui, QtWebKit
from PyQt4.QtSql import *
from parser import Ui_parserWindow

class FakeBrowser(QtWebKit.QWebPage):
	"""
	QWebPage that reports a custom, fixed user agent string
	"""
	def __init__(self, parent=None):
		QtWebKit.QWebPage.__init__(self, parent)

	def userAgentForUrl(self, url):
		"""
		Return the same user agent string whatever the url is (url is not used)
		"""
		user_agent = 'Opera/9.64 (X11; Linux x86_64; U; en) Presto/2.1.1 AdsSeeker-not-a-human'
		return user_agent

class ParseAds(QtGui.QMainWindow):
	def __init__(self, parent=None):
		QtGui.QWidget.__init__(self, parent)
		self.ui = Ui_parserWindow()
		self.ui.setupUi(self)
		
		s = self.ui.webView.settings()
		s.setAttribute(QtWebKit.QWebSettings.AutoLoadImages, False)
		s.setAttribute(QtWebKit.QWebSettings.JavascriptCanOpenWindows, False)
		s.setAttribute(QtWebKit.QWebSettings.PluginsEnabled, False)
		fb = FakeBrowser(self)
		self.ui.webView.setPage(fb)
		
		QtCore.QObject.connect(self.ui.startButton,QtCore.SIGNAL("clicked()"), self.start)
		QtCore.QObject.connect(self.ui.webView,QtCore.SIGNAL("loadFinished (bool)"), self.loadFinished)
		QtCore.QObject.connect(self.ui.webView,QtCore.SIGNAL("loadProgress (int)"), self.loadProgress)
		
		self.db = QSqlDatabase.addDatabase("QSQLITE")
		self.db.setDatabaseName("./ads")
		self.dbstatus = self.db.open()
		if self.dbstatus:
			print 'DB ok'
		else:
			print 'DB error'
		
		# get URIs to parse
		self.URIs = []
		self.URIs_count = 0
		if self.dbstatus:
			query = QSqlQuery(self.db)
			if query.exec_("SELECT id, link FROM ads_data WHERE is_parsed = 0"):
				while query.next():
					ad_id = query.value(0).toInt()[0]
					ad_link = query.value(1).toString()
					self.URIs.append({'ad_id': ad_id, 'ad_link': unquote(str(ad_link))})
			self.URIs_count = len(self.URIs)
			print 'ToDo %s URIs' % str(self.URIs_count)
	
	def loadProgress(self, progress):
		"""
		Print the progress of page load
		"""
		print progress
	
	def start(self):
		"""
		Start loading the web pages
		"""
		self.ui.startButton.setEnabled(False)
		self.nexturl = self.__getNextUrl()
		if self.nexturl:
			print 'LOAD: %s' % self.nexturl['ad_link']
			self.ui.webView.load(QtCore.QUrl(self.nexturl['ad_link']))
		else:
			print 'No URIs'
	
	def loadFinished(self):
		"""
		Parse loaded page, update DB entry and load next page if available
		"""
		view = self.ui.webView
		title = unicode(view.title())
		url = unicode(view.url().toString())
		print 'TITLE: %s' % title
		
		query = QSqlQuery(self.db)
		qry = "UPDATE ads_data SET dest_title='%s', dest_url='%s', is_parsed = 1 WHERE id = %s" % (title, url, self.nexturl['ad_id'])
		if query.exec_(qry):
			print u'Update'
		else:
			print 'Update Error'
			print qry
			print query.lastError().text()
			print
		
		# set the progress bar of pages loaded
		done = self.URIs_count - len(self.URIs)
		progress_value = (float(done)/float(self.URIs_count))*100
		self.ui.progressBar.setValue(progress_value)
		
		# load next
		self.nexturl = self.__getNextUrl()
		if self.nexturl:
			print 'LOAD: %s' % self.nexturl['ad_link']
			self.ui.webView.load(QtCore.QUrl(self.nexturl['ad_link']))
		else:
			print 'No URIs'
	
	def __getNextUrl(self):
		"""
		return next URL to parse
		"""
		if len(self.URIs) > 0:
			elem = self.URIs.pop()
			return elem
		else:
			return False

if __name__ == "__main__":
	# create the Qt application, show the parser window and run the event loop;
	# the event loop result becomes the process exit code
	application = QtGui.QApplication(sys.argv)
	window = ParseAds()
	window.show()
	exit_code = application.exec_()
	sys.exit(exit_code)
We have some new things here. Once the app has loaded every new URL stored in the database, we will have the full data needed for a report. Example:
sqlite> SELECT DISTINCT dest_title,dest_url FROM ads_data WHERE is_parsed = 1;
|http://s0b.bluestreak.com/ix.e?hy
Białe Szaleństwo|http://www.gutekfilm.pl/bialeszalenstwo/
Intel Polsat Racing Tour - PIT STOP www.f1.pl|http://f1.sport.pl/intel-polsat-racing-tour.456.0.html
Książka "Django. Ćwiczenia praktyczne" -- Wydawnictwo Helion, księgarnia helion.pl|http://helion.pl/ksiazki/cwdjan.htm
Oferta   |   PKO Bank Polski|http://www.pkobp.pl/index.php/id=oferta/grupa=4/id_product=000000000000345/podgrupa=0/section=indy/idk=2/ver=7/wid=38180040140317
Serafina|http://www.gutekfilm.pl/serafina/
Ubezpieczenia Allianz Direct: KALKULATOR OC, AC, Turystyczne | Allianz Direct|https://www.allianzdirect.pl/ubezpieczenia/start.html
megiteam.pl|http://www.megiteam.pl/

Source Code

RkBlog

PyQt and GUI, 10 November 2009,

Comment article