How to get the content of a Wikipedia (MediaWiki) page
Check out the new site at https://rkblog.dev.
14 July 2008
Comments
This code extracts the content of any MediaWiki wiki page:
from httplib import HTTPConnection
from re import findall, DOTALL
def wiki(slug, host="pl.wikipedia.org"):
    """Fetch a MediaWiki page and return the HTML of its content area.

    The page is requested over HTTP from ``/wiki/<slug>`` on *host*.  The
    text between the ``<!-- start content -->`` and ``<!-- end content -->``
    markers is extracted, and site-relative links (``href="/wiki...`` and
    ``href="/w/...``) are rewritten to absolute ``http://<host>/...`` URLs.

    Arguments:
    slug -- page name as it appears in the URL after ``/wiki/``
    host -- host name of the MediaWiki site (defaults to Polish Wikipedia)

    Returns the extracted HTML as text, or None when the response status
    is not 200 or the content markers are not present in the page.
    """
    conn = HTTPConnection(host)
    try:
        # get page by slug
        conn.request('GET', '/wiki/' + slug)
        response = conn.getresponse()
        if response.status != 200:
            return None
        html = response.read().decode('utf-8')
    finally:
        # Close the connection on every path (success, non-200, exception)
        # so the socket is never leaked.
        conn.close()

    # extract content; DOTALL lets the non-greedy group span multiple lines
    matches = findall(
        r'<!-- start content -->(.*?)<!-- end content -->', html, DOTALL)
    if not matches:
        # Markers missing: nothing to extract, avoid IndexError on [0].
        return None

    # Turn site-relative links into absolute ones pointing at the same host.
    base = 'href="http://' + host
    content = matches[0]
    content = content.replace('href="/wiki', base + '/wiki')
    content = content.replace('href="/w/', base + '/w/')
    return content
# Example: fetch the 'Linux' article and print its content HTML.
# The parenthesised form prints the same on Python 2 and parses on Python 3.
print(wiki('Linux'))
RkBlog
Check out the new site at https://rkblog.dev.
Comment article