Xapian in Python
Check out the new site at https://rkblog.dev.
14 July 2008
Comments
Xapian is one of the full-text search engines that can be used from Python (and also from PHP :nice:). Like the others, it has a poor tutorial section, so you are left with the bare API docs. There are also the Xapwrap and PyXapian projects, but I won't cover them in this article. If you use MySQL, then use its full-text search features ;) If not, try Xapian.
Installation
You need the "xapian" and "xapian-bindings" packages. Most Linux distributions should have them in their repositories. "xapian-bindings" may be named "xapian-bindings-python" or "xapian-python" if the package is split. Otherwise, check the project website.
Introduction to Xapian
Here is a basic indexer, similar to the one from the xapian-bindings examples:
import xapian
import string
# Upper bound on the length (in characters) of a term that gets indexed;
# the indexing loop skips any longer run.
MAX_PROB_TERM_LENGTH = 64
def p_alnum(c):
    """Return True if the character ``c`` is an ASCII letter or an ASCII digit."""
    return c in string.ascii_letters or c in string.digits
def p_notalnum(c):
    """Return True if ``c`` is not an ASCII letter or digit (negation of ``p_alnum``)."""
    return not p_alnum(c)
def p_notplusminus(c):
    """Return True if the character ``c`` is neither ``'+'`` nor ``'-'``."""
    return c not in ('+', '-')
def find_p(string, start, predicate):
    """Return the first index >= ``start`` whose character satisfies ``predicate``.

    ``predicate`` is called with one character at a time. If no character
    from ``start`` onwards satisfies it, ``len(string)`` is returned; a
    ``start`` that is already past the end is returned unchanged.
    """
    # NOTE: the ``string`` parameter shadows the ``string`` module inside this
    # function only; the name is kept so that existing callers keep working.
    while start < len(string) and not predicate(string[start]):
        start += 1
    return start
# Open the index stored in 'test/' (DB_CREATE_OR_OPEN: create it if missing).
database = xapian.WritableDatabase('test/', xapian.DB_CREATE_OR_OPEN)
stemmer = xapian.Stem("english")

para = '''this is a testing'''
doc = xapian.Document()
# Keep the raw text as the document data so it can be shown in results.
doc.set_data(para)

pos = 0  # position of the next term within the document
i = 0    # current scan offset into para
while i < len(para):
    # [i, j) is the next run of alphanumeric characters.
    i = find_p(para, i, p_alnum)
    j = find_p(para, i, p_notalnum)
    # Let the term absorb trailing '+'/'-' characters unless they are
    # directly followed by another alphanumeric character.
    k = find_p(para, j, p_notplusminus)
    if k == len(para) or not p_alnum(para[k]):
        j = k
    # Index only non-empty terms that are not over-long.
    if (j - i) <= MAX_PROB_TERM_LENGTH and j > i:
        # str.lower() replaces the Python-2-only string.lower() helper.
        term = stemmer.stem_word(para[i:j].lower())
        doc.add_posting(term, pos)
        pos += 1
    i = j

database.add_document(doc)
Run the indexer once, then change the sample text to:
para = '''this is a test'''
and execute it again. Now we can search using a searcher:
import sys
import xapian
# Search the 'test/' index for the words given on the command line and print
# the first ten matches. Any failure is reported on stderr with exit code 1.
try:
    database = xapian.Database('test/')
    enquire = xapian.Enquire(database)
    stemmer = xapian.Stem("english")

    # Lower-case and stem the query words the same way the indexer treats
    # document terms, so that both sides line up.
    terms = [stemmer.stem_word(term.lower()) for term in sys.argv[1:]]
    query = xapian.Query(xapian.Query.OP_OR, terms)
    print("Performing query `%s'" % query.get_description())

    enquire.set_query(query)
    matches = enquire.get_mset(0, 10)
    print("%i results found" % matches.get_matches_estimated())
    for match in matches:
        print("ID %i %i%% [%s]" % (
            match[xapian.MSET_DID],
            match[xapian.MSET_PERCENT],
            match[xapian.MSET_DOCUMENT].get_data(),
        ))
except Exception as e:
    # Top-level boundary of the script: report the error and signal failure.
    sys.stderr.write("Exception: %s\n" % str(e))
    sys.exit(1)
python search.py test
You will get both phrases, but the one with "testing" will have a lower probability. Xapian supports stemming for a few languages. You set the language with:
stemmer = xapian.Stem("english")
Supported languages: none, danish (da), dutch (nl), english (en), finnish (fi), french (fr), german (de), italian (it), norwegian (no), portuguese (pt), russian (ru), spanish (es), swedish (sv).
Xapian and databases
To index something from a database we need to add the entry ID, and possibly some other data, to the indexed document in Xapian. We need to modify our indexer:
import xapian
import string
# Longest term, in characters, that the indexing loop will add to a document.
MAX_PROB_TERM_LENGTH = 64
def p_alnum(c):
    """Tell whether ``c`` is an ASCII letter or an ASCII digit."""
    return c in string.ascii_letters or c in string.digits
def p_notalnum(c):
    """Tell whether ``c`` is anything other than an ASCII letter or digit."""
    return not p_alnum(c)
def p_notplusminus(c):
    """Tell whether ``c`` is a character other than ``'+'`` and ``'-'``."""
    return c not in ('+', '-')
def find_p(string, start, predicate):
    """Scan ``string`` from ``start`` and return the index of the first match.

    A match is a character for which ``predicate(character)`` is true. When
    nothing matches, the result is ``len(string)``; when ``start`` already
    lies past the end, ``start`` itself is returned.
    """
    # The parameter name ``string`` hides the ``string`` module within this
    # function; it is left as-is to keep the signature unchanged.
    while start < len(string) and not predicate(string[start]):
        start += 1
    return start
# Value slot numbers under which extra per-document data is stored with
# add_value() below.
NEWS_ID = 0
NEWS_TITLE = 1
NEWS_DESC = 2

# Open the index stored in 'test/' (DB_CREATE_OR_OPEN: create it if missing).
database = xapian.WritableDatabase('test/', xapian.DB_CREATE_OR_OPEN)
stemmer = xapian.Stem("english")

para = '''this is a testing'''
doc = xapian.Document()
doc.set_data(para)

pos = 0  # position of the next term within the document
i = 0    # current scan offset into para
while i < len(para):
    # [i, j) is the next run of alphanumeric characters.
    i = find_p(para, i, p_alnum)
    j = find_p(para, i, p_notalnum)
    # Let the term absorb trailing '+'/'-' characters unless they are
    # directly followed by another alphanumeric character.
    k = find_p(para, j, p_notplusminus)
    if k == len(para) or not p_alnum(para[k]):
        j = k
    # Index only non-empty terms that are not over-long.
    if (j - i) <= MAX_PROB_TERM_LENGTH and j > i:
        # str.lower() replaces the Python-2-only string.lower() helper.
        term = stemmer.stem_word(para[i:j].lower())
        doc.add_posting(term, pos)
        pos += 1
    i = j

# Attach the database row's ID, title and description to the document.
doc.add_value(NEWS_ID, str(323))
doc.add_value(NEWS_TITLE, 'Tytul newsa')
doc.add_value(NEWS_DESC, 'bla bla bla')
database.add_document(doc)
import sys
import xapian
# Value slot numbers; NEWS_TITLE is the slot read back with get_value() below.
NEWS_ID = 0
NEWS_TITLE = 1
NEWS_DESC = 2

# Search the 'test/' index for the words given on the command line and print
# the stored title of each of the first ten matches. Any failure is reported
# on stderr with exit code 1.
try:
    database = xapian.Database('test/')
    enquire = xapian.Enquire(database)
    stemmer = xapian.Stem("english")

    # Lower-case and stem the query words the same way the indexer treats
    # document terms, so that both sides line up.
    terms = [stemmer.stem_word(term.lower()) for term in sys.argv[1:]]
    query = xapian.Query(xapian.Query.OP_OR, terms)
    print("Performing query `%s'" % query.get_description())

    enquire.set_query(query)
    matches = enquire.get_mset(0, 10)
    print("%i results found" % matches.get_matches_estimated())
    for match in matches:
        # Alternative output with document ID, percentage and raw data:
        #print("ID %i %i%% [%s]" % (match[xapian.MSET_DID], match[xapian.MSET_PERCENT], match[xapian.MSET_DOCUMENT].get_data()))
        print(match[xapian.MSET_DOCUMENT].get_value(NEWS_TITLE))
except Exception as e:
    # Top-level boundary of the script: report the error and signal failure.
    sys.stderr.write("Exception: %s\n" % str(e))
    sys.exit(1)
For a Django news application with a model like this one:
class News(models.Model):
    """A news entry with a title, slug, main text and optional extra text."""

    title = models.CharField(maxlength=255, verbose_name=_('Title'))
    slug = models.SlugField(maxlength=255, unique=True, prepopulate_from=("title", ), verbose_name=_('Slug'))
    text = models.TextField(verbose_name=_('Text'))
    text_more = models.TextField(verbose_name=_(' More Text'), blank=True)
    is_more = models.BooleanField(blank=True, default=False, verbose_name=_('Is More Text'))
    is_external = models.BooleanField(blank=True, default=False, verbose_name=_('Is External News'))
    category = models.ForeignKey(Category)
    date = models.DateField(auto_now=True)
    # Optional per-service text fields (Jakilinux, Wykop, Infoneo, Digg, Reddit).
    jakilinux_field = models.CharField(maxlength=255, verbose_name='Jakilinux', blank=True)
    wykop_field = models.CharField(maxlength=255, verbose_name='Wykop', blank=True)
    infoneo_field = models.CharField(maxlength=255, verbose_name='Infoneo', blank=True)
    digg_field = models.CharField(maxlength=255, verbose_name='Digg', blank=True)
    reddit_field = models.CharField(maxlength=255, verbose_name='Reddit', blank=True)

    class Meta:
        verbose_name = _('News')
        verbose_name_plural = _('News')
        # The table name carries the site ID as a suffix.
        db_table = 'rk_news' + str(settings.SITE_ID)

    class Admin:
        list_display = ('title', 'date')
        list_filter = ['date']
        search_fields = ['title', 'text']
        date_hierarchy = 'date'

    def get_absolute_url(self):
        """Return the URL path of this entry, built from its slug."""
        return '/news/more/' + str(self.slug) + '/'

    def __str__(self):
        """Return the entry's title."""
        return self.title
from os import environ
environ['DJANGO_SETTINGS_MODULE'] = 'settings'
from settings import *
from news.models import *
import xapian
import string
# Terms longer than this many characters are not added to the index.
MAX_PROB_TERM_LENGTH = 64
def p_alnum(c):
    """Check whether ``c`` is an ASCII letter or an ASCII digit."""
    return c in string.ascii_letters or c in string.digits
def p_notalnum(c):
    """Check whether ``c`` is not an ASCII letter or digit."""
    return not p_alnum(c)
def p_notplusminus(c):
    """Check whether ``c`` differs from both ``'+'`` and ``'-'``."""
    return c not in ('+', '-')
def find_p(string, start, predicate):
    """Find the first position at or after ``start`` accepted by ``predicate``.

    ``predicate`` receives a single character. The return value is the index
    of the first accepted character, ``len(string)`` if there is none, or
    ``start`` unchanged when it is already beyond the end of ``string``.
    """
    # ``string`` (the parameter) shadows the ``string`` module locally; the
    # name stays for backward compatibility with keyword callers.
    while start < len(string) and not predicate(string[start]):
        start += 1
    return start
news = News.objects.all()

# Value slot numbers under which extra per-document data is stored with
# add_value() below.
NEWS_ID = 0
NEWS_TITLE = 1
NEWS_DESC = 2

# Open the index stored in 'test/' (DB_CREATE_OR_OPEN: create it if missing).
database = xapian.WritableDatabase('test/', xapian.DB_CREATE_OR_OPEN)
stemmer = xapian.Stem("english")

# Build one Xapian document per news entry.
for new in news:
    # Join the two text fields with a space so that the last word of ``text``
    # and the first word of ``text_more`` are not fused into a single term.
    # Empty parts are skipped, and no str() coercion is applied, so non-ASCII
    # text is passed through untouched.
    para = ' '.join([part for part in (new.text, new.text_more) if part])
    doc = xapian.Document()
    doc.set_data(para)

    pos = 0  # position of the next term within the document
    i = 0    # current scan offset into para
    while i < len(para):
        # [i, j) is the next run of alphanumeric characters.
        i = find_p(para, i, p_alnum)
        j = find_p(para, i, p_notalnum)
        # Let the term absorb trailing '+'/'-' characters unless they are
        # directly followed by another alphanumeric character.
        k = find_p(para, j, p_notplusminus)
        if k == len(para) or not p_alnum(para[k]):
            j = k
        # Index only non-empty terms that are not over-long.
        if (j - i) <= MAX_PROB_TERM_LENGTH and j > i:
            # str.lower() replaces the Python-2-only string.lower() helper.
            term = stemmer.stem_word(para[i:j].lower())
            doc.add_posting(term, pos)
            pos += 1
        i = j

    # Attach the entry's ID, title and text to the document.
    doc.add_value(NEWS_ID, str(new.id))
    doc.add_value(NEWS_TITLE, new.title)
    doc.add_value(NEWS_DESC, new.text)
    database.add_document(doc)
[piotr@localhost biblioteka]$ python simplesearch.py django
Performing query `Xapian::Query(django)'
8 results found
How to Beat Rails - 100%
More crazy changes for Django 1.0 ? - 98%
Big Django project ;) - 87%
Django 0.96 released - 86%
polib - gettext translation manager - 71%
Diamanda 2006.12 Stable Released - 65%
More on Django 1.0 changes - 65%
Djangoish Gettext Translator - 57%
Pumping Up Your Applications with Xapian Full-Text Search - More advanced example using XML-RPC and Twisted
RkBlog
Check out the new site at https://rkblog.dev.
Comment article