[mb-commits] r9917 - search_index/trunk/mbsearch/freedb
root at musicbrainz.org
root at musicbrainz.org
Mon Jun 30 08:38:45 UTC 2008
Author: robert
Date: 2008-06-30 08:38:45 +0000 (Mon, 30 Jun 2008)
New Revision: 9917
Modified:
search_index/trunk/mbsearch/freedb/freedb.py
Log:
Ported freedb search to xapian.
Modified: search_index/trunk/mbsearch/freedb/freedb.py
===================================================================
--- search_index/trunk/mbsearch/freedb/freedb.py 2008-06-30 08:38:09 UTC (rev 9916)
+++ search_index/trunk/mbsearch/freedb/freedb.py 2008-06-30 08:38:45 UTC (rev 9917)
@@ -27,11 +27,14 @@
import sys, getopt
import os
import tarfile
-import PyLucene
+import xapian
sys.path.append("../..")
-from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
+from mbsearch.normalize import normalizeText
+indexDatabase = None
+indexer = None
+
def usage():
print "%s: [-c] <freedb dump file>" % sys.argv[0]
print
@@ -91,15 +94,34 @@
else:
#print "%s\n%s\n%s\n%s %s %d\n" % (artist.encode('utf-8'), name.encode('utf-8'), discid.encode('utf-8'),
# cat.encode('utf-8'), year.encode('utf-8'), duration)
- doc = PyLucene.Document()
- doc.add(PyLucene.Field(u"artist", artist, PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED))
- doc.add(PyLucene.Field(u"title", name, PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED))
- doc.add(PyLucene.Field(u"discid", discid, PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED))
- doc.add(PyLucene.Field(u"cat", cat, PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED))
- doc.add(PyLucene.Field(u"year", year, PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED))
- doc.add(PyLucene.Field(u"tracks", unicode(numTracks), PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED))
- luceneIndex.addDocument(doc)
+ doc = xapian.Document()
+ indexer.set_document(doc)
+
+ storedata = u"artist=%s\ntitle=%s\ndiscid=%s\ncat=%s\nyear=%s\ntracks=%d\n" % \
+ (artist.replace(u"\n", u"\\n").strip(),
+ name.replace(u"\n", u"\\n").strip(),
+ discid.replace(u"\n", u"\\n").strip(),
+ cat.replace(u"\n", u"\\n").strip(),
+ year.replace(u"\n", u"\\n").strip(),
+ numTracks)
+ #print storedata.encode('utf-8', 'replace')
+ indexer.index_text(normalize(artist), 1, u"XARTIST")
+ indexer.index_text(normalize(title), 1, u"XTITLE")
+ indexer.index_text(normalize(discid), 1, u"XDISCID")
+ indexer.index_text(normalize(cat), 1, u"XCAT")
+ indexer.index_text(normalize(year), 1, u"XYEAR")
+ indexer.index_text(normalize(unicode(numTracks)), 1, u"XTRACKS")
+
+ doc.set_data(storedata)
+ indexDatabase.add_document(doc)
+
+def normalize(text):
+ # TODO: mangle text here
+ text = text.lower().strip()
+ text = normalizeText(text)
+ return text
+
def processCategory(cat, count):
inf = None
try:
@@ -144,9 +166,13 @@
for key, value in opts:
if key == "-c": create = True
-analyzer = StandardUnaccentAnalyzer()
-luceneIndex = PyLucene.IndexWriter(PyLucene.FSDirectory.getDirectory("freedb_index", create), analyzer, create)
+if create:
+ indexDatabase = xapian.WritableDatabase("../../data/freedb_index", xapian.DB_CREATE_OR_OVERWRITE)
+else:
+ indexDatabase = xapian.WritableDatabase("../../data/freedb_index", xapian.DB_OPEN)
+indexer = xapian.TermGenerator()
+
# The built in python tarfile object will run my computer out of memory when trying to chew
# through the whole data file in one go. Regular tar won't decompress to stdout with filenames (which contain
# the important category data bit) so we can't tell what the categories are. The only way that I've
@@ -161,8 +187,4 @@
print "Found %d CDs" % total
-print "Optimizing index"
-luceneIndex.optimize()
-
print "Done"
-luceneIndex.close()
More information about the MusicBrainz-commits mailing list