[mb-commits] r9917 - search_index/trunk/mbsearch/freedb

root at musicbrainz.org root at musicbrainz.org
Mon Jun 30 08:38:45 UTC 2008


Author: robert
Date: 2008-06-30 08:38:45 +0000 (Mon, 30 Jun 2008)
New Revision: 9917

Modified:
   search_index/trunk/mbsearch/freedb/freedb.py
Log:
Ported freedb search to xapian.


Modified: search_index/trunk/mbsearch/freedb/freedb.py
===================================================================
--- search_index/trunk/mbsearch/freedb/freedb.py	2008-06-30 08:38:09 UTC (rev 9916)
+++ search_index/trunk/mbsearch/freedb/freedb.py	2008-06-30 08:38:45 UTC (rev 9917)
@@ -27,11 +27,14 @@
 import sys, getopt
 import os
 import tarfile
-import PyLucene
+import xapian
 
 sys.path.append("../..")
-from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
+from mbsearch.normalize import normalizeText
 
+indexDatabase = None
+indexer = None
+
 def usage():
     print "%s: [-c] <freedb dump file>" % sys.argv[0]
     print
@@ -91,15 +94,34 @@
     else:
         #print "%s\n%s\n%s\n%s %s %d\n" % (artist.encode('utf-8'), name.encode('utf-8'), discid.encode('utf-8'),
         #                                  cat.encode('utf-8'), year.encode('utf-8'), duration)
-        doc = PyLucene.Document()
-        doc.add(PyLucene.Field(u"artist", artist,             PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED))
-        doc.add(PyLucene.Field(u"title",  name,               PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED))
-        doc.add(PyLucene.Field(u"discid", discid,             PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED))
-        doc.add(PyLucene.Field(u"cat",    cat,                PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED))
-        doc.add(PyLucene.Field(u"year",   year,               PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED))
-        doc.add(PyLucene.Field(u"tracks", unicode(numTracks), PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED))
-        luceneIndex.addDocument(doc)
 
+        doc = xapian.Document()
+	indexer.set_document(doc)
+
+        storedata = u"artist=%s\ntitle=%s\ndiscid=%s\ncat=%s\nyear=%s\ntracks=%d\n" %  \
+	                                         (artist.replace(u"\n", u"\\n").strip(), 
+						  name.replace(u"\n", u"\\n").strip(), 
+						  discid.replace(u"\n", u"\\n").strip(), 
+						  cat.replace(u"\n", u"\\n").strip(), 
+						  year.replace(u"\n", u"\\n").strip(), 
+						  numTracks)
+	#print storedata.encode('utf-8', 'replace')
+	indexer.index_text(normalize(artist), 1, u"XARTIST")
+	indexer.index_text(normalize(title), 1, u"XTITLE")
+	indexer.index_text(normalize(discid), 1, u"XDISCID")
+	indexer.index_text(normalize(cat), 1, u"XCAT")
+	indexer.index_text(normalize(year), 1, u"XYEAR")
+	indexer.index_text(normalize(unicode(numTracks)), 1, u"XTRACKS")
+
+	doc.set_data(storedata)
+        indexDatabase.add_document(doc)
+
+def normalize(text):
+    # Lowercase and trim the text, then pass it through normalizeText. TODO: mangle text here
+    text = text.lower().strip()
+    text = normalizeText(text)
+    return text
+
 def processCategory(cat, count):
     inf =  None
     try:
@@ -144,9 +166,13 @@
 for key, value in opts:
     if key == "-c": create = True
 
-analyzer = StandardUnaccentAnalyzer()
-luceneIndex = PyLucene.IndexWriter(PyLucene.FSDirectory.getDirectory("freedb_index", create), analyzer, create)
+if create:
+    indexDatabase = xapian.WritableDatabase("../../data/freedb_index", xapian.DB_CREATE_OR_OVERWRITE)
+else:
+    indexDatabase = xapian.WritableDatabase("../../data/freedb_index", xapian.DB_OPEN)
 
+indexer = xapian.TermGenerator()
+
 # The built in python tarfile object will run my computer out of memory when trying to chew 
 # through the whole data file in one go. Regular tar won't decompress to stdout with filenames (which contain
 # the important category data bit) so we can't tell what the categories are. The only way that I've 
@@ -161,8 +187,4 @@
 
 print "Found %d CDs" % total
 
-print "Optimizing index"
-luceneIndex.optimize()
-
 print "Done"
-luceneIndex.close()




More information about the MusicBrainz-commits mailing list