[mb-commits] r9893 - in search_index/trunk: . mbsearch mbsearch/serverindex tools

root at musicbrainz.org root at musicbrainz.org
Sun Jun 29 07:39:56 UTC 2008


Author: robert
Date: 2008-06-29 07:39:56 +0000 (Sun, 29 Jun 2008)
New Revision: 9893

Modified:
   search_index/trunk/builder.py
   search_index/trunk/mbsearch/indexcreator.py
   search_index/trunk/mbsearch/serverindex/ar_annotationindex.py
   search_index/trunk/mbsearch/serverindex/artistindex.py
   search_index/trunk/mbsearch/serverindex/la_annotationindex.py
   search_index/trunk/mbsearch/serverindex/labelindex.py
   search_index/trunk/mbsearch/serverindex/re_annotationindex.py
   search_index/trunk/mbsearch/serverindex/releaseindex.py
   search_index/trunk/mbsearch/serverindex/tr_annotationindex.py
   search_index/trunk/mbsearch/serverindex/trackindex.py
   search_index/trunk/tools/tinysearch.py
Log:
Added the tinysearch for xapian.
Sped up track index building by creating multiple smaller indexes and gluing them together at the end. Now takes less than 3 hours.
Improved searching by fixing probabilistic weights.
qdur searches work for the track index.


Modified: search_index/trunk/builder.py
===================================================================
--- search_index/trunk/builder.py	2008-06-28 09:19:52 UTC (rev 9892)
+++ search_index/trunk/builder.py	2008-06-29 07:39:56 UTC (rev 9893)
@@ -36,6 +36,10 @@
     print " -p <db passwd>   -- The password for the user. (default: -blank-)"
     print " -t               -- Text the index builder by creating small text indexes."
     print " -b [artnl]       -- Which indexes to build (Artist, Release, Track, aNnotation, Labels)"
+    print
+    print "NOTE: Set the env var XAPIAN_FLUSH_THRESHOLD to some large value that consumes a"
+    print "reasonable amount of RAM on the machine on which you're building the indexes."
+    print " e.g.: XAPIAN_FLUSH_THRESHOLD=\"1000000\""
     sys.exit(-1)
 
 # Default config values 
@@ -73,24 +77,33 @@
     index = artistindex.ArtistIndex("data/artist_index", True, host, database, user, passwd)
     ok = index.run(test)
     if ok:
-        index.close()
-        print "done\n"
+        if not index.close():
+	    ok = 0
+	    print "Index compacting failed. Stop."
+	else:
+	    print "done\n"
 
 if ok and (not build or build.find('r') >= 0):
     print "Creating release index:"
     index = releaseindex.ReleaseIndex("data/release_index", True, host, database, user, passwd)
     ok = index.run(test)
     if ok:
-        index.close()
-        print "done\n"
+        if not index.close():
+	    ok = 0
+	    print "Index compacting failed. Stop."
+	else:
+	    print "done\n"
    
 if ok and (not build or build.find('t') >= 0):
     print "Creating track index:"
-    index = trackindex.TrackIndex("data/track_index", True, host, database, user, passwd)
+    index = trackindex.TrackIndex("data/track_index", True, host, database, user, passwd, True)
     ok = index.run(test)
     if ok:
-        index.close()
-        print "done\n"
+        if not index.close():
+	    ok = 0
+	    print "Index compacting failed. Stop."
+	else:
+	    print "done\n"
 
 if ok and (not build or build.find('n') >= 0):
     print "Creating artist annotation index:"
@@ -122,5 +135,8 @@
     index = labelindex.LabelIndex("data/label_index", True, host, database, user, passwd)
     ok = index.run(test)
     if ok:
-         index.close()
-         print "done\n"
+	if not index.close():
+	    ok = 0
+ 	    print "Index compacting failed. Stop."
+	else:
+	    print "done\n"

Modified: search_index/trunk/mbsearch/indexcreator.py
===================================================================
--- search_index/trunk/mbsearch/indexcreator.py	2008-06-28 09:19:52 UTC (rev 9892)
+++ search_index/trunk/mbsearch/indexcreator.py	2008-06-29 07:39:56 UTC (rev 9893)
@@ -24,15 +24,20 @@
 #
 #---------------------------------------------------------------------------
 
-import re, time, sys, string
+import re, time, sys, string, os
 import xapian
 import psycopg2
 import re
 from unac import unac
 from escape_ideographic import addSpacesToIdeographicStrings
+from random import random
 
 # Number of rows to process for each database 'chunk'
-IDS_PER_CHUNK = 2000
+IDS_PER_CHUNK = 1000
+
+# The number of slices to create that will later be joined into a total index
+SLICES_PER_INDEX = 20
+
 STATS_WINDOW_SIZE = 6
 MAX_FIELD_LEN = 65536
 
@@ -60,16 +65,23 @@
     MusicBrainz data.
     '''
 
-    def __init__(self, indexName, create, host, database, user, passwd):
+    def __init__(self, indexName, create, host, database, user, passwd, useSlices = False):
         self.create = create
         self.host = host
         self.database = database
         self.user = user
         self.passwd = passwd
+	self.indexName = indexName
+	self.useSlices = useSlices
 
 	self.dotsRe = re.compile("((?:\w\.){2,})", re.UNICODE)
 
         self.indexer = xapian.TermGenerator()
+
+	if useSlices:
+	    self.curSlice = 0
+	    indexName = "%s.%d" % (indexName, self.curSlice)
+
         if create:
              self.index = xapian.WritableDatabase(indexName, xapian.DB_CREATE_OR_OVERWRITE)
         else:
@@ -79,8 +91,38 @@
         '''
         Close the index by discarding a ref
         '''
+
         del self.index
 
+	# If we've created many slices, merge them together into one final database 
+	if self.useSlices:
+	    cmd = "xapian-compact --multipass "
+	    for i in xrange(self.curSlice + 1):
+		cmd += "%s.%d " % (self.indexName, i)
+	    cmd += self.indexName
+
+	    print "Compacting the database..."
+	    if os.system(cmd):
+		return False
+	    else:
+		for i in xrange(self.curSlice + 1):
+		    os.system("rm -rf %s.%d" % (self.indexName, i))
+	return True
+
+    def switchToNextSlice(self):
+
+	if not self.useSlices: return
+
+	del self.index
+	self.curSlice += 1
+	indexName = "%s.%d" % (self.indexName, self.curSlice)
+
+	print "Create database Slice %d" % self.curSlice
+        if self.create:
+             self.index = xapian.WritableDatabase(indexName, xapian.DB_CREATE_OR_OVERWRITE)
+        else:
+             self.index = xapian.WritableDatabase(indexName, xapian.DB_OPEN)
+
     def getRowCountQuery(self):
         """Return the SQL query that determines how many rows need
         to be processed.
@@ -137,6 +179,11 @@
 
 	return u''.join(bits)
 
+    def calculateWeight(self, text):
+	maxLen = 100
+	l = min(maxLen, len(text))
+	return maxLen - l
+
     def run(self, doTest = False):
         '''
         Execute the query, massage the returned data and pass it to
@@ -165,9 +212,12 @@
         if doTest:
             numChunks = min(numChunks, 50)
 
+        rowsPerSlice = totalRows / SLICES_PER_INDEX
+
         # Record the start time
         t0 = time.time()
         rowsIndexed = 0
+	rowsThisSlice = 0
 
         # Now get the data, one chunk at a time
         for i in xrange(numChunks):
@@ -183,6 +233,7 @@
                 doc = xapian.Document()
                 self.indexer.set_document(doc)
                 storedata = u""
+		primaryWeight = 0
                 for field, data in zip(self.FIELDS, row):
 
 		    isList = True
@@ -192,6 +243,7 @@
 		        isList = False
 
                     name, method, tokenize = field[:3]
+		    weight = 1
                     for i, text in enumerate(data):
                         # Ensure that the value is an unicode string
                         if not text: continue
@@ -201,6 +253,9 @@
                         if text:
 			    #print "%s:%s" % (name.encode('utf-8', 'replace'), text.encode('utf-8', 'replace'))
                             if method in (FIELD_STORE, FIELD_INDEX_AND_STORE): 
+				if tokenize == FIELD_TOKENIZE_PRIMARY:
+				    primaryWeight = self.calculateWeight(text)
+				    #print "w: %s -> %d" % (text.encode('utf-8', 'replace'), primaryWeight)
                                 if isList:
                                     storedata += u"%s%d=%s\n" % (name, i, text.replace(u"\n", u"\\n").strip())
 			        else:
@@ -208,27 +263,30 @@
 			        
                             if method in (FIELD_INDEX, FIELD_INDEX_AND_STORE):
 				if tokenize == FIELD_TOKENIZE_NORMAL or tokenize == FIELD_TOKENIZE_PRIMARY:
+				    if primaryWeight:
+					weight = primaryWeight
+					primaryWeight = 0
+					doc.add_value(0, u"%d" % weight)
+				    else:
+					weight = self.calculateWeight(text)
                                     text = text.strip().lower()
                                     text = unac.unac_string(text)
                                     text = self.removeDots(text) 
                                     text = addSpacesToIdeographicStrings(text) 
-                                    self.indexer.index_text(text, 0, u"X" + name.upper())
-				    #print "ndx: %s:'%s'" % (name.encode('utf-8', 'replace'), text.encode('utf-8', 'replace'))
-				    if tokenize == FIELD_TOKENIZE_PRIMARY:
-					doc.add_value(0, u"%d" % (MAX_FIELD_LEN - len(text)))
+                                    self.indexer.index_text(text, weight, u"X" + name.upper())
+				    #print "ndx: %s:'%s' %d" % (name.encode('utf-8', 'replace'), text.encode('utf-8', 'replace'), weight)
 			        elif tokenize == FIELD_TOKENIZE_NONE:
-				    #print "add: '%s'" % text.encode('utf-8', 'replace')
-				    doc.add_term(u"X" + name.upper() + text, 0)
+				    #print "add: %s:'%s'" % (name.encode('utf-8', 'replace'), text.encode('utf-8', 'replace'))
+				    doc.add_term(u"X" + name.upper() + text, 1)
 			        elif tokenize == FIELD_TOKENIZE_MBID:
                                     text = text.strip().lower()
 				    text = text.replace(u'-', u'')
-				    #print "gid: '%s'" % text.encode('utf-8', 'replace')
-                                    self.indexer.index_text(text, 0, u"X" + name.upper())
+                                    self.indexer.index_text(text, 1, u"X" + name.upper())
 
-			        self.indexer.increase_termpos()
+			        #self.indexer.increase_termpos()
 			    elif tokenize == FIELD_TOKENIZE_PRIMARY:
-				#print "val: '%s'" % len(text.split(u" "))
-				doc.add_value(0, u"%d" % (MAX_FIELD_LEN - len(text)))
+				weight = self.calculateWeight(text)
+				doc.add_value(0, u"%d" % weight)
 			        
 
                 # Add the document to the index
@@ -239,6 +297,7 @@
 
             t2 = time.time()
             rowsIndexed += rowsThisChunk
+            rowsThisSlice += rowsThisChunk
 
             # Add the current number of seconds per row into a sliding window queue
             if rowsThisChunk > 0: stats.append((t2 - t1) / rowsThisChunk)
@@ -262,5 +321,9 @@
             print
             sys.stdout.flush()
 
+            if rowsThisSlice > rowsPerSlice:
+	        rowsThisSlice = 0
+		self.switchToNextSlice()
+
         conn.close()
         return True

Modified: search_index/trunk/mbsearch/serverindex/ar_annotationindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/ar_annotationindex.py	2008-06-28 09:19:52 UTC (rev 9892)
+++ search_index/trunk/mbsearch/serverindex/ar_annotationindex.py	2008-06-29 07:39:56 UTC (rev 9893)
@@ -40,8 +40,8 @@
               (u"text", indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NORMAL, normalizeText),
     ]
 
-    def __init__(self, indexName, clear, host, database, user, passwd):
-        indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
+    def __init__(self, indexName, clear, host, database, user, passwd, useSlices = False):
+        indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd, useSlices)
 
     def getRowCountQuery(self):
         return "SELECT max(id), count(id) from annotation where type = 1"

Modified: search_index/trunk/mbsearch/serverindex/artistindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/artistindex.py	2008-06-28 09:19:52 UTC (rev 9892)
+++ search_index/trunk/mbsearch/serverindex/artistindex.py	2008-06-29 07:39:56 UTC (rev 9893)
@@ -41,20 +41,19 @@
     """This class specifies the details on how to create the artist index."""
 
     FIELDS = [
-        (u'arid',     indexcreator.FIELD_INDEX_AND_STORE,  indexcreator.FIELD_TOKENIZE_MBID,    None),
-        (u'artist',   indexcreator.FIELD_INDEX,            indexcreator.FIELD_TOKENIZE_NORMAL,  normalizeText),
-        (u'artist',   indexcreator.FIELD_STORE,            indexcreator.FIELD_TOKENIZE_PRIMARY, normalizeText),
-#        (u'artist',   indexcreator.FIELD_INDEX_AND_STORE,  indexcreator.FIELD_TOKENIZE_PRIMARY, normalizeText),
-        (u'sortname', indexcreator.FIELD_INDEX_AND_STORE,  indexcreator.FIELD_TOKENIZE_NORMAL,  normalizeText),
-        (u'type',     indexcreator.FIELD_INDEX_AND_STORE,  indexcreator.FIELD_TOKENIZE_NONE,    _normalizeArtistType),
-        (u'begin',    indexcreator.FIELD_INDEX_AND_STORE,  indexcreator.FIELD_TOKENIZE_NONE,    normalizeDate),
-        (u'end',      indexcreator.FIELD_INDEX_AND_STORE,  indexcreator.FIELD_TOKENIZE_NONE,    normalizeDate),
-        (u'comment',  indexcreator.FIELD_INDEX_AND_STORE,  indexcreator.FIELD_TOKENIZE_NORMAL,  normalizeText),
-        (u'alias',    indexcreator.FIELD_INDEX_AND_STORE,  indexcreator.FIELD_TOKENIZE_NORMAL,  normalizeText),
+        (u'arid',      indexcreator.FIELD_INDEX_AND_STORE,  indexcreator.FIELD_TOKENIZE_MBID,    None),
+        (u'artist',    indexcreator.FIELD_STORE,            indexcreator.FIELD_TOKENIZE_PRIMARY, normalizeText),
+        (u'artist',    indexcreator.FIELD_INDEX,            indexcreator.FIELD_TOKENIZE_NORMAL,  normalizeText),
+        (u'sortname',  indexcreator.FIELD_INDEX_AND_STORE,  indexcreator.FIELD_TOKENIZE_NORMAL,  normalizeText),
+        (u'type',      indexcreator.FIELD_INDEX_AND_STORE,  indexcreator.FIELD_TOKENIZE_NONE,    _normalizeArtistType),
+        (u'begin',     indexcreator.FIELD_INDEX_AND_STORE,  indexcreator.FIELD_TOKENIZE_NONE,    normalizeDate),
+        (u'end',       indexcreator.FIELD_INDEX_AND_STORE,  indexcreator.FIELD_TOKENIZE_NONE,    normalizeDate),
+        (u'comment',   indexcreator.FIELD_INDEX_AND_STORE,  indexcreator.FIELD_TOKENIZE_NORMAL,  normalizeText),
+        (u'alias',     indexcreator.FIELD_INDEX_AND_STORE,  indexcreator.FIELD_TOKENIZE_NORMAL,  normalizeText),
     ]
 
-    def __init__(self, indexName, clear, host, database, user, passwd):
-        indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
+    def __init__(self, indexName, clear, host, database, user, passwd, useSlices = False):
+        indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd, useSlices)
         self.aliasDict = {}
 
     def getRowCountQuery(self):
@@ -89,7 +88,7 @@
 
         conn.close()
 
-        return """SELECT gid, name || ' ' || sortname || ' ', name, 
+        return """SELECT gid, name, name || ' ' || sortname || ' ',
 	                 sortname, type, begindate, enddate, resolution
                     FROM artist
                    WHERE id BETWEEN %d AND %d ORDER BY id""" % (chunkNum * chunkSize, ((chunkNum + 1) * chunkSize) - 1)
@@ -99,13 +98,13 @@
         ret = super(ArtistIndex, self).processRow(row)
         try:
             ret.append(self.aliasDict[row[0]])
-	    ret[1] += ' ' + ' '.join(self.aliasDict[row[0]]) 
+	    ret[2] += ' ' + ' '.join(self.aliasDict[row[0]]) 
         except KeyError:
             ret.append([])
 
-	ret[1] = ret[1].replace(u',', u'').lower() 
-	words = ret[1].split(' ') 
+	ret[2] = ret[2].replace(u',', u'').lower() 
+	words = ret[2].split(' ') 
 	words = self.uniquer(words) 
-	ret[1] = ' '.join(words) 
+	ret[2] = ' '.join(words) 
 
         return ret

Modified: search_index/trunk/mbsearch/serverindex/la_annotationindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/la_annotationindex.py	2008-06-28 09:19:52 UTC (rev 9892)
+++ search_index/trunk/mbsearch/serverindex/la_annotationindex.py	2008-06-29 07:39:56 UTC (rev 9893)
@@ -40,8 +40,8 @@
               (u"text", indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NORMAL, normalizeText),
     ]
 
-    def __init__(self, indexName, clear, host, database, user, passwd):
-        indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
+    def __init__(self, indexName, clear, host, database, user, passwd, useSlices = False):
+        indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd, useSlices)
 
     def getRowCountQuery(self):
         return "SELECT max(id), count(id) from annotation where type = 3"

Modified: search_index/trunk/mbsearch/serverindex/labelindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/labelindex.py	2008-06-28 09:19:52 UTC (rev 9892)
+++ search_index/trunk/mbsearch/serverindex/labelindex.py	2008-06-29 07:39:56 UTC (rev 9893)
@@ -55,8 +55,8 @@
         (u'alias',    indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NORMAL,  normalizeText),
     ]
 
-    def __init__(self, indexName, clear, host, database, user, passwd):
-        indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
+    def __init__(self, indexName, clear, host, database, user, passwd, useSlices = False):
+        indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd, useSlices)
         self.aliasDict = {}
 
     def getRowCountQuery(self):

Modified: search_index/trunk/mbsearch/serverindex/re_annotationindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/re_annotationindex.py	2008-06-28 09:19:52 UTC (rev 9892)
+++ search_index/trunk/mbsearch/serverindex/re_annotationindex.py	2008-06-29 07:39:56 UTC (rev 9893)
@@ -40,8 +40,8 @@
               (u"text", indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NORMAL, normalizeText),
     ]
 
-    def __init__(self, indexName, clear, host, database, user, passwd):
-        indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
+    def __init__(self, indexName, clear, host, database, user, passwd, useSlices = False):
+        indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd, useSlices)
 
     def getRowCountQuery(self):
         return "SELECT max(id), count(id) from annotation where type = 2"

Modified: search_index/trunk/mbsearch/serverindex/releaseindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/releaseindex.py	2008-06-28 09:19:52 UTC (rev 9892)
+++ search_index/trunk/mbsearch/serverindex/releaseindex.py	2008-06-29 07:39:56 UTC (rev 9893)
@@ -153,8 +153,8 @@
              (u"barcode",  indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NONE,    normalizeText),
     ]
 
-    def __init__(self, indexName, clear, host, database, user, passwd):
-        indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
+    def __init__(self, indexName, clear, host, database, user, passwd, useSlices = False):
+        indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd, useSlices)
         self.parenRegexp = re.compile('\(.*?\)$|\[(.*?)\]$')
         self.dateRe = re.compile('-00')
 

Modified: search_index/trunk/mbsearch/serverindex/tr_annotationindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/tr_annotationindex.py	2008-06-28 09:19:52 UTC (rev 9892)
+++ search_index/trunk/mbsearch/serverindex/tr_annotationindex.py	2008-06-29 07:39:56 UTC (rev 9893)
@@ -40,8 +40,8 @@
               (u"text", indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NORMAL, normalizeText),
     ]
 
-    def __init__(self, indexName, clear, host, database, user, passwd):
-        indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
+    def __init__(self, indexName, clear, host, database, user, passwd, useSlices = False):
+        indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd, useSlices)
 
     def getRowCountQuery(self):
         return "SELECT max(id), count(id) from annotation where type = 4"

Modified: search_index/trunk/mbsearch/serverindex/trackindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/trackindex.py	2008-06-28 09:19:52 UTC (rev 9892)
+++ search_index/trunk/mbsearch/serverindex/trackindex.py	2008-06-29 07:39:56 UTC (rev 9893)
@@ -34,17 +34,11 @@
 TI_DURATION_QUANT = 2000
 
 
-def _normalizeDur(value):
-    if value is not None and value > 0:
-        return unicode(value)
-    return u''
-
 def _normalizeQdur(value):
     if value is not None and value > 0:
         return unicode(value / TI_DURATION_QUANT)
     return u''
 
-
 class TrackIndex(indexcreator.IndexCreator):
     '''
     This class specifies the details on how to create the track index. For more details on how
@@ -60,13 +54,13 @@
         (u'tracks',  indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NONE,   normalizeText),
         (u'trid',    indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_MBID,   None),
         (u'track',   indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_PRIMARY,normalizeText),
-        (u'dur',     indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NONE,   _normalizeDur),
-        (u'qdur',    indexcreator.FIELD_INDEX,           indexcreator.FIELD_TOKENIZE_NONE,   _normalizeQdur),
+        (u'dur',     indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NONE,   normalizeText),
+        (u'qdur',    indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NORMAL, _normalizeQdur),
         (u'tnum',    indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NONE,   normalizeText),
     ]
 
-    def __init__(self, indexName, clear, host, database, user, passwd):
-        indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
+    def __init__(self, indexName, clear, host, database, user, passwd, useSlices = False):
+        indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd, useSlices)
 
     def getRowCountQuery(self):
         return "SELECT max(id), count(*) FROM track"

Modified: search_index/trunk/tools/tinysearch.py
===================================================================
--- search_index/trunk/tools/tinysearch.py	2008-06-28 09:19:52 UTC (rev 9892)
+++ search_index/trunk/tools/tinysearch.py	2008-06-29 07:39:56 UTC (rev 9893)
@@ -1,81 +1,85 @@
 #!/usr/bin/env python
-#---------------------------------------------------------------------------
 #
-#   Pimp My Tunes -- The MusicBrainz command line tagger.
-#                    Let a gazllion tunes be tagged!
-#   
-#   Copyright (C) Robert Kaye 2005
-#   
-#   This file is part of pimpmytunes.
+# Simple command-line search script.
 #
-#   pimpmytunes is free software; you can redistribute it and/or modify
-#   it under the terms of the GNU General Public License as published by
-#   the Free Software Foundation; either version 2 of the License, or
-#   (at your option) any later version.
+# Copyright (C) 2003 James Aylett
+# Copyright (C) 2004,2007 Olly Betts
 #
-#   pimpmytunes is distributed in the hope that it will be useful,
-#   but WITHOUT ANY WARRANTY; without even the implied warranty of
-#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#   GNU General Public License for more details.
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of the
+# License, or (at your option) any later version.
 #
-#   You should have received a copy of the GNU General Public License
-#   along with pimpmytunes; if not, write to the Free Software
-#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
 #
-#---------------------------------------------------------------------------
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+# USA
 
-import sys, os, getopt, re
-import PyLucene
-import readline
+import sys
+import xapian
 from unac import unac
 
-def usage():
-    print "%s: <index dir> <field> <query>" % sys.argv[0]
-    sys.exit(-1)
+# We require at least two command line arguments.
+if len(sys.argv) < 4:
+    print >> sys.stderr, "Usage: %s PATH_TO_DATABASE FIELD QUERY" % sys.argv[0]
+    sys.exit(1)
 
-class Search(object):
+try:
+    # Open the database for searching.
+    database = xapian.Database(sys.argv[1])
 
-   def __init__(self, indexName):
-       self.analyzer = PyLucene.StandardAnalyzer()
-       try:
-           self.index = PyLucene.IndexSearcher(PyLucene.FSDirectory.getDirectory(indexName, False))
-       except ValueError:
-           raise NoSuchIndexError
+    # Start an enquire session.
+    enquire = xapian.Enquire(database)
 
-   def close(self):
-       self.index.close()
+    # Combine the rest of the command line arguments with spaces between
+    # them, so that simple queries don't have to be quoted at the shell
+    # level.
+    query_string = sys.argv[3]
+    for arg in sys.argv[4:]:
+        query_string += ' '
+        query_string += arg
 
-   def match(self, query, field, maxHits):
-       parser = PyLucene.QueryParser(field, self.analyzer)
-       parsedQuery = parser.parse(query)
-       hits = list(self.index.search(parsedQuery))
-       if not hits:
-           print "No hits"
-           return
+    # Parse the query string to produce a Xapian::Query object.
+    qp = xapian.QueryParser()
+    #stemmer = xapian.Stem("english")
+    #qp.set_stemmer(stemmer)
+    qp.set_database(database)
+    qp.set_stemming_strategy(xapian.QueryParser.STEM_NONE)
+    qp.set_default_op(xapian.Query.OP_AND)
+    qp.add_prefix("artist", "XARTIST")
+    qp.add_prefix("begin", "XBEGIN")
+    qp.add_prefix("end", "XEND")
+    qp.add_prefix("type", "XTYPE")
+    qp.add_prefix("arid", "XARID")
+    qp.add_prefix("release", "XRELEASE")
+    qp.add_prefix("track", "XTRACK")
+    qp.add_prefix("dur", "XDUR")
+    qp.add_prefix("qdur", "XQDUR")
 
-       for hit in hits[:maxHits]:
-           doc = hit.getDocument()
-           for field in doc.getFields():
-               print "%8s: " % field.name().encode('utf-8'),
-               for val in doc.getValues(field.name()):
-                   print "%s " % (val.encode('utf-8')),
-               print
-           print
+    query_string = unac.unac_string(unicode(query_string, 'utf-8'))
+    print "query: '%s'" % query_string
+    query = qp.parse_query(query_string, xapian.QueryParser.FLAG_PHRASE|
+	                                 xapian.QueryParser.FLAG_BOOLEAN|
+					 xapian.QueryParser.FLAG_LOVEHATE,
+					 "X" + sys.argv[2].upper())
+    print "Parsed query is: %s" % query.get_description()
 
-# Parse the command line args
-opts = None
-args = None
-indexDir = "."
+    # Find the top 10 results for the query.
+    enquire.set_query(query)
+    matches = enquire.get_mset(0, 10)
 
-try:
-    opts, args = getopt.getopt(sys.argv[1:], "hi:")
-except:
-    usage()
+    # Display the results.
+    print "%i results found." % matches.get_matches_estimated()
+    print "Results 1-%i:" % matches.size()
 
-for key, value in opts:
-    if key == "-h": usage()
+    for m in matches:
+        print "%i: %i%% docid=%i [%s]" % (m.rank + 1, m.percent, m.docid, m.document.get_data())
 
-if len(args) < 3: usage()
-
-s = Search(args[0])
-s.match(args[2], args[1], 10)
+except Exception, e:
+    print >> sys.stderr, "Exception: %s" % str(e)
+    sys.exit(1)




More information about the MusicBrainz-commits mailing list