[mb-commits] r9879 - in search_index/trunk/mbsearch: . serverindex

root at musicbrainz.org root at musicbrainz.org
Wed Jun 25 07:56:34 UTC 2008


Author: robert
Date: 2008-06-25 07:56:34 +0000 (Wed, 25 Jun 2008)
New Revision: 9879

Modified:
   search_index/trunk/mbsearch/indexcreator.py
   search_index/trunk/mbsearch/normalize.py
   search_index/trunk/mbsearch/serverindex/ar_annotationindex.py
   search_index/trunk/mbsearch/serverindex/artistindex.py
   search_index/trunk/mbsearch/serverindex/la_annotationindex.py
   search_index/trunk/mbsearch/serverindex/labelindex.py
   search_index/trunk/mbsearch/serverindex/re_annotationindex.py
   search_index/trunk/mbsearch/serverindex/releaseindex.py
   search_index/trunk/mbsearch/serverindex/tr_annotationindex.py
   search_index/trunk/mbsearch/serverindex/trackindex.py
Log:
Lots of tweaking and cleanup. Firmly committed to Xapian now as I've got things tweaked so they are
far better than Lucene. More bugs remain, but it's time for a check-in.


Modified: search_index/trunk/mbsearch/indexcreator.py
===================================================================
--- search_index/trunk/mbsearch/indexcreator.py	2008-06-23 11:54:54 UTC (rev 9878)
+++ search_index/trunk/mbsearch/indexcreator.py	2008-06-25 07:56:34 UTC (rev 9879)
@@ -27,19 +27,16 @@
 import re, time, sys, string
 import xapian
 import psycopg2
-import normalize
+import re
 from unac import unac
+from escape_ideographic import addSpacesToIdeographicStrings
 
 # Number of rows to process for each database 'chunk'
 IDS_PER_CHUNK = 2000
 STATS_WINDOW_SIZE = 6
-MAX_RAM_SIZE = 32000000
 
 # TODO: Query DB in transactions to make sure final index is consistent
 
-FIELD_STORE_YES = 1
-FIELD_STORE_NO = 0
-
 FIELD_TOKENIZE_YES = 1
 FIELD_TOKENIZE_NO = 0
 
@@ -60,21 +57,14 @@
         self.user = user
         self.passwd = passwd
 
-        self.normalize = normalize.Normalize()
+	self.dotsRe = re.compile("((?:\w\.){2,})")
+
         self.indexer = xapian.TermGenerator()
-        #stemmer = xapian.Stem("english")
-        #self.indexer.set_stemmer(stemmer)
         if create:
              self.index = xapian.WritableDatabase(indexName, xapian.DB_CREATE_OR_OVERWRITE)
         else:
              self.index = xapian.WritableDatabase(indexName, xapian.DB_OPEN)
          
-    def getAnalyzer(self):
-        indexer = xapian.TermGenerator()
-        stemmer = xapian.Stem("english")
-        indexer.set_stemmer(stemmer)
-        return indexer
-
     def close(self):
         '''
         Close the index by discarding a ref
@@ -112,13 +102,31 @@
 
         ret = []
         for field, value in zip(self.FIELDS, row):
-            normalize = field[4]
-            if normalize is not None:
-                value = normalize(value)
+	    normalize = field[4]
+	    if normalize is not None:
+		value = normalize(value)
             ret.append(value)
 
         return ret
+  
+    def removeDots(self, query):
+	'''
+	Remove dots between characters so we can find R.E.M.
+	'''
 
+	bits = []
+	index = 0
+	for m in self.dotsRe.finditer(query):
+	    bits.append(query[index:m.start()])
+	    acronym = query[m.start():m.end()]
+	    bits.append(acronym.replace(u".", u""))
+	    index = m.end()
+
+	if index < len(query):
+	    bits.append(query[index:len(query)])
+
+	return u''.join(bits)
+
     def run(self, doTest = False):
         '''
         Execute the query, massage the returned data and pass it to
@@ -145,7 +153,7 @@
 
         numChunks = (maxId / IDS_PER_CHUNK) + 1
         if doTest:
-            numChunks = min(numChunks, 25)
+            numChunks = min(numChunks, 2)
 
         # Record the start time
         t0 = time.time()
@@ -173,7 +181,7 @@
                         data = [ data ]
 		        isList = False
 
-                    name, weight, method = field[:3]
+                    name, weight, method, tokenize = field[:4]
                     for i, text in enumerate(data):
                         # Ensure that the value is an unicode string
                         if not text: continue
@@ -190,9 +198,20 @@
 			        
                             #if method == FIELD_INDEX: print text.encode('utf-8')
                             if method in (FIELD_INDEX, FIELD_INDEX_AND_STORE):
-                                self.indexer.index_text(unac.unac_string(text.strip()), weight, u"X" + name.upper())
+				if tokenize:
+                                    text = text.strip().lower()
+                                    text = unac.unac_string(text)
+                                    text = self.removeDots(text) 
+                                    text = addSpacesToIdeographicStrings(text) 
+                                    self.indexer.index_text(text, weight, u"X" + name.upper())
+				    # TEST HACK
+				    if name == u"artist": doc.add_value(0, u"%d" % len(text.split(u" ")))
+			        else:
+				    doc.add_term(u"X" + name.upper() + text, weight)
+			        self.indexer.increase_termpos()
 
                 # Add the document to the index
+		#print "data: %s" % (storedata.encode('utf-8', 'replace'))
                 doc.set_data(storedata)
                 self.index.add_document(doc)
                 rowsThisChunk += 1

Modified: search_index/trunk/mbsearch/normalize.py
===================================================================
--- search_index/trunk/mbsearch/normalize.py	2008-06-23 11:54:54 UTC (rev 9878)
+++ search_index/trunk/mbsearch/normalize.py	2008-06-25 07:56:34 UTC (rev 9879)
@@ -32,34 +32,16 @@
 TYPES = (u'album', u'single', u'ep', u'compilation', u'soundtrack', u'spokenword', 
          u'interview', u'audiobook', u'live', u'remix', u'other')
 
-class Normalize(object):
-    '''
-    Normalize input according to our own normalization rules. This way we can mix searches with
-    stop words and searches without stop words in one index.
-    '''
+_dateRe = re.compile('-00')
 
-    def normalize(self, text):
-        '''
-        return the normalized version of this query. See inline comments for exactly what normalization entails. 
-        '''
-
-        # If not unicode, make it unicode
-        if text.__class__.__name__ in ["float", "int"]:
-            text = unicode(text)
-        elif text.__class__.__name__ == "str":
-            text = unicode(text, 'utf-8')
-
-        # unaccent the string and return it
-        return unac.unac_string(text)
-
 def normalizeText(value):
     """Normalize text value from DB."""
     if value is None:
-        return u''
+	return u''
     if isinstance(value, (float, int)):
-        return unicode(value)
+	return unicode(value)
     if isinstance(value, str):
-        value = value.decode('utf-8')
+	value = value.decode('utf-8')
     return value
 
 def normalizeReleaseType(value):
@@ -74,18 +56,13 @@
 	        return u''
         return u''
 
-_dateRe = re.compile('-00')
-
 def normalizeDate(value):
     """Normalize date value from DB."""
-    # FIXME this should return date in format yyyyMMdd, not yyyy-MM-dd (see
-    # org.apache.PyLucene.document.DateTools)
     if value is None:
         return u''
     value = value.decode('utf-8')
     return _dateRe.sub(u'', value)
 
-if __name__ == '__main__':
-    n = Normalize()
-    print n.normalize("The test string %#$")
-    print n.stopWordNormalize("In the Kingdom of the Blind the One-Eyed Are Kings")
+def normalizeMBID(value):
+    """Normalize an MBID UUID"""
+    return value.replace(u'-', u'')

Modified: search_index/trunk/mbsearch/serverindex/ar_annotationindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/ar_annotationindex.py	2008-06-23 11:54:54 UTC (rev 9878)
+++ search_index/trunk/mbsearch/serverindex/ar_annotationindex.py	2008-06-25 07:56:34 UTC (rev 9879)
@@ -26,8 +26,7 @@
 
 import re
 from mbsearch import indexcreator
-from mbsearch.normalize import normalizeText
-from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
+from mbsearch.normalize import normalizeText, normalizeMBID
 
 class ArtistAnnotationIndex(indexcreator.IndexCreator):
     '''
@@ -35,18 +34,15 @@
     '''
 
     FIELDS = [
-              (u"mbid", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  None), 
-              (u"name", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
-              (u"type", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
-              (u"text", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+              (u"mbid", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeMBID), 
+              (u"name", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+              (u"type", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
+              (u"text", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
     ]
 
     def __init__(self, indexName, clear, host, database, user, passwd):
         indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
 
-    def getAnalyzer(self):
-        return StandardUnaccentAnalyzer()
-
     def getRowCountQuery(self):
         return "SELECT max(id), count(id) from annotation where type = 1"
 

Modified: search_index/trunk/mbsearch/serverindex/artistindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/artistindex.py	2008-06-23 11:54:54 UTC (rev 9878)
+++ search_index/trunk/mbsearch/serverindex/artistindex.py	2008-06-25 07:56:34 UTC (rev 9879)
@@ -27,8 +27,7 @@
 import re
 import psycopg2
 from mbsearch import indexcreator
-from mbsearch.normalize import normalizeText, normalizeDate
-from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
+from mbsearch.normalize import normalizeText, normalizeDate, normalizeMBID
 
 TYPES = (u'unknown', u'person', u'group')
 
@@ -42,24 +41,20 @@
     """This class specifies the details on how to create the artist index."""
 
     FIELDS = [
-        (u'arid',     1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, None),
-        (u'artist',   10, indexcreator.FIELD_INDEX,           indexcreator.FIELD_TOKENIZE_YES, normalizeText),
-        (u'artist',   0, indexcreator.FIELD_STORE,           indexcreator.FIELD_TOKENIZE_YES, normalizeText),
-        (u'sortname', 10, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
-        (u'type',     1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, _normalizeArtistType),
-        (u'begin',    1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeDate),
-        (u'end',      1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeDate),
-        (u'comment',  1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
-        (u'alias',    1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+        (u'arid',     1, indexcreator.FIELD_INDEX_AND_STORE,  indexcreator.FIELD_TOKENIZE_NO,  normalizeMBID),
+        (u'artist',   1, indexcreator.FIELD_INDEX_AND_STORE,  indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+        (u'sortname', 1, indexcreator.FIELD_INDEX_AND_STORE,  indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+        (u'type',     1, indexcreator.FIELD_INDEX_AND_STORE,  indexcreator.FIELD_TOKENIZE_NO,  _normalizeArtistType),
+        (u'begin',    1, indexcreator.FIELD_INDEX_AND_STORE,  indexcreator.FIELD_TOKENIZE_NO,  normalizeDate),
+        (u'end',      1, indexcreator.FIELD_INDEX_AND_STORE,  indexcreator.FIELD_TOKENIZE_NO,  normalizeDate),
+        (u'comment',  1, indexcreator.FIELD_INDEX_AND_STORE,  indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+        (u'alias',    0, indexcreator.FIELD_INDEX_AND_STORE,  indexcreator.FIELD_TOKENIZE_YES, normalizeText),
     ]
 
     def __init__(self, indexName, clear, host, database, user, passwd):
         indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
         self.aliasDict = {}
 
-    def getAnalyzer(self):
-        return StandardUnaccentAnalyzer()
-
     def getRowCountQuery(self):
         return "SELECT max(id), count(*) FROM artist"
 
@@ -80,40 +75,16 @@
 
         conn.close()
 
-        return """SELECT gid, name || ' ' || sortname || ' ',
-	                 name, sortname, type, begindate, enddate, resolution
+        return """SELECT gid, name, sortname, type, begindate, enddate, resolution
                     FROM artist
                    WHERE id BETWEEN %d AND %d ORDER BY id""" % (chunkNum * chunkSize, ((chunkNum + 1) * chunkSize) - 1)
 
-
-    def uniquer(self, seq, idfun=None):
-	if idfun is None:
-	    def idfun(x): return x
-	seen = {}
-	result = []
-	for item in seq:
-	    marker = idfun(item)
-	    # in old Python versions:
-	    # if seen.has_key(marker)
-	    # but in new ones:
-	    if marker in seen: continue
-	    seen[marker] = 1
-	    result.append(item)
-	return result
-
     def processRow(self, row):
         """Normalize appropriate columns of the row."""
         ret = super(ArtistIndex, self).processRow(row)
         try:
             ret.append(self.aliasDict[row[0]])
-            #ret[1] += ' '.join(self.aliasDict[row[0]])
         except KeyError:
             ret.append([])
 
-        # TODO: Remove no word characters
-        #ret[1] = ret[1].replace(u',', u'').lower()
-        #words = ret[1].split(' ')
-	#words = self.uniquer(words)
-	#ret[1] = ' '.join(words)
-
         return ret

Modified: search_index/trunk/mbsearch/serverindex/la_annotationindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/la_annotationindex.py	2008-06-23 11:54:54 UTC (rev 9878)
+++ search_index/trunk/mbsearch/serverindex/la_annotationindex.py	2008-06-25 07:56:34 UTC (rev 9879)
@@ -26,8 +26,7 @@
 
 import re
 from mbsearch import indexcreator
-from mbsearch.normalize import normalizeText
-from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
+from mbsearch.normalize import normalizeText, normalizeMBID
 
 class LabelAnnotationIndex(indexcreator.IndexCreator):
     '''
@@ -35,18 +34,15 @@
     '''
 
     FIELDS = [
-              (u"mbid", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  None), 
-              (u"name", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
-              (u"type", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
-              (u"text", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+              (u"mbid", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeMBID), 
+              (u"name", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+              (u"type", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
+              (u"text", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
     ]
 
     def __init__(self, indexName, clear, host, database, user, passwd):
         indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
 
-    def getAnalyzer(self):
-        return StandardUnaccentAnalyzer()
-
     def getRowCountQuery(self):
         return "SELECT max(id), count(id) from annotation where type = 3"
 

Modified: search_index/trunk/mbsearch/serverindex/labelindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/labelindex.py	2008-06-23 11:54:54 UTC (rev 9878)
+++ search_index/trunk/mbsearch/serverindex/labelindex.py	2008-06-25 07:56:34 UTC (rev 9879)
@@ -27,8 +27,7 @@
 import re
 import psycopg2
 from mbsearch import indexcreator
-from mbsearch.normalize import normalizeText, normalizeDate
-from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
+from mbsearch.normalize import normalizeText, normalizeDate, normalizeMBID
 
 TYPES = (u'unknown', u'distributor', u'holding', u'production', u'orig. prod.', 
          u'bootleg prod.', u'reissue prod.', u'publisher')
@@ -43,25 +42,22 @@
     """This class specifies the details on how to create the label index."""
 
     FIELDS = [
-        (u'laid',     indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  None),
-        (u'label',    indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
-        (u'sortname', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
-        (u'type',     indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, _normalizeLabelType),
-        (u'code',     indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
-        (u'country',  indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
-        (u'begin',    indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeDate),
-        (u'end',      indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeDate),
-        (u'comment',  indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
-        (u'alias',    indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+        (u'laid',     1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeMBID),
+        (u'label',    1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+        (u'sortname', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+        (u'type',     1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, _normalizeLabelType),
+        (u'code',     1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
+        (u'country',  1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
+        (u'begin',    1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeDate),
+        (u'end',      1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeDate),
+        (u'comment',  1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+        (u'alias',    1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
     ]
 
     def __init__(self, indexName, clear, host, database, user, passwd):
         indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
         self.aliasDict = {}
 
-    def getAnalyzer(self):
-        return StandardUnaccentAnalyzer()
-
     def getRowCountQuery(self):
         return "SELECT max(id), count(*) FROM label"
 
@@ -82,7 +78,7 @@
 
         conn.close()
 
-        return """SELECT gid, label.name, sortname, type, labelcode, lower(isocode), begindate, enddate, resolution
+        return """SELECT gid, label.name, sortname, type, labelcode, isocode, begindate, enddate, resolution
                     FROM label LEFT JOIN country ON label.country = country.id 
                    WHERE label.id BETWEEN %d AND %d""" % (chunkNum * chunkSize, ((chunkNum + 1) * chunkSize) - 1)
 

Modified: search_index/trunk/mbsearch/serverindex/re_annotationindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/re_annotationindex.py	2008-06-23 11:54:54 UTC (rev 9878)
+++ search_index/trunk/mbsearch/serverindex/re_annotationindex.py	2008-06-25 07:56:34 UTC (rev 9879)
@@ -26,8 +26,7 @@
 
 import re
 from mbsearch import indexcreator
-from mbsearch.normalize import normalizeText
-from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
+from mbsearch.normalize import normalizeText, normalizeMBID
 
 class ReleaseAnnotationIndex(indexcreator.IndexCreator):
     '''
@@ -35,10 +34,10 @@
     '''
 
     FIELDS = [
-              (u"mbid", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  None), 
-              (u"name", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
-              (u"type", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
-              (u"text", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+              (u"mbid", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeMBID), 
+              (u"name", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+              (u"type", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
+              (u"text", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
     ]
 
     def __init__(self, indexName, clear, host, database, user, passwd):

Modified: search_index/trunk/mbsearch/serverindex/releaseindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/releaseindex.py	2008-06-23 11:54:54 UTC (rev 9878)
+++ search_index/trunk/mbsearch/serverindex/releaseindex.py	2008-06-25 07:56:34 UTC (rev 9879)
@@ -27,8 +27,7 @@
 import re
 import psycopg2
 from mbsearch import indexcreator
-from mbsearch.normalize import normalizeText, normalizeDate, normalizeReleaseType
-from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
+from mbsearch.normalize import normalizeText, normalizeDate, normalizeReleaseType, normalizeMBID
 
 ALBUM_STATUS_FIRST = 100
 ALBUM_STATUS_LAST = 102
@@ -136,22 +135,22 @@
     '''
 
     FIELDS = [ 
-             (u"arid",      indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  None),
-             (u"artist",    indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
-             (u"reid",      indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  None),
-             (u"release",   indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
-             (u"type",      indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeReleaseType),
-             (u"status",    indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  _normalizeReleaseStatus),
-             (u"tracks",    indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
-             (u"discids",   indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
-             (u"asin",      indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
-             (u"lang",      indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, _normalizeLanguage),
-             (u"script",    indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, _normalizeScript),
-             (u"country",   indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
-             (u"date",      indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeDate),
-             (u"label",     indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
-             (u"catno",     indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
-             (u"barcode",   indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+             (u"arid",     1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeMBID),
+             (u"artist",   1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+             (u"reid",     1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeMBID),
+             (u"release",  1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+             (u"type",     1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeReleaseType),
+             (u"status",   1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  _normalizeReleaseStatus),
+             (u"tracks",   1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
+             (u"discids",  1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
+             (u"asin",     1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+             (u"lang",     1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  _normalizeLanguage),
+             (u"script",   1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  _normalizeScript),
+             (u"country",  1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+             (u"date",     1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeDate),
+             (u"label",    1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+             (u"catno",    1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
+             (u"barcode",  1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
     ]
 
     def __init__(self, indexName, clear, host, database, user, passwd):
@@ -159,9 +158,6 @@
         self.parenRegexp = re.compile('\(.*?\)$|\[(.*?)\]$')
         self.dateRe = re.compile('-00')
 
-    def getAnalyzer(self):
-        return StandardUnaccentAnalyzer()
-
     def getRowCountQuery(self):
         return "SELECT max(id), count(*) FROM album"
 

Modified: search_index/trunk/mbsearch/serverindex/tr_annotationindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/tr_annotationindex.py	2008-06-23 11:54:54 UTC (rev 9878)
+++ search_index/trunk/mbsearch/serverindex/tr_annotationindex.py	2008-06-25 07:56:34 UTC (rev 9879)
@@ -26,8 +26,7 @@
 
 import re
 from mbsearch import indexcreator
-from mbsearch.normalize import normalizeText
-from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
+from mbsearch.normalize import normalizeText, normalizeMBID
 
 class TrackAnnotationIndex(indexcreator.IndexCreator):
     '''
@@ -35,18 +34,15 @@
     '''
 
     FIELDS = [
-              (u"mbid", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  None), 
-              (u"name", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
-              (u"type", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
-              (u"text", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+              (u"mbid", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeMBID), 
+              (u"name", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+              (u"type", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
+              (u"text", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
     ]
 
     def __init__(self, indexName, clear, host, database, user, passwd):
         indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
 
-    def getAnalyzer(self):
-        return StandardUnaccentAnalyzer()
-
     def getRowCountQuery(self):
         return "SELECT max(id), count(id) from annotation where type = 4"
 

Modified: search_index/trunk/mbsearch/serverindex/trackindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/trackindex.py	2008-06-23 11:54:54 UTC (rev 9878)
+++ search_index/trunk/mbsearch/serverindex/trackindex.py	2008-06-25 07:56:34 UTC (rev 9879)
@@ -26,8 +26,7 @@
 
 import re
 from mbsearch import indexcreator
-from mbsearch.normalize import normalizeText, normalizeDate, normalizeReleaseType
-from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
+from mbsearch.normalize import normalizeText, normalizeDate, normalizeReleaseType, normalizeMBID
 
 ALBUM_ATTR_FIRST = 1
 ALBUM_ATTR_LAST = 11
@@ -53,25 +52,22 @@
     '''
 
     FIELDS = [
-        (u'arid',     indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  None),
-        (u'artist',   indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
-        (u'reid',     indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  None),
-        (u'release',  indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
-        (u'type',     indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeReleaseType),
-        (u'tracks',   indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
-        (u'trid',     indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  None),
-        (u'track',    indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
-        (u'dur',      indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  _normalizeDur),
-        (u'qdur',     indexcreator.FIELD_STORE_NO,  indexcreator.FIELD_TOKENIZE_NO,  _normalizeQdur),
-        (u'tnum',     indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
+        (u'arid',    1,  indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeMBID),
+        (u'artist',  1,  indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+        (u'reid',    1,  indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeMBID),
+        (u'release', 1,  indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+        (u'type',    1,  indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeReleaseType),
+        (u'tracks',  1,  indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
+        (u'trid',    1,  indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeMBID),
+        (u'track',   1,  indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
+        (u'dur',     1,  indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  _normalizeDur),
+        (u'qdur',    1,  indexcreator.FIELD_INDEX,           indexcreator.FIELD_TOKENIZE_NO,  _normalizeQdur),
+        (u'tnum',    1,  indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
     ]
 
     def __init__(self, indexName, clear, host, database, user, passwd):
         indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
 
-    def getAnalyzer(self):
-        return StandardUnaccentAnalyzer()
-
     def getRowCountQuery(self):
         return "SELECT max(id), count(*) FROM track"
 




More information about the MusicBrainz-commits mailing list