[mb-commits] r9863 - in search_index/trunk: . mbsearch mbsearch/serverindex mbsearch/serverindex/analyzers

root at musicbrainz.org root at musicbrainz.org
Sat May 31 04:24:30 UTC 2008


Author: robert
Date: 2008-05-31 04:24:30 +0000 (Sat, 31 May 2008)
New Revision: 9863

Modified:
   search_index/trunk/README
   search_index/trunk/builder.py
   search_index/trunk/mbsearch/indexcreator.py
   search_index/trunk/mbsearch/normalize.py
   search_index/trunk/mbsearch/serverindex/analyzers/unaccent.py
   search_index/trunk/mbsearch/serverindex/ar_annotationindex.py
   search_index/trunk/mbsearch/serverindex/artistindex.py
   search_index/trunk/mbsearch/serverindex/la_annotationindex.py
   search_index/trunk/mbsearch/serverindex/labelindex.py
   search_index/trunk/mbsearch/serverindex/re_annotationindex.py
   search_index/trunk/mbsearch/serverindex/releaseindex.py
   search_index/trunk/mbsearch/serverindex/tr_annotationindex.py
   search_index/trunk/mbsearch/serverindex/trackindex.py
Log:
Port search indexes to use xapian


Modified: search_index/trunk/README
===================================================================
--- search_index/trunk/README	2008-05-31 04:19:10 UTC (rev 9862)
+++ search_index/trunk/README	2008-05-31 04:24:30 UTC (rev 9863)
@@ -1,4 +1,4 @@
-Lucene Indexes for MusicBrainz
+Search Indexes for MusicBrainz
 ------------------------------
 
 Introduction
@@ -16,14 +16,15 @@
 The following software packages are required to build the lucene indexes
 (use the most recent version, unless otherwise noted)
 
-postgres 8.x.x     - A postgres install with a full MusicBrainz
-                     data set. See the INSTALL file in mb_server
-                     for details on this. Or only the client portions
-		     if postgres is running on another server.
-python 2.4         - http://python.org (2.4 or better)
-PsychopgTwo        - http://www.initd.org/tracker/psycopg/wiki/PsycopgTwo
-PyLucene 2.2.x     - http://downloads.osafoundation.org/PyLucene
-PyUnac 1.7.x       - http://download.gna.org/unac/python-unac-1.7.0.tar.gz
+postgres 8.x.x         - A postgres install with a full MusicBrainz
+                         data set. See the INSTALL file in mb_server
+                         for details on this. Or only the client portions
+		         if postgres is running on another server.
+python 2.4             - http://python.org (2.4 or better)
+PsychopgTwo            - http://www.initd.org/tracker/psycopg/wiki/PsycopgTwo
+Xapian                 - http://xapian.org/download.php
+Xapian Python bindings - http://xapian.org/download.php
+PyUnac 1.7.x           - http://download.gna.org/unac/python-unac-1.7.0.tar.gz
 
 Needed for Python versions 2.4.x and prior
 ctypes             - http://starship.python.net/crew/theller/ctypes/
@@ -53,4 +54,4 @@
    artist_index, annotation_index, label_index, release_index, track_index
 
 Once the builder.py script completes, you can move these files into the 
-lucene_server index directory. (See lucene_index/README for details on this).
+search_server index directory. (See search_server/README for details on this).

Modified: search_index/trunk/builder.py
===================================================================
--- search_index/trunk/builder.py	2008-05-31 04:19:10 UTC (rev 9862)
+++ search_index/trunk/builder.py	2008-05-31 04:24:30 UTC (rev 9863)
@@ -25,7 +25,6 @@
 #---------------------------------------------------------------------------
 
 import sys, getopt
-import PyLucene
 
 def usage():
     print "%s: [-h -d -u -p -t -b]" % sys.argv[0]
@@ -74,8 +73,6 @@
     index = artistindex.ArtistIndex("data/artist_index", True, host, database, user, passwd)
     ok = index.run(test)
     if ok:
-        print "optimizing"
-        index.optimize()
         index.close()
         print "done\n"
 
@@ -84,8 +81,6 @@
     index = releaseindex.ReleaseIndex("data/release_index", True, host, database, user, passwd)
     ok = index.run(test)
     if ok:
-        print "optimizing"
-        index.optimize()
         index.close()
         print "done\n"
    
@@ -94,8 +89,6 @@
     index = trackindex.TrackIndex("data/track_index", True, host, database, user, passwd)
     ok = index.run(test)
     if ok:
-        print "optimizing"
-        index.optimize()
         index.close()
         print "done\n"
 
@@ -119,8 +112,6 @@
 		index = tr_annotationindex.TrackAnnotationIndex("data/annotation_index", False, host, database, user, passwd)
 		ok = index.run(test)
 		if ok:
-		    print "optimizing"
-		    index.optimize()
 		    index.close()
 		    print "done\n"
 		else:
@@ -130,9 +121,6 @@
     print "Creating label index:"
     index = labelindex.LabelIndex("data/label_index", True, host, database, user, passwd)
     ok = index.run(test)
-    print "optimizing"
     if ok:
-        ok = index.optimize()
-        if ok:
-            index.close()
-            print "done\n"
+         index.close()
+         print "done\n"

Modified: search_index/trunk/mbsearch/indexcreator.py
===================================================================
--- search_index/trunk/mbsearch/indexcreator.py	2008-05-31 04:19:10 UTC (rev 9862)
+++ search_index/trunk/mbsearch/indexcreator.py	2008-05-31 04:24:30 UTC (rev 9863)
@@ -24,10 +24,11 @@
 #
 #---------------------------------------------------------------------------
 
-import PyLucene, re, time, sys
+import re, time, sys, string
+import xapian
 import psycopg2
+import normalize
 from unac import unac
-import normalize
 
 # Number of rows to process for each database 'chunk'
 IDS_PER_CHUNK = 2000
@@ -36,6 +37,16 @@
 
 # TODO: Query DB in transactions to make sure final index is consistent
 
+FIELD_STORE_YES = 1
+FIELD_STORE_NO = 0
+
+FIELD_TOKENIZE_YES = 1
+FIELD_TOKENIZE_NO = 0
+
+FIELD_INDEX_AND_STORE = 0
+FIELD_INDEX = 1
+FIELD_STORE = 2
+
 class IndexCreator(object):
     '''
     This class is the base class for creating lucene indexes from the
@@ -49,30 +60,26 @@
         self.user = user
         self.passwd = passwd
 
-        self.analyzer = self.getAnalyzer()
         self.normalize = normalize.Normalize()
-
-        self.index = PyLucene.IndexWriter(
-               PyLucene.FSDirectory.getDirectory(indexName, create), 
-               self.analyzer, create)
-        self.index.setMergeFactor(100)
-        self.index.setMaxBufferedDocs(1000)
-        self.index.setUseCompoundFile(False)
+        self.indexer = xapian.TermGenerator()
+        #stemmer = xapian.Stem("english")
+        #self.indexer.set_stemmer(stemmer)
+        if create:
+             self.index = xapian.WritableDatabase(indexName, xapian.DB_CREATE_OR_OVERWRITE)
+        else:
+             self.index = xapian.WritableDatabase(indexName, xapian.DB_OPEN)
          
     def getAnalyzer(self):
-        return PyLucene.StandardAnalyzer()
+        indexer = xapian.TermGenerator()
+        stemmer = xapian.Stem("english")
+        indexer.set_stemmer(stemmer)
+        return indexer
 
-    def optimize(self):
-        '''
-        Optimize the current index
-        '''
-        self.index.optimize()
-
     def close(self):
         '''
-        Close the lucene index
+        Close the index by discarding a ref
         '''
-        self.index.close()
+        del self.index
 
     def getRowCountQuery(self):
         """Return the SQL query that determines how many rows need
@@ -105,7 +112,7 @@
 
         ret = []
         for field, value in zip(self.FIELDS, row):
-            normalize = field[3]
+            normalize = field[4]
             if normalize is not None:
                 value = normalize(value)
             ret.append(value)
@@ -127,25 +134,8 @@
             print "Cannot connect to the database: %s" % unicode(msg)
             return False
 
-        # get the current replication number
+        # Get the number of rows we need to process
         curs = conn.cursor()
-        curs.execute("SELECT current_replication_sequence, last_replication_date FROM replication_control");
-        rows = curs.fetchall()
-        if len(rows) and rows[0][0]:
-            row = rows[0]
-            doc = PyLucene.Document()
-            doc.add(PyLucene.Field(u"repseqkey", u'1',
-                                   PyLucene.Field.Store.YES,
-                                   PyLucene.Field.Index.TOKENIZED))
-            doc.add(PyLucene.Field(u"repseq", u'%d' % row[0],
-                                   PyLucene.Field.Store.YES,
-                                   PyLucene.Field.Index.TOKENIZED))
-            doc.add(PyLucene.Field(u"repdate", unicode(row[1].ctime(), 'utf-8'),
-                                   PyLucene.Field.Store.YES,
-                                   PyLucene.Field.Index.TOKENIZED))
-            self.index.addDocument(doc)
-
-        # Get the number of rows we need to process
         curs.execute(self.getRowCountQuery())
         rows = curs.fetchall()
         if not rows: return False
@@ -155,7 +145,7 @@
 
         numChunks = (maxId / IDS_PER_CHUNK) + 1
         if doTest:
-            numChunks = min(numChunks, 3)
+            numChunks = min(numChunks, 25)
 
         # Record the start time
         t0 = time.time()
@@ -172,29 +162,40 @@
                 row = self.processRow(row)
                 index = 0
 
-                doc = PyLucene.Document()
+                doc = xapian.Document()
+                self.indexer.set_document(doc)
+                storedata = u""
                 for field, data in zip(self.FIELDS, row):
+
+		    isList = True
                     # if it's not a list of items, make it a list
                     if not isinstance(data, list):
                         data = [ data ]
+		        isList = False
 
-                    name, store, index = field[:3]
-                    for text in data:
+                    name, weight, method = field[:3]
+                    for i, text in enumerate(data):
                         # Ensure that the value is an unicode string
-                        if text == None: 
-                            text = u''
+                        if not text: continue
                         if not isinstance(text, basestring):
                             text = unicode(text)
                         # Add the pair to the current document
                         if text:
 			    #print "%s:%s" % (name.encode('utf-8', 'replace'), text.encode('utf-8', 'replace'))
-                            doc.add(PyLucene.Field(name, text.strip(), store, index))
+                            if method in (FIELD_STORE, FIELD_INDEX_AND_STORE): 
+                                if isList:
+                                    storedata += u"%s%d=%s\n" % (name, i, text.replace(u"\n", u"\\n").strip())
+			        else:
+                                    storedata += u"%s=%s\n" % (name, text.replace(u"\n", u"\\n").strip())
+			        
+                            #if method == FIELD_INDEX: print text.encode('utf-8')
+                            if method in (FIELD_INDEX, FIELD_INDEX_AND_STORE):
+                                self.indexer.index_text(unac.unac_string(text.strip()), weight, u"X" + name.upper())
 
                 # Add the document to the index
-                self.index.addDocument(doc)
+                doc.set_data(storedata)
+                self.index.add_document(doc)
                 rowsThisChunk += 1
-                if self.index.ramSizeInBytes() > MAX_RAM_SIZE:
-                    self.index.flush()
 
             t2 = time.time()
             rowsIndexed += rowsThisChunk

Modified: search_index/trunk/mbsearch/normalize.py
===================================================================
--- search_index/trunk/mbsearch/normalize.py	2008-05-31 04:19:10 UTC (rev 9862)
+++ search_index/trunk/mbsearch/normalize.py	2008-05-31 04:24:30 UTC (rev 9863)
@@ -25,7 +25,6 @@
 #---------------------------------------------------------------------------
 
 import re
-import PyLucene
 from unac import unac
 
 ALBUM_ATTR_FIRST = 1
@@ -53,22 +52,6 @@
         # unaccent the string and return it
         return unac.unac_string(text)
 
-    def stopWordNormalize(self, text):
-        '''
-        return the normalized version of this query. Same as the standard normalization, but stop words
-        are also removed
-        '''
-
-        stopWords = PyLucene.StopAnalyzer.ENGLISH_STOP_WORDS
-        words = self.normalize(text).split()
-        for stopWord in stopWords:
-            try:
-                while True: del words[words.index(stopWord)]
-            except ValueError:
-                pass
-           
-        return ' '.join(words)
-
 def normalizeText(value):
     """Normalize text value from DB."""
     if value is None:

Modified: search_index/trunk/mbsearch/serverindex/analyzers/unaccent.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/analyzers/unaccent.py	2008-05-31 04:19:10 UTC (rev 9862)
+++ search_index/trunk/mbsearch/serverindex/analyzers/unaccent.py	2008-05-31 04:24:30 UTC (rev 9863)
@@ -1,32 +1,32 @@
-import PyLucene
-from unac import unac
+#from unac import unac
 
-class AccentFilter(object):
-    """Remove accents from the input text."""
+#class AccentFilter(object):
+#    """Remove accents from the input text."""
+#
+#    def __init__(self, input):
+#        self.input = input
+#
+#    def __iter__(self):
+#        return self
+#
+#    def next(self):
+#        token = self.input.next()
+#        if token is not None:
+#            text = unac.unac_string(token.termText())
+#            token = PyLucene.Token(text, token.startOffset(),
+#                                   token.endOffset(), token.type())
+#        return token
 
-    def __init__(self, input):
-        self.input = input
 
-    def __iter__(self):
-        return self
-
-    def next(self):
-        token = self.input.next()
-        if token is not None:
-            text = unac.unac_string(token.termText())
-            token = PyLucene.Token(text, token.startOffset(),
-                                   token.endOffset(), token.type())
-        return token
-
-
 class StandardUnaccentAnalyzer(object):
     """Like PyLucene.StandardAnalyzer but unaccents the input text."""
-
-    def tokenStream(self, fieldName, reader):
-        res = PyLucene.StandardTokenizer(reader)
-        res = PyLucene.StandardFilter(res)
-        res = PyLucene.StopFilter(res,
-                                  PyLucene.StopAnalyzer().ENGLISH_STOP_WORDS)
-        res = AccentFilter(res)
-        res = PyLucene.LowerCaseFilter(res)
-        return res
+    pass
+#
+#    def tokenStream(self, fieldName, reader):
+#        res = PyLucene.StandardTokenizer(reader)
+#        res = PyLucene.StandardFilter(res)
+#        res = PyLucene.StopFilter(res,
+#                                  PyLucene.StopAnalyzer().ENGLISH_STOP_WORDS)
+#        res = AccentFilter(res)
+#        res = PyLucene.LowerCaseFilter(res)
+#        return res

Modified: search_index/trunk/mbsearch/serverindex/ar_annotationindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/ar_annotationindex.py	2008-05-31 04:19:10 UTC (rev 9862)
+++ search_index/trunk/mbsearch/serverindex/ar_annotationindex.py	2008-05-31 04:24:30 UTC (rev 9863)
@@ -25,7 +25,6 @@
 #---------------------------------------------------------------------------
 
 import re
-import PyLucene
 from mbsearch import indexcreator
 from mbsearch.normalize import normalizeText
 from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
@@ -36,10 +35,10 @@
     '''
 
     FIELDS = [
-              (u"mbid", PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, None), 
-              (u"name", PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    normalizeText),
-              (u"type", PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeText),
-              (u"text", PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    normalizeText),
+              (u"mbid", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  None), 
+              (u"name", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+              (u"type", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
+              (u"text", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
     ]
 
     def __init__(self, indexName, clear, host, database, user, passwd):

Modified: search_index/trunk/mbsearch/serverindex/artistindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/artistindex.py	2008-05-31 04:19:10 UTC (rev 9862)
+++ search_index/trunk/mbsearch/serverindex/artistindex.py	2008-05-31 04:24:30 UTC (rev 9863)
@@ -25,7 +25,6 @@
 #---------------------------------------------------------------------------
 
 import re
-import PyLucene
 import psycopg2
 from mbsearch import indexcreator
 from mbsearch.normalize import normalizeText, normalizeDate
@@ -43,14 +42,15 @@
     """This class specifies the details on how to create the artist index."""
 
     FIELDS = [
-        (u'arid',     PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, None),
-        (u'artist',   PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    normalizeText),
-        (u'sortname', PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    normalizeText),
-        (u'type',     PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, _normalizeArtistType),
-        (u'begin',    PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeDate),
-        (u'end',      PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeDate),
-        (u'comment',  PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    normalizeText),
-        (u'alias',    PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    normalizeText),
+        (u'arid',     1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, None),
+        (u'artist',   10, indexcreator.FIELD_INDEX,           indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+        (u'artist',   0, indexcreator.FIELD_STORE,           indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+        (u'sortname', 10, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+        (u'type',     1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, _normalizeArtistType),
+        (u'begin',    1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeDate),
+        (u'end',      1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeDate),
+        (u'comment',  1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+        (u'alias',    1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
     ]
 
     def __init__(self, indexName, clear, host, database, user, passwd):
@@ -80,15 +80,40 @@
 
         conn.close()
 
-        return """SELECT gid, name, sortname, type, begindate, enddate, resolution
+        return """SELECT gid, name || ' ' || sortname || ' ',
+	                 name, sortname, type, begindate, enddate, resolution
                     FROM artist
                    WHERE id BETWEEN %d AND %d ORDER BY id""" % (chunkNum * chunkSize, ((chunkNum + 1) * chunkSize) - 1)
 
+
+    def uniquer(self, seq, idfun=None):
+	if idfun is None:
+	    def idfun(x): return x
+	seen = {}
+	result = []
+	for item in seq:
+	    marker = idfun(item)
+	    # in old Python versions:
+	    # if seen.has_key(marker)
+	    # but in new ones:
+	    if marker in seen: continue
+	    seen[marker] = 1
+	    result.append(item)
+	return result
+
     def processRow(self, row):
         """Normalize appropriate columns of the row."""
         ret = super(ArtistIndex, self).processRow(row)
         try:
             ret.append(self.aliasDict[row[0]])
+            #ret[1] += ' '.join(self.aliasDict[row[0]])
         except KeyError:
             ret.append([])
+
+        # TODO: Remove non-word characters
+        #ret[1] = ret[1].replace(u',', u'').lower()
+        #words = ret[1].split(' ')
+	#words = self.uniquer(words)
+	#ret[1] = ' '.join(words)
+
         return ret

Modified: search_index/trunk/mbsearch/serverindex/la_annotationindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/la_annotationindex.py	2008-05-31 04:19:10 UTC (rev 9862)
+++ search_index/trunk/mbsearch/serverindex/la_annotationindex.py	2008-05-31 04:24:30 UTC (rev 9863)
@@ -25,7 +25,6 @@
 #---------------------------------------------------------------------------
 
 import re
-import PyLucene
 from mbsearch import indexcreator
 from mbsearch.normalize import normalizeText
 from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
@@ -36,10 +35,10 @@
     '''
 
     FIELDS = [
-              (u"mbid", PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, None), 
-              (u"name", PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    normalizeText),
-              (u"type", PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeText),
-              (u"text", PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    normalizeText),
+              (u"mbid", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  None), 
+              (u"name", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+              (u"type", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
+              (u"text", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
     ]
 
     def __init__(self, indexName, clear, host, database, user, passwd):

Modified: search_index/trunk/mbsearch/serverindex/labelindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/labelindex.py	2008-05-31 04:19:10 UTC (rev 9862)
+++ search_index/trunk/mbsearch/serverindex/labelindex.py	2008-05-31 04:24:30 UTC (rev 9863)
@@ -25,7 +25,6 @@
 #---------------------------------------------------------------------------
 
 import re
-import PyLucene
 import psycopg2
 from mbsearch import indexcreator
 from mbsearch.normalize import normalizeText, normalizeDate
@@ -44,16 +43,16 @@
     """This class specifies the details on how to create the label index."""
 
     FIELDS = [
-        (u'laid',     PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, None),
-        (u'label',    PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    normalizeText),
-        (u'sortname', PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    normalizeText),
-        (u'type',     PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    _normalizeLabelType),
-        (u'code',     PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeText),
-        (u'country',  PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeText),
-        (u'begin',    PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeDate),
-        (u'end',      PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeDate),
-        (u'comment',  PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    normalizeText),
-        (u'alias',    PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    normalizeText),
+        (u'laid',     indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  None),
+        (u'label',    indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+        (u'sortname', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+        (u'type',     indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, _normalizeLabelType),
+        (u'code',     indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
+        (u'country',  indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
+        (u'begin',    indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeDate),
+        (u'end',      indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeDate),
+        (u'comment',  indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+        (u'alias',    indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
     ]
 
     def __init__(self, indexName, clear, host, database, user, passwd):

Modified: search_index/trunk/mbsearch/serverindex/re_annotationindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/re_annotationindex.py	2008-05-31 04:19:10 UTC (rev 9862)
+++ search_index/trunk/mbsearch/serverindex/re_annotationindex.py	2008-05-31 04:24:30 UTC (rev 9863)
@@ -25,7 +25,6 @@
 #---------------------------------------------------------------------------
 
 import re
-import PyLucene
 from mbsearch import indexcreator
 from mbsearch.normalize import normalizeText
 from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
@@ -36,10 +35,10 @@
     '''
 
     FIELDS = [
-              (u"mbid", PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, None), 
-              (u"name", PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    normalizeText),
-              (u"type", PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeText),
-              (u"text", PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    normalizeText),
+              (u"mbid", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  None), 
+              (u"name", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+              (u"type", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
+              (u"text", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
     ]
 
     def __init__(self, indexName, clear, host, database, user, passwd):

Modified: search_index/trunk/mbsearch/serverindex/releaseindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/releaseindex.py	2008-05-31 04:19:10 UTC (rev 9862)
+++ search_index/trunk/mbsearch/serverindex/releaseindex.py	2008-05-31 04:24:30 UTC (rev 9863)
@@ -24,7 +24,7 @@
 #
 #---------------------------------------------------------------------------
 
-import PyLucene, re
+import re
 import psycopg2
 from mbsearch import indexcreator
 from mbsearch.normalize import normalizeText, normalizeDate, normalizeReleaseType
@@ -136,22 +136,22 @@
     '''
 
     FIELDS = [ 
-             (u"arid",      PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, None),
-             (u"artist",    PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    normalizeText),
-             (u"reid",      PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, None),
-             (u"release",   PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    normalizeText),
-             (u"type",      PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeReleaseType),
-             (u"status",    PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, _normalizeReleaseStatus),
-             (u"tracks",    PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeText),
-             (u"discids",   PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeText),
-             (u"asin",      PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    normalizeText),
-             (u"lang",      PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    _normalizeLanguage),
-             (u"script",    PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    _normalizeScript),
-             (u"country",   PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    normalizeText),
-             (u"date",      PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeDate),
-             (u"label",     PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    normalizeText),
-             (u"catno",     PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    normalizeText),
-             (u"barcode",   PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    normalizeText),
+             (u"arid",      indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  None),
+             (u"artist",    indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+             (u"reid",      indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  None),
+             (u"release",   indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+             (u"type",      indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeReleaseType),
+             (u"status",    indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  _normalizeReleaseStatus),
+             (u"tracks",    indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
+             (u"discids",   indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
+             (u"asin",      indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+             (u"lang",      indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, _normalizeLanguage),
+             (u"script",    indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, _normalizeScript),
+             (u"country",   indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+             (u"date",      indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeDate),
+             (u"label",     indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+             (u"catno",     indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+             (u"barcode",   indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
     ]
 
     def __init__(self, indexName, clear, host, database, user, passwd):
@@ -183,11 +183,11 @@
                 self.aliasDict[row[0]] = { 'country' : [], 'date' : [], 'label' : [], 'catno' : [], 'barcode' : [] };
 
             ref = self.aliasDict[row[0]]
-            ref['country'].append(unicode(row[1] or '-', 'utf-8'))
-            ref['date'].append(self.dateRe.sub('', unicode(row[2], 'utf-8')) or u'-')
-            ref['label'].append(unicode(row[3] or '-', 'utf-8'))
-            ref['catno'].append(unicode(row[4] or '-', 'utf-8'))
-            ref['barcode'].append(unicode(row[5] or '-', 'utf-8'))
+            ref['country'].append(unicode(row[1] or '', 'utf-8'))
+            ref['date'].append(self.dateRe.sub('', unicode(row[2], 'utf-8')) or u'')
+            ref['label'].append(unicode(row[3] or '', 'utf-8'))
+            ref['catno'].append(unicode(row[4] or '', 'utf-8'))
+            ref['barcode'].append(unicode(row[5] or '', 'utf-8'))
 
         conn.close()
 

Modified: search_index/trunk/mbsearch/serverindex/tr_annotationindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/tr_annotationindex.py	2008-05-31 04:19:10 UTC (rev 9862)
+++ search_index/trunk/mbsearch/serverindex/tr_annotationindex.py	2008-05-31 04:24:30 UTC (rev 9863)
@@ -25,7 +25,6 @@
 #---------------------------------------------------------------------------
 
 import re
-import PyLucene
 from mbsearch import indexcreator
 from mbsearch.normalize import normalizeText
 from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
@@ -36,10 +35,10 @@
     '''
 
     FIELDS = [
-              (u"mbid", PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, None), 
-              (u"name", PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    normalizeText),
-              (u"type", PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeText),
-              (u"text", PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    normalizeText),
+              (u"mbid", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  None), 
+              (u"name", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+              (u"type", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
+              (u"text", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
     ]
 
     def __init__(self, indexName, clear, host, database, user, passwd):

Modified: search_index/trunk/mbsearch/serverindex/trackindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/trackindex.py	2008-05-31 04:19:10 UTC (rev 9862)
+++ search_index/trunk/mbsearch/serverindex/trackindex.py	2008-05-31 04:24:30 UTC (rev 9863)
@@ -24,7 +24,7 @@
 #
 #---------------------------------------------------------------------------
 
-import PyLucene, re
+import re
 from mbsearch import indexcreator
 from mbsearch.normalize import normalizeText, normalizeDate, normalizeReleaseType
 from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
@@ -53,17 +53,17 @@
     '''
 
     FIELDS = [
-        (u'arid',     PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, None),
-        (u'artist',   PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    normalizeText),
-        (u'reid',     PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, None),
-        (u'release',  PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    normalizeText),
-        (u'type',     PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeReleaseType),
-        (u'tracks',   PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeText),
-        (u'trid',     PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, None),
-        (u'track',    PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED,    normalizeText),
-        (u'dur',      PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, _normalizeDur),
-        (u'qdur',     PyLucene.Field.Store.NO,  PyLucene.Field.Index.UN_TOKENIZED, _normalizeQdur),
-        (u'tnum',     PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeText),
+        (u'arid',     indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  None),
+        (u'artist',   indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+        (u'reid',     indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  None),
+        (u'release',  indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+        (u'type',     indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeReleaseType),
+        (u'tracks',   indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
+        (u'trid',     indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  None),
+        (u'track',    indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+        (u'dur',      indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  _normalizeDur),
+        (u'qdur',     indexcreator.FIELD_STORE_NO,  indexcreator.FIELD_TOKENIZE_NO,  _normalizeQdur),
+        (u'tnum',     indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO,  normalizeText),
     ]
 
     def __init__(self, indexName, clear, host, database, user, passwd):




More information about the MusicBrainz-commits mailing list