[mb-commits] r9863 - in search_index/trunk: . mbsearch mbsearch/serverindex mbsearch/serverindex/analyzers
root at musicbrainz.org
root at musicbrainz.org
Sat May 31 04:24:30 UTC 2008
Author: robert
Date: 2008-05-31 04:24:30 +0000 (Sat, 31 May 2008)
New Revision: 9863
Modified:
search_index/trunk/README
search_index/trunk/builder.py
search_index/trunk/mbsearch/indexcreator.py
search_index/trunk/mbsearch/normalize.py
search_index/trunk/mbsearch/serverindex/analyzers/unaccent.py
search_index/trunk/mbsearch/serverindex/ar_annotationindex.py
search_index/trunk/mbsearch/serverindex/artistindex.py
search_index/trunk/mbsearch/serverindex/la_annotationindex.py
search_index/trunk/mbsearch/serverindex/labelindex.py
search_index/trunk/mbsearch/serverindex/re_annotationindex.py
search_index/trunk/mbsearch/serverindex/releaseindex.py
search_index/trunk/mbsearch/serverindex/tr_annotationindex.py
search_index/trunk/mbsearch/serverindex/trackindex.py
Log:
Port search indexes to use xapian
Modified: search_index/trunk/README
===================================================================
--- search_index/trunk/README 2008-05-31 04:19:10 UTC (rev 9862)
+++ search_index/trunk/README 2008-05-31 04:24:30 UTC (rev 9863)
@@ -1,4 +1,4 @@
-Lucene Indexes for MusicBrainz
+Search Indexes for MusicBrainz
------------------------------
Introduction
@@ -16,14 +16,15 @@
The following software packages are required to build the lucene indexes
(use the most recent version, unless otherwise noted)
-postgres 8.x.x - A postgres install with a full MusicBrainz
- data set. See the INSTALL file in mb_server
- for details on this. Or only the client portions
- if postgres is running on another server.
-python 2.4 - http://python.org (2.4 or better)
-PsychopgTwo - http://www.initd.org/tracker/psycopg/wiki/PsycopgTwo
-PyLucene 2.2.x - http://downloads.osafoundation.org/PyLucene
-PyUnac 1.7.x - http://download.gna.org/unac/python-unac-1.7.0.tar.gz
+postgres 8.x.x - A postgres install with a full MusicBrainz
+ data set. See the INSTALL file in mb_server
+ for details on this. Or only the client portions
+ if postgres is running on another server.
+python 2.4 - http://python.org (2.4 or better)
+PsycopgTwo - http://www.initd.org/tracker/psycopg/wiki/PsycopgTwo
+Xapian - http://xapian.org/download.php
+Xapian Python bindings - http://xapian.org/download.php
+PyUnac 1.7.x - http://download.gna.org/unac/python-unac-1.7.0.tar.gz
Needed for Python versions 2.4.x and prior
ctypes - http://starship.python.net/crew/theller/ctypes/
@@ -53,4 +54,4 @@
artist_index, annotation_index, label_index, release_index, track_index
Once the builder.py script completes, you can move these files into the
-lucene_server index directory. (See lucene_index/README for details on this).
+search_server index directory. (See search_server/README for details on this).
Modified: search_index/trunk/builder.py
===================================================================
--- search_index/trunk/builder.py 2008-05-31 04:19:10 UTC (rev 9862)
+++ search_index/trunk/builder.py 2008-05-31 04:24:30 UTC (rev 9863)
@@ -25,7 +25,6 @@
#---------------------------------------------------------------------------
import sys, getopt
-import PyLucene
def usage():
print "%s: [-h -d -u -p -t -b]" % sys.argv[0]
@@ -74,8 +73,6 @@
index = artistindex.ArtistIndex("data/artist_index", True, host, database, user, passwd)
ok = index.run(test)
if ok:
- print "optimizing"
- index.optimize()
index.close()
print "done\n"
@@ -84,8 +81,6 @@
index = releaseindex.ReleaseIndex("data/release_index", True, host, database, user, passwd)
ok = index.run(test)
if ok:
- print "optimizing"
- index.optimize()
index.close()
print "done\n"
@@ -94,8 +89,6 @@
index = trackindex.TrackIndex("data/track_index", True, host, database, user, passwd)
ok = index.run(test)
if ok:
- print "optimizing"
- index.optimize()
index.close()
print "done\n"
@@ -119,8 +112,6 @@
index = tr_annotationindex.TrackAnnotationIndex("data/annotation_index", False, host, database, user, passwd)
ok = index.run(test)
if ok:
- print "optimizing"
- index.optimize()
index.close()
print "done\n"
else:
@@ -130,9 +121,6 @@
print "Creating label index:"
index = labelindex.LabelIndex("data/label_index", True, host, database, user, passwd)
ok = index.run(test)
- print "optimizing"
if ok:
- ok = index.optimize()
- if ok:
- index.close()
- print "done\n"
+ index.close()
+ print "done\n"
Modified: search_index/trunk/mbsearch/indexcreator.py
===================================================================
--- search_index/trunk/mbsearch/indexcreator.py 2008-05-31 04:19:10 UTC (rev 9862)
+++ search_index/trunk/mbsearch/indexcreator.py 2008-05-31 04:24:30 UTC (rev 9863)
@@ -24,10 +24,11 @@
#
#---------------------------------------------------------------------------
-import PyLucene, re, time, sys
+import re, time, sys, string
+import xapian
import psycopg2
+import normalize
from unac import unac
-import normalize
# Number of rows to process for each database 'chunk'
IDS_PER_CHUNK = 2000
@@ -36,6 +37,16 @@
# TODO: Query DB in transactions to make sure final index is consistent
+FIELD_STORE_YES = 1
+FIELD_STORE_NO = 0
+
+FIELD_TOKENIZE_YES = 1
+FIELD_TOKENIZE_NO = 0
+
+FIELD_INDEX_AND_STORE = 0
+FIELD_INDEX = 1
+FIELD_STORE = 2
+
class IndexCreator(object):
'''
This class is the base class for creating lucene indexes from the
@@ -49,30 +60,26 @@
self.user = user
self.passwd = passwd
- self.analyzer = self.getAnalyzer()
self.normalize = normalize.Normalize()
-
- self.index = PyLucene.IndexWriter(
- PyLucene.FSDirectory.getDirectory(indexName, create),
- self.analyzer, create)
- self.index.setMergeFactor(100)
- self.index.setMaxBufferedDocs(1000)
- self.index.setUseCompoundFile(False)
+ self.indexer = xapian.TermGenerator()
+ #stemmer = xapian.Stem("english")
+ #self.indexer.set_stemmer(stemmer)
+ if create:
+ self.index = xapian.WritableDatabase(indexName, xapian.DB_CREATE_OR_OVERWRITE)
+ else:
+ self.index = xapian.WritableDatabase(indexName, xapian.DB_OPEN)
def getAnalyzer(self):
- return PyLucene.StandardAnalyzer()
+ indexer = xapian.TermGenerator()
+ stemmer = xapian.Stem("english")
+ indexer.set_stemmer(stemmer)
+ return indexer
- def optimize(self):
- '''
- Optimize the current index
- '''
- self.index.optimize()
-
def close(self):
'''
- Close the lucene index
+ Close the index by discarding a ref
'''
- self.index.close()
+ del self.index
def getRowCountQuery(self):
"""Return the SQL query that determines how many rows need
@@ -105,7 +112,7 @@
ret = []
for field, value in zip(self.FIELDS, row):
- normalize = field[3]
+ normalize = field[4]
if normalize is not None:
value = normalize(value)
ret.append(value)
@@ -127,25 +134,8 @@
print "Cannot connect to the database: %s" % unicode(msg)
return False
- # get the current replication number
+ # Get the number of rows we need to process
curs = conn.cursor()
- curs.execute("SELECT current_replication_sequence, last_replication_date FROM replication_control");
- rows = curs.fetchall()
- if len(rows) and rows[0][0]:
- row = rows[0]
- doc = PyLucene.Document()
- doc.add(PyLucene.Field(u"repseqkey", u'1',
- PyLucene.Field.Store.YES,
- PyLucene.Field.Index.TOKENIZED))
- doc.add(PyLucene.Field(u"repseq", u'%d' % row[0],
- PyLucene.Field.Store.YES,
- PyLucene.Field.Index.TOKENIZED))
- doc.add(PyLucene.Field(u"repdate", unicode(row[1].ctime(), 'utf-8'),
- PyLucene.Field.Store.YES,
- PyLucene.Field.Index.TOKENIZED))
- self.index.addDocument(doc)
-
- # Get the number of rows we need to process
curs.execute(self.getRowCountQuery())
rows = curs.fetchall()
if not rows: return False
@@ -155,7 +145,7 @@
numChunks = (maxId / IDS_PER_CHUNK) + 1
if doTest:
- numChunks = min(numChunks, 3)
+ numChunks = min(numChunks, 25)
# Record the start time
t0 = time.time()
@@ -172,29 +162,40 @@
row = self.processRow(row)
index = 0
- doc = PyLucene.Document()
+ doc = xapian.Document()
+ self.indexer.set_document(doc)
+ storedata = u""
for field, data in zip(self.FIELDS, row):
+
+ isList = True
# if it's not a list of items, make it a list
if not isinstance(data, list):
data = [ data ]
+ isList = False
- name, store, index = field[:3]
- for text in data:
+ name, weight, method = field[:3]
+ for i, text in enumerate(data):
# Ensure that the value is a unicode string
- if text == None:
- text = u''
+ if not text: continue
if not isinstance(text, basestring):
text = unicode(text)
# Add the pair to the current document
if text:
#print "%s:%s" % (name.encode('utf-8', 'replace'), text.encode('utf-8', 'replace'))
- doc.add(PyLucene.Field(name, text.strip(), store, index))
+ if method in (FIELD_STORE, FIELD_INDEX_AND_STORE):
+ if isList:
+ storedata += u"%s%d=%s\n" % (name, i, text.replace(u"\n", u"\\n").strip())
+ else:
+ storedata += u"%s=%s\n" % (name, text.replace(u"\n", u"\\n").strip())
+
+ #if method == FIELD_INDEX: print text.encode('utf-8')
+ if method in (FIELD_INDEX, FIELD_INDEX_AND_STORE):
+ self.indexer.index_text(unac.unac_string(text.strip()), weight, u"X" + name.upper())
# Add the document to the index
- self.index.addDocument(doc)
+ doc.set_data(storedata)
+ self.index.add_document(doc)
rowsThisChunk += 1
- if self.index.ramSizeInBytes() > MAX_RAM_SIZE:
- self.index.flush()
t2 = time.time()
rowsIndexed += rowsThisChunk
Modified: search_index/trunk/mbsearch/normalize.py
===================================================================
--- search_index/trunk/mbsearch/normalize.py 2008-05-31 04:19:10 UTC (rev 9862)
+++ search_index/trunk/mbsearch/normalize.py 2008-05-31 04:24:30 UTC (rev 9863)
@@ -25,7 +25,6 @@
#---------------------------------------------------------------------------
import re
-import PyLucene
from unac import unac
ALBUM_ATTR_FIRST = 1
@@ -53,22 +52,6 @@
# unaccent the string and return it
return unac.unac_string(text)
- def stopWordNormalize(self, text):
- '''
- return the normalized version of this query. Same as the standard normalization, but stop words
- are also removed
- '''
-
- stopWords = PyLucene.StopAnalyzer.ENGLISH_STOP_WORDS
- words = self.normalize(text).split()
- for stopWord in stopWords:
- try:
- while True: del words[words.index(stopWord)]
- except ValueError:
- pass
-
- return ' '.join(words)
-
def normalizeText(value):
"""Normalize text value from DB."""
if value is None:
Modified: search_index/trunk/mbsearch/serverindex/analyzers/unaccent.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/analyzers/unaccent.py 2008-05-31 04:19:10 UTC (rev 9862)
+++ search_index/trunk/mbsearch/serverindex/analyzers/unaccent.py 2008-05-31 04:24:30 UTC (rev 9863)
@@ -1,32 +1,32 @@
-import PyLucene
-from unac import unac
+#from unac import unac
-class AccentFilter(object):
- """Remove accents from the input text."""
+#class AccentFilter(object):
+# """Remove accents from the input text."""
+#
+# def __init__(self, input):
+# self.input = input
+#
+# def __iter__(self):
+# return self
+#
+# def next(self):
+# token = self.input.next()
+# if token is not None:
+# text = unac.unac_string(token.termText())
+# token = PyLucene.Token(text, token.startOffset(),
+# token.endOffset(), token.type())
+# return token
- def __init__(self, input):
- self.input = input
- def __iter__(self):
- return self
-
- def next(self):
- token = self.input.next()
- if token is not None:
- text = unac.unac_string(token.termText())
- token = PyLucene.Token(text, token.startOffset(),
- token.endOffset(), token.type())
- return token
-
-
class StandardUnaccentAnalyzer(object):
"""Like PyLucene.StandardAnalyzer but unaccents the input text."""
-
- def tokenStream(self, fieldName, reader):
- res = PyLucene.StandardTokenizer(reader)
- res = PyLucene.StandardFilter(res)
- res = PyLucene.StopFilter(res,
- PyLucene.StopAnalyzer().ENGLISH_STOP_WORDS)
- res = AccentFilter(res)
- res = PyLucene.LowerCaseFilter(res)
- return res
+ pass
+#
+# def tokenStream(self, fieldName, reader):
+# res = PyLucene.StandardTokenizer(reader)
+# res = PyLucene.StandardFilter(res)
+# res = PyLucene.StopFilter(res,
+# PyLucene.StopAnalyzer().ENGLISH_STOP_WORDS)
+# res = AccentFilter(res)
+# res = PyLucene.LowerCaseFilter(res)
+# return res
Modified: search_index/trunk/mbsearch/serverindex/ar_annotationindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/ar_annotationindex.py 2008-05-31 04:19:10 UTC (rev 9862)
+++ search_index/trunk/mbsearch/serverindex/ar_annotationindex.py 2008-05-31 04:24:30 UTC (rev 9863)
@@ -25,7 +25,6 @@
#---------------------------------------------------------------------------
import re
-import PyLucene
from mbsearch import indexcreator
from mbsearch.normalize import normalizeText
from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
@@ -36,10 +35,10 @@
'''
FIELDS = [
- (u"mbid", PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, None),
- (u"name", PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, normalizeText),
- (u"type", PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeText),
- (u"text", PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, normalizeText),
+ (u"mbid", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, None),
+ (u"name", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u"type", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
+ (u"text", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
]
def __init__(self, indexName, clear, host, database, user, passwd):
Modified: search_index/trunk/mbsearch/serverindex/artistindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/artistindex.py 2008-05-31 04:19:10 UTC (rev 9862)
+++ search_index/trunk/mbsearch/serverindex/artistindex.py 2008-05-31 04:24:30 UTC (rev 9863)
@@ -25,7 +25,6 @@
#---------------------------------------------------------------------------
import re
-import PyLucene
import psycopg2
from mbsearch import indexcreator
from mbsearch.normalize import normalizeText, normalizeDate
@@ -43,14 +42,15 @@
"""This class specifies the details on how to create the artist index."""
FIELDS = [
- (u'arid', PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, None),
- (u'artist', PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, normalizeText),
- (u'sortname', PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, normalizeText),
- (u'type', PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, _normalizeArtistType),
- (u'begin', PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeDate),
- (u'end', PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeDate),
- (u'comment', PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, normalizeText),
- (u'alias', PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, normalizeText),
+ (u'arid', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, None),
+ (u'artist', 10, indexcreator.FIELD_INDEX, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u'artist', 0, indexcreator.FIELD_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u'sortname', 10, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u'type', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, _normalizeArtistType),
+ (u'begin', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeDate),
+ (u'end', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeDate),
+ (u'comment', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u'alias', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
]
def __init__(self, indexName, clear, host, database, user, passwd):
@@ -80,15 +80,40 @@
conn.close()
- return """SELECT gid, name, sortname, type, begindate, enddate, resolution
+ return """SELECT gid, name || ' ' || sortname || ' ',
+ name, sortname, type, begindate, enddate, resolution
FROM artist
WHERE id BETWEEN %d AND %d ORDER BY id""" % (chunkNum * chunkSize, ((chunkNum + 1) * chunkSize) - 1)
+
+ def uniquer(self, seq, idfun=None):
+ if idfun is None:
+ def idfun(x): return x
+ seen = {}
+ result = []
+ for item in seq:
+ marker = idfun(item)
+ # in old Python versions:
+ # if seen.has_key(marker)
+ # but in new ones:
+ if marker in seen: continue
+ seen[marker] = 1
+ result.append(item)
+ return result
+
def processRow(self, row):
"""Normalize appropriate columns of the row."""
ret = super(ArtistIndex, self).processRow(row)
try:
ret.append(self.aliasDict[row[0]])
+ #ret[1] += ' '.join(self.aliasDict[row[0]])
except KeyError:
ret.append([])
+
+ # TODO: Remove non-word characters
+ #ret[1] = ret[1].replace(u',', u'').lower()
+ #words = ret[1].split(' ')
+ #words = self.uniquer(words)
+ #ret[1] = ' '.join(words)
+
return ret
Modified: search_index/trunk/mbsearch/serverindex/la_annotationindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/la_annotationindex.py 2008-05-31 04:19:10 UTC (rev 9862)
+++ search_index/trunk/mbsearch/serverindex/la_annotationindex.py 2008-05-31 04:24:30 UTC (rev 9863)
@@ -25,7 +25,6 @@
#---------------------------------------------------------------------------
import re
-import PyLucene
from mbsearch import indexcreator
from mbsearch.normalize import normalizeText
from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
@@ -36,10 +35,10 @@
'''
FIELDS = [
- (u"mbid", PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, None),
- (u"name", PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, normalizeText),
- (u"type", PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeText),
- (u"text", PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, normalizeText),
+ (u"mbid", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, None),
+ (u"name", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u"type", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
+ (u"text", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
]
def __init__(self, indexName, clear, host, database, user, passwd):
Modified: search_index/trunk/mbsearch/serverindex/labelindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/labelindex.py 2008-05-31 04:19:10 UTC (rev 9862)
+++ search_index/trunk/mbsearch/serverindex/labelindex.py 2008-05-31 04:24:30 UTC (rev 9863)
@@ -25,7 +25,6 @@
#---------------------------------------------------------------------------
import re
-import PyLucene
import psycopg2
from mbsearch import indexcreator
from mbsearch.normalize import normalizeText, normalizeDate
@@ -44,16 +43,16 @@
"""This class specifies the details on how to create the label index."""
FIELDS = [
- (u'laid', PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, None),
- (u'label', PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, normalizeText),
- (u'sortname', PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, normalizeText),
- (u'type', PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, _normalizeLabelType),
- (u'code', PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeText),
- (u'country', PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeText),
- (u'begin', PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeDate),
- (u'end', PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeDate),
- (u'comment', PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, normalizeText),
- (u'alias', PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, normalizeText),
+ (u'laid', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, None),
+ (u'label', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u'sortname', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u'type', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, _normalizeLabelType),
+ (u'code', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
+ (u'country', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
+ (u'begin', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeDate),
+ (u'end', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeDate),
+ (u'comment', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u'alias', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
]
def __init__(self, indexName, clear, host, database, user, passwd):
Modified: search_index/trunk/mbsearch/serverindex/re_annotationindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/re_annotationindex.py 2008-05-31 04:19:10 UTC (rev 9862)
+++ search_index/trunk/mbsearch/serverindex/re_annotationindex.py 2008-05-31 04:24:30 UTC (rev 9863)
@@ -25,7 +25,6 @@
#---------------------------------------------------------------------------
import re
-import PyLucene
from mbsearch import indexcreator
from mbsearch.normalize import normalizeText
from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
@@ -36,10 +35,10 @@
'''
FIELDS = [
- (u"mbid", PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, None),
- (u"name", PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, normalizeText),
- (u"type", PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeText),
- (u"text", PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, normalizeText),
+ (u"mbid", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, None),
+ (u"name", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u"type", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
+ (u"text", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
]
def __init__(self, indexName, clear, host, database, user, passwd):
Modified: search_index/trunk/mbsearch/serverindex/releaseindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/releaseindex.py 2008-05-31 04:19:10 UTC (rev 9862)
+++ search_index/trunk/mbsearch/serverindex/releaseindex.py 2008-05-31 04:24:30 UTC (rev 9863)
@@ -24,7 +24,7 @@
#
#---------------------------------------------------------------------------
-import PyLucene, re
+import re
import psycopg2
from mbsearch import indexcreator
from mbsearch.normalize import normalizeText, normalizeDate, normalizeReleaseType
@@ -136,22 +136,22 @@
'''
FIELDS = [
- (u"arid", PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, None),
- (u"artist", PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, normalizeText),
- (u"reid", PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, None),
- (u"release", PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, normalizeText),
- (u"type", PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeReleaseType),
- (u"status", PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, _normalizeReleaseStatus),
- (u"tracks", PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeText),
- (u"discids", PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeText),
- (u"asin", PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, normalizeText),
- (u"lang", PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, _normalizeLanguage),
- (u"script", PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, _normalizeScript),
- (u"country", PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, normalizeText),
- (u"date", PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeDate),
- (u"label", PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, normalizeText),
- (u"catno", PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, normalizeText),
- (u"barcode", PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, normalizeText),
+ (u"arid", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, None),
+ (u"artist", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u"reid", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, None),
+ (u"release", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u"type", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeReleaseType),
+ (u"status", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, _normalizeReleaseStatus),
+ (u"tracks", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
+ (u"discids", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
+ (u"asin", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u"lang", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, _normalizeLanguage),
+ (u"script", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, _normalizeScript),
+ (u"country", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u"date", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeDate),
+ (u"label", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u"catno", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u"barcode", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
]
def __init__(self, indexName, clear, host, database, user, passwd):
@@ -183,11 +183,11 @@
self.aliasDict[row[0]] = { 'country' : [], 'date' : [], 'label' : [], 'catno' : [], 'barcode' : [] };
ref = self.aliasDict[row[0]]
- ref['country'].append(unicode(row[1] or '-', 'utf-8'))
- ref['date'].append(self.dateRe.sub('', unicode(row[2], 'utf-8')) or u'-')
- ref['label'].append(unicode(row[3] or '-', 'utf-8'))
- ref['catno'].append(unicode(row[4] or '-', 'utf-8'))
- ref['barcode'].append(unicode(row[5] or '-', 'utf-8'))
+ ref['country'].append(unicode(row[1] or '', 'utf-8'))
+ ref['date'].append(self.dateRe.sub('', unicode(row[2], 'utf-8')) or u'')
+ ref['label'].append(unicode(row[3] or '', 'utf-8'))
+ ref['catno'].append(unicode(row[4] or '', 'utf-8'))
+ ref['barcode'].append(unicode(row[5] or '', 'utf-8'))
conn.close()
Modified: search_index/trunk/mbsearch/serverindex/tr_annotationindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/tr_annotationindex.py 2008-05-31 04:19:10 UTC (rev 9862)
+++ search_index/trunk/mbsearch/serverindex/tr_annotationindex.py 2008-05-31 04:24:30 UTC (rev 9863)
@@ -25,7 +25,6 @@
#---------------------------------------------------------------------------
import re
-import PyLucene
from mbsearch import indexcreator
from mbsearch.normalize import normalizeText
from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
@@ -36,10 +35,10 @@
'''
FIELDS = [
- (u"mbid", PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, None),
- (u"name", PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, normalizeText),
- (u"type", PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeText),
- (u"text", PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, normalizeText),
+ (u"mbid", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, None),
+ (u"name", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u"type", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
+ (u"text", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
]
def __init__(self, indexName, clear, host, database, user, passwd):
Modified: search_index/trunk/mbsearch/serverindex/trackindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/trackindex.py 2008-05-31 04:19:10 UTC (rev 9862)
+++ search_index/trunk/mbsearch/serverindex/trackindex.py 2008-05-31 04:24:30 UTC (rev 9863)
@@ -24,7 +24,7 @@
#
#---------------------------------------------------------------------------
-import PyLucene, re
+import re
from mbsearch import indexcreator
from mbsearch.normalize import normalizeText, normalizeDate, normalizeReleaseType
from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
@@ -53,17 +53,17 @@
'''
FIELDS = [
- (u'arid', PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, None),
- (u'artist', PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, normalizeText),
- (u'reid', PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, None),
- (u'release', PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, normalizeText),
- (u'type', PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeReleaseType),
- (u'tracks', PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeText),
- (u'trid', PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, None),
- (u'track', PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED, normalizeText),
- (u'dur', PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, _normalizeDur),
- (u'qdur', PyLucene.Field.Store.NO, PyLucene.Field.Index.UN_TOKENIZED, _normalizeQdur),
- (u'tnum', PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED, normalizeText),
+ (u'arid', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, None),
+ (u'artist', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u'reid', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, None),
+ (u'release', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u'type', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeReleaseType),
+ (u'tracks', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
+ (u'trid', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, None),
+ (u'track', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u'dur', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, _normalizeDur),
+ (u'qdur', indexcreator.FIELD_STORE_NO, indexcreator.FIELD_TOKENIZE_NO, _normalizeQdur),
+ (u'tnum', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
]
def __init__(self, indexName, clear, host, database, user, passwd):
More information about the MusicBrainz-commits
mailing list