[mb-commits] r9879 - in search_index/trunk/mbsearch: . serverindex
root at musicbrainz.org
root at musicbrainz.org
Wed Jun 25 07:56:34 UTC 2008
Author: robert
Date: 2008-06-25 07:56:34 +0000 (Wed, 25 Jun 2008)
New Revision: 9879
Modified:
search_index/trunk/mbsearch/indexcreator.py
search_index/trunk/mbsearch/normalize.py
search_index/trunk/mbsearch/serverindex/ar_annotationindex.py
search_index/trunk/mbsearch/serverindex/artistindex.py
search_index/trunk/mbsearch/serverindex/la_annotationindex.py
search_index/trunk/mbsearch/serverindex/labelindex.py
search_index/trunk/mbsearch/serverindex/re_annotationindex.py
search_index/trunk/mbsearch/serverindex/releaseindex.py
search_index/trunk/mbsearch/serverindex/tr_annotationindex.py
search_index/trunk/mbsearch/serverindex/trackindex.py
Log:
Lots of tweaking and cleanup. Firmly committed to Xapian now, as I've got things tweaked so they are
far better than Lucene. More bugs remain, but it's time for a check-in.
Modified: search_index/trunk/mbsearch/indexcreator.py
===================================================================
--- search_index/trunk/mbsearch/indexcreator.py 2008-06-23 11:54:54 UTC (rev 9878)
+++ search_index/trunk/mbsearch/indexcreator.py 2008-06-25 07:56:34 UTC (rev 9879)
@@ -27,19 +27,16 @@
import re, time, sys, string
import xapian
import psycopg2
-import normalize
+import re
from unac import unac
+from escape_ideographic import addSpacesToIdeographicStrings
# Number of rows to process for each database 'chunk'
IDS_PER_CHUNK = 2000
STATS_WINDOW_SIZE = 6
-MAX_RAM_SIZE = 32000000
# TODO: Query DB in transactions to make sure final index is consistent
-FIELD_STORE_YES = 1
-FIELD_STORE_NO = 0
-
FIELD_TOKENIZE_YES = 1
FIELD_TOKENIZE_NO = 0
@@ -60,21 +57,14 @@
self.user = user
self.passwd = passwd
- self.normalize = normalize.Normalize()
+ self.dotsRe = re.compile("((?:\w\.){2,})")
+
self.indexer = xapian.TermGenerator()
- #stemmer = xapian.Stem("english")
- #self.indexer.set_stemmer(stemmer)
if create:
self.index = xapian.WritableDatabase(indexName, xapian.DB_CREATE_OR_OVERWRITE)
else:
self.index = xapian.WritableDatabase(indexName, xapian.DB_OPEN)
- def getAnalyzer(self):
- indexer = xapian.TermGenerator()
- stemmer = xapian.Stem("english")
- indexer.set_stemmer(stemmer)
- return indexer
-
def close(self):
'''
Close the index by discarding a ref
@@ -112,13 +102,31 @@
ret = []
for field, value in zip(self.FIELDS, row):
- normalize = field[4]
- if normalize is not None:
- value = normalize(value)
+ normalize = field[4]
+ if normalize is not None:
+ value = normalize(value)
ret.append(value)
return ret
+
+ def removeDots(self, query):
+ '''
+ Remove dots between characters so we can find R.E.M.
+ '''
+ bits = []
+ index = 0
+ for m in self.dotsRe.finditer(query):
+ bits.append(query[index:m.start()])
+ acronym = query[m.start():m.end()]
+ bits.append(acronym.replace(u".", u""))
+ index = m.end()
+
+ if index < len(query):
+ bits.append(query[index:len(query)])
+
+ return u''.join(bits)
+
def run(self, doTest = False):
'''
Execute the query, massage the returned data and pass it to
@@ -145,7 +153,7 @@
numChunks = (maxId / IDS_PER_CHUNK) + 1
if doTest:
- numChunks = min(numChunks, 25)
+ numChunks = min(numChunks, 2)
# Record the start time
t0 = time.time()
@@ -173,7 +181,7 @@
data = [ data ]
isList = False
- name, weight, method = field[:3]
+ name, weight, method, tokenize = field[:4]
for i, text in enumerate(data):
# Ensure that the value is an unicode string
if not text: continue
@@ -190,9 +198,20 @@
#if method == FIELD_INDEX: print text.encode('utf-8')
if method in (FIELD_INDEX, FIELD_INDEX_AND_STORE):
- self.indexer.index_text(unac.unac_string(text.strip()), weight, u"X" + name.upper())
+ if tokenize:
+ text = text.strip().lower()
+ text = unac.unac_string(text)
+ text = self.removeDots(text)
+ text = addSpacesToIdeographicStrings(text)
+ self.indexer.index_text(text, weight, u"X" + name.upper())
+ # TEST HACK
+ if name == u"artist": doc.add_value(0, u"%d" % len(text.split(u" ")))
+ else:
+ doc.add_term(u"X" + name.upper() + text, weight)
+ self.indexer.increase_termpos()
# Add the document to the index
+ #print "data: %s" % (storedata.encode('utf-8', 'replace'))
doc.set_data(storedata)
self.index.add_document(doc)
rowsThisChunk += 1
Modified: search_index/trunk/mbsearch/normalize.py
===================================================================
--- search_index/trunk/mbsearch/normalize.py 2008-06-23 11:54:54 UTC (rev 9878)
+++ search_index/trunk/mbsearch/normalize.py 2008-06-25 07:56:34 UTC (rev 9879)
@@ -32,34 +32,16 @@
TYPES = (u'album', u'single', u'ep', u'compilation', u'soundtrack', u'spokenword',
u'interview', u'audiobook', u'live', u'remix', u'other')
-class Normalize(object):
- '''
- Normalize input according to our own normalization rules. This way we can mix searches with
- stop words and searches without stop words in one index.
- '''
+_dateRe = re.compile('-00')
- def normalize(self, text):
- '''
- return the normalized version of this query. See inline comments for exactly what normalization entails.
- '''
-
- # If not unicode, make it unicode
- if text.__class__.__name__ in ["float", "int"]:
- text = unicode(text)
- elif text.__class__.__name__ == "str":
- text = unicode(text, 'utf-8')
-
- # unaccent the string and return it
- return unac.unac_string(text)
-
def normalizeText(value):
"""Normalize text value from DB."""
if value is None:
- return u''
+ return u''
if isinstance(value, (float, int)):
- return unicode(value)
+ return unicode(value)
if isinstance(value, str):
- value = value.decode('utf-8')
+ value = value.decode('utf-8')
return value
def normalizeReleaseType(value):
@@ -74,18 +56,13 @@
return u''
return u''
-_dateRe = re.compile('-00')
-
def normalizeDate(value):
"""Normalize date value from DB."""
- # FIXME this should return date in format yyyyMMdd, not yyyy-MM-dd (see
- # org.apache.PyLucene.document.DateTools)
if value is None:
return u''
value = value.decode('utf-8')
return _dateRe.sub(u'', value)
-if __name__ == '__main__':
- n = Normalize()
- print n.normalize("The test string %#$")
- print n.stopWordNormalize("In the Kingdom of the Blind the One-Eyed Are Kings")
+def normalizeMBID(value):
+ """Normalize an MBID UUID"""
+ return value.replace(u'-', u'')
Modified: search_index/trunk/mbsearch/serverindex/ar_annotationindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/ar_annotationindex.py 2008-06-23 11:54:54 UTC (rev 9878)
+++ search_index/trunk/mbsearch/serverindex/ar_annotationindex.py 2008-06-25 07:56:34 UTC (rev 9879)
@@ -26,8 +26,7 @@
import re
from mbsearch import indexcreator
-from mbsearch.normalize import normalizeText
-from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
+from mbsearch.normalize import normalizeText, normalizeMBID
class ArtistAnnotationIndex(indexcreator.IndexCreator):
'''
@@ -35,18 +34,15 @@
'''
FIELDS = [
- (u"mbid", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, None),
- (u"name", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
- (u"type", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
- (u"text", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u"mbid", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeMBID),
+ (u"name", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u"type", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
+ (u"text", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
]
def __init__(self, indexName, clear, host, database, user, passwd):
indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
- def getAnalyzer(self):
- return StandardUnaccentAnalyzer()
-
def getRowCountQuery(self):
return "SELECT max(id), count(id) from annotation where type = 1"
Modified: search_index/trunk/mbsearch/serverindex/artistindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/artistindex.py 2008-06-23 11:54:54 UTC (rev 9878)
+++ search_index/trunk/mbsearch/serverindex/artistindex.py 2008-06-25 07:56:34 UTC (rev 9879)
@@ -27,8 +27,7 @@
import re
import psycopg2
from mbsearch import indexcreator
-from mbsearch.normalize import normalizeText, normalizeDate
-from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
+from mbsearch.normalize import normalizeText, normalizeDate, normalizeMBID
TYPES = (u'unknown', u'person', u'group')
@@ -42,24 +41,20 @@
"""This class specifies the details on how to create the artist index."""
FIELDS = [
- (u'arid', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, None),
- (u'artist', 10, indexcreator.FIELD_INDEX, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
- (u'artist', 0, indexcreator.FIELD_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
- (u'sortname', 10, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
- (u'type', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, _normalizeArtistType),
- (u'begin', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeDate),
- (u'end', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeDate),
- (u'comment', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
- (u'alias', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u'arid', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeMBID),
+ (u'artist', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u'sortname', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u'type', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, _normalizeArtistType),
+ (u'begin', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeDate),
+ (u'end', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeDate),
+ (u'comment', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u'alias', 0, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
]
def __init__(self, indexName, clear, host, database, user, passwd):
indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
self.aliasDict = {}
- def getAnalyzer(self):
- return StandardUnaccentAnalyzer()
-
def getRowCountQuery(self):
return "SELECT max(id), count(*) FROM artist"
@@ -80,40 +75,16 @@
conn.close()
- return """SELECT gid, name || ' ' || sortname || ' ',
- name, sortname, type, begindate, enddate, resolution
+ return """SELECT gid, name, sortname, type, begindate, enddate, resolution
FROM artist
WHERE id BETWEEN %d AND %d ORDER BY id""" % (chunkNum * chunkSize, ((chunkNum + 1) * chunkSize) - 1)
-
- def uniquer(self, seq, idfun=None):
- if idfun is None:
- def idfun(x): return x
- seen = {}
- result = []
- for item in seq:
- marker = idfun(item)
- # in old Python versions:
- # if seen.has_key(marker)
- # but in new ones:
- if marker in seen: continue
- seen[marker] = 1
- result.append(item)
- return result
-
def processRow(self, row):
"""Normalize appropriate columns of the row."""
ret = super(ArtistIndex, self).processRow(row)
try:
ret.append(self.aliasDict[row[0]])
- #ret[1] += ' '.join(self.aliasDict[row[0]])
except KeyError:
ret.append([])
- # TODO: Remove no word characters
- #ret[1] = ret[1].replace(u',', u'').lower()
- #words = ret[1].split(' ')
- #words = self.uniquer(words)
- #ret[1] = ' '.join(words)
-
return ret
Modified: search_index/trunk/mbsearch/serverindex/la_annotationindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/la_annotationindex.py 2008-06-23 11:54:54 UTC (rev 9878)
+++ search_index/trunk/mbsearch/serverindex/la_annotationindex.py 2008-06-25 07:56:34 UTC (rev 9879)
@@ -26,8 +26,7 @@
import re
from mbsearch import indexcreator
-from mbsearch.normalize import normalizeText
-from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
+from mbsearch.normalize import normalizeText, normalizeMBID
class LabelAnnotationIndex(indexcreator.IndexCreator):
'''
@@ -35,18 +34,15 @@
'''
FIELDS = [
- (u"mbid", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, None),
- (u"name", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
- (u"type", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
- (u"text", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u"mbid", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeMBID),
+ (u"name", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u"type", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
+ (u"text", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
]
def __init__(self, indexName, clear, host, database, user, passwd):
indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
- def getAnalyzer(self):
- return StandardUnaccentAnalyzer()
-
def getRowCountQuery(self):
return "SELECT max(id), count(id) from annotation where type = 3"
Modified: search_index/trunk/mbsearch/serverindex/labelindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/labelindex.py 2008-06-23 11:54:54 UTC (rev 9878)
+++ search_index/trunk/mbsearch/serverindex/labelindex.py 2008-06-25 07:56:34 UTC (rev 9879)
@@ -27,8 +27,7 @@
import re
import psycopg2
from mbsearch import indexcreator
-from mbsearch.normalize import normalizeText, normalizeDate
-from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
+from mbsearch.normalize import normalizeText, normalizeDate, normalizeMBID
TYPES = (u'unknown', u'distributor', u'holding', u'production', u'orig. prod.',
u'bootleg prod.', u'reissue prod.', u'publisher')
@@ -43,25 +42,22 @@
"""This class specifies the details on how to create the label index."""
FIELDS = [
- (u'laid', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, None),
- (u'label', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
- (u'sortname', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
- (u'type', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, _normalizeLabelType),
- (u'code', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
- (u'country', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
- (u'begin', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeDate),
- (u'end', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeDate),
- (u'comment', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
- (u'alias', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u'laid', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeMBID),
+ (u'label', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u'sortname', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u'type', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, _normalizeLabelType),
+ (u'code', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
+ (u'country', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
+ (u'begin', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeDate),
+ (u'end', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeDate),
+ (u'comment', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u'alias', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
]
def __init__(self, indexName, clear, host, database, user, passwd):
indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
self.aliasDict = {}
- def getAnalyzer(self):
- return StandardUnaccentAnalyzer()
-
def getRowCountQuery(self):
return "SELECT max(id), count(*) FROM label"
@@ -82,7 +78,7 @@
conn.close()
- return """SELECT gid, label.name, sortname, type, labelcode, lower(isocode), begindate, enddate, resolution
+ return """SELECT gid, label.name, sortname, type, labelcode, isocode, begindate, enddate, resolution
FROM label LEFT JOIN country ON label.country = country.id
WHERE label.id BETWEEN %d AND %d""" % (chunkNum * chunkSize, ((chunkNum + 1) * chunkSize) - 1)
Modified: search_index/trunk/mbsearch/serverindex/re_annotationindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/re_annotationindex.py 2008-06-23 11:54:54 UTC (rev 9878)
+++ search_index/trunk/mbsearch/serverindex/re_annotationindex.py 2008-06-25 07:56:34 UTC (rev 9879)
@@ -26,8 +26,7 @@
import re
from mbsearch import indexcreator
-from mbsearch.normalize import normalizeText
-from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
+from mbsearch.normalize import normalizeText, normalizeMBID
class ReleaseAnnotationIndex(indexcreator.IndexCreator):
'''
@@ -35,10 +34,10 @@
'''
FIELDS = [
- (u"mbid", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, None),
- (u"name", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
- (u"type", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
- (u"text", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u"mbid", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeMBID),
+ (u"name", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u"type", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
+ (u"text", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
]
def __init__(self, indexName, clear, host, database, user, passwd):
Modified: search_index/trunk/mbsearch/serverindex/releaseindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/releaseindex.py 2008-06-23 11:54:54 UTC (rev 9878)
+++ search_index/trunk/mbsearch/serverindex/releaseindex.py 2008-06-25 07:56:34 UTC (rev 9879)
@@ -27,8 +27,7 @@
import re
import psycopg2
from mbsearch import indexcreator
-from mbsearch.normalize import normalizeText, normalizeDate, normalizeReleaseType
-from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
+from mbsearch.normalize import normalizeText, normalizeDate, normalizeReleaseType, normalizeMBID
ALBUM_STATUS_FIRST = 100
ALBUM_STATUS_LAST = 102
@@ -136,22 +135,22 @@
'''
FIELDS = [
- (u"arid", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, None),
- (u"artist", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
- (u"reid", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, None),
- (u"release", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
- (u"type", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeReleaseType),
- (u"status", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, _normalizeReleaseStatus),
- (u"tracks", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
- (u"discids", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
- (u"asin", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
- (u"lang", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, _normalizeLanguage),
- (u"script", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, _normalizeScript),
- (u"country", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
- (u"date", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeDate),
- (u"label", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
- (u"catno", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
- (u"barcode", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u"arid", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeMBID),
+ (u"artist", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u"reid", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeMBID),
+ (u"release", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u"type", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeReleaseType),
+ (u"status", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, _normalizeReleaseStatus),
+ (u"tracks", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
+ (u"discids", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
+ (u"asin", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u"lang", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, _normalizeLanguage),
+ (u"script", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, _normalizeScript),
+ (u"country", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u"date", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeDate),
+ (u"label", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u"catno", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
+ (u"barcode", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
]
def __init__(self, indexName, clear, host, database, user, passwd):
@@ -159,9 +158,6 @@
self.parenRegexp = re.compile('\(.*?\)$|\[(.*?)\]$')
self.dateRe = re.compile('-00')
- def getAnalyzer(self):
- return StandardUnaccentAnalyzer()
-
def getRowCountQuery(self):
return "SELECT max(id), count(*) FROM album"
Modified: search_index/trunk/mbsearch/serverindex/tr_annotationindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/tr_annotationindex.py 2008-06-23 11:54:54 UTC (rev 9878)
+++ search_index/trunk/mbsearch/serverindex/tr_annotationindex.py 2008-06-25 07:56:34 UTC (rev 9879)
@@ -26,8 +26,7 @@
import re
from mbsearch import indexcreator
-from mbsearch.normalize import normalizeText
-from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
+from mbsearch.normalize import normalizeText, normalizeMBID
class TrackAnnotationIndex(indexcreator.IndexCreator):
'''
@@ -35,18 +34,15 @@
'''
FIELDS = [
- (u"mbid", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, None),
- (u"name", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
- (u"type", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
- (u"text", indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u"mbid", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeMBID),
+ (u"name", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u"type", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
+ (u"text", 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
]
def __init__(self, indexName, clear, host, database, user, passwd):
indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
- def getAnalyzer(self):
- return StandardUnaccentAnalyzer()
-
def getRowCountQuery(self):
return "SELECT max(id), count(id) from annotation where type = 4"
Modified: search_index/trunk/mbsearch/serverindex/trackindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/trackindex.py 2008-06-23 11:54:54 UTC (rev 9878)
+++ search_index/trunk/mbsearch/serverindex/trackindex.py 2008-06-25 07:56:34 UTC (rev 9879)
@@ -26,8 +26,7 @@
import re
from mbsearch import indexcreator
-from mbsearch.normalize import normalizeText, normalizeDate, normalizeReleaseType
-from mbsearch.serverindex.analyzers.unaccent import StandardUnaccentAnalyzer
+from mbsearch.normalize import normalizeText, normalizeDate, normalizeReleaseType, normalizeMBID
ALBUM_ATTR_FIRST = 1
ALBUM_ATTR_LAST = 11
@@ -53,25 +52,22 @@
'''
FIELDS = [
- (u'arid', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, None),
- (u'artist', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
- (u'reid', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, None),
- (u'release', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
- (u'type', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeReleaseType),
- (u'tracks', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
- (u'trid', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, None),
- (u'track', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
- (u'dur', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, _normalizeDur),
- (u'qdur', indexcreator.FIELD_STORE_NO, indexcreator.FIELD_TOKENIZE_NO, _normalizeQdur),
- (u'tnum', indexcreator.FIELD_STORE_YES, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
+ (u'arid', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeMBID),
+ (u'artist', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u'reid', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeMBID),
+ (u'release', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_YES, normalizeText),
+ (u'type', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeReleaseType),
+ (u'tracks', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
+ (u'trid', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeMBID),
+ (u'track', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
+ (u'dur', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, _normalizeDur),
+ (u'qdur', 1, indexcreator.FIELD_INDEX, indexcreator.FIELD_TOKENIZE_NO, _normalizeQdur),
+ (u'tnum', 1, indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NO, normalizeText),
]
def __init__(self, indexName, clear, host, database, user, passwd):
indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
- def getAnalyzer(self):
- return StandardUnaccentAnalyzer()
-
def getRowCountQuery(self):
return "SELECT max(id), count(*) FROM track"
More information about the MusicBrainz-commits
mailing list