[mb-commits] r9893 - in search_index/trunk: . mbsearch mbsearch/serverindex tools
root at musicbrainz.org
root at musicbrainz.org
Sun Jun 29 07:39:56 UTC 2008
Author: robert
Date: 2008-06-29 07:39:56 +0000 (Sun, 29 Jun 2008)
New Revision: 9893
Modified:
search_index/trunk/builder.py
search_index/trunk/mbsearch/indexcreator.py
search_index/trunk/mbsearch/serverindex/ar_annotationindex.py
search_index/trunk/mbsearch/serverindex/artistindex.py
search_index/trunk/mbsearch/serverindex/la_annotationindex.py
search_index/trunk/mbsearch/serverindex/labelindex.py
search_index/trunk/mbsearch/serverindex/re_annotationindex.py
search_index/trunk/mbsearch/serverindex/releaseindex.py
search_index/trunk/mbsearch/serverindex/tr_annotationindex.py
search_index/trunk/mbsearch/serverindex/trackindex.py
search_index/trunk/tools/tinysearch.py
Log:
Added the tinysearch for xapian.
Sped up the track index build by building multiple smaller indexes and gluing them together at the end. The build now takes less than 3 hours.
Improved searching by fixing probabilistic weights.
qdur searches work for the track index.
Modified: search_index/trunk/builder.py
===================================================================
--- search_index/trunk/builder.py 2008-06-28 09:19:52 UTC (rev 9892)
+++ search_index/trunk/builder.py 2008-06-29 07:39:56 UTC (rev 9893)
@@ -36,6 +36,10 @@
print " -p <db passwd> -- The password for the user. (default: -blank-)"
print " -t -- Text the index builder by creating small text indexes."
print " -b [artnl] -- Which indexes to build (Artist, Release, Track, aNnotation, Labels)"
+ print
+ print "NOTE: Set the env var XAPIAN_FLUSH_THRESHOLD to some large value that consumes a"
+ print "reasonable amount of RAM on the machine on which you're building the indexes."
+ print " e.g.: XAPIAN_FLUSH_THRESHOLD=\"1000000\""
sys.exit(-1)
# Default config values
@@ -73,24 +77,33 @@
index = artistindex.ArtistIndex("data/artist_index", True, host, database, user, passwd)
ok = index.run(test)
if ok:
- index.close()
- print "done\n"
+ if not index.close():
+ ok = 0
+ print "Index compacting failed. Stop."
+ else:
+ print "done\n"
if ok and (not build or build.find('r') >= 0):
print "Creating release index:"
index = releaseindex.ReleaseIndex("data/release_index", True, host, database, user, passwd)
ok = index.run(test)
if ok:
- index.close()
- print "done\n"
+ if not index.close():
+ ok = 0
+ print "Index compacting failed. Stop."
+ else:
+ print "done\n"
if ok and (not build or build.find('t') >= 0):
print "Creating track index:"
- index = trackindex.TrackIndex("data/track_index", True, host, database, user, passwd)
+ index = trackindex.TrackIndex("data/track_index", True, host, database, user, passwd, True)
ok = index.run(test)
if ok:
- index.close()
- print "done\n"
+ if not index.close():
+ ok = 0
+ print "Index compacting failed. Stop."
+ else:
+ print "done\n"
if ok and (not build or build.find('n') >= 0):
print "Creating artist annotation index:"
@@ -122,5 +135,8 @@
index = labelindex.LabelIndex("data/label_index", True, host, database, user, passwd)
ok = index.run(test)
if ok:
- index.close()
- print "done\n"
+ if not index.close():
+ ok = 0
+ print "Index compacting failed. Stop."
+ else:
+ print "done\n"
Modified: search_index/trunk/mbsearch/indexcreator.py
===================================================================
--- search_index/trunk/mbsearch/indexcreator.py 2008-06-28 09:19:52 UTC (rev 9892)
+++ search_index/trunk/mbsearch/indexcreator.py 2008-06-29 07:39:56 UTC (rev 9893)
@@ -24,15 +24,20 @@
#
#---------------------------------------------------------------------------
-import re, time, sys, string
+import re, time, sys, string, os
import xapian
import psycopg2
import re
from unac import unac
from escape_ideographic import addSpacesToIdeographicStrings
+from random import random
# Number of rows to process for each database 'chunk'
-IDS_PER_CHUNK = 2000
+IDS_PER_CHUNK = 1000
+
+# The number of slices to create that will later be joined into a total index
+SLICES_PER_INDEX = 20
+
STATS_WINDOW_SIZE = 6
MAX_FIELD_LEN = 65536
@@ -60,16 +65,23 @@
MusicBrainz data.
'''
- def __init__(self, indexName, create, host, database, user, passwd):
+ def __init__(self, indexName, create, host, database, user, passwd, useSlices = False):
self.create = create
self.host = host
self.database = database
self.user = user
self.passwd = passwd
+ self.indexName = indexName
+ self.useSlices = useSlices
self.dotsRe = re.compile("((?:\w\.){2,})", re.UNICODE)
self.indexer = xapian.TermGenerator()
+
+ if useSlices:
+ self.curSlice = 0
+ indexName = "%s.%d" % (indexName, self.curSlice)
+
if create:
self.index = xapian.WritableDatabase(indexName, xapian.DB_CREATE_OR_OVERWRITE)
else:
@@ -79,8 +91,38 @@
'''
Close the index by discarding a ref
'''
+
del self.index
+ # If we've created many slices, merge them together into one final database
+ if self.useSlices:
+ cmd = "xapian-compact --multipass "
+ for i in xrange(self.curSlice + 1):
+ cmd += "%s.%d " % (self.indexName, i)
+ cmd += self.indexName
+
+ print "Compacting the database..."
+ if os.system(cmd):
+ return False
+ else:
+ for i in xrange(self.curSlice + 1):
+ os.system("rm -rf %s.%d" % (self.indexName, i))
+ return True
+
+ def switchToNextSlice(self):
+
+ if not self.useSlices: return
+
+ del self.index
+ self.curSlice += 1
+ indexName = "%s.%d" % (self.indexName, self.curSlice)
+
+ print "Create database Slice %d" % self.curSlice
+ if self.create:
+ self.index = xapian.WritableDatabase(indexName, xapian.DB_CREATE_OR_OVERWRITE)
+ else:
+ self.index = xapian.WritableDatabase(indexName, xapian.DB_OPEN)
+
def getRowCountQuery(self):
"""Return the SQL query that determines how many rows need
to be processed.
@@ -137,6 +179,11 @@
return u''.join(bits)
+ def calculateWeight(self, text):
+ maxLen = 100
+ l = min(maxLen, len(text))
+ return maxLen - l
+
def run(self, doTest = False):
'''
Execute the query, massage the returned data and pass it to
@@ -165,9 +212,12 @@
if doTest:
numChunks = min(numChunks, 50)
+ rowsPerSlice = totalRows / SLICES_PER_INDEX
+
# Record the start time
t0 = time.time()
rowsIndexed = 0
+ rowsThisSlice = 0
# Now get the data, one chunk at a time
for i in xrange(numChunks):
@@ -183,6 +233,7 @@
doc = xapian.Document()
self.indexer.set_document(doc)
storedata = u""
+ primaryWeight = 0
for field, data in zip(self.FIELDS, row):
isList = True
@@ -192,6 +243,7 @@
isList = False
name, method, tokenize = field[:3]
+ weight = 1
for i, text in enumerate(data):
# Ensure that the value is an unicode string
if not text: continue
@@ -201,6 +253,9 @@
if text:
#print "%s:%s" % (name.encode('utf-8', 'replace'), text.encode('utf-8', 'replace'))
if method in (FIELD_STORE, FIELD_INDEX_AND_STORE):
+ if tokenize == FIELD_TOKENIZE_PRIMARY:
+ primaryWeight = self.calculateWeight(text)
+ #print "w: %s -> %d" % (text.encode('utf-8', 'replace'), primaryWeight)
if isList:
storedata += u"%s%d=%s\n" % (name, i, text.replace(u"\n", u"\\n").strip())
else:
@@ -208,27 +263,30 @@
if method in (FIELD_INDEX, FIELD_INDEX_AND_STORE):
if tokenize == FIELD_TOKENIZE_NORMAL or tokenize == FIELD_TOKENIZE_PRIMARY:
+ if primaryWeight:
+ weight = primaryWeight
+ primaryWeight = 0
+ doc.add_value(0, u"%d" % weight)
+ else:
+ weight = self.calculateWeight(text)
text = text.strip().lower()
text = unac.unac_string(text)
text = self.removeDots(text)
text = addSpacesToIdeographicStrings(text)
- self.indexer.index_text(text, 0, u"X" + name.upper())
- #print "ndx: %s:'%s'" % (name.encode('utf-8', 'replace'), text.encode('utf-8', 'replace'))
- if tokenize == FIELD_TOKENIZE_PRIMARY:
- doc.add_value(0, u"%d" % (MAX_FIELD_LEN - len(text)))
+ self.indexer.index_text(text, weight, u"X" + name.upper())
+ #print "ndx: %s:'%s' %d" % (name.encode('utf-8', 'replace'), text.encode('utf-8', 'replace'), weight)
elif tokenize == FIELD_TOKENIZE_NONE:
- #print "add: '%s'" % text.encode('utf-8', 'replace')
- doc.add_term(u"X" + name.upper() + text, 0)
+ #print "add: %s:'%s'" % (name.encode('utf-8', 'replace'), text.encode('utf-8', 'replace'))
+ doc.add_term(u"X" + name.upper() + text, 1)
elif tokenize == FIELD_TOKENIZE_MBID:
text = text.strip().lower()
text = text.replace(u'-', u'')
- #print "gid: '%s'" % text.encode('utf-8', 'replace')
- self.indexer.index_text(text, 0, u"X" + name.upper())
+ self.indexer.index_text(text, 1, u"X" + name.upper())
- self.indexer.increase_termpos()
+ #self.indexer.increase_termpos()
elif tokenize == FIELD_TOKENIZE_PRIMARY:
- #print "val: '%s'" % len(text.split(u" "))
- doc.add_value(0, u"%d" % (MAX_FIELD_LEN - len(text)))
+ weight = self.calculateWeight(text)
+ doc.add_value(0, u"%d" % weight)
# Add the document to the index
@@ -239,6 +297,7 @@
t2 = time.time()
rowsIndexed += rowsThisChunk
+ rowsThisSlice += rowsThisChunk
# Add the current number of seconds per row into a sliding window queue
if rowsThisChunk > 0: stats.append((t2 - t1) / rowsThisChunk)
@@ -262,5 +321,9 @@
print
sys.stdout.flush()
+ if rowsThisSlice > rowsPerSlice:
+ rowsThisSlice = 0
+ self.switchToNextSlice()
+
conn.close()
return True
Modified: search_index/trunk/mbsearch/serverindex/ar_annotationindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/ar_annotationindex.py 2008-06-28 09:19:52 UTC (rev 9892)
+++ search_index/trunk/mbsearch/serverindex/ar_annotationindex.py 2008-06-29 07:39:56 UTC (rev 9893)
@@ -40,8 +40,8 @@
(u"text", indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NORMAL, normalizeText),
]
- def __init__(self, indexName, clear, host, database, user, passwd):
- indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
+ def __init__(self, indexName, clear, host, database, user, passwd, useSlices = False):
+ indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd, useSlices)
def getRowCountQuery(self):
return "SELECT max(id), count(id) from annotation where type = 1"
Modified: search_index/trunk/mbsearch/serverindex/artistindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/artistindex.py 2008-06-28 09:19:52 UTC (rev 9892)
+++ search_index/trunk/mbsearch/serverindex/artistindex.py 2008-06-29 07:39:56 UTC (rev 9893)
@@ -41,20 +41,19 @@
"""This class specifies the details on how to create the artist index."""
FIELDS = [
- (u'arid', indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_MBID, None),
- (u'artist', indexcreator.FIELD_INDEX, indexcreator.FIELD_TOKENIZE_NORMAL, normalizeText),
- (u'artist', indexcreator.FIELD_STORE, indexcreator.FIELD_TOKENIZE_PRIMARY, normalizeText),
-# (u'artist', indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_PRIMARY, normalizeText),
- (u'sortname', indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NORMAL, normalizeText),
- (u'type', indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NONE, _normalizeArtistType),
- (u'begin', indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NONE, normalizeDate),
- (u'end', indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NONE, normalizeDate),
- (u'comment', indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NORMAL, normalizeText),
- (u'alias', indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NORMAL, normalizeText),
+ (u'arid', indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_MBID, None),
+ (u'artist', indexcreator.FIELD_STORE, indexcreator.FIELD_TOKENIZE_PRIMARY, normalizeText),
+ (u'artist', indexcreator.FIELD_INDEX, indexcreator.FIELD_TOKENIZE_NORMAL, normalizeText),
+ (u'sortname', indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NORMAL, normalizeText),
+ (u'type', indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NONE, _normalizeArtistType),
+ (u'begin', indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NONE, normalizeDate),
+ (u'end', indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NONE, normalizeDate),
+ (u'comment', indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NORMAL, normalizeText),
+ (u'alias', indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NORMAL, normalizeText),
]
- def __init__(self, indexName, clear, host, database, user, passwd):
- indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
+ def __init__(self, indexName, clear, host, database, user, passwd, useSlices = False):
+ indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd, useSlices)
self.aliasDict = {}
def getRowCountQuery(self):
@@ -89,7 +88,7 @@
conn.close()
- return """SELECT gid, name || ' ' || sortname || ' ', name,
+ return """SELECT gid, name, name || ' ' || sortname || ' ',
sortname, type, begindate, enddate, resolution
FROM artist
WHERE id BETWEEN %d AND %d ORDER BY id""" % (chunkNum * chunkSize, ((chunkNum + 1) * chunkSize) - 1)
@@ -99,13 +98,13 @@
ret = super(ArtistIndex, self).processRow(row)
try:
ret.append(self.aliasDict[row[0]])
- ret[1] += ' ' + ' '.join(self.aliasDict[row[0]])
+ ret[2] += ' ' + ' '.join(self.aliasDict[row[0]])
except KeyError:
ret.append([])
- ret[1] = ret[1].replace(u',', u'').lower()
- words = ret[1].split(' ')
+ ret[2] = ret[2].replace(u',', u'').lower()
+ words = ret[2].split(' ')
words = self.uniquer(words)
- ret[1] = ' '.join(words)
+ ret[2] = ' '.join(words)
return ret
Modified: search_index/trunk/mbsearch/serverindex/la_annotationindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/la_annotationindex.py 2008-06-28 09:19:52 UTC (rev 9892)
+++ search_index/trunk/mbsearch/serverindex/la_annotationindex.py 2008-06-29 07:39:56 UTC (rev 9893)
@@ -40,8 +40,8 @@
(u"text", indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NORMAL, normalizeText),
]
- def __init__(self, indexName, clear, host, database, user, passwd):
- indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
+ def __init__(self, indexName, clear, host, database, user, passwd, useSlices = False):
+ indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd, useSlices)
def getRowCountQuery(self):
return "SELECT max(id), count(id) from annotation where type = 3"
Modified: search_index/trunk/mbsearch/serverindex/labelindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/labelindex.py 2008-06-28 09:19:52 UTC (rev 9892)
+++ search_index/trunk/mbsearch/serverindex/labelindex.py 2008-06-29 07:39:56 UTC (rev 9893)
@@ -55,8 +55,8 @@
(u'alias', indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NORMAL, normalizeText),
]
- def __init__(self, indexName, clear, host, database, user, passwd):
- indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
+ def __init__(self, indexName, clear, host, database, user, passwd, useSlices = False):
+ indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd, useSlices)
self.aliasDict = {}
def getRowCountQuery(self):
Modified: search_index/trunk/mbsearch/serverindex/re_annotationindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/re_annotationindex.py 2008-06-28 09:19:52 UTC (rev 9892)
+++ search_index/trunk/mbsearch/serverindex/re_annotationindex.py 2008-06-29 07:39:56 UTC (rev 9893)
@@ -40,8 +40,8 @@
(u"text", indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NORMAL, normalizeText),
]
- def __init__(self, indexName, clear, host, database, user, passwd):
- indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
+ def __init__(self, indexName, clear, host, database, user, passwd, useSlices = False):
+ indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd, useSlices)
def getRowCountQuery(self):
return "SELECT max(id), count(id) from annotation where type = 2"
Modified: search_index/trunk/mbsearch/serverindex/releaseindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/releaseindex.py 2008-06-28 09:19:52 UTC (rev 9892)
+++ search_index/trunk/mbsearch/serverindex/releaseindex.py 2008-06-29 07:39:56 UTC (rev 9893)
@@ -153,8 +153,8 @@
(u"barcode", indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NONE, normalizeText),
]
- def __init__(self, indexName, clear, host, database, user, passwd):
- indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
+ def __init__(self, indexName, clear, host, database, user, passwd, useSlices = False):
+ indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd, useSlices)
self.parenRegexp = re.compile('\(.*?\)$|\[(.*?)\]$')
self.dateRe = re.compile('-00')
Modified: search_index/trunk/mbsearch/serverindex/tr_annotationindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/tr_annotationindex.py 2008-06-28 09:19:52 UTC (rev 9892)
+++ search_index/trunk/mbsearch/serverindex/tr_annotationindex.py 2008-06-29 07:39:56 UTC (rev 9893)
@@ -40,8 +40,8 @@
(u"text", indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NORMAL, normalizeText),
]
- def __init__(self, indexName, clear, host, database, user, passwd):
- indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
+ def __init__(self, indexName, clear, host, database, user, passwd, useSlices = False):
+ indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd, useSlices)
def getRowCountQuery(self):
return "SELECT max(id), count(id) from annotation where type = 4"
Modified: search_index/trunk/mbsearch/serverindex/trackindex.py
===================================================================
--- search_index/trunk/mbsearch/serverindex/trackindex.py 2008-06-28 09:19:52 UTC (rev 9892)
+++ search_index/trunk/mbsearch/serverindex/trackindex.py 2008-06-29 07:39:56 UTC (rev 9893)
@@ -34,17 +34,11 @@
TI_DURATION_QUANT = 2000
-def _normalizeDur(value):
- if value is not None and value > 0:
- return unicode(value)
- return u''
-
def _normalizeQdur(value):
if value is not None and value > 0:
return unicode(value / TI_DURATION_QUANT)
return u''
-
class TrackIndex(indexcreator.IndexCreator):
'''
This class specifies the details on how to create the track index. For more details on how
@@ -60,13 +54,13 @@
(u'tracks', indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NONE, normalizeText),
(u'trid', indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_MBID, None),
(u'track', indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_PRIMARY,normalizeText),
- (u'dur', indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NONE, _normalizeDur),
- (u'qdur', indexcreator.FIELD_INDEX, indexcreator.FIELD_TOKENIZE_NONE, _normalizeQdur),
+ (u'dur', indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NONE, normalizeText),
+ (u'qdur', indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NORMAL, _normalizeQdur),
(u'tnum', indexcreator.FIELD_INDEX_AND_STORE, indexcreator.FIELD_TOKENIZE_NONE, normalizeText),
]
- def __init__(self, indexName, clear, host, database, user, passwd):
- indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd)
+ def __init__(self, indexName, clear, host, database, user, passwd, useSlices = False):
+ indexcreator.IndexCreator.__init__(self, indexName, clear, host, database, user, passwd, useSlices)
def getRowCountQuery(self):
return "SELECT max(id), count(*) FROM track"
Modified: search_index/trunk/tools/tinysearch.py
===================================================================
--- search_index/trunk/tools/tinysearch.py 2008-06-28 09:19:52 UTC (rev 9892)
+++ search_index/trunk/tools/tinysearch.py 2008-06-29 07:39:56 UTC (rev 9893)
@@ -1,81 +1,85 @@
#!/usr/bin/env python
-#---------------------------------------------------------------------------
#
-# Pimp My Tunes -- The MusicBrainz command line tagger.
-# Let a gazllion tunes be tagged!
-#
-# Copyright (C) Robert Kaye 2005
-#
-# This file is part of pimpmytunes.
+# Simple command-line search script.
#
-# pimpmytunes is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
+# Copyright (C) 2003 James Aylett
+# Copyright (C) 2004,2007 Olly Betts
#
-# pimpmytunes is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of the
+# License, or (at your option) any later version.
#
-# You should have received a copy of the GNU General Public License
-# along with pimpmytunes; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
#
-#---------------------------------------------------------------------------
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+# USA
-import sys, os, getopt, re
-import PyLucene
-import readline
+import sys
+import xapian
from unac import unac
-def usage():
- print "%s: <index dir> <field> <query>" % sys.argv[0]
- sys.exit(-1)
+# We require at least two command line arguments.
+if len(sys.argv) < 4:
+ print >> sys.stderr, "Usage: %s PATH_TO_DATABASE FIELD QUERY" % sys.argv[0]
+ sys.exit(1)
-class Search(object):
+try:
+ # Open the database for searching.
+ database = xapian.Database(sys.argv[1])
- def __init__(self, indexName):
- self.analyzer = PyLucene.StandardAnalyzer()
- try:
- self.index = PyLucene.IndexSearcher(PyLucene.FSDirectory.getDirectory(indexName, False))
- except ValueError:
- raise NoSuchIndexError
+ # Start an enquire session.
+ enquire = xapian.Enquire(database)
- def close(self):
- self.index.close()
+ # Combine the rest of the command line arguments with spaces between
+ # them, so that simple queries don't have to be quoted at the shell
+ # level.
+ query_string = sys.argv[3]
+ for arg in sys.argv[4:]:
+ query_string += ' '
+ query_string += arg
- def match(self, query, field, maxHits):
- parser = PyLucene.QueryParser(field, self.analyzer)
- parsedQuery = parser.parse(query)
- hits = list(self.index.search(parsedQuery))
- if not hits:
- print "No hits"
- return
+ # Parse the query string to produce a Xapian::Query object.
+ qp = xapian.QueryParser()
+ #stemmer = xapian.Stem("english")
+ #qp.set_stemmer(stemmer)
+ qp.set_database(database)
+ qp.set_stemming_strategy(xapian.QueryParser.STEM_NONE)
+ qp.set_default_op(xapian.Query.OP_AND)
+ qp.add_prefix("artist", "XARTIST")
+ qp.add_prefix("begin", "XBEGIN")
+ qp.add_prefix("end", "XEND")
+ qp.add_prefix("type", "XTYPE")
+ qp.add_prefix("arid", "XARID")
+ qp.add_prefix("release", "XRELEASE")
+ qp.add_prefix("track", "XTRACK")
+ qp.add_prefix("dur", "XDUR")
+ qp.add_prefix("qdur", "XQDUR")
- for hit in hits[:maxHits]:
- doc = hit.getDocument()
- for field in doc.getFields():
- print "%8s: " % field.name().encode('utf-8'),
- for val in doc.getValues(field.name()):
- print "%s " % (val.encode('utf-8')),
- print
- print
+ query_string = unac.unac_string(unicode(query_string, 'utf-8'))
+ print "query: '%s'" % query_string
+ query = qp.parse_query(query_string, xapian.QueryParser.FLAG_PHRASE|
+ xapian.QueryParser.FLAG_BOOLEAN|
+ xapian.QueryParser.FLAG_LOVEHATE,
+ "X" + sys.argv[2].upper())
+ print "Parsed query is: %s" % query.get_description()
-# Parse the command line args
-opts = None
-args = None
-indexDir = "."
+ # Find the top 10 results for the query.
+ enquire.set_query(query)
+ matches = enquire.get_mset(0, 10)
-try:
- opts, args = getopt.getopt(sys.argv[1:], "hi:")
-except:
- usage()
+ # Display the results.
+ print "%i results found." % matches.get_matches_estimated()
+ print "Results 1-%i:" % matches.size()
-for key, value in opts:
- if key == "-h": usage()
+ for m in matches:
+ print "%i: %i%% docid=%i [%s]" % (m.rank + 1, m.percent, m.docid, m.document.get_data())
-if len(args) < 3: usage()
-
-s = Search(args[0])
-s.match(args[2], args[1], 10)
+except Exception, e:
+ print >> sys.stderr, "Exception: %s" % str(e)
+ sys.exit(1)
More information about the MusicBrainz-commits
mailing list