[mb-commits] r9880 - search_server/trunk/lib
root at musicbrainz.org
root at musicbrainz.org
Wed Jun 25 09:49:09 UTC 2008
Author: robert
Date: 2008-06-25 09:49:09 +0000 (Wed, 25 Jun 2008)
New Revision: 9880
Modified:
search_server/trunk/lib/annotationsearch.py
search_server/trunk/lib/search.py
Log:
Fix annotation searches. Fix mbid searches. Remove lucene style term boosting syntax from queries.
Change R.E.M. -> REM. Lowercase all terms, except capped AND, OR, NOT, XOR. Searching is working
pretty damned well now!
Modified: search_server/trunk/lib/annotationsearch.py
===================================================================
--- search_server/trunk/lib/annotationsearch.py 2008-06-25 07:56:34 UTC (rev 9879)
+++ search_server/trunk/lib/annotationsearch.py 2008-06-25 09:49:09 UTC (rev 9880)
@@ -45,17 +45,19 @@
out += u'<tr class="searchresultsheader"><td>Score</td><td>Type</td><td>Name</td><td>Annotation</td>'
out += u'</tr>'
- for doc in hits:
- type = doc.get('type')
- text = doc.get('text')
- mbid = doc.get('mbid')
- name = doc.get('name')
+ for i, doc in enumerate(hits):
+ type = doc.get('type') or u''
+ text = doc.get('text') or u''
+ mbid = doc.get('mbid') or u''
+ name = doc.get('name') or u''
out += u'<tr class="searchresults%s">' % search.oddeven[i % 2]
out += u"<td>%d</td>" % doc['_score']
out += u"<td>%s</td>" % self.escape(type)
out += u"<td><a href=\"/%s/%s.html\">%s</a></td>" % (self.escape(type),
self.escape(mbid), self.escape(name))
+
+ text = text.replace(u"\\n", u"\n");
out += u"<td>%%WIKIBEGIN%%%s%%WIKIEND%%</td>" % (self.escape(text))
out += u"</tr>"
out += u"</table></div>"
Modified: search_server/trunk/lib/search.py
===================================================================
--- search_server/trunk/lib/search.py 2008-06-25 07:56:34 UTC (rev 9879)
+++ search_server/trunk/lib/search.py 2008-06-25 09:49:09 UTC (rev 9880)
@@ -26,7 +26,9 @@
import sys, os
import xapian
from unac import unac
+from escape_ideographic import addSpacesToIdeographicStrings
import time
+import re
# TODO:
# auto index update
@@ -67,6 +69,10 @@
self.rel = 0
self.offset = 0
+ self.escapeTermBoost = re.compile(u"\^[0-9]*\.?[0-9]+")
+ self.dotsRe = re.compile("((?:\w\.){2,})")
+ self.uuidRe = re.compile("[a-z0-9]{8}[:-][a-z0-9]{4}[:-][a-z0-9]{4}[:-][a-z0-9]{4}[:-][a-z0-9]{12}")
+
self.defaultField = u''
try:
sys.stderr.write(indexName + "\n")
@@ -75,7 +81,11 @@
text = str(msg)
raise NoSuchIndexError(text)
+ self.weight = xapian.BM25Weight(0, 0, 1, .1, .5);
self.en = enquire = xapian.Enquire(self.index)
+ self.en.set_weighting_scheme(self.weight)
+ self.en.set_sort_by_relevance_then_value(0, True)
+
self.qp = xapian.QueryParser()
self.qp.set_database(self.index)
self.qp.set_stemming_strategy(xapian.QueryParser.STEM_NONE)
@@ -105,12 +115,6 @@
'''
return text.replace(u'&', u'&amp;').replace(u'<', u'&lt;').replace(u'>', u'&gt;')
- def unEscapeUUID(self, uuid):
- '''
- Convert a dashless UUID to a standard UUID with dashes.
- '''
- return "%s-%s-%s-%s-%s" % (uuid[0:8], uuid[8:12], uuid[12:16], uuid[16:20], uuid[20:32])
-
def setDuration(self, dur):
'''
If a duration is set, the search can color code track lengths
@@ -164,6 +168,19 @@
out += u'height="13" width="28" align="middle" border="0">'
return out
+ def lowercaseQuery(self, query):
+ '''
+ Change the query to all lower case since Xapian seems to have some case sensitivity issues.
+ But, don't change everything to lowercase -- AND, OR, NOT, XOR in uppercase must be preserved.
+ '''
+
+ out = []
+ for word in query.split(u' '):
+ if word not in (u'AND', u'OR', u'NOT', u'XOR'): word = word.lower()
+ out.append(word)
+
+ return u' '.join(out)
+
def mangleQuery(self, query):
'''
For backwards compatibility, filter a query before passing it to lucene. Returns
@@ -171,6 +188,45 @@
'''
return query
+ def removeTermBoosting(self, query):
+ '''
+ Removes term boosting from old queries since Xapain doesn't support them
+ '''
+ return self.escapeTermBoost.sub("", query)
+
+ def removeDots(self, query):
+ '''
+ Remove dots between characters so we can find R.E.M.
+ '''
+
+ bits = []
+ index = 0
+ for m in self.dotsRe.finditer(query):
+ bits.append(query[index:m.start()])
+ acronym = query[m.start():m.end()]
+ bits.append(acronym.replace(u".", u""))
+ index = m.end()
+
+ if index < len(query):
+ bits.append(query[index:len(query)])
+
+ return u''.join(bits)
+
+ def removeDashesFromMBIDs(self, query):
+
+ bits = []
+ index = 0
+ for m in self.uuidRe.finditer(query):
+ bits.append(query[index:m.start()])
+ acronym = query[m.start():m.end()]
+ bits.append(acronym.replace(u"-", u""))
+ index = m.end()
+
+ if index < len(query):
+ bits.append(query[index:len(query)])
+
+ return u''.join(bits)
+
def queryIndex(self, query, offset, maxHits):
'''
Carry out a search, and return the hits
@@ -178,14 +234,23 @@
if not query: raise QueryError(u"No query was sent")
-
try:
- query = unac.unac_string(self.mangleQuery(unicode(query, 'utf-8')))
+ query = unicode(query, 'utf-8')
+ self.f = open("/tmp/log", "a")
+ print >>self.f, "query: '%s'" % query.encode('utf-8', 'replace')
+ self.f.close()
+ query = self.lowercaseQuery(query)
+ query = self.mangleQuery(query)
+ query = self.removeTermBoosting(query)
+ query = self.removeDots(query)
+ query = self.removeDashesFromMBIDs(query)
+ query = unac.unac_string(query)
+ query = addSpacesToIdeographicStrings(query)
except UnicodeDecodeError:
raise QueryError(u"Unicode decode problem: Invalid utf-8 characters passed to search query.")
try:
- parsedQuery = self.qp.parse_query(unac.unac_string(query),
+ parsedQuery = self.qp.parse_query(query,
xapian.QueryParser.FLAG_PHRASE |
xapian.QueryParser.FLAG_BOOLEAN |
xapian.QueryParser.FLAG_LOVEHATE,
@@ -216,7 +281,7 @@
dataDict['_score'] = match.percent
hits.append(dataDict)
- return hits
+ return (hits, matches.get_matches_estimated())
def log_error(self, msg):
log = open("/tmp/slow_queries.txt", "a")
@@ -227,7 +292,7 @@
def search(self, query, maxHits, offset, type='xml'):
if maxHits < 1: maxHits = MAX_HITS
- hits = self.queryIndex(query, offset, maxHits);
+ (hits, total) = self.queryIndex(query, offset, maxHits);
redirect = ""
if len(hits) == 1:
doc = hits[0]
@@ -237,7 +302,7 @@
if type == 'html':
# This comment will be used by the MB server to determine the number of hits returned
- stats = u"<!--\nhits=%d\noffset=%d\n" % (len(hits), offset)
+ stats = u"<!--\nhits=%d\noffset=%d\n" % (total, offset)
if redirect: stats += u"redirect=%s\n" % redirect
stats += u"-->"
return stats.encode('utf-8') + self.asHTML(hits, maxHits, offset).encode('utf-8')
More information about the MusicBrainz-commits
mailing list