[mb-commits] r9880 - search_server/trunk/lib

root at musicbrainz.org root at musicbrainz.org
Wed Jun 25 09:49:09 UTC 2008


Author: robert
Date: 2008-06-25 09:49:09 +0000 (Wed, 25 Jun 2008)
New Revision: 9880

Modified:
   search_server/trunk/lib/annotationsearch.py
   search_server/trunk/lib/search.py
Log:
Fix annotation searches. Fix mbid searches. Remove lucene style term boosting syntax from queries.
Change R.E.M. -> REM. Lowercase all terms, except capped AND, OR, NOT, XOR. Searching is working
pretty damned well now!


Modified: search_server/trunk/lib/annotationsearch.py
===================================================================
--- search_server/trunk/lib/annotationsearch.py	2008-06-25 07:56:34 UTC (rev 9879)
+++ search_server/trunk/lib/annotationsearch.py	2008-06-25 09:49:09 UTC (rev 9880)
@@ -45,17 +45,19 @@
        out += u'<tr class="searchresultsheader"><td>Score</td><td>Type</td><td>Name</td><td>Annotation</td>'
        out += u'</tr>'
        
-       for doc in hits:
-           type = doc.get('type')
-           text = doc.get('text')
-           mbid = doc.get('mbid')
-           name = doc.get('name')
+       for i, doc in enumerate(hits):
+           type = doc.get('type') or u''
+           text = doc.get('text') or u''
+           mbid = doc.get('mbid') or u''
+           name = doc.get('name') or u''
 
            out += u'<tr class="searchresults%s">' % search.oddeven[i % 2]
            out += u"<td>%d</td>" % doc['_score']
            out += u"<td>%s</td>" % self.escape(type)
            out += u"<td><a href=\"/%s/%s.html\">%s</a></td>" % (self.escape(type), 
                                                                self.escape(mbid), self.escape(name))
+
+           text = text.replace(u"\\n", u"\n");
            out += u"<td>%%WIKIBEGIN%%%s%%WIKIEND%%</td>" % (self.escape(text)) 
            out += u"</tr>"
        out += u"</table></div>"

Modified: search_server/trunk/lib/search.py
===================================================================
--- search_server/trunk/lib/search.py	2008-06-25 07:56:34 UTC (rev 9879)
+++ search_server/trunk/lib/search.py	2008-06-25 09:49:09 UTC (rev 9880)
@@ -26,7 +26,9 @@
 import sys, os
 import xapian
 from unac import unac
+from escape_ideographic import addSpacesToIdeographicStrings
 import time
+import re
 
 # TODO: 
 # auto index update
@@ -67,6 +69,10 @@
         self.rel = 0
         self.offset = 0
 
+	self.escapeTermBoost = re.compile(u"\^[0-9]*\.?[0-9]+")
+	self.dotsRe = re.compile("((?:\w\.){2,})")
+	self.uuidRe = re.compile("[a-z0-9]{8}[:-][a-z0-9]{4}[:-][a-z0-9]{4}[:-][a-z0-9]{4}[:-][a-z0-9]{12}")
+
         self.defaultField = u''
         try:
             sys.stderr.write(indexName + "\n")
@@ -75,7 +81,11 @@
 	    text = str(msg)
             raise NoSuchIndexError(text)
 
+        self.weight = xapian.BM25Weight(0, 0, 1, .1, .5);
         self.en = enquire = xapian.Enquire(self.index)
+        self.en.set_weighting_scheme(self.weight)
+        self.en.set_sort_by_relevance_then_value(0, True)
+
         self.qp = xapian.QueryParser()
         self.qp.set_database(self.index)
         self.qp.set_stemming_strategy(xapian.QueryParser.STEM_NONE)
@@ -105,12 +115,6 @@
         '''
         return text.replace(u'&', u'&amp;').replace(u'<', u'&lt;').replace(u'>', u'&gt;')
  
-    def unEscapeUUID(self, uuid):
-        '''
-        Convert a dashless UUID to a standard UUID with dashes.
-        '''
-        return "%s-%s-%s-%s-%s" % (uuid[0:8], uuid[8:12], uuid[12:16], uuid[16:20], uuid[20:32])
-
     def setDuration(self, dur):
         '''
         If a duration is set, the search can color code track lengths
@@ -164,6 +168,19 @@
         out += u'height="13" width="28" align="middle" border="0">'
         return out
 
+    def lowercaseQuery(self, query):
+	'''
+	Change the query to all lower case since Xapian seems to have some case sensitivity issues.
+	But, don't change everything to lowercase -- AND, OR, NOT, XOR in uppercase must be preserved.
+	'''
+
+	out = []
+	for word in query.split(u' '):
+	    if word not in (u'AND', u'OR', u'NOT', u'XOR'): word = word.lower()
+	    out.append(word)
+
+        return u' '.join(out)
+
     def mangleQuery(self, query):
         '''
         For backwards compatibility, filter a query before passing it to lucene. Returns
@@ -171,6 +188,45 @@
         '''
         return query
 
+    def removeTermBoosting(self, query):
+        '''
+	Removes term boosting from old queries since Xapian doesn't support them
+        '''
+        return self.escapeTermBoost.sub("", query)
+
+    def removeDots(self, query):
+	'''
+	Remove dots between characters so we can find R.E.M.
+	'''
+
+	bits = []
+	index = 0
+	for m in self.dotsRe.finditer(query):
+	    bits.append(query[index:m.start()])
+	    acronym = query[m.start():m.end()]
+	    bits.append(acronym.replace(u".", u""))
+	    index = m.end()
+
+	if index < len(query):
+	    bits.append(query[index:len(query)])
+
+	return u''.join(bits)
+
+    def removeDashesFromMBIDs(self, query):
+
+	bits = []
+	index = 0
+	for m in self.uuidRe.finditer(query):
+	    bits.append(query[index:m.start()])
+	    acronym = query[m.start():m.end()]
+	    bits.append(acronym.replace(u"-", u""))
+	    index = m.end()
+
+	if index < len(query):
+	    bits.append(query[index:len(query)])
+
+	return u''.join(bits)
+
     def queryIndex(self, query, offset, maxHits):
         '''
         Carry out a search, and return the hits
@@ -178,14 +234,23 @@
 
         if not query: raise QueryError(u"No query was sent")
 
-
         try:
-            query = unac.unac_string(self.mangleQuery(unicode(query, 'utf-8')))
+            query = unicode(query, 'utf-8')
+	    self.f = open("/tmp/log", "a")
+	    print >>self.f, "query: '%s'" % query.encode('utf-8', 'replace') 
+	    self.f.close()
+            query = self.lowercaseQuery(query)
+            query = self.mangleQuery(query)
+            query = self.removeTermBoosting(query)
+            query = self.removeDots(query)
+	    query = self.removeDashesFromMBIDs(query)
+            query = unac.unac_string(query)
+            query = addSpacesToIdeographicStrings(query)
         except UnicodeDecodeError:
             raise QueryError(u"Unicode decode problem: Invalid utf-8 characters passed to search query.")
 
         try:
-	    parsedQuery = self.qp.parse_query(unac.unac_string(query), 
+	    parsedQuery = self.qp.parse_query(query, 
 			      	              xapian.QueryParser.FLAG_PHRASE | 
 				              xapian.QueryParser.FLAG_BOOLEAN | 
 				              xapian.QueryParser.FLAG_LOVEHATE,
@@ -216,7 +281,7 @@
 	    dataDict['_score'] = match.percent
 	    hits.append(dataDict)
             
-        return hits
+        return (hits, matches.get_matches_estimated())
  
     def log_error(self, msg):
         log = open("/tmp/slow_queries.txt", "a")
@@ -227,7 +292,7 @@
  
     def search(self, query, maxHits, offset, type='xml'):
         if maxHits < 1: maxHits = MAX_HITS
-        hits = self.queryIndex(query, offset, maxHits);
+        (hits, total) = self.queryIndex(query, offset, maxHits);
         redirect = ""
         if len(hits) == 1:
            doc = hits[0]
@@ -237,7 +302,7 @@
 
         if type == 'html':
             # This comment will be used by the MB server to determine the number of hits returned
-            stats = u"<!--\nhits=%d\noffset=%d\n" % (len(hits), offset)
+            stats = u"<!--\nhits=%d\noffset=%d\n" % (total, offset)
             if redirect: stats += u"redirect=%s\n" % redirect
             stats += u"-->"
             return stats.encode('utf-8') + self.asHTML(hits, maxHits, offset).encode('utf-8')




More information about the MusicBrainz-commits mailing list