[mb-commits] r9887 - search_index/trunk/mbsearch

root at musicbrainz.org root at musicbrainz.org
Thu Jun 26 09:43:21 UTC 2008


Author: robert
Date: 2008-06-26 09:43:21 +0000 (Thu, 26 Jun 2008)
New Revision: 9887

Added:
   search_index/trunk/mbsearch/escape_ideographic.py
Modified:
   search_index/trunk/mbsearch/indexcreator.py
Log:
Check in the ideographic escaper hack, and replace the magic number in the indexer with a named constant (MAX_FIELD_LEN).


Added: search_index/trunk/mbsearch/escape_ideographic.py


Property changes on: search_index/trunk/mbsearch/escape_ideographic.py
___________________________________________________________________
Name: svn:executable
   + *
Name: svn:keywords
   + Id HeadURL

Modified: search_index/trunk/mbsearch/indexcreator.py
===================================================================
--- search_index/trunk/mbsearch/indexcreator.py	2008-06-26 08:01:43 UTC (rev 9886)
+++ search_index/trunk/mbsearch/indexcreator.py	2008-06-26 09:43:21 UTC (rev 9887)
@@ -34,6 +34,7 @@
 # Number of rows to process for each database 'chunk'
 IDS_PER_CHUNK = 2000
 STATS_WINDOW_SIZE = 6
+MAX_FIELD_LEN = 65536
 
 # TODO: Query DB in transactions to make sure final index is consistent
 
@@ -66,7 +67,7 @@
         self.user = user
         self.passwd = passwd
 
-	self.dotsRe = re.compile("((?:\w\.){2,})")
+	self.dotsRe = re.compile("((?:\w\.){2,})", re.UNICODE)
 
         self.indexer = xapian.TermGenerator()
         if create:
@@ -214,7 +215,7 @@
                                     self.indexer.index_text(text, 0, u"X" + name.upper())
 				    #print "ndx: %s:'%s'" % (name.encode('utf-8', 'replace'), text.encode('utf-8', 'replace'))
 				    if tokenize == FIELD_TOKENIZE_PRIMARY:
-					doc.add_value(0, u"%d" % (1000 - len(text)))
+					doc.add_value(0, u"%d" % (MAX_FIELD_LEN - len(text)))
 			        elif tokenize == FIELD_TOKENIZE_NONE:
 				    #print "add: '%s'" % text.encode('utf-8', 'replace')
 				    doc.add_term(u"X" + name.upper() + text, 0)
@@ -227,7 +228,7 @@
 			        self.indexer.increase_termpos()
 			    elif tokenize == FIELD_TOKENIZE_PRIMARY:
 				#print "val: '%s'" % len(text.split(u" "))
-				doc.add_value(0, u"%d" % (1000 - len(text)))
+				doc.add_value(0, u"%d" % (MAX_FIELD_LEN - len(text)))
 			        
 
                 # Add the document to the index




More information about the MusicBrainz-commits mailing list