[mb-commits] r9887 - search_index/trunk/mbsearch
root at musicbrainz.org
root at musicbrainz.org
Thu Jun 26 09:43:21 UTC 2008
Author: robert
Date: 2008-06-26 09:43:21 +0000 (Thu, 26 Jun 2008)
New Revision: 9887
Added:
search_index/trunk/mbsearch/escape_ideographic.py
Modified:
search_index/trunk/mbsearch/indexcreator.py
Log:
Check in the ideographic escaper hack and stop using a magic number in the indexer.
Added: search_index/trunk/mbsearch/escape_ideographic.py
Property changes on: search_index/trunk/mbsearch/escape_ideographic.py
___________________________________________________________________
Name: svn:executable
+ *
Name: svn:keywords
+ Id HeadURL
Modified: search_index/trunk/mbsearch/indexcreator.py
===================================================================
--- search_index/trunk/mbsearch/indexcreator.py 2008-06-26 08:01:43 UTC (rev 9886)
+++ search_index/trunk/mbsearch/indexcreator.py 2008-06-26 09:43:21 UTC (rev 9887)
@@ -34,6 +34,7 @@
# Number of rows to process for each database 'chunk'
IDS_PER_CHUNK = 2000
STATS_WINDOW_SIZE = 6
+MAX_FIELD_LEN = 65536
# TODO: Query DB in transactions to make sure final index is consistent
@@ -66,7 +67,7 @@
self.user = user
self.passwd = passwd
- self.dotsRe = re.compile("((?:\w\.){2,})")
+ self.dotsRe = re.compile("((?:\w\.){2,})", re.UNICODE)
self.indexer = xapian.TermGenerator()
if create:
@@ -214,7 +215,7 @@
self.indexer.index_text(text, 0, u"X" + name.upper())
#print "ndx: %s:'%s'" % (name.encode('utf-8', 'replace'), text.encode('utf-8', 'replace'))
if tokenize == FIELD_TOKENIZE_PRIMARY:
- doc.add_value(0, u"%d" % (1000 - len(text)))
+ doc.add_value(0, u"%d" % (MAX_FIELD_LEN - len(text)))
elif tokenize == FIELD_TOKENIZE_NONE:
#print "add: '%s'" % text.encode('utf-8', 'replace')
doc.add_term(u"X" + name.upper() + text, 0)
@@ -227,7 +228,7 @@
self.indexer.increase_termpos()
elif tokenize == FIELD_TOKENIZE_PRIMARY:
#print "val: '%s'" % len(text.split(u" "))
- doc.add_value(0, u"%d" % (1000 - len(text)))
+ doc.add_value(0, u"%d" % (MAX_FIELD_LEN - len(text)))
# Add the document to the index
More information about the MusicBrainz-commits mailing list