# Eithon Cadag; pyindexer.py is released under GPL
# needed prereqs:
# - Python 2.3 (2.4 will not work due to Apache + mod_python issues)
# - Numeric Python
# - Natural Language Toolkit
# - Apache (for online indexing)
# - mod_python (for dynamic web indexing)
##############
# Usage:
# p = PyIndexer()                                         # instantiation
# p.add_text(1, "this is the content of my first text")   # adding a document
# p.add_text(2, "this is another text - my second one!")  # another doc
# p.run_indexing()                                        # indexes documents
# q = QueryVectorHandler(p)                               # prepares a handler for querying
# print q.search("second text")                           # returns results with scores, and total time

from Numeric import *
import nltk.stemmer.porter
import re, time, cPickle


class Timer:
    """Simple wall-clock stopwatch used to report indexing and query times."""

    def __init__(self):
        self.start = None
        self.stop = None

    def start_timer(self):
        self.start = time.time()

    def stop_timer(self):
        self.stop = time.time()

    def get_elapse(self):
        return self.stop - self.start


class TextPreProcessor:
    """Cleans, stop-lists, and stems raw text into indexable word lists."""

    def __init__(self):
        # One shared stemmer instance; building a new one per word is wasteful.
        self.stemmer = nltk.stemmer.porter.PorterStemmer()
        self.STOPWORDS = ("a about all among an and are as at "
                          "be been between both but by do during "
                          "each either for found from further "
                          "has have however if in into is it its i "
                          "made make many more most must no not "
                          "of on or same several some such "
                          "than that the their these they this those through to toward "
                          "upon used using was were what which while who will with within would")
        self.stop_list = self.STOPWORDS.split()
        self.stop_stemmed = [self._stem(x) for x in self.stop_list]
        self.pattern = re.compile(r'([a-z0-9\-]+)')

    def process(self, text):
        """Takes a string and processes it into a list of cleaned, stemmed words."""
        procd = []
        for token in text.split():
            word = self._stopcheck(self._clean_re(token))
            if word:
                procd.append(self._stem(word))
        return procd

    def _stem(self, word):
        return self.stemmer.stem_word(word)

    def _clean_re(self, word):
        """Lowercase a token and keep its leading run of [a-z0-9-] characters."""
        try:
            return self.pattern.match(word.lower()).group()
        except AttributeError:  # no match, e.g. token starts with punctuation
            return ''

    def _stopcheck(self, word):
        # Membership is tested against word lists rather than the raw
        # STOPWORDS string, so e.g. "bout" is not swallowed by the
        # substring "about".
        if word in self.stop_list or word in self.stop_stemmed:
            return ''
        return word


class PyIndexer:
    """Accumulates documents, then indexes them as term-frequency vectors."""

    def __init__(self):
        self.processor = TextPreProcessor()
        self.word_corpus = []   # vocabulary: one dimension per distinct term
        self.text_bin = []      # (doc_id, processed words) awaiting indexing
        self.index_bin = {}     # doc_id -> term-frequency vector

    def add_text(self, doc_id, text):
        words = self.processor.process(text)
        for w in words:
            if w not in self.word_corpus:
                self.word_corpus.append(w)
        self.text_bin.append((doc_id, words))

    def run_indexing(self):
        """During indexing, the text corpus is cleared to save on space -
        be sure to store the actual text data in another source somewhere."""
        assert self.text_bin is not None, "documents have already been indexed"
        t = Timer()
        t.start_timer()
        # Vectorize only after all add_text() calls, so every document is
        # measured against the full vocabulary.
        for doc_id, words in self.text_bin:
            self.index_bin[doc_id] = self.vectorize(words)
        self.text_bin = None  # drop the processed text; only vectors remain
        t.stop_timer()
        return t.get_elapse()

    def vectorize(self, words):
        """Build a term-frequency vector for a word list over the vocabulary."""
        return array([words.count(term) for term in self.word_corpus])


class IndexPickler:
    """Saves and restores a pickled PyIndexer at a fixed path."""

    def __init__(self, path):
        self.path = path

    def save(self, index):
        f = open(self.path, 'wb')  # binary mode keeps pickles intact on Windows
        cPickle.dump(index, f)
        f.close()

    def load(self):
        f = open(self.path, 'rb')
        index = cPickle.load(f)
        f.close()
        return index
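# A minimal sketch of building and persisting an index with IndexPickler,
# per the run_indexing() warning that the raw text corpus is discarded.
# The documents are the ones from the usage header; the relative path
# "pyin.in" is a made-up example:
#
#   p = PyIndexer()
#   p.add_text(1, "this is the content of my first text")
#   p.add_text(2, "this is another text - my second one!")
#   p.run_indexing()                     # vectorizes and drops the raw text
#   IndexPickler("pyin.in").save(p)      # persist the whole indexer...
#   p2 = IndexPickler("pyin.in").load()  # ...and restore it later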
class QueryVectorHandler:
    """Scores a query vector against every indexed document by cosine."""

    def __init__(self, pindex):
        assert pindex.index_bin, "index is empty - call run_indexing() first"
        self.index = pindex

    def vector_qry(self, txt):
        """Turn a query string into a vector over the index's vocabulary."""
        return self.index.vectorize(self.index.processor.process(txt))

    def normalize(self, vec):
        """Scale a vector to unit Euclidean length (zero vectors pass through).
        Dividing by the Euclidean norm, rather than the element count,
        makes get_cos() a true cosine."""
        norm = float(innerproduct(vec, vec)) ** 0.5
        if norm == 0:
            return vec
        return vec / norm

    def get_cos(self, vec_a, vec_b):
        """Cosine similarity between two term vectors."""
        try:
            score = innerproduct(self.normalize(vec_a), self.normalize(vec_b))
        except OverflowError:
            score = 0
        return score

    def search(self, txt, count=10):
        """Return the top `count` (score, doc_id) pairs plus the elapsed time."""
        t = Timer()
        t.start_timer()
        vector = self.vector_qry(txt)
        top_returns = []
        for tgt in self.index.index_bin:
            score = self.get_cos(vector, self.index.index_bin[tgt])
            if score > 0.0:
                top_returns.append((score, tgt))
        top_returns.sort()     # Python 2.3's sort() has no reverse keyword,
        top_returns.reverse()  # so sort ascending and flip
        t.stop_timer()
        return top_returns[0:count], t.get_elapse()


if __name__ == '__main__':
    # Demo: load a previously pickled index and time a single query against it.
    t = Timer()
    t.start_timer()
    ip = IndexPickler(r"C:\Program Files\Apache Group\Apache2\htdocs\cw\pyin.in")
    p = ip.load()
    q = QueryVectorHandler(p)
    print q.search("i need more fox ram for my jedi", 1)
    t.stop_timer()
    print t.get_elapse()
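# The prereqs above list Apache and mod_python for dynamic web indexing. A
# minimal handler sketch under that assumption is below; the parameter name
# 'q', the INDEX_PATH location, and the output format are hypothetical, and
# this is untested against a live mod_python setup:
#
#   import cgi
#   from mod_python import apache
#
#   INDEX_PATH = '/var/www/pyin.in'  # hypothetical pickled-index location
#
#   def handler(req):
#       req.content_type = 'text/plain'
#       params = cgi.parse_qs(req.args or '')
#       query = params.get('q', [''])[0]
#       q = QueryVectorHandler(IndexPickler(INDEX_PATH).load())
#       results, elapsed = q.search(query)
#       for score, doc_id in results:
#           req.write('%s\t%f\n' % (doc_id, score))
#       return apache.OK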