# Eithon Cadag; pyindexer.py is released under GPL
# needed prereqs:
# - Python 2.3 (2.4 will not work due to Apache + mod_python issues)
# - Numeric Python
# - Natural Language Toolkit
# - Apache (for online indexing)
# - mod_python (for dynamic web indexing)
##############
# Usage:
# p = PyIndexer()                                         # instantiation
# p.add_text(1, "this is the content of my first text")   # adding a document
# p.add_text(2, "this is another text - my second one!")  # another doc
# p.run_indexing()                                        # indexes documents
# q = QueryVectorHandler(p)                               # prepares a handler for querying
# print q.search("second text")                           # returns results with scores, and total time

from Numeric import *
import nltk.stemmer.porter
import re, time, cPickle


class Timer:
    """Simple wall-clock stopwatch used to report indexing and query times."""

    def __init__(self):
        self.start = None
        self.stop = None

    def start_timer(self):
        self.start = time.time()

    def stop_timer(self):
        self.stop = time.time()

    def get_elapse(self):
        return self.stop - self.start


class TextPreProcessor:
    """Cleans, stop-lists, and stems raw text into indexable word lists."""

    def __init__(self):
        # One shared stemmer instance; building a new one per word is wasteful.
        self.stemmer = nltk.stemmer.porter.PorterStemmer()
        self.STOPWORDS = ("a about all among an and are as at "
                          "be been between both but by do during "
                          "each either for found from further "
                          "has have however if in into is it its i "
                          "made make many more most must no not "
                          "of on or same several some such "
                          "than that the their these they this those through to toward "
                          "upon used using was were what which while who will with within would")
        self.stop_list = self.STOPWORDS.split()
        self.stop_stemmed = [self._stem(x) for x in self.stop_list]
        self.pattern = re.compile(r'([a-z0-9\-]+)')

    def process(self, text):
        """Takes a string and processes it into a list of cleaned, stemmed words."""
        procd = []
        for token in text.split():
            word = self._stopcheck(self._clean_re(token))
            if word:
                procd.append(self._stem(word))
        return procd

    def _stem(self, word):
        return self.stemmer.stem_word(word)

    def _clean_re(self, word):
        """Lowercase a token and keep its leading run of [a-z0-9-] characters."""
        try:
            return self.pattern.match(word.lower()).group()
        except AttributeError:  # no match, e.g. token starts with punctuation
            return ''

    def _stopcheck(self, word):
        # Membership is tested against word lists rather than the raw
        # STOPWORDS string, so e.g. "bout" is not swallowed by the
        # substring "about".
        if word in self.stop_list or word in self.stop_stemmed:
            return ''
        return word


class PyIndexer:
    """Accumulates documents, then indexes them as term-frequency vectors."""

    def __init__(self):
        self.processor = TextPreProcessor()
        self.word_corpus = []   # vocabulary: one dimension per distinct term
        self.text_bin = []      # (doc_id, processed words) awaiting indexing
        self.index_bin = {}     # doc_id -> term-frequency vector

    def add_text(self, doc_id, text):
        words = self.processor.process(text)
        for w in words:
            if w not in self.word_corpus:
                self.word_corpus.append(w)
        self.text_bin.append((doc_id, words))

    def run_indexing(self):
        """During indexing, the text corpus is cleared to save on space -
        be sure to store the actual text data in another source somewhere."""
        assert self.text_bin is not None, "documents have already been indexed"
        t = Timer()
        t.start_timer()
        # Vectorize only after all add_text() calls, so every document is
        # measured against the full vocabulary.
        for doc_id, words in self.text_bin:
            self.index_bin[doc_id] = self.vectorize(words)
        self.text_bin = None  # drop the processed text; only vectors remain
        t.stop_timer()
        return t.get_elapse()

    def vectorize(self, words):
        """Build a term-frequency vector for a word list over the vocabulary."""
        return array([words.count(term) for term in self.word_corpus])


class IndexPickler:
    """Saves and restores a pickled PyIndexer at a fixed path."""

    def __init__(self, path):
        self.path = path

    def save(self, index):
        f = open(self.path, 'wb')  # binary mode keeps pickles intact on Windows
        cPickle.dump(index, f)
        f.close()

    def load(self):
        f = open(self.path, 'rb')
        index = cPickle.load(f)
        f.close()
        return index
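# A minimal sketch of building and persisting an index with IndexPickler,
# per the run_indexing() warning that the raw text corpus is discarded.
# The documents are the ones from the usage header; the relative path
# "pyin.in" is a made-up example:
#
#   p = PyIndexer()
#   p.add_text(1, "this is the content of my first text")
#   p.add_text(2, "this is another text - my second one!")
#   p.run_indexing()                     # vectorizes and drops the raw text
#   IndexPickler("pyin.in").save(p)      # persist the whole indexer...
#   p2 = IndexPickler("pyin.in").load()  # ...and restore it later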
class QueryVectorHandler:
    """Scores a query vector against every indexed document by cosine."""

    def __init__(self, pindex):
        assert pindex.index_bin, "index is empty - call run_indexing() first"
        self.index = pindex

    def vector_qry(self, txt):
        """Turn a query string into a vector over the index's vocabulary."""
        return self.index.vectorize(self.index.processor.process(txt))

    def normalize(self, vec):
        """Scale a vector to unit Euclidean length (zero vectors pass through).
        Dividing by the Euclidean norm, rather than the element count,
        makes get_cos() a true cosine."""
        norm = float(innerproduct(vec, vec)) ** 0.5
        if norm == 0:
            return vec
        return vec / norm

    def get_cos(self, vec_a, vec_b):
        """Cosine similarity between two term vectors."""
        try:
            score = innerproduct(self.normalize(vec_a), self.normalize(vec_b))
        except OverflowError:
            score = 0
        return score

    def search(self, txt, count=10):
        """Return the top `count` (score, doc_id) pairs plus the elapsed time."""
        t = Timer()
        t.start_timer()
        vector = self.vector_qry(txt)
        top_returns = []
        for tgt in self.index.index_bin:
            score = self.get_cos(vector, self.index.index_bin[tgt])
            if score > 0.0:
                top_returns.append((score, tgt))
        top_returns.sort()     # Python 2.3's sort() has no reverse keyword,
        top_returns.reverse()  # so sort ascending and flip
        t.stop_timer()
        return top_returns[0:count], t.get_elapse()


if __name__ == '__main__':
    # Demo: load a previously pickled index and time a single query against it.
    t = Timer()
    t.start_timer()
    ip = IndexPickler(r"C:\Program Files\Apache Group\Apache2\htdocs\cw\pyin.in")
    p = ip.load()
    q = QueryVectorHandler(p)
    print q.search("i need more fox ram for my jedi", 1)
    t.stop_timer()
    print t.get_elapse()
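# The prereqs above list Apache and mod_python for dynamic web indexing. A
# minimal handler sketch under that assumption is below; the parameter name
# 'q', the INDEX_PATH location, and the output format are hypothetical, and
# this is untested against a live mod_python setup:
#
#   import cgi
#   from mod_python import apache
#
#   INDEX_PATH = '/var/www/pyin.in'  # hypothetical pickled-index location
#
#   def handler(req):
#       req.content_type = 'text/plain'
#       params = cgi.parse_qs(req.args or '')
#       query = params.get('q', [''])[0]
#       q = QueryVectorHandler(IndexPickler(INDEX_PATH).load())
#       results, elapsed = q.search(query)
#       for score, doc_id in results:
#           req.write('%s\t%f\n' % (doc_id, score))
#       return apache.OK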