#!/usr/bin/python
"""Rank the words of a sentence by how unusual they are for a reference corpus.

The corpus is a whitespace-separated text file (``keywords2.txt`` by default).
Each word of the sentence is scored as its count in the sentence divided by
its relative frequency in the corpus, so words that are rare in (or absent
from) the corpus score highest.

The module also provides ``profileit``, an optional profiling decorator.
"""
import cProfile
import functools
import pstats
from collections import Counter
from operator import itemgetter


def profileit(printlines=20, data_file="profiling.data"):
    """Build a decorator that profiles every call of the decorated function.

    ``hotshot`` (used originally) does not exist on Python 3, so the profile
    is collected with ``cProfile`` and reported through ``pstats``.

    Args:
        printlines: Maximum number of entries shown in the printed report.
        data_file: Path the raw profile is dumped to (and reloaded from)
            on every call; it is left on disk afterwards.

    Returns:
        A decorator. The wrapped function returns whatever the original
        function returns, after printing the profile report to stdout.
    """
    def _my(func):
        @functools.wraps(func)  # keep the wrapped function's name/docstring
        def _func(*args, **kargs):
            prof = cProfile.Profile()
            res = prof.runcall(func, *args, **kargs)
            # Persist the raw data so it can be inspected after the run.
            prof.dump_stats(data_file)
            stats = pstats.Stats(data_file)
            stats.strip_dirs()
            stats.sort_stats('time', 'calls')
            print(">>>---- Begin profiling print")
            stats.print_stats(printlines)
            print(">>>---- End profiling print")
            return res
        return _func
    return _my


# Uncomment the decorator below to profile every call.
# @profileit(20)
def find_keyword(test_string=None, keyword_file='keywords2.txt', top_n=2):
    """Return the ``top_n`` highest-scoring words of ``test_string``.

    A word's score is ``count_in_test_string / corpus_probability``, where
    ``corpus_probability`` is the word's share of all words in the corpus
    file. Words that never occur in the corpus are given half of the
    smallest corpus probability.

    Args:
        test_string: Sentence to analyse; split on whitespace. A falsy value
            (``None`` or ``''``) selects a built-in sample sentence.
        keyword_file: Path of the corpus text file.
        top_n: Number of ``(word, score)`` pairs to return.

    Returns:
        A list of at most ``top_n`` ``(word, score)`` tuples ordered by
        descending score. The list is also printed to stdout.

    Raises:
        ValueError: If the corpus file contains no words.
        IOError/OSError: If the corpus file cannot be opened.
    """
    if not test_string:
        test_string = 'Hacker news is a good site while Techcrunch not so much'

    # ``with`` guarantees the corpus file is closed even if reading fails.
    with open(keyword_file) as key_file:
        corpus_words = key_file.read().split()
    if not corpus_words:
        raise ValueError('keyword corpus %r contains no words' % (keyword_file,))

    # Float divisor keeps the division true division on Python 2 as well.
    corpus_size = float(len(corpus_words))
    word_prob = {
        word: count / corpus_size
        for word, count in Counter(corpus_words).items()
    }
    # Probability assigned to words that do not occur in the corpus at all.
    non_exist_prob = min(word_prob.values()) / 2

    scores = {
        word: count / word_prob.get(word, non_exist_prob)
        for word, count in Counter(test_string.split()).items()
    }
    # A reverse sort is still stable, so tied words keep their dict order,
    # matching the original comparison-function sort.
    ranked = sorted(scores.items(), key=itemgetter(1), reverse=True)
    top_words = ranked[:top_n]
    print(top_words)
    return top_words


if __name__ == "__main__":
    # Alternative way to profile a run: cProfile.run('find_keyword()')
    find_keyword()
    # Other sample inputs to try:
    # find_keyword('Java is an island and a programming language')
    # find_keyword('Java is an island and a programming language. Python is a snake and a programming')
    # find_keyword('Java is an island and a programming language')
    # find_keyword('Java is an island and a programming programming language')