#!/usr/bin/python
"""Rank the words of a text by how unusual they are against a CouchDB corpus.

Word statistics are read from the ``finding/word_count`` view of the
database named by ``db_name``.  Importing this module connects to the
CouchDB server and creates that database if it does not exist yet.
"""
import functools
import hotshot, hotshot.stats
import couchdb
import simplejson as json
import urllib


def profileit(printlines=20):
    """Return a decorator that profiles every call of the wrapped function.

    Each call is run under ``hotshot`` with the raw data written to
    ``profiling.data``; afterwards the statistics, sorted by time and then
    by call count, are printed.

    printlines -- how many statistic rows to print per call.
    """
    def _my(func):
        @functools.wraps(func)  # keep the wrapped function's name/docstring
        def _func(*args, **kargs):
            prof = hotshot.Profile("profiling.data")
            try:
                res = prof.runcall(func, *args, **kargs)
            finally:
                # Close the profile even when func raises, so the data file
                # is not left open.
                prof.close()
            stats = hotshot.stats.load("profiling.data")
            stats.strip_dirs()
            stats.sort_stats('time', 'calls')
            print(">>>---- Begin profiling print")
            stats.print_stats(printlines)
            print(">>>---- End profiling print")
            return res
        return _func
    return _my


# Alternate database name (disabled): "kw2"
db_name = "keywords"
view_url = "http://localhost:5984/%s/_design/finding/_view/word_count" % (db_name)

try:
    server = couchdb.Server('http://127.0.0.1:5984')
    db = server[db_name]
    # Touch the database so a missing one raises here rather than later.
    len(db)
except couchdb.client.ResourceNotFound:
    server.create(db_name)
    db = server[db_name]
    len(db)


def load_db():
    """Create one ``{"word": ...}`` document per whitespace-separated token
    of ``keywords2.txt``."""
    key_file = open('keywords2.txt')
    try:
        data = key_file.read()
    finally:
        key_file.close()
    for word in data.split():
        db.create({"word": word})


def _fetch_json(url):
    """Fetch *url* and return its body decoded as JSON."""
    response = urllib.urlopen(url)
    try:
        return json.loads(response.read())
    finally:
        response.close()


def all_word_count():
    """Return the grouped rows of the word-count view.

    Each row is expected to look like ``{"key": word, "value": count}``.
    Any failure (network, HTTP, malformed JSON, missing ``rows``) yields
    ``[{}]``, which ``build_prob_dict`` turns into an empty dictionary.
    """
    try:
        # Alternate URL form (disabled):
        # u = "http://localhost:5984/%s/_view/finding/word_count?group=true" % (db_name)
        u = "%s?group=true" % (view_url)
        ### Example Output: {"rows":[{"key":"be","value":1},{"key":"do","value":4},{"key":"to","value":1},{"key":"we","value":2}]}
        return _fetch_json(u)['rows']
    except Exception:
        return [{}]


def total_word_count():
    """Return the ungrouped total from the word-count view, or 0 on any
    failure."""
    try:
        # Alternate URL form (disabled):
        # u = "http://localhost:5984/%s/_view/finding/word_count" % (db_name)
        u = view_url
        ### output {"rows":[{"key":null,"value":124580}]}
        return _fetch_json(u)['rows'][0]['value']
    except Exception:
        return 0


def build_prob_dict(word_list, total_words):
    """Map each row's ``key`` to ``value / total_words``.

    word_list   -- rows shaped like ``{"key": word, "value": count}``.
    total_words -- total number of words; converted with ``float``.

    Returns ``{}`` when a row lacks ``key``/``value``, a value is not
    numeric, or ``total_words`` is zero.
    """
    num = float(total_words)
    try:
        return dict((r['key'], r['value'] / num) for r in word_list)
    except (KeyError, TypeError, ZeroDivisionError):
        return {}


# Uncomment to profile find_keyword:
#@profileit(20)
def find_keyword(test_string = None):
    """Print and return the two highest-scoring words of *test_string*.

    A word's score is its number of occurrences in *test_string* divided by
    its probability in the corpus.  Words absent from the corpus are given
    half of the smallest corpus probability.  A built-in sample sentence is
    used when *test_string* is empty or None.

    Returns a list of at most two ``(word, score)`` tuples, highest score
    first, or ``[]`` when no corpus statistics could be obtained.
    """
    if not test_string:
        test_string = 'Hacker news is a good site while Techcrunch not so much'

    word_prob_dict = build_prob_dict(all_word_count(), total_word_count())
    if not word_prob_dict:
        # Without corpus statistics there is no baseline to rank against;
        # min() on an empty sequence would raise ValueError below.
        print([])
        return []
    non_exist_prob = min(word_prob_dict.values()) / 2.0

    # Build a word count from the input text
    test_word_freq = {}
    for word in test_string.split():
        test_word_freq[word] = test_word_freq.get(word, 0) + 1

    # Score every distinct input word against the corpus probabilities
    test_words_ba = {}
    for word, freq in test_word_freq.items():
        test_words_ba[word] = freq / word_prob_dict.get(word, non_exist_prob)

    # Stable descending sort: equal scores keep the dictionary's order.
    ranked = sorted(test_words_ba.items(), key=lambda pair: pair[1],
                    reverse=True)
    top = ranked[:2]
    print(top)
    return top


if __name__ == "__main__":
    # load_db()
    find_keyword()