Download raw (923 bytes)
# -*- coding: utf-8 -*-
#!/usr/bin/env python
"""Print, for every description in a CSV file, the most similar other one.

Descriptions are read from the ``description`` column of ``CSVPATH``,
vectorised with TF-IDF, and compared pairwise through the product of the
TF-IDF matrix with its own transpose.  For each description the script
prints the description itself, the best score found among the *other*
descriptions, and the description that obtained that score.
"""

import csv
import sys

CSVPATH = 'datas/csv/beyond-the-first-decade_datas_2006-2015.csv'


def open_csv(path):
    """Open *path* the way the running interpreter's ``csv`` module expects."""
    if sys.version_info[0] < 3:
        # Python 2's csv module works on byte strings read in binary mode.
        return open(path, 'rb')
    # Python 3's csv module wants text opened with newline=''.
    # NOTE(review): utf-8 is assumed here; it matches the default decoding
    # TfidfVectorizer applies to byte strings -- confirm against the data.
    return open(path, newline='', encoding='utf-8')


def load_descriptions(csv_path):
    """Return the non-empty ``description`` values of *csv_path*, in file order.

    Raises ``KeyError`` if the file has no ``description`` column.
    """
    with open_csv(csv_path) as csvfile:
        reader = csv.DictReader(csvfile, delimiter=',', quotechar='"')
        return [row['description'] for row in reader if row['description']]


def similarity_matrix(descriptions):
    """Return a dense square array of pairwise TF-IDF dot products."""
    # Imported lazily: scikit-learn is heavy and only needed once the
    # matrix is actually built.
    from sklearn.feature_extraction.text import TfidfVectorizer

    vect = TfidfVectorizer(min_df=1)
    tfidf = vect.fit_transform(descriptions)
    # .dot() is an explicit matrix product for every scipy sparse type, and
    # .toarray() replaces the ``.A`` shortcut that newer SciPy removed.
    return tfidf.dot(tfidf.T).toarray()


def most_similar_index(similarities, own_index):
    """Return the index of the highest score in *similarities*, skipping *own_index*.

    Skipping the row's own position explicitly (instead of taking the
    second-largest value and searching for it) keeps a description from
    being matched with itself when another description has the same score,
    e.g. an exact duplicate.  Ties go to the lowest index.

    Returns ``None`` when there is no other entry to compare with.
    """
    best_index = None
    best_value = None
    for j, value in enumerate(similarities):
        if j == own_index:
            continue
        if best_index is None or value > best_value:
            best_index = j
            best_value = value
    return best_index


def print_most_similar(descriptions, matrix):
    """Print each description followed by its best match and the match score."""
    for i, row in enumerate(matrix):
        most_similar_i = most_similar_index(row, i)
        if most_similar_i is None:
            continue
        most_similar_value = row[most_similar_i]

        # Single-argument print(...) behaves identically on Python 2 and 3.
        print("A DESCRIPTION")
        print("")
        print(descriptions[i])
        print("")
        print("THE MOST SIMILAR DESCRIPTION @ %s" % most_similar_value)
        print("")
        print(descriptions[most_similar_i])
        print("")


def main():
    """Load the descriptions from ``CSVPATH`` and print the similarity report."""
    descriptions = load_descriptions(CSVPATH)
    if len(descriptions) < 2:
        # Nothing to compare; the vectorizer would also reject an empty corpus.
        sys.stderr.write("Need at least two descriptions to compare.\n")
        return
    print_most_similar(descriptions, similarity_matrix(descriptions))


if __name__ == '__main__':
    main()