Commit 0516994 Roberto Alsina
committed May 22, 2017
1 parent e7fb38e commit 0516994 Copy full SHA for 0516994
File tree 1 file changed +6
-7
lines changed
1 file changed +6
-7
lines changed Original file line number Diff line number Diff line change 30
30
31
31
from nikola .plugin_categories import Task
32
32
33
+
33
34
class Similarity(Task):
    """Task plugin that scores how similar each post is to the others."""

    name = "similarity"

    def set_site(self, site):
        """Store the site object that this plugin works on."""
        self.site = site

    def gen_tasks(self):
        """Rank all posts against each post and print the ten best matches."""
        self.site.scan_posts()

        # One lowercased, whitespace-split token list per timeline entry.
        texts = [
            entry.text(strip_html=True).lower().split()
            for entry in self.site.timeline
        ]

        # Bag-of-words corpus -> two-topic LSI model -> similarity index.
        dictionary = gensim.corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(tokens) for tokens in texts]
        lsi = gensim.models.LsiModel(corpus, id2word=dictionary, num_topics=2)
        index = gensim.similarities.MatrixSimilarity(lsi[corpus])

        def descending_score(pair):
            """Sort key: negate the score so the highest one sorts first."""
            return -pair[1]

        for position, _post in enumerate(self.site.timeline):
            # Project the post into LSI space and query the index with it.
            query = lsi[dictionary.doc2bow(texts[position])]
            ranked = sorted(enumerate(index[query]), key=descending_score)
            print(position, ranked[:10])
61
-
62
-
You can’t perform that action at this time.
0 commit comments