Skip to content

Commit fdc0f59

Browse files
author
Roberto Alsina
committedMay 23, 2017
getting there
1 parent 0516994 commit fdc0f59

File tree

2 files changed

+72
-15
lines changed

2 files changed

+72
-15
lines changed
 

‎v7/similarity/requirements.txt

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
stop-words

‎v7/similarity/similarity.py

+71-15
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,11 @@
2626

2727
from __future__ import print_function, unicode_literals
2828

29+
import json
30+
import os
31+
2932
import gensim
33+
from stop_words import get_stop_words
3034

3135
from nikola.plugin_categories import Task
3236

@@ -42,20 +46,72 @@ def gen_tasks(self):
4246
"""Build similarity data for each post."""
4347
self.site.scan_posts()
4448

49+
stopwords = {}
50+
for l in self.site.translations:
51+
stopwords[l] = get_stop_words(l)
52+
53+
def split_text(text, lang="en"):
    """Tokenize *text* into lowercase words, dropping stop words for *lang*."""
    tokens = text.lower().split()
    skip = stopwords[lang]
    return [t for t in tokens if t not in skip]
56+
57+
# FIXME langs!!!!
4558
texts = []
4659

47-
for p in self.site.timeline:
48-
texts.append(p.text(strip_html=True).lower().split())
49-
50-
dictionary = gensim.corpora.Dictionary(texts)
51-
corpus = [dictionary.doc2bow(text) for text in texts]
52-
lsi = gensim.models.LsiModel(corpus, id2word=dictionary, num_topics=2)
53-
index = gensim.similarities.MatrixSimilarity(lsi[corpus])
54-
55-
for i, post in enumerate(self.site.timeline):
56-
doc = texts[i]
57-
vec_bow = dictionary.doc2bow(doc)
58-
vec_lsi = lsi[vec_bow]
59-
sims = index[vec_lsi]
60-
sims = sorted(enumerate(sims), key=lambda item: -item[1])
61-
print(i, sims[:10])
60+
yield self.group_task()
61+
62+
def write_similar(path, related):
    """Serialize related-post records to *path* as a JSON list.

    Each item of *related* is a (post, score, tag_score, title_score,
    body_score) tuple; the body score is coerced to float so the
    (possibly numpy) similarity value serializes cleanly.
    """
    records = [
        {
            'url': '/' + post.destination_path(sep='/'),
            'title': post.title(),
            'score': score,
            'detailed_score': [tag_s, title_s, float(body_s)],
        }
        for post, score, tag_s, title_s, body_s in related
    ]
    with open(path, 'w+') as outf:
        json.dump(records, outf)
73+
74+
75+
def tags_similarity(p1, p2):
    """Dice coefficient of the two posts' tag sets (0 if either has no tags)."""
    a = set(p1.tags)
    b = set(p2.tags)
    if not a or not b:
        return 0
    # Dice's coefficient: 2|A∩B| / (|A| + |B|) — ad-hoc but symmetric.
    return 2.0 * len(a & b) / (len(a) + len(b))
82+
83+
def title_similarity(p1, p2):
    """Dice coefficient over stop-word-filtered title words (0 if either is empty)."""
    w1 = set(split_text(p1.title()))
    w2 = set(split_text(p2.title()))
    if not w1 or not w2:
        return 0
    # Same made-up symmetric measure as tags_similarity.
    return 2.0 * len(w1 & w2) / (len(w1) + len(w2))
90+
91+
for lang in self.site.translations:
92+
texts = []
93+
for p in self.site.timeline:
94+
texts.append(split_text(p.text(strip_html=True, lang=lang), lang=lang))
95+
dictionary = gensim.corpora.Dictionary(texts)
96+
corpus = [dictionary.doc2bow(text) for text in texts]
97+
lsi = gensim.models.LsiModel(corpus, id2word=dictionary, num_topics=2)
98+
index = gensim.similarities.MatrixSimilarity(lsi[corpus])
99+
for i, post in enumerate(self.site.timeline):
100+
# FIXME config output
101+
out_name = os.path.join('output', post.destination_path(lang=lang))+'.related.json'
102+
doc = texts[i]
103+
vec_bow = dictionary.doc2bow(doc)
104+
vec_lsi = lsi[vec_bow]
105+
body_sims = index[vec_lsi]
106+
tag_sims = [tags_similarity(post, p) for p in self.site.timeline]
107+
title_sims = [title_similarity(post, p) for p in self.site.timeline]
108+
full_sims = [tag_sims[i] + title_sims[i] + body_sims[i] *2 for i in range(len(self.site.timeline))]
109+
full_sims = sorted(enumerate(full_sims), key=lambda item: -item[1])
110+
related = [(self.site.timeline[s[0]], s[1], tag_sims[s[0]], title_sims[s[0]], body_sims[s[0]]) for s in full_sims[:11] if s[0] != i ]
111+
task = {
112+
'basename': self.name,
113+
'name': out_name,
114+
'targets': [out_name],
115+
'actions': [(write_similar, (out_name, related))],
116+
}
117+
yield task

0 commit comments

Comments
 (0)
Please sign in to comment.