Commit

getting there
Roberto Alsina committed May 23, 2017
1 parent 0516994 commit fdc0f59
Showing 2 changed files with 72 additions and 15 deletions.
v7/similarity/requirements.txt: 1 change (1 addition, 0 deletions)
@@ -0,0 +1 @@
stop-words
v7/similarity/similarity.py: 86 changes (71 additions, 15 deletions)
@@ -26,7 +26,11 @@

from __future__ import print_function, unicode_literals

import json
import os

import gensim
from stop_words import get_stop_words

from nikola.plugin_categories import Task

@@ -42,20 +46,72 @@ def gen_tasks(self):
"""Build similarity data for each post."""
self.site.scan_posts()
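# Overall approach: for every language, strip stop words from each post's text,
# build a gensim LSI model over the post bodies, mix in tag and title overlap,
# and write the most similar posts next to each post's output as .related.json.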

stopwords = {}
for l in self.site.translations:
stopwords[l] = get_stop_words(l)

def split_text(text, lang="en"):
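"""Lower-case and split *text* into words, dropping the stop words for *lang*."""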
words = text.lower().split()
return [w for w in words if w not in stopwords[lang]]

# FIXME langs!!!!
texts = []

for p in self.site.timeline:
texts.append(p.text(strip_html=True).lower().split())

dictionary = gensim.corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lsi = gensim.models.LsiModel(corpus, id2word=dictionary, num_topics=2)
index = gensim.similarities.MatrixSimilarity(lsi[corpus])

for i, post in enumerate(self.site.timeline):
doc = texts[i]
vec_bow = dictionary.doc2bow(doc)
vec_lsi = lsi[vec_bow]
sims = index[vec_lsi]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(i, sims[:10])
yield self.group_task()

def write_similar(path, related):
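"""Write one post's related-post data to *path* as a JSON list."""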
data = []
for p, score, tag, title, body in related:
data.append({
'url': '/' + p.destination_path(sep='/'),
'title': p.title(),
'score': score,
'detailed_score': [tag, title, float(body)],
})
with open(path, 'w+') as outf:
json.dump(data, outf)


def tags_similarity(p1, p2):
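"""Score tag overlap between two posts (Sørensen-Dice coefficient)."""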
t1 = set(p1.tags)
t2 = set(p2.tags)
if not (t1 and t2):
return 0
# Totally making this up
return 2.0 * len(t1.intersection(t2)) / (len(t1) + len(t2))

def title_similarity(p1, p2):
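"""Score overlap of title words, minus stop words, with the same coefficient."""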
t1 = set(split_text(p1.title()))
t2 = set(split_text(p2.title()))
if not (t1 and t2):
return 0
# Totally making this up
return 2.0 * len(t1.intersection(t2)) / (len(t1) + len(t2))

for lang in self.site.translations:
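# Per-language machinery: a gensim dictionary and bag-of-words corpus over the
# post bodies, a 2-topic LSI model, and a similarity index to query it.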
texts = []
for p in self.site.timeline:
texts.append(split_text(p.text(strip_html=True, lang=lang), lang=lang))
dictionary = gensim.corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lsi = gensim.models.LsiModel(corpus, id2word=dictionary, num_topics=2)
index = gensim.similarities.MatrixSimilarity(lsi[corpus])
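# For each post, combine LSI body similarity with tag and title overlap (body
# weighted double), keep the top-scoring other posts, and emit a task that
# writes them as JSON next to the post's rendered output.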
for i, post in enumerate(self.site.timeline):
# FIXME config output
out_name = os.path.join('output', post.destination_path(lang=lang)) + '.related.json'
doc = texts[i]
vec_bow = dictionary.doc2bow(doc)
vec_lsi = lsi[vec_bow]
body_sims = index[vec_lsi]
tag_sims = [tags_similarity(post, p) for p in self.site.timeline]
title_sims = [title_similarity(post, p) for p in self.site.timeline]
full_sims = [tag_sims[j] + title_sims[j] + body_sims[j] * 2 for j in range(len(self.site.timeline))]
full_sims = sorted(enumerate(full_sims), key=lambda item: -item[1])
related = [(self.site.timeline[s[0]], s[1], tag_sims[s[0]], title_sims[s[0]], body_sims[s[0]]) for s in full_sims[:11] if s[0] != i]
task = {
'basename': self.name,
'name': out_name,
'targets': [out_name],
'actions': [(write_similar, (out_name, related))],
}
yield task
