26
26
27
27
from __future__ import print_function , unicode_literals
28
28
29
+ import json
30
+ import os
31
+
29
32
import gensim
33
+ from stop_words import get_stop_words
30
34
31
35
from nikola .plugin_categories import Task
32
36
def gen_tasks(self):
    """Build related-post data and yield one JSON-writing task per post.

    For every site translation, posts are tokenized (lowercased, split on
    whitespace, stop words removed), projected into a 2-topic LSI space
    with gensim, and compared pairwise.  The combined similarity score for
    a pair of posts is ``tag_dice + title_dice + 2 * lsi_body_similarity``.
    For each post a doit task is yielded that writes its 10 most similar
    other posts to ``output/<post path>.related.json``.
    """
    self.site.scan_posts()

    # Per-language stop word lists; assumes every site translation code is
    # supported by stop_words.get_stop_words -- TODO confirm.
    stopwords = {}
    for lang in self.site.translations:
        stopwords[lang] = get_stop_words(lang)

    def split_text(text, lang="en"):
        """Lowercase, whitespace-tokenize *text* and drop stop words."""
        words = text.lower().split()
        return [w for w in words if w not in stopwords[lang]]

    yield self.group_task()

    def write_similar(path, related):
        """Serialize the related-post tuples to *path* as JSON."""
        data = []
        for p, score, tag, title, body in related:
            data.append({
                'url': '/' + p.destination_path(sep='/'),
                'title': p.title(),
                # gensim similarity values are numpy floats; coerce them so
                # json.dump does not raise TypeError on numpy scalars.
                'score': float(score),
                'detailed_score': [tag, title, float(body)],
            })
        # 'w' (was 'w+'): the file is only written, never read back here.
        with open(path, 'w') as outf:
            json.dump(data, outf)

    def tags_similarity(p1, p2):
        """Dice coefficient of the two posts' tag sets (0 if either is empty)."""
        t1 = set(p1.tags)
        t2 = set(p2.tags)
        if not (t1 and t2):
            return 0
        # Totally making this up
        return 2.0 * len(t1.intersection(t2)) / (len(t1) + len(t2))

    def title_similarity(p1, p2, lang):
        """Dice coefficient of the stop-word-filtered title word sets.

        *lang* is threaded through explicitly so non-English titles use the
        matching stop-word list (previously this always used the "en"
        default, which also raised KeyError when "en" was not a site
        translation).
        """
        t1 = set(split_text(p1.title(lang=lang), lang=lang))
        t2 = set(split_text(p2.title(lang=lang), lang=lang))
        if not (t1 and t2):
            return 0
        # Totally making this up
        return 2.0 * len(t1.intersection(t2)) / (len(t1) + len(t2))

    for lang in self.site.translations:
        texts = []
        for p in self.site.timeline:
            texts.append(split_text(p.text(strip_html=True, lang=lang), lang=lang))
        dictionary = gensim.corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        lsi = gensim.models.LsiModel(corpus, id2word=dictionary, num_topics=2)
        index = gensim.similarities.MatrixSimilarity(lsi[corpus])
        for i, post in enumerate(self.site.timeline):
            # FIXME config output
            out_name = os.path.join('output', post.destination_path(lang=lang)) + '.related.json'
            doc = texts[i]
            vec_bow = dictionary.doc2bow(doc)
            vec_lsi = lsi[vec_bow]
            body_sims = index[vec_lsi]
            tag_sims = [tags_similarity(post, p) for p in self.site.timeline]
            title_sims = [title_similarity(post, p, lang) for p in self.site.timeline]
            # Use j, not i: the original comprehension shadowed the enumerate
            # index, which (under Python 2's leaking list comprehensions)
            # clobbered i and broke the self-exclusion filter below.
            full_sims = [tag_sims[j] + title_sims[j] + body_sims[j] * 2
                         for j in range(len(self.site.timeline))]
            full_sims = sorted(enumerate(full_sims), key=lambda item: -item[1])
            # Take the 11 best and drop the post itself -> up to 10 related.
            related = [(self.site.timeline[s[0]], s[1], tag_sims[s[0]],
                        title_sims[s[0]], body_sims[s[0]])
                       for s in full_sims[:11] if s[0] != i]
            task = {
                'basename': self.name,
                'name': out_name,
                'targets': [out_name],
                'actions': [(write_similar, (out_name, related))],
            }
            yield task
0 commit comments