Skip to content

Commit e7fb38e

Browse files
author
Roberto Alsina
committed May 22, 2017
Similarity plugin (WIP)
1 parent 52fdc6e commit e7fb38e

File tree

2 files changed

+74
-0
lines changed

2 files changed

+74
-0
lines changed
 

‎v7/similarity/similarity.plugin

+12
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,12 @@
1+
[Core]
2+
Name = similarity
3+
Module = similarity
4+
5+
[Nikola]
6+
PluginCategory = Task
7+
8+
[Documentation]
9+
Author = Roberto Alsina
10+
Version = 0.1
11+
Website = http://plugins.getnikola.com/#similarity
12+
Description = Calculate similar posts

‎v7/similarity/similarity.py

+62
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,62 @@
1+
# -*- coding: utf-8 -*-
2+
3+
# Copyright © 2017 Roberto Alsina and others.
4+
5+
# Permission is hereby granted, free of charge, to any
6+
# person obtaining a copy of this software and associated
7+
# documentation files (the "Software"), to deal in the
8+
# Software without restriction, including without limitation
9+
# the rights to use, copy, modify, merge, publish,
10+
# distribute, sublicense, and/or sell copies of the
11+
# Software, and to permit persons to whom the Software is
12+
# furnished to do so, subject to the following conditions:
13+
#
14+
# The above copyright notice and this permission notice
15+
# shall be included in all copies or substantial portions of
16+
# the Software.
17+
#
18+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
19+
# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
20+
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
21+
# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
22+
# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
23+
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
24+
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25+
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26+
27+
from __future__ import print_function, unicode_literals
28+
29+
import gensim
30+
31+
from nikola.plugin_categories import Task
32+
33+
class Similarity(Task):
    """Calculate post similarity.

    Work-in-progress Nikola task plugin. It tokenizes the text of every
    post in the site timeline, fits a gensim LSI model over those token
    lists and prints, for each post, its closest matches in the corpus.
    """

    name = "similarity"

    # Number of latent topics the LSI model is fitted with.
    _NUM_TOPICS = 2
    # How many of the best-ranked matches are printed for each post.
    _TOP_N = 10

    def set_site(self, site):
        """Attach the Nikola site object to this plugin.

        The site is stored on the instance and then handed to the base
        class, so whatever set-up the base ``set_site`` performs is not
        silently skipped by this override.
        """
        self.site = site
        return super(Similarity, self).set_site(site)

    def gen_tasks(self):
        """Build similarity data for each post.

        This is a generator: it yields the plugin's group task so the
        caller always receives an iterable of tasks rather than ``None``.
        The similarity computation runs while the generator is consumed.

        For every post the ``_TOP_N`` best ``(post_index, score)`` pairs
        are printed, highest score first. Indexes refer to positions in
        ``self.site.timeline``; a post is also compared against itself.
        """
        self.site.scan_posts()
        yield self.group_task()

        # One lower-cased, whitespace-split token list per post.
        texts = [
            post.text(strip_html=True).lower().split()
            for post in self.site.timeline
        ]
        if not texts:
            # No posts means nothing to compare; skip model building.
            return

        dictionary = gensim.corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        lsi = gensim.models.LsiModel(
            corpus, id2word=dictionary, num_topics=self._NUM_TOPICS)
        index = gensim.similarities.MatrixSimilarity(lsi[corpus])

        # ``corpus`` already holds each post's bag-of-words vector, so it
        # is reused here instead of calling doc2bow() a second time.
        for i, vec_bow in enumerate(corpus):
            sims = index[lsi[vec_bow]]
            ranked = sorted(enumerate(sims), key=lambda item: -item[1])
            print(i, ranked[:self._TOP_N])
61+
62+

0 commit comments

Comments
 (0)
Please sign in to comment.