1
+ # -*- coding: utf-8 -*-
2
+
3
+ # Copyright © 2017 Roberto Alsina and others.
4
+
5
+ # Permission is hereby granted, free of charge, to any
6
+ # person obtaining a copy of this software and associated
7
+ # documentation files (the "Software"), to deal in the
8
+ # Software without restriction, including without limitation
9
+ # the rights to use, copy, modify, merge, publish,
10
+ # distribute, sublicense, and/or sell copies of the
11
+ # Software, and to permit persons to whom the Software is
12
+ # furnished to do so, subject to the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice
15
+ # shall be included in all copies or substantial portions of
16
+ # the Software.
17
+ #
18
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
19
+ # KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
20
+ # WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
21
+ # PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
22
+ # OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
23
+ # OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
24
+ # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25
+ # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26
+
27
+ from __future__ import print_function , unicode_literals
28
+
29
+ import gensim
30
+
31
+ from nikola .plugin_categories import Task
32
+
33
class Similarity(Task):
    """Calculate post similarity."""

    name = "similarity"

    def set_site(self, site):
        """Remember the site object whose posts will be compared."""
        self.site = site

    def gen_tasks(self):
        """Build similarity data for each post.

        Every post in the site's timeline is tokenized (HTML stripped,
        lower-cased, split on whitespace), a 2-topic LSI model is trained
        on the resulting bag-of-words corpus, and for each post the ten
        highest-scoring ``(timeline_index, similarity)`` pairs are printed.

        Yields:
            The plugin's group task, so that the caller always receives an
            iterable of task dicts instead of ``None``.
        """
        self.site.scan_posts()

        # A task plugin is expected to produce tasks; yielding the group
        # task makes this method a generator so iterating its result works
        # even though no per-post tasks are emitted yet.
        yield self.group_task()

        texts = [
            post.text(strip_html=True).lower().split()
            for post in self.site.timeline
        ]

        dictionary = gensim.corpora.Dictionary(texts)
        if len(dictionary) == 0:
            # No posts, or posts without any words: there is no vocabulary
            # to train an LSI model on, so there is nothing to compare.
            return

        corpus = [dictionary.doc2bow(text) for text in texts]
        lsi = gensim.models.LsiModel(corpus, id2word=dictionary, num_topics=2)
        index = gensim.similarities.MatrixSimilarity(lsi[corpus])

        # corpus[i] already is the bag-of-words vector of post i, so it is
        # reused here rather than being recomputed from the token list.
        for i, vec_bow in enumerate(corpus):
            sims = index[lsi[vec_bow]]
            # Highest similarity first; the post itself is part of the index
            # and is therefore included in its own ranking.
            sims = sorted(enumerate(sims), key=lambda item: -item[1])
            print(i, sims[:10])
61
+
62
+
0 commit comments