26
26
27
27
from __future__ import print_function , unicode_literals
28
28
29
+ import json
30
+ import os
31
+
29
32
import gensim
33
+ from stop_words import get_stop_words
30
34
31
35
from nikola .plugin_categories import Task
32
36
def gen_tasks(self):
    """Build related-post data and yield one JSON-writing task per post.

    For every site translation, posts are tokenized (lowercased, split on
    whitespace, stop words removed), projected into a 2-topic LSI space
    with gensim, and compared pairwise.  The combined similarity score for
    a pair of posts is ``tag_dice + title_dice + 2 * lsi_body_similarity``.
    For each post a doit task is yielded that writes its 10 most similar
    other posts to ``output/<post path>.related.json``.
    """
    self.site.scan_posts()

    # Per-language stop word lists; assumes every site translation code is
    # supported by stop_words.get_stop_words -- TODO confirm.
    stopwords = {}
    for lang in self.site.translations:
        stopwords[lang] = get_stop_words(lang)

    def split_text(text, lang="en"):
        """Lowercase, whitespace-tokenize *text* and drop stop words."""
        words = text.lower().split()
        return [w for w in words if w not in stopwords[lang]]

    yield self.group_task()

    def write_similar(path, related):
        """Serialize the related-post tuples to *path* as JSON."""
        data = []
        for p, score, tag, title, body in related:
            data.append({
                'url': '/' + p.destination_path(sep='/'),
                'title': p.title(),
                # gensim similarity values are numpy floats; coerce them so
                # json.dump does not raise TypeError on numpy scalars.
                'score': float(score),
                'detailed_score': [tag, title, float(body)],
            })
        # 'w' (was 'w+'): the file is only written, never read back here.
        with open(path, 'w') as outf:
            json.dump(data, outf)

    def tags_similarity(p1, p2):
        """Dice coefficient of the two posts' tag sets (0 if either is empty)."""
        t1 = set(p1.tags)
        t2 = set(p2.tags)
        if not (t1 and t2):
            return 0
        # Totally making this up
        return 2.0 * len(t1.intersection(t2)) / (len(t1) + len(t2))

    def title_similarity(p1, p2, lang):
        """Dice coefficient of the stop-word-filtered title word sets.

        *lang* is threaded through explicitly so non-English titles use the
        matching stop-word list (previously this always used the "en"
        default, which also raised KeyError when "en" was not a site
        translation).
        """
        t1 = set(split_text(p1.title(lang=lang), lang=lang))
        t2 = set(split_text(p2.title(lang=lang), lang=lang))
        if not (t1 and t2):
            return 0
        # Totally making this up
        return 2.0 * len(t1.intersection(t2)) / (len(t1) + len(t2))

    for lang in self.site.translations:
        texts = []
        for p in self.site.timeline:
            texts.append(split_text(p.text(strip_html=True, lang=lang), lang=lang))
        dictionary = gensim.corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        lsi = gensim.models.LsiModel(corpus, id2word=dictionary, num_topics=2)
        index = gensim.similarities.MatrixSimilarity(lsi[corpus])
        for i, post in enumerate(self.site.timeline):
            # FIXME config output
            out_name = os.path.join('output', post.destination_path(lang=lang)) + '.related.json'
            doc = texts[i]
            vec_bow = dictionary.doc2bow(doc)
            vec_lsi = lsi[vec_bow]
            body_sims = index[vec_lsi]
            tag_sims = [tags_similarity(post, p) for p in self.site.timeline]
            title_sims = [title_similarity(post, p, lang) for p in self.site.timeline]
            # Use j, not i: the original comprehension shadowed the enumerate
            # index, which (under Python 2's leaking list comprehensions)
            # clobbered i and broke the self-exclusion filter below.
            full_sims = [tag_sims[j] + title_sims[j] + body_sims[j] * 2
                         for j in range(len(self.site.timeline))]
            full_sims = sorted(enumerate(full_sims), key=lambda item: -item[1])
            # Take the 11 best and drop the post itself -> up to 10 related.
            related = [(self.site.timeline[s[0]], s[1], tag_sims[s[0]],
                        title_sims[s[0]], body_sims[s[0]])
                       for s in full_sims[:11] if s[0] != i]
            task = {
                'basename': self.name,
                'name': out_name,
                'targets': [out_name],
                'actions': [(write_similar, (out_name, related))],
            }
            yield task
0 commit comments