Navigation Menu

Skip to content

Commit

Permalink
Added threadurls for reddit crawler
Browse files Browse the repository at this point in the history
  • Loading branch information
noqqe committed Nov 2, 2017
1 parent 61dead2 commit 719dc1b
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 4 deletions.
7 changes: 4 additions & 3 deletions crawler/__init__.py
Expand Up @@ -226,11 +226,12 @@ def __images_clear(cls):
cls.__images = {} # alternative: cls.__images[:] = [] # be aware: list.clear() is not available in py2

@classmethod
def __add_image(cls, uri, crawler, site):
def __add_image(cls, uri, crawler, site, threadurl=None):
"""
:type uri: str
:type crawler: str
:type site: str
:type threadurl: str
:return: bool
"""
if not cls._is_image(uri):
Expand All @@ -247,8 +248,8 @@ def __add_image(cls, uri, crawler, site):
cls.__images[crawler][site] = []

cls._blacklist(uri) # add it to the blacklist to detect duplicates
cls.__images[crawler][site].append("%s#%s" % (uri, crawler))
cls._log("debug", "added %s-%s: %s" % (crawler, site, uri))
cls.__images[crawler][site].append("%s#%s#%s" % (uri, crawler, threadurl))
cls._log("debug", "added %s-%s-%s: %s" % (crawler, site, threadurl, uri))
return True

@classmethod
Expand Down
4 changes: 3 additions & 1 deletion crawler/reddit.py
Expand Up @@ -46,7 +46,9 @@ def _crawl(self):
for child in data['data']['children']:
image = child['data']['url']
if image:
if self._add_image(image, self.__site):
threadurl = 'https://reddit.com/' + child['data']['permalink']
self.__class__._log("debug", threadurl)
if self._add_image(image, self.__site, threadurl):
images_added += 1

if not images_added:
Expand Down

0 comments on commit 719dc1b

Please sign in to comment.