diff options
Diffstat (limited to 'gallery_dl/extractor/reddit.py')
| -rw-r--r-- | gallery_dl/extractor/reddit.py | 80 |
1 files changed, 58 insertions, 22 deletions
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 94e95e8..9c283de 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -6,7 +6,7 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from subreddits at https://www.reddit.com/""" +"""Extractors for https://www.reddit.com/""" from .common import Extractor, Message from .. import text, util, extractor, exception @@ -31,7 +31,8 @@ class RedditExtractor(Extractor): yield Message.Version, 1 with extractor.blacklist( - util.SPECIAL_EXTRACTORS, [RedditSubredditExtractor]): + util.SPECIAL_EXTRACTORS, + [RedditSubredditExtractor, RedditUserExtractor]): while True: extra = [] for url, data in self._urls(submissions): @@ -68,18 +69,18 @@ class RedditExtractor(Extractor): submission["selftext_html"] or "", ' href="', '"'): yield url, submission - for comment in comments: - for url in text.extract_iter( - comment["body_html"] or "", ' href="', '"'): - yield url, comment + if comments: + for comment in comments: + for url in text.extract_iter( + comment["body_html"] or "", ' href="', '"'): + yield url, comment class RedditSubredditExtractor(RedditExtractor): - """Extractor for images from subreddits on reddit.com""" + """Extractor for URLs from subreddits on reddit.com""" subcategory = "subreddit" - pattern = (r"(?:https?://)?(?:\w+\.)?reddit\.com/r/([^/?&#]+)" - r"(/[a-z]+)?/?" - r"(?:\?.*?(?:\bt=([a-z]+))?)?$") + pattern = (r"(?:https?://)?(?:\w+\.)?reddit\.com/r/" + r"([^/?&#]+(?:/[a-z]+)?)/?(?:\?([^#]*))?(?:$|#)") test = ( ("https://www.reddit.com/r/lavaporn/"), ("https://www.reddit.com/r/lavaporn/top/?sort=top&t=month"), @@ -90,24 +91,53 @@ class RedditSubredditExtractor(RedditExtractor): def __init__(self, match): RedditExtractor.__init__(self, match) - self.subreddit, self.order, self.timeframe = match.groups() + self.subreddit = match.group(1) + self.params = text.parse_query(match.group(2)) def submissions(self): - subreddit = self.subreddit + (self.order or "") - params = {"t": self.timeframe} if self.timeframe else {} - return self.api.submissions_subreddit(subreddit, params) + return self.api.submissions_subreddit(self.subreddit, self.params) + + +class RedditUserExtractor(RedditExtractor): + """Extractor for URLs from posts by a reddit user""" + subcategory = "user" + pattern = (r"(?:https?://)?(?:\w+\.)?reddit\.com/u(?:ser)?/" + r"([^/?&#]+(?:/[a-z]+)?)/?(?:\?([^#]*))?") + test = ( + ("https://www.reddit.com/user/username/", { + "count": ">= 2", + }), + ("https://www.reddit.com/user/username/gilded/?sort=top&t=month"), + ("https://old.reddit.com/user/username/"), + ("https://www.reddit.com/u/username/"), + ) + + def __init__(self, match): + RedditExtractor.__init__(self, match) + self.user = match.group(1) + self.params = text.parse_query(match.group(2)) + + def submissions(self): + return self.api.submissions_user(self.user, self.params) class RedditSubmissionExtractor(RedditExtractor): - """Extractor for images from a submission on reddit.com""" + """Extractor for URLs from a submission on reddit.com""" subcategory = "submission" pattern = (r"(?:https?://)?(?:" r"(?:\w+\.)?reddit\.com/r/[^/?&#]+/comments|" r"redd\.it" r")/([a-z0-9]+)") test = ( - ("https://www.reddit.com/r/lavaporn/comments/2a00np/", { - "pattern": r"https?://i\.imgur\.com/AaAUCgy\.jpg", + ("https://www.reddit.com/r/lavaporn/comments/8cqhub/", { + "pattern": r"https://", + "count": 3, + }), + # ignore submission comments (#429) + ("https://www.reddit.com/r/lavaporn/comments/8cqhub/", { + "options": (("comments", 0),), + "pattern": r"https://c2.staticflickr.com/8/7272/\w+_k.jpg", + "count": 1, }), ("https://old.reddit.com/r/lavaporn/comments/2a00np/"), ("https://np.reddit.com/r/lavaporn/comments/2a00np/"), @@ -156,7 +186,7 @@ class RedditAPI(): def __init__(self, extractor): self.extractor = extractor - self.comments = extractor.config("comments", 500) + self.comments = text.parse_int(extractor.config("comments", 500)) self.morecomments = extractor.config("morecomments", False) self.refresh_token = extractor.config("refresh-token") self.log = extractor.log @@ -168,7 +198,7 @@ class RedditAPI(): self.client_id = None self.log.warning( "Conflicting values for 'client-id' and 'user-agent': " - "override either both or none of them.") + "overwrite either both or none of them.") else: self.client_id = client_id extractor.session.headers["User-Agent"] = user_agent @@ -179,7 +209,7 @@ class RedditAPI(): link_id = "t3_" + submission_id if self.morecomments else None submission, comments = self._call(endpoint, {"limit": self.comments}) return (submission["data"]["children"][0]["data"], - self._flatten(comments, link_id)) + self._flatten(comments, link_id) if self.comments else None) def submissions_subreddit(self, subreddit, params): """Collect all (submission, comments)-tuples of a subreddit""" @@ -187,6 +217,12 @@ class RedditAPI(): params["limit"] = 100 return self._pagination(endpoint, params) + def submissions_user(self, user, params): + """Collect all (submission, comments)-tuples posted by a user""" + endpoint = "/user/" + user + "/.json" + params["limit"] = 100 + return self._pagination(endpoint, params) + def morechildren(self, link_id, children): """Load additional comments from a submission""" endpoint = "/api/morechildren" @@ -249,7 +285,7 @@ class RedditAPI(): raise Exception(data["message"]) return data - def _pagination(self, endpoint, params, _empty=()): + def _pagination(self, endpoint, params): id_min = self._parse_id("id-min", 0) id_max = self._parse_id("id-max", 2147483647) date_min, date_max = self.extractor._get_date_min_max(0, 253402210800) @@ -267,7 +303,7 @@ class RedditAPI(): except exception.AuthorizationError: pass else: - yield submission, _empty + yield submission, None if not data["after"]: return |
