summary | refs | log | tree | commit | diff | stats
path: root/gallery_dl/extractor/reddit.py
diff options
context:
space:
mode:
author: Unit 193 <unit193@ubuntu.com> 2019-10-01 19:12:47 -0400
committer: Unit 193 <unit193@ubuntu.com> 2019-10-01 19:12:47 -0400
commit: 639d9ea4a667733aadc3ff83a1df2cc9f0add3a9 (patch)
tree: 5761b58d6fc3e8bbb99b39b8e4417673bccb0b86 /gallery_dl/extractor/reddit.py
parent: c09a9f00dd83017d486cd77650347bc2a397ad55 (diff)
New upstream version 1.10.5 [upstream/1.10.5]
Diffstat (limited to 'gallery_dl/extractor/reddit.py')
-rw-r--r-- gallery_dl/extractor/reddit.py 80
1 files changed, 58 insertions, 22 deletions
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index 94e95e8..9c283de 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -6,7 +6,7 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from subreddits at https://www.reddit.com/"""
+"""Extractors for https://www.reddit.com/"""
from .common import Extractor, Message
from .. import text, util, extractor, exception
@@ -31,7 +31,8 @@ class RedditExtractor(Extractor):
yield Message.Version, 1
with extractor.blacklist(
- util.SPECIAL_EXTRACTORS, [RedditSubredditExtractor]):
+ util.SPECIAL_EXTRACTORS,
+ [RedditSubredditExtractor, RedditUserExtractor]):
while True:
extra = []
for url, data in self._urls(submissions):
@@ -68,18 +69,18 @@ class RedditExtractor(Extractor):
submission["selftext_html"] or "", ' href="', '"'):
yield url, submission
- for comment in comments:
- for url in text.extract_iter(
- comment["body_html"] or "", ' href="', '"'):
- yield url, comment
+ if comments:
+ for comment in comments:
+ for url in text.extract_iter(
+ comment["body_html"] or "", ' href="', '"'):
+ yield url, comment
class RedditSubredditExtractor(RedditExtractor):
- """Extractor for images from subreddits on reddit.com"""
+ """Extractor for URLs from subreddits on reddit.com"""
subcategory = "subreddit"
- pattern = (r"(?:https?://)?(?:\w+\.)?reddit\.com/r/([^/?&#]+)"
- r"(/[a-z]+)?/?"
- r"(?:\?.*?(?:\bt=([a-z]+))?)?$")
+ pattern = (r"(?:https?://)?(?:\w+\.)?reddit\.com/r/"
+ r"([^/?&#]+(?:/[a-z]+)?)/?(?:\?([^#]*))?(?:$|#)")
test = (
("https://www.reddit.com/r/lavaporn/"),
("https://www.reddit.com/r/lavaporn/top/?sort=top&t=month"),
@@ -90,24 +91,53 @@ class RedditSubredditExtractor(RedditExtractor):
def __init__(self, match):
RedditExtractor.__init__(self, match)
- self.subreddit, self.order, self.timeframe = match.groups()
+ self.subreddit = match.group(1)
+ self.params = text.parse_query(match.group(2))
def submissions(self):
- subreddit = self.subreddit + (self.order or "")
- params = {"t": self.timeframe} if self.timeframe else {}
- return self.api.submissions_subreddit(subreddit, params)
+ return self.api.submissions_subreddit(self.subreddit, self.params)
+
+
+class RedditUserExtractor(RedditExtractor):
+ """Extractor for URLs from posts by a reddit user"""
+ subcategory = "user"
+ pattern = (r"(?:https?://)?(?:\w+\.)?reddit\.com/u(?:ser)?/"
+ r"([^/?&#]+(?:/[a-z]+)?)/?(?:\?([^#]*))?")
+ test = (
+ ("https://www.reddit.com/user/username/", {
+ "count": ">= 2",
+ }),
+ ("https://www.reddit.com/user/username/gilded/?sort=top&t=month"),
+ ("https://old.reddit.com/user/username/"),
+ ("https://www.reddit.com/u/username/"),
+ )
+
+ def __init__(self, match):
+ RedditExtractor.__init__(self, match)
+ self.user = match.group(1)
+ self.params = text.parse_query(match.group(2))
+
+ def submissions(self):
+ return self.api.submissions_user(self.user, self.params)
class RedditSubmissionExtractor(RedditExtractor):
- """Extractor for images from a submission on reddit.com"""
+ """Extractor for URLs from a submission on reddit.com"""
subcategory = "submission"
pattern = (r"(?:https?://)?(?:"
r"(?:\w+\.)?reddit\.com/r/[^/?&#]+/comments|"
r"redd\.it"
r")/([a-z0-9]+)")
test = (
- ("https://www.reddit.com/r/lavaporn/comments/2a00np/", {
- "pattern": r"https?://i\.imgur\.com/AaAUCgy\.jpg",
+ ("https://www.reddit.com/r/lavaporn/comments/8cqhub/", {
+ "pattern": r"https://",
+ "count": 3,
+ }),
+ # ignore submission comments (#429)
+ ("https://www.reddit.com/r/lavaporn/comments/8cqhub/", {
+ "options": (("comments", 0),),
+ "pattern": r"https://c2.staticflickr.com/8/7272/\w+_k.jpg",
+ "count": 1,
}),
("https://old.reddit.com/r/lavaporn/comments/2a00np/"),
("https://np.reddit.com/r/lavaporn/comments/2a00np/"),
@@ -156,7 +186,7 @@ class RedditAPI():
def __init__(self, extractor):
self.extractor = extractor
- self.comments = extractor.config("comments", 500)
+ self.comments = text.parse_int(extractor.config("comments", 500))
self.morecomments = extractor.config("morecomments", False)
self.refresh_token = extractor.config("refresh-token")
self.log = extractor.log
@@ -168,7 +198,7 @@ class RedditAPI():
self.client_id = None
self.log.warning(
"Conflicting values for 'client-id' and 'user-agent': "
- "override either both or none of them.")
+ "overwrite either both or none of them.")
else:
self.client_id = client_id
extractor.session.headers["User-Agent"] = user_agent
@@ -179,7 +209,7 @@ class RedditAPI():
link_id = "t3_" + submission_id if self.morecomments else None
submission, comments = self._call(endpoint, {"limit": self.comments})
return (submission["data"]["children"][0]["data"],
- self._flatten(comments, link_id))
+ self._flatten(comments, link_id) if self.comments else None)
def submissions_subreddit(self, subreddit, params):
"""Collect all (submission, comments)-tuples of a subreddit"""
@@ -187,6 +217,12 @@ class RedditAPI():
params["limit"] = 100
return self._pagination(endpoint, params)
+ def submissions_user(self, user, params):
+ """Collect all (submission, comments)-tuples posted by a user"""
+ endpoint = "/user/" + user + "/.json"
+ params["limit"] = 100
+ return self._pagination(endpoint, params)
+
def morechildren(self, link_id, children):
"""Load additional comments from a submission"""
endpoint = "/api/morechildren"
@@ -249,7 +285,7 @@ class RedditAPI():
raise Exception(data["message"])
return data
- def _pagination(self, endpoint, params, _empty=()):
+ def _pagination(self, endpoint, params):
id_min = self._parse_id("id-min", 0)
id_max = self._parse_id("id-max", 2147483647)
date_min, date_max = self.extractor._get_date_min_max(0, 253402210800)
@@ -267,7 +303,7 @@ class RedditAPI():
except exception.AuthorizationError:
pass
else:
- yield submission, _empty
+ yield submission, None
if not data["after"]:
return