New upstream version 1.10.5 (tag: upstream/1.10.5)

author: Unit 193 <unit193@ubuntu.com> 2019-10-01 19:12:47 -0400
committer: Unit 193 <unit193@ubuntu.com> 2019-10-01 19:12:47 -0400
commit: 639d9ea4a667733aadc3ff83a1df2cc9f0add3a9 (patch)
tree: 5761b58d6fc3e8bbb99b39b8e4417673bccb0b86 /gallery_dl/extractor/reddit.py
parent: c09a9f00dd83017d486cd77650347bc2a397ad55 (diff)
1 files changed, 58 insertions, 22 deletions
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index 94e95e8..9c283de 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -6,7 +6,7 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extract images from subreddits at https://www.reddit.com/"""
+"""Extractors for https://www.reddit.com/"""
 
 from .common import Extractor, Message
 from .. import text, util, extractor, exception
@@ -31,7 +31,8 @@ class RedditExtractor(Extractor):
 
         yield Message.Version, 1
         with extractor.blacklist(
-                util.SPECIAL_EXTRACTORS, [RedditSubredditExtractor]):
+                util.SPECIAL_EXTRACTORS,
+                [RedditSubredditExtractor, RedditUserExtractor]):
             while True:
                 extra = []
                 for url, data in self._urls(submissions):
@@ -68,18 +69,18 @@ class RedditExtractor(Extractor):
                     submission["selftext_html"] or "", ' href="', '"'):
                 yield url, submission
 
-            for comment in comments:
-                for url in text.extract_iter(
-                        comment["body_html"] or "", ' href="', '"'):
-                    yield url, comment
+            if comments:
+                for comment in comments:
+                    for url in text.extract_iter(
+                            comment["body_html"] or "", ' href="', '"'):
+                        yield url, comment
 
 
 class RedditSubredditExtractor(RedditExtractor):
-    """Extractor for images from subreddits on reddit.com"""
+    """Extractor for URLs from subreddits on reddit.com"""
     subcategory = "subreddit"
-    pattern = (r"(?:https?://)?(?:\w+\.)?reddit\.com/r/([^/?&#]+)"
-               r"(/[a-z]+)?/?"
-               r"(?:\?.*?(?:\bt=([a-z]+))?)?$")
+    pattern = (r"(?:https?://)?(?:\w+\.)?reddit\.com/r/"
+               r"([^/?&#]+(?:/[a-z]+)?)/?(?:\?([^#]*))?(?:$|#)")
     test = (
         ("https://www.reddit.com/r/lavaporn/"),
         ("https://www.reddit.com/r/lavaporn/top/?sort=top&t=month"),
@@ -90,24 +91,53 @@ class RedditSubredditExtractor(RedditExtractor):
 
     def __init__(self, match):
         RedditExtractor.__init__(self, match)
-        self.subreddit, self.order, self.timeframe = match.groups()
+        self.subreddit = match.group(1)
+        self.params = text.parse_query(match.group(2))
 
     def submissions(self):
-        subreddit = self.subreddit + (self.order or "")
-        params = {"t": self.timeframe} if self.timeframe else {}
-        return self.api.submissions_subreddit(subreddit, params)
+        return self.api.submissions_subreddit(self.subreddit, self.params)
+
+
+class RedditUserExtractor(RedditExtractor):
+    """Extractor for URLs from posts by a reddit user"""
+    subcategory = "user"
+    pattern = (r"(?:https?://)?(?:\w+\.)?reddit\.com/u(?:ser)?/"
+               r"([^/?&#]+(?:/[a-z]+)?)/?(?:\?([^#]*))?")
+    test = (
+        ("https://www.reddit.com/user/username/", {
+            "count": ">= 2",
+        }),
+        ("https://www.reddit.com/user/username/gilded/?sort=top&t=month"),
+        ("https://old.reddit.com/user/username/"),
+        ("https://www.reddit.com/u/username/"),
+    )
+
+    def __init__(self, match):
+        RedditExtractor.__init__(self, match)
+        self.user = match.group(1)
+        self.params = text.parse_query(match.group(2))
+
+    def submissions(self):
+        return self.api.submissions_user(self.user, self.params)
 
 
 class RedditSubmissionExtractor(RedditExtractor):
-    """Extractor for images from a submission on reddit.com"""
+    """Extractor for URLs from a submission on reddit.com"""
     subcategory = "submission"
     pattern = (r"(?:https?://)?(?:"
                r"(?:\w+\.)?reddit\.com/r/[^/?&#]+/comments|"
                r"redd\.it"
                r")/([a-z0-9]+)")
     test = (
-        ("https://www.reddit.com/r/lavaporn/comments/2a00np/", {
-            "pattern": r"https?://i\.imgur\.com/AaAUCgy\.jpg",
+        ("https://www.reddit.com/r/lavaporn/comments/8cqhub/", {
+            "pattern": r"https://",
+            "count": 3,
+        }),
+        # ignore submission comments (#429)
+        ("https://www.reddit.com/r/lavaporn/comments/8cqhub/", {
+            "options": (("comments", 0),),
+            "pattern": r"https://c2.staticflickr.com/8/7272/\w+_k.jpg",
+            "count": 1,
         }),
         ("https://old.reddit.com/r/lavaporn/comments/2a00np/"),
         ("https://np.reddit.com/r/lavaporn/comments/2a00np/"),
@@ -156,7 +186,7 @@ class RedditAPI():
 
     def __init__(self, extractor):
         self.extractor = extractor
-        self.comments = extractor.config("comments", 500)
+        self.comments = text.parse_int(extractor.config("comments", 500))
         self.morecomments = extractor.config("morecomments", False)
         self.refresh_token = extractor.config("refresh-token")
         self.log = extractor.log
@@ -168,7 +198,7 @@ class RedditAPI():
             self.client_id = None
             self.log.warning(
                 "Conflicting values for 'client-id' and 'user-agent': "
-                "override either both or none of them.")
+                "overwrite either both or none of them.")
         else:
             self.client_id = client_id
             extractor.session.headers["User-Agent"] = user_agent
@@ -179,7 +209,7 @@ class RedditAPI():
         link_id = "t3_" + submission_id if self.morecomments else None
         submission, comments = self._call(endpoint, {"limit": self.comments})
         return (submission["data"]["children"][0]["data"],
-                self._flatten(comments, link_id))
+                self._flatten(comments, link_id) if self.comments else None)
 
     def submissions_subreddit(self, subreddit, params):
         """Collect all (submission, comments)-tuples of a subreddit"""
@@ -187,6 +217,12 @@ class RedditAPI():
         params["limit"] = 100
         return self._pagination(endpoint, params)
 
+    def submissions_user(self, user, params):
+        """Collect all (submission, comments)-tuples posted by a user"""
+        endpoint = "/user/" + user + "/.json"
+        params["limit"] = 100
+        return self._pagination(endpoint, params)
+
     def morechildren(self, link_id, children):
         """Load additional comments from a submission"""
         endpoint = "/api/morechildren"
@@ -249,7 +285,7 @@ class RedditAPI():
             raise Exception(data["message"])
         return data
 
-    def _pagination(self, endpoint, params, _empty=()):
+    def _pagination(self, endpoint, params):
         id_min = self._parse_id("id-min", 0)
         id_max = self._parse_id("id-max", 2147483647)
         date_min, date_max = self.extractor._get_date_min_max(0, 253402210800)
@@ -267,7 +303,7 @@ class RedditAPI():
                         except exception.AuthorizationError:
                             pass
                     else:
-                        yield submission, _empty
+                        yield submission, None
 
             if not data["after"]:
                 return
author	Unit 193 <unit193@ubuntu.com>	2019-10-01 19:12:47 -0400
committer	Unit 193 <unit193@ubuntu.com>	2019-10-01 19:12:47 -0400
commit	639d9ea4a667733aadc3ff83a1df2cc9f0add3a9 (patch)
tree	5761b58d6fc3e8bbb99b39b8e4417673bccb0b86 /gallery_dl/extractor/reddit.py
parent	c09a9f00dd83017d486cd77650347bc2a397ad55 (diff)