New upstream version 1.23.1.upstream/1.23.1

author: Unit 193 <unit193@unit193.net> 2022-09-22 19:43:53 -0400
committer: Unit 193 <unit193@unit193.net> 2022-09-22 19:43:53 -0400
commit: e6b82556343116256be047ab7099bedd9063f66a (patch)
tree: 884c0435863d130ec967163b82a2638ff1bd9505 /gallery_dl
parent: a768930761f7f20587ae40a8cacca0e55c85290a (diff)
18 files changed, 488 insertions, 102 deletions
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py
index 329e7ab..7504fa4 100644
--- a/gallery_dl/__init__.py
+++ b/gallery_dl/__init__.py
@@ -96,9 +96,9 @@ def parse_inputfile(file, log):
         else:
             # url
             if " #" in line:
-                line = line.partition(" #")[0]
+                line = line.partition(" #")[0].rstrip()
             elif "\t#" in line:
-                line = line.partition("\t#")[0]
+                line = line.partition("\t#")[0].rstrip()
             if gconf or lconf:
                 yield util.ExtendedUrl(line, gconf, lconf)
                 gconf = []
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 9e4507a..fed6998 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -55,6 +55,7 @@ modules = [
     "hentaihere",
     "hiperdex",
     "hitomi",
+    "hotleak",
     "idolcomplex",
     "imagebam",
     "imagechest",
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 3091f57..2502411 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -37,6 +37,11 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
             "pattern": r"https://media-files\.bunkr\.is/_-RnHoW69L\.mp4",
             "content": "80e61d1dbc5896ae7ef9a28734c747b28b320471",
         }),
+        # cdn4
+        ("https://bunkr.is/a/iXTTc1o2", {
+            "pattern": r"https://(cdn|media-files)4\.bunkr\.is/",
+            "content": "da29aae371b7adc8c5ef8e6991b66b69823791e8",
+        }),
         ("https://bunkr.to/a/Lktg9Keq"),
     )
 
@@ -66,9 +71,8 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
             data = json.loads(text.extract(
                 self.request(url).text,
                 'id="__NEXT_DATA__" type="application/json">', '<')[0])
-            props = data["props"]["pageProps"]
-            album = props["album"]
-            files = props["files"]
+            album = data["props"]["pageProps"]["album"]
+            files = album["files"]
         except Exception as exc:
             self.log.debug(exc.__class__.__name__, exc)
             self.root = self.root.replace("bunkr", "app.bunkr", 1)
@@ -77,7 +81,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
         for file in files:
             name = file["name"]
             cdn = file["cdn"]
-            if name.endswith(".mp4"):
+            if name.endswith((".mp4", ".m4v", ".mov")):
                 cdn = cdn.replace("//cdn", "//media-files")
             file["file"] = cdn + "/" + name
 
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 1b41101..f7ee51f 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -152,7 +152,8 @@ class Extractor():
                 server = response.headers.get("Server")
                 if server and server.startswith("cloudflare"):
                     if code == 503 and \
-                            b"jschl-answer" in response.content:
+                            (b"_cf_chl_opt" in response.content or
+                             b"jschl-answer" in response.content):
                         self.log.warning("Cloudflare IUAM challenge")
                         break
                     if code == 403 and \
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 2720691..01ba03a 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -219,7 +219,6 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
             if self.limits:
                 self._check_limits(data)
             if "/fullimg.php" in url:
-                data["extension"] = ""
                 data["_http_validate"] = _validate_response
             else:
                 data["_http_validate"] = None
@@ -328,8 +327,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
         data["image_token"] = self.key["start"] = extr('var startkey="', '";')
         self.key["show"] = extr('var showkey="', '";')
 
-        if iurl.endswith("g/509.gif"):
-            self._report_limits(data)
+        self._check_509(iurl, data)
         return url, text.nameext_from_url(iurl, data)
 
     def images_from_api(self):
@@ -365,8 +363,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
             data["num"] = request["page"]
             data["image_token"] = imgkey
 
-            if imgurl.endswith("g/509.gif"):
-                self._report_limits(data)
+            self._check_509(imgurl, data)
             yield url, text.nameext_from_url(imgurl, data)
 
             request["imgkey"] = nextkey
@@ -385,6 +382,15 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
         if self._remaining <= 0:
             self._report_limits(data)
 
+    def _check_509(self, url, data):
+        """Call _report_limits() if 'url' ends in a known 509.gif path"""
+        # full 509.gif URLs: https://exhentai.org/img/509.gif
+        #                and https://ehgt.org/g/509.gif
+        if url.endswith(("hentai.org/img/509.gif",
+                         "ehgt.org/g/509.gif")):
+            self.log.debug(url)  # record which URL triggered the report
+            self._report_limits(data)
+
     def _update_limits(self):
         url = "https://e-hentai.org/home.php"
         cookies = {
diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py
index 2bd8c6b..e85d68a 100644
--- a/gallery_dl/extractor/flickr.py
+++ b/gallery_dl/extractor/flickr.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2017-2020 Mike Fährmann
+# Copyright 2017-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -11,6 +11,8 @@
 from .common import Extractor, Message
 from .. import text, oauth, util, exception
 
+BASE_PATTERN = r"(?:https?://)?(?:www\.|secure\.|m\.)?flickr\.com"
+
 
 class FlickrExtractor(Extractor):
     """Base class for flickr extractors"""
@@ -55,7 +57,7 @@ class FlickrImageExtractor(FlickrExtractor):
     """Extractor for individual images from flickr.com"""
     subcategory = "image"
     pattern = (r"(?:https?://)?(?:"
-               r"(?:(?:www\.|m\.)?flickr\.com/photos/[^/]+/"
+               r"(?:(?:www\.|secure\.|m\.)?flickr\.com/photos/[^/?#]+/"
                r"|[\w-]+\.static\.?flickr\.com/(?:\d+/)+)(\d+)"
                r"|flic\.kr/p/([A-Za-z1-9]+))")
     test = (
@@ -77,6 +79,10 @@ class FlickrImageExtractor(FlickrExtractor):
                 "width": 1024,
             },
         }),
+        ("https://secure.flickr.com/photos/departingyyz/16089302239"),
+        ("https://m.flickr.com/photos/departingyyz/16089302239"),
+        ("https://flickr.com/photos/departingyyz/16089302239"),
+
         ("https://www.flickr.com/photos/145617051@N08/46733161535", {
             "count": 1,
             "keyword": {"media": "video"},
@@ -132,8 +138,7 @@ class FlickrAlbumExtractor(FlickrExtractor):
     directory_fmt = ("{category}", "{user[username]}",
                      "Albums", "{album[id]} {album[title]}")
     archive_fmt = "a_{album[id]}_{id}"
-    pattern = (r"(?:https?://)?(?:www\.)?flickr\.com/"
-               r"photos/([^/]+)/(?:album|set)s(?:/(\d+))?")
+    pattern = BASE_PATTERN + r"/photos/([^/?#]+)/(?:album|set)s(?:/(\d+))?"
     test = (
         (("https://www.flickr.com/photos/shona_s/albums/72157633471741607"), {
             "pattern": FlickrImageExtractor.pattern,
@@ -143,6 +148,8 @@ class FlickrAlbumExtractor(FlickrExtractor):
             "pattern": pattern,
             "count": 2,
         }),
+        ("https://secure.flickr.com/photos/shona_s/albums"),
+        ("https://m.flickr.com/photos/shona_s/albums"),
     )
 
     def __init__(self, match):
@@ -180,8 +187,7 @@ class FlickrGalleryExtractor(FlickrExtractor):
     directory_fmt = ("{category}", "{user[username]}",
                      "Galleries", "{gallery[gallery_id]} {gallery[title]}")
     archive_fmt = "g_{gallery[id]}_{id}"
-    pattern = (r"(?:https?://)?(?:www\.)?flickr\.com/"
-               r"photos/([^/]+)/galleries/(\d+)")
+    pattern = BASE_PATTERN + r"/photos/([^/?#]+)/galleries/(\d+)"
     test = (("https://www.flickr.com/photos/flickr/"
              "galleries/72157681572514792/"), {
         "pattern": FlickrImageExtractor.pattern,
@@ -206,7 +212,7 @@ class FlickrGroupExtractor(FlickrExtractor):
     subcategory = "group"
     directory_fmt = ("{category}", "Groups", "{group[groupname]}")
     archive_fmt = "G_{group[nsid]}_{id}"
-    pattern = r"(?:https?://)?(?:www\.)?flickr\.com/groups/([^/]+)"
+    pattern = BASE_PATTERN + r"/groups/([^/?#]+)"
     test = ("https://www.flickr.com/groups/bird_headshots/", {
         "pattern": FlickrImageExtractor.pattern,
         "count": "> 150",
@@ -224,7 +230,7 @@ class FlickrUserExtractor(FlickrExtractor):
     """Extractor for the photostream of a flickr user"""
     subcategory = "user"
     archive_fmt = "u_{user[nsid]}_{id}"
-    pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/?$"
+    pattern = BASE_PATTERN + r"/photos/([^/?#]+)/?$"
     test = ("https://www.flickr.com/photos/shona_s/", {
         "pattern": FlickrImageExtractor.pattern,
         "count": 28,
@@ -239,7 +245,7 @@ class FlickrFavoriteExtractor(FlickrExtractor):
     subcategory = "favorite"
     directory_fmt = ("{category}", "{user[username]}", "Favorites")
     archive_fmt = "f_{user[nsid]}_{id}"
-    pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/favorites"
+    pattern = BASE_PATTERN + r"/photos/([^/?#]+)/favorites"
     test = ("https://www.flickr.com/photos/shona_s/favorites", {
         "pattern": FlickrImageExtractor.pattern,
         "count": 4,
@@ -254,7 +260,7 @@ class FlickrSearchExtractor(FlickrExtractor):
     subcategory = "search"
     directory_fmt = ("{category}", "Search", "{search[text]}")
     archive_fmt = "s_{search}_{id}"
-    pattern = r"(?:https?://)?(?:www\.)?flickr\.com/search/?\?([^#]+)"
+    pattern = BASE_PATTERN + r"/search/?\?([^#]+)"
     test = (
         ("https://flickr.com/search/?text=mountain"),
         ("https://flickr.com/search/?text=tree%20cloud%20house"
@@ -275,7 +281,11 @@ class FlickrSearchExtractor(FlickrExtractor):
 
 
 class FlickrAPI(oauth.OAuth1API):
-    """Minimal interface for the flickr API"""
+    """Minimal interface for the flickr API
+
+    https://www.flickr.com/services/api/
+    """
+
     API_URL = "https://api.flickr.com/services/rest/"
     API_KEY = "ac4fd7aa98585b9eee1ba761c209de68"
     API_SECRET = "3adb0f568dc68393"
diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py
new file mode 100644
index 0000000..d6575cf
--- /dev/null
+++ b/gallery_dl/extractor/hotleak.py
@@ -0,0 +1,228 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://hotleak.vip/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?hotleak\.vip"
+
+
+class HotleakExtractor(Extractor):
+    """Base class for hotleak extractors"""
+    category = "hotleak"
+    directory_fmt = ("{category}", "{creator}",)
+    filename_fmt = "{creator}_{id}.{extension}"
+    archive_fmt = "{type}_{creator}_{id}"
+    root = "https://hotleak.vip"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.session.headers["Referer"] = self.root
+
+    def items(self):
+        for post in self.posts():
+            yield Message.Directory, post
+            yield Message.Url, post["url"], post
+
+    def posts(self):
+        """Return an iterable containing relevant posts"""
+        return ()
+
+    def _pagination(self, url, params):
+        """Yield the first link of every listed article, page by page"""
+        params = text.parse_query(params)
+        params["page"] = text.parse_int(params.get("page"), 1)
+        while True:
+            page = self.request(url, params=params).text
+            if "</article>" not in page:
+                return  # a page without articles ends the listing
+
+            for item in text.extract_iter(
+                    page, '<article class="movie-item', '</article>'):
+                yield text.extract(item, '<a href="', '"')[0]
+
+            params["page"] += 1
+
+
+class HotleakPostExtractor(HotleakExtractor):
+    """Extractor for individual posts on hotleak"""
+    subcategory = "post"
+    pattern = (BASE_PATTERN + r"/(?!hot|creators|videos|photos)"
+               r"([^/]+)/(photo|video)/(\d+)")
+    test = (
+        ("https://hotleak.vip/kaiyakawaii/photo/1617145", {
+            "pattern": r"https://hotleak\.vip/storage/images/3625"
+                       r"/1617145/fefdd5988dfcf6b98cc9e11616018868\.jpg",
+            "keyword": {
+                "id": 1617145,
+                "creator": "kaiyakawaii",
+                "type": "photo",
+                "filename": "fefdd5988dfcf6b98cc9e11616018868",
+                "extension": "jpg",
+            },
+        }),
+        ("https://hotleak.vip/lilmochidoll/video/1625538", {
+            "pattern": r"ytdl:https://cdn8-leak\.camhdxx\.com"
+                       r"/1661/1625538/index\.m3u8",
+            "keyword": {
+                "id": 1625538,
+                "creator": "lilmochidoll",
+                "type": "video",
+                "filename": "index",
+                "extension": "mp4",
+            },
+        }),
+    )
+
+    def __init__(self, match):
+        HotleakExtractor.__init__(self, match)
+        self.creator, self.type, self.id = match.groups()
+
+    def posts(self):
+        """Return a 1-tuple holding the metadata of the requested post"""
+        url = "{}/{}/{}/{}".format(
+            self.root, self.creator, self.type, self.id)
+        page = self.request(url).text
+        page = text.extract(
+            page, '<div class="movie-image thumb">', '</article>')[0]
+        data = {
+            "id"     : text.parse_int(self.id),
+            "creator": self.creator,
+            "type"   : self.type,
+        }
+        if self.type == "photo":
+            data["url"] = text.extract(page, 'data-src="', '"')[0]
+            text.nameext_from_url(data["url"], data)
+
+        elif self.type == "video":
+            data["url"] = "ytdl:" + text.extract(
+                text.unescape(page), '"src":"', '"')[0]
+            text.nameext_from_url(data["url"], data)
+            data["extension"] = "mp4"  # set last, so "mp4" always wins
+
+        return (data,)
+
+
+class HotleakCreatorExtractor(HotleakExtractor):
+    """Extractor for all posts from a hotleak creator"""
+    subcategory = "creator"
+    pattern = BASE_PATTERN + r"/(?!hot|creators|videos|photos)([^/?#]+)/?$"
+    test = (
+        ("https://hotleak.vip/kaiyakawaii", {
+            "range": "1-200",
+            "count": 200,
+        }),
+        ("https://hotleak.vip/stellaviolet", {
+            "count": "> 600"
+        }),
+        ("https://hotleak.vip/doesnotexist", {
+            "exception": exception.NotFoundError,
+        }),
+    )
+
+    def __init__(self, match):
+        HotleakExtractor.__init__(self, match)
+        self.creator = match.group(1)
+
+    def posts(self):
+        url = "{}/{}".format(self.root, self.creator)
+        return self._pagination(url)
+
+    def _pagination(self, url):
+        """Yield a metadata dict for every post in a creator's JSON feed"""
+        headers = {"X-Requested-With": "XMLHttpRequest"}
+        params = {"page": 1}
+        while True:
+            try:
+                response = self.request(
+                    url, headers=headers, params=params, notfound="creator")
+            except exception.HttpError as exc:
+                if exc.response.status_code != 429:
+                    raise  # only rate limits are retried; never swallow
+                self.wait(until=exc.response.headers.get("X-RateLimit-Reset"))
+                continue
+
+            posts = response.json()
+            if not posts:
+                return
+
+            data = {"creator": self.creator}
+            for post in posts:
+                data["id"] = text.parse_int(post["id"])
+
+                if post["type"] == 0:
+                    data["type"] = "photo"
+                    data["url"] = self.root + "/storage/" + post["image"]
+                    text.nameext_from_url(data["url"], data)
+
+                elif post["type"] == 1:
+                    data["type"] = "video"
+                    data["url"] = "ytdl:" + post["stream_url_play"]
+                    text.nameext_from_url(data["url"], data)
+                    data["extension"] = "mp4"
+
+                yield data
+            params["page"] += 1
+
+
+class HotleakCategoryExtractor(HotleakExtractor):
+    """Extractor for hotleak categories"""
+    subcategory = "category"
+    pattern = BASE_PATTERN + r"/(hot|creators|videos|photos)(?:/?\?([^#]+))?"
+    test = (
+        ("https://hotleak.vip/photos", {
+            "pattern": HotleakPostExtractor.pattern,
+            "range": "1-50",
+            "count": 50,
+        }),
+        ("https://hotleak.vip/videos"),
+        ("https://hotleak.vip/creators", {
+            "pattern": HotleakCreatorExtractor.pattern,
+            "range": "1-50",
+            "count": 50,
+        }),
+        ("https://hotleak.vip/hot"),
+    )
+
+    def __init__(self, match):
+        HotleakExtractor.__init__(self, match)
+        self._category, self.params = match.groups()
+
+    def items(self):
+        """Queue every listed link for the extractor matching its kind"""
+        url = "{}/{}".format(self.root, self._category)
+        if self._category in ("hot", "creators"):
+            data = {"_extractor": HotleakCreatorExtractor}
+        elif self._category in ("videos", "photos"):
+            data = {"_extractor": HotleakPostExtractor}
+
+        for item in self._pagination(url, self.params):
+            yield Message.Queue, item, data
+
+
+class HotleakSearchExtractor(HotleakExtractor):
+    """Extractor for hotleak search results"""
+    subcategory = "search"
+    pattern = BASE_PATTERN + r"/search(?:/?\?([^#]+))"
+    test = (
+        ("https://hotleak.vip/search?search=gallery-dl", {
+            "count": 0,
+        }),
+        ("https://hotleak.vip/search?search=hannah", {
+            "count": "> 30",
+        }),
+    )
+
+    def __init__(self, match):
+        HotleakExtractor.__init__(self, match)
+        self.params = match.group(1)
+
+    def items(self):
+        data = {"_extractor": HotleakCreatorExtractor}
+        for creator in self._pagination(self.root + "/search", self.params):
+            yield Message.Queue, creator, data
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index d56af8b..8c98d2e 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -264,6 +264,12 @@ class InstagramExtractor(Extractor):
                 "post_id": reel_id,
                 "post_shortcode": shortcode_from_id(reel_id),
             }
+
+            if "title" in post:
+                data["highlight_title"] = post["title"]
+            if "created_at" in post:
+                data["date"] = text.parse_timestamp(post.get("created_at"))
+
         else:
             data = {
                 "post_id" : post["pk"],
diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py
index 0a6a6d3..56e3b39 100644
--- a/gallery_dl/extractor/paheal.py
+++ b/gallery_dl/extractor/paheal.py
@@ -49,7 +49,8 @@ class PahealExtractor(Extractor):
             "id"      : post_id,
             "tags"    : extr(": ", "<"),
             "md5"     : extr("/_thumbs/", "/"),
-            "file_url": extr("id='main_image' src='", "'"),
+            "file_url": (extr("id='main_image' src='", "'") or
+                         extr("<source src='", "'")),
             "uploader": text.unquote(extr(
                 "class='username' href='/user/", "'")),
             "date"    : text.parse_datetime(
@@ -59,8 +60,10 @@ class PahealExtractor(Extractor):
         }
 
         dimensions, size, ext = extr("Info</th><td>", ">").split(" // ")
-        post["width"], _, post["height"] = dimensions.partition("x")
+        post["width"], _, height = dimensions.partition("x")
         post["size"] = text.parse_bytes(size[:-1])
+        post["height"], _, duration = height.partition(", ")
+        post["duration"] = text.parse_float(duration[:-1])
 
         return post
 
@@ -111,10 +114,12 @@ class PahealTagExtractor(PahealExtractor):
         tags, data, date = data.split("\n")
         dimensions, size, ext = data.split(" // ")
         width, _, height = dimensions.partition("x")
+        height, _, duration = height.partition(", ")
 
         return {
             "id": pid, "md5": md5, "file_url": url,
             "width": width, "height": height,
+            "duration": text.parse_float(duration[:-1]),
             "tags": text.unescape(tags),
             "size": text.parse_bytes(size[:-1]),
             "date": text.parse_datetime(date, "%B %d, %Y; %H:%M"),
@@ -163,6 +168,27 @@ class PahealPostExtractor(PahealExtractor):
                 "width": 1200,
             },
         }),
+        # video
+        ("https://rule34.paheal.net/post/view/3864982", {
+            "pattern": r"https://[\w]+\.paheal\.net/_images/7629fc0ff77e32637d"
+                       r"de5bf4f992b2cb/3864982%20-%20Metal_Gear%20Metal_Gear_"
+                       r"Solid_V%20Quiet%20Vg_erotica%20animated%20webm\.webm",
+            "keyword": {
+                "date": "dt:2020-09-06 01:59:03",
+                "duration": 30.0,
+                "extension": "webm",
+                "height": 2500,
+                "id": 3864982,
+                "md5": "7629fc0ff77e32637dde5bf4f992b2cb",
+                "size": 18454938,
+                "source": "https://twitter.com/VG_Worklog"
+                          "/status/1302407696294055936",
+                "tags": "Metal_Gear Metal_Gear_Solid_V Quiet "
+                        "Vg_erotica animated webm",
+                "uploader": "justausername",
+                "width": 1768,
+            },
+        }),
     )
 
     def __init__(self, match):
diff --git a/gallery_dl/extractor/poipiku.py b/gallery_dl/extractor/poipiku.py
index 8203885..4283081 100644
--- a/gallery_dl/extractor/poipiku.py
+++ b/gallery_dl/extractor/poipiku.py
@@ -42,6 +42,7 @@ class PoipikuExtractor(Extractor):
                     '<h2 class="UserInfoUserName">', '</').rpartition(">")[2]),
                 "description": text.unescape(extr(
                     'class="IllustItemDesc" >', '<')),
+                "_http_headers": {"Referer": post_url},
             }
 
             yield Message.Directory, post
@@ -54,7 +55,8 @@ class PoipikuExtractor(Extractor):
                 elif thumb.startswith(("//img.poipiku.com/img/", "/img/")):
                     continue
                 post["num"] += 1
-                url = text.ensure_http_scheme(thumb[:-8])
+                url = text.ensure_http_scheme(thumb[:-8]).replace(
+                    "//img.", "//img-org.", 1)
                 yield Message.Url, url, text.nameext_from_url(url, post)
 
             if not extr('> show all', '<'):
@@ -80,7 +82,8 @@ class PoipikuExtractor(Extractor):
             for thumb in text.extract_iter(
                     page, 'class="IllustItemThumbImg" src="', '"'):
                 post["num"] += 1
-                url = text.ensure_http_scheme(thumb[:-8])
+                url = text.ensure_http_scheme(thumb[:-8]).replace(
+                    "//img.", "//img-org.", 1)
                 yield Message.Url, url, text.nameext_from_url(url, post)
 
 
@@ -91,7 +94,7 @@ class PoipikuUserExtractor(PoipikuExtractor):
                r"(\d+)/?(?:$|[?&#])")
     test = (
         ("https://poipiku.com/25049/", {
-            "pattern": r"https://img\.poipiku\.com/user_img\d+/000025049"
+            "pattern": r"https://img-org\.poipiku\.com/user_img\d+/000025049"
                        r"/\d+_\w+\.(jpe?g|png)$",
             "range": "1-10",
             "count": 10,
@@ -131,7 +134,7 @@ class PoipikuPostExtractor(PoipikuExtractor):
     pattern = BASE_PATTERN + r"/(\d+)/(\d+)"
     test = (
         ("https://poipiku.com/25049/5864576.html", {
-            "pattern": r"https://img\.poipiku\.com/user_img\d+/000025049"
+            "pattern": r"https://img-org\.poipiku\.com/user_img\d+/000025049"
                        r"/005864576_EWN1Y65gQ\.png$",
             "keyword": {
                 "count": "1",
@@ -146,7 +149,7 @@ class PoipikuPostExtractor(PoipikuExtractor):
             },
         }),
         ("https://poipiku.com/2166245/6411749.html", {
-            "pattern": r"https://img\.poipiku\.com/user_img\d+/002166245"
+            "pattern": r"https://img-org\.poipiku\.com/user_img\d+/002166245"
                        r"/006411749_\w+\.jpeg$",
             "count": 4,
             "keyword": {
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index d35e24e..954a84f 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -46,10 +46,10 @@ class RedditExtractor(Extractor):
                         submission["created_utc"])
                     yield Message.Directory, submission
                     visited.add(submission["id"])
-                    url = submission["url"]
                     submission["num"] = 0
 
-                    if url.startswith("https://i.redd.it/"):
+                    url = submission["url"]
+                    if url and url.startswith("https://i.redd.it/"):
                         text.nameext_from_url(url, submission)
                         yield Message.Url, url, submission
 
diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py
index 2c3ed44..3a4fb0e 100644
--- a/gallery_dl/extractor/redgifs.py
+++ b/gallery_dl/extractor/redgifs.py
@@ -53,6 +53,7 @@ class RedgifsExtractor(Extractor):
         for fmt in self.formats:
             url = urls.get(fmt)
             if url:
+                url = url.replace("//thumbs2.", "//thumbs3.", 1)
                 text.nameext_from_url(url, gif)
                 yield url
 
diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py
index 4010da3..2264fe4 100644
--- a/gallery_dl/extractor/smugmug.py
+++ b/gallery_dl/extractor/smugmug.py
@@ -209,9 +209,9 @@ class SmugmugPathExtractor(SmugmugExtractor):
 class SmugmugAPI(oauth.OAuth1API):
     """Minimal interface for the smugmug API v2"""
     API_DOMAIN = "api.smugmug.com"
-    API_KEY = "DFqxg4jf7GrtsQ5PnbNB8899zKfnDrdK"
-    API_SECRET = ("fknV35p9r9BwZC4XbTzvCXpcSJRdD83S"
-                  "9nMFQm25ndGBzNPnwRDbRnnVBvqt4xTq")
+    API_KEY = "RCVHDGjcbc4Fhzq4qzqLdZmvwmwB6LM2"
+    API_SECRET = ("jGrdndvJqhTx8XSNs7TFTSSthhZHq92d"
+                  "dMpbpDpkDVNM7TDgnvLFMtfB5Mg5kH73")
     HEADERS = {"Accept": "application/json"}
 
     def album(self, album_id, expands=None):
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index b694fa0..6f53881 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -14,25 +14,6 @@ from datetime import datetime, timedelta
 import re
 
 
-def _original_inline_image(url):
-    return re.sub(
-        (r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
-         r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)"),
-        r"https://\1_1280.\2", url
-    )
-
-
-def _original_video(url):
-    return re.sub(
-        (r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
-         r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)"),
-        r"https://\1.\2", url
-    )
-
-
-POST_TYPES = frozenset((
-    "text", "quote", "link", "answer", "video", "audio", "photo", "chat"))
-
 BASE_PATTERN = (
     r"(?:tumblr:(?:https?://)?([^/]+)|"
     r"(?:https?://)?"
@@ -40,6 +21,9 @@ BASE_PATTERN = (
     r"([\w-]+\.tumblr\.com)))"
 )
 
+POST_TYPES = frozenset((
+    "text", "quote", "link", "answer", "video", "audio", "photo", "chat"))
+
 
 class TumblrExtractor(Extractor):
     """Base class for tumblr extractors"""
@@ -79,6 +63,18 @@ class TumblrExtractor(Extractor):
     def items(self):
         blog = None
 
+        # pre-compile regular expressions
+        self._sub_video = re.compile(
+            r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
+            r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)").sub
+        if self.inline:
+            self._sub_image = re.compile(
+                r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
+                r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)").sub
+            self._subn_orig_image = re.compile(r"/s\d+x\d+/").subn
+            _findall_image = re.compile('<img src="([^"]+)"').findall
+            _findall_video = re.compile('<source src="([^"]+)"').findall
+
         for post in self.posts():
             if self.date_min > post["timestamp"]:
                 return
@@ -120,7 +116,7 @@ class TumblrExtractor(Extractor):
 
                     if self.original and "/s2048x3072/" in photo["url"] and (
                             photo["width"] == 2048 or photo["height"] == 3072):
-                        photo["url"] = self._original_image(photo["url"])
+                        photo["url"] = self._original_photo(photo["url"])
 
                     del photo["original_size"]
                     del photo["alt_sizes"]
@@ -134,17 +130,18 @@ class TumblrExtractor(Extractor):
 
             url = post.get("video_url")  # type "video"
             if url:
-                posts.append(self._prepare(_original_video(url), post.copy()))
+                posts.append(self._prepare(
+                    self._original_video(url), post.copy()))
 
             if self.inline and "reblog" in post:  # inline media
                 # only "chat" posts are missing a "reblog" key in their
                 # API response, but they can't contain images/videos anyway
                 body = post["reblog"]["comment"] + post["reblog"]["tree_html"]
-                for url in re.findall('<img src="([^"]+)"', body):
-                    url = _original_inline_image(url)
+                for url in _findall_image(body):
+                    url = self._original_inline_image(url)
                     posts.append(self._prepare_image(url, post.copy()))
-                for url in re.findall('<source src="([^"]+)"', body):
-                    url = _original_video(url)
+                for url in _findall_video(body):
+                    url = self._original_video(url)
                     posts.append(self._prepare(url, post.copy()))
 
             if self.external:  # external links
@@ -220,8 +217,21 @@ class TumblrExtractor(Extractor):
     def _skip_reblog_same_blog(self, post):
         return self.blog != post.get("reblogged_root_uuid")
 
-    def _original_image(self, url):
-        url = url.replace("/s2048x3072/", "/s99999x99999/", 1)
+    def _original_photo(self, url):
+        return self._update_image_token(
+            url.replace("/s2048x3072/", "/s99999x99999/", 1))
+
+    def _original_inline_image(self, url):
+        if self.original:
+            url, n = self._subn_orig_image("/s99999x99999/", url, 1)
+            if n:
+                return self._update_image_token(url)
+        return self._sub_image(r"https://\1_1280.\2", url)
+
+    def _original_video(self, url):
+        return self._sub_video(r"https://\1.\2", url)
+
+    def _update_image_token(self, url):
         headers = {"Accept": "text/html,*/*;q=0.8"}
         response = self.request(url, headers=headers)
         return text.extract(response.text, '" src="', '"')[0]
@@ -305,6 +315,14 @@ class TumblrPostExtractor(TumblrExtractor):
         ("https://mikf123.tumblr.com/post/181022380064/chat-post", {
             "count": 0,
         }),
+        ("https://kichatundk.tumblr.com/post/654953419288821760", {
+            "count": 2,  # high-quality images (#1846)
+            "content": "d6fcc7b6f750d835d55c7f31fa3b63be26c9f89b",
+        }),
+        ("https://hameru-is-cool.tumblr.com/post/639261855227002880", {
+            "count": 2,  # high-quality images (#1344)
+            "content": "6bc19a42787e46e1bba2ef4aeef5ca28fcd3cd34",
+        }),
         ("https://mikf123.tumblr.com/image/689860196535762944", {
             "pattern": r"^https://\d+\.media\.tumblr\.com"
                        r"/134791621559a79793563b636b5fe2c6"
@@ -446,10 +464,8 @@ class TumblrAPI(oauth.OAuth1API):
 
             # daily rate limit
             if response.headers.get("x-ratelimit-perday-remaining") == "0":
+                self.log.info("Daily API rate limit exceeded")
                 reset = response.headers.get("x-ratelimit-perday-reset")
-                t = (datetime.now() + timedelta(seconds=float(reset))).time()
-
-                self.log.error("Daily API rate limit exceeded")
 
                 api_key = self.api_key or self.session.auth.consumer_key
                 if api_key == self.API_KEY:
@@ -459,6 +475,11 @@ class TumblrAPI(oauth.OAuth1API):
                                   "ter/docs/configuration.rst#extractortumblra"
                                   "pi-key--api-secret")
 
+                if self.extractor.config("ratelimit") == "wait":
+                    self.extractor.wait(seconds=reset)
+                    return self._call(blog, endpoint, params)
+
+                t = (datetime.now() + timedelta(seconds=float(reset))).time()
                 raise exception.StopExtraction(
                     "Aborting - Rate limit will reset at %s",
                     "{:02}:{:02}:{:02}".format(t.hour, t.minute, t.second))
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 0df4ea2..ba0597e 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -41,6 +41,7 @@ class TwitterExtractor(Extractor):
         self.quoted = self.config("quoted", False)
         self.videos = self.config("videos", True)
         self.cards = self.config("cards", False)
+        self.cards_blacklist = self.config("cards-blacklist")
         self._user = self._user_obj = None
         self._user_cache = {}
         self._init_sizes()
@@ -154,8 +155,11 @@ class TwitterExtractor(Extractor):
                     })
             elif "media_url_https" in media:
                 url = media["media_url_https"]
-                base, _, fmt = url.rpartition(".")
-                base += "?format=" + fmt + "&name="
+                if url[-4] == ".":
+                    base, _, fmt = url.rpartition(".")
+                    base += "?format=" + fmt + "&name="
+                else:
+                    base = url.rpartition("=")[0] + "="
                 files.append(text.nameext_from_url(url, {
                     "url"        : base + self._size_image,
                     "width"      : width,
@@ -174,15 +178,23 @@ class TwitterExtractor(Extractor):
         card = tweet["card"]
         if "legacy" in card:
             card = card["legacy"]
-        name = card["name"]
+
+        name = card["name"].rpartition(":")[2]
+        bvals = card["binding_values"]
+        if isinstance(bvals, list):
+            bvals = {bval["key"]: bval["value"]
+                     for bval in card["binding_values"]}
+
+        cbl = self.cards_blacklist
+        if cbl:
+            if name in cbl:
+                return
+            if "vanity_url" in bvals:
+                domain = bvals["vanity_url"]["string_value"]
+                if domain in cbl or name + ":" + domain in cbl:
+                    return
 
         if name in ("summary", "summary_large_image"):
-            bvals = card["binding_values"]
-            if isinstance(bvals, list):
-                bvals = {
-                    bval["key"]: bval["value"]
-                    for bval in card["binding_values"]
-                }
             for prefix in ("photo_image_full_size_",
                            "summary_photo_image_",
                            "thumbnail_image_"):
@@ -199,19 +211,9 @@ class TwitterExtractor(Extractor):
                             files.append(value)
                             return
         elif name == "unified_card":
-            bvals = card["binding_values"]
-            if isinstance(bvals, list):
-                for bval in card["binding_values"]:
-                    if bval["key"] == "unified_card":
-                        bval = bval["value"]["string_value"]
-                        break
-            else:
-                bval = bvals["unified_card"]["string_value"]
-            data = json.loads(bval)
-            if data.get("type") == "image_carousel_website":
-                self._extract_media(
-                    tweet, data["media_entities"].values(), files)
-                return
+            data = json.loads(bvals["unified_card"]["string_value"])
+            self._extract_media(tweet, data["media_entities"].values(), files)
+            return
 
         if self.cards == "ytdl":
             tweet_id = tweet.get("rest_id") or tweet["id_str"]
@@ -735,16 +737,33 @@ class TwitterTweetExtractor(TwitterExtractor):
             "options": (("cards", True),),
             "pattern": r"https://pbs.twimg.com/card_img/\d+/",
         }),
-        # unified_card with image_carousel_website
+        # unified_card image_website (#2875)
+        ("https://twitter.com/i/web/status/1561674543323910144", {
+            "options": (("cards", True),),
+            "pattern": r"https://pbs\.twimg\.com/media/F.+=jpg",
+        }),
+        # unified_card image_carousel_website
         ("https://twitter.com/doax_vv_staff/status/1479438945662685184", {
             "options": (("cards", True),),
             "pattern": r"https://pbs\.twimg\.com/media/F.+=png",
             "count": 6,
         }),
+        # unified_card video_website (#2875)
+        ("https://twitter.com/bang_dream_1242/status/1561548715348746241", {
+            "options": (("cards", True),),
+            "pattern": r"https://video\.twimg\.com/amplify_video"
+                       r"/1560607284333449216/vid/720x720/\w+\.mp4",
+        }),
         # unified_card without type
         ("https://twitter.com/i/web/status/1466183847628865544", {
             "count": 0,
         }),
+        # 'cards-blacklist' option
+        ("https://twitter.com/i/web/status/1571141912295243776", {
+            "options": (("cards", "ytdl"),
+                        ("cards-blacklist", ("twitch.tv",))),
+            "count": 0,
+        }),
         # original retweets (#1026)
         ("https://twitter.com/jessica_3978/status/1296304589591810048", {
             "options": (("retweets", "original"),),
@@ -776,12 +795,20 @@ class TwitterTweetExtractor(TwitterExtractor):
         # age-restricted (#2354)
         ("https://twitter.com/mightbecursed/status/1492954264909479936", {
             "options": (("syndication", True),),
+            "keywords": {"date": "dt:2022-02-13 20:10:09"},
             "count": 1,
         }),
         # media alt texts / descriptions (#2617)
         ("https://twitter.com/my0nruri/status/1528379296041299968", {
             "keyword": {"description": "oc"}
         }),
+        # '?format=...&name=...'-style URLs
+        ("https://twitter.com/poco_dandy/status/1150646424461176832", {
+            "options": (("cards", True),),
+            "pattern": r"https://pbs.twimg.com/card_img/157\d+/\w+"
+                       r"\?format=(jpg|png)&name=orig$",
+            "range": "1-2",
+        }),
     )
 
     def __init__(self, match):
@@ -1442,6 +1469,10 @@ class TwitterAPI():
         else:
             retweet_id = None
 
+        tweet["created_at"] = text.parse_datetime(
+            tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ").strftime(
+            "%a %b %d %H:%M:%S +0000 %Y")
+
         if "video" in tweet:
             video = tweet["video"]
             video["variants"] = (max(
diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py
index 2b5acd8..72cf438 100644
--- a/gallery_dl/extractor/zerochan.py
+++ b/gallery_dl/extractor/zerochan.py
@@ -11,6 +11,8 @@
 from .booru import BooruExtractor
 from ..cache import cache
 from .. import text, exception
+from xml.etree import ElementTree
+
 
 BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net"
 
@@ -54,7 +56,7 @@ class ZerochanExtractor(BooruExtractor):
 
         return response.cookies
 
-    def _parse_entry_page(self, entry_id):
+    def _parse_entry_html(self, entry_id):
         url = "{}/{}".format(self.root, entry_id)
         extr = text.extract_from(self.request(url).text)
 
@@ -66,10 +68,26 @@ class ZerochanExtractor(BooruExtractor):
                 '"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y"),
             "width" : extr('"width": "', ' '),
             "height": extr('"height": "', ' '),
-            "size"  : extr('"contentSize": "', 'B'),
+            "size"  : text.parse_bytes(extr('"contentSize": "', 'B')),
             "path"  : text.split_html(extr(
                 'class="breadcrumbs', '</p>'))[3::2],
-            "tags"  : extr('alt="Tags: ', '"').split(", ")
+            "tags"  : extr('alt="Tags: Anime, ', '"').split(", ")
+        }
+
+    def _parse_entry_xml(self, entry_id):
+        url = "{}/{}?xml".format(self.root, entry_id)
+        item = ElementTree.fromstring(self.request(url).text)[0][-1]
+        #  content = item[4].attrib
+
+        return {
+            #  "id"    : entry_id,
+            #  "file_url": content["url"],
+            #  "width" : content["width"],
+            #  "height": content["height"],
+            #  "size"  : content["filesize"],
+            "name"  : item[2].text,
+            "tags"  : item[5].text.lstrip().split(", "),
+            "md5"   : item[6].text,
         }
 
 
@@ -105,6 +123,7 @@ class ZerochanTagExtractor(ZerochanExtractor):
         url = self.root + "/" + self.search_tag
         params = text.parse_query(self.query)
         params["p"] = text.parse_int(params.get("p"), 1)
+        metadata = self.config("metadata")
 
         while True:
             page = self.request(url, params=params).text
@@ -115,15 +134,22 @@ class ZerochanTagExtractor(ZerochanExtractor):
                 post = extr('<li class="', '>')
                 if not post:
                     break
-                yield {
-                    "id"    : extr('href="/', '"'),
-                    "name"  : extr('alt="', '"'),
-                    "width" : extr('title="', 'x'),
-                    "height": extr('', ' '),
-                    "size"  : extr('', 'B'),
-                    "file_url": "https://static." + extr(
-                        '<a href="https://static.', '"'),
-                }
+
+                if metadata:
+                    entry_id = extr('href="/', '"')
+                    post = self._parse_entry_html(entry_id)
+                    post.update(self._parse_entry_xml(entry_id))
+                    yield post
+                else:
+                    yield {
+                        "id"    : extr('href="/', '"'),
+                        "name"  : extr('alt="', '"'),
+                        "width" : extr('title="', 'x'),
+                        "height": extr('', ' '),
+                        "size"  : extr('', 'B'),
+                        "file_url": "https://static." + extr(
+                            '<a href="https://static.', '"'),
+                    }
 
             if 'rel="next"' not in page:
                 break
@@ -153,4 +179,7 @@ class ZerochanImageExtractor(ZerochanExtractor):
         self.image_id = match.group(1)
 
     def posts(self):
-        return (self._parse_entry_page(self.image_id),)
+        post = self._parse_entry_html(self.image_id)
+        if self.config("metadata"):
+            post.update(self._parse_entry_xml(self.image_id))
+        return (post,)
diff --git a/gallery_dl/postprocessor/zip.py b/gallery_dl/postprocessor/zip.py
index ff97add..4f376fe 100644
--- a/gallery_dl/postprocessor/zip.py
+++ b/gallery_dl/postprocessor/zip.py
@@ -26,6 +26,7 @@ class ZipPP(PostProcessor):
     def __init__(self, job, options):
         PostProcessor.__init__(self, job)
         self.delete = not options.get("keep-files", False)
+        self.files = options.get("files")
         ext = "." + options.get("extension", "zip")
         algorithm = options.get("compression", "store")
         if algorithm not in self.COMPRESSION_ALGORITHMS:
@@ -56,6 +57,9 @@ class ZipPP(PostProcessor):
         # 'NameToInfo' is not officially documented, but it's available
         # for all supported Python versions and using it directly is a lot
         # faster than calling getinfo()
+        if self.files:
+            self.write_extra(pathfmt, zfile, self.files)
+            self.files = None
         if pathfmt.filename not in zfile.NameToInfo:
             zfile.write(pathfmt.temppath, pathfmt.filename)
             pathfmt.delete = self.delete
@@ -69,6 +73,21 @@ class ZipPP(PostProcessor):
         with self.open() as zfile:
             self.write(pathfmt, zfile)
 
+    def write_extra(self, pathfmt, zfile, files):
+        for path in map(util.expand_path, files):
+            if not os.path.isabs(path):
+                path = os.path.join(pathfmt.realdirectory, path)
+            try:
+                zfile.write(path, os.path.basename(path))
+            except OSError as exc:
+                self.log.warning(
+                    "Unable to write %s to %s", path, zfile.filename)
+                self.log.debug("%s: %s", exc, exc.__class__.__name__)
+                pass
+            else:
+                if self.delete:
+                    util.remove_file(path)
+
     def finalize(self, pathfmt, status):
         if self.zfile:
             self.zfile.close()
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index d12d088..ce018fe 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-__version__ = "1.23.0"
+__version__ = "1.23.1"
author	Unit 193 <unit193@unit193.net>	2022-09-22 19:43:53 -0400
committer	Unit 193 <unit193@unit193.net>	2022-09-22 19:43:53 -0400
commit	e6b82556343116256be047ab7099bedd9063f66a (patch)
tree	884c0435863d130ec967163b82a2638ff1bd9505 /gallery_dl
parent	a768930761f7f20587ae40a8cacca0e55c85290a (diff)