author    Unit 193 <unit193@unit193.net>  2022-06-05 00:33:56 -0400
committer Unit 193 <unit193@unit193.net>  2022-06-05 00:33:56 -0400
commit    25442ea49f031d4d2df3353dd7e9ad2080e332da (patch)
tree      14c2ee86b8d10cf0f79b4cd3ce8d6a34ebe52eba /gallery_dl/extractor
parent    ad61a6d8122973534ab63df48f6090954bc73db6 (diff)
New upstream version 1.22.1 (tag: upstream/1.22.1)
Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r--  gallery_dl/extractor/common.py           |  27
-rw-r--r--  gallery_dl/extractor/deviantart.py       |   2
-rw-r--r--  gallery_dl/extractor/gelbooru_v02.py     |  19
-rw-r--r--  gallery_dl/extractor/gfycat.py           |  79
-rw-r--r--  gallery_dl/extractor/gofile.py           |  57
-rw-r--r--  gallery_dl/extractor/instagram.py        | 195
-rw-r--r--  gallery_dl/extractor/lolisafe.py         |  18
-rw-r--r--  gallery_dl/extractor/moebooru.py         |  17
-rw-r--r--  gallery_dl/extractor/nozomi.py           |   3
-rw-r--r--  gallery_dl/extractor/paheal.py           |  93
-rw-r--r--  gallery_dl/extractor/pixiv.py            |   3
-rw-r--r--  gallery_dl/extractor/readcomiconline.py  |   3
-rw-r--r--  gallery_dl/extractor/reddit.py           |  20
-rw-r--r--  gallery_dl/extractor/sankaku.py          |   8
-rw-r--r--  gallery_dl/extractor/subscribestar.py    |  10
-rw-r--r--  gallery_dl/extractor/weibo.py            | 352
16 files changed, 599 insertions(+), 307 deletions(-)
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index cac8c2d..9cd9059 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -302,6 +302,7 @@ class Extractor():
if cookies:
if isinstance(cookies, dict):
self._update_cookies_dict(cookies, self.cookiedomain)
+
elif isinstance(cookies, str):
cookiefile = util.expand_path(cookies)
try:
@@ -311,12 +312,27 @@ class Extractor():
self.log.warning("cookies: %s", exc)
else:
self._cookiefile = cookiefile
+
elif isinstance(cookies, (list, tuple)):
- from ..cookies import load_cookies
- try:
- load_cookies(self._cookiejar, cookies)
- except Exception as exc:
- self.log.warning("cookies: %s", exc)
+ key = tuple(cookies)
+ cookiejar = _browser_cookies.get(key)
+
+ if cookiejar is None:
+ from ..cookies import load_cookies
+ cookiejar = self._cookiejar.__class__()
+ try:
+ load_cookies(cookiejar, cookies)
+ except Exception as exc:
+ self.log.warning("cookies: %s", exc)
+ else:
+ _browser_cookies[key] = cookiejar
+ else:
+ self.log.debug("Using cached cookies from %s", key)
+
+ setcookie = self._cookiejar.set_cookie
+ for cookie in cookiejar:
+ setcookie(cookie)
+
else:
self.log.warning(
"Expected 'dict', 'list', or 'str' value for 'cookies' "
@@ -692,6 +708,7 @@ def _build_requests_adapter(ssl_options, ssl_ciphers, source_address):
_adapter_cache = {}
+_browser_cookies = {}
HTTP_HEADERS = {
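The change above caches cookie jars imported from a browser in the module-level _browser_cookies dict, keyed by the tuple from the 'cookies' option, so only the first extractor in a run pays for reading the browser's cookie store; later extractors copy the cached cookies into their own jar. A minimal standalone sketch of that pattern (load_cookies here is a hypothetical stand-in for gallery_dl's ..cookies.load_cookies):

from http.cookiejar import CookieJar

_browser_cookies = {}                # ('cookies' config tuple) -> CookieJar

def load_cookies(jar, spec):         # hypothetical stand-in
    raise NotImplementedError

def apply_browser_cookies(extractor_jar, spec):
    key = tuple(spec)
    jar = _browser_cookies.get(key)
    if jar is None:
        jar = CookieJar()
        load_cookies(jar, spec)      # expensive: reads/decrypts the browser store
        _browser_cookies[key] = jar  # reuse for subsequent extractors
    for cookie in jar:
        extractor_jar.set_cookie(cookie)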
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 85ec0cf..70bee52 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -1311,7 +1311,7 @@ class DeviantartOAuthAPI():
yield from results
if not data["has_more"] and (
- self.strategy != "manual" or not results):
+ self.strategy != "manual" or not results or not extend):
return
if "next_cursor" in data:
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py
index 2dd0c0c..bf9c983 100644
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -133,6 +133,10 @@ INSTANCES = {
"root": "https://tbib.org",
"pattern": r"tbib\.org",
},
+ "hypnohub": {
+ "root": "https://hypnohub.net",
+ "pattern": r"hypnohub\.net",
+ },
}
BASE_PATTERN = GelbooruV02Extractor.update(INSTANCES)
@@ -159,6 +163,9 @@ class GelbooruV02TagExtractor(GelbooruV02Extractor):
("https://tbib.org/index.php?page=post&s=list&tags=yuyaiyaui", {
"count": ">= 120",
}),
+ ("https://hypnohub.net/index.php?page=post&s=list&tags=gonoike_biwa", {
+ "url": "fe662b86d38c331fcac9c62af100167d404937dc",
+ }),
)
def __init__(self, match):
@@ -188,6 +195,10 @@ class GelbooruV02PoolExtractor(GelbooruV02Extractor):
("https://realbooru.com/index.php?page=pool&s=show&id=1", {
"count": 3,
}),
+ ("https://hypnohub.net/index.php?page=pool&s=show&id=61", {
+ "url": "d314826280073441a2da609f70ee814d1f4b9407",
+ "count": 3,
+ }),
)
def __init__(self, match):
@@ -241,6 +252,9 @@ class GelbooruV02FavoriteExtractor(GelbooruV02Extractor):
("https://tbib.org/index.php?page=favorites&s=view&id=7881", {
"count": 3,
}),
+ ("https://hypnohub.net/index.php?page=favorites&s=view&id=43546", {
+ "count": 3,
+ }),
)
def __init__(self, match):
@@ -310,6 +324,11 @@ class GelbooruV02PostExtractor(GelbooruV02Extractor):
"url": "5a6ebe07bfff8e6d27f7c30b5480f27abcb577d2",
"content": "1c3831b6fbaa4686e3c79035b5d98460b1c85c43",
}),
+ ("https://hypnohub.net/index.php?page=post&s=view&id=73964", {
+ "pattern": r"https://hypnohub\.net/images/7a/37"
+ r"/7a37c0ba372f35767fb10c904a398831\.png",
+ "content": "02d5f5a8396b621a6efc04c5f8ef1b7225dfc6ee",
+ }),
)
def __init__(self, match):
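Moving hypnohub here from moebooru.py (see the removals further down) only takes a new INSTANCES entry plus test URLs; update() folds every instance's 'pattern' into the shared BASE_PATTERN so a single set of extractor classes serves all Gelbooru-v0.2 sites. A rough sketch of that idea, not the actual BaseExtractor.update() implementation:

INSTANCES = {
    "tbib":     {"root": "https://tbib.org",     "pattern": r"tbib\.org"},
    "hypnohub": {"root": "https://hypnohub.net", "pattern": r"hypnohub\.net"},
}

def build_base_pattern(instances):
    # one alternation over all instance domains; the matched domain later
    # decides which 'root' and category the extractor uses
    return (r"(?:https?://)?(?:www\.)?(?:"
            + "|".join(inst["pattern"] for inst in instances.values())
            + r")")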
diff --git a/gallery_dl/extractor/gfycat.py b/gallery_dl/extractor/gfycat.py
index 501d114..0ccd7fa 100644
--- a/gallery_dl/extractor/gfycat.py
+++ b/gallery_dl/extractor/gfycat.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2017-2021 Mike Fährmann
+# Copyright 2017-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -72,18 +72,60 @@ class GfycatExtractor(Extractor):
class GfycatUserExtractor(GfycatExtractor):
"""Extractor for gfycat user profiles"""
subcategory = "user"
- directory_fmt = ("{category}", "{username|userName}")
- pattern = r"(?:https?://)?gfycat\.com/@([^/?#]+)"
+ directory_fmt = ("{category}", "{username}")
+ pattern = r"(?:https?://)?gfycat\.com/@([^/?#]+)/?(?:$|\?|#)"
test = ("https://gfycat.com/@gretta", {
"pattern": r"https://giant\.gfycat\.com/[A-Za-z]+\.mp4",
"count": ">= 100",
})
+ def gfycats(self):
+ return GfycatAPI(self).user(self.key)
+
+
+class GfycatCollectionExtractor(GfycatExtractor):
+ """Extractor for a gfycat collection"""
+ subcategory = "collection"
+ directory_fmt = ("{category}", "{collection_owner}",
+ "{collection_name|collection_id}")
+ pattern = (r"(?:https?://)?gfycat\.com/@([^/?#]+)/collections"
+ r"/(\w+)(?:/([^/?#]+))?")
+ test = ("https://gfycat.com/@reactions/collections/nHgy2DtE/no-text", {
+ "pattern": r"https://\w+\.gfycat\.com/[A-Za-z]+\.mp4",
+ "count": ">= 100",
+ })
+
+ def __init__(self, match):
+ GfycatExtractor.__init__(self, match)
+ self.collection_id = match.group(2)
+ self.collection_name = match.group(3)
+
def metadata(self):
- return {"userName": self.key}
+ return {
+ "collection_owner": self.key,
+ "collection_name" : self.collection_name,
+ "collection_id" : self.collection_id,
+ }
def gfycats(self):
- return GfycatAPI(self).user(self.key)
+ return GfycatAPI(self).collection(self.key, self.collection_id)
+
+
+class GfycatCollectionsExtractor(GfycatExtractor):
+ """Extractor for a gfycat user's collections"""
+ subcategory = "collections"
+ pattern = r"(?:https?://)?gfycat\.com/@([^/?#]+)/collections/?(?:$|\?|#)"
+ test = ("https://gfycat.com/@sannahparker/collections", {
+ "pattern": GfycatCollectionExtractor.pattern,
+ "count": ">= 20",
+ })
+
+ def items(self):
+ for col in GfycatAPI(self).collections(self.key):
+ url = "https://gfycat.com/@{}/collections/{}/{}".format(
+ col["userId"], col["folderId"], col["linkText"])
+ col["_extractor"] = GfycatCollectionExtractor
+ yield Message.Queue, url, col
class GfycatSearchExtractor(GfycatExtractor):
@@ -177,7 +219,6 @@ class GfycatAPI():
def __init__(self, extractor):
self.extractor = extractor
- self.headers = {}
def gfycat(self, gfycat_id):
endpoint = "/v1/gfycats/" + gfycat_id
@@ -188,6 +229,17 @@ class GfycatAPI():
params = {"count": 100}
return self._pagination(endpoint, params)
+ def collection(self, user, collection):
+ endpoint = "/v1/users/{}/collections/{}/gfycats".format(
+ user, collection)
+ params = {"count": 100}
+ return self._pagination(endpoint, params)
+
+ def collections(self, user):
+ endpoint = "/v1/users/{}/collections".format(user)
+ params = {"count": 100}
+ return self._pagination(endpoint, params, "gfyCollections")
+
def search(self, query):
endpoint = "/v1/gfycats/search"
params = {"search_text": query, "count": 150}
@@ -195,20 +247,13 @@ class GfycatAPI():
def _call(self, endpoint, params=None):
url = self.API_ROOT + endpoint
- return self.extractor.request(
- url, params=params, headers=self.headers).json()
+ return self.extractor.request(url, params=params).json()
- def _pagination(self, endpoint, params):
+ def _pagination(self, endpoint, params, key="gfycats"):
while True:
data = self._call(endpoint, params)
- gfycats = data["gfycats"]
-
- for gfycat in gfycats:
- if "gfyName" not in gfycat:
- gfycat.update(self.gfycat(gfycat["gfyId"]))
- yield gfycat
+ yield from data[key]
- if "found" not in data and len(gfycats) < params["count"] or \
- not data["gfycats"]:
+ if not data["cursor"]:
return
params["cursor"] = data["cursor"]
diff --git a/gallery_dl/extractor/gofile.py b/gallery_dl/extractor/gofile.py
index 37d2986..b53ebbe 100644
--- a/gallery_dl/extractor/gofile.py
+++ b/gallery_dl/extractor/gofile.py
@@ -5,7 +5,7 @@
# published by the Free Software Foundation.
from .common import Extractor, Message
-from .. import exception
+from .. import text, exception
from ..cache import memcache
@@ -17,46 +17,45 @@ class GofileFolderExtractor(Extractor):
archive_fmt = "{id}"
pattern = r"(?:https?://)?(?:www\.)?gofile\.io/d/([^/?#]+)"
test = (
- ("https://gofile.io/d/5qHmQj", {
- "pattern": r"https://file\d+\.gofile\.io/download"
+ ("https://gofile.io/d/k6BomI", {
+ "pattern": r"https://store\d+\.gofile\.io/download"
r"/\w{8}-\w{4}-\w{4}-\w{4}-\w{12}"
r"/test-%E3%83%86%E3%82%B9%E3%83%88-%2522%26!\.png",
"keyword": {
"createTime": int,
- "directLink": "re:https://store3.gofile.io/download/direct/.+",
+ "directLink": "re:https://store5.gofile.io/download/direct/.+",
"downloadCount": int,
"extension": "png",
"filename": "test-テスト-%22&!",
"folder": {
"childs": [
- "346429cc-aee4-4996-be3f-e58616fe231f",
- "765b6b12-b354-4e14-9a45-f763fa455682",
- "2a44600a-4a59-4389-addc-4a0d542c457b"
+ "b0367d79-b8ba-407f-8342-aaf8eb815443",
+ "7fd4a36a-c1dd-49ff-9223-d93f7d24093f"
],
- "code": "5qHmQj",
- "createTime": 1648536501,
- "id": "45cd45d1-dc78-4553-923f-04091c621699",
- "isRoot": True,
+ "code": "k6BomI",
+ "createTime": 1654076165,
+ "id": "fafb59f9-a7c7-4fea-a098-b29b8d97b03c",
"name": "root",
"public": True,
"totalDownloadCount": int,
- "totalSize": 364,
+ "totalSize": 182,
"type": "folder"
},
"id": r"re:\w{8}-\w{4}-\w{4}-\w{4}-\w{12}",
- "link": r"re:https://file17.gofile.io/download/.+\.png",
+ "link": r"re:https://store5.gofile.io/download/.+\.png",
"md5": "re:[0-9a-f]{32}",
"mimetype": "image/png",
"name": "test-テスト-%22&!.png",
"num": int,
- "parentFolder": "45cd45d1-dc78-4553-923f-04091c621699",
- "serverChoosen": "file17",
+ "parentFolder": "fafb59f9-a7c7-4fea-a098-b29b8d97b03c",
+ "serverChoosen": "store5",
"size": 182,
- "thumbnail": r"re:https://store3.gofile.io/download/.+\.png",
+ "thumbnail": r"re:https://store5.gofile.io/download/.+\.png",
"type": "file"
},
}),
- ("https://gofile.io/d/346429cc-aee4-4996-be3f-e58616fe231f", {
+ ("https://gofile.io/d/7fd4a36a-c1dd-49ff-9223-d93f7d24093f", {
+ "options": (("website-token", None),),
"content": "0c8768055e4e20e7c7259608b67799171b691140",
}),
)
@@ -69,12 +68,17 @@ class GofileFolderExtractor(Extractor):
recursive = self.config("recursive")
token = self.config("api-token")
- if token is None:
- self.log.debug("creating temporary account")
+ if not token:
token = self._create_account()
self.session.cookies.set("accountToken", token, domain=".gofile.io")
+ self.api_token = token
- folder = self._get_content(self.content_id, token)
+ token = self.config("website-token", "12345")
+ if not token:
+ token = self._get_website_token()
+ self.website_token = token
+
+ folder = self._get_content(self.content_id)
yield Message.Directory, folder
num = 0
@@ -102,13 +106,20 @@ class GofileFolderExtractor(Extractor):
@memcache()
def _create_account(self):
+ self.log.debug("Creating temporary account")
return self._api_request("createAccount")["token"]
- def _get_content(self, content_id, token):
+ @memcache()
+ def _get_website_token(self):
+ self.log.debug("Fetching website token")
+ page = self.request(self.root + "/contents/files.html").text
+ return text.extract(page, "websiteToken:", ",")[0].strip("\" ")
+
+ def _get_content(self, content_id):
return self._api_request("getContent", {
"contentId" : content_id,
- "token" : token,
- "websiteToken": "websiteToken",
+ "token" : self.api_token,
+ "websiteToken": self.website_token,
})
def _api_request(self, endpoint, params=None):
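Instead of sending a fixed placeholder, the extractor now scrapes the real websiteToken from gofile's own JavaScript (only when the website-token option is explicitly set to a false value; otherwise the "12345" default is used) and caches it with @memcache alongside the temporary account token. A hedged sketch of the scrape with requests; the page URL and the "websiteToken:" marker are taken from the diff and may change on gofile's side:

import requests

def get_website_token():
    page = requests.get("https://gofile.io/contents/files.html").text
    # the value appears in the page's JS as:  websiteToken: "...",
    token = page.partition("websiteToken:")[2].partition(",")[0]
    return token.strip("\" ")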
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 82c9858..e536e22 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -11,7 +11,7 @@
from .common import Extractor, Message
from .. import text, util, exception
-from ..cache import cache
+from ..cache import cache, memcache
import json
import time
import re
@@ -134,6 +134,32 @@ class InstagramExtractor(Extractor):
url, params=params, headers=headers, cookies=cookies,
).json()["data"]
+ @memcache(keyarg=1)
+ def _user_by_screen_name(self, screen_name):
+ url = "https://www.instagram.com/{}/?__a=1&__d=dis".format(
+ screen_name)
+ headers = {
+ "Referer": "https://www.instagram.com/{}/".format(screen_name),
+ "X-CSRFToken" : self.csrf_token,
+ "X-IG-App-ID" : "936619743392459",
+ "X-IG-WWW-Claim" : self.www_claim,
+ "X-Requested-With": "XMLHttpRequest",
+ }
+ cookies = {
+ "csrftoken": self.csrf_token,
+ }
+ return self.request(
+ url, headers=headers, cookies=cookies).json()["graphql"]["user"]
+
+ def _uid_by_screen_name(self, screen_name):
+ if screen_name.startswith("id:"):
+ return screen_name[3:]
+ return self._user_by_screen_name(screen_name)["id"]
+
+ def _media_by_id(self, post_id):
+ endpoint = "/v1/media/{}/info/".format(post_id)
+ return self._pagination_api(endpoint)
+
def login(self):
if not self._check_cookies(self.cookienames):
username, password = self._get_auth_info()
@@ -186,19 +212,15 @@ class InstagramExtractor(Extractor):
def _parse_post_graphql(self, post):
typename = post["__typename"]
+
if post.get("is_video") and "video_url" not in post:
- url = "{}/tv/{}/".format(self.root, post["shortcode"])
- post = self._extract_post_page(url)
- if "items" in post:
- return self._parse_post_api({"media": post["items"][0]})
- post = post["graphql"]["shortcode_media"]
- elif typename == "GraphSidecar" and \
+ media = next(self._media_by_id(post["id"]))
+ return self._parse_post_api(media)
+
+ if typename == "GraphSidecar" and \
"edge_sidecar_to_children" not in post:
- url = "{}/p/{}/".format(self.root, post["shortcode"])
- post = self._extract_post_page(url)
- if "items" in post:
- return self._parse_post_api({"media": post["items"][0]})
- post = post["graphql"]["shortcode_media"]
+ media = next(self._media_by_id(post["id"]))
+ return self._parse_post_api(media)
owner = post["owner"]
data = {
@@ -238,7 +260,7 @@ class InstagramExtractor(Extractor):
"num": num,
"media_id" : node["id"],
"shortcode" : (node.get("shortcode") or
- self._shortcode_from_id(node["id"])),
+ shortcode_from_id(node["id"])),
"display_url": node["display_url"],
"video_url" : node.get("video_url"),
"width" : dimensions["width"],
@@ -270,7 +292,7 @@ class InstagramExtractor(Extractor):
owner = media["user"]
data = {
"post_id" : media["pk"],
- "post_shortcode": self._shortcode_from_id(media["pk"]),
+ "post_shortcode": shortcode_from_id(media["pk"]),
}
if "carousel_media" in media:
@@ -286,7 +308,7 @@ class InstagramExtractor(Extractor):
data = {
"expires" : text.parse_timestamp(post.get("expiring_at")),
"post_id" : reel_id,
- "post_shortcode": self._shortcode_from_id(reel_id),
+ "post_shortcode": shortcode_from_id(reel_id),
}
data["owner_id"] = owner["pk"]
@@ -314,7 +336,7 @@ class InstagramExtractor(Extractor):
media.get("taken_at")),
"media_id" : item["pk"],
"shortcode" : (item.get("code") or
- self._shortcode_from_id(item["pk"])),
+ shortcode_from_id(item["pk"])),
"display_url": image["url"],
"video_url" : video["url"] if video else None,
"width" : media["width"],
@@ -326,14 +348,6 @@ class InstagramExtractor(Extractor):
return data
@staticmethod
- def _shortcode_from_id(post_id):
- return util.bencode(
- int(post_id),
- "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
- "abcdefghijklmnopqrstuvwxyz"
- "0123456789-_")
-
- @staticmethod
def _extract_tagged_users(src, dest):
dest["tagged_users"] = tagged_users = []
@@ -383,20 +397,6 @@ class InstagramExtractor(Extractor):
json.loads(additional_data.partition(",")[2])
return data
- def _extract_profile_page(self, url):
- page = self.request(url).text
- data = self._extract_shared_data(page)["entry_data"]
- if "HttpErrorPage" in data:
- raise exception.NotFoundError("user")
- return data["ProfilePage"][0]["graphql"]["user"]
-
- def _extract_post_page(self, url):
- page = self.request(url).text
- data = self._extract_shared_data(page)["entry_data"]
- if "HttpErrorPage" in data:
- raise exception.NotFoundError("post")
- return data["PostPage"][0]
-
def _get_edge_data(self, user, key):
cursor = self.config("cursor")
if cursor or not key:
@@ -410,25 +410,40 @@ class InstagramExtractor(Extractor):
}
return user[key]
- def _pagination_graphql(self, query_hash, variables, data):
+ def _pagination_graphql(self, query_hash, variables):
+ cursor = self.config("cursor")
+ if cursor:
+ variables["after"] = cursor
+
while True:
+ data = next(iter(self._request_graphql(
+ query_hash, variables)["user"].values()))
+
for edge in data["edges"]:
yield edge["node"]
info = data["page_info"]
if not info["has_next_page"]:
return
- elif not data["edges"] and "_virtual" not in info:
+ elif not data["edges"]:
s = "" if self.item.endswith("s") else "s"
raise exception.StopExtraction(
"%s'%s posts are private", self.item, s)
variables["after"] = self._cursor = info["end_cursor"]
self.log.debug("Cursor: %s", self._cursor)
- data = next(iter(self._request_graphql(
- query_hash, variables)["user"].values()))
- def _pagination_api(self, endpoint, params):
+ def _pagination_api(self, endpoint, params=None):
+ while True:
+ data = self._request_api(endpoint, params=params)
+ for item in data["items"]:
+ yield {"media": item}
+
+ if not data["more_available"]:
+ return
+ params["max_id"] = data["next_max_id"]
+
+ def _pagination_api_post(self, endpoint, params, post=False):
while True:
data = self._request_api(endpoint, method="POST", data=params)
yield from data["items"]
@@ -446,6 +461,7 @@ class InstagramUserExtractor(InstagramExtractor):
test = (
("https://www.instagram.com/instagram/"),
("https://www.instagram.com/instagram/?hl=en"),
+ ("https://www.instagram.com/id:25025320/"),
)
def items(self):
@@ -471,13 +487,9 @@ class InstagramPostsExtractor(InstagramExtractor):
})
def posts(self):
- url = "{}/{}/".format(self.root, self.item)
- user = self._extract_profile_page(url)
-
- query_hash = "8c2a529969ee035a5063f2fc8602a0fd"
- variables = {"id": user["id"], "first": 50}
- edge = self._get_edge_data(user, "edge_owner_to_timeline_media")
- return self._pagination_graphql(query_hash, variables, edge)
+ query_hash = "69cba40317214236af40e7efa697781d"
+ variables = {"id": self._uid_by_screen_name(self.item), "first": 50}
+ return self._pagination_graphql(query_hash, variables)
class InstagramTaggedExtractor(InstagramExtractor):
@@ -495,8 +507,12 @@ class InstagramTaggedExtractor(InstagramExtractor):
})
def metadata(self):
- url = "{}/{}/".format(self.root, self.item)
- self.user = user = self._extract_profile_page(url)
+ if self.item.startswith("id:"):
+ self.user_id = self.item[3:]
+ return {"tagged_owner_id": self.user_id}
+
+ user = self._user_by_screen_name(self.item)
+ self.user_id = user["id"]
return {
"tagged_owner_id" : user["id"],
@@ -505,10 +521,9 @@ class InstagramTaggedExtractor(InstagramExtractor):
}
def posts(self):
- query_hash = "be13233562af2d229b008d2976b998b5"
- variables = {"id": self.user["id"], "first": 50}
- edge = self._get_edge_data(self.user, None)
- return self._pagination_graphql(query_hash, variables, edge)
+ endpoint = "/v1/usertags/{}/feed/".format(self.user_id)
+ params = {"count": 50}
+ return self._pagination_api(endpoint, params)
class InstagramChannelExtractor(InstagramExtractor):
@@ -521,13 +536,9 @@ class InstagramChannelExtractor(InstagramExtractor):
})
def posts(self):
- url = "{}/{}/channel/".format(self.root, self.item)
- user = self._extract_profile_page(url)
-
query_hash = "bc78b344a68ed16dd5d7f264681c4c76"
- variables = {"id": user["id"], "first": 50}
- edge = self._get_edge_data(user, "edge_felix_video_timeline")
- return self._pagination_graphql(query_hash, variables, edge)
+ variables = {"id": self._uid_by_screen_name(self.item), "first": 50}
+ return self._pagination_graphql(query_hash, variables)
class InstagramSavedExtractor(InstagramExtractor):
@@ -537,13 +548,9 @@ class InstagramSavedExtractor(InstagramExtractor):
test = ("https://www.instagram.com/instagram/saved/",)
def posts(self):
- url = "{}/{}/saved/".format(self.root, self.item)
- user = self._extract_profile_page(url)
-
query_hash = "2ce1d673055b99250e93b6f88f878fde"
- variables = {"id": user["id"], "first": 50}
- edge = self._get_edge_data(user, "edge_saved_media")
- return self._pagination_graphql(query_hash, variables, edge)
+ variables = {"id": self._uid_by_screen_name(self.item), "first": 50}
+ return self._pagination_graphql(query_hash, variables)
class InstagramTagExtractor(InstagramExtractor):
@@ -719,19 +726,7 @@ class InstagramPostExtractor(InstagramExtractor):
)
def posts(self):
- query_hash = "2efa04f61586458cef44441f474eee7c"
- variables = {
- "shortcode" : self.item,
- "child_comment_count" : 3,
- "fetch_comment_count" : 40,
- "parent_comment_count" : 24,
- "has_threaded_comments": True,
- }
- data = self._request_graphql(query_hash, variables)
- media = data.get("shortcode_media")
- if not media:
- raise exception.NotFoundError("post")
- return (media,)
+ return self._media_by_id(id_from_shortcode(self.item))
class InstagramStoriesExtractor(InstagramExtractor):
@@ -755,17 +750,7 @@ class InstagramStoriesExtractor(InstagramExtractor):
if self.highlight_id:
reel_id = "highlight:" + self.highlight_id
else:
- url = "{}/stories/{}/".format(self.root, self.user)
- with self.request(url, allow_redirects=False) as response:
- if 300 <= response.status_code < 400:
- return ()
- page = response.text
- try:
- data = self._extract_shared_data(page)["entry_data"]
- user = data["StoriesPage"][0]["user"]
- except KeyError:
- return ()
- reel_id = user["id"]
+ reel_id = self._uid_by_screen_name(self.user)
endpoint = "/v1/feed/reels_media/"
params = {"reel_ids": reel_id}
@@ -790,10 +775,8 @@ class InstagramHighlightsExtractor(InstagramExtractor):
test = ("https://www.instagram.com/instagram/highlights",)
def posts(self):
- url = "{}/{}/".format(self.root, self.item)
- user = self._extract_profile_page(url)
-
- endpoint = "/v1/highlights/{}/highlights_tray/".format(user["id"])
+ endpoint = "/v1/highlights/{}/highlights_tray/".format(
+ self._uid_by_screen_name(self.item))
tray = self._request_api(endpoint)["tray"]
reel_ids = [highlight["id"] for highlight in tray]
@@ -820,13 +803,23 @@ class InstagramReelsExtractor(InstagramExtractor):
})
def posts(self):
- url = "{}/{}/".format(self.root, self.item)
- user = self._extract_profile_page(url)
-
endpoint = "/v1/clips/user/"
data = {
- "target_user_id": user["id"],
+ "target_user_id": self._uid_by_screen_name(self.item),
"page_size" : "50",
}
- return self._pagination_api(endpoint, data)
+ return self._pagination_api_post(endpoint, data)
+
+
+def id_from_shortcode(shortcode):
+ return util.bdecode(shortcode, _ALPHABET)
+
+
+def shortcode_from_id(post_id):
+ return util.bencode(int(post_id), _ALPHABET)
+
+
+_ALPHABET = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz"
+ "0123456789-_")
diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py
index b6a508d..f3bd5d8 100644
--- a/gallery_dl/extractor/lolisafe.py
+++ b/gallery_dl/extractor/lolisafe.py
@@ -20,8 +20,14 @@ class LolisafeExtractor(BaseExtractor):
BASE_PATTERN = LolisafeExtractor.update({
- "bunkr": {"root": "https://bunkr.is", "pattern": r"bunkr\.(?:is|to)"},
- "zzzz" : {"root": "https://zz.ht" , "pattern": r"zz\.(?:ht|fo)"},
+ "bunkr": {
+ "root": "https://app.bunkr.is",
+ "pattern": r"(?:app\.)?bunkr\.(?:is|to)",
+ },
+ "zzzz" : {
+ "root": "https://zz.ht",
+ "pattern": r"zz\.(?:ht|fo)",
+ },
})
@@ -29,7 +35,7 @@ class LolisafeAlbumExtractor(LolisafeExtractor):
subcategory = "album"
pattern = BASE_PATTERN + "/a/([^/?#]+)"
test = (
- ("https://bunkr.is/a/Lktg9Keq", {
+ ("https://app.bunkr.is/a/Lktg9Keq", {
"pattern": r"https://cdn\.bunkr\.is/test-テスト-\"&>-QjgneIQv\.png",
"content": "0c8768055e4e20e7c7259608b67799171b691140",
"keyword": {
@@ -65,7 +71,11 @@ class LolisafeAlbumExtractor(LolisafeExtractor):
domain = self.config("domain")
if domain is None or domain == "auto":
- self.root = text.root_from_url(match.group(0))
+ if self.category == "bunkr":
+ self.root = "https://app.bunkr.is"
+ else:
+ self.root = text.root_from_url(match.group(0))
+
else:
self.root = text.ensure_http_scheme(domain)
diff --git a/gallery_dl/extractor/moebooru.py b/gallery_dl/extractor/moebooru.py
index 65b9a83..27ec929 100644
--- a/gallery_dl/extractor/moebooru.py
+++ b/gallery_dl/extractor/moebooru.py
@@ -60,10 +60,6 @@ BASE_PATTERN = MoebooruExtractor.update({
"root": "https://konachan.com",
"pattern": r"konachan\.(?:com|net)",
},
- "hypnohub": {
- "root": "https://hypnohub.net",
- "pattern": r"hypnohub\.net",
- },
"sakugabooru": {
"root": "https://www.sakugabooru.com",
"pattern": r"(?:www\.)?sakugabooru\.com",
@@ -101,9 +97,6 @@ class MoebooruPostExtractor(MoebooruExtractor):
},
}),
("https://konachan.net/post/show/205189"),
- ("https://hypnohub.net/post/show/73964", {
- "content": "02d5f5a8396b621a6efc04c5f8ef1b7225dfc6ee",
- }),
("https://www.sakugabooru.com/post/show/125570"),
("https://lolibooru.moe/post/show/287835"),
)
@@ -130,9 +123,6 @@ class MoebooruTagExtractor(MoebooruExtractor):
"content": "838cfb815e31f48160855435655ddf7bfc4ecb8d",
}),
("https://konachan.net/post?tags=patata"),
- ("https://hypnohub.net/post?tags=gonoike_biwa", {
- "url": "072330c34a1e773d0cafd00e64b8060d34b078b6",
- }),
("https://www.sakugabooru.com/post?tags=nichijou"),
("https://lolibooru.moe/post?tags=ruu_%28tksymkw%29"),
)
@@ -163,9 +153,6 @@ class MoebooruPoolExtractor(MoebooruExtractor):
"content": "cf0546e38a93c2c510a478f8744e60687b7a8426",
}),
("https://konachan.net/pool/show/95"),
- ("https://hypnohub.net/pool/show/61", {
- "url": "fd74991c8729e77acd3c35eb6ddc4128ff445adf",
- }),
("https://www.sakugabooru.com/pool/show/54"),
("https://lolibooru.moe/pool/show/239"),
)
@@ -198,10 +185,6 @@ class MoebooruPopularExtractor(MoebooruExtractor):
}),
("https://konachan.com/post/popular_recent"),
("https://konachan.net/post/popular_recent"),
- ("https://hypnohub.net/post/popular_by_month?month=6&year=2014", {
- "count": 20,
- }),
- ("https://hypnohub.net/post/popular_recent"),
("https://www.sakugabooru.com/post/popular_recent"),
("https://lolibooru.moe/post/popular_recent"),
)
diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py
index 4dc880f..7d7c3f8 100644
--- a/gallery_dl/extractor/nozomi.py
+++ b/gallery_dl/extractor/nozomi.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2021 Mike Fährmann
+# Copyright 2019-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -201,6 +201,7 @@ class NozomiSearchExtractor(NozomiExtractor):
return decode_nozomi(self.request(url).content)
for tag in self.tags:
+ tag = tag.replace("/", "")
if tag[0] == "-":
if not index:
index = set(nozomi("index"))
diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py
index ad9f620..0a6a6d3 100644
--- a/gallery_dl/extractor/paheal.py
+++ b/gallery_dl/extractor/paheal.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2020 Mike Fährmann
+# Copyright 2018-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -41,6 +41,29 @@ class PahealExtractor(Extractor):
def get_posts(self):
"""Return an iterable containing data of all relevant posts"""
+ def _extract_post(self, post_id):
+ url = "{}/post/view/{}".format(self.root, post_id)
+ extr = text.extract_from(self.request(url).text)
+
+ post = {
+ "id" : post_id,
+ "tags" : extr(": ", "<"),
+ "md5" : extr("/_thumbs/", "/"),
+ "file_url": extr("id='main_image' src='", "'"),
+ "uploader": text.unquote(extr(
+ "class='username' href='/user/", "'")),
+ "date" : text.parse_datetime(
+ extr("datetime='", "'"), "%Y-%m-%dT%H:%M:%S%z"),
+ "source" : text.extract(
+ extr(">Source&nbsp;Link<", "</td>"), "href='", "'")[0],
+ }
+
+ dimensions, size, ext = extr("Info</th><td>", ">").split(" // ")
+ post["width"], _, post["height"] = dimensions.partition("x")
+ post["size"] = text.parse_bytes(size[:-1])
+
+ return post
+
class PahealTagExtractor(PahealExtractor):
"""Extractor for images from rule34.paheal.net by search-tags"""
@@ -58,6 +81,9 @@ class PahealTagExtractor(PahealExtractor):
PahealExtractor.__init__(self, match)
self.tags = text.unquote(match.group(1))
+ if self.config("metadata"):
+ self._extract_data = self._extract_data_ex
+
def get_metadata(self):
return {"search_tags": self.tags}
@@ -87,42 +113,61 @@ class PahealTagExtractor(PahealExtractor):
width, _, height = dimensions.partition("x")
return {
- "id": pid, "md5": md5, "tags": tags, "file_url": url,
+ "id": pid, "md5": md5, "file_url": url,
"width": width, "height": height,
+ "tags": text.unescape(tags),
"size": text.parse_bytes(size[:-1]),
+ "date": text.parse_datetime(date, "%B %d, %Y; %H:%M"),
}
+ def _extract_data_ex(self, post):
+ pid = post[:post.index('"')]
+ return self._extract_post(pid)
+
class PahealPostExtractor(PahealExtractor):
"""Extractor for single images from rule34.paheal.net"""
subcategory = "post"
pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net"
r"/post/view/(\d+)")
- test = ("https://rule34.paheal.net/post/view/481609", {
- "pattern": r"https://tulip\.paheal\.net/_images"
- r"/bbdc1c33410c2cdce7556c7990be26b7/481609%20-%20"
- r"Azumanga_Daioh%20Osaka%20Vuvuzela%20inanimate\.jpg",
- "keyword": "abe7c1220ba5601f9639aa79fbb9689674ec8f5c",
- "content": "7b924bcf150b352ac75c9d281d061e174c851a11",
- })
+ test = (
+ ("https://rule34.paheal.net/post/view/481609", {
+ "pattern": r"https://tulip\.paheal\.net/_images"
+ r"/bbdc1c33410c2cdce7556c7990be26b7/481609%20-%20"
+ r"Azumanga_Daioh%20Osaka%20Vuvuzela%20inanimate\.jpg",
+ "content": "7b924bcf150b352ac75c9d281d061e174c851a11",
+ "keyword": {
+ "date": "dt:2010-06-17 15:40:23",
+ "extension": "jpg",
+ "file_url": "re:https://tulip.paheal.net/_images/bbdc1c33410c",
+ "filename": "481609 - Azumanga_Daioh Osaka Vuvuzela inanimate",
+ "height": 660,
+ "id": 481609,
+ "md5": "bbdc1c33410c2cdce7556c7990be26b7",
+ "size": 157389,
+ "source": None,
+ "tags": "Azumanga_Daioh Osaka Vuvuzela inanimate",
+ "uploader": "CaptainButtface",
+ "width": 614,
+ },
+ }),
+ ("https://rule34.paheal.net/post/view/488534", {
+ "keyword": {
+ "date": "dt:2010-06-25 13:51:17",
+ "height": 800,
+ "md5": "b39edfe455a0381110c710d6ed2ef57d",
+ "size": 758989,
+ "source": "http://www.furaffinity.net/view/4057821/",
+ "tags": "Vuvuzela inanimate thelost-dragon",
+ "uploader": "leacheate_soup",
+ "width": 1200,
+ },
+ }),
+ )
def __init__(self, match):
PahealExtractor.__init__(self, match)
self.post_id = match.group(1)
def get_posts(self):
- url = "{}/post/view/{}".format(self.root, self.post_id)
- page = self.request(url).text
-
- tags , pos = text.extract(page, ": ", "<")
- md5 , pos = text.extract(page, "/_thumbs/", "/", pos)
- url , pos = text.extract(page, "id='main_image' src='", "'", pos)
- width , pos = text.extract(page, "data-width=", " ", pos)
- height, pos = text.extract(page, "data-height=", " ", pos)
-
- return ({
- "id": self.post_id, "md5": md5, "tags": tags, "file_url": url,
- "size" : 0,
- "width" : width.strip("'\""),
- "height": height.strip("'\""),
- },)
+ return (self._extract_post(self.post_id),)
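The single-post extractor and the new metadata option for tag searches now share _extract_post(), which walks the post page once with text.extract_from() and pulls tags, md5, uploader, date, source and file info in document order. A rough sketch of that forward-only helper, assuming gallery_dl's semantics of returning the text between two markers while advancing an internal cursor:

def extract_from(page, default=""):
    pos = 0
    def extr(begin, end):
        nonlocal pos
        try:
            start = page.index(begin, pos) + len(begin)
            stop = page.index(end, start)
        except ValueError:
            return default
        pos = stop + len(end)
        return page[start:stop]
    return extr

# usage, mirroring _extract_post() above:
# extr = extract_from(html)
# tags = extr(": ", "<"); md5 = extr("/_thumbs/", "/")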
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index 9b35e42..f19e008 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -263,7 +263,8 @@ class PixivBackgroundExtractor(PixivExtractor):
url = url.replace("_master1200.", ".")
work = self._make_work("background", url, detail["user"])
if url.endswith(".jpg"):
- work["_fallback"] = (url[:-4] + ".png",)
+ url = url[:-4]
+ work["_fallback"] = (url + ".png", url + ".gif")
return (work,)
diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py
index 16b9191..ca7a3c6 100644
--- a/gallery_dl/extractor/readcomiconline.py
+++ b/gallery_dl/extractor/readcomiconline.py
@@ -23,8 +23,7 @@ class ReadcomiconlineBase():
filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}"
archive_fmt = "{issue_id}_{page}"
root = "https://readcomiconline.li"
- browser = "firefox"
- request_interval = (1, 9)
+ request_interval = (3.0, 7.0)
def request(self, url, **kwargs):
"""Detect and handle redirects to CAPTCHA pages"""
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index 01538bf..d35e24e 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -144,8 +144,8 @@ class RedditExtractor(Extractor):
class RedditSubredditExtractor(RedditExtractor):
"""Extractor for URLs from subreddits on reddit.com"""
subcategory = "subreddit"
- pattern = (r"(?:https?://)?(?:\w+\.)?reddit\.com/r/"
- r"([^/?#]+(?:/([a-z]+))?)/?(?:\?([^#]*))?(?:$|#)")
+ pattern = (r"(?:https?://)?(?:\w+\.)?reddit\.com"
+ r"(/r/[^/?#]+(?:/([a-z]+))?)/?(?:\?([^#]*))?(?:$|#)")
test = (
("https://www.reddit.com/r/lavaporn/", {
"range": "1-20",
@@ -168,6 +168,20 @@ class RedditSubredditExtractor(RedditExtractor):
return self.api.submissions_subreddit(self.subreddit, self.params)
+class RedditHomeExtractor(RedditSubredditExtractor):
+ """Extractor for submissions from your home feed on reddit.com"""
+ subcategory = "home"
+ pattern = (r"(?:https?://)?(?:\w+\.)?reddit\.com"
+ r"((?:/([a-z]+))?)/?(?:\?([^#]*))?(?:$|#)")
+ test = (
+ ("https://www.reddit.com/", {
+ "range": "1-20",
+ "count": ">= 20",
+ }),
+ ("https://old.reddit.com/top/?sort=top&t=month"),
+ )
+
+
class RedditUserExtractor(RedditExtractor):
"""Extractor for URLs from posts by a reddit user"""
subcategory = "user"
@@ -321,7 +335,7 @@ class RedditAPI():
def submissions_subreddit(self, subreddit, params):
"""Collect all (submission, comments)-tuples of a subreddit"""
- endpoint = "/r/" + subreddit + "/.json"
+ endpoint = subreddit + "/.json"
params["limit"] = 100
return self._pagination(endpoint, params)
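Because the subreddit pattern now captures the whole '/r/<name>' path (empty for the front page, '/top' and similar for sorted listings), RedditHomeExtractor can reuse submissions_subreddit() unchanged: the API endpoint is simply the captured path plus "/.json". A quick check of what the two regexes from the diff capture:

import re

SUBREDDIT = re.compile(
    r"(?:https?://)?(?:\w+\.)?reddit\.com"
    r"(/r/[^/?#]+(?:/([a-z]+))?)/?(?:\?([^#]*))?(?:$|#)")
HOME = re.compile(
    r"(?:https?://)?(?:\w+\.)?reddit\.com"
    r"((?:/([a-z]+))?)/?(?:\?([^#]*))?(?:$|#)")

print(SUBREDDIT.match("https://www.reddit.com/r/lavaporn/").group(1))       # /r/lavaporn
print(HOME.match("https://old.reddit.com/top/?sort=top&t=month").group(1))  # /top
print(HOME.match("https://www.reddit.com/").group(1))                       # '' -> "/.json"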
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index 59e8be6..855833a 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -15,7 +15,8 @@ from ..cache import cache
import collections
BASE_PATTERN = r"(?:https?://)?" \
- r"(?:sankaku\.app|(?:beta|chan)\.sankakucomplex\.com)"
+ r"(?:(?:chan|beta|black|white)\.sankakucomplex\.com|sankaku\.app)" \
+ r"(?:/[a-z]{2})?"
class SankakuExtractor(BooruExtractor):
@@ -171,8 +172,11 @@ class SankakuPostExtractor(SankakuExtractor):
"tags_general": ["key(mangaka)", "key(mangaka)"],
},
}),
- ("https://beta.sankakucomplex.com/post/show/360451"),
("https://chan.sankakucomplex.com/post/show/360451"),
+ ("https://chan.sankakucomplex.com/ja/post/show/360451"),
+ ("https://beta.sankakucomplex.com/post/show/360451"),
+ ("https://white.sankakucomplex.com/post/show/360451"),
+ ("https://black.sankakucomplex.com/post/show/360451"),
)
def __init__(self, match):
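The widened BASE_PATTERN also accepts the black/white mirrors and an optional two-letter language prefix such as /ja/. A quick check of the new alternation (the /post/show suffix comes from the post extractor's own pattern):

import re

POST = re.compile(
    r"(?:https?://)?"
    r"(?:(?:chan|beta|black|white)\.sankakucomplex\.com|sankaku\.app)"
    r"(?:/[a-z]{2})?"
    r"/post/show/(\d+)")

for url in ("https://chan.sankakucomplex.com/ja/post/show/360451",
            "https://black.sankakucomplex.com/post/show/360451",
            "https://sankaku.app/post/show/360451"):
    print(POST.match(url).group(1))   # 360451 for each URL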
diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py
index b57013a..d2e298c 100644
--- a/gallery_dl/extractor/subscribestar.py
+++ b/gallery_dl/extractor/subscribestar.py
@@ -118,8 +118,8 @@ class SubscribestarExtractor(Extractor):
"author_id" : text.parse_int(extr('data-user-id="', '"')),
"author_name": text.unescape(extr('href="/', '"')),
"author_nick": text.unescape(extr('>', '<')),
- "date" : self._parse_datetime(text.remove_html(extr(
- 'class="post-date">', '</'))),
+ "date" : self._parse_datetime(extr(
+ 'class="post-date">', '</').rpartition(">")[2]),
"content" : (extr(
'<div class="post-content', '<div class="post-uploads')
.partition(">")[2]),
@@ -152,8 +152,6 @@ class SubscribestarUserExtractor(SubscribestarExtractor):
"type" : "re:image|video|attachment",
"url" : str,
"?pinned": bool,
- "?height": int,
- "?width" : int,
},
}),
("https://www.subscribestar.com/subscribestar", {
@@ -162,8 +160,8 @@ class SubscribestarUserExtractor(SubscribestarExtractor):
"range": "1",
}),
("https://subscribestar.adult/kanashiipanda", {
- "range": "21-40",
- "count": 20,
+ "range": "1-10",
+ "count": 10,
}),
)
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
index 1929f98..a7068c8 100644
--- a/gallery_dl/extractor/weibo.py
+++ b/gallery_dl/extractor/weibo.py
@@ -10,28 +10,52 @@
from .common import Extractor, Message
from .. import text, exception
+from ..cache import cache
import itertools
+import random
import json
+BASE_PATTERN = r"(?:https?://)?(?:www\.|m\.)?weibo\.c(?:om|n)"
+USER_PATTERN = BASE_PATTERN + r"/(?:(u|n|p(?:rofile)?)/)?([^/?#]+)(?:/home)?"
+
class WeiboExtractor(Extractor):
category = "weibo"
directory_fmt = ("{category}", "{user[screen_name]}")
filename_fmt = "{status[id]}_{num:>02}.{extension}"
archive_fmt = "{status[id]}_{num}"
- root = "https://m.weibo.cn"
+ root = "https://weibo.com"
request_interval = (1.0, 2.0)
def __init__(self, match):
Extractor.__init__(self, match)
+ self._prefix, self.user = match.groups()
self.retweets = self.config("retweets", True)
self.videos = self.config("videos", True)
+ self.livephoto = self.config("livephoto", True)
+
+ cookies = _cookie_cache()
+ if cookies is not None:
+ self.session.cookies.update(cookies)
+
+ def request(self, url, **kwargs):
+ response = Extractor.request(self, url, **kwargs)
+
+ if response.history and "passport.weibo.com" in response.url:
+ self._sina_visitor_system(response)
+ response = Extractor.request(self, url, **kwargs)
+
+ return response
def items(self):
original_retweets = (self.retweets == "original")
for status in self.statuses():
+ status["date"] = text.parse_datetime(
+ status["created_at"], "%a %b %d %H:%M:%S %z %Y")
+ yield Message.Directory, status
+
if self.retweets and "retweeted_status" in status:
if original_retweets:
status = status["retweeted_status"]
@@ -45,96 +69,69 @@ class WeiboExtractor(Extractor):
files = self._files_from_status(status)
for num, file in enumerate(files, 1):
- if num == 1:
- status["date"] = text.parse_datetime(
- status["created_at"], "%a %b %d %H:%M:%S %z %Y")
- yield Message.Directory, status
+ if file["url"].startswith("http:"):
+ file["url"] = "https:" + file["url"][5:]
+ if "filename" not in file:
+ text.nameext_from_url(file["url"], file)
file["status"] = status
file["num"] = num
yield Message.Url, file["url"], file
- def statuses(self):
- """Returns an iterable containing all relevant 'status' objects"""
+ def _files_from_status(self, status):
+ pic_ids = status.get("pic_ids")
+ if pic_ids:
+ pics = status["pic_infos"]
+ for pic_id in pic_ids:
+ pic = pics[pic_id]
+ pic_type = pic.get("type")
- def _status_by_id(self, status_id):
- url = "{}/detail/{}".format(self.root, status_id)
- page = self.request(url, fatal=False).text
- data = text.extract(page, "var $render_data = [", "][0] || {};")[0]
- return json.loads(data)["status"] if data else None
+ if pic_type == "gif" and self.videos:
+ yield {"url": pic["video"]}
- def _files_from_status(self, status):
- page_info = status.pop("page_info", ())
- if "pics" in status:
- if len(status["pics"]) < status["pic_num"]:
- status = self._status_by_id(status["id"]) or status
- for image in status.pop("pics"):
- pid = image["pid"]
- if "large" in image:
- image = image["large"]
- geo = image.get("geo") or {}
- yield text.nameext_from_url(image["url"], {
- "url" : image["url"],
- "pid" : pid,
- "width" : text.parse_int(geo.get("width")),
- "height": text.parse_int(geo.get("height")),
- })
-
- if self.videos and "media_info" in page_info:
- info = page_info["media_info"]
- url = info.get("stream_url_hd") or info.get("stream_url")
- if url:
- data = text.nameext_from_url(url, {
- "url" : url,
- "pid" : 0,
- "width" : 0,
- "height": 0,
- })
- if data["extension"] == "m3u8":
- data["extension"] = "mp4"
- data["url"] = "ytdl:" + url
- data["_ytdl_extra"] = {"protocol": "m3u8_native"}
- yield data
+ elif pic_type == "livephoto" and self.livephoto:
+ yield pic["largest"].copy()
+ file = {"url": pic["video"]}
+ file["filehame"], _, file["extension"] = \
+ pic["video"].rpartition("%2F")[2].rpartition(".")
+ yield file
-class WeiboUserExtractor(WeiboExtractor):
- """Extractor for all images of a user on weibo.cn"""
- subcategory = "user"
- pattern = (r"(?:https?://)?(?:www\.|m\.)?weibo\.c(?:om|n)"
- r"/(?:u|p(?:rofile)?)/(\d+)")
- test = (
- ("https://m.weibo.cn/u/2314621010", {
- "range": "1-30",
- }),
- # deleted (#2521)
- ("https://weibo.com/u/7500315942", {
- "count": 0,
- }),
- ("https://m.weibo.cn/profile/2314621010"),
- ("https://m.weibo.cn/p/2304132314621010_-_WEIBO_SECOND_PROFILE_WEIBO"),
- ("https://www.weibo.com/p/1003062314621010/home"),
- )
+ else:
+ yield pic["largest"].copy()
- def __init__(self, match):
- WeiboExtractor.__init__(self, match)
- self.user_id = match.group(1)[-10:]
+ if "page_info" in status:
+ page_info = status["page_info"]
+ if "media_info" not in page_info or not self.videos:
+ return
+ media = max(page_info["media_info"]["playback_list"],
+ key=lambda m: m["meta"]["quality_index"])
+ yield media["play_info"].copy()
- def statuses(self):
- url = self.root + "/api/container/getIndex"
+ def _status_by_id(self, status_id):
+ url = "{}/ajax/statuses/show?id={}".format(self.root, status_id)
+ return self.request(url).json()
+
+ def _user_id(self):
+ if self.user.isdecimal():
+ return self.user[-10:]
+ else:
+ url = "{}/ajax/profile/info?{}={}".format(
+ self.root,
+ "screen_name" if self._prefix == "n" else "custom",
+ self.user)
+ return self.request(url).json()["data"]["user"]["idstr"]
+
+ def _pagination(self, endpoint, params):
+ url = self.root + "/ajax" + endpoint
headers = {
- "Accept": "application/json, text/plain, */*",
"X-Requested-With": "XMLHttpRequest",
- "MWeibo-Pwa": "1",
"X-XSRF-TOKEN": None,
- "Referer": "{}/u/{}".format(self.root, self.user_id),
- }
- params = {
- "type": "uid",
- "value": self.user_id,
- "containerid": "107603" + self.user_id,
+ "Referer": "{}/u/{}".format(self.root, params["uid"]),
}
while True:
response = self.request(url, params=params, headers=headers)
+ headers["Accept"] = "application/json, text/plain, */*"
headers["X-XSRF-TOKEN"] = response.cookies.get("XSRF-TOKEN")
data = response.json()
@@ -145,56 +142,211 @@ class WeiboUserExtractor(WeiboExtractor):
'"%s"', data.get("msg") or "unknown error")
data = data["data"]
- for card in data["cards"]:
- if "mblog" in card:
- yield card["mblog"]
-
- info = data.get("cardlistInfo")
- if not info:
- # occasionally weibo returns an empty response
- # repeating the same request usually/eventually yields
- # the correct response.
- continue
-
- params["since_id"] = sid = info.get("since_id")
- if not sid:
+ statuses = data["list"]
+ if not statuses:
return
+ yield from statuses
+
+ if "next_cursor" in data:
+ params["cursor"] = data["next_cursor"]
+ elif "page" in params:
+ params["page"] += 1
+ elif data["since_id"]:
+ params["sinceid"] = data["since_id"]
+ else:
+ params["since_id"] = statuses[-1]["id"] - 1
+
+ def _sina_visitor_system(self, response):
+ self.log.info("Sina Visitor System")
+
+ passport_url = "https://passport.weibo.com/visitor/genvisitor"
+ headers = {"Referer": response.url}
+ data = {
+ "cb": "gen_callback",
+ "fp": '{"os":"1","browser":"Gecko91,0,0,0","fonts":"undefined",'
+ '"screenInfo":"1920*1080*24","plugins":""}',
+ }
+
+ page = Extractor.request(
+ self, passport_url, method="POST", headers=headers, data=data).text
+ data = json.loads(text.extract(page, "(", ");")[0])["data"]
+
+ passport_url = "https://passport.weibo.com/visitor/visitor"
+ params = {
+ "a" : "incarnate",
+ "t" : data["tid"],
+ "w" : "2",
+ "c" : "{:>03}".format(data["confidence"]),
+ "gc" : "",
+ "cb" : "cross_domain",
+ "from" : "weibo",
+ "_rand": random.random(),
+ }
+ response = Extractor.request(self, passport_url, params=params)
+ _cookie_cache.update("", response.cookies)
+
+
+class WeiboUserExtractor(WeiboExtractor):
+ """Extractor for weibo user profiles"""
+ subcategory = "user"
+ pattern = USER_PATTERN + r"(?:$|#)"
+ test = (
+ ("https://weibo.com/1758989602"),
+ ("https://weibo.com/u/1758989602"),
+ ("https://weibo.com/p/1758989602"),
+ ("https://m.weibo.cn/profile/2314621010"),
+ ("https://m.weibo.cn/p/2304132314621010_-_WEIBO_SECOND_PROFILE_WEIBO"),
+ ("https://www.weibo.com/p/1003062314621010/home"),
+ )
+
+ def items(self):
+ base = " {}/u/{}?tabtype=".format(self.root, self._user_id())
+ return self._dispatch_extractors((
+ (WeiboHomeExtractor , base + "home"),
+ (WeiboFeedExtractor , base + "feed"),
+ (WeiboVideosExtractor, base + "newVideo"),
+ (WeiboAlbumExtractor , base + "album"),
+ ), ("feed",))
+
+
+class WeiboHomeExtractor(WeiboExtractor):
+ """Extractor for weibo 'home' listings"""
+ subcategory = "home"
+ pattern = USER_PATTERN + r"\?tabtype=home"
+ test = ("https://weibo.com/1758989602?tabtype=home", {
+ "range": "1-30",
+ "count": 30,
+ })
+
+ def statuses(self):
+ endpoint = "/profile/myhot"
+ params = {"uid": self._user_id(), "page": 1, "feature": "2"}
+ return self._pagination(endpoint, params)
+
+
+class WeiboFeedExtractor(WeiboExtractor):
+ """Extractor for weibo user feeds"""
+ subcategory = "feed"
+ pattern = USER_PATTERN + r"\?tabtype=feed"
+ test = (
+ ("https://weibo.com/1758989602?tabtype=feed", {
+ "range": "1-30",
+ "count": 30,
+ }),
+ ("https://weibo.com/zhouyuxi77?tabtype=feed", {
+ "keyword": {"status": {"user": {"id": 7488709788}}},
+ "range": "1",
+ }),
+ ("https://www.weibo.com/n/周于希Sally?tabtype=feed", {
+ "keyword": {"status": {"user": {"id": 7488709788}}},
+ "range": "1",
+ }),
+ # deleted (#2521)
+ ("https://weibo.com/u/7500315942?tabtype=feed", {
+ "count": 0,
+ }),
+ )
+
+ def statuses(self):
+ endpoint = "/statuses/mymblog"
+ params = {"uid": self._user_id(), "feature": "0"}
+ return self._pagination(endpoint, params)
+
+
+class WeiboVideosExtractor(WeiboExtractor):
+ """Extractor for weibo 'newVideo' listings"""
+ subcategory = "videos"
+ pattern = USER_PATTERN + r"\?tabtype=newVideo"
+ test = ("https://weibo.com/1758989602?tabtype=newVideo", {
+ "pattern": r"https://f\.video\.weibocdn\.com/(../)?\w+\.mp4\?label=mp",
+ "range": "1-30",
+ "count": 30,
+ })
+
+ def statuses(self):
+ endpoint = "/profile/getWaterFallContent"
+ params = {"uid": self._user_id()}
+ return self._pagination(endpoint, params)
+
+
+class WeiboArticleExtractor(WeiboExtractor):
+ """Extractor for weibo 'article' listings"""
+ subcategory = "article"
+ pattern = USER_PATTERN + r"\?tabtype=article"
+ test = ("https://weibo.com/1758989602?tabtype=article", {
+ "count": 0,
+ })
+
+ def statuses(self):
+ endpoint = "/statuses/mymblog"
+ params = {"uid": self._user_id(), "page": 1, "feature": "10"}
+ return self._pagination(endpoint, params)
+
+
+class WeiboAlbumExtractor(WeiboExtractor):
+ """Extractor for weibo 'album' listings"""
+ subcategory = "album"
+ pattern = USER_PATTERN + r"\?tabtype=album"
+ test = ("https://weibo.com/1758989602?tabtype=album", {
+ "pattern": r"https://wx\d+\.sinaimg\.cn/large/\w{32}\.(jpg|png|gif)",
+ "range": "1-3",
+ "count": 3,
+ })
+
+ def statuses(self):
+ endpoint = "/profile/getImageWall"
+ params = {"uid": self._user_id()}
+
+ seen = set()
+ for image in self._pagination(endpoint, params):
+ mid = image["mid"]
+ if mid not in seen:
+ seen.add(mid)
+ yield self._status_by_id(mid)
class WeiboStatusExtractor(WeiboExtractor):
"""Extractor for images from a status on weibo.cn"""
subcategory = "status"
- pattern = (r"(?:https?://)?(?:www\.|m\.)?weibo\.c(?:om|n)"
- r"/(?:detail|status|\d+)/(\w+)")
+ pattern = BASE_PATTERN + r"/(detail|status|\d+)/(\w+)"
test = (
("https://m.weibo.cn/detail/4323047042991618", {
"pattern": r"https?://wx\d+.sinaimg.cn/large/\w+.jpg",
"keyword": {"status": {"date": "dt:2018-12-30 13:56:36"}},
}),
("https://m.weibo.cn/detail/4339748116375525", {
- "pattern": r"https?://f.us.sinaimg.cn/\w+\.mp4\?label=mp4_hd",
+ "pattern": r"https?://f.us.sinaimg.cn/\w+\.mp4\?label=mp4_1080p",
}),
# unavailable video (#427)
("https://m.weibo.cn/status/4268682979207023", {
- "exception": exception.NotFoundError,
+ "exception": exception.HttpError,
}),
# non-numeric status ID (#664)
("https://weibo.com/3314883543/Iy7fj4qVg"),
# original retweets (#1542)
("https://m.weibo.cn/detail/4600272267522211", {
"options": (("retweets", "original"),),
- "keyword": {"status": {"id": "4600167083287033"}},
+ "keyword": {"status": {"id": 4600167083287033}},
+ }),
+ # type == livephoto (#2146)
+ ("https://weibo.com/5643044717/KkuDZ4jAA", {
+ "range": "2,4,6",
+ "pattern": r"https://video\.weibo\.com/media/play\?livephoto="
+ r"https%3A%2F%2Fus.sinaimg.cn%2F\w+\.mov",
+ }),
+ # type == gif
+ ("https://weibo.com/1758989602/LvBhm5DiP", {
+ "pattern": r"http://g\.us\.sinaimg.cn/o0/qNZcaAAglx07Wuf921CM01041"
+ r"20005tc0E010\.mp4\?label=gif_mp4",
}),
("https://m.weibo.cn/status/4339748116375525"),
("https://m.weibo.cn/5746766133/4339748116375525"),
)
- def __init__(self, match):
- WeiboExtractor.__init__(self, match)
- self.status_id = match.group(1)
-
def statuses(self):
- status = self._status_by_id(self.status_id)
- if not status:
- raise exception.NotFoundError("status")
- return (status,)
+ return (self._status_by_id(self.user),)
+
+
+@cache(maxage=356*86400)
+def _cookie_cache():
+ return None
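The rewritten weibo extractors talk to weibo.com's ajax API, which redirects anonymous clients to passport.weibo.com; _sina_visitor_system() runs the genvisitor/incarnate handshake once and _cookie_cache() keeps the resulting cookies for 356 days. A hedged standalone sketch of that handshake with requests (field names copied from the diff; the response format is weibo's and may change):

import json
import random
import requests

def sina_visitor_cookies(session, referer):
    data = {
        "cb": "gen_callback",
        "fp": '{"os":"1","browser":"Gecko91,0,0,0","fonts":"undefined",'
              '"screenInfo":"1920*1080*24","plugins":""}',
    }
    page = session.post("https://passport.weibo.com/visitor/genvisitor",
                        headers={"Referer": referer}, data=data).text
    # response is a JSONP callback: gen_callback({...});
    info = json.loads(page[page.index("(") + 1:page.index(");")])["data"]

    params = {
        "a": "incarnate",
        "t": info["tid"],
        "w": "2",
        "c": "{:>03}".format(info["confidence"]),
        "gc": "",
        "cb": "cross_domain",
        "from": "weibo",
        "_rand": random.random(),
    }
    session.get("https://passport.weibo.com/visitor/visitor", params=params)
    return session.cookies   # visitor cookies usable on the ajax endpoints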