| field | value |
|---|---|
| author | 2025-03-01 19:51:39 -0500 |
| committer | 2025-03-01 19:51:39 -0500 |
| commit | 889c7b8caec8fc0b9c7a583ed1d9cfa43518fc42 (patch) |
| tree | cff4a7de7032843e4efe521d92dfce485ae944f1 /gallery_dl/extractor |
| parent | a26df18796ff4e506b16bf32fcec9336233b9e2e (diff) |
New upstream version 1.29.0 (tag: upstream/1.29.0)
Diffstat (limited to 'gallery_dl/extractor')
31 files changed, 1320 insertions(+), 88 deletions(-)
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index fc8d7b2..00b22d4 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -44,6 +44,7 @@ modules = [ "danbooru", "desktopography", "deviantart", + "discord", "dynastyscans", "e621", "erome", @@ -56,6 +57,7 @@ modules = [ "fapachi", "flickr", "furaffinity", + "furry34", "fuskator", "gelbooru", "gelbooru_v01", @@ -80,6 +82,7 @@ modules = [ "imgbox", "imgth", "imgur", + "imhentai", "inkbunny", "instagram", "issuu", @@ -168,6 +171,7 @@ modules = [ "tapas", "tcbscans", "telegraph", + "tiktok", "tmohentai", "toyhouse", "tsumino", diff --git a/gallery_dl/extractor/bilibili.py b/gallery_dl/extractor/bilibili.py index b9de165..597ec40 100644 --- a/gallery_dl/extractor/bilibili.py +++ b/gallery_dl/extractor/bilibili.py @@ -81,6 +81,27 @@ class BilibiliArticleExtractor(BilibiliExtractor): yield Message.Url, url, text.nameext_from_url(url, article) +class BilibiliUserArticlesFavoriteExtractor(BilibiliExtractor): + subcategory = "user-articles-favorite" + pattern = (r"(?:https?://)?space\.bilibili\.com" + r"/(\d+)/favlist\?fid=opus") + example = "https://space.bilibili.com/12345/favlist?fid=opus" + _warning = True + + def _init(self): + BilibiliExtractor._init(self) + if self._warning: + if not self.cookies_check(("SESSDATA",)): + self.log.error("'SESSDATA' cookie required") + BilibiliUserArticlesFavoriteExtractor._warning = False + + def items(self): + for article in self.api.user_favlist(): + article["_extractor"] = BilibiliArticleExtractor + url = "{}/opus/{}".format(self.root, article["opus_id"]) + yield Message.Queue, url, article + + class BilibiliAPI(): def __init__(self, extractor): self.extractor = extractor @@ -122,3 +143,28 @@ class BilibiliAPI(): raise exception.StopExtraction( "%s: Unable to extract INITIAL_STATE data", article_id) self.extractor.wait(seconds=300) + + def user_favlist(self): + endpoint = "/opus/feed/fav" + params = {"page": 1, "page_size": 20} + + while True: + data = self._call(endpoint, params)["data"] + + yield from data["items"] + + if not data.get("has_more"): + break + params["page"] += 1 + + def login_user_id(self): + url = "https://api.bilibili.com/x/space/v2/myinfo" + data = self.extractor.request(url).json() + + if data["code"] != 0: + self.extractor.log.debug("Server response: %s", data) + raise exception.StopExtraction("API request failed,Are you login?") + try: + return data["data"]["profile"]["mid"] + except Exception: + raise exception.StopExtraction("API request failed") diff --git a/gallery_dl/extractor/boosty.py b/gallery_dl/extractor/boosty.py index c28fad9..f3e441b 100644 --- a/gallery_dl/extractor/boosty.py +++ b/gallery_dl/extractor/boosty.py @@ -8,6 +8,7 @@ from .common import Extractor, Message from .. 
import text, util, exception +import itertools BASE_PATTERN = r"(?:https?://)?boosty\.to" @@ -53,7 +54,9 @@ class BoostyExtractor(Extractor): self.log.warning("Not allowed to access post %s", post["id"]) continue - files = self._process_post(post) + files = self._extract_files(post) + if self._user: + post["user"] = self._user data = { "post" : post, "user" : post.pop("user", None), @@ -69,15 +72,13 @@ class BoostyExtractor(Extractor): def posts(self): """Yield JSON content of all relevant posts""" - def _process_post(self, post): + def _extract_files(self, post): files = [] post["content"] = content = [] post["links"] = links = [] if "createdAt" in post: post["date"] = text.parse_timestamp(post["createdAt"]) - if self._user: - post["user"] = self._user for block in post["data"]: try: @@ -94,7 +95,7 @@ class BoostyExtractor(Extractor): elif type == "ok_video": if not self.videos: self.log.debug("%s: Skipping video %s", - post["int_id"], block["id"]) + post["id"], block["id"]) continue fmts = { fmt["type"]: fmt["url"] @@ -114,7 +115,7 @@ class BoostyExtractor(Extractor): else: self.log.warning( "%s: Found no suitable video format for %s", - post["int_id"], block["id"]) + post["id"], block["id"]) elif type == "link": url = block["url"] @@ -127,9 +128,12 @@ class BoostyExtractor(Extractor): elif type == "file": files.append(self._update_url(post, block)) + elif type == "smile": + content.append(":" + block["name"] + ":") + else: self.log.debug("%s: Unsupported data type '%s'", - post["int_id"], type) + post["id"], type) except Exception as exc: self.log.debug("%s: %s", exc.__class__.__name__, exc) @@ -219,6 +223,51 @@ class BoostyFollowingExtractor(BoostyExtractor): yield Message.Queue, url, user +class BoostyDirectMessagesExtractor(BoostyExtractor): + """Extractor for boosty.to direct messages""" + subcategory = "direct-messages" + directory_fmt = ("{category}", "{user[blogUrl]} ({user[id]})", + "Direct Messages") + pattern = BASE_PATTERN + r"/app/messages/?\?dialogId=(\d+)" + example = "https://boosty.to/app/messages?dialogId=12345" + + def items(self): + """Yield direct messages from a given dialog ID.""" + dialog_id = self.groups[0] + response = self.api.dialog(dialog_id) + signed_query = response.get("signedQuery") + + try: + messages = response["messages"]["data"] + offset = messages[0]["id"] + except Exception: + return + + try: + user = self.api.user(response["chatmate"]["url"]) + except Exception: + user = None + + messages.reverse() + for message in itertools.chain( + messages, + self.api.dialog_messages(dialog_id, offset=offset) + ): + message["signedQuery"] = signed_query + files = self._extract_files(message) + data = { + "post": message, + "user": user, + "count": len(files), + } + + yield Message.Directory, data + for data["num"], file in enumerate(files, 1): + data["file"] = file + url = file["url"] + yield Message.Url, url, text.nameext_from_url(url, data) + + class BoostyAPI(): """Interface for the Boosty API""" root = "https://api.boosty.to" @@ -367,3 +416,32 @@ class BoostyAPI(): if offset > data["total"]: return params["offset"] = offset + + def dialog(self, dialog_id): + endpoint = "/v1/dialog/{}".format(dialog_id) + return self._call(endpoint) + + def dialog_messages(self, dialog_id, limit=300, offset=None): + endpoint = "/v1/dialog/{}/message/".format(dialog_id) + params = { + "limit": limit, + "reverse": "true", + "offset": offset, + } + return self._pagination_dialog(endpoint, params) + + def _pagination_dialog(self, endpoint, params): + while True: + data = 
self._call(endpoint, params) + + yield from data["data"] + + try: + extra = data["extra"] + if extra.get("isLast"): + break + params["offset"] = offset = extra["offset"] + if not offset: + break + except Exception: + break diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 25e9fd5..201b8f4 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -10,7 +10,8 @@ from .common import Extractor from .lolisafe import LolisafeAlbumExtractor -from .. import text, config, exception +from .. import text, util, config, exception +import binascii import random if config.get(("extractor", "bunkr"), "tlds"): @@ -60,6 +61,8 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): """Extractor for bunkr.si albums""" category = "bunkr" root = "https://bunkr.si" + root_dl = "https://get.bunkrr.su" + archive_fmt = "{album_id}_{id|id_url}" pattern = BASE_PATTERN + r"/a/([^/?#]+)" example = "https://bunkr.si/a/ID" @@ -68,6 +71,11 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): domain = self.groups[0] or self.groups[1] if domain not in LEGACY_DOMAINS: self.root = "https://" + domain + self.offset = 0 + + def skip(self, num): + self.offset = num + return num def request(self, url, **kwargs): kwargs["encoding"] = "utf-8" @@ -132,6 +140,9 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): } def _extract_files(self, items): + if self.offset: + items = util.advance(items, self.offset) + for item in items: try: url = text.unescape(text.extr(item, ' href="', '"')) @@ -154,26 +165,43 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): self.log.debug("", exc_info=exc) def _extract_file(self, webpage_url): - response = self.request(webpage_url) - page = response.text - file_url = (text.extr(page, '<source src="', '"') or - text.extr(page, '<img src="', '"')) + page = self.request(webpage_url).text + data_id = text.extr(page, 'data-file-id="', '"') + referer = self.root_dl + "/file/" + data_id + + url = self.root_dl + "/api/vs" + headers = {"Referer": referer} + data = self.request( + url, method="POST", headers=headers, json={"id": data_id}).json() + + if data.get("encrypted"): + file_url = self._decrypt_url(data["url"], data["timestamp"]) + else: + file_url = data["url"] + file_name = (text.extr(page, 'property="og:title" content="', '"') or text.extr(page, "<title>", " | Bunkr<")) - - if not file_url: - webpage_url = text.unescape(text.rextract( - page, ' href="', '"', page.rindex("Download"))[0]) - response = self.request(webpage_url) - file_url = text.rextract(response.text, ' href="', '"')[0] + fallback = text.extr(page, 'property="og:url" content="', '"') return { - "file" : text.unescape(file_url), + "file" : file_url, "name" : text.unescape(file_name), - "_http_headers" : {"Referer": response.url}, + "id_url" : data_id, + "_fallback" : (fallback,) if fallback else (), + "_http_headers" : {"Referer": referer}, "_http_validate": self._validate, } + def _decrypt_url(self, encrypted_b64, timestamp): + encrypted_bytes = binascii.a2b_base64(encrypted_b64) + key = "SECRET_KEY_{}".format(timestamp // 3600).encode() + div = len(key) + + return bytes([ + encrypted_bytes[i] ^ key[i % div] + for i in range(len(encrypted_bytes)) + ]).decode() + def _validate(self, response): if response.history and response.url.endswith("/maintenance-vid.mp4"): self.log.warning("File server in maintenance mode") diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py index aedcea4..de22a7b 100644 --- a/gallery_dl/extractor/chevereto.py +++ 
b/gallery_dl/extractor/chevereto.py @@ -57,7 +57,8 @@ class CheveretoImageExtractor(CheveretoExtractor): image = { "id" : self.path.rpartition(".")[2], - "url" : extr('<meta property="og:image" content="', '"'), + "url" : (extr('<meta property="og:image" content="', '"') or + extr('url: "', '"')), "album": text.extr(extr("Added to <a", "/a>"), ">", "<"), "user" : extr('username: "', '"'), } diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 13fd88a..d58db6f 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -915,7 +915,7 @@ def _build_requests_adapter(ssl_options, ssl_ciphers, source_address): options=ssl_options or None, ciphers=ssl_ciphers) if not requests.__version__ < "2.32": # https://github.com/psf/requests/pull/6731 - ssl_context.load_default_certs() + ssl_context.load_verify_locations(requests.certs.where()) ssl_context.check_hostname = False else: ssl_context = None diff --git a/gallery_dl/extractor/discord.py b/gallery_dl/extractor/discord.py new file mode 100644 index 0000000..6a5fcc9 --- /dev/null +++ b/gallery_dl/extractor/discord.py @@ -0,0 +1,399 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://discord.com/""" + +from .common import Extractor, Message +from .. import text, exception + + +BASE_PATTERN = r"(?:https?://)?discord\.com" + + +class DiscordExtractor(Extractor): + """Base class for Discord extractors""" + category = "discord" + root = "https://discord.com" + directory_fmt = ("{category}", "{server_id}_{server}", + "{channel_id}_{channel}") + filename_fmt = "{message_id}_{num:>02}_{filename}.{extension}" + archive_fmt = "{message_id}_{num}" + + cdn_fmt = "https://cdn.discordapp.com/{}/{}/{}.png?size=4096" + + server_metadata = {} + server_channels_metadata = {} + + def _init(self): + self.token = self.config("token") + self.enabled_embeds = self.config("embeds", ["image", "gifv", "video"]) + self.enabled_threads = self.config("threads", True) + self.api = DiscordAPI(self) + + def extract_message_text(self, message): + text_content = [message["content"]] + + for embed in message["embeds"]: + if embed["type"] == "rich": + try: + text_content.append(embed["author"]["name"]) + except Exception: + pass + text_content.append(embed.get("title", "")) + text_content.append(embed.get("description", "")) + + for field in embed.get("fields", []): + text_content.append(field.get("name", "")) + text_content.append(field.get("value", "")) + + text_content.append(embed.get("footer", {}).get("text", "")) + + if message.get("poll"): + text_content.append(message["poll"]["question"]["text"]) + for answer in message["poll"]["answers"]: + text_content.append(answer["poll_media"]["text"]) + + return "\n".join(t for t in text_content if t) + + def extract_message(self, message): + # https://discord.com/developers/docs/resources/message#message-object-message-types + if message["type"] in (0, 19, 21): + message_metadata = {} + message_metadata.update(self.server_metadata) + message_metadata.update( + self.server_channels_metadata[message["channel_id"]]) + message_metadata.update({ + "author": message["author"]["username"], + "author_id": message["author"]["id"], + "author_files": [], + "message": self.extract_message_text(message), + "message_id": message["id"], + "date": text.parse_datetime( + message["timestamp"], 
"%Y-%m-%dT%H:%M:%S.%f%z" + ), + "files": [] + }) + + for icon_type, icon_path in ( + ("avatar", "avatars"), + ("banner", "banners") + ): + if message["author"].get(icon_type): + message_metadata["author_files"].append({ + "url": self.cdn_fmt.format( + icon_path, + message_metadata["author_id"], + message["author"][icon_type] + ), + "filename": icon_type, + "extension": "png", + }) + + for attachment in message["attachments"]: + message_metadata["files"].append({ + "url": attachment["url"], + "type": "attachment", + }) + + for embed in message["embeds"]: + if embed["type"] in self.enabled_embeds: + for field in ("video", "image", "thumbnail"): + if field not in embed: + continue + url = embed[field].get("proxy_url") + if url is not None: + message_metadata["files"].append({ + "url": url, + "type": "embed", + }) + break + + for num, file in enumerate(message_metadata["files"], start=1): + text.nameext_from_url(file["url"], file) + file["num"] = num + + yield Message.Directory, message_metadata + + for file in message_metadata["files"]: + message_metadata_file = message_metadata.copy() + message_metadata_file.update(file) + yield Message.Url, file["url"], message_metadata_file + + def extract_channel_text(self, channel_id): + for message in self.api.get_channel_messages(channel_id): + yield from self.extract_message(message) + + def extract_channel_threads(self, channel_id): + for thread in self.api.get_channel_threads(channel_id): + id = self.parse_channel(thread)["channel_id"] + yield from self.extract_channel_text(id) + + def extract_channel(self, channel_id, safe=False): + try: + if channel_id not in self.server_channels_metadata: + self.parse_channel(self.api.get_channel(channel_id)) + + channel_type = ( + self.server_channels_metadata[channel_id]["channel_type"] + ) + + # https://discord.com/developers/docs/resources/channel#channel-object-channel-types + if channel_type in (0, 5): + yield from self.extract_channel_text(channel_id) + if self.enabled_threads: + yield from self.extract_channel_threads(channel_id) + elif channel_type in (1, 3, 10, 11, 12): + yield from self.extract_channel_text(channel_id) + elif channel_type in (15, 16): + yield from self.extract_channel_threads(channel_id) + elif channel_type in (4,): + for channel in self.server_channels_metadata.copy().values(): + if channel["parent_id"] == channel_id: + yield from self.extract_channel( + channel["channel_id"], safe=True) + elif not safe: + raise exception.StopExtraction( + "This channel type is not supported." 
+ ) + except exception.HttpError as exc: + if not (exc.status == 403 and safe): + raise + + def parse_channel(self, channel): + parent_id = channel.get("parent_id") + channel_metadata = { + "channel": channel.get("name", ""), + "channel_id": channel.get("id"), + "channel_type": channel.get("type"), + "channel_topic": channel.get("topic", ""), + "parent_id": parent_id, + "is_thread": "thread_metadata" in channel + } + + if parent_id in self.server_channels_metadata: + parent_metadata = self.server_channels_metadata[parent_id] + channel_metadata.update({ + "parent": parent_metadata["channel"], + "parent_type": parent_metadata["channel_type"] + }) + + if channel_metadata["channel_type"] in (1, 3): + channel_metadata.update({ + "channel": "DMs", + "recipients": ( + [user["username"] for user in channel["recipients"]] + ), + "recipients_id": ( + [user["id"] for user in channel["recipients"]] + ) + }) + + channel_id = channel_metadata["channel_id"] + + self.server_channels_metadata[channel_id] = channel_metadata + return channel_metadata + + def parse_server(self, server): + self.server_metadata = { + "server": server["name"], + "server_id": server["id"], + "server_files": [], + "owner_id": server["owner_id"] + } + + for icon_type, icon_path in ( + ("icon", "icons"), + ("banner", "banners"), + ("splash", "splashes"), + ("discovery_splash", "discovery-splashes") + ): + if server.get(icon_type): + self.server_metadata["server_files"].append({ + "url": self.cdn_fmt.format( + icon_path, + self.server_metadata["server_id"], + server[icon_type] + ), + "filename": icon_type, + "extension": "png", + }) + + return self.server_metadata + + def build_server_and_channels(self, server_id): + server = self.api.get_server(server_id) + self.parse_server(server) + + for channel in self.api.get_server_channels(server_id): + self.parse_channel(channel) + + +class DiscordChannelExtractor(DiscordExtractor): + subcategory = "channel" + pattern = BASE_PATTERN + r"/channels/(\d+)/(?:\d+/threads/)?(\d+)/?$" + example = "https://discord.com/channels/1234567890/9876543210" + + def items(self): + server_id, channel_id = self.groups + + self.build_server_and_channels(server_id) + + return self.extract_channel(channel_id) + + +class DiscordMessageExtractor(DiscordExtractor): + subcategory = "message" + pattern = BASE_PATTERN + r"/channels/(\d+)/(\d+)/(\d+)/?$" + example = "https://discord.com/channels/1234567890/9876543210/2468013579" + + def items(self): + server_id, channel_id, message_id = self.groups + + self.build_server_and_channels(server_id) + + if channel_id not in self.server_channels_metadata: + self.parse_channel(self.api.get_channel(channel_id)) + + return self.extract_message( + self.api.get_message(channel_id, message_id)) + + +class DiscordServerExtractor(DiscordExtractor): + subcategory = "server" + pattern = BASE_PATTERN + r"/channels/(\d+)/?$" + example = "https://discord.com/channels/1234567890" + + def items(self): + server_id = self.groups[0] + + self.build_server_and_channels(server_id) + + for channel in self.server_channels_metadata.copy().values(): + if channel["channel_type"] in (0, 5, 15, 16): + yield from self.extract_channel( + channel["channel_id"], safe=True) + + +class DiscordDirectMessagesExtractor(DiscordExtractor): + subcategory = "direct-messages" + directory_fmt = ("{category}", "Direct Messages", + "{channel_id}_{recipients:J,}") + pattern = BASE_PATTERN + r"/channels/@me/(\d+)/?$" + example = "https://discord.com/channels/@me/1234567890" + + def items(self): + return 
self.extract_channel(self.groups[0]) + + +class DiscordDirectMessageExtractor(DiscordExtractor): + subcategory = "direct-message" + directory_fmt = ("{category}", "Direct Messages", + "{channel_id}_{recipients:J,}") + pattern = BASE_PATTERN + r"/channels/@me/(\d+)/(\d+)/?$" + example = "https://discord.com/channels/@me/1234567890/9876543210" + + def items(self): + channel_id, message_id = self.groups + + self.parse_channel(self.api.get_channel(channel_id)) + + return self.extract_message( + self.api.get_message(channel_id, message_id)) + + +class DiscordAPI(): + """Interface for the Discord API v10 + + https://discord.com/developers/docs/reference + """ + + def __init__(self, extractor): + self.extractor = extractor + self.root = extractor.root + "/api/v10" + self.headers = {"Authorization": extractor.token} + + def get_server(self, server_id): + """Get server information""" + return self._call("/guilds/" + server_id) + + def get_server_channels(self, server_id): + """Get server channels""" + return self._call("/guilds/" + server_id + "/channels") + + def get_channel(self, channel_id): + """Get channel information""" + return self._call("/channels/" + channel_id) + + def get_channel_threads(self, channel_id): + """Get channel threads""" + THREADS_BATCH = 25 + + def _method(offset): + return self._call("/channels/" + channel_id + "/threads/search", { + "sort_by": "last_message_time", + "sort_order": "desc", + "limit": THREADS_BATCH, + "offset": + offset, + })["threads"] + + return self._pagination(_method, THREADS_BATCH) + + def get_channel_messages(self, channel_id): + """Get channel messages""" + MESSAGES_BATCH = 100 + + before = None + + def _method(_): + nonlocal before + messages = self._call("/channels/" + channel_id + "/messages", { + "limit": MESSAGES_BATCH, + "before": before + }) + before = messages[-1]["id"] + return messages + + return self._pagination(_method, MESSAGES_BATCH) + + def get_message(self, channel_id, message_id): + """Get message information""" + return self._call("/channels/" + channel_id + "/messages", { + "limit": 1, + "around": message_id + })[0] + + def _call(self, endpoint, params=None): + url = self.root + endpoint + try: + response = self.extractor.request( + url, params=params, headers=self.headers) + except exception.HttpError as exc: + if exc.status == 401: + self._raise_invalid_token() + raise + return response.json() + + def _pagination(self, method, batch): + offset = 0 + while True: + data = method(offset) + yield from data + if len(data) < batch: + return + offset += len(data) + + @staticmethod + def _raise_invalid_token(): + raise exception.AuthenticationError("""Invalid or missing token. 
+Please provide a valid token following these instructions: + +1) Open Discord in your browser (https://discord.com/app); +2) Open your browser's Developer Tools (F12) and switch to the Network panel; +3) Reload the page and select any request going to https://discord.com/api/...; +4) In the "Headers" tab, look for an entry beginning with "Authorization: "; +5) Right-click the entry and click "Copy Value"; +6) Paste the token in your configuration file under "extractor.discord.token", +or run this command with the -o "token=[your token]" argument.""") diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py index e6d136f..55549de 100644 --- a/gallery_dl/extractor/erome.py +++ b/gallery_dl/extractor/erome.py @@ -44,6 +44,8 @@ class EromeExtractor(Extractor): pos = page.index('<div class="user-profile', pos) user, pos = text.extract( page, 'href="https://www.erome.com/', '"', pos) + tags, pos = text.extract( + page, '<p class="mt-10"', '</p>', pos) urls = [] date = None @@ -59,11 +61,13 @@ class EromeExtractor(Extractor): date = text.parse_timestamp(ts) data = { - "album_id" : album_id, - "title" : text.unescape(title), - "user" : text.unquote(user), - "count" : len(urls), - "date" : date, + "album_id": album_id, + "title" : text.unescape(title), + "user" : text.unquote(user), + "count" : len(urls), + "date" : date, + "tags" : [t.replace("+", " ") + for t in text.extract_iter(tags, "?q=", '"')], "_http_headers": {"Referer": url}, } diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index 44c4542..5f90afc 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -81,8 +81,8 @@ BASE_PATTERN = FoolfuukaExtractor.update({ "pattern": r"(?:www\.)?archiveofsins\.com", }, "b4k": { - "root": "https://arch.b4k.co", - "pattern": r"arch\.b4k\.co", + "root": "https://arch.b4k.dev", + "pattern": r"arch\.b4k\.(?:dev|co)", }, "desuarchive": { "root": "https://desuarchive.org", diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py index d253582..1466390 100644 --- a/gallery_dl/extractor/furaffinity.py +++ b/gallery_dl/extractor/furaffinity.py @@ -23,6 +23,7 @@ class FuraffinityExtractor(Extractor): cookies_domain = ".furaffinity.net" cookies_names = ("a", "b") root = "https://www.furaffinity.net" + request_interval = 1.0 _warning = True def __init__(self, match): diff --git a/gallery_dl/extractor/furry34.py b/gallery_dl/extractor/furry34.py new file mode 100644 index 0000000..e0c7fdb --- /dev/null +++ b/gallery_dl/extractor/furry34.py @@ -0,0 +1,156 @@ +# -*- coding: utf-8 -*- + +# Copyright 2025 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://furry34.com/""" + +from .booru import BooruExtractor +from .. 
import text +import collections + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?furry34\.com" + + +class Furry34Extractor(BooruExtractor): + category = "furry34" + root = "https://furry34.com" + root_cdn = "https://furry34com.b-cdn.net" + filename_fmt = "{category}_{id}.{extension}" + per_page = 30 + + TAG_TYPES = { + None: "general", + 1 : "general", + 2 : "copyright", + 4 : "character", + 8 : "artist", + } + FORMATS = ( + ("100", "mov.mp4"), + ("101", "mov720.mp4"), + ("102", "mov480.mp4"), + ("10" , "pic.jpg"), + ) + + def _file_url(self, post): + files = post["files"] + for fmt, extension in self.FORMATS: + if fmt in files: + break + else: + fmt = next(iter(files)) + + post_id = post["id"] + root = self.root_cdn if files[fmt][0] else self.root + post["file_url"] = url = "{}/posts/{}/{}/{}.{}".format( + root, post_id // 1000, post_id, post_id, extension) + post["format_id"] = fmt + post["format"] = extension.partition(".")[0] + + return url + + def _prepare(self, post): + post.pop("files", None) + post["date"] = text.parse_datetime( + post["created"], "%Y-%m-%dT%H:%M:%S.%fZ") + post["filename"], _, post["format"] = post["filename"].rpartition(".") + if "tags" in post: + post["tags"] = [t["value"] for t in post["tags"]] + + def _tags(self, post, _): + if "tags" not in post: + post.update(self._fetch_post(post["id"])) + + tags = collections.defaultdict(list) + for tag in post["tags"]: + tags[tag["type"] or 1].append(tag["value"]) + types = self.TAG_TYPES + for type, values in tags.items(): + post["tags_" + types[type]] = values + + def _fetch_post(self, post_id): + url = "{}/api/v2/post/{}".format(self.root, post_id) + return self.request(url).json() + + def _pagination(self, endpoint, params=None): + url = "{}/api{}".format(self.root, endpoint) + + if params is None: + params = {} + params["sortBy"] = 0 + params["take"] = self.per_page + threshold = self.per_page + + while True: + data = self.request(url, method="POST", json=params).json() + + yield from data["items"] + + if len(data["items"]) < threshold: + return + params["cursor"] = data.get("cursor") + + +class Furry34PostExtractor(Furry34Extractor): + subcategory = "post" + archive_fmt = "{id}" + pattern = BASE_PATTERN + r"/post/(\d+)" + example = "https://furry34.com/post/12345" + + def posts(self): + return (self._fetch_post(self.groups[0]),) + + +class Furry34PlaylistExtractor(Furry34Extractor): + subcategory = "playlist" + directory_fmt = ("{category}", "{playlist_id}") + archive_fmt = "p_{playlist_id}_{id}" + pattern = BASE_PATTERN + r"/playlists/view/(\d+)" + example = "https://furry34.com/playlists/view/12345" + + def metadata(self): + return {"playlist_id": self.groups[0]} + + def posts(self): + endpoint = "/v2/post/search/playlist/" + self.groups[0] + return self._pagination(endpoint) + + +class Furry34TagExtractor(Furry34Extractor): + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + archive_fmt = "t_{search_tags}_{id}" + pattern = BASE_PATTERN + r"/(?:([^/?#]+))?(?:/?\?([^#]+))?(?:$|#)" + example = "https://furry34.com/TAG" + + def _init(self): + tag, query = self.groups + params = text.parse_query(query) + + self.tags = tags = [] + if tag: + tags.extend(text.unquote(text.unquote(tag)).split("|")) + if "tags" in params: + tags.extend(params["tags"].split("|")) + + type = params.get("type") + if type == "video": + self.type = 1 + elif type == "image": + self.type = 0 + else: + self.type = None + + def metadata(self): + return {"search_tags": " ".join(self.tags)} + + def posts(self): + endpoint = 
"/v2/post/search/root" + params = {"includeTags": [t.replace("_", " ") for t in self.tags]} + if self.type is not None: + params["type"] = self.type + return self._pagination(endpoint, params) diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py index 370cd43..4b04732 100644 --- a/gallery_dl/extractor/generic.py +++ b/gallery_dl/extractor/generic.py @@ -37,6 +37,7 @@ class GenericExtractor(Extractor): example = "generic:https://www.nongnu.org/lzip/" def __init__(self, match): + self.subcategory = match.group('domain') Extractor.__init__(self, match) # Strip the "g(eneric):" prefix @@ -54,7 +55,6 @@ class GenericExtractor(Extractor): self.scheme = 'https://' self.url = text.ensure_http_scheme(self.url, self.scheme) - self.subcategory = match.group('domain') self.path = match.group('path') # Used to resolve relative image urls diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index 481fb1e..20f8ea4 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -142,7 +142,8 @@ class ImgurGalleryExtractor(ImgurExtractor): class ImgurUserExtractor(ImgurExtractor): """Extractor for all images posted by a user""" subcategory = "user" - pattern = BASE_PATTERN + r"/user/([^/?#]+)(?:/posts|/submitted)?/?$" + pattern = (BASE_PATTERN + r"/user/(?!me(?:/|$|\?|#))" + r"([^/?#]+)(?:/posts|/submitted)?/?$") example = "https://imgur.com/user/USER" def items(self): @@ -174,6 +175,23 @@ class ImgurFavoriteFolderExtractor(ImgurExtractor): self.key, self.folder_id)) +class ImgurMeExtractor(ImgurExtractor): + """Extractor for your personal uploads""" + subcategory = "me" + pattern = BASE_PATTERN + r"/user/me(?:/posts)?(/hidden)?" + example = "https://imgur.com/user/me" + + def items(self): + if not self.cookies_check(("accesstoken",)): + self.log.error("'accesstoken' cookie required") + + if self.groups[0]: + posts = self.api.accounts_me_hiddenalbums() + else: + posts = self.api.accounts_me_allposts() + return self._items_queue(posts) + + class ImgurSubredditExtractor(ImgurExtractor): """Extractor for a subreddits's imgur links""" subcategory = "subreddit" @@ -215,6 +233,10 @@ class ImgurAPI(): self.client_id = extractor.config("client-id") or "546c25a59c58ad7" self.headers = {"Authorization": "Client-ID " + self.client_id} + def account_submissions(self, account): + endpoint = "/3/account/{}/submissions".format(account) + return self._pagination(endpoint) + def account_favorites(self, account): endpoint = "/3/account/{}/gallery_favorites".format(account) return self._pagination(endpoint) @@ -224,15 +246,29 @@ class ImgurAPI(): account, folder_id) return self._pagination_v2(endpoint) + def accounts_me_allposts(self): + endpoint = "/post/v1/accounts/me/all_posts" + params = { + "include": "media,tags,account", + "page" : 1, + "sort" : "-created_at", + } + return self._pagination_v2(endpoint, params) + + def accounts_me_hiddenalbums(self): + endpoint = "/post/v1/accounts/me/hidden_albums" + params = { + "include": "media,tags,account", + "page" : 1, + "sort" : "-created_at", + } + return self._pagination_v2(endpoint, params) + def gallery_search(self, query): endpoint = "/3/gallery/search" params = {"q": query} return self._pagination(endpoint, params) - def account_submissions(self, account): - endpoint = "/3/account/{}/submissions".format(account) - return self._pagination(endpoint) - def gallery_subreddit(self, subreddit): endpoint = "/3/gallery/r/{}".format(subreddit) return self._pagination(endpoint) @@ -284,12 +320,16 @@ class 
ImgurAPI(): if params is None: params = {} params["client_id"] = self.client_id - params["page"] = 0 - params["sort"] = "newest" + if "page" not in params: + params["page"] = 0 + if "sort" not in params: + params["sort"] = "newest" headers = {"Origin": "https://imgur.com"} while True: - data = self._call(endpoint, params, headers)["data"] + data = self._call(endpoint, params, headers) + if "data" in data: + data = data["data"] if not data: return yield from data diff --git a/gallery_dl/extractor/imhentai.py b/gallery_dl/extractor/imhentai.py new file mode 100644 index 0000000..0439f5b --- /dev/null +++ b/gallery_dl/extractor/imhentai.py @@ -0,0 +1,140 @@ +# -*- coding: utf-8 -*- + +# Copyright 2025 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://imhentai.xxx/ and mirror sites""" + +from .common import GalleryExtractor, BaseExtractor, Message +from .. import text, util + + +class ImhentaiExtractor(BaseExtractor): + basecategory = "IMHentai" + + def _pagination(self, url): + prev = None + base = self.root + "/gallery/" + data = {"_extractor": ImhentaiGalleryExtractor} + + while True: + page = self.request(url).text + extr = text.extract_from(page) + + while True: + gallery_id = extr('<a href="/gallery/', '"') + if gallery_id == prev: + continue + if not gallery_id: + break + yield Message.Queue, base + gallery_id, data + prev = gallery_id + + href = text.rextract(page, "class='page-link' href='", "'")[0] + if not href or href == "#": + return + if href[0] == "/": + if href[1] == "/": + href = "https:" + href + else: + href = self.root + href + url = href + + +BASE_PATTERN = ImhentaiExtractor.update({ + "imhentai": { + "root": "https://imhentai.xxx", + "pattern": r"(?:www\.)?imhentai\.xxx", + }, + "hentaiera": { + "root": "https://hentaiera.com", + "pattern": r"(?:www\.)?hentaiera\.com", + }, + "hentairox": { + "root": "https://hentairox.com", + "pattern": r"(?:www\.)?hentairox\.com", + }, +}) + + +class ImhentaiGalleryExtractor(ImhentaiExtractor, GalleryExtractor): + """Extractor for imhentai galleries""" + pattern = BASE_PATTERN + r"/(?:gallery|view)/(\d+)" + example = "https://imhentai.xxx/gallery/12345/" + + def __init__(self, match): + ImhentaiExtractor.__init__(self, match) + self.gallery_id = self.groups[-1] + self.gallery_url = "{}/gallery/{}/".format(self.root, self.gallery_id) + + def metadata(self, page): + extr = text.extract_from(page) + + data = { + "gallery_id": text.parse_int(self.gallery_id), + "title" : text.unescape(extr("<h1>", "<")), + "title_alt" : text.unescape(extr('class="subtitle">', "<")), + "parody" : self._split(extr(">Parodies", "</li>")), + "character" : self._split(extr(">Characters", "</li>")), + "tags" : self._split(extr(">Tags", "</li>")), + "artist" : self._split(extr(">Artists", "</li>")), + "group" : self._split(extr(">Groups", "</li>")), + "language" : self._split(extr(">Languages", "</li>")), + "type" : extr("href='/category/", "/"), + } + + if data["language"]: + data["lang"] = util.language_to_code(data["language"][0]) + + return data + + def _split(self, html): + results = [] + for tag in text.extract_iter(html, ">", "</a>"): + tag = tag.partition(" <span class='badge'>")[0] + if "<" in tag: + tag = text.remove_html(tag) + results.append(tag) + return results + + def images(self, page): + data = util.json_loads(text.extr(page, "$.parseJSON('", "'")) + base = 
text.extr(page, 'data-src="', '"').rpartition("/")[0] + "/" + exts = {"j": "jpg", "p": "png", "g": "gif", "w": "webp", "a": "avif"} + + results = [] + for i in map(str, range(1, len(data)+1)): + ext, width, height = data[i].split(",") + url = base + i + "." + exts[ext] + results.append((url, { + "width" : text.parse_int(width), + "height": text.parse_int(height), + })) + return results + + +class ImhentaiTagExtractor(ImhentaiExtractor): + """Extractor for imhentai tag searches""" + subcategory = "tag" + pattern = (BASE_PATTERN + r"(/(?:" + r"artist|category|character|group|language|parody|tag" + r")/([^/?#]+))") + example = "https://imhentai.xxx/tag/TAG/" + + def items(self): + url = self.root + self.groups[-2] + "/" + return self._pagination(url) + + +class ImhentaiSearchExtractor(ImhentaiExtractor): + """Extractor for imhentai search results""" + subcategory = "search" + pattern = BASE_PATTERN + r"/search/?\?([^#]+)" + example = "https://imhentai.xxx/search/?key=QUERY" + + def items(self): + url = self.root + "/search/?" + self.groups[-1] + return self._pagination(url) diff --git a/gallery_dl/extractor/issuu.py b/gallery_dl/extractor/issuu.py index b900113..65717b4 100644 --- a/gallery_dl/extractor/issuu.py +++ b/gallery_dl/extractor/issuu.py @@ -30,8 +30,8 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor): def metadata(self, page): pos = page.rindex('id="initial-data"') - data = util.json_loads(text.rextract( - page, '<script data-json="', '"', pos)[0].replace(""", '"')) + data = util.json_loads(text.unescape(text.rextract( + page, '<script data-json="', '"', pos)[0])) doc = data["initialDocumentData"]["document"] doc["date"] = text.parse_datetime( diff --git a/gallery_dl/extractor/itaku.py b/gallery_dl/extractor/itaku.py index 7f941bb..5c91eb9 100644 --- a/gallery_dl/extractor/itaku.py +++ b/gallery_dl/extractor/itaku.py @@ -24,10 +24,6 @@ class ItakuExtractor(Extractor): archive_fmt = "{id}" request_interval = (0.5, 1.5) - def __init__(self, match): - Extractor.__init__(self, match) - self.item = match.group(1) - def _init(self): self.api = ItakuAPI(self) self.videos = self.config("videos", True) @@ -62,11 +58,11 @@ class ItakuExtractor(Extractor): class ItakuGalleryExtractor(ItakuExtractor): """Extractor for posts from an itaku user gallery""" subcategory = "gallery" - pattern = BASE_PATTERN + r"/profile/([^/?#]+)/gallery" + pattern = BASE_PATTERN + r"/profile/([^/?#]+)/gallery(?:/(\d+))?" 
example = "https://itaku.ee/profile/USER/gallery" def posts(self): - return self.api.galleries_images(self.item) + return self.api.galleries_images(*self.groups) class ItakuImageExtractor(ItakuExtractor): @@ -75,7 +71,7 @@ class ItakuImageExtractor(ItakuExtractor): example = "https://itaku.ee/images/12345" def posts(self): - return (self.api.image(self.item),) + return (self.api.image(self.groups[0]),) class ItakuSearchExtractor(ItakuExtractor): @@ -84,7 +80,7 @@ class ItakuSearchExtractor(ItakuExtractor): example = "https://itaku.ee/home/images?tags=SEARCH" def posts(self): - params = text.parse_query_list(self.item) + params = text.parse_query_list(self.groups[0]) return self.api.search_images(params) @@ -138,7 +134,7 @@ class ItakuAPI(): params = { "cursor" : None, "owner" : self.user(username)["owner"], - "section" : section, + "sections" : section, "date_range": "", "maturity_rating": ("SFW", "Questionable", "NSFW"), "ordering" : "-date_added", diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 8ffa14b..648f7df 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -190,8 +190,8 @@ class NewgroundsExtractor(Extractor): extr = text.extract_from(page) data = extract_data(extr, post_url) - data["_comment"] = extr( - 'id="author_comments"', '</div>').partition(">")[2] + data["comment_html"] = data["_comment"] = extr( + 'id="author_comments"', '</div>').partition(">")[2].strip() data["comment"] = text.unescape(text.remove_html( data["_comment"] .replace("<p><br></p>", "\n\n").replace("<br>", "\n"), "", "")) diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index e7540f8..815a214 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -83,8 +83,9 @@ class OAuthBase(Extractor): browser = None if browser and browser.open(url): - name = getattr(browser, "name", None) or "Browser" - self.log.info("Opening URL in %s:", name.capitalize()) + name = getattr(browser, "name", None) + if name: + self.log.info("Opening URL with %s:", name.capitalize()) else: self.log.info("Please open this URL in your browser:") diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index 866e93a..f5a33d5 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -169,6 +169,12 @@ class PatreonExtractor(Extractor): attr["date"] = text.parse_datetime( attr["published_at"], "%Y-%m-%dT%H:%M:%S.%f%z") + try: + attr["campaign"] = (included["campaign"][ + relationships["campaign"]["data"]["id"]]) + except Exception: + attr["campaign"] = None + tags = relationships.get("user_defined_tags") attr["tags"] = [ tag["id"].replace("user_defined;", "") @@ -324,7 +330,8 @@ class PatreonCreatorExtractor(PatreonExtractor): subcategory = "creator" pattern = (r"(?:https?://)?(?:www\.)?patreon\.com" r"/(?!(?:home|join|posts|login|signup)(?:$|[/?#]))" - r"(?:c/)?([^/?#]+)(?:/posts)?/?(?:\?([^#]+))?") + r"(?:profile/creators|(?:c/)?([^/?#]+)(?:/posts)?)" + r"/?(?:\?([^#]+))?") example = "https://www.patreon.com/USER" def posts(self): @@ -345,7 +352,7 @@ class PatreonCreatorExtractor(PatreonExtractor): return self._pagination(url) def _get_campaign_id(self, creator, query): - if creator.startswith("id:"): + if creator and creator.startswith("id:"): return creator[3:] campaign_id = query.get("c") or query.get("campaign_id") diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py index 1b67272..201d4d6 100644 --- 
a/gallery_dl/extractor/philomena.py +++ b/gallery_dl/extractor/philomena.py @@ -10,7 +10,6 @@ from .booru import BooruExtractor from .. import text, exception -import operator class PhilomenaExtractor(BooruExtractor): @@ -24,17 +23,22 @@ class PhilomenaExtractor(BooruExtractor): def _init(self): self.api = PhilomenaAPI(self) - if not self.config("svg", True): - self._file_url = operator.itemgetter("view_url") + self.svg = self.config("svg", True) def _file_url(self, post): - if post["format"] == "svg": - return post["view_url"].rpartition(".")[0] + ".svg" - return post["view_url"] + try: + url = post["representations"]["full"] + except Exception: + url = post["view_url"] + + if self.svg and post["format"] == "svg": + return url.rpartition(".")[0] + ".svg" + return url @staticmethod def _prepare(post): - post["date"] = text.parse_datetime(post["created_at"]) + post["date"] = text.parse_datetime( + post["created_at"][:19], "%Y-%m-%dT%H:%M:%S") BASE_PATTERN = PhilomenaExtractor.update({ diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 7fe8869..8a4905d 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -71,9 +71,12 @@ class PixivExtractor(Extractor): if self.meta_user: work.update(self.api.user_detail(work["user"]["id"])) if self.meta_comments: - if work["total_comments"]: - work["comments"] = list( - self.api.illust_comments(work["id"])) + if work["total_comments"] and not work.get("_ajax"): + try: + work["comments"] = list( + self.api.illust_comments(work["id"])) + except Exception: + work["comments"] = () else: work["comments"] = () if self.meta_bookmark and work["is_bookmarked"]: diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 89eafc8..f36b1f5 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -259,6 +259,8 @@ class RedditSubredditExtractor(RedditExtractor): self.subreddit, sub, params = match.groups() self.params = text.parse_query(params) if sub: + if sub == "search" and "restrict_sr" not in self.params: + self.params["restrict_sr"] = "1" self.subcategory += "-" + sub RedditExtractor.__init__(self, match) diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index 5e3a958..b5cdb9c 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -194,7 +194,6 @@ class SankakuAPI(): self.extractor = extractor self.headers = { "Accept" : "application/vnd.sankaku.api+json;v=2", - "Platform" : "web-app", "Api-Version": None, "Origin" : extractor.root, } diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py index 8668330..6c43941 100644 --- a/gallery_dl/extractor/subscribestar.py +++ b/gallery_dl/extractor/subscribestar.py @@ -51,6 +51,23 @@ class SubscribestarExtractor(Extractor): def posts(self): """Yield HTML content of all relevant posts""" + def request(self, url, **kwargs): + while True: + response = Extractor.request(self, url, **kwargs) + + if response.history and "/verify_subscriber" in response.url: + raise exception.StopExtraction( + "HTTP redirect to %s", response.url) + + content = response.content + if len(content) < 250 and b">redirected<" in content: + url = text.unescape(text.extr( + content, b'href="', b'"').decode()) + self.log.debug("HTML redirect message for %s", url) + continue + + return response + def login(self): if self.cookies_check(self.cookies_names): return @@ -189,10 +206,11 @@ class SubscribestarPostExtractor(SubscribestarExtractor): extr = 
text.extract_from(html) return { "post_id" : text.parse_int(extr('data-id="', '"')), - "author_name": text.unescape(extr('href="/', '"')), - "author_id" : text.parse_int(extr('data-user-id="', '"')), - "author_nick": text.unescape(extr('alt="', '"')), "date" : self._parse_datetime(extr( - '<span class="star_link-types">', '<')), + '<div class="section-title_date">', '<')), "content" : extr('<body>', '</body>').strip(), + "author_name": text.unescape(extr( + 'class="star_link" href="/', '"')), + "author_id" : text.parse_int(extr('data-user-id="', '"')), + "author_nick": text.unescape(extr('alt="', '"')), } diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py new file mode 100644 index 0000000..f129b1c --- /dev/null +++ b/gallery_dl/extractor/tiktok.py @@ -0,0 +1,253 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.tiktok.com/""" + +from .common import Extractor, Message +from .. import text, util, ytdl, exception + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?tiktokv?\.com" + + +class TiktokExtractor(Extractor): + """Base class for TikTok extractors""" + category = "tiktok" + directory_fmt = ("{category}", "{user}") + filename_fmt = ( + "{id}{num:?_//>02} {title[b:150]}{img_id:? [/]/}.{extension}") + archive_fmt = "{id}_{num}_{img_id}" + root = "https://www.tiktok.com" + cookies_domain = ".tiktok.com" + + def _init(self): + self.audio = self.config("audio", True) + self.video = self.config("videos", True) + if not self.config("avatar", True): + self.avatar = util.false + + def items(self): + # We assume that all of the URLs served by urls() come from the same + # author. + downloaded_avatar = not self.avatar() + + for tiktok_url in self.urls(): + tiktok_url = self._sanitize_url(tiktok_url) + data = self._extract_rehydration_data(tiktok_url) + if "webapp.video-detail" not in data: + # Only /video/ links result in the video-detail dict we need. + # Try again using that form of link. 
+ tiktok_url = self._sanitize_url( + data["seo.abtest"]["canonical"]) + data = self._extract_rehydration_data(tiktok_url) + video_detail = data["webapp.video-detail"] + + if not self._check_status_code(video_detail, tiktok_url): + continue + + post = video_detail["itemInfo"]["itemStruct"] + author = post["author"] + post["user"] = user = author["uniqueId"] + post["date"] = text.parse_timestamp(post["createTime"]) + original_title = title = post["desc"] + + if not downloaded_avatar: + avatar_url = author["avatarLarger"] + avatar = self._generate_avatar( + avatar_url, post, user, author["id"]) + yield Message.Directory, avatar + yield Message.Url, avatar_url, avatar + downloaded_avatar = True + + yield Message.Directory, post + ytdl_media = False + + if "imagePost" in post: + if not original_title: + title = "TikTok photo #{}".format(post["id"]) + img_list = post["imagePost"]["images"] + for i, img in enumerate(img_list, 1): + url = img["imageURL"]["urlList"][0] + text.nameext_from_url(url, post) + post.update({ + "type" : "image", + "image" : img, + "title" : title, + "num" : i, + "img_id": post["filename"].partition("~")[0], + "width" : img["imageWidth"], + "height": img["imageHeight"], + }) + yield Message.Url, url, post + + if self.audio and "music" in post: + ytdl_media = "audio" + + elif self.video and "video" in post: + ytdl_media = "video" + + else: + self.log.info("%s: Skipping post", tiktok_url) + + if ytdl_media: + if not original_title: + title = "TikTok {} #{}".format(ytdl_media, post["id"]) + post.update({ + "type" : ytdl_media, + "image" : None, + "filename" : "", + "extension" : "mp3" if ytdl_media == "audio" else "mp4", + "title" : title, + "num" : 0, + "img_id" : "", + "width" : 0, + "height" : 0, + }) + yield Message.Url, "ytdl:" + tiktok_url, post + + # If we couldn't download the avatar because the given user has no + # posts, we'll need to make a separate request for the user's page + # and download the avatar that way. 
+ if not downloaded_avatar: + user_name = self.avatar() + profile_url = "https://www.tiktok.com/@{}".format(user_name) + data = self._extract_rehydration_data(profile_url) + data = data["webapp.user-detail"]["userInfo"]["user"] + data["user"] = user_name + avatar_url = data["avatarLarger"] + avatar = self._generate_avatar( + avatar_url, data, user_name, data["id"]) + yield Message.Directory, avatar + yield Message.Url, avatar_url, avatar + + def avatar(self): + return False + + def _generate_avatar(self, avatar_url, data, user_name, user_id): + avatar = text.nameext_from_url(avatar_url, data.copy()) + avatar.update({ + "type" : "avatar", + "title" : "@" + user_name, + "id" : user_id, + "img_id": avatar["filename"].partition("~")[0], + "num" : 0, + }) + return avatar + + def _sanitize_url(self, url): + return text.ensure_http_scheme(url.replace("/photo/", "/video/", 1)) + + def _extract_rehydration_data(self, url): + html = self.request(url).text + data = text.extr( + html, '<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" ' + 'type="application/json">', '</script>') + return util.json_loads(data)["__DEFAULT_SCOPE__"] + + def _check_status_code(self, detail, url): + status = detail.get("statusCode") + if not status: + return True + + if status == 10222: + self.log.error("%s: Login required to access this post", url) + elif status == 10204: + self.log.error("%s: Requested post not available", url) + elif status == 10231: + self.log.error("%s: Region locked - Try downloading with a" + "VPN/proxy connection", url) + else: + self.log.error( + "%s: Received unknown error code %s ('%s')", + url, status, detail.get("statusMsg") or "") + return False + + +class TiktokPostExtractor(TiktokExtractor): + """Extract a single video or photo TikTok link""" + subcategory = "post" + pattern = BASE_PATTERN + r"/(?:@([\w_.-]*)|share)/(?:phot|vide)o/(\d+)" + example = "https://www.tiktok.com/@USER/photo/1234567890" + + def urls(self): + user, post_id = self.groups + url = "{}/@{}/video/{}".format(self.root, user or "", post_id) + return (url,) + + +class TiktokVmpostExtractor(TiktokExtractor): + """Extract a single video or photo TikTok VM link""" + subcategory = "vmpost" + pattern = (r"(?:https?://)?(?:" + r"(?:v[mt]\.)?tiktok\.com|(?:www\.)?tiktok\.com/t" + r")/(?!@)([^/?#]+)") + example = "https://vm.tiktok.com/1a2B3c4E5" + + def items(self): + url = text.ensure_http_scheme(self.url) + headers = {"User-Agent": "facebookexternalhit/1.1"} + + response = self.request(url, headers=headers, method="HEAD", + allow_redirects=False, notfound="post") + + url = response.headers.get("Location") + if not url or len(url) <= 28: + # https://www.tiktok.com/?_r=1 + raise exception.NotFoundError("post") + + data = {"_extractor": TiktokPostExtractor} + yield Message.Queue, url.partition("?")[0], data + + +class TiktokUserExtractor(TiktokExtractor): + """Extract a TikTok user's profile""" + subcategory = "user" + pattern = BASE_PATTERN + r"/@([\w_.-]+)/?(?:$|\?|#)" + example = "https://www.tiktok.com/@USER" + + def urls(self): + """Attempt to use yt-dlp/youtube-dl to extract links from a + user's page""" + + try: + module = ytdl.import_module(self.config("module")) + except (ImportError, SyntaxError) as exc: + self.log.error("Cannot import module '%s'", + getattr(exc, "name", "")) + self.log.debug("", exc_info=exc) + raise exception.ExtractionError("yt-dlp or youtube-dl is required " + "for this feature!") + extr_opts = { + "extract_flat" : True, + "ignore_no_formats_error": True, + } + user_opts = { + "retries" : 
self._retries, + "socket_timeout" : self._timeout, + "nocheckcertificate" : not self._verify, + "playlist_items" : str(self.config("tiktok-range", "")), + } + if self._proxies: + user_opts["proxy"] = self._proxies.get("http") + + ytdl_instance = ytdl.construct_YoutubeDL( + module, self, user_opts, extr_opts) + + # transfer cookies to ytdl + if self.cookies: + set_cookie = ytdl_instance.cookiejar.set_cookie + for cookie in self.cookies: + set_cookie(cookie) + + with ytdl_instance as ydl: + info_dict = ydl._YoutubeDL__extract_info( + "{}/@{}".format(self.root, self.groups[0]), + ydl.get_info_extractor("TikTokUser"), + False, {}, True) + # This should include video and photo posts in /video/ URL form. + return [video["url"] for video in info_dict["entries"]] + + def avatar(self): + return self.groups[0] diff --git a/gallery_dl/extractor/twibooru.py b/gallery_dl/extractor/twibooru.py index a725a2c..3b0ea36 100644 --- a/gallery_dl/extractor/twibooru.py +++ b/gallery_dl/extractor/twibooru.py @@ -12,7 +12,7 @@ from .booru import BooruExtractor from .. import text, exception import operator -BASE_PATTERN = r"(?:https?://)?twibooru\.org" +BASE_PATTERN = r"(?:https?://)?(?:www\.)?twibooru\.org" class TwibooruExtractor(BooruExtractor): diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 840e846..c391bad 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -234,6 +234,13 @@ class TwitterExtractor(Extractor): for fmt in self._size_fallback: yield base + fmt + def _extract_components(self, tweet, data, files): + for component_id in data["components"]: + com = data["component_objects"][component_id] + for conv in com["data"]["conversation_preview"]: + for url in conv.get("mediaUrls") or (): + files.append({"url": url}) + def _extract_card(self, tweet, files): card = tweet["card"] if "legacy" in card: @@ -272,7 +279,11 @@ class TwitterExtractor(Extractor): return elif name == "unified_card": data = util.json_loads(bvals["unified_card"]["string_value"]) - self._extract_media(tweet, data["media_entities"].values(), files) + if "media_entities" in data: + self._extract_media( + tweet, data["media_entities"].values(), files) + if "component_objects" in data: + self._extract_components(tweet, data, files) return if self.cards == "ytdl": @@ -1065,7 +1076,7 @@ class TwitterAPI(): else: csrf_token = None if not csrf_token: - csrf_token = util.generate_token(80) + csrf_token = util.generate_token() cookies.set("ct0", csrf_token, domain=cookies_domain) auth_token = cookies.get("auth_token", domain=cookies_domain) diff --git a/gallery_dl/extractor/vipergirls.py b/gallery_dl/extractor/vipergirls.py index 5cde0d6..af3f32d 100644 --- a/gallery_dl/extractor/vipergirls.py +++ b/gallery_dl/extractor/vipergirls.py @@ -29,7 +29,17 @@ class VipergirlsExtractor(Extractor): def _init(self): domain = self.config("domain") if domain: - self.root = text.ensure_http_scheme(domain) + pos = domain.find("://") + if pos >= 0: + self.root = domain.rstrip("/") + self.cookies_domain = "." + domain[pos+1:].strip("/") + else: + domain = domain.strip("/") + self.root = "https://" + domain + self.cookies_domain = "." 
+ domain + else: + self.root = "https://viper.click" + self.cookies_domain = ".viper.click" def items(self): self.login() diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py index 1c0c172..a53409c 100644 --- a/gallery_dl/extractor/vsco.py +++ b/gallery_dl/extractor/vsco.py @@ -38,7 +38,7 @@ class VscoExtractor(Extractor): if img["is_video"]: if not videos: continue - url = "https://" + img["video_url"] + url = text.ensure_http_scheme(img["video_url"]) else: base = img["responsive_url"].partition("/")[2] cdn, _, path = base.partition("/") @@ -63,6 +63,10 @@ class VscoExtractor(Extractor): "height": img["height"], "description": img.get("description") or "", }) + if data["extension"] == "m3u8": + url = "ytdl:" + url + data["_ytdl_manifest"] = "hls" + data["extension"] = "mp4" yield Message.Url, url, data def images(self): @@ -294,12 +298,33 @@ class VscoImageExtractor(VscoExtractor): pattern = USER_PATTERN + r"/media/([0-9a-fA-F]+)" example = "https://vsco.co/USER/media/0123456789abcdef" - def __init__(self, match): - VscoExtractor.__init__(self, match) - self.media_id = match.group(2) - def images(self): - url = "{}/{}/media/{}".format(self.root, self.user, self.media_id) + url = "{}/{}/media/{}".format(self.root, self.user, self.groups[1]) data = self._extract_preload_state(url) media = data["medias"]["byId"].popitem()[1]["media"] return (self._transform_media(media),) + + +class VscoVideoExtractor(VscoExtractor): + """Extractor for vsco.co videos links""" + subcategory = "video" + pattern = USER_PATTERN + r"/video/([^/?#]+)" + example = "https://vsco.co/USER/video/012345678-9abc-def0" + + def images(self): + url = "{}/{}/video/{}".format(self.root, self.user, self.groups[1]) + data = self._extract_preload_state(url) + media = data["medias"]["byId"].popitem()[1]["media"] + + return ({ + "_id" : media["id"], + "is_video" : True, + "grid_name" : "", + "upload_date" : media["createdDate"], + "responsive_url": media["posterUrl"], + "video_url" : "ytdl:" + media.get("playbackUrl"), + "image_meta" : None, + "width" : media["width"], + "height" : media["height"], + "description" : media["description"], + },) diff --git a/gallery_dl/extractor/weebcentral.py b/gallery_dl/extractor/weebcentral.py index fc1badb..cacefd6 100644 --- a/gallery_dl/extractor/weebcentral.py +++ b/gallery_dl/extractor/weebcentral.py @@ -50,14 +50,16 @@ class WeebcentralChapterExtractor(WeebcentralBase, ChapterExtractor): def metadata(self, page): extr = text.extract_from(page) manga_id = extr("'series_id': '", "'") - - data = self._extract_manga_data(manga_id) - data["chapter_id"] = self.groups[1] - data["chapter_type"] = extr("'chapter_type': '", "'") - + chapter_type = extr("'chapter_type': '", "'") chapter, sep, minor = extr("'number': '", "'").partition(".") - data["chapter"] = text.parse_int(chapter) - data["chapter_minor"] = sep + minor + + data = { + "chapter": text.parse_int(chapter), + "chapter_id": self.groups[1], + "chapter_type": chapter_type, + "chapter_minor": sep + minor, + } + data.update(self._extract_manga_data(manga_id)) return data diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index 9885d79..3ed5a06 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -33,6 +33,7 @@ class WeiboExtractor(Extractor): self.livephoto = self.config("livephoto", True) self.retweets = self.config("retweets", False) self.videos = self.config("videos", True) + self.movies = self.config("movies", False) self.gifs = self.config("gifs", True) 
self.gifs_video = (self.gifs == "video") @@ -134,7 +135,10 @@ class WeiboExtractor(Extractor): if "page_info" in status: info = status["page_info"] if "media_info" in info and self.videos: - append(self._extract_video(info["media_info"])) + if info.get("type") != "5" or self.movies: + append(self._extract_video(info["media_info"])) + else: + self.log.debug("%s: Ignoring 'movie' video", status["id"]) def _extract_video(self, info): try:
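
The new `BunkrAlbumExtractor._decrypt_url()` in this diff decodes file URLs returned by the `/api/vs` endpoint: the response URL is base64, XOR-encrypted with a key derived from the response timestamp at one-hour granularity. Below is a minimal standalone sketch of that scheme; the encrypt step and the sample URL are hypothetical, included only to show that the XOR is its own inverse:

```python
import base64
import time

def xor_with_key(data: bytes, key: bytes) -> bytes:
    # XOR every byte with the repeating key, as in _decrypt_url()
    return bytes(b ^ key[i % len(key)] for i, b in enumerate(data))

timestamp = int(time.time())
key = "SECRET_KEY_{}".format(timestamp // 3600).encode()  # key format from the diff

plain = b"https://example.org/video.mp4"  # hypothetical file URL
encrypted_b64 = base64.b64encode(xor_with_key(plain, key)).decode()

# decryption mirrors _decrypt_url(): base64-decode, then XOR with the same key
assert xor_with_key(base64.b64decode(encrypted_b64), key) == plain
```

Because the key bucket changes every hour (`timestamp // 3600`), decryption has to use the timestamp taken from the same API response as the encrypted URL.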

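Several of the new API clients in this diff page through results in fixed-size batches until the server signals the end. A minimal sketch of that loop, mirroring `DiscordAPI._pagination()`, with `fetch` as a stand-in for the actual API call:

```python
def paginate(fetch, batch_size):
    """Yield items batch by batch; a batch shorter than batch_size ends it."""
    offset = 0
    while True:
        items = fetch(offset)
        yield from items
        if len(items) < batch_size:
            return
        offset += len(items)

# usage with an in-memory stand-in for a paginated endpoint
records = list(range(95))
assert list(paginate(lambda o: records[o:o + 25], 25)) == records
```

The `Furry34Extractor._pagination()` variant stops on the same short-batch condition but passes the server-supplied `cursor` back instead of an offset, while `BoostyAPI._pagination_dialog()` relies on an `extra["isLast"]` flag.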