Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r--  gallery_dl/extractor/__init__.py       |   4
-rw-r--r--  gallery_dl/extractor/bilibili.py       |  46
-rw-r--r--  gallery_dl/extractor/boosty.py         |  92
-rw-r--r--  gallery_dl/extractor/bunkr.py          |  54
-rw-r--r--  gallery_dl/extractor/chevereto.py      |   3
-rw-r--r--  gallery_dl/extractor/common.py         |   2
-rw-r--r--  gallery_dl/extractor/discord.py        | 399
-rw-r--r--  gallery_dl/extractor/erome.py          |  14
-rw-r--r--  gallery_dl/extractor/foolfuuka.py      |   4
-rw-r--r--  gallery_dl/extractor/furaffinity.py    |   1
-rw-r--r--  gallery_dl/extractor/furry34.py        | 156
-rw-r--r--  gallery_dl/extractor/generic.py        |   2
-rw-r--r--  gallery_dl/extractor/imgur.py          |  56
-rw-r--r--  gallery_dl/extractor/imhentai.py       | 140
-rw-r--r--  gallery_dl/extractor/issuu.py          |   4
-rw-r--r--  gallery_dl/extractor/itaku.py          |  14
-rw-r--r--  gallery_dl/extractor/newgrounds.py     |   4
-rw-r--r--  gallery_dl/extractor/oauth.py          |   5
-rw-r--r--  gallery_dl/extractor/patreon.py        |  11
-rw-r--r--  gallery_dl/extractor/philomena.py      |  18
-rw-r--r--  gallery_dl/extractor/pixiv.py          |   9
-rw-r--r--  gallery_dl/extractor/reddit.py         |   2
-rw-r--r--  gallery_dl/extractor/sankaku.py        |   1
-rw-r--r--  gallery_dl/extractor/subscribestar.py  |  26
-rw-r--r--  gallery_dl/extractor/tiktok.py         | 253
-rw-r--r--  gallery_dl/extractor/twibooru.py       |   2
-rw-r--r--  gallery_dl/extractor/twitter.py        |  15
-rw-r--r--  gallery_dl/extractor/vipergirls.py     |  12
-rw-r--r--  gallery_dl/extractor/vsco.py           |  37
-rw-r--r--  gallery_dl/extractor/weebcentral.py    |  16
-rw-r--r--  gallery_dl/extractor/weibo.py          |   6
31 files changed, 1320 insertions(+), 88 deletions(-)
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index fc8d7b2..00b22d4 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -44,6 +44,7 @@ modules = [
"danbooru",
"desktopography",
"deviantart",
+ "discord",
"dynastyscans",
"e621",
"erome",
@@ -56,6 +57,7 @@ modules = [
"fapachi",
"flickr",
"furaffinity",
+ "furry34",
"fuskator",
"gelbooru",
"gelbooru_v01",
@@ -80,6 +82,7 @@ modules = [
"imgbox",
"imgth",
"imgur",
+ "imhentai",
"inkbunny",
"instagram",
"issuu",
@@ -168,6 +171,7 @@ modules = [
"tapas",
"tcbscans",
"telegraph",
+ "tiktok",
"tmohentai",
"toyhouse",
"tsumino",
diff --git a/gallery_dl/extractor/bilibili.py b/gallery_dl/extractor/bilibili.py
index b9de165..597ec40 100644
--- a/gallery_dl/extractor/bilibili.py
+++ b/gallery_dl/extractor/bilibili.py
@@ -81,6 +81,27 @@ class BilibiliArticleExtractor(BilibiliExtractor):
yield Message.Url, url, text.nameext_from_url(url, article)
+class BilibiliUserArticlesFavoriteExtractor(BilibiliExtractor):
+ subcategory = "user-articles-favorite"
+ pattern = (r"(?:https?://)?space\.bilibili\.com"
+ r"/(\d+)/favlist\?fid=opus")
+ example = "https://space.bilibili.com/12345/favlist?fid=opus"
+ _warning = True
+
+ def _init(self):
+ BilibiliExtractor._init(self)
+ if self._warning:
+ if not self.cookies_check(("SESSDATA",)):
+ self.log.error("'SESSDATA' cookie required")
+ BilibiliUserArticlesFavoriteExtractor._warning = False
+
+ def items(self):
+ for article in self.api.user_favlist():
+ article["_extractor"] = BilibiliArticleExtractor
+ url = "{}/opus/{}".format(self.root, article["opus_id"])
+ yield Message.Queue, url, article
+
+
class BilibiliAPI():
def __init__(self, extractor):
self.extractor = extractor
@@ -122,3 +143,28 @@ class BilibiliAPI():
raise exception.StopExtraction(
"%s: Unable to extract INITIAL_STATE data", article_id)
self.extractor.wait(seconds=300)
+
+ def user_favlist(self):
+ endpoint = "/opus/feed/fav"
+ params = {"page": 1, "page_size": 20}
+
+ while True:
+ data = self._call(endpoint, params)["data"]
+
+ yield from data["items"]
+
+ if not data.get("has_more"):
+ break
+ params["page"] += 1
+
+ def login_user_id(self):
+ url = "https://api.bilibili.com/x/space/v2/myinfo"
+ data = self.extractor.request(url).json()
+
+ if data["code"] != 0:
+ self.extractor.log.debug("Server response: %s", data)
+ raise exception.StopExtraction("API request failed. Are you logged in?")
+ try:
+ return data["data"]["profile"]["mid"]
+ except Exception:
+ raise exception.StopExtraction("API request failed")
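
A standalone illustration of the response envelope login_user_id expects: code == 0 on success, with the user ID under data.profile.mid. A hedged sketch using plain requests; a real request needs a valid SESSDATA cookie and may also require browser-like headers:

    import requests

    session = requests.Session()
    session.cookies.set("SESSDATA", "...", domain=".bilibili.com")

    data = session.get("https://api.bilibili.com/x/space/v2/myinfo").json()
    if data["code"] != 0:
        raise RuntimeError("API request failed. Are you logged in?")
    print(data["data"]["profile"]["mid"])  # ID of the logged-in user
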
diff --git a/gallery_dl/extractor/boosty.py b/gallery_dl/extractor/boosty.py
index c28fad9..f3e441b 100644
--- a/gallery_dl/extractor/boosty.py
+++ b/gallery_dl/extractor/boosty.py
@@ -8,6 +8,7 @@
from .common import Extractor, Message
from .. import text, util, exception
+import itertools
BASE_PATTERN = r"(?:https?://)?boosty\.to"
@@ -53,7 +54,9 @@ class BoostyExtractor(Extractor):
self.log.warning("Not allowed to access post %s", post["id"])
continue
- files = self._process_post(post)
+ files = self._extract_files(post)
+ if self._user:
+ post["user"] = self._user
data = {
"post" : post,
"user" : post.pop("user", None),
@@ -69,15 +72,13 @@ class BoostyExtractor(Extractor):
def posts(self):
"""Yield JSON content of all relevant posts"""
- def _process_post(self, post):
+ def _extract_files(self, post):
files = []
post["content"] = content = []
post["links"] = links = []
if "createdAt" in post:
post["date"] = text.parse_timestamp(post["createdAt"])
- if self._user:
- post["user"] = self._user
for block in post["data"]:
try:
@@ -94,7 +95,7 @@ class BoostyExtractor(Extractor):
elif type == "ok_video":
if not self.videos:
self.log.debug("%s: Skipping video %s",
- post["int_id"], block["id"])
+ post["id"], block["id"])
continue
fmts = {
fmt["type"]: fmt["url"]
@@ -114,7 +115,7 @@ class BoostyExtractor(Extractor):
else:
self.log.warning(
"%s: Found no suitable video format for %s",
- post["int_id"], block["id"])
+ post["id"], block["id"])
elif type == "link":
url = block["url"]
@@ -127,9 +128,12 @@ class BoostyExtractor(Extractor):
elif type == "file":
files.append(self._update_url(post, block))
+ elif type == "smile":
+ content.append(":" + block["name"] + ":")
+
else:
self.log.debug("%s: Unsupported data type '%s'",
- post["int_id"], type)
+ post["id"], type)
except Exception as exc:
self.log.debug("%s: %s", exc.__class__.__name__, exc)
@@ -219,6 +223,51 @@ class BoostyFollowingExtractor(BoostyExtractor):
yield Message.Queue, url, user
+class BoostyDirectMessagesExtractor(BoostyExtractor):
+ """Extractor for boosty.to direct messages"""
+ subcategory = "direct-messages"
+ directory_fmt = ("{category}", "{user[blogUrl]} ({user[id]})",
+ "Direct Messages")
+ pattern = BASE_PATTERN + r"/app/messages/?\?dialogId=(\d+)"
+ example = "https://boosty.to/app/messages?dialogId=12345"
+
+ def items(self):
+ """Yield direct messages from a given dialog ID."""
+ dialog_id = self.groups[0]
+ response = self.api.dialog(dialog_id)
+ signed_query = response.get("signedQuery")
+
+ try:
+ messages = response["messages"]["data"]
+ offset = messages[0]["id"]
+ except Exception:
+ return
+
+ try:
+ user = self.api.user(response["chatmate"]["url"])
+ except Exception:
+ user = None
+
+ messages.reverse()
+ for message in itertools.chain(
+ messages,
+ self.api.dialog_messages(dialog_id, offset=offset)
+ ):
+ message["signedQuery"] = signed_query
+ files = self._extract_files(message)
+ data = {
+ "post": message,
+ "user": user,
+ "count": len(files),
+ }
+
+ yield Message.Directory, data
+ for data["num"], file in enumerate(files, 1):
+ data["file"] = file
+ url = file["url"]
+ yield Message.Url, url, text.nameext_from_url(url, data)
+
+
class BoostyAPI():
"""Interface for the Boosty API"""
root = "https://api.boosty.to"
@@ -367,3 +416,32 @@ class BoostyAPI():
if offset > data["total"]:
return
params["offset"] = offset
+
+ def dialog(self, dialog_id):
+ endpoint = "/v1/dialog/{}".format(dialog_id)
+ return self._call(endpoint)
+
+ def dialog_messages(self, dialog_id, limit=300, offset=None):
+ endpoint = "/v1/dialog/{}/message/".format(dialog_id)
+ params = {
+ "limit": limit,
+ "reverse": "true",
+ "offset": offset,
+ }
+ return self._pagination_dialog(endpoint, params)
+
+ def _pagination_dialog(self, endpoint, params):
+ while True:
+ data = self._call(endpoint, params)
+
+ yield from data["data"]
+
+ try:
+ extra = data["extra"]
+ if extra.get("isLast"):
+ break
+ params["offset"] = offset = extra["offset"]
+ if not offset:
+ break
+ except Exception:
+ break
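
The dialog pagination above follows a simple offset contract: each response carries extra.offset for the next request and extra.isLast on the final page. A minimal sketch of that loop against stub data standing in for /v1/dialog/{id}/message/:

    def pages():
        # Stub responses mimicking the Boosty API
        yield {"data": [3, 2], "extra": {"offset": "m2", "isLast": False}}
        yield {"data": [1], "extra": {"offset": None, "isLast": True}}

    responses = pages()
    params = {"limit": 300, "reverse": "true", "offset": None}

    while True:
        data = next(responses)          # stands in for _call(endpoint, params)
        for message in data["data"]:
            print(message)              # prints 3, 2, then 1
        extra = data["extra"]
        if extra.get("isLast") or not extra["offset"]:
            break
        params["offset"] = extra["offset"]
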
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 25e9fd5..201b8f4 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -10,7 +10,8 @@
from .common import Extractor
from .lolisafe import LolisafeAlbumExtractor
-from .. import text, config, exception
+from .. import text, util, config, exception
+import binascii
import random
if config.get(("extractor", "bunkr"), "tlds"):
@@ -60,6 +61,8 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
"""Extractor for bunkr.si albums"""
category = "bunkr"
root = "https://bunkr.si"
+ root_dl = "https://get.bunkrr.su"
+ archive_fmt = "{album_id}_{id|id_url}"
pattern = BASE_PATTERN + r"/a/([^/?#]+)"
example = "https://bunkr.si/a/ID"
@@ -68,6 +71,11 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
domain = self.groups[0] or self.groups[1]
if domain not in LEGACY_DOMAINS:
self.root = "https://" + domain
+ self.offset = 0
+
+ def skip(self, num):
+ self.offset = num
+ return num
def request(self, url, **kwargs):
kwargs["encoding"] = "utf-8"
@@ -132,6 +140,9 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
}
def _extract_files(self, items):
+ if self.offset:
+ items = util.advance(items, self.offset)
+
for item in items:
try:
url = text.unescape(text.extr(item, ' href="', '"'))
@@ -154,26 +165,43 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
self.log.debug("", exc_info=exc)
def _extract_file(self, webpage_url):
- response = self.request(webpage_url)
- page = response.text
- file_url = (text.extr(page, '<source src="', '"') or
- text.extr(page, '<img src="', '"'))
+ page = self.request(webpage_url).text
+ data_id = text.extr(page, 'data-file-id="', '"')
+ referer = self.root_dl + "/file/" + data_id
+
+ url = self.root_dl + "/api/vs"
+ headers = {"Referer": referer}
+ data = self.request(
+ url, method="POST", headers=headers, json={"id": data_id}).json()
+
+ if data.get("encrypted"):
+ file_url = self._decrypt_url(data["url"], data["timestamp"])
+ else:
+ file_url = data["url"]
+
file_name = (text.extr(page, 'property="og:title" content="', '"') or
text.extr(page, "<title>", " | Bunkr<"))
-
- if not file_url:
- webpage_url = text.unescape(text.rextract(
- page, ' href="', '"', page.rindex("Download"))[0])
- response = self.request(webpage_url)
- file_url = text.rextract(response.text, ' href="', '"')[0]
+ fallback = text.extr(page, 'property="og:url" content="', '"')
return {
- "file" : text.unescape(file_url),
+ "file" : file_url,
"name" : text.unescape(file_name),
- "_http_headers" : {"Referer": response.url},
+ "id_url" : data_id,
+ "_fallback" : (fallback,) if fallback else (),
+ "_http_headers" : {"Referer": referer},
"_http_validate": self._validate,
}
+ def _decrypt_url(self, encrypted_b64, timestamp):
+ encrypted_bytes = binascii.a2b_base64(encrypted_b64)
+ key = "SECRET_KEY_{}".format(timestamp // 3600).encode()
+ div = len(key)
+
+ return bytes([
+ encrypted_bytes[i] ^ key[i % div]
+ for i in range(len(encrypted_bytes))
+ ]).decode()
+
def _validate(self, response):
if response.history and response.url.endswith("/maintenance-vid.mp4"):
self.log.warning("File server in maintenance mode")
diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py
index aedcea4..de22a7b 100644
--- a/gallery_dl/extractor/chevereto.py
+++ b/gallery_dl/extractor/chevereto.py
@@ -57,7 +57,8 @@ class CheveretoImageExtractor(CheveretoExtractor):
image = {
"id" : self.path.rpartition(".")[2],
- "url" : extr('<meta property="og:image" content="', '"'),
+ "url" : (extr('<meta property="og:image" content="', '"') or
+ extr('url: "', '"')),
"album": text.extr(extr("Added to <a", "/a>"), ">", "<"),
"user" : extr('username: "', '"'),
}
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 13fd88a..d58db6f 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -915,7 +915,7 @@ def _build_requests_adapter(ssl_options, ssl_ciphers, source_address):
options=ssl_options or None, ciphers=ssl_ciphers)
if not requests.__version__ < "2.32":
# https://github.com/psf/requests/pull/6731
- ssl_context.load_default_certs()
+ ssl_context.load_verify_locations(requests.certs.where())
ssl_context.check_hostname = False
else:
ssl_context = None
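
The reasoning behind this change, as a short sketch: requests verifies TLS against the certifi CA bundle it ships with, so a hand-built SSLContext should load that same bundle rather than the system store that load_default_certs() would use, keeping verification behavior consistent (assumed rationale; see the linked requests PR):

    import ssl
    import requests

    ctx = ssl.create_default_context()
    # Trust the CA bundle requests itself verifies against,
    # not the operating system's default certificate store.
    ctx.load_verify_locations(requests.certs.where())
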
diff --git a/gallery_dl/extractor/discord.py b/gallery_dl/extractor/discord.py
new file mode 100644
index 0000000..6a5fcc9
--- /dev/null
+++ b/gallery_dl/extractor/discord.py
@@ -0,0 +1,399 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://discord.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+
+
+BASE_PATTERN = r"(?:https?://)?discord\.com"
+
+
+class DiscordExtractor(Extractor):
+ """Base class for Discord extractors"""
+ category = "discord"
+ root = "https://discord.com"
+ directory_fmt = ("{category}", "{server_id}_{server}",
+ "{channel_id}_{channel}")
+ filename_fmt = "{message_id}_{num:>02}_{filename}.{extension}"
+ archive_fmt = "{message_id}_{num}"
+
+ cdn_fmt = "https://cdn.discordapp.com/{}/{}/{}.png?size=4096"
+
+ server_metadata = {}
+ server_channels_metadata = {}
+
+ def _init(self):
+ self.token = self.config("token")
+ self.enabled_embeds = self.config("embeds", ["image", "gifv", "video"])
+ self.enabled_threads = self.config("threads", True)
+ self.api = DiscordAPI(self)
+
+ def extract_message_text(self, message):
+ text_content = [message["content"]]
+
+ for embed in message["embeds"]:
+ if embed["type"] == "rich":
+ try:
+ text_content.append(embed["author"]["name"])
+ except Exception:
+ pass
+ text_content.append(embed.get("title", ""))
+ text_content.append(embed.get("description", ""))
+
+ for field in embed.get("fields", []):
+ text_content.append(field.get("name", ""))
+ text_content.append(field.get("value", ""))
+
+ text_content.append(embed.get("footer", {}).get("text", ""))
+
+ if message.get("poll"):
+ text_content.append(message["poll"]["question"]["text"])
+ for answer in message["poll"]["answers"]:
+ text_content.append(answer["poll_media"]["text"])
+
+ return "\n".join(t for t in text_content if t)
+
+ def extract_message(self, message):
+ # https://discord.com/developers/docs/resources/message#message-object-message-types
+ if message["type"] in (0, 19, 21):
+ message_metadata = {}
+ message_metadata.update(self.server_metadata)
+ message_metadata.update(
+ self.server_channels_metadata[message["channel_id"]])
+ message_metadata.update({
+ "author": message["author"]["username"],
+ "author_id": message["author"]["id"],
+ "author_files": [],
+ "message": self.extract_message_text(message),
+ "message_id": message["id"],
+ "date": text.parse_datetime(
+ message["timestamp"], "%Y-%m-%dT%H:%M:%S.%f%z"
+ ),
+ "files": []
+ })
+
+ for icon_type, icon_path in (
+ ("avatar", "avatars"),
+ ("banner", "banners")
+ ):
+ if message["author"].get(icon_type):
+ message_metadata["author_files"].append({
+ "url": self.cdn_fmt.format(
+ icon_path,
+ message_metadata["author_id"],
+ message["author"][icon_type]
+ ),
+ "filename": icon_type,
+ "extension": "png",
+ })
+
+ for attachment in message["attachments"]:
+ message_metadata["files"].append({
+ "url": attachment["url"],
+ "type": "attachment",
+ })
+
+ for embed in message["embeds"]:
+ if embed["type"] in self.enabled_embeds:
+ for field in ("video", "image", "thumbnail"):
+ if field not in embed:
+ continue
+ url = embed[field].get("proxy_url")
+ if url is not None:
+ message_metadata["files"].append({
+ "url": url,
+ "type": "embed",
+ })
+ break
+
+ for num, file in enumerate(message_metadata["files"], start=1):
+ text.nameext_from_url(file["url"], file)
+ file["num"] = num
+
+ yield Message.Directory, message_metadata
+
+ for file in message_metadata["files"]:
+ message_metadata_file = message_metadata.copy()
+ message_metadata_file.update(file)
+ yield Message.Url, file["url"], message_metadata_file
+
+ def extract_channel_text(self, channel_id):
+ for message in self.api.get_channel_messages(channel_id):
+ yield from self.extract_message(message)
+
+ def extract_channel_threads(self, channel_id):
+ for thread in self.api.get_channel_threads(channel_id):
+ id = self.parse_channel(thread)["channel_id"]
+ yield from self.extract_channel_text(id)
+
+ def extract_channel(self, channel_id, safe=False):
+ try:
+ if channel_id not in self.server_channels_metadata:
+ self.parse_channel(self.api.get_channel(channel_id))
+
+ channel_type = (
+ self.server_channels_metadata[channel_id]["channel_type"]
+ )
+
+ # https://discord.com/developers/docs/resources/channel#channel-object-channel-types
+ if channel_type in (0, 5):
+ yield from self.extract_channel_text(channel_id)
+ if self.enabled_threads:
+ yield from self.extract_channel_threads(channel_id)
+ elif channel_type in (1, 3, 10, 11, 12):
+ yield from self.extract_channel_text(channel_id)
+ elif channel_type in (15, 16):
+ yield from self.extract_channel_threads(channel_id)
+ elif channel_type in (4,):
+ for channel in self.server_channels_metadata.copy().values():
+ if channel["parent_id"] == channel_id:
+ yield from self.extract_channel(
+ channel["channel_id"], safe=True)
+ elif not safe:
+ raise exception.StopExtraction(
+ "This channel type is not supported."
+ )
+ except exception.HttpError as exc:
+ if not (exc.status == 403 and safe):
+ raise
+
+ def parse_channel(self, channel):
+ parent_id = channel.get("parent_id")
+ channel_metadata = {
+ "channel": channel.get("name", ""),
+ "channel_id": channel.get("id"),
+ "channel_type": channel.get("type"),
+ "channel_topic": channel.get("topic", ""),
+ "parent_id": parent_id,
+ "is_thread": "thread_metadata" in channel
+ }
+
+ if parent_id in self.server_channels_metadata:
+ parent_metadata = self.server_channels_metadata[parent_id]
+ channel_metadata.update({
+ "parent": parent_metadata["channel"],
+ "parent_type": parent_metadata["channel_type"]
+ })
+
+ if channel_metadata["channel_type"] in (1, 3):
+ channel_metadata.update({
+ "channel": "DMs",
+ "recipients": (
+ [user["username"] for user in channel["recipients"]]
+ ),
+ "recipients_id": (
+ [user["id"] for user in channel["recipients"]]
+ )
+ })
+
+ channel_id = channel_metadata["channel_id"]
+
+ self.server_channels_metadata[channel_id] = channel_metadata
+ return channel_metadata
+
+ def parse_server(self, server):
+ self.server_metadata = {
+ "server": server["name"],
+ "server_id": server["id"],
+ "server_files": [],
+ "owner_id": server["owner_id"]
+ }
+
+ for icon_type, icon_path in (
+ ("icon", "icons"),
+ ("banner", "banners"),
+ ("splash", "splashes"),
+ ("discovery_splash", "discovery-splashes")
+ ):
+ if server.get(icon_type):
+ self.server_metadata["server_files"].append({
+ "url": self.cdn_fmt.format(
+ icon_path,
+ self.server_metadata["server_id"],
+ server[icon_type]
+ ),
+ "filename": icon_type,
+ "extension": "png",
+ })
+
+ return self.server_metadata
+
+ def build_server_and_channels(self, server_id):
+ server = self.api.get_server(server_id)
+ self.parse_server(server)
+
+ for channel in self.api.get_server_channels(server_id):
+ self.parse_channel(channel)
+
+
+class DiscordChannelExtractor(DiscordExtractor):
+ subcategory = "channel"
+ pattern = BASE_PATTERN + r"/channels/(\d+)/(?:\d+/threads/)?(\d+)/?$"
+ example = "https://discord.com/channels/1234567890/9876543210"
+
+ def items(self):
+ server_id, channel_id = self.groups
+
+ self.build_server_and_channels(server_id)
+
+ return self.extract_channel(channel_id)
+
+
+class DiscordMessageExtractor(DiscordExtractor):
+ subcategory = "message"
+ pattern = BASE_PATTERN + r"/channels/(\d+)/(\d+)/(\d+)/?$"
+ example = "https://discord.com/channels/1234567890/9876543210/2468013579"
+
+ def items(self):
+ server_id, channel_id, message_id = self.groups
+
+ self.build_server_and_channels(server_id)
+
+ if channel_id not in self.server_channels_metadata:
+ self.parse_channel(self.api.get_channel(channel_id))
+
+ return self.extract_message(
+ self.api.get_message(channel_id, message_id))
+
+
+class DiscordServerExtractor(DiscordExtractor):
+ subcategory = "server"
+ pattern = BASE_PATTERN + r"/channels/(\d+)/?$"
+ example = "https://discord.com/channels/1234567890"
+
+ def items(self):
+ server_id = self.groups[0]
+
+ self.build_server_and_channels(server_id)
+
+ for channel in self.server_channels_metadata.copy().values():
+ if channel["channel_type"] in (0, 5, 15, 16):
+ yield from self.extract_channel(
+ channel["channel_id"], safe=True)
+
+
+class DiscordDirectMessagesExtractor(DiscordExtractor):
+ subcategory = "direct-messages"
+ directory_fmt = ("{category}", "Direct Messages",
+ "{channel_id}_{recipients:J,}")
+ pattern = BASE_PATTERN + r"/channels/@me/(\d+)/?$"
+ example = "https://discord.com/channels/@me/1234567890"
+
+ def items(self):
+ return self.extract_channel(self.groups[0])
+
+
+class DiscordDirectMessageExtractor(DiscordExtractor):
+ subcategory = "direct-message"
+ directory_fmt = ("{category}", "Direct Messages",
+ "{channel_id}_{recipients:J,}")
+ pattern = BASE_PATTERN + r"/channels/@me/(\d+)/(\d+)/?$"
+ example = "https://discord.com/channels/@me/1234567890/9876543210"
+
+ def items(self):
+ channel_id, message_id = self.groups
+
+ self.parse_channel(self.api.get_channel(channel_id))
+
+ return self.extract_message(
+ self.api.get_message(channel_id, message_id))
+
+
+class DiscordAPI():
+ """Interface for the Discord API v10
+
+ https://discord.com/developers/docs/reference
+ """
+
+ def __init__(self, extractor):
+ self.extractor = extractor
+ self.root = extractor.root + "/api/v10"
+ self.headers = {"Authorization": extractor.token}
+
+ def get_server(self, server_id):
+ """Get server information"""
+ return self._call("/guilds/" + server_id)
+
+ def get_server_channels(self, server_id):
+ """Get server channels"""
+ return self._call("/guilds/" + server_id + "/channels")
+
+ def get_channel(self, channel_id):
+ """Get channel information"""
+ return self._call("/channels/" + channel_id)
+
+ def get_channel_threads(self, channel_id):
+ """Get channel threads"""
+ THREADS_BATCH = 25
+
+ def _method(offset):
+ return self._call("/channels/" + channel_id + "/threads/search", {
+ "sort_by": "last_message_time",
+ "sort_order": "desc",
+ "limit": THREADS_BATCH,
+ "offset": + offset,
+ })["threads"]
+
+ return self._pagination(_method, THREADS_BATCH)
+
+ def get_channel_messages(self, channel_id):
+ """Get channel messages"""
+ MESSAGES_BATCH = 100
+
+ before = None
+
+ def _method(_):
+ nonlocal before
+ messages = self._call("/channels/" + channel_id + "/messages", {
+ "limit": MESSAGES_BATCH,
+ "before": before
+ })
+ before = messages[-1]["id"]
+ return messages
+
+ return self._pagination(_method, MESSAGES_BATCH)
+
+ def get_message(self, channel_id, message_id):
+ """Get message information"""
+ return self._call("/channels/" + channel_id + "/messages", {
+ "limit": 1,
+ "around": message_id
+ })[0]
+
+ def _call(self, endpoint, params=None):
+ url = self.root + endpoint
+ try:
+ response = self.extractor.request(
+ url, params=params, headers=self.headers)
+ except exception.HttpError as exc:
+ if exc.status == 401:
+ self._raise_invalid_token()
+ raise
+ return response.json()
+
+ def _pagination(self, method, batch):
+ offset = 0
+ while True:
+ data = method(offset)
+ yield from data
+ if len(data) < batch:
+ return
+ offset += len(data)
+
+ @staticmethod
+ def _raise_invalid_token():
+ raise exception.AuthenticationError("""Invalid or missing token.
+Please provide a valid token following these instructions:
+
+1) Open Discord in your browser (https://discord.com/app);
+2) Open your browser's Developer Tools (F12) and switch to the Network panel;
+3) Reload the page and select any request going to https://discord.com/api/...;
+4) In the "Headers" tab, look for an entry beginning with "Authorization: ";
+5) Right-click the entry and click "Copy Value";
+6) Paste the token in your configuration file under "extractor.discord.token",
+or run this command with the -o "token=[your token]" argument.""")
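
A quick way to sanity-check a copied token outside gallery-dl, assuming the standard /users/@me endpoint of API v10 (a 401 here is what triggers the error message above):

    import requests

    token = "..."  # value copied from the Authorization header

    response = requests.get(
        "https://discord.com/api/v10/users/@me",
        headers={"Authorization": token})
    print(response.status_code)  # 200 -> valid, 401 -> invalid/missing
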
diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py
index e6d136f..55549de 100644
--- a/gallery_dl/extractor/erome.py
+++ b/gallery_dl/extractor/erome.py
@@ -44,6 +44,8 @@ class EromeExtractor(Extractor):
pos = page.index('<div class="user-profile', pos)
user, pos = text.extract(
page, 'href="https://www.erome.com/', '"', pos)
+ tags, pos = text.extract(
+ page, '<p class="mt-10"', '</p>', pos)
urls = []
date = None
@@ -59,11 +61,13 @@ class EromeExtractor(Extractor):
date = text.parse_timestamp(ts)
data = {
- "album_id" : album_id,
- "title" : text.unescape(title),
- "user" : text.unquote(user),
- "count" : len(urls),
- "date" : date,
+ "album_id": album_id,
+ "title" : text.unescape(title),
+ "user" : text.unquote(user),
+ "count" : len(urls),
+ "date" : date,
+ "tags" : [t.replace("+", " ")
+ for t in text.extract_iter(tags, "?q=", '"')],
"_http_headers": {"Referer": url},
}
diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py
index 44c4542..5f90afc 100644
--- a/gallery_dl/extractor/foolfuuka.py
+++ b/gallery_dl/extractor/foolfuuka.py
@@ -81,8 +81,8 @@ BASE_PATTERN = FoolfuukaExtractor.update({
"pattern": r"(?:www\.)?archiveofsins\.com",
},
"b4k": {
- "root": "https://arch.b4k.co",
- "pattern": r"arch\.b4k\.co",
+ "root": "https://arch.b4k.dev",
+ "pattern": r"arch\.b4k\.(?:dev|co)",
},
"desuarchive": {
"root": "https://desuarchive.org",
diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py
index d253582..1466390 100644
--- a/gallery_dl/extractor/furaffinity.py
+++ b/gallery_dl/extractor/furaffinity.py
@@ -23,6 +23,7 @@ class FuraffinityExtractor(Extractor):
cookies_domain = ".furaffinity.net"
cookies_names = ("a", "b")
root = "https://www.furaffinity.net"
+ request_interval = 1.0
_warning = True
def __init__(self, match):
diff --git a/gallery_dl/extractor/furry34.py b/gallery_dl/extractor/furry34.py
new file mode 100644
index 0000000..e0c7fdb
--- /dev/null
+++ b/gallery_dl/extractor/furry34.py
@@ -0,0 +1,156 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://furry34.com/"""
+
+from .booru import BooruExtractor
+from .. import text
+import collections
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?furry34\.com"
+
+
+class Furry34Extractor(BooruExtractor):
+ category = "furry34"
+ root = "https://furry34.com"
+ root_cdn = "https://furry34com.b-cdn.net"
+ filename_fmt = "{category}_{id}.{extension}"
+ per_page = 30
+
+ TAG_TYPES = {
+ None: "general",
+ 1 : "general",
+ 2 : "copyright",
+ 4 : "character",
+ 8 : "artist",
+ }
+ FORMATS = (
+ ("100", "mov.mp4"),
+ ("101", "mov720.mp4"),
+ ("102", "mov480.mp4"),
+ ("10" , "pic.jpg"),
+ )
+
+ def _file_url(self, post):
+ files = post["files"]
+ for fmt, extension in self.FORMATS:
+ if fmt in files:
+ break
+ else:
+ fmt = next(iter(files))
+
+ post_id = post["id"]
+ root = self.root_cdn if files[fmt][0] else self.root
+ post["file_url"] = url = "{}/posts/{}/{}/{}.{}".format(
+ root, post_id // 1000, post_id, post_id, extension)
+ post["format_id"] = fmt
+ post["format"] = extension.partition(".")[0]
+
+ return url
+
+ def _prepare(self, post):
+ post.pop("files", None)
+ post["date"] = text.parse_datetime(
+ post["created"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ post["filename"], _, post["format"] = post["filename"].rpartition(".")
+ if "tags" in post:
+ post["tags"] = [t["value"] for t in post["tags"]]
+
+ def _tags(self, post, _):
+ if "tags" not in post:
+ post.update(self._fetch_post(post["id"]))
+
+ tags = collections.defaultdict(list)
+ for tag in post["tags"]:
+ tags[tag["type"] or 1].append(tag["value"])
+ types = self.TAG_TYPES
+ for type, values in tags.items():
+ post["tags_" + types[type]] = values
+
+ def _fetch_post(self, post_id):
+ url = "{}/api/v2/post/{}".format(self.root, post_id)
+ return self.request(url).json()
+
+ def _pagination(self, endpoint, params=None):
+ url = "{}/api{}".format(self.root, endpoint)
+
+ if params is None:
+ params = {}
+ params["sortBy"] = 0
+ params["take"] = self.per_page
+ threshold = self.per_page
+
+ while True:
+ data = self.request(url, method="POST", json=params).json()
+
+ yield from data["items"]
+
+ if len(data["items"]) < threshold:
+ return
+ params["cursor"] = data.get("cursor")
+
+
+class Furry34PostExtractor(Furry34Extractor):
+ subcategory = "post"
+ archive_fmt = "{id}"
+ pattern = BASE_PATTERN + r"/post/(\d+)"
+ example = "https://furry34.com/post/12345"
+
+ def posts(self):
+ return (self._fetch_post(self.groups[0]),)
+
+
+class Furry34PlaylistExtractor(Furry34Extractor):
+ subcategory = "playlist"
+ directory_fmt = ("{category}", "{playlist_id}")
+ archive_fmt = "p_{playlist_id}_{id}"
+ pattern = BASE_PATTERN + r"/playlists/view/(\d+)"
+ example = "https://furry34.com/playlists/view/12345"
+
+ def metadata(self):
+ return {"playlist_id": self.groups[0]}
+
+ def posts(self):
+ endpoint = "/v2/post/search/playlist/" + self.groups[0]
+ return self._pagination(endpoint)
+
+
+class Furry34TagExtractor(Furry34Extractor):
+ subcategory = "tag"
+ directory_fmt = ("{category}", "{search_tags}")
+ archive_fmt = "t_{search_tags}_{id}"
+ pattern = BASE_PATTERN + r"/(?:([^/?#]+))?(?:/?\?([^#]+))?(?:$|#)"
+ example = "https://furry34.com/TAG"
+
+ def _init(self):
+ tag, query = self.groups
+ params = text.parse_query(query)
+
+ self.tags = tags = []
+ if tag:
+ tags.extend(text.unquote(text.unquote(tag)).split("|"))
+ if "tags" in params:
+ tags.extend(params["tags"].split("|"))
+
+ type = params.get("type")
+ if type == "video":
+ self.type = 1
+ elif type == "image":
+ self.type = 0
+ else:
+ self.type = None
+
+ def metadata(self):
+ return {"search_tags": " ".join(self.tags)}
+
+ def posts(self):
+ endpoint = "/v2/post/search/root"
+ params = {"includeTags": [t.replace("_", " ") for t in self.tags]}
+ if self.type is not None:
+ params["type"] = self.type
+ return self._pagination(endpoint, params)
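
A worked example of the format selection and CDN path construction in _file_url, assuming post ID 12345 whose best available format is the 720p video:

    FORMATS = (
        ("100", "mov.mp4"),
        ("101", "mov720.mp4"),
        ("102", "mov480.mp4"),
        ("10" , "pic.jpg"),
    )

    post = {"id": 12345, "files": {"101": [1]}}  # 1 -> served from the CDN
    for fmt, extension in FORMATS:
        if fmt in post["files"]:
            break  # first match wins: "101" / "mov720.mp4"

    post_id = post["id"]
    url = "{}/posts/{}/{}/{}.{}".format(
        "https://furry34com.b-cdn.net", post_id // 1000,
        post_id, post_id, extension)
    # -> https://furry34com.b-cdn.net/posts/12/12345/12345.mov720.mp4
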
diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py
index 370cd43..4b04732 100644
--- a/gallery_dl/extractor/generic.py
+++ b/gallery_dl/extractor/generic.py
@@ -37,6 +37,7 @@ class GenericExtractor(Extractor):
example = "generic:https://www.nongnu.org/lzip/"
def __init__(self, match):
+ self.subcategory = match.group('domain')
Extractor.__init__(self, match)
# Strip the "g(eneric):" prefix
@@ -54,7 +55,6 @@ class GenericExtractor(Extractor):
self.scheme = 'https://'
self.url = text.ensure_http_scheme(self.url, self.scheme)
- self.subcategory = match.group('domain')
self.path = match.group('path')
# Used to resolve relative image urls
diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py
index 481fb1e..20f8ea4 100644
--- a/gallery_dl/extractor/imgur.py
+++ b/gallery_dl/extractor/imgur.py
@@ -142,7 +142,8 @@ class ImgurGalleryExtractor(ImgurExtractor):
class ImgurUserExtractor(ImgurExtractor):
"""Extractor for all images posted by a user"""
subcategory = "user"
- pattern = BASE_PATTERN + r"/user/([^/?#]+)(?:/posts|/submitted)?/?$"
+ pattern = (BASE_PATTERN + r"/user/(?!me(?:/|$|\?|#))"
+ r"([^/?#]+)(?:/posts|/submitted)?/?$")
example = "https://imgur.com/user/USER"
def items(self):
@@ -174,6 +175,23 @@ class ImgurFavoriteFolderExtractor(ImgurExtractor):
self.key, self.folder_id))
+class ImgurMeExtractor(ImgurExtractor):
+ """Extractor for your personal uploads"""
+ subcategory = "me"
+ pattern = BASE_PATTERN + r"/user/me(?:/posts)?(/hidden)?"
+ example = "https://imgur.com/user/me"
+
+ def items(self):
+ if not self.cookies_check(("accesstoken",)):
+ self.log.error("'accesstoken' cookie required")
+
+ if self.groups[0]:
+ posts = self.api.accounts_me_hiddenalbums()
+ else:
+ posts = self.api.accounts_me_allposts()
+ return self._items_queue(posts)
+
+
class ImgurSubredditExtractor(ImgurExtractor):
"""Extractor for a subreddits's imgur links"""
subcategory = "subreddit"
@@ -215,6 +233,10 @@ class ImgurAPI():
self.client_id = extractor.config("client-id") or "546c25a59c58ad7"
self.headers = {"Authorization": "Client-ID " + self.client_id}
+ def account_submissions(self, account):
+ endpoint = "/3/account/{}/submissions".format(account)
+ return self._pagination(endpoint)
+
def account_favorites(self, account):
endpoint = "/3/account/{}/gallery_favorites".format(account)
return self._pagination(endpoint)
@@ -224,15 +246,29 @@ class ImgurAPI():
account, folder_id)
return self._pagination_v2(endpoint)
+ def accounts_me_allposts(self):
+ endpoint = "/post/v1/accounts/me/all_posts"
+ params = {
+ "include": "media,tags,account",
+ "page" : 1,
+ "sort" : "-created_at",
+ }
+ return self._pagination_v2(endpoint, params)
+
+ def accounts_me_hiddenalbums(self):
+ endpoint = "/post/v1/accounts/me/hidden_albums"
+ params = {
+ "include": "media,tags,account",
+ "page" : 1,
+ "sort" : "-created_at",
+ }
+ return self._pagination_v2(endpoint, params)
+
def gallery_search(self, query):
endpoint = "/3/gallery/search"
params = {"q": query}
return self._pagination(endpoint, params)
- def account_submissions(self, account):
- endpoint = "/3/account/{}/submissions".format(account)
- return self._pagination(endpoint)
-
def gallery_subreddit(self, subreddit):
endpoint = "/3/gallery/r/{}".format(subreddit)
return self._pagination(endpoint)
@@ -284,12 +320,16 @@ class ImgurAPI():
if params is None:
params = {}
params["client_id"] = self.client_id
- params["page"] = 0
- params["sort"] = "newest"
+ if "page" not in params:
+ params["page"] = 0
+ if "sort" not in params:
+ params["sort"] = "newest"
headers = {"Origin": "https://imgur.com"}
while True:
- data = self._call(endpoint, params, headers)["data"]
+ data = self._call(endpoint, params, headers)
+ if "data" in data:
+ data = data["data"]
if not data:
return
yield from data
diff --git a/gallery_dl/extractor/imhentai.py b/gallery_dl/extractor/imhentai.py
new file mode 100644
index 0000000..0439f5b
--- /dev/null
+++ b/gallery_dl/extractor/imhentai.py
@@ -0,0 +1,140 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://imhentai.xxx/ and mirror sites"""
+
+from .common import GalleryExtractor, BaseExtractor, Message
+from .. import text, util
+
+
+class ImhentaiExtractor(BaseExtractor):
+ basecategory = "IMHentai"
+
+ def _pagination(self, url):
+ prev = None
+ base = self.root + "/gallery/"
+ data = {"_extractor": ImhentaiGalleryExtractor}
+
+ while True:
+ page = self.request(url).text
+ extr = text.extract_from(page)
+
+ while True:
+ gallery_id = extr('<a href="/gallery/', '"')
+ if gallery_id == prev:
+ continue
+ if not gallery_id:
+ break
+ yield Message.Queue, base + gallery_id, data
+ prev = gallery_id
+
+ href = text.rextract(page, "class='page-link' href='", "'")[0]
+ if not href or href == "#":
+ return
+ if href[0] == "/":
+ if href[1] == "/":
+ href = "https:" + href
+ else:
+ href = self.root + href
+ url = href
+
+
+BASE_PATTERN = ImhentaiExtractor.update({
+ "imhentai": {
+ "root": "https://imhentai.xxx",
+ "pattern": r"(?:www\.)?imhentai\.xxx",
+ },
+ "hentaiera": {
+ "root": "https://hentaiera.com",
+ "pattern": r"(?:www\.)?hentaiera\.com",
+ },
+ "hentairox": {
+ "root": "https://hentairox.com",
+ "pattern": r"(?:www\.)?hentairox\.com",
+ },
+})
+
+
+class ImhentaiGalleryExtractor(ImhentaiExtractor, GalleryExtractor):
+ """Extractor for imhentai galleries"""
+ pattern = BASE_PATTERN + r"/(?:gallery|view)/(\d+)"
+ example = "https://imhentai.xxx/gallery/12345/"
+
+ def __init__(self, match):
+ ImhentaiExtractor.__init__(self, match)
+ self.gallery_id = self.groups[-1]
+ self.gallery_url = "{}/gallery/{}/".format(self.root, self.gallery_id)
+
+ def metadata(self, page):
+ extr = text.extract_from(page)
+
+ data = {
+ "gallery_id": text.parse_int(self.gallery_id),
+ "title" : text.unescape(extr("<h1>", "<")),
+ "title_alt" : text.unescape(extr('class="subtitle">', "<")),
+ "parody" : self._split(extr(">Parodies", "</li>")),
+ "character" : self._split(extr(">Characters", "</li>")),
+ "tags" : self._split(extr(">Tags", "</li>")),
+ "artist" : self._split(extr(">Artists", "</li>")),
+ "group" : self._split(extr(">Groups", "</li>")),
+ "language" : self._split(extr(">Languages", "</li>")),
+ "type" : extr("href='/category/", "/"),
+ }
+
+ if data["language"]:
+ data["lang"] = util.language_to_code(data["language"][0])
+
+ return data
+
+ def _split(self, html):
+ results = []
+ for tag in text.extract_iter(html, ">", "</a>"):
+ tag = tag.partition(" <span class='badge'>")[0]
+ if "<" in tag:
+ tag = text.remove_html(tag)
+ results.append(tag)
+ return results
+
+ def images(self, page):
+ data = util.json_loads(text.extr(page, "$.parseJSON('", "'"))
+ base = text.extr(page, 'data-src="', '"').rpartition("/")[0] + "/"
+ exts = {"j": "jpg", "p": "png", "g": "gif", "w": "webp", "a": "avif"}
+
+ results = []
+ for i in map(str, range(1, len(data)+1)):
+ ext, width, height = data[i].split(",")
+ url = base + i + "." + exts[ext]
+ results.append((url, {
+ "width" : text.parse_int(width),
+ "height": text.parse_int(height),
+ }))
+ return results
+
+
+class ImhentaiTagExtractor(ImhentaiExtractor):
+ """Extractor for imhentai tag searches"""
+ subcategory = "tag"
+ pattern = (BASE_PATTERN + r"(/(?:"
+ r"artist|category|character|group|language|parody|tag"
+ r")/([^/?#]+))")
+ example = "https://imhentai.xxx/tag/TAG/"
+
+ def items(self):
+ url = self.root + self.groups[-2] + "/"
+ return self._pagination(url)
+
+
+class ImhentaiSearchExtractor(ImhentaiExtractor):
+ """Extractor for imhentai search results"""
+ subcategory = "search"
+ pattern = BASE_PATTERN + r"/search/?\?([^#]+)"
+ example = "https://imhentai.xxx/search/?key=QUERY"
+
+ def items(self):
+ url = self.root + "/search/?" + self.groups[-1]
+ return self._pagination(url)
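
The blob parsed by images() maps page numbers to "extension,width,height" strings. A minimal decoding sketch with fabricated values (the CDN base URL is hypothetical):

    data = {"1": "j,1280,1807", "2": "p,1280,1811"}
    base = "https://cdn.example.com/001/abcdef/"  # hypothetical base URL
    exts = {"j": "jpg", "p": "png", "g": "gif", "w": "webp", "a": "avif"}

    for i in map(str, range(1, len(data) + 1)):
        ext, width, height = data[i].split(",")
        print(base + i + "." + exts[ext], width, height)
    # https://cdn.example.com/001/abcdef/1.jpg 1280 1807
    # https://cdn.example.com/001/abcdef/2.png 1280 1811
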
diff --git a/gallery_dl/extractor/issuu.py b/gallery_dl/extractor/issuu.py
index b900113..65717b4 100644
--- a/gallery_dl/extractor/issuu.py
+++ b/gallery_dl/extractor/issuu.py
@@ -30,8 +30,8 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor):
def metadata(self, page):
pos = page.rindex('id="initial-data"')
- data = util.json_loads(text.rextract(
- page, '<script data-json="', '"', pos)[0].replace("&quot;", '"'))
+ data = util.json_loads(text.unescape(text.rextract(
+ page, '<script data-json="', '"', pos)[0]))
doc = data["initialDocumentData"]["document"]
doc["date"] = text.parse_datetime(
diff --git a/gallery_dl/extractor/itaku.py b/gallery_dl/extractor/itaku.py
index 7f941bb..5c91eb9 100644
--- a/gallery_dl/extractor/itaku.py
+++ b/gallery_dl/extractor/itaku.py
@@ -24,10 +24,6 @@ class ItakuExtractor(Extractor):
archive_fmt = "{id}"
request_interval = (0.5, 1.5)
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.item = match.group(1)
-
def _init(self):
self.api = ItakuAPI(self)
self.videos = self.config("videos", True)
@@ -62,11 +58,11 @@ class ItakuExtractor(Extractor):
class ItakuGalleryExtractor(ItakuExtractor):
"""Extractor for posts from an itaku user gallery"""
subcategory = "gallery"
- pattern = BASE_PATTERN + r"/profile/([^/?#]+)/gallery"
+ pattern = BASE_PATTERN + r"/profile/([^/?#]+)/gallery(?:/(\d+))?"
example = "https://itaku.ee/profile/USER/gallery"
def posts(self):
- return self.api.galleries_images(self.item)
+ return self.api.galleries_images(*self.groups)
class ItakuImageExtractor(ItakuExtractor):
@@ -75,7 +71,7 @@ class ItakuImageExtractor(ItakuExtractor):
example = "https://itaku.ee/images/12345"
def posts(self):
- return (self.api.image(self.item),)
+ return (self.api.image(self.groups[0]),)
class ItakuSearchExtractor(ItakuExtractor):
@@ -84,7 +80,7 @@ class ItakuSearchExtractor(ItakuExtractor):
example = "https://itaku.ee/home/images?tags=SEARCH"
def posts(self):
- params = text.parse_query_list(self.item)
+ params = text.parse_query_list(self.groups[0])
return self.api.search_images(params)
@@ -138,7 +134,7 @@ class ItakuAPI():
params = {
"cursor" : None,
"owner" : self.user(username)["owner"],
- "section" : section,
+ "sections" : section,
"date_range": "",
"maturity_rating": ("SFW", "Questionable", "NSFW"),
"ordering" : "-date_added",
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index 8ffa14b..648f7df 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -190,8 +190,8 @@ class NewgroundsExtractor(Extractor):
extr = text.extract_from(page)
data = extract_data(extr, post_url)
- data["_comment"] = extr(
- 'id="author_comments"', '</div>').partition(">")[2]
+ data["comment_html"] = data["_comment"] = extr(
+ 'id="author_comments"', '</div>').partition(">")[2].strip()
data["comment"] = text.unescape(text.remove_html(
data["_comment"]
.replace("<p><br></p>", "\n\n").replace("<br>", "\n"), "", ""))
diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
index e7540f8..815a214 100644
--- a/gallery_dl/extractor/oauth.py
+++ b/gallery_dl/extractor/oauth.py
@@ -83,8 +83,9 @@ class OAuthBase(Extractor):
browser = None
if browser and browser.open(url):
- name = getattr(browser, "name", None) or "Browser"
- self.log.info("Opening URL in %s:", name.capitalize())
+ name = getattr(browser, "name", None)
+ if name:
+ self.log.info("Opening URL with %s:", name.capitalize())
else:
self.log.info("Please open this URL in your browser:")
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index 866e93a..f5a33d5 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -169,6 +169,12 @@ class PatreonExtractor(Extractor):
attr["date"] = text.parse_datetime(
attr["published_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
+ try:
+ attr["campaign"] = (included["campaign"][
+ relationships["campaign"]["data"]["id"]])
+ except Exception:
+ attr["campaign"] = None
+
tags = relationships.get("user_defined_tags")
attr["tags"] = [
tag["id"].replace("user_defined;", "")
@@ -324,7 +330,8 @@ class PatreonCreatorExtractor(PatreonExtractor):
subcategory = "creator"
pattern = (r"(?:https?://)?(?:www\.)?patreon\.com"
r"/(?!(?:home|join|posts|login|signup)(?:$|[/?#]))"
- r"(?:c/)?([^/?#]+)(?:/posts)?/?(?:\?([^#]+))?")
+ r"(?:profile/creators|(?:c/)?([^/?#]+)(?:/posts)?)"
+ r"/?(?:\?([^#]+))?")
example = "https://www.patreon.com/USER"
def posts(self):
@@ -345,7 +352,7 @@ class PatreonCreatorExtractor(PatreonExtractor):
return self._pagination(url)
def _get_campaign_id(self, creator, query):
- if creator.startswith("id:"):
+ if creator and creator.startswith("id:"):
return creator[3:]
campaign_id = query.get("c") or query.get("campaign_id")
diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py
index 1b67272..201d4d6 100644
--- a/gallery_dl/extractor/philomena.py
+++ b/gallery_dl/extractor/philomena.py
@@ -10,7 +10,6 @@
from .booru import BooruExtractor
from .. import text, exception
-import operator
class PhilomenaExtractor(BooruExtractor):
@@ -24,17 +23,22 @@ class PhilomenaExtractor(BooruExtractor):
def _init(self):
self.api = PhilomenaAPI(self)
- if not self.config("svg", True):
- self._file_url = operator.itemgetter("view_url")
+ self.svg = self.config("svg", True)
def _file_url(self, post):
- if post["format"] == "svg":
- return post["view_url"].rpartition(".")[0] + ".svg"
- return post["view_url"]
+ try:
+ url = post["representations"]["full"]
+ except Exception:
+ url = post["view_url"]
+
+ if self.svg and post["format"] == "svg":
+ return url.rpartition(".")[0] + ".svg"
+ return url
@staticmethod
def _prepare(post):
- post["date"] = text.parse_datetime(post["created_at"])
+ post["date"] = text.parse_datetime(
+ post["created_at"][:19], "%Y-%m-%dT%H:%M:%S")
BASE_PATTERN = PhilomenaExtractor.update({
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index 7fe8869..8a4905d 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -71,9 +71,12 @@ class PixivExtractor(Extractor):
if self.meta_user:
work.update(self.api.user_detail(work["user"]["id"]))
if self.meta_comments:
- if work["total_comments"]:
- work["comments"] = list(
- self.api.illust_comments(work["id"]))
+ if work["total_comments"] and not work.get("_ajax"):
+ try:
+ work["comments"] = list(
+ self.api.illust_comments(work["id"]))
+ except Exception:
+ work["comments"] = ()
else:
work["comments"] = ()
if self.meta_bookmark and work["is_bookmarked"]:
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index 89eafc8..f36b1f5 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -259,6 +259,8 @@ class RedditSubredditExtractor(RedditExtractor):
self.subreddit, sub, params = match.groups()
self.params = text.parse_query(params)
if sub:
+ if sub == "search" and "restrict_sr" not in self.params:
+ self.params["restrict_sr"] = "1"
self.subcategory += "-" + sub
RedditExtractor.__init__(self, match)
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index 5e3a958..b5cdb9c 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -194,7 +194,6 @@ class SankakuAPI():
self.extractor = extractor
self.headers = {
"Accept" : "application/vnd.sankaku.api+json;v=2",
- "Platform" : "web-app",
"Api-Version": None,
"Origin" : extractor.root,
}
diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py
index 8668330..6c43941 100644
--- a/gallery_dl/extractor/subscribestar.py
+++ b/gallery_dl/extractor/subscribestar.py
@@ -51,6 +51,23 @@ class SubscribestarExtractor(Extractor):
def posts(self):
"""Yield HTML content of all relevant posts"""
+ def request(self, url, **kwargs):
+ while True:
+ response = Extractor.request(self, url, **kwargs)
+
+ if response.history and "/verify_subscriber" in response.url:
+ raise exception.StopExtraction(
+ "HTTP redirect to %s", response.url)
+
+ content = response.content
+ if len(content) < 250 and b">redirected<" in content:
+ url = text.unescape(text.extr(
+ content, b'href="', b'"').decode())
+ self.log.debug("HTML redirect message for %s", url)
+ continue
+
+ return response
+
def login(self):
if self.cookies_check(self.cookies_names):
return
@@ -189,10 +206,11 @@ class SubscribestarPostExtractor(SubscribestarExtractor):
extr = text.extract_from(html)
return {
"post_id" : text.parse_int(extr('data-id="', '"')),
- "author_name": text.unescape(extr('href="/', '"')),
- "author_id" : text.parse_int(extr('data-user-id="', '"')),
- "author_nick": text.unescape(extr('alt="', '"')),
"date" : self._parse_datetime(extr(
- '<span class="star_link-types">', '<')),
+ '<div class="section-title_date">', '<')),
"content" : extr('<body>', '</body>').strip(),
+ "author_name": text.unescape(extr(
+ 'class="star_link" href="/', '"')),
+ "author_id" : text.parse_int(extr('data-user-id="', '"')),
+ "author_nick": text.unescape(extr('alt="', '"')),
}
diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py
new file mode 100644
index 0000000..f129b1c
--- /dev/null
+++ b/gallery_dl/extractor/tiktok.py
@@ -0,0 +1,253 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.tiktok.com/"""
+
+from .common import Extractor, Message
+from .. import text, util, ytdl, exception
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?tiktokv?\.com"
+
+
+class TiktokExtractor(Extractor):
+ """Base class for TikTok extractors"""
+ category = "tiktok"
+ directory_fmt = ("{category}", "{user}")
+ filename_fmt = (
+ "{id}{num:?_//>02} {title[b:150]}{img_id:? [/]/}.{extension}")
+ archive_fmt = "{id}_{num}_{img_id}"
+ root = "https://www.tiktok.com"
+ cookies_domain = ".tiktok.com"
+
+ def _init(self):
+ self.audio = self.config("audio", True)
+ self.video = self.config("videos", True)
+ if not self.config("avatar", True):
+ self.avatar = util.false
+
+ def items(self):
+ # We assume that all of the URLs served by urls() come from the same
+ # author.
+ downloaded_avatar = not self.avatar()
+
+ for tiktok_url in self.urls():
+ tiktok_url = self._sanitize_url(tiktok_url)
+ data = self._extract_rehydration_data(tiktok_url)
+ if "webapp.video-detail" not in data:
+ # Only /video/ links result in the video-detail dict we need.
+ # Try again using that form of link.
+ tiktok_url = self._sanitize_url(
+ data["seo.abtest"]["canonical"])
+ data = self._extract_rehydration_data(tiktok_url)
+ video_detail = data["webapp.video-detail"]
+
+ if not self._check_status_code(video_detail, tiktok_url):
+ continue
+
+ post = video_detail["itemInfo"]["itemStruct"]
+ author = post["author"]
+ post["user"] = user = author["uniqueId"]
+ post["date"] = text.parse_timestamp(post["createTime"])
+ original_title = title = post["desc"]
+
+ if not downloaded_avatar:
+ avatar_url = author["avatarLarger"]
+ avatar = self._generate_avatar(
+ avatar_url, post, user, author["id"])
+ yield Message.Directory, avatar
+ yield Message.Url, avatar_url, avatar
+ downloaded_avatar = True
+
+ yield Message.Directory, post
+ ytdl_media = False
+
+ if "imagePost" in post:
+ if not original_title:
+ title = "TikTok photo #{}".format(post["id"])
+ img_list = post["imagePost"]["images"]
+ for i, img in enumerate(img_list, 1):
+ url = img["imageURL"]["urlList"][0]
+ text.nameext_from_url(url, post)
+ post.update({
+ "type" : "image",
+ "image" : img,
+ "title" : title,
+ "num" : i,
+ "img_id": post["filename"].partition("~")[0],
+ "width" : img["imageWidth"],
+ "height": img["imageHeight"],
+ })
+ yield Message.Url, url, post
+
+ if self.audio and "music" in post:
+ ytdl_media = "audio"
+
+ elif self.video and "video" in post:
+ ytdl_media = "video"
+
+ else:
+ self.log.info("%s: Skipping post", tiktok_url)
+
+ if ytdl_media:
+ if not original_title:
+ title = "TikTok {} #{}".format(ytdl_media, post["id"])
+ post.update({
+ "type" : ytdl_media,
+ "image" : None,
+ "filename" : "",
+ "extension" : "mp3" if ytdl_media == "audio" else "mp4",
+ "title" : title,
+ "num" : 0,
+ "img_id" : "",
+ "width" : 0,
+ "height" : 0,
+ })
+ yield Message.Url, "ytdl:" + tiktok_url, post
+
+ # If we couldn't download the avatar because the given user has no
+ # posts, we'll need to make a separate request for the user's page
+ # and download the avatar that way.
+ if not downloaded_avatar:
+ user_name = self.avatar()
+ profile_url = "https://www.tiktok.com/@{}".format(user_name)
+ data = self._extract_rehydration_data(profile_url)
+ data = data["webapp.user-detail"]["userInfo"]["user"]
+ data["user"] = user_name
+ avatar_url = data["avatarLarger"]
+ avatar = self._generate_avatar(
+ avatar_url, data, user_name, data["id"])
+ yield Message.Directory, avatar
+ yield Message.Url, avatar_url, avatar
+
+ def avatar(self):
+ return False
+
+ def _generate_avatar(self, avatar_url, data, user_name, user_id):
+ avatar = text.nameext_from_url(avatar_url, data.copy())
+ avatar.update({
+ "type" : "avatar",
+ "title" : "@" + user_name,
+ "id" : user_id,
+ "img_id": avatar["filename"].partition("~")[0],
+ "num" : 0,
+ })
+ return avatar
+
+ def _sanitize_url(self, url):
+ return text.ensure_http_scheme(url.replace("/photo/", "/video/", 1))
+
+ def _extract_rehydration_data(self, url):
+ html = self.request(url).text
+ data = text.extr(
+ html, '<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" '
+ 'type="application/json">', '</script>')
+ return util.json_loads(data)["__DEFAULT_SCOPE__"]
+
+ def _check_status_code(self, detail, url):
+ status = detail.get("statusCode")
+ if not status:
+ return True
+
+ if status == 10222:
+ self.log.error("%s: Login required to access this post", url)
+ elif status == 10204:
+ self.log.error("%s: Requested post not available", url)
+ elif status == 10231:
+ self.log.error("%s: Region locked - Try downloading with a"
+ "VPN/proxy connection", url)
+ else:
+ self.log.error(
+ "%s: Received unknown error code %s ('%s')",
+ url, status, detail.get("statusMsg") or "")
+ return False
+
+
+class TiktokPostExtractor(TiktokExtractor):
+ """Extract a single video or photo TikTok link"""
+ subcategory = "post"
+ pattern = BASE_PATTERN + r"/(?:@([\w_.-]*)|share)/(?:phot|vide)o/(\d+)"
+ example = "https://www.tiktok.com/@USER/photo/1234567890"
+
+ def urls(self):
+ user, post_id = self.groups
+ url = "{}/@{}/video/{}".format(self.root, user or "", post_id)
+ return (url,)
+
+
+class TiktokVmpostExtractor(TiktokExtractor):
+ """Extract a single video or photo TikTok VM link"""
+ subcategory = "vmpost"
+ pattern = (r"(?:https?://)?(?:"
+ r"(?:v[mt]\.)?tiktok\.com|(?:www\.)?tiktok\.com/t"
+ r")/(?!@)([^/?#]+)")
+ example = "https://vm.tiktok.com/1a2B3c4E5"
+
+ def items(self):
+ url = text.ensure_http_scheme(self.url)
+ headers = {"User-Agent": "facebookexternalhit/1.1"}
+
+ response = self.request(url, headers=headers, method="HEAD",
+ allow_redirects=False, notfound="post")
+
+ url = response.headers.get("Location")
+ if not url or len(url) <= 28:
+ # https://www.tiktok.com/?_r=1
+ raise exception.NotFoundError("post")
+
+ data = {"_extractor": TiktokPostExtractor}
+ yield Message.Queue, url.partition("?")[0], data
+
+
+class TiktokUserExtractor(TiktokExtractor):
+ """Extract a TikTok user's profile"""
+ subcategory = "user"
+ pattern = BASE_PATTERN + r"/@([\w_.-]+)/?(?:$|\?|#)"
+ example = "https://www.tiktok.com/@USER"
+
+ def urls(self):
+ """Attempt to use yt-dlp/youtube-dl to extract links from a
+ user's page"""
+
+ try:
+ module = ytdl.import_module(self.config("module"))
+ except (ImportError, SyntaxError) as exc:
+ self.log.error("Cannot import module '%s'",
+ getattr(exc, "name", ""))
+ self.log.debug("", exc_info=exc)
+ raise exception.ExtractionError("yt-dlp or youtube-dl is required "
+ "for this feature!")
+ extr_opts = {
+ "extract_flat" : True,
+ "ignore_no_formats_error": True,
+ }
+ user_opts = {
+ "retries" : self._retries,
+ "socket_timeout" : self._timeout,
+ "nocheckcertificate" : not self._verify,
+ "playlist_items" : str(self.config("tiktok-range", "")),
+ }
+ if self._proxies:
+ user_opts["proxy"] = self._proxies.get("http")
+
+ ytdl_instance = ytdl.construct_YoutubeDL(
+ module, self, user_opts, extr_opts)
+
+ # transfer cookies to ytdl
+ if self.cookies:
+ set_cookie = ytdl_instance.cookiejar.set_cookie
+ for cookie in self.cookies:
+ set_cookie(cookie)
+
+ with ytdl_instance as ydl:
+ info_dict = ydl._YoutubeDL__extract_info(
+ "{}/@{}".format(self.root, self.groups[0]),
+ ydl.get_info_extractor("TikTokUser"),
+ False, {}, True)
+ # This should include video and photo posts in /video/ URL form.
+ return [video["url"] for video in info_dict["entries"]]
+
+ def avatar(self):
+ return self.groups[0]
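
A self-contained sketch of _extract_rehydration_data, run against an inline HTML snippet instead of a live TikTok page:

    import json

    html = ('<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" '
            'type="application/json">{"__DEFAULT_SCOPE__": '
            '{"seo.abtest": {"canonical": '
            '"https://www.tiktok.com/@user/video/123"}}}</script>')

    marker = ('<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" '
              'type="application/json">')
    start = html.index(marker) + len(marker)
    payload = html[start:html.index("</script>", start)]

    data = json.loads(payload)["__DEFAULT_SCOPE__"]
    print(data["seo.abtest"]["canonical"])
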
diff --git a/gallery_dl/extractor/twibooru.py b/gallery_dl/extractor/twibooru.py
index a725a2c..3b0ea36 100644
--- a/gallery_dl/extractor/twibooru.py
+++ b/gallery_dl/extractor/twibooru.py
@@ -12,7 +12,7 @@ from .booru import BooruExtractor
from .. import text, exception
import operator
-BASE_PATTERN = r"(?:https?://)?twibooru\.org"
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?twibooru\.org"
class TwibooruExtractor(BooruExtractor):
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 840e846..c391bad 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -234,6 +234,13 @@ class TwitterExtractor(Extractor):
for fmt in self._size_fallback:
yield base + fmt
+ def _extract_components(self, tweet, data, files):
+ for component_id in data["components"]:
+ com = data["component_objects"][component_id]
+ for conv in com["data"]["conversation_preview"]:
+ for url in conv.get("mediaUrls") or ():
+ files.append({"url": url})
+
def _extract_card(self, tweet, files):
card = tweet["card"]
if "legacy" in card:
@@ -272,7 +279,11 @@ class TwitterExtractor(Extractor):
return
elif name == "unified_card":
data = util.json_loads(bvals["unified_card"]["string_value"])
- self._extract_media(tweet, data["media_entities"].values(), files)
+ if "media_entities" in data:
+ self._extract_media(
+ tweet, data["media_entities"].values(), files)
+ if "component_objects" in data:
+ self._extract_components(tweet, data, files)
return
if self.cards == "ytdl":
@@ -1065,7 +1076,7 @@ class TwitterAPI():
else:
csrf_token = None
if not csrf_token:
- csrf_token = util.generate_token(80)
+ csrf_token = util.generate_token()
cookies.set("ct0", csrf_token, domain=cookies_domain)
auth_token = cookies.get("auth_token", domain=cookies_domain)
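
For reference, a sketch of the unified_card layout that _extract_components walks, using a fabricated payload:

    data = {
        "components": ["component-1"],
        "component_objects": {
            "component-1": {
                "data": {
                    "conversation_preview": [
                        {"mediaUrls": ["https://pbs.twimg.com/media/abc.jpg"]},
                    ],
                },
            },
        },
    }

    files = []
    for component_id in data["components"]:
        com = data["component_objects"][component_id]
        for conv in com["data"]["conversation_preview"]:
            for url in conv.get("mediaUrls") or ():
                files.append({"url": url})
    print(files)  # [{'url': 'https://pbs.twimg.com/media/abc.jpg'}]
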
diff --git a/gallery_dl/extractor/vipergirls.py b/gallery_dl/extractor/vipergirls.py
index 5cde0d6..af3f32d 100644
--- a/gallery_dl/extractor/vipergirls.py
+++ b/gallery_dl/extractor/vipergirls.py
@@ -29,7 +29,17 @@ class VipergirlsExtractor(Extractor):
def _init(self):
domain = self.config("domain")
if domain:
- self.root = text.ensure_http_scheme(domain)
+ pos = domain.find("://")
+ if pos >= 0:
+ self.root = domain.rstrip("/")
+ self.cookies_domain = "." + domain[pos+1:].strip("/")
+ else:
+ domain = domain.strip("/")
+ self.root = "https://" + domain
+ self.cookies_domain = "." + domain
+ else:
+ self.root = "https://viper.click"
+ self.cookies_domain = ".viper.click"
def items(self):
self.login()
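
Both accepted forms of the 'domain' option resolve as follows under the logic above (illustrative sketch):

    def resolve(domain):
        pos = domain.find("://")
        if pos >= 0:
            return domain.rstrip("/"), "." + domain[pos+1:].strip("/")
        domain = domain.strip("/")
        return "https://" + domain, "." + domain

    print(resolve("viper.click"))           # ('https://viper.click', '.viper.click')
    print(resolve("https://viper.click/"))  # ('https://viper.click', '.viper.click')
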
diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py
index 1c0c172..a53409c 100644
--- a/gallery_dl/extractor/vsco.py
+++ b/gallery_dl/extractor/vsco.py
@@ -38,7 +38,7 @@ class VscoExtractor(Extractor):
if img["is_video"]:
if not videos:
continue
- url = "https://" + img["video_url"]
+ url = text.ensure_http_scheme(img["video_url"])
else:
base = img["responsive_url"].partition("/")[2]
cdn, _, path = base.partition("/")
@@ -63,6 +63,10 @@ class VscoExtractor(Extractor):
"height": img["height"],
"description": img.get("description") or "",
})
+ if data["extension"] == "m3u8":
+ url = "ytdl:" + url
+ data["_ytdl_manifest"] = "hls"
+ data["extension"] = "mp4"
yield Message.Url, url, data
def images(self):
@@ -294,12 +298,33 @@ class VscoImageExtractor(VscoExtractor):
pattern = USER_PATTERN + r"/media/([0-9a-fA-F]+)"
example = "https://vsco.co/USER/media/0123456789abcdef"
- def __init__(self, match):
- VscoExtractor.__init__(self, match)
- self.media_id = match.group(2)
-
def images(self):
- url = "{}/{}/media/{}".format(self.root, self.user, self.media_id)
+ url = "{}/{}/media/{}".format(self.root, self.user, self.groups[1])
data = self._extract_preload_state(url)
media = data["medias"]["byId"].popitem()[1]["media"]
return (self._transform_media(media),)
+
+
+class VscoVideoExtractor(VscoExtractor):
+ """Extractor for vsco.co videos links"""
+ subcategory = "video"
+ pattern = USER_PATTERN + r"/video/([^/?#]+)"
+ example = "https://vsco.co/USER/video/012345678-9abc-def0"
+
+ def images(self):
+ url = "{}/{}/video/{}".format(self.root, self.user, self.groups[1])
+ data = self._extract_preload_state(url)
+ media = data["medias"]["byId"].popitem()[1]["media"]
+
+ return ({
+ "_id" : media["id"],
+ "is_video" : True,
+ "grid_name" : "",
+ "upload_date" : media["createdDate"],
+ "responsive_url": media["posterUrl"],
+ "video_url" : "ytdl:" + media.get("playbackUrl"),
+ "image_meta" : None,
+ "width" : media["width"],
+ "height" : media["height"],
+ "description" : media["description"],
+ },)
diff --git a/gallery_dl/extractor/weebcentral.py b/gallery_dl/extractor/weebcentral.py
index fc1badb..cacefd6 100644
--- a/gallery_dl/extractor/weebcentral.py
+++ b/gallery_dl/extractor/weebcentral.py
@@ -50,14 +50,16 @@ class WeebcentralChapterExtractor(WeebcentralBase, ChapterExtractor):
def metadata(self, page):
extr = text.extract_from(page)
manga_id = extr("'series_id': '", "'")
-
- data = self._extract_manga_data(manga_id)
- data["chapter_id"] = self.groups[1]
- data["chapter_type"] = extr("'chapter_type': '", "'")
-
+ chapter_type = extr("'chapter_type': '", "'")
chapter, sep, minor = extr("'number': '", "'").partition(".")
- data["chapter"] = text.parse_int(chapter)
- data["chapter_minor"] = sep + minor
+
+ data = {
+ "chapter": text.parse_int(chapter),
+ "chapter_id": self.groups[1],
+ "chapter_type": chapter_type,
+ "chapter_minor": sep + minor,
+ }
+ data.update(self._extract_manga_data(manga_id))
return data
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
index 9885d79..3ed5a06 100644
--- a/gallery_dl/extractor/weibo.py
+++ b/gallery_dl/extractor/weibo.py
@@ -33,6 +33,7 @@ class WeiboExtractor(Extractor):
self.livephoto = self.config("livephoto", True)
self.retweets = self.config("retweets", False)
self.videos = self.config("videos", True)
+ self.movies = self.config("movies", False)
self.gifs = self.config("gifs", True)
self.gifs_video = (self.gifs == "video")
@@ -134,7 +135,10 @@ class WeiboExtractor(Extractor):
if "page_info" in status:
info = status["page_info"]
if "media_info" in info and self.videos:
- append(self._extract_video(info["media_info"]))
+ if info.get("type") != "5" or self.movies:
+ append(self._extract_video(info["media_info"]))
+ else:
+ self.log.debug("%s: Ignoring 'movie' video", status["id"])
def _extract_video(self, info):
try: