Diffstat (limited to 'gallery_dl')
45 files changed, 1611 insertions, 163 deletions
diff --git a/gallery_dl/aes.py b/gallery_dl/aes.py
index 891104a..6727541 100644
--- a/gallery_dl/aes.py
+++ b/gallery_dl/aes.py
@@ -14,6 +14,13 @@ except ImportError:
         from Crypto.Cipher import AES as Cryptodome_AES
     except ImportError:
         Cryptodome_AES = None
+except Exception as exc:
+    Cryptodome_AES = None
+    import logging
+    logging.getLogger("aes").warning(
+        "Error when trying to import 'Cryptodome' module (%s: %s)",
+        exc.__class__.__name__, exc)
+    del logging
 
 
 if Cryptodome_AES:
diff --git a/gallery_dl/archive.py b/gallery_dl/archive.py
index 5f05bbf..edecb10 100644
--- a/gallery_dl/archive.py
+++ b/gallery_dl/archive.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2024 Mike Fährmann
+# Copyright 2024-2025 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -9,50 +9,94 @@
 """Download Archives"""
 
 import os
-import sqlite3
-from . import formatter
+import logging
+from . import util, formatter
+
+log = logging.getLogger("archive")
+
+
+def connect(path, prefix, format,
+            table=None, mode=None, pragma=None, kwdict=None, cache_key=None):
+    keygen = formatter.parse(prefix + format).format_map
+
+    if isinstance(path, str) and path.startswith(
+            ("postgres://", "postgresql://")):
+        if mode == "memory":
+            cls = DownloadArchivePostgresqlMemory
+        else:
+            cls = DownloadArchivePostgresql
+    else:
+        path = util.expand_path(path)
+        if kwdict is not None and "{" in path:
+            path = formatter.parse(path).format_map(kwdict)
+        if mode == "memory":
+            cls = DownloadArchiveMemory
+        else:
+            cls = DownloadArchive
+
+    if kwdict is not None and table:
+        table = formatter.parse(table).format_map(kwdict)
+
+    return cls(path, keygen, table, pragma, cache_key)
+
+
+def sanitize(name):
+    return '"' + name.replace('"', "_") + '"'
 
 
 class DownloadArchive():
+    _sqlite3 = None
+
+    def __init__(self, path, keygen, table=None, pragma=None, cache_key=None):
+        if self._sqlite3 is None:
+            DownloadArchive._sqlite3 = __import__("sqlite3")
 
-    def __init__(self, path, format_string, pragma=None,
-                 cache_key="_archive_key"):
         try:
-            con = sqlite3.connect(path, timeout=60, check_same_thread=False)
-        except sqlite3.OperationalError:
+            con = self._sqlite3.connect(
+                path, timeout=60, check_same_thread=False)
+        except self._sqlite3.OperationalError:
             os.makedirs(os.path.dirname(path))
-            con = sqlite3.connect(path, timeout=60, check_same_thread=False)
+            con = self._sqlite3.connect(
+                path, timeout=60, check_same_thread=False)
         con.isolation_level = None
 
-        self.keygen = formatter.parse(format_string).format_map
+        self.keygen = keygen
         self.connection = con
         self.close = con.close
         self.cursor = cursor = con.cursor()
-        self._cache_key = cache_key
+        self._cache_key = cache_key or "_archive_key"
+
+        table = "archive" if table is None else sanitize(table)
+        self._stmt_select = (
+            "SELECT 1 "
+            "FROM " + table + " "
+            "WHERE entry=? "
+            "LIMIT 1")
+        self._stmt_insert = (
+            "INSERT OR IGNORE INTO " + table + " "
+            "(entry) VALUES (?)")
 
         if pragma:
             for stmt in pragma:
                 cursor.execute("PRAGMA " + stmt)
 
         try:
-            cursor.execute("CREATE TABLE IF NOT EXISTS archive "
+            cursor.execute("CREATE TABLE IF NOT EXISTS " + table + " "
                            "(entry TEXT PRIMARY KEY) WITHOUT ROWID")
-        except sqlite3.OperationalError:
+        except self._sqlite3.OperationalError:
             # fallback for missing WITHOUT ROWID support (#553)
-            cursor.execute("CREATE TABLE IF NOT EXISTS archive "
+            cursor.execute("CREATE TABLE IF NOT EXISTS " + table + " "
                            "(entry TEXT PRIMARY KEY)")
 
     def add(self, kwdict):
         """Add item described by 'kwdict' to archive"""
         key = kwdict.get(self._cache_key) or self.keygen(kwdict)
-        self.cursor.execute(
-            "INSERT OR IGNORE INTO archive (entry) VALUES (?)", (key,))
+        self.cursor.execute(self._stmt_insert, (key,))
 
     def check(self, kwdict):
         """Return True if the item described by 'kwdict' exists in archive"""
         key = kwdict[self._cache_key] = self.keygen(kwdict)
-        self.cursor.execute(
-            "SELECT 1 FROM archive WHERE entry=? LIMIT 1", (key,))
+        self.cursor.execute(self._stmt_select, (key,))
         return self.cursor.fetchone()
 
     def finalize(self):
@@ -61,9 +105,9 @@ class DownloadArchive():
 
 class DownloadArchiveMemory(DownloadArchive):
 
-    def __init__(self, path, format_string, pragma=None,
-                 cache_key="_archive_key"):
-        DownloadArchive.__init__(self, path, format_string, pragma, cache_key)
+    def __init__(self, path, keygen, table=None, pragma=None, cache_key=None):
+        DownloadArchive.__init__(
+            self, path, keygen, table, pragma, cache_key)
         self.keys = set()
 
     def add(self, kwdict):
@@ -75,8 +119,7 @@ class DownloadArchiveMemory(DownloadArchive):
         key = kwdict[self._cache_key] = self.keygen(kwdict)
         if key in self.keys:
             return True
-        self.cursor.execute(
-            "SELECT 1 FROM archive WHERE entry=? LIMIT 1", (key,))
+        self.cursor.execute(self._stmt_select, (key,))
         return self.cursor.fetchone()
 
     def finalize(self):
@@ -87,12 +130,110 @@ class DownloadArchiveMemory(DownloadArchive):
         with self.connection:
             try:
                 cursor.execute("BEGIN")
-            except sqlite3.OperationalError:
+            except self._sqlite3.OperationalError:
                 pass
-            stmt = "INSERT OR IGNORE INTO archive (entry) VALUES (?)"
+            stmt = self._stmt_insert
             if len(self.keys) < 100:
                 for key in self.keys:
                     cursor.execute(stmt, (key,))
             else:
                 cursor.executemany(stmt, ((key,) for key in self.keys))
+
+
+class DownloadArchivePostgresql():
+    _psycopg = None
+
+    def __init__(self, uri, keygen, table=None, pragma=None, cache_key=None):
+        if self._psycopg is None:
+            DownloadArchivePostgresql._psycopg = __import__("psycopg")
+
+        self.connection = con = self._psycopg.connect(uri)
+        self.cursor = cursor = con.cursor()
+        self.close = con.close
+        self.keygen = keygen
+        self._cache_key = cache_key or "_archive_key"
+
+        table = "archive" if table is None else sanitize(table)
+        self._stmt_select = (
+            "SELECT true "
+            "FROM " + table + " "
+            "WHERE entry=%s "
+            "LIMIT 1")
+        self._stmt_insert = (
+            "INSERT INTO " + table + " (entry) "
+            "VALUES (%s) "
+            "ON CONFLICT DO NOTHING")
+
+        try:
+            cursor.execute("CREATE TABLE IF NOT EXISTS " + table + " "
+                           "(entry TEXT PRIMARY KEY)")
+            con.commit()
+        except Exception as exc:
+            log.error("%s: %s when creating '%s' table: %s",
+                      con, exc.__class__.__name__, table, exc)
+            con.rollback()
+            raise
+
+    def add(self, kwdict):
+        key = kwdict.get(self._cache_key) or self.keygen(kwdict)
+        try:
+            self.cursor.execute(self._stmt_insert, (key,))
+            self.connection.commit()
+        except Exception as exc:
+            log.error("%s: %s when writing entry: %s",
+                      self.connection, exc.__class__.__name__, exc)
+            self.connection.rollback()
+
+    def check(self, kwdict):
+        key = kwdict[self._cache_key] = self.keygen(kwdict)
+        try:
+            self.cursor.execute(self._stmt_select, (key,))
+            return self.cursor.fetchone()
+        except Exception as exc:
+            log.error("%s: %s when checking entry: %s",
+                      self.connection, exc.__class__.__name__, exc)
+            self.connection.rollback()
+            return False
+
+    def finalize(self):
+        pass
+
+
+class DownloadArchivePostgresqlMemory(DownloadArchivePostgresql):
+
+    def __init__(self, path, keygen, table=None, pragma=None, cache_key=None):
+        DownloadArchivePostgresql.__init__(
+            self, path, keygen, table, pragma, cache_key)
+        self.keys = set()
+
+    def add(self, kwdict):
+        self.keys.add(
+            kwdict.get(self._cache_key) or
+            self.keygen(kwdict))
+
+    def check(self, kwdict):
+        key = kwdict[self._cache_key] = self.keygen(kwdict)
+        if key in self.keys:
+            return True
+        try:
+            self.cursor.execute(self._stmt_select, (key,))
+            return self.cursor.fetchone()
+        except Exception as exc:
+            log.error("%s: %s when checking entry: %s",
+                      self.connection, exc.__class__.__name__, exc)
+            self.connection.rollback()
+            return False
+
+    def finalize(self):
+        if not self.keys:
+            return
+        try:
+            self.cursor.executemany(
+                self._stmt_insert,
+                ((key,) for key in self.keys))
+            self.connection.commit()
+        except Exception as exc:
+            log.error("%s: %s when writing entries: %s",
+                      self.connection, exc.__class__.__name__, exc)
+            self.connection.rollback()
diff --git a/gallery_dl/downloader/common.py b/gallery_dl/downloader/common.py
index 1168d83..8430884 100644
--- a/gallery_dl/downloader/common.py
+++ b/gallery_dl/downloader/common.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2014-2022 Mike Fährmann
+# Copyright 2014-2025 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -10,6 +10,7 @@
 import os
 from .. import config, util
 
+_config = config._config
 
 
 class DownloaderBase():
@@ -17,8 +18,15 @@ class DownloaderBase():
     scheme = ""
 
     def __init__(self, job):
+        extractor = job.extractor
+
+        opts = self._extractor_config(extractor)
+        if opts:
+            self.opts = opts
+            self.config = self.config_opts
+
         self.out = job.out
-        self.session = job.extractor.session
+        self.session = extractor.session
         self.part = self.config("part", True)
         self.partdir = self.config("part-directory")
 
         self.log = job.get_logger("downloader." + self.scheme)
@@ -29,7 +37,7 @@ class DownloaderBase():
 
         proxies = self.config("proxy", util.SENTINEL)
         if proxies is util.SENTINEL:
-            self.proxies = job.extractor._proxies
+            self.proxies = extractor._proxies
         else:
             self.proxies = util.build_proxy_map(proxies, self.log)
 
@@ -37,5 +45,45 @@
         """Interpolate downloader config value for 'key'"""
         return config.interpolate(("downloader", self.scheme), key, default)
 
+    def config_opts(self, key, default=None, conf=_config):
+        if key in conf:
+            return conf[key]
+        value = self.opts.get(key, util.SENTINEL)
+        if value is not util.SENTINEL:
+            return value
+        return config.interpolate(("downloader", self.scheme), key, default)
+
+    def _extractor_config(self, extractor):
+        path = extractor._cfgpath
+        if not isinstance(path, list):
+            return self._extractor_opts(path[1], path[2])
+
+        opts = {}
+        for cat, sub in reversed(path):
+            popts = self._extractor_opts(cat, sub)
+            if popts:
+                opts.update(popts)
+        return opts
+
+    def _extractor_opts(self, category, subcategory):
+        cfg = config.get(("extractor",), category)
+        if not cfg:
+            return None
+
+        copts = cfg.get(self.scheme)
+        if copts:
+            if subcategory in cfg:
+                sopts = cfg[subcategory].get(self.scheme)
+                if sopts:
+                    opts = copts.copy()
+                    opts.update(sopts)
+                    return opts
+            return copts
+
+        if subcategory in cfg:
+            return cfg[subcategory].get(self.scheme)
+
+        return None
+
     def download(self, url, pathfmt):
         """Write data from 'url' into the file specified by 'pathfmt'"""
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
index c8aeef8..449ffe8 100644
--- a/gallery_dl/downloader/http.py
+++ b/gallery_dl/downloader/http.py
@@ -12,7 +12,7 @@
 import time
 import mimetypes
 from requests.exceptions import RequestException, ConnectionError, Timeout
 from .common import DownloaderBase
-from .. import text, util
+from .. import text, util, output
 from ssl import SSLError
 
@@ -38,6 +38,7 @@ class HttpDownloader(DownloaderBase):
         self.verify = self.config("verify", extractor._verify)
         self.mtime = self.config("mtime", True)
         self.rate = self.config("rate")
+        interval_429 = self.config("sleep-429")
 
         if not self.config("consume-content", False):
             # this resets the underlying TCP connection, and therefore
@@ -79,12 +80,16 @@ class HttpDownloader(DownloaderBase):
             self.receive = self._receive_rate
         if self.progress < 0.0:
             self.progress = 0.0
+        if interval_429 is None:
+            self.interval_429 = extractor._interval_429
+        else:
+            self.interval_429 = util.build_duration_func(interval_429)
 
     def download(self, url, pathfmt):
         try:
             return self._download_impl(url, pathfmt)
         except Exception:
-            print()
+            output.stderr_write("\n")
             raise
         finally:
             # remove file from incomplete downloads
@@ -93,7 +98,7 @@ class HttpDownloader(DownloaderBase):
 
     def _download_impl(self, url, pathfmt):
         response = None
-        tries = 0
+        tries = code = 0
         msg = ""
 
         metadata = self.metadata
@@ -111,10 +116,17 @@ class HttpDownloader(DownloaderBase):
                 if response:
                     self.release_conn(response)
                     response = None
+
                 self.log.warning("%s (%s/%s)", msg, tries, self.retries+1)
                 if tries > self.retries:
                     return False
-                time.sleep(tries)
+
+                if code == 429 and self.interval_429:
+                    s = self.interval_429()
+                    time.sleep(s if s > tries else tries)
+                else:
+                    time.sleep(tries)
+                code = 0
             tries += 1
 
             file_header = None
@@ -257,7 +269,7 @@ class HttpDownloader(DownloaderBase):
                     else response.iter_content(16), b"")
             except (RequestException, SSLError) as exc:
                 msg = str(exc)
-                print()
+                output.stderr_write("\n")
                 continue
             if self._adjust_extension(pathfmt, file_header) and \
                     pathfmt.exists():
@@ -291,14 +303,14 @@ class HttpDownloader(DownloaderBase):
                 self.receive(fp, content, size, offset)
             except (RequestException, SSLError) as exc:
                 msg = str(exc)
-                print()
+                output.stderr_write("\n")
                 continue
 
             # check file size
             if size and fp.tell() < size:
                 msg = "file size mismatch ({} < {})".format(
                     fp.tell(), size)
-                print()
+                output.stderr_write("\n")
                 continue
 
             break
@@ -317,7 +329,7 @@ class HttpDownloader(DownloaderBase):
             for _ in response.iter_content(self.chunk_size):
                 pass
         except (RequestException, SSLError) as exc:
-            print()
+            output.stderr_write("\n")
             self.log.debug(
                 "Unable to consume response body (%s: %s); "
                 "closing the connection anyway", exc.__class__.__name__, exc)
diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py
index 40cddec..1242098 100644
--- a/gallery_dl/downloader/ytdl.py
+++ b/gallery_dl/downloader/ytdl.py
@@ -48,6 +48,13 @@ class YoutubeDLDownloader(DownloaderBase):
                 self.log.debug("", exc_info=exc)
                 self.download = lambda u, p: False
                 return False
+
+            try:
+                ytdl_version = module.version.__version__
+            except Exception:
+                ytdl_version = ""
+            self.log.debug("Using %s version %s", module, ytdl_version)
+
             self.ytdl_instance = ytdl_instance = ytdl.construct_YoutubeDL(
                 module, self, self.ytdl_opts)
             if self.outtmpl == "default":
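The retry loop above now waits the larger of the configured "sleep-429"
duration and the linear backoff. A standalone sketch of that policy;
build_duration_func() is a simplified stand-in here (the real one lives in
gallery_dl.util):

    import random

    def build_duration_func(value):
        # stand-in: a 2-tuple means "pick uniformly from this range"
        if isinstance(value, (list, tuple)):
            lo, hi = value
            return lambda: random.uniform(lo, hi)
        return lambda: value

    interval_429 = build_duration_func(60.0)   # "sleep-429": 60.0

    def seconds_to_wait(tries, code):
        if code == 429 and interval_429:
            s = interval_429()
            return s if s > tries else tries
        return tries

    assert seconds_to_wait(1, 429) == 60.0   # rate-limited: long sleep
    assert seconds_to_wait(2, 500) == 2      # other errors: linear backoff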
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index fc8d7b2..00b22d4 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -44,6 +44,7 @@ modules = [
     "danbooru",
     "desktopography",
     "deviantart",
+    "discord",
     "dynastyscans",
     "e621",
     "erome",
@@ -56,6 +57,7 @@ modules = [
     "fapachi",
     "flickr",
     "furaffinity",
+    "furry34",
     "fuskator",
     "gelbooru",
     "gelbooru_v01",
@@ -80,6 +82,7 @@ modules = [
     "imgbox",
     "imgth",
     "imgur",
+    "imhentai",
     "inkbunny",
     "instagram",
     "issuu",
@@ -168,6 +171,7 @@ modules = [
     "tapas",
     "tcbscans",
     "telegraph",
+    "tiktok",
     "tmohentai",
     "toyhouse",
     "tsumino",
diff --git a/gallery_dl/extractor/bilibili.py b/gallery_dl/extractor/bilibili.py
index b9de165..597ec40 100644
--- a/gallery_dl/extractor/bilibili.py
+++ b/gallery_dl/extractor/bilibili.py
@@ -81,6 +81,27 @@ class BilibiliArticleExtractor(BilibiliExtractor):
             yield Message.Url, url, text.nameext_from_url(url, article)
 
 
+class BilibiliUserArticlesFavoriteExtractor(BilibiliExtractor):
+    subcategory = "user-articles-favorite"
+    pattern = (r"(?:https?://)?space\.bilibili\.com"
+               r"/(\d+)/favlist\?fid=opus")
+    example = "https://space.bilibili.com/12345/favlist?fid=opus"
+    _warning = True
+
+    def _init(self):
+        BilibiliExtractor._init(self)
+        if self._warning:
+            if not self.cookies_check(("SESSDATA",)):
+                self.log.error("'SESSDATA' cookie required")
+            BilibiliUserArticlesFavoriteExtractor._warning = False
+
+    def items(self):
+        for article in self.api.user_favlist():
+            article["_extractor"] = BilibiliArticleExtractor
+            url = "{}/opus/{}".format(self.root, article["opus_id"])
+            yield Message.Queue, url, article
+
+
 class BilibiliAPI():
     def __init__(self, extractor):
         self.extractor = extractor
@@ -122,3 +143,28 @@ class BilibiliAPI():
             raise exception.StopExtraction(
                 "%s: Unable to extract INITIAL_STATE data", article_id)
         self.extractor.wait(seconds=300)
+
+    def user_favlist(self):
+        endpoint = "/opus/feed/fav"
+        params = {"page": 1, "page_size": 20}
+
+        while True:
+            data = self._call(endpoint, params)["data"]
+
+            yield from data["items"]
+
+            if not data.get("has_more"):
+                break
+            params["page"] += 1
+
+    def login_user_id(self):
+        url = "https://api.bilibili.com/x/space/v2/myinfo"
+        data = self.extractor.request(url).json()
+
+        if data["code"] != 0:
+            self.extractor.log.debug("Server response: %s", data)
+            raise exception.StopExtraction("API request failed,Are you login?")
+        try:
+            return data["data"]["profile"]["mid"]
+        except Exception:
+            raise exception.StopExtraction("API request failed")
diff --git a/gallery_dl/extractor/boosty.py b/gallery_dl/extractor/boosty.py
index c28fad9..f3e441b 100644
--- a/gallery_dl/extractor/boosty.py
+++ b/gallery_dl/extractor/boosty.py
@@ -8,6 +8,7 @@
 
 from .common import Extractor, Message
 from .. import text, util, exception
+import itertools
 
 BASE_PATTERN = r"(?:https?://)?boosty\.to"
 
@@ -53,7 +54,9 @@ class BoostyExtractor(Extractor):
                     self.log.warning("Not allowed to access post %s",
                                      post["id"])
                     continue
 
-            files = self._process_post(post)
+            files = self._extract_files(post)
+            if self._user:
+                post["user"] = self._user
             data = {
                 "post" : post,
                 "user" : post.pop("user", None),
@@ -69,15 +72,13 @@ class BoostyExtractor(Extractor):
     def posts(self):
         """Yield JSON content of all relevant posts"""
 
-    def _process_post(self, post):
+    def _extract_files(self, post):
         files = []
         post["content"] = content = []
         post["links"] = links = []
         if "createdAt" in post:
             post["date"] = text.parse_timestamp(post["createdAt"])
-        if self._user:
-            post["user"] = self._user
 
         for block in post["data"]:
             try:
@@ -94,7 +95,7 @@ class BoostyExtractor(Extractor):
                 elif type == "ok_video":
                     if not self.videos:
                         self.log.debug("%s: Skipping video %s",
-                                       post["int_id"], block["id"])
+                                       post["id"], block["id"])
                         continue
                     fmts = {
                         fmt["type"]: fmt["url"]
@@ -114,7 +115,7 @@ class BoostyExtractor(Extractor):
                     else:
                         self.log.warning(
                             "%s: Found no suitable video format for %s",
-                            post["int_id"], block["id"])
+                            post["id"], block["id"])
 
                 elif type == "link":
                     url = block["url"]
@@ -127,9 +128,12 @@ class BoostyExtractor(Extractor):
                 elif type == "file":
                     files.append(self._update_url(post, block))
 
+                elif type == "smile":
+                    content.append(":" + block["name"] + ":")
+
                 else:
                     self.log.debug("%s: Unsupported data type '%s'",
-                                   post["int_id"], type)
+                                   post["id"], type)
 
             except Exception as exc:
                 self.log.debug("%s: %s", exc.__class__.__name__, exc)
@@ -219,6 +223,51 @@ class BoostyFollowingExtractor(BoostyExtractor):
             yield Message.Queue, url, user
 
 
+class BoostyDirectMessagesExtractor(BoostyExtractor):
+    """Extractor for boosty.to direct messages"""
+    subcategory = "direct-messages"
+    directory_fmt = ("{category}", "{user[blogUrl]} ({user[id]})",
+                     "Direct Messages")
+    pattern = BASE_PATTERN + r"/app/messages/?\?dialogId=(\d+)"
+    example = "https://boosty.to/app/messages?dialogId=12345"
+
+    def items(self):
+        """Yield direct messages from a given dialog ID."""
+        dialog_id = self.groups[0]
+        response = self.api.dialog(dialog_id)
+        signed_query = response.get("signedQuery")
+
+        try:
+            messages = response["messages"]["data"]
+            offset = messages[0]["id"]
+        except Exception:
+            return
+
+        try:
+            user = self.api.user(response["chatmate"]["url"])
+        except Exception:
+            user = None
+
+        messages.reverse()
+        for message in itertools.chain(
+                messages,
+                self.api.dialog_messages(dialog_id, offset=offset)):
+            message["signedQuery"] = signed_query
+            files = self._extract_files(message)
+            data = {
+                "post": message,
+                "user": user,
+                "count": len(files),
+            }
+
+            yield Message.Directory, data
+            for data["num"], file in enumerate(files, 1):
+                data["file"] = file
+                url = file["url"]
+                yield Message.Url, url, text.nameext_from_url(url, data)
+
+
 class BoostyAPI():
     """Interface for the Boosty API"""
     root = "https://api.boosty.to"
@@ -367,3 +416,32 @@ class BoostyAPI():
             if offset > data["total"]:
                 return
             params["offset"] = offset
+
+    def dialog(self, dialog_id):
+        endpoint = "/v1/dialog/{}".format(dialog_id)
+        return self._call(endpoint)
+
+    def dialog_messages(self, dialog_id, limit=300, offset=None):
+        endpoint = "/v1/dialog/{}/message/".format(dialog_id)
+        params = {
+            "limit": limit,
+            "reverse": "true",
+            "offset": offset,
+        }
+        return self._pagination_dialog(endpoint, params)
+
+    def _pagination_dialog(self, endpoint, params):
+        while True:
+            data = self._call(endpoint, params)
+
+            yield from data["data"]
+
+            try:
+                extra = data["extra"]
+                if extra.get("isLast"):
+                    break
+                params["offset"] = offset = extra["offset"]
+                if not offset:
+                    break
+            except Exception:
+                break
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 25e9fd5..201b8f4 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -10,7 +10,8 @@
 
 from .common import Extractor
 from .lolisafe import LolisafeAlbumExtractor
-from .. import text, config, exception
+from .. import text, util, config, exception
+import binascii
 import random
 
 if config.get(("extractor", "bunkr"), "tlds"):
@@ -60,6 +61,8 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
     """Extractor for bunkr.si albums"""
     category = "bunkr"
     root = "https://bunkr.si"
+    root_dl = "https://get.bunkrr.su"
+    archive_fmt = "{album_id}_{id|id_url}"
     pattern = BASE_PATTERN + r"/a/([^/?#]+)"
     example = "https://bunkr.si/a/ID"
 
@@ -68,6 +71,11 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
         domain = self.groups[0] or self.groups[1]
         if domain not in LEGACY_DOMAINS:
             self.root = "https://" + domain
+        self.offset = 0
+
+    def skip(self, num):
+        self.offset = num
+        return num
 
     def request(self, url, **kwargs):
         kwargs["encoding"] = "utf-8"
@@ -132,6 +140,9 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
         }
 
     def _extract_files(self, items):
+        if self.offset:
+            items = util.advance(items, self.offset)
+
         for item in items:
             try:
                 url = text.unescape(text.extr(item, ' href="', '"'))
@@ -154,26 +165,43 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
                 self.log.debug("", exc_info=exc)
 
     def _extract_file(self, webpage_url):
-        response = self.request(webpage_url)
-        page = response.text
-        file_url = (text.extr(page, '<source src="', '"') or
-                    text.extr(page, '<img src="', '"'))
+        page = self.request(webpage_url).text
+        data_id = text.extr(page, 'data-file-id="', '"')
+        referer = self.root_dl + "/file/" + data_id
+
+        url = self.root_dl + "/api/vs"
+        headers = {"Referer": referer}
+        data = self.request(
+            url, method="POST", headers=headers, json={"id": data_id}).json()
+
+        if data.get("encrypted"):
+            file_url = self._decrypt_url(data["url"], data["timestamp"])
+        else:
+            file_url = data["url"]
+
         file_name = (text.extr(page, 'property="og:title" content="', '"') or
                      text.extr(page, "<title>", " | Bunkr<"))
-
-        if not file_url:
-            webpage_url = text.unescape(text.rextract(
-                page, ' href="', '"', page.rindex("Download"))[0])
-            response = self.request(webpage_url)
-            file_url = text.rextract(response.text, ' href="', '"')[0]
+        fallback = text.extr(page, 'property="og:url" content="', '"')
 
         return {
-            "file" : text.unescape(file_url),
+            "file" : file_url,
             "name" : text.unescape(file_name),
-            "_http_headers" : {"Referer": response.url},
+            "id_url" : data_id,
+            "_fallback" : (fallback,) if fallback else (),
+            "_http_headers" : {"Referer": referer},
             "_http_validate": self._validate,
         }
 
+    def _decrypt_url(self, encrypted_b64, timestamp):
+        encrypted_bytes = binascii.a2b_base64(encrypted_b64)
+        key = "SECRET_KEY_{}".format(timestamp // 3600).encode()
+        div = len(key)
+
+        return bytes([
+            encrypted_bytes[i] ^ key[i % div]
+            for i in range(len(encrypted_bytes))
+        ]).decode()
+
     def _validate(self, response):
         if response.history and response.url.endswith("/maintenance-vid.mp4"):
             self.log.warning("File server in maintenance mode")
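The XOR scheme in _decrypt_url() above is symmetric, so a round-trip check
is easy to sketch (illustration only; the "SECRET_KEY_..." prefix and the
hour-based timestamp bucket come from the code above, the sample URL is
invented):

    import binascii
    import time

    def xor_transform(data, timestamp):
        key = "SECRET_KEY_{}".format(timestamp // 3600).encode()
        div = len(key)
        return bytes(b ^ key[i % div] for i, b in enumerate(data))

    ts = int(time.time())
    plain = b"https://example.org/file.mp4"
    encrypted_b64 = binascii.b2a_base64(xor_transform(plain, ts))

    # mirrors BunkrAlbumExtractor._decrypt_url()
    decrypted = xor_transform(binascii.a2b_base64(encrypted_b64), ts)
    assert decrypted == plain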
diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py
index aedcea4..de22a7b 100644
--- a/gallery_dl/extractor/chevereto.py
+++ b/gallery_dl/extractor/chevereto.py
@@ -57,7 +57,8 @@ class CheveretoImageExtractor(CheveretoExtractor):
 
         image = {
             "id"   : self.path.rpartition(".")[2],
-            "url"  : extr('<meta property="og:image" content="', '"'),
+            "url"  : (extr('<meta property="og:image" content="', '"') or
+                      extr('url: "', '"')),
             "album": text.extr(extr("Added to <a", "/a>"), ">", "<"),
             "user" : extr('username: "', '"'),
         }
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 13fd88a..d58db6f 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -915,7 +915,7 @@ def _build_requests_adapter(ssl_options, ssl_ciphers, source_address):
             options=ssl_options or None, ciphers=ssl_ciphers)
         if not requests.__version__ < "2.32":
             # https://github.com/psf/requests/pull/6731
-            ssl_context.load_default_certs()
+            ssl_context.load_verify_locations(requests.certs.where())
             ssl_context.check_hostname = False
     else:
         ssl_context = None
diff --git a/gallery_dl/extractor/discord.py b/gallery_dl/extractor/discord.py
new file mode 100644
index 0000000..6a5fcc9
--- /dev/null
+++ b/gallery_dl/extractor/discord.py
@@ -0,0 +1,399 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://discord.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+
+
+BASE_PATTERN = r"(?:https?://)?discord\.com"
+
+
+class DiscordExtractor(Extractor):
+    """Base class for Discord extractors"""
+    category = "discord"
+    root = "https://discord.com"
+    directory_fmt = ("{category}", "{server_id}_{server}",
+                     "{channel_id}_{channel}")
+    filename_fmt = "{message_id}_{num:>02}_{filename}.{extension}"
+    archive_fmt = "{message_id}_{num}"
+
+    cdn_fmt = "https://cdn.discordapp.com/{}/{}/{}.png?size=4096"
+
+    server_metadata = {}
+    server_channels_metadata = {}
+
+    def _init(self):
+        self.token = self.config("token")
+        self.enabled_embeds = self.config("embeds", ["image", "gifv", "video"])
+        self.enabled_threads = self.config("threads", True)
+        self.api = DiscordAPI(self)
+
+    def extract_message_text(self, message):
+        text_content = [message["content"]]
+
+        for embed in message["embeds"]:
+            if embed["type"] == "rich":
+                try:
+                    text_content.append(embed["author"]["name"])
+                except Exception:
+                    pass
+                text_content.append(embed.get("title", ""))
+                text_content.append(embed.get("description", ""))
+
+                for field in embed.get("fields", []):
+                    text_content.append(field.get("name", ""))
+                    text_content.append(field.get("value", ""))
+
+                text_content.append(embed.get("footer", {}).get("text", ""))
+
+        if message.get("poll"):
+            text_content.append(message["poll"]["question"]["text"])
+            for answer in message["poll"]["answers"]:
+                text_content.append(answer["poll_media"]["text"])
+
+        return "\n".join(t for t in text_content if t)
+
+    def extract_message(self, message):
+        # https://discord.com/developers/docs/resources/message#message-object-message-types
+        if message["type"] in (0, 19, 21):
+            message_metadata = {}
+            message_metadata.update(self.server_metadata)
+            message_metadata.update(
+                self.server_channels_metadata[message["channel_id"]])
+            message_metadata.update({
+                "author": message["author"]["username"],
+                "author_id": message["author"]["id"],
+                "author_files": [],
+                "message": self.extract_message_text(message),
+                "message_id": message["id"],
+                "date": text.parse_datetime(
+                    message["timestamp"], "%Y-%m-%dT%H:%M:%S.%f%z"
+                ),
+                "files": []
+            })
+
+            for icon_type, icon_path in (
+                ("avatar", "avatars"),
+                ("banner", "banners")
+            ):
+                if message["author"].get(icon_type):
+                    message_metadata["author_files"].append({
+                        "url": self.cdn_fmt.format(
+                            icon_path,
+                            message_metadata["author_id"],
+                            message["author"][icon_type]
+                        ),
+                        "filename": icon_type,
+                        "extension": "png",
+                    })
+
+            for attachment in message["attachments"]:
+                message_metadata["files"].append({
+                    "url": attachment["url"],
+                    "type": "attachment",
+                })
+
+            for embed in message["embeds"]:
+                if embed["type"] in self.enabled_embeds:
+                    for field in ("video", "image", "thumbnail"):
+                        if field not in embed:
+                            continue
+                        url = embed[field].get("proxy_url")
+                        if url is not None:
+                            message_metadata["files"].append({
+                                "url": url,
+                                "type": "embed",
+                            })
+                            break
+
+            for num, file in enumerate(message_metadata["files"], start=1):
+                text.nameext_from_url(file["url"], file)
+                file["num"] = num
+
+            yield Message.Directory, message_metadata
+
+            for file in message_metadata["files"]:
+                message_metadata_file = message_metadata.copy()
+                message_metadata_file.update(file)
+                yield Message.Url, file["url"], message_metadata_file
+
+    def extract_channel_text(self, channel_id):
+        for message in self.api.get_channel_messages(channel_id):
+            yield from self.extract_message(message)
+
+    def extract_channel_threads(self, channel_id):
+        for thread in self.api.get_channel_threads(channel_id):
+            id = self.parse_channel(thread)["channel_id"]
+            yield from self.extract_channel_text(id)
+
+    def extract_channel(self, channel_id, safe=False):
+        try:
+            if channel_id not in self.server_channels_metadata:
+                self.parse_channel(self.api.get_channel(channel_id))
+
+            channel_type = (
+                self.server_channels_metadata[channel_id]["channel_type"]
+            )
+
+            # https://discord.com/developers/docs/resources/channel#channel-object-channel-types
+            if channel_type in (0, 5):
+                yield from self.extract_channel_text(channel_id)
+                if self.enabled_threads:
+                    yield from self.extract_channel_threads(channel_id)
+            elif channel_type in (1, 3, 10, 11, 12):
+                yield from self.extract_channel_text(channel_id)
+            elif channel_type in (15, 16):
+                yield from self.extract_channel_threads(channel_id)
+            elif channel_type in (4,):
+                for channel in self.server_channels_metadata.copy().values():
+                    if channel["parent_id"] == channel_id:
+                        yield from self.extract_channel(
+                            channel["channel_id"], safe=True)
+            elif not safe:
+                raise exception.StopExtraction(
+                    "This channel type is not supported."
+                )
+        except exception.HttpError as exc:
+            if not (exc.status == 403 and safe):
+                raise
+
+    def parse_channel(self, channel):
+        parent_id = channel.get("parent_id")
+        channel_metadata = {
+            "channel": channel.get("name", ""),
+            "channel_id": channel.get("id"),
+            "channel_type": channel.get("type"),
+            "channel_topic": channel.get("topic", ""),
+            "parent_id": parent_id,
+            "is_thread": "thread_metadata" in channel
+        }
+
+        if parent_id in self.server_channels_metadata:
+            parent_metadata = self.server_channels_metadata[parent_id]
+            channel_metadata.update({
+                "parent": parent_metadata["channel"],
+                "parent_type": parent_metadata["channel_type"]
+            })
+
+        if channel_metadata["channel_type"] in (1, 3):
+            channel_metadata.update({
+                "channel": "DMs",
+                "recipients": (
+                    [user["username"] for user in channel["recipients"]]
+                ),
+                "recipients_id": (
+                    [user["id"] for user in channel["recipients"]]
+                )
+            })
+
+        channel_id = channel_metadata["channel_id"]
+
+        self.server_channels_metadata[channel_id] = channel_metadata
+        return channel_metadata
+
+    def parse_server(self, server):
+        self.server_metadata = {
+            "server": server["name"],
+            "server_id": server["id"],
+            "server_files": [],
+            "owner_id": server["owner_id"]
+        }
+
+        for icon_type, icon_path in (
+            ("icon", "icons"),
+            ("banner", "banners"),
+            ("splash", "splashes"),
+            ("discovery_splash", "discovery-splashes")
+        ):
+            if server.get(icon_type):
+                self.server_metadata["server_files"].append({
+                    "url": self.cdn_fmt.format(
+                        icon_path,
+                        self.server_metadata["server_id"],
+                        server[icon_type]
+                    ),
+                    "filename": icon_type,
+                    "extension": "png",
+                })
+
+        return self.server_metadata
+
+    def build_server_and_channels(self, server_id):
+        server = self.api.get_server(server_id)
+        self.parse_server(server)
+
+        for channel in self.api.get_server_channels(server_id):
+            self.parse_channel(channel)
+
+
+class DiscordChannelExtractor(DiscordExtractor):
+    subcategory = "channel"
+    pattern = BASE_PATTERN + r"/channels/(\d+)/(?:\d+/threads/)?(\d+)/?$"
+    example = "https://discord.com/channels/1234567890/9876543210"
+
+    def items(self):
+        server_id, channel_id = self.groups
+
+        self.build_server_and_channels(server_id)
+
+        return self.extract_channel(channel_id)
+
+
+class DiscordMessageExtractor(DiscordExtractor):
+    subcategory = "message"
+    pattern = BASE_PATTERN + r"/channels/(\d+)/(\d+)/(\d+)/?$"
+    example = "https://discord.com/channels/1234567890/9876543210/2468013579"
+
+    def items(self):
+        server_id, channel_id, message_id = self.groups
+
+        self.build_server_and_channels(server_id)
+
+        if channel_id not in self.server_channels_metadata:
+            self.parse_channel(self.api.get_channel(channel_id))
+
+        return self.extract_message(
+            self.api.get_message(channel_id, message_id))
+
+
+class DiscordServerExtractor(DiscordExtractor):
+    subcategory = "server"
+    pattern = BASE_PATTERN + r"/channels/(\d+)/?$"
+    example = "https://discord.com/channels/1234567890"
+
+    def items(self):
+        server_id = self.groups[0]
+
+        self.build_server_and_channels(server_id)
+
+        for channel in self.server_channels_metadata.copy().values():
+            if channel["channel_type"] in (0, 5, 15, 16):
+                yield from self.extract_channel(
+                    channel["channel_id"], safe=True)
+
+
+class DiscordDirectMessagesExtractor(DiscordExtractor):
+    subcategory = "direct-messages"
+    directory_fmt = ("{category}", "Direct Messages",
+                     "{channel_id}_{recipients:J,}")
+    pattern = BASE_PATTERN + r"/channels/@me/(\d+)/?$"
+    example = "https://discord.com/channels/@me/1234567890"
+
+    def items(self):
+        return self.extract_channel(self.groups[0])
+
+
+class DiscordDirectMessageExtractor(DiscordExtractor):
+    subcategory = "direct-message"
+    directory_fmt = ("{category}", "Direct Messages",
+                     "{channel_id}_{recipients:J,}")
+    pattern = BASE_PATTERN + r"/channels/@me/(\d+)/(\d+)/?$"
+    example = "https://discord.com/channels/@me/1234567890/9876543210"
+
+    def items(self):
+        channel_id, message_id = self.groups
+
+        self.parse_channel(self.api.get_channel(channel_id))
+
+        return self.extract_message(
+            self.api.get_message(channel_id, message_id))
+
+
+class DiscordAPI():
+    """Interface for the Discord API v10
+
+    https://discord.com/developers/docs/reference
+    """
+
+    def __init__(self, extractor):
+        self.extractor = extractor
+        self.root = extractor.root + "/api/v10"
+        self.headers = {"Authorization": extractor.token}
+
+    def get_server(self, server_id):
+        """Get server information"""
+        return self._call("/guilds/" + server_id)
+
+    def get_server_channels(self, server_id):
+        """Get server channels"""
+        return self._call("/guilds/" + server_id + "/channels")
+
+    def get_channel(self, channel_id):
+        """Get channel information"""
+        return self._call("/channels/" + channel_id)
+
+    def get_channel_threads(self, channel_id):
+        """Get channel threads"""
+        THREADS_BATCH = 25
+
+        def _method(offset):
+            return self._call("/channels/" + channel_id + "/threads/search", {
+                "sort_by": "last_message_time",
+                "sort_order": "desc",
+                "limit": THREADS_BATCH,
+                "offset": offset,
+            })["threads"]
+
+        return self._pagination(_method, THREADS_BATCH)
+
+    def get_channel_messages(self, channel_id):
+        """Get channel messages"""
+        MESSAGES_BATCH = 100
+
+        before = None
+
+        def _method(_):
+            nonlocal before
+            messages = self._call("/channels/" + channel_id + "/messages", {
+                "limit": MESSAGES_BATCH,
+                "before": before
+            })
+            before = messages[-1]["id"]
+            return messages
+
+        return self._pagination(_method, MESSAGES_BATCH)
+
+    def get_message(self, channel_id, message_id):
+        """Get message information"""
+        return self._call("/channels/" + channel_id + "/messages", {
+            "limit": 1,
+            "around": message_id
+        })[0]
+
+    def _call(self, endpoint, params=None):
+        url = self.root + endpoint
+        try:
+            response = self.extractor.request(
+                url, params=params, headers=self.headers)
+        except exception.HttpError as exc:
+            if exc.status == 401:
+                self._raise_invalid_token()
+            raise
+        return response.json()
+
+    def _pagination(self, method, batch):
+        offset = 0
+        while True:
+            data = method(offset)
+            yield from data
+            if len(data) < batch:
+                return
+            offset += len(data)
+
+    @staticmethod
+    def _raise_invalid_token():
+        raise exception.AuthenticationError("""Invalid or missing token.
+Please provide a valid token following these instructions:
+
+1) Open Discord in your browser (https://discord.com/app);
+2) Open your browser's Developer Tools (F12) and switch to the Network panel;
+3) Reload the page and select any request going to https://discord.com/api/...;
+4) In the "Headers" tab, look for an entry beginning with "Authorization: ";
+5) Right-click the entry and click "Copy Value";
+6) Paste the token in your configuration file under "extractor.discord.token",
+or run this command with the -o "token=[your token]" argument.""")
diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py
index e6d136f..55549de 100644
--- a/gallery_dl/extractor/erome.py
+++ b/gallery_dl/extractor/erome.py
@@ -44,6 +44,8 @@ class EromeExtractor(Extractor):
             pos = page.index('<div class="user-profile', pos)
             user, pos = text.extract(
                 page, 'href="https://www.erome.com/', '"', pos)
+            tags, pos = text.extract(
+                page, '<p class="mt-10"', '</p>', pos)
 
             urls = []
             date = None
@@ -59,11 +61,13 @@ class EromeExtractor(Extractor):
                 date = text.parse_timestamp(ts)
 
             data = {
-                "album_id" : album_id,
-                "title"    : text.unescape(title),
-                "user"     : text.unquote(user),
-                "count"    : len(urls),
-                "date"     : date,
+                "album_id": album_id,
+                "title"   : text.unescape(title),
+                "user"    : text.unquote(user),
+                "count"   : len(urls),
+                "date"    : date,
+                "tags"    : [t.replace("+", " ")
+                             for t in text.extract_iter(tags, "?q=", '"')],
                 "_http_headers": {"Referer": url},
             }
diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py
index 44c4542..5f90afc 100644
--- a/gallery_dl/extractor/foolfuuka.py
+++ b/gallery_dl/extractor/foolfuuka.py
@@ -81,8 +81,8 @@ BASE_PATTERN = FoolfuukaExtractor.update({
         "pattern": r"(?:www\.)?archiveofsins\.com",
     },
     "b4k": {
-        "root": "https://arch.b4k.co",
-        "pattern": r"arch\.b4k\.co",
+        "root": "https://arch.b4k.dev",
+        "pattern": r"arch\.b4k\.(?:dev|co)",
     },
     "desuarchive": {
         "root": "https://desuarchive.org",
diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py
index d253582..1466390 100644
--- a/gallery_dl/extractor/furaffinity.py
+++ b/gallery_dl/extractor/furaffinity.py
@@ -23,6 +23,7 @@ class FuraffinityExtractor(Extractor):
     cookies_domain = ".furaffinity.net"
     cookies_names = ("a", "b")
     root = "https://www.furaffinity.net"
+    request_interval = 1.0
     _warning = True
 
     def __init__(self, match):
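DiscordAPI._pagination() above is a plain offset cursor: keep requesting
batches until a short batch signals the end. The same logic in isolation,
fed by a stub instead of real API calls (fake data, not from the repo):

    def pagination(method, batch):
        offset = 0
        while True:
            data = method(offset)
            yield from data
            if len(data) < batch:
                return
            offset += len(data)

    items = list(range(55))                      # fake thread list
    fetch = lambda offset: items[offset:offset + 25]
    assert list(pagination(fetch, 25)) == items  # fetched as 25 + 25 + 5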
diff --git a/gallery_dl/extractor/furry34.py b/gallery_dl/extractor/furry34.py
new file mode 100644
index 0000000..e0c7fdb
--- /dev/null
+++ b/gallery_dl/extractor/furry34.py
@@ -0,0 +1,156 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://furry34.com/"""
+
+from .booru import BooruExtractor
+from .. import text
+import collections
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?furry34\.com"
+
+
+class Furry34Extractor(BooruExtractor):
+    category = "furry34"
+    root = "https://furry34.com"
+    root_cdn = "https://furry34com.b-cdn.net"
+    filename_fmt = "{category}_{id}.{extension}"
+    per_page = 30
+
+    TAG_TYPES = {
+        None: "general",
+        1   : "general",
+        2   : "copyright",
+        4   : "character",
+        8   : "artist",
+    }
+    FORMATS = (
+        ("100", "mov.mp4"),
+        ("101", "mov720.mp4"),
+        ("102", "mov480.mp4"),
+        ("10" , "pic.jpg"),
+    )
+
+    def _file_url(self, post):
+        files = post["files"]
+        for fmt, extension in self.FORMATS:
+            if fmt in files:
+                break
+        else:
+            fmt = next(iter(files))
+
+        post_id = post["id"]
+        root = self.root_cdn if files[fmt][0] else self.root
+        post["file_url"] = url = "{}/posts/{}/{}/{}.{}".format(
+            root, post_id // 1000, post_id, post_id, extension)
+        post["format_id"] = fmt
+        post["format"] = extension.partition(".")[0]
+
+        return url
+
+    def _prepare(self, post):
+        post.pop("files", None)
+        post["date"] = text.parse_datetime(
+            post["created"], "%Y-%m-%dT%H:%M:%S.%fZ")
+        post["filename"], _, post["format"] = post["filename"].rpartition(".")
+        if "tags" in post:
+            post["tags"] = [t["value"] for t in post["tags"]]
+
+    def _tags(self, post, _):
+        if "tags" not in post:
+            post.update(self._fetch_post(post["id"]))
+
+        tags = collections.defaultdict(list)
+        for tag in post["tags"]:
+            tags[tag["type"] or 1].append(tag["value"])
+        types = self.TAG_TYPES
+        for type, values in tags.items():
+            post["tags_" + types[type]] = values
+
+    def _fetch_post(self, post_id):
+        url = "{}/api/v2/post/{}".format(self.root, post_id)
+        return self.request(url).json()
+
+    def _pagination(self, endpoint, params=None):
+        url = "{}/api{}".format(self.root, endpoint)
+
+        if params is None:
+            params = {}
+        params["sortBy"] = 0
+        params["take"] = self.per_page
+        threshold = self.per_page
+
+        while True:
+            data = self.request(url, method="POST", json=params).json()
+
+            yield from data["items"]
+
+            if len(data["items"]) < threshold:
+                return
+            params["cursor"] = data.get("cursor")
+
+
+class Furry34PostExtractor(Furry34Extractor):
+    subcategory = "post"
+    archive_fmt = "{id}"
+    pattern = BASE_PATTERN + r"/post/(\d+)"
+    example = "https://furry34.com/post/12345"
+
+    def posts(self):
+        return (self._fetch_post(self.groups[0]),)
+
+
+class Furry34PlaylistExtractor(Furry34Extractor):
+    subcategory = "playlist"
+    directory_fmt = ("{category}", "{playlist_id}")
+    archive_fmt = "p_{playlist_id}_{id}"
+    pattern = BASE_PATTERN + r"/playlists/view/(\d+)"
+    example = "https://furry34.com/playlists/view/12345"
+
+    def metadata(self):
+        return {"playlist_id": self.groups[0]}
+
+    def posts(self):
+        endpoint = "/v2/post/search/playlist/" + self.groups[0]
+        return self._pagination(endpoint)
+
+
+class Furry34TagExtractor(Furry34Extractor):
+    subcategory = "tag"
+    directory_fmt = ("{category}", "{search_tags}")
+    archive_fmt = "t_{search_tags}_{id}"
+    pattern = BASE_PATTERN + r"/(?:([^/?#]+))?(?:/?\?([^#]+))?(?:$|#)"
+    example = "https://furry34.com/TAG"
+
+    def _init(self):
+        tag, query = self.groups
+        params = text.parse_query(query)
+
+        self.tags = tags = []
+        if tag:
+            tags.extend(text.unquote(text.unquote(tag)).split("|"))
+        if "tags" in params:
+            tags.extend(params["tags"].split("|"))
+
+        type = params.get("type")
+        if type == "video":
+            self.type = 1
+        elif type == "image":
+            self.type = 0
+        else:
+            self.type = None
+
+    def metadata(self):
+        return {"search_tags": " ".join(self.tags)}
+
+    def posts(self):
+        endpoint = "/v2/post/search/root"
+        params = {"includeTags": [t.replace("_", " ") for t in self.tags]}
+        if self.type is not None:
+            params["type"] = self.type
+        return self._pagination(endpoint, params)
diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py
index 370cd43..4b04732 100644
--- a/gallery_dl/extractor/generic.py
+++ b/gallery_dl/extractor/generic.py
@@ -37,6 +37,7 @@ class GenericExtractor(Extractor):
     example = "generic:https://www.nongnu.org/lzip/"
 
     def __init__(self, match):
+        self.subcategory = match.group('domain')
         Extractor.__init__(self, match)
 
         # Strip the "g(eneric):" prefix
@@ -54,7 +55,6 @@ class GenericExtractor(Extractor):
             self.scheme = 'https://'
             self.url = text.ensure_http_scheme(self.url, self.scheme)
 
-        self.subcategory = match.group('domain')
         self.path = match.group('path')
 
         # Used to resolve relative image urls
diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py
index 481fb1e..20f8ea4 100644
--- a/gallery_dl/extractor/imgur.py
+++ b/gallery_dl/extractor/imgur.py
@@ -142,7 +142,8 @@ class ImgurGalleryExtractor(ImgurExtractor):
 class ImgurUserExtractor(ImgurExtractor):
     """Extractor for all images posted by a user"""
     subcategory = "user"
-    pattern = BASE_PATTERN + r"/user/([^/?#]+)(?:/posts|/submitted)?/?$"
+    pattern = (BASE_PATTERN + r"/user/(?!me(?:/|$|\?|#))"
+               r"([^/?#]+)(?:/posts|/submitted)?/?$")
     example = "https://imgur.com/user/USER"
 
     def items(self):
@@ -174,6 +175,23 @@ class ImgurFavoriteFolderExtractor(ImgurExtractor):
             self.key, self.folder_id))
 
 
+class ImgurMeExtractor(ImgurExtractor):
+    """Extractor for your personal uploads"""
+    subcategory = "me"
+    pattern = BASE_PATTERN + r"/user/me(?:/posts)?(/hidden)?"
+    example = "https://imgur.com/user/me"
+
+    def items(self):
+        if not self.cookies_check(("accesstoken",)):
+            self.log.error("'accesstoken' cookie required")
+
+        if self.groups[0]:
+            posts = self.api.accounts_me_hiddenalbums()
+        else:
+            posts = self.api.accounts_me_allposts()
+        return self._items_queue(posts)
+
+
 class ImgurSubredditExtractor(ImgurExtractor):
     """Extractor for a subreddits's imgur links"""
     subcategory = "subreddit"
@@ -215,6 +233,10 @@ class ImgurAPI():
         self.client_id = extractor.config("client-id") or "546c25a59c58ad7"
         self.headers = {"Authorization": "Client-ID " + self.client_id}
 
+    def account_submissions(self, account):
+        endpoint = "/3/account/{}/submissions".format(account)
+        return self._pagination(endpoint)
+
     def account_favorites(self, account):
         endpoint = "/3/account/{}/gallery_favorites".format(account)
         return self._pagination(endpoint)
@@ -224,15 +246,29 @@ class ImgurAPI():
             account, folder_id)
         return self._pagination_v2(endpoint)
 
+    def accounts_me_allposts(self):
+        endpoint = "/post/v1/accounts/me/all_posts"
+        params = {
+            "include": "media,tags,account",
+            "page"   : 1,
+            "sort"   : "-created_at",
+        }
+        return self._pagination_v2(endpoint, params)
+
+    def accounts_me_hiddenalbums(self):
+        endpoint = "/post/v1/accounts/me/hidden_albums"
+        params = {
+            "include": "media,tags,account",
+            "page"   : 1,
+            "sort"   : "-created_at",
+        }
+        return self._pagination_v2(endpoint, params)
+
     def gallery_search(self, query):
         endpoint = "/3/gallery/search"
         params = {"q": query}
         return self._pagination(endpoint, params)
 
-    def account_submissions(self, account):
-        endpoint = "/3/account/{}/submissions".format(account)
-        return self._pagination(endpoint)
-
     def gallery_subreddit(self, subreddit):
         endpoint = "/3/gallery/r/{}".format(subreddit)
         return self._pagination(endpoint)
@@ -284,12 +320,16 @@ class ImgurAPI():
         if params is None:
             params = {}
         params["client_id"] = self.client_id
-        params["page"] = 0
-        params["sort"] = "newest"
+        if "page" not in params:
+            params["page"] = 0
+        if "sort" not in params:
+            params["sort"] = "newest"
         headers = {"Origin": "https://imgur.com"}
 
         while True:
-            data = self._call(endpoint, params, headers)["data"]
+            data = self._call(endpoint, params, headers)
+            if "data" in data:
+                data = data["data"]
             if not data:
                 return
             yield from data
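A small sketch (not repository code; response shapes invented) of the
pagination rules after the ImgurAPI change above — preset "page"/"sort"
values now survive, and the "data" envelope is unwrapped only when present:

    def paginate(pages, params):
        if "page" not in params:        # was: params["page"] = 0
            params["page"] = 0
        if "sort" not in params:        # was: params["sort"] = "newest"
            params["sort"] = "newest"
        for response in pages:
            data = response["data"] if "data" in response else response
            if not data:
                return
            yield from data

    v3 = [{"data": [1, 2]}, {"data": []}]   # /3/... envelope responses
    v2 = [[3, 4], []]                       # /post/v1/... bare lists
    assert list(paginate(v3, {})) == [1, 2]
    assert list(paginate(v2, {"page": 1, "sort": "-created_at"})) == [3, 4]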
diff --git a/gallery_dl/extractor/imhentai.py b/gallery_dl/extractor/imhentai.py
new file mode 100644
index 0000000..0439f5b
--- /dev/null
+++ b/gallery_dl/extractor/imhentai.py
@@ -0,0 +1,140 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://imhentai.xxx/ and mirror sites"""
+
+from .common import GalleryExtractor, BaseExtractor, Message
+from .. import text, util
+
+
+class ImhentaiExtractor(BaseExtractor):
+    basecategory = "IMHentai"
+
+    def _pagination(self, url):
+        prev = None
+        base = self.root + "/gallery/"
+        data = {"_extractor": ImhentaiGalleryExtractor}
+
+        while True:
+            page = self.request(url).text
+            extr = text.extract_from(page)
+
+            while True:
+                gallery_id = extr('<a href="/gallery/', '"')
+                if gallery_id == prev:
+                    continue
+                if not gallery_id:
+                    break
+                yield Message.Queue, base + gallery_id, data
+                prev = gallery_id
+
+            href = text.rextract(page, "class='page-link' href='", "'")[0]
+            if not href or href == "#":
+                return
+            if href[0] == "/":
+                if href[1] == "/":
+                    href = "https:" + href
+                else:
+                    href = self.root + href
+            url = href
+
+
+BASE_PATTERN = ImhentaiExtractor.update({
+    "imhentai": {
+        "root": "https://imhentai.xxx",
+        "pattern": r"(?:www\.)?imhentai\.xxx",
+    },
+    "hentaiera": {
+        "root": "https://hentaiera.com",
+        "pattern": r"(?:www\.)?hentaiera\.com",
+    },
+    "hentairox": {
+        "root": "https://hentairox.com",
+        "pattern": r"(?:www\.)?hentairox\.com",
+    },
+})
+
+
+class ImhentaiGalleryExtractor(ImhentaiExtractor, GalleryExtractor):
+    """Extractor for imhentai galleries"""
+    pattern = BASE_PATTERN + r"/(?:gallery|view)/(\d+)"
+    example = "https://imhentai.xxx/gallery/12345/"
+
+    def __init__(self, match):
+        ImhentaiExtractor.__init__(self, match)
+        self.gallery_id = self.groups[-1]
+        self.gallery_url = "{}/gallery/{}/".format(self.root, self.gallery_id)
+
+    def metadata(self, page):
+        extr = text.extract_from(page)
+
+        data = {
+            "gallery_id": text.parse_int(self.gallery_id),
+            "title"     : text.unescape(extr("<h1>", "<")),
+            "title_alt" : text.unescape(extr('class="subtitle">', "<")),
+            "parody"    : self._split(extr(">Parodies", "</li>")),
+            "character" : self._split(extr(">Characters", "</li>")),
+            "tags"      : self._split(extr(">Tags", "</li>")),
+            "artist"    : self._split(extr(">Artists", "</li>")),
+            "group"     : self._split(extr(">Groups", "</li>")),
+            "language"  : self._split(extr(">Languages", "</li>")),
+            "type"      : extr("href='/category/", "/"),
+        }
+
+        if data["language"]:
+            data["lang"] = util.language_to_code(data["language"][0])
+
+        return data
+
+    def _split(self, html):
+        results = []
+        for tag in text.extract_iter(html, ">", "</a>"):
+            tag = tag.partition(" <span class='badge'>")[0]
+            if "<" in tag:
+                tag = text.remove_html(tag)
+            results.append(tag)
+        return results
+
+    def images(self, page):
+        data = util.json_loads(text.extr(page, "$.parseJSON('", "'"))
+        base = text.extr(page, 'data-src="', '"').rpartition("/")[0] + "/"
+        exts = {"j": "jpg", "p": "png", "g": "gif", "w": "webp", "a": "avif"}
+
+        results = []
+        for i in map(str, range(1, len(data)+1)):
+            ext, width, height = data[i].split(",")
+            url = base + i + "." + exts[ext]
+            results.append((url, {
+                "width" : text.parse_int(width),
+                "height": text.parse_int(height),
+            }))
+        return results
+
+
+class ImhentaiTagExtractor(ImhentaiExtractor):
+    """Extractor for imhentai tag searches"""
+    subcategory = "tag"
+    pattern = (BASE_PATTERN + r"(/(?:"
+               r"artist|category|character|group|language|parody|tag"
+               r")/([^/?#]+))")
+    example = "https://imhentai.xxx/tag/TAG/"
+
+    def items(self):
+        url = self.root + self.groups[-2] + "/"
+        return self._pagination(url)
+
+
+class ImhentaiSearchExtractor(ImhentaiExtractor):
+    """Extractor for imhentai search results"""
+    subcategory = "search"
+    pattern = BASE_PATTERN + r"/search/?\?([^#]+)"
+    example = "https://imhentai.xxx/search/?key=QUERY"
+
+    def items(self):
+        url = self.root + "/search/?" + self.groups[-1]
+        return self._pagination(url)
diff --git a/gallery_dl/extractor/issuu.py b/gallery_dl/extractor/issuu.py
index b900113..65717b4 100644
--- a/gallery_dl/extractor/issuu.py
+++ b/gallery_dl/extractor/issuu.py
@@ -30,8 +30,8 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor):
 
     def metadata(self, page):
         pos = page.rindex('id="initial-data"')
-        data = util.json_loads(text.rextract(
-            page, '<script data-json="', '"', pos)[0].replace("&quot;", '"'))
+        data = util.json_loads(text.unescape(text.rextract(
+            page, '<script data-json="', '"', pos)[0]))
 
         doc = data["initialDocumentData"]["document"]
         doc["date"] = text.parse_datetime(
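For reference, the per-page map decoded by images() above looks like
{"1": "j,1280,1810", ...}: a 1-based page number mapping to an
"ext,width,height" string. A toy decoding pass with invented sample values:

    exts = {"j": "jpg", "p": "png", "g": "gif", "w": "webp", "a": "avif"}
    data = {"1": "j,1280,1810", "2": "w,1280,1810"}
    base = "https://cdn.example/12345/"   # really derived from data-src

    for i in map(str, range(1, len(data) + 1)):
        ext, width, height = data[i].split(",")
        print(base + i + "." + exts[ext], width + "x" + height)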
example = "https://itaku.ee/profile/USER/gallery" def posts(self): - return self.api.galleries_images(self.item) + return self.api.galleries_images(*self.groups) class ItakuImageExtractor(ItakuExtractor): @@ -75,7 +71,7 @@ class ItakuImageExtractor(ItakuExtractor): example = "https://itaku.ee/images/12345" def posts(self): - return (self.api.image(self.item),) + return (self.api.image(self.groups[0]),) class ItakuSearchExtractor(ItakuExtractor): @@ -84,7 +80,7 @@ class ItakuSearchExtractor(ItakuExtractor): example = "https://itaku.ee/home/images?tags=SEARCH" def posts(self): - params = text.parse_query_list(self.item) + params = text.parse_query_list(self.groups[0]) return self.api.search_images(params) @@ -138,7 +134,7 @@ class ItakuAPI(): params = { "cursor" : None, "owner" : self.user(username)["owner"], - "section" : section, + "sections" : section, "date_range": "", "maturity_rating": ("SFW", "Questionable", "NSFW"), "ordering" : "-date_added", diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 8ffa14b..648f7df 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -190,8 +190,8 @@ class NewgroundsExtractor(Extractor): extr = text.extract_from(page) data = extract_data(extr, post_url) - data["_comment"] = extr( - 'id="author_comments"', '</div>').partition(">")[2] + data["comment_html"] = data["_comment"] = extr( + 'id="author_comments"', '</div>').partition(">")[2].strip() data["comment"] = text.unescape(text.remove_html( data["_comment"] .replace("<p><br></p>", "\n\n").replace("<br>", "\n"), "", "")) diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index e7540f8..815a214 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -83,8 +83,9 @@ class OAuthBase(Extractor): browser = None if browser and browser.open(url): - name = getattr(browser, "name", None) or "Browser" - self.log.info("Opening URL in %s:", name.capitalize()) + name = getattr(browser, "name", None) + if name: + self.log.info("Opening URL with %s:", name.capitalize()) else: self.log.info("Please open this URL in your browser:") diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index 866e93a..f5a33d5 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -169,6 +169,12 @@ class PatreonExtractor(Extractor): attr["date"] = text.parse_datetime( attr["published_at"], "%Y-%m-%dT%H:%M:%S.%f%z") + try: + attr["campaign"] = (included["campaign"][ + relationships["campaign"]["data"]["id"]]) + except Exception: + attr["campaign"] = None + tags = relationships.get("user_defined_tags") attr["tags"] = [ tag["id"].replace("user_defined;", "") @@ -324,7 +330,8 @@ class PatreonCreatorExtractor(PatreonExtractor): subcategory = "creator" pattern = (r"(?:https?://)?(?:www\.)?patreon\.com" r"/(?!(?:home|join|posts|login|signup)(?:$|[/?#]))" - r"(?:c/)?([^/?#]+)(?:/posts)?/?(?:\?([^#]+))?") + r"(?:profile/creators|(?:c/)?([^/?#]+)(?:/posts)?)" + r"/?(?:\?([^#]+))?") example = "https://www.patreon.com/USER" def posts(self): @@ -345,7 +352,7 @@ class PatreonCreatorExtractor(PatreonExtractor): return self._pagination(url) def _get_campaign_id(self, creator, query): - if creator.startswith("id:"): + if creator and creator.startswith("id:"): return creator[3:] campaign_id = query.get("c") or query.get("campaign_id") diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py index 1b67272..201d4d6 100644 --- 
a/gallery_dl/extractor/philomena.py +++ b/gallery_dl/extractor/philomena.py @@ -10,7 +10,6 @@ from .booru import BooruExtractor from .. import text, exception -import operator class PhilomenaExtractor(BooruExtractor): @@ -24,17 +23,22 @@ class PhilomenaExtractor(BooruExtractor): def _init(self): self.api = PhilomenaAPI(self) - if not self.config("svg", True): - self._file_url = operator.itemgetter("view_url") + self.svg = self.config("svg", True) def _file_url(self, post): - if post["format"] == "svg": - return post["view_url"].rpartition(".")[0] + ".svg" - return post["view_url"] + try: + url = post["representations"]["full"] + except Exception: + url = post["view_url"] + + if self.svg and post["format"] == "svg": + return url.rpartition(".")[0] + ".svg" + return url @staticmethod def _prepare(post): - post["date"] = text.parse_datetime(post["created_at"]) + post["date"] = text.parse_datetime( + post["created_at"][:19], "%Y-%m-%dT%H:%M:%S") BASE_PATTERN = PhilomenaExtractor.update({ diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 7fe8869..8a4905d 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -71,9 +71,12 @@ class PixivExtractor(Extractor): if self.meta_user: work.update(self.api.user_detail(work["user"]["id"])) if self.meta_comments: - if work["total_comments"]: - work["comments"] = list( - self.api.illust_comments(work["id"])) + if work["total_comments"] and not work.get("_ajax"): + try: + work["comments"] = list( + self.api.illust_comments(work["id"])) + except Exception: + work["comments"] = () else: work["comments"] = () if self.meta_bookmark and work["is_bookmarked"]: diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 89eafc8..f36b1f5 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -259,6 +259,8 @@ class RedditSubredditExtractor(RedditExtractor): self.subreddit, sub, params = match.groups() self.params = text.parse_query(params) if sub: + if sub == "search" and "restrict_sr" not in self.params: + self.params["restrict_sr"] = "1" self.subcategory += "-" + sub RedditExtractor.__init__(self, match) diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index 5e3a958..b5cdb9c 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -194,7 +194,6 @@ class SankakuAPI(): self.extractor = extractor self.headers = { "Accept" : "application/vnd.sankaku.api+json;v=2", - "Platform" : "web-app", "Api-Version": None, "Origin" : extractor.root, } diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py index 8668330..6c43941 100644 --- a/gallery_dl/extractor/subscribestar.py +++ b/gallery_dl/extractor/subscribestar.py @@ -51,6 +51,23 @@ class SubscribestarExtractor(Extractor): def posts(self): """Yield HTML content of all relevant posts""" + def request(self, url, **kwargs): + while True: + response = Extractor.request(self, url, **kwargs) + + if response.history and "/verify_subscriber" in response.url: + raise exception.StopExtraction( + "HTTP redirect to %s", response.url) + + content = response.content + if len(content) < 250 and b">redirected<" in content: + url = text.unescape(text.extr( + content, b'href="', b'"').decode()) + self.log.debug("HTML redirect message for %s", url) + continue + + return response + def login(self): if self.cookies_check(self.cookies_names): return @@ -189,10 +206,11 @@ class SubscribestarPostExtractor(SubscribestarExtractor): extr = 
diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py new file mode 100644 index 0000000..f129b1c --- /dev/null +++ b/gallery_dl/extractor/tiktok.py @@ -0,0 +1,253 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.tiktok.com/""" + +from .common import Extractor, Message +from .. import text, util, ytdl, exception + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?tiktokv?\.com" + + +class TiktokExtractor(Extractor): + """Base class for TikTok extractors""" + category = "tiktok" + directory_fmt = ("{category}", "{user}") + filename_fmt = ( + "{id}{num:?_//>02} {title[b:150]}{img_id:? [/]/}.{extension}") + archive_fmt = "{id}_{num}_{img_id}" + root = "https://www.tiktok.com" + cookies_domain = ".tiktok.com" + + def _init(self): + self.audio = self.config("audio", True) + self.video = self.config("videos", True) + if not self.config("avatar", True): + self.avatar = util.false + + def items(self): + # We assume that all of the URLs served by urls() come from the same + # author. + downloaded_avatar = not self.avatar() + + for tiktok_url in self.urls(): + tiktok_url = self._sanitize_url(tiktok_url) + data = self._extract_rehydration_data(tiktok_url) + if "webapp.video-detail" not in data: + # Only /video/ links result in the video-detail dict we need. + # Try again using that form of link.
+ tiktok_url = self._sanitize_url( + data["seo.abtest"]["canonical"]) + data = self._extract_rehydration_data(tiktok_url) + video_detail = data["webapp.video-detail"] + + if not self._check_status_code(video_detail, tiktok_url): + continue + + post = video_detail["itemInfo"]["itemStruct"] + author = post["author"] + post["user"] = user = author["uniqueId"] + post["date"] = text.parse_timestamp(post["createTime"]) + original_title = title = post["desc"] + + if not downloaded_avatar: + avatar_url = author["avatarLarger"] + avatar = self._generate_avatar( + avatar_url, post, user, author["id"]) + yield Message.Directory, avatar + yield Message.Url, avatar_url, avatar + downloaded_avatar = True + + yield Message.Directory, post + ytdl_media = False + + if "imagePost" in post: + if not original_title: + title = "TikTok photo #{}".format(post["id"]) + img_list = post["imagePost"]["images"] + for i, img in enumerate(img_list, 1): + url = img["imageURL"]["urlList"][0] + text.nameext_from_url(url, post) + post.update({ + "type" : "image", + "image" : img, + "title" : title, + "num" : i, + "img_id": post["filename"].partition("~")[0], + "width" : img["imageWidth"], + "height": img["imageHeight"], + }) + yield Message.Url, url, post + + if self.audio and "music" in post: + ytdl_media = "audio" + + elif self.video and "video" in post: + ytdl_media = "video" + + else: + self.log.info("%s: Skipping post", tiktok_url) + + if ytdl_media: + if not original_title: + title = "TikTok {} #{}".format(ytdl_media, post["id"]) + post.update({ + "type" : ytdl_media, + "image" : None, + "filename" : "", + "extension" : "mp3" if ytdl_media == "audio" else "mp4", + "title" : title, + "num" : 0, + "img_id" : "", + "width" : 0, + "height" : 0, + }) + yield Message.Url, "ytdl:" + tiktok_url, post + + # If we couldn't download the avatar because the given user has no + # posts, we'll need to make a separate request for the user's page + # and download the avatar that way.
+ if not downloaded_avatar: + user_name = self.avatar() + profile_url = "https://www.tiktok.com/@{}".format(user_name) + data = self._extract_rehydration_data(profile_url) + data = data["webapp.user-detail"]["userInfo"]["user"] + data["user"] = user_name + avatar_url = data["avatarLarger"] + avatar = self._generate_avatar( + avatar_url, data, user_name, data["id"]) + yield Message.Directory, avatar + yield Message.Url, avatar_url, avatar + + def avatar(self): + return False + + def _generate_avatar(self, avatar_url, data, user_name, user_id): + avatar = text.nameext_from_url(avatar_url, data.copy()) + avatar.update({ + "type" : "avatar", + "title" : "@" + user_name, + "id" : user_id, + "img_id": avatar["filename"].partition("~")[0], + "num" : 0, + }) + return avatar + + def _sanitize_url(self, url): + return text.ensure_http_scheme(url.replace("/photo/", "/video/", 1)) + + def _extract_rehydration_data(self, url): + html = self.request(url).text + data = text.extr( + html, '<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" ' + 'type="application/json">', '</script>') + return util.json_loads(data)["__DEFAULT_SCOPE__"] + + def _check_status_code(self, detail, url): + status = detail.get("statusCode") + if not status: + return True + + if status == 10222: + self.log.error("%s: Login required to access this post", url) + elif status == 10204: + self.log.error("%s: Requested post not available", url) + elif status == 10231: + self.log.error("%s: Region locked - Try downloading with a " + "VPN/proxy connection", url) + else: + self.log.error( + "%s: Received unknown error code %s ('%s')", + url, status, detail.get("statusMsg") or "") + return False + + +class TiktokPostExtractor(TiktokExtractor): + """Extract a single video or photo TikTok link""" + subcategory = "post" + pattern = BASE_PATTERN + r"/(?:@([\w_.-]*)|share)/(?:phot|vide)o/(\d+)" + example = "https://www.tiktok.com/@USER/photo/1234567890" + + def urls(self): + user, post_id = self.groups + url = "{}/@{}/video/{}".format(self.root, user or "", post_id) + return (url,) + + +class TiktokVmpostExtractor(TiktokExtractor): + """Extract a single video or photo TikTok VM link""" + subcategory = "vmpost" + pattern = (r"(?:https?://)?(?:" + r"(?:v[mt]\.)?tiktok\.com|(?:www\.)?tiktok\.com/t" + r")/(?!@)([^/?#]+)") + example = "https://vm.tiktok.com/1a2B3c4E5" + + def items(self): + url = text.ensure_http_scheme(self.url) + headers = {"User-Agent": "facebookexternalhit/1.1"} + + response = self.request(url, headers=headers, method="HEAD", + allow_redirects=False, notfound="post") + + url = response.headers.get("Location") + if not url or len(url) <= 28: + # https://www.tiktok.com/?_r=1 + raise exception.NotFoundError("post") + + data = {"_extractor": TiktokPostExtractor} + yield Message.Queue, url.partition("?")[0], data + + +class TiktokUserExtractor(TiktokExtractor): + """Extract a TikTok user's profile""" + subcategory = "user" + pattern = BASE_PATTERN + r"/@([\w_.-]+)/?(?:$|\?|#)" + example = "https://www.tiktok.com/@USER" + + def urls(self): + """Attempt to use yt-dlp/youtube-dl to extract links from a + user's page""" + + try: + module = ytdl.import_module(self.config("module")) + except (ImportError, SyntaxError) as exc: + self.log.error("Cannot import module '%s'", + getattr(exc, "name", "")) + self.log.debug("", exc_info=exc) + raise exception.ExtractionError("yt-dlp or youtube-dl is required " + "for this feature!") + extr_opts = { + "extract_flat" : True, + "ignore_no_formats_error": True, + } + user_opts = { + "retries" : self._retries, + "socket_timeout" : self._timeout, + "nocheckcertificate" : not self._verify, + "playlist_items" : str(self.config("tiktok-range", "")), + } + if self._proxies: + user_opts["proxy"] = self._proxies.get("http") + + ytdl_instance = ytdl.construct_YoutubeDL( + module, self, user_opts, extr_opts) + + # transfer cookies to ytdl + if self.cookies: + set_cookie = ytdl_instance.cookiejar.set_cookie + for cookie in self.cookies: + set_cookie(cookie) + + with ytdl_instance as ydl: + info_dict = ydl._YoutubeDL__extract_info( + "{}/@{}".format(self.root, self.groups[0]), + ydl.get_info_extractor("TikTokUser"), + False, {}, True) + # This should include video and photo posts in /video/ URL form. + return [video["url"] for video in info_dict["entries"]] + + def avatar(self): + return self.groups[0]
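All of the TikTok code above funnels page state through _extract_rehydration_data(). As a standalone illustration (not part of the patch; requests stands in for Extractor.request, while the script id and the "__DEFAULT_SCOPE__" key come from the code itself):

    import json
    import requests

    MARKER = ('<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" '
              'type="application/json">')

    def rehydration_data(url):
        # TikTok embeds its page state as a JSON blob in a <script> element
        page = requests.get(url, timeout=30).text
        start = page.index(MARKER) + len(MARKER)
        end = page.index("</script>", start)
        return json.loads(page[start:end])["__DEFAULT_SCOPE__"]

    # rehydration_data(post_url)["webapp.video-detail"]["itemInfo"]["itemStruct"]
    # then holds the post metadata dict consumed by items() above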
diff --git a/gallery_dl/extractor/twibooru.py b/gallery_dl/extractor/twibooru.py index a725a2c..3b0ea36 100644 --- a/gallery_dl/extractor/twibooru.py +++ b/gallery_dl/extractor/twibooru.py @@ -12,7 +12,7 @@ from .booru import BooruExtractor from .. import text, exception import operator -BASE_PATTERN = r"(?:https?://)?twibooru\.org" +BASE_PATTERN = r"(?:https?://)?(?:www\.)?twibooru\.org" class TwibooruExtractor(BooruExtractor): diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 840e846..c391bad 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -234,6 +234,13 @@ class TwitterExtractor(Extractor): for fmt in self._size_fallback: yield base + fmt + def _extract_components(self, tweet, data, files): + for component_id in data["components"]: + com = data["component_objects"][component_id] + for conv in com["data"]["conversation_preview"]: + for url in conv.get("mediaUrls") or (): + files.append({"url": url}) + def _extract_card(self, tweet, files): card = tweet["card"] if "legacy" in card: @@ -272,7 +279,11 @@ class TwitterExtractor(Extractor): return elif name == "unified_card": data = util.json_loads(bvals["unified_card"]["string_value"]) - self._extract_media(tweet, data["media_entities"].values(), files) + if "media_entities" in data: + self._extract_media( + tweet, data["media_entities"].values(), files) + if "component_objects" in data: + self._extract_components(tweet, data, files) return if self.cards == "ytdl": @@ -1065,7 +1076,7 @@ class TwitterAPI(): else: csrf_token = None if not csrf_token: - csrf_token = util.generate_token(80) + csrf_token = util.generate_token() cookies.set("ct0", csrf_token, domain=cookies_domain) auth_token = cookies.get("auth_token", domain=cookies_domain) diff --git a/gallery_dl/extractor/vipergirls.py b/gallery_dl/extractor/vipergirls.py index 5cde0d6..af3f32d 100644 --- a/gallery_dl/extractor/vipergirls.py +++ b/gallery_dl/extractor/vipergirls.py @@ -29,7 +29,17 @@ class VipergirlsExtractor(Extractor): def _init(self): domain = self.config("domain") if domain: - self.root = text.ensure_http_scheme(domain) + pos = domain.find("://") + if pos >= 0: + self.root = domain.rstrip("/") + self.cookies_domain = "." + domain[pos+1:].strip("/") + else: + domain = domain.strip("/") + self.root = "https://" + domain + self.cookies_domain = "."
+ domain + else: + self.root = "https://viper.click" + self.cookies_domain = ".viper.click" def items(self): self.login() diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py index 1c0c172..a53409c 100644 --- a/gallery_dl/extractor/vsco.py +++ b/gallery_dl/extractor/vsco.py @@ -38,7 +38,7 @@ class VscoExtractor(Extractor): if img["is_video"]: if not videos: continue - url = "https://" + img["video_url"] + url = text.ensure_http_scheme(img["video_url"]) else: base = img["responsive_url"].partition("/")[2] cdn, _, path = base.partition("/") @@ -63,6 +63,10 @@ class VscoExtractor(Extractor): "height": img["height"], "description": img.get("description") or "", }) + if data["extension"] == "m3u8": + url = "ytdl:" + url + data["_ytdl_manifest"] = "hls" + data["extension"] = "mp4" yield Message.Url, url, data def images(self): @@ -294,12 +298,33 @@ class VscoImageExtractor(VscoExtractor): pattern = USER_PATTERN + r"/media/([0-9a-fA-F]+)" example = "https://vsco.co/USER/media/0123456789abcdef" - def __init__(self, match): - VscoExtractor.__init__(self, match) - self.media_id = match.group(2) - def images(self): - url = "{}/{}/media/{}".format(self.root, self.user, self.media_id) + def images(self): + url = "{}/{}/media/{}".format(self.root, self.user, self.groups[1]) data = self._extract_preload_state(url) media = data["medias"]["byId"].popitem()[1]["media"] return (self._transform_media(media),) + + +class VscoVideoExtractor(VscoExtractor): + """Extractor for vsco.co video links""" + subcategory = "video" + pattern = USER_PATTERN + r"/video/([^/?#]+)" + example = "https://vsco.co/USER/video/012345678-9abc-def0" + + def images(self): + url = "{}/{}/video/{}".format(self.root, self.user, self.groups[1]) + data = self._extract_preload_state(url) + media = data["medias"]["byId"].popitem()[1]["media"] + + return ({ + "_id" : media["id"], + "is_video" : True, + "grid_name" : "", + "upload_date" : media["createdDate"], + "responsive_url": media["posterUrl"], + "video_url" : "ytdl:" + media.get("playbackUrl"), + "image_meta" : None, + "width" : media["width"], + "height" : media["height"], + "description" : media["description"], + },) diff --git a/gallery_dl/extractor/weebcentral.py b/gallery_dl/extractor/weebcentral.py index fc1badb..cacefd6 100644 --- a/gallery_dl/extractor/weebcentral.py +++ b/gallery_dl/extractor/weebcentral.py @@ -50,14 +50,16 @@ class WeebcentralChapterExtractor(WeebcentralBase, ChapterExtractor): def metadata(self, page): extr = text.extract_from(page) manga_id = extr("'series_id': '", "'") - - data = self._extract_manga_data(manga_id) - data["chapter_id"] = self.groups[1] - data["chapter_type"] = extr("'chapter_type': '", "'") - + chapter_type = extr("'chapter_type': '", "'") chapter, sep, minor = extr("'number': '", "'").partition(".") - data["chapter"] = text.parse_int(chapter) - data["chapter_minor"] = sep + minor + + data = { + "chapter": text.parse_int(chapter), + "chapter_id": self.groups[1], + "chapter_type": chapter_type, + "chapter_minor": sep + minor, + } + data.update(self._extract_manga_data(manga_id)) return data diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index 9885d79..3ed5a06 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -33,6 +33,7 @@ class WeiboExtractor(Extractor): self.livephoto = self.config("livephoto", True) self.retweets = self.config("retweets", False) self.videos = self.config("videos", True) + self.movies = self.config("movies", False) self.gifs = self.config("gifs", True)
self.gifs_video = (self.gifs == "video") @@ -134,7 +135,10 @@ class WeiboExtractor(Extractor): if "page_info" in status: info = status["page_info"] if "media_info" in info and self.videos: - append(self._extract_video(info["media_info"])) + if info.get("type") != "5" or self.movies: + append(self._extract_video(info["media_info"])) + else: + self.log.debug("%s: Ignoring 'movie' video", status["id"]) def _extract_video(self, info): try: diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 2914927..bea35e3 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -551,28 +551,24 @@ class DownloadJob(Job): archive_path = cfg("archive") if archive_path: - archive_path = util.expand_path(archive_path) - + archive_table = cfg("archive-table") archive_prefix = cfg("archive-prefix") if archive_prefix is None: - archive_prefix = extr.category + archive_prefix = extr.category if archive_table is None else "" archive_format = cfg("archive-format") if archive_format is None: archive_format = extr.archive_fmt try: - if "{" in archive_path: - archive_path = formatter.parse( - archive_path).format_map(kwdict) - if cfg("archive-mode") == "memory": - archive_cls = archive.DownloadArchiveMemory - else: - archive_cls = archive.DownloadArchive - self.archive = archive_cls( + self.archive = archive.connect( archive_path, - archive_prefix + archive_format, + archive_prefix, + archive_format, + archive_table, + cfg("archive-mode"), cfg("archive-pragma"), + kwdict, ) except Exception as exc: extr.log.warning( diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 222679a..3c03271 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -179,11 +179,15 @@ class PrintAction(argparse.Action): if not format_string: return - if "{" not in format_string and \ - " " not in format_string and \ - format_string[0] != "\f": - format_string = "{" + format_string + "}" - if format_string[-1] != "\n": + if format_string.startswith("\\f"): + format_string = "\f" + format_string[2:] + + if format_string[0] == "\f": + if format_string[1] == "F" and format_string[-1] != "\n": + format_string += "\n" + elif "{" not in format_string and " " not in format_string: + format_string = "{" + format_string + "}\n" + elif format_string[-1] != "\n": format_string += "\n" namespace.postprocessors.append({ diff --git a/gallery_dl/postprocessor/common.py b/gallery_dl/postprocessor/common.py index a9143a6..3099547 100644 --- a/gallery_dl/postprocessor/common.py +++ b/gallery_dl/postprocessor/common.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2023 Mike Fährmann +# Copyright 2018-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,7 +8,7 @@ """Common classes and constants used by postprocessor modules.""" -from .. import util, formatter, archive +from ..
import archive class PostProcessor(): @@ -25,11 +25,11 @@ class PostProcessor(): archive_path = options.get("archive") if archive_path: extr = job.extractor - archive_path = util.expand_path(archive_path) + archive_table = options.get("archive-table") archive_prefix = options.get("archive-prefix") if archive_prefix is None: - archive_prefix = extr.category + archive_prefix = extr.category if archive_table is None else "" archive_format = options.get("archive-format") if archive_format is None: @@ -38,13 +38,14 @@ archive_format = prefix + extr.archive_fmt try: - if "{" in archive_path: - archive_path = formatter.parse(archive_path).format_map( - job.pathfmt.kwdict) - self.archive = archive.DownloadArchive( + self.archive = archive.connect( archive_path, - archive_prefix + archive_format, + archive_prefix, + archive_format, + archive_table, + "file", options.get("archive-pragma"), + job.pathfmt.kwdict, "_archive_" + self.name, ) except Exception as exc: diff --git a/gallery_dl/postprocessor/compare.py b/gallery_dl/postprocessor/compare.py index 3bb63c8..c6bc54d 100644 --- a/gallery_dl/postprocessor/compare.py +++ b/gallery_dl/postprocessor/compare.py @@ -9,7 +9,7 @@ """Compare versions of the same file and replace/enumerate them on mismatch""" from .common import PostProcessor -from .. import text, util, exception +from .. import text, util, output, exception import os @@ -83,7 +83,7 @@ class ComparePP(PostProcessor): self._equal_cnt += 1 if self._equal_cnt >= self._equal_max: util.remove_file(pathfmt.temppath) - print() + output.stderr_write("\n") raise self._equal_exc() pathfmt.delete = True diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py index fec4ab0..3a32b39 100644 --- a/gallery_dl/postprocessor/ugoira.py +++ b/gallery_dl/postprocessor/ugoira.py @@ -9,7 +9,7 @@ """Convert Pixiv Ugoira to WebM""" from .common import PostProcessor -from .. import util +from .. import util, output import subprocess import tempfile import zipfile @@ -226,13 +226,13 @@ class UgoiraPP(PostProcessor): if self._finalize: self._finalize(pathfmt, tempdir) except OSError as exc: - print() + output.stderr_write("\n") self.log.error("Unable to invoke FFmpeg (%s: %s)", exc.__class__.__name__, exc) self.log.debug("", exc_info=exc) pathfmt.realpath = pathfmt.temppath except Exception as exc: - print() + output.stderr_write("\n") self.log.error("%s: %s", exc.__class__.__name__, exc) self.log.debug("", exc_info=exc) pathfmt.realpath = pathfmt.temppath @@ -296,7 +296,7 @@ class UgoiraPP(PostProcessor): out = None if self.output else subprocess.DEVNULL retcode = util.Popen(args, stdout=out, stderr=out).wait() if retcode: - print() + output.stderr_write("\n") self.log.error("Non-zero exit status when running %s (%s)", args, retcode) raise ValueError() diff --git a/gallery_dl/update.py b/gallery_dl/update.py index b068e37..6650ec4 100644 --- a/gallery_dl/update.py +++ b/gallery_dl/update.py @@ -12,7 +12,7 @@ import sys from .extractor.common import Extractor, Message from .job import DownloadJob -from . import util, version, exception +from .
import util, version, output, exception REPOS = { "stable" : "mikf/gallery-dl", @@ -23,14 +23,14 @@ REPOS = { BINARIES_STABLE = { "windows" : "gallery-dl.exe", - "windows_x86": "gallery-dl.exe", "windows_x64": "gallery-dl.exe", + "windows_x86": "gallery-dl_x86.exe", "linux" : "gallery-dl.bin", } BINARIES_DEV = { "windows" : "gallery-dl_windows.exe", - "windows_x86": "gallery-dl_windows_x86.exe", "windows_x64": "gallery-dl_windows.exe", + "windows_x86": "gallery-dl_windows_x86.exe", "linux" : "gallery-dl_linux", "macos" : "gallery-dl_macos", } @@ -143,13 +143,13 @@ class UpdateJob(DownloadJob): def _warning(self, msg, *args): if self._newline: self._newline = False - print() + output.stderr_write("\n") self.extractor.log.warning(msg, *args) def _error(self, msg, *args): if self._newline: self._newline = False - print() + output.stderr_write("\n") self.status |= 1 self.extractor.log.error(msg, *args) diff --git a/gallery_dl/util.py b/gallery_dl/util.py index 2302088..7034c0c 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -770,7 +770,7 @@ def import_file(path): finally: del sys.path[0] else: - return __import__(name) + return __import__(name.replace("-", "_")) def build_duration_func(duration, min=0.0): diff --git a/gallery_dl/version.py b/gallery_dl/version.py index d252bed..0c75005 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,5 +6,5 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.28.5" +__version__ = "1.29.0" __variant__ = None diff --git a/gallery_dl/ytdl.py b/gallery_dl/ytdl.py index 32545e2..319e781 100644 --- a/gallery_dl/ytdl.py +++ b/gallery_dl/ytdl.py @@ -20,7 +20,7 @@ def import_module(module_name): return __import__("yt_dlp") except (ImportError, SyntaxError): return __import__("youtube_dl") - return __import__(module_name.replace("-", "_")) + return util.import_file(module_name) def construct_YoutubeDL(module, obj, user_opts, system_opts=None):
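With the job.py and postprocessor/common.py changes above, every download archive is now opened through archive.connect(), which chooses the backend from the path and mode options. A usage sketch mirroring the positional call in job.py; every concrete value here is an illustrative assumption:

    from gallery_dl import archive

    db = archive.connect(
        "~/archives/gallery.sqlite3",  # postgres:// paths select PostgreSQL
        "",                      # key prefix; job.py passes "" with a table
        "{tweet_id}_{num}",      # archive-format template for entry keys
        "twitter",               # archive-table name; None means the default
        None,                    # archive-mode; "memory" defers writes
        ("journal_mode=WAL",),   # archive-pragma statements
        {"category": "twitter"},  # kwdict used to format path and table
    )

    item = {"tweet_id": 1490, "num": 1}
    if not db.check(item):  # also caches the key in item["_archive_key"]
        db.add(item)        # INSERT OR IGNORE the formatted key
    db.finalize()
    db.close()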