Diffstat (limited to 'gallery_dl')
51 files changed, 1367 insertions, 421 deletions
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py
index 19ea77b..bc44b35 100644
--- a/gallery_dl/__init__.py
+++ b/gallery_dl/__init__.py
@@ -38,6 +38,11 @@ def main():
             except ImportError:
                 import toml
             config.load(args.configs_toml, strict=True, loads=toml.loads)
+        if not args.colors:
+            output.ANSI = False
+            config.set((), "colors", False)
+            if util.WINDOWS:
+                config.set(("output",), "ansi", False)
         if args.filename:
             filename = args.filename
             if filename == "/O":
@@ -86,7 +91,7 @@ def main():
                 signal.signal(signal_num, signal.SIG_IGN)

         # enable ANSI escape sequences on Windows
-        if util.WINDOWS and config.get(("output",), "ansi"):
+        if util.WINDOWS and config.get(("output",), "ansi", output.COLORS):
             from ctypes import windll, wintypes, byref
             kernel32 = windll.kernel32
             mode = wintypes.DWORD()
@@ -113,7 +118,7 @@ def main():

         # loglevels
         output.configure_logging(args.loglevel)
-        if args.loglevel >= logging.ERROR:
+        if args.loglevel >= logging.WARNING:
             config.set(("output",), "mode", "null")
             config.set(("downloader",), "progress", None)
         elif args.loglevel <= logging.DEBUG:
@@ -122,7 +127,7 @@ def main():

             extra = ""
             if util.EXECUTABLE:
-                extra = " - Executable"
+                extra = " - Executable ({})".format(version.__variant__)
             else:
                 git_head = util.git_head()
                 if git_head:
@@ -178,7 +183,13 @@ def main():
             else:
                 extractor._module_iter = iter(modules[0])

-        if args.list_modules:
+        if args.update:
+            from . import update
+            extr = update.UpdateExtractor.from_url("update:" + args.update)
+            ujob = update.UpdateJob(extr)
+            return ujob.run()
+
+        elif args.list_modules:
             extractor.modules.append("")
             sys.stdout.write("\n".join(extractor.modules))
@@ -202,6 +213,7 @@ def main():
             if cnt is None:
                 log.error("Database file not available")
+                return 1
             else:
                 log.info(
                     "Deleted %d %s from '%s'",
@@ -294,6 +306,7 @@ def main():
                     input_manager.next()
             return retval
+        return 0

     except KeyboardInterrupt:
         raise SystemExit("\nKeyboardInterrupt")
diff --git a/gallery_dl/archive.py b/gallery_dl/archive.py
new file mode 100644
index 0000000..5f05bbf
--- /dev/null
+++ b/gallery_dl/archive.py
@@ -0,0 +1,98 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Download Archives"""
+
+import os
+import sqlite3
+from . import formatter
+
+
+class DownloadArchive():
+
+    def __init__(self, path, format_string, pragma=None,
+                 cache_key="_archive_key"):
+        try:
+            con = sqlite3.connect(path, timeout=60, check_same_thread=False)
+        except sqlite3.OperationalError:
+            os.makedirs(os.path.dirname(path))
+            con = sqlite3.connect(path, timeout=60, check_same_thread=False)
+        con.isolation_level = None
+
+        self.keygen = formatter.parse(format_string).format_map
+        self.connection = con
+        self.close = con.close
+        self.cursor = cursor = con.cursor()
+        self._cache_key = cache_key
+
+        if pragma:
+            for stmt in pragma:
+                cursor.execute("PRAGMA " + stmt)
+
+        try:
+            cursor.execute("CREATE TABLE IF NOT EXISTS archive "
+                           "(entry TEXT PRIMARY KEY) WITHOUT ROWID")
+        except sqlite3.OperationalError:
+            # fallback for missing WITHOUT ROWID support (#553)
+            cursor.execute("CREATE TABLE IF NOT EXISTS archive "
+                           "(entry TEXT PRIMARY KEY)")
+
+    def add(self, kwdict):
+        """Add item described by 'kwdict' to archive"""
+        key = kwdict.get(self._cache_key) or self.keygen(kwdict)
+        self.cursor.execute(
+            "INSERT OR IGNORE INTO archive (entry) VALUES (?)", (key,))
+
+    def check(self, kwdict):
+        """Return True if the item described by 'kwdict' exists in archive"""
+        key = kwdict[self._cache_key] = self.keygen(kwdict)
+        self.cursor.execute(
+            "SELECT 1 FROM archive WHERE entry=? LIMIT 1", (key,))
+        return self.cursor.fetchone()
+
+    def finalize(self):
+        pass
+
+
+class DownloadArchiveMemory(DownloadArchive):
+
+    def __init__(self, path, format_string, pragma=None,
+                 cache_key="_archive_key"):
+        DownloadArchive.__init__(self, path, format_string, pragma, cache_key)
+        self.keys = set()
+
+    def add(self, kwdict):
+        self.keys.add(
+            kwdict.get(self._cache_key) or
+            self.keygen(kwdict))
+
+    def check(self, kwdict):
+        key = kwdict[self._cache_key] = self.keygen(kwdict)
+        if key in self.keys:
+            return True
+        self.cursor.execute(
+            "SELECT 1 FROM archive WHERE entry=? LIMIT 1", (key,))
+        return self.cursor.fetchone()
+
+    def finalize(self):
+        if not self.keys:
+            return
+
+        cursor = self.cursor
+        with self.connection:
+            try:
+                cursor.execute("BEGIN")
+            except sqlite3.OperationalError:
+                pass
+
+            stmt = "INSERT OR IGNORE INTO archive (entry) VALUES (?)"
+            if len(self.keys) < 100:
+                for key in self.keys:
+                    cursor.execute(stmt, (key,))
+            else:
+                cursor.executemany(stmt, ((key,) for key in self.keys))
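A minimal sketch of driving the new archive module directly; the file path and
format string below are illustrative, not taken from the diff:

```python
# Sketch: exercising DownloadArchiveMemory on its own. check() caches the
# generated key in kwdict["_archive_key"], add() reuses it, and finalize()
# flushes all buffered keys to SQLite in a single transaction.
from gallery_dl.archive import DownloadArchiveMemory

archive = DownloadArchiveMemory(
    "/tmp/archive.sqlite3",            # SQLite database file (illustrative)
    "{category}_{id}",                 # entry-key format string
    pragma=("journal_mode=WAL",))      # optional PRAGMA statements

kwdict = {"category": "example", "id": 12345}
if not archive.check(kwdict):          # sets kwdict["_archive_key"]
    archive.add(kwdict)                # buffered in memory ...
archive.finalize()                     # ... and written out here
archive.close()
```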
diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py
index 478abb6..b4986c1 100644
--- a/gallery_dl/cookies.py
+++ b/gallery_dl/cookies.py
@@ -10,7 +10,6 @@
 # https://github.com/yt-dlp/yt-dlp/blob/master/yt_dlp/cookies.py

 import binascii
-import contextlib
 import ctypes
 import logging
 import os
@@ -147,7 +146,8 @@ def load_cookies_chrome(cookiejar, browser_name, profile=None,
         set_cookie(Cookie(
             0, name, value, None, False,
             domain, bool(domain), domain.startswith("."),
-            path, bool(path), secure, expires, False, None, None, {},
+            path, bool(path), secure, expires or None, False,
+            None, None, {},
         ))

     if failed_cookies > 0:
@@ -682,7 +682,8 @@ def _get_gnome_keyring_password(browser_keyring_name):
     # lists all keys and presumably searches for its key in the list.
     # It appears that we must do the same.
     # https://github.com/jaraco/keyring/issues/556
-    with contextlib.closing(secretstorage.dbus_init()) as con:
+    con = secretstorage.dbus_init()
+    try:
         col = secretstorage.get_default_collection(con)
         label = browser_keyring_name + " Safe Storage"
         for item in col.get_all_items():
@@ -691,6 +692,8 @@ def _get_gnome_keyring_password(browser_keyring_name):
         else:
             _log_error("Failed to read from GNOME keyring")
             return b""
+    finally:
+        con.close()


 def _get_linux_keyring_password(browser_keyring_name, keyring):
@@ -857,7 +860,7 @@ class DatabaseConnection():


 def Popen_communicate(*args):
-    proc = subprocess.Popen(
+    proc = util.Popen(
         args, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
     try:
         stdout, stderr = proc.communicate()
@@ -999,6 +1002,12 @@ def _decrypt_windows_dpapi(ciphertext):


 def _find_most_recently_used_file(root, filename):
+    # if the provided root points to an exact profile path
+    # check if it contains the wanted filename
+    first_choice = os.path.join(root, filename)
+    if os.path.exists(first_choice):
+        return first_choice
+
     # if there are multiple browser profiles, take the most recently used one
     paths = []
     for curr_root, dirs, files in os.walk(root):
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
index 0ff5dd9..54750ac 100644
--- a/gallery_dl/downloader/http.py
+++ b/gallery_dl/downloader/http.py
@@ -98,6 +98,8 @@ class HttpDownloader(DownloaderBase):
         metadata = self.metadata

         kwdict = pathfmt.kwdict
+        expected_status = kwdict.get(
+            "_http_expected_status", ())
         adjust_extension = kwdict.get(
             "_http_adjust_extension", self.adjust_extension)

@@ -151,7 +153,7 @@ class HttpDownloader(DownloaderBase):

             # check response
             code = response.status_code
-            if code == 200:  # OK
+            if code == 200 or code in expected_status:  # OK
                 offset = 0
                 size = response.headers.get("Content-Length")
             elif code == 206:  # Partial Content
@@ -399,6 +401,8 @@ MIME_TYPES = {
     "video/webm": "webm",
     "video/ogg" : "ogg",
     "video/mp4" : "mp4",
+    "video/m4v" : "m4v",
+    "video/x-m4v": "m4v",
     "video/quicktime": "mov",

     "audio/wav"  : "wav",
@@ -441,7 +445,8 @@ SIGNATURE_CHECKS = {
     "cur" : lambda s: s[0:4] == b"\x00\x00\x02\x00",
     "psd" : lambda s: s[0:4] == b"8BPS",
     "mp4" : lambda s: (s[4:8] == b"ftyp" and s[8:11] in (
-        b"mp4", b"avc", b"iso", b"M4V")),
+        b"mp4", b"avc", b"iso")),
+    "m4v" : lambda s: s[4:11] == b"ftypM4V",
     "mov" : lambda s: s[4:12] == b"ftypqt  ",
     "webm": lambda s: s[0:4] == b"\x1A\x45\xDF\xA3",
     "ogg" : lambda s: s[0:4] == b"OggS",
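The `_http_expected_status` kwdict field added above lets an extractor
whitelist status codes that the downloader would otherwise treat as failures.
A standalone model of the new check (names are mine, behavior per the diff):

```python
# Condensed model of the new status check in HttpDownloader: a response
# is accepted when its code is 200, or when the extractor whitelisted it
# via kwdict["_http_expected_status"] (see the hotleak.py change below).
def is_acceptable(code, kwdict):
    expected_status = kwdict.get("_http_expected_status", ())
    return code == 200 or code in expected_status

assert is_acceptable(200, {})
assert is_acceptable(404, {"_http_expected_status": (404,)})
assert not is_acceptable(404, {})
```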
diff --git a/gallery_dl/extractor/4archive.py b/gallery_dl/extractor/4archive.py
index d198369..948a605 100644
--- a/gallery_dl/extractor/4archive.py
+++ b/gallery_dl/extractor/4archive.py
@@ -64,7 +64,7 @@ class _4archiveThreadExtractor(Extractor):
         data = {
             "name": extr('class="name">', "</span>"),
             "date": text.parse_datetime(
-                extr('class="dateTime postNum" >', "<").strip(),
+                extr('class="dateTime postNum">', "<").strip(),
                 "%Y-%m-%d %H:%M:%S"),
             "no"  : text.parse_int(extr('href="#p', '"')),
         }
diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py
index fc16f43..a4b0997 100644
--- a/gallery_dl/extractor/8chan.py
+++ b/gallery_dl/extractor/8chan.py
@@ -26,6 +26,9 @@ class _8chanExtractor(Extractor):
         self.root = "https://8chan." + match.group(1)
         Extractor.__init__(self, match)

+    def _init(self):
+        self.cookies.set("TOS", "1", domain=self.root.rpartition("/")[2])
+
     @memcache()
     def cookies_prepare(self):
         # fetch captcha cookies
diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py
index 49fde7b..ce1a78d 100644
--- a/gallery_dl/extractor/artstation.py
+++ b/gallery_dl/extractor/artstation.py
@@ -22,6 +22,7 @@ class ArtstationExtractor(Extractor):
     directory_fmt = ("{category}", "{userinfo[username]}")
     archive_fmt = "{asset[id]}"
     browser = "firefox"
+    tls12 = False
     root = "https://www.artstation.com"

     def __init__(self, match):
diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py
index 84c3187..c97bf65 100644
--- a/gallery_dl/extractor/bluesky.py
+++ b/gallery_dl/extractor/bluesky.py
@@ -317,7 +317,7 @@ class BlueskyAPI():
     def get_author_feed(self, actor, filter="posts_and_author_threads"):
         endpoint = "app.bsky.feed.getAuthorFeed"
         params = {
-            "actor" : self._did_from_actor(actor),
+            "actor" : self._did_from_actor(actor, True),
             "filter": filter,
             "limit" : "100",
         }
@@ -327,7 +327,7 @@ class BlueskyAPI():
         endpoint = "app.bsky.feed.getFeed"
         params = {
             "feed" : "at://{}/app.bsky.feed.generator/{}".format(
-                self._did_from_actor(actor, False), feed),
+                self._did_from_actor(actor), feed),
             "limit": "100",
         }
         return self._pagination(endpoint, params)
@@ -344,7 +344,7 @@ class BlueskyAPI():
         endpoint = "app.bsky.feed.getListFeed"
         params = {
             "list" : "at://{}/app.bsky.graph.list/{}".format(
-                self._did_from_actor(actor, False), list),
+                self._did_from_actor(actor), list),
             "limit": "100",
         }
         return self._pagination(endpoint, params)
@@ -391,7 +391,7 @@ class BlueskyAPI():
         }
         return self._pagination(endpoint, params, "posts")

-    def _did_from_actor(self, actor, user_did=True):
+    def _did_from_actor(self, actor, user_did=False):
         if actor.startswith("did:"):
             did = actor
         else:
diff --git a/gallery_dl/extractor/cien.py b/gallery_dl/extractor/cien.py
new file mode 100644
index 0000000..a9ccab5
--- /dev/null
+++ b/gallery_dl/extractor/cien.py
@@ -0,0 +1,86 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://ci-en.net/"""
+
+from .common import Extractor, Message
+from .. import text, util
+
+BASE_PATTERN = r"(?:https?://)?ci-en\.(?:net|dlsite\.com)"
+
+
+class CienExtractor(Extractor):
+    category = "cien"
+    root = "https://ci-en.net"
+
+    def __init__(self, match):
+        self.root = text.root_from_url(match.group(0))
+        Extractor.__init__(self, match)
+
+    def _pagination_articles(self, url, params):
+        data = {"_extractor": CienArticleExtractor}
+        params["page"] = text.parse_int(params.get("page"), 1)
+
+        while True:
+            page = self.request(url, params=params).text
+
+            for card in text.extract_iter(
+                    page, ' class="c-cardCase-item', '</div>'):
+                article_url = text.extr(card, ' href="', '"')
+                yield Message.Queue, article_url, data
+
+            if ' rel="next"' not in page:
+                return
+            params["page"] += 1
+
+
+class CienArticleExtractor(CienExtractor):
+    subcategory = "article"
+    pattern = BASE_PATTERN + r"/creator/(\d+)/article/(\d+)"
+    example = "https://ci-en.net/creator/123/article/12345"
+
+    def items(self):
+        url = "{}/creator/{}/article/{}".format(
+            self.root, self.groups[0], self.groups[1])
+        page = self.request(url, notfound="article").text
+        return
+        yield 1
+
+
+class CienCreatorExtractor(CienExtractor):
+    subcategory = "creator"
+    pattern = BASE_PATTERN + r"/creator/(\d+)(?:/article(?:\?([^#]+))?)?/?$"
+    example = "https://ci-en.net/creator/123"
+
+    def items(self):
+        url = "{}/creator/{}/article".format(self.root, self.groups[0])
+        params = text.parse_query(self.groups[1])
+        params["mode"] = "list"
+        return self._pagination_articles(url, params)
+
+
+class CienRecentExtractor(CienExtractor):
+    subcategory = "recent"
+    pattern = BASE_PATTERN + r"/mypage/recent(?:\?([^#]+))?"
+    example = "https://ci-en.net/mypage/recent"
+
+    def items(self):
+        url = self.root + "/mypage/recent"
+        params = text.parse_query(self.groups[0])
+        return self._pagination_articles(url, params)
+
+
+class CienFollowingExtractor(CienExtractor):
+    subcategory = "following"
+    pattern = BASE_PATTERN + r"/mypage/subscription(/following)?"
+    example = "https://ci-en.net/mypage/subscription"
+
+    def items(self):
+        url = self.root + "/mypage/recent"
+        params = text.parse_query(self.groups[0])
+        return self._pagination_articles(url, params)
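The pagination helper above hands article URLs off as queue messages with an
`"_extractor"` hint. A simplified model of how such a message is resolved on
the consumer side (the real dispatch lives in `gallery_dl.job`; this is only a
sketch of the contract):

```python
# Rough model of queue-message resolution: a hinted extractor class is
# instantiated directly from the URL, otherwise the URL is matched
# against every registered extractor pattern.
from gallery_dl import extractor

def resolve_queue_message(url, data):
    cls = data.get("_extractor")
    if cls is not None:
        return cls.from_url(url)      # use the hinted class directly
    return extractor.find(url)        # else match against all patterns
```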
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index d14e13a..8771261 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -14,6 +14,7 @@ import ssl
 import time
 import netrc
 import queue
+import getpass
 import logging
 import datetime
 import requests
@@ -21,6 +22,7 @@ import threading
 from requests.adapters import HTTPAdapter
 from .message import Message
 from .. import config, text, util, cache, exception
+urllib3 = requests.packages.urllib3


 class Extractor():
@@ -45,6 +47,8 @@ class Extractor():
     def __init__(self, match):
         self.log = logging.getLogger(self.category)
         self.url = match.string
+        self.match = match
+        self.groups = match.groups()
         self._cfgpath = ("extractor", self.category, self.subcategory)
         self._parentdir = ""

@@ -168,22 +172,25 @@ class Extractor():
                     requests.exceptions.ChunkedEncodingError,
                     requests.exceptions.ContentDecodingError) as exc:
                 msg = exc
+                code = 0
             except (requests.exceptions.RequestException) as exc:
                 raise exception.HttpError(exc)
             else:
                 code = response.status_code
                 if self._write_pages:
                     self._dump_response(response)
-                if 200 <= code < 400 or fatal is None and \
-                        (400 <= code < 500) or not fatal and \
-                        (400 <= code < 429 or 431 <= code < 500):
+                if (
+                    code < 400 or
+                    code < 500 and (not fatal and code != 429 or fatal is None)
+                ):
                     if encoding:
                         response.encoding = encoding
                     return response
                 if notfound and code == 404:
                     raise exception.NotFoundError(notfound)

-                msg = "'{} {}' for '{}'".format(code, response.reason, url)
+                msg = "'{} {}' for '{}'".format(
+                    code, response.reason, response.url)
                 server = response.headers.get("Server")
                 if server and server.startswith("cloudflare") and \
                         code in (403, 503):
@@ -194,7 +201,10 @@ class Extractor():
                     if b'name="captcha-bypass"' in content:
                         self.log.warning("Cloudflare CAPTCHA")
                     break
-                if code not in retry_codes and code < 500:
+
+                if code == 429 and self._interval_429:
+                    pass
+                elif code not in retry_codes and code < 500:
                     break

             finally:
@@ -204,20 +214,24 @@ class Extractor():
             if tries > retries:
                 break

+            seconds = tries
             if self._interval:
-                seconds = self._interval()
-                if seconds < tries:
-                    seconds = tries
+                s = self._interval()
+                if seconds < s:
+                    seconds = s
+
+            if code == 429 and self._interval_429:
+                s = self._interval_429()
+                if seconds < s:
+                    seconds = s
+                self.wait(seconds=seconds, reason="429 Too Many Requests")
             else:
-                seconds = tries
-
-            self.sleep(seconds, "retry")
+                self.sleep(seconds, "retry")
             tries += 1

         raise exception.HttpError(msg, response)

     def wait(self, seconds=None, until=None, adjust=1.0,
-             reason="rate limit reset"):
+             reason="rate limit"):
         now = time.time()

         if seconds:
@@ -240,7 +254,7 @@ class Extractor():
         if reason:
             t = datetime.datetime.fromtimestamp(until).time()
             isotime = "{:02}:{:02}:{:02}".format(t.hour, t.minute, t.second)
-            self.log.info("Waiting until %s for %s.", isotime, reason)
+            self.log.info("Waiting until %s (%s)", isotime, reason)
         time.sleep(seconds)

     def sleep(self, seconds, reason):
@@ -248,6 +262,15 @@ class Extractor():
                        seconds, reason)
         time.sleep(seconds)

+    def input(self, prompt, echo=True):
+        if echo:
+            try:
+                return input(prompt)
+            except (EOFError, OSError):
+                return None
+        else:
+            return getpass.getpass(prompt)
+
     def _get_auth_info(self):
         """Return authentication information as (username, password) tuple"""
         username = self.config("username")
@@ -280,6 +303,9 @@ class Extractor():
             self.config("sleep-request", self.request_interval),
             self.request_interval_min,
         )
+        self._interval_429 = util.build_duration_func(
+            self.config("sleep-429", 60),
+        )

         if self._retries < 0:
             self._retries = float("inf")
@@ -439,9 +465,11 @@ class Extractor():
         if not path:
             return

+        path_tmp = path + ".tmp"
         try:
-            with open(path, "w") as fp:
+            with open(path_tmp, "w") as fp:
                 util.cookiestxt_store(fp, self.cookies)
+            os.replace(path_tmp, path)
         except OSError as exc:
             self.log.warning("cookies: %s", exc)

@@ -599,7 +627,7 @@ class GalleryExtractor(Extractor):

     def __init__(self, match, url=None):
         Extractor.__init__(self, match)
-        self.gallery_url = self.root + match.group(1) if url is None else url
+        self.gallery_url = self.root + self.groups[0] if url is None else url

     def items(self):
         self.login()
@@ -674,7 +702,7 @@ class MangaExtractor(Extractor):

     def __init__(self, match, url=None):
         Extractor.__init__(self, match)
-        self.manga_url = url or self.root + match.group(1)
+        self.manga_url = self.root + self.groups[0] if url is None else url

         if self.config("chapter-reverse", False):
             self.reverse = not self.reverse
@@ -736,17 +764,18 @@ class BaseExtractor(Extractor):
     instances = ()

     def __init__(self, match):
-        if not self.category:
-            self._init_category(match)
         Extractor.__init__(self, match)
+        if not self.category:
+            self._init_category()
+            self._cfgpath = ("extractor", self.category, self.subcategory)

-    def _init_category(self, match):
-        for index, group in enumerate(match.groups()):
+    def _init_category(self):
+        for index, group in enumerate(self.groups):
             if group is not None:
                 if index:
                     self.category, self.root, info = self.instances[index-1]
                     if not self.root:
-                        self.root = text.root_from_url(match.group(0))
+                        self.root = text.root_from_url(self.match.group(0))
                     self.config_instance = info.get
                 else:
                     self.root = group
@@ -806,12 +835,12 @@ def _build_requests_adapter(ssl_options, ssl_ciphers, source_address):
         pass

     if ssl_options or ssl_ciphers:
-        ssl_context = ssl.create_default_context()
-        if ssl_options:
-            ssl_context.options |= ssl_options
-        if ssl_ciphers:
-            ssl_context.set_ecdh_curve("prime256v1")
-            ssl_context.set_ciphers(ssl_ciphers)
+        ssl_context = urllib3.connection.create_urllib3_context(
+            options=ssl_options or None, ciphers=ssl_ciphers)
+        if requests.__version__ > "2.31":
+            # https://github.com/psf/requests/pull/6731
+            ssl_context.load_default_certs()
+        ssl_context.check_hostname = False
     else:
         ssl_context = None
@@ -931,8 +960,6 @@ SSL_CIPHERS = {
 }


-urllib3 = requests.packages.urllib3
-
 # detect brotli support
 try:
     BROTLI = urllib3.response.brotli is not None
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index ca8acaa..993885a 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -1457,9 +1457,8 @@ class DeviantartOAuthAPI():
                 self.log.info(
                     "Register your own OAuth application and use its "
                     "credentials to prevent this error: "
-                    "https://github.com/mikf/gallery-dl/blob/master/do"
-                    "cs/configuration.rst#extractordeviantartclient-id"
-                    "--client-secret")
+                    "https://gdl-org.github.io/docs/configuration.html"
+                    "#extractor-deviantart-client-id-client-secret")
             else:
                 if log:
                     self.log.error(msg)
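The reworked retry loop in common.py above picks the largest of several
candidate delays and routes 429 responses through wait() instead of sleep().
The selection logic, condensed into a standalone helper (names are mine):

```python
# Condensed model of the new delay selection in Extractor.request():
# the retry count is the base delay, and the regular interval plus the
# dedicated "sleep-429" interval can each raise it, whichever is largest.
def retry_delay(tries, interval=None, interval_429=None, code=0):
    seconds = tries
    if interval:
        seconds = max(seconds, interval())
    if code == 429 and interval_429:
        seconds = max(seconds, interval_429())
    return seconds

print(retry_delay(2, interval=lambda: 1.5))               # 2
print(retry_delay(1, interval_429=lambda: 60, code=429))  # 60
```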
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index acad95c..1805403 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -50,7 +50,7 @@ class ExhentaiExtractor(Extractor):

     def request(self, url, **kwargs):
         response = Extractor.request(self, url, **kwargs)
-        if response.history and response.headers.get("Content-Length") == "0":
+        if "Cache-Control" not in response.headers and not response.content:
             self.log.info("blank page")
             raise exception.AuthorizationError()
         return response
@@ -95,7 +95,11 @@ class ExhentaiExtractor(Extractor):
         self.cookies.clear()
         response = self.request(url, method="POST", headers=headers, data=data)

-        if b"You are now logged in as:" not in response.content:
+        content = response.content
+        if b"You are now logged in as:" not in content:
+            if b"The captcha was not entered correctly" in content:
+                raise exception.AuthenticationError(
+                    "CAPTCHA required. Use cookies instead.")
             raise exception.AuthenticationError()

         # collect more cookies
@@ -437,7 +441,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
             raise exception.AuthorizationError()
         if page.startswith(("Key missing", "Gallery not found")):
             raise exception.NotFoundError("gallery")
-        if "hentai.org/mpv/" in page:
+        if page.count("hentai.org/mpv/") > 1:
             self.log.warning("Enabled Multi-Page Viewer is not supported")
         return page
diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py
index 715abcb..85dd896 100644
--- a/gallery_dl/extractor/foolfuuka.py
+++ b/gallery_dl/extractor/foolfuuka.py
@@ -117,8 +117,8 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor):

     def __init__(self, match):
         FoolfuukaExtractor.__init__(self, match)
-        self.board = match.group(match.lastindex-1)
-        self.thread = match.group(match.lastindex)
+        self.board = self.groups[-2]
+        self.thread = self.groups[-1]
         self.data = None

     def metadata(self):
@@ -140,20 +140,22 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor):
 class FoolfuukaBoardExtractor(FoolfuukaExtractor):
     """Base extractor for FoolFuuka based boards/archives"""
     subcategory = "board"
-    pattern = BASE_PATTERN + r"/([^/?#]+)/\d*$"
+    pattern = BASE_PATTERN + r"/([^/?#]+)(?:/(?:page/)?(\d*))?$"
     example = "https://archived.moe/a/"

     def __init__(self, match):
         FoolfuukaExtractor.__init__(self, match)
-        self.board = match.group(match.lastindex)
+        self.board = self.groups[-2]
+        self.page = self.groups[-1]

     def items(self):
         index_base = "{}/_/api/chan/index/?board={}&page=".format(
             self.root, self.board)
         thread_base = "{}/{}/thread/".format(self.root, self.board)

-        for page in itertools.count(1):
-            with self.request(index_base + format(page)) as response:
+        page = self.page
+        for pnum in itertools.count(text.parse_int(page, 1)):
+            with self.request(index_base + format(pnum)) as response:
                 try:
                     threads = response.json()
                 except ValueError:
@@ -167,6 +169,9 @@ class FoolfuukaBoardExtractor(FoolfuukaExtractor):
                 thread["_extractor"] = FoolfuukaThreadExtractor
                 yield Message.Queue, thread["url"], thread

+            if page:
+                return
+

 class FoolfuukaSearchExtractor(FoolfuukaExtractor):
     """Base extractor for search results on FoolFuuka based boards/archives"""
@@ -179,17 +184,16 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor):
     def __init__(self, match):
         FoolfuukaExtractor.__init__(self, match)
         self.params = params = {}
-        args = match.group(match.lastindex).split("/")
-        key = None

-        for arg in args:
+        key = None
+        for arg in self.groups[-1].split("/"):
             if key:
                 params[key] = text.unescape(arg)
                 key = None
             else:
                 key = arg

-        board = match.group(match.lastindex-1)
+        board = self.groups[-2]
         if board != "_":
             params["boards"] = board
diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py
index 56721d0..6040187 100644
--- a/gallery_dl/extractor/furaffinity.py
+++ b/gallery_dl/extractor/furaffinity.py
@@ -11,7 +11,7 @@
 from .common import Extractor, Message
 from .. import text, util

-BASE_PATTERN = r"(?:https?://)?(?:www\.|sfw\.)?furaffinity\.net"
+BASE_PATTERN = r"(?:https?://)?(?:www\.|sfw\.)?(?:f[ux]|f?xfu)raffinity\.net"


 class FuraffinityExtractor(Extractor):
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index 2459a61..37c776e 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -51,19 +51,44 @@ class GelbooruBase():
         params["pid"] = self.page_start
         params["limit"] = self.per_page
         limit = self.per_page // 2
+        pid = False
+
+        if "tags" in params:
+            tags = params["tags"].split()
+            op = "<"
+            id = False
+
+            for tag in tags:
+                if tag.startswith("sort:"):
+                    if tag == "sort:id:asc":
+                        op = ">"
+                    elif tag == "sort:id" or tag.startswith("sort:id:"):
+                        op = "<"
+                    else:
+                        pid = True
+                elif tag.startswith("id:"):
+                    id = True
+
+            if not pid:
+                if id:
+                    tag = "id:" + op
+                    tags = [t for t in tags if not t.startswith(tag)]
+                tags = "{} id:{}".format(" ".join(tags), op)

         while True:
             posts = self._api_request(params)

-            for post in posts:
-                yield post
+            yield from posts

             if len(posts) < limit:
                 return

-            if "pid" in params:
-                del params["pid"]
-            params["tags"] = "{} id:<{}".format(self.tags, post["id"])
+            if pid:
+                params["pid"] += 1
+            else:
+                if "pid" in params:
+                    del params["pid"]
+                params["tags"] = tags + str(posts[-1]["id"])

     def _pagination_html(self, params):
         url = self.root + "/index.php"
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py
index 7ab6d02..8d8b8ad 100644
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -25,7 +25,7 @@ class GelbooruV02Extractor(booru.BooruExtractor):
         self.api_root = self.config_instance("api_root") or self.root

         if self.category == "realbooru":
-            self._file_url = self._file_url_realbooru
+            self.items = self._items_realbooru
             self._tags = self._tags_realbooru

     def _api_request(self, params):
@@ -124,6 +124,35 @@ class GelbooruV02Extractor(booru.BooruExtractor):
             self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2])
         return url

+    def _items_realbooru(self):
+        from .common import Message
+        data = self.metadata()
+
+        for post in self.posts():
+            try:
+                html = self._html(post)
+                fallback = post["file_url"]
+                url = post["file_url"] = text.rextract(
+                    html, 'href="', '"', html.index(">Original<"))[0]
+            except Exception:
+                self.log.debug("Unable to fetch download URL for post %s "
+                               "(md5: %s)", post.get("id"), post.get("md5"))
+                continue
+
+            text.nameext_from_url(url, post)
+            post.update(data)
+            self._prepare(post)
+            self._tags(post, html)
+
+            path = url.rpartition("/")[0]
+            post["_fallback"] = (
+                "{}/{}.{}".format(path, post["md5"], post["extension"]),
+                fallback,
+            )
+
+            yield Message.Directory, post
+            yield Message.Url, url, post
+
     def _tags_realbooru(self, post, page):
         tag_container = text.extr(page, 'id="tagLink"', '</div>')
         tags = collections.defaultdict(list)
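The rewritten GelbooruBase pagination above switches between two stepping
modes depending on any `sort:` tags in the query. A condensed sketch of just
the page-advance step (function and parameter names are mine):

```python
# Condensed model of the two paging modes in the new GelbooruBase code:
# id-ordered queries step by appending an "id:<LAST" (or "id:>LAST")
# filter tag, while custom "sort:" orders fall back to incrementing "pid".
def next_page(params, posts, use_pid, base_tags, op="<"):
    if use_pid:                              # e.g. "sort:score" queries
        params["pid"] += 1
    else:                                    # default / "sort:id" queries
        params.pop("pid", None)
        params["tags"] = "{} id:{}{}".format(base_tags, op, posts[-1]["id"])
```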
- r"(?:1st)?hiperdex\d?\.(?:com|net|info))") + r"(?:1st)?hiperdex\d?\.(?:com|net|info|top))") class HiperdexBase(): """Base class for hiperdex extractors""" category = "hiperdex" - root = "https://hiperdex.com" + root = "https://hiperdex.top" @memcache(keyarg=1) def manga_data(self, manga, page=None): if not page: - url = "{}/mangas/{}/".format(self.root, manga) + url = "{}/manga/{}/".format(self.root, manga) page = self.request(url).text extr = text.extract_from(page) @@ -67,9 +67,9 @@ class HiperdexBase(): class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor): - """Extractor for manga chapters from hiperdex.com""" + """Extractor for hiperdex manga chapters""" pattern = BASE_PATTERN + r"(/mangas?/([^/?#]+)/([^/?#]+))" - example = "https://hiperdex.com/mangas/MANGA/CHAPTER/" + example = "https://hiperdex.top/manga/MANGA/CHAPTER/" def __init__(self, match): root, path, self.manga, self.chapter = match.groups() @@ -88,10 +88,10 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor): class HiperdexMangaExtractor(HiperdexBase, MangaExtractor): - """Extractor for manga from hiperdex.com""" + """Extractor for hiperdex manga""" chapterclass = HiperdexChapterExtractor pattern = BASE_PATTERN + r"(/mangas?/([^/?#]+))/?$" - example = "https://hiperdex.com/mangas/MANGA/" + example = "https://hiperdex.top/manga/MANGA/" def __init__(self, match): root, path, self.manga = match.groups() @@ -121,13 +121,13 @@ class HiperdexMangaExtractor(HiperdexBase, MangaExtractor): class HiperdexArtistExtractor(HiperdexBase, MangaExtractor): - """Extractor for an artists's manga on hiperdex.com""" + """Extractor for an artists's manga on hiperdex""" subcategory = "artist" categorytransfer = False chapterclass = HiperdexMangaExtractor reverse = False pattern = BASE_PATTERN + r"(/manga-a(?:rtist|uthor)/(?:[^/?#]+))" - example = "https://hiperdex.com/manga-artist/NAME/" + example = "https://hiperdex.top/manga-artist/NAME/" def __init__(self, match): self.root = text.ensure_http_scheme(match.group(1)) diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py index 6d3184d..a2b51be 100644 --- a/gallery_dl/extractor/hotleak.py +++ b/gallery_dl/extractor/hotleak.py @@ -23,6 +23,7 @@ class HotleakExtractor(Extractor): def items(self): for post in self.posts(): + post["_http_expected_status"] = (404,) yield Message.Directory, post yield Message.Url, post["url"], post diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index 86b1edd..481fb1e 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -68,7 +68,7 @@ class ImgurImageExtractor(ImgurExtractor): filename_fmt = "{category}_{id}{title:?_//}.{extension}" archive_fmt = "{id}" pattern = (BASE_PATTERN + r"/(?!gallery|search)" - r"(?:r/\w+/)?(\w{7}|\w{5})[sbtmlh]?") + r"(?:r/\w+/)?(?:[^/?#]+-)?(\w{7}|\w{5})[sbtmlh]?") example = "https://imgur.com/abcdefg" def items(self): @@ -93,7 +93,7 @@ class ImgurAlbumExtractor(ImgurExtractor): directory_fmt = ("{category}", "{album[id]}{album[title]:? 
- //}") filename_fmt = "{category}_{album[id]}_{num:>03}_{id}.{extension}" archive_fmt = "{album[id]}_{id}" - pattern = BASE_PATTERN + r"/a/(\w{7}|\w{5})" + pattern = BASE_PATTERN + r"/a/(?:[^/?#]+-)?(\w{7}|\w{5})" example = "https://imgur.com/a/abcde" def items(self): @@ -126,7 +126,7 @@ class ImgurAlbumExtractor(ImgurExtractor): class ImgurGalleryExtractor(ImgurExtractor): """Extractor for imgur galleries""" subcategory = "gallery" - pattern = BASE_PATTERN + r"/(?:gallery|t/\w+)/(\w{7}|\w{5})" + pattern = BASE_PATTERN + r"/(?:gallery|t/\w+)/(?:[^/?#]+-)?(\w{7}|\w{5})" example = "https://imgur.com/gallery/abcde" def items(self): diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py index 62586af..2ae8cbe 100644 --- a/gallery_dl/extractor/inkbunny.py +++ b/gallery_dl/extractor/inkbunny.py @@ -330,15 +330,18 @@ class InkbunnyAPI(): def _call(self, endpoint, params): url = "https://inkbunny.net/api_" + endpoint + ".php" params["sid"] = self.session_id - data = self.extractor.request(url, params=params).json() - if "error_code" in data: + while True: + data = self.extractor.request(url, params=params).json() + + if "error_code" not in data: + return data + if str(data["error_code"]) == "2": self.authenticate(invalidate=True) - return self._call(endpoint, params) - raise exception.StopExtraction(data.get("error_message")) + continue - return data + raise exception.StopExtraction(data.get("error_message")) def _pagination_search(self, params): params["page"] = 1 diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 9c77b7a..b0c24de 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -57,7 +57,7 @@ class KemonopartyExtractor(Extractor): generators = self._build_file_generators(self.config("files")) duplicates = self.config("duplicates") comments = self.config("comments") - username = dms = None + username = dms = announcements = None # prevent files from being sent with gzip compression headers = {"Accept-Encoding": "identity"} @@ -68,6 +68,8 @@ class KemonopartyExtractor(Extractor): '<meta name="artist_name" content="', '"')[0]) if self.config("dms"): dms = True + if self.config("announcements"): + announcements = True posts = self.posts() max_posts = self.config("max-posts") @@ -80,7 +82,7 @@ class KemonopartyExtractor(Extractor): self.root, post["service"], post["user"], post["id"]) post["_http_headers"] = headers post["date"] = self._parse_datetime( - post["published"] or post["added"]) + post.get("published") or post.get("added") or "") if username: post["username"] = username @@ -88,8 +90,12 @@ class KemonopartyExtractor(Extractor): post["comments"] = self._extract_comments(post) if dms is not None: if dms is True: - dms = self._extract_dms(post) + dms = self._extract_cards(post, "dms") post["dms"] = dms + if announcements is not None: + if announcements is True: + announcements = self._extract_cards(post, "announcements") + post["announcements"] = announcements files = [] hashes = set() @@ -156,7 +162,7 @@ class KemonopartyExtractor(Extractor): def _file(self, post): file = post["file"] - if not file: + if not file or "path" not in file: return () file["type"] = "file" return (file,) @@ -200,21 +206,21 @@ class KemonopartyExtractor(Extractor): }) return comments - def _extract_dms(self, post): - url = "{}/{}/user/{}/dms".format( - self.root, post["service"], post["user"]) + def _extract_cards(self, post, type): + url = "{}/{}/user/{}/{}".format( + self.root, post["service"], 
post["user"], type) page = self.request(url).text - dms = [] - for dm in text.extract_iter(page, "<article", "</article>"): - footer = text.extr(dm, "<footer", "</footer>") - dms.append({ + cards = [] + for card in text.extract_iter(page, "<article", "</article>"): + footer = text.extr(card, "<footer", "</footer>") + cards.append({ "body": text.unescape(text.extr( - dm, "<pre>", "</pre></", + card, "<pre>", "</pre></", ).strip()), - "date": text.extr(footer, 'Published: ', '\n'), + "date": text.extr(footer, ': ', '\n'), }) - return dms + return cards def _parse_datetime(self, date_string): if len(date_string) > 19: @@ -494,7 +500,8 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor): def __init__(self, match): KemonopartyExtractor.__init__(self, match) - self.favorites = (text.parse_query(match.group(3)).get("type") or + self.params = text.parse_query(match.group(3)) + self.favorites = (self.params.get("type") or self.config("favorites") or "artist") @@ -502,9 +509,17 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor): self._prepare_ddosguard_cookies() self.login() + sort = self.params.get("sort") + order = self.params.get("order") or "desc" + if self.favorites == "artist": users = self.request( self.root + "/api/v1/account/favorites?type=artist").json() + + if not sort: + sort = "updated" + users.sort(key=lambda x: x[sort], reverse=(order == "desc")) + for user in users: user["_extractor"] = KemonopartyUserExtractor url = "{}/{}/user/{}".format( @@ -514,6 +529,11 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor): elif self.favorites == "post": posts = self.request( self.root + "/api/v1/account/favorites?type=post").json() + + if not sort: + sort = "faved_seq" + posts.sort(key=lambda x: x[sort], reverse=(order == "desc")) + for post in posts: post["_extractor"] = KemonopartyPostExtractor url = "{}/{}/user/{}/post/{}".format( diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index 030d7d1..cb7f701 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -29,6 +29,7 @@ class MastodonExtractor(BaseExtractor): self.instance = self.root.partition("://")[2] self.reblogs = self.config("reblogs", False) self.replies = self.config("replies", True) + self.cards = self.config("cards", False) def items(self): for status in self.statuses(): @@ -48,6 +49,17 @@ class MastodonExtractor(BaseExtractor): if status["reblog"]: attachments.extend(status["reblog"]["media_attachments"]) + if self.cards: + card = status.get("card") + if card: + url = card.get("image") + if url: + card["weburl"] = card.get("url") + card["url"] = url + card["id"] = "card" + "".join( + url.split("/")[6:-2]).lstrip("0") + attachments.append(card) + status["instance"] = self.instance acct = status["account"]["acct"] status["instance_remote"] = \ @@ -120,6 +132,7 @@ class MastodonUserExtractor(MastodonExtractor): api.account_id_by_username(self.item), only_media=( not self.reblogs and + not self.cards and not self.config("text-posts", False) ), exclude_replies=not self.replies, @@ -136,6 +149,36 @@ class MastodonBookmarkExtractor(MastodonExtractor): return MastodonAPI(self).account_bookmarks() +class MastodonFavoriteExtractor(MastodonExtractor): + """Extractor for mastodon favorites""" + subcategory = "favorite" + pattern = BASE_PATTERN + r"/favourites" + example = "https://mastodon.social/favourites" + + def statuses(self): + return MastodonAPI(self).account_favorites() + + +class MastodonListExtractor(MastodonExtractor): + """Extractor for 
mastodon lists""" + subcategory = "list" + pattern = BASE_PATTERN + r"/lists/(\w+)" + example = "https://mastodon.social/lists/12345" + + def statuses(self): + return MastodonAPI(self).timelines_list(self.item) + + +class MastodonHashtagExtractor(MastodonExtractor): + """Extractor for mastodon hashtags""" + subcategory = "hashtag" + pattern = BASE_PATTERN + r"/tags/(\w+)" + example = "https://mastodon.social/tags/NAME" + + def statuses(self): + return MastodonAPI(self).timelines_tag(self.item) + + class MastodonFollowingExtractor(MastodonExtractor): """Extractor for followed mastodon users""" subcategory = "following" @@ -205,37 +248,55 @@ class MastodonAPI(): raise exception.NotFoundError("account") def account_bookmarks(self): + """Statuses the user has bookmarked""" endpoint = "/v1/bookmarks" return self._pagination(endpoint, None) + def account_favorites(self): + """Statuses the user has favourited""" + endpoint = "/v1/favourites" + return self._pagination(endpoint, None) + def account_following(self, account_id): + """Accounts which the given account is following""" endpoint = "/v1/accounts/{}/following".format(account_id) return self._pagination(endpoint, None) def account_lookup(self, username): + """Quickly lookup a username to see if it is available""" endpoint = "/v1/accounts/lookup" params = {"acct": username} return self._call(endpoint, params).json() def account_search(self, query, limit=40): - """Search for accounts""" + """Search for matching accounts by username or display name""" endpoint = "/v1/accounts/search" params = {"q": query, "limit": limit} return self._call(endpoint, params).json() def account_statuses(self, account_id, only_media=True, exclude_replies=False): - """Fetch an account's statuses""" + """Statuses posted to the given account""" endpoint = "/v1/accounts/{}/statuses".format(account_id) - params = {"only_media" : "1" if only_media else "0", - "exclude_replies": "1" if exclude_replies else "0"} + params = {"only_media" : "true" if only_media else "false", + "exclude_replies": "true" if exclude_replies else "false"} return self._pagination(endpoint, params) def status(self, status_id): - """Fetch a status""" + """Obtain information about a status""" endpoint = "/v1/statuses/" + status_id return self._call(endpoint).json() + def timelines_list(self, list_id): + """View statuses in the given list timeline""" + endpoint = "/v1/timelines/list/" + list_id + return self._pagination(endpoint, None) + + def timelines_tag(self, hashtag): + """View public statuses containing the given hashtag""" + endpoint = "/v1/timelines/tag/" + hashtag + return self._pagination(endpoint, None) + def _call(self, endpoint, params=None): if endpoint.startswith("http"): url = endpoint diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 4cdcf87..7ac3a3a 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -102,30 +102,55 @@ class NewgroundsExtractor(Extractor): def _login_impl(self, username, password): self.log.info("Logging in as %s", username) - url = self.root + "/passport/" + url = self.root + "/passport" response = self.request(url) if response.history and response.url.endswith("/social"): return self.cookies page = response.text - headers = {"Origin": self.root, "Referer": url} + headers = { + "Accept": "application/json, text/javascript, */*; q=0.01", + "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", + "X-Requested-With": "XMLHttpRequest", + "Origin": self.root, + "Referer": url, 
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index 4cdcf87..7ac3a3a 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -102,30 +102,55 @@ class NewgroundsExtractor(Extractor):

     def _login_impl(self, username, password):
         self.log.info("Logging in as %s", username)

-        url = self.root + "/passport/"
+        url = self.root + "/passport"
         response = self.request(url)
         if response.history and response.url.endswith("/social"):
             return self.cookies

         page = response.text
-        headers = {"Origin": self.root, "Referer": url}
+        headers = {
+            "Accept": "application/json, text/javascript, */*; q=0.01",
+            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
+            "X-Requested-With": "XMLHttpRequest",
+            "Origin": self.root,
+            "Referer": url,
+        }
         url = text.urljoin(self.root, text.extr(page, 'action="', '"'))
         data = {
-            "username": username,
-            "password": password,
-            "remember": "1",
-            "login"   : "1",
             "auth"    : text.extr(page, 'name="auth" value="', '"'),
+            "remember": "1",
+            "username": username,
+            "password": str(password),
+            "code"    : "",
+            "codehint": "------",
+            "mfaCheck": "1",
         }

-        response = self.request(url, method="POST", headers=headers, data=data)
-        if not response.history:
-            raise exception.AuthenticationError()
+        while True:
+            response = self.request(
+                url, method="POST", headers=headers, data=data)
+            result = response.json()
+
+            if result.get("success"):
+                break
+            if "errors" in result:
+                raise exception.AuthenticationError(
+                    '"' + '", "'.join(result["errors"]) + '"')
+
+            if result.get("requiresMfa"):
+                data["code"] = self.input("Verification Code: ")
+                data["codehint"] = "      "
+            elif result.get("requiresEmailMfa"):
+                email = result.get("obfuscatedEmail")
+                prompt = "Email Verification Code ({}): ".format(email)
+                data["code"] = self.input(prompt)
+                data["codehint"] = "      "
+
+            data.pop("mfaCheck", None)

         return {
             cookie.name: cookie.value
-            for cookie in response.history[0].cookies
-            if cookie.expires and cookie.domain == self.cookies_domain
+            for cookie in response.cookies
         }
diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
index 8c8a5a9..5571575 100644
--- a/gallery_dl/extractor/oauth.py
+++ b/gallery_dl/extractor/oauth.py
@@ -110,7 +110,7 @@ class OAuthBase(Extractor):

         # get a request token
         params = {"oauth_callback": self.redirect_uri}
-        data = self.session.get(request_token_url, params=params).text
+        data = self.request(request_token_url, params=params).text

         data = text.parse_query(data)
         self.session.auth.token_secret = data["oauth_token_secret"]
@@ -120,7 +120,7 @@ class OAuthBase(Extractor):
         data = self.open(authorize_url, params)

         # exchange the request token for an access token
-        data = self.session.get(access_token_url, params=data).text
+        data = self.request(access_token_url, params=data).text

         data = text.parse_query(data)
         token = data["oauth_token"]
         token_secret = data["oauth_token_secret"]
@@ -189,7 +189,8 @@ class OAuthBase(Extractor):
             data["client_id"] = client_id
             data["client_secret"] = client_secret

-        data = self.session.post(token_url, data=data, auth=auth).json()
+        data = self.request(
+            token_url, method="POST", data=data, auth=auth).json()

         # check token response
         if "error" in data:
@@ -386,7 +387,7 @@ class OAuthMastodon(OAuthBase):
             "redirect_uris": self.redirect_uri,
             "scopes": "read",
         }
-        data = self.session.post(url, data=data).json()
+        data = self.request(url, method="POST", data=data).json()

         if "client_id" not in data or "client_secret" not in data:
             raise exception.StopExtraction(
@@ -441,7 +442,8 @@ class OAuthPixiv(OAuthBase):
             "redirect_uri" : "https://app-api.pixiv.net"
                              "/web/v1/users/auth/pixiv/callback",
         }
-        data = self.session.post(url, headers=headers, data=data).json()
+        data = self.request(
+            url, method="POST", headers=headers, data=data).json()

         if "error" in data:
             stdout_write("\n{}\n".format(data))
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index 62d11f2..eb6d677 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -263,8 +263,9 @@ class PatreonExtractor(Extractor):
             page, 'id="__NEXT_DATA__" type="application/json">', '</script')
         if data:
             try:
-                return (util.json_loads(data)["props"]["pageProps"]
-                        ["bootstrapEnvelope"]["bootstrap"])
+                data = util.json_loads(data)
+                env = data["props"]["pageProps"]["bootstrapEnvelope"]
+                return env.get("pageBootstrap") or env["bootstrap"]
             except Exception as exc:
                 self.log.debug("%s: %s", exc.__class__.__name__, exc)
diff --git a/gallery_dl/extractor/pixeldrain.py b/gallery_dl/extractor/pixeldrain.py
index 5cfdc43..83f3577 100644
--- a/gallery_dl/extractor/pixeldrain.py
+++ b/gallery_dl/extractor/pixeldrain.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2023 Mike Fährmann
+# Copyright 2023-2024 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -59,12 +59,13 @@ class PixeldrainAlbumExtractor(PixeldrainExtractor):
     directory_fmt = ("{category}",
                      "{album[date]:%Y-%m-%d} {album[title]} ({album[id]})")
     filename_fmt = "{num:>03} {filename[:230]} ({id}).{extension}"
-    pattern = BASE_PATTERN + r"/(?:l|api/list)/(\w+)"
+    pattern = BASE_PATTERN + r"/(?:l|api/list)/(\w+)(?:#item=(\d+))?"
     example = "https://pixeldrain.com/l/abcdefgh"

     def __init__(self, match):
         Extractor.__init__(self, match)
         self.album_id = match.group(1)
+        self.file_index = match.group(2)

     def items(self):
         url = "{}/api/list/{}".format(self.root, self.album_id)
@@ -74,11 +75,20 @@ class PixeldrainAlbumExtractor(PixeldrainExtractor):
         album["count"] = album["file_count"]
         album["date"] = self.parse_datetime(album["date_created"])

+        if self.file_index:
+            idx = text.parse_int(self.file_index)
+            try:
+                files = (files[idx],)
+            except LookupError:
+                files = ()
+        else:
+            idx = 0
+
         del album["files"]
         del album["file_count"]

         yield Message.Directory, {"album": album}
-        for num, file in enumerate(files, 1):
+        for num, file in enumerate(files, idx+1):
             file["album"] = album
             file["num"] = num
             file["url"] = url = "{}/api/file/{}?download".format(
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index 862a7db..d732894 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -104,8 +104,9 @@ class PixivExtractor(Extractor):
             elif work["page_count"] == 1:
                 url = meta_single_page["original_image_url"]
                 if url == url_sanity:
-                    self.log.debug("Skipping 'sanity_level' warning (%s)",
-                                   work["id"])
+                    self.log.warning(
+                        "Unable to download work %s ('sanity_level' warning)",
+                        work["id"])
                     continue
                 work["date_url"] = self._date_from_url(url)
                 yield Message.Url, url, text.nameext_from_url(url, work)
@@ -619,6 +620,7 @@ class PixivNovelExtractor(PixivExtractor):
             meta_user = self.config("metadata")
             meta_bookmark = self.config("metadata-bookmark")
             embeds = self.config("embeds")
+            covers = self.config("covers")

             if embeds:
                 headers = {
@@ -658,6 +660,19 @@ class PixivNovelExtractor(PixivExtractor):
                 novel["extension"] = "txt"
                 yield Message.Url, "text:" + content, novel

+                if covers:
+                    path = novel["image_urls"]["large"].partition("/img/")[2]
+                    url = ("https://i.pximg.net/novel-cover-original/img/" +
+                           path.rpartition(".")[0].replace("_master1200", ""))
+                    novel["date_url"] = self._date_from_url(url)
+                    novel["num"] += 1
+                    novel["suffix"] = "_p{:02}".format(novel["num"])
+                    novel["_fallback"] = (url + ".png",)
+                    url_jpg = url + ".jpg"
+                    text.nameext_from_url(url_jpg, novel)
+                    yield Message.Url, url_jpg, novel
+                    del novel["_fallback"]
+
             if embeds:
                 desktop = False
                 illusts = {}
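The novel-cover URL above is derived from the preview URL by string surgery;
a worked example with a made-up path (the input URL is illustrative, but
follows the i.pximg.net layout):

```python
# Worked example of the cover-URL derivation added above; the input
# URL is invented for illustration.
large = ("https://i.pximg.net/c/600x600/novel-cover-master/"
         "img/2024/01/01/00/00/00/12345_abcdef_master1200.jpg")
path = large.partition("/img/")[2]
url = ("https://i.pximg.net/novel-cover-original/img/" +
       path.rpartition(".")[0].replace("_master1200", ""))
print(url + ".jpg")   # primary download candidate
print(url + ".png")   # registered as the _fallback URL
```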
= "{post_id}_{num}" request_interval = (0.5, 1.5) + def _init(self): + self.cookies.set( + "LANG", "en", domain="poipiku.com") + self.cookies.set( + "POIPIKU_CONTENTS_VIEW_MODE", "1", domain="poipiku.com") + def items(self): password = self.config("password", "") @@ -59,7 +65,7 @@ class PoipikuExtractor(Extractor): "//img.", "//img-org.", 1) yield Message.Url, url, text.nameext_from_url(url, post) - if not extr(' show all(+', '<'): + if not extr('ShowAppendFile', '<'): continue url = self.root + "/f/ShowAppendFileF.jsp" diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py index 3569860..115de9a 100644 --- a/gallery_dl/extractor/readcomiconline.py +++ b/gallery_dl/extractor/readcomiconline.py @@ -35,10 +35,7 @@ class ReadcomiconlineBase(): self.log.warning( "Redirect to \n%s\nVisit this URL in your browser, solve " "the CAPTCHA, and press ENTER to continue", response.url) - try: - input() - except (EOFError, OSError): - pass + self.input() else: raise exception.StopExtraction( "Redirect to \n%s\nVisit this URL in your browser and " diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index e099c7e..ce602f6 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -74,8 +74,8 @@ class RedditExtractor(Extractor): yield Message.Url, url, submission elif "gallery_data" in media: - for submission["num"], url in enumerate( - self._extract_gallery(media), 1): + for url in self._extract_gallery(media): + submission["num"] += 1 text.nameext_from_url(url, submission) yield Message.Url, url, submission @@ -99,7 +99,10 @@ class RedditExtractor(Extractor): urls.append((url, submission)) for comment in comments: html = comment["body_html"] or "" - if ' href="' in html: + href = (' href="' in html) + media = ("media_metadata" in comment) + + if media or href: comment["date"] = text.parse_timestamp( comment["created_utc"]) if submission: @@ -107,6 +110,14 @@ class RedditExtractor(Extractor): data["comment"] = comment else: data = comment + + if media: + for embed in self._extract_embed(comment): + submission["num"] += 1 + text.nameext_from_url(embed, submission) + yield Message.Url, embed, submission + + if href: for url in text.extract_iter(html, ' href="', '"'): urls.append((url, data)) @@ -118,6 +129,7 @@ class RedditExtractor(Extractor): if url.startswith(( "https://www.reddit.com/message/compose", "https://reddit.com/message/compose", + "https://preview.redd.it/", )): continue @@ -172,6 +184,27 @@ class RedditExtractor(Extractor): submission["id"], item["media_id"]) self.log.debug(src) + def _extract_embed(self, submission): + meta = submission["media_metadata"] + if not meta: + return + + for mid, data in meta.items(): + if data["status"] != "valid" or "s" not in data: + self.log.warning( + "embed %s: skipping item %s (status: %s)", + submission["id"], mid, data.get("status")) + continue + src = data["s"] + url = src.get("u") or src.get("gif") or src.get("mp4") + if url: + yield url.partition("?")[0].replace("/preview.", "/i.", 1) + else: + self.log.error( + "embed %s: unable to fetch download URL for item %s", + submission["id"], mid) + self.log.debug(src) + def _extract_video_ytdl(self, submission): return "https://www.reddit.com" + submission["permalink"] @@ -454,14 +487,14 @@ class RedditAPI(): remaining = response.headers.get("x-ratelimit-remaining") if remaining and float(remaining) < 2: - if self._warn_429: - self._warn_429 = False + self.log.warning("API rate limit exceeded") + if self._warn_429 and 
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index e099c7e..ce602f6 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -74,8 +74,8 @@ class RedditExtractor(Extractor):
                         yield Message.Url, url, submission

                 elif "gallery_data" in media:
-                    for submission["num"], url in enumerate(
-                            self._extract_gallery(media), 1):
+                    for url in self._extract_gallery(media):
+                        submission["num"] += 1
                         text.nameext_from_url(url, submission)
                         yield Message.Url, url, submission

@@ -99,7 +99,10 @@ class RedditExtractor(Extractor):
                     urls.append((url, submission))
             for comment in comments:
                 html = comment["body_html"] or ""
-                if ' href="' in html:
+                href = (' href="' in html)
+                media = ("media_metadata" in comment)
+
+                if media or href:
                     comment["date"] = text.parse_timestamp(
                         comment["created_utc"])
                     if submission:
@@ -107,6 +110,14 @@ class RedditExtractor(Extractor):
                         data["comment"] = comment
                     else:
                         data = comment
+
+                if media:
+                    for embed in self._extract_embed(comment):
+                        submission["num"] += 1
+                        text.nameext_from_url(embed, submission)
+                        yield Message.Url, embed, submission
+
+                if href:
                     for url in text.extract_iter(html, ' href="', '"'):
                         urls.append((url, data))

@@ -118,6 +129,7 @@ class RedditExtractor(Extractor):
                 if url.startswith((
                     "https://www.reddit.com/message/compose",
                     "https://reddit.com/message/compose",
+                    "https://preview.redd.it/",
                 )):
                     continue

@@ -172,6 +184,27 @@ class RedditExtractor(Extractor):
                               submission["id"], item["media_id"])
                 self.log.debug(src)

+    def _extract_embed(self, submission):
+        meta = submission["media_metadata"]
+        if not meta:
+            return
+
+        for mid, data in meta.items():
+            if data["status"] != "valid" or "s" not in data:
+                self.log.warning(
+                    "embed %s: skipping item %s (status: %s)",
+                    submission["id"], mid, data.get("status"))
+                continue
+            src = data["s"]
+            url = src.get("u") or src.get("gif") or src.get("mp4")
+            if url:
+                yield url.partition("?")[0].replace("/preview.", "/i.", 1)
+            else:
+                self.log.error(
+                    "embed %s: unable to fetch download URL for item %s",
+                    submission["id"], mid)
+                self.log.debug(src)
+
     def _extract_video_ytdl(self, submission):
         return "https://www.reddit.com" + submission["permalink"]

@@ -454,14 +487,14 @@ class RedditAPI():

             remaining = response.headers.get("x-ratelimit-remaining")
             if remaining and float(remaining) < 2:
-                if self._warn_429:
-                    self._warn_429 = False
+                self.log.warning("API rate limit exceeded")
+                if self._warn_429 and self.client_id == self.CLIENT_ID:
                     self.log.info(
                         "Register your own OAuth application and use its "
                         "credentials to prevent this error: "
-                        "https://github.com/mikf/gallery-dl/blob/master"
-                        "/docs/configuration.rst"
-                        "#extractorredditclient-id--user-agent")
+                        "https://gdl-org.github.io/docs/configuration.html"
+                        "#extractor-reddit-client-id-user-agent")
+                self._warn_429 = False
                 self.extractor.wait(
                     seconds=response.headers["x-ratelimit-reset"])
                 continue
diff --git a/gallery_dl/extractor/seiga.py b/gallery_dl/extractor/seiga.py
index edfe1dc..23ba340 100644
--- a/gallery_dl/extractor/seiga.py
+++ b/gallery_dl/extractor/seiga.py
@@ -10,6 +10,7 @@

 from .common import Extractor, Message
 from .. import text, util, exception
+from ..cache import cache


 class SeigaExtractor(Extractor):
@@ -17,6 +18,7 @@ class SeigaExtractor(Extractor):
     category = "seiga"
     archive_fmt = "{image_id}"
     cookies_domain = ".nicovideo.jp"
+    cookies_names = ("user_session",)
     root = "https://seiga.nicovideo.jp"

     def __init__(self, match):
@@ -24,8 +26,7 @@ class SeigaExtractor(Extractor):
         self.start_image = 0

     def items(self):
-        if not self.cookies_check(("user_session",)):
-            raise exception.StopExtraction("'user_session' cookie required")
+        self.login()

         images = iter(self.get_images())
         data = next(images)
@@ -50,6 +51,59 @@ class SeigaExtractor(Extractor):
                 "HTTP redirect to login page (%s)", location.partition("?")[0])
         return location.replace("/o/", "/priv/", 1)

+    def login(self):
+        if self.cookies_check(self.cookies_names):
+            return
+
+        username, password = self._get_auth_info()
+        if username:
+            return self.cookies_update(self._login_impl(username, password))
+
+        raise exception.AuthorizationError(
+            "username & password or 'user_session' cookie required")
+
+    @cache(maxage=365*86400, keyarg=1)
+    def _login_impl(self, username, password):
+        self.log.info("Logging in as %s", username)
+
+        root = "https://account.nicovideo.jp"
+        response = self.request(root + "/login?site=seiga")
+        page = response.text
+
+        data = {
+            "mail_tel": username,
+            "password": password,
+        }
+        url = root + text.unescape(text.extr(page, '<form action="', '"'))
+        response = self.request(url, method="POST", data=data)
+
+        if "message=cant_login" in response.url:
+            raise exception.AuthenticationError()
+
+        if "/mfa" in response.url:
+            page = response.text
+            email = text.extr(page, 'class="userAccount">', "<")
+            code = self.input("Email Confirmation Code ({}): ".format(email))
+
+            data = {
+                "otp": code,
+                "loginBtn": "Login",
+                "device_name": "gdl",
+            }
+            url = root + text.unescape(text.extr(page, '<form action="', '"'))
+            response = self.request(url, method="POST", data=data)
+
+            if not response.history and \
+                    b"Confirmation code is incorrect" in response.content:
+                raise exception.AuthenticationError(
+                    "Incorrect Confirmation Code")
+
+        return {
+            cookie.name: cookie.value
+            for cookie in self.cookies
+            if cookie.expires and cookie.domain == self.cookies_domain
+        }
+

 class SeigaUserExtractor(SeigaExtractor):
     """Extractor for images of a user from seiga.nicovideo.jp"""
+ begin = "{}/{}/95/{}-".format( + slides["host"], + slides["imageLocation"], + slides["title"], ) - end = "-1024.jpg?" + parts[-1].rpartition("?")[2] + end = "-1024.jpg" return [ (begin + str(n) + end, None) diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py index d4adfed..0abb3ab 100644 --- a/gallery_dl/extractor/subscribestar.py +++ b/gallery_dl/extractor/subscribestar.py @@ -43,6 +43,8 @@ class SubscribestarExtractor(Extractor): item.update(data) item["num"] = num text.nameext_from_url(item.get("name") or item["url"], item) + if item["url"][0] == "/": + item["url"] = self.root + item["url"] yield Message.Url, item["url"], item def posts(self): diff --git a/gallery_dl/extractor/tapas.py b/gallery_dl/extractor/tapas.py index 0a9df20..167953d 100644 --- a/gallery_dl/extractor/tapas.py +++ b/gallery_dl/extractor/tapas.py @@ -151,3 +151,18 @@ class TapasEpisodeExtractor(TapasExtractor): def episode_ids(self): return (self.episode_id,) + + +class TapasCreatorExtractor(TapasExtractor): + subcategory = "creator" + pattern = BASE_PATTERN + r"/(?!series|episode)([^/?#]+)" + example = "https://tapas.io/CREATOR" + + def items(self): + url = "{}/{}/series".format(self.root, self.groups[0]) + page = self.request(url).text + page = text.extr(page, '<ul class="content-list-wrap', "</ul>") + + data = {"_extractor": TapasSeriesExtractor} + for path in text.extract_iter(page, ' href="', '"'): + yield Message.Queue, self.root + path, data diff --git a/gallery_dl/extractor/tcbscans.py b/gallery_dl/extractor/tcbscans.py index a3ef26c..de6f3ee 100644 --- a/gallery_dl/extractor/tcbscans.py +++ b/gallery_dl/extractor/tcbscans.py @@ -30,7 +30,7 @@ class TcbscansChapterExtractor(ChapterExtractor): page, 'font-bold mt-8">', "</h1>").rpartition(" - Chapter ") chapter, sep, minor = chapter.partition(".") return { - "manga": text.unescape(manga), + "manga": text.unescape(manga).strip(), "chapter": text.parse_int(chapter), "chapter_minor": sep + minor, "lang": "en", "language": "English", diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index fee0145..c34910f 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -447,9 +447,9 @@ class TumblrAPI(oauth.OAuth1API): if api_key == self.API_KEY: self.log.info( "Register your own OAuth application and use its " - "credentials to prevent this error: https://githu" - "b.com/mikf/gallery-dl/blob/master/docs/configurat" - "ion.rst#extractortumblrapi-key--api-secret") + "credentials to prevent this error: " + "https://gdl-org.github.io/docs/configuration.html" + "#extractor-tumblr-api-key-api-secret") if self.extractor.config("ratelimit") == "wait": self.extractor.wait(seconds=reset) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index a5bd984..ff77828 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -6,17 +6,18 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://twitter.com/""" +"""Extractors for https://x.com/""" from .common import Extractor, Message from .. import text, util, exception from ..cache import cache, memcache import itertools +import random import json import re BASE_PATTERN = (r"(?:https?://)?(?:www\.|mobile\.)?" 
-                r"(?:(?:[fv]x)?twitter|(?:fixup)?x)\.com")
+                r"(?:(?:[fv]x)?twitter|(?:fix(?:up|v))?x)\.com")
 
 
 class TwitterExtractor(Extractor):
@@ -25,9 +26,9 @@ class TwitterExtractor(Extractor):
     directory_fmt = ("{category}", "{user[name]}")
     filename_fmt = "{tweet_id}_{num}.{extension}"
     archive_fmt = "{tweet_id}_{retweet_id}_{num}"
-    cookies_domain = ".twitter.com"
+    cookies_domain = ".x.com"
     cookies_names = ("auth_token",)
-    root = "https://twitter.com"
+    root = "https://x.com"
     browser = "firefox"
 
     def __init__(self, match):
@@ -243,8 +244,8 @@ class TwitterExtractor(Extractor):
 
         # collect URLs from entities
         for url in tweet["entities"].get("urls") or ():
-            url = url["expanded_url"]
-            if "//twitpic.com/" not in url or "/photos/" in url:
+            url = url.get("expanded_url") or url.get("url") or ""
+            if not url or "//twitpic.com/" not in url or "/photos/" in url:
                 continue
             if url.startswith("http:"):
                 url = "https" + url[4:]
@@ -336,12 +337,20 @@ class TwitterExtractor(Extractor):
         urls = entities.get("urls")
         if urls:
             for url in urls:
-                content = content.replace(url["url"], url["expanded_url"])
+                try:
+                    content = content.replace(url["url"], url["expanded_url"])
+                except KeyError:
+                    pass
         txt, _, tco = content.rpartition(" ")
         tdata["content"] = txt if tco.startswith("https://t.co/") else content
 
         if "birdwatch_pivot" in tweet:
-            tdata["birdwatch"] = tweet["birdwatch_pivot"]["subtitle"]["text"]
+            try:
+                tdata["birdwatch"] = \
+                    tweet["birdwatch_pivot"]["subtitle"]["text"]
+            except KeyError:
+                self.log.debug("Unable to extract 'birdwatch' note from %s",
+                               tweet["birdwatch_pivot"])
         if "in_reply_to_screen_name" in legacy:
             tdata["reply_to"] = legacy["in_reply_to_screen_name"]
         if "quoted_by" in legacy:
@@ -398,7 +407,10 @@ class TwitterExtractor(Extractor):
             urls = entities["description"].get("urls")
             if urls:
                 for url in urls:
-                    descr = descr.replace(url["url"], url["expanded_url"])
+                    try:
+                        descr = descr.replace(url["url"], url["expanded_url"])
+                    except KeyError:
+                        pass
             udata["description"] = descr
 
         if "url" in entities:
@@ -483,7 +495,13 @@ class TwitterExtractor(Extractor):
 
         username, password = self._get_auth_info()
         if username:
-            self.cookies_update(_login_impl(self, username, password))
+            return self.cookies_update(_login_impl(self, username, password))
+
+        for cookie in self.cookies:
+            if cookie.domain == ".twitter.com":
+                self.cookies.set(
+                    cookie.name, cookie.value, domain=self.cookies_domain,
+                    expires=cookie.expires, secure=cookie.secure)
 
 
 class TwitterUserExtractor(TwitterExtractor):
@@ -491,7 +509,7 @@ class TwitterUserExtractor(TwitterExtractor):
     subcategory = "user"
     pattern = (BASE_PATTERN + r"/(?!search)(?:([^/?#]+)/?(?:$|[?#])"
               r"|i(?:/user/|ntent/user\?user_id=)(\d+))")
-    example = "https://twitter.com/USER"
+    example = "https://x.com/USER"
 
     def __init__(self, match):
         TwitterExtractor.__init__(self, match)
@@ -519,7 +537,7 @@ class TwitterTimelineExtractor(TwitterExtractor):
     """Extractor for a Twitter user timeline"""
     subcategory = "timeline"
     pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/timeline(?!\w)"
-    example = "https://twitter.com/USER/timeline"
+    example = "https://x.com/USER/timeline"
 
     def tweets(self):
         # yield initial batch of (media) tweets
@@ -566,7 +584,7 @@ class TwitterTweetsExtractor(TwitterExtractor):
     """Extractor for Tweets from a user's Tweets timeline"""
     subcategory = "tweets"
     pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/tweets(?!\w)"
-    example = "https://twitter.com/USER/tweets"
+    example = "https://x.com/USER/tweets"
 
     def tweets(self):
         return self.api.user_tweets(self.user)
@@ -576,7 +594,7 @@ class TwitterRepliesExtractor(TwitterExtractor):
     """Extractor for Tweets from a user's timeline including replies"""
     subcategory = "replies"
     pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/with_replies(?!\w)"
-    example = "https://twitter.com/USER/with_replies"
+    example = "https://x.com/USER/with_replies"
 
     def tweets(self):
         return self.api.user_tweets_and_replies(self.user)
@@ -586,7 +604,7 @@ class TwitterMediaExtractor(TwitterExtractor):
     """Extractor for Tweets from a user's Media timeline"""
     subcategory = "media"
     pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/media(?!\w)"
-    example = "https://twitter.com/USER/media"
+    example = "https://x.com/USER/media"
 
     def tweets(self):
         return self.api.user_media(self.user)
@@ -596,7 +614,7 @@ class TwitterLikesExtractor(TwitterExtractor):
     """Extractor for liked tweets"""
     subcategory = "likes"
     pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/likes(?!\w)"
-    example = "https://twitter.com/USER/likes"
+    example = "https://x.com/USER/likes"
 
     def metadata(self):
         return {"user_likes": self.user}
@@ -609,7 +627,7 @@ class TwitterBookmarkExtractor(TwitterExtractor):
     """Extractor for bookmarked tweets"""
     subcategory = "bookmark"
     pattern = BASE_PATTERN + r"/i/bookmarks()"
-    example = "https://twitter.com/i/bookmarks"
+    example = "https://x.com/i/bookmarks"
 
     def tweets(self):
         return self.api.user_bookmarks()
@@ -625,7 +643,7 @@ class TwitterListExtractor(TwitterExtractor):
     """Extractor for Twitter lists"""
     subcategory = "list"
     pattern = BASE_PATTERN + r"/i/lists/(\d+)/?$"
-    example = "https://twitter.com/i/lists/12345"
+    example = "https://x.com/i/lists/12345"
 
     def tweets(self):
         return self.api.list_latest_tweets_timeline(self.user)
@@ -635,7 +653,7 @@ class TwitterListMembersExtractor(TwitterExtractor):
     """Extractor for members of a Twitter list"""
     subcategory = "list-members"
     pattern = BASE_PATTERN + r"/i/lists/(\d+)/members"
-    example = "https://twitter.com/i/lists/12345/members"
+    example = "https://x.com/i/lists/12345/members"
 
     def items(self):
         self.login()
@@ -646,7 +664,7 @@ class TwitterFollowingExtractor(TwitterExtractor):
     """Extractor for followed users"""
     subcategory = "following"
     pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/following(?!\w)"
-    example = "https://twitter.com/USER/following"
+    example = "https://x.com/USER/following"
 
     def items(self):
         self.login()
@@ -657,7 +675,7 @@ class TwitterSearchExtractor(TwitterExtractor):
     """Extractor for Twitter search results"""
     subcategory = "search"
     pattern = BASE_PATTERN + r"/search/?\?(?:[^&#]+&)*q=([^&#]+)"
-    example = "https://twitter.com/search?q=QUERY"
+    example = "https://x.com/search?q=QUERY"
 
     def metadata(self):
         return {"search": text.unquote(self.user)}
@@ -688,7 +706,7 @@ class TwitterHashtagExtractor(TwitterExtractor):
     """Extractor for Twitter hashtags"""
     subcategory = "hashtag"
     pattern = BASE_PATTERN + r"/hashtag/([^/?#]+)"
-    example = "https://twitter.com/hashtag/NAME"
+    example = "https://x.com/hashtag/NAME"
 
     def items(self):
         url = "{}/search?q=%23{}".format(self.root, self.user)
@@ -700,7 +718,7 @@ class TwitterCommunityExtractor(TwitterExtractor):
     """Extractor for a Twitter community"""
     subcategory = "community"
     pattern = BASE_PATTERN + r"/i/communities/(\d+)"
-    example = "https://twitter.com/i/communities/12345"
+    example = "https://x.com/i/communities/12345"
 
     def tweets(self):
         if self.textonly:
@@ -712,7 +730,7 @@ class TwitterCommunitiesExtractor(TwitterExtractor):
     """Extractor for followed Twitter communities"""
     subcategory = "communities"
     pattern = BASE_PATTERN + r"/([^/?#]+)/communities/?$"
-    example = "https://twitter.com/i/communities"
+    example = "https://x.com/i/communities"
 
     def tweets(self):
         return self.api.communities_main_page_timeline(self.user)
@@ -724,7 +742,7 @@ class TwitterEventExtractor(TwitterExtractor):
     directory_fmt = ("{category}", "Events",
                      "{event[id]} {event[short_title]}")
     pattern = BASE_PATTERN + r"/i/events/(\d+)"
-    example = "https://twitter.com/i/events/12345"
+    example = "https://x.com/i/events/12345"
 
     def metadata(self):
         return {"event": self.api.live_event(self.user)}
@@ -736,8 +754,9 @@ class TwitterTweetExtractor(TwitterExtractor):
     """Extractor for individual tweets"""
     subcategory = "tweet"
-    pattern = BASE_PATTERN + r"/([^/?#]+|i/web)/status/(\d+)/?$"
-    example = "https://twitter.com/USER/status/12345"
+    pattern = (BASE_PATTERN + r"/([^/?#]+|i/web)/status/(\d+)"
+               r"/?(?:$|\?|#|photo/|video/)")
+    example = "https://x.com/USER/status/12345"
 
     def __init__(self, match):
         TwitterExtractor.__init__(self, match)
@@ -817,7 +836,7 @@ class TwitterQuotesExtractor(TwitterExtractor):
     """Extractor for quotes of a Tweet"""
     subcategory = "quotes"
     pattern = BASE_PATTERN + r"/(?:[^/?#]+|i/web)/status/(\d+)/quotes"
-    example = "https://twitter.com/USER/status/12345/quotes"
+    example = "https://x.com/USER/status/12345/quotes"
 
     def items(self):
         url = "{}/search?q=quoted_tweet_id:{}".format(self.root, self.user)
@@ -830,7 +849,7 @@ class TwitterAvatarExtractor(TwitterExtractor):
     filename_fmt = "avatar {date}.{extension}"
     archive_fmt = "AV_{user[id]}_{date}"
     pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/photo"
-    example = "https://twitter.com/USER/photo"
+    example = "https://x.com/USER/photo"
 
     def tweets(self):
         self.api._user_id_by_screen_name(self.user)
@@ -852,7 +871,7 @@ class TwitterBackgroundExtractor(TwitterExtractor):
     filename_fmt = "background {date}.{extension}"
     archive_fmt = "BG_{user[id]}_{date}"
     pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/header_photo"
-    example = "https://twitter.com/USER/header_photo"
+    example = "https://x.com/USER/header_photo"
 
     def tweets(self):
         self.api._user_id_by_screen_name(self.user)
@@ -899,7 +918,7 @@ class TwitterAPI():
         self.extractor = extractor
         self.log = extractor.log
 
-        self.root = "https://twitter.com/i/api"
+        self.root = "https://x.com/i/api"
         self._nsfw_warning = True
         self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode
 
@@ -919,7 +938,7 @@ class TwitterAPI():
         self.headers = {
             "Accept": "*/*",
-            "Referer": "https://twitter.com/",
+            "Referer": extractor.root + "/",
             "content-type": "application/json",
             "x-guest-token": None,
             "x-twitter-auth-type": "OAuth2Session" if auth_token else None,
@@ -1262,7 +1281,7 @@ class TwitterAPI():
         endpoint = "/1.1/guest/activate.json"
         self.log.info("Requesting guest token")
         return str(self._call(
-            endpoint, None, "POST", False, "https://api.twitter.com",
+            endpoint, None, "POST", False, "https://api.x.com",
         )["guest_token"])
 
     def _authenticate_guest(self):
@@ -1288,63 +1307,72 @@ class TwitterAPI():
             if csrf_token:
                 self.headers["x-csrf-token"] = csrf_token
 
-            if response.status_code < 400:
+            remaining = int(response.headers.get("x-rate-limit-remaining", 6))
+            if remaining < 6 and remaining <= random.randrange(1, 6):
+                self._handle_ratelimit(response)
+                continue
+
+            try:
                 data = response.json()
+            except ValueError:
+                data = {"errors": ({"message": response.text},)}
+
+            errors = data.get("errors")
+            if not errors:
+                return data
+
+            retry = False
+            for error in errors:
+                msg = error.get("message") or "Unspecified"
+                self.log.debug("API error: '%s'", msg)
+
+                if "this account is temporarily locked" in msg:
+                    msg = "Account temporarily locked"
+                    if self.extractor.config("locked") != "wait":
+                        raise exception.AuthorizationError(msg)
+                    self.log.warning(msg)
+                    self.extractor.input("Press ENTER to retry.")
+                    retry = True
+
+                elif "Could not authenticate you" in msg:
+                    if not self.extractor.config("relogin", True):
+                        continue
 
-                errors = data.get("errors")
-                if not errors:
-                    return data
+                    username, password = self.extractor._get_auth_info()
+                    if not username:
+                        continue
 
-                retry = False
-                for error in errors:
-                    msg = error.get("message") or "Unspecified"
-                    self.log.debug("API error: '%s'", msg)
+                    _login_impl.invalidate(username)
+                    self.extractor.cookies_update(
+                        _login_impl(self.extractor, username, password))
+                    self.__init__(self.extractor)
+                    retry = True
 
-                    if "this account is temporarily locked" in msg:
-                        msg = "Account temporarily locked"
-                        if self.extractor.config("locked") != "wait":
-                            raise exception.AuthorizationError(msg)
-                        self.log.warning("%s. Press ENTER to retry.", msg)
-                        try:
-                            input()
-                        except (EOFError, OSError):
-                            pass
-                        retry = True
-
-                    elif msg.lower().startswith("timeout"):
-                        retry = True
+                elif msg.lower().startswith("timeout"):
+                    retry = True
 
-                if not retry:
-                    return data
-                elif self.headers["x-twitter-auth-type"]:
+            if retry:
+                if self.headers["x-twitter-auth-type"]:
                     self.log.debug("Retrying API request")
                     continue
+                else:
+                    # fall through to "Login Required"
+                    response.status_code = 404
 
-                # fall through to "Login Required"
-                response.status_code = 404
-
-            if response.status_code == 429:
-                # rate limit exceeded
-                if self.extractor.config("ratelimit") == "abort":
-                    raise exception.StopExtraction("Rate limit exceeded")
-
-                until = response.headers.get("x-rate-limit-reset")
-                seconds = None if until else 60
-                self.extractor.wait(until=until, seconds=seconds)
-                continue
-
-            if response.status_code in (403, 404) and \
+            if response.status_code < 400:
+                return data
+            elif response.status_code in (403, 404) and \
                     not self.headers["x-twitter-auth-type"]:
                 raise exception.AuthorizationError("Login required")
+            elif response.status_code == 429:
+                self._handle_ratelimit(response)
+                continue
 
             # error
             try:
-                data = response.json()
-                errors = ", ".join(e["message"] for e in data["errors"])
-            except ValueError:
-                errors = response.text
+                errors = ", ".join(e["message"] for e in errors)
             except Exception:
-                errors = data.get("errors", "")
+                pass
 
             raise exception.StopExtraction(
                 "%s %s (%s)", response.status_code, response.reason, errors)
@@ -1680,6 +1708,13 @@ class TwitterAPI():
                 return
             variables["cursor"] = cursor
 
+    def _handle_ratelimit(self, response):
+        if self.extractor.config("ratelimit") == "abort":
+            raise exception.StopExtraction("Rate limit exceeded")
+
+        until = response.headers.get("x-rate-limit-reset")
+        self.extractor.wait(until=until, seconds=None if until else 60)
+
     def _process_tombstone(self, entry, tombstone):
         text = (tombstone.get("richText") or tombstone["text"])["text"]
         tweet_id = entry["entryId"].rpartition("-")[2]
@@ -1695,22 +1730,22 @@
 
 @cache(maxage=365*86400, keyarg=1)
 def _login_impl(extr, username, password):
-    import re
-    import random
+    def process(data, params=None):
+        response = extr.request(
+            url, params=params, headers=headers, json=data,
+            method="POST", fatal=None)
 
-    if re.fullmatch(r"[\w.%+-]+@[\w.-]+\.\w{2,}", username):
-        extr.log.warning(
-            "Login with email is no longer possible. "
-            "You need to provide your username or phone number instead.")
-
-    def process(response):
         try:
             data = response.json()
         except ValueError:
             data = {"errors": ({"message": "Invalid response"},)}
         else:
             if response.status_code < 400:
-                return data["flow_token"]
+                try:
+                    return (data["flow_token"],
+                            data["subtasks"][0]["subtask_id"])
+                except LookupError:
+                    pass
 
         errors = []
         for error in data.get("errors") or ():
@@ -1719,9 +1754,13 @@ def _login_impl(extr, username, password):
         extr.log.debug(response.text)
         raise exception.AuthenticationError(", ".join(errors))
 
-    extr.cookies.clear()
+    cookies = extr.cookies
+    cookies.clear()
     api = TwitterAPI(extr)
     api._authenticate_guest()
+
+    url = "https://api.x.com/1.1/onboarding/task.json"
+    params = {"flow_name": "login"}
     headers = api.headers
 
     extr.log.info("Logging in as %s", username)
@@ -1778,31 +1817,18 @@ def _login_impl(extr, username, password):
             "web_modal": 1,
         },
     }
 
-    url = "https://api.twitter.com/1.1/onboarding/task.json?flow_name=login"
-    response = extr.request(url, method="POST", headers=headers, json=data)
-
-    data = {
-        "flow_token": process(response),
-        "subtask_inputs": [
-            {
-                "subtask_id": "LoginJsInstrumentationSubtask",
+    flow_token, subtask = process(data, params)
+    while not cookies.get("auth_token"):
+        if subtask == "LoginJsInstrumentationSubtask":
+            data = {
                 "js_instrumentation": {
                     "response": "{}",
                     "link": "next_link",
                 },
-            },
-        ],
-    }
-    url = "https://api.twitter.com/1.1/onboarding/task.json"
-    response = extr.request(
-        url, method="POST", headers=headers, json=data, fatal=None)
-
-    # username
-    data = {
-        "flow_token": process(response),
-        "subtask_inputs": [
-            {
-                "subtask_id": "LoginEnterUserIdentifierSSO",
+            }
+        elif subtask == "LoginEnterUserIdentifierSSO":
+            data = {
                 "settings_list": {
                     "setting_responses": [
                         {
@@ -1814,48 +1840,61 @@ def _login_impl(extr, username, password):
                     ],
                    "link": "next_link",
                 },
-            },
-        ],
-    }
-    # url = "https://api.twitter.com/1.1/onboarding/task.json"
-    extr.sleep(random.uniform(2.0, 4.0), "login (username)")
-    response = extr.request(
-        url, method="POST", headers=headers, json=data, fatal=None)
-
-    # password
-    data = {
-        "flow_token": process(response),
-        "subtask_inputs": [
-            {
-                "subtask_id": "LoginEnterPassword",
+            }
+        elif subtask == "LoginEnterPassword":
+            data = {
                 "enter_password": {
                     "password": password,
                     "link": "next_link",
                 },
-            },
-        ],
-    }
-    # url = "https://api.twitter.com/1.1/onboarding/task.json"
-    extr.sleep(random.uniform(2.0, 4.0), "login (password)")
-    response = extr.request(
-        url, method="POST", headers=headers, json=data, fatal=None)
-
-    # account duplication check ?
-    data = {
-        "flow_token": process(response),
-        "subtask_inputs": [
-            {
-                "subtask_id": "AccountDuplicationCheck",
+            }
+        elif subtask == "LoginEnterAlternateIdentifierSubtask":
+            alt = extr.input(
+                "Alternate Identifier (username, email, phone number): ")
+            data = {
+                "enter_text": {
+                    "text": alt,
+                    "link": "next_link",
+                },
+            }
+        elif subtask == "LoginTwoFactorAuthChallenge":
+            data = {
+                "enter_text": {
+                    "text": extr.input("2FA Token: "),
+                    "link": "next_link",
+                },
+            }
+        elif subtask == "LoginAcid":
+            data = {
+                "enter_text": {
+                    "text": extr.input("Email Verification Code: "),
+                    "link": "next_link",
+                },
+            }
+        elif subtask == "AccountDuplicationCheck":
+            data = {
                 "check_logged_in_account": {
                     "link": "AccountDuplicationCheck_false",
                 },
-            },
-        ],
-    }
-    # url = "https://api.twitter.com/1.1/onboarding/task.json"
-    response = extr.request(
-        url, method="POST", headers=headers, json=data, fatal=None)
-    process(response)
+            }
+        elif subtask == "ArkoseLogin":
+            raise exception.AuthenticationError("Login requires CAPTCHA")
+        elif subtask == "DenyLoginSubtask":
+            raise exception.AuthenticationError("Login rejected as suspicious")
+        elif subtask == "LoginSuccessSubtask":
+            raise exception.AuthenticationError("No auth token cookie")
+        else:
+            raise exception.StopExtraction("Unrecognized subtask %s", subtask)
+
+        inputs = {"subtask_id": subtask}
+        inputs.update(data)
+        data = {
+            "flow_token": flow_token,
+            "subtask_inputs": [inputs],
+        }
+
+        extr.sleep(random.uniform(1.0, 3.0), "login ({})".format(subtask))
+        flow_token, subtask = process(data)
 
     return {
         cookie.name: cookie.value
diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py
index 41141c6..c112f4a 100644
--- a/gallery_dl/extractor/vsco.py
+++ b/gallery_dl/extractor/vsco.py
@@ -46,6 +46,8 @@ class VscoExtractor(Extractor):
                 url = "https://image-{}.vsco.co/{}".format(cdn, path)
             elif cdn.isdecimal():
                 url = "https://image.vsco.co/" + base
+            elif img["responsive_url"].startswith("http"):
+                url = img["responsive_url"]
             else:
                 url = "https://" + img["responsive_url"]
 
@@ -238,6 +240,34 @@ class VscoSpacesExtractor(VscoExtractor):
             yield Message.Queue, url, space
 
 
+class VscoAvatarExtractor(VscoExtractor):
+    """Extractor for vsco.co user avatars"""
+    subcategory = "avatar"
+    pattern = USER_PATTERN + r"/avatar"
+    example = "https://vsco.co/USER/avatar"
+
+    def images(self):
+        url = "{}/{}/gallery".format(self.root, self.user)
+        page = self.request(url).text
+        piid = text.extr(page, '"profileImageId":"', '"')
+
+        url = "https://im.vsco.co/" + piid
+        # needs GET request, since HEAD does not redirect to full URL
+        response = self.request(url, allow_redirects=False)
+
+        return ({
+            "_id"           : piid,
+            "is_video"      : False,
+            "grid_name"     : "",
+            "upload_date"   : 0,
+            "responsive_url": response.headers["Location"],
+            "video_url"     : "",
+            "image_meta"    : None,
+            "width"         : 0,
+            "height"        : 0,
+        },)
+
+
 class VscoImageExtractor(VscoExtractor):
     """Extractor for individual images on vsco.co"""
     subcategory = "image"
diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py
index ac00682..9370cfb 100644
--- a/gallery_dl/extractor/wikimedia.py
+++ b/gallery_dl/extractor/wikimedia.py
@@ -27,9 +27,9 @@ class WikimediaExtractor(BaseExtractor):
 
         if self.category == "wikimedia":
             self.category = self.root.split(".")[-2]
-        elif self.category == "fandom":
-            self.category = \
-                "fandom-" + self.root.partition(".")[0].rpartition("/")[2]
+        elif self.category in ("fandom", "wikigg"):
+            self.category = "{}-{}".format(
+                self.category, self.root.partition(".")[0].rpartition("/")[2])
 
         if path.startswith("wiki/"):
             path = path[5:]
@@ -69,14 +69,18 @@ class WikimediaExtractor(BaseExtractor):
 
     def items(self):
         for info in self._pagination(self.params):
-            image = info["imageinfo"][0]
+            try:
+                image = info["imageinfo"][0]
+            except LookupError:
+                self.log.debug("Missing 'imageinfo' for %s", info)
+                continue
 
             image["metadata"] = {
                 m["name"]: m["value"]
-                for m in image["metadata"]}
+                for m in image["metadata"] or ()}
             image["commonmetadata"] = {
                 m["name"]: m["value"]
-                for m in image["commonmetadata"]}
+                for m in image["commonmetadata"] or ()}
 
             filename = image["canonicaltitle"]
             image["filename"], _, image["extension"] = \
@@ -148,6 +152,10 @@ BASE_PATTERN = WikimediaExtractor.update({
         "root": None,
         "pattern": r"[\w-]+\.fandom\.com",
     },
+    "wikigg": {
+        "root": None,
+        "pattern": r"\w+\.wiki\.gg",
+    },
     "mariowiki": {
         "root": "https://www.mariowiki.com",
         "pattern": r"(?:www\.)?mariowiki\.com",
diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py
index b83cf21..0b212d5 100644
--- a/gallery_dl/formatter.py
+++ b/gallery_dl/formatter.py
@@ -243,13 +243,12 @@ class TemplateFStringFormatter(FStringFormatter):
 
 
 def parse_field_name(field_name):
+    if field_name[0] == "'":
+        return "_lit", (operator.itemgetter(field_name[1:-1]),)
+
     first, rest = _string.formatter_field_name_split(field_name)
     funcs = []
 
-    if first[0] == "'":
-        funcs.append(operator.itemgetter(first[1:-1]))
-        first = "_lit"
-
     for is_attr, key in rest:
         if is_attr:
             func = operator.attrgetter
diff --git a/gallery_dl/job.py b/gallery_dl/job.py
index eb10a0c..4562b05 100644
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -11,10 +11,23 @@
 import errno
 import logging
 import functools
 import collections
-from . import extractor, downloader, postprocessor
-from . import config, text, util, path, formatter, output, exception, version
+
+from . import (
+    extractor,
+    downloader,
+    postprocessor,
+    archive,
+    config,
+    exception,
+    formatter,
+    output,
+    path,
+    text,
+    util,
+    version,
+)
 from .extractor.message import Message
-from .output import stdout_write
+stdout_write = output.stdout_write
 
 
 class Job():
@@ -29,8 +42,9 @@ class Job():
         self.extractor = extr
         self.pathfmt = None
-        self.kwdict = {}
         self.status = 0
+        self.kwdict = {}
+        self.kwdict_eval = False
 
         cfgpath = []
         if parent:
@@ -107,7 +121,16 @@ class Job():
         # user-supplied metadata
         kwdict = extr.config("keywords")
         if kwdict:
-            self.kwdict.update(kwdict)
+            if extr.config("keywords-eval"):
+                self.kwdict_eval = []
+                for key, value in kwdict.items():
+                    if isinstance(value, str):
+                        fmt = formatter.parse(value, None, util.identity)
+                        self.kwdict_eval.append((key, fmt.format_map))
+                    else:
+                        self.kwdict[key] = value
+            else:
+                self.kwdict.update(kwdict)
 
     def run(self):
         """Execute or run the job"""
@@ -202,6 +225,9 @@ class Job():
             kwdict.pop(self.metadata_http, None)
         if self.kwdict:
             kwdict.update(self.kwdict)
+        if self.kwdict_eval:
+            for key, valuegen in self.kwdict_eval:
+                kwdict[key] = valuegen(kwdict)
 
     def _init(self):
         self.extractor.initialize()
@@ -423,6 +449,8 @@ class DownloadJob(Job):
 
     def handle_finalize(self):
         if self.archive:
+            if not self.status:
+                self.archive.finalize()
             self.archive.close()
 
         pathfmt = self.pathfmt
@@ -453,9 +481,12 @@ class DownloadJob(Job):
             for callback in self.hooks["skip"]:
                 callback(pathfmt)
         if self._skipexc:
-            self._skipcnt += 1
-            if self._skipcnt >= self._skipmax:
-                raise self._skipexc()
+            if not self._skipftr or self._skipftr(pathfmt.kwdict):
+                self._skipcnt += 1
+                if self._skipcnt >= self._skipmax:
+                    raise self._skipexc()
+            else:
+                self._skipcnt = 0
 
     def download(self, url):
         """Download 'url'"""
@@ -507,23 +538,28 @@ class DownloadJob(Job):
             # monkey-patch method to do nothing and always return True
             self.download = pathfmt.fix_extension
 
-        archive = cfg("archive")
-        if archive:
-            archive = util.expand_path(archive)
+        archive_path = cfg("archive")
+        if archive_path:
+            archive_path = util.expand_path(archive_path)
             archive_format = (cfg("archive-prefix", extr.category) +
                               cfg("archive-format", extr.archive_fmt))
             archive_pragma = (cfg("archive-pragma"))
             try:
-                if "{" in archive:
-                    archive = formatter.parse(archive).format_map(kwdict)
-                self.archive = util.DownloadArchive(
-                    archive, archive_format, archive_pragma)
+                if "{" in archive_path:
+                    archive_path = formatter.parse(
+                        archive_path).format_map(kwdict)
+                if cfg("archive-mode") == "memory":
+                    archive_cls = archive.DownloadArchiveMemory
+                else:
+                    archive_cls = archive.DownloadArchive
+                self.archive = archive_cls(
+                    archive_path, archive_format, archive_pragma)
             except Exception as exc:
                 extr.log.warning(
                     "Failed to open download archive at '%s' (%s: %s)",
-                    archive, exc.__class__.__name__, exc)
+                    archive_path, exc.__class__.__name__, exc)
             else:
-                extr.log.debug("Using download archive '%s'", archive)
+                extr.log.debug("Using download archive '%s'", archive_path)
 
         skip = cfg("skip", True)
         if skip:
@@ -539,6 +575,12 @@ class DownloadJob(Job):
             elif skip == "exit":
                 self._skipexc = SystemExit
                 self._skipmax = text.parse_int(smax)
+
+            skip_filter = cfg("skip-filter")
+            if skip_filter:
+                self._skipftr = util.compile_expression(skip_filter)
+            else:
+                self._skipftr = None
         else:
             # monkey-patch methods to always return False
             pathfmt.exists = lambda x=None: False
diff --git a/gallery_dl/option.py b/gallery_dl/option.py
index 72a602f..12622d0 100644
--- a/gallery_dl/option.py
+++ b/gallery_dl/option.py
@@ -173,6 +173,28 @@ def build_parser():
         action="version", version=version.__version__,
         help="Print program version and exit",
     )
+    if util.EXECUTABLE:
+        general.add_argument(
+            "-U", "--update",
+            dest="update", action="store_const", const="latest",
+            help="Update to the latest version",
+        )
+        general.add_argument(
+            "--update-to",
+            dest="update", metavar="[CHANNEL@]TAG",
+            help="Upgrade/downgrade to a specific version",
+        )
+        general.add_argument(
+            "--update-check",
+            dest="update", action="store_const", const="check",
+            help="Check if a newer version is available",
+        )
+    else:
+        general.add_argument(
+            "-U", "--update-check",
+            dest="update", action="store_const", const="check",
+            help="Check if a newer version is available",
+        )
     general.add_argument(
         "-f", "--filename",
         dest="filename", metavar="FORMAT",
@@ -250,6 +272,12 @@ def build_parser():
         help="Activate quiet mode",
     )
     output.add_argument(
+        "-w", "--warning",
+        dest="loglevel",
+        action="store_const", const=logging.WARNING,
+        help="Print only warnings and errors",
+    )
+    output.add_argument(
         "-v", "--verbose",
         dest="loglevel",
         action="store_const", const=logging.DEBUG,
@@ -319,6 +347,11 @@ def build_parser():
         help=("Write downloaded intermediary pages to files "
               "in the current directory to debug problems"),
    )
+    output.add_argument(
+        "--no-colors",
+        dest="colors", action="store_false",
+        help=("Do not emit ANSI color codes in output"),
+    )
 
     downloader = parser.add_argument_group("Downloader Options")
     downloader.add_argument(
diff --git a/gallery_dl/output.py b/gallery_dl/output.py
index 2bcc222..3518545 100644
--- a/gallery_dl/output.py
+++ b/gallery_dl/output.py
@@ -16,11 +16,39 @@
 from . import config, util, formatter
 
 
 # --------------------------------------------------------------------
+# Globals
+
+COLORS = not os.environ.get("NO_COLOR")
+COLORS_DEFAULT = {
+    "success": "1;32",
+    "skip"   : "2",
+    "debug"  : "0;37",
+    "info"   : "1;37",
+    "warning": "1;33",
+    "error"  : "1;31",
+} if COLORS else {}
+
+if util.WINDOWS:
+    ANSI = COLORS and os.environ.get("TERM") == "ANSI"
+    OFFSET = 1
+    CHAR_SKIP = "# "
+    CHAR_SUCCESS = "* "
+    CHAR_ELLIPSIES = "..."
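+    # ASCII fallbacks, since default Windows consoles often cannot render ✔/…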
+else:
+    ANSI = COLORS
+    OFFSET = 0
+    CHAR_SKIP = "# "
+    CHAR_SUCCESS = "✔ "
+    CHAR_ELLIPSIES = "…"
+
+
+# --------------------------------------------------------------------
 # Logging
 
 LOG_FORMAT = "[{name}][{levelname}] {message}"
 LOG_FORMAT_DATE = "%Y-%m-%d %H:%M:%S"
 LOG_LEVEL = logging.INFO
+LOG_LEVELS = ("debug", "info", "warning", "error")
 
 
 class Logger(logging.Logger):
@@ -129,7 +157,7 @@ class Formatter(logging.Formatter):
 
     def __init__(self, fmt, datefmt):
         if isinstance(fmt, dict):
-            for key in ("debug", "info", "warning", "error"):
+            for key in LOG_LEVELS:
                 value = fmt[key] if key in fmt else LOG_FORMAT
                 fmt[key] = (formatter.parse(value).format_map,
                             "{asctime" in value)
@@ -187,16 +215,36 @@ def configure_logging(loglevel):
     # stream logging handler
     handler = root.handlers[0]
     opts = config.interpolate(("output",), "log")
+
+    colors = config.interpolate(("output",), "colors")
+    if colors is None:
+        colors = COLORS_DEFAULT
+    if colors and not opts:
+        opts = LOG_FORMAT
+
     if opts:
         if isinstance(opts, str):
-            opts = {"format": opts}
-        if handler.level == LOG_LEVEL and "level" in opts:
+            logfmt = opts
+            opts = {}
+        elif "format" in opts:
+            logfmt = opts["format"]
+        else:
+            logfmt = LOG_FORMAT
+
+        if not isinstance(logfmt, dict) and colors:
+            ansifmt = "\033[{}m{}\033[0m".format
+            lf = {}
+            for level in LOG_LEVELS:
+                c = colors.get(level)
+                lf[level] = ansifmt(c, logfmt) if c else logfmt
+            logfmt = lf
+
+        handler.setFormatter(Formatter(
+            logfmt, opts.get("format-date", LOG_FORMAT_DATE)))
+
+        if "level" in opts and handler.level == LOG_LEVEL:
             handler.setLevel(opts["level"])
-        if "format" in opts or "format-date" in opts:
-            handler.setFormatter(Formatter(
-                opts.get("format", LOG_FORMAT),
-                opts.get("format-date", LOG_FORMAT_DATE),
-            ))
+
     if minlevel > handler.level:
         minlevel = handler.level
@@ -307,9 +355,12 @@ def select():
     mode = config.get(("output",), "mode")
 
     if mode is None or mode == "auto":
-        if hasattr(sys.stdout, "isatty") and sys.stdout.isatty():
-            output = ColorOutput() if ANSI else TerminalOutput()
-        else:
+        try:
+            if sys.stdout.isatty():
+                output = ColorOutput() if ANSI else TerminalOutput()
+            else:
+                output = PipeOutput()
+        except Exception:
             output = PipeOutput()
     elif isinstance(mode, dict):
         output = CustomOutput(mode)
@@ -388,7 +439,10 @@ class ColorOutput(TerminalOutput):
 
     def __init__(self):
         TerminalOutput.__init__(self)
 
-        colors = config.get(("output",), "colors") or {}
+        colors = config.interpolate(("output",), "colors")
+        if colors is None:
+            colors = COLORS_DEFAULT
+
         self.color_skip = "\033[{}m".format(
             colors.get("skip", "2"))
         self.color_success = "\r\033[{}m".format(
@@ -514,17 +568,3 @@ def shorten_string_eaw(txt, limit, sep="…", cache=EAWCache()):
             right -= 1
 
     return txt[:left] + sep + txt[right+1:]
-
-
-if util.WINDOWS:
-    ANSI = os.environ.get("TERM") == "ANSI"
-    OFFSET = 1
-    CHAR_SKIP = "# "
-    CHAR_SUCCESS = "* "
-    CHAR_ELLIPSIES = "..."
-else:
-    ANSI = True
-    OFFSET = 0
-    CHAR_SKIP = "# "
-    CHAR_SUCCESS = "✔ "
-    CHAR_ELLIPSIES = "…"
diff --git a/gallery_dl/postprocessor/common.py b/gallery_dl/postprocessor/common.py
index 1d2fba8..d4e1603 100644
--- a/gallery_dl/postprocessor/common.py
+++ b/gallery_dl/postprocessor/common.py
@@ -8,7 +8,7 @@
 
 """Common classes and constants used by postprocessor modules."""
 
-from .. import util, formatter
+from .. import util, formatter, archive
 
 
 class PostProcessor():
@@ -22,30 +22,31 @@ class PostProcessor():
         return self.__class__.__name__
 
     def _init_archive(self, job, options, prefix=None):
-        archive = options.get("archive")
-        if archive:
+        archive_path = options.get("archive")
+        if archive_path:
             extr = job.extractor
-            archive = util.expand_path(archive)
+            archive_path = util.expand_path(archive_path)
             if not prefix:
                 prefix = "_" + self.name.upper() + "_"
             archive_format = (
                 options.get("archive-prefix", extr.category) +
                 options.get("archive-format", prefix + extr.archive_fmt))
             try:
-                if "{" in archive:
-                    archive = formatter.parse(archive).format_map(
+                if "{" in archive_path:
+                    archive_path = formatter.parse(archive_path).format_map(
                         job.pathfmt.kwdict)
-                self.archive = util.DownloadArchive(
-                    archive, archive_format,
+                self.archive = archive.DownloadArchive(
+                    archive_path, archive_format,
                     options.get("archive-pragma"),
                     "_archive_" + self.name)
             except Exception as exc:
                 self.log.warning(
                     "Failed to open %s archive at '%s' (%s: %s)",
-                    self.name, archive, exc.__class__.__name__, exc)
+                    self.name, archive_path, exc.__class__.__name__, exc)
             else:
-                self.log.debug("Using %s archive '%s'", self.name, archive)
+                self.log.debug(
+                    "Using %s archive '%s'", self.name, archive_path)
                 return True
-        else:
-            self.archive = None
+
+        self.archive = None
         return False
diff --git a/gallery_dl/postprocessor/exec.py b/gallery_dl/postprocessor/exec.py
index e7ed2f6..7d2be2b 100644
--- a/gallery_dl/postprocessor/exec.py
+++ b/gallery_dl/postprocessor/exec.py
@@ -10,7 +10,6 @@
 
 from .common import PostProcessor
 from .. import util, formatter
-import subprocess
 import os
 import re
 
@@ -80,14 +79,14 @@ class ExecPP(PostProcessor):
 
     def _exec(self, args, shell):
         self.log.debug("Running '%s'", args)
-        retcode = subprocess.Popen(args, shell=shell).wait()
+        retcode = util.Popen(args, shell=shell).wait()
         if retcode:
             self.log.warning("'%s' returned with non-zero exit status (%d)",
                              args, retcode)
 
     def _exec_async(self, args, shell):
         self.log.debug("Running '%s'", args)
-        subprocess.Popen(args, shell=shell)
+        util.Popen(args, shell=shell)
 
     def _replace(self, match):
         name = match.group(1)
diff --git a/gallery_dl/postprocessor/mtime.py b/gallery_dl/postprocessor/mtime.py
index ea61b7b..6ded1e2 100644
--- a/gallery_dl/postprocessor/mtime.py
+++ b/gallery_dl/postprocessor/mtime.py
@@ -33,6 +33,9 @@ class MtimePP(PostProcessor):
 
     def run(self, pathfmt):
         mtime = self._get(pathfmt.kwdict)
+        if mtime is None:
+            return
+
         pathfmt.kwdict["_mtime"] = (
             util.datetime_to_timestamp(mtime)
             if isinstance(mtime, datetime) else
diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py
index b713c6f..c63a3d9 100644
--- a/gallery_dl/postprocessor/ugoira.py
+++ b/gallery_dl/postprocessor/ugoira.py
@@ -155,7 +155,9 @@ class UgoiraPP(PostProcessor):
                 self.log.error("Unable to invoke FFmpeg (%s: %s)",
                                exc.__class__.__name__, exc)
                 pathfmt.realpath = pathfmt.temppath
-            except Exception:
+            except Exception as exc:
+                print()
+                self.log.error("%s: %s", exc.__class__.__name__, exc)
                 pathfmt.realpath = pathfmt.temppath
             else:
                 if self.mtime:
@@ -171,7 +173,7 @@ class UgoiraPP(PostProcessor):
     def _exec(self, args):
         self.log.debug(args)
         out = None if self.output else subprocess.DEVNULL
-        retcode = subprocess.Popen(args, stdout=out, stderr=out).wait()
+        retcode = util.Popen(args, stdout=out, stderr=out).wait()
         if retcode:
             print()
             self.log.error("Non-zero exit status when running %s (%s)",
diff --git a/gallery_dl/text.py b/gallery_dl/text.py
index b7b5211..9258187 100644
--- a/gallery_dl/text.py
+++ b/gallery_dl/text.py
@@ -73,7 +73,7 @@ def filename_from_url(url):
     """Extract the last part of an URL to use as a filename"""
     try:
         return url.partition("?")[0].rpartition("/")[2]
-    except (TypeError, AttributeError):
+    except Exception:
         return ""
 
@@ -122,7 +122,7 @@ def extract(txt, begin, end, pos=0):
         first = txt.index(begin, pos) + len(begin)
         last = txt.index(end, first)
         return txt[first:last], last+len(end)
-    except (ValueError, TypeError, AttributeError):
+    except Exception:
         return None, pos
 
@@ -131,7 +131,7 @@ def extr(txt, begin, end, default=""):
     try:
         first = txt.index(begin) + len(begin)
         return txt[first:txt.index(end, first)]
-    except (ValueError, TypeError, AttributeError):
+    except Exception:
         return default
 
@@ -141,7 +141,7 @@ def rextract(txt, begin, end, pos=-1):
         first = txt.rindex(begin, 0, pos)
         last = txt.index(end, first + lbeg)
         return txt[first + lbeg:last], first
-    except (ValueError, TypeError, AttributeError):
+    except Exception:
         return None, pos
 
@@ -167,7 +167,7 @@ def extract_iter(txt, begin, end, pos=0):
             last = index(end, first)
             pos = last + lend
             yield txt[first:last]
-    except (ValueError, TypeError, AttributeError):
+    except Exception:
         return
 
@@ -180,7 +180,7 @@ def extract_from(txt, pos=0, default=""):
             last = index(end, first)
             pos = last + len(end)
             return txt[first:last]
-        except (ValueError, TypeError, AttributeError):
+        except Exception:
             return default
     return extr
 
@@ -200,7 +200,7 @@ def parse_bytes(value, default=0, suffixes="bkmgtp"):
     """Convert a bytes-amount ("500k", "2.5M", ...) to int"""
     try:
         last = value[-1].lower()
-    except (TypeError, LookupError):
+    except Exception:
         return default
 
     if last in suffixes:
@@ -221,7 +221,7 @@ def parse_int(value, default=0):
         return default
     try:
         return int(value)
-    except (ValueError, TypeError):
+    except Exception:
         return default
 
@@ -231,7 +231,7 @@ def parse_float(value, default=0.0):
         return default
     try:
         return float(value)
-    except (ValueError, TypeError):
+    except Exception:
         return default
 
@@ -242,7 +242,7 @@ def parse_query(qs):
         for key, value in urllib.parse.parse_qsl(qs):
             if key not in result:
                 result[key] = value
-    except AttributeError:
+    except Exception:
         pass
     return result
 
@@ -251,7 +251,7 @@ def parse_timestamp(ts, default=None):
     """Create a datetime object from a unix timestamp"""
     try:
         return datetime.datetime.utcfromtimestamp(int(ts))
-    except (TypeError, ValueError, OverflowError):
+    except Exception:
         return default
 
diff --git a/gallery_dl/update.py b/gallery_dl/update.py
new file mode 100644
index 0000000..b068e37
--- /dev/null
+++ b/gallery_dl/update.py
@@ -0,0 +1,218 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+import os
+import re
+import sys
+
+from .extractor.common import Extractor, Message
+from .job import DownloadJob
+from . import util, version, exception
+
+REPOS = {
+    "stable" : "mikf/gallery-dl",
+    "dev"    : "gdl-org/builds",
+    "nightly": "gdl-org/builds",
+    "master" : "gdl-org/builds",
+}
+
+BINARIES_STABLE = {
+    "windows"    : "gallery-dl.exe",
+    "windows_x86": "gallery-dl.exe",
+    "windows_x64": "gallery-dl.exe",
+    "linux"      : "gallery-dl.bin",
+}
+BINARIES_DEV = {
+    "windows"    : "gallery-dl_windows.exe",
+    "windows_x86": "gallery-dl_windows_x86.exe",
+    "windows_x64": "gallery-dl_windows.exe",
+    "linux"      : "gallery-dl_linux",
+    "macos"      : "gallery-dl_macos",
+}
+BINARIES = {
+    "stable" : BINARIES_STABLE,
+    "dev"    : BINARIES_DEV,
+    "nightly": BINARIES_DEV,
+    "master" : BINARIES_DEV,
+}
+
+
+class UpdateJob(DownloadJob):
+
+    def handle_url(self, url, kwdict):
+        if not self._check_update(kwdict):
+            if kwdict["_check"]:
+                self.status |= 1
+            return self.extractor.log.info(
+                "gallery-dl is up to date (%s)", version.__version__)
+
+        if kwdict["_check"]:
+            return self.extractor.log.info(
+                "A new release is available: %s -> %s",
+                version.__version__, kwdict["tag_name"])
+
+        self.extractor.log.info(
+            "Updating from %s to %s",
+            version.__version__, kwdict["tag_name"])
+
+        path_old = sys.executable + ".old"
+        path_new = sys.executable + ".new"
+        directory, filename = os.path.split(sys.executable)
+
+        pathfmt = self.pathfmt
+        pathfmt.extension = "new"
+        pathfmt.filename = filename
+        pathfmt.temppath = path_new
+        pathfmt.realpath = pathfmt.path = sys.executable
+        pathfmt.realdirectory = pathfmt.directory = directory
+
+        self._newline = True
+        if not self.download(url):
+            self.status |= 4
+            return self._error("Failed to download %s", url.rpartition("/")[2])
+
+        if not util.WINDOWS:
+            try:
+                mask = os.stat(sys.executable).st_mode
+            except OSError:
+                mask = 0o755
+                self._warning("Unable to get file permission bits")
+
+        try:
+            os.replace(sys.executable, path_old)
+        except OSError:
+            return self._error("Unable to move current executable")
+
+        try:
+            pathfmt.finalize()
+        except OSError:
+            self._error("Unable to overwrite current executable")
+            return os.replace(path_old, sys.executable)
+
+        if util.WINDOWS:
+            import atexit
+            import subprocess
+
+            cmd = 'ping 127.0.0.1 -n 5 -w 1000 & del /F "{}"'.format(path_old)
+            atexit.register(
+                util.Popen, cmd, shell=True,
+                stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
+            )
+
+        else:
+            try:
+                os.unlink(path_old)
+            except OSError:
+                self._warning("Unable to delete old executable")
+
+            try:
+                os.chmod(sys.executable, mask)
+            except OSError:
+                self._warning("Unable to restore file permission bits")
+
+        self.out.success(pathfmt.path)
+
+    def _check_update(self, kwdict):
+        if kwdict["_exact"]:
+            return True
+
+        tag = kwdict["tag_name"]
+
+        if tag[0] == "v":
+            kwdict["tag_name"] = tag = tag[1:]
+            ver, _, dev = version.__version__.partition("-")
+
+            version_local = [int(v) for v in ver.split(".")]
+            version_remote = [int(v) for v in tag.split(".")]
+
+            if dev:
+                version_local[-1] -= 0.5
+            if version_local >= version_remote:
+                return False
+
+        elif version.__version__.endswith(":" + tag):
+            return False
+
+        return True
+
+    def _warning(self, msg, *args):
+        if self._newline:
+            self._newline = False
+            print()
+        self.extractor.log.warning(msg, *args)
+
+    def _error(self, msg, *args):
+        if self._newline:
+            self._newline = False
+            print()
+        self.status |= 1
+        self.extractor.log.error(msg, *args)
+
+
+class UpdateExtractor(Extractor):
+    category = "update"
+    root = "https://github.com"
+    root_api = "https://api.github.com"
+    pattern = r"update(?::(.+))?"
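+    # accepted targets: "latest", "check", a channel name from REPOS,
+    # a bare TAG, or an explicit "CHANNEL@TAG" (parsed in items() below)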
+
+    def items(self):
+        tag = "latest"
+        check = exact = False
+
+        variant = version.__variant__ or "stable/windows"
+        repo, _, binary = variant.partition("/")
+
+        target = self.groups[0]
+        if target == "latest":
+            pass
+        elif target == "check":
+            check = True
+        else:
+            channel, sep, target = target.partition("@")
+            if sep:
+                repo = channel
+                tag = target
+                exact = True
+            elif channel in REPOS:
+                repo = channel
+            else:
+                tag = channel
+                exact = True
+
+        if re.match(r"\d\.\d+\.\d+", tag):
+            tag = "v" + tag
+
+        try:
+            path_repo = REPOS[repo or "stable"]
+        except KeyError:
+            raise exception.StopExtraction("Invalid channel '%s'", repo)
+
+        path_tag = tag if tag == "latest" else "tags/" + tag
+        url = "{}/repos/{}/releases/{}".format(
+            self.root_api, path_repo, path_tag)
+        headers = {
+            "Accept": "application/vnd.github+json",
+            "User-Agent": util.USERAGENT,
+            "X-GitHub-Api-Version": "2022-11-28",
+        }
+        data = self.request(url, headers=headers, notfound="tag").json()
+        data["_check"] = check
+        data["_exact"] = exact
+
+        if binary == "linux" and \
+                repo != "stable" and \
+                data["tag_name"] <= "2024.05.28":
+            binary_name = "gallery-dl_ubuntu"
+        else:
+            binary_name = BINARIES[repo][binary]
+
+        url = "{}/{}/releases/download/{}/{}".format(
+            self.root, path_repo, data["tag_name"], binary_name)
+
+        yield Message.Directory, data
+        yield Message.Url, url, data
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index bc9418f..861ec7e 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -16,7 +16,6 @@
 import time
 import random
 import getpass
 import hashlib
-import sqlite3
 import binascii
 import datetime
 import functools
@@ -339,7 +338,7 @@ def extract_headers(response):
 
 @functools.lru_cache(maxsize=None)
 def git_head():
     try:
-        out, err = subprocess.Popen(
+        out, err = Popen(
             ("git", "rev-parse", "--short", "HEAD"),
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
@@ -579,6 +578,33 @@ GLOBALS = {
 }
 
 
+if EXECUTABLE and hasattr(sys, "_MEIPASS"):
+    # https://github.com/pyinstaller/pyinstaller/blob/develop/doc
+    # /runtime-information.rst#ld_library_path--libpath-considerations
+    _popen_env = os.environ.copy()
+
+    orig = _popen_env.get("LD_LIBRARY_PATH_ORIG")
+    if orig is None:
+        _popen_env.pop("LD_LIBRARY_PATH", None)
+    else:
+        _popen_env["LD_LIBRARY_PATH"] = orig
+
+    orig = _popen_env.get("DYLD_LIBRARY_PATH_ORIG")
+    if orig is None:
+        _popen_env.pop("DYLD_LIBRARY_PATH", None)
+    else:
+        _popen_env["DYLD_LIBRARY_PATH"] = orig
+
+    del orig
+
+    class Popen(subprocess.Popen):
+        def __init__(self, args, **kwargs):
+            kwargs["env"] = _popen_env
+            subprocess.Popen.__init__(self, args, **kwargs)
+else:
+    Popen = subprocess.Popen
+
+
 def compile_expression(expr, name="<expr>", globals=None):
     code_object = compile(expr, name, "eval")
     return functools.partial(eval, code_object, globals or GLOBALS)
@@ -825,46 +851,3 @@ class FilterPredicate():
             raise
         except Exception as exc:
             raise exception.FilterError(exc)
-
-
-class DownloadArchive():
-
-    def __init__(self, path, format_string, pragma=None,
-                 cache_key="_archive_key"):
-        try:
-            con = sqlite3.connect(path, timeout=60, check_same_thread=False)
-        except sqlite3.OperationalError:
-            os.makedirs(os.path.dirname(path))
-            con = sqlite3.connect(path, timeout=60, check_same_thread=False)
-        con.isolation_level = None
-
-        from . import formatter
-        self.keygen = formatter.parse(format_string).format_map
-        self.close = con.close
-        self.cursor = cursor = con.cursor()
-        self._cache_key = cache_key
-
-        if pragma:
-            for stmt in pragma:
-                cursor.execute("PRAGMA " + stmt)
-
-        try:
-            cursor.execute("CREATE TABLE IF NOT EXISTS archive "
-                           "(entry TEXT PRIMARY KEY) WITHOUT ROWID")
-        except sqlite3.OperationalError:
-            # fallback for missing WITHOUT ROWID support (#553)
-            cursor.execute("CREATE TABLE IF NOT EXISTS archive "
-                           "(entry TEXT PRIMARY KEY)")
-
-    def check(self, kwdict):
-        """Return True if the item described by 'kwdict' exists in archive"""
-        key = kwdict[self._cache_key] = self.keygen(kwdict)
-        self.cursor.execute(
-            "SELECT 1 FROM archive WHERE entry=? LIMIT 1", (key,))
-        return self.cursor.fetchone()
-
-    def add(self, kwdict):
-        """Add item described by 'kwdict' to archive"""
-        key = kwdict.get(self._cache_key) or self.keygen(kwdict)
-        self.cursor.execute(
-            "INSERT OR IGNORE INTO archive (entry) VALUES (?)", (key,))
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index d438ba4..6557763 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,5 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-__version__ = "1.26.9"
+__version__ = "1.27.0"
+__variant__ = None
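
For reference, a minimal config sketch showing how the options surfaced in this diff fit together — "archive-mode" and "skip-filter" from job.py, "keywords-eval" from job.py, and the "output.colors" defaults from output.py. The paths, filter expression, and keyword values below are illustrative only, not taken from the commit:

{
    "extractor": {
        "archive": "~/gallery-dl/archive.sqlite3",
        "archive-mode": "memory",
        "skip": "abort:5",
        "skip-filter": "extension not in ('mp4', 'webm')",
        "keywords": {"source": "{category}-{subcategory}"},
        "keywords-eval": true
    },
    "output": {
        "colors": {"success": "1;32", "skip": "2", "error": "1;31"}
    }
}

With "archive-mode": "memory", archive keys are collected in memory and only written to the SQLite database in a single transaction when the job finishes without errors; "skip-filter" is compiled with util.compile_expression() and evaluated against each file's metadata before a skip counts toward the abort/exit limit.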
