diff options
| author | 2025-12-20 05:49:04 -0500 | |
|---|---|---|
| committer | 2025-12-20 05:49:04 -0500 | |
| commit | a24ec1647aeac35a63b744ea856011ad6e06be3b (patch) | |
| tree | ae94416de786aeddd05d99559098f7f16bb103a6 /gallery_dl | |
| parent | 33f8a8a37a9cba738ef25fb99955f0730da9eb48 (diff) | |
New upstream version 1.31.1.upstream/1.31.1
Diffstat (limited to 'gallery_dl')
264 files changed, 4651 insertions, 2593 deletions
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index fdcb6d0..98f8c12 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -284,14 +284,14 @@ def main(): # unsupported file logging handler if handler := output.setup_logging_handler( - "unsupportedfile", fmt="{message}"): + "unsupportedfile", fmt="{message}", defer=True): ulog = job.Job.ulog = logging.getLogger("unsupported") ulog.addHandler(handler) ulog.propagate = False # error file logging handler if handler := output.setup_logging_handler( - "errorfile", fmt="{message}", mode="a"): + "errorfile", fmt="{message}", mode="a", defer=True): elog = input_manager.err = logging.getLogger("errorfile") elog.addHandler(handler) elog.propagate = False diff --git a/gallery_dl/actions.py b/gallery_dl/actions.py index 971c4d9..5d2f645 100644 --- a/gallery_dl/actions.py +++ b/gallery_dl/actions.py @@ -148,6 +148,11 @@ class LoggerAdapter(): if cond(msg): action(args) + def traceback(self, exc): + if self.logger.isEnabledFor(logging.DEBUG): + self.logger._log( + logging.DEBUG, "", None, exc_info=exc, extra=self.extra) + def _level_to_int(level): try: diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py index ba719ac..26f8244 100644 --- a/gallery_dl/cookies.py +++ b/gallery_dl/cookies.py @@ -119,7 +119,7 @@ def load_cookies_webkit(browser_name, profile=None, domain=None): for page_size in page_sizes: _webkit_parse_cookies_page(p.read_bytes(page_size), cookies) _log_info("Extracted %s cookies from %s", - browser_name.capitalize(), len(cookies)) + len(cookies), browser_name.capitalize()) return cookies diff --git a/gallery_dl/downloader/__init__.py b/gallery_dl/downloader/__init__.py index e1b936e..79dc5cb 100644 --- a/gallery_dl/downloader/__init__.py +++ b/gallery_dl/downloader/__init__.py @@ -27,7 +27,7 @@ def find(scheme): scheme = "http" if scheme in modules: # prevent unwanted imports try: - module = __import__(scheme, globals(), None, (), 1) + module = __import__(scheme, globals(), None, 
None, 1) except ImportError: pass else: diff --git a/gallery_dl/downloader/common.py b/gallery_dl/downloader/common.py index 7cd8d10..66996f7 100644 --- a/gallery_dl/downloader/common.py +++ b/gallery_dl/downloader/common.py @@ -31,8 +31,15 @@ class DownloaderBase(): self.partdir = self.config("part-directory") if self.partdir: - self.partdir = util.expand_path(self.partdir) - os.makedirs(self.partdir, exist_ok=True) + if isinstance(self.partdir, dict): + self.partdir = [ + (util.compile_filter(expr) if expr else util.true, + util.expand_path(pdir)) + for expr, pdir in self.partdir.items() + ] + else: + self.partdir = util.expand_path(self.partdir) + os.makedirs(self.partdir, exist_ok=True) proxies = self.config("proxy", util.SENTINEL) if proxies is util.SENTINEL: diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 248bf70..703dcca 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -95,7 +95,7 @@ class HttpDownloader(DownloaderBase): except Exception as exc: if self.downloading: output.stderr_write("\n") - self.log.debug("", exc_info=exc) + self.log.traceback(exc) raise finally: # remove file from incomplete downloads @@ -230,6 +230,10 @@ class HttpDownloader(DownloaderBase): # check file size size = text.parse_int(size, None) if size is not None: + if not size: + self.release_conn(response) + self.log.warning("Empty file") + return False if self.minsize and size < self.minsize: self.release_conn(response) self.log.warning( @@ -342,9 +346,15 @@ class HttpDownloader(DownloaderBase): raise # check file size - if size and fp.tell() < size: - msg = f"file size mismatch ({fp.tell()} < {size})" - output.stderr_write("\n") + if size and (fsize := fp.tell()) < size: + if (segmented := kwdict.get("_http_segmented")) and \ + segmented is True or segmented == fsize: + tries -= 1 + msg = "Resuming segmented download" + output.stdout_write("\r") + else: + msg = f"file size mismatch ({fsize} < {size})" + 
output.stderr_write("\n") continue break diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py index a56a6be..e9b3294 100644 --- a/gallery_dl/downloader/ytdl.py +++ b/gallery_dl/downloader/ytdl.py @@ -22,9 +22,9 @@ class YoutubeDLDownloader(DownloaderBase): DownloaderBase.__init__(self, job) extractor = job.extractor - retries = self.config("retries", extractor._retries) + self.retries = self.config("retries", extractor._retries) self.ytdl_opts = { - "retries": retries+1 if retries >= 0 else float("inf"), + "retries": self.retries+1 if self.retries >= 0 else float("inf"), "socket_timeout": self.config("timeout", extractor._timeout), "nocheckcertificate": not self.config("verify", extractor._verify), "proxy": self.proxies.get("http") if self.proxies else None, @@ -39,17 +39,25 @@ class YoutubeDLDownloader(DownloaderBase): def download(self, url, pathfmt): kwdict = pathfmt.kwdict + tries = 0 - ytdl_instance = kwdict.pop("_ytdl_instance", None) - if not ytdl_instance: + if ytdl_instance := kwdict.pop("_ytdl_instance", None): + # 'ytdl' extractor + self._prepare(ytdl_instance) + info_dict = kwdict.pop("_ytdl_info_dict") + else: + # other extractors ytdl_instance = self.ytdl_instance if not ytdl_instance: try: module = ytdl.import_module(self.config("module")) except (ImportError, SyntaxError) as exc: - self.log.error("Cannot import module '%s'", - getattr(exc, "name", "")) - self.log.debug("", exc_info=exc) + if exc.__context__: + self.log.error("Cannot import yt-dlp or youtube-dl") + else: + self.log.error("Cannot import module '%s'", + getattr(exc, "name", "")) + self.log.traceback(exc) self.download = lambda u, p: False return False @@ -63,6 +71,8 @@ class YoutubeDLDownloader(DownloaderBase): module, self, self.ytdl_opts) if self.outtmpl == "default": self.outtmpl = module.DEFAULT_OUTTMPL + self._prepare(ytdl_instance) + if self.forward_cookies: self.log.debug("Forwarding cookies to %s", ytdl_instance.__module__) @@ -70,45 +80,150 @@ class 
YoutubeDLDownloader(DownloaderBase): for cookie in self.session.cookies: set_cookie(cookie) - if "__gdl_initialize" in ytdl_instance.params: - del ytdl_instance.params["__gdl_initialize"] + url = url[5:] + manifest = kwdict.get("_ytdl_manifest") + while True: + tries += 1 + self.error = None + try: + if manifest is None: + info_dict = self._extract_url( + ytdl_instance, url) + else: + info_dict = self._extract_manifest( + ytdl_instance, url, kwdict) + except Exception as exc: + self.log.traceback(exc) + cls = exc.__class__ + if cls.__module__ == "builtins": + tries = False + msg = f"{cls.__name__}: {exc}" + else: + if self.error is not None: + msg = self.error + elif not info_dict: + msg = "Empty 'info_dict' data" + else: + break + + if tries: + self.log.error("%s (%s/%s)", msg, tries, self.retries+1) + else: + self.log.error(msg) + return False + if tries > self.retries: + return False - if self.progress is not None: - ytdl_instance.add_progress_hook(self._progress_hook) - if rlf := ytdl_instance.params.pop("__gdl_ratelimit_func", False): - self.rate_dyn = rlf + if extra := kwdict.get("_ytdl_extra"): + info_dict.update(extra) - info_dict = kwdict.pop("_ytdl_info_dict", None) - if not info_dict: - url = url[5:] + while True: + tries += 1 + self.error = None try: - if manifest := kwdict.pop("_ytdl_manifest", None): - info_dict = self._extract_manifest( - ytdl_instance, url, manifest, - kwdict.pop("_ytdl_manifest_data", None), - kwdict.pop("_ytdl_manifest_headers", None), - kwdict.pop("_ytdl_manifest_cookies", None)) + if "entries" in info_dict: + success = self._download_playlist( + ytdl_instance, pathfmt, info_dict) else: - info_dict = self._extract_info(ytdl_instance, url) + success = self._download_video( + ytdl_instance, pathfmt, info_dict) except Exception as exc: - self.log.debug("", exc_info=exc) - self.log.warning("%s: %s", exc.__class__.__name__, exc) + self.log.traceback(exc) + cls = exc.__class__ + if cls.__module__ == "builtins": + tries = False + msg = 
f"{cls.__name__}: {exc}" + else: + if self.error is not None: + msg = self.error + elif not success: + msg = "Error" + else: + break - if not info_dict: + if tries: + self.log.error("%s (%s/%s)", msg, tries, self.retries+1) + else: + self.log.error(msg) return False + if tries > self.retries: + return False + return True + + def _extract_url(self, ytdl, url): + return ytdl.extract_info(url, download=False) + + def _extract_manifest(self, ytdl, url, kwdict): + extr = ytdl.get_info_extractor("Generic") + video_id = extr._generic_id(url) + + if cookies := kwdict.get("_ytdl_manifest_cookies"): + if isinstance(cookies, dict): + cookies = cookies.items() + set_cookie = ytdl.cookiejar.set_cookie + for name, value in cookies: + set_cookie(Cookie( + 0, name, value, None, False, + "", False, False, "/", False, + False, None, False, None, None, {}, + )) + + type = kwdict["_ytdl_manifest"] + data = kwdict.get("_ytdl_manifest_data") + headers = kwdict.get("_ytdl_manifest_headers") + if type == "hls": + if data is None: + try: + fmts, subs = extr._extract_m3u8_formats_and_subtitles( + url, video_id, "mp4", headers=headers) + except AttributeError: + fmts = extr._extract_m3u8_formats( + url, video_id, "mp4", headers=headers) + subs = None + else: + try: + fmts, subs = extr._parse_m3u8_formats_and_subtitles( + data, url, "mp4", headers=headers) + except AttributeError: + fmts = extr._parse_m3u8_formats( + data, url, "mp4", headers=headers) + subs = None - if "entries" in info_dict: - index = kwdict.get("_ytdl_index") - if index is None: - return self._download_playlist( - ytdl_instance, pathfmt, info_dict) + elif type == "dash": + if data is None: + try: + fmts, subs = extr._extract_mpd_formats_and_subtitles( + url, video_id, headers=headers) + except AttributeError: + fmts = extr._extract_mpd_formats( + url, video_id, headers=headers) + subs = None else: - info_dict = info_dict["entries"][index] + if isinstance(data, str): + data = ElementTree.fromstring(data) + try: + fmts, subs 
= extr._parse_mpd_formats_and_subtitles( + data, mpd_id="dash") + except AttributeError: + fmts = extr._parse_mpd_formats( + data, mpd_id="dash") + subs = None - if extra := kwdict.get("_ytdl_extra"): - info_dict.update(extra) + else: + raise ValueError(f"Unsupported manifest type '{type}'") - return self._download_video(ytdl_instance, pathfmt, info_dict) + if headers: + for fmt in fmts: + fmt["http_headers"] = headers + + info_dict = { + "extractor": "", + "id" : video_id, + "title" : video_id, + "formats" : fmts, + "subtitles": subs, + } + return ytdl.process_ie_result(info_dict, download=False) def _download_video(self, ytdl_instance, pathfmt, info_dict): if "url" in info_dict: @@ -161,12 +276,7 @@ class YoutubeDLDownloader(DownloaderBase): path = pathfmt.realpath.replace("%", "%%") self._set_outtmpl(ytdl_instance, path) - try: - ytdl_instance.process_info(info_dict) - except Exception as exc: - self.log.debug("", exc_info=exc) - return False - + ytdl_instance.process_info(info_dict) pathfmt.temppath = info_dict.get("filepath") or info_dict["_filename"] return True @@ -188,78 +298,20 @@ class YoutubeDLDownloader(DownloaderBase): ytdl_instance.process_info(entry) status = True except Exception as exc: - self.log.debug("", exc_info=exc) + self.log.traceback(exc) self.log.error("%s: %s", exc.__class__.__name__, exc) return status - def _extract_info(self, ytdl, url): - return ytdl.extract_info(url, download=False) - - def _extract_manifest(self, ytdl, url, manifest_type, manifest_data=None, - headers=None, cookies=None): - extr = ytdl.get_info_extractor("Generic") - video_id = extr._generic_id(url) - - if cookies is not None: - if isinstance(cookies, dict): - cookies = cookies.items() - set_cookie = ytdl.cookiejar.set_cookie - for name, value in cookies: - set_cookie(Cookie( - 0, name, value, None, False, - "", False, False, "/", False, - False, None, False, None, None, {}, - )) + def _prepare(self, ytdl_instance): + if "__gdl_initialize" not in 
ytdl_instance.params: + return - if manifest_type == "hls": - if manifest_data is None: - try: - fmts, subs = extr._extract_m3u8_formats_and_subtitles( - url, video_id, "mp4", headers=headers) - except AttributeError: - fmts = extr._extract_m3u8_formats( - url, video_id, "mp4", headers=headers) - subs = None - else: - try: - fmts, subs = extr._parse_m3u8_formats_and_subtitles( - url, video_id, "mp4") - except AttributeError: - fmts = extr._parse_m3u8_formats(url, video_id, "mp4") - subs = None - - elif manifest_type == "dash": - if manifest_data is None: - try: - fmts, subs = extr._extract_mpd_formats_and_subtitles( - url, video_id, headers=headers) - except AttributeError: - fmts = extr._extract_mpd_formats( - url, video_id, headers=headers) - subs = None - else: - if isinstance(manifest_data, str): - manifest_data = ElementTree.fromstring(manifest_data) - try: - fmts, subs = extr._parse_mpd_formats_and_subtitles( - manifest_data, mpd_id="dash") - except AttributeError: - fmts = extr._parse_mpd_formats( - manifest_data, mpd_id="dash") - subs = None - - else: - self.log.error("Unsupported manifest type '%s'", manifest_type) - return None - - info_dict = { - "extractor": "", - "id" : video_id, - "title" : video_id, - "formats" : fmts, - "subtitles": subs, - } - return ytdl.process_ie_result(info_dict, download=False) + del ytdl_instance.params["__gdl_initialize"] + if self.progress is not None: + ytdl_instance.add_progress_hook(self._progress_hook) + if rlf := ytdl_instance.params.pop("__gdl_ratelimit_func", False): + self.rate_dyn = rlf + ytdl_instance.params["logger"] = LoggerAdapter(self, ytdl_instance) def _progress_hook(self, info): if info["status"] == "downloading" and \ @@ -284,6 +336,31 @@ class YoutubeDLDownloader(DownloaderBase): ytdl_instance.params["outtmpl"] = {"default": outtmpl} +class LoggerAdapter(): + __slots__ = ("obj", "log") + + def __init__(self, obj, ytdl_instance): + self.obj = obj + self.log = ytdl_instance.params.get("logger") + + def 
debug(self, msg): + if self.log is not None: + if msg[0] == "[": + msg = msg[msg.find("]")+2:] + self.log.debug(msg) + + def warning(self, msg): + if self.log is not None: + if "WARNING:" in msg: + msg = msg[msg.find(" ")+1:] + self.log.warning(msg) + + def error(self, msg): + if "ERROR:" in msg: + msg = msg[msg.find(" ")+1:] + self.obj.error = msg + + def compatible_formats(formats): """Returns True if 'formats' are compatible for merge""" video_ext = formats[0].get("ext") diff --git a/gallery_dl/dt.py b/gallery_dl/dt.py new file mode 100644 index 0000000..b37ebf3 --- /dev/null +++ b/gallery_dl/dt.py @@ -0,0 +1,115 @@ +# -*- coding: utf-8 -*- + +# Copyright 2025 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Date/Time utilities""" + +import sys +import time +from datetime import datetime, date, timedelta, timezone # noqa F401 + + +class NullDatetime(datetime): + + def __bool__(self): + return False + + def __str__(self): + return "[Invalid DateTime]" + + def __format__(self, format_spec): + return "[Invalid DateTime]" + + +NONE = NullDatetime(1, 1, 1) +EPOCH = datetime(1970, 1, 1) +SECOND = timedelta(0, 1) + + +def normalize(dt): + # if (o := dt.utcoffset()) is not None: + # return dt.replace(tzinfo=None, microsecond=0) - o + if dt.tzinfo is not None: + return dt.astimezone(timezone.utc).replace(tzinfo=None, microsecond=0) + if dt.microsecond: + return dt.replace(microsecond=0) + return dt + + +def convert(value): + """Convert 'value' to a naive UTC datetime object""" + if not value: + return NONE + if isinstance(value, datetime): + return normalize(value) + if isinstance(value, str) and (dt := parse_iso(value)) is not NONE: + return dt + return parse_ts(value) + + +def parse(dt_string, format): + """Parse 'dt_string' according to 'format'""" + try: + return normalize(datetime.strptime(dt_string, 
format)) + except Exception: + return NONE + + +if sys.hexversion < 0x30c0000: + # Python <= 3.11 + def parse_iso(dt_string): + """Parse 'dt_string' as ISO 8601 value""" + try: + if dt_string[-1] == "Z": + # compat for Python < 3.11 + dt_string = dt_string[:-1] + elif dt_string[-5] in "+-": + # compat for Python < 3.11 + dt_string = f"{dt_string[:-2]}:{dt_string[-2:]}" + return normalize(datetime.fromisoformat(dt_string)) + except Exception: + return NONE + + from_ts = datetime.utcfromtimestamp + now = datetime.utcnow + +else: + # Python >= 3.12 + def parse_iso(dt_string): + """Parse 'dt_string' as ISO 8601 value""" + try: + return normalize(datetime.fromisoformat(dt_string)) + except Exception: + return NONE + + def from_ts(ts=None): + """Convert Unix timestamp to naive UTC datetime""" + Y, m, d, H, M, S, _, _, _ = time.gmtime(ts) + return datetime(Y, m, d, H, M, S) + + now = from_ts + + +def parse_ts(ts, default=NONE): + """Create a datetime object from a Unix timestamp""" + try: + return from_ts(int(ts)) + except Exception: + return default + + +def to_ts(dt): + """Convert naive UTC datetime to Unix timestamp""" + return (dt - EPOCH) / SECOND + + +def to_ts_string(dt): + """Convert naive UTC datetime to Unix timestamp string""" + try: + return str((dt - EPOCH) // SECOND) + except Exception: + return "" diff --git a/gallery_dl/extractor/2ch.py b/gallery_dl/extractor/2ch.py index 912a251..1f17c99 100644 --- a/gallery_dl/extractor/2ch.py +++ b/gallery_dl/extractor/2ch.py @@ -4,28 +4,28 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://2ch.su/""" +"""Extractors for https://2ch.org/""" from .common import Extractor, Message from .. 
import text, util -BASE_PATTERN = r"(?:https?://)?2ch\.(su|life|hk)" +BASE_PATTERN = r"(?:https?://)?2ch\.(org|su|life|hk)" class _2chThreadExtractor(Extractor): """Extractor for 2ch threads""" category = "2ch" subcategory = "thread" - root = "https://2ch.su" + root = "https://2ch.org" directory_fmt = ("{category}", "{board}", "{thread} {title}") filename_fmt = "{tim}{filename:? //}.{extension}" archive_fmt = "{board}_{thread}_{tim}" pattern = rf"{BASE_PATTERN}/([^/?#]+)/res/(\d+)" - example = "https://2ch.su/a/res/12345.html" + example = "https://2ch.org/a/res/12345.html" def __init__(self, match): tld = match[1] - self.root = f"https://2ch.{'su' if tld == 'hk' else tld}" + self.root = f"https://2ch.{'org' if tld == 'hk' else tld}" Extractor.__init__(self, match) def items(self): @@ -42,11 +42,11 @@ class _2chThreadExtractor(Extractor): "title" : text.unescape(title)[:50], } - yield Message.Directory, thread + yield Message.Directory, "", thread for post in posts: if files := post.get("files"): post["post_name"] = post["name"] - post["date"] = text.parse_timestamp(post["timestamp"]) + post["date"] = self.parse_timestamp(post["timestamp"]) del post["files"] del post["name"] @@ -65,9 +65,9 @@ class _2chBoardExtractor(Extractor): """Extractor for 2ch boards""" category = "2ch" subcategory = "board" - root = "https://2ch.su" + root = "https://2ch.org" pattern = rf"{BASE_PATTERN}/([^/?#]+)/?$" - example = "https://2ch.su/a/" + example = "https://2ch.org/a/" def __init__(self, match): tld = match[1] diff --git a/gallery_dl/extractor/2chan.py b/gallery_dl/extractor/2chan.py index 9927b5a..0e250c9 100644 --- a/gallery_dl/extractor/2chan.py +++ b/gallery_dl/extractor/2chan.py @@ -31,7 +31,7 @@ class _2chanThreadExtractor(Extractor): f"/{self.board}/res/{self.thread}.htm") page = self.request(url).text data = self.metadata(page) - yield Message.Directory, data + yield Message.Directory, "", data for post in self.posts(page): if "filename" not in post: continue diff --git 
a/gallery_dl/extractor/2chen.py b/gallery_dl/extractor/2chen.py index ee3510c..4456fd6 100644 --- a/gallery_dl/extractor/2chen.py +++ b/gallery_dl/extractor/2chen.py @@ -1,40 +1,55 @@ # -*- coding: utf-8 -*- +# Copyright 2022-2025 Mike Fährmann +# # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://sturdychan.help/""" +"""Extractors for 2chen boards""" -from .common import Extractor, Message +from .common import BaseExtractor, Message from .. import text -BASE_PATTERN = r"(?:https?://)?(?:sturdychan.help|2chen\.(?:moe|club))" +class _2chenExtractor(BaseExtractor): + basecategory = "2chen" -class _2chenThreadExtractor(Extractor): + +BASE_PATTERN = _2chenExtractor.update({ + "sturdychan": { + "root": "https://sturdychan.help", + "pattern": r"(?:sturdychan\.help|2chen\.(?:moe|club))", + }, + "schan": { + "root": "https://schan.help/", + "pattern": r"schan\.help", + }, +}) + + +class _2chenThreadExtractor(_2chenExtractor): """Extractor for 2chen threads""" - category = "2chen" subcategory = "thread" - root = "https://sturdychan.help" directory_fmt = ("{category}", "{board}", "{thread} {title}") filename_fmt = "{time} {filename}.{extension}" - archive_fmt = "{board}_{thread}_{hash}_{time}" - pattern = BASE_PATTERN + r"/([^/?#]+)/(\d+)" + archive_fmt = "{board}_{thread}_{no}_{time}" + pattern = rf"{BASE_PATTERN}/([^/?#]+)/(\d+)" example = "https://sturdychan.help/a/12345/" - def __init__(self, match): - Extractor.__init__(self, match) - self.board, self.thread = match.groups() - def items(self): - url = f"{self.root}/{self.board}/{self.thread}" + board = self.groups[-2] + thread = self.kwdict["thread"] = self.groups[-1] + url = f"{self.root}/{board}/{thread}" page = self.request(url, encoding="utf-8", notfound="thread").text - data = self.metadata(page) - yield Message.Directory, data - for post in 
self.posts(page): + self.kwdict["board"], pos = text.extract( + page, 'class="board">/', '/<') + self.kwdict["title"] = text.unescape(text.extract( + page, "<h3>", "</h3>", pos)[0]) + yield Message.Directory, "", {} + for post in self.posts(page): url = post["url"] if not url: continue @@ -42,20 +57,10 @@ class _2chenThreadExtractor(Extractor): url = self.root + url post["url"] = url = url.partition("?")[0] - post.update(data) post["time"] = text.parse_int(post["date"].timestamp()) yield Message.Url, url, text.nameext_from_url( post["filename"], post) - def metadata(self, page): - board, pos = text.extract(page, 'class="board">/', '/<') - title = text.extract(page, "<h3>", "</h3>", pos)[0] - return { - "board" : board, - "thread": self.thread, - "title" : text.unescape(title), - } - def posts(self, page): """Return iterable with relevant posts""" return map(self.parse, text.extract_iter( @@ -65,31 +70,25 @@ class _2chenThreadExtractor(Extractor): extr = text.extract_from(post) return { "name" : text.unescape(extr("<span>", "</span>")), - "date" : text.parse_datetime( + "date" : self.parse_datetime( extr("<time", "<").partition(">")[2], "%d %b %Y (%a) %H:%M:%S" ), "no" : extr('href="#p', '"'), - "url" : extr('</a><a href="', '"'), "filename": text.unescape(extr('download="', '"')), + "url" : text.extr(extr("<figure>", "</"), 'href="', '"'), "hash" : extr('data-hash="', '"'), } -class _2chenBoardExtractor(Extractor): +class _2chenBoardExtractor(_2chenExtractor): """Extractor for 2chen boards""" - category = "2chen" subcategory = "board" - root = "https://sturdychan.help" - pattern = BASE_PATTERN + r"/([^/?#]+)(?:/catalog|/?$)" + pattern = rf"{BASE_PATTERN}/([^/?#]+)(?:/catalog|/?$)" example = "https://sturdychan.help/a/" - def __init__(self, match): - Extractor.__init__(self, match) - self.board = match[1] - def items(self): - url = f"{self.root}/{self.board}/catalog" + url = f"{self.root}/{self.groups[-1]}/catalog" page = self.request(url, notfound="board").text 
data = {"_extractor": _2chenThreadExtractor} for thread in text.extract_iter( diff --git a/gallery_dl/extractor/35photo.py b/gallery_dl/extractor/35photo.py index ec5f0cb..154295e 100644 --- a/gallery_dl/extractor/35photo.py +++ b/gallery_dl/extractor/35photo.py @@ -29,7 +29,7 @@ class _35photoExtractor(Extractor): url = photo["url"] if first: first = False - yield Message.Directory, photo + yield Message.Directory, "", photo yield Message.Url, url, text.nameext_from_url(url, photo) def metadata(self): diff --git a/gallery_dl/extractor/4archive.py b/gallery_dl/extractor/4archive.py index 4c43464..a6dedde 100644 --- a/gallery_dl/extractor/4archive.py +++ b/gallery_dl/extractor/4archive.py @@ -7,7 +7,7 @@ """Extractors for https://4archive.org/""" from .common import Extractor, Message -from .. import text, util +from .. import text, dt class _4archiveThreadExtractor(Extractor): @@ -37,8 +37,8 @@ class _4archiveThreadExtractor(Extractor): for post in posts: post.update(data) - post["time"] = int(util.datetime_to_timestamp(post["date"])) - yield Message.Directory, post + post["time"] = int(dt.to_ts(post["date"])) + yield Message.Directory, "", post if "url" in post: yield Message.Url, post["url"], text.nameext_from_url( post["filename"], post) @@ -61,10 +61,9 @@ class _4archiveThreadExtractor(Extractor): extr = text.extract_from(post) data = { "name": extr('class="name">', "</span>"), - "date": text.parse_datetime( + "date": self.parse_datetime_iso( (extr('class="dateTime">', "<") or - extr('class="dateTime postNum" >', "<")).strip(), - "%Y-%m-%d %H:%M:%S"), + extr('class="dateTime postNum" >', "<")).strip()), "no" : text.parse_int(extr(">Post No.", "<")), } if 'class="file"' in post: diff --git a/gallery_dl/extractor/4chan.py b/gallery_dl/extractor/4chan.py index d81f305..ba24899 100644 --- a/gallery_dl/extractor/4chan.py +++ b/gallery_dl/extractor/4chan.py @@ -38,7 +38,7 @@ class _4chanThreadExtractor(Extractor): "title" : text.unescape(title)[:50], } - yield 
Message.Directory, data + yield Message.Directory, "", data for post in posts: if "filename" in post: post.update(data) diff --git a/gallery_dl/extractor/4chanarchives.py b/gallery_dl/extractor/4chanarchives.py index c187b41..16f4b39 100644 --- a/gallery_dl/extractor/4chanarchives.py +++ b/gallery_dl/extractor/4chanarchives.py @@ -40,7 +40,7 @@ class _4chanarchivesThreadExtractor(Extractor): for post in posts: post.update(data) - yield Message.Directory, post + yield Message.Directory, "", post if "url" in post: yield Message.Url, post["url"], post diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py index d1ac503..b74bc90 100644 --- a/gallery_dl/extractor/500px.py +++ b/gallery_dl/extractor/500px.py @@ -31,7 +31,7 @@ class _500pxExtractor(Extractor): photo["extension"] = photo["image_format"] if data: photo.update(data) - yield Message.Directory, photo + yield Message.Directory, "", photo yield Message.Url, url, photo def metadata(self): @@ -92,7 +92,7 @@ class _500pxExtractor(Extractor): class _500pxUserExtractor(_500pxExtractor): """Extractor for photos from a user's photostream on 500px.com""" subcategory = "user" - pattern = BASE_PATTERN + r"/(?!photo/|liked)(?:p/)?([^/?#]+)/?(?:$|[?#])" + pattern = rf"{BASE_PATTERN}/(?!photo/|liked)(?:p/)?([^/?#]+)/?(?:$|[?#])" example = "https://500px.com/USER" def __init__(self, match): @@ -121,8 +121,8 @@ class _500pxGalleryExtractor(_500pxExtractor): """Extractor for photo galleries on 500px.com""" subcategory = "gallery" directory_fmt = ("{category}", "{user[username]}", "{gallery[name]}") - pattern = (BASE_PATTERN + r"/(?!photo/)(?:p/)?" - r"([^/?#]+)/galleries/([^/?#]+)") + pattern = (rf"{BASE_PATTERN}/(?!photo/)(?:p/)?" 
+ rf"([^/?#]+)/galleries/([^/?#]+)") example = "https://500px.com/USER/galleries/GALLERY" def __init__(self, match): @@ -178,7 +178,7 @@ class _500pxGalleryExtractor(_500pxExtractor): class _500pxFavoriteExtractor(_500pxExtractor): """Extractor for favorite 500px photos""" subcategory = "favorite" - pattern = BASE_PATTERN + r"/liked/?$" + pattern = rf"{BASE_PATTERN}/liked/?$" example = "https://500px.com/liked" def photos(self): @@ -202,7 +202,7 @@ class _500pxFavoriteExtractor(_500pxExtractor): class _500pxImageExtractor(_500pxExtractor): """Extractor for individual images from 500px.com""" subcategory = "image" - pattern = BASE_PATTERN + r"/photo/(\d+)" + pattern = rf"{BASE_PATTERN}/photo/(\d+)" example = "https://500px.com/photo/12345/TITLE" def __init__(self, match): diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py index 0385067..3230182 100644 --- a/gallery_dl/extractor/8chan.py +++ b/gallery_dl/extractor/8chan.py @@ -9,9 +9,8 @@ """Extractors for https://8chan.moe/""" from .common import Extractor, Message -from .. import text, util +from .. 
import text, dt from ..cache import memcache -from datetime import timedelta import itertools BASE_PATTERN = r"(?:https?://)?8chan\.(moe|se|cc)" @@ -44,7 +43,7 @@ class _8chanExtractor(Extractor): def cookies_prepare(self): # fetch captcha cookies # (necessary to download without getting interrupted) - now = util.datetime_utcnow() + now = dt.now() url = self.root + "/captcha.js" params = {"d": now.strftime("%a %b %d %Y %H:%M:%S GMT+0000 (UTC)")} self.request(url, params=params).content @@ -57,7 +56,7 @@ class _8chanExtractor(Extractor): if cookie.domain.endswith(domain): cookie.expires = None if cookie.name == "captchaexpiration": - cookie.value = (now + timedelta(30, 300)).strftime( + cookie.value = (now + dt.timedelta(30, 300)).strftime( "%a, %d %b %Y %H:%M:%S GMT") return self.cookies @@ -70,7 +69,7 @@ class _8chanThreadExtractor(_8chanExtractor): "{threadId} {subject[:50]}") filename_fmt = "{postId}{num:?-//} {filename[:200]}.{extension}" archive_fmt = "{boardUri}_{postId}_{num}" - pattern = BASE_PATTERN + r"/([^/?#]+)/(?:res|last)/(\d+)" + pattern = rf"{BASE_PATTERN}/([^/?#]+)/(?:res|last)/(\d+)" example = "https://8chan.moe/a/res/12345.html" def items(self): @@ -92,7 +91,7 @@ class _8chanThreadExtractor(_8chanExtractor): # download files posts = thread.pop("posts", ()) - yield Message.Directory, thread + yield Message.Directory, "", thread for post in itertools.chain((thread,), posts): files = post.pop("files", ()) if not files: @@ -108,7 +107,7 @@ class _8chanThreadExtractor(_8chanExtractor): class _8chanBoardExtractor(_8chanExtractor): """Extractor for 8chan boards""" subcategory = "board" - pattern = BASE_PATTERN + r"/([^/?#]+)/(?:(\d+)\.html)?$" + pattern = rf"{BASE_PATTERN}/([^/?#]+)/(?:(\d+)\.html)?$" example = "https://8chan.moe/a/" def items(self): diff --git a/gallery_dl/extractor/8muses.py b/gallery_dl/extractor/8muses.py index 120cd8a..a8d8b44 100644 --- a/gallery_dl/extractor/8muses.py +++ b/gallery_dl/extractor/8muses.py @@ -40,7 +40,7 @@ class 
_8musesAlbumExtractor(Extractor): if images := data.get("pictures"): count = len(images) album = self._make_album(data["album"]) - yield Message.Directory, {"album": album, "count": count} + yield Message.Directory, "", {"album": album, "count": count} for num, image in enumerate(images, 1): url = self.root + "/image/fl/" + image["publicUri"] img = { @@ -85,8 +85,7 @@ class _8musesAlbumExtractor(Extractor): "parent" : text.parse_int(album["parentId"]), "views" : text.parse_int(album["numberViews"]), "likes" : text.parse_int(album["numberLikes"]), - "date" : text.parse_datetime( - album["updatedAt"], "%Y-%m-%dT%H:%M:%S.%fZ"), + "date" : self.parse_datetime_iso(album["updatedAt"]), } def _unobfuscate(self, data): diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index c7e33c8..64134d0 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -26,8 +26,10 @@ modules = [ "ao3", "arcalive", "architizer", + "arena", "artstation", "aryion", + "audiochan", "batoto", "bbc", "behance", @@ -39,9 +41,11 @@ modules = [ "booth", "bunkr", "catbox", + "cfake", "chevereto", "cien", "civitai", + "comedywildlifephoto", "comick", "comicvine", "cyberdrop", @@ -54,6 +58,7 @@ modules = [ "discord", "dynastyscans", "e621", + "eporner", "erome", "everia", "exhentai", @@ -63,6 +68,8 @@ modules = [ "fantia", "fapello", "fapachi", + "fikfap", + "fitnakedgirls", "flickr", "furaffinity", "furry34", @@ -106,6 +113,7 @@ modules = [ "kemono", "khinsider", "komikcast", + "koofr", "leakgallery", "lensdump", "lexica", @@ -140,12 +148,14 @@ modules = [ "nozomi", "nsfwalbum", "nudostar", + "okporn", "paheal", "patreon", "pexels", "philomena", "photovogue", "picarto", + "picazor", "pictoa", "piczel", "pillowfort", @@ -158,12 +168,12 @@ modules = [ "poringa", "pornhub", "pornpics", + "pornstarstube", "postmill", "rawkuma", "reactor", "readcomiconline", "realbooru", - "redbust", "reddit", "redgifs", "rule34us", @@ -179,7 +189,6 @@ modules = [ 
"senmanga", "sexcom", "shimmie2", - "simpcity", "simplyhentai", "sizebooru", "skeb", @@ -190,6 +199,7 @@ modules = [ "speakerdeck", "steamgriddb", "subscribestar", + "sxypix", "szurubooru", "tapas", "tcbscans", @@ -221,11 +231,13 @@ modules = [ "webmshare", "webtoons", "weebcentral", + "weebdex", "weibo", "wikiart", "wikifeet", "wikimedia", "xasiat", + "xenforo", "xfolio", "xhamster", "xvideos", @@ -299,7 +311,7 @@ def _list_classes(): def _modules_internal(): globals_ = globals() for module_name in modules: - yield __import__(module_name, globals_, None, (), 1) + yield __import__(module_name, globals_, None, None, 1) def _modules_path(path, files): diff --git a/gallery_dl/extractor/adultempire.py b/gallery_dl/extractor/adultempire.py index 3249ae6..e9adf97 100644 --- a/gallery_dl/extractor/adultempire.py +++ b/gallery_dl/extractor/adultempire.py @@ -33,7 +33,7 @@ class AdultempireGalleryExtractor(GalleryExtractor): "gallery_id": text.parse_int(self.gallery_id), "title" : text.unescape(extr('title="', '"')), "studio" : extr(">studio</small>", "<").strip(), - "date" : text.parse_datetime(extr( + "date" : self.parse_datetime(extr( ">released</small>", "<").strip(), "%m/%d/%Y"), "actors" : sorted(text.split_html(extr( '<ul class="item-details item-cast-list ', '</ul>'))[1:]), diff --git a/gallery_dl/extractor/agnph.py b/gallery_dl/extractor/agnph.py index 5bb1835..55b17c7 100644 --- a/gallery_dl/extractor/agnph.py +++ b/gallery_dl/extractor/agnph.py @@ -9,7 +9,7 @@ """Extractors for https://agn.ph/""" from . import booru -from .. import text, util +from .. 
import text import collections BASE_PATTERN = r"(?:https?://)?agn\.ph" @@ -33,7 +33,7 @@ class AgnphExtractor(booru.BooruExtractor): self.cookies.set("confirmed_age", "true", domain="agn.ph") def _prepare(self, post): - post["date"] = text.parse_timestamp(post["created_at"]) + post["date"] = self.parse_timestamp(post["created_at"]) post["status"] = post["status"].strip() post["has_children"] = ("true" in post["has_children"]) @@ -70,7 +70,7 @@ class AgnphExtractor(booru.BooruExtractor): return tags = collections.defaultdict(list) - pattern = util.re(r'class="(.)typetag">([^<]+)') + pattern = text.re(r'class="(.)typetag">([^<]+)') for tag_type, tag_name in pattern.findall(tag_container): tags[tag_type].append(text.unquote(tag_name).replace(" ", "_")) for key, value in tags.items(): @@ -81,7 +81,7 @@ class AgnphTagExtractor(AgnphExtractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{id}" - pattern = BASE_PATTERN + r"/gallery/post/(?:\?([^#]+))?$" + pattern = rf"{BASE_PATTERN}/gallery/post/(?:\?([^#]+))?$" example = "https://agn.ph/gallery/post/?search=TAG" def __init__(self, match): @@ -99,7 +99,7 @@ class AgnphTagExtractor(AgnphExtractor): class AgnphPostExtractor(AgnphExtractor): subcategory = "post" archive_fmt = "{id}" - pattern = BASE_PATTERN + r"/gallery/post/show/(\d+)" + pattern = rf"{BASE_PATTERN}/gallery/post/show/(\d+)" example = "https://agn.ph/gallery/post/show/12345/" def posts(self): diff --git a/gallery_dl/extractor/ao3.py b/gallery_dl/extractor/ao3.py index 60380c4..716492e 100644 --- a/gallery_dl/extractor/ao3.py +++ b/gallery_dl/extractor/ao3.py @@ -118,7 +118,7 @@ class Ao3WorkExtractor(Ao3Extractor): directory_fmt = ("{category}", "{author}") filename_fmt = "{id} {title}.{extension}" archive_fmt = "{id}.{extension}" - pattern = BASE_PATTERN + r"/works/(\d+)" + pattern = rf"{BASE_PATTERN}/works/(\d+)" example = "https://archiveofourown.org/works/12345" def _init(self): @@ -182,11 +182,11 @@ 
class Ao3WorkExtractor(Ao3Extractor): extr('<dd class="freeform tags">', "</dd>")), "lang" : extr('<dd class="language" lang="', '"'), "series" : extr('<dd class="series">', "</dd>"), - "date" : text.parse_datetime( - extr('<dd class="published">', "<"), "%Y-%m-%d"), - "date_completed": text.parse_datetime( - extr('>Completed:</dt><dd class="status">', "<"), "%Y-%m-%d"), - "date_updated" : text.parse_timestamp( + "date" : self.parse_datetime_iso(extr( + '<dd class="published">', "<")), + "date_completed": self.parse_datetime_iso(extr( + '>Completed:</dt><dd class="status">', "<")), + "date_updated" : self.parse_timestamp( path.rpartition("updated_at=")[2]), "words" : text.parse_int( extr('<dd class="words">', "<").replace(",", "")), @@ -220,7 +220,7 @@ class Ao3WorkExtractor(Ao3Extractor): else: data["series"] = None - yield Message.Directory, data + yield Message.Directory, "", data for fmt in self.formats: try: url = text.urljoin(self.root, fmts[fmt]) @@ -233,28 +233,28 @@ class Ao3WorkExtractor(Ao3Extractor): class Ao3SeriesExtractor(Ao3Extractor): """Extractor for AO3 works of a series""" subcategory = "series" - pattern = BASE_PATTERN + r"(/series/(\d+))" + pattern = rf"{BASE_PATTERN}(/series/(\d+))" example = "https://archiveofourown.org/series/12345" class Ao3TagExtractor(Ao3Extractor): """Extractor for AO3 works by tag""" subcategory = "tag" - pattern = BASE_PATTERN + r"(/tags/([^/?#]+)/works(?:/?\?.+)?)" + pattern = rf"{BASE_PATTERN}(/tags/([^/?#]+)/works(?:/?\?.+)?)" example = "https://archiveofourown.org/tags/TAG/works" class Ao3SearchExtractor(Ao3Extractor): """Extractor for AO3 search results""" subcategory = "search" - pattern = BASE_PATTERN + r"(/works/search/?\?.+)" + pattern = rf"{BASE_PATTERN}(/works/search/?\?.+)" example = "https://archiveofourown.org/works/search?work_search[query]=air" class Ao3UserExtractor(Dispatch, Ao3Extractor): """Extractor for an AO3 user profile""" - pattern = (BASE_PATTERN + r"/users/([^/?#]+(?:/pseuds/[^/?#]+)?)" - 
r"(?:/profile)?/?(?:$|\?|#)") + pattern = (rf"{BASE_PATTERN}/users/([^/?#]+(?:/pseuds/[^/?#]+)?)" + rf"(?:/profile)?/?(?:$|\?|#)") example = "https://archiveofourown.org/users/USER" def items(self): @@ -269,16 +269,16 @@ class Ao3UserExtractor(Dispatch, Ao3Extractor): class Ao3UserWorksExtractor(Ao3Extractor): """Extractor for works of an AO3 user""" subcategory = "user-works" - pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?" - r"works(?:/?\?.+)?)") + pattern = (rf"{BASE_PATTERN}(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?" + rf"works(?:/?\?.+)?)") example = "https://archiveofourown.org/users/USER/works" class Ao3UserSeriesExtractor(Ao3Extractor): """Extractor for series of an AO3 user""" subcategory = "user-series" - pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?" - r"series(?:/?\?.+)?)") + pattern = (rf"{BASE_PATTERN}(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?" + rf"series(?:/?\?.+)?)") example = "https://archiveofourown.org/users/USER/series" def items(self): @@ -297,8 +297,8 @@ class Ao3UserSeriesExtractor(Ao3Extractor): class Ao3UserBookmarkExtractor(Ao3Extractor): """Extractor for bookmarked works of an AO3 user""" subcategory = "user-bookmark" - pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?" - r"bookmarks(?:/?\?.+)?)") + pattern = (rf"{BASE_PATTERN}(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?" 
+ rf"bookmarks(?:/?\?.+)?)") example = "https://archiveofourown.org/users/USER/bookmarks" def items(self): @@ -308,7 +308,7 @@ class Ao3UserBookmarkExtractor(Ao3Extractor): class Ao3SubscriptionsExtractor(Ao3Extractor): """Extractor for your AO3 account's subscriptions""" subcategory = "subscriptions" - pattern = BASE_PATTERN + r"(/users/([^/?#]+)/subscriptions(?:/?\?.+)?)" + pattern = rf"{BASE_PATTERN}(/users/([^/?#]+)/subscriptions(?:/?\?.+)?)" example = "https://archiveofourown.org/users/USER/subscriptions" def items(self): diff --git a/gallery_dl/extractor/arcalive.py b/gallery_dl/extractor/arcalive.py index 1df7e0f..f950d14 100644 --- a/gallery_dl/extractor/arcalive.py +++ b/gallery_dl/extractor/arcalive.py @@ -36,7 +36,7 @@ class ArcalivePostExtractor(ArcaliveExtractor): directory_fmt = ("{category}", "{boardSlug}") filename_fmt = "{id}_{num}{title:? //[b:230]}.{extension}" archive_fmt = "{id}_{num}" - pattern = BASE_PATTERN + r"/b/(?:\w+)/(\d+)" + pattern = rf"{BASE_PATTERN}/b/(?:\w+)/(\d+)" example = "https://arca.live/b/breaking/123456789" def items(self): @@ -49,13 +49,12 @@ class ArcalivePostExtractor(ArcaliveExtractor): files = self._extract_files(post) post["count"] = len(files) - post["date"] = text.parse_datetime( - post["createdAt"][:19], "%Y-%m-%dT%H:%M:%S") + post["date"] = self.parse_datetime_iso(post["createdAt"][:19]) post["post_url"] = post_url = \ f"{self.root}/b/{post['boardSlug']}/{post['id']}" post["_http_headers"] = {"Referer": post_url + "?p=1"} - yield Message.Directory, post + yield Message.Directory, "", post for post["num"], file in enumerate(files, 1): post.update(file) url = file["url"] @@ -64,7 +63,7 @@ class ArcalivePostExtractor(ArcaliveExtractor): def _extract_files(self, post): files = [] - for video, media in util.re(r"<(?:img|vide(o)) ([^>]+)").findall( + for video, media in text.re(r"<(?:img|vide(o)) ([^>]+)").findall( post["content"]): if not self.emoticons and 'class="arca-emoticon"' in media: continue @@ -116,7 +115,7 @@ 
class ArcalivePostExtractor(ArcaliveExtractor): class ArcaliveBoardExtractor(ArcaliveExtractor): """Extractor for an arca.live board's posts""" subcategory = "board" - pattern = BASE_PATTERN + r"/b/([^/?#]+)/?(?:\?([^#]+))?$" + pattern = rf"{BASE_PATTERN}/b/([^/?#]+)/?(?:\?([^#]+))?$" example = "https://arca.live/b/breaking" def articles(self): @@ -128,7 +127,7 @@ class ArcaliveBoardExtractor(ArcaliveExtractor): class ArcaliveUserExtractor(ArcaliveExtractor): """Extractor for an arca.live users's posts""" subcategory = "user" - pattern = BASE_PATTERN + r"/u/@([^/?#]+)/?(?:\?([^#]+))?$" + pattern = rf"{BASE_PATTERN}/u/@([^/?#]+)/?(?:\?([^#]+))?$" example = "https://arca.live/u/@USER" def articles(self): diff --git a/gallery_dl/extractor/arena.py b/gallery_dl/extractor/arena.py new file mode 100644 index 0000000..ada2fa1 --- /dev/null +++ b/gallery_dl/extractor/arena.py @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- + +# Copyright 2025 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractor for https://are.na/""" + +from .common import GalleryExtractor + + +class ArenaChannelExtractor(GalleryExtractor): + """Extractor for are.na channels""" + category = "arena" + subcategory = "channel" + root = "https://are.na" + directory_fmt = ("{category}", "{user[full_name]} ({user[id]})", + "{channel[title]} ({channel[id]})") + filename_fmt = "{num:>03}{block[id]:? 
//}.{extension}" + archive_fmt = "{channel[id]}/{block[id]}" + pattern = r"(?:https?://)?(?:www\.)?are\.na/[^/?#]+/([^/?#]+)" + example = "https://are.na/evan-collins-1522646491/cassette-futurism" + + def metadata(self, page): + channel = self.request_json( + f"https://api.are.na/v2/channels/{self.groups[0]}") + + channel["date"] = self.parse_datetime_iso( + channel["created_at"]) + channel["date_updated"] = self.parse_datetime_iso( + channel["updated_at"]) + channel.pop("contents", None) + + return { + "count" : channel.get("length"), + "user" : channel.pop("user", None), + "owner" : channel.pop("owner", None), + "channel": channel, + } + + def images(self, page): + api = f"https://api.are.na/v2/channels/{self.groups[0]}/contents" + limit = 100 + params = {"page": 1, "per": limit} + + while True: + data = self.request_json(api, params=params) + + contents = data.get("contents") + if not contents: + return + + for block in contents: + url = None + + # Attachments (e.g., PDFs, files) + if attachment := block.get("attachment"): + url = attachment.get("url") + + # Images + elif image := block.get("image"): + # Prefer original image + if original := image.get("original"): + url = original.get("url") + # Fallback to display/large image if present + elif display := image.get("display"): + url = display.get("url") + elif large := image.get("large"): + url = large.get("url") + + # Some Links/Channels may not have downloadable media + if not url: + continue + + block["date"] = self.parse_datetime_iso( + block["created_at"]) + block["date_updated"] = self.parse_datetime_iso( + block["updated_at"]) + + yield url, { + "block" : block, + "source": block.pop("source", None), + } + + if len(contents) < limit: + return + params["page"] += 1 diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py index fdb92c4..f1b55ce 100644 --- a/gallery_dl/extractor/artstation.py +++ b/gallery_dl/extractor/artstation.py @@ -47,7 +47,7 @@ class 
ArtstationExtractor(Extractor): asset.update(data) adict = asset["asset"] asset["num"] = num - yield Message.Directory, asset + yield Message.Directory, "", asset if adict["has_embedded_player"]: if url := self._extract_embed(asset): @@ -126,8 +126,7 @@ class ArtstationExtractor(Extractor): data["title"] = text.unescape(data["title"]) data["description"] = text.unescape(text.remove_html( data["description"])) - data["date"] = text.parse_datetime( - data["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z") + data["date"] = self.parse_datetime_iso(data["created_at"]) assets = data["assets"] del data["assets"] @@ -334,7 +333,7 @@ class ArtstationChallengeExtractor(ArtstationExtractor): update_url = f"{self.root}/contests/submission_updates.json" challenge = self.request_json(challenge_url) - yield Message.Directory, {"challenge": challenge} + yield Message.Directory, "", {"challenge": challenge} params = {"sorting": self.sorting} for submission in self._pagination(submission_url, params): diff --git a/gallery_dl/extractor/aryion.py b/gallery_dl/extractor/aryion.py index 38b8ee4..5e5d1f2 100644 --- a/gallery_dl/extractor/aryion.py +++ b/gallery_dl/extractor/aryion.py @@ -9,10 +9,9 @@ """Extractors for https://aryion.com/""" from .common import Extractor, Message -from .. import text, util, exception +from .. 
import text, util, dt, exception from ..cache import cache from email.utils import parsedate_tz -from datetime import datetime BASE_PATTERN = r"(?:https?://)?(?:www\.)?aryion\.com/g4" @@ -20,7 +19,7 @@ BASE_PATTERN = r"(?:https?://)?(?:www\.)?aryion\.com/g4" class AryionExtractor(Extractor): """Base class for aryion extractors""" category = "aryion" - directory_fmt = ("{category}", "{user!l}", "{path:J - }") + directory_fmt = ("{category}", "{user!l}", "{path:I}") filename_fmt = "{id} {title}.{extension}" archive_fmt = "{id}" cookies_domain = ".aryion.com" @@ -64,7 +63,7 @@ class AryionExtractor(Extractor): if post := self._parse_post(post_id): if data: post.update(data) - yield Message.Directory, post + yield Message.Directory, "", post yield Message.Url, post["url"], post elif post is False and self.recursive: base = self.root + "/g4/view/" @@ -78,20 +77,20 @@ class AryionExtractor(Extractor): def metadata(self): """Return general metadata""" - def _pagination_params(self, url, params=None, needle=None): + def _pagination_params(self, url, params=None, needle=None, quote="'"): if params is None: params = {"p": 1} else: params["p"] = text.parse_int(params.get("p"), 1) if needle is None: - needle = "class='gallery-item' id='" + needle = "class='gallery-item' id=" + quote while True: page = self.request(url, params=params).text cnt = 0 - for post_id in text.extract_iter(page, needle, "'"): + for post_id in text.extract_iter(page, needle, quote): cnt += 1 yield post_id @@ -109,6 +108,42 @@ class AryionExtractor(Extractor): return url = self.root + text.rextr(page, "href='", "'", pos) + def _pagination_folders(self, url, folder=None, seen=None): + if folder is None: + self.kwdict["folder"] = "" + else: + url = f"{url}/{folder}" + self.kwdict["folder"] = folder = text.unquote(folder) + self.log.debug("Descending into folder '%s'", folder) + + params = {"p": 1} + while True: + page = self.request(url, params=params).text + + cnt = 0 + for item in text.extract_iter( + 
page, "<li class='gallery-item", "</li>"): + cnt += 1 + if text.extr(item, 'data-item-type="', '"') == "Folders": + folder = text.extr(item, "href='", "'").rpartition("/")[2] + if seen is None: + seen = set() + if folder not in seen: + seen.add(folder) + if self.recursive: + yield from self._pagination_folders( + url, folder, seen) + else: + self.log.debug("Skipping folder '%s'", folder) + else: + yield text.extr(item, "data-item-id='", "'") + + if cnt < 40 and ">Next >><" not in page: + break + params["p"] += 1 + + self.kwdict["folder"] = "" + def _parse_post(self, post_id): url = f"{self.root}/g4/data.php?id={post_id}" with self.request(url, method="HEAD", fatal=False) as response: @@ -154,9 +189,11 @@ class AryionExtractor(Extractor): "user" : self.user or artist, "title" : title, "artist": artist, + "description": text.unescape(extr( + 'property="og:description" content="', '"')), "path" : text.split_html(extr( "cookiecrumb'>", '</span'))[4:-1:2], - "date" : datetime(*parsedate_tz(lmod)[:6]), + "date" : dt.datetime(*parsedate_tz(lmod)[:6]), "size" : text.parse_int(clen), "views" : text.parse_int(extr("Views</b>:", "<").replace(",", "")), "width" : text.parse_int(extr("Resolution</b>:", "x")), @@ -164,8 +201,6 @@ class AryionExtractor(Extractor): "comments" : text.parse_int(extr("Comments</b>:", "<")), "favorites": text.parse_int(extr("Favorites</b>:", "<")), "tags" : text.split_html(extr("class='taglist'>", "</span>")), - "description": text.unescape(text.remove_html(extr( - "<p>", "</p>"), "", "")), "filename" : fname, "extension": ext, "_http_lastmodified": lmod, @@ -176,14 +211,11 @@ class AryionGalleryExtractor(AryionExtractor): """Extractor for a user's gallery on eka's portal""" subcategory = "gallery" categorytransfer = True - pattern = BASE_PATTERN + r"/(?:gallery/|user/|latest.php\?name=)([^/?#]+)" + pattern = rf"{BASE_PATTERN}/(?:gallery/|user/|latest.php\?name=)([^/?#]+)" example = "https://aryion.com/g4/gallery/USER" - def __init__(self, match): - 
AryionExtractor.__init__(self, match) - self.offset = 0 - def _init(self): + self.offset = 0 self.recursive = self.config("recursive", True) def skip(self, num): @@ -204,15 +236,34 @@ class AryionGalleryExtractor(AryionExtractor): class AryionFavoriteExtractor(AryionExtractor): """Extractor for a user's favorites gallery""" subcategory = "favorite" - directory_fmt = ("{category}", "{user!l}", "favorites") + directory_fmt = ("{category}", "{user!l}", "favorites", "{folder}") archive_fmt = "f_{user}_{id}" - categorytransfer = True - pattern = BASE_PATTERN + r"/favorites/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/favorites/([^/?#]+)(?:/([^?#]+))?" example = "https://aryion.com/g4/favorites/USER" + def _init(self): + self.recursive = self.config("recursive", True) + def posts(self): url = f"{self.root}/g4/favorites/{self.user}" - return self._pagination_params(url, None, "data-item-id='") + return self._pagination_folders(url, self.groups[1]) + + +class AryionWatchExtractor(AryionExtractor): + """Extractor for your watched users and tags""" + subcategory = "watch" + directory_fmt = ("{category}", "{user!l}",) + pattern = rf"{BASE_PATTERN}/messagepage\.php()" + example = "https://aryion.com/g4/messagepage.php" + + def posts(self): + if not self.cookies_check(self.cookies_names): + raise exception.AuthRequired( + ("username & password", "authenticated cookies"), + "watched Submissions") + self.cookies.set("g4p_msgpage_style", "plain", domain="aryion.com") + url = self.root + "/g4/messagepage.php" + return self._pagination_params(url, None, 'data-item-id="', '"') class AryionTagExtractor(AryionExtractor): @@ -220,7 +271,7 @@ class AryionTagExtractor(AryionExtractor): subcategory = "tag" directory_fmt = ("{category}", "tags", "{search_tags}") archive_fmt = "t_{search_tags}_{id}" - pattern = BASE_PATTERN + r"/tags\.php\?([^#]+)" + pattern = rf"{BASE_PATTERN}/tags\.php\?([^#]+)" example = "https://aryion.com/g4/tags.php?tag=TAG" def _init(self): @@ -235,10 +286,34 @@ class 
AryionTagExtractor(AryionExtractor): return self._pagination_params(url, self.params) +class AryionSearchExtractor(AryionExtractor): + """Extractor for searches on eka's portal""" + subcategory = "search" + directory_fmt = ("{category}", "searches", "{search[prefix]}" + "{search[q]|search[tags]|search[user]}") + archive_fmt = ("s_{search[prefix]}" + "{search[q]|search[tags]|search[user]}_{id}") + pattern = rf"{BASE_PATTERN}/search\.php\?([^#]+)" + example = "https://aryion.com/g4/search.php?q=TEXT&tags=TAGS&user=USER" + + def metadata(self): + params = text.parse_query(self.user) + return {"search": { + **params, + "prefix": ("" if params.get("q") else + "t_" if params.get("tags") else + "u_" if params.get("user") else ""), + }} + + def posts(self): + url = f"{self.root}/g4/search.php?{self.user}" + return self._pagination_next(url) + + class AryionPostExtractor(AryionExtractor): """Extractor for individual posts on eka's portal""" subcategory = "post" - pattern = BASE_PATTERN + r"/view/(\d+)" + pattern = rf"{BASE_PATTERN}/view/(\d+)" example = "https://aryion.com/g4/view/12345" def posts(self): diff --git a/gallery_dl/extractor/audiochan.py b/gallery_dl/extractor/audiochan.py new file mode 100644 index 0000000..b708ce7 --- /dev/null +++ b/gallery_dl/extractor/audiochan.py @@ -0,0 +1,158 @@ +# -*- coding: utf-8 -*- + +# Copyright 2025 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://audiochan.com/""" + +from .common import Extractor, Message +from .. 
import text + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?audiochan\.com" + + +class AudiochanExtractor(Extractor): + """Base class for audiochan extractors""" + category = "audiochan" + root = "https://audiochan.com" + root_api = "https://api.audiochan.com" + directory_fmt = ("{category}", "{user[display_name]}") + filename_fmt = "{title} ({slug}).{extension}" + archive_fmt = "{audioFile[id]}" + + def _init(self): + self.user = False + self.headers_api = { + "content-type" : "application/json", + "Origin" : self.root, + "Sec-Fetch-Dest" : "empty", + "Sec-Fetch-Mode" : "cors", + "Sec-Fetch-Site" : "same-site", + } + self.headers_dl = { + "Accept": "audio/webm,audio/ogg,audio/wav,audio/*;q=0.9," + "application/ogg;q=0.7,video/*;q=0.6,*/*;q=0.5", + "Sec-Fetch-Dest" : "audio", + "Sec-Fetch-Mode" : "no-cors", + "Sec-Fetch-Site" : "same-site", + "Accept-Encoding": "identity", + } + + def items(self): + for post in self.posts(): + file = post["audioFile"] + + post["_http_headers"] = self.headers_dl + post["date"] = self.parse_datetime_iso(file["created_at"]) + post["date_updated"] = self.parse_datetime_iso(file["updated_at"]) + post["description"] = self._extract_description( + post["description"]) + + tags = [] + for tag in post["tags"]: + if "tag" in tag: + tag = tag["tag"] + tags.append(f"{tag['category']}:{tag['name']}") + post["tags"] = tags + + if self.user: + post["user"] = post["credits"][0]["user"] + + if not (url := file["url"]): + post["_http_segmented"] = 600000 + url = file["stream_url"] + + yield Message.Directory, "", post + text.nameext_from_name(file["filename"], post) + yield Message.Url, url, post + + def request_api(self, endpoint, params=None): + url = self.root_api + endpoint + return self.request_json(url, params=params, headers=self.headers_api) + + def _pagination(self, endpoint, params, key=None): + params["page"] = 1 + params["limit"] = "12" + + while True: + data = self.request_api(endpoint, params) + if key is not None: + data = data[key] + + 
yield from data["data"] + + if not data["has_more"]: + break + params["page"] += 1 + + def _extract_description(self, description, texts=None): + if texts is None: + texts = [] + + if "text" in description: + texts.append(description["text"]) + elif "content" in description: + for desc in description["content"]: + self._extract_description(desc, texts) + + return texts + + +class AudiochanAudioExtractor(AudiochanExtractor): + subcategory = "audio" + pattern = rf"{BASE_PATTERN}/a/([^/?#]+)" + example = "https://audiochan.com/a/SLUG" + + def posts(self): + self.user = True + audio = self.request_api("/audios/slug/" + self.groups[0]) + return (audio,) + + +class AudiochanUserExtractor(AudiochanExtractor): + subcategory = "user" + pattern = rf"{BASE_PATTERN}/u/([^/?#]+)" + example = "https://audiochan.com/u/USER" + + def posts(self): + endpoint = "/users/" + self.groups[0] + self.kwdict["user"] = self.request_api(endpoint)["data"] + + params = { + "sfw_only": "false", + "sort" : "new", + } + return self._pagination(endpoint + "/audios", params) + + +class AudiochanCollectionExtractor(AudiochanExtractor): + subcategory = "collection" + pattern = rf"{BASE_PATTERN}/c/([^/?#]+)" + example = "https://audiochan.com/c/SLUG" + + def posts(self): + slug = self.groups[0] + endpoint = "/collections/" + slug + self.kwdict["collection"] = col = self.request_api(endpoint) + col.pop("audios", None) + col.pop("items", None) + + endpoint = f"/collections/slug/{slug}/items" + return self._pagination(endpoint, {}) + + +class AudiochanSearchExtractor(AudiochanExtractor): + subcategory = "search" + pattern = rf"{BASE_PATTERN}/search/?\?([^#]+)" + example = "https://audiochan.com/search?q=QUERY" + + def posts(self): + self.user = True + endpoint = "/search" + params = text.parse_query(self.groups[0]) + params["sfw_only"] = "false" + self.kwdict["search_tags"] = params.get("q") + return self._pagination(endpoint, params, "audios") diff --git a/gallery_dl/extractor/batoto.py 
b/gallery_dl/extractor/batoto.py index a7d1b78..f8e803b 100644 --- a/gallery_dl/extractor/batoto.py +++ b/gallery_dl/extractor/batoto.py @@ -80,7 +80,7 @@ class BatotoBase(): class BatotoChapterExtractor(BatotoBase, ChapterExtractor): """Extractor for batoto manga chapters""" archive_fmt = "{chapter_id}_{page}" - pattern = BASE_PATTERN + r"/(?:title/[^/?#]+|chapter)/(\d+)" + pattern = rf"{BASE_PATTERN}/(?:title/[^/?#]+|chapter)/(\d+)" example = "https://xbato.org/title/12345-MANGA/54321" def __init__(self, match): @@ -104,7 +104,7 @@ class BatotoChapterExtractor(BatotoBase, ChapterExtractor): info = text.remove_html(extr('link-hover">', "</")) info = text.unescape(info) - match = util.re( + match = text.re( r"(?i)(?:(?:Volume|S(?:eason)?)\s*(\d+)\s+)?" r"(?:Chapter|Episode)\s*(\d+)([\w.]*)").match(info) if match: @@ -123,7 +123,7 @@ class BatotoChapterExtractor(BatotoBase, ChapterExtractor): "chapter_minor" : minor, "chapter_string": info, "chapter_id" : text.parse_int(self.chapter_id), - "date" : text.parse_timestamp(extr(' time="', '"')[:-3]), + "date" : self.parse_timestamp(extr(' time="', '"')[:-3]), } def images(self, page): @@ -139,8 +139,8 @@ class BatotoMangaExtractor(BatotoBase, MangaExtractor): """Extractor for batoto manga""" reverse = False chapterclass = BatotoChapterExtractor - pattern = (BASE_PATTERN + - r"/(?:title/(\d+)[^/?#]*|series/(\d+)(?:/[^/?#]*)?)/?$") + pattern = (rf"{BASE_PATTERN}" + rf"/(?:title/(\d+)[^/?#]*|series/(\d+)(?:/[^/?#]*)?)/?$") example = "https://xbato.org/title/12345-MANGA/" def __init__(self, match): @@ -167,8 +167,7 @@ class BatotoMangaExtractor(BatotoBase, MangaExtractor): data["chapter"] = text.parse_int(chapter) data["chapter_minor"] = sep + minor - data["date"] = text.parse_datetime( - extr('time="', '"'), "%Y-%m-%dT%H:%M:%S.%fZ") + data["date"] = self.parse_datetime_iso(extr('time="', '"')) url = f"{self.root}/title/{href}" results.append((url, data.copy())) @@ -188,9 +187,9 @@ def _manga_info(self, manga_id, 
page=None): "manga" : data["name"][1], "manga_id" : text.parse_int(manga_id), "manga_slug" : data["slug"][1], - "manga_date" : text.parse_timestamp( + "manga_date" : self.parse_timestamp( data["dateCreate"][1] // 1000), - "manga_date_updated": text.parse_timestamp( + "manga_date_updated": self.parse_timestamp( data["dateUpdate"][1] / 1000), "author" : json_list(data["authors"]), "artist" : json_list(data["artists"]), diff --git a/gallery_dl/extractor/bbc.py b/gallery_dl/extractor/bbc.py index 8efb3db..cb357d1 100644 --- a/gallery_dl/extractor/bbc.py +++ b/gallery_dl/extractor/bbc.py @@ -18,11 +18,10 @@ class BbcGalleryExtractor(GalleryExtractor): """Extractor for a programme gallery on bbc.co.uk""" category = "bbc" root = "https://www.bbc.co.uk" - directory_fmt = ("{category}", "{path[0]}", "{path[1]}", "{path[2]}", - "{path[3:]:J - /}") + directory_fmt = ("{category}", "{path:I}") filename_fmt = "{num:>02}.{extension}" archive_fmt = "{programme}_{num}" - pattern = BASE_PATTERN + r"[^/?#]+(?!/galleries)(?:/[^/?#]+)?)$" + pattern = rf"{BASE_PATTERN}[^/?#]+(?!/galleries)(?:/[^/?#]+)?)$" example = "https://www.bbc.co.uk/programmes/PATH" def metadata(self, page): @@ -72,7 +71,7 @@ class BbcProgrammeExtractor(Extractor): category = "bbc" subcategory = "programme" root = "https://www.bbc.co.uk" - pattern = BASE_PATTERN + r"[^/?#]+/galleries)(?:/?\?page=(\d+))?" + pattern = rf"{BASE_PATTERN}[^/?#]+/galleries)(?:/?\?page=(\d+))?" 
example = "https://www.bbc.co.uk/programmes/ID/galleries" def items(self): diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py index 4a7c074..bb0562d 100644 --- a/gallery_dl/extractor/behance.py +++ b/gallery_dl/extractor/behance.py @@ -67,7 +67,7 @@ class BehanceExtractor(Extractor): tags = [tag["title"] for tag in tags] data["tags"] = tags - data["date"] = text.parse_timestamp( + data["date"] = self.parse_timestamp( data.get("publishedOn") or data.get("conceived_on") or 0) if creator := data.get("creator"): @@ -109,7 +109,7 @@ class BehanceGalleryExtractor(BehanceExtractor): imgs = self.get_images(data) data["count"] = len(imgs) - yield Message.Directory, data + yield Message.Directory, "", data for data["num"], (url, module) in enumerate(imgs, 1): data["module"] = module data["extension"] = (module.get("extension") or diff --git a/gallery_dl/extractor/bellazon.py b/gallery_dl/extractor/bellazon.py index ce50a91..33f4ad3 100644 --- a/gallery_dl/extractor/bellazon.py +++ b/gallery_dl/extractor/bellazon.py @@ -46,8 +46,8 @@ class BellazonExtractor(Extractor): data = {"post": post} post["count"] = data["count"] = len(urls) - yield Message.Directory, data - data["num"] = 0 + yield Message.Directory, "", data + data["num"] = data["num_internal"] = data["num_external"] = 0 for info, url, url_img in urls: url = text.unescape(url or url_img) @@ -59,27 +59,35 @@ class BellazonExtractor(Extractor): ): continue data["num"] += 1 + data["num_internal"] += 1 if not (alt := text.extr(info, ' alt="', '"')) or ( alt.startswith("post-") and "_thumb." 
in alt): - name = url + dc = text.nameext_from_url(url, data.copy()) else: - name = text.unescape(alt) + dc = data.copy() + dc["name"] = name = text.unescape(alt) + dc["filename"] = name.partition(".")[0] - dc = text.nameext_from_url(name, data.copy()) dc["id"] = text.extr(info, 'data-fileid="', '"') if ext := text.extr(info, 'data-fileext="', '"'): dc["extension"] = ext elif "/core/interface/file/attachment.php" in url: if not dc["id"]: - dc["id"] = url.rpartition("?id=")[2] + dc["id"] = \ + url.rpartition("?id=")[2].partition("&")[0] if name := text.extr(info, ">", "<").strip(): - text.nameext_from_url(name, dc) + dc["name"] = name = text.unescape(name) + text.nameext_from_name(name, dc) + else: + dc["extension"] = text.ext_from_url(url) if url[0] == "/": url = f"https:{url}" yield Message.Url, url, dc else: + data["num"] += 1 + data["num_external"] += 1 yield Message.Queue, url, data def _pagination(self, base, pnum=None): @@ -106,7 +114,7 @@ class BellazonExtractor(Extractor): def _pagination_reverse(self, base, pnum=None): base = f"{self.root}{base}" - url = f"{base}/page/9999/" # force redirect to highest page number + url = f"{base}/page/{'9999' if pnum is None else pnum}/" with self.request(url) as response: parts = response.url.rsplit("/", 3) pnum = text.parse_int(parts[2]) if parts[1] == "page" else 1 @@ -130,7 +138,7 @@ class BellazonExtractor(Extractor): author = schema["author"] stats = schema["interactionStatistic"] url_t = schema["url"] - url_a = author["url"] + url_a = author.get("url") or "" path = text.split_html(text.extr( page, '<nav class="ipsBreadcrumb', "</nav>"))[2:-1] @@ -141,8 +149,8 @@ class BellazonExtractor(Extractor): "title": schema["headline"], "views": stats[0]["userInteractionCount"], "posts": stats[1]["userInteractionCount"], - "date" : text.parse_datetime(schema["datePublished"]), - "date_updated": text.parse_datetime(schema["dateModified"]), + "date" : self.parse_datetime_iso(schema["datePublished"]), + "date_updated": 
self.parse_datetime_iso(schema["dateModified"]), "description" : text.unescape(schema["text"]).strip(), "section" : path[-2], "author" : author["name"], @@ -151,8 +159,12 @@ class BellazonExtractor(Extractor): thread["id"], _, thread["slug"] = \ url_t.rsplit("/", 2)[1].partition("-") - thread["author_id"], _, thread["author_slug"] = \ - url_a.rsplit("/", 2)[1].partition("-") + + if url_a: + thread["author_id"], _, thread["author_slug"] = \ + url_a.rsplit("/", 2)[1].partition("-") + else: + thread["author_id"] = thread["author_slug"] = "" return thread @@ -162,15 +174,18 @@ class BellazonExtractor(Extractor): post = { "id": extr('id="elComment_', '"'), "author_url": extr(" href='", "'"), - "date": text.parse_datetime(extr("datetime='", "'")), + "date": self.parse_datetime_iso(extr("datetime='", "'")), "content": extr("<!-- Post content -->", "\n\t\t</div>"), } if (pos := post["content"].find(">")) >= 0: post["content"] = post["content"][pos+1:].strip() - post["author_id"], _, post["author_slug"] = \ - post["author_url"].rsplit("/", 2)[1].partition("-") + if url_a := post["author_url"]: + post["author_id"], _, post["author_slug"] = \ + url_a.rsplit("/", 2)[1].partition("-") + else: + post["author_id"] = post["author_slug"] = "" return post diff --git a/gallery_dl/extractor/bilibili.py b/gallery_dl/extractor/bilibili.py index 3f0acff..fe10150 100644 --- a/gallery_dl/extractor/bilibili.py +++ b/gallery_dl/extractor/bilibili.py @@ -74,7 +74,7 @@ class BilibiliArticleExtractor(BilibiliExtractor): pass article["count"] = len(pics) - yield Message.Directory, article + yield Message.Directory, "", article for article["num"], pic in enumerate(pics, 1): url = pic["url"] article.update(pic) diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py index af43446..766272f 100644 --- a/gallery_dl/extractor/blogger.py +++ b/gallery_dl/extractor/blogger.py @@ -13,7 +13,7 @@ from .. 
import text, util def original(url): - return (util.re(r"(/|=)(?:[sw]\d+|w\d+-h\d+)(?=/|$)") + return (text.re(r"(/|=)(?:[sw]\d+|w\d+-h\d+)(?=/|$)") .sub(r"\1s0", url) .replace("http:", "https:", 1)) @@ -32,7 +32,7 @@ class BloggerExtractor(BaseExtractor): self.videos = self.config("videos", True) if self.videos: - self.findall_video = util.re( + self.findall_video = text.re( r"""src=["'](https?://www\.blogger\.com""" r"""/video\.g\?token=[^"']+)""").findall @@ -40,10 +40,10 @@ class BloggerExtractor(BaseExtractor): blog = self.api.blog_by_url("http://" + self.blog) blog["pages"] = blog["pages"]["totalItems"] blog["posts"] = blog["posts"]["totalItems"] - blog["date"] = text.parse_datetime(blog["published"]) + blog["date"] = self.parse_datetime_iso(blog["published"]) del blog["selfLink"] - findall_image = util.re( + findall_image = text.re( r'src="(https?://(?:' r'blogger\.googleusercontent\.com/img|' r'lh\d+(?:-\w+)?\.googleusercontent\.com|' @@ -65,14 +65,14 @@ class BloggerExtractor(BaseExtractor): post["author"] = post["author"]["displayName"] post["replies"] = post["replies"]["totalItems"] post["content"] = text.remove_html(content) - post["date"] = text.parse_datetime(post["published"]) + post["date"] = self.parse_datetime_iso(post["published"]) del post["selfLink"] del post["blog"] data = {"blog": blog, "post": post} if metadata: data.update(metadata) - yield Message.Directory, data + yield Message.Directory, "", data for data["num"], url in enumerate(files, 1): data["url"] = url @@ -117,7 +117,7 @@ BASE_PATTERN = BloggerExtractor.update({ class BloggerPostExtractor(BloggerExtractor): """Extractor for a single blog post""" subcategory = "post" - pattern = BASE_PATTERN + r"(/\d\d\d\d/\d\d/[^/?#]+\.html)" + pattern = rf"{BASE_PATTERN}(/\d\d\d\d/\d\d/[^/?#]+\.html)" example = "https://BLOG.blogspot.com/1970/01/TITLE.html" def posts(self, blog): @@ -127,7 +127,7 @@ class BloggerPostExtractor(BloggerExtractor): class BloggerBlogExtractor(BloggerExtractor): 
"""Extractor for an entire Blogger blog""" subcategory = "blog" - pattern = BASE_PATTERN + r"/?$" + pattern = rf"{BASE_PATTERN}/?$" example = "https://BLOG.blogspot.com/" def posts(self, blog): @@ -137,7 +137,7 @@ class BloggerBlogExtractor(BloggerExtractor): class BloggerSearchExtractor(BloggerExtractor): """Extractor for Blogger search resuls""" subcategory = "search" - pattern = BASE_PATTERN + r"/search/?\?q=([^&#]+)" + pattern = rf"{BASE_PATTERN}/search/?\?q=([^&#]+)" example = "https://BLOG.blogspot.com/search?q=QUERY" def metadata(self): @@ -151,7 +151,7 @@ class BloggerSearchExtractor(BloggerExtractor): class BloggerLabelExtractor(BloggerExtractor): """Extractor for Blogger posts by label""" subcategory = "label" - pattern = BASE_PATTERN + r"/search/label/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/search/label/([^/?#]+)" example = "https://BLOG.blogspot.com/search/label/LABEL" def metadata(self): diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py index e8c5707..c981608 100644 --- a/gallery_dl/extractor/bluesky.py +++ b/gallery_dl/extractor/bluesky.py @@ -14,7 +14,7 @@ from ..cache import cache, memcache BASE_PATTERN = (r"(?:https?://)?" 
r"(?:(?:www\.)?(?:c|[fv]x)?bs[ky]y[ex]?\.app|main\.bsky\.dev)") -USER_PATTERN = BASE_PATTERN + r"/profile/([^/?#]+)" +USER_PATTERN = rf"{BASE_PATTERN}/profile/([^/?#]+)" class BlueskyExtractor(Extractor): @@ -60,7 +60,7 @@ class BlueskyExtractor(Extractor): self._prepare(post) files = self._extract_files(post) - yield Message.Directory, post + yield Message.Directory, "", post if files: did = post["author"]["did"] base = (f"{self.api.service_endpoint(did)}/xrpc" @@ -135,8 +135,7 @@ class BlueskyExtractor(Extractor): post["instance"] = self.instance post["post_id"] = self._pid(post) - post["date"] = text.parse_datetime( - post["createdAt"][:19], "%Y-%m-%dT%H:%M:%S") + post["date"] = self.parse_datetime_iso(post["createdAt"][:19]) def _extract_files(self, post): if "embed" not in post: @@ -217,7 +216,7 @@ class BlueskyExtractor(Extractor): class BlueskyUserExtractor(Dispatch, BlueskyExtractor): - pattern = USER_PATTERN + r"$" + pattern = rf"{USER_PATTERN}$" example = "https://bsky.app/profile/HANDLE" def items(self): @@ -238,7 +237,7 @@ class BlueskyUserExtractor(Dispatch, BlueskyExtractor): class BlueskyPostsExtractor(BlueskyExtractor): subcategory = "posts" - pattern = USER_PATTERN + r"/posts" + pattern = rf"{USER_PATTERN}/posts" example = "https://bsky.app/profile/HANDLE/posts" def posts(self): @@ -248,7 +247,7 @@ class BlueskyPostsExtractor(BlueskyExtractor): class BlueskyRepliesExtractor(BlueskyExtractor): subcategory = "replies" - pattern = USER_PATTERN + r"/replies" + pattern = rf"{USER_PATTERN}/replies" example = "https://bsky.app/profile/HANDLE/replies" def posts(self): @@ -258,7 +257,7 @@ class BlueskyRepliesExtractor(BlueskyExtractor): class BlueskyMediaExtractor(BlueskyExtractor): subcategory = "media" - pattern = USER_PATTERN + r"/media" + pattern = rf"{USER_PATTERN}/media" example = "https://bsky.app/profile/HANDLE/media" def posts(self): @@ -268,7 +267,7 @@ class BlueskyMediaExtractor(BlueskyExtractor): class BlueskyVideoExtractor(BlueskyExtractor): 
subcategory = "video" - pattern = USER_PATTERN + r"/video" + pattern = rf"{USER_PATTERN}/video" example = "https://bsky.app/profile/HANDLE/video" def posts(self): @@ -278,7 +277,7 @@ class BlueskyVideoExtractor(BlueskyExtractor): class BlueskyLikesExtractor(BlueskyExtractor): subcategory = "likes" - pattern = USER_PATTERN + r"/likes" + pattern = rf"{USER_PATTERN}/likes" example = "https://bsky.app/profile/HANDLE/likes" def posts(self): @@ -289,7 +288,7 @@ class BlueskyLikesExtractor(BlueskyExtractor): class BlueskyFeedExtractor(BlueskyExtractor): subcategory = "feed" - pattern = USER_PATTERN + r"/feed/([^/?#]+)" + pattern = rf"{USER_PATTERN}/feed/([^/?#]+)" example = "https://bsky.app/profile/HANDLE/feed/NAME" def posts(self): @@ -299,7 +298,7 @@ class BlueskyFeedExtractor(BlueskyExtractor): class BlueskyListExtractor(BlueskyExtractor): subcategory = "list" - pattern = USER_PATTERN + r"/lists/([^/?#]+)" + pattern = rf"{USER_PATTERN}/lists/([^/?#]+)" example = "https://bsky.app/profile/HANDLE/lists/ID" def posts(self): @@ -309,7 +308,7 @@ class BlueskyListExtractor(BlueskyExtractor): class BlueskyFollowingExtractor(BlueskyExtractor): subcategory = "following" - pattern = USER_PATTERN + r"/follows" + pattern = rf"{USER_PATTERN}/follows" example = "https://bsky.app/profile/HANDLE/follows" def items(self): @@ -321,7 +320,7 @@ class BlueskyFollowingExtractor(BlueskyExtractor): class BlueskyPostExtractor(BlueskyExtractor): subcategory = "post" - pattern = USER_PATTERN + r"/post/([^/?#]+)" + pattern = rf"{USER_PATTERN}/post/([^/?#]+)" example = "https://bsky.app/profile/HANDLE/post/ID" def posts(self): @@ -331,19 +330,19 @@ class BlueskyPostExtractor(BlueskyExtractor): class BlueskyInfoExtractor(BlueskyExtractor): subcategory = "info" - pattern = USER_PATTERN + r"/info" + pattern = rf"{USER_PATTERN}/info" example = "https://bsky.app/profile/HANDLE/info" def items(self): self._metadata_user = True self.api._did_from_actor(self.groups[0]) - return iter(((Message.Directory, 
self._user),)) + return iter(((Message.Directory, "", self._user),)) class BlueskyAvatarExtractor(BlueskyExtractor): subcategory = "avatar" filename_fmt = "avatar_{post_id}.{extension}" - pattern = USER_PATTERN + r"/avatar" + pattern = rf"{USER_PATTERN}/avatar" example = "https://bsky.app/profile/HANDLE/avatar" def posts(self): @@ -353,7 +352,7 @@ class BlueskyAvatarExtractor(BlueskyExtractor): class BlueskyBackgroundExtractor(BlueskyExtractor): subcategory = "background" filename_fmt = "background_{post_id}.{extension}" - pattern = USER_PATTERN + r"/ba(?:nner|ckground)" + pattern = rf"{USER_PATTERN}/ba(?:nner|ckground)" example = "https://bsky.app/profile/HANDLE/banner" def posts(self): @@ -362,7 +361,7 @@ class BlueskyBackgroundExtractor(BlueskyExtractor): class BlueskySearchExtractor(BlueskyExtractor): subcategory = "search" - pattern = BASE_PATTERN + r"/search(?:/|\?q=)(.+)" + pattern = rf"{BASE_PATTERN}/search(?:/|\?q=)(.+)" example = "https://bsky.app/search?q=QUERY" def posts(self): @@ -372,7 +371,7 @@ class BlueskySearchExtractor(BlueskyExtractor): class BlueskyHashtagExtractor(BlueskyExtractor): subcategory = "hashtag" - pattern = BASE_PATTERN + r"/hashtag/([^/?#]+)(?:/(top|latest))?" + pattern = rf"{BASE_PATTERN}/hashtag/([^/?#]+)(?:/(top|latest))?" 
example = "https://bsky.app/hashtag/NAME" def posts(self): @@ -382,7 +381,7 @@ class BlueskyHashtagExtractor(BlueskyExtractor): class BlueskyBookmarkExtractor(BlueskyExtractor): subcategory = "bookmark" - pattern = BASE_PATTERN + r"/saved" + pattern = rf"{BASE_PATTERN}/saved" example = "https://bsky.app/saved" def posts(self): @@ -401,7 +400,9 @@ class BlueskyAPI(): self.headers = {"Accept": "application/json"} self.username, self.password = extractor._get_auth_info() - if self.username: + if srv := extractor.config("api-server", False): + self.root = srv.rstrip("/") + elif self.username: self.root = "https://bsky.social" else: self.root = "https://api.bsky.app" diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index ae455bf..4858a4b 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -57,7 +57,7 @@ class BooruExtractor(BaseExtractor): post.update(data) self._prepare(post) - yield Message.Directory, post + yield Message.Directory, "", post yield Message.Url, url, post def skip(self, num): diff --git a/gallery_dl/extractor/boosty.py b/gallery_dl/extractor/boosty.py index 22f3259..5add768 100644 --- a/gallery_dl/extractor/boosty.py +++ b/gallery_dl/extractor/boosty.py @@ -49,6 +49,9 @@ class BoostyExtractor(Extractor): self.videos = videos def items(self): + headers = self.api.headers.copy() + del headers["Accept"] + for post in self.posts(): if not post.get("hasAccess"): self.log.warning("Not allowed to access post %s", post["id"]) @@ -61,9 +64,10 @@ class BoostyExtractor(Extractor): "post" : post, "user" : post.pop("user", None), "count": len(files), + "_http_headers": headers, } - yield Message.Directory, data + yield Message.Directory, "", data for data["num"], file in enumerate(files, 1): data["file"] = file url = file["url"] @@ -78,7 +82,7 @@ class BoostyExtractor(Extractor): post["links"] = links = [] if "createdAt" in post: - post["date"] = text.parse_timestamp(post["createdAt"]) + post["date"] = 
self.parse_timestamp(post["createdAt"]) for block in post["data"]: try: @@ -159,7 +163,7 @@ class BoostyExtractor(Extractor): class BoostyUserExtractor(BoostyExtractor): """Extractor for boosty.to user profiles""" subcategory = "user" - pattern = BASE_PATTERN + r"/([^/?#]+)(?:\?([^#]+))?$" + pattern = rf"{BASE_PATTERN}/([^/?#]+)(?:\?([^#]+))?$" example = "https://boosty.to/USER" def posts(self): @@ -175,7 +179,7 @@ class BoostyMediaExtractor(BoostyExtractor): subcategory = "media" directory_fmt = "{category}", "{user[blogUrl]} ({user[id]})", "media" filename_fmt = "{post[id]}_{num}.{extension}" - pattern = BASE_PATTERN + r"/([^/?#]+)/media/([^/?#]+)(?:\?([^#]+))?" + pattern = rf"{BASE_PATTERN}/([^/?#]+)/media/([^/?#]+)(?:\?([^#]+))?" example = "https://boosty.to/USER/media/all" def posts(self): @@ -188,7 +192,7 @@ class BoostyMediaExtractor(BoostyExtractor): class BoostyFeedExtractor(BoostyExtractor): """Extractor for your boosty.to subscription feed""" subcategory = "feed" - pattern = BASE_PATTERN + r"/(?:\?([^#]+))?(?:$|#)" + pattern = rf"{BASE_PATTERN}/(?:\?([^#]+))?(?:$|#)" example = "https://boosty.to/" def posts(self): @@ -199,7 +203,7 @@ class BoostyFeedExtractor(BoostyExtractor): class BoostyPostExtractor(BoostyExtractor): """Extractor for boosty.to posts""" subcategory = "post" - pattern = BASE_PATTERN + r"/([^/?#]+)/posts/([0-9a-f-]+)" + pattern = rf"{BASE_PATTERN}/([^/?#]+)/posts/([0-9a-f-]+)" example = "https://boosty.to/USER/posts/01234567-89ab-cdef-0123-456789abcd" def posts(self): @@ -212,7 +216,7 @@ class BoostyPostExtractor(BoostyExtractor): class BoostyFollowingExtractor(BoostyExtractor): """Extractor for your boosty.to subscribed users""" subcategory = "following" - pattern = BASE_PATTERN + r"/app/settings/subscriptions" + pattern = rf"{BASE_PATTERN}/app/settings/subscriptions" example = "https://boosty.to/app/settings/subscriptions" def items(self): @@ -227,7 +231,7 @@ class BoostyDirectMessagesExtractor(BoostyExtractor): subcategory = 
"direct-messages" directory_fmt = ("{category}", "{user[blogUrl]} ({user[id]})", "Direct Messages") - pattern = BASE_PATTERN + r"/app/messages/?\?dialogId=(\d+)" + pattern = rf"{BASE_PATTERN}/app/messages/?\?dialogId=(\d+)" example = "https://boosty.to/app/messages?dialogId=12345" def items(self): @@ -260,7 +264,7 @@ class BoostyDirectMessagesExtractor(BoostyExtractor): "count": len(files), } - yield Message.Directory, data + yield Message.Directory, "", data for data["num"], file in enumerate(files, 1): data["file"] = file url = file["url"] @@ -280,8 +284,12 @@ class BoostyAPI(): if not access_token: if auth := self.extractor.cookies.get("auth", domain=".boosty.to"): - access_token = text.extr( - text.unquote(auth), '"accessToken":"', '"') + auth = text.unquote(auth) + access_token = text.extr(auth, '"accessToken":"', '"') + if expires := text.extr(auth, '"expiresAt":', ','): + import time + if text.parse_int(expires) < time.time() * 1000: + extractor.log.warning("'auth' cookie tokens expired") if access_token: self.headers["Authorization"] = "Bearer " + access_token diff --git a/gallery_dl/extractor/booth.py b/gallery_dl/extractor/booth.py index 0fcb1cb..3c000b1 100644 --- a/gallery_dl/extractor/booth.py +++ b/gallery_dl/extractor/booth.py @@ -70,8 +70,7 @@ class BoothItemExtractor(BoothExtractor): url + ".json", headers=headers, interval=False) item["booth_category"] = item.pop("category", None) - item["date"] = text.parse_datetime( - item["published_at"], "%Y-%m-%dT%H:%M:%S.%f%z") + item["date"] = self.parse_datetime_iso(item["published_at"]) item["tags"] = [t["name"] for t in item["tags"]] shop = item["shop"] @@ -84,7 +83,7 @@ class BoothItemExtractor(BoothExtractor): item["count"] = 0 shop["uuid"] = util.NONE - yield Message.Directory, item + yield Message.Directory, "", item for num, file in enumerate(files, 1): url = file["url"] file["num"] = num diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 14ebc48..ed9cd0f 100644 --- 
a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -11,6 +11,7 @@ from .common import Extractor from .lolisafe import LolisafeAlbumExtractor from .. import text, util, config, exception +from ..cache import memcache import random if config.get(("extractor", "bunkr"), "tlds"): @@ -63,7 +64,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): root_dl = "https://get.bunkrr.su" root_api = "https://apidl.bunkr.ru" archive_fmt = "{album_id}_{id|id_url|slug}" - pattern = BASE_PATTERN + r"/a/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/a/([^/?#]+)" example = "https://bunkr.si/a/ID" def __init__(self, match): @@ -167,7 +168,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): item, 'name: "', ".") file["size"] = text.parse_int(text.extr( item, "size: ", " ,\n")) - file["date"] = text.parse_datetime(text.extr( + file["date"] = self.parse_datetime(text.extr( item, 'timestamp: "', '"'), "%H:%M:%S %d/%m/%Y") yield file @@ -176,6 +177,10 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): except Exception as exc: self.log.error("%s: %s", exc.__class__.__name__, exc) self.log.debug("%s", item, exc_info=exc) + if isinstance(exc, exception.HttpError) and \ + exc.status == 400 and \ + exc.response.url.startswith(self.root_api): + raise exception.AbortExtraction("Album deleted") def _extract_file(self, data_id): referer = f"{self.root_dl}/file/{data_id}" @@ -211,7 +216,7 @@ class BunkrMediaExtractor(BunkrAlbumExtractor): """Extractor for bunkr.si media links""" subcategory = "media" directory_fmt = ("{category}",) - pattern = BASE_PATTERN + r"(/[fvid]/[^/?#]+)" + pattern = rf"{BASE_PATTERN}(/[fvid]/[^/?#]+)" example = "https://bunkr.si/f/FILENAME" def fetch_album(self, album_id): @@ -227,10 +232,26 @@ class BunkrMediaExtractor(BunkrAlbumExtractor): self.log.error("%s: %s", exc.__class__.__name__, exc) return (), {} + album_id, album_name, album_size = self._album_info(text.extr( + page, ' href="../a/', '"')) return (file,), { - "album_id" : "", - "album_name" 
: "", - "album_size" : -1, - "description": "", - "count" : 1, + "album_id" : album_id, + "album_name": album_name, + "album_size": album_size, + "count" : 1, } + + @memcache(keyarg=1) + def _album_info(self, album_id): + if album_id: + try: + page = self.request(f"{self.root}/a/{album_id}").text + return ( + album_id, + text.unescape(text.unescape(text.extr( + page, 'property="og:title" content="', '"'))), + text.extr(page, '<span class="font-semibold">(', ')'), + ) + except Exception: + pass + return album_id, "", -1 diff --git a/gallery_dl/extractor/catbox.py b/gallery_dl/extractor/catbox.py index 22f7a97..2066839 100644 --- a/gallery_dl/extractor/catbox.py +++ b/gallery_dl/extractor/catbox.py @@ -28,7 +28,7 @@ class CatboxAlbumExtractor(GalleryExtractor): return { "album_id" : self.page_url.rpartition("/")[2], "album_name" : text.unescape(extr("<h1>", "<")), - "date" : text.parse_datetime(extr( + "date" : self.parse_datetime(extr( "<p>Created ", "<"), "%B %d %Y"), "description": text.unescape(extr("<p>", "<")), } @@ -52,5 +52,5 @@ class CatboxFileExtractor(Extractor): def items(self): url = text.ensure_http_scheme(self.url) file = text.nameext_from_url(url, {"url": url}) - yield Message.Directory, file + yield Message.Directory, "", file yield Message.Url, url, file diff --git a/gallery_dl/extractor/cfake.py b/gallery_dl/extractor/cfake.py new file mode 100644 index 0000000..4c37455 --- /dev/null +++ b/gallery_dl/extractor/cfake.py @@ -0,0 +1,149 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://cfake.com/""" + +from .common import Extractor, Message +from .. 
import text + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?cfake\.com" + + +class CfakeExtractor(Extractor): + """Base class for cfake extractors""" + category = "cfake" + root = "https://cfake.com" + directory_fmt = ("{category}", "{type}", "{type_name} ({type_id})") + filename_fmt = "{category}_{type_name}_{id}.{extension}" + archive_fmt = "{id}" + + def items(self): + type, type_name, type_id, sub_id, pnum = self.groups + + if type.endswith("ies"): + type = type[:-3] + "y" + + kwdict = self.kwdict + kwdict["type"] = type + kwdict["type_id"] = text.parse_int(type_id) + kwdict["type_name"] = text.unquote(type_name).replace("_", " ") + kwdict["sub_id"] = text.parse_int(sub_id) + kwdict["page"] = pnum = text.parse_int(pnum, 1) + yield Message.Directory, "", {} + + base = f"{self.root}/images/{type}/{type_name}/{type_id}" + if sub_id: + base = f"{base}/{sub_id}" + + while True: + url = base if pnum < 2 else f"{base}/p{pnum}" + page = self.request(url).text + + # Extract and yield images + num = 0 + for image in self._extract_images(page): + num += 1 + image["num"] = num + (pnum - 1) * 50 + url = image["url"] + yield Message.Url, url, text.nameext_from_url(url, image) + + # Check for next page + if not num or not (pnum := self._check_pagination(page)): + return + kwdict["page"] = pnum + + def _extract_images(self, page): + """Extract image URLs and metadata from a gallery page""" + for item in text.extract_iter( + page, '<a href="javascript:showimage(', '</div></div>'): + + # Extract image path from showimage call + # Format: 'big.php?show=2025/filename.jpg&id_picture=... 
+ show_param = text.extr(item, "show=", "&") + if not show_param: + continue + + # Extract metadata + picture_id = text.extr(item, "id_picture=", "&") + name_param = text.extr(item, "p_name=", "'") + + # Extract date + date = text.extr(item, 'id="date_vignette">', '</div>') + + # Extract rating + rating_text = text.extr(item, 'class="current-rating"', '</li>') + rating = text.extr(rating_text, 'width:', 'px') + + # Convert thumbnail path to full image path + # show_param is like "2025/filename.jpg" + image_url = f"{self.root}/medias/photos/{show_param}" + + yield { + "url": image_url, + "id": text.parse_int(picture_id) if picture_id else 0, + "name": text.unescape(name_param) if name_param else "", + "date": date, + "rating": rating, + } + + def _check_pagination(self, page): + """Check if there are more pages and return next page number""" + # Look for current page indicator + # Format: id="num_page_current" ><a href=".../ p1">1</a> + current_section = text.extr( + page, 'id="num_page_current"', '</div>') + if not current_section: + return None + + # Extract current page number from the link text + current_page_str = text.extr(current_section, '">', '</a>') + if not current_page_str: + return None + + current_page = text.parse_int(current_page_str) + if not current_page: + return None + + next_page = current_page + 1 + + # Check if next page link exists anywhere in the page + # Look for href="/images/.../pN" pattern + if f'/p{next_page}"' in page or f'/p{next_page} ' in page: + return next_page + + return None + + +class CfakeCelebrityExtractor(CfakeExtractor): + """Extractor for celebrity image galleries from cfake.com""" + subcategory = "celebrity" + pattern = (BASE_PATTERN + r"/images/(celebrity)" + r"/([^/?#]+)/(\d+)()(?:/p(\d+))?") + example = "https://cfake.com/images/celebrity/NAME/123" + + +class CfakeCategoryExtractor(CfakeExtractor): + """Extractor for category image galleries from cfake.com""" + subcategory = "category" + pattern = (BASE_PATTERN + 
r"/images/(categories)" + r"/([^/?#]+)/(\d+)()(?:/p(\d+))?") + example = "https://cfake.com/images/categories/NAME/123" + + +class CfakeCreatedExtractor(CfakeExtractor): + """Extractor for 'created' image galleries from cfake.com""" + subcategory = "created" + pattern = (BASE_PATTERN + r"/images/(created)" + r"/([^/?#]+)/(\d+)/(\d+)(?:/p(\d+))?") + example = "https://cfake.com/images/created/NAME/12345/123" + + +class CfakeCountryExtractor(CfakeExtractor): + """Extractor for country image galleries from cfake.com""" + subcategory = "country" + pattern = (BASE_PATTERN + r"/images/(country)" + r"/([^/?#]+)/(\d+)/(\d+)(?:/p(\d+))?") + example = "https://cfake.com/images/country/NAME/12345/123" diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py index 1552899..9a766d0 100644 --- a/gallery_dl/extractor/chevereto.py +++ b/gallery_dl/extractor/chevereto.py @@ -17,14 +17,17 @@ class CheveretoExtractor(BaseExtractor): basecategory = "chevereto" directory_fmt = ("{category}", "{user}", "{album}") archive_fmt = "{id}" + parent = True def _init(self): self.path = self.groups[-1] - def _pagination(self, url): - while True: - page = self.request(url).text + def _pagination(self, url, callback=None): + page = self.request(url).text + if callback is not None: + callback(page) + while True: for item in text.extract_iter( page, '<div class="list-item-image ', 'image-container'): yield text.urljoin(self.root, text.extr( @@ -35,12 +38,13 @@ class CheveretoExtractor(BaseExtractor): return if url[0] == "/": url = self.root + url + page = self.request(url).text BASE_PATTERN = CheveretoExtractor.update({ "jpgfish": { - "root": "https://jpg6.su", - "pattern": r"(?:www\.)?jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)", + "root": "https://jpg7.cr", + "pattern": r"(?:www\.)?jpe?g\d?\.(?:cr|su|pet|fish(?:ing)?|church)", }, "imagepond": { "root": "https://imagepond.net", @@ -56,8 +60,8 @@ BASE_PATTERN = CheveretoExtractor.update({ class 
CheveretoImageExtractor(CheveretoExtractor): """Extractor for chevereto images""" subcategory = "image" - pattern = BASE_PATTERN + r"(/im(?:g|age)/[^/?#]+)" - example = "https://jpg2.su/img/TITLE.ID" + pattern = rf"{BASE_PATTERN}(/im(?:g|age)/[^/?#]+)" + example = "https://jpg7.cr/img/TITLE.ID" def items(self): url = self.root + self.path @@ -74,25 +78,27 @@ class CheveretoImageExtractor(CheveretoExtractor): url, b"seltilovessimpcity@simpcityhatesscrapers", fromhex=True) + album_url, _, album_name = extr("Added to <a", "</a>").rpartition(">") file = { "id" : self.path.rpartition("/")[2].rpartition(".")[2], "url" : url, - "album": text.remove_html(extr( - "Added to <a", "</a>").rpartition(">")[2]), - "date" : text.parse_datetime(extr( - '<span title="', '"'), "%Y-%m-%d %H:%M:%S"), + "album": text.remove_html(album_name), + "date" : self.parse_datetime_iso(extr('<span title="', '"')), "user" : extr('username: "', '"'), } + file["album_slug"], _, file["album_id"] = text.rextr( + album_url, "/", '"').rpartition(".") + text.nameext_from_url(file["url"], file) - yield Message.Directory, file + yield Message.Directory, "", file yield Message.Url, file["url"], file class CheveretoVideoExtractor(CheveretoExtractor): """Extractor for chevereto videos""" subcategory = "video" - pattern = BASE_PATTERN + r"(/video/[^/?#]+)" + pattern = rf"{BASE_PATTERN}(/video/[^/?#]+)" example = "https://imagepond.net/video/TITLE.ID" def items(self): @@ -114,13 +120,17 @@ class CheveretoVideoExtractor(CheveretoExtractor): 'property="video:height" content="', '"')), "duration" : extr( 'class="far fa-clock"></i>', "—"), - "album": text.remove_html(extr( - "Added to <a", "</a>").rpartition(">")[2]), - "date" : text.parse_datetime(extr( - '<span title="', '"'), "%Y-%m-%d %H:%M:%S"), + "album" : extr( + "Added to <a", "</a>"), + "date" : self.parse_datetime_iso(extr('<span title="', '"')), "user" : extr('username: "', '"'), } + album_url, _, album_name = file["album"].rpartition(">") + 
file["album"] = text.remove_html(album_name) + file["album_slug"], _, file["album_id"] = text.rextr( + album_url, "/", '"').rpartition(".") + try: min, _, sec = file["duration"].partition(":") file["duration"] = int(min) * 60 + int(sec) @@ -128,15 +138,15 @@ class CheveretoVideoExtractor(CheveretoExtractor): pass text.nameext_from_url(file["url"], file) - yield Message.Directory, file + yield Message.Directory, "", file yield Message.Url, file["url"], file class CheveretoAlbumExtractor(CheveretoExtractor): """Extractor for chevereto albums""" subcategory = "album" - pattern = BASE_PATTERN + r"(/a(?:lbum)?/[^/?#]+(?:/sub)?)" - example = "https://jpg2.su/album/TITLE.ID" + pattern = rf"{BASE_PATTERN}(/a(?:lbum)?/[^/?#]+(?:/sub)?)" + example = "https://jpg7.cr/album/TITLE.ID" def items(self): url = self.root + self.path @@ -148,16 +158,31 @@ class CheveretoAlbumExtractor(CheveretoExtractor): else: albums = (url,) + kwdict = self.kwdict for album in albums: - for item_url in self._pagination(album): + for kwdict["num"], item_url in enumerate(self._pagination( + album, self._extract_metadata_album), 1): data = data_video if "/video/" in item_url else data_image yield Message.Queue, item_url, data + def _extract_metadata_album(self, page): + url, pos = text.extract( + page, 'property="og:url" content="', '"') + title, pos = text.extract( + page, 'property="og:title" content="', '"', pos) + + kwdict = self.kwdict + kwdict["album_slug"], _, kwdict["album_id"] = \ + url[url.rfind("/")+1:].rpartition(".") + kwdict["album"] = text.unescape(title) + kwdict["count"] = text.parse_int(text.extract( + page, 'data-text="image-count">', "<", pos)[0]) + class CheveretoCategoryExtractor(CheveretoExtractor): """Extractor for chevereto galleries""" subcategory = "category" - pattern = BASE_PATTERN + r"(/category/[^/?#]+)" + pattern = rf"{BASE_PATTERN}(/category/[^/?#]+)" example = "https://imglike.com/category/TITLE" def items(self): @@ -169,8 +194,8 @@ class 
CheveretoCategoryExtractor(CheveretoExtractor): class CheveretoUserExtractor(CheveretoExtractor): """Extractor for chevereto users""" subcategory = "user" - pattern = BASE_PATTERN + r"(/[^/?#]+(?:/albums)?)" - example = "https://jpg2.su/USER" + pattern = rf"{BASE_PATTERN}(/[^/?#]+(?:/albums)?)" + example = "https://jpg7.cr/USER" def items(self): url = self.root + self.path diff --git a/gallery_dl/extractor/cien.py b/gallery_dl/extractor/cien.py index 45e5dab..c68af2e 100644 --- a/gallery_dl/extractor/cien.py +++ b/gallery_dl/extractor/cien.py @@ -34,7 +34,7 @@ class CienExtractor(Extractor): page = self.request(url, params=params).text for card in text.extract_iter( - page, ' class="c-cardCase-item', '</div>'): + page, ' class="c-cardCase-item', '</figure>'): article_url = text.extr(card, ' href="', '"') yield Message.Queue, article_url, data @@ -48,7 +48,7 @@ class CienArticleExtractor(CienExtractor): filename_fmt = "{num:>02} {filename}.{extension}" directory_fmt = ("{category}", "{author[name]}", "{post_id} {name}") archive_fmt = "{post_id}_{num}" - pattern = BASE_PATTERN + r"/creator/(\d+)/article/(\d+)" + pattern = rf"{BASE_PATTERN}/creator/(\d+)/article/(\d+)" example = "https://ci-en.net/creator/123/article/12345" def items(self): @@ -61,7 +61,7 @@ class CienArticleExtractor(CienExtractor): post["post_url"] = url post["post_id"] = text.parse_int(post_id) post["count"] = len(files) - post["date"] = text.parse_datetime(post["datePublished"]) + post["date"] = self.parse_datetime_iso(post["datePublished"]) try: post["author"]["id"] = text.parse_int(author_id) @@ -70,7 +70,7 @@ class CienArticleExtractor(CienExtractor): except Exception: pass - yield Message.Directory, post + yield Message.Directory, "", post for post["num"], file in enumerate(files, 1): post.update(file) if "extension" not in file: @@ -160,7 +160,7 @@ class CienArticleExtractor(CienExtractor): class CienCreatorExtractor(CienExtractor): subcategory = "creator" - pattern = BASE_PATTERN + 
r"/creator/(\d+)(?:/article(?:\?([^#]+))?)?/?$" + pattern = rf"{BASE_PATTERN}/creator/(\d+)(?:/article(?:\?([^#]+))?)?/?$" example = "https://ci-en.net/creator/123" def items(self): @@ -172,7 +172,7 @@ class CienCreatorExtractor(CienExtractor): class CienRecentExtractor(CienExtractor): subcategory = "recent" - pattern = BASE_PATTERN + r"/mypage/recent(?:\?([^#]+))?" + pattern = rf"{BASE_PATTERN}/mypage/recent(?:\?([^#]+))?" example = "https://ci-en.net/mypage/recent" def items(self): @@ -183,7 +183,7 @@ class CienRecentExtractor(CienExtractor): class CienFollowingExtractor(CienExtractor): subcategory = "following" - pattern = BASE_PATTERN + r"/mypage/subscription(/following)?" + pattern = rf"{BASE_PATTERN}/mypage/subscription(/following)?" example = "https://ci-en.net/mypage/subscription" def items(self): diff --git a/gallery_dl/extractor/civitai.py b/gallery_dl/extractor/civitai.py index 26ee3fd..742c561 100644 --- a/gallery_dl/extractor/civitai.py +++ b/gallery_dl/extractor/civitai.py @@ -15,7 +15,7 @@ import itertools import time BASE_PATTERN = r"(?:https?://)?civitai\.com" -USER_PATTERN = BASE_PATTERN + r"/user/([^/?#]+)" +USER_PATTERN = rf"{BASE_PATTERN}/user/([^/?#]+)" class CivitaiExtractor(Extractor): @@ -61,13 +61,14 @@ class CivitaiExtractor(Extractor): if isinstance(metadata, str): metadata = metadata.split(",") elif not isinstance(metadata, (list, tuple)): - metadata = ("generation", "version", "post") + metadata = {"generation", "version", "post", "tags"} self._meta_generation = ("generation" in metadata) self._meta_version = ("version" in metadata) self._meta_post = ("post" in metadata) + self._meta_tags = ("tags" in metadata) else: self._meta_generation = self._meta_version = self._meta_post = \ - False + self._meta_tags = False def items(self): if models := self.models(): @@ -86,8 +87,7 @@ class CivitaiExtractor(Extractor): images = self.api.images_post(post["id"]) post = self.api.post(post["id"]) - post["date"] = text.parse_datetime( - 
post["publishedAt"], "%Y-%m-%dT%H:%M:%S.%fZ") + post["date"] = self.parse_datetime_iso(post["publishedAt"]) data = { "post": post, "user": post.pop("user"), @@ -96,7 +96,7 @@ class CivitaiExtractor(Extractor): data["model"], data["version"] = \ self._extract_meta_version(post) - yield Message.Directory, data + yield Message.Directory, "", data for file in self._image_results(images): file.update(data) yield Message.Url, file["url"], file @@ -111,8 +111,9 @@ class CivitaiExtractor(Extractor): } if self._meta_generation: - data["generation"] = \ - self._extract_meta_generation(file) + data["generation"] = self._extract_meta_generation(file) + if self._meta_tags: + data["tags"] = self._extract_meta_tags(file) if self._meta_version: data["model"], data["version"] = \ self._extract_meta_version(file, False) @@ -122,8 +123,7 @@ class CivitaiExtractor(Extractor): data["post"] = post = self._extract_meta_post(file) if post: post.pop("user", None) - file["date"] = text.parse_datetime( - file["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ") + file["date"] = self.parse_datetime_iso(file["createdAt"]) data["url"] = url = self._url(file) text.nameext_from_url(url, data) @@ -131,7 +131,7 @@ class CivitaiExtractor(Extractor): data["extension"] = ( self._video_ext if file.get("type") == "video" else self._image_ext) - yield Message.Directory, data + yield Message.Directory, "", data yield Message.Url, url, data return @@ -180,10 +180,11 @@ class CivitaiExtractor(Extractor): if "id" not in file and data["filename"].isdecimal(): file["id"] = text.parse_int(data["filename"]) if "date" not in file: - file["date"] = text.parse_datetime( - file["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ") + file["date"] = self.parse_datetime_iso(file["createdAt"]) if self._meta_generation: file["generation"] = self._extract_meta_generation(file) + if self._meta_tags: + file["tags"] = self._extract_meta_tags(file) yield data def _image_reactions(self): @@ -211,16 +212,21 @@ class CivitaiExtractor(Extractor): try: return 
self.api.image_generationdata(image["id"]) except Exception as exc: - return self.log.debug("", exc_info=exc) + return self.log.traceback(exc) def _extract_meta_post(self, image): try: post = self.api.post(image["postId"]) - post["date"] = text.parse_datetime( - post["publishedAt"], "%Y-%m-%dT%H:%M:%S.%fZ") + post["date"] = self.parse_datetime_iso(post["publishedAt"]) return post except Exception as exc: - return self.log.debug("", exc_info=exc) + return self.log.traceback(exc) + + def _extract_meta_tags(self, image): + try: + return self.api.tag_getvotabletags(image["id"]) + except Exception as exc: + return self.log.traceback(exc) def _extract_meta_version(self, item, is_post=True): try: @@ -228,7 +234,7 @@ class CivitaiExtractor(Extractor): version = self.api.model_version(version_id).copy() return version.pop("model", None), version except Exception as exc: - self.log.debug("", exc_info=exc) + self.log.traceback(exc) return None, None def _extract_version_id(self, item, is_post=True): @@ -252,7 +258,7 @@ class CivitaiModelExtractor(CivitaiExtractor): directory_fmt = ("{category}", "{user[username]}", "{model[id]}{model[name]:? //}", "{version[id]}{version[name]:? //}") - pattern = BASE_PATTERN + r"/models/(\d+)(?:/?\?modelVersionId=(\d+))?" + pattern = rf"{BASE_PATTERN}/models/(\d+)(?:/?\?modelVersionId=(\d+))?" 
example = "https://civitai.com/models/12345/TITLE" def items(self): @@ -278,8 +284,7 @@ class CivitaiModelExtractor(CivitaiExtractor): versions = (version,) for version in versions: - version["date"] = text.parse_datetime( - version["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ") + version["date"] = self.parse_datetime_iso(version["createdAt"]) data = { "model" : model, @@ -287,7 +292,7 @@ class CivitaiModelExtractor(CivitaiExtractor): "user" : user, } - yield Message.Directory, data + yield Message.Directory, "", data for file in self._extract_files(model, version, user): file.update(data) yield Message.Url, file["url"], file @@ -342,9 +347,9 @@ class CivitaiModelExtractor(CivitaiExtractor): params = { "modelVersionId": version["id"], "prioritizedUserIds": (user["id"],), - "period": "AllTime", - "sort": "Most Reactions", - "limit": 20, + "period" : self.api._param_period(), + "sort" : self.api._param_sort(), + "limit" : 20, "pending": True, } images = self.api.images(params, defaults=False) @@ -370,7 +375,7 @@ class CivitaiModelExtractor(CivitaiExtractor): class CivitaiImageExtractor(CivitaiExtractor): subcategory = "image" - pattern = BASE_PATTERN + r"/images/(\d+)" + pattern = rf"{BASE_PATTERN}/images/(\d+)" example = "https://civitai.com/images/12345" def images(self): @@ -381,7 +386,7 @@ class CivitaiCollectionExtractor(CivitaiExtractor): subcategory = "collection" directory_fmt = ("{category}", "{user_collection[username]}", "collections", "{collection[id]}{collection[name]:? 
//}") - pattern = BASE_PATTERN + r"/collections/(\d+)" + pattern = rf"{BASE_PATTERN}/collections/(\d+)" example = "https://civitai.com/collections/12345" def images(self): @@ -391,8 +396,8 @@ class CivitaiCollectionExtractor(CivitaiExtractor): params = { "collectionId" : cid, - "period" : "AllTime", - "sort" : "Newest", + "period" : self.api._param_period(), + "sort" : self.api._param_sort(), "browsingLevel" : self.api.nsfw, "include" : ("cosmetics",), } @@ -403,7 +408,7 @@ class CivitaiPostExtractor(CivitaiExtractor): subcategory = "post" directory_fmt = ("{category}", "{username|user[username]}", "posts", "{post[id]}{post[title]:? //}") - pattern = BASE_PATTERN + r"/posts/(\d+)" + pattern = rf"{BASE_PATTERN}/posts/(\d+)" example = "https://civitai.com/posts/12345" def posts(self): @@ -412,7 +417,7 @@ class CivitaiPostExtractor(CivitaiExtractor): class CivitaiTagExtractor(CivitaiExtractor): subcategory = "tag" - pattern = BASE_PATTERN + r"/tag/([^/?&#]+)" + pattern = rf"{BASE_PATTERN}/tag/([^/?&#]+)" example = "https://civitai.com/tag/TAG" def models(self): @@ -422,7 +427,7 @@ class CivitaiTagExtractor(CivitaiExtractor): class CivitaiSearchModelsExtractor(CivitaiExtractor): subcategory = "search-models" - pattern = BASE_PATTERN + r"/search/models\?([^#]+)" + pattern = rf"{BASE_PATTERN}/search/models\?([^#]+)" example = "https://civitai.com/search/models?query=QUERY" def models(self): @@ -433,7 +438,7 @@ class CivitaiSearchModelsExtractor(CivitaiExtractor): class CivitaiSearchImagesExtractor(CivitaiExtractor): subcategory = "search-images" - pattern = BASE_PATTERN + r"/search/images\?([^#]+)" + pattern = rf"{BASE_PATTERN}/search/images\?([^#]+)" example = "https://civitai.com/search/images?query=QUERY" def images(self): @@ -444,7 +449,7 @@ class CivitaiSearchImagesExtractor(CivitaiExtractor): class CivitaiModelsExtractor(CivitaiExtractor): subcategory = "models" - pattern = BASE_PATTERN + r"/models(?:/?\?([^#]+))?(?:$|#)" + pattern = 
rf"{BASE_PATTERN}/models(?:/?\?([^#]+))?(?:$|#)" example = "https://civitai.com/models" def models(self): @@ -454,7 +459,7 @@ class CivitaiModelsExtractor(CivitaiExtractor): class CivitaiImagesExtractor(CivitaiExtractor): subcategory = "images" - pattern = BASE_PATTERN + r"/images(?:/?\?([^#]+))?(?:$|#)" + pattern = rf"{BASE_PATTERN}/images(?:/?\?([^#]+))?(?:$|#)" example = "https://civitai.com/images" def images(self): @@ -465,7 +470,7 @@ class CivitaiImagesExtractor(CivitaiExtractor): class CivitaiVideosExtractor(CivitaiExtractor): subcategory = "videos" - pattern = BASE_PATTERN + r"/videos(?:/?\?([^#]+))?(?:$|#)" + pattern = rf"{BASE_PATTERN}/videos(?:/?\?([^#]+))?(?:$|#)" example = "https://civitai.com/videos" def images(self): @@ -476,7 +481,7 @@ class CivitaiVideosExtractor(CivitaiExtractor): class CivitaiPostsExtractor(CivitaiExtractor): subcategory = "posts" - pattern = BASE_PATTERN + r"/posts(?:/?\?([^#]+))?(?:$|#)" + pattern = rf"{BASE_PATTERN}/posts(?:/?\?([^#]+))?(?:$|#)" example = "https://civitai.com/posts" def posts(self): @@ -485,7 +490,7 @@ class CivitaiPostsExtractor(CivitaiExtractor): class CivitaiUserExtractor(Dispatch, CivitaiExtractor): - pattern = USER_PATTERN + r"/?(?:$|\?|#)" + pattern = rf"{USER_PATTERN}/?(?:$|\?|#)" example = "https://civitai.com/user/USER" def items(self): @@ -501,7 +506,7 @@ class CivitaiUserExtractor(Dispatch, CivitaiExtractor): class CivitaiUserModelsExtractor(CivitaiExtractor): subcategory = "user-models" - pattern = USER_PATTERN + r"/models/?(?:\?([^#]+))?" + pattern = rf"{USER_PATTERN}/models/?(?:\?([^#]+))?" example = "https://civitai.com/user/USER/models" def models(self): @@ -515,7 +520,7 @@ class CivitaiUserPostsExtractor(CivitaiExtractor): subcategory = "user-posts" directory_fmt = ("{category}", "{username|user[username]}", "posts", "{post[id]}{post[title]:? //}") - pattern = USER_PATTERN + r"/posts/?(?:\?([^#]+))?" + pattern = rf"{USER_PATTERN}/posts/?(?:\?([^#]+))?" 
example = "https://civitai.com/user/USER/posts" def posts(self): @@ -527,7 +532,7 @@ class CivitaiUserPostsExtractor(CivitaiExtractor): class CivitaiUserImagesExtractor(CivitaiExtractor): subcategory = "user-images" - pattern = USER_PATTERN + r"/images/?(?:\?([^#]+))?" + pattern = rf"{USER_PATTERN}/images/?(?:\?([^#]+))?" example = "https://civitai.com/user/USER/images" def __init__(self, match): @@ -548,7 +553,7 @@ class CivitaiUserImagesExtractor(CivitaiExtractor): class CivitaiUserVideosExtractor(CivitaiExtractor): subcategory = "user-videos" directory_fmt = ("{category}", "{username|user[username]}", "videos") - pattern = USER_PATTERN + r"/videos/?(?:\?([^#]+))?" + pattern = rf"{USER_PATTERN}/videos/?(?:\?([^#]+))?" example = "https://civitai.com/user/USER/videos" def __init__(self, match): @@ -567,7 +572,7 @@ class CivitaiUserVideosExtractor(CivitaiExtractor): class CivitaiUserCollectionsExtractor(CivitaiExtractor): subcategory = "user-collections" - pattern = USER_PATTERN + r"/collections/?(?:\?([^#]+))?" + pattern = rf"{USER_PATTERN}/collections/?(?:\?([^#]+))?" 
example = "https://civitai.com/user/USER/collections" def items(self): @@ -586,16 +591,15 @@ class CivitaiGeneratedExtractor(CivitaiExtractor): subcategory = "generated" filename_fmt = "{filename}.{extension}" directory_fmt = ("{category}", "generated") - pattern = f"{BASE_PATTERN}/generate" + pattern = rf"{BASE_PATTERN}/generate" example = "https://civitai.com/generate" def items(self): self._require_auth() for gen in self.api.orchestrator_queryGeneratedImages(): - gen["date"] = text.parse_datetime( - gen["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ") - yield Message.Directory, gen + gen["date"] = self.parse_datetime_iso(gen["createdAt"]) + yield Message.Directory, "", gen for step in gen.pop("steps", ()): for image in step.pop("images", ()): data = {"file": image, **step, **gen} @@ -719,8 +723,8 @@ class CivitaiTrpcAPI(): if defaults: params = self._merge_params(params, { "useIndex" : True, - "period" : "AllTime", - "sort" : "Newest", + "period" : self._param_period(), + "sort" : self._param_sort(), "withMeta" : False, # Metadata Only "fromPlatform" : False, # Made On-Site "browsingLevel": self.nsfw, @@ -733,8 +737,8 @@ class CivitaiTrpcAPI(): def images_gallery(self, model, version, user): endpoint = "image.getImagesAsPostsInfinite" params = { - "period" : "AllTime", - "sort" : "Newest", + "period" : self._param_period(), + "sort" : self._param_sort(), "modelVersionId": version["id"], "modelId" : model["id"], "hidden" : False, @@ -768,9 +772,9 @@ class CivitaiTrpcAPI(): if defaults: params = self._merge_params(params, { - "period" : "AllTime", + "period" : self._param_period(), "periodMode" : "published", - "sort" : "Newest", + "sort" : self._param_sort(), "pending" : False, "hidden" : False, "followed" : False, @@ -797,9 +801,9 @@ class CivitaiTrpcAPI(): if defaults: params = self._merge_params(params, { "browsingLevel": self.nsfw, - "period" : "AllTime", + "period" : self._param_period(), "periodMode" : "published", - "sort" : "Newest", + "sort" : self._param_sort(), 
"followed" : False, "draftOnly" : False, "pending" : True, @@ -821,12 +825,17 @@ class CivitaiTrpcAPI(): if defaults: params = self._merge_params(params, { "browsingLevel": self.nsfw, - "sort" : "Newest", + "sort" : self._param_sort(), }) params = self._type_params(params) return self._pagination(endpoint, params) + def tag_getvotabletags(self, image_id): + endpoint = "tag.getVotableTags" + params = {"id": int(image_id), "type": "image"} + return self._call(endpoint, params) + def user(self, username): endpoint = "user.getCreator" params = {"username": username} @@ -835,7 +844,7 @@ class CivitaiTrpcAPI(): def orchestrator_queryGeneratedImages(self): endpoint = "orchestrator.queryGeneratedImages" params = { - "ascending": False, + "ascending": True if self._param_sort() == "Oldest" else False, "tags" : ("gen",), "authed" : True, } @@ -908,6 +917,21 @@ class CivitaiTrpcAPI(): params[name] = [type(item) for item in value] return params + def _param_period(self): + if period := self.extractor.config("period"): + return period + return "AllTime" + + def _param_sort(self): + if sort := self.extractor.config("sort"): + s = sort[0].lower() + if s in "drn": + return "Newest" + if s in "ao": + return "Oldest" + return sort + return "Newest" + def _bool(value): return value == "true" diff --git a/gallery_dl/extractor/comedywildlifephoto.py b/gallery_dl/extractor/comedywildlifephoto.py new file mode 100644 index 0000000..a1c1ef4 --- /dev/null +++ b/gallery_dl/extractor/comedywildlifephoto.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- + +# Copyright 2025 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.comedywildlifephoto.com/""" + +from .common import GalleryExtractor +from .. 
import text + + +class ComedywildlifephotoGalleryExtractor(GalleryExtractor): + """Extractor for comedywildlifephoto galleries""" + category = "comedywildlifephoto" + root = "https://www.comedywildlifephoto.com" + directory_fmt = ("{category}", "{section}", "{title}") + filename_fmt = "{num:>03} {filename}.{extension}" + archive_fmt = "{section}/{title}/{num}" + pattern = (r"(?:https?://)?(?:www\.)?comedywildlifephoto\.com" + r"(/gallery/[^/?#]+/[^/?#]+\.php)") + example = "https://www.comedywildlifephoto.com/gallery/SECTION/TITLE.php" + + def metadata(self, page): + extr = text.extract_from(page) + + return { + "section": extr("<h1>", "<").strip(), + "title" : extr(">", "<"), + "description": text.unescape(extr( + 'class="c1 np">', "<div")), + } + + def images(self, page): + results = [] + + for fig in text.extract_iter(page, "<figure", "</figure>"): + width, _, height = text.extr( + fig, 'data-size="', '"').partition("x") + results.append(( + self.root + text.extr(fig, 'href="', '"'), { + "width" : text.parse_int(width), + "height" : text.parse_int(height), + "caption": text.unescape(text.extr( + fig, "<figcaption>", "<")), + } + )) + + return results diff --git a/gallery_dl/extractor/comick.py b/gallery_dl/extractor/comick.py index c76694c..9816786 100644 --- a/gallery_dl/extractor/comick.py +++ b/gallery_dl/extractor/comick.py @@ -27,7 +27,7 @@ class ComickCoversExtractor(ComickBase, GalleryExtractor): directory_fmt = ("{category}", "{manga}", "Covers") filename_fmt = "{volume:>02}_{lang}.{extension}" archive_fmt = "c_{id}" - pattern = BASE_PATTERN + r"/comic/([\w-]+)/cover" + pattern = rf"{BASE_PATTERN}/comic/([\w-]+)/cover" example = "https://comick.io/comic/MANGA/cover" def metadata(self, page): @@ -60,7 +60,7 @@ class ComickCoversExtractor(ComickBase, GalleryExtractor): class ComickChapterExtractor(ComickBase, ChapterExtractor): """Extractor for comick.io manga chapters""" archive_fmt = "{chapter_hid}_{page}" - pattern = (BASE_PATTERN + r"/comic/([\w-]+)" + 
pattern = (rf"{BASE_PATTERN}/comic/([\w-]+)" r"/(\w+(?:-(?:chapter|volume)-[^/?#]+)?)") example = "https://comick.io/comic/MANGA/ID-chapter-123-en" @@ -114,10 +114,8 @@ class ComickChapterExtractor(ComickBase, ChapterExtractor): "chapter_hid" : ch["hid"], "chapter_string": chstr, "group" : ch["group_name"], - "date" : text.parse_datetime( - ch["created_at"][:19], "%Y-%m-%dT%H:%M:%S"), - "date_updated" : text.parse_datetime( - ch["updated_at"][:19], "%Y-%m-%dT%H:%M:%S"), + "date" : self.parse_datetime_iso(ch["created_at"][:19]), + "date_updated" : self.parse_datetime_iso(ch["updated_at"][:19]), "lang" : ch["lang"], } @@ -142,7 +140,7 @@ class ComickChapterExtractor(ComickBase, ChapterExtractor): class ComickMangaExtractor(ComickBase, MangaExtractor): """Extractor for comick.io manga""" - pattern = BASE_PATTERN + r"/comic/([\w-]+)/?(?:\?([^#]+))?" + pattern = rf"{BASE_PATTERN}/comic/([\w-]+)/?(?:\?([^#]+))?" example = "https://comick.io/comic/MANGA" def items(self): diff --git a/gallery_dl/extractor/comicvine.py b/gallery_dl/extractor/comicvine.py index 39397b9..f579ef7 100644 --- a/gallery_dl/extractor/comicvine.py +++ b/gallery_dl/extractor/comicvine.py @@ -60,6 +60,6 @@ class ComicvineTagExtractor(BooruExtractor): _file_url = operator.itemgetter("original") def _prepare(self, post): - post["date"] = text.parse_datetime( + post["date"] = self.parse_datetime( post["dateCreated"], "%a, %b %d %Y") post["tags"] = [tag["name"] for tag in post["tags"] if tag["name"]] diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 34e65c5..13c7bbe 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -19,11 +19,10 @@ import getpass import logging import requests import threading -from datetime import datetime from xml.etree import ElementTree from requests.adapters import HTTPAdapter from .message import Message -from .. import config, output, text, util, cache, exception +from .. 
import config, output, text, util, dt, cache, exception urllib3 = requests.packages.urllib3 @@ -32,7 +31,9 @@ class Extractor(): category = "" subcategory = "" basecategory = "" + basesubcategory = "" categorytransfer = False + parent = False directory_fmt = ("{category}",) filename_fmt = "{filename}.{extension}" archive_fmt = "" @@ -64,6 +65,10 @@ class Extractor(): else: self.category = CATEGORY_MAP[self.category] + self.parse_datetime = dt.parse + self.parse_datetime_iso = dt.parse_iso + self.parse_timestamp = dt.parse_ts + self._cfgpath = ("extractor", self.category, self.subcategory) self._parentdir = "" @@ -89,7 +94,8 @@ class Extractor(): pass def items(self): - yield Message.Version, 1 + return + yield def skip(self, num): return 0 @@ -313,9 +319,9 @@ class Extractor(): seconds = float(seconds) until = now + seconds elif until: - if isinstance(until, datetime): + if isinstance(until, dt.datetime): # convert to UTC timestamp - until = util.datetime_to_timestamp(until) + until = dt.to_ts(until) else: until = float(until) seconds = until - now @@ -327,7 +333,7 @@ class Extractor(): return if reason: - t = datetime.fromtimestamp(until).time() + t = dt.datetime.fromtimestamp(until).time() isotime = f"{t.hour:02}:{t.minute:02}:{t.second:02}" self.log.info("Waiting until %s (%s)", isotime, reason) time.sleep(seconds) @@ -652,7 +658,7 @@ class Extractor(): self.log.warning( "cookies: %s/%s expired at %s", cookie.domain.lstrip("."), cookie.name, - datetime.fromtimestamp(cookie.expires)) + dt.datetime.fromtimestamp(cookie.expires)) continue elif diff <= 86400: @@ -693,13 +699,16 @@ class Extractor(): def get(key, default): ts = self.config(key, default) if isinstance(ts, str): - try: - ts = int(datetime.strptime(ts, fmt).timestamp()) - except ValueError as exc: - self.log.warning("Unable to parse '%s': %s", key, exc) + dt_obj = dt.parse_iso(ts) if fmt is None else dt.parse(ts, fmt) + if dt_obj is dt.NONE: + self.log.warning( + "Unable to parse '%s': Invalid %s string 
'%s'", + key, "isoformat" if fmt is None else "date", ts) ts = default + else: + ts = int(dt.to_ts(dt_obj)) return ts - fmt = self.config("date-format", "%Y-%m-%dT%H:%M:%S") + fmt = self.config("date-format") return get("date-min", dmin), get("date-max", dmax) @classmethod @@ -793,7 +802,7 @@ class GalleryExtractor(Extractor): enum = util.enumerate_reversed images = enum(imgs, 1) - yield Message.Directory, data + yield Message.Directory, "", data enum_key = self.enum if assets: @@ -912,7 +921,7 @@ class Dispatch(): elif isinstance(include, str): include = include.replace(" ", "").split(",") - results = [(Message.Version, 1)] + results = [] for category in include: try: extr, url = extractors[category] @@ -962,18 +971,16 @@ class BaseExtractor(Extractor): def __init__(self, match): if not self.category: - self.groups = match.groups() - self.match = match - self._init_category() + self._init_category(match) Extractor.__init__(self, match) - def _init_category(self): - for index, group in enumerate(self.groups): + def _init_category(self, match): + for index, group in enumerate(match.groups()): if group is not None: if index: self.category, self.root, info = self.instances[index-1] if not self.root: - self.root = text.root_from_url(self.match[0]) + self.root = text.root_from_url(match[0]) self.config_instance = info.get else: self.root = group diff --git a/gallery_dl/extractor/cyberdrop.py b/gallery_dl/extractor/cyberdrop.py index b3944f7..93d3953 100644 --- a/gallery_dl/extractor/cyberdrop.py +++ b/gallery_dl/extractor/cyberdrop.py @@ -4,27 +4,27 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://cyberdrop.me/""" +"""Extractors for https://cyberdrop.cr/""" from . import lolisafe from .common import Message from .. 
import text -BASE_PATTERN = r"(?:https?://)?(?:www\.)?cyberdrop\.(?:me|to)" +BASE_PATTERN = r"(?:https?://)?(?:www\.)?cyberdrop\.(?:cr|me|to)" class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor): """Extractor for cyberdrop albums""" category = "cyberdrop" - root = "https://cyberdrop.me" - root_api = "https://api.cyberdrop.me" - pattern = BASE_PATTERN + r"/a/([^/?#]+)" - example = "https://cyberdrop.me/a/ID" + root = "https://cyberdrop.cr" + root_api = "https://api.cyberdrop.cr" + pattern = rf"{BASE_PATTERN}/a/([^/?#]+)" + example = "https://cyberdrop.cr/a/ID" def items(self): files, data = self.fetch_album(self.album_id) - yield Message.Directory, data + yield Message.Directory, "", data for data["num"], file in enumerate(files, 1): file.update(data) text.nameext_from_url(file["name"], file) @@ -47,7 +47,7 @@ class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor): "album_name" : text.unescape(extr('title="', '"')), "album_size" : text.parse_bytes(extr( '<p class="title">', "B")), - "date" : text.parse_datetime(extr( + "date" : self.parse_datetime(extr( '<p class="title">', '<'), "%d.%m.%Y"), "description": text.unescape(text.unescape( # double desc.rpartition(" [R")[0])), @@ -76,8 +76,8 @@ class CyberdropMediaExtractor(CyberdropAlbumExtractor): """Extractor for cyberdrop media links""" subcategory = "media" directory_fmt = ("{category}",) - pattern = BASE_PATTERN + r"/f/([^/?#]+)" - example = "https://cyberdrop.me/f/ID" + pattern = rf"{BASE_PATTERN}/f/([^/?#]+)" + example = "https://cyberdrop.cr/f/ID" def fetch_album(self, album_id): return self._extract_files((album_id,)), { diff --git a/gallery_dl/extractor/cyberfile.py b/gallery_dl/extractor/cyberfile.py index 2ea81d6..e8c0061 100644 --- a/gallery_dl/extractor/cyberfile.py +++ b/gallery_dl/extractor/cyberfile.py @@ -56,7 +56,9 @@ class CyberfileFolderExtractor(CyberfileExtractor): url = f"{self.root}/folder/{folder_hash}" folder_num = text.extr(self.request(url).text, "ages('folder', '", "'") 
- extract_urls = text.re(r'dtfullurl="([^"]+)').findall + extract_folders = text.re(r'sharing-url="([^"]+)').findall + extract_files = text.re(r'dtfullurl="([^"]+)').findall + recursive = self.config("recursive", True) perpage = 600 data = { @@ -67,25 +69,63 @@ class CyberfileFolderExtractor(CyberfileExtractor): "filterOrderBy": "", } resp = self.request_api("/account/ajax/load_files", data) + html = resp["html"] folder = { - "_extractor" : CyberfileFileExtractor, "folder_hash": folder_hash, "folder_num" : text.parse_int(folder_num), "folder" : resp["page_title"], } while True: - urls = extract_urls(resp["html"]) - for url in urls: - yield Message.Queue, url, folder - - if len(urls) < perpage: + folders = extract_folders(html) + if recursive and folders: + folder["_extractor"] = CyberfileFolderExtractor + for url in folders: + yield Message.Queue, url, folder + + if files := extract_files(html): + folder["_extractor"] = CyberfileFileExtractor + for url in files: + yield Message.Queue, url, folder + + if len(folders) + len(files) < perpage: return data["pageStart"] += 1 resp = self.request_api("/account/ajax/load_files", data) +class CyberfileSharedExtractor(CyberfileExtractor): + subcategory = "shared" + pattern = rf"{BASE_PATTERN}/shared/([a-zA-Z0-9]+)" + example = "https://cyberfile.me/shared/AbCdEfGhIjK" + + def items(self): + # get 'filehosting' cookie + url = f"{self.root}/shared/{self.groups[0]}" + self.request(url, method="HEAD") + + data = { + "pageType" : "nonaccountshared", + "nodeId" : "", + "pageStart": "1", + "perPage" : "500", + "filterOrderBy": "", + } + resp = self.request_api("/account/ajax/load_files", data) + + html = resp["html"] + pos = html.find("<!-- /.navbar-collapse -->") + 26 + + data = {"_extractor": CyberfileFolderExtractor} + for url in text.extract_iter(html, 'sharing-url="', '"', pos): + yield Message.Queue, url, data + + data = {"_extractor": CyberfileFileExtractor} + for url in text.extract_iter(html, 'dtfullurl="', '"', pos): + 
yield Message.Queue, url, data + + class CyberfileFileExtractor(CyberfileExtractor): subcategory = "file" directory_fmt = ("{category}", "{uploader}", "{folder}") @@ -113,7 +153,7 @@ class CyberfileFileExtractor(CyberfileExtractor): "Filesize:", "</tr>"))[:-1]), "tags" : text.split_html(extr( "Keywords:", "</tr>")), - "date" : text.parse_datetime(text.remove_html(extr( + "date" : self.parse_datetime(text.remove_html(extr( "Uploaded:", "</tr>")), "%d/%m/%Y %H:%M:%S"), "permissions": text.remove_html(extr( "Permissions:", "</tr>")).split(" & "), @@ -121,5 +161,5 @@ class CyberfileFileExtractor(CyberfileExtractor): file["file_url"] = url = extr("openUrl('", "'") text.nameext_from_url(file["name"] or url, file) - yield Message.Directory, file + yield Message.Directory, "", file yield Message.Url, url, file diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index 29c7763..5ea33c4 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -9,8 +9,7 @@ """Extractors for https://danbooru.donmai.us/ and other Danbooru instances""" from .common import BaseExtractor, Message -from .. import text, util -import datetime +from .. 
import text, util, dt class DanbooruExtractor(BaseExtractor): @@ -64,13 +63,12 @@ class DanbooruExtractor(BaseExtractor): except KeyError: if self.external and post["source"]: post.update(data) - yield Message.Directory, post + yield Message.Directory, "", post yield Message.Queue, post["source"], post continue text.nameext_from_url(url, post) - post["date"] = text.parse_datetime( - post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z") + post["date"] = dt.parse_iso(post["created_at"]) post["tags"] = ( post["tag_string"].split(" ") @@ -108,7 +106,7 @@ class DanbooruExtractor(BaseExtractor): url = self.root + url post.update(data) - yield Message.Directory, post + yield Message.Directory, "", post yield Message.Url, url, post def items_artists(self): @@ -253,7 +251,7 @@ class DanbooruTagExtractor(DanbooruExtractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{id}" - pattern = BASE_PATTERN + r"/posts\?(?:[^&#]*&)*tags=([^&#]*)" + pattern = rf"{BASE_PATTERN}/posts\?(?:[^&#]*&)*tags=([^&#]*)" example = "https://danbooru.donmai.us/posts?tags=TAG" def metadata(self): @@ -281,7 +279,7 @@ class DanbooruTagExtractor(DanbooruExtractor): class DanbooruRandomExtractor(DanbooruTagExtractor): """Extractor for a random danbooru post""" subcategory = "random" - pattern = BASE_PATTERN + r"/posts/random(?:\?(?:[^&#]*&)*tags=([^&#]*))?" + pattern = rf"{BASE_PATTERN}/posts/random(?:\?(?:[^&#]*&)*tags=([^&#]*))?" 
example = "https://danbooru.donmai.us/posts/random?tags=TAG" def metadata(self): @@ -301,7 +299,7 @@ class DanbooruPoolExtractor(DanbooruExtractor): directory_fmt = ("{category}", "pool", "{pool[id]} {pool[name]}") filename_fmt = "{num:>04}_{id}_{filename}.{extension}" archive_fmt = "p_{pool[id]}_{id}" - pattern = BASE_PATTERN + r"/pool(?:s|/show)/(\d+)" + pattern = rf"{BASE_PATTERN}/pool(?:s|/show)/(\d+)" example = "https://danbooru.donmai.us/pools/12345" def metadata(self): @@ -319,7 +317,7 @@ class DanbooruFavgroupExtractor(DanbooruExtractor): "{favgroup[id]} {favgroup[name]}") filename_fmt = "{num:>04}_{id}_{filename}.{extension}" archive_fmt = "fg_{favgroup[id]}_{id}" - pattern = BASE_PATTERN + r"/favorite_group(?:s|/show)/(\d+)" + pattern = rf"{BASE_PATTERN}/favorite_group(?:s|/show)/(\d+)" example = "https://danbooru.donmai.us/favorite_groups/12345" def metadata(self): @@ -334,7 +332,7 @@ class DanbooruPostExtractor(DanbooruExtractor): """Extractor for single danbooru posts""" subcategory = "post" archive_fmt = "{id}" - pattern = BASE_PATTERN + r"/post(?:s|/show)/(\d+)" + pattern = rf"{BASE_PATTERN}/post(?:s|/show)/(\d+)" example = "https://danbooru.donmai.us/posts/12345" def posts(self): @@ -346,22 +344,48 @@ class DanbooruPostExtractor(DanbooruExtractor): return (post,) +class DanbooruMediaassetExtractor(DanbooruExtractor): + """Extractor for a danbooru media asset""" + subcategory = "media-asset" + filename_fmt = "{category}_ma{id}_{filename}.{extension}" + archive_fmt = "m{id}" + pattern = rf"{BASE_PATTERN}/media_assets/(\d+)" + example = "https://danbooru.donmai.us/media_assets/12345" + + def posts(self): + url = f"{self.root}/media_assets/{self.groups[-1]}.json" + asset = self.request_json(url) + + asset["file_url"] = asset["variants"][-1]["url"] + asset["tag_string"] = \ + asset["tag_string_artist"] = \ + asset["tag_string_character"] = \ + asset["tag_string_copyright"] = \ + asset["tag_string_general"] = \ + asset["tag_string_meta"] = "" + + if 
self.includes: + params = {"only": self.includes} + asset.update(self.request_json(url, params=params)) + return (asset,) + + class DanbooruPopularExtractor(DanbooruExtractor): """Extractor for popular images from danbooru""" subcategory = "popular" directory_fmt = ("{category}", "popular", "{scale}", "{date}") archive_fmt = "P_{scale[0]}_{date}_{id}" - pattern = BASE_PATTERN + r"/(?:explore/posts/)?popular(?:\?([^#]*))?" + pattern = rf"{BASE_PATTERN}/(?:explore/posts/)?popular(?:\?([^#]*))?" example = "https://danbooru.donmai.us/explore/posts/popular" def metadata(self): self.params = params = text.parse_query(self.groups[-1]) scale = params.get("scale", "day") - date = params.get("date") or datetime.date.today().isoformat() + date = params.get("date") or dt.date.today().isoformat() if scale == "week": - date = datetime.date.fromisoformat(date) - date = (date - datetime.timedelta(days=date.weekday())).isoformat() + date = dt.date.fromisoformat(date) + date = (date - dt.timedelta(days=date.weekday())).isoformat() elif scale == "month": date = date[:-3] @@ -374,7 +398,7 @@ class DanbooruPopularExtractor(DanbooruExtractor): class DanbooruArtistExtractor(DanbooruExtractor): """Extractor for danbooru artists""" subcategory = "artist" - pattern = BASE_PATTERN + r"/artists/(\d+)" + pattern = rf"{BASE_PATTERN}/artists/(\d+)" example = "https://danbooru.donmai.us/artists/12345" items = DanbooruExtractor.items_artists @@ -387,7 +411,7 @@ class DanbooruArtistExtractor(DanbooruExtractor): class DanbooruArtistSearchExtractor(DanbooruExtractor): """Extractor for danbooru artist searches""" subcategory = "artist-search" - pattern = BASE_PATTERN + r"/artists/?\?([^#]+)" + pattern = rf"{BASE_PATTERN}/artists/?\?([^#]+)" example = "https://danbooru.donmai.us/artists?QUERY" items = DanbooruExtractor.items_artists diff --git a/gallery_dl/extractor/dankefuerslesen.py b/gallery_dl/extractor/dankefuerslesen.py index 1c4b7d8..ed7e40b 100644 --- a/gallery_dl/extractor/dankefuerslesen.py 
+++ b/gallery_dl/extractor/dankefuerslesen.py @@ -28,7 +28,7 @@ class DankefuerslesenBase(): class DankefuerslesenChapterExtractor(DankefuerslesenBase, ChapterExtractor): """Extractor for Danke fürs Lesen manga chapters""" - pattern = BASE_PATTERN + r"/read/manga/([\w-]+)/([\w-]+)" + pattern = rf"{BASE_PATTERN}/read/manga/([\w-]+)/([\w-]+)" example = "https://danke.moe/read/manga/TITLE/123/1/" def _init(self): @@ -68,7 +68,7 @@ class DankefuerslesenChapterExtractor(DankefuerslesenBase, ChapterExtractor): "chapter_minor": minor, "group" : manga["groups"][group_id].split(" & "), "group_id" : text.parse_int(group_id), - "date" : text.parse_timestamp(data["release_date"][group_id]), + "date" : self.parse_timestamp(data["release_date"][group_id]), "lang" : util.NONE, "language" : util.NONE, } @@ -95,7 +95,7 @@ class DankefuerslesenMangaExtractor(DankefuerslesenBase, MangaExtractor): """Extractor for Danke fürs Lesen manga""" chapterclass = DankefuerslesenChapterExtractor reverse = False - pattern = BASE_PATTERN + r"/read/manga/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/read/manga/([^/?#]+)" example = "https://danke.moe/read/manga/TITLE/" def chapters(self, page): diff --git a/gallery_dl/extractor/desktopography.py b/gallery_dl/extractor/desktopography.py index 364d88f..be25053 100644 --- a/gallery_dl/extractor/desktopography.py +++ b/gallery_dl/extractor/desktopography.py @@ -22,7 +22,7 @@ class DesktopographyExtractor(Extractor): class DesktopographySiteExtractor(DesktopographyExtractor): """Extractor for all desktopography exhibitions """ subcategory = "site" - pattern = BASE_PATTERN + r"/$" + pattern = rf"{BASE_PATTERN}/$" example = "https://desktopography.net/" def items(self): @@ -41,7 +41,7 @@ class DesktopographySiteExtractor(DesktopographyExtractor): class DesktopographyExhibitionExtractor(DesktopographyExtractor): """Extractor for a yearly desktopography exhibition""" subcategory = "exhibition" - pattern = BASE_PATTERN + r"/exhibition-([^/?#]+)/" + pattern = 
rf"{BASE_PATTERN}/exhibition-([^/?#]+)/" example = "https://desktopography.net/exhibition-2020/" def __init__(self, match): @@ -70,7 +70,7 @@ class DesktopographyExhibitionExtractor(DesktopographyExtractor): class DesktopographyEntryExtractor(DesktopographyExtractor): """Extractor for all resolutions of a desktopography wallpaper""" subcategory = "entry" - pattern = BASE_PATTERN + r"/portfolios/([\w-]+)" + pattern = rf"{BASE_PATTERN}/portfolios/([\w-]+)" example = "https://desktopography.net/portfolios/NAME/" def __init__(self, match): @@ -82,7 +82,7 @@ class DesktopographyEntryExtractor(DesktopographyExtractor): page = self.request(url).text entry_data = {"entry": self.entry} - yield Message.Directory, entry_data + yield Message.Directory, "", entry_data for image_data in text.extract_iter( page, diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 39690da..5bd43d4 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -9,7 +9,7 @@ """Extractors for https://www.deviantart.com/""" from .common import Extractor, Message, Dispatch -from .. import text, util, exception +from .. import text, util, dt, exception from ..cache import cache, memcache import collections import mimetypes @@ -64,13 +64,13 @@ class DeviantartExtractor(Extractor): if self.quality: if self.quality == "png": self.quality = "-fullview.png?" 
- self.quality_sub = util.re(r"-fullview\.[a-z0-9]+\?").sub + self.quality_sub = text.re(r"-fullview\.[a-z0-9]+\?").sub else: self.quality = f",q_{self.quality}" - self.quality_sub = util.re(r",q_\d+").sub + self.quality_sub = text.re(r",q_\d+").sub if self.intermediary: - self.intermediary_subn = util.re(r"(/f/[^/]+/[^/]+)/v\d+/.*").subn + self.intermediary_subn = text.re(r"(/f/[^/]+/[^/]+)/v\d+/.*").subn if isinstance(self.original, str) and \ self.original.lower().startswith("image"): @@ -154,7 +154,7 @@ class DeviantartExtractor(Extractor): deviation.update(data) self.prepare(deviation) - yield Message.Directory, deviation + yield Message.Directory, "", deviation if "content" in deviation: content = self._extract_content(deviation) @@ -259,7 +259,7 @@ class DeviantartExtractor(Extractor): deviation["published_time"] = text.parse_int( deviation["published_time"]) - deviation["date"] = text.parse_timestamp( + deviation["date"] = self.parse_timestamp( deviation["published_time"]) if self.comments: @@ -269,7 +269,7 @@ class DeviantartExtractor(Extractor): ) # filename metadata - sub = util.re(r"\W").sub + sub = text.re(r"\W").sub deviation["filename"] = "".join(( sub("_", deviation["title"].lower()), "_by_", sub("_", deviation["author"]["username"].lower()), "-d", @@ -404,7 +404,7 @@ class DeviantartExtractor(Extractor): try: return self._tiptap_to_html(markup) except Exception as exc: - self.log.debug("", exc_info=exc) + self.log.traceback(exc) self.log.error("%s: '%s: %s'", deviation["index"], exc.__class__.__name__, exc) @@ -675,7 +675,7 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\ def _find_folder(self, folders, name, uuid): if uuid.isdecimal(): - match = util.re( + match = text.re( "(?i)" + name.replace("-", "[^a-z0-9]+") + "$").match for folder in folders: if match(folder["name"]): @@ -864,7 +864,7 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\ class DeviantartUserExtractor(Dispatch, DeviantartExtractor): """Extractor for an artist's user 
profile""" - pattern = BASE_PATTERN + r"/?$" + pattern = rf"{BASE_PATTERN}/?$" example = "https://www.deviantart.com/USER" def items(self): @@ -887,8 +887,8 @@ class DeviantartGalleryExtractor(DeviantartExtractor): """Extractor for all deviations from an artist's gallery""" subcategory = "gallery" archive_fmt = "g_{_username}_{index}.{extension}" - pattern = (BASE_PATTERN + r"/gallery" - r"(?:/all|/recommended-for-you|/?\?catpath=)?/?$") + pattern = (rf"{BASE_PATTERN}/gallery" + r"(?:/all|/recommended-for-you)?/?(\?(?!q=).*)?$") example = "https://www.deviantart.com/USER/gallery/" def deviations(self): @@ -902,7 +902,7 @@ class DeviantartAvatarExtractor(DeviantartExtractor): """Extractor for an artist's avatar""" subcategory = "avatar" archive_fmt = "a_{_username}_{index}" - pattern = BASE_PATTERN + r"/avatar" + pattern = rf"{BASE_PATTERN}/avatar" example = "https://www.deviantart.com/USER/avatar/" def deviations(self): @@ -956,7 +956,7 @@ class DeviantartBackgroundExtractor(DeviantartExtractor): """Extractor for an artist's banner""" subcategory = "background" archive_fmt = "b_{index}" - pattern = BASE_PATTERN + r"/ba(?:nner|ckground)" + pattern = rf"{BASE_PATTERN}/ba(?:nner|ckground)" example = "https://www.deviantart.com/USER/banner/" def deviations(self): @@ -972,7 +972,7 @@ class DeviantartFolderExtractor(DeviantartExtractor): subcategory = "folder" directory_fmt = ("{category}", "{username}", "{folder[title]}") archive_fmt = "F_{folder[uuid]}_{index}.{extension}" - pattern = BASE_PATTERN + r"/gallery/([^/?#]+)/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/gallery/([^/?#]+)/([^/?#]+)" example = "https://www.deviantart.com/USER/gallery/12345/TITLE" def __init__(self, match): @@ -1088,7 +1088,7 @@ class DeviantartFavoriteExtractor(DeviantartExtractor): subcategory = "favorite" directory_fmt = ("{category}", "{username}", "Favourites") archive_fmt = "f_{_username}_{index}.{extension}" - pattern = BASE_PATTERN + r"/favourites(?:/all|/?\?catpath=)?/?$" + pattern = 
rf"{BASE_PATTERN}/favourites(?:/all|/?\?catpath=)?/?$" example = "https://www.deviantart.com/USER/favourites/" def deviations(self): @@ -1105,7 +1105,7 @@ class DeviantartCollectionExtractor(DeviantartExtractor): directory_fmt = ("{category}", "{username}", "Favourites", "{collection[title]}") archive_fmt = "C_{collection[uuid]}_{index}.{extension}" - pattern = BASE_PATTERN + r"/favourites/([^/?#]+)/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/favourites/([^/?#]+)/([^/?#]+)" example = "https://www.deviantart.com/USER/favourites/12345/TITLE" def __init__(self, match): @@ -1136,7 +1136,7 @@ class DeviantartJournalExtractor(DeviantartExtractor): subcategory = "journal" directory_fmt = ("{category}", "{username}", "Journal") archive_fmt = "j_{_username}_{index}.{extension}" - pattern = BASE_PATTERN + r"/(?:posts(?:/journals)?|journal)/?(?:\?.*)?$" + pattern = rf"{BASE_PATTERN}/(?:posts(?:/journals)?|journal)/?(?:\?.*)?$" example = "https://www.deviantart.com/USER/posts/journals/" def deviations(self): @@ -1149,7 +1149,7 @@ class DeviantartStatusExtractor(DeviantartExtractor): directory_fmt = ("{category}", "{username}", "Status") filename_fmt = "{category}_{index}_{title}_{date}.{extension}" archive_fmt = "S_{_username}_{index}.{extension}" - pattern = BASE_PATTERN + r"/posts/statuses" + pattern = rf"{BASE_PATTERN}/posts/statuses" example = "https://www.deviantart.com/USER/posts/statuses/" def deviations(self): @@ -1187,8 +1187,8 @@ class DeviantartStatusExtractor(DeviantartExtractor): deviation["username"] = deviation["author"]["username"] deviation["_username"] = deviation["username"].lower() - deviation["date"] = dt = text.parse_datetime(deviation["ts"]) - deviation["published_time"] = int(util.datetime_to_timestamp(dt)) + deviation["date"] = d = self.parse_datetime_iso(deviation["ts"]) + deviation["published_time"] = int(dt.to_ts(d)) deviation["da_category"] = "Status" deviation["category_path"] = "status" @@ -1253,7 +1253,7 @@ class 
DeviantartDeviationExtractor(DeviantartExtractor): """Extractor for single deviations""" subcategory = "deviation" archive_fmt = "g_{_username}_{index}.{extension}" - pattern = (BASE_PATTERN + r"/(art|journal)/(?:[^/?#]+-)?(\d+)" + pattern = (rf"{BASE_PATTERN}/(art|journal)/(?:[^/?#]+-)?(\d+)" r"|(?:https?://)?(?:www\.)?(?:fx)?deviantart\.com/" r"(?:view/|deviation/|view(?:-full)?\.php/*\?(?:[^#]+&)?id=)" r"(\d+)" # bare deviation ID without slug @@ -1315,7 +1315,7 @@ class DeviantartScrapsExtractor(DeviantartExtractor): subcategory = "scraps" directory_fmt = ("{category}", "{username}", "Scraps") archive_fmt = "s_{_username}_{index}.{extension}" - pattern = BASE_PATTERN + r"/gallery/(?:\?catpath=)?scraps\b" + pattern = rf"{BASE_PATTERN}/gallery/(?:\?catpath=)?scraps\b" example = "https://www.deviantart.com/USER/gallery/scraps" def deviations(self): @@ -1382,7 +1382,7 @@ class DeviantartGallerySearchExtractor(DeviantartExtractor): """Extractor for deviantart gallery searches""" subcategory = "gallery-search" archive_fmt = "g_{_username}_{index}.{extension}" - pattern = BASE_PATTERN + r"/gallery/?\?(q=[^#]+)" + pattern = rf"{BASE_PATTERN}/gallery/?\?(q=[^#]+)" example = "https://www.deviantart.com/USER/gallery?q=QUERY" def __init__(self, match): @@ -1412,7 +1412,7 @@ class DeviantartGallerySearchExtractor(DeviantartExtractor): class DeviantartFollowingExtractor(DeviantartExtractor): """Extractor for user's watched users""" subcategory = "following" - pattern = BASE_PATTERN + "/(?:about#)?watching" + pattern = rf"{BASE_PATTERN}/(?:about#)?watching" example = "https://www.deviantart.com/USER/about#watching" def items(self): diff --git a/gallery_dl/extractor/directlink.py b/gallery_dl/extractor/directlink.py index 85358ba..bbc1ef0 100644 --- a/gallery_dl/extractor/directlink.py +++ b/gallery_dl/extractor/directlink.py @@ -40,5 +40,5 @@ class DirectlinkExtractor(Extractor): data["_http_headers"] = { "Referer": self.url.encode("latin-1", "ignore")} - yield 
Message.Directory, data + yield Message.Directory, "", data yield Message.Url, self.url, data diff --git a/gallery_dl/extractor/discord.py b/gallery_dl/extractor/discord.py index 216e486..0e7f309 100644 --- a/gallery_dl/extractor/discord.py +++ b/gallery_dl/extractor/discord.py @@ -19,7 +19,7 @@ class DiscordExtractor(Extractor): root = "https://discord.com" directory_fmt = ("{category}", "{server_id}_{server}", "{channel_id}_{channel}") - filename_fmt = "{message_id}_{num:>02}_{filename}.{extension}" + filename_fmt = "{message_id}_{num:>02}_{filename[:220]}.{extension}" archive_fmt = "{message_id}_{num}" server_metadata = {} @@ -72,9 +72,7 @@ class DiscordExtractor(Extractor): "author_files": [], "message": self.extract_message_text(message), "message_id": message["id"], - "date": text.parse_datetime( - message["timestamp"], "%Y-%m-%dT%H:%M:%S.%f%z" - ), + "date": self.parse_datetime_iso(message["timestamp"]), "files": [] }) @@ -122,7 +120,7 @@ class DiscordExtractor(Extractor): text.nameext_from_url(file["url"], file) file["num"] = num - yield Message.Directory, message_metadata + yield Message.Directory, "", message_metadata for file in message_metadata["files"]: message_metadata_file = message_metadata.copy() @@ -240,7 +238,7 @@ class DiscordExtractor(Extractor): class DiscordChannelExtractor(DiscordExtractor): subcategory = "channel" - pattern = BASE_PATTERN + r"/channels/(\d+)/(?:\d+/threads/)?(\d+)/?$" + pattern = rf"{BASE_PATTERN}/channels/(\d+)/(?:\d+/threads/)?(\d+)/?$" example = "https://discord.com/channels/1234567890/9876543210" def items(self): @@ -253,7 +251,7 @@ class DiscordChannelExtractor(DiscordExtractor): class DiscordMessageExtractor(DiscordExtractor): subcategory = "message" - pattern = BASE_PATTERN + r"/channels/(\d+)/(\d+)/(\d+)/?$" + pattern = rf"{BASE_PATTERN}/channels/(\d+)/(\d+)/(\d+)/?$" example = "https://discord.com/channels/1234567890/9876543210/2468013579" def items(self): @@ -270,7 +268,7 @@ class 
DiscordMessageExtractor(DiscordExtractor): class DiscordServerExtractor(DiscordExtractor): subcategory = "server" - pattern = BASE_PATTERN + r"/channels/(\d+)/?$" + pattern = rf"{BASE_PATTERN}/channels/(\d+)/?$" example = "https://discord.com/channels/1234567890" def items(self): @@ -288,7 +286,7 @@ class DiscordDirectMessagesExtractor(DiscordExtractor): subcategory = "direct-messages" directory_fmt = ("{category}", "Direct Messages", "{channel_id}_{recipients:J,}") - pattern = BASE_PATTERN + r"/channels/@me/(\d+)/?$" + pattern = rf"{BASE_PATTERN}/channels/@me/(\d+)/?$" example = "https://discord.com/channels/@me/1234567890" def items(self): @@ -299,7 +297,7 @@ class DiscordDirectMessageExtractor(DiscordExtractor): subcategory = "direct-message" directory_fmt = ("{category}", "Direct Messages", "{channel_id}_{recipients:J,}") - pattern = BASE_PATTERN + r"/channels/@me/(\d+)/(\d+)/?$" + pattern = rf"{BASE_PATTERN}/channels/@me/(\d+)/(\d+)/?$" example = "https://discord.com/channels/@me/1234567890/9876543210" def items(self): diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py index 3e0424d..36423db 100644 --- a/gallery_dl/extractor/dynastyscans.py +++ b/gallery_dl/extractor/dynastyscans.py @@ -41,12 +41,12 @@ class DynastyscansBase(): class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor): """Extractor for manga-chapters from dynasty-scans.com""" - pattern = BASE_PATTERN + r"(/chapters/[^/?#]+)" + pattern = rf"{BASE_PATTERN}(/chapters/[^/?#]+)" example = "https://dynasty-scans.com/chapters/NAME" def metadata(self, page): extr = text.extract_from(page) - match = util.re( + match = text.re( r"(?:<a[^>]*>)?([^<]+)(?:</a>)?" # manga name r"(?: ch(\d+)([^:<]*))?" # chapter info r"(?:: (.+))?" 
# title @@ -62,7 +62,7 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor): "author" : text.remove_html(author), "group" : (text.remove_html(group) or text.extr(group, ' alt="', '"')), - "date" : text.parse_datetime(extr( + "date" : self.parse_datetime(extr( '"icon-calendar"></i> ', '<'), "%b %d, %Y"), "tags" : text.split_html(extr( "class='tags'>", "<div id='chapter-actions'")), @@ -81,7 +81,7 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor): class DynastyscansMangaExtractor(DynastyscansBase, MangaExtractor): chapterclass = DynastyscansChapterExtractor reverse = False - pattern = BASE_PATTERN + r"(/series/[^/?#]+)" + pattern = rf"{BASE_PATTERN}(/series/[^/?#]+)" example = "https://dynasty-scans.com/series/NAME" def chapters(self, page): @@ -97,7 +97,7 @@ class DynastyscansSearchExtractor(DynastyscansBase, Extractor): directory_fmt = ("{category}", "Images") filename_fmt = "{image_id}.{extension}" archive_fmt = "i_{image_id}" - pattern = BASE_PATTERN + r"/images/?(?:\?([^#]+))?$" + pattern = rf"{BASE_PATTERN}/images/?(?:\?([^#]+))?$" example = "https://dynasty-scans.com/images?QUERY" def __init__(self, match): @@ -105,7 +105,7 @@ class DynastyscansSearchExtractor(DynastyscansBase, Extractor): self.query = match[1] or "" def items(self): - yield Message.Directory, {} + yield Message.Directory, "", {} for image_id in self.images(): image = self._parse_image_page(image_id) url = image["url"] @@ -126,7 +126,7 @@ class DynastyscansSearchExtractor(DynastyscansBase, Extractor): class DynastyscansImageExtractor(DynastyscansSearchExtractor): """Extractor for individual images on dynasty-scans.com""" subcategory = "image" - pattern = BASE_PATTERN + r"/images/(\d+)" + pattern = rf"{BASE_PATTERN}/images/(\d+)" example = "https://dynasty-scans.com/images/12345" def images(self): @@ -136,7 +136,7 @@ class DynastyscansImageExtractor(DynastyscansSearchExtractor): class DynastyscansAnthologyExtractor(DynastyscansSearchExtractor): 
"""Extractor for dynasty-scans anthologies""" subcategory = "anthology" - pattern = BASE_PATTERN + r"/anthologies/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/anthologies/([^/?#]+)" example = "https://dynasty-scans.com/anthologies/TITLE" def items(self): @@ -166,8 +166,6 @@ class DynastyscansAnthologyExtractor(DynastyscansSearchExtractor): data["scanlator"] = content[1].text[11:] data["tags"] = content[2].text[6:].lower().split(", ") data["title"] = element[5].text - data["date"] = text.parse_datetime( - element[1].text, "%Y-%m-%dT%H:%M:%S%z") - data["date_updated"] = text.parse_datetime( - element[2].text, "%Y-%m-%dT%H:%M:%S%z") + data["date"] = self.parse_datetime_iso(element[1].text) + data["date_updated"] = self.parse_datetime_iso(element[2].text) yield Message.Queue, element[4].text, data diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py index 71c3b30..cc6708d 100644 --- a/gallery_dl/extractor/e621.py +++ b/gallery_dl/extractor/e621.py @@ -51,13 +51,18 @@ class E621Extractor(danbooru.DanbooruExtractor): post["filename"] = file["md5"] post["extension"] = file["ext"] - post["date"] = text.parse_datetime( - post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z") + post["date"] = self.parse_datetime_iso(post["created_at"]) post.update(data) - yield Message.Directory, post + yield Message.Directory, "", post yield Message.Url, file["url"], post + def items_artists(self): + for artist in self.artists(): + artist["_extractor"] = E621TagExtractor + url = f"{self.root}/posts?tags={text.quote(artist['name'])}" + yield Message.Queue, url, artist + def _get_notes(self, id): return self.request_json( f"{self.root}/notes.json?search[post_id]={id}") @@ -89,13 +94,13 @@ BASE_PATTERN = E621Extractor.update({ class E621TagExtractor(E621Extractor, danbooru.DanbooruTagExtractor): """Extractor for e621 posts from tag searches""" - pattern = BASE_PATTERN + r"/posts?(?:\?[^#]*?tags=|/index/\d+/)([^&#]*)" + pattern = 
rf"{BASE_PATTERN}/posts?(?:\?[^#]*?tags=|/index/\d+/)([^&#]*)" example = "https://e621.net/posts?tags=TAG" class E621PoolExtractor(E621Extractor, danbooru.DanbooruPoolExtractor): """Extractor for e621 pools""" - pattern = BASE_PATTERN + r"/pool(?:s|/show)/(\d+)" + pattern = rf"{BASE_PATTERN}/pool(?:s|/show)/(\d+)" example = "https://e621.net/pools/12345" def posts(self): @@ -120,7 +125,7 @@ class E621PoolExtractor(E621Extractor, danbooru.DanbooruPoolExtractor): class E621PostExtractor(E621Extractor, danbooru.DanbooruPostExtractor): """Extractor for single e621 posts""" - pattern = BASE_PATTERN + r"/post(?:s|/show)/(\d+)" + pattern = rf"{BASE_PATTERN}/post(?:s|/show)/(\d+)" example = "https://e621.net/posts/12345" def posts(self): @@ -130,19 +135,38 @@ class E621PostExtractor(E621Extractor, danbooru.DanbooruPostExtractor): class E621PopularExtractor(E621Extractor, danbooru.DanbooruPopularExtractor): """Extractor for popular images from e621""" - pattern = BASE_PATTERN + r"/explore/posts/popular(?:\?([^#]*))?" + pattern = rf"{BASE_PATTERN}/explore/posts/popular(?:\?([^#]*))?" 
example = "https://e621.net/explore/posts/popular" def posts(self): return self._pagination("/popular.json", self.params) +class E621ArtistExtractor(E621Extractor, danbooru.DanbooruArtistExtractor): + """Extractor for e621 artists""" + subcategory = "artist" + pattern = rf"{BASE_PATTERN}/artists/(\d+)" + example = "https://e621.net/artists/12345" + + items = E621Extractor.items_artists + + +class E621ArtistSearchExtractor(E621Extractor, + danbooru.DanbooruArtistSearchExtractor): + """Extractor for e621 artist searches""" + subcategory = "artist-search" + pattern = rf"{BASE_PATTERN}/artists/?\?([^#]+)" + example = "https://e621.net/artists?QUERY" + + items = E621Extractor.items_artists + + class E621FavoriteExtractor(E621Extractor): """Extractor for e621 favorites""" subcategory = "favorite" directory_fmt = ("{category}", "Favorites", "{user_id}") archive_fmt = "f_{user_id}_{id}" - pattern = BASE_PATTERN + r"/favorites(?:\?([^#]*))?" + pattern = rf"{BASE_PATTERN}/favorites(?:\?([^#]*))?" example = "https://e621.net/favorites" def metadata(self): diff --git a/gallery_dl/extractor/eporner.py b/gallery_dl/extractor/eporner.py new file mode 100644 index 0000000..307f14b --- /dev/null +++ b/gallery_dl/extractor/eporner.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.eporner.com/""" + +from .common import GalleryExtractor +from .. 
import text + + +class EpornerGalleryExtractor(GalleryExtractor): + """Extractor for image galleries from eporner.com""" + category = "eporner" + root = "https://eporner.com" + pattern = (r"(?:https?://)?(?:www\.)?eporner\.com" + r"/gallery/(\w+)(?:/([\w-]+))?") + example = "https://www.eporner.com/gallery/GID/SLUG/" + + def __init__(self, match): + url = f"{self.root}/gallery/{match[1]}/{match[2]}/" + GalleryExtractor.__init__(self, match, url) + + def metadata(self, page): + title = text.extr(page, "<title>", " - EPORNER</title>") + if title.endswith(" Photo Gallery"): + title = title[:-14] + + return { + "gallery_id": self.groups[0], + "title" : text.unescape(title), + "slug" : text.extr( + page, "/gallery/", '/"').rpartition("/")[2], + "description": text.unescape(text.extr( + page, 'name="description" content="', '"')), + "tags": text.extr( + page, 'EP.ads.keywords = "', '"').split(","), + } + + def images(self, page): + album = text.extr( + page, 'class="photosgrid gallerygrid"', "id='gallerySlideBox'") + + results = [] + for url in text.extract_iter(album, ' src="', '"'): + url, _, ext = url.rpartition(".") + # Preview images have a resolution suffix. + # E.g. "11208293-image-3_296x1000.jpg". + # The same name, but without the suffix, leads to the full image. 
+ url = url[:url.rfind("_")] + name = url[url.rfind("/")+1:] + results.append((f"{url}.{ext}", {"id": name[:name.find("-")]})) + return results diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py index 68cfdbc..2c9ab47 100644 --- a/gallery_dl/extractor/erome.py +++ b/gallery_dl/extractor/erome.py @@ -64,7 +64,7 @@ class EromeExtractor(Extractor): class EromeAlbumExtractor(EromeExtractor): """Extractor for albums on erome.com""" subcategory = "album" - pattern = BASE_PATTERN + r"/a/(\w+)" + pattern = rf"{BASE_PATTERN}/a/(\w+)" example = "https://www.erome.com/a/ID" def items(self): @@ -74,8 +74,12 @@ class EromeAlbumExtractor(EromeExtractor): try: page = self.request(url).text except exception.HttpError as exc: + if exc.status == 410: + msg = text.extr(exc.response.text, "<h1>", "<") + else: + msg = "Unable to fetch album page" raise exception.AbortExtraction( - f"{album_id}: Unable to fetch album page ({exc})") + f"{album_id}: {msg} ({exc})") title, pos = text.extract( page, 'property="og:title" content="', '"') @@ -96,7 +100,7 @@ class EromeAlbumExtractor(EromeExtractor): if not date: ts = text.extr(group, '?v=', '"') if len(ts) > 1: - date = text.parse_timestamp(ts) + date = self.parse_timestamp(ts) data = { "album_id": album_id, @@ -110,14 +114,14 @@ class EromeAlbumExtractor(EromeExtractor): "_http_headers": {"Referer": url}, } - yield Message.Directory, data + yield Message.Directory, "", data for data["num"], url in enumerate(urls, 1): yield Message.Url, url, text.nameext_from_url(url, data) class EromeUserExtractor(EromeExtractor): subcategory = "user" - pattern = BASE_PATTERN + r"/(?!a/|search\?)([^/?#]+)(?:/?\?([^#]+))?" + pattern = rf"{BASE_PATTERN}/(?!a/|search\?)([^/?#]+)(?:/?\?([^#]+))?" 
example = "https://www.erome.com/USER" def albums(self): @@ -133,7 +137,7 @@ class EromeUserExtractor(EromeExtractor): class EromeSearchExtractor(EromeExtractor): subcategory = "search" - pattern = BASE_PATTERN + r"/search/?\?(q=[^#]+)" + pattern = rf"{BASE_PATTERN}/search/?\?(q=[^#]+)" example = "https://www.erome.com/search?q=QUERY" def albums(self): diff --git a/gallery_dl/extractor/everia.py b/gallery_dl/extractor/everia.py index 91672bb..ce29800 100644 --- a/gallery_dl/extractor/everia.py +++ b/gallery_dl/extractor/everia.py @@ -7,7 +7,7 @@ """Extractors for https://everia.club""" from .common import Extractor, Message -from .. import text, util +from .. import text BASE_PATTERN = r"(?:https?://)?everia\.club" @@ -25,7 +25,7 @@ class EveriaExtractor(Extractor): return self._pagination(self.groups[0]) def _pagination(self, path, params=None, pnum=1): - find_posts = util.re(r'thumbnail">\s*<a href="([^"]+)').findall + find_posts = text.re(r'thumbnail">\s*<a href="([^"]+)').findall while True: if pnum == 1: @@ -45,14 +45,14 @@ class EveriaPostExtractor(EveriaExtractor): subcategory = "post" directory_fmt = ("{category}", "{title}") archive_fmt = "{post_url}_{num}" - pattern = BASE_PATTERN + r"(/\d{4}/\d{2}/\d{2}/[^/?#]+)" + pattern = rf"{BASE_PATTERN}(/\d{{4}}/\d{{2}}/\d{{2}}/[^/?#]+)" example = "https://everia.club/0000/00/00/TITLE" def items(self): url = self.root + self.groups[0] + "/" page = self.request(url).text content = text.extr(page, 'itemprop="text">', "<h3") - urls = util.re(r'img.*?lazy-src="([^"]+)').findall(content) + urls = text.re(r'img.*?lazy-src="([^"]+)').findall(content) data = { "title": text.unescape( @@ -64,7 +64,7 @@ class EveriaPostExtractor(EveriaExtractor): "count": len(urls), } - yield Message.Directory, data + yield Message.Directory, "", data for data["num"], url in enumerate(urls, 1): url = text.unquote(url) yield Message.Url, url, text.nameext_from_url(url, data) @@ -72,26 +72,26 @@ class EveriaPostExtractor(EveriaExtractor): 
class EveriaTagExtractor(EveriaExtractor): subcategory = "tag" - pattern = BASE_PATTERN + r"(/tag/[^/?#]+)" + pattern = rf"{BASE_PATTERN}(/tag/[^/?#]+)" example = "https://everia.club/tag/TAG" class EveriaCategoryExtractor(EveriaExtractor): subcategory = "category" - pattern = BASE_PATTERN + r"(/category/[^/?#]+)" + pattern = rf"{BASE_PATTERN}(/category/[^/?#]+)" example = "https://everia.club/category/CATEGORY" class EveriaDateExtractor(EveriaExtractor): subcategory = "date" - pattern = (BASE_PATTERN + - r"(/\d{4}(?:/\d{2})?(?:/\d{2})?)(?:/page/\d+)?/?$") + pattern = (rf"{BASE_PATTERN}" + rf"(/\d{{4}}(?:/\d{{2}})?(?:/\d{{2}})?)(?:/page/\d+)?/?$") example = "https://everia.club/0000/00/00" class EveriaSearchExtractor(EveriaExtractor): subcategory = "search" - pattern = BASE_PATTERN + r"/(?:page/\d+/)?\?s=([^&#]+)" + pattern = rf"{BASE_PATTERN}/(?:page/\d+/)?\?s=([^&#]+)" example = "https://everia.club/?s=SEARCH" def posts(self): diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index f147959..9dab923 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -115,9 +115,9 @@ class ExhentaiExtractor(Extractor): class ExhentaiGalleryExtractor(ExhentaiExtractor): """Extractor for image galleries from exhentai.org""" subcategory = "gallery" - pattern = (BASE_PATTERN + - r"(?:/g/(\d+)/([\da-f]{10})" - r"|/s/([\da-f]{10})/(\d+)-(\d+))") + pattern = (rf"{BASE_PATTERN}/(?:" + rf"g/(\d+)/([\da-f]{{10}})|" + rf"s/([\da-f]{{10}})/(\d+)-(\d+))") example = "https://e-hentai.org/g/12345/67890abcde/" def __init__(self, match): @@ -193,7 +193,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): self.data = data = self.get_metadata(gpage) self.count = text.parse_int(data["filecount"]) - yield Message.Directory, data + yield Message.Directory, "", data images = itertools.chain( (self.image_from_page(ipage),), self.images_from_api()) @@ -216,7 +216,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): def 
_items_hitomi(self): if self.config("metadata", False): data = self.metadata_from_api() - data["date"] = text.parse_timestamp(data["posted"]) + data["date"] = self.parse_timestamp(data["posted"]) else: data = {} @@ -226,14 +226,14 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): yield Message.Queue, url, data def _items_metadata(self): - yield Message.Directory, self.metadata_from_api() + yield Message.Directory, "", self.metadata_from_api() def get_metadata(self, page): """Extract gallery metadata""" data = self.metadata_from_page(page) if self.config("metadata", False): data.update(self.metadata_from_api()) - data["date"] = text.parse_timestamp(data["posted"]) + data["date"] = self.parse_timestamp(data["posted"]) if self.config("tags", False): tags = collections.defaultdict(list) for tag in data["tags"]: @@ -258,8 +258,8 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): "_" : extr('<div id="gdc"><div class="cs ct', '"'), "eh_category" : extr('>', '<'), "uploader" : extr('<div id="gdn">', '</div>'), - "date" : text.parse_datetime(extr( - '>Posted:</td><td class="gdt2">', '</td>'), "%Y-%m-%d %H:%M"), + "date" : self.parse_datetime_iso(extr( + '>Posted:</td><td class="gdt2">', '</td>')), "parent" : extr( '>Parent:</td><td class="gdt2"><a href="', '"'), "expunged" : "Yes" != extr( @@ -563,7 +563,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): class ExhentaiSearchExtractor(ExhentaiExtractor): """Extractor for exhentai search results""" subcategory = "search" - pattern = BASE_PATTERN + r"/(?:\?([^#]*)|tag/([^/?#]+))" + pattern = rf"{BASE_PATTERN}/(?:\?([^#]*)|tag/([^/?#]+))" example = "https://e-hentai.org/?f_search=QUERY" def __init__(self, match): @@ -620,7 +620,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor): class ExhentaiFavoriteExtractor(ExhentaiSearchExtractor): """Extractor for favorited exhentai galleries""" subcategory = "favorite" - pattern = BASE_PATTERN + r"/favorites\.php(?:\?([^#]*)())?" 
+ pattern = rf"{BASE_PATTERN}/favorites\.php(?:\?([^#]*)())?" example = "https://e-hentai.org/favorites.php" def _init(self): diff --git a/gallery_dl/extractor/facebook.py b/gallery_dl/extractor/facebook.py index 6061737..5d56a5f 100644 --- a/gallery_dl/extractor/facebook.py +++ b/gallery_dl/extractor/facebook.py @@ -11,9 +11,9 @@ from .. import text, util, exception from ..cache import memcache BASE_PATTERN = r"(?:https?://)?(?:[\w-]+\.)?facebook\.com" -USER_PATTERN = (BASE_PATTERN + - r"/(?!media/|photo/|photo.php|watch/)" - r"(?:profile\.php\?id=|people/[^/?#]+/)?([^/?&#]+)") +USER_PATTERN = (rf"{BASE_PATTERN}/" + rf"(?!media/|photo/|photo.php|watch/|permalink.php)" + rf"(?:profile\.php\?id=|people/[^/?#]+/)?([^/?&#]+)") class FacebookExtractor(Extractor): @@ -108,7 +108,7 @@ class FacebookExtractor(Extractor): '"message":{"delight_ranges"', '"},"message_preferred_body"' ).rsplit('],"text":"', 1)[-1]), - "date": text.parse_timestamp( + "date": self.parse_timestamp( text.extr(photo_page, '\\"publish_time\\":', ',') or text.extr(photo_page, '"created_time":', ',') ), @@ -172,7 +172,7 @@ class FacebookExtractor(Extractor): "user_id": text.extr( video_page, '"owner":{"__typename":"User","id":"', '"' ), - "date": text.parse_timestamp(text.extr( + "date": self.parse_timestamp(text.extr( video_page, '\\"publish_time\\":', ',' )), "type": "video" @@ -292,7 +292,7 @@ class FacebookExtractor(Extractor): else: retries = 0 photo.update(set_data) - yield Message.Directory, photo + yield Message.Directory, "", photo yield Message.Url, photo["url"], photo if not photo["next_photo_id"]: @@ -389,9 +389,9 @@ class FacebookExtractor(Extractor): class FacebookPhotoExtractor(FacebookExtractor): """Base class for Facebook Photo extractors""" subcategory = "photo" - pattern = (BASE_PATTERN + - r"/(?:[^/?#]+/photos/[^/?#]+/|photo(?:.php)?/?\?" - r"(?:[^&#]+&)*fbid=)([^/?&#]+)[^/?#]*(?<!&setextract)$") + pattern = (rf"{BASE_PATTERN}/" + rf"(?:[^/?#]+/photos/[^/?#]+/|photo(?:.php)?/?\?" 
+ rf"(?:[^&#]+&)*fbid=)([^/?&#]+)[^/?#]*(?<!&setextract)$") example = "https://www.facebook.com/photo/?fbid=PHOTO_ID" def items(self): @@ -408,7 +408,7 @@ class FacebookPhotoExtractor(FacebookExtractor): directory = self.parse_set_page(set_page) - yield Message.Directory, directory + yield Message.Directory, "", directory yield Message.Url, photo["url"], photo if self.author_followups: @@ -427,12 +427,11 @@ class FacebookSetExtractor(FacebookExtractor): """Base class for Facebook Set extractors""" subcategory = "set" pattern = ( - BASE_PATTERN + - r"/(?:(?:media/set|photo)/?\?(?:[^&#]+&)*set=([^&#]+)" - r"[^/?#]*(?<!&setextract)$" - r"|([^/?#]+/posts/[^/?#]+)" - r"|photo/\?(?:[^&#]+&)*fbid=([^/?&#]+)&set=([^/?&#]+)&setextract)" - ) + rf"{BASE_PATTERN}/" + rf"(?:(?:media/set|photo)/?\?(?:[^&#]+&)*set=([^&#]+)" + rf"[^/?#]*(?<!&setextract)$" + rf"|([^/?#]+/posts/[^/?#]+)" + rf"|photo/\?(?:[^&#]+&)*fbid=([^/?&#]+)&set=([^/?&#]+)&setextract)") example = "https://www.facebook.com/media/set/?set=SET_ID" def items(self): @@ -455,7 +454,7 @@ class FacebookVideoExtractor(FacebookExtractor): """Base class for Facebook Video extractors""" subcategory = "video" directory_fmt = ("{category}", "{username}", "{subcategory}") - pattern = BASE_PATTERN + r"/(?:[^/?#]+/videos/|watch/?\?v=)([^/?&#]+)" + pattern = rf"{BASE_PATTERN}/(?:[^/?#]+/videos/|watch/?\?v=)([^/?&#]+)" example = "https://www.facebook.com/watch/?v=VIDEO_ID" def items(self): @@ -468,7 +467,7 @@ class FacebookVideoExtractor(FacebookExtractor): if "url" not in video: return - yield Message.Directory, video + yield Message.Directory, "", video if self.videos == "ytdl": yield Message.Url, "ytdl:" + video_url, video @@ -482,18 +481,18 @@ class FacebookInfoExtractor(FacebookExtractor): """Extractor for Facebook Profile data""" subcategory = "info" directory_fmt = ("{category}", "{username}") - pattern = USER_PATTERN + r"/info" + pattern = rf"{USER_PATTERN}/info" example = "https://www.facebook.com/USERNAME/info" def 
items(self): user = self._extract_profile(self.groups[0]) - return iter(((Message.Directory, user),)) + return iter(((Message.Directory, "", user),)) class FacebookAlbumsExtractor(FacebookExtractor): """Extractor for Facebook Profile albums""" subcategory = "albums" - pattern = USER_PATTERN + r"/photos_albums(?:/([^/?#]+))?" + pattern = rf"{USER_PATTERN}/photos_albums(?:/([^/?#]+))?" example = "https://www.facebook.com/USERNAME/photos_albums" def items(self): @@ -526,7 +525,7 @@ class FacebookAlbumsExtractor(FacebookExtractor): class FacebookPhotosExtractor(FacebookExtractor): """Extractor for Facebook Profile Photos""" subcategory = "photos" - pattern = USER_PATTERN + r"/photos(?:_by)?" + pattern = rf"{USER_PATTERN}/photos(?:_by)?" example = "https://www.facebook.com/USERNAME/photos" def items(self): @@ -543,7 +542,7 @@ class FacebookPhotosExtractor(FacebookExtractor): class FacebookAvatarExtractor(FacebookExtractor): """Extractor for Facebook Profile Avatars""" subcategory = "avatar" - pattern = USER_PATTERN + r"/avatar" + pattern = rf"{USER_PATTERN}/avatar" example = "https://www.facebook.com/USERNAME/avatar" def items(self): @@ -559,13 +558,13 @@ class FacebookAvatarExtractor(FacebookExtractor): set_page = self.request(set_url).text directory = self.parse_set_page(set_page) - yield Message.Directory, directory + yield Message.Directory, "", directory yield Message.Url, avatar["url"], avatar class FacebookUserExtractor(Dispatch, FacebookExtractor): """Extractor for Facebook Profiles""" - pattern = USER_PATTERN + r"/?(?:$|\?|#)" + pattern = rf"{USER_PATTERN}/?(?:$|\?|#)" example = "https://www.facebook.com/USERNAME" def items(self): diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py index 70b06e7..036b388 100644 --- a/gallery_dl/extractor/fanbox.py +++ b/gallery_dl/extractor/fanbox.py @@ -66,18 +66,17 @@ class FanboxExtractor(Extractor): if fee_max is not None and fee_max < item["feeRequired"]: self.log.warning("Skipping post %s 
(feeRequired of %s > %s)", item["id"], item["feeRequired"], fee_max) - continue - - try: - url = "https://api.fanbox.cc/post.info?postId=" + item["id"] - body = self.request_json(url, headers=self.headers)["body"] - content_body, post = self._extract_post(body) - except Exception as exc: - self.log.warning("Skipping post %s (%s: %s)", - item["id"], exc.__class__.__name__, exc) - continue - - yield Message.Directory, post + else: + try: + url = ("https://api.fanbox.cc/post.info?postId=" + + item["id"]) + item = self.request_json(url, headers=self.headers)["body"] + except Exception as exc: + self.log.warning("Skipping post %s (%s: %s)", + item["id"], exc.__class__.__name__, exc) + + content_body, post = self._extract_post(item) + yield Message.Directory, "", post yield from self._get_urls_from_post(content_body, post) def posts(self): @@ -128,15 +127,19 @@ class FanboxExtractor(Extractor): if file.get("extension", "").lower() in exts ] - post["date"] = text.parse_datetime(post["publishedDatetime"]) + try: + post["date"] = self.parse_datetime_iso(post["publishedDatetime"]) + except Exception: + post["date"] = None post["text"] = content_body.get("text") if content_body else None post["isCoverImage"] = False - if self._meta_user: - post["user"] = self._get_user_data(post["creatorId"]) - if self._meta_plan: + cid = post.get("creatorId") + if self._meta_user and cid is not None: + post["user"] = self._get_user_data(cid) + if self._meta_plan and cid is not None: plans = self._get_plan_data(post["creatorId"]) - fee = post["feeRequired"] + fee = post.get("feeRequired") or 0 try: post["plan"] = plans[fee] except KeyError: @@ -147,7 +150,7 @@ class FanboxExtractor(Extractor): plan["fee"] = fee post["plan"] = plans[fee] = plan if self._meta_comments: - if post["commentCount"]: + if post.get("commentCount"): post["comments"] = list(self._get_comment_data(post["id"])) else: post["commentd"] = () @@ -216,7 +219,7 @@ class FanboxExtractor(Extractor): def _get_urls_from_post(self, 
content_body, post): num = 0 if cover_image := post.get("coverImageUrl"): - cover_image = util.re("/c/[0-9a-z_]+").sub("", cover_image) + cover_image = text.re("/c/[0-9a-z_]+").sub("", cover_image) final_post = post.copy() final_post["isCoverImage"] = True final_post["fileUrl"] = cover_image @@ -352,7 +355,7 @@ class FanboxExtractor(Extractor): class FanboxCreatorExtractor(FanboxExtractor): """Extractor for a Fanbox creator's works""" subcategory = "creator" - pattern = USER_PATTERN + r"(?:/posts)?/?$" + pattern = rf"{USER_PATTERN}(?:/posts)?/?$" example = "https://USER.fanbox.cc/" def posts(self): @@ -362,15 +365,26 @@ class FanboxCreatorExtractor(FanboxExtractor): def _pagination_creator(self, url): urls = self.request_json(url, headers=self.headers)["body"] + if offset := self.config("offset"): + quotient, remainder = divmod(offset, 10) + if quotient: + urls = urls[quotient:] + else: + remainder = None + for url in urls: url = text.ensure_http_scheme(url) - yield from self.request_json(url, headers=self.headers)["body"] + posts = self.request_json(url, headers=self.headers)["body"] + if remainder: + posts = posts[remainder:] + remainder = None + yield from posts class FanboxPostExtractor(FanboxExtractor): """Extractor for media from a single Fanbox post""" subcategory = "post" - pattern = USER_PATTERN + r"/posts/(\d+)" + pattern = rf"{USER_PATTERN}/posts/(\d+)" example = "https://USER.fanbox.cc/posts/12345" def posts(self): @@ -380,7 +394,7 @@ class FanboxPostExtractor(FanboxExtractor): class FanboxHomeExtractor(FanboxExtractor): """Extractor for your Fanbox home feed""" subcategory = "home" - pattern = BASE_PATTERN + r"/?$" + pattern = rf"{BASE_PATTERN}/?$" example = "https://fanbox.cc/" def posts(self): @@ -391,7 +405,7 @@ class FanboxHomeExtractor(FanboxExtractor): class FanboxSupportingExtractor(FanboxExtractor): """Extractor for your supported Fanbox users feed""" subcategory = "supporting" - pattern = BASE_PATTERN + r"/home/supporting" + pattern = 
rf"{BASE_PATTERN}/home/supporting" example = "https://fanbox.cc/home/supporting" def posts(self): @@ -403,6 +417,7 @@ class FanboxRedirectExtractor(Extractor): """Extractor for pixiv redirects to fanbox.cc""" category = "fanbox" subcategory = "redirect" + cookies_domain = None pattern = r"(?:https?://)?(?:www\.)?pixiv\.net/fanbox/creator/(\d+)" example = "https://www.pixiv.net/fanbox/creator/12345" diff --git a/gallery_dl/extractor/fansly.py b/gallery_dl/extractor/fansly.py index 7138599..ba60b15 100644 --- a/gallery_dl/extractor/fansly.py +++ b/gallery_dl/extractor/fansly.py @@ -35,9 +35,9 @@ class FanslyExtractor(Extractor): for post in self.posts(): files = self._extract_files(post) post["count"] = len(files) - post["date"] = text.parse_timestamp(post["createdAt"]) + post["date"] = self.parse_timestamp(post["createdAt"]) - yield Message.Directory, post + yield Message.Directory, "", post for post["num"], file in enumerate(files, 1): post.update(file) url = file["url"] @@ -61,7 +61,8 @@ class FanslyExtractor(Extractor): yield from self.posts_wall(account, wall) def _extract_files(self, post): - files = [] + if "attachments" not in post: + return () if "_extra" in post: extra = post.pop("_extra", ()) @@ -75,11 +76,12 @@ class FanslyExtractor(Extractor): if mid in media ) + files = [] for attachment in post.pop("attachments"): try: self._extract_attachment(files, post, attachment) except Exception as exc: - self.log.debug("", exc_info=exc) + self.log.traceback(exc) self.log.error( "%s/%s, Failed to extract media (%s: %s)", post["id"], attachment.get("id"), @@ -117,8 +119,8 @@ class FanslyExtractor(Extractor): file = { **variant, "format": variant["type"], - "date": text.parse_timestamp(media["createdAt"]), - "date_updated": text.parse_timestamp(media["updatedAt"]), + "date": self.parse_timestamp(media["createdAt"]), + "date_updated": self.parse_timestamp(media["updatedAt"]), } if "metadata" in location: @@ -331,12 +333,20 @@ class FanslyAPI(): posts = 
response["posts"] for post in posts: - post["account"] = accounts[post.pop("accountId")] + try: + post["account"] = accounts[post.pop("accountId")] + except KeyError: + pass extra = None attachments = [] for attachment in post["attachments"]: - cid = attachment["contentId"] + try: + cid = attachment["contentId"] + except KeyError: + attachments.append(attachment) + continue + if cid in media: attachments.append(media[cid]) elif cid in bundles: diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py index e32a86b..d13ec13 100644 --- a/gallery_dl/extractor/fantia.py +++ b/gallery_dl/extractor/fantia.py @@ -48,7 +48,7 @@ class FantiaExtractor(Extractor): for content in contents: files = self._process_content(post, content) - yield Message.Directory, post + yield Message.Directory, "", post if content["visible_status"] != "visible": self.log.warning( @@ -101,7 +101,7 @@ class FantiaExtractor(Extractor): "comment": resp["comment"], "rating": resp["rating"], "posted_at": resp["posted_at"], - "date": text.parse_datetime( + "date": self.parse_datetime( resp["posted_at"], "%a, %d %b %Y %H:%M:%S %z"), "fanclub_id": resp["fanclub"]["id"], "fanclub_user_id": resp["fanclub"]["user"]["id"], diff --git a/gallery_dl/extractor/fapachi.py b/gallery_dl/extractor/fapachi.py index 7ff71b0..a18ce31 100644 --- a/gallery_dl/extractor/fapachi.py +++ b/gallery_dl/extractor/fapachi.py @@ -34,7 +34,7 @@ class FapachiPostExtractor(Extractor): page = self.request(f"{self.root}/{self.user}/media/{self.id}").text url = self.root + text.extract( page, 'data-src="', '"', page.index('class="media-img'))[0] - yield Message.Directory, data + yield Message.Directory, "", data yield Message.Url, url, text.nameext_from_url(url, data) diff --git a/gallery_dl/extractor/fapello.py b/gallery_dl/extractor/fapello.py index b961cbe..afef942 100644 --- a/gallery_dl/extractor/fapello.py +++ b/gallery_dl/extractor/fapello.py @@ -20,7 +20,7 @@ class FapelloPostExtractor(Extractor): 
directory_fmt = ("{category}", "{model}") filename_fmt = "{model}_{id}.{extension}" archive_fmt = "{type}_{model}_{id}" - pattern = BASE_PATTERN + r"/(?!search/|popular_videos/)([^/?#]+)/(\d+)" + pattern = rf"{BASE_PATTERN}/(?!search/|popular_videos/)([^/?#]+)/(\d+)" example = "https://fapello.com/MODEL/12345/" def __init__(self, match): @@ -44,7 +44,7 @@ class FapelloPostExtractor(Extractor): } url = text.extr(page, 'src="', '"').replace( ".md", "").replace(".th", "") - yield Message.Directory, data + yield Message.Directory, "", data yield Message.Url, url, text.nameext_from_url(url, data) @@ -52,9 +52,9 @@ class FapelloModelExtractor(Extractor): """Extractor for all posts from a fapello model""" category = "fapello" subcategory = "model" - pattern = (BASE_PATTERN + r"/(?!top-(?:likes|followers)|popular_videos" - r"|videos|trending|search/?$)" - r"([^/?#]+)/?$") + pattern = (rf"{BASE_PATTERN}/(?!top-(?:likes|followers)|popular_videos" + rf"|videos|trending|search/?$)" + rf"([^/?#]+)/?$") example = "https://fapello.com/model/" def __init__(self, match): @@ -85,9 +85,9 @@ class FapelloPathExtractor(Extractor): """Extractor for models and posts from fapello.com paths""" category = "fapello" subcategory = "path" - pattern = (BASE_PATTERN + - r"/(?!search/?$)(top-(?:likes|followers)|videos|trending" - r"|popular_videos/[^/?#]+)/?$") + pattern = (rf"{BASE_PATTERN}/(?!search/?$)" + rf"(top-(?:likes|followers)|videos|trending" + rf"|popular_videos/[^/?#]+)/?$") example = "https://fapello.com/trending/" def __init__(self, match): diff --git a/gallery_dl/extractor/fikfap.py b/gallery_dl/extractor/fikfap.py new file mode 100644 index 0000000..75071c5 --- /dev/null +++ b/gallery_dl/extractor/fikfap.py @@ -0,0 +1,105 @@ +# -*- coding: utf-8 -*- + +# Copyright 2025 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +"""Extractors for https://fikfap.com/""" + +from .common import Extractor, Message +from .. import text, exception + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?fikfap\.com" + + +class FikfapExtractor(Extractor): + """Base class for fikfap extractors""" + category = "fikfap" + root = "https://fikfap.com" + root_api = "https://api.fikfap.com" + directory_fmt = ("{category}", "{author[username]}") + filename_fmt = "{postId} {label[:240]}.{extension}" + archive_fmt = "{postId}" + + def items(self): + headers = { + "Referer" : self.root + "/", + "Origin" : self.root, + "Sec-Fetch-Dest": "empty", + "Sec-Fetch-Mode": "cors", + "Sec-Fetch-Site": "cross-site", + } + + for post in self.posts(): + if url := post.get("videoFileOriginalUrl"): + post["extension"] = text.ext_from_url(url) + elif url := post.get("videoStreamUrl"): + url = "ytdl:" + url + post["extension"] = "mp4" + post["_ytdl_manifest"] = "hls" + post["_ytdl_manifest_headers"] = headers + else: + self.log.warning("%s: No video available", post["postId"]) + continue + + post["date"] = self.parse_datetime_iso(post["createdAt"]) + post["date_updated"] = self.parse_datetime_iso(post["updatedAt"]) + post["tags"] = [t["label"] for t in post["hashtags"]] + post["filename"] = post["label"] + + yield Message.Directory, "", post + yield Message.Url, url, post + + def request_api(self, url, params): + return self.request_json(url, params=params, headers={ + "Referer" : self.root + "/", + "Authorization-Anonymous": "2527cc30-c3c5-41be-b8bb-104b6ea7a206", + "IsLoggedIn" : "false", + "IsPWA" : "false", + "Origin" : self.root, + "Sec-Fetch-Dest": "empty", + "Sec-Fetch-Mode": "cors", + "Sec-Fetch-Site": "same-site", + }) + + +class FikfapPostExtractor(FikfapExtractor): + subcategory = "post" + pattern = rf"{BASE_PATTERN}/user/(\w+)/post/(\d+)" + example = "https://fikfap.com/user/USER/post/12345" + + def posts(self): + user, pid = self.groups + + url = f"{self.root_api}/profile/username/{user}/posts" + params = {"amount" : 
"1", "startId": pid} + posts = self.request_api(url, params) + + pid = int(pid) + for post in posts: + if post["postId"] == pid: + return (post,) + raise exception.NotFoundError("post") + + +class FikfapUserExtractor(FikfapExtractor): + subcategory = "user" + pattern = rf"{BASE_PATTERN}/user/(\w+)" + example = "https://fikfap.com/user/USER" + + def posts(self): + user = self.groups[0] + + url = f"{self.root_api}/profile/username/{user}/posts" + params = {"amount": "21"} + + while True: + data = self.request_api(url, params) + + yield from data + + if len(data) < 21: + return + params["afterId"] = data[-1]["postId"] diff --git a/gallery_dl/extractor/fitnakedgirls.py b/gallery_dl/extractor/fitnakedgirls.py new file mode 100644 index 0000000..d252ec4 --- /dev/null +++ b/gallery_dl/extractor/fitnakedgirls.py @@ -0,0 +1,208 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://fitnakedgirls.com/""" + +from .common import GalleryExtractor, Extractor, Message +from .. 
import text + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?fitnakedgirls\.com" + + +class FitnakedgirlsExtractor(Extractor): + """Base class for fitnakedgirls extractors""" + category = "fitnakedgirls" + root = "https://fitnakedgirls.com" + + def items(self): + data = {"_extractor": FitnakedgirlsGalleryExtractor} + for url in self.galleries(): + yield Message.Queue, url, data + + def _pagination(self, base): + url = base + pnum = 1 + + while True: + page = self.request(url).text + + for post in text.extract_iter( + page, 'class="entry-body', "</a>"): + yield text.extr(post, 'href="', '"') + + pnum += 1 + url = f"{base}page/{pnum}/" + if f'href="{url}"' not in page: + return + + def _extract_title(self, extr, sep=" - "): + title = text.unescape(extr("<title>", "<")) + if sep in title: + title = title.rpartition(sep)[0] + return title.strip() + + +class FitnakedgirlsGalleryExtractor(GalleryExtractor, FitnakedgirlsExtractor): + """Extractor for fitnakedgirls galleries""" + directory_fmt = ("{category}", "{title}") + filename_fmt = "{filename}.{extension}" + archive_fmt = "{gallery_id}_{filename}" + pattern = rf"{BASE_PATTERN}/photos/gallery/([\w-]+)/?$" + example = "https://fitnakedgirls.com/photos/gallery/MODEL-nude/" + + def __init__(self, match): + url = f"{self.root}/photos/gallery/{match[1]}/" + GalleryExtractor.__init__(self, match, url) + + def metadata(self, page): + extr = text.extract_from(page) + title = self._extract_title(extr) + + # Strip common patterns to get cleaner model name + for pattern in (" Nudes", " Nude", " nudes", " nude"): + if pattern in title: + title = title.partition(pattern)[0] + break + + return { + "gallery_id" : text.parse_int(extr('data-post-id="', '"')), + "gallery_slug": self.groups[0], + "model": title, + "title": title, + "date" : self.parse_datetime_iso(extr( + 'article:published_time" content="', '"')), + } + + def images(self, page): + results = [] + + content = text.extr( + page, 'itemprop="articleBody"', '<!-- .entry-content 
-->') or page + + # Extract videos from wp-block-video figures + for figure in text.extract_iter( + content, '<figure class="wp-block-video">', '</figure>'): + if src := text.extr(figure, 'src="', '"'): + if "/wp-content/uploads/" in src: + results.append((src, None)) + + # Extract images from wp-block-image figures (newer template) + for figure in text.extract_iter( + content, '<figure class="wp-block-image', '</figure>'): + if src := text.extr(figure, 'data-src="', '"'): + if "/wp-content/uploads/" in src: + results.append((src, None)) + + # Fallback: Extract images with size-large class (older template) + if not results: + for img in text.extract_iter(content, "<img ", ">"): + if "size-large" in img: + if src := text.extr(img, 'data-src="', '"'): + if "/wp-content/uploads/" in src: + results.append((src, None)) + + return results + + +class FitnakedgirlsCategoryExtractor(FitnakedgirlsExtractor): + """Extractor for fitnakedgirls category pages""" + subcategory = "category" + pattern = rf"{BASE_PATTERN}/photos/gallery/category/([\w-]+)" + example = "https://fitnakedgirls.com/photos/gallery/category/CATEGORY/" + + def galleries(self): + base = f"{self.root}/photos/gallery/category/{self.groups[0]}/" + return self._pagination(base) + + +class FitnakedgirlsTagExtractor(FitnakedgirlsExtractor): + """Extractor for fitnakedgirls tag pages""" + subcategory = "tag" + pattern = rf"{BASE_PATTERN}/photos/gallery/tag/([\w-]+)" + example = "https://fitnakedgirls.com/photos/gallery/tag/TAG/" + + def galleries(self): + base = f"{self.root}/photos/gallery/tag/{self.groups[0]}/" + return self._pagination(base) + + +class FitnakedgirlsVideoExtractor(FitnakedgirlsExtractor): + """Extractor for fitnakedgirls video posts""" + subcategory = "video" + directory_fmt = ("{category}", "{title}") + filename_fmt = "{filename}.{extension}" + archive_fmt = "{video_id}_{filename}" + pattern = rf"{BASE_PATTERN}/videos/(\d+)/(\d+)/([\w-]+)" + example = 
"https://fitnakedgirls.com/videos/2025/08/VIDEO-TITLE/" + + def items(self): + year, month, slug = self.groups + url = f"{self.root}/videos/{year}/{month}/{slug}/" + page = self.request(url).text + + extr = text.extract_from(page) + data = { + "slug" : slug, + "title" : self._extract_title(extr, " | "), + "video_id": text.parse_int(extr('data-post-id="', '"')), + "date" : self.parse_datetime_iso( + extr('article:published_time" content="', '"')), + } + + yield Message.Directory, "", data + + content = text.extr( + page, 'itemprop="articleBody"', '<!-- .entry-content -->') or page + for video in text.extract_iter(content, "<video ", "</video>"): + if src := text.extr(video, 'src="', '"'): + if "/wp-content/uploads/" in src: + yield Message.Url, src, text.nameext_from_url(src, data) + + +class FitnakedgirlsBlogExtractor(FitnakedgirlsExtractor): + """Extractor for fitnakedgirls blog posts""" + subcategory = "blog" + directory_fmt = ("{category}", "{title}") + filename_fmt = "{filename}.{extension}" + archive_fmt = "{post_id}_{filename}" + pattern = rf"{BASE_PATTERN}/fitblog/([\w-]+)" + example = "https://fitnakedgirls.com/fitblog/MODEL-NAME/" + + def items(self): + slug = self.groups[0] + url = f"{self.root}/fitblog/{slug}/" + page = self.request(url).text + + extr = text.extract_from(page) + data = { + "slug" : slug, + "title" : self._extract_title(extr), + "post_id": text.parse_int(extr('data-post-id="', '"')), + "date" : self.parse_datetime_iso( + extr('article:published_time" content="', '"')), + } + + yield Message.Directory, "", data + + # Extract images from wp-block-image figures + content = text.extr( + page, 'itemprop="articleBody"', '<!-- .entry-content -->') or page + for figure in text.extract_iter( + content, '<figure class="wp-block-image', '</figure>'): + # Try srcset first for highest resolution + if srcset := text.extr(figure, 'srcset="', '"'): + # Get the last (largest) image from srcset + urls = srcset.split(", ") + if urls: + src = 
urls[-1].partition(" ")[0] + if "/wp-content/uploads/" in src: + yield Message.Url, src, text.nameext_from_url( + src, data) + continue + # Fallback to src + if src := text.extr(figure, 'src="', '"'): + if "/wp-content/uploads/" in src: + yield Message.Url, src, text.nameext_from_url(src, data) diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py index 35263a3..1446eb8 100644 --- a/gallery_dl/extractor/flickr.py +++ b/gallery_dl/extractor/flickr.py @@ -10,6 +10,7 @@ from .common import Extractor, Message from .. import text, oauth, util, exception +from ..cache import memcache BASE_PATTERN = r"(?:https?://)?(?:www\.|secure\.|m\.)?flickr\.com" @@ -17,6 +18,7 @@ BASE_PATTERN = r"(?:https?://)?(?:www\.|secure\.|m\.)?flickr\.com" class FlickrExtractor(Extractor): """Base class for flickr extractors""" category = "flickr" + root = "https://www.flickr.com" filename_fmt = "{category}_{id}.{extension}" directory_fmt = ("{category}", "{user[username]}") archive_fmt = "{id}" @@ -24,11 +26,12 @@ class FlickrExtractor(Extractor): request_interval_min = 0.5 def _init(self): - self.api = FlickrAPI(self) self.user = None self.item_id = self.groups[0] def items(self): + self.api = FlickrAPI(self) + data = self.metadata() extract = self.api._extract_format for photo in self.photos(): @@ -38,11 +41,11 @@ class FlickrExtractor(Extractor): self.log.warning( "Skipping photo %s (%s: %s)", photo["id"], exc.__class__.__name__, exc) - self.log.debug("", exc_info=exc) + self.log.traceback(exc) else: photo.update(data) url = self._file_url(photo) - yield Message.Directory, photo + yield Message.Directory, "", photo yield Message.Url, url, text.nameext_from_url(url, photo) def metadata(self): @@ -75,6 +78,8 @@ class FlickrImageExtractor(FlickrExtractor): example = "https://www.flickr.com/photos/USER/12345" def items(self): + self.api = FlickrAPI(self) + item_id, enc_id = self.groups if enc_id is not None: alphabet = ("123456789abcdefghijkmnopqrstu" @@ -98,7 +103,7 @@ 
class FlickrImageExtractor(FlickrExtractor): photo["comments"] = text.parse_int(photo["comments"]["_content"]) photo["description"] = photo["description"]["_content"] photo["tags"] = [t["raw"] for t in photo["tags"]["tag"]] - photo["date"] = text.parse_timestamp(photo["dateuploaded"]) + photo["date"] = self.parse_timestamp(photo["dateuploaded"]) photo["views"] = text.parse_int(photo["views"]) photo["id"] = text.parse_int(photo["id"]) @@ -109,7 +114,7 @@ class FlickrImageExtractor(FlickrExtractor): location[key] = value["_content"] url = self._file_url(photo) - yield Message.Directory, photo + yield Message.Directory, "", photo yield Message.Url, url, text.nameext_from_url(url, photo) @@ -119,7 +124,7 @@ class FlickrAlbumExtractor(FlickrExtractor): directory_fmt = ("{category}", "{user[username]}", "Albums", "{album[id]} {album[title]}") archive_fmt = "a_{album[id]}_{id}" - pattern = BASE_PATTERN + r"/photos/([^/?#]+)/(?:album|set)s(?:/(\d+))?" + pattern = rf"{BASE_PATTERN}/photos/([^/?#]+)/(?:album|set)s(?:/(\d+))?" 
example = "https://www.flickr.com/photos/USER/albums/12345" def items(self): @@ -129,6 +134,8 @@ class FlickrAlbumExtractor(FlickrExtractor): return self._album_items() def _album_items(self): + self.api = FlickrAPI(self) + data = FlickrExtractor.metadata(self) data["_extractor"] = FlickrAlbumExtractor @@ -159,7 +166,7 @@ class FlickrGalleryExtractor(FlickrExtractor): directory_fmt = ("{category}", "{user[username]}", "Galleries", "{gallery[gallery_id]} {gallery[title]}") archive_fmt = "g_{gallery[id]}_{id}" - pattern = BASE_PATTERN + r"/photos/([^/?#]+)/galleries/(\d+)" + pattern = rf"{BASE_PATTERN}/photos/([^/?#]+)/galleries/(\d+)" example = "https://www.flickr.com/photos/USER/galleries/12345/" def metadata(self): @@ -177,7 +184,7 @@ class FlickrGroupExtractor(FlickrExtractor): subcategory = "group" directory_fmt = ("{category}", "Groups", "{group[groupname]}") archive_fmt = "G_{group[nsid]}_{id}" - pattern = BASE_PATTERN + r"/groups/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/groups/([^/?#]+)" example = "https://www.flickr.com/groups/NAME/" def metadata(self): @@ -192,7 +199,7 @@ class FlickrUserExtractor(FlickrExtractor): """Extractor for the photostream of a flickr user""" subcategory = "user" archive_fmt = "u_{user[nsid]}_{id}" - pattern = BASE_PATTERN + r"/photos/([^/?#]+)/?$" + pattern = rf"{BASE_PATTERN}/photos/([^/?#]+)/?$" example = "https://www.flickr.com/photos/USER/" def photos(self): @@ -204,7 +211,7 @@ class FlickrFavoriteExtractor(FlickrExtractor): subcategory = "favorite" directory_fmt = ("{category}", "{user[username]}", "Favorites") archive_fmt = "f_{user[nsid]}_{id}" - pattern = BASE_PATTERN + r"/photos/([^/?#]+)/favorites" + pattern = rf"{BASE_PATTERN}/photos/([^/?#]+)/favorites" example = "https://www.flickr.com/photos/USER/favorites" def photos(self): @@ -216,7 +223,7 @@ class FlickrSearchExtractor(FlickrExtractor): subcategory = "search" directory_fmt = ("{category}", "Search", "{search[text]}") archive_fmt = "s_{search}_{id}" - pattern = 
BASE_PATTERN + r"/search/?\?([^#]+)" + pattern = rf"{BASE_PATTERN}/search/?\?([^#]+)" example = "https://flickr.com/search/?text=QUERY" def metadata(self): @@ -236,8 +243,8 @@ class FlickrAPI(oauth.OAuth1API): """ API_URL = "https://api.flickr.com/services/rest/" - API_KEY = "90c368449018a0cb880ea4889cbb8681" - API_SECRET = "e4b83e319c11e9e1" + # API_KEY = "" + API_SECRET = "" FORMATS = [ ("o" , "Original" , None), ("6k", "X-Large 6K" , 6144), @@ -282,6 +289,14 @@ class FlickrAPI(oauth.OAuth1API): "10": "Public Domain Mark", } + @property + @memcache(maxage=3600) + def API_KEY(self): + extr = self.extractor + extr.log.info("Retrieving public API key") + page = extr.request(extr.root + "/prints").text + return text.extr(page, '.flickr.api.site_key = "', '"') + def __init__(self, extractor): oauth.OAuth1API.__init__(self, extractor) @@ -489,7 +504,7 @@ class FlickrAPI(oauth.OAuth1API): def _extract_format(self, photo): photo["description"] = photo["description"]["_content"].strip() photo["views"] = text.parse_int(photo["views"]) - photo["date"] = text.parse_timestamp(photo["dateupload"]) + photo["date"] = self.extractor.parse_timestamp(photo["dateupload"]) photo["tags"] = photo["tags"].split() self._extract_metadata(photo) diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index dc23488..3c69489 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -31,7 +31,7 @@ class FoolfuukaExtractor(BaseExtractor): self.fixup_redirect = False def items(self): - yield Message.Directory, self.metadata() + yield Message.Directory, "", self.metadata() for post in self.posts(): media = post["media"] if not media: @@ -147,7 +147,7 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor): subcategory = "thread" directory_fmt = ("{category}", "{board[shortname]}", "{thread_num} {title|comment[:50]}") - pattern = BASE_PATTERN + r"/([^/?#]+)/thread/(\d+)" + pattern = rf"{BASE_PATTERN}/([^/?#]+)/thread/(\d+)" example = 
"https://archived.moe/a/thread/12345/" def __init__(self, match): @@ -174,7 +174,7 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor): class FoolfuukaBoardExtractor(FoolfuukaExtractor): """Base extractor for FoolFuuka based boards/archives""" subcategory = "board" - pattern = BASE_PATTERN + r"/([^/?#]+)(?:/(?:page/)?(\d*))?$" + pattern = rf"{BASE_PATTERN}/([^/?#]+)(?:/(?:page/)?(\d*))?$" example = "https://archived.moe/a/" def __init__(self, match): @@ -210,7 +210,7 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor): """Base extractor for search results on FoolFuuka based boards/archives""" subcategory = "search" directory_fmt = ("{category}", "search", "{search}") - pattern = BASE_PATTERN + r"/([^/?#]+)/search((?:/[^/?#]+/[^/?#]+)+)" + pattern = rf"{BASE_PATTERN}/([^/?#]+)/search((?:/[^/?#]+/[^/?#]+)+)" example = "https://archived.moe/_/search/text/QUERY/" request_interval = (0.5, 1.5) @@ -265,7 +265,7 @@ class FoolfuukaGalleryExtractor(FoolfuukaExtractor): """Base extractor for FoolFuuka galleries""" subcategory = "gallery" directory_fmt = ("{category}", "{board}", "gallery") - pattern = BASE_PATTERN + r"/([^/?#]+)/gallery(?:/(\d+))?" + pattern = rf"{BASE_PATTERN}/([^/?#]+)/gallery(?:/(\d+))?" 
example = "https://archived.moe/a/gallery" def metadata(self): diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py index 7c59f72..d932174 100644 --- a/gallery_dl/extractor/foolslide.py +++ b/gallery_dl/extractor/foolslide.py @@ -47,7 +47,7 @@ class FoolslideChapterExtractor(FoolslideExtractor): filename_fmt = ( "{manga}_c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}") archive_fmt = "{id}" - pattern = BASE_PATTERN + r"(/read/[^/?#]+/[a-z-]+/\d+/\d+(?:/\d+)?)" + pattern = rf"{BASE_PATTERN}(/read/[^/?#]+/[a-z-]+/\d+/\d+(?:/\d+)?)" example = "https://read.powermanga.org/read/MANGA/en/0/123/" def items(self): @@ -58,7 +58,7 @@ class FoolslideChapterExtractor(FoolslideExtractor): data["count"] = len(imgs) data["chapter_id"] = text.parse_int(imgs[0]["chapter_id"]) - yield Message.Directory, data + yield Message.Directory, "", data enum = util.enumerate_reversed if self.config( "page-reverse") else enumerate for data["page"], image in enum(imgs, 1): @@ -91,7 +91,7 @@ class FoolslideMangaExtractor(FoolslideExtractor): """Base class for manga extractors for FoOlSlide based sites""" subcategory = "manga" categorytransfer = True - pattern = BASE_PATTERN + r"(/series/[^/?#]+)" + pattern = rf"{BASE_PATTERN}(/series/[^/?#]+)" example = "https://read.powermanga.org/series/MANGA/" def items(self): diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py index 0d24f83..ad57a6b 100644 --- a/gallery_dl/extractor/furaffinity.py +++ b/gallery_dl/extractor/furaffinity.py @@ -54,7 +54,7 @@ class FuraffinityExtractor(Extractor): if post := self._parse_post(post_id): if metadata: post.update(metadata) - yield Message.Directory, post + yield Message.Directory, "", post yield Message.Url, post["url"], post if self.external: @@ -95,7 +95,7 @@ class FuraffinityExtractor(Extractor): if self._new_layout: data["tags"] = text.split_html(extr( - 'class="tags-row">', '</section>')) + "<h3>Keywords</h3>", "</section>")) 
data["scraps"] = (extr(' submissions">', "<") == "Scraps") data["title"] = text.unescape(extr("<h2><p>", "</p></h2>")) data["artist_url"] = extr('title="', '"').strip() @@ -143,7 +143,7 @@ class FuraffinityExtractor(Extractor): data["folders"] = () # folders not present in old layout data["user"] = self.user or data["artist_url"] - data["date"] = text.parse_timestamp(data["filename"].partition(".")[0]) + data["date"] = self.parse_timestamp(data["filename"].partition(".")[0]) data["description"] = self._process_description(data["_description"]) data["thumbnail"] = (f"https://t.furaffinity.net/{post_id}@600-" f"{path.rsplit('/', 2)[1]}.jpg") @@ -231,7 +231,7 @@ class FuraffinityExtractor(Extractor): class FuraffinityGalleryExtractor(FuraffinityExtractor): """Extractor for a furaffinity user's gallery""" subcategory = "gallery" - pattern = BASE_PATTERN + r"/gallery/([^/?#]+)(?:$|/(?!folder/))" + pattern = rf"{BASE_PATTERN}/gallery/([^/?#]+)(?:$|/(?!folder/))" example = "https://www.furaffinity.net/gallery/USER/" def posts(self): @@ -243,7 +243,7 @@ class FuraffinityFolderExtractor(FuraffinityExtractor): subcategory = "folder" directory_fmt = ("{category}", "{user!l}", "Folders", "{folder_id}{folder_name:? //}") - pattern = BASE_PATTERN + r"/gallery/([^/?#]+)/folder/(\d+)(?:/([^/?#]+))?" + pattern = rf"{BASE_PATTERN}/gallery/([^/?#]+)/folder/(\d+)(?:/([^/?#]+))?" 
example = "https://www.furaffinity.net/gallery/USER/folder/12345/FOLDER" def metadata(self): @@ -260,7 +260,7 @@ class FuraffinityScrapsExtractor(FuraffinityExtractor): """Extractor for a furaffinity user's scraps""" subcategory = "scraps" directory_fmt = ("{category}", "{user!l}", "Scraps") - pattern = BASE_PATTERN + r"/scraps/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/scraps/([^/?#]+)" example = "https://www.furaffinity.net/scraps/USER/" def posts(self): @@ -271,7 +271,7 @@ class FuraffinityFavoriteExtractor(FuraffinityExtractor): """Extractor for a furaffinity user's favorites""" subcategory = "favorite" directory_fmt = ("{category}", "{user!l}", "Favorites") - pattern = BASE_PATTERN + r"/favorites/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/favorites/([^/?#]+)" example = "https://www.furaffinity.net/favorites/USER/" def posts(self): @@ -287,7 +287,7 @@ class FuraffinitySearchExtractor(FuraffinityExtractor): """Extractor for furaffinity search results""" subcategory = "search" directory_fmt = ("{category}", "Search", "{search}") - pattern = BASE_PATTERN + r"/search(?:/([^/?#]+))?/?[?&]([^#]+)" + pattern = rf"{BASE_PATTERN}/search(?:/([^/?#]+))?/?[?&]([^#]+)" example = "https://www.furaffinity.net/search/?q=QUERY" def __init__(self, match): @@ -306,7 +306,7 @@ class FuraffinitySearchExtractor(FuraffinityExtractor): class FuraffinityPostExtractor(FuraffinityExtractor): """Extractor for individual posts on furaffinity""" subcategory = "post" - pattern = BASE_PATTERN + r"/(?:view|full)/(\d+)" + pattern = rf"{BASE_PATTERN}/(?:view|full)/(\d+)" example = "https://www.furaffinity.net/view/12345/" def posts(self): @@ -317,7 +317,7 @@ class FuraffinityPostExtractor(FuraffinityExtractor): class FuraffinityUserExtractor(Dispatch, FuraffinityExtractor): """Extractor for furaffinity user profiles""" - pattern = BASE_PATTERN + r"/user/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/user/([^/?#]+)" example = "https://www.furaffinity.net/user/USER/" def items(self): @@ -333,7 +333,7 @@ 
class FuraffinityUserExtractor(Dispatch, FuraffinityExtractor): class FuraffinityFollowingExtractor(FuraffinityExtractor): """Extractor for a furaffinity user's watched users""" subcategory = "following" - pattern = BASE_PATTERN + "/watchlist/by/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/watchlist/by/([^/?#]+)" example = "https://www.furaffinity.net/watchlist/by/USER/" def items(self): @@ -355,7 +355,7 @@ class FuraffinityFollowingExtractor(FuraffinityExtractor): class FuraffinitySubmissionsExtractor(FuraffinityExtractor): """Extractor for new furaffinity submissions""" subcategory = "submissions" - pattern = BASE_PATTERN + r"(/msg/submissions(?:/[^/?#]+)?)" + pattern = rf"{BASE_PATTERN}(/msg/submissions(?:/[^/?#]+)?)" example = "https://www.furaffinity.net/msg/submissions" def posts(self): diff --git a/gallery_dl/extractor/furry34.py b/gallery_dl/extractor/furry34.py index a93ec75..95b98db 100644 --- a/gallery_dl/extractor/furry34.py +++ b/gallery_dl/extractor/furry34.py @@ -55,8 +55,7 @@ class Furry34Extractor(BooruExtractor): def _prepare(self, post): post.pop("files", None) - post["date"] = text.parse_datetime( - post["created"], "%Y-%m-%dT%H:%M:%S.%fZ") + post["date"] = self.parse_datetime_iso(post["created"]) post["filename"], _, post["format"] = post["filename"].rpartition(".") if "tags" in post: post["tags"] = [t["value"] for t in post["tags"]] @@ -98,7 +97,7 @@ class Furry34Extractor(BooruExtractor): class Furry34PostExtractor(Furry34Extractor): subcategory = "post" archive_fmt = "{id}" - pattern = BASE_PATTERN + r"/post/(\d+)" + pattern = rf"{BASE_PATTERN}/post/(\d+)" example = "https://furry34.com/post/12345" def posts(self): @@ -109,7 +108,7 @@ class Furry34PlaylistExtractor(Furry34Extractor): subcategory = "playlist" directory_fmt = ("{category}", "{playlist_id}") archive_fmt = "p_{playlist_id}_{id}" - pattern = BASE_PATTERN + r"/playlists/view/(\d+)" + pattern = rf"{BASE_PATTERN}/playlists/view/(\d+)" example = "https://furry34.com/playlists/view/12345" 
def metadata(self): @@ -124,7 +123,7 @@ class Furry34TagExtractor(Furry34Extractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{id}" - pattern = BASE_PATTERN + r"/(?:([^/?#]+))?(?:/?\?([^#]+))?(?:$|#)" + pattern = rf"{BASE_PATTERN}/(?:([^/?#]+))?(?:/?\?([^#]+))?(?:$|#)" example = "https://furry34.com/TAG" def _init(self): diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index f32059e..0571fcd 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -148,7 +148,7 @@ class GelbooruBase(): class GelbooruTagExtractor(GelbooruBase, gelbooru_v02.GelbooruV02TagExtractor): """Extractor for images from gelbooru.com based on search-tags""" - pattern = BASE_PATTERN + r"page=post&s=list&tags=([^&#]*)" + pattern = rf"{BASE_PATTERN}page=post&s=list&tags=([^&#]*)" example = "https://gelbooru.com/index.php?page=post&s=list&tags=TAG" @@ -156,7 +156,7 @@ class GelbooruPoolExtractor(GelbooruBase, gelbooru_v02.GelbooruV02PoolExtractor): """Extractor for gelbooru pools""" per_page = 45 - pattern = BASE_PATTERN + r"page=pool&s=show&id=(\d+)" + pattern = rf"{BASE_PATTERN}page=pool&s=show&id=(\d+)" example = "https://gelbooru.com/index.php?page=pool&s=show&id=12345" skip = GelbooruBase._skip_offset @@ -187,7 +187,7 @@ class GelbooruFavoriteExtractor(GelbooruBase, gelbooru_v02.GelbooruV02FavoriteExtractor): """Extractor for gelbooru favorites""" per_page = 100 - pattern = BASE_PATTERN + r"page=favorites&s=view&id=(\d+)" + pattern = rf"{BASE_PATTERN}page=favorites&s=view&id=(\d+)" example = "https://gelbooru.com/index.php?page=favorites&s=view&id=12345" skip = GelbooruBase._skip_offset @@ -246,7 +246,7 @@ class GelbooruFavoriteExtractor(GelbooruBase, for fav in favs: for post in self._api_request({"id": fav["favorite"]}): - post["date_favorited"] = text.parse_timestamp(fav["added"]) + post["date_favorited"] = self.parse_timestamp(fav["added"]) yield post params["pid"] += 1 
@@ -273,7 +273,7 @@ class GelbooruFavoriteExtractor(GelbooruBase, for fav in favs: for post in self._api_request({"id": fav["favorite"]}): - post["date_favorited"] = text.parse_timestamp(fav["added"]) + post["date_favorited"] = self.parse_timestamp(fav["added"]) yield post params["pid"] -= 1 @@ -284,10 +284,10 @@ class GelbooruFavoriteExtractor(GelbooruBase, class GelbooruPostExtractor(GelbooruBase, gelbooru_v02.GelbooruV02PostExtractor): """Extractor for single images from gelbooru.com""" - pattern = (BASE_PATTERN + - r"(?=(?:[^#]+&)?page=post(?:&|#|$))" - r"(?=(?:[^#]+&)?s=view(?:&|#|$))" - r"(?:[^#]+&)?id=(\d+)") + pattern = (rf"{BASE_PATTERN}" + rf"(?=(?:[^#]+&)?page=post(?:&|#|$))" + rf"(?=(?:[^#]+&)?s=view(?:&|#|$))" + rf"(?:[^#]+&)?id=(\d+)") example = "https://gelbooru.com/index.php?page=post&s=view&id=12345" diff --git a/gallery_dl/extractor/gelbooru_v01.py b/gallery_dl/extractor/gelbooru_v01.py index 61d0545..7b9c732 100644 --- a/gallery_dl/extractor/gelbooru_v01.py +++ b/gallery_dl/extractor/gelbooru_v01.py @@ -35,8 +35,7 @@ class GelbooruV01Extractor(booru.BooruExtractor): } post["md5"] = post["file_url"].rpartition("/")[2].partition(".")[0] - post["date"] = text.parse_datetime( - post["created_at"], "%Y-%m-%d %H:%M:%S") + post["date"] = self.parse_datetime_iso(post["created_at"]) return post @@ -88,7 +87,7 @@ class GelbooruV01TagExtractor(GelbooruV01Extractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{id}" - pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]+)" + pattern = rf"{BASE_PATTERN}/index\.php\?page=post&s=list&tags=([^&#]+)" example = "https://allgirl.booru.org/index.php?page=post&s=list&tags=TAG" def metadata(self): @@ -105,7 +104,7 @@ class GelbooruV01FavoriteExtractor(GelbooruV01Extractor): directory_fmt = ("{category}", "favorites", "{favorite_id}") archive_fmt = "f_{favorite_id}_{id}" per_page = 50 - pattern = BASE_PATTERN + 
r"/index\.php\?page=favorites&s=view&id=(\d+)" + pattern = rf"{BASE_PATTERN}/index\.php\?page=favorites&s=view&id=(\d+)" example = "https://allgirl.booru.org/index.php?page=favorites&s=view&id=1" def metadata(self): @@ -121,7 +120,7 @@ class GelbooruV01FavoriteExtractor(GelbooruV01Extractor): class GelbooruV01PostExtractor(GelbooruV01Extractor): subcategory = "post" archive_fmt = "{id}" - pattern = BASE_PATTERN + r"/index\.php\?page=post&s=view&id=(\d+)" + pattern = rf"{BASE_PATTERN}/index\.php\?page=post&s=view&id=(\d+)" example = "https://allgirl.booru.org/index.php?page=post&s=view&id=12345" def posts(self): diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index 33db4e4..122f5a9 100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -96,7 +96,7 @@ class GelbooruV02Extractor(booru.BooruExtractor): params["pid"] = self.page_start * self.per_page data = {} - find_ids = util.re(r"\sid=\"p(\d+)").findall + find_ids = text.re(r"\sid=\"p(\d+)").findall while True: page = self.request(url, params=params).text @@ -122,7 +122,7 @@ class GelbooruV02Extractor(booru.BooruExtractor): def _prepare(self, post): post["tags"] = post["tags"].strip() - post["date"] = text.parse_datetime( + post["date"] = self.parse_datetime( post["created_at"], "%a %b %d %H:%M:%S %z %Y") def _html(self, post): @@ -136,7 +136,7 @@ class GelbooruV02Extractor(booru.BooruExtractor): return tags = collections.defaultdict(list) - pattern = util.re(r"(?s)tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)") + pattern = text.re(r"(?s)tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)") for tag_type, tag_name in pattern.findall(tag_container): tags[tag_type].append(text.unescape(text.unquote(tag_name))) for key, value in tags.items(): @@ -190,7 +190,7 @@ class GelbooruV02TagExtractor(GelbooruV02Extractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{id}" - pattern = BASE_PATTERN + 
r"/index\.php\?page=post&s=list&tags=([^&#]*)" + pattern = rf"{BASE_PATTERN}/index\.php\?page=post&s=list&tags=([^&#]*)" example = "https://safebooru.org/index.php?page=post&s=list&tags=TAG" def posts(self): @@ -206,7 +206,7 @@ class GelbooruV02PoolExtractor(GelbooruV02Extractor): subcategory = "pool" directory_fmt = ("{category}", "pool", "{pool}") archive_fmt = "p_{pool}_{id}" - pattern = BASE_PATTERN + r"/index\.php\?page=pool&s=show&id=(\d+)" + pattern = rf"{BASE_PATTERN}/index\.php\?page=pool&s=show&id=(\d+)" example = "https://safebooru.org/index.php?page=pool&s=show&id=12345" def __init__(self, match): @@ -257,7 +257,7 @@ class GelbooruV02FavoriteExtractor(GelbooruV02Extractor): directory_fmt = ("{category}", "favorites", "{favorite_id}") archive_fmt = "f_{favorite_id}_{id}" per_page = 50 - pattern = BASE_PATTERN + r"/index\.php\?page=favorites&s=view&id=(\d+)" + pattern = rf"{BASE_PATTERN}/index\.php\?page=favorites&s=view&id=(\d+)" example = "https://safebooru.org/index.php?page=favorites&s=view&id=12345" def metadata(self): @@ -275,7 +275,7 @@ class GelbooruV02FavoriteExtractor(GelbooruV02Extractor): class GelbooruV02PostExtractor(GelbooruV02Extractor): subcategory = "post" archive_fmt = "{id}" - pattern = BASE_PATTERN + r"/index\.php\?page=post&s=view&id=(\d+)" + pattern = rf"{BASE_PATTERN}/index\.php\?page=post&s=view&id=(\d+)" example = "https://safebooru.org/index.php?page=post&s=view&id=12345" def posts(self): diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py index 407e478..99e6ea7 100644 --- a/gallery_dl/extractor/generic.py +++ b/gallery_dl/extractor/generic.py @@ -7,7 +7,7 @@ """Generic information extractor""" from .common import Extractor, Message -from .. import config, text, util +from .. 
import config, text import os.path @@ -75,7 +75,7 @@ class GenericExtractor(Extractor): pass images = enumerate(imgs, 1) - yield Message.Directory, data + yield Message.Directory, "", data for data["num"], (url, imgdata) in images: if imgdata: @@ -171,8 +171,8 @@ class GenericExtractor(Extractor): r"(?:[^\"'<>\s]*)?" # optional query and fragment ) - imageurls_src = util.re(imageurl_pattern_src).findall(page) - imageurls_ext = util.re(imageurl_pattern_ext).findall(page) + imageurls_src = text.re(imageurl_pattern_src).findall(page) + imageurls_ext = text.re(imageurl_pattern_ext).findall(page) imageurls = imageurls_src + imageurls_ext # Resolve relative urls @@ -181,7 +181,7 @@ class GenericExtractor(Extractor): # by prepending a suitable base url. # # If the page contains a <base> element, use it as base url - basematch = util.re( + basematch = text.re( r"(?i)(?:<base\s.*?href=[\"']?)(?P<url>[^\"' >]+)").search(page) if basematch: self.baseurl = basematch['url'].rstrip('/') diff --git a/gallery_dl/extractor/girlsreleased.py b/gallery_dl/extractor/girlsreleased.py index 5e68a63..0fbdeff 100644 --- a/gallery_dl/extractor/girlsreleased.py +++ b/gallery_dl/extractor/girlsreleased.py @@ -41,7 +41,7 @@ class GirlsreleasedExtractor(Extractor): class GirlsreleasedSetExtractor(GirlsreleasedExtractor): """Extractor for girlsreleased galleries""" subcategory = "set" - pattern = BASE_PATTERN + r"/set/(\d+)" + pattern = rf"{BASE_PATTERN}/set/(\d+)" example = "https://girlsreleased.com/set/12345" def items(self): @@ -52,11 +52,11 @@ class GirlsreleasedSetExtractor(GirlsreleasedExtractor): "id": json["id"], "site": json["site"], "model": [model for _, model in json["models"]], - "date": text.parse_timestamp(json["date"]), + "date": self.parse_timestamp(json["date"]), "count": len(json["images"]), "url": "https://girlsreleased.com/set/" + json["id"], } - yield Message.Directory, data + yield Message.Directory, "", data for data["num"], image in enumerate(json["images"], 1): 
text.nameext_from_url(image[5], data) yield Message.Queue, image[3], data @@ -65,12 +65,12 @@ class GirlsreleasedSetExtractor(GirlsreleasedExtractor): class GirlsreleasedModelExtractor(GirlsreleasedExtractor): """Extractor for girlsreleased models""" subcategory = _path = "model" - pattern = BASE_PATTERN + r"/model/(\d+(?:/.+)?)" + pattern = rf"{BASE_PATTERN}/model/(\d+(?:/.+)?)" example = "https://girlsreleased.com/model/12345/MODEL" class GirlsreleasedSiteExtractor(GirlsreleasedExtractor): """Extractor for girlsreleased sites""" subcategory = _path = "site" - pattern = BASE_PATTERN + r"/site/([^/?#]+(?:/model/\d+/?.*)?)" + pattern = rf"{BASE_PATTERN}/site/([^/?#]+(?:/model/\d+/?.*)?)" example = "https://girlsreleased.com/site/SITE" diff --git a/gallery_dl/extractor/girlswithmuscle.py b/gallery_dl/extractor/girlswithmuscle.py index 51b979f..e61e472 100644 --- a/gallery_dl/extractor/girlswithmuscle.py +++ b/gallery_dl/extractor/girlswithmuscle.py @@ -5,7 +5,7 @@ # published by the Free Software Foundation. from .common import Extractor, Message -from .. import text, util, exception +from .. 
import text, exception from ..cache import cache BASE_PATTERN = r"(?:https?://)?(?:www\.)?girlswithmuscle\.com" @@ -60,7 +60,7 @@ class GirlswithmuscleExtractor(Extractor): class GirlswithmusclePostExtractor(GirlswithmuscleExtractor): """Extractor for individual posts on girlswithmuscle.com""" subcategory = "post" - pattern = BASE_PATTERN + r"/(\d+)" + pattern = rf"{BASE_PATTERN}/(\d+)" example = "https://www.girlswithmuscle.com/12345/" def items(self): @@ -80,7 +80,7 @@ class GirlswithmusclePostExtractor(GirlswithmuscleExtractor): metadata["type"] = "video" text.nameext_from_url(url, metadata) - yield Message.Directory, metadata + yield Message.Directory, "", metadata yield Message.Url, url, metadata def metadata(self, page): @@ -101,9 +101,8 @@ class GirlswithmusclePostExtractor(GirlswithmuscleExtractor): "model": model, "model_list": self._parse_model_list(model), "tags": text.split_html(tags)[1::2], - "date": text.parse_datetime( - text.extr(page, 'class="hover-time" title="', '"')[:19], - "%Y-%m-%d %H:%M:%S"), + "date": self.parse_datetime_iso(text.extr( + page, 'class="hover-time" title="', '"')[:19]), "is_favorite": self._parse_is_favorite(page), "source_filename": source, "uploader": uploader, @@ -144,7 +143,7 @@ class GirlswithmusclePostExtractor(GirlswithmuscleExtractor): class GirlswithmuscleSearchExtractor(GirlswithmuscleExtractor): """Extractor for search results on girlswithmuscle.com""" subcategory = "search" - pattern = BASE_PATTERN + r"/images/(.*)" + pattern = rf"{BASE_PATTERN}/images/(.*)" example = "https://www.girlswithmuscle.com/images/?name=MODEL" def pages(self): @@ -156,7 +155,7 @@ class GirlswithmuscleSearchExtractor(GirlswithmuscleExtractor): raise exception.AuthorizationError(msg) page = response.text - match = util.re(r"Page (\d+) of (\d+)").search(page) + match = text.re(r"Page (\d+) of (\d+)").search(page) current, total = match.groups() current, total = text.parse_int(current), text.parse_int(total) diff --git 
a/gallery_dl/extractor/gofile.py b/gallery_dl/extractor/gofile.py index 0a6c9b9..7c9755a 100644 --- a/gallery_dl/extractor/gofile.py +++ b/gallery_dl/extractor/gofile.py @@ -39,7 +39,7 @@ class GofileFolderExtractor(Extractor): self._get_website_token()) folder = self._get_content(self.content_id, password) - yield Message.Directory, folder + yield Message.Directory, "", folder try: contents = folder.pop("children") @@ -75,14 +75,16 @@ class GofileFolderExtractor(Extractor): @cache(maxage=86400) def _get_website_token(self): self.log.debug("Fetching website token") - page = self.request(self.root + "/dist/js/global.js").text + page = self.request(self.root + "/dist/js/config.js").text return text.extr(page, '.wt = "', '"') def _get_content(self, content_id, password=None): - headers = {"Authorization": "Bearer " + self.api_token} - params = {"wt": self.website_token} - if password is not None: - params["password"] = hashlib.sha256(password.encode()).hexdigest() + headers = { + "Authorization" : "Bearer " + self.api_token, + "X-Website-Token": self.website_token, + } + params = None if password is None else {"password": hashlib.sha256( + password.encode()).hexdigest()} return self._api_request("contents/" + content_id, params, headers) def _api_request(self, endpoint, params=None, headers=None, method="GET"): diff --git a/gallery_dl/extractor/hatenablog.py b/gallery_dl/extractor/hatenablog.py index 8e350d6..7065d7b 100644 --- a/gallery_dl/extractor/hatenablog.py +++ b/gallery_dl/extractor/hatenablog.py @@ -7,7 +7,7 @@ """Extractors for https://hatenablog.com""" from .common import Extractor, Message -from .. import text, util +from .. 
import text BASE_PATTERN = ( @@ -30,11 +30,11 @@ class HatenablogExtractor(Extractor): self.domain = match[1] or match[2] def _init(self): - self._find_img = util.re(r'<img +([^>]+)').finditer + self._find_img = text.re(r'<img +([^>]+)').finditer def _handle_article(self, article: str): extr = text.extract_from(article) - date = text.parse_datetime(extr('<time datetime="', '"')) + date = self.parse_datetime_iso(extr('<time datetime="', '"')) entry_link = text.unescape(extr('<a href="', '"')) entry = entry_link.partition("/entry/")[2] title = text.unescape(extr('>', '<')) @@ -56,7 +56,7 @@ class HatenablogExtractor(Extractor): "title": title, "count": len(images), } - yield Message.Directory, data + yield Message.Directory, "", data for data["num"], url in enumerate(images, 1): yield Message.Url, url, text.nameext_from_url(url, data) @@ -73,7 +73,7 @@ class HatenablogEntriesExtractor(HatenablogExtractor): def _init(self): HatenablogExtractor._init(self) - self._find_pager_url = util.re( + self._find_pager_url = text.re( r' class="pager-next">\s*<a href="([^"]+)').search def items(self): @@ -123,7 +123,7 @@ class HatenablogEntriesExtractor(HatenablogExtractor): class HatenablogEntryExtractor(HatenablogExtractor): """Extractor for a single entry URL""" subcategory = "entry" - pattern = BASE_PATTERN + r"/entry/([^?#]+)" + QUERY_RE + pattern = rf"{BASE_PATTERN}/entry/([^?#]+){QUERY_RE}" example = "https://BLOG.hatenablog.com/entry/PATH" def __init__(self, match): @@ -146,21 +146,21 @@ class HatenablogEntryExtractor(HatenablogExtractor): class HatenablogHomeExtractor(HatenablogEntriesExtractor): """Extractor for a blog's home page""" subcategory = "home" - pattern = BASE_PATTERN + r"(/?)" + QUERY_RE + pattern = rf"{BASE_PATTERN}(/?){QUERY_RE}" example = "https://BLOG.hatenablog.com" class HatenablogArchiveExtractor(HatenablogEntriesExtractor): """Extractor for a blog's archive page""" subcategory = "archive" - pattern = (BASE_PATTERN + 
r"(/archive(?:/\d+(?:/\d+(?:/\d+)?)?" - r"|/category/[^?#]+)?)" + QUERY_RE) + pattern = (rf"{BASE_PATTERN}(/archive(?:/\d+(?:/\d+(?:/\d+)?)?" + rf"|/category/[^?#]+)?){QUERY_RE}") example = "https://BLOG.hatenablog.com/archive/2024" class HatenablogSearchExtractor(HatenablogEntriesExtractor): """Extractor for a blog's search results""" subcategory = "search" - pattern = BASE_PATTERN + r"(/search)" + QUERY_RE + pattern = rf"{BASE_PATTERN}(/search){QUERY_RE}" example = "https://BLOG.hatenablog.com/search?q=QUERY" allowed_parameters = ("q",) diff --git a/gallery_dl/extractor/hentai2read.py b/gallery_dl/extractor/hentai2read.py index ac4cd02..0e4c040 100644 --- a/gallery_dl/extractor/hentai2read.py +++ b/gallery_dl/extractor/hentai2read.py @@ -30,7 +30,7 @@ class Hentai2readChapterExtractor(Hentai2readBase, ChapterExtractor): chapter_id, pos = text.extract(page, 'data-cid="', '"', pos) chapter, sep, minor = self.groups[1].partition(".") - match = util.re( + match = text.re( r"Reading (.+) \(([^)]+)\) Hentai(?: by (.*))? - " r"([^:]+): (.+) . 
Page 1 ").match(title) if match: diff --git a/gallery_dl/extractor/hentaicosplays.py b/gallery_dl/extractor/hentaicosplays.py index 5c2628f..b5f3d0e 100644 --- a/gallery_dl/extractor/hentaicosplays.py +++ b/gallery_dl/extractor/hentaicosplays.py @@ -38,7 +38,7 @@ class HentaicosplaysGalleryExtractor( directory_fmt = ("{site}", "{title}") filename_fmt = "{filename}.{extension}" archive_fmt = "{title}_{filename}" - pattern = BASE_PATTERN + r"/(?:image|story)/([\w-]+)" + pattern = rf"{BASE_PATTERN}/(?:image|story)/([\w-]+)" example = "https://hentai-cosplay-xxx.com/image/TITLE/" def __init__(self, match): diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py index a08f7bb..882183b 100644 --- a/gallery_dl/extractor/hentaifoundry.py +++ b/gallery_dl/extractor/hentaifoundry.py @@ -43,7 +43,7 @@ class HentaifoundryExtractor(Extractor): for post_url in util.advance(self.posts(), self.start_post): image = self._parse_post(post_url) image.update(data) - yield Message.Directory, image + yield Message.Directory, "", image yield Message.Url, image["src"], image def skip(self, num): @@ -86,7 +86,8 @@ class HentaifoundryExtractor(Extractor): .replace("\r\n", "\n")), "ratings" : [text.unescape(r) for r in text.extract_iter(extr( "class='ratings_box'", "</div>"), "title='", "'")], - "date" : text.parse_datetime(extr("datetime='", "'")), + "categories" : self._extract_categories(extr), + "date" : self.parse_datetime_iso(extr("datetime='", "'")), "views" : text.parse_int(extr(">Views</span>", "<")), "score" : text.parse_int(extr(">Vote Score</span>", "<")), "media" : text.unescape(extr(">Media</span>", "<").strip()), @@ -126,7 +127,7 @@ class HentaifoundryExtractor(Extractor): "title" : text.unescape(extr( "<div class='titlebar'>", "</a>").rpartition(">")[2]), "author" : text.unescape(extr('alt="', '"')), - "date" : text.parse_datetime(extr( + "date" : self.parse_datetime(extr( ">Updated<", "</span>").rpartition(">")[2], "%B %d, %Y"), "status" : 
extr("class='indent'>", "<"), } @@ -141,11 +142,17 @@ class HentaifoundryExtractor(Extractor): path = extr('class="pdfLink" href="', '"') data["src"] = self.root + path data["index"] = text.parse_int(path.rsplit("/", 2)[1]) + data["categories"] = self._extract_categories(extr) data["ratings"] = [text.unescape(r) for r in text.extract_iter(extr( "class='ratings_box'", "</div>"), "title='", "'")] return text.nameext_from_url(data["src"], data) + def _extract_categories(self, extr): + return [text.unescape(text.extr(c, ">", "<")) + for c in extr('class="categoryBreadcrumbs">', "</span>") + .split("»")] + def _request_check(self, url, **kwargs): self.request = self._request_original @@ -207,7 +214,7 @@ class HentaifoundryExtractor(Extractor): class HentaifoundryUserExtractor(Dispatch, HentaifoundryExtractor): """Extractor for a hentaifoundry user profile""" - pattern = BASE_PATTERN + r"/user/([^/?#]+)/profile" + pattern = rf"{BASE_PATTERN}/user/([^/?#]+)/profile" example = "https://www.hentai-foundry.com/user/USER/profile" def items(self): @@ -228,7 +235,7 @@ class HentaifoundryUserExtractor(Dispatch, HentaifoundryExtractor): class HentaifoundryPicturesExtractor(HentaifoundryExtractor): """Extractor for all pictures of a hentaifoundry user""" subcategory = "pictures" - pattern = BASE_PATTERN + r"/pictures/user/([^/?#]+)(?:/page/(\d+))?/?$" + pattern = rf"{BASE_PATTERN}/pictures/user/([^/?#]+)(?:/page/(\d+))?/?$" example = "https://www.hentai-foundry.com/pictures/user/USER" def __init__(self, match): @@ -240,7 +247,7 @@ class HentaifoundryScrapsExtractor(HentaifoundryExtractor): """Extractor for scraps of a hentaifoundry user""" subcategory = "scraps" directory_fmt = ("{category}", "{user}", "Scraps") - pattern = BASE_PATTERN + r"/pictures/user/([^/?#]+)/scraps" + pattern = rf"{BASE_PATTERN}/pictures/user/([^/?#]+)/scraps" example = "https://www.hentai-foundry.com/pictures/user/USER/scraps" def __init__(self, match): @@ -253,7 +260,7 @@ class 
HentaifoundryFavoriteExtractor(HentaifoundryExtractor): subcategory = "favorite" directory_fmt = ("{category}", "{user}", "Favorites") archive_fmt = "f_{user}_{index}" - pattern = BASE_PATTERN + r"/user/([^/?#]+)/faves/pictures" + pattern = rf"{BASE_PATTERN}/user/([^/?#]+)/faves/pictures" example = "https://www.hentai-foundry.com/user/USER/faves/pictures" def __init__(self, match): @@ -266,7 +273,7 @@ class HentaifoundryTagExtractor(HentaifoundryExtractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{index}" - pattern = BASE_PATTERN + r"/pictures/tagged/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/pictures/tagged/([^/?#]+)" example = "https://www.hentai-foundry.com/pictures/tagged/TAG" def __init__(self, match): @@ -282,7 +289,7 @@ class HentaifoundryRecentExtractor(HentaifoundryExtractor): subcategory = "recent" directory_fmt = ("{category}", "Recent Pictures", "{date}") archive_fmt = "r_{index}" - pattern = BASE_PATTERN + r"/pictures/recent/(\d\d\d\d-\d\d-\d\d)" + pattern = rf"{BASE_PATTERN}/pictures/recent/(\d\d\d\d-\d\d-\d\d)" example = "https://www.hentai-foundry.com/pictures/recent/1970-01-01" def __init__(self, match): @@ -298,7 +305,7 @@ class HentaifoundryPopularExtractor(HentaifoundryExtractor): subcategory = "popular" directory_fmt = ("{category}", "Popular Pictures") archive_fmt = "p_{index}" - pattern = BASE_PATTERN + r"/pictures/popular()" + pattern = rf"{BASE_PATTERN}/pictures/popular()" example = "https://www.hentai-foundry.com/pictures/popular" def __init__(self, match): @@ -324,7 +331,7 @@ class HentaifoundryImageExtractor(HentaifoundryExtractor): f"/{self.index}/?enterAgree=1") image = self._parse_post(post_url) image["user"] = self.user - yield Message.Directory, image + yield Message.Directory, "", image yield Message.Url, image["src"], image @@ -332,14 +339,14 @@ class HentaifoundryStoriesExtractor(HentaifoundryExtractor): """Extractor for stories of a hentaifoundry user""" subcategory = 
"stories" archive_fmt = "s_{index}" - pattern = BASE_PATTERN + r"/stories/user/([^/?#]+)(?:/page/(\d+))?/?$" + pattern = rf"{BASE_PATTERN}/stories/user/([^/?#]+)(?:/page/(\d+))?/?$" example = "https://www.hentai-foundry.com/stories/user/USER" def items(self): self._init_site_filters() for story_html in util.advance(self.stories(), self.start_post): story = self._parse_story(story_html) - yield Message.Directory, story + yield Message.Directory, "", story yield Message.Url, story["src"], story def stories(self): @@ -351,7 +358,7 @@ class HentaifoundryStoryExtractor(HentaifoundryExtractor): """Extractor for a hentaifoundry story""" subcategory = "story" archive_fmt = "s_{index}" - pattern = BASE_PATTERN + r"/stories/user/([^/?#]+)/(\d+)" + pattern = rf"{BASE_PATTERN}/stories/user/([^/?#]+)/(\d+)" example = "https://www.hentai-foundry.com/stories/user/USER/12345/TITLE" skip = Extractor.skip @@ -364,5 +371,5 @@ class HentaifoundryStoryExtractor(HentaifoundryExtractor): story_url = (f"{self.root}/stories/user/{self.user}" f"/{self.index}/x?enterAgree=1") story = self._parse_story(self.request(story_url).text) - yield Message.Directory, story + yield Message.Directory, "", story yield Message.Url, story["src"], story diff --git a/gallery_dl/extractor/hentaihand.py b/gallery_dl/extractor/hentaihand.py index f4f9d86..2ca462e 100644 --- a/gallery_dl/extractor/hentaihand.py +++ b/gallery_dl/extractor/hentaihand.py @@ -35,8 +35,7 @@ class HentaihandGalleryExtractor(GalleryExtractor): "language" : info["language"]["name"], "lang" : util.language_to_code(info["language"]["name"]), "tags" : [t["slug"] for t in info["tags"]], - "date" : text.parse_datetime( - info["uploaded_at"], "%Y-%m-%d"), + "date" : self.parse_datetime_iso(info["uploaded_at"]), } for key in ("artists", "authors", "groups", "characters", "relationships", "parodies"): diff --git a/gallery_dl/extractor/hentaihere.py b/gallery_dl/extractor/hentaihere.py index b894d77..ef72366 100644 --- 
a/gallery_dl/extractor/hentaihere.py +++ b/gallery_dl/extractor/hentaihere.py @@ -33,7 +33,7 @@ class HentaihereChapterExtractor(HentaihereBase, ChapterExtractor): title = text.extr(page, "<title>", "</title>") chapter_id = text.extr(page, 'report/C', '"') chapter, sep, minor = self.chapter.partition(".") - match = util.re( + match = text.re( r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by " r"(.+) at ").match(title) return { diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py index a75eee0..0eaf798 100644 --- a/gallery_dl/extractor/hiperdex.py +++ b/gallery_dl/extractor/hiperdex.py @@ -9,7 +9,7 @@ """Extractors for https://hiperdex.com/""" from .common import ChapterExtractor, MangaExtractor -from .. import text, util +from .. import text from ..cache import memcache BASE_PATTERN = (r"((?:https?://)?(?:www\.)?" @@ -67,7 +67,7 @@ class HiperdexBase(): class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor): """Extractor for hiperdex manga chapters""" - pattern = BASE_PATTERN + r"(/mangas?/([^/?#]+)/([^/?#]+))" + pattern = rf"{BASE_PATTERN}(/mangas?/([^/?#]+)/([^/?#]+))" example = "https://hiperdex.com/manga/MANGA/CHAPTER/" def __init__(self, match): @@ -79,7 +79,7 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor): return self.chapter_data(self.chapter) def images(self, page): - pattern = util.re(r'id="image-\d+"\s+(?:data-)?src="([^"]+)') + pattern = text.re(r'id="image-\d+"\s+(?:data-)?src="([^"]+)') return [ (url.strip(), None) for url in pattern.findall(page) @@ -89,7 +89,7 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor): class HiperdexMangaExtractor(HiperdexBase, MangaExtractor): """Extractor for hiperdex manga""" chapterclass = HiperdexChapterExtractor - pattern = BASE_PATTERN + r"(/mangas?/([^/?#]+))/?$" + pattern = rf"{BASE_PATTERN}(/mangas?/([^/?#]+))/?$" example = "https://hiperdex.com/manga/MANGA/" def __init__(self, match): @@ -125,7 +125,7 @@ class 
HiperdexArtistExtractor(HiperdexBase, MangaExtractor): categorytransfer = False chapterclass = HiperdexMangaExtractor reverse = False - pattern = BASE_PATTERN + r"(/manga-a(?:rtist|uthor)/(?:[^/?#]+))" + pattern = rf"{BASE_PATTERN}(/manga-a(?:rtist|uthor)/(?:[^/?#]+))" example = "https://hiperdex.com/manga-artist/NAME/" def __init__(self, match): diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index 82bed80..b05a9a7 100644 --- a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -84,7 +84,7 @@ class HitomiGalleryExtractor(HitomiExtractor, GalleryExtractor): "type" : info["type"].capitalize(), "language" : language, "lang" : util.language_to_code(language), - "date" : text.parse_datetime(date, "%Y-%m-%d %H:%M:%S%z"), + "date" : self.parse_datetime_iso(date), "tags" : tags, "artist" : [o["artist"] for o in iget("artists") or ()], "group" : [o["group"] for o in iget("groups") or ()], diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py index 587d88c..953cf4e 100644 --- a/gallery_dl/extractor/hotleak.py +++ b/gallery_dl/extractor/hotleak.py @@ -30,7 +30,7 @@ class HotleakExtractor(Extractor): .replace("_thumb.", ".") ) post["_http_expected_status"] = (404,) - yield Message.Directory, post + yield Message.Directory, "", post yield Message.Url, post["url"], post def posts(self): @@ -61,7 +61,7 @@ def decode_video_url(url): class HotleakPostExtractor(HotleakExtractor): """Extractor for individual posts on hotleak""" subcategory = "post" - pattern = (BASE_PATTERN + r"/(?!(?:hot|creators|videos|photos)(?:$|/))" + pattern = (rf"{BASE_PATTERN}/(?!(?:hot|creators|videos|photos)(?:$|/))" r"([^/]+)/(photo|video)/(\d+)") example = "https://hotleak.vip/MODEL/photo/12345" @@ -96,7 +96,7 @@ class HotleakPostExtractor(HotleakExtractor): class HotleakCreatorExtractor(HotleakExtractor): """Extractor for all posts from a hotleak creator""" subcategory = "creator" - pattern = (BASE_PATTERN + 
r"/(?!(?:hot|creators|videos|photos)(?:$|/))" + pattern = (rf"{BASE_PATTERN}/(?!(?:hot|creators|videos|photos)(?:$|/))" r"([^/?#]+)/?$") example = "https://hotleak.vip/MODEL" @@ -150,7 +150,7 @@ class HotleakCreatorExtractor(HotleakExtractor): class HotleakCategoryExtractor(HotleakExtractor): """Extractor for hotleak categories""" subcategory = "category" - pattern = BASE_PATTERN + r"/(hot|creators|videos|photos)(?:/?\?([^#]+))?" + pattern = rf"{BASE_PATTERN}/(hot|creators|videos|photos)(?:/?\?([^#]+))?" example = "https://hotleak.vip/photos" def __init__(self, match): @@ -172,7 +172,7 @@ class HotleakCategoryExtractor(HotleakExtractor): class HotleakSearchExtractor(HotleakExtractor): """Extractor for hotleak search results""" subcategory = "search" - pattern = BASE_PATTERN + r"/search(?:/?\?([^#]+))" + pattern = rf"{BASE_PATTERN}/search(?:/?\?([^#]+))" example = "https://hotleak.vip/search?search=QUERY" def __init__(self, match): diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py index 26fd595..a8f1298 100644 --- a/gallery_dl/extractor/idolcomplex.py +++ b/gallery_dl/extractor/idolcomplex.py @@ -11,7 +11,8 @@ from . import sankaku BASE_PATTERN = (r"(?:https?://)?(?:www\.)?" 
- r"idol(?:\.sankaku)?complex\.com(?:/[a-z]{2})?") + r"idol(?:\.sankaku)?complex\.com" + r"(?:/[a-z]{2}(?:[-_][A-Z]{2})?)?") class IdolcomplexBase(): @@ -28,17 +29,17 @@ class IdolcomplexBase(): class IdolcomplexTagExtractor(IdolcomplexBase, sankaku.SankakuTagExtractor): """Extractor for idolcomplex tag searches""" - pattern = BASE_PATTERN + r"(?:/posts)?/?\?([^#]*)" + pattern = rf"{BASE_PATTERN}(?:/posts)?/?\?([^#]*)" example = "https://www.idolcomplex.com/en/posts?tags=TAGS" class IdolcomplexPoolExtractor(IdolcomplexBase, sankaku.SankakuPoolExtractor): """Extractor for idolcomplex pools""" - pattern = BASE_PATTERN + r"/pools?/(?:show/)?(\w+)" + pattern = rf"{BASE_PATTERN}/pools?/(?:show/)?(\w+)" example = "https://www.idolcomplex.com/en/pools/0123456789abcdef" class IdolcomplexPostExtractor(IdolcomplexBase, sankaku.SankakuPostExtractor): """Extractor for individual idolcomplex posts""" - pattern = BASE_PATTERN + r"/posts?(?:/show)?/(\w+)" + pattern = rf"{BASE_PATTERN}/posts?(?:/show)?/(\w+)" example = "https://www.idolcomplex.com/en/posts/0123456789abcdef" diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py index abba9df..66fbdd6 100644 --- a/gallery_dl/extractor/imagebam.py +++ b/gallery_dl/extractor/imagebam.py @@ -9,7 +9,7 @@ """Extractors for https://www.imagebam.com/""" from .common import Extractor, Message -from .. import text, util +from .. 
import text class ImagebamExtractor(Extractor): @@ -30,12 +30,10 @@ class ImagebamExtractor(Extractor): url, pos = text.extract(page, '<img src="https://images', '"') filename = text.unescape(text.extract(page, 'alt="', '"', pos)[0]) - data = { + return text.nameext_from_name(filename, { "url" : "https://images" + url, "image_key": path.rpartition("/")[2], - } - data["filename"], _, data["extension"] = filename.rpartition(".") - return data + }) class ImagebamGalleryExtractor(ImagebamExtractor): @@ -58,7 +56,7 @@ class ImagebamGalleryExtractor(ImagebamExtractor): data["count"] = len(images) data["gallery_key"] = self.path.rpartition("/")[2] - yield Message.Directory, data + yield Message.Directory, "", data for data["num"], path in enumerate(images, 1): image = self._parse_image_page(path) image.update(data) @@ -69,7 +67,7 @@ class ImagebamGalleryExtractor(ImagebamExtractor): page, 'id="gallery-name">', '<').strip())} def images(self, page): - findall = util.re(r'<a href="https://www\.imagebam\.com' + findall = text.re(r'<a href="https://www\.imagebam\.com' r'(/(?:image/|view/M)[a-zA-Z0-9]+)').findall paths = [] while True: @@ -96,5 +94,5 @@ class ImagebamImageExtractor(ImagebamExtractor): path = ("/view/" if path[10] == "M" else "/image/") + path[10:] image = self._parse_image_page(path) - yield Message.Directory, image + yield Message.Directory, "", image yield Message.Url, image["url"], image diff --git a/gallery_dl/extractor/imagechest.py b/gallery_dl/extractor/imagechest.py index 464e489..074b4ae 100644 --- a/gallery_dl/extractor/imagechest.py +++ b/gallery_dl/extractor/imagechest.py @@ -19,7 +19,7 @@ class ImagechestGalleryExtractor(GalleryExtractor): """Extractor for image galleries from imgchest.com""" category = "imagechest" root = "https://imgchest.com" - pattern = BASE_PATTERN + r"/p/([A-Za-z0-9]{11})" + pattern = rf"{BASE_PATTERN}/p/([A-Za-z0-9]{{11}})" example = "https://imgchest.com/p/abcdefghijk" def __init__(self, match): @@ -53,11 +53,9 @@ class 
ImagechestGalleryExtractor(GalleryExtractor): def _metadata_api(self, page): post = self.api.post(self.gallery_id) - post["date"] = text.parse_datetime( - post["created"], "%Y-%m-%dT%H:%M:%S.%fZ") + post["date"] = self.parse_datetime_iso(post["created"]) for img in post["images"]: - img["date"] = text.parse_datetime( - img["created"], "%Y-%m-%dT%H:%M:%S.%fZ") + img["date"] = self.parse_datetime_iso(img["created"]) post["gallery_id"] = self.gallery_id post.pop("image_count", None) @@ -80,7 +78,7 @@ class ImagechestUserExtractor(Extractor): category = "imagechest" subcategory = "user" root = "https://imgchest.com" - pattern = BASE_PATTERN + r"/u/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/u/([^/?#]+)" example = "https://imgchest.com/u/USER" def items(self): diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py index 993af7c..f727969 100644 --- a/gallery_dl/extractor/imagefap.py +++ b/gallery_dl/extractor/imagefap.py @@ -39,7 +39,7 @@ class ImagefapExtractor(Extractor): class ImagefapGalleryExtractor(ImagefapExtractor): """Extractor for image galleries from imagefap.com""" subcategory = "gallery" - pattern = BASE_PATTERN + r"/(?:gallery\.php\?gid=|gallery/|pictures/)(\d+)" + pattern = rf"{BASE_PATTERN}/(?:gallery\.php\?gid=|gallery/|pictures/)(\d+)" example = "https://www.imagefap.com/gallery/12345" def __init__(self, match): @@ -51,7 +51,7 @@ class ImagefapGalleryExtractor(ImagefapExtractor): url = f"{self.root}/gallery/{self.gid}" page = self.request(url).text data = self.get_job_metadata(page) - yield Message.Directory, data + yield Message.Directory, "", data for url, image in self.get_images(): data.update(image) yield Message.Url, url, data @@ -110,7 +110,7 @@ class ImagefapGalleryExtractor(ImagefapExtractor): class ImagefapImageExtractor(ImagefapExtractor): """Extractor for single images from imagefap.com""" subcategory = "image" - pattern = BASE_PATTERN + r"/photo/(\d+)" + pattern = rf"{BASE_PATTERN}/photo/(\d+)" example = 
"https://www.imagefap.com/photo/12345" def __init__(self, match): @@ -119,7 +119,7 @@ class ImagefapImageExtractor(ImagefapExtractor): def items(self): url, data = self.get_image() - yield Message.Directory, data + yield Message.Directory, "", data yield Message.Url, url, data def get_image(self): @@ -148,9 +148,9 @@ class ImagefapImageExtractor(ImagefapExtractor): class ImagefapFolderExtractor(ImagefapExtractor): """Extractor for imagefap user folders""" subcategory = "folder" - pattern = (BASE_PATTERN + r"/(?:organizer/|" - r"(?:usergallery\.php\?user(id)?=([^&#]+)&" - r"|profile/([^/?#]+)/galleries\?)folderid=)(\d+|-1)") + pattern = (rf"{BASE_PATTERN}/(?:organizer/|" + rf"(?:usergallery\.php\?user(id)?=([^&#]+)&" + rf"|profile/([^/?#]+)/galleries\?)folderid=)(\d+|-1)") example = "https://www.imagefap.com/organizer/12345" def __init__(self, match): @@ -206,9 +206,9 @@ class ImagefapFolderExtractor(ImagefapExtractor): class ImagefapUserExtractor(ImagefapExtractor): """Extractor for an imagefap user profile""" subcategory = "user" - pattern = (BASE_PATTERN + - r"/(?:profile(?:\.php\?user=|/)([^/?#]+)(?:/galleries)?" 
- r"|usergallery\.php\?userid=(\d+))(?:$|#)") + pattern = (rf"{BASE_PATTERN}/(?:" + rf"profile(?:\.php\?user=|/)([^/?#]+)(?:/galleries)?|" + rf"usergallery\.php\?userid=(\d+))(?:$|#)") example = "https://www.imagefap.com/profile/USER" def __init__(self, match): diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py index 817d2c4..21e6cf8 100644 --- a/gallery_dl/extractor/imagehosts.py +++ b/gallery_dl/extractor/imagehosts.py @@ -19,6 +19,7 @@ class ImagehostImageExtractor(Extractor): basecategory = "imagehost" subcategory = "image" archive_fmt = "{token}" + parent = True _https = True _params = None _cookies = None @@ -27,7 +28,10 @@ class ImagehostImageExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.page_url = f"http{'s' if self._https else ''}://{match[1]}" + if self.root: + self.page_url = f"{self.root}{match[1]}" + else: + self.page_url = f"http{'s' if self._https else ''}://{match[1]}" self.token = match[2] if self._params == "simple": @@ -53,14 +57,25 @@ class ImagehostImageExtractor(Extractor): ).text url, filename = self.get_info(page) - data = text.nameext_from_url(filename, {"token": self.token}) + if not url: + return + + if filename: + data = text.nameext_from_name(filename) + if not data["extension"]: + data["extension"] = text.ext_from_url(url) + else: + data = text.nameext_from_url(url) + data["token"] = self.token + data["post_url"] = self.page_url data.update(self.metadata(page)) + if self._https and url.startswith("http:"): url = "https:" + url[5:] if self._validate is not None: data["_http_validate"] = self._validate - yield Message.Directory, data + yield Message.Directory, "", data yield Message.Url, url, data def get_info(self, page): @@ -70,6 +85,9 @@ class ImagehostImageExtractor(Extractor): """Return additional metadata""" return () + def not_found(self, resource=None): + raise exception.NotFoundError(resource or self.__class__.subcategory) + class 
ImxtoImageExtractor(ImagehostImageExtractor): """Extractor for single images from imx.to""" @@ -92,7 +110,7 @@ class ImxtoImageExtractor(ImagehostImageExtractor): url, pos = text.extract( page, '<div style="text-align:center;"><a href="', '"') if not url: - raise exception.NotFoundError("image") + self.not_found() filename, pos = text.extract(page, ' title="', '"', pos) if self.url_ext and filename: filename += splitext(url)[1] @@ -152,7 +170,7 @@ class AcidimgImageExtractor(ImagehostImageExtractor): if not url: url, pos = text.extract(page, '<img class="centred" src="', '"') if not url: - raise exception.NotFoundError("image") + self.not_found() filename, pos = text.extract(page, "alt='", "'", pos) if not filename: @@ -169,7 +187,11 @@ class ImagevenueImageExtractor(ImagehostImageExtractor): example = "https://www.imagevenue.com/ME123456789" def get_info(self, page): - pos = page.index('class="card-body') + try: + pos = page.index('class="card-body') + except ValueError: + self.not_found() + url, pos = text.extract(page, '<img src="', '"', pos) if url.endswith("/loader.svg"): url, pos = text.extract(page, '<img src="', '"', pos) @@ -199,6 +221,8 @@ class ImagetwistImageExtractor(ImagehostImageExtractor): def get_info(self, page): url , pos = text.extract(page, '<img src="', '"') + if url and url.startswith("/imgs/"): + self.not_found() filename, pos = text.extract(page, ' alt="', '"', pos) return url, filename @@ -249,7 +273,7 @@ class ImgspiceImageExtractor(ImagehostImageExtractor): def get_info(self, page): pos = page.find('id="imgpreview"') if pos < 0: - raise exception.NotFoundError("image") + self.not_found() url , pos = text.extract(page, 'src="', '"', pos) name, pos = text.extract(page, 'alt="', '"', pos) return url, text.unescape(name) @@ -258,23 +282,26 @@ class ImgspiceImageExtractor(ImagehostImageExtractor): class PixhostImageExtractor(ImagehostImageExtractor): """Extractor for single images from pixhost.to""" category = "pixhost" - pattern = 
(r"(?:https?://)?((?:www\.)?pixhost\.(?:to|org)" - r"/show/\d+/(\d+)_[^/?#]+)") + root = "https://pixhost.to" + pattern = (r"(?:https?://)?(?:www\.)?pixhost\.(?:to|org)" + r"(/show/\d+/(\d+)_[^/?#]+)") example = "https://pixhost.to/show/123/12345_NAME.EXT" _cookies = {"pixhostads": "1", "pixhosttest": "1"} def get_info(self, page): - url , pos = text.extract(page, "class=\"image-img\" src=\"", "\"") - filename, pos = text.extract(page, "alt=\"", "\"", pos) - return url, filename + self.kwdict["directory"] = self.page_url.rsplit("/")[-2] + url , pos = text.extract(page, "class=\"image-img\" src=\"", "\"") + name, pos = text.extract(page, "alt=\"", "\"", pos) + return url, text.unescape(name) if name else None class PixhostGalleryExtractor(ImagehostImageExtractor): """Extractor for image galleries from pixhost.to""" category = "pixhost" subcategory = "gallery" - pattern = (r"(?:https?://)?((?:www\.)?pixhost\.(?:to|org)" - r"/gallery/([^/?#]+))") + root = "https://pixhost.to" + pattern = (r"(?:https?://)?(?:www\.)?pixhost\.(?:to|org)" + r"(/gallery/([^/?#]+))") example = "https://pixhost.to/gallery/ID" def items(self): @@ -288,29 +315,39 @@ class PixhostGalleryExtractor(ImagehostImageExtractor): class PostimgImageExtractor(ImagehostImageExtractor): """Extractor for single images from postimages.org""" category = "postimg" - pattern = (r"(?:https?://)?((?:www\.)?(?:postim(?:ages|g)|pixxxels)" - r"\.(?:cc|org)/(?!gallery/)(?:image/)?([^/?#]+)/?)") - example = "https://postimages.org/ID" + root = "https://postimg.cc" + pattern = (r"(?:https?://)?(?:www\.)?(?:postim(?:ages|g)|pixxxels)" + r"\.(?:cc|org)(/(?!gallery/)(?:image/)?([^/?#]+)/?)") + example = "https://postimg.cc/ID" def get_info(self, page): pos = page.index(' id="download"') url , pos = text.rextract(page, ' href="', '"', pos) - filename, pos = text.extract(page, 'class="imagename">', '<', pos) - return url, text.unescape(filename) + filename, pos = text.extract(page, ' class="my-4">', '<', pos) + return url, 
text.unescape(filename) if filename else None class PostimgGalleryExtractor(ImagehostImageExtractor): """Extractor for images galleries from postimages.org""" category = "postimg" subcategory = "gallery" - pattern = (r"(?:https?://)?((?:www\.)?(?:postim(?:ages|g)|pixxxels)" - r"\.(?:cc|org)/gallery/([^/?#]+))") - example = "https://postimages.org/gallery/ID" + root = "https://postimg.cc" + pattern = (r"(?:https?://)?(?:www\.)?(?:postim(?:ages|g)|pixxxels)" + r"\.(?:cc|org)(/gallery/([^/?#]+))") + example = "https://postimg.cc/gallery/ID" def items(self): page = self.request(self.page_url).text - data = {"_extractor": PostimgImageExtractor} - for url in text.extract_iter(page, ' class="thumb"><a href="', '"'): + title = text.extr( + page, 'property="og:title" content="', ' — Postimages"') + + data = { + "_extractor" : PostimgImageExtractor, + "gallery_title": text.unescape(title), + } + + for token in text.extract_iter(page, 'data-image="', '"'): + url = f"{self.root}/{token}" yield Message.Queue, url, data @@ -323,7 +360,7 @@ class TurboimagehostImageExtractor(ImagehostImageExtractor): def get_info(self, page): url = text.extract(page, 'src="', '"', page.index("<img "))[0] - return url, url + return url, None class TurboimagehostGalleryExtractor(ImagehostImageExtractor): @@ -343,7 +380,7 @@ class TurboimagehostGalleryExtractor(ImagehostImageExtractor): if params["p"] == 1 and \ "Requested gallery don`t exist on our website." 
in page: - raise exception.NotFoundError("gallery") + self.not_found() thumb_url = None for thumb_url in text.extract_iter(page, '"><a href="', '"'): @@ -362,7 +399,7 @@ class ViprImageExtractor(ImagehostImageExtractor): def get_info(self, page): url = text.extr(page, '<img src="', '"') - return url, url + return url, None class ImgclickImageExtractor(ImagehostImageExtractor): @@ -439,14 +476,16 @@ class ImgdriveImageExtractor(ImagehostImageExtractor): class SilverpicImageExtractor(ImagehostImageExtractor): """Extractor for single images from silverpic.com""" category = "silverpic" - pattern = (r"(?:https?://)?((?:www\.)?silverpic\.com" - r"/([a-z0-9]{10,})/[\S]+\.html)") - example = "https://silverpic.com/a1b2c3d4f5g6/NAME.EXT.html" + root = "https://silverpic.net" + _params = "complex" + pattern = (r"(?:https?://)?(?:www\.)?silverpic\.(?:net|com)" + r"(/([a-z0-9]{10,})/[\S]+\.html)") + example = "https://silverpic.net/a1b2c3d4f5g6/NAME.EXT.html" def get_info(self, page): url, pos = text.extract(page, '<img src="/img/', '"') alt, pos = text.extract(page, 'alt="', '"', pos) - return f"https://silverpic.com/img/{url}", alt + return f"{self.root}/img/{url}", alt def metadata(self, page): pos = page.find('<img src="/img/') diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py index d9a63c7..d957328 100644 --- a/gallery_dl/extractor/imgbb.py +++ b/gallery_dl/extractor/imgbb.py @@ -30,7 +30,7 @@ class ImgbbExtractor(Extractor): for image in self.posts(): url = image["url"] text.nameext_from_url(url, image) - yield Message.Directory, image + yield Message.Directory, "", image yield Message.Url, url, image def login(self): @@ -159,8 +159,7 @@ class ImgbbImageExtractor(ImgbbExtractor): "width" : text.parse_int(extr('"og:image:width" content="', '"')), "height": text.parse_int(extr('"og:image:height" content="', '"')), "album" : extr("Added to <a", "</a>"), - "date" : text.parse_datetime(extr( - '<span title="', '"'), "%Y-%m-%d %H:%M:%S"), + "date" : 
self.parse_datetime_iso(extr('<span title="', '"')), "user" : util.json_loads(extr( "CHV.obj.resource=", "};") + "}").get("user"), } diff --git a/gallery_dl/extractor/imgbox.py b/gallery_dl/extractor/imgbox.py index 5def88d..8ac66f4 100644 --- a/gallery_dl/extractor/imgbox.py +++ b/gallery_dl/extractor/imgbox.py @@ -9,7 +9,7 @@ """Extractors for https://imgbox.com/""" from .common import Extractor, Message, AsynchronousMixin -from .. import text, util, exception +from .. import text, exception class ImgboxExtractor(Extractor): @@ -19,7 +19,7 @@ class ImgboxExtractor(Extractor): def items(self): data = self.get_job_metadata() - yield Message.Directory, data + yield Message.Directory, "", data for image_key in self.get_image_keys(): imgpage = self.request(self.root + "/" + image_key).text @@ -69,7 +69,7 @@ class ImgboxGalleryExtractor(AsynchronousMixin, ImgboxExtractor): page = self.request(self.root + "/g/" + self.gallery_key).text if "The specified gallery could not be found." in page: raise exception.NotFoundError("gallery") - self.image_keys = util.re( + self.image_keys = text.re( r'<a href="/([^"]+)"><img alt="').findall(page) title = text.extr(page, "<h1>", "</h1>") @@ -88,7 +88,10 @@ class ImgboxImageExtractor(ImgboxExtractor): """Extractor for single images from imgbox.com""" subcategory = "image" archive_fmt = "{image_key}" - pattern = r"(?:https?://)?(?:www\.)?imgbox\.com/([A-Za-z0-9]{8})" + pattern = (r"(?:https?://)?(?:" + r"(?:www\.|i\.)?imgbox\.com|" + r"images\d+\.imgbox\.com/[0-9a-f]{2}/[0-9a-f]{2}" + r")/([A-Za-z0-9]{8})") example = "https://imgbox.com/1234abcd" def __init__(self, match): diff --git a/gallery_dl/extractor/imgpile.py b/gallery_dl/extractor/imgpile.py index 9fc3a9c..f634203 100644 --- a/gallery_dl/extractor/imgpile.py +++ b/gallery_dl/extractor/imgpile.py @@ -54,7 +54,7 @@ class ImgpilePostExtractor(ImgpileExtractor): data = {"post": post} data["count"] = post["count"] = len(files) - yield Message.Directory, data + yield 
Message.Directory, "", data for data["num"], file in enumerate(files, 1): data.update(file) url = file["url"] diff --git a/gallery_dl/extractor/imgth.py b/gallery_dl/extractor/imgth.py index 7e5e6cf..4a81e53 100644 --- a/gallery_dl/extractor/imgth.py +++ b/gallery_dl/extractor/imgth.py @@ -31,7 +31,7 @@ class ImgthGalleryExtractor(GalleryExtractor): "title": text.unescape(extr("<h1>", "</h1>")), "count": text.parse_int(extr( "total of images in this gallery: ", " ")), - "date" : text.parse_datetime( + "date" : self.parse_datetime( extr("created on ", " by <") .replace("th, ", " ", 1).replace("nd, ", " ", 1) .replace("st, ", " ", 1), "%B %d %Y at %H:%M"), diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index 1ac76e0..4755388 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -38,7 +38,7 @@ class ImgurExtractor(Extractor): image["url"] = url = \ f"https://i.imgur.com/{image['id']}.{image['ext']}" - image["date"] = text.parse_datetime(image["created_at"]) + image["date"] = self.parse_datetime_iso(image["created_at"]) image["_http_validate"] = self._validate text.nameext_from_url(url, image) @@ -67,7 +67,7 @@ class ImgurImageExtractor(ImgurExtractor): subcategory = "image" filename_fmt = "{category}_{id}{title:?_//}.{extension}" archive_fmt = "{id}" - pattern = (BASE_PATTERN + r"/(?!gallery|search)" + pattern = (rf"{BASE_PATTERN}/(?!gallery|search)" r"(?:r/\w+/)?(?:[^/?#]+-)?(\w{7}|\w{5})[sbtmlh]?") example = "https://imgur.com/abcdefg" @@ -83,7 +83,7 @@ class ImgurImageExtractor(ImgurExtractor): image.update(image["media"][0]) del image["media"] url = self._prepare(image) - yield Message.Directory, image + yield Message.Directory, "", image yield Message.Url, url, image @@ -93,7 +93,7 @@ class ImgurAlbumExtractor(ImgurExtractor): directory_fmt = ("{category}", "{album[id]}{album[title]:? 
- //}") filename_fmt = "{category}_{album[id]}_{num:>03}_{id}.{extension}" archive_fmt = "{album[id]}_{id}" - pattern = BASE_PATTERN + r"/a/(?:[^/?#]+-)?(\w{7}|\w{5})" + pattern = rf"{BASE_PATTERN}/a/(?:[^/?#]+-)?(\w{{7}}|\w{{5}})" example = "https://imgur.com/a/abcde" def items(self): @@ -106,7 +106,7 @@ class ImgurAlbumExtractor(ImgurExtractor): del album["media"] count = len(images) - album["date"] = text.parse_datetime(album["created_at"]) + album["date"] = self.parse_datetime_iso(album["created_at"]) try: del album["ad_url"] @@ -119,14 +119,15 @@ class ImgurAlbumExtractor(ImgurExtractor): image["num"] = num image["count"] = count image["album"] = album - yield Message.Directory, image + yield Message.Directory, "", image yield Message.Url, url, image class ImgurGalleryExtractor(ImgurExtractor): """Extractor for imgur galleries""" subcategory = "gallery" - pattern = BASE_PATTERN + r"/(?:gallery|t/\w+)/(?:[^/?#]+-)?(\w{7}|\w{5})" + pattern = (rf"{BASE_PATTERN}/" + rf"(?:gallery|t/\w+)/(?:[^/?#]+-)?(\w{{7}}|\w{{5}})") example = "https://imgur.com/gallery/abcde" def items(self): @@ -142,7 +143,7 @@ class ImgurGalleryExtractor(ImgurExtractor): class ImgurUserExtractor(ImgurExtractor): """Extractor for all images posted by a user""" subcategory = "user" - pattern = (BASE_PATTERN + r"/user/(?!me(?:/|$|\?|#))" + pattern = (rf"{BASE_PATTERN}/user/(?!me(?:/|$|\?|#))" r"([^/?#]+)(?:/posts|/submitted)?/?$") example = "https://imgur.com/user/USER" @@ -153,7 +154,7 @@ class ImgurUserExtractor(ImgurExtractor): class ImgurFavoriteExtractor(ImgurExtractor): """Extractor for a user's favorites""" subcategory = "favorite" - pattern = BASE_PATTERN + r"/user/([^/?#]+)/favorites/?$" + pattern = rf"{BASE_PATTERN}/user/([^/?#]+)/favorites/?$" example = "https://imgur.com/user/USER/favorites" def items(self): @@ -163,7 +164,7 @@ class ImgurFavoriteExtractor(ImgurExtractor): class ImgurFavoriteFolderExtractor(ImgurExtractor): """Extractor for a user's favorites folder""" subcategory = 
"favorite-folder" - pattern = BASE_PATTERN + r"/user/([^/?#]+)/favorites/folder/(\d+)" + pattern = rf"{BASE_PATTERN}/user/([^/?#]+)/favorites/folder/(\d+)" example = "https://imgur.com/user/USER/favorites/folder/12345/TITLE" def __init__(self, match): @@ -178,7 +179,7 @@ class ImgurFavoriteFolderExtractor(ImgurExtractor): class ImgurMeExtractor(ImgurExtractor): """Extractor for your personal uploads""" subcategory = "me" - pattern = BASE_PATTERN + r"/user/me(?:/posts)?(/hidden)?" + pattern = rf"{BASE_PATTERN}/user/me(?:/posts)?(/hidden)?" example = "https://imgur.com/user/me" def items(self): @@ -195,7 +196,7 @@ class ImgurMeExtractor(ImgurExtractor): class ImgurSubredditExtractor(ImgurExtractor): """Extractor for a subreddits's imgur links""" subcategory = "subreddit" - pattern = BASE_PATTERN + r"/r/([^/?#]+)/?$" + pattern = rf"{BASE_PATTERN}/r/([^/?#]+)/?$" example = "https://imgur.com/r/SUBREDDIT" def items(self): @@ -205,7 +206,7 @@ class ImgurSubredditExtractor(ImgurExtractor): class ImgurTagExtractor(ImgurExtractor): """Extractor for imgur tag searches""" subcategory = "tag" - pattern = BASE_PATTERN + r"/t/([^/?#]+)$" + pattern = rf"{BASE_PATTERN}/t/([^/?#]+)$" example = "https://imgur.com/t/TAG" def items(self): @@ -215,7 +216,7 @@ class ImgurTagExtractor(ImgurExtractor): class ImgurSearchExtractor(ImgurExtractor): """Extractor for imgur search results""" subcategory = "search" - pattern = BASE_PATTERN + r"/search(?:/[^?#]+)?/?\?q=([^&#]+)" + pattern = rf"{BASE_PATTERN}/search(?:/[^?#]+)?/?\?q=([^&#]+)" example = "https://imgur.com/search?q=UERY" def items(self): diff --git a/gallery_dl/extractor/imhentai.py b/gallery_dl/extractor/imhentai.py index 5ad1c30..d83dcc8 100644 --- a/gallery_dl/extractor/imhentai.py +++ b/gallery_dl/extractor/imhentai.py @@ -79,7 +79,7 @@ BASE_PATTERN = ImhentaiExtractor.update({ class ImhentaiGalleryExtractor(ImhentaiExtractor, GalleryExtractor): """Extractor for imhentai galleries""" - pattern = BASE_PATTERN + 
r"/(?:gallery|view)/(\d+)" + pattern = rf"{BASE_PATTERN}/(?:gallery|view)/(\d+)" example = "https://imhentai.xxx/gallery/12345/" def __init__(self, match): @@ -141,7 +141,7 @@ class ImhentaiGalleryExtractor(ImhentaiExtractor, GalleryExtractor): class ImhentaiTagExtractor(ImhentaiExtractor): """Extractor for imhentai tag searches""" subcategory = "tag" - pattern = (BASE_PATTERN + r"(/(?:" + pattern = (rf"{BASE_PATTERN}(/(?:" r"artist|category|character|group|language|parody|tag" r")/([^/?#]+))") example = "https://imhentai.xxx/tag/TAG/" @@ -154,9 +154,8 @@ class ImhentaiTagExtractor(ImhentaiExtractor): class ImhentaiSearchExtractor(ImhentaiExtractor): """Extractor for imhentai search results""" subcategory = "search" - pattern = BASE_PATTERN + r"/search(/?\?[^#]+|/[^/?#]+/?)" + pattern = rf"{BASE_PATTERN}(/(?:advanced-)?search/?\?[^#]+|/[^/?#]+/?)" example = "https://imhentai.xxx/search/?key=QUERY" def items(self): - url = self.root + "/search" + self.groups[-1] - return self._pagination(url) + return self._pagination(self.root + self.groups[-1]) diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py index 45ae52e..547d4ee 100644 --- a/gallery_dl/extractor/inkbunny.py +++ b/gallery_dl/extractor/inkbunny.py @@ -35,8 +35,8 @@ class InkbunnyExtractor(Extractor): for post in self.posts(): post.update(metadata) - post["date"] = text.parse_datetime( - post["create_datetime"] + "00", "%Y-%m-%d %H:%M:%S.%f%z") + post["date"] = self.parse_datetime_iso( + post["create_datetime"][:19]) post["tags"] = [kw["keyword_name"] for kw in post["keywords"]] post["ratings"] = [r["name"] for r in post["ratings"]] files = post["files"] @@ -48,12 +48,12 @@ class InkbunnyExtractor(Extractor): del post["keywords"] del post["files"] - yield Message.Directory, post + yield Message.Directory, "", post for post["num"], file in enumerate(files, 1): post.update(file) post["deleted"] = (file["deleted"] == "t") - post["date"] = text.parse_datetime( - file["create_datetime"] 
+ "00", "%Y-%m-%d %H:%M:%S.%f%z") + post["date"] = self.parse_datetime_iso( + file["create_datetime"][:19]) text.nameext_from_url(file["file_name"], post) url = file["file_url_full"] @@ -71,7 +71,7 @@ class InkbunnyExtractor(Extractor): class InkbunnyUserExtractor(InkbunnyExtractor): """Extractor for inkbunny user profiles""" subcategory = "user" - pattern = BASE_PATTERN + r"/(?!s/)(gallery/|scraps/)?(\w+)(?:$|[/?#])" + pattern = rf"{BASE_PATTERN}/(?!s/)(gallery/|scraps/)?(\w+)(?:$|[/?#])" example = "https://inkbunny.net/USER" def __init__(self, match): @@ -101,7 +101,7 @@ class InkbunnyUserExtractor(InkbunnyExtractor): class InkbunnyPoolExtractor(InkbunnyExtractor): """Extractor for inkbunny pools""" subcategory = "pool" - pattern = (BASE_PATTERN + r"/(?:" + pattern = (rf"{BASE_PATTERN}/(?:" r"poolview_process\.php\?pool_id=(\d+)|" r"submissionsviewall\.php" r"\?((?:[^#]+&)?mode=pool(?:&[^#]+)?))") @@ -132,7 +132,7 @@ class InkbunnyFavoriteExtractor(InkbunnyExtractor): """Extractor for inkbunny user favorites""" subcategory = "favorite" directory_fmt = ("{category}", "{favs_username!l}", "Favorites") - pattern = (BASE_PATTERN + r"/(?:" + pattern = (rf"{BASE_PATTERN}/(?:" r"userfavorites_process\.php\?favs_user_id=(\d+)|" r"submissionsviewall\.php" r"\?((?:[^#]+&)?mode=userfavs(?:&[^#]+)?))") @@ -175,7 +175,7 @@ class InkbunnyFavoriteExtractor(InkbunnyExtractor): class InkbunnyUnreadExtractor(InkbunnyExtractor): """Extractor for unread inkbunny submissions""" subcategory = "unread" - pattern = (BASE_PATTERN + r"/submissionsviewall\.php" + pattern = (rf"{BASE_PATTERN}/submissionsviewall\.php" r"\?((?:[^#]+&)?mode=unreadsubs(?:&[^#]+)?)") example = ("https://inkbunny.net/submissionsviewall.php" "?text=&mode=unreadsubs&type=") @@ -195,7 +195,7 @@ class InkbunnyUnreadExtractor(InkbunnyExtractor): class InkbunnySearchExtractor(InkbunnyExtractor): """Extractor for inkbunny search results""" subcategory = "search" - pattern = (BASE_PATTERN + r"/submissionsviewall\.php" + 
pattern = (rf"{BASE_PATTERN}/submissionsviewall\.php" r"\?((?:[^#]+&)?mode=search(?:&[^#]+)?)") example = ("https://inkbunny.net/submissionsviewall.php" "?text=TAG&mode=search&type=") @@ -229,7 +229,7 @@ class InkbunnySearchExtractor(InkbunnyExtractor): class InkbunnyFollowingExtractor(InkbunnyExtractor): """Extractor for inkbunny user watches""" subcategory = "following" - pattern = (BASE_PATTERN + r"/(?:" + pattern = (rf"{BASE_PATTERN}/(?:" r"watchlist_process\.php\?mode=watching&user_id=(\d+)|" r"usersviewall\.php" r"\?((?:[^#]+&)?mode=watching(?:&[^#]+)?))") @@ -268,7 +268,7 @@ class InkbunnyFollowingExtractor(InkbunnyExtractor): class InkbunnyPostExtractor(InkbunnyExtractor): """Extractor for individual Inkbunny posts""" subcategory = "post" - pattern = BASE_PATTERN + r"/s/(\d+)" + pattern = rf"{BASE_PATTERN}/s/(\d+)" example = "https://inkbunny.net/s/12345" def __init__(self, match): diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 71964e9..b89369f 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -16,7 +16,7 @@ import itertools import binascii BASE_PATTERN = r"(?:https?://)?(?:www\.)?instagram\.com" -USER_PATTERN = BASE_PATTERN + r"/(?!(?:p|tv|reel|explore|stories)/)([^/?#]+)" +USER_PATTERN = rf"{BASE_PATTERN}/(?!(?:p|tv|reel|explore|stories)/)([^/?#]+)" class InstagramExtractor(Extractor): @@ -38,7 +38,7 @@ class InstagramExtractor(Extractor): def _init(self): self.www_claim = "0" self.csrf_token = util.generate_token() - self._find_tags = util.re(r"#\w+").findall + self._find_tags = text.re(r"#\w+").findall self._logged_in = True self._cursor = None self._user = None @@ -62,8 +62,10 @@ class InstagramExtractor(Extractor): data = self.metadata() if videos := self.config("videos", True): - videos_dash = (videos != "merged") + self.videos_dash = videos_dash = (videos != "merged") videos_headers = {"User-Agent": "Mozilla/5.0"} + else: + self.videos_dash = False previews = 
self.config("previews", False) max_posts = self.config("max-posts") @@ -86,7 +88,7 @@ class InstagramExtractor(Extractor): files = post.pop("_files") post["count"] = len(files) - yield Message.Directory, post + yield Message.Directory, "", post if "date" in post: del post["date"] @@ -173,7 +175,7 @@ class InstagramExtractor(Extractor): post_url = f"{self.root}/stories/highlights/{reel_id}/" data = { "user" : post.get("user"), - "expires": text.parse_timestamp(expires), + "expires": self.parse_timestamp(expires), "post_id": reel_id, "post_shortcode": shortcode_from_id(reel_id), "post_url": post_url, @@ -224,7 +226,7 @@ class InstagramExtractor(Extractor): data["owner_id"] = owner["pk"] data["username"] = owner.get("username") data["fullname"] = owner.get("full_name") - data["post_date"] = data["date"] = text.parse_timestamp( + data["post_date"] = data["date"] = self.parse_timestamp( post.get("taken_at") or post.get("created_at") or post.get("seen")) data["_files"] = files = [] for num, item in enumerate(items, 1): @@ -236,13 +238,23 @@ class InstagramExtractor(Extractor): data["post_shortcode"]) continue + width_orig = item.get("original_width", 0) + height_orig = item.get("original_height", 0) + if video_versions := item.get("video_versions"): video = max( video_versions, key=lambda x: (x["width"], x["height"], x["type"]), ) - manifest = item.get("video_dash_manifest") + media = video + if (manifest := item.get("video_dash_manifest")) and \ + self.videos_dash: + width = width_orig + height = height_orig + else: + width = video["width"] + height = video["height"] if self._warn_video: self._warn_video = False @@ -254,22 +266,21 @@ class InstagramExtractor(Extractor): else: video = manifest = None media = image + width = image["width"] + height = image["height"] - if self._warn_image < ( - (image["width"] < item.get("original_width", 0)) + - (image["height"] < item.get("original_height", 0))): + if self._warn_image < ((width < width_orig) + + (height < height_orig)): 
self.log.warning( "%s: Available image resolutions lower than the " "original (%sx%s < %sx%s). " "Consider refreshing your cookies.", data["post_shortcode"], - image["width"], image["height"], - item.get("original_width", 0), - item.get("original_height", 0)) + width, height, width_orig, height_orig) media = { "num" : num, - "date" : text.parse_timestamp(item.get("taken_at") or + "date" : self.parse_timestamp(item.get("taken_at") or media.get("taken_at") or post.get("taken_at")), "media_id" : item["pk"], @@ -277,8 +288,10 @@ class InstagramExtractor(Extractor): shortcode_from_id(item["pk"])), "display_url": image["url"], "video_url" : video["url"] if video else None, - "width" : media["width"], - "height" : media["height"], + "width" : width, + "width_original" : width_orig, + "height" : height, + "height_original": height_orig, } if manifest is not None: @@ -288,7 +301,9 @@ class InstagramExtractor(Extractor): if "reshared_story_media_author" in item: media["author"] = item["reshared_story_media_author"] if "expiring_at" in item: - media["expires"] = text.parse_timestamp(post["expiring_at"]) + media["expires"] = self.parse_timestamp(post["expiring_at"]) + if "subscription_media_visibility" in item: + media["subscription"] = item["subscription_media_visibility"] self._extract_tagged_users(item, media) files.append(media) @@ -331,7 +346,7 @@ class InstagramExtractor(Extractor): "post_id" : post["id"], "post_shortcode": post["shortcode"], "post_url" : f"{self.root}/p/{post['shortcode']}/", - "post_date" : text.parse_timestamp(post["taken_at_timestamp"]), + "post_date" : self.parse_timestamp(post["taken_at_timestamp"]), "description": text.parse_unicode_escapes("\n".join( edge["node"]["text"] for edge in post["edge_media_to_caption"]["edges"] @@ -490,7 +505,7 @@ class InstagramPostExtractor(InstagramExtractor): class InstagramUserExtractor(Dispatch, InstagramExtractor): """Extractor for an Instagram user profile""" - pattern = USER_PATTERN + r"/?(?:$|[?#])" + pattern 
= rf"{USER_PATTERN}/?(?:$|[?#])" example = "https://www.instagram.com/USER/" def items(self): @@ -510,7 +525,7 @@ class InstagramUserExtractor(Dispatch, InstagramExtractor): class InstagramPostsExtractor(InstagramExtractor): """Extractor for an Instagram user's posts""" subcategory = "posts" - pattern = USER_PATTERN + r"/posts" + pattern = rf"{USER_PATTERN}/posts" example = "https://www.instagram.com/USER/posts/" def posts(self): @@ -527,7 +542,7 @@ class InstagramPostsExtractor(InstagramExtractor): class InstagramReelsExtractor(InstagramExtractor): """Extractor for an Instagram user's reels""" subcategory = "reels" - pattern = USER_PATTERN + r"/reels" + pattern = rf"{USER_PATTERN}/reels" example = "https://www.instagram.com/USER/reels/" def posts(self): @@ -544,7 +559,7 @@ class InstagramReelsExtractor(InstagramExtractor): class InstagramTaggedExtractor(InstagramExtractor): """Extractor for an Instagram user's tagged posts""" subcategory = "tagged" - pattern = USER_PATTERN + r"/tagged" + pattern = rf"{USER_PATTERN}/tagged" example = "https://www.instagram.com/USER/tagged/" def metadata(self): @@ -570,7 +585,7 @@ class InstagramTaggedExtractor(InstagramExtractor): class InstagramGuideExtractor(InstagramExtractor): """Extractor for an Instagram guide""" subcategory = "guide" - pattern = USER_PATTERN + r"/guide/[^/?#]+/(\d+)" + pattern = rf"{USER_PATTERN}/guide/[^/?#]+/(\d+)" example = "https://www.instagram.com/USER/guide/NAME/12345" def __init__(self, match): @@ -587,7 +602,7 @@ class InstagramGuideExtractor(InstagramExtractor): class InstagramSavedExtractor(InstagramExtractor): """Extractor for an Instagram user's saved media""" subcategory = "saved" - pattern = USER_PATTERN + r"/saved(?:/all-posts)?/?$" + pattern = rf"{USER_PATTERN}/saved(?:/all-posts)?/?$" example = "https://www.instagram.com/USER/saved/" def posts(self): @@ -597,7 +612,7 @@ class InstagramSavedExtractor(InstagramExtractor): class InstagramCollectionExtractor(InstagramExtractor): """Extractor 
for Instagram collection""" subcategory = "collection" - pattern = USER_PATTERN + r"/saved/([^/?#]+)/([^/?#]+)" + pattern = rf"{USER_PATTERN}/saved/([^/?#]+)/([^/?#]+)" example = "https://www.instagram.com/USER/saved/COLLECTION/12345" def __init__(self, match): @@ -623,7 +638,7 @@ class InstagramStoriesTrayExtractor(InstagramExtractor): def items(self): base = f"{self.root}/stories/id:" for story in self.api.reels_tray(): - story["date"] = text.parse_timestamp(story["latest_reel_media"]) + story["date"] = self.parse_timestamp(story["latest_reel_media"]) story["_extractor"] = InstagramStoriesExtractor yield Message.Queue, f"{base}{story['id']}/", story @@ -681,7 +696,7 @@ class InstagramStoriesExtractor(InstagramExtractor): class InstagramHighlightsExtractor(InstagramExtractor): """Extractor for an Instagram user's story highlights""" subcategory = "highlights" - pattern = USER_PATTERN + r"/highlights" + pattern = rf"{USER_PATTERN}/highlights" example = "https://www.instagram.com/USER/highlights/" def posts(self): @@ -692,7 +707,7 @@ class InstagramHighlightsExtractor(InstagramExtractor): class InstagramFollowersExtractor(InstagramExtractor): """Extractor for an Instagram user's followers""" subcategory = "followers" - pattern = USER_PATTERN + r"/followers" + pattern = rf"{USER_PATTERN}/followers" example = "https://www.instagram.com/USER/followers/" def items(self): @@ -706,7 +721,7 @@ class InstagramFollowersExtractor(InstagramExtractor): class InstagramFollowingExtractor(InstagramExtractor): """Extractor for an Instagram user's followed users""" subcategory = "following" - pattern = USER_PATTERN + r"/following" + pattern = rf"{USER_PATTERN}/following" example = "https://www.instagram.com/USER/following/" def items(self): @@ -721,7 +736,7 @@ class InstagramTagExtractor(InstagramExtractor): """Extractor for Instagram tags""" subcategory = "tag" directory_fmt = ("{category}", "{subcategory}", "{tag}") - pattern = BASE_PATTERN + r"/explore/tags/([^/?#]+)" + pattern = 
rf"{BASE_PATTERN}/explore/tags/([^/?#]+)" example = "https://www.instagram.com/explore/tags/TAG/" def metadata(self): @@ -734,7 +749,7 @@ class InstagramTagExtractor(InstagramExtractor): class InstagramInfoExtractor(InstagramExtractor): """Extractor for an Instagram user's profile data""" subcategory = "info" - pattern = USER_PATTERN + r"/info" + pattern = rf"{USER_PATTERN}/info" example = "https://www.instagram.com/USER/info/" def items(self): @@ -744,13 +759,13 @@ class InstagramInfoExtractor(InstagramExtractor): else: user = self.api.user_by_name(screen_name) - return iter(((Message.Directory, user),)) + return iter(((Message.Directory, "", user),)) class InstagramAvatarExtractor(InstagramExtractor): """Extractor for an Instagram user's avatar""" subcategory = "avatar" - pattern = USER_PATTERN + r"/avatar" + pattern = rf"{USER_PATTERN}/avatar" example = "https://www.instagram.com/USER/avatar/" def posts(self): @@ -858,8 +873,11 @@ class InstagramRestAPI(): def user_by_name(self, screen_name): endpoint = "/v1/users/web_profile_info/" params = {"username": screen_name} - return self._call( - endpoint, params=params, notfound="user")["data"]["user"] + try: + return self._call( + endpoint, params=params, notfound="user")["data"]["user"] + except KeyError: + raise exception.NotFoundError("user") @memcache(keyarg=1) def user_by_id(self, user_id): diff --git a/gallery_dl/extractor/issuu.py b/gallery_dl/extractor/issuu.py index 06c5caa..c3fbf8d 100644 --- a/gallery_dl/extractor/issuu.py +++ b/gallery_dl/extractor/issuu.py @@ -36,8 +36,8 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor): '{"":' + data.replace('\\"', '"'))) doc = data["initialDocumentData"]["document"] - doc["date"] = text.parse_datetime( - doc["originalPublishDateInISOString"], "%Y-%m-%dT%H:%M:%S.%fZ") + doc["date"] = self.parse_datetime_iso( + doc["originalPublishDateInISOString"]) self.count = text.parse_int(doc["pageCount"]) self.base = (f"https://image.isu.pub/{doc['revisionId']}-" @@ 
-68,7 +68,7 @@ class IssuuUserExtractor(IssuuBase, Extractor): data = text.extr(html, '\\"docs\\":', '}]\\n"]') docs = util.json_loads(data.replace('\\"', '"')) except Exception as exc: - self.log.debug("", exc_info=exc) + self.log.traceback(exc) return for publication in docs: diff --git a/gallery_dl/extractor/itaku.py b/gallery_dl/extractor/itaku.py index 19ffc50..566ee8b 100644 --- a/gallery_dl/extractor/itaku.py +++ b/gallery_dl/extractor/itaku.py @@ -13,7 +13,7 @@ from ..cache import memcache from .. import text, util BASE_PATTERN = r"(?:https?://)?itaku\.ee" -USER_PATTERN = BASE_PATTERN + r"/profile/([^/?#]+)" +USER_PATTERN = rf"{BASE_PATTERN}/profile/([^/?#]+)" class ItakuExtractor(Extractor): @@ -32,8 +32,7 @@ class ItakuExtractor(Extractor): def items(self): if images := self.images(): for image in images: - image["date"] = text.parse_datetime( - image["date_added"], "%Y-%m-%dT%H:%M:%S.%fZ") + image["date"] = self.parse_datetime_iso(image["date_added"]) for category, tags in image.pop("categorized_tags").items(): image[f"tags_{category.lower()}"] = [ t["name"] for t in tags] @@ -52,7 +51,7 @@ class ItakuExtractor(Extractor): else: url = image["image"] - yield Message.Directory, image + yield Message.Directory, "", image yield Message.Url, url, text.nameext_from_url(url, image) return @@ -60,15 +59,14 @@ class ItakuExtractor(Extractor): for post in posts: images = post.pop("gallery_images") or () post["count"] = len(images) - post["date"] = text.parse_datetime( - post["date_added"], "%Y-%m-%dT%H:%M:%S.%fZ") + post["date"] = self.parse_datetime_iso(post["date_added"]) post["tags"] = [t["name"] for t in post["tags"]] - yield Message.Directory, post + yield Message.Directory, "", post for post["num"], image in enumerate(images, 1): post["file"] = image - image["date"] = text.parse_datetime( - image["date_added"], "%Y-%m-%dT%H:%M:%S.%fZ") + image["date"] = self.parse_datetime_iso( + image["date_added"]) url = image["image"] yield Message.Url, url, 
text.nameext_from_url(url, post) @@ -88,7 +86,7 @@ class ItakuExtractor(Extractor): class ItakuGalleryExtractor(ItakuExtractor): """Extractor for an itaku user's gallery""" subcategory = "gallery" - pattern = USER_PATTERN + r"/gallery(?:/(\d+))?" + pattern = rf"{USER_PATTERN}/gallery(?:/(\d+))?" example = "https://itaku.ee/profile/USER/gallery" def images(self): @@ -106,7 +104,7 @@ class ItakuPostsExtractor(ItakuExtractor): "{id}{title:? //}") filename_fmt = "{file[id]}{file[title]:? //}.{extension}" archive_fmt = "{id}_{file[id]}" - pattern = USER_PATTERN + r"/posts(?:/(\d+))?" + pattern = rf"{USER_PATTERN}/posts(?:/(\d+))?" example = "https://itaku.ee/profile/USER/posts" def posts(self): @@ -120,7 +118,7 @@ class ItakuPostsExtractor(ItakuExtractor): class ItakuStarsExtractor(ItakuExtractor): """Extractor for an itaku user's starred images""" subcategory = "stars" - pattern = USER_PATTERN + r"/stars(?:/(\d+))?" + pattern = rf"{USER_PATTERN}/stars(?:/(\d+))?" example = "https://itaku.ee/profile/USER/stars" def images(self): @@ -134,7 +132,7 @@ class ItakuStarsExtractor(ItakuExtractor): class ItakuFollowingExtractor(ItakuExtractor): subcategory = "following" - pattern = USER_PATTERN + r"/following" + pattern = rf"{USER_PATTERN}/following" example = "https://itaku.ee/profile/USER/following" def users(self): @@ -145,7 +143,7 @@ class ItakuFollowingExtractor(ItakuExtractor): class ItakuFollowersExtractor(ItakuExtractor): subcategory = "followers" - pattern = USER_PATTERN + r"/followers" + pattern = rf"{USER_PATTERN}/followers" example = "https://itaku.ee/profile/USER/followers" def users(self): @@ -157,7 +155,7 @@ class ItakuFollowersExtractor(ItakuExtractor): class ItakuBookmarksExtractor(ItakuExtractor): """Extractor for an itaku bookmarks folder""" subcategory = "bookmarks" - pattern = USER_PATTERN + r"/bookmarks/(image|user)/(\d+)" + pattern = rf"{USER_PATTERN}/bookmarks/(image|user)/(\d+)" example = "https://itaku.ee/profile/USER/bookmarks/image/12345" def 
_init(self): @@ -178,23 +176,23 @@ class ItakuBookmarksExtractor(ItakuExtractor): class ItakuUserExtractor(Dispatch, ItakuExtractor): """Extractor for itaku user profiles""" - pattern = USER_PATTERN + r"/?(?:$|\?|#)" + pattern = rf"{USER_PATTERN}/?(?:$|\?|#)" example = "https://itaku.ee/profile/USER" def items(self): base = f"{self.root}/profile/{self.groups[0]}/" return self._dispatch_extractors(( - (ItakuGalleryExtractor , base + "gallery"), - (ItakuPostsExtractor , base + "posts"), - (ItakuFollowersExtractor, base + "followers"), - (ItakuFollowingExtractor, base + "following"), - (ItakuStarsExtractor , base + "stars"), + (ItakuGalleryExtractor , f"{base}gallery"), + (ItakuPostsExtractor , f"{base}posts"), + (ItakuFollowersExtractor, f"{base}followers"), + (ItakuFollowingExtractor, f"{base}following"), + (ItakuStarsExtractor , f"{base}stars"), ), ("gallery",)) class ItakuImageExtractor(ItakuExtractor): subcategory = "image" - pattern = BASE_PATTERN + r"/images/(\d+)" + pattern = rf"{BASE_PATTERN}/images/(\d+)" example = "https://itaku.ee/images/12345" def images(self): @@ -207,7 +205,7 @@ class ItakuPostExtractor(ItakuExtractor): "{id}{title:? //}") filename_fmt = "{file[id]}{file[title]:? 
//}.{extension}" archive_fmt = "{id}_{file[id]}" - pattern = BASE_PATTERN + r"/posts/(\d+)" + pattern = rf"{BASE_PATTERN}/posts/(\d+)" example = "https://itaku.ee/posts/12345" def posts(self): @@ -216,7 +214,7 @@ class ItakuPostExtractor(ItakuExtractor): class ItakuSearchExtractor(ItakuExtractor): subcategory = "search" - pattern = BASE_PATTERN + r"/home/images/?\?([^#]+)" + pattern = rf"{BASE_PATTERN}/home/images/?\?([^#]+)" example = "https://itaku.ee/home/images?tags=SEARCH" def images(self): @@ -248,7 +246,7 @@ class ItakuAPI(): def __init__(self, extractor): self.extractor = extractor - self.root = extractor.root + "/api" + self.root = f"{extractor.root}/api" self.headers = { "Accept": "application/json, text/plain, */*", } @@ -259,7 +257,7 @@ class ItakuAPI(): "cursor" : None, "date_range": "", "maturity_rating": ("SFW", "Questionable", "NSFW"), - "ordering" : "-date_added", + "ordering" : self._order(), "page" : "1", "page_size" : "30", "visibility": ("PUBLIC", "PROFILE_ONLY"), @@ -273,7 +271,7 @@ class ItakuAPI(): "cursor" : None, "date_range": "", "maturity_rating": ("SFW", "Questionable", "NSFW"), - "ordering" : "-date_added", + "ordering" : self._order(), "page" : "1", "page_size" : "30", **params, @@ -284,7 +282,7 @@ class ItakuAPI(): endpoint = "/user_profiles/" params = { "cursor" : None, - "ordering" : "-date_added", + "ordering" : self._order(), "page" : "1", "page_size": "50", "sfw_only" : "false", @@ -311,7 +309,7 @@ class ItakuAPI(): def _call(self, endpoint, params=None): if not endpoint.startswith("http"): - endpoint = self.root + endpoint + endpoint = f"{self.root}{endpoint}" return self.extractor.request_json( endpoint, params=params, headers=self.headers) @@ -330,3 +328,11 @@ class ItakuAPI(): return data = self._call(url_next) + + def _order(self): + if order := self.extractor.config("order"): + if order in {"a", "asc", "r", "reverse"}: + return "date_added" + if order not in {"d", "desc"}: + return order + return "-date_added" diff --git 
a/gallery_dl/extractor/itchio.py b/gallery_dl/extractor/itchio.py index 6312e58..6fefe33 100644 --- a/gallery_dl/extractor/itchio.py +++ b/gallery_dl/extractor/itchio.py @@ -57,5 +57,5 @@ class ItchioGameExtractor(Extractor): game = {"game": game, "user": user, "id": upload_id} url = info["url"] - yield Message.Directory, game + yield Message.Directory, "", game yield Message.Url, url, text.nameext_from_url(url, game) diff --git a/gallery_dl/extractor/iwara.py b/gallery_dl/extractor/iwara.py index 8af2f42..d9d1cf0 100644 --- a/gallery_dl/extractor/iwara.py +++ b/gallery_dl/extractor/iwara.py @@ -47,7 +47,7 @@ class IwaraExtractor(Extractor): group_info["type"] = "image" group_info["count"] = len(files) - yield Message.Directory, group_info + yield Message.Directory, "", group_info for num, file in enumerate(files, 1): file_info = self.extract_media_info(file, None) file_id = file_info["file_id"] @@ -78,7 +78,7 @@ class IwaraExtractor(Extractor): video["id"], exc.__class__.__name__, exc) continue - yield Message.Directory, info + yield Message.Directory, "", info yield Message.Url, f"https:{download_url}", info def items_user(self, users, key=None): @@ -122,10 +122,10 @@ class IwaraExtractor(Extractor): info["file_id"] = file_info.get("id") info["filename"] = filename info["extension"] = extension - info["date"] = text.parse_datetime( - file_info.get("createdAt"), "%Y-%m-%dT%H:%M:%S.%fZ") - info["date_updated"] = text.parse_datetime( - file_info.get("updatedAt"), "%Y-%m-%dT%H:%M:%S.%fZ") + info["date"] = self.parse_datetime_iso( + file_info.get("createdAt")) + info["date_updated"] = self.parse_datetime_iso( + file_info.get("updatedAt")) info["mime"] = file_info.get("mime") info["size"] = file_info.get("size") info["width"] = file_info.get("width") @@ -144,8 +144,7 @@ class IwaraExtractor(Extractor): "status" : user.get("status"), "role" : user.get("role"), "premium": user.get("premium"), - "date" : text.parse_datetime( - user.get("createdAt"), 
"%Y-%m-%dT%H:%M:%S.000Z"), + "date" : self.parse_datetime_iso(user.get("createdAt")), "description": profile.get("body"), } diff --git a/gallery_dl/extractor/jschan.py b/gallery_dl/extractor/jschan.py index 5f3e75a..5dacf70 100644 --- a/gallery_dl/extractor/jschan.py +++ b/gallery_dl/extractor/jschan.py @@ -30,7 +30,7 @@ class JschanThreadExtractor(JschanExtractor): "{threadId} {subject|nomarkup[:50]}") filename_fmt = "{postId}{num:?-//} {filename}.{extension}" archive_fmt = "{board}_{postId}_{num}" - pattern = BASE_PATTERN + r"/([^/?#]+)/thread/(\d+)\.html" + pattern = rf"{BASE_PATTERN}/([^/?#]+)/thread/(\d+)\.html" example = "https://94chan.org/a/thread/12345.html" def items(self): @@ -39,7 +39,7 @@ class JschanThreadExtractor(JschanExtractor): thread["threadId"] = thread["postId"] posts = thread.pop("replies", ()) - yield Message.Directory, thread + yield Message.Directory, "", thread for post in itertools.chain((thread,), posts): if files := post.pop("files", ()): thread.update(post) @@ -56,7 +56,7 @@ class JschanThreadExtractor(JschanExtractor): class JschanBoardExtractor(JschanExtractor): """Extractor for jschan boards""" subcategory = "board" - pattern = (BASE_PATTERN + r"/([^/?#]+)" + pattern = (rf"{BASE_PATTERN}/([^/?#]+)" r"(?:/index\.html|/catalog\.html|/\d+\.html|/?$)") example = "https://94chan.org/a/" diff --git a/gallery_dl/extractor/kabeuchi.py b/gallery_dl/extractor/kabeuchi.py index c259c47..88f2e32 100644 --- a/gallery_dl/extractor/kabeuchi.py +++ b/gallery_dl/extractor/kabeuchi.py @@ -32,9 +32,8 @@ class KabeuchiUserExtractor(Extractor): if post.get("is_ad") or not post["image1"]: continue - post["date"] = text.parse_datetime( - post["created_at"], "%Y-%m-%d %H:%M:%S") - yield Message.Directory, post + post["date"] = self.parse_datetime_iso(post["created_at"]) + yield Message.Directory, "", post for key in keys: name = post[key] diff --git a/gallery_dl/extractor/keenspot.py b/gallery_dl/extractor/keenspot.py index f55a930..3c1eb24 100644 --- 
a/gallery_dl/extractor/keenspot.py +++ b/gallery_dl/extractor/keenspot.py @@ -34,7 +34,7 @@ class KeenspotComicExtractor(Extractor): def items(self): data = {"comic": self.comic} - yield Message.Directory, data + yield Message.Directory, "", data with self.request(self.root + "/") as response: if response.history: diff --git a/gallery_dl/extractor/kemono.py b/gallery_dl/extractor/kemono.py index b4a8abc..bf35670 100644 --- a/gallery_dl/extractor/kemono.py +++ b/gallery_dl/extractor/kemono.py @@ -16,7 +16,7 @@ import json BASE_PATTERN = (r"(?:https?://)?(?:www\.|beta\.)?" r"(kemono|coomer)\.(cr|s[tu]|party)") -USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)/user/([^/?#]+)" +USER_PATTERN = rf"{BASE_PATTERN}/([^/?#]+)/user/([^/?#]+)" HASH_PATTERN = r"/[0-9a-f]{2}/[0-9a-f]{2}/([0-9a-f]{64})" @@ -44,7 +44,7 @@ class KemonoExtractor(Extractor): order = self.config("order-revisions") self.revisions_reverse = order[0] in ("r", "a") if order else False - self._find_inline = util.re( + self._find_inline = text.re( r'src="(?:https?://(?:kemono\.cr|coomer\.st))?(/inline/[^"]+' r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall self._json_dumps = json.JSONEncoder( @@ -52,7 +52,7 @@ class KemonoExtractor(Extractor): sort_keys=True, separators=(",", ":")).encode def items(self): - find_hash = util.re(HASH_PATTERN).match + find_hash = text.re(HASH_PATTERN).match generators = self._build_file_generators(self.config("files")) announcements = True if self.config("announcements") else None archives = True if self.config("archives") else False @@ -145,18 +145,24 @@ class KemonoExtractor(Extractor): file["hash"] = hash = "" if url[0] == "/": - url = self.root + "/data" + url + url = f"{self.root}/data{url}" elif url.startswith(self.root): - url = self.root + "/data" + url[20:] + url = f"{self.root}/data{url[20:]}" file["url"] = url - text.nameext_from_url(file.get("name", url), file) - ext = text.ext_from_url(url) - if not file["extension"]: - file["extension"] = ext - elif ext == 
"txt" and file["extension"] != "txt": - file["_http_validate"] = _validate - elif ext in exts_archive or \ + if name := file.get("name"): + text.nameext_from_name(name, file) + ext = text.ext_from_url(url) + + if not file["extension"]: + file["extension"] = ext + elif ext == "txt" and file["extension"] != "txt": + file["_http_validate"] = _validate + else: + text.nameext_from_url(url, file) + ext = file["extension"] + + if ext in exts_archive or \ ext == "bin" and file["extension"] in exts_archive: file["type"] = "archive" if archives: @@ -176,7 +182,7 @@ class KemonoExtractor(Extractor): files.append(file) post["count"] = len(files) - yield Message.Directory, post + yield Message.Directory, "", post for post["num"], file in enumerate(files, 1): if "id" in file: del file["id"] @@ -194,13 +200,13 @@ class KemonoExtractor(Extractor): username = username[0] self.log.info("Logging in as %s", username) - url = self.root + "/api/v1/authentication/login" + url = f"{self.root}/api/v1/authentication/login" data = {"username": username, "password": password} response = self.request(url, method="POST", json=data, fatal=False) if response.status_code >= 400: try: - msg = '"' + response.json()["error"] + '"' + msg = f'"{response.json()["error"]}"' except Exception: msg = '"Username or password is incorrect"' raise exception.AuthenticationError(msg) @@ -238,7 +244,7 @@ class KemonoExtractor(Extractor): def _parse_datetime(self, date_string): if len(date_string) > 19: date_string = date_string[:19] - return text.parse_datetime(date_string, "%Y-%m-%dT%H:%M:%S") + return self.parse_datetime_iso(date_string) def _revisions(self, posts): return itertools.chain.from_iterable( @@ -316,7 +322,7 @@ def _validate(response): class KemonoUserExtractor(KemonoExtractor): """Extractor for all posts from a kemono.cr user listing""" subcategory = "user" - pattern = USER_PATTERN + r"/?(?:\?([^#]+))?(?:$|\?|#)" + pattern = rf"{USER_PATTERN}/?(?:\?([^#]+))?(?:$|\?|#)" example = 
"https://kemono.cr/SERVICE/user/12345" def __init__(self, match): @@ -339,7 +345,7 @@ class KemonoUserExtractor(KemonoExtractor): class KemonoPostsExtractor(KemonoExtractor): """Extractor for kemono.cr post listings""" subcategory = "posts" - pattern = BASE_PATTERN + r"/posts()()(?:/?\?([^#]+))?" + pattern = rf"{BASE_PATTERN}/posts()()(?:/?\?([^#]+))?" example = "https://kemono.cr/posts" def posts(self): @@ -351,7 +357,7 @@ class KemonoPostsExtractor(KemonoExtractor): class KemonoPostExtractor(KemonoExtractor): """Extractor for a single kemono.cr post""" subcategory = "post" - pattern = USER_PATTERN + r"/post/([^/?#]+)(/revisions?(?:/(\d*))?)?" + pattern = rf"{USER_PATTERN}/post/([^/?#]+)(/revisions?(?:/(\d*))?)?" example = "https://kemono.cr/SERVICE/user/12345/post/12345" def __init__(self, match): @@ -384,7 +390,7 @@ class KemonoDiscordExtractor(KemonoExtractor): "{server_id} {server}", "{channel_id} {channel}") filename_fmt = "{id}_{num:>02}_{filename}.{extension}" archive_fmt = "discord_{server_id}_{id}_{num}" - pattern = BASE_PATTERN + r"/discord/server/(\d+)[/#](?:channel/)?(\d+)" + pattern = rf"{BASE_PATTERN}/discord/server/(\d+)[/#](?:channel/)?(\d+)" example = "https://kemono.cr/discord/server/12345/12345" def items(self): @@ -407,10 +413,10 @@ class KemonoDiscordExtractor(KemonoExtractor): "parent_id" : channel["parent_channel_id"], } - find_inline = util.re( + find_inline = text.re( r"https?://(?:cdn\.discordapp.com|media\.discordapp\.net)" r"(/[A-Za-z0-9-._~:/?#\[\]@!$&'()*+,;%=]+)").findall - find_hash = util.re(HASH_PATTERN).match + find_hash = text.re(HASH_PATTERN).match if (order := self.config("order-posts")) and order[0] in ("r", "d"): posts = self.api.discord_channel(channel_id, channel["post_count"]) @@ -428,13 +434,13 @@ class KemonoDiscordExtractor(KemonoExtractor): attachment["type"] = "attachment" files.append(attachment) for path in find_inline(post["content"] or ""): - files.append({"path": "https://cdn.discordapp.com" + path, + 
files.append({"path": f"https://cdn.discordapp.com{path}", "name": path, "type": "inline", "hash": ""}) post.update(data) post["date"] = self._parse_datetime(post["published"]) post["count"] = len(files) - yield Message.Directory, post + yield Message.Directory, "", post for post["num"], file in enumerate(files, 1): post["hash"] = file["hash"] @@ -446,15 +452,15 @@ class KemonoDiscordExtractor(KemonoExtractor): post["extension"] = text.ext_from_url(url) if url[0] == "/": - url = self.root + "/data" + url + url = f"{self.root}/data{url}" elif url.startswith(self.root): - url = self.root + "/data" + url[20:] + url = f"{self.root}/data{url[20:]}" yield Message.Url, url, post class KemonoDiscordServerExtractor(KemonoExtractor): subcategory = "discord-server" - pattern = BASE_PATTERN + r"/discord/server/(\d+)$" + pattern = rf"{BASE_PATTERN}/discord/server/(\d+)$" example = "https://kemono.cr/discord/server/12345" def items(self): @@ -482,7 +488,7 @@ def discord_server_info(extr, server_id): class KemonoFavoriteExtractor(KemonoExtractor): """Extractor for kemono.cr favorites""" subcategory = "favorite" - pattern = BASE_PATTERN + r"/(?:account/)?favorites()()(?:/?\?([^#]+))?" + pattern = rf"{BASE_PATTERN}/(?:account/)?favorites()()(?:/?\?([^#]+))?" example = "https://kemono.cr/account/favorites/artists" def items(self): @@ -530,7 +536,7 @@ class KemonoFavoriteExtractor(KemonoExtractor): class KemonoArtistsExtractor(KemonoExtractor): """Extractor for kemono artists""" subcategory = "artists" - pattern = BASE_PATTERN + r"/artists(?:\?([^#]+))?" + pattern = rf"{BASE_PATTERN}/artists(?:\?([^#]+))?" 
example = "https://kemono.cr/artists" def items(self): @@ -564,32 +570,32 @@ class KemonoArtistsExtractor(KemonoExtractor): class KemonoAPI(): - """Interface for the Kemono API v1.1.0 + """Interface for the Kemono API v1.3.0 https://kemono.cr/documentation/api """ def __init__(self, extractor): self.extractor = extractor - self.root = extractor.root + "/api/v1" + self.root = f"{extractor.root}/api" self.headers = {"Accept": "text/css"} def posts(self, offset=0, query=None, tags=None): - endpoint = "/posts" + endpoint = "/v1/posts" params = {"q": query, "o": offset, "tag": tags} return self._pagination(endpoint, params, 50, "posts") def file(self, file_hash): - endpoint = "/file/" + file_hash + endpoint = f"/v1/file/{file_hash}" return self._call(endpoint) def creators(self): - endpoint = "/creators" + endpoint = "/v1/creators" return self._call(endpoint) def creator_posts(self, service, creator_id, offset=0, query=None, tags=None): - endpoint = f"/{service}/user/{creator_id}/posts" + endpoint = f"/v1/{service}/user/{creator_id}/posts" params = {"o": offset, "tag": tags, "q": query} return self._pagination(endpoint, params, 50) @@ -601,58 +607,58 @@ class KemonoAPI(): service, creator_id, post["id"])["post"] def creator_announcements(self, service, creator_id): - endpoint = f"/{service}/user/{creator_id}/announcements" + endpoint = f"/v1/{service}/user/{creator_id}/announcements" return self._call(endpoint) def creator_dms(self, service, creator_id): - endpoint = f"/{service}/user/{creator_id}/dms" + endpoint = f"/v1/{service}/user/{creator_id}/dms" return self._call(endpoint) def creator_fancards(self, service, creator_id): - endpoint = f"/{service}/user/{creator_id}/fancards" + endpoint = f"/v1/{service}/user/{creator_id}/fancards" return self._call(endpoint) def creator_post(self, service, creator_id, post_id): - endpoint = f"/{service}/user/{creator_id}/post/{post_id}" + endpoint = f"/v1/{service}/user/{creator_id}/post/{post_id}" return self._call(endpoint) def 
creator_post_comments(self, service, creator_id, post_id): - endpoint = f"/{service}/user/{creator_id}/post/{post_id}/comments" + endpoint = f"/v1/{service}/user/{creator_id}/post/{post_id}/comments" return self._call(endpoint, fatal=False) def creator_post_revisions(self, service, creator_id, post_id): - endpoint = f"/{service}/user/{creator_id}/post/{post_id}/revisions" + endpoint = f"/v1/{service}/user/{creator_id}/post/{post_id}/revisions" return self._call(endpoint, fatal=False) def creator_profile(self, service, creator_id): - endpoint = f"/{service}/user/{creator_id}/profile" + endpoint = f"/v1/{service}/user/{creator_id}/profile" return self._call(endpoint) def creator_links(self, service, creator_id): - endpoint = f"/{service}/user/{creator_id}/links" + endpoint = f"/v1/{service}/user/{creator_id}/links" return self._call(endpoint) def creator_tags(self, service, creator_id): - endpoint = f"/{service}/user/{creator_id}/tags" + endpoint = f"/v1/{service}/user/{creator_id}/tags" return self._call(endpoint) def discord_channel(self, channel_id, post_count=None): - endpoint = f"/discord/channel/{channel_id}" + endpoint = f"/v1/discord/channel/{channel_id}" if post_count is None: return self._pagination(endpoint, {}, 150) else: return self._pagination_reverse(endpoint, {}, 150, post_count) def discord_channel_lookup(self, server_id): - endpoint = f"/discord/channel/lookup/{server_id}" + endpoint = f"/v1/discord/channel/lookup/{server_id}" return self._call(endpoint) def discord_server(self, server_id): - endpoint = f"/discord/server/{server_id}" + endpoint = f"/v1/discord/server/{server_id}" return self._call(endpoint) def account_favorites(self, type): - endpoint = "/account/favorites" + endpoint = "/v1/account/favorites" params = {"type": type} return self._call(endpoint, params) diff --git a/gallery_dl/extractor/khinsider.py b/gallery_dl/extractor/khinsider.py index f22d54e..8d1497d 100644 --- a/gallery_dl/extractor/khinsider.py +++ 
b/gallery_dl/extractor/khinsider.py @@ -35,7 +35,7 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor): raise exception.NotFoundError("soundtrack") data = self.metadata(page) - yield Message.Directory, data + yield Message.Directory, "", data if self.config("covers", False): for num, url in enumerate(self._extract_covers(page), 1): diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py index 816bc3d..e2f00e1 100644 --- a/gallery_dl/extractor/komikcast.py +++ b/gallery_dl/extractor/komikcast.py @@ -9,7 +9,7 @@ """Extractors for https://komikcast.li/""" from .common import ChapterExtractor, MangaExtractor -from .. import text, util +from .. import text BASE_PATTERN = (r"(?:https?://)?(?:www\.)?" r"komikcast\d*\.(?:l(?:i|a|ol)|com|cz|site|mo?e)") @@ -25,7 +25,7 @@ class KomikcastBase(): if data is None: data = {} - pattern = util.re(r"(?:(.*) Chapter )?0*(\d+)([^ ]*)(?: (?:- )?(.+))?") + pattern = text.re(r"(?:(.*) Chapter )?0*(\d+)([^ ]*)(?: (?:- )?(.+))?") match = pattern.match(text.unescape(chapter_string)) manga, chapter, data["chapter_minor"], title = match.groups() @@ -44,7 +44,7 @@ class KomikcastBase(): class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor): """Extractor for komikcast manga chapters""" - pattern = BASE_PATTERN + r"(/chapter/[^/?#]+/)" + pattern = rf"{BASE_PATTERN}(/chapter/[^/?#]+/)" example = "https://komikcast.li/chapter/TITLE/" def metadata(self, page): @@ -54,7 +54,7 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor): def images(self, page): readerarea = text.extr( page, '<div class="main-reading-area', '</div') - pattern = util.re(r"<img[^>]* src=[\"']([^\"']+)") + pattern = text.re(r"<img[^>]* src=[\"']([^\"']+)") return [ (text.unescape(url), None) for url in pattern.findall(readerarea) @@ -64,7 +64,7 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor): class KomikcastMangaExtractor(KomikcastBase, MangaExtractor): """Extractor for komikcast manga""" 
chapterclass = KomikcastChapterExtractor - pattern = BASE_PATTERN + r"(/(?:komik/)?[^/?#]+/?)$" + pattern = rf"{BASE_PATTERN}(/(?:komik/)?[^/?#]+/?)$" example = "https://komikcast.li/komik/TITLE" def chapters(self, page): diff --git a/gallery_dl/extractor/koofr.py b/gallery_dl/extractor/koofr.py new file mode 100644 index 0000000..9ebc133 --- /dev/null +++ b/gallery_dl/extractor/koofr.py @@ -0,0 +1,55 @@ +# -*- coding: utf-8 -*- + +# Copyright 2025 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://koofr.net/""" + +from .common import Extractor, Message +from .. import text + + +class KoofrSharedExtractor(Extractor): + """Base class for koofr extractors""" + category = "koofr" + subcategory = "shared" + root = "https://app.koofr.net" + pattern = (r"(?:https?://)?(?:" + r"(?:app\.)?koofr\.(?:net|eu)/links/([\w-]+)|" + r"k00\.fr/(\w+))") + example = "https://app.koofr.net/links/UUID" + + def items(self): + uuid, code = self.groups + if code is not None: + uuid = self.request_location( + "https://k00.fr/" + code, method="GET").rpartition("/")[2] + + url = f"{self.root}/api/v2/public/links/{uuid}" + referer = f"{self.root}/links/{uuid}" + password = self.config("password") + params = {"password": password or ""} + headers = { + "Referer" : referer, + "X-Client" : "newfrontend", + "X-Koofr-Version": "2.1", + "Sec-Fetch-Dest" : "empty", + "Sec-Fetch-Mode" : "cors", + "Sec-Fetch-Site" : "same-origin", + } + data = self.request_json(url, params=params, headers=headers) + + name = data["name"] + file = text.nameext_from_name(name, data["file"]) + file["_http_headers"] = {"Referer": referer} + + root = data.get("publicUrlBase") or self.root + url = f"{root}/content/links/{uuid}/files/get/{name}?path=/&force=" + if password: + url = f"{url}&password={password}" + + yield Message.Directory, "", 
file + yield Message.Url, url, file diff --git a/gallery_dl/extractor/leakgallery.py b/gallery_dl/extractor/leakgallery.py index c609891..2939304 100644 --- a/gallery_dl/extractor/leakgallery.py +++ b/gallery_dl/extractor/leakgallery.py @@ -37,7 +37,7 @@ class LeakgalleryExtractor(Extractor): media["url"] = url = f"https://cdn.leakgallery.com/{path}" text.nameext_from_url(url, media) - yield Message.Directory, media + yield Message.Directory, "", media yield Message.Url, url, media def _pagination(self, type, base, params=None, creator=None, pnum=1): @@ -81,7 +81,7 @@ class LeakgalleryUserExtractor(LeakgalleryExtractor): class LeakgalleryTrendingExtractor(LeakgalleryExtractor): """Extractor for trending posts on leakgallery.com""" subcategory = "trending" - pattern = BASE_PATTERN + r"/trending-medias(?:/([\w-]+))?" + pattern = rf"{BASE_PATTERN}/trending-medias(?:/([\w-]+))?" example = "https://leakgallery.com/trending-medias/Week" def items(self): @@ -93,7 +93,7 @@ class LeakgalleryTrendingExtractor(LeakgalleryExtractor): class LeakgalleryMostlikedExtractor(LeakgalleryExtractor): """Extractor for most liked posts on leakgallery.com""" subcategory = "mostliked" - pattern = BASE_PATTERN + r"/most-liked" + pattern = rf"{BASE_PATTERN}/most-liked" example = "https://leakgallery.com/most-liked" def items(self): @@ -104,7 +104,7 @@ class LeakgalleryMostlikedExtractor(LeakgalleryExtractor): class LeakgalleryPostExtractor(LeakgalleryExtractor): """Extractor for individual posts on leakgallery.com""" subcategory = "post" - pattern = BASE_PATTERN + r"/([^/?#]+)/(\d+)" + pattern = rf"{BASE_PATTERN}/([^/?#]+)/(\d+)" example = "https://leakgallery.com/CREATOR/12345" def items(self): @@ -134,7 +134,7 @@ class LeakgalleryPostExtractor(LeakgalleryExtractor): "url": url, } text.nameext_from_url(url, data) - yield Message.Directory, data + yield Message.Directory, "", data yield Message.Url, url, data except Exception as exc: self.log.error("Failed to extract post page %s/%s: %s", 
diff --git a/gallery_dl/extractor/lensdump.py b/gallery_dl/extractor/lensdump.py index b0198d5..a7b1318 100644 --- a/gallery_dl/extractor/lensdump.py +++ b/gallery_dl/extractor/lensdump.py @@ -31,7 +31,7 @@ class LensdumpBase(): class LensdumpAlbumExtractor(LensdumpBase, GalleryExtractor): subcategory = "album" - pattern = BASE_PATTERN + r"/a/(\w+)(?:/?\?([^#]+))?" + pattern = rf"{BASE_PATTERN}/a/(\w+)(?:/?\?([^#]+))?" example = "https://lensdump.com/a/ID" def __init__(self, match): @@ -76,7 +76,7 @@ class LensdumpAlbumExtractor(LensdumpBase, GalleryExtractor): class LensdumpAlbumsExtractor(LensdumpBase, Extractor): """Extractor for album list from lensdump.com""" subcategory = "albums" - pattern = BASE_PATTERN + r"/(?![ai]/)([^/?#]+)(?:/?\?([^#]+))?" + pattern = rf"{BASE_PATTERN}/(?![ai]/)([^/?#]+)(?:/?\?([^#]+))?" example = "https://lensdump.com/USER" def items(self): @@ -119,10 +119,9 @@ class LensdumpImageExtractor(LensdumpBase, Extractor): 'property="image:width" content="', '"')), "height": text.parse_int(extr( 'property="image:height" content="', '"')), - "date" : text.parse_datetime(extr( - '<span title="', '"'), "%Y-%m-%d %H:%M:%S"), + "date" : self.parse_datetime_iso(extr('<span title="', '"')), } text.nameext_from_url(data["url"], data) - yield Message.Directory, data + yield Message.Directory, "", data yield Message.Url, data["url"], data diff --git a/gallery_dl/extractor/lexica.py b/gallery_dl/extractor/lexica.py index 6e54847..fc44f51 100644 --- a/gallery_dl/extractor/lexica.py +++ b/gallery_dl/extractor/lexica.py @@ -36,7 +36,7 @@ class LexicaSearchExtractor(Extractor): image["filename"] = image["id"] image["extension"] = "jpg" image["search_tags"] = tags - yield Message.Directory, image + yield Message.Directory, "", image yield Message.Url, base + image["id"], image def posts(self): diff --git a/gallery_dl/extractor/lightroom.py b/gallery_dl/extractor/lightroom.py index b557149..27aa15a 100644 --- a/gallery_dl/extractor/lightroom.py +++ 
b/gallery_dl/extractor/lightroom.py @@ -35,7 +35,7 @@ class LightroomGalleryExtractor(Extractor): images = self.images(album) for img in images: url = img["url"] - yield Message.Directory, img + yield Message.Directory, "", img yield Message.Url, url, text.nameext_from_url(url, img) def metadata(self, album): diff --git a/gallery_dl/extractor/livedoor.py b/gallery_dl/extractor/livedoor.py index ab3be69..706194a 100644 --- a/gallery_dl/extractor/livedoor.py +++ b/gallery_dl/extractor/livedoor.py @@ -27,7 +27,7 @@ class LivedoorExtractor(Extractor): def items(self): for post in self.posts(): if images := self._images(post): - yield Message.Directory, {"post": post} + yield Message.Directory, "", {"post": post} for image in images: yield Message.Url, image["url"], image @@ -45,7 +45,7 @@ class LivedoorExtractor(Extractor): "title" : text.unescape(extr('dc:title="', '"')), "categories" : extr('dc:subject="', '"').partition(",")[::2], "description": extr('dc:description="', '"'), - "date" : text.parse_datetime(extr('dc:date="', '"')), + "date" : self.parse_datetime_iso(extr('dc:date="', '"')), "tags" : text.split_html(tags)[1:] if tags else [], "user" : self.user, "body" : body, diff --git a/gallery_dl/extractor/lofter.py b/gallery_dl/extractor/lofter.py index c20d983..b1f58ac 100644 --- a/gallery_dl/extractor/lofter.py +++ b/gallery_dl/extractor/lofter.py @@ -29,7 +29,7 @@ class LofterExtractor(Extractor): post = post["post"] post["blog_name"] = post["blogInfo"]["blogName"] - post["date"] = text.parse_timestamp(post["publishTime"] // 1000) + post["date"] = self.parse_timestamp(post["publishTime"] // 1000) post_type = post["type"] # Article @@ -63,7 +63,7 @@ class LofterExtractor(Extractor): post["id"], post_type) post["count"] = len(image_urls) - yield Message.Directory, post + yield Message.Directory, "", post for post["num"], url in enumerate(image_urls, 1): yield Message.Url, url, text.nameext_from_url(url, post) diff --git a/gallery_dl/extractor/lolisafe.py 
b/gallery_dl/extractor/lolisafe.py index 5233033..d17549d 100644 --- a/gallery_dl/extractor/lolisafe.py +++ b/gallery_dl/extractor/lolisafe.py @@ -25,7 +25,7 @@ BASE_PATTERN = LolisafeExtractor.update({ class LolisafeAlbumExtractor(LolisafeExtractor): subcategory = "album" - pattern = BASE_PATTERN + "/a/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/a/([^/?#]+)" example = "https://xbunkr.com/a/ID" def __init__(self, match): @@ -42,7 +42,7 @@ class LolisafeAlbumExtractor(LolisafeExtractor): def items(self): files, data = self.fetch_album(self.album_id) - yield Message.Directory, data + yield Message.Directory, "", data for data["num"], file in enumerate(files, 1): url = file["file"] file.update(data) diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py index 0cbc523..2abd1c8 100644 --- a/gallery_dl/extractor/luscious.py +++ b/gallery_dl/extractor/luscious.py @@ -58,7 +58,7 @@ class LusciousAlbumExtractor(LusciousExtractor): def items(self): album = self.metadata() - yield Message.Directory, {"album": album} + yield Message.Directory, "", {"album": album} for num, image in enumerate(self.images(), 1): image["num"] = num image["album"] = album @@ -69,7 +69,7 @@ class LusciousAlbumExtractor(LusciousExtractor): image["thumbnail"] = "" image["tags"] = [item["text"] for item in image["tags"]] - image["date"] = text.parse_timestamp(image["created"]) + image["date"] = self.parse_timestamp(image["created"]) image["id"] = text.parse_int(image["id"]) url = (image["url_to_original"] or image["url_to_video"] @@ -188,7 +188,7 @@ fragment AlbumStandard on Album { album["created_by"] = album["created_by"]["display_name"] album["id"] = text.parse_int(album["id"]) - album["date"] = text.parse_timestamp(album["created"]) + album["date"] = self.parse_timestamp(album["created"]) return album diff --git a/gallery_dl/extractor/lynxchan.py b/gallery_dl/extractor/lynxchan.py index fde2df5..7cf1282 100644 --- a/gallery_dl/extractor/lynxchan.py +++ 
b/gallery_dl/extractor/lynxchan.py @@ -39,7 +39,7 @@ class LynxchanThreadExtractor(LynxchanExtractor): "{threadId} {subject|message[:50]}") filename_fmt = "{postId}{num:?-//} {filename}.{extension}" archive_fmt = "{boardUri}_{postId}_{num}" - pattern = BASE_PATTERN + r"/([^/?#]+)/res/(\d+)" + pattern = rf"{BASE_PATTERN}/([^/?#]+)/res/(\d+)" example = "https://endchan.org/a/res/12345.html" def items(self): @@ -48,7 +48,7 @@ class LynxchanThreadExtractor(LynxchanExtractor): thread["postId"] = thread["threadId"] posts = thread.pop("posts", ()) - yield Message.Directory, thread + yield Message.Directory, "", thread for post in itertools.chain((thread,), posts): if files := post.pop("files", ()): thread.update(post) @@ -63,7 +63,7 @@ class LynxchanThreadExtractor(LynxchanExtractor): class LynxchanBoardExtractor(LynxchanExtractor): """Extractor for LynxChan boards""" subcategory = "board" - pattern = BASE_PATTERN + r"/([^/?#]+)(?:/index|/catalog|/\d+|/?$)" + pattern = rf"{BASE_PATTERN}/([^/?#]+)(?:/index|/catalog|/\d+|/?$)" example = "https://endchan.org/a/" def items(self): diff --git a/gallery_dl/extractor/madokami.py b/gallery_dl/extractor/madokami.py index 1db5126..e15b90d 100644 --- a/gallery_dl/extractor/madokami.py +++ b/gallery_dl/extractor/madokami.py @@ -47,8 +47,7 @@ class MadokamiMangaExtractor(MadokamiExtractor): "path": text.unescape(extr('href="', '"')), "chapter_string": text.unescape(extr(">", "<")), "size": text.parse_bytes(extr("<td>", "</td>")), - "date": text.parse_datetime( - extr("<td>", "</td>").strip(), "%Y-%m-%d %H:%M"), + "date": self.parse_datetime_iso(extr("<td>", "</td>").strip()), }) if self.config("chapter-reverse"): @@ -89,5 +88,5 @@ class MadokamiMangaExtractor(MadokamiExtractor): url = f"{self.root}{ch['path']}" text.nameext_from_url(url, ch) - yield Message.Directory, ch + yield Message.Directory, "", ch yield Message.Url, url, ch diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index 16eb650..0a1709d 
100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -68,7 +68,7 @@ class MangadexExtractor(Extractor): "chapter" : text.parse_int(chnum), "chapter_minor": f"{sep}{minor}", "chapter_id": chapter["id"], - "date" : text.parse_datetime(cattributes["publishAt"]), + "date" : self.parse_datetime_iso(cattributes["publishAt"]), "group" : [group["attributes"]["name"] for group in relationships["scanlation_group"]], "lang" : lang, @@ -95,7 +95,7 @@ class MangadexCoversExtractor(MangadexExtractor): name = data["cover"] text.nameext_from_url(name, data) data["cover_id"] = data["filename"] - yield Message.Directory, data + yield Message.Directory, "", data yield Message.Url, f"{base}{name}", data def _transform_cover(self, cover): @@ -109,8 +109,8 @@ class MangadexCoversExtractor(MangadexExtractor): "cover" : cattributes["fileName"], "lang" : cattributes.get("locale"), "volume" : text.parse_int(cattributes["volume"]), - "date" : text.parse_datetime(cattributes["createdAt"]), - "date_updated": text.parse_datetime(cattributes["updatedAt"]), + "date" : self.parse_datetime_iso(cattributes["createdAt"]), + "date_updated": self.parse_datetime_iso(cattributes["updatedAt"]), } @@ -134,15 +134,21 @@ class MangadexChapterExtractor(MangadexExtractor): f"available on MangaDex and can instead be read on the " f"official publisher's website at {data['_external_url']}.") - yield Message.Directory, data + yield Message.Directory, "", data + + if self.config("data-saver", False): + path = "data-saver" + key = "dataSaver" + else: + path = key = "data" server = self.api.athome_server(self.uuid) chapter = server["chapter"] - base = f"{server['baseUrl']}/data/{chapter['hash']}/" + base = f"{server['baseUrl']}/{path}/{chapter['hash']}/" enum = util.enumerate_reversed if self.config( "page-reverse") else enumerate - for data["page"], page in enum(chapter["data"], 1): + for data["page"], page in enum(chapter[key], 1): text.nameext_from_url(page, data) yield 
Message.Url, f"{base}{page}", data @@ -454,7 +460,7 @@ def _manga_info(self, uuid): "manga_id": manga["id"], "manga_titles": [t.popitem()[1] for t in mattr.get("altTitles") or ()], - "manga_date" : text.parse_datetime(mattr.get("createdAt")), + "manga_date" : self.parse_datetime_iso(mattr.get("createdAt")), "description" : (mattr["description"].get("en") or next(iter(mattr["description"].values()), "")), "demographic": mattr.get("publicationDemographic"), diff --git a/gallery_dl/extractor/mangafox.py b/gallery_dl/extractor/mangafox.py index 76f4b7e..8fa645b 100644 --- a/gallery_dl/extractor/mangafox.py +++ b/gallery_dl/extractor/mangafox.py @@ -18,8 +18,8 @@ class MangafoxChapterExtractor(ChapterExtractor): """Extractor for manga chapters from fanfox.net""" category = "mangafox" root = "https://m.fanfox.net" - pattern = BASE_PATTERN + \ - r"(/manga/[^/?#]+/((?:v([^/?#]+)/)?c(\d+)([^/?#]*)))" + pattern = (rf"{BASE_PATTERN}" + rf"(/manga/[^/?#]+/((?:v([^/?#]+)/)?c(\d+)([^/?#]*)))") example = "https://fanfox.net/manga/TITLE/v01/c001/1.html" def __init__(self, match): @@ -62,7 +62,7 @@ class MangafoxMangaExtractor(MangaExtractor): category = "mangafox" root = "https://m.fanfox.net" chapterclass = MangafoxChapterExtractor - pattern = BASE_PATTERN + r"(/manga/[^/?#]+)/?$" + pattern = rf"{BASE_PATTERN}(/manga/[^/?#]+)/?$" example = "https://fanfox.net/manga/TITLE" def chapters(self, page): @@ -99,7 +99,7 @@ class MangafoxMangaExtractor(MangaExtractor): "chapter" : text.parse_int(chapter), "chapter_minor" : minor or "", "chapter_string": cstr, - "date" : text.parse_datetime( + "date" : self.parse_datetime( extr('right">', '</span>'), "%b %d, %Y"), } chapter.update(data) diff --git a/gallery_dl/extractor/mangahere.py b/gallery_dl/extractor/mangahere.py index 151e809..9b3a3a1 100644 --- a/gallery_dl/extractor/mangahere.py +++ b/gallery_dl/extractor/mangahere.py @@ -9,7 +9,7 @@ """Extractors for https://www.mangahere.cc/""" from .common import ChapterExtractor, MangaExtractor 
-from .. import text, util +from .. import text class MangahereBase(): @@ -102,7 +102,7 @@ class MangahereMangaExtractor(MangahereBase, MangaExtractor): info, pos = text.extract(page, 'class="title3">', '<', pos) date, pos = text.extract(page, 'class="title2">', '<', pos) - match = util.re( + match = text.re( r"(?:Vol\.0*(\d+) )?Ch\.0*(\d+)(\S*)(?: - (.*))?").match(info) if match: volume, chapter, minor, title = match.groups() diff --git a/gallery_dl/extractor/manganelo.py b/gallery_dl/extractor/manganelo.py index a6948e3..3ecf934 100644 --- a/gallery_dl/extractor/manganelo.py +++ b/gallery_dl/extractor/manganelo.py @@ -39,7 +39,7 @@ BASE_PATTERN = ManganeloExtractor.update({ class ManganeloChapterExtractor(ManganeloExtractor, ChapterExtractor): """Extractor for manganelo manga chapters""" - pattern = BASE_PATTERN + r"(/manga/[^/?#]+/chapter-[^/?#]+)" + pattern = rf"{BASE_PATTERN}(/manga/[^/?#]+/chapter-[^/?#]+)" example = "https://www.mangakakalot.gg/manga/MANGA_NAME/chapter-123" def __init__(self, match): @@ -50,10 +50,10 @@ class ManganeloChapterExtractor(ManganeloExtractor, ChapterExtractor): extr = text.extract_from(page) data = { - "date" : text.parse_datetime(extr( - '"datePublished": "', '"')[:19], "%Y-%m-%dT%H:%M:%S"), - "date_updated": text.parse_datetime(extr( - '"dateModified": "', '"')[:19], "%Y-%m-%dT%H:%M:%S"), + "date" : self.parse_datetime_iso(extr( + '"datePublished": "', '"')[:19]), + "date_updated": self.parse_datetime_iso(extr( + '"dateModified": "', '"')[:19]), "manga_id" : text.parse_int(extr("comic_id =", ";")), "chapter_id" : text.parse_int(extr("chapter_id =", ";")), "manga" : extr("comic_name =", ";").strip('" '), @@ -86,7 +86,7 @@ class ManganeloChapterExtractor(ManganeloExtractor, ChapterExtractor): class ManganeloMangaExtractor(ManganeloExtractor, MangaExtractor): """Extractor for manganelo manga""" chapterclass = ManganeloChapterExtractor - pattern = BASE_PATTERN + r"(/manga/[^/?#]+)$" + pattern = rf"{BASE_PATTERN}(/manga/[^/?#]+)$" 
example = "https://www.mangakakalot.gg/manga/MANGA_NAME" def __init__(self, match): @@ -99,7 +99,7 @@ class ManganeloMangaExtractor(ManganeloExtractor, MangaExtractor): manga = text.unescape(extr("<h1>", "<")) author = text.remove_html(extr("<li>Author(s) :", "</a>")) status = extr("<li>Status :", "<").strip() - update = text.parse_datetime(extr( + update = self.parse_datetime(extr( "<li>Last updated :", "<").strip(), "%b-%d-%Y %I:%M:%S %p") tags = text.split_html(extr(">Genres :", "</li>"))[::2] @@ -121,7 +121,7 @@ class ManganeloMangaExtractor(ManganeloExtractor, MangaExtractor): "chapter" : text.parse_int(chapter), "chapter_minor": (sep and ".") + minor, "title" : title.partition(": ")[2], - "date" : text.parse_datetime(date, "%b-%d-%Y %H:%M"), + "date" : self.parse_datetime(date, "%b-%d-%Y %H:%M"), "lang" : "en", "language": "English", })) diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py index 19aee33..e2f9166 100644 --- a/gallery_dl/extractor/mangapark.py +++ b/gallery_dl/extractor/mangapark.py @@ -23,7 +23,7 @@ class MangaparkBase(): category = "mangapark" def _parse_chapter_title(self, title): - match = util.re( + match = text.re( r"(?i)" r"(?:vol(?:\.|ume)?\s*(\d+)\s*)?" 
r"ch(?:\.|apter)?\s*(\d+)([^\s:]*)" @@ -70,8 +70,8 @@ class MangaparkBase(): class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor): """Extractor for manga-chapters from mangapark.net""" - pattern = (BASE_PATTERN + - r"/(?:title/[^/?#]+/|comic/\d+/[^/?#]+/[^/?#]+-i)(\d+)") + pattern = (rf"{BASE_PATTERN}/" + rf"(?:title/[^/?#]+/|comic/\d+/[^/?#]+/[^/?#]+-i)(\d+)") example = "https://mangapark.net/title/MANGA/12345-en-ch.01" def __init__(self, match): @@ -101,7 +101,7 @@ class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor): "language" : util.code_to_language(lang), "source" : chapter["srcTitle"], "source_id" : chapter["sourceId"], - "date" : text.parse_timestamp(chapter["dateCreate"] // 1000), + "date" : self.parse_timestamp(chapter["dateCreate"] // 1000), } def images(self, _): @@ -111,7 +111,7 @@ class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor): class MangaparkMangaExtractor(MangaparkBase, Extractor): """Extractor for manga from mangapark.net""" subcategory = "manga" - pattern = BASE_PATTERN + r"/(?:title|comic)/(\d+)(?:[/-][^/?#]*)?/?$" + pattern = rf"{BASE_PATTERN}/(?:title|comic)/(\d+)(?:[/-][^/?#]*)?/?$" example = "https://mangapark.net/title/12345-MANGA" def __init__(self, match): @@ -138,7 +138,7 @@ class MangaparkMangaExtractor(MangaparkBase, Extractor): "language" : util.code_to_language(lang), "source" : chapter["srcTitle"], "source_id" : chapter["sourceId"], - "date" : text.parse_timestamp( + "date" : self.parse_timestamp( chapter["dateCreate"] // 1000), "_extractor": MangaparkChapterExtractor, } diff --git a/gallery_dl/extractor/mangaread.py b/gallery_dl/extractor/mangaread.py index a3bdf39..82fddde 100644 --- a/gallery_dl/extractor/mangaread.py +++ b/gallery_dl/extractor/mangaread.py @@ -7,7 +7,7 @@ """Extractors for https://mangaread.org/""" from .common import ChapterExtractor, MangaExtractor -from .. import text, util, exception +from .. 
import text, exception class MangareadBase(): @@ -16,7 +16,7 @@ class MangareadBase(): root = "https://www.mangaread.org" def parse_chapter_string(self, chapter_string, data): - match = util.re( + match = text.re( r"(?:(.+)\s*-\s*)?[Cc]hapter\s*(\d+)(\.\d+)?(?:\s*-\s*(.+))?" ).match(text.unescape(chapter_string).strip()) manga, chapter, minor, title = match.groups() diff --git a/gallery_dl/extractor/mangataro.py b/gallery_dl/extractor/mangataro.py index f4cc058..029bc2e 100644 --- a/gallery_dl/extractor/mangataro.py +++ b/gallery_dl/extractor/mangataro.py @@ -40,10 +40,8 @@ class MangataroChapterExtractor(MangataroBase, ChapterExtractor): "chapter_minor": str(round(minor, 5))[1:] if minor else "", "chapter_id" : text.parse_int(chapter_id), "chapter_url" : comic["url"], - "date" : text.parse_datetime( - comic["datePublished"], "%Y-%m-%dT%H:%M:%S%z"), - "date_updated" : text.parse_datetime( - comic["dateModified"], "%Y-%m-%dT%H:%M:%S%z"), + "date" : self.parse_datetime_iso(comic["datePublished"]), + "date_updated" : self.parse_datetime_iso(comic["dateModified"]), } def images(self, page): diff --git a/gallery_dl/extractor/mangoxo.py b/gallery_dl/extractor/mangoxo.py index beb13ce..60f0de9 100644 --- a/gallery_dl/extractor/mangoxo.py +++ b/gallery_dl/extractor/mangoxo.py @@ -91,7 +91,7 @@ class MangoxoAlbumExtractor(MangoxoExtractor): data = self.metadata(page) imgs = self.images(url, page) - yield Message.Directory, data + yield Message.Directory, "", data data["extension"] = None for data["num"], path in enumerate(imgs, 1): @@ -119,7 +119,7 @@ class MangoxoAlbumExtractor(MangoxoExtractor): "album": { "id": self.album_id, "name": text.unescape(title), - "date": text.parse_datetime(date.strip(), "%Y.%m.%d %H:%M"), + "date": self.parse_datetime(date.strip(), "%Y.%m.%d %H:%M"), "description": text.unescape(descr), }, "count": text.parse_int(count), diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index 1bab63a..165f8b8 100644 --- 
a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -64,10 +64,9 @@ class MastodonExtractor(BaseExtractor): status["count"] = len(attachments) status["tags"] = [tag["name"] for tag in status["tags"]] - status["date"] = text.parse_datetime( - status["created_at"][:19], "%Y-%m-%dT%H:%M:%S") + status["date"] = self.parse_datetime_iso(status["created_at"][:19]) - yield Message.Directory, status + yield Message.Directory, "", status for status["num"], media in enumerate(attachments, 1): status["media"] = media url = media["url"] @@ -119,7 +118,7 @@ BASE_PATTERN = MastodonExtractor.update({ class MastodonUserExtractor(MastodonExtractor): """Extractor for all images of an account/user""" subcategory = "user" - pattern = BASE_PATTERN + r"/(?:@|users/)([^/?#]+)(?:/media)?/?$" + pattern = rf"{BASE_PATTERN}/(?:@|users/)([^/?#]+)(?:/media)?/?$" example = "https://mastodon.social/@USER" def statuses(self): @@ -139,7 +138,7 @@ class MastodonUserExtractor(MastodonExtractor): class MastodonBookmarkExtractor(MastodonExtractor): """Extractor for mastodon bookmarks""" subcategory = "bookmark" - pattern = BASE_PATTERN + r"/bookmarks" + pattern = rf"{BASE_PATTERN}/bookmarks" example = "https://mastodon.social/bookmarks" def statuses(self): @@ -149,7 +148,7 @@ class MastodonBookmarkExtractor(MastodonExtractor): class MastodonFavoriteExtractor(MastodonExtractor): """Extractor for mastodon favorites""" subcategory = "favorite" - pattern = BASE_PATTERN + r"/favourites" + pattern = rf"{BASE_PATTERN}/favourites" example = "https://mastodon.social/favourites" def statuses(self): @@ -159,7 +158,7 @@ class MastodonFavoriteExtractor(MastodonExtractor): class MastodonListExtractor(MastodonExtractor): """Extractor for mastodon lists""" subcategory = "list" - pattern = BASE_PATTERN + r"/lists/(\w+)" + pattern = rf"{BASE_PATTERN}/lists/(\w+)" example = "https://mastodon.social/lists/12345" def statuses(self): @@ -169,7 +168,7 @@ class 
MastodonListExtractor(MastodonExtractor): class MastodonHashtagExtractor(MastodonExtractor): """Extractor for mastodon hashtags""" subcategory = "hashtag" - pattern = BASE_PATTERN + r"/tags/(\w+)" + pattern = rf"{BASE_PATTERN}/tags/(\w+)" example = "https://mastodon.social/tags/NAME" def statuses(self): @@ -179,7 +178,7 @@ class MastodonHashtagExtractor(MastodonExtractor): class MastodonFollowingExtractor(MastodonExtractor): """Extractor for followed mastodon users""" subcategory = "following" - pattern = BASE_PATTERN + r"/(?:@|users/)([^/?#]+)/following" + pattern = rf"{BASE_PATTERN}/(?:@|users/)([^/?#]+)/following" example = "https://mastodon.social/@USER/following" def items(self): @@ -194,7 +193,7 @@ class MastodonFollowingExtractor(MastodonExtractor): class MastodonStatusExtractor(MastodonExtractor): """Extractor for images from a status""" subcategory = "status" - pattern = (BASE_PATTERN + r"/(?:@[^/?#]+|(?:users/[^/?#]+/)?" + pattern = (rf"{BASE_PATTERN}/(?:@[^/?#]+|(?:users/[^/?#]+/)?" 
r"(?:statuses|notice|objects()))/(?!following)([^/?#]+)") example = "https://mastodon.social/@USER/12345" @@ -319,10 +318,8 @@ class MastodonAPI(): if code == 404: raise exception.NotFoundError() if code == 429: - self.extractor.wait(until=text.parse_datetime( - response.headers["x-ratelimit-reset"], - "%Y-%m-%dT%H:%M:%S.%fZ", - )) + self.extractor.wait(until=self.extractor.parse_datetime_iso( + response.headers["x-ratelimit-reset"])) continue raise exception.AbortExtraction(response.json().get("error")) diff --git a/gallery_dl/extractor/message.py b/gallery_dl/extractor/message.py index d5c2554..6eda213 100644 --- a/gallery_dl/extractor/message.py +++ b/gallery_dl/extractor/message.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2021 Mike Fährmann +# Copyright 2015-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -14,13 +14,14 @@ class Message(): is one of the following identifiers. This message-identifier determines the type and meaning of the other elements in such a tuple. 
- - Message.Version: + - Message.Version: # obsolete - Message protocol version (currently always '1') - 2nd element specifies the version of all following messages as integer - Message.Directory: - Sets the target directory for all following images - - 2nd element is a dictionary containing general metadata + - 2nd element is unused + - 3rd element is a dictionary containing general metadata - Message.Url: - Image URL and its metadata @@ -45,7 +46,7 @@ class Message(): - The additional URLs serve as a fallback if the primary one fails """ - Version = 1 + # Version = 1 Directory = 2 Url = 3 # Headers = 4 diff --git a/gallery_dl/extractor/misskey.py b/gallery_dl/extractor/misskey.py index 42eaeef..ca3ae18 100644 --- a/gallery_dl/extractor/misskey.py +++ b/gallery_dl/extractor/misskey.py @@ -7,7 +7,7 @@ """Extractors for Misskey instances""" from .common import BaseExtractor, Message, Dispatch -from .. import text, exception +from .. import text, dt, exception from ..cache import memcache @@ -18,10 +18,6 @@ class MisskeyExtractor(BaseExtractor): filename_fmt = "{category}_{id}_{file[id]}.{extension}" archive_fmt = "{id}_{file[id]}" - def __init__(self, match): - BaseExtractor.__init__(self, match) - self.item = self.groups[-1] - def _init(self): self.api = MisskeyAPI(self) self.instance = self.root.rpartition("://")[2] @@ -48,13 +44,11 @@ class MisskeyExtractor(BaseExtractor): note["instance"] = self.instance note["instance_remote"] = note["user"]["host"] note["count"] = len(files) - note["date"] = text.parse_datetime( - note["createdAt"], "%Y-%m-%dT%H:%M:%S.%f%z") + note["date"] = self.parse_datetime_iso(note["createdAt"]) - yield Message.Directory, note + yield Message.Directory, "", note for note["num"], file in enumerate(files, 1): - file["date"] = text.parse_datetime( - file["createdAt"], "%Y-%m-%dT%H:%M:%S.%f%z") + file["date"] = self.parse_datetime_iso(file["createdAt"]) note["file"] = file url = file["url"] yield Message.Url, url, text.nameext_from_url(url, 
note) @@ -108,11 +102,11 @@ BASE_PATTERN = MisskeyExtractor.update({ class MisskeyUserExtractor(Dispatch, MisskeyExtractor): """Extractor for all images of a Misskey user""" subcategory = "user" - pattern = BASE_PATTERN + r"/@([^/?#]+)/?$" + pattern = rf"{BASE_PATTERN}/@([^/?#]+)/?$" example = "https://misskey.io/@USER" def items(self): - base = f"{self.root}/@{self.item}/" + base = f"{self.root}/@{self.groups[-1]}/" return self._dispatch_extractors(( (MisskeyInfoExtractor , base + "info"), (MisskeyAvatarExtractor , base + "avatar"), @@ -124,32 +118,33 @@ class MisskeyUserExtractor(Dispatch, MisskeyExtractor): class MisskeyNotesExtractor(MisskeyExtractor): """Extractor for a Misskey user's notes""" subcategory = "notes" - pattern = BASE_PATTERN + r"/@([^/?#]+)/notes" + pattern = rf"{BASE_PATTERN}/@([^/?#]+)/notes" example = "https://misskey.io/@USER/notes" def notes(self): - return self.api.users_notes(self.api.user_id_by_username(self.item)) + return self.api.users_notes(self.api.user_id_by_username( + self.groups[-1])) class MisskeyInfoExtractor(MisskeyExtractor): """Extractor for a Misskey user's profile data""" subcategory = "info" - pattern = BASE_PATTERN + r"/@([^/?#]+)/info" + pattern = rf"{BASE_PATTERN}/@([^/?#]+)/info" example = "https://misskey.io/@USER/info" def items(self): - user = self.api.users_show(self.item) - return iter(((Message.Directory, user),)) + user = self.api.users_show(self.groups[-1]) + return iter(((Message.Directory, "", user),)) class MisskeyAvatarExtractor(MisskeyExtractor): """Extractor for a Misskey user's avatar""" subcategory = "avatar" - pattern = BASE_PATTERN + r"/@([^/?#]+)/avatar" + pattern = rf"{BASE_PATTERN}/@([^/?#]+)/avatar" example = "https://misskey.io/@USER/avatar" def notes(self): - user = self.api.users_show(self.item) + user = self.api.users_show(self.groups[-1]) url = user.get("avatarUrl") return (self._make_note("avatar", user, url),) if url else () @@ -157,11 +152,11 @@ class 
MisskeyAvatarExtractor(MisskeyExtractor): class MisskeyBackgroundExtractor(MisskeyExtractor): """Extractor for a Misskey user's banner image""" subcategory = "background" - pattern = BASE_PATTERN + r"/@([^/?#]+)/ba(?:nner|ckground)" + pattern = rf"{BASE_PATTERN}/@([^/?#]+)/ba(?:nner|ckground)" example = "https://misskey.io/@USER/banner" def notes(self): - user = self.api.users_show(self.item) + user = self.api.users_show(self.groups[-1]) url = user.get("bannerUrl") return (self._make_note("background", user, url),) if url else () @@ -169,11 +164,11 @@ class MisskeyBackgroundExtractor(MisskeyExtractor): class MisskeyFollowingExtractor(MisskeyExtractor): """Extractor for followed Misskey users""" subcategory = "following" - pattern = BASE_PATTERN + r"/@([^/?#]+)/following" + pattern = rf"{BASE_PATTERN}/@([^/?#]+)/following" example = "https://misskey.io/@USER/following" def items(self): - user_id = self.api.user_id_by_username(self.item) + user_id = self.api.user_id_by_username(self.groups[-1]) for user in self.api.users_following(user_id): user = user["followee"] url = f"{self.root}/@{user['username']}" @@ -186,17 +181,17 @@ class MisskeyFollowingExtractor(MisskeyExtractor): class MisskeyNoteExtractor(MisskeyExtractor): """Extractor for images from a Note""" subcategory = "note" - pattern = BASE_PATTERN + r"/notes/(\w+)" + pattern = rf"{BASE_PATTERN}/notes/(\w+)" example = "https://misskey.io/notes/98765" def notes(self): - return (self.api.notes_show(self.item),) + return (self.api.notes_show(self.groups[-1]),) class MisskeyFavoriteExtractor(MisskeyExtractor): """Extractor for favorited notes""" subcategory = "favorite" - pattern = BASE_PATTERN + r"/(?:my|api/i)/favorites" + pattern = rf"{BASE_PATTERN}/(?:my|api/i)/favorites" example = "https://misskey.io/my/favorites" def notes(self): @@ -253,12 +248,39 @@ class MisskeyAPI(): return self.extractor.request_json(url, method="POST", json=data) def _pagination(self, endpoint, data): + extr = self.extractor 
data["limit"] = 100 - data["withRenotes"] = self.extractor.renotes + data["withRenotes"] = extr.renotes + data["withFiles"] = False if extr.config("text-posts") else True + + date_min, date_max = extr._get_date_min_max() + if (order := extr.config("order-posts")) and \ + order[0] in ("a", "r"): + key = "sinceId" + data["sinceDate"] = 1 if date_min is None else date_min * 1000 + date_stop = None if date_max is None else date_max + else: + key = "untilId" + date_stop = None + if date_min is not None: + data["sinceDate"] = date_min * 1000 + if date_max is None: + # ensure notes are returned in descending order + data["untilDate"] = (int(dt.time.time()) + 1000) * 1000 + if date_max is not None: + data["untilDate"] = date_max * 1000 while True: notes = self._call(endpoint, data) if not notes: return - yield from notes - data["untilId"] = notes[-1]["id"] + elif date_stop is not None and dt.to_ts(dt.parse_iso( + notes[-1]["createdAt"])) > date_stop: + for idx, note in enumerate(notes): + if dt.to_ts(dt.parse_iso(note["createdAt"])) > date_stop: + yield from notes[:idx] + return + else: + yield from notes + + data[key] = notes[-1]["id"] diff --git a/gallery_dl/extractor/moebooru.py b/gallery_dl/extractor/moebooru.py index ba27994..23f8fd9 100644 --- a/gallery_dl/extractor/moebooru.py +++ b/gallery_dl/extractor/moebooru.py @@ -9,9 +9,8 @@ """Extractors for Moebooru based sites""" from .booru import BooruExtractor -from .. import text, util +from .. 
import text, dt import collections -import datetime class MoebooruExtractor(BooruExtractor): @@ -21,7 +20,7 @@ class MoebooruExtractor(BooruExtractor): page_start = 1 def _prepare(self, post): - post["date"] = text.parse_timestamp(post["created_at"]) + post["date"] = dt.parse_ts(post["created_at"]) def _html(self, post): url = f"{self.root}/post/show/{post['id']}" @@ -33,7 +32,7 @@ class MoebooruExtractor(BooruExtractor): return tags = collections.defaultdict(list) - pattern = util.re(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'+]+)") + pattern = text.re(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'+]+)") for tag_type, tag_name in pattern.findall(tag_container): tags[tag_type].append(text.unquote(tag_name)) for key, value in tags.items(): @@ -93,7 +92,7 @@ class MoebooruTagExtractor(MoebooruExtractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{id}" - pattern = BASE_PATTERN + r"/post\?(?:[^&#]*&)*tags=([^&#]*)" + pattern = rf"{BASE_PATTERN}/post\?(?:[^&#]*&)*tags=([^&#]*)" example = "https://yande.re/post?tags=TAG" def __init__(self, match): @@ -112,7 +111,7 @@ class MoebooruPoolExtractor(MoebooruExtractor): subcategory = "pool" directory_fmt = ("{category}", "pool", "{pool}") archive_fmt = "p_{pool}_{id}" - pattern = BASE_PATTERN + r"/pool/show/(\d+)" + pattern = rf"{BASE_PATTERN}/pool/show/(\d+)" example = "https://yande.re/pool/show/12345" def __init__(self, match): @@ -136,7 +135,7 @@ class MoebooruPoolExtractor(MoebooruExtractor): class MoebooruPostExtractor(MoebooruExtractor): subcategory = "post" archive_fmt = "{id}" - pattern = BASE_PATTERN + r"/post/show/(\d+)" + pattern = rf"{BASE_PATTERN}/post/show/(\d+)" example = "https://yande.re/post/show/12345" def posts(self): @@ -148,8 +147,8 @@ class MoebooruPopularExtractor(MoebooruExtractor): subcategory = "popular" directory_fmt = ("{category}", "popular", "{scale}", "{date}") archive_fmt = "P_{scale[0]}_{date}_{id}" - pattern = BASE_PATTERN + \ - 
r"/post/popular_(by_(?:day|week|month)|recent)(?:\?([^#]*))?" + pattern = (rf"{BASE_PATTERN}" + rf"/post/popular_(by_(?:day|week|month)|recent)(?:\?([^#]*))?") example = "https://yande.re/post/popular_by_month?year=YYYY&month=MM" def __init__(self, match): @@ -164,14 +163,14 @@ class MoebooruPopularExtractor(MoebooruExtractor): date = (f"{params['year']:>04}-{params.get('month', '01'):>02}-" f"{params.get('day', '01'):>02}") else: - date = datetime.date.today().isoformat() + date = dt.date.today().isoformat() scale = self.scale if scale.startswith("by_"): scale = scale[3:] if scale == "week": - date = datetime.date.fromisoformat(date) - date = (date - datetime.timedelta(days=date.weekday())).isoformat() + date = dt.date.fromisoformat(date) + date = (date - dt.timedelta(days=date.weekday())).isoformat() elif scale == "month": date = date[:-3] diff --git a/gallery_dl/extractor/motherless.py b/gallery_dl/extractor/motherless.py index 48137ce..c20f138 100644 --- a/gallery_dl/extractor/motherless.py +++ b/gallery_dl/extractor/motherless.py @@ -9,9 +9,8 @@ """Extractors for https://motherless.com/""" from .common import Extractor, Message -from .. import text, util, exception +from .. 
import text, dt, exception from ..cache import memcache -from datetime import timedelta BASE_PATTERN = r"(?:https?://)?motherless\.com" @@ -42,6 +41,8 @@ class MotherlessExtractor(Extractor): path, _, media_id = path.rpartition("/") data = { "id" : media_id, + "title": text.unescape( + (t := extr("<title>", "<")) and t[:t.rfind(" | ")]), "type" : extr("__mediatype = '", "'"), "group": extr("__group = '", "'"), "url" : extr("__fileurl = '", "'"), @@ -50,7 +51,6 @@ class MotherlessExtractor(Extractor): for tag in text.extract_iter( extr('class="media-meta-tags">', "</div>"), ">#", "<") ], - "title": text.unescape(extr("<h1>", "<")), "views": text.parse_int(extr( 'class="count">', " ").replace(",", "")), "favorites": text.parse_int(extr( @@ -115,14 +115,14 @@ class MotherlessExtractor(Extractor): return data - def _parse_datetime(self, dt): - if " ago" not in dt: - return text.parse_datetime(dt, "%d %b %Y") + def _parse_datetime(self, dt_string): + if " ago" not in dt_string: + return dt.parse(dt_string, "%d %b %Y") - value = text.parse_int(dt[:-5]) - delta = timedelta(0, value*3600) if dt[-5] == "h" else timedelta(value) - return (util.datetime_utcnow() - delta).replace( - hour=0, minute=0, second=0) + value = text.parse_int(dt_string[:-5]) + delta = (dt.timedelta(0, value*3600) if dt_string[-5] == "h" else + dt.timedelta(value)) + return (dt.now() - delta).replace(hour=0, minute=0, second=0) @memcache(keyarg=2) def _extract_gallery_title(self, page, gallery_id): @@ -132,10 +132,9 @@ class MotherlessExtractor(Extractor): if title: return text.unescape(title.strip()) - pos = page.find(f' href="/G{gallery_id}"') - if pos >= 0: - return text.unescape(text.extract( - page, ' title="', '"', pos)[0]) + if f' href="/G{gallery_id}"' in page: + return text.unescape( + (t := text.extr(page, "<title>", "<")) and t[:t.rfind(" | ")]) return "" @@ -153,15 +152,15 @@ class MotherlessExtractor(Extractor): class MotherlessMediaExtractor(MotherlessExtractor): """Extractor for a single 
image/video from motherless.com""" subcategory = "media" - pattern = (BASE_PATTERN + - r"/((?:g/[^/?#]+/|G[IV]?[A-Z0-9]+/)?" - r"(?!G)[A-Z0-9]+)") + pattern = (rf"{BASE_PATTERN}/(" + rf"(?:g/[^/?#]+/|G[IV]?[A-Z0-9]+/)?" + rf"(?!G)[A-Z0-9]+)") example = "https://motherless.com/ABC123" def items(self): file = self._extract_media(self.groups[0]) url = file["url"] - yield Message.Directory, file + yield Message.Directory, "", file yield Message.Url, url, text.nameext_from_url(url, file) @@ -171,7 +170,7 @@ class MotherlessGalleryExtractor(MotherlessExtractor): directory_fmt = ("{category}", "{uploader}", "{gallery_id} {gallery_title}") archive_fmt = "{gallery_id}_{id}" - pattern = BASE_PATTERN + "/G([IVG])?([A-Z0-9]+)/?$" + pattern = rf"{BASE_PATTERN}/G([IVG])?([A-Z0-9]+)/?$" example = "https://motherless.com/GABC123" def items(self): @@ -198,7 +197,7 @@ class MotherlessGalleryExtractor(MotherlessExtractor): file["num"] = num file["thumbnail"] = thumbnail url = file["url"] - yield Message.Directory, file + yield Message.Directory, "", file yield Message.Url, url, text.nameext_from_url(url, file) @@ -207,7 +206,7 @@ class MotherlessGroupExtractor(MotherlessExtractor): directory_fmt = ("{category}", "{uploader}", "{group_id} {group_title}") archive_fmt = "{group_id}_{id}" - pattern = BASE_PATTERN + "/g([iv]?)/?([a-z0-9_]+)/?$" + pattern = rf"{BASE_PATTERN}/g([iv]?)/?([a-z0-9_]+)/?$" example = "https://motherless.com/g/abc123" def items(self): @@ -236,5 +235,5 @@ class MotherlessGroupExtractor(MotherlessExtractor): file["uploader"] = uploader file["group"] = file["group_id"] url = file["url"] - yield Message.Directory, file + yield Message.Directory, "", file yield Message.Url, url, text.nameext_from_url(url, file) diff --git a/gallery_dl/extractor/myhentaigallery.py b/gallery_dl/extractor/myhentaigallery.py index 2a39dc9..936f857 100644 --- a/gallery_dl/extractor/myhentaigallery.py +++ b/gallery_dl/extractor/myhentaigallery.py @@ -6,17 +6,21 @@ """Extractors for 
https://myhentaigallery.com/""" -from .common import GalleryExtractor +from .common import Extractor, GalleryExtractor, Message from .. import text, exception +BASE_PATTERN = r"(?:https?://)?myhentaigallery\.com" -class MyhentaigalleryGalleryExtractor(GalleryExtractor): - """Extractor for image galleries from myhentaigallery.com""" + +class MyhentaigalleryBase(): category = "myhentaigallery" root = "https://myhentaigallery.com" + + +class MyhentaigalleryGalleryExtractor(MyhentaigalleryBase, GalleryExtractor): + """Extractor for image galleries from myhentaigallery.com""" directory_fmt = ("{category}", "{gallery_id} {artist:?[/] /J, }{title}") - pattern = (r"(?:https?://)?myhentaigallery\.com" - r"/g(?:allery/(?:thumbnails|show))?/(\d+)") + pattern = rf"{BASE_PATTERN}/g(?:allery/(?:thumbnails|show))?/(\d+)" example = "https://myhentaigallery.com/g/12345" def __init__(self, match): @@ -53,3 +57,32 @@ class MyhentaigalleryGalleryExtractor(GalleryExtractor): "/thumbnail/", "/original/"), None) for url in text.extract_iter(page, 'class="comic-thumb"', '</div>') ] + + +class MyhentaigalleryTagExtractor(MyhentaigalleryBase, Extractor): + """Extractor for myhentaigallery tag searches""" + subcategory = "tag" + pattern = rf"{BASE_PATTERN}(/g/(artist|category|group|parody)/(\d+).*)" + example = "https://myhentaigallery.com/g/category/123" + + def items(self): + data = {"_extractor": MyhentaigalleryGalleryExtractor} + for url in self.galleries(): + yield Message.Queue, url, data + + def galleries(self): + root = self.root + url = root + self.groups[0] + + while True: + page = self.request(url).text + + for inner in text.extract_iter( + page, '<div class="comic-inner">', "<div"): + yield root + text.extr(inner, 'href="', '"') + + try: + pos = page.index(">Next<") + except ValueError: + return + url = root + text.rextr(page, 'href="', '"', pos) diff --git a/gallery_dl/extractor/myportfolio.py b/gallery_dl/extractor/myportfolio.py index 0223d0b..3a21122 100644 --- 
a/gallery_dl/extractor/myportfolio.py +++ b/gallery_dl/extractor/myportfolio.py @@ -49,7 +49,7 @@ class MyportfolioGalleryExtractor(Extractor): data = self.metadata(page) imgs = self.images(page) data["count"] = len(imgs) - yield Message.Directory, data + yield Message.Directory, "", data for data["num"], url in enumerate(imgs, 1): yield Message.Url, url, text.nameext_from_url(url, data) diff --git a/gallery_dl/extractor/naverblog.py b/gallery_dl/extractor/naverblog.py index b55e001..cc96e09 100644 --- a/gallery_dl/extractor/naverblog.py +++ b/gallery_dl/extractor/naverblog.py @@ -9,8 +9,7 @@ """Extractors for https://blog.naver.com/""" from .common import GalleryExtractor, Extractor, Message -from .. import text, util -import datetime +from .. import text, util, dt import time @@ -67,11 +66,11 @@ class NaverBlogPostExtractor(NaverBlogBase, GalleryExtractor): return data - def _parse_datetime(self, date_string): - if "전" in date_string: + def _parse_datetime(self, dt_string): + if "전" in dt_string: ts = time.gmtime() - return datetime.datetime(ts.tm_year, ts.tm_mon, ts.tm_mday) - return text.parse_datetime(date_string, "%Y. %m. %d. %H:%M") + return dt.datetime(ts.tm_year, ts.tm_mon, ts.tm_mday) + return dt.parse(dt_string, "%Y. %m. %d. 
%H:%M") def images(self, page): files = [] diff --git a/gallery_dl/extractor/naverchzzk.py b/gallery_dl/extractor/naverchzzk.py index de4ee7a..5b56710 100644 --- a/gallery_dl/extractor/naverchzzk.py +++ b/gallery_dl/extractor/naverchzzk.py @@ -31,17 +31,17 @@ class NaverChzzkExtractor(Extractor): data["uid"] = data["objectId"] data["user"] = comment["user"] data["count"] = len(files) - data["date"] = text.parse_datetime( + data["date"] = self.parse_datetime( data["createdDate"], "%Y%m%d%H%M%S") - yield Message.Directory, data + yield Message.Directory, "", data for data["num"], file in enumerate(files, 1): if extra := file.get("extraJson"): file.update(util.json_loads(extra)) - file["date"] = text.parse_datetime( - file["createdDate"], "%Y-%m-%dT%H:%M:%S.%f%z") - file["date_updated"] = text.parse_datetime( - file["updatedDate"], "%Y-%m-%dT%H:%M:%S.%f%z") + file["date"] = self.parse_datetime_iso( + file["createdDate"]) + file["date_updated"] = self.parse_datetime_iso( + file["updatedDate"]) data["file"] = file url = file["attachValue"] yield Message.Url, url, text.nameext_from_url(url, data) diff --git a/gallery_dl/extractor/naverwebtoon.py b/gallery_dl/extractor/naverwebtoon.py index 3211941..72089d0 100644 --- a/gallery_dl/extractor/naverwebtoon.py +++ b/gallery_dl/extractor/naverwebtoon.py @@ -27,7 +27,7 @@ class NaverWebtoonEpisodeExtractor(NaverWebtoonBase, GalleryExtractor): directory_fmt = ("{category}", "{comic}") filename_fmt = "{episode:>03}-{num:>02}.{extension}" archive_fmt = "{title_id}_{episode}_{num}" - pattern = BASE_PATTERN + r"/detail(?:\.nhn)?\?([^#]+)" + pattern = rf"{BASE_PATTERN}/detail(?:\.nhn)?\?([^#]+)" example = "https://comic.naver.com/webtoon/detail?titleId=12345&no=1" def __init__(self, match): @@ -66,7 +66,7 @@ class NaverWebtoonEpisodeExtractor(NaverWebtoonBase, GalleryExtractor): class NaverWebtoonComicExtractor(NaverWebtoonBase, Extractor): subcategory = "comic" categorytransfer = True - pattern = BASE_PATTERN + 
r"/list(?:\.nhn)?\?([^#]+)" + pattern = rf"{BASE_PATTERN}/list(?:\.nhn)?\?([^#]+)" example = "https://comic.naver.com/webtoon/list?titleId=12345" def __init__(self, match): diff --git a/gallery_dl/extractor/nekohouse.py b/gallery_dl/extractor/nekohouse.py index e6b0461..728912b 100644 --- a/gallery_dl/extractor/nekohouse.py +++ b/gallery_dl/extractor/nekohouse.py @@ -12,7 +12,7 @@ from .common import Extractor, Message from .. import text BASE_PATTERN = r"(?:https?://)?nekohouse\.su" -USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)/user/([^/?#]+)" +USER_PATTERN = rf"{BASE_PATTERN}/([^/?#]+)/user/([^/?#]+)" class NekohouseExtractor(Extractor): @@ -27,7 +27,7 @@ class NekohousePostExtractor(NekohouseExtractor): "{post_id} {date} {title[b:230]}") filename_fmt = "{num:>02} {id|filename}.{extension}" archive_fmt = "{service}_{user_id}_{post_id}_{hash}" - pattern = USER_PATTERN + r"/post/([^/?#]+)" + pattern = rf"{USER_PATTERN}/post/([^/?#]+)" example = "https://nekohouse.su/SERVICE/user/12345/post/12345" def items(self): @@ -42,7 +42,7 @@ class NekohousePostExtractor(NekohouseExtractor): post["post_id"] = post_id post["count"] = len(files) - yield Message.Directory, post + yield Message.Directory, "", post for post["num"], file in enumerate(files, 1): url = file["url"] text.nameext_from_url(url, file) @@ -59,8 +59,8 @@ class NekohousePostExtractor(NekohouseExtractor): 'class="scrape__user-name', '</').rpartition(">")[2].strip()), "title" : text.unescape(extr( 'class="scrape__title', '</').rpartition(">")[2]), - "date" : text.parse_datetime(extr( - 'datetime="', '"')[:19], "%Y-%m-%d %H:%M:%S"), + "date" : self.parse_datetime_iso(extr( + 'datetime="', '"')[:19]), "content": text.unescape(extr( 'class="scrape__content">', "</div>").strip()), } @@ -98,7 +98,7 @@ class NekohousePostExtractor(NekohouseExtractor): class NekohouseUserExtractor(NekohouseExtractor): subcategory = "user" - pattern = USER_PATTERN + r"/?(?:\?([^#]+))?(?:$|\?|#)" + pattern = 
rf"{USER_PATTERN}/?(?:\?([^#]+))?(?:$|\?|#)" example = "https://nekohouse.su/SERVICE/user/12345" def items(self): diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index ffb4cad..f980f4b 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -9,7 +9,7 @@ """Extractors for https://www.newgrounds.com/""" from .common import Extractor, Message, Dispatch -from .. import text, util, exception +from .. import text, util, dt, exception from ..cache import cache import itertools @@ -34,7 +34,7 @@ class NewgroundsExtractor(Extractor): self.user_root = f"https://{self.user}.newgrounds.com" def _init(self): - self._extract_comment_urls = util.re( + self._extract_comment_urls = text.re( r'(?:<img |data-smartload-)src="([^"]+)').findall self.flash = self.config("flash", True) @@ -58,13 +58,13 @@ class NewgroundsExtractor(Extractor): post = self.extract_post(post_url) url = post.get("url") except Exception as exc: - self.log.debug("", exc_info=exc) + self.log.traceback(exc) url = None if url: if metadata: post.update(metadata) - yield Message.Directory, post + yield Message.Directory, "", post post["num"] = 0 yield Message.Url, url, text.nameext_from_url(url, post) @@ -88,6 +88,7 @@ class NewgroundsExtractor(Extractor): text.nameext_from_url(url, post) yield Message.Url, url, post else: + self.status |= 1 self.log.warning( "Unable to get download URL for '%s'", post_url) @@ -218,7 +219,7 @@ class NewgroundsExtractor(Extractor): "description": text.unescape(extr(':description" content="', '"')), "type" : "art", "_type" : "i", - "date" : text.parse_datetime(extr( + "date" : dt.parse_iso(extr( 'itemprop="datePublished" content="', '"')), "rating" : extr('class="rated-', '"'), "url" : full('src="', '"'), @@ -268,7 +269,7 @@ class NewgroundsExtractor(Extractor): "description": text.unescape(extr(':description" content="', '"')), "type" : "audio", "_type" : "a", - "date" : text.parse_datetime(extr( + "date" : 
dt.parse_iso(extr( 'itemprop="datePublished" content="', '"')), "url" : extr('{"url":"', '"').replace("\\/", "/"), "index" : text.parse_int(index), @@ -287,7 +288,7 @@ class NewgroundsExtractor(Extractor): src = src.replace("\\/", "/") formats = () type = extr(',"description":"', '"') - date = text.parse_datetime(extr( + date = dt.parse_iso(extr( 'itemprop="datePublished" content="', '"')) if type: type = type.rpartition(" ")[2].lower() @@ -302,7 +303,7 @@ class NewgroundsExtractor(Extractor): sources = self.request_json(url, headers=headers)["sources"] formats = self._video_formats(sources) src = next(formats, "") - date = text.parse_timestamp(src.rpartition("?")[2]) + date = self.parse_timestamp(src.rpartition("?")[2]) type = "movie" return { @@ -321,7 +322,7 @@ class NewgroundsExtractor(Extractor): def _video_formats(self, sources): src = sources["360p"][0]["src"] - sub = util.re(r"\.360p\.\w+").sub + sub = text.re(r"\.360p\.\w+").sub for fmt in self.format: try: @@ -411,7 +412,7 @@ class NewgroundsImageExtractor(NewgroundsExtractor): class NewgroundsMediaExtractor(NewgroundsExtractor): """Extractor for a media file from newgrounds.com""" subcategory = "media" - pattern = BASE_PATTERN + r"(/(?:portal/view|audio/listen)/\d+)" + pattern = rf"{BASE_PATTERN}(/(?:portal/view|audio/listen)/\d+)" example = "https://www.newgrounds.com/portal/view/12345" def __init__(self, match): @@ -426,34 +427,34 @@ class NewgroundsMediaExtractor(NewgroundsExtractor): class NewgroundsArtExtractor(NewgroundsExtractor): """Extractor for all images of a newgrounds user""" subcategory = _path = "art" - pattern = USER_PATTERN + r"/art(?:(?:/page/|/?\?page=)(\d+))?/?$" + pattern = rf"{USER_PATTERN}/art(?:(?:/page/|/?\?page=)(\d+))?/?$" example = "https://USER.newgrounds.com/art" class NewgroundsAudioExtractor(NewgroundsExtractor): """Extractor for all audio submissions of a newgrounds user""" subcategory = _path = "audio" - pattern = USER_PATTERN + r"/audio(?:(?:/page/|/?\?page=)(\d+))?/?$" 
+ pattern = rf"{USER_PATTERN}/audio(?:(?:/page/|/?\?page=)(\d+))?/?$" example = "https://USER.newgrounds.com/audio" class NewgroundsMoviesExtractor(NewgroundsExtractor): """Extractor for all movies of a newgrounds user""" subcategory = _path = "movies" - pattern = USER_PATTERN + r"/movies(?:(?:/page/|/?\?page=)(\d+))?/?$" + pattern = rf"{USER_PATTERN}/movies(?:(?:/page/|/?\?page=)(\d+))?/?$" example = "https://USER.newgrounds.com/movies" class NewgroundsGamesExtractor(NewgroundsExtractor): """Extractor for a newgrounds user's games""" subcategory = _path = "games" - pattern = USER_PATTERN + r"/games(?:(?:/page/|/?\?page=)(\d+))?/?$" + pattern = rf"{USER_PATTERN}/games(?:(?:/page/|/?\?page=)(\d+))?/?$" example = "https://USER.newgrounds.com/games" class NewgroundsUserExtractor(Dispatch, NewgroundsExtractor): """Extractor for a newgrounds user profile""" - pattern = USER_PATTERN + r"/?$" + pattern = rf"{USER_PATTERN}/?$" example = "https://USER.newgrounds.com" def items(self): @@ -470,7 +471,7 @@ class NewgroundsFavoriteExtractor(NewgroundsExtractor): """Extractor for posts favorited by a newgrounds user""" subcategory = "favorite" directory_fmt = ("{category}", "{user}", "Favorites") - pattern = (USER_PATTERN + r"/favorites(?!/following)(?:/(art|audio|movies)" + pattern = (rf"{USER_PATTERN}/favorites(?!/following)(?:/(art|audio|movies)" r"(?:(?:/page/|/?\?page=)(\d+))?)?") example = "https://USER.newgrounds.com/favorites" @@ -516,7 +517,7 @@ class NewgroundsFavoriteExtractor(NewgroundsExtractor): class NewgroundsFollowingExtractor(NewgroundsFavoriteExtractor): """Extractor for a newgrounds user's favorited users""" subcategory = "following" - pattern = (USER_PATTERN + r"/favorites/(following)" + pattern = (rf"{USER_PATTERN}/favorites/(following)" r"(?:(?:/page/|/?\?page=)(\d+))?") example = "https://USER.newgrounds.com/favorites/following" @@ -538,7 +539,7 @@ class NewgroundsSearchExtractor(NewgroundsExtractor): """Extractor for newgrounds.com search reesults""" 
subcategory = "search" directory_fmt = ("{category}", "search", "{search_tags}") - pattern = BASE_PATTERN + r"/search/conduct/([^/?#]+)/?\?([^#]+)" + pattern = rf"{BASE_PATTERN}/search/conduct/([^/?#]+)/?\?([^#]+)" example = "https://www.newgrounds.com/search/conduct/art?terms=QUERY" def __init__(self, match): diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index c6df835..a6b01c2 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -9,7 +9,7 @@ """Extractors for nijie instances""" from .common import BaseExtractor, Message, Dispatch, AsynchronousMixin -from .. import text, exception +from .. import text, dt, exception from ..cache import cache @@ -59,7 +59,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor): urls = self._extract_images(image_id, page) data["count"] = len(urls) - yield Message.Directory, data + yield Message.Directory, "", data for num, url in enumerate(urls): image = text.nameext_from_url(url, { "num": num, @@ -82,8 +82,9 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor): "title" : keywords[0].strip(), "description": text.unescape(extr( '"description": "', '"').replace("&amp;", "&")), - "date" : text.parse_datetime(extr( - '"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y", 9), + "date" : dt.parse(extr( + '"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y" + ) - dt.timedelta(hours=9), "artist_id" : text.parse_int(extr('/members.php?id=', '"')), "artist_name": keywords[1], "tags" : keywords[2:-1], @@ -101,9 +102,9 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor): "artist_id" : text.parse_int(extr('members.php?id=', '"')), "artist_name": keywords[1], "tags" : keywords[2:-1], - "date" : text.parse_datetime(extr( - "itemprop='datePublished' content=", "<").rpartition(">")[2], - "%Y-%m-%d %H:%M:%S", 9), + "date" : dt.parse_iso(extr( + "itemprop='datePublished' content=", "<").rpartition(">")[2] + ) - dt.timedelta(hours=9), } def _extract_images(self, image_id, page): @@ -177,7 
+178,7 @@ BASE_PATTERN = NijieExtractor.update({ class NijieUserExtractor(Dispatch, NijieExtractor): """Extractor for nijie user profiles""" - pattern = BASE_PATTERN + r"/members\.php\?id=(\d+)" + pattern = rf"{BASE_PATTERN}/members\.php\?id=(\d+)" example = "https://nijie.info/members.php?id=12345" def items(self): @@ -193,7 +194,7 @@ class NijieUserExtractor(Dispatch, NijieExtractor): class NijieIllustrationExtractor(NijieExtractor): """Extractor for all illustrations of a nijie-user""" subcategory = "illustration" - pattern = BASE_PATTERN + r"/members_illust\.php\?id=(\d+)" + pattern = rf"{BASE_PATTERN}/members_illust\.php\?id=(\d+)" example = "https://nijie.info/members_illust.php?id=12345" def image_ids(self): @@ -203,7 +204,7 @@ class NijieIllustrationExtractor(NijieExtractor): class NijieDoujinExtractor(NijieExtractor): """Extractor for doujin entries of a nijie user""" subcategory = "doujin" - pattern = BASE_PATTERN + r"/members_dojin\.php\?id=(\d+)" + pattern = rf"{BASE_PATTERN}/members_dojin\.php\?id=(\d+)" example = "https://nijie.info/members_dojin.php?id=12345" def image_ids(self): @@ -215,7 +216,7 @@ class NijieFavoriteExtractor(NijieExtractor): subcategory = "favorite" directory_fmt = ("{category}", "bookmarks", "{user_id}") archive_fmt = "f_{user_id}_{image_id}_{num}" - pattern = BASE_PATTERN + r"/user_like_illust_view\.php\?id=(\d+)" + pattern = rf"{BASE_PATTERN}/user_like_illust_view\.php\?id=(\d+)" example = "https://nijie.info/user_like_illust_view.php?id=12345" def image_ids(self): @@ -233,7 +234,7 @@ class NijieNuitaExtractor(NijieExtractor): subcategory = "nuita" directory_fmt = ("{category}", "nuita", "{user_id}") archive_fmt = "n_{user_id}_{image_id}_{num}" - pattern = BASE_PATTERN + r"/history_nuita\.php\?id=(\d+)" + pattern = rf"{BASE_PATTERN}/history_nuita\.php\?id=(\d+)" example = "https://nijie.info/history_nuita.php?id=12345" def image_ids(self): @@ -252,7 +253,7 @@ class NijieNuitaExtractor(NijieExtractor): class 
NijieFeedExtractor(NijieExtractor): """Extractor for nijie liked user feed""" subcategory = "feed" - pattern = BASE_PATTERN + r"/like_user_view\.php" + pattern = rf"{BASE_PATTERN}/like_user_view\.php" example = "https://nijie.info/like_user_view.php" def image_ids(self): @@ -265,7 +266,7 @@ class NijieFeedExtractor(NijieExtractor): class NijieFollowedExtractor(NijieExtractor): """Extractor for followed nijie users""" subcategory = "followed" - pattern = BASE_PATTERN + r"/like_my\.php" + pattern = rf"{BASE_PATTERN}/like_my\.php" example = "https://nijie.info/like_my.php" def items(self): @@ -291,7 +292,7 @@ class NijieFollowedExtractor(NijieExtractor): class NijieImageExtractor(NijieExtractor): """Extractor for a nijie work/image""" subcategory = "image" - pattern = BASE_PATTERN + r"/view(?:_popup)?\.php\?id=(\d+)" + pattern = rf"{BASE_PATTERN}/view(?:_popup)?\.php\?id=(\d+)" example = "https://nijie.info/view.php?id=12345" def image_ids(self): diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py index 69d8299..321883c 100644 --- a/gallery_dl/extractor/nitter.py +++ b/gallery_dl/extractor/nitter.py @@ -97,7 +97,7 @@ class NitterExtractor(BaseExtractor): files = () tweet["count"] = len(files) - yield Message.Directory, tweet + yield Message.Directory, "", tweet for tweet["num"], file in enumerate(files, 1): url = file["url"] file.update(tweet) @@ -114,7 +114,7 @@ class NitterExtractor(BaseExtractor): return { "author" : author, "user" : self.user_obj or author, - "date" : text.parse_datetime( + "date" : self.parse_datetime( extr('title="', '"'), "%b %d, %Y · %I:%M %p %Z"), "tweet_id": link.rpartition("/")[2].partition("#")[0], "content": extr('class="tweet-content', "</div").partition(">")[2], @@ -142,7 +142,7 @@ class NitterExtractor(BaseExtractor): return { "author" : author, "user" : self.user_obj or author, - "date" : text.parse_datetime( + "date" : self.parse_datetime( extr('title="', '"'), "%b %d, %Y · %I:%M %p %Z"), "tweet_id": 
link.rpartition("/")[2].partition("#")[0], "content" : extr('class="quote-text', "</div").partition(">")[2], @@ -173,7 +173,7 @@ class NitterExtractor(BaseExtractor): "nick" : extr('title="', '"'), "name" : extr('title="@', '"'), "description" : extr('<p dir="auto">', '<'), - "date" : text.parse_datetime( + "date" : self.parse_datetime( extr('class="profile-joindate"><span title="', '"'), "%I:%M %p - %d %b %Y"), "statuses_count" : text.parse_int(extr( @@ -229,12 +229,12 @@ class NitterExtractor(BaseExtractor): BASE_PATTERN = NitterExtractor.update({ }) -USER_PATTERN = BASE_PATTERN + r"/(i(?:/user/|d:)(\d+)|[^/?#]+)" +USER_PATTERN = rf"{BASE_PATTERN}/(i(?:/user/|d:)(\d+)|[^/?#]+)" class NitterTweetsExtractor(NitterExtractor): subcategory = "tweets" - pattern = USER_PATTERN + r"(?:/tweets)?(?:$|\?|#)" + pattern = rf"{USER_PATTERN}(?:/tweets)?(?:$|\?|#)" example = "https://nitter.net/USER" def tweets(self): @@ -243,7 +243,7 @@ class NitterTweetsExtractor(NitterExtractor): class NitterRepliesExtractor(NitterExtractor): subcategory = "replies" - pattern = USER_PATTERN + r"/with_replies" + pattern = rf"{USER_PATTERN}/with_replies" example = "https://nitter.net/USER/with_replies" def tweets(self): @@ -252,7 +252,7 @@ class NitterRepliesExtractor(NitterExtractor): class NitterMediaExtractor(NitterExtractor): subcategory = "media" - pattern = USER_PATTERN + r"/media" + pattern = rf"{USER_PATTERN}/media" example = "https://nitter.net/USER/media" def tweets(self): @@ -261,7 +261,7 @@ class NitterMediaExtractor(NitterExtractor): class NitterSearchExtractor(NitterExtractor): subcategory = "search" - pattern = USER_PATTERN + r"/search" + pattern = rf"{USER_PATTERN}/search" example = "https://nitter.net/USER/search" def tweets(self): @@ -274,7 +274,7 @@ class NitterTweetExtractor(NitterExtractor): directory_fmt = ("{category}", "{user[name]}") filename_fmt = "{tweet_id}_{num}.{extension}" archive_fmt = "{tweet_id}_{num}" - pattern = BASE_PATTERN + 
r"/(i/web|[^/?#]+)/status/(\d+())" + pattern = rf"{BASE_PATTERN}/(i/web|[^/?#]+)/status/(\d+())" example = "https://nitter.net/USER/status/12345" def tweets(self): diff --git a/gallery_dl/extractor/noop.py b/gallery_dl/extractor/noop.py index df2316c..fe88e63 100644 --- a/gallery_dl/extractor/noop.py +++ b/gallery_dl/extractor/noop.py @@ -8,7 +8,7 @@ """noop extractor""" -from .common import Extractor, Message +from .common import Extractor class NoopExtractor(Extractor): @@ -17,11 +17,9 @@ class NoopExtractor(Extractor): example = "noop" def items(self): - # yield *something* to prevent a 'No results' message - yield Message.Version, 1 - # Save cookies manually, since it happens automatically only after # extended extractor initialization, i.e. Message.Directory, which # itself might cause some unintended effects. if self.cookies: self.cookies_store() + return iter(((-1, "", None),)) diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py index 528aff2..fdd3594 100644 --- a/gallery_dl/extractor/nozomi.py +++ b/gallery_dl/extractor/nozomi.py @@ -9,7 +9,7 @@ """Extractors for https://nozomi.la/""" from .common import Extractor, Message -from .. import text +from .. 
import text, dt def decode_nozomi(n): @@ -49,10 +49,9 @@ class NozomiExtractor(Extractor): post["character"] = self._list(post.get("character")) try: - post["date"] = text.parse_datetime( - post["date"] + ":00", "%Y-%m-%d %H:%M:%S%z") + post["date"] = dt.parse_iso(post["date"] + ":00") except Exception: - post["date"] = None + post["date"] = dt.NONE post.update(data) @@ -61,7 +60,7 @@ class NozomiExtractor(Extractor): if key in post: del post[key] - yield Message.Directory, post + yield Message.Directory, "", post for post["num"], image in enumerate(images, 1): post["filename"] = post["dataid"] = did = image["dataid"] post["is_video"] = video = \ diff --git a/gallery_dl/extractor/nudostar.py b/gallery_dl/extractor/nudostar.py index 467d36a..2eb4340 100644 --- a/gallery_dl/extractor/nudostar.py +++ b/gallery_dl/extractor/nudostar.py @@ -21,7 +21,7 @@ class NudostarExtractor(GalleryExtractor): class NudostarModelExtractor(NudostarExtractor): """Extractor for NudoStar models""" subcategory = "model" - pattern = BASE_PATTERN + r"(/models/([^/?#]+)/?)$" + pattern = rf"{BASE_PATTERN}(/models/([^/?#]+)/?)$" example = "https://nudostar.tv/models/MODEL/" def metadata(self, page): @@ -53,7 +53,7 @@ class NudostarModelExtractor(NudostarExtractor): class NudostarImageExtractor(NudostarExtractor): """Extractor for NudoStar images""" subcategory = "image" - pattern = BASE_PATTERN + r"(/models/([^/?#]+)/(\d+)/)" + pattern = rf"{BASE_PATTERN}(/models/([^/?#]+)/(\d+)/)" example = "https://nudostar.tv/models/MODEL/123/" def items(self): @@ -67,5 +67,5 @@ class NudostarImageExtractor(NudostarExtractor): data["num"] = text.parse_int(self.groups[2]) data["url"] = img_url - yield Message.Directory, data + yield Message.Directory, "", data yield Message.Url, img_url, data diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index ff192c2..a0e3c9f 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -8,16 +8,14 @@ """Utility classes to 
setup OAuth and link accounts to gallery-dl""" -from .common import Extractor, Message +from .common import Extractor from .. import text, oauth, util, config, exception from ..output import stdout_write from ..cache import cache, memcache -import urllib.parse -import binascii -import hashlib REDIRECT_URI_LOCALHOST = "http://localhost:6414/" REDIRECT_URI_HTTPS = "https://mikf.github.io/gallery-dl/oauth-redirect.html" +NOOP = ((-1, "", None),) class OAuthBase(Extractor): @@ -86,7 +84,7 @@ class OAuthBase(Extractor): def open(self, url, params, recv=None): """Open 'url' in browser amd return response parameters""" - url += "?" + urllib.parse.urlencode(params) + url = f"{url}?{text.build_query(params)}" if browser := self.config("browser", True): try: @@ -257,16 +255,18 @@ class OAuthFlickr(OAuthBase): redirect_uri = REDIRECT_URI_HTTPS def items(self): - yield Message.Version, 1 - from . import flickr + # from . import flickr self._oauth1_authorization_flow( - flickr.FlickrAPI.API_KEY, - flickr.FlickrAPI.API_SECRET, + # flickr.FlickrAPI.API_KEY, + # flickr.FlickrAPI.API_SECRET, + "", + "", "https://www.flickr.com/services/oauth/request_token", "https://www.flickr.com/services/oauth/authorize", "https://www.flickr.com/services/oauth/access_token", ) + return iter(NOOP) class OAuthSmugmug(OAuthBase): @@ -275,7 +275,6 @@ class OAuthSmugmug(OAuthBase): example = "oauth:smugmug" def items(self): - yield Message.Version, 1 from . import smugmug self._oauth1_authorization_flow( @@ -285,6 +284,7 @@ class OAuthSmugmug(OAuthBase): "https://api.smugmug.com/services/oauth/1.0a/authorize", "https://api.smugmug.com/services/oauth/1.0a/getAccessToken", ) + return iter(NOOP) class OAuthTumblr(OAuthBase): @@ -293,7 +293,6 @@ class OAuthTumblr(OAuthBase): example = "oauth:tumblr" def items(self): - yield Message.Version, 1 from . 
import tumblr self._oauth1_authorization_flow( @@ -303,6 +302,7 @@ class OAuthTumblr(OAuthBase): "https://www.tumblr.com/oauth/authorize", "https://www.tumblr.com/oauth/access_token", ) + return iter(NOOP) # -------------------------------------------------------------------- @@ -315,7 +315,6 @@ class OAuthDeviantart(OAuthBase): redirect_uri = REDIRECT_URI_HTTPS def items(self): - yield Message.Version, 1 from . import deviantart self._oauth2_authorization_code_grant( @@ -328,6 +327,7 @@ class OAuthDeviantart(OAuthBase): scope="browse user.manage", cache=deviantart._refresh_token_cache, ) + return iter(NOOP) class OAuthReddit(OAuthBase): @@ -336,7 +336,6 @@ class OAuthReddit(OAuthBase): example = "oauth:reddit" def items(self): - yield Message.Version, 1 from . import reddit self.session.headers["User-Agent"] = reddit.RedditAPI.USER_AGENT @@ -350,6 +349,7 @@ class OAuthReddit(OAuthBase): scope="read history", cache=reddit._refresh_token_cache, ) + return iter(NOOP) class OAuthMastodon(OAuthBase): @@ -362,7 +362,6 @@ class OAuthMastodon(OAuthBase): self.instance = match[1] def items(self): - yield Message.Version, 1 from . import mastodon for _, root, application in mastodon.MastodonExtractor.instances: @@ -382,6 +381,7 @@ class OAuthMastodon(OAuthBase): key="access_token", cache=mastodon._access_token_cache, ) + return iter(NOOP) @cache(maxage=36500*86400, keyarg=1) def _register(self, instance): @@ -416,8 +416,9 @@ class OAuthPixiv(OAuthBase): example = "oauth:pixiv" def items(self): - yield Message.Version, 1 from . 
import pixiv + import binascii + import hashlib code_verifier = util.generate_token(32) digest = hashlib.sha256(code_verifier.encode()).digest() @@ -464,6 +465,7 @@ class OAuthPixiv(OAuthBase): self.log.info("Writing 'refresh-token' to cache") stdout_write(self._generate_message(("refresh-token",), (token,))) + return iter(NOOP) def _input_code(self): stdout_write("""\ diff --git a/gallery_dl/extractor/okporn.py b/gallery_dl/extractor/okporn.py new file mode 100644 index 0000000..e03f7cb --- /dev/null +++ b/gallery_dl/extractor/okporn.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://ok.porn/""" + +from .common import GalleryExtractor +from .. import text + + +class OkpornGalleryExtractor(GalleryExtractor): + """Extractor for image galleries from ok.porn""" + category = "okporn" + root = "https://ok.porn" + pattern = r"(?:https?://)?(?:www\.)?ok\.porn/albums/(\d+)" + example = "https://ok.porn/albums/12345/" + + def __init__(self, match): + url = f"{self.root}/albums/{match[1]}/" + GalleryExtractor.__init__(self, match, url) + + def metadata(self, page): + return { + "gallery_id" : text.parse_int(self.groups[0]), + "title" : text.unescape(text.extr( + page, "h1 class=title>", "</h1>")), + "description": text.unescape(text.extr( + page, 'name="description" content="', '"')), + "tags": text.extr( + page, 'name="keywords" content="', '"').split(", "), + } + + def images(self, page): + return [ + (url, None) + for url in text.extract_iter(page, 'data-original="', '"') + ] diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py index 490243a..d56331f 100644 --- a/gallery_dl/extractor/paheal.py +++ b/gallery_dl/extractor/paheal.py @@ -31,7 +31,7 @@ class PahealExtractor(Extractor): post["width"] = text.parse_int(post["width"]) post["height"] 
= text.parse_int(post["height"]) post.update(data) - yield Message.Directory, post + yield Message.Directory, "", post yield Message.Url, post["file_url"], post def get_metadata(self): @@ -53,8 +53,7 @@ class PahealExtractor(Extractor): extr("<source src='", "'")), "uploader": text.unquote(extr( "class='username' href='/user/", "'")), - "date" : text.parse_datetime( - extr("datetime='", "'"), "%Y-%m-%dT%H:%M:%S%z"), + "date" : self.parse_datetime_iso(extr("datetime='", "'")), "source" : text.unescape(text.extr( extr(">Source Link<", "</td>"), "href='", "'")), } @@ -133,7 +132,7 @@ class PahealTagExtractor(PahealExtractor): "duration" : text.parse_float(duration[:-1]), "tags" : text.unescape(tags), "size" : text.parse_bytes(size[:-1]), - "date" : text.parse_datetime(date, "%B %d, %Y; %H:%M"), + "date" : self.parse_datetime(date, "%B %d, %Y; %H:%M"), "filename" : f"{pid} - {tags}", "extension": ext, } diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index cf1a6d6..12dfd48 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -9,7 +9,7 @@ """Extractors for https://www.patreon.com/""" from .common import Extractor, Message -from .. import text, util, exception +from .. 
import text, util, dt, exception from ..cache import memcache import collections import itertools @@ -46,20 +46,21 @@ class PatreonExtractor(Extractor): for post in self.posts(): - yield Message.Directory, post + yield Message.Directory, "", post if not post.get("current_user_can_view", True): self.log.warning("Not allowed to view post %s", post["id"]) continue post["num"] = 0 hashes = set() - for kind, url, name in itertools.chain.from_iterable( + for kind, file, url, name in itertools.chain.from_iterable( g(post) for g in generators): fhash = self._filehash(url) if fhash not in hashes or not fhash: hashes.add(fhash) post["hash"] = fhash post["type"] = kind + post["file"] = file post["num"] += 1 text.nameext_from_url(name, post) if text.ext_from_url(url) == "m3u8": @@ -86,7 +87,7 @@ class PatreonExtractor(Extractor): name = url else: name = self._filename(url) or url - return (("postfile", url, name),) + return (("postfile", postfile, url, name),) return () def _images(self, post): @@ -94,7 +95,7 @@ class PatreonExtractor(Extractor): for image in images: if url := self._images_url(image): name = image.get("file_name") or self._filename(url) or url - yield "image", url, name + yield "image", image, url, name def _images_url(self, image): return image.get("download_url") @@ -109,24 +110,24 @@ class PatreonExtractor(Extractor): if image := post.get("image"): if url := image.get("large_url"): name = image.get("file_name") or self._filename(url) or url - return (("image_large", url, name),) + return (("image_large", image, url, name),) return () def _attachments(self, post): for attachment in post.get("attachments") or (): if url := self.request_location(attachment["url"], fatal=False): - yield "attachment", url, attachment["name"] + yield "attachment", attachment, url, attachment["name"] for attachment in post.get("attachments_media") or (): if url := attachment.get("download_url"): - yield "attachment", url, attachment["file_name"] + yield "attachment", attachment, 
url, attachment["file_name"] def _content(self, post): if content := post.get("content"): for img in text.extract_iter( content, '<img data-media-id="', '>'): if url := text.extr(img, 'src="', '"'): - yield "content", url, self._filename(url) or url + yield "content", None, url, self._filename(url) or url def posts(self): """Return all relevant post objects""" @@ -177,8 +178,7 @@ class PatreonExtractor(Extractor): post, included, "attachments") attr["attachments_media"] = self._files( post, included, "attachments_media") - attr["date"] = text.parse_datetime( - attr["published_at"], "%Y-%m-%dT%H:%M:%S.%f%z") + attr["date"] = self.parse_datetime_iso(attr["published_at"]) try: attr["campaign"] = (included["campaign"][ @@ -226,8 +226,7 @@ class PatreonExtractor(Extractor): user = response.json()["data"] attr = user["attributes"] attr["id"] = user["id"] - attr["date"] = text.parse_datetime( - attr["created"], "%Y-%m-%dT%H:%M:%S.%f%z") + attr["date"] = self.parse_datetime_iso(attr["created"]) return attr def _collection(self, collection_id): @@ -236,8 +235,7 @@ class PatreonExtractor(Extractor): coll = data["data"] attr = coll["attributes"] attr["id"] = coll["id"] - attr["date"] = text.parse_datetime( - attr["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z") + attr["date"] = self.parse_datetime_iso(attr["created_at"]) return attr def _filename(self, url): @@ -256,7 +254,7 @@ class PatreonExtractor(Extractor): return part return "" - def _build_url(self, endpoint, query): + def _build_url(self, endpoint, sort, query): return ( f"https://www.patreon.com/api/{endpoint}" @@ -291,11 +289,20 @@ class PatreonExtractor(Extractor): "preview_views,video_duration" f"&page[cursor]={self._init_cursor()}" - f"{query}" + f"{query}{self._order(sort)}" "&json-api-version=1.0" ) + def _order(self, sort): + if order := self.config("order-posts"): + if order in {"d", "desc"}: + order = "-published_at" + elif order in {"a", "asc", "r", "reverse"}: + order = "published_at" + return f"&sort={order}" + 
return f"&sort={sort}" if sort else "" + def _build_file_generators(self, filetypes): if filetypes is None: return (self._images, self._image_large, @@ -358,17 +365,26 @@ class PatreonCollectionExtractor(PatreonExtractor): campaign_id = text.extr( collection["thumbnail"]["url"], "/campaign/", "/") - url = self._build_url("posts", ( + url = self._build_url("posts", "collection_order", ( # patreon returns '400 Bad Request' without campaign_id filter f"&filter[campaign_id]={campaign_id}" "&filter[contains_exclusive_posts]=true" "&filter[is_draft]=false" f"&filter[collection_id]={collection_id}" "&filter[include_drops]=true" - "&sort=collection_order" )) return self._pagination(url) + def _order(self, sort): + if order := self.config("order-posts"): + if order in {"a", "asc"}: + order = "collection_order" + elif order in {"d", "desc", "r", "reverse"}: + # "-collection_order" results in a '400 Bad Request' error + order = "-published_at" + return f"&sort={order}" + return f"&sort={sort}" if sort else "" + class PatreonCreatorExtractor(PatreonExtractor): """Extractor for a creator's works""" @@ -387,12 +403,11 @@ class PatreonCreatorExtractor(PatreonExtractor): campaign_id = self._get_campaign_id(creator, params) self.log.debug("campaign_id: %s", campaign_id) - url = self._build_url("posts", ( + url = self._build_url("posts", params.get("sort", "-published_at"), ( f"&filter[campaign_id]={campaign_id}" "&filter[contains_exclusive_posts]=true" "&filter[is_draft]=false" f"{self._get_filters(params)}" - f"&sort={params.get('sort', '-published_at')}" )) return self._pagination(url) @@ -445,11 +460,10 @@ class PatreonUserExtractor(PatreonExtractor): def posts(self): if date_max := self._get_date_min_max(None, None)[1]: - self._cursor = cursor = \ - util.datetime_from_timestamp(date_max).isoformat() + self._cursor = cursor = dt.from_ts(date_max).isoformat() self._init_cursor = lambda: cursor - url = self._build_url("stream", ( + url = self._build_url("stream", None, ( 
"&filter[is_following]=true" "&json-api-use-default-includes=false" )) diff --git a/gallery_dl/extractor/pexels.py b/gallery_dl/extractor/pexels.py index f95d409..9e2f40c 100644 --- a/gallery_dl/extractor/pexels.py +++ b/gallery_dl/extractor/pexels.py @@ -35,8 +35,7 @@ class PexelsExtractor(Extractor): post["type"] = attr["type"] post.update(metadata) - post["date"] = text.parse_datetime( - post["created_at"][:-5], "%Y-%m-%dT%H:%M:%S") + post["date"] = self.parse_datetime_iso(post["created_at"][:-5]) if "image" in post: url, _, query = post["image"]["download_link"].partition("?") @@ -49,7 +48,7 @@ class PexelsExtractor(Extractor): self.log.warning("%s: Unsupported post type", post.get("id")) continue - yield Message.Directory, post + yield Message.Directory, "", post yield Message.Url, url, text.nameext_from_url(name, post) def posts(self): @@ -63,7 +62,7 @@ class PexelsCollectionExtractor(PexelsExtractor): """Extractor for a pexels.com collection""" subcategory = "collection" directory_fmt = ("{category}", "Collections", "{collection}") - pattern = BASE_PATTERN + r"/collections/((?:[^/?#]*-)?(\w+))" + pattern = rf"{BASE_PATTERN}/collections/((?:[^/?#]*-)?(\w+))" example = "https://www.pexels.com/collections/SLUG-a1b2c3/" def metadata(self): @@ -78,7 +77,7 @@ class PexelsSearchExtractor(PexelsExtractor): """Extractor for pexels.com search results""" subcategory = "search" directory_fmt = ("{category}", "Searches", "{search_tags}") - pattern = BASE_PATTERN + r"/search/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/search/([^/?#]+)" example = "https://www.pexels.com/search/QUERY/" def metadata(self): @@ -92,7 +91,7 @@ class PexelsUserExtractor(PexelsExtractor): """Extractor for pexels.com user galleries""" subcategory = "user" directory_fmt = ("{category}", "@{user[slug]}") - pattern = BASE_PATTERN + r"/(@(?:(?:[^/?#]*-)?(\d+)|[^/?#]+))" + pattern = rf"{BASE_PATTERN}/(@(?:(?:[^/?#]*-)?(\d+)|[^/?#]+))" example = "https://www.pexels.com/@USER-12345/" def posts(self): @@ 
-101,7 +100,7 @@ class PexelsUserExtractor(PexelsExtractor): class PexelsImageExtractor(PexelsExtractor): subcategory = "image" - pattern = BASE_PATTERN + r"/photo/((?:[^/?#]*-)?\d+)" + pattern = rf"{BASE_PATTERN}/photo/((?:[^/?#]*-)?\d+)" example = "https://www.pexels.com/photo/SLUG-12345/" def posts(self): diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py index 8891dc0..3634c66 100644 --- a/gallery_dl/extractor/philomena.py +++ b/gallery_dl/extractor/philomena.py @@ -36,8 +36,7 @@ class PhilomenaExtractor(BooruExtractor): return url def _prepare(self, post): - post["date"] = text.parse_datetime( - post["created_at"][:19], "%Y-%m-%dT%H:%M:%S") + post["date"] = self.parse_datetime_iso(post["created_at"][:19]) BASE_PATTERN = PhilomenaExtractor.update({ @@ -62,7 +61,7 @@ BASE_PATTERN = PhilomenaExtractor.update({ class PhilomenaPostExtractor(PhilomenaExtractor): """Extractor for single posts on a Philomena booru""" subcategory = "post" - pattern = BASE_PATTERN + r"/(?:images/)?(\d+)" + pattern = rf"{BASE_PATTERN}/(?:images/)?(\d+)" example = "https://derpibooru.org/images/12345" def posts(self): @@ -73,7 +72,7 @@ class PhilomenaSearchExtractor(PhilomenaExtractor): """Extractor for Philomena search results""" subcategory = "search" directory_fmt = ("{category}", "{search_tags}") - pattern = BASE_PATTERN + r"/(?:search/?\?([^#]+)|tags/([^/?#]+))" + pattern = rf"{BASE_PATTERN}/(?:search/?\?([^#]+)|tags/([^/?#]+))" example = "https://derpibooru.org/search?q=QUERY" def __init__(self, match): @@ -107,7 +106,7 @@ class PhilomenaGalleryExtractor(PhilomenaExtractor): subcategory = "gallery" directory_fmt = ("{category}", "galleries", "{gallery[id]} {gallery[title]}") - pattern = BASE_PATTERN + r"/galleries/(\d+)" + pattern = rf"{BASE_PATTERN}/galleries/(\d+)" example = "https://derpibooru.org/galleries/12345" def metadata(self): diff --git a/gallery_dl/extractor/photovogue.py b/gallery_dl/extractor/photovogue.py index e604304..cb16b23 100644 
--- a/gallery_dl/extractor/photovogue.py +++ b/gallery_dl/extractor/photovogue.py @@ -18,7 +18,7 @@ class PhotovogueUserExtractor(Extractor): directory_fmt = ("{category}", "{photographer[id]} {photographer[name]}") filename_fmt = "{id} {title}.{extension}" archive_fmt = "{id}" - pattern = BASE_PATTERN + r"/photographers/(\d+)" + pattern = rf"{BASE_PATTERN}/photographers/(\d+)" example = "https://www.vogue.com/photovogue/photographers/12345" def __init__(self, match): @@ -29,10 +29,9 @@ class PhotovogueUserExtractor(Extractor): for photo in self.photos(): url = photo["gallery_image"] photo["title"] = photo["title"].strip() - photo["date"] = text.parse_datetime( - photo["date"], "%Y-%m-%dT%H:%M:%S.%f%z") + photo["date"] = self.parse_datetime_iso(photo["date"]) - yield Message.Directory, photo + yield Message.Directory, "", photo yield Message.Url, url, text.nameext_from_url(url, photo) def photos(self): diff --git a/gallery_dl/extractor/picarto.py b/gallery_dl/extractor/picarto.py index 62ac38a..b0fa079 100644 --- a/gallery_dl/extractor/picarto.py +++ b/gallery_dl/extractor/picarto.py @@ -29,10 +29,9 @@ class PicartoGalleryExtractor(Extractor): def items(self): for post in self.posts(): - post["date"] = text.parse_datetime( - post["created_at"], "%Y-%m-%d %H:%M:%S") + post["date"] = self.parse_datetime_iso(post["created_at"]) variations = post.pop("variations", ()) - yield Message.Directory, post + yield Message.Directory, "", post image = post["default_image"] if not image: diff --git a/gallery_dl/extractor/picazor.py b/gallery_dl/extractor/picazor.py new file mode 100644 index 0000000..df1f436 --- /dev/null +++ b/gallery_dl/extractor/picazor.py @@ -0,0 +1,59 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +"""Extractors for https://picazor.com/""" + +from .common import Extractor, Message +from .. import text + + +class PicazorUserExtractor(Extractor): + """Extractor for picazor users""" + category = "picazor" + subcategory = "user" + root = "https://picazor.com" + browser = "firefox" + directory_fmt = ("{category}", "{user}") + filename_fmt = "{id}_{num:>03}.{extension}" + archive_fmt = "{id}_{num}" + pattern = r"(?:https?://)?(?:www\.)?picazor\.com/[a-z]{2}/([^/?#]+)" + example = "https://picazor.com/en/USERNAME" + + def items(self): + user = self.groups[0] + first = True + + url = f"{self.root}/api/files/{user}/sfiles" + params = {"page": 1} + headers = {"Referer": f"{self.root}/en/{user}"} + + while True: + data = self.request_json(url, params=params, headers=headers) + if not data: + break + + for item in data: + path = item.get("path") + if not path: + continue + + if first: + first = False + self.kwdict["user"] = user + self.kwdict["count"] = item.get("order") + yield Message.Directory, "", { + "subject": item.get("subject"), + "user" : user, + } + + item.pop("blurDataURL", None) + item["num"] = item["order"] + + file_url = self.root + path + text.nameext_from_url(file_url, item) + yield Message.Url, file_url, item + + params["page"] += 1 diff --git a/gallery_dl/extractor/pictoa.py b/gallery_dl/extractor/pictoa.py index da252f3..0dfe304 100644 --- a/gallery_dl/extractor/pictoa.py +++ b/gallery_dl/extractor/pictoa.py @@ -24,7 +24,7 @@ class PictoaExtractor(Extractor): class PictoaImageExtractor(PictoaExtractor): """Extractor for single images from pictoa.com""" subcategory = "image" - pattern = BASE_PATTERN + r"/albums/(?:[\w-]+-)?(\d+)/(\d+)" + pattern = rf"{BASE_PATTERN}/albums/(?:[\w-]+-)?(\d+)/(\d+)" example = "https://www.pictoa.com/albums/NAME-12345/12345.html" def items(self): @@ -43,14 +43,14 @@ class PictoaImageExtractor(PictoaExtractor): } text.nameext_from_url(image_url, data) - yield Message.Directory, data + yield Message.Directory, "", data 
yield Message.Url, image_url, data class PictoaAlbumExtractor(PictoaExtractor): """Extractor for image albums from pictoa.com""" subcategory = "album" - pattern = BASE_PATTERN + r"/albums/(?:[\w-]+-)?(\d+).html" + pattern = rf"{BASE_PATTERN}/albums/(?:[\w-]+-)?(\d+).html" example = "https://www.pictoa.com/albums/NAME-12345.html" def items(self): diff --git a/gallery_dl/extractor/piczel.py b/gallery_dl/extractor/piczel.py index 968776b..6661e7d 100644 --- a/gallery_dl/extractor/piczel.py +++ b/gallery_dl/extractor/piczel.py @@ -26,14 +26,13 @@ class PiczelExtractor(Extractor): def items(self): for post in self.posts(): post["tags"] = [t["title"] for t in post["tags"] if t["title"]] - post["date"] = text.parse_datetime( - post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z") + post["date"] = self.parse_datetime_iso(post["created_at"]) if post["multi"]: images = post["images"] del post["images"] post["count"] = len(images) - yield Message.Directory, post + yield Message.Directory, "", post for post["num"], image in enumerate(images): if "id" in image: del image["id"] @@ -43,7 +42,7 @@ class PiczelExtractor(Extractor): else: post["count"] = 1 - yield Message.Directory, post + yield Message.Directory, "", post post["num"] = 0 url = post["image"]["url"] yield Message.Url, url, text.nameext_from_url(url, post) @@ -67,7 +66,7 @@ class PiczelExtractor(Extractor): class PiczelUserExtractor(PiczelExtractor): """Extractor for all images from a user's gallery""" subcategory = "user" - pattern = BASE_PATTERN + r"/gallery/([^/?#]+)/?$" + pattern = rf"{BASE_PATTERN}/gallery/([^/?#]+)/?$" example = "https://piczel.tv/gallery/USER" def posts(self): @@ -80,7 +79,7 @@ class PiczelFolderExtractor(PiczelExtractor): subcategory = "folder" directory_fmt = ("{category}", "{user[username]}", "{folder[name]}") archive_fmt = "f{folder[id]}_{id}_{num}" - pattern = BASE_PATTERN + r"/gallery/(?!image/)[^/?#]+/(\d+)" + pattern = rf"{BASE_PATTERN}/gallery/(?!image/)[^/?#]+/(\d+)" example = 
"https://piczel.tv/gallery/USER/12345" def posts(self): @@ -91,7 +90,7 @@ class PiczelFolderExtractor(PiczelExtractor): class PiczelImageExtractor(PiczelExtractor): """Extractor for individual images""" subcategory = "image" - pattern = BASE_PATTERN + r"/gallery/image/(\d+)" + pattern = rf"{BASE_PATTERN}/gallery/image/(\d+)" example = "https://piczel.tv/gallery/image/12345" def posts(self): diff --git a/gallery_dl/extractor/pillowfort.py b/gallery_dl/extractor/pillowfort.py index 05bc8e7..0b750fe 100644 --- a/gallery_dl/extractor/pillowfort.py +++ b/gallery_dl/extractor/pillowfort.py @@ -10,7 +10,7 @@ from .common import Extractor, Message from ..cache import cache -from .. import text, util, exception +from .. import text, exception BASE_PATTERN = r"(?:https?://)?www\.pillowfort\.social" @@ -36,7 +36,7 @@ class PillowfortExtractor(Extractor): external = self.config("external", False) if inline: - inline = util.re(r'src="(https://img\d+\.pillowfort\.social' + inline = text.re(r'src="(https://img\d+\.pillowfort\.social' r'/posts/[^"]+)').findall for post in self.posts(): @@ -48,11 +48,10 @@ class PillowfortExtractor(Extractor): for url in inline(post["content"]): files.append({"url": url}) - post["date"] = text.parse_datetime( - post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z") + post["date"] = self.parse_datetime_iso(post["created_at"]) post["post_id"] = post.pop("id") post["count"] = len(files) - yield Message.Directory, post + yield Message.Directory, "", post post["num"] = 0 for file in files: @@ -76,8 +75,7 @@ class PillowfortExtractor(Extractor): if "id" not in file: post["id"] = post["hash"] if "created_at" in file: - post["date"] = text.parse_datetime( - file["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z") + post["date"] = self.parse_datetime_iso(file["created_at"]) yield msgtype, url, post @@ -121,7 +119,7 @@ class PillowfortExtractor(Extractor): class PillowfortPostExtractor(PillowfortExtractor): """Extractor for a single pillowfort post""" subcategory = "post" - 
pattern = BASE_PATTERN + r"/posts/(\d+)" + pattern = rf"{BASE_PATTERN}/posts/(\d+)" example = "https://www.pillowfort.social/posts/12345" def posts(self): @@ -132,7 +130,7 @@ class PillowfortPostExtractor(PillowfortExtractor): class PillowfortUserExtractor(PillowfortExtractor): """Extractor for all posts of a pillowfort user""" subcategory = "user" - pattern = BASE_PATTERN + r"/(?!posts/)([^/?#]+(?:/tagged/[^/?#]+)?)" + pattern = rf"{BASE_PATTERN}/(?!posts/)([^/?#]+(?:/tagged/[^/?#]+)?)" example = "https://www.pillowfort.social/USER" def posts(self): diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index ff771fb..7aa32ec 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -46,7 +46,7 @@ class PinterestExtractor(Extractor): try: files = self._extract_files(pin) except Exception as exc: - self.log.debug("", exc_info=exc) + self.log.traceback(exc) self.log.warning( "%s: Error when extracting download URLs (%s: %s)", pin.get("id"), exc.__class__.__name__, exc) @@ -63,7 +63,7 @@ class PinterestExtractor(Extractor): if value := pin.get(key): pin[key] = value.strip() - yield Message.Directory, pin + yield Message.Directory, "", pin for pin["num"], file in enumerate(files, 1): url = file["url"] text.nameext_from_url(url, pin) @@ -207,7 +207,7 @@ class PinterestExtractor(Extractor): class PinterestUserExtractor(PinterestExtractor): """Extractor for a user's boards""" subcategory = "user" - pattern = BASE_PATTERN + r"/(?!pin/)([^/?#]+)(?:/_saved)?/?$" + pattern = rf"{BASE_PATTERN}/(?!pin/)([^/?#]+)(?:/_saved)?/?$" example = "https://www.pinterest.com/USER/" def __init__(self, match): @@ -225,7 +225,7 @@ class PinterestAllpinsExtractor(PinterestExtractor): """Extractor for a user's 'All Pins' feed""" subcategory = "allpins" directory_fmt = ("{category}", "{user}") - pattern = BASE_PATTERN + r"/(?!pin/)([^/?#]+)/pins/?$" + pattern = rf"{BASE_PATTERN}/(?!pin/)([^/?#]+)/pins/?$" example = 
"https://www.pinterest.com/USER/pins/" def __init__(self, match): @@ -243,7 +243,7 @@ class PinterestCreatedExtractor(PinterestExtractor): """Extractor for a user's created pins""" subcategory = "created" directory_fmt = ("{category}", "{user}") - pattern = BASE_PATTERN + r"/(?!pin/)([^/?#]+)/_created/?$" + pattern = rf"{BASE_PATTERN}/(?!pin/)([^/?#]+)/_created/?$" example = "https://www.pinterest.com/USER/_created/" def __init__(self, match): @@ -263,7 +263,7 @@ class PinterestSectionExtractor(PinterestExtractor): directory_fmt = ("{category}", "{board[owner][username]}", "{board[name]}", "{section[title]}") archive_fmt = "{board[id]}_{id}" - pattern = BASE_PATTERN + r"/(?!pin/)([^/?#]+)/([^/?#]+)/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/(?!pin/)([^/?#]+)/([^/?#]+)/([^/?#]+)" example = "https://www.pinterest.com/USER/BOARD/SECTION" def __init__(self, match): @@ -291,7 +291,7 @@ class PinterestSearchExtractor(PinterestExtractor): """Extractor for Pinterest search results""" subcategory = "search" directory_fmt = ("{category}", "Search", "{search}") - pattern = BASE_PATTERN + r"/search/pins/?\?q=([^&#]+)" + pattern = rf"{BASE_PATTERN}/search/pins/?\?q=([^&#]+)" example = "https://www.pinterest.com/search/pins/?q=QUERY" def __init__(self, match): @@ -308,7 +308,7 @@ class PinterestSearchExtractor(PinterestExtractor): class PinterestPinExtractor(PinterestExtractor): """Extractor for images from a single pin from pinterest.com""" subcategory = "pin" - pattern = BASE_PATTERN + r"/pin/([^/?#]+)(?!.*#related$)" + pattern = rf"{BASE_PATTERN}/pin/([^/?#]+)(?!.*#related$)" example = "https://www.pinterest.com/pin/12345/" def __init__(self, match): @@ -329,7 +329,7 @@ class PinterestBoardExtractor(PinterestExtractor): subcategory = "board" directory_fmt = ("{category}", "{board[owner][username]}", "{board[name]}") archive_fmt = "{board[id]}_{id}" - pattern = (BASE_PATTERN + r"/(?!pin/)([^/?#]+)" + pattern = (rf"{BASE_PATTERN}/(?!pin/)([^/?#]+)" r"/([^/?#]+)/?(?!.*#related$)") 
example = "https://www.pinterest.com/USER/BOARD/" @@ -361,7 +361,7 @@ class PinterestRelatedPinExtractor(PinterestPinExtractor): """Extractor for related pins of another pin from pinterest.com""" subcategory = "related-pin" directory_fmt = ("{category}", "related {original_pin[id]}") - pattern = BASE_PATTERN + r"/pin/([^/?#]+).*#related$" + pattern = rf"{BASE_PATTERN}/pin/([^/?#]+).*#related$" example = "https://www.pinterest.com/pin/12345/#related" def metadata(self): @@ -376,7 +376,7 @@ class PinterestRelatedBoardExtractor(PinterestBoardExtractor): subcategory = "related-board" directory_fmt = ("{category}", "{board[owner][username]}", "{board[name]}", "related") - pattern = BASE_PATTERN + r"/(?!pin/)([^/?#]+)/([^/?#]+)/?#related$" + pattern = rf"{BASE_PATTERN}/(?!pin/)([^/?#]+)/([^/?#]+)/?#related$" example = "https://www.pinterest.com/USER/BOARD/#related" def pins(self): diff --git a/gallery_dl/extractor/pixeldrain.py b/gallery_dl/extractor/pixeldrain.py index 73f4b1f..1486976 100644 --- a/gallery_dl/extractor/pixeldrain.py +++ b/gallery_dl/extractor/pixeldrain.py @@ -24,16 +24,12 @@ class PixeldrainExtractor(Extractor): if api_key := self.config("api-key"): self.session.auth = util.HTTPBasicAuth("", api_key) - def parse_datetime(self, date_string): - return text.parse_datetime( - date_string, "%Y-%m-%dT%H:%M:%S.%fZ") - class PixeldrainFileExtractor(PixeldrainExtractor): """Extractor for pixeldrain files""" subcategory = "file" filename_fmt = "{filename[:230]} ({id}).{extension}" - pattern = BASE_PATTERN + r"/(?:u|api/file)/(\w+)" + pattern = rf"{BASE_PATTERN}/(?:u|api/file)/(\w+)" example = "https://pixeldrain.com/u/abcdefgh" def __init__(self, match): @@ -45,10 +41,10 @@ class PixeldrainFileExtractor(PixeldrainExtractor): file = self.request_json(url + "/info") file["url"] = url + "?download" - file["date"] = self.parse_datetime(file["date_upload"]) + file["date"] = self.parse_datetime_iso(file["date_upload"]) text.nameext_from_url(file["name"], file) - yield 
Message.Directory, file + yield Message.Directory, "", file yield Message.Url, file["url"], file @@ -58,7 +54,7 @@ class PixeldrainAlbumExtractor(PixeldrainExtractor): directory_fmt = ("{category}", "{album[date]:%Y-%m-%d} {album[title]} ({album[id]})") filename_fmt = "{num:>03} {filename[:230]} ({id}).{extension}" - pattern = BASE_PATTERN + r"/(?:l|api/list)/(\w+)(?:#item=(\d+))?" + pattern = rf"{BASE_PATTERN}/(?:l|api/list)/(\w+)(?:#item=(\d+))?" example = "https://pixeldrain.com/l/abcdefgh" def __init__(self, match): @@ -72,7 +68,7 @@ class PixeldrainAlbumExtractor(PixeldrainExtractor): files = album["files"] album["count"] = album["file_count"] - album["date"] = self.parse_datetime(album["date_created"]) + album["date"] = self.parse_datetime_iso(album["date_created"]) if self.file_index: idx = text.parse_int(self.file_index) @@ -86,12 +82,12 @@ class PixeldrainAlbumExtractor(PixeldrainExtractor): del album["files"] del album["file_count"] - yield Message.Directory, {"album": album} + yield Message.Directory, "", {"album": album} for num, file in enumerate(files, idx+1): file["album"] = album file["num"] = num file["url"] = url = f"{self.root}/api/file/{file['id']}?download" - file["date"] = self.parse_datetime(file["date_upload"]) + file["date"] = self.parse_datetime_iso(file["date_upload"]) text.nameext_from_url(file["name"], file) yield Message.Url, url, file @@ -101,7 +97,7 @@ class PixeldrainFolderExtractor(PixeldrainExtractor): subcategory = "folder" filename_fmt = "{filename[:230]}.{extension}" archive_fmt = "{path}_{num}" - pattern = BASE_PATTERN + r"/(?:d|api/filesystem)/([^?]+)" + pattern = rf"{BASE_PATTERN}/(?:d|api/filesystem)/([^?]+)" example = "https://pixeldrain.com/d/abcdefgh" def metadata(self, data): @@ -112,7 +108,7 @@ class PixeldrainFolderExtractor(PixeldrainExtractor): "mime_type" : data["file_type"], "size" : data["file_size"], "hash_sha256": data["sha256_sum"], - "date" : self.parse_datetime(data["created"]), + "date" : 
self.parse_datetime_iso(data["created"]), } def items(self): @@ -135,7 +131,7 @@ class PixeldrainFolderExtractor(PixeldrainExtractor): folder = self.metadata(path) folder["id"] = paths[0]["id"] - yield Message.Directory, folder + yield Message.Directory, "", folder num = 0 for child in children: diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 6276a2a..eb1a7f2 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -9,14 +9,13 @@ """Extractors for https://www.pixiv.net/""" from .common import Extractor, Message, Dispatch -from .. import text, util, exception +from .. import text, util, dt, exception from ..cache import cache, memcache -from datetime import datetime, timedelta import itertools import hashlib BASE_PATTERN = r"(?:https?://)?(?:www\.|touch\.)?ph?ixiv\.net" -USER_PATTERN = BASE_PATTERN + r"/(?:en/)?users/(\d+)" +USER_PATTERN = rf"{BASE_PATTERN}/(?:en/)?users/(\d+)" class PixivExtractor(Extractor): @@ -44,7 +43,7 @@ class PixivExtractor(Extractor): self.meta_captions = self.config("captions") if self.sanity_workaround or self.meta_captions: - self.meta_captions_sub = util.re( + self.meta_captions_sub = text.re( r'<a href="/jump\.php\?([^"]+)').sub def items(self): @@ -96,12 +95,12 @@ class PixivExtractor(Extractor): if transform_tags: transform_tags(work) work["num"] = 0 - work["date"] = text.parse_datetime(work["create_date"]) + work["date"] = dt.parse_iso(work["create_date"]) work["rating"] = ratings.get(work["x_restrict"]) work["suffix"] = "" work.update(metadata) - yield Message.Directory, work + yield Message.Directory, "", work for work["num"], file in enumerate(files): url = file["url"] work.update(file) @@ -149,7 +148,7 @@ class PixivExtractor(Extractor): self._extract_ajax(work, body) return self._extract_ugoira(work, url) except Exception as exc: - self.log.debug("", exc_info=exc) + self.log.traceback(exc) self.log.warning( "%s: Unable to extract Ugoira URL. 
Provide " "logged-in cookies to access it", work["id"]) @@ -238,10 +237,13 @@ class PixivExtractor(Extractor): return data["body"] self.log.debug("Server response: %s", util.json_dumps(data)) - return self.log.error( - "'%s'", data.get("message") or "General Error") + if (msg := data.get("message")) == "An unknown error occurred": + msg = "Invalid 'PHPSESSID' cookie" + else: + msg = f"'{msg or 'General Error'}'" + self.log.error("%s", msg) except Exception: - return None + pass def _extract_ajax(self, work, body): work["_ajax"] = True @@ -274,6 +276,9 @@ class PixivExtractor(Extractor): "profile_image_urls": {}, } + if "is_bookmarked" not in work: + work["is_bookmarked"] = True if body.get("bookmarkData") else False + work["tags"] = tags = [] for tag in body["tags"]["tags"]: name = tag["tag"] @@ -350,10 +355,10 @@ class PixivExtractor(Extractor): if fmt in urls: yield urls[fmt] - def _date_from_url(self, url, offset=timedelta(hours=9)): + def _date_from_url(self, url, offset=dt.timedelta(hours=9)): try: _, _, _, _, _, y, m, d, H, M, S, _ = url.split("/") - return datetime( + return dt.datetime( int(y), int(m), int(d), int(H), int(M), int(S)) - offset except Exception: return None @@ -388,7 +393,7 @@ class PixivExtractor(Extractor): class PixivUserExtractor(Dispatch, PixivExtractor): """Extractor for a pixiv user profile""" - pattern = (BASE_PATTERN + r"/(?:" + pattern = (rf"{BASE_PATTERN}/(?:" r"(?:en/)?u(?:sers)?/|member\.php\?id=|(?:mypage\.php)?#id=" r")(\d+)(?:$|[?#])") example = "https://www.pixiv.net/en/users/12345" @@ -411,7 +416,7 @@ class PixivUserExtractor(Dispatch, PixivExtractor): class PixivArtworksExtractor(PixivExtractor): """Extractor for artworks of a pixiv user""" subcategory = "artworks" - pattern = (BASE_PATTERN + r"/(?:" + pattern = (rf"{BASE_PATTERN}/(?:" r"(?:en/)?users/(\d+)/(?:artworks|illustrations|manga)" r"(?:/([^/?#]+))?/?(?:$|[?#])" r"|member_illust\.php\?id=(\d+)(?:&([^#]+))?)") @@ -450,7 +455,7 @@ class 
PixivArtworksExtractor(PixivExtractor): ajax_ids.extend(map(int, body["manga"])) ajax_ids.sort() except Exception as exc: - self.log.debug("", exc_info=exc) + self.log.traceback(exc) self.log.warning("u%s: Failed to collect artwork IDs " "using AJAX API", self.user_id) else: @@ -500,7 +505,7 @@ class PixivAvatarExtractor(PixivExtractor): subcategory = "avatar" filename_fmt = "avatar{date:?_//%Y-%m-%d}.{extension}" archive_fmt = "avatar_{user[id]}_{date}" - pattern = USER_PATTERN + r"/avatar" + pattern = rf"{USER_PATTERN}/avatar" example = "https://www.pixiv.net/en/users/12345/avatar" def _init(self): @@ -518,7 +523,7 @@ class PixivBackgroundExtractor(PixivExtractor): subcategory = "background" filename_fmt = "background{date:?_//%Y-%m-%d}.{extension}" archive_fmt = "background_{user[id]}_{date}" - pattern = USER_PATTERN + "/background" + pattern = rf"{USER_PATTERN}/background" example = "https://www.pixiv.net/en/users/12345/background" def _init(self): @@ -580,7 +585,7 @@ class PixivWorkExtractor(PixivExtractor): class PixivUnlistedExtractor(PixivExtractor): """Extractor for a unlisted pixiv illustrations""" subcategory = "unlisted" - pattern = BASE_PATTERN + r"/(?:en/)?artworks/unlisted/(\w+)" + pattern = rf"{BASE_PATTERN}/(?:en/)?artworks/unlisted/(\w+)" example = "https://www.pixiv.net/en/artworks/unlisted/a1b2c3d4e5f6g7h8i9j0" def _extract_files(self, work): @@ -599,7 +604,7 @@ class PixivFavoriteExtractor(PixivExtractor): directory_fmt = ("{category}", "bookmarks", "{user_bookmark[id]} {user_bookmark[account]}") archive_fmt = "f_{user_bookmark[id]}_{id}{num}.{extension}" - pattern = (BASE_PATTERN + r"/(?:(?:en/)?" + pattern = (rf"{BASE_PATTERN}/(?:(?:en/)?" r"users/(\d+)/(bookmarks/artworks|following)(?:/([^/?#]+))?" 
r"|bookmark\.php)(?:\?([^#]*))?") example = "https://www.pixiv.net/en/users/12345/bookmarks/artworks" @@ -662,7 +667,7 @@ class PixivRankingExtractor(PixivExtractor): archive_fmt = "r_{ranking[mode]}_{ranking[date]}_{id}{num}.{extension}" directory_fmt = ("{category}", "rankings", "{ranking[mode]}", "{ranking[date]}") - pattern = BASE_PATTERN + r"/ranking\.php(?:\?([^#]*))?" + pattern = rf"{BASE_PATTERN}/ranking\.php(?:\?([^#]*))?" example = "https://www.pixiv.net/ranking.php" def __init__(self, match): @@ -712,8 +717,7 @@ class PixivRankingExtractor(PixivExtractor): self.log.warning("invalid date '%s'", date) date = None if not date: - now = util.datetime_utcnow() - date = (now - timedelta(days=1)).strftime("%Y-%m-%d") + date = (dt.now() - dt.timedelta(days=1)).strftime("%Y-%m-%d") self.date = date self.type = type = query.get("content") @@ -732,7 +736,7 @@ class PixivSearchExtractor(PixivExtractor): subcategory = "search" archive_fmt = "s_{search[word]}_{id}{num}.{extension}" directory_fmt = ("{category}", "search", "{search[word]}") - pattern = (BASE_PATTERN + r"/(?:(?:en/)?tags/([^/?#]+)(?:/[^/?#]+)?/?" + pattern = (rf"{BASE_PATTERN}/(?:(?:en/)?tags/([^/?#]+)(?:/[^/?#]+)?/?" 
r"|search\.php)(?:\?([^#]+))?") example = "https://www.pixiv.net/en/tags/TAG" @@ -798,7 +802,7 @@ class PixivFollowExtractor(PixivExtractor): subcategory = "follow" archive_fmt = "F_{user_follow[id]}_{id}{num}.{extension}" directory_fmt = ("{category}", "following") - pattern = BASE_PATTERN + r"/bookmark_new_illust\.php" + pattern = rf"{BASE_PATTERN}/bookmark_new_illust\.php" example = "https://www.pixiv.net/bookmark_new_illust.php" def works(self): @@ -847,7 +851,7 @@ class PixivSeriesExtractor(PixivExtractor): directory_fmt = ("{category}", "{user[id]} {user[account]}", "{series[id]} {series[title]}") filename_fmt = "{num_series:>03}_{id}_p{num}.{extension}" - pattern = BASE_PATTERN + r"/user/(\d+)/series/(\d+)" + pattern = rf"{BASE_PATTERN}/user/(\d+)/series/(\d+)" example = "https://www.pixiv.net/user/12345/series/12345" def __init__(self, match): @@ -888,11 +892,10 @@ class PixivSketchExtractor(Extractor): for post in self.posts(): media = post["media"] post["post_id"] = post["id"] - post["date"] = text.parse_datetime( - post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z") + post["date"] = dt.parse_iso(post["created_at"]) util.delete_items(post, ("id", "media", "_links")) - yield Message.Directory, post + yield Message.Directory, "", post post["_http_headers"] = headers for photo in media: @@ -969,11 +972,11 @@ class PixivNovelExtractor(PixivExtractor): if transform_tags: transform_tags(novel) novel["num"] = 0 - novel["date"] = text.parse_datetime(novel["create_date"]) + novel["date"] = dt.parse_iso(novel["create_date"]) novel["rating"] = ratings.get(novel["x_restrict"]) novel["suffix"] = "" - yield Message.Directory, novel + yield Message.Directory, "", novel try: content = self.api.novel_webview(novel["id"])["text"] @@ -1039,7 +1042,7 @@ class PixivNovelExtractor(PixivExtractor): class PixivNovelNovelExtractor(PixivNovelExtractor): """Extractor for pixiv novels""" subcategory = "novel" - pattern = BASE_PATTERN + r"/n(?:ovel/show\.php\?id=|/)(\d+)" + pattern = 
rf"{BASE_PATTERN}/n(?:ovel/show\.php\?id=|/)(\d+)" example = "https://www.pixiv.net/novel/show.php?id=12345" def novels(self): @@ -1053,7 +1056,7 @@ class PixivNovelNovelExtractor(PixivNovelExtractor): class PixivNovelUserExtractor(PixivNovelExtractor): """Extractor for pixiv users' novels""" subcategory = "user" - pattern = USER_PATTERN + r"/novels" + pattern = rf"{USER_PATTERN}/novels" example = "https://www.pixiv.net/en/users/12345/novels" def novels(self): @@ -1063,7 +1066,7 @@ class PixivNovelUserExtractor(PixivNovelExtractor): class PixivNovelSeriesExtractor(PixivNovelExtractor): """Extractor for pixiv novel series""" subcategory = "series" - pattern = BASE_PATTERN + r"/novel/series/(\d+)" + pattern = rf"{BASE_PATTERN}/novel/series/(\d+)" example = "https://www.pixiv.net/novel/series/12345" def novels(self): @@ -1073,7 +1076,7 @@ class PixivNovelSeriesExtractor(PixivNovelExtractor): class PixivNovelBookmarkExtractor(PixivNovelExtractor): """Extractor for bookmarked pixiv novels""" subcategory = "bookmark" - pattern = (USER_PATTERN + r"/bookmarks/novels" + pattern = (rf"{USER_PATTERN}/bookmarks/novels" r"(?:/([^/?#]+))?(?:/?\?([^#]+))?") example = "https://www.pixiv.net/en/users/12345/bookmarks/novels" @@ -1151,7 +1154,7 @@ class PixivAppAPI(): "get_secure_url": "1", } - time = util.datetime_utcnow().strftime("%Y-%m-%dT%H:%M:%S+00:00") + time = dt.now().strftime("%Y-%m-%dT%H:%M:%S+00:00") headers = { "X-Client-Time": time, "X-Client-Hash": hashlib.md5( @@ -1326,11 +1329,11 @@ class PixivAppAPI(): sort = params["sort"] if sort == "date_desc": date_key = "end_date" - date_off = timedelta(days=1) + date_off = dt.timedelta(days=1) date_cmp = lambda lhs, rhs: lhs >= rhs # noqa E731 elif sort == "date_asc": date_key = "start_date" - date_off = timedelta(days=-1) + date_off = dt.timedelta(days=-1) date_cmp = lambda lhs, rhs: lhs <= rhs # noqa E731 else: date_key = None @@ -1357,8 +1360,8 @@ class PixivAppAPI(): if date_key and text.parse_int(params.get("offset")) >= 
5000: date_last = data["illusts"][-1]["create_date"] - date_val = (text.parse_datetime( - date_last) + date_off).strftime("%Y-%m-%d") + date_val = (dt.parse_iso(date_last) + date_off).strftime( + "%Y-%m-%d") self.log.info("Reached 'offset' >= 5000; " "Updating '%s' to '%s'", date_key, date_val) params[date_key] = date_val diff --git a/gallery_dl/extractor/pixnet.py b/gallery_dl/extractor/pixnet.py index 75c06bb..2feab95 100644 --- a/gallery_dl/extractor/pixnet.py +++ b/gallery_dl/extractor/pixnet.py @@ -65,7 +65,7 @@ class PixnetImageExtractor(PixnetExtractor): subcategory = "image" filename_fmt = "{id}.{extension}" directory_fmt = ("{category}", "{blog}") - pattern = BASE_PATTERN + r"/album/photo/(\d+)" + pattern = rf"{BASE_PATTERN}/album/photo/(\d+)" example = "https://USER.pixnet.net/album/photo/12345" def items(self): @@ -83,7 +83,7 @@ class PixnetImageExtractor(PixnetExtractor): data["blog"] = self.blog data["user"] = data.pop("author_name") - yield Message.Directory, data + yield Message.Directory, "", data yield Message.Url, data["url"], data @@ -92,7 +92,7 @@ class PixnetSetExtractor(PixnetExtractor): subcategory = "set" directory_fmt = ("{category}", "{blog}", "{folder_id} {folder_title}", "{set_id} {set_title}") - pattern = BASE_PATTERN + r"/album/set/(\d+)" + pattern = rf"{BASE_PATTERN}/album/set/(\d+)" example = "https://USER.pixnet.net/album/set/12345" def items(self): @@ -100,7 +100,7 @@ class PixnetSetExtractor(PixnetExtractor): page = self.request(url, encoding="utf-8").text data = self.metadata(page) - yield Message.Directory, data + yield Message.Directory, "", data for num, info in enumerate(self._pagination(page), 1): url, pos = text.extract(info, ' href="', '"') src, pos = text.extract(info, ' src="', '"', pos) @@ -137,7 +137,7 @@ class PixnetFolderExtractor(PixnetExtractor): """Extractor for all sets in a pixnet folder""" subcategory = "folder" url_fmt = "{}/album/folder/{}" - pattern = BASE_PATTERN + r"/album/folder/(\d+)" + pattern = 
rf"{BASE_PATTERN}/album/folder/(\d+)" example = "https://USER.pixnet.net/album/folder/12345" @@ -145,5 +145,5 @@ class PixnetUserExtractor(PixnetExtractor): """Extractor for all sets and folders of a pixnet user""" subcategory = "user" url_fmt = "{}{}/album/list" - pattern = BASE_PATTERN + r"()(?:/blog|/album(?:/list)?)?/?(?:$|[?#])" + pattern = rf"{BASE_PATTERN}()(?:/blog|/album(?:/list)?)?/?(?:$|[?#])" example = "https://USER.pixnet.net/" diff --git a/gallery_dl/extractor/plurk.py b/gallery_dl/extractor/plurk.py index 37b9b10..76ca59f 100644 --- a/gallery_dl/extractor/plurk.py +++ b/gallery_dl/extractor/plurk.py @@ -9,8 +9,7 @@ """Extractors for https://www.plurk.com/""" from .common import Extractor, Message -from .. import text, util, exception -import datetime +from .. import text, util, dt, exception class PlurkExtractor(Extractor): @@ -62,7 +61,7 @@ class PlurkExtractor(Extractor): if not data: raise exception.NotFoundError("user") return util.json_loads( - util.re(r"new Date\(([^)]+)\)").sub(r"\1", data)) + text.re(r"new Date\(([^)]+)\)").sub(r"\1", data)) class PlurkTimelineExtractor(PlurkExtractor): @@ -88,12 +87,10 @@ class PlurkTimelineExtractor(PlurkExtractor): while plurks: yield from plurks - offset = datetime.datetime.strptime( - plurks[-1]["posted"], "%a, %d %b %Y %H:%M:%S %Z") + offset = dt.parse(plurks[-1]["posted"], "%a, %d %b %Y %H:%M:%S %Z") data["offset"] = offset.strftime("%Y-%m-%dT%H:%M:%S.000Z") - response = self.request( - url, method="POST", headers=headers, data=data) - plurks = response.json()["plurks"] + plurks = self.request_json( + url, method="POST", headers=headers, data=data)["plurks"] class PlurkPostExtractor(PlurkExtractor): diff --git a/gallery_dl/extractor/poipiku.py b/gallery_dl/extractor/poipiku.py index 32ca528..c3aaaba 100644 --- a/gallery_dl/extractor/poipiku.py +++ b/gallery_dl/extractor/poipiku.py @@ -81,7 +81,7 @@ class PoipikuExtractor(Extractor): "PasswordIcon", ">"): post["password"] = True - yield 
Message.Directory, post + yield Message.Directory, "", post for post["num"], url in enumerate(extract_files( post, thumb, extr), 1): yield Message.Url, url, text.nameext_from_url(url, post) diff --git a/gallery_dl/extractor/poringa.py b/gallery_dl/extractor/poringa.py index da17eae..832bedf 100644 --- a/gallery_dl/extractor/poringa.py +++ b/gallery_dl/extractor/poringa.py @@ -68,7 +68,7 @@ class PoringaExtractor(Extractor): main_post, '<img class="imagen" border="0" src="', '"')) data["count"] = len(urls) - yield Message.Directory, data + yield Message.Directory, "", data for data["num"], url in enumerate(urls, 1): yield Message.Url, url, text.nameext_from_url(url, data) @@ -104,7 +104,7 @@ class PoringaExtractor(Extractor): class PoringaPostExtractor(PoringaExtractor): """Extractor for posts on poringa.net""" subcategory = "post" - pattern = BASE_PATTERN + r"/posts/imagenes/(\d+)" + pattern = rf"{BASE_PATTERN}/posts/imagenes/(\d+)" example = "http://www.poringa.net/posts/imagenes/12345/TITLE.html" def posts(self): @@ -113,7 +113,7 @@ class PoringaPostExtractor(PoringaExtractor): class PoringaUserExtractor(PoringaExtractor): subcategory = "user" - pattern = BASE_PATTERN + r"/(\w+)$" + pattern = rf"{BASE_PATTERN}/(\w+)$" example = "http://www.poringa.net/USER" def posts(self): @@ -124,7 +124,7 @@ class PoringaUserExtractor(PoringaExtractor): class PoringaSearchExtractor(PoringaExtractor): subcategory = "search" - pattern = BASE_PATTERN + r"/buscar/\?&?q=([^&#]+)" + pattern = rf"{BASE_PATTERN}/buscar/\?&?q=([^&#]+)" example = "http://www.poringa.net/buscar/?q=QUERY" def posts(self): diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py index 1211397..5ced315 100644 --- a/gallery_dl/extractor/pornhub.py +++ b/gallery_dl/extractor/pornhub.py @@ -54,7 +54,7 @@ class PornhubGalleryExtractor(PornhubExtractor): directory_fmt = ("{category}", "{user}", "{gallery[id]} {gallery[title]}") filename_fmt = "{num:>03}_{id}.{extension}" archive_fmt = "{id}" 
- pattern = BASE_PATTERN + r"/album/(\d+)" + pattern = rf"{BASE_PATTERN}/album/(\d+)" example = "https://www.pornhub.com/album/12345" def __init__(self, match): @@ -64,7 +64,7 @@ class PornhubGalleryExtractor(PornhubExtractor): def items(self): data = self.metadata() - yield Message.Directory, data + yield Message.Directory, "", data for num, img in enumerate(self.images(), 1): image = { @@ -134,7 +134,7 @@ class PornhubGifExtractor(PornhubExtractor): directory_fmt = ("{category}", "{user}", "gifs") filename_fmt = "{id} {title}.{extension}" archive_fmt = "{id}" - pattern = BASE_PATTERN + r"/gif/(\d+)" + pattern = rf"{BASE_PATTERN}/gif/(\d+)" example = "https://www.pornhub.com/gif/12345" def __init__(self, match): @@ -150,21 +150,20 @@ class PornhubGifExtractor(PornhubExtractor): "tags" : extr("data-context-tag='", "'").split(","), "title": extr('"name": "', '"'), "url" : extr('"contentUrl": "', '"'), - "date" : text.parse_datetime( - extr('"uploadDate": "', '"'), "%Y-%m-%d"), + "date" : self.parse_datetime_iso(extr('"uploadDate": "', '"')), "viewkey" : extr('From this video: ' '<a href="/view_video.php?viewkey=', '"'), "timestamp": extr('lass="directLink tstamp" rel="nofollow">', '<'), "user" : text.remove_html(extr("Created by:", "</div>")), } - yield Message.Directory, gif + yield Message.Directory, "", gif yield Message.Url, gif["url"], text.nameext_from_url(gif["url"], gif) class PornhubUserExtractor(Dispatch, PornhubExtractor): """Extractor for a pornhub user""" - pattern = BASE_PATTERN + r"/((?:users|model|pornstar)/[^/?#]+)/?$" + pattern = rf"{BASE_PATTERN}/((?:users|model|pornstar)/[^/?#]+)/?$" example = "https://www.pornhub.com/model/USER" def items(self): @@ -178,7 +177,7 @@ class PornhubUserExtractor(Dispatch, PornhubExtractor): class PornhubPhotosExtractor(PornhubExtractor): """Extractor for all galleries of a pornhub user""" subcategory = "photos" - pattern = (BASE_PATTERN + r"/((?:users|model|pornstar)/[^/?#]+)" + pattern = 
(rf"{BASE_PATTERN}/((?:users|model|pornstar)/[^/?#]+)" "/(photos(?:/[^/?#]+)?)") example = "https://www.pornhub.com/model/USER/photos" @@ -199,7 +198,7 @@ class PornhubPhotosExtractor(PornhubExtractor): class PornhubGifsExtractor(PornhubExtractor): """Extractor for a pornhub user's gifs""" subcategory = "gifs" - pattern = (BASE_PATTERN + r"/((?:users|model|pornstar)/[^/?#]+)" + pattern = (rf"{BASE_PATTERN}/((?:users|model|pornstar)/[^/?#]+)" "/(gifs(?:/[^/?#]+)?)") example = "https://www.pornhub.com/model/USER/gifs" diff --git a/gallery_dl/extractor/pornpics.py b/gallery_dl/extractor/pornpics.py index 34a0111..9c926e8 100644 --- a/gallery_dl/extractor/pornpics.py +++ b/gallery_dl/extractor/pornpics.py @@ -58,7 +58,7 @@ class PornpicsExtractor(Extractor): class PornpicsGalleryExtractor(PornpicsExtractor, GalleryExtractor): """Extractor for pornpics galleries""" - pattern = BASE_PATTERN + r"/galleries/((?:[^/?#]+-)?(\d+))" + pattern = rf"{BASE_PATTERN}/galleries/((?:[^/?#]+-)?(\d+))" example = "https://www.pornpics.com/galleries/TITLE-12345/" def __init__(self, match): @@ -94,7 +94,7 @@ class PornpicsGalleryExtractor(PornpicsExtractor, GalleryExtractor): class PornpicsTagExtractor(PornpicsExtractor): """Extractor for galleries from pornpics tag searches""" subcategory = "tag" - pattern = BASE_PATTERN + r"/tags/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/tags/([^/?#]+)" example = "https://www.pornpics.com/tags/TAGS/" def galleries(self): @@ -105,7 +105,7 @@ class PornpicsTagExtractor(PornpicsExtractor): class PornpicsSearchExtractor(PornpicsExtractor): """Extractor for galleries from pornpics search results""" subcategory = "search" - pattern = BASE_PATTERN + r"/(?:\?q=|pornstars/|channels/)([^/&#]+)" + pattern = rf"{BASE_PATTERN}/(?:\?q=|pornstars/|channels/)([^/&#]+)" example = "https://www.pornpics.com/?q=QUERY" def galleries(self): @@ -116,3 +116,35 @@ class PornpicsSearchExtractor(PornpicsExtractor): "offset": 0, } return self._pagination(url, params) + + +class 
PornpicsListingExtractor(PornpicsExtractor): + """Extractor for galleries from pornpics listing pages + + These pages (popular, recent, etc.) don't support JSON pagination + and use single quotes in HTML, unlike category pages. + """ + subcategory = "listing" + pattern = (rf"{BASE_PATTERN}" + rf"/(popular|recent|rating|likes|views|comments)/?$") + example = "https://www.pornpics.com/popular/" + + def galleries(self): + url = f"{self.root}/{self.groups[0]}/" + page = self.request(url).text + return [ + {"g_url": href} + for href in text.extract_iter( + page, "class='rel-link' href='", "'") + ] + + +class PornpicsCategoryExtractor(PornpicsExtractor): + """Extractor for galleries from pornpics categories""" + subcategory = "category" + pattern = rf"{BASE_PATTERN}/([^/?#]+)/?$" + example = "https://www.pornpics.com/ass/" + + def galleries(self): + url = f"{self.root}/{self.groups[0]}/" + return self._pagination(url) diff --git a/gallery_dl/extractor/pornstarstube.py b/gallery_dl/extractor/pornstarstube.py new file mode 100644 index 0000000..82519a0 --- /dev/null +++ b/gallery_dl/extractor/pornstarstube.py @@ -0,0 +1,43 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://pornstars.tube/""" + +from .common import GalleryExtractor +from .. 
import text + + +class PornstarstubeGalleryExtractor(GalleryExtractor): + """Extractor for image galleries from pornstars.tube""" + category = "pornstarstube" + root = "https://pornstars.tube" + pattern = (r"(?:https?://)?(?:www\.)?pornstars\.tube" + r"/albums/(\d+)(?:/([\w-]+))?") + example = "https://pornstars.tube/albums/12345/SLUG/" + + def __init__(self, match): + url = f"{self.root}/albums/{match[1]}/{match[2] or 'a'}/" + GalleryExtractor.__init__(self, match, url) + + def metadata(self, page): + gid, slug = self.groups + return { + "gallery_id": text.parse_int(gid), + "slug" : slug or "", + "title" : text.unescape(text.extr( + page, "<title>", " - PORNSTARS.TUBE</title>")), + "description": text.unescape(text.extr( + page, 'name="description" content="', '"')), + "tags": text.extr( + page, 'name="keywords" content="', '"').split(", "), + } + + def images(self, page): + album = text.extr(page, 'class="block-album"', "\n</div>") + return [ + (url, None) + for url in text.extract_iter(album, ' href="', '"') + ] diff --git a/gallery_dl/extractor/postmill.py b/gallery_dl/extractor/postmill.py index af971ab..e71246a 100644 --- a/gallery_dl/extractor/postmill.py +++ b/gallery_dl/extractor/postmill.py @@ -7,7 +7,7 @@ """Extractors for Postmill instances""" from .common import BaseExtractor, Message -from .. import text, util, exception +from .. 
import text, exception class PostmillExtractor(BaseExtractor): @@ -20,8 +20,8 @@ class PostmillExtractor(BaseExtractor): def _init(self): self.instance = self.root.partition("://")[2] self.save_link_post_body = self.config("save-link-post-body", False) - self._search_canonical_url = util.re(r"/f/([\w\d_]+)/(\d+)/").search - self._search_image_tag = util.re( + self._search_canonical_url = text.re(r"/f/([\w\d_]+)/(\d+)/").search + self._search_image_tag = text.re( r'<a href="[^"]+"\n +class="submission__image-link"').search def items(self): @@ -31,7 +31,7 @@ class PostmillExtractor(BaseExtractor): title = text.unescape(extr( '<meta property="og:title" content="', '">')) - date = text.parse_datetime(extr( + date = self.parse_datetime_iso(extr( '<meta property="og:article:published_time" content="', '">')) username = extr( '<meta property="og:article:author" content="', '">') @@ -72,7 +72,7 @@ class PostmillExtractor(BaseExtractor): urls.append((Message.Queue, url)) data["count"] = len(urls) - yield Message.Directory, data + yield Message.Directory, "", data for data["num"], (msg, url) in enumerate(urls, 1): if url.startswith("text:"): data["filename"], data["extension"] = "", "htm" @@ -130,14 +130,14 @@ BASE_PATTERN = PostmillExtractor.update({ } }) QUERY_RE = r"(?:\?([^#]+))?$" -SORTING_RE = r"(/(?:hot|new|active|top|controversial|most_commented))?" + \ - QUERY_RE +SORTING_RE = (rf"(/(?:hot|new|active|top|controversial|most_commented))?" 
+ rf"{QUERY_RE}") class PostmillPostExtractor(PostmillExtractor): """Extractor for a single submission URL""" subcategory = "post" - pattern = BASE_PATTERN + r"/f/(\w+)/(\d+)" + pattern = rf"{BASE_PATTERN}/f/(\w+)/(\d+)" example = "https://raddle.me/f/FORUM/123/TITLE" def __init__(self, match): @@ -152,7 +152,7 @@ class PostmillPostExtractor(PostmillExtractor): class PostmillShortURLExtractor(PostmillExtractor): """Extractor for short submission URLs""" subcategory = "shorturl" - pattern = BASE_PATTERN + r"(/\d+)$" + pattern = rf"{BASE_PATTERN}(/\d+)$" example = "https://raddle.me/123" def items(self): @@ -165,34 +165,34 @@ class PostmillShortURLExtractor(PostmillExtractor): class PostmillHomeExtractor(PostmillSubmissionsExtractor): """Extractor for the home page""" subcategory = "home" - pattern = BASE_PATTERN + r"(/(?:featured|subscribed|all)?)" + SORTING_RE + pattern = rf"{BASE_PATTERN}(/(?:featured|subscribed|all)?){SORTING_RE}" example = "https://raddle.me/" class PostmillForumExtractor(PostmillSubmissionsExtractor): """Extractor for submissions on a forum""" subcategory = "forum" - pattern = BASE_PATTERN + r"(/f/\w+)" + SORTING_RE + pattern = rf"{BASE_PATTERN}(/f/\w+){SORTING_RE}" example = "https://raddle.me/f/FORUM" class PostmillUserSubmissionsExtractor(PostmillSubmissionsExtractor): """Extractor for submissions made by a user""" subcategory = "usersubmissions" - pattern = BASE_PATTERN + r"(/user/\w+/submissions)()" + QUERY_RE + pattern = rf"{BASE_PATTERN}(/user/\w+/submissions)(){QUERY_RE}" example = "https://raddle.me/user/USER/submissions" class PostmillTagExtractor(PostmillSubmissionsExtractor): """Extractor for submissions on a forum with a specific tag""" subcategory = "tag" - pattern = BASE_PATTERN + r"(/tag/\w+)" + SORTING_RE + pattern = rf"{BASE_PATTERN}(/tag/\w+){SORTING_RE}" example = "https://raddle.me/tag/TAG" class PostmillSearchExtractor(PostmillSubmissionsExtractor): """Extractor for search results""" subcategory = "search" - pattern = 
BASE_PATTERN + r"(/search)()\?(q=[^#]+)$" + pattern = rf"{BASE_PATTERN}(/search)()\?(q=[^#]+)$" example = "https://raddle.me/search?q=QUERY" whitelisted_parameters = ("q",) diff --git a/gallery_dl/extractor/rawkuma.py b/gallery_dl/extractor/rawkuma.py index 242486d..a4a0c9b 100644 --- a/gallery_dl/extractor/rawkuma.py +++ b/gallery_dl/extractor/rawkuma.py @@ -7,7 +7,7 @@ """Extractors for https://rawkuma.net/""" from .common import MangaExtractor, ChapterExtractor -from .. import text, util +from .. import text BASE_PATTERN = r"(?:https?://)?rawkuma\.(?:net|com)" @@ -21,43 +21,40 @@ class RawkumaBase(): class RawkumaChapterExtractor(RawkumaBase, ChapterExtractor): """Extractor for manga chapters from rawkuma.net""" archive_fmt = "{chapter_id}_{page}" - pattern = BASE_PATTERN + r"/([^/?#]+-chapter-\d+(?:-\d+)?)" - example = "https://rawkuma.net/TITLE-chapter-123/" + pattern = rf"{BASE_PATTERN}(/manga/[^/?#]+/chapter-\d+(?:.\d+)?\.(\d+))" + example = "https://rawkuma.net/manga/7TITLE/chapter-123.321" def __init__(self, match): url = f"{self.root}/{match[1]}/" ChapterExtractor.__init__(self, match, url) def metadata(self, page): - item = util.json_loads(text.extr(page, ',"item":', "}};")) - title = text.rextr( - page, '<h1 class="entry-title', "</h1>").partition(" – ")[2] - date = text.extr(page, 'datetime="', '"') - chapter, sep, minor = item["c"].partition(".") + manga, _, chapter = text.extr( + page, '<title>', "<").rpartition(" Chapter ") + chapter, sep, minor = chapter.partition(" – ")[0].partition(".") return { - "manga" : item["s"], - "manga_id" : text.parse_int(item["mid"]), + "manga" : text.unescape(manga), + "manga_id" : text.parse_int(text.extr(page, "manga_id=", "&")), "chapter" : text.parse_int(chapter), "chapter_minor": sep + minor, - "chapter_id" : text.parse_int(item["cid"]), - "title" : text.unescape(title), - "date" : text.parse_datetime( - date, "%Y-%m-%dWIB%H:%M:%S%z"), - "thumbnail" : item.get("t"), + "chapter_id" : 
text.parse_int(self.groups[-1]), + # "title" : text.unescape(title), + "date" : self.parse_datetime_iso(text.extr( + page, 'datetime="', '"')), "lang" : "ja", "language" : "Japanese", } def images(self, page): - images = util.json_loads(text.extr(page, '","images":', '}')) - return [(url, None) for url in images] + return [(url, None) for url in text.extract_iter( + page, "<img src='", "'")] class RawkumaMangaExtractor(RawkumaBase, MangaExtractor): """Extractor for manga from rawkuma.net""" chapterclass = RawkumaChapterExtractor - pattern = BASE_PATTERN + r"/manga/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/manga/([^/?#]+)" example = "https://rawkuma.net/manga/TITLE/" def __init__(self, match): @@ -66,18 +63,36 @@ class RawkumaMangaExtractor(RawkumaBase, MangaExtractor): def chapters(self, page): manga = text.unescape(text.extr(page, "<title>", " – ")) + manga_id = text.parse_int(text.extr(page, "manga_id=", "&")) + + url = f"{self.root}/wp-admin/admin-ajax.php" + params = { + "manga_id": manga_id, + "page" : "1", + "action" : "chapter_list", + } + headers = { + "HX-Request" : "true", + "HX-Trigger" : "chapter-list", + "HX-Target" : "chapter-list", + "HX-Current-URL": self.page_url, + "Sec-Fetch-Dest": "empty", + "Sec-Fetch-Mode": "cors", + "Sec-Fetch-Site": "same-origin", + } + html = self.request(url, params=params, headers=headers).text results = [] - for chbox in text.extract_iter( - page, '<li data-num="', "</a>"): - info = text.extr(chbox, '', '"') - chapter, _, title = info.partition(" - ") + for url in text.extract_iter(html, '<a href="', '"'): + info = url[url.rfind("-")+1:-1] + chapter, _, chapter_id = info.rpartition(".") chapter, sep, minor = chapter.partition(".") - results.append((text.extr(chbox, 'href="', '"'), { + results.append((url, { "manga" : manga, + "manga_id" : manga_id, "chapter" : text.parse_int(chapter), "chapter-minor": sep + minor, - "title" : title, + "chapter_id" : text.parse_int(chapter_id), })) return results diff --git 
a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py index 483a5ba..8e974d2 100644 --- a/gallery_dl/extractor/reactor.py +++ b/gallery_dl/extractor/reactor.py @@ -38,7 +38,7 @@ class ReactorExtractor(BaseExtractor): def items(self): data = self.metadata() - yield Message.Directory, data + yield Message.Directory, "", data for post in self.posts(): for image in self._parse_post(post): url = image["url"] @@ -97,7 +97,7 @@ class ReactorExtractor(BaseExtractor): return num = 0 - date = text.parse_datetime(data["datePublished"]) + date = self.parse_datetime_iso(data["datePublished"]) user = data["author"]["name"] description = text.unescape(data["description"]) title, _, tags = text.unescape(data["headline"]).partition(" / ") @@ -171,7 +171,7 @@ class ReactorTagExtractor(ReactorExtractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "{search_tags}_{post_id}_{num}" - pattern = BASE_PATTERN + r"/tag/([^/?#]+)(?:/[^/?#]+)?" + pattern = rf"{BASE_PATTERN}/tag/([^/?#]+)(?:/[^/?#]+)?" 
example = "http://reactor.cc/tag/TAG" def __init__(self, match): @@ -187,7 +187,7 @@ class ReactorSearchExtractor(ReactorExtractor): subcategory = "search" directory_fmt = ("{category}", "search", "{search_tags}") archive_fmt = "s_{search_tags}_{post_id}_{num}" - pattern = BASE_PATTERN + r"/search(?:/|\?q=)([^/?#]+)" + pattern = rf"{BASE_PATTERN}/search(?:/|\?q=)([^/?#]+)" example = "http://reactor.cc/search?q=QUERY" def __init__(self, match): @@ -202,7 +202,7 @@ class ReactorUserExtractor(ReactorExtractor): """Extractor for all posts of a user on *reactor.cc sites""" subcategory = "user" directory_fmt = ("{category}", "user", "{user}") - pattern = BASE_PATTERN + r"/user/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/user/([^/?#]+)" example = "http://reactor.cc/user/USER" def __init__(self, match): @@ -216,7 +216,7 @@ class ReactorUserExtractor(ReactorExtractor): class ReactorPostExtractor(ReactorExtractor): """Extractor for single posts on *reactor.cc sites""" subcategory = "post" - pattern = BASE_PATTERN + r"/post/(\d+)" + pattern = rf"{BASE_PATTERN}/post/(\d+)" example = "http://reactor.cc/post/12345" def __init__(self, match): @@ -228,6 +228,6 @@ class ReactorPostExtractor(ReactorExtractor): pos = post.find('class="uhead">') for image in self._parse_post(post[pos:]): if image["num"] == 1: - yield Message.Directory, image + yield Message.Directory, "", image url = image["url"] yield Message.Url, url, text.nameext_from_url(url, image) diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py index 24a0171..dccf91d 100644 --- a/gallery_dl/extractor/readcomiconline.py +++ b/gallery_dl/extractor/readcomiconline.py @@ -44,7 +44,7 @@ class ReadcomiconlineBase(): class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor): """Extractor for comic-issues from readcomiconline.li""" subcategory = "issue" - pattern = BASE_PATTERN + r"(/Comic/[^/?#]+/[^/?#]+\?)([^#]+)" + pattern = 
rf"{BASE_PATTERN}(/Comic/[^/?#]+/[^/?#]+\?)([^#]+)" example = "https://readcomiconline.li/Comic/TITLE/Issue-123?id=12345" def _init(self): @@ -98,7 +98,7 @@ class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor): """Extractor for comics from readcomiconline.li""" chapterclass = ReadcomiconlineIssueExtractor subcategory = "comic" - pattern = BASE_PATTERN + r"(/Comic/[^/?#]+/?)$" + pattern = rf"{BASE_PATTERN}(/Comic/[^/?#]+/?)$" example = "https://readcomiconline.li/Comic/TITLE" def chapters(self, page): diff --git a/gallery_dl/extractor/realbooru.py b/gallery_dl/extractor/realbooru.py index cf45578..7f731f8 100644 --- a/gallery_dl/extractor/realbooru.py +++ b/gallery_dl/extractor/realbooru.py @@ -28,18 +28,31 @@ class RealbooruExtractor(booru.BooruExtractor): extr('class="container"', '>') post = { - "_html" : page, "id" : post_id, "rating" : "e" if rating == "adult" else (rating or "?")[0], - "tags" : text.unescape(extr(' alt="', '"')), - "file_url" : extr('src="', '"'), + "file_url" : (s := extr('src="', '"')), + "_fallback" : (extr('src="', '"'),) if s.endswith(".mp4") else (), "created_at": extr(">Posted at ", " by "), "uploader" : extr(">", "<"), "score" : extr('">', "<"), + "tags" : extr('<br />', "</div>"), "title" : extr('id="title" style="width: 100%;" value="', '"'), "source" : extr('d="source" style="width: 100%;" value="', '"'), } + tags_container = post["tags"] + tags = [] + tags_categories = collections.defaultdict(list) + pattern = text.re(r'<a class="(?:tag-type-)?([^"]+).*?;tags=([^"&]+)') + for tag_type, tag_name in pattern.findall(tags_container): + tag = text.unescape(text.unquote(tag_name)) + tags.append(tag) + tags_categories[tag_type].append(tag) + for key, value in tags_categories.items(): + post[f"tags_{key}"] = ", ".join(value) + tags.sort() + + post["tags"] = ", ".join(tags) post["md5"] = post["file_url"].rpartition("/")[2].partition(".")[0] return post @@ -48,7 +61,7 @@ class RealbooruExtractor(booru.BooruExtractor): 
return num def _prepare(self, post): - post["date"] = text.parse_datetime(post["created_at"], "%b, %d %Y") + post["date"] = self.parse_datetime(post["created_at"], "%b, %d %Y") def _pagination(self, params, begin, end): url = self.root + "/index.php" @@ -66,23 +79,13 @@ class RealbooruExtractor(booru.BooruExtractor): return params["pid"] += self.per_page - def _tags(self, post, _): - page = post["_html"] - tag_container = text.extr(page, 'id="tagLink"', '</div>') - tags = collections.defaultdict(list) - pattern = util.re(r'<a class="(?:tag-type-)?([^"]+).*?;tags=([^"&]+)') - for tag_type, tag_name in pattern.findall(tag_container): - tags[tag_type].append(text.unescape(text.unquote(tag_name))) - for key, value in tags.items(): - post["tags_" + key] = " ".join(value) - class RealbooruTagExtractor(RealbooruExtractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{id}" per_page = 42 - pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]*)" + pattern = rf"{BASE_PATTERN}/index\.php\?page=post&s=list&tags=([^&#]*)" example = "https://realbooru.com/index.php?page=post&s=list&tags=TAG" def metadata(self): @@ -102,7 +105,7 @@ class RealbooruFavoriteExtractor(RealbooruExtractor): directory_fmt = ("{category}", "favorites", "{favorite_id}") archive_fmt = "f_{favorite_id}_{id}" per_page = 50 - pattern = BASE_PATTERN + r"/index\.php\?page=favorites&s=view&id=(\d+)" + pattern = rf"{BASE_PATTERN}/index\.php\?page=favorites&s=view&id=(\d+)" example = "https://realbooru.com/index.php?page=favorites&s=view&id=12345" def metadata(self): @@ -120,7 +123,7 @@ class RealbooruPoolExtractor(RealbooruExtractor): subcategory = "pool" directory_fmt = ("{category}", "pool", "{pool} {pool_name}") archive_fmt = "p_{pool}_{id}" - pattern = BASE_PATTERN + r"/index\.php\?page=pool&s=show&id=(\d+)" + pattern = rf"{BASE_PATTERN}/index\.php\?page=pool&s=show&id=(\d+)" example = 
"https://realbooru.com/index.php?page=pool&s=show&id=12345" def metadata(self): @@ -147,7 +150,7 @@ class RealbooruPoolExtractor(RealbooruExtractor): class RealbooruPostExtractor(RealbooruExtractor): subcategory = "post" archive_fmt = "{id}" - pattern = BASE_PATTERN + r"/index\.php\?page=post&s=view&id=(\d+)" + pattern = rf"{BASE_PATTERN}/index\.php\?page=post&s=view&id=(\d+)" example = "https://realbooru.com/index.php?page=post&s=view&id=12345" def posts(self): diff --git a/gallery_dl/extractor/recursive.py b/gallery_dl/extractor/recursive.py index c553fec..0bf17d3 100644 --- a/gallery_dl/extractor/recursive.py +++ b/gallery_dl/extractor/recursive.py @@ -9,7 +9,7 @@ """Recursive extractor""" from .common import Extractor, Message -from .. import text, util +from .. import text class RecursiveExtractor(Extractor): @@ -27,5 +27,5 @@ class RecursiveExtractor(Extractor): else: page = self.request(text.ensure_http_scheme(url)).text - for match in util.re(r"https?://[^\s\"']+").finditer(page): + for match in text.re(r"https?://[^\s\"']+").finditer(page): yield Message.Queue, match[0], {} diff --git a/gallery_dl/extractor/redbust.py b/gallery_dl/extractor/redbust.py deleted file mode 100644 index d00ed52..0000000 --- a/gallery_dl/extractor/redbust.py +++ /dev/null @@ -1,186 +0,0 @@ -# -*- coding: utf-8 -*- - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://redbust.com/""" - -from .common import GalleryExtractor, Extractor, Message -from .. 
import text - -BASE_PATTERN = r"(?:https?://)?redbust\.com" - - -class RedbustExtractor(Extractor): - """Base class for RedBust extractors""" - category = "redbust" - root = "https://redbust.com" - filename_fmt = "{filename}.{extension}" - - def items(self): - data = {"_extractor": RedbustGalleryExtractor} - for url in self.galleries(): - yield Message.Queue, url, data - - def _pagination(self, path, page=None): - if page is None: - url = f"{self.root}{path}/" - base = url + "page/" - page = self.request(url).text - else: - base = f"{self.root}{path}/page/" - - pnum = 1 - while True: - for post in text.extract_iter( - page, '<h2 class="post-title">', "rel="): - yield text.extr(post, 'href="', '"') - - pnum += 1 - url = f"{base}{pnum}/" - if url not in page: - return - page = self.request(url).text - - -class RedbustGalleryExtractor(GalleryExtractor, RedbustExtractor): - """Extractor for RedBust galleries""" - pattern = BASE_PATTERN + r"/([\w-]+)/?$" - example = "https://redbust.com/TITLE/" - - def items(self): - url = f"{self.root}/{self.groups[0]}/" - self.page = page = self.request(url).text - - self.gallery_id = gid = text.extr( - page, "<link rel='shortlink' href='https://redbust.com/?p=", "'") - - if gid: - self.page_url = False - return GalleryExtractor.items(self) - else: - self.subcategory = "category" - return self._items_category(page) - - def _items_category(self, _): - page = self.page - data = {"_extractor": RedbustGalleryExtractor} - base = f"{self.root}/{self.groups[0]}/page/" - pnum = 1 - - while True: - for post in text.extract_iter( - page, '<h2 class="post-title">', "rel="): - url = text.extr(post, 'href="', '"') - yield Message.Queue, url, data - - pnum += 1 - url = f"{base}{pnum}/" - if url not in page: - return - page = self.request(url).text - - def metadata(self, _): - extr = text.extract_from(self.page) - - return { - "gallery_id" : self.gallery_id, - "gallery_slug": self.groups[0], - "categories" : text.split_html(extr( - '<li 
class="category">', "</li>"))[::2], - "title" : text.unescape(extr('class="post-title">', "<")), - "date" : text.parse_datetime( - extr('class="post-byline">', "<").strip(), "%B %d, %Y"), - "views" : text.parse_int(extr("</b>", "v").replace(",", "")), - "tags" : text.split_html(extr( - 'class="post-tags">', "</p"))[1:], - } - - def images(self, _): - results = [] - - for img in text.extract_iter(self.page, "'><img ", ">"): - if src := text.extr(img, 'src="', '"'): - path, _, end = src.rpartition("-") - if "x" in end: - url = f"{path}.{end.rpartition('.')[2]}" - data = None if src == url else {"_fallback": (src,)} - else: - url = src - data = None - results.append((url, data)) - - if not results: - # fallback for older galleries - for path in text.extract_iter( - self.page, '<img src="/wp-content/uploads/', '"'): - results.append( - (f"{self.root}/wp-content/uploads/{path}", None)) - - return results - - -class RedbustTagExtractor(RedbustExtractor): - """Extractor for RedBust tag searches""" - subcategory = "tag" - pattern = BASE_PATTERN + r"/tag/([\w-]+)" - example = "https://redbust.com/tag/TAG/" - - def galleries(self): - return self._pagination("/tag/" + self.groups[0]) - - -class RedbustArchiveExtractor(RedbustExtractor): - """Extractor for RedBust monthly archive collections""" - subcategory = "archive" - pattern = BASE_PATTERN + r"(/\d{4}/\d{2})" - example = "https://redbust.com/2010/01/" - - def galleries(self): - return self._pagination(self.groups[0]) - - -class RedbustImageExtractor(RedbustExtractor): - """Extractor for RedBust images""" - subcategory = "image" - directory_fmt = ("{category}", "{title}") - pattern = BASE_PATTERN + r"/(?!tag/|\d{4}/)([\w-]+)/([\w-]+)/?$" - example = "https://redbust.com/TITLE/SLUG/" - - def items(self): - gallery_slug, image_slug = self.groups - url = f"{self.root}/{gallery_slug}/{image_slug}/" - page = self.request(url).text - - img_url = None - - # Look for the largest image in srcset first - if srcset := text.extr(page, 
'srcset="', '"'): - # Extract the largest image from srcset (typically last one) - urls = srcset.split(", ") - img_url = urls[-1].partition(" ")[0] if urls else None - - # Fallback to original extraction method - if not img_url: - if entry := text.extr(page, "entry-inner ", "alt="): - img_url = text.extr(entry, "img src=", " ").strip("\"'") - - if not img_url: - return - - end = img_url.rpartition("-")[2] - data = text.nameext_from_url(img_url, { - "title" : text.unescape(text.extr( - page, 'title="Return to ', '"')), - "image_id" : text.extr( - page, "rel='shortlink' href='https://redbust.com/?p=", "'"), - "gallery_slug": gallery_slug, - "image_slug" : image_slug, - "num" : text.parse_int(end.partition(".")[0]), - "count" : 1, - "url" : img_url, - }) - - yield Message.Directory, data - yield Message.Url, img_url, data diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index e20d80e..cc73e47 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -33,11 +33,11 @@ class RedditExtractor(Extractor): previews = self.config("previews", True) embeds = self.config("embeds", True) - if videos := self.config("videos", True): - if videos == "ytdl": - self._extract_video = self._extract_video_ytdl - elif videos == "dash": + if videos := self.config("videos", "dash"): + if videos == "dash": self._extract_video = self._extract_video_dash + elif videos == "ytdl": + self._extract_video = self._extract_video_ytdl videos = True selftext = self.config("selftext") @@ -57,9 +57,9 @@ class RedditExtractor(Extractor): if submission: submission["comment"] = None - submission["date"] = text.parse_timestamp( + submission["date"] = self.parse_timestamp( submission["created_utc"]) - yield Message.Directory, submission + yield Message.Directory, "", submission visited.add(submission["id"]) submission["num"] = 0 @@ -86,7 +86,7 @@ class RedditExtractor(Extractor): yield Message.Url, url, submission elif embeds and "media_metadata" in media: 
- for embed in self._extract_embed(submission): + for embed in self._extract_embed(submission, media): submission["num"] += 1 text.nameext_from_url(embed, submission) yield Message.Url, embed, submission @@ -94,6 +94,8 @@ class RedditExtractor(Extractor): elif media["is_video"]: if videos: text.nameext_from_url(url, submission) + if not submission["extension"]: + submission["extension"] = "mp4" url = "ytdl:" + self._extract_video(media) yield Message.Url, url, submission @@ -105,14 +107,14 @@ class RedditExtractor(Extractor): urls.append((url, submission)) elif parentdir: - yield Message.Directory, comments[0] + yield Message.Directory, "", comments[0] if self.api.comments: if comments and not submission: submission = comments[0] submission.setdefault("num", 0) if not parentdir: - yield Message.Directory, submission + yield Message.Directory, "", submission for comment in comments: media = (embeds and "media_metadata" in comment) @@ -124,11 +126,11 @@ class RedditExtractor(Extractor): data = submission.copy() data["comment"] = comment - comment["date"] = text.parse_timestamp( + comment["date"] = self.parse_timestamp( comment["created_utc"]) if media: - for url in self._extract_embed(comment): + for url in self._extract_embed(data, comment): data["num"] += 1 text.nameext_from_url(url, data) yield Message.Url, url, data @@ -199,8 +201,8 @@ class RedditExtractor(Extractor): submission["id"], item["media_id"]) self.log.debug(src) - def _extract_embed(self, submission): - meta = submission["media_metadata"] + def _extract_embed(self, submission, media): + meta = media["media_metadata"] if not meta: return @@ -317,8 +319,8 @@ class RedditSubmissionExtractor(RedditExtractor): """Extractor for URLs from a submission on reddit.com""" subcategory = "submission" pattern = (r"(?:https?://)?(?:" - r"(?:\w+\.)?reddit\.com/(?:(?:r|u|user)/[^/?#]+" - r"/comments|gallery)|redd\.it)/([a-z0-9]+)") + r"(?:\w+\.)?reddit\.com/(?:(?:(?:r|u|user)/[^/?#]+/)?" 
+ r"comments|gallery)|redd\.it)/([a-z0-9]+)") example = "https://www.reddit.com/r/SUBREDDIT/comments/id/" def __init__(self, match): @@ -352,7 +354,7 @@ class RedditImageExtractor(Extractor): def items(self): url = f"https://{self.domain}/{self.path}{self.query}" data = text.nameext_from_url(url) - yield Message.Directory, data + yield Message.Directory, "", data yield Message.Url, url, data @@ -394,7 +396,7 @@ class RedditAPI(): self.morecomments = config("morecomments", False) self._warn_429 = False - if config("api") == "rest": + if config("api") != "oauth": self.root = "https://www.reddit.com" self.headers = None self.authenticate = util.noop diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py index 4098c54..164fdf4 100644 --- a/gallery_dl/extractor/redgifs.py +++ b/gallery_dl/extractor/redgifs.py @@ -51,8 +51,8 @@ class RedgifsExtractor(Extractor): gif.update(metadata) gif["count"] = cnt - gif["date"] = text.parse_timestamp(gif.get("createDate")) - yield Message.Directory, gif + gif["date"] = self.parse_timestamp(gif.get("createDate")) + yield Message.Directory, "", gif for num, gif in enumerate(gifs, enum): gif["_fallback"] = formats = self._formats(gif) diff --git a/gallery_dl/extractor/rule34us.py b/gallery_dl/extractor/rule34us.py index a43ea4c..d48539e 100644 --- a/gallery_dl/extractor/rule34us.py +++ b/gallery_dl/extractor/rule34us.py @@ -9,7 +9,7 @@ """Extractors for https://rule34.us/""" from .booru import BooruExtractor -from .. import text, util +from .. 
import text import collections @@ -19,7 +19,7 @@ class Rule34usExtractor(BooruExtractor): per_page = 42 def _init(self): - self._find_tags = util.re( + self._find_tags = text.re( r'<li class="([^-"]+)-tag"[^>]*><a href="[^;"]+;q=([^"]+)').findall def _parse_post(self, post_id): @@ -57,7 +57,7 @@ class Rule34usTagExtractor(Rule34usExtractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{id}" - pattern = r"(?:https?://)?rule34\.us/index\.php\?r=posts/index&q=([^&#]+)" + pattern = r"(?:https?://)?rule34\.us/index\.php\?r=posts/index&q=([^&#]*)" example = "https://rule34.us/index.php?r=posts/index&q=TAG" def __init__(self, match): diff --git a/gallery_dl/extractor/rule34vault.py b/gallery_dl/extractor/rule34vault.py index 14d5aef..9f75f64 100644 --- a/gallery_dl/extractor/rule34vault.py +++ b/gallery_dl/extractor/rule34vault.py @@ -36,8 +36,7 @@ class Rule34vaultExtractor(BooruExtractor): def _prepare(self, post): post.pop("files", None) - post["date"] = text.parse_datetime( - post["created"], "%Y-%m-%dT%H:%M:%S.%fZ") + post["date"] = self.parse_datetime_iso(post["created"]) if "tags" in post: post["tags"] = [t["value"] for t in post["tags"]] @@ -80,7 +79,7 @@ class Rule34vaultExtractor(BooruExtractor): class Rule34vaultPostExtractor(Rule34vaultExtractor): subcategory = "post" archive_fmt = "{id}" - pattern = BASE_PATTERN + r"/post/(\d+)" + pattern = rf"{BASE_PATTERN}/post/(\d+)" example = "https://rule34vault.com/post/12345" def posts(self): @@ -91,7 +90,7 @@ class Rule34vaultPlaylistExtractor(Rule34vaultExtractor): subcategory = "playlist" directory_fmt = ("{category}", "{playlist_id}") archive_fmt = "p_{playlist_id}_{id}" - pattern = BASE_PATTERN + r"/playlists/view/(\d+)" + pattern = rf"{BASE_PATTERN}/playlists/view/(\d+)" example = "https://rule34vault.com/playlists/view/12345" def metadata(self): @@ -106,7 +105,7 @@ class Rule34vaultTagExtractor(Rule34vaultExtractor): subcategory = "tag" directory_fmt = 
("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{id}" - pattern = BASE_PATTERN + r"/(?!p(?:ost|laylists)/)([^/?#]+)" + pattern = rf"{BASE_PATTERN}/(?!p(?:ost|laylists)/)([^/?#]+)" example = "https://rule34vault.com/TAG" def metadata(self): diff --git a/gallery_dl/extractor/rule34xyz.py b/gallery_dl/extractor/rule34xyz.py index 05915ba..ddd656f 100644 --- a/gallery_dl/extractor/rule34xyz.py +++ b/gallery_dl/extractor/rule34xyz.py @@ -68,8 +68,7 @@ class Rule34xyzExtractor(BooruExtractor): def _prepare(self, post): post.pop("files", None) - post["date"] = text.parse_datetime( - post["created"], "%Y-%m-%dT%H:%M:%S.%fZ") + post["date"] = self.parse_datetime_iso(post["created"]) post["filename"], _, post["format"] = post["filename"].rpartition(".") if "tags" in post: post["tags"] = [t["value"] for t in post["tags"]] @@ -135,7 +134,7 @@ class Rule34xyzExtractor(BooruExtractor): class Rule34xyzPostExtractor(Rule34xyzExtractor): subcategory = "post" archive_fmt = "{id}" - pattern = BASE_PATTERN + r"/post/(\d+)" + pattern = rf"{BASE_PATTERN}/post/(\d+)" example = "https://rule34.xyz/post/12345" def posts(self): @@ -146,7 +145,7 @@ class Rule34xyzPlaylistExtractor(Rule34xyzExtractor): subcategory = "playlist" directory_fmt = ("{category}", "{playlist_id}") archive_fmt = "p_{playlist_id}_{id}" - pattern = BASE_PATTERN + r"/playlists/view/(\d+)" + pattern = rf"{BASE_PATTERN}/playlists/view/(\d+)" example = "https://rule34.xyz/playlists/view/12345" def metadata(self): @@ -161,7 +160,7 @@ class Rule34xyzTagExtractor(Rule34xyzExtractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{id}" - pattern = BASE_PATTERN + r"/([^/?#]+)$" + pattern = rf"{BASE_PATTERN}/([^/?#]+)$" example = "https://rule34.xyz/TAG" def metadata(self): diff --git a/gallery_dl/extractor/s3ndpics.py b/gallery_dl/extractor/s3ndpics.py index 215f160..9201a3f 100644 --- a/gallery_dl/extractor/s3ndpics.py +++ b/gallery_dl/extractor/s3ndpics.py @@ 
-30,15 +30,13 @@ class S3ndpicsExtractor(Extractor): for post in self.posts(): post["id"] = post.pop("_id", None) post["user"] = post.pop("userId", None) - post["date"] = text.parse_datetime( - post["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ") - post["date_updated"] = text.parse_datetime( - post["updatedAt"], "%Y-%m-%dT%H:%M:%S.%fZ") + post["date"] = self.parse_datetime_iso(post["createdAt"]) + post["date_updated"] = self.parse_datetime_iso(post["updatedAt"]) files = post.pop("files", ()) post["count"] = len(files) - yield Message.Directory, post + yield Message.Directory, "", post for post["num"], file in enumerate(files, 1): post["type"] = file["type"] path = file["url"] diff --git a/gallery_dl/extractor/saint.py b/gallery_dl/extractor/saint.py index 07d490a..e15c628 100644 --- a/gallery_dl/extractor/saint.py +++ b/gallery_dl/extractor/saint.py @@ -18,7 +18,7 @@ class SaintAlbumExtractor(LolisafeAlbumExtractor): """Extractor for saint albums""" category = "saint" root = "https://saint2.su" - pattern = BASE_PATTERN + r"/a/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/a/([^/?#]+)" example = "https://saint2.su/a/ID" def fetch_album(self, album_id): @@ -36,7 +36,7 @@ class SaintAlbumExtractor(LolisafeAlbumExtractor): break files.append({ "id2" : id2, - "date" : text.parse_timestamp(extr("", ".")), + "date" : self.parse_timestamp(extr("", ".")), "id" : extr("/embed/", '"'), "size" : text.parse_int(extr('data="', '"')), "file" : text.unescape(extr( @@ -58,7 +58,7 @@ class SaintMediaExtractor(SaintAlbumExtractor): """Extractor for saint media links""" subcategory = "media" directory_fmt = ("{category}",) - pattern = BASE_PATTERN + r"(/(embe)?d/([^/?#]+))" + pattern = rf"{BASE_PATTERN}(/(embe)?d/([^/?#]+))" example = "https://saint2.su/embed/ID" def fetch_album(self, album_id): @@ -73,7 +73,7 @@ class SaintMediaExtractor(SaintAlbumExtractor): file = { "id" : album_id, "id2" : extr("/thumbs/", "-"), - "date" : text.parse_timestamp(extr("", ".")), + "date" : 
self.parse_timestamp(extr("", ".")), "file" : text.unescape(extr('<source src="', '"')), "id_dl": extr("/d/", "'"), } diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index 5caad4b..690b515 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -16,7 +16,7 @@ import collections BASE_PATTERN = r"(?:https?://)?" \ r"(?:(?:chan|www|beta|black|white)\.sankakucomplex\.com|sankaku\.app)" \ - r"(?:/[a-z]{2})?" + r"(?:/[a-z]{2}(?:[-_][A-Z]{2})?)?" class SankakuExtractor(BooruExtractor): @@ -47,7 +47,7 @@ class SankakuExtractor(BooruExtractor): self.api = SankakuAPI(self) if self.config("tags") == "extended": self._tags = self._tags_extended - self._tags_findall = util.re( + self._tags_findall = text.re( r"tag-type-([^\"' ]+).*?\?tags=([^\"'&]+)").findall def _file_url(self, post): @@ -61,13 +61,13 @@ class SankakuExtractor(BooruExtractor): self.log.warning( "Login required to download 'contentious_content' posts") SankakuExtractor._warning = False - elif url[8] == "v": - url = "https://s.sankakucomplex.com" + url[url.index("/", 8):] + elif url[4] != "s": + url = "https" + url[4:] return url def _prepare(self, post): post["created_at"] = post["created_at"]["s"] - post["date"] = text.parse_timestamp(post["created_at"]) + post["date"] = self.parse_timestamp(post["created_at"]) post["tags"] = post.pop("tag_names", ()) post["tag_string"] = " ".join(post["tags"]) post["_http_validate"] = self._check_expired @@ -119,7 +119,7 @@ class SankakuTagExtractor(SankakuExtractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{id}" - pattern = BASE_PATTERN + r"(?:/posts)?/?\?([^#]*)" + pattern = rf"{BASE_PATTERN}(?:/posts)?/?\?([^#]*)" example = "https://sankaku.app/?tags=TAG" def __init__(self, match): @@ -129,10 +129,10 @@ class SankakuTagExtractor(SankakuExtractor): if "date:" in self.tags: # rewrite 'date:' tags (#1790) - self.tags = util.re( + self.tags = text.re( 
r"date:(\d\d)[.-](\d\d)[.-](\d\d\d\d)(?!T)").sub( r"date:\3-\2-\1T00:00", self.tags) - self.tags = util.re( + self.tags = text.re( r"date:(\d\d\d\d)[.-](\d\d)[.-](\d\d)(?!T)").sub( r"date:\1-\2-\3T00:00", self.tags) @@ -149,7 +149,7 @@ class SankakuPoolExtractor(SankakuExtractor): subcategory = "pool" directory_fmt = ("{category}", "pool", "{pool[id]} {pool[name_en]}") archive_fmt = "p_{pool}_{id}" - pattern = BASE_PATTERN + r"/(?:books|pools?/show)/(\w+)" + pattern = rf"{BASE_PATTERN}/(?:books|pools?/show)/(\w+)" example = "https://sankaku.app/books/12345" def metadata(self): @@ -171,7 +171,7 @@ class SankakuPostExtractor(SankakuExtractor): """Extractor for single posts from sankaku.app""" subcategory = "post" archive_fmt = "{id}" - pattern = BASE_PATTERN + r"/posts?(?:/show)?/(\w+)" + pattern = rf"{BASE_PATTERN}/posts?(?:/show)?/(\w+)" example = "https://sankaku.app/post/show/12345" def posts(self): @@ -181,7 +181,7 @@ class SankakuPostExtractor(SankakuExtractor): class SankakuBooksExtractor(SankakuExtractor): """Extractor for books by tag search on sankaku.app""" subcategory = "books" - pattern = BASE_PATTERN + r"/books/?\?([^#]*)" + pattern = rf"{BASE_PATTERN}/books/?\?([^#]*)" example = "https://sankaku.app/books?tags=TAG" def __init__(self, match): diff --git a/gallery_dl/extractor/sankakucomplex.py b/gallery_dl/extractor/sankakucomplex.py index 405e07e..cf5af81 100644 --- a/gallery_dl/extractor/sankakucomplex.py +++ b/gallery_dl/extractor/sankakucomplex.py @@ -40,7 +40,7 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor): extr('property="og:title" content="', '"')), "description": text.unescape( extr('property="og:description" content="', '"')), - "date" : text.parse_datetime( + "date" : self.parse_datetime_iso( extr('property="article:published_time" content="', '"')), } content = extr('<div class="entry-content">', '</article>') @@ -53,7 +53,7 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor): files += 
self._extract_embeds(content) data["count"] = len(files) - yield Message.Directory, data + yield Message.Directory, "", data for num, url in enumerate(files, 1): file = text.nameext_from_url(url) if url[0] == "/": @@ -64,19 +64,19 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor): yield Message.Url, url, file def _extract_images(self, content): - orig_sub = util.re(r"-\d+x\d+\.").sub + orig_sub = text.re(r"-\d+x\d+\.").sub return [ orig_sub(".", url) for url in util.unique(text.extract_iter(content, 'data-lazy-src="', '"')) ] def _extract_videos(self, content): - return util.re(r"<source [^>]*src=[\"']([^\"']+)").findall(content) + return text.re(r"<source [^>]*src=[\"']([^\"']+)").findall(content) def _extract_embeds(self, content): return [ "ytdl:" + url for url in - util.re(r"<iframe [^>]*src=[\"']([^\"']+)").findall(content) + text.re(r"<iframe [^>]*src=[\"']([^\"']+)").findall(content) ] diff --git a/gallery_dl/extractor/schalenetwork.py b/gallery_dl/extractor/schalenetwork.py index a4ef3b0..bbbb9da 100644 --- a/gallery_dl/extractor/schalenetwork.py +++ b/gallery_dl/extractor/schalenetwork.py @@ -126,7 +126,7 @@ class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor): data = self.request_json(url, headers=headers) try: - data["date"] = text.parse_timestamp(data["created_at"] // 1000) + data["date"] = self.parse_timestamp(data["created_at"] // 1000) data["count"] = len(data["thumbnails"]["entries"]) del data["thumbnails"] except Exception: @@ -138,14 +138,13 @@ class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor): name = tag["name"] namespace = tag.get("namespace", 0) tags.append(types[namespace] + ":" + name) - data["tags"] = tags - if self.config("tags", False): - tags = collections.defaultdict(list) + categories = collections.defaultdict(list) for tag in data["tags"]: - tags[tag.get("namespace", 0)].append(tag["name"]) - for type, values in tags.items(): + categories[tag.get("namespace", 
0)].append(tag["name"]) + for type, values in categories.items(): data["tags_" + types[type]] = values + data["tags"] = tags url = f"{self.root_api}/books/detail/{gid}/{gkey}?crt={self._crt()}" if token := self._token(False): @@ -169,6 +168,20 @@ class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor): url = (f"{self.root_api}/books/data/{gid}/{gkey}" f"/{fmt['id']}/{fmt['key']}/{fmt['w']}?crt={self._crt()}") headers = self.headers + + if self.config("cbz", False): + headers["Authorization"] = self._token() + dl = self.request_json( + f"{url}&action=dl", method="POST", headers=headers) + # 'crt' parameter here is necessary for 'hdoujin' downloads + url = f"{dl['base']}?crt={self._crt()}" + info = text.nameext_from_url(url) + if "fallback" in dl: + info["_fallback"] = (dl["fallback"],) + if not info["extension"]: + info["extension"] = "cbz" + return ((url, info),) + data = self.request_json(url, headers=headers) base = data["base"] diff --git a/gallery_dl/extractor/scrolller.py b/gallery_dl/extractor/scrolller.py index ff191db..b853f53 100644 --- a/gallery_dl/extractor/scrolller.py +++ b/gallery_dl/extractor/scrolller.py @@ -34,7 +34,7 @@ class ScrolllerExtractor(Extractor): files = self._extract_files(post) post["count"] = len(files) - yield Message.Directory, post + yield Message.Directory, "", post for file in files: url = file["url"] post.update(file) @@ -136,7 +136,7 @@ class ScrolllerExtractor(Extractor): class ScrolllerSubredditExtractor(ScrolllerExtractor): """Extractor for media from a scrolller subreddit""" subcategory = "subreddit" - pattern = BASE_PATTERN + r"(/r/[^/?#]+)(?:/?\?([^#]+))?" + pattern = rf"{BASE_PATTERN}(/r/[^/?#]+)(?:/?\?([^#]+))?" 
example = "https://scrolller.com/r/SUBREDDIT" def posts(self): @@ -173,7 +173,7 @@ class ScrolllerSubredditExtractor(ScrolllerExtractor): class ScrolllerFollowingExtractor(ScrolllerExtractor): """Extractor for followed scrolller subreddits""" subcategory = "following" - pattern = BASE_PATTERN + r"/following" + pattern = rf"{BASE_PATTERN}/following" example = "https://scrolller.com/following" def items(self): @@ -199,7 +199,7 @@ class ScrolllerFollowingExtractor(ScrolllerExtractor): class ScrolllerPostExtractor(ScrolllerExtractor): """Extractor for media from a single scrolller post""" subcategory = "post" - pattern = BASE_PATTERN + r"/(?!r/|following$)([^/?#]+)" + pattern = rf"{BASE_PATTERN}/(?!r/|following$)([^/?#]+)" example = "https://scrolller.com/TITLE-SLUG-a1b2c3d4f5" def posts(self): diff --git a/gallery_dl/extractor/seiga.py b/gallery_dl/extractor/seiga.py index 7319731..705227d 100644 --- a/gallery_dl/extractor/seiga.py +++ b/gallery_dl/extractor/seiga.py @@ -31,7 +31,7 @@ class SeigaExtractor(Extractor): images = iter(self.get_images()) data = next(images) - yield Message.Directory, data + yield Message.Directory, "", data for image in util.advance(images, self.start_image): data.update(image) data["extension"] = None @@ -213,7 +213,7 @@ class SeigaImageExtractor(SeigaExtractor): data["description"] = text.remove_html(data["description"]) data["image_id"] = text.parse_int(self.image_id) - data["date"] = text.parse_datetime( + data["date"] = self.parse_datetime( data["date"] + ":00+0900", "%Y年%m月%d日 %H:%M:%S%z") return (data, data) diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py index 2feb64e..b599f70 100644 --- a/gallery_dl/extractor/sexcom.py +++ b/gallery_dl/extractor/sexcom.py @@ -9,8 +9,7 @@ """Extractors for https://www.sex.com/""" from .common import Extractor, Message -from .. import text -from datetime import datetime +from .. import text, dt BASE_PATTERN = r"(?:https?://)?(?:www\.)?sex\.com(?:/[a-z]{2})?" 
@@ -26,7 +25,7 @@ class SexcomExtractor(Extractor): def items(self): self.gifs = self.config("gifs", True) - yield Message.Directory, self.metadata() + yield Message.Directory, "", self.metadata() for pin in map(self._parse_pin, self.pins()): if not pin: continue @@ -34,10 +33,10 @@ class SexcomExtractor(Extractor): url = pin["url"] parts = url.rsplit("/", 4) try: - pin["date_url"] = dt = datetime( + pin["date_url"] = d = dt.datetime( int(parts[1]), int(parts[2]), int(parts[3])) if "date" not in pin: - pin["date"] = dt + pin["date"] = d except Exception: pass pin["tags"] = [t[1:] for t in pin["tags"]] @@ -136,7 +135,7 @@ class SexcomExtractor(Extractor): text.nameext_from_url(data["url"], data) data["uploader"] = extr('itemprop="author">', '<') - data["date"] = text.parse_datetime(extr('datetime="', '"')) + data["date"] = dt.parse_iso(extr('datetime="', '"')) data["tags"] = text.split_html(extr('class="tags"> Tags', '</div>')) data["comments"] = text.parse_int(extr('Comments (', ')')) @@ -195,8 +194,8 @@ class SexcomPinExtractor(SexcomExtractor): """Extractor for a pinned image or video on www.sex.com""" subcategory = "pin" directory_fmt = ("{category}",) - pattern = (BASE_PATTERN + - r"(/(?:\w\w/(?:pic|gif|video)s|pin)/\d+/?)(?!.*#related$)") + pattern = (rf"{BASE_PATTERN}" + rf"(/(?:\w\w/(?:pic|gif|video)s|pin)/\d+/?)(?!.*#related$)") example = "https://www.sex.com/pin/12345-TITLE/" def pins(self): @@ -207,7 +206,7 @@ class SexcomRelatedPinExtractor(SexcomPinExtractor): """Extractor for related pins on www.sex.com""" subcategory = "related-pin" directory_fmt = ("{category}", "related {original_pin[pin_id]}") - pattern = BASE_PATTERN + r"(/pin/(\d+)/?).*#related$" + pattern = rf"{BASE_PATTERN}(/pin/(\d+)/?).*#related$" example = "https://www.sex.com/pin/12345#related" def metadata(self): @@ -224,7 +223,7 @@ class SexcomPinsExtractor(SexcomExtractor): """Extractor for a user's pins on www.sex.com""" subcategory = "pins" directory_fmt = ("{category}", "{user}") - 
pattern = BASE_PATTERN + r"/user/([^/?#]+)/pins/" + pattern = rf"{BASE_PATTERN}/user/([^/?#]+)/pins/" example = "https://www.sex.com/user/USER/pins/" def metadata(self): @@ -239,7 +238,7 @@ class SexcomLikesExtractor(SexcomExtractor): """Extractor for a user's liked pins on www.sex.com""" subcategory = "likes" directory_fmt = ("{category}", "{user}", "Likes") - pattern = BASE_PATTERN + r"/user/([^/?#]+)/likes/" + pattern = rf"{BASE_PATTERN}/user/([^/?#]+)/likes/" example = "https://www.sex.com/user/USER/likes/" def metadata(self): @@ -254,8 +253,8 @@ class SexcomBoardExtractor(SexcomExtractor): """Extractor for pins from a board on www.sex.com""" subcategory = "board" directory_fmt = ("{category}", "{user}", "{board}") - pattern = (BASE_PATTERN + r"/user" - r"/([^/?#]+)/(?!(?:following|pins|repins|likes)/)([^/?#]+)") + pattern = (rf"{BASE_PATTERN}/user" + rf"/([^/?#]+)/(?!(?:following|pins|repins|likes)/)([^/?#]+)") example = "https://www.sex.com/user/USER/BOARD/" def metadata(self): @@ -270,14 +269,31 @@ class SexcomBoardExtractor(SexcomExtractor): return self._pagination(url) +class SexcomFeedExtractor(SexcomExtractor): + """Extractor for pins from your account's main feed on www.sex.com""" + subcategory = "feed" + directory_fmt = ("{category}", "feed") + pattern = rf"{BASE_PATTERN}/feed" + example = "https://www.sex.com/feed/" + + def metadata(self): + return {"feed": True} + + def pins(self): + if not self.cookies_check(("sess_sex",)): + self.log.warning("no 'sess_sex' cookie set") + url = f"{self.root}/feed/" + return self._pagination(url) + + class SexcomSearchExtractor(SexcomExtractor): """Extractor for search results on www.sex.com""" subcategory = "search" directory_fmt = ("{category}", "search", "{search[search]}") - pattern = (BASE_PATTERN + r"/(?:" - r"(pic|gif|video)s(?:\?(search=[^#]+)$|/([^/?#]*))" - r"|search/(pic|gif|video)s" - r")/?(?:\?([^#]+))?") + pattern = (rf"{BASE_PATTERN}/(?:" + rf"(pic|gif|video)s(?:\?(search=[^#]+)$|/([^/?#]*))" + 
rf"|search/(pic|gif|video)s" + rf")/?(?:\?([^#]+))?") example = "https://www.sex.com/search/pics?query=QUERY" def _init(self): @@ -314,7 +330,7 @@ class SexcomSearchExtractor(SexcomExtractor): parts = path.rsplit("/", 4) try: - pin["date_url"] = pin["date"] = datetime( + pin["date_url"] = pin["date"] = dt.datetime( int(parts[1]), int(parts[2]), int(parts[3])) except Exception: pass @@ -329,7 +345,7 @@ class SexcomSearchExtractor(SexcomExtractor): path = f"{path[:-4]}gif" pin["url"] = f"{root}{path}" - yield Message.Directory, pin + yield Message.Directory, "", pin yield Message.Url, pin["url"], pin if params["page"] >= data["paging"]["numberOfPages"]: diff --git a/gallery_dl/extractor/shimmie2.py b/gallery_dl/extractor/shimmie2.py index 36b083b..5572b4d 100644 --- a/gallery_dl/extractor/shimmie2.py +++ b/gallery_dl/extractor/shimmie2.py @@ -25,6 +25,8 @@ class Shimmie2Extractor(BaseExtractor): if file_url := self.config_instance("file_url"): self.file_url_fmt = file_url + if quote := self.config_instance("quote"): + self._quote_type = lambda _: quote def items(self): data = self.metadata() @@ -44,7 +46,7 @@ class Shimmie2Extractor(BaseExtractor): else: text.nameext_from_url(url, post) - yield Message.Directory, post + yield Message.Directory, "", post yield Message.Url, url, post def metadata(self): @@ -85,6 +87,11 @@ BASE_PATTERN = Shimmie2Extractor.update({ "root": "https://co.llection.pics", "pattern": r"co\.llection\.pics", }, + "soybooru": { + "root": "https://soybooru.com", + "pattern": r"soybooru\.com", + "quote": "'", + }, }) + r"/(?:index\.php\?q=/?)?" @@ -93,7 +100,7 @@ class Shimmie2TagExtractor(Shimmie2Extractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") file_url_fmt = "{}/_images/{}/{}%20-%20{}.{}" - pattern = BASE_PATTERN + r"post/list/([^/?#]+)(?:/(\d+))?" + pattern = rf"{BASE_PATTERN}post/list/([^/?#]+)(?:/(\d+))?" 
example = "https://vidya.pics/post/list/TAG/1" def metadata(self): @@ -150,15 +157,14 @@ class Shimmie2TagExtractor(Shimmie2Extractor): } pnum += 1 - if not extr(">Next<", ">"): - if not extr(f"/{pnum}'>{pnum}<", ">"): - return + if not extr(f"/{pnum}{quote}>Next</", ">"): + return class Shimmie2PostExtractor(Shimmie2Extractor): """Extractor for single shimmie2 posts""" subcategory = "post" - pattern = BASE_PATTERN + r"post/view/(\d+)" + pattern = rf"{BASE_PATTERN}post/view/(\d+)" example = "https://vidya.pics/post/view/12345" def posts(self): diff --git a/gallery_dl/extractor/shopify.py b/gallery_dl/extractor/shopify.py index 84c9a84..ad38562 100644 --- a/gallery_dl/extractor/shopify.py +++ b/gallery_dl/extractor/shopify.py @@ -20,7 +20,7 @@ class ShopifyExtractor(BaseExtractor): def items(self): data = self.metadata() - yield Message.Directory, data + yield Message.Directory, "", data for product in self.products(): for num, image in enumerate(product.pop("images"), 1): @@ -90,7 +90,7 @@ class ShopifyCollectionExtractor(ShopifyExtractor): """Base class for collection extractors for Shopify based sites""" subcategory = "collection" directory_fmt = ("{category}", "{collection[title]}") - pattern = BASE_PATTERN + r"(/collections/[\w-]+)/?(?:$|[?#])" + pattern = rf"{BASE_PATTERN}(/collections/[\w-]+)/?(?:$|[?#])" example = "https://www.fashionnova.com/collections/TITLE" def metadata(self): @@ -113,7 +113,7 @@ class ShopifyProductExtractor(ShopifyExtractor): """Base class for product extractors for Shopify based sites""" subcategory = "product" directory_fmt = ("{category}", "Products") - pattern = BASE_PATTERN + r"((?:/collections/[\w-]+)?/products/[\w-]+)" + pattern = rf"{BASE_PATTERN}((?:/collections/[\w-]+)?/products/[\w-]+)" example = "https://www.fashionnova.com/collections/TITLE/products/NAME" def products(self): diff --git a/gallery_dl/extractor/simpcity.py b/gallery_dl/extractor/simpcity.py deleted file mode 100644 index d8227fa..0000000 --- 
a/gallery_dl/extractor/simpcity.py +++ /dev/null @@ -1,186 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2025 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://simpcity.cr/""" - -from .common import Extractor, Message -from .. import text, exception - -BASE_PATTERN = r"(?:https?://)?(?:www\.)?simpcity\.(?:cr|su)" - - -class SimpcityExtractor(Extractor): - """Base class for simpcity extractors""" - category = "simpcity" - root = "https://simpcity.cr" - - def items(self): - extract_urls = text.re( - r'<(?:a [^>]*?href|iframe [^>]*?src)="([^"]+)').findall - - for post in self.posts(): - urls = extract_urls(post["content"]) - data = {"post": post} - post["count"] = data["count"] = len(urls) - yield Message.Directory, data - for data["num"], url in enumerate(urls, 1): - yield Message.Queue, url, data - - def request_page(self, url): - try: - return self.request(url) - except exception.HttpError as exc: - if exc.status == 403 and b">Log in<" in exc.response.content: - msg = text.extr(exc.response.text, "blockMessage--error", "</") - raise exception.AuthRequired( - "'authenticated cookies'", None, - msg.rpartition(">")[2].strip()) - raise - - def _pagination(self, base, pnum=None): - base = f"{self.root}{base}" - - if pnum is None: - url = f"{base}/" - pnum = 1 - else: - url = f"{base}/page-{pnum}" - pnum = None - - while True: - page = self.request_page(url).text - - yield page - - if pnum is None or "pageNav-jump--next" not in page: - return - pnum += 1 - url = f"{base}/page-{pnum}" - - def _pagination_reverse(self, base, pnum=None): - base = f"{self.root}{base}" - - url = f"{base}/page-9999" # force redirect to last page - with self.request_page(url) as response: - url = response.url - if url[-1] == "/": - pnum = 1 - else: - pnum = text.parse_int(url[url.rfind("-")+1:], 1) - page = 
response.text - - while True: - yield page - - pnum -= 1 - if pnum > 1: - url = f"{base}/page-{pnum}" - elif pnum == 1: - url = f"{base}/" - else: - return - - page = self.request_page(url).text - - def _parse_thread(self, page): - schema = self._extract_jsonld(page)["mainEntity"] - author = schema["author"] - stats = schema["interactionStatistic"] - url_t = schema["url"] - url_a = author.get("url") or "" - - thread = { - "id" : url_t[url_t.rfind(".")+1:-1], - "url" : url_t, - "title": schema["headline"], - "date" : text.parse_datetime(schema["datePublished"]), - "views": stats[0]["userInteractionCount"], - "posts": stats[1]["userInteractionCount"], - "tags" : (schema["keywords"].split(", ") - if "keywords" in schema else ()), - "section" : schema["articleSection"], - "author" : author.get("name") or "", - "author_id" : (url_a[url_a.rfind(".")+1:-1] if url_a else - (author.get("name") or "")[15:]), - "author_url": url_a, - } - - return thread - - def _parse_post(self, html): - extr = text.extract_from(html) - - post = { - "author": extr('data-author="', '"'), - "id": extr('data-content="post-', '"'), - "author_url": extr('itemprop="url" content="', '"'), - "date": text.parse_datetime(extr('datetime="', '"')), - "content": extr('<div itemprop="text">', - '<div class="js-selectToQuote').strip(), - } - - url_a = post["author_url"] - post["author_id"] = url_a[url_a.rfind(".")+1:-1] - - return post - - -class SimpcityPostExtractor(SimpcityExtractor): - subcategory = "post" - pattern = rf"{BASE_PATTERN}/(?:threads/[^/?#]+/post-|posts/)(\d+)" - example = "https://simpcity.cr/threads/TITLE.12345/post-54321" - - def posts(self): - post_id = self.groups[0] - url = f"{self.root}/posts/{post_id}/" - page = self.request_page(url).text - - pos = page.find(f'data-content="post-{post_id}"') - if pos < 0: - raise exception.NotFoundError("post") - html = text.extract(page, "<article ", "</article>", pos-200)[0] - - self.kwdict["thread"] = self._parse_thread(page) - return 
(self._parse_post(html),) - - -class SimpcityThreadExtractor(SimpcityExtractor): - subcategory = "thread" - pattern = rf"{BASE_PATTERN}(/threads/(?:[^/?#]+\.)?\d+)(?:/page-(\d+))?" - example = "https://simpcity.cr/threads/TITLE.12345/" - - def posts(self): - if (order := self.config("order-posts")) and \ - order[0] not in ("d", "r"): - pages = self._pagination(*self.groups) - reverse = False - else: - pages = self._pagination_reverse(*self.groups) - reverse = True - - for page in pages: - if "thread" not in self.kwdict: - self.kwdict["thread"] = self._parse_thread(page) - posts = text.extract_iter(page, "<article ", "</article>") - if reverse: - posts = list(posts) - posts.reverse() - for html in posts: - yield self._parse_post(html) - - -class SimpcityForumExtractor(SimpcityExtractor): - subcategory = "forum" - pattern = rf"{BASE_PATTERN}(/forums/(?:[^/?#]+\.)?\d+)(?:/page-(\d+))?" - example = "https://simpcity.cr/forums/TITLE.123/" - - def items(self): - data = {"_extractor": SimpcityThreadExtractor} - for page in self._pagination(*self.groups): - for path in text.extract_iter(page, ' uix-href="', '"'): - yield Message.Queue, f"{self.root}{text.unquote(path)}", data diff --git a/gallery_dl/extractor/simplyhentai.py b/gallery_dl/extractor/simplyhentai.py index d6541b2..78d3daf 100644 --- a/gallery_dl/extractor/simplyhentai.py +++ b/gallery_dl/extractor/simplyhentai.py @@ -48,7 +48,7 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor): "characters": split(extr('box-title">Characters</div>', '</div>')), "tags" : split(extr('box-title">Tags</div>', '</div>')), "artist" : split(extr('box-title">Artists</div>', '</div>')), - "date" : text.parse_datetime(text.remove_html( + "date" : self.parse_datetime(text.remove_html( extr('Uploaded', '</div>')), "%d.%m.%Y"), } data["lang"] = util.language_to_code(data["language"]) @@ -106,7 +106,7 @@ class SimplyhentaiImageExtractor(Extractor): }) data["token"] = data["filename"].rpartition("_")[2] - yield Message.Directory, data 
+ yield Message.Directory, "", data yield Message.Url, url, data @@ -152,9 +152,9 @@ class SimplyhentaiVideoExtractor(Extractor): "episode": text.parse_int(episode), "tags": text.split_html(tags)[::2], "type": "video", - "date": text.parse_datetime(text.remove_html( + "date": self.parse_datetime(text.remove_html( date), "%B %d, %Y %H:%M"), }) - yield Message.Directory, data + yield Message.Directory, "", data yield Message.Url, video_url, data diff --git a/gallery_dl/extractor/sizebooru.py b/gallery_dl/extractor/sizebooru.py index cad4b23..00002b8 100644 --- a/gallery_dl/extractor/sizebooru.py +++ b/gallery_dl/extractor/sizebooru.py @@ -45,9 +45,9 @@ class SizebooruExtractor(BooruExtractor): post.update({ "id" : text.parse_int(post_id), - "date" : text.parse_datetime( + "date" : self.parse_datetime( extr("<b>Posted Date:</b> ", "<"), "%m/%d/%Y"), - "date_approved": text.parse_datetime( + "date_approved": self.parse_datetime( extr("<b>Approved Date:</b> ", "<"), "%m/%d/%Y"), "approver" : text.remove_html(extr("<b>Approved By:</b>", "</")), "uploader" : text.remove_html(extr("<b>Posted By:</b>", "</")), diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py index 3c7205a..43e518e 100644 --- a/gallery_dl/extractor/skeb.py +++ b/gallery_dl/extractor/skeb.py @@ -10,7 +10,7 @@ from .common import Extractor, Message, Dispatch from .. 
import text BASE_PATTERN = r"(?:https?://)?skeb\.jp" -USER_PATTERN = BASE_PATTERN + r"/@([^/?#]+)" +USER_PATTERN = rf"{BASE_PATTERN}/@([^/?#]+)" class SkebExtractor(Extractor): @@ -57,7 +57,7 @@ class SkebExtractor(Extractor): files = self._get_files_from_post(response) post["count"] = len(files) - yield Message.Directory, post + yield Message.Directory, "", post for post["num"], file in enumerate(files, 1): post.update(file) url = file["file_url"] @@ -194,7 +194,7 @@ class SkebExtractor(Extractor): class SkebPostExtractor(SkebExtractor): """Extractor for a single skeb post""" subcategory = "post" - pattern = USER_PATTERN + r"/works/(\d+)" + pattern = rf"{USER_PATTERN}/works/(\d+)" example = "https://skeb.jp/@USER/works/123" def posts(self): @@ -204,7 +204,7 @@ class SkebPostExtractor(SkebExtractor): class SkebWorksExtractor(SkebExtractor): """Extractor for a skeb user's works""" subcategory = "works" - pattern = USER_PATTERN + r"/works" + pattern = rf"{USER_PATTERN}/works" example = "https://skeb.jp/@USER/works" def posts(self): @@ -216,7 +216,7 @@ class SkebWorksExtractor(SkebExtractor): class SkebSentrequestsExtractor(SkebExtractor): """Extractor for a skeb user's sent requests""" subcategory = "sentrequests" - pattern = USER_PATTERN + r"/sent[ _-]?requests" + pattern = rf"{USER_PATTERN}/sent[ _-]?requests" example = "https://skeb.jp/@USER/sentrequests" def posts(self): @@ -227,7 +227,7 @@ class SkebSentrequestsExtractor(SkebExtractor): class SkebUserExtractor(Dispatch, SkebExtractor): """Extractor for a skeb user profile""" - pattern = USER_PATTERN + r"/?$" + pattern = rf"{USER_PATTERN}/?$" example = "https://skeb.jp/@USER" def items(self): @@ -246,7 +246,7 @@ class SkebUserExtractor(Dispatch, SkebExtractor): class SkebSearchExtractor(SkebExtractor): """Extractor for skeb search results""" subcategory = "search" - pattern = BASE_PATTERN + r"/search\?q=([^&#]+)" + pattern = rf"{BASE_PATTERN}/search\?q=([^&#]+)" example = "https://skeb.jp/search?q=QUERY" def 
metadata(self): @@ -298,7 +298,7 @@ class SkebSearchExtractor(SkebExtractor): class SkebFollowingExtractor(SkebExtractor): """Extractor for all creators followed by a skeb user""" subcategory = "following" - pattern = USER_PATTERN + r"/following_creators" + pattern = rf"{USER_PATTERN}/following_creators" example = "https://skeb.jp/@USER/following_creators" items = SkebExtractor.items_users @@ -312,7 +312,7 @@ class SkebFollowingExtractor(SkebExtractor): class SkebFollowingUsersExtractor(SkebExtractor): """Extractor for your followed users""" subcategory = "following-users" - pattern = BASE_PATTERN + r"/following_users" + pattern = rf"{BASE_PATTERN}/following_users" example = "https://skeb.jp/following_users" items = SkebExtractor.items_users diff --git a/gallery_dl/extractor/slickpic.py b/gallery_dl/extractor/slickpic.py index ee877f2..6f723c8 100644 --- a/gallery_dl/extractor/slickpic.py +++ b/gallery_dl/extractor/slickpic.py @@ -32,7 +32,7 @@ class SlickpicAlbumExtractor(SlickpicExtractor): "{album[id]} {album[title]}") filename_fmt = "{num:>03}_{id}{title:?_//}.{extension}" archive_fmt = "{id}" - pattern = BASE_PATTERN + r"/albums/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/albums/([^/?#]+)" example = "https://USER.slickpic.com/albums/TITLE/" def __init__(self, match): @@ -56,7 +56,7 @@ class SlickpicAlbumExtractor(SlickpicExtractor): "count": len(imgs), } - yield Message.Directory, data + yield Message.Directory, "", data for num, img in enumerate(imgs, 1): url = img["url_rsz"] + "/o/" + img["fname"] img = text.nameext_from_url(img["fname"], { @@ -110,7 +110,7 @@ class SlickpicAlbumExtractor(SlickpicExtractor): class SlickpicUserExtractor(SlickpicExtractor): subcategory = "user" - pattern = BASE_PATTERN + r"(?:/gallery)?/?(?:$|[?#])" + pattern = rf"{BASE_PATTERN}(?:/gallery)?/?(?:$|[?#])" example = "https://USER.slickpic.com/" def items(self): diff --git a/gallery_dl/extractor/slideshare.py b/gallery_dl/extractor/slideshare.py index c0f0e36..1bb70ed 100644 --- 
a/gallery_dl/extractor/slideshare.py +++ b/gallery_dl/extractor/slideshare.py @@ -10,7 +10,6 @@ """Extractors for https://www.slideshare.net/""" from .common import GalleryExtractor -from .. import text class SlidesharePresentationExtractor(GalleryExtractor): @@ -40,8 +39,8 @@ class SlidesharePresentationExtractor(GalleryExtractor): "description" : slideshow["description"].strip(), "views" : slideshow["views"], "likes" : slideshow["likes"], - "date" : text.parse_datetime( - slideshow["createdAt"], "%Y-%m-%d %H:%M:%S %Z"), + "date" : self.parse_datetime_iso( + slideshow["createdAt"][:19]), } def images(self, page): diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py index e9c89a1..902044c 100644 --- a/gallery_dl/extractor/smugmug.py +++ b/gallery_dl/extractor/smugmug.py @@ -81,7 +81,7 @@ class SmugmugAlbumExtractor(SmugmugExtractor): del album["Uris"] data = {"Album": album, "User": user} - yield Message.Directory, data + yield Message.Directory, "", data for image in self.api.album_images(self.album_id, "ImageSizeDetails"): url = self._select_format(image) @@ -93,7 +93,7 @@ class SmugmugImageExtractor(SmugmugExtractor): """Extractor for individual smugmug images""" subcategory = "image" archive_fmt = "{Image[ImageKey]}" - pattern = BASE_PATTERN + r"(?:/[^/?#]+)+/i-([^/?#-]+)" + pattern = rf"{BASE_PATTERN}(?:/[^/?#]+)+/i-([^/?#-]+)" example = "https://USER.smugmug.com/PATH/i-ID" def __init__(self, match): @@ -107,14 +107,14 @@ class SmugmugImageExtractor(SmugmugExtractor): data = {"Image": image} text.nameext_from_url(url, data) - yield Message.Directory, data + yield Message.Directory, "", data yield Message.Url, url, data class SmugmugPathExtractor(SmugmugExtractor): """Extractor for smugmug albums from URL paths and users""" subcategory = "path" - pattern = BASE_PATTERN + r"((?:/[^/?#a-fh-mo-z][^/?#]*)*)/?$" + pattern = rf"{BASE_PATTERN}((?:/[^/?#a-fh-mo-z][^/?#]*)*)/?$" example = "https://USER.smugmug.com/PATH" def __init__(self, 
match): diff --git a/gallery_dl/extractor/soundgasm.py b/gallery_dl/extractor/soundgasm.py index 79ab74d..a4617dd 100644 --- a/gallery_dl/extractor/soundgasm.py +++ b/gallery_dl/extractor/soundgasm.py @@ -26,7 +26,7 @@ class SoundgasmExtractor(Extractor): def items(self): for sound in map(self._extract_sound, self.sounds()): url = sound["url"] - yield Message.Directory, sound + yield Message.Directory, "", sound yield Message.Url, url, text.nameext_from_url(url, sound) def _extract_sound(self, url): @@ -50,7 +50,7 @@ class SoundgasmExtractor(Extractor): class SoundgasmAudioExtractor(SoundgasmExtractor): """Extractor for audio clips from soundgasm.net""" subcategory = "audio" - pattern = BASE_PATTERN + r"/([^/?#]+)/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/([^/?#]+)/([^/?#]+)" example = "https://soundgasm.net/u/USER/TITLE" def __init__(self, match): @@ -64,7 +64,7 @@ class SoundgasmAudioExtractor(SoundgasmExtractor): class SoundgasmUserExtractor(SoundgasmExtractor): """Extractor for all sounds from a soundgasm user""" subcategory = "user" - pattern = BASE_PATTERN + r"/([^/?#]+)/?$" + pattern = rf"{BASE_PATTERN}/([^/?#]+)/?$" example = "https://soundgasm.net/u/USER" def __init__(self, match): diff --git a/gallery_dl/extractor/speakerdeck.py b/gallery_dl/extractor/speakerdeck.py index b809b7f..412b3b7 100644 --- a/gallery_dl/extractor/speakerdeck.py +++ b/gallery_dl/extractor/speakerdeck.py @@ -9,7 +9,7 @@ """Extractors for https://speakerdeck.com/""" from .common import GalleryExtractor -from .. import text, util +from .. 
import text class SpeakerdeckPresentationExtractor(GalleryExtractor): @@ -46,7 +46,7 @@ class SpeakerdeckPresentationExtractor(GalleryExtractor): def images(self, _): url = f"{self.root}/player/{self.presentation_id}" page = self.request(url).text - page = util.re(r"\s+").sub(" ", page) + page = text.re(r"\s+").sub(" ", page) return [ (url, None) for url in text.extract_iter(page, 'js-sd-slide" data-url="', '"') diff --git a/gallery_dl/extractor/steamgriddb.py b/gallery_dl/extractor/steamgriddb.py index e17b9fd..c3af7fd 100644 --- a/gallery_dl/extractor/steamgriddb.py +++ b/gallery_dl/extractor/steamgriddb.py @@ -59,7 +59,7 @@ class SteamgriddbExtractor(Extractor): fake_png = download_fake_png and asset.get("fake_png") asset["count"] = 2 if fake_png else 1 - yield Message.Directory, asset + yield Message.Directory, "", asset asset["num"] = 1 url = asset["url"] @@ -157,7 +157,7 @@ class SteamgriddbAssetsExtractor(SteamgriddbExtractor): class SteamgriddbAssetExtractor(SteamgriddbExtractor): """Extractor for a single asset""" subcategory = "asset" - pattern = BASE_PATTERN + r"/(grid|hero|logo|icon)/(\d+)" + pattern = rf"{BASE_PATTERN}/(grid|hero|logo|icon)/(\d+)" example = "https://www.steamgriddb.com/grid/1234" def __init__(self, match): @@ -177,7 +177,7 @@ class SteamgriddbAssetExtractor(SteamgriddbExtractor): class SteamgriddbGridsExtractor(SteamgriddbAssetsExtractor): subcategory = "grids" asset_type = "grid" - pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/grids(?:/(\d+))?" + pattern = rf"{BASE_PATTERN}/(game|collection)/(\d+)/grids(?:/(\d+))?" example = "https://www.steamgriddb.com/game/1234/grids" valid_dimensions = ("460x215", "920x430", "600x900", "342x482", "660x930", "512x512", "1024x1024") @@ -189,7 +189,7 @@ class SteamgriddbGridsExtractor(SteamgriddbAssetsExtractor): class SteamgriddbHeroesExtractor(SteamgriddbAssetsExtractor): subcategory = "heroes" asset_type = "hero" - pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/heroes(?:/(\d+))?" 
+ pattern = rf"{BASE_PATTERN}/(game|collection)/(\d+)/heroes(?:/(\d+))?" example = "https://www.steamgriddb.com/game/1234/heroes" valid_dimensions = ("1920x620", "3840x1240", "1600x650") valid_styles = ("alternate", "blurred", "material") @@ -199,7 +199,7 @@ class SteamgriddbHeroesExtractor(SteamgriddbAssetsExtractor): class SteamgriddbLogosExtractor(SteamgriddbAssetsExtractor): subcategory = "logos" asset_type = "logo" - pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/logos(?:/(\d+))?" + pattern = rf"{BASE_PATTERN}/(game|collection)/(\d+)/logos(?:/(\d+))?" example = "https://www.steamgriddb.com/game/1234/logos" valid_dimensions = None valid_styles = ("official", "white", "black", "custom") @@ -209,7 +209,7 @@ class SteamgriddbLogosExtractor(SteamgriddbAssetsExtractor): class SteamgriddbIconsExtractor(SteamgriddbAssetsExtractor): subcategory = "icons" asset_type = "icon" - pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/icons(?:/(\d+))?" + pattern = rf"{BASE_PATTERN}/(game|collection)/(\d+)/icons(?:/(\d+))?" 
example = "https://www.steamgriddb.com/game/1234/icons" valid_dimensions = [f"{i}x{i}" for i in (8, 10, 14, 16, 20, 24, 28, 32, 35, 40, 48, 54, 56, 57, 60, 64, 72, 76, 80, 90, diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py index 989e6cc..280c8d7 100644 --- a/gallery_dl/extractor/subscribestar.py +++ b/gallery_dl/extractor/subscribestar.py @@ -46,14 +46,20 @@ class SubscribestarExtractor(Extractor): content, "<body>", "</body>") data["title"] = text.unescape(text.rextr(content, "<h1>", "</h1>")) - yield Message.Directory, data + yield Message.Directory, "", data for num, item in enumerate(media, 1): item.update(data) item["num"] = num - text.nameext_from_url(item.get("name") or item["url"], item) - if item["url"][0] == "/": - item["url"] = self.root + item["url"] - yield Message.Url, item["url"], item + + url = item["url"] + if name := (item.get("name") or item.get("original_filename")): + text.nameext_from_name(name, item) + else: + text.nameext_from_url(url, item) + + if url[0] == "/": + url = f"{self.root}{url}" + yield Message.Url, url, item def posts(self): """Yield HTML content of all relevant posts""" @@ -155,7 +161,7 @@ class SubscribestarExtractor(Extractor): attachments = text.extr( html, 'class="uploads-docs"', 'class="post-edit_form"') if attachments: - for att in util.re(r'class="doc_preview[" ]').split( + for att in text.re(r'class="doc_preview[" ]').split( attachments)[1:]: media.append({ "id" : text.parse_int(text.extr( @@ -169,7 +175,7 @@ class SubscribestarExtractor(Extractor): audios = text.extr( html, 'class="uploads-audios"', 'class="post-edit_form"') if audios: - for audio in util.re(r'class="audio_preview-data[" ]').split( + for audio in text.re(r'class="audio_preview-data[" ]').split( audios)[1:]: media.append({ "id" : text.parse_int(text.extr( @@ -202,9 +208,9 @@ class SubscribestarExtractor(Extractor): def _parse_datetime(self, dt): if dt.startswith("Updated on "): dt = dt[11:] - date = 
text.parse_datetime(dt, "%b %d, %Y %I:%M %p") + date = self.parse_datetime(dt, "%b %d, %Y %I:%M %p") if date is dt: - date = text.parse_datetime(dt, "%B %d, %Y %I:%M %p") + date = self.parse_datetime(dt, "%B %d, %Y %I:%M %p") return date def _warn_preview(self): @@ -215,7 +221,7 @@ class SubscribestarExtractor(Extractor): class SubscribestarUserExtractor(SubscribestarExtractor): """Extractor for media from a subscribestar user""" subcategory = "user" - pattern = BASE_PATTERN + r"/(?!posts/)([^/?#]+)" + pattern = rf"{BASE_PATTERN}/(?!posts/)([^/?#]+)" example = "https://www.subscribestar.com/USER" def posts(self): @@ -237,7 +243,7 @@ class SubscribestarUserExtractor(SubscribestarExtractor): class SubscribestarPostExtractor(SubscribestarExtractor): """Extractor for media from a single subscribestar post""" subcategory = "post" - pattern = BASE_PATTERN + r"/posts/(\d+)" + pattern = rf"{BASE_PATTERN}/posts/(\d+)" example = "https://www.subscribestar.com/posts/12345" def posts(self): diff --git a/gallery_dl/extractor/sxypix.py b/gallery_dl/extractor/sxypix.py new file mode 100644 index 0000000..c9a1701 --- /dev/null +++ b/gallery_dl/extractor/sxypix.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://sxypix.com/""" + +from .common import GalleryExtractor +from .. 
import text + + +class SxypixGalleryExtractor(GalleryExtractor): + """Extractor for image galleries from sxypix.com""" + category = "sxypix" + root = "https://sxypix.com" + pattern = r"(?:https?://)?(?:www\.)?sxypix\.com(/w/(\w+))" + example = "https://sxypix.com/w/2bbaf1b24a5863d0e73436619bbaa7ee" + + def metadata(self, page): + return { + "gallery_id": self.groups[1], + "title": text.unescape(text.extr( + page, '<meta name="keywords" content="', '"')), + } + + def images(self, page): + data = { + "aid" : text.extr(page, "data-aid='", "'"), + "ghash": text.extr(page, "data-ghash='", "'"), + } + gallery = self.request_json( + "https://sxypix.com/php/gall.php", method="POST", data=data) + + base = "https://x." + return [ + (base + text.extr(entry, "data-src='//.", "'"), None) + for entry in gallery["r"] + ] diff --git a/gallery_dl/extractor/szurubooru.py b/gallery_dl/extractor/szurubooru.py index 190ccbf..59477cc 100644 --- a/gallery_dl/extractor/szurubooru.py +++ b/gallery_dl/extractor/szurubooru.py @@ -57,8 +57,7 @@ class SzurubooruExtractor(booru.BooruExtractor): return url def _prepare(self, post): - post["date"] = text.parse_datetime( - post["creationTime"], "%Y-%m-%dT%H:%M:%S.%fZ") + post["date"] = self.parse_datetime_iso(post["creationTime"]) tags = [] tags_categories = collections.defaultdict(list) @@ -94,7 +93,7 @@ class SzurubooruTagExtractor(SzurubooruExtractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{id}_{version}" - pattern = BASE_PATTERN + r"/posts(?:/query=([^/?#]*))?" + pattern = rf"{BASE_PATTERN}/posts(?:/query=([^/?#]*))?" 
example = "https://booru.bcbnsfw.space/posts/query=TAG" def __init__(self, match): @@ -117,7 +116,7 @@ class SzurubooruTagExtractor(SzurubooruExtractor): class SzurubooruPostExtractor(SzurubooruExtractor): subcategory = "post" archive_fmt = "{id}_{version}" - pattern = BASE_PATTERN + r"/post/(\d+)" + pattern = rf"{BASE_PATTERN}/post/(\d+)" example = "https://booru.bcbnsfw.space/post/12345" def posts(self): diff --git a/gallery_dl/extractor/tapas.py b/gallery_dl/extractor/tapas.py index d823f6a..5f8cb67 100644 --- a/gallery_dl/extractor/tapas.py +++ b/gallery_dl/extractor/tapas.py @@ -72,7 +72,7 @@ class TapasExtractor(Extractor): class TapasEpisodeExtractor(TapasExtractor): subcategory = "episode" - pattern = BASE_PATTERN + r"/episode/(\d+)" + pattern = rf"{BASE_PATTERN}/episode/(\d+)" example = "https://tapas.io/episode/12345" def items(self): @@ -89,8 +89,8 @@ class TapasEpisodeExtractor(TapasExtractor): html = data["html"] episode["series"] = self._extract_series(html) - episode["date"] = text.parse_datetime(episode["publish_date"]) - yield Message.Directory, episode + episode["date"] = self.parse_datetime_iso(episode["publish_date"]) + yield Message.Directory, "", episode if episode["book"]: content = text.extr( @@ -116,7 +116,7 @@ class TapasEpisodeExtractor(TapasExtractor): class TapasSeriesExtractor(TapasExtractor): subcategory = "series" - pattern = BASE_PATTERN + r"/series/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/series/([^/?#]+)" example = "https://tapas.io/series/TITLE" def items(self): @@ -150,7 +150,7 @@ class TapasSeriesExtractor(TapasExtractor): class TapasCreatorExtractor(TapasExtractor): subcategory = "creator" - pattern = BASE_PATTERN + r"/(?!series|episode)([^/?#]+)" + pattern = rf"{BASE_PATTERN}/(?!series|episode)([^/?#]+)" example = "https://tapas.io/CREATOR" def items(self): diff --git a/gallery_dl/extractor/tcbscans.py b/gallery_dl/extractor/tcbscans.py index 6dcb153..e27ef0d 100644 --- a/gallery_dl/extractor/tcbscans.py +++ 
b/gallery_dl/extractor/tcbscans.py @@ -15,7 +15,7 @@ BASE_PATTERN = (r"(?:https?://)?(?:tcb(?:-backup\.bihar-mirchi|scans)" class TcbscansChapterExtractor(ChapterExtractor): category = "tcbscans" - pattern = BASE_PATTERN + r"(/chapters/\d+/[^/?#]+)" + pattern = rf"{BASE_PATTERN}(/chapters/\d+/[^/?#]+)" example = "https://tcbscans.me/chapters/12345/MANGA-chapter-123" def __init__(self, match): @@ -44,7 +44,7 @@ class TcbscansChapterExtractor(ChapterExtractor): class TcbscansMangaExtractor(MangaExtractor): category = "tcbscans" chapterclass = TcbscansChapterExtractor - pattern = BASE_PATTERN + r"(/mangas/\d+/[^/?#]+)" + pattern = rf"{BASE_PATTERN}(/mangas/\d+/[^/?#]+)" example = "https://tcbscans.me/mangas/123/MANGA" def __init__(self, match): diff --git a/gallery_dl/extractor/telegraph.py b/gallery_dl/extractor/telegraph.py index 2713621..ab77b31 100644 --- a/gallery_dl/extractor/telegraph.py +++ b/gallery_dl/extractor/telegraph.py @@ -27,9 +27,8 @@ class TelegraphGalleryExtractor(GalleryExtractor): 'property="og:title" content="', '"')), "description": text.unescape(extr( 'property="og:description" content="', '"')), - "date": text.parse_datetime(extr( - 'property="article:published_time" content="', '"'), - "%Y-%m-%dT%H:%M:%S%z"), + "date": self.parse_datetime_iso(extr( + 'property="article:published_time" content="', '"')), "author": text.unescape(extr( 'property="article:author" content="', '"')), "post_url": text.unescape(extr( diff --git a/gallery_dl/extractor/tenor.py b/gallery_dl/extractor/tenor.py index 7e1f802..3e4bab0 100644 --- a/gallery_dl/extractor/tenor.py +++ b/gallery_dl/extractor/tenor.py @@ -40,16 +40,17 @@ class TenorExtractor(Extractor): continue url = fmt["url"] + title = gif.pop("h1_title", "") + gif["title"] = title[:-4] if title.endswith(" GIF") else title + gif["width"], gif["height"] = fmt.pop("dims") or (0, 0) + gif["description"] = gif.pop("content_description", "") gif["id_format"] = url.rsplit("/", 2)[1] gif["format"] = fmt["name"] - 
gif["width"], gif["height"] = fmt["dims"] gif["duration"] = fmt["duration"] gif["size"] = fmt["size"] - gif["title"] = gif["h1_title"][:-4] - gif["description"] = gif.pop("content_description", "") - gif["date"] = text.parse_timestamp(gif["created"]) + gif["date"] = self.parse_timestamp(gif["created"]) - yield Message.Directory, gif + yield Message.Directory, "", gif yield Message.Url, url, text.nameext_from_url(url, gif) def _extract_format(self, gif): @@ -110,7 +111,7 @@ class TenorExtractor(Extractor): class TenorImageExtractor(TenorExtractor): subcategory = "image" - pattern = BASE_PATTERN + r"view/(?:[^/?#]*-)?(\d+)" + pattern = rf"{BASE_PATTERN}view/(?:[^/?#]*-)?(\d+)" example = "https://tenor.com/view/SLUG-1234567890" def gifs(self): @@ -124,7 +125,7 @@ class TenorImageExtractor(TenorExtractor): class TenorSearchExtractor(TenorExtractor): subcategory = "search" directory_fmt = ("{category}", "{search_tags}") - pattern = BASE_PATTERN + r"search/([^/?#]+)" + pattern = rf"{BASE_PATTERN}search/([^/?#]+)" example = "https://tenor.com/search/QUERY" def gifs(self): @@ -140,7 +141,7 @@ class TenorSearchExtractor(TenorExtractor): class TenorUserExtractor(TenorExtractor): subcategory = "user" directory_fmt = ("{category}", "@{user[username]}") - pattern = BASE_PATTERN + r"(?:users|official)/([^/?#]+)" + pattern = rf"{BASE_PATTERN}(?:users|official)/([^/?#]+)" example = "https://tenor.com/users/USER" def gifs(self): diff --git a/gallery_dl/extractor/thehentaiworld.py b/gallery_dl/extractor/thehentaiworld.py index 9a30654..773f300 100644 --- a/gallery_dl/extractor/thehentaiworld.py +++ b/gallery_dl/extractor/thehentaiworld.py @@ -36,12 +36,12 @@ class ThehentaiworldExtractor(Extractor): if "file_urls" in post: urls = post["file_urls"] post["count"] = len(urls) - yield Message.Directory, post + yield Message.Directory, "", post for post["num"], url in enumerate(urls, 1): text.nameext_from_url(url, post) yield Message.Url, url, post else: - yield Message.Directory, post + 
yield Message.Directory, "", post url = post["file_url"] text.nameext_from_url(url, post) yield Message.Url, url, post @@ -56,8 +56,7 @@ class ThehentaiworldExtractor(Extractor): "id" : text.parse_int(extr(" postid-", " ")), "slug" : extr(" post-", '"'), "tags" : extr('id="tagsHead">', "</ul>"), - "date" : text.parse_datetime(extr( - "<li>Posted: ", "<"), "%Y-%m-%d"), + "date" : self.parse_datetime_iso(extr("<li>Posted: ", "<")), } if (c := url[27]) == "v": diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py index f450806..a4c7171 100644 --- a/gallery_dl/extractor/tiktok.py +++ b/gallery_dl/extractor/tiktok.py @@ -25,6 +25,7 @@ class TiktokExtractor(Extractor): def _init(self): self.audio = self.config("audio", True) self.video = self.config("videos", True) + self.cover = self.config("covers", False) def items(self): for tiktok_url in self.urls(): @@ -43,10 +44,10 @@ class TiktokExtractor(Extractor): post = video_detail["itemInfo"]["itemStruct"] post["user"] = (a := post.get("author")) and a["uniqueId"] or "" - post["date"] = text.parse_timestamp(post["createTime"]) + post["date"] = self.parse_timestamp(post["createTime"]) original_title = title = post["desc"] - yield Message.Directory, post + yield Message.Directory, "", post ytdl_media = False if "imagePost" in post: @@ -70,12 +71,14 @@ class TiktokExtractor(Extractor): if self.audio and "music" in post: if self.audio == "ytdl": ytdl_media = "audio" - else: - url = self._extract_audio(post) + elif url := self._extract_audio(post): yield Message.Url, url, post - elif self.video and "video" in post: - ytdl_media = "video" + elif "video" in post: + if self.video: + ytdl_media = "video" + if self.cover and (url := self._extract_cover(post, "video")): + yield Message.Url, url, post else: self.log.info("%s: Skipping post", tiktok_url) @@ -144,6 +147,30 @@ class TiktokExtractor(Extractor): post["extension"] = "mp3" return url + def _extract_cover(self, post, type): + media = post[type] + + for 
cover_id in ("thumbnail", "cover", "originCover", "dynamicCover"): + if url := media.get(cover_id): + break + else: + return + + text.nameext_from_url(url, post) + post.update({ + "type" : "cover", + "extension": "jpg", + "image" : url, + "title" : post["desc"] or f"TikTok {type} cover #{post['id']}", + "duration" : media.get("duration"), + "num" : 0, + "img_id" : "", + "cover_id" : cover_id, + "width" : 0, + "height" : 0, + }) + return url + def _check_status_code(self, detail, url): status = detail.get("statusCode") if not status: @@ -166,7 +193,7 @@ class TiktokExtractor(Extractor): class TiktokPostExtractor(TiktokExtractor): """Extract a single video or photo TikTok link""" subcategory = "post" - pattern = BASE_PATTERN + r"/(?:@([\w_.-]*)|share)/(?:phot|vide)o/(\d+)" + pattern = rf"{BASE_PATTERN}/(?:@([\w_.-]*)|share)/(?:phot|vide)o/(\d+)" example = "https://www.tiktok.com/@USER/photo/1234567890" def urls(self): @@ -199,7 +226,7 @@ class TiktokVmpostExtractor(TiktokExtractor): class TiktokUserExtractor(TiktokExtractor): """Extract a TikTok user's profile""" subcategory = "user" - pattern = BASE_PATTERN + r"/@([\w_.-]+)/?(?:$|\?|#)" + pattern = rf"{BASE_PATTERN}/@([\w_.-]+)/?(?:$|\?|#)" example = "https://www.tiktok.com/@USER" def _init(self): @@ -214,7 +241,7 @@ class TiktokUserExtractor(TiktokExtractor): except (ImportError, SyntaxError) as exc: self.log.error("Cannot import module '%s'", getattr(exc, "name", "")) - self.log.debug("", exc_info=exc) + self.log.traceback(exc) raise exception.ExtractionError("yt-dlp or youtube-dl is required " "for this feature!") @@ -254,7 +281,7 @@ class TiktokUserExtractor(TiktokExtractor): self.log.warning("Unable to extract 'avatar' URL (%s: %s)", exc.__class__.__name__, exc) else: - yield Message.Directory, avatar + yield Message.Directory, "", avatar yield Message.Url, avatar_url, avatar with ytdl_instance as ydl: diff --git a/gallery_dl/extractor/tmohentai.py b/gallery_dl/extractor/tmohentai.py index ef441d3..873cce8 
100644 --- a/gallery_dl/extractor/tmohentai.py +++ b/gallery_dl/extractor/tmohentai.py @@ -16,7 +16,7 @@ class TmohentaiGalleryExtractor(GalleryExtractor): category = "tmohentai" root = "http://tmohentai.com" directory_fmt = ("{category}", "{title} ({gallery_id})") - pattern = BASE_PATTERN + r"/(?:contents|reader)/(\w+)" + pattern = rf"{BASE_PATTERN}/(?:contents|reader)/(\w+)" example = "https://tmohentai.com/contents/12345a67b89c0" def __init__(self, match): diff --git a/gallery_dl/extractor/toyhouse.py b/gallery_dl/extractor/toyhouse.py index 7add79a..cc29b11 100644 --- a/gallery_dl/extractor/toyhouse.py +++ b/gallery_dl/extractor/toyhouse.py @@ -34,7 +34,7 @@ class ToyhouseExtractor(Extractor): post.update(metadata) text.nameext_from_url(post["url"], post) post["id"], _, post["hash"] = post["filename"].partition("_") - yield Message.Directory, post + yield Message.Directory, "", post yield Message.Url, post["url"], post def posts(self): @@ -51,7 +51,7 @@ class ToyhouseExtractor(Extractor): extr = text.extract_from(post) return { "url": extr(needle, '"'), - "date": text.parse_datetime(extr( + "date": self.parse_datetime(extr( '</h2>\n <div class="mb-1">', '<'), "%d %b %Y, %I:%M:%S %p"), "artists": [ @@ -104,7 +104,7 @@ class ToyhouseExtractor(Extractor): class ToyhouseArtExtractor(ToyhouseExtractor): """Extractor for artworks of a toyhouse user""" subcategory = "art" - pattern = BASE_PATTERN + r"/([^/?#]+)/art" + pattern = rf"{BASE_PATTERN}/([^/?#]+)/art" example = "https://www.toyhou.se/USER/art" def posts(self): diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py index 8732c60..1ccdafb 100644 --- a/gallery_dl/extractor/tsumino.py +++ b/gallery_dl/extractor/tsumino.py @@ -65,7 +65,7 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor): "title_jp" : title_jp, "thumbnail" : extr('"og:image" content="', '"'), "uploader" : text.remove_html(extr('id="Uploader">', '</div>')), - "date" : text.parse_datetime( + "date" : 
self.parse_datetime( extr('id="Uploaded">', '</div>').strip(), "%Y %B %d"), "rating" : text.parse_float(extr( 'id="Rating">', '</div>').partition(" ")[0]), diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index 92fc831..5bb5a40 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -9,8 +9,7 @@ """Extractors for https://www.tumblr.com/""" from .common import Extractor, Message -from .. import text, util, oauth, exception -from datetime import datetime, date, timedelta +from .. import text, util, dt, oauth, exception BASE_PATTERN = ( @@ -61,16 +60,16 @@ class TumblrExtractor(Extractor): blog = None # pre-compile regular expressions - self._sub_video = util.re( + self._sub_video = text.re( r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com" r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)").sub if self.inline: - self._sub_image = util.re( + self._sub_image = text.re( r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?" r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)").sub - self._subn_orig_image = util.re(r"/s\d+x\d+/").subn - _findall_image = util.re('<img src="([^"]+)"').findall - _findall_video = util.re('<source src="([^"]+)"').findall + self._subn_orig_image = text.re(r"/s\d+x\d+/").subn + _findall_image = text.re('<img src="([^"]+)"').findall + _findall_video = text.re('<source src="([^"]+)"').findall for post in self.posts(): if self.date_min > post["timestamp"]: @@ -88,7 +87,7 @@ class TumblrExtractor(Extractor): if self.avatar: url = self.api.avatar(self.blog) - yield Message.Directory, {"blog": blog} + yield Message.Directory, "", {"blog": blog} yield self._prepare_avatar(url, post.copy(), blog) post["blog"] = blog @@ -100,7 +99,7 @@ class TumblrExtractor(Extractor): if "trail" in post: del post["trail"] - post["date"] = text.parse_timestamp(post["timestamp"]) + post["date"] = self.parse_timestamp(post["timestamp"]) posts = [] if "photos" in post: # type "photo" or "link" @@ -161,7 +160,7 @@ class 
TumblrExtractor(Extractor): del post["extension"] post["count"] = len(posts) - yield Message.Directory, post + yield Message.Directory, "", post for num, (msg, url, post) in enumerate(posts, 1): post["num"] = num @@ -271,7 +270,7 @@ class TumblrExtractor(Extractor): class TumblrUserExtractor(TumblrExtractor): """Extractor for a Tumblr user's posts""" subcategory = "user" - pattern = BASE_PATTERN + r"(?:/page/\d+|/archive)?/?$" + pattern = rf"{BASE_PATTERN}(?:/page/\d+|/archive)?/?$" example = "https://www.tumblr.com/BLOG" def posts(self): @@ -281,7 +280,7 @@ class TumblrUserExtractor(TumblrExtractor): class TumblrPostExtractor(TumblrExtractor): """Extractor for a single Tumblr post""" subcategory = "post" - pattern = BASE_PATTERN + r"/(?:post/|image/)?(\d+)" + pattern = rf"{BASE_PATTERN}/(?:post/|image/)?(\d+)" example = "https://www.tumblr.com/BLOG/12345" def posts(self): @@ -296,7 +295,7 @@ class TumblrPostExtractor(TumblrExtractor): class TumblrTagExtractor(TumblrExtractor): """Extractor for Tumblr user's posts by tag""" subcategory = "tag" - pattern = BASE_PATTERN + r"(?:/archive)?/tagged/([^/?#]+)" + pattern = rf"{BASE_PATTERN}(?:/archive)?/tagged/([^/?#]+)" example = "https://www.tumblr.com/BLOG/tagged/TAG" def posts(self): @@ -308,12 +307,12 @@ class TumblrTagExtractor(TumblrExtractor): class TumblrDayExtractor(TumblrExtractor): """Extractor for Tumblr user's posts by day""" subcategory = "day" - pattern = BASE_PATTERN + r"/day/(\d\d\d\d/\d\d/\d\d)" + pattern = rf"{BASE_PATTERN}/day/(\d\d\d\d/\d\d/\d\d)" example = "https://www.tumblr.com/BLOG/day/1970/01/01" def posts(self): year, month, day = self.groups[3].split("/") - ordinal = date(int(year), int(month), int(day)).toordinal() + ordinal = dt.date(int(year), int(month), int(day)).toordinal() # 719163 == date(1970, 1, 1).toordinal() self.date_min = (ordinal - 719163) * 86400 @@ -326,7 +325,7 @@ class TumblrLikesExtractor(TumblrExtractor): subcategory = "likes" directory_fmt = ("{category}", "{blog_name}", 
"likes") archive_fmt = "f_{blog[name]}_{id}_{num}" - pattern = BASE_PATTERN + r"/likes" + pattern = rf"{BASE_PATTERN}/likes" example = "https://www.tumblr.com/BLOG/likes" def posts(self): @@ -336,7 +335,7 @@ class TumblrLikesExtractor(TumblrExtractor): class TumblrFollowingExtractor(TumblrExtractor): """Extractor for a Tumblr user's followed blogs""" subcategory = "following" - pattern = BASE_PATTERN + r"/following" + pattern = rf"{BASE_PATTERN}/following" example = "https://www.tumblr.com/BLOG/following" items = TumblrExtractor.items_blogs @@ -348,7 +347,7 @@ class TumblrFollowingExtractor(TumblrExtractor): class TumblrFollowersExtractor(TumblrExtractor): """Extractor for a Tumblr user's followers""" subcategory = "followers" - pattern = BASE_PATTERN + r"/followers" + pattern = rf"{BASE_PATTERN}/followers" example = "https://www.tumblr.com/BLOG/followers" items = TumblrExtractor.items_blogs @@ -514,7 +513,7 @@ class TumblrAPI(oauth.OAuth1API): self.extractor.wait(seconds=reset) continue - t = (datetime.now() + timedelta(0, float(reset))).time() + t = (dt.now() + dt.timedelta(0, float(reset))).time() raise exception.AbortExtraction( f"Aborting - Rate limit will reset at " f"{t.hour:02}:{t.minute:02}:{t.second:02}") diff --git a/gallery_dl/extractor/tumblrgallery.py b/gallery_dl/extractor/tumblrgallery.py index 26868ec..68c9ec7 100644 --- a/gallery_dl/extractor/tumblrgallery.py +++ b/gallery_dl/extractor/tumblrgallery.py @@ -36,7 +36,7 @@ class TumblrgalleryExtractor(GalleryExtractor): class TumblrgalleryTumblrblogExtractor(TumblrgalleryExtractor): """Extractor for Tumblrblog on tumblrgallery.xyz""" subcategory = "tumblrblog" - pattern = BASE_PATTERN + r"(/tumblrblog/gallery/(\d+)\.html)" + pattern = rf"{BASE_PATTERN}(/tumblrblog/gallery/(\d+)\.html)" example = "https://tumblrgallery.xyz/tumblrblog/gallery/12345.html" def __init__(self, match): @@ -68,7 +68,7 @@ class TumblrgalleryTumblrblogExtractor(TumblrgalleryExtractor): class 
TumblrgalleryPostExtractor(TumblrgalleryExtractor): """Extractor for Posts on tumblrgallery.xyz""" subcategory = "post" - pattern = BASE_PATTERN + r"(/post/(\d+)\.html)" + pattern = rf"{BASE_PATTERN}(/post/(\d+)\.html)" example = "https://tumblrgallery.xyz/post/12345.html" def __init__(self, match): @@ -93,7 +93,7 @@ class TumblrgallerySearchExtractor(TumblrgalleryExtractor): subcategory = "search" filename_fmt = "{category}_{num:>03}_{gallery_id}_{id}_{title}.{extension}" directory_fmt = ("{category}", "{search_term}") - pattern = BASE_PATTERN + r"(/s\.php\?q=([^&#]+))" + pattern = rf"{BASE_PATTERN}(/s\.php\?q=([^&#]+))" example = "https://tumblrgallery.xyz/s.php?q=QUERY" def __init__(self, match): diff --git a/gallery_dl/extractor/tungsten.py b/gallery_dl/extractor/tungsten.py index 45836a9..67c0b50 100644 --- a/gallery_dl/extractor/tungsten.py +++ b/gallery_dl/extractor/tungsten.py @@ -23,10 +23,10 @@ class TungstenExtractor(Extractor): def items(self): for post in self.posts(): url = post["original_url"] - post["date"] = text.parse_datetime(post["created_at"]) + post["date"] = self.parse_datetime_iso(post["created_at"]) post["filename"] = url[url.rfind("/")+1:] post["extension"] = "webp" - yield Message.Directory, post + yield Message.Directory, "", post yield Message.Url, url, post def _pagination(self, url, params): diff --git a/gallery_dl/extractor/twibooru.py b/gallery_dl/extractor/twibooru.py index 4f9fe84..e21ef2a 100644 --- a/gallery_dl/extractor/twibooru.py +++ b/gallery_dl/extractor/twibooru.py @@ -37,8 +37,7 @@ class TwibooruExtractor(BooruExtractor): return post["view_url"] def _prepare(self, post): - post["date"] = text.parse_datetime( - post["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ") + post["date"] = self.parse_datetime_iso(post["created_at"]) if "name" in post: name, sep, rest = post["name"].rpartition(".") @@ -49,7 +48,7 @@ class TwibooruPostExtractor(TwibooruExtractor): """Extractor for single twibooru posts""" subcategory = "post" request_interval 
= (0.5, 1.5) - pattern = BASE_PATTERN + r"/(\d+)" + pattern = rf"{BASE_PATTERN}/(\d+)" example = "https://twibooru.org/12345" def __init__(self, match): @@ -64,7 +63,7 @@ class TwibooruSearchExtractor(TwibooruExtractor): """Extractor for twibooru search results""" subcategory = "search" directory_fmt = ("{category}", "{search_tags}") - pattern = BASE_PATTERN + r"/(?:search/?\?([^#]+)|tags/([^/?#]+))" + pattern = rf"{BASE_PATTERN}/(?:search/?\?([^#]+)|tags/([^/?#]+))" example = "https://twibooru.org/search?q=TAG" def __init__(self, match): @@ -98,7 +97,7 @@ class TwibooruGalleryExtractor(TwibooruExtractor): subcategory = "gallery" directory_fmt = ("{category}", "galleries", "{gallery[id]} {gallery[title]}") - pattern = BASE_PATTERN + r"/galleries/(\d+)" + pattern = rf"{BASE_PATTERN}/galleries/(\d+)" example = "https://twibooru.org/galleries/12345" def __init__(self, match): @@ -146,8 +145,8 @@ class TwibooruAPI(): return response.json() if response.status_code == 429: - until = text.parse_datetime( - response.headers["X-RL-Reset"], "%Y-%m-%d %H:%M:%S %Z") + until = self.parse_datetime_iso( + response.headers["X-RL-Reset"][:19]) # wait an extra minute, just to be safe self.extractor.wait(until=until, adjust=60.0) continue diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index bf125a6..546e8e1 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -76,7 +76,7 @@ class TwitterExtractor(Extractor): seen_tweets = set() if self.config("unique", True) else None if self.twitpic: - self._find_twitpic = util.re( + self._find_twitpic = text.re( r"https?(://twitpic\.com/(?!photos/)\w+)").findall tweets = self.tweets() @@ -124,12 +124,11 @@ class TwitterExtractor(Extractor): tdata = self._transform_tweet(tweet) tdata.update(metadata) tdata["count"] = len(files) - yield Message.Directory, tdata + yield Message.Directory, "", tdata - del tdata["source_id"] - del tdata["sensitive_flags"] - if "source_user" in tdata: - del 
tdata["source_user"] + tdata.pop("source_id", None) + tdata.pop("source_user", None) + tdata.pop("sensitive_flags", None) for tdata["num"], file in enumerate(files, 1): file.update(tdata) @@ -146,7 +145,7 @@ class TwitterExtractor(Extractor): self._extract_media( data, data["extended_entities"]["media"], files) except Exception as exc: - self.log.debug("", exc_info=exc) + self.log.traceback(exc) self.log.warning( "%s: Error while extracting media files (%s: %s)", data["id_str"], exc.__class__.__name__, exc) @@ -155,7 +154,7 @@ class TwitterExtractor(Extractor): try: self._extract_card(tweet, files) except Exception as exc: - self.log.debug("", exc_info=exc) + self.log.traceback(exc) self.log.warning( "%s: Error while extracting Card files (%s: %s)", data["id_str"], exc.__class__.__name__, exc) @@ -164,7 +163,7 @@ class TwitterExtractor(Extractor): try: self._extract_twitpic(data, files) except Exception as exc: - self.log.debug("", exc_info=exc) + self.log.traceback(exc) self.log.warning( "%s: Error while extracting TwitPic files (%s: %s)", data["id_str"], exc.__class__.__name__, exc) @@ -347,32 +346,36 @@ class TwitterExtractor(Extractor): files.append({"url": url}) def _transform_tweet(self, tweet): + if "legacy" in tweet: + legacy = tweet["legacy"] + else: + legacy = tweet + tweet_id = int(legacy["id_str"]) + if "author" in tweet: author = tweet["author"] elif "core" in tweet: - author = tweet["core"]["user_results"]["result"] + try: + author = tweet["core"]["user_results"]["result"] + except KeyError: + self.log.warning("%s: Missing 'author' data", tweet_id) + author = util.NONE else: author = tweet["user"] author = self._transform_user(author) - if "legacy" in tweet: - legacy = tweet["legacy"] - else: - legacy = tweet - tget = legacy.get - - tweet_id = int(legacy["id_str"]) if tweet_id >= 300000000000000: - date = text.parse_timestamp( + date = self.parse_timestamp( ((tweet_id >> 22) + 1288834974657) // 1000) else: try: - date = text.parse_datetime( + date = 
self.parse_datetime( legacy["created_at"], "%a %b %d %H:%M:%S %z %Y") except Exception: date = util.NONE source = tweet.get("source") + tget = legacy.get tdata = { "tweet_id" : tweet_id, "retweet_id" : text.parse_int( @@ -439,6 +442,8 @@ class TwitterExtractor(Extractor): txt, _, tco = content.rpartition(" ") tdata["content"] = txt if tco.startswith("https://t.co/") else content + if "pinned" in tweet: + tdata["pinned"] = True if "birdwatch_pivot" in tweet: try: tdata["birdwatch"] = \ @@ -455,7 +460,7 @@ class TwitterExtractor(Extractor): tdata, legacy["extended_entities"]["media"][0]) if tdata["retweet_id"]: tdata["content"] = f"RT @{author['name']}: {tdata['content']}" - tdata["date_original"] = text.parse_timestamp( + tdata["date_original"] = self.parse_timestamp( ((tdata["retweet_id"] >> 22) + 1288834974657) // 1000) return tdata @@ -492,7 +497,7 @@ class TwitterExtractor(Extractor): "id": text.parse_int(cid), "name": com.get("name"), "description": com.get("description"), - "date": text.parse_timestamp(com.get("created_at", 0) // 1000), + "date": self.parse_timestamp(com.get("created_at", 0) // 1000), "nsfw": com.get("is_nsfw"), "role": com.get("role"), "member_count": com.get("member_count"), @@ -528,13 +533,13 @@ class TwitterExtractor(Extractor): "id" : text.parse_int(uid), "name" : core.get("screen_name"), "nick" : core.get("name"), - "location" : user["location"]["location"], - "date" : text.parse_datetime( + "location" : user["location"].get("location"), + "date" : self.parse_datetime( core["created_at"], "%a %b %d %H:%M:%S %z %Y"), "verified" : user["verification"]["verified"], "protected" : user["privacy"]["protected"], "profile_banner" : lget("profile_banner_url", ""), - "profile_image" : user["avatar"]["image_url"].replace( + "profile_image" : user["avatar"].get("image_url", "").replace( "_normal.", "."), "favourites_count": lget("favourites_count"), "followers_count" : lget("followers_count"), @@ -591,9 +596,12 @@ class TwitterExtractor(Extractor): 
obj = tweet["legacy"] if "legacy" in tweet else tweet cid = obj.get("conversation_id_str") if not cid: - tid = obj["id_str"] - self.log.warning( - "Unable to expand %s (no 'conversation_id')", tid) + if cid is False: + yield tweet + else: + tid = obj["id_str"] + self.log.warning( + "Unable to expand %s (no 'conversation_id')", tid) continue if cid in seen: self.log.debug( @@ -608,6 +616,7 @@ class TwitterExtractor(Extractor): def _make_tweet(self, user, url, id_str): return { "id_str": id_str, + "conversation_id_str": False, "lang": None, "user": user, "source": "><", @@ -658,8 +667,8 @@ class TwitterExtractor(Extractor): class TwitterHomeExtractor(TwitterExtractor): """Extractor for Twitter home timelines""" subcategory = "home" - pattern = (BASE_PATTERN + - r"/(?:home(?:/fo(?:llowing|r[-_ ]?you()))?|i/timeline)/?$") + pattern = (rf"{BASE_PATTERN}/" + rf"(?:home(?:/fo(?:llowing|r[-_ ]?you()))?|i/timeline)/?$") example = "https://x.com/home" def tweets(self): @@ -671,7 +680,7 @@ class TwitterHomeExtractor(TwitterExtractor): class TwitterSearchExtractor(TwitterExtractor): """Extractor for Twitter search results""" subcategory = "search" - pattern = BASE_PATTERN + r"/search/?\?(?:[^&#]+&)*q=([^&#]+)" + pattern = rf"{BASE_PATTERN}/search/?\?(?:[^&#]+&)*q=([^&#]+)" example = "https://x.com/search?q=QUERY" def metadata(self): @@ -702,7 +711,7 @@ class TwitterSearchExtractor(TwitterExtractor): class TwitterHashtagExtractor(TwitterExtractor): """Extractor for Twitter hashtags""" subcategory = "hashtag" - pattern = BASE_PATTERN + r"/hashtag/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/hashtag/([^/?#]+)" example = "https://x.com/hashtag/NAME" def items(self): @@ -713,7 +722,7 @@ class TwitterHashtagExtractor(TwitterExtractor): class TwitterUserExtractor(Dispatch, TwitterExtractor): """Extractor for a Twitter user""" - pattern = (BASE_PATTERN + r"/(?:" + pattern = (rf"{BASE_PATTERN}/(?:" r"([^/?#]+)/?(?:$|\?|#)" r"|i(?:/user/|ntent/user\?user_id=)(\d+))") example = 
"https://x.com/USER" @@ -890,7 +899,7 @@ class TwitterLikesExtractor(TwitterExtractor): class TwitterBookmarkExtractor(TwitterExtractor): """Extractor for bookmarked tweets""" subcategory = "bookmark" - pattern = BASE_PATTERN + r"/i/bookmarks()" + pattern = rf"{BASE_PATTERN}/i/bookmarks()" example = "https://x.com/i/bookmarks" def tweets(self): @@ -898,7 +907,7 @@ class TwitterBookmarkExtractor(TwitterExtractor): def _transform_tweet(self, tweet): tdata = TwitterExtractor._transform_tweet(self, tweet) - tdata["date_bookmarked"] = text.parse_timestamp( + tdata["date_bookmarked"] = self.parse_timestamp( (int(tweet["sortIndex"] or 0) >> 20) // 1000) return tdata @@ -906,7 +915,7 @@ class TwitterBookmarkExtractor(TwitterExtractor): class TwitterListExtractor(TwitterExtractor): """Extractor for Twitter lists""" subcategory = "list" - pattern = BASE_PATTERN + r"/i/lists/(\d+)/?$" + pattern = rf"{BASE_PATTERN}/i/lists/(\d+)/?$" example = "https://x.com/i/lists/12345" def tweets(self): @@ -916,7 +925,7 @@ class TwitterListExtractor(TwitterExtractor): class TwitterListMembersExtractor(TwitterExtractor): """Extractor for members of a Twitter list""" subcategory = "list-members" - pattern = BASE_PATTERN + r"/i/lists/(\d+)/members" + pattern = rf"{BASE_PATTERN}/i/lists/(\d+)/members" example = "https://x.com/i/lists/12345/members" def items(self): @@ -952,7 +961,7 @@ class TwitterCommunityExtractor(TwitterExtractor): directory_fmt = ("{category}", "Communities", "{community[name]} ({community[id]})") archive_fmt = "C_{community[id]}_{tweet_id}_{num}" - pattern = BASE_PATTERN + r"/i/communities/(\d+)" + pattern = rf"{BASE_PATTERN}/i/communities/(\d+)" example = "https://x.com/i/communities/12345" def tweets(self): @@ -966,7 +975,7 @@ class TwitterCommunitiesExtractor(TwitterExtractor): subcategory = "communities" directory_fmt = TwitterCommunityExtractor.directory_fmt archive_fmt = TwitterCommunityExtractor.archive_fmt - pattern = BASE_PATTERN + r"/([^/?#]+)/communities/?$" + 
pattern = rf"{BASE_PATTERN}/([^/?#]+)/communities/?$" example = "https://x.com/i/communities" def tweets(self): @@ -978,7 +987,7 @@ class TwitterEventExtractor(TwitterExtractor): subcategory = "event" directory_fmt = ("{category}", "Events", "{event[id]} {event[short_title]}") - pattern = BASE_PATTERN + r"/i/events/(\d+)" + pattern = rf"{BASE_PATTERN}/i/events/(\d+)" example = "https://x.com/i/events/12345" def metadata(self): @@ -991,7 +1000,7 @@ class TwitterEventExtractor(TwitterExtractor): class TwitterTweetExtractor(TwitterExtractor): """Extractor for individual tweets""" subcategory = "tweet" - pattern = (BASE_PATTERN + r"/([^/?#]+|i/web)/status/(\d+)" + pattern = (rf"{BASE_PATTERN}/([^/?#]+|i/web)/status/(\d+)" r"/?(?:$|\?|#|photo/|video/)") example = "https://x.com/USER/status/12345" @@ -1072,7 +1081,7 @@ class TwitterTweetExtractor(TwitterExtractor): class TwitterQuotesExtractor(TwitterExtractor): """Extractor for quotes of a Tweet""" subcategory = "quotes" - pattern = BASE_PATTERN + r"/(?:[^/?#]+|i/web)/status/(\d+)/quotes" + pattern = rf"{BASE_PATTERN}/(?:[^/?#]+|i/web)/status/(\d+)/quotes" example = "https://x.com/USER/status/12345/quotes" def items(self): @@ -1096,7 +1105,7 @@ class TwitterInfoExtractor(TwitterExtractor): else: user = api.user_by_screen_name(screen_name) - return iter(((Message.Directory, self._transform_user(user)),)) + return iter(((Message.Directory, "", self._transform_user(user)),)) class TwitterAvatarExtractor(TwitterExtractor): @@ -1162,7 +1171,7 @@ class TwitterImageExtractor(Extractor): "_fallback": TwitterExtractor._image_fallback(self, base), } - yield Message.Directory, data + yield Message.Directory, "", data yield Message.Url, base + self._size_image, data @@ -1369,7 +1378,7 @@ class TwitterAPI(): endpoint = "/graphql/E8Wq-_jFSaU7hxVcuOPR9g/UserTweets" variables = { "userId": self._user_id_by_screen_name(screen_name), - "count": 100, + "count": self.extractor.config("limit", 50), "includePromotedContent": False, 
"withQuickPromoteEligibilityTweetFields": False, "withVoice": True, @@ -1384,7 +1393,7 @@ class TwitterAPI(): endpoint = "/graphql/-O3QOHrVn1aOm_cF5wyTCQ/UserTweetsAndReplies" variables = { "userId": self._user_id_by_screen_name(screen_name), - "count": 100, + "count": self.extractor.config("limit", 50), "includePromotedContent": False, "withCommunity": True, "withVoice": True, @@ -1399,7 +1408,7 @@ class TwitterAPI(): endpoint = "/graphql/gmHw9geMTncZ7jeLLUUNOw/UserHighlightsTweets" variables = { "userId": self._user_id_by_screen_name(screen_name), - "count": 100, + "count": self.extractor.config("limit", 50), "includePromotedContent": False, "withVoice": True, } @@ -1413,7 +1422,7 @@ class TwitterAPI(): endpoint = "/graphql/jCRhbOzdgOHp6u9H4g2tEg/UserMedia" variables = { "userId": self._user_id_by_screen_name(screen_name), - "count": 100, + "count": self.extractor.config("limit", 50), "includePromotedContent": False, "withClientEventToken": False, "withBirdwatchNotes": False, @@ -1429,7 +1438,7 @@ class TwitterAPI(): endpoint = "/graphql/TGEKkJG_meudeaFcqaxM-Q/Likes" variables = { "userId": self._user_id_by_screen_name(screen_name), - "count": 100, + "count": self.extractor.config("limit", 50), "includePromotedContent": False, "withClientEventToken": False, "withBirdwatchNotes": False, @@ -1444,32 +1453,45 @@ class TwitterAPI(): def user_bookmarks(self): endpoint = "/graphql/pLtjrO4ubNh996M_Cubwsg/Bookmarks" variables = { - "count": 100, + "count": self.extractor.config("limit", 50), "includePromotedContent": False, } return self._pagination_tweets( endpoint, variables, ("bookmark_timeline_v2", "timeline"), stop_tweets=128) - def search_timeline(self, query, product="Latest"): + def search_timeline(self, query, product=None): + cfg = self.extractor.config + + if product is None: + if product := cfg("search-results"): + product = { + "top" : "Top", + "live" : "Latest", + "user" : "People", + "media": "Media", + "list" : "Lists", + }.get(product.lower(), 
product).capitalize() + else: + product = "Latest" + endpoint = "/graphql/4fpceYZ6-YQCx_JSl_Cn_A/SearchTimeline" variables = { "rawQuery": query, - "count": self.extractor.config("search-limit", 20), + "count": cfg("search-limit", 20), "querySource": "typed_query", "product": product, "withGrokTranslatedBio": False, } - if self.extractor.config("search-pagination") in ( - "max_id", "maxid", "id"): + if cfg("search-pagination") in ("max_id", "maxid", "id"): update_variables = self._update_variables_search else: update_variables = None - stop_tweets = self.extractor.config("search-stop") + stop_tweets = cfg("search-stop") if stop_tweets is None or stop_tweets == "auto": - stop_tweets = 3 if update_variables is None else 0 + stop_tweets = 3 return self._pagination_tweets( endpoint, variables, @@ -1494,7 +1516,7 @@ class TwitterAPI(): endpoint = "/graphql/Nyt-88UX4-pPCImZNUl9RQ/CommunityTweetsTimeline" variables = { "communityId": community_id, - "count": 100, + "count": self.extractor.config("limit", 50), "displayLocation": "Community", "rankingMode": "Recency", "withCommunity": True, @@ -1508,7 +1530,7 @@ class TwitterAPI(): endpoint = "/graphql/ZniZ7AAK_VVu1xtSx1V-gQ/CommunityMediaTimeline" variables = { "communityId": community_id, - "count": 100, + "count": self.extractor.config("limit", 50), "withCommunity": True, } return self._pagination_tweets( @@ -1520,7 +1542,7 @@ class TwitterAPI(): endpoint = ("/graphql/p048a9n3hTPppQyK7FQTFw" "/CommunitiesMainPageTimeline") variables = { - "count": 100, + "count": self.extractor.config("limit", 50), "withCommunity": True, } return self._pagination_tweets( @@ -1530,7 +1552,7 @@ class TwitterAPI(): def home_timeline(self): endpoint = "/graphql/DXmgQYmIft1oLP6vMkJixw/HomeTimeline" variables = { - "count": 100, + "count": self.extractor.config("limit", 50), "includePromotedContent": False, "latestControlAvailable": True, "withCommunity": True, @@ -1541,7 +1563,7 @@ class TwitterAPI(): def home_latest_timeline(self): endpoint 
= "/graphql/SFxmNKWfN9ySJcXG_tjX8g/HomeLatestTimeline" variables = { - "count": 100, + "count": self.extractor.config("limit", 50), "includePromotedContent": False, "latestControlAvailable": True, } @@ -1568,7 +1590,7 @@ class TwitterAPI(): endpoint = "/graphql/06JtmwM8k_1cthpFZITVVA/ListLatestTweetsTimeline" variables = { "listId": list_id, - "count": 100, + "count": self.extractor.config("limit", 50), } return self._pagination_tweets( endpoint, variables, ("list", "tweets_timeline", "timeline")) @@ -1654,10 +1676,8 @@ class TwitterAPI(): self.extractor._assign_user(user) return user["rest_id"] except KeyError: - if "unavailable_message" in user: - raise exception.NotFoundError( - f"{user['unavailable_message'].get('text')} " - f"({user.get('reason')})", False) + if user and user.get("__typename") == "UserUnavailable": + raise exception.NotFoundError(user["message"], False) else: raise exception.NotFoundError("user") @@ -1700,7 +1720,7 @@ class TwitterAPI(): self.client_transaction.generate_transaction_id(method, path) def _call(self, endpoint, params, method="GET", auth=True, root=None): - url = (root or self.root) + endpoint + url = (self.root if root is None else root) + endpoint while True: if auth: @@ -1877,8 +1897,17 @@ class TwitterAPI(): features=None, field_toggles=None): extr = self.extractor original_retweets = (extr.retweets == "original") - pinned_tweet = extr.pinned + pinned_tweet = True if extr.pinned else None stop_tweets_max = stop_tweets + api_retries = None + + if isinstance(count := variables.get("count"), list): + count = count.copy() + count.reverse() + self.log.debug("Using 'count: %s'", count[-1]) + variables["count"] = count.pop() + else: + count = False params = {"variables": None} if cursor := extr._init_cursor(): @@ -1892,14 +1921,14 @@ class TwitterAPI(): while True: params["variables"] = self._json_dumps(variables) - data = self._call(endpoint, params)["data"] + data = self._call(endpoint, params) try: if path is None: - instructions 
= (data["user"]["result"]["timeline"] + instructions = (data["data"]["user"]["result"]["timeline"] ["timeline"]["instructions"]) else: - instructions = data + instructions = data["data"] for key in path: instructions = instructions[key] instructions = instructions["instructions"] @@ -1916,7 +1945,7 @@ class TwitterAPI(): elif instr_type == "TimelineAddToModule": entries = instr["moduleItems"] elif instr_type == "TimelinePinEntry": - if pinned_tweet: + if pinned_tweet is not None: pinned_tweet = instr["entry"] elif instr_type == "TimelineReplaceEntry": entry = instr["entry"] @@ -1930,6 +1959,26 @@ class TwitterAPI(): except LookupError: extr.log.debug(data) + if errors := data.get("errors"): + if api_retries is None: + api_tries = 1 + api_retries = extr.config("retries-api", 9) + if api_retries < 0: + api_retries = float("inf") + + err = [] + srv = False + for e in errors: + err.append(f"- '{e.get('message') or e.get('name')}'") + if e.get("source") == "Server": + srv = True + + self.log.warning("API errors (%s/%s):\n%s", + api_tries, api_retries+1, "\n".join(err)) + if srv and api_tries <= api_retries: + api_tries += 1 + continue + if user := extr._user_obj: user = user["legacy"] if user.get("blocked_by"): @@ -1950,14 +1999,13 @@ class TwitterAPI(): "Unable to retrieve Tweets from this timeline") tweets = [] - tweet = None + tweet = last_tweet = retry = None + api_tries = 1 - if pinned_tweet: - if isinstance(pinned_tweet, dict): - tweets.append(pinned_tweet) - elif instructions[-1]["type"] == "TimelinePinEntry": - tweets.append(instructions[-1]["entry"]) - pinned_tweet = False + if pinned_tweet is not None and isinstance(pinned_tweet, dict): + pinned_tweet["pinned"] = True + tweets.append(pinned_tweet) + pinned_tweet = None for entry in entries: esw = entry["entryId"].startswith @@ -1965,6 +2013,7 @@ class TwitterAPI(): if esw("tweet-"): tweets.append(entry) elif esw(("profile-grid-", + "search-grid-", "communities-grid-")): if "content" in entry: 
tweets.extend(entry["content"]["items"]) @@ -1988,6 +2037,28 @@ class TwitterAPI(): tweet = True cursor = cursor.get("value") + if pinned_tweet is not None: + if extr._user_obj is None: + pinned = None + elif pinned := extr._user_obj["legacy"].get( + "pinned_tweet_ids_str"): + pinned = f"-tweet-{pinned[0]}" + for idx, entry in enumerate(tweets): + if entry["entryId"].endswith(pinned): + # mark as pinned / set 'pinned = True' + pinned_tweet = ( + (entry.get("content") or entry["item"]) + ["itemContent"]["tweet_results"]["result"]) + if "tweet" in pinned_tweet: + pinned_tweet = pinned_tweet["tweet"] + pinned_tweet["pinned"] = True + # move to front of 'tweets' + del tweets[idx] + tweets.insert(0, entry) + break + del pinned + pinned_tweet = None + for entry in tweets: try: item = ((entry.get("content") or entry["item"]) @@ -2015,6 +2086,16 @@ class TwitterAPI(): (entry.get("entryId") or "").rpartition("-")[2]) continue + if retry is None: + try: + tweet["core"]["user_results"]["result"] + retry = False + except KeyError: + self.log.warning("Received Tweet results without " + "'core' data ... 
Retrying") + retry = True + break + if "retweeted_status_result" in legacy: try: retweet = legacy["retweeted_status_result"]["result"] @@ -2071,18 +2152,25 @@ class TwitterAPI(): tweet.get("rest_id")) continue - if tweet: + if retry: + continue + elif tweet: stop_tweets = stop_tweets_max last_tweet = tweet - else: - if stop_tweets <= 0: + elif stop_tweets <= 0: + if not count: return extr._update_cursor(None) + self.log.debug("Switching to 'count: %s'", count[-1]) + variables["count"] = count.pop() + continue + else: self.log.debug( "No Tweet results (%s/%s)", stop_tweets_max - stop_tweets + 1, stop_tweets_max) stop_tweets -= 1 if not cursor or cursor == variables.get("cursor"): + self.log.debug("No continuation cursor") return extr._update_cursor(None) if update_variables is None: @@ -2169,7 +2257,7 @@ class TwitterAPI(): else: variables["rawQuery"] = f"{query} {max_id}" - if prefix := self.extractor._cursor_prefix: + if prefix := getattr(self.extractor, "_cursor_prefix", None): self.extractor._cursor_prefix = \ f"{prefix.partition('_')[0]}_{tweet_id}/" variables["cursor"] = None diff --git a/gallery_dl/extractor/unsplash.py b/gallery_dl/extractor/unsplash.py index cf6631f..b77be95 100644 --- a/gallery_dl/extractor/unsplash.py +++ b/gallery_dl/extractor/unsplash.py @@ -41,11 +41,11 @@ class UnsplashExtractor(Extractor): if metadata: photo.update(metadata) photo["extension"] = "jpg" - photo["date"] = text.parse_datetime(photo["created_at"]) + photo["date"] = self.parse_datetime_iso(photo["created_at"]) if "tags" in photo: photo["tags"] = [t["title"] for t in photo["tags"]] - yield Message.Directory, photo + yield Message.Directory, "", photo yield Message.Url, url, photo def metadata(self): @@ -74,7 +74,7 @@ class UnsplashExtractor(Extractor): class UnsplashImageExtractor(UnsplashExtractor): """Extractor for a single unsplash photo""" subcategory = "image" - pattern = BASE_PATTERN + r"/photos/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/photos/([^/?#]+)" example = 
"https://unsplash.com/photos/ID" def photos(self): @@ -85,7 +85,7 @@ class UnsplashImageExtractor(UnsplashExtractor): class UnsplashUserExtractor(UnsplashExtractor): """Extractor for all photos of an unsplash user""" subcategory = "user" - pattern = BASE_PATTERN + r"/@(\w+)/?$" + pattern = rf"{BASE_PATTERN}/@(\w+)/?$" example = "https://unsplash.com/@USER" def photos(self): @@ -97,7 +97,7 @@ class UnsplashUserExtractor(UnsplashExtractor): class UnsplashFavoriteExtractor(UnsplashExtractor): """Extractor for all likes of an unsplash user""" subcategory = "favorite" - pattern = BASE_PATTERN + r"/@(\w+)/likes" + pattern = rf"{BASE_PATTERN}/@(\w+)/likes" example = "https://unsplash.com/@USER/likes" def photos(self): @@ -109,7 +109,7 @@ class UnsplashFavoriteExtractor(UnsplashExtractor): class UnsplashCollectionExtractor(UnsplashExtractor): """Extractor for an unsplash collection""" subcategory = "collection" - pattern = BASE_PATTERN + r"/collections/([^/?#]+)(?:/([^/?#]+))?" + pattern = rf"{BASE_PATTERN}/collections/([^/?#]+)(?:/([^/?#]+))?" example = "https://unsplash.com/collections/12345/TITLE" def __init__(self, match): @@ -128,7 +128,7 @@ class UnsplashCollectionExtractor(UnsplashExtractor): class UnsplashSearchExtractor(UnsplashExtractor): """Extractor for unsplash search results""" subcategory = "search" - pattern = BASE_PATTERN + r"/s/photos/([^/?#]+)(?:\?([^#]+))?" + pattern = rf"{BASE_PATTERN}/s/photos/([^/?#]+)(?:\?([^#]+))?" 
example = "https://unsplash.com/s/photos/QUERY" def __init__(self, match): diff --git a/gallery_dl/extractor/uploadir.py b/gallery_dl/extractor/uploadir.py index d06c2ad..d80abba 100644 --- a/gallery_dl/extractor/uploadir.py +++ b/gallery_dl/extractor/uploadir.py @@ -53,5 +53,5 @@ class UploadirFileExtractor(Extractor): data = text.nameext_from_url(name) data["id"] = self.file_id - yield Message.Directory, data + yield Message.Directory, "", data yield Message.Url, url, data diff --git a/gallery_dl/extractor/urlgalleries.py b/gallery_dl/extractor/urlgalleries.py index 4369ac6..0d8b3d3 100644 --- a/gallery_dl/extractor/urlgalleries.py +++ b/gallery_dl/extractor/urlgalleries.py @@ -38,7 +38,7 @@ class UrlgalleriesGalleryExtractor(GalleryExtractor): data["count"] = len(imgs) root = self.root - yield Message.Directory, data + yield Message.Directory, "", data for data["num"], img in enumerate(imgs, 1): page = self.request(root + img).text url = text.extr(page, "window.location.href = '", "'") @@ -52,7 +52,7 @@ class UrlgalleriesGalleryExtractor(GalleryExtractor): "blog" : text.unescape(extr(' title="', '"')), "_rprt": extr(' title="', '"'), # report button "title": text.unescape(extr(' title="', '"').strip()), - "date" : text.parse_datetime( + "date" : self.parse_datetime( extr(" images in gallery | ", "<"), "%B %d, %Y"), } diff --git a/gallery_dl/extractor/urlshortener.py b/gallery_dl/extractor/urlshortener.py index 7a9269d..0056d1b 100644 --- a/gallery_dl/extractor/urlshortener.py +++ b/gallery_dl/extractor/urlshortener.py @@ -32,7 +32,7 @@ BASE_PATTERN = UrlshortenerExtractor.update({ class UrlshortenerLinkExtractor(UrlshortenerExtractor): """Extractor for general-purpose URL shorteners""" subcategory = "link" - pattern = BASE_PATTERN + r"(/[^/?#]+)" + pattern = rf"{BASE_PATTERN}(/[^/?#]+)" example = "https://bit.ly/abcde" def items(self): diff --git a/gallery_dl/extractor/vanillarock.py b/gallery_dl/extractor/vanillarock.py index e0107f3..63fc7fa 100644 --- 
a/gallery_dl/extractor/vanillarock.py +++ b/gallery_dl/extractor/vanillarock.py @@ -47,13 +47,13 @@ class VanillarockPostExtractor(VanillarockExtractor): "count": len(imgs), "title": text.unescape(name), "path" : self.path.strip("/"), - "date" : text.parse_datetime(extr( - '<div class="date">', '</div>'), "%Y-%m-%d %H:%M"), + "date" : self.parse_datetime_iso(extr( + '<div class="date">', '</div>')), "tags" : text.split_html(extr( '<div class="cat-tag">', '</div>'))[::2], } - yield Message.Directory, data + yield Message.Directory, "", data for data["num"], url in enumerate(imgs, 1): yield Message.Url, url, text.nameext_from_url(url, data) diff --git a/gallery_dl/extractor/vichan.py b/gallery_dl/extractor/vichan.py index cbb44ee..86758f3 100644 --- a/gallery_dl/extractor/vichan.py +++ b/gallery_dl/extractor/vichan.py @@ -39,7 +39,7 @@ class VichanThreadExtractor(VichanExtractor): directory_fmt = ("{category}", "{board}", "{thread} {title}") filename_fmt = "{time}{num:?-//} {filename}.{extension}" archive_fmt = "{board}_{thread}_{tim}" - pattern = BASE_PATTERN + r"/([^/?#]+)/res/(\d+)" + pattern = rf"{BASE_PATTERN}/([^/?#]+)/res/(\d+)" example = "https://8kun.top/a/res/12345.html" def items(self): @@ -58,7 +58,7 @@ class VichanThreadExtractor(VichanExtractor): "num" : 0, } - yield Message.Directory, data + yield Message.Directory, "", data for post in posts: if "filename" in post: yield process(post, data) @@ -93,7 +93,7 @@ class VichanThreadExtractor(VichanExtractor): class VichanBoardExtractor(VichanExtractor): """Extractor for vichan boards""" subcategory = "board" - pattern = BASE_PATTERN + r"/([^/?#]+)(?:/index|/catalog|/\d+|/?$)" + pattern = rf"{BASE_PATTERN}/([^/?#]+)(?:/index|/catalog|/\d+|/?$)" example = "https://8kun.top/a/" def items(self): diff --git a/gallery_dl/extractor/vipergirls.py b/gallery_dl/extractor/vipergirls.py index 294fc57..8f6368b 100644 --- a/gallery_dl/extractor/vipergirls.py +++ b/gallery_dl/extractor/vipergirls.py @@ -75,7 +75,7 @@ 
class VipergirlsExtractor(Extractor): data["count"] = len(images) del data["imagecount"] - yield Message.Directory, data + yield Message.Directory, "", data if images: for data["num"], image in enumerate(images, 1): yield Message.Queue, image.attrib["main_url"], data @@ -124,8 +124,8 @@ class VipergirlsExtractor(Extractor): class VipergirlsThreadExtractor(VipergirlsExtractor): """Extractor for vipergirls threads""" subcategory = "thread" - pattern = (BASE_PATTERN + - r"/threads/(\d+)(?:-[^/?#]+)?(/page\d+)?(?:$|#|\?(?!p=))") + pattern = (rf"{BASE_PATTERN}" + rf"/threads/(\d+)(?:-[^/?#]+)?(/page\d+)?(?:$|#|\?(?!p=))") example = "https://vipergirls.to/threads/12345-TITLE" def __init__(self, match): @@ -140,8 +140,8 @@ class VipergirlsThreadExtractor(VipergirlsExtractor): class VipergirlsPostExtractor(VipergirlsExtractor): """Extractor for vipergirls posts""" subcategory = "post" - pattern = (BASE_PATTERN + - r"/threads/(\d+)(?:-[^/?#]+)?\?p=\d+[^#]*#post(\d+)") + pattern = (rf"{BASE_PATTERN}" + rf"/threads/(\d+)(?:-[^/?#]+)?\?p=\d+[^#]*#post(\d+)") example = "https://vipergirls.to/threads/12345-TITLE?p=23456#post23456" def __init__(self, match): diff --git a/gallery_dl/extractor/vk.py b/gallery_dl/extractor/vk.py index 22d4b9a..e7453fc 100644 --- a/gallery_dl/extractor/vk.py +++ b/gallery_dl/extractor/vk.py @@ -9,7 +9,7 @@ """Extractors for https://vk.com/""" from .common import Extractor, Message -from .. import text, util, exception +from .. 
import text, exception BASE_PATTERN = r"(?:https://)?(?:www\.|m\.)?vk\.com" @@ -36,11 +36,11 @@ class VkExtractor(Extractor): return num def items(self): - subn = util.re(r"/imp[fg]/").subn + subn = text.re(r"/imp[fg]/").subn sizes = "wzyxrqpo" data = self.metadata() - yield Message.Directory, data + yield Message.Directory, "", data for photo in self.photos(): @@ -72,7 +72,7 @@ class VkExtractor(Extractor): photo["width"] = photo["height"] = 0 photo["id"] = photo["id"].rpartition("_")[2] - photo["date"] = text.parse_timestamp(text.extr( + photo["date"] = self.parse_timestamp(text.extr( photo["date"], 'data-date="', '"')) photo["description"] = text.unescape(text.extr( photo.get("desc", ""), ">", "<")) @@ -134,7 +134,7 @@ class VkExtractor(Extractor): class VkPhotosExtractor(VkExtractor): """Extractor for photos from a vk user""" subcategory = "photos" - pattern = (BASE_PATTERN + r"/(?:" + pattern = (rf"{BASE_PATTERN}/(?:" r"(?:albums|photos|id)(-?\d+)" r"|(?!(?:album|tag|wall)-?\d+_?)([^/?#]+))") example = "https://vk.com/id12345" @@ -184,7 +184,7 @@ class VkAlbumExtractor(VkExtractor): """Extractor for a vk album""" subcategory = "album" directory_fmt = ("{category}", "{user[id]}", "{album[id]}") - pattern = BASE_PATTERN + r"/album(-?\d+)_(\d+)$" + pattern = rf"{BASE_PATTERN}/album(-?\d+)_(\d+)$" example = "https://vk.com/album12345_00" def photos(self): @@ -228,7 +228,7 @@ class VkTaggedExtractor(VkExtractor): """Extractor for a vk tagged photos""" subcategory = "tagged" directory_fmt = ("{category}", "{user[id]}", "tags") - pattern = BASE_PATTERN + r"/tag(-?\d+)$" + pattern = rf"{BASE_PATTERN}/tag(-?\d+)$" example = "https://vk.com/tag12345" def __init__(self, match): @@ -247,7 +247,7 @@ class VkWallPostExtractor(VkExtractor): subcategory = "wall-post" directory_fmt = ("{category}", "{user[id]}", "wall") filename_fmt = "{wall[id]}_{num}.{extension}" - pattern = BASE_PATTERN + r"/wall(-?\d+)_(\d+)" + pattern = rf"{BASE_PATTERN}/wall(-?\d+)_(\d+)" example = 
"https://vk.com/wall12345_123" def photos(self): diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py index df09fce..b8da813 100644 --- a/gallery_dl/extractor/vsco.py +++ b/gallery_dl/extractor/vsco.py @@ -12,7 +12,7 @@ from .common import Extractor, Message, Dispatch from .. import text, util BASE_PATTERN = r"(?:https?://)?(?:www\.)?vsco\.co" -USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)" +USER_PATTERN = rf"{BASE_PATTERN}/([^/?#]+)" class VscoExtractor(Extractor): @@ -22,7 +22,7 @@ class VscoExtractor(Extractor): directory_fmt = ("{category}", "{user}") filename_fmt = "{id}.{extension}" archive_fmt = "{id}" - tls12 = False + browser = "firefox" def __init__(self, match): Extractor.__init__(self, match) @@ -30,7 +30,7 @@ class VscoExtractor(Extractor): def items(self): videos = self.config("videos", True) - yield Message.Directory, {"user": self.user} + yield Message.Directory, "", {"user": self.user} for img in self.images(): if not img: @@ -62,7 +62,7 @@ class VscoExtractor(Extractor): "grid" : img["grid_name"], "meta" : img.get("image_meta") or {}, "tags" : [tag["text"] for tag in img.get("tags") or ()], - "date" : text.parse_timestamp(img["upload_date"] // 1000), + "date" : self.parse_timestamp(img["upload_date"] // 1000), "video" : img["is_video"], "width" : img["width"], "height": img["height"], @@ -133,7 +133,7 @@ class VscoExtractor(Extractor): class VscoUserExtractor(Dispatch, VscoExtractor): """Extractor for a vsco user profile""" - pattern = USER_PATTERN + r"/?$" + pattern = rf"{USER_PATTERN}/?$" example = "https://vsco.co/USER" def items(self): @@ -149,7 +149,7 @@ class VscoUserExtractor(Dispatch, VscoExtractor): class VscoGalleryExtractor(VscoExtractor): """Extractor for a vsco user's gallery""" subcategory = "gallery" - pattern = USER_PATTERN + r"/(?:gallery|images)" + pattern = rf"{USER_PATTERN}/(?:gallery|images)" example = "https://vsco.co/USER/gallery" def images(self): @@ -173,7 +173,7 @@ class 
VscoCollectionExtractor(VscoExtractor): subcategory = "collection" directory_fmt = ("{category}", "{user}", "collection") archive_fmt = "c_{user}_{id}" - pattern = USER_PATTERN + r"/collection" + pattern = rf"{USER_PATTERN}/collection" example = "https://vsco.co/USER/collection/1" def images(self): @@ -198,7 +198,7 @@ class VscoSpaceExtractor(VscoExtractor): subcategory = "space" directory_fmt = ("{category}", "space", "{user}") archive_fmt = "s_{user}_{id}" - pattern = BASE_PATTERN + r"/spaces/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/spaces/([^/?#]+)" example = "https://vsco.co/spaces/a1b2c3d4e5f" def images(self): @@ -245,7 +245,7 @@ class VscoSpaceExtractor(VscoExtractor): class VscoSpacesExtractor(VscoExtractor): """Extractor for a vsco.co user's spaces""" subcategory = "spaces" - pattern = USER_PATTERN + r"/spaces" + pattern = rf"{USER_PATTERN}/spaces" example = "https://vsco.co/USER/spaces" def items(self): @@ -275,7 +275,7 @@ class VscoSpacesExtractor(VscoExtractor): class VscoAvatarExtractor(VscoExtractor): """Extractor for vsco.co user avatars""" subcategory = "avatar" - pattern = USER_PATTERN + r"/avatar" + pattern = rf"{USER_PATTERN}/avatar" example = "https://vsco.co/USER/avatar" def images(self): @@ -303,7 +303,7 @@ class VscoAvatarExtractor(VscoExtractor): class VscoImageExtractor(VscoExtractor): """Extractor for individual images on vsco.co""" subcategory = "image" - pattern = USER_PATTERN + r"/media/([0-9a-fA-F]+)" + pattern = rf"{USER_PATTERN}/media/([0-9a-fA-F]+)" example = "https://vsco.co/USER/media/0123456789abcdef" def images(self): @@ -316,7 +316,7 @@ class VscoImageExtractor(VscoExtractor): class VscoVideoExtractor(VscoExtractor): """Extractor for vsco.co videos links""" subcategory = "video" - pattern = USER_PATTERN + r"/video/([^/?#]+)" + pattern = rf"{USER_PATTERN}/video/([^/?#]+)" example = "https://vsco.co/USER/video/012345678-9abc-def0" def images(self): diff --git a/gallery_dl/extractor/wallhaven.py b/gallery_dl/extractor/wallhaven.py 
index f0f27e0..9ea3c36 100644 --- a/gallery_dl/extractor/wallhaven.py +++ b/gallery_dl/extractor/wallhaven.py @@ -29,7 +29,7 @@ class WallhavenExtractor(Extractor): self._transform(wp) wp.update(metadata) url = wp["url"] - yield Message.Directory, wp + yield Message.Directory, "", wp yield Message.Url, url, text.nameext_from_url(url, wp) def wallpapers(self): @@ -43,8 +43,7 @@ class WallhavenExtractor(Extractor): wp["url"] = wp.pop("path") if "tags" in wp: wp["tags"] = [t["name"] for t in wp["tags"]] - wp["date"] = text.parse_datetime( - wp.pop("created_at"), "%Y-%m-%d %H:%M:%S") + wp["date"] = self.parse_datetime_iso(wp.pop("created_at")) wp["width"] = wp.pop("dimension_x") wp["height"] = wp.pop("dimension_y") wp["wh_category"] = wp["category"] diff --git a/gallery_dl/extractor/wallpapercave.py b/gallery_dl/extractor/wallpapercave.py index 65fca24..1392164 100644 --- a/gallery_dl/extractor/wallpapercave.py +++ b/gallery_dl/extractor/wallpapercave.py @@ -27,7 +27,7 @@ class WallpapercaveImageExtractor(Extractor): path = None for path in text.extract_iter(page, 'class="download" href="', '"'): image = text.nameext_from_url(path) - yield Message.Directory, image + yield Message.Directory, "", image yield Message.Url, self.root + path, image if path is None: @@ -38,7 +38,7 @@ class WallpapercaveImageExtractor(Extractor): pass else: image = text.nameext_from_url(path) - yield Message.Directory, image + yield Message.Directory, "", image yield Message.Url, self.root + path, image if path is None: @@ -46,5 +46,5 @@ class WallpapercaveImageExtractor(Extractor): page, 'class="wallpaper" id="wp', '</picture>'): if path := text.rextr(wp, ' src="', '"'): image = text.nameext_from_url(path) - yield Message.Directory, image + yield Message.Directory, "", image yield Message.Url, self.root + path, image diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py index 8ae2a49..b66ba8d 100644 --- a/gallery_dl/extractor/warosu.py +++ 
b/gallery_dl/extractor/warosu.py @@ -37,12 +37,12 @@ class WarosuThreadExtractor(Extractor): data["title"] = text.unescape(text.remove_html( posts[0]["com"]))[:50] - yield Message.Directory, data + yield Message.Directory, "", data for post in posts: if "image" in post: for key in ("w", "h", "no", "time", "tim"): post[key] = text.parse_int(post[key]) - dt = text.parse_timestamp(post["time"]) + dt = self.parse_timestamp(post["time"]) # avoid zero-padding 'day' with %d post["now"] = dt.strftime(f"%a, %b {dt.day}, %Y %H:%M:%S") post.update(data) diff --git a/gallery_dl/extractor/weasyl.py b/gallery_dl/extractor/weasyl.py index a69f3a8..e718e51 100644 --- a/gallery_dl/extractor/weasyl.py +++ b/gallery_dl/extractor/weasyl.py @@ -24,8 +24,7 @@ class WeasylExtractor(Extractor): # Some submissions don't have content and can be skipped if "submission" in data["media"]: data["url"] = data["media"]["submission"][0]["url"] - data["date"] = text.parse_datetime( - data["posted_at"][:19], "%Y-%m-%dT%H:%M:%S") + data["date"] = self.parse_datetime_iso(data["posted_at"][:19]) text.nameext_from_url(data["url"], data) return True return False @@ -42,7 +41,7 @@ class WeasylExtractor(Extractor): f"{self.root}/api/journals/{journalid}/view") data["extension"] = "html" data["html"] = "text:" + data["content"] - data["date"] = text.parse_datetime(data["posted_at"]) + data["date"] = self.parse_datetime_iso(data["posted_at"]) return data def submissions(self, owner_login, folderid=None): @@ -71,7 +70,7 @@ class WeasylExtractor(Extractor): class WeasylSubmissionExtractor(WeasylExtractor): subcategory = "submission" - pattern = BASE_PATTERN + r"(?:~[\w~-]+/submissions|submission|view)/(\d+)" + pattern = rf"{BASE_PATTERN}(?:~[\w~-]+/submissions|submission|view)/(\d+)" example = "https://www.weasyl.com/~USER/submissions/12345/TITLE" def __init__(self, match): @@ -81,13 +80,13 @@ class WeasylSubmissionExtractor(WeasylExtractor): def items(self): data = self.request_submission(self.submitid) if 
self.populate_submission(data): - yield Message.Directory, data + yield Message.Directory, "", data yield Message.Url, data["url"], data class WeasylSubmissionsExtractor(WeasylExtractor): subcategory = "submissions" - pattern = BASE_PATTERN + r"(?:~|submissions/)([\w~-]+)/?$" + pattern = rf"{BASE_PATTERN}(?:~|submissions/)([\w~-]+)/?$" example = "https://www.weasyl.com/submissions/USER" def __init__(self, match): @@ -95,14 +94,14 @@ class WeasylSubmissionsExtractor(WeasylExtractor): self.owner_login = match[1] def items(self): - yield Message.Directory, {"owner_login": self.owner_login} + yield Message.Directory, "", {"owner_login": self.owner_login} yield from self.submissions(self.owner_login) class WeasylFolderExtractor(WeasylExtractor): subcategory = "folder" directory_fmt = ("{category}", "{owner_login}", "{folder_name}") - pattern = BASE_PATTERN + r"submissions/([\w~-]+)\?folderid=(\d+)" + pattern = rf"{BASE_PATTERN}submissions/([\w~-]+)\?folderid=(\d+)" example = "https://www.weasyl.com/submissions/USER?folderid=12345" def __init__(self, match): @@ -114,7 +113,7 @@ class WeasylFolderExtractor(WeasylExtractor): # Folder names are only on single submission api calls msg, url, data = next(iter) details = self.request_submission(data["submitid"]) - yield Message.Directory, details + yield Message.Directory, "", details yield msg, url, data yield from iter @@ -123,7 +122,7 @@ class WeasylJournalExtractor(WeasylExtractor): subcategory = "journal" filename_fmt = "{journalid} {title}.{extension}" archive_fmt = "{journalid}" - pattern = BASE_PATTERN + r"journal/(\d+)" + pattern = rf"{BASE_PATTERN}journal/(\d+)" example = "https://www.weasyl.com/journal/12345" def __init__(self, match): @@ -132,7 +131,7 @@ class WeasylJournalExtractor(WeasylExtractor): def items(self): data = self.retrieve_journal(self.journalid) - yield Message.Directory, data + yield Message.Directory, "", data yield Message.Url, data["html"], data @@ -140,7 +139,7 @@ class 
WeasylJournalsExtractor(WeasylExtractor): subcategory = "journals" filename_fmt = "{journalid} {title}.{extension}" archive_fmt = "{journalid}" - pattern = BASE_PATTERN + r"journals/([\w~-]+)" + pattern = rf"{BASE_PATTERN}journals/([\w~-]+)" example = "https://www.weasyl.com/journals/USER" def __init__(self, match): @@ -148,7 +147,7 @@ class WeasylJournalsExtractor(WeasylExtractor): self.owner_login = match[1] def items(self): - yield Message.Directory, {"owner_login": self.owner_login} + yield Message.Directory, "", {"owner_login": self.owner_login} url = f"{self.root}/journals/{self.owner_login}" page = self.request(url).text @@ -160,7 +159,7 @@ class WeasylJournalsExtractor(WeasylExtractor): class WeasylFavoriteExtractor(WeasylExtractor): subcategory = "favorite" directory_fmt = ("{category}", "{user}", "Favorites") - pattern = BASE_PATTERN + r"favorites(?:\?userid=(\d+)|/([^/?#]+))" + pattern = rf"{BASE_PATTERN}favorites(?:\?userid=(\d+)|/([^/?#]+))" example = "https://www.weasyl.com/favorites?userid=12345" def items(self): @@ -192,7 +191,7 @@ class WeasylFavoriteExtractor(WeasylExtractor): submission = self.request_submission(submitid) if self.populate_submission(submission): submission["user"] = owner_login - yield Message.Directory, submission + yield Message.Directory, "", submission yield Message.Url, submission["url"], submission try: diff --git a/gallery_dl/extractor/webmshare.py b/gallery_dl/extractor/webmshare.py index cc41b03..2cb41bb 100644 --- a/gallery_dl/extractor/webmshare.py +++ b/gallery_dl/extractor/webmshare.py @@ -40,7 +40,7 @@ class WebmshareVideoExtractor(Extractor): 'property="og:video:width" content="', '"')), "height": text.parse_int(extr( 'property="og:video:height" content="', '"')), - "date" : text.parse_datetime(extr( + "date" : self.parse_datetime(extr( "<small>Added ", "<"), "%B %d, %Y"), "views": text.parse_int(extr('glyphicon-eye-open"></span>', '<')), "id" : self.video_id, @@ -51,5 +51,5 @@ class 
WebmshareVideoExtractor(Extractor): if data["title"] == "webmshare": data["title"] = "" - yield Message.Directory, data + yield Message.Directory, "", data yield Message.Url, data["url"], data diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py index 79120c1..bed251b 100644 --- a/gallery_dl/extractor/webtoons.py +++ b/gallery_dl/extractor/webtoons.py @@ -48,7 +48,7 @@ class WebtoonsBase(): class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor): """Extractor for an episode on webtoons.com""" subcategory = "episode" - pattern = (LANG_PATTERN + r"/([^/?#]+)/([^/?#]+)/[^/?#]+)" + pattern = (rf"{LANG_PATTERN}/([^/?#]+)/([^/?#]+)/[^/?#]+)" r"/viewer\?([^#'\"]+)") example = ("https://www.webtoons.com/en/GENRE/TITLE/NAME/viewer" "?title_no=123&episode_no=12345") @@ -131,7 +131,7 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor): def assets(self, page): if self.config("thumbnails", False): - active = text.extr(page, 'class="on ', '</a>') + active = text.extr(page, 'class="on', '</a>') url = _url(text.extr(active, 'data-url="', '"')) return ({"url": url, "type": "thumbnail"},) @@ -142,7 +142,7 @@ class WebtoonsComicExtractor(WebtoonsBase, Extractor): categorytransfer = True filename_fmt = "{type}.{extension}" archive_fmt = "{title_no}_{type}" - pattern = LANG_PATTERN + r"/([^/?#]+)/([^/?#]+))/list\?([^#]+)" + pattern = rf"{LANG_PATTERN}/([^/?#]+)/([^/?#]+))/list\?([^#]+)" example = "https://www.webtoons.com/en/GENRE/TITLE/list?title_no=123" def items(self): @@ -160,7 +160,7 @@ class WebtoonsComicExtractor(WebtoonsBase, Extractor): page = response.text if self.config("banners") and (asset := self._asset_banner(page)): - yield Message.Directory, asset + yield Message.Directory, "", asset yield Message.Url, asset["url"], asset data = {"_extractor": WebtoonsEpisodeExtractor} @@ -197,7 +197,7 @@ class WebtoonsComicExtractor(WebtoonsBase, Extractor): class WebtoonsArtistExtractor(WebtoonsBase, Extractor): """Extractor for 
webtoons.com artists""" subcategory = "artist" - pattern = BASE_PATTERN + r"/p/community/([^/?#]+)/u/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/p/community/([^/?#]+)/u/([^/?#]+)" example = "https://www.webtoons.com/p/community/LANG/u/ARTIST" def items(self): diff --git a/gallery_dl/extractor/weebcentral.py b/gallery_dl/extractor/weebcentral.py index 03cbf29..31cdaac 100644 --- a/gallery_dl/extractor/weebcentral.py +++ b/gallery_dl/extractor/weebcentral.py @@ -44,7 +44,7 @@ class WeebcentralBase(): class WeebcentralChapterExtractor(WeebcentralBase, ChapterExtractor): """Extractor for manga chapters from weebcentral.com""" - pattern = BASE_PATTERN + r"(/chapters/(\w+))" + pattern = rf"{BASE_PATTERN}(/chapters/(\w+))" example = "https://weebcentral.com/chapters/01JHABCDEFGHIJKLMNOPQRSTUV" def metadata(self, page): @@ -95,7 +95,7 @@ class WeebcentralChapterExtractor(WeebcentralBase, ChapterExtractor): class WeebcentralMangaExtractor(WeebcentralBase, MangaExtractor): """Extractor for manga from weebcentral.com""" chapterclass = WeebcentralChapterExtractor - pattern = BASE_PATTERN + r"/series/(\w+)" + pattern = rf"{BASE_PATTERN}/series/(\w+)" example = "https://weebcentral.com/series/01J7ABCDEFGHIJKLMNOPQRSTUV/TITLE" def chapters(self, _): @@ -127,8 +127,8 @@ class WeebcentralMangaExtractor(WeebcentralBase, MangaExtractor): "chapter" : text.parse_int(chapter), "chapter_minor": sep + minor, "chapter_type" : type, - "date" : text.parse_datetime( - extr(' datetime="', '"')[:-5], "%Y-%m-%dT%H:%M:%S"), + "date" : self.parse_datetime_iso(extr( + ' datetime="', '"')[:-5]), } chapter.update(data) results.append((base + chapter_id, chapter)) diff --git a/gallery_dl/extractor/weebdex.py b/gallery_dl/extractor/weebdex.py new file mode 100644 index 0000000..78fbda1 --- /dev/null +++ b/gallery_dl/extractor/weebdex.py @@ -0,0 +1,132 @@ +# -*- coding: utf-8 -*- + +# Copyright 2025 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the 
terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://weebdex.org/""" + +from .common import ChapterExtractor, MangaExtractor +from .. import text +from ..cache import memcache + +BASE_PATTERN = r"(?:https?://)?weebdex\.org" + + +class WeebdexBase(): + """Base class for weebdex extractors""" + category = "weebdex" + root = "https://weebdex.org" + root_api = "https://api.weebdex.org" + request_interval = 0.2 # 5 requests per second + + def _init(self): + self.headers_api = { + "Referer": self.root + "/", + "Origin" : self.root, + "Sec-Fetch-Dest": "empty", + "Sec-Fetch-Mode": "cors", + "Sec-Fetch-Site": "same-site", + } + + +class WeebdexChapterExtractor(WeebdexBase, ChapterExtractor): + """Extractor for weebdex manga chapters""" + archive_fmt = "{chapter_id}_{version}_{page}" + pattern = BASE_PATTERN + r"/chapter/(\w+)" + example = "https://weebdex.org/chapter/ID/PAGE" + + def metadata(self, _): + cid = self.groups[0] + url = f"{self.root_api}/chapter/{cid}" + self.data = data = self.request_json(url, headers=self.headers_api) + + rel = data.pop("relationships") + chapter, sep, minor = data["chapter"].partition(".") + + return { + **_manga_info(self, rel["manga"]["id"]), + "title" : data.get("title", ""), + "version" : data["version"], + "volume" : text.parse_int(data["volume"]), + "chapter" : text.parse_int(chapter), + "chapter_minor": sep + minor, + "chapter_id" : cid, + "date" : self.parse_datetime_iso(data["created_at"]), + "date_updated" : self.parse_datetime_iso(data["updated_at"]), + "lang" : data["language"], + "uploader": rel["uploader"]["name"] if "uploader" in rel else "", + "group" : [g["name"] for g in rel.get("groups") or ()], + } + + def images(self, _): + data = self.data + base = f"{data['node']}/data/{data['id']}/" + + return [ + (base + page["name"], { + "width" : page["dimensions"][0], + "height": page["dimensions"][1], + }) + for page in data["data"] + ] + + +class 
WeebdexMangaExtractor(WeebdexBase, MangaExtractor): + """Extractor for weebdex manga""" + chapterclass = WeebdexChapterExtractor + pattern = BASE_PATTERN + r"/title/(\w+)" + example = "https://weebdex.org/title/ID/SLUG" + + def chapters(self, page): + mid = self.groups[0] + url = f"{self.root_api}/manga/{mid}/chapters" + params = { + "limit": 100, + "order": "asc" if self.config("chapter-reverse") else "desc", + } + + base = self.root + "/chapter/" + manga = _manga_info(self, mid) + results = [] + + while True: + data = self.request_json( + url, params=params, headers=self.headers_api) + + for ch in data["data"]: + chapter, sep, minor = ch["chapter"].partition(".") + ch["volume"] = text.parse_int(ch["volume"]) + ch["chapter"] = text.parse_int(chapter) + ch["chapter_minor"] = sep + minor + ch.update(manga) + results.append((base + ch["id"], ch)) + + if data["total"] <= data["page"] * params["limit"]: + break + params["page"] = data["page"] + 1 + + return results + + +@memcache(keyarg=1) +def _manga_info(self, mid): + url = f"{self.root_api}/manga/{mid}" + manga = self.request_json(url, headers=self.headers_api) + rel = manga["relationships"] + + return { + "manga" : manga["title"], + "manga_id": manga["id"], + "manga_date": self.parse_datetime_iso(manga["created_at"]), + "year" : manga["year"], + "status" : manga["status"], + "origin" : manga["language"], + "description": manga["description"], + "demographic": manga["demographic"], + "tags" : [f"{t['group']}:{t['name']}" for t in rel["tags"]], + "author" : [a["name"] for a in rel["authors"]], + "artist" : [a["name"] for a in rel["artists"]], + } diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index 3c0f077..abec0f7 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -34,6 +34,7 @@ class WeiboExtractor(Extractor): def _init(self): self.livephoto = self.config("livephoto", True) self.retweets = self.config("retweets", False) + self.longtext = self.config("text", 
False) self.videos = self.config("videos", True) self.movies = self.config("movies", False) self.gifs = self.config("gifs", True) @@ -98,10 +99,14 @@ class WeiboExtractor(Extractor): files = [] self._extract_status(status, files) - status["date"] = text.parse_datetime( + if self.longtext and status.get("isLongText") and \ + status["text"].endswith('class="expand">展开</span>'): + status = self._status_by_id(status["id"]) + + status["date"] = self.parse_datetime( status["created_at"], "%a %b %d %H:%M:%S %z %Y") status["count"] = len(files) - yield Message.Directory, status + yield Message.Directory, "", status num = 0 for file in files: @@ -190,7 +195,8 @@ class WeiboExtractor(Extractor): return video def _status_by_id(self, status_id): - url = f"{self.root}/ajax/statuses/show?id={status_id}" + url = (f"{self.root}/ajax/statuses/show" + f"?id={status_id}&isGetLongText=true") return self.request_json(url) def _user_id(self): diff --git a/gallery_dl/extractor/wikiart.py b/gallery_dl/extractor/wikiart.py index 830d880..8f3a1c9 100644 --- a/gallery_dl/extractor/wikiart.py +++ b/gallery_dl/extractor/wikiart.py @@ -27,7 +27,7 @@ class WikiartExtractor(Extractor): def items(self): data = self.metadata() - yield Message.Directory, data + yield Message.Directory, "", data for painting in self.paintings(): url = painting["image"] painting.update(data) @@ -68,7 +68,7 @@ class WikiartArtistExtractor(WikiartExtractor): """Extractor for an artist's paintings on wikiart.org""" subcategory = "artist" directory_fmt = ("{category}", "{artist[artistName]}") - pattern = BASE_PATTERN + r"/(?!\w+-by-)([\w-]+)/?$" + pattern = rf"{BASE_PATTERN}/(?!\w+-by-)([\w-]+)/?$" example = "https://www.wikiart.org/en/ARTIST" def __init__(self, match): @@ -89,7 +89,7 @@ class WikiartArtistExtractor(WikiartExtractor): class WikiartImageExtractor(WikiartArtistExtractor): """Extractor for individual paintings on wikiart.org""" subcategory = "image" - pattern = BASE_PATTERN + 
r"/(?!(?:paintings|artists)-by-)([\w-]+)/([\w-]+)" + pattern = rf"{BASE_PATTERN}/(?!(?:paintings|artists)-by-)([\w-]+)/([\w-]+)" example = "https://www.wikiart.org/en/ARTIST/TITLE" def __init__(self, match): @@ -109,7 +109,7 @@ class WikiartArtworksExtractor(WikiartExtractor): """Extractor for artwork collections on wikiart.org""" subcategory = "artworks" directory_fmt = ("{category}", "Artworks by {group!c}", "{type}") - pattern = BASE_PATTERN + r"/paintings-by-([\w-]+)/([\w-]+)" + pattern = rf"{BASE_PATTERN}/paintings-by-([\w-]+)/([\w-]+)" example = "https://www.wikiart.org/en/paintings-by-GROUP/TYPE" def __init__(self, match): @@ -128,7 +128,7 @@ class WikiartArtworksExtractor(WikiartExtractor): class WikiartArtistsExtractor(WikiartExtractor): """Extractor for artist collections on wikiart.org""" subcategory = "artists" - pattern = (BASE_PATTERN + r"/artists-by-([\w-]+)/([\w-]+)") + pattern = (rf"{BASE_PATTERN}/artists-by-([\w-]+)/([\w-]+)") example = "https://www.wikiart.org/en/artists-by-GROUP/TYPE" def __init__(self, match): diff --git a/gallery_dl/extractor/wikifeet.py b/gallery_dl/extractor/wikifeet.py index 31dc9cd..a07fd84 100644 --- a/gallery_dl/extractor/wikifeet.py +++ b/gallery_dl/extractor/wikifeet.py @@ -34,8 +34,8 @@ class WikifeetGalleryExtractor(GalleryExtractor): "celeb" : self.celeb, "type" : self.type, "birthplace": text.unescape(extr('"bplace":"', '"')), - "birthday" : text.parse_datetime(text.unescape( - extr('"bdate":"', '"'))[:10], "%Y-%m-%d"), + "birthday" : self.parse_datetime_iso(text.unescape(extr( + '"bdate":"', '"'))[:10]), "shoesize" : text.unescape(extr('"ssize":', ',')), "rating" : text.parse_float(extr('"score":', ',')), "celebrity" : text.unescape(extr('"cname":"', '"')), diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py index ba020d5..70e42c6 100644 --- a/gallery_dl/extractor/wikimedia.py +++ b/gallery_dl/extractor/wikimedia.py @@ -22,25 +22,32 @@ class WikimediaExtractor(BaseExtractor): 
request_interval = (1.0, 2.0) def __init__(self, match): - BaseExtractor.__init__(self, match) + self._init_category(match) + self.format = False if self.category == "wikimedia": - self.category = self.root.split(".")[-2] + labels = self.root.split(".") + self.lang = labels[-3][-2:] + self.category = labels[-2] elif self.category in ("fandom", "wikigg"): + self.lang = "en" + self.format = "original" + self.basesubcategory = self.category self.category = ( f"{self.category}-" f"{self.root.partition('.')[0].rpartition('/')[2]}") - - self.per_page = self.config("limit", 50) - self.subcategories = False + else: + self.lang = "" if useragent := self.config_instance("useragent"): self.useragent = useragent + BaseExtractor.__init__(self, match) + def _init(self): if api_path := self.config_instance("api-path"): if api_path[0] == "/": - self.api_url = self.root + api_path + self.api_url = f"{self.root}{api_path}" else: self.api_url = api_path else: @@ -51,12 +58,15 @@ class WikimediaExtractor(BaseExtractor): # https://www.mediawiki.org/wiki/API:Revisions # https://www.mediawiki.org/wiki/API:Imageinfo self.image_revisions = self.config("image-revisions", 1) + self.format = self.config("format", self.format) + self.per_page = self.config("limit", 50) + self.subcategories = False @cache(maxage=36500*86400, keyarg=1) def _search_api_path(self, root): self.log.debug("Probing possible API endpoints") for path in ("/api.php", "/w/api.php", "/wiki/api.php"): - url = root + path + url = f"{root}{path}" response = self.request(url, method="HEAD", fatal=None) if response.status_code < 400: return url @@ -74,12 +84,19 @@ class WikimediaExtractor(BaseExtractor): m["name"]: m["value"] for m in image["commonmetadata"] or ()} - text.nameext_from_url(image["canonicaltitle"].partition(":")[2], image) - image["date"] = text.parse_datetime( - image["timestamp"], "%Y-%m-%dT%H:%M:%SZ") + text.nameext_from_name( + image["canonicaltitle"].partition(":")[2], image) + image["date"] = 
self.parse_datetime_iso(image["timestamp"]) + + if self.format: + url = image["url"] + image["url"] = (f"{url}{'&' if '?' in url else '?'}" + f"format={self.format}") def items(self): - for info in self._pagination(self.params): + params = self.params() + + for info in self._pagination(params): try: images = info.pop("imageinfo") except KeyError: @@ -88,7 +105,7 @@ class WikimediaExtractor(BaseExtractor): info["count"] = len(images) self.prepare_info(info) - yield Message.Directory, info + yield Message.Directory, "", info num = 0 for image in images: @@ -105,10 +122,10 @@ class WikimediaExtractor(BaseExtractor): yield Message.Url, image["url"], image if self.subcategories: - base = self.root + "/wiki/" - self.params["gcmtype"] = "subcat" - for subcat in self._pagination(self.params): - url = base + subcat["title"].replace(" ", "_") + base = f"{self.root}/wiki/" + params["gcmtype"] = "subcat" + for subcat in self._pagination(params): + url = f"{base}{subcat['title'].replace(' ', '_')}" subcat["_extractor"] = WikimediaArticleExtractor yield Message.Queue, url, subcat @@ -219,7 +236,7 @@ class WikimediaArticleExtractor(WikimediaExtractor): """Extractor for wikimedia articles""" subcategory = "article" directory_fmt = ("{category}", "{page}") - pattern = BASE_PATTERN + r"/(?!static/)([^?#]+)" + pattern = rf"{BASE_PATTERN}/(?!static/)([^?#]+)" example = "https://en.wikipedia.org/wiki/TITLE" def __init__(self, match): @@ -227,53 +244,54 @@ class WikimediaArticleExtractor(WikimediaExtractor): path = self.groups[-1] if path[2] == "/": - self.root = self.root + "/" + path[:2] + self.lang = lang = path[:2] + self.root = f"{self.root}/{lang}" path = path[3:] if path.startswith("wiki/"): path = path[5:] + self.path = text.unquote(path) pre, sep, _ = path.partition(":") - prefix = pre.lower() if sep else None - - self.title = path = text.unquote(path) - if prefix: + self.prefix = prefix = pre.lower() if sep else None + if prefix is not None: self.subcategory = prefix - if 
prefix == "category": + def params(self): + if self.prefix == "category": if self.config("subcategories", True): self.subcategories = True - self.params = { + return { "generator": "categorymembers", - "gcmtitle" : path, + "gcmtitle" : self.path, "gcmtype" : "file", "gcmlimit" : self.per_page, } - elif prefix == "file": - self.params = { - "titles" : path, - } - else: - self.params = { - "generator": "images", - "gimlimit" : self.per_page, - "titles" : path, + + if self.prefix == "file": + return { + "titles": self.path, } + return { + "generator": "images", + "gimlimit" : self.per_page, + "titles" : self.path, + } + def prepare_info(self, info): - info["page"] = self.title + info["page"] = self.path + info["lang"] = self.lang class WikimediaWikiExtractor(WikimediaExtractor): """Extractor for all files on a MediaWiki instance""" subcategory = "wiki" - pattern = BASE_PATTERN + r"/?$" + pattern = rf"{BASE_PATTERN}/?$" example = "https://en.wikipedia.org/" - def __init__(self, match): - WikimediaExtractor.__init__(self, match) - + def params(self): # ref: https://www.mediawiki.org/wiki/API:Allpages - self.params = { + return { "generator" : "allpages", "gapnamespace": 6, # "File" namespace "gaplimit" : self.per_page, diff --git a/gallery_dl/extractor/xasiat.py b/gallery_dl/extractor/xasiat.py index 6aa3168..d4dbea1 100644 --- a/gallery_dl/extractor/xasiat.py +++ b/gallery_dl/extractor/xasiat.py @@ -7,7 +7,7 @@ """Extractors for https://www.xasiat.com""" from .common import Extractor, Message -from .. import text, util +from .. 
import text import time BASE_PATTERN = r"(?:https?://)?(?:www\.)?xasiat\.com((?:/fr|/ja)?/albums" @@ -29,7 +29,7 @@ class XasiatExtractor(Extractor): def _pagination(self, path, pnum=1): url = f"{self.root}{path}/" - find_posts = util.re(r'class="item ">\s*<a href="([^"]+)').findall + find_posts = text.re(r'class="item ">\s*<a href="([^"]+)').findall while True: params = { @@ -38,7 +38,7 @@ class XasiatExtractor(Extractor): "block_id": "list_albums_common_albums_list", "sort_by": "post_date", "from": pnum, - "_": int(time.time() * 1000) + "_": int(time.time() * 1000), } page = self.request(url, params=params).text @@ -52,7 +52,7 @@ class XasiatExtractor(Extractor): class XasiatAlbumExtractor(XasiatExtractor): subcategory = "album" - pattern = BASE_PATTERN + r"/(\d+)/[^/?#]+)" + pattern = rf"{BASE_PATTERN}/(\d+)/[^/?#]+)" example = "https://www.xasiat.com/albums/12345/TITLE/" def items(self): @@ -66,38 +66,37 @@ class XasiatAlbumExtractor(XasiatExtractor): images = extr('class="images"', "</div>") urls = list(text.extract_iter(images, 'href="', '"')) - + categories = text.re(r'categories/[^"]+\">\s*(.+)\s*</a').findall(info) data = { "title": text.unescape(title), - "model": util.re( + "model": text.re( r'top_models1"></i>\s*(.+)\s*</span').findall(info), - "tags": util.re( + "tags": text.re( r'tags/[^"]+\">\s*(.+)\s*</a').findall(info), - "album_category": util.re( - r'categories/[^"]+\">\s*(.+)\s*</a').findall(info)[0], + "album_category": categories[0] if categories else "", "album_url": response.url, "album_id": text.parse_int(album_id), "count": len(urls), } - yield Message.Directory, data + yield Message.Directory, "", data for data["num"], url in enumerate(urls, 1): yield Message.Url, url, text.nameext_from_url(url[:-1], data) class XasiatTagExtractor(XasiatExtractor): subcategory = "tag" - pattern = BASE_PATTERN + r"/tags/[^/?#]+)" + pattern = rf"{BASE_PATTERN}/tags/[^/?#]+)" example = "https://www.xasiat.com/albums/tags/TAG/" class 
XasiatCategoryExtractor(XasiatExtractor): subcategory = "category" - pattern = BASE_PATTERN + r"/categories/[^/?#]+)" + pattern = rf"{BASE_PATTERN}/categories/[^/?#]+)" example = "https://www.xasiat.com/albums/categories/CATEGORY/" class XasiatModelExtractor(XasiatExtractor): subcategory = "model" - pattern = BASE_PATTERN + r"/models/[^/?#]+)" + pattern = rf"{BASE_PATTERN}/models/[^/?#]+)" example = "https://www.xasiat.com/albums/models/MODEL/" diff --git a/gallery_dl/extractor/xenforo.py b/gallery_dl/extractor/xenforo.py new file mode 100644 index 0000000..d8536b0 --- /dev/null +++ b/gallery_dl/extractor/xenforo.py @@ -0,0 +1,348 @@ +# -*- coding: utf-8 -*- + +# Copyright 2025 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for XenForo forums""" + +from .common import BaseExtractor, Message +from .. import text, exception +from ..cache import cache + + +class XenforoExtractor(BaseExtractor): + """Base class for xenforo extractors""" + basecategory = "xenforo" + directory_fmt = ("{category}", "{thread[section]}", + "{thread[title]} ({thread[id]})") + filename_fmt = "{post[id]}_{num:>02}_{id}_{filename}.{extension}" + archive_fmt = "{post[id]}/{type[0]}{id}_{filename}" + + def __init__(self, match): + BaseExtractor.__init__(self, match) + self.cookies_domain = "." + self.root.split("/")[2] + self.cookies_names = self.config_instance("cookies") + + def items(self): + self.login() + + extract_urls = text.re( + r'(?s)(?:' + r'<video (.*?\ssrc="[^"]+".*?)</video>' + r'|<a [^>]*?href="[^"]*?' + r'(/(?:index\.php\?)?attachments/[^"]+".*?)</a>' + r'|<div [^>]*?data-src="[^"]*?' 
+ r'(/(?:index\.php\?)attachments/[^"]+".*?)/>' + r'|(?:<a [^>]*?href="|<iframe [^>]*?src="|' + r'''onclick="loadMedia\(this, ')([^"']+)''' + r')' + ).findall + + for post in self.posts(): + urls = extract_urls(post["content"]) + if post["attachments"]: + urls.extend(extract_urls(post["attachments"])) + + data = {"post": post} + post["count"] = data["count"] = len(urls) + yield Message.Directory, "", data + + id_last = None + data["_http_expected_status"] = (403,) + data["_http_validate"] = self._validate + data["num"] = data["num_internal"] = data["num_external"] = 0 + for video, inl1, inl2, ext in urls: + if ext: + data["num"] += 1 + data["num_external"] += 1 + data["type"] = "external" + if ext[0] == "/": + if ext[1] == "/": + ext = "https:" + ext + else: + continue + yield Message.Queue, ext, data + + elif video: + data["num"] += 1 + data["num_internal"] += 1 + data["type"] = "video" + url = text.extr(video, 'src="', '"') + text.nameext_from_url(url, data) + data["id"] = text.parse_int( + data["filename"].partition("-")[0]) + yield Message.Url, url, data + + elif (inline := inl1 or inl2): + path = inline[:inline.find('"')] + name, _, id = path[path.rfind("/", 0, -1):].strip( + "/").rpartition(".") + if id == id_last: + id_last = None + continue + else: + id_last = id + data["id"] = text.parse_int(id) + if alt := text.extr(inline, 'alt="', '"'): + text.nameext_from_name(alt, data) + if not data["extension"]: + data["extension"] = name.rpartition("-")[2] + else: + data["filename"], _, data["extension"] = \ + name.rpartition("-") + data["num"] += 1 + data["num_internal"] += 1 + data["type"] = "inline" + yield Message.Url, self.root + path, data + + def request_page(self, url): + try: + return self.request(url) + except exception.HttpError as exc: + if exc.status == 403 and b">Log in<" in exc.response.content: + self._require_auth(exc.response) + raise + + def login(self): + if self.cookies_check(self.cookies_names): + return + + username, password = 
self._get_auth_info() + if username: + self.cookies_update(self._login_impl(username, password)) + + @cache(maxage=365*86400, keyarg=1) + def _login_impl(self, username, password): + self.log.info("Logging in as %s", username) + + url = f"{self.root}/login/login" + page = self.request(url).text + data = { + "_xfToken": text.extr(page, 'name="_xfToken" value="', '"'), + "login" : username, + "password": password, + "remember": "1", + "_xfRedirect": "", + } + response = self.request(url, method="POST", data=data) + + if not response.history: + err = self._extract_error(response.text) + raise exception.AuthenticationError(f'"{err}"') + + return { + cookie.name: cookie.value + for cookie in self.cookies + if cookie.domain.endswith(self.cookies_domain) + } + + def _pagination(self, base, pnum=None): + base = f"{self.root}{base}" + + if pnum is None: + url = f"{base}/" + pnum = 1 + else: + url = f"{base}/page-{pnum}" + pnum = None + + while True: + page = self.request_page(url).text + + yield page + + if pnum is None or "pageNav-jump--next" not in page: + return + pnum += 1 + url = f"{base}/page-{pnum}" + + def _pagination_reverse(self, base, pnum=None): + base = f"{self.root}{base}" + + url = f"{base}/page-{'9999' if pnum is None else pnum}" + with self.request_page(url) as response: + if pnum is None and not response.history: + self._require_auth() + url = response.url + if url[-1] == "/": + pnum = 1 + else: + pnum = text.parse_int(url[url.rfind("-")+1:], 1) + page = response.text + + while True: + yield page + + pnum -= 1 + if pnum > 1: + url = f"{base}/page-{pnum}" + elif pnum == 1: + url = f"{base}/" + else: + return + + page = self.request_page(url).text + + def _extract_error(self, html): + return text.unescape(text.extr( + html, "blockMessage--error", "</").rpartition(">")[2].strip()) + + def _parse_thread(self, page): + try: + data = self._extract_jsonld(page) + except ValueError: + return {} + + schema = data.get("mainEntity", data) + author = schema["author"] 
+ stats = schema["interactionStatistic"] + url_t = schema.get("url") or schema.get("@id") or "" + url_a = author.get("url") or "" + + thread = { + "id" : url_t[url_t.rfind(".")+1:-1], + "url" : url_t, + "title": schema["headline"], + "date" : self.parse_datetime_iso(schema["datePublished"]), + "tags" : (schema["keywords"].split(", ") + if "keywords" in schema else ()), + "section" : schema["articleSection"], + "author" : author.get("name") or "", + "author_id" : (url_a[url_a.rfind(".")+1:-1] if url_a else + (author.get("name") or "")[15:]), + "author_url": url_a, + } + + if isinstance(stats, list): + thread["views"] = stats[0]["userInteractionCount"] + thread["posts"] = stats[1]["userInteractionCount"] + else: + thread["views"] = -1 + thread["posts"] = stats["userInteractionCount"] + + return thread + + def _parse_post(self, html): + extr = text.extract_from(html) + + post = { + "author": extr('data-author="', '"'), + "id": extr('data-content="post-', '"'), + "author_url": (extr('itemprop="url" content="', '"') or + extr('<a href="', '"')), + "date": self.parse_datetime_iso(extr('datetime="', '"')), + "content": extr('class="message-body', + '<div class="js-selectToQuote'), + "attachments": extr('<section class="message-attachments">', + '</section>'), + } + + url_a = post["author_url"] + post["author_id"] = url_a[url_a.rfind(".")+1:-1] + + con = post["content"] + if (pos := con.find('<div class="bbWrapper')) >= 0: + con = con[pos:] + post["content"] = con.strip() + + return post + + def _require_auth(self, response=None): + raise exception.AuthRequired( + ("username & password", "authenticated cookies"), None, + None if response is None else self._extract_error(response.text)) + + def _validate(self, response): + if response.status_code == 403 and b">Log in<" in response.content: + self._require_auth(response) + return True + + +BASE_PATTERN = XenforoExtractor.update({ + "simpcity": { + "root": "https://simpcity.cr", + "pattern": r"(?:www\.)?simpcity\.(?:cr|su)", 
+ "cookies": ("ogaddgmetaprof_user",), + }, + "nudostarforum": { + "root": "https://nudostar.com/forum", + "pattern": r"(?:www\.)?nudostar\.com/forum", + "cookies": ("xf_user",), + }, + "atfforum": { + "root": "https://allthefallen.moe/forum", + "pattern": r"(?:www\.)?allthefallen\.moe/forum", + "cookies": ("xf_user",), + }, +}) + + +class XenforoPostExtractor(XenforoExtractor): + subcategory = "post" + pattern = (rf"{BASE_PATTERN}(/(?:index\.php\?)?threads" + rf"/[^/?#]+/post-|/posts/)(\d+)") + example = "https://simpcity.cr/threads/TITLE.12345/post-54321" + + def posts(self): + path = self.groups[-2] + post_id = self.groups[-1] + url = f"{self.root}{path}{post_id}/" + page = self.request_page(url).text + + pos = page.find(f'data-content="post-{post_id}"') + if pos < 0: + raise exception.NotFoundError("post") + html = text.extract(page, "<article ", "<footer", pos-200)[0] + + self.kwdict["thread"] = self._parse_thread(page) + return (self._parse_post(html),) + + +class XenforoThreadExtractor(XenforoExtractor): + subcategory = "thread" + pattern = (rf"{BASE_PATTERN}(/(?:index\.php\?)?threads" + rf"/(?:[^/?#]+\.)?\d+)(?:/page-(\d+))?") + example = "https://simpcity.cr/threads/TITLE.12345/" + + def posts(self): + path = self.groups[-2] + pnum = self.groups[-1] + + if (order := self.config("order-posts")) and \ + order[0] not in ("d", "r"): + pages = self._pagination(path, pnum) + reverse = False + else: + pages = self._pagination_reverse(path, pnum) + reverse = True + + for page in pages: + if "thread" not in self.kwdict: + self.kwdict["thread"] = self._parse_thread(page) + posts = text.extract_iter(page, "<article ", "<footer") + if reverse: + posts = list(posts) + posts.reverse() + for html in posts: + yield self._parse_post(html) + + +class XenforoForumExtractor(XenforoExtractor): + subcategory = "forum" + pattern = (rf"{BASE_PATTERN}(/(?:index\.php\?)?forums" + rf"/(?:[^/?#]+\.)?[^/?#]+)(?:/page-(\d+))?") + example = "https://simpcity.cr/forums/TITLE.123/" + + 
def items(self): + extract_threads = text.re( + r'(/(?:index\.php\?)?threads/[^"]+)"[^>]+data-xf-init=').findall + + data = {"_extractor": XenforoThreadExtractor} + path = self.groups[-2] + pnum = self.groups[-1] + for page in self._pagination(path, pnum): + for path in extract_threads(page): + yield Message.Queue, f"{self.root}{text.unquote(path)}", data diff --git a/gallery_dl/extractor/xfolio.py b/gallery_dl/extractor/xfolio.py index 12f437a..8caff85 100644 --- a/gallery_dl/extractor/xfolio.py +++ b/gallery_dl/extractor/xfolio.py @@ -45,7 +45,7 @@ class XfolioExtractor(Extractor): class XfolioWorkExtractor(XfolioExtractor): subcategory = "work" - pattern = BASE_PATTERN + r"/portfolio/([^/?#]+)/works/(\d+)" + pattern = rf"{BASE_PATTERN}/portfolio/([^/?#]+)/works/(\d+)" example = "https://xfolio.jp/portfolio/USER/works/12345" def items(self): @@ -57,7 +57,7 @@ class XfolioWorkExtractor(XfolioExtractor): files = self._extract_files(html, work) work["count"] = len(files) - yield Message.Directory, work + yield Message.Directory, "", work for work["num"], file in enumerate(files, 1): file.update(work) yield Message.Url, file["url"], file @@ -107,7 +107,7 @@ class XfolioWorkExtractor(XfolioExtractor): class XfolioUserExtractor(XfolioExtractor): subcategory = "user" - pattern = BASE_PATTERN + r"/portfolio/([^/?#]+)(?:/works)?/?(?:$|\?|#)" + pattern = rf"{BASE_PATTERN}/portfolio/([^/?#]+)(?:/works)?/?(?:$|\?|#)" example = "https://xfolio.jp/portfolio/USER" def works(self): @@ -129,7 +129,7 @@ class XfolioUserExtractor(XfolioExtractor): class XfolioSeriesExtractor(XfolioExtractor): subcategory = "series" - pattern = BASE_PATTERN + r"/portfolio/([^/?#]+)/series/(\d+)" + pattern = rf"{BASE_PATTERN}/portfolio/([^/?#]+)/series/(\d+)" example = "https://xfolio.jp/portfolio/USER/series/12345" def works(self): diff --git a/gallery_dl/extractor/xhamster.py b/gallery_dl/extractor/xhamster.py index 6c97175..64113d3 100644 --- a/gallery_dl/extractor/xhamster.py +++ 
b/gallery_dl/extractor/xhamster.py @@ -31,12 +31,12 @@ class XhamsterGalleryExtractor(XhamsterExtractor): "{gallery[id]} {gallery[title]}") filename_fmt = "{num:>03}_{id}.{extension}" archive_fmt = "{id}" - pattern = BASE_PATTERN + r"(/photos/gallery/[^/?#]+)" + pattern = rf"{BASE_PATTERN}(/photos/gallery/[^/?#]+)" example = "https://xhamster.com/photos/gallery/12345" def items(self): data = self.metadata() - yield Message.Directory, data + yield Message.Directory, "", data for num, image in enumerate(self.images(), 1): url = image["imageURL"] image.update(data) @@ -67,7 +67,7 @@ class XhamsterGalleryExtractor(XhamsterExtractor): { "id" : text.parse_int(gallery["id"]), "tags" : [t["label"] for t in info["categoriesTags"]], - "date" : text.parse_timestamp(model["created"]), + "date" : self.parse_timestamp(model["created"]), "views" : text.parse_int(model["views"]), "likes" : text.parse_int(model["rating"]["likes"]), "dislikes" : text.parse_int(model["rating"]["dislikes"]), @@ -102,7 +102,7 @@ class XhamsterGalleryExtractor(XhamsterExtractor): class XhamsterUserExtractor(XhamsterExtractor): """Extractor for all galleries of an xhamster user""" subcategory = "user" - pattern = BASE_PATTERN + r"/users/([^/?#]+)(?:/photos)?/?(?:$|[?#])" + pattern = rf"{BASE_PATTERN}/users/([^/?#]+)(?:/photos)?/?(?:$|[?#])" example = "https://xhamster.com/users/USER/photos" def items(self): diff --git a/gallery_dl/extractor/xvideos.py b/gallery_dl/extractor/xvideos.py index 6c016ec..1f33eac 100644 --- a/gallery_dl/extractor/xvideos.py +++ b/gallery_dl/extractor/xvideos.py @@ -28,7 +28,7 @@ class XvideosGalleryExtractor(XvideosBase, GalleryExtractor): "{gallery[id]} {gallery[title]}") filename_fmt = "{category}_{gallery[id]}_{num:>03}.{extension}" archive_fmt = "{gallery[id]}_{num}" - pattern = BASE_PATTERN + r"/([^/?#]+)/photos/(\d+)" + pattern = rf"{BASE_PATTERN}/([^/?#]+)/photos/(\d+)" example = "https://www.xvideos.com/profiles/USER/photos/12345" def __init__(self, match): @@ -86,7 
+86,7 @@ class XvideosUserExtractor(XvideosBase, Extractor): """Extractor for user profiles on xvideos.com""" subcategory = "user" categorytransfer = True - pattern = BASE_PATTERN + r"/([^/?#]+)/?(?:#.*)?$" + pattern = rf"{BASE_PATTERN}/([^/?#]+)/?(?:#.*)?$" example = "https://www.xvideos.com/profiles/USER" def __init__(self, match): diff --git a/gallery_dl/extractor/yiffverse.py b/gallery_dl/extractor/yiffverse.py index 1595b4d..65289e2 100644 --- a/gallery_dl/extractor/yiffverse.py +++ b/gallery_dl/extractor/yiffverse.py @@ -55,8 +55,7 @@ class YiffverseExtractor(BooruExtractor): def _prepare(self, post): post.pop("files", None) - post["date"] = text.parse_datetime( - post["created"], "%Y-%m-%dT%H:%M:%S.%fZ") + post["date"] = self.parse_datetime_iso(post["created"]) post["filename"], _, post["format"] = post["filename"].rpartition(".") if "tags" in post: post["tags"] = [t["value"] for t in post["tags"]] @@ -99,7 +98,7 @@ class YiffverseExtractor(BooruExtractor): class YiffversePostExtractor(YiffverseExtractor): subcategory = "post" archive_fmt = "{id}" - pattern = BASE_PATTERN + r"/post/(\d+)" + pattern = rf"{BASE_PATTERN}/post/(\d+)" example = "https://yiffverse.com/post/12345" def posts(self): @@ -110,7 +109,7 @@ class YiffversePlaylistExtractor(YiffverseExtractor): subcategory = "playlist" directory_fmt = ("{category}", "{playlist_id}") archive_fmt = "p_{playlist_id}_{id}" - pattern = BASE_PATTERN + r"/playlist/(\d+)" + pattern = rf"{BASE_PATTERN}/playlist/(\d+)" example = "https://yiffverse.com/playlist/12345" def metadata(self): @@ -125,7 +124,7 @@ class YiffverseTagExtractor(YiffverseExtractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{id}" - pattern = BASE_PATTERN + r"/(?:tag/([^/?#]+))?(?:/?\?([^#]+))?(?:$|#)" + pattern = rf"{BASE_PATTERN}/(?:tag/([^/?#]+))?(?:/?\?([^#]+))?(?:$|#)" example = "https://yiffverse.com/tag/TAG" def _init(self): diff --git a/gallery_dl/extractor/ytdl.py 
b/gallery_dl/extractor/ytdl.py index eb33b65..ea3b615 100644 --- a/gallery_dl/extractor/ytdl.py +++ b/gallery_dl/extractor/ytdl.py @@ -114,7 +114,7 @@ class YoutubeDLExtractor(Extractor): info_dict.get("webpage_url") or self.ytdl_url) - yield Message.Directory, info_dict + yield Message.Directory, "", info_dict yield Message.Url, url, info_dict def _process_entries(self, ytdl_module, ytdl_instance, entries): diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py index 7bff83b..b4bbd5a 100644 --- a/gallery_dl/extractor/zerochan.py +++ b/gallery_dl/extractor/zerochan.py @@ -76,7 +76,7 @@ class ZerochanExtractor(BooruExtractor): data = { "id" : text.parse_int(entry_id), "file_url": jsonld["contentUrl"], - "date" : text.parse_datetime(jsonld["datePublished"]), + "date" : self.parse_datetime_iso(jsonld["datePublished"]), "width" : text.parse_int(jsonld["width"][:-3]), "height" : text.parse_int(jsonld["height"][:-3]), "size" : text.parse_bytes(jsonld["contentSize"][:-1]), @@ -128,7 +128,7 @@ class ZerochanExtractor(BooruExtractor): return data def _parse_json(self, txt): - txt = util.re(r"[\x00-\x1f\x7f]").sub("", txt) + txt = text.re(r"[\x00-\x1f\x7f]").sub("", txt) main, _, tags = txt.partition('tags": [') item = {} @@ -160,7 +160,7 @@ class ZerochanExtractor(BooruExtractor): class ZerochanTagExtractor(ZerochanExtractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") - pattern = BASE_PATTERN + r"/(?!\d+$)([^/?#]+)/?(?:\?([^#]+))?" + pattern = rf"{BASE_PATTERN}/(?!\d+$)([^/?#]+)/?(?:\?([^#]+))?" 
example = "https://www.zerochan.net/TAG" def __init__(self, match): @@ -286,7 +286,7 @@ class ZerochanTagExtractor(ZerochanExtractor): class ZerochanImageExtractor(ZerochanExtractor): subcategory = "image" - pattern = BASE_PATTERN + r"/(\d+)" + pattern = rf"{BASE_PATTERN}/(\d+)" example = "https://www.zerochan.net/12345" def posts(self): diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py index 5246f66..0787464 100644 --- a/gallery_dl/formatter.py +++ b/gallery_dl/formatter.py @@ -13,9 +13,8 @@ import sys import time import string import _string -import datetime import operator -from . import text, util +from . import text, util, dt NONE = util.NONE @@ -68,8 +67,8 @@ class StringFormatter(): - "g": calls text.slugify() - "j": calls json.dumps - "t": calls str.strip - - "T": calls util.datetime_to_timestamp_string() - - "d": calls text.parse_timestamp + - "T": calls dt.to_ts_string() + - "d": calls dt.parse_ts() - "s": calls str() - "S": calls util.to_string() - "U": calls urllib.parse.unescape @@ -331,10 +330,10 @@ def _slice(indices): ) -def _bytesgetter(slice, encoding=sys.getfilesystemencoding()): +def _bytesgetter(slice): def apply_slice_bytes(obj): - return obj.encode(encoding)[slice].decode(encoding, "ignore") + return obj.encode(_ENCODING)[slice].decode(_ENCODING, "ignore") return apply_slice_bytes @@ -414,15 +413,27 @@ def _parse_conversion(format_spec, default): def _parse_maxlen(format_spec, default): maxlen, replacement, format_spec = format_spec.split(_SEPARATOR, 2) - maxlen = text.parse_int(maxlen[1:]) fmt = _build_format_func(format_spec, default) - def mlen(obj): - obj = fmt(obj) - return obj if len(obj) <= maxlen else replacement + if maxlen[1] == "b": + maxlen = text.parse_int(maxlen[2:]) + + def mlen(obj): + obj = fmt(obj) + return obj if len(obj.encode(_ENCODING)) <= maxlen else replacement + else: + maxlen = text.parse_int(maxlen[1:]) + + def mlen(obj): + obj = fmt(obj) + return obj if len(obj) <= maxlen else replacement return mlen 
+def _parse_identity(format_spec, default): + return util.identity + + def _parse_join(format_spec, default): separator, _, format_spec = format_spec.partition(_SEPARATOR) join = separator[1:].join @@ -471,9 +482,9 @@ def _parse_datetime(format_spec, default): dt_format = dt_format[1:] fmt = _build_format_func(format_spec, default) - def dt(obj): - return fmt(text.parse_datetime(obj, dt_format)) - return dt + def dt_parse(obj): + return fmt(dt.parse(obj, dt_format)) + return dt_parse def _parse_offset(format_spec, default): @@ -482,15 +493,15 @@ def _parse_offset(format_spec, default): fmt = _build_format_func(format_spec, default) if not offset or offset == "local": - def off(dt): - local = time.localtime(util.datetime_to_timestamp(dt)) - return fmt(dt + datetime.timedelta(0, local.tm_gmtoff)) + def off(dt_utc): + local = time.localtime(dt.to_ts(dt_utc)) + return fmt(dt_utc + dt.timedelta(0, local.tm_gmtoff)) else: hours, _, minutes = offset.partition(":") offset = 3600 * int(hours) if minutes: offset += 60 * (int(minutes) if offset > 0 else -int(minutes)) - offset = datetime.timedelta(0, offset) + offset = dt.timedelta(0, offset) def off(obj): return fmt(obj + offset) @@ -502,25 +513,36 @@ def _parse_sort(format_spec, default): fmt = _build_format_func(format_spec, default) if "d" in args or "r" in args: - def sort_desc(obj): + def sort(obj): return fmt(sorted(obj, reverse=True)) - return sort_desc else: - def sort_asc(obj): + def sort(obj): return fmt(sorted(obj)) - return sort_asc + return sort def _parse_limit(format_spec, default): limit, hint, format_spec = format_spec.split(_SEPARATOR, 2) - limit = int(limit[1:]) - limit_hint = limit - len(hint) fmt = _build_format_func(format_spec, default) - def apply_limit(obj): - if len(obj) > limit: - obj = obj[:limit_hint] + hint - return fmt(obj) + if limit[1] == "b": + hint = hint.encode(_ENCODING) + limit = int(limit[2:]) + limit_hint = limit - len(hint) + + def apply_limit(obj): + objb = obj.encode(_ENCODING) + if 
len(objb) > limit: + obj = (objb[:limit_hint] + hint).decode(_ENCODING, "ignore") + return fmt(obj) + else: + limit = int(limit[1:]) + limit_hint = limit - len(hint) + + def apply_limit(obj): + if len(obj) > limit: + obj = obj[:limit_hint] + hint + return fmt(obj) return apply_limit @@ -541,6 +563,7 @@ class Literal(): _literal = Literal() _CACHE = {} +_ENCODING = sys.getfilesystemencoding() _SEPARATOR = "/" _FORMATTERS = { "E" : ExpressionFormatter, @@ -557,7 +580,7 @@ _FORMATTERS = { _GLOBALS = { "_env": lambda: os.environ, "_lit": lambda: _literal, - "_now": datetime.datetime.now, + "_now": dt.datetime.now, "_nul": lambda: util.NONE, } _CONVERSIONS = { @@ -569,9 +592,9 @@ _CONVERSIONS = { "t": str.strip, "n": len, "L": util.code_to_language, - "T": util.datetime_to_timestamp_string, - "d": text.parse_timestamp, - "D": util.to_datetime, + "T": dt.to_ts_string, + "d": dt.parse_ts, + "D": dt.convert, "U": text.unescape, "H": lambda s: text.unescape(text.remove_html(s)), "g": text.slugify, @@ -590,6 +613,7 @@ _FORMAT_SPECIFIERS = { "A": _parse_arithmetic, "C": _parse_conversion, "D": _parse_datetime, + "I": _parse_identity, "J": _parse_join, "L": _parse_maxlen, "M": _parse_map, diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 9369e5d..7a52bd6 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -87,17 +87,22 @@ class Job(): "current_git_head": util.git_head() } # user-supplied metadata - if kwdict := extr.config("keywords"): - if extr.config("keywords-eval"): - self.kwdict_eval = [] - for key, value in kwdict.items(): - if isinstance(value, str): - fmt = formatter.parse(value, None, util.identity) - self.kwdict_eval.append((key, fmt.format_map)) - else: - self.kwdict[key] = value - else: - self.kwdict.update(kwdict) + kwdict = extr.config("keywords") + if kwdict_global := extr.config("keywords-global"): + kwdict = {**kwdict_global, **kwdict} if kwdict else kwdict_global + elif not kwdict: + return + + if extr.config("keywords-eval"): + self.kwdict_eval 
= [] + for key, value in kwdict.items(): + if isinstance(value, str): + fmt = formatter.parse(value, None, util.identity) + self.kwdict_eval.append((key, fmt.format_map)) + else: + self.kwdict[key] = value + else: + self.kwdict.update(kwdict) def _build_config_path(self, parent): extr = self.extractor @@ -130,6 +135,8 @@ class Job(): if extr.basecategory: if not cfgpath: cfgpath.append((extr.category, extr.subcategory)) + if extr.basesubcategory: + cfgpath.append((extr.basesubcategory, extr.subcategory)) cfgpath.append((extr.basecategory, extr.subcategory)) return cfgpath @@ -138,37 +145,35 @@ class Job(): """Execute or run the job""" extractor = self.extractor log = extractor.log - msg = None self._init() # sleep before extractor start sleep = util.build_duration_func( extractor.config("sleep-extractor")) - if sleep: + if sleep is not None: extractor.sleep(sleep(), "extractor") try: - for msg in extractor: - self.dispatch(msg) + msg = self.dispatch(extractor) except exception.StopExtraction as exc: if exc.depth > 1 and exc.target != extractor.__class__.subcategory: exc.depth -= 1 raise pass except exception.AbortExtraction as exc: + log.traceback(exc) log.error(exc.message) self.status |= exc.code except (exception.TerminateExtraction, exception.RestartExtraction): raise except exception.GalleryDLException as exc: log.error("%s: %s", exc.__class__.__name__, exc) - log.debug("", exc_info=exc) + log.traceback(exc) self.status |= exc.code except OSError as exc: - log.debug("", exc_info=exc) - name = exc.__class__.__name__ - if name == "JSONDecodeError": + log.traceback(exc) + if (name := exc.__class__.__name__) == "JSONDecodeError": log.error("Failed to parse JSON data: %s: %s", name, exc) self.status |= 1 else: # regular OSError @@ -180,7 +185,7 @@ class Job(): "copy its output and report this issue on " "https://github.com/mikf/gallery-dl/issues ."), exc.__class__.__name__, exc) - log.debug("", exc_info=exc) + log.traceback(exc) self.status |= 1 except 
BaseException: self.status |= 1 @@ -196,31 +201,47 @@ class Job(): self.status |= s return self.status - def dispatch(self, msg): + def dispatch(self, messages): """Call the appropriate message handler""" - if msg[0] == Message.Url: - _, url, kwdict = msg - if self.metadata_url: - kwdict[self.metadata_url] = url - if self.pred_url(url, kwdict): - self.update_kwdict(kwdict) - self.handle_url(url, kwdict) - if FLAGS.FILE is not None: - FLAGS.process("FILE") - - elif msg[0] == Message.Directory: - self.update_kwdict(msg[1]) - self.handle_directory(msg[1]) - - elif msg[0] == Message.Queue: - _, url, kwdict = msg - if self.metadata_url: - kwdict[self.metadata_url] = url - if self.pred_queue(url, kwdict): - self.update_kwdict(kwdict) - self.handle_queue(url, kwdict) - if FLAGS.CHILD is not None: - FLAGS.process("CHILD") + msg = None + process = True + + for msg, url, kwdict in messages: + + if msg == Message.Directory: + if self.pred_post(url, kwdict): + process = True + self.update_kwdict(kwdict) + self.handle_directory(kwdict) + else: + process = None + if FLAGS.POST is not None: + FLAGS.process("POST") + + elif process is None: + continue + + elif msg == Message.Url: + if self.metadata_url: + kwdict[self.metadata_url] = url + if self.pred_url(url, kwdict): + self.update_kwdict(kwdict) + self.handle_url(url, kwdict) + if FLAGS.FILE is not None: + FLAGS.process("FILE") + + elif msg == Message.Queue: + if process is None: + continue + if self.metadata_url: + kwdict[self.metadata_url] = url + if self.pred_queue(url, kwdict): + self.update_kwdict(kwdict) + self.handle_queue(url, kwdict) + if FLAGS.CHILD is not None: + FLAGS.process("CHILD") + + return msg def handle_url(self, url, kwdict): """Handle Message.Url""" @@ -252,15 +273,16 @@ class Job(): def _init(self): self.extractor.initialize() self.pred_url = self._prepare_predicates("image", True) + self.pred_post = self._prepare_predicates("post", False) self.pred_queue = self._prepare_predicates("chapter", False) def 
_prepare_predicates(self, target, skip=True): predicates = [] - if self.extractor.config(target + "-unique"): + if self.extractor.config(f"{target}-unique"): predicates.append(util.UniquePredicate()) - if pfilter := self.extractor.config(target + "-filter"): + if pfilter := self.extractor.config(f"{target}-filter"): try: pred = util.FilterPredicate(pfilter, target) except (SyntaxError, ValueError, TypeError) as exc: @@ -268,7 +290,7 @@ class Job(): else: predicates.append(pred) - if prange := self.extractor.config(target + "-range"): + if prange := self.extractor.config(f"{target}-range"): try: pred = util.RangePredicate(prange) except ValueError as exc: @@ -288,7 +310,7 @@ class Job(): return self._logger_adapter(logger, self) def _write_unsupported(self, url): - if self.ulog: + if self.ulog is not None: self.ulog.info(url) @@ -321,7 +343,7 @@ class DownloadJob(Job): for callback in hooks["prepare"]: callback(pathfmt) - if archive and archive.check(kwdict): + if archive is not None and archive.check(kwdict): pathfmt.fix_extension() self.handle_skip() return @@ -330,7 +352,7 @@ class DownloadJob(Job): pathfmt.build_path() if pathfmt.exists(): - if archive and self._archive_write_skip: + if archive is not None and self._archive_write_skip: archive.add(kwdict) self.handle_skip() return @@ -340,12 +362,12 @@ class DownloadJob(Job): callback(pathfmt) if kwdict.pop("_file_recheck", False) and pathfmt.exists(): - if archive and self._archive_write_skip: + if archive is not None and self._archive_write_skip: archive.add(kwdict) self.handle_skip() return - if self.sleep: + if self.sleep is not None: self.extractor.sleep(self.sleep(), "download") # download from URL @@ -369,7 +391,7 @@ class DownloadJob(Job): return if not pathfmt.temppath: - if archive and self._archive_write_skip: + if archive is not None and self._archive_write_skip: archive.add(kwdict) self.handle_skip() return @@ -383,15 +405,17 @@ class DownloadJob(Job): pathfmt.finalize() 
self.out.success(pathfmt.path) self._skipcnt = 0 - if archive and self._archive_write_file: + if archive is not None and self._archive_write_file: archive.add(kwdict) if "after" in hooks: for callback in hooks["after"]: callback(pathfmt) + if archive is not None and self._archive_write_after: + archive.add(kwdict) def handle_directory(self, kwdict): """Set and create the target directory for downloads""" - if not self.pathfmt: + if self.pathfmt is None: self.initialize(kwdict) else: if "post-after" in self.hooks: @@ -428,7 +452,8 @@ class DownloadJob(Job): else: extr._parentdir = pextr._parentdir - if pmeta := pextr.config2("parent-metadata", "metadata-parent"): + if pmeta := pextr.config2( + "parent-metadata", "metadata-parent", pextr.parent): if isinstance(pmeta, str): data = self.kwdict.copy() if kwdict: @@ -509,7 +534,7 @@ class DownloadJob(Job): self.out.skip(pathfmt.path) if self._skipexc: - if not self._skipftr or self._skipftr(pathfmt.kwdict): + if self._skipftr is None or self._skipftr(pathfmt.kwdict): self._skipcnt += 1 if self._skipcnt >= self._skipmax: raise self._skipexc @@ -553,7 +578,7 @@ class DownloadJob(Job): cfg = extr.config pathfmt = self.pathfmt = path.PathFormat(extr) - if kwdict: + if kwdict is not None: pathfmt.set_directory(kwdict) self.sleep = util.build_duration_func(cfg("sleep")) @@ -593,11 +618,13 @@ class DownloadJob(Job): if events is None: self._archive_write_file = True self._archive_write_skip = False + self._archive_write_after = False else: if isinstance(events, str): events = events.split(",") self._archive_write_file = ("file" in events) self._archive_write_skip = ("skip" in events) + self._archive_write_after = ("after" in events) if skip := cfg("skip", True): self._skipexc = None @@ -621,7 +648,7 @@ class DownloadJob(Job): else: # monkey-patch methods to always return False pathfmt.exists = lambda x=None: False - if self.archive: + if self.archive is not None: self.archive.check = pathfmt.exists if not cfg("postprocess", 
True): @@ -681,15 +708,15 @@ class DownloadJob(Job): pp_dict["__init__"] = None pp_cls = postprocessor.find(name) - if not pp_cls: + if pp_cls is None: pp_log.warning("module '%s' not found", name) continue try: pp_obj = pp_cls(self, pp_dict) except Exception as exc: + pp_log.traceback(exc) pp_log.error("'%s' initialization failed: %s: %s", name, exc.__class__.__name__, exc) - pp_log.debug("", exc_info=exc) else: pp_list.append(pp_obj) @@ -706,15 +733,11 @@ class DownloadJob(Job): condition = util.compile_filter(expr) for hook, callback in hooks.items(): self.hooks[hook].append(functools.partial( - self._call_hook, callback, condition)) + _call_hook_condition, callback, condition)) else: for hook, callback in hooks.items(): self.hooks[hook].append(callback) - def _call_hook(self, callback, condition, pathfmt): - if condition(pathfmt.kwdict): - callback(pathfmt) - def _build_extractor_filter(self): clist = self.extractor.config("whitelist") if clist is not None: @@ -730,20 +753,25 @@ class DownloadJob(Job): return util.build_extractor_filter(clist, negate, special) +def _call_hook_condition(callback, condition, pathfmt): + if condition(pathfmt.kwdict): + callback(pathfmt) + + class SimulationJob(DownloadJob): """Simulate the extraction process without downloading anything""" def handle_url(self, url, kwdict): ext = kwdict["extension"] or "jpg" kwdict["extension"] = self.pathfmt.extension_map(ext, ext) - if self.sleep: + if self.sleep is not None: self.extractor.sleep(self.sleep(), "download") - if self.archive and self._archive_write_skip: + if self.archive is not None and self._archive_write_skip: self.archive.add(kwdict) self.out.skip(self.pathfmt.build_filename(kwdict)) def handle_directory(self, kwdict): - if not self.pathfmt: + if self.pathfmt is None: self.initialize() @@ -931,13 +959,12 @@ class DataJob(Job): extractor = self.extractor sleep = util.build_duration_func( extractor.config("sleep-extractor")) - if sleep: + if sleep is not None: 
extractor.sleep(sleep(), "extractor") # collect data try: - for msg in extractor: - self.dispatch(msg) + self.dispatch(extractor) except exception.StopExtraction: pass except Exception as exc: diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 05cc9d3..a47d8cd 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -698,10 +698,15 @@ def build_parser(): "(e.g. '5', '8-20', or '1:24:3')"), ) selection.add_argument( + "--post-range", + dest="post-range", metavar="RANGE", action=ConfigAction, + help=("Like '--range', but for posts"), + ) + selection.add_argument( "--chapter-range", dest="chapter-range", metavar="RANGE", action=ConfigAction, - help=("Like '--range', but applies to manga chapters " - "and other delegated URLs"), + help=("Like '--range', but for child extractors handling " + "manga chapters, external URLs, etc."), ) selection.add_argument( "--filter", @@ -713,10 +718,15 @@ def build_parser(): "rating in ('s', 'q')\""), ) selection.add_argument( + "--post-filter", + dest="post-filter", metavar="EXPR", action=ConfigAction, + help=("Like '--filter', but for posts"), + ) + selection.add_argument( "--chapter-filter", dest="chapter-filter", metavar="EXPR", action=ConfigAction, - help=("Like '--filter', but applies to manga chapters " - "and other delegated URLs"), + help=("Like '--filter', but for child extractors handling " + "manga chapters, external URLs, etc."), ) infojson = { diff --git a/gallery_dl/output.py b/gallery_dl/output.py index 9e0888b..fe7235e 100644 --- a/gallery_dl/output.py +++ b/gallery_dl/output.py @@ -89,6 +89,11 @@ class LoggerAdapter(): self.logger = logger self.extra = job._logger_extra + def traceback(self, exc): + if self.logger.isEnabledFor(logging.DEBUG): + self.logger._log( + logging.DEBUG, "", None, exc_info=exc, extra=self.extra) + def debug(self, msg, *args, **kwargs): if self.logger.isEnabledFor(logging.DEBUG): kwargs["extra"] = self.extra @@ -171,6 +176,48 @@ class Formatter(logging.Formatter): return msg 
+class FileHandler(logging.StreamHandler): + def __init__(self, path, mode, encoding, delay=True): + self.path = path + self.mode = mode + self.errors = None + self.encoding = encoding + + if delay: + logging.Handler.__init__(self) + self.stream = None + self.emit = self.emit_delayed + else: + logging.StreamHandler.__init__(self, self._open()) + + def close(self): + with self.lock: + try: + if self.stream: + try: + self.flush() + self.stream.close() + finally: + self.stream = None + finally: + logging.StreamHandler.close(self) + + def _open(self): + try: + return open(self.path, self.mode, + encoding=self.encoding, errors=self.errors) + except FileNotFoundError: + os.makedirs(os.path.dirname(self.path)) + return open(self.path, self.mode, + encoding=self.encoding, errors=self.errors) + + def emit_delayed(self, record): + if self.mode != "w" or not self._closed: + self.stream = self._open() + self.emit = logging.StreamHandler.emit.__get__(self) + self.emit(record) + + def initialize_logging(loglevel): """Setup basic logging functionality before configfiles have been loaded""" # convert levelnames to lowercase @@ -242,7 +289,8 @@ def configure_logging(loglevel): root.setLevel(minlevel) -def setup_logging_handler(key, fmt=LOG_FORMAT, lvl=LOG_LEVEL, mode="w"): +def setup_logging_handler(key, fmt=LOG_FORMAT, lvl=LOG_LEVEL, mode="w", + defer=False): """Setup a new logging handler""" opts = config.interpolate(("output",), key) if not opts: @@ -253,12 +301,10 @@ def setup_logging_handler(key, fmt=LOG_FORMAT, lvl=LOG_LEVEL, mode="w"): path = opts.get("path") mode = opts.get("mode", mode) encoding = opts.get("encoding", "utf-8") + delay = opts.get("defer", defer) try: path = util.expand_path(path) - handler = logging.FileHandler(path, mode, encoding) - except FileNotFoundError: - os.makedirs(os.path.dirname(path)) - handler = logging.FileHandler(path, mode, encoding) + handler = FileHandler(path, mode, encoding, delay) except (OSError, ValueError) as exc: 
logging.getLogger("gallery-dl").warning( "%s: %s", key, exc) diff --git a/gallery_dl/path.py b/gallery_dl/path.py index 763fb55..be2dcc9 100644 --- a/gallery_dl/path.py +++ b/gallery_dl/path.py @@ -31,6 +31,8 @@ class PathFormat(): if kwdefault is None: kwdefault = util.NONE + self.filename_conditions = self.directory_conditions = None + filename_fmt = config("filename") try: if filename_fmt is None: @@ -41,7 +43,6 @@ class PathFormat(): formatter.parse(fmt, kwdefault).format_map) for expr, fmt in filename_fmt.items() if expr ] - self.build_filename = self.build_filename_conditional filename_fmt = filename_fmt.get("", extractor.filename_fmt) self.filename_formatter = formatter.parse( @@ -50,7 +51,6 @@ class PathFormat(): raise exception.FilenameFormatError(exc) directory_fmt = config("directory") - self.directory_conditions = () try: if directory_fmt is None: directory_fmt = extractor.directory_fmt @@ -62,7 +62,6 @@ class PathFormat(): ]) for expr, fmts in directory_fmt.items() if expr ] - self.build_directory = self.build_directory_conditional directory_fmt = directory_fmt.get("", extractor.directory_fmt) self.directory_formatters = [ @@ -160,8 +159,12 @@ class PathFormat(): def exists(self): """Return True if the file exists on disk""" - if self.extension and os.path.exists(self.realpath): - return self.check_file() + if self.extension: + try: + os.lstat(self.realpath) # raises OSError if file doesn't exist + return self.check_file() + except OSError: + pass return False def check_file(self): @@ -174,7 +177,7 @@ class PathFormat(): prefix = format(num) + "." 
self.kwdict["extension"] = prefix + self.extension self.build_path() - os.stat(self.realpath) # raises OSError if file doesn't exist + os.lstat(self.realpath) # raises OSError if file doesn't exist num += 1 except OSError: pass @@ -252,55 +255,47 @@ class PathFormat(): def build_filename(self, kwdict): """Apply 'kwdict' to filename format string""" try: - return self.clean_path(self.clean_segment( - self.filename_formatter(kwdict))) - except Exception as exc: - raise exception.FilenameFormatError(exc) - - def build_filename_conditional(self, kwdict): - try: - for condition, fmt in self.filename_conditions: - if condition(kwdict): - break - else: + if self.filename_conditions is None: fmt = self.filename_formatter + else: + for condition, fmt in self.filename_conditions: + if condition(kwdict): + break + else: + fmt = self.filename_formatter return self.clean_path(self.clean_segment(fmt(kwdict))) except Exception as exc: raise exception.FilenameFormatError(exc) def build_directory(self, kwdict): """Apply 'kwdict' to directory format strings""" - segments = [] - strip = self.strip - try: - for fmt in self.directory_formatters: - segment = fmt(kwdict).strip() - if strip and segment not in {".", ".."}: - # remove trailing dots and spaces (#647) - segment = segment.rstrip(strip) - if segment: - segments.append(self.clean_segment(segment)) - return segments - except Exception as exc: - raise exception.DirectoryFormatError(exc) - - def build_directory_conditional(self, kwdict): - segments = [] - strip = self.strip - - try: - for condition, formatters in self.directory_conditions: - if condition(kwdict): - break - else: + if self.directory_conditions is None: formatters = self.directory_formatters + else: + for condition, formatters in self.directory_conditions: + if condition(kwdict): + break + else: + formatters = self.directory_formatters + + segments = [] + strip = self.strip for fmt in formatters: - segment = fmt(kwdict).strip() - if strip and segment != "..": - 
segment = segment.rstrip(strip) - if segment: - segments.append(self.clean_segment(segment)) + segment = fmt(kwdict) + if segment.__class__ is str: + segment = segment.strip() + if strip and segment not in {".", ".."}: + segment = segment.rstrip(strip) + if segment: + segments.append(self.clean_segment(segment)) + else: # assume list + for segment in segment: + segment = segment.strip() + if strip and segment not in {".", ".."}: + segment = segment.rstrip(strip) + if segment: + segments.append(self.clean_segment(segment)) return segments except Exception as exc: raise exception.DirectoryFormatError(exc) @@ -321,7 +316,15 @@ class PathFormat(): self.kwdict["extension"] = self.prefix + self.extension_map( "part", "part") self.build_path() - if part_directory: + + if part_directory is not None: + if isinstance(part_directory, list): + for condition, part_directory in part_directory: + if condition(self.kwdict): + break + else: + return + self.temppath = os.path.join( part_directory, os.path.basename(self.temppath), diff --git a/gallery_dl/postprocessor/__init__.py b/gallery_dl/postprocessor/__init__.py index 1a4ce56..6da0330 100644 --- a/gallery_dl/postprocessor/__init__.py +++ b/gallery_dl/postprocessor/__init__.py @@ -33,7 +33,7 @@ def find(name): cls = None if name in modules: # prevent unwanted imports try: - module = __import__(name, globals(), None, (), 1) + module = __import__(name, globals(), None, None, 1) except ImportError: pass else: diff --git a/gallery_dl/postprocessor/exec.py b/gallery_dl/postprocessor/exec.py index 3b0ab22..9e2e4df 100644 --- a/gallery_dl/postprocessor/exec.py +++ b/gallery_dl/postprocessor/exec.py @@ -22,6 +22,10 @@ else: from shlex import quote +def trim(args): + return (args.partition(" ") if isinstance(args, str) else args)[0] + + class ExecPP(PostProcessor): def __init__(self, job, options): @@ -35,6 +39,7 @@ class ExecPP(PostProcessor): if options.get("async", False): self._exec = self._popen + self.verbose = 
options.get("verbose", True) self.session = False self.creationflags = 0 if options.get("session"): @@ -115,11 +120,11 @@ class ExecPP(PostProcessor): def _exec(self, args, shell): if retcode := self._popen(args, shell).wait(): self.log.warning("'%s' returned with non-zero exit status (%d)", - args, retcode) + args if self.verbose else trim(args), retcode) return retcode def _popen(self, args, shell): - self.log.debug("Running '%s'", args) + self.log.debug("Running '%s'", args if self.verbose else trim(args)) return util.Popen( args, shell=shell, diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py index 90e6e3d..0017b5b 100644 --- a/gallery_dl/postprocessor/metadata.py +++ b/gallery_dl/postprocessor/metadata.py @@ -117,9 +117,15 @@ class MetadataPP(PostProcessor): self.mtime = options.get("mtime") self.omode = options.get("open", omode) self.encoding = options.get("encoding", "utf-8") + self.newline = options.get("newline") self.skip = options.get("skip", False) self.meta_path = options.get("metadata-path") + def open(self, path): + return open(path, self.omode, + encoding=self.encoding, + newline=self.newline) + def run(self, pathfmt): archive = self.archive if archive and archive.check(pathfmt.kwdict): @@ -138,11 +144,11 @@ class MetadataPP(PostProcessor): return try: - with open(path, self.omode, encoding=self.encoding) as fp: + with self.open(path) as fp: self.write(fp, pathfmt.kwdict) except FileNotFoundError: os.makedirs(directory, exist_ok=True) - with open(path, self.omode, encoding=self.encoding) as fp: + with self.open(path) as fp: self.write(fp, pathfmt.kwdict) if archive: diff --git a/gallery_dl/postprocessor/mtime.py b/gallery_dl/postprocessor/mtime.py index b1269dd..7d4796e 100644 --- a/gallery_dl/postprocessor/mtime.py +++ b/gallery_dl/postprocessor/mtime.py @@ -9,8 +9,7 @@ """Use metadata as file modification time""" from .common import PostProcessor -from .. 
import text, util, formatter -from datetime import datetime +from .. import text, util, dt, formatter class MtimePP(PostProcessor): @@ -36,8 +35,8 @@ class MtimePP(PostProcessor): return pathfmt.kwdict["_mtime_meta"] = ( - util.datetime_to_timestamp(mtime) - if isinstance(mtime, datetime) else + dt.to_ts(mtime) + if isinstance(mtime, dt.datetime) else text.parse_int(mtime) ) diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py index 1a55e22..3813fae 100644 --- a/gallery_dl/postprocessor/ugoira.py +++ b/gallery_dl/postprocessor/ugoira.py @@ -151,7 +151,7 @@ class UgoiraPP(PostProcessor): "%s: Unable to extract frames from %s (%s: %s)", pathfmt.kwdict.get("id"), pathfmt.filename, exc.__class__.__name__, exc) - return self.log.debug("", exc_info=exc) + return self.log.traceback(exc) if self.convert(pathfmt, tempdir): if self.delete: @@ -227,12 +227,12 @@ class UgoiraPP(PostProcessor): output.stderr_write("\n") self.log.error("Unable to invoke FFmpeg (%s: %s)", exc.__class__.__name__, exc) - self.log.debug("", exc_info=exc) + self.log.traceback(exc) pathfmt.realpath = pathfmt.temppath except Exception as exc: output.stderr_write("\n") self.log.error("%s: %s", exc.__class__.__name__, exc) - self.log.debug("", exc_info=exc) + self.log.traceback(exc) pathfmt.realpath = pathfmt.temppath else: if self.mtime: diff --git a/gallery_dl/text.py b/gallery_dl/text.py index 98bba48..5b074d9 100644 --- a/gallery_dl/text.py +++ b/gallery_dl/text.py @@ -8,10 +8,7 @@ """Collection of functions that work on strings/text""" -import sys import html -import time -import datetime import urllib.parse import re as re_module @@ -113,9 +110,27 @@ def nameext_from_url(url, data=None): filename = unquote(filename_from_url(url)) name, _, ext = filename.rpartition(".") if name and len(ext) <= 16: - data["filename"], data["extension"] = name, ext.lower() + data["filename"] = name + data["extension"] = ext.lower() else: - data["filename"], data["extension"] = filename, 
"" + data["filename"] = filename + data["extension"] = "" + + return data + + +def nameext_from_name(filename, data=None): + """Extract the last part of an URL and fill 'data' accordingly""" + if data is None: + data = {} + + name, _, ext = filename.rpartition(".") + if name and len(ext) <= 16: + data["filename"] = name + data["extension"] = ext.lower() + else: + data["filename"] = filename + data["extension"] = "" return data @@ -322,46 +337,6 @@ def build_query(params): ]) -if sys.hexversion < 0x30c0000: - # Python <= 3.11 - def parse_timestamp(ts, default=None): - """Create a datetime object from a Unix timestamp""" - try: - return datetime.datetime.utcfromtimestamp(int(ts)) - except Exception: - return default -else: - # Python >= 3.12 - def parse_timestamp(ts, default=None): - """Create a datetime object from a Unix timestamp""" - try: - Y, m, d, H, M, S, _, _, _ = time.gmtime(int(ts)) - return datetime.datetime(Y, m, d, H, M, S) - except Exception: - return default - - -def parse_datetime(date_string, format="%Y-%m-%dT%H:%M:%S%z", utcoffset=0): - """Create a datetime object by parsing 'date_string'""" - try: - d = datetime.datetime.strptime(date_string, format) - o = d.utcoffset() - if o is not None: - # convert to naive UTC - d = d.replace(tzinfo=None, microsecond=0) - o - else: - if d.microsecond: - d = d.replace(microsecond=0) - if utcoffset: - # apply manual UTC offset - d += datetime.timedelta(0, utcoffset * -3600) - return d - except (TypeError, IndexError, KeyError): - return None - except (ValueError, OverflowError): - return date_string - - urljoin = urllib.parse.urljoin quote = urllib.parse.quote diff --git a/gallery_dl/update.py b/gallery_dl/update.py index 273ca18..e51a4b3 100644 --- a/gallery_dl/update.py +++ b/gallery_dl/update.py @@ -212,5 +212,5 @@ class UpdateExtractor(Extractor): url = (f"{self.root}/{path_repo}/releases/download" f"/{data['tag_name']}/{binary_name}") - yield Message.Directory, data + yield Message.Directory, "", data yield 
Message.Url, url, data diff --git a/gallery_dl/util.py b/gallery_dl/util.py index 49c1ba8..7d54d4c 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -16,7 +16,6 @@ import random import getpass import hashlib import binascii -import datetime import functools import itertools import subprocess @@ -24,7 +23,7 @@ import collections import urllib.parse from http.cookiejar import Cookie from email.utils import mktime_tz, parsedate_tz -from . import text, version, exception +from . import text, dt, version, exception def bencode(num, alphabet="0123456789"): @@ -228,63 +227,6 @@ def to_string(value): return str(value) -def to_datetime(value): - """Convert 'value' to a datetime object""" - if not value: - return EPOCH - - if isinstance(value, datetime.datetime): - return value - - if isinstance(value, str): - try: - if value[-1] == "Z": - # compat for Python < 3.11 - value = value[:-1] - dt = datetime.datetime.fromisoformat(value) - if dt.tzinfo is None: - if dt.microsecond: - dt = dt.replace(microsecond=0) - else: - # convert to naive UTC - dt = dt.astimezone(datetime.timezone.utc).replace( - microsecond=0, tzinfo=None) - return dt - except Exception: - pass - - return text.parse_timestamp(value, EPOCH) - - -def datetime_to_timestamp(dt): - """Convert naive UTC datetime to Unix timestamp""" - return (dt - EPOCH) / SECOND - - -def datetime_to_timestamp_string(dt): - """Convert naive UTC datetime to Unix timestamp string""" - try: - return str((dt - EPOCH) // SECOND) - except Exception: - return "" - - -if sys.hexversion < 0x30c0000: - # Python <= 3.11 - datetime_utcfromtimestamp = datetime.datetime.utcfromtimestamp - datetime_utcnow = datetime.datetime.utcnow - datetime_from_timestamp = datetime_utcfromtimestamp -else: - # Python >= 3.12 - def datetime_from_timestamp(ts=None): - """Convert Unix timestamp to naive UTC datetime""" - Y, m, d, H, M, S, _, _, _ = time.gmtime(ts) - return datetime.datetime(Y, m, d, H, M, S) - - datetime_utcfromtimestamp = 
datetime_from_timestamp - datetime_utcnow = datetime_from_timestamp - - def json_default(obj): if isinstance(obj, CustomNone): return None @@ -379,7 +321,7 @@ def extract_headers(response): text.nameext_from_url(name, data) if hlm := headers.get("last-modified"): - data["date"] = datetime.datetime(*parsedate_tz(hlm)[:6]) + data["date"] = dt.datetime(*parsedate_tz(hlm)[:6]) return data @@ -751,11 +693,11 @@ class Flags(): # 735506 == 739342 - 137 * 28 # v135.0 release of Chrome on 2025-04-01 has ordinal 739342 # 735562 == 739342 - 135 * 28 -# _ord_today = datetime.date.today().toordinal() +# _ord_today = dt.date.today().toordinal() # _ff_ver = (_ord_today - 735506) // 28 # _ch_ver = (_ord_today - 735562) // 28 -_ff_ver = (datetime.date.today().toordinal() - 735506) // 28 +_ff_ver = (dt.date.today().toordinal() - 735506) // 28 # _ch_ver = _ff_ver - 2 re = text.re @@ -763,8 +705,6 @@ re_compile = text.re_compile NONE = CustomNone() FLAGS = Flags() -EPOCH = datetime.datetime(1970, 1, 1) -SECOND = datetime.timedelta(0, 1) WINDOWS = (os.name == "nt") SENTINEL = object() EXECUTABLE = getattr(sys, "frozen", False) @@ -786,8 +726,8 @@ GLOBALS = { "contains" : contains, "parse_int": text.parse_int, "urlsplit" : urllib.parse.urlsplit, - "datetime" : datetime.datetime, - "timedelta": datetime.timedelta, + "datetime" : dt.datetime, + "timedelta": dt.timedelta, "abort" : raises(exception.StopExtraction), "error" : raises(exception.AbortExtraction), "terminate": raises(exception.TerminateExtraction), @@ -1071,6 +1011,8 @@ class RangePredicate(): if isinstance(rangespec, str): rangespec = rangespec.split(",") + elif isinstance(rangespec, int): + rangespec = (str(rangespec),) for group in rangespec: if not group: diff --git a/gallery_dl/version.py b/gallery_dl/version.py index bc70f74..0dcb01a 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,5 +6,5 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. 
-__version__ = "1.30.10" +__version__ = "1.31.1" __variant__ = None diff --git a/gallery_dl/ytdl.py b/gallery_dl/ytdl.py index b7ee1ca..a4d8097 100644 --- a/gallery_dl/ytdl.py +++ b/gallery_dl/ytdl.py @@ -55,6 +55,8 @@ def construct_YoutubeDL(module, obj, user_opts, system_opts=None): opts["min_filesize"] = text.parse_bytes(config("filesize-min"), None) if opts.get("max_filesize") is None: opts["max_filesize"] = text.parse_bytes(config("filesize-max"), None) + if opts.get("overwrites") is None and not config("skip", True): + opts["overwrites"] = True if opts.get("ratelimit") is None: if rate := config("rate"): func = util.build_selection_func(rate, 0, text.parse_bytes) @@ -262,7 +264,7 @@ def parse_command_line(module, argv): else module.match_filter_func(opts.match_filter)) if cookiesfrombrowser := getattr(opts, "cookiesfrombrowser", None): - pattern = util.re(r"""(?x) + pattern = text.re(r"""(?x) (?P<name>[^+:]+) (?:\s*\+\s*(?P<keyring>[^:]+))? (?:\s*:\s*(?!:)(?P<profile>.+?))? @@ -528,7 +530,7 @@ def legacy_postprocessors(opts, module, ytdlp, compat_opts): if len(dur) == 2 and all(t is not None for t in dur): remove_ranges.append(tuple(dur)) continue - remove_chapters_patterns.append(util.re(regex)) + remove_chapters_patterns.append(text.re(regex)) if opts.remove_chapters or sponsorblock_query: postprocessors.append({ "key": "ModifyChapters", |
