Diffstat (limited to 'gallery_dl/extractor')
35 files changed, 790 insertions, 290 deletions
diff --git a/gallery_dl/extractor/4archive.py b/gallery_dl/extractor/4archive.py index d198369..948a605 100644 --- a/gallery_dl/extractor/4archive.py +++ b/gallery_dl/extractor/4archive.py @@ -64,7 +64,7 @@ class _4archiveThreadExtractor(Extractor): data = { "name": extr('class="name">', "</span>"), "date": text.parse_datetime( - extr('class="dateTime postNum" >', "<").strip(), + extr('class="dateTime postNum">', "<").strip(), "%Y-%m-%d %H:%M:%S"), "no" : text.parse_int(extr('href="#p', '"')), } diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py index fc16f43..a4b0997 100644 --- a/gallery_dl/extractor/8chan.py +++ b/gallery_dl/extractor/8chan.py @@ -26,6 +26,9 @@ class _8chanExtractor(Extractor): self.root = "https://8chan." + match.group(1) Extractor.__init__(self, match) + def _init(self): + self.cookies.set("TOS", "1", domain=self.root.rpartition("/")[2]) + @memcache() def cookies_prepare(self): # fetch captcha cookies diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py index 49fde7b..ce1a78d 100644 --- a/gallery_dl/extractor/artstation.py +++ b/gallery_dl/extractor/artstation.py @@ -22,6 +22,7 @@ class ArtstationExtractor(Extractor): directory_fmt = ("{category}", "{userinfo[username]}") archive_fmt = "{asset[id]}" browser = "firefox" + tls12 = False root = "https://www.artstation.com" def __init__(self, match): diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py index 84c3187..c97bf65 100644 --- a/gallery_dl/extractor/bluesky.py +++ b/gallery_dl/extractor/bluesky.py @@ -317,7 +317,7 @@ class BlueskyAPI(): def get_author_feed(self, actor, filter="posts_and_author_threads"): endpoint = "app.bsky.feed.getAuthorFeed" params = { - "actor" : self._did_from_actor(actor), + "actor" : self._did_from_actor(actor, True), "filter": filter, "limit" : "100", } @@ -327,7 +327,7 @@ class BlueskyAPI(): endpoint = "app.bsky.feed.getFeed" params = { "feed" : "at://{}/app.bsky.feed.generator/{}".format( - self._did_from_actor(actor, False), feed), + self._did_from_actor(actor), feed), "limit": "100", } return self._pagination(endpoint, params) @@ -344,7 +344,7 @@ class BlueskyAPI(): endpoint = "app.bsky.feed.getListFeed" params = { "list" : "at://{}/app.bsky.graph.list/{}".format( - self._did_from_actor(actor, False), list), + self._did_from_actor(actor), list), "limit": "100", } return self._pagination(endpoint, params) @@ -391,7 +391,7 @@ class BlueskyAPI(): } return self._pagination(endpoint, params, "posts") - def _did_from_actor(self, actor, user_did=True): + def _did_from_actor(self, actor, user_did=False): if actor.startswith("did:"): did = actor else: diff --git a/gallery_dl/extractor/cien.py b/gallery_dl/extractor/cien.py new file mode 100644 index 0000000..a9ccab5 --- /dev/null +++ b/gallery_dl/extractor/cien.py @@ -0,0 +1,86 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://ci-en.net/""" + +from .common import Extractor, Message +from .. 
import text, util + +BASE_PATTERN = r"(?:https?://)?ci-en\.(?:net|dlsite\.com)" + + +class CienExtractor(Extractor): + category = "cien" + root = "https://ci-en.net" + + def __init__(self, match): + self.root = text.root_from_url(match.group(0)) + Extractor.__init__(self, match) + + def _pagination_articles(self, url, params): + data = {"extractor": CienArticleExtractor} + params["page"] = text.parse_int(params.get("page"), 1) + + while True: + page = self.request(url, params=params).text + + for card in text.extract_iter( + page, ' class="c-cardCase-item', '</div>'): + article_url = text.extr(card, ' href="', '"') + yield Message.Queue, article_url, data + + if ' rel="next"' not in page: + return + params["page"] += 1 + + +class CienArticleExtractor(CienExtractor): + subcategory = "article" + pattern = BASE_PATTERN + r"/creator/(\d+)/article/(\d+)" + example = "https://ci-en.net/creator/123/article/12345" + + def items(self): + url = "{}/creator/{}/article/{}".format( + self.root, self.groups[0], self.groups[1]) + page = self.request(url, notfound="article").text + return + yield 1 + + +class CienCreatorExtractor(CienExtractor): + subcategory = "creator" + pattern = BASE_PATTERN + r"/creator/(\d+)(?:/article(?:\?([^#]+))?)?/?$" + example = "https://ci-en.net/creator/123" + + def items(self): + url = "{}/creator/{}/article".format(self.root, self.groups[0]) + params = text.parse_query(self.groups[1]) + params["mode"] = "list" + return self._pagination_articles(url, params) + + +class CienRecentExtractor(CienExtractor): + subcategory = "recent" + pattern = BASE_PATTERN + r"/mypage/recent(?:\?([^#]+))?" + example = "https://ci-en.net/mypage/recent" + + def items(self): + url = self.root + "/mypage/recent" + params = text.parse_query(self.groups[0]) + return self._pagination_articles(url, params) + + +class CienFollowingExtractor(CienExtractor): + subcategory = "following" + pattern = BASE_PATTERN + r"/mypage/subscription(/following)?" + example = "https://ci-en.net/mypage/subscription" + + def items(self): + url = self.root + "/mypage/recent" + params = text.parse_query(self.groups[0]) + return self._pagination_articles(url, params) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index d14e13a..8771261 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -14,6 +14,7 @@ import ssl import time import netrc import queue +import getpass import logging import datetime import requests @@ -21,6 +22,7 @@ import threading from requests.adapters import HTTPAdapter from .message import Message from .. 
import config, text, util, cache, exception +urllib3 = requests.packages.urllib3 class Extractor(): @@ -45,6 +47,8 @@ class Extractor(): def __init__(self, match): self.log = logging.getLogger(self.category) self.url = match.string + self.match = match + self.groups = match.groups() self._cfgpath = ("extractor", self.category, self.subcategory) self._parentdir = "" @@ -168,22 +172,25 @@ class Extractor(): requests.exceptions.ChunkedEncodingError, requests.exceptions.ContentDecodingError) as exc: msg = exc + code = 0 except (requests.exceptions.RequestException) as exc: raise exception.HttpError(exc) else: code = response.status_code if self._write_pages: self._dump_response(response) - if 200 <= code < 400 or fatal is None and \ - (400 <= code < 500) or not fatal and \ - (400 <= code < 429 or 431 <= code < 500): + if ( + code < 400 or + code < 500 and (not fatal and code != 429 or fatal is None) + ): if encoding: response.encoding = encoding return response if notfound and code == 404: raise exception.NotFoundError(notfound) - msg = "'{} {}' for '{}'".format(code, response.reason, url) + msg = "'{} {}' for '{}'".format( + code, response.reason, response.url) server = response.headers.get("Server") if server and server.startswith("cloudflare") and \ code in (403, 503): @@ -194,7 +201,10 @@ class Extractor(): if b'name="captcha-bypass"' in content: self.log.warning("Cloudflare CAPTCHA") break - if code not in retry_codes and code < 500: + + if code == 429 and self._interval_429: + pass + elif code not in retry_codes and code < 500: break finally: @@ -204,20 +214,24 @@ class Extractor(): if tries > retries: break + seconds = tries if self._interval: - seconds = self._interval() - if seconds < tries: - seconds = tries + s = self._interval() + if seconds < s: + seconds = s + if code == 429 and self._interval_429: + s = self._interval_429() + if seconds < s: + seconds = s + self.wait(seconds=seconds, reason="429 Too Many Requests") else: - seconds = tries - - self.sleep(seconds, "retry") + self.sleep(seconds, "retry") tries += 1 raise exception.HttpError(msg, response) def wait(self, seconds=None, until=None, adjust=1.0, - reason="rate limit reset"): + reason="rate limit"): now = time.time() if seconds: @@ -240,7 +254,7 @@ class Extractor(): if reason: t = datetime.datetime.fromtimestamp(until).time() isotime = "{:02}:{:02}:{:02}".format(t.hour, t.minute, t.second) - self.log.info("Waiting until %s for %s.", isotime, reason) + self.log.info("Waiting until %s (%s)", isotime, reason) time.sleep(seconds) def sleep(self, seconds, reason): @@ -248,6 +262,15 @@ class Extractor(): seconds, reason) time.sleep(seconds) + def input(self, prompt, echo=True): + if echo: + try: + return input(prompt) + except (EOFError, OSError): + return None + else: + return getpass.getpass(prompt) + def _get_auth_info(self): """Return authentication information as (username, password) tuple""" username = self.config("username") @@ -280,6 +303,9 @@ class Extractor(): self.config("sleep-request", self.request_interval), self.request_interval_min, ) + self._interval_429 = util.build_duration_func( + self.config("sleep-429", 60), + ) if self._retries < 0: self._retries = float("inf") @@ -439,9 +465,11 @@ class Extractor(): if not path: return + path_tmp = path + ".tmp" try: - with open(path, "w") as fp: + with open(path_tmp, "w") as fp: util.cookiestxt_store(fp, self.cookies) + os.replace(path_tmp, path) except OSError as exc: self.log.warning("cookies: %s", exc) @@ -599,7 +627,7 @@ class GalleryExtractor(Extractor): def 
__init__(self, match, url=None): Extractor.__init__(self, match) - self.gallery_url = self.root + match.group(1) if url is None else url + self.gallery_url = self.root + self.groups[0] if url is None else url def items(self): self.login() @@ -674,7 +702,7 @@ class MangaExtractor(Extractor): def __init__(self, match, url=None): Extractor.__init__(self, match) - self.manga_url = url or self.root + match.group(1) + self.manga_url = self.root + self.groups[0] if url is None else url if self.config("chapter-reverse", False): self.reverse = not self.reverse @@ -736,17 +764,18 @@ class BaseExtractor(Extractor): instances = () def __init__(self, match): - if not self.category: - self._init_category(match) Extractor.__init__(self, match) + if not self.category: + self._init_category() + self._cfgpath = ("extractor", self.category, self.subcategory) - def _init_category(self, match): - for index, group in enumerate(match.groups()): + def _init_category(self): + for index, group in enumerate(self.groups): if group is not None: if index: self.category, self.root, info = self.instances[index-1] if not self.root: - self.root = text.root_from_url(match.group(0)) + self.root = text.root_from_url(self.match.group(0)) self.config_instance = info.get else: self.root = group @@ -806,12 +835,12 @@ def _build_requests_adapter(ssl_options, ssl_ciphers, source_address): pass if ssl_options or ssl_ciphers: - ssl_context = ssl.create_default_context() - if ssl_options: - ssl_context.options |= ssl_options - if ssl_ciphers: - ssl_context.set_ecdh_curve("prime256v1") - ssl_context.set_ciphers(ssl_ciphers) + ssl_context = urllib3.connection.create_urllib3_context( + options=ssl_options or None, ciphers=ssl_ciphers) + if requests.__version__ > "2.31": + # https://github.com/psf/requests/pull/6731 + ssl_context.load_default_certs() + ssl_context.check_hostname = False else: ssl_context = None @@ -931,8 +960,6 @@ SSL_CIPHERS = { } -urllib3 = requests.packages.urllib3 - # detect brotli support try: BROTLI = urllib3.response.brotli is not None diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index ca8acaa..993885a 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -1457,9 +1457,8 @@ class DeviantartOAuthAPI(): self.log.info( "Register your own OAuth application and use its " "credentials to prevent this error: " - "https://github.com/mikf/gallery-dl/blob/master/do" - "cs/configuration.rst#extractordeviantartclient-id" - "--client-secret") + "https://gdl-org.github.io/docs/configuration.html" + "#extractor-deviantart-client-id-client-secret") else: if log: self.log.error(msg) diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index acad95c..1805403 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -50,7 +50,7 @@ class ExhentaiExtractor(Extractor): def request(self, url, **kwargs): response = Extractor.request(self, url, **kwargs) - if response.history and response.headers.get("Content-Length") == "0": + if "Cache-Control" not in response.headers and not response.content: self.log.info("blank page") raise exception.AuthorizationError() return response @@ -95,7 +95,11 @@ class ExhentaiExtractor(Extractor): self.cookies.clear() response = self.request(url, method="POST", headers=headers, data=data) - if b"You are now logged in as:" not in response.content: + content = response.content + if b"You are now logged in as:" not in content: + if b"The captcha was not entered correctly" in 
content: + raise exception.AuthenticationError( + "CAPTCHA required. Use cookies instead.") raise exception.AuthenticationError() # collect more cookies @@ -437,7 +441,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): raise exception.AuthorizationError() if page.startswith(("Key missing", "Gallery not found")): raise exception.NotFoundError("gallery") - if "hentai.org/mpv/" in page: + if page.count("hentai.org/mpv/") > 1: self.log.warning("Enabled Multi-Page Viewer is not supported") return page diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index 715abcb..85dd896 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -117,8 +117,8 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor): def __init__(self, match): FoolfuukaExtractor.__init__(self, match) - self.board = match.group(match.lastindex-1) - self.thread = match.group(match.lastindex) + self.board = self.groups[-2] + self.thread = self.groups[-1] self.data = None def metadata(self): @@ -140,20 +140,22 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor): class FoolfuukaBoardExtractor(FoolfuukaExtractor): """Base extractor for FoolFuuka based boards/archives""" subcategory = "board" - pattern = BASE_PATTERN + r"/([^/?#]+)/\d*$" + pattern = BASE_PATTERN + r"/([^/?#]+)(?:/(?:page/)?(\d*))?$" example = "https://archived.moe/a/" def __init__(self, match): FoolfuukaExtractor.__init__(self, match) - self.board = match.group(match.lastindex) + self.board = self.groups[-2] + self.page = self.groups[-1] def items(self): index_base = "{}/_/api/chan/index/?board={}&page=".format( self.root, self.board) thread_base = "{}/{}/thread/".format(self.root, self.board) - for page in itertools.count(1): - with self.request(index_base + format(page)) as response: + page = self.page + for pnum in itertools.count(text.parse_int(page, 1)): + with self.request(index_base + format(pnum)) as response: try: threads = response.json() except ValueError: @@ -167,6 +169,9 @@ class FoolfuukaBoardExtractor(FoolfuukaExtractor): thread["_extractor"] = FoolfuukaThreadExtractor yield Message.Queue, thread["url"], thread + if page: + return + class FoolfuukaSearchExtractor(FoolfuukaExtractor): """Base extractor for search results on FoolFuuka based boards/archives""" @@ -179,17 +184,16 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor): def __init__(self, match): FoolfuukaExtractor.__init__(self, match) self.params = params = {} - args = match.group(match.lastindex).split("/") - key = None - for arg in args: + key = None + for arg in self.groups[-1].split("/"): if key: params[key] = text.unescape(arg) key = None else: key = arg - board = match.group(match.lastindex-1) + board = self.groups[-2] if board != "_": params["boards"] = board diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py index 56721d0..6040187 100644 --- a/gallery_dl/extractor/furaffinity.py +++ b/gallery_dl/extractor/furaffinity.py @@ -11,7 +11,7 @@ from .common import Extractor, Message from .. 
import text, util -BASE_PATTERN = r"(?:https?://)?(?:www\.|sfw\.)?furaffinity\.net" +BASE_PATTERN = r"(?:https?://)?(?:www\.|sfw\.)?(?:f[ux]|f?xfu)raffinity\.net" class FuraffinityExtractor(Extractor): diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index 2459a61..37c776e 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -51,19 +51,44 @@ class GelbooruBase(): params["pid"] = self.page_start params["limit"] = self.per_page limit = self.per_page // 2 + pid = False + + if "tags" in params: + tags = params["tags"].split() + op = "<" + id = False + + for tag in tags: + if tag.startswith("sort:"): + if tag == "sort:id:asc": + op = ">" + elif tag == "sort:id" or tag.startswith("sort:id:"): + op = "<" + else: + pid = True + elif tag.startswith("id:"): + id = True + + if not pid: + if id: + tag = "id:" + op + tags = [t for t in tags if not t.startswith(tag)] + tags = "{} id:{}".format(" ".join(tags), op) while True: posts = self._api_request(params) - for post in posts: - yield post + yield from posts if len(posts) < limit: return - if "pid" in params: - del params["pid"] - params["tags"] = "{} id:<{}".format(self.tags, post["id"]) + if pid: + params["pid"] += 1 + else: + if "pid" in params: + del params["pid"] + params["tags"] = tags + str(posts[-1]["id"]) def _pagination_html(self, params): url = self.root + "/index.php" diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index 7ab6d02..8d8b8ad 100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -25,7 +25,7 @@ class GelbooruV02Extractor(booru.BooruExtractor): self.api_root = self.config_instance("api_root") or self.root if self.category == "realbooru": - self._file_url = self._file_url_realbooru + self.items = self._items_realbooru self._tags = self._tags_realbooru def _api_request(self, params): @@ -124,6 +124,35 @@ class GelbooruV02Extractor(booru.BooruExtractor): self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2]) return url + def _items_realbooru(self): + from .common import Message + data = self.metadata() + + for post in self.posts(): + try: + html = self._html(post) + fallback = post["file_url"] + url = post["file_url"] = text.rextract( + html, 'href="', '"', html.index(">Original<"))[0] + except Exception: + self.log.debug("Unable to fetch download URL for post %s " + "(md5: %s)", post.get("id"), post.get("md5")) + continue + + text.nameext_from_url(url, post) + post.update(data) + self._prepare(post) + self._tags(post, html) + + path = url.rpartition("/")[0] + post["_fallback"] = ( + "{}/{}.{}".format(path, post["md5"], post["extension"]), + fallback, + ) + + yield Message.Directory, post + yield Message.Url, url, post + def _tags_realbooru(self, post, page): tag_container = text.extr(page, 'id="tagLink"', '</div>') tags = collections.defaultdict(list) diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py index aadce6c..4a9759f 100644 --- a/gallery_dl/extractor/hiperdex.py +++ b/gallery_dl/extractor/hiperdex.py @@ -6,7 +6,7 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://hiperdex.com/""" +"""Extractors for https://hiperdex.top/""" from .common import ChapterExtractor, MangaExtractor from .. import text @@ -14,18 +14,18 @@ from ..cache import memcache import re BASE_PATTERN = (r"((?:https?://)?(?:www\.)?" 
- r"(?:1st)?hiperdex\d?\.(?:com|net|info))") + r"(?:1st)?hiperdex\d?\.(?:com|net|info|top))") class HiperdexBase(): """Base class for hiperdex extractors""" category = "hiperdex" - root = "https://hiperdex.com" + root = "https://hiperdex.top" @memcache(keyarg=1) def manga_data(self, manga, page=None): if not page: - url = "{}/mangas/{}/".format(self.root, manga) + url = "{}/manga/{}/".format(self.root, manga) page = self.request(url).text extr = text.extract_from(page) @@ -67,9 +67,9 @@ class HiperdexBase(): class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor): - """Extractor for manga chapters from hiperdex.com""" + """Extractor for hiperdex manga chapters""" pattern = BASE_PATTERN + r"(/mangas?/([^/?#]+)/([^/?#]+))" - example = "https://hiperdex.com/mangas/MANGA/CHAPTER/" + example = "https://hiperdex.top/manga/MANGA/CHAPTER/" def __init__(self, match): root, path, self.manga, self.chapter = match.groups() @@ -88,10 +88,10 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor): class HiperdexMangaExtractor(HiperdexBase, MangaExtractor): - """Extractor for manga from hiperdex.com""" + """Extractor for hiperdex manga""" chapterclass = HiperdexChapterExtractor pattern = BASE_PATTERN + r"(/mangas?/([^/?#]+))/?$" - example = "https://hiperdex.com/mangas/MANGA/" + example = "https://hiperdex.top/manga/MANGA/" def __init__(self, match): root, path, self.manga = match.groups() @@ -121,13 +121,13 @@ class HiperdexMangaExtractor(HiperdexBase, MangaExtractor): class HiperdexArtistExtractor(HiperdexBase, MangaExtractor): - """Extractor for an artists's manga on hiperdex.com""" + """Extractor for an artists's manga on hiperdex""" subcategory = "artist" categorytransfer = False chapterclass = HiperdexMangaExtractor reverse = False pattern = BASE_PATTERN + r"(/manga-a(?:rtist|uthor)/(?:[^/?#]+))" - example = "https://hiperdex.com/manga-artist/NAME/" + example = "https://hiperdex.top/manga-artist/NAME/" def __init__(self, match): self.root = text.ensure_http_scheme(match.group(1)) diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py index 6d3184d..a2b51be 100644 --- a/gallery_dl/extractor/hotleak.py +++ b/gallery_dl/extractor/hotleak.py @@ -23,6 +23,7 @@ class HotleakExtractor(Extractor): def items(self): for post in self.posts(): + post["_http_expected_status"] = (404,) yield Message.Directory, post yield Message.Url, post["url"], post diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index 86b1edd..481fb1e 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -68,7 +68,7 @@ class ImgurImageExtractor(ImgurExtractor): filename_fmt = "{category}_{id}{title:?_//}.{extension}" archive_fmt = "{id}" pattern = (BASE_PATTERN + r"/(?!gallery|search)" - r"(?:r/\w+/)?(\w{7}|\w{5})[sbtmlh]?") + r"(?:r/\w+/)?(?:[^/?#]+-)?(\w{7}|\w{5})[sbtmlh]?") example = "https://imgur.com/abcdefg" def items(self): @@ -93,7 +93,7 @@ class ImgurAlbumExtractor(ImgurExtractor): directory_fmt = ("{category}", "{album[id]}{album[title]:? 
- //}") filename_fmt = "{category}_{album[id]}_{num:>03}_{id}.{extension}" archive_fmt = "{album[id]}_{id}" - pattern = BASE_PATTERN + r"/a/(\w{7}|\w{5})" + pattern = BASE_PATTERN + r"/a/(?:[^/?#]+-)?(\w{7}|\w{5})" example = "https://imgur.com/a/abcde" def items(self): @@ -126,7 +126,7 @@ class ImgurAlbumExtractor(ImgurExtractor): class ImgurGalleryExtractor(ImgurExtractor): """Extractor for imgur galleries""" subcategory = "gallery" - pattern = BASE_PATTERN + r"/(?:gallery|t/\w+)/(\w{7}|\w{5})" + pattern = BASE_PATTERN + r"/(?:gallery|t/\w+)/(?:[^/?#]+-)?(\w{7}|\w{5})" example = "https://imgur.com/gallery/abcde" def items(self): diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py index 62586af..2ae8cbe 100644 --- a/gallery_dl/extractor/inkbunny.py +++ b/gallery_dl/extractor/inkbunny.py @@ -330,15 +330,18 @@ class InkbunnyAPI(): def _call(self, endpoint, params): url = "https://inkbunny.net/api_" + endpoint + ".php" params["sid"] = self.session_id - data = self.extractor.request(url, params=params).json() - if "error_code" in data: + while True: + data = self.extractor.request(url, params=params).json() + + if "error_code" not in data: + return data + if str(data["error_code"]) == "2": self.authenticate(invalidate=True) - return self._call(endpoint, params) - raise exception.StopExtraction(data.get("error_message")) + continue - return data + raise exception.StopExtraction(data.get("error_message")) def _pagination_search(self, params): params["page"] = 1 diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 9c77b7a..b0c24de 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -57,7 +57,7 @@ class KemonopartyExtractor(Extractor): generators = self._build_file_generators(self.config("files")) duplicates = self.config("duplicates") comments = self.config("comments") - username = dms = None + username = dms = announcements = None # prevent files from being sent with gzip compression headers = {"Accept-Encoding": "identity"} @@ -68,6 +68,8 @@ class KemonopartyExtractor(Extractor): '<meta name="artist_name" content="', '"')[0]) if self.config("dms"): dms = True + if self.config("announcements"): + announcements = True posts = self.posts() max_posts = self.config("max-posts") @@ -80,7 +82,7 @@ class KemonopartyExtractor(Extractor): self.root, post["service"], post["user"], post["id"]) post["_http_headers"] = headers post["date"] = self._parse_datetime( - post["published"] or post["added"]) + post.get("published") or post.get("added") or "") if username: post["username"] = username @@ -88,8 +90,12 @@ class KemonopartyExtractor(Extractor): post["comments"] = self._extract_comments(post) if dms is not None: if dms is True: - dms = self._extract_dms(post) + dms = self._extract_cards(post, "dms") post["dms"] = dms + if announcements is not None: + if announcements is True: + announcements = self._extract_cards(post, "announcements") + post["announcements"] = announcements files = [] hashes = set() @@ -156,7 +162,7 @@ class KemonopartyExtractor(Extractor): def _file(self, post): file = post["file"] - if not file: + if not file or "path" not in file: return () file["type"] = "file" return (file,) @@ -200,21 +206,21 @@ class KemonopartyExtractor(Extractor): }) return comments - def _extract_dms(self, post): - url = "{}/{}/user/{}/dms".format( - self.root, post["service"], post["user"]) + def _extract_cards(self, post, type): + url = "{}/{}/user/{}/{}".format( + self.root, post["service"], 
post["user"], type) page = self.request(url).text - dms = [] - for dm in text.extract_iter(page, "<article", "</article>"): - footer = text.extr(dm, "<footer", "</footer>") - dms.append({ + cards = [] + for card in text.extract_iter(page, "<article", "</article>"): + footer = text.extr(card, "<footer", "</footer>") + cards.append({ "body": text.unescape(text.extr( - dm, "<pre>", "</pre></", + card, "<pre>", "</pre></", ).strip()), - "date": text.extr(footer, 'Published: ', '\n'), + "date": text.extr(footer, ': ', '\n'), }) - return dms + return cards def _parse_datetime(self, date_string): if len(date_string) > 19: @@ -494,7 +500,8 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor): def __init__(self, match): KemonopartyExtractor.__init__(self, match) - self.favorites = (text.parse_query(match.group(3)).get("type") or + self.params = text.parse_query(match.group(3)) + self.favorites = (self.params.get("type") or self.config("favorites") or "artist") @@ -502,9 +509,17 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor): self._prepare_ddosguard_cookies() self.login() + sort = self.params.get("sort") + order = self.params.get("order") or "desc" + if self.favorites == "artist": users = self.request( self.root + "/api/v1/account/favorites?type=artist").json() + + if not sort: + sort = "updated" + users.sort(key=lambda x: x[sort], reverse=(order == "desc")) + for user in users: user["_extractor"] = KemonopartyUserExtractor url = "{}/{}/user/{}".format( @@ -514,6 +529,11 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor): elif self.favorites == "post": posts = self.request( self.root + "/api/v1/account/favorites?type=post").json() + + if not sort: + sort = "faved_seq" + posts.sort(key=lambda x: x[sort], reverse=(order == "desc")) + for post in posts: post["_extractor"] = KemonopartyPostExtractor url = "{}/{}/user/{}/post/{}".format( diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index 030d7d1..cb7f701 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -29,6 +29,7 @@ class MastodonExtractor(BaseExtractor): self.instance = self.root.partition("://")[2] self.reblogs = self.config("reblogs", False) self.replies = self.config("replies", True) + self.cards = self.config("cards", False) def items(self): for status in self.statuses(): @@ -48,6 +49,17 @@ class MastodonExtractor(BaseExtractor): if status["reblog"]: attachments.extend(status["reblog"]["media_attachments"]) + if self.cards: + card = status.get("card") + if card: + url = card.get("image") + if url: + card["weburl"] = card.get("url") + card["url"] = url + card["id"] = "card" + "".join( + url.split("/")[6:-2]).lstrip("0") + attachments.append(card) + status["instance"] = self.instance acct = status["account"]["acct"] status["instance_remote"] = \ @@ -120,6 +132,7 @@ class MastodonUserExtractor(MastodonExtractor): api.account_id_by_username(self.item), only_media=( not self.reblogs and + not self.cards and not self.config("text-posts", False) ), exclude_replies=not self.replies, @@ -136,6 +149,36 @@ class MastodonBookmarkExtractor(MastodonExtractor): return MastodonAPI(self).account_bookmarks() +class MastodonFavoriteExtractor(MastodonExtractor): + """Extractor for mastodon favorites""" + subcategory = "favorite" + pattern = BASE_PATTERN + r"/favourites" + example = "https://mastodon.social/favourites" + + def statuses(self): + return MastodonAPI(self).account_favorites() + + +class MastodonListExtractor(MastodonExtractor): + """Extractor for 
mastodon lists""" + subcategory = "list" + pattern = BASE_PATTERN + r"/lists/(\w+)" + example = "https://mastodon.social/lists/12345" + + def statuses(self): + return MastodonAPI(self).timelines_list(self.item) + + +class MastodonHashtagExtractor(MastodonExtractor): + """Extractor for mastodon hashtags""" + subcategory = "hashtag" + pattern = BASE_PATTERN + r"/tags/(\w+)" + example = "https://mastodon.social/tags/NAME" + + def statuses(self): + return MastodonAPI(self).timelines_tag(self.item) + + class MastodonFollowingExtractor(MastodonExtractor): """Extractor for followed mastodon users""" subcategory = "following" @@ -205,37 +248,55 @@ class MastodonAPI(): raise exception.NotFoundError("account") def account_bookmarks(self): + """Statuses the user has bookmarked""" endpoint = "/v1/bookmarks" return self._pagination(endpoint, None) + def account_favorites(self): + """Statuses the user has favourited""" + endpoint = "/v1/favourites" + return self._pagination(endpoint, None) + def account_following(self, account_id): + """Accounts which the given account is following""" endpoint = "/v1/accounts/{}/following".format(account_id) return self._pagination(endpoint, None) def account_lookup(self, username): + """Quickly lookup a username to see if it is available""" endpoint = "/v1/accounts/lookup" params = {"acct": username} return self._call(endpoint, params).json() def account_search(self, query, limit=40): - """Search for accounts""" + """Search for matching accounts by username or display name""" endpoint = "/v1/accounts/search" params = {"q": query, "limit": limit} return self._call(endpoint, params).json() def account_statuses(self, account_id, only_media=True, exclude_replies=False): - """Fetch an account's statuses""" + """Statuses posted to the given account""" endpoint = "/v1/accounts/{}/statuses".format(account_id) - params = {"only_media" : "1" if only_media else "0", - "exclude_replies": "1" if exclude_replies else "0"} + params = {"only_media" : "true" if only_media else "false", + "exclude_replies": "true" if exclude_replies else "false"} return self._pagination(endpoint, params) def status(self, status_id): - """Fetch a status""" + """Obtain information about a status""" endpoint = "/v1/statuses/" + status_id return self._call(endpoint).json() + def timelines_list(self, list_id): + """View statuses in the given list timeline""" + endpoint = "/v1/timelines/list/" + list_id + return self._pagination(endpoint, None) + + def timelines_tag(self, hashtag): + """View public statuses containing the given hashtag""" + endpoint = "/v1/timelines/tag/" + hashtag + return self._pagination(endpoint, None) + def _call(self, endpoint, params=None): if endpoint.startswith("http"): url = endpoint diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 4cdcf87..7ac3a3a 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -102,30 +102,55 @@ class NewgroundsExtractor(Extractor): def _login_impl(self, username, password): self.log.info("Logging in as %s", username) - url = self.root + "/passport/" + url = self.root + "/passport" response = self.request(url) if response.history and response.url.endswith("/social"): return self.cookies page = response.text - headers = {"Origin": self.root, "Referer": url} + headers = { + "Accept": "application/json, text/javascript, */*; q=0.01", + "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", + "X-Requested-With": "XMLHttpRequest", + "Origin": self.root, + "Referer": url, 
+ } url = text.urljoin(self.root, text.extr(page, 'action="', '"')) data = { - "username": username, - "password": password, - "remember": "1", - "login" : "1", "auth" : text.extr(page, 'name="auth" value="', '"'), + "remember": "1", + "username": username, + "password": str(password), + "code" : "", + "codehint": "------", + "mfaCheck": "1", } - response = self.request(url, method="POST", headers=headers, data=data) - if not response.history: - raise exception.AuthenticationError() + while True: + response = self.request( + url, method="POST", headers=headers, data=data) + result = response.json() + + if result.get("success"): + break + if "errors" in result: + raise exception.AuthenticationError( + '"' + '", "'.join(result["errors"]) + '"') + + if result.get("requiresMfa"): + data["code"] = self.input("Verification Code: ") + data["codehint"] = " " + elif result.get("requiresEmailMfa"): + email = result.get("obfuscatedEmail") + prompt = "Email Verification Code ({}): ".format(email) + data["code"] = self.input(prompt) + data["codehint"] = " " + + data.pop("mfaCheck", None) return { cookie.name: cookie.value - for cookie in response.history[0].cookies - if cookie.expires and cookie.domain == self.cookies_domain + for cookie in response.cookies } def extract_post(self, post_url): diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index 8c8a5a9..5571575 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -110,7 +110,7 @@ class OAuthBase(Extractor): # get a request token params = {"oauth_callback": self.redirect_uri} - data = self.session.get(request_token_url, params=params).text + data = self.request(request_token_url, params=params).text data = text.parse_query(data) self.session.auth.token_secret = data["oauth_token_secret"] @@ -120,7 +120,7 @@ class OAuthBase(Extractor): data = self.open(authorize_url, params) # exchange the request token for an access token - data = self.session.get(access_token_url, params=data).text + data = self.request(access_token_url, params=data).text data = text.parse_query(data) token = data["oauth_token"] token_secret = data["oauth_token_secret"] @@ -189,7 +189,8 @@ class OAuthBase(Extractor): data["client_id"] = client_id data["client_secret"] = client_secret - data = self.session.post(token_url, data=data, auth=auth).json() + data = self.request( + token_url, method="POST", data=data, auth=auth).json() # check token response if "error" in data: @@ -386,7 +387,7 @@ class OAuthMastodon(OAuthBase): "redirect_uris": self.redirect_uri, "scopes": "read", } - data = self.session.post(url, data=data).json() + data = self.request(url, method="POST", data=data).json() if "client_id" not in data or "client_secret" not in data: raise exception.StopExtraction( @@ -441,7 +442,8 @@ class OAuthPixiv(OAuthBase): "redirect_uri" : "https://app-api.pixiv.net" "/web/v1/users/auth/pixiv/callback", } - data = self.session.post(url, headers=headers, data=data).json() + data = self.request( + url, method="POST", headers=headers, data=data).json() if "error" in data: stdout_write("\n{}\n".format(data)) diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index 62d11f2..eb6d677 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -263,8 +263,9 @@ class PatreonExtractor(Extractor): page, 'id="__NEXT_DATA__" type="application/json">', '</script') if data: try: - return (util.json_loads(data)["props"]["pageProps"] - ["bootstrapEnvelope"]["bootstrap"]) + data = 
util.json_loads(data) + env = data["props"]["pageProps"]["bootstrapEnvelope"] + return env.get("pageBootstrap") or env["bootstrap"] except Exception as exc: self.log.debug("%s: %s", exc.__class__.__name__, exc) diff --git a/gallery_dl/extractor/pixeldrain.py b/gallery_dl/extractor/pixeldrain.py index 5cfdc43..83f3577 100644 --- a/gallery_dl/extractor/pixeldrain.py +++ b/gallery_dl/extractor/pixeldrain.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2023 Mike Fährmann +# Copyright 2023-2024 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -59,12 +59,13 @@ class PixeldrainAlbumExtractor(PixeldrainExtractor): directory_fmt = ("{category}", "{album[date]:%Y-%m-%d} {album[title]} ({album[id]})") filename_fmt = "{num:>03} {filename[:230]} ({id}).{extension}" - pattern = BASE_PATTERN + r"/(?:l|api/list)/(\w+)" + pattern = BASE_PATTERN + r"/(?:l|api/list)/(\w+)(?:#item=(\d+))?" example = "https://pixeldrain.com/l/abcdefgh" def __init__(self, match): Extractor.__init__(self, match) self.album_id = match.group(1) + self.file_index = match.group(2) def items(self): url = "{}/api/list/{}".format(self.root, self.album_id) @@ -74,11 +75,20 @@ class PixeldrainAlbumExtractor(PixeldrainExtractor): album["count"] = album["file_count"] album["date"] = self.parse_datetime(album["date_created"]) + if self.file_index: + idx = text.parse_int(self.file_index) + try: + files = (files[idx],) + except LookupError: + files = () + else: + idx = 0 + del album["files"] del album["file_count"] yield Message.Directory, {"album": album} - for num, file in enumerate(files, 1): + for num, file in enumerate(files, idx+1): file["album"] = album file["num"] = num file["url"] = url = "{}/api/file/{}?download".format( diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 862a7db..d732894 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -104,8 +104,9 @@ class PixivExtractor(Extractor): elif work["page_count"] == 1: url = meta_single_page["original_image_url"] if url == url_sanity: - self.log.debug("Skipping 'sanity_level' warning (%s)", - work["id"]) + self.log.warning( + "Unable to download work %s ('sanity_level' warning)", + work["id"]) continue work["date_url"] = self._date_from_url(url) yield Message.Url, url, text.nameext_from_url(url, work) @@ -619,6 +620,7 @@ class PixivNovelExtractor(PixivExtractor): meta_user = self.config("metadata") meta_bookmark = self.config("metadata-bookmark") embeds = self.config("embeds") + covers = self.config("covers") if embeds: headers = { @@ -658,6 +660,19 @@ class PixivNovelExtractor(PixivExtractor): novel["extension"] = "txt" yield Message.Url, "text:" + content, novel + if covers: + path = novel["image_urls"]["large"].partition("/img/")[2] + url = ("https://i.pximg.net/novel-cover-original/img/" + + path.rpartition(".")[0].replace("_master1200", "")) + novel["date_url"] = self._date_from_url(url) + novel["num"] += 1 + novel["suffix"] = "_p{:02}".format(novel["num"]) + novel["_fallback"] = (url + ".png",) + url_jpg = url + ".jpg" + text.nameext_from_url(url_jpg, novel) + yield Message.Url, url_jpg, novel + del novel["_fallback"] + if embeds: desktop = False illusts = {} diff --git a/gallery_dl/extractor/poipiku.py b/gallery_dl/extractor/poipiku.py index f42016f..bd22283 100644 --- a/gallery_dl/extractor/poipiku.py +++ b/gallery_dl/extractor/poipiku.py @@ -23,6 +23,12 @@ class PoipikuExtractor(Extractor): archive_fmt 
= "{post_id}_{num}" request_interval = (0.5, 1.5) + def _init(self): + self.cookies.set( + "LANG", "en", domain="poipiku.com") + self.cookies.set( + "POIPIKU_CONTENTS_VIEW_MODE", "1", domain="poipiku.com") + def items(self): password = self.config("password", "") @@ -59,7 +65,7 @@ class PoipikuExtractor(Extractor): "//img.", "//img-org.", 1) yield Message.Url, url, text.nameext_from_url(url, post) - if not extr(' show all(+', '<'): + if not extr('ShowAppendFile', '<'): continue url = self.root + "/f/ShowAppendFileF.jsp" diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py index 3569860..115de9a 100644 --- a/gallery_dl/extractor/readcomiconline.py +++ b/gallery_dl/extractor/readcomiconline.py @@ -35,10 +35,7 @@ class ReadcomiconlineBase(): self.log.warning( "Redirect to \n%s\nVisit this URL in your browser, solve " "the CAPTCHA, and press ENTER to continue", response.url) - try: - input() - except (EOFError, OSError): - pass + self.input() else: raise exception.StopExtraction( "Redirect to \n%s\nVisit this URL in your browser and " diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index e099c7e..ce602f6 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -74,8 +74,8 @@ class RedditExtractor(Extractor): yield Message.Url, url, submission elif "gallery_data" in media: - for submission["num"], url in enumerate( - self._extract_gallery(media), 1): + for url in self._extract_gallery(media): + submission["num"] += 1 text.nameext_from_url(url, submission) yield Message.Url, url, submission @@ -99,7 +99,10 @@ class RedditExtractor(Extractor): urls.append((url, submission)) for comment in comments: html = comment["body_html"] or "" - if ' href="' in html: + href = (' href="' in html) + media = ("media_metadata" in comment) + + if media or href: comment["date"] = text.parse_timestamp( comment["created_utc"]) if submission: @@ -107,6 +110,14 @@ class RedditExtractor(Extractor): data["comment"] = comment else: data = comment + + if media: + for embed in self._extract_embed(comment): + submission["num"] += 1 + text.nameext_from_url(embed, submission) + yield Message.Url, embed, submission + + if href: for url in text.extract_iter(html, ' href="', '"'): urls.append((url, data)) @@ -118,6 +129,7 @@ class RedditExtractor(Extractor): if url.startswith(( "https://www.reddit.com/message/compose", "https://reddit.com/message/compose", + "https://preview.redd.it/", )): continue @@ -172,6 +184,27 @@ class RedditExtractor(Extractor): submission["id"], item["media_id"]) self.log.debug(src) + def _extract_embed(self, submission): + meta = submission["media_metadata"] + if not meta: + return + + for mid, data in meta.items(): + if data["status"] != "valid" or "s" not in data: + self.log.warning( + "embed %s: skipping item %s (status: %s)", + submission["id"], mid, data.get("status")) + continue + src = data["s"] + url = src.get("u") or src.get("gif") or src.get("mp4") + if url: + yield url.partition("?")[0].replace("/preview.", "/i.", 1) + else: + self.log.error( + "embed %s: unable to fetch download URL for item %s", + submission["id"], mid) + self.log.debug(src) + def _extract_video_ytdl(self, submission): return "https://www.reddit.com" + submission["permalink"] @@ -454,14 +487,14 @@ class RedditAPI(): remaining = response.headers.get("x-ratelimit-remaining") if remaining and float(remaining) < 2: - if self._warn_429: - self._warn_429 = False + self.log.warning("API rate limit exceeded") + if self._warn_429 and 
self.client_id == self.CLIENT_ID: self.log.info( "Register your own OAuth application and use its " "credentials to prevent this error: " - "https://github.com/mikf/gallery-dl/blob/master" - "/docs/configuration.rst" - "#extractorredditclient-id--user-agent") + "https://gdl-org.github.io/docs/configuration.html" + "#extractor-reddit-client-id-user-agent") + self._warn_429 = False self.extractor.wait( seconds=response.headers["x-ratelimit-reset"]) continue diff --git a/gallery_dl/extractor/seiga.py b/gallery_dl/extractor/seiga.py index edfe1dc..23ba340 100644 --- a/gallery_dl/extractor/seiga.py +++ b/gallery_dl/extractor/seiga.py @@ -10,6 +10,7 @@ from .common import Extractor, Message from .. import text, util, exception +from ..cache import cache class SeigaExtractor(Extractor): @@ -17,6 +18,7 @@ class SeigaExtractor(Extractor): category = "seiga" archive_fmt = "{image_id}" cookies_domain = ".nicovideo.jp" + cookies_names = ("user_session",) root = "https://seiga.nicovideo.jp" def __init__(self, match): @@ -24,8 +26,7 @@ class SeigaExtractor(Extractor): self.start_image = 0 def items(self): - if not self.cookies_check(("user_session",)): - raise exception.StopExtraction("'user_session' cookie required") + self.login() images = iter(self.get_images()) data = next(images) @@ -50,6 +51,59 @@ class SeigaExtractor(Extractor): "HTTP redirect to login page (%s)", location.partition("?")[0]) return location.replace("/o/", "/priv/", 1) + def login(self): + if self.cookies_check(self.cookies_names): + return + + username, password = self._get_auth_info() + if username: + return self.cookies_update(self._login_impl(username, password)) + + raise exception.AuthorizationError( + "username & password or 'user_session' cookie required") + + @cache(maxage=365*86400, keyarg=1) + def _login_impl(self, username, password): + self.log.info("Logging in as %s", username) + + root = "https://account.nicovideo.jp" + response = self.request(root + "/login?site=seiga") + page = response.text + + data = { + "mail_tel": username, + "password": password, + } + url = root + text.unescape(text.extr(page, '<form action="', '"')) + response = self.request(url, method="POST", data=data) + + if "message=cant_login" in response.url: + raise exception.AuthenticationError() + + if "/mfa" in response.url: + page = response.text + email = text.extr(page, 'class="userAccount">', "<") + code = self.input("Email Confirmation Code ({}): ".format(email)) + + data = { + "otp": code, + "loginBtn": "Login", + "device_name": "gdl", + } + url = root + text.unescape(text.extr(page, '<form action="', '"')) + response = self.request(url, method="POST", data=data) + + if not response.history and \ + b"Confirmation code is incorrect" in response.content: + raise exception.AuthenticationError( + "Incorrect Confirmation Code") + + return { + cookie.name: cookie.value + for cookie in self.cookies + if cookie.expires and cookie.domain == self.cookies_domain + } + class SeigaUserExtractor(SeigaExtractor): """Extractor for images of a user from seiga.nicovideo.jp""" diff --git a/gallery_dl/extractor/slideshare.py b/gallery_dl/extractor/slideshare.py index b56ed27..e5e7a6b 100644 --- a/gallery_dl/extractor/slideshare.py +++ b/gallery_dl/extractor/slideshare.py @@ -47,13 +47,13 @@ class SlidesharePresentationExtractor(GalleryExtractor): } def images(self, page): - parts = self.slideshow["slideImages"][0]["baseUrl"].split("/") - - begin = "{}/95/{}-".format( - "/".join(parts[:4]), - self.slideshow["strippedTitle"], + slides = self.slideshow["slides"] 
+ begin = "{}/{}/95/{}-".format( + slides["host"], + slides["imageLocation"], + slides["title"], ) - end = "-1024.jpg?" + parts[-1].rpartition("?")[2] + end = "-1024.jpg" return [ (begin + str(n) + end, None) diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py index d4adfed..0abb3ab 100644 --- a/gallery_dl/extractor/subscribestar.py +++ b/gallery_dl/extractor/subscribestar.py @@ -43,6 +43,8 @@ class SubscribestarExtractor(Extractor): item.update(data) item["num"] = num text.nameext_from_url(item.get("name") or item["url"], item) + if item["url"][0] == "/": + item["url"] = self.root + item["url"] yield Message.Url, item["url"], item def posts(self): diff --git a/gallery_dl/extractor/tapas.py b/gallery_dl/extractor/tapas.py index 0a9df20..167953d 100644 --- a/gallery_dl/extractor/tapas.py +++ b/gallery_dl/extractor/tapas.py @@ -151,3 +151,18 @@ class TapasEpisodeExtractor(TapasExtractor): def episode_ids(self): return (self.episode_id,) + + +class TapasCreatorExtractor(TapasExtractor): + subcategory = "creator" + pattern = BASE_PATTERN + r"/(?!series|episode)([^/?#]+)" + example = "https://tapas.io/CREATOR" + + def items(self): + url = "{}/{}/series".format(self.root, self.groups[0]) + page = self.request(url).text + page = text.extr(page, '<ul class="content-list-wrap', "</ul>") + + data = {"_extractor": TapasSeriesExtractor} + for path in text.extract_iter(page, ' href="', '"'): + yield Message.Queue, self.root + path, data diff --git a/gallery_dl/extractor/tcbscans.py b/gallery_dl/extractor/tcbscans.py index a3ef26c..de6f3ee 100644 --- a/gallery_dl/extractor/tcbscans.py +++ b/gallery_dl/extractor/tcbscans.py @@ -30,7 +30,7 @@ class TcbscansChapterExtractor(ChapterExtractor): page, 'font-bold mt-8">', "</h1>").rpartition(" - Chapter ") chapter, sep, minor = chapter.partition(".") return { - "manga": text.unescape(manga), + "manga": text.unescape(manga).strip(), "chapter": text.parse_int(chapter), "chapter_minor": sep + minor, "lang": "en", "language": "English", diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index fee0145..c34910f 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -447,9 +447,9 @@ class TumblrAPI(oauth.OAuth1API): if api_key == self.API_KEY: self.log.info( "Register your own OAuth application and use its " - "credentials to prevent this error: https://githu" - "b.com/mikf/gallery-dl/blob/master/docs/configurat" - "ion.rst#extractortumblrapi-key--api-secret") + "credentials to prevent this error: " + "https://gdl-org.github.io/docs/configuration.html" + "#extractor-tumblr-api-key-api-secret") if self.extractor.config("ratelimit") == "wait": self.extractor.wait(seconds=reset) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index a5bd984..ff77828 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -6,17 +6,18 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://twitter.com/""" +"""Extractors for https://x.com/""" from .common import Extractor, Message from .. import text, util, exception from ..cache import cache, memcache import itertools +import random import json import re BASE_PATTERN = (r"(?:https?://)?(?:www\.|mobile\.)?" 
- r"(?:(?:[fv]x)?twitter|(?:fixup)?x)\.com") + r"(?:(?:[fv]x)?twitter|(?:fix(?:up|v))?x)\.com") class TwitterExtractor(Extractor): @@ -25,9 +26,9 @@ class TwitterExtractor(Extractor): directory_fmt = ("{category}", "{user[name]}") filename_fmt = "{tweet_id}_{num}.{extension}" archive_fmt = "{tweet_id}_{retweet_id}_{num}" - cookies_domain = ".twitter.com" + cookies_domain = ".x.com" cookies_names = ("auth_token",) - root = "https://twitter.com" + root = "https://x.com" browser = "firefox" def __init__(self, match): @@ -243,8 +244,8 @@ class TwitterExtractor(Extractor): # collect URLs from entities for url in tweet["entities"].get("urls") or (): - url = url["expanded_url"] - if "//twitpic.com/" not in url or "/photos/" in url: + url = url.get("expanded_url") or url.get("url") or "" + if not url or "//twitpic.com/" not in url or "/photos/" in url: continue if url.startswith("http:"): url = "https" + url[4:] @@ -336,12 +337,20 @@ class TwitterExtractor(Extractor): urls = entities.get("urls") if urls: for url in urls: - content = content.replace(url["url"], url["expanded_url"]) + try: + content = content.replace(url["url"], url["expanded_url"]) + except KeyError: + pass txt, _, tco = content.rpartition(" ") tdata["content"] = txt if tco.startswith("https://t.co/") else content if "birdwatch_pivot" in tweet: - tdata["birdwatch"] = tweet["birdwatch_pivot"]["subtitle"]["text"] + try: + tdata["birdwatch"] = \ + tweet["birdwatch_pivot"]["subtitle"]["text"] + except KeyError: + self.log.debug("Unable to extract 'birdwatch' note from %s", + tweet["birdwatch_pivot"]) if "in_reply_to_screen_name" in legacy: tdata["reply_to"] = legacy["in_reply_to_screen_name"] if "quoted_by" in legacy: @@ -398,7 +407,10 @@ class TwitterExtractor(Extractor): urls = entities["description"].get("urls") if urls: for url in urls: - descr = descr.replace(url["url"], url["expanded_url"]) + try: + descr = descr.replace(url["url"], url["expanded_url"]) + except KeyError: + pass udata["description"] = descr if "url" in entities: @@ -483,7 +495,13 @@ class TwitterExtractor(Extractor): username, password = self._get_auth_info() if username: - self.cookies_update(_login_impl(self, username, password)) + return self.cookies_update(_login_impl(self, username, password)) + + for cookie in self.cookies: + if cookie.domain == ".twitter.com": + self.cookies.set( + cookie.name, cookie.value, domain=self.cookies_domain, + expires=cookie.expires, secure=cookie.secure) class TwitterUserExtractor(TwitterExtractor): @@ -491,7 +509,7 @@ class TwitterUserExtractor(TwitterExtractor): subcategory = "user" pattern = (BASE_PATTERN + r"/(?!search)(?:([^/?#]+)/?(?:$|[?#])" r"|i(?:/user/|ntent/user\?user_id=)(\d+))") - example = "https://twitter.com/USER" + example = "https://x.com/USER" def __init__(self, match): TwitterExtractor.__init__(self, match) @@ -519,7 +537,7 @@ class TwitterTimelineExtractor(TwitterExtractor): """Extractor for a Twitter user timeline""" subcategory = "timeline" pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/timeline(?!\w)" - example = "https://twitter.com/USER/timeline" + example = "https://x.com/USER/timeline" def tweets(self): # yield initial batch of (media) tweets @@ -566,7 +584,7 @@ class TwitterTweetsExtractor(TwitterExtractor): """Extractor for Tweets from a user's Tweets timeline""" subcategory = "tweets" pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/tweets(?!\w)" - example = "https://twitter.com/USER/tweets" + example = "https://x.com/USER/tweets" def tweets(self): return self.api.user_tweets(self.user) @@ -576,7 
+594,7 @@ class TwitterRepliesExtractor(TwitterExtractor): """Extractor for Tweets from a user's timeline including replies""" subcategory = "replies" pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/with_replies(?!\w)" - example = "https://twitter.com/USER/with_replies" + example = "https://x.com/USER/with_replies" def tweets(self): return self.api.user_tweets_and_replies(self.user) @@ -586,7 +604,7 @@ class TwitterMediaExtractor(TwitterExtractor): """Extractor for Tweets from a user's Media timeline""" subcategory = "media" pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/media(?!\w)" - example = "https://twitter.com/USER/media" + example = "https://x.com/USER/media" def tweets(self): return self.api.user_media(self.user) @@ -596,7 +614,7 @@ class TwitterLikesExtractor(TwitterExtractor): """Extractor for liked tweets""" subcategory = "likes" pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/likes(?!\w)" - example = "https://twitter.com/USER/likes" + example = "https://x.com/USER/likes" def metadata(self): return {"user_likes": self.user} @@ -609,7 +627,7 @@ class TwitterBookmarkExtractor(TwitterExtractor): """Extractor for bookmarked tweets""" subcategory = "bookmark" pattern = BASE_PATTERN + r"/i/bookmarks()" - example = "https://twitter.com/i/bookmarks" + example = "https://x.com/i/bookmarks" def tweets(self): return self.api.user_bookmarks() @@ -625,7 +643,7 @@ class TwitterListExtractor(TwitterExtractor): """Extractor for Twitter lists""" subcategory = "list" pattern = BASE_PATTERN + r"/i/lists/(\d+)/?$" - example = "https://twitter.com/i/lists/12345" + example = "https://x.com/i/lists/12345" def tweets(self): return self.api.list_latest_tweets_timeline(self.user) @@ -635,7 +653,7 @@ class TwitterListMembersExtractor(TwitterExtractor): """Extractor for members of a Twitter list""" subcategory = "list-members" pattern = BASE_PATTERN + r"/i/lists/(\d+)/members" - example = "https://twitter.com/i/lists/12345/members" + example = "https://x.com/i/lists/12345/members" def items(self): self.login() @@ -646,7 +664,7 @@ class TwitterFollowingExtractor(TwitterExtractor): """Extractor for followed users""" subcategory = "following" pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/following(?!\w)" - example = "https://twitter.com/USER/following" + example = "https://x.com/USER/following" def items(self): self.login() @@ -657,7 +675,7 @@ class TwitterSearchExtractor(TwitterExtractor): """Extractor for Twitter search results""" subcategory = "search" pattern = BASE_PATTERN + r"/search/?\?(?:[^&#]+&)*q=([^&#]+)" - example = "https://twitter.com/search?q=QUERY" + example = "https://x.com/search?q=QUERY" def metadata(self): return {"search": text.unquote(self.user)} @@ -688,7 +706,7 @@ class TwitterHashtagExtractor(TwitterExtractor): """Extractor for Twitter hashtags""" subcategory = "hashtag" pattern = BASE_PATTERN + r"/hashtag/([^/?#]+)" - example = "https://twitter.com/hashtag/NAME" + example = "https://x.com/hashtag/NAME" def items(self): url = "{}/search?q=%23{}".format(self.root, self.user) @@ -700,7 +718,7 @@ class TwitterCommunityExtractor(TwitterExtractor): """Extractor for a Twitter community""" subcategory = "community" pattern = BASE_PATTERN + r"/i/communities/(\d+)" - example = "https://twitter.com/i/communities/12345" + example = "https://x.com/i/communities/12345" def tweets(self): if self.textonly: @@ -712,7 +730,7 @@ class TwitterCommunitiesExtractor(TwitterExtractor): """Extractor for followed Twitter communities""" subcategory = "communities" pattern = BASE_PATTERN + 
r"/([^/?#]+)/communities/?$" - example = "https://twitter.com/i/communities" + example = "https://x.com/i/communities" def tweets(self): return self.api.communities_main_page_timeline(self.user) @@ -724,7 +742,7 @@ class TwitterEventExtractor(TwitterExtractor): directory_fmt = ("{category}", "Events", "{event[id]} {event[short_title]}") pattern = BASE_PATTERN + r"/i/events/(\d+)" - example = "https://twitter.com/i/events/12345" + example = "https://x.com/i/events/12345" def metadata(self): return {"event": self.api.live_event(self.user)} @@ -736,8 +754,9 @@ class TwitterEventExtractor(TwitterExtractor): class TwitterTweetExtractor(TwitterExtractor): """Extractor for individual tweets""" subcategory = "tweet" - pattern = BASE_PATTERN + r"/([^/?#]+|i/web)/status/(\d+)/?$" - example = "https://twitter.com/USER/status/12345" + pattern = (BASE_PATTERN + r"/([^/?#]+|i/web)/status/(\d+)" + r"/?(?:$|\?|#|photo/|video/)") + example = "https://x.com/USER/status/12345" def __init__(self, match): TwitterExtractor.__init__(self, match) @@ -817,7 +836,7 @@ class TwitterQuotesExtractor(TwitterExtractor): """Extractor for quotes of a Tweet""" subcategory = "quotes" pattern = BASE_PATTERN + r"/(?:[^/?#]+|i/web)/status/(\d+)/quotes" - example = "https://twitter.com/USER/status/12345/quotes" + example = "https://x.com/USER/status/12345/quotes" def items(self): url = "{}/search?q=quoted_tweet_id:{}".format(self.root, self.user) @@ -830,7 +849,7 @@ class TwitterAvatarExtractor(TwitterExtractor): filename_fmt = "avatar {date}.{extension}" archive_fmt = "AV_{user[id]}_{date}" pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/photo" - example = "https://twitter.com/USER/photo" + example = "https://x.com/USER/photo" def tweets(self): self.api._user_id_by_screen_name(self.user) @@ -852,7 +871,7 @@ class TwitterBackgroundExtractor(TwitterExtractor): filename_fmt = "background {date}.{extension}" archive_fmt = "BG_{user[id]}_{date}" pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/header_photo" - example = "https://twitter.com/USER/header_photo" + example = "https://x.com/USER/header_photo" def tweets(self): self.api._user_id_by_screen_name(self.user) @@ -899,7 +918,7 @@ class TwitterAPI(): self.extractor = extractor self.log = extractor.log - self.root = "https://twitter.com/i/api" + self.root = "https://x.com/i/api" self._nsfw_warning = True self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode @@ -919,7 +938,7 @@ class TwitterAPI(): self.headers = { "Accept": "*/*", - "Referer": "https://twitter.com/", + "Referer": extractor.root + "/", "content-type": "application/json", "x-guest-token": None, "x-twitter-auth-type": "OAuth2Session" if auth_token else None, @@ -1262,7 +1281,7 @@ class TwitterAPI(): endpoint = "/1.1/guest/activate.json" self.log.info("Requesting guest token") return str(self._call( - endpoint, None, "POST", False, "https://api.twitter.com", + endpoint, None, "POST", False, "https://api.x.com", )["guest_token"]) def _authenticate_guest(self): @@ -1288,63 +1307,72 @@ class TwitterAPI(): if csrf_token: self.headers["x-csrf-token"] = csrf_token - if response.status_code < 400: + remaining = int(response.headers.get("x-rate-limit-remaining", 6)) + if remaining < 6 and remaining <= random.randrange(1, 6): + self._handle_ratelimit(response) + continue + + try: data = response.json() + except ValueError: + data = {"errors": ({"message": response.text},)} + + errors = data.get("errors") + if not errors: + return data + + retry = False + for error in errors: + msg = error.get("message") or 
"Unspecified" + self.log.debug("API error: '%s'", msg) + + if "this account is temporarily locked" in msg: + msg = "Account temporarily locked" + if self.extractor.config("locked") != "wait": + raise exception.AuthorizationError(msg) + self.log.warning(msg) + self.extractor.input("Press ENTER to retry.") + retry = True + + elif "Could not authenticate you" in msg: + if not self.extractor.config("relogin", True): + continue - errors = data.get("errors") - if not errors: - return data + username, password = self.extractor._get_auth_info() + if not username: + continue - retry = False - for error in errors: - msg = error.get("message") or "Unspecified" - self.log.debug("API error: '%s'", msg) + _login_impl.invalidate(username) + self.extractor.cookies_update( + _login_impl(self.extractor, username, password)) + self.__init__(self.extractor) + retry = True - if "this account is temporarily locked" in msg: - msg = "Account temporarily locked" - if self.extractor.config("locked") != "wait": - raise exception.AuthorizationError(msg) - self.log.warning("%s. Press ENTER to retry.", msg) - try: - input() - except (EOFError, OSError): - pass - retry = True - - elif msg.lower().startswith("timeout"): - retry = True + elif msg.lower().startswith("timeout"): + retry = True - if not retry: - return data - elif self.headers["x-twitter-auth-type"]: + if retry: + if self.headers["x-twitter-auth-type"]: self.log.debug("Retrying API request") continue + else: + # fall through to "Login Required" + response.status_code = 404 - # fall through to "Login Required" - response.status_code = 404 - - if response.status_code == 429: - # rate limit exceeded - if self.extractor.config("ratelimit") == "abort": - raise exception.StopExtraction("Rate limit exceeded") - - until = response.headers.get("x-rate-limit-reset") - seconds = None if until else 60 - self.extractor.wait(until=until, seconds=seconds) - continue - - if response.status_code in (403, 404) and \ + if response.status_code < 400: + return data + elif response.status_code in (403, 404) and \ not self.headers["x-twitter-auth-type"]: raise exception.AuthorizationError("Login required") + elif response.status_code == 429: + self._handle_ratelimit(response) + continue # error try: - data = response.json() - errors = ", ".join(e["message"] for e in data["errors"]) - except ValueError: - errors = response.text + errors = ", ".join(e["message"] for e in errors) except Exception: - errors = data.get("errors", "") + pass raise exception.StopExtraction( "%s %s (%s)", response.status_code, response.reason, errors) @@ -1680,6 +1708,13 @@ class TwitterAPI(): return variables["cursor"] = cursor + def _handle_ratelimit(self, response): + if self.extractor.config("ratelimit") == "abort": + raise exception.StopExtraction("Rate limit exceeded") + + until = response.headers.get("x-rate-limit-reset") + self.extractor.wait(until=until, seconds=None if until else 60) + def _process_tombstone(self, entry, tombstone): text = (tombstone.get("richText") or tombstone["text"])["text"] tweet_id = entry["entryId"].rpartition("-")[2] @@ -1695,22 +1730,22 @@ class TwitterAPI(): @cache(maxage=365*86400, keyarg=1) def _login_impl(extr, username, password): - import re - import random + def process(data, params=None): + response = extr.request( + url, params=params, headers=headers, json=data, + method="POST", fatal=None) - if re.fullmatch(r"[\w.%+-]+@[\w.-]+\.\w{2,}", username): - extr.log.warning( - "Login with email is no longer possible. 
" - "You need to provide your username or phone number instead.") - - def process(response): try: data = response.json() except ValueError: data = {"errors": ({"message": "Invalid response"},)} else: if response.status_code < 400: - return data["flow_token"] + try: + return (data["flow_token"], + data["subtasks"][0]["subtask_id"]) + except LookupError: + pass errors = [] for error in data.get("errors") or (): @@ -1719,9 +1754,13 @@ def _login_impl(extr, username, password): extr.log.debug(response.text) raise exception.AuthenticationError(", ".join(errors)) - extr.cookies.clear() + cookies = extr.cookies + cookies.clear() api = TwitterAPI(extr) api._authenticate_guest() + + url = "https://api.x.com/1.1/onboarding/task.json" + params = {"flow_name": "login"} headers = api.headers extr.log.info("Logging in as %s", username) @@ -1778,31 +1817,18 @@ def _login_impl(extr, username, password): "web_modal": 1, }, } - url = "https://api.twitter.com/1.1/onboarding/task.json?flow_name=login" - response = extr.request(url, method="POST", headers=headers, json=data) - data = { - "flow_token": process(response), - "subtask_inputs": [ - { - "subtask_id": "LoginJsInstrumentationSubtask", + flow_token, subtask = process(data, params) + while not cookies.get("auth_token"): + if subtask == "LoginJsInstrumentationSubtask": + data = { "js_instrumentation": { "response": "{}", "link": "next_link", }, - }, - ], - } - url = "https://api.twitter.com/1.1/onboarding/task.json" - response = extr.request( - url, method="POST", headers=headers, json=data, fatal=None) - - # username - data = { - "flow_token": process(response), - "subtask_inputs": [ - { - "subtask_id": "LoginEnterUserIdentifierSSO", + } + elif subtask == "LoginEnterUserIdentifierSSO": + data = { "settings_list": { "setting_responses": [ { @@ -1814,48 +1840,61 @@ def _login_impl(extr, username, password): ], "link": "next_link", }, - }, - ], - } - # url = "https://api.twitter.com/1.1/onboarding/task.json" - extr.sleep(random.uniform(2.0, 4.0), "login (username)") - response = extr.request( - url, method="POST", headers=headers, json=data, fatal=None) - - # password - data = { - "flow_token": process(response), - "subtask_inputs": [ - { - "subtask_id": "LoginEnterPassword", + } + elif subtask == "LoginEnterPassword": + data = { "enter_password": { "password": password, "link": "next_link", }, - }, - ], - } - # url = "https://api.twitter.com/1.1/onboarding/task.json" - extr.sleep(random.uniform(2.0, 4.0), "login (password)") - response = extr.request( - url, method="POST", headers=headers, json=data, fatal=None) - - # account duplication check ? 
diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py
index 41141c6..c112f4a 100644
--- a/gallery_dl/extractor/vsco.py
+++ b/gallery_dl/extractor/vsco.py
@@ -46,6 +46,8 @@ class VscoExtractor(Extractor):
                 url = "https://image-{}.vsco.co/{}".format(cdn, path)
             elif cdn.isdecimal():
                 url = "https://image.vsco.co/" + base
+            elif img["responsive_url"].startswith("http"):
+                url = img["responsive_url"]
             else:
                 url = "https://" + img["responsive_url"]

@@ -238,6 +240,34 @@ class VscoSpacesExtractor(VscoExtractor):
             yield Message.Queue, url, space


+class VscoAvatarExtractor(VscoExtractor):
+    """Extractor for vsco.co user avatars"""
+    subcategory = "avatar"
+    pattern = USER_PATTERN + r"/avatar"
+    example = "https://vsco.co/USER/avatar"
+
+    def images(self):
+        url = "{}/{}/gallery".format(self.root, self.user)
+        page = self.request(url).text
+        piid = text.extr(page, '"profileImageId":"', '"')
+
+        url = "https://im.vsco.co/" + piid
+        # needs GET request, since HEAD does not redirect to full URL
+        response = self.request(url, allow_redirects=False)
+
+        return ({
+            "_id"           : piid,
+            "is_video"      : False,
+            "grid_name"     : "",
+            "upload_date"   : 0,
+            "responsive_url": response.headers["Location"],
+            "video_url"     : "",
+            "image_meta"    : None,
+            "width"         : 0,
+            "height"        : 0,
+        },)
+
+
 class VscoImageExtractor(VscoExtractor):
     """Extractor for individual images on vsco.co"""
     subcategory = "image"
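Editor's note: VscoAvatarExtractor.images() obtains the full-size avatar URL without downloading the file: it sends a GET with redirects disabled and reads the target from the Location header (as the comment in the diff notes, a HEAD request would not trigger the redirect). A generic sketch of the same trick using plain requests, with an illustrative URL:

    import requests

    def resolve_redirect(url):
        # stop at the first redirect; the real file URL is in "Location"
        response = requests.get(url, allow_redirects=False, stream=True)
        response.close()  # the body is not needed
        return response.headers.get("Location")

    # e.g. resolve_redirect("https://im.vsco.co/" + profile_image_id)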
diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py
index ac00682..9370cfb 100644
--- a/gallery_dl/extractor/wikimedia.py
+++ b/gallery_dl/extractor/wikimedia.py
@@ -27,9 +27,9 @@ class WikimediaExtractor(BaseExtractor):

         if self.category == "wikimedia":
             self.category = self.root.split(".")[-2]
-        elif self.category == "fandom":
-            self.category = \
-                "fandom-" + self.root.partition(".")[0].rpartition("/")[2]
+        elif self.category in ("fandom", "wikigg"):
+            self.category = "{}-{}".format(
+                self.category, self.root.partition(".")[0].rpartition("/")[2])

         if path.startswith("wiki/"):
             path = path[5:]
@@ -69,14 +69,18 @@ class WikimediaExtractor(BaseExtractor):

     def items(self):
         for info in self._pagination(self.params):
-            image = info["imageinfo"][0]
+            try:
+                image = info["imageinfo"][0]
+            except LookupError:
+                self.log.debug("Missing 'imageinfo' for %s", info)
+                continue

             image["metadata"] = {
                 m["name"]: m["value"]
-                for m in image["metadata"]}
+                for m in image["metadata"] or ()}
             image["commonmetadata"] = {
                 m["name"]: m["value"]
-                for m in image["commonmetadata"]}
+                for m in image["commonmetadata"] or ()}

             filename = image["canonicaltitle"]
             image["filename"], _, image["extension"] = \
@@ -148,6 +152,10 @@ BASE_PATTERN = WikimediaExtractor.update({
         "root": None,
         "pattern": r"[\w-]+\.fandom\.com",
     },
+    "wikigg": {
+        "root": None,
+        "pattern": r"\w+\.wiki\.gg",
+    },
    "mariowiki": {
         "root": "https://www.mariowiki.com",
         "pattern": r"(?:www\.)?mariowiki\.com",
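Editor's note: the "or ()" fallbacks added to items() let the two dict comprehensions tolerate API responses where "metadata" or "commonmetadata" is null. A minimal illustration, with an assumed response snippet:

    # None or () evaluates to (), so the comprehension yields an empty
    # dict instead of raising "TypeError: 'NoneType' is not iterable"
    image = {"metadata": None}
    metadata = {m["name"]: m["value"] for m in image["metadata"] or ()}
    assert metadata == {}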
