| author    | 2024-01-23 23:35:00 -0500 |
|-----------|---------------------------|
| committer | 2024-01-23 23:35:00 -0500 |
| commit    | 12e23f1195164dcb740d6d4a4287e762c9e5e534 (patch) |
| tree      | e6b13483475c510ea2f685c21363271f23745c56 /gallery_dl/extractor |
| parent    | e949aaf6f6ac93896947d5b736e48e7911926efb (diff) |
New upstream version 1.26.7 (tag: upstream/1.26.7)
Diffstat (limited to 'gallery_dl/extractor')
33 files changed, 903 insertions, 273 deletions
diff --git a/gallery_dl/extractor/2ch.py b/gallery_dl/extractor/2ch.py
new file mode 100644
index 0000000..dbbf21b
--- /dev/null
+++ b/gallery_dl/extractor/2ch.py
@@ -0,0 +1,91 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://2ch.hk/"""
+
+from .common import Extractor, Message
+from .. import text, util
+
+
+class _2chThreadExtractor(Extractor):
+    """Extractor for 2ch threads"""
+    category = "2ch"
+    subcategory = "thread"
+    root = "https://2ch.hk"
+    directory_fmt = ("{category}", "{board}", "{thread} {title}")
+    filename_fmt = "{tim}{filename:? //}.{extension}"
+    archive_fmt = "{board}_{thread}_{tim}"
+    pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/res/(\d+)"
+    example = "https://2ch.hk/a/res/12345.html"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.board, self.thread = match.groups()
+
+    def items(self):
+        url = "{}/{}/res/{}.json".format(self.root, self.board, self.thread)
+        posts = self.request(url).json()["threads"][0]["posts"]
+
+        op = posts[0]
+        title = op.get("subject") or text.remove_html(op["comment"])
+
+        thread = {
+            "board" : self.board,
+            "thread": self.thread,
+            "title" : text.unescape(title)[:50],
+        }
+
+        yield Message.Directory, thread
+        for post in posts:
+            files = post.get("files")
+            if files:
+                post["post_name"] = post["name"]
+                post["date"] = text.parse_timestamp(post["timestamp"])
+                del post["files"]
+                del post["name"]
+
+                for file in files:
+                    file.update(thread)
+                    file.update(post)
+
+                    file["filename"] = file["fullname"].rpartition(".")[0]
+                    file["tim"], _, file["extension"] = \
+                        file["name"].rpartition(".")
+
+                    yield Message.Url, self.root + file["path"], file
+
+
+class _2chBoardExtractor(Extractor):
+    """Extractor for 2ch boards"""
+    category = "2ch"
+    subcategory = "board"
+    root = "https://2ch.hk"
+    pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/?$"
+    example = "https://2ch.hk/a/"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.board = match.group(1)
+
+    def items(self):
+        # index page
+        url = "{}/{}/index.json".format(self.root, self.board)
+        index = self.request(url).json()
+        index["_extractor"] = _2chThreadExtractor
+        for thread in index["threads"]:
+            url = "{}/{}/res/{}.html".format(
+                self.root, self.board, thread["thread_num"])
+            yield Message.Queue, url, index
+
+        # pages 1..n
+        for n in util.advance(index["pages"], 1):
+            url = "{}/{}/{}.json".format(self.root, self.board, n)
+            page = self.request(url).json()
+            page["_extractor"] = _2chThreadExtractor
+            for thread in page["threads"]:
+                url = "{}/{}/res/{}.html".format(
+                    self.root, self.board, thread["thread_num"])
+                yield Message.Queue, url, page
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 9e33f2c..d624736 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -10,6 +10,7 @@ import sys
 import re
 
 modules = [
+    "2ch",
     "2chan",
     "2chen",
     "35photo",
@@ -53,7 +54,7 @@ modules = [
     "gelbooru_v01",
     "gelbooru_v02",
     "gofile",
-    "hbrowse",
+    "hatenablog",
     "hentai2read",
     "hentaicosplays",
     "hentaifoundry",
@@ -145,6 +146,7 @@ modules = [
     "smugmug",
     "soundgasm",
     "speakerdeck",
+    "steamgriddb",
     "subscribestar",
     "szurubooru",
     "tapas",
@@ -175,6 +177,7 @@ modules = [
     "weibo",
     "wikiart",
     "wikifeet",
+    "wikimedia",
     "xhamster",
     "xvideos",
     "zerochan",
diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py
index cd6302e..e82cd09 100644
--- a/gallery_dl/extractor/batoto.py
+++ b/gallery_dl/extractor/batoto.py
@@ -10,8 +10,11 @@ from .common import Extractor, ChapterExtractor, MangaExtractor
 from .. import text, exception
 import re
 
-BASE_PATTERN = (r"(?:https?://)?"
-                r"(?:(?:ba|d|w)to\.to|\.to|(?:batotoo|mangatoto)\.com)")
+BASE_PATTERN = (r"(?:https?://)?(?:"
+                r"(?:ba|d|h|m|w)to\.to|"
+                r"(?:(?:manga|read)toto|batocomic|[xz]bato)\.(?:com|net|org)|"
+                r"comiko\.(?:net|org)|"
+                r"bat(?:otoo|o?two)\.com)")
 
 
 class BatotoBase():
@@ -38,7 +41,8 @@ class BatotoChapterExtractor(BatotoBase, ChapterExtractor):
     def metadata(self, page):
         extr = text.extract_from(page)
         manga, info, _ = extr("<title>", "<").rsplit(" - ", 3)
-        manga_id = extr("/title/", "/")
+        manga_id = text.extr(
+            extr('rel="canonical" href="', '"'), "/title/", "/")
 
         match = re.match(
             r"(?:Volume\s+(\d+) )?"
@@ -76,12 +80,13 @@ class BatotoMangaExtractor(BatotoBase, MangaExtractor):
     """Extractor for bato.to manga"""
     reverse = False
     chapterclass = BatotoChapterExtractor
-    pattern = BASE_PATTERN + r"/(?:title|series)/(\d+)[^/?#]*/?$"
+    pattern = (BASE_PATTERN +
+               r"/(?:title/(\d+)[^/?#]*|series/(\d+)(?:/[^/?#]*)?)/?$")
     example = "https://bato.to/title/12345-MANGA/"
 
     def __init__(self, match):
         self.root = text.root_from_url(match.group(0))
-        self.manga_id = match.group(1)
+        self.manga_id = match.group(1) or match.group(2)
         url = "{}/title/{}".format(self.root, self.manga_id)
         MangaExtractor.__init__(self, match, url)
diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py
index 58ae59d..402408e 100644
--- a/gallery_dl/extractor/blogger.py
+++ b/gallery_dl/extractor/blogger.py
@@ -37,7 +37,7 @@ class BloggerExtractor(BaseExtractor):
     findall_image = re.compile(
         r'src="(https?://(?:'
         r'blogger\.googleusercontent\.com/img|'
-        r'lh\d+\.googleusercontent\.com/|'
+        r'lh\d+(?:-\w+)?\.googleusercontent\.com|'
        r'\d+\.bp\.blogspot\.com)/[^"]+)').findall
     findall_video = re.compile(
         r'src="(https?://www\.blogger\.com/video\.g\?token=[^"]+)').findall
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 26123b8..e7fc14b 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -6,13 +6,13 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extractors for https://bunkrr.su/"""
+"""Extractors for https://bunkrr.ru/"""
 
 from .lolisafe import LolisafeAlbumExtractor
 from .. import text
 from urllib.parse import urlsplit, urlunsplit
 
-BASE_PATTERN = r"(?:https?://)?(?:app\.)?bunkr+\.(?:la|[sr]u|is|to)"
+BASE_PATTERN = r"(?:https?://)?(?:app\.)?bunkr+\.(?:[rs]u|la|is|to)"
 
 MEDIA_DOMAIN_OVERRIDES = {
     "cdn9.bunkr.ru" : "c9.bunkr.ru",
@@ -27,11 +27,11 @@ CDN_HOSTED_EXTENSIONS = (
 
 
 class BunkrAlbumExtractor(LolisafeAlbumExtractor):
-    """Extractor for bunkrr.su albums"""
+    """Extractor for bunkrr.ru albums"""
     category = "bunkr"
-    root = "https://bunkrr.su"
+    root = "https://bunkrr.ru"
     pattern = BASE_PATTERN + r"/a/([^/?#]+)"
-    example = "https://bunkrr.su/a/ID"
+    example = "https://bunkrr.ru/a/ID"
 
     def fetch_album(self, album_id):
         # album metadata
@@ -84,11 +84,11 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
 
 
 class BunkrMediaExtractor(BunkrAlbumExtractor):
-    """Extractor for bunkrr.su media links"""
+    """Extractor for bunkrr.ru media links"""
     subcategory = "media"
     directory_fmt = ("{category}",)
     pattern = BASE_PATTERN + r"/[vid]/([^/?#]+)"
-    example = "https://bunkrr.su/v/FILENAME"
+    example = "https://bunkrr.ru/v/FILENAME"
 
     def fetch_album(self, album_id):
         try:
diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py
index 2bf200b..ef5a44c 100644
--- a/gallery_dl/extractor/chevereto.py
+++ b/gallery_dl/extractor/chevereto.py
@@ -38,10 +38,6 @@ BASE_PATTERN = CheveretoExtractor.update({
         "root": "https://jpg4.su",
         "pattern": r"jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)",
     },
-    "pixl": {
-        "root": "https://pixl.li",
-        "pattern": r"pixl\.(?:li|is)",
-    },
     "imgkiwi": {
         "root": "https://img.kiwi",
         "pattern": r"img\.kiwi",
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 0dd05ef..cf0f8c9 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -102,6 +102,9 @@ class Extractor():
     def config_accumulate(self, key):
         return config.accumulate(self._cfgpath, key)
 
+    def config_instance(self, key, default=None):
+        return default
+
     def _config_shared(self, key, default=None):
         return config.interpolate_common(
             ("extractor",), self._cfgpath, key, default)
@@ -735,9 +738,10 @@ class BaseExtractor(Extractor):
         for index, group in enumerate(match.groups()):
             if group is not None:
                 if index:
-                    self.category, self.root = self.instances[index-1]
+                    self.category, self.root, info = self.instances[index-1]
                     if not self.root:
                         self.root = text.root_from_url(match.group(0))
+                    self.config_instance = info.get
                 else:
                     self.root = group
                     self.category = group.partition("://")[2]
@@ -757,7 +761,7 @@ class BaseExtractor(Extractor):
             root = info["root"]
             if root:
                 root = root.rstrip("/")
-            instance_list.append((category, root))
+            instance_list.append((category, root, info))
 
             pattern = info.get("pattern")
             if not pattern:
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 4b5f1d7..bcfbe73 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -38,7 +38,7 @@ class DeviantartExtractor(Extractor):
 
     def __init__(self, match):
         Extractor.__init__(self, match)
-        self.user = (match.group(1) or match.group(2)).lower()
+        self.user = (match.group(1) or match.group(2) or "").lower()
         self.offset = 0
 
     def _init(self):
@@ -452,9 +452,11 @@ class DeviantartExtractor(Extractor):
             return None
 
         dev = self.api.deviation(deviation["deviationid"], False)
-        folder = dev["premium_folder_data"]
+        folder = deviation["premium_folder_data"]
         username = dev["author"]["username"]
-        has_access = folder["has_access"]
+
+        # premium_folder_data is no longer present when user has access (#5063)
+        has_access = ("premium_folder_data" not in dev) or folder["has_access"]
 
         if not has_access and folder["type"] == "watchers" and \
                 self.config("auto-watch"):
@@ -547,22 +549,45 @@ class DeviantartAvatarExtractor(DeviantartExtractor):
     example = "https://www.deviantart.com/USER/avatar/"
 
     def deviations(self):
-        profile = self.api.user_profile(self.user.lower())
-        if profile:
-            url = profile["user"]["usericon"]
-            return ({
-                "author"         : profile["user"],
-                "category"       : "avatar",
-                "index"          : text.parse_int(url.rpartition("?")[2]),
-                "is_deleted"     : False,
-                "is_downloadable": False,
-                "published_time" : 0,
-                "title"          : "avatar",
-                "content"        : {
-                    "src": url.replace("/avatars/", "/avatars-big/", 1),
-                },
-            },)
-        return ()
+        name = self.user.lower()
+        profile = self.api.user_profile(name)
+        if not profile:
+            return ()
+
+        user = profile["user"]
+        icon = user["usericon"]
+        index = icon.rpartition("?")[2]
+
+        formats = self.config("formats")
+        if not formats:
+            url = icon.replace("/avatars/", "/avatars-big/", 1)
+            return (self._make_deviation(url, user, index, ""),)
+
+        if isinstance(formats, str):
+            formats = formats.replace(" ", "").split(",")
+
+        results = []
+        for fmt in formats:
+            fmt, _, ext = fmt.rpartition(".")
+            if fmt:
+                fmt = "-" + fmt
+            url = "https://a.deviantart.net/avatars{}/{}/{}/{}.{}?{}".format(
+                fmt, name[0], name[1], name, ext, index)
+            results.append(self._make_deviation(url, user, index, fmt))
+        return results
+
+    def _make_deviation(self, url, user, index, fmt):
+        return {
+            "author"         : user,
+            "category"       : "avatar",
+            "index"          : text.parse_int(index),
+            "is_deleted"     : False,
+            "is_downloadable": False,
+            "published_time" : 0,
+            "title"          : "avatar" + fmt,
+            "stats"          : {"comments": 0},
+            "content"        : {"src": url},
+        }
 
 
 class DeviantartBackgroundExtractor(DeviantartExtractor):
diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py
index 6a0e069..8c9da2f 100644
--- a/gallery_dl/extractor/erome.py
+++ b/gallery_dl/extractor/erome.py
@@ -44,24 +44,26 @@ class EromeExtractor(Extractor):
             pos = page.index('<div class="user-profile', pos)
             user, pos = text.extract(
                 page, 'href="https://www.erome.com/', '"', pos)
-            count, pos = text.extract(
-                page, 'fa-camera"></i>', '</span>', pos)
+
+            urls = []
+            groups = page.split('<div class="media-group"')
+            for group in util.advance(groups, 1):
+                url = (text.extr(group, '<source src="', '"') or
+                       text.extr(group, 'data-src="', '"'))
+                if url:
+                    urls.append(url)
 
             data = {
                 "album_id"     : album_id,
                 "title"        : text.unescape(title),
                 "user"         : text.unquote(user),
+                "count"        : len(urls),
                 "_http_headers": {"Referer": url},
-                "count"        : text.parse_int(count),
             }
 
             yield Message.Directory, data
-            groups = page.split('<div class="media-group"')
-            for data["num"], group in enumerate(util.advance(groups, 1), 1):
-                url = (text.extr(group, '<source src="', '"') or
-                       text.extr(group, 'data-src="', '"'))
-                if url:
-                    yield Message.Url, url, text.nameext_from_url(url, data)
+            for data["num"], url in enumerate(urls, 1):
+                yield Message.Url, url, text.nameext_from_url(url, data)
 
     def albums(self):
         return ()
diff --git a/gallery_dl/extractor/fuskator.py b/gallery_dl/extractor/fuskator.py
index 20afb5a..beecbff 100644
--- a/gallery_dl/extractor/fuskator.py
+++ b/gallery_dl/extractor/fuskator.py
@@ -22,7 +22,7 @@ class FuskatorGalleryExtractor(GalleryExtractor):
 
     def __init__(self, match):
         self.gallery_hash = match.group(1)
-        url = "{}/thumbs/{}/".format(self.root, self.gallery_hash)
+        url = "{}/thumbs/{}/index.html".format(self.root, self.gallery_hash)
         GalleryExtractor.__init__(self, match, url)
 
     def metadata(self, page):
@@ -50,15 +50,16 @@ class FuskatorGalleryExtractor(GalleryExtractor):
             "gallery_id"  : text.parse_int(gallery_id),
             "gallery_hash": self.gallery_hash,
             "title"       : text.unescape(title[:-15]),
-            "views"       : data["hits"],
-            "score"       : data["rating"],
-            "tags"        : data["tags"].split(","),
-            "count"       : len(data["images"]),
+            "views"       : data.get("hits"),
+            "score"       : data.get("rating"),
+            "tags"        : (data.get("tags") or "").split(","),
         }
 
     def images(self, page):
-        for image in self.data["images"]:
-            yield "https:" + image["imageUrl"], image
+        return [
+            ("https:" + image["imageUrl"], image)
+            for image in self.data["images"]
+        ]
 
 
 class FuskatorSearchExtractor(Extractor):
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index eba1539..83f1392 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -23,7 +23,7 @@ class GelbooruBase():
     root = "https://gelbooru.com"
     offset = 0
 
-    def _api_request(self, params, key="post"):
+    def _api_request(self, params, key="post", log=False):
         if "s" not in params:
             params["s"] = "post"
         params["api_key"] = self.api_key
@@ -35,8 +35,9 @@ class GelbooruBase():
         try:
             posts = data[key]
         except KeyError:
-            self.log.error("Incomplete API response (missing '%s')", key)
-            self.log.debug("%s", data)
+            if log:
+                self.log.error("Incomplete API response (missing '%s')", key)
+                self.log.debug("%s", data)
             return []
 
         if not isinstance(posts, list):
@@ -117,7 +118,7 @@ class GelbooruBase():
 
 class GelbooruTagExtractor(GelbooruBase,
                            gelbooru_v02.GelbooruV02TagExtractor):
     """Extractor for images from gelbooru.com based on search-tags"""
-    pattern = BASE_PATTERN + r"page=post&s=list&tags=([^&#]+)"
+    pattern = BASE_PATTERN + r"page=post&s=list&tags=([^&#]*)"
     example = "https://gelbooru.com/index.php?page=post&s=list&tags=TAG"
 
@@ -169,7 +170,7 @@ class GelbooruFavoriteExtractor(GelbooruBase,
             "limit": "1",
         }
 
-        count = self._api_request(params, "@attributes")[0]["count"]
+        count = self._api_request(params, "@attributes", True)[0]["count"]
         if count <= self.offset:
             return
 
@@ -186,7 +187,7 @@ class GelbooruFavoriteExtractor(GelbooruBase,
         params["limit"] = self.per_page
 
         while True:
-            favs = self._api_request(params, "favorite")
+            favs = self._api_request(params, "favorite", True)
 
             favs.reverse()
             if skip:
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py
index 0c8af3d..7ab6d02 100644
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -22,14 +22,10 @@ class GelbooruV02Extractor(booru.BooruExtractor):
     def _init(self):
         self.api_key = self.config("api-key")
         self.user_id = self.config("user-id")
-
-        try:
-            self.api_root = INSTANCES[self.category]["api_root"]
-        except KeyError:
-            self.api_root = self.root
+        self.api_root = self.config_instance("api_root") or self.root
 
         if self.category == "realbooru":
-            self.items = self._items_realbooru
+            self._file_url = self._file_url_realbooru
             self._tags = self._tags_realbooru
 
     def _api_request(self, params):
@@ -128,28 +124,6 @@ class GelbooruV02Extractor(booru.BooruExtractor):
                 self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2])
         return url
 
-    def _items_realbooru(self):
-        from .common import Message
-        data = self.metadata()
-
-        for post in self.posts():
-            try:
-                html = self._html(post)
-                url = post["file_url"] = text.rextract(
-                    html, 'href="', '"', html.index(">Original<"))[0]
-            except Exception:
-                self.log.debug("Unable to fetch download URL for post %s "
-                               "(md5: %s)", post.get("id"), post.get("md5"))
-                continue
-
-            text.nameext_from_url(url, post)
-            post.update(data)
-            self._prepare(post)
-            self._tags(post, html)
-
-            yield Message.Directory, post
-            yield Message.Url, url, post
-
     def _tags_realbooru(self, post, page):
         tag_container = text.extr(page, 'id="tagLink"', '</div>')
         tags = collections.defaultdict(list)
@@ -161,7 +135,7 @@ class GelbooruV02Extractor(booru.BooruExtractor):
             post["tags_" + key] = " ".join(value)
 
 
-INSTANCES = {
+BASE_PATTERN = GelbooruV02Extractor.update({
     "realbooru": {
         "root": "https://realbooru.com",
         "pattern": r"realbooru\.com",
@@ -187,16 +161,14 @@ INSTANCES = {
         "root": "https://xbooru.com",
         "pattern": r"xbooru\.com",
     },
-}
-
-BASE_PATTERN = GelbooruV02Extractor.update(INSTANCES)
+})
 
 
 class GelbooruV02TagExtractor(GelbooruV02Extractor):
     subcategory = "tag"
     directory_fmt = ("{category}", "{search_tags}")
     archive_fmt = "t_{search_tags}_{id}"
-    pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]+)"
+    pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]*)"
     example = "https://safebooru.org/index.php?page=post&s=list&tags=TAG"
 
     def __init__(self, match):
@@ -208,6 +180,8 @@ class GelbooruV02TagExtractor(GelbooruV02Extractor):
         return {"search_tags": self.tags}
 
     def posts(self):
+        if self.tags == "all":
+            self.tags = ""
         return self._pagination({"tags": self.tags})
diff --git a/gallery_dl/extractor/hatenablog.py b/gallery_dl/extractor/hatenablog.py
new file mode 100644
index 0000000..792f666
--- /dev/null
+++ b/gallery_dl/extractor/hatenablog.py
@@ -0,0 +1,167 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://hatenablog.com"""
+
+import re
+from .common import Extractor, Message
+from .. import text
+
+
+BASE_PATTERN = (
+    r"(?:hatenablog:https?://([^/?#]+)|(?:https?://)?"
+    r"([\w-]+\.(?:hatenablog\.(?:com|jp)"
+    r"|hatenadiary\.com|hateblo\.jp)))"
+)
+QUERY_RE = r"(?:\?([^#]*))?(?:#.*)?$"
+
+
+class HatenablogExtractor(Extractor):
+    """Base class for HatenaBlog extractors"""
+    category = "hatenablog"
+    directory_fmt = ("{category}", "{domain}")
+    filename_fmt = "{category}_{domain}_{entry}_{num:>02}.{extension}"
+    archive_fmt = "{filename}"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.domain = match.group(1) or match.group(2)
+
+    def _init(self):
+        self._find_img = re.compile(r'<img +([^>]+)').finditer
+
+    def _handle_article(self, article: str):
+        extr = text.extract_from(article)
+        date = text.parse_datetime(extr('<time datetime="', '"'))
+        entry_link = text.unescape(extr('<a href="', '"'))
+        entry = entry_link.partition("/entry/")[2]
+        title = text.unescape(extr('>', '<'))
+        content = extr(
+            '<div class="entry-content hatenablog-entry">', '</div>')
+
+        images = []
+        for i in self._find_img(content):
+            attributes = i.group(1)
+            if 'class="hatena-fotolife"' not in attributes:
+                continue
+            image = text.unescape(text.extr(attributes, 'src="', '"'))
+            images.append(image)
+
+        data = {
+            "domain": self.domain,
+            "date": date,
+            "entry": entry,
+            "title": title,
+            "count": len(images),
+        }
+        yield Message.Directory, data
+        for data["num"], url in enumerate(images, 1):
+            yield Message.Url, url, text.nameext_from_url(url, data)
+
+
+class HatenablogEntriesExtractor(HatenablogExtractor):
+    """Base class for a list of entries"""
+    allowed_parameters = ()
+
+    def __init__(self, match):
+        HatenablogExtractor.__init__(self, match)
+        self.path = match.group(3)
+        self.query = {key: value for key, value in text.parse_query(
+            match.group(4)).items() if self._acceptable_query(key)}
+
+    def _init(self):
+        HatenablogExtractor._init(self)
+        self._find_pager_url = re.compile(
+            r' class="pager-next">\s*<a href="([^"]+)').search
+
+    def items(self):
+        url = "https://" + self.domain + self.path
+        query = self.query
+
+        while url:
+            page = self.request(url, params=query).text
+
+            extr = text.extract_from(page)
+            attributes = extr('<body ', '>')
+            if "page-archive" in attributes:
+                yield from self._handle_partial_articles(extr)
+            else:
+                yield from self._handle_full_articles(extr)
+
+            match = self._find_pager_url(page)
+            url = text.unescape(match.group(1)) if match else None
+            query = None
+
+    def _handle_partial_articles(self, extr):
+        while True:
+            section = extr('<section class="archive-entry', '</section>')
+            if not section:
+                break
+
+            url = "hatenablog:" + text.unescape(text.extr(
+                section, '<a class="entry-title-link" href="', '"'))
+            data = {"_extractor": HatenablogEntryExtractor}
+            yield Message.Queue, url, data
+
+    def _handle_full_articles(self, extr):
+        while True:
+            attributes = extr('<article ', '>')
+            if not attributes:
+                break
+            if "no-entry" in attributes:
+                continue
+
+            article = extr('', '</article>')
+            yield from self._handle_article(article)
+
+    def _acceptable_query(self, key):
+        return key == "page" or key in self.allowed_parameters
+
+
+class HatenablogEntryExtractor(HatenablogExtractor):
+    """Extractor for a single entry URL"""
+    subcategory = "entry"
+    pattern = BASE_PATTERN + r"/entry/([^?#]+)" + QUERY_RE
+    example = "https://BLOG.hatenablog.com/entry/PATH"
+
+    def __init__(self, match):
+        HatenablogExtractor.__init__(self, match)
+        self.path = match.group(3)
+
+    def items(self):
+        url = "https://" + self.domain + "/entry/" + self.path
+        page = self.request(url).text
+
+        extr = text.extract_from(page)
+        while True:
+            attributes = extr('<article ', '>')
+            if "no-entry" in attributes:
+                continue
+            article = extr('', '</article>')
+            return self._handle_article(article)
+
+
+class HatenablogHomeExtractor(HatenablogEntriesExtractor):
+    """Extractor for a blog's home page"""
+    subcategory = "home"
+    pattern = BASE_PATTERN + r"(/?)" + QUERY_RE
+    example = "https://BLOG.hatenablog.com"
+
+
+class HatenablogArchiveExtractor(HatenablogEntriesExtractor):
+    """Extractor for a blog's archive page"""
+    subcategory = "archive"
+    pattern = (BASE_PATTERN + r"(/archive(?:/\d+(?:/\d+(?:/\d+)?)?"
+               r"|/category/[^?#]+)?)" + QUERY_RE)
+    example = "https://BLOG.hatenablog.com/archive/2024"
+
+
+class HatenablogSearchExtractor(HatenablogEntriesExtractor):
+    """Extractor for a blog's search results"""
+    subcategory = "search"
+    pattern = BASE_PATTERN + r"(/search)" + QUERY_RE
+    example = "https://BLOG.hatenablog.com/search?q=QUERY"
+    allowed_parameters = ("q",)
diff --git a/gallery_dl/extractor/hbrowse.py b/gallery_dl/extractor/hbrowse.py
deleted file mode 100644
index a522140..0000000
--- a/gallery_dl/extractor/hbrowse.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2015-2023 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://www.hbrowse.com/"""
-
-from .common import ChapterExtractor, MangaExtractor
-from .. import text, util, exception
-
-
-class HbrowseBase():
-    """Base class for hbrowse extractors"""
-    category = "hbrowse"
-    root = "https://www.hbrowse.com"
-
-    def parse_page(self, page, data):
-        """Parse metadata on 'page' and add it to 'data'"""
-        data, pos = text.extract_all(page, (
-            ('manga' , '<td class="listLong">', '</td>'),
-            ('artist', '<td class="listLong">', '</td>'),
-            ('total' , '<td class="listLong">', ' '),
-            ('origin', '<td class="listLong">', '</td>'),
-        ), values=data)
-
-        if not data["manga"] and "<b>Warning</b>" in page:
-            msg = page.rpartition(">")[2].strip()
-            raise exception.StopExtraction("Site is not accessible: '%s'", msg)
-
-        tags = text.extract(page, 'class="listTable"', '</table>', pos)[0]
-
-        data["manga"] = text.unescape(data["manga"])
-        data["total"] = text.parse_int(data["total"])
-        data["artist"] = text.remove_html(data["artist"])
-        data["origin"] = text.remove_html(data["origin"])
-        data["tags"] = list(text.extract_iter(tags, 'href="/browse/', '"'))
-        return data
-
-
-class HbrowseChapterExtractor(HbrowseBase, ChapterExtractor):
-    """Extractor for manga-chapters from hbrowse.com"""
-    directory_fmt = ("{category}", "{manga_id} {manga}", "c{chapter:>05}")
-    filename_fmt = ("{category}_{manga_id}_{chapter:>05}_"
-                    "{page:>03}.{extension}")
-    archive_fmt = "{manga_id}_{chapter}_{page}"
-    pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/(\d+)/c(\d+))"
-    example = "https://www.hbrowse.com/12345/c00000"
-
-    def __init__(self, match):
-        self.path, self.gid, self.chapter = match.groups()
-        self.path += "/"
-        ChapterExtractor.__init__(self, match)
-
-    def metadata(self, page):
-        return self.parse_page(page, {
-            "manga_id": text.parse_int(self.gid),
-            "chapter": text.parse_int(self.chapter)
-        })
-
-    def images(self, page):
-        base = self.root + "/data" + self.path
-        json_data = text.extract(page, ';list = ', ',"zzz"')[0] + "]"
-        return [(base + name, None) for name in util.json_loads(json_data)]
-
-
-class HbrowseMangaExtractor(HbrowseBase, MangaExtractor):
-    """Extractor for manga from hbrowse.com"""
-    chapterclass = HbrowseChapterExtractor
-    reverse = False
-    pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/\d+)/?$"
-    example = "https://www.hbrowse.com/12345"
-
-    def chapters(self, page):
-        results = []
-        data = self.parse_page(page, {
-            "manga_id": text.parse_int(
-                self.manga_url.rstrip("/").rpartition("/")[2])
-        })
-
-        pos = 0
-        needle = '<td class="listMiddle">\n<a class="listLink" href="'
-        while True:
-            url, pos = text.extract(page, needle, '"', pos)
-            if not url:
-                return results
-            title, pos = text.extract(page, '>View ', '<', pos)
-            data["chapter"] = text.parse_int(url.rpartition("/")[2][1:])
-            data["title"] = title
-            results.append((text.urljoin(self.root, url), data.copy()))
diff --git a/gallery_dl/extractor/issuu.py b/gallery_dl/extractor/issuu.py
index f6170c2..54c6539 100644
--- a/gallery_dl/extractor/issuu.py
+++ b/gallery_dl/extractor/issuu.py
@@ -29,8 +29,9 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor):
     example = "https://issuu.com/issuu/docs/TITLE/"
 
     def metadata(self, page):
+        pos = page.rindex('id="initial-data"')
         data = util.json_loads(text.rextract(
-            page, '<script data-json="', '"')[0].replace("&quot;", '"'))
+            page, '<script data-json="', '"', pos)[0].replace("&quot;", '"'))
 
         doc = data["initialDocumentData"]["document"]
         doc["date"] = text.parse_datetime(
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index c24e57d..10228b5 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -9,9 +9,10 @@
 """Extractors for https://kemono.party/"""
 
 from .common import Extractor, Message
-from .. import text, exception
+from .. import text, util, exception
 from ..cache import cache, memcache
 import itertools
+import json
 import re
 
 BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.(party|su)"
@@ -37,10 +38,14 @@ class KemonopartyExtractor(Extractor):
         Extractor.__init__(self, match)
 
     def _init(self):
+        self.revisions = self.config("revisions")
         self._prepare_ddosguard_cookies()
         self._find_inline = re.compile(
             r'src="(?:https?://(?:kemono|coomer)\.(?:party|su))?(/inline/[^"]+'
             r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall
+        self._json_dumps = json.JSONEncoder(
+            ensure_ascii=False, check_circular=False,
+            sort_keys=True, separators=(",", ":")).encode
 
     def items(self):
         find_hash = re.compile(HASH_PATTERN).match
@@ -223,11 +228,23 @@ class KemonopartyExtractor(Extractor):
 
         idx = len(revs)
         for rev in revs:
+            rev["revision_hash"] = self._revision_hash(rev)
            rev["revision_index"] = idx
            idx -= 1
 
         return revs
 
+    def _revision_hash(self, revision):
+        rev = revision.copy()
+        rev.pop("revision_id", None)
+        rev.pop("added", None)
+        rev.pop("next", None)
+        rev.pop("prev", None)
+        rev["file"].pop("name", None)
+        for a in rev["attachments"]:
+            a.pop("name", None)
+        return util.sha1(self._json_dumps(rev))
+
 
 def _validate(response):
     return (response.headers["content-length"] != "9" or
@@ -252,13 +269,13 @@ class KemonopartyUserExtractor(KemonopartyExtractor):
         url = self.api_url
         params = text.parse_query(self.query)
         params["o"] = text.parse_int(params.get("o"))
-        revisions = self.config("revisions")
 
         while True:
             posts = self.request(url, params=params).json()
 
-            if revisions:
+            if self.revisions:
                 for post in posts:
+                    post["revision_hash"] = self._revision_hash(post)
                     post["revision_id"] = 0
                     post_url = "{}/post/{}".format(self.api_url, post["id"])
                     try:
@@ -296,7 +313,8 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
     def posts(self):
         if not self.revision:
             post = self.request(self.api_url).json()
-            if self.config("revisions"):
+            if self.revisions:
+                post["revision_hash"] = self._revision_hash(post)
                 post["revision_id"] = 0
                 try:
                     revs = self._post_revisions(self.api_url)
diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py
index 94bea57..bca7e4d 100644
--- a/gallery_dl/extractor/mangadex.py
+++ b/gallery_dl/extractor/mangadex.py
@@ -148,6 +148,32 @@ class MangadexFeedExtractor(MangadexExtractor):
         return self.api.user_follows_manga_feed()
 
 
+class MangadexListExtractor(MangadexExtractor):
+    """Extractor for mangadex lists"""
+    subcategory = "list"
+    pattern = (BASE_PATTERN +
+               r"/list/([0-9a-f-]+)(?:/[^/?#]*)?(?:\?tab=(\w+))?")
+    example = ("https://mangadex.org/list"
+               "/01234567-89ab-cdef-0123-456789abcdef/NAME")
+
+    def __init__(self, match):
+        MangadexExtractor.__init__(self, match)
+        if match.group(2) == "feed":
+            self.subcategory = "list-feed"
+        else:
+            self.items = self._items_titles
+
+    def chapters(self):
+        return self.api.list_feed(self.uuid)
+
+    def _items_titles(self):
+        data = {"_extractor": MangadexMangaExtractor}
+        for item in self.api.list(self.uuid)["relationships"]:
+            if item["type"] == "manga":
+                url = "{}/title/{}".format(self.root, item["id"])
+                yield Message.Queue, url, data
+
+
 class MangadexAPI():
     """Interface for the MangaDex API v5
 
@@ -173,6 +199,12 @@ class MangadexAPI():
         params = {"includes[]": ("scanlation_group",)}
         return self._call("/chapter/" + uuid, params)["data"]
 
+    def list(self, uuid):
+        return self._call("/list/" + uuid)["data"]
+
+    def list_feed(self, uuid):
+        return self._pagination("/list/" + uuid + "/feed")
+
     @memcache(keyarg=1)
     def manga(self, uuid):
         params = {"includes[]": ("artist", "author")}
diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py
index 0b63d6c..68b4196 100644
--- a/gallery_dl/extractor/mastodon.py
+++ b/gallery_dl/extractor/mastodon.py
@@ -75,7 +75,7 @@ class MastodonExtractor(BaseExtractor):
                 account["acct"], account["moved"]["acct"])
 
 
-INSTANCES = {
+BASE_PATTERN = MastodonExtractor.update({
     "mastodon.social": {
         "root"         : "https://mastodon.social",
         "pattern"      : r"mastodon\.social",
@@ -100,9 +100,7 @@ INSTANCES = {
         "client-id"    : "czxx2qilLElYHQ_sm-lO8yXuGwOHxLX9RYYaD0-nq1o",
         "client-secret": "haMaFdMBgK_-BIxufakmI2gFgkYjqmgXGEO2tB-R2xY",
     }
-}
-
-BASE_PATTERN = MastodonExtractor.update(INSTANCES) + "(?:/web)?"
+}) + "(?:/web)?"
 
 
 class MastodonUserExtractor(MastodonExtractor):
@@ -174,10 +172,8 @@ class MastodonAPI():
         if access_token is None or access_token == "cache":
             access_token = _access_token_cache(extractor.instance)
         if not access_token:
-            try:
-                access_token = INSTANCES[extractor.category]["access-token"]
-            except (KeyError, TypeError):
-                pass
+            access_token = extractor.config_instance("access-token")
+
         if access_token:
             self.headers = {"Authorization": "Bearer " + access_token}
         else:
diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py
index b991705..9614513 100644
--- a/gallery_dl/extractor/nijie.py
+++ b/gallery_dl/extractor/nijie.py
@@ -116,7 +116,8 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
             yield from text.extract_iter(
                 page, 'href="javascript:void(0);"><img src="', '"')
         else:
-            yield text.extr(page, 'itemprop="image" src="', '"')
+            pos = page.find('id="view-center"') + 1
+            yield text.extract(page, 'itemprop="image" src="', '"', pos)[0]
 
     @staticmethod
     def _extract_user_name(page):
diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py
index bc7b308..d36f509 100644
--- a/gallery_dl/extractor/nitter.py
+++ b/gallery_dl/extractor/nitter.py
@@ -235,10 +235,6 @@ BASE_PATTERN = NitterExtractor.update({
         "root": "https://nitter.net",
         "pattern": r"nitter\.net",
     },
-    "nitter.lacontrevoie.fr": {
-        "root": "https://nitter.lacontrevoie.fr",
-        "pattern": r"nitter\.lacontrevoie\.fr",
-    },
     "nitter.1d4.us": {
         "root": "https://nitter.1d4.us",
         "pattern": r"nitter\.1d4\.us",
diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
index 1690160..8c8a5a9 100644
--- a/gallery_dl/extractor/oauth.py
+++ b/gallery_dl/extractor/oauth.py
@@ -358,8 +358,8 @@ class OAuthMastodon(OAuthBase):
         yield Message.Version, 1
 
         from . import mastodon
-        for application in mastodon.INSTANCES.values():
-            if self.instance == application["root"].partition("://")[2]:
+        for _, root, application in mastodon.MastodonExtractor.instances:
+            if self.instance == root.partition("://")[2]:
                 break
         else:
             application = self._register(self.instance)
diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py
index 89c0d2f..5226724 100644
--- a/gallery_dl/extractor/paheal.py
+++ b/gallery_dl/extractor/paheal.py
@@ -56,7 +56,7 @@ class PahealExtractor(Extractor):
             "date"   : text.parse_datetime(
                 extr("datetime='", "'"), "%Y-%m-%dT%H:%M:%S%z"),
             "source" : text.unescape(text.extr(
-               extr(">Source Link<", "</td>"), "href='", "'")),
+                extr(">Source Link<", "</td>"), "href='", "'")),
         }
 
         dimensions, size, ext = extr("Info</th><td>", "<").split(" // ")
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index 6c2f39d..62d11f2 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -52,19 +52,29 @@ class PatreonExtractor(Extractor):
                 post["hash"] = fhash
                 post["type"] = kind
                 post["num"] += 1
-                yield Message.Url, url, text.nameext_from_url(name, post)
+                text.nameext_from_url(name, post)
+                if text.ext_from_url(url) == "m3u8":
+                    url = "ytdl:" + url
+                    post["extension"] = "mp4"
+                yield Message.Url, url, post
             else:
                 self.log.debug("skipping %s (%s %s)", url, fhash, kind)
 
-    @staticmethod
-    def _postfile(post):
+    def _postfile(self, post):
         postfile = post.get("post_file")
         if postfile:
-            return (("postfile", postfile["url"], postfile["name"]),)
+            url = postfile["url"]
+            name = postfile.get("name")
+            if not name:
+                if url.startswith("https://stream.mux.com/"):
+                    name = url
+                else:
+                    name = self._filename(url) or url
+            return (("postfile", url, name),)
         return ()
 
     def _images(self, post):
-        for image in post["images"]:
+        for image in post.get("images") or ():
             url = image.get("download_url")
             if url:
                 name = image.get("file_name") or self._filename(url) or url
@@ -80,7 +90,7 @@ class PatreonExtractor(Extractor):
         return ()
 
     def _attachments(self, post):
-        for attachment in post["attachments"]:
+        for attachment in post.get("attachments") or ():
             url = self.request(
                 attachment["url"], method="HEAD",
                 allow_redirects=False, fatal=False,
diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py
index ac6a391..339646f 100644
--- a/gallery_dl/extractor/philomena.py
+++ b/gallery_dl/extractor/philomena.py
@@ -32,7 +32,7 @@ class PhilomenaExtractor(BooruExtractor):
         post["date"] = text.parse_datetime(post["created_at"])
 
 
-INSTANCES = {
+BASE_PATTERN = PhilomenaExtractor.update({
     "derpibooru": {
         "root": "https://derpibooru.org",
         "pattern": r"(?:www\.)?derpibooru\.org",
@@ -48,9 +48,7 @@ INSTANCES = {
         "pattern": r"furbooru\.org",
         "filter_id": "2",
     },
-}
-
-BASE_PATTERN = PhilomenaExtractor.update(INSTANCES)
+})
 
 
 class PhilomenaPostExtractor(PhilomenaExtractor):
@@ -176,10 +174,7 @@ class PhilomenaAPI():
             if filter_id:
                 params["filter_id"] = filter_id
             elif not api_key:
-                try:
-                    params["filter_id"] = INSTANCES[extr.category]["filter_id"]
-                except (KeyError, TypeError):
-                    params["filter_id"] = "2"
+                params["filter_id"] = extr.config_instance("filter_id") or "2"
 
             params["page"] = extr.page_start
             params["per_page"] = extr.per_page
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index 4414c71..b9821f2 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -826,9 +826,9 @@ class PixivAppAPI():
 
         extractor.session.headers.update({
             "App-OS"        : "ios",
-            "App-OS-Version": "13.1.2",
-            "App-Version"   : "7.7.6",
-            "User-Agent"    : "PixivIOSApp/7.7.6 (iOS 13.1.2; iPhone11,8)",
+            "App-OS-Version": "16.7.2",
+            "App-Version"   : "7.19.1",
+            "User-Agent"    : "PixivIOSApp/7.19.1 (iOS 16.7.2; iPhone12,8)",
             "Referer"       : "https://app-api.pixiv.net/",
         })
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index 602895c..b3b7a9c 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -143,7 +143,7 @@ class SankakuPostExtractor(SankakuExtractor):
     """Extractor for single posts from sankaku.app"""
     subcategory = "post"
     archive_fmt = "{id}"
-    pattern = BASE_PATTERN + r"/posts?(?:/show)?/([0-9a-f]+)"
+    pattern = BASE_PATTERN + r"/posts?(?:/show)?/(\w+)"
     example = "https://sankaku.app/post/show/12345"
 
     def __init__(self, match):
diff --git a/gallery_dl/extractor/shimmie2.py b/gallery_dl/extractor/shimmie2.py
index 8a08fab..67f38c4 100644
--- a/gallery_dl/extractor/shimmie2.py
+++ b/gallery_dl/extractor/shimmie2.py
@@ -19,17 +19,12 @@ class Shimmie2Extractor(BaseExtractor):
     archive_fmt = "{id}"
 
     def _init(self):
-        try:
-            instance = INSTANCES[self.category]
-        except KeyError:
-            return
-
-        cookies = instance.get("cookies")
+        cookies = self.config_instance("cookies")
         if cookies:
             domain = self.root.rpartition("/")[2]
             self.cookies_update_dict(cookies, domain=domain)
 
-        file_url = instance.get("file_url")
+        file_url = self.config_instance("file_url")
         if file_url:
             self.file_url_fmt = file_url
 
@@ -73,15 +68,15 @@ class Shimmie2Extractor(BaseExtractor):
         return "'"
 
 
-INSTANCES = {
+BASE_PATTERN = Shimmie2Extractor.update({
     "loudbooru": {
         "root": "https://loudbooru.com",
         "pattern": r"loudbooru\.com",
         "cookies": {"ui-tnc-agreed": "true"},
     },
     "giantessbooru": {
-        "root": "https://giantessbooru.com",
-        "pattern": r"giantessbooru\.com",
+        "root": "https://sizechangebooru.com",
+        "pattern": r"(?:sizechange|giantess)booru\.com",
         "cookies": {"agreed": "true"},
     },
     "tentaclerape": {
@@ -97,9 +92,7 @@ INSTANCES = {
         "root": "https://rule34hentai.net",
         "pattern": r"rule34hentai\.net",
     },
-}
-
-BASE_PATTERN = Shimmie2Extractor.update(INSTANCES) + r"/(?:index\.php\?q=/?)?"
+}) + r"/(?:index\.php\?q=/?)?"
 
 
 class Shimmie2TagExtractor(Shimmie2Extractor):
@@ -183,25 +176,25 @@ class Shimmie2TagExtractor(Shimmie2Extractor):
         extr = text.extract_from(self.request(url).text)
 
         while True:
-            pid = extr('href="./index.php?q=/post/view/', '&')
+            pid = extr("href='./index.php?q=/post/view/", "&")
             if not pid:
                 break
 
-            tags, dimensions, size = extr('title="', '"').split(" // ")
+            tags, dimensions, size = extr("title='", "'").split(" // ")
             width, _, height = dimensions.partition("x")
 
             yield {
                 "file_url": file_url_fmt(pid),
-                "id": pid,
-                "md5": "",
-                "tags": tags,
-                "width": width,
-                "height": height,
-                "size": text.parse_bytes(size[:-1]),
+                "id"      : pid,
+                "md5"     : "",
+                "tags"    : tags,
+                "width"   : width,
+                "height"  : height,
+                "size"    : text.parse_bytes(size[:-1]),
             }
 
             pnum += 1
-            if not extr('/{}">{}<'.format(pnum, pnum), ">"):
+            if not extr("/{0}'>{0}<".format(pnum), ">"):
                 return
 
@@ -248,7 +241,7 @@ class Shimmie2PostExtractor(Shimmie2Extractor):
             "id"      : self.post_id,
             "tags"    : extr(": ", "<").partition(" - ")[0].rstrip(")"),
             "md5"     : "",
-            "file_url": self.root + extr('id="main_image" src=".', '"'),
+            "file_url": self.root + extr("id='main_image' src='.", "'"),
             "width"   : extr("orig_width =", ";"),
             "height"  : 0,
             "size"    : 0,
diff --git a/gallery_dl/extractor/steamgriddb.py b/gallery_dl/extractor/steamgriddb.py
new file mode 100644
index 0000000..9d46fd6
--- /dev/null
+++ b/gallery_dl/extractor/steamgriddb.py
@@ -0,0 +1,211 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.steamgriddb.com"""
+
+from .common import Extractor, Message
+from .. import text, exception
+
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?steamgriddb\.com"
+LANGUAGE_CODES = (
+    "aa", "ab", "ae", "af", "ak", "am", "an", "ar", "as", "av", "ay", "az",
+    "ba", "be", "bg", "bh", "bi", "bm", "bn", "bo", "br", "bs", "ca", "ce",
+    "ch", "co", "cr", "cs", "cu", "cv", "cy", "da", "de", "dv", "dz", "ee",
+    "el", "en", "eo", "es", "et", "eu", "fa", "ff", "fi", "fj", "fo", "fr",
+    "fy", "ga", "gd", "gl", "gn", "gu", "gv", "ha", "he", "hi", "ho", "hr",
+    "ht", "hu", "hy", "hz", "ia", "id", "ie", "ig", "ii", "ik", "io", "is",
+    "it", "iu", "ja", "jv", "ka", "kg", "ki", "kj", "kk", "kl", "km", "kn",
+    "ko", "kr", "ks", "ku", "kv", "kw", "ky", "la", "lb", "lg", "li", "ln",
+    "lo", "lt", "lu", "lv", "mg", "mh", "mi", "mk", "ml", "mn", "mr", "ms",
+    "mt", "my", "na", "nb", "nd", "ne", "ng", "nl", "nn", "no", "nr", "nv",
+    "ny", "oc", "oj", "om", "or", "os", "pa", "pi", "pl", "ps", "pt", "qu",
+    "rm", "rn", "ro", "ru", "rw", "sa", "sc", "sd", "se", "sg", "si", "sk",
+    "sl", "sm", "sn", "so", "sq", "sr", "ss", "st", "su", "sv", "sw", "ta",
+    "te", "tg", "th", "ti", "tk", "tl", "tn", "to", "tr", "ts", "tt", "tw",
+    "ty", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "wo", "xh", "yi",
+    "yo", "za", "zh", "zu",
+)
+FILE_EXT_TO_MIME = {
+    "png": "image/png",
+    "jpeg": "image/jpeg",
+    "jpg": "image/jpeg",
+    "webp": "image/webp",
+    "ico": "image/vnd.microsoft.icon",
+    "all": "all",
+}
+
+
+class SteamgriddbExtractor(Extractor):
+    """Base class for SteamGridDB"""
+    category = "steamgriddb"
+    directory_fmt = ("{category}", "{subcategory}", "{game[id]}")
+    filename_fmt = "{game[id]}_{id}_{num:>02}.{extension}"
+    archive_fmt = "{filename}"
+    root = "https://www.steamgriddb.com"
+
+    def _init(self):
+        self.cookies_update({
+            "userprefs": "%7B%22adult%22%3Afalse%7D",
+        })
+
+    def items(self):
+        download_fake_png = self.config("download-fake-png", True)
+
+        for asset in self.assets():
+            if download_fake_png and asset.get("fake_png"):
+                urls = (asset["url"], asset["fake_png"])
+            else:
+                urls = (asset["url"],)
+
+            asset["count"] = len(urls)
+            yield Message.Directory, asset
+            for asset["num"], url in enumerate(urls, 1):
+                yield Message.Url, url, text.nameext_from_url(url, asset)
+
+    def _call(self, endpoint, **kwargs):
+        data = self.request(self.root + endpoint, **kwargs).json()
+        if not data["success"]:
+            raise exception.StopExtraction(data["error"])
+        return data["data"]
+
+
+class SteamgriddbAssetsExtractor(SteamgriddbExtractor):
+    """Base class for extracting a list of assets"""
+
+    def __init__(self, match):
+        SteamgriddbExtractor.__init__(self, match)
+        list_type = match.group(1)
+        id = int(match.group(2))
+        self.game_id = id if list_type == "game" else None
+        self.collection_id = id if list_type == "collection" else None
+        self.page = int(match.group(3) or 1)
+
+    def assets(self):
+        limit = 48
+        page = min(self.page - 1, 0)
+
+        sort = self.config("sort", "score_desc")
+        if sort not in ("score_desc", "score_asc", "score_old_desc",
+                        "score_old_asc", "age_desc", "age_asc"):
+            raise exception.StopExtractor("Invalid sort '%s'", sort)
+
+        json = {
+            "static"  : self.config("static", True),
+            "animated": self.config("animated", True),
+            "humor"   : self.config("humor", True),
+            "nsfw"    : self.config("nsfw", True),
+            "epilepsy": self.config("epilepsy", True),
+            "untagged": self.config("untagged", True),
+
+            "asset_type": self.asset_type,
+            "limit": limit,
+            "order": sort,
+        }
+        if self.valid_dimensions:
+            json["dimensions"] = self.config_list(
+                "dimensions", "dimension", self.valid_dimensions)
+        json["styles"] = self.config_list("styles", "style", self.valid_styles)
+        json["languages"] = self.config_list(
+            "languages", "language", LANGUAGE_CODES)
+        file_types = self.config_list(
+            "file-types", "file type", self.valid_file_types)
+        json["mime"] = [FILE_EXT_TO_MIME[i] for i in file_types]
+
+        if self.game_id:
+            json["game_id"] = [self.game_id]
+        else:
+            json["collection_id"] = self.collection_id
+
+        while True:
+            json["page"] = page
+
+            data = self._call(
+                "/api/public/search/assets", method="POST", json=json)
+            for asset in data["assets"]:
+                if not asset.get("game"):
+                    asset["game"] = data["game"]
+                yield asset
+
+            if data["total"] <= limit * page:
+                break
+            page += 1
+
+    def config_list(self, key, type_name, valid_values):
+        value = self.config(key)
+        if isinstance(value, str):
+            value = value.split(",")
+
+        if value is None or "all" in value:
+            return ["all"]
+
+        for i in value:
+            if i not in valid_values:
+                raise exception.StopExtraction("Invalid %s '%s'", type_name, i)
+
+        return value
+
+
+class SteamgriddbAssetExtractor(SteamgriddbExtractor):
+    """Extractor for a single asset"""
+    subcategory = "asset"
+    pattern = BASE_PATTERN + r"/(grid|hero|logo|icon)/(\d+)"
+    example = "https://www.steamgriddb.com/grid/1234"
+
+    def __init__(self, match):
+        SteamgriddbExtractor.__init__(self, match)
+        self.asset_type = match.group(1)
+        self.asset_id = match.group(2)
+
+    def assets(self):
+        endpoint = "/api/public/asset/" + self.asset_type + "/" + self.asset_id
+        asset = self._call(endpoint)["asset"]
+        return (asset,)
+
+
+class SteamgriddbGridsExtractor(SteamgriddbAssetsExtractor):
+    subcategory = "grids"
+    asset_type = "grid"
+    pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/grids(?:/(\d+))?"
+    example = "https://www.steamgriddb.com/game/1234/grids"
+    valid_dimensions = ("460x215", "920x430", "600x900", "342x482", "660x930",
+                        "512x512", "1024x1024")
+    valid_styles = ("alternate", "blurred", "no_logo", "material",
+                    "white_logo")
+    valid_file_types = ("png", "jpeg", "jpg", "webp")
+
+
+class SteamgriddbHeroesExtractor(SteamgriddbAssetsExtractor):
+    subcategory = "heroes"
+    asset_type = "hero"
+    pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/heroes(?:/(\d+))?"
+    example = "https://www.steamgriddb.com/game/1234/heroes"
+    valid_dimensions = ("1920x620", "3840x1240", "1600x650")
+    valid_styles = ("alternate", "blurred", "material")
+    valid_file_types = ("png", "jpeg", "jpg", "webp")
+
+
+class SteamgriddbLogosExtractor(SteamgriddbAssetsExtractor):
+    subcategory = "logos"
+    asset_type = "logo"
+    pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/logos(?:/(\d+))?"
+    example = "https://www.steamgriddb.com/game/1234/logos"
+    valid_dimensions = None
+    valid_styles = ("official", "white", "black", "custom")
+    valid_file_types = ("png", "webp")
+
+
+class SteamgriddbIconsExtractor(SteamgriddbAssetsExtractor):
+    subcategory = "icons"
+    asset_type = "icon"
+    pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/icons(?:/(\d+))?"
+    example = "https://www.steamgriddb.com/game/1234/icons"
+    valid_dimensions = ["{0}x{0}".format(i) for i in (8, 10, 14, 16, 20, 24,
+                        28, 32, 35, 40, 48, 54, 56, 57, 60, 64, 72, 76, 80,
+                        90, 96, 100, 114, 120, 128, 144, 150, 152, 160, 180,
+                        192, 194, 256, 310, 512, 768, 1024)]
+    valid_styles = ("official", "custom")
+    valid_file_types = ("png", "ico")
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index aa9ab9f..cf759e0 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -546,7 +546,7 @@ class TwitterTimelineExtractor(TwitterExtractor):
     def _select_tweet_source(self):
         strategy = self.config("strategy")
         if strategy is None or strategy == "auto":
-            if self.retweets or self.replies or self.textonly:
+            if self.retweets or self.textonly:
                 return self.api.user_tweets
             else:
                 return self.api.user_media
diff --git a/gallery_dl/extractor/urlshortener.py b/gallery_dl/extractor/urlshortener.py
index f2e6521..49a3deb 100644
--- a/gallery_dl/extractor/urlshortener.py
+++ b/gallery_dl/extractor/urlshortener.py
@@ -15,7 +15,7 @@ class UrlshortenerExtractor(BaseExtractor):
     basecategory = "urlshortener"
 
 
-INSTANCES = {
+BASE_PATTERN = UrlshortenerExtractor.update({
     "bitly": {
         "root": "https://bit.ly",
         "pattern": r"bit\.ly",
@@ -26,9 +26,7 @@ INSTANCES = {
         "root": "https://t.co",
         "pattern": r"t\.co",
     },
-}
-
-BASE_PATTERN = UrlshortenerExtractor.update(INSTANCES)
+})
 
 
 class UrlshortenerLinkExtractor(UrlshortenerExtractor):
@@ -42,10 +40,7 @@ class UrlshortenerLinkExtractor(UrlshortenerExtractor):
         self.id = match.group(match.lastindex)
 
     def _init(self):
-        try:
-            self.headers = INSTANCES[self.category]["headers"]
-        except Exception:
-            self.headers = None
+        self.headers = self.config_instance("headers")
 
     def items(self):
         response = self.request(
diff --git a/gallery_dl/extractor/vk.py b/gallery_dl/extractor/vk.py
index c22e67e..95eeafe 100644
--- a/gallery_dl/extractor/vk.py
+++ b/gallery_dl/extractor/vk.py
@@ -10,6 +10,7 @@
 
 from .common import Extractor, Message
 from .. import text, exception
+import re
 
 BASE_PATTERN = r"(?:https://)?(?:www\.|m\.)?vk\.com"
 
@@ -24,6 +25,7 @@ class VkExtractor(Extractor):
     request_interval = (0.5, 1.5)
 
     def items(self):
+        sub = re.compile(r"/imp[fg]/").sub
         sizes = "wzyxrqpo"
 
         data = self.metadata()
@@ -40,11 +42,15 @@ class VkExtractor(Extractor):
                 continue
 
             try:
-                photo["url"] = photo[size + "src"]
+                url = photo[size + "src"]
             except KeyError:
                 self.log.warning("no photo URL found (%s)", photo.get("id"))
                 continue
 
+            photo["url"] = sub("/", url.partition("?")[0])
+            # photo["url"] = url
+            photo["_fallback"] = (url,)
+
             try:
                 _, photo["width"], photo["height"] = photo[size]
             except ValueError:
diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py
index 3f2f410..949c7cb 100644
--- a/gallery_dl/extractor/webtoons.py
+++ b/gallery_dl/extractor/webtoons.py
@@ -87,23 +87,41 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
         self.episode_no = params.get("episode_no")
 
     def metadata(self, page):
-        keywords, pos = text.extract(
-            page, '<meta name="keywords" content="', '"')
-        title, pos = text.extract(
-            page, '<meta property="og:title" content="', '"', pos)
-        descr, pos = text.extract(
-            page, '<meta property="og:description" content="', '"', pos)
+        extr = text.extract_from(page)
+        title = extr('<meta property="og:title" content="', '"')
+        descr = extr('<meta property="og:description" content="', '"')
+
+        if extr('<div class="subj_info"', '\n'):
+            comic_name = extr('>', '<')
+            episode_name = extr('<h1 class="subj_episode" title="', '"')
+        else:
+            comic_name = episode_name = ""
+
+        if extr('<span class="tx _btnOpenEpisodeList ', '"'):
+            episode = extr('>#', '<')
+        else:
+            episode = ""
+
+        if extr('<div class="author_area"', '\n'):
+            username = extr('/creator/', '"')
+            author_name = extr('<span>', '</span>')
+        else:
+            username = author_name = ""
 
         return {
-            "genre"      : self.genre,
-            "comic"      : self.comic,
-            "title_no"   : self.title_no,
-            "episode_no" : self.episode_no,
-            "title"      : text.unescape(title),
-            "episode"    : keywords.split(", ")[1],
-            "description": text.unescape(descr),
-            "lang"       : self.lang,
-            "language"   : util.code_to_language(self.lang),
+            "genre"       : self.genre,
+            "comic"       : self.comic,
+            "title_no"    : self.title_no,
+            "episode_no"  : self.episode_no,
+            "title"       : text.unescape(title),
+            "episode"     : episode,
+            "comic_name"  : text.unescape(comic_name),
+            "episode_name": text.unescape(episode_name),
+            "username"    : username,
+            "author_name" : text.unescape(author_name),
+            "description" : text.unescape(descr),
+            "lang"        : self.lang,
+            "language"    : util.code_to_language(self.lang),
         }
 
     @staticmethod
diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py
new file mode 100644
index 0000000..1eafc29
--- /dev/null
+++ b/gallery_dl/extractor/wikimedia.py
@@ -0,0 +1,181 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 Ailothaen
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for Wikimedia sites"""
+
+from .common import BaseExtractor, Message
+from .. import text
+
+
+class WikimediaExtractor(BaseExtractor):
+    """Base class for wikimedia extractors"""
+    basecategory = "wikimedia"
+    filename_fmt = "{filename} ({sha1[:8]}).{extension}"
+    directory_fmt = ("{category}", "{page}")
+    archive_fmt = "{sha1}"
+    request_interval = (1.0, 2.0)
+
+    def __init__(self, match):
+        BaseExtractor.__init__(self, match)
+        path = match.group(match.lastindex)
+
+        if self.category == "fandom":
+            self.category = \
+                "fandom-" + self.root.partition(".")[0].rpartition("/")[2]
+
+        if path.startswith("wiki/"):
+            path = path[5:]
+            self.api_path = "/w/api.php"
+        else:
+            self.api_path = "/api.php"
+
+        pre, sep, _ = path.partition(":")
+        prefix = pre.lower() if sep else None
+
+        self.title = path = text.unquote(path)
+        if prefix:
+            self.subcategory = prefix
+
+        if prefix == "category":
+            self.params = {
+                "generator": "categorymembers",
+                "gcmtitle" : path,
+                "gcmtype"  : "file",
+            }
+        elif prefix == "file":
+            self.params = {
+                "titles"   : path,
+            }
+        else:
+            self.params = {
+                "generator": "images",
+                "titles"   : path,
+            }
+
+    def _init(self):
+        api_path = self.config_instance("api-path")
+        if api_path:
+            if api_path[0] == "/":
+                self.api_url = self.root + api_path
+            else:
+                self.api_url = api_path
+        else:
+            self.api_url = self.root + self.api_path
+
+    def items(self):
+        for info in self._pagination(self.params):
+            image = info["imageinfo"][0]
+
+            image["metadata"] = {
+                m["name"]: m["value"]
+                for m in image["metadata"]}
+            image["commonmetadata"] = {
+                m["name"]: m["value"]
+                for m in image["commonmetadata"]}
+
+            filename = image["canonicaltitle"]
+            image["filename"], _, image["extension"] = \
+                filename.partition(":")[2].rpartition(".")
+            image["date"] = text.parse_datetime(
+                image["timestamp"], "%Y-%m-%dT%H:%M:%SZ")
+            image["page"] = self.title
+
+            yield Message.Directory, image
+            yield Message.Url, image["url"], image
+
+    def _pagination(self, params):
+        """
+        https://www.mediawiki.org/wiki/API:Query
+        https://opendata.stackexchange.com/questions/13381
+        """
+
+        url = self.api_url
+        params["action"] = "query"
+        params["format"] = "json"
+        params["prop"] = "imageinfo"
+        params["iiprop"] = (
+            "timestamp|user|userid|comment|canonicaltitle|url|size|"
+            "sha1|mime|metadata|commonmetadata|extmetadata|bitdepth"
+        )
+
+        while True:
+            data = self.request(url, params=params).json()
+
+            try:
+                pages = data["query"]["pages"]
+            except KeyError:
+                pass
+            else:
+                yield from pages.values()
+
+            try:
+                continuation = data["continue"]
+            except KeyError:
+                break
+            params.update(continuation)
+
+
+BASE_PATTERN = WikimediaExtractor.update({
+    "wikipedia": {
+        "root": None,
+        "pattern": r"[a-z]{2,}\.wikipedia\.org",
+    },
+    "wiktionary": {
+        "root": None,
+        "pattern": r"[a-z]{2,}\.wiktionary\.org",
+    },
+    "wikiquote": {
+        "root": None,
+        "pattern": r"[a-z]{2,}\.wikiquote\.org",
+    },
+    "wikibooks": {
+        "root": None,
+        "pattern": r"[a-z]{2,}\.wikibooks\.org",
+    },
+    "wikisource": {
+        "root": None,
+        "pattern": r"[a-z]{2,}\.wikisource\.org",
+    },
+    "wikinews": {
+        "root": None,
+        "pattern": r"[a-z]{2,}\.wikinews\.org",
+    },
+    "wikiversity": {
+        "root": None,
+        "pattern": r"[a-z]{2,}\.wikiversity\.org",
+    },
+    "wikispecies": {
+        "root": "https://species.wikimedia.org",
+        "pattern": r"species\.wikimedia\.org",
+    },
+    "wikimediacommons": {
+        "root": "https://commons.wikimedia.org",
+        "pattern": r"commons\.wikimedia\.org",
+    },
+    "mediawiki": {
+        "root": "https://www.mediawiki.org",
+        "pattern": r"(?:www\.)?mediawiki\.org",
+    },
+    "fandom": {
+        "root": None,
+        "pattern": r"[\w-]+\.fandom\.com",
+        "api-path": "/api.php",
+    },
+    "mariowiki": {
+        "root": "https://www.mariowiki.com",
+        "pattern": r"(?:www\.)?mariowiki\.com",
+    },
+})
+
+
+class WikimediaArticleExtractor(WikimediaExtractor):
+    """Extractor for wikimedia articles"""
+    subcategory = "article"
+    pattern = BASE_PATTERN + r"/(?!static/)([^?#]+)"
+    example = "https://en.wikipedia.org/wiki/TITLE"
