diff options
Diffstat (limited to 'gallery_dl/extractor')
| -rw-r--r-- | gallery_dl/extractor/common.py | 3 |
| -rw-r--r-- | gallery_dl/extractor/gelbooru.py | 43 |
| -rw-r--r-- | gallery_dl/extractor/gelbooru_v02.py | 18 |
| -rw-r--r-- | gallery_dl/extractor/hitomi.py | 35 |
| -rw-r--r-- | gallery_dl/extractor/mangadex.py | 17 |
| -rw-r--r-- | gallery_dl/extractor/newgrounds.py | 64 |
| -rw-r--r-- | gallery_dl/extractor/patreon.py | 1 |
| -rw-r--r-- | gallery_dl/extractor/wordpress.py | 41 |
8 files changed, 156 insertions, 66 deletions
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index c440aee..afe4a16 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -336,7 +336,8 @@ class Extractor(): now = time.time() for cookie in self._cookiejar: - if cookie.name in names and cookie.domain == domain: + if cookie.name in names and ( + not domain or cookie.domain == domain): if cookie.expires and cookie.expires < now: self.log.warning("Cookie '%s' has expired", cookie.name) else: diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index df45d0d..a6bda52 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -10,7 +10,7 @@ from .common import Extractor, Message from . import gelbooru_v02 -from .. import text, exception +from .. import text, util, exception import binascii @@ -20,6 +20,42 @@ class GelbooruBase(): basecategory = "booru" root = "https://gelbooru.com" + def _api_request(self, params): + url = self.root + "/index.php?page=dapi&s=post&q=index&json=1" + data = self.request(url, params=params).json() + if "post" not in data: + return () + posts = data["post"] + if not isinstance(posts, list): + return (posts,) + return posts + + def _pagination(self, params): + params["pid"] = self.page_start + params["limit"] = self.per_page + + post = None + while True: + try: + posts = self._api_request(params) + except ValueError: + if "tags" not in params or post is None: + raise + taglist = [tag for tag in params["tags"].split() + if not tag.startswith("id:<")] + taglist.append("id:<" + str(post.attrib["id"])) + params["tags"] = " ".join(taglist) + params["pid"] = 0 + continue + + post = None + for post in posts: + yield post + + if len(posts) < self.per_page: + return + params["pid"] += 1 + @staticmethod def _file_url(post): url = post["file_url"] @@ -82,6 +118,11 @@ class GelbooruPoolExtractor(GelbooruBase, "pool_name": text.unescape(name), } + def posts(self): + params = {} + for 
params["id"] in util.advance(self.post_ids, self.page_start): + yield from self._api_request(params) + class GelbooruPostExtractor(GelbooruBase, gelbooru_v02.GelbooruV02PostExtractor): diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index a42a202..8da0bde 100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -19,8 +19,15 @@ import re class GelbooruV02Extractor(booru.BooruExtractor): basecategory = "gelbooru_v02" + def __init__(self, match): + booru.BooruExtractor.__init__(self, match) + try: + self.api_root = INSTANCES[self.category]["api_root"] + except KeyError: + self.api_root = self.root + def _api_request(self, params): - url = self.root + "/index.php?page=dapi&s=post&q=index" + url = self.api_root + "/index.php?page=dapi&s=post&q=index" return ElementTree.fromstring(self.request(url, params=params).text) def _pagination(self, params): @@ -97,12 +104,15 @@ class GelbooruV02Extractor(booru.BooruExtractor): post["notes"] = notes -BASE_PATTERN = GelbooruV02Extractor.update({ +INSTANCES = { "realbooru": {"root": "https://realbooru.com"}, - "rule34" : {"root": "https://rule34.xxx"}, + "rule34" : {"root": "https://rule34.xxx", + "api_root": " https://api.rule34.xxx"}, "safebooru": {"root": "https://safebooru.org"}, "tbib" : {"root": "https://tbib.org"}, -}) +} + +BASE_PATTERN = GelbooruV02Extractor.update(INSTANCES) class GelbooruV02TagExtractor(GelbooruV02Extractor): diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index 88cf98c..ce6c7ce 100644 --- a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -26,7 +26,7 @@ class HitomiGalleryExtractor(GalleryExtractor): r"/(?:[^/?#]+-)?(\d+)") test = ( ("https://hitomi.la/galleries/867789.html", { - "pattern": r"https://[a-c]b.hitomi.la/images/1639745412/\d+" + "pattern": r"https://[a-c]b.hitomi.la/images/1641140516/\d+" r"/[0-9a-f]{64}\.jpg", "keyword": 
"4873ef9a523621fc857b114e0b2820ba4066e9ae", "options": (("metadata", True),), @@ -39,12 +39,12 @@ class HitomiGalleryExtractor(GalleryExtractor): }), # Game CG with scenes (#321) ("https://hitomi.la/galleries/733697.html", { - "url": "479d16fe92117a6a2ce81b4e702e6347922c81e3", + "url": "d4854175da2b5fa4ae62749266c7be0bf237dc99", "count": 210, }), # fallback for galleries only available through /reader/ URLs ("https://hitomi.la/galleries/1045954.html", { - "url": "ebc1415c5d7f634166ef7e2635b77735de1ea7a2", + "url": "eea99c3745719a7a392150335e6ae3f73faa0b85", "count": 1413, }), # gallery with "broken" redirect @@ -138,7 +138,7 @@ class HitomiGalleryExtractor(GalleryExtractor): def images(self, _): # see https://ltn.hitomi.la/gg.js - gg_m, gg_b = _parse_gg(self) + gg_m, gg_b, gg_default = _parse_gg(self) result = [] for image in self.info["files"]: @@ -148,7 +148,7 @@ class HitomiGalleryExtractor(GalleryExtractor): # see https://ltn.hitomi.la/common.js inum = int(ihash[-1] + ihash[-3:-1], 16) url = "https://{}b.hitomi.la/images/{}/{}/{}.{}".format( - chr(97 + gg_m.get(inum, 0)), + chr(97 + gg_m.get(inum, gg_default)), gg_b, inum, ihash, idata["extension"], ) result.append((url, idata)) @@ -195,10 +195,25 @@ class HitomiTagExtractor(Extractor): def _parse_gg(extr): page = extr.request("https://ltn.hitomi.la/gg.js").text - m = { - int(match.group(1)): int(match.group(2)) - for match in re.finditer(r"case (\d+): o = (\d+); break;", page) - } + m = {} + + keys = [] + for match in re.finditer( + r"case\s+(\d+):(?:\s*o\s*=\s*(\d+))?", page): + key, value = match.groups() + keys.append(int(key)) + + if value: + value = int(value) + for key in keys: + m[key] = value + keys.clear() + + for match in re.finditer( + r"if\s+\(g\s*===?\s*(\d+)\)[\s{]*o\s*=\s*(\d+)", page): + m[int(match.group(1))] = int(match.group(2)) + + d = re.search(r"(?:var\s|default:)\s*o\s*=\s*(\d+)", page) b = re.search(r"b:\s*[\"'](.+)[\"']", page) - return m, b.group(1).strip("/") + return m, 
b.group(1).strip("/"), int(d.group(1)) if d else 1 diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index 393f4e2..ea5d4a8 100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -40,7 +40,7 @@ class MangadexExtractor(Extractor): uuid = chapter["id"] data = self._transform(chapter) data["_extractor"] = MangadexChapterExtractor - self._cache[uuid] = (chapter, data) + self._cache[uuid] = data yield Message.Queue, self.root + "/chapter/" + uuid, data def _transform(self, chapter): @@ -72,7 +72,7 @@ class MangadexExtractor(Extractor): "date" : text.parse_datetime(cattributes["publishAt"]), "lang" : lang, "language": util.code_to_language(lang), - "count" : len(cattributes["data"]), + "count" : cattributes["pages"], } data["artist"] = [artist["attributes"]["name"] @@ -107,20 +107,21 @@ class MangadexChapterExtractor(MangadexExtractor): def items(self): try: - chapter, data = self._cache.pop(self.uuid) + data = self._cache.pop(self.uuid) except KeyError: chapter = self.api.chapter(self.uuid) data = self._transform(chapter) - yield Message.Directory, data - cattributes = chapter["attributes"] + yield Message.Directory, data data["_http_headers"] = self._headers - base = "{}/data/{}/".format( - self.api.athome_server(self.uuid)["baseUrl"], cattributes["hash"]) + + server = self.api.athome_server(self.uuid) + chapter = server["chapter"] + base = "{}/data/{}/".format(server["baseUrl"], chapter["hash"]) enum = util.enumerate_reversed if self.config( "page-reverse") else enumerate - for data["page"], page in enum(cattributes["data"], 1): + for data["page"], page in enum(chapter["data"], 1): text.nameext_from_url(page, data) yield Message.Url, base + page, data diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 4351b3e..8bcbc20 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -38,6 +38,7 @@ class NewgroundsExtractor(Extractor): def 
items(self): self.login() + metadata = self.metadata() for post_url in self.posts(): try: @@ -48,6 +49,8 @@ class NewgroundsExtractor(Extractor): url = None if url: + if metadata: + post.update(metadata) yield Message.Directory, post yield Message.Url, url, text.nameext_from_url(url, post) @@ -62,9 +65,12 @@ class NewgroundsExtractor(Extractor): "Unable to get download URL for '%s'", post_url) def posts(self): - """Return urls of all relevant image pages""" + """Return URLs of all relevant post pages""" return self._pagination(self._path) + def metadata(self): + """Return general metadata""" + def login(self): username, password = self._get_auth_info() if username: @@ -493,3 +499,59 @@ class NewgroundsFollowingExtractor(NewgroundsFavoriteExtractor): text.ensure_http_scheme(user.rpartition('"')[2]) for user in text.extract_iter(page, 'class="item-user', '"><img') ] + + +class NewgroundsSearchExtractor(NewgroundsExtractor): + """Extractor for newgrounds.com search reesults""" + subcategory = "search" + directory_fmt = ("{category}", "search", "{search_tags}") + pattern = (r"(?:https?://)?(?:www\.)?newgrounds\.com" + r"/search/conduct/([^/?#]+)/?\?([^#]+)") + test = ( + ("https://www.newgrounds.com/search/conduct/art?terms=tree", { + "pattern": NewgroundsImageExtractor.pattern, + "keyword": {"search_tags": "tree"}, + "range": "1-10", + "count": 10, + }), + ("https://www.newgrounds.com/search/conduct/movies?terms=tree", { + "pattern": r"https://uploads.ungrounded.net(/alternate)?/\d+/\d+", + "range": "1-10", + "count": 10, + }), + ("https://www.newgrounds.com/search/conduct/audio?advanced=1" + "&terms=tree+green+nature&match=tdtu&genre=5&suitabilities=e%2Cm"), + ) + + def __init__(self, match): + NewgroundsExtractor.__init__(self, match) + self._path, query = match.groups() + self.query = text.parse_query(query) + + def posts(self): + return self._pagination("/search/conduct/" + self._path, self.query) + + def metadata(self): + return {"search_tags": 
self.query.get("terms", "")} + + def _pagination(self, path, params): + url = self.root + path + headers = { + "Accept": "application/json, text/javascript, */*; q=0.01", + "X-Requested-With": "XMLHttpRequest", + "Referer": self.root, + } + params["inner"] = "1" + params["page"] = 1 + + while True: + data = self.request(url, params=params, headers=headers).json() + + post_url = None + for post_url in text.extract_iter(data["content"], 'href="', '"'): + if not post_url.startswith("/search/"): + yield post_url + + if post_url is None: + return + params["page"] += 1 diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index f8c80ef..a7e0ff1 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -20,6 +20,7 @@ class PatreonExtractor(Extractor): """Base class for patreon extractors""" category = "patreon" root = "https://www.patreon.com" + cookiedomain = ".patreon.com" directory_fmt = ("{category}", "{creator[full_name]}") filename_fmt = "{id}_{title}_{num:>02}.{extension}" archive_fmt = "{id}_{num}" diff --git a/gallery_dl/extractor/wordpress.py b/gallery_dl/extractor/wordpress.py deleted file mode 100644 index dd7d28a..0000000 --- a/gallery_dl/extractor/wordpress.py +++ /dev/null @@ -1,41 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2021 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for WordPress blogs""" - -from .common import BaseExtractor, Message -from .. 
import text - - -class WordpressExtractor(BaseExtractor): - """Base class for wordpress extractors""" - basecategory = "wordpress" - - def items(self): - for post in self.posts(): - yield Message.Difrectory, post - - - -BASE_PATTERN = WordpressExtractor.update({}) - - -class WordpressBlogExtractor(WordpressExtractor): - """Extractor for WordPress blogs""" - subcategory = "blog" - directory_fmt = ("{category}", "{blog}") - pattern = BASE_PATTERN + r"/?$" - - def posts(self): - url = self.root + "/wp-json/wp/v2/posts" - params = {"page": 1, "per_page": "100"} - - while True: - data = self.request(url, params=params).json() - exit() - yield 1 |
