Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r--   gallery_dl/extractor/__init__.py       2
-rw-r--r--   gallery_dl/extractor/bilibili.py      10
-rw-r--r--   gallery_dl/extractor/bluesky.py        4
-rw-r--r--   gallery_dl/extractor/cohost.py        30
-rw-r--r--   gallery_dl/extractor/common.py        25
-rw-r--r--   gallery_dl/extractor/cyberdrop.py     24
-rw-r--r--   gallery_dl/extractor/deviantart.py    26
-rw-r--r--   gallery_dl/extractor/facebook.py       3
-rw-r--r--   gallery_dl/extractor/instagram.py     10
-rw-r--r--   gallery_dl/extractor/itaku.py         46
-rw-r--r--   gallery_dl/extractor/kemonoparty.py    4
-rw-r--r--   gallery_dl/extractor/lofter.py       147
-rw-r--r--   gallery_dl/extractor/recursive.py      3
-rw-r--r--   gallery_dl/extractor/saint.py          2
-rw-r--r--   gallery_dl/extractor/tapas.py        124
-rw-r--r--   gallery_dl/extractor/yiffverse.py    157
-rw-r--r--   gallery_dl/extractor/zerochan.py      37
17 files changed, 557 insertions, 97 deletions
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 8d5f3d0..d003a61 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -98,6 +98,7 @@ modules = [
     "lexica",
     "lightroom",
     "livedoor",
+    "lofter",
     "luscious",
     "lynxchan",
     "mangadex",
@@ -195,6 +196,7 @@ modules = [
     "wikimedia",
     "xhamster",
     "xvideos",
+    "yiffverse",
     "zerochan",
     "zzup",
     "booru",
diff --git a/gallery_dl/extractor/bilibili.py b/gallery_dl/extractor/bilibili.py
index d5c419e..b9de165 100644
--- a/gallery_dl/extractor/bilibili.py
+++ b/gallery_dl/extractor/bilibili.py
@@ -23,7 +23,8 @@ class BilibiliExtractor(Extractor):
 class BilibiliUserArticlesExtractor(BilibiliExtractor):
     """Extractor for a bilibili user's articles"""
     subcategory = "user-articles"
-    pattern = r"(?:https?://)?space\.bilibili\.com/(\d+)/article"
+    pattern = (r"(?:https?://)?space\.bilibili\.com/(\d+)"
+               r"/(?:article|upload/opus)")
     example = "https://space.bilibili.com/12345/article"

     def items(self):
@@ -56,6 +57,13 @@ class BilibiliArticleExtractor(BilibiliExtractor):
         article["username"] = modules["module_author"]["name"]

         pics = []
+
+        if "module_top" in modules:
+            try:
+                pics.extend(modules["module_top"]["display"]["album"]["pics"])
+            except Exception:
+                pass
+
         for paragraph in modules['module_content']['paragraphs']:
             if "pic" not in paragraph:
                 continue
diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py
index f60ea15..f8fef93 100644
--- a/gallery_dl/extractor/bluesky.py
+++ b/gallery_dl/extractor/bluesky.py
@@ -204,6 +204,8 @@ class BlueskyUserExtractor(BlueskyExtractor):

     def items(self):
         base = "{}/profile/{}/".format(self.root, self.user)
+        default = ("posts" if self.config("quoted", False) or
+                   self.config("reposts", False) else "media")
         return self._dispatch_extractors((
             (BlueskyInfoExtractor      , base + "info"),
             (BlueskyAvatarExtractor    , base + "avatar"),
@@ -212,7 +214,7 @@ class BlueskyUserExtractor(BlueskyExtractor):
             (BlueskyRepliesExtractor   , base + "replies"),
             (BlueskyMediaExtractor     , base + "media"),
             (BlueskyLikesExtractor     , base + "likes"),
-        ), ("media",))
+        ), (default,))


 class BlueskyPostsExtractor(BlueskyExtractor):
diff --git a/gallery_dl/extractor/cohost.py b/gallery_dl/extractor/cohost.py
index 0524239..6a43224 100644
--- a/gallery_dl/extractor/cohost.py
+++ b/gallery_dl/extractor/cohost.py
@@ -19,7 +19,7 @@ class CohostExtractor(Extractor):
     category = "cohost"
     root = "https://cohost.org"
     directory_fmt = ("{category}", "{postingProject[handle]}")
-    filename_fmt = ("{postId}_{headline:?/_/[b:200]}{num}.{extension}")
+    filename_fmt = ("{postId}{headline:?_//[b:200]}{num:?_//}.{extension}")
     archive_fmt = "{postId}_{num}"

     def _init(self):
@@ -28,6 +28,14 @@ class CohostExtractor(Extractor):
         self.shares = self.config("shares", False)
         self.asks = self.config("asks", True)

+        self.avatar = self.config("avatar", False)
+        if self.avatar:
+            self._urls_avatar = {None, ""}
+
+        self.background = self.config("background", False)
+        if self.background:
+            self._urls_background = {None, ""}
+
     def items(self):
         for post in self.posts():
             reason = post.get("limitedVisibilityReason")
@@ -43,6 +51,26 @@ class CohostExtractor(Extractor):
                 post["publishedAt"], "%Y-%m-%dT%H:%M:%S.%fZ")

             yield Message.Directory, post
+
+            project = post["postingProject"]
+            if self.avatar:
+                url = project.get("avatarURL")
+                if url not in self._urls_avatar:
+                    self._urls_avatar.add(url)
+                    p = post.copy()
+                    p["postId"] = p["kind"] = "avatar"
+                    p["headline"] = p["num"] = ""
+                    yield Message.Url, url, text.nameext_from_url(url, p)
+
+            if self.background:
+                url = project.get("headerURL")
+                if url not in self._urls_background:
+                    self._urls_background.add(url)
+                    p = post.copy()
+                    p["postId"] = p["kind"] = "background"
+                    p["headline"] = p["num"] = ""
+                    yield Message.Url, url, text.nameext_from_url(url, p)
+
             for post["num"], file in enumerate(files, 1):
                 url = file["fileURL"]
                 post.update(file)
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 5f9d355..5ada030 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -42,8 +42,7 @@ class Extractor():
     ciphers = None
     tls12 = True
     browser = None
-    useragent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; "
-                 "rv:128.0) Gecko/20100101 Firefox/128.0")
+    useragent = util.USERAGENT_FIREFOX
     request_interval = 0.0
     request_interval_min = 0.0
     request_interval_429 = 60.0
@@ -172,8 +171,16 @@ class Extractor():
         while True:
             try:
                 response = session.request(method, url, **kwargs)
-            except (requests.exceptions.ConnectionError,
-                    requests.exceptions.Timeout,
+            except requests.exceptions.ConnectionError as exc:
+                code = 0
+                try:
+                    reason = exc.args[0].reason
+                    cls = reason.__class__.__name__
+                    pre, _, err = str(reason.args[-1]).partition(":")
+                    msg = " {}: {}".format(cls, (err or pre).lstrip())
+                except Exception:
+                    msg = exc
+            except (requests.exceptions.Timeout,
                     requests.exceptions.ChunkedEncodingError,
                     requests.exceptions.ContentDecodingError) as exc:
                 msg = exc
@@ -212,6 +219,11 @@ class Extractor():
                     if b'name="captcha-bypass"' in content:
                         self.log.warning("Cloudflare CAPTCHA")
                         break
+                elif server and server.startswith("ddos-guard") and \
+                        code == 403:
+                    if b"/ddos-guard/js-challenge/" in response.content:
+                        self.log.warning("DDoS-Guard challenge")
+                        break

                 if code == 429 and self._handle_429(response):
                     continue
@@ -909,10 +921,11 @@ def _browser_useragent():

     server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
     server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-    server.bind(("127.0.0.1", 6414))
+    server.bind(("127.0.0.1", 0))
     server.listen(1)

-    webbrowser.open("http://127.0.0.1:6414/user-agent")
+    host, port = server.getsockname()
+    webbrowser.open("http://{}:{}/user-agent".format(host, port))

     client = server.accept()[0]
     server.close()
diff --git a/gallery_dl/extractor/cyberdrop.py b/gallery_dl/extractor/cyberdrop.py
index a514696..e150829 100644
--- a/gallery_dl/extractor/cyberdrop.py
+++ b/gallery_dl/extractor/cyberdrop.py
@@ -10,12 +10,15 @@ from . import lolisafe
 from .common import Message
 from .. import text

+BASE_PATTERN = r"(?:https?://)?(?:www\.)?cyberdrop\.(?:me|to)"
+

 class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor):
+    """Extractor for cyberdrop albums"""
     category = "cyberdrop"
     root = "https://cyberdrop.me"
     root_api = "https://api.cyberdrop.me"
-    pattern = r"(?:https?://)?(?:www\.)?cyberdrop\.(?:me|to)/a/([^/?#]+)"
+    pattern = BASE_PATTERN + r"/a/([^/?#]+)"
     example = "https://cyberdrop.me/a/ID"

     def items(self):
@@ -40,7 +43,7 @@ class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor):
         extr('id="title"', "")

         album = {
-            "album_id"   : self.album_id,
+            "album_id"   : album_id,
             "album_name" : text.unescape(extr('title="', '"')),
             "album_size" : text.parse_bytes(extr(
                 '<p class="title">', "B")),
@@ -67,3 +70,20 @@ class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor):
                 continue

             yield file
+
+
+class CyberdropMediaExtractor(CyberdropAlbumExtractor):
+    """Extractor for cyberdrop media links"""
+    subcategory = "media"
+    directory_fmt = ("{category}",)
+    pattern = BASE_PATTERN + r"/f/([^/?#]+)"
+    example = "https://cyberdrop.me/f/ID"
+
+    def fetch_album(self, album_id):
+        return self._extract_files((album_id,)), {
+            "album_id"   : "",
+            "album_name" : "",
+            "album_size" : -1,
+            "description": "",
+            "count"      : 1,
+        }
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index ea3f13d..69934b4 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -451,6 +451,26 @@ class DeviantartExtractor(Extractor):
         elif type == "text":
             self._tiptap_process_text(html, content)

+        elif type == "heading":
+            attrs = content["attrs"]
+            level = str(attrs.get("level") or "3")
+
+            html.append("<h")
+            html.append(level)
+            html.append(' style="text-align:')
+            html.append(attrs.get("textAlign") or "left")
+            html.append('">')
+            html.append('<span style="margin-inline-start:0px">')
+
+            children = content.get("content")
+            if children:
+                for block in children:
+                    self._tiptap_process_content(html, block)
+
+            html.append("</span></h")
+            html.append(level)
+            html.append(">")
+
         elif type == "hardBreak":
             html.append("<br/><br/>")

@@ -478,8 +498,9 @@ class DeviantartExtractor(Extractor):
         for mark in marks:
             type = mark["type"]
             if type == "link":
+                attrs = mark.get("attrs") or {}
                 html.append('<a href="')
-                html.append(text.escape(mark["attrs"]["href"]))
+                html.append(text.escape(attrs.get("href") or ""))
                 html.append('" rel="noopener noreferrer nofollow ugc">')
                 close.append("</a>")
             elif type == "bold":
@@ -491,6 +512,9 @@ class DeviantartExtractor(Extractor):
             elif type == "underline":
                 html.append("<u>")
                 close.append("</u>")
+            elif type == "strike":
+                html.append("<s>")
+                close.append("</s>")
             elif type == "textStyle" and len(mark) <= 1:
                 pass
             else:
diff --git a/gallery_dl/extractor/facebook.py b/gallery_dl/extractor/facebook.py
index 04acfc5..2f3fdbf 100644
--- a/gallery_dl/extractor/facebook.py
+++ b/gallery_dl/extractor/facebook.py
@@ -40,7 +40,8 @@ class FacebookExtractor(Extractor):
     @staticmethod
     def decode_all(txt):
         return text.unescape(
-            txt.encode("utf-8").decode("unicode_escape")
+            txt.encode().decode("unicode_escape")
+            .encode("utf_16", "surrogatepass").decode("utf_16")
         ).replace("\\/", "/")

     @staticmethod
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index e6b6b14..8c5b180 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -163,21 +163,14 @@ class InstagramExtractor(Extractor):
                 "post_id": reel_id,
                 "post_shortcode": shortcode_from_id(reel_id),
             }
-
             if "title" in post:
                 data["highlight_title"] = post["title"]
-            if "created_at" in post:
-                data["post_date"] = data["date"] = text.parse_timestamp(
-                    post.get("created_at"))

         else:  # regular image/video post
-            date = text.parse_timestamp(post.get("taken_at"))
             data = {
                 "post_id" : post["pk"],
                 "post_shortcode": post["code"],
                 "post_url": "{}/p/{}/".format(self.root, post["code"]),
-                "post_date": date,
-                "date": date,
                 "likes": post.get("like_count", 0),
                 "pinned": post.get("timeline_pinned_user_ids", ()),
                 "liked": post.get("has_liked", False),
@@ -218,7 +211,8 @@ class InstagramExtractor(Extractor):
             data["owner_id"] = owner["pk"]
             data["username"] = owner.get("username")
             data["fullname"] = owner.get("full_name")
-
+            data["post_date"] = data["date"] = text.parse_timestamp(
+                post.get("taken_at") or post.get("created_at") or post.get("seen"))

             data["_files"] = files = []
             for num, item in enumerate(items, 1):
diff --git a/gallery_dl/extractor/itaku.py b/gallery_dl/extractor/itaku.py
index 1aef66e..7f941bb 100644
--- a/gallery_dl/extractor/itaku.py
+++ b/gallery_dl/extractor/itaku.py
@@ -78,6 +78,16 @@ class ItakuImageExtractor(ItakuExtractor):
         return (self.api.image(self.item),)


+class ItakuSearchExtractor(ItakuExtractor):
+    subcategory = "search"
+    pattern = BASE_PATTERN + r"/home/images/?\?([^#]+)"
+    example = "https://itaku.ee/home/images?tags=SEARCH"
+
+    def posts(self):
+        params = text.parse_query_list(self.item)
+        return self.api.search_images(params)
+
+
 class ItakuAPI():

     def __init__(self, extractor):
@@ -87,6 +97,42 @@ class ItakuAPI():
             "Accept": "application/json, text/plain, */*",
         }

+    def search_images(self, params):
+        endpoint = "/galleries/images/"
+        required_tags = []
+        negative_tags = []
+        optional_tags = []
+
+        tags = params.pop("tags", None)
+        if not tags:
+            tags = ()
+        elif isinstance(tags, str):
+            tags = (tags,)
+
+        for tag in tags:
+            if not tag:
+                pass
+            elif tag[0] == "-":
+                negative_tags.append(tag[1:])
+            elif tag[0] == "~":
+                optional_tags.append(tag[1:])
+            else:
+                required_tags.append(tag)
+
+        api_params = {
+            "required_tags": required_tags,
+            "negative_tags": negative_tags,
+            "optional_tags": optional_tags,
+            "date_range": "",
+            "maturity_rating": ("SFW", "Questionable", "NSFW"),
+            "ordering"  : "-date_added",
+            "page"      : "1",
+            "page_size" : "30",
+            "visibility": ("PUBLIC", "PROFILE_ONLY"),
+        }
+        api_params.update(params)
+        return self._pagination(endpoint, api_params, self.image)
+
     def galleries_images(self, username, section=None):
         endpoint = "/galleries/images/"
         params = {
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index 16c5b99..a7caca9 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -433,8 +433,8 @@ class KemonopartyDiscordServerExtractor(KemonopartyExtractor):
 class KemonopartyFavoriteExtractor(KemonopartyExtractor):
     """Extractor for kemono.su favorites"""
     subcategory = "favorite"
-    pattern = BASE_PATTERN + r"/favorites()()(?:/?\?([^#]+))?"
-    example = "https://kemono.su/favorites"
+    pattern = BASE_PATTERN + r"/(?:account/)?favorites()()(?:/?\?([^#]+))?"
+    example = "https://kemono.su/account/favorites/artists"

     def items(self):
         self._prepare_ddosguard_cookies()
diff --git a/gallery_dl/extractor/lofter.py b/gallery_dl/extractor/lofter.py
new file mode 100644
index 0000000..412b6b9
--- /dev/null
+++ b/gallery_dl/extractor/lofter.py
@@ -0,0 +1,147 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.lofter.com/"""
+
+from .common import Extractor, Message
+from .. import text, util, exception
+
+
+class LofterExtractor(Extractor):
+    """Base class for lofter extractors"""
+    category = "lofter"
+    root = "https://www.lofter.com"
+    directory_fmt = ("{category}", "{blog_name}")
+    filename_fmt = "{id}_{num}.{extension}"
+    archive_fmt = "{id}_{num}"
+
+    def _init(self):
+        self.api = LofterAPI(self)
+
+    def items(self):
+        for post in self.posts():
+            if "post" in post:
+                post = post["post"]
+
+            post["blog_name"] = post["blogInfo"]["blogName"]
+            post["date"] = text.parse_timestamp(post["publishTime"] // 1000)
+            post_type = post["type"]
+
+            # Article
+            if post_type == 1:
+                content = post["content"]
+                image_urls = text.extract_iter(content, '<img src="', '"')
+                image_urls = [text.unescape(x) for x in image_urls]
+                image_urls = [x.partition("?")[0] for x in image_urls]
+
+            # Photo
+            elif post_type == 2:
+                photo_links = util.json_loads(post["photoLinks"])
+                image_urls = [x["orign"] for x in photo_links]
+                image_urls = [x.partition("?")[0] for x in image_urls]
+
+            # Video
+            elif post_type == 4:
+                embed = util.json_loads(post["embed"])
+                image_urls = [embed["originUrl"]]
+
+            # Answer
+            elif post_type == 5:
+                images = util.json_loads(post["images"])
+                image_urls = [x["orign"] for x in images]
+                image_urls = [x.partition("?")[0] for x in image_urls]
+
+            else:
+                image_urls = ()
+                self.log.warning(
+                    "%s: Unsupported post type '%s'.",
+                    post["id"], post_type)
+
+            post["count"] = len(image_urls)
+            yield Message.Directory, post
+            for post["num"], url in enumerate(image_urls, 1):
+                yield Message.Url, url, text.nameext_from_url(url, post)
+
+    def posts(self):
+        return ()
+
+
+class LofterPostExtractor(LofterExtractor):
+    """Extractor for a lofter post"""
+    subcategory = "post"
+    pattern = r"(?:https?://)?[\w-]+\.lofter\.com/post/([0-9a-f]+)_([0-9a-f]+)"
+    example = "https://BLOG.lofter.com/post/12345678_90abcdef"
+
+    def posts(self):
+        blog_id, post_id = self.groups
+        post = self.api.post(int(blog_id, 16), int(post_id, 16))
+        return (post,)
+
+
+class LofterBlogPostsExtractor(LofterExtractor):
+    """Extractor for a lofter blog's posts"""
+    subcategory = "blog-posts"
+    pattern = (r"(?:https?://)?(?:"
+               # https://www.lofter.com/front/blog/home-page/<blog_name>
+               r"www\.lofter\.com/front/blog/home-page/([\w-]+)|"
+               # https://<blog_name>.lofter.com/
+               r"([\w-]+)\.lofter\.com"
+               r")/?(?:$|\?|#)")
+    example = "https://BLOG.lofter.com/"
+
+    def posts(self):
+        blog_name = self.groups[0] or self.groups[1]
+        return self.api.blog_posts(blog_name)
+
+
+class LofterAPI():
+
+    def __init__(self, extractor):
+        self.extractor = extractor
+
+    def blog_posts(self, blog_name):
+        endpoint = "/v2.0/blogHomePage.api"
+        params = {
+            "method": "getPostLists",
+            "offset": 0,
+            "limit": 200,
+            "blogdomain": blog_name + ".lofter.com",
+        }
+        return self._pagination(endpoint, params)
+
+    def post(self, blog_id, post_id):
+        endpoint = "/oldapi/post/detail.api"
+        params = {
+            "targetblogid": blog_id,
+            "postid": post_id,
+        }
+        return self._call(endpoint, params)["posts"][0]
+
+    def _call(self, endpoint, data):
+        url = "https://api.lofter.com" + endpoint
+        params = {
+            'product': 'lofter-android-7.9.10'
+        }
+        response = self.extractor.request(
+            url, method="POST", params=params, data=data)
+        info = response.json()
+
+        if info["meta"]["status"] != 200:
+            self.extractor.log.debug("Server response: %s", info)
+            raise exception.StopExtraction("API request failed")
+
+        return info["response"]
+
+    def _pagination(self, endpoint, params):
+        while True:
+            data = self._call(endpoint, params)
+            posts = data["posts"]
+
+            yield from posts
+
+            if params["offset"] + len(posts) < data["offset"]:
+                break
+            params["offset"] = data["offset"]
diff --git a/gallery_dl/extractor/recursive.py b/gallery_dl/extractor/recursive.py
index 4156484..1883bbc 100644
--- a/gallery_dl/extractor/recursive.py
+++ b/gallery_dl/extractor/recursive.py
@@ -9,6 +9,7 @@
 """Recursive extractor"""

 from .common import Extractor, Message
+from .. import text
 import re


@@ -25,7 +26,7 @@ class RecursiveExtractor(Extractor):
             with open(url[7:]) as fp:
                 page = fp.read()
         else:
-            page = self.request(url).text
+            page = self.request(text.ensure_http_scheme(url)).text

         for match in re.finditer(r"https?://[^\s\"']+", page):
             yield Message.Queue, match.group(0), {}
diff --git a/gallery_dl/extractor/saint.py b/gallery_dl/extractor/saint.py
index 784cdc0..1c62d75 100644
--- a/gallery_dl/extractor/saint.py
+++ b/gallery_dl/extractor/saint.py
@@ -11,7 +11,7 @@ from .lolisafe import LolisafeAlbumExtractor
 from .. import text

-BASE_PATTERN = r"(?:https?://)?saint\d*\.(?:su|pk|to)"
+BASE_PATTERN = r"(?:https?://)?saint\d*\.(?:su|pk|cr|to)"


 class SaintAlbumExtractor(LolisafeAlbumExtractor):
     """Extractor for saint albums"""
diff --git a/gallery_dl/extractor/tapas.py b/gallery_dl/extractor/tapas.py
index 167953d..e756385 100644
--- a/gallery_dl/extractor/tapas.py
+++ b/gallery_dl/extractor/tapas.py
@@ -30,44 +30,6 @@ class TapasExtractor(Extractor):
         if self._cache is None:
             TapasExtractor._cache = {}

-    def items(self):
-        self.login()
-        headers = {"Accept": "application/json, text/javascript, */*;"}
-
-        for episode_id in self.episode_ids():
-            url = "{}/episode/{}".format(self.root, episode_id)
-            data = self.request(url, headers=headers).json()["data"]
-
-            episode = data["episode"]
-            if not episode.get("free") and not episode.get("unlocked"):
-                raise exception.StopExtraction(
-                    "Episode '%s' not unlocked (ID %s) ",
-                    episode["title"], episode_id)
-
-            html = data["html"]
-            series_id = text.rextract(html, 'data-series-id="', '"')[0]
-            try:
-                episode["series"] = self._cache[series_id]
-            except KeyError:
-                url = "{}/series/{}".format(self.root, series_id)
-                episode["series"] = self._cache[series_id] = self.request(
-                    url, headers=headers).json()["data"]
-
-            episode["date"] = text.parse_datetime(episode["publish_date"])
-            yield Message.Directory, episode
-
-            if episode["book"]:
-                content, _ = text.extract(
-                    html, '<div class="viewer">', '<div class="viewer-bottom')
-                episode["num"] = 1
-                episode["extension"] = "html"
-                yield Message.Url, "text:" + content, episode
-
-            else:  # comic
-                for episode["num"], url in enumerate(text.extract_iter(
-                        html, 'data-src="', '"'), 1):
-                    yield Message.Url, url, text.nameext_from_url(url, episode)
-
     def login(self):
         if self.cookies_check(self.cookies_names):
             return
@@ -103,24 +65,70 @@ class TapasExtractor(Extractor):
         return {"_cpc_": response.history[0].cookies.get("_cpc_")}

+    def request_api(self, url, params=None):
+        headers = {"Accept": "application/json, text/javascript, */*;"}
+        return self.request(url, params=params, headers=headers).json()["data"]
+

+class TapasEpisodeExtractor(TapasExtractor):
+    subcategory = "episode"
+    pattern = BASE_PATTERN + r"/episode/(\d+)"
+    example = "https://tapas.io/episode/12345"
+
+    def items(self):
+        self.login()
+
+        episode_id = self.groups[0]
+        url = "{}/episode/{}".format(self.root, episode_id)
+        data = self.request_api(url)
+
+        episode = data["episode"]
+        if not episode.get("free") and not episode.get("unlocked"):
+            raise exception.AuthorizationError(
+                "%s: Episode '%s' not unlocked",
+                episode_id, episode["title"])
+
+        html = data["html"]
+        episode["series"] = self._extract_series(html)
+        episode["date"] = text.parse_datetime(episode["publish_date"])
+        yield Message.Directory, episode
+
+        if episode["book"]:
+            content = text.extr(
+                html, '<div class="viewer">', '<div class="viewer-bottom')
+            episode["num"] = 1
+            episode["extension"] = "html"
+            yield Message.Url, "text:" + content, episode
+
+        else:  # comic
+            for episode["num"], url in enumerate(text.extract_iter(
+                    html, 'data-src="', '"'), 1):
+                yield Message.Url, url, text.nameext_from_url(url, episode)
+
+    def _extract_series(self, html):
+        series_id = text.rextract(html, 'data-series-id="', '"')[0]
+        try:
+            return self._cache[series_id]
+        except KeyError:
+            url = "{}/series/{}".format(self.root, series_id)
+            series = self._cache[series_id] = self.request_api(url)
+            return series
+
+
 class TapasSeriesExtractor(TapasExtractor):
     subcategory = "series"
     pattern = BASE_PATTERN + r"/series/([^/?#]+)"
     example = "https://tapas.io/series/TITLE"

-    def __init__(self, match):
-        TapasExtractor.__init__(self, match)
-        self.series_name = match.group(1)
+    def items(self):
+        self.login()

-    def episode_ids(self):
-        url = "{}/series/{}".format(self.root, self.series_name)
-        series_id, _, episode_id = text.extract(
+        url = "{}/series/{}".format(self.root, self.groups[0])
+        series_id, _, episode_id = text.extr(
             self.request(url).text,
             'content="tapastic://series/', '"',
-        )[0].partition("/episodes/")
+        ).partition("/episodes/")

         url = "{}/series/{}/episodes".format(self.root, series_id)
-        headers = {"Accept": "application/json, text/javascript, */*;"}
         params = {
             "eid"       : episode_id,
             "page"      : 1,
@@ -129,36 +137,26 @@ class TapasSeriesExtractor(TapasExtractor):
             "max_limit" : "20",
         }

+        base = self.root + "/episode/"
         while True:
-            data = self.request(
-                url, params=params, headers=headers).json()["data"]
-            yield from text.extract_iter(
-                data["body"], 'data-href="/episode/', '"')
+            data = self.request_api(url, params)
+            for episode in data["episodes"]:
+                episode["_extractor"] = TapasEpisodeExtractor
+                yield Message.Queue, base + str(episode["id"]), episode

             if not data["pagination"]["has_next"]:
                 return
             params["page"] += 1


-class TapasEpisodeExtractor(TapasExtractor):
-    subcategory = "episode"
-    pattern = BASE_PATTERN + r"/episode/(\d+)"
-    example = "https://tapas.io/episode/12345"
-
-    def __init__(self, match):
-        TapasExtractor.__init__(self, match)
-        self.episode_id = match.group(1)
-
-    def episode_ids(self):
-        return (self.episode_id,)
-
-
 class TapasCreatorExtractor(TapasExtractor):
     subcategory = "creator"
     pattern = BASE_PATTERN + r"/(?!series|episode)([^/?#]+)"
     example = "https://tapas.io/CREATOR"

     def items(self):
+        self.login()
+
         url = "{}/{}/series".format(self.root, self.groups[0])
         page = self.request(url).text
         page = text.extr(page, '<ul class="content-list-wrap', "</ul>")
diff --git a/gallery_dl/extractor/yiffverse.py b/gallery_dl/extractor/yiffverse.py
new file mode 100644
index 0000000..2b14341
--- /dev/null
+++ b/gallery_dl/extractor/yiffverse.py
@@ -0,0 +1,157 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://yiffverse.com/"""
+
+from .booru import BooruExtractor
+from .. import text
+import collections
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?yiffverse\.com"
+
+
+class YiffverseExtractor(BooruExtractor):
+    category = "yiffverse"
+    root = "https://yiffverse.com"
+    root_cdn = "https://furry34com.b-cdn.net"
+    filename_fmt = "{category}_{id}.{extension}"
+    per_page = 30
+
+    TAG_TYPES = {
+        None: "general",
+        1   : "general",
+        2   : "copyright",
+        4   : "character",
+        8   : "artist",
+    }
+    FORMATS = (
+        ("100", "mov.mp4"),
+        ("101", "mov720.mp4"),
+        ("102", "mov480.mp4"),
+        ("10" , "pic.jpg"),
+    )
+
+    def _file_url(self, post):
+        files = post["files"]
+        for fmt, extension in self.FORMATS:
+            if fmt in files:
+                break
+        else:
+            fmt = next(iter(files))
+
+        post_id = post["id"]
+        root = self.root_cdn if files[fmt][0] else self.root
+        post["file_url"] = url = "{}/posts/{}/{}/{}.{}".format(
+            root, post_id // 1000, post_id, post_id, extension)
+        post["format_id"] = fmt
+        post["format"] = extension.partition(".")[0]
+
+        return url
+
+    def _prepare(self, post):
+        post.pop("files", None)
+        post["date"] = text.parse_datetime(
+            post["created"], "%Y-%m-%dT%H:%M:%S.%fZ")
+        post["filename"], _, post["format"] = post["filename"].rpartition(".")
+        if "tags" in post:
+            post["tags"] = [t["value"] for t in post["tags"]]
+
+    def _tags(self, post, _):
+        if "tags" not in post:
+            post.update(self._fetch_post(post["id"]))
+
+        tags = collections.defaultdict(list)
+        for tag in post["tags"]:
+            tags[tag["type"]].append(tag["value"])
+        types = self.TAG_TYPES
+        for type, values in tags.items():
+            post["tags_" + types[type]] = values
+
+    def _fetch_post(self, post_id):
+        url = "{}/api/v2/post/{}".format(self.root, post_id)
+        return self.request(url).json()
+
+    def _pagination(self, endpoint, params=None):
+        url = "{}/api{}".format(self.root, endpoint)
+
+        if params is None:
+            params = {}
+        params["sortOrder"] = 1
+        params["status"] = 2
+        params["take"] = self.per_page
+        threshold = self.per_page
+
+        while True:
+            data = self.request(url, method="POST", json=params).json()
+
+            yield from data["items"]
+
+            if len(data["items"]) < threshold:
+                return
+            params["cursor"] = data.get("cursor")
+
+
+class YiffversePostExtractor(YiffverseExtractor):
+    subcategory = "post"
+    archive_fmt = "{id}"
+    pattern = BASE_PATTERN + r"/post/(\d+)"
+    example = "https://yiffverse.com/post/12345"
+
+    def posts(self):
+        return (self._fetch_post(self.groups[0]),)
+
+
+class YiffversePlaylistExtractor(YiffverseExtractor):
+    subcategory = "playlist"
+    directory_fmt = ("{category}", "{playlist_id}")
+    archive_fmt = "p_{playlist_id}_{id}"
+    pattern = BASE_PATTERN + r"/playlist/(\d+)"
+    example = "https://yiffverse.com/playlist/12345"
+
+    def metadata(self):
+        return {"playlist_id": self.groups[0]}
+
+    def posts(self):
+        endpoint = "/v2/post/search/playlist/" + self.groups[0]
+        return self._pagination(endpoint)
+
+
+class YiffverseTagExtractor(YiffverseExtractor):
+    subcategory = "tag"
+    directory_fmt = ("{category}", "{search_tags}")
+    archive_fmt = "t_{search_tags}_{id}"
+    pattern = BASE_PATTERN + r"/(?:tag/([^/?#]+))?(?:/?\?([^#]+))?(?:$|#)"
+    example = "https://yiffverse.com/tag/TAG"
+
+    def _init(self):
+        tag, query = self.groups
+        params = text.parse_query(query)
+
+        self.tags = tags = []
+        if tag:
+            tags.append(text.unquote(tag))
+        if "tags" in params:
+            tags.extend(params["tags"].split("|"))
+
+        type = params.get("type")
+        if type == "video":
+            self.type = 1
+        elif type == "image":
+            self.type = 0
+        else:
+            self.type = None
+
+    def metadata(self):
+        return {"search_tags": " ".join(self.tags)}
+
+    def posts(self):
+        endpoint = "/v2/post/search/root"
+        params = {"includeTags": [t.replace("_", " ") for t in self.tags]}
+        if self.type is not None:
+            params["type"] = self.type
+        return self._pagination(endpoint, params)
diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py
index 4c4fb3a..bc135ad 100644
--- a/gallery_dl/extractor/zerochan.py
+++ b/gallery_dl/extractor/zerochan.py
@@ -78,8 +78,8 @@ class ZerochanExtractor(BooruExtractor):
                 'class="breadcrumbs', '</nav>'))[2:],
             "uploader": extr('href="/user/', '"'),
             "tags"    : extr('<ul id="tags"', '</ul>'),
-            "source"  : text.unescape(text.extr(
-                extr('id="source-url"', '</a>'), 'href="', '"')),
+            "source"  : text.unescape(text.remove_html(extr(
+                'id="source-url"', '</p>').rpartition("</s>")[2])),
         }

         html = data["tags"]
@@ -93,14 +93,12 @@ class ZerochanExtractor(BooruExtractor):

     def _parse_entry_api(self, entry_id):
         url = "{}/{}?json".format(self.root, entry_id)
-        text = self.request(url).text
+        txt = self.request(url).text
         try:
-            item = util.json_loads(text)
-        except ValueError as exc:
-            if " control character " not in str(exc):
-                raise
-            text = re.sub(r"[\x00-\x1f\x7f]", "", text)
-            item = util.json_loads(text)
+            item = util.json_loads(txt)
+        except ValueError:
+            item = self._parse_json(txt)
+        item["id"] = text.parse_int(entry_id)

         data = {
             "id"      : item["id"],
@@ -118,6 +116,27 @@ class ZerochanExtractor(BooruExtractor):

         return data

+    def _parse_json(self, txt):
+        txt = re.sub(r"[\x00-\x1f\x7f]", "", txt)
+        main, _, tags = txt.partition('tags": [')
+
+        item = {}
+        for line in main.split(', "')[1:]:
+            key, _, value = line.partition('": ')
+            if value:
+                if value[0] == '"':
+                    value = value[1:-1]
+                else:
+                    value = text.parse_int(value)
+                if key:
+                    item[key] = value
+
+        item["tags"] = tags = tags[5:].split('", "')
+        if tags:
+            tags[-1] = tags[-1][:-5]
+
+        return item
+
     def _tags(self, post, page):
         tags = collections.defaultdict(list)
         for tag in post["tags"]:
