From c2e774d3f5a4499b8beb5a12ab46a0099b16b1e7 Mon Sep 17 00:00:00 2001 From: Unit 193 Date: Tue, 15 Mar 2022 00:19:57 -0400 Subject: New upstream version 1.21.0. --- gallery_dl/downloader/common.py | 8 +- gallery_dl/downloader/http.py | 3 +- gallery_dl/downloader/ytdl.py | 1 + gallery_dl/extractor/__init__.py | 4 + gallery_dl/extractor/booru.py | 6 +- gallery_dl/extractor/common.py | 38 ++---- gallery_dl/extractor/deviantart.py | 13 +- gallery_dl/extractor/fanbox.py | 9 +- gallery_dl/extractor/fantia.py | 28 ++++ gallery_dl/extractor/hentaicosplays.py | 1 + gallery_dl/extractor/imagebam.py | 106 ++++++++------- gallery_dl/extractor/kemonoparty.py | 7 +- gallery_dl/extractor/kissgoddess.py | 80 +++++++++++ gallery_dl/extractor/lolisafe.py | 17 +-- gallery_dl/extractor/mangadex.py | 5 +- gallery_dl/extractor/mememuseum.py | 120 ++++++++++++++++ gallery_dl/extractor/newgrounds.py | 16 ++- gallery_dl/extractor/oauth.py | 8 +- gallery_dl/extractor/patreon.py | 16 ++- gallery_dl/extractor/seiga.py | 31 +---- gallery_dl/extractor/skeb.py | 26 ++-- gallery_dl/extractor/slideshare.py | 104 ++++++++------ gallery_dl/extractor/subscribestar.py | 4 +- gallery_dl/extractor/toyhouse.py | 173 +++++++++++++++++++++++ gallery_dl/extractor/tumblr.py | 11 +- gallery_dl/extractor/twibooru.py | 241 +++++++++++++++++++++++++++++++++ gallery_dl/extractor/twitter.py | 62 ++++++--- gallery_dl/extractor/ytdl.py | 5 +- gallery_dl/path.py | 12 +- gallery_dl/postprocessor/metadata.py | 9 +- gallery_dl/postprocessor/mtime.py | 10 +- gallery_dl/text.py | 7 + gallery_dl/util.py | 34 ++++- gallery_dl/version.py | 2 +- gallery_dl/ytdl.py | 4 +- 35 files changed, 992 insertions(+), 229 deletions(-) create mode 100644 gallery_dl/extractor/kissgoddess.py create mode 100644 gallery_dl/extractor/mememuseum.py create mode 100644 gallery_dl/extractor/toyhouse.py create mode 100644 gallery_dl/extractor/twibooru.py (limited to 'gallery_dl') diff --git a/gallery_dl/downloader/common.py b/gallery_dl/downloader/common.py index d858075..1168d83 100644 --- a/gallery_dl/downloader/common.py +++ b/gallery_dl/downloader/common.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2020 Mike Fährmann +# Copyright 2014-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -27,6 +27,12 @@ class DownloaderBase(): self.partdir = util.expand_path(self.partdir) os.makedirs(self.partdir, exist_ok=True) + proxies = self.config("proxy", util.SENTINEL) + if proxies is util.SENTINEL: + self.proxies = job.extractor._proxies + else: + self.proxies = util.build_proxy_map(proxies, self.log) + def config(self, key, default=None): """Interpolate downloader config value for 'key'""" return config.interpolate(("downloader", self.scheme), key, default) diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 91ce731..b878f5f 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -121,7 +121,8 @@ class HttpDownloader(DownloaderBase): try: response = self.session.request( "GET", url, stream=True, headers=headers, - timeout=self.timeout, verify=self.verify) + timeout=self.timeout, verify=self.verify, + proxies=self.proxies) except (ConnectionError, Timeout) as exc: msg = str(exc) continue diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py index 462bbf8..2badccf 100644 --- a/gallery_dl/downloader/ytdl.py +++ b/gallery_dl/downloader/ytdl.py @@ -25,6 +25,7 @@ class 
YoutubeDLDownloader(DownloaderBase): "retries": retries+1 if retries >= 0 else float("inf"), "socket_timeout": self.config("timeout", extractor._timeout), "nocheckcertificate": not self.config("verify", extractor._verify), + "proxy": self.proxies.get("http") if self.proxies else None, } self.ytdl_instance = None diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index b52561e..1bec48e 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -67,6 +67,7 @@ modules = [ "keenspot", "kemonoparty", "khinsider", + "kissgoddess", "kohlchan", "komikcast", "lightroom", @@ -81,6 +82,7 @@ modules = [ "mangapark", "mangasee", "mangoxo", + "mememuseum", "myhentaigallery", "myportfolio", "naver", @@ -123,9 +125,11 @@ modules = [ "speakerdeck", "subscribestar", "tapas", + "toyhouse", "tsumino", "tumblr", "tumblrgallery", + "twibooru", "twitter", "unsplash", "vanillarock", diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index a42ec53..12d98b1 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2021 Mike Fährmann +# Copyright 2015-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -41,9 +41,9 @@ class BooruExtractor(BaseExtractor): page_html = self._extended_tags(post) if notes: self._notes(post, page_html) - self._prepare(post) - post.update(data) text.nameext_from_url(url, post) + post.update(data) + self._prepare(post) yield Message.Directory, post yield Message.Url, url, post diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 5a2d3a3..e3559f9 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -55,6 +55,7 @@ class Extractor(): self._retries = self.config("retries", 4) self._timeout = self.config("timeout", 30) self._verify = self.config("verify", True) + self._proxies = util.build_proxy_map(self.config("proxy"), self.log) self._interval = util.build_duration_func( self.config("sleep-request", self.request_interval), self.request_interval_min, @@ -65,7 +66,6 @@ class Extractor(): self._init_session() self._init_cookies() - self._init_proxies() @classmethod def from_url(cls, url): @@ -104,10 +104,12 @@ class Extractor(): def request(self, url, *, method="GET", session=None, retries=None, encoding=None, fatal=True, notfound=None, **kwargs): - if retries is None: - retries = self._retries if session is None: session = self.session + if retries is None: + retries = self._retries + if "proxies" not in kwargs: + kwargs["proxies"] = self._proxies if "timeout" not in kwargs: kwargs["timeout"] = self._timeout if "verify" not in kwargs: @@ -289,20 +291,6 @@ class Extractor(): session.mount("https://", adapter) session.mount("http://", adapter) - def _init_proxies(self): - """Update the session's proxy map""" - proxies = self.config("proxy") - if proxies: - if isinstance(proxies, str): - proxies = {"http": proxies, "https": proxies} - if isinstance(proxies, dict): - for scheme, proxy in proxies.items(): - if "://" not in proxy: - proxies[scheme] = "http://" + proxy.lstrip("/") - self.session.proxies = proxies - else: - self.log.warning("invalid proxy specifier: %s", proxies) - def _init_cookies(self): """Populate the session's cookiejar""" self._cookiefile = None @@ -371,20 +359,25 @@ class Extractor(): for cookie in self._cookiejar: if cookie.name in names and ( not 
domain or cookie.domain == domain): + if cookie.expires: diff = int(cookie.expires - now) + if diff <= 0: self.log.warning( "Cookie '%s' has expired", cookie.name) + continue + elif diff <= 86400: hours = diff // 3600 self.log.warning( "Cookie '%s' will expire in less than %s hour%s", cookie.name, hours + 1, "s" if hours else "") - else: - names.discard(cookie.name) - if not names: - return True + continue + + names.discard(cookie.name) + if not names: + return True return False def _prepare_ddosguard_cookies(self): @@ -616,8 +609,7 @@ class BaseExtractor(Extractor): if index: self.category, self.root = self.instances[index-1] if not self.root: - url = text.ensure_http_scheme(match.group(0)) - self.root = url[:url.index("/", 8)] + self.root = text.root_from_url(match.group(0)) else: self.root = group self.category = group.partition("://")[2] diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 94fec16..fda7220 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2021 Mike Fährmann +# Copyright 2015-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -1004,6 +1004,7 @@ class DeviantartOAuthAPI(): self.extractor = extractor self.log = extractor.log self.headers = {"dA-minor-version": "20200519"} + self._warn_429 = True self.delay = extractor.config("wait-min", 0) self.delay_min = max(2, self.delay) @@ -1260,6 +1261,16 @@ class DeviantartOAuthAPI(): if self.delay < 30: self.delay += 1 self.log.warning("%s. Using %ds delay.", msg, self.delay) + + if self._warn_429 and self.delay >= 3: + self._warn_429 = False + if self.client_id == self.CLIENT_ID: + self.log.info( + "Register your own OAuth application and use its " + "credentials to prevent this error: " + "https://github.com/mikf/gallery-dl/blob/master/do" + "cs/configuration.rst#extractordeviantartclient-id" + "--client-secret") else: self.log.error(msg) return data diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py index ef79808..11436cb 100644 --- a/gallery_dl/extractor/fanbox.py +++ b/gallery_dl/extractor/fanbox.py @@ -51,19 +51,16 @@ class FanboxExtractor(Extractor): url = text.ensure_http_scheme(url) body = self.request(url, headers=headers).json()["body"] for item in body["items"]: - yield self._process_post(item) + yield self._get_post_data(item["id"]) url = body["nextUrl"] - def _get_post_data_from_id(self, post_id): + def _get_post_data(self, post_id): """Fetch and process post data""" headers = {"Origin": self.root} url = "https://api.fanbox.cc/post.info?postId="+post_id post = self.request(url, headers=headers).json()["body"] - return self._process_post(post) - - def _process_post(self, post): content_body = post.pop("body", None) if content_body: if "html" in content_body: @@ -279,7 +276,7 @@ class FanboxPostExtractor(FanboxExtractor): self.post_id = match.group(3) def posts(self): - return (self._get_post_data_from_id(self.post_id),) + return (self._get_post_data(self.post_id),) class FanboxRedirectExtractor(Extractor): diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py index 89a965f..c05ec39 100644 --- a/gallery_dl/extractor/fantia.py +++ b/gallery_dl/extractor/fantia.py @@ -8,6 +8,7 @@ from .common import Extractor, Message from .. 
import text
+import json
 
 
 class FantiaExtractor(Extractor):
@@ -29,7 +30,9 @@ class FantiaExtractor(Extractor):
         for post_id in self.posts():
             full_response, post = self._get_post_data(post_id)
             yield Message.Directory, post
+            post["num"] = 0
             for url, url_data in self._get_urls_from_post(full_response, post):
+                post["num"] += 1
                 fname = url_data["content_filename"] or url
                 text.nameext_from_url(fname, url_data)
                 url_data["file_url"] = url
@@ -90,14 +93,39 @@ class FantiaExtractor(Extractor):
             post["content_title"] = content["title"]
             post["content_filename"] = content.get("filename", "")
             post["content_id"] = content["id"]
+
+            if "comment" in content:
+                post["content_comment"] = content["comment"]
+
             if "post_content_photos" in content:
                 for photo in content["post_content_photos"]:
                     post["file_id"] = photo["id"]
                     yield photo["url"]["original"], post
+
             if "download_uri" in content:
                 post["file_id"] = content["id"]
                 yield self.root+"/"+content["download_uri"], post
+            if content["category"] == "blog" and "comment" in content:
+                comment_json = json.loads(content["comment"])
+                ops = comment_json.get("ops", ())
+
+                # collect blogpost text first
+                blog_text = ""
+                for op in ops:
+                    insert = op.get("insert")
+                    if isinstance(insert, str):
+                        blog_text += insert
+                post["blogpost_text"] = blog_text
+
+                # collect images
+                for op in ops:
+                    insert = op.get("insert")
+                    if isinstance(insert, dict) and "fantiaImage" in insert:
+                        img = insert["fantiaImage"]
+                        post["file_id"] = img["id"]
+                        yield "https://fantia.jp" + img["original_url"], post
+
 
 
 class FantiaCreatorExtractor(FantiaExtractor):
     """Extractor for a Fantia creator's works"""
diff --git a/gallery_dl/extractor/hentaicosplays.py b/gallery_dl/extractor/hentaicosplays.py
index 7dd047c..b4f433b 100644
--- a/gallery_dl/extractor/hentaicosplays.py
+++ b/gallery_dl/extractor/hentaicosplays.py
@@ -57,6 +57,7 @@ class HentaicosplaysGalleryExtractor(GalleryExtractor):
         self.root = text.ensure_http_scheme(root)
         url = "{}/story/{}/".format(self.root, self.slug)
         GalleryExtractor.__init__(self, match, url)
+        self.session.headers["Referer"] = url
 
     def metadata(self, page):
         title = text.extract(page, "<title>", "</title>")[0]
diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py
index 9370840..7cd67d6 100644
--- a/gallery_dl/extractor/imagebam.py
+++ b/gallery_dl/extractor/imagebam.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2014-2021 Mike Fährmann
+# Copyright 2014-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -10,45 +10,40 @@
 from .common import Extractor, Message
 from ..
import text, exception +import re class ImagebamExtractor(Extractor): """Base class for imagebam extractors""" category = "imagebam" root = "https://www.imagebam.com" - cookies = None def __init__(self, match): Extractor.__init__(self, match) - self.key = match.group(1) - if self.cookies: - self.session.cookies = self.cookies - - def get_image_data(self, data): - page_url = "{}/image/{}".format(self.root, data["image_key"]) - page = self.request(page_url).text - image_url, pos = text.extract(page, '', '<')[0] - return {"title": text.unescape(title.strip())} - - def get_image_keys(self, page): - """Return a list of all image keys""" - keys = [] + def metadata(page): + return {"title": text.unescape(text.extract( + page, 'id="gallery-name">', '<')[0].strip())} + + def images(self, page): + findall = re.compile(r'', "<")[0].rpartition(" | ")[0], + } + + def images(self, page): + pnum = 1 + + while page: + for url in text.extract_iter(page, "= 7", + }) + + def __init__(self, match): + Extractor.__init__(self, match) + self.model = match.group(1) + + def items(self): + url = "{}/people/{}.html".format(self.root, self.model) + page = self.request(url).text + + data = {"_extractor": KissgoddessGalleryExtractor} + for path in text.extract_iter(page, 'thumb">= 30" + }) + per_page = 25 + + def __init__(self, match): + MememuseumExtractor.__init__(self, match) + self.tags = text.unquote(match.group(1)) + + def metadata(self): + return {"search_tags": self.tags} + + def posts(self): + pnum = 1 + while True: + url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum) + extr = text.extract_from(self.request(url).text) + + while True: + mime = extr("data-mime='", "'") + if not mime: + break + + pid = extr("data-post-id='", "'") + tags, dimensions, size = extr("title='", "'").split(" // ") + md5 = extr("/_thumbs/", "/") + width, _, height = dimensions.partition("x") + + yield { + "file_url": "{}/_images/{}/{}%20-%20{}.{}".format( + self.root, md5, pid, text.quote(tags), + mime.rpartition("/")[2]), + "id": pid, "md5": md5, "tags": tags, + "width": width, "height": height, + "size": text.parse_bytes(size[:-1]), + } + + if not extr(">Next<", ">"): + return + pnum += 1 + + +class MememuseumPostExtractor(MememuseumExtractor): + """Extractor for single images from meme.museum""" + subcategory = "post" + pattern = r"(?:https?://)?meme\.museum/post/view/(\d+)" + test = ("https://meme.museum/post/view/10243", { + "pattern": r"https://meme\.museum/_images/105febebcd5ca791ee332adc4997" + r"1f78/10243%20-%20g%20beard%20open_source%20richard_stallm" + r"an%20stallman%20tagme%20text\.jpg", + "keyword": "3c8009251480cf17248c08b2b194dc0c4d59580e", + "content": "45565f3f141fc960a8ae1168b80e718a494c52d2", + }) + + def __init__(self, match): + MememuseumExtractor.__init__(self, match) + self.post_id = match.group(1) + + def posts(self): + url = "{}/post/view/{}".format(self.root, self.post_id) + extr = text.extract_from(self.request(url).text) + + return ({ + "id" : self.post_id, + "tags" : extr(": ", "<"), + "md5" : extr("/_thumbs/", "/"), + "file_url": self.root + extr("id='main_image' src='", "'"), + "width" : extr("data-width=", " ").strip("'\""), + "height" : extr("data-height=", " ").strip("'\""), + "size" : 0, + },) diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 54e2040..6d0e94b 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -103,7 +103,7 @@ class NewgroundsExtractor(Extractor): } def extract_post(self, post_url): - + url = 
post_url if "/art/view/" in post_url: extract_data = self._extract_image_data elif "/audio/listen/" in post_url: @@ -111,18 +111,19 @@ class NewgroundsExtractor(Extractor): else: extract_data = self._extract_media_data if self.flash: - post_url += "/format/flash" + url += "/format/flash" - response = self.request(post_url, fatal=False) + response = self.request(url, fatal=False) if response.status_code >= 400: return {} page = response.text extr = text.extract_from(page) data = extract_data(extr, post_url) - data["_comment"] = extr('id="author_comments"', '') + data["_comment"] = extr( + 'id="author_comments"', '').partition(">")[2] data["comment"] = text.unescape(text.remove_html( - data["_comment"].partition(">")[2], "", "")) + data["_comment"], "", "")) data["favorites"] = text.parse_int(extr( 'id="faves_load">', '<').replace(",", "")) data["score"] = text.parse_float(extr('id="score_number">', '<')) @@ -134,6 +135,7 @@ class NewgroundsExtractor(Extractor): data["tags"].sort() data["user"] = self.user or data["artist"][0] + data["post_url"] = post_url return data @staticmethod @@ -171,6 +173,7 @@ class NewgroundsExtractor(Extractor): def _extract_media_data(self, extr, url): index = url.split("/")[5] title = extr('"og:title" content="', '"') + descr = extr('"og:description" content="', '"') src = extr('{"url":"', '"') if src: @@ -209,7 +212,7 @@ class NewgroundsExtractor(Extractor): "title" : text.unescape(title), "url" : src, "date" : date, - "description": text.unescape(extr( + "description": text.unescape(descr or extr( 'itemprop="description" content="', '"')), "rating" : extr('class="rated-', '"'), "index" : text.parse_int(index), @@ -319,6 +322,7 @@ class NewgroundsMediaExtractor(NewgroundsExtractor): "artist" : ["kickinthehead", "danpaladin", "tomfulp"], "comment" : "re:My fan trailer for Alien Hominid HD!", "date" : "dt:2013-02-01 09:50:49", + "description": "Fan trailer for Alien Hominid HD!", "favorites" : int, "filename" : "564957_alternate_31", "index" : 595355, diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index 6812f35..428f772 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2021 Mike Fährmann +# Copyright 2017-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -399,7 +399,7 @@ class OAuthPixiv(OAuthBase): if "error" in data: print(data) - if data["error"] == "invalid_request": + if data["error"] in ("invalid_request", "invalid_grant"): print("'code' expired, try again") return @@ -417,6 +417,10 @@ class OAuthPixiv(OAuthBase): 2) Login 3) Select the last network monitor entry ('callback?state=...') 4) Copy its 'code' query parameter, paste it below, and press Enter + +- This 'code' will expire 30 seconds after logging in. +- Copy-pasting more than just the 'code' value will work as well, + like the entire URL or several query parameters. 
""") code = input("code: ") return code.rpartition("=")[2].strip() diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index 051f1ef..35a015f 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2021 Mike Fährmann +# Copyright 2019-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -71,6 +71,15 @@ class PatreonExtractor(Extractor): name = image.get("file_name") or self._filename(url) or url yield "image", url, name + def _image_large(self, post): + image = post.get("image") + if image: + url = image.get("large_url") + if url: + name = image.get("file_name") or self._filename(url) or url + return (("image_large", url, name),) + return () + def _attachments(self, post): for attachment in post["attachments"]: url = self.request( @@ -212,10 +221,11 @@ class PatreonExtractor(Extractor): def _build_file_generators(self, filetypes): if filetypes is None: - return (self._images, self._attachments, - self._postfile, self._content) + return (self._images, self._image_large, + self._attachments, self._postfile, self._content) genmap = { "images" : self._images, + "image_large": self._image_large, "attachments": self._attachments, "postfile" : self._postfile, "content" : self._content, diff --git a/gallery_dl/extractor/seiga.py b/gallery_dl/extractor/seiga.py index bf38a77..22c9487 100644 --- a/gallery_dl/extractor/seiga.py +++ b/gallery_dl/extractor/seiga.py @@ -1,16 +1,15 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2020 Mike Fährmann +# Copyright 2016-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from https://seiga.nicovideo.jp/""" +"""Extractors for https://seiga.nicovideo.jp/""" from .common import Extractor, Message from .. 
import text, util, exception -from ..cache import cache class SeigaExtractor(Extractor): @@ -25,7 +24,9 @@ class SeigaExtractor(Extractor): self.start_image = 0 def items(self): - self.login() + if not self._check_cookies(("user_session",)): + raise exception.StopExtraction("'user_session' cookie required") + images = iter(self.get_images()) data = next(images) @@ -45,28 +46,6 @@ class SeigaExtractor(Extractor): url, method="HEAD", allow_redirects=False, notfound="image") return response.headers["Location"].replace("/o/", "/priv/", 1) - def login(self): - """Login and set necessary cookies""" - if not self._check_cookies(("user_session",)): - username, password = self._get_auth_info() - self._update_cookies(self._login_impl(username, password)) - - @cache(maxage=7*24*3600, keyarg=1) - def _login_impl(self, username, password): - if not username or not password: - raise exception.AuthenticationError( - "Username and password required") - - self.log.info("Logging in as %s", username) - url = "https://account.nicovideo.jp/api/v1/login" - data = {"mail_tel": username, "password": password} - - self.request(url, method="POST", data=data) - if "user_session" not in self.session.cookies: - raise exception.AuthenticationError() - del self.session.cookies["nicosid"] - return self.session.cookies - class SeigaUserExtractor(SeigaExtractor): """Extractor for images of a user from seiga.nicovideo.jp""" diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py index 2c806ad..965391c 100644 --- a/gallery_dl/extractor/skeb.py +++ b/gallery_dl/extractor/skeb.py @@ -22,10 +22,11 @@ class SkebExtractor(Extractor): Extractor.__init__(self, match) self.user_name = match.group(1) self.thumbnails = self.config("thumbnails", False) + self.sent_requests = self.config("sent-requests", False) def items(self): - for post_num in self.posts(): - response, post = self._get_post_data(post_num) + for user_name, post_num in self.posts(): + response, post = self._get_post_data(user_name, post_num) yield Message.Directory, post for data in self._get_urls_from_post(response, post): url = data["file_url"] @@ -38,24 +39,33 @@ class SkebExtractor(Extractor): url = "{}/api/users/{}/works".format(self.root, self.user_name) params = {"role": "creator", "sort": "date", "offset": 0} headers = {"Referer": self.root, "Authorization": "Bearer null"} + do_requests = self.sent_requests while True: posts = self.request(url, params=params, headers=headers).json() for post in posts: post_num = post["path"].rpartition("/")[2] + user_name = post["path"].split("/")[1][1:] if post["private"]: - self.log.debug("Skipping %s (private)", post_num) + self.log.debug("Skipping @%s/%s (private)", + user_name, post_num) continue - yield post_num + yield user_name, post_num if len(posts) < 30: - return + if do_requests: + params["offset"] = 0 + params['role'] = "client" + do_requests = False + continue + else: + return params["offset"] += 30 - def _get_post_data(self, post_num): + def _get_post_data(self, user_name, post_num): url = "{}/api/users/{}/works/{}".format( - self.root, self.user_name, post_num) + self.root, user_name, post_num) headers = {"Referer": self.root, "Authorization": "Bearer null"} resp = self.request(url, headers=headers).json() creator = resp["creator"] @@ -130,7 +140,7 @@ class SkebPostExtractor(SkebExtractor): self.post_num = match.group(2) def posts(self): - return (self.post_num,) + return ((self.user_name, self.post_num),) class SkebUserExtractor(SkebExtractor): diff --git a/gallery_dl/extractor/slideshare.py 
b/gallery_dl/extractor/slideshare.py index 91386e8..557c9fb 100644 --- a/gallery_dl/extractor/slideshare.py +++ b/gallery_dl/extractor/slideshare.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2021 Mike Fährmann, Leonardo Taccari +# Copyright 2016-2022 Mike Fährmann, Leonardo Taccari # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,11 +8,12 @@ """Extractors for https://www.slideshare.net/""" -from .common import Extractor, Message +from .common import GalleryExtractor from .. import text +import json -class SlidesharePresentationExtractor(Extractor): +class SlidesharePresentationExtractor(GalleryExtractor): """Extractor for images from a presentation on slideshare.net""" category = "slideshare" subcategory = "presentation" @@ -24,13 +25,36 @@ class SlidesharePresentationExtractor(Extractor): test = ( (("https://www.slideshare.net" "/Slideshare/get-started-with-slide-share"), { - "url": "23685fb9b94b32c77a547d45dc3a82fe7579ea18", - "content": "2e90a01c6ca225579ebf8f98ab46f97a28a5e45c", + "pattern": r"https://image\.slidesharecdn\.com/getstartedwithslide" + r"share-150520173821-lva1-app6892/95/get-started-with-s" + r"lide-share-\d+-1024\.jpg\?cb=\d+", + "count": 19, + "content": "2b6a191eab60b3978fdacfecf2da302dd45bc108", + "keyword": { + "comments": "0", + "description": "Get Started with SlideShare - " + "A Beginngers Guide for Creators", + "likes": r"re:\d{3,}", + "presentation": "get-started-with-slide-share", + "published": "dt:2015-05-20 00:00:00", + "title": "Getting Started With SlideShare", + "user": "Slideshare", + "views": r"re:\d{7,}", + }, }), - # long title + # long title and description (("https://www.slideshare.net/pragmaticsolutions/warum-sie-nicht-ihren" "-mitarbeitenden-ndern-sollten-sondern-ihr-managementsystem"), { "url": "cf70ca99f57f61affab47ebf8583eb564b21e3a7", + "keyword": { + "title": "Warum Sie nicht Ihren Mitarbeitenden ändern " + "sollten, sondern Ihr Managementsystem", + "description": "Mitarbeitende verhalten sich mehrheitlich so, " + "wie das System es ihnen vorgibt. 
Welche Voraus" + "setzungen es braucht, damit Ihre Mitarbeitende" + "n ihr ganzes Herzblut einsetzen, bespricht Fre" + "di Schmidli in diesem Referat.", + }, }), # mobile URL (("https://www.slideshare.net" @@ -40,48 +64,50 @@ class SlidesharePresentationExtractor(Extractor): ) def __init__(self, match): - Extractor.__init__(self, match) self.user, self.presentation = match.groups() + url = "https://www.slideshare.net/{}/{}".format( + self.user, self.presentation) + GalleryExtractor.__init__(self, match, url) - def items(self): - page = self.request("https://www.slideshare.net/" + self.user + - "/" + self.presentation).text - data = self.get_job_metadata(page) - imgs = self.get_image_urls(page) - data["count"] = len(imgs) - yield Message.Directory, data - for data["num"], url in enumerate(imgs, 1): - yield Message.Url, url, text.nameext_from_url(url, data) + def metadata(self, page): + extr = text.extract_from(page) + descr = extr('', '') + published = extr('') + comments = extr('content="UserComments:', '"') + likes = extr('content="UserLikes:', '"') + views = extr('content="UserPageVisits:', '"') - def get_job_metadata(self, page): - """Collect metadata for extractor-job""" - descr, pos = text.extract( - page, '', '', pos) - views, pos = text.extract( - page, '', pos) - published, pos = text.extract( - page, '', pos) - title, pos = text.extract( - page, '', '', pos) - alt_descr, pos = text.extract( - page, '
', '', pos)
-
-        if descr.endswith("…") and alt_descr:
-            descr = text.remove_html(alt_descr).strip()
+        if descr.endswith("…"):
+            alt_descr = extr(
+                'id="slideshow-description-text" class="notranslate">', '</p>')
+            if alt_descr:
+                descr = text.remove_html(alt_descr).strip()
 
         return {
             "user": self.user,
             "presentation": self.presentation,
             "title": text.unescape(title.strip()),
             "description": text.unescape(descr),
-            "views": text.parse_int(views.rpartition(
-                " views")[0].replace(",", "")),
-            "published": published.strip(),
+            "views": views,
+            "likes": likes,
+            "comments": comments,
+            "published": text.parse_datetime(
+                published.strip(), "%b. %d, %Y"),
         }
 
     @staticmethod
-    def get_image_urls(page):
-        """Extract and return a list of all image-urls"""
-        return list(text.extract_iter(page, 'data-full="', '"'))
+    def images(page):
+        data = json.loads(text.extract(
+            page, "xtend(true, slideshare_object.slideshow_config, ", ");")[0])
+
+        # using 'stripped_title' here is technically wrong, but it works all
+        # the same, slideshare doesn't seem to care what characters go there
+        begin = "https://image.slidesharecdn.com/{}/95/{}-".format(
+            data["ppt_location"], data["stripped_title"])
+        end = "-1024.jpg?cb=" + str(data["timestamp"])
+
+        return [
+            (begin + str(n) + end, None)
+            for n in range(1, data["slide_count"]+1)
+        ]
diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py
index 69e3854..b57013a 100644
--- a/gallery_dl/extractor/subscribestar.py
+++ b/gallery_dl/extractor/subscribestar.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2020-2021 Mike Fährmann
+# Copyright 2020-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -105,7 +105,7 @@ class SubscribestarExtractor(Extractor):
                 att, 'data-upload-id="', '"')[0]),
             "name": text.unescape(text.extract(
                 att, 'doc_preview-title">', '<')[0] or ""),
-            "url" : text.extract(att, 'href="', '"')[0],
+            "url" : text.unescape(text.extract(att, 'href="', '"')[0]),
             "type": "attachment",
         })
diff --git a/gallery_dl/extractor/toyhouse.py b/gallery_dl/extractor/toyhouse.py
new file mode 100644
index 0000000..c6be38d
--- /dev/null
+++ b/gallery_dl/extractor/toyhouse.py
@@ -0,0 +1,173 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://toyhou.se/"""
+
+from .common import Extractor, Message
+from .. import text, util
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?toyhou\.se"
+
+
+class ToyhouseExtractor(Extractor):
+    """Base class for toyhouse extractors"""
+    category = "toyhouse"
+    root = "https://toyhou.se"
+    directory_fmt = ("{category}", "{user|artists!S}")
+    archive_fmt = "{id}"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.user = match.group(1)
+        self.offset = 0
+
+    def items(self):
+        metadata = self.metadata()
+
+        for post in util.advance(self.posts(), self.offset):
+            if metadata:
+                post.update(metadata)
+            text.nameext_from_url(post["url"], post)
+            post["id"], _, post["hash"] = post["filename"].partition("_")
+            yield Message.Directory, post
+            yield Message.Url, post["url"], post
+
+    def posts(self):
+        return ()
+
+    def metadata(self):
+        return None
+
+    def skip(self, num):
+        self.offset += num
+        return num
+
+    def _parse_post(self, post, needle='
', '<'), "%d %b %Y, %I:%M:%S %p"),
+            "artists": [
+                text.remove_html(artist)
+                for artist in extr(
+                    '', '').split(
+                    '')
+            ],
+            "characters": text.split_html(extr(
+                '
', ''): + cnt += 1 + yield self._parse_post(post) + + if cnt == 0 and params["page"] == 1: + token, pos = text.extract( + page, '= 19", + }), + ) + + def posts(self): + return self._pagination("/{}/art".format(self.user)) + + def metadata(self): + return {"user": self.user} + + +class ToyhouseImageExtractor(ToyhouseExtractor): + """Extractor for individual toyhouse images""" + subcategory = "image" + pattern = (r"(?:https?://)?(?:" + r"(?:www\.)?toyhou\.se/~images|" + r"f\d+\.toyhou\.se/file/[^/?#]+/(?:image|watermark)s" + r")/(\d+)") + test = ( + ("https://toyhou.se/~images/40587320", { + "content": "058ec8427977ab432c4cc5be5a6dd39ce18713ef", + "keyword": { + "artists": ["d-floe"], + "characters": ["Sumi"], + "date": "dt:2021-10-08 01:32:47", + "extension": "png", + "filename": "40587320_TT1NaBUr3FLkS1p", + "hash": "TT1NaBUr3FLkS1p", + "id": "40587320", + "url": "https://f2.toyhou.se/file/f2-toyhou-se/images" + "/40587320_TT1NaBUr3FLkS1p.png", + }, + }), + # direct link, multiple artists + (("https://f2.toyhou.se/file/f2-toyhou-se" + "/watermarks/36817425_bqhGcwcnU.png?1625561467"), { + "keyword": { + "artists": [ + "http://aminoapps.com/p/92sf3z", + "kroksoc (Color)"], + "characters": ["❀Reiichi❀"], + "date": "dt:2021-07-03 20:02:02", + "hash": "bqhGcwcnU", + "id": "36817425", + }, + }), + ("https://f2.toyhou.se/file/f2-toyhou-se" + "/images/40587320_TT1NaBUr3FLkS1p.png"), + ) + + def posts(self): + url = "{}/~images/{}".format(self.root, self.user) + return (self._parse_post(self.request(url).text, '