| | |
|---|---|
| author | 2024-09-07 18:33:19 -0400 |
| committer | 2024-09-07 18:33:19 -0400 |
| commit | 1f3ffe32342852fd9ea9e7704022488f3a1222bd (patch) |
| tree | cb255a091b73e96840de0f6f44b36dff1acab4b9 /gallery_dl/extractor |
| parent | b5e56c51e491b41f9eb6a895459c185788a377e5 (diff) |
New upstream version 1.27.4 (tag: upstream/1.27.4)
Diffstat (limited to 'gallery_dl/extractor')
24 files changed, 376 insertions, 171 deletions
diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py
index 2adb142..786acd9 100644
--- a/gallery_dl/extractor/batoto.py
+++ b/gallery_dl/extractor/batoto.py
@@ -51,28 +51,29 @@ class BatotoChapterExtractor(BatotoBase, ChapterExtractor):
         if not manga:
             manga = extr('link-hover">', "<")
             info = text.remove_html(extr('link-hover">', "</"))
+        info = text.unescape(info)
 
         match = re.match(
-            r"(?:Volume\s+(\d+) )?"
-            r"\w+\s+(\d+)(.*)", info)
+            r"(?i)(?:(?:Volume|S(?:eason)?)\s*(\d+)\s+)?"
+            r"(?:Chapter|Episode)\s*(\d+)([\w.]*)", info)
         if match:
             volume, chapter, minor = match.groups()
-            title = text.remove_html(extr(
-                "selected>", "</option")).partition(" : ")[2]
         else:
             volume = chapter = 0
             minor = ""
-            title = info
 
         return {
-            "manga"        : text.unescape(manga),
-            "manga_id"     : text.parse_int(manga_id),
-            "title"        : text.unescape(title),
-            "volume"       : text.parse_int(volume),
-            "chapter"      : text.parse_int(chapter),
-            "chapter_minor": minor,
-            "chapter_id"   : text.parse_int(self.chapter_id),
-            "date"         : text.parse_timestamp(extr(' time="', '"')[:-3]),
+            "manga"         : text.unescape(manga),
+            "manga_id"      : text.parse_int(manga_id),
+            "chapter_url"   : extr(self.chapter_id + "-ch_", '"'),
+            "title"         : text.unescape(text.remove_html(extr(
+                "selected>", "</option")).partition(" : ")[2]),
+            "volume"        : text.parse_int(volume),
+            "chapter"       : text.parse_int(chapter),
+            "chapter_minor" : minor,
+            "chapter_string": info,
+            "chapter_id"    : text.parse_int(self.chapter_id),
+            "date"          : text.parse_timestamp(extr(' time="', '"')[:-3]),
         }
 
     def images(self, page):
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 240bbd3..780bdf1 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -6,15 +6,24 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extractors for https://bunkr.sk/"""
+"""Extractors for https://bunkr.si/"""
 
 from .lolisafe import LolisafeAlbumExtractor
-from .. import text
-
-BASE_PATTERN = (
-    r"(?:https?://)?(?:app\.)?(bunkr+"
-    r"\.(?:s[kiu]|[cf]i|ru|la|is|to|ac|black|cat|media|red|site|ws|org))"
-)
+from .. import text, config
+
+
+if config.get(("extractor", "bunkr"), "tlds"):
+    BASE_PATTERN = (
+        r"(?:bunkr:(?:https?://)?([^/?#]+)|"
+        r"(?:https?://)?(?:app\.)?(bunkr+\.\w+))"
+    )
+else:
+    BASE_PATTERN = (
+        r"(?:bunkr:(?:https?://)?([^/?#]+)|"
+        r"(?:https?://)?(?:app\.)?(bunkr+"
+        r"\.(?:s[kiu]|[cf]i|ru|la|is|to|a[cx]"
+        r"|black|cat|media|red|site|ws|org)))"
+    )
 
 LEGACY_DOMAINS = {
     "bunkr.ru",
@@ -28,15 +37,15 @@ LEGACY_DOMAINS = {
 
 
 class BunkrAlbumExtractor(LolisafeAlbumExtractor):
-    """Extractor for bunkr.sk albums"""
+    """Extractor for bunkr.si albums"""
     category = "bunkr"
-    root = "https://bunkr.sk"
+    root = "https://bunkr.si"
     pattern = BASE_PATTERN + r"/a/([^/?#]+)"
-    example = "https://bunkr.sk/a/ID"
+    example = "https://bunkr.si/a/ID"
 
     def __init__(self, match):
         LolisafeAlbumExtractor.__init__(self, match)
-        domain = match.group(match.lastindex-1)
+        domain = self.groups[0] or self.groups[1]
         if domain not in LEGACY_DOMAINS:
             self.root = "https://" + domain
@@ -69,11 +78,16 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
 
     def _extract_file(self, url):
         page = self.request(url).text
-        return (
-            text.extr(page, '<source src="', '"') or
-            text.extr(page, '<img src="', '"') or
-            text.rextract(page, ' href="', '"', page.rindex("Download"))[0]
-        )
+        url = (text.extr(page, '<source src="', '"') or
+               text.extr(page, '<img src="', '"'))
+
+        if not url:
+            url_download = text.rextract(
+                page, ' href="', '"', page.rindex("Download"))[0]
+            page = self.request(text.unescape(url_download)).text
+            url = text.unescape(text.rextract(page, ' href="', '"')[0])
+
+        return url
 
     def _validate(self, response):
         if response.history and response.url.endswith("/maintenance-vid.mp4"):
@@ -83,11 +97,11 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
 
 
 class BunkrMediaExtractor(BunkrAlbumExtractor):
-    """Extractor for bunkr.sk media links"""
+    """Extractor for bunkr.si media links"""
     subcategory = "media"
     directory_fmt = ("{category}",)
     pattern = BASE_PATTERN + r"(/[vid]/[^/?#]+)"
-    example = "https://bunkr.sk/v/FILENAME"
+    example = "https://bunkr.si/v/FILENAME"
 
     def fetch_album(self, album_id):
         try:
diff --git a/gallery_dl/extractor/cyberdrop.py b/gallery_dl/extractor/cyberdrop.py
index d864960..a514696 100644
--- a/gallery_dl/extractor/cyberdrop.py
+++ b/gallery_dl/extractor/cyberdrop.py
@@ -14,6 +14,7 @@ from .. import text
 
 class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor):
     category = "cyberdrop"
     root = "https://cyberdrop.me"
+    root_api = "https://api.cyberdrop.me"
     pattern = r"(?:https?://)?(?:www\.)?cyberdrop\.(?:me|to)/a/([^/?#]+)"
     example = "https://cyberdrop.me/a/ID"
 
@@ -55,5 +56,14 @@ class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor):
 
     def _extract_files(self, file_ids):
         for file_id in file_ids:
-            url = "{}/api/f/{}".format(self.root, file_id)
-            yield self.request(url).json()
+            try:
+                url = "{}/api/file/info/{}".format(self.root_api, file_id)
+                file = self.request(url).json()
+                auth = self.request(file["auth_url"]).json()
+                file["url"] = auth["url"]
+            except Exception as exc:
+                self.log.warning("%s (%s: %s)",
+                                 file_id, exc.__class__.__name__, exc)
+                continue
+
+            yield file
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index f3ea4e7..ea70b58 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -69,11 +69,12 @@ class DeviantartExtractor(Extractor):
             self.quality = ",q_{}".format(self.quality)
             self.quality_sub = re.compile(r",q_\d+").sub
 
-        if self.original != "image":
-            self._update_content = self._update_content_default
-        else:
-            self._update_content = self._update_content_image
+        if isinstance(self.original, str) and \
+                self.original.lower().startswith("image"):
             self.original = True
+            self._update_content = self._update_content_image
+        else:
+            self._update_content = self._update_content_default
 
         journals = self.config("journals", "html")
         if journals == "html":
@@ -1462,6 +1463,8 @@ class DeviantartOAuthAPI():
                 return
 
             if "next_cursor" in data:
+                if not data["next_cursor"]:
+                    return
                 params["offset"] = None
                 params["cursor"] = data["next_cursor"]
             elif data["next_offset"] is not None:
diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py
index af963bc..553ec22 100644
--- a/gallery_dl/extractor/e621.py
+++ b/gallery_dl/extractor/e621.py
@@ -10,6 +10,7 @@
 
 from .common import Message
 from . import danbooru
+from ..cache import memcache
 from .. import text, util
 
 
@@ -44,16 +45,11 @@ class E621Extractor(danbooru.DanbooruExtractor):
                     self.root[8:], md5[0:2], md5[2:4], md5, file["ext"])
 
             if notes and post.get("has_notes"):
-                url = "{}/notes.json?search[post_id]={}".format(
-                    self.root, post["id"])
-                post["notes"] = self.request(url).json()
+                post["notes"] = self._get_notes(post["id"])
 
             if pools and post["pools"]:
-                url = "{}/pools.json?search[id]={}".format(
-                    self.root, ",".join(map(str, post["pools"])))
-                post["pools"] = _pools = self.request(url).json()
-                for pool in _pools:
-                    pool["name"] = pool["name"].replace("_", " ")
+                post["pools"] = self._get_pools(
+                    ",".join(map(str, post["pools"])))
 
             post["filename"] = file["md5"]
             post["extension"] = file["ext"]
@@ -64,6 +60,18 @@ class E621Extractor(danbooru.DanbooruExtractor):
             yield Message.Directory, post
             yield Message.Url, file["url"], post
 
+    def _get_notes(self, id):
+        return self.request(
+            "{}/notes.json?search[post_id]={}".format(self.root, id)).json()
+
+    @memcache(keyarg=1)
+    def _get_pools(self, ids):
+        pools = self.request(
+            "{}/pools.json?search[id]={}".format(self.root, ids)).json()
+        for pool in pools:
+            pool["name"] = pool["name"].replace("_", " ")
+        return pools
+
 
 BASE_PATTERN = E621Extractor.update({
     "e621": {
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 1b4f995..01af7a4 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -430,7 +430,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
         }
 
         page = self.request(url, cookies=cookies).text
-        current = text.extr(page, "<strong>", "</strong>")
+        current = text.extr(page, "<strong>", "</strong>").replace(",", "")
         self.log.debug("Image Limits: %s/%s", current, self.limits)
         self._remaining = self.limits - text.parse_int(current)
diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py
index c94a110..1b4971c 100644
--- a/gallery_dl/extractor/flickr.py
+++ b/gallery_dl/extractor/flickr.py
@@ -75,11 +75,8 @@ class FlickrImageExtractor(FlickrExtractor):
     def items(self):
         photo = self.api.photos_getInfo(self.item_id)
 
-        if self.api.exif:
-            photo.update(self.api.photos_getExif(self.item_id))
-        if self.api.contexts:
-            photo.update(self.api.photos_getAllContexts(self.item_id))
+        self.api._extract_metadata(photo)
 
         if photo["media"] == "video" and self.api.videos:
             self.api._extract_video(photo)
         else:
@@ -135,8 +132,13 @@ class FlickrAlbumExtractor(FlickrExtractor):
 
     def metadata(self):
         data = FlickrExtractor.metadata(self)
-        data["album"] = self.api.photosets_getInfo(
-            self.album_id, self.user["nsid"])
+        try:
+            data["album"] = self.api.photosets_getInfo(
+                self.album_id, self.user["nsid"])
+        except Exception:
+            data["album"] = {}
+            self.log.warning("%s: Unable to retrieve album metadata",
+                             self.album_id)
         return data
 
     def photos(self):
@@ -407,6 +409,8 @@ class FlickrAPI(oauth.OAuth1API):
             self.log.debug("Server response: %s", data)
             if data["code"] == 1:
                 raise exception.NotFoundError(self.extractor.subcategory)
+            elif data["code"] == 2:
+                raise exception.AuthorizationError(msg)
             elif data["code"] == 98:
                 raise exception.AuthenticationError(msg)
             elif data["code"] == 99:
@@ -453,10 +457,7 @@ class FlickrAPI(oauth.OAuth1API):
             photo["date"] = text.parse_timestamp(photo["dateupload"])
             photo["tags"] = photo["tags"].split()
 
-            if self.exif:
-                photo.update(self.photos_getExif(photo["id"]))
-            if self.contexts:
-                photo.update(self.photos_getAllContexts(photo["id"]))
+            self._extract_metadata(photo)
 
             photo["id"] = text.parse_int(photo["id"])
             if "owner" in photo:
@@ -512,6 +513,23 @@ class FlickrAPI(oauth.OAuth1API):
             photo["width"] = photo["height"] = 0
         return photo
 
+    def _extract_metadata(self, photo):
+        if self.exif:
+            try:
+                photo.update(self.photos_getExif(photo["id"]))
+            except Exception as exc:
+                self.log.warning(
+                    "Unable to retrieve 'exif' data for %s (%s: %s)",
+                    photo["id"], exc.__class__.__name__, exc)
+
+        if self.contexts:
+            try:
+                photo.update(self.photos_getAllContexts(photo["id"]))
+            except Exception as exc:
+                self.log.warning(
+                    "Unable to retrieve 'contexts' data for %s (%s: %s)",
+                    photo["id"], exc.__class__.__name__, exc)
+
     @staticmethod
     def _clean_info(info):
         info["title"] = info["title"]["_content"]
diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py
index 3055426..d253582 100644
--- a/gallery_dl/extractor/furaffinity.py
+++ b/gallery_dl/extractor/furaffinity.py
@@ -179,6 +179,11 @@ class FuraffinityExtractor(Extractor):
                     break
                 self._favorite_id = text.parse_int(extr('data-fav-id="', '"'))
                 yield post_id
+
+            pos = page.find('type="submit">Next</button>')
+            if pos >= 0:
+                path = text.rextract(page, '<form action="', '"', pos)[0]
+                continue
             path = text.extr(page, 'right" href="', '"')
 
     def _pagination_search(self, query):
diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py
index 16d4340..a6c1d5a 100644
--- a/gallery_dl/extractor/generic.py
+++ b/gallery_dl/extractor/generic.py
@@ -15,7 +15,7 @@ import re
 class GenericExtractor(Extractor):
     """Extractor for images in a generic web page."""
     category = "generic"
-    directory_fmt = ("{category}", "{pageurl}")
+    directory_fmt = ("{category}", "{subcategory}", "{path}")
     archive_fmt = "{imageurl}"
 
     # By default, the generic extractor is disabled
@@ -52,7 +52,10 @@ class GenericExtractor(Extractor):
             self.scheme = match.group('scheme')
         else:
             self.scheme = 'https://'
-            self.url = self.scheme + self.url
+        self.url = text.ensure_http_scheme(self.url, self.scheme)
+
+        self.subcategory = match.group('domain')
+        self.path = match.group('path')
 
         # Used to resolve relative image urls
         self.root = self.scheme + match.group('domain')
@@ -87,6 +90,7 @@ class GenericExtractor(Extractor):
     def metadata(self, page):
         """Extract generic webpage metadata, return them in a dict."""
         data = {}
+        data['path'] = self.path.replace("/", "")
        data['pageurl'] = self.url
         data['title'] = text.extr(page, '<title>', "</title>")
         data['description'] = text.extr(
diff --git a/gallery_dl/extractor/gofile.py b/gallery_dl/extractor/gofile.py
index f0eb4e9..52b4ae6 100644
--- a/gallery_dl/extractor/gofile.py
+++ b/gallery_dl/extractor/gofile.py
@@ -47,8 +47,7 @@ class GofileFolderExtractor(Extractor):
             raise exception.AuthorizationError("Password required")
 
         num = 0
-        for content_id in folder["childrenIds"]:
-            content = contents[content_id]
+        for content in contents.values():
             content["folder"] = folder
 
             if content["type"] == "file":
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index 9b74700..18df9df 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -89,6 +89,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
                 path = ext = "webp"
             ihash = image["hash"]
             idata = text.nameext_from_url(image["name"])
+            idata["extension_original"] = idata["extension"]
             if ext:
                 idata["extension"] = ext
 
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index c05fe72..422c865 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -12,6 +12,7 @@
 from .common import Extractor, Message
 from .. import text, util, exception
 from ..cache import cache, memcache
+import itertools
 import binascii
 import json
 import re
@@ -57,12 +58,17 @@ class InstagramExtractor(Extractor):
         data = self.metadata()
         videos = self.config("videos", True)
         previews = self.config("previews", False)
+        max_posts = self.config("max-posts")
         video_headers = {"User-Agent": "Mozilla/5.0"}
         order = self.config("order-files")
         reverse = order[0] in ("r", "d") if order else False
 
-        for post in self.posts():
+        posts = self.posts()
+        if max_posts:
+            posts = itertools.islice(posts, max_posts)
+
+        for post in posts:
 
             if "__typename" in post:
                 post = self._parse_post_graphql(post)
@@ -159,15 +165,19 @@ class InstagramExtractor(Extractor):
             if "title" in post:
                 data["highlight_title"] = post["title"]
             if "created_at" in post:
-                data["date"] = text.parse_timestamp(post.get("created_at"))
+                data["post_date"] = data["date"] = text.parse_timestamp(
+                    post.get("created_at"))
 
         else:  # regular image/video post
+            date = text.parse_timestamp(post.get("taken_at"))
             data = {
                 "post_id" : post["pk"],
                 "post_shortcode": post["code"],
+                "post_url": "{}/p/{}/".format(self.root, post["code"]),
+                "post_date": date,
+                "date": date,
                 "likes": post.get("like_count", 0),
                 "pinned": post.get("timeline_pinned_user_ids", ()),
-                "date": text.parse_timestamp(post.get("taken_at")),
                 "liked": post.get("has_liked", False),
             }
 
@@ -206,7 +216,6 @@ class InstagramExtractor(Extractor):
         data["owner_id"] = owner["pk"]
         data["username"] = owner.get("username")
         data["fullname"] = owner.get("full_name")
-        data["post_url"] = "{}/p/{}/".format(self.root, data["post_shortcode"])
         data["_files"] = files = []
 
         for num, item in enumerate(items, 1):
@@ -269,7 +278,6 @@ class InstagramExtractor(Extractor):
         owner = post["owner"]
         data = {
             "typename" : typename,
-            "date" : text.parse_timestamp(post["taken_at_timestamp"]),
             "likes" : post["edge_media_preview_like"]["count"],
             "liked" : post.get("viewer_has_liked", False),
             "pinned" : pinned,
@@ -279,11 +287,13 @@ class InstagramExtractor(Extractor):
             "post_id" : post["id"],
             "post_shortcode": post["shortcode"],
             "post_url" : "{}/p/{}/".format(self.root, post["shortcode"]),
+            "post_date" : text.parse_timestamp(post["taken_at_timestamp"]),
             "description": text.parse_unicode_escapes("\n".join(
                 edge["node"]["text"]
                 for edge in post["edge_media_to_caption"]["edges"]
             )),
         }
+        data["date"] = data["post_date"]
 
         tags = self._find_tags(data["description"])
         if tags:
@@ -313,6 +323,7 @@ class InstagramExtractor(Extractor):
             media = {
                 "num": num,
                 "media_id" : node["id"],
+                "date" : data["date"],
                 "shortcode" : (node.get("shortcode") or
                                shortcode_from_id(node["id"])),
                 "display_url": node["display_url"],
@@ -328,6 +339,7 @@ class InstagramExtractor(Extractor):
             dimensions = post["dimensions"]
             media = {
                 "media_id" : post["id"],
+                "date" : data["date"],
                 "shortcode" : post["shortcode"],
                 "display_url": post["display_url"],
                 "video_url" : post.get("video_url"),
@@ -378,7 +390,11 @@ class InstagramExtractor(Extractor):
             "full_name": user["full_name"]})
 
     def _init_cursor(self):
-        return self.config("cursor") or None
+        cursor = self.config("cursor", True)
+        if not cursor:
+            self._update_cursor = util.identity
+        elif isinstance(cursor, str):
+            return cursor
 
     def _update_cursor(self, cursor):
         self.log.debug("Cursor: %s", cursor)
@@ -418,6 +434,7 @@ class InstagramUserExtractor(InstagramExtractor):
         base = "{}/{}/".format(self.root, self.item)
         stories = "{}/stories/{}/".format(self.root, self.item)
         return self._dispatch_extractors((
+            (InstagramInfoExtractor      , base + "info/"),
             (InstagramAvatarExtractor    , base + "avatar/"),
             (InstagramStoriesExtractor   , stories),
             (InstagramHighlightsExtractor, base + "highlights/"),
diff --git a/gallery_dl/extractor/koharu.py b/gallery_dl/extractor/koharu.py
index 979b1a2..cacf504 100644
--- a/gallery_dl/extractor/koharu.py
+++ b/gallery_dl/extractor/koharu.py
@@ -161,16 +161,29 @@ class KoharuGalleryExtractor(KoharuExtractor, GalleryExtractor):
         return results
 
     def _select_format(self, formats):
-        if not self.fmt or self.fmt == "original":
-            fmtid = "0"
+        fmt = self.fmt
+
+        if not fmt or fmt == "best":
+            fmtids = ("0", "1600", "1280", "980", "780")
+        elif isinstance(fmt, str):
+            fmtids = fmt.split(",")
+        elif isinstance(fmt, list):
+            fmtids = fmt
         else:
-            fmtid = str(self.fmt)
+            fmtids = (str(self.fmt),)
 
-        try:
-            fmt = formats[fmtid]
-        except KeyError:
+        for fmtid in fmtids:
+            try:
+                fmt = formats[fmtid]
+                if fmt["id"]:
+                    break
+            except KeyError:
+                self.log.debug("%s: Format %s is not available",
+                               self.groups[0], fmtid)
+        else:
             raise exception.NotFoundError("format")
 
+        self.log.debug("%s: Selected format %s", self.groups[0], fmtid)
         fmt["w"] = fmtid
         return fmt
diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py
index 3d7d685..117b88b 100644
--- a/gallery_dl/extractor/lolisafe.py
+++ b/gallery_dl/extractor/lolisafe.py
@@ -34,7 +34,7 @@ class LolisafeAlbumExtractor(LolisafeExtractor):
 
     def __init__(self, match):
         LolisafeExtractor.__init__(self, match)
-        self.album_id = match.group(match.lastindex)
+        self.album_id = self.groups[-1]
 
     def _init(self):
         domain = self.config("domain")
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index ecd6619..5fc0ce5 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -171,15 +171,17 @@ class NewgroundsExtractor(Extractor):
         if self.flash:
             url += "/format/flash"
 
-        with self.request(url, fatal=False) as response:
-            if response.status_code >= 400:
-                return {}
-            page = response.text
+        response = self.request(url, fatal=False)
+        page = response.text
 
         pos = page.find('id="adults_only"')
         if pos >= 0:
             msg = text.extract(page, 'class="highlight">', '<', pos)[0]
             self.log.warning('"%s"', msg)
+            return {}
+
+        if response.status_code >= 400:
+            return {}
 
         extr = text.extract_from(page)
         data = extract_data(extr, post_url)
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index d732894..3479b88 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -94,12 +94,39 @@ class PixivExtractor(Extractor):
                         work.get("id"), exc.message)
                     continue
 
-                url = ugoira["zip_urls"]["medium"].replace(
-                    "_ugoira600x600", "_ugoira1920x1080")
-                work["frames"] = ugoira["frames"]
+                url = ugoira["zip_urls"]["medium"]
+                work["frames"] = frames = ugoira["frames"]
                 work["date_url"] = self._date_from_url(url)
                 work["_http_adjust_extension"] = False
-                yield Message.Url, url, text.nameext_from_url(url, work)
+
+                if self.load_ugoira == "original":
+                    base, sep, _ = url.rpartition("_ugoira")
+                    base = base.replace(
+                        "/img-zip-ugoira/", "/img-original/", 1) + sep
+
+                    for ext in ("jpg", "png", "gif"):
+                        try:
+                            url = ("{}0.{}".format(base, ext))
+                            self.request(url, method="HEAD")
+                            break
+                        except exception.HttpError:
+                            pass
+                    else:
+                        self.log.warning(
+                            "Unable to find Ugoira frame URLs (%s)",
+                            work.get("id"))
+                        continue
+
+                    for num, frame in enumerate(frames):
+                        url = ("{}{}.{}".format(base, num, ext))
+                        work["num"] = work["_ugoira_frame_index"] = num
+                        work["suffix"] = "_p{:02}".format(num)
+                        text.nameext_from_url(url, work)
+                        yield Message.Url, url, work
+
+                else:
+                    url = url.replace("_ugoira600x600", "_ugoira1920x1080")
+                    yield Message.Url, url, text.nameext_from_url(url, work)
 
             elif work["page_count"] == 1:
                 url = meta_single_page["original_image_url"]
@@ -551,9 +578,6 @@ class PixivSeriesExtractor(PixivExtractor):
     directory_fmt = ("{category}", "{user[id]} {user[account]}",
                      "{series[id]} {series[title]}")
     filename_fmt = "{num_series:>03}_{id}_p{num}.{extension}"
-    cookies_domain = ".pixiv.net"
-    browser = "firefox"
-    tls12 = False
     pattern = BASE_PATTERN + r"/user/(\d+)/series/(\d+)"
     example = "https://www.pixiv.net/user/12345/series/12345"
 
@@ -562,34 +586,18 @@ class PixivSeriesExtractor(PixivExtractor):
         self.user_id, self.series_id = match.groups()
 
     def works(self):
-        url = self.root + "/ajax/series/" + self.series_id
-        params = {"p": 1}
-        headers = {
-            "Accept": "application/json",
-            "Referer": "{}/user/{}/series/{}".format(
-                self.root, self.user_id, self.series_id),
-            "Alt-Used": "www.pixiv.net",
-        }
+        series = None
 
-        while True:
-            data = self.request(url, params=params, headers=headers).json()
-            body = data["body"]
-            page = body["page"]
-
-            series = body["extraData"]["meta"]
-            series["id"] = self.series_id
-            series["total"] = page["total"]
-            series["title"] = text.extr(series["title"], '"', '"')
-
-            for info in page["series"]:
-                work = self.api.illust_detail(info["workId"])
-                work["num_series"] = info["order"]
-                work["series"] = series
-                yield work
-
-            if len(page["series"]) < 10:
-                return
-            params["p"] += 1
+        for work in self.api.illust_series(self.series_id):
+            if series is None:
+                series = self.api.data
+                series["total"] = num_series = series.pop("series_work_count")
+            else:
+                num_series -= 1
+
+            work["num_series"] = num_series
+            work["series"] = series
+            yield work
 
 
 class PixivNovelExtractor(PixivExtractor):
@@ -916,6 +924,11 @@ class PixivAppAPI():
         params = {"illust_id": illust_id}
         return self._pagination("/v2/illust/related", params)
 
+    def illust_series(self, series_id, offset=0):
+        params = {"illust_series_id": series_id, "offset": offset}
+        return self._pagination("/v1/illust/series", params,
+                                key_data="illust_series_detail")
+
     def novel_bookmark_detail(self, novel_id):
         params = {"novel_id": novel_id}
         return self._call(
@@ -1013,10 +1026,15 @@ class PixivAppAPI():
             raise exception.StopExtraction(
                 "API request failed: %s", error)
 
-    def _pagination(self, endpoint, params, key="illusts"):
+    def _pagination(self, endpoint, params,
+                    key_items="illusts", key_data=None):
         while True:
             data = self._call(endpoint, params)
-            yield from data[key]
+
+            if key_data:
+                self.data = data.get(key_data)
+                key_data = None
+            yield from data[key_items]
 
             if not data["next_url"]:
                 return
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index ad3efa7..7db8172 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -66,7 +66,8 @@ class SankakuExtractor(BooruExtractor):
     def _prepare(self, post):
         post["created_at"] = post["created_at"]["s"]
         post["date"] = text.parse_timestamp(post["created_at"])
-        post["tags"] = [tag["name"] for tag in post["tags"] if tag["name"]]
+        post["tags"] = [tag["name"].lower().replace(" ", "_")
+                        for tag in post["tags"] if tag["name"]]
         post["tag_string"] = " ".join(post["tags"])
         post["_http_validate"] = self._check_expired
 
@@ -79,7 +80,7 @@ class SankakuExtractor(BooruExtractor):
         for tag in post["tags"]:
             name = tag["name"]
             if name:
-                tags[types[tag["type"]]].append(name)
+                tags[types[tag["type"]]].append(name.lower().replace(" ", "_"))
         for key, value in tags.items():
             post["tags_" + key] = value
             post["tag_string_" + key] = " ".join(value)
diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py
index 80f2aea..7708b5c 100644
--- a/gallery_dl/extractor/sexcom.py
+++ b/gallery_dl/extractor/sexcom.py
@@ -152,6 +152,25 @@ class SexcomPinsExtractor(SexcomExtractor):
         return self._pagination(url)
 
 
+class SexcomLikesExtractor(SexcomExtractor):
+    """Extractor for a user's liked pins on www.sex.com"""
+    subcategory = "likes"
+    directory_fmt = ("{category}", "{user}", "Likes")
+    pattern = r"(?:https?://)?(?:www\.)?sex\.com/user/([^/?#]+)/likes/"
+    example = "https://www.sex.com/user/USER/likes/"
+
+    def __init__(self, match):
+        SexcomExtractor.__init__(self, match)
+        self.user = match.group(1)
+
+    def metadata(self):
+        return {"user": text.unquote(self.user)}
+
+    def pins(self):
+        url = "{}/user/{}/likes/".format(self.root, self.user)
+        return self._pagination(url)
+
+
 class SexcomBoardExtractor(SexcomExtractor):
     """Extractor for pins from a board on www.sex.com"""
     subcategory = "board"
diff --git a/gallery_dl/extractor/szurubooru.py b/gallery_dl/extractor/szurubooru.py
index bba1ece..b6917cc 100644
--- a/gallery_dl/extractor/szurubooru.py
+++ b/gallery_dl/extractor/szurubooru.py
@@ -86,6 +86,7 @@ BASE_PATTERN = SzurubooruExtractor.update({
     "bcbnsfw": {
         "root": "https://booru.bcbnsfw.space",
         "pattern": r"booru\.bcbnsfw\.space",
+        "query-all": "*",
     },
     "snootbooru": {
         "root": "https://snootbooru.com",
@@ -110,7 +111,12 @@ class SzurubooruTagExtractor(SzurubooruExtractor):
         return {"search_tags": self.query}
 
     def posts(self):
-        return self._pagination("/posts/", {"query": self.query})
+        if self.query.strip():
+            query = self.query
+        else:
+            query = self.config_instance("query-all")
+
+        return self._pagination("/posts/", {"query": query})
 
 
 class SzurubooruPostExtractor(SzurubooruExtractor):
diff --git a/gallery_dl/extractor/toyhouse.py b/gallery_dl/extractor/toyhouse.py
index 64fa951..44d87ee 100644
--- a/gallery_dl/extractor/toyhouse.py
+++ b/gallery_dl/extractor/toyhouse.py
@@ -123,4 +123,5 @@ class ToyhouseImageExtractor(ToyhouseExtractor):
 
     def posts(self):
         url = "{}/~images/{}".format(self.root, self.user)
-        return (self._parse_post(self.request(url).text, '<img src="'),)
+        return (self._parse_post(
+            self.request(url).text, '<img class="mw-100" src="'),)
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index ff29c04..73455d2 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -400,6 +400,9 @@ class TumblrAPI(oauth.OAuth1API):
         """Retrieve liked posts"""
         endpoint = "/v2/blog/{}/likes".format(blog)
         params = {"limit": "50", "before": self.before}
+        if self.api_key:
+            params["api_key"] = self.api_key
+
         while True:
             posts = self._call(endpoint, params)["liked_posts"]
             if not posts:
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index ea57d76..d4ec343 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -155,6 +155,7 @@ class TwitterExtractor(Extractor):
                 if not self.unavailable:
                     continue
 
+            mtype = media.get("type")
             descr = media.get("ext_alt_text")
             width = media["original_info"].get("width", 0)
             height = media["original_info"].get("height", 0)
@@ -164,6 +165,7 @@ class TwitterExtractor(Extractor):
                 files.append({
                     "url": "ytdl:{}/i/web/status/{}".format(
                         self.root, tweet["id_str"]),
+                    "type" : mtype,
                     "width" : width,
                     "height" : height,
                     "extension" : None,
@@ -177,6 +179,7 @@ class TwitterExtractor(Extractor):
                 )
                 files.append({
                     "url" : variant["url"],
+                    "type" : mtype,
                     "width" : width,
                     "height" : height,
                     "bitrate" : variant.get("bitrate", 0),
@@ -193,6 +196,7 @@ class TwitterExtractor(Extractor):
             base = url.rpartition("=")[0] + "="
             files.append(text.nameext_from_url(url, {
                 "url" : base + self._size_image,
+                "type" : mtype,
                 "width" : width,
                 "height" : height,
                 "_fallback" : self._image_fallback(base),
@@ -504,7 +508,11 @@ class TwitterExtractor(Extractor):
         }
 
     def _init_cursor(self):
-        return self.config("cursor") or None
+        cursor = self.config("cursor", True)
+        if not cursor:
+            self._update_cursor = util.identity
+        elif isinstance(cursor, str):
+            return cursor
 
     def _update_cursor(self, cursor):
         self.log.debug("Cursor: %s", cursor)
@@ -560,6 +568,7 @@ class TwitterUserExtractor(TwitterExtractor):
     def items(self):
         base = "{}/{}/".format(self.root, self.user)
         return self._dispatch_extractors((
+            (TwitterInfoExtractor      , base + "info"),
             (TwitterAvatarExtractor    , base + "photo"),
             (TwitterBackgroundExtractor, base + "header_photo"),
             (TwitterTimelineExtractor  , base + "timeline"),
@@ -590,9 +599,16 @@ class TwitterTimelineExtractor(TwitterExtractor):
         return cursor
 
     def tweets(self):
-        self._cursor = cursor = self.config("cursor") or None
         reset = False
 
+        cursor = self.config("cursor", True)
+        if not cursor:
+            self._update_cursor = util.identity
+        elif isinstance(cursor, str):
+            self._cursor = cursor
+        else:
+            cursor = None
+
         if cursor:
             state = cursor.partition("/")[0]
             state, _, tweet_id = state.partition("_")
@@ -1612,6 +1628,9 @@ class TwitterAPI():
                 entries = instr["entries"]
             elif instr_type == "TimelineAddToModule":
                 entries = instr["moduleItems"]
+            elif instr_type == "TimelinePinEntry":
+                if pinned_tweet:
+                    pinned_tweet = instr["entry"]
             elif instr_type == "TimelineReplaceEntry":
                 entry = instr["entry"]
                 if entry["entryId"].startswith("cursor-bottom-"):
@@ -1650,9 +1669,11 @@ class TwitterAPI():
                 tweet = None
 
         if pinned_tweet:
-            pinned_tweet = False
-            if instructions[-1]["type"] == "TimelinePinEntry":
+            if isinstance(pinned_tweet, dict):
+                tweets.append(pinned_tweet)
+            elif instructions[-1]["type"] == "TimelinePinEntry":
                 tweets.append(instructions[-1]["entry"])
+            pinned_tweet = False
 
         for entry in entries:
             esw = entry["entryId"].startswith
diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py
index 9370cfb..7a62e01 100644
--- a/gallery_dl/extractor/wikimedia.py
+++ b/gallery_dl/extractor/wikimedia.py
@@ -17,13 +17,11 @@ class WikimediaExtractor(BaseExtractor):
     """Base class for wikimedia extractors"""
     basecategory = "wikimedia"
     filename_fmt = "{filename} ({sha1[:8]}).{extension}"
-    directory_fmt = ("{category}", "{page}")
     archive_fmt = "{sha1}"
     request_interval = (1.0, 2.0)
 
     def __init__(self, match):
         BaseExtractor.__init__(self, match)
-        path = match.group(match.lastindex)
 
         if self.category == "wikimedia":
             self.category = self.root.split(".")[-2]
@@ -31,31 +29,7 @@ class WikimediaExtractor(BaseExtractor):
             self.category = "{}-{}".format(
                 self.category, self.root.partition(".")[0].rpartition("/")[2])
 
-        if path.startswith("wiki/"):
-            path = path[5:]
-
-        pre, sep, _ = path.partition(":")
-        prefix = pre.lower() if sep else None
-
-        self.title = path = text.unquote(path)
-        if prefix:
-            self.subcategory = prefix
-
-        if prefix == "category":
-            self.params = {
-                "generator": "categorymembers",
-                "gcmtitle" : path,
-                "gcmtype"  : "file",
-            }
-        elif prefix == "file":
-            self.params = {
-                "titles"   : path,
-            }
-        else:
-            self.params = {
-                "generator": "images",
-                "titles"   : path,
-            }
+        self.per_page = self.config("limit", 50)
 
     def _init(self):
         api_path = self.config_instance("api-path")
@@ -67,6 +41,22 @@ class WikimediaExtractor(BaseExtractor):
         else:
             self.api_url = self.root + "/api.php"
 
+    @staticmethod
+    def prepare(image):
+        """Adjust the content of a image object"""
+        image["metadata"] = {
+            m["name"]: m["value"]
+            for m in image["metadata"] or ()}
+        image["commonmetadata"] = {
+            m["name"]: m["value"]
+            for m in image["commonmetadata"] or ()}
+
+        filename = image["canonicaltitle"]
+        image["filename"], _, image["extension"] = \
+            filename.partition(":")[2].rpartition(".")
+        image["date"] = text.parse_datetime(
+            image["timestamp"], "%Y-%m-%dT%H:%M:%SZ")
+
     def items(self):
         for info in self._pagination(self.params):
             try:
@@ -75,20 +65,7 @@ class WikimediaExtractor(BaseExtractor):
                 self.log.debug("Missing 'imageinfo' for %s", info)
                 continue
 
-            image["metadata"] = {
-                m["name"]: m["value"]
-                for m in image["metadata"] or ()}
-            image["commonmetadata"] = {
-                m["name"]: m["value"]
-                for m in image["commonmetadata"] or ()}
-
-            filename = image["canonicaltitle"]
-            image["filename"], _, image["extension"] = \
-                filename.partition(":")[2].rpartition(".")
-            image["date"] = text.parse_datetime(
-                image["timestamp"], "%Y-%m-%dT%H:%M:%SZ")
-            image["page"] = self.title
-
+            self.prepare(image)
             yield Message.Directory, image
             yield Message.Url, image["url"], image
 
@@ -110,6 +87,17 @@ class WikimediaExtractor(BaseExtractor):
         while True:
             data = self.request(url, params=params).json()
 
+            # ref: https://www.mediawiki.org/wiki/API:Errors_and_warnings
+            error = data.get("error")
+            if error:
+                self.log.error("%s: %s", error["code"], error["info"])
+                return
+            # MediaWiki will emit warnings for non-fatal mistakes such as
+            # invalid parameter instead of raising an error
+            warnings = data.get("warnings")
+            if warnings:
+                self.log.debug("MediaWiki returned warnings: %s", warnings)
+
             try:
                 pages = data["query"]["pages"]
             except KeyError:
@@ -181,5 +169,59 @@ BASE_PATTERN = WikimediaExtractor.update({
 class WikimediaArticleExtractor(WikimediaExtractor):
     """Extractor for wikimedia articles"""
     subcategory = "article"
+    directory_fmt = ("{category}", "{page}")
     pattern = BASE_PATTERN + r"/(?!static/)([^?#]+)"
     example = "https://en.wikipedia.org/wiki/TITLE"
+
+    def __init__(self, match):
+        WikimediaExtractor.__init__(self, match)
+
+        path = match.group(match.lastindex)
+        if path.startswith("wiki/"):
+            path = path[5:]
+
+        pre, sep, _ = path.partition(":")
+        prefix = pre.lower() if sep else None
+
+        self.title = path = text.unquote(path)
+        if prefix:
+            self.subcategory = prefix
+
+        if prefix == "category":
+            self.params = {
+                "generator": "categorymembers",
+                "gcmtitle" : path,
+                "gcmtype"  : "file",
+                "gcmlimit" : self.per_page,
+            }
+        elif prefix == "file":
+            self.params = {
+                "titles"   : path,
+            }
+        else:
+            self.params = {
+                "generator": "images",
+                "gimlimit" : self.per_page,
+                "titles"   : path,
+            }
+
+    def prepare(self, image):
+        WikimediaExtractor.prepare(image)
+        image["page"] = self.title
+
+
+class WikimediaWikiExtractor(WikimediaExtractor):
+    """Extractor for all files on a MediaWiki instance"""
+    subcategory = "wiki"
+    pattern = BASE_PATTERN + r"/?$"
+    example = "https://en.wikipedia.org/"
+
+    def __init__(self, match):
+        WikimediaExtractor.__init__(self, match)
+
+        # ref: https://www.mediawiki.org/wiki/API:Allpages
+        self.params = {
+            "generator"   : "allpages",
+            "gapnamespace": 6,  # "File" namespace
+            "gaplimit"    : self.per_page,
+        }
diff --git a/gallery_dl/extractor/ytdl.py b/gallery_dl/extractor/ytdl.py
index cb3c74c..168845e 100644
--- a/gallery_dl/extractor/ytdl.py
+++ b/gallery_dl/extractor/ytdl.py
@@ -116,21 +116,20 @@ class YoutubeDLExtractor(Extractor):
         for entry in entries:
             if not entry:
                 continue
-            elif entry.get("_type") in ("url", "url_transparent"):
+
+            if entry.get("_type") in ("url", "url_transparent"):
                 try:
-                    info_dict = ytdl_instance.extract_info(
+                    entry = ytdl_instance.extract_info(
                         entry["url"], False, ie_key=entry.get("ie_key"))
                 except ytdl_module.utils.YoutubeDLError:
                     continue
-
-                if not info_dict:
+                if not entry:
                     continue
-                elif "entries" in info_dict:
-                    yield from self._process_entries(
-                        ytdl_module, ytdl_instance, info_dict["entries"])
-                else:
-                    yield info_dict
+
+            if "entries" in entry:
+                yield from self._process_entries(
+                    ytdl_module, ytdl_instance, entry["entries"])
             else:
                 yield entry
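
Several of the changes above hinge on options that the extractors read via `config()` lookups (for example bunkr's `tlds` switch, Instagram's `max-posts` and `cursor` handling, and the Wikimedia per-page `limit`). The snippet below is a minimal, hypothetical sketch of setting such options through gallery-dl's Python configuration API; the option names mirror the lookups visible in this diff, while the values and the example URL are illustrative only and should be checked against the 1.27.4 documentation.

```python
# Hypothetical usage sketch -- option names mirror the config() lookups in
# this diff; the values are illustrative, not recommendations.
from gallery_dl import config, job

config.set(("extractor", "bunkr"), "tlds", True)         # match any bunkr TLD
config.set(("extractor", "instagram"), "max-posts", 50)  # stop after 50 posts
config.set(("extractor", "instagram"), "cursor", True)   # keep cursor support
config.set(("extractor", "wikimedia"), "limit", 50)      # API results per page

# Run a download job for one of the example URLs used in the diff
job.DownloadJob("https://bunkr.si/a/ID").run()
```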
