Diffstat (limited to 'gallery_dl')
24 files changed, 376 insertions, 460 deletions
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index a344fe4..fa56bfb 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -24,7 +24,6 @@ modules = [
     "artstation",
     "aryion",
     "bbc",
-    "bcy",
     "behance",
     "blogger",
     "bunkr",
@@ -85,7 +84,6 @@ modules = [
     "lensdump",
     "lexica",
     "lightroom",
-    "lineblog",
     "livedoor",
     "luscious",
     "lynxchan",
diff --git a/gallery_dl/extractor/bcy.py b/gallery_dl/extractor/bcy.py
deleted file mode 100644
index d6adb4e..0000000
--- a/gallery_dl/extractor/bcy.py
+++ /dev/null
@@ -1,206 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2020-2023 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://bcy.net/"""
-
-from .common import Extractor, Message
-from .. import text, util, exception
-import re
-
-
-class BcyExtractor(Extractor):
-    """Base class for bcy extractors"""
-    category = "bcy"
-    directory_fmt = ("{category}", "{user[id]} {user[name]}")
-    filename_fmt = "{post[id]} {id}.{extension}"
-    archive_fmt = "{post[id]}_{id}"
-    root = "https://bcy.net"
-
-    def __init__(self, match):
-        Extractor.__init__(self, match)
-        self.item_id = match.group(1)
-        self.session.headers["Referer"] = self.root + "/"
-
-    def items(self):
-        sub = re.compile(r"^https?://p\d+-bcy"
-                         r"(?:-sign\.bcyimg\.com|\.byteimg\.com/img)"
-                         r"/banciyuan").sub
-        iroot = "https://img-bcy-qn.pstatp.com"
-        noop = self.config("noop")
-
-        for post in self.posts():
-            if not post["image_list"]:
-                continue
-
-            multi = None
-            tags = post.get("post_tags") or ()
-            data = {
-                "user": {
-                    "id"     : post["uid"],
-                    "name"   : post["uname"],
-                    "avatar" : sub(iroot, post["avatar"].partition("~")[0]),
-                },
-                "post": {
-                    "id"     : text.parse_int(post["item_id"]),
-                    "tags"   : [t["tag_name"] for t in tags],
-                    "date"   : text.parse_timestamp(post["ctime"]),
-                    "parody" : post["work"],
-                    "content": post["plain"],
-                    "likes"  : post["like_count"],
-                    "shares" : post["share_count"],
-                    "replies": post["reply_count"],
-                },
-            }
-
-            yield Message.Directory, data
-            for data["num"], image in enumerate(post["image_list"], 1):
-                data["id"] = image["mid"]
-                data["width"] = image["w"]
-                data["height"] = image["h"]
-
-                url = image["path"].partition("~")[0]
-                text.nameext_from_url(url, data)
-
-                # full-resolution image without watermark
-                if data["extension"]:
-                    if not url.startswith(iroot):
-                        url = sub(iroot, url)
-                    data["filter"] = ""
-                    yield Message.Url, url, data
-
-                # watermarked image & low quality noop filter
-                else:
-                    if multi is None:
-                        multi = self._data_from_post(
-                            post["item_id"])["post_data"]["multi"]
-                    image = multi[data["num"] - 1]
-
-                    if image["origin"]:
-                        data["filter"] = "watermark"
-                        yield Message.Url, image["origin"], data
-
-                    if noop:
-                        data["extension"] = ""
-                        data["filter"] = "noop"
-                        yield Message.Url, image["original_path"], data
-
-    def posts(self):
-        """Returns an iterable with all relevant 'post' objects"""
-
-    def _data_from_post(self, post_id):
-        url = "{}/item/detail/{}".format(self.root, post_id)
-        page = self.request(url, notfound="post").text
-        data = (text.extr(page, 'JSON.parse("', '");')
-                .replace('\\\\u002F', '/')
-                .replace('\\"', '"'))
-        try:
-            return util.json_loads(data)["detail"]
-        except ValueError:
-            return util.json_loads(data.replace('\\"', '"'))["detail"]
-
-
-class BcyUserExtractor(BcyExtractor):
-    """Extractor for user timelines"""
-    subcategory = "user"
pattern = r"(?:https?://)?bcy\.net/u/(\d+)" - test = ( - ("https://bcy.net/u/1933712", { - "pattern": r"https://img-bcy-qn.pstatp.com/\w+/\d+/post/\w+/.+jpg", - "count": ">= 20", - }), - ("https://bcy.net/u/109282764041", { - "pattern": r"https://p\d-bcy-sign\.bcyimg\.com/banciyuan/[0-9a-f]+" - r"~tplv-bcyx-yuan-logo-v1:.+\.image", - "range": "1-25", - "count": 25, - }), - ) - - def posts(self): - url = self.root + "/apiv3/user/selfPosts" - params = {"uid": self.item_id, "since": None} - - while True: - data = self.request(url, params=params).json() - - try: - items = data["data"]["items"] - except KeyError: - return - if not items: - return - - for item in items: - yield item["item_detail"] - params["since"] = item["since"] - - -class BcyPostExtractor(BcyExtractor): - """Extractor for individual posts""" - subcategory = "post" - pattern = r"(?:https?://)?bcy\.net/item/detail/(\d+)" - test = ( - ("https://bcy.net/item/detail/6355835481002893070", { - "url": "301202375e61fd6e0e2e35de6c3ac9f74885dec3", - "count": 1, - "keyword": { - "user": { - "id" : 1933712, - "name" : "wukloo", - "avatar" : "re:https://img-bcy-qn.pstatp.com/Public/", - }, - "post": { - "id" : 6355835481002893070, - "tags" : list, - "date" : "dt:2016-11-22 08:47:46", - "parody" : "东方PROJECT", - "content": "re:根据微博的建议稍微做了点修改", - "likes" : int, - "shares" : int, - "replies": int, - }, - "id": 8330182, - "num": 1, - "width" : 3000, - "height": 1687, - "filename": "712e0780b09011e696f973c3d1568337", - "extension": "jpg", - }, - }), - # only watermarked images available - ("https://bcy.net/item/detail/6950136331708144648", { - "pattern": r"https://p\d-bcy-sign\.bcyimg\.com/banciyuan/[0-9a-f]+" - r"~tplv-bcyx-yuan-logo-v1:.+\.image", - "count": 10, - "keyword": {"filter": "watermark"}, - }), - # deleted - ("https://bcy.net/item/detail/6780546160802143237", { - "exception": exception.NotFoundError, - "count": 0, - }), - # only visible to logged in users - ("https://bcy.net/item/detail/6747523535150783495", { - "count": 0, - }), - # JSON decode error (#3321) - ("https://bcy.net/item/detail/7166939271872388110", { - "count": 0, - }), - ) - - def posts(self): - try: - data = self._data_from_post(self.item_id) - except KeyError: - return () - post = data["post_data"] - post["image_list"] = post["multi"] - post["plain"] = text.parse_unicode_escapes(post["plain"]) - post.update(data["detail_user"]) - return (post,) diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 5c8c530..35b2752 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -52,6 +52,11 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): "num": int, }, }), + # cdn12 .ru TLD (#4147) + ("https://bunkrr.su/a/j1G29CnD", { + "pattern": r"https://(cdn12.bunkr.ru|media-files12.bunkr.la)/\w+", + "count": 8, + }), ("https://bunkrr.su/a/Lktg9Keq"), ("https://bunkr.la/a/Lktg9Keq"), ("https://bunkr.su/a/Lktg9Keq"), @@ -87,10 +92,12 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): url = text.unescape(url) if url.endswith((".mp4", ".m4v", ".mov", ".webm", ".mkv", ".ts", ".zip", ".rar", ".7z")): - append({"file": url.replace("://cdn", "://media-files", 1), - "_http_headers": headers}) - else: - append({"file": url}) + if url.startswith("https://cdn12."): + url = ("https://media-files12.bunkr.la" + + url[url.find("/", 14):]) + else: + url = url.replace("://cdn", "://media-files", 1) + append({"file": url, "_http_headers": headers}) return files, { "album_id" : self.album_id, diff --git a/gallery_dl/extractor/common.py 
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 50d1026..5c9b157 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -286,7 +286,7 @@ class Extractor():
         useragent = self.config("user-agent")
         if useragent is None:
             useragent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; "
-                         "rv:102.0) Gecko/20100101 Firefox/102.0")
+                         "rv:115.0) Gecko/20100101 Firefox/115.0")
         elif useragent == "browser":
             useragent = _browser_useragent()
         headers["User-Agent"] = useragent
@@ -805,8 +805,8 @@ _browser_cookies = {}

 HTTP_HEADERS = {
     "firefox": (
-        ("User-Agent", "Mozilla/5.0 ({}; rv:102.0) "
-                       "Gecko/20100101 Firefox/102.0"),
+        ("User-Agent", "Mozilla/5.0 ({}; rv:115.0) "
+                       "Gecko/20100101 Firefox/115.0"),
         ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,"
                    "image/avif,image/webp,*/*;q=0.8"),
         ("Accept-Language", "en-US,en;q=0.5"),
diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py
index 03307f8..709bc57 100644
--- a/gallery_dl/extractor/erome.py
+++ b/gallery_dl/extractor/erome.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2021-2022 Mike Fährmann
+# Copyright 2021-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -80,7 +80,7 @@ class EromeExtractor(Extractor):
         for params["page"] in itertools.count(1):
             page = self.request(url, params=params).text

-            album_ids = EromeAlbumExtractor.pattern.findall(page)
+            album_ids = EromeAlbumExtractor.pattern.findall(page)[::2]
             yield from album_ids

            if len(album_ids) < 36:
diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py
index 35c4cc4..f92b904 100644
--- a/gallery_dl/extractor/fantia.py
+++ b/gallery_dl/extractor/fantia.py
@@ -23,6 +23,7 @@ class FantiaExtractor(Extractor):
         self.headers = {
             "Accept" : "application/json, text/plain, */*",
             "Referer": self.root,
+            "X-Requested-With": "XMLHttpRequest",
         }
         _empty_plan = {
             "id"   : 0,
@@ -68,7 +69,8 @@ class FantiaExtractor(Extractor):

     def _pagination(self, url):
         params = {"page": 1}
-        headers = self.headers
+        headers = self.headers.copy()
+        del headers["X-Requested-With"]

         while True:
             page = self.request(url, params=params, headers=headers).text
diff --git a/gallery_dl/extractor/gelbooru_v01.py b/gallery_dl/extractor/gelbooru_v01.py
index c4f32a4..b6fbcb6 100644
--- a/gallery_dl/extractor/gelbooru_v01.py
+++ b/gallery_dl/extractor/gelbooru_v01.py
@@ -27,7 +27,7 @@ class GelbooruV01Extractor(booru.BooruExtractor):
             "uploader" : extr('By: ', ' <'),
             "width"    : extr('Size: ', 'x'),
             "height"   : extr('', ' <'),
-            "source"   : extr('Source: <a href="', '"'),
+            "source"   : extr('Source: ', ' <'),
             "rating"   : (extr('Rating: ', '<') or "?")[0].lower(),
             "score"    : extr('Score: ', ' <'),
             "file_url" : extr('<img alt="img" src="', '"'),
@@ -78,9 +78,9 @@ BASE_PATTERN = GelbooruV01Extractor.update({
         "root": "https://drawfriends.booru.org",
         "pattern": r"drawfriends\.booru\.org",
     },
-    "vidyart": {
-        "root": "https://vidyart.booru.org",
-        "pattern": r"vidyart\.booru\.org",
+    "vidyart2": {
+        "root": "https://vidyart2.booru.org",
+        "pattern": r"vidyart2\.booru\.org",
     },
 })
@@ -106,7 +106,7 @@ class GelbooruV01TagExtractor(GelbooruV01Extractor):
             "count": 25,
         }),
         ("https://drawfriends.booru.org/index.php?page=post&s=list&tags=all"),
-        ("https://vidyart.booru.org/index.php?page=post&s=list&tags=all"),
+        ("https://vidyart2.booru.org/index.php?page=post&s=list&tags=all"),
     )

     def __init__(self, match):
@@ -141,7 +141,7 @@ class GelbooruV01FavoriteExtractor(GelbooruV01Extractor):
             "count": 4,
         }),
         ("https://drawfriends.booru.org/index.php?page=favorites&s=view&id=1"),
-        ("https://vidyart.booru.org/index.php?page=favorites&s=view&id=1"),
+        ("https://vidyart2.booru.org/index.php?page=favorites&s=view&id=1"),
     )

     def __init__(self, match):
@@ -193,7 +193,7 @@ class GelbooruV01PostExtractor(GelbooruV01Extractor):
             },
         }),
         ("https://drawfriends.booru.org/index.php?page=post&s=view&id=107474"),
-        ("https://vidyart.booru.org/index.php?page=post&s=view&id=383111"),
+        ("https://vidyart2.booru.org/index.php?page=post&s=view&id=39168"),
     )

     def __init__(self, match):
diff --git a/gallery_dl/extractor/gfycat.py b/gallery_dl/extractor/gfycat.py
index 0ccd7fa..ccebdf9 100644
--- a/gallery_dl/extractor/gfycat.py
+++ b/gallery_dl/extractor/gfycat.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2017-2022 Mike Fährmann
+# Copyright 2017-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -10,6 +10,7 @@

 from .common import Extractor, Message
 from .. import text, exception
+from ..cache import cache


 class GfycatExtractor(Extractor):
@@ -80,6 +81,8 @@ class GfycatUserExtractor(GfycatExtractor):
     })

     def gfycats(self):
+        if self.key == "me":
+            return GfycatAPI(self).me()
         return GfycatAPI(self).user(self.key)


@@ -219,15 +222,8 @@ class GfycatAPI():

     def __init__(self, extractor):
         self.extractor = extractor
-
-    def gfycat(self, gfycat_id):
-        endpoint = "/v1/gfycats/" + gfycat_id
-        return self._call(endpoint)["gfyItem"]
-
-    def user(self, user):
-        endpoint = "/v1/users/{}/gfycats".format(user.lower())
-        params = {"count": 100}
-        return self._pagination(endpoint, params)
+        self.headers = {}
+        self.username, self.password = extractor._get_auth_info()

     def collection(self, user, collection):
         endpoint = "/v1/users/{}/collections/{}/gfycats".format(
@@ -240,14 +236,64 @@ class GfycatAPI():
         params = {"count": 100}
         return self._pagination(endpoint, params, "gfyCollections")

+    def gfycat(self, gfycat_id):
+        endpoint = "/v1/gfycats/" + gfycat_id
+        return self._call(endpoint)["gfyItem"]
+
+    def me(self):
+        endpoint = "/v1/me/gfycats"
+        params = {"count": 100}
+        return self._pagination(endpoint, params)
+
     def search(self, query):
         endpoint = "/v1/gfycats/search"
         params = {"search_text": query, "count": 150}
         return self._pagination(endpoint, params)

+    def user(self, user):
+        endpoint = "/v1/users/{}/gfycats".format(user.lower())
+        params = {"count": 100}
+        return self._pagination(endpoint, params)
+
+    def authenticate(self):
+        self.headers["Authorization"] = \
+            self._authenticate_impl(self.username, self.password)
+
+    @cache(maxage=3600, keyarg=1)
+    def _authenticate_impl(self, username, password):
+        self.extractor.log.info("Logging in as %s", username)
+
+        url = "https://weblogin.gfycat.com/oauth/webtoken"
+        headers = {"Origin": "https://gfycat.com"}
+        data = {
+            "access_key": "Anr96uuqt9EdamSCwK4txKPjMsf2"
+                          "M95Rfa5FLLhPFucu8H5HTzeutyAa",
+        }
+        response = self.extractor.request(
+            url, method="POST", headers=headers, json=data).json()
+
+        url = "https://weblogin.gfycat.com/oauth/weblogin"
+        headers["authorization"] = "Bearer " + response["access_token"]
+        data = {
+            "grant_type": "password",
+            "username"  : username,
+            "password"  : password,
+        }
+        response = self.extractor.request(
+            url, method="POST", headers=headers, json=data, fatal=None).json()
+
+        if "errorMessage" in response:
+            raise exception.AuthenticationError(
+                response["errorMessage"]["description"])
+        return "Bearer " + response["access_token"]
+
     def _call(self, endpoint, params=None):
+        if self.username:
+            self.authenticate()
+
         url = self.API_ROOT + endpoint
-        return self.extractor.request(url, params=params).json()
+        return self.extractor.request(
+            url, params=params, headers=self.headers).json()

     def _pagination(self, endpoint, params, key="gfycats"):
         while True:
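Note on the gfycat.py change: login now goes through gfycat's two-step weblogin flow, and the resulting "Bearer" token is memoized with gallery-dl's @cache decorator (maxage=3600, keyed on the username), so repeated calls within an hour reuse the token instead of logging in again. A rough standard-library stand-in for that decorator, to show the idea (not gallery-dl's actual implementation):

    import functools
    import time

    def timed_cache(maxage):
        """Cache one token per username for `maxage` seconds
        (a stand-in for gallery-dl's @cache(maxage=3600, keyarg=1))."""
        def decorator(func):
            store = {}

            @functools.wraps(func)
            def wrapper(self, username, password):
                token, expires = store.get(username, (None, 0.0))
                if time.time() >= expires:
                    token = func(self, username, password)
                    store[username] = (token, time.time() + maxage)
                return token
            return wrapper
        return decorator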
response["errorMessage"]["description"]) + return "Bearer " + response["access_token"] + def _call(self, endpoint, params=None): + if self.username: + self.authenticate() + url = self.API_ROOT + endpoint - return self.extractor.request(url, params=params).json() + return self.extractor.request( + url, params=params, headers=self.headers).json() def _pagination(self, endpoint, params, key="gfycats"): while True: diff --git a/gallery_dl/extractor/jpgfish.py b/gallery_dl/extractor/jpgfish.py index b8d425a..39208e5 100644 --- a/gallery_dl/extractor/jpgfish.py +++ b/gallery_dl/extractor/jpgfish.py @@ -4,18 +4,18 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://jpg.pet/""" +"""Extractors for https://jpeg.pet/""" from .common import Extractor, Message from .. import text -BASE_PATTERN = r"(?:https?://)?jpg\.(?:pet|fish(?:ing)?|church)" +BASE_PATTERN = r"(?:https?://)?jpe?g\.(?:pet|fish(?:ing)?|church)" class JpgfishExtractor(Extractor): """Base class for jpgfish extractors""" category = "jpgfish" - root = "https://jpg.pet" + root = "https://jpeg.pet" directory_fmt = ("{category}", "{user}", "{album}",) archive_fmt = "{id}" @@ -36,7 +36,7 @@ class JpgfishImageExtractor(JpgfishExtractor): subcategory = "image" pattern = BASE_PATTERN + r"/img/((?:[^/?#]+\.)?(\w+))" test = ( - ("https://jpg.pet/img/funnymeme.LecXGS", { + ("https://jpeg.pet/img/funnymeme.LecXGS", { "pattern": r"https://simp3\.jpg\.church/images/funnymeme\.jpg", "content": "098e5e9b17ad634358426e0ffd1c93871474d13c", "keyword": { @@ -52,6 +52,7 @@ class JpgfishImageExtractor(JpgfishExtractor): "pattern": r"https://simp2\.jpg\.church/hannahowo_00457\.jpg", "keyword": {"album": "401-500"}, }), + ("https://jpg.pet/img/funnymeme.LecXGS"), ("https://jpg.fishing/img/funnymeme.LecXGS"), ("https://jpg.fish/img/funnymeme.LecXGS"), ("https://jpg.church/img/funnymeme.LecXGS"), @@ -83,7 +84,7 @@ class JpgfishAlbumExtractor(JpgfishExtractor): subcategory = "album" pattern = BASE_PATTERN + r"/a(?:lbum)?/([^/?#]+)(/sub)?" test = ( - ("https://jpg.pet/album/CDilP/?sort=date_desc&page=1", { + ("https://jpeg.pet/album/CDilP/?sort=date_desc&page=1", { "count": 2, }), ("https://jpg.fishing/a/gunggingnsk.N9OOI", { @@ -95,6 +96,7 @@ class JpgfishAlbumExtractor(JpgfishExtractor): ("https://jpg.church/a/hannahowo.aNTdH/sub", { "count": 606, }), + ("https://jpg.pet/album/CDilP/?sort=date_desc&page=1"), ) def __init__(self, match): @@ -120,12 +122,13 @@ class JpgfishUserExtractor(JpgfishExtractor): subcategory = "user" pattern = BASE_PATTERN + r"/(?!img|a(?:lbum)?)([^/?#]+)(/albums)?" test = ( - ("https://jpg.pet/exearco", { + ("https://jpeg.pet/exearco", { "count": 3, }), ("https://jpg.church/exearco/albums", { "count": 1, }), + ("https://jpg.pet/exearco"), ("https://jpg.fishing/exearco"), ("https://jpg.fish/exearco"), ("https://jpg.church/exearco"), diff --git a/gallery_dl/extractor/lineblog.py b/gallery_dl/extractor/lineblog.py deleted file mode 100644 index adb27a8..0000000 --- a/gallery_dl/extractor/lineblog.py +++ /dev/null @@ -1,73 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2019-2020 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://www.lineblog.me/""" - -from .livedoor import LivedoorBlogExtractor, LivedoorPostExtractor -from .. 
diff --git a/gallery_dl/extractor/lineblog.py b/gallery_dl/extractor/lineblog.py
deleted file mode 100644
index adb27a8..0000000
--- a/gallery_dl/extractor/lineblog.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2019-2020 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://www.lineblog.me/"""
-
-from .livedoor import LivedoorBlogExtractor, LivedoorPostExtractor
-from .. import text
-
-
-class LineblogBase():
-    """Base class for lineblog extractors"""
-    category = "lineblog"
-    root = "https://lineblog.me"
-
-    def _images(self, post):
-        imgs = []
-        body = post.pop("body")
-
-        for num, img in enumerate(text.extract_iter(body, "<img ", ">"), 1):
-            src = text.extr(img, 'src="', '"')
-            alt = text.extr(img, 'alt="', '"')
-
-            if not src:
-                continue
-            if src.startswith("https://obs.line-scdn.") and src.count("/") > 3:
-                src = src.rpartition("/")[0]
-
-            imgs.append(text.nameext_from_url(alt or src, {
-                "url" : src,
-                "num" : num,
-                "hash": src.rpartition("/")[2],
-                "post": post,
-            }))
-
-        return imgs
-
-
-class LineblogBlogExtractor(LineblogBase, LivedoorBlogExtractor):
-    """Extractor for a user's blog on lineblog.me"""
-    pattern = r"(?:https?://)?lineblog\.me/(\w+)/?(?:$|[?#])"
-    test = ("https://lineblog.me/mamoru_miyano/", {
-        "range": "1-20",
-        "count": 20,
-        "pattern": r"https://obs.line-scdn.net/[\w-]+$",
-        "keyword": {
-            "post": {
-                "categories" : tuple,
-                "date"       : "type:datetime",
-                "description": str,
-                "id"         : int,
-                "tags"       : list,
-                "title"      : str,
-                "user"       : "mamoru_miyano"
-            },
-            "filename": str,
-            "hash"    : r"re:\w{32,}",
-            "num"     : int,
-        },
-    })
-
-
-class LineblogPostExtractor(LineblogBase, LivedoorPostExtractor):
-    """Extractor for blog posts on lineblog.me"""
-    pattern = r"(?:https?://)?lineblog\.me/(\w+)/archives/(\d+)"
-    test = ("https://lineblog.me/mamoru_miyano/archives/1919150.html", {
-        "url": "24afeb4044c554f80c374b52bf8109c6f1c0c757",
-        "keyword": "76a38e2c0074926bd3362f66f9fc0e6c41591dcb",
-    })
diff --git a/gallery_dl/extractor/mangaread.py b/gallery_dl/extractor/mangaread.py
index 49d4d7d..74c239e 100644
--- a/gallery_dl/extractor/mangaread.py
+++ b/gallery_dl/extractor/mangaread.py
@@ -87,7 +87,8 @@ class MangareadChapterExtractor(MangareadBase, ChapterExtractor):
     )

     def metadata(self, page):
-        data = {"tags": list(text.extract_iter(page, "class>", "<"))}
+        tags = text.extr(page, 'class="wp-manga-tags-list">', '</div>')
+        data = {"tags": list(text.split_html(tags)[::2])}
         info = text.extr(page, '<h1 id="chapter-heading">', "</h1>")
         if not info:
             raise exception.NotFoundError("chapter")
@@ -148,7 +149,7 @@ class MangareadMangaExtractor(MangareadBase, MangaExtractor):
             }
         }),
         ("https://www.mangaread.org/manga/doesnotexist", {
-            "exception": exception.NotFoundError,
+            "exception": exception.HttpError,
         }),
     )
diff --git a/gallery_dl/extractor/naverwebtoon.py b/gallery_dl/extractor/naverwebtoon.py
index d6292af..cafe4f7 100644
--- a/gallery_dl/extractor/naverwebtoon.py
+++ b/gallery_dl/extractor/naverwebtoon.py
@@ -91,7 +91,7 @@ class NaverwebtoonEpisodeExtractor(NaverwebtoonBase, GalleryExtractor):
         return {
             "title_id": self.title_id,
             "episode" : self.episode,
-            "comic"   : extr("titleName: '", "'"),
+            "comic"   : extr('titleName: "', '"'),
             "tags"    : [t.strip() for t in text.extract_iter(
                 extr("tagList: [", "}],"), '"tagName":"', '"')],
             "title"   : extr('"subtitle":"', '"'),
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index 5d100a4..e047f3d 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -87,14 +87,15 @@ class NewgroundsExtractor(Extractor):
         if response.history and response.url.endswith("/social"):
             return self.session.cookies

+        page = response.text
         headers = {"Origin": self.root, "Referer": url}
-        url = text.urljoin(self.root, text.extr(
-            response.text, 'action="', '"'))
+        url = text.urljoin(self.root, text.extr(page, 'action="', '"'))
         data = {
             "username": username,
             "password": password,
             "remember": "1",
             "login"   : "1",
+            "auth"    : text.extr(page, 'name="auth" value="', '"'),
         }

         response = self.request(url, method="POST", headers=headers, data=data)
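Note on the newgrounds.py change: the login form now carries a hidden "auth" value (presumably a CSRF-style token) that has to be read from the form page and echoed back in the POST body. A generic requests sketch of that pattern, with field names taken from the diff and everything else illustrative:

    import re
    import requests

    def login(session, form_url, username, password):
        # fetch the form page first to obtain the hidden token
        page = session.get(form_url).text
        auth = re.search(r'name="auth" value="([^"]*)"', page)
        action = re.search(r'action="([^"]*)"', page)
        data = {
            "username": username,
            "password": password,
            "remember": "1",
            "login"   : "1",
            # echo the hidden value back; empty if it is missing
            "auth"    : auth.group(1) if auth else "",
        }
        return session.post(action.group(1), data=data)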
"password": password, "remember": "1", "login" : "1", + "auth" : text.extr(page, 'name="auth" value="', '"'), } response = self.request(url, method="POST", headers=headers, data=data) diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py index f0a50c8..1fa571c 100644 --- a/gallery_dl/extractor/paheal.py +++ b/gallery_dl/extractor/paheal.py @@ -55,8 +55,8 @@ class PahealExtractor(Extractor): "class='username' href='/user/", "'")), "date" : text.parse_datetime( extr("datetime='", "'"), "%Y-%m-%dT%H:%M:%S%z"), - "source" : text.extract( - extr(">Source Link<", "</td>"), "href='", "'")[0], + "source" : text.unescape(text.extr( + extr(">Source Link<", "</td>"), "href='", "'")), } dimensions, size, ext = extr("Info</th><td>", ">").split(" // ") @@ -74,10 +74,34 @@ class PahealTagExtractor(PahealExtractor): directory_fmt = ("{category}", "{search_tags}") pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net" r"/post/list/([^/?#]+)") - test = ("https://rule34.paheal.net/post/list/Ayane_Suzuki/1", { - "pattern": r"https://[^.]+\.paheal\.net/_images/\w+/\d+%20-%20", - "count": ">= 15" - }) + test = ( + ("https://rule34.paheal.net/post/list/Ayane_Suzuki/1", { + "pattern": r"https://[^.]+\.paheal\.net/_images/\w+/\d+%20-%20", + "count": ">= 15" + }), + ("https://rule34.paheal.net/post/list/Ayane_Suzuki/1", { + "range": "1", + "options": (("metadata", True),), + "keyword": { + "date": "dt:2018-01-07 07:04:05", + "duration": 0.0, + "extension": "jpg", + "filename": "2446128 - Ayane_Suzuki Idolmaster " + "idolmaster_dearly_stars Zanzi", + "height": 768, + "id": 2446128, + "md5": "b0ceda9d860df1d15b60293a7eb465c1", + "search_tags": "Ayane_Suzuki", + "size": 205312, + "source": "https://www.pixiv.net/member_illust.php" + "?mode=medium&illust_id=19957280", + "tags": "Ayane_Suzuki Idolmaster " + "idolmaster_dearly_stars Zanzi", + "uploader": "XXXname", + "width": 1024, + }, + }), + ) per_page = 70 def __init__(self, match): @@ -96,8 +120,9 @@ class PahealTagExtractor(PahealExtractor): url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum) page = self.request(url).text + pos = page.find("id='image-list'") for post in text.extract_iter( - page, '<img id="thumb_', 'Only</a>'): + page, "<img id='thumb_", "Only</a>", pos): yield self._extract_data(post) if ">Next<" not in page: @@ -106,10 +131,10 @@ class PahealTagExtractor(PahealExtractor): @staticmethod def _extract_data(post): - pid , pos = text.extract(post, '', '"') - data, pos = text.extract(post, 'title="', '"', pos) - md5 , pos = text.extract(post, '/_thumbs/', '/', pos) - url , pos = text.extract(post, '<a href="', '"', pos) + pid , pos = text.extract(post, "", "'") + data, pos = text.extract(post, "title='", "'", pos) + md5 , pos = text.extract(post, "/_thumbs/", "/", pos) + url , pos = text.extract(post, "<a href='", "'", pos) tags, data, date = data.split("\n") dimensions, size, ext = data.split(" // ") @@ -126,7 +151,7 @@ class PahealTagExtractor(PahealExtractor): } def _extract_data_ex(self, post): - pid = post[:post.index('"')] + pid = post[:post.index("'")] return self._extract_post(pid) @@ -139,19 +164,19 @@ class PahealPostExtractor(PahealExtractor): ("https://rule34.paheal.net/post/view/481609", { "pattern": r"https://tulip\.paheal\.net/_images" r"/bbdc1c33410c2cdce7556c7990be26b7/481609%20-%20" - r"Azumanga_Daioh%20Osaka%20Vuvuzela%20inanimate\.jpg", + r"Azumanga_Daioh%20inanimate%20Osaka%20Vuvuzela\.jpg", "content": "7b924bcf150b352ac75c9d281d061e174c851a11", "keyword": { "date": "dt:2010-06-17 
15:40:23", "extension": "jpg", "file_url": "re:https://tulip.paheal.net/_images/bbdc1c33410c", - "filename": "481609 - Azumanga_Daioh Osaka Vuvuzela inanimate", + "filename": "481609 - Azumanga_Daioh inanimate Osaka Vuvuzela", "height": 660, "id": 481609, "md5": "bbdc1c33410c2cdce7556c7990be26b7", "size": 157389, - "source": None, - "tags": "Azumanga_Daioh Osaka Vuvuzela inanimate", + "source": "", + "tags": "Azumanga_Daioh inanimate Osaka Vuvuzela", "uploader": "CaptainButtface", "width": 614, }, @@ -163,7 +188,7 @@ class PahealPostExtractor(PahealExtractor): "md5": "b39edfe455a0381110c710d6ed2ef57d", "size": 758989, "source": "http://www.furaffinity.net/view/4057821/", - "tags": "Vuvuzela inanimate thelost-dragon", + "tags": "inanimate thelost-dragon Vuvuzela", "uploader": "leacheate_soup", "width": 1200, }, @@ -171,8 +196,8 @@ class PahealPostExtractor(PahealExtractor): # video ("https://rule34.paheal.net/post/view/3864982", { "pattern": r"https://[\w]+\.paheal\.net/_images/7629fc0ff77e32637d" - r"de5bf4f992b2cb/3864982%20-%20Metal_Gear%20Metal_Gear_" - r"Solid_V%20Quiet%20Vg_erotica%20animated%20webm\.webm", + r"de5bf4f992b2cb/3864982%20-%20animated%20Metal_Gear%20" + r"Metal_Gear_Solid_V%20Quiet%20Vg_erotica%20webm\.webm", "keyword": { "date": "dt:2020-09-06 01:59:03", "duration": 30.0, @@ -183,8 +208,8 @@ class PahealPostExtractor(PahealExtractor): "size": 18454938, "source": "https://twitter.com/VG_Worklog" "/status/1302407696294055936", - "tags": "Metal_Gear Metal_Gear_Solid_V Quiet " - "Vg_erotica animated webm", + "tags": "animated Metal_Gear Metal_Gear_Solid_V " + "Quiet Vg_erotica webm", "uploader": "justausername", "width": 1768, }, diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py index df85b96..e718828 100644 --- a/gallery_dl/extractor/philomena.py +++ b/gallery_dl/extractor/philomena.py @@ -19,39 +19,19 @@ class PhilomenaExtractor(BooruExtractor): filename_fmt = "{filename}.{extension}" archive_fmt = "{id}" request_interval = 1.0 + page_start = 1 per_page = 50 + def __init__(self, match): + BooruExtractor.__init__(self, match) + self.api = PhilomenaAPI(self) + _file_url = operator.itemgetter("view_url") @staticmethod def _prepare(post): post["date"] = text.parse_datetime(post["created_at"]) - def _pagination(self, url, params): - params["page"] = 1 - params["per_page"] = self.per_page - - api_key = self.config("api-key") - if api_key: - params["key"] = api_key - - filter_id = self.config("filter") - if filter_id: - params["filter_id"] = filter_id - elif not api_key: - try: - params["filter_id"] = INSTANCES[self.category]["filter_id"] - except (KeyError, TypeError): - params["filter_id"] = "2" - - while True: - data = self.request(url, params=params).json() - yield from data["images"] - - if len(data["images"]) < self.per_page: - return - params["page"] += 1 - INSTANCES = { "derpibooru": { @@ -146,8 +126,7 @@ class PhilomenaPostExtractor(PhilomenaExtractor): self.image_id = match.group(match.lastindex) def posts(self): - url = self.root + "/api/v1/json/images/" + self.image_id - return (self.request(url).json()["image"],) + return (self.api.image(self.image_id),) class PhilomenaSearchExtractor(PhilomenaExtractor): @@ -201,8 +180,7 @@ class PhilomenaSearchExtractor(PhilomenaExtractor): return {"search_tags": self.params.get("q", "")} def posts(self): - url = self.root + "/api/v1/json/search/images" - return self._pagination(url, self.params) + return self.api.search(self.params) class PhilomenaGalleryExtractor(PhilomenaExtractor): @@ -239,15 
diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py
index f19e33c..fa4efa0 100644
--- a/gallery_dl/extractor/pornhub.py
+++ b/gallery_dl/extractor/pornhub.py
@@ -58,6 +58,9 @@ class PornhubGalleryExtractor(PornhubExtractor):
         self._first = None

     def items(self):
+        self.session.cookies.set(
+            "accessAgeDisclaimerPH", "1", domain=".pornhub.com")
+
         data = self.metadata()
         yield Message.Directory, data
         for num, image in enumerate(self.images(), 1):
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index 9a57dcf..54b162b 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -20,6 +20,7 @@ class RedditExtractor(Extractor):
     filename_fmt = "{id}{num:? //>02} {title[:220]}.{extension}"
     archive_fmt = "{filename}"
     cookiedomain = ".reddit.com"
+    request_interval = 0.6

     def items(self):
         self.api = RedditAPI(self)
@@ -377,6 +378,18 @@ class RedditAPI():
         self.client_id = client_id
         self.headers = {"User-Agent": config("user-agent")}

+        if self.client_id == self.CLIENT_ID:
+            client_id = self.client_id
+            self._warn_429 = True
+            kind = "default"
+        else:
+            client_id = client_id[:5] + "*" * (len(client_id)-5)
+            self._warn_429 = False
+            kind = "custom"
+
+        self.log.debug(
+            "Using %s API credentials (client-id %s)", kind, client_id)
+
         token = config("refresh-token")
         if token is None or token == "cache":
             key = "#" + self.client_id
@@ -463,28 +476,39 @@ class RedditAPI():
     def _call(self, endpoint, params):
         url = "https://oauth.reddit.com" + endpoint
         params["raw_json"] = "1"
-        self.authenticate()
-        response = self.extractor.request(
-            url, params=params, headers=self.headers, fatal=None)

-        remaining = response.headers.get("x-ratelimit-remaining")
-        if remaining and float(remaining) < 2:
-            self.extractor.wait(seconds=response.headers["x-ratelimit-reset"])
-            return self._call(endpoint, params)
+        while True:
+            self.authenticate()
+            response = self.extractor.request(
+                url, params=params, headers=self.headers, fatal=None)
+
+            remaining = response.headers.get("x-ratelimit-remaining")
+            if remaining and float(remaining) < 2:
+                if self._warn_429:
+                    self._warn_429 = False
+                    self.log.info(
+                        "Register your own OAuth application and use its "
+                        "credentials to prevent this error: "
+                        "https://github.com/mikf/gallery-dl/blob/master"
+                        "/docs/configuration.rst"
+                        "#extractorredditclient-id--user-agent")
+                self.extractor.wait(
+                    seconds=response.headers["x-ratelimit-reset"])
+                continue

-        try:
-            data = response.json()
-        except ValueError:
-            raise exception.StopExtraction(text.remove_html(response.text))
-
-        if "error" in data:
-            if data["error"] == 403:
-                raise exception.AuthorizationError()
-            if data["error"] == 404:
-                raise exception.NotFoundError()
-            self.log.debug(data)
-            raise exception.StopExtraction(data.get("message"))
-        return data
+            try:
+                data = response.json()
+            except ValueError:
+                raise exception.StopExtraction(text.remove_html(response.text))
+
+            if "error" in data:
+                if data["error"] == 403:
+                    raise exception.AuthorizationError()
+                if data["error"] == 404:
+                    raise exception.NotFoundError()
+                self.log.debug(data)
+                raise exception.StopExtraction(data.get("message"))
+            return data

     def _pagination(self, endpoint, params):
         id_min = self._parse_id("id-min", 0)
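Note on the reddit.py change: _call() no longer recurses after hitting the rate limit; it loops, sleeps for the window given by x-ratelimit-reset, retries, and warns once when the shared default client-id is in use. The retry shape in isolation, as a hedged requests sketch (header names as in the diff; the session and URL are placeholders):

    import time
    import requests

    def call(session, url, params):
        while True:
            response = session.get(url, params=params)

            # back off when fewer than two requests remain in the window
            remaining = response.headers.get("x-ratelimit-remaining")
            if remaining and float(remaining) < 2:
                time.sleep(float(response.headers["x-ratelimit-reset"]))
                continue  # retry in-loop instead of recursing

            return response.json()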
diff --git a/gallery_dl/extractor/seiga.py b/gallery_dl/extractor/seiga.py
index 7b8d2a3..711435e 100644
--- a/gallery_dl/extractor/seiga.py
+++ b/gallery_dl/extractor/seiga.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2016-2022 Mike Fährmann
+# Copyright 2016-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -164,6 +164,10 @@ class SeigaImageExtractor(SeigaExtractor):
         ("https://seiga.nicovideo.jp/seiga/im123", {
             "exception": exception.NotFoundError,
         }),
+        ("https://seiga.nicovideo.jp/seiga/im10877923", {
+            "pattern": r"https://lohas\.nicoseiga\.jp/priv/5936a2a6c860a600e46"
+                       r"5e0411c0822e0b510e286/1688757110/10877923",
+        }),
         ("https://seiga.nicovideo.jp/image/source/5977527"),
         ("https://sp.seiga.nicovideo.jp/seiga/#!/im5977527"),
         ("https://lohas.nicoseiga.jp/thumb/5977527i"),
@@ -182,6 +186,9 @@ class SeigaImageExtractor(SeigaExtractor):
         return num

     def get_images(self):
+        self.session.cookies.set(
+            "skip_fetish_warning", "1", domain="seiga.nicovideo.jp")
+
         url = "{}/seiga/im{}".format(self.root, self.image_id)
         page = self.request(url, notfound="image").text
diff --git a/gallery_dl/extractor/slideshare.py b/gallery_dl/extractor/slideshare.py
index bea457f..3521298 100644
--- a/gallery_dl/extractor/slideshare.py
+++ b/gallery_dl/extractor/slideshare.py
@@ -30,21 +30,20 @@ class SlidesharePresentationExtractor(GalleryExtractor):
         "count": 19,
         "content": "2b6a191eab60b3978fdacfecf2da302dd45bc108",
         "keyword": {
-            "comments": "0",
             "description": "Get Started with SlideShare - "
                            "A Beginngers Guide for Creators",
-            "likes": r"re:\d{3,}",
+            "likes": int,
             "presentation": "get-started-with-slide-share",
-            "published": "dt:2015-05-20 00:00:00",
+            "date": "dt:2015-05-20 17:38:21",
             "title": "Getting Started With SlideShare",
             "user": "Slideshare",
-            "views": r"re:\d{7,}",
+            "views": int,
         },
     }),
     # long title and description
     (("https://www.slideshare.net/pragmaticsolutions/warum-sie-nicht-ihren"
       "-mitarbeitenden-ndern-sollten-sondern-ihr-managementsystem"), {
-        "url": "cf70ca99f57f61affab47ebf8583eb564b21e3a7",
+        "url": "d8952260f8bec337dd809a958ec8091350393f6b",
         "keyword": {
             "title": "Warum Sie nicht Ihren Mitarbeitenden ändern "
                      "sollten, sondern Ihr Managementsystem",
@@ -58,7 +57,7 @@ class SlidesharePresentationExtractor(GalleryExtractor):
     # mobile URL
     (("https://www.slideshare.net"
       "/mobile/uqudent/introduction-to-fixed-prosthodontics"), {
-        "url": "43eda2adf4dd221a251c8df794dfb82649e94647",
+        "url": "72c431cb1eccbb6794f608ecbbc01d52e8768159",
     }),
 )
@@ -69,43 +68,31 @@ class SlidesharePresentationExtractor(GalleryExtractor):
         GalleryExtractor.__init__(self, match, url)

     def metadata(self, page):
-        extr = text.extract_from(page)
-        descr = extr('<meta name="description" content="', '"')
-        comments = extr('content="UserComments:', '"')
-        likes = extr('content="UserLikes:', '"')
-        views = extr('content="UserPageVisits:', '"')
-        title = extr('<span class="j-title-breadcrumb">', '</span>')
-        published = extr('<div class="metadata-item">', '</div>')
-
-        if descr.endswith("…"):
-            alt_descr = extr('slideshow-description-text"', '</p>')
-            if alt_descr:
-                descr = text.remove_html(alt_descr.partition(">")[2]).strip()
+        data = util.json_loads(text.extr(
+            page, 'id="__NEXT_DATA__" type="application/json">', '</script>'))
+        self.slideshow = slideshow = data["props"]["pageProps"]["slideshow"]

         return {
-            "user": self.user,
+            "user"        : slideshow["username"],
             "presentation": self.presentation,
-            "title": text.unescape(title.strip()),
-            "description": text.unescape(descr),
-            "views": views,
-            "likes": likes,
-            "comments": comments,
-            "published": text.parse_datetime(
-                published.strip(), "%b. %d, %Y"),
+            "title"       : slideshow["title"].strip(),
+            "description" : slideshow["description"].strip(),
+            "views"       : slideshow["views"],
+            "likes"       : slideshow["likes"],
+            "date"        : text.parse_datetime(
+                slideshow["createdAt"], "%Y-%m-%d %H:%M:%S %Z"),
         }

-    @staticmethod
-    def images(page):
-        data = util.json_loads(text.extract(
-            page, "xtend(true, slideshare_object.slideshow_config, ", ");")[0])
+    def images(self, page):
+        parts = self.slideshow["slideImages"][0]["baseUrl"].split("/")

-        # useing 'stripped_title' here is technically wrong, but it works all
-        # the same, slideshare doesn't seem to care what characters go there
-        begin = "https://image.slidesharecdn.com/{}/95/{}-".format(
-            data["ppt_location"], data["stripped_title"])
-        end = "-1024.jpg?cb=" + str(data["timestamp"])
+        begin = "{}/95/{}-".format(
+            "/".join(parts[:4]),
+            self.slideshow["strippedTitle"],
+        )
+        end = "-1024.jpg?" + parts[-1].rpartition("?")[2]

         return [
             (begin + str(n) + end, None)
-            for n in range(1, data["slide_count"]+1)
+            for n in range(1, self.slideshow["totalSlides"]+1)
         ]
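Note on the slideshare.py rewrite: SlideShare pages are rendered with Next.js now, so all metadata comes from the embedded __NEXT_DATA__ JSON blob instead of scattered meta tags. Extracting that blob needs nothing beyond the standard library; the key path follows the diff, while the regex-based extraction here is merely illustrative:

    import json
    import re

    def next_data(page):
        """Return the Next.js state embedded in an HTML page."""
        match = re.search(
            r'id="__NEXT_DATA__" type="application/json">(.*?)</script>',
            page, re.S)
        return json.loads(match.group(1))

    # slideshow = next_data(html)["props"]["pageProps"]["slideshow"]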
%d, %Y"), + "title" : slideshow["title"].strip(), + "description" : slideshow["description"].strip(), + "views" : slideshow["views"], + "likes" : slideshow["likes"], + "date" : text.parse_datetime( + slideshow["createdAt"], "%Y-%m-%d %H:%M:%S %Z"), } - @staticmethod - def images(page): - data = util.json_loads(text.extract( - page, "xtend(true, slideshare_object.slideshow_config, ", ");")[0]) + def images(self, page): + parts = self.slideshow["slideImages"][0]["baseUrl"].split("/") - # useing 'stripped_title' here is technically wrong, but it works all - # the same, slideshare doesn't seem to care what characters go there - begin = "https://image.slidesharecdn.com/{}/95/{}-".format( - data["ppt_location"], data["stripped_title"]) - end = "-1024.jpg?cb=" + str(data["timestamp"]) + begin = "{}/95/{}-".format( + "/".join(parts[:4]), + self.slideshow["strippedTitle"], + ) + end = "-1024.jpg?" + parts[-1].rpartition("?")[2] return [ (begin + str(n) + end, None) - for n in range(1, data["slide_count"]+1) + for n in range(1, self.slideshow["totalSlides"]+1) ] diff --git a/gallery_dl/extractor/twibooru.py b/gallery_dl/extractor/twibooru.py index 30bf2f1..a8acd31 100644 --- a/gallery_dl/extractor/twibooru.py +++ b/gallery_dl/extractor/twibooru.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2022 Mike Fährmann +# Copyright 2022-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -22,6 +22,7 @@ class TwibooruExtractor(BooruExtractor): filename_fmt = "{id}_{filename}.{extension}" archive_fmt = "{id}" request_interval = 6.05 + page_start = 1 per_page = 50 root = "https://twibooru.org" @@ -230,7 +231,7 @@ class TwibooruAPI(): elif not api_key: params["filter_id"] = "2" - params["page"] = 1 + params["page"] = extr.page_start params["per_page"] = per_page = extr.per_page while True: diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 10db974..7b9a2e4 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -510,13 +510,13 @@ class TwitterTimelineExtractor(TwitterExtractor): if not self.textonly: # try to search for media-only tweets tweet = None - for tweet in self.api.search_adaptive(query + " filter:links"): + for tweet in self.api.search_timeline(query + " filter:links"): yield tweet if tweet is not None: return # yield unfiltered search results - yield from self.api.search_adaptive(query) + yield from self.api.search_timeline(query) def _select_tweet_source(self): strategy = self.config("strategy") @@ -693,7 +693,7 @@ class TwitterSearchExtractor(TwitterExtractor): except KeyError: pass - return self.api.search_adaptive(query) + return self.api.search_timeline(query) class TwitterHashtagExtractor(TwitterExtractor): @@ -929,16 +929,15 @@ Your reaction.""", def _tweets_single(self, tweet_id): tweets = [] - for tweet in self.api.tweet_detail(tweet_id): - if tweet["rest_id"] == tweet_id or \ - tweet.get("_retweet_id_str") == tweet_id: - if self._user_obj is None: - self._assign_user(tweet["core"]["user_results"]["result"]) - tweets.append(tweet) + tweet = self.api.tweet_result_by_rest_id(tweet_id) + self._assign_user(tweet["core"]["user_results"]["result"]) - tweet_id = tweet["legacy"].get("quoted_status_id_str") - if not tweet_id: - break + while True: + tweets.append(tweet) + tweet_id = tweet["legacy"].get("quoted_status_id_str") + if not tweet_id: + break + tweet = self.api.tweet_result_by_rest_id(tweet_id) return tweets @@ 
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
index 805aa53..5a3adc8 100644
--- a/gallery_dl/extractor/weibo.py
+++ b/gallery_dl/extractor/weibo.py
@@ -170,6 +170,8 @@ class WeiboExtractor(Extractor):
             yield from statuses

             if "next_cursor" in data:  # videos, newvideo
+                if data["next_cursor"] == -1:
+                    return
                 params["cursor"] = data["next_cursor"]
             elif "page" in params:  # home, article
                 params["page"] += 1
diff --git a/gallery_dl/extractor/wikifeet.py b/gallery_dl/extractor/wikifeet.py
index 662e08b..5f02e94 100644
--- a/gallery_dl/extractor/wikifeet.py
+++ b/gallery_dl/extractor/wikifeet.py
@@ -32,7 +32,7 @@ class WikifeetGalleryExtractor(GalleryExtractor):
             "pid"      : int,
             "width"    : int,
             "height"   : int,
-            "shoesize" : "7.5 US",
+            "shoesize" : "9 US",
             "type"     : "women",
             "tags"     : list,
         },
@@ -50,7 +50,7 @@ class WikifeetGalleryExtractor(GalleryExtractor):
             "pid"      : int,
             "width"    : int,
             "height"   : int,
-            "shoesize" : "[NOT SET]",
+            "shoesize" : "4 US",
             "type"     : "women",
             "tags"     : list,
         },
@@ -111,7 +111,10 @@ class WikifeetGalleryExtractor(GalleryExtractor):
                 "pid"   : data["pid"],
                 "width" : data["pw"],
                 "height": data["ph"],
-                "tags"  : [tagmap[tag] for tag in data["tags"]],
+                "tags"  : [
+                    tagmap[tag]
+                    for tag in data["tags"] if tag in tagmap
+                ],
             })
             for data in util.json_loads(text.extr(page, "['gdata'] = ", ";"))
         ]
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 9438d73..f2a3111 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.

-__version__ = "1.25.7"
+__version__ = "1.25.8"
