From e2f67519f8c1750a71aab3dc56b8345fff21bac5 Mon Sep 17 00:00:00 2001 From: Unit 193 Date: Sat, 15 Jul 2023 17:08:47 -0400 Subject: New upstream version 1.25.8. --- CHANGELOG.md | 36 ++++++ PKG-INFO | 6 +- README.rst | 4 +- data/man/gallery-dl.1 | 2 +- data/man/gallery-dl.conf.5 | 36 ++++-- docs/gallery-dl.conf | 5 +- gallery_dl.egg-info/PKG-INFO | 6 +- gallery_dl.egg-info/SOURCES.txt | 2 - gallery_dl/extractor/__init__.py | 2 - gallery_dl/extractor/bcy.py | 206 ----------------------------------- gallery_dl/extractor/bunkr.py | 15 ++- gallery_dl/extractor/common.py | 6 +- gallery_dl/extractor/erome.py | 4 +- gallery_dl/extractor/fantia.py | 4 +- gallery_dl/extractor/gelbooru_v01.py | 14 +-- gallery_dl/extractor/gfycat.py | 68 ++++++++++-- gallery_dl/extractor/jpgfish.py | 15 ++- gallery_dl/extractor/lineblog.py | 73 ------------- gallery_dl/extractor/mangaread.py | 5 +- gallery_dl/extractor/naverwebtoon.py | 2 +- gallery_dl/extractor/newgrounds.py | 5 +- gallery_dl/extractor/paheal.py | 67 ++++++++---- gallery_dl/extractor/philomena.py | 116 ++++++++++++++------ gallery_dl/extractor/pornhub.py | 3 + gallery_dl/extractor/reddit.py | 64 +++++++---- gallery_dl/extractor/seiga.py | 9 +- gallery_dl/extractor/slideshare.py | 59 ++++------ gallery_dl/extractor/twibooru.py | 5 +- gallery_dl/extractor/twitter.py | 81 ++++++++++---- gallery_dl/extractor/weibo.py | 2 + gallery_dl/extractor/wikifeet.py | 9 +- gallery_dl/version.py | 2 +- test/test_results.py | 2 +- 33 files changed, 455 insertions(+), 480 deletions(-) delete mode 100644 gallery_dl/extractor/bcy.py delete mode 100644 gallery_dl/extractor/lineblog.py diff --git a/CHANGELOG.md b/CHANGELOG.md index b71b404..53034fa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,41 @@ # Changelog +## 1.25.8 - 2023-07-15 +### Changes +- update default User-Agent header to Firefox 115 ESR +### Additions +- [gfycat] support `@me` user ([#3770](https://github.com/mikf/gallery-dl/issues/3770), [#4271](https://github.com/mikf/gallery-dl/issues/4271)) +- [gfycat] implement login support ([#3770](https://github.com/mikf/gallery-dl/issues/3770), [#4271](https://github.com/mikf/gallery-dl/issues/4271)) +- [reddit] notify users about registering an OAuth application ([#4292](https://github.com/mikf/gallery-dl/issues/4292)) +- [twitter] add `ratelimit` option ([#4251](https://github.com/mikf/gallery-dl/issues/4251)) +- [twitter] use `TweetResultByRestId` endpoint that allows accessing single Tweets without login ([#4250](https://github.com/mikf/gallery-dl/issues/4250)) +### Fixes +- [bunkr] use `.la` TLD for `media-files12` servers ([#4147](https://github.com/mikf/gallery-dl/issues/4147), [#4276](https://github.com/mikf/gallery-dl/issues/4276)) +- [erome] ignore duplicate album IDs +- [fantia] send `X-Requested-With` header ([#4273](https://github.com/mikf/gallery-dl/issues/4273)) +- [gelbooru_v01] fix `source` metadata ([#4302](https://github.com/mikf/gallery-dl/issues/4302), [#4303](https://github.com/mikf/gallery-dl/issues/4303)) +- [gelbooru_v01] update `vidyart` domain +- [jpgfish] update domain to `jpeg.pet` +- [mangaread] fix `tags` metadata extraction +- [naverwebtoon] fix `comic` metadata extraction +- [newgrounds] extract & pass auth token during login ([#4268](https://github.com/mikf/gallery-dl/issues/4268)) +- [paheal] fix extraction ([#4262](https://github.com/mikf/gallery-dl/issues/4262), [#4293](https://github.com/mikf/gallery-dl/issues/4293)) +- [paheal] unescape `source` +- [philomena] fix `--range` 
([#4288](https://github.com/mikf/gallery-dl/issues/4288)) +- [philomena] handle `429 Too Many Requests` errors ([#4288](https://github.com/mikf/gallery-dl/issues/4288)) +- [pornhub] set `accessAgeDisclaimerPH` cookie ([#4301](https://github.com/mikf/gallery-dl/issues/4301)) +- [reddit] use 0.6s delay between API requests ([#4292](https://github.com/mikf/gallery-dl/issues/4292)) +- [seiga] set `skip_fetish_warning` cookie ([#4242](https://github.com/mikf/gallery-dl/issues/4242)) +- [slideshare] fix extraction +- [twitter] fix `following` extractor not getting all users ([#4287](https://github.com/mikf/gallery-dl/issues/4287)) +- [twitter] use GraphQL search endpoint by default ([#4264](https://github.com/mikf/gallery-dl/issues/4264)) +- [twitter] do not treat missing `TimelineAddEntries` instruction as fatal ([#4278](https://github.com/mikf/gallery-dl/issues/4278)) +- [weibo] fix cursor based pagination +- [wikifeet] fix `tag` extraction ([#4289](https://github.com/mikf/gallery-dl/issues/4289), [#4291](https://github.com/mikf/gallery-dl/issues/4291)) +### Removals +- [bcy] remove module +- [lineblog] remove module + ## 1.25.7 - 2023-07-02 ### Additions - [flickr] add 'exif' option diff --git a/PKG-INFO b/PKG-INFO index ff9ab3f..953bc56 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery_dl -Version: 1.25.7 +Version: 1.25.8 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Download-URL: https://github.com/mikf/gallery-dl/releases/latest @@ -109,9 +109,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows `__ +- `Windows `__ (Requires `Microsoft Visual C++ Redistributable Package (x86) `__) -- `Linux `__ +- `Linux `__ Nightly Builds diff --git a/README.rst b/README.rst index 86dd58d..51e239c 100644 --- a/README.rst +++ b/README.rst @@ -72,9 +72,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows `__ +- `Windows `__ (Requires `Microsoft Visual C++ Redistributable Package (x86) `__) -- `Linux `__ +- `Linux `__ Nightly Builds diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1 index 16a4bba..84fd161 100644 --- a/data/man/gallery-dl.1 +++ b/data/man/gallery-dl.1 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL" "1" "2023-07-02" "1.25.7" "gallery-dl Manual" +.TH "GALLERY-DL" "1" "2023-07-15" "1.25.8" "gallery-dl Manual" .\" disable hyphenation .nh diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5 index 2cba623..5fa271b 100644 --- a/data/man/gallery-dl.conf.5 +++ b/data/man/gallery-dl.conf.5 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL.CONF" "5" "2023-07-02" "1.25.7" "gallery-dl Manual" +.TH "GALLERY-DL.CONF" "5" "2023-07-15" "1.25.8" "gallery-dl Manual" .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) @@ -460,6 +460,8 @@ and optional for .br * \f[I]exhentai\f[] .br +* \f[I]gfycat\f[] +.br * \f[I]idolcomplex\f[] .br * \f[I]imgbb\f[] @@ -646,7 +648,7 @@ or a \f[I]list\f[] with IP and explicit port number as elements. \f[I]string\f[] .IP "Default:" 9 -\f[I]"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0"\f[] +\f[I]"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0"\f[] .IP "Description:" 4 User-Agent header value to be used for HTTP requests. 
@@ -3687,6 +3689,22 @@ If this option is enabled, gallery-dl will try to fetch a quoted (original) Tweet when it sees the Tweet which quotes it. +.SS extractor.twitter.ratelimit +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]"wait"\f[] + +.IP "Description:" 4 +Selects how to handle exceeding the API rate limit. + +.br +* \f[I]"abort"\f[]: Raise an error and stop extraction +.br +* \f[I]"wait"\f[]: Wait until rate limit reset + + .SS extractor.twitter.replies .IP "Type:" 6 \f[I]bool\f[] @@ -3727,17 +3745,15 @@ will be taken from the original Tweets, not the Retweets. \f[I]string\f[] .IP "Default:" 9 -\f[I]"auto"\f[] +\f[I]"graphql"\f[] .IP "Description:" 4 Selects the API endpoint used to retrieve search results. .br -* \f[I]"rest"\f[]: Legacy REST endpoint - returns a \f[I]403 Forbidden\f[] error when not logged in +* \f[I]"graphql"\f[]: GraphQL endpoint .br -* \f[I]"graphql"\f[]: New GraphQL endpoint -.br -* \f[I]"auto"\f[]: \f[I]"rest"\f[] when logged in, \f[I]"graphql"\f[] otherwise +* \f[I]"rest"\f[]: Legacy REST endpoint .SS extractor.twitter.timeline.strategy @@ -5822,6 +5838,12 @@ as \f[I]"client-id"\f[] \f[I]user-agent\f[] and replace \f[I]\f[] and \f[I]\f[] accordingly (see Reddit's \f[I]API access rules\f[]) +.br +* clear your \f[I]cache\f[] to delete any remaining +\f[I]access-token\f[] entries. (\f[I]gallery-dl --clear-cache reddit\f[]) +.br +* get a \f[I]refresh-token\f[] for the +new \f[I]client-id\f[] (\f[I]gallery-dl oauth:reddit\f[]) .SS extractor.smugmug.api-key & .api-secret diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 902d0a2..b5efc73 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -10,7 +10,7 @@ "proxy": null, "skip": true, - "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0", "retries": 4, "timeout": 30.0, "verify": true, @@ -261,6 +261,9 @@ }, "reddit": { + "client-id": null, + "user-agent": null, + "refresh-token": null, "comments": 0, "morecomments": false, "date-min": 0, diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO index d008254..00db3b4 100644 --- a/gallery_dl.egg-info/PKG-INFO +++ b/gallery_dl.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery-dl -Version: 1.25.7 +Version: 1.25.8 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Download-URL: https://github.com/mikf/gallery-dl/releases/latest @@ -109,9 +109,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows `__ +- `Windows `__ (Requires `Microsoft Visual C++ Redistributable Package (x86) `__) -- `Linux `__ +- `Linux `__ Nightly Builds diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt index 44fbd22..355a3f0 100644 --- a/gallery_dl.egg-info/SOURCES.txt +++ b/gallery_dl.egg-info/SOURCES.txt @@ -55,7 +55,6 @@ gallery_dl/extractor/architizer.py gallery_dl/extractor/artstation.py gallery_dl/extractor/aryion.py gallery_dl/extractor/bbc.py -gallery_dl/extractor/bcy.py gallery_dl/extractor/behance.py gallery_dl/extractor/blogger.py gallery_dl/extractor/booru.py @@ -123,7 +122,6 @@ gallery_dl/extractor/komikcast.py gallery_dl/extractor/lensdump.py gallery_dl/extractor/lexica.py gallery_dl/extractor/lightroom.py -gallery_dl/extractor/lineblog.py 
gallery_dl/extractor/livedoor.py gallery_dl/extractor/lolisafe.py gallery_dl/extractor/luscious.py diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index a344fe4..fa56bfb 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -24,7 +24,6 @@ modules = [ "artstation", "aryion", "bbc", - "bcy", "behance", "blogger", "bunkr", @@ -85,7 +84,6 @@ modules = [ "lensdump", "lexica", "lightroom", - "lineblog", "livedoor", "luscious", "lynxchan", diff --git a/gallery_dl/extractor/bcy.py b/gallery_dl/extractor/bcy.py deleted file mode 100644 index d6adb4e..0000000 --- a/gallery_dl/extractor/bcy.py +++ /dev/null @@ -1,206 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2020-2023 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://bcy.net/""" - -from .common import Extractor, Message -from .. import text, util, exception -import re - - -class BcyExtractor(Extractor): - """Base class for bcy extractors""" - category = "bcy" - directory_fmt = ("{category}", "{user[id]} {user[name]}") - filename_fmt = "{post[id]} {id}.{extension}" - archive_fmt = "{post[id]}_{id}" - root = "https://bcy.net" - - def __init__(self, match): - Extractor.__init__(self, match) - self.item_id = match.group(1) - self.session.headers["Referer"] = self.root + "/" - - def items(self): - sub = re.compile(r"^https?://p\d+-bcy" - r"(?:-sign\.bcyimg\.com|\.byteimg\.com/img)" - r"/banciyuan").sub - iroot = "https://img-bcy-qn.pstatp.com" - noop = self.config("noop") - - for post in self.posts(): - if not post["image_list"]: - continue - - multi = None - tags = post.get("post_tags") or () - data = { - "user": { - "id" : post["uid"], - "name" : post["uname"], - "avatar" : sub(iroot, post["avatar"].partition("~")[0]), - }, - "post": { - "id" : text.parse_int(post["item_id"]), - "tags" : [t["tag_name"] for t in tags], - "date" : text.parse_timestamp(post["ctime"]), - "parody" : post["work"], - "content": post["plain"], - "likes" : post["like_count"], - "shares" : post["share_count"], - "replies": post["reply_count"], - }, - } - - yield Message.Directory, data - for data["num"], image in enumerate(post["image_list"], 1): - data["id"] = image["mid"] - data["width"] = image["w"] - data["height"] = image["h"] - - url = image["path"].partition("~")[0] - text.nameext_from_url(url, data) - - # full-resolution image without watermark - if data["extension"]: - if not url.startswith(iroot): - url = sub(iroot, url) - data["filter"] = "" - yield Message.Url, url, data - - # watermarked image & low quality noop filter - else: - if multi is None: - multi = self._data_from_post( - post["item_id"])["post_data"]["multi"] - image = multi[data["num"] - 1] - - if image["origin"]: - data["filter"] = "watermark" - yield Message.Url, image["origin"], data - - if noop: - data["extension"] = "" - data["filter"] = "noop" - yield Message.Url, image["original_path"], data - - def posts(self): - """Returns an iterable with all relevant 'post' objects""" - - def _data_from_post(self, post_id): - url = "{}/item/detail/{}".format(self.root, post_id) - page = self.request(url, notfound="post").text - data = (text.extr(page, 'JSON.parse("', '");') - .replace('\\\\u002F', '/') - .replace('\\"', '"')) - try: - return util.json_loads(data)["detail"] - except ValueError: - return util.json_loads(data.replace('\\"', '"'))["detail"] - - 
-class BcyUserExtractor(BcyExtractor): - """Extractor for user timelines""" - subcategory = "user" - pattern = r"(?:https?://)?bcy\.net/u/(\d+)" - test = ( - ("https://bcy.net/u/1933712", { - "pattern": r"https://img-bcy-qn.pstatp.com/\w+/\d+/post/\w+/.+jpg", - "count": ">= 20", - }), - ("https://bcy.net/u/109282764041", { - "pattern": r"https://p\d-bcy-sign\.bcyimg\.com/banciyuan/[0-9a-f]+" - r"~tplv-bcyx-yuan-logo-v1:.+\.image", - "range": "1-25", - "count": 25, - }), - ) - - def posts(self): - url = self.root + "/apiv3/user/selfPosts" - params = {"uid": self.item_id, "since": None} - - while True: - data = self.request(url, params=params).json() - - try: - items = data["data"]["items"] - except KeyError: - return - if not items: - return - - for item in items: - yield item["item_detail"] - params["since"] = item["since"] - - -class BcyPostExtractor(BcyExtractor): - """Extractor for individual posts""" - subcategory = "post" - pattern = r"(?:https?://)?bcy\.net/item/detail/(\d+)" - test = ( - ("https://bcy.net/item/detail/6355835481002893070", { - "url": "301202375e61fd6e0e2e35de6c3ac9f74885dec3", - "count": 1, - "keyword": { - "user": { - "id" : 1933712, - "name" : "wukloo", - "avatar" : "re:https://img-bcy-qn.pstatp.com/Public/", - }, - "post": { - "id" : 6355835481002893070, - "tags" : list, - "date" : "dt:2016-11-22 08:47:46", - "parody" : "东方PROJECT", - "content": "re:根据微博的建议稍微做了点修改", - "likes" : int, - "shares" : int, - "replies": int, - }, - "id": 8330182, - "num": 1, - "width" : 3000, - "height": 1687, - "filename": "712e0780b09011e696f973c3d1568337", - "extension": "jpg", - }, - }), - # only watermarked images available - ("https://bcy.net/item/detail/6950136331708144648", { - "pattern": r"https://p\d-bcy-sign\.bcyimg\.com/banciyuan/[0-9a-f]+" - r"~tplv-bcyx-yuan-logo-v1:.+\.image", - "count": 10, - "keyword": {"filter": "watermark"}, - }), - # deleted - ("https://bcy.net/item/detail/6780546160802143237", { - "exception": exception.NotFoundError, - "count": 0, - }), - # only visible to logged in users - ("https://bcy.net/item/detail/6747523535150783495", { - "count": 0, - }), - # JSON decode error (#3321) - ("https://bcy.net/item/detail/7166939271872388110", { - "count": 0, - }), - ) - - def posts(self): - try: - data = self._data_from_post(self.item_id) - except KeyError: - return () - post = data["post_data"] - post["image_list"] = post["multi"] - post["plain"] = text.parse_unicode_escapes(post["plain"]) - post.update(data["detail_user"]) - return (post,) diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 5c8c530..35b2752 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -52,6 +52,11 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): "num": int, }, }), + # cdn12 .ru TLD (#4147) + ("https://bunkrr.su/a/j1G29CnD", { + "pattern": r"https://(cdn12.bunkr.ru|media-files12.bunkr.la)/\w+", + "count": 8, + }), ("https://bunkrr.su/a/Lktg9Keq"), ("https://bunkr.la/a/Lktg9Keq"), ("https://bunkr.su/a/Lktg9Keq"), @@ -87,10 +92,12 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): url = text.unescape(url) if url.endswith((".mp4", ".m4v", ".mov", ".webm", ".mkv", ".ts", ".zip", ".rar", ".7z")): - append({"file": url.replace("://cdn", "://media-files", 1), - "_http_headers": headers}) - else: - append({"file": url}) + if url.startswith("https://cdn12."): + url = ("https://media-files12.bunkr.la" + + url[url.find("/", 14):]) + else: + url = url.replace("://cdn", "://media-files", 1) + append({"file": url, "_http_headers": 
headers}) return files, { "album_id" : self.album_id, diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 50d1026..5c9b157 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -286,7 +286,7 @@ class Extractor(): useragent = self.config("user-agent") if useragent is None: useragent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; " - "rv:102.0) Gecko/20100101 Firefox/102.0") + "rv:115.0) Gecko/20100101 Firefox/115.0") elif useragent == "browser": useragent = _browser_useragent() headers["User-Agent"] = useragent @@ -805,8 +805,8 @@ _browser_cookies = {} HTTP_HEADERS = { "firefox": ( - ("User-Agent", "Mozilla/5.0 ({}; rv:102.0) " - "Gecko/20100101 Firefox/102.0"), + ("User-Agent", "Mozilla/5.0 ({}; rv:115.0) " + "Gecko/20100101 Firefox/115.0"), ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9," "image/avif,image/webp,*/*;q=0.8"), ("Accept-Language", "en-US,en;q=0.5"), diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py index 03307f8..709bc57 100644 --- a/gallery_dl/extractor/erome.py +++ b/gallery_dl/extractor/erome.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021-2022 Mike Fährmann +# Copyright 2021-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -80,7 +80,7 @@ class EromeExtractor(Extractor): for params["page"] in itertools.count(1): page = self.request(url, params=params).text - album_ids = EromeAlbumExtractor.pattern.findall(page) + album_ids = EromeAlbumExtractor.pattern.findall(page)[::2] yield from album_ids if len(album_ids) < 36: diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py index 35c4cc4..f92b904 100644 --- a/gallery_dl/extractor/fantia.py +++ b/gallery_dl/extractor/fantia.py @@ -23,6 +23,7 @@ class FantiaExtractor(Extractor): self.headers = { "Accept" : "application/json, text/plain, */*", "Referer": self.root, + "X-Requested-With": "XMLHttpRequest", } _empty_plan = { "id" : 0, @@ -68,7 +69,8 @@ class FantiaExtractor(Extractor): def _pagination(self, url): params = {"page": 1} - headers = self.headers + headers = self.headers.copy() + del headers["X-Requested-With"] while True: page = self.request(url, params=params, headers=headers).text diff --git a/gallery_dl/extractor/gelbooru_v01.py b/gallery_dl/extractor/gelbooru_v01.py index c4f32a4..b6fbcb6 100644 --- a/gallery_dl/extractor/gelbooru_v01.py +++ b/gallery_dl/extractor/gelbooru_v01.py @@ -27,7 +27,7 @@ class GelbooruV01Extractor(booru.BooruExtractor): "uploader" : extr('By: ', ' <'), "width" : extr('Size: ', 'x'), "height" : extr('', ' <'), - "source" : extr('Source: "), 1): - src = text.extr(img, 'src="', '"') - alt = text.extr(img, 'alt="', '"') - - if not src: - continue - if src.startswith("https://obs.line-scdn.") and src.count("/") > 3: - src = src.rpartition("/")[0] - - imgs.append(text.nameext_from_url(alt or src, { - "url" : src, - "num" : num, - "hash": src.rpartition("/")[2], - "post": post, - })) - - return imgs - - -class LineblogBlogExtractor(LineblogBase, LivedoorBlogExtractor): - """Extractor for a user's blog on lineblog.me""" - pattern = r"(?:https?://)?lineblog\.me/(\w+)/?(?:$|[?#])" - test = ("https://lineblog.me/mamoru_miyano/", { - "range": "1-20", - "count": 20, - "pattern": r"https://obs.line-scdn.net/[\w-]+$", - "keyword": { - "post": { - "categories" : tuple, - "date" : "type:datetime", - "description": str, - "id" : int, - "tags" : list, - "title" 
: str, - "user" : "mamoru_miyano" - }, - "filename": str, - "hash" : r"re:\w{32,}", - "num" : int, - }, - }) - - -class LineblogPostExtractor(LineblogBase, LivedoorPostExtractor): - """Extractor for blog posts on lineblog.me""" - pattern = r"(?:https?://)?lineblog\.me/(\w+)/archives/(\d+)" - test = ("https://lineblog.me/mamoru_miyano/archives/1919150.html", { - "url": "24afeb4044c554f80c374b52bf8109c6f1c0c757", - "keyword": "76a38e2c0074926bd3362f66f9fc0e6c41591dcb", - }) diff --git a/gallery_dl/extractor/mangaread.py b/gallery_dl/extractor/mangaread.py index 49d4d7d..74c239e 100644 --- a/gallery_dl/extractor/mangaread.py +++ b/gallery_dl/extractor/mangaread.py @@ -87,7 +87,8 @@ class MangareadChapterExtractor(MangareadBase, ChapterExtractor): ) def metadata(self, page): - data = {"tags": list(text.extract_iter(page, "class>", "<"))} + tags = text.extr(page, 'class="wp-manga-tags-list">', '') + data = {"tags": list(text.split_html(tags)[::2])} info = text.extr(page, '
<h1 id="chapter-heading">', "</h1>
") if not info: raise exception.NotFoundError("chapter") @@ -148,7 +149,7 @@ class MangareadMangaExtractor(MangareadBase, MangaExtractor): } }), ("https://www.mangaread.org/manga/doesnotexist", { - "exception": exception.NotFoundError, + "exception": exception.HttpError, }), ) diff --git a/gallery_dl/extractor/naverwebtoon.py b/gallery_dl/extractor/naverwebtoon.py index d6292af..cafe4f7 100644 --- a/gallery_dl/extractor/naverwebtoon.py +++ b/gallery_dl/extractor/naverwebtoon.py @@ -91,7 +91,7 @@ class NaverwebtoonEpisodeExtractor(NaverwebtoonBase, GalleryExtractor): return { "title_id": self.title_id, "episode" : self.episode, - "comic" : extr("titleName: '", "'"), + "comic" : extr('titleName: "', '"'), "tags" : [t.strip() for t in text.extract_iter( extr("tagList: [", "}],"), '"tagName":"', '"')], "title" : extr('"subtitle":"', '"'), diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 5d100a4..e047f3d 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -87,14 +87,15 @@ class NewgroundsExtractor(Extractor): if response.history and response.url.endswith("/social"): return self.session.cookies + page = response.text headers = {"Origin": self.root, "Referer": url} - url = text.urljoin(self.root, text.extr( - response.text, 'action="', '"')) + url = text.urljoin(self.root, text.extr(page, 'action="', '"')) data = { "username": username, "password": password, "remember": "1", "login" : "1", + "auth" : text.extr(page, 'name="auth" value="', '"'), } response = self.request(url, method="POST", headers=headers, data=data) diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py index f0a50c8..1fa571c 100644 --- a/gallery_dl/extractor/paheal.py +++ b/gallery_dl/extractor/paheal.py @@ -55,8 +55,8 @@ class PahealExtractor(Extractor): "class='username' href='/user/", "'")), "date" : text.parse_datetime( extr("datetime='", "'"), "%Y-%m-%dT%H:%M:%S%z"), - "source" : text.extract( - extr(">Source Link<", ""), "href='", "'")[0], + "source" : text.unescape(text.extr( + extr(">Source Link<", ""), "href='", "'")), } dimensions, size, ext = extr("Info", ">").split(" // ") @@ -74,10 +74,34 @@ class PahealTagExtractor(PahealExtractor): directory_fmt = ("{category}", "{search_tags}") pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net" r"/post/list/([^/?#]+)") - test = ("https://rule34.paheal.net/post/list/Ayane_Suzuki/1", { - "pattern": r"https://[^.]+\.paheal\.net/_images/\w+/\d+%20-%20", - "count": ">= 15" - }) + test = ( + ("https://rule34.paheal.net/post/list/Ayane_Suzuki/1", { + "pattern": r"https://[^.]+\.paheal\.net/_images/\w+/\d+%20-%20", + "count": ">= 15" + }), + ("https://rule34.paheal.net/post/list/Ayane_Suzuki/1", { + "range": "1", + "options": (("metadata", True),), + "keyword": { + "date": "dt:2018-01-07 07:04:05", + "duration": 0.0, + "extension": "jpg", + "filename": "2446128 - Ayane_Suzuki Idolmaster " + "idolmaster_dearly_stars Zanzi", + "height": 768, + "id": 2446128, + "md5": "b0ceda9d860df1d15b60293a7eb465c1", + "search_tags": "Ayane_Suzuki", + "size": 205312, + "source": "https://www.pixiv.net/member_illust.php" + "?mode=medium&illust_id=19957280", + "tags": "Ayane_Suzuki Idolmaster " + "idolmaster_dearly_stars Zanzi", + "uploader": "XXXname", + "width": 1024, + }, + }), + ) per_page = 70 def __init__(self, match): @@ -96,8 +120,9 @@ class PahealTagExtractor(PahealExtractor): url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum) page = self.request(url).text + pos = 
page.find("id='image-list'") for post in text.extract_iter( - page, '', '') - published = extr('') - - if descr.endswith("…"): - alt_descr = extr('slideshow-description-text"', '
</p>
') - if alt_descr: - descr = text.remove_html(alt_descr.partition(">")[2]).strip() + data = util.json_loads(text.extr( + page, 'id="__NEXT_DATA__" type="application/json">', '')) + self.slideshow = slideshow = data["props"]["pageProps"]["slideshow"] return { - "user": self.user, + "user" : slideshow["username"], "presentation": self.presentation, - "title": text.unescape(title.strip()), - "description": text.unescape(descr), - "views": views, - "likes": likes, - "comments": comments, - "published": text.parse_datetime( - published.strip(), "%b. %d, %Y"), + "title" : slideshow["title"].strip(), + "description" : slideshow["description"].strip(), + "views" : slideshow["views"], + "likes" : slideshow["likes"], + "date" : text.parse_datetime( + slideshow["createdAt"], "%Y-%m-%d %H:%M:%S %Z"), } - @staticmethod - def images(page): - data = util.json_loads(text.extract( - page, "xtend(true, slideshare_object.slideshow_config, ", ");")[0]) + def images(self, page): + parts = self.slideshow["slideImages"][0]["baseUrl"].split("/") - # useing 'stripped_title' here is technically wrong, but it works all - # the same, slideshare doesn't seem to care what characters go there - begin = "https://image.slidesharecdn.com/{}/95/{}-".format( - data["ppt_location"], data["stripped_title"]) - end = "-1024.jpg?cb=" + str(data["timestamp"]) + begin = "{}/95/{}-".format( + "/".join(parts[:4]), + self.slideshow["strippedTitle"], + ) + end = "-1024.jpg?" + parts[-1].rpartition("?")[2] return [ (begin + str(n) + end, None) - for n in range(1, data["slide_count"]+1) + for n in range(1, self.slideshow["totalSlides"]+1) ] diff --git a/gallery_dl/extractor/twibooru.py b/gallery_dl/extractor/twibooru.py index 30bf2f1..a8acd31 100644 --- a/gallery_dl/extractor/twibooru.py +++ b/gallery_dl/extractor/twibooru.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2022 Mike Fährmann +# Copyright 2022-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -22,6 +22,7 @@ class TwibooruExtractor(BooruExtractor): filename_fmt = "{id}_{filename}.{extension}" archive_fmt = "{id}" request_interval = 6.05 + page_start = 1 per_page = 50 root = "https://twibooru.org" @@ -230,7 +231,7 @@ class TwibooruAPI(): elif not api_key: params["filter_id"] = "2" - params["page"] = 1 + params["page"] = extr.page_start params["per_page"] = per_page = extr.per_page while True: diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 10db974..7b9a2e4 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -510,13 +510,13 @@ class TwitterTimelineExtractor(TwitterExtractor): if not self.textonly: # try to search for media-only tweets tweet = None - for tweet in self.api.search_adaptive(query + " filter:links"): + for tweet in self.api.search_timeline(query + " filter:links"): yield tweet if tweet is not None: return # yield unfiltered search results - yield from self.api.search_adaptive(query) + yield from self.api.search_timeline(query) def _select_tweet_source(self): strategy = self.config("strategy") @@ -693,7 +693,7 @@ class TwitterSearchExtractor(TwitterExtractor): except KeyError: pass - return self.api.search_adaptive(query) + return self.api.search_timeline(query) class TwitterHashtagExtractor(TwitterExtractor): @@ -929,16 +929,15 @@ Your reaction.""", def _tweets_single(self, tweet_id): tweets = [] - for tweet in self.api.tweet_detail(tweet_id): - if tweet["rest_id"] == tweet_id 
or \ - tweet.get("_retweet_id_str") == tweet_id: - if self._user_obj is None: - self._assign_user(tweet["core"]["user_results"]["result"]) - tweets.append(tweet) + tweet = self.api.tweet_result_by_rest_id(tweet_id) + self._assign_user(tweet["core"]["user_results"]["result"]) - tweet_id = tweet["legacy"].get("quoted_status_id_str") - if not tweet_id: - break + while True: + tweets.append(tweet) + tweet_id = tweet["legacy"].get("quoted_status_id_str") + if not tweet_id: + break + tweet = self.api.tweet_result_by_rest_id(tweet_id) return tweets @@ -1087,8 +1086,8 @@ class TwitterAPI(): auth_token = cookies.get("auth_token", domain=cookiedomain) search = extractor.config("search-endpoint") - if search == "graphql" or not auth_token and search in ("auto", None): - self.search_adaptive = self.search_timeline + if search == "rest": + self.search_timeline = self.search_adaptive self.headers = { "Accept": "*/*", @@ -1179,6 +1178,46 @@ class TwitterAPI(): "responsive_web_enhance_cards_enabled": False, } + def tweet_result_by_rest_id(self, tweet_id): + endpoint = "/graphql/2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId" + params = { + "variables": self._json_dumps({ + "tweetId": tweet_id, + "withCommunity": False, + "includePromotedContent": False, + "withVoice": False, + }), + "features": self._json_dumps({ + "creator_subscriptions_tweet_preview_api_enabled": True, + "tweetypie_unmention_optimization_enabled": True, + "responsive_web_edit_tweet_api_enabled": True, + "graphql_is_translatable_rweb_tweet_is_translatable_enabled": + True, + "view_counts_everywhere_api_enabled": True, + "longform_notetweets_consumption_enabled": True, + "responsive_web_twitter_article_tweet_consumption_enabled": + False, + "tweet_awards_web_tipping_enabled": False, + "freedom_of_speech_not_reach_fetch_enabled": True, + "standardized_nudges_misinfo": True, + "tweet_with_visibility_results_prefer_gql_" + "limited_actions_policy_enabled": True, + "longform_notetweets_rich_text_read_enabled": True, + "longform_notetweets_inline_media_enabled": True, + "responsive_web_graphql_exclude_directive_enabled": True, + "verified_phone_label_enabled": False, + "responsive_web_media_download_video_enabled": False, + "responsive_web_graphql_skip_user_profile_" + "image_extensions_enabled": False, + "responsive_web_graphql_timeline_navigation_enabled": True, + "responsive_web_enhance_cards_enabled": False, + }), + "fieldToggles": self._json_dumps({ + "withArticleRichContentState": False, + }), + } + return self._call(endpoint, params)["data"]["tweetResult"]["result"] + def tweet_detail(self, tweet_id): endpoint = "/graphql/JlLZj42Ltr2qwjasw-l5lQ/TweetDetail" variables = { @@ -1439,6 +1478,9 @@ class TwitterAPI(): if response.status_code == 429: # rate limit exceeded + if self.extractor.config("ratelimit") == "abort": + raise exception.StopExtraction("Rate limit exceeded") + until = response.headers.get("x-rate-limit-reset") seconds = None if until else 60 self.extractor.wait(until=until, seconds=seconds) @@ -1592,7 +1634,9 @@ class TwitterAPI(): if entry["entryId"].startswith("cursor-bottom-"): cursor = entry["content"]["value"] if entries is None: - raise KeyError() + if not cursor: + return + entries = () except LookupError: extr.log.debug(data) @@ -1730,7 +1774,7 @@ class TwitterAPI(): "features" : self._json_dumps(self.features_pagination)} while True: - cursor = entry = stop = None + cursor = entry = None params["variables"] = self._json_dumps(variables) data = self._call(endpoint, params)["data"] @@ -1759,11 +1803,8 @@ class 
TwitterAPI(): yield user elif entry["entryId"].startswith("cursor-bottom-"): cursor = entry["content"]["value"] - elif instr["type"] == "TimelineTerminateTimeline": - if instr["direction"] == "Bottom": - stop = True - if stop or not cursor or not entry: + if not cursor or cursor.startswith(("-1|", "0|")) or not entry: return variables["cursor"] = cursor diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index 805aa53..5a3adc8 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -170,6 +170,8 @@ class WeiboExtractor(Extractor): yield from statuses if "next_cursor" in data: # videos, newvideo + if data["next_cursor"] == -1: + return params["cursor"] = data["next_cursor"] elif "page" in params: # home, article params["page"] += 1 diff --git a/gallery_dl/extractor/wikifeet.py b/gallery_dl/extractor/wikifeet.py index 662e08b..5f02e94 100644 --- a/gallery_dl/extractor/wikifeet.py +++ b/gallery_dl/extractor/wikifeet.py @@ -32,7 +32,7 @@ class WikifeetGalleryExtractor(GalleryExtractor): "pid" : int, "width" : int, "height" : int, - "shoesize" : "7.5 US", + "shoesize" : "9 US", "type" : "women", "tags" : list, }, @@ -50,7 +50,7 @@ class WikifeetGalleryExtractor(GalleryExtractor): "pid" : int, "width" : int, "height" : int, - "shoesize" : "[NOT SET]", + "shoesize" : "4 US", "type" : "women", "tags" : list, }, @@ -111,7 +111,10 @@ class WikifeetGalleryExtractor(GalleryExtractor): "pid" : data["pid"], "width" : data["pw"], "height": data["ph"], - "tags" : [tagmap[tag] for tag in data["tags"]], + "tags" : [ + tagmap[tag] + for tag in data["tags"] if tag in tagmap + ], }) for data in util.json_loads(text.extr(page, "['gdata'] = ", ";")) ] diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 9438d73..f2a3111 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.25.7" +__version__ = "1.25.8" diff --git a/test/test_results.py b/test/test_results.py index 03a17c4..3c7d284 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -325,7 +325,7 @@ def setup_test_config(): for category in ("danbooru", "atfbooru", "aibooru", "e621", "e926", "instagram", "twitter", "subscribestar", "deviantart", "inkbunny", "tapas", "pillowfort", "mangadex", - "vipergirls"): + "vipergirls", "gfycat"): config.set(("extractor", category), "username", None) config.set(("extractor", "mastodon.social"), "access-token", -- cgit v1.2.3
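
A minimal gallery-dl.conf sketch combining the options added or changed in 1.25.8. The Twitter values shown are the new defaults (`ratelimit` also accepts `"abort"`, `search-endpoint` also accepts `"rest"`); the Reddit `client-id` and `user-agent` values are placeholders that have to be replaced with the details of your own registered OAuth application, as described in the `extractor.reddit.client-id & .user-agent` man page section above.

    {
        "extractor": {
            "twitter": {
                "ratelimit": "wait",
                "search-endpoint": "graphql"
            },
            "reddit": {
                "client-id": "your-client-id",
                "user-agent": "Python:your-app-name:v1.0 (by /u/your-username)",
                "refresh-token": null
            }
        }
    }

After switching to your own `client-id`, the man page also recommends clearing any cached access tokens with `gallery-dl --clear-cache reddit` and obtaining a `refresh-token` for the new client-id with `gallery-dl oauth:reddit`.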