| author | 2022-11-01 21:32:59 -0400 |
|---|---|
| committer | 2022-11-01 21:32:59 -0400 |
| commit | 66c7c6d45cd567748517037f9e80579af97f7d4e (patch) |
| tree | 8a0d42e12696620e068791e0198e90df7db9795f |
| parent | cd4945d8efa6a948d65bfcb134871157e93188cb (diff) |
| parent | e59d46ecda74190381b1d2725b0bd9df5c0be8d8 (diff) |
Update upstream source from tag 'upstream/1.23.5'
Update to upstream version '1.23.5'
with Debian dir 5d9c17920ff526fe04990c66a8256250c5686e11
31 files changed, 694 insertions, 353 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5901e37..21341ef 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,37 @@
 # Changelog
+## 1.23.5 - 2022-10-30
+### Fixes
+- [instagram] fix AttributeError on user stories extraction ([#3123](https://github.com/mikf/gallery-dl/issues/3123))
+
+## 1.23.4 - 2022-10-29
+### Additions
+- [aibooru] add support for aibooru.online ([#3075](https://github.com/mikf/gallery-dl/issues/3075))
+- [instagram] add 'avatar' extractor ([#929](https://github.com/mikf/gallery-dl/issues/929), [#1097](https://github.com/mikf/gallery-dl/issues/1097), [#2992](https://github.com/mikf/gallery-dl/issues/2992))
+- [instagram] support 'instagram.com/s/' highlight URLs ([#3076](https://github.com/mikf/gallery-dl/issues/3076))
+- [instagram] extract 'coauthors' metadata ([#3107](https://github.com/mikf/gallery-dl/issues/3107))
+- [mangasee] add support for 'mangalife' ([#3086](https://github.com/mikf/gallery-dl/issues/3086))
+- [mastodon] add 'bookmark' extractor ([#3109](https://github.com/mikf/gallery-dl/issues/3109))
+- [mastodon] support cross-instance user references and '/web/' URLs ([#3109](https://github.com/mikf/gallery-dl/issues/3109))
+- [moebooru] implement 'notes' extraction ([#3094](https://github.com/mikf/gallery-dl/issues/3094))
+- [pixiv] extend 'metadata' option ([#3057](https://github.com/mikf/gallery-dl/issues/3057))
+- [reactor] match 'best', 'new', 'all' URLs ([#3073](https://github.com/mikf/gallery-dl/issues/3073))
+- [smugloli] add 'smugloli' extractors ([#3060](https://github.com/mikf/gallery-dl/issues/3060))
+- [tumblr] add 'fallback-delay' and 'fallback-retries' options ([#2957](https://github.com/mikf/gallery-dl/issues/2957))
+- [vichan] add generic extractors for vichan imageboards
+### Fixes
+- [bcy] fix extraction ([#3103](https://github.com/mikf/gallery-dl/issues/3103))
+- [gelbooru] support alternate parameter order in post URLs ([#2821](https://github.com/mikf/gallery-dl/issues/2821))
+- [hentai2read] support minor versions in chapter URLs ([#3089](https://github.com/mikf/gallery-dl/issues/3089))
+- [hentaihere] support minor versions in chapter URLs
+- [kemonoparty] fix 'dms' extraction ([#3106](https://github.com/mikf/gallery-dl/issues/3106))
+- [kemonoparty] update pagination offset
+- [manganelo] update domain to 'chapmanganato.com' ([#3097](https://github.com/mikf/gallery-dl/issues/3097))
+- [pixiv] use 'exact_match_for_tags' as default search mode ([#3092](https://github.com/mikf/gallery-dl/issues/3092))
+- [redgifs] fix 'token' extraction ([#3080](https://github.com/mikf/gallery-dl/issues/3080), [#3081](https://github.com/mikf/gallery-dl/issues/3081))
+- [skeb] fix extraction ([#3112](https://github.com/mikf/gallery-dl/issues/3112))
+- improve compatibility of DownloadArchive ([#3078](https://github.com/mikf/gallery-dl/issues/3078))
+
 ## 1.23.3 - 2022-10-15
 ### Additions
 - [2chen] Add `2chen.moe` extractor ([#2707](https://github.com/mikf/gallery-dl/issues/2707))
diff --git a/PKG-INFO b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gallery_dl
-Version: 1.23.3
+Version: 1.23.5
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -99,8 +99,8 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.3/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.3/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.5/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.5/gallery-dl.bin>`__
 | Executables build from the latest commit can be found at
 | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
diff --git a/README.rst b/README.rst
@@ -66,8 +66,8 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.3/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.3/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.5/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.5/gallery-dl.bin>`__
 | Executables build from the latest commit can be found at
 | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index cca3dee..0b27854 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2022-10-15" "1.23.3" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2022-10-30" "1.23.5" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index 1c484b6..8944195 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2022-10-15" "1.23.3" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2022-10-30" "1.23.5" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
 .\" disable justification (adjust text to left margin only)
@@ -1746,8 +1746,13 @@
 A (comma-separated) list of subcategories to include
 when processing a user profile.
 Possible values are
-\f[I]"posts"\f[], \f[I]"reels"\f[], \f[I]"channel"\f[], \f[I]"tagged"\f[],
-\f[I]"stories"\f[], \f[I]"highlights"\f[].
+\f[I]"posts"\f[],
+\f[I]"reels"\f[],
+\f[I]"channel"\f[]
+\f[I]"tagged"\f[],
+\f[I]"stories"\f[],
+\f[I]"highlights"\f[],
+\f[I]"avatar"\f[].
 You can use \f[I]"all"\f[] instead of listing all values separately.
@@ -2250,7 +2255,7 @@
 Possible values are
 It is possible to use \f[I]"all"\f[] instead of listing all values separately.
-.SS extractor.pixiv.artworks.metadata
+.SS extractor.pixiv.metadata
 .IP "Type:" 6
 \f[I]bool\f[]
@@ -2727,6 +2732,29 @@
 Possible types are \f[I]text\f[], \f[I]quote\f[], \f[I]link\f[], \f[I]answer\f[]
 You can use \f[I]"all"\f[] instead of listing all types separately.
+.SS extractor.tumblr.fallback-delay
+.IP "Type:" 6
+\f[I]float\f[]
+
+.IP "Default:" 9
+\f[I]120.0\f[]
+
+.IP "Description:" 4
+Number of seconds to wait between retries
+for fetching full-resolution images.
+
+
+.SS extractor.tumblr.fallback-retries
+.IP "Type:" 6
+\f[I]integer\f[]
+
+.IP "Default:" 9
+\f[I]2\f[]
+
+.IP "Description:" 4
+Number of retries for fetching full-resolution images.
+
+
 .SS extractor.twibooru.api-key
 .IP "Type:" 6
 \f[I]string\f[]
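The two tumblr options documented just above can also be set programmatically. A minimal sketch, assuming the `config.set()` helper with the signature that appears in `test/test_results.py` at the end of this diff; the values are illustrative, not recommendations:

```python
from gallery_dl import config

# Defaults per the man page are 120.0 seconds and 2 retries; here we
# halve the delay and allow a single retry for full-resolution images.
config.set(("extractor", "tumblr"), "fallback-delay", 60.0)
config.set(("extractor", "tumblr"), "fallback-retries", 1)
```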
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index e507eb0..1fcbb3b 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -230,6 +230,7 @@
         {
             "refresh-token": null,
             "include": "artworks",
+            "metadata": false,
             "tags": "japanese",
             "ugoira": true
         },
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index c1bfabf..d00e803 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gallery-dl
-Version: 1.23.3
+Version: 1.23.5
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -99,8 +99,8 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.3/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.3/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.5/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.5/gallery-dl.bin>`__
 | Executables build from the latest commit can be found at
 | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt
index b768d5b..3fa2176 100644
--- a/gallery_dl.egg-info/SOURCES.txt
+++ b/gallery_dl.egg-info/SOURCES.txt
@@ -47,7 +47,6 @@ gallery_dl/extractor/420chan.py
 gallery_dl/extractor/4chan.py
 gallery_dl/extractor/500px.py
 gallery_dl/extractor/8chan.py
-gallery_dl/extractor/8kun.py
 gallery_dl/extractor/8muses.py
 gallery_dl/extractor/__init__.py
 gallery_dl/extractor/adultempire.py
@@ -189,6 +188,7 @@ gallery_dl/extractor/twibooru.py
 gallery_dl/extractor/twitter.py
 gallery_dl/extractor/unsplash.py
 gallery_dl/extractor/vanillarock.py
+gallery_dl/extractor/vichan.py
 gallery_dl/extractor/vk.py
 gallery_dl/extractor/vsco.py
 gallery_dl/extractor/wallhaven.py
@@ -198,7 +198,6 @@ gallery_dl/extractor/weasyl.py
 gallery_dl/extractor/webtoons.py
 gallery_dl/extractor/weibo.py
 gallery_dl/extractor/wikiart.py
-gallery_dl/extractor/wikieat.py
 gallery_dl/extractor/xhamster.py
 gallery_dl/extractor/xvideos.py
 gallery_dl/extractor/ytdl.py
diff --git a/gallery_dl/extractor/8kun.py b/gallery_dl/extractor/8kun.py
deleted file mode 100644
index 5d260b9..0000000
--- a/gallery_dl/extractor/8kun.py
+++ /dev/null
@@ -1,100 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2020-2022 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://8kun.top/"""
-
-from .common import Extractor, Message
-from .. import text
-
-
-class _8kunThreadExtractor(Extractor):
-    """Extractor for 8kun threads"""
-    category = "8kun"
-    subcategory = "thread"
-    directory_fmt = ("{category}", "{board}", "{thread} {title}")
-    filename_fmt = "{time}{num:?-//} {filename}.{extension}"
-    archive_fmt = "{board}_{thread}_{tim}"
-    pattern = r"(?:https?://)?8kun\.top/([^/]+)/res/(\d+)"
-    test = (
-        ("https://8kun.top/test/res/65248.html", {
-            "pattern": r"https://media\.8kun\.top/file_store/\w{64}\.\w+",
-            "count": ">= 8",
-        }),
-        # old-style file URLs (#1101)
-        # ("https://8kun.top/d/res/13258.html", {
-        #     "pattern": r"https://media\.8kun\.top/d/src/\d+(-\d)?\.\w+",
-        #     "range": "1-20",
-        # }),
-    )
-
-    def __init__(self, match):
-        Extractor.__init__(self, match)
-        self.board, self.thread = match.groups()
-
-    def items(self):
-        url = "https://8kun.top/{}/res/{}.json".format(self.board, self.thread)
-        posts = self.request(url).json()["posts"]
-        title = posts[0].get("sub") or text.remove_html(posts[0]["com"])
-        process = self._process
-
-        data = {
-            "board" : self.board,
-            "thread": self.thread,
-            "title" : text.unescape(title)[:50],
-            "num"   : 0,
-        }
-
-        yield Message.Directory, data
-        for post in posts:
-            if "filename" in post:
-                yield process(post, data)
-                if "extra_files" in post:
-                    for post["num"], filedata in enumerate(
-                            post["extra_files"], 1):
-                        yield process(post, filedata)
-
-    @staticmethod
-    def _process(post, data):
-        post.update(data)
-        post["extension"] = post["ext"][1:]
-        tim = post["tim"]
-        url = ("https://media.8kun.top/" +
-               ("file_store/" if len(tim) > 16 else post["board"] + "/src/") +
-               tim + post["ext"])
-        return Message.Url, url, post
-
-
-class _8kunBoardExtractor(Extractor):
-    """Extractor for 8kun boards"""
-    category = "8kun"
-    subcategory = "board"
-    pattern = r"(?:https?://)?8kun\.top/([^/?#]+)/(?:index|\d+)\.html"
-    test = (
-        ("https://8kun.top/v/index.html", {
-            "pattern": _8kunThreadExtractor.pattern,
-            "count": ">= 100",
-        }),
-        ("https://8kun.top/v/2.html"),
-        ("https://8kun.top/v/index.html?PageSpeed=noscript"),
-    )
-
-    def __init__(self, match):
-        Extractor.__init__(self, match)
-        self.board = match.group(1)
-
-    def items(self):
-        url = "https://8kun.top/{}/threads.json".format(self.board)
-        threads = self.request(url).json()
-
-        for page in threads:
-            for thread in page["threads"]:
-                url = "https://8kun.top/{}/res/{}.html".format(
-                    self.board, thread["no"])
-                thread["page"] = page["page"]
-                thread["_extractor"] = _8kunThreadExtractor
-                yield Message.Queue, url, thread
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 851f660..9e0340a 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -17,7 +17,6 @@ modules = [
    "4chan",
    "500px",
    "8chan",
-    "8kun",
    "8muses",
    "adultempire",
    "architizer",
@@ -143,6 +142,7 @@ modules = [
    "twitter",
    "unsplash",
    "vanillarock",
+    "vichan",
    "vk",
    "vsco",
    "wallhaven",
@@ -152,7 +152,6 @@ modules = [
    "webtoons",
    "weibo",
    "wikiart",
-    "wikieat",
    "xhamster",
    "xvideos",
    "zerochan",
diff --git a/gallery_dl/extractor/bcy.py b/gallery_dl/extractor/bcy.py
index 47e51b3..7982881 100644
--- a/gallery_dl/extractor/bcy.py
+++ b/gallery_dl/extractor/bcy.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2020-2021 Mike Fährmann
+# Copyright 2020-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -25,9 +25,12 @@ class BcyExtractor(Extractor):
 
     def __init__(self, match):
         Extractor.__init__(self, match)
         self.item_id = match.group(1)
+        self.session.headers["Referer"] = self.root + "/"
 
     def items(self):
-        sub = re.compile(r"^https?://p\d+-bcy\.byteimg\.com/img/banciyuan").sub
+        sub = re.compile(r"^https?://p\d+-bcy"
+                         r"(?:-sign\.bcyimg\.com|\.byteimg\.com/img)"
+                         r"/banciyuan").sub
         iroot = "https://img-bcy-qn.pstatp.com"
         noop = self.config("noop")
@@ -64,19 +67,18 @@ class BcyExtractor(Extractor):
                 url = image["path"].partition("~")[0]
                 text.nameext_from_url(url, data)
 
+                # full-resolution image without watermark
                 if data["extension"]:
                     if not url.startswith(iroot):
                         url = sub(iroot, url)
                     data["filter"] = ""
                     yield Message.Url, url, data
 
+                # watermarked image & low quality noop filter
                 else:
-                    if not multi:
-                        if len(post["multi"]) < len(post["image_list"]):
-                            multi = self._data_from_post(post["item_id"])
-                            multi = multi["post_data"]["multi"]
-                        else:
-                            multi = post["multi"]
+                    if multi is None:
+                        multi = self._data_from_post(
+                            post["item_id"])["post_data"]["multi"]
                     image = multi[data["num"] - 1]
 
                     if image["origin"]:
@@ -111,8 +113,8 @@ class BcyUserExtractor(BcyExtractor):
             "count": ">= 20",
         }),
         ("https://bcy.net/u/109282764041", {
-            "pattern": r"https://p\d-bcy.byteimg.com/img/banciyuan/[0-9a-f]+"
-                       r"~tplv-banciyuan-logo-v3:.+\.image",
+            "pattern": r"https://p\d-bcy-sign\.bcyimg\.com/banciyuan/[0-9a-f]+"
+                       r"~tplv-bcyx-yuan-logo-v1:.+\.image",
             "range": "1-25",
             "count": 25,
         }),
@@ -171,13 +173,13 @@ class BcyPostExtractor(BcyExtractor):
         }),
         # only watermarked images available
         ("https://bcy.net/item/detail/6950136331708144648", {
-            "pattern": r"https://p\d-bcy.byteimg.com/img/banciyuan/[0-9a-f]+"
-                       r"~tplv-banciyuan-logo-v3:.+\.image",
-            "count": 8,
+            "pattern": r"https://p\d-bcy-sign\.bcyimg\.com/banciyuan/[0-9a-f]+"
+                       r"~tplv-bcyx-yuan-logo-v1:.+\.image",
+            "count": 10,
             "keyword": {"filter": "watermark"},
         }),
         # deleted
-        ("https://bcy.net/item/detail/6780546160802143236", {
+        ("https://bcy.net/item/detail/6780546160802143237", {
             "exception": exception.NotFoundError,
             "count": 0,
         }),
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index c455ce1..906afda 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -102,6 +102,9 @@ class DanbooruExtractor(BaseExtractor):
                 resp = self.request(template.format(self.root, post["id"]))
                 post.update(resp.json())
 
+            if url[0] == "/":
+                url = self.root + url
+
             post.update(data)
             yield Message.Directory, post
             yield Message.Url, url, post
@@ -170,6 +173,10 @@ INSTANCES = {
         "pattern": r"booru\.allthefallen\.moe",
         "page-limit": 5000,
     },
+    "aibooru": {
+        "root": None,
+        "pattern": r"(?:safe.)?aibooru\.online",
+    }
 }
 
 BASE_PATTERN = DanbooruExtractor.update(INSTANCES)
@@ -202,10 +209,16 @@ class DanbooruTagExtractor(DanbooruExtractor):
         ("https://booru.allthefallen.moe/posts?tags=yume_shokunin", {
             "count": 12,
         }),
+        ("https://aibooru.online/posts?tags=center_frills&z=1", {
+            "pattern": r"https://aibooru\.online/data/original"
+                       r"/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}\.\w+",
+            "count": ">= 3",
+        }),
         ("https://hijiribe.donmai.us/posts?tags=bonocho"),
         ("https://sonohara.donmai.us/posts?tags=bonocho"),
         ("https://safebooru.donmai.us/posts?tags=bonocho"),
         ("https://e926.net/posts?tags=anry"),
+        ("https://safe.aibooru.online/posts?tags=center_frills"),
     )
 
     def __init__(self, match):
@@ -238,6 +251,7 @@ class DanbooruPoolExtractor(DanbooruExtractor):
             "url": "902549ffcdb00fe033c3f63e12bc3cb95c5fd8d5",
             "count": 6,
         }),
+        ("https://aibooru.online/pools/1"),
         ("https://danbooru.donmai.us/pool/show/7659"),
         ("https://e621.net/pool/show/73"),
     )
@@ -300,6 +314,9 @@ class DanbooruPostExtractor(DanbooruExtractor):
         ("https://booru.allthefallen.moe/posts/22", {
             "content": "21dda68e1d7e0a554078e62923f537d8e895cac8",
         }),
+        ("https://aibooru.online/posts/1", {
+            "content": "54d548743cd67799a62c77cbae97cfa0fec1b7e9",
+        }),
         ("https://danbooru.donmai.us/post/show/294929"),
         ("https://e621.net/post/show/535"),
     )
@@ -334,6 +351,7 @@ class DanbooruPopularExtractor(DanbooruExtractor):
             "count": ">= 70",
         }),
         ("https://booru.allthefallen.moe/explore/posts/popular"),
+        ("https://aibooru.online/explore/posts/popular"),
     )
 
     def __init__(self, match):
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index 92f7ac2..a2cf0c0 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -142,13 +142,23 @@ class GelbooruPoolExtractor(GelbooruBase,
 
 class GelbooruPostExtractor(GelbooruBase,
                             gelbooru_v02.GelbooruV02PostExtractor):
     """Extractor for single images from gelbooru.com"""
-    pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
-               r"\?page=post&s=view&id=(?P<post>\d+)")
+    pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?\?"
+               r"(?=(?:[^#]+&)?page=post(?:&|#|$))"
+               r"(?=(?:[^#]+&)?s=view(?:&|#|$))"
+               r"(?:[^#]+&)?id=(\d+)")
     test = (
         ("https://gelbooru.com/index.php?page=post&s=view&id=313638", {
             "content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
             "count": 1,
         }),
+
+        ("https://gelbooru.com/index.php?page=post&s=view&id=313638"),
+        ("https://gelbooru.com/index.php?s=view&page=post&id=313638"),
+        ("https://gelbooru.com/index.php?page=post&id=313638&s=view"),
+        ("https://gelbooru.com/index.php?s=view&id=313638&page=post"),
+        ("https://gelbooru.com/index.php?id=313638&page=post&s=view"),
+        ("https://gelbooru.com/index.php?id=313638&s=view&page=post"),
+
         ("https://gelbooru.com/index.php?page=post&s=view&id=6018318", {
             "options": (("tags", True),),
             "content": "977caf22f27c72a5d07ea4d4d9719acdab810991",
diff --git a/gallery_dl/extractor/hentai2read.py b/gallery_dl/extractor/hentai2read.py
index 53be67b..dc4e31d 100644
--- a/gallery_dl/extractor/hentai2read.py
+++ b/gallery_dl/extractor/hentai2read.py
@@ -1,12 +1,12 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2016-2019 Mike Fährmann
+# Copyright 2016-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extract hentai-manga from https://hentai2read.com/"""
+"""Extractors for https://hentai2read.com/"""
 
 from .common import ChapterExtractor, MangaExtractor
 from .. import text
@@ -23,11 +23,32 @@ class Hentai2readBase():
 class Hentai2readChapterExtractor(Hentai2readBase, ChapterExtractor):
     """Extractor for a single manga chapter from hentai2read.com"""
     archive_fmt = "{chapter_id}_{page}"
-    pattern = r"(?:https?://)?(?:www\.)?hentai2read\.com(/[^/?#]+/(\d+))"
-    test = ("https://hentai2read.com/amazon_elixir/1/", {
-        "url": "964b942cf492b3a129d2fe2608abfc475bc99e71",
-        "keyword": "ff84b8f751f0e4ee37717efc4332ff1db71951d9",
-    })
+    pattern = r"(?:https?://)?(?:www\.)?hentai2read\.com(/[^/?#]+/([^/?#]+))"
+    test = (
+        ("https://hentai2read.com/amazon_elixir/1/", {
+            "url": "964b942cf492b3a129d2fe2608abfc475bc99e71",
+            "keyword": "85645b02d34aa11b3deb6dadd7536863476e1bad",
+        }),
+        ("https://hentai2read.com/popuni_kei_joshi_panic/2.5/", {
+            "pattern": r"https://hentaicdn\.com/hentai"
+                       r"/13088/2\.5y/ccdn00\d+\.jpg",
+            "count": 36,
+            "keyword": {
+                "author": "Kurisu",
+                "chapter": 2,
+                "chapter_id": 75152,
+                "chapter_minor": ".5",
+                "count": 36,
+                "lang": "en",
+                "language": "English",
+                "manga": "Popuni Kei Joshi Panic!",
+                "manga_id": 13088,
+                "page": int,
+                "title": "Popuni Kei Joshi Panic! 2.5",
+                "type": "Original",
+            },
+        }),
+    )
 
     def __init__(self, match):
         self.chapter = match.group(2)
@@ -37,12 +58,14 @@ class Hentai2readChapterExtractor(Hentai2readBase, ChapterExtractor):
         title, pos = text.extract(page, "<title>", "</title>")
         manga_id, pos = text.extract(page, 'data-mid="', '"', pos)
         chapter_id, pos = text.extract(page, 'data-cid="', '"', pos)
+        chapter, sep, minor = self.chapter.partition(".")
 
         match = re.match(r"Reading (.+) \(([^)]+)\) Hentai(?: by (.+))? - "
-                         r"(\d+): (.+) . Page 1 ", title)
+                         r"([^:]+): (.+) . Page 1 ", title)
         return {
             "manga": match.group(1),
             "manga_id": text.parse_int(manga_id),
-            "chapter": text.parse_int(self.chapter),
+            "chapter": text.parse_int(chapter),
+            "chapter_minor": sep + minor,
             "chapter_id": text.parse_int(chapter_id),
             "type": match.group(2),
             "author": match.group(3),
@@ -51,8 +74,7 @@ class Hentai2readChapterExtractor(Hentai2readBase, ChapterExtractor):
             "language": "English",
         }
 
-    @staticmethod
-    def images(page):
+    def images(self, page):
         images = text.extract(page, "'images' : ", ",\n")[0]
         return [
             ("https://hentaicdn.com/hentai" + part, None)
@@ -67,18 +89,35 @@ class Hentai2readMangaExtractor(Hentai2readBase, MangaExtractor):
     test = (
         ("https://hentai2read.com/amazon_elixir/", {
             "url": "273073752d418ec887d7f7211e42b832e8c403ba",
-            "keyword": "13c1ce7e15cbb941f01c843b0e89adc993d939ac",
+            "keyword": "5c1b712258e78e120907121d3987c71f834d13e1",
         }),
         ("https://hentai2read.com/oshikage_riot/", {
             "url": "6595f920a3088a15c2819c502862d45f8eb6bea6",
-            "keyword": "675c7b7a4fa52cf569c283553bd16b4200a5cd36",
+            "keyword": "a2e9724acb221040d4b29bf9aa8cb75b2240d8af",
+        }),
+        ("https://hentai2read.com/popuni_kei_joshi_panic/", {
+            "pattern": Hentai2readChapterExtractor.pattern,
+            "range": "2-3",
+            "keyword": {
+                "chapter": int,
+                "chapter_id": int,
+                "chapter_minor": ".5",
+                "lang": "en",
+                "language": "English",
+                "manga": "Popuni Kei Joshi Panic!",
+                "manga_id": 13088,
+                "title": str,
+                "type": "Original",
+            },
         }),
     )
 
     def chapters(self, page):
         results = []
+
+        pos = page.find('itemscope itemtype="http://schema.org/Book') + 1
         manga, pos = text.extract(
-            page, '<span itemprop="name">', '</span>')
+            page, '<span itemprop="name">', '</span>', pos)
         mtype, pos = text.extract(
             page, '<small class="text-danger">[', ']</small>', pos)
         manga_id = text.parse_int(text.extract(
@@ -90,12 +129,19 @@ class Hentai2readMangaExtractor(Hentai2readBase, MangaExtractor):
                 return results
             _  , pos = text.extract(page, ' href="', '"', pos)
             url, pos = text.extract(page, ' href="', '"', pos)
-            chapter, pos = text.extract(page, '>', '<', pos)
+            chapter, pos = text.extract(page, '>', '<', pos)
             chapter, _, title = text.unescape(chapter).strip().partition(" - ")
+            chapter, sep, minor = chapter.partition(".")
+
             results.append((url, {
-                "manga_id": manga_id, "manga": manga, "type": mtype,
-                "chapter_id": text.parse_int(chapter_id),
+                "manga": manga,
+                "manga_id": manga_id,
                 "chapter": text.parse_int(chapter),
-                "title": title, "lang": "en", "language": "English",
+                "chapter_minor": sep + minor,
+                "chapter_id": text.parse_int(chapter_id),
+                "type": mtype,
+                "title": title,
+                "lang": "en",
+                "language": "English",
             }))
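The hentai2read changes hinge on splitting a chapter string such as `2.5` into an integer part and a `chapter_minor` suffix via `str.partition`, exactly as in the `metadata()` hunk above (and again in hentaihere.py below):

```python
# "2.5" -> chapter 2, chapter_minor ".5"; plain chapters keep an empty minor
chapter, sep, minor = "2.5".partition(".")
print(int(chapter), sep + minor)        # 2 .5

chapter, sep, minor = "23".partition(".")
print(int(chapter), repr(sep + minor))  # 23 ''
```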
"5c1b712258e78e120907121d3987c71f834d13e1", }), ("https://hentaihere.com/m/S7608", { "url": "6c5239758dc93f6b1b4175922836c10391b174f7", - "keyword": "675c7b7a4fa52cf569c283553bd16b4200a5cd36", + "keyword": { + "chapter": int, + "chapter_id": int, + "chapter_minor": "", + "lang": "en", + "language": "English", + "manga": "Oshikake Riot", + "manga_id": 7608, + "title": r"re:Oshikake Riot( \d+)?", + "type": "Original", + }, }), ) def chapters(self, page): results = [] - manga_id = text.parse_int( - self.manga_url.rstrip("/").rpartition("/")[2][1:]) + + pos = page.find('itemscope itemtype="http://schema.org/Book') + 1 manga, pos = text.extract( - page, '<span itemprop="name">', '</span>') + page, '<span itemprop="name">', '</span>', pos) mtype, pos = text.extract( page, '<span class="mngType text-danger">[', ']</span>', pos) + manga_id = text.parse_int( + self.manga_url.rstrip("/").rpartition("/")[2][1:]) while True: marker, pos = text.extract( @@ -90,12 +121,20 @@ class HentaihereMangaExtractor(HentaihereBase, MangaExtractor): if marker is None: return results url, pos = text.extract(page, '<a href="', '"', pos) + chapter, pos = text.extract(page, 'title="Tagged: -">\n', '<', pos) chapter_id, pos = text.extract(page, '/C', '"', pos) chapter, _, title = text.unescape(chapter).strip().partition(" - ") + chapter, sep, minor = chapter.partition(".") + results.append((url, { - "manga_id": manga_id, "manga": manga, "type": mtype, - "chapter_id": text.parse_int(chapter_id), + "manga_id": manga_id, + "manga": manga, "chapter": text.parse_int(chapter), - "title": title, "lang": "en", "language": "English", + "chapter_minor": sep + minor, + "chapter_id": text.parse_int(chapter_id), + "type": mtype, + "title": title, + "lang": "en", + "language": "English", })) diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 4775613..a4ea71a 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -12,6 +12,7 @@ from .common import Extractor, Message from .. 
import text, util, exception from ..cache import cache, memcache +import binascii import json import time import re @@ -171,6 +172,15 @@ class InstagramExtractor(Extractor): data["location_url"] = "{}/explore/locations/{}/{}/".format( self.root, location["pk"], slug) + coauthors = post.get("coauthor_producers") + if coauthors: + data["coauthors"] = [ + {"id" : user["pk"], + "username" : user["username"], + "full_name": user["full_name"]} + for user in coauthors + ] + if "carousel_media" in post: items = post["carousel_media"] data["sidecar_media_id"] = data["post_id"] @@ -265,6 +275,14 @@ class InstagramExtractor(Extractor): data["location_url"] = "{}/explore/locations/{}/{}/".format( self.root, location["id"], location["slug"]) + coauthors = post.get("coauthor_producers") + if coauthors: + data["coauthors"] = [ + {"id" : user["id"], + "username": user["username"]} + for user in coauthors + ] + data["_files"] = files = [] if "edge_sidecar_to_children" in post: for num, edge in enumerate( @@ -361,6 +379,7 @@ class InstagramUserExtractor(InstagramExtractor): base = "{}/{}/".format(self.root, self.item) stories = "{}/stories/{}/".format(self.root, self.item) return self._dispatch_extractors(( + (InstagramAvatarExtractor , base + "avatar/"), (InstagramStoriesExtractor , stories), (InstagramHighlightsExtractor, base + "highlights/"), (InstagramPostsExtractor , base + "posts/"), @@ -418,7 +437,7 @@ class InstagramTaggedExtractor(InstagramExtractor): return {"tagged_owner_id": self.user_id} self.user_id = self.api.user_id(self.item) - user = self.api.user(self.item) + user = self.api.user_by_name(self.item) return { "tagged_owner_id" : user["id"], @@ -483,25 +502,32 @@ class InstagramStoriesExtractor(InstagramExtractor): """Extractor for Instagram stories""" subcategory = "stories" pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" - r"/stories/(?:highlights/(\d+)|([^/?#]+)(?:/(\d+))?)") + r"/s(?:tories/(?:highlights/(\d+)|([^/?#]+)(?:/(\d+))?)" + r"|/(aGlnaGxpZ2h0[^?#]+)(?:\?story_media_id=(\d+))?)") test = ( ("https://www.instagram.com/stories/instagram/"), ("https://www.instagram.com/stories/highlights/18042509488170095/"), ("https://instagram.com/stories/geekmig/2724343156064789461"), + ("https://www.instagram.com/s/aGlnaGxpZ2h0OjE4MDQyNTA5NDg4MTcwMDk1"), + ("https://www.instagram.com/s/aGlnaGxpZ2h0OjE4MDQyNTA5NDg4MTcwMDk1" + "?story_media_id=2724343156064789461"), ) def __init__(self, match): - self.highlight_id, self.user, self.media_id = match.groups() - if self.highlight_id: + h1, self.user, m1, h2, m2 = match.groups() + + if self.user: + self.highlight_id = None + else: self.subcategory = InstagramHighlightsExtractor.subcategory + self.highlight_id = ("highlight:" + h1 if h1 else + binascii.a2b_base64(h2).decode()) + + self.media_id = m1 or m2 InstagramExtractor.__init__(self, match) def posts(self): - if self.highlight_id: - reel_id = "highlight:" + self.highlight_id - else: - reel_id = self.api.user_id(self.user) - + reel_id = self.highlight_id or self.api.user_id(self.user) reels = self.api.reels_media(reel_id) if self.media_id and reels: @@ -544,6 +570,48 @@ class InstagramTagExtractor(InstagramExtractor): return self.api.tags_media(self.item) +class InstagramAvatarExtractor(InstagramExtractor): + """Extractor for an Instagram user's avatar""" + subcategory = "avatar" + pattern = USER_PATTERN + r"/avatar" + test = ("https://www.instagram.com/instagram/avatar", { + "pattern": r"https://instagram\.[\w.-]+\.fbcdn\.net/v/t51\.2885-19" + 
r"/281440578_1088265838702675_6233856337905829714_n\.jpg", + }) + + def posts(self): + if self._logged_in: + user_id = self.api.user_id(self.item) + user = self.api.user_by_id(user_id) + avatar = (user.get("hd_profile_pic_url_info") or + user["hd_profile_pic_versions"][-1]) + else: + user = self.item + if user.startswith("id:"): + user = self.api.user_by_id(user[3:]) + else: + user = self.api.user_by_name(user) + user["pk"] = user["id"] + url = user.get("profile_pic_url_hd") or user["profile_pic_url"] + avatar = {"url": url, "width": 0, "height": 0} + + pk = user.get("profile_pic_id") + if pk: + pk = pk.partition("_")[0] + code = shortcode_from_id(pk) + else: + pk = code = "avatar:" + str(user["pk"]) + + return ({ + "pk" : pk, + "code" : code, + "user" : user, + "caption" : None, + "like_count": 0, + "image_versions2": {"candidates": (avatar,)}, + },) + + class InstagramPostExtractor(InstagramExtractor): """Extractor for an Instagram post""" subcategory = "post" @@ -693,15 +761,19 @@ class InstagramRestAPI(): return self._pagination_sections(endpoint, data) @memcache(keyarg=1) - def user(self, screen_name): + def user_by_name(self, screen_name): endpoint = "/v1/users/web_profile_info/" params = {"username": screen_name} return self._call(endpoint, params=params)["data"]["user"] + def user_by_id(self, user_id): + endpoint = "/v1/users/{}/info/".format(user_id) + return self._call(endpoint)["user"] + def user_id(self, screen_name): if screen_name.startswith("id:"): return screen_name[3:] - user = self.user(screen_name) + user = self.user_by_name(screen_name) if user is None: raise exception.AuthorizationError( "Login required to access this profile") @@ -812,7 +884,8 @@ class InstagramGraphqlAPI(): self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode api = InstagramRestAPI(extractor) - self.user = api.user + self.user_by_name = api.user_by_name + self.user_by_id = api.user_by_id self.user_id = api.user_id @staticmethod diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 750b741..21ff114 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -190,7 +190,7 @@ class KemonopartyExtractor(Extractor): for dm in text.extract_iter(page, "<article", "</article>"): dms.append({ "body": text.unescape(text.extract( - dm, '<pre>', '</pre></section>', + dm, "<pre>", "</pre></", )[0].strip()), "date": text.extract(dm, 'datetime="', '"')[0], }) @@ -230,9 +230,10 @@ class KemonopartyUserExtractor(KemonopartyExtractor): posts = self.request(url, params=params).json() yield from posts - if len(posts) < 25: + cnt = len(posts) + if cnt < 25: return - params["o"] += 25 + params["o"] += cnt class KemonopartyPostExtractor(KemonopartyExtractor): @@ -420,9 +421,10 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor): posts = self.request(url, params=params).json() yield from posts - if len(posts) < 25: + cnt = len(posts) + if cnt < 25: break - params["skip"] += 25 + params["skip"] += cnt class KemonopartyDiscordServerExtractor(KemonopartyExtractor): diff --git a/gallery_dl/extractor/manganelo.py b/gallery_dl/extractor/manganelo.py index 3444a7a..a12a801 100644 --- a/gallery_dl/extractor/manganelo.py +++ b/gallery_dl/extractor/manganelo.py @@ -11,21 +11,22 @@ from .. 
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index 750b741..21ff114 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -190,7 +190,7 @@ class KemonopartyExtractor(Extractor):
         for dm in text.extract_iter(page, "<article", "</article>"):
             dms.append({
                 "body": text.unescape(text.extract(
-                    dm, '<pre>', '</pre></section>',
+                    dm, "<pre>", "</pre></",
                 )[0].strip()),
                 "date": text.extract(dm, 'datetime="', '"')[0],
             })
@@ -230,9 +230,10 @@ class KemonopartyUserExtractor(KemonopartyExtractor):
             posts = self.request(url, params=params).json()
             yield from posts
 
-            if len(posts) < 25:
+            cnt = len(posts)
+            if cnt < 25:
                 return
-            params["o"] += 25
+            params["o"] += cnt
 
 
 class KemonopartyPostExtractor(KemonopartyExtractor):
@@ -420,9 +421,10 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
             posts = self.request(url, params=params).json()
             yield from posts
 
-            if len(posts) < 25:
+            cnt = len(posts)
+            if cnt < 25:
                 break
-            params["skip"] += 25
+            params["skip"] += cnt
 
 
 class KemonopartyDiscordServerExtractor(KemonopartyExtractor):
diff --git a/gallery_dl/extractor/manganelo.py b/gallery_dl/extractor/manganelo.py
index 3444a7a..a12a801 100644
--- a/gallery_dl/extractor/manganelo.py
+++ b/gallery_dl/extractor/manganelo.py
@@ -11,21 +11,22 @@ from .. import text
 import re
 
 BASE_PATTERN = \
-    r"(?:https?://)?((?:(?:read)?manganato|(?:www\.)?manganelo)\.com)"
+    r"(?:https?://)?((?:(?:chap|read)?manganato|(?:www\.)?manganelo)\.com)"
 
 
 class ManganeloChapterExtractor(ChapterExtractor):
     """Extractor for manga-chapters from manganelo.com"""
     category = "manganelo"
-    root = "https://readmanganato.com"
+    root = "https://chapmanganato.com"
     pattern = BASE_PATTERN + r"(/(?:manga-\w+|chapter/\w+)/chapter[-_][^/?#]+)"
     test = (
-        ("https://readmanganato.com/manga-gn983696/chapter-23", {
+        ("https://chapmanganato.com/manga-gn983696/chapter-23", {
             "pattern": r"https://v\d+\.mkklcdnv6tempv5\.com/img/tab_17/03/23"
                        r"/39/gn983696/vol_3_chapter_23_24_yen/\d+-[no]\.jpg",
             "keyword": "2c5cd59342f149375df9bcb50aa416b4d04a43cf",
             "count": 25,
         }),
+        ("https://readmanganato.com/manga-gn983696/chapter-23"),
         ("https://manganelo.com/chapter/gamers/chapter_15"),
         ("https://manganelo.com/chapter/gq921227/chapter_23"),
     )
@@ -73,14 +74,15 @@ class ManganeloChapterExtractor(ChapterExtractor):
 class ManganeloMangaExtractor(MangaExtractor):
     """Extractor for manga from manganelo.com"""
     category = "manganelo"
-    root = "https://readmanganato.com"
+    root = "https://chapmanganato.com"
     chapterclass = ManganeloChapterExtractor
     pattern = BASE_PATTERN + r"(/(?:manga[-/]|read_)\w+)/?$"
     test = (
-        ("https://readmanganato.com/manga-gn983696", {
+        ("https://chapmanganato.com/manga-gn983696", {
             "pattern": ManganeloChapterExtractor.pattern,
             "count": ">= 25",
         }),
+        ("https://readmanganato.com/manga-gn983696"),
         ("https://manganelo.com/manga/read_otome_no_teikoku"),
         ("https://manganelo.com/manga/ol921234/"),
     )
diff --git a/gallery_dl/extractor/mangasee.py b/gallery_dl/extractor/mangasee.py
index 2bd11ef..5fa5631 100644
--- a/gallery_dl/extractor/mangasee.py
+++ b/gallery_dl/extractor/mangasee.py
@@ -35,33 +35,59 @@ class MangaseeBase():
 
 class MangaseeChapterExtractor(MangaseeBase, ChapterExtractor):
-    pattern = r"(?:https?://)?mangasee123\.com(/read-online/[^/?#]+\.html)"
-    test = (("https://mangasee123.com/read-online"
-             "/Tokyo-Innocent-chapter-4.5-page-1.html"), {
-        "pattern": r"https://[^/]+/manga/Tokyo-Innocent/0004\.5-00\d\.png",
-        "count": 8,
-        "keyword": {
-            "chapter": 4,
-            "chapter_minor": ".5",
-            "chapter_string": "100045",
-            "count": 8,
-            "date": "dt:2020-01-20 21:52:53",
-            "extension": "png",
-            "filename": r"re:0004\.5-00\d",
-            "index": "1",
-            "lang": "en",
-            "language": "English",
-            "manga": "Tokyo Innocent",
-            "page": int,
-            "title": "",
-        },
-    })
+    pattern = (r"(?:https?://)?(mangasee123|manga4life)\.com"
+               r"(/read-online/[^/?#]+\.html)")
+    test = (
+        (("https://mangasee123.com/read-online"
+          "/Tokyo-Innocent-chapter-4.5-page-1.html"), {
+            "pattern": r"https://[^/]+/manga/Tokyo-Innocent/0004\.5-00\d\.png",
+            "count": 8,
+            "keyword": {
+                "chapter": 4,
+                "chapter_minor": ".5",
+                "chapter_string": "100045",
+                "count": 8,
+                "date": "dt:2020-01-20 21:52:53",
+                "extension": "png",
+                "filename": r"re:0004\.5-00\d",
+                "index": "1",
+                "lang": "en",
+                "language": "English",
+                "manga": "Tokyo Innocent",
+                "page": int,
+                "title": "",
+            },
+        }),
+        (("https://manga4life.com/read-online"
+          "/One-Piece-chapter-1063-page-1.html"), {
+            "pattern": r"https://[^/]+/manga/One-Piece/1063-0\d\d\.png",
+            "count": 13,
+            "keyword": {
+                "chapter": 1063,
+                "chapter_minor": "",
+                "chapter_string": "110630",
+                "count": 13,
+                "date": "dt:2022-10-16 17:32:54",
+                "extension": "png",
+                "filename": r"re:1063-0\d\d",
+                "index": "1",
+                "lang": "en",
+                "language": "English",
+                "manga": "One Piece",
+                "page": int,
+                "title": "",
+            },
+        }),
+    )
 
     def __init__(self, match):
-        ChapterExtractor.__init__(self, match)
+        if match.group(1) == "manga4life":
+            self.category = "mangalife"
+            self.root = "https://manga4life.com"
+        ChapterExtractor.__init__(self, match, self.root + match.group(2))
         self.session.headers["Referer"] = self.gallery_url
 
-        domain = "mangasee123.com"
+        domain = self.root.rpartition("/")[2]
         cookies = self.session.cookies
         if not cookies.get("PHPSESSID", domain=domain):
             cookies.set("PHPSESSID", util.generate_token(13), domain=domain)
@@ -96,12 +122,24 @@ class MangaseeChapterExtractor(MangaseeBase, ChapterExtractor):
 
 class MangaseeMangaExtractor(MangaseeBase, MangaExtractor):
     chapterclass = MangaseeChapterExtractor
-    pattern = r"(?:https?://)?mangasee123\.com(/manga/[^/?#]+)"
-    test = (("https://mangasee123.com/manga"
-             "/Nakamura-Koedo-To-Daizu-Keisuke-Wa-Umaku-Ikanai"), {
-        "pattern": MangaseeChapterExtractor.pattern,
-        "count": ">= 17",
-    })
+    pattern = r"(?:https?://)?(mangasee123|manga4life)\.com(/manga/[^/?#]+)"
+    test = (
+        (("https://mangasee123.com/manga"
+          "/Nakamura-Koedo-To-Daizu-Keisuke-Wa-Umaku-Ikanai"), {
+            "pattern": MangaseeChapterExtractor.pattern,
+            "count": ">= 17",
+        }),
+        ("https://manga4life.com/manga/Ano-Musume-Ni-Kiss-To-Shirayuri-O", {
+            "pattern": MangaseeChapterExtractor.pattern,
+            "count": ">= 50",
+        }),
+    )
+
+    def __init__(self, match):
+        if match.group(1) == "manga4life":
+            self.category = "mangalife"
+            self.root = "https://manga4life.com"
+        MangaExtractor.__init__(self, match, self.root + match.group(2))
 
     def chapters(self, page):
         slug, pos = text.extract(page, 'vm.IndexName = "', '"')
diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py
index 9ce5772..0d2cded 100644
--- a/gallery_dl/extractor/mastodon.py
+++ b/gallery_dl/extractor/mastodon.py
@@ -92,7 +92,7 @@ INSTANCES = {
     }
 }
 
-BASE_PATTERN = MastodonExtractor.update(INSTANCES)
+BASE_PATTERN = MastodonExtractor.update(INSTANCES) + "(?:/web)?"
 
 
 class MastodonUserExtractor(MastodonExtractor):
@@ -111,9 +111,16 @@ class MastodonUserExtractor(MastodonExtractor):
             "count": 60,
         }),
         ("https://baraag.net/@pumpkinnsfw"),
+        ("https://mastodon.social/@yoru_nine@pawoo.net", {
+            "pattern": r"https://mastodon\.social/media_proxy/\d+/original",
+            "range": "1-10",
+            "count": 10,
+        }),
         ("https://mastodon.social/@id:10843"),
         ("https://mastodon.social/users/id:10843"),
         ("https://mastodon.social/users/jk"),
+        ("https://mastodon.social/users/yoru_nine@pawoo.net"),
+        ("https://mastodon.social/web/@jk"),
     )
 
     def statuses(self):
@@ -126,6 +133,20 @@ class MastodonUserExtractor(MastodonExtractor):
         )
 
 
+class MastodonBookmarkExtractor(MastodonExtractor):
+    """Extractor for mastodon bookmarks"""
+    subcategory = "bookmark"
+    pattern = BASE_PATTERN + r"/bookmarks"
+    test = (
+        ("https://mastodon.social/bookmarks"),
+        ("https://pawoo.net/bookmarks"),
+        ("https://baraag.net/bookmarks"),
+    )
+
+    def statuses(self):
+        return MastodonAPI(self).account_bookmarks()
+
+
 class MastodonFollowingExtractor(MastodonExtractor):
     """Extractor for followed mastodon users"""
     subcategory = "following"
@@ -197,13 +218,21 @@ class MastodonAPI():
         if username.startswith("id:"):
             return username[3:]
 
-        handle = "@{}@{}".format(username, self.extractor.instance)
+        if "@" in username:
+            handle = "@" + username
+        else:
+            handle = "@{}@{}".format(username, self.extractor.instance)
+
         for account in self.account_search(handle, 1):
-            if account["username"] == username:
+            if account["acct"] == username:
                 self.extractor._check_move(account)
                 return account["id"]
         raise exception.NotFoundError("account")
 
+    def account_bookmarks(self):
+        endpoint = "/v1/bookmarks"
+        return self._pagination(endpoint, None)
+
     def account_following(self, account_id):
         endpoint = "/v1/accounts/{}/following".format(account_id)
         return self._pagination(endpoint, None)
diff --git a/gallery_dl/extractor/moebooru.py b/gallery_dl/extractor/moebooru.py
index 27ec929..4d63c3e 100644
--- a/gallery_dl/extractor/moebooru.py
+++ b/gallery_dl/extractor/moebooru.py
@@ -26,9 +26,10 @@ class MoebooruExtractor(BooruExtractor):
     def _prepare(post):
         post["date"] = text.parse_timestamp(post["created_at"])
 
-    def _extended_tags(self, post):
-        url = "{}/post/show/{}".format(self.root, post["id"])
-        page = self.request(url).text
+    def _extended_tags(self, post, page=None):
+        if not page:
+            url = "{}/post/show/{}".format(self.root, post["id"])
+            page = self.request(url).text
         html = text.extract(page, '<ul id="tag-', '</ul>')[0]
         if html:
             tags = collections.defaultdict(list)
@@ -37,6 +38,29 @@ class MoebooruExtractor(BooruExtractor):
                 tags[tag_type].append(text.unquote(tag_name))
             for key, value in tags.items():
                 post["tags_" + key] = " ".join(value)
+        return page
+
+    def _notes(self, post, page=None):
+        if not page:
+            url = "{}/post/show/{}".format(self.root, post["id"])
+            page = self.request(url).text
+        notes = []
+        notes_container = text.extract(page, 'id="note-container"', "<img ")[0]
+        if not notes_container:
+            return
+
+        for note in notes_container.split('class="note-box"')[1:]:
+            extr = text.extract_from(note)
+            notes.append({
+                "width" : int(extr("width: ", "p")),
+                "height": int(extr("height: ", "p")),
+                "y"     : int(extr("top: ", "p")),
+                "x"     : int(extr("left: ", "p")),
+                "id"    : int(extr('id="note-body-', '"')),
+                "body"  : text.remove_html(extr('>', "</div>")),
+            })
+
+        post["notes"] = notes
 
     def _pagination(self, url, params):
         params["page"] = self.page_start
@@ -96,6 +120,37 @@ class MoebooruPostExtractor(MoebooruExtractor):
                 "tags_general": str,
             },
         }),
+        ("https://yande.re/post/show/993156", {
+            "content": "fed722bd90f48de41ec163692befc701056e2b1e",
+            "options": (("notes", True),),
+            "keyword": {
+                "notes": [
+                    {
+                        "id": 7096,
+                        "x" : 90,
+                        "y" : 626,
+                        "width" : 283,
+                        "height": 529,
+                        "body"  : "Please keep this as a secret for me!!",
+                    },
+                    {
+                        "id": 7095,
+                        "x" : 900,
+                        "y" : 438,
+                        "width" : 314,
+                        "height": 588,
+                        "body"  : "The facts that I love playing games",
+                    },
+                ],
+            },
+        }),
+        ("https://lolibooru.moe/post/show/281305/", {
+            "content": "a331430223ffc5b23c31649102e7d49f52489b57",
+            "options": (("notes", True),),
+            "keyword": {
+                "notes": list,
+            },
+        }),
         ("https://konachan.net/post/show/205189"),
         ("https://www.sakugabooru.com/post/show/125570"),
         ("https://lolibooru.moe/post/show/287835"),
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index 6b2e1c3..e3a96bd 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -45,6 +45,7 @@ class PixivExtractor(Extractor):
                 work["tags"] = [tag["name"] for tag in work["tags"]]
 
         ratings = {0: "General", 1: "R-18", 2: "R-18G"}
+        userdata = self.config("metadata")
         metadata = self.metadata()
 
         works = self.works()
@@ -60,6 +61,8 @@ class PixivExtractor(Extractor):
             del work["image_urls"]
             del work["meta_pages"]
 
+            if userdata:
+                work.update(self.api.user_detail(work["user"]["id"]))
             if transform_tags:
                 transform_tags(work)
             work["num"] = 0
@@ -198,7 +201,7 @@ class PixivArtworksExtractor(PixivExtractor):
 
     def metadata(self):
         if self.config("metadata"):
-            return self.api.user_detail(self.user_id)
+            self.api.user_detail(self.user_id)
         return {}
 
     def works(self):
@@ -557,7 +560,7 @@ class PixivSearchExtractor(PixivExtractor):
             sort = "date_d"
         self.sort = sort_map[sort]
 
-        target = query.get("s_mode", "s_tag")
+        target = query.get("s_mode", "s_tag_full")
         target_map = {
             "s_tag": "partial_match_for_tags",
             "s_tag_full": "exact_match_for_tags",
@@ -565,7 +568,7 @@ class PixivSearchExtractor(PixivExtractor):
         }
         if target not in target_map:
             self.log.warning("invalid search target '%s'", target)
-            target = "s_tag"
+            target = "s_tag_full"
         self.target = target_map[target]
 
         self.date_start = query.get("scd")
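With the extended pixiv `metadata` option above, every work dict is merged with the result of a `user_detail()` API call (one extra request per work), and the option now lives at `extractor.pixiv.metadata`, matching the man page rename earlier in this diff. A sketch of enabling it, again using the `config.set()` signature from `test/test_results.py`:

```python
from gallery_dl import config

# Corresponds to the "metadata": false entry added to docs/gallery-dl.conf
config.set(("extractor", "pixiv"), "metadata", True)
```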
diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py
index db8d700..448dc1b 100644
--- a/gallery_dl/extractor/reactor.py
+++ b/gallery_dl/extractor/reactor.py
@@ -170,7 +170,7 @@ class ReactorTagExtractor(ReactorExtractor):
     subcategory = "tag"
     directory_fmt = ("{category}", "{search_tags}")
     archive_fmt = "{search_tags}_{post_id}_{num}"
-    pattern = BASE_PATTERN + r"/tag/([^/?#]+)"
+    pattern = BASE_PATTERN + r"/tag/([^/?#]+)(?:/[^/?#]+)?"
     test = (
         ("http://reactor.cc/tag/gif"),
         ("http://anime.reactor.cc/tag/Anime+Art"),
@@ -180,6 +180,10 @@ class ReactorTagExtractor(ReactorExtractor):
         ("http://joyreactor.com/tag/Cirno", {
             "url": "aa59090590b26f4654881301fe8fe748a51625a8",
         }),
+        # 'best' rating (#3073)
+        ("http://joyreactor.com/tag/Dark+Souls+2/best", {
+            "count": 4,
+        }),
         ("http://pornreactor.cc/tag/RiceGnat", {
             "range": "1-25",
             "count": ">= 25",
diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py
index 1111c3a..53e5e79 100644
--- a/gallery_dl/extractor/redgifs.py
+++ b/gallery_dl/extractor/redgifs.py
@@ -10,7 +10,7 @@ from .common import Extractor, Message
 from .. import text
-from ..cache import cache
+from ..cache import memcache
 
 
 class RedgifsExtractor(Extractor):
@@ -133,10 +133,11 @@ class RedgifsAPI():
     def __init__(self, extractor):
         self.extractor = extractor
         self.headers = {
-            "Referer"      : extractor.root + "/",
-            "authorization": "Bearer " + self._fetch_bearer_token(extractor),
-            "content-type" : "application/json",
-            "Origin"       : extractor.root,
+            "Referer"       : extractor.root + "/",
+            "authorization" : None,
+            "content-type"  : "application/json",
+            "x-customheader": extractor.root + "/",
+            "Origin"        : extractor.root,
         }
 
     def gif(self, gif_id):
@@ -156,6 +157,7 @@ class RedgifsAPI():
 
     def _call(self, endpoint, params=None):
         url = self.API_ROOT + endpoint
+        self.headers["authorization"] = self._auth()
         return self.extractor.request(
             url, params=params, headers=self.headers).json()
 
@@ -170,16 +172,10 @@ class RedgifsAPI():
                 return
             params["page"] += 1
 
-    @cache(maxage=3600)
-    def _fetch_bearer_token(self, extr):
-        extr.log.debug("Retrieving Bearer token")
-
-        page = extr.request(extr.root + "/").text
-        index = text.extract(page, "/assets/js/index", ".js")[0]
-
-        url = extr.root + "/assets/js/index" + index + ".js"
-        page = extr.request(url, encoding="utf-8").text
-        token = "ey" + text.extract(page, '="ey', '"')[0]
-
-        extr.log.debug("Token: '%s'", token)
-        return token
+    @memcache(maxage=600)
+    def _auth(self):
+        # https://github.com/Redgifs/api/wiki/Temporary-tokens
+        url = self.API_ROOT + "/v2/auth/temporary"
+        self.headers["authorization"] = None
+        return "Bearer " + self.extractor.request(
+            url, headers=self.headers).json()["token"]
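The redgifs fix swaps scraping a bearer token out of a JavaScript bundle for the officially documented temporary-token endpoint. Roughly what `_auth()` now does, as a standalone sketch; the `https://api.redgifs.com` root is an assumption here, since the diff only references the `API_ROOT` constant without showing its value:

```python
import requests

API_ROOT = "https://api.redgifs.com"  # assumed value of RedgifsAPI.API_ROOT

# Fetch a short-lived guest token, then send it as a Bearer credential
# on subsequent API requests (memcached for 600 seconds in the extractor).
token = requests.get(API_ROOT + "/v2/auth/temporary").json()["token"]
headers = {"authorization": "Bearer " + token}
```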
diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py
index 822b1f2..3724c85 100644
--- a/gallery_dl/extractor/skeb.py
+++ b/gallery_dl/extractor/skeb.py
@@ -77,9 +77,6 @@ class SkebExtractor(Extractor):
             "body"            : resp["body"],
             "source_body"     : resp["source_body"],
             "translated_body" : resp["translated"],
-            "completed_at"    : resp["completed_at"],
-            "date"            : text.parse_datetime(
-                resp["completed_at"], "%Y-%m-%dT%H:%M:%S.%fZ"),
             "nsfw"            : resp["nsfw"],
             "anonymous"       : resp["anonymous"],
             "tags"            : resp["tag_list"],
@@ -160,7 +157,6 @@ class SkebPostExtractor(SkebExtractor):
             "name": str,
             "screen_name": "minato_ragi",
         },
-        "completed_at": "2022-02-27T14:03:45.442Z",
         "content_category": "preview",
         "creator": {
             "avatar_url": "https://pbs.twimg.com/profile_images"
@@ -171,7 +167,6 @@ class SkebPostExtractor(SkebExtractor):
             "name": "イチノセ奏",
             "screen_name": "kanade_cocotte",
         },
-        "date": "dt:2022-02-27 14:03:45",
         "file_id": int,
         "file_url": str,
         "genre": "art",
@@ -212,7 +207,7 @@ class SkebUserExtractor(SkebExtractor):
         "pattern": r"https://skeb\.imgix\.net/uploads/origins/[\w-]+"
                    r"\?bg=%23fff&auto=format&txtfont=bold&txtshad=70"
                    r"&txtclr=BFFFFFFF&txtalign=middle%2Ccenter&txtsize=150"
-                   r"&txt=SAMPLE&w=800&s=\w+",
+                   r"&txt=SAMPLE&fm=webp&w=800&s=\w+",
         "range": "1-5",
     })
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index 324a3c6..5451f6e 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -49,6 +49,8 @@ class TumblrExtractor(Extractor):
         self.reblogs  = self.config("reblogs", True)
         self.external = self.config("external", False)
         self.original = self.config("original", True)
+        self.fallback_delay = self.config("fallback-delay", 120.0)
+        self.fallback_retries = self.config("fallback-retries", 2)
 
         if len(self.types) == 1:
             self.api.posts_type = next(iter(self.types))
@@ -250,8 +252,8 @@ class TumblrExtractor(Extractor):
         return updated, (resized == updated)
 
     def _original_image_fallback(self, url, post_id):
-        for _ in range(3):
-            self.sleep(120, "image token")
+        for _ in range(self.fallback_retries):
+            self.sleep(self.fallback_delay, "image token")
             yield self._update_image_token(url)[0]
         self.log.warning("Unable to fetch higher-resolution "
                          "version of %s (%s)", url, post_id)
diff --git a/gallery_dl/extractor/vichan.py b/gallery_dl/extractor/vichan.py
new file mode 100644
index 0000000..2fafb56
--- /dev/null
+++ b/gallery_dl/extractor/vichan.py
@@ -0,0 +1,163 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for vichan imageboards"""
+
+from .common import BaseExtractor, Message
+from .. import text
+
+
+class VichanExtractor(BaseExtractor):
+    """Base class for vichan extractors"""
+    basecategory = "vichan"
+
+
+BASE_PATTERN = VichanExtractor.update({
+    "8kun": {
+        "root": "https://8kun.top",
+        "pattern": r"8kun\.top",
+    },
+    "wikieat": {
+        "root": "https://wikieat.club",
+        "pattern": r"wikieat\.club",
+    },
+    "smugloli": {
+        "root": None,
+        "pattern": r"smuglo(?:\.li|li\.net)",
+    },
+})
+
+
+class VichanThreadExtractor(VichanExtractor):
+    """Extractor for vichan threads"""
+    subcategory = "thread"
+    directory_fmt = ("{category}", "{board}", "{thread} {title}")
+    filename_fmt = "{time}{num:?-//} {filename}.{extension}"
+    archive_fmt = "{board}_{thread}_{tim}"
+    pattern = BASE_PATTERN + r"/([^/?#]+)/res/(\d+)"
+    test = (
+        ("https://8kun.top/test/res/65248.html", {
+            "pattern": r"https://media\.128ducks\.com/file_store/\w{64}\.\w+",
+            "count": ">= 8",
+        }),
+        # old-style file URLs (#1101)
+        # ("https://8kun.top/d/res/13258.html", {
+        #     "pattern": r"https://media\.128ducks\.com/d/src/\d+(-\d)?\.\w+",
+        #     "range": "1-20",
+        # }),
+
+        ("https://wikieat.club/cel/res/25321.html", {
+            "pattern": r"https://wikieat\.club/cel/src/\d+(-\d)?\.\w+",
+            "count": ">= 200",
+        }),
+
+        ("https://smuglo.li/a/res/1154380.html", {
+            "pattern": r"https://smug.+/a/src/\d+(-\d)?\.\w+",
+            "count": ">= 18",
+            "keyword": {
+                "board": "a",
+                "thread": "1154380",
+                "title": "Mob Psycho 100 Season 3",
+            },
+        }),
+        ("https://smugloli.net/a/res/1145409.html"),
+    )
+
+    def __init__(self, match):
+        VichanExtractor.__init__(self, match)
+        index = match.lastindex
+        self.board = match.group(index-1)
+        self.thread = match.group(index)
+
+    def items(self):
+        url = "{}/{}/res/{}.json".format(self.root, self.board, self.thread)
+        posts = self.request(url).json()["posts"]
+        title = posts[0].get("sub") or text.remove_html(posts[0]["com"])
+        process = (self._process_8kun if self.category == "8kun" else
+                   self._process)
+        data = {
+            "board" : self.board,
+            "thread": self.thread,
+            "title" : text.unescape(title)[:50],
+            "num"   : 0,
+        }
+
+        yield Message.Directory, data
+        for post in posts:
+            if "filename" in post:
+                yield process(post, data)
+                if "extra_files" in post:
+                    for post["num"], filedata in enumerate(
+                            post["extra_files"], 1):
+                        yield process(post, filedata)
+
+    def _process(self, post, data):
+        post.update(data)
+        post["extension"] = post["ext"][1:]
+        post["url"] = "{}/{}/src/{}{}".format(
+            self.root, post["board"], post["tim"], post["ext"])
+        return Message.Url, post["url"], post
+
+    @staticmethod
+    def _process_8kun(post, data):
+        post.update(data)
+        post["extension"] = post["ext"][1:]
+
+        tim = post["tim"]
+        if len(tim) > 16:
+            post["url"] = "https://media.128ducks.com/file_store/{}{}".format(
+                tim, post["ext"])
+        else:
+            post["url"] = "https://media.128ducks.com/{}/src/{}{}".format(
+                post["board"], tim, post["ext"])
+
+        return Message.Url, post["url"], post
+
+
+class VichanBoardExtractor(VichanExtractor):
+    """Extractor for vichan boards"""
+    subcategory = "board"
+    pattern = BASE_PATTERN + r"/([^/?#]+)(?:/index|/catalog|/\d+|/?$)"
+    test = (
+        ("https://8kun.top/v/index.html", {
+            "pattern": VichanThreadExtractor.pattern,
+            "count": ">= 100",
+        }),
+        ("https://8kun.top/v/2.html"),
+        ("https://8kun.top/v/index.html?PageSpeed=noscript"),
+
+        ("https://wikieat.club/cel/index.html", {
+            "pattern": VichanThreadExtractor.pattern,
+            "count": ">= 100",
+        }),
+        ("https://wikieat.club/cel/catalog.html"),
+        ("https://wikieat.club/cel/2.html"),
+
+        ("https://smuglo.li/a", {
+            "pattern": VichanThreadExtractor.pattern,
+            "count": ">= 100",
+        }),
+        ("https://smuglo.li/a/1.html"),
+        ("https://smugloli.net/cute/catalog.html"),
+    )
+
+    def __init__(self, match):
+        VichanExtractor.__init__(self, match)
+        self.board = match.group(match.lastindex)
+
+    def items(self):
+        url = "{}/{}/threads.json".format(self.root, self.board)
+        threads = self.request(url).json()
+
+        for page in threads:
+            for thread in page["threads"]:
+                url = "{}/{}/res/{}.html".format(
+                    self.root, self.board, thread["no"])
+                thread["page"] = page["page"]
+                thread["_extractor"] = VichanThreadExtractor
+                yield Message.Queue, url, thread
diff --git a/gallery_dl/extractor/wikieat.py b/gallery_dl/extractor/wikieat.py
deleted file mode 100644
index c7b1958..0000000
--- a/gallery_dl/extractor/wikieat.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2021 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://wikieat.club/"""
-
-from .common import Extractor, Message
-from .. import text
-
-
-class WikieatThreadExtractor(Extractor):
-    """Extractor for Wikieat threads"""
-    category = "wikieat"
-    subcategory = "thread"
-    directory_fmt = ("{category}", "{board}", "{thread} {title}")
-    filename_fmt = "{time}{num:?-//} {filename}.{extension}"
-    archive_fmt = "{board}_{thread}_{tim}"
-    pattern = r"(?:https?://)?wikieat\.club/([^/]+)/res/(\d+)"
-    test = ("https://wikieat.club/cel/res/25321.html", {
-        "pattern": r"https://wikieat\.club/cel/src/\d+(-\d)?\.\w+",
-        "count": ">= 200",
-    })
-
-    def __init__(self, match):
-        Extractor.__init__(self, match)
-        self.board, self.thread = match.groups()
-
-    def items(self):
-        url = "https://wikieat.club/{}/res/{}.json".format(
-            self.board, self.thread)
-        posts = self.request(url).json()["posts"]
-        title = posts[0].get("sub") or text.remove_html(posts[0]["com"])
-        process = self._process
-
-        data = {
-            "board" : self.board,
-            "thread": self.thread,
-            "title" : text.unescape(title)[:50],
-            "num"   : 0,
-        }
-
-        yield Message.Directory, data
-        for post in posts:
-            if "filename" in post:
-                yield process(post, data)
-                if "extra_files" in post:
-                    for post["num"], filedata in enumerate(
-                            post["extra_files"], 1):
-                        yield process(post, filedata)
-
-    @staticmethod
-    def _process(post, data):
-        post.update(data)
-        post["extension"] = post["ext"][1:]
-        tim = post["tim"]
-        url = ("https://wikieat.club/" +
-               post["board"] + "/src/" +
-               tim + post["ext"])
-        return Message.Url, url, post
-
-
-class WikieatBoardExtractor(Extractor):
-    """Extractor for Wikieat boards"""
-    category = "wikieat"
-    subcategory = "board"
-    pattern = (r"(?:https?://)?wikieat\.club"
-               r"/([^/?#]+)/(?:index|catalog|\d+)\.html")
-    test = (
-        ("https://wikieat.club/cel/index.html", {
-            "pattern": WikieatThreadExtractor.pattern,
-            "count": ">= 100",
-        }),
-        ("https://wikieat.club/cel/catalog.html"),
-        ("https://wikieat.club/cel/2.html")
-    )
-
-    def __init__(self, match):
-        Extractor.__init__(self, match)
-        self.board = match.group(1)
-
-    def items(self):
-        url = "https://wikieat.club/{}/threads.json".format(self.board)
-        threads = self.request(url).json()
-
-        for page in threads:
-            for thread in page["threads"]:
-                url = "https://wikieat.club/{}/res/{}.html".format(
-                    self.board, thread["no"])
-                thread["page"] = page["page"]
-                thread["_extractor"] = WikieatThreadExtractor
-                yield Message.Queue, url, thread
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index 1650b0a..98b6d59 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -790,11 +790,11 @@ class DownloadArchive():
 
         try:
             self.cursor.execute("CREATE TABLE IF NOT EXISTS archive "
-                                "(entry PRIMARY KEY) WITHOUT ROWID")
+                                "(entry TEXT PRIMARY KEY) WITHOUT ROWID")
         except sqlite3.OperationalError:
             # fallback for missing WITHOUT ROWID support (#553)
             self.cursor.execute("CREATE TABLE IF NOT EXISTS archive "
-                                "(entry PRIMARY KEY)")
+                                "(entry TEXT PRIMARY KEY)")
 
     def check(self, kwdict):
         """Return True if the item described by 'kwdict' exists in archive"""
@@ -807,4 +807,4 @@ class DownloadArchive():
         """Add item described by 'kwdict' to archive"""
         key = kwdict.get(self._cache_key) or self.keygen(kwdict)
         self.cursor.execute(
-            "INSERT OR IGNORE INTO archive VALUES (?)", (key,))
+            "INSERT OR IGNORE INTO archive (entry) VALUES (?)", (key,))
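The DownloadArchive change above only touches the schema: the archive column gains an explicit `TEXT` type and the `INSERT` names its column, which the changelog lists as a compatibility improvement (#3078). The resulting on-disk format can be reproduced with nothing but the standard library; the file name and key below are hypothetical:

```python
import sqlite3

con = sqlite3.connect("archive.sqlite3")        # hypothetical archive file
con.execute("CREATE TABLE IF NOT EXISTS archive "
            "(entry TEXT PRIMARY KEY) WITHOUT ROWID")
con.execute("INSERT OR IGNORE INTO archive (entry) VALUES (?)",
            ("gelbooru313638",))                 # hypothetical archive key
con.commit()
```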
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index f758857..85a03de 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-__version__ = "1.23.3"
+__version__ = "1.23.5"
diff --git a/test/test_results.py b/test/test_results.py
index e594933..a42de09 100644
--- a/test/test_results.py
+++ b/test/test_results.py
@@ -324,7 +324,7 @@ def setup_test_config():
 
     for category in ("danbooru", "instagram", "twitter", "subscribestar",
                      "e621", "atfbooru", "inkbunny", "tapas", "pillowfort",
-                     "mangadex"):
+                     "mangadex", "aibooru"):
         config.set(("extractor", category), "username", None)
 
     config.set(("extractor", "mastodon.social"), "access-token",
