| | |
|---|---|
| author | 2024-02-20 02:31:10 -0500 |
| committer | 2024-02-20 02:31:10 -0500 |
| commit | 01166fa52707cc282467427cf0e65c1b8983c4be |
| tree | 7f61e0de7e76a7a226bb6e05e4e3d181e11f673a |
| parent | 12e23f1195164dcb740d6d4a4287e762c9e5e534 |
New upstream version 1.26.8 (tag: upstream/1.26.8)
36 files changed, 1207 insertions, 342 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 277250d..f938ab9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,49 @@
 # Changelog

+## 1.26.8 - 2024-02-17
+### Extractors
+#### Additions
+- [bluesky] add support ([#4438](https://github.com/mikf/gallery-dl/issues/4438), [#4708](https://github.com/mikf/gallery-dl/issues/4708), [#4722](https://github.com/mikf/gallery-dl/issues/4722), [#5047](https://github.com/mikf/gallery-dl/issues/5047))
+- [bunkr] support new domains ([#5114](https://github.com/mikf/gallery-dl/issues/5114), [#5130](https://github.com/mikf/gallery-dl/issues/5130), [#5134](https://github.com/mikf/gallery-dl/issues/5134))
+- [fanbox] add `home` and `supporting` extractors ([#5138](https://github.com/mikf/gallery-dl/issues/5138))
+- [imagechest] add `user` extractor ([#5143](https://github.com/mikf/gallery-dl/issues/5143))
+- [imagetwist] add `gallery` extractor ([#5190](https://github.com/mikf/gallery-dl/issues/5190))
+- [kemonoparty] add `posts` extractor ([#5194](https://github.com/mikf/gallery-dl/issues/5194), [#5198](https://github.com/mikf/gallery-dl/issues/5198))
+- [twitter] support communities ([#4913](https://github.com/mikf/gallery-dl/issues/4913))
+- [vsco] support spaces ([#5202](https://github.com/mikf/gallery-dl/issues/5202))
+- [weibo] add `gifs` option ([#5183](https://github.com/mikf/gallery-dl/issues/5183))
+- [wikimedia] support `www.pidgi.net` ([#5205](https://github.com/mikf/gallery-dl/issues/5205))
+- [wikimedia] support `bulbapedia.bulbagarden.net` ([#5206](https://github.com/mikf/gallery-dl/issues/5206))
+#### Fixes
+- [archivedmoe] fix `thebarchive` WebM URLs ([#5116](https://github.com/mikf/gallery-dl/issues/5116))
+- [batoto] fix crash when manga name or chapter contains a `-` ([#5200](https://github.com/mikf/gallery-dl/issues/5200))
+- [bunkr] fix extraction ([#5088](https://github.com/mikf/gallery-dl/issues/5088), [#5151](https://github.com/mikf/gallery-dl/issues/5151), [#5153](https://github.com/mikf/gallery-dl/issues/5153))
+- [gofile] update `website_token` extraction
+- [idolcomplex] fix pagination for tags containing `:` ([#5184](https://github.com/mikf/gallery-dl/issues/5184))
+- [kemonoparty] fix deleting file names when computing `revision_hash` ([#5103](https://github.com/mikf/gallery-dl/issues/5103))
+- [luscious] fix IndexError for files without thumbnail ([#5122](https://github.com/mikf/gallery-dl/issues/5122), [#5124](https://github.com/mikf/gallery-dl/issues/5124), [#5182](https://github.com/mikf/gallery-dl/issues/5182))
+- [naverwebtoon] fix `title` for comics with empty tags ([#5120](https://github.com/mikf/gallery-dl/issues/5120))
+- [pinterest] fix section URLs for boards with `/`, `?`, or `#` in their name ([#5104](https://github.com/mikf/gallery-dl/issues/5104))
+- [twitter] update query hashes
+- [zerochan] fix skipping every other post
+#### Improvements
+- [deviantart] skip locked/blurred posts ([#4567](https://github.com/mikf/gallery-dl/issues/4567), [#5193](https://github.com/mikf/gallery-dl/issues/5193))
+- [deviantart] implement downloading PNG versions of non-original images with `"quality": "png"` ([#4846](https://github.com/mikf/gallery-dl/issues/4846))
+- [flickr] handle non-JSON errors ([#5131](https://github.com/mikf/gallery-dl/issues/5131))
+- [idolcomplex] support alphanumeric post IDs ([#5171](https://github.com/mikf/gallery-dl/issues/5171))
+- [kemonoparty] implement filtering duplicate revisions with `"revisions": "unique"` ([#5013](https://github.com/mikf/gallery-dl/issues/5013))
+- [naverwebtoon] support `/webtoon/` paths for all comics ([#5123](https://github.com/mikf/gallery-dl/issues/5123))
+#### Metadata
+- [idolcomplex] extract `id_alnum` metadata ([#5171](https://github.com/mikf/gallery-dl/issues/5171))
+- [pornpics] support multiple values for `channel` ([#5195](https://github.com/mikf/gallery-dl/issues/5195))
+- [sankaku] add `id-format` option ([#5073](https://github.com/mikf/gallery-dl/issues/5073))
+- [skeb] add `num` and `count` metadata fields ([#5187](https://github.com/mikf/gallery-dl/issues/5187))
+### Downloaders
+#### Fixes
+- [http] remove `pyopenssl` import ([#5156](https://github.com/mikf/gallery-dl/issues/5156))
+### Miscellaneous
+- fix filename formatting silently failing under certain circumstances ([#5185](https://github.com/mikf/gallery-dl/issues/5185), [#5186](https://github.com/mikf/gallery-dl/issues/5186))
+
 ## 1.26.7 - 2024-01-21
 ### Extractors
 #### Additions
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gallery_dl
-Version: 1.26.7
+Version: 1.26.8
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -112,9 +112,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and required Python packages included are available for

-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.7/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.8/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.7/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.8/gallery-dl.bin>`__

 Nightly Builds
@@ -72,9 +72,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and required Python packages included are available for

-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.7/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.8/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.7/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.8/gallery-dl.bin>`__

 Nightly Builds
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index b779e1e..078ff4f 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2024-01-21" "1.26.7" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2024-02-17" "1.26.8" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index 7fec8ae..4b349dd 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2024-01-21" "1.26.7" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2024-02-17" "1.26.8" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
 .\" disable justification (adjust text to left margin only)
@@ -464,6 +464,8 @@ and optional for
 .br
 * \f[I]atfbooru\f[] (*)
 .br
+* \f[I]bluesky\f[]
+.br
 * \f[I]danbooru\f[] (*)
 .br
 * \f[I]e621\f[] (*)
 .br
@@ -1374,6 +1376,78 @@ Supported module types are
 Download embedded videos hosted on https://www.blogger.com/

+.SS extractor.bluesky.include
+.IP "Type:" 6
+.br
+* \f[I]string\f[]
+.br
+* \f[I]list\f[] of \f[I]strings\f[]
+
+.IP "Default:" 9
+\f[I]"media"\f[]
+
+.IP "Example:" 4
+.br
+* "avatar,background,posts"
+.br
+* ["avatar", "background", "posts"]
+
+.IP "Description:" 4
+A (comma-separated) list of subcategories to include
+when processing a user profile.
+
+Possible values are
+\f[I]"avatar"\f[],
+\f[I]"background"\f[],
+\f[I]"posts"\f[],
+\f[I]"replies"\f[],
+\f[I]"media"\f[],
+\f[I]"likes"\f[].
+
+It is possible to use \f[I]"all"\f[] instead of listing all values separately.
+
+
+.SS extractor.bluesky.metadata
+.IP "Type:" 6
+.br
+* \f[I]bool\f[]
+.br
+* \f[I]string\f[]
+.br
+* \f[I]list\f[] of \f[I]strings\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Example:" 4
+.br
+* "facets,user"
+.br
+* ["facets", "user"]
+
+.IP "Description:" 4
+Extract additional metadata.
+
+.br
+* \f[I]facets\f[]: \f[I]hashtags\f[], \f[I]mentions\f[], and \f[I]uris\f[]
+.br
+* \f[I]user\f[]: detailed \f[I]user\f[] metadata for the user referenced in the input URL
+(See \f[I]app.bsky.actor.getProfile\f[]).
+
+
+.SS extractor.bluesky.post.depth
+.IP "Type:" 6
+\f[I]integer\f[]
+
+.IP "Default:" 9
+\f[I]0\f[]
+
+.IP "Description:" 4
+Sets the maximum depth of returned reply posts.
+
+(See depth parameter of \f[I]app.bsky.feed.getPostThread\f[])
+
+
 .SS extractor.cyberdrop.domain
 .IP "Type:" 6
 \f[I]string\f[]
@@ -1761,16 +1835,19 @@ when a \f[I]refresh token\f[] is provided.
 .SS extractor.deviantart.quality
 .IP "Type:" 6
-\f[I]integer\f[]
+.br
+* \f[I]integer\f[]
+.br
+* \f[I]string\f[]

 .IP "Default:" 9
 \f[I]100\f[]

 .IP "Description:" 4
-JPEG quality level of newer images for which
+JPEG quality level of images for which
 an original file download is not available.

-Note: Only has an effect when \f[I]deviantart.jwt\f[] is disabled.
+Set this to \f[I]"png"\f[] to download a PNG version of these images instead.

 .SS extractor.deviantart.refresh-token
@@ -1833,7 +1910,7 @@ Leave \f[I]SIZE\f[] empty to download the regular, small avatar format.
 .IP "Example:" 4
 .br
-* notes,pools
+* "notes,pools"
 .br
 * ["notes", "pools"]
@@ -2575,7 +2652,10 @@ Extract \f[I]username\f[] metadata.
 .SS extractor.kemonoparty.revisions
 .IP "Type:" 6
-\f[I]bool\f[]
+.br
+* \f[I]bool\f[]
+.br
+* \f[I]string\f[]

 .IP "Default:" 9
 \f[I]false\f[]
@@ -2583,6 +2663,8 @@ Extract \f[I]username\f[] metadata.
 .IP "Description:" 4
 Extract post revisions.

+Set this to \f[I]"unique"\f[] to filter out duplicate revisions.
+
 Note: This requires 1 additional HTTP request per post.
@@ -3473,6 +3555,22 @@ If the format is given as \f[I]string\f[], it will be extended with
 restrict it to only one possible format.

+.SS extractor.sankaku.id-format
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]"numeric"\f[]
+
+.IP "Description:" 4
+Format of \f[I]id\f[] metadata fields.
+
+.br
+* \f[I]"alphanumeric"\f[] or \f[I]"alnum"\f[]: 11-character alphanumeric IDs (\f[I]y0abGlDOr2o\f[])
+.br
+* \f[I]"numeric"\f[] or \f[I]"legacy"\f[]: numeric IDs (\f[I]360451\f[])
+
+
 .SS extractor.sankaku.refresh
 .IP "Type:" 6
 \f[I]bool\f[]
@@ -4476,6 +4574,22 @@ Fetch extra submission metadata during gallery downloads.

 Note: This requires 1 additional HTTP request per submission.

+
+.SS extractor.weibo.gifs
+.IP "Type:" 6
+.br
+* \f[I]bool\f[]
+.br
+* \f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]true\f[]
+
+.IP "Description:" 4
+Download \f[I]gif\f[] files.
+
+Set this to \f[I]"video"\f[] to download GIFs as video files.
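Editor's note: the man-page hunks above document the configuration options added or extended in 1.26.8. As a rough illustration only — the option names come straight from the text above, but the JSON nesting and the chosen values are assumptions about a typical gallery-dl.conf, not part of this patch — they could be combined like this:

    {
        "extractor": {
            "bluesky": {
                "include": "avatar,background,posts",
                "metadata": ["facets", "user"],
                "post": {
                    "depth": 1
                }
            },
            "deviantart": {
                "quality": "png"
            },
            "kemonoparty": {
                "revisions": "unique"
            },
            "sankaku": {
                "id-format": "alphanumeric"
            },
            "weibo": {
                "gifs": "video"
            }
        }
    }

With a file like this, a profile URL such as https://bsky.app/profile/HANDLE would yield avatar, banner, and post images instead of only the default "media" timeline, and DeviantArt images without an original download would fall back to the PNG variant described above. The patch body continues below.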
+ + .SS extractor.weibo.include .IP "Type:" 6 .br diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO index e9a8b02..b4f974c 100644 --- a/gallery_dl.egg-info/PKG-INFO +++ b/gallery_dl.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery_dl -Version: 1.26.7 +Version: 1.26.8 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Download-URL: https://github.com/mikf/gallery-dl/releases/latest @@ -112,9 +112,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.7/gallery-dl.exe>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.8/gallery-dl.exe>`__ (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__) -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.7/gallery-dl.bin>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.8/gallery-dl.bin>`__ Nightly Builds diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt index ff16efd..ba1f7d8 100644 --- a/gallery_dl.egg-info/SOURCES.txt +++ b/gallery_dl.egg-info/SOURCES.txt @@ -60,6 +60,7 @@ gallery_dl/extractor/batoto.py gallery_dl/extractor/bbc.py gallery_dl/extractor/behance.py gallery_dl/extractor/blogger.py +gallery_dl/extractor/bluesky.py gallery_dl/extractor/booru.py gallery_dl/extractor/bunkr.py gallery_dl/extractor/catbox.py diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index f493947..f1d2c4a 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -13,12 +13,7 @@ import mimetypes from requests.exceptions import RequestException, ConnectionError, Timeout from .common import DownloaderBase from .. 
import text, util - from ssl import SSLError -try: - from OpenSSL.SSL import Error as OpenSSLError -except ImportError: - OpenSSLError = SSLError class HttpDownloader(DownloaderBase): @@ -249,7 +244,7 @@ class HttpDownloader(DownloaderBase): file_header = next( content if response.raw.chunked else response.iter_content(16), b"") - except (RequestException, SSLError, OpenSSLError) as exc: + except (RequestException, SSLError) as exc: msg = str(exc) print() continue @@ -283,7 +278,7 @@ class HttpDownloader(DownloaderBase): self.out.start(pathfmt.path) try: self.receive(fp, content, size, offset) - except (RequestException, SSLError, OpenSSLError) as exc: + except (RequestException, SSLError) as exc: msg = str(exc) print() continue @@ -310,7 +305,7 @@ class HttpDownloader(DownloaderBase): try: for _ in response.iter_content(self.chunk_size): pass - except (RequestException, SSLError, OpenSSLError) as exc: + except (RequestException, SSLError) as exc: print() self.log.debug( "Unable to consume response body (%s: %s); " diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index d624736..a665249 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -29,6 +29,7 @@ modules = [ "bbc", "behance", "blogger", + "bluesky", "bunkr", "catbox", "chevereto", diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py index e82cd09..2adb142 100644 --- a/gallery_dl/extractor/batoto.py +++ b/gallery_dl/extractor/batoto.py @@ -40,10 +40,18 @@ class BatotoChapterExtractor(BatotoBase, ChapterExtractor): def metadata(self, page): extr = text.extract_from(page) - manga, info, _ = extr("<title>", "<").rsplit(" - ", 3) + try: + manga, info, _ = extr("<title>", "<").rsplit(" - ", 3) + except ValueError: + manga = info = None + manga_id = text.extr( extr('rel="canonical" href="', '"'), "/title/", "/") + if not manga: + manga = extr('link-hover">', "<") + info = text.remove_html(extr('link-hover">', "</")) + match = re.match( r"(?:Volume\s+(\d+) )?" r"\w+\s+(\d+)(.*)", info) diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py new file mode 100644 index 0000000..8de0d7b --- /dev/null +++ b/gallery_dl/extractor/bluesky.py @@ -0,0 +1,458 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://bsky.app/""" + +from .common import Extractor, Message +from .. 
import text, util, exception +from ..cache import cache, memcache + +BASE_PATTERN = r"(?:https?://)?bsky\.app" +USER_PATTERN = BASE_PATTERN + r"/profile/([^/?#]+)" + + +class BlueskyExtractor(Extractor): + """Base class for bluesky extractors""" + category = "bluesky" + directory_fmt = ("{category}", "{author[handle]}") + filename_fmt = "{createdAt[:19]}_{post_id}_{num}.{extension}" + archive_fmt = "{filename}" + root = "https://bsky.app" + + def __init__(self, match): + Extractor.__init__(self, match) + self.user = match.group(1) + + def _init(self): + meta = self.config("metadata") or () + if meta: + if isinstance(meta, str): + meta = meta.replace(" ", "").split(",") + elif not isinstance(meta, (list, tuple)): + meta = ("user", "facets") + self._metadata_user = ("user" in meta) + self._metadata_facets = ("facets" in meta) + + self.api = BlueskyAPI(self) + self._user = None + + def items(self): + for post in self.posts(): + if "post" in post: + post = post["post"] + post.update(post["record"]) + del post["record"] + + images = () + if "embed" in post: + media = post["embed"] + if "media" in media: + media = media["media"] + if "images" in media: + images = media["images"] + + if self._metadata_facets: + if "facets" in post: + post["hashtags"] = tags = [] + post["mentions"] = dids = [] + post["uris"] = uris = [] + for facet in post["facets"]: + features = facet["features"][0] + if "tag" in features: + tags.append(features["tag"]) + elif "did" in features: + dids.append(features["did"]) + elif "uri" in features: + uris.append(features["uri"]) + else: + post["hashtags"] = post["mentions"] = post["uris"] = () + + if self._metadata_user: + post["user"] = self._user or post["author"] + + post["post_id"] = post["uri"].rpartition("/")[2] + post["count"] = len(images) + post["date"] = text.parse_datetime( + post["createdAt"][:19], "%Y-%m-%dT%H:%M:%S") + + yield Message.Directory, post + + if not images: + continue + + base = ("https://bsky.social/xrpc/com.atproto.sync.getBlob" + "?did={}&cid=".format(post["author"]["did"])) + post["num"] = 0 + + for file in images: + post["num"] += 1 + post["description"] = file["alt"] + + try: + aspect = file["aspectRatio"] + post["width"] = aspect["width"] + post["height"] = aspect["height"] + except KeyError: + post["width"] = post["height"] = 0 + + image = file["image"] + post["filename"] = link = image["ref"]["$link"] + post["extension"] = image["mimeType"].rpartition("/")[2] + + yield Message.Url, base + link, post + + def posts(self): + return () + + def _make_post(self, actor, kind): + did = self.api._did_from_actor(actor) + profile = self.api.get_profile(did) + + if kind not in profile: + return () + cid = profile[kind].rpartition("/")[2].partition("@")[0] + + return ({ + "post": { + "embed": {"images": [{ + "alt": kind, + "image": { + "$type" : "blob", + "ref" : {"$link": cid}, + "mimeType": "image/jpeg", + "size" : 0, + }, + "aspectRatio": { + "width" : 1000, + "height": 1000, + }, + }]}, + "author" : profile, + "record" : (), + "createdAt": "", + "uri" : cid, + }, + },) + + +class BlueskyUserExtractor(BlueskyExtractor): + subcategory = "user" + pattern = USER_PATTERN + r"$" + example = "https://bsky.app/profile/HANDLE" + + def initialize(self): + pass + + def items(self): + base = "{}/profile/{}/".format(self.root, self.user) + return self._dispatch_extractors(( + (BlueskyAvatarExtractor , base + "avatar"), + (BlueskyBackgroundExtractor, base + "banner"), + (BlueskyPostsExtractor , base + "posts"), + (BlueskyRepliesExtractor , base + "replies"), + 
(BlueskyMediaExtractor , base + "media"), + (BlueskyLikesExtractor , base + "likes"), + ), ("media",)) + + +class BlueskyPostsExtractor(BlueskyExtractor): + subcategory = "posts" + pattern = USER_PATTERN + r"/posts" + example = "https://bsky.app/profile/HANDLE/posts" + + def posts(self): + return self.api.get_author_feed(self.user, "posts_and_author_threads") + + +class BlueskyRepliesExtractor(BlueskyExtractor): + subcategory = "replies" + pattern = USER_PATTERN + r"/replies" + example = "https://bsky.app/profile/HANDLE/replies" + + def posts(self): + return self.api.get_author_feed(self.user, "posts_with_replies") + + +class BlueskyMediaExtractor(BlueskyExtractor): + subcategory = "media" + pattern = USER_PATTERN + r"/media" + example = "https://bsky.app/profile/HANDLE/media" + + def posts(self): + return self.api.get_author_feed(self.user, "posts_with_media") + + +class BlueskyLikesExtractor(BlueskyExtractor): + subcategory = "likes" + pattern = USER_PATTERN + r"/likes" + example = "https://bsky.app/profile/HANDLE/likes" + + def posts(self): + return self.api.get_actor_likes(self.user) + + +class BlueskyFeedExtractor(BlueskyExtractor): + subcategory = "feed" + pattern = USER_PATTERN + r"/feed/([^/?#]+)" + example = "https://bsky.app/profile/HANDLE/feed/NAME" + + def __init__(self, match): + BlueskyExtractor.__init__(self, match) + self.feed = match.group(2) + + def posts(self): + return self.api.get_feed(self.user, self.feed) + + +class BlueskyListExtractor(BlueskyExtractor): + subcategory = "list" + pattern = USER_PATTERN + r"/lists/([^/?#]+)" + example = "https://bsky.app/profile/HANDLE/lists/ID" + + def __init__(self, match): + BlueskyExtractor.__init__(self, match) + self.list = match.group(2) + + def posts(self): + return self.api.get_list_feed(self.user, self.list) + + +class BlueskyFollowingExtractor(BlueskyExtractor): + subcategory = "following" + pattern = USER_PATTERN + r"/follows" + example = "https://bsky.app/profile/HANDLE/follows" + + def items(self): + for user in self.api.get_follows(self.user): + url = "https://bsky.app/profile/" + user["did"] + yield Message.Queue, url, user + + +class BlueskyPostExtractor(BlueskyExtractor): + subcategory = "post" + pattern = USER_PATTERN + r"/post/([^/?#]+)" + example = "https://bsky.app/profile/HANDLE/post/ID" + + def __init__(self, match): + BlueskyExtractor.__init__(self, match) + self.post_id = match.group(2) + + def posts(self): + return self.api.get_post_thread(self.user, self.post_id) + + +class BlueskyAvatarExtractor(BlueskyExtractor): + subcategory = "avatar" + filename_fmt = "avatar_{post_id}.{extension}" + pattern = USER_PATTERN + r"/avatar" + example = "https://bsky.app/profile/HANDLE/avatar" + + def posts(self): + return self._make_post(self.user, "avatar") + + +class BlueskyBackgroundExtractor(BlueskyExtractor): + subcategory = "background" + filename_fmt = "background_{post_id}.{extension}" + pattern = USER_PATTERN + r"/ba(?:nner|ckground)" + example = "https://bsky.app/profile/HANDLE/banner" + + def posts(self): + return self._make_post(self.user, "banner") + + +class BlueskySearchExtractor(BlueskyExtractor): + subcategory = "search" + pattern = BASE_PATTERN + r"/search(?:/|\?q=)(.+)" + example = "https://bsky.app/search?q=QUERY" + + def posts(self): + return self.api.search_posts(self.user) + + +class BlueskyAPI(): + """Interface for the Bluesky API + + https://www.docs.bsky.app/docs/category/http-reference + """ + + def __init__(self, extractor): + self.extractor = extractor + self.log = extractor.log + self.headers = 
{"Accept": "application/json"} + + self.username, self.password = extractor._get_auth_info() + if self.username: + self.root = "https://bsky.social" + else: + self.root = "https://api.bsky.app" + self.authenticate = util.noop + + def get_actor_likes(self, actor): + endpoint = "app.bsky.feed.getActorLikes" + params = { + "actor": self._did_from_actor(actor), + "limit": "100", + } + return self._pagination(endpoint, params) + + def get_author_feed(self, actor, filter="posts_and_author_threads"): + endpoint = "app.bsky.feed.getAuthorFeed" + params = { + "actor" : self._did_from_actor(actor), + "filter": filter, + "limit" : "100", + } + return self._pagination(endpoint, params) + + def get_feed(self, actor, feed): + endpoint = "app.bsky.feed.getFeed" + params = { + "feed" : "at://{}/app.bsky.feed.generator/{}".format( + self._did_from_actor(actor), feed), + "limit": "100", + } + return self._pagination(endpoint, params) + + def get_follows(self, actor): + endpoint = "app.bsky.graph.getFollows" + params = { + "actor": self._did_from_actor(actor), + "limit": "100", + } + return self._pagination(endpoint, params, "follows") + + def get_list_feed(self, actor, list): + endpoint = "app.bsky.feed.getListFeed" + params = { + "list" : "at://{}/app.bsky.graph.list/{}".format( + self._did_from_actor(actor), list), + "limit": "100", + } + return self._pagination(endpoint, params) + + def get_post_thread(self, actor, post_id): + endpoint = "app.bsky.feed.getPostThread" + params = { + "uri": "at://{}/app.bsky.feed.post/{}".format( + self._did_from_actor(actor), post_id), + "depth" : self.extractor.config("depth", "0"), + "parentHeight": "0", + } + + thread = self._call(endpoint, params)["thread"] + if "replies" not in thread: + return (thread,) + + index = 0 + posts = [thread] + while index < len(posts): + post = posts[index] + if "replies" in post: + posts.extend(post["replies"]) + index += 1 + return posts + + @memcache(keyarg=1) + def get_profile(self, did): + endpoint = "app.bsky.actor.getProfile" + params = {"actor": did} + return self._call(endpoint, params) + + @memcache(keyarg=1) + def resolve_handle(self, handle): + endpoint = "com.atproto.identity.resolveHandle" + params = {"handle": handle} + return self._call(endpoint, params)["did"] + + def search_posts(self, query): + endpoint = "app.bsky.feed.searchPosts" + params = { + "q" : query, + "limit": "100", + } + return self._pagination(endpoint, params, "posts") + + def _did_from_actor(self, actor): + if actor.startswith("did:"): + did = actor + else: + did = self.resolve_handle(actor) + + if self.extractor._metadata_user: + self.extractor._user = self.get_profile(did) + + return did + + def authenticate(self): + self.headers["Authorization"] = self._authenticate_impl(self.username) + + @cache(maxage=3600, keyarg=1) + def _authenticate_impl(self, username): + refresh_token = _refresh_token_cache(username) + + if refresh_token: + self.log.info("Refreshing access token for %s", username) + endpoint = "com.atproto.server.refreshSession" + headers = {"Authorization": "Bearer " + refresh_token} + data = None + else: + self.log.info("Logging in as %s", username) + endpoint = "com.atproto.server.createSession" + headers = None + data = { + "identifier": username, + "password" : self.password, + } + + url = "{}/xrpc/{}".format(self.root, endpoint) + response = self.extractor.request( + url, method="POST", headers=headers, json=data, fatal=None) + data = response.json() + + if response.status_code != 200: + self.log.debug("Server response: %s", data) + 
raise exception.AuthenticationError('"{}: {}"'.format( + data.get("error"), data.get("message"))) + + _refresh_token_cache.update(self.username, data["refreshJwt"]) + return "Bearer " + data["accessJwt"] + + def _call(self, endpoint, params): + url = "{}/xrpc/{}".format(self.root, endpoint) + + while True: + self.authenticate() + response = self.extractor.request( + url, params=params, headers=self.headers, fatal=None) + + if response.status_code < 400: + return response.json() + if response.status_code == 429: + self.extractor.wait(seconds=60) + continue + + self.extractor.log.debug("Server response: %s", response.text) + raise exception.StopExtraction( + "API request failed (%s %s)", + response.status_code, response.reason) + + def _pagination(self, endpoint, params, key="feed"): + while True: + data = self._call(endpoint, params) + yield from data[key] + + cursor = data.get("cursor") + if not cursor: + return + params["cursor"] = cursor + + +@cache(maxage=84*86400, keyarg=0) +def _refresh_token_cache(username): + return None diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index e7fc14b..1a0e47d 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -6,32 +6,39 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://bunkrr.ru/""" +"""Extractors for https://bunkr.sk/""" from .lolisafe import LolisafeAlbumExtractor from .. import text -from urllib.parse import urlsplit, urlunsplit -BASE_PATTERN = r"(?:https?://)?(?:app\.)?bunkr+\.(?:[rs]u|la|is|to)" +BASE_PATTERN = ( + r"(?:https?://)?(?:app\.)?(bunkr+" + r"\.(?:s[kiu]|ru|la|is|to|ac|black|cat|media|red|site|ws))" +) -MEDIA_DOMAIN_OVERRIDES = { - "cdn9.bunkr.ru" : "c9.bunkr.ru", - "cdn12.bunkr.ru": "media-files12.bunkr.la", - "cdn-pizza.bunkr.ru": "pizza.bunkr.ru", +LEGACY_DOMAINS = { + "bunkr.ru", + "bunkrr.ru", + "bunkr.su", + "bunkrr.su", + "bunkr.la", + "bunkr.is", + "bunkr.to", } -CDN_HOSTED_EXTENSIONS = ( - ".mp4", ".m4v", ".mov", ".webm", ".mkv", ".ts", ".wmv", - ".zip", ".rar", ".7z", -) - class BunkrAlbumExtractor(LolisafeAlbumExtractor): - """Extractor for bunkrr.ru albums""" + """Extractor for bunkr.sk albums""" category = "bunkr" - root = "https://bunkrr.ru" + root = "https://bunkr.sk" pattern = BASE_PATTERN + r"/a/([^/?#]+)" - example = "https://bunkrr.ru/a/ID" + example = "https://bunkr.sk/a/ID" + + def __init__(self, match): + LolisafeAlbumExtractor.__init__(self, match) + domain = match.group(match.lastindex-1) + if domain not in LEGACY_DOMAINS: + self.root = "https://" + domain def fetch_album(self, album_id): # album metadata @@ -53,46 +60,32 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): def _extract_files(self, urls): for url in urls: - if url.startswith("/"): - try: - url = self._extract_file(text.unescape(url)) - except Exception as exc: - self.log.error("%s: %s", exc.__class__.__name__, exc) - continue - - else: - if url.lower().endswith(CDN_HOSTED_EXTENSIONS): - scheme, domain, path, query, fragment = urlsplit(url) - if domain in MEDIA_DOMAIN_OVERRIDES: - domain = MEDIA_DOMAIN_OVERRIDES[domain] - else: - domain = domain.replace("cdn", "media-files", 1) - url = urlunsplit((scheme, domain, path, query, fragment)) - + try: + url = self._extract_file(text.unescape(url)) + except Exception as exc: + self.log.error("%s: %s", exc.__class__.__name__, exc) + continue yield {"file": text.unescape(url)} - def _extract_file(self, path): - page = self.request(self.root + path).text - 
if path[1] == "v": - url = text.extr(page, '<source src="', '"') - else: - url = text.extr(page, '<img src="', '"') - if not url: - url = text.rextract( - page, ' href="', '"', page.rindex("Download"))[0] - return url + def _extract_file(self, url): + page = self.request(url).text + return ( + text.extr(page, '<source src="', '"') or + text.extr(page, '<img src="', '"') or + text.rextract(page, ' href="', '"', page.rindex("Download"))[0] + ) class BunkrMediaExtractor(BunkrAlbumExtractor): - """Extractor for bunkrr.ru media links""" + """Extractor for bunkr.sk media links""" subcategory = "media" directory_fmt = ("{category}",) - pattern = BASE_PATTERN + r"/[vid]/([^/?#]+)" - example = "https://bunkrr.ru/v/FILENAME" + pattern = BASE_PATTERN + r"(/[vid]/[^/?#]+)" + example = "https://bunkr.sk/v/FILENAME" def fetch_album(self, album_id): try: - url = self._extract_file(urlsplit(self.url).path) + url = self._extract_file(self.root + self.album_id) except Exception as exc: self.log.error("%s: %s", exc.__class__.__name__, exc) return (), {} diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index bcfbe73..0cf4f88 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -62,7 +62,12 @@ class DeviantartExtractor(Extractor): self.unwatch = None if self.quality: - self.quality = ",q_{}".format(self.quality) + if self.quality == "png": + self.quality = "-fullview.png?" + self.quality_sub = re.compile(r"-fullview\.[a-z0-9]+\?").sub + else: + self.quality = ",q_{}".format(self.quality) + self.quality_sub = re.compile(r",q_\d+").sub if self.original != "image": self._update_content = self._update_content_default @@ -119,6 +124,12 @@ class DeviantartExtractor(Extractor): "Skipping %s (deleted)", deviation["deviationid"]) continue + tier_access = deviation.get("tier_access") + if tier_access == "locked": + self.log.debug( + "Skipping %s (access locked)", deviation["deviationid"]) + continue + if "premium_folder_data" in deviation: data = self._fetch_premium(deviation) if not data: @@ -129,26 +140,7 @@ class DeviantartExtractor(Extractor): yield Message.Directory, deviation if "content" in deviation: - content = deviation["content"] - - if self.original and deviation["is_downloadable"]: - self._update_content(deviation, content) - elif self.jwt: - self._update_token(deviation, content) - elif content["src"].startswith("https://images-wixmp-"): - if self.intermediary and deviation["index"] <= 790677560: - # https://github.com/r888888888/danbooru/issues/4069 - intermediary, count = re.subn( - r"(/f/[^/]+/[^/]+)/v\d+/.*", - r"/intermediary\1", content["src"], 1) - if count: - deviation["is_original"] = False - deviation["_fallback"] = (content["src"],) - content["src"] = intermediary - if self.quality: - content["src"] = re.sub( - r",q_\d+", self.quality, content["src"], 1) - + content = self._extract_content(deviation) yield self.commit(deviation, content) elif deviation["is_downloadable"]: @@ -333,6 +325,33 @@ class DeviantartExtractor(Extractor): deviation["extension"] = "txt" return Message.Url, txt, deviation + def _extract_content(self, deviation): + content = deviation["content"] + + if self.original and deviation["is_downloadable"]: + self._update_content(deviation, content) + return content + + if self.jwt: + self._update_token(deviation, content) + return content + + if content["src"].startswith("https://images-wixmp-"): + if self.intermediary and deviation["index"] <= 790677560: + # 
https://github.com/r888888888/danbooru/issues/4069 + intermediary, count = re.subn( + r"(/f/[^/]+/[^/]+)/v\d+/.*", + r"/intermediary\1", content["src"], 1) + if count: + deviation["is_original"] = False + deviation["_fallback"] = (content["src"],) + content["src"] = intermediary + if self.quality: + content["src"] = self.quality_sub( + self.quality, content["src"], 1) + + return content + @staticmethod def _find_folder(folders, name, uuid): if uuid.isdecimal(): diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py index 61a3928..2223403 100644 --- a/gallery_dl/extractor/fanbox.py +++ b/gallery_dl/extractor/fanbox.py @@ -11,7 +11,8 @@ from .. import text from ..cache import memcache import re -BASE_PATTERN = ( +BASE_PATTERN = r"(?:https?://)?(?:www\.)?fanbox\.cc" +USER_PATTERN = ( r"(?:https?://)?(?:" r"(?!www\.)([\w-]+)\.fanbox\.cc|" r"(?:www\.)?fanbox\.cc/@([\w-]+))" @@ -290,7 +291,7 @@ class FanboxExtractor(Extractor): class FanboxCreatorExtractor(FanboxExtractor): """Extractor for a Fanbox creator's works""" subcategory = "creator" - pattern = BASE_PATTERN + r"(?:/posts)?/?$" + pattern = USER_PATTERN + r"(?:/posts)?/?$" example = "https://USER.fanbox.cc/" def __init__(self, match): @@ -305,7 +306,7 @@ class FanboxCreatorExtractor(FanboxExtractor): class FanboxPostExtractor(FanboxExtractor): """Extractor for media from a single Fanbox post""" subcategory = "post" - pattern = BASE_PATTERN + r"/posts/(\d+)" + pattern = USER_PATTERN + r"/posts/(\d+)" example = "https://USER.fanbox.cc/posts/12345" def __init__(self, match): @@ -316,6 +317,28 @@ class FanboxPostExtractor(FanboxExtractor): return (self._get_post_data(self.post_id),) +class FanboxHomeExtractor(FanboxExtractor): + """Extractor for your Fanbox home feed""" + subcategory = "home" + pattern = BASE_PATTERN + r"/?$" + example = "https://fanbox.cc/" + + def posts(self): + url = "https://api.fanbox.cc/post.listHome?limit=10" + return self._pagination(url) + + +class FanboxSupportingExtractor(FanboxExtractor): + """Extractor for your supported Fanbox users feed""" + subcategory = "supporting" + pattern = BASE_PATTERN + r"/home/supporting" + example = "https://fanbox.cc/home/supporting" + + def posts(self): + url = "https://api.fanbox.cc/post.listSupporting?limit=10" + return self._pagination(url) + + class FanboxRedirectExtractor(Extractor): """Extractor for pixiv redirects to fanbox.cc""" category = "fanbox" diff --git a/gallery_dl/extractor/fapachi.py b/gallery_dl/extractor/fapachi.py index 6e81519..80478ca 100644 --- a/gallery_dl/extractor/fapachi.py +++ b/gallery_dl/extractor/fapachi.py @@ -58,8 +58,9 @@ class FapachiUserExtractor(Extractor): page = self.request("{}/{}/page/{}".format( self.root, self.user, self.num)).text for post in text.extract_iter(page, 'model-media-prew">', ">"): - url = self.root + text.extr(post, '<a href="', '"') - yield Message.Queue, url, data + path = text.extr(post, '<a href="', '"') + if path: + yield Message.Queue, self.root + path, data if '">Next page</a>' not in page: return diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py index ea32765..f7dc3cc 100644 --- a/gallery_dl/extractor/flickr.py +++ b/gallery_dl/extractor/flickr.py @@ -386,7 +386,11 @@ class FlickrAPI(oauth.OAuth1API): params["nojsoncallback"] = "1" if self.api_key: params["api_key"] = self.api_key - data = self.request(self.API_URL, params=params).json() + response = self.request(self.API_URL, params=params) + try: + data = response.json() + except ValueError: + data = {"code": -1, 
"message": response.content} if "code" in data: msg = data.get("message") self.log.debug("Server response: %s", data) diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index cedac0c..715abcb 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -24,6 +24,8 @@ class FoolfuukaExtractor(BaseExtractor): BaseExtractor.__init__(self, match) if self.category == "b4k": self.remote = self._remote_direct + elif self.category == "archivedmoe": + self.referer = False def items(self): yield Message.Directory, self.metadata() @@ -53,9 +55,12 @@ class FoolfuukaExtractor(BaseExtractor): def remote(self, media): """Resolve a remote media link""" - needle = '<meta http-equiv="Refresh" content="0; url=' page = self.request(media["remote_media_link"]).text - return text.extr(page, needle, '"') + url = text.extr(page, 'http-equiv="Refresh" content="0; url=', '"') + if url.endswith(".webm") and \ + url.startswith("https://thebarchive.com/"): + return url[:-1] + return url @staticmethod def _remote_direct(media): diff --git a/gallery_dl/extractor/gofile.py b/gallery_dl/extractor/gofile.py index 3928792..289f91c 100644 --- a/gallery_dl/extractor/gofile.py +++ b/gallery_dl/extractor/gofile.py @@ -73,7 +73,7 @@ class GofileFolderExtractor(Extractor): def _get_website_token(self): self.log.debug("Fetching website token") page = self.request(self.root + "/dist/js/alljs.js").text - return text.extr(page, 'fetchData.websiteToken = "', '"') + return text.extr(page, 'fetchData.wt = "', '"') def _get_content(self, content_id, password=None): if password is not None: @@ -81,7 +81,7 @@ class GofileFolderExtractor(Extractor): return self._api_request("getContent", { "contentId" : content_id, "token" : self.api_token, - "websiteToken": self.website_token, + "wt" : self.website_token, "password" : password, }) diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py index f70a948..c249a3e 100644 --- a/gallery_dl/extractor/idolcomplex.py +++ b/gallery_dl/extractor/idolcomplex.py @@ -35,7 +35,7 @@ class IdolcomplexExtractor(SankakuExtractor): def _init(self): self.find_pids = re.compile( - r" href=[\"#]/\w\w/posts/([0-9a-f]+)" + r" href=[\"#]/\w\w/posts/(\w+)" ).findall self.find_tags = re.compile( r'tag-type-([^"]+)">\s*<a [^>]*?href="/[^?]*\?tags=([^"]+)' @@ -101,7 +101,7 @@ class IdolcomplexExtractor(SankakuExtractor): page = self.request(url, retries=10).text extr = text.extract_from(page) - tags = extr("<title>", " | ") + pid_alnum = extr('/posts/', '"') vavg = extr('itemprop="ratingValue">', "<") vcnt = extr('itemprop="reviewCount">', "<") pid = extr(">Post ID:", "<") @@ -121,8 +121,8 @@ class IdolcomplexExtractor(SankakuExtractor): data = { "id" : text.parse_int(pid), + "id_alnum" : pid_alnum, "md5" : file_url.rpartition("/")[2].partition(".")[0], - "tags" : text.unescape(tags), "vote_average": text.parse_float(vavg), "vote_count" : text.parse_int(vcnt), "created_at" : created, @@ -206,8 +206,8 @@ class IdolcomplexTagExtractor(IdolcomplexExtractor): if not next_url: return - next_params = text.parse_query(text.unescape(text.unescape( - next_url).lstrip("?/"))) + next_params = text.parse_query(text.unquote(text.unescape( + text.unescape(next_url).lstrip("?/")))) if "next" in next_params: # stop if the same "next" value occurs twice in a row (#265) @@ -258,7 +258,7 @@ class IdolcomplexPostExtractor(IdolcomplexExtractor): """Extractor for single images from idol.sankakucomplex.com""" subcategory = "post" archive_fmt = "{id}" - 
pattern = BASE_PATTERN + r"/posts?/(?:show/)?([0-9a-f]+)" + pattern = BASE_PATTERN + r"/posts?/(?:show/)?(\w+)" example = "https://idol.sankakucomplex.com/posts/0123456789abcdef" def __init__(self, match): diff --git a/gallery_dl/extractor/imagechest.py b/gallery_dl/extractor/imagechest.py index 9199d12..115fff3 100644 --- a/gallery_dl/extractor/imagechest.py +++ b/gallery_dl/extractor/imagechest.py @@ -9,15 +9,17 @@ """Extractors for https://imgchest.com/""" -from .common import GalleryExtractor +from .common import GalleryExtractor, Extractor, Message from .. import text, exception +BASE_PATTERN = r"(?:https?://)?(?:www\.)?imgchest\.com" + class ImagechestGalleryExtractor(GalleryExtractor): """Extractor for image galleries from imgchest.com""" category = "imagechest" root = "https://imgchest.com" - pattern = r"(?:https?://)?(?:www\.)?imgchest\.com/p/([A-Za-z0-9]{11})" + pattern = BASE_PATTERN + r"/p/([A-Za-z0-9]{11})" example = "https://imgchest.com/p/abcdefghijk" def __init__(self, match): @@ -83,6 +85,42 @@ class ImagechestGalleryExtractor(GalleryExtractor): ] +class ImagechestUserExtractor(Extractor): + """Extractor for imgchest.com user profiles""" + category = "imagechest" + subcategory = "user" + root = "https://imgchest.com" + pattern = BASE_PATTERN + r"/u/([^/?#]+)" + example = "https://imgchest.com/u/USER" + + def __init__(self, match): + Extractor.__init__(self, match) + self.user = match.group(1) + + def items(self): + url = self.root + "/api/posts" + params = { + "page" : 1, + "sort" : "new", + "tag" : "", + "q" : "", + "username": text.unquote(self.user), + "nsfw" : "true", + } + + while True: + try: + data = self.request(url, params=params).json()["data"] + except (TypeError, KeyError): + return + + for gallery in data: + gallery["_extractor"] = ImagechestGalleryExtractor + yield Message.Queue, gallery["link"], gallery + + params["page"] += 1 + + class ImagechestAPI(): """Interface for the Image Chest API diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py index 94019bd..5f1e0f4 100644 --- a/gallery_dl/extractor/imagehosts.py +++ b/gallery_dl/extractor/imagehosts.py @@ -183,6 +183,23 @@ class ImagetwistImageExtractor(ImagehostImageExtractor): return url, filename +class ImagetwistGalleryExtractor(ImagehostImageExtractor): + """Extractor for galleries from imagetwist.com""" + category = "imagetwist" + subcategory = "gallery" + pattern = (r"(?:https?://)?((?:www\.|phun\.)?" + r"image(?:twist|haha)\.com/(p/[^/?#]+/\d+))") + example = "https://imagetwist.com/p/USER/12345/NAME" + + def items(self): + data = {"_extractor": ImagetwistImageExtractor} + root = self.page_url[:self.page_url.find("/", 8)] + page = self.request(self.page_url).text + gallery = text.extr(page, 'class="gallerys', "</div") + for path in text.extract_iter(gallery, ' href="', '"'): + yield Message.Queue, root + path, data + + class ImgspiceImageExtractor(ImagehostImageExtractor): """Extractor for single images from imgspice.com""" category = "imgspice" diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 10228b5..fd5a73a 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -6,7 +6,7 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://kemono.party/""" +"""Extractors for https://kemono.su/""" from .common import Extractor, Message from .. 
import text, util, exception @@ -23,11 +23,11 @@ HASH_PATTERN = r"/[0-9a-f]{2}/[0-9a-f]{2}/([0-9a-f]{64})" class KemonopartyExtractor(Extractor): """Base class for kemonoparty extractors""" category = "kemonoparty" - root = "https://kemono.party" + root = "https://kemono.su" directory_fmt = ("{category}", "{service}", "{user}") filename_fmt = "{id}_{title[:180]}_{num:>02}_{filename[:180]}.{extension}" archive_fmt = "{service}_{user}_{id}_{num}" - cookies_domain = ".kemono.party" + cookies_domain = ".kemono.su" def __init__(self, match): domain = match.group(1) @@ -39,6 +39,8 @@ class KemonopartyExtractor(Extractor): def _init(self): self.revisions = self.config("revisions") + if self.revisions: + self.revisions_unique = (self.revisions == "unique") self._prepare_ddosguard_cookies() self._find_inline = re.compile( r'src="(?:https?://(?:kemono|coomer)\.(?:party|su))?(/inline/[^"]+' @@ -162,7 +164,7 @@ class KemonopartyExtractor(Extractor): return post["attachments"] def _inline(self, post): - for path in self._find_inline(post["content"] or ""): + for path in self._find_inline(post.get("content") or ""): yield {"path": path, "name": path, "type": "inline"} def _build_file_generators(self, filetypes): @@ -222,8 +224,37 @@ class KemonopartyExtractor(Extractor): self.root, server) return self.request(url).json() - @memcache(keyarg=1) - def _post_revisions(self, url): + def _revisions_post(self, post, url): + post["revision_id"] = 0 + + try: + revs = self.request(url + "/revisions").json() + except exception.HttpError: + post["revision_hash"] = self._revision_hash(post) + post["revision_index"] = 1 + return (post,) + revs.insert(0, post) + + for rev in revs: + rev["revision_hash"] = self._revision_hash(rev) + + if self.revisions_unique: + uniq = [] + last = None + for rev in revs: + if last != rev["revision_hash"]: + last = rev["revision_hash"] + uniq.append(rev) + revs = uniq + + idx = len(revs) + for rev in revs: + rev["revision_index"] = idx + idx -= 1 + + return revs + + def _revisions_all(self, url): revs = self.request(url + "/revisions").json() idx = len(revs) @@ -240,7 +271,9 @@ class KemonopartyExtractor(Extractor): rev.pop("added", None) rev.pop("next", None) rev.pop("prev", None) + rev["file"] = rev["file"].copy() rev["file"].pop("name", None) + rev["attachments"] = [a.copy() for a in rev["attachments"]] for a in rev["attachments"]: a.pop("name", None) return util.sha1(self._json_dumps(rev)) @@ -252,10 +285,10 @@ def _validate(response): class KemonopartyUserExtractor(KemonopartyExtractor): - """Extractor for all posts from a kemono.party user listing""" + """Extractor for all posts from a kemono.su user listing""" subcategory = "user" pattern = USER_PATTERN + r"/?(?:\?([^#]+))?(?:$|[?#])" - example = "https://kemono.party/SERVICE/user/12345" + example = "https://kemono.su/SERVICE/user/12345" def __init__(self, match): _, _, service, user_id, self.query = match.groups() @@ -275,18 +308,9 @@ class KemonopartyUserExtractor(KemonopartyExtractor): if self.revisions: for post in posts: - post["revision_hash"] = self._revision_hash(post) - post["revision_id"] = 0 - post_url = "{}/post/{}".format(self.api_url, post["id"]) - try: - revs = self._post_revisions(post_url) - except exception.HttpError: - post["revision_index"] = 1 - yield post - else: - post["revision_index"] = len(revs) + 1 - yield post - yield from revs + post_url = "{}/api/v1/{}/user/{}/post/{}".format( + self.root, post["service"], post["user"], post["id"]) + yield from self._revisions_post(post, post_url) else: yield from 
posts @@ -295,11 +319,25 @@ class KemonopartyUserExtractor(KemonopartyExtractor): params["o"] += 50 +class KemonopartyPostsExtractor(KemonopartyExtractor): + """Extractor for kemono.su post listings""" + subcategory = "posts" + pattern = BASE_PATTERN + r"/posts(?:/?\?([^#]+))?" + example = "https://kemono.su/posts" + + def __init__(self, match): + KemonopartyExtractor.__init__(self, match) + self.query = match.group(3) + self.api_url = self.root + "/api/v1/posts" + + posts = KemonopartyUserExtractor.posts + + class KemonopartyPostExtractor(KemonopartyExtractor): - """Extractor for a single kemono.party post""" + """Extractor for a single kemono.su post""" subcategory = "post" pattern = USER_PATTERN + r"/post/([^/?#]+)(/revisions?(?:/(\d*))?)?" - example = "https://kemono.party/SERVICE/user/12345/post/12345" + example = "https://kemono.su/SERVICE/user/12345/post/12345" def __init__(self, match): _, _, service, user_id, post_id, self.revision, self.revision_id = \ @@ -314,18 +352,10 @@ class KemonopartyPostExtractor(KemonopartyExtractor): if not self.revision: post = self.request(self.api_url).json() if self.revisions: - post["revision_hash"] = self._revision_hash(post) - post["revision_id"] = 0 - try: - revs = self._post_revisions(self.api_url) - except exception.HttpError: - post["revision_index"] = 1 - else: - post["revision_index"] = len(revs) + 1 - return itertools.chain((post,), revs) + return self._revisions_post(post, self.api_url) return (post,) - revs = self._post_revisions(self.api_url) + revs = self._revisions_all(self.api_url) if not self.revision_id: return revs @@ -337,14 +367,14 @@ class KemonopartyPostExtractor(KemonopartyExtractor): class KemonopartyDiscordExtractor(KemonopartyExtractor): - """Extractor for kemono.party discord servers""" + """Extractor for kemono.su discord servers""" subcategory = "discord" directory_fmt = ("{category}", "discord", "{server}", "{channel_name|channel}") filename_fmt = "{id}_{num:>02}_{filename}.{extension}" archive_fmt = "discord_{server}_{id}_{num}" pattern = BASE_PATTERN + r"/discord/server/(\d+)(?:/channel/(\d+))?#(.*)" - example = "https://kemono.party/discord/server/12345#CHANNEL" + example = "https://kemono.su/discord/server/12345#CHANNEL" def __init__(self, match): KemonopartyExtractor.__init__(self, match) @@ -430,7 +460,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor): class KemonopartyDiscordServerExtractor(KemonopartyExtractor): subcategory = "discord-server" pattern = BASE_PATTERN + r"/discord/server/(\d+)$" - example = "https://kemono.party/discord/server/12345" + example = "https://kemono.su/discord/server/12345" def __init__(self, match): KemonopartyExtractor.__init__(self, match) @@ -445,10 +475,10 @@ class KemonopartyDiscordServerExtractor(KemonopartyExtractor): class KemonopartyFavoriteExtractor(KemonopartyExtractor): - """Extractor for kemono.party favorites""" + """Extractor for kemono.su favorites""" subcategory = "favorite" pattern = BASE_PATTERN + r"/favorites(?:/?\?([^#]+))?" 
- example = "https://kemono.party/favorites" + example = "https://kemono.su/favorites" def __init__(self, match): KemonopartyExtractor.__init__(self, match) diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py index c3c44d2..8e73964 100644 --- a/gallery_dl/extractor/luscious.py +++ b/gallery_dl/extractor/luscious.py @@ -63,7 +63,11 @@ class LusciousAlbumExtractor(LusciousExtractor): image["num"] = num image["album"] = album - image["thumbnail"] = image.pop("thumbnails")[0]["url"] + try: + image["thumbnail"] = image.pop("thumbnails")[0]["url"] + except LookupError: + image["thumbnail"] = "" + image["tags"] = [item["text"] for item in image["tags"]] image["date"] = text.parse_timestamp(image["created"]) image["id"] = text.parse_int(image["id"]) diff --git a/gallery_dl/extractor/naverwebtoon.py b/gallery_dl/extractor/naverwebtoon.py index 72ee5b0..4137f5d 100644 --- a/gallery_dl/extractor/naverwebtoon.py +++ b/gallery_dl/extractor/naverwebtoon.py @@ -46,7 +46,7 @@ class NaverwebtoonEpisodeExtractor(NaverwebtoonBase, GalleryExtractor): "episode" : self.episode, "comic" : extr('titleName: "', '"'), "tags" : [t.strip() for t in text.extract_iter( - extr("tagList: [", "}],"), '"tagName":"', '"')], + extr("tagList: [", "],"), '"tagName":"', '"')], "title" : extr('"subtitle":"', '"'), "author" : [a.strip() for a in text.extract_iter( extr('"writers":[', ']'), '"name":"', '"')], @@ -79,9 +79,6 @@ class NaverwebtoonComicExtractor(NaverwebtoonBase, Extractor): self.sort = query.get("sort", "ASC") def items(self): - base = "{}/{}/detail?titleId={}&no=".format( - self.root, self.path, self.title_id) - url = self.root + "/api/article/list" headers = { "Accept": "application/json, text/plain, */*", @@ -95,6 +92,10 @@ class NaverwebtoonComicExtractor(NaverwebtoonBase, Extractor): while True: data = self.request(url, headers=headers, params=params).json() + path = data["webtoonLevelCode"].lower().replace("_c", "C", 1) + base = "{}/{}/detail?titleId={}&no=".format( + self.root, path, data["titleId"]) + for article in data["articleList"]: article["_extractor"] = NaverwebtoonEpisodeExtractor yield Message.Queue, base + str(article["no"]), article diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index c46a587..8c04ed5 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -149,8 +149,7 @@ class PinterestBoardExtractor(PinterestExtractor): pins = self.api.board_pins(board["id"]) if board["section_count"] and self.config("sections", True): - base = "{}/{}/{}/id:".format( - self.root, board["owner"]["username"], board["name"]) + base = "{}{}id:".format(self.root, board["url"]) data = {"_extractor": PinterestSectionExtractor} sections = [(base + section["id"], data) for section in self.api.board_sections(board["id"])] @@ -220,7 +219,7 @@ class PinterestSectionExtractor(PinterestExtractor): "{board[name]}", "{section[title]}") archive_fmt = "{board[id]}_{id}" pattern = BASE_PATTERN + r"/(?!pin/)([^/?#]+)/([^/?#]+)/([^/?#]+)" - example = "https://www.pinterest.com/USER/BOARD/SEcTION" + example = "https://www.pinterest.com/USER/BOARD/SECTION" def __init__(self, match): PinterestExtractor.__init__(self, match) diff --git a/gallery_dl/extractor/pornpics.py b/gallery_dl/extractor/pornpics.py index 4a6f031..83f3064 100644 --- a/gallery_dl/extractor/pornpics.py +++ b/gallery_dl/extractor/pornpics.py @@ -76,7 +76,7 @@ class PornpicsGalleryExtractor(PornpicsExtractor, GalleryExtractor): "gallery_id": 
text.parse_int(self.gallery_id), "slug" : extr("/galleries/", "/").rpartition("-")[0], "title" : text.unescape(extr("<h1>", "<")), - "channel" : extr('>Channel:', '</a>').rpartition(">")[2], + "channel" : text.split_html(extr(">Channel: ", '</div>')), "models" : text.split_html(extr( ">Models:", '<span class="suggest')), "categories": text.split_html(extr( diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index b3b7a9c..caf3e16 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -179,12 +179,16 @@ class SankakuAPI(): def __init__(self, extractor): self.extractor = extractor self.headers = { - "Accept" : "application/vnd.sankaku.api+json;v=2", - "Platform": "web-app", - "Origin" : extractor.root, + "Accept" : "application/vnd.sankaku.api+json;v=2", + "Platform" : "web-app", + "Api-Version": None, + "Origin" : extractor.root, } - self.username, self.password = self.extractor._get_auth_info() + if extractor.config("id-format") in ("alnum", "alphanumeric"): + self.headers["Api-Version"] = "2" + + self.username, self.password = extractor._get_auth_info() if not self.username: self.authenticate = util.noop diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py index 55a0db0..0b29ed0 100644 --- a/gallery_dl/extractor/skeb.py +++ b/gallery_dl/extractor/skeb.py @@ -33,10 +33,14 @@ class SkebExtractor(Extractor): response, post = self._get_post_data(user_name, post_num) if metadata: post.update(metadata) + + files = self._get_files_from_post(response) + post["count"] = len(files) yield Message.Directory, post - for data in self._get_urls_from_post(response, post): - url = data["file_url"] - yield Message.Url, url, text.nameext_from_url(url, data) + for post["num"], file in enumerate(files, 1): + post.update(file) + url = file["file_url"] + yield Message.Url, url, text.nameext_from_url(url, post) def posts(self): """Return post number""" @@ -105,40 +109,48 @@ class SkebExtractor(Extractor): } return resp, post - def _get_urls_from_post(self, resp, post): + def _get_files_from_post(self, resp): + files = [] + if self.thumbnails and "og_image_url" in resp: - post["content_category"] = "thumb" - post["file_id"] = "thumb" - post["_file_id"] = str(resp["id"]) + "t" - post["file_url"] = resp["og_image_url"] - yield post + files.append({ + "content_category": "thumb", + "file_id" : "thumb", + "_file_id": str(resp["id"]) + "t", + "file_url": resp["og_image_url"], + }) if self.article and "article_image_url" in resp: url = resp["article_image_url"] if url: - post["content_category"] = "article" - post["file_id"] = "article" - post["_file_id"] = str(resp["id"]) + "a" - post["file_url"] = url - yield post + files.append({ + "content_category": "article", + "file_id" : "article", + "_file_id": str(resp["id"]) + "a", + "file_url": url, + }) for preview in resp["previews"]: - post["content_category"] = "preview" - post["file_id"] = post["_file_id"] = preview["id"] - post["file_url"] = preview["url"] info = preview["information"] - post["original"] = { - "width" : info["width"], - "height" : info["height"], - "byte_size" : info["byte_size"], - "duration" : info["duration"], - "frame_rate": info["frame_rate"], - "software" : info["software"], - "extension" : info["extension"], - "is_movie" : info["is_movie"], - "transcoder": info["transcoder"], - } - yield post + files.append({ + "content_category": "preview", + "file_id" : preview["id"], + "_file_id": preview["id"], + "file_url": preview["url"], + "original": { + "width" : 
info["width"], + "height" : info["height"], + "byte_size" : info["byte_size"], + "duration" : info["duration"], + "frame_rate": info["frame_rate"], + "software" : info["software"], + "extension" : info["extension"], + "is_movie" : info["is_movie"], + "transcoder": info["transcoder"], + }, + }) + + return files class SkebPostExtractor(SkebExtractor): diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index cf759e0..ad5bfc6 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -693,6 +693,28 @@ class TwitterHashtagExtractor(TwitterExtractor): yield Message.Queue, url, data +class TwitterCommunityExtractor(TwitterExtractor): + """Extractor for a Twitter community""" + subcategory = "community" + pattern = BASE_PATTERN + r"/i/communities/(\d+)" + example = "https://twitter.com/i/communities/12345" + + def tweets(self): + if self.textonly: + return self.api.community_tweets_timeline(self.user) + return self.api.community_media_timeline(self.user) + + +class TwitterCommunitiesExtractor(TwitterExtractor): + """Extractor for followed Twitter communities""" + subcategory = "communities" + pattern = BASE_PATTERN + r"/([^/?#]+)/communities/?$" + example = "https://twitter.com/i/communities" + + def tweets(self): + return self.api.communities_main_page_timeline(self.user) + + class TwitterEventExtractor(TwitterExtractor): """Extractor for Tweets from a Twitter Event""" subcategory = "event" @@ -881,15 +903,19 @@ class TwitterAPI(): self.headers = { "Accept": "*/*", - "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR" - "COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu" - "4FA33AGWWjCpTnA", + "Referer": "https://twitter.com/", + "content-type": "application/json", "x-guest-token": None, "x-twitter-auth-type": "OAuth2Session" if auth_token else None, + "x-csrf-token": csrf_token, "x-twitter-client-language": "en", "x-twitter-active-user": "yes", - "x-csrf-token": csrf_token, - "Referer": "https://twitter.com/", + "Sec-Fetch-Dest": "empty", + "Sec-Fetch-Mode": "cors", + "Sec-Fetch-Site": "same-origin", + "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR" + "COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu" + "4FA33AGWWjCpTnA", } self.params = { "include_profile_interstitial_type": "1", @@ -933,78 +959,54 @@ class TwitterAPI(): "collab_control,vibe", } self.features = { - "hidden_profile_likes_enabled": False, + "hidden_profile_likes_enabled": True, + "hidden_profile_subscriptions_enabled": True, "responsive_web_graphql_exclude_directive_enabled": True, "verified_phone_label_enabled": False, - "subscriptions_verification_info_verified_since_enabled": True, "highlights_tweets_tab_ui_enabled": True, + "responsive_web_twitter_article_notes_tab_enabled": True, "creator_subscriptions_tweet_preview_api_enabled": True, "responsive_web_graphql_" "skip_user_profile_image_extensions_enabled": False, "responsive_web_graphql_timeline_navigation_enabled": True, } self.features_pagination = { - "rweb_lists_timeline_redesign_enabled": True, "responsive_web_graphql_exclude_directive_enabled": True, "verified_phone_label_enabled": False, "creator_subscriptions_tweet_preview_api_enabled": True, "responsive_web_graphql_timeline_navigation_enabled": True, "responsive_web_graphql_skip_user_profile_" "image_extensions_enabled": False, + "c9s_tweet_anatomy_moderator_badge_enabled": True, "tweetypie_unmention_optimization_enabled": True, "responsive_web_edit_tweet_api_enabled": True, 
"graphql_is_translatable_rweb_tweet_is_translatable_enabled": True, "view_counts_everywhere_api_enabled": True, "longform_notetweets_consumption_enabled": True, + "responsive_web_twitter_article_tweet_consumption_enabled": True, "tweet_awards_web_tipping_enabled": False, "freedom_of_speech_not_reach_fetch_enabled": True, "standardized_nudges_misinfo": True, "tweet_with_visibility_results_prefer_gql_" - "limited_actions_policy_enabled": False, - "interactive_text_enabled": True, - "responsive_web_text_conversations_enabled": False, + "limited_actions_policy_enabled": True, + "rweb_video_timestamps_enabled": True, "longform_notetweets_rich_text_read_enabled": True, - "longform_notetweets_inline_media_enabled": False, + "longform_notetweets_inline_media_enabled": True, + "responsive_web_media_download_video_enabled": True, "responsive_web_enhance_cards_enabled": False, } def tweet_result_by_rest_id(self, tweet_id): - endpoint = "/graphql/2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId" + endpoint = "/graphql/MWY3AO9_I3rcP_L2A4FR4A/TweetResultByRestId" + variables = { + "tweetId": tweet_id, + "withCommunity": False, + "includePromotedContent": False, + "withVoice": False, + } params = { - "variables": self._json_dumps({ - "tweetId": tweet_id, - "withCommunity": False, - "includePromotedContent": False, - "withVoice": False, - }), - "features": self._json_dumps({ - "creator_subscriptions_tweet_preview_api_enabled": True, - "tweetypie_unmention_optimization_enabled": True, - "responsive_web_edit_tweet_api_enabled": True, - "graphql_is_translatable_rweb_tweet_is_translatable_enabled": - True, - "view_counts_everywhere_api_enabled": True, - "longform_notetweets_consumption_enabled": True, - "responsive_web_twitter_article_tweet_consumption_enabled": - False, - "tweet_awards_web_tipping_enabled": False, - "freedom_of_speech_not_reach_fetch_enabled": True, - "standardized_nudges_misinfo": True, - "tweet_with_visibility_results_prefer_gql_" - "limited_actions_policy_enabled": True, - "longform_notetweets_rich_text_read_enabled": True, - "longform_notetweets_inline_media_enabled": True, - "responsive_web_graphql_exclude_directive_enabled": True, - "verified_phone_label_enabled": False, - "responsive_web_media_download_video_enabled": False, - "responsive_web_graphql_skip_user_profile_" - "image_extensions_enabled": False, - "responsive_web_graphql_timeline_navigation_enabled": True, - "responsive_web_enhance_cards_enabled": False, - }), - "fieldToggles": self._json_dumps({ - "withArticleRichContentState": False, - }), + "variables": self._json_dumps(variables), + "features" : self._json_dumps(self.features_pagination), } tweet = self._call(endpoint, params)["data"]["tweetResult"]["result"] if "tweet" in tweet: @@ -1021,7 +1023,7 @@ class TwitterAPI(): return tweet def tweet_detail(self, tweet_id): - endpoint = "/graphql/JlLZj42Ltr2qwjasw-l5lQ/TweetDetail" + endpoint = "/graphql/B9_KmbkLhXt6jRwGjJrweg/TweetDetail" variables = { "focalTweetId": tweet_id, "referrer": "profile", @@ -1037,7 +1039,7 @@ class TwitterAPI(): endpoint, variables, ("threaded_conversation_with_injections_v2",)) def user_tweets(self, screen_name): - endpoint = "/graphql/-AY51QoFpVf-w7TxjQ6lpw/UserTweets" + endpoint = "/graphql/5ICa5d9-AitXZrIA3H-4MQ/UserTweets" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, @@ -1049,7 +1051,7 @@ class TwitterAPI(): return self._pagination_tweets(endpoint, variables) def user_tweets_and_replies(self, screen_name): - endpoint = 
"/graphql/urrCZMyyIh1FkSFi2cdPUA/UserTweetsAndReplies" + endpoint = "/graphql/UtLStR_BnYUGD7Q453UXQg/UserTweetsAndReplies" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, @@ -1061,7 +1063,7 @@ class TwitterAPI(): return self._pagination_tweets(endpoint, variables) def user_media(self, screen_name): - endpoint = "/graphql/lo965xQZdN2-eSM1Jc-W_A/UserMedia" + endpoint = "/graphql/tO4LMUYAZbR4T0SqQ85aAw/UserMedia" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, @@ -1073,28 +1075,8 @@ class TwitterAPI(): } return self._pagination_tweets(endpoint, variables) - def user_media_legacy(self, screen_name): - endpoint = "/graphql/nRybED9kRbN-TOWioHq1ng/UserMedia" - variables = { - "userId": self._user_id_by_screen_name(screen_name), - "count": 100, - "includePromotedContent": False, - "withSuperFollowsUserFields": True, - "withBirdwatchPivots": False, - "withSuperFollowsTweetFields": True, - "withClientEventToken": False, - "withBirdwatchNotes": False, - "withVoice": True, - "withV2Timeline": False, - "__fs_interactive_text": False, - "__fs_dont_mention_me_view_api_enabled": False, - } - return self._pagination_tweets( - endpoint, variables, ("user", "result", "timeline", "timeline"), - features=False) - def user_likes(self, screen_name): - endpoint = "/graphql/6JET1d0iHsIzW0Zjs3OOwQ/Likes" + endpoint = "/graphql/9s8V6sUI8fZLDiN-REkAxA/Likes" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, @@ -1107,9 +1089,10 @@ class TwitterAPI(): return self._pagination_tweets(endpoint, variables) def user_bookmarks(self): - endpoint = "/graphql/YNtYqNuki6_oiVwx0uP8mQ/Bookmarks" + endpoint = "/graphql/cQxQgX8MJYjWwC0dxpyfYg/Bookmarks" variables = { "count": 100, + "includePromotedContent": False, } features = self.features_pagination.copy() features["graphql_timeline_v2_bookmark_timeline"] = True @@ -1118,7 +1101,7 @@ class TwitterAPI(): features=features) def list_latest_tweets_timeline(self, list_id): - endpoint = "/graphql/ZBbXrl37E6za5ml-DIpmgg/ListLatestTweetsTimeline" + endpoint = "/graphql/HjsWc-nwwHKYwHenbHm-tw/ListLatestTweetsTimeline" variables = { "listId": list_id, "count": 100, @@ -1127,22 +1110,54 @@ class TwitterAPI(): endpoint, variables, ("list", "tweets_timeline", "timeline")) def search_timeline(self, query): - endpoint = "/graphql/7jT5GT59P8IFjgxwqnEdQw/SearchTimeline" + endpoint = "/graphql/fZK7JipRHWtiZsTodhsTfQ/SearchTimeline" variables = { "rawQuery": query, - "count": 20, + "count": 100, + "querySource": "", "product": "Latest", - "withDownvotePerspective": False, - "withReactionsMetadata": False, - "withReactionsPerspective": False, } - features = self.features_pagination.copy() - features["blue_business_profile_image_shape_enabled"] = False - features["vibe_api_enabled"] = True + return self._pagination_tweets( endpoint, variables, - ("search_by_raw_query", "search_timeline", "timeline"), - features=features) + ("search_by_raw_query", "search_timeline", "timeline")) + + def community_tweets_timeline(self, community_id): + endpoint = "/graphql/7B2AdxSuC-Er8qUr3Plm_w/CommunityTweetsTimeline" + variables = { + "communityId": community_id, + "count": 100, + "displayLocation": "Community", + "rankingMode": "Recency", + "withCommunity": True, + } + return self._pagination_tweets( + endpoint, variables, + ("communityResults", "result", "ranked_community_timeline", + "timeline")) + + def community_media_timeline(self, community_id): + endpoint = 
"/graphql/qAGUldfcIoMv5KyAyVLYog/CommunityMediaTimeline" + variables = { + "communityId": community_id, + "count": 100, + "withCommunity": True, + } + return self._pagination_tweets( + endpoint, variables, + ("communityResults", "result", "community_media_timeline", + "timeline")) + + def communities_main_page_timeline(self, screen_name): + endpoint = ("/graphql/GtOhw2mstITBepTRppL6Uw" + "/CommunitiesMainPageTimeline") + variables = { + "count": 100, + "withCommunity": True, + } + return self._pagination_tweets( + endpoint, variables, + ("viewer", "communities_timeline", "timeline")) def live_event_timeline(self, event_id): endpoint = "/2/live_event/timeline/{}.json".format(event_id) @@ -1160,21 +1175,8 @@ class TwitterAPI(): return (self._call(endpoint, params) ["twitter_objects"]["live_events"][event_id]) - def list_by_rest_id(self, list_id): - endpoint = "/graphql/AmCdeFUvlrKAO96yHr-GCg/ListByRestId" - params = { - "variables": self._json_dumps({ - "listId": list_id, - }), - "features": self._json_dumps(self.features), - } - try: - return self._call(endpoint, params)["data"]["list"] - except KeyError: - raise exception.NotFoundError("list") - def list_members(self, list_id): - endpoint = "/graphql/a_ZQomd3MMk1crWkeiQBPg/ListMembers" + endpoint = "/graphql/BQp2IEYkgxuSxqbTAr1e1g/ListMembers" variables = { "listId": list_id, "count": 100, @@ -1184,7 +1186,7 @@ class TwitterAPI(): endpoint, variables, ("list", "members_timeline", "timeline")) def user_following(self, screen_name): - endpoint = "/graphql/JPZiqKjET7_M1r5Tlr8pyA/Following" + endpoint = "/graphql/PAnE9toEjRfE-4tozRcsfw/Following" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, @@ -1194,9 +1196,8 @@ class TwitterAPI(): @memcache(keyarg=1) def user_by_rest_id(self, rest_id): - endpoint = "/graphql/1YAM811Q8Ry4XyPpJclURQ/UserByRestId" - features = self.features.copy() - features["blue_business_profile_image_shape_enabled"] = True + endpoint = "/graphql/tD8zKvQzwY3kdx5yz6YmOw/UserByRestId" + features = self.features params = { "variables": self._json_dumps({ "userId": rest_id, @@ -1208,13 +1209,18 @@ class TwitterAPI(): @memcache(keyarg=1) def user_by_screen_name(self, screen_name): - endpoint = "/graphql/XA6F1nJELYg65hxOC2Ekmg/UserByScreenName" + endpoint = "/graphql/k5XapwcSikNsEsILW5FvgA/UserByScreenName" + features = self.features.copy() + features["subscriptions_verification_info_" + "is_identity_verified_enabled"] = True + features["subscriptions_verification_info_" + "verified_since_enabled"] = True params = { "variables": self._json_dumps({ "screen_name": screen_name, "withSafetyModeUserFields": True, }), - "features": self._json_dumps(self.features), + "features": self._json_dumps(features), } return self._call(endpoint, params)["data"]["user"]["result"] @@ -1486,7 +1492,8 @@ class TwitterAPI(): if esw("tweet-"): tweets.append(entry) - elif esw("profile-grid-"): + elif esw(("profile-grid-", + "communities-grid-")): if "content" in entry: tweets.extend(entry["content"]["items"]) else: diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py index 14e3c7b..41141c6 100644 --- a/gallery_dl/extractor/vsco.py +++ b/gallery_dl/extractor/vsco.py @@ -11,8 +11,8 @@ from .common import Extractor, Message from .. 
import text, util - -BASE_PATTERN = r"(?:https?://)?(?:www\.)?vsco\.co/([^/]+)" +BASE_PATTERN = r"(?:https?://)?(?:www\.)?vsco\.co" +USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)" class VscoExtractor(Extractor): @@ -115,7 +115,7 @@ class VscoExtractor(Extractor): class VscoUserExtractor(VscoExtractor): """Extractor for images from a user on vsco.co""" subcategory = "user" - pattern = BASE_PATTERN + r"(?:/gallery|/images(?:/\d+)?)?/?(?:$|[?#])" + pattern = USER_PATTERN + r"(?:/gallery|/images(?:/\d+)?)?/?(?:$|[?#])" example = "https://vsco.co/USER/gallery" def images(self): @@ -139,8 +139,8 @@ class VscoCollectionExtractor(VscoExtractor): subcategory = "collection" directory_fmt = ("{category}", "{user}", "collection") archive_fmt = "c_{user}_{id}" - pattern = BASE_PATTERN + r"/collection/" - example = "https://vsco.co/USER/collection/12345" + pattern = USER_PATTERN + r"/collection" + example = "https://vsco.co/USER/collection/1" def images(self): url = "{}/{}/collection/1".format(self.root, self.user) @@ -159,10 +159,89 @@ class VscoCollectionExtractor(VscoExtractor): )) +class VscoSpaceExtractor(VscoExtractor): + """Extractor for a vsco.co space""" + subcategory = "space" + directory_fmt = ("{category}", "space", "{user}") + archive_fmt = "s_{user}_{id}" + pattern = BASE_PATTERN + r"/spaces/([^/?#]+)" + example = "https://vsco.co/spaces/a1b2c3d4e5f" + + def images(self): + url = "{}/spaces/{}".format(self.root, self.user) + data = self._extract_preload_state(url) + + tkn = data["users"]["currentUser"]["tkn"] + sid = self.user + + posts = data["entities"]["posts"] + images = data["entities"]["postImages"] + for post in posts.values(): + post["image"] = images[post["image"]] + + space = data["spaces"]["byId"][sid] + space["postsList"] = [posts[pid] for pid in space["postsList"]] + + url = "{}/grpc/spaces/{}/posts".format(self.root, sid) + params = {} + return self._pagination(url, params, tkn, space) + + def _pagination(self, url, params, token, data): + headers = { + "Accept" : "application/json", + "Referer" : "{}/spaces/{}".format(self.root, self.user), + "Content-Type" : "application/json", + "Authorization": "Bearer " + token, + } + + while True: + for post in data["postsList"]: + post = self._transform_media(post["image"]) + post["upload_date"] = post["upload_date"]["sec"] * 1000 + yield post + + cursor = data["cursor"] + if cursor.get("atEnd"): + return + params["cursor"] = cursor["postcursorcontext"]["postId"] + + data = self.request(url, params=params, headers=headers).json() + + +class VscoSpacesExtractor(VscoExtractor): + """Extractor for a vsco.co user's spaces""" + subcategory = "spaces" + pattern = USER_PATTERN + r"/spaces" + example = "https://vsco.co/USER/spaces" + + def items(self): + url = "{}/{}/spaces".format(self.root, self.user) + data = self._extract_preload_state(url) + + tkn = data["users"]["currentUser"]["tkn"] + uid = data["sites"]["siteByUsername"][self.user]["site"]["userId"] + + headers = { + "Accept" : "application/json", + "Referer" : url, + "Content-Type" : "application/json", + "Authorization": "Bearer " + tkn, + } + # this would theoretically need to be paginated + url = "{}/grpc/spaces/user/{}".format(self.root, uid) + data = self.request(url, headers=headers).json() + + for space in data["spacesWithRoleList"]: + space = space["space"] + url = "{}/spaces/{}".format(self.root, space["id"]) + space["_extractor"] = VscoSpaceExtractor + yield Message.Queue, url, space + + class VscoImageExtractor(VscoExtractor): """Extractor for individual images on vsco.co""" 
subcategory = "image" - pattern = BASE_PATTERN + r"/media/([0-9a-fA-F]+)" + pattern = USER_PATTERN + r"/media/([0-9a-fA-F]+)" example = "https://vsco.co/USER/media/0123456789abcdef" def __init__(self, match): diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index 3bd0648..5b45148 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -33,6 +33,8 @@ class WeiboExtractor(Extractor): self.retweets = self.config("retweets", True) self.videos = self.config("videos", True) self.livephoto = self.config("livephoto", True) + self.gifs = self.config("gifs", True) + self.gifs_video = (self.gifs == "video") cookies = _cookie_cache() if cookies is not None: @@ -106,8 +108,11 @@ class WeiboExtractor(Extractor): pic = pics[pic_id] pic_type = pic.get("type") - if pic_type == "gif" and self.videos: - append({"url": pic["video"]}) + if pic_type == "gif" and self.gifs: + if self.gifs_video: + append({"url": pic["video"]}) + else: + append(pic["largest"].copy()) elif pic_type == "livephoto" and self.livephoto: append(pic["largest"].copy()) diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py index 1eafc29..c93f33f 100644 --- a/gallery_dl/extractor/wikimedia.py +++ b/gallery_dl/extractor/wikimedia.py @@ -25,15 +25,14 @@ class WikimediaExtractor(BaseExtractor): BaseExtractor.__init__(self, match) path = match.group(match.lastindex) - if self.category == "fandom": + if self.category == "wikimedia": + self.category = self.root.split(".")[-2] + elif self.category == "fandom": self.category = \ "fandom-" + self.root.partition(".")[0].rpartition("/")[2] if path.startswith("wiki/"): path = path[5:] - self.api_path = "/w/api.php" - else: - self.api_path = "/api.php" pre, sep, _ = path.partition(":") prefix = pre.lower() if sep else None @@ -66,7 +65,7 @@ class WikimediaExtractor(BaseExtractor): else: self.api_url = api_path else: - self.api_url = self.root + self.api_path + self.api_url = self.root + "/api.php" def items(self): for info in self._pagination(self.params): @@ -122,55 +121,47 @@ class WikimediaExtractor(BaseExtractor): BASE_PATTERN = WikimediaExtractor.update({ - "wikipedia": { - "root": None, - "pattern": r"[a-z]{2,}\.wikipedia\.org", - }, - "wiktionary": { - "root": None, - "pattern": r"[a-z]{2,}\.wiktionary\.org", - }, - "wikiquote": { - "root": None, - "pattern": r"[a-z]{2,}\.wikiquote\.org", - }, - "wikibooks": { + "wikimedia": { "root": None, - "pattern": r"[a-z]{2,}\.wikibooks\.org", - }, - "wikisource": { - "root": None, - "pattern": r"[a-z]{2,}\.wikisource\.org", - }, - "wikinews": { - "root": None, - "pattern": r"[a-z]{2,}\.wikinews\.org", - }, - "wikiversity": { - "root": None, - "pattern": r"[a-z]{2,}\.wikiversity\.org", + "pattern": r"[a-z]{2,}\." 
+ r"wik(?:i(?:pedia|quote|books|source|news|versity|data" + r"|voyage)|tionary)" + r"\.org", + "api-path": "/w/api.php", }, "wikispecies": { "root": "https://species.wikimedia.org", "pattern": r"species\.wikimedia\.org", + "api-path": "/w/api.php", }, "wikimediacommons": { "root": "https://commons.wikimedia.org", "pattern": r"commons\.wikimedia\.org", + "api-path": "/w/api.php", }, "mediawiki": { "root": "https://www.mediawiki.org", "pattern": r"(?:www\.)?mediawiki\.org", + "api-path": "/w/api.php", }, "fandom": { "root": None, "pattern": r"[\w-]+\.fandom\.com", - "api-path": "/api.php", }, "mariowiki": { "root": "https://www.mariowiki.com", "pattern": r"(?:www\.)?mariowiki\.com", }, + "bulbapedia": { + "root": "https://bulbapedia.bulbagarden.net", + "pattern": r"(?:bulbapedia|archives)\.bulbagarden\.net", + "api-path": "/w/api.php", + }, + "pidgiwiki": { + "root": "https://www.pidgi.net", + "pattern": r"(?:www\.)?pidgi\.net", + "api-path": "/wiki/api.php", + }, }) diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py index 1307399..6ee96e6 100644 --- a/gallery_dl/extractor/zerochan.py +++ b/gallery_dl/extractor/zerochan.py @@ -146,9 +146,9 @@ class ZerochanTagExtractor(ZerochanExtractor): yield { "id" : extr('href="/', '"'), "name" : extr('alt="', '"'), - "width" : extr('title="', 'x'), + "width" : extr('title="', '✕'), "height": extr('', ' '), - "size" : extr('', 'B'), + "size" : extr('', 'b'), "file_url": "https://static." + extr( '<a href="https://static.', '"'), } diff --git a/gallery_dl/path.py b/gallery_dl/path.py index 71927a5..1616bbd 100644 --- a/gallery_dl/path.py +++ b/gallery_dl/path.py @@ -231,6 +231,8 @@ class PathFormat(): self.temppath = self.realpath = self.realpath[:-1] elif not self.temppath: self.build_path() + except exception.GalleryDLException: + raise except Exception: self.path = self.directory + "?" self.realpath = self.temppath = self.realdirectory + "?" diff --git a/gallery_dl/version.py b/gallery_dl/version.py index f99beaa..e89ab9c 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.26.7" +__version__ = "1.26.8" diff --git a/test/test_results.py b/test/test_results.py index 680b0f9..bceb271 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -49,6 +49,14 @@ AUTH = { "twitter", } +AUTH_CONFIG = ( + "username", + "cookies", + "api-key", + "client-id", + "refresh-token", +) + class TestExtractorResults(unittest.TestCase): @@ -88,14 +96,16 @@ class TestExtractorResults(unittest.TestCase): key = key.split(".") config.set(key[:-1], key[-1], value) - requires_auth = result.get("#auth") - if requires_auth is None: - requires_auth = (result["#category"][1] in AUTH) - if requires_auth: + auth = result.get("#auth") + if auth is None: + auth = (result["#category"][1] in AUTH) + elif not auth: + for key in AUTH_CONFIG: + config.set((), key, None) + + if auth: extr = result["#class"].from_url(result["#url"]) - if not any(extr.config(key) for key in ( - "username", "cookies", "api-key", "client-id", - "refresh-token")): + if not any(extr.config(key) for key in AUTH_CONFIG): msg = "no auth" self._skipped.append((result["#url"], msg)) self.skipTest(msg) |

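Two of the string derivations in the patch are easiest to verify interactively. The wikimedia hunk now derives the extractor category from the root domain's second-to-last component, and the naverwebtoon hunk rebuilds the episode path from webtoonLevelCode; a sketch, where the level-code values are assumptions for illustration, not confirmed by the diff:

    # wikimedia: category is the second-to-last dot-separated component
    >>> "https://en.wikipedia.org".split(".")[-2]
    'wikipedia'
    >>> "https://fr.wiktionary.org".split(".")[-2]
    'wiktionary'

    # naverwebtoon: lowercase the level code, camel-case one "_c"
    >>> "WEBTOON".lower().replace("_c", "C", 1)         # assumed code
    'webtoon'
    >>> "BEST_CHALLENGE".lower().replace("_c", "C", 1)  # assumed code
    'bestChallenge'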