39 files changed, 867 insertions, 362 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d779ffa..182d685 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,32 +1,41 @@
-## 1.29.4 - 2025-04-13
+## 1.29.5 - 2025-04-26
 ### Extractors
 #### Additions
-- [chevereto] support `imagepond.net` ([#7278](https://github.com/mikf/gallery-dl/issues/7278))
-- [webtoons] add `artist` extractor ([#7274](https://github.com/mikf/gallery-dl/issues/7274))
+- [bluesky] add `video` extractor ([#4438](https://github.com/mikf/gallery-dl/issues/4438))
+- [instagram] add `followers` extractor ([#7374](https://github.com/mikf/gallery-dl/issues/7374))
+- [itaku] add `stars` extractor ([#7411](https://github.com/mikf/gallery-dl/issues/7411))
+- [pictoa] add support ([#6683](https://github.com/mikf/gallery-dl/issues/6683) [#7409](https://github.com/mikf/gallery-dl/issues/7409))
+- [twitter] add `followers` extractor ([#6331](https://github.com/mikf/gallery-dl/issues/6331))
 #### Fixes
-- [deviantart] fix `KeyError: 'has_subfolders'` ([#7272](https://github.com/mikf/gallery-dl/issues/7272) [#7337](https://github.com/mikf/gallery-dl/issues/7337))
-- [discord] fix `parent` keyword inconsistency ([#7341](https://github.com/mikf/gallery-dl/issues/7341) [#7353](https://github.com/mikf/gallery-dl/issues/7353))
-- [E621:pool] fix `AttributeError` ([#7265](https://github.com/mikf/gallery-dl/issues/7265) [#7344](https://github.com/mikf/gallery-dl/issues/7344))
-- [everia] fix/improve image extraction ([#7270](https://github.com/mikf/gallery-dl/issues/7270))
-- [gelbooru] fix video URLs ([#7345](https://github.com/mikf/gallery-dl/issues/7345))
-- [hentai2read] fix `AttributeError` exception for chapters without artist ([#7355](https://github.com/mikf/gallery-dl/issues/7355))
-- [issuu] fix extractors ([#7317](https://github.com/mikf/gallery-dl/issues/7317))
-- [kemonoparty] fix file paths with backslashes ([#7321](https://github.com/mikf/gallery-dl/issues/7321))
-- [readcomiconline] fix `issue` extractor ([#7269](https://github.com/mikf/gallery-dl/issues/7269) [#7330](https://github.com/mikf/gallery-dl/issues/7330))
-- [rule34xyz] update to API v2 ([#7289](https://github.com/mikf/gallery-dl/issues/7289))
-- [zerochan] fix `KeyError: 'author'` ([#7282](https://github.com/mikf/gallery-dl/issues/7282))
+- [architizer] fix `project` extractor ([#7421](https://github.com/mikf/gallery-dl/issues/7421))
+- [bluesky:likes] fix infinite loop ([#7194](https://github.com/mikf/gallery-dl/issues/7194) [#7287](https://github.com/mikf/gallery-dl/issues/7287))
+- [deviantart] fix `401 Unauthorized` errors for multi-image posts ([#6653](https://github.com/mikf/gallery-dl/issues/6653))
+- [everia] fix `title` extraction ([#7379](https://github.com/mikf/gallery-dl/issues/7379))
+- [fanbox] fix `comments` extraction
+- [fapello] stop pagination on empty results ([#7385](https://github.com/mikf/gallery-dl/issues/7385))
+- [kemonoparty] fix `archives` option ([#7416](https://github.com/mikf/gallery-dl/issues/7416) [#7419](https://github.com/mikf/gallery-dl/issues/7419))
+- [pixiv] fix `user_details` requests not being cached ([#7414](https://github.com/mikf/gallery-dl/issues/7414))
+- [pixiv:novel] handle exceptions during `embeds` extraction ([#7422](https://github.com/mikf/gallery-dl/issues/7422))
+- [subscribestar] fix username & password login
+- [wikifeet] support site redesign ([#7286](https://github.com/mikf/gallery-dl/issues/7286) [#7396](https://github.com/mikf/gallery-dl/issues/7396))
 #### Improvements
-- [instagram] use Chrome `User-Agent` by default ([#6379](https://github.com/mikf/gallery-dl/issues/6379))
-- [pixiv] support `phixiv.net` URLs ([#7352](https://github.com/mikf/gallery-dl/issues/7352))
-- [tumblr] support URLs without subdomain ([#7358](https://github.com/mikf/gallery-dl/issues/7358))
-- [webtoons] download JPEG files in higher quality
-- [webtoons] use a default 0.5-1.5s delay between requests ([#7329](https://github.com/mikf/gallery-dl/issues/7329))
-- [zzup] support `w.zzup.com` URLs ([#7327](https://github.com/mikf/gallery-dl/issues/7327))
-### Downloaders
-- [ytdl] fix `KeyError: 'extractor'` exception when `ytdl` reports an error ([#7301](https://github.com/mikf/gallery-dl/issues/7301))
+- [bluesky:likes] use `repo.listRecords` endpoint ([#7194](https://github.com/mikf/gallery-dl/issues/7194) [#7287](https://github.com/mikf/gallery-dl/issues/7287))
+- [gelbooru] don't hardcode image server domains ([#7392](https://github.com/mikf/gallery-dl/issues/7392))
+- [instagram] support `/share/` URLs ([#7241](https://github.com/mikf/gallery-dl/issues/7241))
+- [kemonoparty] use `/posts-legacy` endpoint ([#6780](https://github.com/mikf/gallery-dl/issues/6780) [#6931](https://github.com/mikf/gallery-dl/issues/6931) [#7404](https://github.com/mikf/gallery-dl/issues/7404))
+- [naver] support videos ([#4682](https://github.com/mikf/gallery-dl/issues/4682) [#7395](https://github.com/mikf/gallery-dl/issues/7395))
+- [scrolller] support album posts ([#7339](https://github.com/mikf/gallery-dl/issues/7339))
+- [subscribestar] add warning for missing login cookie
+- [twitter] update API endpoint query hashes ([#7382](https://github.com/mikf/gallery-dl/issues/7382) [#7386](https://github.com/mikf/gallery-dl/issues/7386))
+- [weasyl] use `gallery-dl` User-Agent header ([#7412](https://github.com/mikf/gallery-dl/issues/7412))
+#### Metadata
+- [deviantart:stash] extract more metadata ([#7397](https://github.com/mikf/gallery-dl/issues/7397))
+- [moebooru:pool] replace underscores in pool names ([#4646](https://github.com/mikf/gallery-dl/issues/4646))
+- [naver] fix recent `date` bug ([#4682](https://github.com/mikf/gallery-dl/issues/4682))
 ### Post Processors
-- [metadata] add `metadata-path` option ([#6582](https://github.com/mikf/gallery-dl/issues/6582))
-- [metadata] fix handling of empty directory paths ([#7296](https://github.com/mikf/gallery-dl/issues/7296))
-- [ugoira] preserve `extension` when using `"mode": "archive"` ([#7304](https://github.com/mikf/gallery-dl/issues/7304))
+- [ugoira] restore `keep-files` functionality ([#7304](https://github.com/mikf/gallery-dl/issues/7304))
+- [ugoira] support `"keep-files": true` + custom extension ([#7304](https://github.com/mikf/gallery-dl/issues/7304))
+- [ugoira] use `_ugoira_frame_index` to detect `.zip` files
 ### Miscellaneous
-- [formatter] add `i` and `f` conversions ([#6582](https://github.com/mikf/gallery-dl/issues/6582))
+- [util] auto-update Chrome version
+- use internal version of `re.compile()` for extractor patterns
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gallery_dl
-Version: 1.29.4
+Version: 1.29.5
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -133,9 +133,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.29.4/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.29.5/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86)
   <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.29.4/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.29.5/gallery-dl.bin>`__
 Nightly Builds
@@ -77,9 +77,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.29.4/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.29.5/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86)
   <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.29.4/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.29.5/gallery-dl.bin>`__
 Nightly Builds
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index 7eb34af..7a6c97d 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2025-04-13" "1.29.4" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2025-04-26" "1.29.5" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index dc11605..d329d9c 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2025-04-13" "1.29.4" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2025-04-26" "1.29.5" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
 .\" disable justification (adjust text to left margin only)
@@ -810,13 +810,13 @@ or a \f[I]list\f[] with IP and explicit port number as elements.
 .IP "Default:" 9
 .br
-* \f[I]"gallery-dl/VERSION"\f[]: \f[I][Danbooru]\f[], \f[I]mangadex\f[]
+* \f[I]"gallery-dl/VERSION"\f[]: \f[I][Danbooru]\f[], \f[I]mangadex\f[], \f[I]weasyl\f[]
 .br
 * \f[I]"gallery-dl/VERSION (by mikf)"\f[]: \f[I][E621]\f[]
 .br
 * \f[I]"Patreon/72.2.28 (Android; Android 14; Scale/2.10)"\f[]: \f[I]patreon\f[]
 .br
-* \f[I]"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"\f[]: \f[I]instagram\f[]
+* \f[I]"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/LATEST.0.0.0 Safari/537.36"\f[]: \f[I]instagram\f[]
 .br
 * \f[I]"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:LATEST) Gecko/20100101 Firefox/LATEST"\f[]: otherwise
@@ -1857,11 +1857,39 @@ Possible values are
 \f[I]"posts"\f[],
 \f[I]"replies"\f[],
 \f[I]"media"\f[],
+\f[I]"video"\f[],
 \f[I]"likes"\f[],
 It is possible to use \f[I]"all"\f[] instead of listing all values separately.
+.SS extractor.bluesky.likes.endpoint
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]"listRecords"\f[]
+
+.IP "Description:" 4
+API endpoint to use for retrieving liked posts.
+
+\f[I]"listRecords"\f[]
+Use the results from
+.br
+\f[I]com.atproto.repo.listRecords\f[]
+Requires no login and allows accessing likes of all users,
+.br
+but uses one request to
+\f[I]getPostThread\f[]
+per post.
+\f[I]"getActorLikes"\f[]
+Use the results from
+.br
+\f[I]app.bsky.feed.getActorLikes\f[]
+Requires login and only allows accessing your own likes.
+.br
+
+
 .SS extractor.bluesky.metadata
 .IP "Type:" 6
 .br
@@ -1890,7 +1918,7 @@
 Extract additional metadata.
 (See \f[I]app.bsky.actor.getProfile\f[]).
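A minimal sketch of the `listRecords` strategy documented above, outside gallery-dl. Plain `requests`, the public AppView at `public.api.bsky.app`, and `bsky.social` as the repo's PDS are assumptions here; gallery-dl resolves the account's actual service endpoint before calling `listRecords` (see the `bluesky.py` hunks below).

```python
import requests

def liked_posts(did, pds="https://bsky.social"):
    """Yield the post behind every like record in a repo."""
    appview = "https://public.api.bsky.app"
    params = {"repo": did, "collection": "app.bsky.feed.like", "limit": 100}
    while True:
        data = requests.get(
            pds + "/xrpc/com.atproto.repo.listRecords", params=params).json()
        for record in data["records"]:
            uri = record["value"]["subject"]["uri"]
            if "/app.bsky.feed.post/" not in uri:
                continue  # likes may also point at feed generators etc.
            # the trade-off noted above: one getPostThread request per like
            response = requests.get(
                appview + "/xrpc/app.bsky.feed.getPostThread",
                params={"uri": uri, "depth": 0, "parentHeight": 0})
            if response.status_code != 200:
                continue  # deleted post; gallery-dl skips these as well
            yield response.json()["thread"]["post"]
        cursor = data.get("cursor")
        if not cursor:
            return
        params["cursor"] = cursor
```

The `depth` parameter is the same knob the new `extractor.bluesky.likes.depth` option controls.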
-.SS extractor.bluesky.post.depth +.SS extractor.bluesky.likes.depth .IP "Type:" 6 \f[I]integer\f[] @@ -4007,6 +4035,17 @@ Extract extended \f[I]pool\f[] metadata. Note: Not supported by all \f[I]moebooru\f[] instances. +.SS extractor.naver.videos +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +Download videos. + + .SS extractor.newgrounds.flash .IP "Type:" 6 \f[I]bool\f[] diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index b85a3e7..b8b46d6 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -153,6 +153,10 @@ "reposts" : false, "videos" : true, + "likes": { + "depth" : 0, + "endpoint": "listRecords" + }, "post": { "depth": 0 } @@ -416,6 +420,10 @@ "username": "", "password": "" }, + "naver": + { + "videos": true + }, "newgrounds": { "username": "", diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO index 3d113ec..7d3e9ca 100644 --- a/gallery_dl.egg-info/PKG-INFO +++ b/gallery_dl.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.4 Name: gallery_dl -Version: 1.29.4 +Version: 1.29.5 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Download-URL: https://github.com/mikf/gallery-dl/releases/latest @@ -133,9 +133,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.29.4/gallery-dl.exe>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.29.5/gallery-dl.exe>`__ (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__) -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.29.4/gallery-dl.bin>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.29.5/gallery-dl.bin>`__ Nightly Builds diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt index 2f4a87c..a6afdd7 100644 --- a/gallery_dl.egg-info/SOURCES.txt +++ b/gallery_dl.egg-info/SOURCES.txt @@ -177,6 +177,7 @@ gallery_dl/extractor/pexels.py gallery_dl/extractor/philomena.py gallery_dl/extractor/photovogue.py gallery_dl/extractor/picarto.py +gallery_dl/extractor/pictoa.py gallery_dl/extractor/piczel.py gallery_dl/extractor/pillowfort.py gallery_dl/extractor/pinterest.py diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 87c3798..9a7ca53 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -7,7 +7,7 @@ # published by the Free Software Foundation. 
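For reference, the new options from the `docs/gallery-dl.conf` hunk above can also be set programmatically; this sketch assumes `gallery_dl.config.set()`'s `(path, key, value)` signature.

```python
from gallery_dl import config

# switch bluesky likes back to the login-based endpoint
config.set(("extractor", "bluesky", "likes"), "endpoint", "getActorLikes")
config.set(("extractor", "bluesky", "likes"), "depth", 1)
# opt out of the new naver video downloads
config.set(("extractor", "naver"), "videos", False)
```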
import sys -import re +from ..util import re_compile modules = [ "2ch", @@ -130,6 +130,7 @@ modules = [ "philomena", "photovogue", "picarto", + "pictoa", "piczel", "pillowfort", "pinterest", @@ -234,7 +235,8 @@ def find(url): def add(cls): """Add 'cls' to the list of available extractors""" - cls.pattern = re.compile(cls.pattern) + if isinstance(cls.pattern, str): + cls.pattern = re_compile(cls.pattern) _cache.append(cls) return cls @@ -242,9 +244,11 @@ def add(cls): def add_module(module): """Add all extractors in 'module' to the list of available extractors""" classes = _get_classes(module) - for cls in classes: - cls.pattern = re.compile(cls.pattern) - _cache.extend(classes) + if classes: + if isinstance(classes[0].pattern, str): + for cls in classes: + cls.pattern = re_compile(cls.pattern) + _cache.extend(classes) return classes diff --git a/gallery_dl/extractor/architizer.py b/gallery_dl/extractor/architizer.py index 0268224..911753b 100644 --- a/gallery_dl/extractor/architizer.py +++ b/gallery_dl/extractor/architizer.py @@ -54,7 +54,7 @@ class ArchitizerProjectExtractor(GalleryExtractor): return [ (url, None) for url in text.extract_iter( - page, "property='og:image:secure_url' content='", "?") + page, 'property="og:image:secure_url" content="', "?") ] diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py index f8fef93..ec274b8 100644 --- a/gallery_dl/extractor/bluesky.py +++ b/gallery_dl/extractor/bluesky.py @@ -25,10 +25,6 @@ class BlueskyExtractor(Extractor): archive_fmt = "{filename}" root = "https://bsky.app" - def __init__(self, match): - Extractor.__init__(self, match) - self.user = match.group(1) - def _init(self): meta = self.config("metadata") or () if meta: @@ -87,6 +83,22 @@ class BlueskyExtractor(Extractor): def posts(self): return () + def _posts_records(self, actor, collection): + depth = self.config("depth", "0") + + for record in self.api.list_records(actor, collection): + uri = None + try: + uri = record["value"]["subject"]["uri"] + if "/app.bsky.feed.post/" in uri: + yield from self.api.get_post_thread_uri(uri, depth) + except exception.StopExtraction: + pass # deleted post + except Exception as exc: + self.log.debug(record, exc_info=exc) + self.log.warning("Failed to extract %s (%s: %s)", + uri or "record", exc.__class__.__name__, exc) + def _pid(self, post): return post["uri"].rpartition("/")[2] @@ -203,7 +215,7 @@ class BlueskyUserExtractor(BlueskyExtractor): pass def items(self): - base = "{}/profile/{}/".format(self.root, self.user) + base = "{}/profile/{}/".format(self.root, self.groups[0]) default = ("posts" if self.config("quoted", False) or self.config("reposts", False) else "media") return self._dispatch_extractors(( @@ -213,6 +225,7 @@ class BlueskyUserExtractor(BlueskyExtractor): (BlueskyPostsExtractor , base + "posts"), (BlueskyRepliesExtractor , base + "replies"), (BlueskyMediaExtractor , base + "media"), + (BlueskyVideoExtractor , base + "video"), (BlueskyLikesExtractor , base + "likes"), ), (default,)) @@ -223,7 +236,8 @@ class BlueskyPostsExtractor(BlueskyExtractor): example = "https://bsky.app/profile/HANDLE/posts" def posts(self): - return self.api.get_author_feed(self.user, "posts_and_author_threads") + return self.api.get_author_feed( + self.groups[0], "posts_and_author_threads") class BlueskyRepliesExtractor(BlueskyExtractor): @@ -232,7 +246,8 @@ class BlueskyRepliesExtractor(BlueskyExtractor): example = "https://bsky.app/profile/HANDLE/replies" def posts(self): - return self.api.get_author_feed(self.user, 
"posts_with_replies") + return self.api.get_author_feed( + self.groups[0], "posts_with_replies") class BlueskyMediaExtractor(BlueskyExtractor): @@ -241,7 +256,18 @@ class BlueskyMediaExtractor(BlueskyExtractor): example = "https://bsky.app/profile/HANDLE/media" def posts(self): - return self.api.get_author_feed(self.user, "posts_with_media") + return self.api.get_author_feed( + self.groups[0], "posts_with_media") + + +class BlueskyVideoExtractor(BlueskyExtractor): + subcategory = "video" + pattern = USER_PATTERN + r"/video" + example = "https://bsky.app/profile/HANDLE/video" + + def posts(self): + return self.api.get_author_feed( + self.groups[0], "posts_with_video") class BlueskyLikesExtractor(BlueskyExtractor): @@ -250,7 +276,9 @@ class BlueskyLikesExtractor(BlueskyExtractor): example = "https://bsky.app/profile/HANDLE/likes" def posts(self): - return self.api.get_actor_likes(self.user) + if self.config("endpoint") == "getActorLikes": + return self.api.get_actor_likes(self.groups[0]) + return self._posts_records(self.groups[0], "app.bsky.feed.like") class BlueskyFeedExtractor(BlueskyExtractor): @@ -258,12 +286,9 @@ class BlueskyFeedExtractor(BlueskyExtractor): pattern = USER_PATTERN + r"/feed/([^/?#]+)" example = "https://bsky.app/profile/HANDLE/feed/NAME" - def __init__(self, match): - BlueskyExtractor.__init__(self, match) - self.feed = match.group(2) - def posts(self): - return self.api.get_feed(self.user, self.feed) + actor, feed = self.groups + return self.api.get_feed(actor, feed) class BlueskyListExtractor(BlueskyExtractor): @@ -271,12 +296,9 @@ class BlueskyListExtractor(BlueskyExtractor): pattern = USER_PATTERN + r"/lists/([^/?#]+)" example = "https://bsky.app/profile/HANDLE/lists/ID" - def __init__(self, match): - BlueskyExtractor.__init__(self, match) - self.list = match.group(2) - def posts(self): - return self.api.get_list_feed(self.user, self.list) + actor, list_id = self.groups + return self.api.get_list_feed(actor, list_id) class BlueskyFollowingExtractor(BlueskyExtractor): @@ -285,7 +307,7 @@ class BlueskyFollowingExtractor(BlueskyExtractor): example = "https://bsky.app/profile/HANDLE/follows" def items(self): - for user in self.api.get_follows(self.user): + for user in self.api.get_follows(self.groups[0]): url = "https://bsky.app/profile/" + user["did"] user["_extractor"] = BlueskyUserExtractor yield Message.Queue, url, user @@ -296,12 +318,9 @@ class BlueskyPostExtractor(BlueskyExtractor): pattern = USER_PATTERN + r"/post/([^/?#]+)" example = "https://bsky.app/profile/HANDLE/post/ID" - def __init__(self, match): - BlueskyExtractor.__init__(self, match) - self.post_id = match.group(2) - def posts(self): - return self.api.get_post_thread(self.user, self.post_id) + actor, post_id = self.groups + return self.api.get_post_thread(actor, post_id) class BlueskyInfoExtractor(BlueskyExtractor): @@ -311,7 +330,7 @@ class BlueskyInfoExtractor(BlueskyExtractor): def items(self): self._metadata_user = True - self.api._did_from_actor(self.user) + self.api._did_from_actor(self.groups[0]) return iter(((Message.Directory, self._user),)) @@ -322,7 +341,7 @@ class BlueskyAvatarExtractor(BlueskyExtractor): example = "https://bsky.app/profile/HANDLE/avatar" def posts(self): - return self._make_post(self.user, "avatar") + return self._make_post(self.groups[0], "avatar") class BlueskyBackgroundExtractor(BlueskyExtractor): @@ -332,7 +351,7 @@ class BlueskyBackgroundExtractor(BlueskyExtractor): example = "https://bsky.app/profile/HANDLE/banner" def posts(self): - return 
self._make_post(self.user, "banner") + return self._make_post(self.groups[0], "banner") class BlueskySearchExtractor(BlueskyExtractor): @@ -341,7 +360,7 @@ class BlueskySearchExtractor(BlueskyExtractor): example = "https://bsky.app/search?q=QUERY" def posts(self): - query = text.unquote(self.user.replace("+", " ")) + query = text.unquote(self.groups[0].replace("+", " ")) return self.api.search_posts(query) @@ -351,13 +370,14 @@ class BlueskyHashtagExtractor(BlueskyExtractor): example = "https://bsky.app/hashtag/NAME" def posts(self): - return self.api.search_posts("#"+self.user, self.groups[1]) + hashtag, order = self.groups + return self.api.search_posts("#"+hashtag, order) class BlueskyAPI(): """Interface for the Bluesky API - https://www.docs.bsky.app/docs/category/http-reference + https://docs.bsky.app/docs/category/http-reference """ def __init__(self, extractor): @@ -378,7 +398,7 @@ class BlueskyAPI(): "actor": self._did_from_actor(actor), "limit": "100", } - return self._pagination(endpoint, params) + return self._pagination(endpoint, params, check_empty=True) def get_author_feed(self, actor, filter="posts_and_author_threads"): endpoint = "app.bsky.feed.getAuthorFeed" @@ -416,11 +436,16 @@ class BlueskyAPI(): return self._pagination(endpoint, params) def get_post_thread(self, actor, post_id): + uri = "at://{}/app.bsky.feed.post/{}".format( + self._did_from_actor(actor), post_id) + depth = self.extractor.config("depth", "0") + return self.get_post_thread_uri(uri, depth) + + def get_post_thread_uri(self, uri, depth="0"): endpoint = "app.bsky.feed.getPostThread" params = { - "uri": "at://{}/app.bsky.feed.post/{}".format( - self._did_from_actor(actor), post_id), - "depth" : self.extractor.config("depth", "0"), + "uri" : uri, + "depth" : depth, "parentHeight": "0", } @@ -443,6 +468,18 @@ class BlueskyAPI(): params = {"actor": did} return self._call(endpoint, params) + def list_records(self, actor, collection): + endpoint = "com.atproto.repo.listRecords" + actor_did = self._did_from_actor(actor) + params = { + "repo" : actor_did, + "collection": collection, + "limit" : "100", + # "reverse" : "false", + } + return self._pagination(endpoint, params, "records", + self.service_endpoint(actor_did)) + @memcache(keyarg=1) def resolve_handle(self, handle): endpoint = "com.atproto.identity.resolveHandle" @@ -523,8 +560,10 @@ class BlueskyAPI(): _refresh_token_cache.update(self.username, data["refreshJwt"]) return "Bearer " + data["accessJwt"] - def _call(self, endpoint, params): - url = "{}/xrpc/{}".format(self.root, endpoint) + def _call(self, endpoint, params, root=None): + if root is None: + root = self.root + url = "{}/xrpc/{}".format(root, endpoint) while True: self.authenticate() @@ -549,9 +588,13 @@ class BlueskyAPI(): self.extractor.log.debug("Server response: %s", response.text) raise exception.StopExtraction(msg) - def _pagination(self, endpoint, params, key="feed"): + def _pagination(self, endpoint, params, + key="feed", root=None, check_empty=False): while True: - data = self._call(endpoint, params) + data = self._call(endpoint, params, root) + + if check_empty and not data[key]: + return yield from data[key] cursor = data.get("cursor") diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 995505f..c430ec1 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -59,7 +59,7 @@ class Extractor(): @classmethod def from_url(cls, url): if isinstance(cls.pattern, str): - cls.pattern = re.compile(cls.pattern) + cls.pattern = 
util.re_compile(cls.pattern) match = cls.pattern.match(url) return cls(match) if match else None @@ -240,6 +240,11 @@ class Extractor(): raise exception.HttpError(msg, response) + def request_location(self, url, **kwargs): + kwargs.setdefault("method", "HEAD") + kwargs.setdefault("allow_redirects", False) + return self.request(url, **kwargs).headers.get("location", "") + _handle_429 = util.false def wait(self, seconds=None, until=None, adjust=1.0, diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 378c7ec..ae475e2 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -867,6 +867,9 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\ )["deviation"]["extended"]["deviationUuid"] yield self.api.deviation(deviation_uuid) + def _unescape_json(self, json): + return json.replace('\\"', '"').replace("\\\\", "\\") + class DeviantartUserExtractor(DeviantartExtractor): """Extractor for an artist's user profile""" @@ -1046,7 +1049,7 @@ class DeviantartStashExtractor(DeviantartExtractor): DeviantartExtractor.__init__(self, match) self.user = None - def deviations(self, stash_id=None): + def deviations(self, stash_id=None, stash_data=None): if stash_id is None: legacy_url, stash_id = self.groups else: @@ -1068,14 +1071,33 @@ class DeviantartStashExtractor(DeviantartExtractor): deviation["_page"] = page deviation["index"] = text.parse_int(text.extr( page, '\\"deviationId\\":', ',')) + + deviation["stash_id"] = stash_id + if stash_data: + folder = stash_data["folder"] + deviation["stash_name"] = folder["name"] + deviation["stash_folder"] = folder["folderId"] + deviation["stash_parent"] = folder["parentId"] or 0 + deviation["stash_description"] = \ + folder["richDescription"]["excerpt"] + else: + deviation["stash_name"] = "" + deviation["stash_description"] = "" + deviation["stash_folder"] = 0 + deviation["stash_parent"] = 0 + yield deviation return + stash_data = text.extr(page, ',\\"stash\\":', ',\\"@@') + if stash_data: + stash_data = util.json_loads(self._unescape_json(stash_data)) + for sid in text.extract_iter( page, 'href="https://www.deviantart.com/stash/', '"'): if sid == stash_id or sid.endswith("#comments"): continue - yield from self.deviations(sid) + yield from self.deviations(sid, stash_data) class DeviantartFavoriteExtractor(DeviantartExtractor): @@ -1276,28 +1298,26 @@ class DeviantartDeviationExtractor(DeviantartExtractor): deviation = self.api.deviation(uuid) deviation["_page"] = page + deviation["index_file"] = 0 + deviation["num"] = deviation["count"] = 1 - _dev_info = text.extr( - page, '\\"deviationExtended\\":', ',\\"deviation\\":', None) - # Clean up escaped quotes - _json_str = re.sub( - r'(?<!\\)\\{1}"', '"', _dev_info).replace("\\'", "'") - _extended_info = util.json_loads(_json_str)[self.deviation_id] - additional_media = _extended_info.get("additionalMedia") or () + additional_media = text.extr(page, ',\\"additionalMedia\\":', '}],\\"') + if not additional_media: + yield deviation + return - if additional_media: - self.filename_fmt = ("{category}_{index}_{index_file}_{title}_" - "{num:>02}.{extension}") - self.archive_fmt = ("g_{_username}_{index}{index_file:?_//}." - "{extension}") + self.filename_fmt = ("{category}_{index}_{index_file}_{title}_" + "{num:>02}.{extension}") + self.archive_fmt = ("g_{_username}_{index}{index_file:?_//}." 
+ "{extension}") - deviation["index_file"] = 0 + additional_media = util.json_loads(self._unescape_json( + additional_media) + "}]") deviation["count"] = 1 + len(additional_media) - deviation["num"] = 1 yield deviation for index, post in enumerate(additional_media): - uri = post["media"]["baseUri"].encode().decode("unicode-escape") + uri = self._eclipse_media(post["media"], "fullview")[0] deviation["content"]["src"] = uri deviation["num"] += 1 deviation["index_file"] = post["fileId"] diff --git a/gallery_dl/extractor/everia.py b/gallery_dl/extractor/everia.py index e41f6f6..3bf0a74 100644 --- a/gallery_dl/extractor/everia.py +++ b/gallery_dl/extractor/everia.py @@ -57,7 +57,7 @@ class EveriaPostExtractor(EveriaExtractor): data = { "title": text.unescape( - text.extr(page, 'itemprop="headline">', "</h1>")), + text.extr(page, 'itemprop="headline">', "</h")), "tags": list(text.extract_iter(page, 'rel="tag">', "</a>")), "post_url": url, "post_category": text.extr( diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py index 9bbfb43..3b43134 100644 --- a/gallery_dl/extractor/fanbox.py +++ b/gallery_dl/extractor/fanbox.py @@ -173,15 +173,16 @@ class FanboxExtractor(Extractor): return plans def _get_comment_data(self, post_id): - url = ("https://api.fanbox.cc/post.listComments" + url = ("https://api.fanbox.cc/post.getComments" "?limit=10&postId=" + post_id) comments = [] while url: url = text.ensure_http_scheme(url) body = self.request(url, headers=self.headers).json()["body"] - comments.extend(body["items"]) - url = body["nextUrl"] + data = body["commentList"] + comments.extend(data["items"]) + url = data["nextUrl"] return comments def _get_urls_from_post(self, content_body, post): @@ -296,8 +297,7 @@ class FanboxExtractor(Extractor): url = "https://www.pixiv.net/fanbox/"+content_id # resolve redirect try: - url = self.request(url, method="HEAD", - allow_redirects=False).headers["location"] + url = self.request_location(url) except Exception as exc: url = None self.log.warning("Unable to extract fanbox embed %s (%s: %s)", @@ -392,13 +392,7 @@ class FanboxRedirectExtractor(Extractor): pattern = r"(?:https?://)?(?:www\.)?pixiv\.net/fanbox/creator/(\d+)" example = "https://www.pixiv.net/fanbox/creator/12345" - def __init__(self, match): - Extractor.__init__(self, match) - self.user_id = match.group(1) - def items(self): - url = "https://www.pixiv.net/fanbox/creator/" + self.user_id - data = {"_extractor": FanboxCreatorExtractor} - response = self.request( - url, method="HEAD", allow_redirects=False, notfound="user") - yield Message.Queue, response.headers["Location"], data + url = "https://www.pixiv.net/fanbox/creator/" + self.groups[0] + location = self.request_location(url, notfound="user") + yield Message.Queue, location, {"_extractor": FanboxCreatorExtractor} diff --git a/gallery_dl/extractor/fapello.py b/gallery_dl/extractor/fapello.py index 838ae7b..cf18edc 100644 --- a/gallery_dl/extractor/fapello.py +++ b/gallery_dl/extractor/fapello.py @@ -72,10 +72,13 @@ class FapelloModelExtractor(Extractor): if not page: return + url = None for url in text.extract_iter(page, '<a href="', '"'): if url == "javascript:void(0);": continue yield Message.Queue, url, data + if url is None: + return num += 1 diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index eb07739..f24b696 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -107,22 +107,16 @@ class GelbooruBase(): return params["pid"] += self.per_page - @staticmethod 
- def _file_url(post): + def _file_url(self, post): url = post["file_url"] if url.endswith((".webm", ".mp4")): + post["_fallback"] = (url,) md5 = post["md5"] + root = text.root_from_url(post["preview_url"]) path = "/images/{}/{}/{}.webm".format(md5[0:2], md5[2:4], md5) - post["_fallback"] = GelbooruBase._video_fallback(path) - url = "https://img4.gelbooru.com" + path + url = root + path return url - @staticmethod - def _video_fallback(path): - yield "https://img3.gelbooru.com" + path - yield "https://img2.gelbooru.com" + path - yield "https://img1.gelbooru.com" + path - def _notes(self, post, page): notes_data = text.extr(page, '<section id="notes"', '</section>') if not notes_data: diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 432a7ad..0f88cac 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -606,6 +606,20 @@ class InstagramHighlightsExtractor(InstagramExtractor): return self.api.highlights_media(uid) +class InstagramFollowersExtractor(InstagramExtractor): + """Extractor for an Instagram user's followers""" + subcategory = "followers" + pattern = USER_PATTERN + r"/followers" + example = "https://www.instagram.com/USER/followers/" + + def items(self): + uid = self.api.user_id(self.item) + for user in self.api.user_followers(uid): + user["_extractor"] = InstagramUserExtractor + url = "{}/{}".format(self.root, user["username"]) + yield Message.Queue, url, user + + class InstagramFollowingExtractor(InstagramExtractor): """Extractor for an Instagram user's followed users""" subcategory = "following" @@ -693,11 +707,21 @@ class InstagramPostExtractor(InstagramExtractor): """Extractor for an Instagram post""" subcategory = "post" pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" - r"/(?:[^/?#]+/)?(?:p|tv|reel)/([^/?#]+)") + r"/(?:share/()|[^/?#]+/)?(?:p|tv|reel)/([^/?#]+)") example = "https://www.instagram.com/p/abcdefg/" def posts(self): - return self.api.media(self.item) + share, shortcode = self.groups + if share is not None: + url = text.ensure_http_scheme(self.url) + headers = { + "Sec-Fetch-Dest": "empty", + "Sec-Fetch-Mode": "navigate", + "Sec-Fetch-Site": "same-origin", + } + location = self.request_location(url, headers=headers) + shortcode = location.split("/")[-2] + return self.api.media(shortcode) class InstagramRestAPI(): @@ -816,6 +840,11 @@ class InstagramRestAPI(): params = {"count": 30} return self._pagination(endpoint, params) + def user_followers(self, user_id): + endpoint = "/v1/friendships/{}/followers/".format(user_id) + params = {"count": 12} + return self._pagination_following(endpoint, params) + def user_following(self, user_id): endpoint = "/v1/friendships/{}/following/".format(user_id) params = {"count": 12} @@ -908,9 +937,10 @@ class InstagramRestAPI(): for item in data["items"]: yield from item["media_items"] - if "next_max_id" not in data: + next_max_id = data.get("next_max_id") + if not next_max_id: return extr._update_cursor(None) - params["max_id"] = extr._update_cursor(data["next_max_id"]) + params["max_id"] = extr._update_cursor(next_max_id) def _pagination_following(self, endpoint, params): extr = self.extractor @@ -921,10 +951,10 @@ class InstagramRestAPI(): yield from data["users"] - if len(data["users"]) < params["count"]: + next_max_id = data.get("next_max_id") + if not next_max_id: return extr._update_cursor(None) - params["max_id"] = extr._update_cursor( - params["max_id"] + params["count"]) + params["max_id"] = extr._update_cursor(next_max_id) class 
InstagramGraphqlAPI(): diff --git a/gallery_dl/extractor/itaku.py b/gallery_dl/extractor/itaku.py index 2974b59..e602665 100644 --- a/gallery_dl/extractor/itaku.py +++ b/gallery_dl/extractor/itaku.py @@ -65,6 +65,15 @@ class ItakuGalleryExtractor(ItakuExtractor): return self.api.galleries_images(*self.groups) +class ItakuStarsExtractor(ItakuExtractor): + subcategory = "stars" + pattern = BASE_PATTERN + r"/profile/([^/?#]+)/stars(?:/(\d+))?" + example = "https://itaku.ee/profile/USER/stars" + + def posts(self): + return self.api.galleries_images_starred(*self.groups) + + class ItakuImageExtractor(ItakuExtractor): subcategory = "image" pattern = BASE_PATTERN + r"/images/(\d+)" @@ -139,6 +148,21 @@ class ItakuAPI(): } return self._pagination(endpoint, params, self.image) + def galleries_images_starred(self, username, section=None): + endpoint = "/galleries/images/user_starred_imgs/" + params = { + "cursor" : None, + "stars_of" : self.user(username)["owner"], + "sections" : section, + "date_range": "", + "ordering" : "-date_added", + "maturity_rating": ("SFW", "Questionable", "NSFW"), + "page" : "1", + "page_size" : "30", + "visibility": ("PUBLIC", "PROFILE_ONLY"), + } + return self._pagination(endpoint, params, self.image) + def image(self, image_id): endpoint = "/galleries/images/{}/".format(image_id) return self._call(endpoint) diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index de7d040..79070ee 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -153,7 +153,7 @@ class KemonopartyExtractor(Extractor): file["type"] = "archive" if archives: try: - data = self.api.posts_archives(file["hash"]) + data = self.api.file(file["hash"]) data.update(file) post_archives.append(data) except Exception as exc: @@ -319,12 +319,9 @@ class KemonopartyUserExtractor(KemonopartyExtractor): def posts(self): _, _, service, creator_id, query = self.groups params = text.parse_query(query) - if params.get("tag"): - return self.api.creator_tagged_posts( - service, creator_id, params.get("tag"), params.get("o")) - else: - return self.api.creator_posts( - service, creator_id, params.get("o"), params.get("q")) + return self.api.creator_posts_legacy( + service, creator_id, + params.get("o"), params.get("q"), params.get("tag")) class KemonopartyPostsExtractor(KemonopartyExtractor): @@ -524,18 +521,19 @@ class KemonoAPI(): params = {"q": query, "o": offset, "tag": tags} return self._pagination(endpoint, params, 50, "posts") - def posts_archives(self, file_hash): - endpoint = "/posts/archives/" + file_hash - return self._call(endpoint)["archive"] + def file(self, file_hash): + endpoint = "/file/" + file_hash + return self._call(endpoint) def creator_posts(self, service, creator_id, offset=0, query=None): endpoint = "/{}/user/{}".format(service, creator_id) params = {"q": query, "o": offset} return self._pagination(endpoint, params, 50) - def creator_tagged_posts(self, service, creator_id, tags, offset=0): + def creator_posts_legacy(self, service, creator_id, + offset=0, query=None, tags=None): endpoint = "/{}/user/{}/posts-legacy".format(service, creator_id) - params = {"o": offset, "tag": tags} + params = {"o": offset, "tag": tags, "q": query} return self._pagination(endpoint, params, 50, "results") def creator_announcements(self, service, creator_id): diff --git a/gallery_dl/extractor/moebooru.py b/gallery_dl/extractor/moebooru.py index e97d273..9fd66e2 100644 --- a/gallery_dl/extractor/moebooru.py +++ b/gallery_dl/extractor/moebooru.py @@ 
-127,6 +127,7 @@ class MoebooruPoolExtractor(MoebooruExtractor): if self.config("metadata"): url = "{}/pool/show/{}.json".format(self.root, self.pool_id) pool = self.request(url).json() + pool["name"] = pool["name"].replace("_", " ") pool.pop("posts", None) return {"pool": pool} return {"pool": text.parse_int(self.pool_id)} diff --git a/gallery_dl/extractor/naver.py b/gallery_dl/extractor/naver.py index d3150e6..2287325 100644 --- a/gallery_dl/extractor/naver.py +++ b/gallery_dl/extractor/naver.py @@ -9,7 +9,9 @@ """Extractors for https://blog.naver.com/""" from .common import GalleryExtractor, Extractor, Message -from .. import text +from .. import text, util +import datetime +import time class NaverBase(): @@ -59,19 +61,66 @@ class NaverPostExtractor(NaverBase, GalleryExtractor): "user" : extr("var nickName = '", "'"), }, } - data["post"]["date"] = text.parse_datetime( + + data["post"]["date"] = self._parse_datetime( extr('se_publishDate pcol2">', '<') or - extr('_postAddDate">', '<'), "%Y. %m. %d. %H:%M") + extr('_postAddDate">', '<')) + return data + def _parse_datetime(self, date_string): + if "전" in date_string: + ts = time.gmtime() + return datetime.datetime(ts.tm_year, ts.tm_mon, ts.tm_mday) + return text.parse_datetime(date_string, "%Y. %m. %d. %H:%M") + def images(self, page): - results = [] + files = [] + self._extract_images(files, page) + if self.config("videos", True): + self._extract_videos(files, page) + return files + + def _extract_images(self, files, page): for url in text.extract_iter(page, 'data-lazy-src="', '"'): url = url.replace("://post", "://blog", 1).partition("?")[0] if "\ufffd" in text.unquote(url): url = text.unquote(url, encoding="EUC-KR") - results.append((url, None)) - return results + files.append((url, None)) + + def _extract_videos(self, files, page): + for module in text.extract_iter(page, " data-module='", "'></"): + if '"v2_video"' not in module: + continue + media = util.json_loads(module)["data"] + try: + self._extract_media(files, media) + except Exception as exc: + self.log.warning("%s: Failed to extract video '%s' (%s: %s)", + self.post_id, media.get("vid"), + exc.__class__.__name__, exc) + + def _extract_media(self, files, media): + url = ("https://apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/" + + media["vid"]) + params = { + "key" : media["inkey"], + "sid" : "2", + # "pid": "00000000-0000-0000-0000-000000000000", + "nonce": int(time.time()), + "devt" : "html5_pc", + "prv" : "N", + "aup" : "N", + "stpb" : "N", + "cpl" : "ko_KR", + "providerEnv": "real", + "adt" : "glad", + "lc" : "ko_KR", + } + data = self.request(url, params=params).json() + video = max(data["videos"]["list"], + key=lambda v: v.get("size") or 0) + files.append((video["source"], video)) class NaverBlogExtractor(NaverBase, Extractor): diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index b8c6acb..2b6742e 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -109,11 +109,7 @@ class PatreonExtractor(Extractor): def _attachments(self, post): for attachment in post.get("attachments") or (): - url = self.request( - attachment["url"], method="HEAD", - allow_redirects=False, fatal=False, - ).headers.get("Location") - + url = self.request_location(attachment["url"], fatal=False) if url: yield "attachment", url, attachment["name"] diff --git a/gallery_dl/extractor/pictoa.py b/gallery_dl/extractor/pictoa.py new file mode 100644 index 0000000..a8008cf --- /dev/null +++ b/gallery_dl/extractor/pictoa.py @@ -0,0 +1,78 @@ +# -*- 
coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://pictoa.com/""" + +from .common import Extractor, Message +from .. import text + +BASE_PATTERN = r"(?:https?://)?(?:[\w]+\.)?pictoa\.com(?:\.de)?" + + +class PictoaExtractor(Extractor): + """Base class for pictoa extractors""" + category = "pictoa" + root = "https://pictoa.com" + directory_fmt = ("{category}", "{album_id} {album_title}") + filename_fmt = "{id}.{extension}" + archive_fmt = "{id}" + + +class PictoaImageExtractor(PictoaExtractor): + """Extractor for single images from pictoa.com""" + subcategory = "image" + pattern = BASE_PATTERN + r"/albums/(?:[\w-]+-)?(\d+)/(\d+)" + example = "https://www.pictoa.com/albums/NAME-12345/12345.html" + + def items(self): + album_id, image_id = self.groups + + url = "{}/albums/{}/{}.html".format(self.root, album_id, image_id) + page = self.request(url).text + album_title = text.extr(page, 'property="og:title" content="', '"') + image_url = text.extr(page, 'property="og:image" content="', '"') + + data = { + "album_id" : album_id, + "album_title": album_title.rpartition(" #")[0], + "id" : image_id, + "url" : image_url, + } + + text.nameext_from_url(image_url, data) + yield Message.Directory, data + yield Message.Url, image_url, data + + +class PictoaAlbumExtractor(PictoaExtractor): + """Extractor for image albums from pictoa.com""" + subcategory = "album" + pattern = BASE_PATTERN + r"/albums/(?:[\w-]+-)?(\d+).html" + example = "https://www.pictoa.com/albums/NAME-12345.html" + + def items(self): + album_id = self.groups[0] + url = "{}/albums/{}.html".format(self.root, album_id) + page = self.request(url).text + + album_data = { + "album_id" : album_id, + "album_title": text.extr(page, "<h1>", "<"), + "tags" : text.split_html(text.extr( + page, '<ol class="related-categories', '</ol>'))[1:], + "_extractor" : PictoaImageExtractor, + } + + while True: + container = text.extr(page, '<main>', '<span id="flag" >') + for url in text.extract_iter( + container, '<a rel="nofollow" href="', '"'): + yield Message.Queue, url, album_data + + url = text.extr(page, '<link rel="next" href="', '"') + if not url: + break + page = self.request(url).text diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index 1a299c1..ad8c681 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -380,15 +380,10 @@ class PinterestPinitExtractor(PinterestExtractor): pattern = r"(?:https?://)?pin\.it/([^/?#]+)" example = "https://pin.it/abcde" - def __init__(self, match): - PinterestExtractor.__init__(self, match) - self.shortened_id = match.group(1) - def items(self): url = "https://api.pinterest.com/url_shortener/{}/redirect/".format( - self.shortened_id) - response = self.request(url, method="HEAD", allow_redirects=False) - location = response.headers.get("Location") + self.groups[0]) + location = self.request_location(url) if not location or not PinterestPinExtractor.pattern.match(location): raise exception.NotFoundError("pin") yield Message.Queue, location, {"_extractor": PinterestPinExtractor} diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index e8050b3..dfed1aa 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -69,7 +69,7 @@ class PixivExtractor(Extractor): files = self._extract_files(work) if self.meta_user: - 
work.update(self.api.user_detail(work["user"]["id"])) + work.update(self.api.user_detail(str(work["user"]["id"]))) if self.meta_comments: if work["total_comments"] and not work.get("_ajax"): try: @@ -516,16 +516,10 @@ class PixivMeExtractor(PixivExtractor): pattern = r"(?:https?://)?pixiv\.me/([^/?#]+)" example = "https://pixiv.me/USER" - def __init__(self, match): - PixivExtractor.__init__(self, match) - self.account = match.group(1) - def items(self): - url = "https://pixiv.me/" + self.account - data = {"_extractor": PixivUserExtractor} - response = self.request( - url, method="HEAD", allow_redirects=False, notfound="user") - yield Message.Queue, response.headers["Location"], data + url = "https://pixiv.me/" + self.groups[0] + location = self.request_location(url, notfound="user") + yield Message.Queue, location, {"_extractor": PixivUserExtractor} class PixivWorkExtractor(PixivExtractor): @@ -887,7 +881,7 @@ class PixivNovelExtractor(PixivExtractor): novels = itertools.islice(novels, self.max_posts) for novel in novels: if self.meta_user: - novel.update(self.api.user_detail(novel["user"]["id"])) + novel.update(self.api.user_detail(str(novel["user"]["id"]))) if self.meta_comments: if novel["total_comments"]: novel["comments"] = list( @@ -940,15 +934,19 @@ class PixivNovelExtractor(PixivExtractor): illusts[marker[11:].partition("-")[0]] = None if desktop: - novel_id = str(novel["id"]) - url = "{}/novel/show.php?id={}".format( - self.root, novel_id) - data = util.json_loads(text.extr( - self.request(url, headers=headers).text, - "id=\"meta-preload-data\" content='", "'")) - - for image in (data["novel"][novel_id] - ["textEmbeddedImages"]).values(): + try: + novel_id = str(novel["id"]) + url = "{}/novel/show.php?id={}".format( + self.root, novel_id) + data = util.json_loads(text.extr( + self.request(url, headers=headers).text, + "id=\"meta-preload-data\" content='", "'")) + images = (data["novel"][novel_id] + ["textEmbeddedImages"]).values() + except Exception: + images = () + + for image in images: url = image.pop("urls")["original"] novel.update(image) novel["date_url"] = self._date_from_url(url) diff --git a/gallery_dl/extractor/postmill.py b/gallery_dl/extractor/postmill.py index 8877175..6ea18e6 100644 --- a/gallery_dl/extractor/postmill.py +++ b/gallery_dl/extractor/postmill.py @@ -153,17 +153,13 @@ class PostmillPostExtractor(PostmillExtractor): class PostmillShortURLExtractor(PostmillExtractor): """Extractor for short submission URLs""" subcategory = "shorturl" - pattern = BASE_PATTERN + r"/(\d+)$" + pattern = BASE_PATTERN + r"(/\d+)$" example = "https://raddle.me/123" - def __init__(self, match): - PostmillExtractor.__init__(self, match) - self.post_id = match.group(3) - def items(self): - url = self.root + "/" + self.post_id - response = self.request(url, method="HEAD", allow_redirects=False) - full_url = text.urljoin(url, response.headers["Location"]) + url = self.root + self.groups[2] + location = self.request_location(url) + full_url = text.urljoin(url, location) yield Message.Queue, full_url, {"_extractor": PostmillPostExtractor} diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 7a9e3c5..76eadc4 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -357,10 +357,9 @@ class RedditRedirectExtractor(Extractor): sub_type = "user" url = "https://www.reddit.com/{}/{}/s/{}".format( sub_type, subreddit, share_url) + location = self.request_location(url, notfound="submission") data = {"_extractor": RedditSubmissionExtractor} 
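The reddit hunk being rewritten here, like the patreon, pinterest, pixiv, seiga, and tiktok hunks around it, now delegates to the `request_location()` helper added to `common.py` earlier in this diff. A standalone equivalent with plain `requests` (the real helper goes through gallery-dl's own session, retry, and error handling):

```python
import requests

def request_location(url, **kwargs):
    """Send one HEAD request and return its redirect target, if any."""
    kwargs.setdefault("allow_redirects", False)
    response = requests.head(url, **kwargs)
    # empty string rather than None when there is no Location header,
    # so callers can safely treat the result as a string
    return response.headers.get("location", "")

# e.g. resolving a pixiv.me short URL, as the pixiv.py hunk above does:
# profile_url = request_location("https://pixiv.me/SOME_USER")
```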
- response = self.request(url, method="HEAD", allow_redirects=False, - notfound="submission") - yield Message.Queue, response.headers["Location"], data + yield Message.Queue, location, data class RedditAPI(): diff --git a/gallery_dl/extractor/scrolller.py b/gallery_dl/extractor/scrolller.py index c818c98..f97fa14 100644 --- a/gallery_dl/extractor/scrolller.py +++ b/gallery_dl/extractor/scrolller.py @@ -20,8 +20,8 @@ class ScrolllerExtractor(Extractor): category = "scrolller" root = "https://scrolller.com" directory_fmt = ("{category}", "{subredditTitle}") - filename_fmt = "{id}{title:? //}.{extension}" - archive_fmt = "{id}" + filename_fmt = "{id}{num:?_//>03}{title:? //}.{extension}" + archive_fmt = "{id}_{num}" request_interval = (0.5, 1.5) def _init(self): @@ -31,23 +31,36 @@ class ScrolllerExtractor(Extractor): self.login() for post in self.posts(): - - media_sources = post.get("mediaSources") - if not media_sources: - self.log.warning("%s: No media files", post.get("id")) - continue - - src = max(media_sources, key=self._sort_key) - post.update(src) - url = src["url"] - text.nameext_from_url(url, post) + files = self._extract_files(post) + post["count"] = len(files) yield Message.Directory, post - yield Message.Url, url, post + for file in files: + url = file["url"] + post.update(file) + yield Message.Url, url, text.nameext_from_url(url, post) def posts(self): return () + def _extract_files(self, post): + album = post.pop("albumContent", None) + if not album: + sources = post.get("mediaSources") + if not sources: + self.log.warning("%s: No media files", post.get("id")) + return () + src = max(sources, key=self._sort_key) + src["num"] = 0 + return (src,) + + files = [] + for num, media in enumerate(album, 1): + src = max(media["mediaSources"], key=self._sort_key) + src["num"] = num + files.append(src) + return files + def login(self): username, password = self._get_auth_info() if username: @@ -63,7 +76,7 @@ class ScrolllerExtractor(Extractor): } try: - data = self._request_graphql("LoginQuery", variables) + data = self._request_graphql("LoginQuery", variables, False) except exception.HttpError as exc: if exc.status == 403: raise exception.AuthenticationError() @@ -71,10 +84,9 @@ class ScrolllerExtractor(Extractor): return data["login"]["token"] - def _request_graphql(self, opname, variables): - url = "https://api.scrolller.com/api/v2/graphql" + def _request_graphql(self, opname, variables, admin=True): headers = { - "Content-Type" : "text/plain;charset=UTF-8", + "Content-Type" : None, "Origin" : self.root, "Sec-Fetch-Dest": "empty", "Sec-Fetch-Mode": "cors", @@ -85,14 +97,23 @@ class ScrolllerExtractor(Extractor): "variables" : variables, "authorization": self.auth_token, } + + if admin: + url = "https://api.scrolller.com/admin" + headers["Content-Type"] = "application/json" + else: + url = "https://api.scrolller.com/api/v2/graphql" + headers["Content-Type"] = "text/plain;charset=UTF-8" + return self.request( url, method="POST", headers=headers, data=util.json_dumps(data), ).json()["data"] - def _pagination(self, opname, variables): - while True: + def _pagination(self, opname, variables, data=None): + if data is None: data = self._request_graphql(opname, variables) + while True: while "items" not in data: data = data.popitem()[1] yield from data["items"] @@ -101,6 +122,8 @@ class ScrolllerExtractor(Extractor): return variables["iterator"] = data["iterator"] + data = self._request_graphql(opname, variables) + def _sort_key(self, src): return src["width"], not src["isOptimized"] @@ 
-114,6 +137,7 @@ class ScrolllerSubredditExtractor(ScrolllerExtractor): def posts(self): url, query = self.groups filter = None + sort = "RANDOM" if query: params = text.parse_query(query) @@ -121,12 +145,24 @@ class ScrolllerSubredditExtractor(ScrolllerExtractor): filter = params["filter"].upper().rstrip("S") variables = { - "url" : url, - "iterator" : None, - "filter" : filter, - "hostsDown": None, + "url" : url, + "filter": filter, + "sortBy": sort, + "limit" : 50, } - return self._pagination("SubredditQuery", variables) + subreddit = self._request_graphql( + "SubredditQuery", variables)["getSubreddit"] + + variables = { + "subredditId": subreddit["id"], + "iterator": None, + "filter" : filter, + "sortBy" : sort, + "limit" : 50, + "isNsfw" : subreddit["isNsfw"], + } + return self._pagination( + "SubredditChildrenQuery", variables, subreddit["children"]) class ScrolllerFollowingExtractor(ScrolllerExtractor): @@ -142,11 +178,14 @@ class ScrolllerFollowingExtractor(ScrolllerExtractor): raise exception.AuthorizationError("Login required") variables = { - "iterator" : None, - "hostsDown": None, + "iterator": None, + "filter" : None, + "limit" : 10, + "isNsfw" : False, + "sortBy" : "RANDOM", } - for subreddit in self._pagination("FollowingQuery", variables): + for subreddit in self._pagination("GetFollowingSubreddits", variables): url = self.root + subreddit["url"] subreddit["_extractor"] = ScrolllerSubredditExtractor yield Message.Queue, url, subreddit @@ -156,39 +195,62 @@ class ScrolllerPostExtractor(ScrolllerExtractor): """Extractor for media from a single scrolller post""" subcategory = "post" pattern = BASE_PATTERN + r"/(?!r/|following$)([^/?#]+)" - example = "https://scrolller.com/title-slug-a1b2c3d4f5" + example = "https://scrolller.com/TITLE-SLUG-a1b2c3d4f5" def posts(self): - url = "{}/{}".format(self.root, self.groups[0]) - page = self.request(url).text - data = util.json_loads(text.extr( - page, '<script>window.scrolllerConfig="', '"</script>') - .replace('\\"', '"')) - return (data["item"],) + variables = {"url": "/" + self.groups[0]} + data = self._request_graphql("SubredditPostQuery", variables) + return (data["getPost"],) QUERIES = { + "SubredditPostQuery": """\ +query SubredditPostQuery( + $url: String! +) { + getPost( + data: { url: $url } + ) { + __typename id url title subredditId subredditTitle subredditUrl + redditPath isNsfw hasAudio fullLengthSource gfycatSource redgifsSource + ownerAvatar username displayName favoriteCount isPaid tags + commentsCount commentsRepliesCount isFavorite + albumContent { mediaSources { url width height isOptimized } } + mediaSources { url width height isOptimized } + blurredMediaSources { url width height isOptimized } + } +} +""", + "SubredditQuery": """\ query SubredditQuery( $url: String! - $filter: SubredditPostFilter $iterator: String + $sortBy: GallerySortBy + $filter: GalleryFilter + $limit: Int! 
) { getSubreddit( - url: $url + data: { + url: $url, + iterator: $iterator, + filter: $filter, + limit: $limit, + sortBy: $sortBy + } ) { - children( - limit: 50 - iterator: $iterator - filter: $filter - disabledHosts: null - ) { + __typename id url title secondaryTitle description createdAt isNsfw + subscribers isComplete itemCount videoCount pictureCount albumCount + isPaid username tags isFollowing + banner { url width height isOptimized } + children { iterator items { - __typename id url title subredditId subredditTitle - subredditUrl redditPath isNsfw albumUrl hasAudio - fullLengthSource gfycatSource redgifsSource ownerAvatar - username displayName isPaid tags isFavorite + __typename id url title subredditId subredditTitle subredditUrl + redditPath isNsfw hasAudio fullLengthSource gfycatSource + redgifsSource ownerAvatar username displayName favoriteCount + isPaid tags commentsCount commentsRepliesCount isFavorite + albumContent { mediaSources { url width height isOptimized } } mediaSources { url width height isOptimized } blurredMediaSources { url width height isOptimized } } @@ -197,19 +259,59 @@ query SubredditQuery( } """, - "FollowingQuery": """\ -query FollowingQuery( + "SubredditChildrenQuery": """\ +query SubredditChildrenQuery( + $subredditId: Int! $iterator: String + $filter: GalleryFilter + $sortBy: GallerySortBy + $limit: Int! + $isNsfw: Boolean ) { - getFollowing( - limit: 10 - iterator: $iterator + getSubredditChildren( + data: { + subredditId: $subredditId, + iterator: $iterator, + filter: $filter, + sortBy: $sortBy, + limit: $limit, + isNsfw: $isNsfw + }, + ) { + iterator items { + __typename id url title subredditId subredditTitle subredditUrl + redditPath isNsfw hasAudio fullLengthSource gfycatSource + redgifsSource ownerAvatar username displayName favoriteCount isPaid + tags commentsCount commentsRepliesCount isFavorite + albumContent { mediaSources { url width height isOptimized } } + mediaSources { url width height isOptimized } + blurredMediaSources { url width height isOptimized } + } + } +} +""", + + "GetFollowingSubreddits": """\ +query GetFollowingSubreddits( + $iterator: String, + $limit: Int!, + $filter: GalleryFilter, + $isNsfw: Boolean, + $sortBy: GallerySortBy +) { + getFollowingSubreddits( + data: { + isNsfw: $isNsfw + limit: $limit + filter: $filter + iterator: $iterator + sortBy: $sortBy + } ) { iterator items { __typename id url title secondaryTitle description createdAt isNsfw subscribers isComplete itemCount videoCount pictureCount albumCount - isPaid username tags isFollowing - banner { url width height isOptimized } + isFollowing } } } @@ -229,4 +331,14 @@ query LoginQuery( } """, + "ItemTypeQuery": """\ +query ItemTypeQuery( + $url: String! 
+) { + getItemType( + url: $url + ) +} +""", + } diff --git a/gallery_dl/extractor/seiga.py b/gallery_dl/extractor/seiga.py index 23ba340..ff8c505 100644 --- a/gallery_dl/extractor/seiga.py +++ b/gallery_dl/extractor/seiga.py @@ -43,9 +43,7 @@ class SeigaExtractor(Extractor): def get_image_url(self, image_id): """Get url for an image with id 'image_id'""" url = "{}/image/source/{}".format(self.root, image_id) - response = self.request( - url, method="HEAD", allow_redirects=False, notfound="image") - location = response.headers["location"] + location = self.request_location(url, notfound="image") if "nicovideo.jp/login" in location: raise exception.StopExtraction( "HTTP redirect to login page (%s)", location.partition("?")[0]) diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py index 5d0ec46..1054a63 100644 --- a/gallery_dl/extractor/subscribestar.py +++ b/gallery_dl/extractor/subscribestar.py @@ -23,14 +23,15 @@ class SubscribestarExtractor(Extractor): directory_fmt = ("{category}", "{author_name}") filename_fmt = "{post_id}_{id}.{extension}" archive_fmt = "{id}" - cookies_domain = "www.subscribestar.com" - cookies_names = ("auth_token",) + cookies_domain = ".subscribestar.com" + cookies_names = ("_personalization_id",) + _warning = True def __init__(self, match): tld, self.item = match.groups() if tld == "adult": self.root = "https://subscribestar.adult" - self.cookies_domain = "subscribestar.adult" + self.cookies_domain = ".subscribestar.adult" self.subcategory += "-adult" Extractor.__init__(self, match) @@ -78,34 +79,64 @@ class SubscribestarExtractor(Extractor): username, password = self._get_auth_info() if username: - self.cookies_update(self._login_impl(username, password)) + self.cookies_update(self._login_impl( + (username, self.cookies_domain), password)) + + if self._warning: + if not username or not self.cookies_check(self.cookies_names): + self.log.warning("no '_personalization_id' cookie set") + SubscribestarExtractor._warning = False @cache(maxage=28*86400, keyarg=1) def _login_impl(self, username, password): + username = username[0] self.log.info("Logging in as %s", username) - url = "https://www.subscribestar.com/session.json" + if self.root.endswith(".adult"): + self.cookies.set("18_plus_agreement_generic", "true", + domain=self.cookies_domain) + + # load login page + url = self.root + "/login" + page = self.request(url).text + headers = { - "Origin" : "https://www.subscribestar.com", - "Referer" : "https://www.subscribestar.com/login", + "Accept": "*/*;q=0.5, text/javascript, application/javascript, " + "application/ecmascript, application/x-ecmascript", + "Referer": self.root + "/login", + "X-CSRF-Token": text.unescape(text.extr( + page, '<meta name="csrf-token" content="', '"')), + "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", "X-Requested-With": "XMLHttpRequest", } - data = { - "utf8" : "✓", - "email" : username, - "password": password, - } - response = self.request( - url, method="POST", headers=headers, data=data, fatal=False) - if response.json().get("errors"): - self.log.debug(response.json()["errors"]) - raise exception.AuthenticationError() + def check_errors(response): + errors = response.json().get("errors") + if errors: + self.log.debug(errors) + try: + msg = '"{}"'.format(errors.popitem()[1]) + except Exception: + msg = None + raise exception.AuthenticationError(msg) + return response + + # submit username / email + url = self.root + "/session.json" + data = {"email": username} + response = 
check_errors(self.request( + url, method="POST", headers=headers, data=data, fatal=False)) + + # submit password + url = self.root + "/session/password.json" + data = {"password": password} + response = check_errors(self.request( + url, method="POST", headers=headers, data=data, fatal=False)) + # return cookies return { cookie.name: cookie.value for cookie in response.cookies - if cookie.name.startswith("auth") } def _media_from_post(self, html): diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py index 4c1da7a..b9783c4 100644 --- a/gallery_dl/extractor/tiktok.py +++ b/gallery_dl/extractor/tiktok.py @@ -183,10 +183,7 @@ class TiktokVmpostExtractor(TiktokExtractor): url = text.ensure_http_scheme(self.url) headers = {"User-Agent": "facebookexternalhit/1.1"} - response = self.request(url, headers=headers, method="HEAD", - allow_redirects=False, notfound="post") - - url = response.headers.get("Location") + url = self.request_location(url, headers=headers, notfound="post") if not url or len(url) <= 28: # https://www.tiktok.com/?_r=1 raise exception.NotFoundError("post") diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 8d90bc5..e2fe000 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -798,6 +798,17 @@ class TwitterFollowingExtractor(TwitterExtractor): return self._users_result(TwitterAPI(self).user_following(self.user)) +class TwitterFollowersExtractor(TwitterExtractor): + """Extractor for a user's followers""" + subcategory = "followers" + pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/followers(?!\w)" + example = "https://x.com/USER/followers" + + def items(self): + self.login() + return self._users_result(TwitterAPI(self).user_followers(self.user)) + + class TwitterSearchExtractor(TwitterExtractor): """Extractor for Twitter search results""" subcategory = "search" @@ -1139,54 +1150,76 @@ class TwitterAPI(): "collab_control,vibe", } self.features = { - "hidden_profile_likes_enabled": True, "hidden_profile_subscriptions_enabled": True, + "profile_label_improvements_pcf_label_in_post_enabled": True, + "rweb_tipjar_consumption_enabled": True, "responsive_web_graphql_exclude_directive_enabled": True, "verified_phone_label_enabled": False, "highlights_tweets_tab_ui_enabled": True, "responsive_web_twitter_article_notes_tab_enabled": True, + "subscriptions_feature_can_gift_premium": True, "creator_subscriptions_tweet_preview_api_enabled": True, "responsive_web_graphql_" "skip_user_profile_image_extensions_enabled": False, - "responsive_web_graphql_timeline_navigation_enabled": True, + "responsive_web_graphql_" + "timeline_navigation_enabled": True, } self.features_pagination = { + "rweb_video_screen_enabled": False, + "profile_label_improvements_pcf_label_in_post_enabled": True, + "rweb_tipjar_consumption_enabled": True, "responsive_web_graphql_exclude_directive_enabled": True, "verified_phone_label_enabled": False, "creator_subscriptions_tweet_preview_api_enabled": True, - "responsive_web_graphql_timeline_navigation_enabled": True, - "responsive_web_graphql_skip_user_profile_" - "image_extensions_enabled": False, + "responsive_web_graphql_" + "timeline_navigation_enabled": True, + "responsive_web_graphql_" + "skip_user_profile_image_extensions_enabled": False, + "premium_content_api_read_enabled": False, + "communities_web_enable_tweet_community_results_fetch": True, "c9s_tweet_anatomy_moderator_badge_enabled": True, - "tweetypie_unmention_optimization_enabled": True, + 
"responsive_web_grok_analyze_button_fetch_trends_enabled": False, + "responsive_web_grok_analyze_post_followups_enabled": True, + "responsive_web_jetfuel_frame": False, + "responsive_web_grok_share_attachment_enabled": True, + "articles_preview_enabled": True, "responsive_web_edit_tweet_api_enabled": True, "graphql_is_translatable_rweb_tweet_is_translatable_enabled": True, "view_counts_everywhere_api_enabled": True, "longform_notetweets_consumption_enabled": True, "responsive_web_twitter_article_tweet_consumption_enabled": True, "tweet_awards_web_tipping_enabled": False, + "responsive_web_grok_show_grok_translated_post": False, + "responsive_web_grok_analysis_button_from_backend": True, + "creator_subscriptions_quote_tweet_preview_enabled": False, "freedom_of_speech_not_reach_fetch_enabled": True, "standardized_nudges_misinfo": True, - "tweet_with_visibility_results_prefer_gql_" - "limited_actions_policy_enabled": True, - "rweb_video_timestamps_enabled": True, + "tweet_with_visibility_results_" + "prefer_gql_limited_actions_policy_enabled": True, "longform_notetweets_rich_text_read_enabled": True, "longform_notetweets_inline_media_enabled": True, - "responsive_web_media_download_video_enabled": True, + "responsive_web_grok_image_annotation_enabled": True, "responsive_web_enhance_cards_enabled": False, } def tweet_result_by_rest_id(self, tweet_id): - endpoint = "/graphql/MWY3AO9_I3rcP_L2A4FR4A/TweetResultByRestId" + endpoint = "/graphql/Vg2Akr5FzUmF0sTplA5k6g/TweetResultByRestId" variables = { "tweetId": tweet_id, "withCommunity": False, "includePromotedContent": False, "withVoice": False, } + field_toggles = { + "withArticleRichContentState": True, + "withArticlePlainText": False, + "withGrokAnalyze": False, + "withDisallowedReplyControls": False, + } params = { - "variables": self._json_dumps(variables), - "features" : self._json_dumps(self.features_pagination), + "variables" : self._json_dumps(variables), + "features" : self._json_dumps(self.features_pagination), + "fieldToggles": self._json_dumps(field_toggles), } tweet = self._call(endpoint, params)["data"]["tweetResult"]["result"] if "tweet" in tweet: @@ -1203,47 +1236,61 @@ class TwitterAPI(): return tweet def tweet_detail(self, tweet_id): - endpoint = "/graphql/B9_KmbkLhXt6jRwGjJrweg/TweetDetail" + endpoint = "/graphql/b9Yw90FMr_zUb8DvA8r2ug/TweetDetail" variables = { "focalTweetId": tweet_id, "referrer": "profile", "with_rux_injections": False, + # "rankingMode": "Relevance", "includePromotedContent": False, "withCommunity": True, - "withQuickPromoteEligibilityTweetFields": True, + "withQuickPromoteEligibilityTweetFields": False, "withBirdwatchNotes": True, "withVoice": True, - "withV2Timeline": True, + } + field_toggles = { + "withArticleRichContentState": True, + "withArticlePlainText": False, + "withGrokAnalyze": False, + "withDisallowedReplyControls": False, } return self._pagination_tweets( - endpoint, variables, ("threaded_conversation_with_injections_v2",)) + endpoint, variables, + ("threaded_conversation_with_injections_v2",), + field_toggles=field_toggles) def user_tweets(self, screen_name): - endpoint = "/graphql/5ICa5d9-AitXZrIA3H-4MQ/UserTweets" + endpoint = "/graphql/M3Hpkrb8pjWkEuGdLeXMOA/UserTweets" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, "includePromotedContent": False, - "withQuickPromoteEligibilityTweetFields": True, + "withQuickPromoteEligibilityTweetFields": False, "withVoice": True, - "withV2Timeline": True, } - return self._pagination_tweets(endpoint, variables) + 
field_toggles = { + "withArticlePlainText": False, + } + return self._pagination_tweets( + endpoint, variables, field_toggles=field_toggles) def user_tweets_and_replies(self, screen_name): - endpoint = "/graphql/UtLStR_BnYUGD7Q453UXQg/UserTweetsAndReplies" + endpoint = "/graphql/pz0IHaV_t7T4HJavqqqcIA/UserTweetsAndReplies" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, "includePromotedContent": False, "withCommunity": True, "withVoice": True, - "withV2Timeline": True, } - return self._pagination_tweets(endpoint, variables) + field_toggles = { + "withArticlePlainText": False, + } + return self._pagination_tweets( + endpoint, variables, field_toggles=field_toggles) def user_media(self, screen_name): - endpoint = "/graphql/tO4LMUYAZbR4T0SqQ85aAw/UserMedia" + endpoint = "/graphql/8B9DqlaGvYyOvTCzzZWtNA/UserMedia" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, @@ -1251,12 +1298,15 @@ class TwitterAPI(): "withClientEventToken": False, "withBirdwatchNotes": False, "withVoice": True, - "withV2Timeline": True, } - return self._pagination_tweets(endpoint, variables) + field_toggles = { + "withArticlePlainText": False, + } + return self._pagination_tweets( + endpoint, variables, field_toggles=field_toggles) def user_likes(self, screen_name): - endpoint = "/graphql/9s8V6sUI8fZLDiN-REkAxA/Likes" + endpoint = "/graphql/uxjTlmrTI61zreSIV1urbw/Likes" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, @@ -1264,24 +1314,24 @@ class TwitterAPI(): "withClientEventToken": False, "withBirdwatchNotes": False, "withVoice": True, - "withV2Timeline": True, } - return self._pagination_tweets(endpoint, variables) + field_toggles = { + "withArticlePlainText": False, + } + return self._pagination_tweets( + endpoint, variables, field_toggles=field_toggles) def user_bookmarks(self): - endpoint = "/graphql/cQxQgX8MJYjWwC0dxpyfYg/Bookmarks" + endpoint = "/graphql/ztCdjqsvvdL0dE8R5ME0hQ/Bookmarks" variables = { "count": 100, "includePromotedContent": False, } - features = self.features_pagination.copy() - features["graphql_timeline_v2_bookmark_timeline"] = True return self._pagination_tweets( - endpoint, variables, ("bookmark_timeline_v2", "timeline"), False, - features=features) + endpoint, variables, ("bookmark_timeline_v2", "timeline"), False) def list_latest_tweets_timeline(self, list_id): - endpoint = "/graphql/HjsWc-nwwHKYwHenbHm-tw/ListLatestTweetsTimeline" + endpoint = "/graphql/LSefrrxhpeX8HITbKfWz9g/ListLatestTweetsTimeline" variables = { "listId": list_id, "count": 100, @@ -1289,21 +1339,20 @@ class TwitterAPI(): return self._pagination_tweets( endpoint, variables, ("list", "tweets_timeline", "timeline")) - def search_timeline(self, query): - endpoint = "/graphql/fZK7JipRHWtiZsTodhsTfQ/SearchTimeline" + def search_timeline(self, query, product="Latest"): + endpoint = "/graphql/fL2MBiqXPk5pSrOS5ACLdA/SearchTimeline" variables = { "rawQuery": query, "count": 100, - "querySource": "", - "product": "Latest", + "querySource": "typed_query", + "product": product, } - return self._pagination_tweets( endpoint, variables, ("search_by_raw_query", "search_timeline", "timeline")) def community_tweets_timeline(self, community_id): - endpoint = "/graphql/7B2AdxSuC-Er8qUr3Plm_w/CommunityTweetsTimeline" + endpoint = "/graphql/awszcpgwaIeqqNfmzjxUow/CommunityTweetsTimeline" variables = { "communityId": community_id, "count": 100, @@ -1317,7 +1366,7 @@ class TwitterAPI(): "timeline")) def community_media_timeline(self, community_id): 
- endpoint = "/graphql/qAGUldfcIoMv5KyAyVLYog/CommunityMediaTimeline" + endpoint = "/graphql/HfMuDHto2j3NKUeiLjKWHA/CommunityMediaTimeline" variables = { "communityId": community_id, "count": 100, @@ -1329,7 +1378,7 @@ class TwitterAPI(): "timeline")) def communities_main_page_timeline(self, screen_name): - endpoint = ("/graphql/GtOhw2mstITBepTRppL6Uw" + endpoint = ("/graphql/NbdrKPY_h_nlvZUg7oqH5Q" "/CommunitiesMainPageTimeline") variables = { "count": 100, @@ -1356,17 +1405,34 @@ class TwitterAPI(): ["twitter_objects"]["live_events"][event_id]) def list_members(self, list_id): - endpoint = "/graphql/BQp2IEYkgxuSxqbTAr1e1g/ListMembers" + endpoint = "/graphql/v97svwb-qcBmzv6QruDuNg/ListMembers" variables = { "listId": list_id, "count": 100, - "withSafetyModeUserFields": True, } return self._pagination_users( endpoint, variables, ("list", "members_timeline", "timeline")) + def user_followers(self, screen_name): + endpoint = "/graphql/jqZ0_HJBA6mnu18iTZYm9w/Followers" + variables = { + "userId": self._user_id_by_screen_name(screen_name), + "count": 100, + "includePromotedContent": False, + } + return self._pagination_users(endpoint, variables) + + def user_followers_verified(self, screen_name): + endpoint = "/graphql/GHg0X_FjrJoISwwLPWi1LQ/BlueVerifiedFollowers" + variables = { + "userId": self._user_id_by_screen_name(screen_name), + "count": 100, + "includePromotedContent": False, + } + return self._pagination_users(endpoint, variables) + def user_following(self, screen_name): - endpoint = "/graphql/PAnE9toEjRfE-4tozRcsfw/Following" + endpoint = "/graphql/4QHbs4wmzgtU91f-t96_Eg/Following" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, @@ -1376,12 +1442,11 @@ class TwitterAPI(): @memcache(keyarg=1) def user_by_rest_id(self, rest_id): - endpoint = "/graphql/tD8zKvQzwY3kdx5yz6YmOw/UserByRestId" + endpoint = "/graphql/5vdJ5sWkbSRDiiNZvwc2Yg/UserByRestId" features = self.features params = { "variables": self._json_dumps({ "userId": rest_id, - "withSafetyModeUserFields": True, }), "features": self._json_dumps(features), } @@ -1389,7 +1454,7 @@ class TwitterAPI(): @memcache(keyarg=1) def user_by_screen_name(self, screen_name): - endpoint = "/graphql/k5XapwcSikNsEsILW5FvgA/UserByScreenName" + endpoint = "/graphql/32pL5BWe9WKeSK1MoPvFQQ/UserByScreenName" features = self.features.copy() features["subscriptions_verification_info_" "is_identity_verified_enabled"] = True @@ -1398,9 +1463,11 @@ class TwitterAPI(): params = { "variables": self._json_dumps({ "screen_name": screen_name, - "withSafetyModeUserFields": True, }), "features": self._json_dumps(features), + "fieldToggles": self._json_dumps({ + "withAuxiliaryUserLabels": True, + }), } return self._call(endpoint, params)["data"]["user"]["result"] @@ -1620,7 +1687,8 @@ class TwitterAPI(): params["cursor"] = extr._update_cursor(cursor) def _pagination_tweets(self, endpoint, variables, - path=None, stop_tweets=True, features=None): + path=None, stop_tweets=True, + features=None, field_toggles=None): extr = self.extractor original_retweets = (extr.retweets == "original") pinned_tweet = extr.pinned @@ -1633,6 +1701,8 @@ class TwitterAPI(): features = self.features_pagination if features: params["features"] = self._json_dumps(features) + if field_toggles: + params["fieldToggles"] = self._json_dumps(field_toggles) while True: params["variables"] = self._json_dumps(variables) @@ -1640,7 +1710,7 @@ class TwitterAPI(): try: if path is None: - instructions = (data["user"]["result"]["timeline_v2"] + instructions = 
(data["user"]["result"]["timeline"] ["timeline"]["instructions"]) else: instructions = data diff --git a/gallery_dl/extractor/urlshortener.py b/gallery_dl/extractor/urlshortener.py index 49a3deb..0478ef0 100644 --- a/gallery_dl/extractor/urlshortener.py +++ b/gallery_dl/extractor/urlshortener.py @@ -32,21 +32,13 @@ BASE_PATTERN = UrlshortenerExtractor.update({ class UrlshortenerLinkExtractor(UrlshortenerExtractor): """Extractor for general-purpose URL shorteners""" subcategory = "link" - pattern = BASE_PATTERN + r"/([^/?#]+)" + pattern = BASE_PATTERN + r"(/[^/?#]+)" example = "https://bit.ly/abcde" - def __init__(self, match): - UrlshortenerExtractor.__init__(self, match) - self.id = match.group(match.lastindex) - - def _init(self): - self.headers = self.config_instance("headers") - def items(self): - response = self.request( - "{}/{}".format(self.root, self.id), headers=self.headers, - method="HEAD", allow_redirects=False, notfound="URL") - try: - yield Message.Queue, response.headers["location"], {} - except KeyError: + url = self.root + self.groups[-1] + location = self.request_location( + url, headers=self.config_instance("headers"), notfound="URL") + if not location: raise exception.StopExtraction("Unable to resolve short URL") + yield Message.Queue, location, {} diff --git a/gallery_dl/extractor/weasyl.py b/gallery_dl/extractor/weasyl.py index 13b0520..ed2a395 100644 --- a/gallery_dl/extractor/weasyl.py +++ b/gallery_dl/extractor/weasyl.py @@ -7,7 +7,7 @@ """Extractors for https://www.weasyl.com/""" from .common import Extractor, Message -from .. import text +from .. import text, util BASE_PATTERN = r"(?:https://)?(?:www\.)?weasyl.com/" @@ -18,6 +18,7 @@ class WeasylExtractor(Extractor): filename_fmt = "{submitid} {title}.{extension}" archive_fmt = "{submitid}" root = "https://www.weasyl.com" + useragent = util.USERAGENT @staticmethod def populate_submission(data): diff --git a/gallery_dl/extractor/wikifeet.py b/gallery_dl/extractor/wikifeet.py index d3586c0..f7bfeb2 100644 --- a/gallery_dl/extractor/wikifeet.py +++ b/gallery_dl/extractor/wikifeet.py @@ -33,12 +33,12 @@ class WikifeetGalleryExtractor(GalleryExtractor): return { "celeb" : self.celeb, "type" : self.type, - "rating" : text.parse_float(extr('"ratingValue": "', '"')), - "celebrity" : text.unescape(extr("times'>", "</h1>")), - "shoesize" : text.remove_html(extr("Shoe Size:", "edit")), - "birthplace": text.remove_html(extr("Birthplace:", "edit")), - "birthday" : text.parse_datetime(text.remove_html( - extr("Birth Date:", "edit")), "%Y-%m-%d"), + "birthplace": text.unescape(extr('"bplace":"', '"')), + "birthday" : text.parse_datetime(text.unescape( + extr('"bdate":"', '"'))[:10], "%Y-%m-%d"), + "shoesize" : text.unescape(extr('"ssize":', ',')), + "rating" : text.parse_float(extr('"score":', ',')), + "celebrity" : text.unescape(extr('"cname":"', '"')), } def images(self, page): @@ -61,5 +61,6 @@ class WikifeetGalleryExtractor(GalleryExtractor): for tag in data["tags"] if tag in tagmap ], }) - for data in util.json_loads(text.extr(page, "['gdata'] = ", ";")) + for data in + util.json_loads("[" + text.extr(page, '"gallery":[', '],') + "]") ] diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py index c1bfc20..5340335 100644 --- a/gallery_dl/postprocessor/ugoira.py +++ b/gallery_dl/postprocessor/ugoira.py @@ -111,16 +111,13 @@ class UgoiraPP(PostProcessor): return self._frames = pathfmt.kwdict["_ugoira_frame_data"] - if pathfmt.extension == "zip": + index = pathfmt.kwdict.get("_ugoira_frame_index") 
+ if index is None: self._convert_zip = True if self.delete: pathfmt.set_extension(self.extension) pathfmt.build_path() else: - index = pathfmt.kwdict.get("_ugoira_frame_index") - if index is None: - return - pathfmt.build_path() frame = self._frames[index].copy() frame["index"] = index @@ -138,6 +135,7 @@ class UgoiraPP(PostProcessor): if not self._convert_zip: return self._zip_source = True + self._zip_ext = ext = pathfmt.extension with self._tempdir() as tempdir: if tempdir: @@ -156,7 +154,12 @@ return self.log.debug("", exc_info=exc) if self.convert(pathfmt, tempdir): - pathfmt.delete = self.delete + if self.delete: + pathfmt.delete = True + elif pathfmt.extension != ext: + self.log.info(pathfmt.filename) + pathfmt.set_extension(ext) + pathfmt.build_path() def convert_from_files(self, pathfmt): if not self._convert_files: @@ -252,9 +255,15 @@ ]).encode() if self._zip_source: - self.delete = False + zpath = pathfmt.temppath + if self.delete: + self.delete = False + elif self._zip_ext != self.extension: + self._copy_file(zpath, pathfmt.realpath) + zpath = pathfmt.realpath + if self.metadata: - with zipfile.ZipFile(pathfmt.temppath, "a") as zfile: + with zipfile.ZipFile(zpath, "a") as zfile: zinfo = zipfile.ZipInfo(metaname) if self.mtime: zinfo.date_time = zfile.infolist()[0].date_time diff --git a/gallery_dl/util.py b/gallery_dl/util.py index eabd4ab..ba31ea7 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -27,6 +27,11 @@ from http.cookiejar import Cookie from email.utils import mktime_tz, parsedate_tz from . import text, version, exception +try: + re_compile = re._compiler.compile +except AttributeError: + re_compile = re.sre_compile.compile + def bencode(num, alphabet="0123456789"): """Encode an integer into a base-N encoded string""" @@ -685,11 +690,16 @@ class CustomNone(): __repr__ = __str__ -# v128.0 release on 2024-07-09 has ordinal 739076 -# v137.0 release on 2025-04-01 has ordinal 739342 -# 735492 == 739076 - 128 * 28 +# v137.0 release of Firefox on 2025-04-01 has ordinal 739342 # 735506 == 739342 - 137 * 28 +# v135.0 release of Chrome on 2025-04-01 has ordinal 739342 +# 735562 == 739342 - 135 * 28 +# _ord_today = datetime.date.today().toordinal() +# _ff_ver = (_ord_today - 735506) // 28 +# _ch_ver = (_ord_today - 735562) // 28 + _ff_ver = (datetime.date.today().toordinal() - 735506) // 28 +# _ch_ver = _ff_ver - 2 NONE = CustomNone() EPOCH = datetime.datetime(1970, 1, 1) @@ -701,8 +711,8 @@ USERAGENT = "gallery-dl/" + version.__version__ USERAGENT_FIREFOX = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:{}.0) " "Gecko/20100101 Firefox/{}.0").format(_ff_ver, _ff_ver) USERAGENT_CHROME = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 " - "Safari/537.36") + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{}.0.0.0 " + "Safari/537.36").format(_ff_ver - 2) SPECIAL_EXTRACTORS = {"oauth", "recursive", "generic"} GLOBALS = { "contains" : contains, diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 87169e2..af4acf5 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,5 +6,5 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.29.4" +__version__ = "1.29.5" __variant__ = None
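Note on the scrolller.py hunks above: listing a subreddit now takes two GraphQL operations instead of one. A first `SubredditQuery` call resolves the gallery and already returns its numeric `id`, its `isNsfw` flag, and the first `children` page; `SubredditChildrenQuery` then pages through the rest by cursor. A minimal standalone sketch of that flow — the endpoint URL and the bare `requests` transport are assumptions, the variable names come from the diff:

```python
import requests

API_URL = "https://api.scrolller.com/api/v2/graphql"   # assumed endpoint

def graphql(query, variables):
    """POST one GraphQL operation and unwrap its 'data' member."""
    response = requests.post(API_URL, json={
        "query": query, "variables": variables})
    return response.json()["data"]

def subreddit_items(url, subreddit_query, children_query):
    """Yield all gallery items for a subreddit URL like '/r/Art'."""
    # step 1: resolve the subreddit; the reply already carries its
    # numeric id, NSFW flag, and the first 'children' page
    subreddit = graphql(subreddit_query, {
        "url": url, "filter": None, "sortBy": "RANDOM", "limit": 50,
    })["getSubreddit"]
    page = subreddit["children"]

    # step 2: follow the cursor with SubredditChildrenQuery
    while True:
        yield from page["items"]
        if not page["iterator"]:
            return
        page = graphql(children_query, {
            "subredditId": subreddit["id"], "iterator": page["iterator"],
            "filter": None, "sortBy": "RANDOM", "limit": 50,
            "isNsfw": subreddit["isNsfw"],
        })["getSubredditChildren"]
```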
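The seiga, tiktok, and urlshortener hunks all swap a hand-rolled `HEAD` request with `allow_redirects=False` plus a manual read of `response.headers["location"]` for a shared `request_location()` helper. The helper's implementation is not part of this diff; a plausible sketch, assuming it wraps exactly the pattern it replaces:

```python
import requests

def request_location(session, url, headers=None):
    """Sketch: issue a HEAD request without following redirects and
    return the redirect target ("" if the server did not redirect)."""
    response = session.request(
        "HEAD", url, headers=headers, allow_redirects=False)
    return response.headers.get("location", "")

# usage, mirroring the urlshortener hunk:
location = request_location(requests.Session(), "https://bit.ly/abcde")
if not location:
    raise RuntimeError("Unable to resolve short URL")
print(location)
```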
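The rewritten SubscribeStar `_login_impl()` walks the site's new two-step flow: fetch `/login` for a CSRF token, POST the email to `/session.json`, then POST the password to `/session/password.json`, checking each JSON reply for an `errors` object. A condensed sketch with plain `requests` — endpoint paths and field names are taken from the diff, the exact error shapes are assumptions:

```python
import html
import requests

def subscribestar_login(root, email, password):
    session = requests.Session()

    # step 0: load the login page and scrape its CSRF token
    page = session.get(root + "/login").text
    token = page.partition('<meta name="csrf-token" content="')[2]
    token = html.unescape(token.partition('"')[0])
    headers = {
        "Referer": root + "/login",
        "X-CSRF-Token": token,
        "X-Requested-With": "XMLHttpRequest",
    }

    def post(path, data):
        response = session.post(root + path, headers=headers, data=data)
        errors = response.json().get("errors")
        if errors:  # assumed shape, e.g. {"email": "..."}
            raise RuntimeError("login failed: {}".format(errors))
        return response

    post("/session.json", {"email": email})                 # step 1: email
    post("/session/password.json", {"password": password})  # step 2: password
    return session.cookies   # authentication travels via cookies

# cookies = subscribestar_login("https://www.subscribestar.com",
#                               "user@example.com", "password")
```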
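On the Twitter/X side, `_pagination_tweets()` gains a `field_toggles` parameter next to `variables` and `features`; each of the three is JSON-encoded separately and sent as its own query-string parameter of the GraphQL GET request. A sketch of the parameter assembly — the compact separators mirror what a `_json_dumps` helper would typically use and are an assumption:

```python
import json

def _json_dumps(obj):
    """Compact JSON, one document per query-string parameter."""
    return json.dumps(obj, separators=(",", ":"))

def graphql_params(variables, features, field_toggles=None):
    params = {
        "variables": _json_dumps(variables),
        "features" : _json_dumps(features),
    }
    if field_toggles:
        params["fieldToggles"] = _json_dumps(field_toggles)
    return params

# e.g. the UserTweets arguments from the diff:
print(graphql_params(
    {"userId": "12", "count": 100, "includePromotedContent": False,
     "withQuickPromoteEligibilityTweetFields": False, "withVoice": True},
    {"responsive_web_edit_tweet_api_enabled": True},   # features, trimmed
    {"withArticlePlainText": False},
))
```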
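The wikifeet rewrite stops scraping HTML labels ("Shoe Size:", "Birthplace:", …) and instead reads the same fields from the JSON blob the redesigned pages embed; the gallery array is cut out of that blob and re-wrapped in brackets before parsing. A sketch of the extraction pattern, with a made-up miniature page and a local stand-in for `text.extr()` (field names as in the diff):

```python
import json

def extr(txt, begin, end, default=""):
    """Return the text between 'begin' and 'end' (first occurrence)."""
    try:
        first = txt.index(begin) + len(begin)
        return txt[first:txt.index(end, first)]
    except ValueError:
        return default

# made-up miniature of the embedded JSON blob:
page = ('{"ssize":9,"score":4.13,"cname":"NAME",'
        '"gallery":[{"pid":1},{"pid":2}],"x":0}')

shoesize  = extr(page, '"ssize":', ',')
rating    = float(extr(page, '"score":', ','))
celebrity = extr(page, '"cname":"', '"')
gallery   = json.loads("[" + extr(page, '"gallery":[', '],') + "]")
print(shoesize, rating, celebrity, gallery)
```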
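Finally, the util.py hunk extends the date-derived user-agent trick to Chrome: both browsers ship a major release every 4 weeks (28 days), so one known release date anchors a linear formula, and Chrome's version consistently trails Firefox's by 2. A standalone sketch of the same arithmetic, with constants copied from the diff's comments:

```python
import datetime

FF_ANCHOR = 739342 - 137 * 28   # Firefox 137.0 on 2025-04-01 (ordinal 739342)
CH_ANCHOR = 739342 - 135 * 28   # Chrome  135.0 on the same date

def browser_versions(today=None):
    """Estimate the current Firefox/Chrome major versions from the date."""
    ordinal = (today or datetime.date.today()).toordinal()
    ff_ver = (ordinal - FF_ANCHOR) // 28
    ch_ver = (ordinal - CH_ANCHOR) // 28   # always equals ff_ver - 2
    return ff_ver, ch_ver

print(browser_versions(datetime.date(2025, 4, 26)))   # -> (137, 135)
```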
