summaryrefslogtreecommitdiffstats
path: root/gallery_dl/extractor/bluesky.py
diff options
context:
space:
mode:
Diffstat (limited to 'gallery_dl/extractor/bluesky.py')
-rw-r--r--gallery_dl/extractor/bluesky.py458
1 files changed, 458 insertions, 0 deletions
diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py
new file mode 100644
index 0000000..8de0d7b
--- /dev/null
+++ b/gallery_dl/extractor/bluesky.py
@@ -0,0 +1,458 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://bsky.app/"""
+
+from .common import Extractor, Message
+from .. import text, util, exception
+from ..cache import cache, memcache
+
+BASE_PATTERN = r"(?:https?://)?bsky\.app"
+USER_PATTERN = BASE_PATTERN + r"/profile/([^/?#]+)"
+
+
+class BlueskyExtractor(Extractor):
+ """Base class for bluesky extractors"""
+ category = "bluesky"
+ directory_fmt = ("{category}", "{author[handle]}")
+ filename_fmt = "{createdAt[:19]}_{post_id}_{num}.{extension}"
+ archive_fmt = "{filename}"
+ root = "https://bsky.app"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.user = match.group(1)
+
+ def _init(self):
+ meta = self.config("metadata") or ()
+ if meta:
+ if isinstance(meta, str):
+ meta = meta.replace(" ", "").split(",")
+ elif not isinstance(meta, (list, tuple)):
+ meta = ("user", "facets")
+ self._metadata_user = ("user" in meta)
+ self._metadata_facets = ("facets" in meta)
+
+ self.api = BlueskyAPI(self)
+ self._user = None
+
+ def items(self):
+ for post in self.posts():
+ if "post" in post:
+ post = post["post"]
+ post.update(post["record"])
+ del post["record"]
+
+ images = ()
+ if "embed" in post:
+ media = post["embed"]
+ if "media" in media:
+ media = media["media"]
+ if "images" in media:
+ images = media["images"]
+
+ if self._metadata_facets:
+ if "facets" in post:
+ post["hashtags"] = tags = []
+ post["mentions"] = dids = []
+ post["uris"] = uris = []
+ for facet in post["facets"]:
+ features = facet["features"][0]
+ if "tag" in features:
+ tags.append(features["tag"])
+ elif "did" in features:
+ dids.append(features["did"])
+ elif "uri" in features:
+ uris.append(features["uri"])
+ else:
+ post["hashtags"] = post["mentions"] = post["uris"] = ()
+
+ if self._metadata_user:
+ post["user"] = self._user or post["author"]
+
+ post["post_id"] = post["uri"].rpartition("/")[2]
+ post["count"] = len(images)
+ post["date"] = text.parse_datetime(
+ post["createdAt"][:19], "%Y-%m-%dT%H:%M:%S")
+
+ yield Message.Directory, post
+
+ if not images:
+ continue
+
+ base = ("https://bsky.social/xrpc/com.atproto.sync.getBlob"
+ "?did={}&cid=".format(post["author"]["did"]))
+ post["num"] = 0
+
+ for file in images:
+ post["num"] += 1
+ post["description"] = file["alt"]
+
+ try:
+ aspect = file["aspectRatio"]
+ post["width"] = aspect["width"]
+ post["height"] = aspect["height"]
+ except KeyError:
+ post["width"] = post["height"] = 0
+
+ image = file["image"]
+ post["filename"] = link = image["ref"]["$link"]
+ post["extension"] = image["mimeType"].rpartition("/")[2]
+
+ yield Message.Url, base + link, post
+
+ def posts(self):
+ return ()
+
+ def _make_post(self, actor, kind):
+ did = self.api._did_from_actor(actor)
+ profile = self.api.get_profile(did)
+
+ if kind not in profile:
+ return ()
+ cid = profile[kind].rpartition("/")[2].partition("@")[0]
+
+ return ({
+ "post": {
+ "embed": {"images": [{
+ "alt": kind,
+ "image": {
+ "$type" : "blob",
+ "ref" : {"$link": cid},
+ "mimeType": "image/jpeg",
+ "size" : 0,
+ },
+ "aspectRatio": {
+ "width" : 1000,
+ "height": 1000,
+ },
+ }]},
+ "author" : profile,
+ "record" : (),
+ "createdAt": "",
+ "uri" : cid,
+ },
+ },)
+
+
+class BlueskyUserExtractor(BlueskyExtractor):
+ subcategory = "user"
+ pattern = USER_PATTERN + r"$"
+ example = "https://bsky.app/profile/HANDLE"
+
+ def initialize(self):
+ pass
+
+ def items(self):
+ base = "{}/profile/{}/".format(self.root, self.user)
+ return self._dispatch_extractors((
+ (BlueskyAvatarExtractor , base + "avatar"),
+ (BlueskyBackgroundExtractor, base + "banner"),
+ (BlueskyPostsExtractor , base + "posts"),
+ (BlueskyRepliesExtractor , base + "replies"),
+ (BlueskyMediaExtractor , base + "media"),
+ (BlueskyLikesExtractor , base + "likes"),
+ ), ("media",))
+
+
+class BlueskyPostsExtractor(BlueskyExtractor):
+ subcategory = "posts"
+ pattern = USER_PATTERN + r"/posts"
+ example = "https://bsky.app/profile/HANDLE/posts"
+
+ def posts(self):
+ return self.api.get_author_feed(self.user, "posts_and_author_threads")
+
+
+class BlueskyRepliesExtractor(BlueskyExtractor):
+ subcategory = "replies"
+ pattern = USER_PATTERN + r"/replies"
+ example = "https://bsky.app/profile/HANDLE/replies"
+
+ def posts(self):
+ return self.api.get_author_feed(self.user, "posts_with_replies")
+
+
+class BlueskyMediaExtractor(BlueskyExtractor):
+ subcategory = "media"
+ pattern = USER_PATTERN + r"/media"
+ example = "https://bsky.app/profile/HANDLE/media"
+
+ def posts(self):
+ return self.api.get_author_feed(self.user, "posts_with_media")
+
+
+class BlueskyLikesExtractor(BlueskyExtractor):
+ subcategory = "likes"
+ pattern = USER_PATTERN + r"/likes"
+ example = "https://bsky.app/profile/HANDLE/likes"
+
+ def posts(self):
+ return self.api.get_actor_likes(self.user)
+
+
+class BlueskyFeedExtractor(BlueskyExtractor):
+ subcategory = "feed"
+ pattern = USER_PATTERN + r"/feed/([^/?#]+)"
+ example = "https://bsky.app/profile/HANDLE/feed/NAME"
+
+ def __init__(self, match):
+ BlueskyExtractor.__init__(self, match)
+ self.feed = match.group(2)
+
+ def posts(self):
+ return self.api.get_feed(self.user, self.feed)
+
+
+class BlueskyListExtractor(BlueskyExtractor):
+ subcategory = "list"
+ pattern = USER_PATTERN + r"/lists/([^/?#]+)"
+ example = "https://bsky.app/profile/HANDLE/lists/ID"
+
+ def __init__(self, match):
+ BlueskyExtractor.__init__(self, match)
+ self.list = match.group(2)
+
+ def posts(self):
+ return self.api.get_list_feed(self.user, self.list)
+
+
+class BlueskyFollowingExtractor(BlueskyExtractor):
+ subcategory = "following"
+ pattern = USER_PATTERN + r"/follows"
+ example = "https://bsky.app/profile/HANDLE/follows"
+
+ def items(self):
+ for user in self.api.get_follows(self.user):
+ url = "https://bsky.app/profile/" + user["did"]
+ yield Message.Queue, url, user
+
+
+class BlueskyPostExtractor(BlueskyExtractor):
+ subcategory = "post"
+ pattern = USER_PATTERN + r"/post/([^/?#]+)"
+ example = "https://bsky.app/profile/HANDLE/post/ID"
+
+ def __init__(self, match):
+ BlueskyExtractor.__init__(self, match)
+ self.post_id = match.group(2)
+
+ def posts(self):
+ return self.api.get_post_thread(self.user, self.post_id)
+
+
+class BlueskyAvatarExtractor(BlueskyExtractor):
+ subcategory = "avatar"
+ filename_fmt = "avatar_{post_id}.{extension}"
+ pattern = USER_PATTERN + r"/avatar"
+ example = "https://bsky.app/profile/HANDLE/avatar"
+
+ def posts(self):
+ return self._make_post(self.user, "avatar")
+
+
+class BlueskyBackgroundExtractor(BlueskyExtractor):
+ subcategory = "background"
+ filename_fmt = "background_{post_id}.{extension}"
+ pattern = USER_PATTERN + r"/ba(?:nner|ckground)"
+ example = "https://bsky.app/profile/HANDLE/banner"
+
+ def posts(self):
+ return self._make_post(self.user, "banner")
+
+
+class BlueskySearchExtractor(BlueskyExtractor):
+ subcategory = "search"
+ pattern = BASE_PATTERN + r"/search(?:/|\?q=)(.+)"
+ example = "https://bsky.app/search?q=QUERY"
+
+ def posts(self):
+ return self.api.search_posts(self.user)
+
+
+class BlueskyAPI():
+ """Interface for the Bluesky API
+
+ https://www.docs.bsky.app/docs/category/http-reference
+ """
+
+ def __init__(self, extractor):
+ self.extractor = extractor
+ self.log = extractor.log
+ self.headers = {"Accept": "application/json"}
+
+ self.username, self.password = extractor._get_auth_info()
+ if self.username:
+ self.root = "https://bsky.social"
+ else:
+ self.root = "https://api.bsky.app"
+ self.authenticate = util.noop
+
+ def get_actor_likes(self, actor):
+ endpoint = "app.bsky.feed.getActorLikes"
+ params = {
+ "actor": self._did_from_actor(actor),
+ "limit": "100",
+ }
+ return self._pagination(endpoint, params)
+
+ def get_author_feed(self, actor, filter="posts_and_author_threads"):
+ endpoint = "app.bsky.feed.getAuthorFeed"
+ params = {
+ "actor" : self._did_from_actor(actor),
+ "filter": filter,
+ "limit" : "100",
+ }
+ return self._pagination(endpoint, params)
+
+ def get_feed(self, actor, feed):
+ endpoint = "app.bsky.feed.getFeed"
+ params = {
+ "feed" : "at://{}/app.bsky.feed.generator/{}".format(
+ self._did_from_actor(actor), feed),
+ "limit": "100",
+ }
+ return self._pagination(endpoint, params)
+
+ def get_follows(self, actor):
+ endpoint = "app.bsky.graph.getFollows"
+ params = {
+ "actor": self._did_from_actor(actor),
+ "limit": "100",
+ }
+ return self._pagination(endpoint, params, "follows")
+
+ def get_list_feed(self, actor, list):
+ endpoint = "app.bsky.feed.getListFeed"
+ params = {
+ "list" : "at://{}/app.bsky.graph.list/{}".format(
+ self._did_from_actor(actor), list),
+ "limit": "100",
+ }
+ return self._pagination(endpoint, params)
+
+ def get_post_thread(self, actor, post_id):
+ endpoint = "app.bsky.feed.getPostThread"
+ params = {
+ "uri": "at://{}/app.bsky.feed.post/{}".format(
+ self._did_from_actor(actor), post_id),
+ "depth" : self.extractor.config("depth", "0"),
+ "parentHeight": "0",
+ }
+
+ thread = self._call(endpoint, params)["thread"]
+ if "replies" not in thread:
+ return (thread,)
+
+ index = 0
+ posts = [thread]
+ while index < len(posts):
+ post = posts[index]
+ if "replies" in post:
+ posts.extend(post["replies"])
+ index += 1
+ return posts
+
+ @memcache(keyarg=1)
+ def get_profile(self, did):
+ endpoint = "app.bsky.actor.getProfile"
+ params = {"actor": did}
+ return self._call(endpoint, params)
+
+ @memcache(keyarg=1)
+ def resolve_handle(self, handle):
+ endpoint = "com.atproto.identity.resolveHandle"
+ params = {"handle": handle}
+ return self._call(endpoint, params)["did"]
+
+ def search_posts(self, query):
+ endpoint = "app.bsky.feed.searchPosts"
+ params = {
+ "q" : query,
+ "limit": "100",
+ }
+ return self._pagination(endpoint, params, "posts")
+
+ def _did_from_actor(self, actor):
+ if actor.startswith("did:"):
+ did = actor
+ else:
+ did = self.resolve_handle(actor)
+
+ if self.extractor._metadata_user:
+ self.extractor._user = self.get_profile(did)
+
+ return did
+
+ def authenticate(self):
+ self.headers["Authorization"] = self._authenticate_impl(self.username)
+
+ @cache(maxage=3600, keyarg=1)
+ def _authenticate_impl(self, username):
+ refresh_token = _refresh_token_cache(username)
+
+ if refresh_token:
+ self.log.info("Refreshing access token for %s", username)
+ endpoint = "com.atproto.server.refreshSession"
+ headers = {"Authorization": "Bearer " + refresh_token}
+ data = None
+ else:
+ self.log.info("Logging in as %s", username)
+ endpoint = "com.atproto.server.createSession"
+ headers = None
+ data = {
+ "identifier": username,
+ "password" : self.password,
+ }
+
+ url = "{}/xrpc/{}".format(self.root, endpoint)
+ response = self.extractor.request(
+ url, method="POST", headers=headers, json=data, fatal=None)
+ data = response.json()
+
+ if response.status_code != 200:
+ self.log.debug("Server response: %s", data)
+ raise exception.AuthenticationError('"{}: {}"'.format(
+ data.get("error"), data.get("message")))
+
+ _refresh_token_cache.update(self.username, data["refreshJwt"])
+ return "Bearer " + data["accessJwt"]
+
+ def _call(self, endpoint, params):
+ url = "{}/xrpc/{}".format(self.root, endpoint)
+
+ while True:
+ self.authenticate()
+ response = self.extractor.request(
+ url, params=params, headers=self.headers, fatal=None)
+
+ if response.status_code < 400:
+ return response.json()
+ if response.status_code == 429:
+ self.extractor.wait(seconds=60)
+ continue
+
+ self.extractor.log.debug("Server response: %s", response.text)
+ raise exception.StopExtraction(
+ "API request failed (%s %s)",
+ response.status_code, response.reason)
+
+ def _pagination(self, endpoint, params, key="feed"):
+ while True:
+ data = self._call(endpoint, params)
+ yield from data[key]
+
+ cursor = data.get("cursor")
+ if not cursor:
+ return
+ params["cursor"] = cursor
+
+
+@cache(maxage=84*86400, keyarg=0)
+def _refresh_token_cache(username):
+ return None