diff options
| author | 2025-08-16 07:00:40 -0400 | |
|---|---|---|
| committer | 2025-08-16 07:00:40 -0400 | |
| commit | 22e8d9823eb9fb802c926fb03a5fdccbea26f878 (patch) | |
| tree | d399937a3bf139d386b8f5df2fc646b751c14719 /gallery_dl/extractor/facebook.py | |
| parent | 0839cde5064bd6000162ee23b8445b99afe10068 (diff) | |
| parent | 3d18761f620a294ea6c5bff13c5994b93b29f3ed (diff) | |
Update upstream source from tag 'upstream/1.30.3'
Update to upstream version '1.30.3'
with Debian dir cbd3490f51b0ee3f2e172965318cd079b856367d
Diffstat (limited to 'gallery_dl/extractor/facebook.py')
| -rw-r--r-- | gallery_dl/extractor/facebook.py | 129 |
1 files changed, 115 insertions, 14 deletions
diff --git a/gallery_dl/extractor/facebook.py b/gallery_dl/extractor/facebook.py index 069ed99..f9ed1ab 100644 --- a/gallery_dl/extractor/facebook.py +++ b/gallery_dl/extractor/facebook.py @@ -7,7 +7,7 @@ """Extractors for https://www.facebook.com/""" from .common import Extractor, Message, Dispatch -from .. import text, exception +from .. import text, util, exception from ..cache import memcache BASE_PATTERN = r"(?:https?://)?(?:[\w-]+\.)?facebook\.com" @@ -61,6 +61,7 @@ class FacebookExtractor(Extractor): "user_id": text.extr( set_page, '"owner":{"__typename":"User","id":"', '"' ), + "user_pfbid": "", "title": self.decode_all(text.extr( set_page, '"title":{"text":"', '"' )), @@ -74,6 +75,15 @@ class FacebookExtractor(Extractor): ) } + if directory["user_id"].startswith("pfbid"): + directory["user_pfbid"] = directory["user_id"] + directory["user_id"] = ( + text.extr( + set_page, '"actors":[{"__typename":"User","id":"', '"') or + text.extr( + set_page, '"userID":"', '"') or + directory["set_id"].split(".")[1]) + return directory def parse_photo_page(self, photo_page): @@ -92,6 +102,7 @@ class FacebookExtractor(Extractor): "user_id": text.extr( photo_page, '"owner":{"__typename":"User","id":"', '"' ), + "user_pfbid": "", "caption": self.decode_all(text.extr( photo_page, '"message":{"delight_ranges"', @@ -115,6 +126,11 @@ class FacebookExtractor(Extractor): ) } + if photo["user_id"].startswith("pfbid"): + photo["user_pfbid"] = photo["user_id"] + photo["user_id"] = text.extr( + photo_page, r'\"content_owner_id_new\":\"', r'\"') + text.nameext_from_url(photo["url"], photo) photo["followups_ids"] = [] @@ -296,21 +312,33 @@ class FacebookExtractor(Extractor): i += 1 @memcache(keyarg=1) - def _extract_profile_photos_page(self, profile): - profile_photos_url = f"{self.root}/{profile}/photos_by" + def _extract_profile(self, profile, set_id=False): + if set_id: + url = f"{self.root}/{profile}/photos_by" + else: + url = f"{self.root}/{profile}" + return self._extract_profile_page(url) + def _extract_profile_page(self, url): for _ in range(self.fallback_retries + 1): - profile_photos_page = self.request(profile_photos_url).text - if set_id := self._extract_profile_set_id(profile_photos_page): - break - self.log.debug("Got empty profile photos page, retrying...") - else: - raise exception.AbortExtraction("Failed to extract profile data") + page = self.request(url).text - avatar_page_url = text.extr( - profile_photos_page, ',"profilePhoto":{"url":"', '"') + if page.find('>Page Not Found</title>', 0, 3000) > 0: + break + if ('"props":{"title":"This content isn\'t available right now"' in + page): + raise exception.AuthRequired( + "authenticated cookies", "profile", + "This content isn't available right now") + + set_id = self._extract_profile_set_id(page) + user = self._extract_profile_user(page) + if set_id or user: + user["set_id"] = set_id + return user - return set_id, avatar_page_url.replace("\\/", "/") + self.log.debug("Got empty profile photos page, retrying...") + return {} def _extract_profile_set_id(self, profile_photos_page): set_ids_raw = text.extr( @@ -325,6 +353,28 @@ class FacebookExtractor(Extractor): return set_id + def _extract_profile_user(self, page): + data = text.extr(page, '","user":{"', '},"viewer":{') + + user = None + try: + user = util.json_loads(f'{{"{data}}}') + if user["id"].startswith("pfbid"): + user["user_pfbid"] = user["id"] + user["id"] = text.extr(page, '"userID":"', '"') + user["username"] = (text.extr(page, '"userVanity":"', '"') or + text.extr(page, '"vanity":"', '"')) + user["profile_tabs"] = [ + edge["node"] + for edge in (user["profile_tabs"]["profile_user"] + ["timeline_nav_app_sections"]["edges"]) + ] + except Exception: + if user is None: + self.log.debug("Failed to extract user data: %s", data) + user = {} + return user + class FacebookSetExtractor(FacebookExtractor): """Base class for Facebook Set extractors""" @@ -418,6 +468,51 @@ class FacebookVideoExtractor(FacebookExtractor): yield Message.Url, audio["url"], audio +class FacebookInfoExtractor(FacebookExtractor): + """Extractor for Facebook Profile data""" + subcategory = "info" + directory_fmt = ("{category}", "{username}") + pattern = USER_PATTERN + r"/info" + example = "https://www.facebook.com/USERNAME/info" + + def items(self): + user = self._extract_profile(self.groups[0]) + return iter(((Message.Directory, user),)) + + +class FacebookAlbumsExtractor(FacebookExtractor): + """Extractor for Facebook Profile albums""" + subcategory = "albums" + pattern = USER_PATTERN + r"/photos_albums(?:/([^/?#]+))?" + example = "https://www.facebook.com/USERNAME/photos_albums" + + def items(self): + profile, name = self.groups + url = f"{self.root}/{profile}/photos_albums" + page = self.request(url).text + + pos = page.find( + '"TimelineAppCollectionAlbumsRenderer","collection":{"id":"') + if pos < 0: + return + if name is not None: + name = name.lower() + + items = text.extract(page, '},"pageItems":', '}}},', pos)[0] + edges = util.json_loads(items + "}}")["edges"] + + # TODO: use /graphql API endpoint + for edge in edges: + node = edge["node"] + album = node["node"] + album["title"] = title = node["title"]["text"] + if name is not None and name != title.lower(): + continue + album["_extractor"] = FacebookSetExtractor + album["thumbnail"] = (img := node["image"]) and img["uri"] + yield Message.Queue, album["url"], album + + class FacebookPhotosExtractor(FacebookExtractor): """Extractor for Facebook Profile Photos""" subcategory = "photos" @@ -425,7 +520,10 @@ class FacebookPhotosExtractor(FacebookExtractor): example = "https://www.facebook.com/USERNAME/photos" def items(self): - set_id = self._extract_profile_photos_page(self.groups[0])[0] + set_id = self._extract_profile(self.groups[0], True)["set_id"] + if not set_id: + return iter(()) + set_url = f"{self.root}/media/set/?set={set_id}" set_page = self.request(set_url).text set_data = self.parse_set_page(set_page) @@ -439,7 +537,8 @@ class FacebookAvatarExtractor(FacebookExtractor): example = "https://www.facebook.com/USERNAME/avatar" def items(self): - avatar_page_url = self._extract_profile_photos_page(self.groups[0])[1] + user = self._extract_profile(self.groups[0]) + avatar_page_url = user["profilePhoto"]["url"] avatar_page = self.photo_page_request_wrapper(avatar_page_url).text avatar = self.parse_photo_page(avatar_page) @@ -462,6 +561,8 @@ class FacebookUserExtractor(Dispatch, FacebookExtractor): def items(self): base = f"{self.root}/{self.groups[0]}/" return self._dispatch_extractors(( + (FacebookInfoExtractor , base + "info"), (FacebookAvatarExtractor, base + "avatar"), (FacebookPhotosExtractor, base + "photos"), + (FacebookAlbumsExtractor, base + "photos_albums"), ), ("photos",)) |
