diff options
Diffstat (limited to 'gallery_dl/extractor/facebook.py')
| -rw-r--r-- | gallery_dl/extractor/facebook.py | 179 |
1 files changed, 102 insertions, 77 deletions
diff --git a/gallery_dl/extractor/facebook.py b/gallery_dl/extractor/facebook.py index b284ee8..069ed99 100644 --- a/gallery_dl/extractor/facebook.py +++ b/gallery_dl/extractor/facebook.py @@ -6,10 +6,14 @@ """Extractors for https://www.facebook.com/""" -from .common import Extractor, Message +from .common import Extractor, Message, Dispatch from .. import text, exception +from ..cache import memcache BASE_PATTERN = r"(?:https?://)?(?:[\w-]+\.)?facebook\.com" +USER_PATTERN = (BASE_PATTERN + + r"/(?!media/|photo/|photo.php|watch/)" + r"(?:profile\.php\?id=|people/[^/?#]+/)?([^/?&#]+)") class FacebookExtractor(Extractor): @@ -20,9 +24,6 @@ class FacebookExtractor(Extractor): filename_fmt = "{id}.{extension}" archive_fmt = "{id}.{extension}" - set_url_fmt = root + "/media/set/?set={set_id}" - photo_url_fmt = root + "/photo/?fbid={photo_id}&set={set_id}" - def _init(self): headers = self.session.headers headers["Accept"] = ( @@ -37,22 +38,20 @@ class FacebookExtractor(Extractor): self.videos = self.config("videos", True) self.author_followups = self.config("author-followups", False) - @staticmethod - def decode_all(txt): + def decode_all(self, txt): return text.unescape( txt.encode().decode("unicode_escape") .encode("utf_16", "surrogatepass").decode("utf_16") ).replace("\\/", "/") - @staticmethod - def parse_set_page(set_page): + def parse_set_page(self, set_page): directory = { "set_id": text.extr( set_page, '"mediaSetToken":"', '"' ) or text.extr( set_page, '"mediasetToken":"', '"' ), - "username": FacebookExtractor.decode_all( + "username": self.decode_all( text.extr( set_page, '"user":{"__isProfile":"User","name":"', '","' ) or text.extr( @@ -62,7 +61,7 @@ class FacebookExtractor(Extractor): "user_id": text.extr( set_page, '"owner":{"__typename":"User","id":"', '"' ), - "title": FacebookExtractor.decode_all(text.extr( + "title": self.decode_all(text.extr( set_page, '"title":{"text":"', '"' )), "first_photo_id": text.extr( @@ -77,8 +76,7 @@ class FacebookExtractor(Extractor): return directory - @staticmethod - def parse_photo_page(photo_page): + def parse_photo_page(self, photo_page): photo = { "id": text.extr( photo_page, '"__isNode":"Photo","id":"', '"' @@ -88,13 +86,13 @@ class FacebookExtractor(Extractor): '"url":"https:\\/\\/www.facebook.com\\/photo\\/?fbid=', '"' ).rsplit("&set=", 1)[-1], - "username": FacebookExtractor.decode_all(text.extr( + "username": self.decode_all(text.extr( photo_page, '"owner":{"__typename":"User","name":"', '"' )), "user_id": text.extr( photo_page, '"owner":{"__typename":"User","id":"', '"' ), - "caption": FacebookExtractor.decode_all(text.extr( + "caption": self.decode_all(text.extr( photo_page, '"message":{"delight_ranges"', '"},"message_preferred_body"' @@ -103,7 +101,7 @@ class FacebookExtractor(Extractor): text.extr(photo_page, '\\"publish_time\\":', ',') or text.extr(photo_page, '"created_time":', ',') ), - "url": FacebookExtractor.decode_all(text.extr( + "url": self.decode_all(text.extr( photo_page, ',"image":{"uri":"', '","' )), "next_photo_id": text.extr( @@ -133,8 +131,7 @@ class FacebookExtractor(Extractor): return photo - @staticmethod - def parse_post_page(post_page): + def parse_post_page(self, post_page): first_photo_url = text.extr( text.extr( post_page, '"__isMedia":"Photo"', '"target_group"' @@ -148,13 +145,12 @@ class FacebookExtractor(Extractor): return post - @staticmethod - def parse_video_page(video_page): + def parse_video_page(self, video_page): video = { "id": text.extr( video_page, '\\"video_id\\":\\"', '\\"' ), - "username": FacebookExtractor.decode_all(text.extr( + "username": self.decode_all(text.extr( video_page, '"actors":[{"__typename":"User","name":"', '","' )), "user_id": text.extr( @@ -167,7 +163,7 @@ class FacebookExtractor(Extractor): } if not video["username"]: - video["username"] = FacebookExtractor.decode_all(text.extr( + video["username"] = self.decode_all(text.extr( video_page, '"__typename":"User","id":"' + video["user_id"] + '","name":"', '","' @@ -179,7 +175,7 @@ class FacebookExtractor(Extractor): audio = { **video, - "url": FacebookExtractor.decode_all(text.extr( + "url": self.decode_all(text.extr( text.extr( first_video_raw, "AudioChannelConfiguration", @@ -196,7 +192,7 @@ class FacebookExtractor(Extractor): first_video_raw, 'FBQualityLabel=\\"', '\\u003C\\/BaseURL>' ): resolution = raw_url.split('\\"', 1)[0] - video["urls"][resolution] = FacebookExtractor.decode_all( + video["urls"][resolution] = self.decode_all( raw_url.split('BaseURL>', 1)[1] ) @@ -224,17 +220,16 @@ class FacebookExtractor(Extractor): res = self.request(url, **kwargs) if res.url.startswith(self.root + "/login"): - raise exception.AuthenticationError( - "You must be logged in to continue viewing images." + - LEFT_OFF_TXT + raise exception.AuthRequired( + message=(f"You must be logged in to continue viewing images." + f"{LEFT_OFF_TXT}") ) if b'{"__dr":"CometErrorRoot.react"}' in res.content: - raise exception.StopExtraction( - "You've been temporarily blocked from viewing images. " - "\nPlease try using a different account, " - "using a VPN or waiting before you retry." + - LEFT_OFF_TXT + raise exception.AbortExtraction( + f"You've been temporarily blocked from viewing images.\n" + f"Please try using a different account, " + f"using a VPN or waiting before you retry.{LEFT_OFF_TXT}" ) return res @@ -248,9 +243,7 @@ class FacebookExtractor(Extractor): while i < len(all_photo_ids): photo_id = all_photo_ids[i] - photo_url = self.photo_url_fmt.format( - photo_id=photo_id, set_id=set_id - ) + photo_url = f"{self.root}/photo/?fbid={photo_id}&set={set_id}" photo_page = self.photo_page_request_wrapper(photo_url).text photo = self.parse_photo_page(photo_page) @@ -302,6 +295,36 @@ class FacebookExtractor(Extractor): i += 1 + @memcache(keyarg=1) + def _extract_profile_photos_page(self, profile): + profile_photos_url = f"{self.root}/{profile}/photos_by" + + for _ in range(self.fallback_retries + 1): + profile_photos_page = self.request(profile_photos_url).text + if set_id := self._extract_profile_set_id(profile_photos_page): + break + self.log.debug("Got empty profile photos page, retrying...") + else: + raise exception.AbortExtraction("Failed to extract profile data") + + avatar_page_url = text.extr( + profile_photos_page, ',"profilePhoto":{"url":"', '"') + + return set_id, avatar_page_url.replace("\\/", "/") + + def _extract_profile_set_id(self, profile_photos_page): + set_ids_raw = text.extr( + profile_photos_page, '"pageItems"', '"page_info"' + ) + + set_id = text.extr( + set_ids_raw, 'set=', '"' + ).rsplit("&", 1)[0] or text.extr( + set_ids_raw, '\\/photos\\/', '\\/' + ) + + return set_id + class FacebookSetExtractor(FacebookExtractor): """Base class for Facebook Set extractors""" @@ -317,13 +340,12 @@ class FacebookSetExtractor(FacebookExtractor): def items(self): set_id = self.groups[0] or self.groups[3] - path = self.groups[1] - if path: + if path := self.groups[1]: post_url = self.root + "/" + path post_page = self.request(post_url).text set_id = self.parse_post_page(post_page)["set_id"] - set_url = self.set_url_fmt.format(set_id=set_id) + set_url = f"{self.root}/media/set/?set={set_id}" set_page = self.request(set_url).text set_data = self.parse_set_page(set_page) if self.groups[2]: @@ -342,16 +364,15 @@ class FacebookPhotoExtractor(FacebookExtractor): def items(self): photo_id = self.groups[0] - photo_url = self.photo_url_fmt.format(photo_id=photo_id, set_id="") + photo_url = f"{self.root}/photo/?fbid={photo_id}&set=" photo_page = self.photo_page_request_wrapper(photo_url).text i = 1 photo = self.parse_photo_page(photo_page) photo["num"] = i - set_page = self.request( - self.set_url_fmt.format(set_id=photo["set_id"]) - ).text + set_url = f"{self.root}/media/set/?set={photo['set_id']}" + set_page = self.request(set_url).text directory = self.parse_set_page(set_page) @@ -362,9 +383,7 @@ class FacebookPhotoExtractor(FacebookExtractor): for comment_photo_id in photo["followups_ids"]: comment_photo = self.parse_photo_page( self.photo_page_request_wrapper( - self.photo_url_fmt.format( - photo_id=comment_photo_id, set_id="" - ) + f"{self.root}/photo/?fbid={comment_photo_id}&set=" ).text ) i += 1 @@ -399,44 +418,50 @@ class FacebookVideoExtractor(FacebookExtractor): yield Message.Url, audio["url"], audio -class FacebookProfileExtractor(FacebookExtractor): - """Base class for Facebook Profile Photos Set extractors""" - subcategory = "profile" - pattern = ( - BASE_PATTERN + - r"/(?!media/|photo/|photo.php|watch/)" - r"(?:profile\.php\?id=|people/[^/?#]+/)?" - r"([^/?&#]+)(?:/photos(?:_by)?|/videos|/posts)?/?(?:$|\?|#)" - ) - example = "https://www.facebook.com/USERNAME" +class FacebookPhotosExtractor(FacebookExtractor): + """Extractor for Facebook Profile Photos""" + subcategory = "photos" + pattern = USER_PATTERN + r"/photos(?:_by)?" + example = "https://www.facebook.com/USERNAME/photos" - @staticmethod - def get_profile_photos_set_id(profile_photos_page): - set_ids_raw = text.extr( - profile_photos_page, '"pageItems"', '"page_info"' - ) + def items(self): + set_id = self._extract_profile_photos_page(self.groups[0])[0] + set_url = f"{self.root}/media/set/?set={set_id}" + set_page = self.request(set_url).text + set_data = self.parse_set_page(set_page) + return self.extract_set(set_data) - set_id = text.extr( - set_ids_raw, 'set=', '"' - ).rsplit("&", 1)[0] or text.extr( - set_ids_raw, '\\/photos\\/', '\\/' - ) - return set_id +class FacebookAvatarExtractor(FacebookExtractor): + """Extractor for Facebook Profile Avatars""" + subcategory = "avatar" + pattern = USER_PATTERN + r"/avatar" + example = "https://www.facebook.com/USERNAME/avatar" def items(self): - profile_photos_url = ( - self.root + "/" + self.groups[0] + "/photos_by" - ) - profile_photos_page = self.request(profile_photos_url).text + avatar_page_url = self._extract_profile_photos_page(self.groups[0])[1] + avatar_page = self.photo_page_request_wrapper(avatar_page_url).text - set_id = self.get_profile_photos_set_id(profile_photos_page) + avatar = self.parse_photo_page(avatar_page) + avatar["count"] = avatar["num"] = 1 + avatar["type"] = "avatar" - if set_id: - set_url = self.set_url_fmt.format(set_id=set_id) - set_page = self.request(set_url).text - set_data = self.parse_set_page(set_page) - return self.extract_set(set_data) + set_url = f"{self.root}/media/set/?set={avatar['set_id']}" + set_page = self.request(set_url).text + directory = self.parse_set_page(set_page) - self.log.debug("Profile photos set ID not found.") - return iter(()) + yield Message.Directory, directory + yield Message.Url, avatar["url"], avatar + + +class FacebookUserExtractor(Dispatch, FacebookExtractor): + """Extractor for Facebook Profiles""" + pattern = USER_PATTERN + r"/?(?:$|\?|#)" + example = "https://www.facebook.com/USERNAME" + + def items(self): + base = f"{self.root}/{self.groups[0]}/" + return self._dispatch_extractors(( + (FacebookAvatarExtractor, base + "avatar"), + (FacebookPhotosExtractor, base + "photos"), + ), ("photos",)) |
