aboutsummaryrefslogtreecommitdiffstats
path: root/gallery_dl/extractor/facebook.py
diff options
context:
space:
mode:
Diffstat (limited to 'gallery_dl/extractor/facebook.py')
-rw-r--r--gallery_dl/extractor/facebook.py447
1 files changed, 447 insertions, 0 deletions
diff --git a/gallery_dl/extractor/facebook.py b/gallery_dl/extractor/facebook.py
new file mode 100644
index 0000000..04acfc5
--- /dev/null
+++ b/gallery_dl/extractor/facebook.py
@@ -0,0 +1,447 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.facebook.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+
+BASE_PATTERN = r"(?:https?://)?(?:[\w-]+\.)?facebook\.com"  # optional scheme and optional single subdomain label; prefix of every extractor 'pattern' in this file
+
+
class FacebookExtractor(Extractor):
    """Base class for Facebook extractors"""
    category = "facebook"
    root = "https://www.facebook.com"
    directory_fmt = ("{category}", "{username}", "{title} ({set_id})")
    filename_fmt = "{id}.{extension}"
    archive_fmt = "{id}.{extension}"

    # URL templates for a media set page and for a photo viewed inside a set
    set_url_fmt = root + "/media/set/?set={set_id}"
    photo_url_fmt = root + "/photo/?fbid={photo_id}&set={set_id}"

    def _init(self):
        """Set browser-like request headers and read extractor options."""
        headers = self.session.headers
        headers["Accept"] = (
            "text/html,application/xhtml+xml,application/xml;q=0.9,"
            "image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8"
        )
        headers["Sec-Fetch-Dest"] = "empty"
        headers["Sec-Fetch-Mode"] = "navigate"
        headers["Sec-Fetch-Site"] = "same-origin"

        self.fallback_retries = self.config("fallback-retries", 2)
        self.videos = self.config("videos", True)
        self.author_followups = self.config("author-followups", False)

    @staticmethod
    def decode_all(txt):
        """Decode backslash escapes in 'txt' and unescape the result.

        The string is decoded with the 'unicode_escape' codec, passed
        through text.unescape(), and every remaining '\\/' is replaced
        by '/'.
        """
        decoded = (
            # 'unicode_escape' reads its input bytes as Latin-1, so encode
            # accordingly; characters outside Latin-1 become '\uXXXX'
            # escapes which the decoder then restores. Encoding as UTF-8
            # here would turn raw non-ASCII characters into mojibake.
            txt.encode("latin-1", "backslashreplace")
            .decode("unicode_escape")
            # '\ud83d\ude00'-style escapes decode to two separate
            # surrogates; a UTF-16 round trip joins each valid pair into
            # one code point while leaving unpaired surrogates untouched.
            .encode("utf-16-le", "surrogatepass")
            .decode("utf-16-le", "surrogatepass")
        )
        return text.unescape(decoded).replace("\\/", "/")

    @staticmethod
    def parse_set_page(set_page):
        """Extract directory metadata from the source of a media set page.

        Returns a dict with the keys 'set_id', 'username', 'user_id',
        'title' and 'first_photo_id'. Several fields try a second marker
        when the first one yields a falsy result.
        """
        directory = {
            "set_id": text.extr(
                set_page, '"mediaSetToken":"', '"'
            ) or text.extr(
                set_page, '"mediasetToken":"', '"'
            ),
            "username": FacebookExtractor.decode_all(
                text.extr(
                    set_page, '"user":{"__isProfile":"User","name":"', '","'
                ) or text.extr(
                    set_page, '"actors":[{"__typename":"User","name":"', '","'
                )
            ),
            "user_id": text.extr(
                set_page, '"owner":{"__typename":"User","id":"', '"'
            ),
            "title": FacebookExtractor.decode_all(text.extr(
                set_page, '"title":{"text":"', '"'
            )),
            # the ID is the last '"id":"' value inside the first Photo node
            "first_photo_id": text.extr(
                set_page,
                '{"__typename":"Photo","__isMedia":"Photo","',
                '","creation_story"'
            ).rsplit('"id":"', 1)[-1] or
            text.extr(
                set_page, '{"__typename":"Photo","id":"', '"'
            )
        }

        return directory

    @staticmethod
    def parse_photo_page(photo_page):
        """Extract photo metadata from the source of a photo page.

        Returns a dict with 'id', 'set_id', 'username', 'user_id',
        'caption', 'date', 'url', 'next_photo_id', the filename fields
        added by text.nameext_from_url(), and 'followups_ids': the IDs of
        photos found in comment nodes flagged as written by the original
        poster.
        """
        photo = {
            "id": text.extr(
                photo_page, '"__isNode":"Photo","id":"', '"'
            ),
            "set_id": text.extr(
                photo_page,
                '"url":"https:\\/\\/www.facebook.com\\/photo\\/?fbid=',
                '"'
            ).rsplit("&set=", 1)[-1],
            "username": FacebookExtractor.decode_all(text.extr(
                photo_page, '"owner":{"__typename":"User","name":"', '"'
            )),
            "user_id": text.extr(
                photo_page, '"owner":{"__typename":"User","id":"', '"'
            ),
            "caption": FacebookExtractor.decode_all(text.extr(
                photo_page,
                '"message":{"delight_ranges"',
                '"},"message_preferred_body"'
            ).rsplit('],"text":"', 1)[-1]),
            "date": text.parse_timestamp(text.extr(
                photo_page, '\\"publish_time\\":', ','
            )),
            "url": FacebookExtractor.decode_all(text.extr(
                photo_page, ',"image":{"uri":"', '","'
            )),
            "next_photo_id": text.extr(
                photo_page,
                '"nextMediaAfterNodeId":{"__typename":"Photo","id":"',
                '"'
            ) or text.extr(
                photo_page,
                '"nextMedia":{"edges":[{"node":{"__typename":"Photo","id":"',
                '"'
            )
        }

        text.nameext_from_url(photo["url"], photo)

        photo_marker = '{"__typename":"Photo","id":"'
        photo["followups_ids"] = [
            text.extr(comment_raw, photo_marker, '"')
            for comment_raw in text.extract_iter(
                photo_page, '{"node":{"id"', '"cursor":null}'
            )
            if '"is_author_original_poster":true' in comment_raw and
            photo_marker in comment_raw
        ]

        return photo

    @staticmethod
    def parse_post_page(post_page):
        """Extract the media set ID from the source of a post page.

        Returns a dict with the single key 'set_id', taken from the
        'mediaset_token' field or, failing that, from the 'set=' query
        parameter of the first photo URL in the post.
        """
        first_photo_url = text.extr(
            text.extr(
                post_page, '"__isMedia":"Photo"', '"target_group"'
            ), '"url":"', ','
        )

        post = {
            "set_id": text.extr(post_page, '{"mediaset_token":"', '"') or
            text.extr(first_photo_url, 'set=', '"').rsplit("&", 1)[0]
        }

        return post

    @staticmethod
    def parse_video_page(video_page):
        """Extract video and audio metadata from the source of a video page.

        Returns a '(video, audio)' tuple of dicts. 'audio' is a copy of
        the base video metadata with its own 'url' and 'type'. 'video'
        additionally gets 'urls', a mapping of quality label to URL, and,
        only when at least one such URL was found, 'url' plus filename
        fields. Callers must check for a missing 'url' key in 'video'.
        """
        video = {
            "id": text.extr(
                video_page, '\\"video_id\\":\\"', '\\"'
            ),
            "username": FacebookExtractor.decode_all(text.extr(
                video_page, '"actors":[{"__typename":"User","name":"', '","'
            )),
            "user_id": text.extr(
                video_page, '"owner":{"__typename":"User","id":"', '"'
            ),
            "date": text.parse_timestamp(text.extr(
                video_page, '\\"publish_time\\":', ','
            )),
            "type": "video"
        }

        if not video["username"]:
            # fall back to the User node that carries the owner's ID
            video["username"] = FacebookExtractor.decode_all(text.extr(
                video_page,
                '"__typename":"User","id":"' + video["user_id"] + '","name":"',
                '","'
            ))

        # everything up to the end of the first DASH manifest on the page
        first_video_raw = text.extr(
            video_page, '"permalink_url"', '\\/Period>\\u003C\\/MPD>'
        )

        # built before 'urls' is added, so audio only gets the base fields
        audio = {
            **video,
            "url": FacebookExtractor.decode_all(text.extr(
                text.extr(
                    first_video_raw,
                    "AudioChannelConfiguration",
                    "BaseURL>\\u003C"
                ),
                "BaseURL>", "\\u003C\\/"
            )),
            "type": "audio"
        }

        video["urls"] = {}

        for raw_url in text.extract_iter(
            first_video_raw, 'FBQualityLabel=\\"', '\\u003C\\/BaseURL>'
        ):
            resolution = raw_url.split('\\"', 1)[0]
            video["urls"][resolution] = FacebookExtractor.decode_all(
                raw_url.split('BaseURL>', 1)[1]
            )

        if not video["urls"]:
            return video, audio

        # pick the entry whose label, minus its last character, parses to
        # the largest integer
        video["url"] = max(
            video["urls"].items(),
            key=lambda x: text.parse_int(x[0][:-1])
        )[1]

        text.nameext_from_url(video["url"], video)
        audio["filename"] = video["filename"]
        audio["extension"] = "m4a"

        return video, audio

    def photo_page_request_wrapper(self, url, **kwargs):
        """Request a photo page and raise on login redirects and blocks.

        Raises exception.AuthenticationError when the response URL is
        under the login path, and exception.StopExtraction when the
        response body contains the error-page marker. Both messages
        include a resume URL unless 'url' ends with an empty '&set='.
        Otherwise the response is returned.
        """
        left_off_txt = "" if url.endswith("&set=") else (
            "\nYou can use this URL to continue from "
            "where you left off (added \"&setextract\"): "
            "\n" + url + "&setextract"
        )

        res = self.request(url, **kwargs)

        if res.url.startswith(self.root + "/login"):
            raise exception.AuthenticationError(
                "You must be logged in to continue viewing images." +
                left_off_txt
            )

        if b'{"__dr":"CometErrorRoot.react"}' in res.content:
            raise exception.StopExtraction(
                "You've been temporarily blocked from viewing images. "
                "\nPlease try using a different account, "
                "using a VPN or waiting before you retry." +
                left_off_txt
            )

        return res

    def extract_set(self, first_photo_id, set_id):
        """Yield 'Message.Url' tuples for the photos of a set.

        Starts at 'first_photo_id' and follows each photo page's
        'next_photo_id' until it is empty or already known. With the
        'author-followups' option enabled, photo IDs found in the
        original poster's comments are queued as well. A photo page
        without a download URL is retried up to 'fallback-retries' times
        and then skipped.
        """
        all_photo_ids = [first_photo_id]
        # mirrors 'all_photo_ids' for constant-time membership tests
        seen_photo_ids = {first_photo_id}

        retries = 0
        i = 0

        while i < len(all_photo_ids):
            photo_id = all_photo_ids[i]
            photo_url = self.photo_url_fmt.format(
                photo_id=photo_id, set_id=set_id
            )
            photo_page = self.photo_page_request_wrapper(photo_url).text

            photo = self.parse_photo_page(photo_page)
            photo["set_id"] = set_id
            photo["num"] = i + 1

            if self.author_followups:
                for followup_id in photo["followups_ids"]:
                    if followup_id not in seen_photo_ids:
                        self.log.debug(
                            "Found a followup in comments: %s", followup_id
                        )
                        seen_photo_ids.add(followup_id)
                        all_photo_ids.append(followup_id)

            if not photo["url"]:
                if retries < self.fallback_retries and self._interval_429:
                    seconds = self._interval_429()
                    self.log.warning(
                        "Failed to find photo download URL for %s. "
                        "Retrying in %s seconds.", photo_url, seconds,
                    )
                    self.wait(seconds=seconds, reason="429 Too Many Requests")
                    retries += 1
                    # 'i' is not advanced: request the same photo again
                    continue
                self.log.error(
                    "Failed to find photo download URL for %s. Skipping.",
                    photo_url,
                )
                retries = 0
            else:
                retries = 0
                yield Message.Url, photo["url"], photo

            next_photo_id = photo["next_photo_id"]
            if next_photo_id == "":
                self.log.debug(
                    "Can't find next image in the set. "
                    "Extraction is over."
                )
            elif next_photo_id in seen_photo_ids:
                if next_photo_id != photo["id"]:
                    self.log.debug(
                        "Detected a loop in the set, it's likely finished. "
                        "Extraction is over."
                    )
            else:
                seen_photo_ids.add(next_photo_id)
                all_photo_ids.append(next_photo_id)

            i += 1
+
+
class FacebookSetExtractor(FacebookExtractor):
    """Extractor for Facebook media sets"""
    subcategory = "set"
    # Capture groups:
    #   1: set ID from a '/media/set' or '/photo' URL without '&setextract'
    #   2: '<name>/posts/<id>' path of a post
    #   3, 4: photo ID and set ID from a '&setextract' resume URL
    pattern = (
        BASE_PATTERN +
        r"/(?:(?:media/set|photo)/?\?(?:[^&#]+&)*set=([^&#]+)"
        r"[^/?#]*(?<!&setextract)$"
        r"|([^/?#]+/posts/[^/?#]+)"
        r"|photo/\?(?:[^&#]+&)*fbid=([^/?&#]+)&set=([^/?&#]+)&setextract)"
    )
    example = "https://www.facebook.com/media/set/?set=SET_ID"

    def items(self):
        """Yield the set's directory message followed by its photos."""
        groups = self.groups
        post_path = groups[1]

        if post_path:
            # a post URL carries no set ID; read it from the post page
            post_page = self.request(self.root + "/" + post_path).text
            set_id = self.parse_post_page(post_page)["set_id"]
        else:
            set_id = groups[0] or groups[3]

        set_page = self.request(
            self.set_url_fmt.format(set_id=set_id)
        ).text
        directory = self.parse_set_page(set_page)

        yield Message.Directory, directory

        # a resume URL names the photo to start from
        start_photo_id = groups[2] or directory["first_photo_id"]
        yield from self.extract_set(start_photo_id, directory["set_id"])
+
+
class FacebookPhotoExtractor(FacebookExtractor):
    """Extractor for individual Facebook photos"""
    subcategory = "photo"
    # The single capture group is the photo ID. The dot in 'photo.php' is
    # escaped so that it only matches a literal '.'.
    pattern = (
        BASE_PATTERN +
        r"/(?:[^/?#]+/photos/[^/?#]+/|photo(?:\.php)?/?\?"
        r"(?:[^&#]+&)*fbid=)([^/?&#]+)[^/?#]*(?<!&setextract)$"
    )
    example = "https://www.facebook.com/photo/?fbid=PHOTO_ID"

    def items(self):
        """Yield the directory, the photo and, optionally, its followups.

        With the 'author-followups' option enabled, every photo ID found
        in the original poster's comments is fetched and yielded too.
        Photos whose page contains no download URL are logged and
        skipped instead of being yielded with an empty URL.
        """
        photo_id = self.groups[0]
        photo_url = self.photo_url_fmt.format(photo_id=photo_id, set_id="")
        photo_page = self.photo_page_request_wrapper(photo_url).text

        num = 1
        photo = self.parse_photo_page(photo_page)
        photo["num"] = num

        set_page = self.request(
            self.set_url_fmt.format(set_id=photo["set_id"])
        ).text

        directory = self.parse_set_page(set_page)

        yield Message.Directory, directory

        if photo["url"]:
            yield Message.Url, photo["url"], photo
        else:
            self.log.error(
                "Failed to find photo download URL for %s. Skipping.",
                photo_url,
            )

        if not self.author_followups:
            return

        for comment_photo_id in photo["followups_ids"]:
            comment_photo_url = self.photo_url_fmt.format(
                photo_id=comment_photo_id, set_id=""
            )
            comment_photo = self.parse_photo_page(
                self.photo_page_request_wrapper(comment_photo_url).text
            )
            # skipped photos still consume a number
            num += 1
            comment_photo["num"] = num

            if comment_photo["url"]:
                yield Message.Url, comment_photo["url"], comment_photo
            else:
                self.log.error(
                    "Failed to find photo download URL for %s. Skipping.",
                    comment_photo_url,
                )
+
+
class FacebookVideoExtractor(FacebookExtractor):
    """Extractor for individual Facebook videos"""
    subcategory = "video"
    directory_fmt = ("{category}", "{username}", "{subcategory}")
    pattern = BASE_PATTERN + r"/(?:[^/?#]+/videos/|watch/?\?v=)([^/?&#]+)"
    example = "https://www.facebook.com/watch/?v=VIDEO_ID"

    def items(self):
        """Yield the directory and download messages for one video.

        Yields nothing when no video URL was found on the page. The
        'videos' option selects between a 'ytdl:' URL, the direct video
        and audio URLs, or no download at all.
        """
        video_url = self.root + "/watch/?v=" + self.groups[0]
        video, audio = self.parse_video_page(self.request(video_url).text)

        # parse_video_page() only sets 'url' when it found a video URL
        if "url" not in video:
            return

        yield Message.Directory, video

        mode = self.videos
        if mode == "ytdl":
            yield Message.Url, "ytdl:" + video_url, video
            return
        if not mode:
            return

        yield Message.Url, video["url"], video
        if audio["url"]:
            yield Message.Url, audio["url"], audio
+
+
class FacebookProfileExtractor(FacebookExtractor):
    """Extractor for the photos set of a Facebook profile"""
    subcategory = "profile"
    # The lookahead rejects paths claimed by the other extractors. The dot
    # in 'photo.php' is escaped, like the one in 'profile.php' below, so
    # that it only matches a literal '.'.
    pattern = (
        BASE_PATTERN +
        r"/(?!media/|photo/|photo\.php|watch/)"
        r"(?:profile\.php\?id=|people/[^/?#]+/)?"
        r"([^/?&#]+)(?:/photos(?:_by)?|/videos|/posts)?/?(?:$|\?|#)"
    )
    example = "https://www.facebook.com/USERNAME"

    @staticmethod
    def get_profile_photos_set_id(profile_photos_page):
        """Extract a set ID from the source of a profile's photos page.

        Only the text between '"pageItems"' and '"page_info"' is
        searched: first for a 'set=' query parameter, then for the path
        segment following an escaped '/photos/'.
        """
        set_ids_raw = text.extr(
            profile_photos_page, '"pageItems"', '"page_info"'
        )

        set_id = text.extr(
            set_ids_raw, 'set=', '"'
        ).rsplit("&", 1)[0] or text.extr(
            set_ids_raw, '\\/photos\\/', '\\/'
        )

        return set_id

    def items(self):
        """Yield the directory and photos of the profile's photos set.

        Logs a debug message and yields nothing when no set ID can be
        found on the profile's '/photos_by' page.
        """
        profile_photos_url = (
            self.root + "/" + self.groups[0] + "/photos_by"
        )
        profile_photos_page = self.request(profile_photos_url).text

        set_id = self.get_profile_photos_set_id(profile_photos_page)

        if not set_id:
            self.log.debug("Profile photos set ID not found.")
            return

        set_url = self.set_url_fmt.format(set_id=set_id)
        set_page = self.request(set_url).text

        directory = self.parse_set_page(set_page)

        yield Message.Directory, directory

        yield from self.extract_set(
            directory["first_photo_id"], directory["set_id"]
        )