diff options
Diffstat (limited to 'gallery_dl/extractor/flickr.py')
| -rw-r--r-- | gallery_dl/extractor/flickr.py | 503 |
1 files changed, 503 insertions, 0 deletions
# -*- coding: utf-8 -*-

# Copyright 2017-2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extract images from https://www.flickr.com/"""

from .common import Extractor, Message
from .. import text, oauth, util, exception


class FlickrExtractor(Extractor):
    """Base class for flickr extractors"""
    category = "flickr"
    filename_fmt = "{category}_{id}.{extension}"

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.api = FlickrAPI(self)
        # first capture group of the subclass pattern (user, group, ...)
        self.item_id = match.group(1)
        # filled in by metadata()
        self.user = None

    def items(self):
        """Yield a Directory message followed by one Url message per photo"""
        data = self.metadata()
        yield Message.Version, 1
        yield Message.Directory, data
        for photo in self.photos():
            # general metadata takes precedence over per-photo keys
            photo.update(data)
            url = photo["url"]
            yield Message.Url, url, text.nameext_from_url(url, photo)

    def metadata(self):
        """Return general metadata"""
        self.user = self.api.urls_lookupUser(self.item_id)
        return {"user": self.user}

    def photos(self):
        """Return an iterable with all relevant photo objects"""


class FlickrImageExtractor(FlickrExtractor):
    """Extractor for individual images from flickr.com"""
    subcategory = "image"
    archive_fmt = "{id}"
    pattern = (r"(?:https?://)?(?:"
               r"(?:(?:www\.|m\.)?flickr\.com/photos/[^/]+/"
               r"|[^.]+\.static\.?flickr\.com/(?:\d+/)+)(\d+)"
               r"|flic\.kr/p/([A-Za-z1-9]+))")
    test = (
        ("https://www.flickr.com/photos/departingyyz/16089302239", {
            "pattern": pattern,
            "content": "0821a28ee46386e85b02b67cf2720063440a228c",
            "keyword": {
                "comments": int,
                "description": str,
                "extension": "jpg",
                "filename": "16089302239_de18cd8017_b",
                "id": 16089302239,
                "height": 683,
                "label": "Large",
                "media": "photo",
                "url": str,
                "views": int,
                "width": 1024,
            },
        }),
        ("https://www.flickr.com/photos/145617051@N08/46733161535", {
            "count": 1,
            "keyword": {"media": "video"},
        }),
        ("http://c2.staticflickr.com/2/1475/24531000464_9a7503ae68_b.jpg", {
            "pattern": pattern}),
        ("https://farm2.static.flickr.com/1035/1188352415_cb139831d0.jpg", {
            "pattern": pattern}),
        ("https://flic.kr/p/FPVo9U", {
            "pattern": pattern}),
        ("https://www.flickr.com/photos/zzz/16089302238", {
            "exception": exception.NotFoundError}),
    )

    def __init__(self, match):
        FlickrExtractor.__init__(self, match)
        if not self.item_id:
            # flic.kr short URL: group 2 holds the ID encoded with
            # the 58-character alphabet below
            alphabet = ("123456789abcdefghijkmnopqrstu"
                        "vwxyzABCDEFGHJKLMNPQRSTUVWXYZ")
            self.item_id = util.bdecode(match.group(2), alphabet)

    def items(self):
        """Yield the messages for a single photo or video"""
        photo = self.api.photos_getInfo(self.item_id)

        if photo["media"] == "video" and self.api.videos:
            self.api._extract_video(photo)
        else:
            self.api._extract_photo(photo)

        # unwrap {"_content": ...} objects and convert numeric strings
        photo["title"] = photo["title"]["_content"]
        photo["comments"] = text.parse_int(photo["comments"]["_content"])
        photo["description"] = photo["description"]["_content"]
        photo["tags"] = [t["raw"] for t in photo["tags"]["tag"]]
        photo["date"] = text.parse_timestamp(photo["dateuploaded"])
        photo["views"] = text.parse_int(photo["views"])
        photo["id"] = text.parse_int(photo["id"])

        if "location" in photo:
            location = photo["location"]
            # only values are replaced, so the key set stays unchanged
            for key, value in location.items():
                if isinstance(value, dict):
                    location[key] = value["_content"]

        url = photo["url"]
        yield Message.Version, 1
        yield Message.Directory, photo
        yield Message.Url, url, text.nameext_from_url(url, photo)


class FlickrAlbumExtractor(FlickrExtractor):
    """Extractor for photo albums from flickr.com"""
    subcategory = "album"
    directory_fmt = ("{category}", "{subcategory}s",
                     "{album[id]} - {album[title]}")
    archive_fmt = "a_{album[id]}_{id}"
    pattern = (r"(?:https?://)?(?:www\.)?flickr\.com/"
               r"photos/([^/]+)/(?:album|set)s(?:/(\d+))?")
    test = (
        (("https://www.flickr.com/photos/shona_s/albums/72157633471741607"), {
            "pattern": FlickrImageExtractor.pattern,
            "count": 6,
        }),
        ("https://www.flickr.com/photos/shona_s/albums", {
            "pattern": pattern,
            "count": 2,
        }),
    )

    def __init__(self, match):
        FlickrExtractor.__init__(self, match)
        # None when the URL points to a user's album overview
        self.album_id = match.group(2)

    def items(self):
        """Yield photos of one album, or queue all albums of a user"""
        if self.album_id:
            return FlickrExtractor.items(self)
        return self._album_items()

    def _album_items(self):
        """Yield a Queue message for each album of the user"""
        yield Message.Version, 1
        data = FlickrExtractor.metadata(self)
        data["_extractor"] = FlickrAlbumExtractor

        for album in self.api.photosets_getList(self.user["nsid"]):
            self.api._clean_info(album).update(data)
            url = "https://www.flickr.com/photos/{}/albums/{}".format(
                self.user["path_alias"], album["id"])
            yield Message.Queue, url, album

    def metadata(self):
        """Return user metadata extended with album information"""
        data = FlickrExtractor.metadata(self)
        data["album"] = self.api.photosets_getInfo(
            self.album_id, self.user["nsid"])
        return data

    def photos(self):
        """Return all photos of the album"""
        return self.api.photosets_getPhotos(self.album_id)


class FlickrGalleryExtractor(FlickrExtractor):
    """Extractor for photo galleries from flickr.com"""
    subcategory = "gallery"
    directory_fmt = ("{category}", "galleries",
                     "{user[username]} {gallery[id]}")
    archive_fmt = "g_{gallery[id]}_{id}"
    pattern = (r"(?:https?://)?(?:www\.)?flickr\.com/"
               r"photos/([^/]+)/galleries/(\d+)")
    test = (("https://www.flickr.com/photos/flickr/"
             "galleries/72157681572514792/"), {
        "pattern": FlickrImageExtractor.pattern,
        "count": ">= 10",
    })

    def __init__(self, match):
        FlickrExtractor.__init__(self, match)
        self.gallery_id = match.group(2)

    def metadata(self):
        """Return user metadata extended with gallery information"""
        data = FlickrExtractor.metadata(self)
        data["gallery"] = self.api.galleries_getInfo(self.gallery_id)
        return data

    def photos(self):
        """Return all photos of the gallery"""
        return self.api.galleries_getPhotos(self.gallery_id)


class FlickrGroupExtractor(FlickrExtractor):
    """Extractor for group pools from flickr.com"""
    subcategory = "group"
    directory_fmt = ("{category}", "{subcategory}s", "{group[groupname]}")
    archive_fmt = "G_{group[nsid]}_{id}"
    pattern = r"(?:https?://)?(?:www\.)?flickr\.com/groups/([^/]+)"
    test = ("https://www.flickr.com/groups/bird_headshots/", {
        "pattern": FlickrImageExtractor.pattern,
        "count": "> 150",
    })

    def __init__(self, match):
        FlickrExtractor.__init__(self, match)
        # filled in by metadata(); defined here so the attribute
        # always exists, like 'user' in the base class
        self.group = None

    def metadata(self):
        """Look up the group and return it as metadata"""
        self.group = self.api.urls_lookupGroup(self.item_id)
        return {"group": self.group}

    def photos(self):
        """Return all photos of the group pool"""
        return self.api.groups_pools_getPhotos(self.group["nsid"])


class FlickrUserExtractor(FlickrExtractor):
    """Extractor for the photostream of a flickr user"""
    subcategory = "user"
    directory_fmt = ("{category}", "{user[username]}")
    archive_fmt = "u_{user[nsid]}_{id}"
    pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/?$"
    test = ("https://www.flickr.com/photos/shona_s/", {
        "pattern": FlickrImageExtractor.pattern,
        "count": 28,
    })

    def photos(self):
        """Return all photos of the user's photostream"""
        return self.api.people_getPhotos(self.user["nsid"])


class FlickrFavoriteExtractor(FlickrExtractor):
    """Extractor for favorite photos of a flickr user"""
    subcategory = "favorite"
    directory_fmt = ("{category}", "{subcategory}s", "{user[username]}")
    archive_fmt = "f_{user[nsid]}_{id}"
    pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/favorites"
    test = ("https://www.flickr.com/photos/shona_s/favorites", {
        "pattern": FlickrImageExtractor.pattern,
        "count": 4,
    })

    def photos(self):
        """Return all favorites of the user"""
        return self.api.favorites_getList(self.user["nsid"])


class FlickrSearchExtractor(FlickrExtractor):
    """Extractor for flickr photos based on search results"""
    subcategory = "search"
    directory_fmt = ("{category}", "{subcategory}", "{search[text]}")
    archive_fmt = "s_{search}_{id}"
    pattern = r"(?:https?://)?(?:www\.)?flickr\.com/search/?\?([^#]+)"
    test = (
        ("https://flickr.com/search/?text=mountain"),
        ("https://flickr.com/search/?text=tree%20cloud%20house"
         "&color_codes=4&styles=minimalism"),
    )

    def __init__(self, match):
        FlickrExtractor.__init__(self, match)
        self.search = text.parse_query(match.group(1))
        # directory_fmt references {search[text]}, so it must exist
        if "text" not in self.search:
            self.search["text"] = ""

    def metadata(self):
        """Return the search parameters as metadata"""
        return {"search": self.search}

    def photos(self):
        """Return all photos matching the search parameters"""
        return self.api.photos_search(self.search)


class FlickrAPI(oauth.OAuth1API):
    """Minimal interface for the flickr API"""
    API_URL = "https://api.flickr.com/services/rest/"
    API_KEY = "ac4fd7aa98585b9eee1ba761c209de68"
    API_SECRET = "3adb0f568dc68393"
    # (URL suffix, label, width limit), ordered from largest to smallest
    FORMATS = [
        ("o" , "Original"    , None),
        ("k" , "Large 2048"  , 2048),
        ("h" , "Large 1600"  , 1600),
        ("l" , "Large"       , 1024),
        ("c" , "Medium 800"  , 800),
        ("z" , "Medium 640"  , 640),
        ("m" , "Medium"      , 500),
        ("n" , "Small 320"   , 320),
        ("s" , "Small"       , 240),
        ("q" , "Large Square", 150),
        ("t" , "Thumbnail"   , 100),
        # "sq", not "s": "s" is already taken by "Small" above
        ("sq", "Square"      , 75),
    ]
    # stream type -> rank; the stream with the highest rank is preferred
    VIDEO_FORMATS = {
        "orig"       : 9,
        "1080p"      : 8,
        "720p"       : 7,
        "360p"       : 6,
        "288p"       : 5,
        "700"        : 4,
        "300"        : 3,
        "100"        : 2,
        "appletv"    : 1,
        "iphone_wifi": 0,
    }

    def __init__(self, extractor):
        oauth.OAuth1API.__init__(self, extractor)

        self.videos = extractor.config("videos", True)
        self.maxsize = extractor.config("size-max")
        if isinstance(self.maxsize, str):
            # translate a format suffix or label into its width limit
            for fmt, fmtname, fmtwidth in self.FORMATS:
                if self.maxsize == fmt or self.maxsize == fmtname:
                    self.maxsize = fmtwidth
                    break
            else:
                # log before resetting, so that the message contains
                # the value that failed to match
                extractor.log.warning(
                    "Could not match '%s' to any format", self.maxsize)
                self.maxsize = None
        if self.maxsize:
            self.formats = [fmt for fmt in self.FORMATS
                            if not fmt[2] or fmt[2] <= self.maxsize]
        else:
            self.formats = self.FORMATS
        # only request URLs for the first 4 candidate formats
        self.formats = self.formats[:4]

    def favorites_getList(self, user_id):
        """Returns a list of the user's favorite photos."""
        params = {"user_id": user_id}
        return self._pagination("favorites.getList", params)

    def galleries_getInfo(self, gallery_id):
        """Gets information about a gallery."""
        params = {"gallery_id": gallery_id}
        gallery = self._call("galleries.getInfo", params)["gallery"]
        return self._clean_info(gallery)

    def galleries_getPhotos(self, gallery_id):
        """Return the list of photos for a gallery."""
        params = {"gallery_id": gallery_id}
        return self._pagination("galleries.getPhotos", params)

    def groups_pools_getPhotos(self, group_id):
        """Returns a list of pool photos for a given group."""
        params = {"group_id": group_id}
        return self._pagination("groups.pools.getPhotos", params)

    def people_getPhotos(self, user_id):
        """Return photos from the given user's photostream."""
        params = {"user_id": user_id}
        return self._pagination("people.getPhotos", params)

    def photos_getInfo(self, photo_id):
        """Get information about a photo."""
        params = {"photo_id": photo_id}
        return self._call("photos.getInfo", params)["photo"]

    def photos_getSizes(self, photo_id):
        """Returns the available sizes for a photo.

        With 'maxsize' set, the list is cut off at the first entry
        (other than the very first one) whose width or height exceeds
        it, so at least one size always remains.
        """
        params = {"photo_id": photo_id}
        sizes = self._call("photos.getSizes", params)["sizes"]["size"]
        if self.maxsize:
            for index, size in enumerate(sizes):
                if index > 0 and (int(size["width"]) > self.maxsize or
                                  int(size["height"]) > self.maxsize):
                    del sizes[index:]
                    break
        return sizes

    def photos_search(self, params):
        """Return a list of photos matching some criteria."""
        # copy, since _pagination() and _call() modify 'params'
        return self._pagination("photos.search", params.copy())

    def photosets_getInfo(self, photoset_id, user_id):
        """Gets information about a photoset."""
        params = {"photoset_id": photoset_id, "user_id": user_id}
        photoset = self._call("photosets.getInfo", params)["photoset"]
        return self._clean_info(photoset)

    def photosets_getList(self, user_id):
        """Returns the photosets belonging to the specified user."""
        params = {"user_id": user_id}
        return self._pagination_sets("photosets.getList", params)

    def photosets_getPhotos(self, photoset_id):
        """Get the list of photos in a set."""
        params = {"photoset_id": photoset_id}
        return self._pagination("photosets.getPhotos", params, "photoset")

    def urls_lookupGroup(self, groupname):
        """Returns a group NSID, given the url to a group's page."""
        params = {"url": "https://www.flickr.com/groups/" + groupname}
        group = self._call("urls.lookupGroup", params)["group"]
        return {"nsid": group["id"],
                "path_alias": groupname,
                "groupname": group["groupname"]["_content"]}

    def urls_lookupUser(self, username):
        """Returns a user NSID, given the url to a user's photos or profile."""
        params = {"url": "https://www.flickr.com/photos/" + username}
        user = self._call("urls.lookupUser", params)["user"]
        return {"nsid": user["id"],
                "path_alias": username,
                "username": user["username"]["_content"]}

    def video_getStreamInfo(self, video_id, secret=None):
        """Returns the preferred stream of all available video streams.

        Streams are ranked by VIDEO_FORMATS; unknown stream types
        are ranked 0. If no 'secret' is given, it gets fetched with
        an additional 'photos.getInfo' call.
        """
        params = {"photo_id": video_id}
        if not secret:
            secret = self._call("photos.getInfo", params)["photo"]["secret"]
        params["secret"] = secret
        stream = self._call("video.getStreamInfo", params)["streams"]["stream"]
        return max(stream, key=lambda s: self.VIDEO_FORMATS.get(s["type"], 0))

    def _call(self, method, params):
        """Call an API method and return its JSON-decoded response.

        'params' is modified in place. A response containing a "code"
        entry is treated as an error: code 1 raises NotFoundError,
        98 AuthenticationError, 99 AuthorizationError, and any other
        code is logged and raises StopExtraction.
        """
        params["method"] = "flickr." + method
        params["format"] = "json"
        params["nojsoncallback"] = "1"
        if self.api_key:
            params["api_key"] = self.api_key
        data = self.request(self.API_URL, params=params).json()
        if "code" in data:
            if data["code"] == 1:
                raise exception.NotFoundError(self.extractor.subcategory)
            elif data["code"] == 98:
                raise exception.AuthenticationError(data.get("message"))
            elif data["code"] == 99:
                raise exception.AuthorizationError()
            self.log.error("API call failed: %s", data.get("message"))
            raise exception.StopExtraction()
        return data

    def _pagination(self, method, params, key="photos"):
        """Yield the processed photos from all result pages of 'method'"""
        params["extras"] = "description,date_upload,tags,views,media,"
        params["extras"] += ",".join("url_" + fmt[0] for fmt in self.formats)
        params["page"] = 1

        while True:
            data = self._call(method, params)[key]
            yield from map(self._extract_format, data["photo"])
            if params["page"] >= data["pages"]:
                return
            params["page"] += 1

    def _pagination_sets(self, method, params):
        """Yield the photosets from all result pages of 'method'"""
        params["page"] = 1

        while True:
            data = self._call(method, params)["photosets"]
            yield from data["photoset"]
            if params["page"] >= data["pages"]:
                return
            params["page"] += 1

    def _extract_format(self, photo):
        """Normalize a photo object and select its download URL"""
        photo["description"] = photo["description"]["_content"].strip()
        photo["views"] = text.parse_int(photo["views"])
        photo["date"] = text.parse_timestamp(photo["dateupload"])
        photo["tags"] = photo["tags"].split()
        photo["id"] = text.parse_int(photo["id"])

        if photo["media"] == "video" and self.videos:
            return self._extract_video(photo)

        # use the first requested format that is present in the
        # response and does not exceed 'maxsize'
        for fmt, fmtname, fmtwidth in self.formats:
            key = "url_" + fmt
            if key in photo:
                photo["width"] = text.parse_int(photo["width_" + fmt])
                photo["height"] = text.parse_int(photo["height_" + fmt])
                if self.maxsize and (photo["width"] > self.maxsize or
                                     photo["height"] > self.maxsize):
                    continue
                photo["url"] = photo[key]
                photo["label"] = fmtname

                # remove excess data
                keys = [
                    key for key in photo
                    if key.startswith(("url_", "width_", "height_"))
                ]
                for key in keys:
                    del photo[key]
                break
        else:
            # no usable format in the response; query the sizes directly
            self._extract_photo(photo)

        return photo

    def _extract_photo(self, photo):
        """Set URL, label and dimensions from the last available size"""
        size = self.photos_getSizes(photo["id"])[-1]
        photo["url"] = size["source"]
        photo["label"] = size["label"]
        photo["width"] = text.parse_int(size["width"])
        photo["height"] = text.parse_int(size["height"])
        return photo

    def _extract_video(self, photo):
        """Set URL and label from the preferred video stream"""
        stream = self.video_getStreamInfo(photo["id"], photo.get("secret"))
        photo["url"] = stream["_content"]
        photo["label"] = stream["type"]
        # no dimensions are taken from the stream info
        photo["width"] = photo["height"] = 0
        return photo

    @staticmethod
    def _clean_info(info):
        """Replace 'title' and 'description' with their '_content' values"""
        info["title"] = info["title"]["_content"]
        info["description"] = info["description"]["_content"]
        return info
