Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r--  gallery_dl/extractor/artstation.py   |  58
-rw-r--r--  gallery_dl/extractor/blogger.py      |  59
-rw-r--r--  gallery_dl/extractor/deviantart.py   |  28
-rw-r--r--  gallery_dl/extractor/exhentai.py     |   2
-rw-r--r--  gallery_dl/extractor/imagehosts.py   |   4
-rw-r--r--  gallery_dl/extractor/instagram.py    | 720
-rw-r--r--  gallery_dl/extractor/kemonoparty.py  |  30
-rw-r--r--  gallery_dl/extractor/mastodon.py     |   9
-rw-r--r--  gallery_dl/extractor/myportfolio.py  |   7
-rw-r--r--  gallery_dl/extractor/newgrounds.py   |  63
-rw-r--r--  gallery_dl/extractor/pixiv.py        |  60
-rw-r--r--  gallery_dl/extractor/plurk.py        |   4
-rw-r--r--  gallery_dl/extractor/sankaku.py      |  56
-rw-r--r--  gallery_dl/extractor/skeb.py         |  64
-rw-r--r--  gallery_dl/extractor/tumblr.py       |  40
15 files changed, 824 insertions(+), 380 deletions(-)
diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py index c0e8e67..62626a1 100644 --- a/gallery_dl/extractor/artstation.py +++ b/gallery_dl/extractor/artstation.py @@ -103,16 +103,23 @@ class ArtstationExtractor(Extractor): return response.json() def _pagination(self, url, params=None, json=None): + headers = { + "Accept" : "application/json, text/plain, */*", + "Origin" : self.root, + "Referer": self.root + "/", + } + if json: params = json - kwargs = {"json": json} + headers["PUBLIC-CSRF-TOKEN"] = self._init_csrf_token() + kwargs = {"method": "POST", "headers": headers, "json": json} else: if not params: params = {} - kwargs = {"params": params} + kwargs = {"params": params, "headers": headers} - params["page"] = 1 total = 0 + params["page"] = 1 while True: data = self.request(url, **kwargs).json() @@ -124,6 +131,17 @@ class ArtstationExtractor(Extractor): params["page"] += 1 + def _init_csrf_token(self): + url = self.root + "/api/v2/csrf_protection/token.json" + headers = { + "Accept" : "*/*", + "Origin" : self.root, + "Referer": self.root + "/", + } + return self.request( + url, method="POST", headers=headers, json={}, + ).json()["public_csrf_token"] + @staticmethod def _no_cache(url, alphabet=(string.digits + string.ascii_letters)): """Cause a cache miss to prevent Cloudflare 'optimizations' @@ -298,34 +316,46 @@ class ArtstationSearchExtractor(ArtstationExtractor): archive_fmt = "s_{search[query]}_{asset[id]}" pattern = (r"(?:https?://)?(?:\w+\.)?artstation\.com" r"/search/?\?([^#]+)") - test = ("https://www.artstation.com/search?q=ancient&sort_by=rank", { + test = ("https://www.artstation.com/search?query=ancient&sort_by=rank", { "range": "1-20", "count": 20, }) def __init__(self, match): ArtstationExtractor.__init__(self, match) - query = text.parse_query(match.group(1)) - self.query = query.get("q", "") - self.sorting = query.get("sort_by", "rank").lower() + self.params = query = text.parse_query(match.group(1)) + self.query = text.unquote(query.get("query") or query.get("q", "")) + self.sorting = query.get("sort_by", "relevance").lower() + self.tags = query.get("tags", "").split(",") def metadata(self): return {"search": { "query" : self.query, "sorting": self.sorting, + "tags" : self.tags, }} def projects(self): + filters = [] + for key, value in self.params.items(): + if key.endswith("_ids") or key == "tags": + filters.append({ + "field" : key, + "method": "include", + "value" : value.split(","), + }) + url = "{}/api/v2/search/projects.json".format(self.root) - return self._pagination(url, json={ - "additional_fields": "[]", - "filters" : "[]", - "page" : None, - "per_page" : "50", - "pro_first" : "1", + data = { "query" : self.query, + "page" : None, + "per_page" : 50, "sorting" : self.sorting, - }) + "pro_first" : "1", + "filters" : filters, + "additional_fields": (), + } + return self._pagination(url, json=data) class ArtstationArtworkExtractor(ArtstationExtractor): diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py index e0885d2..232f3ea 100644 --- a/gallery_dl/extractor/blogger.py +++ b/gallery_dl/extractor/blogger.py @@ -48,6 +48,7 @@ class BloggerExtractor(Extractor): r'\d+\.bp\.blogspot\.com)/[^"]+)').findall findall_video = re.compile( r'src="(https?://www\.blogger\.com/video\.g\?token=[^"]+)').findall + metadata = self.metadata() for post in self.posts(blog): content = post["content"] @@ -74,18 +75,21 @@ class BloggerExtractor(Extractor): del post["selfLink"] del post["blog"] - yield Message.Directory, 
{"blog": blog, "post": post} - for num, url in enumerate(files, 1): - yield Message.Url, url, text.nameext_from_url(url, { - "blog": blog, - "post": post, - "url" : url, - "num" : num, - }) + data = {"blog": blog, "post": post} + if metadata: + data.update(metadata) + yield Message.Directory, data + + for data["num"], url in enumerate(files, 1): + data["url"] = url + yield Message.Url, url, text.nameext_from_url(url, data) def posts(self, blog): """Return an iterable with all relevant post objects""" + def metadata(self): + """Return additional metadata""" + class BloggerPostExtractor(BloggerExtractor): """Extractor for a single blog post""" @@ -173,31 +177,48 @@ class BloggerBlogExtractor(BloggerExtractor): class BloggerSearchExtractor(BloggerExtractor): - """Extractor for search resuls and labels""" + """Extractor for Blogger search resuls""" subcategory = "search" - pattern = BASE_PATTERN + r"/search(?:/?\?q=([^/?#]+)|/label/([^/?#]+))" + pattern = BASE_PATTERN + r"/search/?\?q=([^&#]+)" test = ( ("https://julianbphotography.blogspot.com/search?q=400mm", { - "count": "< 10" + "count": "< 10", + "keyword": {"query": "400mm"}, }), + ) + + def __init__(self, match): + BloggerExtractor.__init__(self, match) + self.query = text.unquote(match.group(3)) + + def posts(self, blog): + return self.api.blog_search(blog["id"], self.query) + + def metadata(self): + return {"query": self.query} + + +class BloggerLabelExtractor(BloggerExtractor): + """Extractor for Blogger posts by label""" + subcategory = "label" + pattern = BASE_PATTERN + r"/search/label/([^/?#]+)" + test = ( ("https://dmmagazine.blogspot.com/search/label/D%26D", { "range": "1-25", "count": 25, + "keyword": {"label": "D&D"}, }), ) def __init__(self, match): BloggerExtractor.__init__(self, match) - query = match.group(3) - if query: - self.query, self.label = query, None - else: - self.query, self.label = None, match.group(4) + self.label = text.unquote(match.group(3)) def posts(self, blog): - if self.query: - return self.api.blog_search(blog["id"], text.unquote(self.query)) - return self.api.blog_posts(blog["id"], text.unquote(self.label)) + return self.api.blog_posts(blog["id"], self.label) + + def metadata(self): + return {"label": self.label} class BloggerAPI(): diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 60f644d..6897476 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -936,12 +936,13 @@ class DeviantartDeviationExtractor(DeviantartExtractor): self.deviation_id = match.group(4) def deviations(self): - deviation = DeviantartEclipseAPI(self).deviation_extended_fetch( - self.deviation_id, self.user, self.type) - if "error" in deviation: + url = "{}/{}/{}/{}".format( + self.root, self.user, self.type, self.deviation_id) + appurl = text.extract(self._limited_request(url).text, + 'property="da:appurl" content="', '"')[0] + if not appurl: raise exception.NotFoundError("deviation") - return (self.api.deviation( - deviation["deviation"]["extended"]["deviationUuid"]),) + return (self.api.deviation(appurl.rpartition("/")[2]),) class DeviantartScrapsExtractor(DeviantartExtractor): @@ -1398,6 +1399,8 @@ class DeviantartEclipseAPI(): def __init__(self, extractor): self.extractor = extractor self.log = extractor.log + self.request = self.extractor._limited_request + self.csrf_token = None def deviation_extended_fetch(self, deviation_id, user=None, kind=None): endpoint = "/da-browse/shared_api/deviation/extended_fetch" @@ -1429,11 +1432,12 @@ class 
DeviantartEclipseAPI(): } return self._pagination(endpoint, params) - def _call(self, endpoint, params=None): + def _call(self, endpoint, params): url = "https://www.deviantart.com/_napi" + endpoint headers = {"Referer": "https://www.deviantart.com/"} + params["csrf_token"] = self.csrf_token or self._fetch_csrf_token() - response = self.extractor._limited_request( + response = self.request( url, params=params, headers=headers, fatal=None) if response.status_code == 404: @@ -1464,12 +1468,20 @@ class DeviantartEclipseAPI(): def _module_id_watching(self, user): url = "{}/{}/about".format(self.extractor.root, user) - page = self.extractor._limited_request(url).text + page = self.request(url).text pos = page.find('\\"type\\":\\"watching\\"') if pos < 0: raise exception.NotFoundError("module") + self._fetch_csrf_token(page) return text.rextract(page, '\\"id\\":', ',', pos)[0].strip('" ') + def _fetch_csrf_token(self, page=None): + if page is None: + page = self.request(self.extractor.root + "/").text + self.csrf_token = token = text.extract( + page, "window.__CSRF_TOKEN__ = '", "'")[0] + return token + @cache(maxage=100*365*24*3600, keyarg=0) def _refresh_token_cache(token): diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 01ba03a..e37e81b 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -505,7 +505,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor): if url == last: continue last = url - yield Message.Queue, url, data + yield Message.Queue, url + "/", data if 'class="ptdd">><' in page or ">No hits found</p>" in page: return diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py index d699f07..69455a8 100644 --- a/gallery_dl/extractor/imagehosts.py +++ b/gallery_dl/extractor/imagehosts.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2021 Mike Fährmann +# Copyright 2016-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -292,7 +292,7 @@ class FappicImageExtractor(ImagehostImageExtractor): }) def get_info(self, page): - url , pos = text.extract(page, '<a href="/?click"><img src="', '"') + url , pos = text.extract(page, '<a href="#"><img src="', '"') filename, pos = text.extract(page, 'alt="', '"', pos) if filename.startswith("Porn-Picture-"): diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 8c98d2e..425d541 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -34,13 +34,25 @@ class InstagramExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) self.item = match.group(1) + self.api = None self.www_claim = "0" self.csrf_token = util.generate_token() + self._logged_in = True self._find_tags = re.compile(r"#\w+").findall self._cursor = None def items(self): self.login() + + api = self.config("api") + if api is None or api == "auto": + api = InstagramRestAPI if self._logged_in else InstagramGraphqlAPI + elif api == "graphql": + api = InstagramGraphqlAPI + else: + api = InstagramRestAPI + self.api = api(self) + data = self.metadata() videos = self.config("videos", True) previews = self.config("previews", False) @@ -51,10 +63,11 @@ class InstagramExtractor(Extractor): if "__typename" in post: post = self._parse_post_graphql(post) else: - post = self._parse_post_api(post) + post = self._parse_post_rest(post) post.update(data) files = post.pop("_files") + post["count"] = 
len(files) yield Message.Directory, post for file in files: file.update(post) @@ -107,63 +120,6 @@ class InstagramExtractor(Extractor): return response - def _request_api(self, endpoint, **kwargs): - url = "https://i.instagram.com/api" + endpoint - kwargs["headers"] = { - "X-CSRFToken" : self.csrf_token, - "X-IG-App-ID" : "936619743392459", - "X-IG-WWW-Claim": self.www_claim, - } - kwargs["cookies"] = { - "csrftoken": self.csrf_token, - } - return self.request(url, **kwargs).json() - - def _request_graphql(self, query_hash, variables): - url = self.root + "/graphql/query/" - params = { - "query_hash": query_hash, - "variables" : json.dumps(variables), - } - headers = { - "X-CSRFToken" : self.csrf_token, - "X-IG-App-ID" : "936619743392459", - "X-IG-WWW-Claim" : self.www_claim, - "X-Requested-With": "XMLHttpRequest", - } - cookies = { - "csrftoken": self.csrf_token, - } - return self.request( - url, params=params, headers=headers, cookies=cookies, - ).json()["data"] - - @memcache(keyarg=1) - def _user_by_screen_name(self, screen_name): - url = "https://www.instagram.com/{}/?__a=1&__d=dis".format( - screen_name) - headers = { - "Referer": "https://www.instagram.com/{}/".format(screen_name), - "X-CSRFToken" : self.csrf_token, - "X-IG-App-ID" : "936619743392459", - "X-IG-WWW-Claim" : self.www_claim, - "X-Requested-With": "XMLHttpRequest", - } - cookies = { - "csrftoken": self.csrf_token, - } - return self.request( - url, headers=headers, cookies=cookies).json()["graphql"]["user"] - - def _uid_by_screen_name(self, screen_name): - if screen_name.startswith("id:"): - return screen_name[3:] - return self._user_by_screen_name(screen_name)["id"] - - def _media_by_id(self, post_id): - endpoint = "/v1/media/{}/info/".format(post_id) - return self._pagination_api(endpoint) - def login(self): self._username = None if not self._check_cookies(self.cookienames): @@ -171,92 +127,13 @@ class InstagramExtractor(Extractor): if username: self._username = username self._update_cookies(_login_impl(self, username, password)) + else: + self._logged_in = False self.session.cookies.set( "csrftoken", self.csrf_token, domain=self.cookiedomain) - def _parse_post_graphql(self, post): - typename = post["__typename"] - - if post.get("is_video") and "video_url" not in post: - media = next(self._media_by_id(post["id"])) - return self._parse_post_api(media) - - if typename == "GraphSidecar" and \ - "edge_sidecar_to_children" not in post: - media = next(self._media_by_id(post["id"])) - return self._parse_post_api(media) - - pinned = post.get("pinned_for_users", ()) - if pinned: - for index, user in enumerate(pinned): - pinned[index] = int(user["id"]) - - owner = post["owner"] - data = { - "typename" : typename, - "date" : text.parse_timestamp(post["taken_at_timestamp"]), - "likes" : post["edge_media_preview_like"]["count"], - "pinned" : pinned, - "owner_id" : owner["id"], - "username" : owner.get("username"), - "fullname" : owner.get("full_name"), - "post_id" : post["id"], - "post_shortcode": post["shortcode"], - "post_url" : "{}/p/{}/".format(self.root, post["shortcode"]), - "description": text.parse_unicode_escapes("\n".join( - edge["node"]["text"] - for edge in post["edge_media_to_caption"]["edges"] - )), - } - - tags = self._find_tags(data["description"]) - if tags: - data["tags"] = sorted(set(tags)) - - location = post.get("location") - if location: - data["location_id"] = location["id"] - data["location_slug"] = location["slug"] - data["location_url"] = "{}/explore/locations/{}/{}/".format( - self.root, location["id"], 
location["slug"]) - - data["_files"] = files = [] - if "edge_sidecar_to_children" in post: - for num, edge in enumerate( - post["edge_sidecar_to_children"]["edges"], 1): - node = edge["node"] - dimensions = node["dimensions"] - media = { - "num": num, - "media_id" : node["id"], - "shortcode" : (node.get("shortcode") or - shortcode_from_id(node["id"])), - "display_url": node["display_url"], - "video_url" : node.get("video_url"), - "width" : dimensions["width"], - "height" : dimensions["height"], - "sidecar_media_id" : post["id"], - "sidecar_shortcode": post["shortcode"], - } - self._extract_tagged_users(node, media) - files.append(media) - else: - dimensions = post["dimensions"] - media = { - "media_id" : post["id"], - "shortcode" : post["shortcode"], - "display_url": post["display_url"], - "video_url" : post.get("video_url"), - "width" : dimensions["width"], - "height" : dimensions["height"], - } - self._extract_tagged_users(post, media) - files.append(media) - - return data - - def _parse_post_api(self, post): - if "items" in post: + def _parse_post_rest(self, post): + if "items" in post: # story or highlight items = post["items"] reel_id = str(post["id"]).rpartition(":")[2] data = { @@ -270,7 +147,7 @@ class InstagramExtractor(Extractor): if "created_at" in post: data["date"] = text.parse_timestamp(post.get("created_at")) - else: + else: # regular image/video post data = { "post_id" : post["pk"], "post_shortcode": post["code"], @@ -344,6 +221,85 @@ class InstagramExtractor(Extractor): return data + def _parse_post_graphql(self, post): + typename = post["__typename"] + + if self._logged_in: + if post.get("is_video") and "video_url" not in post: + post = self.api.media(post["id"])[0] + elif typename == "GraphSidecar" and \ + "edge_sidecar_to_children" not in post: + post = self.api.media(post["id"])[0] + + pinned = post.get("pinned_for_users", ()) + if pinned: + for index, user in enumerate(pinned): + pinned[index] = int(user["id"]) + + owner = post["owner"] + data = { + "typename" : typename, + "date" : text.parse_timestamp(post["taken_at_timestamp"]), + "likes" : post["edge_media_preview_like"]["count"], + "pinned" : pinned, + "owner_id" : owner["id"], + "username" : owner.get("username"), + "fullname" : owner.get("full_name"), + "post_id" : post["id"], + "post_shortcode": post["shortcode"], + "post_url" : "{}/p/{}/".format(self.root, post["shortcode"]), + "description": text.parse_unicode_escapes("\n".join( + edge["node"]["text"] + for edge in post["edge_media_to_caption"]["edges"] + )), + } + + tags = self._find_tags(data["description"]) + if tags: + data["tags"] = sorted(set(tags)) + + location = post.get("location") + if location: + data["location_id"] = location["id"] + data["location_slug"] = location["slug"] + data["location_url"] = "{}/explore/locations/{}/{}/".format( + self.root, location["id"], location["slug"]) + + data["_files"] = files = [] + if "edge_sidecar_to_children" in post: + for num, edge in enumerate( + post["edge_sidecar_to_children"]["edges"], 1): + node = edge["node"] + dimensions = node["dimensions"] + media = { + "num": num, + "media_id" : node["id"], + "shortcode" : (node.get("shortcode") or + shortcode_from_id(node["id"])), + "display_url": node["display_url"], + "video_url" : node.get("video_url"), + "width" : dimensions["width"], + "height" : dimensions["height"], + "sidecar_media_id" : post["id"], + "sidecar_shortcode": post["shortcode"], + } + self._extract_tagged_users(node, media) + files.append(media) + else: + dimensions = post["dimensions"] + media 
= { + "media_id" : post["id"], + "shortcode" : post["shortcode"], + "display_url": post["display_url"], + "video_url" : post.get("video_url"), + "width" : dimensions["width"], + "height" : dimensions["height"], + } + self._extract_tagged_users(post, media) + files.append(media) + + return data + @staticmethod def _extract_tagged_users(src, dest): dest["tagged_users"] = tagged_users = [] @@ -382,51 +338,6 @@ class InstagramExtractor(Extractor): "username" : user["username"], "full_name": user["full_name"]}) - def _pagination_graphql(self, query_hash, variables): - cursor = self.config("cursor") - if cursor: - variables["after"] = cursor - - while True: - data = next(iter(self._request_graphql( - query_hash, variables)["user"].values())) - - for edge in data["edges"]: - yield edge["node"] - - info = data["page_info"] - if not info["has_next_page"]: - return - elif not data["edges"]: - s = "" if self.item.endswith("s") else "s" - raise exception.StopExtraction( - "%s'%s posts are private", self.item, s) - - variables["after"] = self._cursor = info["end_cursor"] - self.log.debug("Cursor: %s", self._cursor) - - def _pagination_api(self, endpoint, params=None): - if params is None: - params = {} - while True: - data = self._request_api(endpoint, params=params) - yield from data["items"] - - if not data["more_available"]: - return - params["max_id"] = data["next_max_id"] - - def _pagination_api_post(self, endpoint, params, post=False): - while True: - data = self._request_api(endpoint, method="POST", data=params) - for item in data["items"]: - yield item["media"] - - info = data["paging_info"] - if not info["more_available"]: - return - params["max_id"] = info["max_id"] - class InstagramUserExtractor(InstagramExtractor): """Extractor for an Instagram user profile""" @@ -446,13 +357,13 @@ class InstagramUserExtractor(InstagramExtractor): (InstagramHighlightsExtractor, base + "highlights/"), (InstagramPostsExtractor , base + "posts/"), (InstagramReelsExtractor , base + "reels/"), - (InstagramChannelExtractor , base + "channel/"), (InstagramTaggedExtractor , base + "tagged/"), + (InstagramChannelExtractor , base + "channel/"), ), ("posts",)) class InstagramPostsExtractor(InstagramExtractor): - """Extractor for ProfilePage posts""" + """Extractor for an Instagram user's posts""" subcategory = "posts" pattern = USER_PATTERN + r"/posts" test = ("https://www.instagram.com/instagram/posts/", { @@ -461,13 +372,26 @@ class InstagramPostsExtractor(InstagramExtractor): }) def posts(self): - query_hash = "69cba40317214236af40e7efa697781d" - variables = {"id": self._uid_by_screen_name(self.item), "first": 50} - return self._pagination_graphql(query_hash, variables) + uid = self.api.user_id(self.item) + return self.api.user_feed(uid) + + +class InstagramReelsExtractor(InstagramExtractor): + """Extractor for an Instagram user's reels""" + subcategory = "reels" + pattern = USER_PATTERN + r"/reels" + test = ("https://www.instagram.com/instagram/reels/", { + "range": "40-60", + "count": ">= 20", + }) + + def posts(self): + uid = self.api.user_id(self.item) + return self.api.user_clips(uid) class InstagramTaggedExtractor(InstagramExtractor): - """Extractor for ProfilePage tagged posts""" + """Extractor for an Instagram user's tagged posts""" subcategory = "tagged" pattern = USER_PATTERN + r"/tagged" test = ("https://www.instagram.com/instagram/tagged/", { @@ -485,7 +409,7 @@ class InstagramTaggedExtractor(InstagramExtractor): self.user_id = self.item[3:] return {"tagged_owner_id": self.user_id} - user = 
self._user_by_screen_name(self.item) + user = self.api.user(self.item) self.user_id = user["id"] return { @@ -495,13 +419,11 @@ class InstagramTaggedExtractor(InstagramExtractor): } def posts(self): - endpoint = "/v1/usertags/{}/feed/".format(self.user_id) - params = {"count": 50} - return self._pagination_api(endpoint, params) + return self.api.user_tagged(self.user_id) class InstagramChannelExtractor(InstagramExtractor): - """Extractor for ProfilePage channel""" + """Extractor for an Instagram user's channel posts""" subcategory = "channel" pattern = USER_PATTERN + r"/channel" test = ("https://www.instagram.com/instagram/channel/", { @@ -510,25 +432,25 @@ class InstagramChannelExtractor(InstagramExtractor): }) def posts(self): - query_hash = "bc78b344a68ed16dd5d7f264681c4c76" - variables = {"id": self._uid_by_screen_name(self.item), "first": 50} - return self._pagination_graphql(query_hash, variables) + uid = self.api.user_id(self.item) + return self.api.user_clips(uid) class InstagramSavedExtractor(InstagramExtractor): - """Extractor for ProfilePage saved media""" + """Extractor for an Instagram user's saved media""" subcategory = "saved" - pattern = USER_PATTERN + r"/saved/?$" - test = ("https://www.instagram.com/instagram/saved/",) + pattern = USER_PATTERN + r"/saved(?:/all-posts)?/?$" + test = ( + ("https://www.instagram.com/instagram/saved/"), + ("https://www.instagram.com/instagram/saved/all-posts/"), + ) def posts(self): - query_hash = "2ce1d673055b99250e93b6f88f878fde" - variables = {"id": self._uid_by_screen_name(self.item), "first": 50} - return self._pagination_graphql(query_hash, variables) + return self.api.user_saved() class InstagramCollectionExtractor(InstagramExtractor): - """Extractor for ProfilePage saved collection media""" + """Extractor for Instagram collection""" subcategory = "collection" pattern = USER_PATTERN + r"/saved/([^/?#]+)/([^/?#]+)" test = ( @@ -546,13 +468,59 @@ class InstagramCollectionExtractor(InstagramExtractor): } def posts(self): - endpoint = "/v1/feed/collection/{}/posts/".format(self.collection_id) - for item in self._pagination_api(endpoint): - yield item["media"] + return self.api.user_collection(self.collection_id) + + +class InstagramStoriesExtractor(InstagramExtractor): + """Extractor for Instagram stories""" + subcategory = "stories" + pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" + r"/stories/(?:highlights/(\d+)|([^/?#]+)(?:/(\d+))?)") + test = ( + ("https://www.instagram.com/stories/instagram/"), + ("https://www.instagram.com/stories/highlights/18042509488170095/"), + ("https://instagram.com/stories/geekmig/2724343156064789461"), + ) + + def __init__(self, match): + self.highlight_id, self.user, self.media_id = match.groups() + if self.highlight_id: + self.subcategory = InstagramHighlightsExtractor.subcategory + InstagramExtractor.__init__(self, match) + + def posts(self): + if self.highlight_id: + reel_id = "highlight:" + self.highlight_id + else: + reel_id = self.api.user_id(self.user) + + reels = self.api.reels_media(reel_id) + + if self.media_id and reels: + reel = reels[0] + for item in reel["items"]: + if item["pk"] == self.media_id: + reel["items"] = (item,) + break + else: + raise exception.NotFoundError("story") + + return reels + + +class InstagramHighlightsExtractor(InstagramExtractor): + """Extractor for an Instagram user's story highlights""" + subcategory = "highlights" + pattern = USER_PATTERN + r"/highlights" + test = ("https://www.instagram.com/instagram/highlights",) + + def posts(self): + uid = 
self.api.user_id(self.item) + return self.api.highlights_media(uid) class InstagramTagExtractor(InstagramExtractor): - """Extractor for TagPage""" + """Extractor for Instagram tags""" subcategory = "tag" directory_fmt = ("{category}", "{subcategory}", "{tag}") pattern = BASE_PATTERN + r"/explore/tags/([^/?#]+)" @@ -565,27 +533,7 @@ class InstagramTagExtractor(InstagramExtractor): return {"tag": text.unquote(self.item)} def posts(self): - endpoint = "/v1/tags/{}/sections/".format(self.item) - data = { - "include_persistent": "0", - "max_id" : None, - "page" : None, - "surface": "grid", - "tab" : "recent", - } - - while True: - info = self._request_api(endpoint, method="POST", data=data) - - for section in info["sections"]: - for media in section["layout_content"]["medias"]: - yield media["media"] - - if not info.get("more_available"): - return - - data["max_id"] = info["next_max_id"] - data["page"] = info["next_page"] + return self.api.tags_media(self.item) class InstagramPostExtractor(InstagramExtractor): @@ -618,7 +566,6 @@ class InstagramPostExtractor(InstagramExtractor): "width": int, } }), - # GraphSidecar ("https://www.instagram.com/p/BoHk1haB5tM/", { "count": 5, @@ -633,7 +580,6 @@ class InstagramPostExtractor(InstagramExtractor): "username": "instagram", } }), - # GraphVideo ("https://www.instagram.com/p/Bqxp0VSBgJg/", { "pattern": r"/46840863_726311431074534_7805566102611403091_n\.mp4", @@ -651,7 +597,6 @@ class InstagramPostExtractor(InstagramExtractor): "width": int, } }), - # GraphVideo (IGTV) ("https://www.instagram.com/tv/BkQjCfsBIzi/", { "pattern": r"/10000000_597132547321814_702169244961988209_n\.mp4", @@ -668,7 +613,6 @@ class InstagramPostExtractor(InstagramExtractor): "width": int, } }), - # GraphSidecar with 2 embedded GraphVideo objects ("https://www.instagram.com/p/BtOvDOfhvRr/", { "count": 2, @@ -679,7 +623,6 @@ class InstagramPostExtractor(InstagramExtractor): "video_url": str, } }), - # GraphImage with tagged user ("https://www.instagram.com/p/B_2lf3qAd3y/", { "keyword": { @@ -690,98 +633,265 @@ class InstagramPostExtractor(InstagramExtractor): }] } }), - # URL with username (#2085) ("https://www.instagram.com/dm/p/CW042g7B9CY/"), - ("https://www.instagram.com/reel/CDg_6Y1pxWu/"), ) def posts(self): - return self._media_by_id(id_from_shortcode(self.item)) + return self.api.media(id_from_shortcode(self.item)) -class InstagramStoriesExtractor(InstagramExtractor): - """Extractor for Instagram stories""" - subcategory = "stories" - pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" - r"/stories/(?:highlights/(\d+)|([^/?#]+)(?:/(\d+))?)") - test = ( - ("https://www.instagram.com/stories/instagram/"), - ("https://www.instagram.com/stories/highlights/18042509488170095/"), - ("https://instagram.com/stories/geekmig/2724343156064789461"), - ) +class InstagramRestAPI(): - def __init__(self, match): - self.highlight_id, self.user, self.media_id = match.groups() - if self.highlight_id: - self.subcategory = InstagramHighlightsExtractor.subcategory - InstagramExtractor.__init__(self, match) + def __init__(self, extractor): + self.extractor = extractor - def posts(self): - if self.highlight_id: - reel_id = "highlight:" + self.highlight_id - else: - reel_id = self._uid_by_screen_name(self.user) + def highlights_media(self, user_id): + chunk_size = 5 + reel_ids = [hl["id"] for hl in self.highlights_tray(user_id)] + + for offset in range(0, len(reel_ids), chunk_size): + yield from self.reels_media( + reel_ids[offset : offset+chunk_size]) + + def highlights_tray(self, user_id): + 
endpoint = "/v1/highlights/{}/highlights_tray/".format(user_id) + return self._call(endpoint)["tray"] + + def media(self, post_id): + endpoint = "/v1/media/{}/info/".format(post_id) + return self._pagination(endpoint) + def reels_media(self, reel_ids): endpoint = "/v1/feed/reels_media/" - params = {"reel_ids": reel_id} - reels = self._request_api(endpoint, params=params)["reels"] + params = {"reel_ids": reel_ids} + return self._call(endpoint, params=params)["reels_media"] - if self.media_id: - reel = reels[reel_id] - for item in reel["items"]: - if item["pk"] == self.media_id: - reel["items"] = (item,) - break + def tags_media(self, tag): + for section in self.tags_sections(tag): + for media in section["layout_content"]["medias"]: + yield media["media"] + + def tags_sections(self, tag): + endpoint = "/v1/tags/{}/sections/".format(tag) + data = { + "include_persistent": "0", + "max_id" : None, + "page" : None, + "surface": "grid", + "tab" : "recent", + } + return self._pagination_sections(endpoint, data) + + @memcache(keyarg=1) + def user(self, screen_name): + endpoint = "/v1/users/web_profile_info/" + params = {"username": screen_name} + return self._call(endpoint, params=params)["data"]["user"] + + def user_id(self, screen_name): + if screen_name.startswith("id:"): + return screen_name[3:] + return self.user(screen_name)["id"] + + def user_clips(self, user_id): + endpoint = "/v1/clips/user/" + data = {"target_user_id": user_id, "page_size": "50"} + return self._pagination_post(endpoint, data) + + def user_collection(self, collection_id): + endpoint = "/v1/feed/collection/{}/posts/".format(collection_id) + params = {"count": 50} + return self._pagination(endpoint, params, media=True) + + def user_feed(self, user_id): + endpoint = "/v1/feed/user/{}/".format(user_id) + params = {"count": 30} + return self._pagination(endpoint, params) + + def user_saved(self): + endpoint = "/v1/feed/saved/posts/" + params = {"count": 50} + return self._pagination(endpoint, params, media=True) + + def user_tagged(self, user_id): + endpoint = "/v1/usertags/{}/feed/".format(user_id) + params = {"count": 50} + return self._pagination(endpoint, params) + + def _call(self, endpoint, **kwargs): + extr = self.extractor + + url = "https://i.instagram.com/api" + endpoint + kwargs["headers"] = { + "X-CSRFToken" : extr.csrf_token, + "X-Instagram-AJAX": "1006242110", + "X-IG-App-ID" : "936619743392459", + "X-ASBD-ID" : "198387", + "X-IG-WWW-Claim" : extr.www_claim, + "Origin" : extr.root, + "Referer" : extr.root + "/", + } + kwargs["cookies"] = { + "csrftoken": extr.csrf_token, + } + return extr.request(url, **kwargs).json() + + def _pagination(self, endpoint, params=None, media=False): + if params is None: + params = {} + while True: + data = self._call(endpoint, params=params) + + if media: + for item in data["items"]: + yield item["media"] else: - raise exception.NotFoundError("story") + yield from data["items"] - return reels.values() + if not data.get("more_available"): + return + params["max_id"] = data["next_max_id"] + def _pagination_post(self, endpoint, params): + while True: + data = self._call(endpoint, method="POST", data=params) -class InstagramHighlightsExtractor(InstagramExtractor): - """Extractor for all Instagram story highlights of a user""" - subcategory = "highlights" - pattern = USER_PATTERN + r"/highlights" - test = ("https://www.instagram.com/instagram/highlights",) + for item in data["items"]: + yield item["media"] - def posts(self): - endpoint = "/v1/highlights/{}/highlights_tray/".format( - 
self._uid_by_screen_name(self.item)) - tray = self._request_api(endpoint)["tray"] - reel_ids = [highlight["id"] for highlight in tray] + info = data["paging_info"] + if not info.get("more_available"): + return + params["max_id"] = info["max_id"] - # Anything above 30 responds with statuscode 400. - # 30 can work, however, sometimes the API will respond with 560 or 500. - chunk_size = 5 - endpoint = "/v1/feed/reels_media/" + def _pagination_sections(self, endpoint, params): + while True: + info = self._call(endpoint, method="POST", data=params) - for offset in range(0, len(reel_ids), chunk_size): - chunk_ids = reel_ids[offset : offset+chunk_size] - params = {"reel_ids": chunk_ids} - reels = self._request_api(endpoint, params=params)["reels"] - for reel_id in chunk_ids: - yield reels[reel_id] + yield from info["sections"] + + if not info.get("more_available"): + return + params["max_id"] = info["next_max_id"] + params["page"] = info["next_page"] -class InstagramReelsExtractor(InstagramExtractor): - """Extractor for an Instagram user's reels""" - subcategory = "reels" - pattern = USER_PATTERN + r"/reels" - test = ("https://www.instagram.com/instagram/reels/", { - "range": "40-60", - "count": ">= 20", - }) +class InstagramGraphqlAPI(): - def posts(self): - endpoint = "/v1/clips/user/" - data = { - "target_user_id": self._uid_by_screen_name(self.item), - "page_size" : "50", + def __init__(self, extractor): + self.extractor = extractor + self.user = InstagramRestAPI(extractor).user + self.user_collection = self.user_saved = self.reels_media = \ + self.highlights_media = self._login_required + self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode + + @staticmethod + def _login_required(_=None): + raise exception.AuthorizationError("Login required") + + def highlights_tray(self, user_id): + query_hash = "d4d88dc1500312af6f937f7b804c68c3" + variables = { + "user_id": user_id, + "include_chaining": False, + "include_reel": False, + "include_suggested_users": False, + "include_logged_out_extras": True, + "include_highlight_reels": True, + "include_live_status": False, + } + edges = (self._call(query_hash, variables)["user"] + ["edge_highlight_reels"]["edges"]) + return [edge["node"] for edge in edges] + + def media(self, post_id): + query_hash = "9f8827793ef34641b2fb195d4d41151c" + variables = { + "shortcode": shortcode_from_id(post_id), + "child_comment_count": 3, + "fetch_comment_count": 40, + "parent_comment_count": 24, + "has_threaded_comments": True, + } + media = self._call(query_hash, variables).get("shortcode_media") + return (media,) if media else () + + def tags_media(self, tag): + query_hash = "9b498c08113f1e09617a1703c22b2f32" + variables = {"tag_name": text.unescape(tag), "first": 50} + return self._pagination(query_hash, variables, + "hashtag", "edge_hashtag_to_media") + + def user_id(self, screen_name): + if screen_name.startswith("id:"): + return screen_name[3:] + return self.user(screen_name)["id"] + + def user_clips(self, user_id): + query_hash = "bc78b344a68ed16dd5d7f264681c4c76" + variables = {"id": user_id, "first": 50} + return self._pagination(query_hash, variables) + + def user_feed(self, user_id): + query_hash = "69cba40317214236af40e7efa697781d" + variables = {"id": user_id, "first": 50} + return self._pagination(query_hash, variables) + + def user_tagged(self, user_id): + query_hash = "be13233562af2d229b008d2976b998b5" + variables = {"id": user_id, "first": 50} + return self._pagination(query_hash, variables) + + def _call(self, query_hash, variables): + extr = 
self.extractor + + url = "https://www.instagram.com/graphql/query/" + params = { + "query_hash": query_hash, + "variables" : self._json_dumps(variables), + } + headers = { + "Accept" : "*/*", + "X-CSRFToken" : extr.csrf_token, + "X-Instagram-AJAX": "1006267176", + "X-IG-App-ID" : "936619743392459", + "X-ASBD-ID" : "198387", + "X-IG-WWW-Claim" : extr.www_claim, + "X-Requested-With": "XMLHttpRequest", + "Referer" : extr.root + "/", + } + cookies = { + "csrftoken": extr.csrf_token, } + return extr.request( + url, params=params, headers=headers, cookies=cookies, + ).json()["data"] - return self._pagination_api_post(endpoint, data) + def _pagination(self, query_hash, variables, + key_data="user", key_edge=None): + cursor = self.extractor.config("cursor") + if cursor: + variables["after"] = cursor + + while True: + data = self._call(query_hash, variables)[key_data] + data = data[key_edge] if key_edge else next(iter(data.values())) + + for edge in data["edges"]: + yield edge["node"] + + info = data["page_info"] + if not info["has_next_page"]: + return + elif not data["edges"]: + s = "" if self.item.endswith("s") else "s" + raise exception.StopExtraction( + "%s'%s posts are private", self.item, s) + + variables["after"] = self._cursor = info["end_cursor"] + self.extractor.log.debug("Cursor: %s", self._cursor) @cache(maxage=360*24*3600, keyarg=1) diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 816b561..750b741 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -33,6 +33,7 @@ class KemonopartyExtractor(Extractor): self.cookiedomain = ".coomer.party" self.root = text.root_from_url(match.group(0)) Extractor.__init__(self, match) + self.session.headers["Referer"] = self.root + "/" def items(self): self._prepare_ddosguard_cookies() @@ -46,7 +47,7 @@ class KemonopartyExtractor(Extractor): comments = self.config("comments") username = dms = None - # prevent files to be sent with gzip compression + # prevent files from being sent with gzip compression headers = {"Accept-Encoding": "identity"} if self.config("metadata"): @@ -63,6 +64,9 @@ class KemonopartyExtractor(Extractor): for post in posts: + headers["Referer"] = "{}/{}/user/{}/post/{}".format( + self.root, post["service"], post["user"], post["id"]) + post["_http_headers"] = headers post["date"] = text.parse_datetime( post["published"] or post["added"], "%a, %d %b %Y %H:%M:%S %Z") @@ -74,27 +78,33 @@ class KemonopartyExtractor(Extractor): if dms is True: dms = self._extract_dms(post) post["dms"] = dms - yield Message.Directory, post + files = [] hashes = set() - post["num"] = 0 + for file in itertools.chain.from_iterable( g(post) for g in generators): url = file["path"] match = find_hash(url) if match: - post["hash"] = hash = match.group(1) + file["hash"] = hash = match.group(1) if hash in hashes and not duplicates: self.log.debug("Skipping %s (duplicate)", url) continue hashes.add(hash) else: - post["hash"] = "" + file["hash"] = "" + + files.append(file) + post["count"] = len(files) + yield Message.Directory, post + + for post["num"], file in enumerate(files, 1): + post["hash"] = file["hash"] post["type"] = file["type"] - post["num"] += 1 - post["_http_headers"] = headers + url = file["path"] text.nameext_from_url(file.get("name", url), post) if not post["extension"]: @@ -236,6 +246,7 @@ class KemonopartyPostExtractor(KemonopartyExtractor): "keyword": { "added": "Wed, 06 May 2020 20:28:02 GMT", "content": str, + "count": 1, "date": "dt:2019-08-11 02:09:04", 
"edited": None, "embed": dict, @@ -374,6 +385,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor): post["channel_name"] = self.channel_name post["date"] = text.parse_datetime( post["published"], "%a, %d %b %Y %H:%M:%S %Z") + post["count"] = len(files) yield Message.Directory, post for post["num"], file in enumerate(files, 1): @@ -466,7 +478,7 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor): if self.favorites == "artist": users = self.request( - self.root + "/api/v1/account/favorites?type=artist").json() + self.root + "/api/favorites?type=artist").json() for user in users: user["_extractor"] = KemonopartyUserExtractor url = "{}/{}/user/{}".format( @@ -475,7 +487,7 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor): elif self.favorites == "post": posts = self.request( - self.root + "/api/v1/account/favorites?type=post").json() + self.root + "/api/favorites?type=post").json() for post in posts: post["_extractor"] = KemonopartyPostExtractor url = "{}/{}/user/{}/post/{}".format( diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index 493a8ef..9ce5772 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -31,6 +31,8 @@ class MastodonExtractor(BaseExtractor): def items(self): for status in self.statuses(): + if self._check_move: + self._check_move(status["account"]) if not self.reblogs and status["reblog"]: self.log.debug("Skipping %s (reblog)", status["id"]) continue @@ -56,6 +58,12 @@ class MastodonExtractor(BaseExtractor): """Return an iterable containing all relevant Status objects""" return () + def _check_move(self, account): + self._check_move = None + if "moved" in account: + self.log.warning("Account '%s' moved to '%s'", + account["acct"], account["moved"]["acct"]) + INSTANCES = { "mastodon.social": { @@ -192,6 +200,7 @@ class MastodonAPI(): handle = "@{}@{}".format(username, self.extractor.instance) for account in self.account_search(handle, 1): if account["username"] == username: + self.extractor._check_move(account) return account["id"] raise exception.NotFoundError("account") diff --git a/gallery_dl/extractor/myportfolio.py b/gallery_dl/extractor/myportfolio.py index f06ab70..8254118 100644 --- a/gallery_dl/extractor/myportfolio.py +++ b/gallery_dl/extractor/myportfolio.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2021 Mike Fährmann +# Copyright 2018-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -104,4 +104,7 @@ class MyportfolioGalleryExtractor(Extractor): @staticmethod def images(page): """Extract and return a list of all image-urls""" - return list(text.extract_iter(page, 'js-lightbox" data-src="', '"')) + return ( + list(text.extract_iter(page, 'js-lightbox" data-src="', '"')) or + list(text.extract_iter(page, 'data-src="', '"')) + ) diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index d9ab336..2c2dcb9 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -155,6 +155,7 @@ class NewgroundsExtractor(Extractor): data = { "title" : text.unescape(extr('"og:title" content="', '"')), "description": text.unescape(extr(':description" content="', '"')), + "type" : extr('og:type" content="', '"'), "date" : text.parse_datetime(extr( 'itemprop="datePublished" content="', '"')), "rating" : extr('class="rated-', '"'), @@ -173,6 +174,7 @@ class NewgroundsExtractor(Extractor): return { "title" : 
text.unescape(extr('"og:title" content="', '"')), "description": text.unescape(extr(':description" content="', '"')), + "type" : extr('og:type" content="', '"'), "date" : text.parse_datetime(extr( 'itemprop="datePublished" content="', '"')), "url" : extr('{"url":"', '"').replace("\\/", "/"), @@ -184,6 +186,7 @@ class NewgroundsExtractor(Extractor): def _extract_media_data(self, extr, url): index = url.split("/")[5] title = extr('"og:title" content="', '"') + type = extr('og:type" content="', '"') descr = extr('"og:description" content="', '"') src = extr('{"url":"', '"') @@ -223,6 +226,7 @@ class NewgroundsExtractor(Extractor): "title" : text.unescape(title), "url" : src, "date" : date, + "type" : type, "description": text.unescape(descr or extr( 'itemprop="description" content="', '"')), "rating" : extr('class="rated-', '"'), @@ -305,6 +309,7 @@ class NewgroundsImageExtractor(NewgroundsExtractor): "score" : float, "tags" : ["ryu", "streetfighter"], "title" : "Ryu is Hawt", + "type" : "article", "user" : "tomfulp", "width" : 447, }, @@ -357,6 +362,7 @@ class NewgroundsMediaExtractor(NewgroundsExtractor): "score" : float, "tags" : ["alienhominid", "trailer"], "title" : "Alien Hominid Fan Trailer", + "type" : "movie", "user" : "kickinthehead", }, }), @@ -373,6 +379,7 @@ class NewgroundsMediaExtractor(NewgroundsExtractor): "score" : float, "tags" : ["fulp", "interview", "tom", "zj"], "title" : "ZJ Interviews Tom Fulp!", + "type" : "music.song", "user" : "zj", }, }), @@ -380,6 +387,7 @@ class NewgroundsMediaExtractor(NewgroundsExtractor): ("https://www.newgrounds.com/portal/view/161181/format/flash", { "pattern": r"https://uploads\.ungrounded\.net/161000" r"/161181_ddautta_mask__550x281_\.swf\?f1081628129", + "keyword": {"type": "movie"}, }), # format selection (#1729) ("https://www.newgrounds.com/portal/view/758545", { @@ -392,6 +400,49 @@ class NewgroundsMediaExtractor(NewgroundsExtractor): "options": (("username", None),), "count": 1, }), + # flash game + ("https://www.newgrounds.com/portal/view/829032", { + "pattern": r"https://uploads\.ungrounded\.net/829000" + r"/829032_picovsbeardx\.swf\?f1641968445", + "range": "1", + "keyword": { + "artist" : [ + "dungeonation", + "carpetbakery", + "animalspeakandrews", + "bill", + "chipollo", + "dylz49", + "gappyshamp", + "pinktophat", + "rad", + "shapeshiftingblob", + "tomfulp", + "voicesbycorey", + "psychogoldfish", + ], + "comment" : "re:The children are expendable. 
Take out the ", + "date" : "dt:2022-01-10 23:00:57", + "description": "Bloodshed in The Big House that Blew...again!", + "favorites" : int, + "index" : 829032, + "post_url" : "https://www.newgrounds.com/portal/view/829032", + "rating" : "m", + "score" : float, + "tags" : [ + "assassin", + "boyfriend", + "darnell", + "nene", + "pico", + "picos-school", + ], + "title" : "PICO VS BEAR DX", + "type" : "game", + "url" : "https://uploads.ungrounded.net/829000" + "/829032_picovsbeardx.swf?f1641968445", + }, + }), ) def __init__(self, match): @@ -434,6 +485,17 @@ class NewgroundsMoviesExtractor(NewgroundsExtractor): }) +class NewgroundsGamesExtractor(NewgroundsExtractor): + """Extractor for a newgrounds user's games""" + subcategory = _path = "games" + pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/games/?$" + test = ("https://tomfulp.newgrounds.com/games", { + "pattern": r"https://uploads.ungrounded.net(/alternate)?/\d+/\d+_.+", + "range": "1-10", + "count": 10, + }) + + class NewgroundsUserExtractor(NewgroundsExtractor): """Extractor for a newgrounds user profile""" subcategory = "user" @@ -454,6 +516,7 @@ class NewgroundsUserExtractor(NewgroundsExtractor): return self._dispatch_extractors(( (NewgroundsArtExtractor , base + "art"), (NewgroundsAudioExtractor , base + "audio"), + (NewgroundsGamesExtractor , base + "games"), (NewgroundsMoviesExtractor, base + "movies"), ), ("art",)) diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index a589760..6b2e1c3 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -642,6 +642,66 @@ class PixivPixivisionExtractor(PixivExtractor): } +class PixivSeriesExtractor(PixivExtractor): + """Extractor for illustrations from a Pixiv series""" + subcategory = "series" + directory_fmt = ("{category}", "{user[id]} {user[account]}", + "{series[id]} {series[title]}") + filename_fmt = "{num_series:>03}_{id}_p{num}.{extension}" + pattern = (r"(?:https?://)?(?:www\.)?pixiv\.net" + r"/user/(\d+)/series/(\d+)") + test = ("https://www.pixiv.net/user/10509347/series/21859", { + "range": "1-10", + "count": 10, + "keyword": { + "num_series": int, + "series": { + "canonical": "https://www.pixiv.net/user/10509347" + "/series/21859", + "description": str, + "ogp": dict, + "title": "先輩がうざい後輩の話", + "total": int, + "twitter": dict, + }, + }, + }) + + def __init__(self, match): + PixivExtractor.__init__(self, match) + self.user_id, self.series_id = match.groups() + + def works(self): + url = self.root + "/ajax/series/" + self.series_id + params = {"p": 1} + headers = { + "Accept": "application/json", + "Referer": "{}/user/{}/series/{}".format( + self.root, self.user_id, self.series_id), + "Alt-Used": "www.pixiv.net", + } + + while True: + data = self.request(url, params=params, headers=headers).json() + body = data["body"] + page = body["page"] + + series = body["extraData"]["meta"] + series["id"] = self.series_id + series["total"] = page["total"] + series["title"] = text.extract(series["title"], '"', '"')[0] + + for info in page["series"]: + work = self.api.illust_detail(info["workId"]) + work["num_series"] = info["order"] + work["series"] = series + yield work + + if len(page["series"]) < 10: + return + params["p"] += 1 + + class PixivSketchExtractor(Extractor): """Extractor for user pages on sketch.pixiv.net""" category = "pixiv" diff --git a/gallery_dl/extractor/plurk.py b/gallery_dl/extractor/plurk.py index f2e964d..535fae9 100644 --- a/gallery_dl/extractor/plurk.py +++ b/gallery_dl/extractor/plurk.py @@ -1,6 +1,6 @@ # -*- 
coding: utf-8 -*- -# Copyright 2019-2020 Mike Fährmann +# Copyright 2019-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -85,7 +85,7 @@ class PlurkTimelineExtractor(PlurkExtractor): def plurks(self): url = "{}/{}".format(self.root, self.user) page = self.request(url).text - user_id, pos = text.extract(page, '"user_id":', ',') + user_id, pos = text.extract(page, '"page_user": {"id":', ',') plurks = self._load(text.extract(page, "_PLURKS = ", ";\n", pos)[0]) headers = {"Referer": url, "X-Requested-With": "XMLHttpRequest"} diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index 2ce7f6c..3396e3a 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -53,12 +53,15 @@ class SankakuExtractor(BooruExtractor): url = "https://s.sankakucomplex.com" + url[url.index("/", 8):] return url - @staticmethod - def _prepare(post): + def _prepare(self, post): post["created_at"] = post["created_at"]["s"] post["date"] = text.parse_timestamp(post["created_at"]) post["tags"] = [tag["name"] for tag in post["tags"] if tag["name"]] post["tag_string"] = " ".join(post["tags"]) + post["_http_validate"] = self._check_expired + + def _check_expired(self, response): + return not response.history or '.com/expired.png' not in response.url def _extended_tags(self, post): tags = collections.defaultdict(list) @@ -219,7 +222,11 @@ class SankakuAPI(): def __init__(self, extractor): self.extractor = extractor - self.headers = {"Accept": "application/vnd.sankaku.api+json;v=2"} + self.headers = { + "Accept" : "application/vnd.sankaku.api+json;v=2", + "Origin" : extractor.root, + "Referer": extractor.root + "/", + } self.username, self.password = self.extractor._get_auth_info() if not self.username: @@ -253,11 +260,14 @@ class SankakuAPI(): for _ in range(5): self.authenticate() response = self.extractor.request( - url, params=params, headers=self.headers, fatal=False) + url, params=params, headers=self.headers, fatal=None) if response.status_code == 429: - self.extractor.wait( - until=response.headers.get("X-RateLimit-Reset")) + until = response.headers.get("X-RateLimit-Reset") + if not until and b"tags-limit" in response.content: + raise exception.StopExtraction("Search tag limit exceeded") + seconds = None if until else 60 + self.extractor.wait(until=until, seconds=seconds) continue data = response.json() @@ -278,9 +288,41 @@ class SankakuAPI(): params["lang"] = "en" params["limit"] = str(self.extractor.per_page) + refresh = self.extractor.config("refresh", False) + if refresh: + offset = expires = 0 + from time import time + while True: data = self._call(endpoint, params) - yield from data["data"] + + if refresh: + posts = data["data"] + if offset: + posts = util.advance(posts, offset) + + for post in posts: + if not expires: + url = post["file_url"] + if url: + expires = text.parse_int( + text.extract(url, "e=", "&")[0]) - 60 + + if 0 < expires <= time(): + self.extractor.log.debug("Refreshing download URLs") + expires = None + break + + offset += 1 + yield post + + if expires is None: + expires = 0 + continue + offset = expires = 0 + + else: + yield from data["data"] params["next"] = data["meta"]["next"] if not params["next"]: diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py index cd8c238..822b1f2 100644 --- a/gallery_dl/extractor/skeb.py +++ b/gallery_dl/extractor/skeb.py @@ -26,8 +26,11 @@ class SkebExtractor(Extractor): 
self.article = self.config("article", False) def items(self): + metadata = self.metadata() for user_name, post_num in self.posts(): response, post = self._get_post_data(user_name, post_num) + if metadata: + post.update(metadata) yield Message.Directory, post for data in self._get_urls_from_post(response, post): url = data["file_url"] @@ -36,6 +39,9 @@ class SkebExtractor(Extractor): def posts(self): """Return post number""" + def metadata(self): + """Return additional metadata""" + def _pagination(self, url, params): headers = {"Referer": self.root, "Authorization": "Bearer null"} params["offset"] = 0 @@ -223,6 +229,62 @@ class SkebUserExtractor(SkebExtractor): return posts +class SkebSearchExtractor(SkebExtractor): + """Extractor for skeb search results""" + subcategory = "search" + pattern = r"(?:https?://)?skeb\.jp/search\?q=([^&#]+)" + test = ("https://skeb.jp/search?q=bunny%20tree&t=works", { + "count": ">= 18", + "keyword": {"search_tags": "bunny tree"}, + }) + + def metadata(self): + return {"search_tags": text.unquote(self.user_name)} + + def posts(self): + url = "https://hb1jt3kre9-2.algolianet.com/1/indexes/*/queries" + params = { + "x-algolia-agent": "Algolia for JavaScript (4.13.1); Browser", + } + headers = { + "Origin": self.root, + "Referer": self.root + "/", + "x-algolia-api-key": "9a4ce7d609e71bf29e977925e4c6740c", + "x-algolia-application-id": "HB1JT3KRE9", + } + + filters = self.config("filters") + if filters is None: + filters = ("genre:art OR genre:voice OR genre:novel OR " + "genre:video OR genre:music OR genre:correction") + elif not isinstance(filters, str): + filters = " OR ".join(filters) + + page = 0 + pams = "hitsPerPage=40&filters=" + text.quote(filters) + "&page=" + + request = { + "indexName": "Request", + "query": text.unquote(self.user_name), + "params": pams + str(page), + } + data = {"requests": (request,)} + + while True: + result = self.request( + url, method="POST", params=params, headers=headers, json=data, + ).json()["results"][0] + + for post in result["hits"]: + parts = post["path"].split("/") + yield parts[1][1:], parts[3] + + if page >= result["nbPages"]: + return + page += 1 + request["params"] = pams + str(page) + + class SkebFollowingExtractor(SkebExtractor): """Extractor for all creators followed by a skeb user""" subcategory = "following" @@ -238,8 +300,8 @@ class SkebFollowingExtractor(SkebExtractor): def users(self): url = "{}/api/users/{}/following_creators".format( self.root, self.user_name) - headers = {"Referer": self.root, "Authorization": "Bearer null"} params = {"sort": "date", "offset": 0, "limit": 90} + headers = {"Referer": self.root, "Authorization": "Bearer null"} while True: data = self.request(url, params=params, headers=headers).json() diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index 6f53881..447ce00 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -116,13 +116,17 @@ class TumblrExtractor(Extractor): if self.original and "/s2048x3072/" in photo["url"] and ( photo["width"] == 2048 or photo["height"] == 3072): - photo["url"] = self._original_photo(photo["url"]) + photo["url"], fb = self._original_photo(photo["url"]) + if fb: + post["_fallback"] = self._original_image_fallback( + photo["url"], post["id"]) del photo["original_size"] del photo["alt_sizes"] posts.append( self._prepare_image(photo["url"], post.copy())) del post["photo"] + post.pop("_fallback", None) url = post.get("audio_url") # type "audio" if url and url.startswith("https://a.tumblr.com/"): @@ 
-138,8 +142,12 @@ class TumblrExtractor(Extractor): # API response, but they can't contain images/videos anyway body = post["reblog"]["comment"] + post["reblog"]["tree_html"] for url in _findall_image(body): - url = self._original_inline_image(url) + url, fb = self._original_inline_image(url) + if fb: + post["_fallback"] = self._original_image_fallback( + url, post["id"]) posts.append(self._prepare_image(url, post.copy())) + post.pop("_fallback", None) for url in _findall_video(body): url = self._original_video(url) posts.append(self._prepare(url, post.copy())) @@ -218,23 +226,35 @@ class TumblrExtractor(Extractor): return self.blog != post.get("reblogged_root_uuid") def _original_photo(self, url): - return self._update_image_token( - url.replace("/s2048x3072/", "/s99999x99999/", 1)) + resized = url.replace("/s2048x3072/", "/s99999x99999/", 1) + return self._update_image_token(resized) def _original_inline_image(self, url): if self.original: - url, n = self._subn_orig_image("/s99999x99999/", url, 1) + resized, n = self._subn_orig_image("/s99999x99999/", url, 1) if n: - return self._update_image_token(url) - return self._sub_image(r"https://\1_1280.\2", url) + return self._update_image_token(resized) + return self._sub_image(r"https://\1_1280.\2", url), False def _original_video(self, url): return self._sub_video(r"https://\1.\2", url) - def _update_image_token(self, url): + def _update_image_token(self, resized): headers = {"Accept": "text/html,*/*;q=0.8"} - response = self.request(url, headers=headers) - return text.extract(response.text, '" src="', '"')[0] + try: + response = self.request(resized, headers=headers) + except Exception: + return resized, True + else: + updated = text.extract(response.text, '" src="', '"')[0] + return updated, (resized == updated) + + def _original_image_fallback(self, url, post_id): + yield self._update_image_token(url)[0] + yield self._update_image_token(url)[0] + yield self._update_image_token(url)[0] + self.log.warning("Unable to fetch higher-resolution " + "version of %s (%s)", url, post_id) class TumblrUserExtractor(TumblrExtractor): |
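
The tumblr hunks above replace a hard dependency on `_update_image_token()` succeeding with a lazy retry chain: the method now reports whether the re-resolved URL is actually usable, and `_original_image_fallback()` yields up to three freshly resolved URLs for the downloader to try before logging a warning. A minimal self-contained sketch of that generator-fallback pattern follows; `refresh_url()`, `try_fetch()`, and `download()` are hypothetical stand-ins for the extractor method and gallery-dl's downloader loop, not the project's actual API:

```python
import itertools
import logging

log = logging.getLogger("tumblr")

def refresh_url(url):
    """Stand-in for _update_image_token(): re-resolve a tokenized URL."""
    return url  # assumption: the real method re-requests the page and
                # re-extracts the '" src="' attribute

def image_fallback(url, post_id, attempts=3):
    """Yield a few freshly resolved URLs, then warn once exhausted,
    mirroring _original_image_fallback() in the diff."""
    for _ in range(attempts):
        yield refresh_url(url)
    log.warning("Unable to fetch higher-resolution "
                "version of %s (%s)", url, post_id)

def try_fetch(url):
    """Stand-in for the real HTTP request; pretend every attempt fails."""
    return False

def download(primary, fallback=()):
    """Downloader-style loop: try the primary URL, then each fallback."""
    for candidate in itertools.chain((primary,), fallback):
        if try_fetch(candidate):
            return candidate
    return None

url = "https://example.org/s2048x3072/image.jpg"
download(url, image_fallback(url, post_id=12345))
```

Because the fallback is a generator, the extra re-resolution requests only happen if the first download attempt actually fails; that is also why the hunk stores it per file in `post["_fallback"]` and pops it again with `post.pop("_fallback", None)` before processing the next one.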
