Diffstat (limited to 'gallery_dl/extractor')
 gallery_dl/extractor/artstation.py  |  58
 gallery_dl/extractor/blogger.py     |  59
 gallery_dl/extractor/deviantart.py  |  28
 gallery_dl/extractor/exhentai.py    |   2
 gallery_dl/extractor/imagehosts.py  |   4
 gallery_dl/extractor/instagram.py   | 720
 gallery_dl/extractor/kemonoparty.py |  30
 gallery_dl/extractor/mastodon.py    |   9
 gallery_dl/extractor/myportfolio.py |   7
 gallery_dl/extractor/newgrounds.py  |  63
 gallery_dl/extractor/pixiv.py       |  60
 gallery_dl/extractor/plurk.py       |   4
 gallery_dl/extractor/sankaku.py     |  56
 gallery_dl/extractor/skeb.py        |  64
 gallery_dl/extractor/tumblr.py      |  40
15 files changed, 824 insertions(+), 380 deletions(-)
diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py
index c0e8e67..62626a1 100644
--- a/gallery_dl/extractor/artstation.py
+++ b/gallery_dl/extractor/artstation.py
@@ -103,16 +103,23 @@ class ArtstationExtractor(Extractor):
return response.json()
def _pagination(self, url, params=None, json=None):
+ headers = {
+ "Accept" : "application/json, text/plain, */*",
+ "Origin" : self.root,
+ "Referer": self.root + "/",
+ }
+
if json:
params = json
- kwargs = {"json": json}
+ headers["PUBLIC-CSRF-TOKEN"] = self._init_csrf_token()
+ kwargs = {"method": "POST", "headers": headers, "json": json}
else:
if not params:
params = {}
- kwargs = {"params": params}
+ kwargs = {"params": params, "headers": headers}
- params["page"] = 1
total = 0
+ params["page"] = 1
while True:
data = self.request(url, **kwargs).json()
@@ -124,6 +131,17 @@ class ArtstationExtractor(Extractor):
params["page"] += 1
+ def _init_csrf_token(self):
+ url = self.root + "/api/v2/csrf_protection/token.json"
+ headers = {
+ "Accept" : "*/*",
+ "Origin" : self.root,
+ "Referer": self.root + "/",
+ }
+ return self.request(
+ url, method="POST", headers=headers, json={},
+ ).json()["public_csrf_token"]
+
@staticmethod
def _no_cache(url, alphabet=(string.digits + string.ascii_letters)):
"""Cause a cache miss to prevent Cloudflare 'optimizations'
@@ -298,34 +316,46 @@ class ArtstationSearchExtractor(ArtstationExtractor):
archive_fmt = "s_{search[query]}_{asset[id]}"
pattern = (r"(?:https?://)?(?:\w+\.)?artstation\.com"
r"/search/?\?([^#]+)")
- test = ("https://www.artstation.com/search?q=ancient&sort_by=rank", {
+ test = ("https://www.artstation.com/search?query=ancient&sort_by=rank", {
"range": "1-20",
"count": 20,
})
def __init__(self, match):
ArtstationExtractor.__init__(self, match)
- query = text.parse_query(match.group(1))
- self.query = query.get("q", "")
- self.sorting = query.get("sort_by", "rank").lower()
+ self.params = query = text.parse_query(match.group(1))
+ self.query = text.unquote(query.get("query") or query.get("q", ""))
+ self.sorting = query.get("sort_by", "relevance").lower()
+ self.tags = query.get("tags", "").split(",")
def metadata(self):
return {"search": {
"query" : self.query,
"sorting": self.sorting,
+ "tags" : self.tags,
}}
def projects(self):
+ filters = []
+ for key, value in self.params.items():
+ if key.endswith("_ids") or key == "tags":
+ filters.append({
+ "field" : key,
+ "method": "include",
+ "value" : value.split(","),
+ })
+
url = "{}/api/v2/search/projects.json".format(self.root)
- return self._pagination(url, json={
- "additional_fields": "[]",
- "filters" : "[]",
- "page" : None,
- "per_page" : "50",
- "pro_first" : "1",
+ data = {
"query" : self.query,
+ "page" : None,
+ "per_page" : 50,
"sorting" : self.sorting,
- })
+ "pro_first" : "1",
+ "filters" : filters,
+ "additional_fields": (),
+ }
+ return self._pagination(url, json=data)
class ArtstationArtworkExtractor(ArtstationExtractor):
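
The search flow above reduces to two CSRF-protected POSTs. A minimal
standalone sketch with plain requests (endpoints, header names, and
payload fields are taken from the hunks; the response-field name in
the final print is an assumption, and gallery-dl's own request()
additionally handles rate limits and Cloudflare):

    import requests

    ROOT = "https://www.artstation.com"
    session = requests.Session()
    headers = {
        "Accept" : "application/json, text/plain, */*",
        "Origin" : ROOT,
        "Referer": ROOT + "/",
    }

    # step 1: fetch a public CSRF token
    token = session.post(
        ROOT + "/api/v2/csrf_protection/token.json",
        headers=headers, json={},
    ).json()["public_csrf_token"]

    # step 2: POST the search query with the token attached
    headers["PUBLIC-CSRF-TOKEN"] = token
    payload = {
        "query": "ancient", "page": 1, "per_page": 50,
        "sorting": "relevance", "pro_first": "1",
        "filters": [], "additional_fields": [],
    }
    results = session.post(
        ROOT + "/api/v2/search/projects.json",
        headers=headers, json=payload,
    ).json()
    print(len(results.get("data", ())))   # project entries ("data" assumed)
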
diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py
index e0885d2..232f3ea 100644
--- a/gallery_dl/extractor/blogger.py
+++ b/gallery_dl/extractor/blogger.py
@@ -48,6 +48,7 @@ class BloggerExtractor(Extractor):
r'\d+\.bp\.blogspot\.com)/[^"]+)').findall
findall_video = re.compile(
r'src="(https?://www\.blogger\.com/video\.g\?token=[^"]+)').findall
+ metadata = self.metadata()
for post in self.posts(blog):
content = post["content"]
@@ -74,18 +75,21 @@ class BloggerExtractor(Extractor):
del post["selfLink"]
del post["blog"]
- yield Message.Directory, {"blog": blog, "post": post}
- for num, url in enumerate(files, 1):
- yield Message.Url, url, text.nameext_from_url(url, {
- "blog": blog,
- "post": post,
- "url" : url,
- "num" : num,
- })
+ data = {"blog": blog, "post": post}
+ if metadata:
+ data.update(metadata)
+ yield Message.Directory, data
+
+ for data["num"], url in enumerate(files, 1):
+ data["url"] = url
+ yield Message.Url, url, text.nameext_from_url(url, data)
def posts(self, blog):
"""Return an iterable with all relevant post objects"""
+ def metadata(self):
+ """Return additional metadata"""
+
class BloggerPostExtractor(BloggerExtractor):
"""Extractor for a single blog post"""
@@ -173,31 +177,48 @@ class BloggerBlogExtractor(BloggerExtractor):
class BloggerSearchExtractor(BloggerExtractor):
- """Extractor for search resuls and labels"""
+ """Extractor for Blogger search resuls"""
subcategory = "search"
- pattern = BASE_PATTERN + r"/search(?:/?\?q=([^/?#]+)|/label/([^/?#]+))"
+ pattern = BASE_PATTERN + r"/search/?\?q=([^&#]+)"
test = (
("https://julianbphotography.blogspot.com/search?q=400mm", {
- "count": "< 10"
+ "count": "< 10",
+ "keyword": {"query": "400mm"},
}),
+ )
+
+ def __init__(self, match):
+ BloggerExtractor.__init__(self, match)
+ self.query = text.unquote(match.group(3))
+
+ def posts(self, blog):
+ return self.api.blog_search(blog["id"], self.query)
+
+ def metadata(self):
+ return {"query": self.query}
+
+
+class BloggerLabelExtractor(BloggerExtractor):
+ """Extractor for Blogger posts by label"""
+ subcategory = "label"
+ pattern = BASE_PATTERN + r"/search/label/([^/?#]+)"
+ test = (
("https://dmmagazine.blogspot.com/search/label/D%26D", {
"range": "1-25",
"count": 25,
+ "keyword": {"label": "D&D"},
}),
)
def __init__(self, match):
BloggerExtractor.__init__(self, match)
- query = match.group(3)
- if query:
- self.query, self.label = query, None
- else:
- self.query, self.label = None, match.group(4)
+ self.label = text.unquote(match.group(3))
def posts(self, blog):
- if self.query:
- return self.api.blog_search(blog["id"], text.unquote(self.query))
- return self.api.blog_posts(blog["id"], text.unquote(self.label))
+ return self.api.blog_posts(blog["id"], self.label)
+
+ def metadata(self):
+ return {"label": self.label}
class BloggerAPI():
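
The refactor above adds a metadata() hook that BloggerSearchExtractor
and BloggerLabelExtractor override, with the base class merging the
result into every directory and file record. A toy sketch of the
pattern (class and method bodies here are illustrative, not
gallery-dl's actual base class):

    class Extractor:
        def metadata(self):
            """Return additional metadata (optional hook)"""

        def items(self):
            extra = self.metadata()
            for num, url in enumerate(self.urls(), 1):
                data = {"num": num, "url": url}
                if extra:
                    data.update(extra)     # e.g. {"query": ...}
                yield data

    class SearchExtractor(Extractor):
        def __init__(self, query):
            self.query = query

        def urls(self):
            return ("https://example.org/a.jpg",)

        def metadata(self):
            return {"query": self.query}

    print(list(SearchExtractor("400mm").items()))
    # [{'num': 1, 'url': 'https://example.org/a.jpg', 'query': '400mm'}]
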
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 60f644d..6897476 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -936,12 +936,13 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
self.deviation_id = match.group(4)
def deviations(self):
- deviation = DeviantartEclipseAPI(self).deviation_extended_fetch(
- self.deviation_id, self.user, self.type)
- if "error" in deviation:
+ url = "{}/{}/{}/{}".format(
+ self.root, self.user, self.type, self.deviation_id)
+ appurl = text.extract(self._limited_request(url).text,
+ 'property="da:appurl" content="', '"')[0]
+ if not appurl:
raise exception.NotFoundError("deviation")
- return (self.api.deviation(
- deviation["deviation"]["extended"]["deviationUuid"]),)
+ return (self.api.deviation(appurl.rpartition("/")[2]),)
class DeviantartScrapsExtractor(DeviantartExtractor):
@@ -1398,6 +1399,8 @@ class DeviantartEclipseAPI():
def __init__(self, extractor):
self.extractor = extractor
self.log = extractor.log
+ self.request = self.extractor._limited_request
+ self.csrf_token = None
def deviation_extended_fetch(self, deviation_id, user=None, kind=None):
endpoint = "/da-browse/shared_api/deviation/extended_fetch"
@@ -1429,11 +1432,12 @@ class DeviantartEclipseAPI():
}
return self._pagination(endpoint, params)
- def _call(self, endpoint, params=None):
+ def _call(self, endpoint, params):
url = "https://www.deviantart.com/_napi" + endpoint
headers = {"Referer": "https://www.deviantart.com/"}
+ params["csrf_token"] = self.csrf_token or self._fetch_csrf_token()
- response = self.extractor._limited_request(
+ response = self.request(
url, params=params, headers=headers, fatal=None)
if response.status_code == 404:
@@ -1464,12 +1468,20 @@ class DeviantartEclipseAPI():
def _module_id_watching(self, user):
url = "{}/{}/about".format(self.extractor.root, user)
- page = self.extractor._limited_request(url).text
+ page = self.request(url).text
pos = page.find('\\"type\\":\\"watching\\"')
if pos < 0:
raise exception.NotFoundError("module")
+ self._fetch_csrf_token(page)
return text.rextract(page, '\\"id\\":', ',', pos)[0].strip('" ')
+ def _fetch_csrf_token(self, page=None):
+ if page is None:
+ page = self.request(self.extractor.root + "/").text
+ self.csrf_token = token = text.extract(
+ page, "window.__CSRF_TOKEN__ = '", "'")[0]
+ return token
+
@cache(maxage=100*365*24*3600, keyarg=0)
def _refresh_token_cache(token):
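
Both DeviantArt changes rely on scraping values straight out of page
HTML: the deviation UUID from the da:appurl meta tag and the CSRF
token from an inline window.__CSRF_TOKEN__ assignment. A small
sketch, with a regex-based stand-in for gallery-dl's text.extract()
helper:

    import re

    def extract(page, before, after):
        """Regex stand-in for gallery_dl.text.extract()."""
        m = re.search(re.escape(before) + "(.*?)" + re.escape(after), page)
        return m.group(1) if m else None

    page = ('<meta property="da:appurl" '
            'content="DeviantArt://deviation/1234-ABCD">\n'
            "<script>window.__CSRF_TOKEN__ = 'abc.123';</script>")

    appurl = extract(page, 'property="da:appurl" content="', '"')
    uuid = appurl.rpartition("/")[2]      # '1234-ABCD'
    token = extract(page, "window.__CSRF_TOKEN__ = '", "'")
    print(uuid, token)
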
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 01ba03a..e37e81b 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -505,7 +505,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):
if url == last:
continue
last = url
- yield Message.Queue, url, data
+ yield Message.Queue, url + "/", data
if 'class="ptdd">&gt;<' in page or ">No hits found</p>" in page:
return
diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py
index d699f07..69455a8 100644
--- a/gallery_dl/extractor/imagehosts.py
+++ b/gallery_dl/extractor/imagehosts.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2021 Mike Fährmann
+# Copyright 2016-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -292,7 +292,7 @@ class FappicImageExtractor(ImagehostImageExtractor):
})
def get_info(self, page):
- url , pos = text.extract(page, '<a href="/?click"><img src="', '"')
+ url , pos = text.extract(page, '<a href="#"><img src="', '"')
filename, pos = text.extract(page, 'alt="', '"', pos)
if filename.startswith("Porn-Picture-"):
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 8c98d2e..425d541 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -34,13 +34,25 @@ class InstagramExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
self.item = match.group(1)
+ self.api = None
self.www_claim = "0"
self.csrf_token = util.generate_token()
+ self._logged_in = True
self._find_tags = re.compile(r"#\w+").findall
self._cursor = None
def items(self):
self.login()
+
+ api = self.config("api")
+ if api is None or api == "auto":
+ api = InstagramRestAPI if self._logged_in else InstagramGraphqlAPI
+ elif api == "graphql":
+ api = InstagramGraphqlAPI
+ else:
+ api = InstagramRestAPI
+ self.api = api(self)
+
data = self.metadata()
videos = self.config("videos", True)
previews = self.config("previews", False)
@@ -51,10 +63,11 @@ class InstagramExtractor(Extractor):
if "__typename" in post:
post = self._parse_post_graphql(post)
else:
- post = self._parse_post_api(post)
+ post = self._parse_post_rest(post)
post.update(data)
files = post.pop("_files")
+ post["count"] = len(files)
yield Message.Directory, post
for file in files:
file.update(post)
@@ -107,63 +120,6 @@ class InstagramExtractor(Extractor):
return response
- def _request_api(self, endpoint, **kwargs):
- url = "https://i.instagram.com/api" + endpoint
- kwargs["headers"] = {
- "X-CSRFToken" : self.csrf_token,
- "X-IG-App-ID" : "936619743392459",
- "X-IG-WWW-Claim": self.www_claim,
- }
- kwargs["cookies"] = {
- "csrftoken": self.csrf_token,
- }
- return self.request(url, **kwargs).json()
-
- def _request_graphql(self, query_hash, variables):
- url = self.root + "/graphql/query/"
- params = {
- "query_hash": query_hash,
- "variables" : json.dumps(variables),
- }
- headers = {
- "X-CSRFToken" : self.csrf_token,
- "X-IG-App-ID" : "936619743392459",
- "X-IG-WWW-Claim" : self.www_claim,
- "X-Requested-With": "XMLHttpRequest",
- }
- cookies = {
- "csrftoken": self.csrf_token,
- }
- return self.request(
- url, params=params, headers=headers, cookies=cookies,
- ).json()["data"]
-
- @memcache(keyarg=1)
- def _user_by_screen_name(self, screen_name):
- url = "https://www.instagram.com/{}/?__a=1&__d=dis".format(
- screen_name)
- headers = {
- "Referer": "https://www.instagram.com/{}/".format(screen_name),
- "X-CSRFToken" : self.csrf_token,
- "X-IG-App-ID" : "936619743392459",
- "X-IG-WWW-Claim" : self.www_claim,
- "X-Requested-With": "XMLHttpRequest",
- }
- cookies = {
- "csrftoken": self.csrf_token,
- }
- return self.request(
- url, headers=headers, cookies=cookies).json()["graphql"]["user"]
-
- def _uid_by_screen_name(self, screen_name):
- if screen_name.startswith("id:"):
- return screen_name[3:]
- return self._user_by_screen_name(screen_name)["id"]
-
- def _media_by_id(self, post_id):
- endpoint = "/v1/media/{}/info/".format(post_id)
- return self._pagination_api(endpoint)
-
def login(self):
self._username = None
if not self._check_cookies(self.cookienames):
@@ -171,92 +127,13 @@ class InstagramExtractor(Extractor):
if username:
self._username = username
self._update_cookies(_login_impl(self, username, password))
+ else:
+ self._logged_in = False
self.session.cookies.set(
"csrftoken", self.csrf_token, domain=self.cookiedomain)
- def _parse_post_graphql(self, post):
- typename = post["__typename"]
-
- if post.get("is_video") and "video_url" not in post:
- media = next(self._media_by_id(post["id"]))
- return self._parse_post_api(media)
-
- if typename == "GraphSidecar" and \
- "edge_sidecar_to_children" not in post:
- media = next(self._media_by_id(post["id"]))
- return self._parse_post_api(media)
-
- pinned = post.get("pinned_for_users", ())
- if pinned:
- for index, user in enumerate(pinned):
- pinned[index] = int(user["id"])
-
- owner = post["owner"]
- data = {
- "typename" : typename,
- "date" : text.parse_timestamp(post["taken_at_timestamp"]),
- "likes" : post["edge_media_preview_like"]["count"],
- "pinned" : pinned,
- "owner_id" : owner["id"],
- "username" : owner.get("username"),
- "fullname" : owner.get("full_name"),
- "post_id" : post["id"],
- "post_shortcode": post["shortcode"],
- "post_url" : "{}/p/{}/".format(self.root, post["shortcode"]),
- "description": text.parse_unicode_escapes("\n".join(
- edge["node"]["text"]
- for edge in post["edge_media_to_caption"]["edges"]
- )),
- }
-
- tags = self._find_tags(data["description"])
- if tags:
- data["tags"] = sorted(set(tags))
-
- location = post.get("location")
- if location:
- data["location_id"] = location["id"]
- data["location_slug"] = location["slug"]
- data["location_url"] = "{}/explore/locations/{}/{}/".format(
- self.root, location["id"], location["slug"])
-
- data["_files"] = files = []
- if "edge_sidecar_to_children" in post:
- for num, edge in enumerate(
- post["edge_sidecar_to_children"]["edges"], 1):
- node = edge["node"]
- dimensions = node["dimensions"]
- media = {
- "num": num,
- "media_id" : node["id"],
- "shortcode" : (node.get("shortcode") or
- shortcode_from_id(node["id"])),
- "display_url": node["display_url"],
- "video_url" : node.get("video_url"),
- "width" : dimensions["width"],
- "height" : dimensions["height"],
- "sidecar_media_id" : post["id"],
- "sidecar_shortcode": post["shortcode"],
- }
- self._extract_tagged_users(node, media)
- files.append(media)
- else:
- dimensions = post["dimensions"]
- media = {
- "media_id" : post["id"],
- "shortcode" : post["shortcode"],
- "display_url": post["display_url"],
- "video_url" : post.get("video_url"),
- "width" : dimensions["width"],
- "height" : dimensions["height"],
- }
- self._extract_tagged_users(post, media)
- files.append(media)
-
- return data
-
- def _parse_post_api(self, post):
- if "items" in post:
+ def _parse_post_rest(self, post):
+ if "items" in post: # story or highlight
items = post["items"]
reel_id = str(post["id"]).rpartition(":")[2]
data = {
@@ -270,7 +147,7 @@ class InstagramExtractor(Extractor):
if "created_at" in post:
data["date"] = text.parse_timestamp(post.get("created_at"))
- else:
+ else: # regular image/video post
data = {
"post_id" : post["pk"],
"post_shortcode": post["code"],
@@ -344,6 +221,85 @@ class InstagramExtractor(Extractor):
return data
+ def _parse_post_graphql(self, post):
+ typename = post["__typename"]
+
+ if self._logged_in:
+ if post.get("is_video") and "video_url" not in post:
+ post = self.api.media(post["id"])[0]
+ elif typename == "GraphSidecar" and \
+ "edge_sidecar_to_children" not in post:
+ post = self.api.media(post["id"])[0]
+
+ pinned = post.get("pinned_for_users", ())
+ if pinned:
+ for index, user in enumerate(pinned):
+ pinned[index] = int(user["id"])
+
+ owner = post["owner"]
+ data = {
+ "typename" : typename,
+ "date" : text.parse_timestamp(post["taken_at_timestamp"]),
+ "likes" : post["edge_media_preview_like"]["count"],
+ "pinned" : pinned,
+ "owner_id" : owner["id"],
+ "username" : owner.get("username"),
+ "fullname" : owner.get("full_name"),
+ "post_id" : post["id"],
+ "post_shortcode": post["shortcode"],
+ "post_url" : "{}/p/{}/".format(self.root, post["shortcode"]),
+ "description": text.parse_unicode_escapes("\n".join(
+ edge["node"]["text"]
+ for edge in post["edge_media_to_caption"]["edges"]
+ )),
+ }
+
+ tags = self._find_tags(data["description"])
+ if tags:
+ data["tags"] = sorted(set(tags))
+
+ location = post.get("location")
+ if location:
+ data["location_id"] = location["id"]
+ data["location_slug"] = location["slug"]
+ data["location_url"] = "{}/explore/locations/{}/{}/".format(
+ self.root, location["id"], location["slug"])
+
+ data["_files"] = files = []
+ if "edge_sidecar_to_children" in post:
+ for num, edge in enumerate(
+ post["edge_sidecar_to_children"]["edges"], 1):
+ node = edge["node"]
+ dimensions = node["dimensions"]
+ media = {
+ "num": num,
+ "media_id" : node["id"],
+ "shortcode" : (node.get("shortcode") or
+ shortcode_from_id(node["id"])),
+ "display_url": node["display_url"],
+ "video_url" : node.get("video_url"),
+ "width" : dimensions["width"],
+ "height" : dimensions["height"],
+ "sidecar_media_id" : post["id"],
+ "sidecar_shortcode": post["shortcode"],
+ }
+ self._extract_tagged_users(node, media)
+ files.append(media)
+ else:
+ dimensions = post["dimensions"]
+ media = {
+ "media_id" : post["id"],
+ "shortcode" : post["shortcode"],
+ "display_url": post["display_url"],
+ "video_url" : post.get("video_url"),
+ "width" : dimensions["width"],
+ "height" : dimensions["height"],
+ }
+ self._extract_tagged_users(post, media)
+ files.append(media)
+
+ return data
+
@staticmethod
def _extract_tagged_users(src, dest):
dest["tagged_users"] = tagged_users = []
@@ -382,51 +338,6 @@ class InstagramExtractor(Extractor):
"username" : user["username"],
"full_name": user["full_name"]})
- def _pagination_graphql(self, query_hash, variables):
- cursor = self.config("cursor")
- if cursor:
- variables["after"] = cursor
-
- while True:
- data = next(iter(self._request_graphql(
- query_hash, variables)["user"].values()))
-
- for edge in data["edges"]:
- yield edge["node"]
-
- info = data["page_info"]
- if not info["has_next_page"]:
- return
- elif not data["edges"]:
- s = "" if self.item.endswith("s") else "s"
- raise exception.StopExtraction(
- "%s'%s posts are private", self.item, s)
-
- variables["after"] = self._cursor = info["end_cursor"]
- self.log.debug("Cursor: %s", self._cursor)
-
- def _pagination_api(self, endpoint, params=None):
- if params is None:
- params = {}
- while True:
- data = self._request_api(endpoint, params=params)
- yield from data["items"]
-
- if not data["more_available"]:
- return
- params["max_id"] = data["next_max_id"]
-
- def _pagination_api_post(self, endpoint, params, post=False):
- while True:
- data = self._request_api(endpoint, method="POST", data=params)
- for item in data["items"]:
- yield item["media"]
-
- info = data["paging_info"]
- if not info["more_available"]:
- return
- params["max_id"] = info["max_id"]
-
class InstagramUserExtractor(InstagramExtractor):
"""Extractor for an Instagram user profile"""
@@ -446,13 +357,13 @@ class InstagramUserExtractor(InstagramExtractor):
(InstagramHighlightsExtractor, base + "highlights/"),
(InstagramPostsExtractor , base + "posts/"),
(InstagramReelsExtractor , base + "reels/"),
- (InstagramChannelExtractor , base + "channel/"),
(InstagramTaggedExtractor , base + "tagged/"),
+ (InstagramChannelExtractor , base + "channel/"),
), ("posts",))
class InstagramPostsExtractor(InstagramExtractor):
- """Extractor for ProfilePage posts"""
+ """Extractor for an Instagram user's posts"""
subcategory = "posts"
pattern = USER_PATTERN + r"/posts"
test = ("https://www.instagram.com/instagram/posts/", {
@@ -461,13 +372,26 @@ class InstagramPostsExtractor(InstagramExtractor):
})
def posts(self):
- query_hash = "69cba40317214236af40e7efa697781d"
- variables = {"id": self._uid_by_screen_name(self.item), "first": 50}
- return self._pagination_graphql(query_hash, variables)
+ uid = self.api.user_id(self.item)
+ return self.api.user_feed(uid)
+
+
+class InstagramReelsExtractor(InstagramExtractor):
+ """Extractor for an Instagram user's reels"""
+ subcategory = "reels"
+ pattern = USER_PATTERN + r"/reels"
+ test = ("https://www.instagram.com/instagram/reels/", {
+ "range": "40-60",
+ "count": ">= 20",
+ })
+
+ def posts(self):
+ uid = self.api.user_id(self.item)
+ return self.api.user_clips(uid)
class InstagramTaggedExtractor(InstagramExtractor):
- """Extractor for ProfilePage tagged posts"""
+ """Extractor for an Instagram user's tagged posts"""
subcategory = "tagged"
pattern = USER_PATTERN + r"/tagged"
test = ("https://www.instagram.com/instagram/tagged/", {
@@ -485,7 +409,7 @@ class InstagramTaggedExtractor(InstagramExtractor):
self.user_id = self.item[3:]
return {"tagged_owner_id": self.user_id}
- user = self._user_by_screen_name(self.item)
+ user = self.api.user(self.item)
self.user_id = user["id"]
return {
@@ -495,13 +419,11 @@ class InstagramTaggedExtractor(InstagramExtractor):
}
def posts(self):
- endpoint = "/v1/usertags/{}/feed/".format(self.user_id)
- params = {"count": 50}
- return self._pagination_api(endpoint, params)
+ return self.api.user_tagged(self.user_id)
class InstagramChannelExtractor(InstagramExtractor):
- """Extractor for ProfilePage channel"""
+ """Extractor for an Instagram user's channel posts"""
subcategory = "channel"
pattern = USER_PATTERN + r"/channel"
test = ("https://www.instagram.com/instagram/channel/", {
@@ -510,25 +432,25 @@ class InstagramChannelExtractor(InstagramExtractor):
})
def posts(self):
- query_hash = "bc78b344a68ed16dd5d7f264681c4c76"
- variables = {"id": self._uid_by_screen_name(self.item), "first": 50}
- return self._pagination_graphql(query_hash, variables)
+ uid = self.api.user_id(self.item)
+ return self.api.user_clips(uid)
class InstagramSavedExtractor(InstagramExtractor):
- """Extractor for ProfilePage saved media"""
+ """Extractor for an Instagram user's saved media"""
subcategory = "saved"
- pattern = USER_PATTERN + r"/saved/?$"
- test = ("https://www.instagram.com/instagram/saved/",)
+ pattern = USER_PATTERN + r"/saved(?:/all-posts)?/?$"
+ test = (
+ ("https://www.instagram.com/instagram/saved/"),
+ ("https://www.instagram.com/instagram/saved/all-posts/"),
+ )
def posts(self):
- query_hash = "2ce1d673055b99250e93b6f88f878fde"
- variables = {"id": self._uid_by_screen_name(self.item), "first": 50}
- return self._pagination_graphql(query_hash, variables)
+ return self.api.user_saved()
class InstagramCollectionExtractor(InstagramExtractor):
- """Extractor for ProfilePage saved collection media"""
+ """Extractor for Instagram collection"""
subcategory = "collection"
pattern = USER_PATTERN + r"/saved/([^/?#]+)/([^/?#]+)"
test = (
@@ -546,13 +468,59 @@ class InstagramCollectionExtractor(InstagramExtractor):
}
def posts(self):
- endpoint = "/v1/feed/collection/{}/posts/".format(self.collection_id)
- for item in self._pagination_api(endpoint):
- yield item["media"]
+ return self.api.user_collection(self.collection_id)
+
+
+class InstagramStoriesExtractor(InstagramExtractor):
+ """Extractor for Instagram stories"""
+ subcategory = "stories"
+ pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
+ r"/stories/(?:highlights/(\d+)|([^/?#]+)(?:/(\d+))?)")
+ test = (
+ ("https://www.instagram.com/stories/instagram/"),
+ ("https://www.instagram.com/stories/highlights/18042509488170095/"),
+ ("https://instagram.com/stories/geekmig/2724343156064789461"),
+ )
+
+ def __init__(self, match):
+ self.highlight_id, self.user, self.media_id = match.groups()
+ if self.highlight_id:
+ self.subcategory = InstagramHighlightsExtractor.subcategory
+ InstagramExtractor.__init__(self, match)
+
+ def posts(self):
+ if self.highlight_id:
+ reel_id = "highlight:" + self.highlight_id
+ else:
+ reel_id = self.api.user_id(self.user)
+
+ reels = self.api.reels_media(reel_id)
+
+ if self.media_id and reels:
+ reel = reels[0]
+ for item in reel["items"]:
+ if item["pk"] == self.media_id:
+ reel["items"] = (item,)
+ break
+ else:
+ raise exception.NotFoundError("story")
+
+ return reels
+
+
+class InstagramHighlightsExtractor(InstagramExtractor):
+ """Extractor for an Instagram user's story highlights"""
+ subcategory = "highlights"
+ pattern = USER_PATTERN + r"/highlights"
+ test = ("https://www.instagram.com/instagram/highlights",)
+
+ def posts(self):
+ uid = self.api.user_id(self.item)
+ return self.api.highlights_media(uid)
class InstagramTagExtractor(InstagramExtractor):
- """Extractor for TagPage"""
+ """Extractor for Instagram tags"""
subcategory = "tag"
directory_fmt = ("{category}", "{subcategory}", "{tag}")
pattern = BASE_PATTERN + r"/explore/tags/([^/?#]+)"
@@ -565,27 +533,7 @@ class InstagramTagExtractor(InstagramExtractor):
return {"tag": text.unquote(self.item)}
def posts(self):
- endpoint = "/v1/tags/{}/sections/".format(self.item)
- data = {
- "include_persistent": "0",
- "max_id" : None,
- "page" : None,
- "surface": "grid",
- "tab" : "recent",
- }
-
- while True:
- info = self._request_api(endpoint, method="POST", data=data)
-
- for section in info["sections"]:
- for media in section["layout_content"]["medias"]:
- yield media["media"]
-
- if not info.get("more_available"):
- return
-
- data["max_id"] = info["next_max_id"]
- data["page"] = info["next_page"]
+ return self.api.tags_media(self.item)
class InstagramPostExtractor(InstagramExtractor):
@@ -618,7 +566,6 @@ class InstagramPostExtractor(InstagramExtractor):
"width": int,
}
}),
-
# GraphSidecar
("https://www.instagram.com/p/BoHk1haB5tM/", {
"count": 5,
@@ -633,7 +580,6 @@ class InstagramPostExtractor(InstagramExtractor):
"username": "instagram",
}
}),
-
# GraphVideo
("https://www.instagram.com/p/Bqxp0VSBgJg/", {
"pattern": r"/46840863_726311431074534_7805566102611403091_n\.mp4",
@@ -651,7 +597,6 @@ class InstagramPostExtractor(InstagramExtractor):
"width": int,
}
}),
-
# GraphVideo (IGTV)
("https://www.instagram.com/tv/BkQjCfsBIzi/", {
"pattern": r"/10000000_597132547321814_702169244961988209_n\.mp4",
@@ -668,7 +613,6 @@ class InstagramPostExtractor(InstagramExtractor):
"width": int,
}
}),
-
# GraphSidecar with 2 embedded GraphVideo objects
("https://www.instagram.com/p/BtOvDOfhvRr/", {
"count": 2,
@@ -679,7 +623,6 @@ class InstagramPostExtractor(InstagramExtractor):
"video_url": str,
}
}),
-
# GraphImage with tagged user
("https://www.instagram.com/p/B_2lf3qAd3y/", {
"keyword": {
@@ -690,98 +633,265 @@ class InstagramPostExtractor(InstagramExtractor):
}]
}
}),
-
# URL with username (#2085)
("https://www.instagram.com/dm/p/CW042g7B9CY/"),
-
("https://www.instagram.com/reel/CDg_6Y1pxWu/"),
)
def posts(self):
- return self._media_by_id(id_from_shortcode(self.item))
+ return self.api.media(id_from_shortcode(self.item))
-class InstagramStoriesExtractor(InstagramExtractor):
- """Extractor for Instagram stories"""
- subcategory = "stories"
- pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
- r"/stories/(?:highlights/(\d+)|([^/?#]+)(?:/(\d+))?)")
- test = (
- ("https://www.instagram.com/stories/instagram/"),
- ("https://www.instagram.com/stories/highlights/18042509488170095/"),
- ("https://instagram.com/stories/geekmig/2724343156064789461"),
- )
+class InstagramRestAPI():
- def __init__(self, match):
- self.highlight_id, self.user, self.media_id = match.groups()
- if self.highlight_id:
- self.subcategory = InstagramHighlightsExtractor.subcategory
- InstagramExtractor.__init__(self, match)
+ def __init__(self, extractor):
+ self.extractor = extractor
- def posts(self):
- if self.highlight_id:
- reel_id = "highlight:" + self.highlight_id
- else:
- reel_id = self._uid_by_screen_name(self.user)
+ def highlights_media(self, user_id):
+ chunk_size = 5
+ reel_ids = [hl["id"] for hl in self.highlights_tray(user_id)]
+
+ for offset in range(0, len(reel_ids), chunk_size):
+ yield from self.reels_media(
+ reel_ids[offset : offset+chunk_size])
+
+ def highlights_tray(self, user_id):
+ endpoint = "/v1/highlights/{}/highlights_tray/".format(user_id)
+ return self._call(endpoint)["tray"]
+
+ def media(self, post_id):
+ endpoint = "/v1/media/{}/info/".format(post_id)
+ return self._pagination(endpoint)
+ def reels_media(self, reel_ids):
endpoint = "/v1/feed/reels_media/"
- params = {"reel_ids": reel_id}
- reels = self._request_api(endpoint, params=params)["reels"]
+ params = {"reel_ids": reel_ids}
+ return self._call(endpoint, params=params)["reels_media"]
- if self.media_id:
- reel = reels[reel_id]
- for item in reel["items"]:
- if item["pk"] == self.media_id:
- reel["items"] = (item,)
- break
+ def tags_media(self, tag):
+ for section in self.tags_sections(tag):
+ for media in section["layout_content"]["medias"]:
+ yield media["media"]
+
+ def tags_sections(self, tag):
+ endpoint = "/v1/tags/{}/sections/".format(tag)
+ data = {
+ "include_persistent": "0",
+ "max_id" : None,
+ "page" : None,
+ "surface": "grid",
+ "tab" : "recent",
+ }
+ return self._pagination_sections(endpoint, data)
+
+ @memcache(keyarg=1)
+ def user(self, screen_name):
+ endpoint = "/v1/users/web_profile_info/"
+ params = {"username": screen_name}
+ return self._call(endpoint, params=params)["data"]["user"]
+
+ def user_id(self, screen_name):
+ if screen_name.startswith("id:"):
+ return screen_name[3:]
+ return self.user(screen_name)["id"]
+
+ def user_clips(self, user_id):
+ endpoint = "/v1/clips/user/"
+ data = {"target_user_id": user_id, "page_size": "50"}
+ return self._pagination_post(endpoint, data)
+
+ def user_collection(self, collection_id):
+ endpoint = "/v1/feed/collection/{}/posts/".format(collection_id)
+ params = {"count": 50}
+ return self._pagination(endpoint, params, media=True)
+
+ def user_feed(self, user_id):
+ endpoint = "/v1/feed/user/{}/".format(user_id)
+ params = {"count": 30}
+ return self._pagination(endpoint, params)
+
+ def user_saved(self):
+ endpoint = "/v1/feed/saved/posts/"
+ params = {"count": 50}
+ return self._pagination(endpoint, params, media=True)
+
+ def user_tagged(self, user_id):
+ endpoint = "/v1/usertags/{}/feed/".format(user_id)
+ params = {"count": 50}
+ return self._pagination(endpoint, params)
+
+ def _call(self, endpoint, **kwargs):
+ extr = self.extractor
+
+ url = "https://i.instagram.com/api" + endpoint
+ kwargs["headers"] = {
+ "X-CSRFToken" : extr.csrf_token,
+ "X-Instagram-AJAX": "1006242110",
+ "X-IG-App-ID" : "936619743392459",
+ "X-ASBD-ID" : "198387",
+ "X-IG-WWW-Claim" : extr.www_claim,
+ "Origin" : extr.root,
+ "Referer" : extr.root + "/",
+ }
+ kwargs["cookies"] = {
+ "csrftoken": extr.csrf_token,
+ }
+ return extr.request(url, **kwargs).json()
+
+ def _pagination(self, endpoint, params=None, media=False):
+ if params is None:
+ params = {}
+ while True:
+ data = self._call(endpoint, params=params)
+
+ if media:
+ for item in data["items"]:
+ yield item["media"]
else:
- raise exception.NotFoundError("story")
+ yield from data["items"]
- return reels.values()
+ if not data.get("more_available"):
+ return
+ params["max_id"] = data["next_max_id"]
+ def _pagination_post(self, endpoint, params):
+ while True:
+ data = self._call(endpoint, method="POST", data=params)
-class InstagramHighlightsExtractor(InstagramExtractor):
- """Extractor for all Instagram story highlights of a user"""
- subcategory = "highlights"
- pattern = USER_PATTERN + r"/highlights"
- test = ("https://www.instagram.com/instagram/highlights",)
+ for item in data["items"]:
+ yield item["media"]
- def posts(self):
- endpoint = "/v1/highlights/{}/highlights_tray/".format(
- self._uid_by_screen_name(self.item))
- tray = self._request_api(endpoint)["tray"]
- reel_ids = [highlight["id"] for highlight in tray]
+ info = data["paging_info"]
+ if not info.get("more_available"):
+ return
+ params["max_id"] = info["max_id"]
- # Anything above 30 responds with statuscode 400.
- # 30 can work, however, sometimes the API will respond with 560 or 500.
- chunk_size = 5
- endpoint = "/v1/feed/reels_media/"
+ def _pagination_sections(self, endpoint, params):
+ while True:
+ info = self._call(endpoint, method="POST", data=params)
- for offset in range(0, len(reel_ids), chunk_size):
- chunk_ids = reel_ids[offset : offset+chunk_size]
- params = {"reel_ids": chunk_ids}
- reels = self._request_api(endpoint, params=params)["reels"]
- for reel_id in chunk_ids:
- yield reels[reel_id]
+ yield from info["sections"]
+
+ if not info.get("more_available"):
+ return
+ params["max_id"] = info["next_max_id"]
+ params["page"] = info["next_page"]
-class InstagramReelsExtractor(InstagramExtractor):
- """Extractor for an Instagram user's reels"""
- subcategory = "reels"
- pattern = USER_PATTERN + r"/reels"
- test = ("https://www.instagram.com/instagram/reels/", {
- "range": "40-60",
- "count": ">= 20",
- })
+class InstagramGraphqlAPI():
- def posts(self):
- endpoint = "/v1/clips/user/"
- data = {
- "target_user_id": self._uid_by_screen_name(self.item),
- "page_size" : "50",
+ def __init__(self, extractor):
+ self.extractor = extractor
+ self.user = InstagramRestAPI(extractor).user
+ self.user_collection = self.user_saved = self.reels_media = \
+ self.highlights_media = self._login_required
+ self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode
+
+ @staticmethod
+ def _login_required(_=None):
+ raise exception.AuthorizationError("Login required")
+
+ def highlights_tray(self, user_id):
+ query_hash = "d4d88dc1500312af6f937f7b804c68c3"
+ variables = {
+ "user_id": user_id,
+ "include_chaining": False,
+ "include_reel": False,
+ "include_suggested_users": False,
+ "include_logged_out_extras": True,
+ "include_highlight_reels": True,
+ "include_live_status": False,
+ }
+ edges = (self._call(query_hash, variables)["user"]
+ ["edge_highlight_reels"]["edges"])
+ return [edge["node"] for edge in edges]
+
+ def media(self, post_id):
+ query_hash = "9f8827793ef34641b2fb195d4d41151c"
+ variables = {
+ "shortcode": shortcode_from_id(post_id),
+ "child_comment_count": 3,
+ "fetch_comment_count": 40,
+ "parent_comment_count": 24,
+ "has_threaded_comments": True,
+ }
+ media = self._call(query_hash, variables).get("shortcode_media")
+ return (media,) if media else ()
+
+ def tags_media(self, tag):
+ query_hash = "9b498c08113f1e09617a1703c22b2f32"
+ variables = {"tag_name": text.unescape(tag), "first": 50}
+ return self._pagination(query_hash, variables,
+ "hashtag", "edge_hashtag_to_media")
+
+ def user_id(self, screen_name):
+ if screen_name.startswith("id:"):
+ return screen_name[3:]
+ return self.user(screen_name)["id"]
+
+ def user_clips(self, user_id):
+ query_hash = "bc78b344a68ed16dd5d7f264681c4c76"
+ variables = {"id": user_id, "first": 50}
+ return self._pagination(query_hash, variables)
+
+ def user_feed(self, user_id):
+ query_hash = "69cba40317214236af40e7efa697781d"
+ variables = {"id": user_id, "first": 50}
+ return self._pagination(query_hash, variables)
+
+ def user_tagged(self, user_id):
+ query_hash = "be13233562af2d229b008d2976b998b5"
+ variables = {"id": user_id, "first": 50}
+ return self._pagination(query_hash, variables)
+
+ def _call(self, query_hash, variables):
+ extr = self.extractor
+
+ url = "https://www.instagram.com/graphql/query/"
+ params = {
+ "query_hash": query_hash,
+ "variables" : self._json_dumps(variables),
+ }
+ headers = {
+ "Accept" : "*/*",
+ "X-CSRFToken" : extr.csrf_token,
+ "X-Instagram-AJAX": "1006267176",
+ "X-IG-App-ID" : "936619743392459",
+ "X-ASBD-ID" : "198387",
+ "X-IG-WWW-Claim" : extr.www_claim,
+ "X-Requested-With": "XMLHttpRequest",
+ "Referer" : extr.root + "/",
+ }
+ cookies = {
+ "csrftoken": extr.csrf_token,
}
+ return extr.request(
+ url, params=params, headers=headers, cookies=cookies,
+ ).json()["data"]
- return self._pagination_api_post(endpoint, data)
+ def _pagination(self, query_hash, variables,
+ key_data="user", key_edge=None):
+ cursor = self.extractor.config("cursor")
+ if cursor:
+ variables["after"] = cursor
+
+ while True:
+ data = self._call(query_hash, variables)[key_data]
+ data = data[key_edge] if key_edge else next(iter(data.values()))
+
+ for edge in data["edges"]:
+ yield edge["node"]
+
+ info = data["page_info"]
+ if not info["has_next_page"]:
+ return
+ elif not data["edges"]:
+ item = self.extractor.item
+ s = "" if item.endswith("s") else "s"
+ raise exception.StopExtraction(
+ "%s'%s posts are private", item, s)
+
+ variables["after"] = self.extractor._cursor = info["end_cursor"]
+ self.extractor.log.debug("Cursor: %s", variables["after"])
@cache(maxage=360*24*3600, keyarg=1)
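
InstagramPostExtractor above converts between URL shortcodes and
numeric media IDs via id_from_shortcode()/shortcode_from_id(). A
sketch of the underlying arithmetic, assuming (as those helpers do)
that a shortcode is the media ID written in base 64 over Instagram's
URL-safe alphabet:

    _ALPHABET = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                 "abcdefghijklmnopqrstuvwxyz"
                 "0123456789-_")

    def id_from_shortcode(shortcode):
        num = 0
        for char in shortcode:
            num = num * 64 + _ALPHABET.index(char)
        return num

    def shortcode_from_id(post_id):
        num, code = int(post_id), ""
        while num:
            num, digit = divmod(num, 64)
            code = _ALPHABET[digit] + code
        return code

    assert shortcode_from_id(id_from_shortcode("CDg_6Y1pxWu")) == "CDg_6Y1pxWu"
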
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index 816b561..750b741 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -33,6 +33,7 @@ class KemonopartyExtractor(Extractor):
self.cookiedomain = ".coomer.party"
self.root = text.root_from_url(match.group(0))
Extractor.__init__(self, match)
+ self.session.headers["Referer"] = self.root + "/"
def items(self):
self._prepare_ddosguard_cookies()
@@ -46,7 +47,7 @@ class KemonopartyExtractor(Extractor):
comments = self.config("comments")
username = dms = None
- # prevent files to be sent with gzip compression
+ # prevent files from being sent with gzip compression
headers = {"Accept-Encoding": "identity"}
if self.config("metadata"):
@@ -63,6 +64,9 @@ class KemonopartyExtractor(Extractor):
for post in posts:
+ headers["Referer"] = "{}/{}/user/{}/post/{}".format(
+ self.root, post["service"], post["user"], post["id"])
+ post["_http_headers"] = headers
post["date"] = text.parse_datetime(
post["published"] or post["added"],
"%a, %d %b %Y %H:%M:%S %Z")
@@ -74,27 +78,33 @@ class KemonopartyExtractor(Extractor):
if dms is True:
dms = self._extract_dms(post)
post["dms"] = dms
- yield Message.Directory, post
+ files = []
hashes = set()
- post["num"] = 0
+
for file in itertools.chain.from_iterable(
g(post) for g in generators):
url = file["path"]
match = find_hash(url)
if match:
- post["hash"] = hash = match.group(1)
+ file["hash"] = hash = match.group(1)
if hash in hashes and not duplicates:
self.log.debug("Skipping %s (duplicate)", url)
continue
hashes.add(hash)
else:
- post["hash"] = ""
+ file["hash"] = ""
+
+ files.append(file)
+ post["count"] = len(files)
+ yield Message.Directory, post
+
+ for post["num"], file in enumerate(files, 1):
+ post["hash"] = file["hash"]
post["type"] = file["type"]
- post["num"] += 1
- post["_http_headers"] = headers
+ url = file["path"]
text.nameext_from_url(file.get("name", url), post)
if not post["extension"]:
@@ -236,6 +246,7 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
"keyword": {
"added": "Wed, 06 May 2020 20:28:02 GMT",
"content": str,
+ "count": 1,
"date": "dt:2019-08-11 02:09:04",
"edited": None,
"embed": dict,
@@ -374,6 +385,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
post["channel_name"] = self.channel_name
post["date"] = text.parse_datetime(
post["published"], "%a, %d %b %Y %H:%M:%S %Z")
+ post["count"] = len(files)
yield Message.Directory, post
for post["num"], file in enumerate(files, 1):
@@ -466,7 +478,7 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor):
if self.favorites == "artist":
users = self.request(
- self.root + "/api/v1/account/favorites?type=artist").json()
+ self.root + "/api/favorites?type=artist").json()
for user in users:
user["_extractor"] = KemonopartyUserExtractor
url = "{}/{}/user/{}".format(
@@ -475,7 +487,7 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor):
elif self.favorites == "post":
posts = self.request(
- self.root + "/api/v1/account/favorites?type=post").json()
+ self.root + "/api/favorites?type=post").json()
for post in posts:
post["_extractor"] = KemonopartyPostExtractor
url = "{}/{}/user/{}/post/{}".format(
diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py
index 493a8ef..9ce5772 100644
--- a/gallery_dl/extractor/mastodon.py
+++ b/gallery_dl/extractor/mastodon.py
@@ -31,6 +31,8 @@ class MastodonExtractor(BaseExtractor):
def items(self):
for status in self.statuses():
+ if self._check_move:
+ self._check_move(status["account"])
if not self.reblogs and status["reblog"]:
self.log.debug("Skipping %s (reblog)", status["id"])
continue
@@ -56,6 +58,12 @@ class MastodonExtractor(BaseExtractor):
"""Return an iterable containing all relevant Status objects"""
return ()
+ def _check_move(self, account):
+ self._check_move = None
+ if "moved" in account:
+ self.log.warning("Account '%s' moved to '%s'",
+ account["acct"], account["moved"]["acct"])
+
INSTANCES = {
"mastodon.social": {
@@ -192,6 +200,7 @@ class MastodonAPI():
handle = "@{}@{}".format(username, self.extractor.instance)
for account in self.account_search(handle, 1):
if account["username"] == username:
+ self.extractor._check_move(account)
return account["id"]
raise exception.NotFoundError("account")
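
_check_move() above is a run-once check: its first call shadows the
method with an instance attribute set to None, so the
"if self._check_move:" guard in items() turns falsy afterwards. A
self-contained sketch of the trick:

    class Extractor:
        def items(self):
            accounts = ({"acct": "old@inst", "moved": {"acct": "new@inst"}},
                        {"acct": "other@inst", "moved": {"acct": "x@inst"}})
            for account in accounts:
                if self._check_move:      # bound method until first call
                    self._check_move(account)

        def _check_move(self, account):
            self._check_move = None       # instance attr shadows the method
            if "moved" in account:
                print("Account '%s' moved to '%s'"
                      % (account["acct"], account["moved"]["acct"]))

    Extractor().items()                   # warns for the first account only
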
diff --git a/gallery_dl/extractor/myportfolio.py b/gallery_dl/extractor/myportfolio.py
index f06ab70..8254118 100644
--- a/gallery_dl/extractor/myportfolio.py
+++ b/gallery_dl/extractor/myportfolio.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2021 Mike Fährmann
+# Copyright 2018-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -104,4 +104,7 @@ class MyportfolioGalleryExtractor(Extractor):
@staticmethod
def images(page):
"""Extract and return a list of all image-urls"""
- return list(text.extract_iter(page, 'js-lightbox" data-src="', '"'))
+ return (
+ list(text.extract_iter(page, 'js-lightbox" data-src="', '"')) or
+ list(text.extract_iter(page, 'data-src="', '"'))
+ )
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index d9ab336..2c2dcb9 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -155,6 +155,7 @@ class NewgroundsExtractor(Extractor):
data = {
"title" : text.unescape(extr('"og:title" content="', '"')),
"description": text.unescape(extr(':description" content="', '"')),
+ "type" : extr('og:type" content="', '"'),
"date" : text.parse_datetime(extr(
'itemprop="datePublished" content="', '"')),
"rating" : extr('class="rated-', '"'),
@@ -173,6 +174,7 @@ class NewgroundsExtractor(Extractor):
return {
"title" : text.unescape(extr('"og:title" content="', '"')),
"description": text.unescape(extr(':description" content="', '"')),
+ "type" : extr('og:type" content="', '"'),
"date" : text.parse_datetime(extr(
'itemprop="datePublished" content="', '"')),
"url" : extr('{"url":"', '"').replace("\\/", "/"),
@@ -184,6 +186,7 @@ class NewgroundsExtractor(Extractor):
def _extract_media_data(self, extr, url):
index = url.split("/")[5]
title = extr('"og:title" content="', '"')
+ type = extr('og:type" content="', '"')
descr = extr('"og:description" content="', '"')
src = extr('{"url":"', '"')
@@ -223,6 +226,7 @@ class NewgroundsExtractor(Extractor):
"title" : text.unescape(title),
"url" : src,
"date" : date,
+ "type" : type,
"description": text.unescape(descr or extr(
'itemprop="description" content="', '"')),
"rating" : extr('class="rated-', '"'),
@@ -305,6 +309,7 @@ class NewgroundsImageExtractor(NewgroundsExtractor):
"score" : float,
"tags" : ["ryu", "streetfighter"],
"title" : "Ryu is Hawt",
+ "type" : "article",
"user" : "tomfulp",
"width" : 447,
},
@@ -357,6 +362,7 @@ class NewgroundsMediaExtractor(NewgroundsExtractor):
"score" : float,
"tags" : ["alienhominid", "trailer"],
"title" : "Alien Hominid Fan Trailer",
+ "type" : "movie",
"user" : "kickinthehead",
},
}),
@@ -373,6 +379,7 @@ class NewgroundsMediaExtractor(NewgroundsExtractor):
"score" : float,
"tags" : ["fulp", "interview", "tom", "zj"],
"title" : "ZJ Interviews Tom Fulp!",
+ "type" : "music.song",
"user" : "zj",
},
}),
@@ -380,6 +387,7 @@ class NewgroundsMediaExtractor(NewgroundsExtractor):
("https://www.newgrounds.com/portal/view/161181/format/flash", {
"pattern": r"https://uploads\.ungrounded\.net/161000"
r"/161181_ddautta_mask__550x281_\.swf\?f1081628129",
+ "keyword": {"type": "movie"},
}),
# format selection (#1729)
("https://www.newgrounds.com/portal/view/758545", {
@@ -392,6 +400,49 @@ class NewgroundsMediaExtractor(NewgroundsExtractor):
"options": (("username", None),),
"count": 1,
}),
+ # flash game
+ ("https://www.newgrounds.com/portal/view/829032", {
+ "pattern": r"https://uploads\.ungrounded\.net/829000"
+ r"/829032_picovsbeardx\.swf\?f1641968445",
+ "range": "1",
+ "keyword": {
+ "artist" : [
+ "dungeonation",
+ "carpetbakery",
+ "animalspeakandrews",
+ "bill",
+ "chipollo",
+ "dylz49",
+ "gappyshamp",
+ "pinktophat",
+ "rad",
+ "shapeshiftingblob",
+ "tomfulp",
+ "voicesbycorey",
+ "psychogoldfish",
+ ],
+ "comment" : "re:The children are expendable. Take out the ",
+ "date" : "dt:2022-01-10 23:00:57",
+ "description": "Bloodshed in The Big House that Blew...again!",
+ "favorites" : int,
+ "index" : 829032,
+ "post_url" : "https://www.newgrounds.com/portal/view/829032",
+ "rating" : "m",
+ "score" : float,
+ "tags" : [
+ "assassin",
+ "boyfriend",
+ "darnell",
+ "nene",
+ "pico",
+ "picos-school",
+ ],
+ "title" : "PICO VS BEAR DX",
+ "type" : "game",
+ "url" : "https://uploads.ungrounded.net/829000"
+ "/829032_picovsbeardx.swf?f1641968445",
+ },
+ }),
)
def __init__(self, match):
@@ -434,6 +485,17 @@ class NewgroundsMoviesExtractor(NewgroundsExtractor):
})
+class NewgroundsGamesExtractor(NewgroundsExtractor):
+ """Extractor for a newgrounds user's games"""
+ subcategory = _path = "games"
+ pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/games/?$"
+ test = ("https://tomfulp.newgrounds.com/games", {
+ "pattern": r"https://uploads.ungrounded.net(/alternate)?/\d+/\d+_.+",
+ "range": "1-10",
+ "count": 10,
+ })
+
+
class NewgroundsUserExtractor(NewgroundsExtractor):
"""Extractor for a newgrounds user profile"""
subcategory = "user"
@@ -454,6 +516,7 @@ class NewgroundsUserExtractor(NewgroundsExtractor):
return self._dispatch_extractors((
(NewgroundsArtExtractor , base + "art"),
(NewgroundsAudioExtractor , base + "audio"),
+ (NewgroundsGamesExtractor , base + "games"),
(NewgroundsMoviesExtractor, base + "movies"),
), ("art",))
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index a589760..6b2e1c3 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -642,6 +642,66 @@ class PixivPixivisionExtractor(PixivExtractor):
}
+class PixivSeriesExtractor(PixivExtractor):
+ """Extractor for illustrations from a Pixiv series"""
+ subcategory = "series"
+ directory_fmt = ("{category}", "{user[id]} {user[account]}",
+ "{series[id]} {series[title]}")
+ filename_fmt = "{num_series:>03}_{id}_p{num}.{extension}"
+ pattern = (r"(?:https?://)?(?:www\.)?pixiv\.net"
+ r"/user/(\d+)/series/(\d+)")
+ test = ("https://www.pixiv.net/user/10509347/series/21859", {
+ "range": "1-10",
+ "count": 10,
+ "keyword": {
+ "num_series": int,
+ "series": {
+ "canonical": "https://www.pixiv.net/user/10509347"
+ "/series/21859",
+ "description": str,
+ "ogp": dict,
+ "title": "先輩がうざい後輩の話",
+ "total": int,
+ "twitter": dict,
+ },
+ },
+ })
+
+ def __init__(self, match):
+ PixivExtractor.__init__(self, match)
+ self.user_id, self.series_id = match.groups()
+
+ def works(self):
+ url = self.root + "/ajax/series/" + self.series_id
+ params = {"p": 1}
+ headers = {
+ "Accept": "application/json",
+ "Referer": "{}/user/{}/series/{}".format(
+ self.root, self.user_id, self.series_id),
+ "Alt-Used": "www.pixiv.net",
+ }
+
+ while True:
+ data = self.request(url, params=params, headers=headers).json()
+ body = data["body"]
+ page = body["page"]
+
+ series = body["extraData"]["meta"]
+ series["id"] = self.series_id
+ series["total"] = page["total"]
+ series["title"] = text.extract(series["title"], '"', '"')[0]
+
+ for info in page["series"]:
+ work = self.api.illust_detail(info["workId"])
+ work["num_series"] = info["order"]
+ work["series"] = series
+ yield work
+
+ if len(page["series"]) < 10:
+ return
+ params["p"] += 1
+
+
class PixivSketchExtractor(Extractor):
"""Extractor for user pages on sketch.pixiv.net"""
category = "pixiv"
diff --git a/gallery_dl/extractor/plurk.py b/gallery_dl/extractor/plurk.py
index f2e964d..535fae9 100644
--- a/gallery_dl/extractor/plurk.py
+++ b/gallery_dl/extractor/plurk.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2020 Mike Fährmann
+# Copyright 2019-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -85,7 +85,7 @@ class PlurkTimelineExtractor(PlurkExtractor):
def plurks(self):
url = "{}/{}".format(self.root, self.user)
page = self.request(url).text
- user_id, pos = text.extract(page, '"user_id":', ',')
+ user_id, pos = text.extract(page, '"page_user": {"id":', ',')
plurks = self._load(text.extract(page, "_PLURKS = ", ";\n", pos)[0])
headers = {"Referer": url, "X-Requested-With": "XMLHttpRequest"}
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index 2ce7f6c..3396e3a 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -53,12 +53,15 @@ class SankakuExtractor(BooruExtractor):
url = "https://s.sankakucomplex.com" + url[url.index("/", 8):]
return url
- @staticmethod
- def _prepare(post):
+ def _prepare(self, post):
post["created_at"] = post["created_at"]["s"]
post["date"] = text.parse_timestamp(post["created_at"])
post["tags"] = [tag["name"] for tag in post["tags"] if tag["name"]]
post["tag_string"] = " ".join(post["tags"])
+ post["_http_validate"] = self._check_expired
+
+ def _check_expired(self, response):
+ return not response.history or '.com/expired.png' not in response.url
def _extended_tags(self, post):
tags = collections.defaultdict(list)
@@ -219,7 +222,11 @@ class SankakuAPI():
def __init__(self, extractor):
self.extractor = extractor
- self.headers = {"Accept": "application/vnd.sankaku.api+json;v=2"}
+ self.headers = {
+ "Accept" : "application/vnd.sankaku.api+json;v=2",
+ "Origin" : extractor.root,
+ "Referer": extractor.root + "/",
+ }
self.username, self.password = self.extractor._get_auth_info()
if not self.username:
@@ -253,11 +260,14 @@ class SankakuAPI():
for _ in range(5):
self.authenticate()
response = self.extractor.request(
- url, params=params, headers=self.headers, fatal=False)
+ url, params=params, headers=self.headers, fatal=None)
if response.status_code == 429:
- self.extractor.wait(
- until=response.headers.get("X-RateLimit-Reset"))
+ until = response.headers.get("X-RateLimit-Reset")
+ if not until and b"tags-limit" in response.content:
+ raise exception.StopExtraction("Search tag limit exceeded")
+ seconds = None if until else 60
+ self.extractor.wait(until=until, seconds=seconds)
continue
data = response.json()
@@ -278,9 +288,41 @@ class SankakuAPI():
params["lang"] = "en"
params["limit"] = str(self.extractor.per_page)
+ refresh = self.extractor.config("refresh", False)
+ if refresh:
+ offset = expires = 0
+ from time import time
+
while True:
data = self._call(endpoint, params)
- yield from data["data"]
+
+ if refresh:
+ posts = data["data"]
+ if offset:
+ posts = util.advance(posts, offset)
+
+ for post in posts:
+ if not expires:
+ url = post["file_url"]
+ if url:
+ expires = text.parse_int(
+ text.extract(url, "e=", "&")[0]) - 60
+
+ if 0 < expires <= time():
+ self.extractor.log.debug("Refreshing download URLs")
+ expires = None
+ break
+
+ offset += 1
+ yield post
+
+ if expires is None:
+ expires = 0
+ continue
+ offset = expires = 0
+
+ else:
+ yield from data["data"]
params["next"] = data["meta"]["next"]
if not params["next"]:
diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py
index cd8c238..822b1f2 100644
--- a/gallery_dl/extractor/skeb.py
+++ b/gallery_dl/extractor/skeb.py
@@ -26,8 +26,11 @@ class SkebExtractor(Extractor):
self.article = self.config("article", False)
def items(self):
+ metadata = self.metadata()
for user_name, post_num in self.posts():
response, post = self._get_post_data(user_name, post_num)
+ if metadata:
+ post.update(metadata)
yield Message.Directory, post
for data in self._get_urls_from_post(response, post):
url = data["file_url"]
@@ -36,6 +39,9 @@ class SkebExtractor(Extractor):
def posts(self):
"""Return post number"""
+ def metadata(self):
+ """Return additional metadata"""
+
def _pagination(self, url, params):
headers = {"Referer": self.root, "Authorization": "Bearer null"}
params["offset"] = 0
@@ -223,6 +229,62 @@ class SkebUserExtractor(SkebExtractor):
return posts
+class SkebSearchExtractor(SkebExtractor):
+ """Extractor for skeb search results"""
+ subcategory = "search"
+ pattern = r"(?:https?://)?skeb\.jp/search\?q=([^&#]+)"
+ test = ("https://skeb.jp/search?q=bunny%20tree&t=works", {
+ "count": ">= 18",
+ "keyword": {"search_tags": "bunny tree"},
+ })
+
+ def metadata(self):
+ return {"search_tags": text.unquote(self.user_name)}
+
+ def posts(self):
+ url = "https://hb1jt3kre9-2.algolianet.com/1/indexes/*/queries"
+ params = {
+ "x-algolia-agent": "Algolia for JavaScript (4.13.1); Browser",
+ }
+ headers = {
+ "Origin": self.root,
+ "Referer": self.root + "/",
+ "x-algolia-api-key": "9a4ce7d609e71bf29e977925e4c6740c",
+ "x-algolia-application-id": "HB1JT3KRE9",
+ }
+
+ filters = self.config("filters")
+ if filters is None:
+ filters = ("genre:art OR genre:voice OR genre:novel OR "
+ "genre:video OR genre:music OR genre:correction")
+ elif not isinstance(filters, str):
+ filters = " OR ".join(filters)
+
+ page = 0
+ pams = "hitsPerPage=40&filters=" + text.quote(filters) + "&page="
+
+ request = {
+ "indexName": "Request",
+ "query": text.unquote(self.user_name),
+ "params": pams + str(page),
+ }
+ data = {"requests": (request,)}
+
+ while True:
+ result = self.request(
+ url, method="POST", params=params, headers=headers, json=data,
+ ).json()["results"][0]
+
+ for post in result["hits"]:
+ parts = post["path"].split("/")
+ yield parts[1][1:], parts[3]
+
+ if page >= result["nbPages"]:
+ return
+ page += 1
+ request["params"] = pams + str(page)
+
+
class SkebFollowingExtractor(SkebExtractor):
"""Extractor for all creators followed by a skeb user"""
subcategory = "following"
@@ -238,8 +300,8 @@ class SkebFollowingExtractor(SkebExtractor):
def users(self):
url = "{}/api/users/{}/following_creators".format(
self.root, self.user_name)
- headers = {"Referer": self.root, "Authorization": "Bearer null"}
params = {"sort": "date", "offset": 0, "limit": 90}
+ headers = {"Referer": self.root, "Authorization": "Bearer null"}
while True:
data = self.request(url, params=params, headers=headers).json()
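
Skeb search goes through Algolia's multi-query endpoint using the
site's public application ID and search-only API key. A trimmed
sketch of one such request (credentials and index name as they appear
in the diff; pagination and error handling omitted):

    import requests

    URL = "https://hb1jt3kre9-2.algolianet.com/1/indexes/*/queries"
    HEADERS = {
        "Origin" : "https://skeb.jp",
        "Referer": "https://skeb.jp/",
        "x-algolia-api-key": "9a4ce7d609e71bf29e977925e4c6740c",
        "x-algolia-application-id": "HB1JT3KRE9",
    }

    def search(query, page=0):
        data = {"requests": [{
            "indexName": "Request",
            "query"    : query,
            "params"   : "hitsPerPage=40&page=" + str(page),
        }]}
        response = requests.post(URL, headers=HEADERS, json=data)
        return response.json()["results"][0]

    for hit in search("bunny tree")["hits"]:
        print(hit["path"])                # e.g. /@username/works/123
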
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index 6f53881..447ce00 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -116,13 +116,17 @@ class TumblrExtractor(Extractor):
if self.original and "/s2048x3072/" in photo["url"] and (
photo["width"] == 2048 or photo["height"] == 3072):
- photo["url"] = self._original_photo(photo["url"])
+ photo["url"], fb = self._original_photo(photo["url"])
+ if fb:
+ post["_fallback"] = self._original_image_fallback(
+ photo["url"], post["id"])
del photo["original_size"]
del photo["alt_sizes"]
posts.append(
self._prepare_image(photo["url"], post.copy()))
del post["photo"]
+ post.pop("_fallback", None)
url = post.get("audio_url") # type "audio"
if url and url.startswith("https://a.tumblr.com/"):
@@ -138,8 +142,12 @@ class TumblrExtractor(Extractor):
# API response, but they can't contain images/videos anyway
body = post["reblog"]["comment"] + post["reblog"]["tree_html"]
for url in _findall_image(body):
- url = self._original_inline_image(url)
+ url, fb = self._original_inline_image(url)
+ if fb:
+ post["_fallback"] = self._original_image_fallback(
+ url, post["id"])
posts.append(self._prepare_image(url, post.copy()))
+ post.pop("_fallback", None)
for url in _findall_video(body):
url = self._original_video(url)
posts.append(self._prepare(url, post.copy()))
@@ -218,23 +226,35 @@ class TumblrExtractor(Extractor):
return self.blog != post.get("reblogged_root_uuid")
def _original_photo(self, url):
- return self._update_image_token(
- url.replace("/s2048x3072/", "/s99999x99999/", 1))
+ resized = url.replace("/s2048x3072/", "/s99999x99999/", 1)
+ return self._update_image_token(resized)
def _original_inline_image(self, url):
if self.original:
- url, n = self._subn_orig_image("/s99999x99999/", url, 1)
+ resized, n = self._subn_orig_image("/s99999x99999/", url, 1)
if n:
- return self._update_image_token(url)
- return self._sub_image(r"https://\1_1280.\2", url)
+ return self._update_image_token(resized)
+ return self._sub_image(r"https://\1_1280.\2", url), False
def _original_video(self, url):
return self._sub_video(r"https://\1.\2", url)
- def _update_image_token(self, url):
+ def _update_image_token(self, resized):
headers = {"Accept": "text/html,*/*;q=0.8"}
- response = self.request(url, headers=headers)
- return text.extract(response.text, '" src="', '"')[0]
+ try:
+ response = self.request(resized, headers=headers)
+ except Exception:
+ return resized, True
+ else:
+ updated = text.extract(response.text, '" src="', '"')[0]
+ return updated, (resized == updated)
+
+ def _original_image_fallback(self, url, post_id):
+ yield self._update_image_token(url)[0]
+ yield self._update_image_token(url)[0]
+ yield self._update_image_token(url)[0]
+ self.log.warning("Unable to fetch higher-resolution "
+ "version of %s (%s)", url, post_id)
class TumblrUserExtractor(TumblrExtractor):
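
The tumblr changes thread a _fallback generator through each photo
post: on a failed download the downloader pulls replacement URLs from
it, and the warning fires only once every retry has been consumed. A
sketch of that generator pattern (refresh() is a stub standing in for
re-running _update_image_token()):

    def original_image_fallback(refresh, url, post_id, retries=3):
        """Yield up to `retries` freshly tokenized URLs for a failed file."""
        for _ in range(retries):
            yield refresh(url)
        # reached only if the downloader exhausted every retry
        print("Unable to fetch higher-resolution version of %s (%s)"
              % (url, post_id))

    fallback = original_image_fallback(
        lambda u: u + "#fresh-token", "https://example.org/img", 12345)
    print(next(fallback))
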