From b2d8a54ecf4157570d00a8b974a779766822bf4b Mon Sep 17 00:00:00 2001 From: Unit 193 Date: Mon, 3 Oct 2022 04:08:41 -0400 Subject: New upstream version 1.23.2 --- CHANGELOG.md | 31 +- PKG-INFO | 6 +- README.rst | 4 +- data/man/gallery-dl.1 | 2 +- data/man/gallery-dl.conf.5 | 47 ++- docs/gallery-dl-example.conf | 17 + docs/gallery-dl.conf | 15 +- gallery_dl.egg-info/PKG-INFO | 6 +- gallery_dl/extractor/artstation.py | 58 ++- gallery_dl/extractor/blogger.py | 59 ++- gallery_dl/extractor/deviantart.py | 28 +- gallery_dl/extractor/exhentai.py | 2 +- gallery_dl/extractor/imagehosts.py | 4 +- gallery_dl/extractor/instagram.py | 720 +++++++++++++++++++++--------------- gallery_dl/extractor/kemonoparty.py | 30 +- gallery_dl/extractor/mastodon.py | 9 + gallery_dl/extractor/myportfolio.py | 7 +- gallery_dl/extractor/newgrounds.py | 63 ++++ gallery_dl/extractor/pixiv.py | 60 +++ gallery_dl/extractor/plurk.py | 4 +- gallery_dl/extractor/sankaku.py | 56 ++- gallery_dl/extractor/skeb.py | 64 +++- gallery_dl/extractor/tumblr.py | 40 +- gallery_dl/version.py | 2 +- 24 files changed, 934 insertions(+), 400 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4f4fdf9..c83ab91 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,15 +1,42 @@ # Changelog +## 1.23.2 - 2022-10-01 +### Additions +- [artstation] support search filters ([#2970](https://github.com/mikf/gallery-dl/issues/2970)) +- [blogger] add `label` and `query` metadata fields ([#2930](https://github.com/mikf/gallery-dl/issues/2930)) +- [exhentai] add a slash to the end of gallery URLs ([#2947](https://github.com/mikf/gallery-dl/issues/2947)) +- [instagram] add `count` metadata field ([#2979](https://github.com/mikf/gallery-dl/issues/2979)) +- [instagram] add `api` option +- [kemonoparty] add `count` metadata field ([#2952](https://github.com/mikf/gallery-dl/issues/2952)) +- [mastodon] warn about moved accounts ([#2939](https://github.com/mikf/gallery-dl/issues/2939)) +- [newgrounds] add `games` extractor ([#2955](https://github.com/mikf/gallery-dl/issues/2955)) +- [newgrounds] extract `type` metadata +- [pixiv] add `series` extractor ([#2964](https://github.com/mikf/gallery-dl/issues/2964)) +- [sankaku] implement `refresh` option ([#2958](https://github.com/mikf/gallery-dl/issues/2958)) +- [skeb] add `search` extractor and `filters` option ([#2945](https://github.com/mikf/gallery-dl/issues/2945)) +### Fixes +- [deviantart] fix extraction ([#2981](https://github.com/mikf/gallery-dl/issues/2981), [#2983](https://github.com/mikf/gallery-dl/issues/2983)) +- [fappic] fix extraction +- [instagram] extract higher-resolution photos ([#2666](https://github.com/mikf/gallery-dl/issues/2666)) +- [instagram] fix `username` and `fullname` metadata for saved posts ([#2911](https://github.com/mikf/gallery-dl/issues/2911)) +- [instagram] update API headers +- [kemonoparty] send `Referer` headers ([#2989](https://github.com/mikf/gallery-dl/issues/2989), [#2990](https://github.com/mikf/gallery-dl/issues/2990)) +- [kemonoparty] restore `favorites` API endpoints ([#2994](https://github.com/mikf/gallery-dl/issues/2994)) +- [myportfolio] use fallback when no images are found ([#2959](https://github.com/mikf/gallery-dl/issues/2959)) +- [plurk] fix extraction ([#2977](https://github.com/mikf/gallery-dl/issues/2977)) +- [sankaku] detect expired links ([#2958](https://github.com/mikf/gallery-dl/issues/2958)) +- [tumblr] retry extraction of failed higher-resolution images ([#2957](https://github.com/mikf/gallery-dl/issues/2957)) + ## 1.23.1 - 2022-09-18 ### Additions 
- [flickr] add support for `secure.flickr.com` URLs ([#2910](https://github.com/mikf/gallery-dl/issues/2910)) - [hotleak] add hotleak extractors ([#2890](https://github.com/mikf/gallery-dl/issues/2890), [#2909](https://github.com/mikf/gallery-dl/issues/2909)) - [instagram] add `highlight_title` and `date` metadata for highlight downloads ([#2879](https://github.com/mikf/gallery-dl/issues/2879)) - [paheal] add support for videos ([#2892](https://github.com/mikf/gallery-dl/issues/2892)) -- [twitter] add general support for unified cards ([#2875](https://github.com/mikf/gallery-dl/issues/2875)) -- [twitter] implement `cards-blacklist` option ([#2875](https://github.com/mikf/gallery-dl/issues/2875)) - [tumblr] fetch high-quality inline images ([#2877](https://github.com/mikf/gallery-dl/issues/2877)) - [tumblr] implement `ratelimit` option ([#2919](https://github.com/mikf/gallery-dl/issues/2919)) +- [twitter] add general support for unified cards ([#2875](https://github.com/mikf/gallery-dl/issues/2875)) +- [twitter] implement `cards-blacklist` option ([#2875](https://github.com/mikf/gallery-dl/issues/2875)) - [zerochan] add `metadata` option ([#2861](https://github.com/mikf/gallery-dl/issues/2861)) - [postprocessor:zip] implement `files` option ([#2872](https://github.com/mikf/gallery-dl/issues/2872)) ### Fixes diff --git a/PKG-INFO b/PKG-INFO index b15426c..aea8d49 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery_dl -Version: 1.23.1 +Version: 1.23.2 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Download-URL: https://github.com/mikf/gallery-dl/releases/latest @@ -99,8 +99,8 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.1/gallery-dl.exe>`__ -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.1/gallery-dl.bin>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.2/gallery-dl.exe>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.2/gallery-dl.bin>`__ | Executables build from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml diff --git a/README.rst b/README.rst index 813d6d8..5676a0e 100644 --- a/README.rst +++ b/README.rst @@ -66,8 +66,8 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.1/gallery-dl.exe>`__ -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.1/gallery-dl.bin>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.2/gallery-dl.exe>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.2/gallery-dl.bin>`__ | Executables build from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1 index e76a380..c7051c2 100644 --- a/data/man/gallery-dl.1 +++ b/data/man/gallery-dl.1 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL" "1" "2022-09-18" "1.23.1" "gallery-dl Manual" +.TH "GALLERY-DL" "1" "2022-10-01" "1.23.2" "gallery-dl Manual" .\" disable hyphenation .nh diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5 index f465d84..14db723 100644 --- a/data/man/gallery-dl.conf.5 +++ b/data/man/gallery-dl.conf.5 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL.CONF" "5" "2022-09-18" "1.23.1" "gallery-dl Manual" +.TH "GALLERY-DL.CONF" "5" "2022-10-01" "1.23.2" "gallery-dl Manual" .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) @@ -1689,6 +1689,24 @@ Value of the \f[I]orderby\f[] parameter for submission searches. for details) +.SS extractor.instagram.api +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]"auto"\f[] + +.IP "Description:" 4 +Selects which API endpoints to use.
+ +.br +* \f[I]"rest"\f[]: REST API - higher-resolution media, only usable when logged in +.br +* \f[I]"graphql"\f[]: GraphQL API - lower-resolution media, partially accessible when not logged in +.br +* \f[I]"auto"\f[]: Use REST API when logged in, GraphQL API otherwise + + .SS extractor.instagram.include .IP "Type:" 6 \f[I]string\f[] or \f[I]list\f[] of \f[I]strings\f[] @@ -1999,7 +2017,7 @@ A (comma-separated) list of subcategories to include when processing a user profile. Possible values are -\f[I]"art"\f[], \f[I]"audio"\f[], \f[I]"movies"\f[]. +\f[I]"art"\f[], \f[I]"audio"\f[], \f[I]"games"\f[], \f[I]"movies"\f[]. You can use \f[I]"all"\f[] instead of listing all values separately. @@ -2456,6 +2474,17 @@ If the format is given as \f[I]string\f[], it will be extended with restrict it to only one possible format. +.SS extractor.sankaku.refresh +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Refresh download URLs before they expire. + + .SS extractor.sankakucomplex.embeds .IP "Type:" 6 \f[I]bool\f[] @@ -2511,6 +2540,20 @@ Download sent requests. Download thumbnails. +.SS extractor.skeb.search.filters +.IP "Type:" 6 +\f[I]list\f[] or \f[I]string\f[] + +.IP "Default:" 9 +\f[I]["genre:art", "genre:voice", "genre:novel", "genre:video", "genre:music", "genre:correction"]\f[] + +.IP "Example:" 4 "genre:music OR genre:voice" + +.IP "Description:" 4 +Filters used during searches. + + .SS extractor.smugmug.videos .IP "Type:" 6 \f[I]bool\f[] diff --git a/docs/gallery-dl-example.conf b/docs/gallery-dl-example.conf index 89bfd0c..279aeef 100644 --- a/docs/gallery-dl-example.conf +++ b/docs/gallery-dl-example.conf @@ -127,6 +127,23 @@ ] }, + "kemonoparty": { + "postprocessors": [ + { + "name": "metadata", + "event": "post", + "filename": "{id} {title}.txt", + + "#": "write text content and external URLs", + "mode": "custom", + "format": "{content}\n{embed[url]:?/\n/}", + + "#": "only write file if there is an external link present", + "filter": "embed.get('url') or re.search(r'(?i)(gigafile|xgf|1drv|mediafire|mega|google|drive)', content)" + } + ] + }, + "flickr": { "access-token": "1234567890-abcdef", diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 6ba50f2..1c565ec 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -156,8 +156,9 @@ { "username": null, "password": null, + "api": "auto", "include": "posts", - "sleep-request": 8.0, + "sleep-request": [6.0, 12.0], "videos": true }, "khinsider": @@ -244,19 +245,21 @@ { "format": ["hd", "sd", "gif"] }, + "sankaku": + { + "username": null, + "password": null, + "refresh": false + }, "sankakucomplex": { "embeds": false, "videos": true }, - "sankaku": - { - "username": null, - "password": null - }, "skeb": { "article": false, + "filters": null, "sent-requests": false, "thumbnails": false }, diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO index ea2164a..016840e 100644 --- a/gallery_dl.egg-info/PKG-INFO +++ b/gallery_dl.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery-dl -Version: 1.23.1 +Version: 1.23.2 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Download-URL: https://github.com/mikf/gallery-dl/releases/latest @@ -99,8 +99,8 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.1/gallery-dl.exe>`__ -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.1/gallery-dl.bin>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.2/gallery-dl.exe>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.2/gallery-dl.bin>`__ | Executables build
from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py index c0e8e67..62626a1 100644 --- a/gallery_dl/extractor/artstation.py +++ b/gallery_dl/extractor/artstation.py @@ -103,16 +103,23 @@ class ArtstationExtractor(Extractor): return response.json() def _pagination(self, url, params=None, json=None): + headers = { + "Accept" : "application/json, text/plain, */*", + "Origin" : self.root, + "Referer": self.root + "/", + } + if json: params = json - kwargs = {"json": json} + headers["PUBLIC-CSRF-TOKEN"] = self._init_csrf_token() + kwargs = {"method": "POST", "headers": headers, "json": json} else: if not params: params = {} - kwargs = {"params": params} + kwargs = {"params": params, "headers": headers} - params["page"] = 1 total = 0 + params["page"] = 1 while True: data = self.request(url, **kwargs).json() @@ -124,6 +131,17 @@ class ArtstationExtractor(Extractor): params["page"] += 1 + def _init_csrf_token(self): + url = self.root + "/api/v2/csrf_protection/token.json" + headers = { + "Accept" : "*/*", + "Origin" : self.root, + "Referer": self.root + "/", + } + return self.request( + url, method="POST", headers=headers, json={}, + ).json()["public_csrf_token"] + @staticmethod def _no_cache(url, alphabet=(string.digits + string.ascii_letters)): """Cause a cache miss to prevent Cloudflare 'optimizations' @@ -298,34 +316,46 @@ class ArtstationSearchExtractor(ArtstationExtractor): archive_fmt = "s_{search[query]}_{asset[id]}" pattern = (r"(?:https?://)?(?:\w+\.)?artstation\.com" r"/search/?\?([^#]+)") - test = ("https://www.artstation.com/search?q=ancient&sort_by=rank", { + test = ("https://www.artstation.com/search?query=ancient&sort_by=rank", { "range": "1-20", "count": 20, }) def __init__(self, match): ArtstationExtractor.__init__(self, match) - query = text.parse_query(match.group(1)) - self.query = query.get("q", "") - self.sorting = query.get("sort_by", "rank").lower() + self.params = query = text.parse_query(match.group(1)) + self.query = text.unquote(query.get("query") or query.get("q", "")) + self.sorting = query.get("sort_by", "relevance").lower() + self.tags = query.get("tags", "").split(",") def metadata(self): return {"search": { "query" : self.query, "sorting": self.sorting, + "tags" : self.tags, }} def projects(self): + filters = [] + for key, value in self.params.items(): + if key.endswith("_ids") or key == "tags": + filters.append({ + "field" : key, + "method": "include", + "value" : value.split(","), + }) + url = "{}/api/v2/search/projects.json".format(self.root) - return self._pagination(url, json={ - "additional_fields": "[]", - "filters" : "[]", - "page" : None, - "per_page" : "50", - "pro_first" : "1", + data = { "query" : self.query, + "page" : None, + "per_page" : 50, "sorting" : self.sorting, - }) + "pro_first" : "1", + "filters" : filters, + "additional_fields": (), + } + return self._pagination(url, json=data) class ArtstationArtworkExtractor(ArtstationExtractor): diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py index e0885d2..232f3ea 100644 --- a/gallery_dl/extractor/blogger.py +++ b/gallery_dl/extractor/blogger.py @@ -48,6 +48,7 @@ class BloggerExtractor(Extractor): r'\d+\.bp\.blogspot\.com)/[^"]+)').findall findall_video = re.compile( r'src="(https?://www\.blogger\.com/video\.g\?token=[^"]+)').findall + metadata = self.metadata() for post in self.posts(blog): content = post["content"] @@ -74,18 
+75,21 @@ class BloggerExtractor(Extractor): del post["selfLink"] del post["blog"] - yield Message.Directory, {"blog": blog, "post": post} - for num, url in enumerate(files, 1): - yield Message.Url, url, text.nameext_from_url(url, { - "blog": blog, - "post": post, - "url" : url, - "num" : num, - }) + data = {"blog": blog, "post": post} + if metadata: + data.update(metadata) + yield Message.Directory, data + + for data["num"], url in enumerate(files, 1): + data["url"] = url + yield Message.Url, url, text.nameext_from_url(url, data) def posts(self, blog): """Return an iterable with all relevant post objects""" + def metadata(self): + """Return additional metadata""" + class BloggerPostExtractor(BloggerExtractor): """Extractor for a single blog post""" @@ -173,31 +177,48 @@ class BloggerBlogExtractor(BloggerExtractor): class BloggerSearchExtractor(BloggerExtractor): - """Extractor for search resuls and labels""" subcategory = "search" - pattern = BASE_PATTERN + r"/search(?:/?\?q=([^/?#]+)|/label/([^/?#]+))" + """Extractor for Blogger search results""" + pattern = BASE_PATTERN + r"/search/?\?q=([^&#]+)" test = ( ("https://julianbphotography.blogspot.com/search?q=400mm", { - "count": "< 10" + "count": "< 10", + "keyword": {"query": "400mm"}, }), + ) + + def __init__(self, match): + BloggerExtractor.__init__(self, match) + self.query = text.unquote(match.group(3)) + + def posts(self, blog): + return self.api.blog_search(blog["id"], self.query) + + def metadata(self): + return {"query": self.query} + + +class BloggerLabelExtractor(BloggerExtractor): + """Extractor for Blogger posts by label""" + subcategory = "label" + pattern = BASE_PATTERN + r"/search/label/([^/?#]+)" + test = ( ("https://dmmagazine.blogspot.com/search/label/D%26D", { "range": "1-25", "count": 25, + "keyword": {"label": "D&D"}, }), ) def __init__(self, match): BloggerExtractor.__init__(self, match) - query = match.group(3) - if query: - self.query, self.label = query, None - else: - self.query, self.label = None, match.group(4) + self.label = text.unquote(match.group(3)) def posts(self, blog): - if self.query: - return self.api.blog_search(blog["id"], text.unquote(self.query)) - return self.api.blog_posts(blog["id"], text.unquote(self.label)) + return self.api.blog_posts(blog["id"], self.label) + + def metadata(self): + return {"label": self.label} class BloggerAPI(): diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 60f644d..6897476 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -936,12 +936,13 @@ class DeviantartDeviationExtractor(DeviantartExtractor): self.deviation_id = match.group(4) def deviations(self): - deviation = DeviantartEclipseAPI(self).deviation_extended_fetch( - self.deviation_id, self.user, self.type) - if "error" in deviation: + url = "{}/{}/{}/{}".format( + self.root, self.user, self.type, self.deviation_id) + appurl = text.extract(self._limited_request(url).text, + 'property="da:appurl" content="', '"')[0] + if not appurl: raise exception.NotFoundError("deviation") - return (self.api.deviation( - deviation["deviation"]["extended"]["deviationUuid"]),) + return (self.api.deviation(appurl.rpartition("/")[2]),) class DeviantartScrapsExtractor(DeviantartExtractor): @@ -1398,6 +1399,8 @@ class DeviantartEclipseAPI(): def __init__(self, extractor): self.extractor = extractor self.log = extractor.log + self.request = self.extractor._limited_request + self.csrf_token = None def deviation_extended_fetch(self, deviation_id, user=None,
kind=None): endpoint = "/da-browse/shared_api/deviation/extended_fetch" @@ -1429,11 +1432,12 @@ class DeviantartEclipseAPI(): } return self._pagination(endpoint, params) - def _call(self, endpoint, params=None): + def _call(self, endpoint, params): url = "https://www.deviantart.com/_napi" + endpoint headers = {"Referer": "https://www.deviantart.com/"} + params["csrf_token"] = self.csrf_token or self._fetch_csrf_token() - response = self.extractor._limited_request( + response = self.request( url, params=params, headers=headers, fatal=None) if response.status_code == 404: @@ -1464,12 +1468,20 @@ class DeviantartEclipseAPI(): def _module_id_watching(self, user): url = "{}/{}/about".format(self.extractor.root, user) - page = self.extractor._limited_request(url).text + page = self.request(url).text pos = page.find('\\"type\\":\\"watching\\"') if pos < 0: raise exception.NotFoundError("module") + self._fetch_csrf_token(page) return text.rextract(page, '\\"id\\":', ',', pos)[0].strip('" ') + def _fetch_csrf_token(self, page=None): + if page is None: + page = self.request(self.extractor.root + "/").text + self.csrf_token = token = text.extract( + page, "window.__CSRF_TOKEN__ = '", "'")[0] + return token + @cache(maxage=100*365*24*3600, keyarg=0) def _refresh_token_cache(token): diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 01ba03a..e37e81b 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -505,7 +505,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor): if url == last: continue last = url - yield Message.Queue, url, data + yield Message.Queue, url + "/", data if 'class="ptdd">><' in page or ">No hits found
</p>
" in page: return diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py index d699f07..69455a8 100644 --- a/gallery_dl/extractor/imagehosts.py +++ b/gallery_dl/extractor/imagehosts.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2021 Mike Fährmann +# Copyright 2016-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -292,7 +292,7 @@ class FappicImageExtractor(ImagehostImageExtractor): }) def get_info(self, page): - url , pos = text.extract(page, '= 20", + }) + + def posts(self): + uid = self.api.user_id(self.item) + return self.api.user_clips(uid) class InstagramTaggedExtractor(InstagramExtractor): - """Extractor for ProfilePage tagged posts""" + """Extractor for an Instagram user's tagged posts""" subcategory = "tagged" pattern = USER_PATTERN + r"/tagged" test = ("https://www.instagram.com/instagram/tagged/", { @@ -485,7 +409,7 @@ class InstagramTaggedExtractor(InstagramExtractor): self.user_id = self.item[3:] return {"tagged_owner_id": self.user_id} - user = self._user_by_screen_name(self.item) + user = self.api.user(self.item) self.user_id = user["id"] return { @@ -495,13 +419,11 @@ class InstagramTaggedExtractor(InstagramExtractor): } def posts(self): - endpoint = "/v1/usertags/{}/feed/".format(self.user_id) - params = {"count": 50} - return self._pagination_api(endpoint, params) + return self.api.user_tagged(self.user_id) class InstagramChannelExtractor(InstagramExtractor): - """Extractor for ProfilePage channel""" + """Extractor for an Instagram user's channel posts""" subcategory = "channel" pattern = USER_PATTERN + r"/channel" test = ("https://www.instagram.com/instagram/channel/", { @@ -510,25 +432,25 @@ class InstagramChannelExtractor(InstagramExtractor): }) def posts(self): - query_hash = "bc78b344a68ed16dd5d7f264681c4c76" - variables = {"id": self._uid_by_screen_name(self.item), "first": 50} - return self._pagination_graphql(query_hash, variables) + uid = self.api.user_id(self.item) + return self.api.user_clips(uid) class InstagramSavedExtractor(InstagramExtractor): - """Extractor for ProfilePage saved media""" + """Extractor for an Instagram user's saved media""" subcategory = "saved" - pattern = USER_PATTERN + r"/saved/?$" - test = ("https://www.instagram.com/instagram/saved/",) + pattern = USER_PATTERN + r"/saved(?:/all-posts)?/?$" + test = ( + ("https://www.instagram.com/instagram/saved/"), + ("https://www.instagram.com/instagram/saved/all-posts/"), + ) def posts(self): - query_hash = "2ce1d673055b99250e93b6f88f878fde" - variables = {"id": self._uid_by_screen_name(self.item), "first": 50} - return self._pagination_graphql(query_hash, variables) + return self.api.user_saved() class InstagramCollectionExtractor(InstagramExtractor): - """Extractor for ProfilePage saved collection media""" + """Extractor for Instagram collection""" subcategory = "collection" pattern = USER_PATTERN + r"/saved/([^/?#]+)/([^/?#]+)" test = ( @@ -546,13 +468,59 @@ class InstagramCollectionExtractor(InstagramExtractor): } def posts(self): - endpoint = "/v1/feed/collection/{}/posts/".format(self.collection_id) - for item in self._pagination_api(endpoint): - yield item["media"] + return self.api.user_collection(self.collection_id) + + +class InstagramStoriesExtractor(InstagramExtractor): + """Extractor for Instagram stories""" + subcategory = "stories" + pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" + 
r"/stories/(?:highlights/(\d+)|([^/?#]+)(?:/(\d+))?)") + test = ( + ("https://www.instagram.com/stories/instagram/"), + ("https://www.instagram.com/stories/highlights/18042509488170095/"), + ("https://instagram.com/stories/geekmig/2724343156064789461"), + ) + + def __init__(self, match): + self.highlight_id, self.user, self.media_id = match.groups() + if self.highlight_id: + self.subcategory = InstagramHighlightsExtractor.subcategory + InstagramExtractor.__init__(self, match) + + def posts(self): + if self.highlight_id: + reel_id = "highlight:" + self.highlight_id + else: + reel_id = self.api.user_id(self.user) + + reels = self.api.reels_media(reel_id) + + if self.media_id and reels: + reel = reels[0] + for item in reel["items"]: + if item["pk"] == self.media_id: + reel["items"] = (item,) + break + else: + raise exception.NotFoundError("story") + + return reels + + +class InstagramHighlightsExtractor(InstagramExtractor): + """Extractor for an Instagram user's story highlights""" + subcategory = "highlights" + pattern = USER_PATTERN + r"/highlights" + test = ("https://www.instagram.com/instagram/highlights",) + + def posts(self): + uid = self.api.user_id(self.item) + return self.api.highlights_media(uid) class InstagramTagExtractor(InstagramExtractor): - """Extractor for TagPage""" + """Extractor for Instagram tags""" subcategory = "tag" directory_fmt = ("{category}", "{subcategory}", "{tag}") pattern = BASE_PATTERN + r"/explore/tags/([^/?#]+)" @@ -565,27 +533,7 @@ class InstagramTagExtractor(InstagramExtractor): return {"tag": text.unquote(self.item)} def posts(self): - endpoint = "/v1/tags/{}/sections/".format(self.item) - data = { - "include_persistent": "0", - "max_id" : None, - "page" : None, - "surface": "grid", - "tab" : "recent", - } - - while True: - info = self._request_api(endpoint, method="POST", data=data) - - for section in info["sections"]: - for media in section["layout_content"]["medias"]: - yield media["media"] - - if not info.get("more_available"): - return - - data["max_id"] = info["next_max_id"] - data["page"] = info["next_page"] + return self.api.tags_media(self.item) class InstagramPostExtractor(InstagramExtractor): @@ -618,7 +566,6 @@ class InstagramPostExtractor(InstagramExtractor): "width": int, } }), - # GraphSidecar ("https://www.instagram.com/p/BoHk1haB5tM/", { "count": 5, @@ -633,7 +580,6 @@ class InstagramPostExtractor(InstagramExtractor): "username": "instagram", } }), - # GraphVideo ("https://www.instagram.com/p/Bqxp0VSBgJg/", { "pattern": r"/46840863_726311431074534_7805566102611403091_n\.mp4", @@ -651,7 +597,6 @@ class InstagramPostExtractor(InstagramExtractor): "width": int, } }), - # GraphVideo (IGTV) ("https://www.instagram.com/tv/BkQjCfsBIzi/", { "pattern": r"/10000000_597132547321814_702169244961988209_n\.mp4", @@ -668,7 +613,6 @@ class InstagramPostExtractor(InstagramExtractor): "width": int, } }), - # GraphSidecar with 2 embedded GraphVideo objects ("https://www.instagram.com/p/BtOvDOfhvRr/", { "count": 2, @@ -679,7 +623,6 @@ class InstagramPostExtractor(InstagramExtractor): "video_url": str, } }), - # GraphImage with tagged user ("https://www.instagram.com/p/B_2lf3qAd3y/", { "keyword": { @@ -690,98 +633,265 @@ class InstagramPostExtractor(InstagramExtractor): }] } }), - # URL with username (#2085) ("https://www.instagram.com/dm/p/CW042g7B9CY/"), - ("https://www.instagram.com/reel/CDg_6Y1pxWu/"), ) def posts(self): - return self._media_by_id(id_from_shortcode(self.item)) + return self.api.media(id_from_shortcode(self.item)) -class 
InstagramStoriesExtractor(InstagramExtractor): - """Extractor for Instagram stories""" - subcategory = "stories" - pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" - r"/stories/(?:highlights/(\d+)|([^/?#]+)(?:/(\d+))?)") - test = ( - ("https://www.instagram.com/stories/instagram/"), - ("https://www.instagram.com/stories/highlights/18042509488170095/"), - ("https://instagram.com/stories/geekmig/2724343156064789461"), - ) +class InstagramRestAPI(): - def __init__(self, match): - self.highlight_id, self.user, self.media_id = match.groups() - if self.highlight_id: - self.subcategory = InstagramHighlightsExtractor.subcategory - InstagramExtractor.__init__(self, match) + def __init__(self, extractor): + self.extractor = extractor - def posts(self): - if self.highlight_id: - reel_id = "highlight:" + self.highlight_id - else: - reel_id = self._uid_by_screen_name(self.user) + def highlights_media(self, user_id): + chunk_size = 5 + reel_ids = [hl["id"] for hl in self.highlights_tray(user_id)] + + for offset in range(0, len(reel_ids), chunk_size): + yield from self.reels_media( + reel_ids[offset : offset+chunk_size]) + + def highlights_tray(self, user_id): + endpoint = "/v1/highlights/{}/highlights_tray/".format(user_id) + return self._call(endpoint)["tray"] + + def media(self, post_id): + endpoint = "/v1/media/{}/info/".format(post_id) + return self._pagination(endpoint) + def reels_media(self, reel_ids): endpoint = "/v1/feed/reels_media/" - params = {"reel_ids": reel_id} - reels = self._request_api(endpoint, params=params)["reels"] + params = {"reel_ids": reel_ids} + return self._call(endpoint, params=params)["reels_media"] - if self.media_id: - reel = reels[reel_id] - for item in reel["items"]: - if item["pk"] == self.media_id: - reel["items"] = (item,) - break + def tags_media(self, tag): + for section in self.tags_sections(tag): + for media in section["layout_content"]["medias"]: + yield media["media"] + + def tags_sections(self, tag): + endpoint = "/v1/tags/{}/sections/".format(tag) + data = { + "include_persistent": "0", + "max_id" : None, + "page" : None, + "surface": "grid", + "tab" : "recent", + } + return self._pagination_sections(endpoint, data) + + @memcache(keyarg=1) + def user(self, screen_name): + endpoint = "/v1/users/web_profile_info/" + params = {"username": screen_name} + return self._call(endpoint, params=params)["data"]["user"] + + def user_id(self, screen_name): + if screen_name.startswith("id:"): + return screen_name[3:] + return self.user(screen_name)["id"] + + def user_clips(self, user_id): + endpoint = "/v1/clips/user/" + data = {"target_user_id": user_id, "page_size": "50"} + return self._pagination_post(endpoint, data) + + def user_collection(self, collection_id): + endpoint = "/v1/feed/collection/{}/posts/".format(collection_id) + params = {"count": 50} + return self._pagination(endpoint, params, media=True) + + def user_feed(self, user_id): + endpoint = "/v1/feed/user/{}/".format(user_id) + params = {"count": 30} + return self._pagination(endpoint, params) + + def user_saved(self): + endpoint = "/v1/feed/saved/posts/" + params = {"count": 50} + return self._pagination(endpoint, params, media=True) + + def user_tagged(self, user_id): + endpoint = "/v1/usertags/{}/feed/".format(user_id) + params = {"count": 50} + return self._pagination(endpoint, params) + + def _call(self, endpoint, **kwargs): + extr = self.extractor + + url = "https://i.instagram.com/api" + endpoint + kwargs["headers"] = { + "X-CSRFToken" : extr.csrf_token, + "X-Instagram-AJAX": "1006242110", + 
"X-IG-App-ID" : "936619743392459", + "X-ASBD-ID" : "198387", + "X-IG-WWW-Claim" : extr.www_claim, + "Origin" : extr.root, + "Referer" : extr.root + "/", + } + kwargs["cookies"] = { + "csrftoken": extr.csrf_token, + } + return extr.request(url, **kwargs).json() + + def _pagination(self, endpoint, params=None, media=False): + if params is None: + params = {} + while True: + data = self._call(endpoint, params=params) + + if media: + for item in data["items"]: + yield item["media"] else: - raise exception.NotFoundError("story") + yield from data["items"] - return reels.values() + if not data.get("more_available"): + return + params["max_id"] = data["next_max_id"] + def _pagination_post(self, endpoint, params): + while True: + data = self._call(endpoint, method="POST", data=params) -class InstagramHighlightsExtractor(InstagramExtractor): - """Extractor for all Instagram story highlights of a user""" - subcategory = "highlights" - pattern = USER_PATTERN + r"/highlights" - test = ("https://www.instagram.com/instagram/highlights",) + for item in data["items"]: + yield item["media"] - def posts(self): - endpoint = "/v1/highlights/{}/highlights_tray/".format( - self._uid_by_screen_name(self.item)) - tray = self._request_api(endpoint)["tray"] - reel_ids = [highlight["id"] for highlight in tray] + info = data["paging_info"] + if not info.get("more_available"): + return + params["max_id"] = info["max_id"] - # Anything above 30 responds with statuscode 400. - # 30 can work, however, sometimes the API will respond with 560 or 500. - chunk_size = 5 - endpoint = "/v1/feed/reels_media/" + def _pagination_sections(self, endpoint, params): + while True: + info = self._call(endpoint, method="POST", data=params) - for offset in range(0, len(reel_ids), chunk_size): - chunk_ids = reel_ids[offset : offset+chunk_size] - params = {"reel_ids": chunk_ids} - reels = self._request_api(endpoint, params=params)["reels"] - for reel_id in chunk_ids: - yield reels[reel_id] + yield from info["sections"] + + if not info.get("more_available"): + return + params["max_id"] = info["next_max_id"] + params["page"] = info["next_page"] -class InstagramReelsExtractor(InstagramExtractor): - """Extractor for an Instagram user's reels""" - subcategory = "reels" - pattern = USER_PATTERN + r"/reels" - test = ("https://www.instagram.com/instagram/reels/", { - "range": "40-60", - "count": ">= 20", - }) +class InstagramGraphqlAPI(): - def posts(self): - endpoint = "/v1/clips/user/" - data = { - "target_user_id": self._uid_by_screen_name(self.item), - "page_size" : "50", + def __init__(self, extractor): + self.extractor = extractor + self.user = InstagramRestAPI(extractor).user + self.user_collection = self.user_saved = self.reels_media = \ + self.highlights_media = self._login_required + self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode + + @staticmethod + def _login_required(_=None): + raise exception.AuthorizationError("Login required") + + def highlights_tray(self, user_id): + query_hash = "d4d88dc1500312af6f937f7b804c68c3" + variables = { + "user_id": user_id, + "include_chaining": False, + "include_reel": False, + "include_suggested_users": False, + "include_logged_out_extras": True, + "include_highlight_reels": True, + "include_live_status": False, + } + edges = (self._call(query_hash, variables)["user"] + ["edge_highlight_reels"]["edges"]) + return [edge["node"] for edge in edges] + + def media(self, post_id): + query_hash = "9f8827793ef34641b2fb195d4d41151c" + variables = { + "shortcode": shortcode_from_id(post_id), + 
"child_comment_count": 3, + "fetch_comment_count": 40, + "parent_comment_count": 24, + "has_threaded_comments": True, + } + media = self._call(query_hash, variables).get("shortcode_media") + return (media,) if media else () + + def tags_media(self, tag): + query_hash = "9b498c08113f1e09617a1703c22b2f32" + variables = {"tag_name": text.unescape(tag), "first": 50} + return self._pagination(query_hash, variables, + "hashtag", "edge_hashtag_to_media") + + def user_id(self, screen_name): + if screen_name.startswith("id:"): + return screen_name[3:] + return self.user(screen_name)["id"] + + def user_clips(self, user_id): + query_hash = "bc78b344a68ed16dd5d7f264681c4c76" + variables = {"id": user_id, "first": 50} + return self._pagination(query_hash, variables) + + def user_feed(self, user_id): + query_hash = "69cba40317214236af40e7efa697781d" + variables = {"id": user_id, "first": 50} + return self._pagination(query_hash, variables) + + def user_tagged(self, user_id): + query_hash = "be13233562af2d229b008d2976b998b5" + variables = {"id": user_id, "first": 50} + return self._pagination(query_hash, variables) + + def _call(self, query_hash, variables): + extr = self.extractor + + url = "https://www.instagram.com/graphql/query/" + params = { + "query_hash": query_hash, + "variables" : self._json_dumps(variables), + } + headers = { + "Accept" : "*/*", + "X-CSRFToken" : extr.csrf_token, + "X-Instagram-AJAX": "1006267176", + "X-IG-App-ID" : "936619743392459", + "X-ASBD-ID" : "198387", + "X-IG-WWW-Claim" : extr.www_claim, + "X-Requested-With": "XMLHttpRequest", + "Referer" : extr.root + "/", + } + cookies = { + "csrftoken": extr.csrf_token, } + return extr.request( + url, params=params, headers=headers, cookies=cookies, + ).json()["data"] - return self._pagination_api_post(endpoint, data) + def _pagination(self, query_hash, variables, + key_data="user", key_edge=None): + cursor = self.extractor.config("cursor") + if cursor: + variables["after"] = cursor + + while True: + data = self._call(query_hash, variables)[key_data] + data = data[key_edge] if key_edge else next(iter(data.values())) + + for edge in data["edges"]: + yield edge["node"] + + info = data["page_info"] + if not info["has_next_page"]: + return + elif not data["edges"]: + s = "" if self.item.endswith("s") else "s" + raise exception.StopExtraction( + "%s'%s posts are private", self.item, s) + + variables["after"] = self._cursor = info["end_cursor"] + self.extractor.log.debug("Cursor: %s", self._cursor) @cache(maxage=360*24*3600, keyarg=1) diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 816b561..750b741 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -33,6 +33,7 @@ class KemonopartyExtractor(Extractor): self.cookiedomain = ".coomer.party" self.root = text.root_from_url(match.group(0)) Extractor.__init__(self, match) + self.session.headers["Referer"] = self.root + "/" def items(self): self._prepare_ddosguard_cookies() @@ -46,7 +47,7 @@ class KemonopartyExtractor(Extractor): comments = self.config("comments") username = dms = None - # prevent files to be sent with gzip compression + # prevent files from being sent with gzip compression headers = {"Accept-Encoding": "identity"} if self.config("metadata"): @@ -63,6 +64,9 @@ class KemonopartyExtractor(Extractor): for post in posts: + headers["Referer"] = "{}/{}/user/{}/post/{}".format( + self.root, post["service"], post["user"], post["id"]) + post["_http_headers"] = headers post["date"] = text.parse_datetime( 
post["published"] or post["added"], "%a, %d %b %Y %H:%M:%S %Z") @@ -74,27 +78,33 @@ class KemonopartyExtractor(Extractor): if dms is True: dms = self._extract_dms(post) post["dms"] = dms - yield Message.Directory, post + files = [] hashes = set() - post["num"] = 0 + for file in itertools.chain.from_iterable( g(post) for g in generators): url = file["path"] match = find_hash(url) if match: - post["hash"] = hash = match.group(1) + file["hash"] = hash = match.group(1) if hash in hashes and not duplicates: self.log.debug("Skipping %s (duplicate)", url) continue hashes.add(hash) else: - post["hash"] = "" + file["hash"] = "" + + files.append(file) + post["count"] = len(files) + yield Message.Directory, post + + for post["num"], file in enumerate(files, 1): + post["hash"] = file["hash"] post["type"] = file["type"] - post["num"] += 1 - post["_http_headers"] = headers + url = file["path"] text.nameext_from_url(file.get("name", url), post) if not post["extension"]: @@ -236,6 +246,7 @@ class KemonopartyPostExtractor(KemonopartyExtractor): "keyword": { "added": "Wed, 06 May 2020 20:28:02 GMT", "content": str, + "count": 1, "date": "dt:2019-08-11 02:09:04", "edited": None, "embed": dict, @@ -374,6 +385,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor): post["channel_name"] = self.channel_name post["date"] = text.parse_datetime( post["published"], "%a, %d %b %Y %H:%M:%S %Z") + post["count"] = len(files) yield Message.Directory, post for post["num"], file in enumerate(files, 1): @@ -466,7 +478,7 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor): if self.favorites == "artist": users = self.request( - self.root + "/api/v1/account/favorites?type=artist").json() + self.root + "/api/favorites?type=artist").json() for user in users: user["_extractor"] = KemonopartyUserExtractor url = "{}/{}/user/{}".format( @@ -475,7 +487,7 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor): elif self.favorites == "post": posts = self.request( - self.root + "/api/v1/account/favorites?type=post").json() + self.root + "/api/favorites?type=post").json() for post in posts: post["_extractor"] = KemonopartyPostExtractor url = "{}/{}/user/{}/post/{}".format( diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index 493a8ef..9ce5772 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -31,6 +31,8 @@ class MastodonExtractor(BaseExtractor): def items(self): for status in self.statuses(): + if self._check_move: + self._check_move(status["account"]) if not self.reblogs and status["reblog"]: self.log.debug("Skipping %s (reblog)", status["id"]) continue @@ -56,6 +58,12 @@ class MastodonExtractor(BaseExtractor): """Return an iterable containing all relevant Status objects""" return () + def _check_move(self, account): + self._check_move = None + if "moved" in account: + self.log.warning("Account '%s' moved to '%s'", + account["acct"], account["moved"]["acct"]) + INSTANCES = { "mastodon.social": { @@ -192,6 +200,7 @@ class MastodonAPI(): handle = "@{}@{}".format(username, self.extractor.instance) for account in self.account_search(handle, 1): if account["username"] == username: + self.extractor._check_move(account) return account["id"] raise exception.NotFoundError("account") diff --git a/gallery_dl/extractor/myportfolio.py b/gallery_dl/extractor/myportfolio.py index f06ab70..8254118 100644 --- a/gallery_dl/extractor/myportfolio.py +++ b/gallery_dl/extractor/myportfolio.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2021 Mike Fährmann 
+# Copyright 2018-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -104,4 +104,7 @@ class MyportfolioGalleryExtractor(Extractor): @staticmethod def images(page): """Extract and return a list of all image-urls""" - return list(text.extract_iter(page, 'js-lightbox" data-src="', '"')) + return ( + list(text.extract_iter(page, 'js-lightbox" data-src="', '"')) or + list(text.extract_iter(page, 'data-src="', '"')) + ) diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index d9ab336..2c2dcb9 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -155,6 +155,7 @@ class NewgroundsExtractor(Extractor): data = { "title" : text.unescape(extr('"og:title" content="', '"')), "description": text.unescape(extr(':description" content="', '"')), + "type" : extr('og:type" content="', '"'), "date" : text.parse_datetime(extr( 'itemprop="datePublished" content="', '"')), "rating" : extr('class="rated-', '"'), @@ -173,6 +174,7 @@ class NewgroundsExtractor(Extractor): return { "title" : text.unescape(extr('"og:title" content="', '"')), "description": text.unescape(extr(':description" content="', '"')), + "type" : extr('og:type" content="', '"'), "date" : text.parse_datetime(extr( 'itemprop="datePublished" content="', '"')), "url" : extr('{"url":"', '"').replace("\\/", "/"), @@ -184,6 +186,7 @@ class NewgroundsExtractor(Extractor): def _extract_media_data(self, extr, url): index = url.split("/")[5] title = extr('"og:title" content="', '"') + type = extr('og:type" content="', '"') descr = extr('"og:description" content="', '"') src = extr('{"url":"', '"') @@ -223,6 +226,7 @@ class NewgroundsExtractor(Extractor): "title" : text.unescape(title), "url" : src, "date" : date, + "type" : type, "description": text.unescape(descr or extr( 'itemprop="description" content="', '"')), "rating" : extr('class="rated-', '"'), @@ -305,6 +309,7 @@ class NewgroundsImageExtractor(NewgroundsExtractor): "score" : float, "tags" : ["ryu", "streetfighter"], "title" : "Ryu is Hawt", + "type" : "article", "user" : "tomfulp", "width" : 447, }, @@ -357,6 +362,7 @@ class NewgroundsMediaExtractor(NewgroundsExtractor): "score" : float, "tags" : ["alienhominid", "trailer"], "title" : "Alien Hominid Fan Trailer", + "type" : "movie", "user" : "kickinthehead", }, }), @@ -373,6 +379,7 @@ class NewgroundsMediaExtractor(NewgroundsExtractor): "score" : float, "tags" : ["fulp", "interview", "tom", "zj"], "title" : "ZJ Interviews Tom Fulp!", + "type" : "music.song", "user" : "zj", }, }), @@ -380,6 +387,7 @@ class NewgroundsMediaExtractor(NewgroundsExtractor): ("https://www.newgrounds.com/portal/view/161181/format/flash", { "pattern": r"https://uploads\.ungrounded\.net/161000" r"/161181_ddautta_mask__550x281_\.swf\?f1081628129", + "keyword": {"type": "movie"}, }), # format selection (#1729) ("https://www.newgrounds.com/portal/view/758545", { @@ -392,6 +400,49 @@ class NewgroundsMediaExtractor(NewgroundsExtractor): "options": (("username", None),), "count": 1, }), + # flash game + ("https://www.newgrounds.com/portal/view/829032", { + "pattern": r"https://uploads\.ungrounded\.net/829000" + r"/829032_picovsbeardx\.swf\?f1641968445", + "range": "1", + "keyword": { + "artist" : [ + "dungeonation", + "carpetbakery", + "animalspeakandrews", + "bill", + "chipollo", + "dylz49", + "gappyshamp", + "pinktophat", + "rad", + "shapeshiftingblob", + "tomfulp", + "voicesbycorey", + 
"psychogoldfish", + ], + "comment" : "re:The children are expendable. Take out the ", + "date" : "dt:2022-01-10 23:00:57", + "description": "Bloodshed in The Big House that Blew...again!", + "favorites" : int, + "index" : 829032, + "post_url" : "https://www.newgrounds.com/portal/view/829032", + "rating" : "m", + "score" : float, + "tags" : [ + "assassin", + "boyfriend", + "darnell", + "nene", + "pico", + "picos-school", + ], + "title" : "PICO VS BEAR DX", + "type" : "game", + "url" : "https://uploads.ungrounded.net/829000" + "/829032_picovsbeardx.swf?f1641968445", + }, + }), ) def __init__(self, match): @@ -434,6 +485,17 @@ class NewgroundsMoviesExtractor(NewgroundsExtractor): }) +class NewgroundsGamesExtractor(NewgroundsExtractor): + """Extractor for a newgrounds user's games""" + subcategory = _path = "games" + pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/games/?$" + test = ("https://tomfulp.newgrounds.com/games", { + "pattern": r"https://uploads.ungrounded.net(/alternate)?/\d+/\d+_.+", + "range": "1-10", + "count": 10, + }) + + class NewgroundsUserExtractor(NewgroundsExtractor): """Extractor for a newgrounds user profile""" subcategory = "user" @@ -454,6 +516,7 @@ class NewgroundsUserExtractor(NewgroundsExtractor): return self._dispatch_extractors(( (NewgroundsArtExtractor , base + "art"), (NewgroundsAudioExtractor , base + "audio"), + (NewgroundsGamesExtractor , base + "games"), (NewgroundsMoviesExtractor, base + "movies"), ), ("art",)) diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index a589760..6b2e1c3 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -642,6 +642,66 @@ class PixivPixivisionExtractor(PixivExtractor): } +class PixivSeriesExtractor(PixivExtractor): + """Extractor for illustrations from a Pixiv series""" + subcategory = "series" + directory_fmt = ("{category}", "{user[id]} {user[account]}", + "{series[id]} {series[title]}") + filename_fmt = "{num_series:>03}_{id}_p{num}.{extension}" + pattern = (r"(?:https?://)?(?:www\.)?pixiv\.net" + r"/user/(\d+)/series/(\d+)") + test = ("https://www.pixiv.net/user/10509347/series/21859", { + "range": "1-10", + "count": 10, + "keyword": { + "num_series": int, + "series": { + "canonical": "https://www.pixiv.net/user/10509347" + "/series/21859", + "description": str, + "ogp": dict, + "title": "先輩がうざい後輩の話", + "total": int, + "twitter": dict, + }, + }, + }) + + def __init__(self, match): + PixivExtractor.__init__(self, match) + self.user_id, self.series_id = match.groups() + + def works(self): + url = self.root + "/ajax/series/" + self.series_id + params = {"p": 1} + headers = { + "Accept": "application/json", + "Referer": "{}/user/{}/series/{}".format( + self.root, self.user_id, self.series_id), + "Alt-Used": "www.pixiv.net", + } + + while True: + data = self.request(url, params=params, headers=headers).json() + body = data["body"] + page = body["page"] + + series = body["extraData"]["meta"] + series["id"] = self.series_id + series["total"] = page["total"] + series["title"] = text.extract(series["title"], '"', '"')[0] + + for info in page["series"]: + work = self.api.illust_detail(info["workId"]) + work["num_series"] = info["order"] + work["series"] = series + yield work + + if len(page["series"]) < 10: + return + params["p"] += 1 + + class PixivSketchExtractor(Extractor): """Extractor for user pages on sketch.pixiv.net""" category = "pixiv" diff --git a/gallery_dl/extractor/plurk.py b/gallery_dl/extractor/plurk.py index f2e964d..535fae9 100644 --- 
a/gallery_dl/extractor/plurk.py +++ b/gallery_dl/extractor/plurk.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2020 Mike Fährmann +# Copyright 2019-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -85,7 +85,7 @@ class PlurkTimelineExtractor(PlurkExtractor): def plurks(self): url = "{}/{}".format(self.root, self.user) page = self.request(url).text - user_id, pos = text.extract(page, '"user_id":', ',') + user_id, pos = text.extract(page, '"page_user": {"id":', ',') plurks = self._load(text.extract(page, "_PLURKS = ", ";\n", pos)[0]) headers = {"Referer": url, "X-Requested-With": "XMLHttpRequest"} diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index 2ce7f6c..3396e3a 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -53,12 +53,15 @@ class SankakuExtractor(BooruExtractor): url = "https://s.sankakucomplex.com" + url[url.index("/", 8):] return url - @staticmethod - def _prepare(post): + def _prepare(self, post): post["created_at"] = post["created_at"]["s"] post["date"] = text.parse_timestamp(post["created_at"]) post["tags"] = [tag["name"] for tag in post["tags"] if tag["name"]] post["tag_string"] = " ".join(post["tags"]) + post["_http_validate"] = self._check_expired + + def _check_expired(self, response): + return not response.history or '.com/expired.png' not in response.url def _extended_tags(self, post): tags = collections.defaultdict(list) @@ -219,7 +222,11 @@ class SankakuAPI(): def __init__(self, extractor): self.extractor = extractor - self.headers = {"Accept": "application/vnd.sankaku.api+json;v=2"} + self.headers = { + "Accept" : "application/vnd.sankaku.api+json;v=2", + "Origin" : extractor.root, + "Referer": extractor.root + "/", + } self.username, self.password = self.extractor._get_auth_info() if not self.username: @@ -253,11 +260,14 @@ class SankakuAPI(): for _ in range(5): self.authenticate() response = self.extractor.request( - url, params=params, headers=self.headers, fatal=False) + url, params=params, headers=self.headers, fatal=None) if response.status_code == 429: - self.extractor.wait( - until=response.headers.get("X-RateLimit-Reset")) + until = response.headers.get("X-RateLimit-Reset") + if not until and b"tags-limit" in response.content: + raise exception.StopExtraction("Search tag limit exceeded") + seconds = None if until else 60 + self.extractor.wait(until=until, seconds=seconds) continue data = response.json() @@ -278,9 +288,41 @@ class SankakuAPI(): params["lang"] = "en" params["limit"] = str(self.extractor.per_page) + refresh = self.extractor.config("refresh", False) + if refresh: + offset = expires = 0 + from time import time + while True: data = self._call(endpoint, params) - yield from data["data"] + + if refresh: + posts = data["data"] + if offset: + posts = util.advance(posts, offset) + + for post in posts: + if not expires: + url = post["file_url"] + if url: + expires = text.parse_int( + text.extract(url, "e=", "&")[0]) - 60 + + if 0 < expires <= time(): + self.extractor.log.debug("Refreshing download URLs") + expires = None + break + + offset += 1 + yield post + + if expires is None: + expires = 0 + continue + offset = expires = 0 + + else: + yield from data["data"] params["next"] = data["meta"]["next"] if not params["next"]: diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py index cd8c238..822b1f2 100644 --- a/gallery_dl/extractor/skeb.py +++ 
b/gallery_dl/extractor/skeb.py @@ -26,8 +26,11 @@ class SkebExtractor(Extractor): self.article = self.config("article", False) def items(self): + metadata = self.metadata() for user_name, post_num in self.posts(): response, post = self._get_post_data(user_name, post_num) + if metadata: + post.update(metadata) yield Message.Directory, post for data in self._get_urls_from_post(response, post): url = data["file_url"] @@ -36,6 +39,9 @@ class SkebExtractor(Extractor): def posts(self): """Return post number""" + def metadata(self): + """Return additional metadata""" + def _pagination(self, url, params): headers = {"Referer": self.root, "Authorization": "Bearer null"} params["offset"] = 0 @@ -223,6 +229,62 @@ class SkebUserExtractor(SkebExtractor): return posts +class SkebSearchExtractor(SkebExtractor): + """Extractor for skeb search results""" + subcategory = "search" + pattern = r"(?:https?://)?skeb\.jp/search\?q=([^&#]+)" + test = ("https://skeb.jp/search?q=bunny%20tree&t=works", { + "count": ">= 18", + "keyword": {"search_tags": "bunny tree"}, + }) + + def metadata(self): + return {"search_tags": text.unquote(self.user_name)} + + def posts(self): + url = "https://hb1jt3kre9-2.algolianet.com/1/indexes/*/queries" + params = { + "x-algolia-agent": "Algolia for JavaScript (4.13.1); Browser", + } + headers = { + "Origin": self.root, + "Referer": self.root + "/", + "x-algolia-api-key": "9a4ce7d609e71bf29e977925e4c6740c", + "x-algolia-application-id": "HB1JT3KRE9", + } + + filters = self.config("filters") + if filters is None: + filters = ("genre:art OR genre:voice OR genre:novel OR " + "genre:video OR genre:music OR genre:correction") + elif not isinstance(filters, str): + filters = " OR ".join(filters) + + page = 0 + pams = "hitsPerPage=40&filters=" + text.quote(filters) + "&page=" + + request = { + "indexName": "Request", + "query": text.unquote(self.user_name), + "params": pams + str(page), + } + data = {"requests": (request,)} + + while True: + result = self.request( + url, method="POST", params=params, headers=headers, json=data, + ).json()["results"][0] + + for post in result["hits"]: + parts = post["path"].split("/") + yield parts[1][1:], parts[3] + + if page >= result["nbPages"]: + return + page += 1 + request["params"] = pams + str(page) + + class SkebFollowingExtractor(SkebExtractor): """Extractor for all creators followed by a skeb user""" subcategory = "following" @@ -238,8 +300,8 @@ class SkebFollowingExtractor(SkebExtractor): def users(self): url = "{}/api/users/{}/following_creators".format( self.root, self.user_name) - headers = {"Referer": self.root, "Authorization": "Bearer null"} params = {"sort": "date", "offset": 0, "limit": 90} + headers = {"Referer": self.root, "Authorization": "Bearer null"} while True: data = self.request(url, params=params, headers=headers).json() diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index 6f53881..447ce00 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -116,13 +116,17 @@ class TumblrExtractor(Extractor): if self.original and "/s2048x3072/" in photo["url"] and ( photo["width"] == 2048 or photo["height"] == 3072): - photo["url"] = self._original_photo(photo["url"]) + photo["url"], fb = self._original_photo(photo["url"]) + if fb: + post["_fallback"] = self._original_image_fallback( + photo["url"], post["id"]) del photo["original_size"] del photo["alt_sizes"] posts.append( self._prepare_image(photo["url"], post.copy())) del post["photo"] + post.pop("_fallback", None) url = 
post.get("audio_url") # type "audio" if url and url.startswith("https://a.tumblr.com/"): @@ -138,8 +142,12 @@ class TumblrExtractor(Extractor): # API response, but they can't contain images/videos anyway body = post["reblog"]["comment"] + post["reblog"]["tree_html"] for url in _findall_image(body): - url = self._original_inline_image(url) + url, fb = self._original_inline_image(url) + if fb: + post["_fallback"] = self._original_image_fallback( + url, post["id"]) posts.append(self._prepare_image(url, post.copy())) + post.pop("_fallback", None) for url in _findall_video(body): url = self._original_video(url) posts.append(self._prepare(url, post.copy())) @@ -218,23 +226,35 @@ class TumblrExtractor(Extractor): return self.blog != post.get("reblogged_root_uuid") def _original_photo(self, url): - return self._update_image_token( - url.replace("/s2048x3072/", "/s99999x99999/", 1)) + resized = url.replace("/s2048x3072/", "/s99999x99999/", 1) + return self._update_image_token(resized) def _original_inline_image(self, url): if self.original: - url, n = self._subn_orig_image("/s99999x99999/", url, 1) + resized, n = self._subn_orig_image("/s99999x99999/", url, 1) if n: - return self._update_image_token(url) - return self._sub_image(r"https://\1_1280.\2", url) + return self._update_image_token(resized) + return self._sub_image(r"https://\1_1280.\2", url), False def _original_video(self, url): return self._sub_video(r"https://\1.\2", url) - def _update_image_token(self, url): + def _update_image_token(self, resized): headers = {"Accept": "text/html,*/*;q=0.8"} - response = self.request(url, headers=headers) - return text.extract(response.text, '" src="', '"')[0] + try: + response = self.request(resized, headers=headers) + except Exception: + return resized, True + else: + updated = text.extract(response.text, '" src="', '"')[0] + return updated, (resized == updated) + + def _original_image_fallback(self, url, post_id): + yield self._update_image_token(url)[0] + yield self._update_image_token(url)[0] + yield self._update_image_token(url)[0] + self.log.warning("Unable to fetch higher-resolution " + "version of %s (%s)", url, post_id) class TumblrUserExtractor(TumblrExtractor): diff --git a/gallery_dl/version.py b/gallery_dl/version.py index ce018fe..13cb9a0 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.23.1" +__version__ = "1.23.2" -- cgit v1.2.3