| field     | value |
|-----------|-------|
| author    | 2022-08-29 02:17:16 -0400 |
| committer | 2022-08-29 02:17:16 -0400 |
| commit    | a768930761f7f20587ae40a8cacca0e55c85290a (patch) |
| tree      | 5a4163db912b93fc45f717e5e43fd5be3e66f16c /gallery_dl/extractor |
| parent    | ae2a0f5622beaa6f402526f8a7b939419283a090 (diff) |
New upstream version 1.23.0 (tag: upstream/1.23.0)
Diffstat (limited to 'gallery_dl/extractor')
34 files changed, 638 insertions, 202 deletions
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 70cebb3..9e4507a 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -26,6 +26,7 @@ modules = [
     "behance",
     "blogger",
     "bunkr",
+    "catbox",
     "comicvine",
     "cyberdrop",
     "danbooru",
@@ -150,6 +151,7 @@ modules = [
     "wikieat",
     "xhamster",
     "xvideos",
+    "zerochan",
     "booru",
     "moebooru",
     "foolfuuka",
diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py
index 19b9d97..c0e8e67 100644
--- a/gallery_dl/extractor/artstation.py
+++ b/gallery_dl/extractor/artstation.py
@@ -32,9 +32,11 @@ class ArtstationExtractor(Extractor):
         data = self.metadata()
 
         for project in self.projects():
-            for asset in self.get_project_assets(project["hash_id"]):
+            for num, asset in enumerate(
+                    self.get_project_assets(project["hash_id"]), 1):
                 asset.update(data)
                 adict = asset["asset"]
+                asset["num"] = num
                 yield Message.Directory, asset
 
                 if adict["has_embedded_player"] and self.external:
@@ -85,6 +87,7 @@ class ArtstationExtractor(Extractor):
 
         assets = data["assets"]
         del data["assets"]
+        data["count"] = len(assets)
 
         if len(assets) == 1:
             data["asset"] = assets[0]
             yield data
diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py
index 21ca991..e0885d2 100644
--- a/gallery_dl/extractor/blogger.py
+++ b/gallery_dl/extractor/blogger.py
@@ -67,9 +67,6 @@ class BloggerExtractor(Extractor):
                     key=lambda x: x["format_id"],
                 )["play_url"])
 
-            if not files:
-                continue
-
             post["author"] = post["author"]["displayName"]
             post["replies"] = post["replies"]["totalItems"]
             post["content"] = text.remove_html(content)
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 9904d0a..3091f57 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -16,10 +16,10 @@ import json
 class BunkrAlbumExtractor(LolisafeAlbumExtractor):
     """Extractor for bunkr.is albums"""
     category = "bunkr"
-    root = "https://app.bunkr.is"
+    root = "https://bunkr.is"
     pattern = r"(?:https?://)?(?:app\.)?bunkr\.(?:is|to)/a/([^/?#]+)"
     test = (
-        ("https://app.bunkr.is/a/Lktg9Keq", {
+        ("https://bunkr.is/a/Lktg9Keq", {
             "pattern": r"https://cdn\.bunkr\.is/test-テスト-\"&>-QjgneIQv\.png",
             "content": "0c8768055e4e20e7c7259608b67799171b691140",
             "keyword": {
@@ -33,7 +33,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
             },
         }),
         # mp4 (#2239)
-        ("https://bunkr.is/a/ptRHaCn2", {
+        ("https://app.bunkr.is/a/ptRHaCn2", {
             "pattern": r"https://media-files\.bunkr\.is/_-RnHoW69L\.mp4",
             "content": "80e61d1dbc5896ae7ef9a28734c747b28b320471",
         }),
@@ -70,16 +70,16 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
             album = props["album"]
             files = props["files"]
         except Exception as exc:
-            self.log.debug(exc)
+            self.log.debug(exc.__class__.__name__, exc)
             self.root = self.root.replace("bunkr", "app.bunkr", 1)
             return self._fetch_album_api(album_id)
 
         for file in files:
             name = file["name"]
+            cdn = file["cdn"]
             if name.endswith(".mp4"):
-                file["file"] = "https://media-files.bunkr.is/" + name
-            else:
-                file["file"] = file["cdn"] + "/" + name
+                cdn = cdn.replace("//cdn", "//media-files")
+            file["file"] = cdn + "/" + name
 
         return files, {
             "album_id"  : self.album_id,
diff --git a/gallery_dl/extractor/catbox.py b/gallery_dl/extractor/catbox.py
new file mode 100644
index 0000000..509108f
--- /dev/null
+++ b/gallery_dl/extractor/catbox.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://catbox.moe/"""
+
+from .common import GalleryExtractor
+from .. import text
+
+
+class CatboxAlbumExtractor(GalleryExtractor):
+    """Extractor for catbox albums"""
+    category = "catbox"
+    subcategory = "album"
+    root = "https://catbox.moe"
+    filename_fmt = "{filename}.{extension}"
+    directory_fmt = ("{category}", "{album_name} ({album_id})")
+    archive_fmt = "{album_id}_{filename}"
+    pattern = r"(?:https?://)?(?:www\.)?catbox\.moe(/c/[^/?#]+)"
+    test = (
+        ("https://catbox.moe/c/1igcbe", {
+            "url": "35866a88c29462814f103bc22ec031eaeb380f8a",
+            "content": "70ddb9de3872e2d17cc27e48e6bf395e5c8c0b32",
+            "pattern": r"https://files\.catbox\.moe/\w+\.\w{3}$",
+            "count": 3,
+            "keyword": {
+                "album_id": "1igcbe",
+                "album_name": "test",
+                "date": "dt:2022-08-18 00:00:00",
+                "description": "album test &>",
+            },
+        }),
+        ("https://www.catbox.moe/c/cd90s1"),
+        ("https://catbox.moe/c/w7tm47#"),
+    )
+
+    def metadata(self, page):
+        extr = text.extract_from(page)
+        return {
+            "album_id"   : self.gallery_url.rpartition("/")[2],
+            "album_name" : text.unescape(extr("<h1>", "<")),
+            "date"       : text.parse_datetime(extr(
+                "<p>Created ", "<"), "%B %d %Y"),
+            "description": text.unescape(extr("<p>", "<")),
+        }
+
+    def images(self, page):
+        return [
+            ("https://files.catbox.moe/" + path, None)
+            for path in text.extract_iter(
+                page, ">https://files.catbox.moe/", "<")
+        ]
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 6ccae7f..1b41101 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -795,12 +795,23 @@ SSL_CIPHERS = {
 }
 
 
+urllib3 = requests.packages.urllib3
+
 # detect brotli support
 try:
-    BROTLI = requests.packages.urllib3.response.brotli is not None
+    BROTLI = urllib3.response.brotli is not None
 except AttributeError:
     BROTLI = False
 
+# set (urllib3) warnings filter
+action = config.get((), "warnings", "default")
+if action:
+    try:
+        import warnings
+        warnings.simplefilter(action, urllib3.exceptions.HTTPWarning)
+    except Exception:
+        pass
+del action
+
 # Undo automatic pyOpenSSL injection by requests
 pyopenssl = config.get((), "pyopenssl", False)
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index ec0db68..8c2ed53 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -34,6 +34,7 @@ class DanbooruExtractor(BaseExtractor):
         self.per_page = iget("per-page", 200)
         self.request_interval_min = iget("request-interval-min", 0.0)
         self._pools = iget("pools")
+        self._popular_endpoint = iget("popular", "/explore/posts/popular.json")
 
         BaseExtractor.__init__(self, match)
@@ -150,6 +151,7 @@ INSTANCES = {
         "headers": {"User-Agent": "gallery-dl/{} (by mikf)".format(
             __version__)},
         "pools": "sort",
+        "popular": "/popular.json",
         "page-limit": 750,
         "per-page": 320,
         "request-interval-min": 1.0,
@@ -308,7 +310,7 @@ class DanbooruPopularExtractor(DanbooruExtractor):
     subcategory = "popular"
     directory_fmt = ("{category}", "popular", "{scale}", "{date}")
     archive_fmt = "P_{scale[0]}_{date}_{id}"
-    pattern = BASE_PATTERN + r"/explore/posts/popular(?:\?([^#]*))?"
+    pattern = BASE_PATTERN + r"/(?:explore/posts/)?popular(?:\?([^#]*))?"
     test = (
         ("https://danbooru.donmai.us/explore/posts/popular"),
         (("https://danbooru.donmai.us/explore/posts/popular"
@@ -316,7 +318,7 @@ class DanbooruPopularExtractor(DanbooruExtractor):
             "range": "1-120",
             "count": 120,
         }),
-        ("https://e621.net/explore/posts/popular"),
+        ("https://e621.net/popular"),
         (("https://e621.net/explore/posts/popular"
           "?date=2019-06-01&scale=month"), {
             "pattern": r"https://static\d.e621.net/data/../../[0-9a-f]+",
@@ -345,8 +347,7 @@ class DanbooruPopularExtractor(DanbooruExtractor):
     def posts(self):
         if self.page_start is None:
             self.page_start = 1
-        return self._pagination(
-            "/explore/posts/popular.json", self.params, True)
+        return self._pagination(self._popular_endpoint, self.params, True)
 
 
 class DanbooruFavoriteExtractor(DanbooruExtractor):
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 39ae484..60f644d 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -1128,11 +1128,18 @@ class DeviantartOAuthAPI():
             self._folders((deviation,))
         return deviation
 
-    def deviation_content(self, deviation_id, public=False):
+    def deviation_content(self, deviation_id, public=True):
         """Get extended content of a single Deviation"""
         endpoint = "/deviation/content"
         params = {"deviationid": deviation_id}
-        return self._call(endpoint, params=params, public=public)
+        content = self._call(endpoint, params=params, public=public)
+        if public and content["html"].startswith(
+                ' <span class=\"username-with-symbol'):
+            if self.refresh_token_key:
+                content = self._call(endpoint, params=params, public=False)
+            else:
+                self.log.warning("Private Journal")
+        return content
 
     def deviation_download(self, deviation_id, public=True):
         """Get the original file download (if allowed)"""
diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py
index 11436cb..8481248 100644
--- a/gallery_dl/extractor/fanbox.py
+++ b/gallery_dl/extractor/fanbox.py
@@ -6,6 +6,7 @@
 
 """Extractors for https://www.fanbox.cc/"""
 
+import re
 from .common import Extractor, Message
 from .. import text
@@ -78,6 +79,7 @@ class FanboxExtractor(Extractor):
         num = 0
         cover_image = post.get("coverImageUrl")
         if cover_image:
+            cover_image = re.sub("/c/[0-9a-z_]+", "", cover_image)
             final_post = post.copy()
             final_post["isCoverImage"] = True
             final_post["fileUrl"] = cover_image
diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py
index 34b52ef..5e6da5b 100644
--- a/gallery_dl/extractor/foolfuuka.py
+++ b/gallery_dl/extractor/foolfuuka.py
@@ -88,9 +88,13 @@ BASE_PATTERN = FoolfuukaExtractor.update({
         "root": "https://boards.fireden.net",
         "pattern": r"boards\.fireden\.net",
     },
-    "nyafuu": {
-        "root": "https://archive.nyafuu.org",
-        "pattern": r"(?:archive\.)?nyafuu\.org",
+    "rozenarcana": {
+        "root": "https://archive.alice.al",
+        "pattern": r"(?:archive\.)?alice\.al",
+    },
+    "tokyochronos": {
+        "root": "https://www.tokyochronos.net",
+        "pattern": r"(?:www\.)?tokyochronos\.net",
     },
     "rbt": {
         "root": "https://rbt.asia",
@@ -111,7 +115,7 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor):
     pattern = BASE_PATTERN + r"/([^/?#]+)/thread/(\d+)"
     test = (
         ("https://archive.4plebs.org/tg/thread/54059290", {
-            "url": "07452944164b602502b02b24521f8cee5c484d2a",
+            "url": "fd823f17b5001442b941fddcd9ec91bafedfbc79",
         }),
         ("https://archived.moe/gd/thread/309639/", {
             "url": "fdd533840e2d535abd162c02d6dfadbc12e2dcd8",
@@ -133,8 +137,11 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor):
         ("https://boards.fireden.net/sci/thread/11264294/", {
             "url": "61cab625c95584a12a30049d054931d64f8d20aa",
         }),
-        ("https://archive.nyafuu.org/c/thread/2849220/", {
-            "url": "bbe6f82944a45e359f5c8daf53f565913dc13e4f",
+        ("https://archive.alice.al/c/thread/2849220/", {
+            "url": "632e2c8de05de6b3847685f4bf1b4e5c6c9e0ed5",
+        }),
+        ("https://www.tokyochronos.net/a/thread/241664141/", {
+            "url": "ae03852cf44e3dcfce5be70274cb1828e1dbb7d6",
         }),
         ("https://rbt.asia/g/thread/61487650/", {
             "url": "fadd274b25150a1bdf03a40c58db320fa3b617c4",
@@ -180,7 +187,8 @@ class FoolfuukaBoardExtractor(FoolfuukaExtractor):
         ("https://arch.b4k.co/meta/"),
         ("https://desuarchive.org/a/"),
         ("https://boards.fireden.net/sci/"),
-        ("https://archive.nyafuu.org/c/"),
+        ("https://archive.alice.al/c/"),
+        ("https://www.tokyochronos.net/a/"),
         ("https://rbt.asia/g/"),
         ("https://thebarchive.com/b/"),
     )
@@ -223,7 +231,8 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor):
         ("https://archiveofsins.com/_/search/text/test/"),
         ("https://desuarchive.org/_/search/text/test/"),
         ("https://boards.fireden.net/_/search/text/test/"),
-        ("https://archive.nyafuu.org/_/search/text/test/"),
+        ("https://archive.alice.al/_/search/text/test/"),
+        ("https://www.tokyochronos.net/_/search/text/test/"),
         ("https://rbt.asia/_/search/text/test/"),
         ("https://thebarchive.com/_/search/text/test/"),
     )
@@ -288,7 +297,8 @@ class FoolfuukaGalleryExtractor(FoolfuukaExtractor):
         ("https://arch.b4k.co/meta/gallery/"),
         ("https://desuarchive.org/a/gallery/5"),
         ("https://boards.fireden.net/sci/gallery/6"),
-        ("https://archive.nyafuu.org/c/gallery/7"),
+        ("https://archive.alice.al/c/gallery/7"),
+        ("https://www.tokyochronos.net/a/gallery/7"),
         ("https://rbt.asia/g/gallery/8"),
         ("https://thebarchive.com/b/gallery/9"),
     )
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index e8bee37..92f7ac2 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -10,7 +10,7 @@
 
 from .common import Extractor, Message
 from . import gelbooru_v02
-from .. import text, util, exception
+from .. import text, exception
 import binascii
@@ -21,10 +21,15 @@ class GelbooruBase():
     root = "https://gelbooru.com"
 
     def _api_request(self, params):
+        params["api_key"] = self.api_key
+        params["user_id"] = self.user_id
+
         url = self.root + "/index.php?page=dapi&s=post&q=index&json=1"
         data = self.request(url, params=params).json()
+
         if "post" not in data:
             return ()
+
         posts = data["post"]
         if not isinstance(posts, list):
             return (posts,)
@@ -85,28 +90,29 @@ class GelbooruTagExtractor(GelbooruBase,
 
 class GelbooruPoolExtractor(GelbooruBase,
                             gelbooru_v02.GelbooruV02PoolExtractor):
-    """Extractor for image-pools from gelbooru.com"""
+    """Extractor for gelbooru pools"""
+    per_page = 45
     pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
                r"\?page=pool&s=show&id=(?P<pool>\d+)")
     test = (
         ("https://gelbooru.com/index.php?page=pool&s=show&id=761", {
             "count": 6,
         }),
-        ("https://gelbooru.com/index.php?page=pool&s=show&id=761", {
-            "options": (("api", False),),
-            "count": 6,
-        }),
     )
 
     def metadata(self):
-        url = "{}/index.php?page=pool&s=show&id={}".format(
-            self.root, self.pool_id)
-        page = self.request(url).text
+        url = self.root + "/index.php"
+        self._params = {
+            "page": "pool",
+            "s"   : "show",
+            "id"  : self.pool_id,
+            "pid" : self.page_start,
+        }
+        self._page = self.request(url, params=self._params).text
 
-        name, pos = text.extract(page, "<h3>Now Viewing: ", "</h3>")
+        name, pos = text.extract(self._page, "<h3>Now Viewing: ", "</h3>")
         if not name:
             raise exception.NotFoundError("pool")
-        self.post_ids = text.extract_iter(page, 'class="" id="p', '"', pos)
 
         return {
             "pool": text.parse_int(self.pool_id),
@@ -114,9 +120,23 @@ class GelbooruPoolExtractor(GelbooruBase,
         }
 
     def posts(self):
-        params = {}
-        for params["id"] in util.advance(self.post_ids, self.page_start):
-            yield from self._api_request(params)
+        url = self.root + "/index.php"
+        params = self._params
+
+        page = self._page
+        del self._page
+        data = {}
+
+        while True:
+            num_ids = 0
+            for data["id"] in text.extract_iter(page, '" id="p', '"'):
+                num_ids += 1
+                yield from self._api_request(data)
+
+            if num_ids < self.per_page:
+                return
+            params["pid"] += self.per_page
+            page = self.request(url, params=params).text
 
 
 class GelbooruPostExtractor(GelbooruBase,
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py
index 35a3448..8214614 100644
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -21,6 +21,9 @@ class GelbooruV02Extractor(booru.BooruExtractor):
     def __init__(self, match):
         booru.BooruExtractor.__init__(self, match)
+        self.api_key = self.config("api-key")
+        self.user_id = self.config("user-id")
+
         try:
             self.api_root = INSTANCES[self.category]["api_root"]
         except KeyError:
@@ -59,6 +62,24 @@ class GelbooruV02Extractor(booru.BooruExtractor):
                 return
             params["pid"] += 1
 
+    def _pagination_html(self, params):
+        url = self.root + "/index.php"
+        params["pid"] = self.page_start * self.per_page
+
+        data = {}
+        while True:
+            num_ids = 0
+            page = self.request(url, params=params).text
+
+            for data["id"] in text.extract_iter(page, '" id="p', '"'):
+                num_ids += 1
+                for post in self._api_request(data):
+                    yield post.attrib
+
+            if num_ids < self.per_page:
+                return
+            params["pid"] += self.per_page
+
     @staticmethod
     def _prepare(post):
         post["date"] = text.parse_datetime(
@@ -204,7 +225,12 @@ class GelbooruV02PoolExtractor(GelbooruV02Extractor):
     def __init__(self, match):
         GelbooruV02Extractor.__init__(self, match)
         self.pool_id = match.group(match.lastindex)
-        self.post_ids = ()
+
+        if self.category == "rule34":
+            self.posts = self._posts_pages
+            self.per_page = 45
+        else:
+            self.post_ids = ()
 
     def skip(self, num):
         self.page_start += num
@@ -232,6 +258,13 @@ class GelbooruV02PoolExtractor(GelbooruV02Extractor):
             for post in self._api_request(params):
                 yield post.attrib
 
+    def _posts_pages(self):
+        return self._pagination_html({
+            "page": "pool",
+            "s"   : "show",
+            "id"  : self.pool_id,
+        })
+
 
 class GelbooruV02FavoriteExtractor(GelbooruV02Extractor):
     subcategory = "favorite"
@@ -265,27 +298,11 @@ class GelbooruV02FavoriteExtractor(GelbooruV02Extractor):
         return {"favorite_id": text.parse_int(self.favorite_id)}
 
     def posts(self):
-        url = self.root + "/index.php"
-        params = {
+        return self._pagination_html({
             "page": "favorites",
             "s"   : "view",
             "id"  : self.favorite_id,
-            "pid" : self.page_start * self.per_page,
-        }
-
-        data = {}
-        while True:
-            num_ids = 0
-            page = self.request(url, params=params).text
-
-            for data["id"] in text.extract_iter(page, '" id="p', '"'):
-                num_ids += 1
-                for post in self._api_request(data):
-                    yield post.attrib
-
-            if num_ids < self.per_page:
-                return
-            params["pid"] += self.per_page
+        })
 
 
 class GelbooruV02PostExtractor(GelbooruV02Extractor):
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index ca7e692..f8b0c3b 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -174,23 +174,27 @@ class HitomiTagExtractor(Extractor):
         }
 
         offset = 0
+        total = None
         while True:
             headers["Referer"] = "{}/{}/{}.html?page={}".format(
                 self.root, self.type, self.tag, offset // 100 + 1)
             headers["Range"] = "bytes={}-{}".format(offset, offset+99)
-            nozomi = self.request(nozomi_url, headers=headers).content
+            response = self.request(nozomi_url, headers=headers)
 
-            for gallery_id in decode_nozomi(nozomi):
+            for gallery_id in decode_nozomi(response.content):
                 gallery_url = "{}/galleries/{}.html".format(
                     self.root, gallery_id)
                 yield Message.Queue, gallery_url, data
 
-            if len(nozomi) < 100:
-                return
             offset += 100
+            if total is None:
+                total = text.parse_int(
+                    response.headers["content-range"].rpartition("/")[2])
+            if offset >= total:
+                return
 
 
-@memcache()
+@memcache(maxage=1800)
 def _parse_gg(extr):
     page = extr.request("https://ltn.hitomi.la/gg.js").text
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 4a2c3bb..d56af8b 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -270,6 +270,7 @@ class InstagramExtractor(Extractor):
             "post_shortcode": post["code"],
             "likes": post["like_count"],
             "pinned": post.get("timeline_pinned_user_ids", ()),
+            "date": text.parse_timestamp(post.get("taken_at")),
         }
 
         caption = post["caption"]
@@ -399,6 +400,8 @@ class InstagramExtractor(Extractor):
             self.log.debug("Cursor: %s", self._cursor)
 
     def _pagination_api(self, endpoint, params=None):
+        if params is None:
+            params = {}
         while True:
             data = self._request_api(endpoint, params=params)
             yield from data["items"]
@@ -509,7 +512,7 @@ class InstagramChannelExtractor(InstagramExtractor):
 class InstagramSavedExtractor(InstagramExtractor):
     """Extractor for ProfilePage saved media"""
     subcategory = "saved"
-    pattern = USER_PATTERN + r"/saved"
+    pattern = USER_PATTERN + r"/saved/?$"
     test = ("https://www.instagram.com/instagram/saved/",)
 
     def posts(self):
@@ -518,6 +521,30 @@ class InstagramSavedExtractor(InstagramExtractor):
         return self._pagination_graphql(query_hash, variables)
 
 
+class InstagramCollectionExtractor(InstagramExtractor):
+    """Extractor for ProfilePage saved collection media"""
+    subcategory = "collection"
+    pattern = USER_PATTERN + r"/saved/([^/?#]+)/([^/?#]+)"
+    test = (
+        "https://www.instagram.com/instagram/saved/collection_name/123456789/",
+    )
+
+    def __init__(self, match):
+        InstagramExtractor.__init__(self, match)
+        self.user, self.collection_name, self.collection_id = match.groups()
+
+    def metadata(self):
+        return {
+            "collection_id"  : self.collection_id,
+            "collection_name": text.unescape(self.collection_name),
+        }
+
+    def posts(self):
+        endpoint = "/v1/feed/collection/{}/posts/".format(self.collection_id)
+        for item in self._pagination_api(endpoint):
+            yield item["media"]
+
+
 class InstagramTagExtractor(InstagramExtractor):
     """Extractor for TagPage"""
     subcategory = "tag"
diff --git a/gallery_dl/extractor/itaku.py b/gallery_dl/extractor/itaku.py
index 6b2cf4c..00a32cd 100644
--- a/gallery_dl/extractor/itaku.py
+++ b/gallery_dl/extractor/itaku.py
@@ -101,9 +101,9 @@ class ItakuImageExtractor(ItakuExtractor):
                      "/gallery_imgs/220504_oUNIAFT/xl.jpg",
             "liked_by_you": False,
             "maturity_rating": "SFW",
-            "num_comments": 2,
-            "num_likes": 80,
-            "num_reshares": 2,
+            "num_comments": int,
+            "num_likes": int,
+            "num_reshares": int,
             "obj_tags": 136446,
             "owner": 16775,
             "owner_avatar": "https://d1wmr8tlk3viaj.cloudfront.net"
@@ -115,8 +115,9 @@ class ItakuImageExtractor(ItakuExtractor):
             "tags": list,
             "tags_character": ["hatsune_miku"],
             "tags_copyright": ["vocaloid"],
-            "tags_general" : ["twintails", "green_hair", "flag", "gloves",
-                              "green_eyes", "female", "racing_miku"],
+            "tags_general" : ["female", "green_eyes", "twintails",
+                              "green_hair", "gloves", "flag",
+                              "racing_miku"],
             "title": "Racing Miku 2022 Ver.",
             "too_mature": False,
             "uncompressed_filesize": "0.62",
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index f1eb79f..816b561 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -440,20 +440,44 @@ class KemonopartyDiscordServerExtractor(KemonopartyExtractor):
 class KemonopartyFavoriteExtractor(KemonopartyExtractor):
     """Extractor for kemono.party favorites"""
     subcategory = "favorite"
-    pattern = BASE_PATTERN + r"/favorites"
-    test = ("https://kemono.party/favorites", {
-        "pattern": KemonopartyUserExtractor.pattern,
-        "url": "f4b5b796979bcba824af84206578c79101c7f0e1",
-        "count": 3,
-    })
+    pattern = BASE_PATTERN + r"/favorites(?:/?\?([^#]+))?"
+    test = (
+        ("https://kemono.party/favorites", {
+            "pattern": KemonopartyUserExtractor.pattern,
+            "url": "f4b5b796979bcba824af84206578c79101c7f0e1",
+            "count": 3,
+        }),
+        ("https://kemono.party/favorites?type=post", {
+            "pattern": KemonopartyPostExtractor.pattern,
+            "url": "ecfccf5f0d50b8d14caa7bbdcf071de5c1e5b90f",
+            "count": 3,
+        }),
+    )
+
+    def __init__(self, match):
+        KemonopartyExtractor.__init__(self, match)
+        self.favorites = (text.parse_query(match.group(2)).get("type") or
+                          self.config("favorites") or
+                          "artist")
 
     def items(self):
         self._prepare_ddosguard_cookies()
         self.login()
 
-        users = self.request(self.root + "/api/favorites").json()
-        for user in users:
-            user["_extractor"] = KemonopartyUserExtractor
-            url = "{}/{}/user/{}".format(
-                self.root, user["service"], user["id"])
-            yield Message.Queue, url, user
+        if self.favorites == "artist":
+            users = self.request(
+                self.root + "/api/v1/account/favorites?type=artist").json()
+            for user in users:
+                user["_extractor"] = KemonopartyUserExtractor
+                url = "{}/{}/user/{}".format(
+                    self.root, user["service"], user["id"])
+                yield Message.Queue, url, user
+
+        elif self.favorites == "post":
+            posts = self.request(
+                self.root + "/api/v1/account/favorites?type=post").json()
+            for post in posts:
+                post["_extractor"] = KemonopartyPostExtractor
+                url = "{}/{}/user/{}/post/{}".format(
+                    self.root, post["service"], post["user"], post["id"])
+                yield Message.Queue, url, post
diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py
index b5db3dd..57db0c9 100644
--- a/gallery_dl/extractor/luscious.py
+++ b/gallery_dl/extractor/luscious.py
@@ -49,7 +49,9 @@ class LusciousAlbumExtractor(LusciousExtractor):
                r"/(?:albums|pictures/c/[^/?#]+/album)/[^/?#]+_(\d+)")
     test = (
         ("https://luscious.net/albums/okinami-no-koigokoro_277031/", {
-            "url": "7e4984a271a1072ac6483e4228a045895aff86f3",
+            "pattern": r"https://storage\.bhs\.cloud\.ovh\.net/v1/AUTH_\w+"
+                       r"/images/NTRshouldbeillegal/277031"
+                       r"/luscious_net_\d+_\d+\.jpg$",
             # "content": "b3a747a6464509440bd0ff6d1267e6959f8d6ff3",
             "keyword": {
                 "album": {
diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py
index 6e780e8..493a8ef 100644
--- a/gallery_dl/extractor/mastodon.py
+++ b/gallery_dl/extractor/mastodon.py
@@ -179,12 +179,11 @@ class MastodonAPI():
             try:
                 access_token = INSTANCES[extractor.category]["access-token"]
             except (KeyError, TypeError):
-                raise exception.StopExtraction(
-                    "Missing access token.\n"
-                    "Run 'gallery-dl oauth:mastodon:%s' to obtain one.",
-                    extractor.instance)
-
-        self.headers = {"Authorization": "Bearer " + access_token}
+                pass
+        if access_token:
+            self.headers = {"Authorization": "Bearer " + access_token}
+        else:
+            self.headers = None
 
     def account_id_by_username(self, username):
         if username.startswith("id:"):
@@ -232,6 +231,11 @@ class MastodonAPI():
         if code < 400:
             return response
 
+        if code == 401:
+            raise exception.StopExtraction(
+                "Invalid or missing access token.\n"
+                "Run 'gallery-dl oauth:mastodon:%s' to obtain one.",
+                self.extractor.instance)
         if code == 404:
             raise exception.NotFoundError()
         if code == 429:
diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py
index 122ea46..2c8e72c 100644
--- a/gallery_dl/extractor/nijie.py
+++ b/gallery_dl/extractor/nijie.py
@@ -126,7 +126,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
         username, password = self._get_auth_info()
         self._update_cookies(self._login_impl(username, password))
 
-    @cache(maxage=150*24*3600, keyarg=1)
+    @cache(maxage=90*24*3600, keyarg=1)
     def _login_impl(self, username, password):
         if not username or not password:
             raise exception.AuthenticationError(
diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
index 653822f..d6628c4 100644
--- a/gallery_dl/extractor/oauth.py
+++ b/gallery_dl/extractor/oauth.py
@@ -41,7 +41,8 @@ class OAuthBase(Extractor):
         stdout_write("Waiting for response. (Cancel with Ctrl+c)\n")
         server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
         server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-        server.bind(("localhost", self.config("port", 6414)))
+        server.bind((self.config("host", "localhost"),
+                     self.config("port", 6414)))
         server.listen(1)
 
         # workaround for ctrl+c not working during server.accept on Windows
diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py
index fba1312..225f0ff 100644
--- a/gallery_dl/extractor/philomena.py
+++ b/gallery_dl/extractor/philomena.py
@@ -122,7 +122,7 @@ class PhilomenaPostExtractor(PhilomenaExtractor):
             "tag_ids": list,
             "tags": list,
             "thumbnails_generated": True,
-            "updated_at": "2022-04-25T09:30:57Z",
+            "updated_at": r"re:\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\dZ",
             "uploader": "Clover the Clever",
             "uploader_id": 211188,
             "upvotes": int,
diff --git a/gallery_dl/extractor/poipiku.py b/gallery_dl/extractor/poipiku.py
index e1846cc..8203885 100644
--- a/gallery_dl/extractor/poipiku.py
+++ b/gallery_dl/extractor/poipiku.py
@@ -51,13 +51,13 @@ class PoipikuExtractor(Extractor):
                 thumb = extr('class="IllustItemThumbImg" src="', '"')
                 if not thumb:
                     break
-                elif thumb.startswith("/img/"):
+                elif thumb.startswith(("//img.poipiku.com/img/", "/img/")):
                     continue
                 post["num"] += 1
                 url = text.ensure_http_scheme(thumb[:-8])
                 yield Message.Url, url, text.nameext_from_url(url, post)
 
-            if not extr('</i> show all', '<'):
+            if not extr('> show all', '<'):
                 continue
 
             url = self.root + "/f/ShowAppendFileF.jsp"
@@ -131,7 +131,7 @@ class PoipikuPostExtractor(PoipikuExtractor):
     pattern = BASE_PATTERN + r"/(\d+)/(\d+)"
     test = (
        ("https://poipiku.com/25049/5864576.html", {
-            "pattern": r"https://img\.poipiku\.com/user_img03/000025049"
+            "pattern": r"https://img\.poipiku\.com/user_img\d+/000025049"
                        r"/005864576_EWN1Y65gQ\.png$",
             "keyword": {
                 "count": "1",
@@ -146,7 +146,7 @@ class PoipikuPostExtractor(PoipikuExtractor):
             },
         }),
         ("https://poipiku.com/2166245/6411749.html", {
-            "pattern": r"https://img\.poipiku\.com/user_img01/002166245"
+            "pattern": r"https://img\.poipiku\.com/user_img\d+/002166245"
                        r"/006411749_\w+\.jpeg$",
             "count": 4,
             "keyword": {
diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py
index 6dfc907..cd8c238 100644
--- a/gallery_dl/extractor/skeb.py
+++ b/gallery_dl/extractor/skeb.py
@@ -16,13 +16,14 @@ class SkebExtractor(Extractor):
     category = "skeb"
     directory_fmt = ("{category}", "{creator[screen_name]}")
     filename_fmt = "{post_num}_{file_id}.{extension}"
-    archive_fmt = "{post_num}_{file_id}_{content_category}"
+    archive_fmt = "{post_num}_{_file_id}_{content_category}"
     root = "https://skeb.jp"
 
     def __init__(self, match):
         Extractor.__init__(self, match)
         self.user_name = match.group(1)
         self.thumbnails = self.config("thumbnails", False)
+        self.article = self.config("article", False)
 
     def items(self):
         for user_name, post_num in self.posts():
@@ -64,6 +65,7 @@ class SkebExtractor(Extractor):
         resp = self.request(url, headers=headers).json()
         creator = resp["creator"]
         post = {
+            "post_id"          : resp["id"],
             "post_num"         : post_num,
             "post_url"         : self.root + resp["path"],
             "body"             : resp["body"],
@@ -102,12 +104,22 @@ class SkebExtractor(Extractor):
         if self.thumbnails and "og_image_url" in resp:
             post["content_category"] = "thumb"
             post["file_id"] = "thumb"
+            post["_file_id"] = str(resp["id"]) + "t"
             post["file_url"] = resp["og_image_url"]
             yield post
 
+        if self.article and "article_image_url" in resp:
+            url = resp["article_image_url"]
+            if url:
+                post["content_category"] = "article"
+                post["file_id"] = "article"
+                post["_file_id"] = str(resp["id"]) + "a"
+                post["file_url"] = url
+                yield post
+
         for preview in resp["previews"]:
             post["content_category"] = "preview"
-            post["file_id"] = preview["id"]
+            post["file_id"] = post["_file_id"] = preview["id"]
             post["file_url"] = preview["url"]
             info = preview["information"]
             post["original"] = {
diff --git a/gallery_dl/extractor/slideshare.py b/gallery_dl/extractor/slideshare.py
index b0b8f3b..506db26 100644
--- a/gallery_dl/extractor/slideshare.py
+++ b/gallery_dl/extractor/slideshare.py
@@ -59,7 +59,7 @@ class SlidesharePresentationExtractor(GalleryExtractor):
         # mobile URL
         (("https://www.slideshare.net"
           "/mobile/uqudent/introduction-to-fixed-prosthodontics"), {
-            "url": "59993ad7b0cb93c73011547eedcd02c622649e9d",
+            "url": "43eda2adf4dd221a251c8df794dfb82649e94647",
         }),
     )
 
@@ -72,14 +72,14 @@ class SlidesharePresentationExtractor(GalleryExtractor):
     def metadata(self, page):
         extr = text.extract_from(page)
         descr = extr('<meta name="description" content="', '"')
-        title = extr('<span class="j-title-breadcrumb">', '</span>')
-        published = extr('<div class="metadata-item">', '</div>')
         comments = extr('content="UserComments:', '"')
         likes = extr('content="UserLikes:', '"')
         views = extr('content="UserPageVisits:', '"')
+        title = extr('<span class="j-title-breadcrumb">', '</span>')
+        published = extr('<div class="metadata-item">', '</div>')
 
         if descr.endswith("…"):
-            alt_descr = extr('id="slideshow-description-text"', '</p>')
+            alt_descr = extr('slideshow-description-text"', '</p>')
             if alt_descr:
                 descr = text.remove_html(alt_descr.partition(">")[2]).strip()
diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py
index 98e914e..4010da3 100644
--- a/gallery_dl/extractor/smugmug.py
+++ b/gallery_dl/extractor/smugmug.py
@@ -111,13 +111,13 @@ class SmugmugImageExtractor(SmugmugExtractor):
     test = (
         ("https://tdm.smugmug.com/Nature/Dove/i-kCsLJT6", {
             "url": "e6408fd2c64e721fd146130dceb56a971ceb4259",
-            "keyword": "460a773f5addadd3e216bda346fc524fe4eedc52",
+            "keyword": "b31a63d07c9c26eb0f79f52d60d171a98938f99b",
             "content": "ecbd9d7b4f75a637abc8d35319be9ec065a44eb0",
         }),
         # video
         ("https://tstravels.smugmug.com/Dailies/Daily-Dose-2015/i-39JFNzB", {
             "url": "04d0ab1ff829ca7d78f5acb5548953df08e9a5ee",
-            "keyword": "eb74e5cf6780d5152ab8f11b431ec1b17fa8f69b",
+            "keyword": "4cef98133ace511adc874c9d9abac5817ba0d856",
         }),
     )
diff --git a/gallery_dl/extractor/tapas.py b/gallery_dl/extractor/tapas.py
index fcdf18f..545a95b 100644
--- a/gallery_dl/extractor/tapas.py
+++ b/gallery_dl/extractor/tapas.py
@@ -108,7 +108,7 @@ class TapasSeriesExtractor(TapasExtractor):
     test = (
         ("https://tapas.io/series/just-leave-me-be", {
             "pattern": r"https://\w+\.cloudfront\.net/pc/\w\w/[0-9a-f-]+\.jpg",
-            "count": 127,
+            "count": 132,
         }),
         ("https://tapas.io/series/yona", {  # mature
             "count": 26,
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index ded7fd1..b694fa0 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -64,6 +64,7 @@ class TumblrExtractor(Extractor):
         self.inline = self.config("inline", True)
         self.reblogs = self.config("reblogs", True)
         self.external = self.config("external", False)
+        self.original = self.config("original", True)
 
         if len(self.types) == 1:
             self.api.posts_type = next(iter(self.types))
@@ -101,8 +102,7 @@ class TumblrExtractor(Extractor):
                 del post["trail"]
             post["blog"] = blog
             post["date"] = text.parse_timestamp(post["timestamp"])
-            yield Message.Directory, post
-            post["num"] = 0
+            posts = []
 
             if "photos" in post:  # type "photo" or "link"
                 photos = post["photos"]
@@ -110,18 +110,31 @@ class TumblrExtractor(Extractor):
 
                 for photo in photos:
                     post["photo"] = photo
-                    photo.update(photo["original_size"])
+
+                    best_photo = photo["original_size"]
+                    for alt_photo in photo["alt_sizes"]:
+                        if (alt_photo["height"] > best_photo["height"] or
+                                alt_photo["width"] > best_photo["width"]):
+                            best_photo = alt_photo
+                    photo.update(best_photo)
+
+                    if self.original and "/s2048x3072/" in photo["url"] and (
+                            photo["width"] == 2048 or photo["height"] == 3072):
+                        photo["url"] = self._original_image(photo["url"])
+
                     del photo["original_size"]
                     del photo["alt_sizes"]
-                    yield self._prepare_image(photo["url"], post)
+                    posts.append(
+                        self._prepare_image(photo["url"], post.copy()))
+                del post["photo"]
 
             url = post.get("audio_url")  # type "audio"
             if url and url.startswith("https://a.tumblr.com/"):
-                yield self._prepare(url, post)
+                posts.append(self._prepare(url, post.copy()))
 
             url = post.get("video_url")  # type "video"
             if url:
-                yield self._prepare(_original_video(url), post)
+                posts.append(self._prepare(_original_video(url), post.copy()))
 
             if self.inline and "reblog" in post:  # inline media
                 # only "chat" posts are missing a "reblog" key in their
@@ -129,16 +142,25 @@ class TumblrExtractor(Extractor):
                 body = post["reblog"]["comment"] + post["reblog"]["tree_html"]
                 for url in re.findall('<img src="([^"]+)"', body):
                     url = _original_inline_image(url)
-                    yield self._prepare_image(url, post)
+                    posts.append(self._prepare_image(url, post.copy()))
                 for url in re.findall('<source src="([^"]+)"', body):
                     url = _original_video(url)
-                    yield self._prepare(url, post)
+                    posts.append(self._prepare(url, post.copy()))
 
             if self.external:  # external links
-                post["extension"] = None
                 url = post.get("permalink_url") or post.get("url")
                 if url:
-                    yield Message.Queue, url, post
+                    post["extension"] = None
+                    posts.append((Message.Queue, url, post.copy()))
+                    del post["extension"]
+
+            post["count"] = len(posts)
+            yield Message.Directory, post
+
+            for num, (msg, url, post) in enumerate(posts, 1):
+                post["num"] = num
+                post["count"] = len(posts)
+                yield msg, url, post
 
     def posts(self):
         """Return an iterable containing all relevant posts"""
@@ -167,14 +189,12 @@ class TumblrExtractor(Extractor):
     @staticmethod
     def _prepare(url, post):
         text.nameext_from_url(url, post)
-        post["num"] += 1
         post["hash"] = post["filename"].partition("_")[2]
         return Message.Url, url, post
 
     @staticmethod
     def _prepare_image(url, post):
         text.nameext_from_url(url, post)
-        post["num"] += 1
 
         parts = post["filename"].split("_")
         try:
@@ -188,7 +208,7 @@ class TumblrExtractor(Extractor):
     @staticmethod
     def _prepare_avatar(url, post, blog):
         text.nameext_from_url(url, post)
-        post["num"] = 1
+        post["num"] = post["count"] = 1
         post["blog"] = blog
         post["reblogged"] = False
         post["type"] = post["id"] = post["hash"] = "avatar"
@@ -200,6 +220,12 @@ class TumblrExtractor(Extractor):
     def _skip_reblog_same_blog(self, post):
         return self.blog != post.get("reblogged_root_uuid")
 
+    def _original_image(self, url):
+        url = url.replace("/s2048x3072/", "/s99999x99999/", 1)
+        headers = {"Accept": "text/html,*/*;q=0.8"}
+        response = self.request(url, headers=headers)
+        return text.extract(response.text, '" src="', '"')[0]
+
 
 class TumblrUserExtractor(TumblrExtractor):
     """Extractor for all images from a tumblr-user"""
@@ -279,6 +305,12 @@ class TumblrPostExtractor(TumblrExtractor):
         ("https://mikf123.tumblr.com/post/181022380064/chat-post", {
             "count": 0,
         }),
+        ("https://mikf123.tumblr.com/image/689860196535762944", {
+            "pattern": r"^https://\d+\.media\.tumblr\.com"
+                       r"/134791621559a79793563b636b5fe2c6"
+                       r"/8f1131551cef6e74-bc/s99999x99999"
+                       r"/188cf9b8915b0d0911c6c743d152fc62e8f38491\.png$",
+        }),
         ("http://ziemniax.tumblr.com/post/109697912859/", {
             "exception": exception.NotFoundError,  # HTML response (#297)
         }),
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 36b4806..0df4ea2 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -11,6 +11,7 @@ from .common import Extractor, Message
 from .. import text, util, exception
 from ..cache import cache
+import itertools
 import json
 
 BASE_PATTERN = (
@@ -40,7 +41,7 @@ class TwitterExtractor(Extractor):
         self.quoted = self.config("quoted", False)
         self.videos = self.config("videos", True)
         self.cards = self.config("cards", False)
-        self._user_id = None
+        self._user = self._user_obj = None
         self._user_cache = {}
         self._init_sizes()
@@ -90,8 +91,9 @@ class TwitterExtractor(Extractor):
             if "in_reply_to_user_id_str" in data and (
                 not self.replies or (
                     self.replies == "self" and
-                    (self._user_id or data["in_reply_to_user_id_str"]) !=
-                    data["user_id_str"]
+                    data["user_id_str"] !=
+                    (self._user_obj["rest_id"] if self._user else
+                     data["in_reply_to_user_id_str"])
                 )
             ):
                 self.log.debug("Skipping %s (reply)", data["id_str"])
@@ -229,11 +231,13 @@ class TwitterExtractor(Extractor):
                 files.append({"url": url})
 
     def _transform_tweet(self, tweet):
-        if "core" in tweet:
-            user = self._transform_user(
-                tweet["core"]["user_results"]["result"])
+        if "author" in tweet:
+            author = tweet["author"]
+        elif "core" in tweet:
+            author = tweet["core"]["user_results"]["result"]
         else:
-            user = self._transform_user(tweet["user"])
+            author = tweet["user"]
+        author = self._transform_user(author)
 
         if "legacy" in tweet:
             tweet = tweet["legacy"]
@@ -245,12 +249,13 @@ class TwitterExtractor(Extractor):
             "retweet_id"    : text.parse_int(
                 tget("retweeted_status_id_str")),
             "quote_id"      : text.parse_int(
-                tget("quoted_status_id_str")),
+                tget("quoted_by_id_str")),
             "reply_id"      : text.parse_int(
                 tget("in_reply_to_status_id_str")),
             "date"          : text.parse_datetime(
                 tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"),
-            "user"          : user,
+            "user"          : self._user or author,
+            "author"        : author,
             "lang"          : tweet["lang"],
             "favorite_count": tget("favorite_count"),
             "quote_count"   : tget("quote_count"),
@@ -280,13 +285,8 @@ class TwitterExtractor(Extractor):
         if "in_reply_to_screen_name" in tweet:
             tdata["reply_to"] = tweet["in_reply_to_screen_name"]
 
-        if "quoted_by_id_str" in tweet:
-            tdata["quote_by"] = text.parse_int(tweet["quoted_by_id_str"])
-
-        if "author" in tweet:
-            tdata["author"] = self._transform_user(tweet["author"])
-        else:
-            tdata["author"] = tdata["user"]
+        if "quoted_by" in tweet:
+            tdata["quote_by"] = tweet["quoted_by"]
 
         return tdata
@@ -336,6 +336,10 @@ class TwitterExtractor(Extractor):
 
         return udata
 
+    def _assign_user(self, user):
+        self._user_obj = user
+        self._user = self._transform_user(user)
+
     def _users_result(self, users):
         userfmt = self.config("users")
         if not userfmt or userfmt == "timeline":
@@ -455,33 +459,24 @@ class TwitterTimelineExtractor(TwitterExtractor):
         tweet = None
         for tweet in self._select_tweet_source()(self.user):
             yield tweet
-
         if tweet is None:
             return
 
-        # get username
-        if not self.user.startswith("id:"):
-            username = self.user
-        elif "core" in tweet:
-            username = (tweet["core"]["user_results"]["result"]
-                        ["legacy"]["screen_name"])
-        else:
-            username = tweet["user"]["screen_name"]
-
-        # get tweet data
-        if "legacy" in tweet:
-            tweet = tweet["legacy"]
-
         # build search query
-        query = "from:{} max_id:{}".format(username, tweet["id_str"])
+        query = "from:{} max_id:{}".format(
+            self._user["name"], tweet["rest_id"])
         if self.retweets:
             query += " include:retweets include:nativeretweets"
+
         if not self.textonly:
-            query += (" (filter:images OR"
-                      " filter:native_video OR"
-                      " card_name:animated_gif)")
+            # try to search for media-only tweets
+            tweet = None
+            for tweet in self.api.search_adaptive(query + " filter:links"):
+                yield tweet
+            if tweet is not None:
+                return
 
-        # yield search results starting from last tweet id
+        # yield unfiltered search results
        yield from self.api.search_adaptive(query)
 
     def _select_tweet_source(self):
@@ -625,7 +620,25 @@ class TwitterSearchExtractor(TwitterExtractor):
         return {"search": text.unquote(self.user)}
 
     def tweets(self):
-        return self.api.search_adaptive(text.unquote(self.user))
+        query = text.unquote(self.user.replace("+", " "))
+
+        user = None
+        for item in query.split():
+            item = item.strip("()")
+            if item.startswith("from:"):
+                if user:
+                    user = None
+                    break
+                else:
+                    user = item[5:]
+
+        if user is not None:
+            try:
+                self._assign_user(self.api.user_by_screen_name(user))
+            except KeyError:
+                pass
+
+        return self.api.search_adaptive(query)
 
 
 class TwitterEventExtractor(TwitterExtractor):
@@ -693,7 +706,7 @@ class TwitterTweetExtractor(TwitterExtractor):
         }),
         ("https://twitter.com/i/web/status/1424898916156284928", {
             "options": (("replies", "self"),),
-            "count": 0,
+            "count": 1,
         }),
         # "quoted" option (#854)
         ("https://twitter.com/StobiesGalaxy/status/1270755918330896395", {
@@ -777,20 +790,38 @@ class TwitterTweetExtractor(TwitterExtractor):
 
     def tweets(self):
         if self.config("conversations", False):
-            return self.api.tweet_detail(self.tweet_id)
+            return self._tweets_conversation(self.tweet_id)
+        else:
+            return self._tweets_single(self.tweet_id)
 
+    def _tweets_single(self, tweet_id):
         tweets = []
-        tweet_id = self.tweet_id
+
         for tweet in self.api.tweet_detail(tweet_id):
             if tweet["rest_id"] == tweet_id or \
                     tweet.get("_retweet_id_str") == tweet_id:
+                self._assign_user(tweet["core"]["user_results"]["result"])
                 tweets.append(tweet)
 
                 tweet_id = tweet["legacy"].get("quoted_status_id_str")
                 if not tweet_id:
                     break
+
         return tweets
 
+    def _tweets_conversation(self, tweet_id):
+        tweets = self.api.tweet_detail(tweet_id)
+        buffer = []
+
+        for tweet in tweets:
+            buffer.append(tweet)
+            if tweet["rest_id"] == tweet_id or \
+                    tweet.get("_retweet_id_str") == tweet_id:
+                self._assign_user(tweet["core"]["user_results"]["result"])
+                break
+
+        return itertools.chain(buffer, tweets)
+
 
 class TwitterImageExtractor(Extractor):
     category = "twitter"
@@ -888,7 +919,6 @@ class TwitterAPI():
         self._nsfw_warning = True
         self._syndication = extractor.config("syndication")
         self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode
-        self._user = None
 
         cookies = extractor.session.cookies
         cookiedomain = extractor.cookiedomain
@@ -1050,13 +1080,13 @@ class TwitterAPI():
 
     def _user_id_by_screen_name(self, screen_name):
         if screen_name.startswith("id:"):
-            self._user = util.SENTINEL
             user_id = screen_name[3:]
+            user = self.user_by_rest_id(user_id)
         else:
             user = ()
             try:
-                user = self._user = self.user_by_screen_name(screen_name)
+                user = self.user_by_screen_name(screen_name)
                 user_id = user["rest_id"]
             except KeyError:
                 if "unavailable_message" in user:
@@ -1066,7 +1096,7 @@ class TwitterAPI():
                 else:
                     raise exception.NotFoundError("user")
 
-        self.extractor._user_id = user_id
+        self.extractor._assign_user(user)
         return user_id
 
     @cache(maxage=3600)
@@ -1183,7 +1213,7 @@ class TwitterAPI():
             if quoted:
                 quoted = quoted.copy()
                 quoted["author"] = users[quoted["user_id_str"]]
-                quoted["user"] = tweet["user"]
+                quoted["quoted_by"] = tweet["user"]["screen_name"]
                 quoted["quoted_by_id_str"] = tweet["id_str"]
                 yield quoted
 
@@ -1226,17 +1256,10 @@ class TwitterAPI():
                 except LookupError:
                     extr.log.debug(data)
 
-                if self._user:
-                    user = self._user
-                    if user is util.SENTINEL:
-                        try:
-                            user = self.user_by_rest_id(variables["userId"])
-                        except KeyError:
-                            raise exception.NotFoundError("user")
-                        user = user.get("legacy")
-                    if not user:
-                        pass
-                    elif user.get("blocked_by"):
+                user = extr._user_obj
+                if user:
+                    user = user["legacy"]
+                    if user.get("blocked_by"):
                         if self.headers["x-twitter-auth-type"] and \
                                 extr.config("logout"):
                             guest_token = self._guest_token()
@@ -1322,7 +1345,7 @@ class TwitterAPI():
                     try:
                         legacy["retweeted_status_id_str"] = \
                             retweet["rest_id"]
-                        legacy["author"] = \
+                        tweet["author"] = \
                             retweet["core"]["user_results"]["result"]
                         if "extended_entities" in retweet["legacy"] and \
                                 "extended_entities" not in legacy:
@@ -1336,9 +1359,9 @@ class TwitterAPI():
                 if "quoted_status_result" in tweet:
                     try:
                         quoted = tweet["quoted_status_result"]["result"]
-                        quoted["legacy"]["author"] = \
-                            quoted["core"]["user_results"]["result"]
-                        quoted["core"] = tweet["core"]
+                        quoted["legacy"]["quoted_by"] = (
+                            tweet["core"]["user_results"]["result"]
+                            ["legacy"]["screen_name"])
                         quoted["legacy"]["quoted_by_id_str"] = tweet["rest_id"]
                         yield quoted
                     except KeyError:
@@ -1374,10 +1397,14 @@ class TwitterAPI():
             if instr["type"] == "TimelineAddEntries":
                 for entry in instr["entries"]:
                     if entry["entryId"].startswith("user-"):
-                        user = (entry["content"]["itemContent"]
-                                ["user_results"]["result"])
-                        if "rest_id" in user:
-                            yield user
+                        try:
+                            user = (entry["content"]["itemContent"]
+                                    ["user_results"]["result"])
+                        except KeyError:
+                            pass
+                        else:
+                            if "rest_id" in user:
+                                yield user
                     elif entry["entryId"].startswith("cursor-bottom-"):
                         cursor = entry["content"]["value"]
             elif instr["type"] == "TimelineTerminateTimeline":
@@ -1439,6 +1466,6 @@ class TwitterAPI():
         return {
             "rest_id": tweet["id_str"],
             "legacy" : tweet,
-            "user"   : tweet["user"],
+            "core"   : {"user_results": {"result": tweet["user"]}},
             "_retweet_id_str": retweet_id,
         }
diff --git a/gallery_dl/extractor/unsplash.py b/gallery_dl/extractor/unsplash.py
index c29d730..623ed94 100644
--- a/gallery_dl/extractor/unsplash.py
+++ b/gallery_dl/extractor/unsplash.py
@@ -84,7 +84,7 @@ class UnsplashImageExtractor(UnsplashExtractor):
             "blur_hash": "LZP4uQS4jboe%#o0WCa}2doJNaaz",
             "categories": list,
             "color": "#f3c08c",
-            "created_at": "2020-04-08T08:29:42-04:00",
+            "created_at": "2020-04-08T12:29:42Z",
             "date": "dt:2020-04-08 12:29:42",
             "description": "The Island",
             "downloads": int,
@@ -112,7 +112,7 @@ class UnsplashImageExtractor(UnsplashExtractor):
                 },
                 "title": "Beaver Dam, WI 53916, USA"
             },
-            "promoted_at": "2020-04-08T11:12:03-04:00",
+            "promoted_at": "2020-04-08T15:12:03Z",
             "sponsorship": None,
             "tags": list,
             "updated_at": str,
diff --git a/gallery_dl/extractor/vk.py b/gallery_dl/extractor/vk.py
index ab2153f..25b00fe 100644
--- a/gallery_dl/extractor/vk.py
+++ b/gallery_dl/extractor/vk.py
@@ -40,12 +40,17 @@ class VkExtractor(Extractor):
                 continue
 
             try:
+                photo["url"] = photo[size + "src"]
+            except KeyError:
+                self.log.warning("no photo URL found (%s)", photo.get("id"))
+                continue
+
+            try:
                 _, photo["width"], photo["height"] = photo[size]
             except ValueError:
                 # photo without width/height entries (#2535)
                 photo["width"] = photo["height"] = 0
 
-            photo["url"] = photo[size + "src"]
             photo["id"] = photo["id"].rpartition("_")[2]
             photo.update(data)
diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py
index 756384b..668be0f 100644
--- a/gallery_dl/extractor/vsco.py
+++ b/gallery_dl/extractor/vsco.py
@@ -169,7 +169,7 @@ class VscoCollectionExtractor(VscoExtractor):
         return self._pagination(url, params, tkn, "medias", (
             data["medias"]["byId"][mid["id"]]["media"]
             for mid in data
-            ["collections"]["byCollectionId"][cid]["byPage"]["1"]["collection"]
+            ["collections"]["byId"][cid]["1"]["collection"]
         ))
diff --git a/gallery_dl/extractor/wallhaven.py b/gallery_dl/extractor/wallhaven.py
index 37eab24..0ad8523 100644
--- a/gallery_dl/extractor/wallhaven.py
+++ b/gallery_dl/extractor/wallhaven.py
@@ -189,7 +189,7 @@ class WallhavenAPI():
 
     def collections(self, username):
         endpoint = "/v1/collections/" + username
-        return self._pagination(endpoint)
+        return self._pagination(endpoint, metadata=False)
 
     def search(self, params):
         endpoint = "/v1/search"
@@ -200,13 +200,20 @@ class WallhavenAPI():
         return self.extractor.request(
             url, headers=self.headers, params=params).json()
 
-    def _pagination(self, endpoint, params=None):
+    def _pagination(self, endpoint, params=None, metadata=None):
         if params is None:
             params = {}
+        if metadata is None:
+            metadata = self.extractor.config("metadata")
 
         while True:
             data = self._call(endpoint, params)
-            yield from data["data"]
+
+            if metadata:
+                for wp in data["data"]:
+                    yield self.info(str(wp["id"]))
+            else:
+                yield from data["data"]
 
             meta = data.get("meta")
             if not meta or meta["current_page"] >= meta["last_page"]:
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
index bdbdc8c..189c0c5 100644
--- a/gallery_dl/extractor/weibo.py
+++ b/gallery_dl/extractor/weibo.py
@@ -99,13 +99,14 @@ class WeiboExtractor(Extractor):
             else:
                 yield pic["largest"].copy()
 
-        if "page_info" in status:
-            page_info = status["page_info"]
-            if "media_info" not in page_info or not self.videos:
-                return
-            media = max(page_info["media_info"]["playback_list"],
-                        key=lambda m: m["meta"]["quality_index"])
-            yield media["play_info"].copy()
+        if "page_info" in status and self.videos:
+            try:
+                media = max(status["page_info"]["media_info"]["playback_list"],
+                            key=lambda m: m["meta"]["quality_index"])
+            except KeyError:
+                pass
+            else:
+                yield media["play_info"].copy()
 
     def _status_by_id(self, status_id):
         url = "{}/ajax/statuses/show?id={}".format(self.root, status_id)
@@ -147,14 +148,17 @@ class WeiboExtractor(Extractor):
                 return
             yield from statuses
 
-            if "next_cursor" in data:
+            if "next_cursor" in data:  # videos, newvideo
                 params["cursor"] = data["next_cursor"]
-            elif "page" in params:
+            elif "page" in params:  # home, article
                 params["page"] += 1
-            elif data["since_id"]:
+            elif data["since_id"]:  # album
                 params["sinceid"] = data["since_id"]
-            else:
-                params["since_id"] = statuses[-1]["id"] - 1
+            else:  # feed, last album page
+                try:
+                    params["since_id"] = statuses[-1]["id"] - 1
+                except KeyError:
+                    return
 
     def _sina_visitor_system(self, response):
         self.log.info("Sina Visitor System")
@@ -366,6 +370,10 @@ class WeiboStatusExtractor(WeiboExtractor):
             "pattern": r"https://g\.us\.sinaimg.cn/o0/qNZcaAAglx07Wuf921CM0104"
                        r"120005tc0E010\.mp4\?label=gif_mp4",
         }),
+        # missing 'playback_list' (#2792)
+        ("https://weibo.com/2909128931/4409545658754086", {
+            "count": 9,
+        }),
         ("https://m.weibo.cn/status/4339748116375525"),
         ("https://m.weibo.cn/5746766133/4339748116375525"),
     )
diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py
new file mode 100644
index 0000000..2b5acd8
--- /dev/null
+++ b/gallery_dl/extractor/zerochan.py
@@ -0,0 +1,156 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.zerochan.net/"""
+
+from .booru import BooruExtractor
+from ..cache import cache
+from .. import text, exception
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net"
+
+
+class ZerochanExtractor(BooruExtractor):
+    """Base class for zerochan extractors"""
+    category = "zerochan"
+    root = "https://www.zerochan.net"
+    filename_fmt = "{id}.{extension}"
+    archive_fmt = "{id}"
+    cookiedomain = ".zerochan.net"
+    cookienames = ("z_id", "z_hash")
+
+    def login(self):
+        if not self._check_cookies(self.cookienames):
+            username, password = self._get_auth_info()
+            if username:
+                self._update_cookies(self._login_impl(username, password))
+            # force legacy layout
+            self.session.cookies.set("v3", "0", domain=self.cookiedomain)
+
+    @cache(maxage=90*86400, keyarg=1)
+    def _login_impl(self, username, password):
+        self.log.info("Logging in as %s", username)
+
+        url = self.root + "/login"
+        headers = {
+            "Origin"  : self.root,
+            "Referer" : url,
+        }
+        data = {
+            "ref"     : "/",
+            "name"    : username,
+            "password": password,
+            "login"   : "Login",
+        }
+
+        response = self.request(url, method="POST", headers=headers, data=data)
+        if not response.history:
+            raise exception.AuthenticationError()
+
+        return response.cookies
+
+    def _parse_entry_page(self, entry_id):
+        url = "{}/{}".format(self.root, entry_id)
+        extr = text.extract_from(self.request(url).text)
+
+        return {
+            "id"    : entry_id,
+            "author": extr('"author": "', '"'),
+            "file_url": extr('"contentUrl": "', '"'),
+            "date"  : text.parse_datetime(extr(
+                '"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y"),
+            "width" : extr('"width": "', ' '),
+            "height": extr('"height": "', ' '),
+            "size"  : extr('"contentSize": "', 'B'),
+            "path"  : text.split_html(extr(
+                'class="breadcrumbs', '</p>'))[3::2],
+            "tags"  : extr('alt="Tags: ', '"').split(", ")
+        }
+
+
+class ZerochanTagExtractor(ZerochanExtractor):
+    subcategory = "tag"
+    directory_fmt = ("{category}", "{search_tags}")
+    pattern = BASE_PATTERN + r"/(?!\d+$)([^/?#]+)/?(?:\?([^#]+))?"
+    test = ("https://www.zerochan.net/Perth+%28Kantai+Collection%29", {
+        "pattern": r"https://static\.zerochan\.net/.+\.full\.\d+\.(jpg|png)",
+        "count": "> 24",
+        "keywords": {
+            "extension": r"re:jpg|png",
+            "file_url": "",
+            "filename": r"re:Perth.\(Kantai.Collection\).full.\d+",
+            "height": r"re:^\d+$",
+            "id": r"re:^\d+$",
+            "name": "Perth (Kantai Collection)",
+            "search_tags": "Perth (Kantai Collection)",
+            "size": r"re:^\d+k$",
+            "width": r"re:^\d+$",
+        },
+    })
+
+    def __init__(self, match):
+        ZerochanExtractor.__init__(self, match)
+        self.search_tag, self.query = match.groups()
+
+    def metadata(self):
+        return {"search_tags": text.unquote(
+            self.search_tag.replace("+", " "))}
+
+    def posts(self):
+        url = self.root + "/" + self.search_tag
+        params = text.parse_query(self.query)
+        params["p"] = text.parse_int(params.get("p"), 1)
+
+        while True:
+            page = self.request(url, params=params).text
+            thumbs = text.extract(page, '<ul id="thumbs', '</ul>')[0]
+            extr = text.extract_from(thumbs)
+
+            while True:
+                post = extr('<li class="', '>')
+                if not post:
+                    break
+                yield {
+                    "id"    : extr('href="/', '"'),
+                    "name"  : extr('alt="', '"'),
+                    "width" : extr('title="', 'x'),
+                    "height": extr('', ' '),
+                    "size"  : extr('', 'B'),
+                    "file_url": "https://static." + extr(
+                        '<a href="https://static.', '"'),
+                }
+
+            if 'rel="next"' not in page:
+                break
+            params["p"] += 1
+
+
+class ZerochanImageExtractor(ZerochanExtractor):
+    subcategory = "image"
+    pattern = BASE_PATTERN + r"/(\d+)"
+    test = ("https://www.zerochan.net/2920445", {
+        "pattern": r"https://static\.zerochan\.net/"
+                   r"Perth\.%28Kantai\.Collection%29\.full.2920445\.jpg",
+        "keyword": {
+            "author": "YukinoTokisaki",
+            "date": "dt:2020-04-24 21:33:44",
+            "file_url": str,
+            "filename": "Perth.(Kantai.Collection).full.2920445",
+            "height": "1366",
+            "id": "2920445",
+            "size": "1929k",
+            "width": "1920",
+        },
+    })
+
+    def __init__(self, match):
+        ZerochanExtractor.__init__(self, match)
+        self.image_id = match.group(1)
+
+    def posts(self):
+        return (self._parse_entry_page(self.image_id),)
