diff options
Diffstat (limited to 'gallery_dl/extractor/sankaku.py')
| -rw-r--r-- | gallery_dl/extractor/sankaku.py | 210 |
1 files changed, 164 insertions, 46 deletions
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index 438dd9f..9e64eac 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -6,13 +6,15 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://chan.sankakucomplex.com/""" +"""Extractors for https://sankaku.app/""" from .booru import BooruExtractor from .. import text, exception +from ..cache import cache import collections -BASE_PATTERN = r"(?:https?://)?(?:beta|chan)\.sankakucomplex\.com" +BASE_PATTERN = r"(?:https?://)?" \ + r"(?:sankaku\.app|(?:beta|chan)\.sankakucomplex\.com)" class SankakuExtractor(BooruExtractor): @@ -20,8 +22,8 @@ class SankakuExtractor(BooruExtractor): basecategory = "booru" category = "sankaku" filename_fmt = "{category}_{id}_{md5}.{extension}" - request_interval_min = 1.0 - per_page = 100 + cookiedomain = None + _warning = True TAG_TYPES = { 0: "general", @@ -36,17 +38,24 @@ class SankakuExtractor(BooruExtractor): 9: "meta", } - def _prepare_post(self, post, extended_tags=False): + def skip(self, num): + return 0 + + def _file_url(self, post): url = post["file_url"] - if url[0] == "/": - url = self.root + url - if extended_tags: - self._fetch_extended_tags(post) - post["date"] = text.parse_timestamp(post["created_at"]["s"]) - post["tags"] = [tag["name"] for tag in post["tags"]] + if not url and self._warning: + self.log.warning( + "Login required to download 'contentious_content' posts") + SankakuExtractor._warning = False return url - def _fetch_extended_tags(self, post): + @staticmethod + def _prepare(post): + post["created_at"] = post["created_at"]["s"] + post["date"] = text.parse_timestamp(post["created_at"]) + post["tags"] = [tag["name"] for tag in post["tags"]] + + def _extended_tags(self, post): tags = collections.defaultdict(list) types = self.TAG_TYPES for tag in post["tags"]: @@ -54,44 +63,21 @@ class SankakuExtractor(BooruExtractor): for key, value in tags.items(): post["tags_" + key] = value - def _api_request(self, endpoint, params=None): - url = "https://capi-v2.sankakucomplex.com" + endpoint - while True: - response = self.request(url, params=params, fatal=False) - if response.status_code == 429: - self.wait(until=response.headers.get("X-RateLimit-Reset")) - continue - return response.json() - - def _pagination(self, params): - params["lang"] = "en" - params["limit"] = str(self.per_page) - - while True: - data = self._api_request("/posts/keyset", params) - if not data.get("success", True): - raise exception.StopExtraction(data.get("code")) - yield from data["data"] - - params["next"] = data["meta"]["next"] - if not params["next"]: - return - if "page" in params: - del params["page"] - class SankakuTagExtractor(SankakuExtractor): - """Extractor for images from chan.sankakucomplex.com by search-tags""" + """Extractor for images from sankaku.app by search-tags""" subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{id}" pattern = BASE_PATTERN + r"/\?([^#]*)" test = ( - ("https://beta.sankakucomplex.com/?tags=bonocho", { + ("https://sankaku.app/?tags=bonocho", { "count": 5, "pattern": r"https://c?s\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}" r"/[^/]{32}\.\w+\?e=\d+&m=[^&#]+", }), + ("https://beta.sankakucomplex.com/?tags=bonocho"), + ("https://chan.sankakucomplex.com/?tags=bonocho"), # error on five or more tags ("https://chan.sankakucomplex.com/?tags=bonocho+a+b+c+d", { "options": (("username", None),), @@ -111,19 +97,21 @@ class SankakuTagExtractor(SankakuExtractor): return {"search_tags": self.tags} def posts(self): - return self._pagination({"tags": self.tags}) + params = {"tags": self.tags} + return SankakuAPI(self).posts_keyset(params) class SankakuPoolExtractor(SankakuExtractor): - """Extractor for image pools or books from chan.sankakucomplex.com""" + """Extractor for image pools or books from sankaku.app""" subcategory = "pool" directory_fmt = ("{category}", "pool", "{pool[id]} {pool[name_en]}") archive_fmt = "p_{pool}_{id}" pattern = BASE_PATTERN + r"/(?:books|pool/show)/(\d+)" test = ( - ("https://beta.sankakucomplex.com/books/90", { + ("https://sankaku.app/books/90", { "count": 5, }), + ("https://beta.sankakucomplex.com/books/90"), ("https://chan.sankakucomplex.com/pool/show/90"), ) @@ -132,7 +120,7 @@ class SankakuPoolExtractor(SankakuExtractor): self.pool_id = match.group(1) def metadata(self): - pool = self._api_request("/pools/" + self.pool_id) + pool = SankakuAPI(self).pools(self.pool_id) self._posts = pool.pop("posts") return {"pool": pool} @@ -141,12 +129,12 @@ class SankakuPoolExtractor(SankakuExtractor): class SankakuPostExtractor(SankakuExtractor): - """Extractor for single images from chan.sankakucomplex.com""" + """Extractor for single posts from sankaku.app""" subcategory = "post" archive_fmt = "{id}" pattern = BASE_PATTERN + r"/post/show/(\d+)" test = ( - ("https://beta.sankakucomplex.com/post/show/360451", { + ("https://sankaku.app/post/show/360451", { "content": "5e255713cbf0a8e0801dc423563c34d896bb9229", "options": (("tags", True),), "keyword": { @@ -158,6 +146,12 @@ class SankakuPostExtractor(SankakuExtractor): "tags_general" : list, }, }), + # 'contentious_content' + ("https://sankaku.app/post/show/21418978", { + "pattern": r"https://s\.sankakucomplex\.com" + r"/data/13/3c/133cda3bfde249c504284493903fb985\.jpg", + }), + ("https://beta.sankakucomplex.com/post/show/360451"), ("https://chan.sankakucomplex.com/post/show/360451"), ) @@ -166,4 +160,128 @@ class SankakuPostExtractor(SankakuExtractor): self.post_id = match.group(1) def posts(self): - return self._pagination({"tags": "id:" + self.post_id}) + return SankakuAPI(self).posts(self.post_id) + + +class SankakuAPI(): + """Interface for the sankaku.app API""" + + def __init__(self, extractor): + self.extractor = extractor + self.headers = {"Accept": "application/vnd.sankaku.api+json;v=2"} + + self.username, self.password = self.extractor._get_auth_info() + if not self.username: + self.authenticate = lambda: None + + def pools(self, pool_id): + params = {"lang": "en"} + return self._call("/pools/" + pool_id, params) + + def posts(self, post_id): + params = { + "lang" : "en", + "page" : "1", + "limit": "1", + "tags" : "id_range:" + post_id, + } + return self._call("/posts", params) + + def posts_keyset(self, params): + return self._pagination("/posts/keyset", params) + + def authenticate(self): + self.headers["Authorization"] = \ + _authenticate_impl(self.extractor, self.username, self.password) + + def _call(self, endpoint, params=None): + url = "https://capi-v2.sankakucomplex.com" + endpoint + for _ in range(5): + self.authenticate() + response = self.extractor.request( + url, params=params, headers=self.headers, fatal=False) + + if response.status_code == 429: + self.extractor.wait( + until=response.headers.get("X-RateLimit-Reset")) + continue + + data = response.json() + try: + success = data.get("success", True) + except AttributeError: + success = True + if not success: + code = data.get("code") + if code == "invalid_token": + _authenticate_impl.invalidate(self.username) + continue + raise exception.StopExtraction(code) + return data + + def _pagination(self, endpoint, params): + params["lang"] = "en" + params["limit"] = str(self.extractor.per_page) + + while True: + data = self._call(endpoint, params) + yield from data["data"] + + params["next"] = data["meta"]["next"] + if not params["next"]: + return + + +@cache(maxage=365*24*3600, keyarg=1) +def _authenticate_impl(extr, username, password): + extr.log.info("Logging in as %s", username) + headers = {"Accept": "application/vnd.sankaku.api+json;v=2"} + + # get initial access_token + url = "https://login.sankakucomplex.com/auth/token" + data = {"login": username, "password": password} + response = extr.request( + url, method="POST", headers=headers, json=data, fatal=False) + data = response.json() + + if response.status_code >= 400 or not data.get("success"): + raise exception.AuthenticationError(data.get("error")) + access_token = data["access_token"] + + # start openid auth + url = "https://login.sankakucomplex.com/oidc/auth" + params = { + "response_type": "code", + "scope" : "openid", + "client_id" : "sankaku-web-app", + "redirect_uri" : "https://sankaku.app/sso/callback", + "state" : "return_uri=https://sankaku.app/", + "theme" : "black", + "lang" : "undefined", + } + page = extr.request(url, params=params).text + submit_url = text.extract(page, 'submitUrl = "', '"')[0] + + # get code from initial access_token + url = "https://login.sankakucomplex.com" + submit_url + data = { + "accessToken": access_token, + "nonce" : "undefined", + } + response = extr.request(url, method="POST", data=data) + query = text.parse_query(response.request.url.partition("?")[2]) + + # get final access_token from code + url = "https://capi-v2.sankakucomplex.com/sso/finalize?lang=en" + data = { + "code" : query["code"], + "client_id" : "sankaku-web-app", + "redirect_uri": "https://sankaku.app/sso/callback", + } + response = extr.request( + url, method="POST", headers=headers, json=data, fatal=False) + data = response.json() + + if response.status_code >= 400 or not data.get("success"): + raise exception.AuthenticationError(data.get("error")) + return "Bearer " + data["access_token"] |
