Diffstat (limited to 'gallery_dl/extractor')
46 files changed, 1560 insertions, 556 deletions
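One API change in this commit is easy to miss in the noise: common.py turns the method argument of Extractor.request() into a keyword-only parameter (def request(self, url, *, method="GET", ...)), and call sites such as imgbb.py are updated from self.request(endpoint, "POST", data=params) to self.request(endpoint, method="POST", data=params). A minimal sketch of what the new signature enforces — only the signature is taken from the diff below; the body is illustrative:

# Sketch of the keyword-only 'method' parameter introduced in common.py.
class Extractor():
    def request(self, url, *, method="GET", session=None, retries=None,
                encoding=None, fatal=True, notfound=None, **kwargs):
        # The bare '*' forces callers to pass everything after 'url' by
        # name, so a positional call like request(url, "POST") now raises
        # a TypeError instead of silently binding to the wrong parameter.
        ...

# old call style (now rejected):     self.request(endpoint, "POST", data=params)
# new call style (as in imgbb.py):   self.request(endpoint, method="POST", data=params)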
diff --git a/gallery_dl/extractor/2chan.py b/gallery_dl/extractor/2chan.py index 8df8645..33e7929 100644 --- a/gallery_dl/extractor/2chan.py +++ b/gallery_dl/extractor/2chan.py @@ -6,13 +6,13 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from https://www.2chan.net/""" +"""Extractors for https://www.2chan.net/""" from .common import Extractor, Message from .. import text -class FutabaThreadExtractor(Extractor): +class _2chanThreadExtractor(Extractor): """Extractor for images from threads on www.2chan.net""" category = "2chan" subcategory = "thread" diff --git a/gallery_dl/extractor/3dbooru.py b/gallery_dl/extractor/3dbooru.py index 15f4207..febbb51 100644 --- a/gallery_dl/extractor/3dbooru.py +++ b/gallery_dl/extractor/3dbooru.py @@ -11,7 +11,7 @@ from . import booru -class ThreedeebooruExtractor(booru.MoebooruPageMixin, booru.BooruExtractor): +class _3dbooruExtractor(booru.MoebooruPageMixin, booru.BooruExtractor): """Base class for 3dbooru extractors""" category = "3dbooru" api_url = "http://behoimi.org/post/index.json" @@ -26,8 +26,7 @@ class ThreedeebooruExtractor(booru.MoebooruPageMixin, booru.BooruExtractor): }) -class ThreedeebooruTagExtractor(booru.TagMixin, - ThreedeebooruExtractor): +class _3dbooruTagExtractor(booru.TagMixin, _3dbooruExtractor): """Extractor for images from behoimi.org based on search-tags""" pattern = (r"(?:https?://)?(?:www\.)?behoimi\.org/post" r"(?:/(?:index)?)?\?tags=(?P<tags>[^&#]+)") @@ -37,8 +36,7 @@ class ThreedeebooruTagExtractor(booru.TagMixin, }) -class ThreedeebooruPoolExtractor(booru.PoolMixin, - ThreedeebooruExtractor): +class _3dbooruPoolExtractor(booru.PoolMixin, _3dbooruExtractor): """Extractor for image-pools from behoimi.org""" pattern = r"(?:https?://)?(?:www\.)?behoimi\.org/pool/show/(?P<pool>\d+)" test = ("http://behoimi.org/pool/show/27", { @@ -47,8 +45,7 @@ class ThreedeebooruPoolExtractor(booru.PoolMixin, }) -class ThreedeebooruPostExtractor(booru.PostMixin, - ThreedeebooruExtractor): +class _3dbooruPostExtractor(booru.PostMixin, _3dbooruExtractor): """Extractor for single images from behoimi.org""" pattern = r"(?:https?://)?(?:www\.)?behoimi\.org/post/show/(?P<post>\d+)" test = ("http://behoimi.org/post/show/140852", { @@ -64,8 +61,7 @@ class ThreedeebooruPostExtractor(booru.PostMixin, }) -class ThreedeebooruPopularExtractor(booru.MoebooruPopularMixin, - ThreedeebooruExtractor): +class _3dbooruPopularExtractor(booru.MoebooruPopularMixin, _3dbooruExtractor): """Extractor for popular images from behoimi.org""" pattern = (r"(?:https?://)?(?:www\.)?behoimi\.org" r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)" diff --git a/gallery_dl/extractor/4chan.py b/gallery_dl/extractor/4chan.py index e387b33..36a0573 100644 --- a/gallery_dl/extractor/4chan.py +++ b/gallery_dl/extractor/4chan.py @@ -6,15 +6,19 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images and videos from https://www.4chan.org/""" +"""Extractors for https://www.4chan.org/""" -from . import chan +from .common import Extractor, Message from .. 
import text -class FourchanThreadExtractor(chan.ChanThreadExtractor): - """Extractor for images from threads from 4chan.org""" +class _4chanThreadExtractor(Extractor): + """Extractor for 4chan threads""" category = "4chan" + subcategory = "thread" + directory_fmt = ("{category}", "{board}", "{thread} {title}") + filename_fmt = "{tim} {filename}.{extension}" + archive_fmt = "{board}_{thread}_{tim}" pattern = (r"(?:https?://)?boards\.4chan(?:nel)?\.org" r"/([^/]+)/thread/(\d+)") test = ( @@ -28,9 +32,30 @@ class FourchanThreadExtractor(chan.ChanThreadExtractor): "keyword": "7ae2f4049adf0d2f835eb91b6b26b7f4ec882e0a", }), ) - api_url = "https://a.4cdn.org/{board}/thread/{thread}.json" - file_url = "https://i.4cdn.org/{board}/{tim}{ext}" - def update(self, post, data=None): - chan.ChanThreadExtractor.update(self, post, data) - post["filename"] = text.unescape(post["filename"]) + def __init__(self, match): + Extractor.__init__(self, match) + self.board, self.thread = match.groups() + + def items(self): + url = "https://a.4cdn.org/{}/thread/{}.json".format( + self.board, self.thread) + posts = self.request(url).json()["posts"] + title = posts[0].get("sub") or text.remove_html(posts[0]["com"]) + + data = { + "board" : self.board, + "thread": self.thread, + "title" : text.unescape(title)[:50], + } + + yield Message.Version, 1 + yield Message.Directory, data + for post in posts: + if "filename" in post: + post.update(data) + post["extension"] = post["ext"][1:] + post["filename"] = text.unescape(post["filename"]) + url = "https://i.4cdn.org/{}/{}{}".format( + post["board"], post["tim"], post["ext"]) + yield Message.Url, url, post diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py deleted file mode 100644 index e526da3..0000000 --- a/gallery_dl/extractor/8chan.py +++ /dev/null @@ -1,29 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2014-2019 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extract images and videos from https://8ch.net/""" - -from . 
import chan - - -class InfinitychanThreadExtractor(chan.ChanThreadExtractor): - """Extractor for images from threads from 8ch.net""" - category = "8chan" - filename_fmt = "{time}-{filename}{ext}" - pattern = r"(?:https?://)?(?:www\.)?8ch\.net/([^/]+)/res/(\d+)" - test = ("https://8ch.net/builders/res/3.html", { - "url": "5d85c0509f907f217aea379f862b41bf3d01f645", - "keyword": "0c497190c0c0f826925fde09815351d01869c783", - }) - api_url = "https://8ch.net/{board}/res/{thread}.json" - file_url = "https://media.8ch.net/{board}/src/{tim}{ext}" - file_url_v2 = "https://media.8ch.net/file_store/{tim}{ext}" - - def build_url(self, post): - fmt = self.file_url if len(post["tim"]) < 64 else self.file_url_v2 - return fmt.format_map(post) diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 351c5df..b8f74d1 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -15,11 +15,11 @@ modules = [ "3dbooru", "4chan", "500px", - "8chan", "8muses", "adultempire", "artstation", "behance", + "blogger", "bobx", "danbooru", "deviantart", @@ -49,6 +49,7 @@ modules = [ "imgth", "imgur", "instagram", + "issuu", "keenspot", "khinsider", "kissmanga", @@ -66,10 +67,12 @@ modules = [ "mangastream", "mangoxo", "myportfolio", + "naver", "newgrounds", "ngomik", "nhentai", "nijie", + "nozomi", "nsfwalbum", "paheal", "patreon", diff --git a/gallery_dl/extractor/adultempire.py b/gallery_dl/extractor/adultempire.py index 85d8266..8160e48 100644 --- a/gallery_dl/extractor/adultempire.py +++ b/gallery_dl/extractor/adultempire.py @@ -21,12 +21,12 @@ class AdultempireGalleryExtractor(GalleryExtractor): test = ( ("https://www.adultempire.com/5998/gallery.html", { "range": "1", - "keyword": "25c8171f5623678491a0d7bdf38a7a6ebfa4a361", + "keyword": "5b3266e69801db0d78c22181da23bc102886e027", "content": "5c6beb31e5e3cdc90ee5910d5c30f9aaec977b9e", }), ("https://www.adultdvdempire.com/5683/gallery.html", { "url": "b12cd1a65cae8019d837505adb4d6a2c1ed4d70d", - "keyword": "9634eb16cc6dbf347eb9dcdd9b2a499dfd04d167", + "keyword": "8d448d79c4ac5f5b10a3019d5b5129ddb43655e5", }), ) @@ -55,4 +55,4 @@ class AdultempireGalleryExtractor(GalleryExtractor): if len(urls) < 24: return params["page"] += 1 - page = self.request(self.chapter_url, params=params).text + page = self.request(self.gallery_url, params=params).text diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py new file mode 100644 index 0000000..31bbaf8 --- /dev/null +++ b/gallery_dl/extractor/blogger.py @@ -0,0 +1,178 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for Blogger blogs""" + +from .common import Extractor, Message +from .. 
import text +import re + +BASE_PATTERN = ( + r"(?:blogger:(?:https?://)?([^/]+)|" + r"(?:https?://)?([^.]+\.blogspot\.com))") + + +class BloggerExtractor(Extractor): + """Base class for blogger extractors""" + category = "blogger" + directory_fmt = ("{category}", "{blog[name]}", + "{post[date]:%Y-%m-%d} {post[title]}") + filename_fmt = "{num:>03}.{extension}" + archive_fmt = "{post[id]}_{num}" + root = "https://www.blogger.com" + + def __init__(self, match): + Extractor.__init__(self, match) + self.blog = match.group(1) or match.group(2) + self.api = BloggerAPI(self) + + def items(self): + yield Message.Version, 1 + + blog = self.api.blog_by_url("http://" + self.blog) + blog["pages"] = blog["pages"]["totalItems"] + blog["posts"] = blog["posts"]["totalItems"] + blog["date"] = text.parse_datetime(blog["published"]) + del blog["selfLink"] + + sub = re.compile(r"/s\d+/").sub + findall = re.compile( + r'src="(https?://\d+\.bp\.blogspot\.com/[^"]+)"').findall + + for post in self.posts(blog): + images = findall(post["content"]) + if not images: + continue + + post["author"] = post["author"]["displayName"] + post["replies"] = post["replies"]["totalItems"] + post["content"] = text.remove_html(post["content"]) + post["date"] = text.parse_datetime(post["published"]) + del post["selfLink"] + del post["blog"] + + yield Message.Directory, {"blog": blog, "post": post} + for num, url in enumerate(images, 1): + url = sub("/s0/", url).replace("http:", "https:", 1) + yield Message.Url, url, text.nameext_from_url(url, { + "blog": blog, + "post": post, + "url" : url, + "num" : num, + }) + + def posts(self, blog): + """Return an iterable with all relevant post objects""" + + +class BloggerPostExtractor(BloggerExtractor): + """Extractor for a single blog post""" + subcategory = "post" + pattern = BASE_PATTERN + r"(/\d{4}/\d\d/[^/?&#]+\.html)" + test = ( + ("https://julianbphotography.blogspot.com/2010/12/moon-rise.html", { + "url": "9928429fb62f712eb4de80f53625eccecc614aae", + "pattern": r"https://3.bp.blogspot.com/.*/s0/Icy-Moonrise-.*.jpg", + "keyword": { + "blog": { + "date" : "type:datetime", + "description": "", + "id" : "5623928067739466034", + "kind" : "blogger#blog", + "locale" : dict, + "name" : "Julian Bunker Photography", + "pages" : int, + "posts" : int, + "published" : "2010-11-21T10:19:42-08:00", + "updated" : str, + "url" : "http://www.julianbunker.com/", + }, + "post": { + "author" : "Julian Bunker", + "content" : str, + "date" : "type:datetime", + "etag" : str, + "id" : "6955139236418998998", + "kind" : "blogger#post", + "published" : "2010-12-25T17:08:00-08:00", + "replies" : "0", + "title" : "Moon Rise", + "updated" : "2011-12-06T05:21:24-08:00", + "url" : "re:.+/2010/12/moon-rise.html$", + }, + "num": int, + "url": str, + }, + }), + ("blogger:http://www.julianbunker.com/2010/12/moon-rise.html", { + "url": "9928429fb62f712eb4de80f53625eccecc614aae", + }), + ) + + def __init__(self, match): + BloggerExtractor.__init__(self, match) + self.path = match.group(3) + + def posts(self, blog): + return (self.api.post_by_path(blog["id"], self.path),) + + +class BloggerBlogExtractor(BloggerExtractor): + """Extractor for an entire Blogger blog""" + subcategory = "blog" + pattern = BASE_PATTERN + "/?$" + test = ( + ("https://julianbphotography.blogspot.com/", { + "range": "1-25", + "count": 25, + "pattern": r"https://\d\.bp\.blogspot\.com/.*/s0/[^.]+\.jpg", + }), + ("blogger:http://www.julianbunker.com/", { + "range": "1-25", + "count": 25, + }), + ) + + def posts(self, blog): + return 
self.api.blog_posts(blog["id"]) + + +class BloggerAPI(): + """Minimal interface for the Blogger v3 API + + Ref: https://developers.google.com/blogger + """ + API_KEY = "AIzaSyCN9ax34oMMyM07g_M-5pjeDp_312eITK8" + + def __init__(self, extractor): + self.extractor = extractor + self.api_key = extractor.config("api-key", self.API_KEY) + + def blog_by_url(self, url): + return self._call("blogs/byurl", {"url": url}) + + def blog_posts(self, blog_id): + return self._pagination("blogs/{}/posts".format(blog_id), {}) + + def post_by_path(self, blog_id, path): + endpoint = "blogs/{}/posts/bypath".format(blog_id) + return self._call(endpoint, {"path": path}) + + def _call(self, endpoint, params): + url = "https://www.googleapis.com/blogger/v3/" + endpoint + params["key"] = self.api_key + return self.extractor.request(url, params=params).json() + + def _pagination(self, endpoint, params): + while True: + data = self._call(endpoint, params) + yield from data["items"] + + if "nextPageToken" not in data: + return + params["pageToken"] = data["nextPageToken"] diff --git a/gallery_dl/extractor/bobx.py b/gallery_dl/extractor/bobx.py index 67427a7..dba5fe7 100644 --- a/gallery_dl/extractor/bobx.py +++ b/gallery_dl/extractor/bobx.py @@ -94,7 +94,8 @@ class BobxIdolExtractor(BobxExtractor): subcategory = "idol" pattern = r"(?:https?://)?(?:www\.)?bobx\.com/([^/]+/[^/?&#]+)/?$" test = ("http://www.bobx.com/idol/rin-okabe/", { - "url": "74d80bfcd53b738b31909bb42e5cc97c41b475b8", + "pattern": BobxGalleryExtractor.pattern, + "count": ">= 6", }) def items(self): @@ -107,6 +108,5 @@ class BobxIdolExtractor(BobxExtractor): for part in text.extract_iter(page, '="photoset/', '"'): # skip every other entry skip = not skip - if skip: - continue - yield Message.Queue, "{}photoset/{}".format(url, part), data + if not skip: + yield Message.Queue, "{}photoset/{}".format(url, part), data diff --git a/gallery_dl/extractor/chan.py b/gallery_dl/extractor/chan.py deleted file mode 100644 index 5e44fd9..0000000 --- a/gallery_dl/extractor/chan.py +++ /dev/null @@ -1,61 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2015-2019 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Base classes for extractors for different Futaba Channel-like boards""" - -from .common import Extractor, Message -from .. 
import text - - -class ChanThreadExtractor(Extractor): - """Base class for extractors for Futaba Channel-like boards""" - category = "chan" - subcategory = "thread" - directory_fmt = ("{category}", "{board}", "{thread} - {title}") - filename_fmt = "{tim}-{filename}.{extension}" - archive_fmt = "{board}_{thread}_{tim}" - api_url = "" - file_url = "" - - def __init__(self, match): - Extractor.__init__(self, match) - self.metadata = { - "board": match.group(1), - "thread": match.group(2), - } - - def items(self): - yield Message.Version, 1 - url = self.api_url.format_map(self.metadata) - posts = self.request(url).json()["posts"] - self.metadata["title"] = self.get_thread_title(posts[0]) - yield Message.Directory, self.metadata - for post in posts: - if "filename" not in post: - continue - self.update(post) - yield Message.Url, self.build_url(post), post - if "extra_files" in post: - for file in post["extra_files"]: - self.update(post, file) - yield Message.Url, self.build_url(post), post - - def update(self, post, data=None): - """Update keyword dictionary""" - post.update(data or self.metadata) - post["extension"] = post["ext"][1:] - - def build_url(self, post): - """Construct an image url out of a post object""" - return self.file_url.format_map(post) - - @staticmethod - def get_thread_title(post): - """Return thread title from first post""" - title = post["sub"] if "sub" in post else text.remove_html(post["com"]) - return text.unescape(title)[:50] diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index a90af1c..0d258eb 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -18,7 +18,7 @@ import requests import threading import http.cookiejar from .message import Message -from .. import config, text, exception, cloudflare +from .. 
import config, text, util, exception, cloudflare class Extractor(): @@ -37,9 +37,9 @@ class Extractor(): self.session = requests.Session() self.log = logging.getLogger(self.category) self.url = match.string - self._init_headers() - self._init_cookies() - self._init_proxies() + + self._cookiefile = None + self._cookiejar = self.session.cookies self._retries = self.config("retries", 4) self._timeout = self.config("timeout", 30) self._verify = self.config("verify", True) @@ -47,6 +47,10 @@ class Extractor(): if self._retries < 0: self._retries = float("inf") + self._init_headers() + self._init_cookies() + self._init_proxies() + @classmethod def from_url(cls, url): if isinstance(cls.pattern, str): @@ -67,7 +71,7 @@ class Extractor(): return config.interpolate( ("extractor", self.category, self.subcategory, key), default) - def request(self, url, method="GET", *, session=None, retries=None, + def request(self, url, *, method="GET", session=None, retries=None, encoding=None, fatal=True, notfound=None, **kwargs): tries = 1 retries = self._retries if retries is None else retries @@ -110,7 +114,7 @@ class Extractor(): msg = "" self.log.warning("Cloudflare CAPTCHA" + msg) - msg = "{}: {} for url: {}".format(code, response.reason, url) + msg = "'{} {}' for '{}'".format(code, response.reason, url) if code < 500 and code != 429 and code != 430: break @@ -141,7 +145,7 @@ class Extractor(): return username, password def _init_headers(self): - """Set additional headers for the 'session' object""" + """Initialize HTTP headers for the 'session' object""" headers = self.session.headers headers.clear() @@ -174,26 +178,43 @@ class Extractor(): if cookies: if isinstance(cookies, dict): self._update_cookies_dict(cookies, self.cookiedomain) - else: + elif isinstance(cookies, str): + cookiefile = util.expand_path(cookies) cookiejar = http.cookiejar.MozillaCookieJar() try: - cookiejar.load(cookies) + cookiejar.load(cookiefile) except OSError as exc: self.log.warning("cookies: %s", exc) else: - self.session.cookies.update(cookiejar) + self._cookiejar.update(cookiejar) + self._cookiefile = cookiefile + else: + self.log.warning( + "expected 'dict' or 'str' value for 'cookies' option, " + "got '%s' (%s)", cookies.__class__.__name__, cookies) cookies = cloudflare.cookies(self.category) if cookies: domain, cookies = cookies self._update_cookies_dict(cookies, domain) + def _store_cookies(self): + """Store the session's cookiejar in a cookies.txt file""" + if self._cookiefile and self.config("cookies-update", False): + cookiejar = http.cookiejar.MozillaCookieJar() + for cookie in self._cookiejar: + cookiejar.set_cookie(cookie) + try: + cookiejar.save(self._cookiefile) + except OSError as exc: + self.log.warning("cookies: %s", exc) + def _update_cookies(self, cookies, *, domain=""): """Update the session's cookiejar with 'cookies'""" if isinstance(cookies, dict): self._update_cookies_dict(cookies, domain or self.cookiedomain) else: - setcookie = self.session.cookies.set_cookie + setcookie = self._cookiejar.set_cookie try: cookies = iter(cookies) except TypeError: @@ -204,17 +225,17 @@ class Extractor(): def _update_cookies_dict(self, cookiedict, domain): """Update cookiejar with name-value pairs from a dict""" - setcookie = self.session.cookies.set + setcookie = self._cookiejar.set for name, value in cookiedict.items(): setcookie(name, value, domain=domain) - def _check_cookies(self, cookienames, *, domain=""): + def _check_cookies(self, cookienames, *, domain=None): """Check if all 'cookienames' are in the session's 
cookiejar""" - if not domain: + if domain is None: domain = self.cookiedomain try: for name in cookienames: - self.session.cookies._find(name, domain) + self._cookiejar._find(name, domain) except KeyError: return False return True @@ -249,24 +270,21 @@ class Extractor(): yield test -class ChapterExtractor(Extractor): +class GalleryExtractor(Extractor): - subcategory = "chapter" - directory_fmt = ( - "{category}", "{manga}", - "{volume:?v/ />02}c{chapter:>03}{chapter_minor:?//}{title:?: //}") - filename_fmt = ( - "{manga}_c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}") - archive_fmt = ( - "{manga}_{chapter}{chapter_minor}_{page}") + subcategory = "gallery" + filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}" + directory_fmt = ("{category}", "{gallery_id} {title}") + archive_fmt = "{gallery_id}_{num}" + enum = "num" def __init__(self, match, url=None): Extractor.__init__(self, match) - self.chapter_url = url or self.root + match.group(1) + self.gallery_url = self.root + match.group(1) if url is None else url def items(self): self.login() - page = self.request(self.chapter_url).text + page = self.request(self.gallery_url).text data = self.metadata(page) imgs = self.images(page) @@ -284,7 +302,7 @@ class ChapterExtractor(Extractor): yield Message.Version, 1 yield Message.Directory, data - for data["page"], (url, imgdata) in images: + for data[self.enum], (url, imgdata) in images: if imgdata: data.update(imgdata) yield Message.Url, url, text.nameext_from_url(url, data) @@ -299,6 +317,19 @@ class ChapterExtractor(Extractor): """Return a list of all (image-url, metadata)-tuples""" +class ChapterExtractor(GalleryExtractor): + + subcategory = "chapter" + directory_fmt = ( + "{category}", "{manga}", + "{volume:?v/ />02}c{chapter:>03}{chapter_minor:?//}{title:?: //}") + filename_fmt = ( + "{manga}_c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}") + archive_fmt = ( + "{manga}_{chapter}{chapter_minor}_{page}") + enum = "page" + + class MangaExtractor(Extractor): subcategory = "manga" @@ -333,14 +364,6 @@ class MangaExtractor(Extractor): """Return a list of all (chapter-url, metadata)-tuples""" -class GalleryExtractor(ChapterExtractor): - - subcategory = "gallery" - filename_fmt = "{category}_{gallery_id}_{page:>03}.{extension}" - directory_fmt = ("{category}", "{gallery_id} {title}") - archive_fmt = "{gallery_id}_{page}" - - class AsynchronousMixin(): """Run info extraction in a separate thread""" diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index ab32a00..eeee74a 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -58,9 +58,12 @@ class DeviantartExtractor(Extractor): def items(self): if self.user: - self.group = not self.api.user_profile(self.user) + profile = self.api.user_profile(self.user) + self.group = not profile if self.group: self.subcategory = "group-" + self.subcategory + else: + self.user = profile["user"]["username"] yield Message.Version, 1 for deviation in self.deviations(): @@ -260,11 +263,53 @@ class DeviantartExtractor(Extractor): content.update(download) +class DeviantartUserExtractor(Extractor): + """Extractor for an artist's user profile""" + category = "deviantart" + subcategory = "user" + pattern = BASE_PATTERN + r"/?$" + test = ( + ("https://www.deviantart.com/shimoda7", { + "options": (("include", "gsjf"),), + "pattern": r"/shimoda7/(gallery(/scraps)?|posts|favourites)", + "count": 4, + }), + ("https://shimoda7.deviantart.com/"), + ) + + def __init__(self, match): + 
Extractor.__init__(self, match) + self.user = match.group(1) or match.group(2) + + incl = self.config("include") or "g" + if isinstance(incl, list): + incl = "".join(item[0] for item in incl if item) + self.include = incl.lower() + + def items(self): + base = "https://www.deviantart.com/{}/".format(self.user) + incl = self.include + data = {} + + if "g" in incl: + data["_extractor"] = DeviantartGalleryExtractor + yield Message.Queue, base + "gallery", data + if "s" in incl: + data["_extractor"] = DeviantartScrapsExtractor + yield Message.Queue, base + "gallery/scraps", data + if "j" in incl: + data["_extractor"] = DeviantartJournalExtractor + yield Message.Queue, base + "posts", data + if "f" in incl: + data["_extractor"] = DeviantartFavoriteExtractor + yield Message.Queue, base + "favourites", data + + class DeviantartGalleryExtractor(DeviantartExtractor): """Extractor for all deviations from an artist's gallery""" subcategory = "gallery" archive_fmt = "g_{username}_{index}.{extension}" - pattern = BASE_PATTERN + r"(?:/(?:gallery/?(?:\?catpath=/)?)?)?$" + pattern = BASE_PATTERN + r"/gallery(?:/all|/?\?catpath=)?/?$" test = ( ("https://www.deviantart.com/shimoda7/gallery/", { "pattern": r"https://(www.deviantart.com/download/\d+/" @@ -315,12 +360,12 @@ class DeviantartGalleryExtractor(DeviantartExtractor): }, }), # group - ("https://www.deviantart.com/yakuzafc", { + ("https://www.deviantart.com/yakuzafc/gallery", { "pattern": r"https://www.deviantart.com/yakuzafc/gallery/0/", "count": ">= 15", }), # 'folders' option (#276) - ("https://www.deviantart.com/justatest235723", { + ("https://www.deviantart.com/justatest235723/gallery", { "count": 3, "options": (("metadata", 1), ("folders", 1), ("original", 0)), "keyword": { @@ -334,10 +379,12 @@ class DeviantartGalleryExtractor(DeviantartExtractor): ("https://www.deviantart.com/shimoda8/gallery/", { "exception": exception.NotFoundError, }), - # old-style URLs + + ("https://www.deviantart.com/shimoda7/gallery"), + ("https://www.deviantart.com/shimoda7/gallery/all"), ("https://www.deviantart.com/shimoda7/gallery/?catpath=/"), ("https://shimoda7.deviantart.com/gallery/"), - ("https://yakuzafc.deviantart.com/"), + ("https://shimoda7.deviantart.com/gallery/all/"), ("https://shimoda7.deviantart.com/gallery/?catpath=/"), ) @@ -794,6 +841,14 @@ class DeviantartScrapsExtractor(DeviantartExtractorV2): ) def deviations(self): + # copy self.session + session = self.session.__class__() + for attr in session.__attrs__: + setattr(session, attr, getattr(self.session, attr, None)) + + # reset cookies in the original session object + self.session.cookies = session.cookies.__class__() + url = self.root + "/_napi/da-user-profile/api/gallery/contents" params = { "username" : self.user, @@ -806,7 +861,8 @@ class DeviantartScrapsExtractor(DeviantartExtractorV2): } while True: - data = self.request(url, params=params, headers=headers).json() + data = self.request( + url, session=session, params=params, headers=headers).json() for obj in data["results"]: yield obj["deviation"] @@ -974,11 +1030,12 @@ class DeviantartAPI(): auth = (self.client_id, self.client_secret) response = self.extractor.request( - url, method="POST", data=data, auth=auth) + url, method="POST", data=data, auth=auth, fatal=False) data = response.json() if response.status_code != 200: - raise exception.AuthenticationError('"{} ({})"'.format( + self.log.debug("Server response: %s", data) + raise exception.AuthenticationError('"{}" ({})'.format( data.get("error_description"), data.get("error"))) if 
refresh_token: _refresh_token_cache.update(refresh_token, data["refresh_token"]) diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 75e19d6..cba9627 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -69,8 +69,7 @@ class ExhentaiExtractor(Extractor): def login(self): """Login and set necessary cookies""" if self.LIMIT: - self.log.error("Image limit reached!") - raise exception.StopExtraction() + raise exception.StopExtraction("Image limit reached!") if self._check_cookies(self.cookienames): return username, password = self._get_auth_info() @@ -235,9 +234,9 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): url = iurl data = self._parse_image_info(url) except IndexError: - self.log.error("Unable to parse image info for '%s'", url) self.log.debug("Page content:\n%s", page) - raise exception.StopExtraction() + raise exception.StopExtraction( + "Unable to parse image info for '%s'", url) data["num"] = self.image_num data["image_token"] = self.key["start"] = extr('var startkey="', '";') @@ -272,9 +271,9 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): url = imgurl data = self._parse_image_info(url) except IndexError: - self.log.error("Unable to parse image info for '%s'", url) self.log.debug("Page content:\n%s", page) - raise exception.StopExtraction() + raise exception.StopExtraction( + "Unable to parse image info for '%s'", url) data["num"] = request["page"] data["image_token"] = imgkey @@ -311,12 +310,12 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): self._remaining -= data["cost"] if self._remaining <= 0: + ExhentaiExtractor.LIMIT = True url = "{}/s/{}/{}-{}".format( self.root, data["image_token"], self.gallery_id, data["num"]) - self.log.error("Image limit reached! Continue with " - "'%s' as URL after resetting it.", url) - ExhentaiExtractor.LIMIT = True - raise exception.StopExtraction() + raise exception.StopExtraction( + "Image limit reached! 
Continue with '%s' " + "as URL after resetting it.", url) def _update_limits(self): url = "https://e-hentai.org/home.php" diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py index 73b8ec4..b71fc4d 100644 --- a/gallery_dl/extractor/flickr.py +++ b/gallery_dl/extractor/flickr.py @@ -423,14 +423,15 @@ class FlickrAPI(oauth.OAuth1API): params["api_key"] = self.api_key data = self.request(self.API_URL, params=params).json() if "code" in data: + msg = data.get("message") + self.log.debug("Server response: %s", data) if data["code"] == 1: raise exception.NotFoundError(self.extractor.subcategory) elif data["code"] == 98: - raise exception.AuthenticationError(data.get("message")) + raise exception.AuthenticationError(msg) elif data["code"] == 99: - raise exception.AuthorizationError() - self.log.error("API call failed: %s", data.get("message")) - raise exception.StopExtraction() + raise exception.AuthorizationError(msg) + raise exception.StopExtraction("API request failed: %s", msg) return data def _pagination(self, method, params, key="photos"): diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index 5f4c5b8..645b53a 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -78,7 +78,7 @@ class FoolfuukaThreadExtractor(SharedConfigMixin, Extractor): EXTRACTORS = { "4plebs": { - "name": "fourplebs", + "name": "_4plebs", "root": "https://archive.4plebs.org", "pattern": r"(?:archive\.)?4plebs\.org", "test-thread": ("https://archive.4plebs.org/tg/thread/54059290", { diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py index 14baa36..fc7dbf9 100644 --- a/gallery_dl/extractor/foolslide.py +++ b/gallery_dl/extractor/foolslide.py @@ -44,14 +44,13 @@ class FoolslideBase(SharedConfigMixin): class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor): """Base class for chapter extractors for FoOlSlide based sites""" - directory_fmt = ( - "{category}", "{manga}", "{chapter_string}") + directory_fmt = ("{category}", "{manga}", "{chapter_string}") archive_fmt = "{id}" pattern_fmt = r"(/read/[^/?&#]+/[a-z-]+/\d+/\d+(?:/\d+)?)" decode = "default" def items(self): - page = self.request(self.chapter_url).text + page = self.request(self.gallery_url).text data = self.metadata(page) imgs = self.images(page) @@ -77,7 +76,7 @@ class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor): def metadata(self, page): extr = text.extract_from(page) extr('<h1 class="tbtitle dnone">', '') - return self.parse_chapter_url(self.chapter_url, { + return self.parse_chapter_url(self.gallery_url, { "manga" : text.unescape(extr('title="', '"')).strip(), "chapter_string": text.unescape(extr('title="', '"')), }) diff --git a/gallery_dl/extractor/fuskator.py b/gallery_dl/extractor/fuskator.py index dbcf2f2..eba1c39 100644 --- a/gallery_dl/extractor/fuskator.py +++ b/gallery_dl/extractor/fuskator.py @@ -42,7 +42,7 @@ class FuskatorGalleryExtractor(GalleryExtractor): def metadata(self, page): headers = { - "Referer" : self.chapter_url, + "Referer" : self.gallery_url, "X-Requested-With": "XMLHttpRequest", } auth = self.request( diff --git a/gallery_dl/extractor/hbrowse.py b/gallery_dl/extractor/hbrowse.py index 01793dc..43479c6 100644 --- a/gallery_dl/extractor/hbrowse.py +++ b/gallery_dl/extractor/hbrowse.py @@ -29,8 +29,7 @@ class HbrowseBase(): if not data["manga"] and "<b>Warning</b>" in page: msg = page.rpartition(">")[2].strip() - self.log.error("Site is not accessible: '%s'", msg) - raise 
exception.StopExtraction() + raise exception.StopExtraction("Site is not accessible: '%s'", msg) tags = text.extract(page, 'class="listTable"', '</table>', pos)[0] diff --git a/gallery_dl/extractor/hentaicafe.py b/gallery_dl/extractor/hentaicafe.py index 161073b..1ab71d6 100644 --- a/gallery_dl/extractor/hentaicafe.py +++ b/gallery_dl/extractor/hentaicafe.py @@ -31,10 +31,10 @@ class HentaicafeChapterExtractor(foolslide.FoolslideChapterExtractor): info = text.unescape(text.extract(page, '<title>', '</title>')[0]) manga, _, chapter_string = info.partition(" :: ") - data = self._data(self.chapter_url.split("/")[5]) + data = self._data(self.gallery_url.split("/")[5]) data["manga"] = manga data["chapter_string"] = chapter_string.rstrip(" :") - return self.parse_chapter_url(self.chapter_url, data) + return self.parse_chapter_url(self.gallery_url, data) @memcache(keyarg=1) def _data(self, manga): diff --git a/gallery_dl/extractor/hentaifox.py b/gallery_dl/extractor/hentaifox.py index cf4871f..7e0b63c 100644 --- a/gallery_dl/extractor/hentaifox.py +++ b/gallery_dl/extractor/hentaifox.py @@ -24,7 +24,7 @@ class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor): test = ("https://hentaifox.com/gallery/56622/", { "pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg", "count": 24, - "keyword": "38f8517605feb6854d48833297da6b05c6541b69", + "keyword": "903ebe227d85e484460382fc6cbab42be7a244d5", }) def __init__(self, match): diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py index d875817..9e2ee9f 100644 --- a/gallery_dl/extractor/hentainexus.py +++ b/gallery_dl/extractor/hentainexus.py @@ -22,7 +22,7 @@ class HentainexusGalleryExtractor(GalleryExtractor): test = ( ("https://hentainexus.com/view/5688", { "url": "746d0043e20030f1171aae5ea113176607302517", - "keyword": "b05986369fbaf29cfa08b118960d92c49e59524b", + "keyword": "9512cf5f258130e5f75de9954d7a13217c2405e7", }), ("https://hentainexus.com/read/5688"), ) diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index ef08d69..e53b051 100644 --- a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -11,17 +11,20 @@ from .common import GalleryExtractor from .. 
import text, util import string +import json class HitomiGalleryExtractor(GalleryExtractor): """Extractor for image galleries from hitomi.la""" category = "hitomi" root = "https://hitomi.la" - pattern = r"(?:https?://)?hitomi\.la/(?:galleries|reader)/(\d+)" + pattern = (r"(?:https?://)?hitomi\.la" + r"/(?:manga|doujinshi|cg|gamecg|galleries|reader)" + r"/(?:[^/?&#]+-)?(\d+)") test = ( ("https://hitomi.la/galleries/867789.html", { "pattern": r"https://aa.hitomi.la/galleries/867789/\d+.jpg", - "keyword": "d097a8db8e810045131b4510c41714004f9eff3a", + "keyword": "6701f8f588f119ef84cd29bdf99a399417b0a6a2", "count": 16, }), ("https://hitomi.la/galleries/1401410.html", { @@ -39,6 +42,11 @@ class HitomiGalleryExtractor(GalleryExtractor): "url": "055c898a36389719799d6bce76889cc4ea4421fc", "count": 1413, }), + ("https://hitomi.la/manga/amazon-no-hiyaku-867789.html"), + ("https://hitomi.la/manga/867789.html"), + ("https://hitomi.la/doujinshi/867789.html"), + ("https://hitomi.la/cg/867789.html"), + ("https://hitomi.la/gamecg/867789.html"), ("https://hitomi.la/reader/867789.html"), ) @@ -54,6 +62,11 @@ class HitomiGalleryExtractor(GalleryExtractor): self.fallback = True url = url.replace("/galleries/", "/reader/") response = GalleryExtractor.request(self, url, **kwargs) + elif b"<title>Redirect</title>" in response.content: + url = text.extract(response.text, "href='", "'")[0] + if not url.startswith("http"): + url = text.urljoin(self.root, url) + response = self.request(url, **kwargs) return response def metadata(self, page): @@ -86,25 +99,19 @@ class HitomiGalleryExtractor(GalleryExtractor): # see https://ltn.hitomi.la/common.js offset = text.parse_int(self.gallery_id[-1]) % 3 subdomain = chr(97 + offset) + "a" - base = "https://" + subdomain + ".hitomi.la/galleries/" + base = "https://{}.hitomi.la/galleries/{}/".format( + subdomain, self.gallery_id) # set Referer header before image downloads (#239) - self.session.headers["Referer"] = self.chapter_url - - # handle Game CG galleries with scenes (#321) - scenes = text.extract(page, "var scene_indexes = [", "]")[0] - if scenes and scenes.strip(): - url = "{}/reader/{}.html".format(self.root, self.gallery_id) - page = self.request(url).text - begin, end = ">//g.hitomi.la/galleries/", "</div>" - elif self.fallback: - begin, end = ">//g.hitomi.la/galleries/", "</div>" - else: - begin, end = "'//tn.hitomi.la/smalltn/", ".jpg'," + self.session.headers["Referer"] = self.gallery_url + + # get 'galleryinfo' + url = "https://ltn.hitomi.la/galleries/{}.js".format(self.gallery_id) + page = self.request(url).text return [ - (base + urlpart, None) - for urlpart in text.extract_iter(page, begin, end) + (base + image["name"], None) + for image in json.loads(page.partition("=")[2]) ] @staticmethod diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py index 2a8dcad..fb321d0 100644 --- a/gallery_dl/extractor/imgbb.py +++ b/gallery_dl/extractor/imgbb.py @@ -90,7 +90,7 @@ class ImgbbExtractor(Extractor): return params["seek"] = data["seekEnd"] params["page"] += 1 - data = self.request(endpoint, "POST", data=params).json() + data = self.request(endpoint, method="POST", data=params).json() page = data["html"] diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index cb36c30..b1be995 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -10,8 +10,6 @@ from .common import Extractor, Message from .. 
import text, exception -import itertools -import json BASE_PATTERN = r"(?:https?://)?(?:www\.|[im]\.)?imgur\.com" @@ -21,103 +19,89 @@ class ImgurExtractor(Extractor): """Base class for imgur extractors""" category = "imgur" root = "https://imgur.com" - api_root = "https://api.imgur.com" def __init__(self, match): Extractor.__init__(self, match) + self.api = ImgurAPI(self) self.key = match.group(1) self.mp4 = self.config("mp4", True) - def _extract_data(self, path): - response = self.request(self.root + path, notfound=self.subcategory) - data = json.loads(text.extract( - response.text, "image : ", ",\n")[0]) + def _prepare(self, image): try: - del data["adConfig"] - del data["isAd"] + del image["ad_url"] + del image["ad_type"] + del image["ad_config"] except KeyError: pass - return data - def _prepare(self, image): - image["ext"] = image["ext"].partition("?")[0] - if image["ext"] == ".gif" and ( - (self.mp4 and image["prefer_video"]) or self.mp4 == "always"): - image["ext"] = ".mp4" - url = "https://i.imgur.com/" + image["hash"] + image["ext"] - image["extension"] = image["ext"][1:] + url = image["mp4"] if image["animated"] and self.mp4 else image["link"] + image["date"] = text.parse_timestamp(image["datetime"]) + text.nameext_from_url(url, image) + return url - def _items_apiv3(self, urlfmt): + def _items_queue(self, items): album_ex = ImgurAlbumExtractor image_ex = ImgurImageExtractor - params = { - "IMGURPLATFORM" : "web", - "album_previews": "0", - "client_id" : "546c25a59c58ad7", - } - headers = { - "Origin" : self.root, - "Referer": self.root + "/", - } - yield Message.Version, 1 - - for num in itertools.count(0): - url = urlfmt.format(num) - data = self.request(url, params=params, headers=headers).json() - - for item in data["data"]: - item["_extractor"] = album_ex if item["is_album"] else image_ex - yield Message.Queue, item["link"], item - - if len(data["data"]) < 60: - return + for item in items: + item["_extractor"] = album_ex if item["is_album"] else image_ex + yield Message.Queue, item["link"], item class ImgurImageExtractor(ImgurExtractor): """Extractor for individual images on imgur.com""" subcategory = "image" - filename_fmt = "{category}_{hash}{title:?_//}.{extension}" - archive_fmt = "{hash}" + filename_fmt = "{category}_{id}{title:?_//}.{extension}" + archive_fmt = "{id}" pattern = BASE_PATTERN + r"/(?!gallery)(\w{7}|\w{5})[sbtmlh]?\.?" 
test = ( ("https://imgur.com/21yMxCS", { "url": "6f2dcfb86815bdd72808c313e5f715610bc7b9b2", "content": "0c8768055e4e20e7c7259608b67799171b691140", "keyword": { - "animated": False, - "datetime": "2016-11-10 14:24:35", - "description": str, - "ext": ".png", - "extension": "png", - "hash": "21yMxCS", - "height": "32", - "is_moderated": False, - "is_safe": False, - "is_viral": 0, - "looping": False, - "mimetype": "image/png", - "name": None, - "prefer_video": False, - "size": 182, - "source": "", - "title": "Test", - "video_host": None, - "video_source": None, - "width": "64", + "account_id" : None, + "account_url" : None, + "animated" : False, + "bandwidth" : int, + "date" : "type:datetime", + "datetime" : 1478787875, + "description" : None, + "edited" : "0", + "extension" : "png", + "favorite" : False, + "filename" : "21yMxCS", + "has_sound" : False, + "height" : 32, + "id" : "21yMxCS", + "in_gallery" : False, + "in_most_viral": False, + "is_ad" : False, + "link" : "https://i.imgur.com/21yMxCS.png", + "nsfw" : False, + "section" : None, + "size" : 182, + "tags" : [], + "title" : "Test", + "type" : "image/png", + "views" : int, + "vote" : None, + "width" : 64, }, }), ("http://imgur.com/0gybAXR", { # gifv/mp4 video "url": "a2220eb265a55b0c95e0d3d721ec7665460e3fd7", "content": "a3c080e43f58f55243ab830569ba02309d59abfc", }), + ("https://imgur.com/XFfsmuC", { # missing title in API response (#467) + "keyword": {"title": "Tears are a natural response to irritants"}, + }), ("https://imgur.com/HjoXJAd", { # url ends with '.jpg?1' - "url": "73f361b50753ab25da64160aa50bc5d139480d45", + "url": "ec2cf11a2bfb4939feff374781a6e6f3e9af8e8e", }), ("https://imgur.com/zzzzzzz", { # not found - "exception": exception.NotFoundError, + "exception": exception.HttpError, }), ("https://www.imgur.com/21yMxCS"), # www ("https://m.imgur.com/21yMxCS"), # mobile @@ -129,7 +113,11 @@ class ImgurImageExtractor(ImgurExtractor): ) def items(self): - image = self._extract_data("/" + self.key) + image = self.api.image(self.key) + if not image["title"]: + page = self.request(self.root + "/" + self.key, fatal=False).text + title = text.extract(page, "<title>", "<")[0] + image["title"] = (title or "").rpartition(" - ")[0].strip() url = self._prepare(image) yield Message.Version, 1 yield Message.Directory, image @@ -139,42 +127,67 @@ class ImgurImageExtractor(ImgurExtractor): class ImgurAlbumExtractor(ImgurExtractor): """Extractor for imgur albums""" subcategory = "album" - directory_fmt = ("{category}", "{album[hash]}{album[title]:? - //}") - filename_fmt = "{category}_{album[hash]}_{num:>03}_{hash}.{extension}" - archive_fmt = "{album[hash]}_{hash}" + directory_fmt = ("{category}", "{album[id]}{album[title]:? 
- //}") + filename_fmt = "{category}_{album[id]}_{num:>03}_{id}.{extension}" + archive_fmt = "{album[id]}_{id}" pattern = BASE_PATTERN + r"/(?:a|t/unmuted)/(\w{7}|\w{5})" test = ( ("https://imgur.com/a/TcBmP", { "url": "ce3552f550a5b5316bd9c7ae02e21e39f30c0563", "keyword": { "album": { - "album_cover": "693j2Kr", - "album_description": None, - "cover": "693j2Kr", - "datetime": "2015-10-09 10:37:50", - "description": None, - "hash": "TcBmP", - "id": "TcBmP", - "is_album": True, - "num_images": "19", - "title": "138", - "title_clean": "TcBmP", - "views": str, + "account_id" : None, + "account_url" : None, + "cover" : "693j2Kr", + "cover_edited": None, + "cover_height": 1400, + "cover_width" : 951, + "date" : "type:datetime", + "datetime" : 1444387070, + "description" : None, + "favorite" : False, + "id" : "TcBmP", + "images_count": 19, + "in_gallery" : False, + "is_ad" : False, + "is_album" : True, + "layout" : "blog", + "link" : "https://imgur.com/a/TcBmP", + "nsfw" : False, + "privacy" : "hidden", + "section" : None, + "title" : "138", + "views" : int, }, - "animated": bool, - "datetime": str, - "extension": str, - "hash": str, - "height": int, - "num": int, - "prefer_video": bool, - "size": int, - "title": str, - "width": int, + "account_id" : None, + "account_url": None, + "animated" : bool, + "bandwidth" : int, + "date" : "type:datetime", + "datetime" : int, + "description": None, + "edited" : "0", + "favorite" : False, + "has_sound" : False, + "height" : int, + "id" : str, + "in_gallery" : False, + "is_ad" : False, + "link" : r"re:https://i\.imgur\.com/\w+\.jpg", + "nsfw" : None, + "num" : int, + "section" : None, + "size" : int, + "tags" : list, + "title" : None, + "type" : "image/jpeg", + "views" : int, + "vote" : None, + "width" : int, }, }), ("https://imgur.com/a/eD9CT", { # large album - "url": "4ee94de31ff26be416271bc0b1ea27b9349c9937", + "url": "de748c181a04d18bef1de9d4f4866ef0a06d632b", }), ("https://imgur.com/a/RhJXhVT/all", { # 7 character album hash "url": "695ef0c950023362a0163ee5041796300db76674", @@ -183,21 +196,22 @@ class ImgurAlbumExtractor(ImgurExtractor): "url": "86b4747f8147cec7602f0214e267309af73a8655", }), ("https://imgur.com/a/TcBmQ", { - "exception": exception.NotFoundError, + "exception": exception.HttpError, }), ("https://www.imgur.com/a/TcBmP"), # www ("https://m.imgur.com/a/TcBmP"), # mobile ) def items(self): - album = self._extract_data("/a/" + self.key + "/all") - images = album["album_images"]["images"] - del album["album_images"] + album = self.api.album(self.key) + album["date"] = text.parse_timestamp(album["datetime"]) + images = album["images"] - if int(album["num_images"]) > len(images): - url = "{}/ajaxalbums/getimages/{}/hit.json".format( - self.root, self.key) - images = self.request(url).json()["data"]["images"] + try: + del album["images"] + del album["ad_config"] + except KeyError: + pass yield Message.Version, 1 yield Message.Directory, {"album": album, "count": len(images)} @@ -224,13 +238,11 @@ class ImgurGalleryExtractor(ImgurExtractor): def items(self): url = self.root + "/a/" + self.key with self.request(url, method="HEAD", fatal=False) as response: - code = response.status_code - - if code < 400: - extr = ImgurAlbumExtractor - else: - extr = ImgurImageExtractor - url = self.root + "/" + self.key + if response.status_code < 400: + extr = ImgurAlbumExtractor + else: + extr = ImgurImageExtractor + url = self.root + "/" + self.key yield Message.Version, 1 yield Message.Queue, url, {"_extractor": extr} @@ -251,9 +263,7 @@ class 
ImgurUserExtractor(ImgurExtractor): ) def items(self): - urlfmt = "{}/3/account/{}/submissions/{{}}/newest".format( - self.api_root, self.key) - return self._items_apiv3(urlfmt) + return self._items_queue(self.api.account_submissions(self.key)) class ImgurFavoriteExtractor(ImgurExtractor): @@ -267,6 +277,43 @@ class ImgurFavoriteExtractor(ImgurExtractor): }) def items(self): - urlfmt = "{}/3/account/{}/gallery_favorites/{{}}/newest".format( - self.api_root, self.key) - return self._items_apiv3(urlfmt) + return self._items_queue(self.api.account_favorites(self.key)) + + +class ImgurAPI(): + + def __init__(self, extractor): + self.extractor = extractor + self.headers = { + "Authorization": "Client-ID " + extractor.config( + "client-id", "546c25a59c58ad7"), + } + + def account_favorites(self, account): + endpoint = "account/{}/gallery_favorites".format(account) + return self._pagination(endpoint) + + def account_submissions(self, account): + endpoint = "account/{}/submissions".format(account) + return self._pagination(endpoint) + + def album(self, album_hash): + return self._call("album/" + album_hash) + + def image(self, image_hash): + return self._call("image/" + image_hash) + + def _call(self, endpoint): + return self.extractor.request( + "https://api.imgur.com/3/" + endpoint, headers=self.headers, + ).json()["data"] + + def _pagination(self, endpoint): + num = 0 + + while True: + data = self._call("{}/{}".format(endpoint, num)) + if not data: + return + yield from data + num += 1 diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 8eee390..a14225f 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -36,17 +36,13 @@ class InstagramExtractor(Extractor): data.update(metadata) yield Message.Directory, data - if data['typename'] in ('GraphImage', 'GraphStoryImage', 'GraphStoryVideo'): - yield Message.Url, data['display_url'], \ - text.nameext_from_url(data['display_url'], data) - elif data['typename'] == 'GraphVideo': - data["extension"] = None - yield Message.Url, \ - 'ytdl:{}/p/{}/'.format(self.root, data['shortcode']), data - elif data['typename'] == 'GraphHighlightReel': + if data['typename'] == 'GraphHighlightReel': url = '{}/stories/highlights/{}/'.format(self.root, data['id']) data['_extractor'] = InstagramStoriesExtractor yield Message.Queue, url, data + else: + url = data['video_url'] or data['display_url'] + yield Message.Url, url, text.nameext_from_url(url, data) def login(self): if self._check_cookies(self.cookienames): @@ -101,12 +97,20 @@ class InstagramExtractor(Extractor): def _extract_shared_data(self, url): page = self.request(url).text - data = text.extract(page, 'window._sharedData = ', ';</script>')[0] - return json.loads(data) + shared_data, pos = text.extract( + page, 'window._sharedData =', ';</script>') + additional_data, pos = text.extract( + page, 'window.__additionalDataLoaded(', ');</script>', pos) + + data = json.loads(shared_data) + if additional_data: + next(iter(data['entry_data'].values()))[0] = \ + json.loads(additional_data.partition(',')[2]) + return data def _extract_postpage(self, url): - shared_data = self._extract_shared_data(url) - media = shared_data['entry_data']['PostPage'][0]['graphql']['shortcode_media'] + data = self.request(url + "?__a=1").json() + media = data['graphql']['shortcode_media'] common = { 'date': text.parse_timestamp(media['taken_at_timestamp']), @@ -122,7 +126,6 @@ class InstagramExtractor(Extractor): medias = [] if media['__typename'] == 'GraphSidecar': - 
yi = 0 for n in media['edge_sidecar_to_children']['edges']: children = n['node'] media_data = { @@ -130,14 +133,12 @@ class InstagramExtractor(Extractor): 'shortcode': children['shortcode'], 'typename': children['__typename'], 'display_url': children['display_url'], + 'video_url': children.get('video_url'), 'height': text.parse_int(children['dimensions']['height']), 'width': text.parse_int(children['dimensions']['width']), 'sidecar_media_id': media['id'], 'sidecar_shortcode': media['shortcode'], } - if children['__typename'] == 'GraphVideo': - media_data['_ytdl_index'] = yi - yi += 1 media_data.update(common) medias.append(media_data) @@ -147,6 +148,7 @@ class InstagramExtractor(Extractor): 'shortcode': media['shortcode'], 'typename': media['__typename'], 'display_url': media['display_url'], + 'video_url': media.get('video_url'), 'height': text.parse_int(media['dimensions']['height']), 'width': text.parse_int(media['dimensions']['width']), } @@ -318,7 +320,7 @@ class InstagramImageExtractor(InstagramExtractor): # GraphVideo ("https://www.instagram.com/p/Bqxp0VSBgJg/", { - "url": "8f38c1cf460c9804842f7306c487410f33f82e7e", + "pattern": r"/47129943_191645575115739_8539303288426725376_n\.mp4", "keyword": { "date": "type:datetime", "description": str, @@ -334,7 +336,7 @@ class InstagramImageExtractor(InstagramExtractor): # GraphVideo (IGTV) ("https://www.instagram.com/tv/BkQjCfsBIzi/", { - "url": "64208f408e11cbbca86c2df4488e90262ae9d9ec", + "pattern": r"/10000000_1760663964018792_716207142595461120_n\.mp4", "keyword": { "date": "type:datetime", "description": str, @@ -351,11 +353,10 @@ class InstagramImageExtractor(InstagramExtractor): # GraphSidecar with 2 embedded GraphVideo objects ("https://www.instagram.com/p/BtOvDOfhvRr/", { "count": 2, - "url": "e290d4180a58ae50c910d51d3b04d5f5c4622cd7", "keyword": { "sidecar_media_id": "1967717017113261163", "sidecar_shortcode": "BtOvDOfhvRr", - "_ytdl_index": int, + "video_url": str, } }) ) diff --git a/gallery_dl/extractor/issuu.py b/gallery_dl/extractor/issuu.py new file mode 100644 index 0000000..12d7487 --- /dev/null +++ b/gallery_dl/extractor/issuu.py @@ -0,0 +1,109 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://issuu.com/""" + +from .common import GalleryExtractor, Extractor, Message +from .. 
import text, util +import json + + +class IssuuBase(): + """Base class for issuu extractors""" + category = "issuu" + root = "https://issuu.com" + + +class IssuuPublicationExtractor(IssuuBase, GalleryExtractor): + """Extractor for a single publication""" + subcategory = "publication" + directory_fmt = ("{category}", "{document[userName]}", + "{document[originalPublishDate]} {document[title]}") + filename_fmt = "{num:>03}.{extension}" + archive_fmt = "{document[id]}_{num}" + pattern = r"(?:https?://)?issuu\.com(/[^/?&#]+/docs/[^/?&#]+)" + test = ("https://issuu.com/issuu/docs/motions-1-2019/", { + "pattern": r"https://image.isu.pub/190916155301-\w+/jpg/page_\d+.jpg", + "count" : 36, + "keyword": { + "document": { + "access" : "public", + "contentRating": dict, + "date" : "type:datetime", + "description" : "re:Motions, the brand new publication by Is", + "documentId" : r"re:\d+-d99ec95935f15091b040cb8060f05510", + "documentName" : "motions-1-2019", + "downloadState": "NOT_AVAILABLE", + "id" : r"re:\d+-d99ec95935f15091b040cb8060f05510", + "isConverting" : False, + "isQuarantined": False, + "lang" : "en", + "language" : "English", + "pageCount" : 36, + "publicationId": "d99ec95935f15091b040cb8060f05510", + "sections" : list, + "title" : "Motions by Issuu - Issue 1", + "userName" : "issuu", + }, + "extension": "jpg", + "filename" : r"re:page_\d+", + "num" : int, + }, + }) + + def metadata(self, page): + data = json.loads(text.extract( + page, 'window.__INITIAL_STATE__ =', ';\n')[0]) + + doc = data["document"] + doc["lang"] = doc["language"] + doc["language"] = util.code_to_language(doc["language"]) + doc["date"] = text.parse_datetime( + doc["originalPublishDate"], "%Y-%m-%d") + + self._cnt = text.parse_int(doc["pageCount"]) + self._tpl = "https://{}/{}/jpg/page_{{}}.jpg".format( + data["config"]["hosts"]["image"], doc["id"]) + + return {"document": doc} + + def images(self, page): + fmt = self._tpl.format + return [(fmt(i), None) for i in range(1, self._cnt + 1)] + + +class IssuuUserExtractor(IssuuBase, Extractor): + """Extractor for all publications of a user/publisher""" + subcategory = "user" + pattern = r"(?:https?://)?issuu\.com/([^/?&#]+)/?$" + test = ("https://issuu.com/issuu", { + "pattern": IssuuPublicationExtractor.pattern, + "count" : "> 25", + }) + + def __init__(self, match): + Extractor.__init__(self, match) + self.user = match.group(1) + + def items(self): + url = "{}/call/profile/v1/documents/{}".format(self.root, self.user) + params = {"offset": 0, "limit": "25"} + + yield Message.Version, 1 + while True: + data = self.request(url, params=params).json() + + for publication in data["items"]: + publication["url"] = "{}/{}/docs/{}".format( + self.root, self.user, publication["uri"]) + publication["_extractor"] = IssuuPublicationExtractor + yield Message.Queue, publication["url"], publication + + if not data["hasMore"]: + return + params["offset"] += data["limit"] diff --git a/gallery_dl/extractor/kissmanga.py b/gallery_dl/extractor/kissmanga.py index 6314a94..bb89f93 100644 --- a/gallery_dl/extractor/kissmanga.py +++ b/gallery_dl/extractor/kissmanga.py @@ -33,10 +33,9 @@ class RedirectMixin(): except (EOFError, OSError): pass else: - self.log.error( + raise exception.StopExtraction( "Redirect to \n%s\nVisit this URL in your browser and " "solve the CAPTCHA to continue", response.url) - raise exception.StopExtraction() class KissmangaBase(RedirectMixin): diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py index 965daa0..0aeeb4a 100644 --- 
a/gallery_dl/extractor/luscious.py +++ b/gallery_dl/extractor/luscious.py @@ -6,75 +6,109 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://luscious.net/""" +"""Extractors for https://members.luscious.net/""" -from .common import GalleryExtractor, Extractor, Message +from .common import Extractor, Message from .. import text, exception -from ..cache import cache -class LusciousBase(Extractor): +class LusciousExtractor(Extractor): """Base class for luscious extractors""" category = "luscious" cookiedomain = ".luscious.net" root = "https://members.luscious.net" - def login(self): - """Login and set necessary cookies""" - username, password = self._get_auth_info() - if username: - self._update_cookies(self._login_impl(username, password)) - - @cache(maxage=14*24*3600, keyarg=1) - def _login_impl(self, username, password): - self.log.info("Logging in as %s", username) - url = "https://members.luscious.net/accounts/login/" - headers = {"Referer": "https://members.luscious.net/login/"} + def _graphql(self, op, variables, query): data = { - "login": username, - "password": password, - "remember": "on", - "next": "/", + "id" : 1, + "operationName": op, + "query" : query, + "variables" : variables, } + response = self.request( + "{}/graphql/nobatch/?operationName={}".format(self.root, op), + method="POST", json=data, fatal=False, + ) - response = self.request(url, method="POST", headers=headers, data=data) - if "/accounts/login/" in response.url or not response.history: - raise exception.AuthenticationError() - for cookie in response.history[0].cookies: - if cookie.name.startswith("sessionid_"): - return {cookie.name: cookie.value} - raise exception.AuthenticationError() + if response.status_code >= 400: + self.log.debug("Server response: %s", response.text) + raise exception.StopExtraction( + "GraphQL query failed ('%s %s')", + response.status_code, response.reason) - @staticmethod - def _parse_tags(tags): - return [ - text.unescape(tag.replace(":_", ":")) - for tag in text.extract_iter(tags or "", "/tags/", "/") - ] + return response.json()["data"] -class LusciousAlbumExtractor(LusciousBase, GalleryExtractor): +class LusciousAlbumExtractor(LusciousExtractor): """Extractor for image albums from luscious.net""" subcategory = "album" - archive_fmt = "{gallery_id}_{image_id}" + filename_fmt = "{category}_{album[id]}_{num:>03}.{extension}" + directory_fmt = ("{category}", "{album[id]} {album[title]}") + archive_fmt = "{album[id]}_{id}" pattern = (r"(?:https?://)?(?:www\.|members\.)?luscious\.net" - r"/(?:albums|pictures/c/[^/?&#]+/album)/([^/?&#]+_(\d+))") + r"/(?:albums|pictures/c/[^/?&#]+/album)/[^/?&#]+_(\d+)") test = ( ("https://luscious.net/albums/okinami-no-koigokoro_277031/", { "url": "7e4984a271a1072ac6483e4228a045895aff86f3", - "keyword": "07c0b915f2ab1cc3bbf28b76e7950fccee1213f3", - "content": "b3a747a6464509440bd0ff6d1267e6959f8d6ff3", + # "content": "b3a747a6464509440bd0ff6d1267e6959f8d6ff3", + "keyword": { + "album": { + "__typename" : "Album", + "audiences" : list, + "content" : "Hentai", + "cover" : "re:https://cdnio.luscious.net/.+/277031/", + "created" : 1479625853, + "created_by" : "NTRshouldbeillegal", + "date" : "type:datetime", + "description" : "Enjoy.", + "download_url": "/download/824778/277031/", + "genres" : list, + "id" : 277031, + "is_manga" : True, + "labels" : list, + "language" : "English", + "like_status" : "none", + "modified" : int, + "permissions" : list, + "rating" : float, 
+ "slug" : "okinami-no-koigokoro", + "status" : "not_moderated", + "tags" : list, + "title" : "Okinami no Koigokoro", + "url" : "/albums/okinami-no-koigokoro_277031/", + "marked_for_deletion": False, + "marked_for_processing": False, + "number_of_animated_pictures": 0, + "number_of_favorites": int, + "number_of_pictures": 18, + }, + "aspect_ratio": r"re:\d+:\d+", + "category" : "luscious", + "created" : int, + "date" : "type:datetime", + "height" : int, + "id" : int, + "is_animated" : False, + "like_status" : "none", + "position" : int, + "resolution" : r"re:\d+x\d+", + "status" : "not_moderated", + "tags" : list, + "thumbnail" : str, + "title" : str, + "width" : int, + "number_of_comments": int, + "number_of_favorites": int, + }, }), ("https://luscious.net/albums/virgin-killer-sweater_282582/", { "url": "21cc68a7548f4d71dfd67d8caf96349dde7e791c", - "keyword": "e1202078b504adeccd521aa932f456a5a85479a0", }), ("https://luscious.net/albums/not-found_277035/", { "exception": exception.NotFoundError, }), ("https://members.luscious.net/albums/login-required_323871/", { - "options": (("username", None),), - "exception": exception.HttpError, + "count": 78, }), ("https://www.luscious.net/albums/okinami_277031/"), ("https://members.luscious.net/albums/okinami_277031/"), @@ -83,126 +117,340 @@ class LusciousAlbumExtractor(LusciousBase, GalleryExtractor): ) def __init__(self, match): - path, self.gallery_id = match.groups() - url = "{}/albums/{}/".format(self.root, path) - GalleryExtractor.__init__(self, match, url) + LusciousExtractor.__init__(self, match) + self.album_id = match.group(1) - def metadata(self, page): - title, pos = text.extract(page, '"og:title" content="', '"') + def items(self): + album = self.metadata() + yield Message.Version, 1 + yield Message.Directory, {"album": album} + for num, image in enumerate(self.images(), 1): + image["num"] = num + image["album"] = album + + image["thumbnail"] = image.pop("thumbnails")[0]["url"] + image["tags"] = [item["text"] for item in image["tags"]] + image["date"] = text.parse_timestamp(image["created"]) + image["id"] = text.parse_int(image["id"]) + + url = image["url_to_video"] or image["url_to_original"] + yield Message.Url, url, text.nameext_from_url(url, image) + + def metadata(self): + variables = { + "id": self.album_id, + } - if title is None: - msg = text.extract(page, '<div class="content">', '</div>', pos)[0] - if msg: - raise exception.AuthorizationError(msg) + query = """ +query AlbumGet($id: ID!) { + album { + get(id: $id) { + ... on Album { + ...AlbumStandard + } + ... 
on MutationError { + errors { + code + message + } + } + } + } +} + +fragment AlbumStandard on Album { + __typename + id + title + labels + description + created + modified + like_status + number_of_favorites + rating + status + marked_for_deletion + marked_for_processing + number_of_pictures + number_of_animated_pictures + slug + is_manga + url + download_url + permissions + cover { + width + height + size + url + } + created_by { + id + name + display_name + user_title + avatar { + url + size + } + url + } + content { + id + title + url + } + language { + id + title + url + } + tags { + id + category + text + url + count + } + genres { + id + title + slug + url + } + audiences { + id + title + url + url + } + last_viewed_picture { + id + position + url + } +} +""" + album = self._graphql("AlbumGet", variables, query)["album"]["get"] + if "errors" in album: raise exception.NotFoundError("album") - info , pos = text.extract(page, '<li class="user_info">', "", pos) - if info is None: - count, pos = text.extract(page, '>Pages:', '<', pos) - else: - count, pos = text.extract(page, '<p>', ' ', pos) - genre, pos = text.extract(page, '<p>Genre:', '</p>', pos) - adnce, pos = text.extract(page, '<p>Audience:', '</p>', pos) - tags , pos = text.extract(page, '"tag_list static">', '</ol>', pos) - - return { - "gallery_id": text.parse_int(self.gallery_id), - "title" : text.unescape(title or ""), - "count" : text.parse_int(count), - "genre" : text.remove_html(genre), - "audience" : text.remove_html(adnce), - "tags" : self._parse_tags(tags), + album["audiences"] = [item["title"] for item in album["audiences"]] + album["genres"] = [item["title"] for item in album["genres"]] + album["tags"] = [item["text"] for item in album["tags"]] + + album["cover"] = album["cover"]["url"] + album["content"] = album["content"]["title"] + album["language"] = album["language"]["title"].partition(" ")[0] + album["created_by"] = album["created_by"]["display_name"] + + album["id"] = text.parse_int(album["id"]) + album["date"] = text.parse_timestamp(album["created"]) + + return album + + def images(self): + variables = { + "input": { + "filters": [{ + "name" : "album_id", + "value": self.album_id, + }], + "display": "position", + "page" : 1, + }, } - def images(self, page): - extr = text.extract - - url = "{}/pictures/album/x_{}/sorted/old/page/1/".format( - self.root, self.gallery_id) - page = self.request(url).text - pos = page.find('<div id="picture_page_') - url = extr(page, '<a href="', '"', pos)[0] - iurl = None - - while url and not url.endswith("/more_like_this/"): - page = self.request(self.root + url).text - - if not iurl: # first loop iteraton - current = extr(page, '"pj_current_page" value="', '"')[0] - if current and current != "1": - url = "{}/albums/{}/jump_to_page/1/".format( - self.root, self.gallery_id) - page = self.request(url, method="POST").text - - iid , pos = extr(url , '/id/', '/') - url , pos = extr(page, '<link rel="next" href="', '"') - name, pos = extr(page, '<h1 id="picture_title">', '</h1>', pos) - _ , pos = extr(page, '<ul class="image_option_icons">', '', pos) - iurl, pos = extr(page, '<li><a href="', '"', pos+100) - - if iurl[0] == "/": - iurl = text.urljoin(self.root, iurl) - - yield iurl, { - "name": name, - "image_id": text.parse_int(iid), + query = """ +query AlbumListOwnPictures($input: PictureListInput!) 
{ + picture { + list(input: $input) { + info { + ...FacetCollectionInfo + } + items { + ...PictureStandardWithoutAlbum } + } + } +} + +fragment FacetCollectionInfo on FacetCollectionInfo { + page + has_next_page + has_previous_page + total_items + total_pages + items_per_page + url_complete + url_filters_only +} + +fragment PictureStandardWithoutAlbum on Picture { + __typename + id + title + created + like_status + number_of_comments + number_of_favorites + status + width + height + resolution + aspect_ratio + url_to_original + url_to_video + is_animated + position + tags { + id + category + text + url + } + permissions + url + thumbnails { + width + height + size + url + } +} +""" + while True: + data = self._graphql("AlbumListOwnPictures", variables, query) + yield from data["picture"]["list"]["items"] + + if not data["picture"]["list"]["info"]["has_next_page"]: + return + variables["input"]["page"] += 1 -class LusciousSearchExtractor(LusciousBase, Extractor): +class LusciousSearchExtractor(LusciousExtractor): """Extractor for album searches on luscious.net""" subcategory = "search" pattern = (r"(?:https?://)?(?:www\.|members\.)?luscious\.net" - r"/(albums(?:/(?![^/?&#]+_\d+)[^/?&#]+)+|manga|pictures)/?$") + r"/albums/list/?(?:\?([^#]+))?") test = ( - ("https://luscious.net/manga/"), - ("https://members.luscious.net/albums/sorted/updated/album_type/manga" - "/content_id/2/tagged/+full_color/page/1/", { + ("https://members.luscious.net/albums/list/"), + ("https://members.luscious.net/albums/list/" + "?display=date_newest&language_ids=%2B1&tagged=+full_color&page=1", { "pattern": LusciousAlbumExtractor.pattern, - "range": "20-40", - "count": 21, + "range": "41-60", + "count": 20, }), ) def __init__(self, match): - Extractor.__init__(self, match) - self.path = match.group(1).partition("/page/")[0] - if not self.path.startswith("albums/"): - self.path = "albums/sorted/updated/album_type/" + self.path + LusciousExtractor.__init__(self, match) + self.query = match.group(1) def items(self): - self.login() - yield Message.Version, 1 - for album in self.albums(): - url, data = self.parse_album(album) - yield Message.Queue, url, data + query = text.parse_query(self.query) + display = query.pop("display", "date_newest") + page = query.pop("page", None) + + variables = { + "input": { + "display": display, + "filters": [{"name": n, "value": v} for n, v in query.items()], + "page": text.parse_int(page, 1), + }, + } - def albums(self, pnum=1): + query = """ +query AlbumListWithPeek($input: AlbumListInput!) 
{ + album { + list(input: $input) { + info { + ...FacetCollectionInfo + } + items { + ...AlbumMinimal + peek_thumbnails { + width + height + size + url + } + } + } + } +} + +fragment FacetCollectionInfo on FacetCollectionInfo { + page + has_next_page + has_previous_page + total_items + total_pages + items_per_page + url_complete + url_filters_only +} + +fragment AlbumMinimal on Album { + __typename + id + title + labels + description + created + modified + number_of_favorites + number_of_pictures + slug + is_manga + url + download_url + cover { + width + height + size + url + } + content { + id + title + url + } + language { + id + title + url + } + tags { + id + category + text + url + count + } + genres { + id + title + slug + url + } + audiences { + id + title + url + } +} +""" + yield Message.Version, 1 while True: - url = "{}/{}/page/{}/.json/".format(self.root, self.path, pnum) - data = self.request(url).json() + data = self._graphql("AlbumListWithPeek", variables, query) - yield from text.extract_iter( - data["html"], "<figcaption>", "</figcaption>") + for album in data["album"]["list"]["items"]: + album["url"] = self.root + album["url"] + album["_extractor"] = LusciousAlbumExtractor + yield Message.Queue, album["url"], album - if data["paginator_complete"]: + if not data["album"]["list"]["info"]["has_next_page"]: return - pnum += 1 - - def parse_album(self, album): - url , pos = text.extract(album, 'href="', '"') - title, pos = text.extract(album, ">", "<", pos) - count, pos = text.extract(album, "# of pictures:", "<", pos) - date , pos = text.extract(album, "Updated: ", "<", pos) - desc , pos = text.extract(album, "class='desc'>", "<", pos) - tags , pos = text.extract(album, "<ol ", "</ol>", pos) - - return text.urljoin(self.root, url), { - "title": text.unescape(title or ""), - "description": text.unescape(desc or ""), - "gallery_id": text.parse_int(url.rpartition("_")[2].rstrip("/")), - "count": text.parse_int(count), - "date": date, - "tags": self._parse_tags(tags), - "_extractor": LusciousAlbumExtractor, - } + variables["input"]["page"] += 1 diff --git a/gallery_dl/extractor/naver.py b/gallery_dl/extractor/naver.py new file mode 100644 index 0000000..c980a38 --- /dev/null +++ b/gallery_dl/extractor/naver.py @@ -0,0 +1,140 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://blog.naver.com/""" + +from .common import GalleryExtractor, Extractor, Message +from .. 
import text + + +class NaverBase(): + """Base class for naver extractors""" + category = "naver" + root = "https://blog.naver.com" + + +class NaverPostExtractor(NaverBase, GalleryExtractor): + """Extractor for blog posts on blog.naver.com""" + subcategory = "post" + filename_fmt = "{num:>03}.{extension}" + directory_fmt = ("{category}", "{blog[user]} {blog[id]}", + "{post[date]:%Y-%m-%d} {post[title]}") + archive_fmt = "{blog[id]}_{post[num]}_{num}" + pattern = (r"(?:https?://)?blog\.naver\.com/" + r"(?:PostView\.nhn\?blogId=(\w+)&logNo=(\d+)|(\w+)/(\d+)/?$)") + test = ( + ("https://blog.naver.com/rlfqjxm0/221430673006", { + "url": "6c694f3aced075ed5e9511f1e796d14cb26619cc", + "keyword": "a6e23d19afbee86b37d6e7ad934650c379d2cb1e", + }), + (("https://blog.naver.com/PostView.nhn" + "?blogId=rlfqjxm0&logNo=221430673006"), { + "url": "6c694f3aced075ed5e9511f1e796d14cb26619cc", + "keyword": "a6e23d19afbee86b37d6e7ad934650c379d2cb1e", + }), + ) + + def __init__(self, match): + blog_id = match.group(1) + if blog_id: + self.blog_id = blog_id + self.post_id = match.group(2) + else: + self.blog_id = match.group(3) + self.post_id = match.group(4) + + url = "{}/PostView.nhn?blogId={}&logNo={}".format( + self.root, self.blog_id, self.post_id) + GalleryExtractor.__init__(self, match, url) + + def metadata(self, page): + extr = text.extract_from(page) + data = { + "post": { + "title" : extr('"og:title" content="', '"'), + "description": extr('"og:description" content="', '"'), + "num" : text.parse_int(self.post_id), + }, + "blog": { + "id" : self.blog_id, + "num" : text.parse_int(extr("var blogNo = '", "'")), + "user" : extr("var nickName = '", "'"), + }, + } + data["post"]["date"] = text.parse_datetime( + extr('se_publishDate pcol2">', '<') or + extr('_postAddDate">', '<'), "%Y. %m. %d. 
%H:%M") + return data + + def images(self, page): + return [ + (url.replace("://post", "://blog", 1).partition("?")[0], None) + for url in text.extract_iter(page, 'data-lazy-src="', '"') + ] + + +class NaverBlogExtractor(NaverBase, Extractor): + """Extractor for a user's blog on blog.naver.com""" + subcategory = "blog" + pattern = (r"(?:https?://)?blog\.naver\.com/" + r"(?:PostList.nhn\?(?:[^&#]+&)*blogId=([^&#]+)|(\w+)/?$)") + test = ( + ("https://blog.naver.com/gukjung", { + "pattern": NaverPostExtractor.pattern, + "count": 12, + "range": "1-12", + }), + ("https://blog.naver.com/PostList.nhn?blogId=gukjung", { + "pattern": NaverPostExtractor.pattern, + "count": 12, + "range": "1-12", + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.blog_id = match.group(1) or match.group(2) + + def items(self): + yield Message.Version, 1 + + # fetch first post number + url = "{}/PostList.nhn?blogId={}".format(self.root, self.blog_id) + post_num = text.extract( + self.request(url).text, 'gnFirstLogNo = "', '"', + )[0] + + # setup params for API calls + url = "{}/PostViewBottomTitleListAsync.nhn".format(self.root) + params = { + "blogId" : self.blog_id, + "logNo" : post_num or "0", + "viewDate" : "", + "categoryNo" : "", + "parentCategoryNo" : "", + "showNextPage" : "true", + "showPreviousPage" : "false", + "sortDateInMilli" : "", + "isThumbnailViewType": "false", + "countPerPage" : "", + } + + # loop over all posts + while True: + data = self.request(url, params=params).json() + + for post in data["postList"]: + post["url"] = "{}/PostView.nhn?blogId={}&logNo={}".format( + self.root, self.blog_id, post["logNo"]) + post["_extractor"] = NaverPostExtractor + yield Message.Queue, post["url"], post + + if not data["hasNextPage"]: + return + params["logNo"] = data["nextIndexLogNo"] + params["sortDateInMilli"] = data["nextIndexSortDate"] diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index fdfad87..0bd858f 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -20,7 +20,7 @@ class NijieExtractor(AsynchronousMixin, Extractor): """Base class for nijie extractors""" category = "nijie" directory_fmt = ("{category}", "{user_id}") - filename_fmt = "{category}_{artist_id}_{image_id}_p{num:>02}.{extension}" + filename_fmt = "{image_id}_p{num}.{extension}" archive_fmt = "{image_id}_{num}" cookiedomain = "nijie.info" cookienames = ("nemail", "nlogin") diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py new file mode 100644 index 0000000..97be789 --- /dev/null +++ b/gallery_dl/extractor/nozomi.py @@ -0,0 +1,185 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://nozomi.la/""" + +from .common import Extractor, Message +from .. 
import text + + +class NozomiExtractor(Extractor): + """Base class for nozomi extractors""" + category = "nozomi" + root = "https://nozomi.la" + filename_fmt = "{postid}.{extension}" + archive_fmt = "{postid}" + + def items(self): + yield Message.Version, 1 + + data = self.metadata() + self.session.headers["Origin"] = self.root + self.session.headers["Referer"] = self.root + "/" + + for post_id in map(str, self.posts()): + url = "https://j.nozomi.la/post/{}/{}/{}.json".format( + post_id[-1], post_id[-3:-1], post_id) + response = self.request(url, fatal=False) + + if response.status_code >= 400: + self.log.warning( + "Skipping post %s ('%s %s')", + post_id, response.status_code, response.reason) + continue + + image = response.json() + image["tags"] = self._list(image.get("general")) + image["artist"] = self._list(image.get("artist")) + image["copyright"] = self._list(image.get("copyright")) + image["character"] = self._list(image.get("character")) + image["is_video"] = bool(image.get("is_video")) + image["date"] = text.parse_datetime( + image["date"] + ":00", "%Y-%m-%d %H:%M:%S%z") + image["url"] = text.urljoin(self.root, image["imageurl"]) + text.nameext_from_url(image["url"], image) + image.update(data) + + for key in ("general", "imageurl", "imageurls"): + if key in image: + del image[key] + + yield Message.Directory, image + yield Message.Url, image["url"], image + + def metadata(self): + return {} + + def posts(self): + return () + + @staticmethod + def _list(src): + if not src: + return [] + return [x["tagname_display"] for x in src] + + @staticmethod + def _unpack(b): + for i in range(0, len(b), 4): + yield (b[i] << 24) + (b[i+1] << 16) + (b[i+2] << 8) + b[i+3] + + +class NozomiPostExtractor(NozomiExtractor): + """Extractor for individual posts on nozomi.la""" + subcategory = "post" + pattern = r"(?:https?://)?nozomi\.la/post/(\d+)" + test = ("https://nozomi.la/post/3649262.html", { + "url": "f4522adfc8159355fd0476de28761b5be0f02068", + "content": "cd20d2c5149871a0b80a1b0ce356526278964999", + "keyword": { + "artist" : ["hammer (sunset beach)"], + "character": ["patchouli knowledge"], + "copyright": ["touhou"], + "dataid" : "re:aaa9f7c632cde1e1a5baaff3fb6a6d857ec73df7fdc5cf5a", + "date" : "type:datetime", + "extension": "jpg", + "favorites": int, + "filename" : str, + "height" : 768, + "is_video" : False, + "postid" : 3649262, + "source" : "danbooru", + "sourceid" : 2434215, + "tags" : list, + "type" : "jpg", + "url" : str, + "width" : 1024, + }, + }) + + def __init__(self, match): + NozomiExtractor.__init__(self, match) + self.post_id = match.group(1) + + def posts(self): + return (self.post_id,) + + +class NozomiTagExtractor(NozomiExtractor): + """Extractor for posts from tag searches on nozomi.la""" + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + archive_fmt = "t_{search_tags}_{postid}" + pattern = r"(?:https?://)?nozomi\.la/tag/([^/?&#]+)-\d+\." 
+ test = ("https://nozomi.la/tag/3:1_aspect_ratio-1.html", { + "pattern": r"^https://i.nozomi.la/\w/\w\w/\w+\.\w+$", + "count": ">= 75", + "range": "1-75", + }) + + def __init__(self, match): + NozomiExtractor.__init__(self, match) + self.tags = text.unquote(match.group(1)).lower() + + def metadata(self): + return {"search_tags": self.tags} + + def posts(self): + url = "https://n.nozomi.la/nozomi/{}.nozomi".format(self.tags) + i = 0 + + while True: + headers = {"Range": "bytes={}-{}".format(i, i+255)} + response = self.request(url, headers=headers) + yield from self._unpack(response.content) + + i += 256 + cr = response.headers.get("Content-Range", "").rpartition("/")[2] + if text.parse_int(cr, i) <= i: + return + + +class NozomiSearchExtractor(NozomiExtractor): + """Extractor for search results on nozomi.la""" + subcategory = "search" + directory_fmt = ("{category}", "{search_tags:J }") + archive_fmt = "t_{search_tags}_{postid}" + pattern = r"(?:https?://)?nozomi\.la/search\.html\?q=([^&#]+)" + test = ("https://nozomi.la/search.html?q=hibiscus%203:4_ratio#1", { + "count": ">= 5", + }) + + def __init__(self, match): + NozomiExtractor.__init__(self, match) + self.tags = text.unquote(match.group(1)).lower().split() + + def metadata(self): + return {"search_tags": self.tags} + + def posts(self): + index = None + result = set() + + def nozomi(path): + url = "https://j.nozomi.la/" + path + ".nozomi" + return self._unpack(self.request(url).content) + + for tag in self.tags: + if tag[0] == "-": + if not index: + index = set(nozomi("index")) + items = index.difference(nozomi("nozomi/" + tag[1:])) + else: + items = nozomi("nozomi/" + tag) + + if result: + result.intersection_update(items) + else: + result.update(items) + + return result diff --git a/gallery_dl/extractor/nsfwalbum.py b/gallery_dl/extractor/nsfwalbum.py index 5005fb4..8f1f3f2 100644 --- a/gallery_dl/extractor/nsfwalbum.py +++ b/gallery_dl/extractor/nsfwalbum.py @@ -17,14 +17,14 @@ class NsfwalbumAlbumExtractor(GalleryExtractor): category = "nsfwalbum" subcategory = "album" root = "https://nsfwalbum.com" - filename_fmt = "{album_id}_{page:>03}_{id}.{extension}" + filename_fmt = "{album_id}_{num:>03}_{id}.{extension}" directory_fmt = ("{category}", "{album_id} {title}") archive_fmt = "{id}" pattern = r"(?:https?://)?(?:www\.)?nsfwalbum\.com(/album/(\d+))" test = ("https://nsfwalbum.com/album/401611", { "range": "1-5", "url": "b0481fc7fad5982da397b6359fbed8421b8ba284", - "keyword": "fc1ad4ebcd6d4cf32da15203120112b8bcf12eec", + "keyword": "e98f9b0d473c00000831618d0235863b1dd78294", }) def __init__(self, match): diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index 6c6dd0a..912447b 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -296,8 +296,8 @@ class OAuthMastodon(OAuthBase): data = self.session.post(url, data=data).json() if "client_id" not in data or "client_secret" not in data: - self.log.error("Failed to register new application: '%s'", data) - raise exception.StopExtraction() + raise exception.StopExtraction( + "Failed to register new application: '%s'", data) data["client-id"] = data.pop("client_id") data["client-secret"] = data.pop("client_secret") diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index ab5932d..9b13391 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -76,6 +76,8 @@ class PatreonExtractor(Extractor): headers = {"Referer": self.root} while url: + if not url.startswith("http"): + url = 
"https://" + url.lstrip("/:") posts = self.request(url, headers=headers).json() if "included" in posts: diff --git a/gallery_dl/extractor/photobucket.py b/gallery_dl/extractor/photobucket.py index 83f75a3..8456f97 100644 --- a/gallery_dl/extractor/photobucket.py +++ b/gallery_dl/extractor/photobucket.py @@ -22,11 +22,11 @@ class PhotobucketAlbumExtractor(Extractor): filename_fmt = "{offset:>03}{pictureId:?_//}_{titleOrFilename}.{extension}" archive_fmt = "{id}" pattern = (r"(?:https?://)?((?:[^.]+\.)?photobucket\.com)" - r"/user/[^/?&#]+/library/[^?&#]*") + r"/user/[^/?&#]+/library(?:/[^?&#]*)?") test = ( - ("https://s258.photobucket.com/user/focolandia/library/", { - "pattern": r"https?://[oi]+\d+.photobucket.com/albums/hh280/", - "count": ">= 39" + ("https://s369.photobucket.com/user/CrpyLrkr/library", { + "pattern": r"https?://[oi]+\d+.photobucket.com/albums/oo139/", + "count": ">= 50" }), # subalbums of main "directory" ("https://s271.photobucket.com/user/lakerfanryan/library/", { @@ -149,10 +149,9 @@ class PhotobucketImageExtractor(Extractor): if "message" not in image: break # success tries += 1 - self.log.debug("'%s'", image["message"]) + self.log.debug(image["message"]) else: - self.log.error("%s", image["message"]) - raise exception.StopExtraction() + raise exception.StopExtraction(image["message"]) # adjust metadata entries to be at least somewhat similar # to what the 'album' extractor provides diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index f5b8869..e36a82b 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -241,9 +241,8 @@ class PinterestAPI(): if response.status_code == 404 or response.history: resource = self.extractor.subcategory.rpartition("-")[2] raise exception.NotFoundError(resource) - self.extractor.log.error("API request failed") self.extractor.log.debug("%s", response.text) - raise exception.StopExtraction() + raise exception.StopExtraction("API request failed") def _pagination(self, resource, options): while True: diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index d313daa..d32f245 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -335,11 +335,9 @@ class PixivSearchExtractor(PixivExtractor): def get_metadata(self, user=None): query = text.parse_query(self.query) - if "word" in query: - self.word = text.unescape(query["word"]) - else: - self.log.error("missing search term") - raise exception.StopExtraction() + if "word" not in query: + raise exception.StopExtraction("Missing search term") + self.word = query["word"] sort = query.get("order", "date_d") sort_map = { @@ -504,8 +502,7 @@ class PixivAppAPI(): return response.json() if response.status_code == 404: raise exception.NotFoundError() - self.log.error("API request failed: %s", response.text) - raise exception.StopExtraction() + raise exception.StopExtraction("API request failed: %s", response.text) def _pagination(self, endpoint, params): while True: diff --git a/gallery_dl/extractor/plurk.py b/gallery_dl/extractor/plurk.py index 325c6a0..2bb66ac 100644 --- a/gallery_dl/extractor/plurk.py +++ b/gallery_dl/extractor/plurk.py @@ -49,7 +49,7 @@ class PlurkExtractor(Extractor): data = {"plurk_id": plurk["id"], "count": "200"} while True: - info = self.request(url, "POST", data=data).json() + info = self.request(url, method="POST", data=data).json() yield from info["responses"] if not info["has_newer"]: return @@ -91,7 +91,8 @@ class PlurkTimelineExtractor(PlurkExtractor): 
offset = datetime.datetime.strptime( plurks[-1]["posted"], "%a, %d %b %Y %H:%M:%S %Z") data["offset"] = offset.strftime("%Y-%m-%dT%H:%M:%S.000Z") - response = self.request(url, "POST", headers=headers, data=data) + response = self.request( + url, method="POST", headers=headers, data=data) plurks = response.json()["plurks"] diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 9c283de..ecce003 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -259,12 +259,17 @@ class RedditAPI(): data = {"grant_type": ("https://oauth.reddit.com/" "grants/installed_client"), "device_id": "DO_NOT_TRACK_THIS_DEVICE"} + + auth = (self.client_id, "") response = self.extractor.request( - url, method="POST", data=data, auth=(self.client_id, "")) + url, method="POST", data=data, auth=auth, fatal=False) + data = response.json() + if response.status_code != 200: - raise exception.AuthenticationError('"{} ({})"'.format( - response.json().get("message"), response.status_code)) - return "Bearer " + response.json()["access_token"] + self.log.debug("Server response: %s", data) + raise exception.AuthenticationError('"{}: {}"'.format( + data.get("error"), data.get("message"))) + return "Bearer " + data["access_token"] def _call(self, endpoint, params): url = "https://oauth.reddit.com" + endpoint diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index bb8a2ae..b07d024 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -201,9 +201,8 @@ class SankakuTagExtractor(SankakuExtractor): tags = self.tags.split() if not self.logged_in and len(tags) > 4: - self.log.error("Unauthenticated users cannot use " - "more than 4 tags at once.") - raise exception.StopExtraction() + raise exception.StopExtraction( + "Unauthenticated users cannot use more than 4 tags at once.") return {"search_tags": " ".join(tags)} def get_posts(self): diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py index 38b7813..c4597af 100644 --- a/gallery_dl/extractor/sexcom.py +++ b/gallery_dl/extractor/sexcom.py @@ -21,6 +21,7 @@ class SexcomExtractor(Extractor): root = "https://www.sex.com" def items(self): + self.session.headers["Referer"] = self.root yield Message.Version, 1 yield Message.Directory, self.metadata() for pin in map(self._parse_pin, self.pins()): @@ -52,7 +53,7 @@ class SexcomExtractor(Extractor): def _parse_pin(self, url): response = self.request(url, fatal=False) if response.status_code >= 400: - self.log.warning('Unable to fetch %s ("%s: %s")', + self.log.warning('Unable to fetch %s ("%s %s")', url, response.status_code, response.reason) return None extr = text.extract_from(response.text) @@ -102,6 +103,7 @@ class SexcomPinExtractor(SexcomExtractor): # picture ("https://www.sex.com/pin/56714360/", { "url": "599190d6e3d79f9f49dda194a0a58cb0ffa3ab86", + "content": "963ed681cf53904173c7581b713c7f9471f04db0", "keyword": { "comments": int, "date": "2018-10-02T21:18:17-04:00", @@ -150,7 +152,7 @@ class SexcomRelatedPinExtractor(SexcomPinExtractor): directory_fmt = ("{category}", "related {original_pin[pin_id]}") pattern = r"(?:https?://)?(?:www\.)?sex\.com/pin/(\d+).*#related$" test = ("https://www.sex.com/pin/56714360/#related", { - "count": 24, + "count": ">= 22", }) def metadata(self): diff --git a/gallery_dl/extractor/simplyhentai.py b/gallery_dl/extractor/simplyhentai.py index ba0fcf4..82a61da 100644 --- a/gallery_dl/extractor/simplyhentai.py +++ b/gallery_dl/extractor/simplyhentai.py @@ -23,7 +23,7 
@@ class SimplyhentaiGalleryExtractor(GalleryExtractor): (("https://original-work.simply-hentai.com" "/amazon-no-hiyaku-amazon-elixir"), { "url": "21613585ae5ec2f69ea579e9713f536fceab5bd5", - "keyword": "bf75f9ff0fb60756b1b9b92403526a72d9178d23", + "keyword": "9e87a0973553b2922ddee37958b8f5d87910af72", }), ("https://www.simply-hentai.com/notfound", { "exception": exception.GalleryDLException, @@ -43,7 +43,7 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor): extr = text.extract_from(page) split = text.split_html - self.chapter_url = extr('<link rel="canonical" href="', '"') + self.gallery_url = extr('<link rel="canonical" href="', '"') title = extr('<meta property="og:title" content="', '"') if not title: raise exception.NotFoundError("gallery") @@ -63,11 +63,14 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor): return data def images(self, _): - url = self.chapter_url + "/all-pages" + url = self.gallery_url + "/all-pages" headers = {"Accept": "application/json"} images = self.request(url, headers=headers).json() return [ - (urls["full"], {"image_id": text.parse_int(image_id)}) + ( + urls["full"].replace("/giant_thumb_", "/"), + {"image_id": text.parse_int(image_id)}, + ) for image_id, urls in sorted(images.items()) ] @@ -84,12 +87,12 @@ class SimplyhentaiImageExtractor(Extractor): test = ( (("https://www.simply-hentai.com/image" "/pheromomania-vol-1-kanzenban-isao-3949d8b3-400c-4b6"), { - "url": "0338eb137830ab6f81e5f410d3936ef785d063d9", + "url": "3d8eb55240a960134891bd77fe1df7988fcdc455", "keyword": "e10e5588481cab68329ef6ec1e5325206b2079a2", }), ("https://www.simply-hentai.com/gif/8915dfcf-0b6a-47c", { - "url": "11c060d7ec4dfd0bd105300b6e1fd454674a5af1", - "keyword": "dd97a4bb449c397d6fec9f43a1303c0fb168ae65", + "url": "f73916527211b4a40f26568ee26cd8999f5f4f30", + "keyword": "f94d775177fed918759c8a78a50976f867425b48", }), ) diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py index 2e6508c..be29dcf 100644 --- a/gallery_dl/extractor/smugmug.py +++ b/gallery_dl/extractor/smugmug.py @@ -259,11 +259,9 @@ class SmugmugAPI(oauth.OAuth1API): if data["Code"] == 404: raise exception.NotFoundError() if data["Code"] == 429: - self.log.error("Rate limit reached") - else: - self.log.error("API request failed") - self.log.debug(data) - raise exception.StopExtraction() + raise exception.StopExtraction("Rate limit reached") + self.log.debug(data) + raise exception.StopExtraction("API request failed") def _expansion(self, endpoint, expands, params=None): endpoint = self._extend(endpoint, expands) diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py index cc0dc90..298b7e0 100644 --- a/gallery_dl/extractor/tsumino.py +++ b/gallery_dl/extractor/tsumino.py @@ -109,14 +109,13 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor): def images(self, page): url = "{}/Read/Index/{}?page=1".format(self.root, self.gallery_id) - headers = {"Referer": self.chapter_url} + headers = {"Referer": self.gallery_url} response = self.request(url, headers=headers, fatal=False) if "/Auth/" in response.url: - self.log.error( + raise exception.StopExtraction( "Failed to get gallery JSON data. 
Visit '%s' in a browser " "and solve the CAPTCHA to continue.", response.url) - raise exception.StopExtraction() page = response.text tpl, pos = text.extract(page, 'data-cdn="', '"') @@ -195,8 +194,8 @@ class TsuminoSearchExtractor(TsuminoBase, Extractor): return self._parse_simple(query) return self._parse_jsurl(query) except Exception as exc: - self.log.error("Invalid search query: '%s' (%s)", query, exc) - raise exception.StopExtraction() + raise exception.StopExtraction( + "Invalid search query '%s' (%s)", query, exc) @staticmethod def _parse_simple(query): diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index 8abbaf7..998eed4 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -407,26 +407,22 @@ class TumblrAPI(oauth.OAuth1API): # daily rate limit if response.headers.get("x-ratelimit-perday-remaining") == "0": reset = response.headers.get("x-ratelimit-perday-reset") - self.log.error( + raise exception.StopExtraction( "Daily API rate limit exceeded: aborting; " - "rate limit will reset at %s", - self._to_time(reset), + "rate limit will reset at %s", self._to_time(reset), ) - raise exception.StopExtraction() # hourly rate limit reset = response.headers.get("x-ratelimit-perhour-reset") if reset: self.log.info( - "Hourly API rate limit exceeded; " - "waiting until %s for rate limit reset", - self._to_time(reset), + "Hourly API rate limit exceeded; waiting until " + "%s for rate limit reset", self._to_time(reset), ) time.sleep(int(reset) + 1) return self._call(blog, endpoint, params) - self.log.error(data) - raise exception.StopExtraction() + raise exception.StopExtraction(data) @staticmethod def _to_time(reset): diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 8105ede..dfafc1f 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -10,7 +10,7 @@ from .common import Extractor, Message from .. 
import text, exception -from ..cache import cache +from ..cache import cache, memcache import re @@ -26,6 +26,7 @@ class TwitterExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) self.user = match.group(1) + self.logged_in = False self.retweets = self.config("retweets", True) self.content = self.config("content", False) self.videos = self.config("videos", False) @@ -53,10 +54,20 @@ class TwitterExtractor(Extractor): yield Message.Urllist, urls, data if self.videos and "-videoContainer" in tweet: + if self.videos == "ytdl": + data["extension"] = None + url = "ytdl:{}/{}/status/{}".format( + self.root, data["user"], data["tweet_id"]) + else: + url = self._video_from_tweet(data["tweet_id"]) + ext = text.ext_from_url(url) + if ext == "m3u8": + url = "ytdl:" + url + data["extension"] = "mp4" + data["_ytdl_extra"] = {"protocol": "m3u8_native"} + else: + data["extension"] = ext data["num"] = 1 - data["extension"] = None - url = "ytdl:{}/{}/status/{}".format( - self.root, data["user"], data["tweet_id"]) yield Message.Url, url, data def metadata(self): @@ -70,6 +81,7 @@ class TwitterExtractor(Extractor): username, password = self._get_auth_info() if username: self._update_cookies(self._login_impl(username, password)) + self.logged_in = True @cache(maxage=360*24*3600, keyarg=1) def _login_impl(self, username, password): @@ -115,17 +127,48 @@ class TwitterExtractor(Extractor): data["content"] = cl if cl and len(cr) < 16 else content return data - def _tweets_from_api(self, url): + def _video_from_tweet(self, tweet_id): + url = "https://api.twitter.com/1.1/videos/tweet/config/{}.json".format( + tweet_id) + cookies = None + headers = { + "Origin" : self.root, + "Referer" : "{}/i/web/status/{}".format(self.root, tweet_id), + "x-csrf-token" : self.session.cookies.get("ct0"), + "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekM" + "xqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28N" + "HfOPqkca3qaAxGfsyKCs0wRbw", + } + + if self.logged_in: + headers["x-twitter-auth-type"] = "OAuth2Session" + else: + token = self._guest_token(headers) + cookies = {"gt": token} + headers["x-guest-token"] = token + + data = self.request(url, cookies=cookies, headers=headers).json() + return data["track"]["playbackUrl"] + + @memcache() + def _guest_token(self, headers): + return self.request( + "https://api.twitter.com/1.1/guest/activate.json", + method="POST", headers=headers, + ).json().get("guest_token") + + def _tweets_from_api(self, url, max_position=None): params = { "include_available_features": "1", "include_entities": "1", + "max_position": max_position, "reset_error_state": "false", "lang": "en", } headers = { "X-Requested-With": "XMLHttpRequest", "X-Twitter-Active-User": "yes", - "Referer": "{}/{}".format(self.root, self.user) + "Referer": self.root + "/", } while True: @@ -140,18 +183,23 @@ class TwitterExtractor(Extractor): if not data["has_more_items"]: return - position = text.parse_int(text.extract( - tweet, 'data-tweet-id="', '"')[0]) - if "max_position" in params and position >= params["max_position"]: - return - params["max_position"] = position + if "min_position" in data: + position = data["min_position"] + if position == max_position: + return + else: + position = text.parse_int(text.extract( + tweet, 'data-tweet-id="', '"')[0]) + if max_position and position >= max_position: + return + params["max_position"] = max_position = position class TwitterTimelineExtractor(TwitterExtractor): """Extractor for all images from a user's timeline""" subcategory = "timeline" 
pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com" - r"/([^/?&#]+)/?(?:$|[?#])") + r"/(?!search)([^/?&#]+)/?(?:$|[?#])") test = ( ("https://twitter.com/supernaturepics", { "range": "1-40", @@ -171,7 +219,7 @@ class TwitterMediaExtractor(TwitterExtractor): """Extractor for all images from a user's Media Tweets""" subcategory = "media" pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com" - r"/([^/?&#]+)/media(?!\w)") + r"/(?!search)([^/?&#]+)/media(?!\w)") test = ( ("https://twitter.com/supernaturepics/media", { "range": "1-40", @@ -186,6 +234,26 @@ class TwitterMediaExtractor(TwitterExtractor): return self._tweets_from_api(url) +class TwitterSearchExtractor(TwitterExtractor): + """Extractor for all images from a search timeline""" + subcategory = "search" + directory_fmt = ("{category}", "Search", "{search}") + pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com" + r"/search/?\?(?:[^&#]+&)*q=([^&#]+)") + test = ("https://twitter.com/search?q=nature", { + "range": "1-40", + "count": 40, + }) + + def metadata(self): + return {"search": self.user} + + def tweets(self): + url = "{}/i/search/timeline?f=tweets&q={}".format( + self.root, self.user) + return self._tweets_from_api(url, "-1") + + class TwitterTweetExtractor(TwitterExtractor): """Extractor for images from individual tweets""" subcategory = "tweet" @@ -205,17 +273,17 @@ class TwitterTweetExtractor(TwitterExtractor): # video ("https://twitter.com/perrypumas/status/1065692031626829824", { "options": (("videos", True),), - "pattern": r"ytdl:https://twitter.com/perrypumas/status/\d+", + "pattern": r"ytdl:https://video.twimg.com/ext_tw_video/.*.m3u8", }), # content with emoji, newlines, hashtags (#338) ("https://twitter.com/yumi_san0112/status/1151144618936823808", { "options": (("content", True),), - "keyword": "b13b6c4cd0b0c15b2ea7685479e7fedde3c47b9e", + "keyword": "b133464b73aec33871521ab021a3166204194285", }), # Reply to another tweet (#403) ("https://twitter.com/tyson_hesse/status/1103767554424598528", { - "options": (("videos", True),), - "pattern": r"ytdl:https://twitter.com/.*/1103767554424598528$", + "options": (("videos", "ytdl"),), + "pattern": r"ytdl:https://twitter.com/.+/1103767554424598528", }), # /i/web/ URL ("https://twitter.com/i/web/status/1155074198240292865", { @@ -231,9 +299,19 @@ class TwitterTweetExtractor(TwitterExtractor): return {"user": self.user, "tweet_id": self.tweet_id} def tweets(self): - self.session.cookies.clear() url = "{}/i/web/status/{}".format(self.root, self.tweet_id) - page = self.request(url).text + cookies = {"app_shell_visited": "1"} + headers = { + "Referer" : url, + "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; " + "Trident/7.0; rv:11.0) like Gecko", + } + + response = self.request(url, cookies=cookies, headers=headers) + if response.history and response.url == self.root + "/": + raise exception.AuthorizationError() + page = response.text + end = page.index('class="js-tweet-stats-container') beg = page.rindex('<div class="tweet ', 0, end) return (page[beg:end],) diff --git a/gallery_dl/extractor/wallhaven.py b/gallery_dl/extractor/wallhaven.py index 4326582..09a166c 100644 --- a/gallery_dl/extractor/wallhaven.py +++ b/gallery_dl/extractor/wallhaven.py @@ -33,8 +33,8 @@ class WallhavenSearchExtractor(WallhavenExtractor): ("https://wallhaven.cc/search?q=touhou"), (("https://wallhaven.cc/search?q=id%3A87" "&categories=111&purity=100&sorting=date_added&order=asc&page=3"), { - "count": 4, - "url": "d024bc11895d758b76ffdb0fa85a627e53f072cf", + "count": 5, + "url": 
"d477b68a534c3416d506ae1f159b25debab64678", }), ) |
