author     Unit 193 <unit193@unit193.net>    2020-12-30 18:41:48 -0500
committer  Unit 193 <unit193@unit193.net>    2020-12-30 18:41:48 -0500
commit     87a5aa088ce33a1196ff409b76a9ea8233bdc634 (patch)
tree       7e6155edcc5dd12e40b47ad814b3bc69e65c52fc
parent     8f7c87a2697113134c311aaeafd9c919555a2741 (diff)
download   gallery-dl-87a5aa088ce33a1196ff409b76a9ea8233bdc634.tar.bz2
           gallery-dl-87a5aa088ce33a1196ff409b76a9ea8233bdc634.tar.xz
           gallery-dl-87a5aa088ce33a1196ff409b76a9ea8233bdc634.tar.zst
New upstream version 1.16.1 (tag: upstream/1.16.1)
32 files changed, 516 insertions, 374 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index c536269..3531352 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,27 @@
 # Changelog
+## 1.16.1 - 2020-12-27
+### Additions
+- [instagram] add `include` option ([#1180](https://github.com/mikf/gallery-dl/issues/1180))
+- [pinterest] implement video support ([#1189](https://github.com/mikf/gallery-dl/issues/1189))
+- [sankaku] reimplement login support ([#1176](https://github.com/mikf/gallery-dl/issues/1176), [#1182](https://github.com/mikf/gallery-dl/issues/1182))
+- [sankaku] add support for sankaku.app URLs ([#1193](https://github.com/mikf/gallery-dl/issues/1193))
+### Changes
+- [e621] return pool posts in order ([#1195](https://github.com/mikf/gallery-dl/issues/1195))
+- [hentaicafe] prefer title of `/hc.fyi/` pages ([#1106](https://github.com/mikf/gallery-dl/issues/1106))
+- [hentaicafe] simplify default filenames
+- [sankaku] normalize `created_at` metadata ([#1190](https://github.com/mikf/gallery-dl/issues/1190))
+- [postprocessor:exec] do not add missing `{}` to command ([#1185](https://github.com/mikf/gallery-dl/issues/1185))
+### Fixes
+- [booru] improve error handling
+- [instagram] warn about private profiles ([#1187](https://github.com/mikf/gallery-dl/issues/1187))
+- [keenspot] improve redirect handling
+- [mangadex] respect `chapter-reverse` settings ([#1194](https://github.com/mikf/gallery-dl/issues/1194))
+- [pixiv] output debug message on failed login attempts ([#1192](https://github.com/mikf/gallery-dl/issues/1192))
+- increase SQLite connection timeouts ([#1173](https://github.com/mikf/gallery-dl/issues/1173))
+### Removals
+- [mangapanda] remove module
+
 ## 1.16.0 - 2020-12-12
 ### Additions
 - [booru] implement generalized extractors for `*booru` and `moebooru` sites
diff --git a/PKG-INFO b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gallery_dl
-Version: 1.16.0
+Version: 1.16.1
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Author: Mike Fährmann
@@ -94,8 +94,8 @@ Description: ==========
 put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__,
 and run it inside a command prompt (like ``cmd.exe``).
 
- - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.16.0/gallery-dl.exe>`__
- - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.16.0/gallery-dl.bin>`__
+ - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.16.1/gallery-dl.exe>`__
+ - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.16.1/gallery-dl.bin>`__
 
 These executables include a Python interpreter
 and all required Python packages.
@@ -236,6 +236,7 @@ Description: ==========
 ``instagram``,
 ``luscious``,
 ``pinterest``,
+``sankaku``,
 ``subscribestar``,
 ``tsumino``,
 and ``twitter``.
@@ -275,8 +276,8 @@ Description: ==========
 option in your configuration file by specifying
 
 - | the path to a Mozilla/Netscape format cookies.txt file exported by a browser addon
-  | (e.g. `cookies.txt <https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg>`__ for Chrome,
-    `Export Cookies <https://addons.mozilla.org/en-US/firefox/addon/export-cookies-txt/?src=search>`__ for Firefox)
+  | (e.g. `Get cookies.txt <https://chrome.google.com/webstore/detail/get-cookiestxt/bgaddhkoddajcdgocldbbfleckgcbcid/>`__ for Chrome,
+    `Export Cookies <https://addons.mozilla.org/en-US/firefox/addon/export-cookies-txt/>`__ for Firefox)
 
 - | a list of name-value pairs gathered from your browser's web developer tools
   | (in `Chrome <https://developers.google.com/web/tools/chrome-devtools/storage/cookies>`__,
@@ -331,7 +332,7 @@ Description: ==========
 .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf
 .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst
 .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
-.. _stable: https://github.com/mikf/gallery-dl/archive/v1.16.0.tar.gz
+.. _stable: https://github.com/mikf/gallery-dl/archive/v1.16.1.tar.gz
 .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz
 
 .. _Python: https://www.python.org/downloads/
diff --git a/README.rst b/README.rst
@@ -83,8 +83,8 @@ Download a standalone executable file,
 put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__,
 and run it inside a command prompt (like ``cmd.exe``).
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.16.0/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.16.0/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.16.1/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.16.1/gallery-dl.bin>`__
 
 These executables include a Python interpreter
 and all required Python packages.
@@ -225,6 +225,7 @@ and optional for
 ``instagram``,
 ``luscious``,
 ``pinterest``,
+``sankaku``,
 ``subscribestar``,
 ``tsumino``,
 and ``twitter``.
@@ -264,8 +265,8 @@ This can be done via the
 option in your configuration file by specifying
 
 - | the path to a Mozilla/Netscape format cookies.txt file exported by a browser addon
-  | (e.g. `cookies.txt <https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg>`__ for Chrome,
-    `Export Cookies <https://addons.mozilla.org/en-US/firefox/addon/export-cookies-txt/?src=search>`__ for Firefox)
+  | (e.g. `Get cookies.txt <https://chrome.google.com/webstore/detail/get-cookiestxt/bgaddhkoddajcdgocldbbfleckgcbcid/>`__ for Chrome,
+    `Export Cookies <https://addons.mozilla.org/en-US/firefox/addon/export-cookies-txt/>`__ for Firefox)
 
 - | a list of name-value pairs gathered from your browser's web developer tools
   | (in `Chrome <https://developers.google.com/web/tools/chrome-devtools/storage/cookies>`__,
@@ -320,7 +321,7 @@ access to *gallery-dl*. Authorize it and you will be shown one or more
 .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf
 .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst
 .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
-.. _stable: https://github.com/mikf/gallery-dl/archive/v1.16.0.tar.gz
+.. _stable: https://github.com/mikf/gallery-dl/archive/v1.16.1.tar.gz
 .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz
 
 .. _Python: https://www.python.org/downloads/
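The README hunks above describe the two forms the cookies option accepts: a path to a cookies.txt file, or an object of name-value pairs. A minimal configuration sketch of both forms (site names, file path, and cookie values are placeholders, not part of this release):

```json
{
    "extractor": {
        "instagram": {
            "cookies": "~/instagram-cookies.txt"
        },
        "twitter": {
            "cookies": {
                "auth_token": "value-copied-from-devtools"
            }
        }
    }
}
```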
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index af6eaf3..c3df997 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2020-12-12" "1.16.0" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2020-12-27" "1.16.1" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index 8c291fb..40efa15 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2020-12-12" "1.16.0" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2020-12-27" "1.16.1" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
 .\" disable justification (adjust text to left margin only)
@@ -346,6 +346,8 @@ and optional for
 .br
 * \f[I]pinterest\f[]
 .br
+* \f[I]sankaku\f[]
+.br
 * \f[I]subscribestar\f[]
 .br
 * \f[I]tsumino\f[]
@@ -1191,16 +1193,24 @@ Value of the \f[I]orderby\f[] parameter for submission searches.
 for details)
 
-.SS extractor.instagram.highlights
+.SS extractor.instagram.include
 .IP "Type:" 6
-\f[I]bool\f[]
+\f[I]string\f[] or \f[I]list\f[] of \f[I]strings\f[]
 
 .IP "Default:" 9
-\f[I]false\f[]
+\f[I]"posts"\f[]
+
+.IP "Example:" 4
+"stories,highlights,posts" or ["stories", "highlights", "posts"]
 
 .IP "Description:" 4
-Include *Story Highlights* when downloading a user profile.
-(requires authentication)
+A (comma-separated) list of subcategories to include
+when processing a user profile.
+
+Possible values are
+\f[I]"posts"\f[], \f[I]"stories"\f[], \f[I]"highlights"\f[], \f[I]"channel"\f[].
+
+You can use \f[I]"all"\f[] instead of listing all values separately.
 
 .SS extractor.instagram.videos
@@ -1335,6 +1345,17 @@ Download subalbums.
 
 Include pins from board sections.
 
+.SS extractor.pinterest.videos
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]true\f[]
+
+.IP "Description:" 4
+Download from video pins.
+
+
 .SS extractor.pixiv.user.avatar
 .IP "Type:" 6
 \f[I]bool\f[]
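The new extractor.instagram.include option documented in the man-page hunk above accepts both forms shown in its Example line. A minimal sketch of a configuration using the list form:

```json
{
    "extractor": {
        "instagram": {
            "include": ["stories", "highlights", "posts"]
        }
    }
}
```

Setting "include": "all" selects every available subcategory instead of listing them separately.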
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index e0eda0d..7b2006e 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gallery-dl
-Version: 1.16.0
+Version: 1.16.1
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Author: Mike Fährmann
@@ -94,8 +94,8 @@ Description: ==========
 put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__,
 and run it inside a command prompt (like ``cmd.exe``).
 
- - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.16.0/gallery-dl.exe>`__
- - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.16.0/gallery-dl.bin>`__
+ - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.16.1/gallery-dl.exe>`__
+ - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.16.1/gallery-dl.bin>`__
 
 These executables include a Python interpreter
 and all required Python packages.
@@ -236,6 +236,7 @@ Description: ==========
 ``instagram``,
 ``luscious``,
 ``pinterest``,
+``sankaku``,
 ``subscribestar``,
 ``tsumino``,
 and ``twitter``.
@@ -275,8 +276,8 @@ Description: ==========
 option in your configuration file by specifying
 
 - | the path to a Mozilla/Netscape format cookies.txt file exported by a browser addon
-  | (e.g. `cookies.txt <https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg>`__ for Chrome,
-    `Export Cookies <https://addons.mozilla.org/en-US/firefox/addon/export-cookies-txt/?src=search>`__ for Firefox)
+  | (e.g. `Get cookies.txt <https://chrome.google.com/webstore/detail/get-cookiestxt/bgaddhkoddajcdgocldbbfleckgcbcid/>`__ for Chrome,
+    `Export Cookies <https://addons.mozilla.org/en-US/firefox/addon/export-cookies-txt/>`__ for Firefox)
 
 - | a list of name-value pairs gathered from your browser's web developer tools
   | (in `Chrome <https://developers.google.com/web/tools/chrome-devtools/storage/cookies>`__,
@@ -331,7 +332,7 @@ Description: ==========
 .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf
 .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst
 .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
-.. _stable: https://github.com/mikf/gallery-dl/archive/v1.16.0.tar.gz
+.. _stable: https://github.com/mikf/gallery-dl/archive/v1.16.1.tar.gz
 .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz
 
 .. _Python: https://www.python.org/downloads/
diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt
index fd1b4a1..3b28345 100644
--- a/gallery_dl.egg-info/SOURCES.txt
+++ b/gallery_dl.egg-info/SOURCES.txt
@@ -97,7 +97,6 @@ gallery_dl/extractor/mangadex.py
 gallery_dl/extractor/mangafox.py
 gallery_dl/extractor/mangahere.py
 gallery_dl/extractor/mangakakalot.py
-gallery_dl/extractor/mangapanda.py
 gallery_dl/extractor/mangapark.py
 gallery_dl/extractor/mangareader.py
 gallery_dl/extractor/mangastream.py
diff --git a/gallery_dl/cache.py b/gallery_dl/cache.py
index 3886091..a874f63 100644
--- a/gallery_dl/cache.py
+++ b/gallery_dl/cache.py
@@ -210,6 +210,6 @@ try:
     os.close(os.open(dbfile, os.O_CREAT | os.O_RDONLY, 0o600))
     DatabaseCacheDecorator.db = sqlite3.connect(
-        dbfile, timeout=30, check_same_thread=False)
+        dbfile, timeout=60, check_same_thread=False)
 except (OSError, TypeError, sqlite3.OperationalError):
     cache = memcache  # noqa: F811
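The cache.py hunk above raises the SQLite busy timeout from 30 to 60 seconds (#1173). A minimal sketch of what those connection arguments do (the file path is illustrative):

```python
import sqlite3

# timeout=60: if a concurrent gallery-dl process holds a write lock,
# this connection waits up to 60 seconds instead of failing right away
# with "database is locked".
# check_same_thread=False: allow the single shared connection to be
# used from worker threads as well.
con = sqlite3.connect("/tmp/example-cache.db", timeout=60,
                      check_same_thread=False)
con.execute("CREATE TABLE IF NOT EXISTS data (key TEXT PRIMARY KEY)")
con.close()
```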
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 611603e..b38cddc 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -66,7 +66,6 @@ modules = [
     "mangafox",
     "mangahere",
     "mangakakalot",
-    "mangapanda",
     "mangapark",
     "mangareader",
     "mangastream",
diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py
index 517df93..64cde80 100644
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@@ -13,6 +13,7 @@ from .. import text, util, exception
 from xml.etree import ElementTree
 import collections
+import operator
 import re
 
@@ -25,15 +26,25 @@ class BooruExtractor(Extractor):
     def items(self):
         self.login()
-        extended_tags = self.config("tags", False)
         data = self.metadata()
+        tags = self.config("tags", False)
+
         for post in self.posts():
             try:
-                url = self._prepare_post(post, extended_tags)
-            except KeyError:
+                url = self._file_url(post)
+                if url[0] == "/":
+                    url = self.root + url
+            except (KeyError, TypeError):
+                self.log.debug("Unable to fetch download URL for post %s "
+                               "(md5: %s)", post.get("id"), post.get("md5"))
                 continue
+
+            if tags:
+                self._extended_tags(post)
+            self._prepare(post)
             post.update(data)
             text.nameext_from_url(url, post)
+
             yield Message.Directory, post
             yield Message.Url, url, post
 
@@ -53,17 +64,14 @@ class BooruExtractor(Extractor):
         """Return an iterable with post objects"""
         return ()
 
-    def _prepare_post(self, post, extended_tags=False):
-        url = post["file_url"]
-        if url[0] == "/":
-            url = self.root + url
-        if extended_tags:
-            self._fetch_extended_tags(post)
+    _file_url = operator.itemgetter("file_url")
+
+    @staticmethod
+    def _prepare(post):
         post["date"] = text.parse_datetime(
             post["created_at"], "%a %b %d %H:%M:%S %z %Y")
-        return url
 
-    def _fetch_extended_tags(self, post, page=None):
+    def _extended_tags(self, post, page=None):
         if not page:
             url = "{}/index.php?page=post&s=view&id={}".format(
                 self.root, post["id"])
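The booru.py refactor above splits the old monolithic `_prepare_post` into small hooks: `_file_url` resolves a post's download URL (a bare `operator.itemgetter` by default) and `_prepare` normalizes metadata, with subclasses overriding either. A self-contained sketch of the pattern (class and site names are hypothetical):

```python
import operator

class BooruBase:
    # An itemgetter is not a function, so it is not bound as a method:
    # self._file_url(post) simply calls it with the post dict, raising
    # KeyError/TypeError on malformed posts.
    _file_url = operator.itemgetter("file_url")

    @staticmethod
    def _prepare(post):
        pass  # subclasses normalize dates/tags here

    def process(self, post):
        try:
            url = self._file_url(post)
        except (KeyError, TypeError):
            return None  # skip broken posts instead of aborting the run
        self._prepare(post)
        return url

class ExampleSite(BooruBase):
    # A site with an unusual CDN just overrides the hook.
    @staticmethod
    def _file_url(post):
        return "https://cdn.example.org/" + post["md5"]

print(BooruBase().process({"file_url": "https://files.example.org/a.png"}))
print(ExampleSite().process({"md5": "abc123"}))
```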
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index ca37cb4..33797f9 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -12,7 +12,6 @@
 from .common import Extractor, Message
 from .. import text
 import datetime
-
 BASE_PATTERN = (
     r"(?:https?://)?"
     r"(danbooru|hijiribe|sonohara|safebooru)"
@@ -33,7 +32,6 @@ class DanbooruExtractor(Extractor):
         super().__init__(match)
         self.root = "https://{}.donmai.us".format(match.group(1))
         self.ugoira = self.config("ugoira", False)
-        self.params = {}
 
         username, api_key = self._get_auth_info()
         if username:
@@ -71,13 +69,16 @@ class DanbooruExtractor(Extractor):
             yield Message.Url, url, post
 
     def metadata(self):
-        return {}
+        return ()
 
     def posts(self):
-        return self._pagination(self.root + "/posts.json")
+        return ()
+
+    def _pagination(self, endpoint, params=None, pagenum=False):
+        url = self.root + endpoint
 
-    def _pagination(self, url, pagenum=False):
-        params = self.params.copy()
+        if params is None:
+            params = {}
         params["limit"] = self.per_page
         params["page"] = self.page_start
 
@@ -122,10 +123,14 @@ class DanbooruTagExtractor(DanbooruExtractor):
     def __init__(self, match):
         super().__init__(match)
-        self.params["tags"] = text.unquote(match.group(2).replace("+", " "))
+        self.tags = text.unquote(match.group(2).replace("+", " "))
 
     def metadata(self):
-        return {"search_tags": self.params["tags"]}
+        return {"search_tags": self.tags}
+
+    def posts(self):
+        params = {"tags": self.tags}
+        return self._pagination("/posts.json", params)
 
 
 class DanbooruPoolExtractor(DanbooruExtractor):
@@ -141,15 +146,19 @@ class DanbooruPoolExtractor(DanbooruExtractor):
     def __init__(self, match):
         super().__init__(match)
         self.pool_id = match.group(2)
-        self.params["tags"] = "pool:" + self.pool_id
+        self.post_ids = ()
 
     def metadata(self):
         url = "{}/pools/{}.json".format(self.root, self.pool_id)
         pool = self.request(url).json()
         pool["name"] = pool["name"].replace("_", " ")
-        del pool["post_ids"]
+        self.post_ids = pool.pop("post_ids")
         return {"pool": pool}
 
+    def posts(self):
+        params = {"tags": "pool:" + self.pool_id}
+        return self._pagination("/posts.json", params)
+
 
 class DanbooruPostExtractor(DanbooruExtractor):
     """Extractor for single danbooru posts"""
@@ -193,10 +202,9 @@ class DanbooruPopularExtractor(DanbooruExtractor):
     def __init__(self, match):
         super().__init__(match)
-        self.params.update(text.parse_query(match.group(2)))
+        self.params = text.parse_query(match.group(2))
 
     def metadata(self):
-        self.page_start = self.page_start or 1
         scale = self.params.get("scale", "day")
         date = self.params.get("date") or datetime.date.today().isoformat()
 
@@ -209,5 +217,7 @@ class DanbooruPopularExtractor(DanbooruExtractor):
         return {"date": date, "scale": scale}
 
     def posts(self):
-        url = self.root + "/explore/posts/popular.json"
-        return self._pagination(url, True)
+        if self.page_start is None:
+            self.page_start = 1
+        return self._pagination(
+            "/explore/posts/popular.json", self.params, True)
diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py
index 591fe33..4ad19cd 100644
--- a/gallery_dl/extractor/e621.py
+++ b/gallery_dl/extractor/e621.py
@@ -11,7 +11,6 @@
 from .common import Extractor, Message
 from . import danbooru
-
 BASE_PATTERN = r"(?:https?://)?e(621|926)\.net"
 
@@ -39,9 +38,9 @@ class E621Extractor(danbooru.DanbooruExtractor):
             file = post["file"]
             if not file["url"]:
-                ihash = file["md5"]
+                md5 = file["md5"]
                 file["url"] = "https://static1.{}/data/{}/{}/{}.{}".format(
-                    self.root[8:], ihash[0:2], ihash[2:4], ihash, file["ext"])
+                    self.root[8:], md5[0:2], md5[2:4], md5, file["ext"])
 
             post["filename"] = file["md5"]
             post["extension"] = file["ext"]
@@ -69,12 +68,33 @@ class E621PoolExtractor(E621Extractor, danbooru.DanbooruPoolExtractor):
     pattern = BASE_PATTERN + r"/pool(?:s|/show)/(\d+)"
     test = (
         ("https://e621.net/pools/73", {
-            "url": "842f2fb065c7c339486a9b1d689020b8569888ed",
-            "content": "c2c87b7a9150509496cddc75ccab08109922876a",
+            "url": "1bd09a72715286a79eea3b7f09f51b3493eb579a",
+            "content": "91abe5d5334425d9787811d7f06d34c77974cd22",
         }),
         ("https://e621.net/pool/show/73"),
     )
 
+    def posts(self):
+        self.log.info("Fetching posts of pool %s", self.pool_id)
+
+        id_to_post = {
+            post["id"]: post
+            for post in self._pagination(
+                "/posts.json", {"tags": "pool:" + self.pool_id})
+        }
+
+        posts = []
+        append = posts.append
+        for num, pid in enumerate(self.post_ids, 1):
+            if pid in id_to_post:
+                post = id_to_post[pid]
+                post["num"] = num
+                append(post)
+            else:
+                self.log.warning("Post %s is unavailable", pid)
+
+        return posts
+
 
 class E621PostExtractor(E621Extractor, danbooru.DanbooruPostExtractor):
     """Extractor for single e621 posts"""
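The new `E621PoolExtractor.posts` above restores pool order (#1195): the posts endpoint returns results in arbitrary order, while the pool metadata carries the authoritative ID sequence. A minimal standalone sketch of the same idea:

```python
def order_pool(posts, post_ids):
    """Sort 'posts' by the pool's ID list and number them."""
    id_to_post = {post["id"]: post for post in posts}
    ordered = []
    for num, pid in enumerate(post_ids, 1):
        post = id_to_post.get(pid)
        if post is None:
            print("post", pid, "is unavailable")  # deleted or hidden
            continue
        post["num"] = num
        ordered.append(post)
    return ordered

print(order_pool([{"id": 3}, {"id": 1}], [1, 2, 3]))
# -> [{'id': 1, 'num': 1}, {'id': 3, 'num': 3}]
```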
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index b0614e2..7a28e9c 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -17,11 +17,12 @@ class GelbooruBase():
     category = "gelbooru"
     root = "https://gelbooru.com"
 
-    def _prepare_post(self, post, extended_tags=False):
-        url = booru.BooruExtractor._prepare_post(self, post, extended_tags)
-        if url.startswith("https://mp4.gelbooru.com/"):
+    @staticmethod
+    def _file_url(post):
+        url = post["file_url"]
+        if url.startswith(("https://mp4.gelbooru.com/", "https://video-cdn")):
             md5 = post["md5"]
-            return "https://img2.gelbooru.com/images/{}/{}/{}.webm".format(
+            url = "https://img2.gelbooru.com/images/{}/{}/{}.webm".format(
                 md5[0:2], md5[2:4], md5)
         return url
diff --git a/gallery_dl/extractor/hentaicafe.py b/gallery_dl/extractor/hentaicafe.py
index e12670a..462d3e9 100644
--- a/gallery_dl/extractor/hentaicafe.py
+++ b/gallery_dl/extractor/hentaicafe.py
@@ -19,6 +19,7 @@ class HentaicafeChapterExtractor(foolslide.FoolslideChapterExtractor):
     """Extractor for manga-chapters from hentai.cafe"""
     category = "hentaicafe"
     directory_fmt = ("{category}", "{manga}")
+    filename_fmt = "c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}"
     pattern = (r"(?:https?://)?(?:www\.)?hentai\.cafe"
                r"(/manga/read/[^/?#]+/[a-z-]+/\d+/\d+(?:/\d+)?)")
     test = ("https://hentai.cafe/manga/read/saitom-box/en/0/1/", {
@@ -32,13 +33,14 @@ class HentaicafeChapterExtractor(foolslide.FoolslideChapterExtractor):
         manga, _, chapter_string = info.partition(" :: ")
         data = self._data(self.gallery_url.split("/")[5])
-        data["manga"] = manga
+        if "manga" not in data:
+            data["manga"] = manga
         data["chapter_string"] = chapter_string.rstrip(" :")
         return self.parse_chapter_url(self.gallery_url, data)
 
     @memcache(keyarg=1)
     def _data(self, manga):
-        return {"artist": [], "tags": []}
+        return {"artist": (), "tags": ()}
 
 
 class HentaicafeMangaExtractor(foolslide.FoolslideMangaExtractor):
@@ -50,17 +52,17 @@ class HentaicafeMangaExtractor(foolslide.FoolslideMangaExtractor):
         # single chapter
         ("https://hentai.cafe/hazuki-yuuto-summer-blues/", {
             "url": "f8e24a07d6fbb7c6a6ec5ad8ad8faf2436f8751b",
-            "keyword": "5af1c570bb5f533a32b3375f9cdaa17a0152ba67",
+            "keyword": "ced644ff94ea22e1991a5e44bf37c38a7e2ac2b3",
         }),
         # multi-chapter
         ("https://hentai.cafe/saitom-saitom-box/", {
             "url": "ca3e8a91531fd6acd863d93ac3afbd8ead06a076",
-            "keyword": "3c28517d356cac6acbd9895c9eeefae505304078",
+            "keyword": "4c2262d680286a54357c334c1faca8f1b0e692e9",
         }),
         # new-style URL
         ("https://hentai.cafe/hc.fyi/2782", {
             "url": "ca3e8a91531fd6acd863d93ac3afbd8ead06a076",
-            "keyword": "3c28517d356cac6acbd9895c9eeefae505304078",
+            "keyword": "4c2262d680286a54357c334c1faca8f1b0e692e9",
         }),
         # foolslide URL
         ("https://hentai.cafe/manga/series/saitom-box/", {
@@ -80,16 +82,18 @@ class HentaicafeMangaExtractor(foolslide.FoolslideMangaExtractor):
             chapters.reverse()
             return chapters
 
-        url   , pos = text.extract(page, '<link rel="canonical" href="', '"')
+        manga , pos = text.extract(page, '<title>', '<')
+        url   , pos = text.extract(page, 'rel="canonical" href="', '"', pos)
         tags  , pos = text.extract(page, "<p>Tags: ", "</br>", pos)
         artist, pos = text.extract(page, "\nArtists: ", "</br>", pos)
-        manga , pos = text.extract(page, "/manga/read/", "/", pos)
+        key   , pos = text.extract(page, "/manga/read/", "/", pos)
 
         data = {
+            "manga"   : text.unescape(manga.rpartition(" | ")[0]),
             "manga_id": text.parse_int(url.rpartition("/")[2]),
             "tags"    : text.split_html(tags)[::2],
             "artist"  : text.split_html(artist),
         }
-        HentaicafeChapterExtractor._data(manga).update(data)
+        HentaicafeChapterExtractor._data(key).update(data)
 
         return [
             (url, data)
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 9870824..930c8b4 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -12,11 +12,13 @@
 from .common import Extractor, Message
 from .. import text, util, exception
 from ..cache import cache
-import itertools
 import json
 import time
 import re
 
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?instagram\.com"
+USER_PATTERN = BASE_PATTERN + r"/(?!(?:p|tv|reel|explore|stories)/)([^/?#]+)"
+
 
 class InstagramExtractor(Extractor):
     """Base class for instagram extractors"""
@@ -31,6 +33,7 @@ class InstagramExtractor(Extractor):
 
     def __init__(self, match):
         Extractor.__init__(self, match)
+        self.item = match.group(1)
         self.www_claim = "0"
         self.csrf_token = util.generate_csrf_token()
         self._find_tags = re.compile(r"#\w+").findall
@@ -68,15 +71,18 @@ class InstagramExtractor(Extractor):
 
     def request(self, url, **kwargs):
        response = Extractor.request(self, url, **kwargs)
+
         if response.history and "/accounts/login/" in response.request.url:
             if self._cursor:
                 self.log.info("Use '-o cursor=%s' to continue downloading "
                               "from the current position", self._cursor)
             raise exception.StopExtraction(
-                "Redirected to login page (%s)", response.request.url)
+                "HTTP redirect to login page (%s)", response.request.url)
+
         www_claim = response.headers.get("x-ig-set-www-claim")
         if www_claim is not None:
             self.www_claim = www_claim
+
         return response
 
     def _api_request(self, endpoint, params):
@@ -322,10 +328,11 @@ class InstagramExtractor(Extractor):
         cursor = self.config("cursor")
         if cursor:
             return {
-                "edges": (),
+                "edges"    : (),
                 "page_info": {
-                    "end_cursor": cursor,
+                    "end_cursor"   : cursor,
                     "has_next_page": True,
+                    "_virtual"     : True,
                 },
             }
         return user[key]
@@ -338,6 +345,10 @@ class InstagramExtractor(Extractor):
             info = data["page_info"]
             if not info["has_next_page"]:
                 return
+            elif not data["edges"] and "_virtual" not in info:
+                s = "" if self.item.endswith("s") else "s"
+                raise exception.StopExtraction(
+                    "%s'%s posts are private", self.item, s)
 
             variables["after"] = self._cursor = info["end_cursor"]
             self.log.debug("Cursor: %s", self._cursor)
"range": "1-16", + "count": ">= 16", + }) def posts(self): - url = "{}/{}/".format(self.root, self.user) + url = "{}/{}/".format(self.root, self.item) user = self._extract_profile_page(url) - if user.get("highlight_reel_count") and self.config("highlights"): - query_hash = "d4d88dc1500312af6f937f7b804c68c3" - variables = { - "user_id": user["id"], - "include_chaining": False, - "include_reel": True, - "include_suggested_users": False, - "include_logged_out_extras": False, - "include_highlight_reels": True, - "include_live_status": True, - } - data = self._graphql_request(query_hash, variables) - highlights = [ - { - "__typename": "GraphReel", - "id" : "highlight:" + edge["node"]["id"], - } - for edge in data["user"]["edge_highlight_reels"]["edges"] - ] - else: - highlights = None - query_hash = "003056d32c2554def87228bc3fd9668a" variables = {"id": user["id"], "first": 50} edge = self._get_edge_data(user, "edge_owner_to_timeline_media") - posts = self._pagination(query_hash, variables, edge) - - return itertools.chain(highlights, posts) if highlights else posts + return self._pagination(query_hash, variables, edge) class InstagramChannelExtractor(InstagramExtractor): """Extractor for ProfilePage channel""" subcategory = "channel" - pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" - r"/(?!p/|explore/|directory/|accounts/|stories/|tv/)" - r"([^/?#]+)/channel") + pattern = USER_PATTERN + r"/channel" test = ("https://www.instagram.com/instagram/channel/", { "range": "1-16", "count": ">= 16", }) - def __init__(self, match): - InstagramExtractor.__init__(self, match) - self.user = match.group(1) - def posts(self): - url = "{}/{}/channel/".format(self.root, self.user) + url = "{}/{}/channel/".format(self.root, self.item) user = self._extract_profile_page(url) query_hash = "bc78b344a68ed16dd5d7f264681c4c76" @@ -431,17 +424,11 @@ class InstagramChannelExtractor(InstagramExtractor): class InstagramSavedExtractor(InstagramExtractor): """Extractor for ProfilePage saved media""" subcategory = "saved" - pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" - r"/(?!p/|explore/|directory/|accounts/|stories/|tv/)" - r"([^/?#]+)/saved") + pattern = USER_PATTERN + r"([^/?#]+)/saved" test = ("https://www.instagram.com/instagram/saved/",) - def __init__(self, match): - InstagramExtractor.__init__(self, match) - self.user = match.group(1) - def posts(self): - url = "{}/{}/saved/".format(self.root, self.user) + url = "{}/{}/saved/".format(self.root, self.item) user = self._extract_profile_page(url) query_hash = "2ce1d673055b99250e93b6f88f878fde" @@ -454,22 +441,17 @@ class InstagramTagExtractor(InstagramExtractor): """Extractor for TagPage""" subcategory = "tag" directory_fmt = ("{category}", "{subcategory}", "{tag}") - pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" - r"/explore/tags/([^/?#]+)") + pattern = BASE_PATTERN + r"/explore/tags/([^/?#]+)" test = ("https://www.instagram.com/explore/tags/instagram/", { "range": "1-16", "count": ">= 16", }) - def __init__(self, match): - InstagramExtractor.__init__(self, match) - self.tag = match.group(1) - def metadata(self): - return {"tag": self.tag} + return {"tag": self.item} def posts(self): - url = "{}/explore/tags/{}/".format(self.root, self.tag) + url = "{}/explore/tags/{}/".format(self.root, self.item) data = self._extract_shared_data(url) hashtag = data["entry_data"]["TagPage"][0]["graphql"]["hashtag"] @@ -599,21 +581,20 @@ class InstagramPostExtractor(InstagramExtractor): ("https://www.instagram.com/reel/CDg_6Y1pxWu/"), ) - def __init__(self, match): - 
@@ -346,80 +357,62 @@ class InstagramExtractor(Extractor):
 
 class InstagramUserExtractor(InstagramExtractor):
-    """Extractor for ProfilePage"""
+    """Extractor for an Instagram user profile"""
     subcategory = "user"
-    pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
-               r"/(?!(?:p|explore|directory|accounts|stories|tv|reel)/)"
-               r"([^/?#]+)/?(?:$|[?#])")
+    pattern = USER_PATTERN + r"/?(?:$|[?#])"
     test = (
-        ("https://www.instagram.com/instagram/", {
-            "range": "1-16",
-            "count": ">= 16",
-        }),
-        # ("https://www.instagram.com/instagram/", {
-        #     "options": (("highlights", True),),
-        #     "pattern": InstagramStoriesExtractor.pattern,
-        #     "range": "1-2",
-        #     "count": 2,
-        # }),
+        ("https://www.instagram.com/instagram/"),
         ("https://www.instagram.com/instagram/?hl=en"),
     )
 
-    def __init__(self, match):
-        InstagramExtractor.__init__(self, match)
-        self.user = match.group(1)
+    def items(self):
+        if self.config("highlights"):
+            self.log.warning("'highlights' is deprecated, "
+                             "use '\"include\": \"…,highlights\"' instead")
+            default = ("highlights", "posts")
+        else:
+            default = ("posts",)
+
+        base = "{}/{}/".format(self.root, self.item)
+        stories = "{}/stories/{}/".format(self.root, self.item)
+        return self._dispatch_extractors((
+            (InstagramStoriesExtractor   , stories),
+            (InstagramHighlightsExtractor, base + "highlights/"),
+            (InstagramPostsExtractor     , base + "posts/"),
+            (InstagramChannelExtractor   , base + "channel/"),
+        ), default)
+
+
+class InstagramPostsExtractor(InstagramExtractor):
+    """Extractor for ProfilePage posts"""
+    subcategory = "posts"
+    pattern = USER_PATTERN + r"/posts"
+    test = ("https://www.instagram.com/instagram/posts/", {
+        "range": "1-16",
+        "count": ">= 16",
+    })
 
     def posts(self):
-        url = "{}/{}/".format(self.root, self.user)
+        url = "{}/{}/".format(self.root, self.item)
         user = self._extract_profile_page(url)
 
-        if user.get("highlight_reel_count") and self.config("highlights"):
-            query_hash = "d4d88dc1500312af6f937f7b804c68c3"
-            variables = {
-                "user_id": user["id"],
-                "include_chaining": False,
-                "include_reel": True,
-                "include_suggested_users": False,
-                "include_logged_out_extras": False,
-                "include_highlight_reels": True,
-                "include_live_status": True,
-            }
-            data = self._graphql_request(query_hash, variables)
-            highlights = [
-                {
-                    "__typename": "GraphReel",
-                    "id"        : "highlight:" + edge["node"]["id"],
-                }
-                for edge in data["user"]["edge_highlight_reels"]["edges"]
-            ]
-        else:
-            highlights = None
-
         query_hash = "003056d32c2554def87228bc3fd9668a"
         variables = {"id": user["id"], "first": 50}
         edge = self._get_edge_data(user, "edge_owner_to_timeline_media")
-        posts = self._pagination(query_hash, variables, edge)
-
-        return itertools.chain(highlights, posts) if highlights else posts
+        return self._pagination(query_hash, variables, edge)
 
 
 class InstagramChannelExtractor(InstagramExtractor):
     """Extractor for ProfilePage channel"""
     subcategory = "channel"
-    pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
-               r"/(?!p/|explore/|directory/|accounts/|stories/|tv/)"
-               r"([^/?#]+)/channel")
+    pattern = USER_PATTERN + r"/channel"
     test = ("https://www.instagram.com/instagram/channel/", {
         "range": "1-16",
         "count": ">= 16",
     })
 
-    def __init__(self, match):
-        InstagramExtractor.__init__(self, match)
-        self.user = match.group(1)
-
     def posts(self):
-        url = "{}/{}/channel/".format(self.root, self.user)
+        url = "{}/{}/channel/".format(self.root, self.item)
         user = self._extract_profile_page(url)
 
         query_hash = "bc78b344a68ed16dd5d7f264681c4c76"
@@ -431,17 +424,11 @@ class InstagramChannelExtractor(InstagramExtractor):
 
 class InstagramSavedExtractor(InstagramExtractor):
     """Extractor for ProfilePage saved media"""
     subcategory = "saved"
-    pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
-               r"/(?!p/|explore/|directory/|accounts/|stories/|tv/)"
-               r"([^/?#]+)/saved")
+    pattern = USER_PATTERN + r"([^/?#]+)/saved"
     test = ("https://www.instagram.com/instagram/saved/",)
 
-    def __init__(self, match):
-        InstagramExtractor.__init__(self, match)
-        self.user = match.group(1)
-
     def posts(self):
-        url = "{}/{}/saved/".format(self.root, self.user)
+        url = "{}/{}/saved/".format(self.root, self.item)
         user = self._extract_profile_page(url)
 
         query_hash = "2ce1d673055b99250e93b6f88f878fde"
@@ -454,22 +441,17 @@ class InstagramTagExtractor(InstagramExtractor):
     """Extractor for TagPage"""
     subcategory = "tag"
     directory_fmt = ("{category}", "{subcategory}", "{tag}")
-    pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
-               r"/explore/tags/([^/?#]+)")
+    pattern = BASE_PATTERN + r"/explore/tags/([^/?#]+)"
     test = ("https://www.instagram.com/explore/tags/instagram/", {
         "range": "1-16",
         "count": ">= 16",
     })
 
-    def __init__(self, match):
-        InstagramExtractor.__init__(self, match)
-        self.tag = match.group(1)
-
     def metadata(self):
-        return {"tag": self.tag}
+        return {"tag": self.item}
 
     def posts(self):
-        url = "{}/explore/tags/{}/".format(self.root, self.tag)
+        url = "{}/explore/tags/{}/".format(self.root, self.item)
         data = self._extract_shared_data(url)
         hashtag = data["entry_data"]["TagPage"][0]["graphql"]["hashtag"]
@@ -599,21 +581,20 @@ class InstagramPostExtractor(InstagramExtractor):
         ("https://www.instagram.com/reel/CDg_6Y1pxWu/"),
     )
 
-    def __init__(self, match):
-        InstagramExtractor.__init__(self, match)
-        self.shortcode = match.group(1)
-
     def posts(self):
         query_hash = "a9441f24ac73000fa17fe6e6da11d59d"
         variables = {
-            "shortcode"            : self.shortcode,
+            "shortcode"            : self.item,
             "child_comment_count"  : 3,
             "fetch_comment_count"  : 40,
             "parent_comment_count" : 24,
             "has_threaded_comments": True
         }
         data = self._graphql_request(query_hash, variables)
-        return (data["shortcode_media"],)
+        media = data.get("shortcode_media")
+        if not media:
+            raise exception.NotFoundError("post")
+        return (media,)
 
 
 class InstagramStoriesExtractor(InstagramExtractor):
@@ -644,3 +625,34 @@ class InstagramStoriesExtractor(InstagramExtractor):
         reel_id = user["id"]
 
         return ({"__typename": "GraphReel", "id": reel_id},)
+
+
+class InstagramHighlightsExtractor(InstagramExtractor):
+    """Extractor for all Instagram story highlights of a user"""
+    subcategory = "highlights"
+    pattern = USER_PATTERN + r"/highlights"
+    test = ("https://www.instagram.com/instagram/highlights",)
+
+    def posts(self):
+        url = "{}/{}/".format(self.root, self.item)
+        user = self._extract_profile_page(url)
+
+        query_hash = "d4d88dc1500312af6f937f7b804c68c3"
+        variables = {
+            "user_id": user["id"],
+            "include_chaining": False,
+            "include_reel": True,
+            "include_suggested_users": False,
+            "include_logged_out_extras": False,
+            "include_highlight_reels": True,
+            "include_live_status": True,
+        }
+        data = self._graphql_request(query_hash, variables)
+
+        return [
+            {
+                "__typename": "GraphReel",
+                "id"        : "highlight:" + edge["node"]["id"],
+            }
+            for edge in data["user"]["edge_highlight_reels"]["edges"]
+        ]
diff --git a/gallery_dl/extractor/keenspot.py b/gallery_dl/extractor/keenspot.py
index 5902333..0cbea67 100644
--- a/gallery_dl/extractor/keenspot.py
+++ b/gallery_dl/extractor/keenspot.py
@@ -39,7 +39,7 @@ class KeenspotComicExtractor(Extractor):
         }),
         ("http://twokinds.keenspot.com/comic/1066/", {  # "random" access
             "range": "1-3",
-            "url": "97e2a6ed8ba1709314f2449f84b6b1ce5db21c04",
+            "url": "6a784e11370abfb343dcad9adbb7718f9b7be350",
         })
     )
 
@@ -58,7 +58,14 @@ class KeenspotComicExtractor(Extractor):
         yield Message.Version, 1
         yield Message.Directory, data
 
-        url = self._first(self.request(self.root + "/").text)
+        with self.request(self.root + "/") as response:
+            if response.history:
+                url = response.request.url
+                self.root = url[:url.index("/", 8)]
+            page = response.text
+            del response
+
+        url = self._first(page)
         if self.path:
             url = self.root + self.path
diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py
index 96c81c7..dca8995 100644
--- a/gallery_dl/extractor/mangadex.py
+++ b/gallery_dl/extractor/mangadex.py
@@ -196,5 +196,8 @@ class MangadexMangaExtractor(MangadexExtractor):
                 "_extractor": MangadexChapterExtractor,
             })
 
-        results.sort(key=lambda x: (x["chapter"], x["chapter_minor"]))
+        results.sort(
+            key=lambda x: (x["chapter"], x["chapter_minor"]),
+            reverse=self.config("chapter-reverse", False),
+        )
         return results
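The keenspot hunk above adopts the final host of a redirect chain as the new `root`, so later links resolve against the right domain. A hedged sketch of the same idea using plain `requests` (gallery-dl uses its own request wrapper):

```python
import requests

def resolve_root(url):
    """Return scheme + host of wherever 'url' ends up after redirects."""
    response = requests.get(url, timeout=30)
    if response.history:          # at least one redirect happened
        url = response.url        # final URL after following redirects
    return url[:url.index("/", 8)]  # cut after "https://host"

# e.g. resolve_root("http://twokinds.keenspot.com/") would return the
# redirect target's root if the comic moved to its own domain.
```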
diff --git a/gallery_dl/extractor/mangapanda.py b/gallery_dl/extractor/mangapanda.py
deleted file mode 100644
index 155a9b6..0000000
--- a/gallery_dl/extractor/mangapanda.py
+++ /dev/null
@@ -1,118 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2015-2020 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for http://www.mangapanda.com/"""
-
-from .common import ChapterExtractor, MangaExtractor
-from .. import text
-
-
-class MangapandaBase():
-    """Base class for mangapanda extractors"""
-    category = "mangapanda"
-    root = "http://www.mangapanda.com"
-
-    @staticmethod
-    def parse_page(page, data):
-        """Parse metadata on 'page' and add it to 'data'"""
-        text.extract_all(page, (
-            ("manga"  , '<h2 class="aname">', '</h2>'),
-            ("release", '>Year of Release:</td>\n<td>', '</td>'),
-            ('author' , '>Author:</td>\n<td>', '</td>'),
-            ('artist' , '>Artist:</td>\n<td>', '</td>'),
-        ), values=data)
-        data["manga"] = data["manga"].strip()
-        data["author"] = text.unescape(data["author"])
-        data["artist"] = text.unescape(data["artist"])
-        return data
-
-
-class MangapandaChapterExtractor(MangapandaBase, ChapterExtractor):
-    """Extractor for manga-chapters from mangapanda.com"""
-    archive_fmt = "{manga}_{chapter}_{page}"
-    pattern = r"(?:https?://)?(?:www\.)?mangapanda\.com((/[^/?#]+)/(\d+))"
-    test = ("http://www.mangapanda.com/red-storm/2", {
-        "url": "1f633f776e950531ba9b1e81965316458e785261",
-        "keyword": "b24df4b9cc36383fb6a44e06d32a3884a4dcb5fb",
-    })
-
-    def __init__(self, match):
-        path, self.url_title, self.chapter = match.groups()
-        ChapterExtractor.__init__(self, match, self.root + path)
-
-    def metadata(self, chapter_page):
-        page = self.request(self.root + self.url_title).text
-        data = self.parse_page(page, {
-            "chapter": text.parse_int(self.chapter),
-            "lang": "en",
-            "language": "English",
-        })
-        text.extract_all(page, (
-            ('title', ' ' + self.chapter + '</a> : ', '</td>'),
-            ('date', '<td>', '</td>'),
-        ), page.index('<div id="chapterlist">'), data)
-        data["count"] = text.parse_int(text.extract(
-            chapter_page, '</select> of ', '<')[0]
-        )
-        return data
-
-    def images(self, page):
-        while True:
-            next_url, image_url, image_data = self.get_image_metadata(page)
-            yield image_url, image_data
-
-            if not next_url:
-                return
-            page = self.request(next_url).text
-
-    def get_image_metadata(self, page):
-        """Collect next url, image-url and metadata for one manga-page"""
-        extr = text.extract
-        width = None
-        test , pos = extr(page, "document['pu']", '')
-        if test is None:
-            return None, None, None
-        if page.find("document['imgwidth']", pos, pos+200) != -1:
-            width , pos = extr(page, "document['imgwidth'] = ", ";", pos)
-            height, pos = extr(page, "document['imgheight'] = ", ";", pos)
-        _  , pos = extr(page, '<div id="imgholder">', '')
-        url, pos = extr(page, ' href="', '"', pos)
-        if width is None:
-            width , pos = extr(page, '<img id="img" width="', '"', pos)
-            height, pos = extr(page, ' height="', '"', pos)
-        image, pos = extr(page, ' src="', '"', pos)
-        return self.root + url, image, {
-            "width": text.parse_int(width),
-            "height": text.parse_int(height),
-        }
-
-
-class MangapandaMangaExtractor(MangapandaBase, MangaExtractor):
-    """Extractor for manga from mangapanda.com"""
-    chapterclass = MangapandaChapterExtractor
-    reverse = False
-    pattern = r"(?:https?://)?(?:www\.)?mangapanda\.com(/[^/?#]+)/?$"
-    test = ("http://www.mangapanda.com/mushishi", {
-        "url": "50a1ba730b85426b904da256c80f68ba6a8a2566",
-        "keyword": "031b3ea085921c552de017ecbb9b906e462229c9",
-    })
-
-    def chapters(self, page):
-        results = []
-        data = self.parse_page(page, {"lang": "en", "language": "English"})
-
-        needle = '<div class="chico_manga"></div>\n<a href="'
-        pos = page.index('<div id="chapterlist">')
-        while True:
-            url, pos = text.extract(page, needle, '"', pos)
-            if not url:
-                return results
-            data["title"], pos = text.extract(page, '</a> : ', '</td>', pos)
-            data["date"] , pos = text.extract(page, '<td>', '</td>', pos)
-            data["chapter"] = text.parse_int(url.rpartition("/")[2])
-            results.append((self.root + url, data.copy()))
data["chapter"] = text.parse_int(url.rpartition("/")[2]) - results.append((self.root + url, data.copy())) diff --git a/gallery_dl/extractor/moebooru.py b/gallery_dl/extractor/moebooru.py index cbc8680..0ac55cd 100644 --- a/gallery_dl/extractor/moebooru.py +++ b/gallery_dl/extractor/moebooru.py @@ -23,16 +23,11 @@ class MoebooruExtractor(BooruExtractor): filename_fmt = "{category}_{id}_{md5}.{extension}" page_start = 1 - def _prepare_post(self, post, extended_tags=False): - url = post["file_url"] - if url[0] == "/": - url = self.root + url - if extended_tags: - self._fetch_extended_tags(post) + @staticmethod + def _prepare(post): post["date"] = text.parse_timestamp(post["created_at"]) - return url - def _fetch_extended_tags(self, post): + def _extended_tags(self, post): url = "{}/post/show/{}".format(self.root, post["id"]) page = self.request(url).text html = text.extract(page, '<ul id="tag-', '</ul>')[0] @@ -217,13 +212,6 @@ EXTRACTORS = { }), "test-post": ("https://hypnohub.net/post/show/73964", { "content": "02d5f5a8396b621a6efc04c5f8ef1b7225dfc6ee", - "options": (("tags", True),), - "keyword": { - "tags_artist": "gonoike_biwa icontrol_(manipper)", - "tags_character": "komaru_naegi", - "tags_copyright": "dangan_ronpa dangan_ronpa_another_episode", - "tags_general": str, - }, }), "test-popular": ( ("https://hypnohub.net/post/popular_by_month?month=6&year=2014", { diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index 2394acf..e558513 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -100,6 +100,10 @@ class NijieExtractor(AsynchronousMixin, Extractor): @cache(maxage=150*24*3600, keyarg=1) def _login_impl(self, username, password): + if not username or not password: + raise exception.AuthenticationError( + "Username and password required") + self.log.info("Logging in as %s", username) url = "{}/login_int.php".format(self.root) data = {"email": username, "password": password, "save": "on"} diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index aa11289..739e67e 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -14,7 +14,6 @@ from ..cache import cache import itertools import json - BASE_PATTERN = r"(?:https?://)?(?:\w+\.)?pinterest\.[\w.]+" @@ -31,29 +30,59 @@ class PinterestExtractor(Extractor): def items(self): self.api.login() data = self.metadata() - yield Message.Version, 1 - yield Message.Directory, data + videos = self.config("videos", True) + yield Message.Directory, data for pin in self.pins(): - if "images" in pin: - url, pin_data = self.data_from_pin(pin) - pin_data.update(data) - yield Message.Url, url, pin_data + + try: + media = self._media_from_pin(pin) + except Exception: + self.log.debug("Unable to fetch download URL for pin %s", + pin.get("id")) + continue + + if not videos and media.get("duration") is not None: + continue + + pin.update(data) + pin.update(media) + url = media["url"] + text.nameext_from_url(url, pin) + + if pin["extension"] == "m3u8": + url = "ytdl:" + url + pin["extension"] = "mp4" + pin["_ytdl_extra"] = {"protocol": "m3u8_native"} + + yield Message.Url, url, pin def metadata(self): """Return general metadata""" def pins(self): - """Return all relevant pin-objects""" + """Return all relevant pin objects""" @staticmethod - def data_from_pin(pin): - """Get image url and metadata from a pin-object""" - img = pin["images"]["orig"] - url = img["url"] - pin["width"] = img["width"] - pin["height"] = img["height"] - return url, 
diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py
index aa11289..739e67e 100644
--- a/gallery_dl/extractor/pinterest.py
+++ b/gallery_dl/extractor/pinterest.py
@@ -14,7 +14,6 @@
 from ..cache import cache
 import itertools
 import json
-
 BASE_PATTERN = r"(?:https?://)?(?:\w+\.)?pinterest\.[\w.]+"
 
@@ -31,29 +30,59 @@ class PinterestExtractor(Extractor):
     def items(self):
         self.api.login()
         data = self.metadata()
-        yield Message.Version, 1
-        yield Message.Directory, data
+        videos = self.config("videos", True)
 
+        yield Message.Directory, data
         for pin in self.pins():
-            if "images" in pin:
-                url, pin_data = self.data_from_pin(pin)
-                pin_data.update(data)
-                yield Message.Url, url, pin_data
+
+            try:
+                media = self._media_from_pin(pin)
+            except Exception:
+                self.log.debug("Unable to fetch download URL for pin %s",
+                               pin.get("id"))
+                continue
+
+            if not videos and media.get("duration") is not None:
+                continue
+
+            pin.update(data)
+            pin.update(media)
+            url = media["url"]
+            text.nameext_from_url(url, pin)
+
+            if pin["extension"] == "m3u8":
+                url = "ytdl:" + url
+                pin["extension"] = "mp4"
+                pin["_ytdl_extra"] = {"protocol": "m3u8_native"}
+
+            yield Message.Url, url, pin
 
     def metadata(self):
         """Return general metadata"""
 
     def pins(self):
-        """Return all relevant pin-objects"""
+        """Return all relevant pin objects"""
 
     @staticmethod
-    def data_from_pin(pin):
-        """Get image url and metadata from a pin-object"""
-        img = pin["images"]["orig"]
-        url = img["url"]
-        pin["width"] = img["width"]
-        pin["height"] = img["height"]
-        return url, text.nameext_from_url(url, pin)
+    def _media_from_pin(pin):
+        videos = pin.get("videos")
+        if videos:
+            video_formats = videos["video_list"]
+
+            for fmt in ("V_HLSV4", "V_HLSV3_WEB", "V_HLSV3_MOBILE"):
+                if fmt in video_formats:
+                    media = video_formats[fmt]
+                    break
+            else:
+                media = max(video_formats.values(),
+                            key=lambda x: x.get("width", 0))
+
+            if "V_720P" in video_formats:
+                media["_fallback"] = (video_formats["V_720P"]["url"],)
+
+            return media
+
+        return pin["images"]["orig"]
 
 
 class PinterestPinExtractor(PinterestExtractor):
@@ -66,6 +95,11 @@ class PinterestPinExtractor(PinterestExtractor):
             "content": ("4c435a66f6bb82bb681db2ecc888f76cf6c5f9ca",
                         "d3e24bc9f7af585e8c23b9136956bd45a4d9b947"),
         }),
+        # video pin (#1189)
+        ("https://www.pinterest.com/pin/422564377542934214/", {
+            "pattern": r"https://v\.pinimg\.com/videos/mc/hls/d7/22/ff"
+                       r"/d722ff00ab2352981b89974b37909de8.m3u8",
+        }),
         ("https://www.pinterest.com/pin/858146903966145188/", {
             "exception": exception.NotFoundError,
         }),
@@ -78,7 +112,7 @@ class PinterestPinExtractor(PinterestExtractor):
 
     def metadata(self):
         self.pin = self.api.pin(self.pin_id)
-        return self.data_from_pin(self.pin)[1]
+        return self.pin
 
     def pins(self):
         return (self.pin,)
@@ -173,8 +207,7 @@ class PinterestRelatedPinExtractor(PinterestPinExtractor):
     })
 
     def metadata(self):
-        pin = self.api.pin(self.pin_id)
-        return {"original_pin": self.data_from_pin(pin)[1]}
+        return {"original_pin": self.api.pin(self.pin_id)}
 
     def pins(self):
         return self.api.pin_related(self.pin_id)
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index a813d0e..8aee058 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -522,6 +522,10 @@ class PixivAppAPI():
 
     @cache(maxage=3600, keyarg=1)
     def _login_impl(self, username, password):
+        if not username or not password:
+            raise exception.AuthenticationError(
+                "Username and password required")
+
         url = "https://oauth.secure.pixiv.net/auth/token"
         data = {
             "client_id": self.client_id,
@@ -550,6 +554,7 @@ class PixivAppAPI():
         response = self.extractor.request(
             url, method="POST", headers=headers, data=data, fatal=False)
         if response.status_code >= 400:
+            self.log.debug(response.text)
             raise exception.AuthenticationError()
 
         data = response.json()["response"]
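`_media_from_pin` above prefers Pinterest's HLS renditions in a fixed order and keeps a plain 720p URL as a fallback. The selection logic as a standalone function (format names are the ones in the diff; the sample input is made up):

```python
def pick_video_format(video_list):
    for fmt in ("V_HLSV4", "V_HLSV3_WEB", "V_HLSV3_MOBILE"):
        if fmt in video_list:
            media = video_list[fmt]
            break
    else:  # no HLS variant: take the widest remaining format
        media = max(video_list.values(), key=lambda x: x.get("width", 0))
    if "V_720P" in video_list:
        media["_fallback"] = (video_list["V_720P"]["url"],)
    return media

print(pick_video_format({
    "V_720P" : {"url": "https://v.example/720.mp4", "width": 720},
    "V_HLSV4": {"url": "https://v.example/master.m3u8", "width": 1080},
})["url"])  # -> the HLS master playlist
```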
r"/(users|model)/([^/?#]+)" + pattern = (BASE_PATTERN + r"/(users|model|pornstar)/([^/?#]+)" "(?:/photos(?:/(public|private|favorites))?)?/?$") test = ( - ("https://www.pornhub.com/users/flyings0l0/photos/public", { + ("https://www.pornhub.com/pornstar/danika-mori/photos", { "pattern": PornhubGalleryExtractor.pattern, "count": ">= 6", }), diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py index cfbab1d..aa0ba6d 100644 --- a/gallery_dl/extractor/reactor.py +++ b/gallery_dl/extractor/reactor.py @@ -231,12 +231,13 @@ class JoyreactorSearchExtractor(ReactorSearchExtractor): category = "joyreactor" pattern = JR_BASE_PATTERN + r"/search(?:/|\?q=)([^/?#]+)" test = ( - ("http://joyreactor.cc/search/Cirno+Gifs", { + ("http://joyreactor.cc/search/Cirno", { "range": "1-25", "count": ">= 20", }), - ("http://joyreactor.com/search?q=Cirno+Gifs", { - "count": 0, # no search results on joyreactor.com + ("http://joyreactor.com/search?q=Cirno", { + "range": "1-25", + "count": ">= 20", }), ) diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index 438dd9f..9e64eac 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -6,13 +6,15 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://chan.sankakucomplex.com/""" +"""Extractors for https://sankaku.app/""" from .booru import BooruExtractor from .. import text, exception +from ..cache import cache import collections -BASE_PATTERN = r"(?:https?://)?(?:beta|chan)\.sankakucomplex\.com" +BASE_PATTERN = r"(?:https?://)?" \ + r"(?:sankaku\.app|(?:beta|chan)\.sankakucomplex\.com)" class SankakuExtractor(BooruExtractor): @@ -20,8 +22,8 @@ class SankakuExtractor(BooruExtractor): basecategory = "booru" category = "sankaku" filename_fmt = "{category}_{id}_{md5}.{extension}" - request_interval_min = 1.0 - per_page = 100 + cookiedomain = None + _warning = True TAG_TYPES = { 0: "general", @@ -36,17 +38,24 @@ class SankakuExtractor(BooruExtractor): 9: "meta", } - def _prepare_post(self, post, extended_tags=False): + def skip(self, num): + return 0 + + def _file_url(self, post): url = post["file_url"] - if url[0] == "/": - url = self.root + url - if extended_tags: - self._fetch_extended_tags(post) - post["date"] = text.parse_timestamp(post["created_at"]["s"]) - post["tags"] = [tag["name"] for tag in post["tags"]] + if not url and self._warning: + self.log.warning( + "Login required to download 'contentious_content' posts") + SankakuExtractor._warning = False return url - def _fetch_extended_tags(self, post): + @staticmethod + def _prepare(post): + post["created_at"] = post["created_at"]["s"] + post["date"] = text.parse_timestamp(post["created_at"]) + post["tags"] = [tag["name"] for tag in post["tags"]] + + def _extended_tags(self, post): tags = collections.defaultdict(list) types = self.TAG_TYPES for tag in post["tags"]: @@ -54,44 +63,21 @@ class SankakuExtractor(BooruExtractor): for key, value in tags.items(): post["tags_" + key] = value - def _api_request(self, endpoint, params=None): - url = "https://capi-v2.sankakucomplex.com" + endpoint - while True: - response = self.request(url, params=params, fatal=False) - if response.status_code == 429: - self.wait(until=response.headers.get("X-RateLimit-Reset")) - continue - return response.json() - - def _pagination(self, params): - params["lang"] = "en" - params["limit"] = str(self.per_page) - - while True: - data = self._api_request("/posts/keyset", params) - 
if not data.get("success", True): - raise exception.StopExtraction(data.get("code")) - yield from data["data"] - - params["next"] = data["meta"]["next"] - if not params["next"]: - return - if "page" in params: - del params["page"] - class SankakuTagExtractor(SankakuExtractor): - """Extractor for images from chan.sankakucomplex.com by search-tags""" + """Extractor for images from sankaku.app by search-tags""" subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{id}" pattern = BASE_PATTERN + r"/\?([^#]*)" test = ( - ("https://beta.sankakucomplex.com/?tags=bonocho", { + ("https://sankaku.app/?tags=bonocho", { "count": 5, "pattern": r"https://c?s\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}" r"/[^/]{32}\.\w+\?e=\d+&m=[^&#]+", }), + ("https://beta.sankakucomplex.com/?tags=bonocho"), + ("https://chan.sankakucomplex.com/?tags=bonocho"), # error on five or more tags ("https://chan.sankakucomplex.com/?tags=bonocho+a+b+c+d", { "options": (("username", None),), @@ -111,19 +97,21 @@ class SankakuTagExtractor(SankakuExtractor): return {"search_tags": self.tags} def posts(self): - return self._pagination({"tags": self.tags}) + params = {"tags": self.tags} + return SankakuAPI(self).posts_keyset(params) class SankakuPoolExtractor(SankakuExtractor): - """Extractor for image pools or books from chan.sankakucomplex.com""" + """Extractor for image pools or books from sankaku.app""" subcategory = "pool" directory_fmt = ("{category}", "pool", "{pool[id]} {pool[name_en]}") archive_fmt = "p_{pool}_{id}" pattern = BASE_PATTERN + r"/(?:books|pool/show)/(\d+)" test = ( - ("https://beta.sankakucomplex.com/books/90", { + ("https://sankaku.app/books/90", { "count": 5, }), + ("https://beta.sankakucomplex.com/books/90"), ("https://chan.sankakucomplex.com/pool/show/90"), ) @@ -132,7 +120,7 @@ class SankakuPoolExtractor(SankakuExtractor): self.pool_id = match.group(1) def metadata(self): - pool = self._api_request("/pools/" + self.pool_id) + pool = SankakuAPI(self).pools(self.pool_id) self._posts = pool.pop("posts") return {"pool": pool} @@ -141,12 +129,12 @@ class SankakuPoolExtractor(SankakuExtractor): class SankakuPostExtractor(SankakuExtractor): - """Extractor for single images from chan.sankakucomplex.com""" + """Extractor for single posts from sankaku.app""" subcategory = "post" archive_fmt = "{id}" pattern = BASE_PATTERN + r"/post/show/(\d+)" test = ( - ("https://beta.sankakucomplex.com/post/show/360451", { + ("https://sankaku.app/post/show/360451", { "content": "5e255713cbf0a8e0801dc423563c34d896bb9229", "options": (("tags", True),), "keyword": { @@ -158,6 +146,12 @@ class SankakuPostExtractor(SankakuExtractor): "tags_general" : list, }, }), + # 'contentious_content' + ("https://sankaku.app/post/show/21418978", { + "pattern": r"https://s\.sankakucomplex\.com" + r"/data/13/3c/133cda3bfde249c504284493903fb985\.jpg", + }), + ("https://beta.sankakucomplex.com/post/show/360451"), ("https://chan.sankakucomplex.com/post/show/360451"), ) @@ -166,4 +160,128 @@ class SankakuPostExtractor(SankakuExtractor): self.post_id = match.group(1) def posts(self): - return self._pagination({"tags": "id:" + self.post_id}) + return SankakuAPI(self).posts(self.post_id) + + +class SankakuAPI(): + """Interface for the sankaku.app API""" + + def __init__(self, extractor): + self.extractor = extractor + self.headers = {"Accept": "application/vnd.sankaku.api+json;v=2"} + + self.username, self.password = self.extractor._get_auth_info() + if not self.username: + self.authenticate = lambda: None + + def 
@@ -111,19 +97,21 @@ class SankakuTagExtractor(SankakuExtractor):
         return {"search_tags": self.tags}
 
     def posts(self):
-        return self._pagination({"tags": self.tags})
+        params = {"tags": self.tags}
+        return SankakuAPI(self).posts_keyset(params)
 
 
 class SankakuPoolExtractor(SankakuExtractor):
-    """Extractor for image pools or books from chan.sankakucomplex.com"""
+    """Extractor for image pools or books from sankaku.app"""
     subcategory = "pool"
     directory_fmt = ("{category}", "pool", "{pool[id]} {pool[name_en]}")
     archive_fmt = "p_{pool}_{id}"
     pattern = BASE_PATTERN + r"/(?:books|pool/show)/(\d+)"
     test = (
-        ("https://beta.sankakucomplex.com/books/90", {
+        ("https://sankaku.app/books/90", {
             "count": 5,
         }),
+        ("https://beta.sankakucomplex.com/books/90"),
         ("https://chan.sankakucomplex.com/pool/show/90"),
     )
 
@@ -132,7 +120,7 @@ class SankakuPoolExtractor(SankakuExtractor):
         self.pool_id = match.group(1)
 
     def metadata(self):
-        pool = self._api_request("/pools/" + self.pool_id)
+        pool = SankakuAPI(self).pools(self.pool_id)
         self._posts = pool.pop("posts")
         return {"pool": pool}
 
@@ -141,12 +129,12 @@ class SankakuPoolExtractor(SankakuExtractor):
 
 class SankakuPostExtractor(SankakuExtractor):
-    """Extractor for single images from chan.sankakucomplex.com"""
+    """Extractor for single posts from sankaku.app"""
     subcategory = "post"
     archive_fmt = "{id}"
     pattern = BASE_PATTERN + r"/post/show/(\d+)"
     test = (
-        ("https://beta.sankakucomplex.com/post/show/360451", {
+        ("https://sankaku.app/post/show/360451", {
             "content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
             "options": (("tags", True),),
             "keyword": {
@@ -158,6 +146,12 @@ class SankakuPostExtractor(SankakuExtractor):
                 "tags_general" : list,
             },
         }),
+        # 'contentious_content'
+        ("https://sankaku.app/post/show/21418978", {
+            "pattern": r"https://s\.sankakucomplex\.com"
+                       r"/data/13/3c/133cda3bfde249c504284493903fb985\.jpg",
+        }),
+        ("https://beta.sankakucomplex.com/post/show/360451"),
         ("https://chan.sankakucomplex.com/post/show/360451"),
     )
 
@@ -166,4 +160,128 @@ class SankakuPostExtractor(SankakuExtractor):
         self.post_id = match.group(1)
 
     def posts(self):
-        return self._pagination({"tags": "id:" + self.post_id})
+        return SankakuAPI(self).posts(self.post_id)
+
+
+class SankakuAPI():
+    """Interface for the sankaku.app API"""
+
+    def __init__(self, extractor):
+        self.extractor = extractor
+        self.headers = {"Accept": "application/vnd.sankaku.api+json;v=2"}
+
+        self.username, self.password = self.extractor._get_auth_info()
+        if not self.username:
+            self.authenticate = lambda: None
+
+    def pools(self, pool_id):
+        params = {"lang": "en"}
+        return self._call("/pools/" + pool_id, params)
+
+    def posts(self, post_id):
+        params = {
+            "lang" : "en",
+            "page" : "1",
+            "limit": "1",
+            "tags" : "id_range:" + post_id,
+        }
+        return self._call("/posts", params)
+
+    def posts_keyset(self, params):
+        return self._pagination("/posts/keyset", params)
+
+    def authenticate(self):
+        self.headers["Authorization"] = \
+            _authenticate_impl(self.extractor, self.username, self.password)
+
+    def _call(self, endpoint, params=None):
+        url = "https://capi-v2.sankakucomplex.com" + endpoint
+        for _ in range(5):
+            self.authenticate()
+            response = self.extractor.request(
+                url, params=params, headers=self.headers, fatal=False)
+
+            if response.status_code == 429:
+                self.extractor.wait(
+                    until=response.headers.get("X-RateLimit-Reset"))
+                continue
+
+            data = response.json()
+            try:
+                success = data.get("success", True)
+            except AttributeError:
+                success = True
+            if not success:
+                code = data.get("code")
+                if code == "invalid_token":
+                    _authenticate_impl.invalidate(self.username)
+                    continue
+                raise exception.StopExtraction(code)
+            return data
+
+    def _pagination(self, endpoint, params):
+        params["lang"] = "en"
+        params["limit"] = str(self.extractor.per_page)
+
+        while True:
+            data = self._call(endpoint, params)
+            yield from data["data"]
+
+            params["next"] = data["meta"]["next"]
+            if not params["next"]:
+                return
+
+
+@cache(maxage=365*24*3600, keyarg=1)
+def _authenticate_impl(extr, username, password):
+    extr.log.info("Logging in as %s", username)
+    headers = {"Accept": "application/vnd.sankaku.api+json;v=2"}
+
+    # get initial access_token
+    url = "https://login.sankakucomplex.com/auth/token"
+    data = {"login": username, "password": password}
+    response = extr.request(
+        url, method="POST", headers=headers, json=data, fatal=False)
+    data = response.json()
+
+    if response.status_code >= 400 or not data.get("success"):
+        raise exception.AuthenticationError(data.get("error"))
+    access_token = data["access_token"]
+
+    # start openid auth
+    url = "https://login.sankakucomplex.com/oidc/auth"
+    params = {
+        "response_type": "code",
+        "scope"        : "openid",
+        "client_id"    : "sankaku-web-app",
+        "redirect_uri" : "https://sankaku.app/sso/callback",
+        "state"        : "return_uri=https://sankaku.app/",
+        "theme"        : "black",
+        "lang"         : "undefined",
+    }
+    page = extr.request(url, params=params).text
+    submit_url = text.extract(page, 'submitUrl = "', '"')[0]
+
+    # get code from initial access_token
+    url = "https://login.sankakucomplex.com" + submit_url
+    data = {
+        "accessToken": access_token,
+        "nonce"      : "undefined",
+    }
+    response = extr.request(url, method="POST", data=data)
+    query = text.parse_query(response.request.url.partition("?")[2])
+
+    # get final access_token from code
+    url = "https://capi-v2.sankakucomplex.com/sso/finalize?lang=en"
+    data = {
+        "code"        : query["code"],
+        "client_id"   : "sankaku-web-app",
+        "redirect_uri": "https://sankaku.app/sso/callback",
+    }
+    response = extr.request(
+        url, method="POST", headers=headers, json=data, fatal=False)
+    data = response.json()
+
+    if response.status_code >= 400 or not data.get("success"):
+        raise exception.AuthenticationError(data.get("error"))
+    return "Bearer " + data["access_token"]
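With login reimplemented through `SankakuAPI` above, sankaku accepts the standard per-extractor credentials (the README hunks earlier add it to the list of sites supporting username & password). A minimal configuration sketch (values are placeholders):

```json
{
    "extractor": {
        "sankaku": {
            "username": "your-username",
            "password": "your-password"
        }
    }
}
```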
diff --git a/gallery_dl/extractor/seiga.py b/gallery_dl/extractor/seiga.py
index b32a170..7f9130d 100644
--- a/gallery_dl/extractor/seiga.py
+++ b/gallery_dl/extractor/seiga.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2016-2019 Mike Fährmann
+# Copyright 2016-2020 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -54,6 +54,10 @@ class SeigaExtractor(Extractor):
 
     @cache(maxage=7*24*3600, keyarg=1)
     def _login_impl(self, username, password):
+        if not username or not password:
+            raise exception.AuthenticationError(
+                "Username and password required")
+
         self.log.info("Logging in as %s", username)
         url = "https://account.nicovideo.jp/api/v1/login"
         data = {"mail_tel": username, "password": password}
diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py
index a3dc6a0..5d3ca89 100644
--- a/gallery_dl/extractor/webtoons.py
+++ b/gallery_dl/extractor/webtoons.py
@@ -47,7 +47,7 @@ class WebtoonsEpisodeExtractor(WebtoonsExtractor):
         (("https://www.webtoons.com/en/comedy/safely-endangered"
           "/ep-572-earth/viewer?title_no=352&episode_no=572"), {
             "url": "11041d71a3f92728305c11a228e77cf0f7aa02ef",
-            "content": "4f7701a750368e377d65900e6e8f64a5f9cb9c86",
+            "content": "1ce950324f14018b691c42b0ede57fa25618abeb",
             "count": 5,
         }),
     )
diff --git a/gallery_dl/extractor/wikiart.py b/gallery_dl/extractor/wikiart.py
index 4efc92c..9238590 100644
--- a/gallery_dl/extractor/wikiart.py
+++ b/gallery_dl/extractor/wikiart.py
@@ -116,7 +116,7 @@ class WikiartArtistsExtractor(WikiartExtractor):
     pattern = (BASE_PATTERN + r"/artists-by-([\w-]+)/([\w-]+)")
     test = ("https://www.wikiart.org/en/artists-by-century/12", {
         "pattern": WikiartArtistExtractor.pattern,
-        "count": 7,
+        "count": ">= 8",
     })
 
     def __init__(self, match):
diff --git a/gallery_dl/postprocessor/exec.py b/gallery_dl/postprocessor/exec.py
index 205f42e..5a54a77 100644
--- a/gallery_dl/postprocessor/exec.py
+++ b/gallery_dl/postprocessor/exec.py
@@ -30,8 +30,6 @@ class ExecPP(PostProcessor):
 
         args = options["command"]
         if isinstance(args, str):
-            if "{}" not in args:
-                args += " {}"
             self.args = args
             execute = self.exec_string
         else:
@@ -77,12 +75,12 @@ class ExecPP(PostProcessor):
         self.log.debug("Running '%s'", args)
         retcode = subprocess.Popen(args, shell=shell).wait()
         if retcode:
-            self.log.warning(
-                "Executing '%s' returned with non-zero exit status (%d)",
-                " ".join(args) if isinstance(args, list) else args, retcode)
+            self.log.warning("'%s' returned with non-zero exit status (%d)",
+                             args, retcode)
 
-    def _exec_async(self, args):
-        subprocess.Popen(args, shell=self.shell)
+    def _exec_async(self, args, shell):
+        self.log.debug("Running '%s'", args)
+        subprocess.Popen(args, shell=shell)
 
 
 __postprocessor__ = ExecPP
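Since the exec hunk above stops appending a missing `{}` to string commands (#1185), a command that needs the downloaded file's path must now spell it out. An illustrative postprocessor configuration (the command itself is only an example):

```json
{
    "extractor": {
        "postprocessors": [
            {
                "name": "exec",
                "command": "touch {}.done"
            }
        ]
    }
}
```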
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index 4c0d17b..d91d29a 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -956,7 +956,7 @@ class PathFormat():
 
 class DownloadArchive():
 
     def __init__(self, path, extractor):
-        con = sqlite3.connect(path)
+        con = sqlite3.connect(path, timeout=60, check_same_thread=False)
         con.isolation_level = None
         self.close = con.close
         self.cursor = con.cursor()
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 0b01ad2..21541be 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-__version__ = "1.16.0"
+__version__ = "1.16.1"
diff --git a/test/test_results.py b/test/test_results.py
index 4e9f4b2..f7356d5 100644
--- a/test/test_results.py
+++ b/test/test_results.py
@@ -30,7 +30,6 @@ TRAVIS_SKIP = {
 
 # temporary issues, etc.
 BROKEN = {
-    "dokireader",
     "imagevenue",
     "photobucket",
 }
@@ -312,6 +311,7 @@ def setup_test_config():
     config.set(("extractor", "nijie")     , "username", email)
     config.set(("extractor", "seiga")     , "username", email)
     config.set(("extractor", "pinterest") , "username", email2)
+    config.set(("extractor", "pinterest") , "username", None)  # login broken
     config.set(("extractor", "newgrounds"), "username", "d1618111")
     config.set(("extractor", "newgrounds"), "password", "d1618111")