diff options
author | Unit 193 <unit193@unit193.net> | 2020-08-15 17:48:17 -0400 |
---|---|---|
committer | Unit 193 <unit193@unit193.net> | 2020-08-15 17:48:17 -0400 |
commit | a9dc717bd3bed88f1c9a9845f20c7d0c6df3b091 (patch) | |
tree | 9ba1bc366235a2fda8352a87b4a9ec3c3d5c8044 | |
parent | 366cfd0b079036401bfba8ad21fd217310a1230f (diff) | |
parent | 7cf59dc17c3607e096292462ed15d391be4e3dfd (diff) | |
download | gallery-dl-a9dc717bd3bed88f1c9a9845f20c7d0c6df3b091.tar.bz2 gallery-dl-a9dc717bd3bed88f1c9a9845f20c7d0c6df3b091.tar.xz gallery-dl-a9dc717bd3bed88f1c9a9845f20c7d0c6df3b091.tar.zst |
Update upstream source from tag 'upstream/1.14.4'
Update to upstream version '1.14.4'
with Debian dir 6efd1a975e5a197bdad2999f9c9ecbfdd15ab9d1
39 files changed, 668 insertions, 345 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index ac09ee7..fa9f17c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,32 @@ # Changelog +## 1.14.4 - 2020-08-15 +### Additions +- [blogger] add `search` extractor ([#925](https://github.com/mikf/gallery-dl/issues/925)) +- [blogger] support searching posts by labels ([#925](https://github.com/mikf/gallery-dl/issues/925)) +- [inkbunny] add `user` and `post` extractors ([#283](https://github.com/mikf/gallery-dl/issues/283)) +- [instagram] support `/reel/` URLs +- [pinterest] support `pinterest.co.uk` URLs ([#914](https://github.com/mikf/gallery-dl/issues/914)) +- [reddit] support gallery posts ([#920](https://github.com/mikf/gallery-dl/issues/920)) +- [subscribestar] extract attached media files ([#852](https://github.com/mikf/gallery-dl/issues/852)) +### Fixes +- [blogger] improve error messages for missing posts/blogs ([#903](https://github.com/mikf/gallery-dl/issues/903)) +- [exhentai] adjust image limit costs ([#940](https://github.com/mikf/gallery-dl/issues/940)) +- [gfycat] skip malformed gfycat responses ([#902](https://github.com/mikf/gallery-dl/issues/902)) +- [imgur] handle 403 overcapacity responses ([#910](https://github.com/mikf/gallery-dl/issues/910)) +- [instagram] wait before GraphQL requests ([#901](https://github.com/mikf/gallery-dl/issues/901)) +- [mangareader] fix extraction +- [mangoxo] fix login +- [pixnet] detect password-protected albums ([#177](https://github.com/mikf/gallery-dl/issues/177)) +- [simplyhentai] fix `gallery_id` extraction +- [subscribestar] update `date` parsing +- [vsco] handle missing `description` fields +- [xhamster] fix extraction ([#917](https://github.com/mikf/gallery-dl/issues/917)) +- allow `parent-directory` to work recursively ([#905](https://github.com/mikf/gallery-dl/issues/905)) +- skip external OAuth tests ([#908](https://github.com/mikf/gallery-dl/issues/908)) +### Removals +- [bobx] remove module + ## 1.14.3 - 2020-07-18 ### Additions - [8muses] support `comics.8muses.com` URLs @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery_dl -Version: 1.14.3 +Version: 1.14.4 Summary: Command-line program to download image-galleries and -collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Author: Mike Fährmann @@ -94,8 +94,8 @@ Description: ========== put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__, and run it inside a command prompt (like ``cmd.exe``). - - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.14.3/gallery-dl.exe>`__ - - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.14.3/gallery-dl.bin>`__ + - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.14.4/gallery-dl.exe>`__ + - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.14.4/gallery-dl.bin>`__ These executables include a Python 3.8 interpreter and all required Python packages. @@ -165,7 +165,7 @@ Description: ========== $ gallery-dl "r:https://pastebin.com/raw/FLwrCYsT" - If a site's address is nonstandard for its extractor, you can prefix the URL with the + If a site's address is nonstandard for its extractor, you can prefix the URL with the extractor's name to force the use of a specific extractor: .. code:: bash @@ -216,8 +216,9 @@ Description: ========== a username & password pair. This is necessary for ``pixiv``, ``nijie``, and ``seiga`` and optional for - ``danbooru``, ``e621``, ``exhentai``, ``idolcomplex``, ``instagram``, - ``luscious``, ``sankaku``, ``subscribestar``, ``tsumino``, and ``twitter``. + ``danbooru``, ``e621``, ``exhentai``, ``idolcomplex``, ``inkbunny``, + ``instagram``, ``luscious``, ``sankaku``, ``subscribestar``, ``tsumino``, + and ``twitter``. You can set the necessary information in your configuration file (cf. gallery-dl.conf_) @@ -310,7 +311,7 @@ Description: ========== .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst - .. _stable: https://github.com/mikf/gallery-dl/archive/v1.14.3.tar.gz + .. _stable: https://github.com/mikf/gallery-dl/archive/v1.14.4.tar.gz .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz .. _Python: https://www.python.org/downloads/ @@ -83,8 +83,8 @@ Download a standalone executable file, put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__, and run it inside a command prompt (like ``cmd.exe``). -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.14.3/gallery-dl.exe>`__ -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.14.3/gallery-dl.bin>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.14.4/gallery-dl.exe>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.14.4/gallery-dl.bin>`__ These executables include a Python 3.8 interpreter and all required Python packages. @@ -154,7 +154,7 @@ Filter manga chapters by language and chapter number: $ gallery-dl "r:https://pastebin.com/raw/FLwrCYsT" -If a site's address is nonstandard for its extractor, you can prefix the URL with the +If a site's address is nonstandard for its extractor, you can prefix the URL with the extractor's name to force the use of a specific extractor: .. code:: bash @@ -205,8 +205,9 @@ Some extractors require you to provide valid login credentials in the form of a username & password pair. This is necessary for ``pixiv``, ``nijie``, and ``seiga`` and optional for -``danbooru``, ``e621``, ``exhentai``, ``idolcomplex``, ``instagram``, -``luscious``, ``sankaku``, ``subscribestar``, ``tsumino``, and ``twitter``. +``danbooru``, ``e621``, ``exhentai``, ``idolcomplex``, ``inkbunny``, +``instagram``, ``luscious``, ``sankaku``, ``subscribestar``, ``tsumino``, +and ``twitter``. You can set the necessary information in your configuration file (cf. gallery-dl.conf_) @@ -299,7 +300,7 @@ access to *gallery-dl*. Authorize it and you will be shown one or more .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst -.. _stable: https://github.com/mikf/gallery-dl/archive/v1.14.3.tar.gz +.. _stable: https://github.com/mikf/gallery-dl/archive/v1.14.4.tar.gz .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz .. _Python: https://www.python.org/downloads/ diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1 index f05f2e8..e554159 100644 --- a/data/man/gallery-dl.1 +++ b/data/man/gallery-dl.1 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL" "1" "2020-07-18" "1.14.3" "gallery-dl Manual" +.TH "GALLERY-DL" "1" "2020-08-15" "1.14.4" "gallery-dl Manual" .\" disable hyphenation .nh diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5 index 039e750..67e51d4 100644 --- a/data/man/gallery-dl.conf.5 +++ b/data/man/gallery-dl.conf.5 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL.CONF" "5" "2020-07-18" "1.14.3" "gallery-dl Manual" +.TH "GALLERY-DL.CONF" "5" "2020-08-15" "1.14.4" "gallery-dl Manual" .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) @@ -111,7 +111,7 @@ subcategory image Note: Even if the value of the \f[I]extension\f[] key is missing or -\f[I]None\f[], it will filled in later when the file download is +\f[I]None\f[], it will be filled in later when the file download is starting. This key is therefore always available to provide a valid filename extension. @@ -284,6 +284,8 @@ and optional for .br * \f[I]idolcomplex\f[] .br +* \f[I]inkbunny\f[] +.br * \f[I]instagram\f[] .br * \f[I]luscious\f[] @@ -1006,6 +1008,19 @@ Controls whether to choose the GIF or MP4 version of an animation. .br * \f[I]"always"\f[]: Always choose MP4. +.SS extractor.inkbunny.orderby +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]"create_datetime"\f[] + +.IP "Description:" 4 +Value of the \f[I]orderby\f[] parameter for submission searches. + +(See \f[I]API#Search <https://wiki.inkbunny.net/wiki/API#Search>\f[] +for details) + .SS extractor.instagram.highlights .IP "Type:" 6 \f[I]bool\f[] diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO index 0007699..8f6f112 100644 --- a/gallery_dl.egg-info/PKG-INFO +++ b/gallery_dl.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery-dl -Version: 1.14.3 +Version: 1.14.4 Summary: Command-line program to download image-galleries and -collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Author: Mike Fährmann @@ -94,8 +94,8 @@ Description: ========== put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__, and run it inside a command prompt (like ``cmd.exe``). - - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.14.3/gallery-dl.exe>`__ - - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.14.3/gallery-dl.bin>`__ + - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.14.4/gallery-dl.exe>`__ + - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.14.4/gallery-dl.bin>`__ These executables include a Python 3.8 interpreter and all required Python packages. @@ -165,7 +165,7 @@ Description: ========== $ gallery-dl "r:https://pastebin.com/raw/FLwrCYsT" - If a site's address is nonstandard for its extractor, you can prefix the URL with the + If a site's address is nonstandard for its extractor, you can prefix the URL with the extractor's name to force the use of a specific extractor: .. code:: bash @@ -216,8 +216,9 @@ Description: ========== a username & password pair. This is necessary for ``pixiv``, ``nijie``, and ``seiga`` and optional for - ``danbooru``, ``e621``, ``exhentai``, ``idolcomplex``, ``instagram``, - ``luscious``, ``sankaku``, ``subscribestar``, ``tsumino``, and ``twitter``. + ``danbooru``, ``e621``, ``exhentai``, ``idolcomplex``, ``inkbunny``, + ``instagram``, ``luscious``, ``sankaku``, ``subscribestar``, ``tsumino``, + and ``twitter``. You can set the necessary information in your configuration file (cf. gallery-dl.conf_) @@ -310,7 +311,7 @@ Description: ========== .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst - .. _stable: https://github.com/mikf/gallery-dl/archive/v1.14.3.tar.gz + .. _stable: https://github.com/mikf/gallery-dl/archive/v1.14.4.tar.gz .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz .. _Python: https://www.python.org/downloads/ diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt index 80c9f4f..56c9245 100644 --- a/gallery_dl.egg-info/SOURCES.txt +++ b/gallery_dl.egg-info/SOURCES.txt @@ -49,7 +49,6 @@ gallery_dl/extractor/aryion.py gallery_dl/extractor/bcy.py gallery_dl/extractor/behance.py gallery_dl/extractor/blogger.py -gallery_dl/extractor/bobx.py gallery_dl/extractor/booru.py gallery_dl/extractor/common.py gallery_dl/extractor/danbooru.py @@ -86,6 +85,7 @@ gallery_dl/extractor/imgbb.py gallery_dl/extractor/imgbox.py gallery_dl/extractor/imgth.py gallery_dl/extractor/imgur.py +gallery_dl/extractor/inkbunny.py gallery_dl/extractor/instagram.py gallery_dl/extractor/issuu.py gallery_dl/extractor/kabeuchi.py diff --git a/gallery_dl/exception.py b/gallery_dl/exception.py index 783e2b2..f553d41 100644 --- a/gallery_dl/exception.py +++ b/gallery_dl/exception.py @@ -51,6 +51,11 @@ class HttpError(ExtractionError): default = "HTTP request failed" code = 4 + def __init__(self, message, response=None): + ExtractionError.__init__(self, message) + self.response = response + self.status = response.status_code if response else 0 + class NotFoundError(ExtractionError): """Requested resource (gallery/image) could not be found""" diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 3184663..6f8867c 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -23,7 +23,6 @@ modules = [ "bcy", "behance", "blogger", - "bobx", "danbooru", "deviantart", "dynastyscans", @@ -54,6 +53,7 @@ modules = [ "imgbox", "imgth", "imgur", + "inkbunny", "instagram", "issuu", "kabeuchi", diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py index 331cfc2..9c18e0e 100644 --- a/gallery_dl/extractor/blogger.py +++ b/gallery_dl/extractor/blogger.py @@ -147,7 +147,7 @@ class BloggerPostExtractor(BloggerExtractor): class BloggerBlogExtractor(BloggerExtractor): """Extractor for an entire Blogger blog""" subcategory = "blog" - pattern = BASE_PATTERN + "/?$" + pattern = BASE_PATTERN + r"/?$" test = ( ("https://julianbphotography.blogspot.com/", { "range": "1-25", @@ -164,6 +164,34 @@ class BloggerBlogExtractor(BloggerExtractor): return self.api.blog_posts(blog["id"]) +class BloggerSearchExtractor(BloggerExtractor): + """Extractor for search resuls and labels""" + subcategory = "search" + pattern = BASE_PATTERN + r"/search(?:/?\?q=([^/?&#]+)|/label/([^/?&#]+))" + test = ( + ("https://julianbphotography.blogspot.com/search?q=400mm", { + "count": "< 10" + }), + ("https://dmmagazine.blogspot.com/search/label/D%26D", { + "range": "1-25", + "count": 25, + }), + ) + + def __init__(self, match): + BloggerExtractor.__init__(self, match) + query = match.group(3) + if query: + self.query, self.label = query, None + else: + self.query, self.label = None, match.group(4) + + def posts(self, blog): + if self.query: + return self.api.blog_search(blog["id"], text.unquote(self.query)) + return self.api.blog_posts(blog["id"], text.unquote(self.label)) + + class BloggerAPI(): """Minimal interface for the Blogger v3 API @@ -176,19 +204,27 @@ class BloggerAPI(): self.api_key = extractor.config("api-key", self.API_KEY) def blog_by_url(self, url): - return self._call("blogs/byurl", {"url": url}) + return self._call("blogs/byurl", {"url": url}, "blog") + + def blog_posts(self, blog_id, label=None): + endpoint = "blogs/{}/posts".format(blog_id) + params = {"labels": label} + return self._pagination(endpoint, params) - def blog_posts(self, blog_id): - return self._pagination("blogs/{}/posts".format(blog_id), {}) + def blog_search(self, blog_id, query): + endpoint = "blogs/{}/posts/search".format(blog_id) + params = {"q": query} + return self._pagination(endpoint, params) def post_by_path(self, blog_id, path): endpoint = "blogs/{}/posts/bypath".format(blog_id) - return self._call(endpoint, {"path": path}) + return self._call(endpoint, {"path": path}, "post") - def _call(self, endpoint, params): + def _call(self, endpoint, params, notfound=None): url = "https://www.googleapis.com/blogger/v3/" + endpoint params["key"] = self.api_key - return self.extractor.request(url, params=params).json() + return self.extractor.request( + url, params=params, notfound=notfound).json() def _pagination(self, endpoint, params): while True: diff --git a/gallery_dl/extractor/bobx.py b/gallery_dl/extractor/bobx.py deleted file mode 100644 index 94a2840..0000000 --- a/gallery_dl/extractor/bobx.py +++ /dev/null @@ -1,135 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2018-2019 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extract images from http://www.bobx.com/dark/""" - -from .common import Extractor, Message -from .. import text -from ..cache import memcache -import random -import time - - -class BobxExtractor(Extractor): - """Base class for bobx extractors""" - category = "bobx" - root = "http://www.bobx.com" - cookiedomain = ".bobx.com" - per_page = 80 - - def __init__(self, match): - Extractor.__init__(self, match) - self.path = match.group(1) - - def login(self): - if not self._check_cookies(("BobXUser",)): - self._update_cookies(self._login_impl()) - - @memcache() - def _login_impl(self): - """Generate a randomized 'BobXUser' cookie""" - rand = random.randrange - tnow = time.time() - rand(60, 3600) - - return {"BobXUser": "{}.{}.{}.{}.{}.{}".format( - int(tnow), - rand(128, 192), rand(0, 256), rand(0, 256), rand(0, 256), - tnow + 622080000, # timestamp in 7200 days - )} - - -class BobxGalleryExtractor(BobxExtractor): - """Extractor for individual image galleries on bobx.com""" - subcategory = "gallery" - directory_fmt = ("{category}", "{model}", "{title}") - filename_fmt = "{model}_{image_id}_{num:>03}.{extension}" - archive_fmt = "{image_id}" - pattern = (r"(?:https?://)?(?:www\.)?bobx\.com" - r"/([^/]+/[^/]+/photoset/[\w-]+)-\d+-\d+-\d+\.html") - test = ( - (("http://www.bobx.com/idol/mikoto-hibi" - "/photoset/wpb-2018-_11-0-2-8.html"), { - "url": "93972d6a661f6627e963d62c9d15531e6b36a389", - "keyword": "6c620862db494ed05e69356ba30e604b167b0670", - "content": "3f176b7fe752524cec21a763aa55567e41181e07", - }), - (("http://www.bobx.com/idol/nashiko-momotsuki" - "/photoset/wpb-net-_221---2018-08---magic-of-summer-0-10-10.html"), { - "url": "f5d6c0cd0881ae6f504c21a90d86e3464dc54e8e", - "keyword": "f4819c75f494044348889ecd27771508464c0f5f", - }), - ) - - def items(self): - self.login() - - num = 0 - while True: - url = "{}/{}-{}-10-8.html".format(self.root, self.path, num) - page = self.request(url, encoding="utf-8").text - - if num == 0: - data = self.metadata(page) - yield Message.Version, 1 - yield Message.Directory, data - data["num"] = 0 - - for url in self.images(page): - url = text.urljoin(self.root, url.replace("-preview-", "-")) - data = text.nameext_from_url(url, data) - data["image_id"] = text.parse_int( - data["filename"].rpartition("-")[2]) - data["num"] += 1 - yield Message.Url, url, data - - num += self.per_page - if num >= data["count"]: - return - - @staticmethod - def metadata(page): - """Collect metadata for extractor-job""" - info = text.extract(page, "<title>", "</title>")[0] - model, _, info = info.partition(" in ") - info, _, count = info.rpartition(" of ") - title = info.rpartition(" - @")[0] - return { - "title": text.unquote(title), - "model": text.unquote(model), - "count": text.parse_int(count), - } - - @staticmethod - def images(page): - """Extract all image-urls""" - page = text.extract(page, "<table CELLPADDING=", "<script ")[0] - return text.extract_iter(page, '<img src="/thumbnail', '"') - - -class BobxIdolExtractor(BobxExtractor): - """Extractor for an idol's image galleries on bobx.com""" - subcategory = "idol" - pattern = r"(?:https?://)?(?:www\.)?bobx\.com/([^/]+/[^/?&#]+)/?$" - test = ("http://www.bobx.com/idol/rin-okabe/", { - "pattern": BobxGalleryExtractor.pattern, - "count": ">= 6", - }) - - def items(self): - self.login() - url = "{}/{}/".format(self.root, self.path) - data = {"_extractor": BobxGalleryExtractor} - page = self.request(url).text - skip = True - - yield Message.Version, 1 - for part in text.extract_iter(page, '="photoset/', '"'): - # skip every other entry - skip = not skip - if not skip: - yield Message.Queue, "{}photoset/{}".format(url, part), data diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index bbbd8a6..e6c0968 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -79,6 +79,7 @@ class Extractor(): session = self.session if session is None else session kwargs.setdefault("timeout", self._timeout) kwargs.setdefault("verify", self._verify) + response = None while True: try: @@ -125,7 +126,7 @@ class Extractor(): time.sleep(min(2 ** (tries-1), 1800)) tries += 1 - raise exception.HttpError(msg) + raise exception.HttpError(msg, response) def wait(self, *, seconds=None, until=None, adjust=1.0, reason="rate limit reset"): diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index bf310ec..4cb10b4 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2019 Mike Fährmann +# Copyright 2014-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -123,7 +123,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): r"|/s/([\da-f]{10})/(\d+)-(\d+))") test = ( ("https://exhentai.org/g/1200119/d55c44d3d0/", { - "keyword": "3eeae7bde70dd992402d4cc0230ea0f2c4af46c5", + "keyword": "199db053b4ccab94463b459e1cfe079df8cdcdd1", "content": "e9891a4c017ed0bb734cd1efba5cd03f594d31ff", }), ("https://exhentai.org/g/960461/4f0e369d82/", { @@ -353,7 +353,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): "height": text.parse_int(parts[2]), "size": size, # 1 initial point + 1 per 0.1 MB - "cost": 1 + math.ceil(size / 104857.6) + "cost": 1 + math.ceil(size / 100000) } diff --git a/gallery_dl/extractor/gfycat.py b/gallery_dl/extractor/gfycat.py index b4b0e49..ac1bca3 100644 --- a/gallery_dl/extractor/gfycat.py +++ b/gallery_dl/extractor/gfycat.py @@ -28,6 +28,9 @@ class GfycatExtractor(Extractor): def items(self): metadata = self.metadata() for gfycat in self.gfycats(): + if "gfyName" not in gfycat: + self.log.warning("Skipping '%s' (malformed)", gfycat["gfyId"]) + continue url = self._select_format(gfycat) gfycat.update(metadata) yield Message.Directory, gfycat @@ -118,6 +121,10 @@ class GfycatImageExtractor(GfycatExtractor): ("https://www.gfycat.com/foolishforkedabyssiniancat", { "pattern": "https://redgifs.com/watch/foolishforkedabyssiniancat", }), + # malformed API response (#902) + ("https://gfycat.com/illexcitablehairstreak", { + "count": 0, + }), ("https://gfycat.com/gifs/detail/UnequaledHastyAnkole"), ("https://gfycat.com/ifr/UnequaledHastyAnkole"), ("https://gfycat.com/ru/UnequaledHastyAnkole"), @@ -132,6 +139,9 @@ class GfycatImageExtractor(GfycatExtractor): data = {"_extractor": RedgifsImageExtractor} yield Message.Queue, url, data else: + if "gfyName" not in gfycat: + self.log.warning("Skipping '%s' (malformed)", gfycat["gfyId"]) + return url = self._select_format(gfycat) yield Message.Directory, gfycat yield Message.Url, url, gfycat diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py index aa41836..49c1a98 100644 --- a/gallery_dl/extractor/hentainexus.py +++ b/gallery_dl/extractor/hentainexus.py @@ -36,21 +36,17 @@ class HentainexusGalleryExtractor(GalleryExtractor): rmve = text.remove_html extr = text.extract_from(page) data = { - "gallery_id" : text.parse_int(self.gallery_id), - "tags" : extr('"og:description" content="', '"').split(", "), - "thumbnail" : extr('"og:image" content="', '"'), - "title" : extr('<h1 class="title">', '</h1>'), - "artist" : rmve(extr('viewcolumn">Artist</td>' , '</td>')), - "book" : rmve(extr('viewcolumn">Book</td>' , '</td>')), - "circle" : rmve(extr('viewcolumn">Circle</td>' , '</td>')), - "event" : rmve(extr('viewcolumn">Event</td>' , '</td>')), - "language" : rmve(extr('viewcolumn">Language</td>' , '</td>')), - "magazine" : rmve(extr('viewcolumn">Magazine</td>' , '</td>')), - "parody" : rmve(extr('viewcolumn">Parody</td>' , '</td>')), - "publisher" : rmve(extr('viewcolumn">Publisher</td>' , '</td>')), - "description": rmve(extr('viewcolumn">Description</td>', '</td>')), + "gallery_id": text.parse_int(self.gallery_id), + "tags" : extr('"og:description" content="', '"').split(", "), + "thumbnail" : extr('"og:image" content="', '"'), + "title" : extr('<h1 class="title">', '</h1>'), } + for key in ("Artist", "Book", "Circle", "Event", "Language", + "Magazine", "Parody", "Publisher", "Description"): + data[key.lower()] = rmve(extr( + 'viewcolumn">' + key + '</td>', '</td>')) data["lang"] = util.language_to_code(data["language"]) + if 'doujin' in data['tags']: data['type'] = 'Doujinshi' elif 'illustration' in data['tags']: @@ -60,10 +56,10 @@ class HentainexusGalleryExtractor(GalleryExtractor): data["title_conventional"] = self._join_title(data) return data - def images(self, page): + def images(self, _): url = "{}/read/{}".format(self.root, self.gallery_id) - extr = text.extract_from(self.request(url).text) - urls = extr("initReader(", "]") + "]" + page = self.request(url).text + urls = text.extract(page, "initReader(", "]")[0] + "]" return [(url, None) for url in json.loads(urls)] @staticmethod @@ -120,14 +116,13 @@ class HentainexusSearchExtractor(Extractor): self.params = text.parse_query(match.group(1)) def items(self): - yield Message.Version, 1 params = self.params path = "/" + data = {"_extractor": HentainexusGalleryExtractor} while path: page = self.request(self.root + path, params=params).text extr = text.extract_from(page) - data = {"_extractor": HentainexusGalleryExtractor} while True: gallery_id = extr('<a href="/view/', '"') diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py index 3882a92..2a69fb1 100644 --- a/gallery_dl/extractor/imgbb.py +++ b/gallery_dl/extractor/imgbb.py @@ -132,14 +132,10 @@ class ImgbbAlbumExtractor(ImgbbExtractor): "url": "ac0abcfcb89f4df6adc2f7e4ff872f3b03ef1bc7", "keyword": {"user": ""}, }), - # deleted - ("https://ibb.co/album/fDArrF", { - "exception": exception.NotFoundError, - }), # private ("https://ibb.co/album/hqgWrF", { "exception": exception.HttpError, - }) + }), ) def __init__(self, match): diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index a617975..25328ab 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -332,9 +332,15 @@ class ImgurAPI(): return self._call("image/" + image_hash) def _call(self, endpoint): - return self.extractor.request( - "https://api.imgur.com/3/" + endpoint, headers=self.headers, - ).json()["data"] + try: + return self.extractor.request( + "https://api.imgur.com/3/" + endpoint, headers=self.headers, + ).json()["data"] + except exception.HttpError as exc: + if exc.status != 403 or b"capacity" not in exc.response.content: + raise + self.extractor.sleep(seconds=600) + return self._call(endpoint) def _pagination(self, endpoint): num = 0 diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py new file mode 100644 index 0000000..ff8318c --- /dev/null +++ b/gallery_dl/extractor/inkbunny.py @@ -0,0 +1,251 @@ +# -*- coding: utf-8 -*- + +# Copyright 2020 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://inkbunny.net/""" + +from .common import Extractor, Message +from .. import text, exception +from ..cache import cache + + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?inkbunny\.net" + + +class InkbunnyExtractor(Extractor): + """Base class for inkbunny extractors""" + category = "inkbunny" + directory_fmt = ("{category}", "{username!l}") + filename_fmt = "{submission_id} {file_id} {title}.{extension}" + archive_fmt = "{file_id}" + root = "https://inkbunny.net" + + def __init__(self, match): + Extractor.__init__(self, match) + self.api = InkbunnyAPI(self) + + def items(self): + self.api.authenticate() + to_bool = ("deleted", "digitalsales", "favorite", "forsale", + "friends_only", "guest_block", "hidden", "printsales", + "public", "scraps") + + for post in self.posts(): + post["date"] = text.parse_datetime( + post["create_datetime"] + "00", "%Y-%m-%d %H:%M:%S.%f%z") + post["tags"] = [kw["keyword_name"] for kw in post["keywords"]] + post["ratings"] = [r["name"] for r in post["ratings"]] + files = post["files"] + + for key in to_bool: + post[key] = (post[key] == "t") + + del post["keywords"] + del post["files"] + + yield Message.Directory, post + for post["num"], file in enumerate(files, 1): + post.update(file) + post["deleted"] = (file["deleted"] == "t") + post["date"] = text.parse_datetime( + file["create_datetime"] + "00", "%Y-%m-%d %H:%M:%S.%f%z") + text.nameext_from_url(file["file_name"], post) + yield Message.Url, file["file_url_full"], post + + +class InkbunnyUserExtractor(InkbunnyExtractor): + """Extractor for inkbunny user profiles""" + subcategory = "user" + pattern = BASE_PATTERN + r"/(?!s/)(gallery/|scraps/)?([^/?&#]+)" + test = ( + ("https://inkbunny.net/soina", { + "pattern": r"https://[\w.]+\.metapix\.net/files/full" + r"/\d+/\d+_soina_.+", + "range": "20-50", + "keyword": { + "date" : "type:datetime", + "deleted" : bool, + "file_id" : "re:[0-9]+", + "filename" : r"re:[0-9]+_soina_\w+", + "full_file_md5": "re:[0-9a-f]{32}", + "mimetype" : str, + "submission_id": "re:[0-9]+", + "user_id" : "20969", + "comments_count" : "re:[0-9]+", + "deleted" : bool, + "digitalsales" : bool, + "favorite" : bool, + "favorites_count": "re:[0-9]+", + "forsale" : bool, + "friends_only" : bool, + "guest_block" : bool, + "hidden" : bool, + "pagecount" : "re:[0-9]+", + "pools" : list, + "pools_count" : int, + "printsales" : bool, + "public" : bool, + "rating_id" : "re:[0-9]+", + "rating_name" : str, + "ratings" : list, + "scraps" : bool, + "tags" : list, + "title" : str, + "type_name" : str, + "username" : "soina", + "views" : str, + }, + }), + ("https://inkbunny.net/gallery/soina", { + "range": "1-25", + "keyword": {"scraps": False}, + }), + ("https://inkbunny.net/scraps/soina", { + "range": "1-25", + "keyword": {"scraps": True}, + }), + ) + + def __init__(self, match): + kind, self.user = match.groups() + if not kind: + self.scraps = None + elif kind[0] == "g": + self.subcategory = "gallery" + self.scraps = "no" + else: + self.subcategory = "scraps" + self.scraps = "only" + InkbunnyExtractor.__init__(self, match) + + def posts(self): + orderby = self.config("orderby") + params = { + "username": self.user, + "scraps" : self.scraps, + "orderby" : orderby, + } + if orderby and orderby.startswith("unread_"): + params["unread_submissions"] = "yes" + return self.api.search(params) + + +class InkbunnyPostExtractor(InkbunnyExtractor): + """Extractor for individual Inkbunny posts""" + subcategory = "post" + pattern = BASE_PATTERN + r"/s/(\d+)" + test = ( + ("https://inkbunny.net/s/1829715", { + "pattern": r"https://[\w.]+\.metapix\.net/files/full" + r"/2626/2626843_soina_dscn2296\.jpg", + "content": "cf69d8dddf0822a12b4eef1f4b2258bd600b36c8", + }), + ("https://inkbunny.net/s/2044094", { + "count": 4, + }), + ) + + def __init__(self, match): + InkbunnyExtractor.__init__(self, match) + self.submission_id = match.group(1) + + def posts(self): + return self.api.detail(({"submission_id": self.submission_id},)) + + +class InkbunnyAPI(): + """Interface for the Inkunny API + + Ref: https://wiki.inkbunny.net/wiki/API + """ + + def __init__(self, extractor): + self.extractor = extractor + self.session_id = None + + def detail(self, submissions): + """Get full details about submissions with the given IDs""" + ids = { + sub["submission_id"]: idx + for idx, sub in enumerate(submissions) + } + params = { + "submission_ids": ",".join(ids), + "show_description": "yes", + } + + submissions = [None] * len(ids) + for sub in self._call("submissions", params)["submissions"]: + submissions[ids[sub["submission_id"]]] = sub + return submissions + + def search(self, params): + """Perform a search""" + return self._pagination_search(params) + + def set_allowed_ratings(self, nudity=True, sexual=True, + violence=True, strong_violence=True): + """Change allowed submission ratings""" + params = { + "tag[2]": "yes" if nudity else "no", + "tag[3]": "yes" if violence else "no", + "tag[4]": "yes" if sexual else "no", + "tag[5]": "yes" if strong_violence else "no", + } + self._call("userrating", params) + + def authenticate(self, invalidate=False): + username, password = self.extractor._get_auth_info() + if invalidate: + _authenticate_impl.invalidate(username or "guest") + if username: + self.session_id = _authenticate_impl(self, username, password) + else: + self.session_id = _authenticate_impl(self, "guest", "") + self.set_allowed_ratings() + + def _call(self, endpoint, params): + url = "https://inkbunny.net/api_" + endpoint + ".php" + params["sid"] = self.session_id + data = self.extractor.request(url, params=params).json() + + if "error_code" in data: + if str(data["error_code"]) == "2": + self.authenticate(invalidate=True) + return self._call(endpoint, params) + raise exception.StopExtraction(data.get("error_message")) + + return data + + def _pagination_search(self, params): + params["page"] = 1 + params["get_rid"] = "yes" + params["submission_ids_only"] = "yes" + + while True: + data = self._call("search", params) + yield from self.detail(data["submissions"]) + + if data["page"] >= data["pages_count"]: + return + if "get_rid" in params: + del params["get_rid"] + params["rid"] = data["rid"] + params["page"] += 1 + + +@cache(maxage=360*24*3600, keyarg=1) +def _authenticate_impl(api, username, password): + api.extractor.log.info("Logging in as %s", username) + + url = "https://inkbunny.net/api_login.php" + data = {"username": username, "password": password} + data = api.extractor.request(url, method="POST", data=data).json() + + if "sid" not in data: + raise exception.AuthenticationError(data.get("error_message")) + return data["sid"] diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index bf6b10f..639f272 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -315,7 +315,7 @@ class InstagramExtractor(Extractor): if not has_next_page: break - + time.sleep(3) end_cursor = medias['page_info']['end_cursor'] variables = '{{"{}":"{}","first":12,"after":"{}"}}'.format( psdf['variables_id'], @@ -342,7 +342,8 @@ class InstagramExtractor(Extractor): class InstagramImageExtractor(InstagramExtractor): """Extractor for PostPage""" subcategory = "image" - pattern = r"(?:https?://)?(?:www\.)?instagram\.com/(?:p|tv)/([^/?&#]+)" + pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" + r"/(?:p|tv|reel)/([^/?&#]+)") test = ( # GraphImage ("https://www.instagram.com/p/BqvsDleB3lV/", { @@ -440,6 +441,8 @@ class InstagramImageExtractor(InstagramExtractor): }] } }), + + ("https://www.instagram.com/reel/CDg_6Y1pxWu/"), ) def __init__(self, match): @@ -500,7 +503,7 @@ class InstagramUserExtractor(InstagramExtractor): """Extractor for ProfilePage""" subcategory = "user" pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" - r"/(?!p/|explore/|directory/|accounts/|stories/|tv/)" + r"/(?!(?:p|explore|directory|accounts|stories|tv|reel)/)" r"([^/?&#]+)/?(?:$|[?#])") test = ( ("https://www.instagram.com/instagram/", { @@ -530,7 +533,7 @@ class InstagramUserExtractor(InstagramExtractor): 'node_id': 'id', 'variables_id': 'id', 'edge_to_medias': 'edge_owner_to_timeline_media', - 'query_hash': '44efc15d3c13342d02df0b5a9fa3d33f', + 'query_hash': '15bf78a4ad24e33cbd838fdb31353ac1', }) if self.config('highlights'): @@ -599,5 +602,5 @@ class InstagramTagExtractor(InstagramExtractor): 'node_id': 'name', 'variables_id': 'tag_name', 'edge_to_medias': 'edge_hashtag_to_media', - 'query_hash': '7dabc71d3e758b1ec19ffb85639e427b', + 'query_hash': 'c769cb6c71b24c8a86590b22402fda50', }) diff --git a/gallery_dl/extractor/mangapanda.py b/gallery_dl/extractor/mangapanda.py index 18ef005..a4b8340 100644 --- a/gallery_dl/extractor/mangapanda.py +++ b/gallery_dl/extractor/mangapanda.py @@ -1,14 +1,15 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2019 Mike Fährmann +# Copyright 2015-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract manga-chapters and entire manga from https://www.mangapanda.com/""" +"""Extractors for https://www.mangapanda.com/""" -from .mangareader import MangareaderMangaExtractor, MangareaderChapterExtractor +from .common import ChapterExtractor, MangaExtractor +from .. import text class MangapandaBase(): @@ -16,21 +17,102 @@ class MangapandaBase(): category = "mangapanda" root = "https://www.mangapanda.com" + @staticmethod + def parse_page(page, data): + """Parse metadata on 'page' and add it to 'data'""" + text.extract_all(page, ( + ("manga" , '<h2 class="aname">', '</h2>'), + ("release", '>Year of Release:</td>\n<td>', '</td>'), + ('author' , '>Author:</td>\n<td>', '</td>'), + ('artist' , '>Artist:</td>\n<td>', '</td>'), + ), values=data) + data["manga"] = data["manga"].strip() + data["author"] = text.unescape(data["author"]) + data["artist"] = text.unescape(data["artist"]) + return data -class MangapandaChapterExtractor(MangapandaBase, MangareaderChapterExtractor): + +class MangapandaChapterExtractor(MangapandaBase, ChapterExtractor): """Extractor for manga-chapters from mangapanda.com""" + archive_fmt = "{manga}_{chapter}_{page}" pattern = r"(?:https?://)?(?:www\.)?mangapanda\.com((/[^/?&#]+)/(\d+))" test = ("https://www.mangapanda.com/red-storm/2", { "url": "1f633f776e950531ba9b1e81965316458e785261", "keyword": "b24df4b9cc36383fb6a44e06d32a3884a4dcb5fb", }) + def __init__(self, match): + path, self.url_title, self.chapter = match.groups() + ChapterExtractor.__init__(self, match, self.root + path) + + def metadata(self, chapter_page): + page = self.request(self.root + self.url_title).text + data = self.parse_page(page, { + "chapter": text.parse_int(self.chapter), + "lang": "en", + "language": "English", + }) + text.extract_all(page, ( + ('title', ' ' + self.chapter + '</a> : ', '</td>'), + ('date', '<td>', '</td>'), + ), page.index('<div id="chapterlist">'), data) + data["count"] = text.parse_int(text.extract( + chapter_page, '</select> of ', '<')[0] + ) + return data + + def images(self, page): + while True: + next_url, image_url, image_data = self.get_image_metadata(page) + yield image_url, image_data + + if not next_url: + return + page = self.request(next_url).text -class MangapandaMangaExtractor(MangapandaBase, MangareaderMangaExtractor): + def get_image_metadata(self, page): + """Collect next url, image-url and metadata for one manga-page""" + extr = text.extract + width = None + test , pos = extr(page, "document['pu']", '') + if test is None: + return None, None, None + if page.find("document['imgwidth']", pos, pos+200) != -1: + width , pos = extr(page, "document['imgwidth'] = ", ";", pos) + height, pos = extr(page, "document['imgheight'] = ", ";", pos) + _ , pos = extr(page, '<div id="imgholder">', '') + url, pos = extr(page, ' href="', '"', pos) + if width is None: + width , pos = extr(page, '<img id="img" width="', '"', pos) + height, pos = extr(page, ' height="', '"', pos) + image, pos = extr(page, ' src="', '"', pos) + return self.root + url, image, { + "width": text.parse_int(width), + "height": text.parse_int(height), + } + + +class MangapandaMangaExtractor(MangapandaBase, MangaExtractor): """Extractor for manga from mangapanda.com""" chapterclass = MangapandaChapterExtractor + reverse = False pattern = r"(?:https?://)?(?:www\.)?mangapanda\.com(/[^/?&#]+)/?$" test = ("https://www.mangapanda.com/mushishi", { "url": "357f965732371cac1990fee8b480f62e29141a42", "keyword": "031b3ea085921c552de017ecbb9b906e462229c9", }) + + def chapters(self, page): + results = [] + data = self.parse_page(page, {"lang": "en", "language": "English"}) + + needle = '<div class="chico_manga"></div>\n<a href="' + pos = page.index('<div id="chapterlist">') + while True: + url, pos = text.extract(page, needle, '"', pos) + if not url: + return results + data["title"], pos = text.extract(page, '</a> : ', '</td>', pos) + data["date"] , pos = text.extract(page, '<td>', '</td>', pos) + data["chapter"] = text.parse_int(url.rpartition("/")[2]) + results.append((self.root + url, data.copy())) diff --git a/gallery_dl/extractor/mangareader.py b/gallery_dl/extractor/mangareader.py index 31083dc..fd9c7ac 100644 --- a/gallery_dl/extractor/mangareader.py +++ b/gallery_dl/extractor/mangareader.py @@ -6,10 +6,12 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract manga-chapters and entire manga from https://www.mangareader.net/""" +"""Extractors for https://www.mangareader.net/""" from .common import ChapterExtractor, MangaExtractor from .. import text +from ..cache import memcache +import json class MangareaderBase(): @@ -17,19 +19,35 @@ class MangareaderBase(): category = "mangareader" root = "https://www.mangareader.net" - @staticmethod - def parse_page(page, data): - """Parse metadata on 'page' and add it to 'data'""" - text.extract_all(page, ( - ("manga" , '<h2 class="aname">', '</h2>'), - ("release", '>Year of Release:</td>\n<td>', '</td>'), - ('author' , '>Author:</td>\n<td>', '</td>'), - ('artist' , '>Artist:</td>\n<td>', '</td>'), - ), values=data) - data["manga"] = data["manga"].strip() - data["author"] = text.unescape(data["author"]) - data["artist"] = text.unescape(data["artist"]) - return data + @memcache(keyarg=1) + def _manga_info(self, path, page=None): + if not page: + page = self.request(self.root + path).text + extr = text.extract_from(page) + data = { + "manga" : text.unescape(extr('class="name">', '<')), + "release" : text.unescape(extr('Year of Release :</td><td>', '<')), + "author" : text.unescape(text.unescape(extr( + 'Author :</td><td>', '<'))), + "artist" : text.unescape(text.unescape(extr( + 'Artist :</td><td>', '<'))), + "lang" : "en", + "language": "English", + } + + extr('<table', '>') + chapters = [] + while True: + url = extr('</i> <a href="', '"') + if not url: + return chapters + chapter = { + "chapter": text.parse_int(url.rpartition("/")[2]), + "title" : text.unescape(extr("</a> : ", "<")), + "date" : extr("<td>", "<"), + } + chapter.update(data) + chapters.append((self.root + url, chapter)) class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor): @@ -38,59 +56,28 @@ class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor): pattern = r"(?:https?://)?(?:www\.)?mangareader\.net((/[^/?&#]+)/(\d+))" test = (("https://www.mangareader.net" "/karate-shoukoushi-kohinata-minoru/11"), { - "url": "3d8a5b900856d59b8d8e83908d0df392be92c0f4", + "url": "45ece5668d1e9f65cf2225237d78de58660b54e4", "keyword": "133e3e2f7c0529a35bbb16149e34c40546f8dfd6", }) def __init__(self, match): - path, self.url_title, self.chapter = match.groups() - ChapterExtractor.__init__(self, match, self.root + path) + ChapterExtractor.__init__(self, match) + _, self.path, self.chapter = match.groups() - def metadata(self, chapter_page): - page = self.request(self.root + self.url_title).text - data = self.parse_page(page, { - "chapter": text.parse_int(self.chapter), - "lang": "en", - "language": "English", - }) - text.extract_all(page, ( - ('title', ' ' + self.chapter + '</a> : ', '</td>'), - ('date', '<td>', '</td>'), - ), page.index('<div id="chapterlist">'), data) - data["count"] = text.parse_int(text.extract( - chapter_page, '</select> of ', '<')[0] - ) - return data + def metadata(self, page): + chapter = text.parse_int(self.chapter) + return self._manga_info(self.path)[chapter-1][1] def images(self, page): - while True: - next_url, image_url, image_data = self.get_image_metadata(page) - yield image_url, image_data - - if not next_url: - return - page = self.request(next_url).text - - def get_image_metadata(self, page): - """Collect next url, image-url and metadata for one manga-page""" - extr = text.extract - width = None - test , pos = extr(page, "document['pu']", '') - if test is None: - return None, None, None - if page.find("document['imgwidth']", pos, pos+200) != -1: - width , pos = extr(page, "document['imgwidth'] = ", ";", pos) - height, pos = extr(page, "document['imgheight'] = ", ";", pos) - _ , pos = extr(page, '<div id="imgholder">', '') - url, pos = extr(page, ' href="', '"', pos) - if width is None: - width , pos = extr(page, '<img id="img" width="', '"', pos) - height, pos = extr(page, ' height="', '"', pos) - image, pos = extr(page, ' src="', '"', pos) - return self.root + url, image, { - "width": text.parse_int(width), - "height": text.parse_int(height), - } + data = json.loads(text.extract( + page, 'document["mj"]=', '</script>')[0]) + return [ + (text.ensure_http_scheme(img["u"]), { + "width" : text.parse_int(img["w"]), + "height": text.parse_int(img["h"]), + }) + for img in data["im"] + ] class MangareaderMangaExtractor(MangareaderBase, MangaExtractor): @@ -104,16 +91,5 @@ class MangareaderMangaExtractor(MangareaderBase, MangaExtractor): }) def chapters(self, page): - results = [] - data = self.parse_page(page, {"lang": "en", "language": "English"}) - - needle = '<div class="chico_manga"></div>\n<a href="' - pos = page.index('<div id="chapterlist">') - while True: - url, pos = text.extract(page, needle, '"', pos) - if not url: - return results - data["title"], pos = text.extract(page, '</a> : ', '</td>', pos) - data["date"] , pos = text.extract(page, '<td>', '</td>', pos) - data["chapter"] = text.parse_int(url.rpartition("/")[2]) - results.append((self.root + url, data.copy())) + path = self.manga_url[len(self.root):] + return self._manga_info(path, page) diff --git a/gallery_dl/extractor/mangoxo.py b/gallery_dl/extractor/mangoxo.py index 25fba70..0e04f97 100644 --- a/gallery_dl/extractor/mangoxo.py +++ b/gallery_dl/extractor/mangoxo.py @@ -58,7 +58,7 @@ class MangoxoExtractor(Extractor): ("timestamp", str(int(time.time()))), ] query = "&".join("=".join(item) for item in sorted(params)) - query += "&secretKey=996293536" + query += "&secretKey=340836904" sign = hashlib.md5(query.encode()).hexdigest() params.append(("sign", sign.upper())) return params diff --git a/gallery_dl/extractor/myportfolio.py b/gallery_dl/extractor/myportfolio.py index 51b314a..e2e163a 100644 --- a/gallery_dl/extractor/myportfolio.py +++ b/gallery_dl/extractor/myportfolio.py @@ -9,7 +9,7 @@ """Extract images from https://www.myportfolio.com/""" from .common import Extractor, Message -from .. import text +from .. import text, exception class MyportfolioGalleryExtractor(Extractor): @@ -31,9 +31,8 @@ class MyportfolioGalleryExtractor(Extractor): "pattern": r"https://andrewling\.myportfolio\.com/[^/?&#+]+$", "count": ">= 6", }), - # no explicit title ("https://stevenilousphotography.myportfolio.com/society", { - "keyword": "49e7ff6322645c22b409280656202c2736a380c9", + "exception": exception.NotFoundError, }), # custom domain ("myportfolio:https://tooco.com.ar/6-of-diamonds-paradise-bird", { @@ -89,8 +88,10 @@ class MyportfolioGalleryExtractor(Extractor): if title: title = title.partition(">")[2] user = user[:-len(title)-3] - else: + elif user: user, _, title = user.partition(" - ") + else: + raise exception.NotFoundError() return { "user": text.unescape(user), diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py index 931fb13..8f2d633 100644 --- a/gallery_dl/extractor/paheal.py +++ b/gallery_dl/extractor/paheal.py @@ -95,8 +95,8 @@ class PahealPostExtractor(PahealExtractor): pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net" r"/post/view/(\d+)") test = ("https://rule34.paheal.net/post/view/481609", { - "url": "1142779378f655ec0497d4c301836aa667f788b1", - "keyword": "34e9e93d4fa6fa06fac1a56e78c9a52e8cd7b271", + "url": "d3fd0f82762716fe3fb03c9c923e61c13ce22204", + "keyword": "35748081bfeaab48f909f4b097a4d79b2be12538", "content": "7b924bcf150b352ac75c9d281d061e174c851a11", }) diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index 3bbe06a..cc89ac5 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -14,7 +14,7 @@ import itertools import json -BASE_PATTERN = r"(?:https?://)?(?:\w+\.)?pinterest\.\w+" +BASE_PATTERN = r"(?:https?://)?(?:\w+\.)?pinterest\.[\w.]+" class PinterestExtractor(Extractor): @@ -101,6 +101,8 @@ class PinterestBoardExtractor(PinterestExtractor): ("https://www.pinterest.com/g1952848/test/", { "exception": exception.GalleryDLException, }), + # .co.uk TLD (#914) + ("https://www.pinterest.co.uk/hextra7519/based-animals/"), ) def __init__(self, match): diff --git a/gallery_dl/extractor/pixnet.py b/gallery_dl/extractor/pixnet.py index 9cada6b..d8ac9f6 100644 --- a/gallery_dl/extractor/pixnet.py +++ b/gallery_dl/extractor/pixnet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2019 Mike Fährmann +# Copyright 2016-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,7 +9,7 @@ """Extractors for https://www.pixnet.net/""" from .common import Extractor, Message -from .. import text +from .. import text, exception BASE_PATTERN = r"(?:https?://)?(?!www\.)([^.]+)\.pixnet.net" @@ -53,6 +53,9 @@ class PixnetExtractor(Extractor): yield from text.extract_iter(page, '<li id="', '</li>') pnext = text.extract(page, 'class="nextBtn"', '>')[0] + if pnext is None and 'name="albumpass">' in page: + raise exception.StopExtraction( + "Album %s is password-protected.", self.item_id) if "href" not in pnext: return url = self.root + text.extract(pnext, 'href="', '"')[0] @@ -107,6 +110,9 @@ class PixnetSetExtractor(PixnetExtractor): "url": "b3eb6431aea0bcf5003432a4a0f3a3232084fc13", "keyword": "bf7004faa1cea18cf9bd856f0955a69be51b1ec6", }), + ("https://sky92100.pixnet.net/album/set/17492544", { + "count": 0, # password-protected + }), ) def items(self): diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py index f97454b..8290d2d 100644 --- a/gallery_dl/extractor/reactor.py +++ b/gallery_dl/extractor/reactor.py @@ -112,6 +112,7 @@ class ReactorExtractor(SharedConfigMixin, Extractor): if not tags: title, tags = tags, title tags = tags.split(" :: ") + tags.sort() for image in images: url = text.extract(image, ' src="', '"')[0] @@ -259,19 +260,19 @@ class JoyreactorPostExtractor(ReactorPostExtractor): test = ( ("http://joyreactor.com/post/3721876", { # single image "url": "6ce09f239d8b7fdf6dd1664c2afc39618cc87663", - "keyword": "966d2acd462732a9ed823a9db5ed19f95734fd10", + "keyword": "147ed5b9799ba43cbd16168450afcfae5ddedbf3", }), ("http://joyreactor.com/post/3713804", { # 4 images "url": "f08ac8493ca0619a3e3c6bedb8d8374af3eec304", - "keyword": "84e34d402342607045a65fab6d4d593d146c238a", + "keyword": "f12c6f3c2f298fed9b12bd3e70fb823870aa9b93", }), ("http://joyreactor.com/post/3726210", { # gif / video "url": "33a48e1eca6cb2d298fbbb6536b3283799d6515b", - "keyword": "dbe148d576f2fc9431020c557ddb78f449e48c47", + "keyword": "d173cc6e88f02a63904e475eacd7050304eb1967", }), ("http://joyreactor.com/post/3668724", { # youtube embed "url": "bf1666eddcff10c9b58f6be63fa94e4e13074214", - "keyword": "989112c7888e9cc80fd35870180c6c98165d953b", + "keyword": "e18b1ffbd79d76f9a0e90b6d474cc2499e343f0b", }), ("http://joyreactor.cc/post/1299", { # "malformed" JSON "url": "ac900743ed7cf1baf3db3b531c3bc414bf1ffcde", diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 679059c..cb70fe5 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -17,7 +17,7 @@ class RedditExtractor(Extractor): """Base class for reddit extractors""" category = "reddit" directory_fmt = ("{category}", "{subreddit}") - filename_fmt = "{id} {title[:220]}.{extension}" + filename_fmt = "{id}{num:? //>02} {title[:220]}.{extension}" archive_fmt = "{filename}" cookiedomain = None @@ -50,11 +50,22 @@ class RedditExtractor(Extractor): yield Message.Directory, submission visited.add(submission["id"]) url = submission["url"] + submission["num"] = 0 if url.startswith("https://i.redd.it/"): text.nameext_from_url(url, submission) yield Message.Url, url, submission + elif "gallery_data" in submission: + meta = submission["media_metadata"] + items = submission["gallery_data"]["items"] + for submission["num"], item in enumerate(items, 1): + url = meta[item["media_id"]]["s"]["u"] + url = url.partition("?")[0] + url = url.replace("/preview.", "/i.", 1) + text.nameext_from_url(url, submission) + yield Message.Url, url, submission + elif submission["is_video"]: if videos: text.nameext_from_url(url, submission) @@ -160,9 +171,8 @@ class RedditSubmissionExtractor(RedditExtractor): """Extractor for URLs from a submission on reddit.com""" subcategory = "submission" pattern = (r"(?:https?://)?(?:" - r"(?:\w+\.)?reddit\.com/r/[^/?&#]+/comments|" - r"redd\.it" - r")/([a-z0-9]+)") + r"(?:\w+\.)?reddit\.com/(?:r/[^/?&#]+/comments|gallery)" + r"|redd\.it)/([a-z0-9]+)") test = ( ("https://www.reddit.com/r/lavaporn/comments/8cqhub/", { "pattern": r"https://c2.staticflickr.com/8/7272/\w+_k.jpg", @@ -173,6 +183,11 @@ class RedditSubmissionExtractor(RedditExtractor): "pattern": r"https://", "count": 3, }), + ("https://www.reddit.com/gallery/hrrh23", { + "url": "25b91ede15459470274dd17291424b037ed8b0ae", + "content": "1e7dde4ee7d5f4c4b45749abfd15b2dbfa27df3f", + "count": 3, + }), ("https://old.reddit.com/r/lavaporn/comments/2a00np/"), ("https://np.reddit.com/r/lavaporn/comments/2a00np/"), ("https://m.reddit.com/r/lavaporn/comments/2a00np/"), diff --git a/gallery_dl/extractor/shopify.py b/gallery_dl/extractor/shopify.py index 28ee46c..9d1df18 100644 --- a/gallery_dl/extractor/shopify.py +++ b/gallery_dl/extractor/shopify.py @@ -74,21 +74,33 @@ class ShopifyCollectionExtractor(ShopifyExtractor): def products(self): params = text.parse_query(self.params) params["page"] = text.parse_int(params.get("page"), 1) - search_re = re.compile(r"/collections/[\w-]+/products/[\w-]+") - - while True: - page = self.request(self.item_url, params=params).text - urls = search_re.findall(page) - last = None - - if not urls: - return - for path in urls: - if last == path: - continue - last = path - yield self.root + path - params["page"] += 1 + fetch = True + last = None + + for pattern in ( + r"/collections/[\w-]+/products/[\w-]+", + r"href=[\"'](/products/[\w-]+)", + ): + search_re = re.compile(pattern) + + while True: + if fetch: + page = self.request(self.item_url, params=params).text + urls = search_re.findall(page) + + if len(urls) < 3: + if last: + return + fetch = False + break + fetch = True + + for path in urls: + if last == path: + continue + last = path + yield self.root + path + params["page"] += 1 class ShopifyProductExtractor(ShopifyExtractor): @@ -121,7 +133,6 @@ EXTRACTORS = { ("https://www.fashionnova.com/collections/mini-dresses/?page=1"), ("https://www.fashionnova.com/collections/mini-dresses#1"), ), - }, } diff --git a/gallery_dl/extractor/simplyhentai.py b/gallery_dl/extractor/simplyhentai.py index abf9995..a0d34d1 100644 --- a/gallery_dl/extractor/simplyhentai.py +++ b/gallery_dl/extractor/simplyhentai.py @@ -45,11 +45,12 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor): self.gallery_url = extr('<link rel="canonical" href="', '"') title = extr('<meta property="og:title" content="', '"') + image = extr('<meta property="og:image" content="', '"') if not title: raise exception.NotFoundError("gallery") data = { "title" : text.unescape(title), - "gallery_id": text.parse_int(extr('/Album/', '/')), + "gallery_id": text.parse_int(image.split("/")[-2]), "parody" : split(extr('box-title">Series</div>', '</div>')), "language" : text.remove_html(extr( 'box-title">Language</div>', '</div>')) or None, diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py index 44a0a84..163102d 100644 --- a/gallery_dl/extractor/smugmug.py +++ b/gallery_dl/extractor/smugmug.py @@ -112,13 +112,13 @@ class SmugmugImageExtractor(SmugmugExtractor): test = ( ("https://tdm.smugmug.com/Nature/Dove/i-kCsLJT6", { "url": "f624ad7293afd6412a7d34e3950a118596c36c85", - "keyword": "085861b5935e3cd96ad15954039bc2419cdf1c27", + "keyword": "d69c69c1517b8ea77bc763cffc4d0a4002dfee3f", "content": "ecbd9d7b4f75a637abc8d35319be9ec065a44eb0", }), # video ("https://tstravels.smugmug.com/Dailies/Daily-Dose-2015/i-39JFNzB", { "url": "04d0ab1ff829ca7d78f5acb5548953df08e9a5ee", - "keyword": "e0927fda7b1c39c19974625270102ad7e72b9d6f", + "keyword": "720da317232504f05099da37802ed3c3ce3cd310", }), ) diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py index 08d8850..076d0c0 100644 --- a/gallery_dl/extractor/subscribestar.py +++ b/gallery_dl/extractor/subscribestar.py @@ -11,7 +11,6 @@ from .common import Extractor, Message from .. import text, exception from ..cache import cache -import datetime import json @@ -35,8 +34,6 @@ class SubscribestarExtractor(Extractor): self.cookiedomain = "subscribestar.adult" self.subcategory += "-adult" Extractor.__init__(self, match) - self.metadata = self.config("metadata", False) - self._year = " " + str(datetime.date.today().year) def items(self): self.login() @@ -92,38 +89,46 @@ class SubscribestarExtractor(Extractor): @staticmethod def _media_from_post(html): + media = [] + gallery = text.extract(html, 'data-gallery="', '"')[0] if gallery: - return [ + media.extend( item for item in json.loads(text.unescape(gallery)) if "/previews/" not in item["url"] - ] - return () + ) + + attachments = text.extract( + html, 'class="uploads-docs"', 'data-role="post-edit_form"')[0] + if attachments: + for att in attachments.split('class="doc_preview"')[1:]: + media.append({ + "id" : text.parse_int(text.extract( + att, 'data-upload-id="', '"')[0]), + "url" : text.extract(att, 'href="', '"')[0], + "type": "attachment", + }) + + return media def _data_from_post(self, html): extr = text.extract_from(html) - data = { + return { "post_id" : text.parse_int(extr('data-id="', '"')), "author_id" : text.parse_int(extr('data-user-id="', '"')), "author_name": text.unescape(extr('href="/', '"')), "author_nick": text.unescape(extr('>', '<')), + "date" : self._parse_datetime(text.remove_html(extr( + 'class="post-date">', '</'))), "content" : (extr( '<div class="post-content', '<div class="post-uploads') .partition(">")[2]), } - if self.metadata: - url = "{}/posts/{}".format(self.root, data["post_id"]) - page = self.request(url).text - data["date"] = self._parse_datetime(text.extract( - page, 'class="section-subtitle">', '<')[0]) - - return data - def _parse_datetime(self, dt): - date = text.parse_datetime(dt, "%B %d, %Y %H:%M") + date = text.parse_datetime(dt, "%b %d, %Y %I:%M %p") if date is dt: - date = text.parse_datetime(dt + self._year, "%d %b %H:%M %Y") + date = text.parse_datetime(dt, "%B %d, %Y %I:%M %p") return date @@ -141,6 +146,7 @@ class SubscribestarUserExtractor(SubscribestarExtractor): "author_name": "subscribestar", "author_nick": "SubscribeStar", "content": str, + "date" : "type:datetime", "height" : int, "id" : int, "pinned" : bool, @@ -209,8 +215,7 @@ class SubscribestarPostExtractor(SubscribestarExtractor): def posts(self): url = "{}/posts/{}".format(self.root, self.item) - self._page = self.request(url).text - return (self._page,) + return (self.request(url).text,) def _data_from_post(self, html): extr = text.extract_from(html) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 2530040..71f14dc 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -503,8 +503,9 @@ class TwitterAPI(): if response.status_code < 400: return response.json() if response.status_code == 429: - self.extractor.wait(until=response.headers["x-rate-limit-reset"]) - return self._call(endpoint, params) + until = response.headers.get("x-rate-limit-reset") + self.extractor.wait(until=until, seconds=(None if until else 60)) + return self._call(endpoint, params, method) try: msg = ", ".join( diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py index c9f0ec3..76e4e3d 100644 --- a/gallery_dl/extractor/vsco.py +++ b/gallery_dl/extractor/vsco.py @@ -61,7 +61,7 @@ class VscoExtractor(Extractor): "video" : img["is_video"], "width" : img["width"], "height": img["height"], - "description": img["description"], + "description": img.get("description") or "", }) yield Message.Url, url, data diff --git a/gallery_dl/extractor/xhamster.py b/gallery_dl/extractor/xhamster.py index a338216..5f11df3 100644 --- a/gallery_dl/extractor/xhamster.py +++ b/gallery_dl/extractor/xhamster.py @@ -146,7 +146,7 @@ class XhamsterGalleryExtractor(XhamsterExtractor): def _data(self, url): page = self.request(url).text return json.loads(text.extract( - page, "window.initials =", "</script>")[0].rstrip("\n\r;")) + page, "window.initials=", "</script>")[0].rstrip("\n\r;")) class XhamsterUserExtractor(XhamsterExtractor): diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 4c18e4d..163c3c6 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -48,6 +48,9 @@ class Job(): extr.category = pextr.category extr.subcategory = pextr.subcategory + # transfer parent directory + extr._parentdir = pextr._parentdir + # reuse connection adapters extr.session.adapters = pextr.session.adapters diff --git a/gallery_dl/version.py b/gallery_dl/version.py index fd52077..b2b59e0 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.14.3" +__version__ = "1.14.4" diff --git a/test/test_oauth.py b/test/test_oauth.py index e4664e4..7455928 100644 --- a/test/test_oauth.py +++ b/test/test_oauth.py @@ -15,7 +15,6 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from gallery_dl import oauth, text # noqa E402 TESTSERVER = "http://term.ie/oauth/example" -TESTSERVER = "http://term.ie/oauth/example" CONSUMER_KEY = "key" CONSUMER_SECRET = "secret" REQUEST_TOKEN = "requestkey" @@ -96,12 +95,17 @@ class TestOAuthSession(unittest.TestCase): def _oauth_request(self, endpoint, params=None, oauth_token=None, oauth_token_secret=None): + # the test server at 'term.ie' is unreachable + raise unittest.SkipTest() + session = oauth.OAuth1Session( CONSUMER_KEY, CONSUMER_SECRET, oauth_token, oauth_token_secret, ) try: - return session.get(TESTSERVER + endpoint, params=params).text + response = session.get(TESTSERVER + endpoint, params=params) + response.raise_for_status() + return response.text except OSError: raise unittest.SkipTest() diff --git a/test/test_results.py b/test/test_results.py index dd1ed1d..1f2f699 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -31,8 +31,10 @@ TRAVIS_SKIP = { # temporary issues, etc. BROKEN = { - "bobx", + "hentaihand", "imagevenue", + "mangapark", + "ngomik", "photobucket", "worldthree", } @@ -317,7 +319,7 @@ def setup_test_config(): config.set(("extractor", "mangoxo") , "password", "5zbQF10_5u25259Ma") for category in ("danbooru", "instagram", "twitter", "subscribestar", - "e621"): + "e621", "inkbunny"): config.set(("extractor", category), "username", None) config.set(("extractor", "mastodon.social"), "access-token", |