From 7cf59dc17c3607e096292462ed15d391be4e3dfd Mon Sep 17 00:00:00 2001 From: Unit 193 Date: Sat, 15 Aug 2020 17:48:11 -0400 Subject: New upstream version 1.14.4. --- CHANGELOG.md | 27 ++++ PKG-INFO | 15 +- README.rst | 13 +- data/man/gallery-dl.1 | 2 +- data/man/gallery-dl.conf.5 | 19 ++- gallery_dl.egg-info/PKG-INFO | 15 +- gallery_dl.egg-info/SOURCES.txt | 2 +- gallery_dl/exception.py | 5 + gallery_dl/extractor/__init__.py | 2 +- gallery_dl/extractor/blogger.py | 50 ++++++- gallery_dl/extractor/bobx.py | 135 ------------------ gallery_dl/extractor/common.py | 3 +- gallery_dl/extractor/exhentai.py | 6 +- gallery_dl/extractor/gfycat.py | 10 ++ gallery_dl/extractor/hentainexus.py | 31 ++--- gallery_dl/extractor/imgbb.py | 6 +- gallery_dl/extractor/imgur.py | 12 +- gallery_dl/extractor/inkbunny.py | 251 ++++++++++++++++++++++++++++++++++ gallery_dl/extractor/instagram.py | 13 +- gallery_dl/extractor/mangapanda.py | 92 ++++++++++++- gallery_dl/extractor/mangareader.py | 122 +++++++---------- gallery_dl/extractor/mangoxo.py | 2 +- gallery_dl/extractor/myportfolio.py | 9 +- gallery_dl/extractor/paheal.py | 4 +- gallery_dl/extractor/pinterest.py | 4 +- gallery_dl/extractor/pixnet.py | 10 +- gallery_dl/extractor/reactor.py | 9 +- gallery_dl/extractor/reddit.py | 23 +++- gallery_dl/extractor/shopify.py | 43 +++--- gallery_dl/extractor/simplyhentai.py | 3 +- gallery_dl/extractor/smugmug.py | 4 +- gallery_dl/extractor/subscribestar.py | 43 +++--- gallery_dl/extractor/twitter.py | 5 +- gallery_dl/extractor/vsco.py | 2 +- gallery_dl/extractor/xhamster.py | 2 +- gallery_dl/job.py | 3 + gallery_dl/version.py | 2 +- test/test_oauth.py | 8 +- test/test_results.py | 6 +- 39 files changed, 668 insertions(+), 345 deletions(-) delete mode 100644 gallery_dl/extractor/bobx.py create mode 100644 gallery_dl/extractor/inkbunny.py diff --git a/CHANGELOG.md b/CHANGELOG.md index ac09ee7..fa9f17c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,32 @@ # Changelog +## 1.14.4 - 2020-08-15 +### Additions +- [blogger] add `search` extractor ([#925](https://github.com/mikf/gallery-dl/issues/925)) +- [blogger] support searching posts by labels ([#925](https://github.com/mikf/gallery-dl/issues/925)) +- [inkbunny] add `user` and `post` extractors ([#283](https://github.com/mikf/gallery-dl/issues/283)) +- [instagram] support `/reel/` URLs +- [pinterest] support `pinterest.co.uk` URLs ([#914](https://github.com/mikf/gallery-dl/issues/914)) +- [reddit] support gallery posts ([#920](https://github.com/mikf/gallery-dl/issues/920)) +- [subscribestar] extract attached media files ([#852](https://github.com/mikf/gallery-dl/issues/852)) +### Fixes +- [blogger] improve error messages for missing posts/blogs ([#903](https://github.com/mikf/gallery-dl/issues/903)) +- [exhentai] adjust image limit costs ([#940](https://github.com/mikf/gallery-dl/issues/940)) +- [gfycat] skip malformed gfycat responses ([#902](https://github.com/mikf/gallery-dl/issues/902)) +- [imgur] handle 403 overcapacity responses ([#910](https://github.com/mikf/gallery-dl/issues/910)) +- [instagram] wait before GraphQL requests ([#901](https://github.com/mikf/gallery-dl/issues/901)) +- [mangareader] fix extraction +- [mangoxo] fix login +- [pixnet] detect password-protected albums ([#177](https://github.com/mikf/gallery-dl/issues/177)) +- [simplyhentai] fix `gallery_id` extraction +- [subscribestar] update `date` parsing +- [vsco] handle missing `description` fields +- [xhamster] fix extraction ([#917](https://github.com/mikf/gallery-dl/issues/917)) +- allow 
`parent-directory` to work recursively ([#905](https://github.com/mikf/gallery-dl/issues/905))
- skip external OAuth tests ([#908](https://github.com/mikf/gallery-dl/issues/908))
### Removals
- [bobx] remove module

## 1.14.3 - 2020-07-18
### Additions
- [8muses] support `comics.8muses.com` URLs
diff --git a/PKG-INFO b/PKG-INFO
index ab22502..afc4636 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gallery_dl
-Version: 1.14.3
+Version: 1.14.4
 Summary: Command-line program to download image-galleries and -collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Author: Mike Fährmann
@@ -94,8 +94,8 @@ Description: ==========
         put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__,
         and run it inside a command prompt (like ``cmd.exe``).
 
-        - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.14.3/gallery-dl.exe>`__
-        - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.14.3/gallery-dl.bin>`__
+        - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.14.4/gallery-dl.exe>`__
+        - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.14.4/gallery-dl.bin>`__
 
         These executables include a Python 3.8 interpreter
         and all required Python packages.
@@ -165,7 +165,7 @@ Description: ==========
             $ gallery-dl "r:https://pastebin.com/raw/FLwrCYsT"
 
-        If a site's address is nonstandard for its extractor, you can prefix the URL with the
+        If a site's address is nonstandard for its extractor, you can prefix the URL with the
         extractor's name to force the use of a specific extractor:
 
         .. code:: bash
@@ -216,8 +216,9 @@ Description: ==========
         a username & password pair.
         This is necessary for ``pixiv``, ``nijie``, and ``seiga``
         and optional for
-        ``danbooru``, ``e621``, ``exhentai``, ``idolcomplex``, ``instagram``,
-        ``luscious``, ``sankaku``, ``subscribestar``, ``tsumino``, and ``twitter``.
+        ``danbooru``, ``e621``, ``exhentai``, ``idolcomplex``, ``inkbunny``,
+        ``instagram``, ``luscious``, ``sankaku``, ``subscribestar``, ``tsumino``,
+        and ``twitter``.
 
         You can set the necessary information in your configuration file
         (cf. gallery-dl.conf_)
@@ -310,7 +311,7 @@ Description: ==========
         .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf
         .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst
         .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
-        .. _stable: https://github.com/mikf/gallery-dl/archive/v1.14.3.tar.gz
+        .. _stable: https://github.com/mikf/gallery-dl/archive/v1.14.4.tar.gz
         .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz
 
         .. _Python: https://www.python.org/downloads/
diff --git a/README.rst b/README.rst
index 360c02b..2148c42 100644
--- a/README.rst
+++ b/README.rst
@@ -83,8 +83,8 @@ Download a standalone executable file,
 put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__,
 and run it inside a command prompt (like ``cmd.exe``).
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.14.3/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.14.3/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.14.4/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.14.4/gallery-dl.bin>`__
 
 These executables include a Python 3.8 interpreter
 and all required Python packages.
@@ -154,7 +154,7 @@ Filter manga chapters by language and chapter number:
 
     $ gallery-dl "r:https://pastebin.com/raw/FLwrCYsT"
 
-If a site's address is nonstandard for its extractor, you can prefix the URL with the
+If a site's address is nonstandard for its extractor, you can prefix the URL with the
 extractor's name to force the use of a specific extractor:
 
 .. code:: bash
@@ -205,8 +205,9 @@ Some extractors require you to provide valid login credentials in the form of
 a username & password pair.
 This is necessary for ``pixiv``, ``nijie``, and ``seiga``
 and optional for
-``danbooru``, ``e621``, ``exhentai``, ``idolcomplex``, ``instagram``,
-``luscious``, ``sankaku``, ``subscribestar``, ``tsumino``, and ``twitter``.
+``danbooru``, ``e621``, ``exhentai``, ``idolcomplex``, ``inkbunny``,
+``instagram``, ``luscious``, ``sankaku``, ``subscribestar``, ``tsumino``,
+and ``twitter``.
 
 You can set the necessary information in your configuration file
 (cf. gallery-dl.conf_)
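For reference, the configuration-file approach described above can also be done programmatically through ``gallery_dl.config``, the same mechanism this patch's test suite uses. A minimal sketch; the account values and the inkbunny URL are placeholders, only the ``username``/``password`` option names are taken from this release:

    from gallery_dl import config, job

    # "inkbunny" is the extractor this release adds to the optional-login list
    config.set(("extractor", "inkbunny"), "username", "my-username")   # placeholder
    config.set(("extractor", "inkbunny"), "password", "my-password")  # placeholder

    # run a download job with those credentials applied
    job.DownloadJob("https://inkbunny.net/gallery/example").run()     # hypothetical URL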
@@ -299,7 +300,7 @@ access to *gallery-dl*. Authorize it and you will be shown one or more
 .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf
 .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst
 .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
-.. _stable: https://github.com/mikf/gallery-dl/archive/v1.14.3.tar.gz
+.. _stable: https://github.com/mikf/gallery-dl/archive/v1.14.4.tar.gz
 .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz
 
 .. _Python: https://www.python.org/downloads/
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index f05f2e8..e554159 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2020-07-18" "1.14.3" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2020-08-15" "1.14.4" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index 039e750..67e51d4 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2020-07-18" "1.14.3" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2020-08-15" "1.14.4" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
 .\" disable justification (adjust text to left margin only)
@@ -111,7 +111,7 @@
 subcategory image
 
 Note: Even if the value of the \f[I]extension\f[] key is missing or
-\f[I]None\f[], it will filled in later when the file download is
+\f[I]None\f[], it will be filled in later when the file download is
 starting. This key is therefore always available to provide a valid
 filename extension.
@@ -284,6 +284,8 @@ and optional for
 .br
 * \f[I]idolcomplex\f[]
 .br
+* \f[I]inkbunny\f[]
+.br
 * \f[I]instagram\f[]
 .br
 * \f[I]luscious\f[]
@@ -1006,6 +1008,19 @@ Controls whether to choose the GIF or MP4 version of an animation.
 .br
 * \f[I]"always"\f[]: Always choose MP4.
 
+.SS extractor.inkbunny.orderby
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]"create_datetime"\f[]
+
+.IP "Description:" 4
+Value of the \f[I]orderby\f[] parameter for submission searches.
+
+(See \f[I]API#Search <https://wiki.inkbunny.net/wiki/API#Search>\f[]
+for details)
+
 .SS extractor.instagram.highlights
 .IP "Type:" 6
 \f[I]bool\f[]
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index 0007699..8f6f112 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gallery-dl
-Version: 1.14.3
+Version: 1.14.4
 Summary: Command-line program to download image-galleries and -collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Author: Mike Fährmann
@@ -94,8 +94,8 @@ Description: ==========
         put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__,
         and run it inside a command prompt (like ``cmd.exe``).
 
-        - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.14.3/gallery-dl.exe>`__
-        - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.14.3/gallery-dl.bin>`__
+        - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.14.4/gallery-dl.exe>`__
+        - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.14.4/gallery-dl.bin>`__
 
         These executables include a Python 3.8 interpreter
         and all required Python packages.
@@ -165,7 +165,7 @@ Description: ========== $ gallery-dl "r:https://pastebin.com/raw/FLwrCYsT" - If a site's address is nonstandard for its extractor, you can prefix the URL with the + If a site's address is nonstandard for its extractor, you can prefix the URL with the extractor's name to force the use of a specific extractor: .. code:: bash @@ -216,8 +216,9 @@ Description: ========== a username & password pair. This is necessary for ``pixiv``, ``nijie``, and ``seiga`` and optional for - ``danbooru``, ``e621``, ``exhentai``, ``idolcomplex``, ``instagram``, - ``luscious``, ``sankaku``, ``subscribestar``, ``tsumino``, and ``twitter``. + ``danbooru``, ``e621``, ``exhentai``, ``idolcomplex``, ``inkbunny``, + ``instagram``, ``luscious``, ``sankaku``, ``subscribestar``, ``tsumino``, + and ``twitter``. You can set the necessary information in your configuration file (cf. gallery-dl.conf_) @@ -310,7 +311,7 @@ Description: ========== .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst - .. _stable: https://github.com/mikf/gallery-dl/archive/v1.14.3.tar.gz + .. _stable: https://github.com/mikf/gallery-dl/archive/v1.14.4.tar.gz .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz .. _Python: https://www.python.org/downloads/ diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt index 80c9f4f..56c9245 100644 --- a/gallery_dl.egg-info/SOURCES.txt +++ b/gallery_dl.egg-info/SOURCES.txt @@ -49,7 +49,6 @@ gallery_dl/extractor/aryion.py gallery_dl/extractor/bcy.py gallery_dl/extractor/behance.py gallery_dl/extractor/blogger.py -gallery_dl/extractor/bobx.py gallery_dl/extractor/booru.py gallery_dl/extractor/common.py gallery_dl/extractor/danbooru.py @@ -86,6 +85,7 @@ gallery_dl/extractor/imgbb.py gallery_dl/extractor/imgbox.py gallery_dl/extractor/imgth.py gallery_dl/extractor/imgur.py +gallery_dl/extractor/inkbunny.py gallery_dl/extractor/instagram.py gallery_dl/extractor/issuu.py gallery_dl/extractor/kabeuchi.py diff --git a/gallery_dl/exception.py b/gallery_dl/exception.py index 783e2b2..f553d41 100644 --- a/gallery_dl/exception.py +++ b/gallery_dl/exception.py @@ -51,6 +51,11 @@ class HttpError(ExtractionError): default = "HTTP request failed" code = 4 + def __init__(self, message, response=None): + ExtractionError.__init__(self, message) + self.response = response + self.status = response.status_code if response else 0 + class NotFoundError(ExtractionError): """Requested resource (gallery/image) could not be found""" diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 3184663..6f8867c 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -23,7 +23,6 @@ modules = [ "bcy", "behance", "blogger", - "bobx", "danbooru", "deviantart", "dynastyscans", @@ -54,6 +53,7 @@ modules = [ "imgbox", "imgth", "imgur", + "inkbunny", "instagram", "issuu", "kabeuchi", diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py index 331cfc2..9c18e0e 100644 --- a/gallery_dl/extractor/blogger.py +++ b/gallery_dl/extractor/blogger.py @@ -147,7 +147,7 @@ class BloggerPostExtractor(BloggerExtractor): class BloggerBlogExtractor(BloggerExtractor): """Extractor for an entire Blogger blog""" subcategory = "blog" - pattern = BASE_PATTERN + "/?$" + pattern = BASE_PATTERN + 
r"/?$"
     test = (
         ("https://julianbphotography.blogspot.com/", {
             "range": "1-25",
@@ -164,6 +164,34 @@ class BloggerBlogExtractor(BloggerExtractor):
         return self.api.blog_posts(blog["id"])
 
 
+class BloggerSearchExtractor(BloggerExtractor):
+    """Extractor for search results and labels"""
+    subcategory = "search"
+    pattern = BASE_PATTERN + r"/search(?:/?\?q=([^/?&#]+)|/label/([^/?&#]+))"
+    test = (
+        ("https://julianbphotography.blogspot.com/search?q=400mm", {
+            "count": "< 10"
+        }),
+        ("https://dmmagazine.blogspot.com/search/label/D%26D", {
+            "range": "1-25",
+            "count": 25,
+        }),
+    )
+
+    def __init__(self, match):
+        BloggerExtractor.__init__(self, match)
+        query = match.group(3)
+        if query:
+            self.query, self.label = query, None
+        else:
+            self.query, self.label = None, match.group(4)
+
+    def posts(self, blog):
+        if self.query:
+            return self.api.blog_search(blog["id"], text.unquote(self.query))
+        return self.api.blog_posts(blog["id"], text.unquote(self.label))
+
+
 class BloggerAPI():
     """Minimal interface for the Blogger v3 API
 
@@ -176,19 +204,27 @@ class BloggerAPI():
         self.api_key = extractor.config("api-key", self.API_KEY)
 
     def blog_by_url(self, url):
-        return self._call("blogs/byurl", {"url": url})
+        return self._call("blogs/byurl", {"url": url}, "blog")
+
+    def blog_posts(self, blog_id, label=None):
+        endpoint = "blogs/{}/posts".format(blog_id)
+        params = {"labels": label}
+        return self._pagination(endpoint, params)
 
-    def blog_posts(self, blog_id):
-        return self._pagination("blogs/{}/posts".format(blog_id), {})
+    def blog_search(self, blog_id, query):
+        endpoint = "blogs/{}/posts/search".format(blog_id)
+        params = {"q": query}
+        return self._pagination(endpoint, params)
 
     def post_by_path(self, blog_id, path):
         endpoint = "blogs/{}/posts/bypath".format(blog_id)
-        return self._call(endpoint, {"path": path})
+        return self._call(endpoint, {"path": path}, "post")
 
-    def _call(self, endpoint, params):
+    def _call(self, endpoint, params, notfound=None):
         url = "https://www.googleapis.com/blogger/v3/" + endpoint
         params["key"] = self.api_key
-        return self.extractor.request(url, params=params).json()
+        return self.extractor.request(
+            url, params=params, notfound=notfound).json()
 
     def _pagination(self, endpoint, params):
         while True:
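For context, the new ``blog_search()`` above boils down to one paginated call against the public Blogger v3 REST endpoint. A standalone sketch of that request flow, assuming a valid API key (``API_KEY`` is a placeholder; the ``nextPageToken``/``pageToken`` scheme is Google's documented pagination):

    import requests

    API_KEY = "..."  # placeholder: a Blogger v3 API key

    def blog_search(blog_id, query):
        """Yield every post of 'blog_id' matching 'query', page by page."""
        url = ("https://www.googleapis.com/blogger/v3"
               "/blogs/{}/posts/search".format(blog_id))
        params = {"q": query, "key": API_KEY}
        while True:
            data = requests.get(url, params=params).json()
            yield from data.get("items", ())
            if "nextPageToken" not in data:
                return
            params["pageToken"] = data["nextPageToken"]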
diff --git a/gallery_dl/extractor/bobx.py b/gallery_dl/extractor/bobx.py
deleted file mode 100644
index 94a2840..0000000
--- a/gallery_dl/extractor/bobx.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2018-2019 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extract images from http://www.bobx.com/dark/"""
-
-from .common import Extractor, Message
-from .. import text
-from ..cache import memcache
-import random
-import time
-
-
-class BobxExtractor(Extractor):
-    """Base class for bobx extractors"""
-    category = "bobx"
-    root = "http://www.bobx.com"
-    cookiedomain = ".bobx.com"
-    per_page = 80
-
-    def __init__(self, match):
-        Extractor.__init__(self, match)
-        self.path = match.group(1)
-
-    def login(self):
-        if not self._check_cookies(("BobXUser",)):
-            self._update_cookies(self._login_impl())
-
-    @memcache()
-    def _login_impl(self):
-        """Generate a randomized 'BobXUser' cookie"""
-        rand = random.randrange
-        tnow = time.time() - rand(60, 3600)
-
-        return {"BobXUser": "{}.{}.{}.{}.{}.{}".format(
-            int(tnow),
-            rand(128, 192), rand(0, 256), rand(0, 256), rand(0, 256),
-            tnow + 622080000,  # timestamp in 7200 days
-        )}
-
-
-class BobxGalleryExtractor(BobxExtractor):
-    """Extractor for individual image galleries on bobx.com"""
-    subcategory = "gallery"
-    directory_fmt = ("{category}", "{model}", "{title}")
-    filename_fmt = "{model}_{image_id}_{num:>03}.{extension}"
-    archive_fmt = "{image_id}"
-    pattern = (r"(?:https?://)?(?:www\.)?bobx\.com"
-               r"/([^/]+/[^/]+/photoset/[\w-]+)-\d+-\d+-\d+\.html")
-    test = (
-        (("http://www.bobx.com/idol/mikoto-hibi"
-          "/photoset/wpb-2018-_11-0-2-8.html"), {
-            "url": "93972d6a661f6627e963d62c9d15531e6b36a389",
-            "keyword": "6c620862db494ed05e69356ba30e604b167b0670",
-            "content": "3f176b7fe752524cec21a763aa55567e41181e07",
-        }),
-        (("http://www.bobx.com/idol/nashiko-momotsuki"
-          "/photoset/wpb-net-_221---2018-08---magic-of-summer-0-10-10.html"), {
-            "url": "f5d6c0cd0881ae6f504c21a90d86e3464dc54e8e",
-            "keyword": "f4819c75f494044348889ecd27771508464c0f5f",
-        }),
-    )
-
-    def items(self):
-        self.login()
-
-        num = 0
-        while True:
-            url = "{}/{}-{}-10-8.html".format(self.root, self.path, num)
-            page = self.request(url, encoding="utf-8").text
-
-            if num == 0:
-                data = self.metadata(page)
-                yield Message.Version, 1
-                yield Message.Directory, data
-                data["num"] = 0
-
-            for url in self.images(page):
-                url = text.urljoin(self.root, url.replace("-preview-", "-"))
-                data = text.nameext_from_url(url, data)
-                data["image_id"] = text.parse_int(
-                    data["filename"].rpartition("-")[2])
-                data["num"] += 1
-                yield Message.Url, url, data
-
-            num += self.per_page
-            if num >= data["count"]:
-                return
-
-    @staticmethod
-    def metadata(page):
-        """Collect metadata for extractor-job"""
-        info = text.extract(page, "<title>", "</title>")[0]
-        model, _, info = info.partition(" in ")
-        info, _, count = info.rpartition(" of ")
-        title = info.rpartition(" - @")[0]
-        return {
-            "title": text.unquote(title),
-            "model": text.unquote(model),
-            "count": text.parse_int(count),
-        }
-
-    @staticmethod
-    def images(page):
-        """Extract all image-urls"""
-        page = text.extract(page, "= 6",
-    })
-
-    def items(self):
-        self.login()
-        url = "{}/{}/".format(self.root, self.path)
-        data = {"_extractor": BobxGalleryExtractor}
-        page = self.request(url).text
-        skip = True
-
-        yield Message.Version, 1
-        for part in text.extract_iter(page, '="photoset/', '"'):
-            # skip every other entry
-            skip = not skip
-            if not skip:
-                yield Message.Queue, "{}photoset/{}".format(url, part), data
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index bbbd8a6..e6c0968 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -79,6 +79,7 @@ class Extractor():
         session = self.session if session is None else session
         kwargs.setdefault("timeout", self._timeout)
         kwargs.setdefault("verify", self._verify)
+        response = None
 
         while True:
             try:
@@ -125,7 +126,7 @@ class Extractor():
             time.sleep(min(2 ** (tries-1), 1800))
             tries += 1
 
-        raise exception.HttpError(msg)
+        raise exception.HttpError(msg, response)
 
     def wait(self, *, seconds=None, until=None, adjust=1.0,
              reason="rate limit reset"):
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index bf310ec..4cb10b4 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2014-2019 Mike Fährmann
+# Copyright 2014-2020 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -123,7 +123,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
               r"|/s/([\da-f]{10})/(\d+)-(\d+))")
     test = (
         ("https://exhentai.org/g/1200119/d55c44d3d0/", {
-            "keyword": "3eeae7bde70dd992402d4cc0230ea0f2c4af46c5",
+            "keyword": "199db053b4ccab94463b459e1cfe079df8cdcdd1",
             "content": "e9891a4c017ed0bb734cd1efba5cd03f594d31ff",
         }),
         ("https://exhentai.org/g/960461/4f0e369d82/", {
@@ -353,7 +353,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
             "height": text.parse_int(parts[2]),
             "size": size,
             # 1 initial point + 1 per 0.1 MB
-            "cost": 1 + math.ceil(size / 104857.6)
+            "cost": 1 + math.ceil(size / 100000)
         }
diff --git a/gallery_dl/extractor/gfycat.py b/gallery_dl/extractor/gfycat.py
index b4b0e49..ac1bca3 100644
--- a/gallery_dl/extractor/gfycat.py
+++ b/gallery_dl/extractor/gfycat.py
@@ -28,6 +28,9 @@ class GfycatExtractor(Extractor):
     def items(self):
         metadata = self.metadata()
         for gfycat in self.gfycats():
+            if "gfyName" not in gfycat:
+                self.log.warning("Skipping '%s' (malformed)", gfycat["gfyId"])
+                continue
             url = self._select_format(gfycat)
             gfycat.update(metadata)
             yield Message.Directory, gfycat
@@ -118,6 +121,10 @@ class GfycatImageExtractor(GfycatExtractor):
         ("https://www.gfycat.com/foolishforkedabyssiniancat", {
             "pattern": "https://redgifs.com/watch/foolishforkedabyssiniancat",
         }),
+        # malformed API response (#902)
+        ("https://gfycat.com/illexcitablehairstreak", {
+            "count": 0,
+        }),
         ("https://gfycat.com/gifs/detail/UnequaledHastyAnkole"),
         ("https://gfycat.com/ifr/UnequaledHastyAnkole"),
         ("https://gfycat.com/ru/UnequaledHastyAnkole"),
@@ -132,6 +139,9 @@ class GfycatImageExtractor(GfycatExtractor):
             data = {"_extractor": RedgifsImageExtractor}
             yield Message.Queue, url, data
         else:
+            if "gfyName" not in gfycat:
+                self.log.warning("Skipping '%s' (malformed)", gfycat["gfyId"])
+                return
             url = self._select_format(gfycat)
             yield Message.Directory, gfycat
             yield Message.Url, url, gfycat
diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py
index aa41836..49c1a98 100644
--- a/gallery_dl/extractor/hentainexus.py
+++ b/gallery_dl/extractor/hentainexus.py
@@ -36,21 +36,17 @@ class HentainexusGalleryExtractor(GalleryExtractor):
         rmve = text.remove_html
         extr = text.extract_from(page)
         data = {
-            "gallery_id" : text.parse_int(self.gallery_id),
-            "tags"       : extr('"og:description" content="', '"').split(", "),
-            "thumbnail"  : extr('"og:image" content="', '"'),
-            "title"      : extr('<h1 class="title">', '</h1>'),
-            "artist"     : rmve(extr('viewcolumn">Artist'     , '</td>')),
-            "book"       : rmve(extr('viewcolumn">Book'       , '</td>')),
-            "circle"     : rmve(extr('viewcolumn">Circle'     , '</td>')),
-            "event"      : rmve(extr('viewcolumn">Event'      , '</td>')),
-            "language"   : rmve(extr('viewcolumn">Language'   , '</td>')),
-            "magazine"   : rmve(extr('viewcolumn">Magazine'   , '</td>')),
-            "parody"     : rmve(extr('viewcolumn">Parody'     , '</td>')),
-            "publisher"  : rmve(extr('viewcolumn">Publisher'  , '</td>')),
-            "description": rmve(extr('viewcolumn">Description', '</td>')),
+            "gallery_id": text.parse_int(self.gallery_id),
+            "tags"      : extr('"og:description" content="', '"').split(", "),
+            "thumbnail" : extr('"og:image" content="', '"'),
+            "title"     : extr('<h1 class="title">', '</h1>'),
+        }
+        for key in ("Artist", "Book", "Circle", "Event", "Language",
+                    "Magazine", "Parody", "Publisher", "Description"):
+            data[key.lower()] = rmve(extr(
+                'viewcolumn">' + key + '</td>', '</td>'))
         data["lang"] = util.language_to_code(data["language"])
+
         if 'doujin' in data['tags']:
             data['type'] = 'Doujinshi'
         elif 'illustration' in data['tags']:
@@ -60,10 +56,10 @@ class HentainexusGalleryExtractor(GalleryExtractor):
         data["title_conventional"] = self._join_title(data)
         return data
 
-    def images(self, page):
+    def images(self, _):
         url = "{}/read/{}".format(self.root, self.gallery_id)
-        extr = text.extract_from(self.request(url).text)
-        urls = extr("initReader(", "]") + "]"
+        page = self.request(url).text
+        urls = text.extract(page, "initReader(", "]")[0] + "]"
         return [(url, None) for url in json.loads(urls)]
 
     @staticmethod
@@ -120,14 +116,13 @@ class HentainexusSearchExtractor(Extractor):
         self.params = text.parse_query(match.group(1))
 
     def items(self):
-        yield Message.Version, 1
         params = self.params
         path = "/"
+        data = {"_extractor": HentainexusGalleryExtractor}
         while path:
             page = self.request(self.root + path, params=params).text
             extr = text.extract_from(page)
-            data = {"_extractor": HentainexusGalleryExtractor}
             while True:
                 gallery_id = extr('<a href="/view/', '"')
                 if not gallery_id:
                     break
                 yield Message.Queue, self.root + "/view/" + gallery_id, data
diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py
new file mode 100644
+            if params["page"] >= data["pages_count"]:
+                return
+            if "get_rid" in params:
+                del params["get_rid"]
+                params["rid"] = data["rid"]
+            params["page"] += 1
+
+
+@cache(maxage=360*24*3600, keyarg=1)
+def _authenticate_impl(api, username, password):
+    api.extractor.log.info("Logging in as %s", username)
+
+    url = "https://inkbunny.net/api_login.php"
+    data = {"username": username, "password": password}
+    data = api.extractor.request(url, method="POST", data=data).json()
+
+    if "sid" not in data:
+        raise exception.AuthenticationError(data.get("error_message"))
+    return data["sid"]
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index bf6b10f..639f272 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -315,7 +315,7 @@ class InstagramExtractor(Extractor):
 
             if not has_next_page:
                 break
-
+            time.sleep(3)
             end_cursor = medias['page_info']['end_cursor']
             variables = '{{"{}":"{}","first":12,"after":"{}"}}'.format(
                 psdf['variables_id'],
@@ -342,7 +342,8 @@ class InstagramExtractor(Extractor):
 class InstagramImageExtractor(InstagramExtractor):
     """Extractor for PostPage"""
     subcategory = "image"
-    pattern = r"(?:https?://)?(?:www\.)?instagram\.com/(?:p|tv)/([^/?&#]+)"
+    pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
+               r"/(?:p|tv|reel)/([^/?&#]+)")
     test = (
         # GraphImage
         ("https://www.instagram.com/p/BqvsDleB3lV/", {
@@ -440,6 +441,8 @@ class InstagramImageExtractor(InstagramExtractor):
             }]
         }
         }),
+
+        ("https://www.instagram.com/reel/CDg_6Y1pxWu/"),
     )
 
     def __init__(self, match):
@@ -500,7 +503,7 @@ class InstagramUserExtractor(InstagramExtractor):
     """Extractor for ProfilePage"""
     subcategory = "user"
     pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
-               r"/(?!p/|explore/|directory/|accounts/|stories/|tv/)"
+               r"/(?!(?:p|explore|directory|accounts|stories|tv|reel)/)"
                r"([^/?&#]+)/?(?:$|[?#])")
     test = (
         ("https://www.instagram.com/instagram/", {
@@ -530,7 +533,7 @@ class InstagramUserExtractor(InstagramExtractor):
             'node_id': 'id',
             'variables_id': 'id',
             'edge_to_medias': 'edge_owner_to_timeline_media',
-            'query_hash': '44efc15d3c13342d02df0b5a9fa3d33f',
+            'query_hash': '15bf78a4ad24e33cbd838fdb31353ac1',
         })
 
         if self.config('highlights'):
@@ -599,5 +602,5 @@ class InstagramTagExtractor(InstagramExtractor):
             'node_id': 'name',
             'variables_id': 'tag_name',
             'edge_to_medias': 'edge_hashtag_to_media',
-            'query_hash': '7dabc71d3e758b1ec19ffb85639e427b',
+            'query_hash': 'c769cb6c71b24c8a86590b22402fda50',
         })
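The ``time.sleep(3)`` added above throttles successive GraphQL page requests (#901). Reduced to its essentials, that pagination loop looks roughly like the sketch below; this is not the extractor's exact code, and the query-hash value rotates over time (the hunks above swap in the current ones):

    import json
    import time

    GRAPHQL_URL = "https://www.instagram.com/graphql/query/"

    def timeline_media(session, user_id, query_hash):
        """Walk a profile timeline with a requests.Session,
        pausing between GraphQL requests."""
        variables = {"id": user_id, "first": 12}
        while True:
            params = {"query_hash": query_hash,
                      "variables": json.dumps(variables)}
            data = session.get(GRAPHQL_URL, params=params).json()
            media = data["data"]["user"]["edge_owner_to_timeline_media"]
            yield from media["edges"]
            if not media["page_info"]["has_next_page"]:
                return
            time.sleep(3)  # wait before the next GraphQL request
            variables["after"] = media["page_info"]["end_cursor"]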
diff --git a/gallery_dl/extractor/mangapanda.py b/gallery_dl/extractor/mangapanda.py
index 18ef005..a4b8340 100644
--- a/gallery_dl/extractor/mangapanda.py
+++ b/gallery_dl/extractor/mangapanda.py
@@ -1,14 +1,15 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2015-2019 Mike Fährmann
+# Copyright 2015-2020 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extract manga-chapters and entire manga from https://www.mangapanda.com/"""
+"""Extractors for https://www.mangapanda.com/"""
 
-from .mangareader import MangareaderMangaExtractor, MangareaderChapterExtractor
+from .common import ChapterExtractor, MangaExtractor
+from .. import text
 
 
 class MangapandaBase():
@@ -16,21 +17,102 @@ class MangapandaBase():
     category = "mangapanda"
     root = "https://www.mangapanda.com"
 
+    @staticmethod
+    def parse_page(page, data):
+        """Parse metadata on 'page' and add it to 'data'"""
+        text.extract_all(page, (
+            ("manga"  , '<h2 class="aname">', '</h2>'),
+            ("release", '>Year of Release:</td>\n<td>', '</td>'),
+            ('author' , '>Author:</td>\n<td>', '</td>'),
+            ('artist' , '>Artist:</td>\n<td>', '</td>'),
+        ), values=data)
+        data["manga"] = data["manga"].strip()
+        data["author"] = text.unescape(data["author"])
+        data["artist"] = text.unescape(data["artist"])
+        return data
 
-class MangapandaChapterExtractor(MangapandaBase, MangareaderChapterExtractor):
+
+class MangapandaChapterExtractor(MangapandaBase, ChapterExtractor):
     """Extractor for manga-chapters from mangapanda.com"""
+    archive_fmt = "{manga}_{chapter}_{page}"
     pattern = r"(?:https?://)?(?:www\.)?mangapanda\.com((/[^/?&#]+)/(\d+))"
     test = ("https://www.mangapanda.com/red-storm/2", {
         "url": "1f633f776e950531ba9b1e81965316458e785261",
         "keyword": "b24df4b9cc36383fb6a44e06d32a3884a4dcb5fb",
     })
 
+    def __init__(self, match):
+        path, self.url_title, self.chapter = match.groups()
+        ChapterExtractor.__init__(self, match, self.root + path)
+
+    def metadata(self, chapter_page):
+        page = self.request(self.root + self.url_title).text
+        data = self.parse_page(page, {
+            "chapter": text.parse_int(self.chapter),
+            "lang": "en",
+            "language": "English",
+        })
+        text.extract_all(page, (
+            ('title', ' ' + self.chapter + '</a> : ', '</td>'),
+            ('date', '<td>', '</td>'),
+        ), page.index('<div id="chapterlist">'), data)
+        data["count"] = text.parse_int(text.extract(
+            chapter_page, ' of ', '<')[0]
+        )
+        return data
+
+    def images(self, page):
+        while True:
+            next_url, image_url, image_data = self.get_image_metadata(page)
+            yield image_url, image_data
+
+            if not next_url:
+                return
+            page = self.request(next_url).text
 
-class MangapandaMangaExtractor(MangapandaBase, MangareaderMangaExtractor):
+    def get_image_metadata(self, page):
+        """Collect next url, image-url and metadata for one manga-page"""
+        extr = text.extract
+        width = None
+        test , pos = extr(page, "document['pu']", '')
+        if test is None:
+            return None, None, None
+        if page.find("document['imgwidth']", pos, pos+200) != -1:
+            width , pos = extr(page, "document['imgwidth'] = ", ";", pos)
+            height, pos = extr(page, "document['imgheight'] = ", ";", pos)
+        _  , pos = extr(page, '<div id="imgholder">', '')
+        url, pos = extr(page, ' href="', '"', pos)
+        if width is None:
+            width , pos = extr(page, '<img id="img" width="', '"', pos)
+            height, pos = extr(page, 'height="', '"', pos)
+        image_url, pos = extr(page, 'src="', '"', pos)
+        return self.root + url, image_url, {
+            "width": text.parse_int(width),
+            "height": text.parse_int(height),
+        }
+
+
+class MangapandaMangaExtractor(MangapandaBase, MangaExtractor):
+    """Extractor for manga from mangapanda.com"""
+    chapterclass = MangapandaChapterExtractor
+    reverse = False
+    pattern = r"(?:https?://)?((?:www\.)?mangapanda\.com/[^/?&#]+)/?$"
+
+    def chapters(self, page):
+        results = []
+        data = self.parse_page(page, {"lang": "en", "language": "English"})
+
+        needle = '<div class="chico_manga"></div>\n<a href="'
+        pos = page.index('<div id="chapterlist">')
+        while True:
+            url, pos = text.extract(page, needle, '"', pos)
+            if not url:
+                return results
+            data["title"], pos = text.extract(page, '</a> : ', '</td>', pos)
+            data["date"] , pos = text.extract(page, '<td>', '</td>', pos)
+            data["chapter"] = text.parse_int(url.rpartition("/")[2])
+            results.append((self.root + url, data.copy()))
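The ``parse_page()`` helper above leans entirely on ``gallery_dl.text``'s needle-based extraction: ``text.extract()`` returns a ``(match, end_position)`` pair, and ``text.extract_all()`` applies a sequence of such needle pairs in order, filling a dict. A small self-contained illustration with made-up page markup:

    from gallery_dl import text

    page = ('<h2 class="aname"> Mushishi </h2>'
            '<td>Author:</td>\n<td>Urushibara Yuki</td>')

    data, pos = text.extract_all(page, (
        ("manga" , '<h2 class="aname">', '</h2>'),
        ("author", '>Author:</td>\n<td>', '</td>'),
    ))
    print(data["manga"].strip(), "by", data["author"])
    # -> Mushishi by Urushibara Yuki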

diff --git a/gallery_dl/extractor/mangareader.py b/gallery_dl/extractor/mangareader.py
index 31083dc..fd9c7ac 100644
--- a/gallery_dl/extractor/mangareader.py
+++ b/gallery_dl/extractor/mangareader.py
@@ -6,10 +6,12 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extract manga-chapters and entire manga from https://www.mangareader.net/"""
+"""Extractors for https://www.mangareader.net/"""
 
 from .common import ChapterExtractor, MangaExtractor
 from .. import text
+from ..cache import memcache
+import json
 
 
 class MangareaderBase():
@@ -17,19 +19,35 @@ class MangareaderBase():
     category = "mangareader"
     root = "https://www.mangareader.net"
 
-    @staticmethod
-    def parse_page(page, data):
-        """Parse metadata on 'page' and add it to 'data'"""
-        text.extract_all(page, (
-            ("manga"  , '<h2 class="aname">', '</h2>'),
-            ("release", '>Year of Release:</td>\n<td>', '</td>'),
-            ('author' , '>Author:</td>\n<td>', '</td>'),
-            ('artist' , '>Artist:</td>\n<td>', '</td>'),
-        ), values=data)
-        data["manga"] = data["manga"].strip()
-        data["author"] = text.unescape(data["author"])
-        data["artist"] = text.unescape(data["artist"])
-        return data
+    @memcache(keyarg=1)
+    def _manga_info(self, path, page=None):
+        if not page:
+            page = self.request(self.root + path).text
+        extr = text.extract_from(page)
+        data = {
+            "manga"   : text.unescape(extr('class="name">', '<')),
+            "release" : text.unescape(extr('Year of Release :</td><td>', '<')),
+            "author"  : text.unescape(text.unescape(extr(
+                'Author :</td><td>', '<'))),
+            "artist"  : text.unescape(text.unescape(extr(
+                'Artist :</td><td>', '<'))),
+            "lang"    : "en",
+            "language": "English",
+        }
+
+        extr('<table id="listing"', '>')
+        chapters = []
+        while True:
+            url = extr('<a href="', '"')
+            if not url:
+                return chapters
+            chapter = {
+                "chapter": text.parse_int(url.rpartition("/")[2]),
+                "title"  : text.unescape(extr("</a> : ", "<")),
+                "date"   : extr("<td>", "<"),
+            }
+            chapter.update(data)
+            chapters.append((self.root + url, chapter))
 
 
 class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor):
@@ -38,59 +56,28 @@ class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor):
     pattern = r"(?:https?://)?(?:www\.)?mangareader\.net((/[^/?&#]+)/(\d+))"
     test = (("https://www.mangareader.net"
              "/karate-shoukoushi-kohinata-minoru/11"), {
-        "url": "3d8a5b900856d59b8d8e83908d0df392be92c0f4",
+        "url": "45ece5668d1e9f65cf2225237d78de58660b54e4",
         "keyword": "133e3e2f7c0529a35bbb16149e34c40546f8dfd6",
     })
 
     def __init__(self, match):
-        path, self.url_title, self.chapter = match.groups()
-        ChapterExtractor.__init__(self, match, self.root + path)
+        ChapterExtractor.__init__(self, match)
+        _, self.path, self.chapter = match.groups()
 
-    def metadata(self, chapter_page):
-        page = self.request(self.root + self.url_title).text
-        data = self.parse_page(page, {
-            "chapter": text.parse_int(self.chapter),
-            "lang": "en",
-            "language": "English",
-        })
-        text.extract_all(page, (
-            ('title', ' ' + self.chapter + '</a> : ', '</td>'),
-            ('date', '<td>', '</td>'),
-        ), page.index('<div id="chapterlist">'), data)
-        data["count"] = text.parse_int(text.extract(
-            chapter_page, ' of ', '<')[0]
-        )
-        return data
+    def metadata(self, page):
+        chapter = text.parse_int(self.chapter)
+        return self._manga_info(self.path)[chapter-1][1]
 
     def images(self, page):
-        while True:
-            next_url, image_url, image_data = self.get_image_metadata(page)
-            yield image_url, image_data
-
-            if not next_url:
-                return
-            page = self.request(next_url).text
-
-    def get_image_metadata(self, page):
-        """Collect next url, image-url and metadata for one manga-page"""
-        extr = text.extract
-        width = None
-        test , pos = extr(page, "document['pu']", '')
-        if test is None:
-            return None, None, None
-        if page.find("document['imgwidth']", pos, pos+200) != -1:
-            width , pos = extr(page, "document['imgwidth'] = ", ";", pos)
-            height, pos = extr(page, "document['imgheight'] = ", ";", pos)
-        _  , pos = extr(page, '<div id="imgholder">', '')
-        url, pos = extr(page, ' href="', '"', pos)
-        if width is None:
-            width , pos = extr(page, '<img id="img" width="', '"', pos)
-            height, pos = extr(page, 'height="', '"', pos)
-        image_url, pos = extr(page, 'src="', '"', pos)
-        return self.root + url, image_url, {
-            "width": text.parse_int(width),
-            "height": text.parse_int(height),
-        }
+        data = json.loads(text.extract(
+            page, 'document["mj"]=', '</script>')[0])
+        return [
+            (text.ensure_http_scheme(img["u"]), {
+                "width" : text.parse_int(img["w"]),
+                "height": text.parse_int(img["h"]),
+            })
+            for img in data["im"]
+        ]
 
 
 class MangareaderMangaExtractor(MangareaderBase, MangaExtractor):
@@ -104,16 +91,5 @@ class MangareaderMangaExtractor(MangareaderBase, MangaExtractor):
     })
 
     def chapters(self, page):
-        results = []
-        data = self.parse_page(page, {"lang": "en", "language": "English"})
-
-        needle = '<div class="chico_manga"></div>\n<a href="'
-        pos = page.index('<div id="chapterlist">')
-        while True:
-            url, pos = text.extract(page, needle, '"', pos)
-            if not url:
-                return results
-            data["title"], pos = text.extract(page, '</a> : ', '</td>', pos)
-            data["date"] , pos = text.extract(page, '<td>', '</td>', pos)
-            data["chapter"] = text.parse_int(url.rpartition("/")[2])
-            results.append((self.root + url, data.copy()))
+        path = self.manga_url[len(self.root):]
+        return self._manga_info(path, page)
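The new ``@memcache(keyarg=1)`` decorator is what lets both mangareader classes share one ``_manga_info()`` result: the argument at index 1 (``path``) becomes the cache key, so the chapter list is requested at most once per manga and run. A sketch of the behavior with a dummy class:

    from gallery_dl.cache import memcache

    class Demo():
        @memcache(keyarg=1)
        def _manga_info(self, path):
            print("fetching", path)   # executed only on a cache miss
            return {"path": path}

    demo = Demo()
    demo._manga_info("/mushishi")     # prints "fetching /mushishi"
    demo._manga_info("/mushishi")     # answered from the cache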
diff --git a/gallery_dl/extractor/mangoxo.py b/gallery_dl/extractor/mangoxo.py
index 25fba70..0e04f97 100644
--- a/gallery_dl/extractor/mangoxo.py
+++ b/gallery_dl/extractor/mangoxo.py
@@ -58,7 +58,7 @@ class MangoxoExtractor(Extractor):
             ("timestamp", str(int(time.time()))),
         ]
         query = "&".join("=".join(item) for item in sorted(params))
-        query += "&secretKey=996293536"
+        query += "&secretKey=340836904"
         sign = hashlib.md5(query.encode()).hexdigest()
         params.append(("sign", sign.upper()))
         return params
diff --git a/gallery_dl/extractor/myportfolio.py b/gallery_dl/extractor/myportfolio.py
index 51b314a..e2e163a 100644
--- a/gallery_dl/extractor/myportfolio.py
+++ b/gallery_dl/extractor/myportfolio.py
@@ -9,7 +9,7 @@
 """Extract images from https://www.myportfolio.com/"""
 
 from .common import Extractor, Message
-from .. import text
+from .. import text, exception
 
 
 class MyportfolioGalleryExtractor(Extractor):
@@ -31,9 +31,8 @@ class MyportfolioGalleryExtractor(Extractor):
             "pattern": r"https://andrewling\.myportfolio\.com/[^/?&#+]+$",
             "count": ">= 6",
         }),
-        # no explicit title
         ("https://stevenilousphotography.myportfolio.com/society", {
-            "keyword": "49e7ff6322645c22b409280656202c2736a380c9",
+            "exception": exception.NotFoundError,
         }),
         # custom domain
         ("myportfolio:https://tooco.com.ar/6-of-diamonds-paradise-bird", {
@@ -89,8 +88,10 @@ class MyportfolioGalleryExtractor(Extractor):
         if title:
             title = title.partition(">")[2]
             user = user[:-len(title)-3]
-        else:
+        elif user:
             user, _, title = user.partition(" - ")
+        else:
+            raise exception.NotFoundError()
 
         return {
             "user": text.unescape(user),
diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py
index 931fb13..8f2d633 100644
--- a/gallery_dl/extractor/paheal.py
+++ b/gallery_dl/extractor/paheal.py
@@ -95,8 +95,8 @@ class PahealPostExtractor(PahealExtractor):
     pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net"
                r"/post/view/(\d+)")
     test = ("https://rule34.paheal.net/post/view/481609", {
-        "url": "1142779378f655ec0497d4c301836aa667f788b1",
-        "keyword": "34e9e93d4fa6fa06fac1a56e78c9a52e8cd7b271",
+        "url": "d3fd0f82762716fe3fb03c9c923e61c13ce22204",
+        "keyword": "35748081bfeaab48f909f4b097a4d79b2be12538",
         "content": "7b924bcf150b352ac75c9d281d061e174c851a11",
     })
diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py
index 3bbe06a..cc89ac5 100644
--- a/gallery_dl/extractor/pinterest.py
+++ b/gallery_dl/extractor/pinterest.py
@@ -14,7 +14,7 @@ import itertools
 import json
 
 
-BASE_PATTERN = r"(?:https?://)?(?:\w+\.)?pinterest\.\w+"
+BASE_PATTERN = r"(?:https?://)?(?:\w+\.)?pinterest\.[\w.]+"
 
 
 class PinterestExtractor(Extractor):
@@ -101,6 +101,8 @@ class PinterestBoardExtractor(PinterestExtractor):
         ("https://www.pinterest.com/g1952848/test/", {
             "exception": exception.GalleryDLException,
         }),
+        # .co.uk TLD (#914)
+        ("https://www.pinterest.co.uk/hextra7519/based-animals/"),
     )
 
     def __init__(self, match):
diff --git a/gallery_dl/extractor/pixnet.py b/gallery_dl/extractor/pixnet.py
index 9cada6b..d8ac9f6 100644
--- a/gallery_dl/extractor/pixnet.py
+++ b/gallery_dl/extractor/pixnet.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2016-2019 Mike Fährmann
+# Copyright 2016-2020 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public 
License version 2 as @@ -9,7 +9,7 @@ """Extractors for https://www.pixnet.net/""" from .common import Extractor, Message -from .. import text +from .. import text, exception BASE_PATTERN = r"(?:https?://)?(?!www\.)([^.]+)\.pixnet.net" @@ -53,6 +53,9 @@ class PixnetExtractor(Extractor): yield from text.extract_iter(page, '
<li id="', '</li>')
 
             pnext = text.extract(page, 'class="nextBtn"', '>')[0]
+            if pnext is None and 'name="albumpass">' in page:
+                raise exception.StopExtraction(
+                    "Album %s is password-protected.", self.item_id)
             if "href" not in pnext:
                 return
             url = self.root + text.extract(pnext, 'href="', '"')[0]
@@ -107,6 +110,9 @@ class PixnetSetExtractor(PixnetExtractor):
             "url": "b3eb6431aea0bcf5003432a4a0f3a3232084fc13",
             "keyword": "bf7004faa1cea18cf9bd856f0955a69be51b1ec6",
         }),
+        ("https://sky92100.pixnet.net/album/set/17492544", {
+            "count": 0,  # password-protected
+        }),
     )
 
     def items(self):
diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py
index f97454b..8290d2d 100644
--- a/gallery_dl/extractor/reactor.py
+++ b/gallery_dl/extractor/reactor.py
@@ -112,6 +112,7 @@ class ReactorExtractor(SharedConfigMixin, Extractor):
         if not tags:
             title, tags = tags, title
         tags = tags.split(" :: ")
+        tags.sort()
 
         for image in images:
             url = text.extract(image, ' src="', '"')[0]
@@ -259,19 +260,19 @@ class JoyreactorPostExtractor(ReactorPostExtractor):
     test = (
         ("http://joyreactor.com/post/3721876", {  # single image
             "url": "6ce09f239d8b7fdf6dd1664c2afc39618cc87663",
-            "keyword": "966d2acd462732a9ed823a9db5ed19f95734fd10",
+            "keyword": "147ed5b9799ba43cbd16168450afcfae5ddedbf3",
         }),
         ("http://joyreactor.com/post/3713804", {  # 4 images
             "url": "f08ac8493ca0619a3e3c6bedb8d8374af3eec304",
-            "keyword": "84e34d402342607045a65fab6d4d593d146c238a",
+            "keyword": "f12c6f3c2f298fed9b12bd3e70fb823870aa9b93",
         }),
         ("http://joyreactor.com/post/3726210", {  # gif / video
             "url": "33a48e1eca6cb2d298fbbb6536b3283799d6515b",
-            "keyword": "dbe148d576f2fc9431020c557ddb78f449e48c47",
+            "keyword": "d173cc6e88f02a63904e475eacd7050304eb1967",
         }),
         ("http://joyreactor.com/post/3668724", {  # youtube embed
             "url": "bf1666eddcff10c9b58f6be63fa94e4e13074214",
-            "keyword": "989112c7888e9cc80fd35870180c6c98165d953b",
+            "keyword": "e18b1ffbd79d76f9a0e90b6d474cc2499e343f0b",
         }),
         ("http://joyreactor.cc/post/1299", {  # "malformed" JSON
             "url": "ac900743ed7cf1baf3db3b531c3bc414bf1ffcde",
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index 679059c..cb70fe5 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -17,7 +17,7 @@ class RedditExtractor(Extractor):
     """Base class for reddit extractors"""
     category = "reddit"
     directory_fmt = ("{category}", "{subreddit}")
-    filename_fmt = "{id} {title[:220]}.{extension}"
+    filename_fmt = "{id}{num:? 
//>02} {title[:220]}.{extension}" archive_fmt = "{filename}" cookiedomain = None @@ -50,11 +50,22 @@ class RedditExtractor(Extractor): yield Message.Directory, submission visited.add(submission["id"]) url = submission["url"] + submission["num"] = 0 if url.startswith("https://i.redd.it/"): text.nameext_from_url(url, submission) yield Message.Url, url, submission + elif "gallery_data" in submission: + meta = submission["media_metadata"] + items = submission["gallery_data"]["items"] + for submission["num"], item in enumerate(items, 1): + url = meta[item["media_id"]]["s"]["u"] + url = url.partition("?")[0] + url = url.replace("/preview.", "/i.", 1) + text.nameext_from_url(url, submission) + yield Message.Url, url, submission + elif submission["is_video"]: if videos: text.nameext_from_url(url, submission) @@ -160,9 +171,8 @@ class RedditSubmissionExtractor(RedditExtractor): """Extractor for URLs from a submission on reddit.com""" subcategory = "submission" pattern = (r"(?:https?://)?(?:" - r"(?:\w+\.)?reddit\.com/r/[^/?&#]+/comments|" - r"redd\.it" - r")/([a-z0-9]+)") + r"(?:\w+\.)?reddit\.com/(?:r/[^/?&#]+/comments|gallery)" + r"|redd\.it)/([a-z0-9]+)") test = ( ("https://www.reddit.com/r/lavaporn/comments/8cqhub/", { "pattern": r"https://c2.staticflickr.com/8/7272/\w+_k.jpg", @@ -173,6 +183,11 @@ class RedditSubmissionExtractor(RedditExtractor): "pattern": r"https://", "count": 3, }), + ("https://www.reddit.com/gallery/hrrh23", { + "url": "25b91ede15459470274dd17291424b037ed8b0ae", + "content": "1e7dde4ee7d5f4c4b45749abfd15b2dbfa27df3f", + "count": 3, + }), ("https://old.reddit.com/r/lavaporn/comments/2a00np/"), ("https://np.reddit.com/r/lavaporn/comments/2a00np/"), ("https://m.reddit.com/r/lavaporn/comments/2a00np/"), diff --git a/gallery_dl/extractor/shopify.py b/gallery_dl/extractor/shopify.py index 28ee46c..9d1df18 100644 --- a/gallery_dl/extractor/shopify.py +++ b/gallery_dl/extractor/shopify.py @@ -74,21 +74,33 @@ class ShopifyCollectionExtractor(ShopifyExtractor): def products(self): params = text.parse_query(self.params) params["page"] = text.parse_int(params.get("page"), 1) - search_re = re.compile(r"/collections/[\w-]+/products/[\w-]+") - - while True: - page = self.request(self.item_url, params=params).text - urls = search_re.findall(page) - last = None - - if not urls: - return - for path in urls: - if last == path: - continue - last = path - yield self.root + path - params["page"] += 1 + fetch = True + last = None + + for pattern in ( + r"/collections/[\w-]+/products/[\w-]+", + r"href=[\"'](/products/[\w-]+)", + ): + search_re = re.compile(pattern) + + while True: + if fetch: + page = self.request(self.item_url, params=params).text + urls = search_re.findall(page) + + if len(urls) < 3: + if last: + return + fetch = False + break + fetch = True + + for path in urls: + if last == path: + continue + last = path + yield self.root + path + params["page"] += 1 class ShopifyProductExtractor(ShopifyExtractor): @@ -121,7 +133,6 @@ EXTRACTORS = { ("https://www.fashionnova.com/collections/mini-dresses/?page=1"), ("https://www.fashionnova.com/collections/mini-dresses#1"), ), - }, } diff --git a/gallery_dl/extractor/simplyhentai.py b/gallery_dl/extractor/simplyhentai.py index abf9995..a0d34d1 100644 --- a/gallery_dl/extractor/simplyhentai.py +++ b/gallery_dl/extractor/simplyhentai.py @@ -45,11 +45,12 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor): self.gallery_url = extr('Series', '')), "language" : text.remove_html(extr( 'box-title">Language', '')) or None, diff --git 
a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py
index 44a0a84..163102d 100644
--- a/gallery_dl/extractor/smugmug.py
+++ b/gallery_dl/extractor/smugmug.py
@@ -112,13 +112,13 @@ class SmugmugImageExtractor(SmugmugExtractor):
     test = (
         ("https://tdm.smugmug.com/Nature/Dove/i-kCsLJT6", {
             "url": "f624ad7293afd6412a7d34e3950a118596c36c85",
-            "keyword": "085861b5935e3cd96ad15954039bc2419cdf1c27",
+            "keyword": "d69c69c1517b8ea77bc763cffc4d0a4002dfee3f",
             "content": "ecbd9d7b4f75a637abc8d35319be9ec065a44eb0",
         }),
         # video
         ("https://tstravels.smugmug.com/Dailies/Daily-Dose-2015/i-39JFNzB", {
             "url": "04d0ab1ff829ca7d78f5acb5548953df08e9a5ee",
-            "keyword": "e0927fda7b1c39c19974625270102ad7e72b9d6f",
+            "keyword": "720da317232504f05099da37802ed3c3ce3cd310",
         }),
     )
diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py
index 08d8850..076d0c0 100644
--- a/gallery_dl/extractor/subscribestar.py
+++ b/gallery_dl/extractor/subscribestar.py
@@ -11,7 +11,6 @@
 from .common import Extractor, Message
 from .. import text, exception
 from ..cache import cache
-import datetime
 import json
 
@@ -35,8 +34,6 @@ class SubscribestarExtractor(Extractor):
             self.cookiedomain = "subscribestar.adult"
             self.subcategory += "-adult"
         Extractor.__init__(self, match)
-        self.metadata = self.config("metadata", False)
-        self._year = " " + str(datetime.date.today().year)
 
     def items(self):
         self.login()
@@ -92,38 +89,46 @@ class SubscribestarExtractor(Extractor):
 
     @staticmethod
     def _media_from_post(html):
+        media = []
+
         gallery = text.extract(html, 'data-gallery="', '"')[0]
         if gallery:
-            return [
+            media.extend(
                 item for item in json.loads(text.unescape(gallery))
                 if "/previews/" not in item["url"]
-            ]
-        return ()
+            )
+
+        attachments = text.extract(
+            html, 'class="uploads-docs"', 'data-role="post-edit_form"')[0]
+        if attachments:
+            for att in attachments.split('class="doc_preview"')[1:]:
+                media.append({
+                    "id"  : text.parse_int(text.extract(
+                        att, 'data-upload-id="', '"')[0]),
+                    "url" : text.extract(att, 'href="', '"')[0],
+                    "type": "attachment",
+                })
+
+        return media
 
     def _data_from_post(self, html):
         extr = text.extract_from(html)
-        data = {
+        return {
             "post_id"    : text.parse_int(extr('data-id="', '"')),
             "author_id"  : text.parse_int(extr('data-user-id="', '"')),
             "author_name": text.unescape(extr('href="/', '"')),
             "author_nick": text.unescape(extr('>', '<')),
+            "date"       : self._parse_datetime(text.remove_html(extr(
+                'class="post-date">', '</'))),
+            "content"    : (extr(
+                'class="post-content', '<div class="post-uploads')
+                .partition(">")[2]),
         }
 
-        if self.metadata:
-            url = "{}/posts/{}".format(self.root, data["post_id"])
-            page = self.request(url).text
-            data["date"] = self._parse_datetime(text.extract(
-                page, 'class="section-subtitle">', '<')[0])
-
-        return data
-
     def _parse_datetime(self, dt):
-        date = text.parse_datetime(dt, "%B %d, %Y %H:%M")
+        date = text.parse_datetime(dt, "%b %d, %Y %I:%M %p")
         if date is dt:
-            date = text.parse_datetime(dt + self._year, "%d %b %H:%M %Y")
+            date = text.parse_datetime(dt, "%B %d, %Y %I:%M %p")
         return date
 
@@ -141,6 +146,7 @@ class SubscribestarUserExtractor(SubscribestarExtractor):
             "author_name": "subscribestar",
             "author_nick": "SubscribeStar",
             "content": str,
+            "date" : "type:datetime",
             "height" : int,
             "id" : int,
             "pinned" : bool,
@@ -209,8 +215,7 @@ class SubscribestarPostExtractor(SubscribestarExtractor):
 
     def posts(self):
         url = "{}/posts/{}".format(self.root, self.item)
-        self._page = self.request(url).text
-        return (self._page,)
+        return (self.request(url).text,)
 
     def _data_from_post(self, html):
         extr = text.extract_from(html)
diff --git 
a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 2530040..71f14dc 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -503,8 +503,9 @@ class TwitterAPI(): if response.status_code < 400: return response.json() if response.status_code == 429: - self.extractor.wait(until=response.headers["x-rate-limit-reset"]) - return self._call(endpoint, params) + until = response.headers.get("x-rate-limit-reset") + self.extractor.wait(until=until, seconds=(None if until else 60)) + return self._call(endpoint, params, method) try: msg = ", ".join( diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py index c9f0ec3..76e4e3d 100644 --- a/gallery_dl/extractor/vsco.py +++ b/gallery_dl/extractor/vsco.py @@ -61,7 +61,7 @@ class VscoExtractor(Extractor): "video" : img["is_video"], "width" : img["width"], "height": img["height"], - "description": img["description"], + "description": img.get("description") or "", }) yield Message.Url, url, data diff --git a/gallery_dl/extractor/xhamster.py b/gallery_dl/extractor/xhamster.py index a338216..5f11df3 100644 --- a/gallery_dl/extractor/xhamster.py +++ b/gallery_dl/extractor/xhamster.py @@ -146,7 +146,7 @@ class XhamsterGalleryExtractor(XhamsterExtractor): def _data(self, url): page = self.request(url).text return json.loads(text.extract( - page, "window.initials =", "")[0].rstrip("\n\r;")) + page, "window.initials=", "")[0].rstrip("\n\r;")) class XhamsterUserExtractor(XhamsterExtractor): diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 4c18e4d..163c3c6 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -48,6 +48,9 @@ class Job(): extr.category = pextr.category extr.subcategory = pextr.subcategory + # transfer parent directory + extr._parentdir = pextr._parentdir + # reuse connection adapters extr.session.adapters = pextr.session.adapters diff --git a/gallery_dl/version.py b/gallery_dl/version.py index fd52077..b2b59e0 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.14.3" +__version__ = "1.14.4" diff --git a/test/test_oauth.py b/test/test_oauth.py index e4664e4..7455928 100644 --- a/test/test_oauth.py +++ b/test/test_oauth.py @@ -14,7 +14,6 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from gallery_dl import oauth, text # noqa E402 -TESTSERVER = "http://term.ie/oauth/example" TESTSERVER = "http://term.ie/oauth/example" CONSUMER_KEY = "key" CONSUMER_SECRET = "secret" @@ -96,12 +95,17 @@ class TestOAuthSession(unittest.TestCase): def _oauth_request(self, endpoint, params=None, oauth_token=None, oauth_token_secret=None): + # the test server at 'term.ie' is unreachable + raise unittest.SkipTest() + session = oauth.OAuth1Session( CONSUMER_KEY, CONSUMER_SECRET, oauth_token, oauth_token_secret, ) try: - return session.get(TESTSERVER + endpoint, params=params).text + response = session.get(TESTSERVER + endpoint, params=params) + response.raise_for_status() + return response.text except OSError: raise unittest.SkipTest() diff --git a/test/test_results.py b/test/test_results.py index dd1ed1d..1f2f699 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -31,8 +31,10 @@ TRAVIS_SKIP = { # temporary issues, etc. 
BROKEN = { - "bobx", + "hentaihand", "imagevenue", + "mangapark", + "ngomik", "photobucket", "worldthree", } @@ -317,7 +319,7 @@ def setup_test_config(): config.set(("extractor", "mangoxo") , "password", "5zbQF10_5u25259Ma") for category in ("danbooru", "instagram", "twitter", "subscribestar", - "e621"): + "e621", "inkbunny"): config.set(("extractor", category), "username", None) config.set(("extractor", "mastodon.social"), "access-token", -- cgit v1.2.3
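A closing note on the ``exception.HttpError`` change near the top of this patch: since the exception now carries the final ``response`` and its ``status`` code, callers can branch on the failed HTTP exchange instead of parsing the message string. A usage sketch (the URL is a placeholder):

    from gallery_dl import exception

    def fetch_or_report(extr, url):
        """Request 'url' through an extractor and inspect HTTP failures."""
        try:
            return extr.request(url)
        except exception.HttpError as exc:
            # 'status' is 0 when no response was received at all
            if exc.status == 403 and exc.response is not None:
                # e.g. imgur's 403 over-capacity responses (#910)
                print("403 from", exc.response.url)
            raise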