From 209a3c800871cd68edd2bc7ae661a24ecd496d2d Mon Sep 17 00:00:00 2001 From: Unit 193 Date: Fri, 13 Nov 2020 19:17:03 -0500 Subject: New upstream version 1.15.3. --- CHANGELOG.md | 16 ++++++ PKG-INFO | 8 +-- README.rst | 6 +-- data/man/gallery-dl.1 | 2 +- data/man/gallery-dl.conf.5 | 69 ++++++++++++++++++++++-- gallery_dl.egg-info/PKG-INFO | 8 +-- gallery_dl/__init__.py | 10 +++- gallery_dl/extractor/2chan.py | 8 +-- gallery_dl/extractor/500px.py | 12 ++--- gallery_dl/extractor/8kun.py | 20 +++++-- gallery_dl/extractor/__init__.py | 2 + gallery_dl/extractor/common.py | 9 ++++ gallery_dl/extractor/deviantart.py | 3 +- gallery_dl/extractor/exhentai.py | 27 ++++++---- gallery_dl/extractor/gfycat.py | 2 +- gallery_dl/extractor/hentaifoundry.py | 34 ++++++++---- gallery_dl/extractor/khinsider.py | 7 +-- gallery_dl/extractor/mangoxo.py | 11 ++-- gallery_dl/extractor/paheal.py | 8 ++- gallery_dl/extractor/sankakucomplex.py | 79 +++++++++++++++++---------- gallery_dl/extractor/twitter.py | 98 ++++++++++++++++++++++++++++++---- gallery_dl/extractor/weasyl.py | 3 +- gallery_dl/postprocessor/metadata.py | 2 + gallery_dl/util.py | 23 +++++++- gallery_dl/version.py | 2 +- test/test_extractor.py | 3 ++ test/test_postprocessor.py | 24 +++++---- test/test_results.py | 2 +- test/test_util.py | 10 ++++ 29 files changed, 390 insertions(+), 118 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f382013..59ee36a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,21 @@ # Changelog +## 1.15.3 - 2020-11-13 +### Additions +- [sankakucomplex] extract videos and embeds ([#308](https://github.com/mikf/gallery-dl/issues/308)) +- [twitter] add support for lists ([#1096](https://github.com/mikf/gallery-dl/issues/1096)) +- [postprocessor:metadata] accept string-lists for `content-format` ([#1080](https://github.com/mikf/gallery-dl/issues/1080)) +- implement `modules` and `extension-map` options +### Fixes +- [500px] update query hashes +- [8kun] fix file URLs of older posts ([#1101](https://github.com/mikf/gallery-dl/issues/1101)) +- [exhentai] update image URL parsing ([#1094](https://github.com/mikf/gallery-dl/issues/1094)) +- [hentaifoundry] update `YII_CSRF_TOKEN` cookie handling ([#1083](https://github.com/mikf/gallery-dl/issues/1083)) +- [hentaifoundry] use scheme from input URLs ([#1095](https://github.com/mikf/gallery-dl/issues/1095)) +- [mangoxo] fix metadata extraction +- [paheal] fix extraction ([#1088](https://github.com/mikf/gallery-dl/issues/1088)) +- collect post processors from `basecategory` entries ([#1084](https://github.com/mikf/gallery-dl/issues/1084)) + ## 1.15.2 - 2020-10-24 ### Additions - [pinterest] implement login support ([#1055](https://github.com/mikf/gallery-dl/issues/1055)) diff --git a/PKG-INFO b/PKG-INFO index d488155..da06a1d 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery_dl -Version: 1.15.2 +Version: 1.15.3 Summary: Command-line program to download image-galleries and -collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Author: Mike Fährmann @@ -94,8 +94,8 @@ Description: ========== put it into your `PATH `__, and run it inside a command prompt (like ``cmd.exe``). - - `Windows `__ - - `Linux `__ + - `Windows `__ + - `Linux `__ These executables include a Python 3.8 interpreter and all required Python packages. @@ -319,7 +319,7 @@ Description: ========== .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst - .. _stable: https://github.com/mikf/gallery-dl/archive/v1.15.2.tar.gz + .. _stable: https://github.com/mikf/gallery-dl/archive/v1.15.3.tar.gz .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz .. _Python: https://www.python.org/downloads/ diff --git a/README.rst b/README.rst index 6715d1e..ccc745a 100644 --- a/README.rst +++ b/README.rst @@ -83,8 +83,8 @@ Download a standalone executable file, put it into your `PATH `__, and run it inside a command prompt (like ``cmd.exe``). -- `Windows `__ -- `Linux `__ +- `Windows `__ +- `Linux `__ These executables include a Python 3.8 interpreter and all required Python packages. @@ -308,7 +308,7 @@ access to *gallery-dl*. Authorize it and you will be shown one or more .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst -.. _stable: https://github.com/mikf/gallery-dl/archive/v1.15.2.tar.gz +.. _stable: https://github.com/mikf/gallery-dl/archive/v1.15.3.tar.gz .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz .. _Python: https://www.python.org/downloads/ diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1 index 9df67f4..a913c47 100644 --- a/data/man/gallery-dl.1 +++ b/data/man/gallery-dl.1 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL" "1" "2020-10-24" "1.15.2" "gallery-dl Manual" +.TH "GALLERY-DL" "1" "2020-11-13" "1.15.3" "gallery-dl Manual" .\" disable hyphenation .nh diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5 index 8dd3187..05bd92e 100644 --- a/data/man/gallery-dl.conf.5 +++ b/data/man/gallery-dl.conf.5 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL.CONF" "5" "2020-10-24" "1.15.2" "gallery-dl Manual" +.TH "GALLERY-DL.CONF" "5" "2020-11-13" "1.15.3" "gallery-dl Manual" .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) @@ -217,6 +217,28 @@ Note: In a string with 2 or more characters, \f[I][]^-\\\f[] need to be escaped with backslashes, e.g. \f[I]"\\\\[\\\\]"\f[] +.SS extractor.*.extension-map +.IP "Type:" 6 +\f[I]object\f[] + +.IP "Default:" 9 +\f[I]null\f[] + +.IP "Example:" 4 +.. code:: json + +{ +"jpeg": "jpg", +"jpe" : "jpg", +"jfif": "jpg", +"jif" : "jpg", +"jfi" : "jpg" +} + +.IP "Description:" 4 +A JSON \f[I]object\f[] mapping filename extensions to alternatives. + + .SS extractor.*.skip .IP "Type:" 6 \f[I]bool\f[] or \f[I]string\f[] @@ -1530,6 +1552,28 @@ receives too many HTTP requests in a certain amount of time. Waiting a few seconds between each request tries to prevent that. +.SS extractor.sankakucomplex.embeds +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Download video embeds from external sites. + + +.SS extractor.sankakucomplex.videos +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +Download videos. + + .SS extractor.smugmug.videos .IP "Type:" 6 \f[I]bool\f[] @@ -2288,10 +2332,13 @@ Note: \f[I]metadata.extension\f[] is ignored if this option is set. .SS metadata.content-format .IP "Type:" 6 -\f[I]string\f[] +\f[I]string\f[] or \f[I]list\f[] of \f[I]strings\f[] .IP "Example:" 4 -"tags:\\n\\n{tags:J\\n}\\n" +.br +* "tags:\\n\\n{tags:J\\n}\\n" +.br +* ["tags:", "", "{tags:J\\n}"] .IP "Description:" 4 Custom format string to build the content of metadata files with. @@ -2465,6 +2512,22 @@ case the Python interpreter gets shut down unexpectedly .SH MISCELLANEOUS OPTIONS +.SS extractor.modules +.IP "Type:" 6 +\f[I]list\f[] of \f[I]strings\f[] + +.IP "Default:" 9 +The \f[I]modules\f[] list in +\f[I]extractor/__init__.py\f[] + +.IP "Example:" 4 +["reddit", "danbooru", "mangadex"] + +.IP "Description:" 4 +The list of modules to load when searching for a suitable +extractor class. Useful to reduce startup time and memory usage. + + .SS cache.file .IP "Type:" 6 \f[I]Path\f[] diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO index 18f8d82..70c079d 100644 --- a/gallery_dl.egg-info/PKG-INFO +++ b/gallery_dl.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery-dl -Version: 1.15.2 +Version: 1.15.3 Summary: Command-line program to download image-galleries and -collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Author: Mike Fährmann @@ -94,8 +94,8 @@ Description: ========== put it into your `PATH `__, and run it inside a command prompt (like ``cmd.exe``). - - `Windows `__ - - `Linux `__ + - `Windows `__ + - `Linux `__ These executables include a Python 3.8 interpreter and all required Python packages. @@ -319,7 +319,7 @@ Description: ========== .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst - .. _stable: https://github.com/mikf/gallery-dl/archive/v1.15.2.tar.gz + .. _stable: https://github.com/mikf/gallery-dl/archive/v1.15.3.tar.gz .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz .. _Python: https://www.python.org/downloads/ diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index e71a5b0..6c2c713 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -9,7 +9,7 @@ from __future__ import unicode_literals, print_function __author__ = "Mike Fährmann" -__copyright__ = "Copyright 2014-2018 Mike Fährmann" +__copyright__ = "Copyright 2014-2020 Mike Fährmann" __license__ = "GPLv2" __maintainer__ = "Mike Fährmann" __email__ = "mike_faehrmann@web.de" @@ -129,6 +129,12 @@ def main(): for opts in args.options: config.set(*opts) + # extractor modules + modules = config.get(("extractor",), "modules") + if modules is not None: + extractor.modules = modules + extractor._module_iter = iter(modules) + # loglevels output.configure_logging(args.loglevel) if args.loglevel >= logging.ERROR: @@ -142,7 +148,7 @@ def main(): head = "" try: out, err = subprocess.Popen( - ("git", "rev-parse", "--short", "HEAD"), + ("git", "rev-parse", "--short", "HEAD"), stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=os.path.dirname(os.path.abspath(__file__)), diff --git a/gallery_dl/extractor/2chan.py b/gallery_dl/extractor/2chan.py index c34cfec..51e461e 100644 --- a/gallery_dl/extractor/2chan.py +++ b/gallery_dl/extractor/2chan.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2019 Mike Fährmann +# Copyright 2017-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -21,9 +21,9 @@ class _2chanThreadExtractor(Extractor): archive_fmt = "{board}_{thread}_{tim}" url_fmt = "https://{server}.2chan.net/{board}/src/{filename}" pattern = r"(?:https?://)?([^.]+)\.2chan\.net/([^/]+)/res/(\d+)" - test = ("http://dec.2chan.net/70/res/947.htm", { - "url": "c5c12b80b290e224b6758507b3bb952044f4595b", - "keyword": "4bd22e7a9c3636faecd6ea7082509e8655e10dd0", + test = ("http://dec.2chan.net/70/res/11048.htm", { + "url": "2ecf919139bd5d915930530b3576d67c388a2a49", + "keyword": "8def4ec98a89fd4fff8bbcbae603604dcb4a3bb9", }) def __init__(self, match): diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py index 624b14d..df9941a 100644 --- a/gallery_dl/extractor/500px.py +++ b/gallery_dl/extractor/500px.py @@ -109,8 +109,8 @@ class _500pxUserExtractor(_500pxExtractor): variables = {"username": self.user, "pageSize": 20} photos = self._request_graphql( "OtherPhotosQuery", variables, - "54524abbdc809f8d4e10d37839e8ab2d" - "3035413688cad9c7fbece13b66637e9d", + "018a5e5117bd72bdf28066aad02c4f2d" + "8acdf7f6127215d231da60e24080eb1b", )["user"]["photos"] while True: @@ -122,8 +122,8 @@ class _500pxUserExtractor(_500pxExtractor): variables["cursor"] = photos["pageInfo"]["endCursor"] photos = self._request_graphql( "OtherPhotosPaginationContainerQuery", variables, - "6d31e01104456ce642a2c6fc2f936812" - "b0f2a65c442d03e1521d769c20efe507", + "b4af70d42c71a5e43f0be36ce60dc81e" + "9742ebc117cde197350f2b86b5977d98", )["userByUsername"]["photos"] @@ -153,7 +153,7 @@ class _500pxGalleryExtractor(_500pxExtractor): def metadata(self): user = self._request_graphql( "ProfileRendererQuery", {"username": self.user_name}, - "4d02ff5c13927a3ac73b3eef306490508bc765956940c31051468cf30402a503", + "5a17a9af1830b58b94a912995b7947b24f27f1301c6ea8ab71a9eb1a6a86585b", )["profile"] self.user_id = str(user["legacyId"]) @@ -166,7 +166,7 @@ class _500pxGalleryExtractor(_500pxExtractor): } gallery = self._request_graphql( "GalleriesDetailQueryRendererQuery", variables, - "fd367cacf9bebcdc0620bd749dbd8fc9b0ccbeb54fc76b8b4b95e66a8c0cba49", + "fb8bb66d31b58903e2f01ebe66bbe7937b982753be3211855b7bce4e286c1a49", )["gallery"] self._photos = gallery["photos"] diff --git a/gallery_dl/extractor/8kun.py b/gallery_dl/extractor/8kun.py index 47fe672..e55bb08 100644 --- a/gallery_dl/extractor/8kun.py +++ b/gallery_dl/extractor/8kun.py @@ -20,10 +20,17 @@ class _8kunThreadExtractor(Extractor): filename_fmt = "{time}{num:?-//} {filename}.{extension}" archive_fmt = "{board}_{thread}_{tim}" pattern = r"(?:https?://)?8kun\.top/([^/]+)/res/(\d+)" - test = ("https://8kun.top/test/res/65248.html", { - "pattern": r"https://media\.8kun\.top/file_store/\w{64}\.\w+", - "count": ">= 8", - }) + test = ( + ("https://8kun.top/test/res/65248.html", { + "pattern": r"https://media\.8kun\.top/file_store/\w{64}\.\w+", + "count": ">= 8", + }), + # old-style file URLs (#1101) + ("https://8kun.top/d/res/13258.html", { + "pattern": r"https://media\.8kun\.top/d/src/\d+(-\d)?\.\w+", + "range": "1-20", + }), + ) def __init__(self, match): Extractor.__init__(self, match) @@ -56,7 +63,10 @@ class _8kunThreadExtractor(Extractor): def _process(post, data): post.update(data) post["extension"] = post["ext"][1:] - url = "https://media.8kun.top/file_store/" + post["tim"] + post["ext"] + tim = post["tim"] + url = ("https://media.8kun.top/" + + ("file_store/" if len(tim) > 16 else post["board"] + "/src/") + + tim + post["ext"]) return Message.Url, url, post diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index b8e39bc..d0c327a 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -185,6 +185,8 @@ def _list_classes(): module = importlib.import_module("."+module_name, __package__) yield from add_module(module) + globals()["_list_classes"] = lambda : _cache + def _get_classes(module): """Return a list of all extractor classes in a module""" diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 357deac..5efea4a 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -518,6 +518,15 @@ class SharedConfigMixin(): ), key, default, ) + def config_accumulate(self, key): + values = config.accumulate(self._cfgpath, key) + + conf = config.get(("extractor",), self.basecategory) + if conf: + values[:0] = config.accumulate((self.subcategory,), key, conf=conf) + + return values + def generate_extractors(extractor_data, symtable, classes): """Dynamically generate Extractor classes""" diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index e40ec51..456a173 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -165,11 +165,12 @@ class DeviantartExtractor(Extractor): # filename metadata alphabet = "0123456789abcdefghijklmnopqrstuvwxyz" + deviation["index_base36"] = util.bencode(deviation["index"], alphabet) sub = re.compile(r"\W").sub deviation["filename"] = "".join(( sub("_", deviation["title"].lower()), "_by_", sub("_", deviation["author"]["username"].lower()), "-d", - util.bencode(deviation["index"], alphabet), + deviation["index_base36"], )) @staticmethod diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 06b5ba2..4ead3fb 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -347,24 +347,33 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): @staticmethod def _parse_image_info(url): - parts = url.split("/")[4].split("-") + for part in url.split("/")[4:]: + try: + _, size, width, height, _ = part.split("-") + break + except ValueError: + pass + else: + size = width = height = 0 + return { - "width": text.parse_int(parts[2]), - "height": text.parse_int(parts[3]), - "size": text.parse_int(parts[1]), - "cost": 1, + "cost" : 1, + "size" : text.parse_int(size), + "width" : text.parse_int(width), + "height": text.parse_int(height), } @staticmethod def _parse_original_info(info): parts = info.lstrip().split(" ") size = text.parse_bytes(parts[3] + parts[4][0]) + return { - "width": text.parse_int(parts[0]), - "height": text.parse_int(parts[2]), - "size": size, # 1 initial point + 1 per 0.1 MB - "cost": 1 + math.ceil(size / 100000) + "cost" : 1 + math.ceil(size / 100000), + "size" : size, + "width" : text.parse_int(parts[0]), + "height": text.parse_int(parts[2]), } diff --git a/gallery_dl/extractor/gfycat.py b/gallery_dl/extractor/gfycat.py index 493c1d2..f878dbd 100644 --- a/gallery_dl/extractor/gfycat.py +++ b/gallery_dl/extractor/gfycat.py @@ -55,7 +55,7 @@ class GfycatExtractor(Extractor): class GfycatUserExtractor(GfycatExtractor): """Extractor for gfycat user profiles""" subcategory = "user" - directory_fmt = ("{category}", "{userName}") + directory_fmt = ("{category}", "{username|userName}") pattern = r"(?:https?://)?gfycat\.com/@([^/?#]+)" test = ("https://gfycat.com/@gretta", { "pattern": r"https://giant\.gfycat\.com/[A-Za-z]+\.mp4", diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py index 0be528d..691cefb 100644 --- a/gallery_dl/extractor/hentaifoundry.py +++ b/gallery_dl/extractor/hentaifoundry.py @@ -11,7 +11,7 @@ from .common import Extractor, Message from .. import text, util -BASE_PATTERN = r"(?:https?://)?(?:www\.)?hentai-foundry\.com" +BASE_PATTERN = r"(https?://)?(?:www\.)?hentai-foundry\.com" class HentaifoundryExtractor(Extractor): @@ -20,12 +20,14 @@ class HentaifoundryExtractor(Extractor): directory_fmt = ("{category}", "{user}") filename_fmt = "{category}_{index}_{title}.{extension}" archive_fmt = "{index}" + cookiedomain = "www.hentai-foundry.com" root = "https://www.hentai-foundry.com" per_page = 25 def __init__(self, match): + self.root = (match.group(1) or "https://") + "www.hentai-foundry.com" + self.user = match.group(2) Extractor.__init__(self, match) - self.user = match.group(1) self.page_url = "" self.start_post = 0 self.start_page = 1 @@ -75,7 +77,8 @@ class HentaifoundryExtractor(Extractor): "width" : text.parse_int(extr('width="', '"')), "height" : text.parse_int(extr('height="', '"')), "index" : text.parse_int(path.rsplit("/", 2)[1]), - "src" : "https:" + text.unescape(extr('src="', '"')), + "src" : text.urljoin(self.root, text.unescape(extr( + 'src="', '"'))), "description": text.unescape(text.remove_html(extr( '>Description', '') .replace("\r\n", "\n"), "", "")), @@ -121,7 +124,13 @@ class HentaifoundryExtractor(Extractor): def _init_site_filters(self): """Set site-internal filters to show all images""" url = self.root + "/?enterAgree=1" - response = self.request(url, method="HEAD") + self.request(url, method="HEAD") + + csrf_token = self.session.cookies.get( + "YII_CSRF_TOKEN", domain=self.cookiedomain) + if not csrf_token: + self.log.warning("Unable to update site content filters") + return url = self.root + "/site/filters" data = { @@ -148,7 +157,7 @@ class HentaifoundryExtractor(Extractor): "filter_order" : "date_new", "filter_type" : "0", "YII_CSRF_TOKEN" : text.unquote(text.extract( - response.cookies["YII_CSRF_TOKEN"], "%22", "%22")[0]), + csrf_token, "%22", "%22")[0]), } self.request(url, method="POST", data=data) @@ -235,7 +244,7 @@ class HentaifoundryRecentExtractor(HentaifoundryExtractor): directory_fmt = ("{category}", "Recent Pictures", "{date}") archive_fmt = "r_{index}" pattern = BASE_PATTERN + r"/pictures/recent/(\d\d\d\d-\d\d-\d\d)" - test = ("http://www.hentai-foundry.com/pictures/recent/2018-09-20", { + test = ("https://www.hentai-foundry.com/pictures/recent/2018-09-20", { "pattern": r"https://pictures.hentai-foundry.com/[^/]/[^/?#]+/\d+/", "range": "20-30", }) @@ -254,7 +263,7 @@ class HentaifoundryPopularExtractor(HentaifoundryExtractor): directory_fmt = ("{category}", "Popular Pictures") archive_fmt = "p_{index}" pattern = BASE_PATTERN + r"/pictures/popular()" - test = ("http://www.hentai-foundry.com/pictures/popular", { + test = ("https://www.hentai-foundry.com/pictures/popular", { "pattern": r"https://pictures.hentai-foundry.com/[^/]/[^/?#]+/\d+/", "range": "20-30", }) @@ -267,7 +276,7 @@ class HentaifoundryPopularExtractor(HentaifoundryExtractor): class HentaifoundryImageExtractor(HentaifoundryExtractor): """Extractor for a single image from hentaifoundry.com""" subcategory = "image" - pattern = (r"(?:https?://)?(?:www\.|pictures\.)?hentai-foundry\.com" + pattern = (r"(https?://)?(?:www\.|pictures\.)?hentai-foundry\.com" r"/(?:pictures/user|[^/?#])/([^/?#]+)/(\d+)") test = ( (("https://www.hentai-foundry.com" @@ -290,7 +299,10 @@ class HentaifoundryImageExtractor(HentaifoundryExtractor): "width" : 495, }, }), - ("https://www.hentai-foundry.com/pictures/user/Tenpura/340853/"), + ("http://www.hentai-foundry.com/pictures/user/Tenpura/407501/", { + "pattern": "http://pictures.hentai-foundry.com/t/Tenpura/407501/", + }), + ("https://www.hentai-foundry.com/pictures/user/Tenpura/407501/"), ("https://pictures.hentai-foundry.com" "/t/Tenpura/407501/Tenpura-407501-shimakaze.png"), ) @@ -298,7 +310,7 @@ class HentaifoundryImageExtractor(HentaifoundryExtractor): def __init__(self, match): HentaifoundryExtractor.__init__(self, match) - self.index = match.group(2) + self.index = match.group(3) def items(self): post_url = "{}/pictures/user/{}/{}/?enterAgree=1".format( @@ -359,7 +371,7 @@ class HentaifoundryStoryExtractor(HentaifoundryExtractor): def __init__(self, match): HentaifoundryExtractor.__init__(self, match) - self.index = match.group(2) + self.index = match.group(3) def items(self): story_url = "{}/stories/user/{}/{}/x?enterAgree=1".format( diff --git a/gallery_dl/extractor/khinsider.py b/gallery_dl/extractor/khinsider.py index 6ddf0e8..679b5a0 100644 --- a/gallery_dl/extractor/khinsider.py +++ b/gallery_dl/extractor/khinsider.py @@ -23,9 +23,10 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor): root = "https://downloads.khinsider.com" test = (("https://downloads.khinsider.com" "/game-soundtracks/album/horizon-riders-wii"), { - "pattern": r"https?://vgmdownloads.com/soundtracks/horizon-riders-wii/" - r"[^/]+/Horizon%20Riders%20Wii%20-%20Full%20Soundtrack.mp3", - "keyword": "5b2c35cce638c326cab2a4f7a79f245d008d62ff", + "pattern": r"https?://vgm(site|downloads).com" + r"/soundtracks/horizon-riders-wii/[^/]+" + r"/Horizon%20Riders%20Wii%20-%20Full%20Soundtrack.mp3", + "keyword": "12ca70e0709ea15250e577ea388cf2b5b0c65630", }) def __init__(self, match): diff --git a/gallery_dl/extractor/mangoxo.py b/gallery_dl/extractor/mangoxo.py index 5743498..344dd56 100644 --- a/gallery_dl/extractor/mangoxo.py +++ b/gallery_dl/extractor/mangoxo.py @@ -86,7 +86,7 @@ class MangoxoAlbumExtractor(MangoxoExtractor): "album": { "id": "lzVOv1Q9", "name": "re:池永康晟 Ikenaga Yasunari 透出古朴", - "date": "2019.3.22 14:42", + "date": "dt:2019-03-22 14:42:00", "description": str, }, "num": int, @@ -113,23 +113,24 @@ class MangoxoAlbumExtractor(MangoxoExtractor): def metadata(self, page): """Return general metadata""" title, pos = text.extract(page, '', '') - count, pos = text.extract(page, 'id="pic-count">', '<', pos) - cover, pos = text.extract(page, ' src="', '"', pos) + _ , pos = text.extract(page, 'class="desc"', '', pos) cid , pos = text.extract(page, '//www.mangoxo.com/channel/', '"', pos) cname, pos = text.extract(page, '>', '<', pos) + count, pos = text.extract(page, 'id="pic-count">', '<', pos) + cover, pos = text.extract(page, ' src="', '"', pos) date , pos = text.extract(page, '', '<', pos) descr, pos = text.extract(page, '
', '
', pos) return { "channel": { "id": cid, - "name": text.unescape(cname), + "name": text.unescape(cname.strip()), "cover": cover, }, "album": { "id": self.album_id, "name": text.unescape(title), - "date": date.strip(), + "date": text.parse_datetime(date.strip(), "%Y.%m.%d %H:%M"), "description": text.unescape(descr), }, "count": text.parse_int(count), diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py index 57521d6..e0b0496 100644 --- a/gallery_dl/extractor/paheal.py +++ b/gallery_dl/extractor/paheal.py @@ -21,6 +21,9 @@ class PahealExtractor(SharedConfigMixin, Extractor): root = "https://rule34.paheal.net" def items(self): + self.session.cookies.set( + "ui-tnc-agreed", "true", domain="rule34.paheal.net") + yield Message.Version, 1 yield Message.Directory, self.get_metadata() @@ -65,7 +68,7 @@ class PahealTagExtractor(PahealExtractor): page = self.request(url).text for post in text.extract_iter( - page, ''): yield self._extract_data(post) if ">Next<" not in page: @@ -79,7 +82,8 @@ class PahealTagExtractor(PahealExtractor): md5 , pos = text.extract(post, '/_thumbs/', '/', pos) url , pos = text.extract(post, '', '') data["tags"] = text.split_html(extr('="meta-tags">', ''))[::2] - yield Message.Version, 1 - yield Message.Directory, data - for img in imgs: - img.update(data) - yield Message.Url, img["url"], img + files = self._extract_images(content) + if self.config("videos", True): + files += self._extract_videos(content) + if self.config("embeds", False): + files += self._extract_embeds(content) + data["count"] = len(files) - def images(self, extr): - num = 0 - imgs = [] - urls = set() - orig = re.compile(r"-\d+x\d+\.") - - extr('
', '') - while True: - url = extr('data-lazy-src="', '"') - if not url: - return imgs - if url in urls: - continue + yield Message.Directory, data + for num, url in enumerate(files, 1): + file = text.nameext_from_url(url) if url[0] == "/": url = text.urljoin(self.root, url) - url = orig.sub(".", url) - num += 1 - imgs.append(text.nameext_from_url(url, { - "url" : url, - "num" : num, - })) - urls.add(url) + file["url"] = url + file["num"] = num + file.update(data) + yield Message.Url, url, file + + @staticmethod + def _extract_images(content): + orig_sub = re.compile(r"-\d+x\d+\.").sub + return [ + orig_sub(".", url) for url in + util.unique(text.extract_iter(content, 'data-lazy-src="', '"')) + ] + + @staticmethod + def _extract_videos(content): + return re.findall(r"]*src=[\"']([^\"']+)", content) + + @staticmethod + def _extract_embeds(content): + return [ + "ytdl:" + url for url in + re.findall(r"