diff options
author | Unit 193 <unit193@unit193.net> | 2020-11-13 19:17:11 -0500 |
---|---|---|
committer | Unit 193 <unit193@unit193.net> | 2020-11-13 19:17:11 -0500 |
commit | 29228b3731dde3a707e4b507eedd54a634e3725a (patch) | |
tree | 632edf08783da3f40d0077a73c402d11d6af75cc | |
parent | b35d81189c65d5834430cd24ce50d3f5f6392868 (diff) | |
parent | 209a3c800871cd68edd2bc7ae661a24ecd496d2d (diff) | |
download | gallery-dl-29228b3731dde3a707e4b507eedd54a634e3725a.tar.bz2 gallery-dl-29228b3731dde3a707e4b507eedd54a634e3725a.tar.xz gallery-dl-29228b3731dde3a707e4b507eedd54a634e3725a.tar.zst |
Update upstream source from tag 'upstream/1.15.3'
Update to upstream version '1.15.3'
with Debian dir e4e814274dce1884e0f01e4c1dcfab7a594ef987
29 files changed, 390 insertions, 118 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index f382013..59ee36a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,21 @@ # Changelog +## 1.15.3 - 2020-11-13 +### Additions +- [sankakucomplex] extract videos and embeds ([#308](https://github.com/mikf/gallery-dl/issues/308)) +- [twitter] add support for lists ([#1096](https://github.com/mikf/gallery-dl/issues/1096)) +- [postprocessor:metadata] accept string-lists for `content-format` ([#1080](https://github.com/mikf/gallery-dl/issues/1080)) +- implement `modules` and `extension-map` options +### Fixes +- [500px] update query hashes +- [8kun] fix file URLs of older posts ([#1101](https://github.com/mikf/gallery-dl/issues/1101)) +- [exhentai] update image URL parsing ([#1094](https://github.com/mikf/gallery-dl/issues/1094)) +- [hentaifoundry] update `YII_CSRF_TOKEN` cookie handling ([#1083](https://github.com/mikf/gallery-dl/issues/1083)) +- [hentaifoundry] use scheme from input URLs ([#1095](https://github.com/mikf/gallery-dl/issues/1095)) +- [mangoxo] fix metadata extraction +- [paheal] fix extraction ([#1088](https://github.com/mikf/gallery-dl/issues/1088)) +- collect post processors from `basecategory` entries ([#1084](https://github.com/mikf/gallery-dl/issues/1084)) + ## 1.15.2 - 2020-10-24 ### Additions - [pinterest] implement login support ([#1055](https://github.com/mikf/gallery-dl/issues/1055)) @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery_dl -Version: 1.15.2 +Version: 1.15.3 Summary: Command-line program to download image-galleries and -collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Author: Mike Fährmann @@ -94,8 +94,8 @@ Description: ========== put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__, and run it inside a command prompt (like ``cmd.exe``). - - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.15.2/gallery-dl.exe>`__ - - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.15.2/gallery-dl.bin>`__ + - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.15.3/gallery-dl.exe>`__ + - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.15.3/gallery-dl.bin>`__ These executables include a Python 3.8 interpreter and all required Python packages. @@ -319,7 +319,7 @@ Description: ========== .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst - .. _stable: https://github.com/mikf/gallery-dl/archive/v1.15.2.tar.gz + .. _stable: https://github.com/mikf/gallery-dl/archive/v1.15.3.tar.gz .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz .. _Python: https://www.python.org/downloads/ @@ -83,8 +83,8 @@ Download a standalone executable file, put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__, and run it inside a command prompt (like ``cmd.exe``). -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.15.2/gallery-dl.exe>`__ -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.15.2/gallery-dl.bin>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.15.3/gallery-dl.exe>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.15.3/gallery-dl.bin>`__ These executables include a Python 3.8 interpreter and all required Python packages. @@ -308,7 +308,7 @@ access to *gallery-dl*. Authorize it and you will be shown one or more .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst -.. _stable: https://github.com/mikf/gallery-dl/archive/v1.15.2.tar.gz +.. _stable: https://github.com/mikf/gallery-dl/archive/v1.15.3.tar.gz .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz .. _Python: https://www.python.org/downloads/ diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1 index 9df67f4..a913c47 100644 --- a/data/man/gallery-dl.1 +++ b/data/man/gallery-dl.1 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL" "1" "2020-10-24" "1.15.2" "gallery-dl Manual" +.TH "GALLERY-DL" "1" "2020-11-13" "1.15.3" "gallery-dl Manual" .\" disable hyphenation .nh diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5 index 8dd3187..05bd92e 100644 --- a/data/man/gallery-dl.conf.5 +++ b/data/man/gallery-dl.conf.5 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL.CONF" "5" "2020-10-24" "1.15.2" "gallery-dl Manual" +.TH "GALLERY-DL.CONF" "5" "2020-11-13" "1.15.3" "gallery-dl Manual" .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) @@ -217,6 +217,28 @@ Note: In a string with 2 or more characters, \f[I][]^-\\\f[] need to be escaped with backslashes, e.g. \f[I]"\\\\[\\\\]"\f[] +.SS extractor.*.extension-map +.IP "Type:" 6 +\f[I]object\f[] + +.IP "Default:" 9 +\f[I]null\f[] + +.IP "Example:" 4 +.. code:: json + +{ +"jpeg": "jpg", +"jpe" : "jpg", +"jfif": "jpg", +"jif" : "jpg", +"jfi" : "jpg" +} + +.IP "Description:" 4 +A JSON \f[I]object\f[] mapping filename extensions to alternatives. + + .SS extractor.*.skip .IP "Type:" 6 \f[I]bool\f[] or \f[I]string\f[] @@ -1530,6 +1552,28 @@ receives too many HTTP requests in a certain amount of time. Waiting a few seconds between each request tries to prevent that. +.SS extractor.sankakucomplex.embeds +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Download video embeds from external sites. + + +.SS extractor.sankakucomplex.videos +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +Download videos. + + .SS extractor.smugmug.videos .IP "Type:" 6 \f[I]bool\f[] @@ -2288,10 +2332,13 @@ Note: \f[I]metadata.extension\f[] is ignored if this option is set. .SS metadata.content-format .IP "Type:" 6 -\f[I]string\f[] +\f[I]string\f[] or \f[I]list\f[] of \f[I]strings\f[] .IP "Example:" 4 -"tags:\\n\\n{tags:J\\n}\\n" +.br +* "tags:\\n\\n{tags:J\\n}\\n" +.br +* ["tags:", "", "{tags:J\\n}"] .IP "Description:" 4 Custom format string to build the content of metadata files with. @@ -2465,6 +2512,22 @@ case the Python interpreter gets shut down unexpectedly .SH MISCELLANEOUS OPTIONS +.SS extractor.modules +.IP "Type:" 6 +\f[I]list\f[] of \f[I]strings\f[] + +.IP "Default:" 9 +The \f[I]modules\f[] list in +\f[I]extractor/__init__.py\f[] + +.IP "Example:" 4 +["reddit", "danbooru", "mangadex"] + +.IP "Description:" 4 +The list of modules to load when searching for a suitable +extractor class. Useful to reduce startup time and memory usage. + + .SS cache.file .IP "Type:" 6 \f[I]Path\f[] diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO index 18f8d82..70c079d 100644 --- a/gallery_dl.egg-info/PKG-INFO +++ b/gallery_dl.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery-dl -Version: 1.15.2 +Version: 1.15.3 Summary: Command-line program to download image-galleries and -collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Author: Mike Fährmann @@ -94,8 +94,8 @@ Description: ========== put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__, and run it inside a command prompt (like ``cmd.exe``). - - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.15.2/gallery-dl.exe>`__ - - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.15.2/gallery-dl.bin>`__ + - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.15.3/gallery-dl.exe>`__ + - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.15.3/gallery-dl.bin>`__ These executables include a Python 3.8 interpreter and all required Python packages. @@ -319,7 +319,7 @@ Description: ========== .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst - .. _stable: https://github.com/mikf/gallery-dl/archive/v1.15.2.tar.gz + .. _stable: https://github.com/mikf/gallery-dl/archive/v1.15.3.tar.gz .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz .. _Python: https://www.python.org/downloads/ diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index e71a5b0..6c2c713 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -9,7 +9,7 @@ from __future__ import unicode_literals, print_function __author__ = "Mike Fährmann" -__copyright__ = "Copyright 2014-2018 Mike Fährmann" +__copyright__ = "Copyright 2014-2020 Mike Fährmann" __license__ = "GPLv2" __maintainer__ = "Mike Fährmann" __email__ = "mike_faehrmann@web.de" @@ -129,6 +129,12 @@ def main(): for opts in args.options: config.set(*opts) + # extractor modules + modules = config.get(("extractor",), "modules") + if modules is not None: + extractor.modules = modules + extractor._module_iter = iter(modules) + # loglevels output.configure_logging(args.loglevel) if args.loglevel >= logging.ERROR: @@ -142,7 +148,7 @@ def main(): head = "" try: out, err = subprocess.Popen( - ("git", "rev-parse", "--short", "HEAD"), + ("git", "rev-parse", "--short", "HEAD"), stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=os.path.dirname(os.path.abspath(__file__)), diff --git a/gallery_dl/extractor/2chan.py b/gallery_dl/extractor/2chan.py index c34cfec..51e461e 100644 --- a/gallery_dl/extractor/2chan.py +++ b/gallery_dl/extractor/2chan.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2019 Mike Fährmann +# Copyright 2017-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -21,9 +21,9 @@ class _2chanThreadExtractor(Extractor): archive_fmt = "{board}_{thread}_{tim}" url_fmt = "https://{server}.2chan.net/{board}/src/{filename}" pattern = r"(?:https?://)?([^.]+)\.2chan\.net/([^/]+)/res/(\d+)" - test = ("http://dec.2chan.net/70/res/947.htm", { - "url": "c5c12b80b290e224b6758507b3bb952044f4595b", - "keyword": "4bd22e7a9c3636faecd6ea7082509e8655e10dd0", + test = ("http://dec.2chan.net/70/res/11048.htm", { + "url": "2ecf919139bd5d915930530b3576d67c388a2a49", + "keyword": "8def4ec98a89fd4fff8bbcbae603604dcb4a3bb9", }) def __init__(self, match): diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py index 624b14d..df9941a 100644 --- a/gallery_dl/extractor/500px.py +++ b/gallery_dl/extractor/500px.py @@ -109,8 +109,8 @@ class _500pxUserExtractor(_500pxExtractor): variables = {"username": self.user, "pageSize": 20} photos = self._request_graphql( "OtherPhotosQuery", variables, - "54524abbdc809f8d4e10d37839e8ab2d" - "3035413688cad9c7fbece13b66637e9d", + "018a5e5117bd72bdf28066aad02c4f2d" + "8acdf7f6127215d231da60e24080eb1b", )["user"]["photos"] while True: @@ -122,8 +122,8 @@ class _500pxUserExtractor(_500pxExtractor): variables["cursor"] = photos["pageInfo"]["endCursor"] photos = self._request_graphql( "OtherPhotosPaginationContainerQuery", variables, - "6d31e01104456ce642a2c6fc2f936812" - "b0f2a65c442d03e1521d769c20efe507", + "b4af70d42c71a5e43f0be36ce60dc81e" + "9742ebc117cde197350f2b86b5977d98", )["userByUsername"]["photos"] @@ -153,7 +153,7 @@ class _500pxGalleryExtractor(_500pxExtractor): def metadata(self): user = self._request_graphql( "ProfileRendererQuery", {"username": self.user_name}, - "4d02ff5c13927a3ac73b3eef306490508bc765956940c31051468cf30402a503", + "5a17a9af1830b58b94a912995b7947b24f27f1301c6ea8ab71a9eb1a6a86585b", )["profile"] self.user_id = str(user["legacyId"]) @@ -166,7 +166,7 @@ class _500pxGalleryExtractor(_500pxExtractor): } gallery = self._request_graphql( "GalleriesDetailQueryRendererQuery", variables, - "fd367cacf9bebcdc0620bd749dbd8fc9b0ccbeb54fc76b8b4b95e66a8c0cba49", + "fb8bb66d31b58903e2f01ebe66bbe7937b982753be3211855b7bce4e286c1a49", )["gallery"] self._photos = gallery["photos"] diff --git a/gallery_dl/extractor/8kun.py b/gallery_dl/extractor/8kun.py index 47fe672..e55bb08 100644 --- a/gallery_dl/extractor/8kun.py +++ b/gallery_dl/extractor/8kun.py @@ -20,10 +20,17 @@ class _8kunThreadExtractor(Extractor): filename_fmt = "{time}{num:?-//} {filename}.{extension}" archive_fmt = "{board}_{thread}_{tim}" pattern = r"(?:https?://)?8kun\.top/([^/]+)/res/(\d+)" - test = ("https://8kun.top/test/res/65248.html", { - "pattern": r"https://media\.8kun\.top/file_store/\w{64}\.\w+", - "count": ">= 8", - }) + test = ( + ("https://8kun.top/test/res/65248.html", { + "pattern": r"https://media\.8kun\.top/file_store/\w{64}\.\w+", + "count": ">= 8", + }), + # old-style file URLs (#1101) + ("https://8kun.top/d/res/13258.html", { + "pattern": r"https://media\.8kun\.top/d/src/\d+(-\d)?\.\w+", + "range": "1-20", + }), + ) def __init__(self, match): Extractor.__init__(self, match) @@ -56,7 +63,10 @@ class _8kunThreadExtractor(Extractor): def _process(post, data): post.update(data) post["extension"] = post["ext"][1:] - url = "https://media.8kun.top/file_store/" + post["tim"] + post["ext"] + tim = post["tim"] + url = ("https://media.8kun.top/" + + ("file_store/" if len(tim) > 16 else post["board"] + "/src/") + + tim + post["ext"]) return Message.Url, url, post diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index b8e39bc..d0c327a 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -185,6 +185,8 @@ def _list_classes(): module = importlib.import_module("."+module_name, __package__) yield from add_module(module) + globals()["_list_classes"] = lambda : _cache + def _get_classes(module): """Return a list of all extractor classes in a module""" diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 357deac..5efea4a 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -518,6 +518,15 @@ class SharedConfigMixin(): ), key, default, ) + def config_accumulate(self, key): + values = config.accumulate(self._cfgpath, key) + + conf = config.get(("extractor",), self.basecategory) + if conf: + values[:0] = config.accumulate((self.subcategory,), key, conf=conf) + + return values + def generate_extractors(extractor_data, symtable, classes): """Dynamically generate Extractor classes""" diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index e40ec51..456a173 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -165,11 +165,12 @@ class DeviantartExtractor(Extractor): # filename metadata alphabet = "0123456789abcdefghijklmnopqrstuvwxyz" + deviation["index_base36"] = util.bencode(deviation["index"], alphabet) sub = re.compile(r"\W").sub deviation["filename"] = "".join(( sub("_", deviation["title"].lower()), "_by_", sub("_", deviation["author"]["username"].lower()), "-d", - util.bencode(deviation["index"], alphabet), + deviation["index_base36"], )) @staticmethod diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 06b5ba2..4ead3fb 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -347,24 +347,33 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): @staticmethod def _parse_image_info(url): - parts = url.split("/")[4].split("-") + for part in url.split("/")[4:]: + try: + _, size, width, height, _ = part.split("-") + break + except ValueError: + pass + else: + size = width = height = 0 + return { - "width": text.parse_int(parts[2]), - "height": text.parse_int(parts[3]), - "size": text.parse_int(parts[1]), - "cost": 1, + "cost" : 1, + "size" : text.parse_int(size), + "width" : text.parse_int(width), + "height": text.parse_int(height), } @staticmethod def _parse_original_info(info): parts = info.lstrip().split(" ") size = text.parse_bytes(parts[3] + parts[4][0]) + return { - "width": text.parse_int(parts[0]), - "height": text.parse_int(parts[2]), - "size": size, # 1 initial point + 1 per 0.1 MB - "cost": 1 + math.ceil(size / 100000) + "cost" : 1 + math.ceil(size / 100000), + "size" : size, + "width" : text.parse_int(parts[0]), + "height": text.parse_int(parts[2]), } diff --git a/gallery_dl/extractor/gfycat.py b/gallery_dl/extractor/gfycat.py index 493c1d2..f878dbd 100644 --- a/gallery_dl/extractor/gfycat.py +++ b/gallery_dl/extractor/gfycat.py @@ -55,7 +55,7 @@ class GfycatExtractor(Extractor): class GfycatUserExtractor(GfycatExtractor): """Extractor for gfycat user profiles""" subcategory = "user" - directory_fmt = ("{category}", "{userName}") + directory_fmt = ("{category}", "{username|userName}") pattern = r"(?:https?://)?gfycat\.com/@([^/?#]+)" test = ("https://gfycat.com/@gretta", { "pattern": r"https://giant\.gfycat\.com/[A-Za-z]+\.mp4", diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py index 0be528d..691cefb 100644 --- a/gallery_dl/extractor/hentaifoundry.py +++ b/gallery_dl/extractor/hentaifoundry.py @@ -11,7 +11,7 @@ from .common import Extractor, Message from .. import text, util -BASE_PATTERN = r"(?:https?://)?(?:www\.)?hentai-foundry\.com" +BASE_PATTERN = r"(https?://)?(?:www\.)?hentai-foundry\.com" class HentaifoundryExtractor(Extractor): @@ -20,12 +20,14 @@ class HentaifoundryExtractor(Extractor): directory_fmt = ("{category}", "{user}") filename_fmt = "{category}_{index}_{title}.{extension}" archive_fmt = "{index}" + cookiedomain = "www.hentai-foundry.com" root = "https://www.hentai-foundry.com" per_page = 25 def __init__(self, match): + self.root = (match.group(1) or "https://") + "www.hentai-foundry.com" + self.user = match.group(2) Extractor.__init__(self, match) - self.user = match.group(1) self.page_url = "" self.start_post = 0 self.start_page = 1 @@ -75,7 +77,8 @@ class HentaifoundryExtractor(Extractor): "width" : text.parse_int(extr('width="', '"')), "height" : text.parse_int(extr('height="', '"')), "index" : text.parse_int(path.rsplit("/", 2)[1]), - "src" : "https:" + text.unescape(extr('src="', '"')), + "src" : text.urljoin(self.root, text.unescape(extr( + 'src="', '"'))), "description": text.unescape(text.remove_html(extr( '>Description</div>', '</section>') .replace("\r\n", "\n"), "", "")), @@ -121,7 +124,13 @@ class HentaifoundryExtractor(Extractor): def _init_site_filters(self): """Set site-internal filters to show all images""" url = self.root + "/?enterAgree=1" - response = self.request(url, method="HEAD") + self.request(url, method="HEAD") + + csrf_token = self.session.cookies.get( + "YII_CSRF_TOKEN", domain=self.cookiedomain) + if not csrf_token: + self.log.warning("Unable to update site content filters") + return url = self.root + "/site/filters" data = { @@ -148,7 +157,7 @@ class HentaifoundryExtractor(Extractor): "filter_order" : "date_new", "filter_type" : "0", "YII_CSRF_TOKEN" : text.unquote(text.extract( - response.cookies["YII_CSRF_TOKEN"], "%22", "%22")[0]), + csrf_token, "%22", "%22")[0]), } self.request(url, method="POST", data=data) @@ -235,7 +244,7 @@ class HentaifoundryRecentExtractor(HentaifoundryExtractor): directory_fmt = ("{category}", "Recent Pictures", "{date}") archive_fmt = "r_{index}" pattern = BASE_PATTERN + r"/pictures/recent/(\d\d\d\d-\d\d-\d\d)" - test = ("http://www.hentai-foundry.com/pictures/recent/2018-09-20", { + test = ("https://www.hentai-foundry.com/pictures/recent/2018-09-20", { "pattern": r"https://pictures.hentai-foundry.com/[^/]/[^/?#]+/\d+/", "range": "20-30", }) @@ -254,7 +263,7 @@ class HentaifoundryPopularExtractor(HentaifoundryExtractor): directory_fmt = ("{category}", "Popular Pictures") archive_fmt = "p_{index}" pattern = BASE_PATTERN + r"/pictures/popular()" - test = ("http://www.hentai-foundry.com/pictures/popular", { + test = ("https://www.hentai-foundry.com/pictures/popular", { "pattern": r"https://pictures.hentai-foundry.com/[^/]/[^/?#]+/\d+/", "range": "20-30", }) @@ -267,7 +276,7 @@ class HentaifoundryPopularExtractor(HentaifoundryExtractor): class HentaifoundryImageExtractor(HentaifoundryExtractor): """Extractor for a single image from hentaifoundry.com""" subcategory = "image" - pattern = (r"(?:https?://)?(?:www\.|pictures\.)?hentai-foundry\.com" + pattern = (r"(https?://)?(?:www\.|pictures\.)?hentai-foundry\.com" r"/(?:pictures/user|[^/?#])/([^/?#]+)/(\d+)") test = ( (("https://www.hentai-foundry.com" @@ -290,7 +299,10 @@ class HentaifoundryImageExtractor(HentaifoundryExtractor): "width" : 495, }, }), - ("https://www.hentai-foundry.com/pictures/user/Tenpura/340853/"), + ("http://www.hentai-foundry.com/pictures/user/Tenpura/407501/", { + "pattern": "http://pictures.hentai-foundry.com/t/Tenpura/407501/", + }), + ("https://www.hentai-foundry.com/pictures/user/Tenpura/407501/"), ("https://pictures.hentai-foundry.com" "/t/Tenpura/407501/Tenpura-407501-shimakaze.png"), ) @@ -298,7 +310,7 @@ class HentaifoundryImageExtractor(HentaifoundryExtractor): def __init__(self, match): HentaifoundryExtractor.__init__(self, match) - self.index = match.group(2) + self.index = match.group(3) def items(self): post_url = "{}/pictures/user/{}/{}/?enterAgree=1".format( @@ -359,7 +371,7 @@ class HentaifoundryStoryExtractor(HentaifoundryExtractor): def __init__(self, match): HentaifoundryExtractor.__init__(self, match) - self.index = match.group(2) + self.index = match.group(3) def items(self): story_url = "{}/stories/user/{}/{}/x?enterAgree=1".format( diff --git a/gallery_dl/extractor/khinsider.py b/gallery_dl/extractor/khinsider.py index 6ddf0e8..679b5a0 100644 --- a/gallery_dl/extractor/khinsider.py +++ b/gallery_dl/extractor/khinsider.py @@ -23,9 +23,10 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor): root = "https://downloads.khinsider.com" test = (("https://downloads.khinsider.com" "/game-soundtracks/album/horizon-riders-wii"), { - "pattern": r"https?://vgmdownloads.com/soundtracks/horizon-riders-wii/" - r"[^/]+/Horizon%20Riders%20Wii%20-%20Full%20Soundtrack.mp3", - "keyword": "5b2c35cce638c326cab2a4f7a79f245d008d62ff", + "pattern": r"https?://vgm(site|downloads).com" + r"/soundtracks/horizon-riders-wii/[^/]+" + r"/Horizon%20Riders%20Wii%20-%20Full%20Soundtrack.mp3", + "keyword": "12ca70e0709ea15250e577ea388cf2b5b0c65630", }) def __init__(self, match): diff --git a/gallery_dl/extractor/mangoxo.py b/gallery_dl/extractor/mangoxo.py index 5743498..344dd56 100644 --- a/gallery_dl/extractor/mangoxo.py +++ b/gallery_dl/extractor/mangoxo.py @@ -86,7 +86,7 @@ class MangoxoAlbumExtractor(MangoxoExtractor): "album": { "id": "lzVOv1Q9", "name": "re:池永康晟 Ikenaga Yasunari 透出古朴", - "date": "2019.3.22 14:42", + "date": "dt:2019-03-22 14:42:00", "description": str, }, "num": int, @@ -113,23 +113,24 @@ class MangoxoAlbumExtractor(MangoxoExtractor): def metadata(self, page): """Return general metadata""" title, pos = text.extract(page, '<title>', '</title>') - count, pos = text.extract(page, 'id="pic-count">', '<', pos) - cover, pos = text.extract(page, ' src="', '"', pos) + _ , pos = text.extract(page, 'class="desc"', '', pos) cid , pos = text.extract(page, '//www.mangoxo.com/channel/', '"', pos) cname, pos = text.extract(page, '>', '<', pos) + count, pos = text.extract(page, 'id="pic-count">', '<', pos) + cover, pos = text.extract(page, ' src="', '"', pos) date , pos = text.extract(page, '</i>', '<', pos) descr, pos = text.extract(page, '<pre>', '</pre>', pos) return { "channel": { "id": cid, - "name": text.unescape(cname), + "name": text.unescape(cname.strip()), "cover": cover, }, "album": { "id": self.album_id, "name": text.unescape(title), - "date": date.strip(), + "date": text.parse_datetime(date.strip(), "%Y.%m.%d %H:%M"), "description": text.unescape(descr), }, "count": text.parse_int(count), diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py index 57521d6..e0b0496 100644 --- a/gallery_dl/extractor/paheal.py +++ b/gallery_dl/extractor/paheal.py @@ -21,6 +21,9 @@ class PahealExtractor(SharedConfigMixin, Extractor): root = "https://rule34.paheal.net" def items(self): + self.session.cookies.set( + "ui-tnc-agreed", "true", domain="rule34.paheal.net") + yield Message.Version, 1 yield Message.Directory, self.get_metadata() @@ -65,7 +68,7 @@ class PahealTagExtractor(PahealExtractor): page = self.request(url).text for post in text.extract_iter( - page, '<img id="thumb_', '>Image Only<'): + page, '<img id="thumb_', 'Only</a>'): yield self._extract_data(post) if ">Next<" not in page: @@ -79,7 +82,8 @@ class PahealTagExtractor(PahealExtractor): md5 , pos = text.extract(post, '/_thumbs/', '/', pos) url , pos = text.extract(post, '<a href="', '"', pos) - tags, dimensions, size, _ = data.split(" // ") + tags, data, date = data.split("\n") + dimensions, size, ext = data.split(" // ") width, _, height = dimensions.partition("x") return { diff --git a/gallery_dl/extractor/sankakucomplex.py b/gallery_dl/extractor/sankakucomplex.py index f6ad327..972750c 100644 --- a/gallery_dl/extractor/sankakucomplex.py +++ b/gallery_dl/extractor/sankakucomplex.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019 Mike Fährmann +# Copyright 2019-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,7 +9,7 @@ """Extractors for https://www.sankakucomplex.com/""" from .common import Extractor, Message -from .. import text +from .. import text, util import re @@ -40,6 +40,21 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor): "url": "a1e249173fd6c899a8134fcfbd9c925588a63f7c", "keyword": "e78fcc23c2711befc0969a45ea5082a29efccf68", }), + # videos (#308) + (("https://www.sankakucomplex.com/2019/06/11" + "/darling-ol-goddess-shows-off-her-plump-lower-area/"), { + "pattern": r"/wp-content/uploads/2019/06/[^/]+\d\.mp4", + "range": "26-", + "count": 5, + }), + # youtube embeds (#308) + (("https://www.sankakucomplex.com/2015/02/12" + "/snow-miku-2015-live-magical-indeed/"), { + "options": (("embeds", True),), + "pattern": r"https://www.youtube.com/embed/", + "range": "2-", + "count": 2, + }), ) def items(self): @@ -53,38 +68,44 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor): "date" : text.parse_datetime( extr('property="article:published_time" content="', '"')), } - imgs = self.images(extr) - data["count"] = len(imgs) + content = extr('<div class="entry-content">', '</article>') data["tags"] = text.split_html(extr('="meta-tags">', '</div>'))[::2] - yield Message.Version, 1 - yield Message.Directory, data - for img in imgs: - img.update(data) - yield Message.Url, img["url"], img + files = self._extract_images(content) + if self.config("videos", True): + files += self._extract_videos(content) + if self.config("embeds", False): + files += self._extract_embeds(content) + data["count"] = len(files) - def images(self, extr): - num = 0 - imgs = [] - urls = set() - orig = re.compile(r"-\d+x\d+\.") - - extr('<div class="entry-content">', '') - while True: - url = extr('data-lazy-src="', '"') - if not url: - return imgs - if url in urls: - continue + yield Message.Directory, data + for num, url in enumerate(files, 1): + file = text.nameext_from_url(url) if url[0] == "/": url = text.urljoin(self.root, url) - url = orig.sub(".", url) - num += 1 - imgs.append(text.nameext_from_url(url, { - "url" : url, - "num" : num, - })) - urls.add(url) + file["url"] = url + file["num"] = num + file.update(data) + yield Message.Url, url, file + + @staticmethod + def _extract_images(content): + orig_sub = re.compile(r"-\d+x\d+\.").sub + return [ + orig_sub(".", url) for url in + util.unique(text.extract_iter(content, 'data-lazy-src="', '"')) + ] + + @staticmethod + def _extract_videos(content): + return re.findall(r"<source [^>]*src=[\"']([^\"']+)", content) + + @staticmethod + def _extract_embeds(content): + return [ + "ytdl:" + url for url in + re.findall(r"<iframe [^>]*src=[\"']([^\"']+)", content) + ] class SankakucomplexTagExtractor(SankakucomplexExtractor): diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 06973b2..fe0b3c5 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -11,7 +11,7 @@ from .common import Extractor, Message from .. import text, util, exception from ..cache import cache - +import json BASE_PATTERN = ( r"(?:https?://)?(?:www\.|mobile\.)?" @@ -78,8 +78,8 @@ class TwitterExtractor(Extractor): def _extract_media(self, tweet, files): for media in tweet["extended_entities"]["media"]: - width = media["original_info"].get("width", 0), - height = media["original_info"].get("height", 0), + width = media["original_info"].get("width", 0) + height = media["original_info"].get("height", 0) if "video_info" in media: if self.videos == "ytdl": @@ -321,6 +321,35 @@ class TwitterBookmarkExtractor(TwitterExtractor): return TwitterAPI(self).timeline_bookmark() +class TwitterListExtractor(TwitterExtractor): + """Extractor for Twitter lists""" + subcategory = "list" + pattern = BASE_PATTERN + r"/i/lists/(\d+)/?$" + test = ("https://twitter.com/i/lists/784214683683127296", { + "range": "1-40", + "count": 40, + "archive": False, + }) + + def tweets(self): + return TwitterAPI(self).timeline_list(self.user) + + +class TwitterListMembersExtractor(TwitterExtractor): + """Extractor for members of a Twitter list""" + subcategory = "list-members" + pattern = BASE_PATTERN + r"/i/lists/(\d+)/members" + test = ("https://twitter.com/i/lists/784214683683127296/members",) + + def items(self): + self.login() + for user in TwitterAPI(self).list_members(self.user): + user["_extractor"] = TwitterTimelineExtractor + url = "{}/intent/user?user_id={}".format( + self.root, user["rest_id"]) + yield Message.Queue, url, user + + class TwitterSearchExtractor(TwitterExtractor): """Extractor for all images from a search timeline""" subcategory = "search" @@ -399,7 +428,7 @@ class TwitterTweetExtractor(TwitterExtractor): # Twitter card (#1005) ("https://twitter.com/billboard/status/1306599586602135555", { "options": (("cards", True),), - "pattern": r"https://pbs.twimg.com/card_img/1317274761030856707/", + "pattern": r"https://pbs.twimg.com/card_img/\d+/", }), # original retweets (#1026) ("https://twitter.com/jessica_3978/status/1296304589591810048", { @@ -511,6 +540,13 @@ class TwitterAPI(): endpoint = "2/timeline/bookmark.json" return self._pagination(endpoint) + def timeline_list(self, list_id): + endpoint = "2/timeline/list.json" + params = self.params.copy() + params["list_id"] = list_id + params["ranking_mode"] = "reverse_chronological" + return self._pagination(endpoint, params) + def search(self, query): endpoint = "2/search/adaptive.json" params = self.params.copy() @@ -522,12 +558,29 @@ class TwitterAPI(): return self._pagination( endpoint, params, "sq-I-t-", "sq-cursor-bottom") - def user_by_screen_name(self, screen_name): - endpoint = "graphql/-xfUfZsnR_zqjFd-IfrN5A/UserByScreenName" - params = { - "variables": '{"screen_name":"' + screen_name + '"' - ',"withHighlightedLabel":true}' + def list_members(self, list_id): + endpoint = "graphql/M74V2EwlxxVYGB4DbyAphQ/ListMembers" + variables = { + "listId": list_id, + "count" : 20, + "withTweetResult": False, + "withUserResult" : False, } + return self._pagination_members(endpoint, variables) + + def list_by_rest_id(self, list_id): + endpoint = "graphql/LXXTUytSX1QY-2p8Xp9BFA/ListByRestId" + params = {"variables": '{"listId":"' + list_id + '"' + ',"withUserResult":false}'} + try: + return self._call(endpoint, params)["data"]["list"] + except KeyError: + raise exception.NotFoundError("list") + + def user_by_screen_name(self, screen_name): + endpoint = "graphql/jMaTS-_Ea8vh9rpKggJbCQ/UserByScreenName" + params = {"variables": '{"screen_name":"' + screen_name + '"' + ',"withHighlightedLabel":true}'} try: return self._call(endpoint, params)["data"]["user"] except KeyError: @@ -627,3 +680,30 @@ class TwitterAPI(): if not cursor or not tweet: return params["cursor"] = cursor + + def _pagination_members(self, endpoint, variables): + while True: + cursor = entry = stop = None + params = {"variables": json.dumps(variables)} + data = self._call(endpoint, params) + + try: + instructions = (data["data"]["list"]["members_timeline"] + ["timeline"]["instructions"]) + except KeyError: + raise exception.AuthorizationError() + + for instr in instructions: + if instr["type"] == "TimelineAddEntries": + for entry in instr["entries"]: + if entry["entryId"].startswith("user-"): + yield entry["content"]["itemContent"]["user"] + elif entry["entryId"].startswith("cursor-bottom-"): + cursor = entry["content"]["value"] + elif instr["type"] == "TimelineTerminateTimeline": + if instr["direction"] == "Bottom": + stop = True + + if stop or not cursor or not entry: + return + variables["cursor"] = cursor diff --git a/gallery_dl/extractor/weasyl.py b/gallery_dl/extractor/weasyl.py index 6799784..1dd5b09 100644 --- a/gallery_dl/extractor/weasyl.py +++ b/gallery_dl/extractor/weasyl.py @@ -226,7 +226,6 @@ class WeasylFavoriteExtractor(WeasylExtractor): if not owner_login: owner_login = text.extract(page, '<a href="/~', '"')[0] - yield Message.Directory, {"owner_login": owner_login} for submitid in text.extract_iter(page, "/submissions/", "/", pos): if submitid == lastid: @@ -234,6 +233,8 @@ class WeasylFavoriteExtractor(WeasylExtractor): lastid = submitid submission = self.request_submission(submitid) if self.populate_submission(submission): + submission["user"] = owner_login + yield Message.Directory, submission yield Message.Url, submission["url"], submission if "&nextid=" not in page: diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py index a955ba3..f88dde7 100644 --- a/gallery_dl/postprocessor/metadata.py +++ b/gallery_dl/postprocessor/metadata.py @@ -22,6 +22,8 @@ class MetadataPP(PostProcessor): if mode == "custom": self.write = self._write_custom cfmt = options.get("content-format") or options.get("format") + if isinstance(cfmt, list): + cfmt = "\n".join(cfmt) + "\n" self.contentfmt = util.Formatter(cfmt).format_map ext = "txt" elif mode == "tags": diff --git a/gallery_dl/util.py b/gallery_dl/util.py index d85d2b3..a334b6e 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -48,12 +48,22 @@ def bdecode(data, alphabet="0123456789"): def advance(iterable, num): - """"Advance the iterable by 'num' steps""" + """"Advance 'iterable' by 'num' steps""" iterator = iter(iterable) next(itertools.islice(iterator, num, num), None) return iterator +def unique(iterable): + """Yield unique elements from 'iterable' while preserving order""" + seen = set() + add = seen.add + for element in iterable: + if element not in seen: + add(element) + yield element + + def raises(cls): """Returns a function that raises 'cls' as exception""" def wrap(*args): @@ -713,6 +723,12 @@ class PathFormat(): directory_fmt = extractor.config("directory", extractor.directory_fmt) kwdefault = extractor.config("keywords-default") + extension_map = extractor.config("extension-map") + if extension_map is None: + # TODO: better default value in 1.16.0 + extension_map = {} + self.extension_map = extension_map.get + try: self.filename_formatter = Formatter( filename_fmt, kwdefault).format_map @@ -840,7 +856,9 @@ class PathFormat(): """Set general filename data""" self.kwdict = kwdict self.temppath = self.prefix = "" - self.extension = kwdict["extension"] + + ext = kwdict["extension"] + kwdict["extension"] = self.extension = self.extension_map(ext, ext) if self.extension: self.build_path() @@ -849,6 +867,7 @@ class PathFormat(): def set_extension(self, extension, real=True): """Set filename extension""" + extension = self.extension_map(extension, extension) if real: self.extension = extension self.kwdict["extension"] = self.prefix + extension diff --git a/gallery_dl/version.py b/gallery_dl/version.py index b2e5a58..605865f 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.15.2" +__version__ = "1.15.3" diff --git a/test/test_extractor.py b/test/test_extractor.py index 162edc0..8bc3a27 100644 --- a/test/test_extractor.py +++ b/test/test_extractor.py @@ -22,6 +22,8 @@ from gallery_dl.extractor import mastodon # noqa E402 from gallery_dl.extractor.common import Extractor, Message # noqa E402 from gallery_dl.extractor.directlink import DirectlinkExtractor # noqa E402 +_list_classes = extractor._list_classes + class FakeExtractor(Extractor): category = "fake" @@ -45,6 +47,7 @@ class TestExtractorModule(unittest.TestCase): def setUp(self): extractor._cache.clear() extractor._module_iter = iter(extractor.modules) + extractor._list_classes = _list_classes def test_find(self): for uri in self.VALID_URIS: diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py index ff98477..524e501 100644 --- a/test/test_postprocessor.py +++ b/test/test_postprocessor.py @@ -235,18 +235,20 @@ class MetadataTest(BasePostprocessorTest): self.assertEqual(self._output(m), "foo\nbar\nbaz\n") def test_metadata_custom(self): - pp = self._create( - {"mode": "custom", "format": "{foo}\n{missing}\n"}, - {"foo": "bar"}, - ) - self.assertEqual(pp.write, pp._write_custom) - self.assertEqual(pp.extension, "txt") - self.assertTrue(pp.contentfmt) + def test(pp_info): + pp = self._create(pp_info, {"foo": "bar"}) + self.assertEqual(pp.write, pp._write_custom) + self.assertEqual(pp.extension, "txt") + self.assertTrue(pp.contentfmt) - with patch("builtins.open", mock_open()) as m: - pp.prepare(self.pathfmt) - pp.run(self.pathfmt) - self.assertEqual(self._output(m), "bar\nNone\n") + with patch("builtins.open", mock_open()) as m: + pp.prepare(self.pathfmt) + pp.run(self.pathfmt) + self.assertEqual(self._output(m), "bar\nNone\n") + + test({"mode": "custom", "content-format": "{foo}\n{missing}\n"}) + test({"mode": "custom", "content-format": ["{foo}", "{missing}"]}) + test({"mode": "custom", "format": "{foo}\n{missing}\n"}) def test_metadata_extfmt(self): pp = self._create({ diff --git a/test/test_results.py b/test/test_results.py index d54017e..239b5ad 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -30,7 +30,7 @@ TRAVIS_SKIP = { # temporary issues, etc. BROKEN = { - "imgbox", + "4plebs", "imagevenue", "mangapanda", "photobucket", diff --git a/test/test_util.py b/test/test_util.py index 08ecd64..fd659a0 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -446,6 +446,16 @@ class TestOther(unittest.TestCase): self.assertCountEqual( util.advance(util.advance(items, 1), 2), range(3, 5)) + def test_unique(self): + self.assertSequenceEqual( + list(util.unique("")), "") + self.assertSequenceEqual( + list(util.unique("AABBCC")), "ABC") + self.assertSequenceEqual( + list(util.unique("ABABABCAABBCC")), "ABC") + self.assertSequenceEqual( + list(util.unique([1, 2, 1, 3, 2, 1])), [1, 2, 3]) + def test_raises(self): func = util.raises(Exception) with self.assertRaises(Exception): |