diff options
author | Unit 193 <unit193@unit193.net> | 2021-08-04 02:14:44 -0400 |
---|---|---|
committer | Unit 193 <unit193@unit193.net> | 2021-08-04 02:14:44 -0400 |
commit | 873d9a628e9412a79bdc64cd962470749de3425b (patch) | |
tree | 8cd421ef79a9fa784147fa888543216f0872357b | |
parent | 32de2b06db501c7de81678bce8e3e0c3e63d340c (diff) | |
download | gallery-dl-873d9a628e9412a79bdc64cd962470749de3425b.tar.bz2 gallery-dl-873d9a628e9412a79bdc64cd962470749de3425b.tar.xz gallery-dl-873d9a628e9412a79bdc64cd962470749de3425b.tar.zst |
New upstream version 1.18.2.upstream/1.18.2
52 files changed, 1551 insertions, 323 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index 0a4c90c..72f9c42 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,50 @@ # Changelog +## 1.18.2 - 2021-07-23 +### Additions +- [bbc] add `gallery` and `programme` extractors ([#1706](https://github.com/mikf/gallery-dl/issues/1706)) +- [comicvine] add extractor ([#1712](https://github.com/mikf/gallery-dl/issues/1712)) +- [kemonoparty] add `max-posts` option ([#1674](https://github.com/mikf/gallery-dl/issues/1674)) +- [kemonoparty] parse `o` query parameters ([#1674](https://github.com/mikf/gallery-dl/issues/1674)) +- [mastodon] add `reblogs` and `replies` options ([#1669](https://github.com/mikf/gallery-dl/issues/1669)) +- [pixiv] add extractor for `pixivision` articles ([#1672](https://github.com/mikf/gallery-dl/issues/1672)) +- [ytdl] add experimental extractor for sites supported by youtube-dl ([#1680](https://github.com/mikf/gallery-dl/issues/1680), [#878](https://github.com/mikf/gallery-dl/issues/878)) +- extend `parent-metadata` functionality ([#1687](https://github.com/mikf/gallery-dl/issues/1687), [#1651](https://github.com/mikf/gallery-dl/issues/1651), [#1364](https://github.com/mikf/gallery-dl/issues/1364)) +- add `archive-prefix` option ([#1711](https://github.com/mikf/gallery-dl/issues/1711)) +- add `url-metadata` option ([#1659](https://github.com/mikf/gallery-dl/issues/1659), [#1073](https://github.com/mikf/gallery-dl/issues/1073)) +### Changes +- [kemonoparty] skip duplicated patreon files ([#1689](https://github.com/mikf/gallery-dl/issues/1689), [#1667](https://github.com/mikf/gallery-dl/issues/1667)) +- [mangadex] use custom User-Agent header ([#1535](https://github.com/mikf/gallery-dl/issues/1535)) +### Fixes +- [hitomi] fix image URLs ([#1679](https://github.com/mikf/gallery-dl/issues/1679)) +- [imagevenue] fix extraction ([#1677](https://github.com/mikf/gallery-dl/issues/1677)) +- [instagram] fix extraction of `/explore/tags/` posts ([#1666](https://github.com/mikf/gallery-dl/issues/1666)) +- 
[moebooru] fix `tags` ending with a `+` when logged in ([#1702](https://github.com/mikf/gallery-dl/issues/1702)) +- [naverwebtoon] fix comic extraction +- [pururin] update domain and fix extraction +- [vk] improve metadata extraction and URL pattern ([#1691](https://github.com/mikf/gallery-dl/issues/1691)) +- [downloader:ytdl] fix `outtmpl` setting for yt-dlp ([#1680](https://github.com/mikf/gallery-dl/issues/1680)) + +## 1.18.1 - 2021-07-04 +### Additions +- [mangafox] add `manga` extractor ([#1633](https://github.com/mikf/gallery-dl/issues/1633)) +- [mangasee] add `chapter` and `manga` extractors +- [mastodon] implement `text-posts` option ([#1569](https://github.com/mikf/gallery-dl/issues/1569), [#1669](https://github.com/mikf/gallery-dl/issues/1669)) +- [seisoparty] add `user` and `post` extractors ([#1635](https://github.com/mikf/gallery-dl/issues/1635)) +- implement conditional directories ([#1394](https://github.com/mikf/gallery-dl/issues/1394)) +- add `T` format string conversion ([#1646](https://github.com/mikf/gallery-dl/issues/1646)) +- document format string syntax +### Changes +- [twitter] set `retweet_id` for original retweets ([#1481](https://github.com/mikf/gallery-dl/issues/1481)) +### Fixes +- [directlink] manually encode Referer URLs ([#1647](https://github.com/mikf/gallery-dl/issues/1647)) +- [hiperdex] use domain from input URL +- [kemonoparty] fix `username` extraction ([#1652](https://github.com/mikf/gallery-dl/issues/1652)) +- [kemonoparty] warn about missing DDoS-GUARD cookies +- [twitter] ensure guest tokens are returned as string ([#1665](https://github.com/mikf/gallery-dl/issues/1665)) +- [webtoons] match arbitrary language codes ([#1643](https://github.com/mikf/gallery-dl/issues/1643)) +- fix depth counter in UrlJob when specifying `-g` multiple times + ## 1.18.0 - 2021-06-19 ### Additions - [foolfuuka] support `archive.wakarimasen.moe` ([#1595](https://github.com/mikf/gallery-dl/issues/1595)) @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: 
gallery_dl -Version: 1.18.0 +Version: 1.18.2 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Author: Mike Fährmann @@ -16,7 +16,7 @@ Description: ========== *gallery-dl* is a command-line program to download image galleries and collections from several image hosting sites (see `Supported Sites`_). It is a cross-platform tool with many configuration options - and powerful filenaming capabilities. + and powerful `filenaming capabilities <Formatting_>`_. |pypi| |build| |gitter| @@ -75,8 +75,8 @@ Description: ========== Prebuilt executable files with a Python interpreter and required Python packages included are available for - - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.18.0/gallery-dl.exe>`__ - - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.18.0/gallery-dl.bin>`__ + - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.18.2/gallery-dl.exe>`__ + - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.18.2/gallery-dl.bin>`__ | Executables build from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml @@ -333,6 +333,7 @@ Description: ========== .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.md + .. _Formatting: https://github.com/mikf/gallery-dl/blob/master/docs/formatting.md .. _Python: https://www.python.org/downloads/ .. _PyPI: https://pypi.org/ @@ -5,7 +5,7 @@ gallery-dl *gallery-dl* is a command-line program to download image galleries and collections from several image hosting sites (see `Supported Sites`_). It is a cross-platform tool with many configuration options -and powerful filenaming capabilities. 
+and powerful `filenaming capabilities <Formatting_>`_. |pypi| |build| |gitter| @@ -64,8 +64,8 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.18.0/gallery-dl.exe>`__ -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.18.0/gallery-dl.bin>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.18.2/gallery-dl.exe>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.18.2/gallery-dl.bin>`__ | Executables build from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml @@ -322,6 +322,7 @@ To authenticate with a ``mastodon`` instance, run *gallery-dl* with .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.md +.. _Formatting: https://github.com/mikf/gallery-dl/blob/master/docs/formatting.md .. _Python: https://www.python.org/downloads/ .. 
_PyPI: https://pypi.org/ diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1 index 25da021..ee57b4b 100644 --- a/data/man/gallery-dl.1 +++ b/data/man/gallery-dl.1 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL" "1" "2021-06-19" "1.18.0" "gallery-dl Manual" +.TH "GALLERY-DL" "1" "2021-07-23" "1.18.2" "gallery-dl Manual" .\" disable hyphenation .nh diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5 index 84e8e0e..91101d1 100644 --- a/data/man/gallery-dl.conf.5 +++ b/data/man/gallery-dl.conf.5 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL.CONF" "5" "2021-06-19" "1.18.0" "gallery-dl Manual" +.TH "GALLERY-DL.CONF" "5" "2021-07-23" "1.18.2" "gallery-dl Manual" .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) @@ -79,7 +79,7 @@ those as makeshift comments by settings their values to arbitrary strings. .IP "Example:" 4 .br -* .. code:: +* .. code:: json "{manga}_c{chapter}_{page:>03}.{extension}" @@ -135,13 +135,29 @@ a valid filename extension. .SS extractor.*.directory .IP "Type:" 6 -\f[I]list\f[] of \f[I]strings\f[] +\f[I]list\f[] of \f[I]strings\f[] or \f[I]object\f[] .IP "Example:" 4 +.br +* .. code:: json + ["{category}", "{manga}", "c{chapter} - {title}"] +.br +* .. code:: json + +{ +"'nature' in content": ["Nature Pictures"], +"retweet_id != 0" : ["{category}", "{user[name]}", "Retweets"], +"" : ["{category}", "{user[name]}"] +} + + .IP "Description:" 4 -A list of \f[I]format strings\f[] for the resulting target directory. +A list of \f[I]format strings\f[] to build target directory paths with. + +If this is an \f[I]object\f[], it must contain Python expressions mapping to the +list of format strings to use. Each individual string in such a list represents a single path segment, which will be joined together and appended to the @@ -174,13 +190,27 @@ for any spawned child extractors. 
.SS extractor.*.parent-metadata .IP "Type:" 6 -\f[I]bool\f[] +\f[I]bool\f[] or \f[I]string\f[] .IP "Default:" 9 \f[I]false\f[] .IP "Description:" 4 -Overwrite any metadata provided by a child extractor with its parent's. +If \f[I]true\f[], overwrite any metadata provided by a child extractor +with its parent's. + +If this is a \f[I]string\f[], add a parent's metadata to its children's +.br +to a field named after said string. +For example with \f[I]"parent-metadata": "_p_"\f[]: +.br + +.. code:: json + +{ +"id": "child-id", +"_p_": {"id": "parent-id"} +} .SS extractor.*.parent-skip @@ -194,6 +224,17 @@ Overwrite any metadata provided by a child extractor with its parent's. Share number of skipped downloads between parent and child extractors. +.SS extractor.*.url-metadata +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]null\f[] + +.IP "Description:" 4 +Insert a file's download URL into its metadata dictionary as the given name. + + .SS extractor.*.path-restrict .IP "Type:" 6 \f[I]string\f[] or \f[I]object\f[] @@ -555,7 +596,7 @@ any .IP "Description:" 4 Default value used for missing or undefined keyword names in -format strings. +\f[I]format strings\f[]. .SS extractor.*.category-transfer @@ -623,6 +664,17 @@ may pose a security risk. An alternative \f[I]format string\f[] to build archive IDs with. +.SS extractor.*.archive-prefix +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]"{category}"\f[] + +.IP "Description:" 4 +Prefix for archive IDs. + + .SS extractor.*.postprocessors .IP "Type:" 6 \f[I]list\f[] of \f[I]Postprocessor Configuration\f[] objects @@ -862,6 +914,35 @@ descend into subfolders Download embedded videos hosted on https://www.blogger.com/ +.SS extractor.danbooru.ugoira +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Controls the download target for Ugoira posts. 
+ +.br +* \f[I]true\f[]: Original ZIP archives +.br +* \f[I]false\f[]: Converted video files + + +.SS extractor.danbooru.metadata +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Extract additional metadata (notes, artist commentary, parent, children) + +Note: This requires 1 additional HTTP request for each post. + + .SS extractor.derpibooru.api-key .IP "Type:" 6 \f[I]string\f[] @@ -1341,6 +1422,17 @@ You can use \f[I]"all"\f[] instead of listing all values separately. Download video files. +.SS extractor.kemonoparty.max-posts +.IP "Type:" 6 +\f[I]integer\f[] + +.IP "Default:" 9 +\f[I]null\f[] + +.IP "Description:" 4 +Limit the number of posts to download. + + .SS extractor.kemonoparty.metadata .IP "Type:" 6 \f[I]bool\f[] @@ -1352,6 +1444,17 @@ Download video files. Extract \f[I]username\f[] metadata +.SS extractor.kemonoparty.patreon-skip-file +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +Skip main files in Patreon posts to avoid duplicates. + + .SS extractor.khinsider.format .IP "Type:" 6 \f[I]string\f[] @@ -1380,6 +1483,62 @@ the first in the list gets chosen (usually mp3). The server to use for API requests. +.SS extractor.mangadex.lang +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Example:" 4 +"en" + +.IP "Description:" 4 +\f[I]ISO 639-1\f[] language code +to filter chapters by. + + +.SS extractor.mangadex.metadata +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Provide \f[I]artist\f[], \f[I]author\f[], and \f[I]group\f[] metadata fields. + + +.SS extractor.mastodon.reblogs +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Fetch media from reblogged posts. + + +.SS extractor.mastodon.replies +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +Fetch media from replies to other posts. 
+ + +.SS extractor.mastodon.text-posts +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Also emit metadata for text-only posts without media content. + + .SS extractor.newgrounds.flash .IP "Type:" 6 \f[I]bool\f[] @@ -2099,6 +2258,118 @@ will be taken from the original posts, not the retweeted posts. Download video files. +.SS extractor.ytdl.enabled +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Match **all** URLs, even ones without a \f[I]ytdl:\f[] prefix. + + +.SS extractor.ytdl.format +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +youtube-dl's default, currently \f[I]"bestvideo+bestaudio/best"\f[] + +.IP "Description:" 4 +Video \f[I]format selection +<https://github.com/ytdl-org/youtube-dl#format-selection>\f[] +directly passed to youtube-dl. + + +.SS extractor.ytdl.generic +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +Controls the use of youtube-dl's generic extractor. + +Set this option to \f[I]"force"\f[] for the same effect as youtube-dl's +\f[I]--force-generic-extractor\f[]. + + +.SS extractor.ytdl.logging +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +Route youtube-dl's output through gallery-dl's logging system. +Otherwise youtube-dl will write its output directly to stdout/stderr. + +Note: Set \f[I]quiet\f[] and \f[I]no_warnings\f[] in +\f[I]extractor.ytdl.raw-options\f[] to \f[I]true\f[] to suppress all output. + + +.SS extractor.ytdl.module +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]"youtube_dl"\f[] + +.IP "Description:" 4 +Name of the youtube-dl Python module to import. + + +.SS extractor.ytdl.raw-options +.IP "Type:" 6 +\f[I]object\f[] + +.IP "Example:" 4 +.. code:: json + +{ +"quiet": true, +"writesubtitles": true, +"merge_output_format": "mkv" +} + + +.IP "Description:" 4 +Additional options passed directly to the \f[I]YoutubeDL\f[] constructor. 
+ +All available options can be found in \f[I]youtube-dl's docstrings +<https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L138-L318>\f[]. + + +.SS extractor.[booru].tags +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Categorize tags by their respective types +and provide them as \f[I]tags_<type>\f[] metadata fields. + +Note: This requires 1 additional HTTP request for each post. + + +.SS extractor.[booru].notes +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Extract overlay notes (position and text). + +Note: This requires 1 additional HTTP request for each post. + + .SS extractor.[manga-extractor].chapter-reverse .IP "Type:" 6 \f[I]bool\f[] diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 9514c7a..ffbed52 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -105,10 +105,6 @@ { "include": "pictures" }, - "hentainexus": - { - "original": true - }, "hitomi": { "metadata": true @@ -148,7 +144,9 @@ }, "mangadex": { - "api-server": "https://api.mangadex.org" + "api-server": "https://api.mangadex.org", + "metadata": false, + "lang": null }, "mangoxo": { @@ -285,6 +283,15 @@ "retweets": true, "videos": true }, + "ytdl": + { + "enabled": false, + "format": null, + "generic": true, + "logging": true, + "module": "youtube_dl", + "raw-options": null + }, "booru": { "tags": false, diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO index b53c326..c8f8dec 100644 --- a/gallery_dl.egg-info/PKG-INFO +++ b/gallery_dl.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery-dl -Version: 1.18.0 +Version: 1.18.2 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Author: Mike Fährmann @@ -16,7 +16,7 @@ Description: ========== *gallery-dl* is a command-line program to download image galleries and collections from 
several image hosting sites (see `Supported Sites`_). It is a cross-platform tool with many configuration options - and powerful filenaming capabilities. + and powerful `filenaming capabilities <Formatting_>`_. |pypi| |build| |gitter| @@ -75,8 +75,8 @@ Description: ========== Prebuilt executable files with a Python interpreter and required Python packages included are available for - - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.18.0/gallery-dl.exe>`__ - - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.18.0/gallery-dl.bin>`__ + - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.18.2/gallery-dl.exe>`__ + - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.18.2/gallery-dl.bin>`__ | Executables build from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml @@ -333,6 +333,7 @@ Description: ========== .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.md + .. _Formatting: https://github.com/mikf/gallery-dl/blob/master/docs/formatting.md .. _Python: https://www.python.org/downloads/ .. 
_PyPI: https://pypi.org/ diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt index 9655896..f8a3c2c 100644 --- a/gallery_dl.egg-info/SOURCES.txt +++ b/gallery_dl.egg-info/SOURCES.txt @@ -45,10 +45,12 @@ gallery_dl/extractor/adultempire.py gallery_dl/extractor/architizer.py gallery_dl/extractor/artstation.py gallery_dl/extractor/aryion.py +gallery_dl/extractor/bbc.py gallery_dl/extractor/bcy.py gallery_dl/extractor/behance.py gallery_dl/extractor/blogger.py gallery_dl/extractor/booru.py +gallery_dl/extractor/comicvine.py gallery_dl/extractor/common.py gallery_dl/extractor/cyberdrop.py gallery_dl/extractor/danbooru.py @@ -105,6 +107,7 @@ gallery_dl/extractor/mangahere.py gallery_dl/extractor/mangakakalot.py gallery_dl/extractor/manganelo.py gallery_dl/extractor/mangapark.py +gallery_dl/extractor/mangasee.py gallery_dl/extractor/mangoxo.py gallery_dl/extractor/mastodon.py gallery_dl/extractor/message.py @@ -141,6 +144,7 @@ gallery_dl/extractor/redgifs.py gallery_dl/extractor/sankaku.py gallery_dl/extractor/sankakucomplex.py gallery_dl/extractor/seiga.py +gallery_dl/extractor/seisoparty.py gallery_dl/extractor/senmanga.py gallery_dl/extractor/sexcom.py gallery_dl/extractor/shopify.py @@ -168,6 +172,7 @@ gallery_dl/extractor/weibo.py gallery_dl/extractor/wikiart.py gallery_dl/extractor/xhamster.py gallery_dl/extractor/xvideos.py +gallery_dl/extractor/ytdl.py gallery_dl/postprocessor/__init__.py gallery_dl/postprocessor/classify.py gallery_dl/postprocessor/common.py diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index d5893b7..2cad029 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -127,6 +127,8 @@ def main(): # extractor modules modules = config.get(("extractor",), "modules") if modules is not None: + if isinstance(modules, str): + modules = modules.split(",") extractor.modules = modules extractor._module_iter = iter(modules) diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py index 
e116188..b1e1d58 100644 --- a/gallery_dl/downloader/ytdl.py +++ b/gallery_dl/downloader/ytdl.py @@ -15,13 +15,9 @@ import os class YoutubeDLDownloader(DownloaderBase): scheme = "ytdl" - module = None def __init__(self, job): - module = self.module - if not module: - module_name = self.config("module") or "youtube_dl" - module = YoutubeDLDownloader.module = __import__(module_name) + module = __import__(self.config("module") or "youtube_dl") DownloaderBase.__init__(self, job) extractor = job.extractor @@ -41,7 +37,10 @@ class YoutubeDLDownloader(DownloaderBase): "max_filesize": text.parse_bytes( self.config("filesize-max"), None), } - options.update(self.config("raw-options") or {}) + + raw_options = self.config("raw-options") + if raw_options: + options.update(raw_options) if self.config("logging", True): options["logger"] = self.log @@ -54,30 +53,37 @@ class YoutubeDLDownloader(DownloaderBase): self.ytdl = module.YoutubeDL(options) def download(self, url, pathfmt): - if self.forward_cookies: - set_cookie = self.ytdl.cookiejar.set_cookie - for cookie in self.session.cookies: - set_cookie(cookie) - - try: - info_dict = self.ytdl.extract_info(url[5:], download=False) - except Exception: - return False + kwdict = pathfmt.kwdict + + ytdl = kwdict.pop("_ytdl_instance", None) + if not ytdl: + ytdl = self.ytdl + if self.forward_cookies: + set_cookie = ytdl.cookiejar.set_cookie + for cookie in self.session.cookies: + set_cookie(cookie) + + info_dict = kwdict.pop("_ytdl_info_dict", None) + if not info_dict: + try: + info_dict = ytdl.extract_info(url[5:], download=False) + except Exception: + return False if "entries" in info_dict: - index = pathfmt.kwdict.get("_ytdl_index") + index = kwdict.get("_ytdl_index") if index is None: - return self._download_playlist(pathfmt, info_dict) + return self._download_playlist(ytdl, pathfmt, info_dict) else: info_dict = info_dict["entries"][index] - extra = pathfmt.kwdict.get("_ytdl_extra") + extra = kwdict.get("_ytdl_extra") if extra: 
info_dict.update(extra) - return self._download_video(pathfmt, info_dict) + return self._download_video(ytdl, pathfmt, info_dict) - def _download_video(self, pathfmt, info_dict): + def _download_video(self, ytdl, pathfmt, info_dict): if "url" in info_dict: text.nameext_from_url(info_dict["url"], pathfmt.kwdict) @@ -86,8 +92,8 @@ class YoutubeDLDownloader(DownloaderBase): info_dict["ext"] = "mkv" if self.outtmpl: - self.ytdl.params["outtmpl"] = self.outtmpl - pathfmt.filename = filename = self.ytdl.prepare_filename(info_dict) + self._set_outtmpl(ytdl, self.outtmpl) + pathfmt.filename = filename = ytdl.prepare_filename(info_dict) pathfmt.extension = info_dict["ext"] pathfmt.path = pathfmt.directory + filename pathfmt.realpath = pathfmt.temppath = ( @@ -101,26 +107,35 @@ class YoutubeDLDownloader(DownloaderBase): if self.part and self.partdir: pathfmt.temppath = os.path.join( self.partdir, pathfmt.filename) - self.ytdl.params["outtmpl"] = pathfmt.temppath.replace("%", "%%") + + self._set_outtmpl(ytdl, pathfmt.temppath.replace("%", "%%")) self.out.start(pathfmt.path) try: - self.ytdl.process_info(info_dict) + ytdl.process_info(info_dict) except Exception: self.log.debug("Traceback", exc_info=True) return False return True - def _download_playlist(self, pathfmt, info_dict): + def _download_playlist(self, ytdl, pathfmt, info_dict): pathfmt.set_extension("%(playlist_index)s.%(ext)s") - self.ytdl.params["outtmpl"] = pathfmt.realpath + self._set_outtmpl(ytdl, pathfmt.realpath) for entry in info_dict["entries"]: - self.ytdl.process_info(entry) + ytdl.process_info(entry) return True + @staticmethod + def _set_outtmpl(ytdl, outtmpl): + try: + ytdl.outtmpl_dict["default"] = outtmpl + except AttributeError: + ytdl.params["outtmpl"] = outtmpl + def compatible_formats(formats): + """Returns True if 'formats' are compatible for merge""" video_ext = formats[0].get("ext") audio_ext = formats[1].get("ext") diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py 
index 4cf5e48..696b370 100644 --- a/gallery_dl/extractor/500px.py +++ b/gallery_dl/extractor/500px.py @@ -140,7 +140,7 @@ class _500pxGalleryExtractor(_500pxExtractor): }), # unavailable photos (#1335) ("https://500px.com/p/Light_Expression_Photography/galleries/street", { - "count": 0, + "count": 4, }), ("https://500px.com/fashvamp/galleries/lera"), ) diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index d927d70..1a6a899 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -20,9 +20,11 @@ modules = [ "architizer", "artstation", "aryion", + "bbc", "bcy", "behance", "blogger", + "comicvine", "cyberdrop", "danbooru", "deviantart", @@ -74,6 +76,7 @@ modules = [ "mangakakalot", "manganelo", "mangapark", + "mangasee", "mangoxo", "myhentaigallery", "myportfolio", @@ -105,6 +108,7 @@ modules = [ "sankaku", "sankakucomplex", "seiga", + "seisoparty", "senmanga", "sexcom", "simplyhentai", @@ -141,6 +145,7 @@ modules = [ "recursive", "oauth", "test", + "ytdl", ] diff --git a/gallery_dl/extractor/architizer.py b/gallery_dl/extractor/architizer.py index 9629e25..dbc197e 100644 --- a/gallery_dl/extractor/architizer.py +++ b/gallery_dl/extractor/architizer.py @@ -37,7 +37,7 @@ class ArchitizerProjectExtractor(GalleryExtractor): "subcategory": "project", "title": "House LO", "type": "Residential › Private House", - "year": "2018", + "year": "2020", }, }) diff --git a/gallery_dl/extractor/bbc.py b/gallery_dl/extractor/bbc.py new file mode 100644 index 0000000..ace8a28 --- /dev/null +++ b/gallery_dl/extractor/bbc.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- + +# Copyright 2021 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://bbc.co.uk/""" + +from .common import GalleryExtractor, Extractor, Message +from .. 
import text, util +import json + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?bbc\.co\.uk(/programmes/" + + +class BbcGalleryExtractor(GalleryExtractor): + """Extractor for a programme gallery on bbc.co.uk""" + category = "bbc" + root = "https://www.bbc.co.uk" + directory_fmt = ("{category}", "{path[0]}", "{path[1]}", "{path[2]}", + "{path[3:]:J - /}") + filename_fmt = "{num:>02}.{extension}" + archive_fmt = "{programme}_{num}" + pattern = BASE_PATTERN + r"[^/?#]+(?!/galleries)(?:/[^/?#]+)?)$" + test = ( + ("https://www.bbc.co.uk/programmes/p084qtzs/p085g9kg", { + "pattern": r"https://ichef\.bbci\.co\.uk" + r"/images/ic/976x549_b/\w+\.jpg", + "count": 37, + "keyword": { + "programme": "p084qtzs", + "path": ["BBC One", "Doctor Who", "The Timeless Children"], + }, + }), + ("https://www.bbc.co.uk/programmes/p084qtzs"), + ) + + def metadata(self, page): + data = json.loads(text.extract( + page, '<script type="application/ld+json">', '</script>')[0]) + return { + "programme": self.gallery_url.split("/")[4], + "path": list(util.unique_sequence( + element["name"] + for element in data["itemListElement"] + )), + } + + def images(self, page): + return [ + (imgset.rpartition(", ")[2].partition(" ")[0], None) + for imgset in text.extract_iter(page, 'data-image-src-sets="', '"') + ] + + +class BbcProgrammeExtractor(Extractor): + """Extractor for all galleries of a bbc programme""" + category = "bbc" + subcategory = "programme" + root = "https://www.bbc.co.uk" + pattern = BASE_PATTERN + r"[^/?#]+/galleries)" + test = ("https://www.bbc.co.uk/programmes/b006q2x0/galleries", { + "pattern": BbcGalleryExtractor.pattern, + "count": ">= 24", + }) + + def __init__(self, match): + Extractor.__init__(self, match) + self.galleries_url = self.root + match.group(1) + + def items(self): + page = self.request(self.galleries_url).text + data = {"_extractor": BbcGalleryExtractor} + + for programme_id in text.extract_iter( + page, '<a href="https://www.bbc.co.uk/programmes/', '"'): + url = 
"https://www.bbc.co.uk/programmes/" + programme_id + yield Message.Queue, url, data diff --git a/gallery_dl/extractor/bcy.py b/gallery_dl/extractor/bcy.py index d6e3683..f867bd9 100644 --- a/gallery_dl/extractor/bcy.py +++ b/gallery_dl/extractor/bcy.py @@ -173,9 +173,8 @@ class BcyPostExtractor(BcyExtractor): ("https://bcy.net/item/detail/6950136331708144648", { "pattern": r"https://p\d-bcy.byteimg.com/img/banciyuan/[0-9a-f]+" r"~tplv-banciyuan-logo-v3:.+\.image", - "count": 10, - "keyword": {"filter": "watermark"} - + "count": 8, + "keyword": {"filter": "watermark"}, }), # deleted ("https://bcy.net/item/detail/6780546160802143236", { diff --git a/gallery_dl/extractor/comicvine.py b/gallery_dl/extractor/comicvine.py new file mode 100644 index 0000000..3a57886 --- /dev/null +++ b/gallery_dl/extractor/comicvine.py @@ -0,0 +1,78 @@ +# -*- coding: utf-8 -*- + +# Copyright 2021 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://comicvine.gamespot.com/""" + +from .booru import BooruExtractor +from .. 
import text +import operator + + +class ComicvineTagExtractor(BooruExtractor): + """Extractor for a gallery on comicvine.gamespot.com""" + category = "comicvine" + subcategory = "tag" + basecategory = "" + root = "https://comicvine.gamespot.com" + per_page = 1000 + directory_fmt = ("{category}", "{tag}") + filename_fmt = "{filename}.{extension}" + archive_fmt = "{id}" + pattern = (r"(?:https?://)?comicvine\.gamespot\.com" + r"(/([^/?#]+)/(\d+-\d+)/images/.*)") + test = ( + ("https://comicvine.gamespot.com/jock/4040-5653/images/", { + "pattern": r"https://comicvine\.gamespot\.com/a/uploads" + r"/original/\d+/\d+/\d+-.+\.(jpe?g|png)", + "count": ">= 140", + }), + (("https://comicvine.gamespot.com/batman/4005-1699" + "/images/?tag=Fan%20Art%20%26%20Cosplay"), { + "pattern": r"https://comicvine\.gamespot\.com/a/uploads" + r"/original/\d+/\d+/\d+-.+", + "count": ">= 450", + }), + ) + + def __init__(self, match): + BooruExtractor.__init__(self, match) + self.path, self.object_name, self.object_id = match.groups() + + def metadata(self): + return {"tag": text.unquote(self.object_name)} + + def posts(self): + url = self.root + "/js/image-data.json" + params = { + "images": text.extract( + self.request(self.root + self.path).text, + 'data-gallery-id="', '"')[0], + "start" : self.page_start, + "count" : self.per_page, + "object": self.object_id, + } + + while True: + images = self.request(url, params=params).json()["images"] + yield from images + + if len(images) < self.per_page: + return + params["start"] += self.per_page + + def skip(self, num): + self.page_start = num + return num + + _file_url = operator.itemgetter("original") + + @staticmethod + def _prepare(post): + post["date"] = text.parse_datetime( + post["dateCreated"], "%a, %b %d %Y") + post["tags"] = [tag["name"] for tag in post["tags"] if tag["name"]] diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 048e0a3..2533ae5 100644 --- a/gallery_dl/extractor/common.py +++ 
b/gallery_dl/extractor/common.py @@ -174,8 +174,7 @@ class Extractor(): elif until: if isinstance(until, datetime.datetime): # convert to UTC timestamp - epoch = datetime.datetime(1970, 1, 1) - until = (until - epoch) / datetime.timedelta(0, 1) + until = (until - util.EPOCH) / util.SECOND else: until = float(until) seconds = until - now diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 70e268d..163d7ba 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -258,19 +258,25 @@ class DeviantartExtractor(Extractor): return Message.Url, txt, deviation @staticmethod - def _find_folder(folders, name): - match = re.compile(name.replace( - "-", r"[^a-z0-9]+") + "$", re.IGNORECASE).match - for folder in folders: - if match(folder["name"]): - return folder + def _find_folder(folders, name, uuid): + if uuid.isdecimal(): + match = re.compile(name.replace( + "-", r"[^a-z0-9]+") + "$", re.IGNORECASE).match + for folder in folders: + if match(folder["name"]): + return folder + else: + for folder in folders: + if folder["folderid"] == uuid: + return folder raise exception.NotFoundError("folder") def _folder_urls(self, folders, category, extractor): - base = "{}/{}/{}/0/".format(self.root, self.user, category) + base = "{}/{}/{}/".format(self.root, self.user, category) for folder in folders: folder["_extractor"] = extractor - yield base + folder["name"], folder + url = "{}{}/{}".format(base, folder["folderid"], folder["name"]) + yield url, folder def _update_content_default(self, deviation, content): public = "premium_folder_data" not in deviation @@ -422,7 +428,8 @@ class DeviantartGalleryExtractor(DeviantartExtractor): }), # group ("https://www.deviantart.com/yakuzafc/gallery", { - "pattern": r"https://www.deviantart.com/yakuzafc/gallery/0/", + "pattern": r"https://www.deviantart.com/yakuzafc/gallery" + r"/\w{8}-\w{4}-\w{4}-\w{4}-\w{12}/", "count": ">= 15", }), # 'folders' option (#276) @@ -461,7 +468,7 
@@ class DeviantartFolderExtractor(DeviantartExtractor): subcategory = "folder" directory_fmt = ("{category}", "{username}", "{folder[title]}") archive_fmt = "F_{folder[uuid]}_{index}.{extension}" - pattern = BASE_PATTERN + r"/gallery/(\d+)/([^/?#]+)" + pattern = BASE_PATTERN + r"/gallery/([^/?#]+)/([^/?#]+)" test = ( # user ("https://www.deviantart.com/shimoda7/gallery/722019/Miscellaneous", { @@ -473,6 +480,12 @@ class DeviantartFolderExtractor(DeviantartExtractor): "count": ">= 4", "options": (("original", False),), }), + # uuid + (("https://www.deviantart.com/shimoda7/gallery" + "/B38E3C6A-2029-6B45-757B-3C8D3422AD1A/misc"), { + "count": 5, + "options": (("original", False),), + }), # name starts with '_', special characters (#1451) (("https://www.deviantart.com/justatest235723" "/gallery/69302698/-test-b-c-d-e-f-"), { @@ -491,7 +504,7 @@ class DeviantartFolderExtractor(DeviantartExtractor): def deviations(self): folders = self.api.gallery_folders(self.user) - folder = self._find_folder(folders, self.folder_name) + folder = self._find_folder(folders, self.folder_name, self.folder_id) self.folder = { "title": folder["name"], "uuid" : folder["folderid"], @@ -611,10 +624,15 @@ class DeviantartCollectionExtractor(DeviantartExtractor): directory_fmt = ("{category}", "{username}", "Favourites", "{collection[title]}") archive_fmt = "C_{collection[uuid]}_{index}.{extension}" - pattern = BASE_PATTERN + r"/favourites/(\d+)/([^/?#]+)" + pattern = BASE_PATTERN + r"/favourites/([^/?#]+)/([^/?#]+)" test = ( - (("https://www.deviantart.com/pencilshadings" - "/favourites/70595441/3D-Favorites"), { + (("https://www.deviantart.com/pencilshadings/favourites" + "/70595441/3D-Favorites"), { + "count": ">= 20", + "options": (("original", False),), + }), + (("https://www.deviantart.com/pencilshadings/favourites" + "/F050486B-CB62-3C66-87FB-1105A7F6379F/3D Favorites"), { "count": ">= 20", "options": (("original", False),), }), @@ -630,7 +648,8 @@ class 
DeviantartCollectionExtractor(DeviantartExtractor): def deviations(self): folders = self.api.collections_folders(self.user) - folder = self._find_folder(folders, self.collection_name) + folder = self._find_folder( + folders, self.collection_name, self.collection_id) self.collection = { "title": folder["name"], "uuid" : folder["folderid"], diff --git a/gallery_dl/extractor/directlink.py b/gallery_dl/extractor/directlink.py index a6346bf..8505b0b 100644 --- a/gallery_dl/extractor/directlink.py +++ b/gallery_dl/extractor/directlink.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2020 Mike Fährmann +# Copyright 2017-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -59,7 +59,8 @@ class DirectlinkExtractor(Extractor): data["path"], _, name = data["path"].rpartition("/") data["filename"], _, ext = name.rpartition(".") data["extension"] = ext.lower() - data["_http_headers"] = {"Referer": self.url} + data["_http_headers"] = { + "Referer": self.url.encode("latin-1", "ignore")} yield Message.Version, 1 yield Message.Directory, data diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 64a6cb7..bccd6c8 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -122,7 +122,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): "date": "dt:2018-03-18 20:15:00", "eh_category": "Non-H", "expunged": False, - "favorites": "17", + "favorites": "18", "filecount": "4", "filesize": 1488978, "gid": 1200119, diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index 5ea3adb..b82160f 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -137,7 +137,7 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor): "url": "61896d9d9a2edb556b619000a308a984307b6d30", }), ("https://thebarchive.com/b/thread/739772332/", { - "url": 
"07d39d2cb48f40fb337dc992993d965b0cd5f7cd", + "url": "e8b18001307d130d67db31740ce57c8561b5d80c", }), ("https://archive.wakarimasen.moe/a/thread/223157648/", { "url": "fef0758d2eb81b1ba783051fd5ec491d70107a78", diff --git a/gallery_dl/extractor/hentaihand.py b/gallery_dl/extractor/hentaihand.py index 4485925..fa8e98b 100644 --- a/gallery_dl/extractor/hentaihand.py +++ b/gallery_dl/extractor/hentaihand.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2020 Mike Fährmann +# Copyright 2020-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -19,21 +19,23 @@ class HentaihandGalleryExtractor(GalleryExtractor): root = "https://hentaihand.com" pattern = r"(?:https?://)?(?:www\.)?hentaihand\.com/\w+/comic/([\w-]+)" test = ( - (("https://hentaihand.com/en/comic/kouda-tomohiro-chiyomi-" - "blizzard-comic-aun-2016-12-english-nanda-sore-scans"), { - "pattern": r"https://cdn.hentaihand.com/.*/images/304546/\d+.jpg$", - "count": 19, + (("https://hentaihand.com/en/comic/c75-takumi-na-muchi-choudenji-hou-" + "no-aishi-kata-how-to-love-a-super-electromagnetic-gun-toaru-kagaku-" + "no-railgun-english"), { + "pattern": r"https://cdn.hentaihand.com/.*/images/5669/\d+.jpg$", + "count": 50, "keyword": { - "artists" : ["Kouda Tomohiro"], - "date" : "dt:2020-02-06 00:00:00", - "gallery_id": 304546, + "artists" : ["Takumi Na Muchi"], + "date" : "dt:2014-06-28 00:00:00", + "gallery_id": 5669, "lang" : "en", "language" : "English", - "relationships": ["Family", "Step family"], + "parodies" : ["Toaru Kagaku No Railgun"], + "relationships": list, "tags" : list, - "title" : r"re:\[Kouda Tomohiro\] Chiyomi Blizzard", - "title_alt" : r"re:\[幸田朋弘\] ちよみブリザード", - "type" : "Manga", + "title" : r"re:\(C75\) \[Takumi na Muchi\] Choudenji Hou ", + "title_alt" : r"re:\(C75\) \[たくみなむち\] 超電磁砲のあいしかた", + "type" : "Doujinshi", }, }), ) @@ -76,9 +78,9 @@ class HentaihandTagExtractor(Extractor): 
r"/\w+/(parody|character|tag|artist|group|language" r"|category|relationship)/([^/?#]+)") test = ( - ("https://hentaihand.com/en/artist/himuro", { + ("https://hentaihand.com/en/artist/takumi-na-muchi", { "pattern": HentaihandGalleryExtractor.pattern, - "count": ">= 18", + "count": ">= 6", }), ("https://hentaihand.com/en/tag/full-color"), ("https://hentaihand.com/fr/language/japanese"), diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py index 7ad06c9..a40d631 100644 --- a/gallery_dl/extractor/hiperdex.py +++ b/gallery_dl/extractor/hiperdex.py @@ -13,13 +13,13 @@ from .. import text from ..cache import memcache import re -BASE_PATTERN = r"(?:https?://)?(?:www\.)?hiperdex\d?\.(?:com|net|info)" +BASE_PATTERN = r"((?:https?://)?(?:www\.)?hiperdex\d?\.(?:com|net|info))" class HiperdexBase(): """Base class for hiperdex extractors""" category = "hiperdex" - root = "https://hiperdex2.com" + root = "https://hiperdex.com" @memcache(keyarg=1) def manga_data(self, manga, page=None): @@ -65,7 +65,7 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor): """Extractor for manga chapters from hiperdex.com""" pattern = BASE_PATTERN + r"(/manga/([^/?#]+)/([^/?#]+))" test = ( - ("https://hiperdex2.com/manga/domestic-na-kanojo/154-5/", { + ("https://hiperdex.com/manga/domestic-na-kanojo/154-5/", { "pattern": r"https://hiperdex\d?.(com|net|info)/wp-content/uploads" r"/WP-manga/data/manga_\w+/[0-9a-f]{32}/\d+\.webp", "count": 9, @@ -82,12 +82,14 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor): "type" : "Manga", }, }), + ("https://hiperdex2.com/manga/domestic-na-kanojo/154-5/"), ("https://hiperdex.net/manga/domestic-na-kanojo/154-5/"), ("https://hiperdex.info/manga/domestic-na-kanojo/154-5/"), ) def __init__(self, match): - path, self.manga, self.chapter = match.groups() + root, path, self.manga, self.chapter = match.groups() + self.root = text.ensure_http_scheme(root) ChapterExtractor.__init__(self, match, self.root + path + "/") 
def metadata(self, _): @@ -106,7 +108,7 @@ class HiperdexMangaExtractor(HiperdexBase, MangaExtractor): chapterclass = HiperdexChapterExtractor pattern = BASE_PATTERN + r"(/manga/([^/?#]+))/?$" test = ( - ("https://hiperdex2.com/manga/youre-not-that-special/", { + ("https://hiperdex.com/manga/youre-not-that-special/", { "count": 51, "pattern": HiperdexChapterExtractor.pattern, "keyword": { @@ -123,12 +125,14 @@ class HiperdexMangaExtractor(HiperdexBase, MangaExtractor): "type" : "Manhwa", }, }), + ("https://hiperdex2.com/manga/youre-not-that-special/"), ("https://hiperdex.net/manga/youre-not-that-special/"), ("https://hiperdex.info/manga/youre-not-that-special/"), ) def __init__(self, match): - path, self.manga = match.groups() + root, path, self.manga = match.groups() + self.root = text.ensure_http_scheme(root) MangaExtractor.__init__(self, match, self.root + path + "/") def chapters(self, page): @@ -156,10 +160,10 @@ class HiperdexArtistExtractor(HiperdexBase, MangaExtractor): categorytransfer = False chapterclass = HiperdexMangaExtractor reverse = False - pattern = BASE_PATTERN + r"(/manga-a(?:rtist|uthor)/([^/?#]+))" + pattern = BASE_PATTERN + r"(/manga-a(?:rtist|uthor)/(?:[^/?#]+))" test = ( - ("https://hiperdex2.com/manga-artist/beck-ho-an/"), ("https://hiperdex.net/manga-artist/beck-ho-an/"), + ("https://hiperdex2.com/manga-artist/beck-ho-an/"), ("https://hiperdex.info/manga-artist/beck-ho-an/"), ("https://hiperdex.com/manga-author/viagra/", { "pattern": HiperdexMangaExtractor.pattern, @@ -168,7 +172,8 @@ class HiperdexArtistExtractor(HiperdexBase, MangaExtractor): ) def __init__(self, match): - MangaExtractor.__init__(self, match, self.root + match.group(1) + "/") + self.root = text.ensure_http_scheme(match.group(1)) + MangaExtractor.__init__(self, match, self.root + match.group(2) + "/") def chapters(self, page): results = [] diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index 497509d..2ea5dfa 100644 --- 
a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -35,12 +35,12 @@ class HitomiGalleryExtractor(GalleryExtractor): }), # Game CG with scenes (#321) ("https://hitomi.la/galleries/733697.html", { - "url": "8dfbcb1e51cec43a7112d58b7e92153155ada3b9", + "url": "1de8510bd4c3048a1cbbf242505d8449e93ba5a4", "count": 210, }), # fallback for galleries only available through /reader/ URLs ("https://hitomi.la/galleries/1045954.html", { - "url": "a5af7fdca1f5c93c289af128914a8488ea345036", + "url": "681bb07d8ce4d0c4d0592e47b239b6e42d566386", "count": 1413, }), # gallery with "broken" redirect @@ -140,11 +140,10 @@ class HitomiGalleryExtractor(GalleryExtractor): # see https://ltn.hitomi.la/common.js inum = int(ihash[-3:-1], 16) - frontends = 2 if inum < 0x70 else 3 - inum = 1 if inum < 0x49 else inum + offset = 2 if inum < 0x40 else 1 if inum < 0x80 else 0 url = "https://{}b.hitomi.la/images/{}/{}/{}.{}".format( - chr(97 + (inum % frontends)), + chr(97 + offset), ihash[-1], ihash[-3:-1], ihash, idata["extension"], ) diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py index d757e17..9328437 100644 --- a/gallery_dl/extractor/imagehosts.py +++ b/gallery_dl/extractor/imagehosts.py @@ -133,18 +133,30 @@ class AcidimgImageExtractor(ImagehostImageExtractor): class ImagevenueImageExtractor(ImagehostImageExtractor): """Extractor for single images from imagevenue.com""" category = "imagevenue" - pattern = (r"(?:https?://)?(img\d+\.imagevenue\.com" - r"/img\.php\?image=(?:[a-z]+_)?(\d+)_[^&#]+)") - test = (("http://img28116.imagevenue.com/img.php" - "?image=th_52709_test_122_64lo.jpg"), { - "url": "46812995d557f2c6adf0ebd0e631e6e4e45facde", - "content": "59ec819cbd972dd9a71f25866fbfc416f2f215b3", - }) - https = False + pattern = (r"(?:https?://)?((?:www|img\d+)\.imagevenue\.com" + r"/([A-Z0-9]{8,10}|view/.*|img\.php\?.*))") + test = ( + ("https://www.imagevenue.com/ME13LS07", { + "pattern": r"https://cdn-images\.imagevenue\.com" + 
r"/10/ac/05/ME13LS07_o\.png", + "keyword": "ae15d6e3b2095f019eee84cd896700cd34b09c36", + "content": "cfaa8def53ed1a575e0c665c9d6d8cf2aac7a0ee", + }), + (("https://www.imagevenue.com/view/o?i=92518_13732377" + "annakarina424200712535AM_122_486lo.jpg&h=img150&l=loc486"), { + "url": "8bf0254e29250d8f5026c0105bbdda3ee3d84980", + }), + (("http://img28116.imagevenue.com/img.php" + "?image=th_52709_test_122_64lo.jpg"), { + "url": "f98e3091df7f48a05fb60fbd86f789fc5ec56331", + }), + ) def get_info(self, page): - url = text.extract(page, "SRC='", "'")[0] - return text.urljoin(self.page_url, url), url + pos = page.index('class="card-body') + url, pos = text.extract(page, '<img src="', '"', pos) + filename, pos = text.extract(page, 'alt="', '"', pos) + return url, text.unescape(filename) class ImagetwistImageExtractor(ImagehostImageExtractor): diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index b015556..28b5506 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -49,7 +49,7 @@ class InstagramExtractor(Extractor): if "__typename" in post: post = self._parse_post_graphql(post) else: - post = self._parse_post_reel(post) + post = self._parse_post_api(post) post.update(data) files = post.pop("_files") @@ -239,16 +239,23 @@ class InstagramExtractor(Extractor): return data - def _parse_post_reel(self, post): + def _parse_post_api(self, post): if "media" in post: media = post["media"] owner = media["user"] - post["items"] = (media,) data = { "post_id" : media["pk"], "post_shortcode": self._shortcode_from_id(media["pk"]), } + + if "carousel_media" in media: + post["items"] = media["carousel_media"] + data["sidecar_media_id"] = data["post_id"] + data["sidecar_shortcode"] = data["post_shortcode"] + else: + post["items"] = (media,) + else: reel_id = str(post["id"]).rpartition(":")[2] owner = post["user"] @@ -279,9 +286,11 @@ class InstagramExtractor(Extractor): files.append({ "num" : num, - "date" : 
text.parse_timestamp(item["taken_at"]), + "date" : text.parse_timestamp(item.get("taken_at") or + media.get("taken_at")), "media_id" : item["pk"], - "shortcode" : item["code"], + "shortcode" : (item.get("code") or + self._shortcode_from_id(item["pk"])), "display_url": image["url"], "video_url" : video["url"] if video else None, "width" : media["width"], @@ -485,18 +494,42 @@ class InstagramTagExtractor(InstagramExtractor): }) def metadata(self): - return {"tag": self.item} + return {"tag": text.unquote(self.item)} def posts(self): url = "{}/explore/tags/{}/".format(self.root, self.item) - data = self._extract_shared_data(url) - hashtag = data["entry_data"]["TagPage"][0]["graphql"]["hashtag"] + page = self._extract_shared_data(url)["entry_data"]["TagPage"][0] + if "data" in page: + return self._pagination_sections(page["data"]["recent"]) + + hashtag = page["graphql"]["hashtag"] query_hash = "9b498c08113f1e09617a1703c22b2f32" variables = {"tag_name": hashtag["name"], "first": 50} edge = self._get_edge_data(hashtag, "edge_hashtag_to_media") return self._pagination_graphql(query_hash, variables, edge) + def _pagination_sections(self, info): + endpoint = "/v1/tags/instagram/sections/" + data = { + "include_persistent": "0", + "max_id" : None, + "page" : None, + "surface": "grid", + "tab" : "recent", + } + + while True: + for section in info["sections"]: + yield from section["layout_content"]["medias"] + + if not info.get("more_available"): + return + + data["max_id"] = info["next_max_id"] + data["page"] = info["next_page"] + info = self._request_api(endpoint, method="POST", data=data) + def _pagination_graphql(self, query_hash, variables, data): while True: for edge in data["edges"]: @@ -619,7 +652,7 @@ class InstagramPostExtractor(InstagramExtractor): ) def posts(self): - query_hash = "971f52b26328008c768b7d8e4ac9ce3c" + query_hash = "1f950d414a6e11c98c556aa007b3157d" variables = { "shortcode" : self.item, "child_comment_count" : 3, diff --git 
a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 8c51d5d..7218488 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -10,6 +10,7 @@ from .common import Extractor, Message from .. import text +import itertools import re BASE_PATTERN = r"(?:https?://)?kemono\.party/([^/?#]+)/user/([^/?#]+)" @@ -22,17 +23,32 @@ class KemonopartyExtractor(Extractor): directory_fmt = ("{category}", "{service}", "{user}") filename_fmt = "{id}_{title}_{num:>02}_{filename}.{extension}" archive_fmt = "{service}_{user}_{id}_{num}" + cookiedomain = ".kemono.party" + _warning = True def items(self): + if self._warning: + if not self._check_cookies(("__ddg1", "__ddg2")): + self.log.warning("no DDoS-GUARD cookies set (__ddg1, __ddg2)") + KemonopartyExtractor._warning = False + find_inline = re.compile(r'src="(/inline/[^"]+)').findall + skip_service = \ + "patreon" if self.config("patreon-skip-file", True) else None if self.config("metadata"): username = text.unescape(text.extract( - self.request(self.user_url).text, "<title>", " | Kemono<")[0]) + self.request(self.user_url).text, "<title>", " | Kemono" + )[0]).lstrip() else: username = None - for post in self.posts(): + posts = self.posts() + max_posts = self.config("max-posts") + if max_posts: + posts = itertools.islice(posts, max_posts) + + for post in posts: files = [] append = files.append @@ -40,7 +56,8 @@ class KemonopartyExtractor(Extractor): if file: file["type"] = "file" - append(file) + if post["service"] != skip_service or not post["attachments"]: + append(file) for attachment in post["attachments"]: attachment["type"] = "attachment" append(attachment) @@ -68,24 +85,30 @@ class KemonopartyExtractor(Extractor): class KemonopartyUserExtractor(KemonopartyExtractor): """Extractor for all posts from a kemono.party user listing""" subcategory = "user" - pattern = BASE_PATTERN + r"/?(?:$|[?#])" + pattern = BASE_PATTERN + r"/?(?:\?o=(\d+))?(?:$|[?#])" test = ( 
("https://kemono.party/fanbox/user/6993449", { "range": "1-25", "count": 25, }), + # 'max-posts' option, 'o' query parameter (#1674) + ("https://kemono.party/patreon/user/881792?o=150", { + "options": (("max-posts", 25),), + "count": "< 100", + }), ("https://kemono.party/subscribestar/user/alcorart"), ) def __init__(self, match): KemonopartyExtractor.__init__(self, match) - service, user_id = match.groups() + service, user_id, offset = match.groups() self.api_url = "{}/api/{}/user/{}".format(self.root, service, user_id) self.user_url = "{}/{}/user/{}".format(self.root, service, user_id) + self.offset = text.parse_int(offset) def posts(self): url = self.api_url - params = {"o": 0} + params = {"o": self.offset} while True: posts = self.request(url, params=params).json() @@ -133,6 +156,16 @@ class KemonopartyPostExtractor(KemonopartyExtractor): "pattern": r"https://data\.kemono\.party/(file|attachment)s" r"/gumroad/trylsc/IURjT/", }), + # username (#1548, #1652) + ("https://kemono.party/gumroad/user/3252870377455/post/aJnAH", { + "options": (("metadata", True),), + "keyword": {"username": "Kudalyn's Creations"}, + }), + # skip patreon main file (#1667, #1689) + ("https://kemono.party/patreon/user/4158582/post/32099982", { + "count": 2, + "keyword": {"type": "attachment"}, + }), ("https://kemono.party/subscribestar/user/alcorart/post/184330"), ) diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index 0fe46b1..a8241dc 100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -11,6 +11,7 @@ from .common import Extractor, Message from .. 
import text, util, exception from ..cache import cache, memcache +from ..version import __version__ from collections import defaultdict BASE_PATTERN = r"(?:https?://)?(?:www\.)?mangadex\.(?:org|cc)" @@ -27,6 +28,7 @@ class MangadexExtractor(Extractor): archive_fmt = "{chapter_id}_{page}" root = "https://mangadex.org" _cache = {} + _headers = {"User-Agent": "gallery-dl/" + __version__} def __init__(self, match): Extractor.__init__(self, match) @@ -116,6 +118,7 @@ class MangadexChapterExtractor(MangadexExtractor): yield Message.Directory, data cattributes = chapter["data"]["attributes"] + data["_http_headers"] = self._headers base = "{}/data/{}/".format( self.api.athome_server(self.uuid)["baseUrl"], cattributes["hash"]) for data["page"], page in enumerate(cattributes["data"], 1): @@ -170,7 +173,7 @@ class MangadexAPI(): def __init__(self, extr): self.extractor = extr - self.headers = {} + self.headers = extr._headers.copy() self.username, self.password = self.extractor._get_auth_info() if not self.username: diff --git a/gallery_dl/extractor/mangafox.py b/gallery_dl/extractor/mangafox.py index a9d504e..f6514ca 100644 --- a/gallery_dl/extractor/mangafox.py +++ b/gallery_dl/extractor/mangafox.py @@ -6,17 +6,21 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for from https://fanfox.net/""" +"""Extractors for https://fanfox.net/""" -from .common import ChapterExtractor +from .common import ChapterExtractor, MangaExtractor from .. 
import text +import re + +BASE_PATTERN = r"(?:https?://)?(?:www\.|m\.)?(?:fanfox\.net|mangafox\.me)" class MangafoxChapterExtractor(ChapterExtractor): - """Extractor for manga-chapters from fanfox.net""" + """Extractor for manga chapters from fanfox.net""" category = "mangafox" - pattern = (r"(?:https?://)?(?:www\.|m\.)?(?:fanfox\.net|mangafox\.me)" - r"(/manga/[^/]+/((?:v([^/?#]+)/)?c(\d+)([^/?#]*)))") + root = "https://m.fanfox.net" + pattern = BASE_PATTERN + \ + r"(/manga/[^/?#]+/((?:v([^/?#]+)/)?c(\d+)([^/?#]*)))" test = ( ("http://fanfox.net/manga/kidou_keisatsu_patlabor/v05/c006.2/1.html", { "keyword": "5661dab258d42d09d98f194f7172fb9851a49766", @@ -25,7 +29,6 @@ class MangafoxChapterExtractor(ChapterExtractor): ("http://mangafox.me/manga/kidou_keisatsu_patlabor/v05/c006.2/"), ("http://fanfox.net/manga/black_clover/vTBD/c295/1.html"), ) - root = "https://m.fanfox.net" def __init__(self, match): base, self.cstr, self.volume, self.chapter, self.minor = match.groups() @@ -60,3 +63,55 @@ class MangafoxChapterExtractor(ChapterExtractor): pnum += 2 page = self.request("{}/{}.html".format(self.urlbase, pnum)).text + + +class MangafoxMangaExtractor(MangaExtractor): + """Extractor for manga from fanfox.net""" + category = "mangafox" + root = "https://m.fanfox.net" + chapterclass = MangafoxChapterExtractor + pattern = BASE_PATTERN + r"(/manga/[^/?#]+)/?$" + test = ( + ("https://fanfox.net/manga/kanojo_mo_kanojo", { + "pattern": MangafoxChapterExtractor.pattern, + "count": ">=60", + }), + ("https://mangafox.me/manga/shangri_la_frontier", { + "pattern": MangafoxChapterExtractor.pattern, + "count": ">=45", + }), + ("https://m.fanfox.net/manga/sentai_daishikkaku"), + ) + + def chapters(self, page): + match_info = re.compile(r"Ch (\d+)(\S*)(?: (.*))?").match + manga, pos = text.extract(page, '<p class="title">', '</p>') + author, pos = text.extract(page, '<p>Author(s):', '</p>', pos) + data = { + "manga" : text.unescape(manga), + "author" : text.remove_html(author), + 
"lang" : "en", + "language": "English", + } + + results = [] + pos = page.index('<dd class="chlist">') + while True: + url, pos = text.extract(page, '<a href="//', '"', pos) + if url == 'mangafox.la?f=mobile': + return results + info, pos = text.extract(page, '>', '<span', pos) + date, pos = text.extract(page, 'right">', '</span>', pos) + + match = match_info(text.unescape(info)) + if match: + chapter, minor, title = match.groups() + chapter_minor = minor + else: + chapter, _, minor = url[:-7].rpartition("/c")[2].partition(".") + chapter_minor = "." + minor + + data["chapter"] = text.parse_int(chapter) + data["chapter_minor"] = chapter_minor if minor else "" + data["date"] = date + results.append(("https://" + url, data.copy())) diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py index 9b6d4ba..4bd5572 100644 --- a/gallery_dl/extractor/mangapark.py +++ b/gallery_dl/extractor/mangapark.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2020 Mike Fährmann +# Copyright 2015-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -56,7 +56,7 @@ class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor): test = ( ("https://mangapark.net/manga/gosu/i811653/c055/1", { "count": 50, - "keyword": "8344bdda8cd8414e7729a4e148379f147e3437da", + "keyword": "db1ed9af4f972756a25dbfa5af69a8f155b043ff", }), (("https://mangapark.net/manga" "/ad-astra-per-aspera-hata-kenjirou/i662051/c001.2/1"), { @@ -121,7 +121,7 @@ class MangaparkMangaExtractor(MangaparkBase, MangaExtractor): r"(/manga/[^/?#]+)/?$") test = ( ("https://mangapark.net/manga/aria", { - "url": "f07caf0bc5097c9b32c8c0d6f446bce1bf4bd329", + "url": "b8f7db2f581404753c4af37af66c049a41273b94", "keyword": "2c0d28efaf84fcfe62932b6931ef3c3987cd48c0", }), ("https://mangapark.me/manga/aria"), diff --git a/gallery_dl/extractor/mangasee.py b/gallery_dl/extractor/mangasee.py new 
file mode 100644 index 0000000..1b3dd18 --- /dev/null +++ b/gallery_dl/extractor/mangasee.py @@ -0,0 +1,112 @@ +# -*- coding: utf-8 -*- + +# Copyright 2021 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://mangasee123.com/""" + +from .common import ChapterExtractor, MangaExtractor +from .. import text +import json + + +class MangaseeBase(): + category = "mangasee" + browser = "firefox" + root = "https://mangasee123.com" + + @staticmethod + def _transform_chapter(data): + chapter = data["Chapter"] + return { + "title" : data["ChapterName"] or "", + "index" : chapter[0], + "chapter" : int(chapter[1:-1]), + "chapter_minor": "" if chapter[-1] == "0" else "." + chapter[-1], + "chapter_string": chapter, + "lang" : "en", + "language": "English", + "date" : text.parse_datetime( + data["Date"], "%Y-%m-%d %H:%M:%S"), + } + + +class MangaseeChapterExtractor(MangaseeBase, ChapterExtractor): + pattern = r"(?:https?://)?mangasee123\.com(/read-online/[^/?#]+\.html)" + test = (("https://mangasee123.com/read-online" + "/Tokyo-Innocent-chapter-4.5-page-1.html"), { + "pattern": r"https://[^/]+/manga/Tokyo-Innocent/0004\.5-00\d\.png", + "count": 8, + "keyword": { + "chapter": 4, + "chapter_minor": ".5", + "chapter_string": "100045", + "count": 8, + "date": "dt:2020-01-20 21:52:53", + "extension": "png", + "filename": r"re:0004\.5-00\d", + "index": "1", + "lang": "en", + "language": "English", + "manga": "Tokyo Innocent", + "page": int, + "title": "", + }, + }) + + def metadata(self, page): + extr = text.extract_from(page) + self.chapter = data = json.loads(extr("vm.CurChapter =", ";\r\n")) + self.domain = extr('vm.CurPathName = "', '"') + self.slug = extr('vm.IndexName = "', '"') + + data = self._transform_chapter(data) + data["manga"] = extr('vm.SeriesName = "', '"') + return data + + def images(self, 
page): + chapter = self.chapter["Chapter"][1:] + if chapter[-1] == "0": + chapter = chapter[:-1] + else: + chapter = chapter[:-1] + "." + chapter[-1] + + base = "https://{}/manga/{}/".format(self.domain, self.slug) + if self.chapter["Directory"]: + base += self.chapter["Directory"] + "/" + base += chapter + "-" + + return [ + ("{}{:>03}.png".format(base, i), None) + for i in range(1, int(self.chapter["Page"]) + 1) + ] + + +class MangaseeMangaExtractor(MangaseeBase, MangaExtractor): + chapterclass = MangaseeChapterExtractor + pattern = r"(?:https?://)?mangasee123\.com(/manga/[^/?#]+)" + test = (("https://mangasee123.com/manga" + "/Nakamura-Koedo-To-Daizu-Keisuke-Wa-Umaku-Ikanai"), { + "pattern": MangaseeChapterExtractor.pattern, + "count": ">= 17", + }) + + def chapters(self, page): + slug, pos = text.extract(page, 'vm.IndexName = "', '"') + chapters = json.loads(text.extract( + page, "vm.Chapters = ", ";\r\n", pos)[0]) + + result = [] + for data in map(self._transform_chapter, chapters): + url = "{}/read-online/{}-chapter-{}{}".format( + self.root, slug, data["chapter"], data["chapter_minor"]) + if data["index"] != "1": + url += "-index-" + data["index"] + url += "-page-1.html" + + data["manga"] = slug + result.append((url, data)) + return result diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index daa3d65..ff0bfc3 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -6,7 +6,7 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for mastodon instances""" +"""Extractors for Mastodon instances""" from .common import BaseExtractor, Message from .. 
import text, exception @@ -25,30 +25,37 @@ class MastodonExtractor(BaseExtractor): BaseExtractor.__init__(self, match) self.instance = self.root.partition("://")[2] self.item = match.group(match.lastindex) + self.reblogs = self.config("reblogs", False) + self.replies = self.config("replies", True) def items(self): for status in self.statuses(): + + if not self.reblogs and status["reblog"]: + self.log.debug("Skipping %s (reblog)", status["id"]) + continue + if not self.replies and status["in_reply_to_id"]: + self.log.debug("Skipping %s (reply)", status["id"]) + continue + attachments = status["media_attachments"] - if attachments: - self.prepare(status) - yield Message.Directory, status - for media in attachments: - status["media"] = media - url = media["url"] - yield Message.Url, url, text.nameext_from_url(url, status) + del status["media_attachments"] + + status["instance"] = self.instance + status["tags"] = [tag["name"] for tag in status["tags"]] + status["date"] = text.parse_datetime( + status["created_at"][:19], "%Y-%m-%dT%H:%M:%S") + + yield Message.Directory, status + for media in attachments: + status["media"] = media + url = media["url"] + yield Message.Url, url, text.nameext_from_url(url, status) def statuses(self): - """Return an iterable containing all relevant Status-objects""" + """Return an iterable containing all relevant Status objects""" return () - def prepare(self, status): - """Prepare a status object""" - del status["media_attachments"] - status["instance"] = self.instance - status["tags"] = [tag["name"] for tag in status["tags"]] - status["date"] = text.parse_datetime( - status["created_at"][:19], "%Y-%m-%dT%H:%M:%S") - INSTANCES = { "mastodon.social": { @@ -97,6 +104,7 @@ class MastodonUserExtractor(MastodonExtractor): def statuses(self): api = MastodonAPI(self) + username = self.item handle = "@{}@{}".format(username, self.instance) for account in api.account_search(handle, 1): @@ -104,7 +112,12 @@ class 
MastodonUserExtractor(MastodonExtractor): break else: raise exception.NotFoundError("account") - return api.account_statuses(account["id"]) + + return api.account_statuses( + account["id"], + only_media=not self.config("text-posts", False), + exclude_replies=not self.replies, + ) class MastodonStatusExtractor(MastodonExtractor): @@ -130,8 +143,8 @@ class MastodonStatusExtractor(MastodonExtractor): class MastodonAPI(): """Minimal interface for the Mastodon API + https://docs.joinmastodon.org/ https://github.com/tootsuite/mastodon - https://github.com/tootsuite/documentation/blob/master/Using-the-API/API.md """ def __init__(self, extractor): @@ -153,15 +166,17 @@ class MastodonAPI(): self.headers = {"Authorization": "Bearer " + access_token} def account_search(self, query, limit=40): - """Search for content""" + """Search for accounts""" endpoint = "/v1/accounts/search" params = {"q": query, "limit": limit} return self._call(endpoint, params).json() - def account_statuses(self, account_id): - """Get an account's statuses""" + def account_statuses(self, account_id, only_media=True, + exclude_replies=False): + """Fetch an account's statuses""" endpoint = "/v1/accounts/{}/statuses".format(account_id) - params = {"only_media": "1"} + params = {"only_media" : "1" if only_media else "0", + "exclude_replies": "1" if exclude_replies else "0"} return self._pagination(endpoint, params) def status(self, status_id): @@ -202,6 +217,7 @@ class MastodonAPI(): if not url: return url = url["url"] + params = None @cache(maxage=100*365*24*3600, keyarg=0) diff --git a/gallery_dl/extractor/moebooru.py b/gallery_dl/extractor/moebooru.py index df77110..604d65c 100644 --- a/gallery_dl/extractor/moebooru.py +++ b/gallery_dl/extractor/moebooru.py @@ -32,7 +32,7 @@ class MoebooruExtractor(BooruExtractor): html = text.extract(page, '<ul id="tag-', '</ul>')[0] if html: tags = collections.defaultdict(list) - pattern = re.compile(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"']+)") + pattern = 
re.compile(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'+]+)") for tag_type, tag_name in pattern.findall(html): tags[tag_type].append(text.unquote(tag_name)) for key, value in tags.items(): diff --git a/gallery_dl/extractor/naverwebtoon.py b/gallery_dl/extractor/naverwebtoon.py index 1da3e49..348f6a1 100644 --- a/gallery_dl/extractor/naverwebtoon.py +++ b/gallery_dl/extractor/naverwebtoon.py @@ -105,5 +105,5 @@ class NaverwebtoonComicExtractor(NaverwebtoonBase, Extractor): return [ self.root + "/webtoon/detail.nhn?" + query for query in text.extract_iter( - page, '<a href="/webtoon/detail.nhn?', '"') + page, '<a href="/webtoon/detail?', '"') ][::2] diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py index 64fc938..c6c885c 100644 --- a/gallery_dl/extractor/philomena.py +++ b/gallery_dl/extractor/philomena.py @@ -107,11 +107,11 @@ class PhilomenaPostExtractor(PhilomenaExtractor): "source_url": "https://www.deviantart.com/speccysy/art" "/Afternoon-Flight-215193985", "spoilered": False, - "tag_count": 38, + "tag_count": 39, "tag_ids": list, "tags": list, "thumbnails_generated": True, - "updated_at": "2021-05-28T17:39:38Z", + "updated_at": "2021-07-13T14:22:40Z", "uploader": "Clover the Clever", "uploader_id": 211188, "upvotes": int, diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index ff07a57..aefe644 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -6,10 +6,10 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images and ugoira from https://www.pixiv.net/""" +"""Extractors for https://www.pixiv.net/""" from .common import Extractor, Message -from .. import text, exception +from .. 
import text, util, exception from ..cache import cache from datetime import datetime, timedelta import itertools @@ -517,6 +517,48 @@ class PixivFollowExtractor(PixivExtractor): return {"user_follow": self.api.user} +class PixivPixivisionExtractor(PixivExtractor): + """Extractor for illustrations from a pixivision article""" + subcategory = "pixivision" + directory_fmt = ("{category}", "pixivision", + "{pixivision_id} {pixivision_title}") + archive_fmt = "V{pixivision_id}_{id}{suffix}.{extension}" + pattern = r"(?:https?://)?(?:www\.)?pixivision\.net/(?:en/)?a/(\d+)" + test = ( + ("https://www.pixivision.net/en/a/2791"), + ("https://pixivision.net/a/2791", { + "count": 7, + "keyword": { + "pixivision_id": "2791", + "pixivision_title": "What's your favorite music? Editor’s " + "picks featuring: “CD Covers”!", + }, + }), + ) + + def __init__(self, match): + PixivExtractor.__init__(self, match) + self.pixivision_id = match.group(1) + + def works(self): + return ( + self.api.illust_detail(illust_id) + for illust_id in util.unique_sequence(text.extract_iter( + self.page, '<a href="https://www.pixiv.net/en/artworks/', '"')) + ) + + def metadata(self): + url = "https://www.pixivision.net/en/a/" + self.pixivision_id + headers = {"User-Agent": "Mozilla/5.0"} + self.page = self.request(url, headers=headers).text + + title = text.extract(self.page, '<title>', ' - pixivision<')[0] + return { + "pixivision_id" : self.pixivision_id, + "pixivision_title": text.unescape(title), + } + + class PixivAppAPI(): """Minimal interface for the Pixiv App API for mobile devices diff --git a/gallery_dl/extractor/pururin.py b/gallery_dl/extractor/pururin.py index 49c24bc..dee7bd4 100644 --- a/gallery_dl/extractor/pururin.py +++ b/gallery_dl/extractor/pururin.py @@ -6,20 +6,22 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. 
-"""Extractors for https://pururin.io/""" +"""Extractors for https://pururin.to/""" from .common import GalleryExtractor from .. import text, util +import binascii import json class PururinGalleryExtractor(GalleryExtractor): """Extractor for image galleries on pururin.io""" category = "pururin" - pattern = r"(?:https?://)?(?:www\.)?pururin\.io/(?:gallery|read)/(\d+)" + pattern = r"(?:https?://)?(?:www\.)?pururin\.[ti]o/(?:gallery|read)/(\d+)" test = ( - ("https://pururin.io/gallery/38661/iowant-2", { - "pattern": r"https://cdn.pururin.io/\w+/images/data/\d+/\d+\.jpg", + ("https://pururin.to/gallery/38661/iowant-2", { + "pattern": r"https://cdn.pururin.[ti]o/\w+" + r"/images/data/\d+/\d+\.jpg", "keyword": { "title" : "re:I ?owant 2!!", "title_en" : "re:I ?owant 2!!", @@ -41,11 +43,12 @@ class PururinGalleryExtractor(GalleryExtractor): "language" : "English", } }), - ("https://pururin.io/gallery/7661/unisis-team-vanilla", { + ("https://pururin.to/gallery/7661/unisis-team-vanilla", { "count": 17, }), + ("https://pururin.io/gallery/38661/iowant-2"), ) - root = "https://pururin.io" + root = "https://pururin.to" def __init__(self, match): self.gallery_id = match.group(1) @@ -70,8 +73,8 @@ class PururinGalleryExtractor(GalleryExtractor): url = "{}/read/{}/01/x".format(self.root, self.gallery_id) page = self.request(url).text - info = json.loads(text.unescape(text.extract( - page, ':gallery="', '"')[0])) + info = json.loads(binascii.a2b_base64(text.extract( + page, '<gallery-read encoded="', '"')[0]).decode()) self._ext = info["image_extension"] self._cnt = info["total_pages"] @@ -97,6 +100,6 @@ class PururinGalleryExtractor(GalleryExtractor): return data def images(self, _): - ufmt = "https://cdn.pururin.io/assets/images/data/{}/{{}}.{}".format( + ufmt = "https://cdn.pururin.to/assets/images/data/{}/{{}}.{}".format( self.gallery_id, self._ext) return [(ufmt.format(num), None) for num in range(1, self._cnt + 1)] diff --git a/gallery_dl/extractor/reactor.py 
b/gallery_dl/extractor/reactor.py index c62a942..104dc23 100644 --- a/gallery_dl/extractor/reactor.py +++ b/gallery_dl/extractor/reactor.py @@ -215,7 +215,7 @@ class JoyreactorTagExtractor(ReactorTagExtractor): "count": ">= 15", }), ("http://joyreactor.com/tag/Cirno", { - "url": "de1e60c15bfb07a0e9603b00dc3d05f60edc7914", + "url": "aa59090590b26f4654881301fe8fe748a51625a8", }), ) @@ -243,7 +243,7 @@ class JoyreactorUserExtractor(ReactorUserExtractor): test = ( ("http://joyreactor.cc/user/hemantic"), ("http://joyreactor.com/user/Tacoman123", { - "url": "452cd0fa23e2ad0e122c296ba75aa7f0b29329f6", + "url": "60ce9a3e3db791a0899f7fb7643b5b87d09ae3b5", }), ) @@ -254,23 +254,27 @@ class JoyreactorPostExtractor(ReactorPostExtractor): pattern = JR_BASE_PATTERN + r"/post/(\d+)" test = ( ("http://joyreactor.com/post/3721876", { # single image - "url": "6ce09f239d8b7fdf6dd1664c2afc39618cc87663", - "keyword": "147ed5b9799ba43cbd16168450afcfae5ddedbf3", + "pattern": r"http://img\d\.joyreactor\.com/pics/post/full" + r"/cartoon-painting-monster-lake-4841316.jpeg", + "count": 1, + "keyword": "2207a7dfed55def2042b6c2554894c8d7fda386e", }), ("http://joyreactor.com/post/3713804", { # 4 images - "url": "f08ac8493ca0619a3e3c6bedb8d8374af3eec304", - "keyword": "f12c6f3c2f298fed9b12bd3e70fb823870aa9b93", + "pattern": r"http://img\d\.joyreactor\.com/pics/post/full" + r"/movie-tv-godzilla-monsters-\d+\.jpeg", + "count": 4, + "keyword": "d7da9ba7809004c809eedcf6f1c06ad0fbb3df21", }), ("http://joyreactor.com/post/3726210", { # gif / video - "url": "33a48e1eca6cb2d298fbbb6536b3283799d6515b", - "keyword": "d173cc6e88f02a63904e475eacd7050304eb1967", + "url": "60f3b9a0a3918b269bea9b4f8f1a5ab3c2c550f8", + "keyword": "8949d9d5fc469dab264752432efbaa499561664a", }), ("http://joyreactor.com/post/3668724", { # youtube embed "url": "bf1666eddcff10c9b58f6be63fa94e4e13074214", "keyword": "e18b1ffbd79d76f9a0e90b6d474cc2499e343f0b", }), ("http://joyreactor.cc/post/1299", { # "malformed" JSON - "url": 
"ac900743ed7cf1baf3db3b531c3bc414bf1ffcde", + "url": "ab02c6eb7b4035ad961b29ee0770ee41be2fcc39", }), ) @@ -311,7 +315,7 @@ class PornreactorUserExtractor(ReactorUserExtractor): test = ( ("http://pornreactor.cc/user/Disillusion", { "range": "1-25", - "count": ">= 25", + "count": ">= 20", }), ("http://fapreactor.com/user/Disillusion"), ) @@ -324,10 +328,10 @@ class PornreactorPostExtractor(ReactorPostExtractor): pattern = PR_BASE_PATTERN + r"/post/(\d+)" test = ( ("http://pornreactor.cc/post/863166", { - "url": "680db1e33ca92ff70b2c0e1708c471cbe2201324", + "url": "a09fb0577489e1f9564c25d0ad576f81b19c2ef3", "content": "ec6b0568bfb1803648744077da082d14de844340", }), ("http://fapreactor.com/post/863166", { - "url": "864ecd5785e4898301aa8d054dd653b1165be158", + "url": "2a956ce0c90e8bc47b4392db4fa25ad1342f3e54", }), ) diff --git a/gallery_dl/extractor/seisoparty.py b/gallery_dl/extractor/seisoparty.py new file mode 100644 index 0000000..b736b4b --- /dev/null +++ b/gallery_dl/extractor/seisoparty.py @@ -0,0 +1,142 @@ +# -*- coding: utf-8 -*- + +# Copyright 2021 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://seiso.party/""" + +from .common import Extractor, Message +from .. 
import text +import re + + +class SeisopartyExtractor(Extractor): + """Base class for seisoparty extractors""" + category = "seisoparty" + root = "https://seiso.party" + directory_fmt = ("{category}", "{service}", "{username}") + filename_fmt = "{id}_{title}_{num:>02}_{filename}.{extension}" + archive_fmt = "{service}_{user}_{id}_{num}" + cookiedomain = ".seiso.party" + _warning = True + + def __init__(self, match): + Extractor.__init__(self, match) + self.user_name = None + self._find_files = re.compile( + r'href="(https://cdn(?:-\d)?\.seiso\.party/files/[^"]+)').findall + + def items(self): + if self._warning: + if not self._check_cookies(("__ddg1", "__ddg2")): + self.log.warning("no DDoS-GUARD cookies set (__ddg1, __ddg2)") + SeisopartyExtractor._warning = False + + for post in self.posts(): + files = post.pop("files") + yield Message.Directory, post + for post["num"], url in enumerate(files, 1): + yield Message.Url, url, text.nameext_from_url(url, post) + + def _parse_post(self, page, post_id): + extr = text.extract_from(page) + return { + "service" : self.service, + "user" : self.user_id, + "username": self.user_name, + "id" : post_id, + "date" : text.parse_datetime(extr( + '<div class="margin-bottom-15 minor-text">', '<'), + "%Y-%m-%d %H:%M:%S %Z"), + "title" : text.unescape(extr('class="post-title">', '<')), + "content" : text.unescape(extr("\n<p>\n", "\n</p>\n").strip()), + "files" : self._find_files(page), + } + + +class SeisopartyUserExtractor(SeisopartyExtractor): + """Extractor for all posts from a seiso.party user listing""" + subcategory = "user" + pattern = r"(?:https?://)?seiso\.party/artists/([^/?#]+)/([^/?#]+)" + test = ( + ("https://seiso.party/artists/fanbox/21", { + "pattern": r"https://cdn\.seiso\.party/files/fanbox/\d+/", + "count": ">=15", + "keyword": { + "content": str, + "date": "type:datetime", + "id": r"re:\d+", + "num": int, + "service": "fanbox", + "title": str, + "user": "21", + "username": "雨", + }, + }), + ) + + def __init__(self, 
match): + SeisopartyExtractor.__init__(self, match) + self.service, self.user_id = match.groups() + + def posts(self): + url = "{}/artists/{}/{}".format(self.root, self.service, self.user_id) + page = self.request(url).text + self.user_name, pos = text.extract(page, '<span class="title">', '<') + + url = self.root + text.extract( + page, 'href="', '"', page.index('id="content"', pos))[0] + response = self.request(url) + headers = {"Referer": url} + + while True: + yield self._parse_post(response.text, url.rpartition("/")[2]) + response = self.request(url + "/next", headers=headers) + if url == response.url: + return + url = headers["Referer"] = response.url + + +class SeisopartyPostExtractor(SeisopartyExtractor): + """Extractor for a single seiso.party post""" + subcategory = "post" + pattern = r"(?:https?://)?seiso\.party/post/([^/?#]+)/([^/?#]+)/([^/?#]+)" + test = ( + ("https://seiso.party/post/fanbox/21/371", { + "url": "75f13b92de0ce399b6163c3de18f1f36011c2366", + "count": 2, + "keyword": { + "content": "この前描いためぐるちゃんのPSDファイルです。\n" + "どうぞよろしくお願いします。", + "date": "dt:2021-05-06 12:38:31", + "extension": "re:psd|jpg", + "filename": "re:backcourt|ffb2ccb7a3586d05f9a4620329dd131e", + "id": "371", + "num": int, + "service": "fanbox", + "title": "MEGURU.PSD", + "user": "21", + "username": "雨", + }, + }), + ("https://seiso.party/post/patreon/429/95949", { + "pattern": r"https://cdn-2\.seiso\.party/files/patreon/95949/", + "count": 2, + }), + ) + + def __init__(self, match): + SeisopartyExtractor.__init__(self, match) + self.service, self.user_id, self.post_id = match.groups() + + def posts(self): + url = "{}/artists/{}/{}".format(self.root, self.service, self.user_id) + page = self.request(url).text + self.user_name, pos = text.extract(page, '<span class="title">', '<') + + url = "{}/post/{}/{}/{}".format( + self.root, self.service, self.user_id, self.post_id) + return (self._parse_post(self.request(url).text, self.post_id),) diff --git 
a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py index 83836e5..ae8b58d 100644 --- a/gallery_dl/extractor/subscribestar.py +++ b/gallery_dl/extractor/subscribestar.py @@ -140,8 +140,8 @@ class SubscribestarUserExtractor(SubscribestarExtractor): test = ( ("https://www.subscribestar.com/subscribestar", { "count": ">= 20", - "pattern": r"https://star-uploads.s\d+-us-west-\d+.amazonaws.com" - r"/uploads/users/11/", + "pattern": r"https://(star-uploads|ss-uploads-prod)\.s\d+-us-west-" + r"\d+\.amazonaws\.com/uploads(_v2)?/users/11/", "keyword": { "author_id": 11, "author_name": "subscribestar", diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 5550f96..fd0140d 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -113,18 +113,16 @@ class TwitterExtractor(Extractor): "url" : base + "orig", "width" : width, "height" : height, - "_fallback": self._image_fallback(base, url + ":"), + "_fallback": self._image_fallback(base), })) else: files.append({"url": media["media_url"]}) @staticmethod - def _image_fallback(new, old): - yield old + "orig" - - for size in ("large", "medium", "small"): - yield new + size - yield old + size + def _image_fallback(base): + yield base + "large" + yield base + "medium" + yield base + "small" def _extract_card(self, tweet, files): card = tweet["card"] @@ -486,8 +484,9 @@ class TwitterTweetExtractor(TwitterExtractor): "options": (("retweets", "original"),), "count": 2, "keyword": { - "tweet_id": 1296296016002547713, - "date" : "dt:2020-08-20 04:00:28", + "tweet_id" : 1296296016002547713, + "retweet_id": 1296296016002547713, + "date" : "dt:2020-08-20 04:00:28", }, }), # all Tweets from a conversation (#1319) @@ -526,18 +525,17 @@ class TwitterImageExtractor(Extractor): self.id, self.fmt = match.groups() def items(self): - base = "https://pbs.twimg.com/media/" + self.id - new = base + "?format=" + self.fmt + "&name=" - old = base + "." 
+ self.fmt + ":" + base = "https://pbs.twimg.com/media/{}?format={}&name=".format( + self.id, self.fmt) data = { "filename": self.id, "extension": self.fmt, - "_fallback": TwitterExtractor._image_fallback(new, old), + "_fallback": TwitterExtractor._image_fallback(base), } yield Message.Directory, data - yield Message.Url, new + "orig", data + yield Message.Url, base + "orig", data class TwitterAPI(): @@ -712,7 +710,7 @@ class TwitterAPI(): def _guest_token(self): root = "https://api.twitter.com" endpoint = "/1.1/guest/activate.json" - return self._call(endpoint, None, root, "POST")["guest_token"] + return str(self._call(endpoint, None, root, "POST")["guest_token"]) def _call(self, endpoint, params, root=None, method="GET"): if root is None: @@ -809,6 +807,7 @@ class TwitterAPI(): if original_retweets: if not retweet: continue + retweet["retweeted_status_id_str"] = retweet["id_str"] retweet["_retweet_id_str"] = tweet["id_str"] tweet = retweet elif retweet: diff --git a/gallery_dl/extractor/vk.py b/gallery_dl/extractor/vk.py index 1ce1140..2178641 100644 --- a/gallery_dl/extractor/vk.py +++ b/gallery_dl/extractor/vk.py @@ -17,43 +17,60 @@ class VkPhotosExtractor(Extractor): """Extractor for photos from a vk user""" category = "vk" subcategory = "photos" - directory_fmt = ("{category}", "{user[id]}") + directory_fmt = ("{category}", "{user[name]|user[id]}") filename_fmt = "{id}.{extension}" archive_fmt = "{id}" root = "https://vk.com" request_interval = 1.0 - pattern = r"(?:https://)?(?:www\.|m\.)?vk\.com/(?:albums|photos|id)(\d+)" + pattern = (r"(?:https://)?(?:www\.|m\.)?vk\.com/(?:" + r"(?:albums|photos|id)(-?\d+)|([^/?#]+))") test = ( ("https://vk.com/id398982326", { "pattern": r"https://sun\d+-\d+\.userapi\.com/c\d+/v\d+" r"/[0-9a-f]+/[\w-]+\.jpg", "count": ">= 35", + "keywords": { + "id": r"re:\d+", + "user": { + "id": "398982326", + "info": "Мы за Движуху! 
– m1ni SounD #4 [EROmusic]", + "name": "", + "nick": "Dobrov Kurva", + }, + }, + }), + ("https://vk.com/cosplayinrussia", { + "range": "75-100", + "keywords": { + "id": r"re:\d+", + "user": { + "id" : "-165740836", + "info": "Предложка открыта, кидайте ваши косплейчики. При " + "правильном оформлении они будут опубликованы", + "name": "cosplayinrussia", + "nick": "Косплей | Cosplay 18+", + }, + }, }), ("https://m.vk.com/albums398982326"), ("https://www.vk.com/id398982326?profile=1"), + ("https://vk.com/albums-165740836"), ) def __init__(self, match): Extractor.__init__(self, match) - self.user_id = match.group(1) + self.user_id, self.user_name = match.groups() def items(self): - user_id = self.user_id - - if self.config("metadata"): - url = "{}/id{}".format(self.root, user_id) - extr = text.extract_from(self.request(url).text) - data = {"user": { - "id" : user_id, - "nick": text.unescape(extr( - "<title>", " | VK<")), - "name": text.unescape(extr( - '<h1 class="page_name">', "<")).replace(" ", " "), - "info": text.unescape(text.remove_html(extr( - '<span class="current_text">', '</span'))) - }} + if self.user_id: + user_id = self.user_id + prefix = "public" if user_id[0] == "-" else "id" + url = "{}/{}{}".format(self.root, prefix, user_id.lstrip("-")) + data = self._extract_profile(url) else: - data = {"user": {"id": user_id}} + url = "{}/{}".format(self.root, self.user_name) + data = self._extract_profile(url) + user_id = data["user"]["id"] photos_url = "{}/photos{}".format(self.root, user_id) headers = { @@ -86,3 +103,15 @@ class VkPhotosExtractor(Extractor): if cnt <= 40 or offset == params["offset"]: return params["offset"] = offset + + def _extract_profile(self, url): + extr = text.extract_from(self.request(url).text) + return {"user": { + "name": text.unescape(extr( + 'rel="canonical" href="https://vk.com/', '"')), + "nick": text.unescape(extr( + '<h1 class="page_name">', "<")).replace(" ", " "), + "info": text.unescape(text.remove_html(extr( + '<span 
class="current_text">', '</span'))), + "id" : extr('<a href="/albums', '"'), + }} diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py index cebb421..e2474c9 100644 --- a/gallery_dl/extractor/webtoons.py +++ b/gallery_dl/extractor/webtoons.py @@ -11,7 +11,7 @@ from .common import GalleryExtractor, Extractor, Message from .. import exception, text, util -BASE_PATTERN = r"(?:https?://)?(?:www\.)?webtoons\.com/((en|fr)" +BASE_PATTERN = r"(?:https?://)?(?:www\.)?webtoons\.com/(([^/?#]+)" class WebtoonsBase(): @@ -118,6 +118,9 @@ class WebtoonsComicExtractor(WebtoonsBase, Extractor): "list?title_no=210827&page=9"), { "count": ">= 18", }), + # (#1643) + ("https://www.webtoons.com/es/romance/lore-olympus/" + "list?title_no=1725"), ) def __init__(self, match): diff --git a/gallery_dl/extractor/wikiart.py b/gallery_dl/extractor/wikiart.py index 511a609..9f95e14 100644 --- a/gallery_dl/extractor/wikiart.py +++ b/gallery_dl/extractor/wikiart.py @@ -71,8 +71,8 @@ class WikiartArtistExtractor(WikiartExtractor): directory_fmt = ("{category}", "{artist[artistName]}") pattern = BASE_PATTERN + r"/(?!\w+-by-)([\w-]+)/?$" test = ("https://www.wikiart.org/en/thomas-cole", { - "url": "5140343730331786117fa5f4c013a6153393e28e", - "keyword": "4d9cbc50ebddfcb186f31ff70b08833578dd0070", + "url": "deabec0ed7efa97e2a729ff9d08b539143106bac", + "keyword": "751a5457b71c8704982d3bb6485a214cd3d07bf9", }) def __init__(self, match): diff --git a/gallery_dl/extractor/ytdl.py b/gallery_dl/extractor/ytdl.py new file mode 100644 index 0000000..d380dab --- /dev/null +++ b/gallery_dl/extractor/ytdl.py @@ -0,0 +1,136 @@ +# -*- coding: utf-8 -*- + +# Copyright 2021 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for sites supported by youtube-dl""" + +from .common import Extractor, Message +from .. 
import text, config, exception + + +class YoutubeDLExtractor(Extractor): + """Generic extractor for youtube-dl supported URLs""" + category = "ytdl" + directory_fmt = ("{category}", "{subcategory}") + filename_fmt = "{title}-{id}.{extension}" + archive_fmt = "{extractor_key} {id}" + pattern = r"ytdl:(.*)" + test = ("ytdl:https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9",) + + def __init__(self, match): + # import main youtube_dl module + module_name = self.ytdl_module_name = config.get( + ("extractor", "ytdl"), "module") or "youtube_dl" + module = __import__(module_name) + + # find suitable youtube_dl extractor + self.ytdl_url = url = match.group(1) + generic = config.interpolate(("extractor", "ytdl"), "generic", True) + if generic == "force": + self.ytdl_ie_key = "Generic" + self.force_generic_extractor = True + else: + for ie in module.extractor.gen_extractor_classes(): + if ie.suitable(url): + self.ytdl_ie_key = ie.ie_key() + break + if not generic and self.ytdl_ie_key == "Generic": + raise exception.NoExtractorError() + self.force_generic_extractor = False + + # set subcategory to youtube_dl extractor's key + self.subcategory = self.ytdl_ie_key + Extractor.__init__(self, match) + + def items(self): + # import subcategory module + ytdl_module = __import__( + config.get(("extractor", "ytdl", self.subcategory), "module") or + self.ytdl_module_name) + self.log.debug("Using %s", ytdl_module) + + # construct YoutubeDL object + options = { + "format" : self.config("format"), + "retries" : self._retries, + "socket_timeout" : self._timeout, + "nocheckcertificate" : not self._verify, + "proxy" : self.session.proxies.get("http"), + "force_generic_extractor": self.force_generic_extractor, + "nopart" : not self.config("part", True), + "updatetime" : self.config("mtime", True), + "ratelimit" : text.parse_bytes( + self.config("rate"), None), + "min_filesize" : text.parse_bytes( + self.config("filesize-min"), None), + "max_filesize" : text.parse_bytes( + 
self.config("filesize-max"), None), + } + + raw_options = self.config("raw-options") + if raw_options: + options.update(raw_options) + if self.config("logging", True): + options["logger"] = self.log + options["extract_flat"] = "in_playlist" + + username, password = self._get_auth_info() + if username: + options["username"], options["password"] = username, password + del username, password + + ytdl = ytdl_module.YoutubeDL(options) + + # transfer cookies to ytdl + cookies = self.session.cookies + if cookies: + set_cookie = self.ytdl.cookiejar.set_cookie + for cookie in self.session.cookies: + set_cookie(cookie) + + # extract youtube_dl info_dict + info_dict = ytdl._YoutubeDL__extract_info( + self.ytdl_url, + ytdl.get_info_extractor(self.ytdl_ie_key), + False, {}, True) + + if "entries" in info_dict: + results = self._process_entries(ytdl, info_dict["entries"]) + else: + results = (info_dict,) + + # yield results + for info_dict in results: + info_dict["extension"] = None + info_dict["_ytdl_info_dict"] = info_dict + info_dict["_ytdl_instance"] = ytdl + + url = "ytdl:" + (info_dict.get("url") or + info_dict.get("webpage_url") or + self.ytdl_url) + + yield Message.Directory, info_dict + yield Message.Url, url, info_dict + + def _process_entries(self, ytdl, entries): + for entry in entries: + if entry.get("_type") in ("url", "url_transparent"): + info_dict = ytdl.extract_info( + entry["url"], False, + ie_key=entry.get("ie_key")) + if "entries" in info_dict: + yield from self._process_entries( + ytdl, info_dict["entries"]) + else: + yield info_dict + else: + yield entry + + +if config.get(("extractor", "ytdl"), "enabled"): + # make 'ytdl:' prefix optional + YoutubeDLExtractor.pattern = r"(?:ytdl:)?(.*)" diff --git a/gallery_dl/job.py b/gallery_dl/job.py index dddc03a..953d9c3 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -33,6 +33,7 @@ class Job(): self.pathfmt = None self.kwdict = {} self.status = 0 + self.url_key = extr.config("url-metadata") 
self._logger_extra = { "job" : self, @@ -57,7 +58,7 @@ class Job(): extr.session.adapters = pextr.session.adapters # user-supplied metadata - kwdict = self.extractor.config("keywords") + kwdict = extr.config("keywords") if kwdict: self.kwdict.update(kwdict) @@ -106,19 +107,23 @@ class Job(): def dispatch(self, msg): """Call the appropriate message handler""" if msg[0] == Message.Url: - _, url, kwds = msg - if self.pred_url(url, kwds): - self.update_kwdict(kwds) - self.handle_url(url, kwds) + _, url, kwdict = msg + if self.url_key: + kwdict[self.url_key] = url + if self.pred_url(url, kwdict): + self.update_kwdict(kwdict) + self.handle_url(url, kwdict) elif msg[0] == Message.Directory: self.update_kwdict(msg[1]) self.handle_directory(msg[1]) elif msg[0] == Message.Queue: - _, url, kwds = msg - if self.pred_queue(url, kwds): - self.handle_queue(url, kwds) + _, url, kwdict = msg + if self.url_key: + kwdict[self.url_key] = url + if self.pred_queue(url, kwdict): + self.handle_queue(url, kwdict) elif msg[0] == Message.Version: if msg[1] != 1: @@ -302,11 +307,18 @@ class DownloadJob(Job): else: extr._parentdir = pextr._parentdir - if pextr.config("parent-metadata"): - if self.kwdict: - job.kwdict.update(self.kwdict) - if kwdict: - job.kwdict.update(kwdict) + pmeta = pextr.config("parent-metadata") + if pmeta: + if isinstance(pmeta, str): + data = self.kwdict.copy() + if kwdict: + data.update(kwdict) + job.kwdict[pmeta] = data + else: + if self.kwdict: + job.kwdict.update(self.kwdict) + if kwdict: + job.kwdict.update(kwdict) if pextr.config("parent-skip"): job._skipcnt = self._skipcnt @@ -626,7 +638,7 @@ class UrlJob(Job): extr = extractor.find(url) if extr: - self.status |= self.__class__(extr, self).run() + self.status |= self.__class__(extr, self, self.depth + 1).run() else: self._write_unsupported(url) diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py index ef1d304..c721612 100644 --- a/gallery_dl/postprocessor/metadata.py +++ 
b/gallery_dl/postprocessor/metadata.py @@ -89,7 +89,7 @@ class MetadataPP(PostProcessor): ext = kwdict.get("extension") kwdict["extension"] = pathfmt.extension kwdict["extension"] = pathfmt.prefix + self._extension_fmt(kwdict) - filename = pathfmt.build_filename() + filename = pathfmt.build_filename(kwdict) kwdict["extension"] = ext return filename diff --git a/gallery_dl/util.py b/gallery_dl/util.py index fbede3e..3462138 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -145,6 +145,14 @@ def to_string(value): return str(value) +def to_timestamp(dt): + """Convert naive datetime to UTC timestamp string""" + try: + return str((dt - EPOCH) // SECOND) + except Exception: + return "" + + def dump_json(obj, fp=sys.stdout, ensure_ascii=True, indent=4): """Serialize 'obj' as JSON and write it to 'fp'""" json.dump( @@ -370,6 +378,8 @@ class UniversalNone(): NONE = UniversalNone() +EPOCH = datetime.datetime(1970, 1, 1) +SECOND = datetime.timedelta(0, 1) WINDOWS = (os.name == "nt") SENTINEL = object() SPECIAL_EXTRACTORS = {"oauth", "recursive", "test"} @@ -390,11 +400,17 @@ def compile_expression(expr, name="<expr>", globals=GLOBALS): def build_predicate(predicates): if not predicates: - return lambda url, kwds: True + return lambda url, kwdict: True elif len(predicates) == 1: return predicates[0] - else: - return ChainPredicate(predicates) + return functools.partial(chain_predicates, predicates) + + +def chain_predicates(predicates, url, kwdict): + for pred in predicates: + if not pred(url, kwdict): + return False + return True class RangePredicate(): @@ -408,7 +424,7 @@ class RangePredicate(): else: self.lower, self.upper = 0, 0 - def __call__(self, url, kwds): + def __call__(self, url, _): self.index += 1 if self.index > self.upper: @@ -473,7 +489,7 @@ class UniquePredicate(): def __init__(self): self.urls = set() - def __call__(self, url, kwds): + def __call__(self, url, _): if url.startswith("text:"): return True if url not in self.urls: @@ -498,18 +514,6 @@ 
class FilterPredicate(): raise exception.FilterError(exc) -class ChainPredicate(): - """Predicate; True if all of its predicates return True""" - def __init__(self, predicates): - self.predicates = predicates - - def __call__(self, url, kwds): - for pred in self.predicates: - if not pred(url, kwds): - return False - return True - - class ExtendedUrl(): """URL with attached config key-value pairs""" def __init__(self, url, gconf, lconf): @@ -536,6 +540,7 @@ class Formatter(): - "d": calls text.parse_timestamp - "U": calls urllib.parse.unquote - "S": calls util.to_string() + - "T": calls util.to_timestamp() - Example: {f!l} -> "example"; {f!u} -> "EXAMPLE" Extra Format Specifiers: @@ -559,12 +564,14 @@ class Formatter(): Replaces all occurrences of <old> with <new> Example: {f:R /_/} -> "f_o_o_b_a_r" (if "f" is "f o o b a r") """ + CACHE = {} CONVERSIONS = { "l": str.lower, "u": str.upper, "c": str.capitalize, "C": string.capwords, "t": str.strip, + "T": to_timestamp, "d": text.parse_timestamp, "U": urllib.parse.unquote, "S": to_string, @@ -575,19 +582,26 @@ class Formatter(): def __init__(self, format_string, default=None): self.default = default - self.result = [] - self.fields = [] - - for literal_text, field_name, format_spec, conversion in \ - _string.formatter_parser(format_string): - if literal_text: - self.result.append(literal_text) - if field_name: - self.fields.append(( - len(self.result), - self._field_access(field_name, format_spec, conversion), - )) - self.result.append("") + key = (format_string, default) + + try: + self.result, self.fields = self.CACHE[key] + except KeyError: + self.result = [] + self.fields = [] + + for literal_text, field_name, format_spec, conv in \ + _string.formatter_parser(format_string): + if literal_text: + self.result.append(literal_text) + if field_name: + self.fields.append(( + len(self.result), + self._field_access(field_name, format_spec, conv), + )) + self.result.append("") + + self.CACHE[key] = (self.result, 
self.fields) if len(self.result) == 1: if self.fields: @@ -777,9 +791,20 @@ class PathFormat(): raise exception.FilenameFormatError(exc) directory_fmt = config("directory") - if directory_fmt is None: - directory_fmt = extractor.directory_fmt try: + if directory_fmt is None: + directory_fmt = extractor.directory_fmt + elif isinstance(directory_fmt, dict): + self.directory_conditions = [ + (compile_expression(expr), [ + Formatter(fmt, kwdefault).format_map + for fmt in fmts + ]) + for expr, fmts in directory_fmt.items() if expr + ] + self.build_directory = self.build_directory_conditional + directory_fmt = directory_fmt.get("", extractor.directory_fmt) + self.directory_formatters = [ Formatter(dirfmt, kwdefault).format_map for dirfmt in directory_fmt @@ -793,19 +818,6 @@ class PathFormat(): self.path = self.realpath = self.temppath = "" self.delete = self._create_directory = False - basedir = extractor._parentdir - if not basedir: - basedir = config("base-directory") - if basedir is None: - basedir = "." + os.sep + "gallery-dl" + os.sep - elif basedir: - basedir = expand_path(basedir) - if os.altsep and os.altsep in basedir: - basedir = basedir.replace(os.altsep, os.sep) - if basedir[-1] != os.sep: - basedir += os.sep - self.basedirectory = basedir - extension_map = config("extension-map") if extension_map is None: extension_map = self.EXTENSION_MAP @@ -826,6 +838,22 @@ class PathFormat(): remove = config("path-remove", "\x00-\x1f\x7f") self.clean_path = self._build_cleanfunc(remove, "") + basedir = extractor._parentdir + if not basedir: + basedir = config("base-directory") + sep = os.sep + if basedir is None: + basedir = "." 
+ sep + "gallery-dl" + sep + elif basedir: + basedir = expand_path(basedir) + altsep = os.altsep + if altsep and altsep in basedir: + basedir = basedir.replace(altsep, sep) + if basedir[-1] != sep: + basedir += sep + basedir = self.clean_path(basedir) + self.basedirectory = basedir + @staticmethod def _build_cleanfunc(chars, repl): if not chars: @@ -837,8 +865,8 @@ class PathFormat(): def func(x, c=chars, r=repl): return x.replace(c, r) else: - def func(x, sub=re.compile("[" + chars + "]").sub, r=repl): - return sub(r, x) + return functools.partial( + re.compile("[" + chars + "]").sub, repl) return func def open(self, mode="wb"): @@ -870,29 +898,14 @@ class PathFormat(): def set_directory(self, kwdict): """Build directory path and create it if necessary""" self.kwdict = kwdict - - # Build path segments by applying 'kwdict' to directory format strings - segments = [] - append = segments.append - try: - for formatter in self.directory_formatters: - segment = formatter(kwdict).strip() - if WINDOWS: - # remove trailing dots and spaces (#647) - segment = segment.rstrip(". 
") - if segment: - append(self.clean_segment(segment)) - except Exception as exc: - raise exception.DirectoryFormatError(exc) - - # Join path segments sep = os.sep - directory = self.clean_path(self.basedirectory + sep.join(segments)) - # Ensure 'directory' ends with a path separator + segments = self.build_directory(kwdict) if segments: - directory += sep - self.directory = directory + self.directory = directory = self.basedirectory + self.clean_path( + sep.join(segments) + sep) + else: + self.directory = directory = self.basedirectory if WINDOWS: # Enable longer-than-260-character paths on Windows @@ -935,17 +948,15 @@ class PathFormat(): self.temppath = self.realpath = self.realpath[:-1] return True - def build_filename(self): + def build_filename(self, kwdict): """Apply 'kwdict' to filename format string""" try: return self.clean_path(self.clean_segment( - self.filename_formatter(self.kwdict))) + self.filename_formatter(kwdict))) except Exception as exc: raise exception.FilenameFormatError(exc) - def build_filename_conditional(self): - kwdict = self.kwdict - + def build_filename_conditional(self, kwdict): try: for condition, formatter in self.filename_conditions: if condition(kwdict): @@ -956,12 +967,49 @@ class PathFormat(): except Exception as exc: raise exception.FilenameFormatError(exc) + def build_directory(self, kwdict): + """Apply 'kwdict' to directory format strings""" + segments = [] + append = segments.append + + try: + for formatter in self.directory_formatters: + segment = formatter(kwdict).strip() + if WINDOWS: + # remove trailing dots and spaces (#647) + segment = segment.rstrip(". 
") + if segment: + append(self.clean_segment(segment)) + return segments + except Exception as exc: + raise exception.DirectoryFormatError(exc) + + def build_directory_conditional(self, kwdict): + segments = [] + append = segments.append + + try: + for condition, formatters in self.directory_conditions: + if condition(kwdict): + break + else: + formatters = self.directory_formatters + for formatter in formatters: + segment = formatter(kwdict).strip() + if WINDOWS: + segment = segment.rstrip(". ") + if segment: + append(self.clean_segment(segment)) + return segments + except Exception as exc: + raise exception.DirectoryFormatError(exc) + def build_path(self): """Combine directory and filename to full paths""" if self._create_directory: os.makedirs(self.realdirectory, exist_ok=True) self._create_directory = False - self.filename = filename = self.build_filename() + self.filename = filename = self.build_filename(self.kwdict) self.path = self.directory + filename self.realpath = self.realdirectory + filename if not self.temppath: @@ -1028,9 +1076,9 @@ class DownloadArchive(): # fallback for missing WITHOUT ROWID support (#553) self.cursor.execute("CREATE TABLE IF NOT EXISTS archive " "(entry PRIMARY KEY)") - - self.keygen = (extractor.category + extractor.config( - "archive-format", extractor.archive_fmt) + self.keygen = ( + extractor.config("archive-prefix", extractor.category) + + extractor.config("archive-format", extractor.archive_fmt) ).format_map def check(self, kwdict): diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 1a3e0e4..fbb4e5b 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. 
-__version__ = "1.18.0" +__version__ = "1.18.2" diff --git a/test/test_extractor.py b/test/test_extractor.py index f04e1c7..de43ff7 100644 --- a/test/test_extractor.py +++ b/test/test_extractor.py @@ -147,7 +147,7 @@ class TestExtractorModule(unittest.TestCase): return c.capitalize() for extr in extractor.extractors(): - if extr.category not in ("", "oauth"): + if extr.category not in ("", "oauth", "ytdl"): expected = "{}{}Extractor".format( capitalize(extr.category), capitalize(extr.subcategory), diff --git a/test/test_results.py b/test/test_results.py index 5b22ecd..c36b6dd 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -22,7 +22,6 @@ from gallery_dl import extractor, util, job, config, exception # noqa E402 # temporary issues, etc. BROKEN = { - "imagevenue", "photobucket", } @@ -315,6 +314,11 @@ def setup_test_config(): "e621", "inkbunny", "tapas", "pillowfort", "mangadex"): config.set(("extractor", category), "username", None) + config.set(("extractor", "kemonoparty"), "cookies", { + "__ddg1": "0gBDGpJ3KZQmA4B9QH25", "__ddg2": "lmj5s1jnJOvhPXCX"}) + config.set(("extractor", "seisoparty"), "cookies", { + "__ddg1": "Y8rBxSDHO5UCEtQvzyI9", "__ddg2": "lmj5s1jnJOvhPXCX"}) + config.set(("extractor", "mastodon.social"), "access-token", "Blf9gVqG7GytDTfVMiyYQjwVMQaNACgf3Ds3IxxVDUQ") diff --git a/test/test_util.py b/test/test_util.py index d90d5ad..2d574da 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -124,7 +124,7 @@ class TestPredicate(unittest.TestCase): pred = util.build_predicate([util.UniquePredicate(), util.UniquePredicate()]) - self.assertIsInstance(pred, util.ChainPredicate) + self.assertIs(pred.func, util.chain_predicates) class TestISO639_1(unittest.TestCase): @@ -271,6 +271,7 @@ class TestFormatter(unittest.TestCase): "s": " \n\r\tSPACE ", "u": "%27%3C%20/%20%3E%27", "t": 1262304000, + "dt": datetime.datetime(2010, 1, 1), "name": "Name", "title1": "Title", "title2": "", @@ -295,6 +296,7 @@ class TestFormatter(unittest.TestCase): 
self._run_test("{n!S}", "") self._run_test("{t!d}", datetime.datetime(2010, 1, 1)) self._run_test("{t!d:%Y-%m-%d}", "2010-01-01") + self._run_test("{dt!T}", "1262304000") with self.assertRaises(KeyError): self._run_test("{a!q}", "hello world") @@ -601,6 +603,11 @@ class TestOther(unittest.TestCase): self.assertEqual(f(["a", "b", "c"]), "a, b, c") self.assertEqual(f([1, 2, 3]), "1, 2, 3") + def test_to_timestamp(self, f=util.to_timestamp): + self.assertEqual(f(util.EPOCH), "0") + self.assertEqual(f(datetime.datetime(2010, 1, 1)), "1262304000") + self.assertEqual(f(None), "") + def test_universal_none(self): obj = util.NONE |