diff options
author | Unit 193 <unit193@unit193.net> | 2021-06-05 20:55:43 -0400 |
---|---|---|
committer | Unit 193 <unit193@unit193.net> | 2021-06-05 20:55:43 -0400 |
commit | f26e7753b7a30fbe6a44cf5c72957a9096316923 (patch) | |
tree | 5906c00be309b8a429df75f1183c02188d90973a | |
parent | 3a066ea27a496139eaad532d7e53e0649ee1d848 (diff) | |
parent | 8a644b7a06c504263a478d3681eed10b4161b5be (diff) | |
download | gallery-dl-f26e7753b7a30fbe6a44cf5c72957a9096316923.tar.bz2 gallery-dl-f26e7753b7a30fbe6a44cf5c72957a9096316923.tar.xz gallery-dl-f26e7753b7a30fbe6a44cf5c72957a9096316923.tar.zst |
Update upstream source from tag 'upstream/1.17.5'
Update to upstream version '1.17.5'
with Debian dir b048d0b0c877962aaf3bbb88b1a2b8e2c1255371
52 files changed, 1213 insertions, 352 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index 59691b7..dcc1299 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,54 @@ # Changelog +## 1.17.5 - 2021-05-30 +### Additions +- [kemonoparty] add `metadata` option ([#1548](https://github.com/mikf/gallery-dl/issues/1548)) +- [kemonoparty] add `type` metadata field ([#1556](https://github.com/mikf/gallery-dl/issues/1556)) +- [mangapark] recognize v2.mangapark URLs ([#1578](https://github.com/mikf/gallery-dl/issues/1578)) +- [patreon] extract user-defined `tags` ([#1539](https://github.com/mikf/gallery-dl/issues/1539), [#1540](https://github.com/mikf/gallery-dl/issues/1540)) +- [pillowfort] implement login with username & password ([#846](https://github.com/mikf/gallery-dl/issues/846)) +- [pillowfort] add `inline` and `external` options ([#846](https://github.com/mikf/gallery-dl/issues/846)) +- [pixiv] implement `max-posts` option ([#1558](https://github.com/mikf/gallery-dl/issues/1558)) +- [pixiv] add `metadata` option ([#1551](https://github.com/mikf/gallery-dl/issues/1551)) +- [twitter] add `text-tweets` option ([#570](https://github.com/mikf/gallery-dl/issues/570)) +- [weibo] extend `retweets` option ([#1542](https://github.com/mikf/gallery-dl/issues/1542)) +- [postprocessor:ugoira] support using the `image2` demuxer ([#1550](https://github.com/mikf/gallery-dl/issues/1550)) +- [postprocessor:ugoira] add `repeat-last-frame` option ([#1550](https://github.com/mikf/gallery-dl/issues/1550)) +- support `XDG_CONFIG_HOME` ([#1545](https://github.com/mikf/gallery-dl/issues/1545)) +- implement `parent-skip` and `"skip": "terminate"` options ([#1399](https://github.com/mikf/gallery-dl/issues/1399)) +### Changes +- [twitter] resolve `t.co` URLs in `content` ([#1532](https://github.com/mikf/gallery-dl/issues/1532)) +### Fixes +- [500px] update query hashes ([#1573](https://github.com/mikf/gallery-dl/issues/1573)) +- [aryion] find text posts in `recursive=false` mode ([#1568](https://github.com/mikf/gallery-dl/issues/1568)) +- [imagebam] fix extraction of NSFW images ([#1534](https://github.com/mikf/gallery-dl/issues/1534)) +- [imgur] update URL patterns ([#1561](https://github.com/mikf/gallery-dl/issues/1561)) +- [manganelo] update domain to `manganato.com` +- [reactor] skip deleted/empty posts +- [twitter] add missing retweet media entities ([#1555](https://github.com/mikf/gallery-dl/issues/1555)) +- fix ISO 639-1 code for Japanese (`jp` -> `ja`) + +## 1.17.4 - 2021-05-07 +### Additions +- [gelbooru] add extractor for `/redirect.php` URLs ([#1530](https://github.com/mikf/gallery-dl/issues/1530)) +- [inkbunny] add `favorite` extractor ([#1521](https://github.com/mikf/gallery-dl/issues/1521)) +- add `output.skip` option +- add an optional argument to `--clear-cache` to select which cache entries to remove ([#1230](https://github.com/mikf/gallery-dl/issues/1230)) +### Changes +- [pixiv] update `translated-tags` option ([#1507](https://github.com/mikf/gallery-dl/issues/1507)) + - rename to `tags` + - accept `"japanese"`, `"translated"`, and `"original"` as values +### Fixes +- [500px] update query hashes +- [kemonoparty] fix download URLs ([#1514](https://github.com/mikf/gallery-dl/issues/1514)) +- [imagebam] fix extraction +- [instagram] update query hashes +- [nozomi] update default archive-fmt for `tag` and `search` extractors ([#1529](https://github.com/mikf/gallery-dl/issues/1529)) +- [pixiv] remove duplicate translated tags ([#1507](https://github.com/mikf/gallery-dl/issues/1507)) +- [readcomiconline] change domain to `readcomiconline.li` ([#1517](https://github.com/mikf/gallery-dl/issues/1517)) +- [sankaku] update invalid-token detection ([#1515](https://github.com/mikf/gallery-dl/issues/1515)) +- fix crash when using `--no-download` with `--ugoira-conv` ([#1507](https://github.com/mikf/gallery-dl/issues/1507)) + ## 1.17.3 - 2021-04-25 ### Additions - [danbooru] add option for extended metadata extraction ([#1458](https://github.com/mikf/gallery-dl/issues/1458)) @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery_dl -Version: 1.17.3 +Version: 1.17.5 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Author: Mike Fährmann @@ -75,8 +75,8 @@ Description: ========== Prebuilt executable files with a Python interpreter and required Python packages included are available for - - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.3/gallery-dl.exe>`__ - - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.3/gallery-dl.bin>`__ + - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.5/gallery-dl.exe>`__ + - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.5/gallery-dl.bin>`__ | Executables build from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml @@ -190,6 +190,7 @@ Description: ========== Linux, macOS, etc.: * ``/etc/gallery-dl.conf`` + * ``${XDG_CONFIG_HOME}/gallery-dl/config.json`` * ``${HOME}/.config/gallery-dl/config.json`` * ``${HOME}/.gallery-dl.conf`` @@ -220,6 +221,7 @@ Description: ========== ``inkbunny``, ``instagram``, ``mangoxo``, + ``pillowfort``, ``pinterest``, ``sankaku``, ``subscribestar``, @@ -64,8 +64,8 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.3/gallery-dl.exe>`__ -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.3/gallery-dl.bin>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.5/gallery-dl.exe>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.5/gallery-dl.bin>`__ | Executables build from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml @@ -179,6 +179,7 @@ Windows: Linux, macOS, etc.: * ``/etc/gallery-dl.conf`` + * ``${XDG_CONFIG_HOME}/gallery-dl/config.json`` * ``${HOME}/.config/gallery-dl/config.json`` * ``${HOME}/.gallery-dl.conf`` @@ -209,6 +210,7 @@ and optional for ``inkbunny``, ``instagram``, ``mangoxo``, +``pillowfort``, ``pinterest``, ``sankaku``, ``subscribestar``, diff --git a/data/completion/_gallery-dl b/data/completion/_gallery-dl index 76afd8a..436260b 100644 --- a/data/completion/_gallery-dl +++ b/data/completion/_gallery-dl @@ -11,7 +11,7 @@ _arguments -C -S \ {-i,--input-file}'[Download URLs found in FILE ("-" for stdin). More than one --input-file can be specified]':'<file>':_files \ --cookies'[File to load additional cookies from]':'<file>':_files \ --proxy'[Use the specified proxy]':'<url>' \ ---clear-cache'[Delete all cached login sessions, cookies, etc.]' \ +--clear-cache'[Delete all cached login sessions, cookies, etc.]':'<module>' \ {-q,--quiet}'[Activate quiet mode]' \ {-v,--verbose}'[Print various debugging information]' \ {-g,--get-urls}'[Print URLs instead of downloading]' \ diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1 index 6a22a07..719b8b4 100644 --- a/data/man/gallery-dl.1 +++ b/data/man/gallery-dl.1 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL" "1" "2021-04-25" "1.17.3" "gallery-dl Manual" +.TH "GALLERY-DL" "1" "2021-05-30" "1.17.5" "gallery-dl Manual" .\" disable hyphenation .nh @@ -35,7 +35,7 @@ File to load additional cookies from .B "\-\-proxy" \f[I]URL\f[] Use the specified proxy .TP -.B "\-\-clear\-cache" +.B "\-\-clear\-cache" \f[I]MODULE\f[] Delete all cached login sessions, cookies, etc. .TP .B "\-q, \-\-quiet" diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5 index 0190b7f..f35f218 100644 --- a/data/man/gallery-dl.conf.5 +++ b/data/man/gallery-dl.conf.5 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL.CONF" "5" "2021-04-25" "1.17.3" "gallery-dl Manual" +.TH "GALLERY-DL.CONF" "5" "2021-05-30" "1.17.5" "gallery-dl Manual" .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) @@ -166,6 +166,17 @@ for any spawned child extractors. Overwrite any metadata provided by a child extractor with its parent's. +.SS extractor.*.parent-skip +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Share number of skipped downloads between parent and child extractors. + + .SS extractor.*.path-restrict .IP "Type:" 6 \f[I]string\f[] or \f[I]object\f[] @@ -267,12 +278,18 @@ exists or its ID is in a \f[I]download archive\f[]. * \f[I]false\f[]: Overwrite already existing files .br -* \f[I]"abort"\f[]: Abort the current extractor run +* \f[I]"abort"\f[]: Stop the current extractor run .br -* \f[I]"abort:N"\f[]: Skip downloads and abort extractor run +* \f[I]"abort:N"\f[]: Skip downloads and stop the current extractor run after \f[I]N\f[] consecutive skips .br +* \f[I]"terminate"\f[]: Stop the current extractor run, including parent extractors +.br +* \f[I]"terminate:N"\f[]: Skip downloads and stop the current extractor run, +including parent extractors, after \f[I]N\f[] consecutive skips + +.br * \f[I]"exit"\f[]: Exit the program altogether .br * \f[I]"exit:N"\f[]: Skip downloads and exit the program @@ -357,6 +374,8 @@ and optional for .br * \f[I]mangoxo\f[] .br +* \f[I]pillowfort\f[] +.br * \f[I]pinterest\f[] .br * \f[I]sankaku\f[] @@ -1286,7 +1305,8 @@ A (comma-separated) list of subcategories to include when processing a user profile. Possible values are -\f[I]"posts"\f[], \f[I]reels\f[], \f[I]"stories"\f[], \f[I]"highlights"\f[], \f[I]"channel"\f[]. +\f[I]"posts"\f[], \f[I]"reels"\f[], \f[I]"channel"\f[], \f[I]"tagged"\f[], +\f[I]"stories"\f[], \f[I]"highlights"\f[]. You can use \f[I]"all"\f[] instead of listing all values separately. @@ -1302,6 +1322,17 @@ You can use \f[I]"all"\f[] instead of listing all values separately. Download video files. +.SS extractor.kemonoparty.metadata +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Extract \f[I]username\f[] metadata + + .SS extractor.khinsider.format .IP "Type:" 6 \f[I]string\f[] @@ -1434,6 +1465,28 @@ port than the default. Download subalbums. +.SS extractor.pillowfort.external +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Follow links to external sites, e.g. Twitter, + + +.SS extractor.pillowfort.inline +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +Extract inline images. + + .SS extractor.pillowfort.reblogs .IP "Type:" 6 \f[I]bool\f[] @@ -1478,7 +1531,7 @@ Download from video pins. Download user avatars. -.SS extractor.pixiv.work.related +.SS extractor.pixiv.user.metadata .IP "Type:" 6 \f[I]bool\f[] @@ -1486,10 +1539,10 @@ Download user avatars. \f[I]false\f[] .IP "Description:" 4 -Also download related artworks. +Fetch extended \f[I]user\f[] metadata. -.SS extractor.pixiv.translated-tags +.SS extractor.pixiv.work.related .IP "Type:" 6 \f[I]bool\f[] @@ -1497,7 +1550,25 @@ Also download related artworks. \f[I]false\f[] .IP "Description:" 4 -Provide translated ´tags`. +Also download related artworks. + + +.SS extractor.pixiv.tags +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]"japanese"\f[] + +.IP "Description:" 4 +Controls the \f[I]tags\f[] metadata field. + +.br +* "japanese": List of Japanese tags +.br +* "translated": List of translated tags +.br +* "original": Unmodified list with both Japanese and translated tags .SS extractor.pixiv.ugoira @@ -1517,6 +1588,18 @@ Use an ugoira post processor to convert them to watchable videos. (Example__) +.SS extractor.pixiv.max-posts +.IP "Type:" 6 +\f[I]integer\f[] + +.IP "Default:" 9 +\f[I]0\f[] + +.IP "Description:" 4 +When downloading galleries, this sets the maximum number of posts to get. +A value of \f[I]0\f[] means no limit. + + .SS extractor.plurk.comments .IP "Type:" 6 \f[I]bool\f[] @@ -1848,6 +1931,21 @@ If this value is \f[I]"original"\f[], metadata for these files will be taken from the original Tweets, not the Retweets. +.SS extractor.twitter.text-tweets +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Also emit metadata for text-only Tweets without media content. + +This only has an effect with a \f[I]metadata\f[] (or \f[I]exec\f[]) post processor +with \f[I]"event": "post"\f[] +and appropriate \f[I]filename\f[]. + + .SS extractor.twitter.twitpic .IP "Type:" 6 \f[I]bool\f[] @@ -1965,7 +2063,10 @@ to use your account's browsing settings and filters. \f[I]true\f[] .IP "Description:" 4 -Extract media from retweeted posts. +Fetch media from retweeted posts. + +If this value is \f[I]"original"\f[], metadata for these files +will be taken from the original posts, not the retweeted posts. .SS extractor.weibo.videos @@ -2287,6 +2388,17 @@ Controls whether the output strings should be shortened to fit on one console line. +.SS output.skip +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +Show skipped file downloads. + + .SS output.progress .IP "Type:" 6 \f[I]bool\f[] or \f[I]string\f[] @@ -2558,7 +2670,7 @@ The event for which metadata gets written to a file. The available events are: \f[I]init\f[] -After post procesor initialization +After post processor initialization and before the first file download \f[I]finalize\f[] On extractor shutdown, e.g. after all files were downloaded @@ -2631,6 +2743,19 @@ Filename extension for the resulting video files. Additional FFmpeg command-line arguments. +.SS ugoira.ffmpeg-demuxer +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]image2\f[] + +.IP "Description:" 4 +FFmpeg demuxer to read input files with. Possible values are +"\f[I]image2\f[]" and +"\f[I]concat\f[]". + + .SS ugoira.ffmpeg-location .IP "Type:" 6 \f[I]Path\f[] @@ -2714,6 +2839,18 @@ to the list of FFmpeg command-line arguments to reduce an odd width/height by 1 pixel and make them even. +.SS ugoira.repeat-last-frame +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +Allow repeating the last frame when necessary +to prevent it from only being displayed for a very short amount of time. + + .SS zip.extension .IP "Type:" 6 \f[I]string\f[] diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 4eaf1b8..7497cd6 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -59,6 +59,8 @@ }, "deviantart": { + "client-id": null, + "client-secret": null, "extra": false, "flat": true, "folders": false, @@ -174,6 +176,8 @@ }, "pillowfort": { + "external": false, + "inline": true, "reblogs": false }, "pinterest": @@ -183,8 +187,9 @@ }, "pixiv": { + "refresh-token": null, "avatar": false, - "translated-tags": false, + "tags": "japanese", "ugoira": true }, "reactor": @@ -254,6 +259,7 @@ "quoted": true, "replies": true, "retweets": true, + "text-tweets": false, "twitpic": false, "users": "timeline", "videos": true @@ -320,6 +326,7 @@ "mode": "auto", "progress": true, "shorten": true, + "skip": true, "log": "[{name}][{levelname}] {message}", "logfile": null, "unsupportedfile": null diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO index e192d75..7fe851f 100644 --- a/gallery_dl.egg-info/PKG-INFO +++ b/gallery_dl.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery-dl -Version: 1.17.3 +Version: 1.17.5 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Author: Mike Fährmann @@ -75,8 +75,8 @@ Description: ========== Prebuilt executable files with a Python interpreter and required Python packages included are available for - - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.3/gallery-dl.exe>`__ - - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.3/gallery-dl.bin>`__ + - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.5/gallery-dl.exe>`__ + - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.5/gallery-dl.bin>`__ | Executables build from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml @@ -190,6 +190,7 @@ Description: ========== Linux, macOS, etc.: * ``/etc/gallery-dl.conf`` + * ``${XDG_CONFIG_HOME}/gallery-dl/config.json`` * ``${HOME}/.config/gallery-dl/config.json`` * ``${HOME}/.gallery-dl.conf`` @@ -220,6 +221,7 @@ Description: ========== ``inkbunny``, ``instagram``, ``mangoxo``, + ``pillowfort``, ``pinterest``, ``sankaku``, ``subscribestar``, diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt index 3cc2071..9655896 100644 --- a/gallery_dl.egg-info/SOURCES.txt +++ b/gallery_dl.egg-info/SOURCES.txt @@ -182,6 +182,7 @@ test/test_config.py test/test_cookies.py test/test_downloader.py test/test_extractor.py +test/test_job.py test/test_oauth.py test/test_postprocessor.py test/test_results.py diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index 5bf229a..8154afc 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -186,7 +186,7 @@ def main(): elif args.clear_cache: from . import cache log = logging.getLogger("cache") - cnt = cache.clear() + cnt = cache.clear(args.clear_cache) if cnt is None: log.error("Database file not available") @@ -249,6 +249,8 @@ def main(): retval |= jobtype(url.value).run() else: retval |= jobtype(url).run() + except exception.TerminateExtraction: + pass except exception.NoExtractorError: log.error("No suitable extractor found for '%s'", url) retval |= 64 diff --git a/gallery_dl/cache.py b/gallery_dl/cache.py index a874f63..5ab68bf 100644 --- a/gallery_dl/cache.py +++ b/gallery_dl/cache.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2020 Mike Fährmann +# Copyright 2016-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -168,24 +168,33 @@ def cache(maxage=3600, keyarg=None): return wrap -def clear(): - """Delete all database entries""" +def clear(module="all"): + """Delete database entries for 'module'""" db = DatabaseCacheDecorator.db + if not db: + return None - if db: - rowcount = 0 - cursor = db.cursor() - try: + rowcount = 0 + cursor = db.cursor() + module = module.lower() + + try: + if module == "all": cursor.execute("DELETE FROM data") - except sqlite3.OperationalError: - pass # database is not initialized, can't be modified, etc. else: - rowcount = cursor.rowcount - db.commit() + cursor.execute( + "DELETE FROM data " + "WHERE key LIKE 'gallery_dl.extractor.' || ? || '.%'", + (module,) + ) + except sqlite3.OperationalError: + pass # database is not initialized, can't be modified, etc. + else: + rowcount = cursor.rowcount + db.commit() + if rowcount: cursor.execute("VACUUM") - return rowcount - - return None + return rowcount def _path(): diff --git a/gallery_dl/config.py b/gallery_dl/config.py index e0a5459..953b1b1 100644 --- a/gallery_dl/config.py +++ b/gallery_dl/config.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2020 Mike Fährmann +# Copyright 2015-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -31,6 +31,8 @@ if util.WINDOWS: else: _default_configs = [ "/etc/gallery-dl.conf", + "${XDG_CONFIG_HOME}/gallery-dl/config.json" + if os.environ.get("XDG_CONFIG_HOME") else "${HOME}/.config/gallery-dl/config.json", "${HOME}/.gallery-dl.conf", ] diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index bc42d7c..76ec46f 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -80,6 +80,10 @@ class HttpDownloader(DownloaderBase): tries = 0 msg = "" + kwdict = pathfmt.kwdict + adjust_extension = kwdict.get( + "_http_adjust_extension", self.adjust_extension) + if self.part: pathfmt.part_enable(self.partdir) @@ -105,7 +109,7 @@ class HttpDownloader(DownloaderBase): if self.headers: headers.update(self.headers) # file-specific headers - extra = pathfmt.kwdict.get("_http_headers") + extra = kwdict.get("_http_headers") if extra: headers.update(extra) @@ -139,7 +143,7 @@ class HttpDownloader(DownloaderBase): return False # check for invalid responses - validate = pathfmt.kwdict.get("_http_validate") + validate = kwdict.get("_http_validate") if validate and not validate(response): self.log.warning("Invalid response") return False @@ -168,7 +172,7 @@ class HttpDownloader(DownloaderBase): content = response.iter_content(self.chunk_size) # check filename extension against file header - if self.adjust_extension and not offset and \ + if adjust_extension and not offset and \ pathfmt.extension in FILE_SIGNATURES: try: file_header = next( @@ -198,7 +202,7 @@ class HttpDownloader(DownloaderBase): if file_header: fp.write(file_header) elif offset: - if self.adjust_extension and \ + if adjust_extension and \ pathfmt.extension in FILE_SIGNATURES: self._adjust_extension(pathfmt, fp.read(16)) fp.seek(offset) @@ -222,10 +226,9 @@ class HttpDownloader(DownloaderBase): self.downloading = False if self.mtime: - pathfmt.kwdict.setdefault( - "_mtime", response.headers.get("Last-Modified")) + kwdict.setdefault("_mtime", response.headers.get("Last-Modified")) else: - pathfmt.kwdict["_mtime"] = None + kwdict["_mtime"] = None return True diff --git a/gallery_dl/exception.py b/gallery_dl/exception.py index f553d41..0433dc9 100644 --- a/gallery_dl/exception.py +++ b/gallery_dl/exception.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2019 Mike Fährmann +# Copyright 2015-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -23,6 +23,7 @@ Exception +-- FilterError +-- NoExtractorError +-- StopExtraction + +-- TerminateExtraction """ @@ -109,3 +110,8 @@ class StopExtraction(GalleryDLException): GalleryDLException.__init__(self) self.message = message % args if args else message self.code = 1 if message else 0 + + +class TerminateExtraction(GalleryDLException): + """Terminate data extraction""" + code = 0 diff --git a/gallery_dl/extractor/35photo.py b/gallery_dl/extractor/35photo.py index edb9d46..27634de 100644 --- a/gallery_dl/extractor/35photo.py +++ b/gallery_dl/extractor/35photo.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2020 Mike Fährmann +# Copyright 2019-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -104,7 +104,8 @@ class _35photoUserExtractor(_35photoExtractor): r"/(?!photo_|genre_|tags/|rating/)([^/?#]+)") test = ( ("https://35photo.pro/liya", { - "pattern": r"https://m\d+.35photo.pro/photos_(main|series)/.*.jpg", + "pattern": r"https://([a-z][0-9]\.)?35photo\.pro" + r"/photos_(main|series)/.*\.jpg", "count": 9, }), ("https://35photo.pro/suhoveev", { @@ -214,7 +215,7 @@ class _35photoImageExtractor(_35photoExtractor): test = ("https://35photo.pro/photo_753340/", { "count": 1, "keyword": { - "url" : r"re:https://m\d+.35photo.pro/photos_main/.*.jpg", + "url" : r"re:https://35photo\.pro/photos_main/.*\.jpg", "id" : 753340, "title" : "Winter walk", "description": str, diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py index 0583eb9..c2c5a66 100644 --- a/gallery_dl/extractor/500px.py +++ b/gallery_dl/extractor/500px.py @@ -146,7 +146,7 @@ class _500pxGalleryExtractor(_500pxExtractor): }), # unavailable photos (#1335) ("https://500px.com/p/Light_Expression_Photography/galleries/street", { - "count": ">= 7", + "count": 0, }), ("https://500px.com/fashvamp/galleries/lera"), ) @@ -172,7 +172,7 @@ class _500pxGalleryExtractor(_500pxExtractor): } gallery = self._request_graphql( "GalleriesDetailQueryRendererQuery", variables, - "fb8bb66d31b58903e2f01ebe66bbe7937b982753be3211855b7bce4e286c1a49", + "eda3c77ca4efe4b3347ec9c08befe3bd2c58099ebfb1f680d829fcd26d34f12d", )["gallery"] self._photos = gallery["photos"] @@ -200,8 +200,8 @@ class _500pxGalleryExtractor(_500pxExtractor): variables["cursor"] = photos["pageInfo"]["endCursor"] photos = self._request_graphql( "GalleriesDetailPaginationContainerQuery", variables, - "457c66d976f56863c81795f03e98cb54" - "3c7c6cdae7abeab8fe9e8e8a67479fa9", + "466cf6661a07e7fdca465edb39118efb" + "80fb157c6d3f620c7f518cdae0832c78", )["galleryByOwnerIdAndSlugOrToken"]["photos"] diff --git a/gallery_dl/extractor/aryion.py b/gallery_dl/extractor/aryion.py index ded2ae3..0d0ad70 100644 --- a/gallery_dl/extractor/aryion.py +++ b/gallery_dl/extractor/aryion.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2020 Mike Fährmann +# Copyright 2020-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -12,7 +12,6 @@ from .common import Extractor, Message from .. import text, util, exception from ..cache import cache - BASE_PATTERN = r"(?:https?://)?(?:www\.)?aryion\.com/g4" @@ -33,6 +32,8 @@ class AryionExtractor(Extractor): self._needle = "class='gallery-item' id='" def login(self): + if self._check_cookies(self.cookienames): + return username, password = self._get_auth_info() if username: self._update_cookies(self._login_impl(username, password)) @@ -73,8 +74,7 @@ class AryionExtractor(Extractor): def _pagination(self, url): while True: page = self.request(url).text - yield from text.extract_iter( - page, self._needle, "'") + yield from text.extract_iter(page, self._needle, "'") pos = page.find("Next >>") if pos < 0: @@ -173,7 +173,7 @@ class AryionGalleryExtractor(AryionExtractor): def skip(self, num): if self.recursive: - num = 0 + return 0 self.offset += num return num @@ -182,7 +182,7 @@ class AryionGalleryExtractor(AryionExtractor): url = "{}/g4/gallery/{}".format(self.root, self.user) return self._pagination(url) else: - self._needle = "class='thumb' href='/g4/view/" + self._needle = "thumb' href='/g4/view/" url = "{}/g4/latest.php?name={}".format(self.root, self.user) return util.advance(self._pagination(url), self.offset) diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index 1f86ea5..3b96a4e 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -61,6 +61,7 @@ class DanbooruExtractor(Extractor): "{}/posts/{}.json?only=pixiv_ugoira_frame_data".format( self.root, post["id"]) ).json()["pixiv_ugoira_frame_data"]["data"] + post["_http_adjust_extension"] = False else: url = post["large_file_url"] post["extension"] = "webm" diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 47f589a..9a461a4 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -930,10 +930,12 @@ class DeviantartOAuthAPI(): self.folders = extractor.config("folders", False) self.metadata = extractor.extra or extractor.config("metadata", False) - self.client_id = extractor.config( - "client-id", self.CLIENT_ID) - self.client_secret = extractor.config( - "client-secret", self.CLIENT_SECRET) + self.client_id = extractor.config("client-id") + if self.client_id: + self.client_secret = extractor.config("client-secret") + else: + self.client_id = self.CLIENT_ID + self.client_secret = self.CLIENT_SECRET token = extractor.config("refresh-token") if token is None or token == "cache": diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 910da7d..64a6cb7 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -128,7 +128,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): "gid": 1200119, "height": int, "image_token": "re:[0-9a-f]{10}", - "lang": "jp", + "lang": "ja", "language": "Japanese", "parent": "", "rating": r"re:\d\.\d+", diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index 0bcec2b..5962b9e 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -135,7 +135,7 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor): "url": "61896d9d9a2edb556b619000a308a984307b6d30", }), ("https://thebarchive.com/b/thread/739772332/", { - "url": "e8b18001307d130d67db31740ce57c8561b5d80c", + "url": "07d39d2cb48f40fb337dc992993d965b0cd5f7cd", }), ) diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index 863cead..df45d0d 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -8,8 +8,10 @@ """Extractors for https://gelbooru.com/""" +from .common import Extractor, Message from . import gelbooru_v02 from .. import text, exception +import binascii class GelbooruBase(): @@ -131,3 +133,23 @@ class GelbooruPostExtractor(GelbooruBase, } }), ) + + +class GelbooruRedirectExtractor(GelbooruBase, Extractor): + subcategory = "redirect" + pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com" + r"/redirect\.php\?s=([^&#]+)") + test = (("https://gelbooru.com/redirect.php?s=Ly9nZWxib29ydS5jb20vaW5kZXgu" + "cGhwP3BhZ2U9cG9zdCZzPXZpZXcmaWQ9MTgzMDA0Ng=="), { + "pattern": r"https://gelbooru.com/index.php" + r"\?page=post&s=view&id=1830046" + }) + + def __init__(self, match): + Extractor.__init__(self, match) + self.redirect_url = text.ensure_http_scheme( + binascii.a2b_base64(match.group(1)).decode()) + + def items(self): + data = {"_extractor": GelbooruPostExtractor} + yield Message.Queue, self.redirect_url, data diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py index 76b2c38..9370840 100644 --- a/gallery_dl/extractor/imagebam.py +++ b/gallery_dl/extractor/imagebam.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2019 Mike Fährmann +# Copyright 2014-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from http://www.imagebam.com/""" +"""Extractors for https://www.imagebam.com/""" from .common import Extractor, Message from .. import text, exception @@ -15,34 +15,44 @@ from .. import text, exception class ImagebamExtractor(Extractor): """Base class for imagebam extractors""" category = "imagebam" - root = "http://www.imagebam.com" + root = "https://www.imagebam.com" + cookies = None - def get_image_data(self, page_url, data): - """Fill 'data' and return image URL""" + def __init__(self, match): + Extractor.__init__(self, match) + self.key = match.group(1) + if self.cookies: + self.session.cookies = self.cookies + + def get_image_data(self, data): + page_url = "{}/image/{}".format(self.root, data["image_key"]) page = self.request(page_url).text - image_url = text.extract(page, 'property="og:image" content="', '"')[0] - data["extension"] = image_url.rpartition(".")[2] - data["image_key"] = page_url.rpartition("/")[2] - data["image_id"] = data["image_key"][6:] - return image_url + image_url, pos = text.extract(page, '<img src="https://images', '"') + + if not image_url: + # cache cookies + ImagebamExtractor.cookies = self.session.cookies + # repeat request to get past "Continue to your image" pages + page = self.request(page_url).text + image_url, pos = text.extract( + page, '<img src="https://images', '"') - def request_page(self, url): - """Retrive the main part of a gallery page""" - page = self.request(text.urljoin(self.root, url)).text - return text.extract(page, "<fieldset>", "</fieldset>")[0] + filename = text.unescape(text.extract(page, 'alt="', '"', pos)[0]) + data["url"] = "https://images" + image_url + data["filename"], _, data["extension"] = filename.rpartition(".") class ImagebamGalleryExtractor(ImagebamExtractor): """Extractor for image galleries from imagebam.com""" subcategory = "gallery" - directory_fmt = ("{category}", "{title} - {gallery_key}") - filename_fmt = "{num:>03}-{image_key}.{extension}" + directory_fmt = ("{category}", "{title} {gallery_key}") + filename_fmt = "{num:>03} {filename}.{extension}" archive_fmt = "{gallery_key}_{image_key}" pattern = r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([0-9a-z]+)" test = ( - ("http://www.imagebam.com/gallery/adz2y0f9574bjpmonaismyrhtjgvey4o", { + ("https://www.imagebam.com/gallery/adz2y0f9574bjpmonaismyrhtjgvey4o", { "url": "76d976788ae2757ac81694736b07b72356f5c4c8", - "keyword": "9e25b8827474ac93c54855e798d60aa3cbecbd7a", + "keyword": "b048478b1bbba3072a7fa9fcc40630b3efad1f6c", "content": "596e6bfa157f2c7169805d50075c2986549973a8", }), ("http://www.imagebam.com/gallery/op9dwcklwdrrguibnkoe7jxgvig30o5p", { @@ -51,78 +61,67 @@ class ImagebamGalleryExtractor(ImagebamExtractor): "url": "32ae6fe5dc3e4ca73ff6252e522d16473595d1d1", }), ("http://www.imagebam.com/gallery/gsl8teckymt4vbvx1stjkyk37j70va2c", { - "exception": exception.NotFoundError, + "exception": exception.HttpError, }), ) - def __init__(self, match): - ImagebamExtractor.__init__(self, match) - self.gallery_key = match.group(1) - def items(self): - url = "{}/gallery/{}".format(self.root, self.gallery_key) - page = self.request_page(url) - if not page or ">Error<" in page: - raise exception.NotFoundError("gallery") + url = "{}/gallery/{}".format(self.root, self.key) + page = self.request(url).text data = self.get_metadata(page) - imgs = self.get_image_pages(page) - data["count"] = len(imgs) - data["gallery_key"] = self.gallery_key + keys = self.get_image_keys(page) + keys.reverse() + data["count"] = len(keys) + data["gallery_key"] = self.key - yield Message.Version, 1 yield Message.Directory, data - for data["num"], page_url in enumerate(imgs, 1): - image_url = self.get_image_data(page_url, data) - yield Message.Url, image_url, data + for data["num"], data["image_key"] in enumerate(keys, 1): + self.get_image_data(data) + yield Message.Url, data["url"], data @staticmethod def get_metadata(page): """Return gallery metadata""" - return text.extract_all(page, ( - ("title" , "'> ", " <span "), - (None , "'>", "</span>"), - ("description", ":#FCFCFC;'>", "</div>"), - ))[0] - - def get_image_pages(self, page): - """Return a list of all image pages""" - pages = [] + title = text.extract(page, 'id="gallery-name">', '<')[0] + return {"title": text.unescape(title.strip())} + + def get_image_keys(self, page): + """Return a list of all image keys""" + keys = [] while True: - pages.extend(text.extract_iter(page, "\n<a href='", "'")) - pos = page.find('"pagination_current"') + keys.extend(text.extract_iter( + page, '<a href="https://www.imagebam.com/image/', '"')) + pos = page.find('rel="next" aria-label="Next') if pos > 0: - url = text.extract(page, "<a href='", "'", pos)[0] + url = text.rextract(page, 'href="', '"', pos)[0] if url: - page = self.request_page(url) + page = self.request(url).text continue - return pages + return keys class ImagebamImageExtractor(ImagebamExtractor): """Extractor for single images from imagebam.com""" subcategory = "image" - filename_fmt = "{image_key}.{extension}" archive_fmt = "{image_key}" pattern = (r"(?:https?://)?(?:\w+\.)?imagebam\.com" r"/(?:image/|(?:[0-9a-f]{2}/){3})([0-9a-f]+)") test = ( - ("http://www.imagebam.com/image/94d56c502511890", { + ("https://www.imagebam.com/image/94d56c502511890", { "url": "5e9ba3b1451f8ded0ae3a1b84402888893915d4a", - "keyword": "4263d4840007524129792b8587a562b5d20c2687", + "keyword": "2a4380d4b57554ff793898c2d6ec60987c86d1a1", "content": "0c8768055e4e20e7c7259608b67799171b691140", }), ("http://images3.imagebam.com/1d/8c/44/94d56c502511890.png"), + # NSFW (#1534) + ("https://www.imagebam.com/image/0850951366904951", { + "url": "d37297b17ed1615b4311c8ed511e50ce46e4c748", + }), ) - def __init__(self, match): - ImagebamExtractor.__init__(self, match) - self.image_key = match.group(1) - def items(self): - page_url = "{}/image/{}".format(self.root, self.image_key) - data = {} - image_url = self.get_image_data(page_url, data) - yield Message.Version, 1 + data = {"image_key": self.key} + self.get_image_data(data) yield Message.Directory, data - yield Message.Url, image_url, data + yield Message.Url, data["url"], data diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index 7009c7a..f925c9e 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -57,7 +57,8 @@ class ImgurImageExtractor(ImgurExtractor): subcategory = "image" filename_fmt = "{category}_{id}{title:?_//}.{extension}" archive_fmt = "{id}" - pattern = BASE_PATTERN + r"/(?!gallery|search)(\w{7}|\w{5})[sbtmlh]?\.?" + pattern = (BASE_PATTERN + r"/(?!gallery|search)" + r"(?:r/\w+/)?(\w{7}|\w{5})[sbtmlh]?") test = ( ("https://imgur.com/21yMxCS", { "url": "6f2dcfb86815bdd72808c313e5f715610bc7b9b2", @@ -110,6 +111,7 @@ class ImgurImageExtractor(ImgurExtractor): ("https://imgur.com/zzzzzzz", { # not found "exception": exception.HttpError, }), + ("https://m.imgur.com/r/Celebs/iHJ7tsM"), ("https://www.imgur.com/21yMxCS"), # www ("https://m.imgur.com/21yMxCS"), # mobile ("https://imgur.com/zxaY6"), # 5 character key @@ -289,7 +291,7 @@ class ImgurFavoriteExtractor(ImgurExtractor): class ImgurSubredditExtractor(ImgurExtractor): """Extractor for a subreddits's imgur links""" subcategory = "subreddit" - pattern = BASE_PATTERN + r"/r/([^/?#]+)" + pattern = BASE_PATTERN + r"/r/([^/?#]+)/?$" test = ("https://imgur.com/r/pics", { "range": "1-100", "count": 100, diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py index 9b5331a..2f7935b 100644 --- a/gallery_dl/extractor/inkbunny.py +++ b/gallery_dl/extractor/inkbunny.py @@ -64,7 +64,7 @@ class InkbunnyExtractor(Extractor): class InkbunnyUserExtractor(InkbunnyExtractor): """Extractor for inkbunny user profiles""" subcategory = "user" - pattern = BASE_PATTERN + r"/(?!s/)(gallery/|scraps/)?([^/?#]+)" + pattern = BASE_PATTERN + r"/(?!s/)(gallery/|scraps/)?(\w+)(?:$|[/?#])" test = ( ("https://inkbunny.net/soina", { "pattern": r"https://[\w.]+\.metapix\.net/files/full" @@ -138,6 +138,33 @@ class InkbunnyUserExtractor(InkbunnyExtractor): return self.api.search(params) +class InkbunnyFavoriteExtractor(InkbunnyExtractor): + """Extractor for inkbunny user favorites""" + subcategory = "favorite" + pattern = BASE_PATTERN + r"/userfavorites_process\.php\?favs_user_id=(\d+)" + test = ( + ("https://inkbunny.net/userfavorites_process.php?favs_user_id=20969", { + "pattern": r"https://[\w.]+\.metapix\.net/files/full" + r"/\d+/\d+_\w+_.+", + "range": "20-50", + }), + ) + + def __init__(self, match): + InkbunnyExtractor.__init__(self, match) + self.user_id = match.group(1) + + def posts(self): + orderby = self.config("orderby", "fav_datetime") + params = { + "favs_user_id": self.user_id, + "orderby" : orderby, + } + if orderby and orderby.startswith("unread_"): + params["unread_submissions"] = "yes" + return self.api.search(params) + + class InkbunnyPostExtractor(InkbunnyExtractor): """Extractor for individual Inkbunny posts""" subcategory = "post" diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index a027be1..e3db789 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -408,7 +408,7 @@ class InstagramPostsExtractor(InstagramExtractor): url = "{}/{}/".format(self.root, self.item) user = self._extract_profile_page(url) - query_hash = "42d2750e44dbac713ff30130659cd891" + query_hash = "32b14723a678bd4628d70c1f877b94c9" variables = {"id": user["id"], "first": 50} edge = self._get_edge_data(user, "edge_owner_to_timeline_media") return self._pagination_graphql(query_hash, variables, edge) @@ -613,7 +613,7 @@ class InstagramPostExtractor(InstagramExtractor): ) def posts(self): - query_hash = "cf28bf5eb45d62d4dc8e77cdb99d750d" + query_hash = "d4e8ae69cb68f66329dcebe82fb69f6d" variables = { "shortcode" : self.item, "child_comment_count" : 3, diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 377e00b..1b5e5e9 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -26,24 +26,41 @@ class KemonopartyExtractor(Extractor): def items(self): find_inline = re.compile(r'src="(/inline/[^"]+)').findall + if self.config("metadata"): + username = text.unescape(text.extract( + self.request(self.user_url).text, "<title>", " | Kemono<")[0]) + else: + username = None + for post in self.posts(): files = [] - if post["file"]: - files.append(post["file"]) - if post["attachments"]: - files.extend(post["attachments"]) + append = files.append + file = post["file"] + + if file: + file["type"] = "file" + append(file) + for attachment in post["attachments"]: + attachment["type"] = "attachment" + append(attachment) for path in find_inline(post["content"] or ""): - files.append({"path": path, "name": path}) + append({"path": path, "name": path, "type": "inline"}) post["date"] = text.parse_datetime( post["published"], "%a, %d %b %Y %H:%M:%S %Z") + if username: + post["username"] = username yield Message.Directory, post for post["num"], file in enumerate(files, 1): + post["type"] = file["type"] url = file["path"] if url[0] == "/": - url = self.root + url + url = "https://data.kemono.party" + url + elif url.startswith("https://kemono.party/"): + url = "https://data.kemono.party" + url[20:] + text.nameext_from_url(file["name"], post) yield Message.Url, url, post @@ -64,6 +81,7 @@ class KemonopartyUserExtractor(KemonopartyExtractor): KemonopartyExtractor.__init__(self, match) service, user_id = match.groups() self.api_url = "{}/api/{}/user/{}".format(self.root, service, user_id) + self.user_url = "{}/{}/user/{}".format(self.root, service, user_id) def posts(self): url = self.api_url @@ -84,7 +102,7 @@ class KemonopartyPostExtractor(KemonopartyExtractor): pattern = BASE_PATTERN + r"/post/([^/?#]+)" test = ( ("https://kemono.party/fanbox/user/6993449/post/506575", { - "pattern": r"https://kemono\.party/files/fanbox" + "pattern": r"https://data\.kemono\.party/files/fanbox" r"/6993449/506575/P058kDFYus7DbqAkGlfWTlOr\.jpeg", "keyword": { "added": "Wed, 06 May 2020 20:28:02 GMT", @@ -101,16 +119,21 @@ class KemonopartyPostExtractor(KemonopartyExtractor): "shared_file": False, "subcategory": "post", "title": "c96取り置き", + "type": "file", "user": "6993449", }, }), # inline image (#1286) ("https://kemono.party/fanbox/user/7356311/post/802343", { - "pattern": r"https://kemono\.party/inline/fanbox" + "pattern": r"https://data\.kemono\.party/inline/fanbox" r"/uaozO4Yga6ydkGIJFAQDixfE\.jpeg", }), + # kemono.party -> data.kemono.party + ("https://kemono.party/gumroad/user/trylsc/post/IURjT", { + "pattern": r"https://data\.kemono\.party/(file|attachment)s" + r"/gumroad/trylsc/IURjT/", + }), ("https://kemono.party/subscribestar/user/alcorart/post/184330"), - ("https://kemono.party/gumroad/user/trylsc/post/IURjT"), ) def __init__(self, match): @@ -118,6 +141,7 @@ class KemonopartyPostExtractor(KemonopartyExtractor): service, user_id, post_id = match.groups() self.api_url = "{}/api/{}/user/{}/post/{}".format( self.root, service, user_id, post_id) + self.user_url = "{}/{}/user/{}".format(self.root, service, user_id) def posts(self): posts = self.request(self.api_url).json() diff --git a/gallery_dl/extractor/manganelo.py b/gallery_dl/extractor/manganelo.py index f8e1473..833d18e 100644 --- a/gallery_dl/extractor/manganelo.py +++ b/gallery_dl/extractor/manganelo.py @@ -4,35 +4,23 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract manga-chapters and entire manga from https://manganelo.com/""" +"""Extractors for https://manganato.com/""" from .common import ChapterExtractor, MangaExtractor from .. import text import re +BASE_PATTERN = \ + r"(?:https?://)?((?:(?:read)?manganato|(?:www\.)?manganelo)\.com)" -class ManganeloBase(): - """Base class for manganelo extractors""" - category = "manganelo" - root = "https://manganelo.com" - - @staticmethod - def parse_page(page, data): - """Parse metadata on 'page' and add it to 'data'""" - text.extract_all(page, ( - ("manga" , '<h1>', '</h1>'), - ('author' , '</i>Author(s) :</td>', '</tr>'), - ), values=data) - data["author"] = text.remove_html(data["author"]) - return data - -class ManganeloChapterExtractor(ManganeloBase, ChapterExtractor): +class ManganeloChapterExtractor(ChapterExtractor): """Extractor for manga-chapters from manganelo.com""" - pattern = (r"(?:https?://)?(?:www\.)?manganelo\.com" - r"(/chapter/\w+/chapter_[^/?#]+)") + category = "manganelo" + root = "https://readmanganato.com" + pattern = BASE_PATTERN + r"(/(?:manga-\w+|chapter/\w+)/chapter[-_][^/?#]+)" test = ( - ("https://manganelo.com/chapter/gq921227/chapter_23", { + ("https://readmanganato.com/manga-gn983696/chapter-23", { "pattern": r"https://s\d+\.\w+\.com/mangakakalot/g\d+/gq921227/" r"vol3_chapter_23_24_yen/\d+\.jpg", "keyword": "3748087cf41abc97f991530e6fd53b291490d6d0", @@ -43,11 +31,12 @@ class ManganeloChapterExtractor(ManganeloBase, ChapterExtractor): "content": "fbec629c71f66b246bfa0604204407c0d1c8ae38", "count": 39, }), + ("https://manganelo.com/chapter/gq921227/chapter_23"), ) def __init__(self, match): - self.path = match.group(1) - ChapterExtractor.__init__(self, match, self.root + self.path) + domain, path = match.groups() + ChapterExtractor.__init__(self, match, "https://" + domain + path) self.session.headers['Referer'] = self.root def metadata(self, page): @@ -85,21 +74,29 @@ class ManganeloChapterExtractor(ManganeloBase, ChapterExtractor): ] -class ManganeloMangaExtractor(ManganeloBase, MangaExtractor): +class ManganeloMangaExtractor(MangaExtractor): """Extractor for manga from manganelo.com""" + category = "manganelo" + root = "https://readmanganato.com" chapterclass = ManganeloChapterExtractor - pattern = (r"(?:https?://)?(?:www\.)?manganelo\.com" - r"(/(?:manga/|read_)\w+)") + pattern = BASE_PATTERN + r"(/(?:manga[-/]|read_)\w+)/?$" test = ( - ("https://manganelo.com/manga/ol921234", { - "url": "6ba7f083a6944e414ad8214b74a0a40cb60d4562", + ("https://manganato.com/manga-gu983703", { + "pattern": ManganeloChapterExtractor.pattern, + "count": ">= 70", }), ("https://manganelo.com/manga/read_otome_no_teikoku", { "pattern": ManganeloChapterExtractor.pattern, - "count": ">= 40" + "count": ">= 40", }), + ("https://manganelo.com/manga/ol921234/"), ) + def __init__(self, match): + domain, path = match.groups() + MangaExtractor.__init__(self, match, "https://" + domain + path) + self.session.headers['Referer'] = self.root + def chapters(self, page): results = [] data = self.parse_page(page, {"lang": "en", "language": "English"}) @@ -117,3 +114,13 @@ class ManganeloMangaExtractor(ManganeloBase, MangaExtractor): data["chapter"] = text.parse_int(chapter) data["chapter_minor"] = sep + minor results.append((url, data.copy())) + + @staticmethod + def parse_page(page, data): + """Parse metadata on 'page' and add it to 'data'""" + text.extract_all(page, ( + ("manga" , '<h1>', '</h1>'), + ('author' , '</i>Author(s) :</td>', '</tr>'), + ), values=data) + data["author"] = text.remove_html(data["author"]) + return data diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py index 558e682..9b6d4ba 100644 --- a/gallery_dl/extractor/mangapark.py +++ b/gallery_dl/extractor/mangapark.py @@ -17,7 +17,7 @@ import re class MangaparkBase(): """Base class for mangapark extractors""" category = "mangapark" - root_fmt = "https://mangapark.{}" + root_fmt = "https://v2.mangapark.{}" browser = "firefox" @staticmethod @@ -51,7 +51,7 @@ class MangaparkBase(): class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor): """Extractor for manga-chapters from mangapark.net""" - pattern = (r"(?:https?://)?(?:www\.)?mangapark\.(me|net|com)" + pattern = (r"(?:https?://)?(?:www\.|v2\.)?mangapark\.(me|net|com)" r"/manga/([^?#]+/i\d+)") test = ( ("https://mangapark.net/manga/gosu/i811653/c055/1", { @@ -117,7 +117,7 @@ class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor): class MangaparkMangaExtractor(MangaparkBase, MangaExtractor): """Extractor for manga from mangapark.net""" chapterclass = MangaparkChapterExtractor - pattern = (r"(?:https?://)?(?:www\.)?mangapark\.(me|net|com)" + pattern = (r"(?:https?://)?(?:www\.|v2\.)?mangapark\.(me|net|com)" r"(/manga/[^/?#]+)/?$") test = ( ("https://mangapark.net/manga/aria", { diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py index e1081da..b74355d 100644 --- a/gallery_dl/extractor/nozomi.py +++ b/gallery_dl/extractor/nozomi.py @@ -158,7 +158,7 @@ class NozomiTagExtractor(NozomiExtractor): """Extractor for posts from tag searches on nozomi.la""" subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") - archive_fmt = "t_{search_tags}_{postid}" + archive_fmt = "t_{search_tags}_{dataid}" pattern = r"(?:https?://)?nozomi\.la/tag/([^/?#]+)-(\d+)\." test = ("https://nozomi.la/tag/3:1_aspect_ratio-1.html", { "pattern": r"^https://i.nozomi.la/\w/\w\w/\w+\.\w+$", @@ -180,7 +180,7 @@ class NozomiSearchExtractor(NozomiExtractor): """Extractor for search results on nozomi.la""" subcategory = "search" directory_fmt = ("{category}", "{search_tags:J }") - archive_fmt = "t_{search_tags}_{postid}" + archive_fmt = "t_{search_tags}_{dataid}" pattern = r"(?:https?://)?nozomi\.la/search\.html\?q=([^&#]+)" test = ("https://nozomi.la/search.html?q=hibiscus%203:4_ratio#1", { "count": ">= 5", diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index 839e0b8..9c32d7a 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -117,12 +117,22 @@ class PatreonExtractor(Extractor): attr = post["attributes"] attr["id"] = text.parse_int(post["id"]) - if post.get("current_user_can_view", True): + if attr.get("current_user_can_view", True): + + relationships = post["relationships"] attr["images"] = self._files(post, included, "images") attr["attachments"] = self._files(post, included, "attachments") attr["date"] = text.parse_datetime( attr["published_at"], "%Y-%m-%dT%H:%M:%S.%f%z") - user = post["relationships"]["user"] + + tags = relationships.get("user_defined_tags") + attr["tags"] = [ + tag["id"].replace("user_defined;", "") + for tag in tags["data"] + if tag["type"] == "post_tag" + ] if tags else [] + + user = relationships["user"] attr["creator"] = ( self._user(user["links"]["related"]) or included["user"][user["data"]["id"]]) @@ -299,6 +309,10 @@ class PatreonPostExtractor(PatreonExtractor): ("https://www.patreon.com/posts/19987002", { "count": 4, }), + # tags (#1539) + ("https://www.patreon.com/posts/free-post-12497641", { + "keyword": {"tags": ["AWMedia"]}, + }), ("https://www.patreon.com/posts/not-found-123", { "exception": exception.NotFoundError, }), diff --git a/gallery_dl/extractor/pillowfort.py b/gallery_dl/extractor/pillowfort.py index cbd65d7..3c3fcd4 100644 --- a/gallery_dl/extractor/pillowfort.py +++ b/gallery_dl/extractor/pillowfort.py @@ -9,7 +9,9 @@ """Extractors for https://www.pillowfort.social/""" from .common import Extractor, Message -from .. import text +from ..cache import cache +from .. import text, exception +import re BASE_PATTERN = r"(?:https?://)?www\.pillowfort\.social" @@ -19,94 +21,171 @@ class PillowfortExtractor(Extractor): category = "pillowfort" root = "https://www.pillowfort.social" directory_fmt = ("{category}", "{username}") - filename_fmt = ("{post_id} {title|original_post[title]} " + filename_fmt = ("{post_id} {title|original_post[title]:?/ /}" "{num:>02}.{extension}") archive_fmt = "{id}" + cookiedomain = "www.pillowfort.social" def __init__(self, match): Extractor.__init__(self, match) self.item = match.group(1) - self.reblogs = self.config("reblogs", False) def items(self): - for post in self.posts(): + self.login() + inline = self.config("inline", True) + reblogs = self.config("reblogs", False) + external = self.config("external", False) + + if inline: + inline = re.compile(r'src="(https://img\d+\.pillowfort\.social' + r'/posts/[^"]+)').findall - if "original_post" in post and not self.reblogs: + for post in self.posts(): + if "original_post" in post and not reblogs: continue - files = post["media"] - del post["media"] + files = post.pop("media") + if inline: + for url in inline(post["content"]): + files.append({"url": url}) post["date"] = text.parse_datetime( post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z") + post["post_id"] = post.pop("id") yield Message.Directory, post post["num"] = 0 for file in files: url = file["url"] - if url: - post.update(file) + if not url: + continue + + if file.get("embed_code"): + if not external: + continue + msgtype = Message.Queue + else: post["num"] += 1 + msgtype = Message.Url + + post.update(file) + text.nameext_from_url(url, post) + post["hash"], _, post["filename"] = \ + post["filename"].partition("_") + + if "id" not in file: + post["id"] = post["hash"] + if "created_at" in file: post["date"] = text.parse_datetime( file["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z") - yield Message.Url, url, text.nameext_from_url(url, post) + + yield msgtype, url, post + + def login(self): + cget = self.session.cookies.get + if cget("_Pf_new_session", domain=self.cookiedomain) \ + or cget("remember_user_token", domain=self.cookiedomain): + return + + username, password = self._get_auth_info() + if username: + cookies = self._login_impl(username, password) + self._update_cookies(cookies) + + @cache(maxage=14*24*3600, keyarg=1) + def _login_impl(self, username, password): + self.log.info("Logging in as %s", username) + + url = "https://www.pillowfort.social/users/sign_in" + page = self.request(url).text + auth = text.extract(page, 'name="authenticity_token" value="', '"')[0] + + headers = {"Origin": self.root, "Referer": url} + data = { + "utf8" : "✓", + "authenticity_token": auth, + "user[email]" : username, + "user[password]" : password, + "user[remember_me]" : "1", + } + response = self.request(url, method="POST", headers=headers, data=data) + + if not response.history: + raise exception.AuthenticationError() + + return { + cookie.name: cookie.value + for cookie in response.history[0].cookies + } class PillowfortPostExtractor(PillowfortExtractor): """Extractor for a single pillowfort post""" subcategory = "post" pattern = BASE_PATTERN + r"/posts/(\d+)" - test = ("https://www.pillowfort.social/posts/27510", { - "pattern": r"https://img\d+\.pillowfort\.social/posts/\w+_out\d+\.png", - "count": 4, - "keyword": { - "avatar_url": str, - "col": 0, - "commentable": True, - "comments_count": int, - "community_id": None, - "content": str, - "created_at": str, - "date": "type:datetime", - "deleted": None, - "deleted_at": None, - "deleted_by_mod": None, - "deleted_for_flag_id": None, - "embed_code": None, - "id": int, - "last_activity": str, - "last_activity_elapsed": str, - "last_edited_at": None, - "likes_count": int, - "media_type": "picture", - "nsfw": False, - "num": int, - "original_post_id": None, - "original_post_user_id": None, - "picture_content_type": None, - "picture_file_name": None, - "picture_file_size": None, - "picture_updated_at": None, - "post_id": 27510, - "post_type": "picture", - "privacy": "public", - "reblog_copy_info": list, - "rebloggable": True, - "reblogged_from_post_id": None, - "reblogged_from_user_id": None, - "reblogs_count": int, - "row": int, - "small_image_url": None, - "tags": list, - "time_elapsed": str, - "timestamp": str, - "title": "What is Pillowfort.io? ", - "updated_at": str, - "url": r"re:https://img3.pillowfort.social/posts/.*\.png", - "user_id": 5, - "username": "Staff" - }, - }) + test = ( + ("https://www.pillowfort.social/posts/27510", { + "pattern": r"https://img\d+\.pillowfort\.social" + r"/posts/\w+_out\d+\.png", + "count": 4, + "keyword": { + "avatar_url": str, + "col": 0, + "commentable": True, + "comments_count": int, + "community_id": None, + "content": str, + "created_at": str, + "date": "type:datetime", + "deleted": None, + "deleted_at": None, + "deleted_by_mod": None, + "deleted_for_flag_id": None, + "embed_code": None, + "id": int, + "last_activity": str, + "last_activity_elapsed": str, + "last_edited_at": None, + "likes_count": int, + "media_type": "picture", + "nsfw": False, + "num": int, + "original_post_id": None, + "original_post_user_id": None, + "picture_content_type": None, + "picture_file_name": None, + "picture_file_size": None, + "picture_updated_at": None, + "post_id": 27510, + "post_type": "picture", + "privacy": "public", + "reblog_copy_info": list, + "rebloggable": True, + "reblogged_from_post_id": None, + "reblogged_from_user_id": None, + "reblogs_count": int, + "row": int, + "small_image_url": None, + "tags": list, + "time_elapsed": str, + "timestamp": str, + "title": "What is Pillowfort.io? ", + "updated_at": str, + "url": r"re:https://img3.pillowfort.social/posts/.*\.png", + "user_id": 5, + "username": "Staff" + }, + }), + ("https://www.pillowfort.social/posts/1557500", { + "options": (("external", True), ("inline", False)), + "pattern": r"https://twitter\.com/Aliciawitdaart/status" + r"/1282862493841457152", + }), + ("https://www.pillowfort.social/posts/1672518", { + "options": (("inline", True),), + "count": 3, + }), + ) def posts(self): url = "{}/posts/{}/json/".format(self.root, self.item) diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 8bfae06..8076fff 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -29,14 +29,28 @@ class PixivExtractor(Extractor): Extractor.__init__(self, match) self.api = PixivAppAPI(self) self.load_ugoira = self.config("ugoira", True) - self.translated_tags = self.config("translated-tags", False) + self.max_posts = self.config("max-posts", 0) def items(self): - tkey = "translated_name" if self.translated_tags else "name" + tags = self.config("tags", "japanese") + if tags == "original": + transform_tags = None + elif tags == "translated": + def transform_tags(work): + work["tags"] = list(set( + tag["translated_name"] or tag["name"] + for tag in work["tags"])) + else: + def transform_tags(work): + work["tags"] = [tag["name"] for tag in work["tags"]] + ratings = {0: "General", 1: "R-18", 2: "R-18G"} metadata = self.metadata() - for work in self.works(): + works = self.works() + if self.max_posts: + works = itertools.islice(works, self.max_posts) + for work in works: if not work["user"]["id"]: continue @@ -45,12 +59,10 @@ class PixivExtractor(Extractor): del work["meta_single_page"] del work["image_urls"] del work["meta_pages"] + + if transform_tags: + transform_tags(work) work["num"] = 0 - if self.translated_tags: - work["untranslated_tags"] = [ - tag["name"] for tag in work["tags"] - ] - work["tags"] = [tag[tkey] or tag["name"] for tag in work["tags"]] work["date"] = text.parse_datetime(work["create_date"]) work["rating"] = ratings.get(work["x_restrict"]) work["suffix"] = "" @@ -66,6 +78,7 @@ class PixivExtractor(Extractor): url = ugoira["zip_urls"]["medium"].replace( "_ugoira600x600", "_ugoira1920x1080") work["frames"] = ugoira["frames"] + work["_http_adjust_extension"] = False yield Message.Url, url, text.nameext_from_url(url, work) elif work["page_count"] == 1: @@ -115,7 +128,8 @@ class PixivUserExtractor(PixivExtractor): }), # deleted account ("http://www.pixiv.net/member_illust.php?id=173531", { - "count": 0, + "options": (("metadata", True),), + "exception": exception.NotFoundError, }), ("https://www.pixiv.net/en/users/173530"), ("https://www.pixiv.net/en/users/173530/manga"), @@ -138,6 +152,11 @@ class PixivUserExtractor(PixivExtractor): self.user_id = u1 or u2 or u3 self.tag = t1 or t2 + def metadata(self): + if self.config("metadata"): + return {"user": self.api.user_detail(self.user_id)} + return {} + def works(self): works = self.api.user_illusts(self.user_id) diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py index 971347b..c62a942 100644 --- a/gallery_dl/extractor/reactor.py +++ b/gallery_dl/extractor/reactor.py @@ -78,6 +78,8 @@ class ReactorExtractor(Extractor): def _parse_post(self, post): post, _, script = post.partition('<script type="application/ld+json">') + if not script: + return images = text.extract_iter(post, '<div class="image">', '</div>') script = script[:script.index("</")].strip() @@ -210,7 +212,7 @@ class JoyreactorTagExtractor(ReactorTagExtractor): pattern = JR_BASE_PATTERN + r"/tag/([^/?#]+)" test = ( ("http://joyreactor.cc/tag/Advent+Cirno", { - "count": ">= 17", + "count": ">= 15", }), ("http://joyreactor.com/tag/Cirno", { "url": "de1e60c15bfb07a0e9603b00dc3d05f60edc7914", diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py index 7ffe5dc..e4075a2 100644 --- a/gallery_dl/extractor/readcomiconline.py +++ b/gallery_dl/extractor/readcomiconline.py @@ -1,17 +1,19 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2020 Mike Fährmann +# Copyright 2016-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://readcomiconline.to/""" +"""Extractors for https://readcomiconline.li/""" from .common import Extractor, ChapterExtractor, MangaExtractor from .. import text, exception import re +BASE_PATTERN = r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.(?:li|to)" + class ReadcomiconlineBase(): """Base class for readcomiconline extractors""" @@ -19,7 +21,7 @@ class ReadcomiconlineBase(): directory_fmt = ("{category}", "{comic}", "{issue:>03}") filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}" archive_fmt = "{issue_id}_{page}" - root = "https://readcomiconline.to" + root = "https://readcomiconline.li" def request(self, url, **kwargs): """Detect and handle redirects to CAPTCHA pages""" @@ -42,11 +44,10 @@ class ReadcomiconlineBase(): class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor): - """Extractor for comic-issues from readcomiconline.to""" + """Extractor for comic-issues from readcomiconline.li""" subcategory = "issue" - pattern = (r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to" - r"(/Comic/[^/?#]+/[^/?#]+\?id=(\d+))") - test = ("https://readcomiconline.to/Comic/W-i-t-c-h/Issue-130?id=22289", { + pattern = BASE_PATTERN + r"(/Comic/[^/?#]+/[^/?#]+\?id=(\d+))" + test = ("https://readcomiconline.li/Comic/W-i-t-c-h/Issue-130?id=22289", { "url": "30d29c5afc65043bfd384c010257ec2d0ecbafa6", "keyword": "2d9ec81ce1b11fac06ebf96ce33cdbfca0e85eb5", }) @@ -78,18 +79,17 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor): class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor): - """Extractor for comics from readcomiconline.to""" + """Extractor for comics from readcomiconline.li""" chapterclass = ReadcomiconlineIssueExtractor subcategory = "comic" - pattern = (r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to" - r"(/Comic/[^/?#]+/?)$") + pattern = BASE_PATTERN + r"(/Comic/[^/?#]+/?)$" test = ( - ("https://readcomiconline.to/Comic/W-i-t-c-h", { - "url": "e231bc2a293edb465133c37a8e36a7e7d94cab14", + ("https://readcomiconline.li/Comic/W-i-t-c-h", { + "url": "74eb8b9504b4084fcc9367b341300b2c52260918", "keyword": "3986248e4458fa44a201ec073c3684917f48ee0c", }), ("https://readcomiconline.to/Comic/Bazooka-Jules", { - "url": "711674cb78ed10bd2557315f7a67552d01b33985", + "url": "2f66a467a772df4d4592e97a059ddbc3e8991799", "keyword": "f5ba5246cd787bb750924d9690cb1549199bd516", }), ) diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index 5579017..9808cb8 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -10,7 +10,7 @@ from .booru import BooruExtractor from .common import Message -from .. import text, exception +from .. import text, util, exception from ..cache import cache import collections @@ -206,7 +206,7 @@ class SankakuAPI(): self.username, self.password = self.extractor._get_auth_info() if not self.username: - self.authenticate = lambda: None + self.authenticate = util.noop def pools(self, pool_id): params = {"lang": "en"} @@ -250,7 +250,8 @@ class SankakuAPI(): success = True if not success: code = data.get("code") - if code and code.endswith(("invalid-token", "invalid_token")): + if code and code.endswith( + ("unauthorized", "invalid-token", "invalid_token")): _authenticate_impl.invalidate(self.username) continue raise exception.StopExtraction(code) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index c323fe0..afeebb0 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -32,6 +32,7 @@ class TwitterExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) self.user = match.group(1) + self.textonly = self.config("text-tweets", False) self.retweets = self.config("retweets", True) self.replies = self.config("replies", True) self.twitpic = self.config("twitpic", False) @@ -64,7 +65,7 @@ class TwitterExtractor(Extractor): self._extract_card(tweet, files) if self.twitpic: self._extract_twitpic(tweet, files) - if not files: + if not files and not self.textonly: continue tdata = self._transform_tweet(tweet) @@ -168,7 +169,6 @@ class TwitterExtractor(Extractor): tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"), "user" : self._transform_user(tweet["user"]), "lang" : tweet["lang"], - "content" : tweet["full_text"], "favorite_count": tweet["favorite_count"], "quote_count" : tweet["quote_count"], "reply_count" : tweet["reply_count"], @@ -187,6 +187,14 @@ class TwitterExtractor(Extractor): "nick": u["name"], } for u in mentions] + content = tweet["full_text"] + urls = entities.get("urls") + if urls: + for url in urls: + content = content.replace(url["url"], url["expanded_url"]) + txt, _, tco = content.rpartition(" ") + tdata["content"] = txt if tco.startswith("https://t.co/") else content + if "in_reply_to_screen_name" in tweet: tdata["reply_to"] = tweet["in_reply_to_screen_name"] @@ -489,6 +497,10 @@ class TwitterTweetExtractor(TwitterExtractor): "options": (("conversations", True),), "count": ">= 50", }), + # retweet with missing media entities (#1555) + ("https://twitter.com/morino_ya/status/1392763691599237121", { + "count": 4, + }), ) def __init__(self, match): @@ -802,6 +814,10 @@ class TwitterAPI(): tweet = retweet elif retweet: tweet["author"] = users[retweet["user_id_str"]] + if "extended_entities" in retweet and \ + "extended_entities" not in tweet: + tweet["extended_entities"] = \ + retweet["extended_entities"] tweet["user"] = users[tweet["user_id_str"]] yield tweet diff --git a/gallery_dl/extractor/unsplash.py b/gallery_dl/extractor/unsplash.py index d13ce0f..e89a5b7 100644 --- a/gallery_dl/extractor/unsplash.py +++ b/gallery_dl/extractor/unsplash.py @@ -69,7 +69,8 @@ class UnsplashImageExtractor(UnsplashExtractor): subcategory = "image" pattern = BASE_PATTERN + r"/photos/([^/?#]+)" test = ("https://unsplash.com/photos/lsoogGC_5dg", { - "url": "b99a5829ca955b768a206aa9afc391bd3f3dd55e", + "pattern": r"https://images\.unsplash\.com/photo-1586348943529-" + r"beaae6c28db9\?ixid=\w+&ixlib=rb-1.2.1", "keyword": { "alt_description": "re:silhouette of trees near body of water ", "blur_hash": "LZP4uQS4jboe%#o0WCa}2doJNaaz", @@ -114,7 +115,7 @@ class UnsplashImageExtractor(UnsplashExtractor): "id": "uMJXuywXLiU", "instagram_username": "just_midwest_rock", "last_name": "Hoefler", - "location": "Madison, WI", + "location": None, "name": "Dave Hoefler", "portfolio_url": str, "total_collections": int, diff --git a/gallery_dl/extractor/weasyl.py b/gallery_dl/extractor/weasyl.py index f8da191..711d3fa 100644 --- a/gallery_dl/extractor/weasyl.py +++ b/gallery_dl/extractor/weasyl.py @@ -77,7 +77,7 @@ class WeasylSubmissionExtractor(WeasylExtractor): "keyword": { "comments" : int, "date" : "dt:2012-04-20 00:38:04", - "description" : "<p>(flex)</p>", + "description" : "<p>(flex)</p>\n", "favorites" : int, "folder_name" : "Wesley Stuff", "folderid" : 2081, @@ -160,8 +160,8 @@ class WeasylJournalExtractor(WeasylExtractor): "keyword": { "title" : "BBCode", "date" : "dt:2013-09-19 23:11:23", - "content": "<p><a>javascript:alert(42);</a></p>" - "<p>No more of that!</p>", + "content": "<p><a>javascript:alert(42);</a></p>\n\n" + "<p>No more of that!</p>\n", }, }) diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index a325f87..0b6a153 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2020 Mike Fährmann +# Copyright 2019-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -27,16 +27,21 @@ class WeiboExtractor(Extractor): self.videos = self.config("videos", True) def items(self): - yield Message.Version, 1 + original_retweets = (self.retweets == "original") for status in self.statuses(): - files = self._files_from_status(status) if self.retweets and "retweeted_status" in status: - files = itertools.chain( - files, - self._files_from_status(status["retweeted_status"]), - ) + if original_retweets: + status = status["retweeted_status"] + files = self._files_from_status(status) + else: + files = itertools.chain( + self._files_from_status(status), + self._files_from_status(status["retweeted_status"]), + ) + else: + files = self._files_from_status(status) for num, file in enumerate(files, 1): if num == 1: @@ -143,6 +148,11 @@ class WeiboStatusExtractor(WeiboExtractor): }), # non-numeric status ID (#664) ("https://weibo.com/3314883543/Iy7fj4qVg"), + # original retweets (#1542) + ("https://m.weibo.cn/detail/4600272267522211", { + "options": (("retweets", "original"),), + "keyword": {"status": {"id": "4600167083287033"}}, + }), ("https://m.weibo.cn/status/4339748116375525"), ("https://m.weibo.cn/5746766133/4339748116375525"), ) diff --git a/gallery_dl/extractor/wikiart.py b/gallery_dl/extractor/wikiart.py index 7fd60b1..511a609 100644 --- a/gallery_dl/extractor/wikiart.py +++ b/gallery_dl/extractor/wikiart.py @@ -71,8 +71,8 @@ class WikiartArtistExtractor(WikiartExtractor): directory_fmt = ("{category}", "{artist[artistName]}") pattern = BASE_PATTERN + r"/(?!\w+-by-)([\w-]+)/?$" test = ("https://www.wikiart.org/en/thomas-cole", { - "url": "5ba2fbe6783fcce34e65014d16e5fbc581490c98", - "keyword": "eb5b141cf33e6d279afd1518aae24e61cc0adf81", + "url": "5140343730331786117fa5f4c013a6153393e28e", + "keyword": "4d9cbc50ebddfcb186f31ff70b08833578dd0070", }) def __init__(self, match): @@ -97,8 +97,8 @@ class WikiartImageExtractor(WikiartArtistExtractor): pattern = BASE_PATTERN + r"/(?!(?:paintings|artists)-by-)([\w-]+)/([\w-]+)" test = ( ("https://www.wikiart.org/en/thomas-cole/the-departure-1838", { - "url": "4d9fd87680a2620eaeaf1f13e3273475dec93231", - "keyword": "a1b083d500ce2fd364128e35b026e4ca526000cc", + "url": "976cc2545f308a650b5dbb35c29d3cee0f4673b3", + "keyword": "8e80cdcb01c1fedb934633d1c4c3ab0419cfbedf", }), # no year or '-' in slug ("https://www.wikiart.org/en/huang-shen/summer", { diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 99f61d8..164c2a9 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -27,8 +27,11 @@ class Job(): extr = extractor.find(extr) if not extr: raise exception.NoExtractorError() + self.extractor = extr self.pathfmt = None + self.kwdict = {} + self.status = 0 self._logger_extra = { "job" : self, @@ -39,32 +42,28 @@ class Job(): extr.log = self._wrap_logger(extr.log) extr.log.debug("Using %s for '%s'", extr.__class__.__name__, extr.url) - self.status = 0 - self.pred_url = self._prepare_predicates("image", True) - self.pred_queue = self._prepare_predicates("chapter", False) - self.kwdict = {} - - # user-supplied metadata - kwdict = self.extractor.config("keywords") - if kwdict: - self.kwdict.update(kwdict) - # data from parent job if parent: pextr = parent.extractor # transfer (sub)category if pextr.config("category-transfer", pextr.categorytransfer): + extr._cfgpath = pextr._cfgpath extr.category = pextr.category extr.subcategory = pextr.subcategory - extr._cfgpath = pextr._cfgpath - - # transfer parent directory - extr._parentdir = pextr._parentdir # reuse connection adapters extr.session.adapters = pextr.session.adapters + # user-supplied metadata + kwdict = self.extractor.config("keywords") + if kwdict: + self.kwdict.update(kwdict) + + # predicates + self.pred_url = self._prepare_predicates("image", True) + self.pred_queue = self._prepare_predicates("chapter", False) + def run(self): """Execute or run the job""" sleep = self.extractor.config("sleep-extractor") @@ -78,6 +77,8 @@ class Job(): if exc.message: log.error(exc.message) self.status |= exc.code + except exception.TerminateExtraction: + raise except exception.GalleryDLException as exc: log.error("%s: %s", exc.__class__.__name__, exc) self.status |= exc.code @@ -188,7 +189,7 @@ class Job(): class DownloadJob(Job): """Download images into appropriate directory/filename locations""" - def __init__(self, url, parent=None, kwdict=None): + def __init__(self, url, parent=None): Job.__init__(self, url, parent) self.log = self.get_logger("download") self.blacklist = None @@ -197,19 +198,8 @@ class DownloadJob(Job): self.hooks = () self.downloaders = {} self.out = output.select() - - if parent: - self.visited = parent.visited - pfmt = parent.pathfmt - if pfmt and parent.extractor.config("parent-directory"): - self.extractor._parentdir = pfmt.directory - if parent.extractor.config("parent-metadata"): - if parent.kwdict: - self.kwdict.update(parent.kwdict) - if kwdict: - self.kwdict.update(kwdict) - else: - self.visited = set() + self.visited = parent.visited if parent else set() + self._skipcnt = 0 def handle_url(self, url, kwdict): """Download the resource specified in 'url'""" @@ -302,7 +292,27 @@ class DownloadJob(Job): extr = None if extr: - self.status |= self.__class__(extr, self, kwdict).run() + job = self.__class__(extr, self) + pfmt = self.pathfmt + pextr = self.extractor + + if pfmt and pextr.config("parent-directory"): + extr._parentdir = pfmt.directory + else: + extr._parentdir = pextr._parentdir + + if pextr.config("parent-metadata"): + if self.kwdict: + job.kwdict.update(self.kwdict) + if kwdict: + job.kwdict.update(kwdict) + + if pextr.config("parent-skip"): + job._skipcnt = self._skipcnt + self.status |= job.run() + self._skipcnt = job._skipcnt + else: + self.status |= job.run() else: self._write_unsupported(url) @@ -398,9 +408,10 @@ class DownloadJob(Job): skip, _, smax = skip.partition(":") if skip == "abort": self._skipexc = exception.StopExtraction + elif skip == "terminate": + self._skipexc = exception.TerminateExtraction elif skip == "exit": self._skipexc = sys.exit - self._skipcnt = 0 self._skipmax = text.parse_int(smax) else: # monkey-patch methods to always return False @@ -586,10 +597,16 @@ class UrlJob(Job): for url in kwdict["_fallback"]: print("|", url) - def handle_queue(self, url, _): - try: - UrlJob(url, self, self.depth + 1).run() - except exception.NoExtractorError: + def handle_queue(self, url, kwdict): + cls = kwdict.get("_extractor") + if cls: + extr = cls.from_url(url) + else: + extr = extractor.find(url) + + if extr: + self.status |= self.__class__(extr, self).run() + else: self._write_unsupported(url) @@ -636,7 +653,7 @@ class DataJob(Job): self.ascii = config.get(("output",), "ascii", ensure_ascii) private = config.get(("output",), "private") - self.filter = (lambda x: x) if private else util.filter_dict + self.filter = util.identity if private else util.filter_dict def run(self): sleep = self.extractor.config("sleep-extractor") diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 3e585fe..6018542 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -114,7 +114,7 @@ def build_parser(): ) general.add_argument( "--clear-cache", - dest="clear_cache", action="store_true", + dest="clear_cache", metavar="MODULE", nargs="?", const="all", help="Delete all cached login sessions, cookies, etc.", ) diff --git a/gallery_dl/output.py b/gallery_dl/output.py index 2d3dc17..7e1f8c1 100644 --- a/gallery_dl/output.py +++ b/gallery_dl/output.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2020 Mike Fährmann +# Copyright 2015-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -232,15 +232,19 @@ def select(): } omode = config.get(("output",), "mode", "auto").lower() if omode in pdict: - return pdict[omode]() + output = pdict[omode]() elif omode == "auto": if hasattr(sys.stdout, "isatty") and sys.stdout.isatty(): - return ColorOutput() if ANSI else TerminalOutput() + output = ColorOutput() if ANSI else TerminalOutput() else: - return PipeOutput() + output = PipeOutput() else: raise Exception("invalid output mode: " + omode) + if not config.get(("output",), "skip", True): + output.skip = util.identity + return output + class NullOutput(): diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py index 14eaa8d..ac094b7 100644 --- a/gallery_dl/postprocessor/ugoira.py +++ b/gallery_dl/postprocessor/ugoira.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2020 Mike Fährmann +# Copyright 2018-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -26,6 +26,7 @@ class UgoiraPP(PostProcessor): self.twopass = options.get("ffmpeg-twopass", False) self.output = options.get("ffmpeg-output", True) self.delete = not options.get("keep-files", False) + self.repeat = options.get("repeat-last-frame", True) ffmpeg = options.get("ffmpeg-location") self.ffmpeg = util.expand_path(ffmpeg) if ffmpeg else "ffmpeg" @@ -34,6 +35,11 @@ class UgoiraPP(PostProcessor): if rate != "auto": self.calculate_framerate = lambda _: (None, rate) + if options.get("ffmpeg-demuxer") == "image2": + self._process = self._image2 + else: + self._process = self._concat + if options.get("libx264-prevent-odd", True): # get last video-codec argument vcodec = None @@ -72,34 +78,17 @@ class UgoiraPP(PostProcessor): if not self._frames: return - rate_in, rate_out = self.calculate_framerate(self._frames) - with tempfile.TemporaryDirectory() as tempdir: # extract frames - with zipfile.ZipFile(pathfmt.temppath) as zfile: - zfile.extractall(tempdir) - - # write ffconcat file - ffconcat = tempdir + "/ffconcat.txt" - with open(ffconcat, "w") as file: - file.write("ffconcat version 1.0\n") - for frame in self._frames: - file.write("file '{}'\n".format(frame["file"])) - file.write("duration {}\n".format(frame["delay"] / 1000)) - if self.extension != "gif": - # repeat the last frame to prevent it from only being - # displayed for a very short amount of time - file.write("file '{}'\n".format(self._frames[-1]["file"])) - - # collect command-line arguments - args = [self.ffmpeg] - if rate_in: - args += ("-r", str(rate_in)) - args += ("-i", ffconcat) - if rate_out: - args += ("-r", str(rate_out)) - if self.prevent_odd: - args += ("-vf", "crop=iw-mod(iw\\,2):ih-mod(ih\\,2)") + try: + with zipfile.ZipFile(pathfmt.temppath) as zfile: + zfile.extractall(tempdir) + except FileNotFoundError: + pathfmt.realpath = pathfmt.temppath + return + + # process frames and collect command-line arguments + args = self._process(tempdir) if self.args: args += self.args self.log.debug("ffmpeg args: %s", args) @@ -108,7 +97,7 @@ class UgoiraPP(PostProcessor): pathfmt.set_extension(self.extension) try: if self.twopass: - if "-f" not in args: + if "-f" not in self.args: args += ("-f", self.extension) args += ("-passlogfile", tempdir + "/ffmpeg2pass", "-pass") self._exec(args + ["1", "-y", os.devnull]) @@ -127,6 +116,48 @@ class UgoiraPP(PostProcessor): else: pathfmt.set_extension("zip") + def _concat(self, path): + ffconcat = path + "/ffconcat.txt" + + content = ["ffconcat version 1.0"] + append = content.append + for frame in self._frames: + append("file '{}'\nduration {}".format( + frame["file"], frame["delay"] / 1000)) + if self.repeat: + append("file '{}'".format(frame["file"])) + append("") + + with open(ffconcat, "w") as file: + file.write("\n".join(content)) + + rate_in, rate_out = self.calculate_framerate(self._frames) + args = [self.ffmpeg, "-f", "concat"] + if rate_in: + args += ("-r", str(rate_in)) + args += ("-i", ffconcat) + if rate_out: + args += ("-r", str(rate_out)) + return args + + def _image2(self, path): + path += "/" + + # adjust frame mtime values + ts = 0 + for frame in self._frames: + os.utime(path + frame["file"], ns=(ts, ts)) + ts += frame["delay"] * 1000000 + + return [ + self.ffmpeg, + "-f", "image2", + "-ts_from_file", "2", + "-pattern_type", "sequence", + "-i", "{}%06d.{}".format( + path.replace("%", "%%"), frame["file"].rpartition(".")[2]), + ] + def _exec(self, args): out = None if self.output else subprocess.DEVNULL return subprocess.Popen(args, stdout=out, stderr=out).wait() diff --git a/gallery_dl/text.py b/gallery_dl/text.py index a6a9105..74b87fb 100644 --- a/gallery_dl/text.py +++ b/gallery_dl/text.py @@ -67,7 +67,7 @@ def nameext_from_url(url, data=None): filename = unquote(filename_from_url(url)) name, _, ext = filename.rpartition(".") - if name: + if name and len(ext) <= 16: data["filename"], data["extension"] = name, ext.lower() else: data["filename"], data["extension"] = filename, "" diff --git a/gallery_dl/util.py b/gallery_dl/util.py index 2466adf..78663a0 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -81,6 +81,15 @@ def raises(cls): return wrap +def identity(x): + """Returns its argument""" + return x + + +def noop(): + """Does nothing""" + + def generate_token(size=16): """Generate a random token with hexadecimal digits""" data = random.getrandbits(size * 8).to_bytes(size, "big") @@ -321,7 +330,7 @@ CODES = { "hu": "Hungarian", "id": "Indonesian", "it": "Italian", - "jp": "Japanese", + "ja": "Japanese", "ko": "Korean", "ms": "Malay", "nl": "Dutch", @@ -804,7 +813,7 @@ class PathFormat(): @staticmethod def _build_cleanfunc(chars, repl): if not chars: - return lambda x: x + return identity elif isinstance(chars, dict): def func(x, table=str.maketrans(chars)): return x.translate(table) diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 630da7d..018554e 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.17.3" +__version__ = "1.17.5" diff --git a/test/test_job.py b/test/test_job.py new file mode 100644 index 0000000..1aeec1c --- /dev/null +++ b/test/test_job.py @@ -0,0 +1,338 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2021 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +import os +import sys +import unittest +from unittest.mock import patch + +import io +import contextlib + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from gallery_dl import job, config, text # noqa E402 +from gallery_dl.extractor.common import Extractor, Message # noqa E402 + + +class TestJob(unittest.TestCase): + + def tearDown(self): + config.clear() + + def _capture_stdout(self, extr_or_job): + if isinstance(extr_or_job, Extractor): + jobinstance = self.jobclass(extr_or_job) + else: + jobinstance = extr_or_job + + with io.StringIO() as buffer: + with contextlib.redirect_stdout(buffer): + jobinstance.run() + return buffer.getvalue() + + +class TestKeywordJob(TestJob): + jobclass = job.KeywordJob + + def test_default(self): + extr = TestExtractor.from_url("test:") + self.assertEqual(self._capture_stdout(extr), """\ +Keywords for directory names: +----------------------------- +category + test_category +subcategory + test_subcategory + +Keywords for filenames and --filter: +------------------------------------ +category + test_category +extension + jpg +filename + 1 +num + 1 +subcategory + test_subcategory +tags[] + - foo + - bar + - テスト +user[id] + 123 +user[name] + test +""") + + +class TestUrlJob(TestJob): + jobclass = job.UrlJob + + def test_default(self): + extr = TestExtractor.from_url("test:") + self.assertEqual(self._capture_stdout(extr), """\ +https://example.org/1.jpg +https://example.org/2.jpg +https://example.org/3.jpg +""") + + def test_fallback(self): + extr = TestExtractor.from_url("test:") + tjob = self.jobclass(extr) + tjob.handle_url = tjob.handle_url_fallback + + self.assertEqual(self._capture_stdout(tjob), """\ +https://example.org/1.jpg +| https://example.org/alt/1.jpg +https://example.org/2.jpg +| https://example.org/alt/2.jpg +https://example.org/3.jpg +| https://example.org/alt/3.jpg +""") + + def test_parent(self): + extr = TestExtractorParent.from_url("test:parent") + self.assertEqual(self._capture_stdout(extr), """\ +test:child +test:child +test:child +""") + + def test_child(self): + extr = TestExtractorParent.from_url("test:parent") + tjob = job.UrlJob(extr, depth=0) + self.assertEqual(self._capture_stdout(tjob), 3 * """\ +https://example.org/1.jpg +https://example.org/2.jpg +https://example.org/3.jpg +""") + + +class TestInfoJob(TestJob): + jobclass = job.InfoJob + + def test_default(self): + extr = TestExtractor.from_url("test:") + self.assertEqual(self._capture_stdout(extr), """\ +Category / Subcategory + "test_category" / "test_subcategory" +Filename format (default): + "test_{filename}.{extension}" +Directory format (default): + ["{category}"] +""") + + def test_custom(self): + config.set((), "filename", "custom") + config.set((), "directory", ("custom",)) + config.set((), "sleep-request", 321) + extr = TestExtractor.from_url("test:") + extr.request_interval = 123.456 + + self.assertEqual(self._capture_stdout(extr), """\ +Category / Subcategory + "test_category" / "test_subcategory" +Filename format (custom): + "custom" +Filename format (default): + "test_{filename}.{extension}" +Directory format (custom): + ["custom"] +Directory format (default): + ["{category}"] +Request interval (custom): + 321 +Request interval (default): + 123.456 +""") + + def test_base_category(self): + extr = TestExtractor.from_url("test:") + extr.basecategory = "test_basecategory" + + self.assertEqual(self._capture_stdout(extr), """\ +Category / Subcategory / Basecategory + "test_category" / "test_subcategory" / "test_basecategory" +Filename format (default): + "test_{filename}.{extension}" +Directory format (default): + ["{category}"] +""") + + +class TestDataJob(TestJob): + jobclass = job.DataJob + + def test_default(self): + extr = TestExtractor.from_url("test:") + tjob = self.jobclass(extr, file=io.StringIO()) + + tjob.run() + + self.assertEqual(tjob.data, [ + (Message.Directory, { + "category" : "test_category", + "subcategory": "test_subcategory", + }), + (Message.Url, "https://example.org/1.jpg", { + "category" : "test_category", + "subcategory": "test_subcategory", + "filename" : "1", + "extension" : "jpg", + "num" : 1, + "tags" : ["foo", "bar", "テスト"], + "user" : {"id": 123, "name": "test"}, + }), + (Message.Url, "https://example.org/2.jpg", { + "category" : "test_category", + "subcategory": "test_subcategory", + "filename" : "2", + "extension" : "jpg", + "num" : 2, + "tags" : ["foo", "bar", "テスト"], + "user" : {"id": 123, "name": "test"}, + }), + (Message.Url, "https://example.org/3.jpg", { + "category" : "test_category", + "subcategory": "test_subcategory", + "filename" : "3", + "extension" : "jpg", + "num" : 3, + "tags" : ["foo", "bar", "テスト"], + "user" : {"id": 123, "name": "test"}, + }), + ]) + + def test_exception(self): + extr = TestExtractorException.from_url("test:exception") + tjob = self.jobclass(extr, file=io.StringIO()) + tjob.run() + self.assertEqual( + tjob.data[-1], ("ZeroDivisionError", "division by zero")) + + def test_private(self): + config.set(("output",), "private", True) + extr = TestExtractor.from_url("test:") + tjob = self.jobclass(extr, file=io.StringIO()) + + tjob.run() + + for i in range(1, 4): + self.assertEqual( + tjob.data[i][2]["_fallback"], + ("https://example.org/alt/{}.jpg".format(i),), + ) + + def test_sleep(self): + extr = TestExtractor.from_url("test:") + tjob = self.jobclass(extr, file=io.StringIO()) + + config.set((), "sleep-extractor", 123) + with patch("time.sleep") as sleep: + tjob.run() + sleep.assert_called_once_with(123) + + config.set((), "sleep-extractor", 0) + with patch("time.sleep") as sleep: + tjob.run() + sleep.assert_not_called() + + def test_ascii(self): + extr = TestExtractor.from_url("test:") + tjob = self.jobclass(extr) + + tjob.file = buffer = io.StringIO() + tjob.run() + self.assertIn("""\ + "tags": [ + "foo", + "bar", + "\\u30c6\\u30b9\\u30c8" + ], +""", buffer.getvalue()) + + tjob.file = buffer = io.StringIO() + tjob.ascii = False + tjob.run() + self.assertIn("""\ + "tags": [ + "foo", + "bar", + "テスト" + ], +""", buffer.getvalue()) + + def test_num_string(self): + extr = TestExtractor.from_url("test:") + tjob = self.jobclass(extr, file=io.StringIO()) + + with patch("gallery_dl.util.number_to_string") as nts: + tjob.run() + self.assertEqual(len(nts.call_args_list), 0) + + config.set(("output",), "num-to-str", True) + with patch("gallery_dl.util.number_to_string") as nts: + tjob.run() + self.assertEqual(len(nts.call_args_list), 52) + + tjob.run() + self.assertEqual(tjob.data[-1][0], Message.Url) + self.assertEqual(tjob.data[-1][2]["num"], "3") + + +class TestExtractor(Extractor): + category = "test_category" + subcategory = "test_subcategory" + directory_fmt = ("{category}",) + filename_fmt = "test_{filename}.{extension}" + pattern = r"test:(child)?$" + + def items(self): + root = "https://example.org" + + yield Message.Directory, {} + for i in range(1, 4): + url = "{}/{}.jpg".format(root, i) + yield Message.Url, url, text.nameext_from_url(url, { + "num" : i, + "tags": ["foo", "bar", "テスト"], + "user": {"id": 123, "name": "test"}, + "_fallback": ("{}/alt/{}.jpg".format(root, i),), + }) + + +class TestExtractorParent(Extractor): + category = "test_category" + subcategory = "test_subcategory_parent" + pattern = r"test:parent" + + def items(self): + url = "test:child" + + for i in range(11, 14): + yield Message.Queue, url, { + "num" : i, + "tags": ["abc", "def"], + "_extractor": TestExtractor, + } + + +class TestExtractorException(Extractor): + category = "test_category" + subcategory = "test_subcategory_exception" + pattern = r"test:exception$" + + def items(self): + return 1/0 + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_results.py b/test/test_results.py index ed6b2eb..bf2496b 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -312,7 +312,7 @@ def setup_test_config(): config.set(("extractor", "mangoxo") , "password", "5zbQF10_5u25259Ma") for category in ("danbooru", "instagram", "twitter", "subscribestar", - "e621", "inkbunny", "tapas"): + "e621", "inkbunny", "tapas", "pillowfort"): config.set(("extractor", category), "username", None) config.set(("extractor", "mastodon.social"), "access-token", diff --git a/test/test_text.py b/test/test_text.py index 1daefde..3ab9e73 100644 --- a/test/test_text.py +++ b/test/test_text.py @@ -148,6 +148,10 @@ class TestText(unittest.TestCase): self.assertEqual( f("http://example.org/v2/filename.ext?param=value#frag"), result) + # long "extension" + fn = "httpswww.example.orgpath-path-path-path-path-path-path-path" + self.assertEqual(f(fn), {"filename": fn, "extension": ""}) + # invalid arguments for value in INVALID: self.assertEqual(f(value), empty) diff --git a/test/test_util.py b/test/test_util.py index 06de735..e2f5084 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -134,6 +134,7 @@ class TestISO639_1(unittest.TestCase): self._run_test(util.code_to_language, { ("en",): "English", ("FR",): "French", + ("ja",): "Japanese", ("xx",): None, ("" ,): None, (None,): None, @@ -149,6 +150,7 @@ class TestISO639_1(unittest.TestCase): self._run_test(util.language_to_code, { ("English",): "en", ("fRENch",): "fr", + ("Japanese",): "ja", ("xx",): None, ("" ,): None, (None,): None, @@ -484,6 +486,13 @@ class TestOther(unittest.TestCase): with self.assertRaises(ValueError): func(3) + def test_identity(self): + for value in (123, "foo", [1, 2, 3], (1, 2, 3), {1: 2}, None): + self.assertIs(util.identity(value), value) + + def test_noop(self): + self.assertEqual(util.noop(), None) + def test_generate_token(self): tokens = set() for _ in range(100): |