| field | value |
|---|---|
| author | 2022-11-22 04:28:43 -0500 |
| committer | 2022-11-22 04:28:43 -0500 |
| commit | 2c529817db948dea14069ea9f5ccfae5598fff47 |
| tree | ec67f818b3f6f329fdaa37cf8cf8a89d90022747 |
| parent | 5ed6cfd4bbc85cc8c4bece8e0ff30600a640e6aa |
| parent | 7af5cc29d1c02d20a6890b7b7ba78ab41532a763 |
Update upstream source from tag 'upstream/1.24.0'
Update to upstream version '1.24.0'
with Debian dir 44b4fff355c06e5285ef1f8a3ec428ae9cef164b
124 files changed, 1990 insertions, 859 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 21341ef..f92ab19 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,58 @@
 # Changelog
+## 1.24.0 - 2022-11-20
+### Additions
+- [exhentai] add metadata to search results ([#3181](https://github.com/mikf/gallery-dl/issues/3181))
+- [gelbooru_v02] implement `notes` extraction
+- [instagram] add `guide` extractor ([#3192](https://github.com/mikf/gallery-dl/issues/3192))
+- [lolisafe] add support for xbunkr ([#3153](https://github.com/mikf/gallery-dl/issues/3153), [#3156](https://github.com/mikf/gallery-dl/issues/3156))
+- [mastodon] add `instance_remote` metadata field ([#3119](https://github.com/mikf/gallery-dl/issues/3119))
+- [nitter] add extractors for Nitter instances ([#2415](https://github.com/mikf/gallery-dl/issues/2415), [#2696](https://github.com/mikf/gallery-dl/issues/2696))
+- [pixiv] add support for new daily AI rankings category ([#3214](https://github.com/mikf/gallery-dl/issues/3214), [#3221](https://github.com/mikf/gallery-dl/issues/3221))
+- [twitter] add `avatar` and `background` extractors ([#349](https://github.com/mikf/gallery-dl/issues/349), [#3023](https://github.com/mikf/gallery-dl/issues/3023))
+- [uploadir] add support for `uploadir.com` ([#3162](https://github.com/mikf/gallery-dl/issues/3162))
+- [wallhaven] add `user` extractor ([#3212](https://github.com/mikf/gallery-dl/issues/3212), [#3213](https://github.com/mikf/gallery-dl/issues/3213), [#3226](https://github.com/mikf/gallery-dl/issues/3226))
+- [downloader:http] add `chunk-size` option ([#3143](https://github.com/mikf/gallery-dl/issues/3143))
+- [downloader:http] add file signature check for `.mp4` files
+- [downloader:http] add file signature check and MIME type for `.avif` files
+- [postprocessor] implement `post-after` event ([#3117](https://github.com/mikf/gallery-dl/issues/3117))
+- [postprocessor:metadata] implement `"mode": "jsonl"`
+- [postprocessor:metadata] add `open`, `encoding`, and `private` options
+- add `--chunk-size` command-line option ([#3143](https://github.com/mikf/gallery-dl/issues/3143))
+- add `--user-agent` command-line option
+- implement `http-metadata` option
+- implement `"user-agent": "browser"` ([#2636](https://github.com/mikf/gallery-dl/issues/2636))
+### Changes
+- [deviantart] restore cookies warning for mature scraps ([#3129](https://github.com/mikf/gallery-dl/issues/3129))
+- [instagram] use REST API for unauthenticated users by default
+- [downloader:http] increase default `chunk-size` to 32768 bytes ([#3143](https://github.com/mikf/gallery-dl/issues/3143))
+- build Windows executables using py2exe's new `freeze()` API
+- build executables on GitHub Actions with Python 3.11
+- reword error text for unsupported URLs
+### Fixes
+- [exhentai] fix pagination ([#3181](https://github.com/mikf/gallery-dl/issues/3181))
+- [khinsider] fix extraction ([#3215](https://github.com/mikf/gallery-dl/issues/3215), [#3219](https://github.com/mikf/gallery-dl/issues/3219))
+- [realbooru] fix download URLs ([#2530](https://github.com/mikf/gallery-dl/issues/2530))
+- [realbooru] fix `tags` extraction ([#2530](https://github.com/mikf/gallery-dl/issues/2530))
+- [tumblr] fall back to `gifv` when possible ([#3095](https://github.com/mikf/gallery-dl/issues/3095), [#3159](https://github.com/mikf/gallery-dl/issues/3159))
+- [twitter] fix login ([#3220](https://github.com/mikf/gallery-dl/issues/3220))
+- [twitter] update URL for syndication API ([#3160](https://github.com/mikf/gallery-dl/issues/3160))
+- [weibo] send `Referer` headers ([#3188](https://github.com/mikf/gallery-dl/issues/3188))
+- [ytdl] update `parse_bytes` location ([#3256](https://github.com/mikf/gallery-dl/issues/3256))
+### Improvements
+- [imxto] extract additional metadata ([#3118](https://github.com/mikf/gallery-dl/issues/3118), [#3175](https://github.com/mikf/gallery-dl/issues/3175))
+- [instagram] allow downloading avatars for private profiles ([#3255](https://github.com/mikf/gallery-dl/issues/3255))
+- [pixiv] raise error for invalid search/ranking parameters ([#3214](https://github.com/mikf/gallery-dl/issues/3214))
+- [twitter] update `bookmarks` pagination ([#3172](https://github.com/mikf/gallery-dl/issues/3172))
+- [downloader:http] refactor file signature checks
+- [downloader:http] improve `-r/--limit-rate` accuracy ([#3143](https://github.com/mikf/gallery-dl/issues/3143))
+- add loaded config files to debug output
+- improve `-K` output for lists
+### Removals
+- [instagram] remove login support ([#3139](https://github.com/mikf/gallery-dl/issues/3139), [#3141](https://github.com/mikf/gallery-dl/issues/3141), [#3191](https://github.com/mikf/gallery-dl/issues/3191))
+- [instagram] remove `channel` extractor
+- [ngomik] remove module
+
 ## 1.23.5 - 2022-10-30
 ### Fixes
 - [instagram] fix AttributeError on user stories extraction ([#3123](https://github.com/mikf/gallery-dl/issues/3123))
diff --git a/PKG-INFO b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gallery_dl
-Version: 1.23.5
+Version: 1.24.0
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -14,16 +14,20 @@ Classifier: Development Status :: 5 - Production/Stable
 Classifier: Environment :: Console
 Classifier: Intended Audience :: End Users/Desktop
 Classifier: License :: OSI Approved :: GNU General Public License v2 (GPLv2)
-Classifier: Operating System :: Microsoft :: Windows
-Classifier: Operating System :: POSIX
-Classifier: Operating System :: MacOS
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3 :: Only
 Classifier: Programming Language :: Python :: 3.4
 Classifier: Programming Language :: Python :: 3.5
 Classifier: Programming Language :: Python :: 3.6
 Classifier: Programming Language :: Python :: 3.7
 Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: Implementation :: CPython
+Classifier: Programming Language :: Python :: Implementation :: PyPy
 Classifier: Topic :: Internet :: WWW/HTTP
 Classifier: Topic :: Multimedia :: Graphics
 Classifier: Topic :: Utilities
@@ -99,8 +103,8 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.5/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.5/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.24.0/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.24.0/gallery-dl.bin>`__
 | Executables build from the latest commit can be found at |
https://github.com/mikf/gallery-dl/actions/workflows/executables.yml @@ -243,7 +247,6 @@ and optional for ``idolcomplex``, ``imgbb``, ``inkbunny``, -``instagram``, ``mangadex``, ``mangoxo``, ``pillowfort``, @@ -66,8 +66,8 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.5/gallery-dl.exe>`__ -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.5/gallery-dl.bin>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.24.0/gallery-dl.exe>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.24.0/gallery-dl.bin>`__ | Executables build from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml @@ -210,7 +210,6 @@ and optional for ``idolcomplex``, ``imgbb``, ``inkbunny``, -``instagram``, ``mangadex``, ``mangoxo``, ``pillowfort``, diff --git a/data/completion/_gallery-dl b/data/completion/_gallery-dl index 5e46dc5..eb5c0f4 100644 --- a/data/completion/_gallery-dl +++ b/data/completion/_gallery-dl @@ -13,6 +13,7 @@ _arguments -C -S \ {-f,--filename}'[Filename format string for downloaded files ("/O" for "original" filenames)]':'<format>' \ --proxy'[Use the specified proxy]':'<url>' \ --source-address'[Client-side IP address to bind to]':'<ip>' \ +--user-agent'[User-Agent request header]':'<ua>' \ --clear-cache'[Delete cached login sessions, cookies, etc. for MODULE (ALL to delete everything)]':'<module>' \ --cookies'[File to load additional cookies from]':'<file>':_files \ --cookies-from-browser'[Name of the browser to load cookies from, with optional keyring name prefixed with "+" and profile prefixed with ":"]':'<browser[+keyring][:profile]>' \ @@ -37,6 +38,7 @@ _arguments -C -S \ --sleep-extractor'[Number of seconds to wait before starting data extraction for an input URL]':'<seconds>' \ --filesize-min'[Do not download files smaller than SIZE (e.g. 500k or 2.5M)]':'<size>' \ --filesize-max'[Do not download files larger than SIZE (e.g. 
500k or 2.5M)]':'<size>' \ +--chunk-size'[Size of in-memory data chunks (default: 32k)]':'<size>' \ --no-part'[Do not use .part files]' \ --no-skip'[Do not skip downloads; overwrite existing files]' \ --no-mtime'[Do not set file modification times according to Last-Modified HTTP response headers]' \ diff --git a/data/completion/gallery-dl b/data/completion/gallery-dl index 40280d5..f57306e 100644 --- a/data/completion/gallery-dl +++ b/data/completion/gallery-dl @@ -10,7 +10,7 @@ _gallery_dl() elif [[ "${prev}" =~ ^()$ ]]; then COMPREPLY=( $(compgen -d -- "${cur}") ) else - COMPREPLY=( $(compgen -W "--help --version --input-file --destination --directory --filename --proxy --source-address --clear-cache --cookies --cookies-from-browser --quiet --verbose --get-urls --resolve-urls --dump-json --simulate --extractor-info --list-keywords --list-modules --list-extractors --write-log --write-unsupported --write-pages --limit-rate --retries --http-timeout --sleep --sleep-request --sleep-extractor --filesize-min --filesize-max --no-part --no-skip --no-mtime --no-download --no-postprocessors --no-check-certificate --config --config-yaml --option --ignore-config --username --password --netrc --download-archive --abort --terminate --range --chapter-range --filter --chapter-filter --zip --ugoira-conv --ugoira-conv-lossless --ugoira-conv-copy --write-metadata --write-info-json --write-infojson --write-tags --mtime-from-date --exec --exec-after --postprocessor" -- "${cur}") ) + COMPREPLY=( $(compgen -W "--help --version --input-file --destination --directory --filename --proxy --source-address --user-agent --clear-cache --cookies --cookies-from-browser --quiet --verbose --get-urls --resolve-urls --dump-json --simulate --extractor-info --list-keywords --list-modules --list-extractors --write-log --write-unsupported --write-pages --limit-rate --retries --http-timeout --sleep --sleep-request --sleep-extractor --filesize-min --filesize-max --chunk-size --no-part --no-skip --no-mtime --no-download --no-postprocessors --no-check-certificate --config --config-yaml --option --ignore-config --username --password --netrc --download-archive --abort --terminate --range --chapter-range --filter --chapter-filter --zip --ugoira-conv --ugoira-conv-lossless --ugoira-conv-copy --write-metadata --write-info-json --write-infojson --write-tags --mtime-from-date --exec --exec-after --postprocessor" -- "${cur}") ) fi } diff --git a/data/completion/gallery-dl.fish b/data/completion/gallery-dl.fish index 587ff12..87e625a 100644 --- a/data/completion/gallery-dl.fish +++ b/data/completion/gallery-dl.fish @@ -7,6 +7,7 @@ complete -c gallery-dl -x -a '(__fish_complete_directories)' -s 'D' -l 'director complete -c gallery-dl -x -s 'f' -l 'filename' -d 'Filename format string for downloaded files ("/O" for "original" filenames)' complete -c gallery-dl -x -l 'proxy' -d 'Use the specified proxy' complete -c gallery-dl -x -l 'source-address' -d 'Client-side IP address to bind to' +complete -c gallery-dl -x -l 'user-agent' -d 'User-Agent request header' complete -c gallery-dl -x -l 'clear-cache' -d 'Delete cached login sessions, cookies, etc. 
for MODULE (ALL to delete everything)' complete -c gallery-dl -r -F -l 'cookies' -d 'File to load additional cookies from' complete -c gallery-dl -x -l 'cookies-from-browser' -d 'Name of the browser to load cookies from, with optional keyring name prefixed with "+" and profile prefixed with ":"' @@ -31,6 +32,7 @@ complete -c gallery-dl -x -l 'sleep-request' -d 'Number of seconds to wait betwe complete -c gallery-dl -x -l 'sleep-extractor' -d 'Number of seconds to wait before starting data extraction for an input URL' complete -c gallery-dl -x -l 'filesize-min' -d 'Do not download files smaller than SIZE (e.g. 500k or 2.5M)' complete -c gallery-dl -x -l 'filesize-max' -d 'Do not download files larger than SIZE (e.g. 500k or 2.5M)' +complete -c gallery-dl -x -l 'chunk-size' -d 'Size of in-memory data chunks (default: 32k)' complete -c gallery-dl -l 'no-part' -d 'Do not use .part files' complete -c gallery-dl -l 'no-skip' -d 'Do not skip downloads; overwrite existing files' complete -c gallery-dl -l 'no-mtime' -d 'Do not set file modification times according to Last-Modified HTTP response headers' diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1 index 0b27854..059b726 100644 --- a/data/man/gallery-dl.1 +++ b/data/man/gallery-dl.1 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL" "1" "2022-10-30" "1.23.5" "gallery-dl Manual" +.TH "GALLERY-DL" "1" "2022-11-20" "1.24.0" "gallery-dl Manual" .\" disable hyphenation .nh @@ -41,6 +41,9 @@ Use the specified proxy .B "\-\-source\-address" \f[I]IP\f[] Client-side IP address to bind to .TP +.B "\-\-user\-agent" \f[I]UA\f[] +User-Agent request header +.TP .B "\-\-clear\-cache" \f[I]MODULE\f[] Delete cached login sessions, cookies, etc. for MODULE (ALL to delete everything) .TP @@ -113,6 +116,9 @@ Do not download files smaller than SIZE (e.g. 500k or 2.5M) .B "\-\-filesize\-max" \f[I]SIZE\f[] Do not download files larger than SIZE (e.g. 500k or 2.5M) .TP +.B "\-\-chunk\-size" \f[I]SIZE\f[] +Size of in-memory data chunks (default: 32k) +.TP .B "\-\-no\-part" Do not use .part files .TP diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5 index 8944195..847d665 100644 --- a/data/man/gallery-dl.conf.5 +++ b/data/man/gallery-dl.conf.5 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL.CONF" "5" "2022-10-30" "1.23.5" "gallery-dl Manual" +.TH "GALLERY-DL.CONF" "5" "2022-11-20" "1.24.0" "gallery-dl Manual" .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) @@ -449,8 +449,6 @@ and optional for .br * \f[I]inkbunny\f[] .br -* \f[I]instagram\f[] -.br * \f[I]kemonoparty\f[] .br * \f[I]mangadex\f[] @@ -615,6 +613,9 @@ or a \f[I]list\f[] with IP and explicit port number as elements. .IP "Description:" 4 User-Agent header value to be used for HTTP requests. +Setting this value to \f[I]"browser"\f[] will try to automatically detect +and use the User-Agent used by the system's default browser. + Note: This option has no effect on pixiv extractors, as these need specific values to function correctly. @@ -624,7 +625,10 @@ as these need specific values to function correctly. \f[I]string\f[] .IP "Default:" 9 -\f[I]"firefox"\f[] for \f[I]patreon\f[], \f[I]null\f[] everywhere else +.br +* \f[I]"firefox"\f[] for \f[I]patreon\f[], \f[I]mangapark\f[], and \f[I]mangasee\f[] +.br +* \f[I]null\f[] everywhere else .IP "Example:" 4 .br @@ -696,6 +700,23 @@ For example, setting this option to \f[I]"gdl_path"\f[] would make it possible to access the current file's filename as \f[I]"[gdl_path.filename}"\f[]. 
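The `path-metadata` option documented above and the new `http-metadata` option documented next are easiest to see side by side. Below is a minimal sketch of setting both programmatically; it reuses the `config.set()` call pattern visible in `gallery_dl/__init__.py` further down this diff, while the target URL and the filename format string are hypothetical examples, not values from this release:

```python
# Sketch: inject path and HTTP-header metadata into format strings.
# Option names ("path-metadata", "http-metadata") follow the man-page
# sections above; the URL and filename format are made-up examples.
from gallery_dl import config, job

config.set(("extractor",), "path-metadata", "gdl_path")
config.set(("extractor",), "http-metadata", "gdl_http")

# With these set, "{gdl_path.filename}" resolves to the current file's
# filename, "{gdl_http[Last-Modified]}" to the raw response header, and
# "{gdl_http[date]}" to its parsed datetime form.
config.set(("extractor",), "filename",
           "{gdl_http[date]:%Y%m%d}_{filename}.{extension}")

job.DownloadJob("https://example.org/gallery/123").run()
```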
+.SS extractor.*.http-metadata +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]null\f[] + +.IP "Description:" 4 +Insert an \f[I]object\f[] containing a file's HTTP headers and +\f[I]filename\f[], \f[I]extension\f[], and \f[I]date\f[] parsed from them +into metadata dictionaries as the given name. + +For example, setting this option to \f[I]"gdl_http"\f[] would make it possible +to access the current file's \f[I]Last-Modified\f[] header as \f[I]"{gdl_http[Last-Modified]}"\f[] +and its parsed form as \f[I]"{gdl_http[date]}"\f[]. + + .SS extractor.*.category-transfer .IP "Type:" 6 \f[I]bool\f[] @@ -1718,17 +1739,15 @@ for details) \f[I]string\f[] .IP "Default:" 9 -\f[I]"auto"\f[] +\f[I]"rest"\f[] .IP "Description:" 4 Selects which API endpoints to use. .br -* \f[I]"rest"\f[]: REST API - higher-resolution media, only usable when logged in +* \f[I]"rest"\f[]: REST API - higher-resolution media .br -* \f[I]"graphql"\f[]: GraphQL API - lower-resolution media, partially accessible when not logged in -.br -* \f[I]"auto"\f[]: Use REST API when logged in, GraphQL API otherwise +* \f[I]"graphql"\f[]: GraphQL API - lower-resolution media .SS extractor.instagram.include @@ -1748,7 +1767,6 @@ when processing a user profile. Possible values are \f[I]"posts"\f[], \f[I]"reels"\f[], -\f[I]"channel"\f[] \f[I]"tagged"\f[], \f[I]"stories"\f[], \f[I]"highlights"\f[], @@ -3105,6 +3123,32 @@ to use your account's browsing settings and default filters when searching. See https://wallhaven.cc/help/api for more information. +.SS extractor.wallhaven.include +.IP "Type:" 6 +.br +* \f[I]string\f[] +.br +* \f[I]list\f[] of \f[I]strings\f[] + +.IP "Default:" 9 +\f[I]"uploads"\f[] + +.IP "Example:" 4 +.br +* "uploads,collections" +.br +* ["uploads", "collections"] + +.IP "Description:" 4 +A (comma-separated) list of subcategories to include +when processing a user profile. + +Possible values are +\f[I]"uploads"\f[], \f[I]"collections"\f[]. + +It is possible to use \f[I]"all"\f[] instead of listing all values separately. + + .SS extractor.wallhaven.metadata .IP "Type:" 6 \f[I]bool\f[] @@ -3413,7 +3457,7 @@ Minimum/Maximum allowed file size in bytes. Any file smaller/larger than this limit will not be downloaded. Possible values are valid integer or floating-point numbers -optionally followed by one of \f[I]k\f[], \f[I]m\f[]. \f[I]g\f[], \f[I]t\f[] or \f[I]p\f[]. +optionally followed by one of \f[I]k\f[], \f[I]m\f[]. \f[I]g\f[], \f[I]t\f[], or \f[I]p\f[]. These suffixes are case-insensitive. @@ -3491,7 +3535,7 @@ Set this option to \f[I]null\f[] to disable this indicator. Maximum download rate in bytes per second. Possible values are valid integer or floating-point numbers -optionally followed by one of \f[I]k\f[], \f[I]m\f[]. \f[I]g\f[], \f[I]t\f[] or \f[I]p\f[]. +optionally followed by one of \f[I]k\f[], \f[I]m\f[]. \f[I]g\f[], \f[I]t\f[], or \f[I]p\f[]. These suffixes are case-insensitive. @@ -3559,6 +3603,24 @@ of a file called \f[I]example.png\f[] from \f[I]png\f[] to \f[I]jpg\f[] when sai contains JPEG/JFIF data. +.SS downloader.http.chunk-size +.IP "Type:" 6 +\f[I]integer\f[] or \f[I]string\f[] + +.IP "Default:" 9 +\f[I]32768\f[] + +.IP "Example:" 4 +"50k", "0.8M" + +.IP "Description:" 4 +Number of bytes per downloaded chunk. + +Possible values are integer numbers +optionally followed by one of \f[I]k\f[], \f[I]m\f[]. \f[I]g\f[], \f[I]t\f[], or \f[I]p\f[]. +These suffixes are case-insensitive. + + .SS downloader.http.headers .IP "Type:" 6 \f[I]object\f[] @@ -4009,6 +4071,9 @@ Selects how to process metadata. 
* \f[I]"json"\f[]: write metadata using \f[I]json.dump() <https://docs.python.org/3/library/json.html#json.dump>\f[] .br +* \f[I]"jsonl"\f[]: write metadata in \f[I]JSON Lines +<https://jsonlines.org/>\f[] format +.br * \f[I]"tags"\f[]: write \f[I]tags\f[] separated by newlines .br * \f[I]"custom"\f[]: write the result of applying \f[I]metadata.content-format\f[] @@ -4112,6 +4177,8 @@ When skipping a file download \f[I]post\f[] When starting to download all files of a post, e.g. a Tweet on Twitter or a post on Patreon. +\f[I]post-after\f[] +After downloading all files of a post .SS metadata.fields @@ -4163,6 +4230,48 @@ Custom format string to build the content of metadata files with. Note: Only applies for \f[I]"mode": "custom"\f[]. +.SS metadata.open +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Defsult:" 4 +\f[I]"w"\f[] + +.IP "Description:" 4 +The \f[I]mode\f[] in which metadata files get opened. + +For example, +use \f[I]"a"\f[] to append to a file's content +or \f[I]"w"\f[] to truncate it. + +See the \f[I]mode\f[] parameter of \f[I]open()\f[] for further details. + + +.SS metadata.private +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Include private fields, +i.e. fields whose name starts with an underscore. + + +.SS metadata.encoding +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Defsult:" 4 +\f[I]"utf-8"\f[] + +.IP "Description:" 4 +Name of the encoding used to encode a file's content. + +See the \f[I]encoding\f[] parameter of \f[I]open()\f[] for further details. + + .SS metadata.archive .IP "Type:" 6 \f[I]Path\f[] diff --git a/docs/gallery-dl-example.conf b/docs/gallery-dl-example.conf index 279aeef..92509b5 100644 --- a/docs/gallery-dl-example.conf +++ b/docs/gallery-dl-example.conf @@ -210,6 +210,19 @@ "text-tweets": true }, + "ytdl": + { + "#": "enable 'ytdl' extractor", + "#": "i.e. 
invoke ytdl on all otherwise unsupported input URLs", + "enabled": true, + + "#": "use yt-dlp instead of youtube-dl", + "module": "yt_dlp", + + "#": "load ytdl options from config file", + "config-file": "~/yt-dlp.conf" + }, + "mastodon": { "#": "add 'tabletop.social' as recognized mastodon instance", diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 1fcbb3b..becf599 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -159,9 +159,8 @@ }, "instagram": { - "username": null, - "password": null, - "api": "auto", + "api": "rest", + "cookies": null, "include": "posts", "sleep-request": [6.0, 12.0], "videos": true @@ -330,7 +329,8 @@ "wallhaven": { "api-key": null, - "metadata": false + "metadata": false, + "include": "uploads" }, "weasyl": { @@ -381,6 +381,7 @@ "http": { "adjust-extensions": true, + "chunk-size": 32768, "headers": null }, diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO index d00e803..0d42bce 100644 --- a/gallery_dl.egg-info/PKG-INFO +++ b/gallery_dl.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery-dl -Version: 1.23.5 +Version: 1.24.0 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Download-URL: https://github.com/mikf/gallery-dl/releases/latest @@ -14,16 +14,20 @@ Classifier: Development Status :: 5 - Production/Stable Classifier: Environment :: Console Classifier: Intended Audience :: End Users/Desktop Classifier: License :: OSI Approved :: GNU General Public License v2 (GPLv2) -Classifier: Operating System :: Microsoft :: Windows -Classifier: Operating System :: POSIX -Classifier: Operating System :: MacOS +Classifier: Operating System :: OS Independent +Classifier: Programming Language :: Python +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3 :: Only Classifier: Programming Language :: Python :: 3.4 Classifier: Programming Language :: Python :: 3.5 Classifier: Programming Language :: Python :: 3.6 Classifier: Programming Language :: Python :: 3.7 Classifier: Programming Language :: Python :: 3.8 Classifier: Programming Language :: Python :: 3.9 -Classifier: Programming Language :: Python :: 3 :: Only +Classifier: Programming Language :: Python :: 3.10 +Classifier: Programming Language :: Python :: 3.11 +Classifier: Programming Language :: Python :: Implementation :: CPython +Classifier: Programming Language :: Python :: Implementation :: PyPy Classifier: Topic :: Internet :: WWW/HTTP Classifier: Topic :: Multimedia :: Graphics Classifier: Topic :: Utilities @@ -99,8 +103,8 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.5/gallery-dl.exe>`__ -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.5/gallery-dl.bin>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.24.0/gallery-dl.exe>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.24.0/gallery-dl.bin>`__ | Executables build from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml @@ -243,7 +247,6 @@ and optional for ``idolcomplex``, ``imgbb``, ``inkbunny``, -``instagram``, ``mangadex``, ``mangoxo``, ``pillowfort``, diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt index 3fa2176..72a07ab 100644 --- 
a/gallery_dl.egg-info/SOURCES.txt +++ b/gallery_dl.egg-info/SOURCES.txt @@ -137,9 +137,9 @@ gallery_dl/extractor/nana.py gallery_dl/extractor/naver.py gallery_dl/extractor/naverwebtoon.py gallery_dl/extractor/newgrounds.py -gallery_dl/extractor/ngomik.py gallery_dl/extractor/nhentai.py gallery_dl/extractor/nijie.py +gallery_dl/extractor/nitter.py gallery_dl/extractor/nozomi.py gallery_dl/extractor/nsfwalbum.py gallery_dl/extractor/oauth.py @@ -187,6 +187,7 @@ gallery_dl/extractor/tumblrgallery.py gallery_dl/extractor/twibooru.py gallery_dl/extractor/twitter.py gallery_dl/extractor/unsplash.py +gallery_dl/extractor/uploadir.py gallery_dl/extractor/vanillarock.py gallery_dl/extractor/vichan.py gallery_dl/extractor/vk.py diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index b64fa2f..3701d6f 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -118,25 +118,15 @@ def main(): config.set(("output",), "mode", "null") elif args.loglevel <= logging.DEBUG: import platform - import subprocess - import os.path import requests extra = "" if getattr(sys, "frozen", False): extra = " - Executable" else: - try: - out, err = subprocess.Popen( - ("git", "rev-parse", "--short", "HEAD"), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - cwd=os.path.dirname(os.path.abspath(__file__)), - ).communicate() - if out and not err: - extra = " - Git HEAD: " + out.decode().rstrip() - except (OSError, subprocess.SubprocessError): - pass + git_head = util.git_head() + if git_head: + extra = " - Git HEAD: " + git_head log.debug("Version %s%s", __version__, extra) log.debug("Python %s - %s", @@ -148,6 +138,8 @@ def main(): except AttributeError: pass + log.debug("Configuration Files %s", config._files) + if args.list_modules: extractor.modules.append("") sys.stdout.write("\n".join(extractor.modules)) @@ -201,7 +193,8 @@ def main(): if sys.stdin: urls += util.parse_inputfile(sys.stdin, log) else: - log.warning("input file: stdin is not readable") + log.warning( + "input file: stdin is not readable") else: with open(inputfile, encoding="utf-8") as file: urls += util.parse_inputfile(file, log) @@ -235,7 +228,7 @@ def main(): except exception.TerminateExtraction: pass except exception.NoExtractorError: - log.error("No suitable extractor found for '%s'", url) + log.error("Unsupported URL '%s'", url) retval |= 64 return retval diff --git a/gallery_dl/config.py b/gallery_dl/config.py index 953b1b1..0f2d1f1 100644 --- a/gallery_dl/config.py +++ b/gallery_dl/config.py @@ -21,6 +21,7 @@ log = logging.getLogger("config") # internals _config = {} +_files = [] if util.WINDOWS: _default_configs = [ @@ -61,8 +62,8 @@ def load(files=None, strict=False, fmt="json"): else: parsefunc = json.load - for path in files or _default_configs: - path = util.expand_path(path) + for pathfmt in files or _default_configs: + path = util.expand_path(pathfmt) try: with open(path, encoding="utf-8") as file: confdict = parsefunc(file) @@ -79,6 +80,7 @@ def load(files=None, strict=False, fmt="json"): _config.update(confdict) else: util.combine_dict(_config, confdict) + _files.append(pathfmt) def clear(): diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py index 579f755..6f9a92d 100644 --- a/gallery_dl/cookies.py +++ b/gallery_dl/cookies.py @@ -263,7 +263,7 @@ def _chrome_cookies_database(profile, config): path = _find_most_recently_used_file(search_root, "Cookies") if path is None: - raise FileNotFoundError("Unable tp find {} cookies database in " + raise FileNotFoundError("Unable to find {} cookies database in " 
"'{}'".format(config["browser"], search_root)) logger.debug("Extracting cookies from %s", path) diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 5622462..26eb7b5 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -27,10 +27,11 @@ class HttpDownloader(DownloaderBase): def __init__(self, job): DownloaderBase.__init__(self, job) extractor = job.extractor - self.chunk_size = 16384 self.downloading = False self.adjust_extension = self.config("adjust-extensions", True) + self.chunk_size = self.config("chunk-size", 32768) + self.metadata = extractor.config("http-metadata") self.progress = self.config("progress", 3.0) self.headers = self.config("headers") self.minsize = self.config("filesize-min") @@ -55,6 +56,13 @@ class HttpDownloader(DownloaderBase): self.log.warning( "Invalid maximum file size (%r)", self.maxsize) self.maxsize = maxsize + if isinstance(self.chunk_size, str): + chunk_size = text.parse_bytes(self.chunk_size) + if not chunk_size: + self.log.warning( + "Invalid chunk size (%r)", self.chunk_size) + chunk_size = 32768 + self.chunk_size = chunk_size if self.rate: rate = text.parse_bytes(self.rate) if rate: @@ -83,11 +91,12 @@ class HttpDownloader(DownloaderBase): tries = 0 msg = "" + metadata = self.metadata kwdict = pathfmt.kwdict adjust_extension = kwdict.get( "_http_adjust_extension", self.adjust_extension) - if self.part: + if self.part and not metadata: pathfmt.part_enable(self.partdir) while True: @@ -164,13 +173,6 @@ class HttpDownloader(DownloaderBase): self.log.warning("Invalid response") return False - # set missing filename extension from MIME type - if not pathfmt.extension: - pathfmt.set_extension(self._find_extension(response)) - if pathfmt.exists(): - pathfmt.temppath = "" - return True - # check file size size = text.parse_int(size, None) if size is not None: @@ -185,11 +187,33 @@ class HttpDownloader(DownloaderBase): size, self.maxsize) return False + build_path = False + + # set missing filename extension from MIME type + if not pathfmt.extension: + pathfmt.set_extension(self._find_extension(response)) + build_path = True + + # set metadata from HTTP headers + if metadata: + kwdict[metadata] = util.extract_headers(response) + build_path = True + + # build and check file path + if build_path: + pathfmt.build_path() + if pathfmt.exists(): + pathfmt.temppath = "" + return True + if self.part and metadata: + pathfmt.part_enable(self.partdir) + metadata = False + content = response.iter_content(self.chunk_size) # check filename extension against file header if adjust_extension and not offset and \ - pathfmt.extension in FILE_SIGNATURES: + pathfmt.extension in SIGNATURE_CHECKS: try: file_header = next( content if response.raw.chunked @@ -220,7 +244,7 @@ class HttpDownloader(DownloaderBase): offset += len(file_header) elif offset: if adjust_extension and \ - pathfmt.extension in FILE_SIGNATURES: + pathfmt.extension in SIGNATURE_CHECKS: self._adjust_extension(pathfmt, fp.read(16)) fp.seek(offset) @@ -250,42 +274,38 @@ class HttpDownloader(DownloaderBase): return True @staticmethod - def receive(fp, content, bytes_total, bytes_downloaded): + def receive(fp, content, bytes_total, bytes_start): write = fp.write for data in content: write(data) - def _receive_rate(self, fp, content, bytes_total, bytes_downloaded): + def _receive_rate(self, fp, content, bytes_total, bytes_start): rate = self.rate - progress = self.progress - bytes_start = bytes_downloaded write = fp.write - t1 = tstart = time.time() + progress = 
self.progress + + bytes_downloaded = 0 + time_start = time.time() for data in content: - write(data) + time_current = time.time() + time_elapsed = time_current - time_start + bytes_downloaded += len(data) - t2 = time.time() # current time - elapsed = t2 - t1 # elapsed time - num_bytes = len(data) + write(data) if progress is not None: - bytes_downloaded += num_bytes - tdiff = t2 - tstart - if tdiff >= progress: + if time_elapsed >= progress: self.out.progress( - bytes_total, bytes_downloaded, - int((bytes_downloaded - bytes_start) / tdiff), + bytes_total, + bytes_start + bytes_downloaded, + int(bytes_downloaded / time_elapsed), ) if rate: - expected = num_bytes / rate # expected elapsed time - if elapsed < expected: - # sleep if less time elapsed than expected - time.sleep(expected - elapsed) - t2 = time.time() - - t1 = t2 + time_expected = bytes_downloaded / rate + if time_expected > time_elapsed: + time.sleep(time_expected - time_elapsed) def _find_extension(self, response): """Get filename extension from MIME type""" @@ -308,11 +328,11 @@ class HttpDownloader(DownloaderBase): @staticmethod def _adjust_extension(pathfmt, file_header): """Check filename extension against file header""" - sig = FILE_SIGNATURES[pathfmt.extension] - if not file_header.startswith(sig): - for ext, sig in FILE_SIGNATURES.items(): - if file_header.startswith(sig): + if not SIGNATURE_CHECKS[pathfmt.extension](file_header): + for ext, check in SIGNATURE_CHECKS.items(): + if check(file_header): pathfmt.set_extension(ext) + pathfmt.build_path() return True return False @@ -326,6 +346,7 @@ MIME_TYPES = { "image/x-bmp" : "bmp", "image/x-ms-bmp": "bmp", "image/webp" : "webp", + "image/avif" : "avif", "image/svg+xml" : "svg", "image/ico" : "ico", "image/icon" : "ico", @@ -362,27 +383,33 @@ MIME_TYPES = { } # https://en.wikipedia.org/wiki/List_of_file_signatures -FILE_SIGNATURES = { - "jpg" : b"\xFF\xD8\xFF", - "png" : b"\x89PNG\r\n\x1A\n", - "gif" : (b"GIF87a", b"GIF89a"), - "bmp" : b"BM", - "webp": b"RIFF", - "svg" : b"<?xml", - "ico" : b"\x00\x00\x01\x00", - "cur" : b"\x00\x00\x02\x00", - "psd" : b"8BPS", - "webm": b"\x1A\x45\xDF\xA3", - "ogg" : b"OggS", - "wav" : b"RIFF", - "mp3" : (b"\xFF\xFB", b"\xFF\xF3", b"\xFF\xF2", b"ID3"), - "zip" : (b"PK\x03\x04", b"PK\x05\x06", b"PK\x07\x08"), - "rar" : b"\x52\x61\x72\x21\x1A\x07", - "7z" : b"\x37\x7A\xBC\xAF\x27\x1C", - "pdf" : b"%PDF-", - "swf" : (b"CWS", b"FWS"), +SIGNATURE_CHECKS = { + "jpg" : lambda s: s[0:3] == b"\xFF\xD8\xFF", + "png" : lambda s: s[0:8] == b"\x89PNG\r\n\x1A\n", + "gif" : lambda s: s[0:6] in (b"GIF87a", b"GIF89a"), + "bmp" : lambda s: s[0:2] == b"BM", + "webp": lambda s: (s[0:4] == b"RIFF" and + s[8:12] == b"WEBP"), + "avif": lambda s: s[4:11] == b"ftypavi" and s[11] in b"fs", + "svg" : lambda s: s[0:5] == b"<?xml", + "ico" : lambda s: s[0:4] == b"\x00\x00\x01\x00", + "cur" : lambda s: s[0:4] == b"\x00\x00\x02\x00", + "psd" : lambda s: s[0:4] == b"8BPS", + "mp4" : lambda s: (s[4:8] == b"ftyp" and s[8:11] in ( + b"mp4", b"avc", b"iso", b"M4V")), + "webm": lambda s: s[0:4] == b"\x1A\x45\xDF\xA3", + "ogg" : lambda s: s[0:4] == b"OggS", + "wav" : lambda s: (s[0:4] == b"RIFF" and + s[8:12] == b"WAVE"), + "mp3" : lambda s: (s[0:3] == b"ID3" or + s[0:2] in (b"\xFF\xFB", b"\xFF\xF3", b"\xFF\xF2")), + "zip" : lambda s: s[0:4] in (b"PK\x03\x04", b"PK\x05\x06", b"PK\x07\x08"), + "rar" : lambda s: s[0:6] == b"Rar!\x1A\x07", + "7z" : lambda s: s[0:6] == b"\x37\x7A\xBC\xAF\x27\x1C", + "pdf" : lambda s: s[0:5] == b"%PDF-", + "swf" : lambda s: s[0:3] in (b"CWS", 
b"FWS"), # check 'bin' files against all other file signatures - "bin" : b"\x00\x00\x00\x00\x00\x00\x00\x00", + "bin" : lambda s: False, } __downloader__ = HttpDownloader diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py index efa957b..c44ea0a 100644 --- a/gallery_dl/downloader/ytdl.py +++ b/gallery_dl/downloader/ytdl.py @@ -98,6 +98,7 @@ class YoutubeDLDownloader(DownloaderBase): pathfmt.realdirectory + filename) else: pathfmt.set_extension(info_dict["ext"]) + pathfmt.build_path() if pathfmt.exists(): pathfmt.temppath = "" @@ -118,6 +119,7 @@ class YoutubeDLDownloader(DownloaderBase): def _download_playlist(self, ytdl_instance, pathfmt, info_dict): pathfmt.set_extension("%(playlist_index)s.%(ext)s") + pathfmt.build_path() self._set_outtmpl(ytdl_instance, pathfmt.realpath) for entry in info_dict["entries"]: diff --git a/gallery_dl/extractor/2chan.py b/gallery_dl/extractor/2chan.py index f5125ee..92ea6ca 100644 --- a/gallery_dl/extractor/2chan.py +++ b/gallery_dl/extractor/2chan.py @@ -60,8 +60,8 @@ class _2chanThreadExtractor(Extractor): def metadata(self, page): """Collect metadata for extractor-job""" - title = text.extract(page, "<title>", "</title>")[0] - title, _, boardname = title.rpartition(" - ") + title, _, boardname = text.extr( + page, "<title>", "</title>").rpartition(" - ") return { "server": self.server, "title": title, @@ -72,8 +72,8 @@ class _2chanThreadExtractor(Extractor): def posts(self, page): """Build a list of all post-objects""" - page = text.extract( - page, '<div class="thre"', '<div style="clear:left"></div>')[0] + page = text.extr( + page, '<div class="thre"', '<div style="clear:left"></div>') return [ self.parse(post) for post in page.split('<table border=0>') @@ -84,7 +84,7 @@ class _2chanThreadExtractor(Extractor): data = self._extract_post(post) if data["name"]: data["name"] = data["name"].strip() - path = text.extract(post, '<a href="/', '"')[0] + path = text.extr(post, '<a href="/', '"') if path and not path.startswith("bin/jump"): self._extract_image(post, data) data["tim"], _, data["extension"] = data["filename"].partition(".") diff --git a/gallery_dl/extractor/35photo.py b/gallery_dl/extractor/35photo.py index 6a40d41..28acc3d 100644 --- a/gallery_dl/extractor/35photo.py +++ b/gallery_dl/extractor/35photo.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2021 Mike Fährmann +# Copyright 2019-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -124,7 +124,7 @@ class _35photoUserExtractor(_35photoExtractor): def metadata(self): url = "{}/{}/".format(self.root, self.user) page = self.request(url).text - self.user_id = text.parse_int(text.extract(page, "/user_", ".xml")[0]) + self.user_id = text.parse_int(text.extr(page, "/user_", ".xml")) return { "user": self.user, "user_id": self.user_id, @@ -189,10 +189,10 @@ class _35photoGenreExtractor(_35photoExtractor): def metadata(self): url = "{}/genre_{}{}".format(self.root, self.genre_id, self.new or "/") page = self.request(url).text - self.photo_ids = self._photo_ids(text.extract( - page, ' class="photo', '\n')[0]) + self.photo_ids = self._photo_ids(text.extr( + page, ' class="photo', '\n')) return { - "genre": text.extract(page, " genre - ", ". ")[0], + "genre": text.extr(page, " genre - ", ". 
"), "genre_id": text.parse_int(self.genre_id), } diff --git a/gallery_dl/extractor/8muses.py b/gallery_dl/extractor/8muses.py index fe57412..fed4991 100644 --- a/gallery_dl/extractor/8muses.py +++ b/gallery_dl/extractor/8muses.py @@ -76,9 +76,9 @@ class _8musesAlbumExtractor(Extractor): url = self.root + self.path + self.params while True: - data = self._unobfuscate(text.extract( + data = self._unobfuscate(text.extr( self.request(url).text, - 'id="ractive-public" type="text/plain">', '</script>')[0]) + 'id="ractive-public" type="text/plain">', '</script>')) images = data.get("pictures") if images: diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 9e0340a..a563bfd 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -95,9 +95,9 @@ modules = [ "naver", "naverwebtoon", "newgrounds", - "ngomik", "nhentai", "nijie", + "nitter", "nozomi", "nsfwalbum", "paheal", @@ -141,6 +141,7 @@ modules = [ "twibooru", "twitter", "unsplash", + "uploadir", "vanillarock", "vichan", "vk", diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py index 14d1e6b..da2d8f2 100644 --- a/gallery_dl/extractor/artstation.py +++ b/gallery_dl/extractor/artstation.py @@ -41,8 +41,8 @@ class ArtstationExtractor(Extractor): if adict["has_embedded_player"] and self.external: player = adict["player_embedded"] - url = text.extract(player, 'src="', '"')[0] or \ - text.extract(player, "src='", "'")[0] + url = (text.extr(player, 'src="', '"') or + text.extr(player, "src='", "'")) if url and not url.startswith(self.root): asset["extension"] = None yield Message.Url, "ytdl:" + url, asset diff --git a/gallery_dl/extractor/aryion.py b/gallery_dl/extractor/aryion.py index fa590b9..6f01572 100644 --- a/gallery_dl/extractor/aryion.py +++ b/gallery_dl/extractor/aryion.py @@ -128,8 +128,7 @@ class AryionExtractor(Extractor): # get filename from 'Content-Disposition' header cdis = headers["content-disposition"] - fname, _, ext = text.extract( - cdis, 'filename="', '"')[0].rpartition(".") + fname, _, ext = text.extr(cdis, 'filename="', '"').rpartition(".") if not fname: fname, ext = ext, fname diff --git a/gallery_dl/extractor/bbc.py b/gallery_dl/extractor/bbc.py index 17b5f52..1b49d6a 100644 --- a/gallery_dl/extractor/bbc.py +++ b/gallery_dl/extractor/bbc.py @@ -38,8 +38,8 @@ class BbcGalleryExtractor(GalleryExtractor): ) def metadata(self, page): - data = json.loads(text.extract( - page, '<script type="application/ld+json">', '</script>')[0]) + data = json.loads(text.extr( + page, '<script type="application/ld+json">', '</script>')) return { "programme": self.gallery_url.split("/")[4], "path": list(util.unique_sequence( diff --git a/gallery_dl/extractor/bcy.py b/gallery_dl/extractor/bcy.py index 7982881..4eb446d 100644 --- a/gallery_dl/extractor/bcy.py +++ b/gallery_dl/extractor/bcy.py @@ -97,7 +97,7 @@ class BcyExtractor(Extractor): url = "{}/item/detail/{}".format(self.root, post_id) page = self.request(url, notfound="post").text return json.loads( - text.extract(page, 'JSON.parse("', '");')[0] + text.extr(page, 'JSON.parse("', '");') .replace('\\\\u002F', '/') .replace('\\"', '"') )["detail"] diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py index 994a701..cf332ac 100644 --- a/gallery_dl/extractor/behance.py +++ b/gallery_dl/extractor/behance.py @@ -119,8 +119,8 @@ class BehanceGalleryExtractor(BehanceExtractor): } page = self.request(url, cookies=cookies).text - data = json.loads(text.extract( - page, 
'id="beconfig-store_state">', '</script>')[0]) + data = json.loads(text.extr( + page, 'id="beconfig-store_state">', '</script>')) return self._update(data["project"]["project"]) def get_images(self, data): @@ -137,7 +137,7 @@ class BehanceGalleryExtractor(BehanceExtractor): elif mtype == "video": page = self.request(module["src"]).text - url = text.extract(page, '<source src="', '"')[0] + url = text.extr(page, '<source src="', '"') if text.ext_from_url(url) == "m3u8": url = "ytdl:" + url append((url, module)) @@ -150,8 +150,7 @@ class BehanceGalleryExtractor(BehanceExtractor): elif mtype == "embed": embed = module.get("original_embed") or module.get("embed") if embed: - url = "ytdl:" + text.extract(embed, 'src="', '"')[0] - append((url, module)) + append(("ytdl:" + text.extr(embed, 'src="', '"'), module)) return result diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py index 232f3ea..8a1a42e 100644 --- a/gallery_dl/extractor/blogger.py +++ b/gallery_dl/extractor/blogger.py @@ -61,8 +61,8 @@ class BloggerExtractor(Extractor): page = self.request(post["url"]).text for url in findall_video(page): page = self.request(url).text - video_config = json.loads(text.extract( - page, 'var VIDEO_CONFIG =', '\n')[0]) + video_config = json.loads(text.extr( + page, 'var VIDEO_CONFIG =', '\n')) files.append(max( video_config["streams"], key=lambda x: x["format_id"], diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index 12d98b1..0d7d13d 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -25,6 +25,7 @@ class BooruExtractor(BaseExtractor): data = self.metadata() tags = self.config("tags", False) notes = self.config("notes", False) + fetch_html = tags or notes for post in self.posts(): try: @@ -36,11 +37,13 @@ class BooruExtractor(BaseExtractor): "(md5: %s)", post.get("id"), post.get("md5")) continue - page_html = None - if tags: - page_html = self._extended_tags(post) - if notes: - self._notes(post, page_html) + if fetch_html: + html = self._html(post) + if tags: + self._tags(post, html) + if notes: + self._notes(post, html) + text.nameext_from_url(url, post) post.update(data) self._prepare(post) @@ -67,16 +70,13 @@ class BooruExtractor(BaseExtractor): _file_url = operator.itemgetter("file_url") def _prepare(self, post): - """Prepare the 'post's metadata""" + """Prepare a 'post's metadata""" - def _extended_tags(self, post, page=None): - """Generate extended tag information + def _html(self, post): + """Return HTML content of a post""" - The return value of this function will be - passed to the _notes function as the page parameter. - This makes it possible to reuse the same HTML both for - extracting tags and notes. 
- """ + def _tags(self, post, page): + """Extract extended tag metadata""" - def _notes(self, post, page=None): - """Generate information about notes""" + def _notes(self, post, page): + """Extract notes metadata""" diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 2502411..dde9cf8 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -68,9 +68,9 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): url = self.root + "/a/" + self.album_id try: - data = json.loads(text.extract( + data = json.loads(text.extr( self.request(url).text, - 'id="__NEXT_DATA__" type="application/json">', '<')[0]) + 'id="__NEXT_DATA__" type="application/json">', '<')) album = data["props"]["pageProps"]["album"] files = album["files"] except Exception as exc: diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index e304717..4352aa7 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -20,7 +20,7 @@ import requests import threading from requests.adapters import HTTPAdapter from .message import Message -from .. import config, text, util, exception +from .. import config, text, util, cache, exception class Extractor(): @@ -149,14 +149,13 @@ class Extractor(): msg = "'{} {}' for '{}'".format(code, response.reason, url) server = response.headers.get("Server") - if server and server.startswith("cloudflare"): - if code == 503 and \ - (b"_cf_chl_opt" in response.content or - b"jschl-answer" in response.content): + if server and server.startswith("cloudflare") and \ + code in (403, 503): + content = response.content + if b"_cf_chl_opt" in content or b"jschl-answer" in content: self.log.warning("Cloudflare IUAM challenge") break - if code == 403 and \ - b'name="captcha-bypass"' in response.content: + if b'name="captcha-bypass"' in content: self.log.warning("Cloudflare CAPTCHA") break if code < 500 and code != 429 and code != 430: @@ -263,9 +262,13 @@ class Extractor(): ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1) ssl_ciphers = SSL_CIPHERS[browser] else: - headers["User-Agent"] = self.config("user-agent", ( - "Mozilla/5.0 (Windows NT 10.0; Win64; x64; " - "rv:102.0) Gecko/20100101 Firefox/102.0")) + useragent = self.config("user-agent") + if useragent is None: + useragent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; " + "rv:102.0) Gecko/20100101 Firefox/102.0") + elif useragent == "browser": + useragent = _browser_useragent() + headers["User-Agent"] = useragent headers["Accept"] = "*/*" headers["Accept-Language"] = "en-US,en;q=0.5" @@ -725,6 +728,36 @@ def _build_requests_adapter(ssl_options, ssl_ciphers, source_address): return adapter +@cache.cache(maxage=86400) +def _browser_useragent(): + """Get User-Agent header from default browser""" + import webbrowser + import socket + + server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + server.bind(("127.0.0.1", 6414)) + server.listen(1) + + webbrowser.open("http://127.0.0.1:6414/user-agent") + + client = server.accept()[0] + server.close() + + for line in client.recv(1024).split(b"\r\n"): + key, _, value = line.partition(b":") + if key.strip().lower() == b"user-agent": + useragent = value.strip() + break + else: + useragent = b"" + + client.send(b"HTTP/1.1 200 OK\r\n\r\n" + useragent) + client.close() + + return useragent.decode() + + _adapter_cache = {} _browser_cookies = {} diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index cb2aa24..45beddf 100644 --- 
a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -603,22 +603,22 @@ class DeviantartStashExtractor(DeviantartExtractor): page = self._limited_request(url).text if stash_id[0] == "0": - uuid = text.extract(page, '//deviation/', '"')[0] + uuid = text.extr(page, '//deviation/', '"') if uuid: deviation = self.api.deviation(uuid) - deviation["index"] = text.parse_int(text.extract( - page, 'gmi-deviationid="', '"')[0]) + deviation["index"] = text.parse_int(text.extr( + page, 'gmi-deviationid="', '"')) yield deviation return for item in text.extract_iter( page, 'class="stash-thumb-container', '</div>'): - url = text.extract(item, '<a href="', '"')[0] + url = text.extr(item, '<a href="', '"') if url: stash_id = url.rpartition("/")[2] else: - stash_id = text.extract(item, 'gmi-stashid="', '"')[0] + stash_id = text.extr(item, 'gmi-stashid="', '"') stash_id = "2" + util.bencode(text.parse_int( stash_id), "0123456789abcdefghijklmnopqrstuvwxyz") @@ -960,9 +960,15 @@ class DeviantartScrapsExtractor(DeviantartExtractor): ) cookiedomain = ".deviantart.com" cookienames = ("auth", "auth_secure", "userinfo") + _warning = True def deviations(self): eclipse_api = DeviantartEclipseAPI(self) + if self._warning: + DeviantartScrapsExtractor._warning = False + if not self._check_cookies(self.cookienames): + self.log.warning( + "No session cookies set: Unable to fetch mature scraps.") for obj in eclipse_api.gallery_scraps(self.user, self.offset): deviation = obj["deviation"] @@ -1478,8 +1484,8 @@ class DeviantartEclipseAPI(): def _fetch_csrf_token(self, page=None): if page is None: page = self.request(self.extractor.root + "/").text - self.csrf_token = token = text.extract( - page, "window.__CSRF_TOKEN__ = '", "'")[0] + self.csrf_token = token = text.extr( + page, "window.__CSRF_TOKEN__ = '", "'") return token diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py index e5c5c01..d78f25b 100644 --- a/gallery_dl/extractor/dynastyscans.py +++ b/gallery_dl/extractor/dynastyscans.py @@ -30,7 +30,7 @@ class DynastyscansBase(): src = extr("class='btn-group'>", "</div>") url = extr(' src="', '"') - src = text.extract(src, 'href="', '"')[0] if "Source<" in src else "" + src = text.extr(src, 'href="', '"') if "Source<" in src else "" return { "url" : self.root + url, @@ -75,7 +75,7 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor): "title" : text.unescape(match.group(4) or ""), "author" : text.remove_html(author), "group" : (text.remove_html(group) or - text.extract(group, ' alt="', '"')[0] or ""), + text.extr(group, ' alt="', '"')), "date" : text.parse_datetime(extr( '"icon-calendar"></i> ', '<'), "%b %d, %Y"), "lang" : "en", @@ -83,7 +83,7 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor): } def images(self, page): - data = text.extract(page, "var pages = ", ";\n")[0] + data = text.extr(page, "var pages = ", ";\n") return [ (self.root + img["image"], None) for img in json.loads(data) diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py index 992db97..b4dadc7 100644 --- a/gallery_dl/extractor/erome.py +++ b/gallery_dl/extractor/erome.py @@ -55,8 +55,8 @@ class EromeExtractor(Extractor): yield Message.Directory, data groups = page.split('<div class="media-group"') for data["num"], group in enumerate(util.advance(groups, 1), 1): - url = (text.extract(group, '<source src="', '"')[0] or - text.extract(group, 'data-src="', '"')[0]) + url = (text.extr(group, '<source src="', '"') or + 
text.extr(group, 'data-src="', '"')) if url: yield Message.Url, url, text.nameext_from_url(url, data) diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index e37e81b..a546f68 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -185,7 +185,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): if self.gallery_token: gpage = self._gallery_page() - self.image_token = text.extract(gpage, 'hentai.org/s/', '"')[0] + self.image_token = text.extr(gpage, 'hentai.org/s/', '"') if not self.image_token: self.log.error("Failed to extract initial image token") self.log.debug("Page content:\n%s", gpage) @@ -193,7 +193,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): ipage = self._image_page() else: ipage = self._image_page() - part = text.extract(ipage, 'hentai.org/g/', '"')[0] + part = text.extr(ipage, 'hentai.org/g/', '"') if not part: self.log.error("Failed to extract gallery token") self.log.debug("Page content:\n%s", ipage) @@ -271,8 +271,8 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): } if data["uploader"].startswith("<"): - data["uploader"] = text.unescape(text.extract( - data["uploader"], ">", "<")[0]) + data["uploader"] = text.unescape(text.extr( + data["uploader"], ">", "<")) f = data["favorites"][0] if f == "N": @@ -400,7 +400,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): } page = self.request(url, cookies=cookies).text - current = text.extract(page, "<strong>", "</strong>")[0] + current = text.extr(page, "<strong>", "</strong>") self.log.debug("Image Limits: %s/%s", current, self.limits) self._remaining = self.limits - text.parse_int(current) @@ -473,6 +473,10 @@ class ExhentaiSearchExtractor(ExhentaiExtractor): "pattern": ExhentaiGalleryExtractor.pattern, "range": "1-30", "count": 30, + "keyword": { + "gallery_id": int, + "gallery_token": r"re:^[0-9a-f]{10}$" + }, }), ) @@ -490,26 +494,39 @@ class ExhentaiSearchExtractor(ExhentaiExtractor): self.params = {"f_search": tag, "page": 0} else: self.params = text.parse_query(query) - self.params["page"] = text.parse_int(self.params.get("page")) + if "next" not in self.params: + self.params["page"] = text.parse_int(self.params.get("page")) def items(self): self.login() data = {"_extractor": ExhentaiGalleryExtractor} + search_url = self.search_url + params = self.params while True: last = None - page = self.request(self.search_url, params=self.params).text + page = self.request(search_url, params=params).text for gallery in ExhentaiGalleryExtractor.pattern.finditer(page): url = gallery.group(0) if url == last: continue last = url + data["gallery_id"] = text.parse_int(gallery.group(2)) + data["gallery_token"] = gallery.group(3) yield Message.Queue, url + "/", data - if 'class="ptdd">><' in page or ">No hits found</p>" in page: + next_url = text.extr(page, 'nexturl = "', '"', None) + if next_url is not None: + if not next_url: + return + search_url = next_url + params = None + + elif 'class="ptdd">><' in page or ">No hits found</p>" in page: return - self.params["page"] += 1 + else: + params["page"] += 1 class ExhentaiFavoriteExtractor(ExhentaiSearchExtractor): diff --git a/gallery_dl/extractor/fallenangels.py b/gallery_dl/extractor/fallenangels.py index ab0e0c5..57587b6 100644 --- a/gallery_dl/extractor/fallenangels.py +++ b/gallery_dl/extractor/fallenangels.py @@ -57,7 +57,7 @@ class FallenangelsChapterExtractor(ChapterExtractor): return [ (img["page_image"], None) for img in json.loads( - text.extract(page, "var pages = ", ";")[0] + text.extr(page, "var 
pages = ", ";") ) ] diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index 5e6da5b..4f9a6bf 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -56,7 +56,7 @@ class FoolfuukaExtractor(BaseExtractor): """Resolve a remote media link""" needle = '<meta http-equiv="Refresh" content="0; url=' page = self.request(media["remote_media_link"]).text - return text.extract(page, needle, '"')[0] + return text.extr(page, needle, '"') @staticmethod def _remote_direct(media): diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py index 382cc25..81671ec 100644 --- a/gallery_dl/extractor/foolslide.py +++ b/gallery_dl/extractor/foolslide.py @@ -114,7 +114,7 @@ class FoolslideChapterExtractor(FoolslideExtractor): }) def images(self, page): - return json.loads(text.extract(page, "var pages = ", ";")[0]) + return json.loads(text.extr(page, "var pages = ", ";")) class FoolslideMangaExtractor(FoolslideExtractor): diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py index b63cfc1..cc43cec 100644 --- a/gallery_dl/extractor/furaffinity.py +++ b/gallery_dl/extractor/furaffinity.py @@ -160,7 +160,7 @@ class FuraffinityExtractor(Extractor): while path: page = self.request(self.root + path).text yield from text.extract_iter(page, 'id="sid-', '"') - path = text.extract(page, 'right" href="', '"')[0] + path = text.extr(page, 'right" href="', '"') def _pagination_search(self, query): url = self.root + "/search/" diff --git a/gallery_dl/extractor/fuskator.py b/gallery_dl/extractor/fuskator.py index df55061..d6640f5 100644 --- a/gallery_dl/extractor/fuskator.py +++ b/gallery_dl/extractor/fuskator.py @@ -58,7 +58,7 @@ class FuskatorGalleryExtractor(GalleryExtractor): self.root + "/ajax/gal.aspx", params=params, headers=headers, ).json() - title = text.extract(page, "<title>", "</title>")[0].strip() + title = text.extr(page, "<title>", "</title>").strip() title, _, gallery_id = title.rpartition("#") return { @@ -104,7 +104,7 @@ class FuskatorSearchExtractor(Extractor): page, 'class="pic_pad"><a href="', '"'): yield Message.Queue, self.root + path, data - pages = text.extract(page, 'class="pages"><span>', '>>><')[0] + pages = text.extr(page, 'class="pages"><span>', '>>><') if not pages: return url = self.root + text.rextract(pages, 'href="', '"')[0] diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index a2cf0c0..d8109e1 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -68,6 +68,22 @@ class GelbooruBase(): yield "https://img2.gelbooru.com" + path yield "https://img1.gelbooru.com" + path + def _notes(self, post, page): + notes_data = text.extr(page, '<section id="notes"', '</section>') + if not notes_data: + return + + post["notes"] = notes = [] + extr = text.extract + for note in text.extract_iter(notes_data, '<article', '</article>'): + notes.append({ + "width" : int(extr(note, 'data-width="', '"')[0]), + "height": int(extr(note, 'data-height="', '"')[0]), + "x" : int(extr(note, 'data-x="', '"')[0]), + "y" : int(extr(note, 'data-y="', '"')[0]), + "body" : extr(note, 'data-body="', '"')[0], + }) + class GelbooruTagExtractor(GelbooruBase, gelbooru_v02.GelbooruV02TagExtractor): @@ -182,21 +198,21 @@ class GelbooruPostExtractor(GelbooruBase, "keywords": { "notes": [ { - "height": 553, "body": "Look over this way when you talk~", + "height": 553, "width": 246, "x": 35, - "y": 72 + "y": 72, }, { - "height": 557, "body": "Hey~\nAre 
you listening~?", + "height": 557, "width": 246, "x": 1233, - "y": 109 - } - ] - } + "y": 109, + }, + ], + }, }), ) diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index 8214614..da87b8f 100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -31,6 +31,7 @@ class GelbooruV02Extractor(booru.BooruExtractor): if self.category == "realbooru": self._file_url = self._file_url_realbooru + self._tags = self._tags_realbooru def _api_request(self, params): url = self.api_root + "/index.php?page=dapi&s=post&q=index" @@ -85,55 +86,58 @@ class GelbooruV02Extractor(booru.BooruExtractor): post["date"] = text.parse_datetime( post["created_at"], "%a %b %d %H:%M:%S %z %Y") + def _html(self, post): + return self.request("{}/index.php?page=post&s=view&id={}".format( + self.root, post["id"])).text + + def _tags(self, post, page): + tag_container = (text.extr(page, '<ul id="tag-', '</ul>') or + text.extr(page, '<ul class="tag-', '</ul>')) + if not tag_container: + return + + tags = collections.defaultdict(list) + pattern = re.compile( + r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)", re.S) + for tag_type, tag_name in pattern.findall(tag_container): + tags[tag_type].append(text.unquote(tag_name)) + for key, value in tags.items(): + post["tags_" + key] = " ".join(value) + + def _notes(self, post, page): + note_container = text.extr(page, 'id="note-container"', "<img ") + if not note_container: + return + + post["notes"] = notes = [] + for note in note_container.split('class="note-box"')[1:]: + extr = text.extract_from(note) + notes.append({ + "width" : int(extr("width:", "p")), + "height": int(extr("height:", "p")), + "y" : int(extr("top:", "p")), + "x" : int(extr("left:", "p")), + "id" : int(extr('id="note-body-', '"')), + "body" : text.unescape(text.remove_html(extr(">", "</div>"))), + }) + def _file_url_realbooru(self, post): url = post["file_url"] - if url.count("/") == 5: - md5 = post["md5"] + md5 = post["md5"] + if md5 not in post["preview_url"] or url.count("/") == 5: url = "{}/images/{}/{}/{}.{}".format( self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2]) return url - def _extended_tags(self, post, page=None): - if not page: - url = "{}/index.php?page=post&s=view&id={}".format( - self.root, post["id"]) - page = self.request(url).text - html = text.extract(page, '<ul id="tag-', '</ul>')[0] - if not html: - html = text.extract(page, '<ul class="tag-', '</ul>')[0] - if html: - tags = collections.defaultdict(list) - pattern = re.compile( - r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)", re.S) - for tag_type, tag_name in pattern.findall(html): - tags[tag_type].append(text.unquote(tag_name)) - for key, value in tags.items(): - post["tags_" + key] = " ".join(value) - return page - - def _notes(self, post, page=None): - if not page: - url = "{}/index.php?page=post&s=view&id={}".format( - self.root, post["id"]) - page = self.request(url).text - notes = [] - notes_data = text.extract(page, '<section id="notes"', '</section>')[0] - if not notes_data: - return - - note_iter = text.extract_iter(notes_data, '<article', '</article>') - extr = text.extract - for note_data in note_iter: - note = { - "width": int(extr(note_data, 'data-width="', '"')[0]), - "height": int(extr(note_data, 'data-height="', '"')[0]), - "x": int(extr(note_data, 'data-x="', '"')[0]), - "y": int(extr(note_data, 'data-y="', '"')[0]), - "body": extr(note_data, 'data-body="', '"')[0], - } - notes.append(note) - - post["notes"] = notes + def _tags_realbooru(self, 
post, page): + tag_container = text.extr(page, 'id="tagLink"', '</div>') + tags = collections.defaultdict(list) + pattern = re.compile( + r'<a class="(?:tag-type-)?([^"]+).*?;tags=([^"&]+)') + for tag_type, tag_name in pattern.findall(tag_container): + tags[tag_type].append(text.unquote(tag_name)) + for key, value in tags.items(): + post["tags_" + key] = " ".join(value) INSTANCES = { @@ -310,15 +314,81 @@ class GelbooruV02PostExtractor(GelbooruV02Extractor): archive_fmt = "{id}" pattern = BASE_PATTERN + r"/index\.php\?page=post&s=view&id=(\d+)" test = ( - ("https://rule34.xxx/index.php?page=post&s=view&id=1995545", { - "content": "97e4bbf86c3860be18de384d02d544251afe1d45", - "options": (("tags", True),), + ("https://rule34.xxx/index.php?page=post&s=view&id=863", { + "pattern": r"https://api-cdn\.rule34\.xxx/images" + r"/1/6aafbdb3e22f3f3b412ea2cf53321317a37063f3\.jpg", + "content": ("a43f418aa350039af0d11cae501396a33bbe2201", + "67b516295950867e1c1ab6bc13b35d3b762ed2a3"), + "options": (("tags", True), ("notes", True)), "keyword": { - "tags_artist": "danraku", - "tags_character": "kashima_(kantai_collection)", - "tags_copyright": "kantai_collection", + "tags_artist": "reverse_noise yamu_(reverse_noise)", + "tags_character": "hong_meiling", + "tags_copyright": "touhou", "tags_general": str, - "tags_metadata": str, + "tags_metadata": "censored translated", + "notes": [ + { + "body": "It feels angry, I'm losing myself... " + "It won't calm down!", + "height": 65, + "id": 93586, + "width": 116, + "x": 22, + "y": 333, + }, + { + "body": "REPUTATION OF RAGE", + "height": 272, + "id": 93587, + "width": 199, + "x": 78, + "y": 442, + }, + ], + + }, + }), + ("https://hypnohub.net/index.php?page=post&s=view&id=1439", { + "pattern": r"https://hypnohub\.net/images" + r"/90/24/90245c3c5250c2a8173255d3923a010b\.jpg", + "content": "5987c5d2354f22e5fa9b7ee7ce4a6f7beb8b2b71", + "options": (("tags", True), ("notes", True)), + "keyword": { + "tags_artist": "brokenteapot", + "tags_character": "hsien-ko", + "tags_copyright": "capcom darkstalkers", + "tags_general": str, + "tags_metadata": "dialogue text translated", + "notes": [ + { + "body": "Master Master Master " + "Master Master Master", + "height": 83, + "id": 10577, + "width": 129, + "x": 259, + "y": 20, + }, + { + "body": "Response Response Response " + "Response Response Response", + "height": 86, + "id": 10578, + "width": 125, + "x": 126, + "y": 20, + }, + { + "body": "Obedience Obedience Obedience " + "Obedience Obedience Obedience", + "height": 80, + "id": 10579, + "width": 98, + "x": 20, + "y": 20, + }, + ], + }, }), ("https://safebooru.org/index.php?page=post&s=view&id=1169132", { @@ -336,16 +406,18 @@ class GelbooruV02PostExtractor(GelbooruV02Extractor): "pattern": r"https://realbooru\.com/images/dc/b5" r"/dcb5c0ce9ec0bf74a6930608985f4719\.jpeg", "content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da", + "options": (("tags", True),), + "keyword": { + "tags_general": "1girl blonde blonde_hair blue_eyes cute " + "female female_only looking_at_viewer smile " + "solo solo_female teeth", + "tags_model": "jennifer_lawrence", + }, }), ("https://tbib.org/index.php?page=post&s=view&id=9233957", { "url": "5a6ebe07bfff8e6d27f7c30b5480f27abcb577d2", "content": "1c3831b6fbaa4686e3c79035b5d98460b1c85c43", }), - ("https://hypnohub.net/index.php?page=post&s=view&id=73964", { - "pattern": r"https://hypnohub\.net/images/7a/37" - r"/7a37c0ba372f35767fb10c904a398831\.png", - "content": "02d5f5a8396b621a6efc04c5f8ef1b7225dfc6ee", - }), ) def __init__(self, match): diff 
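
The conversions above (and throughout this update) replace 'text.extract(...)[0]' with the new 'text.extr()' helper. A minimal standalone sketch of its assumed semantics, like extract() but returning only the substring, with "" as the miss default instead of None and no position bookkeeping:

    def extr(txt, begin, end, default=""):
        # return the substring of 'txt' between 'begin' and 'end';
        # on a miss (or txt being None), fall back to 'default'
        try:
            first = txt.index(begin) + len(begin)
            return txt[first:txt.index(end, first)]
        except (ValueError, AttributeError):
            return default

The "" fallback is why the new call sites can chain .strip(), .rpartition(), or .replace() directly, and why the generic.py hunk that follows can drop its 'or ""' guards while still filtering on 'data[k] != ""'.
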
--git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py index 69c07d0..10c7295 100644 --- a/gallery_dl/extractor/generic.py +++ b/gallery_dl/extractor/generic.py @@ -87,25 +87,25 @@ class GenericExtractor(Extractor): """Extract generic webpage metadata, return them in a dict.""" data = {} data['pageurl'] = self.url - data['title'] = text.extract(page, '<title>', "</title>")[0] or "" - data['description'] = text.extract( - page, '<meta name="description" content="', '"')[0] or "" - data['keywords'] = text.extract( - page, '<meta name="keywords" content="', '"')[0] or "" - data['language'] = text.extract( - page, '<meta name="language" content="', '"')[0] or "" - data['name'] = text.extract( - page, '<meta itemprop="name" content="', '"')[0] or "" - data['copyright'] = text.extract( - page, '<meta name="copyright" content="', '"')[0] or "" - data['og_site'] = text.extract( - page, '<meta property="og:site" content="', '"')[0] or "" - data['og_site_name'] = text.extract( - page, '<meta property="og:site_name" content="', '"')[0] or "" - data['og_title'] = text.extract( - page, '<meta property="og:title" content="', '"')[0] or "" - data['og_description'] = text.extract( - page, '<meta property="og:description" content="', '"')[0] or "" + data['title'] = text.extr(page, '<title>', "</title>") + data['description'] = text.extr( + page, '<meta name="description" content="', '"') + data['keywords'] = text.extr( + page, '<meta name="keywords" content="', '"') + data['language'] = text.extr( + page, '<meta name="language" content="', '"') + data['name'] = text.extr( + page, '<meta itemprop="name" content="', '"') + data['copyright'] = text.extr( + page, '<meta name="copyright" content="', '"') + data['og_site'] = text.extr( + page, '<meta property="og:site" content="', '"') + data['og_site_name'] = text.extr( + page, '<meta property="og:site_name" content="', '"') + data['og_title'] = text.extr( + page, '<meta property="og:title" content="', '"') + data['og_description'] = text.extr( + page, '<meta property="og:description" content="', '"') data = {k: text.unescape(data[k]) for k in data if data[k] != ""} diff --git a/gallery_dl/extractor/hentaicosplays.py b/gallery_dl/extractor/hentaicosplays.py index b4f433b..593a846 100644 --- a/gallery_dl/extractor/hentaicosplays.py +++ b/gallery_dl/extractor/hentaicosplays.py @@ -60,7 +60,7 @@ class HentaicosplaysGalleryExtractor(GalleryExtractor): self.session.headers["Referer"] = url def metadata(self, page): - title = text.extract(page, "<title>", "</title>")[0] + title = text.extr(page, "<title>", "</title>") return { "title": text.unescape(title.rpartition(" Story Viewer - ")[0]), "slug" : self.slug, diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py index 0741451..2dfc721 100644 --- a/gallery_dl/extractor/hentaifoundry.py +++ b/gallery_dl/extractor/hentaifoundry.py @@ -156,8 +156,8 @@ class HentaifoundryExtractor(Extractor): "filter_media" : "A", "filter_order" : "date_new", "filter_type" : "0", - "YII_CSRF_TOKEN" : text.unquote(text.extract( - csrf_token, "%22", "%22")[0]), + "YII_CSRF_TOKEN" : text.unquote(text.extr( + csrf_token, "%22", "%22")), } self.request(url, method="POST", data=data) diff --git a/gallery_dl/extractor/hentaihere.py b/gallery_dl/extractor/hentaihere.py index c3e6d76..38ec77c 100644 --- a/gallery_dl/extractor/hentaihere.py +++ b/gallery_dl/extractor/hentaihere.py @@ -30,19 +30,24 @@ class HentaihereChapterExtractor(HentaihereBase, ChapterExtractor): "keyword": 
"0207d20eea3a15d2a8d1496755bdfa49de7cfa9d", }), ("https://hentaihere.com/m/S23048/1.5/1/", { - "author": "Shinozuka Yuuji", - "chapter": 1, - "chapter_id": 80186, - "chapter_minor": ".5", + "pattern": r"https://hentaicdn\.com/hentai" + r"/23048/1\.5/ccdn00\d+\.jpg", "count": 32, - "lang": "en", - "language": "English", - "manga": "High School Slut's Love Consultation", - "manga_id": 23048, - "page": int, - "title": "High School Slut's Love Consultation + " - "Girlfriend [Full Color]", - "type": "Original", + "keyword": { + "author": "Shinozuka Yuuji", + "chapter": 1, + "chapter_id": 80186, + "chapter_minor": ".5", + "count": 32, + "lang": "en", + "language": "English", + "manga": "High School Slut's Love Consultation", + "manga_id": 23048, + "page": int, + "title": "High School Slut's Love Consultation + " + "Girlfriend [Full Color]", + "type": "Original", + }, }), ) @@ -52,8 +57,8 @@ class HentaihereChapterExtractor(HentaihereBase, ChapterExtractor): ChapterExtractor.__init__(self, match, url) def metadata(self, page): - title = text.extract(page, "<title>", "</title>")[0] - chapter_id = text.extract(page, 'report/C', '"')[0] + title = text.extr(page, "<title>", "</title>") + chapter_id = text.extr(page, 'report/C', '"') chapter, sep, minor = self.chapter.partition(".") pattern = r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by (.+) at " match = re.match(pattern, title) @@ -72,7 +77,7 @@ class HentaihereChapterExtractor(HentaihereBase, ChapterExtractor): @staticmethod def images(page): - images = text.extract(page, "var rff_imageList = ", ";")[0] + images = text.extr(page, "var rff_imageList = ", ";") return [ ("https://hentaicdn.com/hentai" + part, None) for part in json.loads(images) diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py index 201ffdd..adee94a 100644 --- a/gallery_dl/extractor/hiperdex.py +++ b/gallery_dl/extractor/hiperdex.py @@ -139,7 +139,7 @@ class HiperdexMangaExtractor(HiperdexBase, MangaExtractor): self.manga_data(self.manga, page) results = [] - shortlink = text.extract(page, "rel='shortlink' href='", "'")[0] + shortlink = text.extr(page, "rel='shortlink' href='", "'") data = { "action" : "manga_get_reading_nav", "manga" : shortlink.rpartition("=")[2], @@ -182,6 +182,6 @@ class HiperdexArtistExtractor(HiperdexBase, MangaExtractor): def chapters(self, page): results = [] for info in text.extract_iter(page, 'id="manga-item-', '<img'): - url = text.extract(info, 'href="', '"')[0] + url = text.extr(info, 'href="', '"') results.append((url, {})) return results diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py index d6575cf..01ad38c 100644 --- a/gallery_dl/extractor/hotleak.py +++ b/gallery_dl/extractor/hotleak.py @@ -44,7 +44,7 @@ class HotleakExtractor(Extractor): for item in text.extract_iter( page, '<article class="movie-item', '</article>'): - yield text.extract(item, '<a href="', '"')[0] + yield text.extr(item, '<a href="', '"') params["page"] += 1 @@ -87,8 +87,8 @@ class HotleakPostExtractor(HotleakExtractor): url = "{}/{}/{}/{}".format( self.root, self.creator, self.type, self.id) page = self.request(url).text - page = text.extract( - page, '<div class="movie-image thumb">', '</article>')[0] + page = text.extr( + page, '<div class="movie-image thumb">', '</article>') data = { "id" : text.parse_int(self.id), "creator": self.creator, @@ -96,12 +96,12 @@ class HotleakPostExtractor(HotleakExtractor): } if self.type == "photo": - data["url"] = text.extract(page, 'data-src="', '"')[0] + data["url"] = 
text.extr(page, 'data-src="', '"') text.nameext_from_url(data["url"], data) elif self.type == "video": - data["url"] = "ytdl:" + text.extract( - text.unescape(page), '"src":"', '"')[0] + data["url"] = "ytdl:" + text.extr( + text.unescape(page), '"src":"', '"') text.nameext_from_url(data["url"], data) data["extension"] = "mp4" diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py index 9701f1e..ce68d6d 100644 --- a/gallery_dl/extractor/idolcomplex.py +++ b/gallery_dl/extractor/idolcomplex.py @@ -115,7 +115,7 @@ class IdolcomplexExtractor(SankakuExtractor): if self.extags: tags = collections.defaultdict(list) - tags_html = text.extract(page, '<ul id=tag-sidebar>', '</ul>')[0] + tags_html = text.extr(page, '<ul id=tag-sidebar>', '</ul>') pattern = re.compile(r'tag-type-([^>]+)><a href="/\?tags=([^"]+)') for tag_type, tag_name in pattern.findall(tags_html or ""): tags[tag_type].append(text.unquote(tag_name)) diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py index 7cd67d6..f993db8 100644 --- a/gallery_dl/extractor/imagebam.py +++ b/gallery_dl/extractor/imagebam.py @@ -83,8 +83,8 @@ class ImagebamGalleryExtractor(ImagebamExtractor): @staticmethod def metadata(page): - return {"title": text.unescape(text.extract( - page, 'id="gallery-name">', '<')[0].strip())} + return {"title": text.unescape(text.extr( + page, 'id="gallery-name">', '<').strip())} def images(self, page): findall = re.compile(r'<a href="https://www\.imagebam\.com' diff --git a/gallery_dl/extractor/imagechest.py b/gallery_dl/extractor/imagechest.py index a1ba0c3..14aa16f 100644 --- a/gallery_dl/extractor/imagechest.py +++ b/gallery_dl/extractor/imagechest.py @@ -36,8 +36,8 @@ class ImagechestGalleryExtractor(GalleryExtractor): return { "gallery_id": self.gallery_id, - "title": text.unescape(text.extract( - page, 'property="og:title" content="', '"')[0].strip()) + "title": text.unescape(text.extr( + page, 'property="og:title" content="', '"').strip()) } def images(self, page): diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py index 2c899eb..56bd048 100644 --- a/gallery_dl/extractor/imagefap.py +++ b/gallery_dl/extractor/imagefap.py @@ -202,7 +202,7 @@ class ImagefapUserExtractor(ImagefapExtractor): response = self.request(url) self.user = response.url.split("/")[-2] - folders = text.extract(response.text, ' id="tgl_all" value="', '"')[0] + folders = text.extr(response.text, ' id="tgl_all" value="', '"') return folders.rstrip("|").split("|") def galleries(self, folder_id): diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py index 69455a8..622509f 100644 --- a/gallery_dl/extractor/imagehosts.py +++ b/gallery_dl/extractor/imagehosts.py @@ -54,6 +54,7 @@ class ImagehostImageExtractor(Extractor): url, filename = self.get_info(page) data = text.nameext_from_url(filename, {"token": self.token}) + data.update(self.metadata(page)) if self.https and url.startswith("http:"): url = "https:" + url[5:] @@ -63,6 +64,10 @@ class ImagehostImageExtractor(Extractor): def get_info(self, page): """Find image-url and string to get filename from""" + def metadata(self, page): + """Return additional metadata""" + return () + class ImxtoImageExtractor(ImagehostImageExtractor): """Extractor for single images from imx.to""" @@ -72,13 +77,23 @@ class ImxtoImageExtractor(ImagehostImageExtractor): test = ( ("https://imx.to/i/1qdeva", { # new-style URL "url": "ab2173088a6cdef631d7a47dec4a5da1c6a00130", - "keyword": 
"1153a986c939d7aed599905588f5c940048bc517", "content": "0c8768055e4e20e7c7259608b67799171b691140", + "keyword": { + "size" : 18, + "width" : 64, + "height": 32, + "hash" : "94d56c599223c59f3feb71ea603484d1", + }, }), ("https://imx.to/img-57a2050547b97.html", { # old-style URL "url": "a83fe6ef1909a318c4d49fcf2caf62f36c3f9204", - "keyword": "fd2240aee77a21b8252d5b829a1f7e542f927f09", "content": "54592f2635674c25677c6872db3709d343cdf92f", + "keyword": { + "size" : 5284, + "width" : 320, + "height": 160, + "hash" : "40da6aaa7b8c42b18ef74309bbc713fc", + }, }), ("https://img.yt/img-57a2050547b97.html", { # img.yt domain "url": "a83fe6ef1909a318c4d49fcf2caf62f36c3f9204", @@ -108,6 +123,17 @@ class ImxtoImageExtractor(ImagehostImageExtractor): filename += splitext(url)[1] return url, filename or url + def metadata(self, page): + extr = text.extract_from(page, page.index("[ FILESIZE <")) + size = extr(">", "</span>").replace(" ", "")[:-1] + width, _, height = extr(">", " px</span>").partition("x") + return { + "size" : text.parse_bytes(size), + "width" : text.parse_int(width), + "height": text.parse_int(height), + "hash" : extr(">", "</span>"), + } + class AcidimgImageExtractor(ImagehostImageExtractor): """Extractor for single images from acidimg.cc""" @@ -259,7 +285,7 @@ class ViprImageExtractor(ImagehostImageExtractor): }) def get_info(self, page): - url = text.extract(page, '<img src="', '"')[0] + url = text.extr(page, '<img src="', '"') return url, url diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py index f32093a..49082d8 100644 --- a/gallery_dl/extractor/imgbb.py +++ b/gallery_dl/extractor/imgbb.py @@ -71,7 +71,7 @@ class ImgbbExtractor(Extractor): url = self.root + "/login" page = self.request(url).text - token = text.extract(page, 'PF.obj.config.auth_token="', '"')[0] + token = text.extr(page, 'PF.obj.config.auth_token="', '"') headers = {"Referer": url} data = { @@ -154,7 +154,7 @@ class ImgbbAlbumExtractor(ImgbbExtractor): } def images(self, page): - url = text.extract(page, '"og:url" content="', '"')[0] + url = text.extr(page, '"og:url" content="', '"') album_id = url.rpartition("/")[2].partition("?")[0] return self._pagination(page, "https://ibb.co/json", { @@ -185,7 +185,7 @@ class ImgbbUserExtractor(ImgbbExtractor): return {"user": self.user} def images(self, page): - user = text.extract(page, '.obj.resource={"id":"', '"')[0] + user = text.extr(page, '.obj.resource={"id":"', '"') return self._pagination(page, self.page_url + "json", { "from" : "user", "userid" : user, diff --git a/gallery_dl/extractor/imgbox.py b/gallery_dl/extractor/imgbox.py index 251f52e..530c4e1 100644 --- a/gallery_dl/extractor/imgbox.py +++ b/gallery_dl/extractor/imgbox.py @@ -53,7 +53,7 @@ class ImgboxExtractor(Extractor): @staticmethod def get_image_url(page): """Extract download-url""" - return text.extract(page, 'property="og:image" content="', '"')[0] + return text.extr(page, 'property="og:image" content="', '"') class ImgboxGalleryExtractor(AsynchronousMixin, ImgboxExtractor): @@ -89,7 +89,7 @@ class ImgboxGalleryExtractor(AsynchronousMixin, ImgboxExtractor): raise exception.NotFoundError("gallery") self.image_keys = re.findall(r'<a href="/([^"]+)"><img alt="', page) - title = text.extract(page, "<h1>", "</h1>")[0] + title = text.extr(page, "<h1>", "</h1>") title, _, count = title.rpartition(" - ") return { "gallery_key": self.gallery_key, diff --git a/gallery_dl/extractor/imgth.py b/gallery_dl/extractor/imgth.py index 6b424ad..7e4cce4 100644 --- a/gallery_dl/extractor/imgth.py 
+++ b/gallery_dl/extractor/imgth.py @@ -41,7 +41,7 @@ class ImgthGalleryExtractor(Extractor): """Yield all image urls for this gallery""" pnum = 0 while True: - thumbs = text.extract(page, '<ul class="thumbnails">', '</ul>')[0] + thumbs = text.extr(page, '<ul class="thumbnails">', '</ul>') for url in text.extract_iter(thumbs, '<img src="', '"'): yield "https://imgth.com/images" + url[24:] if '<li class="next">' not in page: diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py index ded8906..706cd34 100644 --- a/gallery_dl/extractor/inkbunny.py +++ b/gallery_dl/extractor/inkbunny.py @@ -236,7 +236,7 @@ class InkbunnySearchExtractor(InkbunnyExtractor): # get user_id from user profile url = "{}/{}".format(self.root, favsby) page = self.request(url).text - user_id = text.extract(page, "?user_id=", "'")[0] + user_id = text.extr(page, "?user_id=", "'") params["favs_user_id"] = user_id.partition("&")[0] return self.api.search(params) diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index a4ea71a..24ad873 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -14,7 +14,6 @@ from .. import text, util, exception from ..cache import cache, memcache import binascii import json -import time import re BASE_PATTERN = r"(?:https?://)?(?:www\.)?instagram\.com" @@ -45,14 +44,10 @@ class InstagramExtractor(Extractor): def items(self): self.login() - api = self.config("api") - if api is None or api == "auto": - api = InstagramRestAPI if self._logged_in else InstagramGraphqlAPI - elif api == "graphql": - api = InstagramGraphqlAPI + if self.config("api") == "graphql": + self.api = InstagramGraphqlAPI(self) else: - api = InstagramRestAPI - self.api = api(self) + self.api = InstagramRestAPI(self) data = self.metadata() videos = self.config("videos", True) @@ -385,7 +380,6 @@ class InstagramUserExtractor(InstagramExtractor): (InstagramPostsExtractor , base + "posts/"), (InstagramReelsExtractor , base + "reels/"), (InstagramTaggedExtractor , base + "tagged/"), - (InstagramChannelExtractor , base + "channel/"), ), ("posts",)) @@ -449,18 +443,25 @@ class InstagramTaggedExtractor(InstagramExtractor): return self.api.user_tagged(self.user_id) -class InstagramChannelExtractor(InstagramExtractor): - """Extractor for an Instagram user's channel posts""" - subcategory = "channel" - pattern = USER_PATTERN + r"/channel" - test = ("https://www.instagram.com/instagram/channel/", { +class InstagramGuideExtractor(InstagramExtractor): + """Extractor for an Instagram guide""" + subcategory = "guide" + pattern = USER_PATTERN + r"/guide/[^/?#]+/(\d+)" + test = (("https://www.instagram.com/kadakaofficial/guide" + "/knit-i-need-collection/18131821684305217/"), { "range": "1-16", "count": ">= 16", }) + def __init__(self, match): + InstagramExtractor.__init__(self, match) + self.guide_id = match.group(2) + + def metadata(self): + return {"guide": self.api.guide(self.guide_id)} + def posts(self): - uid = self.api.user_id(self.item) - return self.api.user_clips(uid) + return self.api.guide_media(self.guide_id) class InstagramSavedExtractor(InstagramExtractor): @@ -581,7 +582,7 @@ class InstagramAvatarExtractor(InstagramExtractor): def posts(self): if self._logged_in: - user_id = self.api.user_id(self.item) + user_id = self.api.user_id(self.item, check_private=False) user = self.api.user_by_id(user_id) avatar = (user.get("hd_profile_pic_url_info") or user["hd_profile_pic_versions"][-1]) @@ -723,6 +724,15 @@ class InstagramRestAPI(): def 
__init__(self, extractor): self.extractor = extractor + def guide(self, guide_id): + endpoint = "/v1/guides/web_info/" + params = {"guide_id": guide_id} + return self._call(endpoint, params=params) + + def guide_media(self, guide_id): + endpoint = "/v1/guides/guide/{}/".format(guide_id) + return self._pagination_guides(endpoint) + def highlights_media(self, user_id): chunk_size = 5 reel_ids = [hl["id"] for hl in self.highlights_tray(user_id)] @@ -770,14 +780,15 @@ class InstagramRestAPI(): endpoint = "/v1/users/{}/info/".format(user_id) return self._call(endpoint)["user"] - def user_id(self, screen_name): + def user_id(self, screen_name, check_private=True): if screen_name.startswith("id:"): return screen_name[3:] user = self.user_by_name(screen_name) if user is None: raise exception.AuthorizationError( "Login required to access this profile") - if user["is_private"] and not user["followed_by_viewer"]: + if check_private and user["is_private"] and \ + not user["followed_by_viewer"]: name = user["username"] s = "" if name.endswith("s") else "s" raise exception.StopExtraction("%s'%s posts are private", name, s) @@ -874,13 +885,28 @@ class InstagramRestAPI(): params["page"] = info["next_page"] params["max_id"] = extr._update_cursor(info["next_max_id"]) + def _pagination_guides(self, endpoint): + extr = self.extractor + params = {"max_id": extr._init_cursor()} + + while True: + data = self._call(endpoint, params=params) + + for item in data["items"]: + yield from item["media_items"] + + if "next_max_id" not in data: + return + params["max_id"] = extr._update_cursor(data["next_max_id"]) + class InstagramGraphqlAPI(): def __init__(self, extractor): self.extractor = extractor self.user_collection = self.user_saved = self.reels_media = \ - self.highlights_media = self._login_required + self.highlights_media = self.guide = self.guide_media = \ + self._unsupported self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode api = InstagramRestAPI(extractor) @@ -889,8 +915,8 @@ class InstagramGraphqlAPI(): self.user_id = api.user_id @staticmethod - def _login_required(_=None): - raise exception.AuthorizationError("Login required") + def _unsupported(_=None): + raise exception.StopExtraction("Unsupported with GraphQL API") def highlights_tray(self, user_id): query_hash = "d4d88dc1500312af6f937f7b804c68c3" @@ -990,63 +1016,9 @@ class InstagramGraphqlAPI(): @cache(maxage=90*24*3600, keyarg=1) def _login_impl(extr, username, password): - extr.log.info("Logging in as %s", username) - - user_agent = ("Mozilla/5.0 (Linux; Android 13) AppleWebKit/537.36 " - "(KHTML, like Gecko) Chrome/106.0.5249.79 Mobile " - "Safari/537.36 Instagram 255.1.0.17.102") - - headers = { - "User-Agent" : user_agent, - "Sec-Fetch-Dest": "document", - "Sec-Fetch-Mode": "navigate", - "Sec-Fetch-Site": "none", - "Sec-Fetch-User": "?1", - } - url = extr.root + "/accounts/login/" - response = extr.request(url, headers=headers) - - extract = text.extract_from(response.text) - csrf_token = extract('"csrf_token":"', '"') - device_id = extract('"device_id":"', '"') - rollout_hash = extract('"rollout_hash":"', '"') - - cset = extr.session.cookies.set - cset("csrftoken", csrf_token, domain=extr.cookiedomain) - cset("ig_did", device_id, domain=extr.cookiedomain) - - headers = { - "User-Agent" : user_agent, - "Accept" : "*/*", - "X-CSRFToken" : csrf_token, - "X-Instagram-AJAX": rollout_hash, - "X-IG-App-ID" : "936619743392459", - "X-ASBD-ID" : "198387", - "X-IG-WWW-Claim" : "0", - "X-Requested-With": "XMLHttpRequest", - "Origin" : 
extr.root, - "Referer" : url, - "Sec-Fetch-Dest" : "empty", - "Sec-Fetch-Mode" : "cors", - "Sec-Fetch-Site" : "same-origin", - } - data = { - "enc_password" : "#PWD_INSTAGRAM_BROWSER:0:{}:{}".format( - int(time.time()), password), - "username" : username, - "queryParams" : "{}", - "optIntoOneTap" : "false", - "stopDeletionNonce" : "", - "trustedDeviceRecords": "{}", - } - url = extr.root + "/accounts/login/ajax/" - response = extr.request(url, method="POST", headers=headers, data=data) - - if not response.json().get("authenticated"): - raise exception.AuthenticationError() - - return {cookie.name: cookie.value - for cookie in extr.session.cookies} + extr.log.error("Login with username & password is no longer supported. " + "Use browser cookies instead.") + return {} def id_from_shortcode(shortcode): diff --git a/gallery_dl/extractor/issuu.py b/gallery_dl/extractor/issuu.py index ae4112b..8067f63 100644 --- a/gallery_dl/extractor/issuu.py +++ b/gallery_dl/extractor/issuu.py @@ -54,8 +54,8 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor): }) def metadata(self, page): - data = json.loads(text.extract( - page, '<script data-json="', '"')[0].replace("&quot;", '"')) + data = json.loads(text.extr( + page, '<script data-json="', '"').replace("&quot;", '"')) doc = data["initialDocumentData"]["document"] doc["date"] = text.parse_datetime( diff --git a/gallery_dl/extractor/kabeuchi.py b/gallery_dl/extractor/kabeuchi.py index a8702f1..f172dcf 100644 --- a/gallery_dl/extractor/kabeuchi.py +++ b/gallery_dl/extractor/kabeuchi.py @@ -62,7 +62,7 @@ class KabeuchiUserExtractor(Extractor): response = self.request(url) if response.history and response.url == self.root + "/": raise exception.NotFoundError("user") - target_id = text.extract(response.text, 'user_friend_id = "', '"')[0] + target_id = text.extr(response.text, 'user_friend_id = "', '"') return self._pagination(target_id) def _pagination(self, target_id): diff --git a/gallery_dl/extractor/keenspot.py b/gallery_dl/extractor/keenspot.py index 50ce0d3..b5d7738 100644 --- a/gallery_dl/extractor/keenspot.py +++ b/gallery_dl/extractor/keenspot.py @@ -96,7 +96,7 @@ class KeenspotComicExtractor(Extractor): self._image = '<div id="comic">' return "http://brawlinthefamily.keenspot.com/comic/theshowdown/" - url = text.extract(page, '<link rel="first" href="', '"')[0] + url = text.extr(page, '<link rel="first" href="', '"') if url: if self.comic == "porcelain": self._needle = 'id="porArchivetop_"' @@ -144,7 +144,7 @@ class KeenspotComicExtractor(Extractor): @staticmethod def _next_link(page): - return text.extract(page, '<link rel="next" href="', '"')[0] + return text.extr(page, '<link rel="next" href="', '"') @staticmethod def _next_id(page): diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 21ff114..8a61728 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -192,7 +192,7 @@ class KemonopartyExtractor(Extractor): "body": text.unescape(text.extract( dm, "<pre>", "</pre></", )[0].strip()), - "date": text.extract(dm, 'datetime="', '"')[0], + "date": text.extr(dm, 'datetime="', '"'), }) return dms diff --git a/gallery_dl/extractor/khinsider.py b/gallery_dl/extractor/khinsider.py index d2e9d88..d5cca1c 100644 --- a/gallery_dl/extractor/khinsider.py +++ b/gallery_dl/extractor/khinsider.py @@ -23,9 +23,9 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor): root = "https://downloads.khinsider.com" test = (("https://downloads.khinsider.com" 
"/game-soundtracks/album/horizon-riders-wii"), { - "pattern": r"https?://vgm(site|downloads).com" + "pattern": r"https?://vgm(site|downloads)\.com" r"/soundtracks/horizon-riders-wii/[^/]+" - r"/Horizon%20Riders%20Wii%20-%20Full%20Soundtrack.mp3", + r"/Horizon%20Riders%20Wii%20-%20Full%20Soundtrack\.mp3", "keyword": { "album": { "count": 1, @@ -76,15 +76,14 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor): else: fmt = fmt.lower().split(",") - page = text.extract(page, '<table id="songlist">', '</table>')[0] + page = text.extr(page, '<table id="songlist">', '</table>') for num, url in enumerate(text.extract_iter( page, '<td class="clickable-row"><a href="', '"'), 1): url = text.urljoin(self.root, url) page = self.request(url, encoding="utf-8").text track = first = None - for url in text.extract_iter( - page, 'style="color: #21363f;" href="', '"'): + for url in text.extract_iter(page, '<p><a href="', '"'): track = text.nameext_from_url(url, {"num": num, "url": url}) if first is None: first = track diff --git a/gallery_dl/extractor/kissgoddess.py b/gallery_dl/extractor/kissgoddess.py index 6e66772..4ec685c 100644 --- a/gallery_dl/extractor/kissgoddess.py +++ b/gallery_dl/extractor/kissgoddess.py @@ -35,8 +35,8 @@ class KissgoddessGalleryExtractor(GalleryExtractor): def metadata(self, page): return { "gallery_id": text.parse_int(self.gallery_id), - "title" : text.extract( - page, '<title>', "<")[0].rpartition(" | ")[0], + "title" : text.extr( + page, '<title>', "<")[0].rpartition(" | "), } def images(self, page): diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py index 1187fd6..a9eebf4 100644 --- a/gallery_dl/extractor/komikcast.py +++ b/gallery_dl/extractor/komikcast.py @@ -62,13 +62,13 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor): ) def metadata(self, page): - info = text.extract(page, "<title>", " – Komikcast<")[0] + info = text.extr(page, "<title>", " – Komikcast<") return self.parse_chapter_string(info) @staticmethod def images(page): - readerarea = text.extract( - page, '<div class="main-reading-area', '</div')[0] + readerarea = text.extr( + page, '<div class="main-reading-area', '</div') return [ (text.unescape(url), None) for url in re.findall(r"<img[^>]* src=[\"']([^\"']+)", readerarea) diff --git a/gallery_dl/extractor/lightroom.py b/gallery_dl/extractor/lightroom.py index 8131db8..d202e20 100644 --- a/gallery_dl/extractor/lightroom.py +++ b/gallery_dl/extractor/lightroom.py @@ -47,7 +47,7 @@ class LightroomGalleryExtractor(Extractor): url = "https://lightroom.adobe.com/shares/" + self.href response = self.request(url) album = json.loads( - text.extract(response.text, "albumAttributes: ", "\n")[0] + text.extr(response.text, "albumAttributes: ", "\n") ) images = self.images(album) diff --git a/gallery_dl/extractor/lineblog.py b/gallery_dl/extractor/lineblog.py index 4071a26..adb27a8 100644 --- a/gallery_dl/extractor/lineblog.py +++ b/gallery_dl/extractor/lineblog.py @@ -22,8 +22,8 @@ class LineblogBase(): body = post.pop("body") for num, img in enumerate(text.extract_iter(body, "<img ", ">"), 1): - src = text.extract(img, 'src="', '"')[0] - alt = text.extract(img, 'alt="', '"')[0] + src = text.extr(img, 'src="', '"') + alt = text.extr(img, 'alt="', '"') if not src: continue diff --git a/gallery_dl/extractor/livedoor.py b/gallery_dl/extractor/livedoor.py index cffbc10..2765f0b 100644 --- a/gallery_dl/extractor/livedoor.py +++ b/gallery_dl/extractor/livedoor.py @@ -37,7 +37,7 @@ class LivedoorExtractor(Extractor): 
def _load(self, data, body): extr = text.extract_from(data) - tags = text.extract(body, 'class="article-tags">', '</dl>')[0] + tags = text.extr(body, 'class="article-tags">', '</dl>') about = extr('rdf:about="', '"') return { @@ -57,8 +57,8 @@ class LivedoorExtractor(Extractor): body = post.pop("body") for num, img in enumerate(text.extract_iter(body, "<img ", ">"), 1): - src = text.extract(img, 'src="', '"')[0] - alt = text.extract(img, 'alt="', '"')[0] + src = text.extr(img, 'src="', '"') + alt = text.extr(img, 'alt="', '"') if not src: continue diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py index 7c6ef69..14d4efb 100644 --- a/gallery_dl/extractor/lolisafe.py +++ b/gallery_dl/extractor/lolisafe.py @@ -24,6 +24,10 @@ BASE_PATTERN = LolisafeExtractor.update({ "root": "https://zz.ht", "pattern": r"zz\.(?:ht|fo)", }, + "xbunkr": { + "root": "https://xbunkr.com", + "pattern": r"xbunkr\.com", + } }) @@ -40,6 +44,15 @@ class LolisafeAlbumExtractor(LolisafeExtractor): }, }), ("https://zz.fo/a/lop7W6EZ"), + ("https://xbunkr.com/a/TA0bu3F4", { + "pattern": r"https://media\.xbunkr\.com/[^.]+\.\w+", + "count": 861, + "keyword": { + "album_id": "TA0bu3F4", + "album_name": "Hannahowo Onlyfans Photos", + } + }), + ("https://xbunkr.com/a/GNQc2I5d"), ) def __init__(self, match): diff --git a/gallery_dl/extractor/manganelo.py b/gallery_dl/extractor/manganelo.py index a12a801..14a542b 100644 --- a/gallery_dl/extractor/manganelo.py +++ b/gallery_dl/extractor/manganelo.py @@ -63,8 +63,8 @@ class ManganeloChapterExtractor(ChapterExtractor): } def images(self, page): - page = text.extract( - page, 'class="container-chapter-reader', '\n<div')[0] + page = text.extr( + page, 'class="container-chapter-reader', '\n<div') return [ (url, None) for url in text.extract_iter(page, '<img src="', '"') diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py index a28a966..dcf1972 100644 --- a/gallery_dl/extractor/mangapark.py +++ b/gallery_dl/extractor/mangapark.py @@ -104,7 +104,7 @@ class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor): return data def images(self, page): - data = json.loads(text.extract(page, "var _load_pages =", ";")[0]) + data = json.loads(text.extr(page, "var _load_pages =", ";")) return [ (text.urljoin(self.root, item["u"]), { "width": text.parse_int(item["w"]), @@ -136,10 +136,10 @@ class MangaparkMangaExtractor(MangaparkBase, MangaExtractor): results = [] data = {"lang": "en", "language": "English"} data["manga"] = text.unescape( - text.extract(page, '<title>', ' Manga - ')[0]) + text.extr(page, '<title>', ' Manga - ')) for stream in page.split('<div id="stream_')[1:]: - data["stream"] = text.parse_int(text.extract(stream, '', '"')[0]) + data["stream"] = text.parse_int(text.extr(stream, '', '"')) for chapter in text.extract_iter(stream, '<li ', '</li>'): path , pos = text.extract(chapter, 'href="', '"') diff --git a/gallery_dl/extractor/mangoxo.py b/gallery_dl/extractor/mangoxo.py index 1486057..ac4c797 100644 --- a/gallery_dl/extractor/mangoxo.py +++ b/gallery_dl/extractor/mangoxo.py @@ -38,7 +38,7 @@ class MangoxoExtractor(Extractor): url = self.root + "/login" page = self.request(url).text - token = text.extract(page, 'id="loginToken" value="', '"')[0] + token = text.extr(page, 'id="loginToken" value="', '"') url = self.root + "/api/login" headers = { @@ -115,7 +115,7 @@ class MangoxoAlbumExtractor(MangoxoExtractor): data["extension"] = None for data["num"], path in enumerate(imgs, 1): - data["id"] = 
text.parse_int(text.extract(path, "=", "&")[0]) + data["id"] = text.parse_int(text.extr(path, "=", "&")) url = self.root + "/external/" + path.rpartition("url=")[2] yield Message.Url, url, text.nameext_from_url(url, data) diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index 0d2cded..049e0af 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -44,6 +44,10 @@ class MastodonExtractor(BaseExtractor): del status["media_attachments"] status["instance"] = self.instance + acct = status["account"]["acct"] + status["instance_remote"] = \ + acct.rpartition("@")[2] if "@" in acct else None + status["tags"] = [tag["name"] for tag in status["tags"]] status["date"] = text.parse_datetime( status["created_at"][:19], "%Y-%m-%dT%H:%M:%S") diff --git a/gallery_dl/extractor/moebooru.py b/gallery_dl/extractor/moebooru.py index 4d63c3e..0ef0a32 100644 --- a/gallery_dl/extractor/moebooru.py +++ b/gallery_dl/extractor/moebooru.py @@ -26,42 +26,39 @@ class MoebooruExtractor(BooruExtractor): def _prepare(post): post["date"] = text.parse_timestamp(post["created_at"]) - def _extended_tags(self, post, page=None): - if not page: - url = "{}/post/show/{}".format(self.root, post["id"]) - page = self.request(url).text - html = text.extract(page, '<ul id="tag-', '</ul>')[0] - if html: - tags = collections.defaultdict(list) - pattern = re.compile(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'+]+)") - for tag_type, tag_name in pattern.findall(html): - tags[tag_type].append(text.unquote(tag_name)) - for key, value in tags.items(): - post["tags_" + key] = " ".join(value) - return page - - def _notes(self, post, page=None): - if not page: - url = "{}/post/show/{}".format(self.root, post["id"]) - page = self.request(url).text - notes = [] - notes_container = text.extract(page, 'id="note-container"', "<img ")[0] - if not notes_container: + def _html(self, post): + return self.request("{}/post/show/{}".format( + self.root, post["id"])).text + + def _tags(self, post, page): + tag_container = text.extr(page, '<ul id="tag-', '</ul>') + if not tag_container: return - for note in notes_container.split('class="note-box"')[1:]: + tags = collections.defaultdict(list) + pattern = re.compile(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'+]+)") + for tag_type, tag_name in pattern.findall(tag_container): + tags[tag_type].append(text.unquote(tag_name)) + for key, value in tags.items(): + post["tags_" + key] = " ".join(value) + + def _notes(self, post, page): + note_container = text.extr(page, 'id="note-container"', "<img ") + if not note_container: + return + + post["notes"] = notes = [] + for note in note_container.split('class="note-box"')[1:]: extr = text.extract_from(note) notes.append({ - "width" : int(extr("width: ", "p")), - "height": int(extr("height: ", "p")), - "y" : int(extr("top: ", "p")), - "x" : int(extr("left: ", "p")), + "width" : int(extr("width:", "p")), + "height": int(extr("height:", "p")), + "y" : int(extr("top:", "p")), + "x" : int(extr("left:", "p")), "id" : int(extr('id="note-body-', '"')), - "body" : text.remove_html(extr('>', "</div>")), + "body" : text.unescape(text.remove_html(extr(">", "</div>"))), }) - post["notes"] = notes - def _pagination(self, url, params): params["page"] = self.page_start params["limit"] = self.per_page diff --git a/gallery_dl/extractor/myhentaigallery.py b/gallery_dl/extractor/myhentaigallery.py index da0f589..3dbd5fc 100644 --- a/gallery_dl/extractor/myhentaigallery.py +++ b/gallery_dl/extractor/myhentaigallery.py @@ -59,7 +59,7 @@ 
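
The mastodon hunk above derives the new instance_remote metadata field (#3119) from the account handle; the rule in isolation:

    def instance_remote(acct):
        # host part of a remote handle, None for accounts
        # local to the queried instance
        return acct.rpartition("@")[2] if "@" in acct else None

    print(instance_remote("user@pawoo.net"))  # pawoo.net
    print(instance_remote("localuser"))       # None
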
class MyhentaigalleryGalleryExtractor(GalleryExtractor): def images(self, page): return [ - (text.unescape(text.extract(url, 'src="', '"')[0]).replace( + (text.unescape(text.extr(url, 'src="', '"')).replace( "/thumbnail/", "/original/"), None) for url in text.extract_iter(page, 'class="comic-thumb"', '</div>') ] diff --git a/gallery_dl/extractor/myportfolio.py b/gallery_dl/extractor/myportfolio.py index 8254118..7d23518 100644 --- a/gallery_dl/extractor/myportfolio.py +++ b/gallery_dl/extractor/myportfolio.py @@ -57,8 +57,8 @@ class MyportfolioGalleryExtractor(Extractor): raise exception.NotFoundError() page = response.text - projects = text.extract( - page, '<section class="project-covers', '</section>')[0] + projects = text.extr( + page, '<section class="project-covers', '</section>') if projects: data = {"_extractor": MyportfolioGalleryExtractor} diff --git a/gallery_dl/extractor/nana.py b/gallery_dl/extractor/nana.py index 6062418..1db83b0 100644 --- a/gallery_dl/extractor/nana.py +++ b/gallery_dl/extractor/nana.py @@ -44,10 +44,10 @@ class NanaGalleryExtractor(GalleryExtractor): def metadata(self, page): title = text.unescape( - text.extract(page, '</a> ', '</div>')[0]) - artist = text.unescape(text.extract( - page, '<title>', '</title>')[0])[len(title):-10] - tags = text.extract(page, 'Reader.tags = "', '"')[0] + text.extr(page, '</a> ', '</div>')) + artist = text.unescape(text.extr( + page, '<title>', '</title>'))[len(title):-10] + tags = text.extr(page, 'Reader.tags = "', '"') return { "gallery_id": self.gallery_id, @@ -59,7 +59,7 @@ class NanaGalleryExtractor(GalleryExtractor): } def images(self, page): - data = json.loads(text.extract(page, "Reader.pages = ", ".pages")[0]) + data = json.loads(text.extr(page, "Reader.pages = ", ".pages")) return [ ("https://nana.my.id" + image, None) for image in data["pages"] @@ -108,8 +108,8 @@ class NanaSearchExtractor(Extractor): for gallery in text.extract_iter( page, '<div class="id3">', '</div>'): - url = "https://nana.my.id" + text.extract( - gallery, '<a href="', '"')[0] + url = "https://nana.my.id" + text.extr( + gallery, '<a href="', '"') yield Message.Queue, url, data self.params["p"] += 1 diff --git a/gallery_dl/extractor/naverwebtoon.py b/gallery_dl/extractor/naverwebtoon.py index eadd460..fa91f76 100644 --- a/gallery_dl/extractor/naverwebtoon.py +++ b/gallery_dl/extractor/naverwebtoon.py @@ -76,7 +76,7 @@ class NaverwebtoonEpisodeExtractor(NaverwebtoonBase, GalleryExtractor): @staticmethod def images(page): - view_area = text.extract(page, 'id="comic_view_area"', '</div>')[0] + view_area = text.extr(page, 'id="comic_view_area"', '</div>') return [ (url, None) for url in text.extract_iter(view_area, '<img src="', '"') diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 2c2dcb9..1f96879 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -88,8 +88,8 @@ class NewgroundsExtractor(Extractor): return self.session.cookies headers = {"Origin": self.root, "Referer": url} - url = text.urljoin(self.root, text.extract( - response.text, 'action="', '"')[0]) + url = text.urljoin(self.root, text.extr( + response.text, 'action="', '"')) data = { "username": username, "password": password, @@ -140,7 +140,7 @@ class NewgroundsExtractor(Extractor): data["score"] = text.parse_float(extr('id="score_number">', '<')) data["tags"] = text.split_html(extr('<dd class="tags">', '</dd>')) data["artist"] = [ - text.extract(user, '//', '.')[0] + text.extr(user, '//', '.') for user in 
text.extract_iter(page, '<div class="item-user">', '>') ] @@ -275,7 +275,7 @@ class NewgroundsExtractor(Extractor): for year, items in items.items(): for item in items: - page_url = text.extract(item, 'href="', '"')[0] + page_url = text.extr(item, 'href="', '"') if page_url[0] == "/": page_url = self.root + page_url yield page_url diff --git a/gallery_dl/extractor/ngomik.py b/gallery_dl/extractor/ngomik.py deleted file mode 100644 index 8e29d97..0000000 --- a/gallery_dl/extractor/ngomik.py +++ /dev/null @@ -1,51 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2018-2019 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extract manga-chapters and entire manga from http://ngomik.in/""" - -from .common import ChapterExtractor -from .. import text -import re - - -class NgomikChapterExtractor(ChapterExtractor): - """Extractor for manga-chapters from ngomik.in""" - category = "ngomik" - root = "http://ngomik.in" - pattern = (r"(?:https?://)?(?:www\.)?ngomik\.in" - r"(/[^/?#]+-chapter-[^/?#]+)") - test = ( - ("https://www.ngomik.in/14-sai-no-koi-chapter-1-6/", { - "url": "8e67fdf751bbc79bc6f4dead7675008ddb8e32a4", - "keyword": "204d177f09d438fd50c9c28d98c73289194640d8", - }), - ("https://ngomik.in/break-blade-chapter-26/", { - "count": 34, - }), - ) - - def metadata(self, page): - info = text.extract(page, '<title>', "</title>")[0] - manga, _, chapter = info.partition(" Chapter ") - chapter, sep, minor = chapter.partition(" ")[0].partition(".") - - return { - "manga": text.unescape(manga), - "chapter": text.parse_int(chapter), - "chapter_minor": sep + minor, - "lang": "id", - "language": "Indonesian", - } - - @staticmethod - def images(page): - readerarea = text.extract(page, 'id="readerarea"', 'class="chnav"')[0] - return [ - (text.unescape(url), None) - for url in re.findall(r"\ssrc=[\"']?([^\"' >]+)", readerarea) - ] diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index 73911b2..079bae7 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -107,7 +107,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor): """Extract image URLs from 'page'""" images = text.extract_iter(page, "/view_popup.php", "</a>") for num, image in enumerate(images): - src = text.extract(image, 'src="', '"')[0] + src = text.extr(image, 'src="', '"') if not src: continue url = ("https:" + src).replace("/__rs_l120x120/", "/") @@ -118,7 +118,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor): @staticmethod def _extract_user_name(page): - return text.unescape(text.extract(page, "<br />", "<")[0] or "") + return text.unescape(text.extr(page, "<br />", "<")) def login(self): """Login and obtain session cookies""" @@ -322,8 +322,7 @@ class NijieNuitaExtractor(NijieExtractor): @staticmethod def _extract_user_name(page): - return text.unescape(text.extract( - page, "<title>", "さんの抜いた")[0] or "") + return text.unescape(text.extr(page, "<title>", "さんの抜いた")) class NijieFeedExtractor(NijieExtractor): diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py new file mode 100644 index 0000000..1ba8253 --- /dev/null +++ b/gallery_dl/extractor/nitter.py @@ -0,0 +1,256 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free 
Software Foundation. + +"""Extractors for Nitter instances""" + +from .common import BaseExtractor, Message +from .. import text + + +class NitterExtractor(BaseExtractor): + """Base class for nitter extractors""" + basecategory = "nitter" + directory_fmt = ("{category}", "{user[name]}") + filename_fmt = "{tweet_id}_{num}.{extension}" + archive_fmt = "{tweet_id}_{num}" + + def __init__(self, match): + BaseExtractor.__init__(self, match) + self.user = match.group(match.lastindex) + + def items(self): + for tweet_html in self.tweets(): + tweet = self._tweet_from_html(tweet_html) + + attachments_html = tweet.pop("_attach", "") + if attachments_html: + attachments = list(text.extract_iter( + attachments_html, 'href="', '"')) + attachments.extend(text.extract_iter( + attachments_html, 'data-url="', '"')) + else: + attachments = () + tweet["count"] = len(attachments) + + yield Message.Directory, tweet + for tweet["num"], url in enumerate(attachments, 1): + if url[0] == "/": + url = self.root + url + if "/video/" in url: + url = "ytdl:" + url + tweet["filename"] = url.rpartition( + "%2F")[2].partition(".")[0] + tweet["extension"] = "mp4" + else: + text.nameext_from_url(url, tweet) + yield Message.Url, url, tweet + + def _tweet_from_html(self, html): + extr = text.extract_from(html) + user = { + "name": extr('class="fullname" href="/', '"'), + "nick": extr('title="', '"'), + } + extr('<span class="tweet-date', '') + link = extr('href="', '"') + return { + "user": user, + "date": text.parse_datetime( + extr('title="', '"'), "%b %d, %Y · %I:%M %p %Z"), + "tweet_id": link.rpartition("/")[2].partition("#")[0], + "content": extr('class="tweet-content', "</div").partition(">")[2], + "_attach": extr('class="attachments', 'class="tweet-stats'), + "comments": text.parse_int(extr( + 'class="icon-comment', '</div>').rpartition(">")[2]), + "retweets": text.parse_int(extr( + 'class="icon-retweet', '</div>').rpartition(">")[2]), + "quotes" : text.parse_int(extr( + 'class="icon-quote', '</div>').rpartition(">")[2]), + "likes" : text.parse_int(extr( + 'class="icon-heart', '</div>').rpartition(">")[2]), + } + + def _pagination(self, path): + base_url = url = self.root + path + + while True: + page = self.request(url).text + + yield from page.split('<div class="timeline-item')[1:] + + more = text.extr(page, '<div class="show-more"><a href="?', '"') + if not more: + return + url = base_url + "?" 
+ text.unescape(more) + + +BASE_PATTERN = NitterExtractor.update({ + "nitter.net": { + "root": "https://nitter.net", + "pattern": r"nitter\.net", + }, + "nitter.lacontrevoie.fr": { + "root": "https://nitter.lacontrevoie.fr", + "pattern": r"nitter\.lacontrevoie\.fr", + }, + "nitter.pussthecat.org": { + "root": "https://nitter.pussthecat.org", + "pattern": r"nitter\.pussthecat\.org", + }, + "nitter.1d4.us": { + "root": "https://nitter.1d4.us", + "pattern": r"nitter\.1d4\.us", + }, + "nitter.kavin.rocks": { + "root": "https://nitter.kavin.rocks", + "pattern": r"nitter\.kavin\.rocks", + }, + "nitter.unixfox.eu": { + "root": "https://nitter.unixfox.eu", + "pattern": r"nitter\.unixfox\.eu", + }, +}) + + +class NitterTweetsExtractor(NitterExtractor): + subcategory = "tweets" + pattern = BASE_PATTERN + r"/([^/?#]+)(?:/tweets)?(?:$|\?|#)" + test = ( + ("https://nitter.net/supernaturepics", { + "pattern": r"https://nitter\.net/pic/orig" + r"/media%2F[\w-]+\.(jpg|png)$", + "range": "1-20", + "count": 20, + "keyword": { + "comments": int, + "content": str, + "count": 1, + "date": "type:datetime", + "likes": int, + "quotes": int, + "retweets": int, + "tweet_id": r"re:\d+", + "user": { + "name": "supernaturepics", + "nick": "Nature Pictures" + }, + }, + }), + ("https://nitter.lacontrevoie.fr/supernaturepics"), + ("https://nitter.pussthecat.org/supernaturepics"), + ("https://nitter.1d4.us/supernaturepics"), + ("https://nitter.kavin.rocks/supernaturepics"), + ("https://nitter.unixfox.eu/supernaturepics"), + ) + + def tweets(self): + return self._pagination("/" + self.user) + + +class NitterRepliesExtractor(NitterExtractor): + subcategory = "replies" + pattern = BASE_PATTERN + r"/([^/?#]+)/with_replies" + test = ( + ("https://nitter.net/supernaturepics/with_replies", { + "pattern": r"https://nitter\.net/pic/orig" + r"/media%2F[\w-]+\.(jpg|png)$", + "range": "1-20", + }), + ("https://nitter.lacontrevoie.fr/supernaturepics/with_replies"), + ("https://nitter.pussthecat.org/supernaturepics/with_replies"), + ("https://nitter.1d4.us/supernaturepics/with_replies"), + ("https://nitter.kavin.rocks/supernaturepics/with_replies"), + ("https://nitter.unixfox.eu/supernaturepics/with_replies"), + ) + + def tweets(self): + return self._pagination("/" + self.user + "/with_replies") + + +class NitterMediaExtractor(NitterExtractor): + subcategory = "media" + pattern = BASE_PATTERN + r"/([^/?#]+)/media" + test = ( + ("https://nitter.net/supernaturepics/media", { + "pattern": r"https://nitter\.net/pic/orig" + r"/media%2F[\w-]+\.(jpg|png)$", + "range": "1-20", + }), + ("https://nitter.lacontrevoie.fr/supernaturepics/media"), + ("https://nitter.pussthecat.org/supernaturepics/media"), + ("https://nitter.1d4.us/supernaturepics/media"), + ("https://nitter.kavin.rocks/supernaturepics/media"), + ("https://nitter.unixfox.eu/supernaturepics/media"), + ) + + def tweets(self): + return self._pagination("/" + self.user + "/media") + + +class NitterSearchExtractor(NitterExtractor): + subcategory = "search" + pattern = BASE_PATTERN + r"/([^/?#]+)/search" + test = ( + ("https://nitter.net/supernaturepics/search", { + "pattern": r"https://nitter\.net/pic/orig" + r"/media%2F[\w-]+\.(jpg|png)$", + "range": "1-20", + }), + ("https://nitter.lacontrevoie.fr/supernaturepics/search"), + ("https://nitter.pussthecat.org/supernaturepics/search"), + ("https://nitter.1d4.us/supernaturepics/search"), + ("https://nitter.kavin.rocks/supernaturepics/search"), + ("https://nitter.unixfox.eu/supernaturepics/search"), + ) + + def tweets(self): + return 
self._pagination("/" + self.user + "/search") + + +class NitterTweetExtractor(NitterExtractor): + """Extractor for nitter tweets""" + subcategory = "tweet" + directory_fmt = ("{category}", "{user[name]}") + filename_fmt = "{tweet_id}_{num}.{extension}" + archive_fmt = "{tweet_id}_{num}" + pattern = BASE_PATTERN + r"/[^/?#]+/status/(\d+)" + test = ( + ("https://nitter.net/supernaturepics/status/604341487988576256", { + "url": "3f2b64e175bf284aa672c3bb53ed275e470b919a", + "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab", + }), + # 4 images + ("https://nitter.lacontrevoie.fr/i/status/894001459754180609", { + "url": "9c51b3a4a1114535eb9b168bba97ad95db0d59ff", + }), + # video + ("https://nitter.pussthecat.org/i/status/1065692031626829824", { + "pattern": r"ytdl:https://nitter.pussthecat.org/video" + r"/B875137EDC8FF/https%3A%2F%2Fvideo.twimg.com%2F" + r"ext_tw_video%2F1065691868439007232%2Fpu%2Fpl%2F" + r"nv8hUQC1R0SjhzcZ.m3u8%3Ftag%3D5", + }), + # content with emoji, newlines, hashtags (#338) + ("https://nitter.1d4.us/playpokemon/status/1263832915173048321", { + "keyword": {"content": ( + r"re:Gear up for #PokemonSwordShieldEX with special Mystery " + "Gifts! \n\nYou’ll be able to receive four Galarian form " + "Pokémon with Hidden Abilities, plus some very useful items. " + "It’s our \\(Mystery\\) Gift to you, Trainers! \n\n❓🎁➡️ " + )}, + }), + # Nitter tweet (#890) + ("https://nitter.kavin.rocks/ed1conf/status/1163841619336007680", { + "url": "e115bd1c86c660064e392b05269bbcafcd8c8b7a", + "content": "f29501e44d88437fe460f5c927b7543fda0f6e34", + }), + ) + + def tweets(self): + url = "{}/i/status/{}".format(self.root, self.user) + return (self.request(url).text,) diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index 35a015f..59c5f15 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -95,7 +95,7 @@ class PatreonExtractor(Extractor): if content: for img in text.extract_iter( content, '<img data-media-id="', '>'): - url = text.extract(img, 'src="', '"')[0] + url = text.extr(img, 'src="', '"') if url: yield "content", url, self._filename(url) or url @@ -181,7 +181,7 @@ class PatreonExtractor(Extractor): """Fetch filename from an URL's Content-Disposition header""" response = self.request(url, method="HEAD", fatal=False) cd = response.headers.get("Content-Disposition") - return text.extract(cd, 'filename="', '"')[0] + return text.extr(cd, 'filename="', '"') @staticmethod def _filehash(url): @@ -284,7 +284,7 @@ class PatreonCreatorExtractor(PatreonExtractor): url = "{}/{}/posts".format(self.root, self.creator) page = self.request(url, notfound="creator").text - campaign_id = text.extract(page, "/campaign/", "/")[0] + campaign_id = text.extr(page, "/campaign/", "/") if not campaign_id: raise exception.NotFoundError("creator") diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py index 225f0ff..fc85125 100644 --- a/gallery_dl/extractor/philomena.py +++ b/gallery_dl/extractor/philomena.py @@ -27,10 +27,6 @@ class PhilomenaExtractor(BooruExtractor): def _prepare(post): post["date"] = text.parse_datetime(post["created_at"]) - @staticmethod - def _extended_tags(post): - pass - def _pagination(self, url, params): params["page"] = 1 params["per_page"] = self.per_page diff --git a/gallery_dl/extractor/photobucket.py b/gallery_dl/extractor/photobucket.py index 1993ab6..375b5e3 100644 --- a/gallery_dl/extractor/photobucket.py +++ b/gallery_dl/extractor/photobucket.py @@ -75,7 +75,7 @@ class 
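
NitterExtractor._pagination() above implements the cursor walk shared by the new tweets/replies/media/search extractors: split out raw timeline items, then follow the "show-more" query string until a page no longer carries one. A standalone approximation, with requests.get() standing in for the extractor's own self.request():

    import requests
    from gallery_dl import text

    def timeline_items(root, path):
        # yield raw '<div class="timeline-item...' HTML fragments
        url = base_url = root + path
        while True:
            page = requests.get(url).text
            yield from page.split('<div class="timeline-item')[1:]
            more = text.extr(page, '<div class="show-more"><a href="?', '"')
            if not more:
                return
            url = base_url + "?" + text.unescape(more)
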
PhotobucketAlbumExtractor(Extractor): page = self.request(url, params=params).text json_data = text.extract(page, "collectionData:", ",\n")[0] if not json_data: - msg = text.extract(page, 'libraryPrivacyBlock">', "</div>")[0] + msg = text.extr(page, 'libraryPrivacyBlock">', "</div>") msg = ' ("{}")'.format(text.remove_html(msg)) if msg else "" self.log.error("Unable to get JSON data%s", msg) return diff --git a/gallery_dl/extractor/pillowfort.py b/gallery_dl/extractor/pillowfort.py index bdd9f21..841a99b 100644 --- a/gallery_dl/extractor/pillowfort.py +++ b/gallery_dl/extractor/pillowfort.py @@ -98,7 +98,7 @@ class PillowfortExtractor(Extractor): url = "https://www.pillowfort.social/users/sign_in" page = self.request(url).text - auth = text.extract(page, 'name="authenticity_token" value="', '"')[0] + auth = text.extr(page, 'name="authenticity_token" value="', '"') headers = {"Origin": self.root, "Referer": url} data = { diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index e3a96bd..fc092f1 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -462,6 +462,9 @@ class PixivRankingExtractor(PixivExtractor): ("https://www.pixiv.net/ranking.php?mode=daily&date=20170818"), ("https://www.pixiv.net/ranking.php"), ("https://touch.pixiv.net/ranking.php"), + ("https://www.pixiv.net/ranking.php?mode=unknown", { + "exception": exception.StopExtraction, + }), ) def __init__(self, match): @@ -479,6 +482,8 @@ class PixivRankingExtractor(PixivExtractor): mode_map = { "daily": "day", "daily_r18": "day_r18", + "daily_ai": "day_ai", + "daily_r18_ai": "day_r18_ai", "weekly": "week", "weekly_r18": "week_r18", "monthly": "month", @@ -490,10 +495,10 @@ class PixivRankingExtractor(PixivExtractor): "rookie": "week_rookie", "r18g": "week_r18g", } - if mode not in mode_map: - self.log.warning("invalid mode '%s'", mode) - mode = "daily" - self.mode = mode_map[mode] + try: + self.mode = mode = mode_map[mode] + except KeyError: + raise exception.StopExtraction("Invalid mode '%s'", mode) date = query.get("date") if date: @@ -525,6 +530,15 @@ class PixivSearchExtractor(PixivExtractor): "range": "1-10", "count": 10, }), + ("https://pixiv.net/en/tags/foo/artworks?order=week&s_mode=s_tag", { + "exception": exception.StopExtraction, + }), + ("https://pixiv.net/en/tags/foo/artworks?order=date&s_mode=tag", { + "exception": exception.StopExtraction, + }), + ("https://www.pixiv.net/search.php?s_mode=s_tag&name=Original", { + "exception": exception.StopExtraction, + }), ("https://www.pixiv.net/en/tags/foo/artworks?order=date&s_mode=s_tag"), ("https://www.pixiv.net/search.php?s_mode=s_tag&word=Original"), ("https://touch.pixiv.net/search.php?word=Original"), @@ -546,19 +560,20 @@ class PixivSearchExtractor(PixivExtractor): if self.word: self.word = text.unquote(self.word) else: - if "word" not in query: + try: + self.word = query["word"] + except KeyError: raise exception.StopExtraction("Missing search term") - self.word = query["word"] sort = query.get("order", "date_d") sort_map = { "date": "date_asc", "date_d": "date_desc", } - if sort not in sort_map: - self.log.warning("invalid sort order '%s'", sort) - sort = "date_d" - self.sort = sort_map[sort] + try: + self.sort = sort = sort_map[sort] + except KeyError: + raise exception.StopExtraction("Invalid search order '%s'", sort) target = query.get("s_mode", "s_tag_full") target_map = { @@ -566,10 +581,10 @@ class PixivSearchExtractor(PixivExtractor): "s_tag_full": "exact_match_for_tags", "s_tc": "title_and_caption", } - if 
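The pixiv hunks above change invalid ranking modes and sort orders from a warning plus silent fallback into a hard stop. The pattern is a plain dict lookup whose KeyError is converted into StopExtraction; in isolation (the exception class is a stand-in for gallery_dl.exception.StopExtraction):

    class StopExtraction(Exception):
        """Stand-in for gallery_dl.exception.StopExtraction."""

    MODE_MAP = {"daily": "day", "daily_ai": "day_ai", "weekly": "week"}

    def resolve_mode(mode):
        try:
            return MODE_MAP[mode]   # strict: unknown values abort the job
        except KeyError:
            raise StopExtraction("Invalid mode '%s'" % mode)

    assert resolve_mode("daily_ai") == "day_ai"
    # resolve_mode("unknown") now raises instead of defaulting to "daily"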
target not in target_map: - self.log.warning("invalid search target '%s'", target) - target = "s_tag_full" - self.target = target_map[target] + try: + self.target = target = target_map[target] + except KeyError: + raise exception.StopExtraction("Invalid search mode '%s'", target) self.date_start = query.get("scd") self.date_end = query.get("ecd") @@ -638,7 +653,7 @@ class PixivPixivisionExtractor(PixivExtractor): headers = {"User-Agent": "Mozilla/5.0"} self.page = self.request(url, headers=headers).text - title = text.extract(self.page, '<title>', '<')[0] + title = text.extr(self.page, '<title>', '<') return { "pixivision_id" : self.pixivision_id, "pixivision_title": text.unescape(title), @@ -692,7 +707,7 @@ class PixivSeriesExtractor(PixivExtractor): series = body["extraData"]["meta"] series["id"] = self.series_id series["total"] = page["total"] - series["title"] = text.extract(series["title"], '"', '"')[0] + series["title"] = text.extr(series["title"], '"', '"') for info in page["series"]: work = self.api.illust_detail(info["workId"]) diff --git a/gallery_dl/extractor/pixnet.py b/gallery_dl/extractor/pixnet.py index a52071e..15be563 100644 --- a/gallery_dl/extractor/pixnet.py +++ b/gallery_dl/extractor/pixnet.py @@ -30,7 +30,7 @@ class PixnetExtractor(Extractor): def items(self): url = self.url_fmt.format(self.root, self.item_id) page = self.request(url, encoding="utf-8").text - user = text.extract(page, '<meta name="author" content="', '";')[0] + user = text.extr(page, '<meta name="author" content="', '";') data = { "blog": self.blog, "user": user.rpartition(" (")[0], @@ -52,13 +52,13 @@ class PixnetExtractor(Extractor): while True: yield from text.extract_iter(page, '<li id="', '</li>') - pnext = text.extract(page, 'class="nextBtn"', '>')[0] + pnext = text.extr(page, 'class="nextBtn"', '>') if pnext is None and 'name="albumpass">' in page: raise exception.StopExtraction( "Album %s is password-protected.", self.item_id) if "href" not in pnext: return - url = self.root + text.extract(pnext, 'href="', '"')[0] + url = self.root + text.extr(pnext, 'href="', '"') page = self.request(url, encoding="utf-8").text diff --git a/gallery_dl/extractor/pururin.py b/gallery_dl/extractor/pururin.py index dee7bd4..7e266cc 100644 --- a/gallery_dl/extractor/pururin.py +++ b/gallery_dl/extractor/pururin.py @@ -73,8 +73,8 @@ class PururinGalleryExtractor(GalleryExtractor): url = "{}/read/{}/01/x".format(self.root, self.gallery_id) page = self.request(url).text - info = json.loads(binascii.a2b_base64(text.extract( - page, '<gallery-read encoded="', '"')[0]).decode()) + info = json.loads(binascii.a2b_base64(text.extr( + page, '<gallery-read encoded="', '"')).decode()) self._ext = info["image_extension"] self._cnt = info["total_pages"] diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py index 448dc1b..8b5b6b6 100644 --- a/gallery_dl/extractor/reactor.py +++ b/gallery_dl/extractor/reactor.py @@ -109,13 +109,13 @@ class ReactorExtractor(BaseExtractor): tags.sort() for image in images: - url = text.extract(image, ' src="', '"')[0] + url = text.extr(image, ' src="', '"') if not url: continue if url.startswith("//"): url = "http:" + url - width = text.extract(image, ' width="', '"')[0] - height = text.extract(image, ' height="', '"')[0] + width = text.extr(image, ' width="', '"') + height = text.extr(image, ' height="', '"') image_id = url.rpartition("-")[2].partition(".")[0] num += 1 @@ -125,7 +125,7 @@ class ReactorExtractor(BaseExtractor): url = url.replace("/post/", "/post/full/") if 
self.gif and ("/post/webm/" in url or "/post/mp4/" in url): - gif_url = text.extract(image, '<a href="', '"')[0] + gif_url = text.extr(image, '<a href="', '"') if not gif_url: continue url = gif_url diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index 3396e3a..7013f1b 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -63,7 +63,7 @@ class SankakuExtractor(BooruExtractor): def _check_expired(self, response): return not response.history or '.com/expired.png' not in response.url - def _extended_tags(self, post): + def _tags(self, post, page): tags = collections.defaultdict(list) types = self.TAG_TYPES for tag in post["tags"]: @@ -306,7 +306,7 @@ class SankakuAPI(): url = post["file_url"] if url: expires = text.parse_int( - text.extract(url, "e=", "&")[0]) - 60 + text.extr(url, "e=", "&")) - 60 if 0 < expires <= time(): self.extractor.log.debug("Refreshing download URLs") diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py index 830274a..aa6726d 100644 --- a/gallery_dl/extractor/sexcom.py +++ b/gallery_dl/extractor/sexcom.py @@ -43,7 +43,7 @@ class SexcomExtractor(Extractor): yield self.root + href pager = extr('id="pagenum"', '</div>') - url = text.extract(pager, ' href="', '"')[0] + url = text.extr(pager, ' href="', '"') if not url: return url = text.urljoin(self.root, url) @@ -71,7 +71,7 @@ class SexcomExtractor(Extractor): info = extr("player.updateSrc(", ");") if info: - path = text.extract(info, "src: '", "'")[0] + path = text.extr(info, "src: '", "'") data["filename"] = path.rpartition("/")[2] data["extension"] = "mp4" if "'HD'" in info: @@ -79,8 +79,8 @@ class SexcomExtractor(Extractor): data["url"] = self.root + path else: iframe = extr('<iframe', '>') - src = (text.extract(iframe, ' src="', '"')[0] or - text.extract(iframe, " src='", "'")[0]) + src = (text.extr(iframe, ' src="', '"') or + text.extr(iframe, " src='", "'")) if not src: self.log.warning("Unable to fetch media from %s", url) return None diff --git a/gallery_dl/extractor/simplyhentai.py b/gallery_dl/extractor/simplyhentai.py index 580e917..b5d116f 100644 --- a/gallery_dl/extractor/simplyhentai.py +++ b/gallery_dl/extractor/simplyhentai.py @@ -111,7 +111,7 @@ class SimplyhentaiImageExtractor(Extractor): url = extr('"image":"' , '&') url = extr(""content":"", "&") or url - tags = text.extract(descr, " tagged with ", " online for free ")[0] + tags = text.extr(descr, " tagged with ", " online for free ") if tags: tags = tags.split(", ") tags[-1] = tags[-1].partition(" ")[2] @@ -176,7 +176,7 @@ class SimplyhentaiVideoExtractor(Extractor): embed_url = text.extract(page, 'src="', '"', pos)[0].replace( "embedplayer.php?link=", "embed.php?name=") embed_page = self.request(embed_url).text - video_url = text.extract(embed_page, '"file":"', '"')[0] + video_url = text.extr(embed_page, '"file":"', '"') title, _, episode = title.rpartition(" Episode ") if video_url.startswith("//"): diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py index d2e298c..ea39c5e 100644 --- a/gallery_dl/extractor/subscribestar.py +++ b/gallery_dl/extractor/subscribestar.py @@ -89,23 +89,23 @@ class SubscribestarExtractor(Extractor): def _media_from_post(html): media = [] - gallery = text.extract(html, 'data-gallery="', '"')[0] + gallery = text.extr(html, 'data-gallery="', '"') if gallery: media.extend( item for item in json.loads(text.unescape(gallery)) if "/previews/" not in item["url"] ) - attachments = text.extract( - html, 
'class="uploads-docs"', 'data-role="post-edit_form"')[0] + attachments = text.extr( + html, 'class="uploads-docs"', 'data-role="post-edit_form"') if attachments: for att in attachments.split('class="doc_preview"')[1:]: media.append({ - "id" : text.parse_int(text.extract( - att, 'data-upload-id="', '"')[0]), - "name": text.unescape(text.extract( - att, 'doc_preview-title">', '<')[0] or ""), - "url" : text.unescape(text.extract(att, 'href="', '"')[0]), + "id" : text.parse_int(text.extr( + att, 'data-upload-id="', '"')), + "name": text.unescape(text.extr( + att, 'doc_preview-title">', '<')), + "url" : text.unescape(text.extr(att, 'href="', '"')), "type": "attachment", }) @@ -175,7 +175,7 @@ class SubscribestarUserExtractor(SubscribestarExtractor): return yield from posts - url = text.extract(posts[-1], needle_next_page, '"')[0] + url = text.extr(posts[-1], needle_next_page, '"') if not url: return page = self.request(self.root + text.unescape(url)).json()["html"] diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index 5451f6e..c75952a 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -203,6 +203,15 @@ class TumblrExtractor(Extractor): def _prepare_image(url, post): text.nameext_from_url(url, post) + # try ".gifv" (#3095) + # it's unknown whether all gifs in this case are actually webps + # incorrect extensions will be corrected by 'adjust-extensions' + if post["extension"] == "gif": + post["_fallback"] = (url + "v",) + post["_http_headers"] = {"Accept": # copied from chrome 106 + "image/avif,image/webp,image/apng," + "image/svg+xml,image/*,*/*;q=0.8"} + parts = post["filename"].split("_") try: post["hash"] = parts[1] if parts[1] != "inline" else parts[2] @@ -248,7 +257,7 @@ class TumblrExtractor(Extractor): except Exception: return resized, True else: - updated = text.extract(response.text, '" src="', '"')[0] + updated = text.extr(response.text, '" src="', '"') return updated, (resized == updated) def _original_image_fallback(self, url, post_id): diff --git a/gallery_dl/extractor/tumblrgallery.py b/gallery_dl/extractor/tumblrgallery.py index e790613..6940f3e 100644 --- a/gallery_dl/extractor/tumblrgallery.py +++ b/gallery_dl/extractor/tumblrgallery.py @@ -46,7 +46,7 @@ class TumblrgalleryTumblrblogExtractor(TumblrgalleryExtractor): def metadata(self, page): return { - "title" : text.unescape(text.extract(page, "<h1>", "</h1>"))[0], + "title" : text.unescape(text.extr(page, "<h1>", "</h1>")), "gallery_id": self.gallery_id, } @@ -82,7 +82,7 @@ class TumblrgalleryPostExtractor(TumblrgalleryExtractor): def metadata(self, page): return { "title" : text.remove_html( - text.unescape(text.extract(page, "<title>", "</title>")[0]) + text.unescape(text.extr(page, "<title>", "</title>")) ).replace("_", "-"), "gallery_id": self.gallery_id, } @@ -127,12 +127,12 @@ class TumblrgallerySearchExtractor(TumblrgalleryExtractor): data = self._data_from_url(url) data["gallery_id"] = gallery_id data["title"] = text.remove_html(text.unescape( - text.extract(post_page, "<title>", "</title>")[0] + text.extr(post_page, "<title>", "</title>") )).replace("_", "-") yield url, data - next_url = text.extract( - page, '</span> <a class="btn btn-primary" href="', '"')[0] + next_url = text.extr( + page, '</span> <a class="btn btn-primary" href="', '"') if not next_url or page_url == next_url: return page_url = next_url diff --git a/gallery_dl/extractor/twibooru.py b/gallery_dl/extractor/twibooru.py index 93fa039..f010f92 100644 --- a/gallery_dl/extractor/twibooru.py 
+++ b/gallery_dl/extractor/twibooru.py @@ -83,7 +83,7 @@ class TwibooruPostExtractor(TwibooruExtractor): "tag_ids": list, "tags": list, "thumbnails_generated": True, - "updated_at": "2022-05-13T00:43:19.791Z", + "updated_at": "2022-09-21T14:31:50.441Z", "upvotes": int, "view_url": "https://cdn.twibooru.org/img/2020/7/8/1/full.png", "width": 576, diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index ba0597e..3dbadaa 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -14,10 +14,7 @@ from ..cache import cache import itertools import json -BASE_PATTERN = ( - r"(?:https?://)?(?:www\.|mobile\.)?" - r"(?:(?:[fv]x)?twitter\.com|nitter\.net)" -) +BASE_PATTERN = r"(?:https?://)?(?:www\.|mobile\.)?(?:[fv]x)?twitter\.com" class TwitterExtractor(Extractor): @@ -227,8 +224,8 @@ class TwitterExtractor(Extractor): response = self.request(url, fatal=False) if response.status_code >= 400: continue - url = text.extract( - response.text, 'name="twitter:image" value="', '"')[0] + url = text.extr( + response.text, 'name="twitter:image" value="', '"') if url: files.append({"url": url}) @@ -377,6 +374,24 @@ class TwitterExtractor(Extractor): except Exception: yield tweet + def _make_tweet(self, user, id_str, url, timestamp): + return { + "created_at": text.parse_timestamp(timestamp).strftime( + "%a %b %d %H:%M:%S +0000 %Y"), + "id_str": id_str, + "lang": None, + "user": user, + "entities": {}, + "extended_entities": { + "media": [ + { + "original_info": {}, + "media_url": url, + }, + ], + }, + } + def metadata(self): """Return general metadata""" return {} @@ -388,44 +403,7 @@ class TwitterExtractor(Extractor): if not self._check_cookies(self.cookienames): username, password = self._get_auth_info() if username: - self._update_cookies(self._login_impl(username, password)) - - @cache(maxage=360*24*3600, keyarg=1) - def _login_impl(self, username, password): - self.log.info("Logging in as %s", username) - - token = util.generate_token() - self.session.cookies.clear() - self.request(self.root + "/login") - - url = self.root + "/sessions" - cookies = { - "_mb_tk": token, - } - data = { - "redirect_after_login" : "/", - "remember_me" : "1", - "authenticity_token" : token, - "wfa" : "1", - "ui_metrics" : "{}", - "session[username_or_email]": username, - "session[password]" : password, - } - response = self.request( - url, method="POST", cookies=cookies, data=data) - - if "/account/login_verification" in response.url: - raise exception.AuthenticationError( - "Login with two-factor authentication is not supported") - - cookies = { - cookie.name: cookie.value - for cookie in self.session.cookies - } - - if "/error" in response.url or "auth_token" not in cookies: - raise exception.AuthenticationError() - return cookies + self._update_cookies(_login_impl(self, username, password)) class TwitterTimelineExtractor(TwitterExtractor): @@ -727,11 +705,6 @@ class TwitterTweetExtractor(TwitterExtractor): "pattern": r"https://\w+.cloudfront.net/photos/large/\d+.jpg", "count": 3, }), - # Nitter tweet (#890) - ("https://nitter.net/ed1conf/status/1163841619336007680", { - "url": "4a9ea898b14d3c112f98562d0df75c9785e239d9", - "content": "f29501e44d88437fe460f5c927b7543fda0f6e34", - }), # Twitter card (#1005) ("https://twitter.com/billboard/status/1306599586602135555", { "options": (("cards", True),), @@ -850,6 +823,76 @@ class TwitterTweetExtractor(TwitterExtractor): return itertools.chain(buffer, tweets) +class TwitterAvatarExtractor(TwitterExtractor): + subcategory = 
"avatar" + filename_fmt = "avatar {date}.{extension}" + archive_fmt = "AV_{user[id]}_{date}" + pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/photo" + test = ( + ("https://twitter.com/supernaturepics/photo", { + "pattern": r"https://pbs\.twimg\.com/profile_images" + r"/554585280938659841/FLVAlX18\.jpeg", + "keyword": { + "date": "dt:2015-01-12 10:26:49", + "extension": "jpeg", + "filename": "FLVAlX18", + "tweet_id": 554585280938659841, + }, + }), + ("https://twitter.com/User16/photo", { + "count": 0, + }), + ) + + def tweets(self): + self.api._user_id_by_screen_name(self.user) + user = self._user_obj + url = user["legacy"]["profile_image_url_https"] + + if url == ("https://abs.twimg.com/sticky" + "/default_profile_images/default_profile_normal.png"): + return () + + url = url.replace("_normal.", ".") + id_str = url.rsplit("/", 2)[1] + timestamp = ((int(id_str) >> 22) + 1288834974657) // 1000 + + return (self._make_tweet(user, id_str, url, timestamp),) + + +class TwitterBackgroundExtractor(TwitterExtractor): + subcategory = "background" + filename_fmt = "background {date}.{extension}" + archive_fmt = "BG_{user[id]}_{date}" + pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/header_photo" + test = ( + ("https://twitter.com/supernaturepics/header_photo", { + "pattern": r"https://pbs\.twimg\.com/profile_banners" + r"/2976459548/1421058583", + "keyword": { + "date": "dt:2015-01-12 10:29:43", + "filename": "1421058583", + "tweet_id": 0, + }, + }), + ("https://twitter.com/User16/header_photo", { + "count": 0, + }), + ) + + def tweets(self): + self.api._user_id_by_screen_name(self.user) + user = user = self._user_obj + + try: + url = user["legacy"]["profile_banner_url"] + _, timestamp = url.rsplit("/", 1) + except (KeyError, ValueError): + return () + + return (self._make_tweet(user, None, url, timestamp),) + + class TwitterImageExtractor(Extractor): category = "twitter" subcategory = "image" @@ -1021,7 +1064,7 @@ class TwitterAPI(): "count": 100, } return self._pagination_tweets( - endpoint, variables, ("bookmark_timeline", "timeline")) + endpoint, variables, ("bookmark_timeline", "timeline"), False) def list_latest_tweets_timeline(self, list_id): endpoint = "/graphql/z3l-EHlx-fyg8OvGO4JN8A/ListLatestTweetsTimeline" @@ -1253,7 +1296,8 @@ class TwitterAPI(): return params["cursor"] = cursor - def _pagination_tweets(self, endpoint, variables, path=None): + def _pagination_tweets(self, endpoint, variables, + path=None, stop_tweets=True): extr = self.extractor variables.update(self.variables) original_retweets = (extr.retweets == "original") @@ -1397,7 +1441,9 @@ class TwitterAPI(): tweet.get("rest_id")) continue - if not tweet or not cursor: + if stop_tweets and not tweet: + return + if not cursor or cursor == variables.get("cursor"): return variables["cursor"] = cursor @@ -1456,8 +1502,8 @@ class TwitterAPI(): self.extractor.log.debug("Skipping %s (\"%s\")", tweet_id, text) def _syndication_tweet(self, tweet_id): - tweet = self.extractor.request( - "https://cdn.syndication.twimg.com/tweet?id=" + tweet_id).json() + base_url = "https://cdn.syndication.twimg.com/tweet-result?id=" + tweet = self.extractor.request(base_url + tweet_id).json() tweet["user"]["description"] = "" tweet["user"]["entities"] = {"description": {}} @@ -1500,3 +1546,174 @@ class TwitterAPI(): "core" : {"user_results": {"result": tweet["user"]}}, "_retweet_id_str": retweet_id, } + + +@cache(maxage=360*86400, keyarg=1) +def _login_impl(extr, username, password): + + import re + import random + + if 
re.fullmatch(r"[\w.%+-]+@[\w.-]+\.\w{2,}", username): + extr.log.warning( + "Login with email is no longer possible. " + "You need to provide your username or phone number instead.") + + extr.log.info("Logging in as %s", username) + + def process(response): + try: + data = response.json() + except ValueError: + data = {"errors": ({"message": "Invalid response"},)} + else: + if response.status_code < 400: + return data["flow_token"] + + errors = [] + for error in data.get("errors") or (): + msg = error.get("message") + errors.append('"{}"'.format(msg) if msg else "Unknown error") + extr.log.debug(response.text) + raise exception.AuthenticationError(", ".join(errors)) + + extr.session.cookies.clear() + api = TwitterAPI(extr) + headers = api.headers + headers["Referer"] = "https://twitter.com/i/flow/login" + + # init + data = { + "input_flow_data": { + "flow_context": { + "debug_overrides": {}, + "start_location": {"location": "unknown"}, + }, + }, + "subtask_versions": { + "action_list": 2, + "alert_dialog": 1, + "app_download_cta": 1, + "check_logged_in_account": 1, + "choice_selection": 3, + "contacts_live_sync_permission_prompt": 0, + "cta": 7, + "email_verification": 2, + "end_flow": 1, + "enter_date": 1, + "enter_email": 2, + "enter_password": 5, + "enter_phone": 2, + "enter_recaptcha": 1, + "enter_text": 5, + "enter_username": 2, + "generic_urt": 3, + "in_app_notification": 1, + "interest_picker": 3, + "js_instrumentation": 1, + "menu_dialog": 1, + "notifications_permission_prompt": 2, + "open_account": 2, + "open_home_timeline": 1, + "open_link": 1, + "phone_verification": 4, + "privacy_options": 1, + "security_key": 3, + "select_avatar": 4, + "select_banner": 2, + "settings_list": 7, + "show_code": 1, + "sign_up": 2, + "sign_up_review": 4, + "tweet_selection_urt": 1, + "update_users": 1, + "upload_media": 1, + "user_recommendations_list": 4, + "user_recommendations_urt": 1, + "wait_spinner": 3, + "web_modal": 1, + }, + } + url = "https://twitter.com/i/api/1.1/onboarding/task.json?flow_name=login" + response = extr.request(url, method="POST", headers=headers, json=data) + + data = { + "flow_token": process(response), + "subtask_inputs": [ + { + "subtask_id": "LoginJsInstrumentationSubtask", + "js_instrumentation": { + "response": "{}", + "link": "next_link", + }, + }, + ], + } + url = "https://twitter.com/i/api/1.1/onboarding/task.json" + response = extr.request( + url, method="POST", headers=headers, json=data, fatal=None) + + # username + data = { + "flow_token": process(response), + "subtask_inputs": [ + { + "subtask_id": "LoginEnterUserIdentifierSSO", + "settings_list": { + "setting_responses": [ + { + "key": "user_identifier", + "response_data": { + "text_data": {"result": username}, + }, + }, + ], + "link": "next_link", + }, + }, + ], + } + # url = "https://twitter.com/i/api/1.1/onboarding/task.json" + extr.sleep(random.uniform(2.0, 4.0), "login (username)") + response = extr.request( + url, method="POST", headers=headers, json=data, fatal=None) + + # password + data = { + "flow_token": process(response), + "subtask_inputs": [ + { + "subtask_id": "LoginEnterPassword", + "enter_password": { + "password": password, + "link": "next_link", + }, + }, + ], + } + # url = "https://twitter.com/i/api/1.1/onboarding/task.json" + extr.sleep(random.uniform(2.0, 4.0), "login (password)") + response = extr.request( + url, method="POST", headers=headers, json=data, fatal=None) + + # account duplication check ? 
+ data = { + "flow_token": process(response), + "subtask_inputs": [ + { + "subtask_id": "AccountDuplicationCheck", + "check_logged_in_account": { + "link": "AccountDuplicationCheck_false", + }, + }, + ], + } + # url = "https://twitter.com/i/api/1.1/onboarding/task.json" + response = extr.request( + url, method="POST", headers=headers, json=data, fatal=None) + process(response) + + return { + cookie.name: cookie.value + for cookie in extr.session.cookies + } diff --git a/gallery_dl/extractor/uploadir.py b/gallery_dl/extractor/uploadir.py new file mode 100644 index 0000000..bd18c0a --- /dev/null +++ b/gallery_dl/extractor/uploadir.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://uploadir.com/""" + +from .common import Extractor, Message +from .. import text + + +class UploadirFileExtractor(Extractor): + """Extractor for uploadir files""" + category = "uploadir" + subcategory = "file" + root = "https://uploadir.com" + filename_fmt = "{filename} ({id}).{extension}" + archive_fmt = "{id}" + pattern = r"(?:https?://)?uploadir\.com/(?:user/)?u(?:ploads)?/([^/?#]+)" + test = ( + # image + ("https://uploadir.com/u/rd3t46ry", { + "pattern": r"https://uploadir\.com/u/rd3t46ry", + "count": 1, + "keyword": { + "extension": "jpg", + "filename": "Chloe and Rachel 4K jpg", + "id": "rd3t46ry", + }, + }), + # archive + ("https://uploadir.com/uploads/gxe8ti9v/downloads/new", { + "pattern": r"https://uploadir\.com/uploads/gxe8ti9v/downloads", + "count": 1, + "keyword": { + "extension": "zip", + "filename": "NYAN-Mods-Pack#1", + "id": "gxe8ti9v", + }, + }), + # utf-8 filename + ("https://uploadir.com/u/fllda6xl", { + "pattern": r"https://uploadir\.com/u/fllda6xl", + "count": 1, + "keyword": { + "extension": "png", + "filename": "_圖片_🖼_image_", + "id": "fllda6xl", + }, + }), + ("https://uploadir.com/uploads/rd3t46ry"), + ("https://uploadir.com/user/uploads/rd3t46ry"), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.file_id = match.group(1) + + def items(self): + url = "{}/u/{}".format(self.root, self.file_id) + response = self.request(url, method="HEAD", allow_redirects=False) + + if 300 <= response.status_code < 400: + url = response.headers["Location"] + extr = text.extract_from(self.request(url).text) + + name = text.unescape(extr("<h2>", "</h2>").strip()) + url = self.root + extr('class="form" action="', '"') + token = extr('name="authenticity_token" value="', '"') + + data = text.nameext_from_url(name, { + "_http_method": "POST", + "_http_data" : { + "authenticity_token": token, + "upload_id": self.file_id, + }, + }) + + else: + hcd = response.headers.get("Content-Disposition") + name = (hcd.partition("filename*=UTF-8''")[2] or + text.extr(hcd, 'filename="', '"')) + data = text.nameext_from_url(name) + + data["id"] = self.file_id + yield Message.Directory, data + yield Message.Url, url, data diff --git a/gallery_dl/extractor/vanillarock.py b/gallery_dl/extractor/vanillarock.py index 3d934b2..6b1178e 100644 --- a/gallery_dl/extractor/vanillarock.py +++ b/gallery_dl/extractor/vanillarock.py @@ -44,7 +44,7 @@ class VanillarockPostExtractor(VanillarockExtractor): img = extr('<div class="main-img">', '</div>') if not img: break - imgs.append(text.extract(img, 'href="', '"')[0]) + imgs.append(text.extr(img, 'href="', '"')) data = { "count": 
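When uploadir serves the file directly (no redirect on the HEAD request), UploadirFileExtractor above reads the filename from the Content-Disposition header, preferring the RFC 5987 `filename*=UTF-8''...` form over the plain quoted parameter. The same logic in isolation (simplified; percent-decoding of the encoded form is left out):

    def filename_from_cd(hcd):
        # prefer the RFC 5987 form, fall back to filename="..."
        name = hcd.partition("filename*=UTF-8''")[2]
        if not name:
            name = hcd.partition('filename="')[2].partition('"')[0]
        return name

    assert filename_from_cd('attachment; filename="a.zip"') == "a.zip"
    cd = "attachment; filename*=UTF-8''%F0%9F%96%BC.png"
    assert filename_from_cd(cd) == "%F0%9F%96%BC.png"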
len(imgs), @@ -89,5 +89,5 @@ class VanillarockTagExtractor(VanillarockExtractor): post = extr('<h2 class="entry-title">', '</h2>') if not post: break - yield Message.Queue, text.extract(post, 'href="', '"')[0], data + yield Message.Queue, text.extr(post, 'href="', '"'), data url = text.unescape(extr('class="next page-numbers" href="', '"')) diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py index 668be0f..00389fa 100644 --- a/gallery_dl/extractor/vsco.py +++ b/gallery_dl/extractor/vsco.py @@ -69,7 +69,7 @@ class VscoExtractor(Extractor): def _extract_preload_state(self, url): page = self.request(url, notfound=self.subcategory).text - return json.loads(text.extract(page, "__PRELOADED_STATE__ = ", "<")[0]) + return json.loads(text.extr(page, "__PRELOADED_STATE__ = ", "<")) def _pagination(self, url, params, token, key, extra=None): headers = { diff --git a/gallery_dl/extractor/wallhaven.py b/gallery_dl/extractor/wallhaven.py index 47451bd..06f1aab 100644 --- a/gallery_dl/extractor/wallhaven.py +++ b/gallery_dl/extractor/wallhaven.py @@ -19,6 +19,10 @@ class WallhavenExtractor(Extractor): archive_fmt = "{id}" root = "https://wallhaven.cc" + def __init__(self, match): + Extractor.__init__(self, match) + self.api = WallhavenAPI(self) + def items(self): metadata = self.metadata() for wp in self.wallpapers(): @@ -57,7 +61,8 @@ class WallhavenSearchExtractor(WallhavenExtractor): ("https://wallhaven.cc/search?q=touhou"), (("https://wallhaven.cc/search?q=id%3A87" "&categories=111&purity=100&sorting=date_added&order=asc&page=3"), { - "pattern": r"https://w.wallhaven.cc/full/\w\w/wallhaven-\w+\.\w+", + "pattern": (r"https://w\.wallhaven\.cc" + r"/full/\w\w/wallhaven-\w+\.\w+"), "count": "<= 30", }), ) @@ -67,7 +72,7 @@ class WallhavenSearchExtractor(WallhavenExtractor): self.params = text.parse_query(match.group(1)) def wallpapers(self): - return WallhavenAPI(self).search(self.params.copy()) + return self.api.search(self.params.copy()) def metadata(self): return {"search": self.params} @@ -87,12 +92,30 @@ class WallhavenCollectionExtractor(WallhavenExtractor): self.username, self.collection_id = match.groups() def wallpapers(self): - return WallhavenAPI(self).collection(self.username, self.collection_id) + return self.api.collection(self.username, self.collection_id) def metadata(self): return {"username": self.username, "collection_id": self.collection_id} +class WallhavenUserExtractor(WallhavenExtractor): + """Extractor for a wallhaven user""" + subcategory = "user" + pattern = r"(?:https?://)?wallhaven\.cc/user/([^/?#]+)/?$" + test = ("https://wallhaven.cc/user/AksumkA/",) + + def __init__(self, match): + WallhavenExtractor.__init__(self, match) + self.username = match.group(1) + + def items(self): + base = "{}/user/{}/".format(self.root, self.username) + return self._dispatch_extractors(( + (WallhavenUploadsExtractor , base + "uploads"), + (WallhavenCollectionsExtractor, base + "favorites"), + ), ("uploads",)) + + class WallhavenCollectionsExtractor(WallhavenExtractor): """Extractor for all collections of a wallhaven user""" subcategory = "collections" @@ -107,13 +130,38 @@ class WallhavenCollectionsExtractor(WallhavenExtractor): self.username = match.group(1) def items(self): - for collection in WallhavenAPI(self).collections(self.username): + for collection in self.api.collections(self.username): collection["_extractor"] = WallhavenCollectionExtractor url = "https://wallhaven.cc/user/{}/favorites/{}".format( self.username, collection["id"]) yield Message.Queue, url, 
collection +class WallhavenUploadsExtractor(WallhavenExtractor): + """Extractor for all uploads of a wallhaven user""" + subcategory = "uploads" + directory_fmt = ("{category}", "{username}") + archive_fmt = "u_{username}_{id}" + pattern = r"(?:https?://)?wallhaven\.cc/user/([^/?#]+)/uploads" + test = ("https://wallhaven.cc/user/AksumkA/uploads", { + "pattern": (r"https://[^.]+\.wallhaven\.cc" + r"/full/\w\w/wallhaven-\w+\.\w+"), + "range": "1-100", + "count": 100, + }) + + def __init__(self, match): + WallhavenExtractor.__init__(self, match) + self.username = match.group(1) + + def wallpapers(self): + params = {"q": "@" + self.username} + return self.api.search(params.copy()) + + def metadata(self): + return {"username": self.username} + + class WallhavenImageExtractor(WallhavenExtractor): """Extractor for individual wallpaper on wallhaven.cc""" subcategory = "image" @@ -121,7 +169,8 @@ class WallhavenImageExtractor(WallhavenExtractor): r"|w\.wallhaven\.cc/[a-z]+/\w\w/wallhaven-)(\w+)") test = ( ("https://wallhaven.cc/w/01w334", { - "pattern": "https://[^.]+.wallhaven.cc/full/01/[^-]+-01w334.jpg", + "pattern": (r"https://[^.]+\.wallhaven\.cc" + r"/full/01/wallhaven-01w334\.jpg"), "content": "497212679383a465da1e35bd75873240435085a2", "keyword": { "id" : "01w334", @@ -159,7 +208,7 @@ class WallhavenImageExtractor(WallhavenExtractor): self.wallpaper_id = match.group(1) def wallpapers(self): - return (WallhavenAPI(self).info(self.wallpaper_id),) + return (self.api.info(self.wallpaper_id),) class WallhavenAPI(): diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py index 7f51732..677680f 100644 --- a/gallery_dl/extractor/warosu.py +++ b/gallery_dl/extractor/warosu.py @@ -57,8 +57,8 @@ class WarosuThreadExtractor(Extractor): def get_metadata(self, page): """Collect metadata for extractor-job""" - boardname = text.extract(page, "<title>", "</title>")[0] - title = text.extract(page, 'filetitle" itemprop="name">', '<')[0] + boardname = text.extr(page, "<title>", "</title>") + title = text.extr(page, 'filetitle" itemprop="name">', '<') return { "board": self.board, "board_name": boardname.rpartition(" - ")[2], @@ -68,7 +68,7 @@ class WarosuThreadExtractor(Extractor): def posts(self, page): """Build a list of all post-objects""" - page = text.extract(page, '<div class="content">', '<table>')[0] + page = text.extr(page, '<div class="content">', '<table>') needle = '<table itemscope itemtype="http://schema.org/Comment">' return [self.parse(post) for post in page.split(needle)] diff --git a/gallery_dl/extractor/weasyl.py b/gallery_dl/extractor/weasyl.py index 599a175..eca4f1a 100644 --- a/gallery_dl/extractor/weasyl.py +++ b/gallery_dl/extractor/weasyl.py @@ -225,7 +225,7 @@ class WeasylFavoriteExtractor(WeasylExtractor): pos = page.index('id="favorites-content"') if not owner_login: - owner_login = text.extract(page, '<a href="/~', '"')[0] + owner_login = text.extr(page, '<a href="/~', '"') for submitid in text.extract_iter(page, "/submissions/", "/", pos): if submitid == lastid: diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py index 59f46f0..8a22fcb 100644 --- a/gallery_dl/extractor/webtoons.py +++ b/gallery_dl/extractor/webtoons.py @@ -169,7 +169,7 @@ class WebtoonsComicExtractor(WebtoonsBase, Extractor): @staticmethod def get_episode_urls(page): """Extract and return all episode urls in 'page'""" - page = text.extract(page, 'id="_listUl"', '</ul>')[0] + page = text.extr(page, 'id="_listUl"', '</ul>') return [ match.group(0) for match in 
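WallhavenUserExtractor above downloads nothing itself; it queues its sub-extractors through _dispatch_extractors(). Roughly, that helper yields one queue message per enabled subcategory, with an include setting deciding which ones run ("uploads" being the default here). A behavioral sketch, not the real implementation:

    def dispatch(extractor_map, include=("uploads",)):
        # one queue message per enabled sub-extractor; "_extractor"
        # tells the job which class should handle the URL
        for subcategory, (cls, url) in extractor_map.items():
            if subcategory in include:
                yield "queue", url, {"_extractor": cls}

    base = "https://wallhaven.cc/user/AksumkA/"
    messages = list(dispatch({
        "uploads":     (object, base + "uploads"),    # stand-in classes
        "collections": (object, base + "favorites"),
    }))
    assert len(messages) == 1  # only "uploads" runs by default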
WebtoonsEpisodeExtractor.pattern.finditer(page) diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index 189c0c5..55cee14 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -37,6 +37,7 @@ class WeiboExtractor(Extractor): cookies = _cookie_cache() if cookies is not None: self.session.cookies.update(cookies) + self.session.headers["Referer"] = self.root + "/" def request(self, url, **kwargs): response = Extractor.request(self, url, **kwargs) @@ -173,7 +174,7 @@ class WeiboExtractor(Extractor): page = Extractor.request( self, passport_url, method="POST", headers=headers, data=data).text - data = json.loads(text.extract(page, "(", ");")[0])["data"] + data = json.loads(text.extr(page, "(", ");"))["data"] passport_url = "https://passport.weibo.com/visitor/visitor" params = { diff --git a/gallery_dl/extractor/xhamster.py b/gallery_dl/extractor/xhamster.py index 146ab04..0125739 100644 --- a/gallery_dl/extractor/xhamster.py +++ b/gallery_dl/extractor/xhamster.py @@ -144,8 +144,8 @@ class XhamsterGalleryExtractor(XhamsterExtractor): def _data(self, url): page = self.request(url).text - return json.loads(text.extract( - page, "window.initials=", "</script>")[0].rstrip("\n\r;")) + return json.loads(text.extr( + page, "window.initials=", "</script>").rstrip("\n\r;")) class XhamsterUserExtractor(XhamsterExtractor): diff --git a/gallery_dl/extractor/xvideos.py b/gallery_dl/extractor/xvideos.py index 0a55532..10de439 100644 --- a/gallery_dl/extractor/xvideos.py +++ b/gallery_dl/extractor/xvideos.py @@ -113,8 +113,8 @@ class XvideosUserExtractor(XvideosBase, Extractor): def items(self): url = "{}/profiles/{}".format(self.root, self.user) page = self.request(url, notfound=self.subcategory).text - data = json.loads(text.extract( - page, "xv.conf=", ";</script>")[0])["data"] + data = json.loads(text.extr( + page, "xv.conf=", ";</script>"))["data"] if not isinstance(data["galleries"], dict): return diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py index 72cf438..c0d43fe 100644 --- a/gallery_dl/extractor/zerochan.py +++ b/gallery_dl/extractor/zerochan.py @@ -127,7 +127,7 @@ class ZerochanTagExtractor(ZerochanExtractor): while True: page = self.request(url, params=params).text - thumbs = text.extract(page, '<ul id="thumbs', '</ul>')[0] + thumbs = text.extr(page, '<ul id="thumbs', '</ul>') extr = text.extract_from(thumbs) while True: diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 2f48ffd..1f65438 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -32,11 +32,8 @@ class Job(): self.pathfmt = None self.kwdict = {} self.status = 0 - self.url_key = extr.config("url-metadata") - path_key = extr.config("path-metadata") path_proxy = output.PathfmtProxy(self) - self._logger_extra = { "job" : self, "extractor": extr, @@ -56,12 +53,16 @@ class Job(): extr.category = pextr.category extr.subcategory = pextr.subcategory + self.metadata_url = extr.config("url-metadata") + self.metadata_http = extr.config("http-metadata") + metadata_path = extr.config("path-metadata") + # user-supplied metadata kwdict = extr.config("keywords") if kwdict: self.kwdict.update(kwdict) - if path_key: - self.kwdict[path_key] = path_proxy + if metadata_path: + self.kwdict[metadata_path] = path_proxy # predicates self.pred_url = self._prepare_predicates("image", True) @@ -120,8 +121,8 @@ class Job(): """Call the appropriate message handler""" if msg[0] == Message.Url: _, url, kwdict = msg - if self.url_key: - kwdict[self.url_key] = url + if 
self.metadata_url: + kwdict[self.metadata_url] = url if self.pred_url(url, kwdict): self.update_kwdict(kwdict) self.handle_url(url, kwdict) @@ -132,8 +133,8 @@ class Job(): elif msg[0] == Message.Queue: _, url, kwdict = msg - if self.url_key: - kwdict[self.url_key] = url + if self.metadata_url: + kwdict[self.metadata_url] = url if self.pred_queue(url, kwdict): self.handle_queue(url, kwdict) @@ -154,6 +155,8 @@ class Job(): extr = self.extractor kwdict["category"] = extr.category kwdict["subcategory"] = extr.subcategory + if self.metadata_http: + kwdict.pop(self.metadata_http, None) if self.kwdict: kwdict.update(self.kwdict) @@ -231,11 +234,14 @@ class DownloadJob(Job): self.handle_skip() return - if pathfmt.exists(): - if archive: - archive.add(kwdict) - self.handle_skip() - return + if pathfmt.extension and not self.metadata_http: + pathfmt.build_path() + + if pathfmt.exists(): + if archive: + archive.add(kwdict) + self.handle_skip() + return if self.sleep: self.extractor.sleep(self.sleep(), "download") @@ -283,6 +289,9 @@ class DownloadJob(Job): if not self.pathfmt: self.initialize(kwdict) else: + if "post-after" in self.hooks: + for callback in self.hooks["post-after"]: + callback(self.pathfmt) self.pathfmt.set_directory(kwdict) if "post" in self.hooks: for callback in self.hooks["post"]: @@ -337,14 +346,20 @@ class DownloadJob(Job): self._write_unsupported(url) def handle_finalize(self): - pathfmt = self.pathfmt if self.archive: self.archive.close() + + pathfmt = self.pathfmt if pathfmt: + hooks = self.hooks + if "post-after" in hooks: + for callback in hooks["post-after"]: + callback(pathfmt) + self.extractor._store_cookies() - if "finalize" in self.hooks: + if "finalize" in hooks: status = self.status - for callback in self.hooks["finalize"]: + for callback in hooks["finalize"]: callback(pathfmt, status) def handle_skip(self): @@ -526,12 +541,11 @@ class SimulationJob(DownloadJob): def handle_url(self, url, kwdict): if not kwdict["extension"]: kwdict["extension"] = "jpg" - self.pathfmt.set_filename(kwdict) if self.sleep: self.extractor.sleep(self.sleep(), "download") if self.archive: self.archive.add(kwdict) - self.out.skip(self.pathfmt.path) + self.out.skip(self.pathfmt.build_filename(kwdict)) def handle_directory(self, kwdict): if not self.pathfmt: @@ -548,6 +562,11 @@ class KeywordJob(Job): def handle_url(self, url, kwdict): stdout_write("\nKeywords for filenames and --filter:\n" "------------------------------------\n") + + if self.metadata_http and url.startswith("http"): + kwdict[self.metadata_http] = util.extract_headers( + self.extractor.request(url, method="HEAD")) + self.print_kwdict(kwdict) raise exception.StopExtraction() @@ -605,12 +624,15 @@ class KeywordJob(Job): self.print_kwdict(value, key + "[", markers) elif isinstance(value, list): - if value and isinstance(value[0], dict): - self.print_kwdict(value[0], key + "[][", markers) + if not value: + pass + elif isinstance(value[0], dict): + self.print_kwdict(value[0], key + "[N][", markers) else: - write(key + "[]\n") - for val in value: - write(" - " + str(val) + "\n") + fmt = (" {:>%s} {}\n" % len(str(len(value)))).format + write(key + "[N]\n") + for idx, val in enumerate(value, 0): + write(fmt(idx, val)) else: # string or number diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 37247a7..4d9a358 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -125,6 +125,11 @@ def build_parser(): help="Client-side IP address to bind to", ) general.add_argument( + "--user-agent", + dest="user-agent", 
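The job.py changes above wire up the new http-metadata option: the configured name reserves a kwdict key that the HTTP downloader fills with response-header data, update_kwdict() clears any stale value first, and KeywordJob issues a HEAD request so `gallery-dl -K` can display the fields. Assuming the option is set to "http", the key becomes usable in format strings:

    # sketch of a kwdict after the downloader ran, with
    # "http-metadata": "http" configured
    kwdict = {
        "filename": "image",
        "http": {                      # filled from response headers
            "content-length": "52340",
            "date": "2022-11-20 00:00:00",
        },
    }
    fmt = "{filename} ({http[content-length]} bytes)"
    assert fmt.format_map(kwdict) == "image (52340 bytes)"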
metavar="UA", action=ConfigAction, + help="User-Agent request header", + ) + general.add_argument( "--clear-cache", dest="clear_cache", metavar="MODULE", help="Delete cached login sessions, cookies, etc. for MODULE " @@ -263,6 +268,11 @@ def build_parser(): help="Do not download files larger than SIZE (e.g. 500k or 2.5M)", ) downloader.add_argument( + "--chunk-size", + dest="chunk-size", metavar="SIZE", action=ConfigAction, + help="Size of in-memory data chunks (default: 32k)", + ) + downloader.add_argument( "--no-part", dest="part", nargs=0, action=ConfigConstAction, const=False, help="Do not use .part files", diff --git a/gallery_dl/path.py b/gallery_dl/path.py index 28c07c3..e901fb9 100644 --- a/gallery_dl/path.py +++ b/gallery_dl/path.py @@ -15,16 +15,16 @@ import functools from . import util, formatter, exception WINDOWS = util.WINDOWS +EXTENSION_MAP = { + "jpeg": "jpg", + "jpe" : "jpg", + "jfif": "jpg", + "jif" : "jpg", + "jfi" : "jpg", +} class PathFormat(): - EXTENSION_MAP = { - "jpeg": "jpg", - "jpe" : "jpg", - "jfif": "jpg", - "jif" : "jpg", - "jfi" : "jpg", - } def __init__(self, extractor): config = extractor.config @@ -78,7 +78,7 @@ class PathFormat(): extension_map = config("extension-map") if extension_map is None: - extension_map = self.EXTENSION_MAP + extension_map = EXTENSION_MAP self.extension_map = extension_map.get restrict = config("path-restrict", "auto") @@ -161,12 +161,14 @@ class PathFormat(): num = 1 try: while True: - self.prefix = str(num) + "." - self.set_extension(self.extension, False) + prefix = format(num) + "." + self.kwdict["extension"] = prefix + self.extension + self.build_path() os.stat(self.realpath) # raises OSError if file doesn't exist num += 1 except OSError: pass + self.prefix = prefix return False def set_directory(self, kwdict): @@ -198,31 +200,26 @@ class PathFormat(): def set_filename(self, kwdict): """Set general filename data""" self.kwdict = kwdict - self.temppath = self.prefix = "" + self.filename = self.temppath = self.prefix = "" ext = kwdict["extension"] kwdict["extension"] = self.extension = self.extension_map(ext, ext) - if self.extension: - self.build_path() - else: - self.filename = "" - def set_extension(self, extension, real=True): """Set filename extension""" - extension = self.extension_map(extension, extension) - if real: - self.extension = extension + self.extension = extension = self.extension_map(extension, extension) self.kwdict["extension"] = self.prefix + extension - self.build_path() def fix_extension(self, _=None): """Fix filenames without a given filename extension""" if not self.extension: - self.set_extension("", False) + self.kwdict["extension"] = self.prefix + self.extension_map("", "") + self.build_path() if self.path[-1] == ".": self.path = self.path[:-1] self.temppath = self.realpath = self.realpath[:-1] + elif not self.temppath: + self.build_path() return True def build_filename(self, kwdict): @@ -296,7 +293,9 @@ class PathFormat(): if self.extension: self.temppath += ".part" else: - self.set_extension("part", False) + self.kwdict["extension"] = self.prefix + self.extension_map( + "part", "part") + self.build_path() if part_directory: self.temppath = os.path.join( part_directory, diff --git a/gallery_dl/postprocessor/compare.py b/gallery_dl/postprocessor/compare.py index b3b94f7..910e1d7 100644 --- a/gallery_dl/postprocessor/compare.py +++ b/gallery_dl/postprocessor/compare.py @@ -51,8 +51,9 @@ class ComparePP(PostProcessor): num = 1 try: while not self._compare(pathfmt.realpath, pathfmt.temppath): - 
pathfmt.prefix = str(num) + "." - pathfmt.set_extension(pathfmt.extension, False) + pathfmt.prefix = prefix = format(num) + "." + pathfmt.kwdict["extension"] = prefix + pathfmt.extension + pathfmt.build_path() num += 1 return self._equal(pathfmt) except OSError: diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py index b21e483..2ee1cf8 100644 --- a/gallery_dl/postprocessor/metadata.py +++ b/gallery_dl/postprocessor/metadata.py @@ -21,6 +21,9 @@ class MetadataPP(PostProcessor): mode = options.get("mode") cfmt = options.get("content-format") or options.get("format") + omode = "w" + filename = None + if mode == "tags": self.write = self._write_tags ext = "txt" @@ -41,6 +44,12 @@ class MetadataPP(PostProcessor): cfmt = "\n".join(cfmt) + "\n" self._content_fmt = formatter.parse(cfmt).format_map ext = "txt" + elif mode == "jsonl": + self.write = self._write_json + self.indent = None + self.ascii = options.get("ascii", False) + omode = "a" + filename = "data.jsonl" else: self.write = self._write_json self.indent = options.get("indent", 4) @@ -53,7 +62,7 @@ class MetadataPP(PostProcessor): sep = os.sep + (os.altsep or "") self._metadir = util.expand_path(directory).rstrip(sep) + os.sep - filename = options.get("filename") + filename = options.get("filename", filename) extfmt = options.get("extension-format") if filename: if filename == "-": @@ -97,6 +106,9 @@ class MetadataPP(PostProcessor): self.archive = None self.mtime = options.get("mtime") + self.omode = options.get("open", omode) + self.encoding = options.get("encoding", "utf-8") + self.private = options.get("private", False) def run(self, pathfmt): archive = self.archive @@ -107,11 +119,11 @@ class MetadataPP(PostProcessor): path = directory + self._filename(pathfmt) try: - with open(path, "w", encoding="utf-8") as fp: + with open(path, self.omode, encoding=self.encoding) as fp: self.write(fp, pathfmt.kwdict) except FileNotFoundError: os.makedirs(directory, exist_ok=True) - with open(path, "w", encoding="utf-8") as fp: + with open(path, self.omode, encoding=self.encoding) as fp: self.write(fp, pathfmt.kwdict) if archive: @@ -198,7 +210,9 @@ class MetadataPP(PostProcessor): fp.write("\n".join(tags) + "\n") def _write_json(self, fp, kwdict): - util.dump_json(util.filter_dict(kwdict), fp, self.ascii, self.indent) + if not self.private: + kwdict = util.filter_dict(kwdict) + util.dump_json(kwdict, fp, self.ascii, self.indent) __postprocessor__ = MetadataPP diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py index 98c8246..9d2cb34 100644 --- a/gallery_dl/postprocessor/ugoira.py +++ b/gallery_dl/postprocessor/ugoira.py @@ -90,15 +90,17 @@ class UgoiraPP(PostProcessor): if pathfmt.extension != "zip": return - if "frames" in pathfmt.kwdict: - self._frames = pathfmt.kwdict["frames"] - elif "pixiv_ugoira_frame_data" in pathfmt.kwdict: - self._frames = pathfmt.kwdict["pixiv_ugoira_frame_data"]["data"] + kwdict = pathfmt.kwdict + if "frames" in kwdict: + self._frames = kwdict["frames"] + elif "pixiv_ugoira_frame_data" in kwdict: + self._frames = kwdict["pixiv_ugoira_frame_data"]["data"] else: return if self.delete: pathfmt.set_extension(self.extension) + pathfmt.build_path() def convert(self, pathfmt): if not self._frames: @@ -115,6 +117,8 @@ class UgoiraPP(PostProcessor): # process frames and collect command-line arguments pathfmt.set_extension(self.extension) + pathfmt.build_path() + args = self._process(pathfmt, tempdir) if self.args: args += self.args @@ -151,6 +155,7 @@ class 
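The new "jsonl" mode above reuses the JSON writer with indent=None, a fixed data.jsonl filename, and append mode, so repeated runs accumulate one compact JSON object per downloaded file; the private option additionally keeps the underscore-prefixed internal fields that filter_dict() would normally drop. Reading such a file back is one json.loads() per line:

    import json

    # consume a data.jsonl written by the metadata post processor
    with open("data.jsonl", encoding="utf-8") as fp:
        records = [json.loads(line) for line in fp if line.strip()]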
UgoiraPP(PostProcessor): pathfmt.delete = True else: pathfmt.set_extension("zip") + pathfmt.build_path() def _exec(self, args): self.log.debug(args) diff --git a/gallery_dl/text.py b/gallery_dl/text.py index 79cf016..1fb1851 100644 --- a/gallery_dl/text.py +++ b/gallery_dl/text.py @@ -120,6 +120,15 @@ def extract(txt, begin, end, pos=0): return None, pos +def extr(txt, begin, end, default=""): + """Stripped-down version of 'extract()'""" + try: + first = txt.index(begin) + len(begin) + return txt[first:txt.index(end, first)] + except (ValueError, TypeError, AttributeError): + return default + + def rextract(txt, begin, end, pos=-1): try: lbeg = len(begin) diff --git a/gallery_dl/util.py b/gallery_dl/util.py index 98b6d59..8ce1fb4 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -19,6 +19,7 @@ import binascii import datetime import functools import itertools +import subprocess import urllib.parse from http.cookiejar import Cookie from email.utils import mktime_tz, parsedate_tz @@ -273,6 +274,39 @@ Response Headers fp.write(response.content) +def extract_headers(response): + headers = response.headers + data = dict(headers) + + hcd = headers.get("content-disposition") + if hcd: + name = text.extr(hcd, 'filename="', '"') + if name: + text.nameext_from_url(name, data) + + hlm = headers.get("last-modified") + if hlm: + data["date"] = datetime.datetime(*parsedate_tz(hlm)[:6]) + + return data + + +@functools.lru_cache(maxsize=None) +def git_head(): + try: + out, err = subprocess.Popen( + ("git", "rev-parse", "--short", "HEAD"), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + cwd=os.path.dirname(os.path.abspath(__file__)), + ).communicate() + if out and not err: + return out.decode().rstrip() + except (OSError, subprocess.SubprocessError): + pass + return None + + def expand_path(path): """Expand environment variables and tildes (~)""" if not path: diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 85a03de..31dbc63 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. 
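text.extr() above is the reason for most of this commit's churn: unlike text.extract(), it returns the substring directly (with a "" default) instead of a (value, position) tuple, so the ubiquitous `text.extract(...)[0]` pattern and its None checks disappear. Its behavior, demonstrated with the definition from the hunk above:

    def extr(txt, begin, end, default=""):
        """Stripped-down version of 'extract()'"""
        try:
            first = txt.index(begin) + len(begin)
            return txt[first:txt.index(end, first)]
        except (ValueError, TypeError, AttributeError):
            return default

    assert extr('<a href="/x">y</a>', 'href="', '"') == "/x"
    assert extr("no anchor here", 'href="', '"') == ""   # not found
    assert extr(None, "a", "b") == ""                    # tolerates None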
-__version__ = "1.23.5" +__version__ = "1.24.0" diff --git a/gallery_dl/ytdl.py b/gallery_dl/ytdl.py index b2da445..db313c3 100644 --- a/gallery_dl/ytdl.py +++ b/gallery_dl/ytdl.py @@ -73,7 +73,11 @@ def parse_command_line(module, argv): ytdlp = (module.__name__ == "yt_dlp") std_headers = module.std_headers - parse_bytes = module.FileDownloader.parse_bytes + + try: + parse_bytes = module.parse_bytes + except AttributeError: + parse_bytes = module.FileDownloader.parse_bytes # HTTP headers if opts.user_agent is not None: @@ -1,7 +1,8 @@ [flake8] -exclude = gallery_dl/__init__.py,gallery_dl/__main__.py,setup.py,build,scripts,archive +exclude = build,archive ignore = E203,E226,W504 per-file-ignores = + setup.py: E501 gallery_dl/extractor/500px.py: E501 [egg_info] @@ -5,7 +5,6 @@ import re import sys import os.path import warnings -from setuptools import setup def read(fname): @@ -13,6 +12,7 @@ def read(fname): with open(path, encoding="utf-8") as file: return file.read() + def check_file(fname): path = os.path.join(os.path.dirname(__file__), fname) if os.path.exists(path): @@ -41,99 +41,109 @@ FILES = [ ] ] +PACKAGES = [ + "gallery_dl", + "gallery_dl.extractor", + "gallery_dl.downloader", + "gallery_dl.postprocessor", +] + DESCRIPTION = ("Command-line program to download image galleries and " "collections from several image hosting sites") LONG_DESCRIPTION = read("README.rst") -if "py2exe" in sys.argv: - try: - import py2exe - except ImportError: - sys.exit("Error importing 'py2exe'") +def build_py2exe(): + from py2exe import freeze # py2exe dislikes version specifiers with a trailing '-dev' - VERSION = VERSION.partition("-")[0] + VERSION_ = VERSION.partition("-")[0] - params = { - "console": [{ + freeze( + console=[{ "script" : "./gallery_dl/__main__.py", "dest_base" : "gallery-dl", - "version" : VERSION, + }], + version_info={ + "version" : VERSION_, "description" : DESCRIPTION, "comments" : LONG_DESCRIPTION, "product_name" : "gallery-dl", - "product_version": VERSION, - }], - "options": {"py2exe": { - "bundle_files": 0, - "compressed" : 1, - "optimize" : 1, - "dist_dir" : ".", - "packages" : ["gallery_dl"], - "includes" : ["youtube_dl"], - "dll_excludes": ["w9xpopen.exe"], - }}, - "zipfile": None, - } + "product_version": VERSION_, + }, + options={ + "bundle_files" : 0, + "compressed" : 1, + "optimize" : 1, + "dist_dir" : "./dist", + "packages" : PACKAGES, + "includes" : ["youtube_dl"], + "dll_excludes" : ["w9xpopen.exe"], + }, + zipfile=None, + ) -else: - params = {} - - -setup( - name="gallery_dl", - version=VERSION, - description=DESCRIPTION, - long_description=LONG_DESCRIPTION, - url="https://github.com/mikf/gallery-dl", - download_url="https://github.com/mikf/gallery-dl/releases/latest", - author="Mike Fährmann", - author_email="mike_faehrmann@web.de", - maintainer="Mike Fährmann", - maintainer_email="mike_faehrmann@web.de", - license="GPLv2", - python_requires=">=3.4", - install_requires=[ - "requests>=2.11.0", - ], - extras_require={ - "video": [ - "youtube-dl", + +def build_setuptools(): + from setuptools import setup + + setup( + name="gallery_dl", + version=VERSION, + description=DESCRIPTION, + long_description=LONG_DESCRIPTION, + url="https://github.com/mikf/gallery-dl", + download_url="https://github.com/mikf/gallery-dl/releases/latest", + author="Mike Fährmann", + author_email="mike_faehrmann@web.de", + maintainer="Mike Fährmann", + maintainer_email="mike_faehrmann@web.de", + license="GPLv2", + python_requires=">=3.4", + install_requires=[ + "requests>=2.11.0", ], - }, - 
packages=[ - "gallery_dl", - "gallery_dl.extractor", - "gallery_dl.downloader", - "gallery_dl.postprocessor", - ], - entry_points={ - "console_scripts": [ - "gallery-dl = gallery_dl:main", + extras_require={ + "video": [ + "youtube-dl", + ], + }, + entry_points={ + "console_scripts": [ + "gallery-dl = gallery_dl:main", + ], + }, + packages=PACKAGES, + data_files=FILES, + test_suite="test", + keywords="image gallery downloader crawler scraper", + classifiers=[ + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Intended Audience :: End Users/Desktop", + "License :: OSI Approved :: GNU General Public License v2 (GPLv2)", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.4", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", + "Topic :: Internet :: WWW/HTTP", + "Topic :: Multimedia :: Graphics", + "Topic :: Utilities", ], - }, - data_files=FILES, - keywords="image gallery downloader crawler scraper", - classifiers=[ - "Development Status :: 5 - Production/Stable", - "Environment :: Console", - "Intended Audience :: End Users/Desktop", - "License :: OSI Approved :: GNU General Public License v2 (GPLv2)", - "Operating System :: Microsoft :: Windows", - "Operating System :: POSIX", - "Operating System :: MacOS", - "Programming Language :: Python :: 3.4", - "Programming Language :: Python :: 3.5", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3 :: Only", - "Topic :: Internet :: WWW/HTTP", - "Topic :: Multimedia :: Graphics", - "Topic :: Utilities", - ], - test_suite="test", - **params, -) + ) + + +if "py2exe" in sys.argv: + build_py2exe() +else: + build_setuptools() diff --git a/test/test_downloader.py b/test/test_downloader.py index 9350ce4..0703754 100644 --- a/test/test_downloader.py +++ b/test/test_downloader.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright 2018-2021 Mike Fährmann +# Copyright 2018-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -13,9 +13,9 @@ import unittest from unittest.mock import Mock, MagicMock, patch import re -import base64 import logging import os.path +import binascii import tempfile import threading import http.server @@ -23,6 +23,7 @@ import http.server sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from gallery_dl import downloader, extractor, output, config, path # noqa E402 +from gallery_dl.downloader.http import MIME_TYPES, SIGNATURE_CHECKS # noqa E402 class MockDownloaderModule(Mock): @@ -130,6 +131,7 @@ class TestDownloaderBase(unittest.TestCase): pathfmt = cls.job.pathfmt pathfmt.set_directory(kwdict) pathfmt.set_filename(kwdict) + pathfmt.build_path() if content: mode = "w" + ("b" if isinstance(content, bytes) else "") @@ -156,6 +158,7 @@ class 
diff --git a/test/test_downloader.py b/test/test_downloader.py
index 9350ce4..0703754 100644
--- a/test/test_downloader.py
+++ b/test/test_downloader.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-# Copyright 2018-2021 Mike Fährmann
+# Copyright 2018-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -13,9 +13,9 @@ import unittest
 from unittest.mock import Mock, MagicMock, patch
 
 import re
-import base64
 import logging
 import os.path
+import binascii
 import tempfile
 import threading
 import http.server
@@ -23,6 +23,7 @@ import http.server
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from gallery_dl import downloader, extractor, output, config, path  # noqa E402
+from gallery_dl.downloader.http import MIME_TYPES, SIGNATURE_CHECKS  # noqa E402
 
 
 class MockDownloaderModule(Mock):
@@ -130,6 +131,7 @@ class TestDownloaderBase(unittest.TestCase):
         pathfmt = cls.job.pathfmt
         pathfmt.set_directory(kwdict)
         pathfmt.set_filename(kwdict)
+        pathfmt.build_path()
 
         if content:
             mode = "w" + ("b" if isinstance(content, bytes) else "")
@@ -156,6 +158,7 @@ class TestDownloaderBase(unittest.TestCase):
         self.assertEqual(
             pathfmt.extension,
             expected_extension,
+            content[0:16],
         )
         self.assertEqual(
             os.path.splitext(pathfmt.realpath)[1][1:],
@@ -172,48 +175,52 @@ class TestHTTPDownloader(TestDownloaderBase):
         port = 8088
         cls.address = "http://127.0.0.1:{}".format(port)
-        cls._jpg = cls.address + "/image.jpg"
-        cls._png = cls.address + "/image.png"
-        cls._gif = cls.address + "/image.gif"
-
         server = http.server.HTTPServer(("", port), HttpRequestHandler)
         threading.Thread(target=server.serve_forever, daemon=True).start()
 
+    def _run_test(self, ext, input, output,
+                  extension, expected_extension=None):
+        TestDownloaderBase._run_test(
+            self, self.address + "/" + ext, input, output,
+            extension, expected_extension)
+
     def tearDown(self):
         self.downloader.minsize = self.downloader.maxsize = None
 
     def test_http_download(self):
-        self._run_test(self._jpg, None, DATA_JPG, "jpg", "jpg")
-        self._run_test(self._png, None, DATA_PNG, "png", "png")
-        self._run_test(self._gif, None, DATA_GIF, "gif", "gif")
+        self._run_test("jpg", None, DATA["jpg"], "jpg", "jpg")
+        self._run_test("png", None, DATA["png"], "png", "png")
+        self._run_test("gif", None, DATA["gif"], "gif", "gif")
 
     def test_http_offset(self):
-        self._run_test(self._jpg, DATA_JPG[:123], DATA_JPG, "jpg", "jpg")
-        self._run_test(self._png, DATA_PNG[:12] , DATA_PNG, "png", "png")
-        self._run_test(self._gif, DATA_GIF[:1]  , DATA_GIF, "gif", "gif")
+        self._run_test("jpg", DATA["jpg"][:123], DATA["jpg"], "jpg", "jpg")
+        self._run_test("png", DATA["png"][:12] , DATA["png"], "png", "png")
+        self._run_test("gif", DATA["gif"][:1]  , DATA["gif"], "gif", "gif")
 
     def test_http_extension(self):
-        self._run_test(self._jpg, None, DATA_JPG, None, "jpg")
-        self._run_test(self._png, None, DATA_PNG, None, "png")
-        self._run_test(self._gif, None, DATA_GIF, None, "gif")
+        self._run_test("jpg", None, DATA["jpg"], None, "jpg")
+        self._run_test("png", None, DATA["png"], None, "png")
+        self._run_test("gif", None, DATA["gif"], None, "gif")
 
     def test_http_adjust_extension(self):
-        self._run_test(self._jpg, None, DATA_JPG, "png", "jpg")
-        self._run_test(self._png, None, DATA_PNG, "gif", "png")
-        self._run_test(self._gif, None, DATA_GIF, "jpg", "gif")
+        self._run_test("jpg", None, DATA["jpg"], "png", "jpg")
+        self._run_test("png", None, DATA["png"], "gif", "png")
+        self._run_test("gif", None, DATA["gif"], "jpg", "gif")
 
     def test_http_filesize_min(self):
+        url = self.address + "/gif"
         pathfmt = self._prepare_destination(None, extension=None)
         self.downloader.minsize = 100
         with self.assertLogs(self.downloader.log, "WARNING"):
-            success = self.downloader.download(self._gif, pathfmt)
+            success = self.downloader.download(url, pathfmt)
         self.assertFalse(success)
 
     def test_http_filesize_max(self):
+        url = self.address + "/jpg"
         pathfmt = self._prepare_destination(None, extension=None)
         self.downloader.maxsize = 100
         with self.assertLogs(self.downloader.log, "WARNING"):
-            success = self.downloader.download(self._jpg, pathfmt)
+            success = self.downloader.download(url, pathfmt)
         self.assertFalse(success)
@@ -237,24 +244,14 @@ class TestTextDownloader(TestDownloaderBase):
 
 class HttpRequestHandler(http.server.BaseHTTPRequestHandler):
 
     def do_GET(self):
-        if self.path == "/image.jpg":
-            content_type = "image/jpeg"
-            output = DATA_JPG
-        elif self.path == "/image.png":
-            content_type = "image/png"
-            output = DATA_PNG
-        elif self.path == "/image.gif":
-            content_type = "image/gif"
-            output = DATA_GIF
-        else:
+        try:
+            output = DATA[self.path[1:]]
+        except KeyError:
             self.send_response(404)
             self.wfile.write(self.path.encode())
             return
 
-        headers = {
-            "Content-Type": content_type,
-            "Content-Length": len(output),
-        }
+        headers = {"Content-Length": len(output)}
 
         if "Range" in self.headers:
             status = 206
@@ -275,31 +272,79 @@ class HttpRequestHandler(http.server.BaseHTTPRequestHandler):
         self.wfile.write(output)
 
 
-DATA_JPG = base64.standard_b64decode("""
-/9j/4AAQSkZJRgABAQEASABIAAD/2wBD
-AAEBAQEBAQEBAQEBAQEBAQEBAQEBAQEB
-AQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEB
-AQEBAQEBAQEBAQEBAQEBAQH/2wBDAQEB
-AQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEB
-AQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEB
-AQEBAQEBAQEBAQEBAQH/wAARCAABAAED
-AREAAhEBAxEB/8QAFAABAAAAAAAAAAAA
-AAAAAAAACv/EABQQAQAAAAAAAAAAAAAA
-AAAAAAD/xAAUAQEAAAAAAAAAAAAAAAAA
-AAAA/8QAFBEBAAAAAAAAAAAAAAAAAAAA
-AP/aAAwDAQACEQMRAD8AfwD/2Q==""")
-
-
-DATA_PNG = base64.standard_b64decode("""
-iVBORw0KGgoAAAANSUhEUgAAAAEAAAAB
-CAAAAAA6fptVAAAACklEQVQIHWP4DwAB
-AQEANl9ngAAAAABJRU5ErkJggg==""")
-
-
-DATA_GIF = base64.standard_b64decode("""
-R0lGODdhAQABAIAAAP///////ywAAAAA
-AQABAAACAkQBADs=""")
-
-
+SAMPLES = {
+    ("jpg" , binascii.a2b_base64(
+        "/9j/4AAQSkZJRgABAQEASABIAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQEBAQEB"
+        "AQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQH/2wBDAQEB"
+        "AQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEB"
+        "AQEBAQEBAQEBAQEBAQH/wAARCAABAAEDAREAAhEBAxEB/8QAFAABAAAAAAAAAAAA"
+        "AAAAAAAACv/EABQQAQAAAAAAAAAAAAAAAAAAAAD/xAAUAQEAAAAAAAAAAAAAAAAA"
+        "AAAA/8QAFBEBAAAAAAAAAAAAAAAAAAAAAP/aAAwDAQACEQMRAD8AfwD/2Q==")),
+    ("png" , binascii.a2b_base64(
+        "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAAAAAA6fptVAAAACklEQVQIHWP4DwAB"
+        "AQEANl9ngAAAAABJRU5ErkJggg==")),
+    ("gif" , binascii.a2b_base64(
+        "R0lGODdhAQABAIAAAP///////ywAAAAAAQABAAACAkQBADs=")),
+    ("bmp" , b"BM"),
+    ("webp", b"RIFF????WEBP"),
+    ("avif", b"????ftypavif"),
+    ("avif", b"????ftypavis"),
+    ("svg" , b"<?xml"),
+    ("ico" , b"\x00\x00\x01\x00"),
+    ("cur" , b"\x00\x00\x02\x00"),
+    ("psd" , b"8BPS"),
+    ("mp4" , b"????ftypmp4"),
+    ("mp4" , b"????ftypavc1"),
+    ("mp4" , b"????ftypiso3"),
+    ("mp4" , b"????ftypM4V"),
+    ("webm", b"\x1A\x45\xDF\xA3"),
+    ("ogg" , b"OggS"),
+    ("wav" , b"RIFF????WAVE"),
+    ("mp3" , b"ID3"),
+    ("mp3" , b"\xFF\xFB"),
+    ("mp3" , b"\xFF\xF3"),
+    ("mp3" , b"\xFF\xF2"),
+    ("zip" , b"PK\x03\x04"),
+    ("zip" , b"PK\x05\x06"),
+    ("zip" , b"PK\x07\x08"),
+    ("rar" , b"Rar!\x1A\x07"),
+    ("rar" , b"\x52\x61\x72\x21\x1A\x07"),
+    ("7z"  , b"\x37\x7A\xBC\xAF\x27\x1C"),
+    ("pdf" , b"%PDF-"),
+    ("swf" , b"FWS"),
+    ("swf" , b"CWS"),
+}
+
+
+DATA = {}
+
+for ext, content in SAMPLES:
+    if ext not in DATA:
+        DATA[ext] = content
+
+for idx, (_, content) in enumerate(SAMPLES):
+    DATA["S{:>02}".format(idx)] = content
+
+
+# reverse mime types mapping
+MIME_TYPES = {
+    ext: mtype
+    for mtype, ext in MIME_TYPES.items()
+}
+
+
+def generate_tests():
+
+    def generate_test(idx, ext, content):
+        def test(self):
+            self._run_test("S{:>02}".format(idx), None, content, "bin", ext)
+        test.__name__ = "test_http_ext_{:>02}_{}".format(idx, ext)
+        return test
+
+    for idx, (ext, content) in enumerate(SAMPLES):
+        test = generate_test(idx, ext, content)
+        setattr(TestHTTPDownloader, test.__name__, test)
+
+
+generate_tests()
 
 if __name__ == "__main__":
     unittest.main()
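These generated tests exercise gallery-dl's signature-based extension fixing: SIGNATURE_CHECKS, imported above from gallery_dl.downloader.http, maps an extension to a predicate over the first bytes of a download, and each test_http_ext_* case expects a file saved as "bin" to be corrected to its real extension. The same table can be used directly outside the test suite; the sniff_extension() helper below is illustrative, not part of gallery-dl:

    from gallery_dl.downloader.http import SIGNATURE_CHECKS

    def sniff_extension(buffer, default="bin"):
        """Return the first extension whose signature predicate matches."""
        for ext, check in SIGNATURE_CHECKS.items():
            if check(buffer):
                return ext
        return default

    # 16 bytes suffice; the predicates only inspect the start of the file
    print(sniff_extension(b"\x89PNG\r\n\x1a\n\x00\x00\x00\x0dIHDR"))  # png
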
diff --git a/test/test_job.py b/test/test_job.py
index fec6997..1bd9ccc 100644
--- a/test/test_job.py
+++ b/test/test_job.py
@@ -87,10 +87,10 @@ num
   1
 subcategory
   test_subcategory
-tags[]
-  - foo
-  - bar
-  - テスト
+tags[N]
+  0 foo
+  1 bar
+  2 テスト
 user[id]
   123
 user[name]
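The expected-output change documents how keyword listings now print list values: instead of a bare tags[] header with "-" bullets, each element is shown with the index under which it can be addressed in format strings (tags[0], tags[1], and so on). A rough re-implementation of that rendering, with a hypothetical helper name:

    def format_list(name, values):
        lines = ["{}[N]".format(name)]
        for idx, value in enumerate(values):
            lines.append("  {} {}".format(idx, value))
        return "\n".join(lines)

    print(format_list("tags", ["foo", "bar", "テスト"]))
    # tags[N]
    #   0 foo
    #   1 bar
    #   2 テスト
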
diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py
index ba37ee0..7da2089 100644
--- a/test/test_postprocessor.py
+++ b/test/test_postprocessor.py
@@ -97,6 +97,7 @@ class BasePostprocessorTest(unittest.TestCase):
         self.pathfmt = self.job.pathfmt
         self.pathfmt.set_directory(kwdict)
         self.pathfmt.set_filename(kwdict)
+        self.pathfmt.build_path()
 
         pp = postprocessor.find(self.__class__.__name__[:-4].lower())
         return pp(self.job, options)
@@ -118,6 +119,7 @@ class ClassifyTest(BasePostprocessorTest):
             for ext in exts
         })
         self.pathfmt.set_extension("jpg")
+        self.pathfmt.build_path()
         pp.prepare(self.pathfmt)
 
         path = os.path.join(self.dir.name, "test", "Pictures")
@@ -150,6 +152,7 @@ class ClassifyTest(BasePostprocessorTest):
             "bar": "foo/bar",
         })
         self.pathfmt.set_extension("foo")
+        self.pathfmt.build_path()
         pp.prepare(self.pathfmt)
 
         path = os.path.join(self.dir.name, "test", "foo", "bar")
diff --git a/test/test_text.py b/test/test_text.py
index 0ac7767..2c0be3b 100644
--- a/test/test_text.py
+++ b/test/test_text.py
@@ -203,6 +203,24 @@ class TestText(unittest.TestCase):
             self.assertEqual(f(txt  , value, ">")  , (None, 0))
             self.assertEqual(f(txt  , "<"  , value), (None, 0))
 
+    def test_extr(self, f=text.extr):
+        txt = "<a><b>"
+        self.assertEqual(f(txt, "X", ">"), "")
+        self.assertEqual(f(txt, "<", "X"), "")
+        self.assertEqual(f(txt, "<", ">"), "a")
+        self.assertEqual(f(txt, "><", ">"), "b")
+
+        # 'default' argument
+        self.assertEqual(f(txt, "<", "X", None), None)
+        self.assertEqual(f(txt, "<", "X", default=None), None)
+        self.assertEqual(f(txt, "<", "X", default=()), ())
+
+        # invalid arguments
+        for value in INVALID:
+            self.assertEqual(f(value, "<"  , ">")  , "")
+            self.assertEqual(f(txt  , value, ">")  , "")
+            self.assertEqual(f(txt  , "<"  , value), "")
+
     def test_rextract(self, f=text.rextract):
         txt = "<a><b>"
         self.assertEqual(f(txt, "<", ">"), ("b" , 3))
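The new test_extr case pins down the contract of text.extr(), a slimmer sibling of text.extract(): it returns only the matched substring, without the (match, end_position) tuple, and signals a miss with the default ("" unless overridden) instead of None. Usage against gallery-dl 1.24.0 looks like this; the HTML snippet is illustrative:

    from gallery_dl import text

    html = '<span class="name">mike</span>'

    # extract() returns (match, end_position) for resumable scanning
    print(text.extract(html, '">', "</span>"))   # ('mike', 30)

    # extr() returns just the match ...
    print(text.extr(html, '">', "</span>"))      # 'mike'

    # ... and "" (not None) when nothing matches
    print(text.extr(html, "<div>", "</div>"))    # ''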
