author     Unit 193 <unit193@unit193.net>  2022-11-22 04:28:43 -0500
committer  Unit 193 <unit193@unit193.net>  2022-11-22 04:28:43 -0500
commit     2c529817db948dea14069ea9f5ccfae5598fff47 (patch)
tree       ec67f818b3f6f329fdaa37cf8cf8a89d90022747
parent     5ed6cfd4bbc85cc8c4bece8e0ff30600a640e6aa (diff)
parent     7af5cc29d1c02d20a6890b7b7ba78ab41532a763 (diff)
Update upstream source from tag 'upstream/1.24.0'
Update to upstream version '1.24.0' with Debian dir 44b4fff355c06e5285ef1f8a3ec428ae9cef164b
-rw-r--r--  CHANGELOG.md  53
-rw-r--r--  PKG-INFO  19
-rw-r--r--  README.rst  5
-rw-r--r--  data/completion/_gallery-dl  2
-rw-r--r--  data/completion/gallery-dl  2
-rw-r--r--  data/completion/gallery-dl.fish  2
-rw-r--r--  data/man/gallery-dl.1  8
-rw-r--r--  data/man/gallery-dl.conf.5  133
-rw-r--r--  docs/gallery-dl-example.conf  13
-rw-r--r--  docs/gallery-dl.conf  9
-rw-r--r--  gallery_dl.egg-info/PKG-INFO  19
-rw-r--r--  gallery_dl.egg-info/SOURCES.txt  3
-rw-r--r--  gallery_dl/__init__.py  23
-rw-r--r--  gallery_dl/config.py  6
-rw-r--r--  gallery_dl/cookies.py  2
-rw-r--r--  gallery_dl/downloader/http.py  139
-rw-r--r--  gallery_dl/downloader/ytdl.py  2
-rw-r--r--  gallery_dl/extractor/2chan.py  10
-rw-r--r--  gallery_dl/extractor/35photo.py  10
-rw-r--r--  gallery_dl/extractor/8muses.py  4
-rw-r--r--  gallery_dl/extractor/__init__.py  3
-rw-r--r--  gallery_dl/extractor/artstation.py  4
-rw-r--r--  gallery_dl/extractor/aryion.py  3
-rw-r--r--  gallery_dl/extractor/bbc.py  4
-rw-r--r--  gallery_dl/extractor/bcy.py  2
-rw-r--r--  gallery_dl/extractor/behance.py  9
-rw-r--r--  gallery_dl/extractor/blogger.py  4
-rw-r--r--  gallery_dl/extractor/booru.py  30
-rw-r--r--  gallery_dl/extractor/bunkr.py  4
-rw-r--r--  gallery_dl/extractor/common.py  53
-rw-r--r--  gallery_dl/extractor/deviantart.py  20
-rw-r--r--  gallery_dl/extractor/dynastyscans.py  6
-rw-r--r--  gallery_dl/extractor/erome.py  4
-rw-r--r--  gallery_dl/extractor/exhentai.py  35
-rw-r--r--  gallery_dl/extractor/fallenangels.py  2
-rw-r--r--  gallery_dl/extractor/foolfuuka.py  2
-rw-r--r--  gallery_dl/extractor/foolslide.py  2
-rw-r--r--  gallery_dl/extractor/furaffinity.py  2
-rw-r--r--  gallery_dl/extractor/fuskator.py  4
-rw-r--r--  gallery_dl/extractor/gelbooru.py  30
-rw-r--r--  gallery_dl/extractor/gelbooru_v02.py  182
-rw-r--r--  gallery_dl/extractor/generic.py  38
-rw-r--r--  gallery_dl/extractor/hentaicosplays.py  2
-rw-r--r--  gallery_dl/extractor/hentaifoundry.py  4
-rw-r--r--  gallery_dl/extractor/hentaihere.py  35
-rw-r--r--  gallery_dl/extractor/hiperdex.py  4
-rw-r--r--  gallery_dl/extractor/hotleak.py  12
-rw-r--r--  gallery_dl/extractor/idolcomplex.py  2
-rw-r--r--  gallery_dl/extractor/imagebam.py  4
-rw-r--r--  gallery_dl/extractor/imagechest.py  4
-rw-r--r--  gallery_dl/extractor/imagefap.py  2
-rw-r--r--  gallery_dl/extractor/imagehosts.py  32
-rw-r--r--  gallery_dl/extractor/imgbb.py  6
-rw-r--r--  gallery_dl/extractor/imgbox.py  4
-rw-r--r--  gallery_dl/extractor/imgth.py  2
-rw-r--r--  gallery_dl/extractor/inkbunny.py  2
-rw-r--r--  gallery_dl/extractor/instagram.py  130
-rw-r--r--  gallery_dl/extractor/issuu.py  4
-rw-r--r--  gallery_dl/extractor/kabeuchi.py  2
-rw-r--r--  gallery_dl/extractor/keenspot.py  4
-rw-r--r--  gallery_dl/extractor/kemonoparty.py  2
-rw-r--r--  gallery_dl/extractor/khinsider.py  9
-rw-r--r--  gallery_dl/extractor/kissgoddess.py  4
-rw-r--r--  gallery_dl/extractor/komikcast.py  6
-rw-r--r--  gallery_dl/extractor/lightroom.py  2
-rw-r--r--  gallery_dl/extractor/lineblog.py  4
-rw-r--r--  gallery_dl/extractor/livedoor.py  6
-rw-r--r--  gallery_dl/extractor/lolisafe.py  13
-rw-r--r--  gallery_dl/extractor/manganelo.py  4
-rw-r--r--  gallery_dl/extractor/mangapark.py  6
-rw-r--r--  gallery_dl/extractor/mangoxo.py  4
-rw-r--r--  gallery_dl/extractor/mastodon.py  4
-rw-r--r--  gallery_dl/extractor/moebooru.py  55
-rw-r--r--  gallery_dl/extractor/myhentaigallery.py  2
-rw-r--r--  gallery_dl/extractor/myportfolio.py  4
-rw-r--r--  gallery_dl/extractor/nana.py  14
-rw-r--r--  gallery_dl/extractor/naverwebtoon.py  2
-rw-r--r--  gallery_dl/extractor/newgrounds.py  8
-rw-r--r--  gallery_dl/extractor/ngomik.py  51
-rw-r--r--  gallery_dl/extractor/nijie.py  7
-rw-r--r--  gallery_dl/extractor/nitter.py  256
-rw-r--r--  gallery_dl/extractor/patreon.py  6
-rw-r--r--  gallery_dl/extractor/philomena.py  4
-rw-r--r--  gallery_dl/extractor/photobucket.py  2
-rw-r--r--  gallery_dl/extractor/pillowfort.py  2
-rw-r--r--  gallery_dl/extractor/pixiv.py  47
-rw-r--r--  gallery_dl/extractor/pixnet.py  6
-rw-r--r--  gallery_dl/extractor/pururin.py  4
-rw-r--r--  gallery_dl/extractor/reactor.py  8
-rw-r--r--  gallery_dl/extractor/sankaku.py  4
-rw-r--r--  gallery_dl/extractor/sexcom.py  8
-rw-r--r--  gallery_dl/extractor/simplyhentai.py  4
-rw-r--r--  gallery_dl/extractor/subscribestar.py  18
-rw-r--r--  gallery_dl/extractor/tumblr.py  11
-rw-r--r--  gallery_dl/extractor/tumblrgallery.py  10
-rw-r--r--  gallery_dl/extractor/twibooru.py  2
-rw-r--r--  gallery_dl/extractor/twitter.py  325
-rw-r--r--  gallery_dl/extractor/uploadir.py  90
-rw-r--r--  gallery_dl/extractor/vanillarock.py  4
-rw-r--r--  gallery_dl/extractor/vsco.py  2
-rw-r--r--  gallery_dl/extractor/wallhaven.py  61
-rw-r--r--  gallery_dl/extractor/warosu.py  6
-rw-r--r--  gallery_dl/extractor/weasyl.py  2
-rw-r--r--  gallery_dl/extractor/webtoons.py  2
-rw-r--r--  gallery_dl/extractor/weibo.py  3
-rw-r--r--  gallery_dl/extractor/xhamster.py  4
-rw-r--r--  gallery_dl/extractor/xvideos.py  4
-rw-r--r--  gallery_dl/extractor/zerochan.py  2
-rw-r--r--  gallery_dl/job.py  70
-rw-r--r--  gallery_dl/option.py  10
-rw-r--r--  gallery_dl/path.py  43
-rw-r--r--  gallery_dl/postprocessor/compare.py  5
-rw-r--r--  gallery_dl/postprocessor/metadata.py  22
-rw-r--r--  gallery_dl/postprocessor/ugoira.py  13
-rw-r--r--  gallery_dl/text.py  9
-rw-r--r--  gallery_dl/util.py  34
-rw-r--r--  gallery_dl/version.py  2
-rw-r--r--  gallery_dl/ytdl.py  6
-rw-r--r--  setup.cfg  3
-rw-r--r--  setup.py  172
-rw-r--r--  test/test_downloader.py  165
-rw-r--r--  test/test_job.py  8
-rw-r--r--  test/test_postprocessor.py  3
-rw-r--r--  test/test_text.py  18
124 files changed, 1990 insertions, 859 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 21341ef..f92ab19 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,58 @@
# Changelog
+## 1.24.0 - 2022-11-20
+### Additions
+- [exhentai] add metadata to search results ([#3181](https://github.com/mikf/gallery-dl/issues/3181))
+- [gelbooru_v02] implement `notes` extraction
+- [instagram] add `guide` extractor ([#3192](https://github.com/mikf/gallery-dl/issues/3192))
+- [lolisafe] add support for xbunkr ([#3153](https://github.com/mikf/gallery-dl/issues/3153), [#3156](https://github.com/mikf/gallery-dl/issues/3156))
+- [mastodon] add `instance_remote` metadata field ([#3119](https://github.com/mikf/gallery-dl/issues/3119))
+- [nitter] add extractors for Nitter instances ([#2415](https://github.com/mikf/gallery-dl/issues/2415), [#2696](https://github.com/mikf/gallery-dl/issues/2696))
+- [pixiv] add support for new daily AI rankings category ([#3214](https://github.com/mikf/gallery-dl/issues/3214), [#3221](https://github.com/mikf/gallery-dl/issues/3221))
+- [twitter] add `avatar` and `background` extractors ([#349](https://github.com/mikf/gallery-dl/issues/349), [#3023](https://github.com/mikf/gallery-dl/issues/3023))
+- [uploadir] add support for `uploadir.com` ([#3162](https://github.com/mikf/gallery-dl/issues/3162))
+- [wallhaven] add `user` extractor ([#3212](https://github.com/mikf/gallery-dl/issues/3212), [#3213](https://github.com/mikf/gallery-dl/issues/3213), [#3226](https://github.com/mikf/gallery-dl/issues/3226))
+- [downloader:http] add `chunk-size` option ([#3143](https://github.com/mikf/gallery-dl/issues/3143))
+- [downloader:http] add file signature check for `.mp4` files
+- [downloader:http] add file signature check and MIME type for `.avif` files
+- [postprocessor] implement `post-after` event ([#3117](https://github.com/mikf/gallery-dl/issues/3117))
+- [postprocessor:metadata] implement `"mode": "jsonl"`
+- [postprocessor:metadata] add `open`, `encoding`, and `private` options
+- add `--chunk-size` command-line option ([#3143](https://github.com/mikf/gallery-dl/issues/3143))
+- add `--user-agent` command-line option
+- implement `http-metadata` option
+- implement `"user-agent": "browser"` ([#2636](https://github.com/mikf/gallery-dl/issues/2636))
+### Changes
+- [deviantart] restore cookies warning for mature scraps ([#3129](https://github.com/mikf/gallery-dl/issues/3129))
+- [instagram] use REST API for unauthenticated users by default
+- [downloader:http] increase default `chunk-size` to 32768 bytes ([#3143](https://github.com/mikf/gallery-dl/issues/3143))
+- build Windows executables using py2exe's new `freeze()` API
+- build executables on GitHub Actions with Python 3.11
+- reword error text for unsupported URLs
+### Fixes
+- [exhentai] fix pagination ([#3181](https://github.com/mikf/gallery-dl/issues/3181))
+- [khinsider] fix extraction ([#3215](https://github.com/mikf/gallery-dl/issues/3215), [#3219](https://github.com/mikf/gallery-dl/issues/3219))
+- [realbooru] fix download URLs ([#2530](https://github.com/mikf/gallery-dl/issues/2530))
+- [realbooru] fix `tags` extraction ([#2530](https://github.com/mikf/gallery-dl/issues/2530))
+- [tumblr] fall back to `gifv` when possible ([#3095](https://github.com/mikf/gallery-dl/issues/3095), [#3159](https://github.com/mikf/gallery-dl/issues/3159))
+- [twitter] fix login ([#3220](https://github.com/mikf/gallery-dl/issues/3220))
+- [twitter] update URL for syndication API ([#3160](https://github.com/mikf/gallery-dl/issues/3160))
+- [weibo] send `Referer` headers ([#3188](https://github.com/mikf/gallery-dl/issues/3188))
+- [ytdl] update `parse_bytes` location ([#3256](https://github.com/mikf/gallery-dl/issues/3256))
+### Improvements
+- [imxto] extract additional metadata ([#3118](https://github.com/mikf/gallery-dl/issues/3118), [#3175](https://github.com/mikf/gallery-dl/issues/3175))
+- [instagram] allow downloading avatars for private profiles ([#3255](https://github.com/mikf/gallery-dl/issues/3255))
+- [pixiv] raise error for invalid search/ranking parameters ([#3214](https://github.com/mikf/gallery-dl/issues/3214))
+- [twitter] update `bookmarks` pagination ([#3172](https://github.com/mikf/gallery-dl/issues/3172))
+- [downloader:http] refactor file signature checks
+- [downloader:http] improve `-r/--limit-rate` accuracy ([#3143](https://github.com/mikf/gallery-dl/issues/3143))
+- add loaded config files to debug output
+- improve `-K` output for lists
+### Removals
+- [instagram] remove login support ([#3139](https://github.com/mikf/gallery-dl/issues/3139), [#3141](https://github.com/mikf/gallery-dl/issues/3141), [#3191](https://github.com/mikf/gallery-dl/issues/3191))
+- [instagram] remove `channel` extractor
+- [ngomik] remove module
+
## 1.23.5 - 2022-10-30
### Fixes
- [instagram] fix AttributeError on user stories extraction ([#3123](https://github.com/mikf/gallery-dl/issues/3123))
diff --git a/PKG-INFO b/PKG-INFO
index f229a02..b12ad70 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gallery_dl
-Version: 1.23.5
+Version: 1.24.0
Summary: Command-line program to download image galleries and collections from several image hosting sites
Home-page: https://github.com/mikf/gallery-dl
Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -14,16 +14,20 @@ Classifier: Development Status :: 5 - Production/Stable
Classifier: Environment :: Console
Classifier: Intended Audience :: End Users/Desktop
Classifier: License :: OSI Approved :: GNU General Public License v2 (GPLv2)
-Classifier: Operating System :: Microsoft :: Windows
-Classifier: Operating System :: POSIX
-Classifier: Operating System :: MacOS
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Python :: 3.4
Classifier: Programming Language :: Python :: 3.5
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: Implementation :: CPython
+Classifier: Programming Language :: Python :: Implementation :: PyPy
Classifier: Topic :: Internet :: WWW/HTTP
Classifier: Topic :: Multimedia :: Graphics
Classifier: Topic :: Utilities
@@ -99,8 +103,8 @@ Standalone Executable
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.5/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.5/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.24.0/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.24.0/gallery-dl.bin>`__
| Executables built from the latest commit can be found at
| https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
@@ -243,7 +247,6 @@ and optional for
``idolcomplex``,
``imgbb``,
``inkbunny``,
-``instagram``,
``mangadex``,
``mangoxo``,
``pillowfort``,
diff --git a/README.rst b/README.rst
index c385526..086141f 100644
--- a/README.rst
+++ b/README.rst
@@ -66,8 +66,8 @@ Standalone Executable
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.5/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.5/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.24.0/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.24.0/gallery-dl.bin>`__
| Executables built from the latest commit can be found at
| https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
@@ -210,7 +210,6 @@ and optional for
``idolcomplex``,
``imgbb``,
``inkbunny``,
-``instagram``,
``mangadex``,
``mangoxo``,
``pillowfort``,
diff --git a/data/completion/_gallery-dl b/data/completion/_gallery-dl
index 5e46dc5..eb5c0f4 100644
--- a/data/completion/_gallery-dl
+++ b/data/completion/_gallery-dl
@@ -13,6 +13,7 @@ _arguments -C -S \
{-f,--filename}'[Filename format string for downloaded files ("/O" for "original" filenames)]':'<format>' \
--proxy'[Use the specified proxy]':'<url>' \
--source-address'[Client-side IP address to bind to]':'<ip>' \
+--user-agent'[User-Agent request header]':'<ua>' \
--clear-cache'[Delete cached login sessions, cookies, etc. for MODULE (ALL to delete everything)]':'<module>' \
--cookies'[File to load additional cookies from]':'<file>':_files \
--cookies-from-browser'[Name of the browser to load cookies from, with optional keyring name prefixed with "+" and profile prefixed with ":"]':'<browser[+keyring][:profile]>' \
@@ -37,6 +38,7 @@ _arguments -C -S \
--sleep-extractor'[Number of seconds to wait before starting data extraction for an input URL]':'<seconds>' \
--filesize-min'[Do not download files smaller than SIZE (e.g. 500k or 2.5M)]':'<size>' \
--filesize-max'[Do not download files larger than SIZE (e.g. 500k or 2.5M)]':'<size>' \
+--chunk-size'[Size of in-memory data chunks (default: 32k)]':'<size>' \
--no-part'[Do not use .part files]' \
--no-skip'[Do not skip downloads; overwrite existing files]' \
--no-mtime'[Do not set file modification times according to Last-Modified HTTP response headers]' \
diff --git a/data/completion/gallery-dl b/data/completion/gallery-dl
index 40280d5..f57306e 100644
--- a/data/completion/gallery-dl
+++ b/data/completion/gallery-dl
@@ -10,7 +10,7 @@ _gallery_dl()
elif [[ "${prev}" =~ ^()$ ]]; then
COMPREPLY=( $(compgen -d -- "${cur}") )
else
- COMPREPLY=( $(compgen -W "--help --version --input-file --destination --directory --filename --proxy --source-address --clear-cache --cookies --cookies-from-browser --quiet --verbose --get-urls --resolve-urls --dump-json --simulate --extractor-info --list-keywords --list-modules --list-extractors --write-log --write-unsupported --write-pages --limit-rate --retries --http-timeout --sleep --sleep-request --sleep-extractor --filesize-min --filesize-max --no-part --no-skip --no-mtime --no-download --no-postprocessors --no-check-certificate --config --config-yaml --option --ignore-config --username --password --netrc --download-archive --abort --terminate --range --chapter-range --filter --chapter-filter --zip --ugoira-conv --ugoira-conv-lossless --ugoira-conv-copy --write-metadata --write-info-json --write-infojson --write-tags --mtime-from-date --exec --exec-after --postprocessor" -- "${cur}") )
+ COMPREPLY=( $(compgen -W "--help --version --input-file --destination --directory --filename --proxy --source-address --user-agent --clear-cache --cookies --cookies-from-browser --quiet --verbose --get-urls --resolve-urls --dump-json --simulate --extractor-info --list-keywords --list-modules --list-extractors --write-log --write-unsupported --write-pages --limit-rate --retries --http-timeout --sleep --sleep-request --sleep-extractor --filesize-min --filesize-max --chunk-size --no-part --no-skip --no-mtime --no-download --no-postprocessors --no-check-certificate --config --config-yaml --option --ignore-config --username --password --netrc --download-archive --abort --terminate --range --chapter-range --filter --chapter-filter --zip --ugoira-conv --ugoira-conv-lossless --ugoira-conv-copy --write-metadata --write-info-json --write-infojson --write-tags --mtime-from-date --exec --exec-after --postprocessor" -- "${cur}") )
fi
}
diff --git a/data/completion/gallery-dl.fish b/data/completion/gallery-dl.fish
index 587ff12..87e625a 100644
--- a/data/completion/gallery-dl.fish
+++ b/data/completion/gallery-dl.fish
@@ -7,6 +7,7 @@ complete -c gallery-dl -x -a '(__fish_complete_directories)' -s 'D' -l 'director
complete -c gallery-dl -x -s 'f' -l 'filename' -d 'Filename format string for downloaded files ("/O" for "original" filenames)'
complete -c gallery-dl -x -l 'proxy' -d 'Use the specified proxy'
complete -c gallery-dl -x -l 'source-address' -d 'Client-side IP address to bind to'
+complete -c gallery-dl -x -l 'user-agent' -d 'User-Agent request header'
complete -c gallery-dl -x -l 'clear-cache' -d 'Delete cached login sessions, cookies, etc. for MODULE (ALL to delete everything)'
complete -c gallery-dl -r -F -l 'cookies' -d 'File to load additional cookies from'
complete -c gallery-dl -x -l 'cookies-from-browser' -d 'Name of the browser to load cookies from, with optional keyring name prefixed with "+" and profile prefixed with ":"'
@@ -31,6 +32,7 @@ complete -c gallery-dl -x -l 'sleep-request' -d 'Number of seconds to wait betwe
complete -c gallery-dl -x -l 'sleep-extractor' -d 'Number of seconds to wait before starting data extraction for an input URL'
complete -c gallery-dl -x -l 'filesize-min' -d 'Do not download files smaller than SIZE (e.g. 500k or 2.5M)'
complete -c gallery-dl -x -l 'filesize-max' -d 'Do not download files larger than SIZE (e.g. 500k or 2.5M)'
+complete -c gallery-dl -x -l 'chunk-size' -d 'Size of in-memory data chunks (default: 32k)'
complete -c gallery-dl -l 'no-part' -d 'Do not use .part files'
complete -c gallery-dl -l 'no-skip' -d 'Do not skip downloads; overwrite existing files'
complete -c gallery-dl -l 'no-mtime' -d 'Do not set file modification times according to Last-Modified HTTP response headers'
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index 0b27854..059b726 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2022-10-30" "1.23.5" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2022-11-20" "1.24.0" "gallery-dl Manual"
.\" disable hyphenation
.nh
@@ -41,6 +41,9 @@ Use the specified proxy
.B "\-\-source\-address" \f[I]IP\f[]
Client-side IP address to bind to
.TP
+.B "\-\-user\-agent" \f[I]UA\f[]
+User-Agent request header
+.TP
.B "\-\-clear\-cache" \f[I]MODULE\f[]
Delete cached login sessions, cookies, etc. for MODULE (ALL to delete everything)
.TP
@@ -113,6 +116,9 @@ Do not download files smaller than SIZE (e.g. 500k or 2.5M)
.B "\-\-filesize\-max" \f[I]SIZE\f[]
Do not download files larger than SIZE (e.g. 500k or 2.5M)
.TP
+.B "\-\-chunk\-size" \f[I]SIZE\f[]
+Size of in-memory data chunks (default: 32k)
+.TP
.B "\-\-no\-part"
Do not use .part files
.TP
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index 8944195..847d665 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2022-10-30" "1.23.5" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2022-11-20" "1.24.0" "gallery-dl Manual"
.\" disable hyphenation
.nh
.\" disable justification (adjust text to left margin only)
@@ -449,8 +449,6 @@ and optional for
.br
* \f[I]inkbunny\f[]
.br
-* \f[I]instagram\f[]
-.br
* \f[I]kemonoparty\f[]
.br
* \f[I]mangadex\f[]
@@ -615,6 +613,9 @@ or a \f[I]list\f[] with IP and explicit port number as elements.
.IP "Description:" 4
User-Agent header value to be used for HTTP requests.
+Setting this value to \f[I]"browser"\f[] will try to automatically detect
+and use the User-Agent used by the system's default browser.
+
Note: This option has no effect on pixiv extractors,
as these need specific values to function correctly.
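The `"browser"` mechanism is implemented by `_browser_useragent()` in gallery_dl/extractor/common.py further below. A minimal sketch of opting in from Python, using the `config.set(path, key, value)` signature seen in gallery_dl/__init__.py, and assuming the option lives under the extractor-level `("extractor",)` path:

    from gallery_dl import config

    # assumption: extractor.*.user-agent is set via the ("extractor",) path
    config.set(("extractor",), "user-agent", "browser")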
@@ -624,7 +625,10 @@ as these need specific values to function correctly.
\f[I]string\f[]
.IP "Default:" 9
-\f[I]"firefox"\f[] for \f[I]patreon\f[], \f[I]null\f[] everywhere else
+.br
+* \f[I]"firefox"\f[] for \f[I]patreon\f[], \f[I]mangapark\f[], and \f[I]mangasee\f[]
+.br
+* \f[I]null\f[] everywhere else
.IP "Example:" 4
.br
@@ -696,6 +700,23 @@ For example, setting this option to \f[I]"gdl_path"\f[] would make it possible
to access the current file's filename as \f[I]"{gdl_path.filename}"\f[].
+.SS extractor.*.http-metadata
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]null\f[]
+
+.IP "Description:" 4
+Insert an \f[I]object\f[] containing a file's HTTP headers and
+\f[I]filename\f[], \f[I]extension\f[], and \f[I]date\f[] parsed from them
+into metadata dictionaries as the given name.
+
+For example, setting this option to \f[I]"gdl_http"\f[] would make it possible
+to access the current file's \f[I]Last-Modified\f[] header as \f[I]"{gdl_http[Last-Modified]}"\f[]
+and its parsed form as \f[I]"{gdl_http[date]}"\f[].
+
+
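A hedged sketch of what this option adds to a metadata dictionary when set to `"gdl_http"`; the field names follow the description above, while the concrete values are invented for illustration:

    import datetime

    kwdict = {"category": "example"}   # a file's metadata dictionary
    kwdict["gdl_http"] = {
        "Last-Modified": "Sun, 20 Nov 2022 12:34:56 GMT",      # raw header
        "date": datetime.datetime(2022, 11, 20, 12, 34, 56),   # parsed form
        "filename": "example",
        "extension": "jpg",
    }
    # accessible in format strings as "{gdl_http[Last-Modified]}"
    # and "{gdl_http[date]}", per the examples above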
.SS extractor.*.category-transfer
.IP "Type:" 6
\f[I]bool\f[]
@@ -1718,17 +1739,15 @@ for details)
\f[I]string\f[]
.IP "Default:" 9
-\f[I]"auto"\f[]
+\f[I]"rest"\f[]
.IP "Description:" 4
Selects which API endpoints to use.
.br
-* \f[I]"rest"\f[]: REST API - higher-resolution media, only usable when logged in
+* \f[I]"rest"\f[]: REST API - higher-resolution media
.br
-* \f[I]"graphql"\f[]: GraphQL API - lower-resolution media, partially accessible when not logged in
-.br
-* \f[I]"auto"\f[]: Use REST API when logged in, GraphQL API otherwise
+* \f[I]"graphql"\f[]: GraphQL API - lower-resolution media
.SS extractor.instagram.include
@@ -1748,7 +1767,6 @@ when processing a user profile.
Possible values are
\f[I]"posts"\f[],
\f[I]"reels"\f[],
-\f[I]"channel"\f[]
\f[I]"tagged"\f[],
\f[I]"stories"\f[],
\f[I]"highlights"\f[],
@@ -3105,6 +3123,32 @@ to use your account's browsing settings and default filters when searching.
See https://wallhaven.cc/help/api for more information.
+.SS extractor.wallhaven.include
+.IP "Type:" 6
+.br
+* \f[I]string\f[]
+.br
+* \f[I]list\f[] of \f[I]strings\f[]
+
+.IP "Default:" 9
+\f[I]"uploads"\f[]
+
+.IP "Example:" 4
+.br
+* "uploads,collections"
+.br
+* ["uploads", "collections"]
+
+.IP "Description:" 4
+A (comma-separated) list of subcategories to include
+when processing a user profile.
+
+Possible values are
+\f[I]"uploads"\f[], \f[I]"collections"\f[].
+
+It is possible to use \f[I]"all"\f[] instead of listing all values separately.
+
+
.SS extractor.wallhaven.metadata
.IP "Type:" 6
\f[I]bool\f[]
@@ -3413,7 +3457,7 @@ Minimum/Maximum allowed file size in bytes.
Any file smaller/larger than this limit will not be downloaded.
Possible values are valid integer or floating-point numbers
-optionally followed by one of \f[I]k\f[], \f[I]m\f[]. \f[I]g\f[], \f[I]t\f[] or \f[I]p\f[].
+optionally followed by one of \f[I]k\f[], \f[I]m\f[], \f[I]g\f[], \f[I]t\f[], or \f[I]p\f[].
These suffixes are case-insensitive.
@@ -3491,7 +3535,7 @@ Set this option to \f[I]null\f[] to disable this indicator.
Maximum download rate in bytes per second.
Possible values are valid integer or floating-point numbers
-optionally followed by one of \f[I]k\f[], \f[I]m\f[]. \f[I]g\f[], \f[I]t\f[] or \f[I]p\f[].
+optionally followed by one of \f[I]k\f[], \f[I]m\f[], \f[I]g\f[], \f[I]t\f[], or \f[I]p\f[].
These suffixes are case-insensitive.
@@ -3559,6 +3603,24 @@ of a file called \f[I]example.png\f[] from \f[I]png\f[] to \f[I]jpg\f[] when sai
contains JPEG/JFIF data.
+.SS downloader.http.chunk-size
+.IP "Type:" 6
+\f[I]integer\f[] or \f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]32768\f[]
+
+.IP "Example:" 4
+"50k", "0.8M"
+
+.IP "Description:" 4
+Number of bytes per downloaded chunk.
+
+Possible values are integer numbers
+optionally followed by one of \f[I]k\f[], \f[I]m\f[], \f[I]g\f[], \f[I]t\f[], or \f[I]p\f[].
+These suffixes are case-insensitive.
+
+
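A minimal sketch of the suffix parsing this description implies; gallery-dl's own implementation is `text.parse_bytes()`, used by the downloader code further below, so this stand-alone version is illustrative only:

    suffixes = {"k": 1024, "m": 1024**2, "g": 1024**3,
                "t": 1024**4, "p": 1024**5}

    def parse_size(value, default=32768):
        # case-insensitive suffix, integer or float prefix
        mult = suffixes.get(value[-1:].lower())
        if mult:
            value = value[:-1]
        try:
            return int(float(value) * (mult or 1))
        except ValueError:
            return default

    assert parse_size("50k") == 51200
    assert parse_size("0.8M") == 838860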
.SS downloader.http.headers
.IP "Type:" 6
\f[I]object\f[]
@@ -4009,6 +4071,9 @@ Selects how to process metadata.
* \f[I]"json"\f[]: write metadata using \f[I]json.dump()
<https://docs.python.org/3/library/json.html#json.dump>\f[]
.br
+* \f[I]"jsonl"\f[]: write metadata in \f[I]JSON Lines
+<https://jsonlines.org/>\f[] format
+.br
* \f[I]"tags"\f[]: write \f[I]tags\f[] separated by newlines
.br
* \f[I]"custom"\f[]: write the result of applying \f[I]metadata.content-format\f[]
@@ -4112,6 +4177,8 @@ When skipping a file download
\f[I]post\f[]
When starting to download all files of a post,
e.g. a Tweet on Twitter or a post on Patreon.
+\f[I]post-after\f[]
+After downloading all files of a post
.SS metadata.fields
@@ -4163,6 +4230,48 @@ Custom format string to build the content of metadata files with.
Note: Only applies for \f[I]"mode": "custom"\f[].
+.SS metadata.open
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Defsult:" 4
+\f[I]"w"\f[]
+
+.IP "Description:" 4
+The \f[I]mode\f[] in which metadata files get opened.
+
+For example,
+use \f[I]"a"\f[] to append to a file's content
+or \f[I]"w"\f[] to truncate it.
+
+See the \f[I]mode\f[] parameter of \f[I]open()\f[] for further details.
+
+
+.SS metadata.private
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Include private fields,
+i.e. fields whose name starts with an underscore.
+
+
+.SS metadata.encoding
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Defsult:" 4
+\f[I]"utf-8"\f[]
+
+.IP "Description:" 4
+Name of the encoding used to encode a file's content.
+
+See the \f[I]encoding\f[] parameter of \f[I]open()\f[] for further details.
+
+
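A hedged sketch of how the three options above (`metadata.open`, `metadata.private`, `metadata.encoding`) plausibly interact when a metadata file is written; the function name and control flow are assumed for illustration, not lifted from the postprocessor code:

    import json

    def write_metadata(path, kwdict, open_mode="w", encoding="utf-8",
                       private=False):
        if not private:
            # drop fields whose name starts with an underscore
            kwdict = {k: v for k, v in kwdict.items()
                      if not k.startswith("_")}
        with open(path, open_mode, encoding=encoding) as fp:
            json.dump(kwdict, fp, default=str)

    write_metadata("example.json", {"id": 1, "_internal": "hidden"})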
.SS metadata.archive
.IP "Type:" 6
\f[I]Path\f[]
diff --git a/docs/gallery-dl-example.conf b/docs/gallery-dl-example.conf
index 279aeef..92509b5 100644
--- a/docs/gallery-dl-example.conf
+++ b/docs/gallery-dl-example.conf
@@ -210,6 +210,19 @@
"text-tweets": true
},
+ "ytdl":
+ {
+ "#": "enable 'ytdl' extractor",
+ "#": "i.e. invoke ytdl on all otherwise unsupported input URLs",
+ "enabled": true,
+
+ "#": "use yt-dlp instead of youtube-dl",
+ "module": "yt_dlp",
+
+ "#": "load ytdl options from config file",
+ "config-file": "~/yt-dlp.conf"
+ },
+
"mastodon":
{
"#": "add 'tabletop.social' as recognized mastodon instance",
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index 1fcbb3b..becf599 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -159,9 +159,8 @@
},
"instagram":
{
- "username": null,
- "password": null,
- "api": "auto",
+ "api": "rest",
+ "cookies": null,
"include": "posts",
"sleep-request": [6.0, 12.0],
"videos": true
@@ -330,7 +329,8 @@
"wallhaven":
{
"api-key": null,
- "metadata": false
+ "metadata": false,
+ "include": "uploads"
},
"weasyl":
{
@@ -381,6 +381,7 @@
"http":
{
"adjust-extensions": true,
+ "chunk-size": 32768,
"headers": null
},
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index d00e803..0d42bce 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gallery-dl
-Version: 1.23.5
+Version: 1.24.0
Summary: Command-line program to download image galleries and collections from several image hosting sites
Home-page: https://github.com/mikf/gallery-dl
Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -14,16 +14,20 @@ Classifier: Development Status :: 5 - Production/Stable
Classifier: Environment :: Console
Classifier: Intended Audience :: End Users/Desktop
Classifier: License :: OSI Approved :: GNU General Public License v2 (GPLv2)
-Classifier: Operating System :: Microsoft :: Windows
-Classifier: Operating System :: POSIX
-Classifier: Operating System :: MacOS
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Python :: 3.4
Classifier: Programming Language :: Python :: 3.5
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: Implementation :: CPython
+Classifier: Programming Language :: Python :: Implementation :: PyPy
Classifier: Topic :: Internet :: WWW/HTTP
Classifier: Topic :: Multimedia :: Graphics
Classifier: Topic :: Utilities
@@ -99,8 +103,8 @@ Standalone Executable
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.5/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.5/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.24.0/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.24.0/gallery-dl.bin>`__
| Executables built from the latest commit can be found at
| https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
@@ -243,7 +247,6 @@ and optional for
``idolcomplex``,
``imgbb``,
``inkbunny``,
-``instagram``,
``mangadex``,
``mangoxo``,
``pillowfort``,
diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt
index 3fa2176..72a07ab 100644
--- a/gallery_dl.egg-info/SOURCES.txt
+++ b/gallery_dl.egg-info/SOURCES.txt
@@ -137,9 +137,9 @@ gallery_dl/extractor/nana.py
gallery_dl/extractor/naver.py
gallery_dl/extractor/naverwebtoon.py
gallery_dl/extractor/newgrounds.py
-gallery_dl/extractor/ngomik.py
gallery_dl/extractor/nhentai.py
gallery_dl/extractor/nijie.py
+gallery_dl/extractor/nitter.py
gallery_dl/extractor/nozomi.py
gallery_dl/extractor/nsfwalbum.py
gallery_dl/extractor/oauth.py
@@ -187,6 +187,7 @@ gallery_dl/extractor/tumblrgallery.py
gallery_dl/extractor/twibooru.py
gallery_dl/extractor/twitter.py
gallery_dl/extractor/unsplash.py
+gallery_dl/extractor/uploadir.py
gallery_dl/extractor/vanillarock.py
gallery_dl/extractor/vichan.py
gallery_dl/extractor/vk.py
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py
index b64fa2f..3701d6f 100644
--- a/gallery_dl/__init__.py
+++ b/gallery_dl/__init__.py
@@ -118,25 +118,15 @@ def main():
config.set(("output",), "mode", "null")
elif args.loglevel <= logging.DEBUG:
import platform
- import subprocess
- import os.path
import requests
extra = ""
if getattr(sys, "frozen", False):
extra = " - Executable"
else:
- try:
- out, err = subprocess.Popen(
- ("git", "rev-parse", "--short", "HEAD"),
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- cwd=os.path.dirname(os.path.abspath(__file__)),
- ).communicate()
- if out and not err:
- extra = " - Git HEAD: " + out.decode().rstrip()
- except (OSError, subprocess.SubprocessError):
- pass
+ git_head = util.git_head()
+ if git_head:
+ extra = " - Git HEAD: " + git_head
log.debug("Version %s%s", __version__, extra)
log.debug("Python %s - %s",
@@ -148,6 +138,8 @@ def main():
except AttributeError:
pass
+ log.debug("Configuration Files %s", config._files)
+
if args.list_modules:
extractor.modules.append("")
sys.stdout.write("\n".join(extractor.modules))
@@ -201,7 +193,8 @@ def main():
if sys.stdin:
urls += util.parse_inputfile(sys.stdin, log)
else:
- log.warning("input file: stdin is not readable")
+ log.warning(
+ "input file: stdin is not readable")
else:
with open(inputfile, encoding="utf-8") as file:
urls += util.parse_inputfile(file, log)
@@ -235,7 +228,7 @@ def main():
except exception.TerminateExtraction:
pass
except exception.NoExtractorError:
- log.error("No suitable extractor found for '%s'", url)
+ log.error("Unsupported URL '%s'", url)
retval |= 64
return retval
diff --git a/gallery_dl/config.py b/gallery_dl/config.py
index 953b1b1..0f2d1f1 100644
--- a/gallery_dl/config.py
+++ b/gallery_dl/config.py
@@ -21,6 +21,7 @@ log = logging.getLogger("config")
# internals
_config = {}
+_files = []
if util.WINDOWS:
_default_configs = [
@@ -61,8 +62,8 @@ def load(files=None, strict=False, fmt="json"):
else:
parsefunc = json.load
- for path in files or _default_configs:
- path = util.expand_path(path)
+ for pathfmt in files or _default_configs:
+ path = util.expand_path(pathfmt)
try:
with open(path, encoding="utf-8") as file:
confdict = parsefunc(file)
@@ -79,6 +80,7 @@ def load(files=None, strict=False, fmt="json"):
_config.update(confdict)
else:
util.combine_dict(_config, confdict)
+ _files.append(pathfmt)
def clear():
diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py
index 579f755..6f9a92d 100644
--- a/gallery_dl/cookies.py
+++ b/gallery_dl/cookies.py
@@ -263,7 +263,7 @@ def _chrome_cookies_database(profile, config):
path = _find_most_recently_used_file(search_root, "Cookies")
if path is None:
- raise FileNotFoundError("Unable tp find {} cookies database in "
+ raise FileNotFoundError("Unable to find {} cookies database in "
"'{}'".format(config["browser"], search_root))
logger.debug("Extracting cookies from %s", path)
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
index 5622462..26eb7b5 100644
--- a/gallery_dl/downloader/http.py
+++ b/gallery_dl/downloader/http.py
@@ -27,10 +27,11 @@ class HttpDownloader(DownloaderBase):
def __init__(self, job):
DownloaderBase.__init__(self, job)
extractor = job.extractor
- self.chunk_size = 16384
self.downloading = False
self.adjust_extension = self.config("adjust-extensions", True)
+ self.chunk_size = self.config("chunk-size", 32768)
+ self.metadata = extractor.config("http-metadata")
self.progress = self.config("progress", 3.0)
self.headers = self.config("headers")
self.minsize = self.config("filesize-min")
@@ -55,6 +56,13 @@ class HttpDownloader(DownloaderBase):
self.log.warning(
"Invalid maximum file size (%r)", self.maxsize)
self.maxsize = maxsize
+ if isinstance(self.chunk_size, str):
+ chunk_size = text.parse_bytes(self.chunk_size)
+ if not chunk_size:
+ self.log.warning(
+ "Invalid chunk size (%r)", self.chunk_size)
+ chunk_size = 32768
+ self.chunk_size = chunk_size
if self.rate:
rate = text.parse_bytes(self.rate)
if rate:
@@ -83,11 +91,12 @@ class HttpDownloader(DownloaderBase):
tries = 0
msg = ""
+ metadata = self.metadata
kwdict = pathfmt.kwdict
adjust_extension = kwdict.get(
"_http_adjust_extension", self.adjust_extension)
- if self.part:
+ if self.part and not metadata:
pathfmt.part_enable(self.partdir)
while True:
@@ -164,13 +173,6 @@ class HttpDownloader(DownloaderBase):
self.log.warning("Invalid response")
return False
- # set missing filename extension from MIME type
- if not pathfmt.extension:
- pathfmt.set_extension(self._find_extension(response))
- if pathfmt.exists():
- pathfmt.temppath = ""
- return True
-
# check file size
size = text.parse_int(size, None)
if size is not None:
@@ -185,11 +187,33 @@ class HttpDownloader(DownloaderBase):
size, self.maxsize)
return False
+ build_path = False
+
+ # set missing filename extension from MIME type
+ if not pathfmt.extension:
+ pathfmt.set_extension(self._find_extension(response))
+ build_path = True
+
+ # set metadata from HTTP headers
+ if metadata:
+ kwdict[metadata] = util.extract_headers(response)
+ build_path = True
+
+ # build and check file path
+ if build_path:
+ pathfmt.build_path()
+ if pathfmt.exists():
+ pathfmt.temppath = ""
+ return True
+ if self.part and metadata:
+ pathfmt.part_enable(self.partdir)
+ metadata = False
+
content = response.iter_content(self.chunk_size)
# check filename extension against file header
if adjust_extension and not offset and \
- pathfmt.extension in FILE_SIGNATURES:
+ pathfmt.extension in SIGNATURE_CHECKS:
try:
file_header = next(
content if response.raw.chunked
@@ -220,7 +244,7 @@ class HttpDownloader(DownloaderBase):
offset += len(file_header)
elif offset:
if adjust_extension and \
- pathfmt.extension in FILE_SIGNATURES:
+ pathfmt.extension in SIGNATURE_CHECKS:
self._adjust_extension(pathfmt, fp.read(16))
fp.seek(offset)
@@ -250,42 +274,38 @@ class HttpDownloader(DownloaderBase):
return True
@staticmethod
- def receive(fp, content, bytes_total, bytes_downloaded):
+ def receive(fp, content, bytes_total, bytes_start):
write = fp.write
for data in content:
write(data)
- def _receive_rate(self, fp, content, bytes_total, bytes_downloaded):
+ def _receive_rate(self, fp, content, bytes_total, bytes_start):
rate = self.rate
- progress = self.progress
- bytes_start = bytes_downloaded
write = fp.write
- t1 = tstart = time.time()
+ progress = self.progress
+
+ bytes_downloaded = 0
+ time_start = time.time()
for data in content:
- write(data)
+ time_current = time.time()
+ time_elapsed = time_current - time_start
+ bytes_downloaded += len(data)
- t2 = time.time() # current time
- elapsed = t2 - t1 # elapsed time
- num_bytes = len(data)
+ write(data)
if progress is not None:
- bytes_downloaded += num_bytes
- tdiff = t2 - tstart
- if tdiff >= progress:
+ if time_elapsed >= progress:
self.out.progress(
- bytes_total, bytes_downloaded,
- int((bytes_downloaded - bytes_start) / tdiff),
+ bytes_total,
+ bytes_start + bytes_downloaded,
+ int(bytes_downloaded / time_elapsed),
)
if rate:
- expected = num_bytes / rate # expected elapsed time
- if elapsed < expected:
- # sleep if less time elapsed than expected
- time.sleep(expected - elapsed)
- t2 = time.time()
-
- t1 = t2
+ time_expected = bytes_downloaded / rate
+ if time_expected > time_elapsed:
+ time.sleep(time_expected - time_elapsed)
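A worked example of the pacing logic above, with assumed numbers: the rewritten loop compares cumulative downloaded bytes against cumulative elapsed time, so per-chunk timing errors no longer accumulate and `-r/--limit-rate` tracks the target average more accurately:

    rate = 1024 * 1024                        # -r/--limit-rate in bytes/s
    bytes_downloaded = 4 * 32768              # after four 32 KiB chunks
    time_elapsed = 0.05                       # seconds since the loop started
    time_expected = bytes_downloaded / rate   # 0.125 s at the target rate
    sleep_for = max(0.0, time_expected - time_elapsed)
    print(round(sleep_for, 3))                # 0.075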
def _find_extension(self, response):
"""Get filename extension from MIME type"""
@@ -308,11 +328,11 @@ class HttpDownloader(DownloaderBase):
@staticmethod
def _adjust_extension(pathfmt, file_header):
"""Check filename extension against file header"""
- sig = FILE_SIGNATURES[pathfmt.extension]
- if not file_header.startswith(sig):
- for ext, sig in FILE_SIGNATURES.items():
- if file_header.startswith(sig):
+ if not SIGNATURE_CHECKS[pathfmt.extension](file_header):
+ for ext, check in SIGNATURE_CHECKS.items():
+ if check(file_header):
pathfmt.set_extension(ext)
+ pathfmt.build_path()
return True
return False
@@ -326,6 +346,7 @@ MIME_TYPES = {
"image/x-bmp" : "bmp",
"image/x-ms-bmp": "bmp",
"image/webp" : "webp",
+ "image/avif" : "avif",
"image/svg+xml" : "svg",
"image/ico" : "ico",
"image/icon" : "ico",
@@ -362,27 +383,33 @@ MIME_TYPES = {
}
# https://en.wikipedia.org/wiki/List_of_file_signatures
-FILE_SIGNATURES = {
- "jpg" : b"\xFF\xD8\xFF",
- "png" : b"\x89PNG\r\n\x1A\n",
- "gif" : (b"GIF87a", b"GIF89a"),
- "bmp" : b"BM",
- "webp": b"RIFF",
- "svg" : b"<?xml",
- "ico" : b"\x00\x00\x01\x00",
- "cur" : b"\x00\x00\x02\x00",
- "psd" : b"8BPS",
- "webm": b"\x1A\x45\xDF\xA3",
- "ogg" : b"OggS",
- "wav" : b"RIFF",
- "mp3" : (b"\xFF\xFB", b"\xFF\xF3", b"\xFF\xF2", b"ID3"),
- "zip" : (b"PK\x03\x04", b"PK\x05\x06", b"PK\x07\x08"),
- "rar" : b"\x52\x61\x72\x21\x1A\x07",
- "7z" : b"\x37\x7A\xBC\xAF\x27\x1C",
- "pdf" : b"%PDF-",
- "swf" : (b"CWS", b"FWS"),
+SIGNATURE_CHECKS = {
+ "jpg" : lambda s: s[0:3] == b"\xFF\xD8\xFF",
+ "png" : lambda s: s[0:8] == b"\x89PNG\r\n\x1A\n",
+ "gif" : lambda s: s[0:6] in (b"GIF87a", b"GIF89a"),
+ "bmp" : lambda s: s[0:2] == b"BM",
+ "webp": lambda s: (s[0:4] == b"RIFF" and
+ s[8:12] == b"WEBP"),
+ "avif": lambda s: s[4:11] == b"ftypavi" and s[11] in b"fs",
+ "svg" : lambda s: s[0:5] == b"<?xml",
+ "ico" : lambda s: s[0:4] == b"\x00\x00\x01\x00",
+ "cur" : lambda s: s[0:4] == b"\x00\x00\x02\x00",
+ "psd" : lambda s: s[0:4] == b"8BPS",
+ "mp4" : lambda s: (s[4:8] == b"ftyp" and s[8:11] in (
+ b"mp4", b"avc", b"iso", b"M4V")),
+ "webm": lambda s: s[0:4] == b"\x1A\x45\xDF\xA3",
+ "ogg" : lambda s: s[0:4] == b"OggS",
+ "wav" : lambda s: (s[0:4] == b"RIFF" and
+ s[8:12] == b"WAVE"),
+ "mp3" : lambda s: (s[0:3] == b"ID3" or
+ s[0:2] in (b"\xFF\xFB", b"\xFF\xF3", b"\xFF\xF2")),
+ "zip" : lambda s: s[0:4] in (b"PK\x03\x04", b"PK\x05\x06", b"PK\x07\x08"),
+ "rar" : lambda s: s[0:6] == b"Rar!\x1A\x07",
+ "7z" : lambda s: s[0:6] == b"\x37\x7A\xBC\xAF\x27\x1C",
+ "pdf" : lambda s: s[0:5] == b"%PDF-",
+ "swf" : lambda s: s[0:3] in (b"CWS", b"FWS"),
# check 'bin' files against all other file signatures
- "bin" : b"\x00\x00\x00\x00\x00\x00\x00\x00",
+ "bin" : lambda s: False,
}
__downloader__ = HttpDownloader
diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py
index efa957b..c44ea0a 100644
--- a/gallery_dl/downloader/ytdl.py
+++ b/gallery_dl/downloader/ytdl.py
@@ -98,6 +98,7 @@ class YoutubeDLDownloader(DownloaderBase):
pathfmt.realdirectory + filename)
else:
pathfmt.set_extension(info_dict["ext"])
+ pathfmt.build_path()
if pathfmt.exists():
pathfmt.temppath = ""
@@ -118,6 +119,7 @@ class YoutubeDLDownloader(DownloaderBase):
def _download_playlist(self, ytdl_instance, pathfmt, info_dict):
pathfmt.set_extension("%(playlist_index)s.%(ext)s")
+ pathfmt.build_path()
self._set_outtmpl(ytdl_instance, pathfmt.realpath)
for entry in info_dict["entries"]:
diff --git a/gallery_dl/extractor/2chan.py b/gallery_dl/extractor/2chan.py
index f5125ee..92ea6ca 100644
--- a/gallery_dl/extractor/2chan.py
+++ b/gallery_dl/extractor/2chan.py
@@ -60,8 +60,8 @@ class _2chanThreadExtractor(Extractor):
def metadata(self, page):
"""Collect metadata for extractor-job"""
- title = text.extract(page, "<title>", "</title>")[0]
- title, _, boardname = title.rpartition(" - ")
+ title, _, boardname = text.extr(
+ page, "<title>", "</title>").rpartition(" - ")
return {
"server": self.server,
"title": title,
@@ -72,8 +72,8 @@ class _2chanThreadExtractor(Extractor):
def posts(self, page):
"""Build a list of all post-objects"""
- page = text.extract(
- page, '<div class="thre"', '<div style="clear:left"></div>')[0]
+ page = text.extr(
+ page, '<div class="thre"', '<div style="clear:left"></div>')
return [
self.parse(post)
for post in page.split('<table border=0>')
@@ -84,7 +84,7 @@ class _2chanThreadExtractor(Extractor):
data = self._extract_post(post)
if data["name"]:
data["name"] = data["name"].strip()
- path = text.extract(post, '<a href="/', '"')[0]
+ path = text.extr(post, '<a href="/', '"')
if path and not path.startswith("bin/jump"):
self._extract_image(post, data)
data["tim"], _, data["extension"] = data["filename"].partition(".")
diff --git a/gallery_dl/extractor/35photo.py b/gallery_dl/extractor/35photo.py
index 6a40d41..28acc3d 100644
--- a/gallery_dl/extractor/35photo.py
+++ b/gallery_dl/extractor/35photo.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2021 Mike Fährmann
+# Copyright 2019-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -124,7 +124,7 @@ class _35photoUserExtractor(_35photoExtractor):
def metadata(self):
url = "{}/{}/".format(self.root, self.user)
page = self.request(url).text
- self.user_id = text.parse_int(text.extract(page, "/user_", ".xml")[0])
+ self.user_id = text.parse_int(text.extr(page, "/user_", ".xml"))
return {
"user": self.user,
"user_id": self.user_id,
@@ -189,10 +189,10 @@ class _35photoGenreExtractor(_35photoExtractor):
def metadata(self):
url = "{}/genre_{}{}".format(self.root, self.genre_id, self.new or "/")
page = self.request(url).text
- self.photo_ids = self._photo_ids(text.extract(
- page, ' class="photo', '\n')[0])
+ self.photo_ids = self._photo_ids(text.extr(
+ page, ' class="photo', '\n'))
return {
- "genre": text.extract(page, " genre - ", ". ")[0],
+ "genre": text.extr(page, " genre - ", ". "),
"genre_id": text.parse_int(self.genre_id),
}
diff --git a/gallery_dl/extractor/8muses.py b/gallery_dl/extractor/8muses.py
index fe57412..fed4991 100644
--- a/gallery_dl/extractor/8muses.py
+++ b/gallery_dl/extractor/8muses.py
@@ -76,9 +76,9 @@ class _8musesAlbumExtractor(Extractor):
url = self.root + self.path + self.params
while True:
- data = self._unobfuscate(text.extract(
+ data = self._unobfuscate(text.extr(
self.request(url).text,
- 'id="ractive-public" type="text/plain">', '</script>')[0])
+ 'id="ractive-public" type="text/plain">', '</script>'))
images = data.get("pictures")
if images:
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 9e0340a..a563bfd 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -95,9 +95,9 @@ modules = [
"naver",
"naverwebtoon",
"newgrounds",
- "ngomik",
"nhentai",
"nijie",
+ "nitter",
"nozomi",
"nsfwalbum",
"paheal",
@@ -141,6 +141,7 @@ modules = [
"twibooru",
"twitter",
"unsplash",
+ "uploadir",
"vanillarock",
"vichan",
"vk",
diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py
index 14d1e6b..da2d8f2 100644
--- a/gallery_dl/extractor/artstation.py
+++ b/gallery_dl/extractor/artstation.py
@@ -41,8 +41,8 @@ class ArtstationExtractor(Extractor):
if adict["has_embedded_player"] and self.external:
player = adict["player_embedded"]
- url = text.extract(player, 'src="', '"')[0] or \
- text.extract(player, "src='", "'")[0]
+ url = (text.extr(player, 'src="', '"') or
+ text.extr(player, "src='", "'"))
if url and not url.startswith(self.root):
asset["extension"] = None
yield Message.Url, "ytdl:" + url, asset
diff --git a/gallery_dl/extractor/aryion.py b/gallery_dl/extractor/aryion.py
index fa590b9..6f01572 100644
--- a/gallery_dl/extractor/aryion.py
+++ b/gallery_dl/extractor/aryion.py
@@ -128,8 +128,7 @@ class AryionExtractor(Extractor):
# get filename from 'Content-Disposition' header
cdis = headers["content-disposition"]
- fname, _, ext = text.extract(
- cdis, 'filename="', '"')[0].rpartition(".")
+ fname, _, ext = text.extr(cdis, 'filename="', '"').rpartition(".")
if not fname:
fname, ext = ext, fname
diff --git a/gallery_dl/extractor/bbc.py b/gallery_dl/extractor/bbc.py
index 17b5f52..1b49d6a 100644
--- a/gallery_dl/extractor/bbc.py
+++ b/gallery_dl/extractor/bbc.py
@@ -38,8 +38,8 @@ class BbcGalleryExtractor(GalleryExtractor):
)
def metadata(self, page):
- data = json.loads(text.extract(
- page, '<script type="application/ld+json">', '</script>')[0])
+ data = json.loads(text.extr(
+ page, '<script type="application/ld+json">', '</script>'))
return {
"programme": self.gallery_url.split("/")[4],
"path": list(util.unique_sequence(
diff --git a/gallery_dl/extractor/bcy.py b/gallery_dl/extractor/bcy.py
index 7982881..4eb446d 100644
--- a/gallery_dl/extractor/bcy.py
+++ b/gallery_dl/extractor/bcy.py
@@ -97,7 +97,7 @@ class BcyExtractor(Extractor):
url = "{}/item/detail/{}".format(self.root, post_id)
page = self.request(url, notfound="post").text
return json.loads(
- text.extract(page, 'JSON.parse("', '");')[0]
+ text.extr(page, 'JSON.parse("', '");')
.replace('\\\\u002F', '/')
.replace('\\"', '"')
)["detail"]
diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py
index 994a701..cf332ac 100644
--- a/gallery_dl/extractor/behance.py
+++ b/gallery_dl/extractor/behance.py
@@ -119,8 +119,8 @@ class BehanceGalleryExtractor(BehanceExtractor):
}
page = self.request(url, cookies=cookies).text
- data = json.loads(text.extract(
- page, 'id="beconfig-store_state">', '</script>')[0])
+ data = json.loads(text.extr(
+ page, 'id="beconfig-store_state">', '</script>'))
return self._update(data["project"]["project"])
def get_images(self, data):
@@ -137,7 +137,7 @@ class BehanceGalleryExtractor(BehanceExtractor):
elif mtype == "video":
page = self.request(module["src"]).text
- url = text.extract(page, '<source src="', '"')[0]
+ url = text.extr(page, '<source src="', '"')
if text.ext_from_url(url) == "m3u8":
url = "ytdl:" + url
append((url, module))
@@ -150,8 +150,7 @@ class BehanceGalleryExtractor(BehanceExtractor):
elif mtype == "embed":
embed = module.get("original_embed") or module.get("embed")
if embed:
- url = "ytdl:" + text.extract(embed, 'src="', '"')[0]
- append((url, module))
+ append(("ytdl:" + text.extr(embed, 'src="', '"'), module))
return result
diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py
index 232f3ea..8a1a42e 100644
--- a/gallery_dl/extractor/blogger.py
+++ b/gallery_dl/extractor/blogger.py
@@ -61,8 +61,8 @@ class BloggerExtractor(Extractor):
page = self.request(post["url"]).text
for url in findall_video(page):
page = self.request(url).text
- video_config = json.loads(text.extract(
- page, 'var VIDEO_CONFIG =', '\n')[0])
+ video_config = json.loads(text.extr(
+ page, 'var VIDEO_CONFIG =', '\n'))
files.append(max(
video_config["streams"],
key=lambda x: x["format_id"],
diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py
index 12d98b1..0d7d13d 100644
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@@ -25,6 +25,7 @@ class BooruExtractor(BaseExtractor):
data = self.metadata()
tags = self.config("tags", False)
notes = self.config("notes", False)
+ fetch_html = tags or notes
for post in self.posts():
try:
@@ -36,11 +37,13 @@ class BooruExtractor(BaseExtractor):
"(md5: %s)", post.get("id"), post.get("md5"))
continue
- page_html = None
- if tags:
- page_html = self._extended_tags(post)
- if notes:
- self._notes(post, page_html)
+ if fetch_html:
+ html = self._html(post)
+ if tags:
+ self._tags(post, html)
+ if notes:
+ self._notes(post, html)
+
text.nameext_from_url(url, post)
post.update(data)
self._prepare(post)
@@ -67,16 +70,13 @@ class BooruExtractor(BaseExtractor):
_file_url = operator.itemgetter("file_url")
def _prepare(self, post):
- """Prepare the 'post's metadata"""
+ """Prepare a 'post's metadata"""
- def _extended_tags(self, post, page=None):
- """Generate extended tag information
+ def _html(self, post):
+ """Return HTML content of a post"""
- The return value of this function will be
- passed to the _notes function as the page parameter.
- This makes it possible to reuse the same HTML both for
- extracting tags and notes.
- """
+ def _tags(self, post, page):
+ """Extract extended tag metadata"""
- def _notes(self, post, page=None):
- """Generate information about notes"""
+ def _notes(self, post, page):
+ """Extract notes metadata"""
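A hedged sketch of how a site module might implement the renamed hooks; `ExampleExtractor`, its URL layout, and the markers passed to `text.extr()` are hypothetical:

    from gallery_dl import text
    from gallery_dl.extractor.booru import BooruExtractor

    class ExampleExtractor(BooruExtractor):

        def _html(self, post):
            # fetched once, then reused for both _tags() and _notes()
            url = "{}/post/{}".format(self.root, post["id"])
            return self.request(url).text

        def _tags(self, post, page):
            post["tags_artist"] = text.extr(page, 'class="artist">', "</")

        def _notes(self, post, page):
            post["notes"] = list(text.extract_iter(page, "<note>", "</note>"))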
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 2502411..dde9cf8 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -68,9 +68,9 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
url = self.root + "/a/" + self.album_id
try:
- data = json.loads(text.extract(
+ data = json.loads(text.extr(
self.request(url).text,
- 'id="__NEXT_DATA__" type="application/json">', '<')[0])
+ 'id="__NEXT_DATA__" type="application/json">', '<'))
album = data["props"]["pageProps"]["album"]
files = album["files"]
except Exception as exc:
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index e304717..4352aa7 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -20,7 +20,7 @@ import requests
import threading
from requests.adapters import HTTPAdapter
from .message import Message
-from .. import config, text, util, exception
+from .. import config, text, util, cache, exception
class Extractor():
@@ -149,14 +149,13 @@ class Extractor():
msg = "'{} {}' for '{}'".format(code, response.reason, url)
server = response.headers.get("Server")
- if server and server.startswith("cloudflare"):
- if code == 503 and \
- (b"_cf_chl_opt" in response.content or
- b"jschl-answer" in response.content):
+ if server and server.startswith("cloudflare") and \
+ code in (403, 503):
+ content = response.content
+ if b"_cf_chl_opt" in content or b"jschl-answer" in content:
self.log.warning("Cloudflare IUAM challenge")
break
- if code == 403 and \
- b'name="captcha-bypass"' in response.content:
+ if b'name="captcha-bypass"' in content:
self.log.warning("Cloudflare CAPTCHA")
break
if code < 500 and code != 429 and code != 430:
@@ -263,9 +262,13 @@ class Extractor():
ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1)
ssl_ciphers = SSL_CIPHERS[browser]
else:
- headers["User-Agent"] = self.config("user-agent", (
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64; "
- "rv:102.0) Gecko/20100101 Firefox/102.0"))
+ useragent = self.config("user-agent")
+ if useragent is None:
+ useragent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; "
+ "rv:102.0) Gecko/20100101 Firefox/102.0")
+ elif useragent == "browser":
+ useragent = _browser_useragent()
+ headers["User-Agent"] = useragent
headers["Accept"] = "*/*"
headers["Accept-Language"] = "en-US,en;q=0.5"
@@ -725,6 +728,36 @@ def _build_requests_adapter(ssl_options, ssl_ciphers, source_address):
return adapter
+@cache.cache(maxage=86400)
+def _browser_useragent():
+ """Get User-Agent header from default browser"""
+ import webbrowser
+ import socket
+
+ server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+ server.bind(("127.0.0.1", 6414))
+ server.listen(1)
+
+ webbrowser.open("http://127.0.0.1:6414/user-agent")
+
+ client = server.accept()[0]
+ server.close()
+
+ for line in client.recv(1024).split(b"\r\n"):
+ key, _, value = line.partition(b":")
+ if key.strip().lower() == b"user-agent":
+ useragent = value.strip()
+ break
+ else:
+ useragent = b""
+
+ client.send(b"HTTP/1.1 200 OK\r\n\r\n" + useragent)
+ client.close()
+
+ return useragent.decode()
+
+
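A hedged usage sketch: calling the helper opens the system's default browser against 127.0.0.1:6414 once and returns that browser's User-Agent string; the `@cache.cache(maxage=86400)` decorator above keeps the result for a day, so the browser is not reopened on every run. The fixed port is assumed to be free:

    from gallery_dl.extractor import common

    ua = common._browser_useragent()   # opens the default browser once
    print(ua)   # e.g. "Mozilla/5.0 (X11; Linux x86_64; rv:102.0) ..."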
_adapter_cache = {}
_browser_cookies = {}
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index cb2aa24..45beddf 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -603,22 +603,22 @@ class DeviantartStashExtractor(DeviantartExtractor):
page = self._limited_request(url).text
if stash_id[0] == "0":
- uuid = text.extract(page, '//deviation/', '"')[0]
+ uuid = text.extr(page, '//deviation/', '"')
if uuid:
deviation = self.api.deviation(uuid)
- deviation["index"] = text.parse_int(text.extract(
- page, 'gmi-deviationid="', '"')[0])
+ deviation["index"] = text.parse_int(text.extr(
+ page, 'gmi-deviationid="', '"'))
yield deviation
return
for item in text.extract_iter(
page, 'class="stash-thumb-container', '</div>'):
- url = text.extract(item, '<a href="', '"')[0]
+ url = text.extr(item, '<a href="', '"')
if url:
stash_id = url.rpartition("/")[2]
else:
- stash_id = text.extract(item, 'gmi-stashid="', '"')[0]
+ stash_id = text.extr(item, 'gmi-stashid="', '"')
stash_id = "2" + util.bencode(text.parse_int(
stash_id), "0123456789abcdefghijklmnopqrstuvwxyz")
@@ -960,9 +960,15 @@ class DeviantartScrapsExtractor(DeviantartExtractor):
)
cookiedomain = ".deviantart.com"
cookienames = ("auth", "auth_secure", "userinfo")
+ _warning = True
def deviations(self):
eclipse_api = DeviantartEclipseAPI(self)
+ if self._warning:
+ DeviantartScrapsExtractor._warning = False
+ if not self._check_cookies(self.cookienames):
+ self.log.warning(
+ "No session cookies set: Unable to fetch mature scraps.")
for obj in eclipse_api.gallery_scraps(self.user, self.offset):
deviation = obj["deviation"]
@@ -1478,8 +1484,8 @@ class DeviantartEclipseAPI():
def _fetch_csrf_token(self, page=None):
if page is None:
page = self.request(self.extractor.root + "/").text
- self.csrf_token = token = text.extract(
- page, "window.__CSRF_TOKEN__ = '", "'")[0]
+ self.csrf_token = token = text.extr(
+ page, "window.__CSRF_TOKEN__ = '", "'")
return token
diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py
index e5c5c01..d78f25b 100644
--- a/gallery_dl/extractor/dynastyscans.py
+++ b/gallery_dl/extractor/dynastyscans.py
@@ -30,7 +30,7 @@ class DynastyscansBase():
src = extr("class='btn-group'>", "</div>")
url = extr(' src="', '"')
- src = text.extract(src, 'href="', '"')[0] if "Source<" in src else ""
+ src = text.extr(src, 'href="', '"') if "Source<" in src else ""
return {
"url" : self.root + url,
@@ -75,7 +75,7 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
"title" : text.unescape(match.group(4) or ""),
"author" : text.remove_html(author),
"group" : (text.remove_html(group) or
- text.extract(group, ' alt="', '"')[0] or ""),
+ text.extr(group, ' alt="', '"')),
"date" : text.parse_datetime(extr(
'"icon-calendar"></i> ', '<'), "%b %d, %Y"),
"lang" : "en",
@@ -83,7 +83,7 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
}
def images(self, page):
- data = text.extract(page, "var pages = ", ";\n")[0]
+ data = text.extr(page, "var pages = ", ";\n")
return [
(self.root + img["image"], None)
for img in json.loads(data)
diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py
index 992db97..b4dadc7 100644
--- a/gallery_dl/extractor/erome.py
+++ b/gallery_dl/extractor/erome.py
@@ -55,8 +55,8 @@ class EromeExtractor(Extractor):
yield Message.Directory, data
groups = page.split('<div class="media-group"')
for data["num"], group in enumerate(util.advance(groups, 1), 1):
- url = (text.extract(group, '<source src="', '"')[0] or
- text.extract(group, 'data-src="', '"')[0])
+ url = (text.extr(group, '<source src="', '"') or
+ text.extr(group, 'data-src="', '"'))
if url:
yield Message.Url, url, text.nameext_from_url(url, data)
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index e37e81b..a546f68 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -185,7 +185,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
if self.gallery_token:
gpage = self._gallery_page()
- self.image_token = text.extract(gpage, 'hentai.org/s/', '"')[0]
+ self.image_token = text.extr(gpage, 'hentai.org/s/', '"')
if not self.image_token:
self.log.error("Failed to extract initial image token")
self.log.debug("Page content:\n%s", gpage)
@@ -193,7 +193,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
ipage = self._image_page()
else:
ipage = self._image_page()
- part = text.extract(ipage, 'hentai.org/g/', '"')[0]
+ part = text.extr(ipage, 'hentai.org/g/', '"')
if not part:
self.log.error("Failed to extract gallery token")
self.log.debug("Page content:\n%s", ipage)
@@ -271,8 +271,8 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
}
if data["uploader"].startswith("<"):
- data["uploader"] = text.unescape(text.extract(
- data["uploader"], ">", "<")[0])
+ data["uploader"] = text.unescape(text.extr(
+ data["uploader"], ">", "<"))
f = data["favorites"][0]
if f == "N":
@@ -400,7 +400,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
}
page = self.request(url, cookies=cookies).text
- current = text.extract(page, "<strong>", "</strong>")[0]
+ current = text.extr(page, "<strong>", "</strong>")
self.log.debug("Image Limits: %s/%s", current, self.limits)
self._remaining = self.limits - text.parse_int(current)
@@ -473,6 +473,10 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):
"pattern": ExhentaiGalleryExtractor.pattern,
"range": "1-30",
"count": 30,
+ "keyword": {
+ "gallery_id": int,
+ "gallery_token": r"re:^[0-9a-f]{10}$"
+ },
}),
)
@@ -490,26 +494,39 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):
self.params = {"f_search": tag, "page": 0}
else:
self.params = text.parse_query(query)
- self.params["page"] = text.parse_int(self.params.get("page"))
+ if "next" not in self.params:
+ self.params["page"] = text.parse_int(self.params.get("page"))
def items(self):
self.login()
data = {"_extractor": ExhentaiGalleryExtractor}
+ search_url = self.search_url
+ params = self.params
while True:
last = None
- page = self.request(self.search_url, params=self.params).text
+ page = self.request(search_url, params=params).text
for gallery in ExhentaiGalleryExtractor.pattern.finditer(page):
url = gallery.group(0)
if url == last:
continue
last = url
+ data["gallery_id"] = text.parse_int(gallery.group(2))
+ data["gallery_token"] = gallery.group(3)
yield Message.Queue, url + "/", data
- if 'class="ptdd">&gt;<' in page or ">No hits found</p>" in page:
+ next_url = text.extr(page, 'nexturl = "', '"', None)
+ if next_url is not None:
+ if not next_url:
+ return
+ search_url = next_url
+ params = None
+
+ elif 'class="ptdd">&gt;<' in page or ">No hits found</p>" in page:
return
- self.params["page"] += 1
+ else:
+ params["page"] += 1
class ExhentaiFavoriteExtractor(ExhentaiSearchExtractor):
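
The rewritten ExhentaiSearchExtractor.items() supports both the old numeric ?page= interface and the new nexturl-based one. A condensed sketch of just the branching, using a hypothetical helper name next_request() and the markers shown in the hunk above:

    from gallery_dl import text

    def next_request(page, search_url, params):
        """Return (url, params) for the next results page, or None when done."""
        next_url = text.extr(page, 'nexturl = "', '"', None)
        if next_url is not None:          # new interface: embedded URL
            if not next_url:              # empty value marks the last page
                return None
            return next_url, None         # follow it; drop the query params
        if 'class="ptdd">&gt;<' in page or ">No hits found</p>" in page:
            return None                   # old interface: last page reached
        params["page"] += 1               # old interface: bump page counter
        return search_url, params
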
diff --git a/gallery_dl/extractor/fallenangels.py b/gallery_dl/extractor/fallenangels.py
index ab0e0c5..57587b6 100644
--- a/gallery_dl/extractor/fallenangels.py
+++ b/gallery_dl/extractor/fallenangels.py
@@ -57,7 +57,7 @@ class FallenangelsChapterExtractor(ChapterExtractor):
return [
(img["page_image"], None)
for img in json.loads(
- text.extract(page, "var pages = ", ";")[0]
+ text.extr(page, "var pages = ", ";")
)
]
diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py
index 5e6da5b..4f9a6bf 100644
--- a/gallery_dl/extractor/foolfuuka.py
+++ b/gallery_dl/extractor/foolfuuka.py
@@ -56,7 +56,7 @@ class FoolfuukaExtractor(BaseExtractor):
"""Resolve a remote media link"""
needle = '<meta http-equiv="Refresh" content="0; url='
page = self.request(media["remote_media_link"]).text
- return text.extract(page, needle, '"')[0]
+ return text.extr(page, needle, '"')
@staticmethod
def _remote_direct(media):
diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py
index 382cc25..81671ec 100644
--- a/gallery_dl/extractor/foolslide.py
+++ b/gallery_dl/extractor/foolslide.py
@@ -114,7 +114,7 @@ class FoolslideChapterExtractor(FoolslideExtractor):
})
def images(self, page):
- return json.loads(text.extract(page, "var pages = ", ";")[0])
+ return json.loads(text.extr(page, "var pages = ", ";"))
class FoolslideMangaExtractor(FoolslideExtractor):
diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py
index b63cfc1..cc43cec 100644
--- a/gallery_dl/extractor/furaffinity.py
+++ b/gallery_dl/extractor/furaffinity.py
@@ -160,7 +160,7 @@ class FuraffinityExtractor(Extractor):
while path:
page = self.request(self.root + path).text
yield from text.extract_iter(page, 'id="sid-', '"')
- path = text.extract(page, 'right" href="', '"')[0]
+ path = text.extr(page, 'right" href="', '"')
def _pagination_search(self, query):
url = self.root + "/search/"
diff --git a/gallery_dl/extractor/fuskator.py b/gallery_dl/extractor/fuskator.py
index df55061..d6640f5 100644
--- a/gallery_dl/extractor/fuskator.py
+++ b/gallery_dl/extractor/fuskator.py
@@ -58,7 +58,7 @@ class FuskatorGalleryExtractor(GalleryExtractor):
self.root + "/ajax/gal.aspx", params=params, headers=headers,
).json()
- title = text.extract(page, "<title>", "</title>")[0].strip()
+ title = text.extr(page, "<title>", "</title>").strip()
title, _, gallery_id = title.rpartition("#")
return {
@@ -104,7 +104,7 @@ class FuskatorSearchExtractor(Extractor):
page, 'class="pic_pad"><a href="', '"'):
yield Message.Queue, self.root + path, data
- pages = text.extract(page, 'class="pages"><span>', '>&gt;&gt;<')[0]
+ pages = text.extr(page, 'class="pages"><span>', '>&gt;&gt;<')
if not pages:
return
url = self.root + text.rextract(pages, 'href="', '"')[0]
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index a2cf0c0..d8109e1 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -68,6 +68,22 @@ class GelbooruBase():
yield "https://img2.gelbooru.com" + path
yield "https://img1.gelbooru.com" + path
+ def _notes(self, post, page):
+ notes_data = text.extr(page, '<section id="notes"', '</section>')
+ if not notes_data:
+ return
+
+ post["notes"] = notes = []
+ extr = text.extract
+ for note in text.extract_iter(notes_data, '<article', '</article>'):
+ notes.append({
+ "width" : int(extr(note, 'data-width="', '"')[0]),
+ "height": int(extr(note, 'data-height="', '"')[0]),
+ "x" : int(extr(note, 'data-x="', '"')[0]),
+ "y" : int(extr(note, 'data-y="', '"')[0]),
+ "body" : extr(note, 'data-body="', '"')[0],
+ })
+
class GelbooruTagExtractor(GelbooruBase,
gelbooru_v02.GelbooruV02TagExtractor):
@@ -182,21 +198,21 @@ class GelbooruPostExtractor(GelbooruBase,
"keywords": {
"notes": [
{
- "height": 553,
"body": "Look over this way when you talk~",
+ "height": 553,
"width": 246,
"x": 35,
- "y": 72
+ "y": 72,
},
{
- "height": 557,
"body": "Hey~\nAre you listening~?",
+ "height": 557,
"width": 246,
"x": 1233,
- "y": 109
- }
- ]
- }
+ "y": 109,
+ },
+ ],
+ },
}),
)
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py
index 8214614..da87b8f 100644
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -31,6 +31,7 @@ class GelbooruV02Extractor(booru.BooruExtractor):
if self.category == "realbooru":
self._file_url = self._file_url_realbooru
+ self._tags = self._tags_realbooru
def _api_request(self, params):
url = self.api_root + "/index.php?page=dapi&s=post&q=index"
@@ -85,55 +86,58 @@ class GelbooruV02Extractor(booru.BooruExtractor):
post["date"] = text.parse_datetime(
post["created_at"], "%a %b %d %H:%M:%S %z %Y")
+ def _html(self, post):
+ return self.request("{}/index.php?page=post&s=view&id={}".format(
+ self.root, post["id"])).text
+
+ def _tags(self, post, page):
+ tag_container = (text.extr(page, '<ul id="tag-', '</ul>') or
+ text.extr(page, '<ul class="tag-', '</ul>'))
+ if not tag_container:
+ return
+
+ tags = collections.defaultdict(list)
+ pattern = re.compile(
+ r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)", re.S)
+ for tag_type, tag_name in pattern.findall(tag_container):
+ tags[tag_type].append(text.unquote(tag_name))
+ for key, value in tags.items():
+ post["tags_" + key] = " ".join(value)
+
+ def _notes(self, post, page):
+ note_container = text.extr(page, 'id="note-container"', "<img ")
+ if not note_container:
+ return
+
+ post["notes"] = notes = []
+ for note in note_container.split('class="note-box"')[1:]:
+ extr = text.extract_from(note)
+ notes.append({
+ "width" : int(extr("width:", "p")),
+ "height": int(extr("height:", "p")),
+ "y" : int(extr("top:", "p")),
+ "x" : int(extr("left:", "p")),
+ "id" : int(extr('id="note-body-', '"')),
+ "body" : text.unescape(text.remove_html(extr(">", "</div>"))),
+ })
+
def _file_url_realbooru(self, post):
url = post["file_url"]
- if url.count("/") == 5:
- md5 = post["md5"]
+ md5 = post["md5"]
+ if md5 not in post["preview_url"] or url.count("/") == 5:
url = "{}/images/{}/{}/{}.{}".format(
self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2])
return url
- def _extended_tags(self, post, page=None):
- if not page:
- url = "{}/index.php?page=post&s=view&id={}".format(
- self.root, post["id"])
- page = self.request(url).text
- html = text.extract(page, '<ul id="tag-', '</ul>')[0]
- if not html:
- html = text.extract(page, '<ul class="tag-', '</ul>')[0]
- if html:
- tags = collections.defaultdict(list)
- pattern = re.compile(
- r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)", re.S)
- for tag_type, tag_name in pattern.findall(html):
- tags[tag_type].append(text.unquote(tag_name))
- for key, value in tags.items():
- post["tags_" + key] = " ".join(value)
- return page
-
- def _notes(self, post, page=None):
- if not page:
- url = "{}/index.php?page=post&s=view&id={}".format(
- self.root, post["id"])
- page = self.request(url).text
- notes = []
- notes_data = text.extract(page, '<section id="notes"', '</section>')[0]
- if not notes_data:
- return
-
- note_iter = text.extract_iter(notes_data, '<article', '</article>')
- extr = text.extract
- for note_data in note_iter:
- note = {
- "width": int(extr(note_data, 'data-width="', '"')[0]),
- "height": int(extr(note_data, 'data-height="', '"')[0]),
- "x": int(extr(note_data, 'data-x="', '"')[0]),
- "y": int(extr(note_data, 'data-y="', '"')[0]),
- "body": extr(note_data, 'data-body="', '"')[0],
- }
- notes.append(note)
-
- post["notes"] = notes
+ def _tags_realbooru(self, post, page):
+ tag_container = text.extr(page, 'id="tagLink"', '</div>')
+ tags = collections.defaultdict(list)
+ pattern = re.compile(
+ r'<a class="(?:tag-type-)?([^"]+).*?;tags=([^"&]+)')
+ for tag_type, tag_name in pattern.findall(tag_container):
+ tags[tag_type].append(text.unquote(tag_name))
+ for key, value in tags.items():
+ post["tags_" + key] = " ".join(value)
INSTANCES = {
@@ -310,15 +314,81 @@ class GelbooruV02PostExtractor(GelbooruV02Extractor):
archive_fmt = "{id}"
pattern = BASE_PATTERN + r"/index\.php\?page=post&s=view&id=(\d+)"
test = (
- ("https://rule34.xxx/index.php?page=post&s=view&id=1995545", {
- "content": "97e4bbf86c3860be18de384d02d544251afe1d45",
- "options": (("tags", True),),
+ ("https://rule34.xxx/index.php?page=post&s=view&id=863", {
+ "pattern": r"https://api-cdn\.rule34\.xxx/images"
+ r"/1/6aafbdb3e22f3f3b412ea2cf53321317a37063f3\.jpg",
+ "content": ("a43f418aa350039af0d11cae501396a33bbe2201",
+ "67b516295950867e1c1ab6bc13b35d3b762ed2a3"),
+ "options": (("tags", True), ("notes", True)),
"keyword": {
- "tags_artist": "danraku",
- "tags_character": "kashima_(kantai_collection)",
- "tags_copyright": "kantai_collection",
+ "tags_artist": "reverse_noise yamu_(reverse_noise)",
+ "tags_character": "hong_meiling",
+ "tags_copyright": "touhou",
"tags_general": str,
- "tags_metadata": str,
+ "tags_metadata": "censored translated",
+ "notes": [
+ {
+ "body": "It feels angry, I'm losing myself... "
+ "It won't calm down!",
+ "height": 65,
+ "id": 93586,
+ "width": 116,
+ "x": 22,
+ "y": 333,
+ },
+ {
+ "body": "REPUTATION OF RAGE",
+ "height": 272,
+ "id": 93587,
+ "width": 199,
+ "x": 78,
+ "y": 442,
+ },
+ ],
+
+ },
+ }),
+ ("https://hypnohub.net/index.php?page=post&s=view&id=1439", {
+ "pattern": r"https://hypnohub\.net/images"
+ r"/90/24/90245c3c5250c2a8173255d3923a010b\.jpg",
+ "content": "5987c5d2354f22e5fa9b7ee7ce4a6f7beb8b2b71",
+ "options": (("tags", True), ("notes", True)),
+ "keyword": {
+ "tags_artist": "brokenteapot",
+ "tags_character": "hsien-ko",
+ "tags_copyright": "capcom darkstalkers",
+ "tags_general": str,
+ "tags_metadata": "dialogue text translated",
+ "notes": [
+ {
+ "body": "Master Master Master "
+ "Master Master Master",
+ "height": 83,
+ "id": 10577,
+ "width": 129,
+ "x": 259,
+ "y": 20,
+ },
+ {
+ "body": "Response Response Response "
+ "Response Response Response",
+ "height": 86,
+ "id": 10578,
+ "width": 125,
+ "x": 126,
+ "y": 20,
+ },
+ {
+ "body": "Obedience Obedience Obedience "
+ "Obedience Obedience Obedience",
+ "height": 80,
+ "id": 10579,
+ "width": 98,
+ "x": 20,
+ "y": 20,
+ },
+ ],
+
},
}),
("https://safebooru.org/index.php?page=post&s=view&id=1169132", {
@@ -336,16 +406,18 @@ class GelbooruV02PostExtractor(GelbooruV02Extractor):
"pattern": r"https://realbooru\.com/images/dc/b5"
r"/dcb5c0ce9ec0bf74a6930608985f4719\.jpeg",
"content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_general": "1girl blonde blonde_hair blue_eyes cute "
+ "female female_only looking_at_viewer smile "
+ "solo solo_female teeth",
+ "tags_model": "jennifer_lawrence",
+ },
}),
("https://tbib.org/index.php?page=post&s=view&id=9233957", {
"url": "5a6ebe07bfff8e6d27f7c30b5480f27abcb577d2",
"content": "1c3831b6fbaa4686e3c79035b5d98460b1c85c43",
}),
- ("https://hypnohub.net/index.php?page=post&s=view&id=73964", {
- "pattern": r"https://hypnohub\.net/images/7a/37"
- r"/7a37c0ba372f35767fb10c904a398831\.png",
- "content": "02d5f5a8396b621a6efc04c5f8ef1b7225dfc6ee",
- }),
)
def __init__(self, match):
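
A worked example of the positional note parsing that gelbooru_v02 (and moebooru, below) now share; the HTML fragment is a trimmed, hypothetical note-box, but the extraction steps match _notes() above:

    from gallery_dl import text

    note = ('style="width:116px; height:65px; top:333px; left:22px;">'
            '<div id="note-body-93586" class="note-body">Look over here~</div>')

    extr = text.extract_from(note)
    parsed = {
        "width" : int(extr("width:", "p")),   # "116" from "width:116px"
        "height": int(extr("height:", "p")),
        "y"     : int(extr("top:", "p")),
        "x"     : int(extr("left:", "p")),
        "id"    : int(extr('id="note-body-', '"')),
        "body"  : text.unescape(text.remove_html(extr(">", "</div>"))),
    }
    assert parsed == {"width": 116, "height": 65, "y": 333, "x": 22,
                      "id": 93586, "body": "Look over here~"}
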
diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py
index 69c07d0..10c7295 100644
--- a/gallery_dl/extractor/generic.py
+++ b/gallery_dl/extractor/generic.py
@@ -87,25 +87,25 @@ class GenericExtractor(Extractor):
"""Extract generic webpage metadata, return them in a dict."""
data = {}
data['pageurl'] = self.url
- data['title'] = text.extract(page, '<title>', "</title>")[0] or ""
- data['description'] = text.extract(
- page, '<meta name="description" content="', '"')[0] or ""
- data['keywords'] = text.extract(
- page, '<meta name="keywords" content="', '"')[0] or ""
- data['language'] = text.extract(
- page, '<meta name="language" content="', '"')[0] or ""
- data['name'] = text.extract(
- page, '<meta itemprop="name" content="', '"')[0] or ""
- data['copyright'] = text.extract(
- page, '<meta name="copyright" content="', '"')[0] or ""
- data['og_site'] = text.extract(
- page, '<meta property="og:site" content="', '"')[0] or ""
- data['og_site_name'] = text.extract(
- page, '<meta property="og:site_name" content="', '"')[0] or ""
- data['og_title'] = text.extract(
- page, '<meta property="og:title" content="', '"')[0] or ""
- data['og_description'] = text.extract(
- page, '<meta property="og:description" content="', '"')[0] or ""
+ data['title'] = text.extr(page, '<title>', "</title>")
+ data['description'] = text.extr(
+ page, '<meta name="description" content="', '"')
+ data['keywords'] = text.extr(
+ page, '<meta name="keywords" content="', '"')
+ data['language'] = text.extr(
+ page, '<meta name="language" content="', '"')
+ data['name'] = text.extr(
+ page, '<meta itemprop="name" content="', '"')
+ data['copyright'] = text.extr(
+ page, '<meta name="copyright" content="', '"')
+ data['og_site'] = text.extr(
+ page, '<meta property="og:site" content="', '"')
+ data['og_site_name'] = text.extr(
+ page, '<meta property="og:site_name" content="', '"')
+ data['og_title'] = text.extr(
+ page, '<meta property="og:title" content="', '"')
+ data['og_description'] = text.extr(
+ page, '<meta property="og:description" content="', '"')
data = {k: text.unescape(data[k]) for k in data if data[k] != ""}
diff --git a/gallery_dl/extractor/hentaicosplays.py b/gallery_dl/extractor/hentaicosplays.py
index b4f433b..593a846 100644
--- a/gallery_dl/extractor/hentaicosplays.py
+++ b/gallery_dl/extractor/hentaicosplays.py
@@ -60,7 +60,7 @@ class HentaicosplaysGalleryExtractor(GalleryExtractor):
self.session.headers["Referer"] = url
def metadata(self, page):
- title = text.extract(page, "<title>", "</title>")[0]
+ title = text.extr(page, "<title>", "</title>")
return {
"title": text.unescape(title.rpartition(" Story Viewer - ")[0]),
"slug" : self.slug,
diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py
index 0741451..2dfc721 100644
--- a/gallery_dl/extractor/hentaifoundry.py
+++ b/gallery_dl/extractor/hentaifoundry.py
@@ -156,8 +156,8 @@ class HentaifoundryExtractor(Extractor):
"filter_media" : "A",
"filter_order" : "date_new",
"filter_type" : "0",
- "YII_CSRF_TOKEN" : text.unquote(text.extract(
- csrf_token, "%22", "%22")[0]),
+ "YII_CSRF_TOKEN" : text.unquote(text.extr(
+ csrf_token, "%22", "%22")),
}
self.request(url, method="POST", data=data)
diff --git a/gallery_dl/extractor/hentaihere.py b/gallery_dl/extractor/hentaihere.py
index c3e6d76..38ec77c 100644
--- a/gallery_dl/extractor/hentaihere.py
+++ b/gallery_dl/extractor/hentaihere.py
@@ -30,19 +30,24 @@ class HentaihereChapterExtractor(HentaihereBase, ChapterExtractor):
"keyword": "0207d20eea3a15d2a8d1496755bdfa49de7cfa9d",
}),
("https://hentaihere.com/m/S23048/1.5/1/", {
- "author": "Shinozuka Yuuji",
- "chapter": 1,
- "chapter_id": 80186,
- "chapter_minor": ".5",
+ "pattern": r"https://hentaicdn\.com/hentai"
+ r"/23048/1\.5/ccdn00\d+\.jpg",
"count": 32,
- "lang": "en",
- "language": "English",
- "manga": "High School Slut's Love Consultation",
- "manga_id": 23048,
- "page": int,
- "title": "High School Slut's Love Consultation + "
- "Girlfriend [Full Color]",
- "type": "Original",
+ "keyword": {
+ "author": "Shinozuka Yuuji",
+ "chapter": 1,
+ "chapter_id": 80186,
+ "chapter_minor": ".5",
+ "count": 32,
+ "lang": "en",
+ "language": "English",
+ "manga": "High School Slut's Love Consultation",
+ "manga_id": 23048,
+ "page": int,
+ "title": "High School Slut's Love Consultation + "
+ "Girlfriend [Full Color]",
+ "type": "Original",
+ },
}),
)
@@ -52,8 +57,8 @@ class HentaihereChapterExtractor(HentaihereBase, ChapterExtractor):
ChapterExtractor.__init__(self, match, url)
def metadata(self, page):
- title = text.extract(page, "<title>", "</title>")[0]
- chapter_id = text.extract(page, 'report/C', '"')[0]
+ title = text.extr(page, "<title>", "</title>")
+ chapter_id = text.extr(page, 'report/C', '"')
chapter, sep, minor = self.chapter.partition(".")
pattern = r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by (.+) at "
match = re.match(pattern, title)
@@ -72,7 +77,7 @@ class HentaihereChapterExtractor(HentaihereBase, ChapterExtractor):
@staticmethod
def images(page):
- images = text.extract(page, "var rff_imageList = ", ";")[0]
+ images = text.extr(page, "var rff_imageList = ", ";")
return [
("https://hentaicdn.com/hentai" + part, None)
for part in json.loads(images)
diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py
index 201ffdd..adee94a 100644
--- a/gallery_dl/extractor/hiperdex.py
+++ b/gallery_dl/extractor/hiperdex.py
@@ -139,7 +139,7 @@ class HiperdexMangaExtractor(HiperdexBase, MangaExtractor):
self.manga_data(self.manga, page)
results = []
- shortlink = text.extract(page, "rel='shortlink' href='", "'")[0]
+ shortlink = text.extr(page, "rel='shortlink' href='", "'")
data = {
"action" : "manga_get_reading_nav",
"manga" : shortlink.rpartition("=")[2],
@@ -182,6 +182,6 @@ class HiperdexArtistExtractor(HiperdexBase, MangaExtractor):
def chapters(self, page):
results = []
for info in text.extract_iter(page, 'id="manga-item-', '<img'):
- url = text.extract(info, 'href="', '"')[0]
+ url = text.extr(info, 'href="', '"')
results.append((url, {}))
return results
diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py
index d6575cf..01ad38c 100644
--- a/gallery_dl/extractor/hotleak.py
+++ b/gallery_dl/extractor/hotleak.py
@@ -44,7 +44,7 @@ class HotleakExtractor(Extractor):
for item in text.extract_iter(
page, '<article class="movie-item', '</article>'):
- yield text.extract(item, '<a href="', '"')[0]
+ yield text.extr(item, '<a href="', '"')
params["page"] += 1
@@ -87,8 +87,8 @@ class HotleakPostExtractor(HotleakExtractor):
url = "{}/{}/{}/{}".format(
self.root, self.creator, self.type, self.id)
page = self.request(url).text
- page = text.extract(
- page, '<div class="movie-image thumb">', '</article>')[0]
+ page = text.extr(
+ page, '<div class="movie-image thumb">', '</article>')
data = {
"id" : text.parse_int(self.id),
"creator": self.creator,
@@ -96,12 +96,12 @@ class HotleakPostExtractor(HotleakExtractor):
}
if self.type == "photo":
- data["url"] = text.extract(page, 'data-src="', '"')[0]
+ data["url"] = text.extr(page, 'data-src="', '"')
text.nameext_from_url(data["url"], data)
elif self.type == "video":
- data["url"] = "ytdl:" + text.extract(
- text.unescape(page), '"src":"', '"')[0]
+ data["url"] = "ytdl:" + text.extr(
+ text.unescape(page), '"src":"', '"')
text.nameext_from_url(data["url"], data)
data["extension"] = "mp4"
diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py
index 9701f1e..ce68d6d 100644
--- a/gallery_dl/extractor/idolcomplex.py
+++ b/gallery_dl/extractor/idolcomplex.py
@@ -115,7 +115,7 @@ class IdolcomplexExtractor(SankakuExtractor):
if self.extags:
tags = collections.defaultdict(list)
- tags_html = text.extract(page, '<ul id=tag-sidebar>', '</ul>')[0]
+ tags_html = text.extr(page, '<ul id=tag-sidebar>', '</ul>')
pattern = re.compile(r'tag-type-([^>]+)><a href="/\?tags=([^"]+)')
for tag_type, tag_name in pattern.findall(tags_html or ""):
tags[tag_type].append(text.unquote(tag_name))
diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py
index 7cd67d6..f993db8 100644
--- a/gallery_dl/extractor/imagebam.py
+++ b/gallery_dl/extractor/imagebam.py
@@ -83,8 +83,8 @@ class ImagebamGalleryExtractor(ImagebamExtractor):
@staticmethod
def metadata(page):
- return {"title": text.unescape(text.extract(
- page, 'id="gallery-name">', '<')[0].strip())}
+ return {"title": text.unescape(text.extr(
+ page, 'id="gallery-name">', '<').strip())}
def images(self, page):
findall = re.compile(r'<a href="https://www\.imagebam\.com'
diff --git a/gallery_dl/extractor/imagechest.py b/gallery_dl/extractor/imagechest.py
index a1ba0c3..14aa16f 100644
--- a/gallery_dl/extractor/imagechest.py
+++ b/gallery_dl/extractor/imagechest.py
@@ -36,8 +36,8 @@ class ImagechestGalleryExtractor(GalleryExtractor):
return {
"gallery_id": self.gallery_id,
- "title": text.unescape(text.extract(
- page, 'property="og:title" content="', '"')[0].strip())
+ "title": text.unescape(text.extr(
+ page, 'property="og:title" content="', '"').strip())
}
def images(self, page):
diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py
index 2c899eb..56bd048 100644
--- a/gallery_dl/extractor/imagefap.py
+++ b/gallery_dl/extractor/imagefap.py
@@ -202,7 +202,7 @@ class ImagefapUserExtractor(ImagefapExtractor):
response = self.request(url)
self.user = response.url.split("/")[-2]
- folders = text.extract(response.text, ' id="tgl_all" value="', '"')[0]
+ folders = text.extr(response.text, ' id="tgl_all" value="', '"')
return folders.rstrip("|").split("|")
def galleries(self, folder_id):
diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py
index 69455a8..622509f 100644
--- a/gallery_dl/extractor/imagehosts.py
+++ b/gallery_dl/extractor/imagehosts.py
@@ -54,6 +54,7 @@ class ImagehostImageExtractor(Extractor):
url, filename = self.get_info(page)
data = text.nameext_from_url(filename, {"token": self.token})
+ data.update(self.metadata(page))
if self.https and url.startswith("http:"):
url = "https:" + url[5:]
@@ -63,6 +64,10 @@ class ImagehostImageExtractor(Extractor):
def get_info(self, page):
"""Find image-url and string to get filename from"""
+ def metadata(self, page):
+ """Return additional metadata"""
+ return ()
+
class ImxtoImageExtractor(ImagehostImageExtractor):
"""Extractor for single images from imx.to"""
@@ -72,13 +77,23 @@ class ImxtoImageExtractor(ImagehostImageExtractor):
test = (
("https://imx.to/i/1qdeva", { # new-style URL
"url": "ab2173088a6cdef631d7a47dec4a5da1c6a00130",
- "keyword": "1153a986c939d7aed599905588f5c940048bc517",
"content": "0c8768055e4e20e7c7259608b67799171b691140",
+ "keyword": {
+ "size" : 18,
+ "width" : 64,
+ "height": 32,
+ "hash" : "94d56c599223c59f3feb71ea603484d1",
+ },
}),
("https://imx.to/img-57a2050547b97.html", { # old-style URL
"url": "a83fe6ef1909a318c4d49fcf2caf62f36c3f9204",
- "keyword": "fd2240aee77a21b8252d5b829a1f7e542f927f09",
"content": "54592f2635674c25677c6872db3709d343cdf92f",
+ "keyword": {
+ "size" : 5284,
+ "width" : 320,
+ "height": 160,
+ "hash" : "40da6aaa7b8c42b18ef74309bbc713fc",
+ },
}),
("https://img.yt/img-57a2050547b97.html", { # img.yt domain
"url": "a83fe6ef1909a318c4d49fcf2caf62f36c3f9204",
@@ -108,6 +123,17 @@ class ImxtoImageExtractor(ImagehostImageExtractor):
filename += splitext(url)[1]
return url, filename or url
+ def metadata(self, page):
+ extr = text.extract_from(page, page.index("[ FILESIZE <"))
+ size = extr(">", "</span>").replace(" ", "")[:-1]
+ width, _, height = extr(">", " px</span>").partition("x")
+ return {
+ "size" : text.parse_bytes(size),
+ "width" : text.parse_int(width),
+ "height": text.parse_int(height),
+ "hash" : extr(">", "</span>"),
+ }
+
class AcidimgImageExtractor(ImagehostImageExtractor):
"""Extractor for single images from acidimg.cc"""
@@ -259,7 +285,7 @@ class ViprImageExtractor(ImagehostImageExtractor):
})
def get_info(self, page):
- url = text.extract(page, '<img src="', '"')[0]
+ url = text.extr(page, '<img src="', '"')
return url, url
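
In the new ImxtoImageExtractor.metadata(), the size span is normalized before text.parse_bytes() converts it to a byte count: spaces are stripped and the trailing "B" is cut off, so "5.16 KB" becomes "5.16K". A quick check against the test keyword above (the rendered page string itself is an assumption):

    from gallery_dl import text

    rendered = "5.16 KB"                    # hypothetical "[ FILESIZE" span text
    size = rendered.replace(" ", "")[:-1]   # -> "5.16K"
    assert text.parse_bytes(size) == 5284   # matches the imx.to test keyword
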
diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py
index f32093a..49082d8 100644
--- a/gallery_dl/extractor/imgbb.py
+++ b/gallery_dl/extractor/imgbb.py
@@ -71,7 +71,7 @@ class ImgbbExtractor(Extractor):
url = self.root + "/login"
page = self.request(url).text
- token = text.extract(page, 'PF.obj.config.auth_token="', '"')[0]
+ token = text.extr(page, 'PF.obj.config.auth_token="', '"')
headers = {"Referer": url}
data = {
@@ -154,7 +154,7 @@ class ImgbbAlbumExtractor(ImgbbExtractor):
}
def images(self, page):
- url = text.extract(page, '"og:url" content="', '"')[0]
+ url = text.extr(page, '"og:url" content="', '"')
album_id = url.rpartition("/")[2].partition("?")[0]
return self._pagination(page, "https://ibb.co/json", {
@@ -185,7 +185,7 @@ class ImgbbUserExtractor(ImgbbExtractor):
return {"user": self.user}
def images(self, page):
- user = text.extract(page, '.obj.resource={"id":"', '"')[0]
+ user = text.extr(page, '.obj.resource={"id":"', '"')
return self._pagination(page, self.page_url + "json", {
"from" : "user",
"userid" : user,
diff --git a/gallery_dl/extractor/imgbox.py b/gallery_dl/extractor/imgbox.py
index 251f52e..530c4e1 100644
--- a/gallery_dl/extractor/imgbox.py
+++ b/gallery_dl/extractor/imgbox.py
@@ -53,7 +53,7 @@ class ImgboxExtractor(Extractor):
@staticmethod
def get_image_url(page):
"""Extract download-url"""
- return text.extract(page, 'property="og:image" content="', '"')[0]
+ return text.extr(page, 'property="og:image" content="', '"')
class ImgboxGalleryExtractor(AsynchronousMixin, ImgboxExtractor):
@@ -89,7 +89,7 @@ class ImgboxGalleryExtractor(AsynchronousMixin, ImgboxExtractor):
raise exception.NotFoundError("gallery")
self.image_keys = re.findall(r'<a href="/([^"]+)"><img alt="', page)
- title = text.extract(page, "<h1>", "</h1>")[0]
+ title = text.extr(page, "<h1>", "</h1>")
title, _, count = title.rpartition(" - ")
return {
"gallery_key": self.gallery_key,
diff --git a/gallery_dl/extractor/imgth.py b/gallery_dl/extractor/imgth.py
index 6b424ad..7e4cce4 100644
--- a/gallery_dl/extractor/imgth.py
+++ b/gallery_dl/extractor/imgth.py
@@ -41,7 +41,7 @@ class ImgthGalleryExtractor(Extractor):
"""Yield all image urls for this gallery"""
pnum = 0
while True:
- thumbs = text.extract(page, '<ul class="thumbnails">', '</ul>')[0]
+ thumbs = text.extr(page, '<ul class="thumbnails">', '</ul>')
for url in text.extract_iter(thumbs, '<img src="', '"'):
yield "https://imgth.com/images" + url[24:]
if '<li class="next">' not in page:
diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py
index ded8906..706cd34 100644
--- a/gallery_dl/extractor/inkbunny.py
+++ b/gallery_dl/extractor/inkbunny.py
@@ -236,7 +236,7 @@ class InkbunnySearchExtractor(InkbunnyExtractor):
# get user_id from user profile
url = "{}/{}".format(self.root, favsby)
page = self.request(url).text
- user_id = text.extract(page, "?user_id=", "'")[0]
+ user_id = text.extr(page, "?user_id=", "'")
params["favs_user_id"] = user_id.partition("&")[0]
return self.api.search(params)
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index a4ea71a..24ad873 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -14,7 +14,6 @@ from .. import text, util, exception
from ..cache import cache, memcache
import binascii
import json
-import time
import re
BASE_PATTERN = r"(?:https?://)?(?:www\.)?instagram\.com"
@@ -45,14 +44,10 @@ class InstagramExtractor(Extractor):
def items(self):
self.login()
- api = self.config("api")
- if api is None or api == "auto":
- api = InstagramRestAPI if self._logged_in else InstagramGraphqlAPI
- elif api == "graphql":
- api = InstagramGraphqlAPI
+ if self.config("api") == "graphql":
+ self.api = InstagramGraphqlAPI(self)
else:
- api = InstagramRestAPI
- self.api = api(self)
+ self.api = InstagramRestAPI(self)
data = self.metadata()
videos = self.config("videos", True)
@@ -385,7 +380,6 @@ class InstagramUserExtractor(InstagramExtractor):
(InstagramPostsExtractor , base + "posts/"),
(InstagramReelsExtractor , base + "reels/"),
(InstagramTaggedExtractor , base + "tagged/"),
- (InstagramChannelExtractor , base + "channel/"),
), ("posts",))
@@ -449,18 +443,25 @@ class InstagramTaggedExtractor(InstagramExtractor):
return self.api.user_tagged(self.user_id)
-class InstagramChannelExtractor(InstagramExtractor):
- """Extractor for an Instagram user's channel posts"""
- subcategory = "channel"
- pattern = USER_PATTERN + r"/channel"
- test = ("https://www.instagram.com/instagram/channel/", {
+class InstagramGuideExtractor(InstagramExtractor):
+ """Extractor for an Instagram guide"""
+ subcategory = "guide"
+ pattern = USER_PATTERN + r"/guide/[^/?#]+/(\d+)"
+ test = (("https://www.instagram.com/kadakaofficial/guide"
+ "/knit-i-need-collection/18131821684305217/"), {
"range": "1-16",
"count": ">= 16",
})
+ def __init__(self, match):
+ InstagramExtractor.__init__(self, match)
+ self.guide_id = match.group(2)
+
+ def metadata(self):
+ return {"guide": self.api.guide(self.guide_id)}
+
def posts(self):
- uid = self.api.user_id(self.item)
- return self.api.user_clips(uid)
+ return self.api.guide_media(self.guide_id)
class InstagramSavedExtractor(InstagramExtractor):
@@ -581,7 +582,7 @@ class InstagramAvatarExtractor(InstagramExtractor):
def posts(self):
if self._logged_in:
- user_id = self.api.user_id(self.item)
+ user_id = self.api.user_id(self.item, check_private=False)
user = self.api.user_by_id(user_id)
avatar = (user.get("hd_profile_pic_url_info") or
user["hd_profile_pic_versions"][-1])
@@ -723,6 +724,15 @@ class InstagramRestAPI():
def __init__(self, extractor):
self.extractor = extractor
+ def guide(self, guide_id):
+ endpoint = "/v1/guides/web_info/"
+ params = {"guide_id": guide_id}
+ return self._call(endpoint, params=params)
+
+ def guide_media(self, guide_id):
+ endpoint = "/v1/guides/guide/{}/".format(guide_id)
+ return self._pagination_guides(endpoint)
+
def highlights_media(self, user_id):
chunk_size = 5
reel_ids = [hl["id"] for hl in self.highlights_tray(user_id)]
@@ -770,14 +780,15 @@ class InstagramRestAPI():
endpoint = "/v1/users/{}/info/".format(user_id)
return self._call(endpoint)["user"]
- def user_id(self, screen_name):
+ def user_id(self, screen_name, check_private=True):
if screen_name.startswith("id:"):
return screen_name[3:]
user = self.user_by_name(screen_name)
if user is None:
raise exception.AuthorizationError(
"Login required to access this profile")
- if user["is_private"] and not user["followed_by_viewer"]:
+ if check_private and user["is_private"] and \
+ not user["followed_by_viewer"]:
name = user["username"]
s = "" if name.endswith("s") else "s"
raise exception.StopExtraction("%s'%s posts are private", name, s)
@@ -874,13 +885,28 @@ class InstagramRestAPI():
params["page"] = info["next_page"]
params["max_id"] = extr._update_cursor(info["next_max_id"])
+ def _pagination_guides(self, endpoint):
+ extr = self.extractor
+ params = {"max_id": extr._init_cursor()}
+
+ while True:
+ data = self._call(endpoint, params=params)
+
+ for item in data["items"]:
+ yield from item["media_items"]
+
+ if "next_max_id" not in data:
+ return
+ params["max_id"] = extr._update_cursor(data["next_max_id"])
+
class InstagramGraphqlAPI():
def __init__(self, extractor):
self.extractor = extractor
self.user_collection = self.user_saved = self.reels_media = \
- self.highlights_media = self._login_required
+ self.highlights_media = self.guide = self.guide_media = \
+ self._unsupported
self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode
api = InstagramRestAPI(extractor)
@@ -889,8 +915,8 @@ class InstagramGraphqlAPI():
self.user_id = api.user_id
@staticmethod
- def _login_required(_=None):
- raise exception.AuthorizationError("Login required")
+ def _unsupported(_=None):
+ raise exception.StopExtraction("Unsupported with GraphQL API")
def highlights_tray(self, user_id):
query_hash = "d4d88dc1500312af6f937f7b804c68c3"
@@ -990,63 +1016,9 @@ class InstagramGraphqlAPI():
@cache(maxage=90*24*3600, keyarg=1)
def _login_impl(extr, username, password):
- extr.log.info("Logging in as %s", username)
-
- user_agent = ("Mozilla/5.0 (Linux; Android 13) AppleWebKit/537.36 "
- "(KHTML, like Gecko) Chrome/106.0.5249.79 Mobile "
- "Safari/537.36 Instagram 255.1.0.17.102")
-
- headers = {
- "User-Agent" : user_agent,
- "Sec-Fetch-Dest": "document",
- "Sec-Fetch-Mode": "navigate",
- "Sec-Fetch-Site": "none",
- "Sec-Fetch-User": "?1",
- }
- url = extr.root + "/accounts/login/"
- response = extr.request(url, headers=headers)
-
- extract = text.extract_from(response.text)
- csrf_token = extract('"csrf_token":"', '"')
- device_id = extract('"device_id":"', '"')
- rollout_hash = extract('"rollout_hash":"', '"')
-
- cset = extr.session.cookies.set
- cset("csrftoken", csrf_token, domain=extr.cookiedomain)
- cset("ig_did", device_id, domain=extr.cookiedomain)
-
- headers = {
- "User-Agent" : user_agent,
- "Accept" : "*/*",
- "X-CSRFToken" : csrf_token,
- "X-Instagram-AJAX": rollout_hash,
- "X-IG-App-ID" : "936619743392459",
- "X-ASBD-ID" : "198387",
- "X-IG-WWW-Claim" : "0",
- "X-Requested-With": "XMLHttpRequest",
- "Origin" : extr.root,
- "Referer" : url,
- "Sec-Fetch-Dest" : "empty",
- "Sec-Fetch-Mode" : "cors",
- "Sec-Fetch-Site" : "same-origin",
- }
- data = {
- "enc_password" : "#PWD_INSTAGRAM_BROWSER:0:{}:{}".format(
- int(time.time()), password),
- "username" : username,
- "queryParams" : "{}",
- "optIntoOneTap" : "false",
- "stopDeletionNonce" : "",
- "trustedDeviceRecords": "{}",
- }
- url = extr.root + "/accounts/login/ajax/"
- response = extr.request(url, method="POST", headers=headers, data=data)
-
- if not response.json().get("authenticated"):
- raise exception.AuthenticationError()
-
- return {cookie.name: cookie.value
- for cookie in extr.session.cookies}
+ extr.log.error("Login with username & password is no longer supported. "
+ "Use browser cookies instead.")
+ return {}
def id_from_shortcode(shortcode):
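
Usage example for the new guide extractor, using the URL from its test case:

    $ gallery-dl "https://www.instagram.com/kadakaofficial/guide/knit-i-need-collection/18131821684305217/"
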
diff --git a/gallery_dl/extractor/issuu.py b/gallery_dl/extractor/issuu.py
index ae4112b..8067f63 100644
--- a/gallery_dl/extractor/issuu.py
+++ b/gallery_dl/extractor/issuu.py
@@ -54,8 +54,8 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor):
})
def metadata(self, page):
- data = json.loads(text.extract(
- page, '<script data-json="', '"')[0].replace("&quot;", '"'))
+ data = json.loads(text.extr(
+ page, '<script data-json="', '"').replace("&quot;", '"'))
doc = data["initialDocumentData"]["document"]
doc["date"] = text.parse_datetime(
diff --git a/gallery_dl/extractor/kabeuchi.py b/gallery_dl/extractor/kabeuchi.py
index a8702f1..f172dcf 100644
--- a/gallery_dl/extractor/kabeuchi.py
+++ b/gallery_dl/extractor/kabeuchi.py
@@ -62,7 +62,7 @@ class KabeuchiUserExtractor(Extractor):
response = self.request(url)
if response.history and response.url == self.root + "/":
raise exception.NotFoundError("user")
- target_id = text.extract(response.text, 'user_friend_id = "', '"')[0]
+ target_id = text.extr(response.text, 'user_friend_id = "', '"')
return self._pagination(target_id)
def _pagination(self, target_id):
diff --git a/gallery_dl/extractor/keenspot.py b/gallery_dl/extractor/keenspot.py
index 50ce0d3..b5d7738 100644
--- a/gallery_dl/extractor/keenspot.py
+++ b/gallery_dl/extractor/keenspot.py
@@ -96,7 +96,7 @@ class KeenspotComicExtractor(Extractor):
self._image = '<div id="comic">'
return "http://brawlinthefamily.keenspot.com/comic/theshowdown/"
- url = text.extract(page, '<link rel="first" href="', '"')[0]
+ url = text.extr(page, '<link rel="first" href="', '"')
if url:
if self.comic == "porcelain":
self._needle = 'id="porArchivetop_"'
@@ -144,7 +144,7 @@ class KeenspotComicExtractor(Extractor):
@staticmethod
def _next_link(page):
- return text.extract(page, '<link rel="next" href="', '"')[0]
+ return text.extr(page, '<link rel="next" href="', '"')
@staticmethod
def _next_id(page):
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index 21ff114..8a61728 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -192,7 +192,7 @@ class KemonopartyExtractor(Extractor):
"body": text.unescape(text.extract(
dm, "<pre>", "</pre></",
)[0].strip()),
- "date": text.extract(dm, 'datetime="', '"')[0],
+ "date": text.extr(dm, 'datetime="', '"'),
})
return dms
diff --git a/gallery_dl/extractor/khinsider.py b/gallery_dl/extractor/khinsider.py
index d2e9d88..d5cca1c 100644
--- a/gallery_dl/extractor/khinsider.py
+++ b/gallery_dl/extractor/khinsider.py
@@ -23,9 +23,9 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor):
root = "https://downloads.khinsider.com"
test = (("https://downloads.khinsider.com"
"/game-soundtracks/album/horizon-riders-wii"), {
- "pattern": r"https?://vgm(site|downloads).com"
+ "pattern": r"https?://vgm(site|downloads)\.com"
r"/soundtracks/horizon-riders-wii/[^/]+"
- r"/Horizon%20Riders%20Wii%20-%20Full%20Soundtrack.mp3",
+ r"/Horizon%20Riders%20Wii%20-%20Full%20Soundtrack\.mp3",
"keyword": {
"album": {
"count": 1,
@@ -76,15 +76,14 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor):
else:
fmt = fmt.lower().split(",")
- page = text.extract(page, '<table id="songlist">', '</table>')[0]
+ page = text.extr(page, '<table id="songlist">', '</table>')
for num, url in enumerate(text.extract_iter(
page, '<td class="clickable-row"><a href="', '"'), 1):
url = text.urljoin(self.root, url)
page = self.request(url, encoding="utf-8").text
track = first = None
- for url in text.extract_iter(
- page, 'style="color: #21363f;" href="', '"'):
+ for url in text.extract_iter(page, '<p><a href="', '"'):
track = text.nameext_from_url(url, {"num": num, "url": url})
if first is None:
first = track
diff --git a/gallery_dl/extractor/kissgoddess.py b/gallery_dl/extractor/kissgoddess.py
index 6e66772..4ec685c 100644
--- a/gallery_dl/extractor/kissgoddess.py
+++ b/gallery_dl/extractor/kissgoddess.py
@@ -35,8 +35,8 @@ class KissgoddessGalleryExtractor(GalleryExtractor):
def metadata(self, page):
return {
"gallery_id": text.parse_int(self.gallery_id),
- "title" : text.extract(
- page, '<title>', "<")[0].rpartition(" | ")[0],
+ "title" : text.extr(
+ page, '<title>', "<")[0].rpartition(" | "),
}
def images(self, page):
diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py
index 1187fd6..a9eebf4 100644
--- a/gallery_dl/extractor/komikcast.py
+++ b/gallery_dl/extractor/komikcast.py
@@ -62,13 +62,13 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
)
def metadata(self, page):
- info = text.extract(page, "<title>", " – Komikcast<")[0]
+ info = text.extr(page, "<title>", " – Komikcast<")
return self.parse_chapter_string(info)
@staticmethod
def images(page):
- readerarea = text.extract(
- page, '<div class="main-reading-area', '</div')[0]
+ readerarea = text.extr(
+ page, '<div class="main-reading-area', '</div')
return [
(text.unescape(url), None)
for url in re.findall(r"<img[^>]* src=[\"']([^\"']+)", readerarea)
diff --git a/gallery_dl/extractor/lightroom.py b/gallery_dl/extractor/lightroom.py
index 8131db8..d202e20 100644
--- a/gallery_dl/extractor/lightroom.py
+++ b/gallery_dl/extractor/lightroom.py
@@ -47,7 +47,7 @@ class LightroomGalleryExtractor(Extractor):
url = "https://lightroom.adobe.com/shares/" + self.href
response = self.request(url)
album = json.loads(
- text.extract(response.text, "albumAttributes: ", "\n")[0]
+ text.extr(response.text, "albumAttributes: ", "\n")
)
images = self.images(album)
diff --git a/gallery_dl/extractor/lineblog.py b/gallery_dl/extractor/lineblog.py
index 4071a26..adb27a8 100644
--- a/gallery_dl/extractor/lineblog.py
+++ b/gallery_dl/extractor/lineblog.py
@@ -22,8 +22,8 @@ class LineblogBase():
body = post.pop("body")
for num, img in enumerate(text.extract_iter(body, "<img ", ">"), 1):
- src = text.extract(img, 'src="', '"')[0]
- alt = text.extract(img, 'alt="', '"')[0]
+ src = text.extr(img, 'src="', '"')
+ alt = text.extr(img, 'alt="', '"')
if not src:
continue
diff --git a/gallery_dl/extractor/livedoor.py b/gallery_dl/extractor/livedoor.py
index cffbc10..2765f0b 100644
--- a/gallery_dl/extractor/livedoor.py
+++ b/gallery_dl/extractor/livedoor.py
@@ -37,7 +37,7 @@ class LivedoorExtractor(Extractor):
def _load(self, data, body):
extr = text.extract_from(data)
- tags = text.extract(body, 'class="article-tags">', '</dl>')[0]
+ tags = text.extr(body, 'class="article-tags">', '</dl>')
about = extr('rdf:about="', '"')
return {
@@ -57,8 +57,8 @@ class LivedoorExtractor(Extractor):
body = post.pop("body")
for num, img in enumerate(text.extract_iter(body, "<img ", ">"), 1):
- src = text.extract(img, 'src="', '"')[0]
- alt = text.extract(img, 'alt="', '"')[0]
+ src = text.extr(img, 'src="', '"')
+ alt = text.extr(img, 'alt="', '"')
if not src:
continue
diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py
index 7c6ef69..14d4efb 100644
--- a/gallery_dl/extractor/lolisafe.py
+++ b/gallery_dl/extractor/lolisafe.py
@@ -24,6 +24,10 @@ BASE_PATTERN = LolisafeExtractor.update({
"root": "https://zz.ht",
"pattern": r"zz\.(?:ht|fo)",
},
+ "xbunkr": {
+ "root": "https://xbunkr.com",
+ "pattern": r"xbunkr\.com",
+ }
})
@@ -40,6 +44,15 @@ class LolisafeAlbumExtractor(LolisafeExtractor):
},
}),
("https://zz.fo/a/lop7W6EZ"),
+ ("https://xbunkr.com/a/TA0bu3F4", {
+ "pattern": r"https://media\.xbunkr\.com/[^.]+\.\w+",
+ "count": 861,
+ "keyword": {
+ "album_id": "TA0bu3F4",
+ "album_name": "Hannahowo Onlyfans Photos",
+ }
+ }),
+ ("https://xbunkr.com/a/GNQc2I5d"),
)
def __init__(self, match):
diff --git a/gallery_dl/extractor/manganelo.py b/gallery_dl/extractor/manganelo.py
index a12a801..14a542b 100644
--- a/gallery_dl/extractor/manganelo.py
+++ b/gallery_dl/extractor/manganelo.py
@@ -63,8 +63,8 @@ class ManganeloChapterExtractor(ChapterExtractor):
}
def images(self, page):
- page = text.extract(
- page, 'class="container-chapter-reader', '\n<div')[0]
+ page = text.extr(
+ page, 'class="container-chapter-reader', '\n<div')
return [
(url, None)
for url in text.extract_iter(page, '<img src="', '"')
diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py
index a28a966..dcf1972 100644
--- a/gallery_dl/extractor/mangapark.py
+++ b/gallery_dl/extractor/mangapark.py
@@ -104,7 +104,7 @@ class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor):
return data
def images(self, page):
- data = json.loads(text.extract(page, "var _load_pages =", ";")[0])
+ data = json.loads(text.extr(page, "var _load_pages =", ";"))
return [
(text.urljoin(self.root, item["u"]), {
"width": text.parse_int(item["w"]),
@@ -136,10 +136,10 @@ class MangaparkMangaExtractor(MangaparkBase, MangaExtractor):
results = []
data = {"lang": "en", "language": "English"}
data["manga"] = text.unescape(
- text.extract(page, '<title>', ' Manga - ')[0])
+ text.extr(page, '<title>', ' Manga - '))
for stream in page.split('<div id="stream_')[1:]:
- data["stream"] = text.parse_int(text.extract(stream, '', '"')[0])
+ data["stream"] = text.parse_int(text.extr(stream, '', '"'))
for chapter in text.extract_iter(stream, '<li ', '</li>'):
path , pos = text.extract(chapter, 'href="', '"')
diff --git a/gallery_dl/extractor/mangoxo.py b/gallery_dl/extractor/mangoxo.py
index 1486057..ac4c797 100644
--- a/gallery_dl/extractor/mangoxo.py
+++ b/gallery_dl/extractor/mangoxo.py
@@ -38,7 +38,7 @@ class MangoxoExtractor(Extractor):
url = self.root + "/login"
page = self.request(url).text
- token = text.extract(page, 'id="loginToken" value="', '"')[0]
+ token = text.extr(page, 'id="loginToken" value="', '"')
url = self.root + "/api/login"
headers = {
@@ -115,7 +115,7 @@ class MangoxoAlbumExtractor(MangoxoExtractor):
data["extension"] = None
for data["num"], path in enumerate(imgs, 1):
- data["id"] = text.parse_int(text.extract(path, "=", "&")[0])
+ data["id"] = text.parse_int(text.extr(path, "=", "&"))
url = self.root + "/external/" + path.rpartition("url=")[2]
yield Message.Url, url, text.nameext_from_url(url, data)
diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py
index 0d2cded..049e0af 100644
--- a/gallery_dl/extractor/mastodon.py
+++ b/gallery_dl/extractor/mastodon.py
@@ -44,6 +44,10 @@ class MastodonExtractor(BaseExtractor):
del status["media_attachments"]
status["instance"] = self.instance
+ acct = status["account"]["acct"]
+ status["instance_remote"] = \
+ acct.rpartition("@")[2] if "@" in acct else None
+
status["tags"] = [tag["name"] for tag in status["tags"]]
status["date"] = text.parse_datetime(
status["created_at"][:19], "%Y-%m-%dT%H:%M:%S")
diff --git a/gallery_dl/extractor/moebooru.py b/gallery_dl/extractor/moebooru.py
index 4d63c3e..0ef0a32 100644
--- a/gallery_dl/extractor/moebooru.py
+++ b/gallery_dl/extractor/moebooru.py
@@ -26,42 +26,39 @@ class MoebooruExtractor(BooruExtractor):
def _prepare(post):
post["date"] = text.parse_timestamp(post["created_at"])
- def _extended_tags(self, post, page=None):
- if not page:
- url = "{}/post/show/{}".format(self.root, post["id"])
- page = self.request(url).text
- html = text.extract(page, '<ul id="tag-', '</ul>')[0]
- if html:
- tags = collections.defaultdict(list)
- pattern = re.compile(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'+]+)")
- for tag_type, tag_name in pattern.findall(html):
- tags[tag_type].append(text.unquote(tag_name))
- for key, value in tags.items():
- post["tags_" + key] = " ".join(value)
- return page
-
- def _notes(self, post, page=None):
- if not page:
- url = "{}/post/show/{}".format(self.root, post["id"])
- page = self.request(url).text
- notes = []
- notes_container = text.extract(page, 'id="note-container"', "<img ")[0]
- if not notes_container:
+ def _html(self, post):
+ return self.request("{}/post/show/{}".format(
+ self.root, post["id"])).text
+
+ def _tags(self, post, page):
+ tag_container = text.extr(page, '<ul id="tag-', '</ul>')
+ if not tag_container:
return
- for note in notes_container.split('class="note-box"')[1:]:
+ tags = collections.defaultdict(list)
+ pattern = re.compile(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'+]+)")
+ for tag_type, tag_name in pattern.findall(tag_container):
+ tags[tag_type].append(text.unquote(tag_name))
+ for key, value in tags.items():
+ post["tags_" + key] = " ".join(value)
+
+ def _notes(self, post, page):
+ note_container = text.extr(page, 'id="note-container"', "<img ")
+ if not note_container:
+ return
+
+ post["notes"] = notes = []
+ for note in note_container.split('class="note-box"')[1:]:
extr = text.extract_from(note)
notes.append({
- "width" : int(extr("width: ", "p")),
- "height": int(extr("height: ", "p")),
- "y" : int(extr("top: ", "p")),
- "x" : int(extr("left: ", "p")),
+ "width" : int(extr("width:", "p")),
+ "height": int(extr("height:", "p")),
+ "y" : int(extr("top:", "p")),
+ "x" : int(extr("left:", "p")),
"id" : int(extr('id="note-body-', '"')),
- "body" : text.remove_html(extr('>', "</div>")),
+ "body" : text.unescape(text.remove_html(extr(">", "</div>"))),
})
- post["notes"] = notes
-
def _pagination(self, url, params):
params["page"] = self.page_start
params["limit"] = self.per_page
diff --git a/gallery_dl/extractor/myhentaigallery.py b/gallery_dl/extractor/myhentaigallery.py
index da0f589..3dbd5fc 100644
--- a/gallery_dl/extractor/myhentaigallery.py
+++ b/gallery_dl/extractor/myhentaigallery.py
@@ -59,7 +59,7 @@ class MyhentaigalleryGalleryExtractor(GalleryExtractor):
def images(self, page):
return [
- (text.unescape(text.extract(url, 'src="', '"')[0]).replace(
+ (text.unescape(text.extr(url, 'src="', '"')).replace(
"/thumbnail/", "/original/"), None)
for url in text.extract_iter(page, 'class="comic-thumb"', '</div>')
]
diff --git a/gallery_dl/extractor/myportfolio.py b/gallery_dl/extractor/myportfolio.py
index 8254118..7d23518 100644
--- a/gallery_dl/extractor/myportfolio.py
+++ b/gallery_dl/extractor/myportfolio.py
@@ -57,8 +57,8 @@ class MyportfolioGalleryExtractor(Extractor):
raise exception.NotFoundError()
page = response.text
- projects = text.extract(
- page, '<section class="project-covers', '</section>')[0]
+ projects = text.extr(
+ page, '<section class="project-covers', '</section>')
if projects:
data = {"_extractor": MyportfolioGalleryExtractor}
diff --git a/gallery_dl/extractor/nana.py b/gallery_dl/extractor/nana.py
index 6062418..1db83b0 100644
--- a/gallery_dl/extractor/nana.py
+++ b/gallery_dl/extractor/nana.py
@@ -44,10 +44,10 @@ class NanaGalleryExtractor(GalleryExtractor):
def metadata(self, page):
title = text.unescape(
- text.extract(page, '</a>&nbsp; ', '</div>')[0])
- artist = text.unescape(text.extract(
- page, '<title>', '</title>')[0])[len(title):-10]
- tags = text.extract(page, 'Reader.tags = "', '"')[0]
+ text.extr(page, '</a>&nbsp; ', '</div>'))
+ artist = text.unescape(text.extr(
+ page, '<title>', '</title>'))[len(title):-10]
+ tags = text.extr(page, 'Reader.tags = "', '"')
return {
"gallery_id": self.gallery_id,
@@ -59,7 +59,7 @@ class NanaGalleryExtractor(GalleryExtractor):
}
def images(self, page):
- data = json.loads(text.extract(page, "Reader.pages = ", ".pages")[0])
+ data = json.loads(text.extr(page, "Reader.pages = ", ".pages"))
return [
("https://nana.my.id" + image, None)
for image in data["pages"]
@@ -108,8 +108,8 @@ class NanaSearchExtractor(Extractor):
for gallery in text.extract_iter(
page, '<div class="id3">', '</div>'):
- url = "https://nana.my.id" + text.extract(
- gallery, '<a href="', '"')[0]
+ url = "https://nana.my.id" + text.extr(
+ gallery, '<a href="', '"')
yield Message.Queue, url, data
self.params["p"] += 1
diff --git a/gallery_dl/extractor/naverwebtoon.py b/gallery_dl/extractor/naverwebtoon.py
index eadd460..fa91f76 100644
--- a/gallery_dl/extractor/naverwebtoon.py
+++ b/gallery_dl/extractor/naverwebtoon.py
@@ -76,7 +76,7 @@ class NaverwebtoonEpisodeExtractor(NaverwebtoonBase, GalleryExtractor):
@staticmethod
def images(page):
- view_area = text.extract(page, 'id="comic_view_area"', '</div>')[0]
+ view_area = text.extr(page, 'id="comic_view_area"', '</div>')
return [
(url, None)
for url in text.extract_iter(view_area, '<img src="', '"')
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index 2c2dcb9..1f96879 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -88,8 +88,8 @@ class NewgroundsExtractor(Extractor):
return self.session.cookies
headers = {"Origin": self.root, "Referer": url}
- url = text.urljoin(self.root, text.extract(
- response.text, 'action="', '"')[0])
+ url = text.urljoin(self.root, text.extr(
+ response.text, 'action="', '"'))
data = {
"username": username,
"password": password,
@@ -140,7 +140,7 @@ class NewgroundsExtractor(Extractor):
data["score"] = text.parse_float(extr('id="score_number">', '<'))
data["tags"] = text.split_html(extr('<dd class="tags">', '</dd>'))
data["artist"] = [
- text.extract(user, '//', '.')[0]
+ text.extr(user, '//', '.')
for user in text.extract_iter(page, '<div class="item-user">', '>')
]
@@ -275,7 +275,7 @@ class NewgroundsExtractor(Extractor):
for year, items in items.items():
for item in items:
- page_url = text.extract(item, 'href="', '"')[0]
+ page_url = text.extr(item, 'href="', '"')
if page_url[0] == "/":
page_url = self.root + page_url
yield page_url
diff --git a/gallery_dl/extractor/ngomik.py b/gallery_dl/extractor/ngomik.py
deleted file mode 100644
index 8e29d97..0000000
--- a/gallery_dl/extractor/ngomik.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2018-2019 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extract manga-chapters and entire manga from http://ngomik.in/"""
-
-from .common import ChapterExtractor
-from .. import text
-import re
-
-
-class NgomikChapterExtractor(ChapterExtractor):
- """Extractor for manga-chapters from ngomik.in"""
- category = "ngomik"
- root = "http://ngomik.in"
- pattern = (r"(?:https?://)?(?:www\.)?ngomik\.in"
- r"(/[^/?#]+-chapter-[^/?#]+)")
- test = (
- ("https://www.ngomik.in/14-sai-no-koi-chapter-1-6/", {
- "url": "8e67fdf751bbc79bc6f4dead7675008ddb8e32a4",
- "keyword": "204d177f09d438fd50c9c28d98c73289194640d8",
- }),
- ("https://ngomik.in/break-blade-chapter-26/", {
- "count": 34,
- }),
- )
-
- def metadata(self, page):
- info = text.extract(page, '<title>', "</title>")[0]
- manga, _, chapter = info.partition(" Chapter ")
- chapter, sep, minor = chapter.partition(" ")[0].partition(".")
-
- return {
- "manga": text.unescape(manga),
- "chapter": text.parse_int(chapter),
- "chapter_minor": sep + minor,
- "lang": "id",
- "language": "Indonesian",
- }
-
- @staticmethod
- def images(page):
- readerarea = text.extract(page, 'id="readerarea"', 'class="chnav"')[0]
- return [
- (text.unescape(url), None)
- for url in re.findall(r"\ssrc=[\"']?([^\"' >]+)", readerarea)
- ]
diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py
index 73911b2..079bae7 100644
--- a/gallery_dl/extractor/nijie.py
+++ b/gallery_dl/extractor/nijie.py
@@ -107,7 +107,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
"""Extract image URLs from 'page'"""
images = text.extract_iter(page, "/view_popup.php", "</a>")
for num, image in enumerate(images):
- src = text.extract(image, 'src="', '"')[0]
+ src = text.extr(image, 'src="', '"')
if not src:
continue
url = ("https:" + src).replace("/__rs_l120x120/", "/")
@@ -118,7 +118,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
@staticmethod
def _extract_user_name(page):
- return text.unescape(text.extract(page, "<br />", "<")[0] or "")
+ return text.unescape(text.extr(page, "<br />", "<"))
def login(self):
"""Login and obtain session cookies"""
@@ -322,8 +322,7 @@ class NijieNuitaExtractor(NijieExtractor):
@staticmethod
def _extract_user_name(page):
- return text.unescape(text.extract(
- page, "<title>", "さんの抜いた")[0] or "")
+ return text.unescape(text.extr(page, "<title>", "さんの抜いた"))
class NijieFeedExtractor(NijieExtractor):
diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py
new file mode 100644
index 0000000..1ba8253
--- /dev/null
+++ b/gallery_dl/extractor/nitter.py
@@ -0,0 +1,256 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for Nitter instances"""
+
+from .common import BaseExtractor, Message
+from .. import text
+
+
+class NitterExtractor(BaseExtractor):
+ """Base class for nitter extractors"""
+ basecategory = "nitter"
+ directory_fmt = ("{category}", "{user[name]}")
+ filename_fmt = "{tweet_id}_{num}.{extension}"
+ archive_fmt = "{tweet_id}_{num}"
+
+ def __init__(self, match):
+ BaseExtractor.__init__(self, match)
+ self.user = match.group(match.lastindex)
+
+ def items(self):
+ for tweet_html in self.tweets():
+ tweet = self._tweet_from_html(tweet_html)
+
+ attachments_html = tweet.pop("_attach", "")
+ if attachments_html:
+ attachments = list(text.extract_iter(
+ attachments_html, 'href="', '"'))
+ attachments.extend(text.extract_iter(
+ attachments_html, 'data-url="', '"'))
+ else:
+ attachments = ()
+ tweet["count"] = len(attachments)
+
+ yield Message.Directory, tweet
+ for tweet["num"], url in enumerate(attachments, 1):
+ if url[0] == "/":
+ url = self.root + url
+ if "/video/" in url:
+ url = "ytdl:" + url
+ tweet["filename"] = url.rpartition(
+ "%2F")[2].partition(".")[0]
+ tweet["extension"] = "mp4"
+ else:
+ text.nameext_from_url(url, tweet)
+ yield Message.Url, url, tweet
+
+ def _tweet_from_html(self, html):
+ extr = text.extract_from(html)
+ user = {
+ "name": extr('class="fullname" href="/', '"'),
+ "nick": extr('title="', '"'),
+ }
+ extr('<span class="tweet-date', '')
+ link = extr('href="', '"')
+ return {
+ "user": user,
+ "date": text.parse_datetime(
+ extr('title="', '"'), "%b %d, %Y · %I:%M %p %Z"),
+ "tweet_id": link.rpartition("/")[2].partition("#")[0],
+ "content": extr('class="tweet-content', "</div").partition(">")[2],
+ "_attach": extr('class="attachments', 'class="tweet-stats'),
+ "comments": text.parse_int(extr(
+ 'class="icon-comment', '</div>').rpartition(">")[2]),
+ "retweets": text.parse_int(extr(
+ 'class="icon-retweet', '</div>').rpartition(">")[2]),
+ "quotes" : text.parse_int(extr(
+ 'class="icon-quote', '</div>').rpartition(">")[2]),
+ "likes" : text.parse_int(extr(
+ 'class="icon-heart', '</div>').rpartition(">")[2]),
+ }
+
+ def _pagination(self, path):
+ base_url = url = self.root + path
+
+ while True:
+ page = self.request(url).text
+
+ yield from page.split('<div class="timeline-item')[1:]
+
+ more = text.extr(page, '<div class="show-more"><a href="?', '"')
+ if not more:
+ return
+ url = base_url + "?" + text.unescape(more)
+
+
+BASE_PATTERN = NitterExtractor.update({
+ "nitter.net": {
+ "root": "https://nitter.net",
+ "pattern": r"nitter\.net",
+ },
+ "nitter.lacontrevoie.fr": {
+ "root": "https://nitter.lacontrevoie.fr",
+ "pattern": r"nitter\.lacontrevoie\.fr",
+ },
+ "nitter.pussthecat.org": {
+ "root": "https://nitter.pussthecat.org",
+ "pattern": r"nitter\.pussthecat\.org",
+ },
+ "nitter.1d4.us": {
+ "root": "https://nitter.1d4.us",
+ "pattern": r"nitter\.1d4\.us",
+ },
+ "nitter.kavin.rocks": {
+ "root": "https://nitter.kavin.rocks",
+ "pattern": r"nitter\.kavin\.rocks",
+ },
+ "nitter.unixfox.eu": {
+ "root": "https://nitter.unixfox.eu",
+ "pattern": r"nitter\.unixfox\.eu",
+ },
+})
+
+
+class NitterTweetsExtractor(NitterExtractor):
+ subcategory = "tweets"
+ pattern = BASE_PATTERN + r"/([^/?#]+)(?:/tweets)?(?:$|\?|#)"
+ test = (
+ ("https://nitter.net/supernaturepics", {
+ "pattern": r"https://nitter\.net/pic/orig"
+ r"/media%2F[\w-]+\.(jpg|png)$",
+ "range": "1-20",
+ "count": 20,
+ "keyword": {
+ "comments": int,
+ "content": str,
+ "count": 1,
+ "date": "type:datetime",
+ "likes": int,
+ "quotes": int,
+ "retweets": int,
+ "tweet_id": r"re:\d+",
+ "user": {
+ "name": "supernaturepics",
+ "nick": "Nature Pictures"
+ },
+ },
+ }),
+ ("https://nitter.lacontrevoie.fr/supernaturepics"),
+ ("https://nitter.pussthecat.org/supernaturepics"),
+ ("https://nitter.1d4.us/supernaturepics"),
+ ("https://nitter.kavin.rocks/supernaturepics"),
+ ("https://nitter.unixfox.eu/supernaturepics"),
+ )
+
+ def tweets(self):
+ return self._pagination("/" + self.user)
+
+
+class NitterRepliesExtractor(NitterExtractor):
+ subcategory = "replies"
+ pattern = BASE_PATTERN + r"/([^/?#]+)/with_replies"
+ test = (
+ ("https://nitter.net/supernaturepics/with_replies", {
+ "pattern": r"https://nitter\.net/pic/orig"
+ r"/media%2F[\w-]+\.(jpg|png)$",
+ "range": "1-20",
+ }),
+ ("https://nitter.lacontrevoie.fr/supernaturepics/with_replies"),
+ ("https://nitter.pussthecat.org/supernaturepics/with_replies"),
+ ("https://nitter.1d4.us/supernaturepics/with_replies"),
+ ("https://nitter.kavin.rocks/supernaturepics/with_replies"),
+ ("https://nitter.unixfox.eu/supernaturepics/with_replies"),
+ )
+
+ def tweets(self):
+ return self._pagination("/" + self.user + "/with_replies")
+
+
+class NitterMediaExtractor(NitterExtractor):
+ subcategory = "media"
+ pattern = BASE_PATTERN + r"/([^/?#]+)/media"
+ test = (
+ ("https://nitter.net/supernaturepics/media", {
+ "pattern": r"https://nitter\.net/pic/orig"
+ r"/media%2F[\w-]+\.(jpg|png)$",
+ "range": "1-20",
+ }),
+ ("https://nitter.lacontrevoie.fr/supernaturepics/media"),
+ ("https://nitter.pussthecat.org/supernaturepics/media"),
+ ("https://nitter.1d4.us/supernaturepics/media"),
+ ("https://nitter.kavin.rocks/supernaturepics/media"),
+ ("https://nitter.unixfox.eu/supernaturepics/media"),
+ )
+
+ def tweets(self):
+ return self._pagination("/" + self.user + "/media")
+
+
+class NitterSearchExtractor(NitterExtractor):
+ subcategory = "search"
+ pattern = BASE_PATTERN + r"/([^/?#]+)/search"
+ test = (
+ ("https://nitter.net/supernaturepics/search", {
+ "pattern": r"https://nitter\.net/pic/orig"
+ r"/media%2F[\w-]+\.(jpg|png)$",
+ "range": "1-20",
+ }),
+ ("https://nitter.lacontrevoie.fr/supernaturepics/search"),
+ ("https://nitter.pussthecat.org/supernaturepics/search"),
+ ("https://nitter.1d4.us/supernaturepics/search"),
+ ("https://nitter.kavin.rocks/supernaturepics/search"),
+ ("https://nitter.unixfox.eu/supernaturepics/search"),
+ )
+
+ def tweets(self):
+ return self._pagination("/" + self.user + "/search")
+
+
+class NitterTweetExtractor(NitterExtractor):
+ """Extractor for nitter tweets"""
+ subcategory = "tweet"
+ directory_fmt = ("{category}", "{user[name]}")
+ filename_fmt = "{tweet_id}_{num}.{extension}"
+ archive_fmt = "{tweet_id}_{num}"
+ pattern = BASE_PATTERN + r"/[^/?#]+/status/(\d+)"
+ test = (
+ ("https://nitter.net/supernaturepics/status/604341487988576256", {
+ "url": "3f2b64e175bf284aa672c3bb53ed275e470b919a",
+ "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab",
+ }),
+ # 4 images
+ ("https://nitter.lacontrevoie.fr/i/status/894001459754180609", {
+ "url": "9c51b3a4a1114535eb9b168bba97ad95db0d59ff",
+ }),
+ # video
+ ("https://nitter.pussthecat.org/i/status/1065692031626829824", {
+ "pattern": r"ytdl:https://nitter.pussthecat.org/video"
+ r"/B875137EDC8FF/https%3A%2F%2Fvideo.twimg.com%2F"
+ r"ext_tw_video%2F1065691868439007232%2Fpu%2Fpl%2F"
+ r"nv8hUQC1R0SjhzcZ.m3u8%3Ftag%3D5",
+ }),
+ # content with emoji, newlines, hashtags (#338)
+ ("https://nitter.1d4.us/playpokemon/status/1263832915173048321", {
+ "keyword": {"content": (
+ r"re:Gear up for #PokemonSwordShieldEX with special Mystery "
+ "Gifts! \n\nYou’ll be able to receive four Galarian form "
+ "Pokémon with Hidden Abilities, plus some very useful items. "
+ "It’s our \\(Mystery\\) Gift to you, Trainers! \n\n❓🎁➡️ "
+ )},
+ }),
+ # Nitter tweet (#890)
+ ("https://nitter.kavin.rocks/ed1conf/status/1163841619336007680", {
+ "url": "e115bd1c86c660064e392b05269bbcafcd8c8b7a",
+ "content": "f29501e44d88437fe460f5c927b7543fda0f6e34",
+ }),
+ )
+
+ def tweets(self):
+ url = "{}/i/status/{}".format(self.root, self.user)
+ return (self.request(url).text,)
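
The new `_tweet_from_html()` recovers IDs and filenames with plain string partitioning rather than regexes. A worked trace of the two expressions, using an illustrative permalink and media URL:

    link = "/supernaturepics/status/604341487988576256#m"
    link.rpartition("/")[2].partition("#")[0]
    # -> "604341487988576256" (the tweet_id)

    url = "ytdl:https://nitter.net/video/ABC/...%2Fnv8hUQC1R0SjhzcZ.m3u8%3Ftag%3D5"
    url.rpartition("%2F")[2].partition(".")[0]
    # -> "nv8hUQC1R0SjhzcZ" (the video filename)
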
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index 35a015f..59c5f15 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -95,7 +95,7 @@ class PatreonExtractor(Extractor):
if content:
for img in text.extract_iter(
content, '<img data-media-id="', '>'):
- url = text.extract(img, 'src="', '"')[0]
+ url = text.extr(img, 'src="', '"')
if url:
yield "content", url, self._filename(url) or url
@@ -181,7 +181,7 @@ class PatreonExtractor(Extractor):
"""Fetch filename from an URL's Content-Disposition header"""
response = self.request(url, method="HEAD", fatal=False)
cd = response.headers.get("Content-Disposition")
- return text.extract(cd, 'filename="', '"')[0]
+ return text.extr(cd, 'filename="', '"')
@staticmethod
def _filehash(url):
@@ -284,7 +284,7 @@ class PatreonCreatorExtractor(PatreonExtractor):
url = "{}/{}/posts".format(self.root, self.creator)
page = self.request(url, notfound="creator").text
- campaign_id = text.extract(page, "/campaign/", "/")[0]
+ campaign_id = text.extr(page, "/campaign/", "/")
if not campaign_id:
raise exception.NotFoundError("creator")
diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py
index 225f0ff..fc85125 100644
--- a/gallery_dl/extractor/philomena.py
+++ b/gallery_dl/extractor/philomena.py
@@ -27,10 +27,6 @@ class PhilomenaExtractor(BooruExtractor):
def _prepare(post):
post["date"] = text.parse_datetime(post["created_at"])
- @staticmethod
- def _extended_tags(post):
- pass
-
def _pagination(self, url, params):
params["page"] = 1
params["per_page"] = self.per_page
diff --git a/gallery_dl/extractor/photobucket.py b/gallery_dl/extractor/photobucket.py
index 1993ab6..375b5e3 100644
--- a/gallery_dl/extractor/photobucket.py
+++ b/gallery_dl/extractor/photobucket.py
@@ -75,7 +75,7 @@ class PhotobucketAlbumExtractor(Extractor):
page = self.request(url, params=params).text
json_data = text.extract(page, "collectionData:", ",\n")[0]
if not json_data:
- msg = text.extract(page, 'libraryPrivacyBlock">', "</div>")[0]
+ msg = text.extr(page, 'libraryPrivacyBlock">', "</div>")
msg = ' ("{}")'.format(text.remove_html(msg)) if msg else ""
self.log.error("Unable to get JSON data%s", msg)
return
diff --git a/gallery_dl/extractor/pillowfort.py b/gallery_dl/extractor/pillowfort.py
index bdd9f21..841a99b 100644
--- a/gallery_dl/extractor/pillowfort.py
+++ b/gallery_dl/extractor/pillowfort.py
@@ -98,7 +98,7 @@ class PillowfortExtractor(Extractor):
url = "https://www.pillowfort.social/users/sign_in"
page = self.request(url).text
- auth = text.extract(page, 'name="authenticity_token" value="', '"')[0]
+ auth = text.extr(page, 'name="authenticity_token" value="', '"')
headers = {"Origin": self.root, "Referer": url}
data = {
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index e3a96bd..fc092f1 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -462,6 +462,9 @@ class PixivRankingExtractor(PixivExtractor):
("https://www.pixiv.net/ranking.php?mode=daily&date=20170818"),
("https://www.pixiv.net/ranking.php"),
("https://touch.pixiv.net/ranking.php"),
+ ("https://www.pixiv.net/ranking.php?mode=unknown", {
+ "exception": exception.StopExtraction,
+ }),
)
def __init__(self, match):
@@ -479,6 +482,8 @@ class PixivRankingExtractor(PixivExtractor):
mode_map = {
"daily": "day",
"daily_r18": "day_r18",
+ "daily_ai": "day_ai",
+ "daily_r18_ai": "day_r18_ai",
"weekly": "week",
"weekly_r18": "week_r18",
"monthly": "month",
@@ -490,10 +495,10 @@ class PixivRankingExtractor(PixivExtractor):
"rookie": "week_rookie",
"r18g": "week_r18g",
}
- if mode not in mode_map:
- self.log.warning("invalid mode '%s'", mode)
- mode = "daily"
- self.mode = mode_map[mode]
+ try:
+ self.mode = mode = mode_map[mode]
+ except KeyError:
+ raise exception.StopExtraction("Invalid mode '%s'", mode)
date = query.get("date")
if date:
@@ -525,6 +530,15 @@ class PixivSearchExtractor(PixivExtractor):
"range": "1-10",
"count": 10,
}),
+ ("https://pixiv.net/en/tags/foo/artworks?order=week&s_mode=s_tag", {
+ "exception": exception.StopExtraction,
+ }),
+ ("https://pixiv.net/en/tags/foo/artworks?order=date&s_mode=tag", {
+ "exception": exception.StopExtraction,
+ }),
+ ("https://www.pixiv.net/search.php?s_mode=s_tag&name=Original", {
+ "exception": exception.StopExtraction,
+ }),
("https://www.pixiv.net/en/tags/foo/artworks?order=date&s_mode=s_tag"),
("https://www.pixiv.net/search.php?s_mode=s_tag&word=Original"),
("https://touch.pixiv.net/search.php?word=Original"),
@@ -546,19 +560,20 @@ class PixivSearchExtractor(PixivExtractor):
if self.word:
self.word = text.unquote(self.word)
else:
- if "word" not in query:
+ try:
+ self.word = query["word"]
+ except KeyError:
raise exception.StopExtraction("Missing search term")
- self.word = query["word"]
sort = query.get("order", "date_d")
sort_map = {
"date": "date_asc",
"date_d": "date_desc",
}
- if sort not in sort_map:
- self.log.warning("invalid sort order '%s'", sort)
- sort = "date_d"
- self.sort = sort_map[sort]
+ try:
+ self.sort = sort = sort_map[sort]
+ except KeyError:
+ raise exception.StopExtraction("Invalid search order '%s'", sort)
target = query.get("s_mode", "s_tag_full")
target_map = {
@@ -566,10 +581,10 @@ class PixivSearchExtractor(PixivExtractor):
"s_tag_full": "exact_match_for_tags",
"s_tc": "title_and_caption",
}
- if target not in target_map:
- self.log.warning("invalid search target '%s'", target)
- target = "s_tag_full"
- self.target = target_map[target]
+ try:
+ self.target = target = target_map[target]
+ except KeyError:
+ raise exception.StopExtraction("Invalid search mode '%s'", target)
self.date_start = query.get("scd")
self.date_end = query.get("ecd")
@@ -638,7 +653,7 @@ class PixivPixivisionExtractor(PixivExtractor):
headers = {"User-Agent": "Mozilla/5.0"}
self.page = self.request(url, headers=headers).text
- title = text.extract(self.page, '<title>', '<')[0]
+ title = text.extr(self.page, '<title>', '<')
return {
"pixivision_id" : self.pixivision_id,
"pixivision_title": text.unescape(title),
@@ -692,7 +707,7 @@ class PixivSeriesExtractor(PixivExtractor):
series = body["extraData"]["meta"]
series["id"] = self.series_id
series["total"] = page["total"]
- series["title"] = text.extract(series["title"], '"', '"')[0]
+ series["title"] = text.extr(series["title"], '"', '"')
for info in page["series"]:
work = self.api.illust_detail(info["workId"])
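
All three pixiv lookups (ranking mode, search order, search mode) move from warn-and-fall-back to a hard stop on unknown values, which the new `exception.StopExtraction` test URLs above exercise. The shared pattern, shown for the sort order:

    sort_map = {"date": "date_asc", "date_d": "date_desc"}
    try:
        sort = sort_map[sort]
    except KeyError:
        raise exception.StopExtraction("Invalid search order '%s'", sort)
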
diff --git a/gallery_dl/extractor/pixnet.py b/gallery_dl/extractor/pixnet.py
index a52071e..15be563 100644
--- a/gallery_dl/extractor/pixnet.py
+++ b/gallery_dl/extractor/pixnet.py
@@ -30,7 +30,7 @@ class PixnetExtractor(Extractor):
def items(self):
url = self.url_fmt.format(self.root, self.item_id)
page = self.request(url, encoding="utf-8").text
- user = text.extract(page, '<meta name="author" content="', '";')[0]
+ user = text.extr(page, '<meta name="author" content="', '";')
data = {
"blog": self.blog,
"user": user.rpartition(" (")[0],
@@ -52,13 +52,13 @@ class PixnetExtractor(Extractor):
while True:
yield from text.extract_iter(page, '<li id="', '</li>')
- pnext = text.extract(page, 'class="nextBtn"', '>')[0]
+ pnext = text.extr(page, 'class="nextBtn"', '>')
- if pnext is None and 'name="albumpass">' in page:
+ if not pnext and 'name="albumpass">' in page:
raise exception.StopExtraction(
"Album %s is password-protected.", self.item_id)
if "href" not in pnext:
return
- url = self.root + text.extract(pnext, 'href="', '"')[0]
+ url = self.root + text.extr(pnext, 'href="', '"')
page = self.request(url, encoding="utf-8").text
diff --git a/gallery_dl/extractor/pururin.py b/gallery_dl/extractor/pururin.py
index dee7bd4..7e266cc 100644
--- a/gallery_dl/extractor/pururin.py
+++ b/gallery_dl/extractor/pururin.py
@@ -73,8 +73,8 @@ class PururinGalleryExtractor(GalleryExtractor):
url = "{}/read/{}/01/x".format(self.root, self.gallery_id)
page = self.request(url).text
- info = json.loads(binascii.a2b_base64(text.extract(
- page, '<gallery-read encoded="', '"')[0]).decode())
+ info = json.loads(binascii.a2b_base64(text.extr(
+ page, '<gallery-read encoded="', '"')).decode())
self._ext = info["image_extension"]
self._cnt = info["total_pages"]
diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py
index 448dc1b..8b5b6b6 100644
--- a/gallery_dl/extractor/reactor.py
+++ b/gallery_dl/extractor/reactor.py
@@ -109,13 +109,13 @@ class ReactorExtractor(BaseExtractor):
tags.sort()
for image in images:
- url = text.extract(image, ' src="', '"')[0]
+ url = text.extr(image, ' src="', '"')
if not url:
continue
if url.startswith("//"):
url = "http:" + url
- width = text.extract(image, ' width="', '"')[0]
- height = text.extract(image, ' height="', '"')[0]
+ width = text.extr(image, ' width="', '"')
+ height = text.extr(image, ' height="', '"')
image_id = url.rpartition("-")[2].partition(".")[0]
num += 1
@@ -125,7 +125,7 @@ class ReactorExtractor(BaseExtractor):
url = url.replace("/post/", "/post/full/")
if self.gif and ("/post/webm/" in url or "/post/mp4/" in url):
- gif_url = text.extract(image, '<a href="', '"')[0]
+ gif_url = text.extr(image, '<a href="', '"')
if not gif_url:
continue
url = gif_url
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index 3396e3a..7013f1b 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -63,7 +63,7 @@ class SankakuExtractor(BooruExtractor):
def _check_expired(self, response):
return not response.history or '.com/expired.png' not in response.url
- def _extended_tags(self, post):
+ def _tags(self, post, page):
tags = collections.defaultdict(list)
types = self.TAG_TYPES
for tag in post["tags"]:
@@ -306,7 +306,7 @@ class SankakuAPI():
url = post["file_url"]
if url:
expires = text.parse_int(
- text.extract(url, "e=", "&")[0]) - 60
+ text.extr(url, "e=", "&")) - 60
if 0 < expires <= time():
self.extractor.log.debug("Refreshing download URLs")
diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py
index 830274a..aa6726d 100644
--- a/gallery_dl/extractor/sexcom.py
+++ b/gallery_dl/extractor/sexcom.py
@@ -43,7 +43,7 @@ class SexcomExtractor(Extractor):
yield self.root + href
pager = extr('id="pagenum"', '</div>')
- url = text.extract(pager, ' href="', '"')[0]
+ url = text.extr(pager, ' href="', '"')
if not url:
return
url = text.urljoin(self.root, url)
@@ -71,7 +71,7 @@ class SexcomExtractor(Extractor):
info = extr("player.updateSrc(", ");")
if info:
- path = text.extract(info, "src: '", "'")[0]
+ path = text.extr(info, "src: '", "'")
data["filename"] = path.rpartition("/")[2]
data["extension"] = "mp4"
if "'HD'" in info:
@@ -79,8 +79,8 @@ class SexcomExtractor(Extractor):
data["url"] = self.root + path
else:
iframe = extr('<iframe', '>')
- src = (text.extract(iframe, ' src="', '"')[0] or
- text.extract(iframe, " src='", "'")[0])
+ src = (text.extr(iframe, ' src="', '"') or
+ text.extr(iframe, " src='", "'"))
if not src:
self.log.warning("Unable to fetch media from %s", url)
return None
diff --git a/gallery_dl/extractor/simplyhentai.py b/gallery_dl/extractor/simplyhentai.py
index 580e917..b5d116f 100644
--- a/gallery_dl/extractor/simplyhentai.py
+++ b/gallery_dl/extractor/simplyhentai.py
@@ -111,7 +111,7 @@ class SimplyhentaiImageExtractor(Extractor):
url = extr('&quot;image&quot;:&quot;' , '&')
url = extr("&quot;content&quot;:&quot;", "&") or url
- tags = text.extract(descr, " tagged with ", " online for free ")[0]
+ tags = text.extr(descr, " tagged with ", " online for free ")
if tags:
tags = tags.split(", ")
tags[-1] = tags[-1].partition(" ")[2]
@@ -176,7 +176,7 @@ class SimplyhentaiVideoExtractor(Extractor):
embed_url = text.extract(page, 'src="', '"', pos)[0].replace(
"embedplayer.php?link=", "embed.php?name=")
embed_page = self.request(embed_url).text
- video_url = text.extract(embed_page, '"file":"', '"')[0]
+ video_url = text.extr(embed_page, '"file":"', '"')
title, _, episode = title.rpartition(" Episode ")
if video_url.startswith("//"):
diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py
index d2e298c..ea39c5e 100644
--- a/gallery_dl/extractor/subscribestar.py
+++ b/gallery_dl/extractor/subscribestar.py
@@ -89,23 +89,23 @@ class SubscribestarExtractor(Extractor):
def _media_from_post(html):
media = []
- gallery = text.extract(html, 'data-gallery="', '"')[0]
+ gallery = text.extr(html, 'data-gallery="', '"')
if gallery:
media.extend(
item for item in json.loads(text.unescape(gallery))
if "/previews/" not in item["url"]
)
- attachments = text.extract(
- html, 'class="uploads-docs"', 'data-role="post-edit_form"')[0]
+ attachments = text.extr(
+ html, 'class="uploads-docs"', 'data-role="post-edit_form"')
if attachments:
for att in attachments.split('class="doc_preview"')[1:]:
media.append({
- "id" : text.parse_int(text.extract(
- att, 'data-upload-id="', '"')[0]),
- "name": text.unescape(text.extract(
- att, 'doc_preview-title">', '<')[0] or ""),
- "url" : text.unescape(text.extract(att, 'href="', '"')[0]),
+ "id" : text.parse_int(text.extr(
+ att, 'data-upload-id="', '"')),
+ "name": text.unescape(text.extr(
+ att, 'doc_preview-title">', '<')),
+ "url" : text.unescape(text.extr(att, 'href="', '"')),
"type": "attachment",
})
@@ -175,7 +175,7 @@ class SubscribestarUserExtractor(SubscribestarExtractor):
return
yield from posts
- url = text.extract(posts[-1], needle_next_page, '"')[0]
+ url = text.extr(posts[-1], needle_next_page, '"')
if not url:
return
page = self.request(self.root + text.unescape(url)).json()["html"]
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index 5451f6e..c75952a 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -203,6 +203,15 @@ class TumblrExtractor(Extractor):
def _prepare_image(url, post):
text.nameext_from_url(url, post)
+ # try ".gifv" (#3095)
+ # it's unknown whether all gifs in this case are actually webps
+ # incorrect extensions will be corrected by 'adjust-extensions'
+ if post["extension"] == "gif":
+ post["_fallback"] = (url + "v",)
+ post["_http_headers"] = {"Accept": # copied from chrome 106
+ "image/avif,image/webp,image/apng,"
+ "image/svg+xml,image/*,*/*;q=0.8"}
+
parts = post["filename"].split("_")
try:
post["hash"] = parts[1] if parts[1] != "inline" else parts[2]
@@ -248,7 +257,7 @@ class TumblrExtractor(Extractor):
except Exception:
return resized, True
else:
- updated = text.extract(response.text, '" src="', '"')[0]
+ updated = text.extr(response.text, '" src="', '"')
return updated, (resized == updated)
def _original_image_fallback(self, url, post_id):
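
The ".gifv" handling works through two per-file kwdict keys: `_fallback`, an alternative-URL tuple presumably retried by the HTTP downloader when the primary URL fails, and `_http_headers`, extra request headers for this file only. Tracing `_prepare_image()` with a hypothetical URL:

    # url = "https://64.media.tumblr.com/xyz/tumblr_abc_1280.gif"
    post["extension"]     # "gif"
    post["_fallback"]     # ("https://64.media.tumblr.com/xyz/tumblr_abc_1280.gifv",)
    post["_http_headers"] # the Chrome 106 Accept header quoted above
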
diff --git a/gallery_dl/extractor/tumblrgallery.py b/gallery_dl/extractor/tumblrgallery.py
index e790613..6940f3e 100644
--- a/gallery_dl/extractor/tumblrgallery.py
+++ b/gallery_dl/extractor/tumblrgallery.py
@@ -46,7 +46,7 @@ class TumblrgalleryTumblrblogExtractor(TumblrgalleryExtractor):
def metadata(self, page):
return {
- "title" : text.unescape(text.extract(page, "<h1>", "</h1>"))[0],
+ "title" : text.unescape(text.extr(page, "<h1>", "</h1>")),
"gallery_id": self.gallery_id,
}
@@ -82,7 +82,7 @@ class TumblrgalleryPostExtractor(TumblrgalleryExtractor):
def metadata(self, page):
return {
"title" : text.remove_html(
- text.unescape(text.extract(page, "<title>", "</title>")[0])
+ text.unescape(text.extr(page, "<title>", "</title>"))
).replace("_", "-"),
"gallery_id": self.gallery_id,
}
@@ -127,12 +127,12 @@ class TumblrgallerySearchExtractor(TumblrgalleryExtractor):
data = self._data_from_url(url)
data["gallery_id"] = gallery_id
data["title"] = text.remove_html(text.unescape(
- text.extract(post_page, "<title>", "</title>")[0]
+ text.extr(post_page, "<title>", "</title>")
)).replace("_", "-")
yield url, data
- next_url = text.extract(
- page, '</span> <a class="btn btn-primary" href="', '"')[0]
+ next_url = text.extr(
+ page, '</span> <a class="btn btn-primary" href="', '"')
if not next_url or page_url == next_url:
return
page_url = next_url
diff --git a/gallery_dl/extractor/twibooru.py b/gallery_dl/extractor/twibooru.py
index 93fa039..f010f92 100644
--- a/gallery_dl/extractor/twibooru.py
+++ b/gallery_dl/extractor/twibooru.py
@@ -83,7 +83,7 @@ class TwibooruPostExtractor(TwibooruExtractor):
"tag_ids": list,
"tags": list,
"thumbnails_generated": True,
- "updated_at": "2022-05-13T00:43:19.791Z",
+ "updated_at": "2022-09-21T14:31:50.441Z",
"upvotes": int,
"view_url": "https://cdn.twibooru.org/img/2020/7/8/1/full.png",
"width": 576,
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index ba0597e..3dbadaa 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -14,10 +14,7 @@ from ..cache import cache
import itertools
import json
-BASE_PATTERN = (
- r"(?:https?://)?(?:www\.|mobile\.)?"
- r"(?:(?:[fv]x)?twitter\.com|nitter\.net)"
-)
+BASE_PATTERN = r"(?:https?://)?(?:www\.|mobile\.)?(?:[fv]x)?twitter\.com"
class TwitterExtractor(Extractor):
@@ -227,8 +224,8 @@ class TwitterExtractor(Extractor):
response = self.request(url, fatal=False)
if response.status_code >= 400:
continue
- url = text.extract(
- response.text, 'name="twitter:image" value="', '"')[0]
+ url = text.extr(
+ response.text, 'name="twitter:image" value="', '"')
if url:
files.append({"url": url})
@@ -377,6 +374,24 @@ class TwitterExtractor(Extractor):
except Exception:
yield tweet
+ def _make_tweet(self, user, id_str, url, timestamp):
+ return {
+ "created_at": text.parse_timestamp(timestamp).strftime(
+ "%a %b %d %H:%M:%S +0000 %Y"),
+ "id_str": id_str,
+ "lang": None,
+ "user": user,
+ "entities": {},
+ "extended_entities": {
+ "media": [
+ {
+ "original_info": {},
+ "media_url": url,
+ },
+ ],
+ },
+ }
+
def metadata(self):
"""Return general metadata"""
return {}
@@ -388,44 +403,7 @@ class TwitterExtractor(Extractor):
if not self._check_cookies(self.cookienames):
username, password = self._get_auth_info()
if username:
- self._update_cookies(self._login_impl(username, password))
-
- @cache(maxage=360*24*3600, keyarg=1)
- def _login_impl(self, username, password):
- self.log.info("Logging in as %s", username)
-
- token = util.generate_token()
- self.session.cookies.clear()
- self.request(self.root + "/login")
-
- url = self.root + "/sessions"
- cookies = {
- "_mb_tk": token,
- }
- data = {
- "redirect_after_login" : "/",
- "remember_me" : "1",
- "authenticity_token" : token,
- "wfa" : "1",
- "ui_metrics" : "{}",
- "session[username_or_email]": username,
- "session[password]" : password,
- }
- response = self.request(
- url, method="POST", cookies=cookies, data=data)
-
- if "/account/login_verification" in response.url:
- raise exception.AuthenticationError(
- "Login with two-factor authentication is not supported")
-
- cookies = {
- cookie.name: cookie.value
- for cookie in self.session.cookies
- }
-
- if "/error" in response.url or "auth_token" not in cookies:
- raise exception.AuthenticationError()
- return cookies
+ self._update_cookies(_login_impl(self, username, password))
class TwitterTimelineExtractor(TwitterExtractor):
@@ -727,11 +705,6 @@ class TwitterTweetExtractor(TwitterExtractor):
"pattern": r"https://\w+.cloudfront.net/photos/large/\d+.jpg",
"count": 3,
}),
- # Nitter tweet (#890)
- ("https://nitter.net/ed1conf/status/1163841619336007680", {
- "url": "4a9ea898b14d3c112f98562d0df75c9785e239d9",
- "content": "f29501e44d88437fe460f5c927b7543fda0f6e34",
- }),
# Twitter card (#1005)
("https://twitter.com/billboard/status/1306599586602135555", {
"options": (("cards", True),),
@@ -850,6 +823,76 @@ class TwitterTweetExtractor(TwitterExtractor):
return itertools.chain(buffer, tweets)
+class TwitterAvatarExtractor(TwitterExtractor):
+ subcategory = "avatar"
+ filename_fmt = "avatar {date}.{extension}"
+ archive_fmt = "AV_{user[id]}_{date}"
+ pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/photo"
+ test = (
+ ("https://twitter.com/supernaturepics/photo", {
+ "pattern": r"https://pbs\.twimg\.com/profile_images"
+ r"/554585280938659841/FLVAlX18\.jpeg",
+ "keyword": {
+ "date": "dt:2015-01-12 10:26:49",
+ "extension": "jpeg",
+ "filename": "FLVAlX18",
+ "tweet_id": 554585280938659841,
+ },
+ }),
+ ("https://twitter.com/User16/photo", {
+ "count": 0,
+ }),
+ )
+
+ def tweets(self):
+ self.api._user_id_by_screen_name(self.user)
+ user = self._user_obj
+ url = user["legacy"]["profile_image_url_https"]
+
+ if url == ("https://abs.twimg.com/sticky"
+ "/default_profile_images/default_profile_normal.png"):
+ return ()
+
+ url = url.replace("_normal.", ".")
+ id_str = url.rsplit("/", 2)[1]
+ timestamp = ((int(id_str) >> 22) + 1288834974657) // 1000
+
+ return (self._make_tweet(user, id_str, url, timestamp),)
+
+
+class TwitterBackgroundExtractor(TwitterExtractor):
+ subcategory = "background"
+ filename_fmt = "background {date}.{extension}"
+ archive_fmt = "BG_{user[id]}_{date}"
+ pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/header_photo"
+ test = (
+ ("https://twitter.com/supernaturepics/header_photo", {
+ "pattern": r"https://pbs\.twimg\.com/profile_banners"
+ r"/2976459548/1421058583",
+ "keyword": {
+ "date": "dt:2015-01-12 10:29:43",
+ "filename": "1421058583",
+ "tweet_id": 0,
+ },
+ }),
+ ("https://twitter.com/User16/header_photo", {
+ "count": 0,
+ }),
+ )
+
+ def tweets(self):
+ self.api._user_id_by_screen_name(self.user)
+ user = self._user_obj
+
+ try:
+ url = user["legacy"]["profile_banner_url"]
+ _, timestamp = url.rsplit("/", 1)
+ except (KeyError, ValueError):
+ return ()
+
+ return (self._make_tweet(user, None, url, timestamp),)
+
+
class TwitterImageExtractor(Extractor):
category = "twitter"
subcategory = "image"
@@ -1021,7 +1064,7 @@ class TwitterAPI():
"count": 100,
}
return self._pagination_tweets(
- endpoint, variables, ("bookmark_timeline", "timeline"))
+ endpoint, variables, ("bookmark_timeline", "timeline"), False)
def list_latest_tweets_timeline(self, list_id):
endpoint = "/graphql/z3l-EHlx-fyg8OvGO4JN8A/ListLatestTweetsTimeline"
@@ -1253,7 +1296,8 @@ class TwitterAPI():
return
params["cursor"] = cursor
- def _pagination_tweets(self, endpoint, variables, path=None):
+ def _pagination_tweets(self, endpoint, variables,
+ path=None, stop_tweets=True):
extr = self.extractor
variables.update(self.variables)
original_retweets = (extr.retweets == "original")
@@ -1397,7 +1441,9 @@ class TwitterAPI():
tweet.get("rest_id"))
continue
- if not tweet or not cursor:
+ if stop_tweets and not tweet:
+ return
+ if not cursor or cursor == variables.get("cursor"):
return
variables["cursor"] = cursor
@@ -1456,8 +1502,8 @@ class TwitterAPI():
self.extractor.log.debug("Skipping %s (\"%s\")", tweet_id, text)
def _syndication_tweet(self, tweet_id):
- tweet = self.extractor.request(
- "https://cdn.syndication.twimg.com/tweet?id=" + tweet_id).json()
+ base_url = "https://cdn.syndication.twimg.com/tweet-result?id="
+ tweet = self.extractor.request(base_url + tweet_id).json()
tweet["user"]["description"] = ""
tweet["user"]["entities"] = {"description": {}}
@@ -1500,3 +1546,174 @@ class TwitterAPI():
"core" : {"user_results": {"result": tweet["user"]}},
"_retweet_id_str": retweet_id,
}
+
+
+@cache(maxage=360*86400, keyarg=1)
+def _login_impl(extr, username, password):
+
+ import re
+ import random
+
+ if re.fullmatch(r"[\w.%+-]+@[\w.-]+\.\w{2,}", username):
+ extr.log.warning(
+ "Login with email is no longer possible. "
+ "You need to provide your username or phone number instead.")
+
+ extr.log.info("Logging in as %s", username)
+
+ def process(response):
+ try:
+ data = response.json()
+ except ValueError:
+ data = {"errors": ({"message": "Invalid response"},)}
+ else:
+ if response.status_code < 400:
+ return data["flow_token"]
+
+ errors = []
+ for error in data.get("errors") or ():
+ msg = error.get("message")
+ errors.append('"{}"'.format(msg) if msg else "Unknown error")
+ extr.log.debug(response.text)
+ raise exception.AuthenticationError(", ".join(errors))
+
+ extr.session.cookies.clear()
+ api = TwitterAPI(extr)
+ headers = api.headers
+ headers["Referer"] = "https://twitter.com/i/flow/login"
+
+ # init
+ data = {
+ "input_flow_data": {
+ "flow_context": {
+ "debug_overrides": {},
+ "start_location": {"location": "unknown"},
+ },
+ },
+ "subtask_versions": {
+ "action_list": 2,
+ "alert_dialog": 1,
+ "app_download_cta": 1,
+ "check_logged_in_account": 1,
+ "choice_selection": 3,
+ "contacts_live_sync_permission_prompt": 0,
+ "cta": 7,
+ "email_verification": 2,
+ "end_flow": 1,
+ "enter_date": 1,
+ "enter_email": 2,
+ "enter_password": 5,
+ "enter_phone": 2,
+ "enter_recaptcha": 1,
+ "enter_text": 5,
+ "enter_username": 2,
+ "generic_urt": 3,
+ "in_app_notification": 1,
+ "interest_picker": 3,
+ "js_instrumentation": 1,
+ "menu_dialog": 1,
+ "notifications_permission_prompt": 2,
+ "open_account": 2,
+ "open_home_timeline": 1,
+ "open_link": 1,
+ "phone_verification": 4,
+ "privacy_options": 1,
+ "security_key": 3,
+ "select_avatar": 4,
+ "select_banner": 2,
+ "settings_list": 7,
+ "show_code": 1,
+ "sign_up": 2,
+ "sign_up_review": 4,
+ "tweet_selection_urt": 1,
+ "update_users": 1,
+ "upload_media": 1,
+ "user_recommendations_list": 4,
+ "user_recommendations_urt": 1,
+ "wait_spinner": 3,
+ "web_modal": 1,
+ },
+ }
+ url = "https://twitter.com/i/api/1.1/onboarding/task.json?flow_name=login"
+ response = extr.request(url, method="POST", headers=headers, json=data)
+
+ data = {
+ "flow_token": process(response),
+ "subtask_inputs": [
+ {
+ "subtask_id": "LoginJsInstrumentationSubtask",
+ "js_instrumentation": {
+ "response": "{}",
+ "link": "next_link",
+ },
+ },
+ ],
+ }
+ url = "https://twitter.com/i/api/1.1/onboarding/task.json"
+ response = extr.request(
+ url, method="POST", headers=headers, json=data, fatal=None)
+
+ # username
+ data = {
+ "flow_token": process(response),
+ "subtask_inputs": [
+ {
+ "subtask_id": "LoginEnterUserIdentifierSSO",
+ "settings_list": {
+ "setting_responses": [
+ {
+ "key": "user_identifier",
+ "response_data": {
+ "text_data": {"result": username},
+ },
+ },
+ ],
+ "link": "next_link",
+ },
+ },
+ ],
+ }
+ # url = "https://twitter.com/i/api/1.1/onboarding/task.json"
+ extr.sleep(random.uniform(2.0, 4.0), "login (username)")
+ response = extr.request(
+ url, method="POST", headers=headers, json=data, fatal=None)
+
+ # password
+ data = {
+ "flow_token": process(response),
+ "subtask_inputs": [
+ {
+ "subtask_id": "LoginEnterPassword",
+ "enter_password": {
+ "password": password,
+ "link": "next_link",
+ },
+ },
+ ],
+ }
+ # url = "https://twitter.com/i/api/1.1/onboarding/task.json"
+ extr.sleep(random.uniform(2.0, 4.0), "login (password)")
+ response = extr.request(
+ url, method="POST", headers=headers, json=data, fatal=None)
+
+ # account duplication check ?
+ data = {
+ "flow_token": process(response),
+ "subtask_inputs": [
+ {
+ "subtask_id": "AccountDuplicationCheck",
+ "check_logged_in_account": {
+ "link": "AccountDuplicationCheck_false",
+ },
+ },
+ ],
+ }
+ # url = "https://twitter.com/i/api/1.1/onboarding/task.json"
+ response = extr.request(
+ url, method="POST", headers=headers, json=data, fatal=None)
+ process(response)
+
+ return {
+ cookie.name: cookie.value
+ for cookie in extr.session.cookies
+ }
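
The avatar extractor above derives its timestamp from the media ID via Twitter's snowflake layout: bits 22 and up hold milliseconds since the Twitter epoch, 1288834974657 ms after the Unix epoch. Checking against the test data:

    id_str = "554585280938659841"  # from the avatar test pattern
    ((int(id_str) >> 22) + 1288834974657) // 1000
    # -> 1421058409, i.e. 2015-01-12 10:26:49 UTC, the test's "date" keyword
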
diff --git a/gallery_dl/extractor/uploadir.py b/gallery_dl/extractor/uploadir.py
new file mode 100644
index 0000000..bd18c0a
--- /dev/null
+++ b/gallery_dl/extractor/uploadir.py
@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://uploadir.com/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class UploadirFileExtractor(Extractor):
+ """Extractor for uploadir files"""
+ category = "uploadir"
+ subcategory = "file"
+ root = "https://uploadir.com"
+ filename_fmt = "{filename} ({id}).{extension}"
+ archive_fmt = "{id}"
+ pattern = r"(?:https?://)?uploadir\.com/(?:user/)?u(?:ploads)?/([^/?#]+)"
+ test = (
+ # image
+ ("https://uploadir.com/u/rd3t46ry", {
+ "pattern": r"https://uploadir\.com/u/rd3t46ry",
+ "count": 1,
+ "keyword": {
+ "extension": "jpg",
+ "filename": "Chloe and Rachel 4K jpg",
+ "id": "rd3t46ry",
+ },
+ }),
+ # archive
+ ("https://uploadir.com/uploads/gxe8ti9v/downloads/new", {
+ "pattern": r"https://uploadir\.com/uploads/gxe8ti9v/downloads",
+ "count": 1,
+ "keyword": {
+ "extension": "zip",
+ "filename": "NYAN-Mods-Pack#1",
+ "id": "gxe8ti9v",
+ },
+ }),
+ # utf-8 filename
+ ("https://uploadir.com/u/fllda6xl", {
+ "pattern": r"https://uploadir\.com/u/fllda6xl",
+ "count": 1,
+ "keyword": {
+ "extension": "png",
+ "filename": "_圖片_🖼_image_",
+ "id": "fllda6xl",
+ },
+ }),
+ ("https://uploadir.com/uploads/rd3t46ry"),
+ ("https://uploadir.com/user/uploads/rd3t46ry"),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.file_id = match.group(1)
+
+ def items(self):
+ url = "{}/u/{}".format(self.root, self.file_id)
+ response = self.request(url, method="HEAD", allow_redirects=False)
+
+ if 300 <= response.status_code < 400:
+ url = response.headers["Location"]
+ extr = text.extract_from(self.request(url).text)
+
+ name = text.unescape(extr("<h2>", "</h2>").strip())
+ url = self.root + extr('class="form" action="', '"')
+ token = extr('name="authenticity_token" value="', '"')
+
+ data = text.nameext_from_url(name, {
+ "_http_method": "POST",
+ "_http_data" : {
+ "authenticity_token": token,
+ "upload_id": self.file_id,
+ },
+ })
+
+ else:
+ hcd = response.headers.get("Content-Disposition")
+ name = (hcd.partition("filename*=UTF-8''")[2] or
+ text.extr(hcd, 'filename="', '"'))
+ data = text.nameext_from_url(name)
+
+ data["id"] = self.file_id
+ yield Message.Directory, data
+ yield Message.Url, url, data
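
Like the tumblr hunk earlier, this extractor steers the downloader through kwdict keys: `_http_method` and `_http_data` presumably turn the download request into a POST with the given form data. For an archive link, the yielded kwdict would look roughly like this (token value hypothetical):

    {
        "filename": "NYAN-Mods-Pack#1", "extension": "zip", "id": "gxe8ti9v",
        "_http_method": "POST",
        "_http_data": {"authenticity_token": "<token>", "upload_id": "gxe8ti9v"},
    }
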
diff --git a/gallery_dl/extractor/vanillarock.py b/gallery_dl/extractor/vanillarock.py
index 3d934b2..6b1178e 100644
--- a/gallery_dl/extractor/vanillarock.py
+++ b/gallery_dl/extractor/vanillarock.py
@@ -44,7 +44,7 @@ class VanillarockPostExtractor(VanillarockExtractor):
img = extr('<div class="main-img">', '</div>')
if not img:
break
- imgs.append(text.extract(img, 'href="', '"')[0])
+ imgs.append(text.extr(img, 'href="', '"'))
data = {
"count": len(imgs),
@@ -89,5 +89,5 @@ class VanillarockTagExtractor(VanillarockExtractor):
post = extr('<h2 class="entry-title">', '</h2>')
if not post:
break
- yield Message.Queue, text.extract(post, 'href="', '"')[0], data
+ yield Message.Queue, text.extr(post, 'href="', '"'), data
url = text.unescape(extr('class="next page-numbers" href="', '"'))
diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py
index 668be0f..00389fa 100644
--- a/gallery_dl/extractor/vsco.py
+++ b/gallery_dl/extractor/vsco.py
@@ -69,7 +69,7 @@ class VscoExtractor(Extractor):
def _extract_preload_state(self, url):
page = self.request(url, notfound=self.subcategory).text
- return json.loads(text.extract(page, "__PRELOADED_STATE__ = ", "<")[0])
+ return json.loads(text.extr(page, "__PRELOADED_STATE__ = ", "<"))
def _pagination(self, url, params, token, key, extra=None):
headers = {
diff --git a/gallery_dl/extractor/wallhaven.py b/gallery_dl/extractor/wallhaven.py
index 47451bd..06f1aab 100644
--- a/gallery_dl/extractor/wallhaven.py
+++ b/gallery_dl/extractor/wallhaven.py
@@ -19,6 +19,10 @@ class WallhavenExtractor(Extractor):
archive_fmt = "{id}"
root = "https://wallhaven.cc"
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.api = WallhavenAPI(self)
+
def items(self):
metadata = self.metadata()
for wp in self.wallpapers():
@@ -57,7 +61,8 @@ class WallhavenSearchExtractor(WallhavenExtractor):
("https://wallhaven.cc/search?q=touhou"),
(("https://wallhaven.cc/search?q=id%3A87"
"&categories=111&purity=100&sorting=date_added&order=asc&page=3"), {
- "pattern": r"https://w.wallhaven.cc/full/\w\w/wallhaven-\w+\.\w+",
+ "pattern": (r"https://w\.wallhaven\.cc"
+ r"/full/\w\w/wallhaven-\w+\.\w+"),
"count": "<= 30",
}),
)
@@ -67,7 +72,7 @@ class WallhavenSearchExtractor(WallhavenExtractor):
self.params = text.parse_query(match.group(1))
def wallpapers(self):
- return WallhavenAPI(self).search(self.params.copy())
+ return self.api.search(self.params.copy())
def metadata(self):
return {"search": self.params}
@@ -87,12 +92,30 @@ class WallhavenCollectionExtractor(WallhavenExtractor):
self.username, self.collection_id = match.groups()
def wallpapers(self):
- return WallhavenAPI(self).collection(self.username, self.collection_id)
+ return self.api.collection(self.username, self.collection_id)
def metadata(self):
return {"username": self.username, "collection_id": self.collection_id}
+class WallhavenUserExtractor(WallhavenExtractor):
+ """Extractor for a wallhaven user"""
+ subcategory = "user"
+ pattern = r"(?:https?://)?wallhaven\.cc/user/([^/?#]+)/?$"
+ test = ("https://wallhaven.cc/user/AksumkA/",)
+
+ def __init__(self, match):
+ WallhavenExtractor.__init__(self, match)
+ self.username = match.group(1)
+
+ def items(self):
+ base = "{}/user/{}/".format(self.root, self.username)
+ return self._dispatch_extractors((
+ (WallhavenUploadsExtractor , base + "uploads"),
+ (WallhavenCollectionsExtractor, base + "favorites"),
+ ), ("uploads",))
+
+
class WallhavenCollectionsExtractor(WallhavenExtractor):
"""Extractor for all collections of a wallhaven user"""
subcategory = "collections"
@@ -107,13 +130,38 @@ class WallhavenCollectionsExtractor(WallhavenExtractor):
self.username = match.group(1)
def items(self):
- for collection in WallhavenAPI(self).collections(self.username):
+ for collection in self.api.collections(self.username):
collection["_extractor"] = WallhavenCollectionExtractor
url = "https://wallhaven.cc/user/{}/favorites/{}".format(
self.username, collection["id"])
yield Message.Queue, url, collection
+class WallhavenUploadsExtractor(WallhavenExtractor):
+ """Extractor for all uploads of a wallhaven user"""
+ subcategory = "uploads"
+ directory_fmt = ("{category}", "{username}")
+ archive_fmt = "u_{username}_{id}"
+ pattern = r"(?:https?://)?wallhaven\.cc/user/([^/?#]+)/uploads"
+ test = ("https://wallhaven.cc/user/AksumkA/uploads", {
+ "pattern": (r"https://[^.]+\.wallhaven\.cc"
+ r"/full/\w\w/wallhaven-\w+\.\w+"),
+ "range": "1-100",
+ "count": 100,
+ })
+
+ def __init__(self, match):
+ WallhavenExtractor.__init__(self, match)
+ self.username = match.group(1)
+
+ def wallpapers(self):
+ params = {"q": "@" + self.username}
+ return self.api.search(params.copy())
+
+ def metadata(self):
+ return {"username": self.username}
+
+
class WallhavenImageExtractor(WallhavenExtractor):
"""Extractor for individual wallpaper on wallhaven.cc"""
subcategory = "image"
@@ -121,7 +169,8 @@ class WallhavenImageExtractor(WallhavenExtractor):
r"|w\.wallhaven\.cc/[a-z]+/\w\w/wallhaven-)(\w+)")
test = (
("https://wallhaven.cc/w/01w334", {
- "pattern": "https://[^.]+.wallhaven.cc/full/01/[^-]+-01w334.jpg",
+ "pattern": (r"https://[^.]+\.wallhaven\.cc"
+ r"/full/01/wallhaven-01w334\.jpg"),
"content": "497212679383a465da1e35bd75873240435085a2",
"keyword": {
"id" : "01w334",
@@ -159,7 +208,7 @@ class WallhavenImageExtractor(WallhavenExtractor):
self.wallpaper_id = match.group(1)
def wallpapers(self):
- return (WallhavenAPI(self).info(self.wallpaper_id),)
+ return (self.api.info(self.wallpaper_id),)
class WallhavenAPI():
diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py
index 7f51732..677680f 100644
--- a/gallery_dl/extractor/warosu.py
+++ b/gallery_dl/extractor/warosu.py
@@ -57,8 +57,8 @@ class WarosuThreadExtractor(Extractor):
def get_metadata(self, page):
"""Collect metadata for extractor-job"""
- boardname = text.extract(page, "<title>", "</title>")[0]
- title = text.extract(page, 'filetitle" itemprop="name">', '<')[0]
+ boardname = text.extr(page, "<title>", "</title>")
+ title = text.extr(page, 'filetitle" itemprop="name">', '<')
return {
"board": self.board,
"board_name": boardname.rpartition(" - ")[2],
@@ -68,7 +68,7 @@ class WarosuThreadExtractor(Extractor):
def posts(self, page):
"""Build a list of all post-objects"""
- page = text.extract(page, '<div class="content">', '<table>')[0]
+ page = text.extr(page, '<div class="content">', '<table>')
needle = '<table itemscope itemtype="http://schema.org/Comment">'
return [self.parse(post) for post in page.split(needle)]
diff --git a/gallery_dl/extractor/weasyl.py b/gallery_dl/extractor/weasyl.py
index 599a175..eca4f1a 100644
--- a/gallery_dl/extractor/weasyl.py
+++ b/gallery_dl/extractor/weasyl.py
@@ -225,7 +225,7 @@ class WeasylFavoriteExtractor(WeasylExtractor):
pos = page.index('id="favorites-content"')
if not owner_login:
- owner_login = text.extract(page, '<a href="/~', '"')[0]
+ owner_login = text.extr(page, '<a href="/~', '"')
for submitid in text.extract_iter(page, "/submissions/", "/", pos):
if submitid == lastid:
diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py
index 59f46f0..8a22fcb 100644
--- a/gallery_dl/extractor/webtoons.py
+++ b/gallery_dl/extractor/webtoons.py
@@ -169,7 +169,7 @@ class WebtoonsComicExtractor(WebtoonsBase, Extractor):
@staticmethod
def get_episode_urls(page):
"""Extract and return all episode urls in 'page'"""
- page = text.extract(page, 'id="_listUl"', '</ul>')[0]
+ page = text.extr(page, 'id="_listUl"', '</ul>')
return [
match.group(0)
for match in WebtoonsEpisodeExtractor.pattern.finditer(page)
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
index 189c0c5..55cee14 100644
--- a/gallery_dl/extractor/weibo.py
+++ b/gallery_dl/extractor/weibo.py
@@ -37,6 +37,7 @@ class WeiboExtractor(Extractor):
cookies = _cookie_cache()
if cookies is not None:
self.session.cookies.update(cookies)
+ self.session.headers["Referer"] = self.root + "/"
def request(self, url, **kwargs):
response = Extractor.request(self, url, **kwargs)
@@ -173,7 +174,7 @@ class WeiboExtractor(Extractor):
page = Extractor.request(
self, passport_url, method="POST", headers=headers, data=data).text
- data = json.loads(text.extract(page, "(", ");")[0])["data"]
+ data = json.loads(text.extr(page, "(", ");"))["data"]
passport_url = "https://passport.weibo.com/visitor/visitor"
params = {
diff --git a/gallery_dl/extractor/xhamster.py b/gallery_dl/extractor/xhamster.py
index 146ab04..0125739 100644
--- a/gallery_dl/extractor/xhamster.py
+++ b/gallery_dl/extractor/xhamster.py
@@ -144,8 +144,8 @@ class XhamsterGalleryExtractor(XhamsterExtractor):
def _data(self, url):
page = self.request(url).text
- return json.loads(text.extract(
- page, "window.initials=", "</script>")[0].rstrip("\n\r;"))
+ return json.loads(text.extr(
+ page, "window.initials=", "</script>").rstrip("\n\r;"))
class XhamsterUserExtractor(XhamsterExtractor):
diff --git a/gallery_dl/extractor/xvideos.py b/gallery_dl/extractor/xvideos.py
index 0a55532..10de439 100644
--- a/gallery_dl/extractor/xvideos.py
+++ b/gallery_dl/extractor/xvideos.py
@@ -113,8 +113,8 @@ class XvideosUserExtractor(XvideosBase, Extractor):
def items(self):
url = "{}/profiles/{}".format(self.root, self.user)
page = self.request(url, notfound=self.subcategory).text
- data = json.loads(text.extract(
- page, "xv.conf=", ";</script>")[0])["data"]
+ data = json.loads(text.extr(
+ page, "xv.conf=", ";</script>"))["data"]
if not isinstance(data["galleries"], dict):
return
diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py
index 72cf438..c0d43fe 100644
--- a/gallery_dl/extractor/zerochan.py
+++ b/gallery_dl/extractor/zerochan.py
@@ -127,7 +127,7 @@ class ZerochanTagExtractor(ZerochanExtractor):
while True:
page = self.request(url, params=params).text
- thumbs = text.extract(page, '<ul id="thumbs', '</ul>')[0]
+ thumbs = text.extr(page, '<ul id="thumbs', '</ul>')
extr = text.extract_from(thumbs)
while True:
diff --git a/gallery_dl/job.py b/gallery_dl/job.py
index 2f48ffd..1f65438 100644
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -32,11 +32,8 @@ class Job():
self.pathfmt = None
self.kwdict = {}
self.status = 0
- self.url_key = extr.config("url-metadata")
- path_key = extr.config("path-metadata")
path_proxy = output.PathfmtProxy(self)
-
self._logger_extra = {
"job" : self,
"extractor": extr,
@@ -56,12 +53,16 @@ class Job():
extr.category = pextr.category
extr.subcategory = pextr.subcategory
+ self.metadata_url = extr.config("url-metadata")
+ self.metadata_http = extr.config("http-metadata")
+ metadata_path = extr.config("path-metadata")
+
# user-supplied metadata
kwdict = extr.config("keywords")
if kwdict:
self.kwdict.update(kwdict)
- if path_key:
- self.kwdict[path_key] = path_proxy
+ if metadata_path:
+ self.kwdict[metadata_path] = path_proxy
# predicates
self.pred_url = self._prepare_predicates("image", True)
@@ -120,8 +121,8 @@ class Job():
"""Call the appropriate message handler"""
if msg[0] == Message.Url:
_, url, kwdict = msg
- if self.url_key:
- kwdict[self.url_key] = url
+ if self.metadata_url:
+ kwdict[self.metadata_url] = url
if self.pred_url(url, kwdict):
self.update_kwdict(kwdict)
self.handle_url(url, kwdict)
@@ -132,8 +133,8 @@ class Job():
elif msg[0] == Message.Queue:
_, url, kwdict = msg
- if self.url_key:
- kwdict[self.url_key] = url
+ if self.metadata_url:
+ kwdict[self.metadata_url] = url
if self.pred_queue(url, kwdict):
self.handle_queue(url, kwdict)
@@ -154,6 +155,8 @@ class Job():
extr = self.extractor
kwdict["category"] = extr.category
kwdict["subcategory"] = extr.subcategory
+ if self.metadata_http:
+ kwdict.pop(self.metadata_http, None)
if self.kwdict:
kwdict.update(self.kwdict)
@@ -231,11 +234,14 @@ class DownloadJob(Job):
self.handle_skip()
return
- if pathfmt.exists():
- if archive:
- archive.add(kwdict)
- self.handle_skip()
- return
+ if pathfmt.extension and not self.metadata_http:
+ pathfmt.build_path()
+
+ if pathfmt.exists():
+ if archive:
+ archive.add(kwdict)
+ self.handle_skip()
+ return
if self.sleep:
self.extractor.sleep(self.sleep(), "download")
@@ -283,6 +289,9 @@ class DownloadJob(Job):
if not self.pathfmt:
self.initialize(kwdict)
else:
+ if "post-after" in self.hooks:
+ for callback in self.hooks["post-after"]:
+ callback(self.pathfmt)
self.pathfmt.set_directory(kwdict)
if "post" in self.hooks:
for callback in self.hooks["post"]:
@@ -337,14 +346,20 @@ class DownloadJob(Job):
self._write_unsupported(url)
def handle_finalize(self):
- pathfmt = self.pathfmt
if self.archive:
self.archive.close()
+
+ pathfmt = self.pathfmt
if pathfmt:
+ hooks = self.hooks
+ if "post-after" in hooks:
+ for callback in hooks["post-after"]:
+ callback(pathfmt)
+
self.extractor._store_cookies()
- if "finalize" in self.hooks:
+ if "finalize" in hooks:
status = self.status
- for callback in self.hooks["finalize"]:
+ for callback in hooks["finalize"]:
callback(pathfmt, status)
def handle_skip(self):
@@ -526,12 +541,11 @@ class SimulationJob(DownloadJob):
def handle_url(self, url, kwdict):
if not kwdict["extension"]:
kwdict["extension"] = "jpg"
- self.pathfmt.set_filename(kwdict)
if self.sleep:
self.extractor.sleep(self.sleep(), "download")
if self.archive:
self.archive.add(kwdict)
- self.out.skip(self.pathfmt.path)
+ self.out.skip(self.pathfmt.build_filename(kwdict))
def handle_directory(self, kwdict):
if not self.pathfmt:
@@ -548,6 +562,11 @@ class KeywordJob(Job):
def handle_url(self, url, kwdict):
stdout_write("\nKeywords for filenames and --filter:\n"
"------------------------------------\n")
+
+ if self.metadata_http and url.startswith("http"):
+ kwdict[self.metadata_http] = util.extract_headers(
+ self.extractor.request(url, method="HEAD"))
+
self.print_kwdict(kwdict)
raise exception.StopExtraction()
@@ -605,12 +624,15 @@ class KeywordJob(Job):
self.print_kwdict(value, key + "[", markers)
elif isinstance(value, list):
- if value and isinstance(value[0], dict):
- self.print_kwdict(value[0], key + "[][", markers)
+ if not value:
+ pass
+ elif isinstance(value[0], dict):
+ self.print_kwdict(value[0], key + "[N][", markers)
else:
- write(key + "[]\n")
- for val in value:
- write(" - " + str(val) + "\n")
+ fmt = (" {:>%s} {}\n" % len(str(len(value)))).format
+ write(key + "[N]\n")
+ for idx, val in enumerate(value, 0):
+ write(fmt(idx, val))
else:
# string or number
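
The reworked list output in `print_kwdict` prints the key with an "[N]" suffix and right-aligns indices to the width of the largest one (len(str(len(value))) digits). For a three-element list under the key "tags", the output would be:

    tags[N]
     0 foo
     1 bar
     2 baz
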
diff --git a/gallery_dl/option.py b/gallery_dl/option.py
index 37247a7..4d9a358 100644
--- a/gallery_dl/option.py
+++ b/gallery_dl/option.py
@@ -125,6 +125,11 @@ def build_parser():
help="Client-side IP address to bind to",
)
general.add_argument(
+ "--user-agent",
+ dest="user-agent", metavar="UA", action=ConfigAction,
+ help="User-Agent request header",
+ )
+ general.add_argument(
"--clear-cache",
dest="clear_cache", metavar="MODULE",
help="Delete cached login sessions, cookies, etc. for MODULE "
@@ -263,6 +268,11 @@ def build_parser():
help="Do not download files larger than SIZE (e.g. 500k or 2.5M)",
)
downloader.add_argument(
+ "--chunk-size",
+ dest="chunk-size", metavar="SIZE", action=ConfigAction,
+ help="Size of in-memory data chunks (default: 32k)",
+ )
+ downloader.add_argument(
"--no-part",
dest="part", nargs=0, action=ConfigConstAction, const=False,
help="Do not use .part files",
diff --git a/gallery_dl/path.py b/gallery_dl/path.py
index 28c07c3..e901fb9 100644
--- a/gallery_dl/path.py
+++ b/gallery_dl/path.py
@@ -15,16 +15,16 @@ import functools
from . import util, formatter, exception
WINDOWS = util.WINDOWS
+EXTENSION_MAP = {
+ "jpeg": "jpg",
+ "jpe" : "jpg",
+ "jfif": "jpg",
+ "jif" : "jpg",
+ "jfi" : "jpg",
+}
class PathFormat():
- EXTENSION_MAP = {
- "jpeg": "jpg",
- "jpe" : "jpg",
- "jfif": "jpg",
- "jif" : "jpg",
- "jfi" : "jpg",
- }
def __init__(self, extractor):
config = extractor.config
@@ -78,7 +78,7 @@ class PathFormat():
extension_map = config("extension-map")
if extension_map is None:
- extension_map = self.EXTENSION_MAP
+ extension_map = EXTENSION_MAP
self.extension_map = extension_map.get
restrict = config("path-restrict", "auto")
@@ -161,12 +161,14 @@ class PathFormat():
num = 1
try:
while True:
- self.prefix = str(num) + "."
- self.set_extension(self.extension, False)
+ prefix = format(num) + "."
+ self.kwdict["extension"] = prefix + self.extension
+ self.build_path()
os.stat(self.realpath) # raises OSError if file doesn't exist
num += 1
except OSError:
pass
+ self.prefix = prefix
return False
def set_directory(self, kwdict):
@@ -198,31 +200,26 @@ class PathFormat():
def set_filename(self, kwdict):
"""Set general filename data"""
self.kwdict = kwdict
- self.temppath = self.prefix = ""
+ self.filename = self.temppath = self.prefix = ""
ext = kwdict["extension"]
kwdict["extension"] = self.extension = self.extension_map(ext, ext)
- if self.extension:
- self.build_path()
- else:
- self.filename = ""
-
def set_extension(self, extension, real=True):
"""Set filename extension"""
- extension = self.extension_map(extension, extension)
- if real:
- self.extension = extension
+ self.extension = extension = self.extension_map(extension, extension)
self.kwdict["extension"] = self.prefix + extension
- self.build_path()
def fix_extension(self, _=None):
"""Fix filenames without a given filename extension"""
if not self.extension:
- self.set_extension("", False)
+ self.kwdict["extension"] = self.prefix + self.extension_map("", "")
+ self.build_path()
if self.path[-1] == ".":
self.path = self.path[:-1]
self.temppath = self.realpath = self.realpath[:-1]
+ elif not self.temppath:
+ self.build_path()
return True
def build_filename(self, kwdict):
@@ -296,7 +293,9 @@ class PathFormat():
if self.extension:
self.temppath += ".part"
else:
- self.set_extension("part", False)
+ self.kwdict["extension"] = self.prefix + self.extension_map(
+ "part", "part")
+ self.build_path()
if part_directory:
self.temppath = os.path.join(
part_directory,
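
`EXTENSION_MAP` now lives at module level, and lookups go through `dict.get(ext, ext)` (bound as `self.extension_map`), so unmapped extensions pass through unchanged:

    EXTENSION_MAP.get("jpeg", "jpeg")  # -> "jpg" (normalized)
    EXTENSION_MAP.get("png", "png")    # -> "png" (no entry, returned as-is)
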
diff --git a/gallery_dl/postprocessor/compare.py b/gallery_dl/postprocessor/compare.py
index b3b94f7..910e1d7 100644
--- a/gallery_dl/postprocessor/compare.py
+++ b/gallery_dl/postprocessor/compare.py
@@ -51,8 +51,9 @@ class ComparePP(PostProcessor):
num = 1
try:
while not self._compare(pathfmt.realpath, pathfmt.temppath):
- pathfmt.prefix = str(num) + "."
- pathfmt.set_extension(pathfmt.extension, False)
+ pathfmt.prefix = prefix = format(num) + "."
+ pathfmt.kwdict["extension"] = prefix + pathfmt.extension
+ pathfmt.build_path()
num += 1
return self._equal(pathfmt)
except OSError:
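
Note: the compare postprocessor keeps probing numbered variants ("1.jpg", "2.jpg", ...) until the stored file equals the new download or a missing candidate makes _compare raise OSError. The same enumeration idea in isolation, using filecmp in place of gallery-dl's _compare (assumed here to approximate a byte-level comparison):

import filecmp
import os

def find_free_or_equal(realpath, temppath):
    # return the first numbered variant of 'realpath' that is
    # either unused or byte-identical to 'temppath'
    base, ext = os.path.splitext(realpath)
    candidate = realpath
    num = 1
    while os.path.exists(candidate):
        if filecmp.cmp(candidate, temppath, shallow=False):
            return candidate            # identical copy already on disk
        candidate = "{}.{}{}".format(base, num, ext)
        num += 1
    return candidate
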
diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py
index b21e483..2ee1cf8 100644
--- a/gallery_dl/postprocessor/metadata.py
+++ b/gallery_dl/postprocessor/metadata.py
@@ -21,6 +21,9 @@ class MetadataPP(PostProcessor):
mode = options.get("mode")
cfmt = options.get("content-format") or options.get("format")
+ omode = "w"
+ filename = None
+
if mode == "tags":
self.write = self._write_tags
ext = "txt"
@@ -41,6 +44,12 @@ class MetadataPP(PostProcessor):
cfmt = "\n".join(cfmt) + "\n"
self._content_fmt = formatter.parse(cfmt).format_map
ext = "txt"
+ elif mode == "jsonl":
+ self.write = self._write_json
+ self.indent = None
+ self.ascii = options.get("ascii", False)
+ omode = "a"
+ filename = "data.jsonl"
else:
self.write = self._write_json
self.indent = options.get("indent", 4)
@@ -53,7 +62,7 @@ class MetadataPP(PostProcessor):
sep = os.sep + (os.altsep or "")
self._metadir = util.expand_path(directory).rstrip(sep) + os.sep
- filename = options.get("filename")
+ filename = options.get("filename", filename)
extfmt = options.get("extension-format")
if filename:
if filename == "-":
@@ -97,6 +106,9 @@ class MetadataPP(PostProcessor):
self.archive = None
self.mtime = options.get("mtime")
+ self.omode = options.get("open", omode)
+ self.encoding = options.get("encoding", "utf-8")
+ self.private = options.get("private", False)
def run(self, pathfmt):
archive = self.archive
@@ -107,11 +119,11 @@ class MetadataPP(PostProcessor):
path = directory + self._filename(pathfmt)
try:
- with open(path, "w", encoding="utf-8") as fp:
+ with open(path, self.omode, encoding=self.encoding) as fp:
self.write(fp, pathfmt.kwdict)
except FileNotFoundError:
os.makedirs(directory, exist_ok=True)
- with open(path, "w", encoding="utf-8") as fp:
+ with open(path, self.omode, encoding=self.encoding) as fp:
self.write(fp, pathfmt.kwdict)
if archive:
@@ -198,7 +210,9 @@ class MetadataPP(PostProcessor):
fp.write("\n".join(tags) + "\n")
def _write_json(self, fp, kwdict):
- util.dump_json(util.filter_dict(kwdict), fp, self.ascii, self.indent)
+ if not self.private:
+ kwdict = util.filter_dict(kwdict)
+ util.dump_json(kwdict, fp, self.ascii, self.indent)
__postprocessor__ = MetadataPP
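
Note: the new "jsonl" mode is _write_json with indent=None, a default open mode of "a", and a default filename of data.jsonl, so each downloaded file appends its metadata as one compact JSON line to a shared file; "open", "encoding", and "private" are also exposed as options, with "private" skipping the util.filter_dict pass. A minimal sketch of the resulting behavior (the underscore-prefix filter mirrors what filter_dict is understood to do, not its exact code):

import json

def write_jsonl(path, kwdict, private=False):
    if not private:
        # drop "private" entries, i.e. keys starting with an underscore
        kwdict = {k: v for k, v in kwdict.items() if not k.startswith("_")}
    with open(path, "a", encoding="utf-8") as fp:
        json.dump(kwdict, fp, ensure_ascii=False, indent=None)
        fp.write("\n")

write_jsonl("data.jsonl", {"id": 1, "_private_key": "hidden"})
write_jsonl("data.jsonl", {"id": 2})
# data.jsonl now holds one JSON object per line
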
diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py
index 98c8246..9d2cb34 100644
--- a/gallery_dl/postprocessor/ugoira.py
+++ b/gallery_dl/postprocessor/ugoira.py
@@ -90,15 +90,17 @@ class UgoiraPP(PostProcessor):
if pathfmt.extension != "zip":
return
- if "frames" in pathfmt.kwdict:
- self._frames = pathfmt.kwdict["frames"]
- elif "pixiv_ugoira_frame_data" in pathfmt.kwdict:
- self._frames = pathfmt.kwdict["pixiv_ugoira_frame_data"]["data"]
+ kwdict = pathfmt.kwdict
+ if "frames" in kwdict:
+ self._frames = kwdict["frames"]
+ elif "pixiv_ugoira_frame_data" in kwdict:
+ self._frames = kwdict["pixiv_ugoira_frame_data"]["data"]
else:
return
if self.delete:
pathfmt.set_extension(self.extension)
+ pathfmt.build_path()
def convert(self, pathfmt):
if not self._frames:
@@ -115,6 +117,8 @@ class UgoiraPP(PostProcessor):
# process frames and collect command-line arguments
pathfmt.set_extension(self.extension)
+ pathfmt.build_path()
+
args = self._process(pathfmt, tempdir)
if self.args:
args += self.args
@@ -151,6 +155,7 @@ class UgoiraPP(PostProcessor):
pathfmt.delete = True
else:
pathfmt.set_extension("zip")
+ pathfmt.build_path()
def _exec(self, args):
self.log.debug(args)
diff --git a/gallery_dl/text.py b/gallery_dl/text.py
index 79cf016..1fb1851 100644
--- a/gallery_dl/text.py
+++ b/gallery_dl/text.py
@@ -120,6 +120,15 @@ def extract(txt, begin, end, pos=0):
return None, pos
+def extr(txt, begin, end, default=""):
+ """Stripped-down version of 'extract()'"""
+ try:
+ first = txt.index(begin) + len(begin)
+ return txt[first:txt.index(end, first)]
+ except (ValueError, TypeError, AttributeError):
+ return default
+
+
def rextract(txt, begin, end, pos=-1):
try:
lbeg = len(begin)
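
Note: extr() is the common fast path of extract(): no start position, no (value, position) tuple, just the substring between the first begin/end pair, or a default. Usage, matching the new test_extr cases at the end of this patch:

from gallery_dl import text

text.extr("<a><b>", "<", ">")          # 'a'
text.extr("<a><b>", "><", ">")         # 'b'
text.extr("<a><b>", "X", ">")          # ''   (no match -> default)
text.extr("<a><b>", "<", "X", None)    # None (custom default)
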
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index 98b6d59..8ce1fb4 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -19,6 +19,7 @@ import binascii
import datetime
import functools
import itertools
+import subprocess
import urllib.parse
from http.cookiejar import Cookie
from email.utils import mktime_tz, parsedate_tz
@@ -273,6 +274,39 @@ Response Headers
fp.write(response.content)
+def extract_headers(response):
+ headers = response.headers
+ data = dict(headers)
+
+ hcd = headers.get("content-disposition")
+ if hcd:
+ name = text.extr(hcd, 'filename="', '"')
+ if name:
+ text.nameext_from_url(name, data)
+
+ hlm = headers.get("last-modified")
+ if hlm:
+ data["date"] = datetime.datetime(*parsedate_tz(hlm)[:6])
+
+ return data
+
+
+@functools.lru_cache(maxsize=None)
+def git_head():
+ try:
+ out, err = subprocess.Popen(
+ ("git", "rev-parse", "--short", "HEAD"),
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ cwd=os.path.dirname(os.path.abspath(__file__)),
+ ).communicate()
+ if out and not err:
+ return out.decode().rstrip()
+ except (OSError, subprocess.SubprocessError):
+ pass
+ return None
+
+
def expand_path(path):
"""Expand environment variables and tildes (~)"""
if not path:
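
Note: extract_headers() turns response headers into metadata: every header verbatim, plus filename/extension parsed out of Content-Disposition (via the new text.extr) and Last-Modified converted to a datetime under the "date" key. git_head() shells out once per process (memoized by lru_cache) so dev builds can report the current commit. A hedged usage sketch with a dummy object standing in for a requests response:

import types
from gallery_dl import util

response = types.SimpleNamespace(headers={
    "content-disposition": 'attachment; filename="photo.jpg"',
    "last-modified": "Tue, 22 Nov 2022 09:28:43 GMT",
})

data = util.extract_headers(response)
# expected: data["filename"] == "photo", data["extension"] == "jpg",
# and data["date"] == datetime.datetime(2022, 11, 22, 9, 28, 43)
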
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 85a03de..31dbc63 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-__version__ = "1.23.5"
+__version__ = "1.24.0"
diff --git a/gallery_dl/ytdl.py b/gallery_dl/ytdl.py
index b2da445..db313c3 100644
--- a/gallery_dl/ytdl.py
+++ b/gallery_dl/ytdl.py
@@ -73,7 +73,11 @@ def parse_command_line(module, argv):
ytdlp = (module.__name__ == "yt_dlp")
std_headers = module.std_headers
- parse_bytes = module.FileDownloader.parse_bytes
+
+ try:
+ parse_bytes = module.parse_bytes
+ except AttributeError:
+ parse_bytes = module.FileDownloader.parse_bytes
# HTTP headers
if opts.user_agent is not None:
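
Note: yt-dlp moved parse_bytes from FileDownloader to a module-level function; the try/except prefers the new location and falls back for older yt-dlp releases and youtube-dl. An equivalent way to write the same shim, shown only for comparison:

# same fallback expressed with getattr
parse_bytes = getattr(module, "parse_bytes", None)
if parse_bytes is None:                     # older yt-dlp / youtube-dl
    parse_bytes = module.FileDownloader.parse_bytes
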
diff --git a/setup.cfg b/setup.cfg
index 68f3711..f3565af 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,7 +1,8 @@
[flake8]
-exclude = gallery_dl/__init__.py,gallery_dl/__main__.py,setup.py,build,scripts,archive
+exclude = build,archive
ignore = E203,E226,W504
per-file-ignores =
+ setup.py: E501
gallery_dl/extractor/500px.py: E501
[egg_info]
diff --git a/setup.py b/setup.py
index bf1d927..3d97d27 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,6 @@ import re
import sys
import os.path
import warnings
-from setuptools import setup
def read(fname):
@@ -13,6 +12,7 @@ def read(fname):
with open(path, encoding="utf-8") as file:
return file.read()
+
def check_file(fname):
path = os.path.join(os.path.dirname(__file__), fname)
if os.path.exists(path):
@@ -41,99 +41,109 @@ FILES = [
]
]
+PACKAGES = [
+ "gallery_dl",
+ "gallery_dl.extractor",
+ "gallery_dl.downloader",
+ "gallery_dl.postprocessor",
+]
+
DESCRIPTION = ("Command-line program to download image galleries and "
"collections from several image hosting sites")
LONG_DESCRIPTION = read("README.rst")
-if "py2exe" in sys.argv:
- try:
- import py2exe
- except ImportError:
- sys.exit("Error importing 'py2exe'")
+def build_py2exe():
+ from py2exe import freeze
# py2exe dislikes version specifiers with a trailing '-dev'
- VERSION = VERSION.partition("-")[0]
+ VERSION_ = VERSION.partition("-")[0]
- params = {
- "console": [{
+ freeze(
+ console=[{
"script" : "./gallery_dl/__main__.py",
"dest_base" : "gallery-dl",
- "version" : VERSION,
+ }],
+ version_info={
+ "version" : VERSION_,
"description" : DESCRIPTION,
"comments" : LONG_DESCRIPTION,
"product_name" : "gallery-dl",
- "product_version": VERSION,
- }],
- "options": {"py2exe": {
- "bundle_files": 0,
- "compressed" : 1,
- "optimize" : 1,
- "dist_dir" : ".",
- "packages" : ["gallery_dl"],
- "includes" : ["youtube_dl"],
- "dll_excludes": ["w9xpopen.exe"],
- }},
- "zipfile": None,
- }
+ "product_version": VERSION_,
+ },
+ options={
+ "bundle_files" : 0,
+ "compressed" : 1,
+ "optimize" : 1,
+ "dist_dir" : "./dist",
+ "packages" : PACKAGES,
+ "includes" : ["youtube_dl"],
+ "dll_excludes" : ["w9xpopen.exe"],
+ },
+ zipfile=None,
+ )
-else:
- params = {}
-
-
-setup(
- name="gallery_dl",
- version=VERSION,
- description=DESCRIPTION,
- long_description=LONG_DESCRIPTION,
- url="https://github.com/mikf/gallery-dl",
- download_url="https://github.com/mikf/gallery-dl/releases/latest",
- author="Mike Fährmann",
- author_email="mike_faehrmann@web.de",
- maintainer="Mike Fährmann",
- maintainer_email="mike_faehrmann@web.de",
- license="GPLv2",
- python_requires=">=3.4",
- install_requires=[
- "requests>=2.11.0",
- ],
- extras_require={
- "video": [
- "youtube-dl",
+
+def build_setuptools():
+ from setuptools import setup
+
+ setup(
+ name="gallery_dl",
+ version=VERSION,
+ description=DESCRIPTION,
+ long_description=LONG_DESCRIPTION,
+ url="https://github.com/mikf/gallery-dl",
+ download_url="https://github.com/mikf/gallery-dl/releases/latest",
+ author="Mike Fährmann",
+ author_email="mike_faehrmann@web.de",
+ maintainer="Mike Fährmann",
+ maintainer_email="mike_faehrmann@web.de",
+ license="GPLv2",
+ python_requires=">=3.4",
+ install_requires=[
+ "requests>=2.11.0",
],
- },
- packages=[
- "gallery_dl",
- "gallery_dl.extractor",
- "gallery_dl.downloader",
- "gallery_dl.postprocessor",
- ],
- entry_points={
- "console_scripts": [
- "gallery-dl = gallery_dl:main",
+ extras_require={
+ "video": [
+ "youtube-dl",
+ ],
+ },
+ entry_points={
+ "console_scripts": [
+ "gallery-dl = gallery_dl:main",
+ ],
+ },
+ packages=PACKAGES,
+ data_files=FILES,
+ test_suite="test",
+ keywords="image gallery downloader crawler scraper",
+ classifiers=[
+ "Development Status :: 5 - Production/Stable",
+ "Environment :: Console",
+ "Intended Audience :: End Users/Desktop",
+ "License :: OSI Approved :: GNU General Public License v2 (GPLv2)",
+ "Operating System :: OS Independent",
+ "Programming Language :: Python",
+ "Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3 :: Only",
+ "Programming Language :: Python :: 3.4",
+ "Programming Language :: Python :: 3.5",
+ "Programming Language :: Python :: 3.6",
+ "Programming Language :: Python :: 3.7",
+ "Programming Language :: Python :: 3.8",
+ "Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: Implementation :: CPython",
+ "Programming Language :: Python :: Implementation :: PyPy",
+ "Topic :: Internet :: WWW/HTTP",
+ "Topic :: Multimedia :: Graphics",
+ "Topic :: Utilities",
],
- },
- data_files=FILES,
- keywords="image gallery downloader crawler scraper",
- classifiers=[
- "Development Status :: 5 - Production/Stable",
- "Environment :: Console",
- "Intended Audience :: End Users/Desktop",
- "License :: OSI Approved :: GNU General Public License v2 (GPLv2)",
- "Operating System :: Microsoft :: Windows",
- "Operating System :: POSIX",
- "Operating System :: MacOS",
- "Programming Language :: Python :: 3.4",
- "Programming Language :: Python :: 3.5",
- "Programming Language :: Python :: 3.6",
- "Programming Language :: Python :: 3.7",
- "Programming Language :: Python :: 3.8",
- "Programming Language :: Python :: 3.9",
- "Programming Language :: Python :: 3 :: Only",
- "Topic :: Internet :: WWW/HTTP",
- "Topic :: Multimedia :: Graphics",
- "Topic :: Utilities",
- ],
- test_suite="test",
- **params,
-)
+ )
+
+
+if "py2exe" in sys.argv:
+ build_py2exe()
+else:
+ build_setuptools()
diff --git a/test/test_downloader.py b/test/test_downloader.py
index 9350ce4..0703754 100644
--- a/test/test_downloader.py
+++ b/test/test_downloader.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-# Copyright 2018-2021 Mike Fährmann
+# Copyright 2018-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -13,9 +13,9 @@ import unittest
from unittest.mock import Mock, MagicMock, patch
import re
-import base64
import logging
import os.path
+import binascii
import tempfile
import threading
import http.server
@@ -23,6 +23,7 @@ import http.server
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from gallery_dl import downloader, extractor, output, config, path # noqa E402
+from gallery_dl.downloader.http import MIME_TYPES, SIGNATURE_CHECKS # noqa E402
class MockDownloaderModule(Mock):
@@ -130,6 +131,7 @@ class TestDownloaderBase(unittest.TestCase):
pathfmt = cls.job.pathfmt
pathfmt.set_directory(kwdict)
pathfmt.set_filename(kwdict)
+ pathfmt.build_path()
if content:
mode = "w" + ("b" if isinstance(content, bytes) else "")
@@ -156,6 +158,7 @@ class TestDownloaderBase(unittest.TestCase):
self.assertEqual(
pathfmt.extension,
expected_extension,
+ content[0:16],
)
self.assertEqual(
os.path.splitext(pathfmt.realpath)[1][1:],
@@ -172,48 +175,52 @@ class TestHTTPDownloader(TestDownloaderBase):
port = 8088
cls.address = "http://127.0.0.1:{}".format(port)
- cls._jpg = cls.address + "/image.jpg"
- cls._png = cls.address + "/image.png"
- cls._gif = cls.address + "/image.gif"
-
server = http.server.HTTPServer(("", port), HttpRequestHandler)
threading.Thread(target=server.serve_forever, daemon=True).start()
+ def _run_test(self, ext, input, output,
+ extension, expected_extension=None):
+ TestDownloaderBase._run_test(
+ self, self.address + "/" + ext, input, output,
+ extension, expected_extension)
+
def tearDown(self):
self.downloader.minsize = self.downloader.maxsize = None
def test_http_download(self):
- self._run_test(self._jpg, None, DATA_JPG, "jpg", "jpg")
- self._run_test(self._png, None, DATA_PNG, "png", "png")
- self._run_test(self._gif, None, DATA_GIF, "gif", "gif")
+ self._run_test("jpg", None, DATA["jpg"], "jpg", "jpg")
+ self._run_test("png", None, DATA["png"], "png", "png")
+ self._run_test("gif", None, DATA["gif"], "gif", "gif")
def test_http_offset(self):
- self._run_test(self._jpg, DATA_JPG[:123], DATA_JPG, "jpg", "jpg")
- self._run_test(self._png, DATA_PNG[:12] , DATA_PNG, "png", "png")
- self._run_test(self._gif, DATA_GIF[:1] , DATA_GIF, "gif", "gif")
+ self._run_test("jpg", DATA["jpg"][:123], DATA["jpg"], "jpg", "jpg")
+ self._run_test("png", DATA["png"][:12] , DATA["png"], "png", "png")
+ self._run_test("gif", DATA["gif"][:1] , DATA["gif"], "gif", "gif")
def test_http_extension(self):
- self._run_test(self._jpg, None, DATA_JPG, None, "jpg")
- self._run_test(self._png, None, DATA_PNG, None, "png")
- self._run_test(self._gif, None, DATA_GIF, None, "gif")
+ self._run_test("jpg", None, DATA["jpg"], None, "jpg")
+ self._run_test("png", None, DATA["png"], None, "png")
+ self._run_test("gif", None, DATA["gif"], None, "gif")
def test_http_adjust_extension(self):
- self._run_test(self._jpg, None, DATA_JPG, "png", "jpg")
- self._run_test(self._png, None, DATA_PNG, "gif", "png")
- self._run_test(self._gif, None, DATA_GIF, "jpg", "gif")
+ self._run_test("jpg", None, DATA["jpg"], "png", "jpg")
+ self._run_test("png", None, DATA["png"], "gif", "png")
+ self._run_test("gif", None, DATA["gif"], "jpg", "gif")
def test_http_filesize_min(self):
+ url = self.address + "/gif"
pathfmt = self._prepare_destination(None, extension=None)
self.downloader.minsize = 100
with self.assertLogs(self.downloader.log, "WARNING"):
- success = self.downloader.download(self._gif, pathfmt)
+ success = self.downloader.download(url, pathfmt)
self.assertFalse(success)
def test_http_filesize_max(self):
+ url = self.address + "/jpg"
pathfmt = self._prepare_destination(None, extension=None)
self.downloader.maxsize = 100
with self.assertLogs(self.downloader.log, "WARNING"):
- success = self.downloader.download(self._jpg, pathfmt)
+ success = self.downloader.download(url, pathfmt)
self.assertFalse(success)
@@ -237,24 +244,14 @@ class TestTextDownloader(TestDownloaderBase):
class HttpRequestHandler(http.server.BaseHTTPRequestHandler):
def do_GET(self):
- if self.path == "/image.jpg":
- content_type = "image/jpeg"
- output = DATA_JPG
- elif self.path == "/image.png":
- content_type = "image/png"
- output = DATA_PNG
- elif self.path == "/image.gif":
- content_type = "image/gif"
- output = DATA_GIF
- else:
+ try:
+ output = DATA[self.path[1:]]
+ except KeyError:
self.send_response(404)
self.wfile.write(self.path.encode())
return
- headers = {
- "Content-Type": content_type,
- "Content-Length": len(output),
- }
+ headers = {"Content-Length": len(output)}
if "Range" in self.headers:
status = 206
@@ -275,31 +272,79 @@ class HttpRequestHandler(http.server.BaseHTTPRequestHandler):
self.wfile.write(output)
-DATA_JPG = base64.standard_b64decode("""
-/9j/4AAQSkZJRgABAQEASABIAAD/2wBD
-AAEBAQEBAQEBAQEBAQEBAQEBAQEBAQEB
-AQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEB
-AQEBAQEBAQEBAQEBAQEBAQH/2wBDAQEB
-AQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEB
-AQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEB
-AQEBAQEBAQEBAQEBAQH/wAARCAABAAED
-AREAAhEBAxEB/8QAFAABAAAAAAAAAAAA
-AAAAAAAACv/EABQQAQAAAAAAAAAAAAAA
-AAAAAAD/xAAUAQEAAAAAAAAAAAAAAAAA
-AAAA/8QAFBEBAAAAAAAAAAAAAAAAAAAA
-AP/aAAwDAQACEQMRAD8AfwD/2Q==""")
-
-
-DATA_PNG = base64.standard_b64decode("""
-iVBORw0KGgoAAAANSUhEUgAAAAEAAAAB
-CAAAAAA6fptVAAAACklEQVQIHWP4DwAB
-AQEANl9ngAAAAABJRU5ErkJggg==""")
-
-
-DATA_GIF = base64.standard_b64decode("""
-R0lGODdhAQABAIAAAP///////ywAAAAA
-AQABAAACAkQBADs=""")
-
-
+SAMPLES = {
+ ("jpg" , binascii.a2b_base64(
+ "/9j/4AAQSkZJRgABAQEASABIAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQEBAQEB"
+ "AQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQH/2wBDAQEB"
+ "AQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEB"
+ "AQEBAQEBAQEBAQEBAQH/wAARCAABAAEDAREAAhEBAxEB/8QAFAABAAAAAAAAAAAA"
+ "AAAAAAAACv/EABQQAQAAAAAAAAAAAAAAAAAAAAD/xAAUAQEAAAAAAAAAAAAAAAAA"
+ "AAAA/8QAFBEBAAAAAAAAAAAAAAAAAAAAAP/aAAwDAQACEQMRAD8AfwD/2Q==")),
+ ("png" , binascii.a2b_base64(
+ "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAAAAAA6fptVAAAACklEQVQIHWP4DwAB"
+ "AQEANl9ngAAAAABJRU5ErkJggg==")),
+ ("gif" , binascii.a2b_base64(
+ "R0lGODdhAQABAIAAAP///////ywAAAAAAQABAAACAkQBADs=")),
+ ("bmp" , b"BM"),
+ ("webp", b"RIFF????WEBP"),
+ ("avif", b"????ftypavif"),
+ ("avif", b"????ftypavis"),
+ ("svg" , b"<?xml"),
+ ("ico" , b"\x00\x00\x01\x00"),
+ ("cur" , b"\x00\x00\x02\x00"),
+ ("psd" , b"8BPS"),
+ ("mp4" , b"????ftypmp4"),
+ ("mp4" , b"????ftypavc1"),
+ ("mp4" , b"????ftypiso3"),
+ ("mp4" , b"????ftypM4V"),
+ ("webm", b"\x1A\x45\xDF\xA3"),
+ ("ogg" , b"OggS"),
+ ("wav" , b"RIFF????WAVE"),
+ ("mp3" , b"ID3"),
+ ("mp3" , b"\xFF\xFB"),
+ ("mp3" , b"\xFF\xF3"),
+ ("mp3" , b"\xFF\xF2"),
+ ("zip" , b"PK\x03\x04"),
+ ("zip" , b"PK\x05\x06"),
+ ("zip" , b"PK\x07\x08"),
+ ("rar" , b"Rar!\x1A\x07"),
+ ("rar" , b"\x52\x61\x72\x21\x1A\x07"),
+ ("7z" , b"\x37\x7A\xBC\xAF\x27\x1C"),
+ ("pdf" , b"%PDF-"),
+ ("swf" , b"FWS"),
+ ("swf" , b"CWS"),
+}
+
+
+DATA = {}
+
+for ext, content in SAMPLES:
+ if ext not in DATA:
+ DATA[ext] = content
+
+for idx, (_, content) in enumerate(SAMPLES):
+ DATA["S{:>02}".format(idx)] = content
+
+
+# reverse mime types mapping
+MIME_TYPES = {
+ ext: mtype
+ for mtype, ext in MIME_TYPES.items()
+}
+
+
+def generate_tests():
+ def generate_test(idx, ext, content):
+ def test(self):
+ self._run_test("S{:>02}".format(idx), None, content, "bin", ext)
+ test.__name__ = "test_http_ext_{:>02}_{}".format(idx, ext)
+ return test
+
+ for idx, (ext, content) in enumerate(SAMPLES):
+ test = generate_test(idx, ext, content)
+ setattr(TestHTTPDownloader, test.__name__, test)
+
+
+generate_tests()
if __name__ == "__main__":
unittest.main()
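
Note: the rewritten test data is a single SAMPLES collection. The first occurrence of each extension lands in DATA under its name, and every sample additionally gets an "S00", "S01", ... alias so each signature can be served individually. generate_tests() then stamps one test_http_ext_NN_<ext> method per sample onto TestHTTPDownloader; the inner generate_test() function exists to bind idx, ext, and content per iteration, since a test defined directly in the loop body would close over the loop variables and every generated test would see only the last sample. The pattern in isolation:

import unittest

class TestSignatures(unittest.TestCase):
    pass

def make_test(ext, content):
    def test(self):
        self.assertTrue(content)       # placeholder assertion
    test.__name__ = "test_sig_" + ext
    return test

for ext, content in (("bmp", b"BM"), ("pdf", b"%PDF-")):
    # the factory call freezes this iteration's values
    setattr(TestSignatures, "test_sig_" + ext, make_test(ext, content))

One caveat worth noting: SAMPLES is written as a set literal of (ext, bytes) tuples, so iteration order, and therefore which duplicate wins the DATA[ext] slot and which index each "SNN" alias receives, is not guaranteed across runs; within a single run the two enumerate() passes agree, and the indexed aliases keep every sample reachable regardless.
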
diff --git a/test/test_job.py b/test/test_job.py
index fec6997..1bd9ccc 100644
--- a/test/test_job.py
+++ b/test/test_job.py
@@ -87,10 +87,10 @@ num
1
subcategory
test_subcategory
-tags[]
- - foo
- - bar
- - テスト
+tags[N]
+ 0 foo
+ 1 bar
+ 2 テスト
user[id]
123
user[name]
diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py
index ba37ee0..7da2089 100644
--- a/test/test_postprocessor.py
+++ b/test/test_postprocessor.py
@@ -97,6 +97,7 @@ class BasePostprocessorTest(unittest.TestCase):
self.pathfmt = self.job.pathfmt
self.pathfmt.set_directory(kwdict)
self.pathfmt.set_filename(kwdict)
+ self.pathfmt.build_path()
pp = postprocessor.find(self.__class__.__name__[:-4].lower())
return pp(self.job, options)
@@ -118,6 +119,7 @@ class ClassifyTest(BasePostprocessorTest):
for ext in exts
})
self.pathfmt.set_extension("jpg")
+ self.pathfmt.build_path()
pp.prepare(self.pathfmt)
path = os.path.join(self.dir.name, "test", "Pictures")
@@ -150,6 +152,7 @@ class ClassifyTest(BasePostprocessorTest):
"bar": "foo/bar",
})
self.pathfmt.set_extension("foo")
+ self.pathfmt.build_path()
pp.prepare(self.pathfmt)
path = os.path.join(self.dir.name, "test", "foo", "bar")
diff --git a/test/test_text.py b/test/test_text.py
index 0ac7767..2c0be3b 100644
--- a/test/test_text.py
+++ b/test/test_text.py
@@ -203,6 +203,24 @@ class TestText(unittest.TestCase):
self.assertEqual(f(txt , value, ">") , (None, 0))
self.assertEqual(f(txt , "<" , value), (None, 0))
+ def test_extr(self, f=text.extr):
+ txt = "<a><b>"
+ self.assertEqual(f(txt, "X", ">"), "")
+ self.assertEqual(f(txt, "<", "X"), "")
+ self.assertEqual(f(txt, "<", ">"), "a")
+ self.assertEqual(f(txt, "><", ">"), "b")
+
+ # 'default' argument
+ self.assertEqual(f(txt, "<", "X", None), None)
+ self.assertEqual(f(txt, "<", "X", default=None), None)
+ self.assertEqual(f(txt, "<", "X", default=()), ())
+
+ # invalid arguments
+ for value in INVALID:
+ self.assertEqual(f(value, "<" , ">") , "")
+ self.assertEqual(f(txt , value, ">") , "")
+ self.assertEqual(f(txt , "<" , value), "")
+
def test_rextract(self, f=text.rextract):
txt = "<a><b>"
self.assertEqual(f(txt, "<", ">"), ("b" , 3))