From a6e995c093de8aae2e91a0787281bb34c0b871eb Mon Sep 17 00:00:00 2001 From: Unit 193 Date: Thu, 31 Jul 2025 01:22:01 -0400 Subject: New upstream version 1.30.2. --- CHANGELOG.md | 45 +- PKG-INFO | 31 +- README.rst | 14 +- data/completion/_gallery-dl | 12 +- data/completion/gallery-dl | 2 +- data/completion/gallery-dl.fish | 12 +- data/man/gallery-dl.1 | 22 +- data/man/gallery-dl.conf.5 | 1161 +++++++++++++++++++++++++------ docs/gallery-dl.conf | 134 +++- gallery_dl.egg-info/PKG-INFO | 31 +- gallery_dl.egg-info/SOURCES.txt | 18 +- gallery_dl.egg-info/requires.txt | 16 +- gallery_dl/__init__.py | 142 ++-- gallery_dl/actions.py | 82 ++- gallery_dl/aes.py | 6 +- gallery_dl/cache.py | 8 +- gallery_dl/config.py | 46 +- gallery_dl/cookies.py | 69 +- gallery_dl/downloader/common.py | 12 +- gallery_dl/downloader/http.py | 114 ++- gallery_dl/downloader/ytdl.py | 52 +- gallery_dl/exception.py | 132 ++-- gallery_dl/extractor/2ch.py | 25 +- gallery_dl/extractor/2chan.py | 16 +- gallery_dl/extractor/2chen.py | 6 +- gallery_dl/extractor/35photo.py | 23 +- gallery_dl/extractor/4archive.py | 17 +- gallery_dl/extractor/4chan.py | 33 +- gallery_dl/extractor/4chanarchives.py | 13 +- gallery_dl/extractor/500px.py | 12 +- gallery_dl/extractor/8chan.py | 19 +- gallery_dl/extractor/8muses.py | 18 +- gallery_dl/extractor/__init__.py | 28 +- gallery_dl/extractor/adultempire.py | 6 +- gallery_dl/extractor/agnph.py | 19 +- gallery_dl/extractor/ao3.py | 19 +- gallery_dl/extractor/arcalive.py | 40 +- gallery_dl/extractor/architizer.py | 11 +- gallery_dl/extractor/artstation.py | 167 ++--- gallery_dl/extractor/aryion.py | 24 +- gallery_dl/extractor/batoto.py | 18 +- gallery_dl/extractor/bbc.py | 11 +- gallery_dl/extractor/behance.py | 57 +- gallery_dl/extractor/bilibili.py | 76 +- gallery_dl/extractor/blogger.py | 58 +- gallery_dl/extractor/bluesky.py | 60 +- gallery_dl/extractor/booru.py | 3 +- gallery_dl/extractor/boosty.py | 20 +- gallery_dl/extractor/bunkr.py | 17 +- gallery_dl/extractor/catbox.py | 2 +- gallery_dl/extractor/chevereto.py | 4 +- gallery_dl/extractor/cien.py | 15 +- gallery_dl/extractor/civitai.py | 387 ++++++++--- gallery_dl/extractor/comick.py | 198 ++++++ gallery_dl/extractor/comicvine.py | 7 +- gallery_dl/extractor/common.py | 562 +++++++++------ gallery_dl/extractor/cyberdrop.py | 8 +- gallery_dl/extractor/danbooru.py | 53 +- gallery_dl/extractor/dankefuerslesen.py | 120 ++++ gallery_dl/extractor/desktopography.py | 8 +- gallery_dl/extractor/deviantart.py | 162 ++--- gallery_dl/extractor/directlink.py | 4 +- gallery_dl/extractor/discord.py | 89 +-- gallery_dl/extractor/dynastyscans.py | 66 +- gallery_dl/extractor/e621.py | 26 +- gallery_dl/extractor/erome.py | 106 +-- gallery_dl/extractor/everia.py | 16 +- gallery_dl/extractor/exhentai.py | 208 +++--- gallery_dl/extractor/facebook.py | 179 +++-- gallery_dl/extractor/fanbox.py | 130 ++-- gallery_dl/extractor/fantia.py | 8 +- gallery_dl/extractor/fapachi.py | 14 +- gallery_dl/extractor/fapello.py | 19 +- gallery_dl/extractor/flickr.py | 14 +- gallery_dl/extractor/foolfuuka.py | 85 ++- gallery_dl/extractor/foolslide.py | 13 +- gallery_dl/extractor/furaffinity.py | 66 +- gallery_dl/extractor/furry34.py | 12 +- gallery_dl/extractor/fuskator.py | 17 +- gallery_dl/extractor/gelbooru.py | 18 +- gallery_dl/extractor/gelbooru_v01.py | 32 +- gallery_dl/extractor/gelbooru_v02.py | 73 +- gallery_dl/extractor/generic.py | 29 +- gallery_dl/extractor/girlsreleased.py | 76 ++ gallery_dl/extractor/girlswithmuscle.py | 177 +++++ gallery_dl/extractor/gofile.py | 
11 +- gallery_dl/extractor/hatenablog.py | 21 +- gallery_dl/extractor/hentai2read.py | 8 +- gallery_dl/extractor/hentaicosplays.py | 4 +- gallery_dl/extractor/hentaifoundry.py | 42 +- gallery_dl/extractor/hentaihand.py | 16 +- gallery_dl/extractor/hentaihere.py | 23 +- gallery_dl/extractor/hentainexus.py | 26 +- gallery_dl/extractor/hiperdex.py | 21 +- gallery_dl/extractor/hitomi.py | 157 ++--- gallery_dl/extractor/hotleak.py | 11 +- gallery_dl/extractor/idolcomplex.py | 9 +- gallery_dl/extractor/imagebam.py | 18 +- gallery_dl/extractor/imagechest.py | 13 +- gallery_dl/extractor/imagefap.py | 40 +- gallery_dl/extractor/imagehosts.py | 38 +- gallery_dl/extractor/imgbb.py | 16 +- gallery_dl/extractor/imgbox.py | 29 +- gallery_dl/extractor/imgth.py | 11 +- gallery_dl/extractor/imgur.py | 32 +- gallery_dl/extractor/imhentai.py | 4 +- gallery_dl/extractor/inkbunny.py | 38 +- gallery_dl/extractor/instagram.py | 147 ++-- gallery_dl/extractor/issuu.py | 15 +- gallery_dl/extractor/itaku.py | 299 +++++--- gallery_dl/extractor/itchio.py | 12 +- gallery_dl/extractor/iwara.py | 440 ++++++++++++ gallery_dl/extractor/jschan.py | 28 +- gallery_dl/extractor/kabeuchi.py | 20 +- gallery_dl/extractor/keenspot.py | 33 +- gallery_dl/extractor/kemono.py | 680 ++++++++++++++++++ gallery_dl/extractor/kemonoparty.py | 625 ----------------- gallery_dl/extractor/khinsider.py | 4 +- gallery_dl/extractor/koharu.py | 251 ------- gallery_dl/extractor/komikcast.py | 37 +- gallery_dl/extractor/leakgallery.py | 141 ++++ gallery_dl/extractor/lensdump.py | 8 +- gallery_dl/extractor/lexica.py | 10 +- gallery_dl/extractor/lightroom.py | 2 +- gallery_dl/extractor/livedoor.py | 14 +- gallery_dl/extractor/lofter.py | 2 +- gallery_dl/extractor/lolisafe.py | 8 +- gallery_dl/extractor/luscious.py | 14 +- gallery_dl/extractor/lynxchan.py | 25 +- gallery_dl/extractor/madokami.py | 93 +++ gallery_dl/extractor/mangadex.py | 48 +- gallery_dl/extractor/mangafox.py | 4 +- gallery_dl/extractor/mangahere.py | 18 +- gallery_dl/extractor/manganelo.py | 4 +- gallery_dl/extractor/mangapark.py | 35 +- gallery_dl/extractor/mangaread.py | 20 +- gallery_dl/extractor/mangasee.py | 117 ---- gallery_dl/extractor/mangoxo.py | 16 +- gallery_dl/extractor/mastodon.py | 27 +- gallery_dl/extractor/misskey.py | 107 ++- gallery_dl/extractor/moebooru.py | 53 +- gallery_dl/extractor/motherless.py | 140 ++-- gallery_dl/extractor/myhentaigallery.py | 6 +- gallery_dl/extractor/myportfolio.py | 8 +- gallery_dl/extractor/naver.py | 174 ----- gallery_dl/extractor/naverblog.py | 173 +++++ gallery_dl/extractor/naverchzzk.py | 81 +++ gallery_dl/extractor/naverwebtoon.py | 24 +- gallery_dl/extractor/nekohouse.py | 5 +- gallery_dl/extractor/newgrounds.py | 71 +- gallery_dl/extractor/nhentai.py | 7 +- gallery_dl/extractor/nijie.py | 47 +- gallery_dl/extractor/nitter.py | 35 +- gallery_dl/extractor/nozomi.py | 25 +- gallery_dl/extractor/nsfwalbum.py | 16 +- gallery_dl/extractor/nudostar.py | 71 ++ gallery_dl/extractor/oauth.py | 65 +- gallery_dl/extractor/paheal.py | 28 +- gallery_dl/extractor/patreon.py | 206 +++--- gallery_dl/extractor/pexels.py | 8 +- gallery_dl/extractor/philomena.py | 36 +- gallery_dl/extractor/photovogue.py | 4 +- gallery_dl/extractor/picarto.py | 6 +- gallery_dl/extractor/pictoa.py | 4 +- gallery_dl/extractor/piczel.py | 10 +- gallery_dl/extractor/pillowfort.py | 19 +- gallery_dl/extractor/pinterest.py | 63 +- gallery_dl/extractor/pixeldrain.py | 33 +- gallery_dl/extractor/pixiv.py | 267 +++---- gallery_dl/extractor/pixnet.py | 18 +- 
gallery_dl/extractor/plurk.py | 22 +- gallery_dl/extractor/poipiku.py | 6 +- gallery_dl/extractor/poringa.py | 4 +- gallery_dl/extractor/pornhub.py | 40 +- gallery_dl/extractor/pornpics.py | 10 +- gallery_dl/extractor/postmill.py | 19 +- gallery_dl/extractor/rawkuma.py | 83 +++ gallery_dl/extractor/reactor.py | 12 +- gallery_dl/extractor/readcomiconline.py | 45 +- gallery_dl/extractor/realbooru.py | 11 +- gallery_dl/extractor/recursive.py | 9 +- gallery_dl/extractor/redbust.py | 186 +++++ gallery_dl/extractor/reddit.py | 44 +- gallery_dl/extractor/redgifs.py | 39 +- gallery_dl/extractor/rule34us.py | 16 +- gallery_dl/extractor/rule34vault.py | 12 +- gallery_dl/extractor/rule34xyz.py | 40 +- gallery_dl/extractor/saint.py | 2 +- gallery_dl/extractor/sankaku.py | 45 +- gallery_dl/extractor/sankakucomplex.py | 24 +- gallery_dl/extractor/schalenetwork.py | 246 +++++++ gallery_dl/extractor/scrolller.py | 6 +- gallery_dl/extractor/seiga.py | 16 +- gallery_dl/extractor/senmanga.py | 2 +- gallery_dl/extractor/sexcom.py | 138 +++- gallery_dl/extractor/shimmie2.py | 27 +- gallery_dl/extractor/shopify.py | 16 +- gallery_dl/extractor/simplyhentai.py | 17 +- gallery_dl/extractor/skeb.py | 32 +- gallery_dl/extractor/slickpic.py | 12 +- gallery_dl/extractor/slideshare.py | 12 +- gallery_dl/extractor/smugmug.py | 18 +- gallery_dl/extractor/soundgasm.py | 6 +- gallery_dl/extractor/speakerdeck.py | 22 +- gallery_dl/extractor/steamgriddb.py | 24 +- gallery_dl/extractor/subscribestar.py | 32 +- gallery_dl/extractor/szurubooru.py | 17 +- gallery_dl/extractor/tapas.py | 19 +- gallery_dl/extractor/tcbscans.py | 4 +- gallery_dl/extractor/telegraph.py | 6 +- gallery_dl/extractor/tenor.py | 29 +- gallery_dl/extractor/tiktok.py | 29 +- gallery_dl/extractor/tmohentai.py | 9 +- gallery_dl/extractor/toyhouse.py | 8 +- gallery_dl/extractor/tsumino.py | 49 +- gallery_dl/extractor/tumblr.py | 68 +- gallery_dl/extractor/tumblrgallery.py | 23 +- gallery_dl/extractor/twibooru.py | 18 +- gallery_dl/extractor/twitter.py | 245 +++---- gallery_dl/extractor/unsplash.py | 23 +- gallery_dl/extractor/uploadir.py | 6 +- gallery_dl/extractor/urlgalleries.py | 7 +- gallery_dl/extractor/urlshortener.py | 2 +- gallery_dl/extractor/vanillarock.py | 4 +- gallery_dl/extractor/vichan.py | 57 +- gallery_dl/extractor/vipergirls.py | 20 +- gallery_dl/extractor/vk.py | 82 ++- gallery_dl/extractor/vsco.py | 58 +- gallery_dl/extractor/wallhaven.py | 39 +- gallery_dl/extractor/wallpapercave.py | 9 +- gallery_dl/extractor/warosu.py | 19 +- gallery_dl/extractor/weasyl.py | 27 +- gallery_dl/extractor/webmshare.py | 6 +- gallery_dl/extractor/webtoons.py | 116 +-- gallery_dl/extractor/weebcentral.py | 9 +- gallery_dl/extractor/weibo.py | 65 +- gallery_dl/extractor/wikiart.py | 37 +- gallery_dl/extractor/wikifeet.py | 13 +- gallery_dl/extractor/wikimedia.py | 29 +- gallery_dl/extractor/xfolio.py | 20 +- gallery_dl/extractor/xhamster.py | 6 +- gallery_dl/extractor/xvideos.py | 15 +- gallery_dl/extractor/yiffverse.py | 12 +- gallery_dl/extractor/ytdl.py | 19 +- gallery_dl/extractor/zerochan.py | 16 +- gallery_dl/extractor/zzup.py | 15 +- gallery_dl/formatter.py | 128 +++- gallery_dl/job.py | 214 +++--- gallery_dl/option.py | 71 +- gallery_dl/output.py | 29 +- gallery_dl/path.py | 51 +- gallery_dl/postprocessor/__init__.py | 2 +- gallery_dl/postprocessor/common.py | 3 +- gallery_dl/postprocessor/compare.py | 11 +- gallery_dl/postprocessor/exec.py | 76 +- gallery_dl/postprocessor/metadata.py | 18 +- gallery_dl/postprocessor/mtime.py | 5 +- 
gallery_dl/postprocessor/ugoira.py | 53 +- gallery_dl/text.py | 84 ++- gallery_dl/transaction_id.py | 3 +- gallery_dl/update.py | 16 +- gallery_dl/util.py | 260 ++++--- gallery_dl/version.py | 4 +- gallery_dl/ytdl.py | 50 +- setup.py | 18 +- test/test_config.py | 5 +- test/test_cookies.py | 25 +- test/test_downloader.py | 8 +- test/test_extractor.py | 59 +- test/test_formatter.py | 159 ++++- test/test_job.py | 11 +- test/test_postprocessor.py | 115 ++- test/test_results.py | 176 +++-- test/test_text.py | 76 +- test/test_util.py | 234 +++++-- test/test_ytdl.py | 11 +- 276 files changed, 10012 insertions(+), 6088 deletions(-) create mode 100644 gallery_dl/extractor/comick.py create mode 100644 gallery_dl/extractor/dankefuerslesen.py create mode 100644 gallery_dl/extractor/girlsreleased.py create mode 100644 gallery_dl/extractor/girlswithmuscle.py create mode 100644 gallery_dl/extractor/iwara.py create mode 100644 gallery_dl/extractor/kemono.py delete mode 100644 gallery_dl/extractor/kemonoparty.py delete mode 100644 gallery_dl/extractor/koharu.py create mode 100644 gallery_dl/extractor/leakgallery.py create mode 100644 gallery_dl/extractor/madokami.py delete mode 100644 gallery_dl/extractor/mangasee.py delete mode 100644 gallery_dl/extractor/naver.py create mode 100644 gallery_dl/extractor/naverblog.py create mode 100644 gallery_dl/extractor/naverchzzk.py create mode 100644 gallery_dl/extractor/nudostar.py create mode 100644 gallery_dl/extractor/rawkuma.py create mode 100644 gallery_dl/extractor/redbust.py create mode 100644 gallery_dl/extractor/schalenetwork.py diff --git a/CHANGELOG.md b/CHANGELOG.md index c7e75a8..159ff0d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,40 +1,11 @@ -## 1.29.7 - 2025-05-23 +## 1.30.2 - 2025-07-27 ### Extractors #### Additions -- [mangadex] add `following` extractor ([#7487](https://github.com/mikf/gallery-dl/issues/7487)) -- [pixeldrain] add support for filesystem URLs ([#7473](https://github.com/mikf/gallery-dl/issues/7473)) +- [itaku] add `posts` & `bookmarks` extractors ([#7707](https://github.com/mikf/gallery-dl/issues/7707)) #### Fixes -- [bluesky] handle posts without `record` data ([#7499](https://github.com/mikf/gallery-dl/issues/7499)) -- [civitai] fix & improve video downloads ([#7502](https://github.com/mikf/gallery-dl/issues/7502)) -- [civitai] fix exception for images without `modelVersionId` ([#7432](https://github.com/mikf/gallery-dl/issues/7432)) -- [civitai] make metadata extraction non-fatal ([#7562](https://github.com/mikf/gallery-dl/issues/7562)) -- [fanbox] use `"browser": "firefox"` by default ([#7490](https://github.com/mikf/gallery-dl/issues/7490)) -- [idolcomplex] fix pagination logic ([#7549](https://github.com/mikf/gallery-dl/issues/7549)) -- [idolcomplex] fix 429 error during login by adding a 10s delay -- [instagram:stories] fix `post_date` metadata ([#7521](https://github.com/mikf/gallery-dl/issues/7521)) -- [motherless] fix video gallery downloads ([#7530](https://github.com/mikf/gallery-dl/issues/7530)) -- [pinterest] handle `story_pin_product_sticker_block` blocks ([#7563](https://github.com/mikf/gallery-dl/issues/7563)) -- [subscribestar] fix `content` and `title` metadata ([#7486](https://github.com/mikf/gallery-dl/issues/7486) [#7526](https://github.com/mikf/gallery-dl/issues/7526)) -#### Improvements -- [arcalive] allow overriding default `User-Agent` header ([#7556](https://github.com/mikf/gallery-dl/issues/7556)) -- [fanbox] update API headers ([#7490](https://github.com/mikf/gallery-dl/issues/7490)) -- [flickr] add 
`info` option ([#4720](https://github.com/mikf/gallery-dl/issues/4720) [#6817](https://github.com/mikf/gallery-dl/issues/6817)) -- [flickr] add `profile` option -- [instagram:stories] add `split` option ([#7521](https://github.com/mikf/gallery-dl/issues/7521)) -- [mangadex] implement login with client credentials -- [mangadex] send `Authorization` header only when necessary -- [mastodon] support Akkoma/Pleroma `/notice/:ID` URLs ([#7496](https://github.com/mikf/gallery-dl/issues/7496)) -- [mastodon] support Akkoma/Pleroma `/objects/:UUID` URLs ([#7497](https://github.com/mikf/gallery-dl/issues/7497)) -- [pixiv] Implement sanity handling for ugoira works ([#4327](https://github.com/mikf/gallery-dl/issues/4327) [#6297](https://github.com/mikf/gallery-dl/issues/6297) [#7285](https://github.com/mikf/gallery-dl/issues/7285) [#7434](https://github.com/mikf/gallery-dl/issues/7434)) -- [twitter:ctid] reduce chance of generating the same ID -#### Metadata -- [civitai] provide proper `extension` for model files ([#7432](https://github.com/mikf/gallery-dl/issues/7432)) -- [flickr] provide `license_name` metadata -- [sankaku] support new `tags` categories ([#7333](https://github.com/mikf/gallery-dl/issues/7333) [#7553](https://github.com/mikf/gallery-dl/issues/7553)) -- [vipergirls] provide `num` and `count` metadata ([#7479](https://github.com/mikf/gallery-dl/issues/7479)) -- [vipergirls] extract more metadata & rename fields ([#7479](https://github.com/mikf/gallery-dl/issues/7479)) -### Downloaders -- [http] fix setting `mtime` per file ([#7529](https://github.com/mikf/gallery-dl/issues/7529)) -- [ytdl] improve temp/part file handling ([#6949](https://github.com/mikf/gallery-dl/issues/6949) [#7494](https://github.com/mikf/gallery-dl/issues/7494)) -### Cookies -- support Zen browser ([#7233](https://github.com/mikf/gallery-dl/issues/7233) [#7546](https://github.com/mikf/gallery-dl/issues/7546)) +- [kemono] support new `kemono.cr` domain ([#7902](https://github.com/mikf/gallery-dl/issues/7902) [#7909](https://github.com/mikf/gallery-dl/issues/7909) [#7911](https://github.com/mikf/gallery-dl/issues/7911) [#7913](https://github.com/mikf/gallery-dl/issues/7913) [#7904](https://github.com/mikf/gallery-dl/issues/7904)) +- [coomer] support new `coomer.st` domain ([#7907](https://github.com/mikf/gallery-dl/issues/7907) [#7909](https://github.com/mikf/gallery-dl/issues/7909) [#7911](https://github.com/mikf/gallery-dl/issues/7911) [#7904](https://github.com/mikf/gallery-dl/issues/7904)) +### Post Processors +- [exec] use `False` as `start_new_session` default to avoid a `TypeError` ([#7899](https://github.com/mikf/gallery-dl/issues/7899)) +### Miscellaneous +- [tests/postprocessor] fix `TypeError` when logging an error ([#6582](https://github.com/mikf/gallery-dl/issues/6582)) diff --git a/PKG-INFO b/PKG-INFO index c022f84..550241f 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.4 Name: gallery_dl -Version: 1.29.7 +Version: 1.30.2 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Download-URL: https://github.com/mikf/gallery-dl/releases/latest @@ -18,10 +18,6 @@ Classifier: Operating System :: OS Independent Classifier: Programming Language :: Python Classifier: Programming Language :: Python :: 3 Classifier: Programming Language :: Python :: 3 :: Only -Classifier: Programming Language :: Python :: 3.4 -Classifier: Programming Language :: Python :: 3.5 -Classifier: Programming Language 
:: Python :: 3.6 -Classifier: Programming Language :: Python :: 3.7 Classifier: Programming Language :: Python :: 3.8 Classifier: Programming Language :: Python :: 3.9 Classifier: Programming Language :: Python :: 3.10 @@ -33,11 +29,18 @@ Classifier: Programming Language :: Python :: Implementation :: PyPy Classifier: Topic :: Internet :: WWW/HTTP Classifier: Topic :: Multimedia :: Graphics Classifier: Topic :: Utilities -Requires-Python: >=3.4 +Requires-Python: >=3.8 License-File: LICENSE Requires-Dist: requests>=2.11.0 Provides-Extra: video -Requires-Dist: youtube-dl; extra == "video" +Requires-Dist: yt-dlp; extra == "video" +Provides-Extra: extra +Requires-Dist: requests[socks]; extra == "extra" +Requires-Dist: yt-dlp[default]; extra == "extra" +Requires-Dist: pyyaml; extra == "extra" +Requires-Dist: toml; python_version < "3.11" and extra == "extra" +Requires-Dist: truststore; python_version >= "3.10" and extra == "extra" +Requires-Dist: secretstorage; sys_platform == "linux" and extra == "extra" Dynamic: author Dynamic: author-email Dynamic: classifier @@ -75,7 +78,7 @@ and powerful `filenaming capabilities `__ +- `Windows `__ (Requires `Microsoft Visual C++ Redistributable Package (x86) `__) -- `Linux `__ +- `Linux `__ Nightly Builds @@ -517,7 +522,7 @@ To authenticate with a ``mastodon`` instance, run *gallery-dl* with .. _Python: https://www.python.org/downloads/ .. _PyPI: https://pypi.org/ .. _pip: https://pip.pypa.io/en/stable/ -.. _Requests: https://requests.readthedocs.io/en/master/ +.. _Requests: https://requests.readthedocs.io/en/latest/ .. _FFmpeg: https://www.ffmpeg.org/ .. _mkvmerge: https://www.matroska.org/downloads/mkvtoolnix.html .. _yt-dlp: https://github.com/yt-dlp/yt-dlp @@ -530,10 +535,12 @@ To authenticate with a ``mastodon`` instance, run *gallery-dl* with .. _toml: https://pypi.org/project/toml/ .. _SecretStorage: https://pypi.org/project/SecretStorage/ .. _Psycopg: https://www.psycopg.org/ +.. _truststore: https://truststore.readthedocs.io/en/latest/ +.. _Jinja: https://jinja.palletsprojects.com/ .. _Snapd: https://docs.snapcraft.io/installing-snapd .. _OAuth: https://en.wikipedia.org/wiki/OAuth .. _Chocolatey: https://chocolatey.org/install -.. _Scoop: https://scoop.sh +.. _Scoop: https://scoop.sh/ .. |pypi| image:: https://img.shields.io/pypi/v/gallery-dl.svg :target: https://pypi.org/project/gallery-dl/ diff --git a/README.rst b/README.rst index 1fbdce5..3ca61b2 100644 --- a/README.rst +++ b/README.rst @@ -19,7 +19,7 @@ and powerful `filenaming capabilities `__ +- `Windows `__ (Requires `Microsoft Visual C++ Redistributable Package (x86) `__) -- `Linux `__ +- `Linux `__ Nightly Builds @@ -461,7 +463,7 @@ To authenticate with a ``mastodon`` instance, run *gallery-dl* with .. _Python: https://www.python.org/downloads/ .. _PyPI: https://pypi.org/ .. _pip: https://pip.pypa.io/en/stable/ -.. _Requests: https://requests.readthedocs.io/en/master/ +.. _Requests: https://requests.readthedocs.io/en/latest/ .. _FFmpeg: https://www.ffmpeg.org/ .. _mkvmerge: https://www.matroska.org/downloads/mkvtoolnix.html .. _yt-dlp: https://github.com/yt-dlp/yt-dlp @@ -474,10 +476,12 @@ To authenticate with a ``mastodon`` instance, run *gallery-dl* with .. _toml: https://pypi.org/project/toml/ .. _SecretStorage: https://pypi.org/project/SecretStorage/ .. _Psycopg: https://www.psycopg.org/ +.. _truststore: https://truststore.readthedocs.io/en/latest/ +.. _Jinja: https://jinja.palletsprojects.com/ .. _Snapd: https://docs.snapcraft.io/installing-snapd .. 
_OAuth: https://en.wikipedia.org/wiki/OAuth .. _Chocolatey: https://chocolatey.org/install -.. _Scoop: https://scoop.sh +.. _Scoop: https://scoop.sh/ .. |pypi| image:: https://img.shields.io/pypi/v/gallery-dl.svg :target: https://pypi.org/project/gallery-dl/ diff --git a/data/completion/_gallery-dl b/data/completion/_gallery-dl index 99fb8ad..f0d654e 100644 --- a/data/completion/_gallery-dl +++ b/data/completion/_gallery-dl @@ -13,6 +13,7 @@ _arguments -s -S \ {-X,--extractors}'[Load external extractors from PATH]':'' \ --user-agent'[User-Agent request header]':'' \ --clear-cache'[Delete cached login sessions, cookies, etc. for MODULE (ALL to delete everything)]':'' \ +--compat'[Restore legacy '\''category'\'' names]' \ {-U,--update-check}'[Check if a newer version is available]' \ {-i,--input-file}'[Download URLs found in FILE ('\''-'\'' for stdin). More than one --input-file can be specified]':'':_files \ {-I,--input-file-comment}'[Download URLs found in FILE. Comment them out after they were downloaded successfully.]':'':_files \ @@ -29,8 +30,10 @@ _arguments -s -S \ {-E,--extractor-info}'[Print extractor defaults and settings]' \ {-K,--list-keywords}'[Print a list of available keywords and example values for the given URLs]' \ {-e,--error-file}'[Add input URLs which returned an error to FILE]':'':_files \ -{-N,--print}'[Write FORMAT during EVENT (default '\''prepare'\'') to standard output. Examples: '\''id'\'' or '\''post:{md5\[:8\]}'\'']':'<[event:]format>' \ ---print-to-file'[Append FORMAT during EVENT to FILE]':'<[event:]format file>' \ +{-N,--print}'[Write FORMAT during EVENT (default '\''prepare'\'') to standard output instead of downloading files. Can be used multiple times. Examples: '\''id'\'' or '\''post:{md5\[:8\]}'\'']':'<[event:]format>' \ +--Print'[Like --print, but downloads files as well]':'<[event:]format>' \ +--print-to-file'[Append FORMAT during EVENT to FILE instead of downloading files. Can be used multiple times]':'<[event:]format file>' \ +--Print-to-file'[Like --print-to-file, but downloads files as well]':'<[event:]format file>' \ --list-modules'[Print a list of available extractor modules]' \ --list-extractors'[Print a list of extractor classes with description, (sub)category and example URL]':'<[categories]>' \ --write-log'[Write logging output to FILE]':'':_files \ @@ -45,10 +48,11 @@ _arguments -s -S \ {-4,--force-ipv4}'[Make all connections via IPv4]' \ {-6,--force-ipv6}'[Make all connections via IPv6]' \ --no-check-certificate'[Disable HTTPS certificate validation]' \ -{-r,--limit-rate}'[Maximum download rate (e.g. 500k or 2.5M)]':'' \ +{-r,--limit-rate}'[Maximum download rate (e.g. 500k, 2.5M, or 800k-2M)]':'' \ --chunk-size'[Size of in-memory data chunks (default: 32k)]':'' \ --sleep'[Number of seconds to wait before each download. This can be either a constant value or a range (e.g. 2.7 or 2.0-3.5)]':'' \ --sleep-request'[Number of seconds to wait between HTTP requests during data extraction]':'' \ +--sleep-429'[Number of seconds to wait when receiving a '\''429 Too Many Requests'\'' response]':'' \ --sleep-extractor'[Number of seconds to wait before starting data extraction for an input URL]':'' \ --no-part'[Do not use .part files]' \ --no-skip'[Do not skip downloads; overwrite existing files]' \ @@ -72,7 +76,7 @@ _arguments -s -S \ {-T,--terminate}'[Stop current and parent extractor runs after N consecutive file downloads were skipped]':'' \ --filesize-min'[Do not download files smaller than SIZE (e.g. 
500k or 2.5M)]':'' \ --filesize-max'[Do not download files larger than SIZE (e.g. 500k or 2.5M)]':'' \ ---download-archive'[Record all downloaded or skipped files in FILE and skip downloading any file already in it]':'':_files \ +--download-archive'[Record successfully downloaded files in FILE and skip downloading any file already in it]':'':_files \ --range'[Index range(s) specifying which files to download. These can be either a constant value, range, or slice (e.g. '\''5'\'', '\''8-20'\'', or '\''1:24:3'\'')]':'' \ --chapter-range'[Like '\''--range'\'', but applies to manga chapters and other delegated URLs]':'' \ --filter'[Python expression controlling which files to download. Files for which the expression evaluates to False are ignored. Available keys are the filename-specific ones listed by '\''-K'\''. Example: --filter "image_width >= 1000 and rating in ('\''s'\'', '\''q'\'')"]':'' \ diff --git a/data/completion/gallery-dl b/data/completion/gallery-dl index 161113c..ae4cb0f 100644 --- a/data/completion/gallery-dl +++ b/data/completion/gallery-dl @@ -10,7 +10,7 @@ _gallery_dl() elif [[ "${prev}" =~ ^()$ ]]; then COMPREPLY=( $(compgen -d -- "${cur}") ) else - COMPREPLY=( $(compgen -W "--help --version --filename --destination --directory --extractors --user-agent --clear-cache --update-check --input-file --input-file-comment --input-file-delete --no-input --quiet --warning --verbose --get-urls --resolve-urls --dump-json --resolve-json --simulate --extractor-info --list-keywords --error-file --print --print-to-file --list-modules --list-extractors --write-log --write-unsupported --write-pages --print-traffic --no-colors --retries --http-timeout --proxy --source-address --force-ipv4 --force-ipv6 --no-check-certificate --limit-rate --chunk-size --sleep --sleep-request --sleep-extractor --no-part --no-skip --no-mtime --no-download --option --config --config-yaml --config-toml --config-create --config-status --config-open --config-ignore --ignore-config --username --password --netrc --cookies --cookies-export --cookies-from-browser --abort --terminate --filesize-min --filesize-max --download-archive --range --chapter-range --filter --chapter-filter --postprocessor --no-postprocessors --postprocessor-option --write-metadata --write-info-json --write-infojson --write-tags --zip --cbz --mtime --mtime-from-date --rename --rename-to --ugoira --ugoira-conv --ugoira-conv-lossless --ugoira-conv-copy --exec --exec-after" -- "${cur}") ) + COMPREPLY=( $(compgen -W "--help --version --filename --destination --directory --extractors --user-agent --clear-cache --compat --update-check --input-file --input-file-comment --input-file-delete --no-input --quiet --warning --verbose --get-urls --resolve-urls --dump-json --resolve-json --simulate --extractor-info --list-keywords --error-file --print --Print --print-to-file --Print-to-file --list-modules --list-extractors --write-log --write-unsupported --write-pages --print-traffic --no-colors --retries --http-timeout --proxy --source-address --force-ipv4 --force-ipv6 --no-check-certificate --limit-rate --chunk-size --sleep --sleep-request --sleep-429 --sleep-extractor --no-part --no-skip --no-mtime --no-download --option --config --config-yaml --config-toml --config-create --config-status --config-open --config-ignore --ignore-config --username --password --netrc --cookies --cookies-export --cookies-from-browser --abort --terminate --filesize-min --filesize-max --download-archive --range --chapter-range --filter --chapter-filter --postprocessor 
--no-postprocessors --postprocessor-option --write-metadata --write-info-json --write-infojson --write-tags --zip --cbz --mtime --mtime-from-date --rename --rename-to --ugoira --ugoira-conv --ugoira-conv-lossless --ugoira-conv-copy --exec --exec-after" -- "${cur}") ) fi } diff --git a/data/completion/gallery-dl.fish b/data/completion/gallery-dl.fish index f8bb723..8eb427a 100644 --- a/data/completion/gallery-dl.fish +++ b/data/completion/gallery-dl.fish @@ -7,6 +7,7 @@ complete -c gallery-dl -x -a '(__fish_complete_directories)' -s 'D' -l 'director complete -c gallery-dl -x -a '(__fish_complete_directories)' -s 'X' -l 'extractors' -d 'Load external extractors from PATH' complete -c gallery-dl -x -l 'user-agent' -d 'User-Agent request header' complete -c gallery-dl -x -l 'clear-cache' -d 'Delete cached login sessions, cookies, etc. for MODULE (ALL to delete everything)' +complete -c gallery-dl -l 'compat' -d 'Restore legacy "category" names' complete -c gallery-dl -s 'U' -l 'update-check' -d 'Check if a newer version is available' complete -c gallery-dl -r -F -s 'i' -l 'input-file' -d 'Download URLs found in FILE ("-" for stdin). More than one --input-file can be specified' complete -c gallery-dl -r -F -s 'I' -l 'input-file-comment' -d 'Download URLs found in FILE. Comment them out after they were downloaded successfully.' @@ -23,8 +24,10 @@ complete -c gallery-dl -s 's' -l 'simulate' -d 'Simulate data extraction; do not complete -c gallery-dl -s 'E' -l 'extractor-info' -d 'Print extractor defaults and settings' complete -c gallery-dl -s 'K' -l 'list-keywords' -d 'Print a list of available keywords and example values for the given URLs' complete -c gallery-dl -r -F -s 'e' -l 'error-file' -d 'Add input URLs which returned an error to FILE' -complete -c gallery-dl -x -s 'N' -l 'print' -d 'Write FORMAT during EVENT (default "prepare") to standard output. Examples: "id" or "post:{md5[:8]}"' -complete -c gallery-dl -x -l 'print-to-file' -d 'Append FORMAT during EVENT to FILE' +complete -c gallery-dl -x -s 'N' -l 'print' -d 'Write FORMAT during EVENT (default "prepare") to standard output instead of downloading files. Can be used multiple times. Examples: "id" or "post:{md5[:8]}"' +complete -c gallery-dl -x -l 'Print' -d 'Like --print, but downloads files as well' +complete -c gallery-dl -x -l 'print-to-file' -d 'Append FORMAT during EVENT to FILE instead of downloading files. Can be used multiple times' +complete -c gallery-dl -x -l 'Print-to-file' -d 'Like --print-to-file, but downloads files as well' complete -c gallery-dl -l 'list-modules' -d 'Print a list of available extractor modules' complete -c gallery-dl -x -l 'list-extractors' -d 'Print a list of extractor classes with description, (sub)category and example URL' complete -c gallery-dl -r -F -l 'write-log' -d 'Write logging output to FILE' @@ -39,10 +42,11 @@ complete -c gallery-dl -x -l 'source-address' -d 'Client-side IP address to bind complete -c gallery-dl -s '4' -l 'force-ipv4' -d 'Make all connections via IPv4' complete -c gallery-dl -s '6' -l 'force-ipv6' -d 'Make all connections via IPv6' complete -c gallery-dl -l 'no-check-certificate' -d 'Disable HTTPS certificate validation' -complete -c gallery-dl -x -s 'r' -l 'limit-rate' -d 'Maximum download rate (e.g. 500k or 2.5M)' +complete -c gallery-dl -x -s 'r' -l 'limit-rate' -d 'Maximum download rate (e.g. 
500k, 2.5M, or 800k-2M)' complete -c gallery-dl -x -l 'chunk-size' -d 'Size of in-memory data chunks (default: 32k)' complete -c gallery-dl -x -l 'sleep' -d 'Number of seconds to wait before each download. This can be either a constant value or a range (e.g. 2.7 or 2.0-3.5)' complete -c gallery-dl -x -l 'sleep-request' -d 'Number of seconds to wait between HTTP requests during data extraction' +complete -c gallery-dl -x -l 'sleep-429' -d 'Number of seconds to wait when receiving a "429 Too Many Requests" response' complete -c gallery-dl -x -l 'sleep-extractor' -d 'Number of seconds to wait before starting data extraction for an input URL' complete -c gallery-dl -l 'no-part' -d 'Do not use .part files' complete -c gallery-dl -l 'no-skip' -d 'Do not skip downloads; overwrite existing files' @@ -67,7 +71,7 @@ complete -c gallery-dl -x -s 'A' -l 'abort' -d 'Stop current extractor run after complete -c gallery-dl -x -s 'T' -l 'terminate' -d 'Stop current and parent extractor runs after N consecutive file downloads were skipped' complete -c gallery-dl -x -l 'filesize-min' -d 'Do not download files smaller than SIZE (e.g. 500k or 2.5M)' complete -c gallery-dl -x -l 'filesize-max' -d 'Do not download files larger than SIZE (e.g. 500k or 2.5M)' -complete -c gallery-dl -r -F -l 'download-archive' -d 'Record all downloaded or skipped files in FILE and skip downloading any file already in it' +complete -c gallery-dl -r -F -l 'download-archive' -d 'Record successfully downloaded files in FILE and skip downloading any file already in it' complete -c gallery-dl -x -l 'range' -d 'Index range(s) specifying which files to download. These can be either a constant value, range, or slice (e.g. "5", "8-20", or "1:24:3")' complete -c gallery-dl -x -l 'chapter-range' -d 'Like "--range", but applies to manga chapters and other delegated URLs' complete -c gallery-dl -x -l 'filter' -d 'Python expression controlling which files to download. Files for which the expression evaluates to False are ignored. Available keys are the filename-specific ones listed by "-K". Example: --filter "image_width >= 1000 and rating in ("s", "q")"' diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1 index 77403b1..4979279 100644 --- a/data/man/gallery-dl.1 +++ b/data/man/gallery-dl.1 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL" "1" "2025-05-23" "1.29.7" "gallery-dl Manual" +.TH "GALLERY-DL" "1" "2025-07-27" "1.30.2" "gallery-dl Manual" .\" disable hyphenation .nh @@ -41,6 +41,9 @@ User-Agent request header .B "\-\-clear\-cache" \f[I]MODULE\f[] Delete cached login sessions, cookies, etc. for MODULE (ALL to delete everything) .TP +.B "\-\-compat" +Restore legacy 'category' names +.TP .B "\-U, \-\-update\-check" Check if a newer version is available .TP @@ -90,10 +93,16 @@ Print a list of available keywords and example values for the given URLs Add input URLs which returned an error to FILE .TP .B "\-N, \-\-print" \f[I][EVENT:]FORMAT\f[] -Write FORMAT during EVENT (default 'prepare') to standard output. Examples: 'id' or 'post:{md5[:8]}' +Write FORMAT during EVENT (default 'prepare') to standard output instead of downloading files. Can be used multiple times. Examples: 'id' or 'post:{md5[:8]}' +.TP +.B "\-\-Print" \f[I][EVENT:]FORMAT\f[] +Like --print, but downloads files as well .TP .B "\-\-print\-to\-file" \f[I][EVENT:]FORMAT FILE\f[] -Append FORMAT during EVENT to FILE +Append FORMAT during EVENT to FILE instead of downloading files. 
Can be used multiple times +.TP +.B "\-\-Print\-to\-file" \f[I][EVENT:]FORMAT FILE\f[] +Like --print-to-file, but downloads files as well .TP .B "\-\-list\-modules" Print a list of available extractor modules @@ -138,7 +147,7 @@ Make all connections via IPv6 Disable HTTPS certificate validation .TP .B "\-r, \-\-limit\-rate" \f[I]RATE\f[] -Maximum download rate (e.g. 500k or 2.5M) +Maximum download rate (e.g. 500k, 2.5M, or 800k-2M) .TP .B "\-\-chunk\-size" \f[I]SIZE\f[] Size of in-memory data chunks (default: 32k) @@ -149,6 +158,9 @@ Number of seconds to wait before each download. This can be either a constant va .B "\-\-sleep\-request" \f[I]SECONDS\f[] Number of seconds to wait between HTTP requests during data extraction .TP +.B "\-\-sleep\-429" \f[I]SECONDS\f[] +Number of seconds to wait when receiving a '429 Too Many Requests' response +.TP .B "\-\-sleep\-extractor" \f[I]SECONDS\f[] Number of seconds to wait before starting data extraction for an input URL .TP @@ -219,7 +231,7 @@ Do not download files smaller than SIZE (e.g. 500k or 2.5M) Do not download files larger than SIZE (e.g. 500k or 2.5M) .TP .B "\-\-download\-archive" \f[I]FILE\f[] -Record all downloaded or skipped files in FILE and skip downloading any file already in it +Record successfully downloaded files in FILE and skip downloading any file already in it .TP .B "\-\-range" \f[I]RANGE\f[] Index range(s) specifying which files to download. These can be either a constant value, range, or slice (e.g. '5', '8-20', or '1:24:3') diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5 index 1c2a2a0..12eea08 100644 --- a/data/man/gallery-dl.conf.5 +++ b/data/man/gallery-dl.conf.5 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL.CONF" "5" "2025-05-23" "1.29.7" "gallery-dl Manual" +.TH "GALLERY-DL.CONF" "5" "2025-07-27" "1.30.2" "gallery-dl Manual" .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) @@ -99,8 +99,8 @@ A \f[I]format string\f[] to build filenames for downloaded files with. If this is an \f[I]object\f[], it must contain Python expressions mapping to the filename format strings to use. -These expressions are evaluated in the order as specified in Python 3.6+ -and in an undetermined order in Python 3.4 and 3.5. +These expressions are evaluated in the specified order until one evaluates +to \f[I]True\f[]. The available replacement keys depend on the extractor used. A list of keys for a specific one can be acquired by calling *gallery-dl* @@ -242,14 +242,15 @@ Share number of skipped downloads between parent and child extractors. .br * "/!? (){}" .br -* {" ": "_", "/": "-", "|": "-", ":": "_-_", "*": "_+_"} +* {"/": "_", "+": "_+_", "({[": "(", "]})": ")", "a-z": "*"} .IP "Description:" 4 -A string of characters to be replaced with the value of +A \f[I]string\f[] of characters to be replaced with the value of .br \f[I]path-replace\f[] -or an object mapping invalid/unwanted characters to their replacements +or an \f[I]object\f[] mapping invalid/unwanted characters, character sets, .br +or character ranges to their replacements for generated path segment names. .br @@ -461,13 +462,13 @@ response before \f[I]retrying\f[] the request. 
\f[I][E621]\f[], \f[I][foolfuuka]:search\f[], \f[I]itaku\f[], -\f[I]koharu\f[], \f[I]newgrounds\f[], \f[I][philomena]\f[], -\f[I]pixiv:novel\f[], +\f[I]pixiv-novel\f[], \f[I]plurk\f[], \f[I]poipiku\f[] , \f[I]pornpics\f[], +\f[I]schalenetwork\f[], \f[I]scrolller\f[], \f[I]soundgasm\f[], \f[I]urlgalleries\f[], @@ -541,7 +542,7 @@ This is supported for .br * \f[I]booruvar\f[] (*) .br -* \f[I]coomerparty\f[] +* \f[I]coomer\f[] .br * \f[I]danbooru\f[] (*) .br @@ -555,6 +556,8 @@ This is supported for .br * \f[I]exhentai\f[] .br +* \f[I]girlswithmuscle\f[] +.br * \f[I]horne\f[] (R) .br * \f[I]idolcomplex\f[] .br * \f[I]imgbb\f[] .br * \f[I]inkbunny\f[] .br -* \f[I]kemonoparty\f[] +* \f[I]iwara\f[] +.br +* \f[I]kemono\f[] .br -* \f[I]koharu\f[] +* \f[I]madokami\f[] (R) .br * \f[I]mangadex\f[] .br * \f[I]mangoxo\f[] .br * \f[I]newgrounds\f[] .br * \f[I]nijie\f[] (R) .br * \f[I]pillowfort\f[] .br +* \f[I]rule34xyz\f[] +.br * \f[I]sankaku\f[] .br +* \f[I]schalenetwork\f[] +.br * \f[I]scrolller\f[] .br * \f[I]seiga\f[] @@ -828,11 +837,6 @@ User-Agent header value used for HTTP requests. Setting this value to \f[I]"browser"\f[] will try to automatically detect and use the \f[I]User-Agent\f[] header of the system's default browser. -Note: -This option has *no* effect if -\f[I]extractor.browser\f[] -is enabled. .SS extractor.*.browser .IP "Type:" 6 \f[I]string\f[] .IP "Default:" 9 .br -* \f[I]"firefox"\f[]: \f[I]artstation\f[], \f[I]fanbox\f[], \f[I]mangasee\f[], \f[I]twitter\f[] +* \f[I]"firefox"\f[]: \f[I]artstation\f[], \f[I]behance\f[], \f[I]fanbox\f[], \f[I]twitter\f[] .br * \f[I]null\f[]: otherwise .IP "Example:" 4 .br +* "firefox/128:linux" +.br * "chrome:macos" .IP "Description:" 4 Try to emulate a real browser (\f[I]firefox\f[] or \f[I]chrome\f[]) by using their default HTTP headers and TLS ciphers for HTTP requests. Optionally, the operating system used in the \f[I]User-Agent\f[] header can be specified after a \f[I]:\f[] (\f[I]windows\f[], \f[I]linux\f[], or \f[I]macos\f[]). +Supported browsers: + +.br +* \f[I]firefox\f[] +.br +* \f[I]firefox/140\f[] +.br +* \f[I]firefox/128\f[] +.br +* \f[I]chrome\f[] +.br +* \f[I]chrome/138\f[] +.br +* \f[I]chrome/111\f[] + Note: -This option overrides -\f[I]user-agent\f[] -and sets custom +This option sets custom \f[I]headers\f[] and \f[I]ciphers\f[] @@ -888,7 +907,10 @@ instead of the extractor's \f[I]root\f[] domain. .SS extractor.*.headers .IP "Type:" 6 -\f[I]object\f[] (name -> value) +.br +* \f[I]"string"\f[] +.br +* \f[I]object\f[] (name -> value) .IP "Default:" 9 .. code:: json @@ -908,13 +930,22 @@ to be sent with each HTTP request, To disable sending a header, set its value to \f[I]null\f[]. +Set this option to \f[I]"firefox"\f[] or \f[I]"chrome"\f[] +to use these browsers' default headers. + .SS extractor.*.ciphers .IP "Type:" 6 -\f[I]list\f[] of \f[I]strings\f[] +.br +* \f[I]string\f[] +.br +* \f[I]list\f[] of \f[I]strings\f[] .IP "Example:" 4 -.. code:: json +.br +* "firefox" +.br +* .. code:: json ["ECDHE-ECDSA-AES128-GCM-SHA256", "ECDHE-RSA-AES128-GCM-SHA256", @@ -928,6 +959,9 @@ List of TLS/SSL cipher suites in to be passed to \f[I]ssl.SSLContext.set_ciphers()\f[] +Set this option to \f[I]"firefox"\f[] or \f[I]"chrome"\f[] +to use these browsers' default ciphers. + .SS extractor.*.tls12 .IP "Type:" 6 \f[I]bool\f[] .IP "Default:" 9 .br -* \f[I]false\f[]: \f[I]artstation\f[] +* \f[I]false\f[]: \f[I]artstation\f[], \f[I]behance\f[] .br * \f[I]true\f[]: otherwise @@ -1228,9 +1262,9 @@ for available \f[I]PRAGMA\f[] statements and further details.
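To tie the browser, headers, and ciphers options above together, here is a minimal gallery-dl.conf sketch. The option values ("firefox/128:linux", "chrome") come from the examples and supported values listed above; the extractor names chosen here are only illustrative:

.. code:: json

    {
        "extractor": {
            "twitter": {
                "browser": "firefox/128:linux"
            },
            "danbooru": {
                "headers": "chrome",
                "ciphers": "chrome"
            }
        }
    }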
.SS extractor.*.actions .IP "Type:" 6 .br -* \f[I]object\f[] (pattern -> action(s)) +* \f[I]object\f[] (pattern -> \f[I]Action(s)\f[]) .br -* \f[I]list\f[] of \f[I]lists\f[] with pattern -> action(s) pairs as elements +* \f[I]list\f[] of \f[I]lists\f[] with pattern -> \f[I]Action(s)\f[] pairs as elements .IP "Example:" 4 .. code:: json @@ -1259,57 +1293,17 @@ for available \f[I]PRAGMA\f[] statements and further details. .IP "Description:" 4 -Perform an \f[I]action\f[] when logging a message matched by \f[I]pattern\f[]. +Perform an \f[I]Action\f[] when logging a message matched by \f[I]pattern\f[]. \f[I]pattern\f[] is parsed as severity level (\f[I]debug\f[], \f[I]info\f[], \f[I]warning\f[], \f[I]error\f[], or integer value) -followed by an optional \f[I]Python Regular Expression\f[] -separated by a colon \f[I]:\f[]. +followed by an optional +\f[I]Python Regular Expression\f[] +separated by a colon \f[I]:\f[] + Using \f[I]*\f[] as level or leaving it empty matches logging messages of all levels (e.g. \f[I]*:\f[] or \f[I]:\f[]). -\f[I]action\f[] is parsed as action type -followed by (optional) arguments. - -It is possible to specify more than one \f[I]action\f[] per \f[I]pattern\f[] -by providing them as a \f[I]list\f[]: \f[I]["", "", …]\f[] - -Supported Action Types: - -\f[I]status\f[]: -Modify job exit status. -.br -Expected syntax is \f[I] \f[] (e.g. \f[I]= 100\f[]). -.br - -Supported operators are -\f[I]=\f[] (assignment), -\f[I]&\f[] (bitwise AND), -\f[I]|\f[] (bitwise OR), -\f[I]^\f[] (bitwise XOR). -\f[I]level\f[]: -Modify severity level of the current logging message. -.br -Can be one of \f[I]debug\f[], \f[I]info\f[], \f[I]warning\f[], \f[I]error\f[] or an integer value. -.br -\f[I]print\f[]: -Write argument to stdout. -\f[I]exec\f[]: -Run a shell command. -\f[I]abort\f[]: -Stop the current extractor run. -\f[I]terminate\f[]: -Stop the current extractor run, including parent extractors. -\f[I]restart\f[]: -Restart the current extractor run. -\f[I]wait\f[]: -Sleep for a given \f[I]Duration\f[] or -.br -wait until Enter is pressed when no argument was given. -.br -\f[I]exit\f[]: -Exit the program with the given argument as exit status. - .SS extractor.*.postprocessors .IP "Type:" 6 @@ -1443,6 +1437,25 @@ This value gets internally used as the \f[I]verify\f[] parameter for the \f[I]requests.request()\f[] method. +.SS extractor.*.truststore +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Use a +.br +\f[I]truststore\f[] +\f[I]SSLContext\f[] for verifying SSL/TLS certificates +to make use of your system's native certificate stores +.br +instead of relying on +\f[I]certifi\f[] +certificates. + + .SS extractor.*.download .IP "Type:" 6 \f[I]bool\f[] @@ -1700,6 +1713,17 @@ Try to follow external URLs of embedded players. Limit the number of posts/projects to download. +.SS extractor.artstation.mviews +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +Download \f[I].mview\f[] files. + + .SS extractor.artstation.previews .IP "Type:" 6 \f[I]bool\f[] @@ -1708,7 +1732,7 @@ Limit the number of posts/projects to download. \f[I]false\f[] .IP "Description:" 4 -Download video previews. +Download embed previews. .SS extractor.artstation.videos @@ -2043,7 +2067,7 @@ Possibly available formats are \f[I]string\f[] .IP "Default:" 9 -\f[I]"/api/_001"\f[] +\f[I]"/api/_001_v2"\f[] .IP "Description:" 4 API endpoint for retrieving file URLs. 
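As a sketch of the actions option described above: each key is a severity level plus an optional regular expression, and each value is one or more Action strings. The patterns and arguments below are illustrative; the action types (wait, status, abort) and the "OPERATOR VALUE" syntax are the ones named in the option description:

.. code:: json

    {
        "extractor": {
            "actions": {
                "warning:429": "wait 10",
                "error": ["status = 1", "abort"]
            }
        }
    }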
@@ -2135,7 +2159,7 @@ Available types are * \f[I]list\f[] of \f[I]strings\f[] .IP "Default:" 9 -\f[I]["user-models", "user-posts"]\f[] +\f[I]["user-images", "user-videos"]\f[] .IP "Description:" 4 A (comma-separated) list of subcategories to include @@ -2154,6 +2178,14 @@ Possible values are It is possible to use \f[I]"all"\f[] instead of listing all values separately. +.IP "Note:" 4 +To get a more complete set of metadata +like \f[I]model['name']\f[] and \f[I]post['title']\f[], +include \f[I]user-models\f[] and \f[I]user-posts\f[] +as well as the default \f[I]user-images\f[] and \f[I]user-videos\f[]: + +\f[I]["user-models", "user-posts", "user-images", "user-videos"]\f[] + .SS extractor.civitai.metadata .IP "Type:" 6 @@ -2169,14 +2201,14 @@ It is possible to use \f[I]"all"\f[] instead of listing all values separately. .IP "Example:" 4 .br -* "generation,version" +* "generation,post,version" .br -* ["generation", "version"] +* ["version", "generation"] .IP "Description:" 4 -Extract additional \f[I]generation\f[] and \f[I]version\f[] metadata. +Extract additional \f[I]generation\f[], \f[I]version\f[], and \f[I]post\f[] metadata. -Note: This requires 1 additional HTTP request per image or video. +Note: This requires 1 or more additional API requests per image or video. .SS extractor.civitai.nsfw @@ -2375,6 +2407,17 @@ greater than the per-page limit, gallery-dl will stop after the first batch. The value cannot be less than 1. +.SS extractor.dankefuerslesen.zip +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Download each chapter as a single ZIP archive instead of individual images. + + .SS extractor.deviantart.auto-watch .IP "Type:" 6 \f[I]bool\f[] @@ -2806,6 +2849,18 @@ Discord Bot Token for API requests. You can follow \f[I]this guide\f[] to get a token. +.SS extractor.dynastyscans.anthology.metadata +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Extract \f[I]alert\f[], \f[I]description\f[], and \f[I]status\f[] metadata +from an anthology's HTML page. + + .SS extractor.[E621].metadata .IP "Type:" 6 .br @@ -2922,8 +2977,27 @@ Selects how to handle "you do not have enough GP" errors. \f[I]null\f[] .IP "Description:" 4 -Sets a custom image download limit and -stops extraction when it gets exceeded. +Set a custom image download limit and perform +\f[I]limits-action\f[] +when it gets exceeded. + + +.SS extractor.exhentai.limits-action +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]"stop"\f[] + +.IP "Description:" 4 +Action to perform when the image limit is exceeded. + +.br +* "stop": Stop the current extractor run. +.br +* "wait": Wait for user input. +.br +* "reset": Spend GP to reset your account's image limits. .SS extractor.exhentai.metadata @@ -2937,8 +3011,10 @@ stops extraction when it gets exceeded. Load extended gallery metadata from the \f[I]API\f[]. -Adds \f[I]archiver_key\f[], \f[I]posted\f[], and \f[I]torrents\f[]. -Makes \f[I]date\f[] and \f[I]filesize\f[] more precise. +.br +* Adds \f[I]archiver_key\f[], \f[I]posted\f[], and \f[I]torrents\f[] +.br +* Provides exact \f[I]date\f[] and \f[I]filesize\f[] .SS extractor.exhentai.original @@ -2964,6 +3040,9 @@ Selects an alternative source to download files from. .br * \f[I]"hitomi"\f[]: Download the corresponding gallery from \f[I]hitomi.la\f[] +.br +* \f[I]"metadata"\f[]: Load only a gallery's metadata from the +\f[I]API\f[] .SS extractor.exhentai.tags @@ -2990,6 +3069,36 @@ for example \f[I]tags_artist\f[] or \f[I]tags_character\f[]. 
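Picking up the civitai.include note above: expressed as configuration, the fuller metadata setup could look like the following sketch (both lists are taken verbatim from the option descriptions; combining them this way is illustrative):

.. code:: json

    {
        "extractor": {
            "civitai": {
                "include": ["user-models", "user-posts", "user-images", "user-videos"],
                "metadata": ["generation", "post", "version"]
            }
        }
    }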
Extract comments that include photo attachments made by the author of the post. +.SS extractor.facebook.include +.IP "Type:" 6 +.br +* \f[I]string\f[] +.br +* \f[I]list\f[] of \f[I]strings\f[] + +.IP "Default:" 9 +\f[I]"photos"\f[] + +.IP "Example:" 4 +.br +* "avatar,photos" +.br +* ["avatar", "photos"] + +.IP "Description:" 4 +A (comma-separated) list of subcategories to include +when processing a user profile. + +Supported values are + +.br +* \f[I]avatar\f[] +.br +* \f[I]photos\f[] + +It is possible to use \f[I]"all"\f[] instead of listing all values separately. + + .SS extractor.facebook.videos .IP "Type:" 6 .br @@ -3048,6 +3157,17 @@ extraction and download for YouTube, Vimeo, and SoundCloud embeds. * \f[I]false\f[]: Ignore embeds. +.SS extractor.fanbox.fee-max +.IP "Type:" 6 +\f[I]integer\f[] + +.IP "Description:" 4 +Do not request API data or extract files from posts +that require a fee (\f[I]feeRequired\f[]) greater than the specified amount. + +Note: This option has no effect on individual post URLs. + + .SS extractor.fanbox.metadata .IP "Type:" 6 .br @@ -3495,13 +3615,15 @@ Selects which API endpoints to use. .IP "Description:" 4 Controls from which position to start the extraction process from. +\f[I]true\f[] +Start from the beginning. .br -* \f[I]true\f[]: Start from the beginning. Log the most recent \f[I]cursor\f[] value when interrupted before reaching the end. .br -* \f[I]false\f[]: Start from the beginning. -.br -* any \f[I]string\f[]: Start from the position defined by this value. +\f[I]false\f[] +Start from the beginning. +any \f[I]string\f[] +Start from the position defined by this value. .SS extractor.instagram.include @@ -3651,6 +3773,42 @@ Do not download videos Split \f[I]stories\f[] elements into separate posts. +.SS extractor.itaku.include +.IP "Type:" 6 +.br +* \f[I]string\f[] +.br +* \f[I]list\f[] of \f[I]strings\f[] + +.IP "Default:" 9 +\f[I]"gallery"\f[] + +.IP "Example:" 4 +.br +* "stars,gallery" +.br +* ["stars", "gallery"] + +.IP "Description:" 4 +A (comma-separated) list of subcategories to include +when processing a user profile. + +Supported values are + +.br +* \f[I]gallery\f[] +.br +* \f[I]posts\f[] +.br +* \f[I]followers\f[] +.br +* \f[I]following\f[] +.br +* \f[I]stars\f[] + +It is possible to use \f[I]"all"\f[] instead of listing all values separately. + + .SS extractor.itaku.videos .IP "Type:" 6 \f[I]bool\f[] @@ -3662,7 +3820,33 @@ Split \f[I]stories\f[] elements into separate posts. Download video files. -.SS extractor.kemonoparty.archives +.SS extractor.iwara.include +.IP "Type:" 6 +.br +* \f[I]string\f[] +.br +* \f[I]list\f[] of \f[I]strings\f[] + +.IP "Default:" 9 +\f[I]["user-images", "user-videos"]\f[] + +.IP "Description:" 4 +A (comma-separated) list of subcategories to include +when processing a user profile. + +Possible values are + +.br +* \f[I]"user-images"\f[] +.br +* \f[I]"user-videos"\f[] +.br +* \f[I]"user-playlists"\f[] + +It is possible to use \f[I]"all"\f[] instead of listing all values separately. + + +.SS extractor.kemono.archives .IP "Type:" 6 \f[I]bool\f[] @@ -3676,7 +3860,7 @@ Extract additional metadata for \f[I]archives\f[] files, including Note: This requires 1 additional HTTP request per \f[I]archives\f[] file. -.SS extractor.kemonoparty.comments +.SS extractor.kemono.comments .IP "Type:" 6 \f[I]bool\f[] @@ -3689,23 +3873,39 @@ Extract \f[I]comments\f[] metadata. Note: This requires 1 additional HTTP request per post. 
-.SS extractor.kemonoparty.duplicates +.SS extractor.kemono.duplicates .IP "Type:" 6 -\f[I]bool\f[] +.br +* \f[I]bool\f[] +.br +* \f[I]string\f[] +.br +* \f[I]list\f[] of \f[I]strings\f[] .IP "Default:" 9 \f[I]false\f[] +.IP "Example:" 4 +.br +* "attachment,inline" +.br +* ["file", "attachment"] + .IP "Description:" 4 Controls how to handle duplicate files in a post. +\f[I]true\f[] +Download duplicates +\f[I]false\f[] +Ignore duplicates +any \f[I]list\f[] or \f[I]string\f[] +Download a duplicate file if its \f[I]type\f[] is in the given list .br -* \f[I]true\f[]: Download duplicates +Ignore it otherwise .br -* \f[I]false\f[]: Ignore duplicates -.SS extractor.kemonoparty.dms +.SS extractor.kemono.dms .IP "Type:" 6 \f[I]bool\f[] @@ -3716,7 +3916,7 @@ Controls how to handle duplicate files in a post. Extract a user's direct messages as \f[I]dms\f[] metadata. -.SS extractor.kemonoparty.announcements +.SS extractor.kemono.announcements .IP "Type:" 6 \f[I]bool\f[] @@ -3727,7 +3927,7 @@ Extract a user's direct messages as \f[I]dms\f[] metadata. Extract a user's announcements as \f[I]announcements\f[] metadata. -.SS extractor.kemonoparty.endpoint +.SS extractor.kemono.endpoint .IP "Type:" 6 \f[I]string\f[] @@ -3762,7 +3962,7 @@ Provides more metadata, but might not return a creator's first/last posts. .br -.SS extractor.kemonoparty.favorites +.SS extractor.kemono.favorites .IP "Type:" 6 \f[I]string\f[] @@ -3775,7 +3975,7 @@ Determines the type of favorites to be downloaded. Available types are \f[I]artist\f[], and \f[I]post\f[]. -.SS extractor.kemonoparty.files +.SS extractor.kemono.files .IP "Type:" 6 \f[I]list\f[] of \f[I]strings\f[] @@ -3788,7 +3988,7 @@ Determines the type and order of files to be downloaded. Available types are \f[I]file\f[], \f[I]attachments\f[], and \f[I]inline\f[]. -.SS extractor.kemonoparty.max-posts +.SS extractor.kemono.max-posts .IP "Type:" 6 \f[I]integer\f[] @@ -3799,7 +3999,7 @@ Available types are \f[I]file\f[], \f[I]attachments\f[], and \f[I]inline\f[]. Limit the number of posts to download. -.SS extractor.kemonoparty.metadata +.SS extractor.kemono.metadata .IP "Type:" 6 \f[I]bool\f[] @@ -3810,7 +4010,7 @@ Limit the number of posts to download. Extract \f[I]username\f[] and \f[I]user_profile\f[] metadata. -.SS extractor.kemonoparty.revisions +.SS extractor.kemono.revisions .IP "Type:" 6 .br * \f[I]bool\f[] @@ -3828,7 +4028,7 @@ Set this to \f[I]"unique"\f[] to filter out duplicate revisions. Note: This requires 1 additional HTTP request per post. -.SS extractor.kemonoparty.order-revisions +.SS extractor.kemono.order-revisions .IP "Type:" 6 \f[I]string\f[] @@ -3876,7 +4076,7 @@ If the selected format is not available, the first in the list gets chosen (usually mp3). -.SS extractor.koharu.cbz +.SS extractor.schalenetwork.cbz .IP "Type:" 6 \f[I]bool\f[] @@ -3890,7 +4090,7 @@ Disabling this option causes a gallery to be downloaded as individual image files. -.SS extractor.koharu.format +.SS extractor.schalenetwork.format .IP "Type:" 6 .br * \f[I]string\f[] @@ -3911,7 +4111,7 @@ Possible formats are .br -.SS extractor.koharu.tags +.SS extractor.schalenetwork.tags .IP "Type:" 6 \f[I]bool\f[] @@ -4000,11 +4200,22 @@ to filter chapters by. 
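The kemono file-handling options above interact, so a combined sketch may help: the files list below is the documented default order, the duplicates list mirrors the example values, and "unique" is the documented revision filter (putting them together like this is illustrative):

.. code:: json

    {
        "extractor": {
            "kemono": {
                "files": ["file", "attachments", "inline"],
                "duplicates": ["file", "attachment"],
                "revisions": "unique"
            }
        }
    }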
.SS extractor.mangadex.ratings .IP "Type:" 6 -\f[I]list\f[] of \f[I]strings\f[] +.br +* \f[I]string\f[] +.br +* \f[I]list\f[] of \f[I]strings\f[] .IP "Default:" 9 \f[I]["safe", "suggestive", "erotica", "pornographic"]\f[] +.IP "Example:" 4 +.br +* "safe" +.br +* "erotica,suggestive" +.br +* ["erotica", "suggestive"] + .IP "Description:" 4 List of acceptable content ratings for returned chapters. @@ -4103,20 +4314,49 @@ Also emit metadata for text-only posts without media content. Your access token, necessary to fetch favorited notes. -.SS extractor.[misskey].renotes +.SS extractor.[misskey].include .IP "Type:" 6 -\f[I]bool\f[] +.br +* \f[I]string\f[] +.br +* \f[I]list\f[] of \f[I]strings\f[] .IP "Default:" 9 -\f[I]false\f[] +\f[I]"notes"\f[] + +.IP "Example:" 4 +.br +* "avatar,background,notes" +.br +* ["avatar", "background", "notes"] .IP "Description:" 4 -Fetch media from renoted notes. +A (comma-separated) list of subcategories to include +when processing a user profile. +Possible values are +\f[I]"info"\f[], +\f[I]"avatar"\f[], +\f[I]"background"\f[], +\f[I]"notes"\f[]. -.SS extractor.[misskey].replies -.IP "Type:" 6 -\f[I]bool\f[] +It is possible to use \f[I]"all"\f[] instead of listing all values separately. + + +.SS extractor.[misskey].renotes +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Fetch media from renoted notes. + + +.SS extractor.[misskey].replies +.IP "Type:" 6 +\f[I]bool\f[] .IP "Default:" 9 \f[I]true\f[] @@ -4138,7 +4378,7 @@ Extract extended \f[I]pool\f[] metadata. Note: Not supported by all \f[I]moebooru\f[] instances. -.SS extractor.naver.videos +.SS extractor.naver-blog.videos .IP "Type:" 6 \f[I]bool\f[] .IP "Default:" 9 \f[I]true\f[] .IP "Description:" 4 Download videos. +.SS extractor.naver-chzzk.offset +.IP "Type:" 6 +\f[I]integer\f[] + +.IP "Default:" 9 +\f[I]0\f[] + +.IP "Description:" 4 +Custom \f[I]offset\f[] starting value when paginating over comments. + + .SS extractor.newgrounds.flash .IP "Type:" 6 \f[I]bool\f[] @@ -4345,6 +4596,33 @@ Extract additional metadata (\f[I]source\f[], \f[I]uploader\f[]) Note: This requires 1 additional HTTP request per post. +.SS extractor.patreon.cursor +.IP "Type:" 6 +.br +* \f[I]bool\f[] +.br +* \f[I]string\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Example:" 4 +"03:eyJ2IjoxLCJjIjoiMzU0NDQ1MjAiLCJ0IjoiIn0=:DTcmjBoVj01o_492YBYqHhqx" + +.IP "Description:" 4 +Controls the position to start the extraction process from. + +\f[I]true\f[] +Start from the beginning. +.br +Log the most recent \f[I]cursor\f[] value when interrupted before reaching the end. +.br +\f[I]false\f[] +Start from the beginning. +any \f[I]string\f[] +Start from the position defined by this value. + + .SS extractor.patreon.files .IP "Type:" 6 \f[I]list\f[] of \f[I]strings\f[] @@ -4403,6 +4681,17 @@ Possible formats: * \f[I]thumbnail_small\f[] (\f[I]"h":100,"w":100\f[]) +.SS extractor.patreon.user.date-max +.IP "Type:" 6 +\f[I]Date\f[] + +.IP "Default:" 9 +\f[I]0\f[] + +.IP "Description:" 4 +Sets the \f[I]Date\f[] to start from. + + .SS extractor.[philomena].api-key .IP "Type:" 6 \f[I]string\f[] @@ -4592,40 +4881,6 @@ by using a third-party tool like \f[I]gppt\f[]. -.SS extractor.pixiv.novel.covers -.IP "Type:" 6 -\f[I]bool\f[] - -.IP "Default:" 9 -\f[I]false\f[] - -.IP "Description:" 4 -Download cover images. - - -.SS extractor.pixiv.novel.embeds -.IP "Type:" 6 -\f[I]bool\f[] - -.IP "Default:" 9 -\f[I]false\f[] - -.IP "Description:" 4 -Download embedded images.
-
-
-.SS extractor.pixiv.novel.full-series
-.IP "Type:" 6
-\f[I]bool\f[]
-
-.IP "Default:" 9
-\f[I]false\f[]
-
-.IP "Description:" 4
-When downloading a novel being part of a series,
-download all novels of that series.
-
-
.SS extractor.pixiv.metadata
.IP "Type:" 6
\f[I]bool\f[]
@@ -4753,6 +5008,125 @@ A value of \f[I]0\f[] means no limit.

Try to fetch \f[I]limit_sanity_level\f[] works via web API.


+.SS extractor.pixiv-novel.comments
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Fetch \f[I]comments\f[] metadata.
+
+Note: This requires 1 or more additional API requests per novel,
+depending on the number of comments.
+
+
+.SS extractor.pixiv-novel.covers
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Download cover images.
+
+
+.SS extractor.pixiv-novel.embeds
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Download embedded images.
+
+
+.SS extractor.pixiv-novel.full-series
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+When downloading a novel that is part of a series,
+download all novels of that series.
+
+
+.SS extractor.pixiv-novel.max-posts
+.IP "Type:" 6
+\f[I]integer\f[]
+
+.IP "Default:" 9
+\f[I]0\f[]
+
+.IP "Description:" 4
+When downloading multiple novels,
+this sets the maximum number of novels to get.
+
+A value of \f[I]0\f[] means no limit.
+
+
+.SS extractor.pixiv-novel.metadata
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Fetch extended \f[I]user\f[] metadata.
+
+
+.SS extractor.pixiv-novel.metadata-bookmark
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+For novels bookmarked by
+\f[I]your own account\f[],
+fetch bookmark tags as \f[I]tags_bookmark\f[] metadata.
+
+Note: This requires 1 additional API request per bookmarked post.
+
+
+.SS extractor.pixiv-novel.refresh-token
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Description:" 4
+The \f[I]refresh-token\f[] value you get
+from running \f[I]gallery-dl oauth:pixiv\f[] (see OAuth_) or
+by using a third-party tool like
+\f[I]gppt\f[].
+
+This can be the same value as \f[I]extractor.pixiv.refresh-token\f[].
+
+
+.SS extractor.pixiv-novel.tags
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]"japanese"\f[]
+
+.IP "Description:" 4
+Controls the \f[I]tags\f[] metadata field.
+
+.br
+* "japanese": List of Japanese tags
+.br
+* "translated": List of translated tags
+.br
+* "original": Unmodified list with both Japanese and translated tags
+
+
.SS extractor.plurk.comments
.IP "Type:" 6
\f[I]bool\f[]
@@ -5027,22 +5401,6 @@ Selects the file format to extract.

When more than one format is given, the first available one is selected.


-.SS extractor.sankaku.id-format
-.IP "Type:" 6
-\f[I]string\f[]
-
-.IP "Default:" 9
-\f[I]"numeric"\f[]
-
-.IP "Description:" 4
-Format of \f[I]id\f[] metadata fields.
-
-.br
-* \f[I]"alphanumeric"\f[] or \f[I]"alnum"\f[]: 11-character alphanumeric IDs (\f[I]y0abGlDOr2o\f[])
-.br
-* \f[I]"numeric"\f[] or \f[I]"legacy"\f[]: numeric IDs (\f[I]360451\f[])
-
-
.SS extractor.sankaku.refresh
.IP "Type:" 6
\f[I]bool\f[]
@@ -5117,6 +5475,17 @@ Download video embeds from external sites.

Download videos.
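Taken together, the new \f[I]pixiv-novel\f[] options documented above can be combined in a config file like this (a sketch; all values are illustrative, not defaults):

.. code:: json

{
"extractor": {
"pixiv-novel": {
"covers" : true,
"embeds" : true,
"full-series": true,
"max-posts" : 50,
"tags" : "translated"
}
}
}
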
+.SS extractor.sexcom.gifs
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]true\f[]
+
+.IP "Description:" 4
+Download animated images as \f[I].gif\f[] instead of \f[I].webp\f[].
+
+
.SS extractor.skeb.article
.IP "Type:" 6
\f[I]bool\f[]
@@ -5500,13 +5869,13 @@ Download user avatars.

.SS extractor.tiktok.user.module
.IP "Type:" 6
-\f[I]string\f[]
+\f[I]Module\f[]

.IP "Default:" 9
\f[I]null\f[]

.IP "Description:" 4
-Name or filesystem path of the \f[I]ytdl\f[] Python module
+The \f[I]ytdl\f[] \f[I]Module\f[]
to extract posts from a \f[I]tiktok\f[] user profile with.

See \f[I]extractor.ytdl.module\f[].
@@ -5858,15 +6227,18 @@ Controls how to handle Cross Site Request Forgery (CSRF) tokens.
.IP "Description:" 4
Controls the position from which to start the extraction process.

+\f[I]true\f[]
+Start from the beginning.
.br
-* \f[I]true\f[]: Start from the beginning.
Log the most recent \f[I]cursor\f[] value when interrupted before reaching the end.
.br
-* \f[I]false\f[]: Start from the beginning.
-.br
-* any \f[I]string\f[]: Start from the position defined by this value.
+\f[I]false\f[]
+Start from the beginning.
+any \f[I]string\f[]
+Start from the position defined by this value.

-Note: A \f[I]cursor\f[] value from one timeline cannot be used with another.
+.IP "Note:" 4
+A \f[I]cursor\f[] value from one timeline cannot be used with another.


.SS extractor.twitter.expand
@@ -6448,6 +6820,30 @@ Use the given values as \f[I]type\f[] parameter for URLs with the specified exte
.br


+.SS extractor.webtoons.banners
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Download the active comic's \f[I]banner\f[].
+
+
+.SS extractor.webtoons.thumbnails
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Download the active episode's \f[I]thumbnail\f[].
+
+Useful for creating CBZ archives with actual source thumbnails.
+
+
.SS extractor.weibo.gifs
.IP "Type:" 6
.br
@@ -6593,6 +6989,17 @@ See

Location of a \f[I]ytdl\f[] configuration file to load options from.


+.SS extractor.ytdl.deprecations
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Allow \f[I]ytdl\f[] to warn about deprecated options and features.
+
+
.SS extractor.ytdl.enabled
.IP "Type:" 6
\f[I]bool\f[]
@@ -6635,12 +7042,25 @@ See
\f[I]true\f[]

.IP "Description:" 4
-Enables the use of \f[I]ytdl's\f[] \f[I]generic\f[] extractor.
+Enables the use of \f[I]ytdl's\f[] \f[I]Generic\f[] extractor.

Set this option to \f[I]"force"\f[]
for the same effect as \f[I]--force-generic-extractor\f[].


+.SS extractor.ytdl.generic-category
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]true\f[]
+
+.IP "Description:" 4
+When using \f[I]ytdl's\f[] \f[I]Generic\f[] extractor,
+change category to \f[I]"ytdl-generic"\f[] and
+set subcategory to the input URL's domain.
+
+
.SS extractor.ytdl.logging
.IP "Type:" 6
\f[I]bool\f[]
@@ -6658,10 +7078,7 @@ Note: Set \f[I]quiet\f[] and \f[I]no_warnings\f[] in

.SS extractor.ytdl.module
.IP "Type:" 6
-.br
-* \f[I]string\f[]
-.br
-* \f[I]Path\f[]
+\f[I]Module\f[]

.IP "Default:" 9
\f[I]null\f[]
@@ -6673,7 +7090,7 @@ Note: Set \f[I]quiet\f[] and \f[I]no_warnings\f[] in
* "/home/user/.local/lib/python3.13/site-packages/youtube_dl"

.IP "Description:" 4
-Name or filesystem path of the \f[I]ytdl\f[] Python module to import.
+The \f[I]ytdl\f[] \f[I]Module\f[] to import.

Setting this to \f[I]null\f[] will try to import \f[I]"yt_dlp"\f[]
followed by \f[I]"youtube_dl"\f[] as fallback.
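For example, the \f[I]ytdl\f[] options above could be set as follows (a sketch; \f[I]"yt_dlp"\f[] is the module name the documentation itself falls back to, and a filesystem path would work as well):

.. code:: json

{
"extractor": {
"ytdl": {
"enabled": true,
"module" : "yt_dlp",
"generic-category": true,
"deprecations" : false
}
}
}
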
@@ -6692,7 +7109,6 @@ followed by \f[I]"youtube_dl"\f[] as fallback.
"merge_output_format": "mkv"
}

-
.IP "Description:" 4
Additional options passed directly to the \f[I]YoutubeDL\f[] constructor.

@@ -6936,13 +7352,23 @@ Set this option to \f[I]null\f[] to disable this indicator.

.SS downloader.*.rate
.IP "Type:" 6
-\f[I]string\f[]
+.br
+* \f[I]string\f[]
+.br
+* \f[I]list\f[] with 2 \f[I]strings\f[]

.IP "Default:" 9
\f[I]null\f[]

.IP "Example:" 4
-"32000", "500k", "2.5M"
+.br
+* "32000"
+.br
+* "500k"
+.br
+* "1M - 2.5M"
+.br
+* ["1M", "2.5M"]

.IP "Description:" 4
Maximum download rate in bytes per second.

@@ -6951,6 +7377,10 @@ Possible values are valid integer or floating-point numbers
optionally followed by one of \f[I]k\f[], \f[I]m\f[], \f[I]g\f[], \f[I]t\f[], or \f[I]p\f[].
These suffixes are case-insensitive.

+If given as a range, the maximum download rate
+will be randomly chosen before each download
+(see \f[I]random.randint()\f[]).
+

.SS downloader.*.retries
.IP "Type:" 6
@@ -7123,6 +7553,21 @@ Fail a download when a file does not pass
instead of downloading a potentially broken file.


+.SS downloader.http.validate-html
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]true\f[]
+
+.IP "Description:" 4
+Check for unexpected HTML responses.
+
+Fail file downloads with a \f[I]text/html\f[]
+\f[I]Content-Type header\f[]
+when expecting a media file instead.
+
+
.SS downloader.ytdl.cmdline-args
.IP "Type:" 6
.br
@@ -7156,6 +7601,17 @@ See

Location of a \f[I]ytdl\f[] configuration file to load options from.


+.SS downloader.ytdl.deprecations
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Allow \f[I]ytdl\f[] to warn about deprecated options and features.
+
+
.SS downloader.ytdl.format
.IP "Type:" 6
\f[I]string\f[]
@@ -7204,10 +7660,7 @@ Note: Set \f[I]quiet\f[] and \f[I]no_warnings\f[] in

.SS downloader.ytdl.module
.IP "Type:" 6
-.br
-* \f[I]string\f[]
-.br
-* \f[I]Path\f[]
+\f[I]Module\f[]

.IP "Default:" 9
\f[I]null\f[]
@@ -7219,7 +7672,7 @@ Note: Set \f[I]quiet\f[] and \f[I]no_warnings\f[] in
* "/home/user/.local/lib/python3.13/site-packages/youtube_dl"

.IP "Description:" 4
-Name or filesystem path of the \f[I]ytdl\f[] Python module to import.
+The \f[I]ytdl\f[] \f[I]Module\f[] to import.

Setting this to \f[I]null\f[] will try to import \f[I]"yt_dlp"\f[]
followed by \f[I]"youtube_dl"\f[] as fallback.
@@ -7301,8 +7754,8 @@ Controls the output string format and status indicators.
.br
* \f[I]"color"\f[]: Suitable for terminals that understand ANSI escape codes and colors
.br
-* \f[I]"auto"\f[]: \f[I]"terminal"\f[] on Windows with \f[I]output.ansi\f[] disabled,
-\f[I]"color"\f[] otherwise.
+* \f[I]"auto"\f[]: \f[I]"pipe"\f[] if not on a TTY, \f[I]"terminal"\f[] on Windows with
+\f[I]output.ansi\f[] disabled, \f[I]"color"\f[] otherwise.

It is possible to use custom output format strings
.br
@@ -7759,6 +8212,26 @@ the files' metadata as well as
\f[I]{_path}\f[], \f[I]{_directory}\f[], and \f[I]{_filename}\f[].


+.SS exec.commands
+.IP "Type:" 6
+\f[I]list\f[] of \f[I]commands\f[]
+
+.IP "Example:" 4
+.. code:: json
+
+[
+["echo", "{user[account]}", "{id}"],
+["magick", "convert", "{_path}", "{_path.rpartition('.')[0]}.png"],
+"rm {}"
+]
+
+.IP "Description:" 4
+Multiple \f[I]commands\f[] to run in succession.
+
+All \f[I]commands\f[] after the first one to exit with a non-zero
+status will not be run.
+
+
.SS exec.event
.IP "Type:" 6
.br
@@ -7775,6 +8248,27 @@ The event(s) for which \f[I]exec.command\f[] is run.
See \f[I]metadata.event\f[] for a list of available events.


+.SS exec.session
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Start subprocesses in a new session.
+
+On Windows, this means passing
+\f[I]CREATE_NEW_PROCESS_GROUP\f[]
+as a \f[I]creationflags\f[] argument to
+\f[I]subprocess.Popen\f[].
+
+On POSIX systems, this means enabling the
+\f[I]start_new_session\f[] argument of
+\f[I]subprocess.Popen\f[]
+to have it call \f[I]setsid()\f[].
+
+
.SS hash.chunk-size
.IP "Type:" 6
\f[I]integer\f[]
@@ -8385,16 +8879,18 @@ See \f[I]metadata.event\f[] for a list of available events.
.br
* "my_module:generate_text"
.br
-* "~/.local/share/gdl-utils.py:resize"
+* "~/.local/share/gdl_utils.py:resize"

.IP "Description:" 4
The Python function to call.

-This function is specified as \f[I]<module>:<function>\f[]
-and gets called with the current metadata dict as argument.
+This function is specified as \f[I]<module>:<function>\f[], where
+.br
+\f[I]<module>\f[] is a \f[I]Module\f[] and
+.br
+\f[I]<function>\f[] is the name of the function in that module.

-\f[I]module\f[] is either an importable Python module name
-or the \f[I]Path\f[] to a .py file,
+It gets called with the current metadata dict as argument.


.SS rename.from
@@ -8745,29 +9241,133 @@ Note: \f[I]null\f[] references internal extractors defined in
or by \f[I]extractor.modules\f[].


-.SS globals
+.SS extractor.category-map
.IP "Type:" 6
.br
-* \f[I]Path\f[]
+* \f[I]object\f[] (category -> category)
.br
* \f[I]string\f[]

.IP "Example:" 4
-.br
-* "~/.local/share/gdl-globals.py"
-.br
-* "gdl-globals"
+.. code:: json
+
+{
+"danbooru": "booru",
+"gelbooru": "booru"
+}

.IP "Description:" 4
-Path to or name of an
-.br
-\f[I]importable\f[]
-Python module,
-whose namespace,
+A JSON object mapping category names to their replacements.
+
+Special values:
+
.br
+* \f[I]"compat"\f[]
+.. code:: json
+
+{
+"coomer" : "coomerparty",
+"kemono" : "kemonoparty",
+"schalenetwork": "koharu",
+"naver-chzzk" : "chzzk",
+"naver-blog" : "naver",
+"naver-webtoon": "naverwebtoon",
+"pixiv-novel" : "pixiv",
+"pixiv-novel:novel" : ["pixiv", "novel"],
+"pixiv-novel:user" : ["pixiv", "novel-user"],
+"pixiv-novel:series" : ["pixiv", "novel-series"],
+"pixiv-novel:bookmark": ["pixiv", "novel-bookmark"]
+}
+
+
+.SS extractor.config-map
+.IP "Type:" 6
+\f[I]object\f[] (category -> category)
+
+.IP "Default:" 9
+.. code:: json
+
+{
+"coomerparty" : "coomer",
+"kemonoparty" : "kemono",
+"koharu" : "schalenetwork",
+"chzzk" : "naver-chzzk",
+"naver" : "naver-blog",
+"naverwebtoon": "naver-webtoon",
+"pixiv" : "pixiv-novel"
+}
+
+.IP "Description:" 4
+Duplicate the configuration settings of extractor categories
+to other names.
+
+For example, a \f[I]"naver": "naver-blog"\f[] key-value pair will make all
+\f[I]naver\f[] config settings available for \f[I]naver-blog\f[] extractors as well.
+
+
+.SS jinja.environment
+.IP "Type:" 6
+\f[I]object\f[] (name -> value)
+
+.IP "Example:" 4
+.. code:: json
+
+{
+"variable_start_string": "(((",
+"variable_end_string" : ")))",
+"keep_trailing_newline": true
+}
+
+.IP "Description:" 4
+Initialization parameters for the \f[I]jinja\f[]
+\f[I]Environment\f[]
+object.
+
+
+.SS jinja.policies
+.IP "Type:" 6
+\f[I]object\f[] (name -> value)
+
+.IP "Example:" 4
+.. code:: json
+
+{
+"urlize.rel": "nofollow noopener",
+"ext.i18n.trimmed": true
+}
+
+.IP "Description:" 4
+\f[I]jinja\f[]
+\f[I]Policies\f[].
+
+
+.SS jinja.filters
+.IP "Type:" 6
+\f[I]Module\f[]
+
+.IP "Description:" 4
+A Python \f[I]Module\f[] containing custom \f[I]jinja\f[]
+\f[I]filters\f[].
+
+
+.SS jinja.tests
+.IP "Type:" 6
+\f[I]Module\f[]
+
+.IP "Description:" 4
+A Python \f[I]Module\f[] containing custom \f[I]jinja\f[]
+\f[I]tests\f[].
+
+
+.SS globals
+.IP "Type:" 6
+\f[I]Module\f[]
+
+.IP "Description:" 4
+A Python \f[I]Module\f[] whose namespace,
in addition to the \f[I]GLOBALS\f[] dict in \f[I]util.py\f[],
-gets used as \f[I]globals parameter\f[] for compiled Python expressions.
+is used as \f[I]globals parameter\f[] for compiled Python expressions.


.SS cache.file
@@ -8851,6 +9451,28 @@ The list of signal names to ignore, i.e. set
\f[I]SIG_IGN\f[]
as signal handler for.


+.SS signals-actions
+.IP "Type:" 6
+\f[I]object\f[] (signal -> \f[I]Action(s)\f[])
+
+.IP "Example:" 4
+.. code:: json
+
+{
+"SIGINT" : "flag download = stop",
+"SIGUSR1": [
+"print Received SIGUSR1",
+"exec notify.sh",
+"exit 127"
+]
+}
+
+.IP "Description:" 4
+\f[I]Action(s)\f[] to perform when a
+\f[I]signal\f[]
+is received.
+
+
.SS subconfigs
.IP "Type:" 6
\f[I]list\f[] of \f[I]Path\f[]
@@ -9089,6 +9711,42 @@ it will be randomly chosen with uniform distribution such that \f[I]a <= N <= b\f[]
value (\f[I]"2.85"\f[]) or a range (\f[I]"1.5-3.0"\f[]).


+.SS Module
+.IP "Type:" 6
+.br
+* \f[I]string\f[]
+.br
+* \f[I]Path\f[]
+
+.IP "Example:" 4
+.br
+* "gdl_utils"
+.br
+* "~/.local/share/gdl/"
+.br
+* "~/.local/share/gdl_utils.py"
+
+.IP "Description:" 4
+A Python
+\f[I]Module\f[].
+
+This can be one of
+
+.br
+* the name of an
+\f[I]importable\f[]
+Python module
+.br
+* the \f[I]Path\f[] to a Python
+\f[I]package\f[]
+.br
+* the \f[I]Path\f[] to a .py file
+
+See
+\f[I]Python/Modules\f[]
+for details.
+
+
.SS Path
.IP "Type:" 6
.br
@@ -9114,12 +9772,22 @@ Simple \f[I]tilde expansion\f[] and \f[I]environment variable expansion\f[]
is supported.

-In Windows environments, backslashes (\f[I]"\\"\f[]) can, in addition to
-forward slashes (\f[I]"/"\f[]), be used as path separators.
-Because backslashes are JSON's escape character,
-they themselves have to be escaped.
-The path \f[I]C:\\path\\to\\file.ext\f[] has therefore to be written as
-\f[I]"C:\\\\path\\\\to\\\\file.ext"\f[] if you want to use backslashes.
+.IP "Note:" 4
+In Windows environments,
+both backslashes \f[I]\\\f[] and forward slashes \f[I]/\f[]
+can be used as path separators.
+
+However, since backslashes are JSON's escape character,
+they themselves must be escaped as \f[I]\\\\\f[].
+
+For example, a path like \f[I]C:\\path\\to\\file.ext\f[] has to be specified as
+
+.br
+* \f[I]"C:\\\\path\\\\to\\\\file.ext"\f[] when using backslashes
+.br
+* \f[I]"C:/path/to/file.ext"\f[] when using forward slashes
+
+in a JSON file.


.SS Logging Configuration
@@ -9265,6 +9933,81 @@ Convert Pixiv Ugoira to WebM using \f[I]ffmpeg\f[]

Store files in a ZIP archive


+.SS Action
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Example:" 4
+.br
+* "exit"
+.br
+* "print Hello World"
+.br
+* "raise AbortExtraction an error occurred"
+.br
+* "flag file = terminate"
+
+.IP "Description:" 4
+An \f[I]Action\f[] is parsed as Action Type
+followed by (optional) arguments.
+
+It is possible to specify more than one \f[I]action\f[]
+by providing them as a \f[I]list\f[]: \f[I]["<action1>", "<action2>", …]\f[]
+
+Supported Action Types:
+
+\f[I]status\f[]:
+Modify job exit status.
+.br
+Expected syntax is \f[I]<operator> <value>\f[] (e.g. \f[I]= 100\f[]).
+.br
+
+Supported operators are
+\f[I]=\f[] (assignment),
+\f[I]&\f[] (bitwise AND),
+\f[I]|\f[] (bitwise OR),
+\f[I]^\f[] (bitwise XOR).
+\f[I]level\f[]:
+Modify severity level of the current logging message.
+.br
+Can be one of \f[I]debug\f[], \f[I]info\f[], \f[I]warning\f[], \f[I]error\f[] or an integer value.
+.br
+\f[I]print\f[]:
+Write argument to stdout.
+\f[I]exec\f[]:
+Run a shell command.
+\f[I]abort\f[]:
+Stop the current extractor run.
+\f[I]terminate\f[]:
+Stop the current extractor run, including parent extractors.
+\f[I]restart\f[]:
+Restart the current extractor run.
+\f[I]raise\f[]:
+Raise an exception.
+
+This can be an exception defined in
+\f[I]exception.py\f[]
+or a
+\f[I]built-in exception\f[]
+(e.g. \f[I]ZeroDivisionError\f[]).
+\f[I]flag\f[]:
+Set a \f[I]flag\f[].
+
+Expected syntax is \f[I]<target> [= <value>]\f[] (e.g. \f[I]post = stop\f[]).
+.br
+\f[I]<target>\f[] can be one of \f[I]file\f[], \f[I]post\f[], \f[I]child\f[], \f[I]download\f[]
+.br
+\f[I]<value>\f[] can be one of \f[I]stop\f[], \f[I]abort\f[], \f[I]terminate\f[], \f[I]restart\f[] (default \f[I]stop\f[])
+.br
+\f[I]wait\f[]:
+Sleep for a given \f[I]Duration\f[] or
+.br
+wait until Enter is pressed when no argument was given.
+.br
+\f[I]exit\f[]:
+Exit the program with the given argument as exit status.
+
+
.SH BUGS
https://github.com/mikf/gallery-dl/issues

diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index eac3390..6541030 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -28,6 +28,7 @@
"retry-codes" : [],
"timeout" : 30.0,
"verify" : true,
+ "truststore" : false,

"download" : true,
"fallback" : true,
@@ -87,6 +88,18 @@
"jfi" : "jpg"
},

+ "category-map": {},
+ "config-map": {
+ "coomerparty" : "coomer",
+ "kemonoparty" : "kemono",
+ "koharu" : "schalenetwork",
+ "chzzk" : "naver-chzzk",
+ "naver" : "naver-blog",
+ "naverwebtoon": "naver-webtoon",
+ "pixiv" : "pixiv-novel"
+ },
+
+
"#": "===============================================================",
"#": "==== Site-specific Extractor Options ====================",

@@ -110,6 +123,7 @@
{
"external" : false,
"max-posts": null,
+ "mviews" : true,
"previews" : false,
"videos" : true,

@@ -170,7 +184,7 @@
},
"bunkr":
{
- "endpoint": "/api/_001",
+ "endpoint": "/api/_001_v2",
"tlds": false
},
"cien":
@@ -185,13 +199,13 @@
"api" : "trpc",
"files" : ["image"],
- "include" : ["user-models", "user-posts"],
+ "include" : ["user-images", "user-videos"],
"metadata": false,
"nsfw" : true,
"quality" : "original=true",
"quality-videos": "quality=100"
},
- "coomerparty":
+ "coomer":
{
"username": "",
"password": "",
@@ -211,6 +225,10 @@
{
"domain": null
},
+ "dankefuerslesen":
+ {
+ "zip": false
+ },
"deviantart":
{
"client-id" : null,
@@ -245,6 +263,18 @@
"subfolders": true
}
},
+ "discord":
+ {
+ "embeds" : ["image", "gifv", "video"],
+ "threads": true,
+ "token" : ""
+ },
+ "dynastyscans":
+ {
+ "anthology": {
+ "metadata": false
+ }
+ },
"exhentai":
{
"username": "",
@@ -255,19 +285,29 @@
"domain" : "auto",
"fav" : null,
"gp" : "resized",
- "limits" : null,
"metadata": false,
"original": true,
"source" : null,
"tags" : false,
+ "limits" : null,
+ "limits-action" : "stop",
"fallback-retries": 2
},
+ "facebook":
+ {
+ "cookies": null,
+
+ "author-followups": false,
+ "include": "photos",
+ "videos" : true
+ },
"fanbox":
{
"cookies" : null,

"comments": false,
"embeds" : true,
+ "fee-max" : null,
"metadata": false
},
"flickr":
@@ -307,6 +347,11 @@
{
"enabled": false
},
+ "girlswithmuscle":
+ {
+ "username": "",
+ "password": ""
+ },
"gofile":
{
"api-token": null, @@ -374,9 +419,17 @@ "itaku": { "sleep-request": "0.5-1.5", - "videos": true + "include": "gallery", + "videos" : true }, - "kemonoparty": + "iwara": + { + "username": "", + "password": "", + + "include": ["user-images", "user-images"] + }, + "kemono": { "username": "", "password": "", @@ -399,20 +452,15 @@ "covers": false, "format": "mp3" }, - "koharu": - { - "username": "", - "password": "", - "sleep-request": "0.5-1.5", - - "cbz" : true, - "format": ["0", "1600", "1280", "980", "780"], - "tags" : false - }, "luscious": { "gif": false }, + "madokami": + { + "username": "", + "password": "" + }, "mangadex": { "client-id" : "", @@ -430,10 +478,14 @@ "username": "", "password": "" }, - "naver": + "naver-blog": { "videos": true }, + "naver-chzzk": + { + "offset": 0 + }, "newgrounds": { "username": "", @@ -463,8 +515,13 @@ { "cookies": null, + "cursor" : true, "files" : ["images", "image_large", "attachments", "postfile", "content"], - "format-images": "download_url" + "format-images": "download_url", + + "user": { + "date-max" : 0 + } }, "pexels": { @@ -504,7 +561,17 @@ "metadata-bookmark": false, "sanity" : true, "tags" : "japanese", - "ugoira" : true, + "ugoira" : true + }, + "pixiv-novel": + { + "refresh-token": null, + + "comments" : false, + "max-posts": null, + "metadata" : false, + "metadata-bookmark": false, + "tags" : "japanese", "covers" : false, "embeds" : false, @@ -555,6 +622,9 @@ }, "rule34xyz": { + "username": "", + "password": "", + "format": ["10", "40", "41", "2"] }, "sankaku": @@ -562,7 +632,6 @@ "username": "", "password": "", - "id-format": "numeric", "refresh" : false, "tags" : false }, @@ -571,12 +640,26 @@ "embeds": false, "videos": true }, + "schalenetwork": + { + "username": "", + "password": "", + "sleep-request": "0.5-1.5", + + "cbz" : true, + "format": ["0", "1600", "1280", "980", "780"], + "tags" : false + }, "scrolller": { "username": "", "password": "", "sleep-request": "0.5-1.5" }, + "sexcom": + { + "gifs": true + }, "skeb": { "article" : false, @@ -755,7 +838,9 @@ { "sleep-request": "0.5-1.5", - "quality": "original" + "quality" : "original", + "banners" : false, + "thumbnails": false }, "weebcentral": { @@ -780,9 +865,11 @@ { "cmdline-args": null, "config-file" : null, + "deprecations": false, "enabled" : false, "format" : null, "generic" : true, + "generic-category": true, "logging" : true, "module" : null, "raw-options" : null @@ -889,6 +976,7 @@ "misskey": { "access-token": null, + "include" : ["notes"], "renotes" : false, "replies" : true }, @@ -987,13 +1075,15 @@ "headers" : null, "retry-codes" : [], "sleep-429" : 60.0, - "validate" : true + "validate" : true, + "validate-html" : true }, "ytdl": { "cmdline-args" : null, "config-file" : null, + "deprecations" : false, "enabled" : true, "format" : null, "forward-cookies": true, diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO index c022f84..550241f 100644 --- a/gallery_dl.egg-info/PKG-INFO +++ b/gallery_dl.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.4 Name: gallery_dl -Version: 1.29.7 +Version: 1.30.2 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Download-URL: https://github.com/mikf/gallery-dl/releases/latest @@ -18,10 +18,6 @@ Classifier: Operating System :: OS Independent Classifier: Programming Language :: Python Classifier: Programming Language :: Python :: 3 Classifier: Programming Language :: Python :: 3 :: Only -Classifier: Programming 
Language :: Python :: 3.4 -Classifier: Programming Language :: Python :: 3.5 -Classifier: Programming Language :: Python :: 3.6 -Classifier: Programming Language :: Python :: 3.7 Classifier: Programming Language :: Python :: 3.8 Classifier: Programming Language :: Python :: 3.9 Classifier: Programming Language :: Python :: 3.10 @@ -33,11 +29,18 @@ Classifier: Programming Language :: Python :: Implementation :: PyPy Classifier: Topic :: Internet :: WWW/HTTP Classifier: Topic :: Multimedia :: Graphics Classifier: Topic :: Utilities -Requires-Python: >=3.4 +Requires-Python: >=3.8 License-File: LICENSE Requires-Dist: requests>=2.11.0 Provides-Extra: video -Requires-Dist: youtube-dl; extra == "video" +Requires-Dist: yt-dlp; extra == "video" +Provides-Extra: extra +Requires-Dist: requests[socks]; extra == "extra" +Requires-Dist: yt-dlp[default]; extra == "extra" +Requires-Dist: pyyaml; extra == "extra" +Requires-Dist: toml; python_version < "3.11" and extra == "extra" +Requires-Dist: truststore; python_version >= "3.10" and extra == "extra" +Requires-Dist: secretstorage; sys_platform == "linux" and extra == "extra" Dynamic: author Dynamic: author-email Dynamic: classifier @@ -75,7 +78,7 @@ and powerful `filenaming capabilities `__ +- `Windows `__ (Requires `Microsoft Visual C++ Redistributable Package (x86) `__) -- `Linux `__ +- `Linux `__ Nightly Builds @@ -517,7 +522,7 @@ To authenticate with a ``mastodon`` instance, run *gallery-dl* with .. _Python: https://www.python.org/downloads/ .. _PyPI: https://pypi.org/ .. _pip: https://pip.pypa.io/en/stable/ -.. _Requests: https://requests.readthedocs.io/en/master/ +.. _Requests: https://requests.readthedocs.io/en/latest/ .. _FFmpeg: https://www.ffmpeg.org/ .. _mkvmerge: https://www.matroska.org/downloads/mkvtoolnix.html .. _yt-dlp: https://github.com/yt-dlp/yt-dlp @@ -530,10 +535,12 @@ To authenticate with a ``mastodon`` instance, run *gallery-dl* with .. _toml: https://pypi.org/project/toml/ .. _SecretStorage: https://pypi.org/project/SecretStorage/ .. _Psycopg: https://www.psycopg.org/ +.. _truststore: https://truststore.readthedocs.io/en/latest/ +.. _Jinja: https://jinja.palletsprojects.com/ .. _Snapd: https://docs.snapcraft.io/installing-snapd .. _OAuth: https://en.wikipedia.org/wiki/OAuth .. _Chocolatey: https://chocolatey.org/install -.. _Scoop: https://scoop.sh +.. _Scoop: https://scoop.sh/ .. 
|pypi| image:: https://img.shields.io/pypi/v/gallery-dl.svg :target: https://pypi.org/project/gallery-dl/ diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt index 5dc17bd..8ae28f6 100644 --- a/gallery_dl.egg-info/SOURCES.txt +++ b/gallery_dl.egg-info/SOURCES.txt @@ -76,10 +76,12 @@ gallery_dl/extractor/catbox.py gallery_dl/extractor/chevereto.py gallery_dl/extractor/cien.py gallery_dl/extractor/civitai.py +gallery_dl/extractor/comick.py gallery_dl/extractor/comicvine.py gallery_dl/extractor/common.py gallery_dl/extractor/cyberdrop.py gallery_dl/extractor/danbooru.py +gallery_dl/extractor/dankefuerslesen.py gallery_dl/extractor/desktopography.py gallery_dl/extractor/deviantart.py gallery_dl/extractor/directlink.py @@ -104,6 +106,8 @@ gallery_dl/extractor/gelbooru.py gallery_dl/extractor/gelbooru_v01.py gallery_dl/extractor/gelbooru_v02.py gallery_dl/extractor/generic.py +gallery_dl/extractor/girlsreleased.py +gallery_dl/extractor/girlswithmuscle.py gallery_dl/extractor/gofile.py gallery_dl/extractor/hatenablog.py gallery_dl/extractor/hentai2read.py @@ -130,13 +134,14 @@ gallery_dl/extractor/instagram.py gallery_dl/extractor/issuu.py gallery_dl/extractor/itaku.py gallery_dl/extractor/itchio.py +gallery_dl/extractor/iwara.py gallery_dl/extractor/jschan.py gallery_dl/extractor/kabeuchi.py gallery_dl/extractor/keenspot.py -gallery_dl/extractor/kemonoparty.py +gallery_dl/extractor/kemono.py gallery_dl/extractor/khinsider.py -gallery_dl/extractor/koharu.py gallery_dl/extractor/komikcast.py +gallery_dl/extractor/leakgallery.py gallery_dl/extractor/lensdump.py gallery_dl/extractor/lexica.py gallery_dl/extractor/lightroom.py @@ -145,13 +150,13 @@ gallery_dl/extractor/lofter.py gallery_dl/extractor/lolisafe.py gallery_dl/extractor/luscious.py gallery_dl/extractor/lynxchan.py +gallery_dl/extractor/madokami.py gallery_dl/extractor/mangadex.py gallery_dl/extractor/mangafox.py gallery_dl/extractor/mangahere.py gallery_dl/extractor/manganelo.py gallery_dl/extractor/mangapark.py gallery_dl/extractor/mangaread.py -gallery_dl/extractor/mangasee.py gallery_dl/extractor/mangoxo.py gallery_dl/extractor/mastodon.py gallery_dl/extractor/message.py @@ -160,7 +165,8 @@ gallery_dl/extractor/moebooru.py gallery_dl/extractor/motherless.py gallery_dl/extractor/myhentaigallery.py gallery_dl/extractor/myportfolio.py -gallery_dl/extractor/naver.py +gallery_dl/extractor/naverblog.py +gallery_dl/extractor/naverchzzk.py gallery_dl/extractor/naverwebtoon.py gallery_dl/extractor/nekohouse.py gallery_dl/extractor/newgrounds.py @@ -170,6 +176,7 @@ gallery_dl/extractor/nitter.py gallery_dl/extractor/noop.py gallery_dl/extractor/nozomi.py gallery_dl/extractor/nsfwalbum.py +gallery_dl/extractor/nudostar.py gallery_dl/extractor/oauth.py gallery_dl/extractor/paheal.py gallery_dl/extractor/patreon.py @@ -190,10 +197,12 @@ gallery_dl/extractor/poringa.py gallery_dl/extractor/pornhub.py gallery_dl/extractor/pornpics.py gallery_dl/extractor/postmill.py +gallery_dl/extractor/rawkuma.py gallery_dl/extractor/reactor.py gallery_dl/extractor/readcomiconline.py gallery_dl/extractor/realbooru.py gallery_dl/extractor/recursive.py +gallery_dl/extractor/redbust.py gallery_dl/extractor/reddit.py gallery_dl/extractor/redgifs.py gallery_dl/extractor/rule34us.py @@ -202,6 +211,7 @@ gallery_dl/extractor/rule34xyz.py gallery_dl/extractor/saint.py gallery_dl/extractor/sankaku.py gallery_dl/extractor/sankakucomplex.py +gallery_dl/extractor/schalenetwork.py gallery_dl/extractor/scrolller.py gallery_dl/extractor/seiga.py 
gallery_dl/extractor/senmanga.py diff --git a/gallery_dl.egg-info/requires.txt b/gallery_dl.egg-info/requires.txt index 44dd863..531a762 100644 --- a/gallery_dl.egg-info/requires.txt +++ b/gallery_dl.egg-info/requires.txt @@ -1,4 +1,18 @@ requests>=2.11.0 +[extra] +requests[socks] +yt-dlp[default] +pyyaml + +[extra:python_version < "3.11"] +toml + +[extra:python_version >= "3.10"] +truststore + +[extra:sys_platform == "linux"] +secretstorage + [video] -youtube-dl +yt-dlp diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index ec882c3..9ab61e5 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2023 Mike Fährmann +# Copyright 2014-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -11,7 +11,7 @@ import logging from . import version, config, option, output, extractor, job, util, exception __author__ = "Mike Fährmann" -__copyright__ = "Copyright 2014-2023 Mike Fährmann" +__copyright__ = "Copyright 2014-2025 Mike Fährmann" __license__ = "GPLv2" __maintainer__ = "Mike Fährmann" __email__ = "mike_faehrmann@web.de" @@ -78,8 +78,7 @@ def main(): output.configure_standard_streams() # signals - signals = config.get((), "signals-ignore") - if signals: + if signals := config.get((), "signals-ignore"): import signal if isinstance(signals, str): signals = signals.split(",") @@ -90,6 +89,10 @@ def main(): else: signal.signal(signal_num, signal.SIG_IGN) + if signals := config.get((), "signals-actions"): + from . import actions + actions.parse_signals(signals) + # enable ANSI escape sequences on Windows if util.WINDOWS and config.get(("output",), "ansi", output.COLORS): from ctypes import windll, wintypes, byref @@ -118,14 +121,12 @@ def main(): util.compile_expression = util.compile_expression_defaultdict # format string separator - separator = config.get((), "format-separator") - if separator: + if separator := config.get((), "format-separator"): from . import formatter formatter._SEPARATOR = separator # eval globals - path = config.get((), "globals") - if path: + if path := config.get((), "globals"): util.GLOBALS.update(util.import_file(path).__dict__) # loglevels @@ -137,13 +138,12 @@ def main(): import platform import requests - extra = "" if util.EXECUTABLE: - extra = " - Executable ({})".format(version.__variant__) + extra = f" - Executable ({version.__variant__})" + elif git_head := util.git_head(): + extra = " - Git HEAD: " + git_head else: - git_head = util.git_head() - if git_head: - extra = " - Git HEAD: " + git_head + extra = "" log.debug("Version %s%s", __version__, extra) log.debug("Python %s - %s", @@ -157,10 +157,40 @@ def main(): log.debug("Configuration Files %s", config._files) + if args.clear_cache: + from . import cache + log = logging.getLogger("cache") + cnt = cache.clear(args.clear_cache) + + if cnt is None: + log.error("Database file not available") + return 1 + + log.info("Deleted %d entr%s from '%s'", + cnt, "y" if cnt == 1 else "ies", cache._path()) + return 0 + + if args.config: + if args.config == "init": + return config.initialize() + elif args.config == "status": + return config.status() + else: + return config.open_extern() + if args.print_traffic: import requests requests.packages.urllib3.connection.HTTPConnection.debuglevel = 1 + if args.update: + from . 
import update + extr = update.UpdateExtractor.from_url("update:" + args.update) + ujob = update.UpdateJob(extr) + return ujob.run() + + # category renaming + config.remap_categories() + # extractor modules modules = config.get(("extractor",), "modules") if modules is not None: @@ -199,13 +229,7 @@ def main(): else: extractor._module_iter = iter(modules[0]) - if args.update: - from . import update - extr = update.UpdateExtractor.from_url("update:" + args.update) - ujob = update.UpdateJob(extr) - return ujob.run() - - elif args.list_modules: + if args.list_modules: extractor.modules.append("") sys.stdout.write("\n".join(extractor.modules)) @@ -228,31 +252,8 @@ def main(): extr.example, )) - elif args.clear_cache: - from . import cache - log = logging.getLogger("cache") - cnt = cache.clear(args.clear_cache) - - if cnt is None: - log.error("Database file not available") - return 1 - else: - log.info( - "Deleted %d %s from '%s'", - cnt, "entry" if cnt == 1 else "entries", cache._path(), - ) - - elif args.config: - if args.config == "init": - return config.initialize() - elif args.config == "status": - return config.status() - else: - return config.open_extern() - else: - input_files = config.get((), "input-files") - if input_files: + if input_files := config.get((), "input-files"): for input_file in input_files: if isinstance(input_file, str): input_file = (input_file, None) @@ -271,8 +272,7 @@ def main(): jobtype = job.UrlJob jobtype.maxdepth = args.list_urls if config.get(("output",), "fallback", True): - jobtype.handle_url = \ - staticmethod(jobtype.handle_url_fallback) + jobtype.handle_url = jobtype.handle_url_fallback elif args.dump_json: jobtype = job.DataJob jobtype.resolve = args.dump_json - 1 @@ -283,17 +283,15 @@ def main(): input_manager.log = input_log = logging.getLogger("inputfile") # unsupported file logging handler - handler = output.setup_logging_handler( - "unsupportedfile", fmt="{message}") - if handler: + if handler := output.setup_logging_handler( + "unsupportedfile", fmt="{message}"): ulog = job.Job.ulog = logging.getLogger("unsupported") ulog.addHandler(handler) ulog.propagate = False # error file logging handler - handler = output.setup_logging_handler( - "errorfile", fmt="{message}", mode="a") - if handler: + if handler := output.setup_logging_handler( + "errorfile", fmt="{message}", mode="a"): elog = input_manager.err = logging.getLogger("errorfile") elog.addHandler(handler) elog.propagate = False @@ -315,6 +313,24 @@ def main(): args.loglevel < logging.ERROR: input_manager.progress(pformat) + if catmap := config.interpolate(("extractor",), "category-map"): + if catmap == "compat": + catmap = { + "coomer" : "coomerparty", + "kemono" : "kemonoparty", + "schalenetwork": "koharu", + "naver-blog" : "naver", + "naver-chzzk" : "chzzk", + "naver-webtoon": "naverwebtoon", + "pixiv-novel" : "pixiv", + "pixiv-novel:novel" : ("pixiv", "novel"), + "pixiv-novel:user" : ("pixiv", "novel-user"), + "pixiv-novel:series" : ("pixiv", "novel-series"), + "pixiv-novel:bookmark": ("pixiv", "novel-bookmark"), + } + from .extractor import common + common.CATEGORY_MAP = catmap + # process input URLs retval = 0 for url in input_manager: @@ -335,13 +351,11 @@ def main(): else: input_manager.success() - except exception.StopExtraction: - pass - except exception.TerminateExtraction: - pass except exception.RestartExtraction: log.debug("Restarting '%s'", url) continue + except exception.ControlException: + pass except exception.NoExtractorError: log.error("Unsupported URL '%s'", url) retval |= 64 @@ 
-462,16 +476,15 @@ class InputManager(): key, sep, value = line.partition("=") if not sep: raise exception.InputFileError( - "Invalid KEY=VALUE pair '%s' on line %s in %s", - line, n+1, path) + f"Invalid KEY=VALUE pair '{line}' " + f"on line {n+1} in {path}") try: value = util.json_loads(value.strip()) except ValueError as exc: self.log.debug("%s: %s", exc.__class__.__name__, exc) raise exception.InputFileError( - "Unable to parse '%s' on line %s in %s", - value, n+1, path) + f"Unable to parse '{value}' on line {n+1} in {path}") key = key.strip().split(".") conf.append((key[:-1], key[-1], value)) @@ -480,8 +493,7 @@ class InputManager(): # url if " #" in line or "\t#" in line: if strip_comment is None: - import re - strip_comment = re.compile(r"\s+#.*").sub + strip_comment = util.re(r"\s+#.*").sub line = strip_comment("", line) if gconf or lconf: url = ExtendedUrl(line, gconf, lconf) @@ -536,13 +548,11 @@ class InputManager(): "Unable to update '%s' (%s: %s)", path, exc.__class__.__name__, exc) - @staticmethod - def _action_comment(lines, indicies): + def _action_comment(self, lines, indicies): for i in indicies: lines[i] = "# " + lines[i] - @staticmethod - def _action_delete(lines, indicies): + def _action_delete(self, lines, indicies): for i in indicies: lines[i] = "" diff --git a/gallery_dl/actions.py b/gallery_dl/actions.py index 668032d..971c4d9 100644 --- a/gallery_dl/actions.py +++ b/gallery_dl/actions.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2023 Mike Fährmann +# Copyright 2023-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,7 +8,6 @@ """ """ -import re import time import logging import operator @@ -16,7 +15,7 @@ import functools from . 
import util, exception -def parse(actionspec): +def parse_logging(actionspec): if isinstance(actionspec, dict): actionspec = actionspec.items() @@ -32,7 +31,7 @@ def parse(actionspec): for event, spec in actionspec: level, _, pattern = event.partition(":") - search = re.compile(pattern).search if pattern else util.true + search = util.re(pattern).search if pattern else util.true if isinstance(spec, str): type, _, args = spec.partition(" ") @@ -74,6 +73,41 @@ def parse(actionspec): return actions +def parse_signals(actionspec): + import signal + + if isinstance(actionspec, dict): + actionspec = actionspec.items() + + for signal_name, spec in actionspec: + signal_num = getattr(signal, signal_name, None) + if signal_num is None: + log = logging.getLogger("gallery-dl") + log.warning("signal '%s' is not defined", signal_name) + continue + + if isinstance(spec, str): + type, _, args = spec.partition(" ") + before, after = ACTIONS[type](args) + action = before if after is None else after + else: + actions_before = [] + actions_after = [] + for s in spec: + type, _, args = s.partition(" ") + before, after = ACTIONS[type](args) + if before is not None: + actions_before.append(before) + if after is not None: + actions_after.append(after) + + actions = actions_before + actions.extend(actions_after) + action = _chain_actions(actions) + + signal.signal(signal_num, signals_handler(action)) + + class LoggerAdapter(): def __init__(self, logger, job): @@ -129,6 +163,12 @@ def _chain_actions(actions): return _chain +def signals_handler(action, args={}): + def handler(signal_num, frame): + action(args) + return handler + + # -------------------------------------------------------------------- def action_print(opts): @@ -138,7 +178,7 @@ def action_print(opts): def action_status(opts): - op, value = re.match(r"\s*([&|^=])=?\s*(\d+)", opts).groups() + op, value = util.re(r"\s*([&|^=])=?\s*(\d+)").match(opts).groups() op = { "&": operator.and_, @@ -181,6 +221,36 @@ def action_wait(opts): return None, _wait +def action_flag(opts): + flag, value = util.re( + r"(?i)(file|post|child|download)(?:\s*[= ]\s*(.+))?" 
+ ).match(opts).groups() + flag = flag.upper() + value = "stop" if value is None else value.lower() + + def _flag(args): + util.FLAGS.__dict__[flag] = value + return _flag, None + + +def action_raise(opts): + name, _, arg = opts.partition(" ") + + exc = getattr(exception, name, None) + if exc is None: + import builtins + exc = getattr(builtins, name, Exception) + + if arg: + def _raise(args): + raise exc(arg) + else: + def _raise(args): + raise exc() + + return None, _raise + + def action_abort(opts): return None, util.raises(exception.StopExtraction) @@ -208,8 +278,10 @@ ACTIONS = { "abort" : action_abort, "exec" : action_exec, "exit" : action_exit, + "flag" : action_flag, "level" : action_level, "print" : action_print, + "raise" : action_raise, "restart" : action_restart, "status" : action_status, "terminate": action_terminate, diff --git a/gallery_dl/aes.py b/gallery_dl/aes.py index 3fd1d5e..c671de9 100644 --- a/gallery_dl/aes.py +++ b/gallery_dl/aes.py @@ -58,7 +58,7 @@ bytes_to_intlist = list def intlist_to_bytes(xs): if not xs: return b"" - return struct.pack("%dB" % len(xs), *xs) + return struct.pack(f"{len(xs)}B", *xs) def unpad_pkcs7(data): @@ -615,7 +615,7 @@ def block_product(block_x, block_y): if len(block_x) != BLOCK_SIZE_BYTES or len(block_y) != BLOCK_SIZE_BYTES: raise ValueError( - "Length of blocks need to be %d bytes" % BLOCK_SIZE_BYTES) + f"Length of blocks need to be {BLOCK_SIZE_BYTES} bytes") block_r = [0xE1] + [0] * (BLOCK_SIZE_BYTES - 1) block_v = block_y[:] @@ -639,7 +639,7 @@ def ghash(subkey, data): if len(data) % BLOCK_SIZE_BYTES: raise ValueError( - "Length of data should be %d bytes" % BLOCK_SIZE_BYTES) + f"Length of data should be {BLOCK_SIZE_BYTES} bytes") last_y = [0] * BLOCK_SIZE_BYTES for i in range(0, len(data), BLOCK_SIZE_BYTES): diff --git a/gallery_dl/cache.py b/gallery_dl/cache.py index 923ed32..b04552e 100644 --- a/gallery_dl/cache.py +++ b/gallery_dl/cache.py @@ -73,7 +73,7 @@ class DatabaseCacheDecorator(): _init = True def __init__(self, func, keyarg, maxage): - self.key = "%s.%s" % (func.__module__, func.__name__) + self.key = f"{func.__module__}.{func.__name__}" self.func = func self.cache = {} self.keyarg = keyarg @@ -95,7 +95,7 @@ class DatabaseCacheDecorator(): pass # database lookup - fullkey = "%s-%s" % (self.key, key) + fullkey = f"{self.key}-{key}" with self.database() as db: cursor = db.cursor() try: @@ -128,7 +128,7 @@ class DatabaseCacheDecorator(): with self.database() as db: db.execute( "INSERT OR REPLACE INTO data VALUES (?,?,?)", - ("%s-%s" % (self.key, key), pickle.dumps(value), expires), + (f"{self.key}-{key}", pickle.dumps(value), expires), ) def invalidate(self, key): @@ -139,7 +139,7 @@ class DatabaseCacheDecorator(): with self.database() as db: db.execute( "DELETE FROM data WHERE key=?", - ("%s-%s" % (self.key, key),), + (f"{self.key}-{key}",), ) def database(self): diff --git a/gallery_dl/config.py b/gallery_dl/config.py index 92e55d3..1873634 100644 --- a/gallery_dl/config.py +++ b/gallery_dl/config.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2023 Mike Fährmann +# Copyright 2015-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -103,14 +103,12 @@ def open_extern(): openers = ("explorer", "notepad") else: openers = ("xdg-open", "open") - editor = os.environ.get("EDITOR") - if editor: + if editor := os.environ.get("EDITOR"): openers = (editor,) + openers import shutil for opener in openers: - opener 
= shutil.which(opener) - if opener: + if opener := shutil.which(opener): break else: log.warning("Unable to find a program to open '%s' with", path) @@ -155,13 +153,38 @@ def status(): paths.append((path, status)) - fmt = "{{:<{}}} : {{}}\n".format( - max(len(p[0]) for p in paths)).format + fmt = f"{{:<{max(len(p[0]) for p in paths)}}} : {{}}\n".format for path, status in paths: stdout_write(fmt(path, status)) +def remap_categories(): + opts = _config.get("extractor") + if not opts: + return + + cmap = opts.get("config-map") + if cmap is None: + cmap = ( + ("coomerparty" , "coomer"), + ("kemonoparty" , "kemono"), + ("koharu" , "schalenetwork"), + ("naver" , "naver-blog"), + ("chzzk" , "naver-chzzk"), + ("naverwebtoon", "naver-webtoon"), + ("pixiv" , "pixiv-novel"), + ) + elif not cmap: + return + elif isinstance(cmap, dict): + cmap = cmap.items() + + for old, new in cmap: + if old in opts and new not in opts: + opts[new] = opts[old] + + def load(files=None, strict=False, loads=util.json_loads): """Load JSON configuration files""" for pathfmt in files or _default_configs: @@ -186,8 +209,7 @@ def load(files=None, strict=False, loads=util.json_loads): _files.append(pathfmt) if "subconfigs" in conf: - subconfigs = conf["subconfigs"] - if subconfigs: + if subconfigs := conf["subconfigs"]: if isinstance(subconfigs, str): subconfigs = (subconfigs,) load(subconfigs, strict, loads) @@ -259,8 +281,7 @@ def accumulate(path, key, conf=_config): result = [] try: if key in conf: - value = conf[key] - if value: + if value := conf[key]: if isinstance(value, list): result.extend(value) else: @@ -268,8 +289,7 @@ def accumulate(path, key, conf=_config): for p in path: conf = conf[p] if key in conf: - value = conf[key] - if value: + if value := conf[key]: if isinstance(value, list): result[:0] = value else: diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py index f03ad58..5d6c3d7 100644 --- a/gallery_dl/cookies.py +++ b/gallery_dl/cookies.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2022-2023 Mike Fährmann +# Copyright 2022-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -26,7 +26,7 @@ from . import aes, text, util SUPPORTED_BROWSERS_CHROMIUM = { "brave", "chrome", "chromium", "edge", "opera", "thorium", "vivaldi"} -SUPPORTED_BROWSERS_FIREFOX = {"firefox", "zen"} +SUPPORTED_BROWSERS_FIREFOX = {"firefox", "librewolf", "zen"} SUPPORTED_BROWSERS = \ SUPPORTED_BROWSERS_CHROMIUM | SUPPORTED_BROWSERS_FIREFOX | {"safari"} @@ -43,7 +43,7 @@ def load_cookies(browser_specification): elif browser_name in SUPPORTED_BROWSERS_CHROMIUM: return load_cookies_chromium(browser_name, profile, keyring, domain) else: - raise ValueError("unknown browser '{}'".format(browser_name)) + raise ValueError(f"unknown browser '{browser_name}'") def load_cookies_firefox(browser_name, profile=None, @@ -59,7 +59,7 @@ def load_cookies_firefox(browser_name, profile=None, if container_id is False: conditions.append("NOT INSTR(originAttributes,'userContextId=')") elif container_id: - uid = "%userContextId={}".format(container_id) + uid = f"%userContextId={container_id}" conditions.append("originAttributes LIKE ? OR originAttributes LIKE ?") parameters += (uid, uid + "&%") @@ -72,7 +72,7 @@ def load_cookies_firefox(browser_name, profile=None, parameters += (domain, "." 
+ domain) if conditions: - sql = "{} WHERE ( {} )".format(sql, " ) AND ( ".join(conditions)) + sql = f"{sql} WHERE ( {' ) AND ( '.join(conditions)} )" with DatabaseConnection(path) as db: cookies = [ @@ -186,7 +186,7 @@ def load_cookies_chromium(browser_name, profile=None, )) if failed_cookies > 0: - failed_message = " ({} could not be decrypted)".format(failed_cookies) + failed_message = f" ({failed_cookies} could not be decrypted)" else: failed_message = "" @@ -212,8 +212,9 @@ def _firefox_cookies_database(browser_name, profile=None, container=None): path = _find_most_recently_used_file(search_root, "cookies.sqlite") if path is None: - raise FileNotFoundError("Unable to find Firefox cookies database in " - "{}".format(search_root)) + raise FileNotFoundError(f"Unable to find {browser_name.capitalize()} " + f"cookies database in {search_root}") + _log_debug("Extracting cookies from %s", path) if not container or container == "none": @@ -243,8 +244,7 @@ def _firefox_cookies_database(browser_name, profile=None, container=None): container_id = context["userContextId"] break else: - raise ValueError("Unable to find Firefox container '{}'".format( - container)) + raise ValueError(f"Unable to find Firefox container '{container}'") _log_debug("Only loading cookies from container '%s' (ID %s)", container, container_id) @@ -257,20 +257,23 @@ def _firefox_browser_directory(browser_name): if sys.platform in ("win32", "cygwin"): appdata = os.path.expandvars("%APPDATA%") return { - "firefox": join(appdata, R"Mozilla\Firefox\Profiles"), - "zen" : join(appdata, R"zen\Profiles") + "firefox" : join(appdata, R"Mozilla\Firefox\Profiles"), + "librewolf": join(appdata, R"librewolf\Profiles"), + "zen" : join(appdata, R"zen\Profiles"), }[browser_name] elif sys.platform == "darwin": appdata = os.path.expanduser("~/Library/Application Support") return { - "firefox": join(appdata, R"Firefox/Profiles"), - "zen" : join(appdata, R"zen/Profiles") + "firefox" : join(appdata, R"Firefox/Profiles"), + "librewolf": join(appdata, R"librewolf/Profiles"), + "zen" : join(appdata, R"zen/Profiles"), }[browser_name] else: home = os.path.expanduser("~") return { - "firefox": join(home, R".mozilla/firefox"), - "zen" : join(home, R".zen") + "firefox" : join(home, R".mozilla/firefox"), + "librewolf": join(home, R".librewolf"), + "zen" : join(home, R".zen"), }[browser_name] @@ -386,8 +389,8 @@ def _chromium_cookies_database(profile, config): path = _find_most_recently_used_file(search_root, "Cookies") if path is None: - raise FileNotFoundError("Unable to find {} cookies database in " - "'{}'".format(config["browser"], search_root)) + raise FileNotFoundError(f"Unable to find {config['browser']} cookies " + f"database in '{search_root}'") return path @@ -519,8 +522,7 @@ class LinuxChromiumCookieDecryptor(ChromiumCookieDecryptor): self._cookie_counts = {"v10": 0, "v11": 0, "other": 0} self._offset = (32 if meta_version >= 24 else 0) - @staticmethod - def derive_key(password): + def derive_key(self, password): # values from # https://chromium.googlesource.com/chromium/src/+/refs/heads # /main/components/os_crypt/os_crypt_linux.cc @@ -564,8 +566,7 @@ class MacChromiumCookieDecryptor(ChromiumCookieDecryptor): self._cookie_counts = {"v10": 0, "other": 0} self._offset = (32 if meta_version >= 24 else 0) - @staticmethod - def derive_key(password): + def derive_key(self, password): # values from # https://chromium.googlesource.com/chromium/src/+/refs/heads # /main/components/os_crypt/os_crypt_mac.mm @@ -713,9 +714,9 @@ def 
_get_kwallet_password(browser_keyring_name): ) if proc.returncode != 0: - _log_error("kwallet-query failed with return code {}. " - "Please consult the kwallet-query man page " - "for details".format(proc.returncode)) + _log_error(f"kwallet-query failed with return code " + f"{proc.returncode}. Please consult the kwallet-query " + f"man page for details") return b"" if stdout.lower().startswith(b"failed to read"): @@ -844,7 +845,7 @@ class DataParser: def read_bytes(self, num_bytes): if num_bytes < 0: - raise ParserError("invalid read of {} bytes".format(num_bytes)) + raise ParserError(f"invalid read of {num_bytes} bytes") end = self.cursor + num_bytes if end > len(self._data): raise ParserError("reached end of input") @@ -855,8 +856,8 @@ class DataParser: def expect_bytes(self, expected_value, message): value = self.read_bytes(len(expected_value)) if value != expected_value: - raise ParserError("unexpected value: {} != {} ({})".format( - value, expected_value, message)) + raise ParserError(f"unexpected value: {value} != {expected_value} " + f"({message})") def read_uint(self, big_endian=False): data_format = ">I" if big_endian else " 0: - _log_debug("Skipping {} bytes ({}): {!r}".format( - num_bytes, description, self.read_bytes(num_bytes))) + _log_debug(f"Skipping {num_bytes} bytes ({description}): " + f"{self.read_bytes(num_bytes)!r}") elif num_bytes < 0: - raise ParserError("Invalid skip of {} bytes".format(num_bytes)) + raise ParserError(f"Invalid skip of {num_bytes} bytes") def skip_to(self, offset, description="unknown"): self.skip(offset - self.cursor, description) @@ -903,7 +904,7 @@ class DatabaseConnection(): if util.WINDOWS: path = "/" + os.path.abspath(path) - uri = "file:{}?mode=ro&immutable=1".format(path) + uri = f"file:{path}?mode=ro&immutable=1" self.database = sqlite3.connect( uri, uri=True, isolation_level=None, check_same_thread=False) return self.database @@ -1101,9 +1102,9 @@ def _parse_browser_specification( browser, profile=None, keyring=None, container=None, domain=None): browser = browser.lower() if browser not in SUPPORTED_BROWSERS: - raise ValueError("Unsupported browser '{}'".format(browser)) + raise ValueError(f"Unsupported browser '{browser}'") if keyring and keyring not in SUPPORTED_KEYRINGS: - raise ValueError("Unsupported keyring '{}'".format(keyring)) + raise ValueError(f"Unsupported keyring '{keyring}'") if profile and _is_path(profile): profile = os.path.expanduser(profile) return browser, profile, keyring, container, domain diff --git a/gallery_dl/downloader/common.py b/gallery_dl/downloader/common.py index dc1219f..7cd8d10 100644 --- a/gallery_dl/downloader/common.py +++ b/gallery_dl/downloader/common.py @@ -21,8 +21,7 @@ class DownloaderBase(): extractor = job.extractor self.log = job.get_logger("downloader." 
+ self.scheme) - opts = self._extractor_config(extractor) - if opts: + if opts := self._extractor_config(extractor): self.opts = opts self.config = self.config_opts @@ -60,8 +59,7 @@ class DownloaderBase(): opts = {} for cat, sub in reversed(path): - popts = self._extractor_opts(cat, sub) - if popts: + if popts := self._extractor_opts(cat, sub): opts.update(popts) return opts @@ -70,12 +68,10 @@ class DownloaderBase(): if not cfg: return None - copts = cfg.get(self.scheme) - if copts: + if copts := cfg.get(self.scheme): if subcategory in cfg: try: - sopts = cfg[subcategory].get(self.scheme) - if sopts: + if sopts := cfg[subcategory].get(self.scheme): opts = copts.copy() opts.update(sopts) return opts diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index c58e2fb..4595483 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2023 Mike Fährmann +# Copyright 2014-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -12,8 +12,9 @@ import time import mimetypes from requests.exceptions import RequestException, ConnectionError, Timeout from .common import DownloaderBase -from .. import text, util, output +from .. import text, util, output, exception from ssl import SSLError +FLAGS = util.FLAGS class HttpDownloader(DownloaderBase): @@ -29,6 +30,7 @@ class HttpDownloader(DownloaderBase): self.metadata = extractor.config("http-metadata") self.progress = self.config("progress", 3.0) self.validate = self.config("validate", True) + self.validate_html = self.config("validate-html", True) self.headers = self.config("headers") self.minsize = self.config("filesize-min") self.maxsize = self.config("filesize-max") @@ -68,14 +70,16 @@ class HttpDownloader(DownloaderBase): chunk_size = 32768 self.chunk_size = chunk_size if self.rate: - rate = text.parse_bytes(self.rate) - if rate: - if rate < self.chunk_size: - self.chunk_size = rate - self.rate = rate + func = util.build_selection_func(self.rate, 0, text.parse_bytes) + if rmax := func.args[1] if hasattr(func, "args") else func(): + if rmax < self.chunk_size: + # reduce chunk_size to allow for one iteration each second + self.chunk_size = rmax + self.rate = func self.receive = self._receive_rate else: self.log.warning("Invalid rate limit (%r)", self.rate) + self.rate = False if self.progress is not None: self.receive = self._receive_rate if self.progress < 0.0: @@ -88,8 +92,10 @@ class HttpDownloader(DownloaderBase): def download(self, url, pathfmt): try: return self._download_impl(url, pathfmt) - except Exception: - output.stderr_write("\n") + except Exception as exc: + if self.downloading: + output.stderr_write("\n") + self.log.debug("", exc_info=exc) raise finally: # remove file from incomplete downloads @@ -134,16 +140,14 @@ class HttpDownloader(DownloaderBase): # collect HTTP headers headers = {"Accept": "*/*"} # file-specific headers - extra = kwdict.get("_http_headers") - if extra: + if extra := kwdict.get("_http_headers"): headers.update(extra) # general headers if self.headers: headers.update(self.headers) # partial content - file_size = pathfmt.part_size() - if file_size: - headers["Range"] = "bytes={}-".format(file_size) + if file_size := pathfmt.part_size(): + headers["Range"] = f"bytes={file_size}-" # connect to (remote) source try: @@ -161,7 +165,7 @@ class HttpDownloader(DownloaderBase): reason = exc.args[0].reason cls = 
reason.__class__.__name__ pre, _, err = str(reason.args[-1]).partition(":") - msg = "{}: {}".format(cls, (err or pre).lstrip()) + msg = f"{cls}: {(err or pre).lstrip()}" except Exception: msg = str(exc) continue @@ -183,7 +187,7 @@ class HttpDownloader(DownloaderBase): elif code == 416 and file_size: # Requested Range Not Satisfiable break else: - msg = "'{} {}' for '{}'".format(code, response.reason, url) + msg = f"'{code} {response.reason}' for '{url}'" challenge = util.detect_challenge(response) if challenge is not None: @@ -199,8 +203,8 @@ class HttpDownloader(DownloaderBase): return False # check for invalid responses - validate = kwdict.get("_http_validate") - if validate and self.validate: + if self.validate and \ + (validate := kwdict.get("_http_validate")) is not None: try: result = validate(response) except Exception: @@ -214,6 +218,14 @@ class HttpDownloader(DownloaderBase): self.release_conn(response) self.log.warning("Invalid response") return False + if self.validate_html and response.headers.get( + "content-type", "").startswith("text/html") and \ + pathfmt.extension not in ("html", "htm"): + if response.history: + self.log.warning("HTTP redirect to '%s'", response.url) + else: + self.log.warning("HTML response") + return False # check file size size = text.parse_int(size, None) @@ -265,19 +277,28 @@ class HttpDownloader(DownloaderBase): content = response.iter_content(self.chunk_size) + validate_sig = kwdict.get("_http_signature") + validate_ext = (adjust_extension and + pathfmt.extension in SIGNATURE_CHECKS) + # check filename extension against file header - if adjust_extension and not offset and \ - pathfmt.extension in SIGNATURE_CHECKS: + if not offset and (validate_ext or validate_sig): try: file_header = next( content if response.raw.chunked else response.iter_content(16), b"") except (RequestException, SSLError) as exc: msg = str(exc) - output.stderr_write("\n") continue - if self._adjust_extension(pathfmt, file_header) and \ - pathfmt.exists(): + if validate_sig: + result = validate_sig(file_header) + if result is not True: + self.release_conn(response) + self.log.warning( + result or "Invalid file signature bytes") + return False + if validate_ext and self._adjust_extension( + pathfmt, file_header) and pathfmt.exists(): pathfmt.temppath = "" response.close() return True @@ -294,6 +315,9 @@ class HttpDownloader(DownloaderBase): # download content self.downloading = True with pathfmt.open(mode) as fp: + if fp is None: + # '.part' file no longer exists + break if file_header: fp.write(file_header) offset += len(file_header) @@ -310,11 +334,16 @@ class HttpDownloader(DownloaderBase): msg = str(exc) output.stderr_write("\n") continue + except exception.StopExtraction: + response.close() + return False + except exception.ControlException: + response.close() + raise # check file size if size and fp.tell() < size: - msg = "file size mismatch ({} < {})".format( - fp.tell(), size) + msg = f"file size mismatch ({fp.tell()} < {size})" output.stderr_write("\n") continue @@ -323,11 +352,11 @@ class HttpDownloader(DownloaderBase): self.downloading = False if self.mtime: if "_http_lastmodified" in kwdict: - kwdict["_mtime"] = kwdict["_http_lastmodified"] + kwdict["_mtime_http"] = kwdict["_http_lastmodified"] else: - kwdict["_mtime"] = response.headers.get("Last-Modified") + kwdict["_mtime_http"] = response.headers.get("Last-Modified") else: - kwdict["_mtime"] = None + kwdict["_mtime_http"] = None return True @@ -343,14 +372,16 @@ class HttpDownloader(DownloaderBase): "closing the 
connection anyway", exc.__class__.__name__, exc) response.close() - @staticmethod - def receive(fp, content, bytes_total, bytes_start): + def receive(self, fp, content, bytes_total, bytes_start): write = fp.write for data in content: write(data) + if FLAGS.DOWNLOAD is not None: + FLAGS.process("DOWNLOAD") + def _receive_rate(self, fp, content, bytes_total, bytes_start): - rate = self.rate + rate = self.rate() if self.rate else None write = fp.write progress = self.progress @@ -363,6 +394,9 @@ class HttpDownloader(DownloaderBase): write(data) + if FLAGS.DOWNLOAD is not None: + FLAGS.process("DOWNLOAD") + if progress is not None: if time_elapsed > progress: self.out.progress( @@ -371,7 +405,7 @@ class HttpDownloader(DownloaderBase): int(bytes_downloaded / time_elapsed), ) - if rate: + if rate is not None: time_expected = bytes_downloaded / rate if time_expected > time_elapsed: time.sleep(time_expected - time_elapsed) @@ -387,15 +421,13 @@ class HttpDownloader(DownloaderBase): if mtype in MIME_TYPES: return MIME_TYPES[mtype] - ext = mimetypes.guess_extension(mtype, strict=False) - if ext: + if ext := mimetypes.guess_extension(mtype, strict=False): return ext[1:] self.log.warning("Unknown MIME type '%s'", mtype) return "bin" - @staticmethod - def _adjust_extension(pathfmt, file_header): + def _adjust_extension(self, pathfmt, file_header): """Check filename extension against file header""" if not SIGNATURE_CHECKS[pathfmt.extension](file_header): for ext, check in SIGNATURE_CHECKS.items(): @@ -452,12 +484,20 @@ MIME_TYPES = { "application/x-pdf": "pdf", "application/x-shockwave-flash": "swf", + "text/html": "html", + "application/ogg": "ogg", # https://www.iana.org/assignments/media-types/model/obj "model/obj": "obj", "application/octet-stream": "bin", } + +def _signature_html(s): + s = s[:14].lstrip() + return s and b"' , '<'), ("name", 'class="cnm">' , '<'), @@ -85,8 +84,7 @@ class _2chanThreadExtractor(Extractor): ("com" , '>', ''), ))[0] - @staticmethod - def _extract_image(post, data): + def _extract_image(self, post, data): text.extract_all(post, ( (None , '_blank', ''), ("filename", '>', '<'), diff --git a/gallery_dl/extractor/2chen.py b/gallery_dl/extractor/2chen.py index 0c97889..ee3510c 100644 --- a/gallery_dl/extractor/2chen.py +++ b/gallery_dl/extractor/2chen.py @@ -28,7 +28,7 @@ class _2chenThreadExtractor(Extractor): self.board, self.thread = match.groups() def items(self): - url = "{}/{}/{}".format(self.root, self.board, self.thread) + url = f"{self.root}/{self.board}/{self.thread}" page = self.request(url, encoding="utf-8", notfound="thread").text data = self.metadata(page) yield Message.Directory, data @@ -86,10 +86,10 @@ class _2chenBoardExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.board = match.group(1) + self.board = match[1] def items(self): - url = "{}/{}/catalog".format(self.root, self.board) + url = f"{self.root}/{self.board}/catalog" page = self.request(url, notfound="board").text data = {"_extractor": _2chenThreadExtractor} for thread in text.extract_iter( diff --git a/gallery_dl/extractor/35photo.py b/gallery_dl/extractor/35photo.py index 773116e..ec5f0cb 100644 --- a/gallery_dl/extractor/35photo.py +++ b/gallery_dl/extractor/35photo.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2023 Mike Fährmann +# Copyright 2019-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -49,14 +49,14 @@ class 
_35photoExtractor(Extractor): if extra_ids: yield from extra_ids while params["lastId"]: - data = self.request(url, headers=headers, params=params).json() + data = self.request_json(url, headers=headers, params=params) yield from self._photo_ids(data["data"]) params["lastId"] = data["lastId"] def _photo_data(self, photo_id): params = {"method": "photo.getData", "photoId": photo_id} - data = self.request( - "https://api.35photo.pro/", params=params).json()["data"][photo_id] + data = self.request_json( + "https://api.35photo.pro/", params=params)["data"][photo_id] info = { "url" : data["src"], "id" : data["photo_id"], @@ -83,8 +83,7 @@ class _35photoExtractor(Extractor): info["num"] = 1 yield info - @staticmethod - def _photo_ids(page): + def _photo_ids(self, page): """Extract unique photo IDs and return them as sorted list""" # searching for photo-id="..." doesn't always work (see unit tests) if not page: @@ -105,11 +104,11 @@ class _35photoUserExtractor(_35photoExtractor): def __init__(self, match): _35photoExtractor.__init__(self, match) - self.user = match.group(1) + self.user = match[1] self.user_id = 0 def metadata(self): - url = "{}/{}/".format(self.root, self.user) + url = f"{self.root}/{self.user}/" page = self.request(url).text self.user_id = text.parse_int(text.extr(page, "/user_", ".xml")) return { @@ -134,7 +133,7 @@ class _35photoTagExtractor(_35photoExtractor): def __init__(self, match): _35photoExtractor.__init__(self, match) - self.tag = match.group(1) + self.tag = match[1] def metadata(self): return {"search_tag": text.unquote(self.tag).lower()} @@ -143,7 +142,7 @@ class _35photoTagExtractor(_35photoExtractor): num = 1 while True: - url = "{}/tags/{}/list_{}/".format(self.root, self.tag, num) + url = f"{self.root}/tags/{self.tag}/list_{num}/" page = self.request(url).text prev = None @@ -171,7 +170,7 @@ class _35photoGenreExtractor(_35photoExtractor): self.photo_ids = None def metadata(self): - url = "{}/genre_{}{}".format(self.root, self.genre_id, self.new or "/") + url = f"{self.root}/genre_{self.genre_id}{self.new or '/'}" page = self.request(url).text self.photo_ids = self._photo_ids(text.extr( page, ' class="photo', '\n')) @@ -199,7 +198,7 @@ class _35photoImageExtractor(_35photoExtractor): def __init__(self, match): _35photoExtractor.__init__(self, match) - self.photo_id = match.group(1) + self.photo_id = match[1] def photos(self): return (self.photo_id,) diff --git a/gallery_dl/extractor/4archive.py b/gallery_dl/extractor/4archive.py index d198369..c9be2a4 100644 --- a/gallery_dl/extractor/4archive.py +++ b/gallery_dl/extractor/4archive.py @@ -27,8 +27,7 @@ class _4archiveThreadExtractor(Extractor): self.board, self.thread = match.groups() def items(self): - url = "{}/board/{}/thread/{}".format( - self.root, self.board, self.thread) + url = f"{self.root}/board/{self.board}/thread/{self.thread}" page = self.request(url).text data = self.metadata(page) posts = self.posts(page) @@ -58,15 +57,14 @@ class _4archiveThreadExtractor(Extractor): for post in page.split('class="postContainer')[1:] ] - @staticmethod - def parse(post): + def parse(self, post): extr = text.extract_from(post) data = { "name": extr('class="name">', ""), "date": text.parse_datetime( extr('class="dateTime postNum" >', "<").strip(), "%Y-%m-%d %H:%M:%S"), - "no" : text.parse_int(extr('href="#p', '"')), + "no" : text.parse_int(extr(">Post No.", "<")), } if 'class="file"' in post: extr('class="fileText"', ">File: ")[2]), } - @staticmethod - def _extract_file(html, post): + def _extract_file(self, html, 
post): extr = text.extract_from(html, html.index(">File: <")) post["url"] = extr('href="', '"') post["filename"] = text.unquote(extr(">", "<").rpartition(".")[0]) @@ -106,7 +103,7 @@ class _4chanarchivesBoardExtractor(Extractor): data["pageCount"]: return - url = "{}/{}/{}.json".format(self.root, board, pnum) - threads = self.request(url).json()["threads"] + url = f"{self.root}/{board}/{pnum}.json" + threads = self.request_json(url)["threads"] diff --git a/gallery_dl/extractor/8muses.py b/gallery_dl/extractor/8muses.py index 68b906e..120cd8a 100644 --- a/gallery_dl/extractor/8muses.py +++ b/gallery_dl/extractor/8muses.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2023 Mike Fährmann +# Copyright 2019-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -26,8 +26,8 @@ class _8musesAlbumExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.path = match.group(1) - self.params = match.group(2) or "" + self.path = match[1] + self.params = match[2] or "" def items(self): url = self.root + self.path + self.params @@ -37,8 +37,7 @@ class _8musesAlbumExtractor(Extractor): self.request(url).text, 'id="ractive-public" type="text/plain">', '')) - images = data.get("pictures") - if images: + if images := data.get("pictures"): count = len(images) album = self._make_album(data["album"]) yield Message.Directory, {"album": album, "count": count} @@ -54,8 +53,7 @@ class _8musesAlbumExtractor(Extractor): } yield Message.Url, url, img - albums = data.get("albums") - if albums: + if albums := data.get("albums"): for album in albums: permalink = album.get("permalink") if not permalink: @@ -74,8 +72,7 @@ class _8musesAlbumExtractor(Extractor): return path, _, num = self.path.rstrip("/").rpartition("/") path = path if num.isdecimal() else self.path - url = "{}{}/{}{}".format( - self.root, path, data["page"] + 1, self.params) + url = f"{self.root}{path}/{data['page'] + 1}{self.params}" def _make_album(self, album): return { @@ -92,8 +89,7 @@ class _8musesAlbumExtractor(Extractor): album["updatedAt"], "%Y-%m-%dT%H:%M:%S.%fZ"), } - @staticmethod - def _unobfuscate(data): + def _unobfuscate(self, data): return util.json_loads("".join([ chr(33 + (ord(c) + 14) % 94) if "!" <= c <= "~" else c for c in text.unescape(data.strip("\t\n\r !")) diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 2da471e..688f0a0 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -1,13 +1,13 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2023 Mike Fährmann +# Copyright 2015-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. 
import sys -from ..util import re_compile +from ..text import re_compile modules = [ "2ch", @@ -40,9 +40,11 @@ modules = [ "chevereto", "cien", "civitai", + "comick", "comicvine", "cyberdrop", "danbooru", + "dankefuerslesen", "desktopography", "deviantart", "discord", @@ -63,6 +65,8 @@ modules = [ "gelbooru", "gelbooru_v01", "gelbooru_v02", + "girlsreleased", + "girlswithmuscle", "gofile", "hatenablog", "hentai2read", @@ -88,13 +92,14 @@ modules = [ "issuu", "itaku", "itchio", + "iwara", "jschan", "kabeuchi", "keenspot", - "kemonoparty", + "kemono", "khinsider", - "koharu", "komikcast", + "leakgallery", "lensdump", "lexica", "lightroom", @@ -102,19 +107,20 @@ modules = [ "lofter", "luscious", "lynxchan", + "madokami", "mangadex", "mangafox", "mangahere", "manganelo", "mangapark", "mangaread", - "mangasee", "mangoxo", "misskey", "motherless", "myhentaigallery", "myportfolio", - "naver", + "naverblog", + "naverchzzk", "naverwebtoon", "nekohouse", "newgrounds", @@ -123,6 +129,7 @@ modules = [ "nitter", "nozomi", "nsfwalbum", + "nudostar", "paheal", "patreon", "pexels", @@ -142,9 +149,11 @@ modules = [ "pornhub", "pornpics", "postmill", + "rawkuma", "reactor", "readcomiconline", "realbooru", + "redbust", "reddit", "redgifs", "rule34us", @@ -153,6 +162,7 @@ modules = [ "saint", "sankaku", "sankakucomplex", + "schalenetwork", "scrolller", "seiga", "senmanga", @@ -226,8 +236,7 @@ modules = [ def find(url): """Find a suitable extractor for the given URL""" for cls in _list_classes(): - match = cls.pattern.match(url) - if match: + if match := cls.pattern.match(url): return cls(match) return None @@ -242,8 +251,7 @@ def add(cls): def add_module(module): """Add all extractors in 'module' to the list of available extractors""" - classes = _get_classes(module) - if classes: + if classes := _get_classes(module): if isinstance(classes[0].pattern, str): for cls in classes: cls.pattern = re_compile(cls.pattern) diff --git a/gallery_dl/extractor/adultempire.py b/gallery_dl/extractor/adultempire.py index c891b17..3249ae6 100644 --- a/gallery_dl/extractor/adultempire.py +++ b/gallery_dl/extractor/adultempire.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2023 Mike Fährmann +# Copyright 2019-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -22,7 +22,7 @@ class AdultempireGalleryExtractor(GalleryExtractor): def __init__(self, match): GalleryExtractor.__init__(self, match) - self.gallery_id = match.group(2) + self.gallery_id = match[2] def _init(self): self.cookies.set("ageConfirmed", "true", domain="www.adultempire.com") @@ -48,4 +48,4 @@ class AdultempireGalleryExtractor(GalleryExtractor): if len(urls) < 24: return params["page"] += 1 - page = self.request(self.gallery_url, params=params).text + page = self.request(self.page_url, params=params).text diff --git a/gallery_dl/extractor/agnph.py b/gallery_dl/extractor/agnph.py index 653b73f..5bb1835 100644 --- a/gallery_dl/extractor/agnph.py +++ b/gallery_dl/extractor/agnph.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2024 Mike Fährmann +# Copyright 2024-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,11 +9,8 @@ """Extractors for https://agn.ph/""" from . import booru -from .. import text - -from xml.etree import ElementTree +from .. 
import text, util import collections -import re BASE_PATTERN = r"(?:https?://)?agn\.ph" @@ -52,8 +49,7 @@ class AgnphExtractor(booru.BooruExtractor): params["page"] = self.page_start while True: - data = self.request(url, params=params).text - root = ElementTree.fromstring(data) + root = self.request_xml(url, params=params) yield from map(self._xml_to_dict, root) @@ -64,7 +60,7 @@ class AgnphExtractor(booru.BooruExtractor): params["page"] += 1 def _html(self, post): - url = "{}/gallery/post/show/{}/".format(self.root, post["id"]) + url = f"{self.root}/gallery/post/show/{post['id']}/" return self.request(url).text def _tags(self, post, page): @@ -74,7 +70,7 @@ class AgnphExtractor(booru.BooruExtractor): return tags = collections.defaultdict(list) - pattern = re.compile(r'class="(.)typetag">([^<]+)') + pattern = util.re(r'class="(.)typetag">([^<]+)') for tag_type, tag_name in pattern.findall(tag_container): tags[tag_type].append(text.unquote(tag_name).replace(" ", "_")) for key, value in tags.items(): @@ -107,7 +103,6 @@ class AgnphPostExtractor(AgnphExtractor): example = "https://agn.ph/gallery/post/show/12345/" def posts(self): - url = "{}/gallery/post/show/{}/?api=xml".format( - self.root, self.groups[0]) - post = ElementTree.fromstring(self.request(url).text) + url = f"{self.root}/gallery/post/show/{self.groups[0]}/?api=xml" + post = self.request_xml(url) return (self._xml_to_dict(post),) diff --git a/gallery_dl/extractor/ao3.py b/gallery_dl/extractor/ao3.py index d3ab846..2652acb 100644 --- a/gallery_dl/extractor/ao3.py +++ b/gallery_dl/extractor/ao3.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2024 Mike Fährmann +# Copyright 2024-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,7 +8,7 @@ """Extractors for https://archiveofourown.org/""" -from .common import Extractor, Message +from .common import Extractor, Message, Dispatch from .. import text, util, exception from ..cache import cache @@ -135,7 +135,7 @@ class Ao3WorkExtractor(Ao3Extractor): self.login() work_id = self.groups[0] - url = "{}/works/{}".format(self.root, work_id) + url = f"{self.root}/works/{work_id}" response = self.request(url, notfound="work") if response.url.endswith("/users/login?restricted=true"): @@ -144,7 +144,7 @@ class Ao3WorkExtractor(Ao3Extractor): page = response.text if len(page) < 20000 and \ '

Adult Content Warning]+)").findall( + post["content"]): if not self.emoticons and 'class="arca-emoticon"' in media: continue @@ -75,36 +74,37 @@ class ArcalivePostExtractor(ArcaliveExtractor): if not src: continue - src = text.unescape(src.partition("?")[0]) + src, _, query = text.unescape(src).partition("?") if src[0] == "/": if src[1] == "/": - url = "https:" + src + url = "https:" + src.replace( + "//ac-p.namu", "//ac-o.namu", 1) else: url = self.root + src else: url = src fallback = () - orig = text.extr(media, 'data-orig="', '"') - if orig: + query = f"?type=orig&{query}" + if orig := text.extr(media, 'data-orig="', '"'): path, _, ext = url.rpartition(".") if ext != orig: - fallback = (url + "?type=orig",) + fallback = (url + query,) url = path + "." + orig elif video and self.gifs: url_gif = url.rpartition(".")[0] + ".gif" if self.gifs_fallback: - fallback = (url + "?type=orig",) + fallback = (url + query,) url = url_gif else: response = self.request( - url_gif + "?type=orig", method="HEAD", fatal=False) + url_gif + query, method="HEAD", fatal=False) if response.status_code < 400: - fallback = (url + "?type=orig",) + fallback = (url + query,) url = url_gif files.append({ - "url" : url + "?type=orig", + "url" : url + query, "width" : text.parse_int(text.extr(media, 'width="', '"')), "height": text.parse_int(text.extr(media, 'height="', '"')), "_fallback": fallback, @@ -112,11 +112,6 @@ class ArcalivePostExtractor(ArcaliveExtractor): return files - def _extract_media(self, content): - ArcalivePostExtractor._extract_media = extr = re.compile( - r"<(?:img|vide(o)) ([^>]+)").findall - return extr(content) - class ArcaliveBoardExtractor(ArcaliveExtractor): """Extractor for an arca.live board's posts""" @@ -175,9 +170,8 @@ class ArcaliveAPI(): return data self.log.debug("Server response: %s", data) - msg = data.get("message") - raise exception.StopExtraction( - "API request failed%s", ": " + msg if msg else "") + msg = f": {msg}" if (msg := data.get("message")) else "" + raise exception.AbortExtraction(f"API request failed{msg}") def _pagination(self, endpoint, params, key): while True: diff --git a/gallery_dl/extractor/architizer.py b/gallery_dl/extractor/architizer.py index 911753b..e39d3d2 100644 --- a/gallery_dl/extractor/architizer.py +++ b/gallery_dl/extractor/architizer.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021-2023 Mike Fährmann +# Copyright 2021-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -24,7 +24,7 @@ class ArchitizerProjectExtractor(GalleryExtractor): example = "https://architizer.com/projects/NAME/" def __init__(self, match): - url = "{}/projects/{}/".format(self.root, match.group(1)) + url = f"{self.root}/projects/{match[1]}/" GalleryExtractor.__init__(self, match, url) def metadata(self, page): @@ -68,15 +68,14 @@ class ArchitizerFirmExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.firm = match.group(1) + self.firm = match[1] def items(self): - url = url = "{}/firms/{}/?requesting_merlin=pages".format( - self.root, self.firm) + url = url = f"{self.root}/firms/{self.firm}/?requesting_merlin=pages" page = self.request(url).text data = {"_extractor": ArchitizerProjectExtractor} for project in text.extract_iter(page, '= 400: @@ -141,9 +140,9 @@ class AryionExtractor(Extractor): # fix 'Last-Modified' header lmod = headers["last-modified"] if lmod[22] != ":": - lmod = "{}:{} GMT".format(lmod[:22], 
lmod[22:24]) + lmod = f"{lmod[:22]}:{lmod[22:24]} GMT" - post_url = "{}/g4/view/{}".format(self.root, post_id) + post_url = f"{self.root}/g4/view/{post_id}" extr = text.extract_from(self.request(post_url).text) title, _, artist = text.unescape(extr( @@ -195,10 +194,10 @@ class AryionGalleryExtractor(AryionExtractor): def posts(self): if self.recursive: - url = "{}/g4/gallery/{}".format(self.root, self.user) + url = f"{self.root}/g4/gallery/{self.user}" return self._pagination_params(url) else: - url = "{}/g4/latest.php?name={}".format(self.root, self.user) + url = f"{self.root}/g4/latest.php?name={self.user}" return util.advance(self._pagination_next(url), self.offset) @@ -212,9 +211,8 @@ class AryionFavoriteExtractor(AryionExtractor): example = "https://aryion.com/g4/favorites/USER" def posts(self): - url = "{}/g4/favorites/{}".format(self.root, self.user) - return self._pagination_params( - url, None, "class='gallery-item favorite' id='") + url = f"{self.root}/g4/favorites/{self.user}" + return self._pagination_params(url, None, "data-item-id='") class AryionTagExtractor(AryionExtractor): diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py index a1ad3ae..50e0c5d 100644 --- a/gallery_dl/extractor/batoto.py +++ b/gallery_dl/extractor/batoto.py @@ -7,8 +7,7 @@ """Extractors for https://bato.to/""" from .common import Extractor, ChapterExtractor, MangaExtractor -from .. import text, exception -import re +from .. import text, util BASE_PATTERN = (r"(?:https?://)?(" r"(?:ba|d|f|h|j|m|w)to\.to|" @@ -87,7 +86,7 @@ class BatotoChapterExtractor(BatotoBase, ChapterExtractor): ChapterExtractor.__init__(self, match, False) self._init_root() self.chapter_id = self.groups[1] - self.gallery_url = "{}/title/0/{}".format(self.root, self.chapter_id) + self.page_url = f"{self.root}/title/0/{self.chapter_id}" def metadata(self, page): extr = text.extract_from(page) @@ -104,9 +103,9 @@ class BatotoChapterExtractor(BatotoBase, ChapterExtractor): info = text.remove_html(extr('link-hover">', "', "<") - if warning: - raise exception.StopExtraction("'%s'", text.remove_html(warning)) + if warning := extr(' class="alert alert-warning">', ""): + self.log.warning("'%s'", text.remove_html(warning)) data = { "manga_id": text.parse_int(self.manga_id), @@ -178,6 +176,6 @@ class BatotoMangaExtractor(BatotoBase, MangaExtractor): data["date"] = text.parse_datetime( extr('time="', '"'), "%Y-%m-%dT%H:%M:%S.%fZ") - url = "{}/title/{}".format(self.root, href) + url = f"{self.root}/title/{href}" results.append((url, data.copy())) return results diff --git a/gallery_dl/extractor/bbc.py b/gallery_dl/extractor/bbc.py index b398152..8efb3db 100644 --- a/gallery_dl/extractor/bbc.py +++ b/gallery_dl/extractor/bbc.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021-2023 Mike Fährmann +# Copyright 2021-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -33,7 +33,7 @@ class BbcGalleryExtractor(GalleryExtractor): page, "
<h1>", "</h1>
").rpartition("")[2]), "description": text.unescape(text.extr( page, 'property="og:description" content="', '"')), - "programme": self.gallery_url.split("/")[4], + "programme": self.page_url.split("/")[4], "path": list(util.unique_sequence( element["name"] for element in data["itemListElement"] @@ -43,7 +43,7 @@ class BbcGalleryExtractor(GalleryExtractor): def images(self, page): width = self.config("width") width = width - width % 16 if width else 1920 - dimensions = "/{}xn/".format(width) + dimensions = f"/{width}xn/" results = [] for img in text.extract_iter(page, 'class="gallery__thumbnail', ">"): @@ -60,12 +60,11 @@ class BbcGalleryExtractor(GalleryExtractor): )) return results - @staticmethod - def _fallback_urls(src, max_width): + def _fallback_urls(self, src, max_width): front, _, back = src.partition("/320x180_b/") for width in (1920, 1600, 1280, 976): if width < max_width: - yield "{}/{}xn/{}".format(front, width, back) + yield f"{front}/{width}xn/{back}" class BbcProgrammeExtractor(Extractor): diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py index 14598b7..4a7c074 100644 --- a/gallery_dl/extractor/behance.py +++ b/gallery_dl/extractor/behance.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2023 Mike Fährmann +# Copyright 2018-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -17,6 +17,8 @@ class BehanceExtractor(Extractor): category = "behance" root = "https://www.behance.net" request_interval = (2.0, 4.0) + browser = "firefox" + tls12 = False def _init(self): self._bcp = self.cookies.get("bcp", domain="www.behance.net") @@ -44,15 +46,15 @@ class BehanceExtractor(Extractor): "variables": variables, } - return self.request(url, method="POST", headers=headers, - json=data).json()["data"] + return self.request_json( + url, method="POST", headers=headers, json=data)["data"] def _update(self, data): # compress data to simple lists - if data.get("fields") and isinstance(data["fields"][0], dict): + if (fields := data.get("fields")) and isinstance(fields[0], dict): data["fields"] = [ field.get("name") or field.get("label") - for field in data["fields"] + for field in fields ] data["owners"] = [ @@ -68,6 +70,9 @@ class BehanceExtractor(Extractor): data["date"] = text.parse_timestamp( data.get("publishedOn") or data.get("conceived_on") or 0) + if creator := data.get("creator"): + creator["name"] = creator["url"].rpartition("/")[2] + # backwards compatibility data["gallery_id"] = data["id"] data["title"] = data["name"] @@ -87,13 +92,12 @@ class BehanceGalleryExtractor(BehanceExtractor): def __init__(self, match): BehanceExtractor.__init__(self, match) - self.gallery_id = match.group(1) + self.gallery_id = match[1] def _init(self): BehanceExtractor._init(self) - modules = self.config("modules") - if modules: + if modules := self.config("modules"): if isinstance(modules, str): modules = modules.split(",") self.modules = set(modules) @@ -114,12 +118,15 @@ class BehanceGalleryExtractor(BehanceExtractor): def get_gallery_data(self): """Collect gallery info dict""" - url = "{}/gallery/{}/a".format(self.root, self.gallery_id) + url = f"{self.root}/gallery/{self.gallery_id}/a" cookies = { - "gki": '{"feature_project_view":false,' - '"feature_discover_login_prompt":false,' - '"feature_project_login_prompt":false}', + "gk_suid": "14118261", + "gki": "feature_3_in_1_checkout_test:false,hire_browse_get_quote_c" + 
"ta_ab_test:false,feature_hire_dashboard_services_ab_test:f" + "alse,feature_show_details_jobs_row_ab_test:false,feature_a" + "i_freelance_project_create_flow:false,", "ilo0": "true", + "originalReferrer": "", } page = self.request(url, cookies=cookies).text @@ -141,9 +148,7 @@ class BehanceGalleryExtractor(BehanceExtractor): raise exception.AuthorizationError() return () - result = [] - append = result.append - + results = [] for module in data["modules"]: mtype = module["__typename"][:-6].lower() @@ -161,7 +166,7 @@ class BehanceGalleryExtractor(BehanceExtractor): sizes.get("fs") or sizes.get("hd") or sizes.get("disp")) - append((size["url"], module)) + results.append((size["url"], module)) elif mtype == "video": try: @@ -173,7 +178,7 @@ class BehanceGalleryExtractor(BehanceExtractor): url = "ytdl:" + url module["_ytdl_manifest"] = "hls" module["extension"] = "mp4" - append((url, module)) + results.append((url, module)) continue except Exception as exc: self.log.debug("%s: %s", exc.__class__.__name__, exc) @@ -194,7 +199,7 @@ class BehanceGalleryExtractor(BehanceExtractor): self.log.debug("%s: %s", exc.__class__.__name__, exc) url = "ytdl:" + renditions[-1]["url"] - append((url, module)) + results.append((url, module)) elif mtype == "mediacollection": for component in module["components"]: @@ -202,21 +207,21 @@ class BehanceGalleryExtractor(BehanceExtractor): if size: parts = size["url"].split("/") parts[4] = "source" - append(("/".join(parts), module)) + results.append(("/".join(parts), module)) break elif mtype == "embed": - embed = module.get("originalEmbed") or module.get("fluidEmbed") - if embed: + if embed := (module.get("originalEmbed") or + module.get("fluidEmbed")): embed = text.unescape(text.extr(embed, 'src="', '"')) module["extension"] = "mp4" - append(("ytdl:" + embed, module)) + results.append(("ytdl:" + embed, module)) elif mtype == "text": module["extension"] = "txt" - append(("text:" + module["text"], module)) + results.append(("text:" + module["text"], module)) - return result + return results class BehanceUserExtractor(BehanceExtractor): @@ -228,7 +233,7 @@ class BehanceUserExtractor(BehanceExtractor): def __init__(self, match): BehanceExtractor.__init__(self, match) - self.user = match.group(1) + self.user = match[1] def galleries(self): endpoint = "GetProfileProjects" @@ -256,7 +261,7 @@ class BehanceCollectionExtractor(BehanceExtractor): def __init__(self, match): BehanceExtractor.__init__(self, match) - self.collection_id = match.group(1) + self.collection_id = match[1] def galleries(self): endpoint = "GetMoodboardItemsAndRecommendations" diff --git a/gallery_dl/extractor/bilibili.py b/gallery_dl/extractor/bilibili.py index 597ec40..3f0acff 100644 --- a/gallery_dl/extractor/bilibili.py +++ b/gallery_dl/extractor/bilibili.py @@ -19,20 +19,15 @@ class BilibiliExtractor(Extractor): def _init(self): self.api = BilibiliAPI(self) - -class BilibiliUserArticlesExtractor(BilibiliExtractor): - """Extractor for a bilibili user's articles""" - subcategory = "user-articles" - pattern = (r"(?:https?://)?space\.bilibili\.com/(\d+)" - r"/(?:article|upload/opus)") - example = "https://space.bilibili.com/12345/article" - def items(self): - for article in self.api.user_articles(self.groups[0]): + for article in self.articles(): article["_extractor"] = BilibiliArticleExtractor - url = "{}/opus/{}".format(self.root, article["opus_id"]) + url = f"{self.root}/opus/{article['opus_id']}" yield Message.Queue, url, article + def articles(self): + return () + class 
BilibiliArticleExtractor(BilibiliExtractor): """Extractor for a bilibili article""" @@ -45,12 +40,16 @@ class BilibiliArticleExtractor(BilibiliExtractor): archive_fmt = "{id}_{num}" def items(self): - article = self.api.article(self.groups[0]) + article_id = self.groups[0] + article = self.api.article(article_id) # Flatten modules list modules = {} for module in article["detail"]["modules"]: - del module['module_type'] + if module["module_type"] == "MODULE_TYPE_BLOCKED": + self.log.warning("%s: Blocked Article\n%s", article_id, + module["module_blocked"].get("hint_message")) + del module["module_type"] modules.update(module) article["detail"]["modules"] = modules @@ -64,14 +63,15 @@ class BilibiliArticleExtractor(BilibiliExtractor): except Exception: pass - for paragraph in modules['module_content']['paragraphs']: - if "pic" not in paragraph: - continue + if "module_content" in modules: + for paragraph in modules["module_content"]["paragraphs"]: + if "pic" not in paragraph: + continue - try: - pics.extend(paragraph["pic"]["pics"]) - except Exception: - pass + try: + pics.extend(paragraph["pic"]["pics"]) + except Exception: + pass article["count"] = len(pics) yield Message.Directory, article @@ -81,6 +81,17 @@ class BilibiliArticleExtractor(BilibiliExtractor): yield Message.Url, url, text.nameext_from_url(url, article) +class BilibiliUserArticlesExtractor(BilibiliExtractor): + """Extractor for a bilibili user's articles""" + subcategory = "user-articles" + pattern = (r"(?:https?://)?space\.bilibili\.com/(\d+)" + r"/(?:article|upload/opus)") + example = "https://space.bilibili.com/12345/article" + + def articles(self): + return self.api.user_articles(self.groups[0]) + + class BilibiliUserArticlesFavoriteExtractor(BilibiliExtractor): subcategory = "user-articles-favorite" pattern = (r"(?:https?://)?space\.bilibili\.com" @@ -88,18 +99,12 @@ class BilibiliUserArticlesFavoriteExtractor(BilibiliExtractor): example = "https://space.bilibili.com/12345/favlist?fid=opus" _warning = True - def _init(self): - BilibiliExtractor._init(self) + def articles(self): if self._warning: if not self.cookies_check(("SESSDATA",)): self.log.error("'SESSDATA' cookie required") BilibiliUserArticlesFavoriteExtractor._warning = False - - def items(self): - for article in self.api.user_favlist(): - article["_extractor"] = BilibiliArticleExtractor - url = "{}/opus/{}".format(self.root, article["opus_id"]) - yield Message.Queue, url, article + return self.api.user_favlist() class BilibiliAPI(): @@ -108,11 +113,11 @@ class BilibiliAPI(): def _call(self, endpoint, params): url = "https://api.bilibili.com/x/polymer/web-dynamic/v1" + endpoint - data = self.extractor.request(url, params=params).json() + data = self.extractor.request_json(url, params=params) - if data["code"] != 0: + if data["code"]: self.extractor.log.debug("Server response: %s", data) - raise exception.StopExtraction("API request failed") + raise exception.AbortExtraction("API request failed") return data @@ -140,8 +145,8 @@ class BilibiliAPI(): page, "window.__INITIAL_STATE__=", "};") + "}") except Exception: if "window._riskdata_" not in page: - raise exception.StopExtraction( - "%s: Unable to extract INITIAL_STATE data", article_id) + raise exception.AbortExtraction( + f"{article_id}: Unable to extract INITIAL_STATE data") self.extractor.wait(seconds=300) def user_favlist(self): @@ -159,12 +164,13 @@ class BilibiliAPI(): def login_user_id(self): url = "https://api.bilibili.com/x/space/v2/myinfo" - data = self.extractor.request(url).json() + data = 
self.extractor.request_json(url) if data["code"] != 0: self.extractor.log.debug("Server response: %s", data) - raise exception.StopExtraction("API request failed,Are you login?") + raise exception.AbortExtraction( + "API request failed. Are you logged in?") try: return data["data"]["profile"]["mid"] except Exception: - raise exception.StopExtraction("API request failed") + raise exception.AbortExtraction("API request failed") diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py index ef117da..796d9d1 100644 --- a/gallery_dl/extractor/blogger.py +++ b/gallery_dl/extractor/blogger.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2023 Mike Fährmann +# Copyright 2019-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -10,7 +10,12 @@ from .common import BaseExtractor, Message from .. import text, util -import re + + +def original(url): + return (util.re(r"(/|=)(?:[sw]\d+|w\d+-h\d+)(?=/|$)") + .sub(r"\1s0", url) + .replace("http:", "https:", 1)) class BloggerExtractor(BaseExtractor): @@ -33,13 +38,12 @@ class BloggerExtractor(BaseExtractor): blog["date"] = text.parse_datetime(blog["published"]) del blog["selfLink"] - sub = re.compile(r"(/|=)(?:[sw]\d+|w\d+-h\d+)(?=/|$)").sub - findall_image = re.compile( + findall_image = util.re( r'src="(https?://(?:' r'blogger\.googleusercontent\.com/img|' r'lh\d+(?:-\w+)?\.googleusercontent\.com|' r'\d+\.bp\.blogspot\.com)/[^"]+)').findall - findall_video = re.compile( + findall_video = util.re( r'src="(https?://www\.blogger\.com/video\.g\?token=[^"]+)').findall metadata = self.metadata() @@ -48,7 +52,7 @@ class BloggerExtractor(BaseExtractor): files = findall_image(content) for idx, url in enumerate(files): - files[idx] = sub(r"\1s0", url).replace("http:", "https:", 1) + files[idx] = original(url) if self.videos and 'id="BLOG_video-' in content: page = self.request(post["url"]).text @@ -98,12 +102,8 @@ class BloggerPostExtractor(BloggerExtractor): pattern = BASE_PATTERN + r"(/\d\d\d\d/\d\d/[^/?#]+\.html)" example = "https://BLOG.blogspot.com/1970/01/TITLE.html" - def __init__(self, match): - BloggerExtractor.__init__(self, match) - self.path = match.group(match.lastindex) - def posts(self, blog): - return (self.api.post_by_path(blog["id"], self.path),) + return (self.api.post_by_path(blog["id"], self.groups[-1]),) class BloggerBlogExtractor(BloggerExtractor): @@ -122,16 +122,13 @@ class BloggerSearchExtractor(BloggerExtractor): pattern = BASE_PATTERN + r"/search/?\?q=([^&#]+)" example = "https://BLOG.blogspot.com/search?q=QUERY" - def __init__(self, match): - BloggerExtractor.__init__(self, match) - self.query = text.unquote(match.group(match.lastindex)) + def metadata(self): + self.query = query = text.unquote(self.groups[-1]) + return {"query": query} def posts(self, blog): return self.api.blog_search(blog["id"], self.query) - def metadata(self): - return {"query": self.query} - class BloggerLabelExtractor(BloggerExtractor): """Extractor for Blogger posts by label""" @@ -139,21 +136,18 @@ pattern = BASE_PATTERN + r"/search/label/([^/?#]+)" example = "https://BLOG.blogspot.com/search/label/LABEL" - def __init__(self, match): - BloggerExtractor.__init__(self, match) - self.label = text.unquote(match.group(match.lastindex)) + def metadata(self): + self.label = label = text.unquote(self.groups[-1]) + return {"label": label} def posts(self, blog): return
self.api.blog_posts(blog["id"], self.label) - def metadata(self): - return {"label": self.label} - class BloggerAPI(): - """Minimal interface for the Blogger v3 API + """Minimal interface for the Blogger API v3 - Ref: https://developers.google.com/blogger + https://developers.google.com/blogger """ API_KEY = "AIzaSyCN9ax34oMMyM07g_M-5pjeDp_312eITK8" @@ -162,27 +156,27 @@ class BloggerAPI(): self.api_key = extractor.config("api-key") or self.API_KEY def blog_by_url(self, url): - return self._call("blogs/byurl", {"url": url}, "blog") + return self._call("/blogs/byurl", {"url": url}, "blog") def blog_posts(self, blog_id, label=None): - endpoint = "blogs/{}/posts".format(blog_id) + endpoint = f"/blogs/{blog_id}/posts" params = {"labels": label} return self._pagination(endpoint, params) def blog_search(self, blog_id, query): - endpoint = "blogs/{}/posts/search".format(blog_id) + endpoint = f"/blogs/{blog_id}/posts/search" params = {"q": query} return self._pagination(endpoint, params) def post_by_path(self, blog_id, path): - endpoint = "blogs/{}/posts/bypath".format(blog_id) + endpoint = f"/blogs/{blog_id}/posts/bypath" return self._call(endpoint, {"path": path}, "post") def _call(self, endpoint, params, notfound=None): - url = "https://www.googleapis.com/blogger/v3/" + endpoint + url = "https://www.googleapis.com/blogger/v3" + endpoint params["key"] = self.api_key - return self.extractor.request( - url, params=params, notfound=notfound).json() + return self.extractor.request_json( + url, params=params, notfound=notfound) def _pagination(self, endpoint, params): while True: diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py index 6f4abd5..e2c5334 100644 --- a/gallery_dl/extractor/bluesky.py +++ b/gallery_dl/extractor/bluesky.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2024 Mike Fährmann +# Copyright 2024-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,7 +8,7 @@ """Extractors for https://bsky.app/""" -from .common import Extractor, Message +from .common import Extractor, Message, Dispatch from .. 
import text, util, exception from ..cache import cache, memcache @@ -26,8 +26,7 @@ class BlueskyExtractor(Extractor): root = "https://bsky.app" def _init(self): - meta = self.config("metadata") or () - if meta: + if meta := self.config("metadata") or (): if isinstance(meta, str): meta = meta.replace(" ", "").split(",") elif not isinstance(meta, (list, tuple)): @@ -62,9 +61,8 @@ class BlueskyExtractor(Extractor): yield Message.Directory, post if files: did = post["author"]["did"] - base = ( - "{}/xrpc/com.atproto.sync.getBlob?did={}&cid=".format( - self.api.service_endpoint(did), did)) + base = (f"{self.api.service_endpoint(did)}/xrpc" + f"/com.atproto.sync.getBlob?did={did}&cid=") for post["num"], file in enumerate(files, 1): post.update(file) yield Message.Url, base + file["filename"], post @@ -96,7 +94,7 @@ class BlueskyExtractor(Extractor): uri = record["value"]["subject"]["uri"] if "/app.bsky.feed.post/" in uri: yield from self.api.get_post_thread_uri(uri, depth) - except exception.StopExtraction: + except exception.ControlException: pass # deleted post except Exception as exc: self.log.debug(record, exc_info=exc) @@ -210,16 +208,12 @@ class BlueskyExtractor(Extractor): },) -class BlueskyUserExtractor(BlueskyExtractor): - subcategory = "user" +class BlueskyUserExtractor(Dispatch, BlueskyExtractor): pattern = USER_PATTERN + r"$" example = "https://bsky.app/profile/HANDLE" - def initialize(self): - pass - def items(self): - base = "{}/profile/{}/".format(self.root, self.groups[0]) + base = f"{self.root}/profile/{self.groups[0]}/" default = ("posts" if self.config("quoted", False) or self.config("reposts", False) else "media") return self._dispatch_extractors(( @@ -415,11 +409,9 @@ class BlueskyAPI(): def get_feed(self, actor, feed): endpoint = "app.bsky.feed.getFeed" - params = { - "feed" : "at://{}/app.bsky.feed.generator/{}".format( - self._did_from_actor(actor), feed), - "limit": "100", - } + uri = (f"at://{self._did_from_actor(actor)}" + f"/app.bsky.feed.generator/{feed}") + params = {"feed": uri, "limit": "100"} return self._pagination(endpoint, params) def get_follows(self, actor): @@ -432,16 +424,13 @@ class BlueskyAPI(): def get_list_feed(self, actor, list): endpoint = "app.bsky.feed.getListFeed" - params = { - "list" : "at://{}/app.bsky.graph.list/{}".format( - self._did_from_actor(actor), list), - "limit": "100", - } + uri = f"at://{self._did_from_actor(actor)}/app.bsky.graph.list/{list}" + params = {"list" : uri, "limit": "100"} return self._pagination(endpoint, params) def get_post_thread(self, actor, post_id): - uri = "at://{}/app.bsky.feed.post/{}".format( - self._did_from_actor(actor), post_id) + uri = (f"at://{self._did_from_actor(actor)}" + f"/app.bsky.feed.post/{post_id}") depth = self.extractor.config("depth", "0") return self.get_post_thread_uri(uri, depth) @@ -498,7 +487,7 @@ class BlueskyAPI(): url = "https://plc.directory/" + did try: - data = self.extractor.request(url).json() + data = self.extractor.request_json(url) for service in data["service"]: if service["type"] == "AtprotoPersonalDataServer": return service["serviceEndpoint"] @@ -551,15 +540,15 @@ class BlueskyAPI(): "password" : self.password, } - url = "{}/xrpc/{}".format(self.root, endpoint) + url = f"{self.root}/xrpc/{endpoint}" response = self.extractor.request( url, method="POST", headers=headers, json=data, fatal=None) data = response.json() if response.status_code != 200: self.log.debug("Server response: %s", data) - raise exception.AuthenticationError('"{}: {}"'.format( - data.get("error"), 
data.get("message"))) + raise exception.AuthenticationError( + f"\"{data.get('error')}: {data.get('message')}\"") _refresh_token_cache.update(self.username, data["refreshJwt"]) return "Bearer " + data["accessJwt"] @@ -567,7 +556,7 @@ class BlueskyAPI(): def _call(self, endpoint, params, root=None): if root is None: root = self.root - url = "{}/xrpc/{}".format(root, endpoint) + url = f"{root}/xrpc/{endpoint}" while True: self.authenticate() @@ -581,16 +570,15 @@ class BlueskyAPI(): self.extractor.wait(until=until) continue + msg = "API request failed" try: data = response.json() - msg = "API request failed ('{}: {}')".format( - data["error"], data["message"]) + msg = f"{msg} ('{data['error']}: {data['message']}')" except Exception: - msg = "API request failed ({} {})".format( - response.status_code, response.reason) + msg = f"{msg} ({response.status_code} {response.reason})" self.extractor.log.debug("Server response: %s", response.text) - raise exception.StopExtraction(msg) + raise exception.AbortExtraction(msg) def _pagination(self, endpoint, params, key="feed", root=None, check_empty=False): diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index 7e26f38..3b97e9a 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -27,8 +27,7 @@ class BooruExtractor(BaseExtractor): notes = self.config("notes", False) fetch_html = tags or notes - url_key = self.config("url") - if url_key: + if url_key := self.config("url"): if isinstance(url_key, (list, tuple)): self._file_url = self._file_url_list self._file_url_keys = url_key diff --git a/gallery_dl/extractor/boosty.py b/gallery_dl/extractor/boosty.py index f3e441b..e0383bf 100644 --- a/gallery_dl/extractor/boosty.py +++ b/gallery_dl/extractor/boosty.py @@ -144,8 +144,7 @@ class BoostyExtractor(Extractor): url = block["url"] sep = "&" if "?" in url else "?" 
- signed_query = post.get("signedQuery") - if signed_query: + if signed_query := post.get("signedQuery"): url += sep + signed_query[1:] sep = "&" @@ -218,7 +217,7 @@ class BoostyFollowingExtractor(BoostyExtractor): def items(self): for user in self.api.user_subscriptions(): - url = "{}/{}".format(self.root, user["blog"]["blogUrl"]) + url = f"{self.root}/{user['blog']['blogUrl']}" user["_extractor"] = BoostyUserExtractor yield Message.Queue, url, user @@ -280,15 +279,14 @@ class BoostyAPI(): } if not access_token: - auth = self.extractor.cookies.get("auth", domain=".boosty.to") - if auth: + if auth := self.extractor.cookies.get("auth", domain=".boosty.to"): access_token = text.extr( auth, "%22accessToken%22%3A%22", "%22") if access_token: self.headers["Authorization"] = "Bearer " + access_token def blog_posts(self, username, params): - endpoint = "/v1/blog/{}/post/".format(username) + endpoint = f"/v1/blog/{username}/post/" params = self._merge_params(params, { "limit" : "5", "offset" : None, @@ -298,7 +296,7 @@ class BoostyAPI(): return self._pagination(endpoint, params) def blog_media_album(self, username, type="all", params=()): - endpoint = "/v1/blog/{}/media_album/".format(username) + endpoint = f"/v1/blog/{username}/media_album/" params = self._merge_params(params, { "type" : type.rstrip("s"), "limit" : "15", @@ -318,7 +316,7 @@ class BoostyAPI(): return posts def post(self, username, post_id): - endpoint = "/v1/blog/{}/post/{}".format(username, post_id) + endpoint = f"/v1/blog/{username}/post/{post_id}" return self._call(endpoint) def feed_posts(self, params=None): @@ -381,7 +379,7 @@ class BoostyAPI(): else: self.extractor.log.debug(response.text) - raise exception.StopExtraction("API request failed") + raise exception.AbortExtraction("API request failed") def _pagination(self, endpoint, params, transform=None, key=None): if "is_only_allowed" not in params and self.extractor.only_allowed: @@ -418,11 +416,11 @@ class BoostyAPI(): params["offset"] = offset def dialog(self, dialog_id): - endpoint = "/v1/dialog/{}".format(dialog_id) + endpoint = f"/v1/dialog/{dialog_id}" return self._call(endpoint) def dialog_messages(self, dialog_id, limit=300, offset=None): - endpoint = "/v1/dialog/{}/message/".format(dialog_id) + endpoint = f"/v1/dialog/{dialog_id}/message/" params = { "limit": limit, "reverse": "true", diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 481e962..eba1678 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2022-2023 Mike Fährmann +# Copyright 2022-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -61,6 +61,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): category = "bunkr" root = "https://bunkr.si" root_dl = "https://get.bunkrr.su" + root_api = "https://apidl.bunkr.ru" archive_fmt = "{album_id}_{id|id_url}" pattern = BASE_PATTERN + r"/a/([^/?#]+)" example = "https://bunkr.si/a/ID" @@ -76,9 +77,9 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): endpoint = self.config("endpoint") if not endpoint: - endpoint = self.root_dl + "/api/_001" + endpoint = self.root_api + "/api/_001_v2" elif endpoint[0] == "/": - endpoint = self.root_dl + endpoint + endpoint = self.root_api + endpoint self.endpoint = endpoint self.offset = 0 @@ -123,7 +124,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): pass else: if not DOMAINS: - raise 
exception.StopExtraction( + raise exception.AbortExtraction( "All Bunkr domains require solving a CF challenge") # select alternative domain @@ -168,7 +169,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): info[-1], "%H:%M:%S %d/%m/%Y") yield file - except exception.StopExtraction: + except exception.ControlException: raise except Exception as exc: self.log.error("%s: %s", exc.__class__.__name__, exc) @@ -180,11 +181,11 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): referer = self.root_dl + "/file/" + data_id headers = {"Referer": referer, "Origin": self.root_dl} - data = self.request(self.endpoint, method="POST", headers=headers, - json={"id": data_id}).json() + data = self.request_json(self.endpoint, method="POST", headers=headers, + json={"id": data_id}) if data.get("encrypted"): - key = "SECRET_KEY_{}".format(data["timestamp"] // 3600) + key = f"SECRET_KEY_{data['timestamp'] // 3600}" file_url = util.decrypt_xor(data["url"], key.encode()) else: file_url = data["url"] diff --git a/gallery_dl/extractor/catbox.py b/gallery_dl/extractor/catbox.py index 6c81f53..22f7a97 100644 --- a/gallery_dl/extractor/catbox.py +++ b/gallery_dl/extractor/catbox.py @@ -26,7 +26,7 @@ class CatboxAlbumExtractor(GalleryExtractor): def metadata(self, page): extr = text.extract_from(page) return { - "album_id" : self.gallery_url.rpartition("/")[2], + "album_id" : self.page_url.rpartition("/")[2], "album_name" : text.unescape(extr("
<h1>", "<")), "date" : text.parse_datetime(extr( "<p>
Created ", "<"), "%B %d %Y"), diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py index dc963c5..1da7e23 100644 --- a/gallery_dl/extractor/chevereto.py +++ b/gallery_dl/extractor/chevereto.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2023 Mike Fährmann +# Copyright 2023-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -68,7 +68,7 @@ class CheveretoImageExtractor(CheveretoExtractor): extr('url: "', '"')) if not url or url.endswith("/loading.svg"): pos = page.find(" download=") - url = text.rextract(page, 'href="', '"', pos)[0] + url = text.rextr(page, 'href="', '"', pos) if not url.startswith("https://"): url = util.decrypt_xor( url, b"seltilovessimpcity@simpcityhatesscrapers", diff --git a/gallery_dl/extractor/cien.py b/gallery_dl/extractor/cien.py index 27d50e7..7dfe6b6 100644 --- a/gallery_dl/extractor/cien.py +++ b/gallery_dl/extractor/cien.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2024 Mike Fährmann +# Copyright 2024-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -20,7 +20,7 @@ class CienExtractor(Extractor): request_interval = (1.0, 2.0) def __init__(self, match): - self.root = text.root_from_url(match.group(0)) + self.root = text.root_from_url(match[0]) Extractor.__init__(self, match) def _init(self): @@ -52,8 +52,7 @@ class CienArticleExtractor(CienExtractor): example = "https://ci-en.net/creator/123/article/12345" def items(self): - url = "{}/creator/{}/article/{}".format( - self.root, self.groups[0], self.groups[1]) + url = f"{self.root}/creator/{self.groups[0]}/article/{self.groups[1]}" page = self.request(url, notfound="article").text files = self._extract_files(page) @@ -121,7 +120,7 @@ class CienArticleExtractor(CienExtractor): auth = text.extr(video, ' auth-key="', '"') file = text.nameext_from_url(name) - file["url"] = "{}video-web.mp4?{}".format(path, auth) + file["url"] = f"{path}video-web.mp4?{auth}" file["type"] = "video" files.append(file) @@ -145,12 +144,12 @@ class CienArticleExtractor(CienExtractor): "gallery_id": text.extr(gallery, ' gallery-id="', '"'), "time" : text.extr(gallery, ' time="', '"'), } - data = self.request(url, params=params).json() + data = self.request_json(url, params=params) url = self.root + "/api/creator/gallery/imagePath" for params["page"], params["file_id"] in enumerate( data["imgList"]): - path = self.request(url, params=params).json()["path"] + path = self.request_json(url, params=params)["path"] file = params.copy() file["url"] = path @@ -163,7 +162,7 @@ class CienCreatorExtractor(CienExtractor): example = "https://ci-en.net/creator/123" def items(self): - url = "{}/creator/{}/article".format(self.root, self.groups[0]) + url = f"{self.root}/creator/{self.groups[0]}/article" params = text.parse_query(self.groups[1]) params["mode"] = "list" return self._pagination_articles(url, params) diff --git a/gallery_dl/extractor/civitai.py b/gallery_dl/extractor/civitai.py index 56fe851..dc5b777 100644 --- a/gallery_dl/extractor/civitai.py +++ b/gallery_dl/extractor/civitai.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2024 Mike Fährmann +# Copyright 2024-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,7 +8,7 @@ """Extractors for 
https://www.civitai.com/""" -from .common import Extractor, Message +from .common import Extractor, Message, Dispatch from .. import text, util, exception from ..cache import memcache import itertools @@ -22,9 +22,9 @@ class CivitaiExtractor(Extractor): """Base class for civitai extractors""" category = "civitai" root = "https://civitai.com" - directory_fmt = ("{category}", "{username|user[username]}", "images") - filename_fmt = "{file[id]|id|filename}.{extension}" - archive_fmt = "{file[uuid]|uuid}" + directory_fmt = ("{category}", "{user[username]}", "images") + filename_fmt = "{file[id]}.{extension}" + archive_fmt = "{file[uuid]}" request_interval = (0.5, 1.5) def _init(self): @@ -35,8 +35,7 @@ class CivitaiExtractor(Extractor): self.log.debug("Using tRPC API") self.api = CivitaiTrpcAPI(self) - quality = self.config("quality") - if quality: + if quality := self.config("quality"): if not isinstance(quality, str): quality = ",".join(quality) self._image_quality = quality @@ -45,8 +44,7 @@ class CivitaiExtractor(Extractor): self._image_quality = "original=true" self._image_ext = "png" - quality_video = self.config("quality-videos") - if quality_video: + if quality_video := self.config("quality-videos"): if not isinstance(quality_video, str): quality_video = ",".join(quality_video) if quality_video[0] == "+": @@ -59,28 +57,27 @@ class CivitaiExtractor(Extractor): self._video_quality = "quality=100" self._video_ext = "webm" - metadata = self.config("metadata") - if metadata: + if metadata := self.config("metadata"): if isinstance(metadata, str): metadata = metadata.split(",") elif not isinstance(metadata, (list, tuple)): - metadata = ("generation", "version") + metadata = ("generation", "version", "post") self._meta_generation = ("generation" in metadata) self._meta_version = ("version" in metadata) + self._meta_post = ("post" in metadata) else: - self._meta_generation = self._meta_version = False + self._meta_generation = self._meta_version = self._meta_post = \ + False def items(self): - models = self.models() - if models: + if models := self.models(): data = {"_extractor": CivitaiModelExtractor} for model in models: - url = "{}/models/{}".format(self.root, model["id"]) + url = f"{self.root}/models/{model['id']}" yield Message.Queue, url, data return - posts = self.posts() - if posts: + if posts := self.posts(): for post in posts: if "images" in post: @@ -105,27 +102,37 @@ class CivitaiExtractor(Extractor): yield Message.Url, file["url"], file return - images = self.images() - if images: - for image in images: + if images := self.images(): + for file in images: + + data = { + "file": file, + "user": file.pop("user"), + } if self._meta_generation: - image["generation"] = \ - self._extract_meta_generation(image) + data["generation"] = \ + self._extract_meta_generation(file) if self._meta_version: - image["model"], image["version"] = \ - self._extract_meta_version(image, False) - image["date"] = text.parse_datetime( - image["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ") - - url = self._url(image) - text.nameext_from_url(url, image) - if not image["extension"]: - image["extension"] = ( - self._video_ext if image.get("type") == "video" else + data["model"], data["version"] = \ + self._extract_meta_version(file, False) + if "post" in file: + data["post"] = file.pop("post") + if self._meta_post and "post" not in data: + data["post"] = post = self._extract_meta_post(file) + if post: + post.pop("user", None) + file["date"] = text.parse_datetime( + file["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ") + + data["url"] 
= url = self._url(file) + text.nameext_from_url(url, data) + if not data["extension"]: + data["extension"] = ( + self._video_ext if file.get("type") == "video" else self._image_ext) - yield Message.Directory, image - yield Message.Url, url, image + yield Message.Directory, data + yield Message.Url, url, data return def models(self): @@ -151,12 +158,13 @@ class CivitaiExtractor(Extractor): image["uuid"] = url name = image.get("name") if not name: - mime = image.get("mimeType") or self._image_ext - name = "{}.{}".format(image.get("id"), mime.rpartition("/")[2]) - return ( - "https://image.civitai.com/xG1nkqKTMzGDvpLrqFT7WA/{}/{}/{}".format( - url, quality, name) - ) + if mime := image.get("mimeType"): + name = f"{image.get('id')}.{mime.rpartition('/')[2]}" + else: + ext = self._video_ext if video else self._image_ext + name = f"{image.get('id')}.{ext}" + return (f"https://image.civitai.com/xG1nkqKTMzGDvpLrqFT7WA" + f"/{url}/{quality}/{name}") def _image_results(self, images): for num, file in enumerate(images, 1): @@ -171,10 +179,29 @@ class CivitaiExtractor(Extractor): self._image_ext) if "id" not in file and data["filename"].isdecimal(): file["id"] = text.parse_int(data["filename"]) + if "date" not in file: + file["date"] = text.parse_datetime( + file["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ") if self._meta_generation: file["generation"] = self._extract_meta_generation(file) yield data + def _image_reactions(self): + self._require_auth() + + params = self.params + params["authed"] = True + params["useIndex"] = False + if "reactions" not in params: + params["reactions"] = ("Like", "Dislike", "Heart", "Laugh", "Cry") + return self.api.images(params) + + def _require_auth(self): + if "Authorization" not in self.api.headers and \ + not self.cookies.get( + "__Secure-civitai-token", domain=".civitai.com"): + raise exception.AuthRequired(("'api-key'", "cookies")) + def _parse_query(self, value): return text.parse_query_list( value, {"tags", "reactions", "baseModels", "tools", "techniques", @@ -186,10 +213,18 @@ class CivitaiExtractor(Extractor): except Exception as exc: return self.log.debug("", exc_info=exc) + def _extract_meta_post(self, image): + try: + post = self.api.post(image["postId"]) + post["date"] = text.parse_datetime( + post["publishedAt"], "%Y-%m-%dT%H:%M:%S.%fZ") + return post + except Exception as exc: + return self.log.debug("", exc_info=exc) + def _extract_meta_version(self, item, is_post=True): try: - version_id = self._extract_version_id(item, is_post) - if version_id: + if version_id := self._extract_version_id(item, is_post): version = self.api.model_version(version_id).copy() return version.pop("model", None), version except Exception as exc: @@ -197,12 +232,11 @@ class CivitaiExtractor(Extractor): return None, None def _extract_version_id(self, item, is_post=True): - version_id = item.get("modelVersionId") - if version_id: + if version_id := item.get("modelVersionId"): return version_id - - version_ids = item.get("modelVersionIds") - if version_ids: + if version_ids := item.get("modelVersionIds"): + return version_ids[0] + if version_ids := item.get("modelVersionIdsManual"): return version_ids[0] if is_post: @@ -285,16 +319,15 @@ class CivitaiModelExtractor(CivitaiExtractor): if not sep: name = ext ext = "bin" - file["uuid"] = "model-{}-{}-{}".format( - model["id"], version["id"], file["id"]) + file["uuid"] = f"model-{model['id']}-{version['id']}-{file['id']}" files.append({ "num" : num, "file" : file, "filename" : name, "extension": ext, - "url" : (file.get("downloadUrl") or 
- "{}/api/download/models/{}".format( - self.root, version["id"])), + "url" : ( + file.get("downloadUrl") or + f"{self.root}/api/download/models/{version['id']}"), "_http_headers" : { "Authorization": self.api.headers.get("Authorization")}, "_http_validate": self._validate_file_model, @@ -308,7 +341,7 @@ class CivitaiModelExtractor(CivitaiExtractor): else: params = { "modelVersionId": version["id"], - "prioritizedUserIds": [user["id"]], + "prioritizedUserIds": (user["id"],), "period": "AllTime", "sort": "Most Reactions", "limit": 20, @@ -327,8 +360,7 @@ class CivitaiModelExtractor(CivitaiExtractor): alert = text.extr( response.text, 'mantine-Alert-message">', "") if alert: - msg = "\"{}\" - 'api-key' required".format( - text.remove_html(alert)) + msg = f"\"{text.remove_html(alert)}\" - 'api-key' required" else: msg = "'api-key' required to download this file" self.log.warning(msg) @@ -366,14 +398,26 @@ class CivitaiTagExtractor(CivitaiExtractor): return self.api.models_tag(tag) -class CivitaiSearchExtractor(CivitaiExtractor): - subcategory = "search" +class CivitaiSearchModelsExtractor(CivitaiExtractor): + subcategory = "search-models" pattern = BASE_PATTERN + r"/search/models\?([^#]+)" example = "https://civitai.com/search/models?query=QUERY" def models(self): - params = text.parse_query(self.groups[0]) - return self.api.models(params) + params = self._parse_query(self.groups[0]) + return CivitaiSearchAPI(self).search_models( + params.get("query"), params.get("sortBy"), self.api.nsfw) + + +class CivitaiSearchImagesExtractor(CivitaiExtractor): + subcategory = "search-images" + pattern = BASE_PATTERN + r"/search/images\?([^#]+)" + example = "https://civitai.com/search/images?query=QUERY" + + def images(self): + params = self._parse_query(self.groups[0]) + return CivitaiSearchAPI(self).search_images( + params.get("query"), params.get("sortBy"), self.api.nsfw) class CivitaiModelsExtractor(CivitaiExtractor): @@ -382,7 +426,7 @@ class CivitaiModelsExtractor(CivitaiExtractor): example = "https://civitai.com/models" def models(self): - params = text.parse_query(self.groups[0]) + params = self._parse_query(self.groups[0]) return self.api.models(params) @@ -392,26 +436,32 @@ class CivitaiImagesExtractor(CivitaiExtractor): example = "https://civitai.com/images" def images(self): - params = text.parse_query(self.groups[0]) + params = self._parse_query(self.groups[0]) return self.api.images(params) -class CivitaiUserExtractor(CivitaiExtractor): - subcategory = "user" +class CivitaiPostsExtractor(CivitaiExtractor): + subcategory = "posts" + pattern = BASE_PATTERN + r"/posts(?:/?\?([^#]+))?(?:$|#)" + example = "https://civitai.com/posts" + + def posts(self): + params = self._parse_query(self.groups[0]) + return self.api.posts(params) + + +class CivitaiUserExtractor(Dispatch, CivitaiExtractor): pattern = USER_PATTERN + r"/?(?:$|\?|#)" example = "https://civitai.com/user/USER" - def initialize(self): - pass - def items(self): - base = "{}/user/{}/".format(self.root, self.groups[0]) + base = f"{self.root}/user/{self.groups[0]}/" return self._dispatch_extractors(( (CivitaiUserModelsExtractor, base + "models"), (CivitaiUserPostsExtractor , base + "posts"), (CivitaiUserImagesExtractor, base + "images"), (CivitaiUserVideosExtractor, base + "videos"), - ), ("user-models", "user-posts")) + ), ("user-images", "user-videos")) class CivitaiUserModelsExtractor(CivitaiExtractor): @@ -446,29 +496,17 @@ class CivitaiUserImagesExtractor(CivitaiExtractor): example = "https://civitai.com/user/USER/images" def 
__init__(self, match): - self.params = self._parse_query(match.group(2)) + user, query = match.groups() + self.params = self._parse_query(query) if self.params.get("section") == "reactions": - self.subcategory = "reactions" - self.images = self.images_reactions + self.subcategory = "reactions-images" + self.images = self._image_reactions + else: + self.params["username"] = text.unquote(user) CivitaiExtractor.__init__(self, match) def images(self): - params = self.params - params["username"] = text.unquote(self.groups[0]) - return self.api.images(params) - - def images_reactions(self): - if "Authorization" not in self.api.headers and \ - not self.cookies.get( - "__Secure-civitai-token", domain=".civitai.com"): - raise exception.AuthorizationError("api-key or cookies required") - - params = self.params - params["authed"] = True - params["useIndex"] = False - if "reactions" not in params: - params["reactions"] = ("Like", "Dislike", "Heart", "Laugh", "Cry") - return self.api.images(params) + return self.api.images(self.params) class CivitaiUserVideosExtractor(CivitaiExtractor): @@ -477,14 +515,40 @@ class CivitaiUserVideosExtractor(CivitaiExtractor): pattern = USER_PATTERN + r"/videos/?(?:\?([^#]+))?" example = "https://civitai.com/user/USER/videos" - def images(self): - self._image_ext = "mp4" + def __init__(self, match): + user, query = match.groups() + self.params = self._parse_query(query) + self.params["types"] = ("video",) + if self.params.get("section") == "reactions": + self.subcategory = "reactions-videos" + self.images = self._image_reactions + else: + self.params["username"] = text.unquote(user) + CivitaiExtractor.__init__(self, match) - user, query = self.groups - params = self._parse_query(query) - params["types"] = ["video"] - params["username"] = text.unquote(user) - return self.api.images(params) + images = CivitaiUserImagesExtractor.images + + +class CivitaiGeneratedExtractor(CivitaiExtractor): + """Extractor for your generated files feed""" + subcategory = "generated" + filename_fmt = "{filename}.{extension}" + directory_fmt = ("{category}", "generated") + pattern = f"{BASE_PATTERN}/generate" + example = "https://civitai.com/generate" + + def items(self): + self._require_auth() + + for gen in self.api.orchestrator_queryGeneratedImages(): + gen["date"] = text.parse_datetime( + gen["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ") + yield Message.Directory, gen + for step in gen.pop("steps", ()): + for image in step.pop("images", ()): + data = {"file": image, **step, **gen} + url = image["url"] + yield Message.Url, url, text.nameext_from_url(url, data) class CivitaiRestAPI(): @@ -498,8 +562,7 @@ class CivitaiRestAPI(): self.root = extractor.root + "/api" self.headers = {"Content-Type": "application/json"} - api_key = extractor.config("api-key") - if api_key: + if api_key := extractor.config("api-key"): extractor.log.debug("Using api_key authentication") self.headers["Authorization"] = "Bearer " + api_key @@ -528,12 +591,12 @@ class CivitaiRestAPI(): }) def model(self, model_id): - endpoint = "/v1/models/{}".format(model_id) + endpoint = f"/v1/models/{model_id}" return self._call(endpoint) @memcache(keyarg=1) def model_version(self, model_version_id): - endpoint = "/v1/model-versions/{}".format(model_version_id) + endpoint = f"/v1/model-versions/{model_version_id}" return self._call(endpoint) def models(self, params): @@ -572,13 +635,12 @@ class CivitaiTrpcAPI(): self.root = extractor.root + "/api/trpc/" self.headers = { "content-type" : "application/json", - "x-client-version": "5.0.701", 
+ "x-client-version": "5.0.920", "x-client-date" : "", "x-client" : "web", "x-fingerprint" : "undefined", } - api_key = extractor.config("api-key") - if api_key: + if api_key := extractor.config("api-key"): extractor.log.debug("Using api_key authentication") self.headers["Authorization"] = "Bearer " + api_key @@ -607,11 +669,11 @@ class CivitaiTrpcAPI(): "useIndex" : True, "period" : "AllTime", "sort" : "Newest", - "types" : ["image"], + "types" : ("image",), "withMeta" : False, # Metadata Only "fromPlatform" : False, # Made On-Site "browsingLevel": self.nsfw, - "include" : ["cosmetics"], + "include" : ("cosmetics",), }) params = self._type_params(params) @@ -690,9 +752,10 @@ class CivitaiTrpcAPI(): "followed" : False, "draftOnly" : False, "pending" : True, - "include" : ["cosmetics"], + "include" : ("cosmetics",), }) + params = self._type_params(params) return self._pagination(endpoint, params, meta) def user(self, username): @@ -700,6 +763,15 @@ class CivitaiTrpcAPI(): params = {"username": username} return (self._call(endpoint, params),) + def orchestrator_queryGeneratedImages(self): + endpoint = "orchestrator.queryGeneratedImages" + params = { + "ascending": False, + "tags" : ("gen",), + "authed" : True, + } + return self._pagination(endpoint, params) + def _call(self, endpoint, params, meta=None): url = self.root + endpoint headers = self.headers @@ -765,4 +837,107 @@ class CivitaiTrpcAPI(): def _bool(value): - return True if value == "true" else False + return value == "true" + + +class CivitaiSearchAPI(): + + def __init__(self, extractor): + self.extractor = extractor + self.root = "https://search.civitai.com" + self.headers = { + "Authorization": "Bearer ab8565e5ab8dc2d8f0d4256d204781cb63fe8b031" + "eb3779cbbed38a7b5308e5c", + "Content-Type": "application/json", + "X-Meilisearch-Client": "Meilisearch instant-meilisearch (v0.13.5)" + " ; Meilisearch JavaScript (v0.34.0)", + "Origin": extractor.root, + "Sec-Fetch-Dest": "empty", + "Sec-Fetch-Mode": "cors", + "Sec-Fetch-Site": "same-site", + "Priority": "u=4", + } + + def search(self, query, type, facets, nsfw=31): + endpoint = "/multi-search" + + query = { + "q" : query, + "indexUid": type, + "facets" : facets, + "attributesToHighlight": (), + "highlightPreTag" : "__ais-highlight__", + "highlightPostTag": "__/ais-highlight__", + "limit" : 51, + "offset": 0, + "filter": (self._generate_filter(nsfw),), + } + + return self._pagination(endpoint, query) + + def search_models(self, query, type=None, nsfw=31): + facets = ( + "category.name", + "checkpointType", + "fileFormats", + "lastVersionAtUnix", + "tags.name", + "type", + "user.username", + "version.baseModel", + ) + return self.search(query, type or "models_v9", facets, nsfw) + + def search_images(self, query, type=None, nsfw=31): + facets = ( + "aspectRatio", + "baseModel", + "createdAtUnix", + "tagNames", + "techniqueNames", + "toolNames", + "type", + "user.username", + ) + return self.search(query, type or "images_v6", facets, nsfw) + + def _call(self, endpoint, query): + url = self.root + endpoint + params = util.json_dumps({"queries": (query,)}) + + data = self.extractor.request_json( + url, method="POST", headers=self.headers, data=params) + + return data["results"][0] + + def _pagination(self, endpoint, query): + limit = query["limit"] - 1 + threshold = limit // 2 + + while True: + data = self._call(endpoint, query) + + items = data["hits"] + yield from items + + if len(items) < threshold: + return + query["offset"] += limit + + def _generate_filter(self, level): + fltr = [] + 
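# [editor's sketch -- annotation, not part of the upstream patch]
# `level` is the nsfw browsing-level bitmask used by CivitaiSearchAPI
# (bits 1|2|4|8|16; `search()` above defaults it to 31, i.e. all bits
# set). The chain below keeps only the set bits and renders them as a
# Meilisearch filter string, for example:
#   _generate_filter(5)  -> "(nsfwLevel=1 OR nsfwLevel=4)"
#   _generate_filter(31) -> "(nsfwLevel=1 OR nsfwLevel=2 OR nsfwLevel=4
#                             OR nsfwLevel=8 OR nsfwLevel=16)"
#   _generate_filter(0)  -> "()"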
+ if level & 1: + fltr.append("1") + if level & 2: + fltr.append("2") + if level & 4: + fltr.append("4") + if level & 8: + fltr.append("8") + if level & 16: + fltr.append("16") + + if not fltr: + return "()" + return "(nsfwLevel=" + " OR nsfwLevel=".join(fltr) + ")" diff --git a/gallery_dl/extractor/comick.py b/gallery_dl/extractor/comick.py new file mode 100644 index 0000000..7ef4607 --- /dev/null +++ b/gallery_dl/extractor/comick.py @@ -0,0 +1,198 @@ +# -*- coding: utf-8 -*- + +# Copyright 2025 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://comick.io/""" + +from .common import ChapterExtractor, MangaExtractor, Message +from .. import text +from ..cache import memcache + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?comick\.io" + + +class ComickBase(): + """Base class for comick.io extractors""" + category = "comick" + root = "https://comick.io" + + @memcache(keyarg=1) + def _manga_info(self, slug): + url = f"{self.root}/comic/{slug}" + page = self.request(url).text + data = self._extract_nextdata(page) + props = data["props"]["pageProps"] + comic = props["comic"] + + genre = [] + theme = [] + format = "" + for item in comic["md_comic_md_genres"]: + item = item["md_genres"] + group = item["group"] + if group == "Genre": + genre.append(item["name"]) + elif group == "Theme": + theme.append(item["name"]) + else: + format = item["name"] + + if mu := comic["mu_comics"]: + tags = [c["mu_categories"]["title"] + for c in mu["mu_comic_categories"]] + publisher = [p["mu_publishers"]["title"] + for p in mu["mu_comic_publishers"]] + else: + tags = publisher = () + + return { + "manga": comic["title"], + "manga_id": comic["id"], + "manga_hid": comic["hid"], + "manga_slug": slug, + "manga_titles": [t["title"] for t in comic["md_titles"]], + "artist": [a["name"] for a in props["artists"]], + "author": [a["name"] for a in props["authors"]], + "genre" : genre, + "theme" : theme, + "format": format, + "tags" : tags, + "publisher": publisher, + "published": text.parse_int(comic["year"]), + "description": comic["desc"], + "demographic": props["demographic"], + "origin": comic["iso639_1"], + "mature": props["matureContent"], + "rating": comic["content_rating"], + "rank" : comic["follow_rank"], + "score" : text.parse_float(comic["bayesian_rating"]), + "status": "Complete" if comic["status"] == 2 else "Ongoing", + "links" : comic["links"], + "_build_id": data["buildId"], + } + + def _chapter_info(self, manga, chstr): + slug = manga['manga_slug'] + url = (f"{self.root}/_next/data/{manga['_build_id']}" + f"/comic/{slug}/{chstr}.json") + params = {"slug": slug, "chapter": chstr} + return self.request_json(url, params=params)["pageProps"] + + +class ComickChapterExtractor(ComickBase, ChapterExtractor): + """Extractor for comick.io manga chapters""" + archive_fmt = "{chapter_hid}_{page}" + pattern = BASE_PATTERN + r"/comic/([\w-]+)/(\w+-chapter-[^/?#]+)" + example = "https://comick.io/comic/MANGA/ID-chapter-123-en" + + def metadata(self, page): + slug, chstr = self.groups + manga = self._manga_info(slug) + props = self._chapter_info(manga, chstr) + + ch = props["chapter"] + self._images = ch["md_images"] + chapter, sep, minor = ch["chap"].partition(".") + + return { + **manga, + "title" : props["chapTitle"], + "volume" : text.parse_int(ch["vol"]), + "chapter" : text.parse_int(chapter), + "chapter_minor" : sep + minor, + "chapter_id" : 
ch["id"], + "chapter_hid" : ch["hid"], + "chapter_string": chstr, + "group" : ch["group_name"], + "date" : text.parse_datetime( + ch["created_at"][:19], "%Y-%m-%dT%H:%M:%S"), + "date_updated" : text.parse_datetime( + ch["updated_at"][:19], "%Y-%m-%dT%H:%M:%S"), + "lang" : ch["lang"], + } + + def images(self, page): + return [ + ("https://meo.comick.pictures/" + img["b2key"], { + "width" : img["w"], + "height" : img["h"], + "size" : img["s"], + "optimized": img["optimized"], + }) + for img in self._images + ] + + +class ComickMangaExtractor(ComickBase, MangaExtractor): + """Extractor for comick.io manga""" + pattern = BASE_PATTERN + r"/comic/([\w-]+)/?(?:\?([^#]+))?" + example = "https://comick.io/comic/MANGA" + + def items(self): + slug = self.groups[0] + manga = self._manga_info(slug) + + for ch in self.chapters(manga): + url = (f"{self.root}/comic/{slug}" + f"/{ch['hid']}-chapter-{ch['chap']}-{ch['lang']}") + + ch.update(manga) + chapter, sep, minor = ch["chap"].partition(".") + ch["chapter"] = text.parse_int(chapter) + ch["chapter_minor"] = sep + minor + ch["_extractor"] = ComickChapterExtractor + + yield Message.Queue, url, ch + + def chapters(self, manga): + info = True + slug, query = self.groups + + url = f"https://api.comick.io/comic/{manga['manga_hid']}/chapters" + headers = { + "Origin": "https://comick.io", + "Sec-Fetch-Dest": "empty", + "Sec-Fetch-Mode": "cors", + "Sec-Fetch-Site": "same-site", + } + + query = text.parse_query(query) + params = {"lang": query.get("lang") or None} + params["page"] = page = text.parse_int(query.get("page"), 1) + + if date_order := query.get("date-order"): + params["date-order"] = date_order + elif chap_order := query.get("chap-order"): + params["chap-order"] = chap_order + else: + params["chap-order"] = \ + "0" if self.config("chapter-reverse", False) else "1" + + group = query.get("group", None) + if group == "0": + group = None + + while True: + data = self.request_json(url, params=params, headers=headers) + limit = data["limit"] + + if info: + info = False + total = data["total"] - limit * page + if total > limit: + self.log.info("Collecting %s chapters", total) + + if group is None: + yield from data["chapters"] + else: + for ch in data["chapters"]: + if group in ch["group_name"]: + yield ch + + if data["total"] <= limit * page: + return + params["page"] = page = page + 1 diff --git a/gallery_dl/extractor/comicvine.py b/gallery_dl/extractor/comicvine.py index d076795..39397b9 100644 --- a/gallery_dl/extractor/comicvine.py +++ b/gallery_dl/extractor/comicvine.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021-2023 Mike Fährmann +# Copyright 2021-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -46,7 +46,7 @@ class ComicvineTagExtractor(BooruExtractor): } while True: - images = self.request(url, params=params).json()["images"] + images = self.request_json(url, params=params)["images"] yield from images if len(images) < self.per_page: @@ -59,8 +59,7 @@ class ComicvineTagExtractor(BooruExtractor): _file_url = operator.itemgetter("original") - @staticmethod - def _prepare(post): + def _prepare(self, post): post["date"] = text.parse_datetime( post["dateCreated"], "%a, %b %d %Y") post["tags"] = [tag["name"] for tag in post["tags"] if tag["name"]] diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index c430ec1..d46152b 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ 
-1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2023 Mike Fährmann +# Copyright 2014-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -17,9 +17,10 @@ import queue import random import getpass import logging -import datetime import requests import threading +from datetime import datetime +from xml.etree import ElementTree from requests.adapters import HTTPAdapter from .message import Message from .. import config, output, text, util, cache, exception @@ -35,6 +36,7 @@ class Extractor(): directory_fmt = ("{category}",) filename_fmt = "{filename}.{extension}" archive_fmt = "" + status = 0 root = "" cookies_domain = "" cookies_index = 0 @@ -53,6 +55,15 @@ class Extractor(): self.url = match.string self.match = match self.groups = match.groups() + self.kwdict = {} + + if self.category in CATEGORY_MAP: + catsub = f"{self.category}:{self.subcategory}" + if catsub in CATEGORY_MAP: + self.category, self.subcategory = CATEGORY_MAP[catsub] + else: + self.category = CATEGORY_MAP[self.category] + self._cfgpath = ("extractor", self.category, self.subcategory) self._parentdir = "" @@ -125,11 +136,10 @@ class Extractor(): if first: first = False values = config.accumulate(extr + path, key) - else: - conf = config.get(extr, path[0]) - if conf: - values[:0] = config.accumulate( - (self.subcategory,), key, conf=conf) + elif conf := config.get(extr, path[0]): + values[:0] = config.accumulate( + (self.subcategory,), key, conf=conf) + return values def request(self, url, method="GET", session=None, @@ -149,17 +159,15 @@ class Extractor(): kwargs["verify"] = self._verify if "json" in kwargs: - json = kwargs["json"] - if json is not None: + if (json := kwargs["json"]) is not None: kwargs["data"] = util.json_dumps(json).encode() del kwargs["json"] - headers = kwargs.get("headers") - if headers: + if headers := kwargs.get("headers"): headers["Content-Type"] = "application/json" else: kwargs["headers"] = {"Content-Type": "application/json"} - response = None + response = challenge = None tries = 1 if self._interval: @@ -172,21 +180,22 @@ class Extractor(): try: response = session.request(method, url, **kwargs) except requests.exceptions.ConnectionError as exc: - code = 0 try: reason = exc.args[0].reason cls = reason.__class__.__name__ pre, _, err = str(reason.args[-1]).partition(":") - msg = " {}: {}".format(cls, (err or pre).lstrip()) + msg = f" {cls}: {(err or pre).lstrip()}" except Exception: msg = exc + code = 0 except (requests.exceptions.Timeout, requests.exceptions.ChunkedEncodingError, requests.exceptions.ContentDecodingError) as exc: msg = exc code = 0 except (requests.exceptions.RequestException) as exc: - raise exception.HttpError(exc) + msg = exc + break else: code = response.status_code if self._write_pages: @@ -201,10 +210,10 @@ class Extractor(): response.encoding = encoding return response if notfound and code == 404: + self.status |= exception.NotFoundError.code raise exception.NotFoundError(notfound) - msg = "'{} {}' for '{}'".format( - code, response.reason, response.url) + msg = f"'{code} {response.reason}' for '{response.url}'" challenge = util.detect_challenge(response) if challenge is not None: @@ -238,13 +247,59 @@ class Extractor(): self.sleep(seconds, "retry") tries += 1 - raise exception.HttpError(msg, response) + if not fatal or fatal is ...: + self.log.warning(msg) + return util.NullResponse(url, msg) + + if challenge is None: + exc = exception.HttpError(msg, response) 
+ else: + exc = exception.ChallengeError(challenge, response) + self.status |= exc.code + raise exc def request_location(self, url, **kwargs): kwargs.setdefault("method", "HEAD") kwargs.setdefault("allow_redirects", False) return self.request(url, **kwargs).headers.get("location", "") + def request_json(self, url, **kwargs): + response = self.request(url, **kwargs) + + try: + return util.json_loads(response.text) + except Exception as exc: + fatal = kwargs.get("fatal", True) + if not fatal or fatal is ...: + if challenge := util.detect_challenge(response): + self.log.warning(challenge) + else: + self.log.warning("%s: %s", exc.__class__.__name__, exc) + return {} + raise + + def request_xml(self, url, xmlns=True, **kwargs): + response = self.request(url, **kwargs) + + if xmlns: + text = response.text + else: + text = response.text.replace(" xmlns=", " ns=") + + parser = ElementTree.XMLParser() + try: + parser.feed(text) + return parser.close() + except Exception as exc: + fatal = kwargs.get("fatal", True) + if not fatal or fatal is ...: + if challenge := util.detect_challenge(response): + self.log.warning(challenge) + else: + self.log.warning("%s: %s", exc.__class__.__name__, exc) + return ElementTree.Element("") + raise + _handle_429 = util.false def wait(self, seconds=None, until=None, adjust=1.0, @@ -255,7 +310,7 @@ class Extractor(): seconds = float(seconds) until = now + seconds elif until: - if isinstance(until, datetime.datetime): + if isinstance(until, datetime): # convert to UTC timestamp until = util.datetime_to_timestamp(until) else: @@ -269,8 +324,8 @@ class Extractor(): return if reason: - t = datetime.datetime.fromtimestamp(until).time() - isotime = "{:02}:{:02}:{:02}".format(t.hour, t.minute, t.second) + t = datetime.fromtimestamp(until).time() + isotime = f"{t.hour:02}:{t.minute:02}:{t.second:02}" self.log.info("Waiting until %s (%s)", isotime, reason) time.sleep(seconds) @@ -295,8 +350,8 @@ class Extractor(): if input is None: input = output.TTY_STDIN if not input: - raise exception.StopExtraction( - "User input required (%s)", prompt.strip(" :")) + raise exception.AbortExtraction( + f"User input required ({prompt.strip(' :')})") def _get_auth_info(self): """Return authentication information as (username, password) tuple""" @@ -366,36 +421,31 @@ class Extractor(): elif platform == "linux": platform = "X11; Linux x86_64" elif platform == "macos": - platform = "Macintosh; Intel Mac OS X 11.5" + platform = "Macintosh; Intel Mac OS X 15.5" if browser == "chrome": if platform.startswith("Macintosh"): - platform = platform.replace(".", "_") + "_2" + platform = platform.replace(".", "_") else: browser = "firefox" - for key, value in HTTP_HEADERS[browser]: + for key, value in HEADERS[browser]: if value and "{}" in value: - headers[key] = value.format(platform) + headers[key] = value.replace("{}", platform) else: headers[key] = value ssl_options |= (ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 | ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1) - ssl_ciphers = SSL_CIPHERS[browser] + ssl_ciphers = CIPHERS[browser] else: - useragent = self.config("user-agent") - if useragent is None or useragent == "auto": - useragent = self.useragent - elif useragent == "browser": - useragent = _browser_useragent() - elif self.useragent is not Extractor.useragent and \ - useragent is config.get(("extractor",), "user-agent"): - useragent = self.useragent - headers["User-Agent"] = useragent + headers["User-Agent"] = self.useragent headers["Accept"] = "*/*" headers["Accept-Language"] = "en-US,en;q=0.5" + ssl_ciphers = 
self.ciphers + if ssl_ciphers is not None and ssl_ciphers in CIPHERS: + ssl_ciphers = CIPHERS[ssl_ciphers] if BROTLI: headers["Accept-Encoding"] = "gzip, deflate, br" @@ -404,26 +454,40 @@ class Extractor(): if ZSTD: headers["Accept-Encoding"] += ", zstd" - referer = self.config("referer", self.referer) - if referer: + if referer := self.config("referer", self.referer): if isinstance(referer, str): headers["Referer"] = referer elif self.root: headers["Referer"] = self.root + "/" - custom_headers = self.config("headers") - if custom_headers: + custom_ua = self.config("user-agent") + if custom_ua is None or custom_ua == "auto": + pass + elif custom_ua == "browser": + headers["User-Agent"] = _browser_useragent() + elif self.useragent is Extractor.useragent and not self.browser or \ + custom_ua is not config.get(("extractor",), "user-agent"): + headers["User-Agent"] = custom_ua + + if custom_headers := self.config("headers"): + if isinstance(custom_headers, str): + if custom_headers in HEADERS: + custom_headers = HEADERS[custom_headers] + else: + self.log.error("Invalid 'headers' value '%s'", + custom_headers) + custom_headers = () headers.update(custom_headers) - custom_ciphers = self.config("ciphers") - if custom_ciphers: + if custom_ciphers := self.config("ciphers"): if isinstance(custom_ciphers, list): ssl_ciphers = ":".join(custom_ciphers) + elif custom_ciphers in CIPHERS: + ssl_ciphers = CIPHERS[custom_ciphers] else: ssl_ciphers = custom_ciphers - source_address = self.config("source-address") - if source_address: + if source_address := self.config("source-address"): if isinstance(source_address, str): source_address = (source_address, 0) else: @@ -436,8 +500,17 @@ class Extractor(): ssl_options |= ssl.OP_NO_TLSv1_2 self.log.debug("TLS 1.2 disabled.") + if self.config("truststore"): + try: + from truststore import SSLContext as ssl_ctx + except ImportError as exc: + self.log.error("%s: %s", exc.__class__.__name__, exc) + ssl_ctx = None + else: + ssl_ctx = None + adapter = _build_requests_adapter( - ssl_options, ssl_ciphers, source_address) + ssl_options, ssl_ciphers, ssl_ctx, source_address) session.mount("https://", adapter) session.mount("http://", adapter) @@ -448,10 +521,8 @@ class Extractor(): if self.cookies_domain is None: return - cookies = self.config("cookies") - if cookies: - select = self.config("cookies-select") - if select: + if cookies := self.config("cookies"): + if select := self.config("cookies-select"): if select == "rotate": cookies = cookies[self.cookies_index % len(cookies)] Extractor.cookies_index += 1 @@ -469,9 +540,11 @@ class Extractor(): with open(path) as fp: cookies = util.cookiestxt_load(fp) except Exception as exc: - self.log.warning("cookies: %s", exc) + self.log.warning("cookies: Failed to load '%s' (%s: %s)", + cookies_source, exc.__class__.__name__, exc) else: - self.log.debug("Loading cookies from '%s'", cookies_source) + self.log.debug("cookies: Loading cookies from '%s'", + cookies_source) set_cookie = self.cookies.set_cookie for cookie in cookies: set_cookie(cookie) @@ -479,7 +552,7 @@ class Extractor(): elif isinstance(cookies_source, (list, tuple)): key = tuple(cookies_source) - cookies = _browser_cookies.get(key) + cookies = CACHE_COOKIES.get(key) if cookies is None: from ..cookies import load_cookies @@ -489,18 +562,18 @@ class Extractor(): self.log.warning("cookies: %s", exc) cookies = () else: - _browser_cookies[key] = cookies + CACHE_COOKIES[key] = cookies else: - self.log.debug("Using cached cookies from %s", key) + self.log.debug("cookies: 
Using cached cookies from %s", key) set_cookie = self.cookies.set_cookie for cookie in cookies: set_cookie(cookie) else: - self.log.warning( - "Expected 'dict', 'list', or 'str' value for 'cookies' " - "option, got '%s' (%s)", + self.log.error( + "cookies: Expected 'dict', 'list', or 'str' value for " + "'cookies' option, got '%s' instead (%r)", cookies_source.__class__.__name__, cookies_source) def cookies_store(self): @@ -522,7 +595,8 @@ class Extractor(): util.cookiestxt_store(fp, self.cookies) os.replace(path_tmp, path) except OSError as exc: - self.log.warning("cookies: %s", exc) + self.log.error("cookies: Failed to write to '%s' " + "(%s: %s)", path, exc.__class__.__name__, exc) def cookies_update(self, cookies, domain=""): """Update the session's cookiejar with 'cookies'""" @@ -568,14 +642,17 @@ class Extractor(): if diff <= 0: self.log.warning( - "Cookie '%s' has expired", cookie.name) + "cookies: %s/%s expired at %s", + cookie.domain.lstrip("."), cookie.name, + datetime.fromtimestamp(cookie.expires)) continue elif diff <= 86400: hours = diff // 3600 self.log.warning( - "Cookie '%s' will expire in less than %s hour%s", - cookie.name, hours + 1, "s" if hours else "") + "cookies: %s/%s will expire in less than %s hour%s", + cookie.domain.lstrip("."), cookie.name, + hours + 1, "s" if hours else "") names.discard(cookie.name) if not names: @@ -590,11 +667,6 @@ class Extractor(): return util.json_loads(text.extr( page, ' id="__NEXT_DATA__" type="application/json">', "")) - def _prepare_ddosguard_cookies(self): - if not self.cookies.get("__ddg2", domain=self.cookies_domain): - self.cookies.set( - "__ddg2", util.generate_token(), domain=self.cookies_domain) - def _cache(self, func, maxage, keyarg=None): # return cache.DatabaseCacheDecorator(func, maxage, keyarg) return cache.DatabaseCacheDecorator(func, keyarg, maxage) @@ -608,7 +680,7 @@ class Extractor(): ts = self.config(key, default) if isinstance(ts, str): try: - ts = int(datetime.datetime.strptime(ts, fmt).timestamp()) + ts = int(datetime.strptime(ts, fmt).timestamp()) except ValueError as exc: self.log.warning("Unable to parse '%s': %s", key, exc) ts = default @@ -616,35 +688,12 @@ class Extractor(): fmt = self.config("date-format", "%Y-%m-%dT%H:%M:%S") return get("date-min", dmin), get("date-max", dmax) - def _dispatch_extractors(self, extractor_data, default=()): - """ """ - extractors = { - data[0].subcategory: data - for data in extractor_data - } - - include = self.config("include", default) or () - if include == "all": - include = extractors - elif isinstance(include, str): - include = include.replace(" ", "").split(",") - - result = [(Message.Version, 1)] - for category in include: - try: - extr, url = extractors[category] - except KeyError: - self.log.warning("Invalid include '%s'", category) - else: - result.append((Message.Queue, url, {"_extractor": extr})) - return iter(result) - @classmethod def _dump(cls, obj): util.dump_json(obj, ensure_ascii=False, indent=2) def _dump_response(self, response, history=True): - """Write the response content to a .dump file in the current directory. + """Write the response content to a .txt file in the current directory. 
The file name is derived from the response url, replacing special characters with "_" @@ -657,12 +706,11 @@ class Extractor(): Extractor._dump_index += 1 else: Extractor._dump_index = 1 - Extractor._dump_sanitize = re.compile(r"[\\\\|/<>:\"?*&=#]+").sub + Extractor._dump_sanitize = util.re_compile( + r"[\\\\|/<>:\"?*&=#]+").sub - fname = "{:>02}_{}".format( - Extractor._dump_index, - Extractor._dump_sanitize('_', response.url), - ) + fname = (f"{Extractor._dump_index:>02}_" + f"{Extractor._dump_sanitize('_', response.url)}") if util.WINDOWS: path = os.path.abspath(fname)[:255] @@ -693,19 +741,24 @@ class GalleryExtractor(Extractor): def __init__(self, match, url=None): Extractor.__init__(self, match) - self.gallery_url = self.root + self.groups[0] if url is None else url + + if url is None and (path := self.groups[0]) and path[0] == "/": + self.page_url = f"{self.root}{path}" + else: + self.page_url = url def items(self): self.login() - if self.gallery_url: + if self.page_url: page = self.request( - self.gallery_url, notfound=self.subcategory).text + self.page_url, notfound=self.subcategory).text else: page = None data = self.metadata(page) imgs = self.images(page) + assets = self.assets(page) if "count" in data: if self.config("page-reverse"): @@ -727,7 +780,18 @@ class GalleryExtractor(Extractor): images = enum(imgs, 1) yield Message.Directory, data - for data[self.enum], (url, imgdata) in images: + enum_key = self.enum + + if assets: + for asset in assets: + url = asset["url"] + asset.update(data) + asset[enum_key] = 0 + if "extension" not in asset: + text.nameext_from_url(url, asset) + yield Message.Url, url, asset + + for data[enum_key], (url, imgdata) in images: if imgdata: data.update(imgdata) if "extension" not in imgdata: @@ -743,7 +807,13 @@ class GalleryExtractor(Extractor): """Return a dict with general metadata""" def images(self, page): - """Return a list of all (image-url, metadata)-tuples""" + """Return a list or iterable of all (image-url, metadata)-tuples""" + + def assets(self, page): + """Return an iterable of additional gallery assets + + Each asset must be a 'dict' containing at least 'url' and 'type' + """ class ChapterExtractor(GalleryExtractor): @@ -768,7 +838,11 @@ class MangaExtractor(Extractor): def __init__(self, match, url=None): Extractor.__init__(self, match) - self.manga_url = self.root + self.groups[0] if url is None else url + + if url is None and (path := self.groups[0]) and path[0] == "/": + self.page_url = f"{self.root}{path}" + else: + self.page_url = url if self.config("chapter-reverse", False): self.reverse = not self.reverse @@ -776,8 +850,8 @@ class MangaExtractor(Extractor): def items(self): self.login() - if self.manga_url: - page = self.request(self.manga_url, notfound=self.subcategory).text + if self.page_url: + page = self.request(self.page_url, notfound=self.subcategory).text else: page = None @@ -796,6 +870,45 @@ class MangaExtractor(Extractor): """Return a list of all (chapter-url, metadata)-tuples""" +class Dispatch(): + subcategory = "user" + cookies_domain = None + finalize = Extractor.finalize + skip = Extractor.skip + + def __iter__(self): + return self.items() + + def initialize(self): + pass + + def _dispatch_extractors(self, extractor_data, default=(), alt=None): + extractors = { + data[0].subcategory: data + for data in extractor_data + } + + if alt is not None: + for sub, sub_alt in alt: + extractors[sub_alt] = extractors[sub] + + include = self.config("include", default) or () + if include == "all": + include = extractors + elif 
isinstance(include, str): + include = include.replace(" ", "").split(",") + + results = [(Message.Version, 1)] + for category in include: + try: + extr, url = extractors[category] + except KeyError: + self.log.warning("Invalid include '%s'", category) + else: + results.append((Message.Queue, url, {"_extractor": extr})) + return iter(results) + + class AsynchronousMixin(): """Run info extraction in a separate thread""" @@ -846,7 +959,7 @@ class BaseExtractor(Extractor): if index: self.category, self.root, info = self.instances[index-1] if not self.root: - self.root = text.root_from_url(self.match.group(0)) + self.root = text.root_from_url(self.match[0]) self.config_instance = info.get else: self.root = group @@ -855,8 +968,7 @@ class BaseExtractor(Extractor): @classmethod def update(cls, instances): - extra_instances = config.get(("extractor",), cls.basecategory) - if extra_instances: + if extra_instances := config.get(("extractor",), cls.basecategory): for category, info in extra_instances.items(): if isinstance(info, dict) and "root" in info: instances[category] = info @@ -864,8 +976,7 @@ class BaseExtractor(Extractor): pattern_list = [] instance_list = cls.instances = [] for category, info in instances.items(): - root = info["root"] - if root: + if root := info["root"]: root = root.rstrip("/") instance_list.append((category, root, info)) @@ -898,24 +1009,35 @@ class RequestsAdapter(HTTPAdapter): return HTTPAdapter.proxy_manager_for(self, *args, **kwargs) -def _build_requests_adapter(ssl_options, ssl_ciphers, source_address): - key = (ssl_options, ssl_ciphers, source_address) +def _build_requests_adapter( + ssl_options, ssl_ciphers, ssl_ctx, source_address): + + key = (ssl_options, ssl_ciphers, ssl_ctx, source_address) try: - return _adapter_cache[key] + return CACHE_ADAPTERS[key] except KeyError: pass - if ssl_options or ssl_ciphers: - ssl_context = urllib3.connection.create_urllib3_context( - options=ssl_options or None, ciphers=ssl_ciphers) - if not requests.__version__ < "2.32": - # https://github.com/psf/requests/pull/6731 - ssl_context.load_verify_locations(requests.certs.where()) + if ssl_options or ssl_ciphers or ssl_ctx: + if ssl_ctx is None: + ssl_context = urllib3.connection.create_urllib3_context( + options=ssl_options or None, ciphers=ssl_ciphers) + if not requests.__version__ < "2.32": + # https://github.com/psf/requests/pull/6731 + ssl_context.load_verify_locations(requests.certs.where()) + else: + ssl_ctx_orig = urllib3.util.ssl_.SSLContext + try: + urllib3.util.ssl_.SSLContext = ssl_ctx + ssl_context = urllib3.connection.create_urllib3_context( + options=ssl_options or None, ciphers=ssl_ciphers) + finally: + urllib3.util.ssl_.SSLContext = ssl_ctx_orig ssl_context.check_hostname = False else: ssl_context = None - adapter = _adapter_cache[key] = RequestsAdapter( + adapter = CACHE_ADAPTERS[key] = RequestsAdapter( ssl_context, source_address) return adapter @@ -932,7 +1054,7 @@ def _browser_useragent(): server.listen(1) host, port = server.getsockname() - webbrowser.open("http://{}:{}/user-agent".format(host, port)) + webbrowser.open(f"http://{host}:{port}/user-agent") client = server.accept()[0] server.close() @@ -951,83 +1073,131 @@ def _browser_useragent(): return useragent.decode() -_adapter_cache = {} -_browser_cookies = {} - - -HTTP_HEADERS = { - "firefox": ( - ("User-Agent", "Mozilla/5.0 ({}; " - "rv:128.0) Gecko/20100101 Firefox/128.0"), - ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9," - "image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8"), 
- ("Accept-Language", "en-US,en;q=0.5"), - ("Accept-Encoding", None), - ("Referer", None), - ("Connection", "keep-alive"), - ("Upgrade-Insecure-Requests", "1"), - ("Cookie", None), - ("Sec-Fetch-Dest", "empty"), - ("Sec-Fetch-Mode", "no-cors"), - ("Sec-Fetch-Site", "same-origin"), - ("TE", "trailers"), - ), - "chrome": ( - ("Connection", "keep-alive"), - ("Upgrade-Insecure-Requests", "1"), - ("User-Agent", "Mozilla/5.0 ({}) AppleWebKit/537.36 (KHTML, " - "like Gecko) Chrome/111.0.0.0 Safari/537.36"), - ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9," - "image/avif,image/webp,image/apng,*/*;q=0.8," - "application/signed-exchange;v=b3;q=0.7"), - ("Referer", None), - ("Sec-Fetch-Site", "same-origin"), - ("Sec-Fetch-Mode", "no-cors"), - ("Sec-Fetch-Dest", "empty"), - ("Accept-Encoding", None), - ("Accept-Language", "en-US,en;q=0.9"), - ("cookie", None), - ("content-length", None), - ), +CACHE_ADAPTERS = {} +CACHE_COOKIES = {} +CATEGORY_MAP = () + + +HEADERS_FIREFOX_140 = ( + ("User-Agent", "Mozilla/5.0 ({}; rv:140.0) Gecko/20100101 Firefox/140.0"), + ("Accept", "text/html,application/xhtml+xml," + "application/xml;q=0.9,*/*;q=0.8"), + ("Accept-Language", "en-US,en;q=0.5"), + ("Accept-Encoding", None), + ("Connection", "keep-alive"), + ("Content-Type", None), + ("Content-Length", None), + ("Referer", None), + ("Origin", None), + ("Cookie", None), + ("Sec-Fetch-Dest", "empty"), + ("Sec-Fetch-Mode", "cors"), + ("Sec-Fetch-Site", "same-origin"), + ("TE", "trailers"), +) +HEADERS_FIREFOX_128 = ( + ("User-Agent", "Mozilla/5.0 ({}; rv:128.0) Gecko/20100101 Firefox/128.0"), + ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9," + "image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8"), + ("Accept-Language", "en-US,en;q=0.5"), + ("Accept-Encoding", None), + ("Referer", None), + ("Connection", "keep-alive"), + ("Upgrade-Insecure-Requests", "1"), + ("Cookie", None), + ("Sec-Fetch-Dest", "empty"), + ("Sec-Fetch-Mode", "no-cors"), + ("Sec-Fetch-Site", "same-origin"), + ("TE", "trailers"), +) +HEADERS_CHROMIUM_138 = ( + ("Connection", "keep-alive"), + ("sec-ch-ua", '"Not)A;Brand";v="8", "Chromium";v="138"'), + ("sec-ch-ua-mobile", "?0"), + ("sec-ch-ua-platform", '"Linux"'), + ("Upgrade-Insecure-Requests", "1"), + ("User-Agent", "Mozilla/5.0 ({}) AppleWebKit/537.36 (KHTML, " + "like Gecko) Chrome/138.0.0.0 Safari/537.36"), + ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9," + "image/avif,image/webp,image/apng,*/*;q=0.8," + "application/signed-exchange;v=b3;q=0.7"), + ("Referer", None), + ("Sec-Fetch-Site", "same-origin"), + ("Sec-Fetch-Mode", "no-cors"), + # ("Sec-Fetch-User", "?1"), + ("Sec-Fetch-Dest", "empty"), + ("Accept-Encoding", None), + ("Accept-Language", "en-US,en;q=0.9"), +) +HEADERS_CHROMIUM_111 = ( + ("Connection", "keep-alive"), + ("Upgrade-Insecure-Requests", "1"), + ("User-Agent", "Mozilla/5.0 ({}) AppleWebKit/537.36 (KHTML, " + "like Gecko) Chrome/111.0.0.0 Safari/537.36"), + ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9," + "image/avif,image/webp,image/apng,*/*;q=0.8," + "application/signed-exchange;v=b3;q=0.7"), + ("Referer", None), + ("Sec-Fetch-Site", "same-origin"), + ("Sec-Fetch-Mode", "no-cors"), + ("Sec-Fetch-Dest", "empty"), + ("Accept-Encoding", None), + ("Accept-Language", "en-US,en;q=0.9"), + ("cookie", None), + ("content-length", None), +) +HEADERS = { + "firefox" : HEADERS_FIREFOX_140, + "firefox/140": HEADERS_FIREFOX_140, + "firefox/128": HEADERS_FIREFOX_128, + "chrome" : HEADERS_CHROMIUM_138, + 
"chrome/138" : HEADERS_CHROMIUM_138, + "chrome/111" : HEADERS_CHROMIUM_111, } -SSL_CIPHERS = { - "firefox": ( - "TLS_AES_128_GCM_SHA256:" - "TLS_CHACHA20_POLY1305_SHA256:" - "TLS_AES_256_GCM_SHA384:" - "ECDHE-ECDSA-AES128-GCM-SHA256:" - "ECDHE-RSA-AES128-GCM-SHA256:" - "ECDHE-ECDSA-CHACHA20-POLY1305:" - "ECDHE-RSA-CHACHA20-POLY1305:" - "ECDHE-ECDSA-AES256-GCM-SHA384:" - "ECDHE-RSA-AES256-GCM-SHA384:" - "ECDHE-ECDSA-AES256-SHA:" - "ECDHE-ECDSA-AES128-SHA:" - "ECDHE-RSA-AES128-SHA:" - "ECDHE-RSA-AES256-SHA:" - "AES128-GCM-SHA256:" - "AES256-GCM-SHA384:" - "AES128-SHA:" - "AES256-SHA" - ), - "chrome": ( - "TLS_AES_128_GCM_SHA256:" - "TLS_AES_256_GCM_SHA384:" - "TLS_CHACHA20_POLY1305_SHA256:" - "ECDHE-ECDSA-AES128-GCM-SHA256:" - "ECDHE-RSA-AES128-GCM-SHA256:" - "ECDHE-ECDSA-AES256-GCM-SHA384:" - "ECDHE-RSA-AES256-GCM-SHA384:" - "ECDHE-ECDSA-CHACHA20-POLY1305:" - "ECDHE-RSA-CHACHA20-POLY1305:" - "ECDHE-RSA-AES128-SHA:" - "ECDHE-RSA-AES256-SHA:" - "AES128-GCM-SHA256:" - "AES256-GCM-SHA384:" - "AES128-SHA:" - "AES256-SHA" - ), +CIPHERS_FIREFOX = ( + "TLS_AES_128_GCM_SHA256:" + "TLS_CHACHA20_POLY1305_SHA256:" + "TLS_AES_256_GCM_SHA384:" + "ECDHE-ECDSA-AES128-GCM-SHA256:" + "ECDHE-RSA-AES128-GCM-SHA256:" + "ECDHE-ECDSA-CHACHA20-POLY1305:" + "ECDHE-RSA-CHACHA20-POLY1305:" + "ECDHE-ECDSA-AES256-GCM-SHA384:" + "ECDHE-RSA-AES256-GCM-SHA384:" + "ECDHE-ECDSA-AES256-SHA:" + "ECDHE-ECDSA-AES128-SHA:" + "ECDHE-RSA-AES128-SHA:" + "ECDHE-RSA-AES256-SHA:" + "AES128-GCM-SHA256:" + "AES256-GCM-SHA384:" + "AES128-SHA:" + "AES256-SHA" +) +CIPHERS_CHROMIUM = ( + "TLS_AES_128_GCM_SHA256:" + "TLS_AES_256_GCM_SHA384:" + "TLS_CHACHA20_POLY1305_SHA256:" + "ECDHE-ECDSA-AES128-GCM-SHA256:" + "ECDHE-RSA-AES128-GCM-SHA256:" + "ECDHE-ECDSA-AES256-GCM-SHA384:" + "ECDHE-RSA-AES256-GCM-SHA384:" + "ECDHE-ECDSA-CHACHA20-POLY1305:" + "ECDHE-RSA-CHACHA20-POLY1305:" + "ECDHE-RSA-AES128-SHA:" + "ECDHE-RSA-AES256-SHA:" + "AES128-GCM-SHA256:" + "AES256-GCM-SHA384:" + "AES128-SHA:" + "AES256-SHA" +) +CIPHERS = { + "firefox" : CIPHERS_FIREFOX, + "firefox/140": CIPHERS_FIREFOX, + "firefox/128": CIPHERS_FIREFOX, + "chrome" : CIPHERS_CHROMIUM, + "chrome/138" : CIPHERS_CHROMIUM, + "chrome/111" : CIPHERS_CHROMIUM, } diff --git a/gallery_dl/extractor/cyberdrop.py b/gallery_dl/extractor/cyberdrop.py index e150829..b3944f7 100644 --- a/gallery_dl/extractor/cyberdrop.py +++ b/gallery_dl/extractor/cyberdrop.py @@ -32,7 +32,7 @@ class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor): yield Message.Url, file["url"], file def fetch_album(self, album_id): - url = "{}/a/{}".format(self.root, album_id) + url = f"{self.root}/a/{album_id}" page = self.request(url).text extr = text.extract_from(page) @@ -60,9 +60,9 @@ class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor): def _extract_files(self, file_ids): for file_id in file_ids: try: - url = "{}/api/file/info/{}".format(self.root_api, file_id) - file = self.request(url).json() - auth = self.request(file["auth_url"]).json() + url = f"{self.root_api}/api/file/info/{file_id}" + file = self.request_json(url) + auth = self.request_json(file["auth_url"]) file["url"] = auth["url"] except Exception as exc: self.log.warning("%s (%s: %s)", diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index 06c31b9..ff071c5 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2023 Mike Fährmann +# Copyright 2014-2025 Mike Fährmann # # This program is free software; you can redistribute it 
and/or modify # it under the terms of the GNU General Public License version 2 as @@ -49,8 +49,7 @@ class DanbooruExtractor(BaseExtractor): def items(self): # 'includes' initialization must be done here and not in '_init()' # or it'll cause an exception with e621 when 'metadata' is enabled - includes = self.config("metadata") - if includes: + if includes := self.config("metadata"): if isinstance(includes, (list, tuple)): includes = ",".join(includes) elif not isinstance(includes, str): @@ -112,8 +111,7 @@ class DanbooruExtractor(BaseExtractor): def items_artists(self): for artist in self.artists(): artist["_extractor"] = DanbooruTagExtractor - url = "{}/posts?tags={}".format( - self.root, text.quote(artist["name"])) + url = f"{self.root}/posts?tags={text.quote(artist['name'])}" yield Message.Queue, url, artist def metadata(self): @@ -129,7 +127,7 @@ class DanbooruExtractor(BaseExtractor): first = True while True: - posts = self.request(url, params=params).json() + posts = self.request_json(url, params=params) if isinstance(posts, dict): posts = posts["posts"] @@ -142,8 +140,7 @@ class DanbooruExtractor(BaseExtractor): } data = { meta["id"]: meta - for meta in self.request( - url, params=params_meta).json() + for meta in self.request_json(url, params=params_meta) } for post in posts: post.update(data[post["id"]]) @@ -157,7 +154,7 @@ class DanbooruExtractor(BaseExtractor): return if prefix: - params["page"] = "{}{}".format(prefix, posts[-1]["id"]) + params["page"] = f"{prefix}{posts[-1]['id']}" elif params["page"]: params["page"] += 1 else: @@ -165,11 +162,17 @@ class DanbooruExtractor(BaseExtractor): first = False def _ugoira_frames(self, post): - data = self.request("{}/posts/{}.json?only=media_metadata".format( - self.root, post["id"]) - ).json()["media_metadata"]["metadata"] + data = self.request_json( + f"{self.root}/posts/{post['id']}.json?only=media_metadata" + )["media_metadata"]["metadata"] + + if "Ugoira:FrameMimeType" in data: + ext = data["Ugoira:FrameMimeType"].rpartition("/")[2] + if ext == "jpeg": + ext = "jpg" + else: + ext = data["ZIP:ZipFileName"].rpartition(".")[2] - ext = data["ZIP:ZipFileName"].rpartition(".")[2] fmt = ("{:>06}." 
+ ext).format delays = data["Ugoira:FrameDelays"] return [{"file": fmt(index), "delay": delay} @@ -180,15 +183,15 @@ class DanbooruExtractor(BaseExtractor): order = self.config("order-posts") if not order or order in {"asc", "pool", "pool_asc", "asc_pool"}: - params = {"tags": "ord{}:{}".format(ctype, cid)} + params = {"tags": f"ord{ctype}:{cid}"} elif order in {"id", "desc_id", "id_desc"}: - params = {"tags": "{}:{}".format(ctype, cid)} + params = {"tags": f"{ctype}:{cid}"} prefix = "b" elif order in {"desc", "desc_pool", "pool_desc"}: - params = {"tags": "ord{}:{}".format(ctype, cid)} + params = {"tags": f"ord{ctype}:{cid}"} reverse = True elif order in {"asc_id", "id_asc"}: - params = {"tags": "{}:{}".format(ctype, cid)} + params = {"tags": f"{ctype}:{cid}"} reverse = True posts = self._pagination("/posts.json", params, prefix) @@ -199,8 +202,8 @@ class DanbooruExtractor(BaseExtractor): return self._collection_enumerate(posts) def _collection_metadata(self, cid, ctype, cname=None): - url = "{}/{}s/{}.json".format(self.root, cname or ctype, cid) - collection = self.request(url).json() + url = f"{self.root}/{cname or ctype}s/{cid}.json" + collection = self.request_json(url) collection["name"] = collection["name"].replace("_", " ") self.post_ids = collection.pop("post_ids", ()) return {ctype: collection} @@ -315,11 +318,11 @@ class DanbooruPostExtractor(DanbooruExtractor): example = "https://danbooru.donmai.us/posts/12345" def posts(self): - url = "{}/posts/{}.json".format(self.root, self.groups[-1]) - post = self.request(url).json() + url = f"{self.root}/posts/{self.groups[-1]}.json" + post = self.request_json(url) if self.includes: params = {"only": self.includes} - post.update(self.request(url, params=params).json()) + post.update(self.request_json(url, params=params)) return (post,) @@ -357,8 +360,8 @@ class DanbooruArtistExtractor(DanbooruExtractor): items = DanbooruExtractor.items_artists def artists(self): - url = "{}/artists/{}.json".format(self.root, self.groups[-1]) - return (self.request(url).json(),) + url = f"{self.root}/artists/{self.groups[-1]}.json" + return (self.request_json(url),) class DanbooruArtistSearchExtractor(DanbooruExtractor): @@ -375,7 +378,7 @@ class DanbooruArtistSearchExtractor(DanbooruExtractor): params["page"] = text.parse_int(params.get("page"), 1) while True: - artists = self.request(url, params=params).json() + artists = self.request_json(url, params=params) yield from artists diff --git a/gallery_dl/extractor/dankefuerslesen.py b/gallery_dl/extractor/dankefuerslesen.py new file mode 100644 index 0000000..a2b0f42 --- /dev/null +++ b/gallery_dl/extractor/dankefuerslesen.py @@ -0,0 +1,120 @@ +# -*- coding: utf-8 -*- + +# Copyright 2025 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://danke.moe/""" + +from .common import ChapterExtractor, MangaExtractor +from .. 
import text, util +from ..cache import memcache + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?danke\.moe" + + +class DankefuerslesenBase(): + """Base class for dankefuerslesen extractors""" + category = "dankefuerslesen" + root = "https://danke.moe" + + @memcache(keyarg=1) + def _manga_info(self, slug): + url = f"{self.root}/api/series/{slug}/" + return self.request_json(url) + + +class DankefuerslesenChapterExtractor(DankefuerslesenBase, ChapterExtractor): + """Extractor for Danke fürs Lesen manga chapters""" + pattern = BASE_PATTERN + r"/read/manga/([\w-]+)/([\w-]+)" + example = "https://danke.moe/read/manga/TITLE/123/1/" + + def _init(self): + self.zip = self.config("zip", False) + if self.zip: + self.filename_fmt = f"{self.directory_fmt[-1]}.{{extension}}" + self.directory_fmt = self.directory_fmt[:-1] + + def metadata(self, page): + slug, ch = self.groups + manga = self._manga_info(slug) + + if "-" in ch: + chapter, sep, minor = ch.rpartition("-") + ch = ch.replace("-", ".") + minor = "." + minor + else: + chapter = ch + minor = "" + + data = manga["chapters"][ch] + group_id, self._files = next(iter(data["groups"].items())) + + if not self.zip: + self.base = (f"{self.root}/media/manga/{slug}/chapters" + f"/{data['folder']}/{group_id}/") + + return { + "manga" : manga["title"], + "manga_slug": manga["slug"], + "title" : data["title"], + "volume" : text.parse_int(data["volume"]), + "chapter" : text.parse_int(chapter), + "chapter_minor": minor, + "group" : manga["groups"][group_id].split(" & "), + "group_id" : text.parse_int(group_id), + "date" : text.parse_timestamp(data["release_date"][group_id]), + "lang" : util.NONE, + "language" : util.NONE, + } + + def images(self, page): + if self.zip: + return () + + base = self.base + return [(base + file, None) for file in self._files] + + def assets(self, page): + if self.zip: + slug, ch = self.groups + url = f"{self.root}/api/download_chapter/{slug}/{ch}/" + return ({ + "type" : "archive", + "extension": "zip", + "url" : url, + },) + + +class DankefuerslesenMangaExtractor(DankefuerslesenBase, MangaExtractor): + """Extractor for Danke fürs Lesen manga""" + chapterclass = DankefuerslesenChapterExtractor + reverse = False + pattern = BASE_PATTERN + r"/read/manga/([^/?#]+)" + example = "https://danke.moe/read/manga/TITLE/" + + def chapters(self, page): + results = [] + + manga = self._manga_info(self.groups[0]).copy() + manga["lang"] = util.NONE + manga["language"] = util.NONE + + base = f"{self.root}/read/manga/{manga['slug']}/" + for ch, data in manga.pop("chapters").items(): + + if "." 
in ch: + chapter, sep, minor = ch.rpartition(".") + ch = ch.replace('.', '-') + data["chapter"] = text.parse_int(chapter) + data["chapter_minor"] = sep + minor + else: + data["chapter"] = text.parse_int(ch) + data["chapter_minor"] = "" + + manga.update(data) + results.append((f"{base}{ch}/1/", manga)) + + return results diff --git a/gallery_dl/extractor/desktopography.py b/gallery_dl/extractor/desktopography.py index 35bb299..364d88f 100644 --- a/gallery_dl/extractor/desktopography.py +++ b/gallery_dl/extractor/desktopography.py @@ -46,10 +46,10 @@ class DesktopographyExhibitionExtractor(DesktopographyExtractor): def __init__(self, match): DesktopographyExtractor.__init__(self, match) - self.year = match.group(1) + self.year = match[1] def items(self): - url = "{}/exhibition-{}/".format(self.root, self.year) + url = f"{self.root}/exhibition-{self.year}/" base_entry_url = "https://desktopography.net/portfolios/" page = self.request(url).text @@ -75,10 +75,10 @@ class DesktopographyEntryExtractor(DesktopographyExtractor): def __init__(self, match): DesktopographyExtractor.__init__(self, match) - self.entry = match.group(1) + self.entry = match[1] def items(self): - url = "{}/portfolios/{}".format(self.root, self.entry) + url = f"{self.root}/portfolios/{self.entry}" page = self.request(url).text entry_data = {"entry": self.entry} diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 37f57fe..66e2a1e 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2023 Mike Fährmann +# Copyright 2015-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,14 +8,13 @@ """Extractors for https://www.deviantart.com/""" -from .common import Extractor, Message +from .common import Extractor, Message, Dispatch from .. import text, util, exception from ..cache import cache, memcache import collections import mimetypes import binascii import time -import re BASE_PATTERN = ( r"(?:https?://)?(?:" @@ -37,7 +36,7 @@ class DeviantartExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.user = (match.group(1) or match.group(2) or "").lower() + self.user = (match[1] or match[2] or "").lower() self.offset = 0 def _init(self): @@ -56,8 +55,7 @@ class DeviantartExtractor(Extractor): self.group = False self._premium_cache = {} - unwatch = self.config("auto-unwatch") - if unwatch: + if self.config("auto-unwatch"): self.unwatch = [] self.finalize = self._unwatch_premium else: @@ -66,10 +64,13 @@ class DeviantartExtractor(Extractor): if self.quality: if self.quality == "png": self.quality = "-fullview.png?" 
- self.quality_sub = re.compile(r"-fullview\.[a-z0-9]+\?").sub + self.quality_sub = util.re(r"-fullview\.[a-z0-9]+\?").sub else: - self.quality = ",q_{}".format(self.quality) - self.quality_sub = re.compile(r",q_\d+").sub + self.quality = f",q_{self.quality}" + self.quality_sub = util.re(r",q_\d+").sub + + if self.intermediary: + self.intermediary_subn = util.re(r"(/f/[^/]+/[^/]+)/v\d+/.*").subn if isinstance(self.original, str) and \ self.original.lower().startswith("image"): @@ -116,15 +117,13 @@ class DeviantartExtractor(Extractor): def items(self): if self.user: - group = self.config("group", True) - if group: - user = _user_details(self, self.user) - if user: + if group := self.config("group", True): + if user := _user_details(self, self.user): self.user = user["username"] self.group = False elif group == "skip": self.log.info("Skipping group '%s'", self.user) - raise exception.StopExtraction() + raise exception.AbortExtraction() else: self.subcategory = "group-" + self.subcategory self.group = True @@ -177,8 +176,7 @@ class DeviantartExtractor(Extractor): yield self.commit(deviation, deviation["flash"]) if self.commit_journal: - journal = self._extract_journal(deviation) - if journal: + if journal := self._extract_journal(deviation): if self.extra: deviation["_journal"] = journal["html"] deviation["is_original"] = True @@ -194,7 +192,7 @@ class DeviantartExtractor(Extractor): continue _user_details.update(name, user) - url = "{}/{}/avatar/".format(self.root, name) + url = f"{self.root}/{name}/avatar/" comment["_extractor"] = DeviantartAvatarExtractor yield Message.Queue, url, comment @@ -225,7 +223,7 @@ class DeviantartExtractor(Extractor): if txt is None: continue for match in DeviantartStashExtractor.pattern.finditer(txt): - url = text.ensure_http_scheme(match.group(0)) + url = text.ensure_http_scheme(match[0]) deviation["_extractor"] = DeviantartStashExtractor yield Message.Queue, url, deviation @@ -271,15 +269,14 @@ class DeviantartExtractor(Extractor): ) # filename metadata - sub = re.compile(r"\W").sub + sub = util.re(r"\W").sub deviation["filename"] = "".join(( sub("_", deviation["title"].lower()), "_by_", sub("_", deviation["author"]["username"].lower()), "-d", deviation["index_base36"], )) - @staticmethod - def commit(deviation, target): + def commit(self, deviation, target): url = target["src"] name = target.get("filename") or url target = target.copy() @@ -321,7 +318,7 @@ class DeviantartExtractor(Extractor): header = HEADER_TEMPLATE.format( title=title, url=url, - userurl="{}/{}/".format(self.root, urlname), + userurl=f"{self.root}/{urlname}/", username=username, date=deviation["date"], ) @@ -388,8 +385,7 @@ class DeviantartExtractor(Extractor): deviations = state["@@entities"]["deviation"] content = deviations.popitem()[1]["textContent"] - html = self._textcontent_to_html(deviation, content) - if html: + if html := self._textcontent_to_html(deviation, content): return {"html": html} return {"html": content["excerpt"].replace("\n", "
<br/>
")} @@ -431,12 +427,11 @@ class DeviantartExtractor(Extractor): type = content["type"] if type == "paragraph": - children = content.get("content") - if children: + if children := content.get("content"): html.append('

\ if content["src"].startswith("https://images-wixmp-"): if self.intermediary and deviation["index"] <= 790677560: # https://github.com/r888888888/danbooru/issues/4069 - intermediary, count = re.subn( - r"(/f/[^/]+/[^/]+)/v\d+/.*", + intermediary, count = self.intermediary_subn( r"/intermediary\1", content["src"], 1) if count: deviation["is_original"] = False @@ -679,11 +671,10 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\ return content - @staticmethod - def _find_folder(folders, name, uuid): + def _find_folder(self, folders, name, uuid): if uuid.isdecimal(): - match = re.compile(name.replace( - "-", r"[^a-z0-9]+") + "$", re.IGNORECASE).match + match = util.re( + "(?i)" + name.replace("-", "[^a-z0-9]+") + "$").match for folder in folders: if match(folder["name"]): return folder @@ -702,10 +693,10 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\ raise exception.NotFoundError("folder") def _folder_urls(self, folders, category, extractor): - base = "{}/{}/{}/".format(self.root, self.user, category) + base = f"{self.root}/{self.user}/{category}/" for folder in folders: folder["_extractor"] = extractor - url = "{}{}/{}".format(base, folder["folderid"], folder["name"]) + url = f"{base}{folder['folderid']}/{folder['name']}" yield url, folder def _update_content_default(self, deviation, content): @@ -748,13 +739,10 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\ deviation["_fallback"] = (content["src"],) deviation["is_original"] = True + pl = binascii.b2a_base64(payload).rstrip(b'=\n').decode() content["src"] = ( - "{}?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJub25lIn0.{}.".format( - url, - # base64 of 'header' is precomputed as 'eyJ0eX...' - # binascii.b2a_base64(header).rstrip(b"=\n").decode(), - binascii.b2a_base64(payload).rstrip(b"=\n").decode()) - ) + # base64 of 'header' is precomputed as 'eyJ0eX...' 
+ f"{url}?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJub25lIn0.{pl}.") def _extract_comments(self, target_id, target_type="deviation"): results = None @@ -845,8 +833,7 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\ for fmt in media["types"] } - tokens = media.get("token") or () - if tokens: + if tokens := media.get("token") or (): if len(tokens) <= 1: fmt = formats[format] if "c" in fmt: @@ -873,19 +860,13 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\ .replace("\\\\", "\\") -class DeviantartUserExtractor(DeviantartExtractor): +class DeviantartUserExtractor(Dispatch, DeviantartExtractor): """Extractor for an artist's user profile""" - subcategory = "user" pattern = BASE_PATTERN + r"/?$" example = "https://www.deviantart.com/USER" - def initialize(self): - pass - - skip = Extractor.skip - def items(self): - base = "{}/{}/".format(self.root, self.user) + base = f"{self.root}/{self.user}/" return self._dispatch_extractors(( (DeviantartAvatarExtractor , base + "avatar"), (DeviantartBackgroundExtractor, base + "banner"), @@ -950,8 +931,8 @@ class DeviantartAvatarExtractor(DeviantartExtractor): fmt, _, ext = fmt.rpartition(".") if fmt: fmt = "-" + fmt - url = "https://a.deviantart.net/avatars{}/{}/{}/{}.{}?{}".format( - fmt, name[0], name[1], name, ext, index) + url = (f"https://a.deviantart.net/avatars{fmt}" + f"/{name[0]}/{name[1]}/{name}.{ext}?{index}") results.append(self._make_deviation(url, user, index, fmt)) return results @@ -995,8 +976,8 @@ class DeviantartFolderExtractor(DeviantartExtractor): def __init__(self, match): DeviantartExtractor.__init__(self, match) self.folder = None - self.folder_id = match.group(3) - self.folder_name = match.group(4) + self.folder_id = match[3] + self.folder_name = match[4] def deviations(self): folders = self.api.gallery_folders(self.user) @@ -1049,7 +1030,7 @@ class DeviantartStashExtractor(DeviantartExtractor): def __init__(self, match): DeviantartExtractor.__init__(self, match) - self.user = None + self.user = "" def deviations(self, stash_id=None, stash_data=None): if stash_id is None: @@ -1067,8 +1048,7 @@ class DeviantartStashExtractor(DeviantartExtractor): page = self._limited_request(url).text if stash_id[0] == "0": - uuid = text.extr(page, '//deviation/', '"') - if uuid: + if uuid := text.extr(page, '//deviation/', '"'): deviation = self.api.deviation(uuid) deviation["_page"] = page deviation["index"] = text.parse_int(text.extr( @@ -1091,8 +1071,7 @@ class DeviantartStashExtractor(DeviantartExtractor): yield deviation return - stash_data = text.extr(page, ',\\"stash\\":', ',\\"@@') - if stash_data: + if stash_data := text.extr(page, ',\\"stash\\":', ',\\"@@'): stash_data = util.json_loads(self._unescape_json(stash_data)) for sid in text.extract_iter( @@ -1130,8 +1109,8 @@ class DeviantartCollectionExtractor(DeviantartExtractor): def __init__(self, match): DeviantartExtractor.__init__(self, match) self.collection = None - self.collection_id = match.group(3) - self.collection_name = match.group(4) + self.collection_id = match[3] + self.collection_name = match[4] def deviations(self): folders = self.api.collections_folders(self.user) @@ -1173,15 +1152,15 @@ class DeviantartStatusExtractor(DeviantartExtractor): def deviations(self): for status in self.api.user_statuses(self.user, self.offset): - yield from self.status(status) + yield from self.process_status(status) - def status(self, status): + def process_status(self, status): for item in status.get("items") or (): # do not trust is_share # shared deviations/statuses if "deviation" in item: 
yield item["deviation"].copy() if "status" in item: - yield from self.status(item["status"].copy()) + yield from self.process_status(item["status"].copy()) # assume is_deleted == true means necessary fields are missing if status["is_deleted"]: self.log.warning( @@ -1233,7 +1212,8 @@ class DeviantartTagExtractor(DeviantartExtractor): def __init__(self, match): DeviantartExtractor.__init__(self, match) - self.tag = text.unquote(match.group(1)) + self.tag = text.unquote(match[1]) + self.user = "" def deviations(self): return self.api.browse_tags(self.tag, self.offset) @@ -1282,16 +1262,16 @@ class DeviantartDeviationExtractor(DeviantartExtractor): def __init__(self, match): DeviantartExtractor.__init__(self, match) - self.type = match.group(3) + self.type = match[3] self.deviation_id = \ - match.group(4) or match.group(5) or id_from_base36(match.group(6)) + match[4] or match[5] or id_from_base36(match[6]) def deviations(self): if self.user: - url = "{}/{}/{}/{}".format( - self.root, self.user, self.type or "art", self.deviation_id) + url = (f"{self.root}/{self.user}" + f"/{self.type or 'art'}/{self.deviation_id}") else: - url = "{}/view/{}/".format(self.root, self.deviation_id) + url = f"{self.root}/view/{self.deviation_id}/" page = self._limited_request(url, notfound="deviation").text uuid = text.extr(page, '"deviationUuid\\":\\"', '\\') @@ -1379,7 +1359,7 @@ class DeviantartSearchExtractor(DeviantartExtractor): response = self.request(url, params=params) if response.history and "/users/login" in response.url: - raise exception.StopExtraction("HTTP redirect to login page") + raise exception.AbortExtraction("HTTP redirect to login page") page = response.text for dev in DeviantartDeviationExtractor.pattern.findall( @@ -1405,7 +1385,7 @@ class DeviantartGallerySearchExtractor(DeviantartExtractor): def __init__(self, match): DeviantartExtractor.__init__(self, match) - self.query = match.group(3) + self.query = match[3] def deviations(self): self.login() @@ -1437,7 +1417,7 @@ class DeviantartFollowingExtractor(DeviantartExtractor): api = DeviantartOAuthAPI(self) for user in api.user_friends(self.user): - url = "{}/{}".format(self.root, user["user"]["username"]) + url = f"{self.root}/{user['user']['username']}" user["_extractor"] = DeviantartUserExtractor yield Message.Queue, url, user @@ -1470,8 +1450,7 @@ class DeviantartOAuthAPI(): self.folders = extractor.config("folders", False) self.public = extractor.config("public", True) - client_id = extractor.config("client-id") - if client_id: + if client_id := extractor.config("client-id"): self.client_id = str(client_id) self.client_secret = extractor.config("client-secret") else: @@ -1585,7 +1564,7 @@ class DeviantartOAuthAPI(): def comments(self, target_id, target_type="deviation", comment_id=None, offset=0): """Fetch comments posted on a target""" - endpoint = "/comments/{}/{}".format(target_type, target_id) + endpoint = f"/comments/{target_type}/{target_id}" params = { "commentid" : comment_id, "maxdepth" : "5", @@ -1639,7 +1618,7 @@ class DeviantartOAuthAPI(): def deviation_metadata(self, deviations): """ Fetch deviation metadata for a set of deviations""" endpoint = "/deviation/metadata?" 
+ "&".join( - "deviationids[{}]={}".format(num, deviation["deviationid"]) + f"deviationids[{num}]={deviation['deviationid']}" for num, deviation in enumerate(deviations) ) return self._call( @@ -1746,8 +1725,8 @@ class DeviantartOAuthAPI(): if response.status_code != 200: self.log.debug("Server response: %s", data) - raise exception.AuthenticationError('"{}" ({})'.format( - data.get("error_description"), data.get("error"))) + raise exception.AuthenticationError( + f"\"{data.get('error_description')}\" ({data.get('error')})") if refresh_token_key: _refresh_token_cache.update( refresh_token_key, data["refresh_token"]) @@ -1790,8 +1769,7 @@ class DeviantartOAuthAPI(): raise exception.AuthorizationError() self.log.debug(response.text) - msg = "API responded with {} {}".format( - status, response.reason) + msg = f"API responded with {status} {response.reason}" if status == 429: if self.delay < 30: self.delay += 1 @@ -1889,12 +1867,9 @@ class DeviantartOAuthAPI(): params["offset"] = int(params["offset"]) + len(results) def _pagination_list(self, endpoint, params, key="results"): - result = [] - result.extend(self._pagination(endpoint, params, False, key=key)) - return result + return list(self._pagination(endpoint, params, False, key=key)) - @staticmethod - def _shared_content(results): + def _shared_content(self, results): """Return an iterable of shared deviations in 'results'""" for result in results: for item in result.get("items") or (): @@ -2075,7 +2050,7 @@ class DeviantartEclipseAPI(): params["offset"] = int(params["offset"]) + len(results) def _ids_watching(self, user): - url = "{}/{}/about".format(self.extractor.root, user) + url = f"{self.extractor.root}/{user}/about" page = self.request(url).text gruser_id = text.extr(page, ' data-userid="', '"') @@ -2083,8 +2058,7 @@ class DeviantartEclipseAPI(): pos = page.find('\\"name\\":\\"watching\\"') if pos < 0: raise exception.NotFoundError("'watching' module ID") - module_id = text.rextract( - page, '\\"id\\":', ',', pos)[0].strip('" ') + module_id = text.rextr(page, '\\"id\\":', ',', pos).strip('" ') self._fetch_csrf_token(page) return gruser_id, module_id diff --git a/gallery_dl/extractor/directlink.py b/gallery_dl/extractor/directlink.py index 4559aff..85358ba 100644 --- a/gallery_dl/extractor/directlink.py +++ b/gallery_dl/extractor/directlink.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2023 Mike Fährmann +# Copyright 2017-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -24,9 +24,9 @@ class DirectlinkExtractor(Extractor): example = "https://en.wikipedia.org/static/images/project-logos/enwiki.png" def __init__(self, match): - Extractor.__init__(self, match) self.data = data = match.groupdict() self.subcategory = ".".join(data["domain"].rsplit(".", 2)[-2:]) + Extractor.__init__(self, match) def items(self): data = self.data diff --git a/gallery_dl/extractor/discord.py b/gallery_dl/extractor/discord.py index ac21fec..216e486 100644 --- a/gallery_dl/extractor/discord.py +++ b/gallery_dl/extractor/discord.py @@ -22,8 +22,6 @@ class DiscordExtractor(Extractor): filename_fmt = "{message_id}_{num:>02}_{filename}.{extension}" archive_fmt = "{message_id}_{num}" - cdn_fmt = "https://cdn.discordapp.com/{}/{}/{}.png?size=4096" - server_metadata = {} server_channels_metadata = {} @@ -86,44 +84,50 @@ class DiscordExtractor(Extractor): ): if message["author"].get(icon_type): message_metadata["author_files"].append({ - 
"url": self.cdn_fmt.format( - icon_path, - message_metadata["author_id"], - message["author"][icon_type] - ), + "url": (f"https://cdn.discordapp.com/{icon_path}/" + f"{message_metadata['author_id']}/" + f"{message['author'][icon_type]}.png" + f"?size=4096"), "filename": icon_type, "extension": "png", }) - for attachment in message["attachments"]: - message_metadata["files"].append({ - "url": attachment["url"], - "type": "attachment", - }) + message_snapshots = [message] + message_snapshots.extend( + msg["message"] for msg in message.get("message_snapshots", []) + if msg["message"]["type"] in (0, 19, 21) + ) + + for snapshot in message_snapshots: + for attachment in snapshot["attachments"]: + message_metadata["files"].append({ + "url": attachment["url"], + "type": "attachment", + }) - for embed in message["embeds"]: - if embed["type"] in self.enabled_embeds: - for field in ("video", "image", "thumbnail"): - if field not in embed: - continue - url = embed[field].get("proxy_url") - if url is not None: - message_metadata["files"].append({ - "url": url, - "type": "embed", - }) - break - - for num, file in enumerate(message_metadata["files"], start=1): - text.nameext_from_url(file["url"], file) - file["num"] = num - - yield Message.Directory, message_metadata - - for file in message_metadata["files"]: - message_metadata_file = message_metadata.copy() - message_metadata_file.update(file) - yield Message.Url, file["url"], message_metadata_file + for embed in snapshot["embeds"]: + if embed["type"] in self.enabled_embeds: + for field in ("video", "image", "thumbnail"): + if field not in embed: + continue + url = embed[field].get("proxy_url") + if url is not None: + message_metadata["files"].append({ + "url": url, + "type": "embed", + }) + break + + for num, file in enumerate(message_metadata["files"], start=1): + text.nameext_from_url(file["url"], file) + file["num"] = num + + yield Message.Directory, message_metadata + + for file in message_metadata["files"]: + message_metadata_file = message_metadata.copy() + message_metadata_file.update(file) + yield Message.Url, file["url"], message_metadata_file def extract_channel_text(self, channel_id): for message in self.api.get_channel_messages(channel_id): @@ -158,7 +162,7 @@ class DiscordExtractor(Extractor): yield from self.extract_channel( channel["channel_id"], safe=True) elif not safe: - raise exception.StopExtraction( + raise exception.AbortExtraction( "This channel type is not supported." ) except exception.HttpError as exc: @@ -215,11 +219,9 @@ class DiscordExtractor(Extractor): ): if server.get(icon_type): self.server_metadata["server_files"].append({ - "url": self.cdn_fmt.format( - icon_path, - self.server_metadata["server_id"], - server[icon_type] - ), + "url": (f"https://cdn.discordapp.com/{icon_path}/" + f"{self.server_metadata['server_id']}/" + f"{server[icon_type]}.png?size=4096"), "filename": icon_type, "extension": "png", }) @@ -342,7 +344,7 @@ class DiscordAPI(): "sort_order": "desc", "limit": THREADS_BATCH, "offset": + offset, - })["threads"] + }).get("threads", []) return self._pagination(_method, THREADS_BATCH) @@ -391,8 +393,7 @@ class DiscordAPI(): return offset += len(data) - @staticmethod - def _raise_invalid_token(): + def _raise_invalid_token(self): raise exception.AuthenticationError("""Invalid or missing token. 
Please provide a valid token following these instructions: diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py index 583869f..3e0424d 100644 --- a/gallery_dl/extractor/dynastyscans.py +++ b/gallery_dl/extractor/dynastyscans.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2023 Mike Fährmann +# Copyright 2015-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -10,7 +10,6 @@ from .common import ChapterExtractor, MangaExtractor, Extractor, Message from .. import text, util -import re BASE_PATTERN = r"(?:https?://)?(?:www\.)?dynasty-scans\.com" @@ -21,7 +20,7 @@ class DynastyscansBase(): root = "https://dynasty-scans.com" def _parse_image_page(self, image_id): - url = "{}/images/{}".format(self.root, image_id) + url = f"{self.root}/images/{image_id}" extr = text.extract_from(self.request(url).text) date = extr("class='create_at'>", "") @@ -47,20 +46,19 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor): def metadata(self, page): extr = text.extract_from(page) - match = re.match( - (r"(?:]*>)?([^<]+)(?:)?" # manga name - r"(?: ch(\d+)([^:<]*))?" # chapter info - r"(?:: (.+))?"), # title - extr("
<h3 id='chapter-title'><b>", "</b>"),
-        )
+        match = util.re(
+            r"(?:<a [^>]*>)?([^<]+)(?:</a>)?"  # manga name
+            r"(?: ch(\d+)([^:<]*))?"  # chapter info
+            r"(?:: (.+))?"  # title
+        ).match(extr("<h3 id='chapter-title'><b>", "</b>"))
         author = extr(" by ", "</a>")
         group = extr('"icon-print"></i> ', '</span>')
         return {
-            "manga" : text.unescape(match.group(1)),
-            "chapter" : text.parse_int(match.group(2)),
-            "chapter_minor": match.group(3) or "",
-            "title" : text.unescape(match.group(4) or ""),
+            "manga" : text.unescape(match[1]),
+            "chapter" : text.parse_int(match[2]),
+            "chapter_minor": match[3] or "",
+            "title" : text.unescape(match[4] or ""),
             "author" : text.remove_html(author),
             "group" : (text.remove_html(group) or
                        text.extr(group, ' alt="', '"')),
@@ -104,7 +102,7 @@ class DynastyscansSearchExtractor(DynastyscansBase, Extractor):
     def __init__(self, match):
         Extractor.__init__(self, match)
-        self.query = match.group(1) or ""
+        self.query = match[1] or ""
     def items(self):
         yield Message.Directory, {}
@@ -133,3 +131,43 @@ class DynastyscansImageExtractor(DynastyscansSearchExtractor):
     def images(self):
         return (self.query,)
+
+
+class DynastyscansAnthologyExtractor(DynastyscansSearchExtractor):
+    """Extractor for dynasty-scans anthologies"""
+    subcategory = "anthology"
+    pattern = BASE_PATTERN + r"/anthologies/([^/?#]+)"
+    example = "https://dynasty-scans.com/anthologies/TITLE"
+
+    def items(self):
+        url = f"{self.root}/anthologies/{self.groups[0]}.atom"
+        root = self.request_xml(url, xmlns=False)
+
+        data = {
+            "_extractor": DynastyscansChapterExtractor,
+            "anthology" : root[3].text[28:],
+        }
+
+        if self.config("metadata", False):
+            page = self.request(url[:-5]).text
+            alert = text.extr(page, "
", "
") + + for element in root: + if element.tag != "entry": + continue + content = element[6][0] + data["author"] = content[0].text[8:] + data["scanlator"] = content[1].text[11:] + data["tags"] = content[2].text[6:].lower().split(", ") + data["title"] = element[5].text + data["date"] = text.parse_datetime( + element[1].text, "%Y-%m-%dT%H:%M:%S%z") + data["date_updated"] = text.parse_datetime( + element[2].text, "%Y-%m-%dT%H:%M:%S%z") + yield Message.Queue, element[4].text, data diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py index 76ea792..71c3b30 100644 --- a/gallery_dl/extractor/e621.py +++ b/gallery_dl/extractor/e621.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2023 Mike Fährmann +# Copyright 2014-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -24,8 +24,7 @@ class E621Extractor(danbooru.DanbooruExtractor): request_interval_min = 1.0 def items(self): - includes = self.config("metadata") or () - if includes: + if includes := self.config("metadata") or (): if isinstance(includes, str): includes = includes.split(",") elif not isinstance(includes, (list, tuple)): @@ -40,8 +39,8 @@ class E621Extractor(danbooru.DanbooruExtractor): if not file["url"]: md5 = file["md5"] - file["url"] = "https://static1.{}/data/{}/{}/{}.{}".format( - self.root[8:], md5[0:2], md5[2:4], md5, file["ext"]) + file["url"] = (f"https://static1.{self.root[8:]}/data" + f"/{md5[0:2]}/{md5[2:4]}/{md5}.{file['ext']}") if notes and post.get("has_notes"): post["notes"] = self._get_notes(post["id"]) @@ -60,13 +59,13 @@ class E621Extractor(danbooru.DanbooruExtractor): yield Message.Url, file["url"], post def _get_notes(self, id): - return self.request( - "{}/notes.json?search[post_id]={}".format(self.root, id)).json() + return self.request_json( + f"{self.root}/notes.json?search[post_id]={id}") @memcache(keyarg=1) def _get_pools(self, ids): - pools = self.request( - "{}/pools.json?search[id]={}".format(self.root, ids)).json() + pools = self.request_json( + f"{self.root}/pools.json?search[id]={ids}") for pool in pools: pool["name"] = pool["name"].replace("_", " ") return pools @@ -75,7 +74,7 @@ class E621Extractor(danbooru.DanbooruExtractor): BASE_PATTERN = E621Extractor.update({ "e621": { "root": "https://e621.net", - "pattern": r"e621\.net", + "pattern": r"e621\.(?:net|cc)", }, "e926": { "root": "https://e926.net", @@ -109,12 +108,11 @@ class E621PoolExtractor(E621Extractor, danbooru.DanbooruPoolExtractor): } posts = [] - append = posts.append for num, pid in enumerate(self.post_ids, 1): if pid in id_to_post: post = id_to_post[pid] post["num"] = num - append(post) + posts.append(post) else: self.log.warning("Post %s is unavailable", pid) return posts @@ -126,8 +124,8 @@ class E621PostExtractor(E621Extractor, danbooru.DanbooruPostExtractor): example = "https://e621.net/posts/12345" def posts(self): - url = "{}/posts/{}.json".format(self.root, self.groups[-1]) - return (self.request(url).json()["post"],) + url = f"{self.root}/posts/{self.groups[-1]}.json" + return (self.request_json(url)["post"],) class E621PopularExtractor(E621Extractor, danbooru.DanbooruPopularExtractor): diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py index 7582528..7beeac5 100644 --- a/gallery_dl/extractor/erome.py +++ b/gallery_dl/extractor/erome.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021-2023 Mike Fährmann +# Copyright 2021-2025 Mike Fährmann # # This program is 
free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -22,62 +22,20 @@ class EromeExtractor(Extractor): filename_fmt = "{album_id} {title} {num:>02}.{extension}" archive_fmt = "{album_id}_{num}" root = "https://www.erome.com" + _cookies = True def items(self): - self.__cookies = True + base = f"{self.root}/a/" + data = {"_extractor": EromeAlbumExtractor} for album_id in self.albums(): - url = "{}/a/{}".format(self.root, album_id) - - try: - page = self.request(url).text - except exception.HttpError as exc: - self.log.warning( - "Unable to fetch album '%s' (%s)", album_id, exc) - continue - - title, pos = text.extract( - page, 'property="og:title" content="', '"') - pos = page.index('
', pos) - - urls = [] - date = None - groups = page.split('
1: - date = text.parse_timestamp(ts) - - data = { - "album_id": album_id, - "title" : text.unescape(title), - "user" : text.unquote(user), - "count" : len(urls), - "date" : date, - "tags" : ([t.replace("+", " ") - for t in text.extract_iter(tags, "?q=", '"')] - if tags else ()), - "_http_headers": {"Referer": url}, - } - - yield Message.Directory, data - for data["num"], url in enumerate(urls, 1): - yield Message.Url, url, text.nameext_from_url(url, data) + yield Message.Queue, f"{base}{album_id}", data def albums(self): return () def request(self, url, **kwargs): - if self.__cookies: - self.__cookies = False + if self._cookies: + self._cookies = False self.cookies.update(_cookie_cache()) for _ in range(5): @@ -106,8 +64,52 @@ class EromeAlbumExtractor(EromeExtractor): pattern = BASE_PATTERN + r"/a/(\w+)" example = "https://www.erome.com/a/ID" - def albums(self): - return (self.groups[0],) + def items(self): + album_id = self.groups[0] + url = f"{self.root}/a/{album_id}" + + try: + page = self.request(url).text + except exception.HttpError as exc: + raise exception.AbortExtraction( + f"{album_id}: Unable to fetch album page ({exc})") + + title, pos = text.extract( + page, 'property="og:title" content="', '"') + pos = page.index('
', pos) + + urls = [] + date = None + groups = page.split('
1: + date = text.parse_timestamp(ts) + + data = { + "album_id": album_id, + "title" : text.unescape(title), + "user" : text.unquote(user), + "count" : len(urls), + "date" : date, + "tags" : ([t.replace("+", " ") + for t in text.extract_iter(tags, "?q=", '"')] + if tags else ()), + "_http_headers": {"Referer": url}, + } + + yield Message.Directory, data + for data["num"], url in enumerate(urls, 1): + yield Message.Url, url, text.nameext_from_url(url, data) class EromeUserExtractor(EromeExtractor): @@ -116,7 +118,7 @@ class EromeUserExtractor(EromeExtractor): example = "https://www.erome.com/USER" def albums(self): - url = "{}/{}".format(self.root, self.groups[0]) + url = f"{self.root}/{self.groups[0]}" return self._pagination(url, {}) diff --git a/gallery_dl/extractor/everia.py b/gallery_dl/extractor/everia.py index 3bf0a74..787786e 100644 --- a/gallery_dl/extractor/everia.py +++ b/gallery_dl/extractor/everia.py @@ -7,8 +7,7 @@ """Extractors for https://everia.club""" from .common import Extractor, Message -from .. import text -import re +from .. import text, util BASE_PATTERN = r"(?:https?://)?everia\.club" @@ -26,13 +25,13 @@ class EveriaExtractor(Extractor): return self._pagination(self.groups[0]) def _pagination(self, path, params=None, pnum=1): - find_posts = re.compile(r'thumbnail">\s*\s*= 300: @@ -50,16 +49,16 @@ class EveriaPostExtractor(EveriaExtractor): example = "https://everia.club/0000/00/00/TITLE" def items(self): - url = self.root + self.groups[0] + url = self.root + self.groups[0] + "/" page = self.request(url).text content = text.extr(page, 'itemprop="text">', "', "', "")), - "post_url": url, + "post_url": text.unquote(url), "post_category": text.extr( page, "post-in-category-", " ").capitalize(), "count": len(urls), @@ -67,6 +66,7 @@ class EveriaPostExtractor(EveriaExtractor): yield Message.Directory, data for data["num"], url in enumerate(urls, 1): + url = text.unquote(url) yield Message.Url, url, text.nameext_from_url(url, data) diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index e7ba78e..f147959 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2023 Mike Fährmann +# Copyright 2014-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -34,7 +34,7 @@ class ExhentaiExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.version = match.group(1) + self.version = match[1] def initialize(self): domain = self.config("domain", "auto") @@ -59,7 +59,7 @@ class ExhentaiExtractor(Extractor): def login(self): """Login and set necessary cookies""" if self.LIMIT: - raise exception.StopExtraction("Image limit reached!") + raise exception.AbortExtraction("Image limit reached!") if self.cookies_check(self.cookies_names): return @@ -122,10 +122,10 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): def __init__(self, match): ExhentaiExtractor.__init__(self, match) - self.gallery_id = text.parse_int(match.group(2) or match.group(5)) - self.gallery_token = match.group(3) - self.image_token = match.group(4) - self.image_num = text.parse_int(match.group(6), 1) + self.gallery_id = text.parse_int(match[2] or match[5]) + self.gallery_token = match[3] + self.image_token = match[4] + self.image_num = text.parse_int(match[6], 1) self.key_start = None self.key_show = None self.key_next = None @@ -136,11 +136,13 @@ class 
ExhentaiGalleryExtractor(ExhentaiExtractor): source = self.config("source") if source == "hitomi": self.items = self._items_hitomi + elif source == "metadata": + self.items = self._items_metadata limits = self.config("limits", False) if limits and limits.__class__ is int: self.limits = limits - self._remaining = 0 + self._limits_remaining = 0 else: self.limits = False @@ -176,7 +178,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): self.image_token = text.extr(gpage, 'hentai.org/s/', '"') if not self.image_token: self.log.debug("Page content:\n%s", gpage) - raise exception.StopExtraction( + raise exception.AbortExtraction( "Failed to extract initial image token") ipage = self._image_page() else: @@ -184,7 +186,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): part = text.extr(ipage, 'hentai.org/g/', '"') if not part: self.log.debug("Page content:\n%s", ipage) - raise exception.StopExtraction( + raise exception.AbortExtraction( "Failed to extract gallery token") self.gallery_token = part.split("/")[1] gpage = self._gallery_page() @@ -198,11 +200,12 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): for url, image in images: data.update(image) if self.limits: - self._check_limits(data) + self._limits_check(data) if "/fullimg" in url: data["_http_validate"] = self._validate_response else: data["_http_validate"] = None + data["_http_signature"] = self._validate_signature yield Message.Url, url, data fav = self.config("fav") @@ -218,10 +221,13 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): data = {} from .hitomi import HitomiGalleryExtractor - url = "https://hitomi.la/galleries/{}.html".format(self.gallery_id) + url = f"https://hitomi.la/galleries/{self.gallery_id}.html" data["_extractor"] = HitomiGalleryExtractor yield Message.Queue, url, data + def _items_metadata(self): + yield Message.Directory, self.metadata_from_api() + def get_metadata(self, page): """Extract gallery metadata""" data = self.metadata_from_page(page) @@ -240,8 +246,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): def metadata_from_page(self, page): extr = text.extract_from(page) - api_url = extr('var api_url = "', '"') - if api_url: + if api_url := extr('var api_url = "', '"'): self.api_url = api_url data = { @@ -293,9 +298,9 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): "namespace": 1, } - data = self.request(self.api_url, method="POST", json=data).json() + data = self.request_json(self.api_url, method="POST", json=data) if "error" in data: - raise exception.StopExtraction(data["error"]) + raise exception.AbortExtraction(data["error"]) return data["gmetadata"][0] @@ -320,8 +325,8 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): data["_fallback"] = self._fallback_1280(nl, self.image_num) except IndexError: self.log.debug("Page content:\n%s", page) - raise exception.StopExtraction( - "Unable to parse image info for '%s'", url) + raise exception.AbortExtraction( + f"Unable to parse image info for '{url}'") data["num"] = self.image_num data["image_token"] = self.key_start = extr('var startkey="', '";') @@ -345,7 +350,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): } for request["page"] in range(self.image_num + 1, self.count + 1): - page = self.request(api_url, method="POST", json=request).json() + page = self.request_json(api_url, method="POST", json=request) i3 = page["i3"] i6 = page["i6"] @@ -371,8 +376,8 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): nl, request["page"], imgkey) except IndexError: self.log.debug("Page content:\n%s", page) - raise 
exception.StopExtraction( - "Unable to parse image info for '%s'", url) + raise exception.AbortExtraction( + f"Unable to parse image info for '{url}'") data["num"] = request["page"] data["image_token"] = imgkey @@ -385,66 +390,106 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): request["imgkey"] = nextkey def _validate_response(self, response): - if not response.history and response.headers.get( + if response.history or not response.headers.get( "content-type", "").startswith("text/html"): - page = response.text - self.log.warning("'%s'", page) - - if " requires GP" in page: - gp = self.config("gp") - if gp == "stop": - raise exception.StopExtraction("Not enough GP") - elif gp == "wait": - input("Press ENTER to continue.") - return response.url - - self.log.info("Falling back to non-original downloads") - self.original = False - return self.data["_url_1280"] - - if " temporarily banned " in page: - raise exception.AuthorizationError("Temporarily Banned") - - self._report_limits() - return True - - def _report_limits(self): - ExhentaiExtractor.LIMIT = True - raise exception.StopExtraction("Image limit reached!") - - def _check_limits(self, data): - if not self._remaining or data["num"] % 25 == 0: - self._update_limits() - self._remaining -= data["cost"] - if self._remaining <= 0: - self._report_limits() - - def _check_509(self, url): - # full 509.gif URLs - # - https://exhentai.org/img/509.gif - # - https://ehgt.org/g/509.gif - if url.endswith(("hentai.org/img/509.gif", - "ehgt.org/g/509.gif")): - self.log.debug(url) - self._report_limits() + return True - def _update_limits(self): + page = response.text + self.log.warning("'%s'", page) + + if " requires GP" in page: + gp = self.config("gp") + if gp == "stop": + raise exception.AbortExtraction("Not enough GP") + elif gp == "wait": + self.input("Press ENTER to continue.") + return response.url + + self.log.info("Falling back to non-original downloads") + self.original = False + return self.data["_url_1280"] + + if " temporarily banned " in page: + raise exception.AuthorizationError("Temporarily Banned") + + self._limits_exceeded() + return response.url + + def _validate_signature(self, signature): + """Return False if all file signature bytes are zero""" + if signature: + if byte := signature[0]: + # 60 == b"<" + if byte == 60 and b"", "").replace(",", "") self.log.debug("Image Limits: %s/%s", current, self.limits) - self._remaining = self.limits - text.parse_int(current) + self._limits_remaining = self.limits - text.parse_int(current) + + return page + + def _check_509(self, url): + # full 509.gif URLs + # - https://exhentai.org/img/509.gif + # - https://ehgt.org/g/509.gif + if url.endswith(("hentai.org/img/509.gif", + "ehgt.org/g/509.gif")): + self.log.debug(url) + self._limits_exceeded() + + def _limits_exceeded(self): + msg = "Image limit exceeded!" 
+ action = self.config("limits-action") + + if not action or action == "stop": + ExhentaiExtractor.LIMIT = True + raise exception.AbortExtraction(msg) + + self.log.warning(msg) + if action == "wait": + self.input("Press ENTER to continue.") + self._limits_update() + elif action == "reset": + self._limits_reset() + else: + self.log.error("Invalid 'limits-action' value '%s'", action) + + def _limits_check(self, data): + if not self._limits_remaining or data["num"] % 25 == 0: + self._limits_update() + self._limits_remaining -= data["cost"] + if self._limits_remaining <= 0: + self._limits_exceeded() + + def _limits_reset(self): + self.log.info("Resetting image limits") + self._request_home( + method="POST", + headers={"Content-Type": "application/x-www-form-urlencoded"}, + data=b"reset_imagelimit=Reset+Quota") + + _limits_update = _request_home def _gallery_page(self): - url = "{}/g/{}/{}/".format( - self.root, self.gallery_id, self.gallery_token) + url = f"{self.root}/g/{self.gallery_id}/{self.gallery_token}/" response = self.request(url, fatal=False) page = response.text @@ -457,8 +502,8 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): return page def _image_page(self): - url = "{}/s/{}/{}-{}".format( - self.root, self.image_token, self.gallery_id, self.image_num) + url = (f"{self.root}/s/{self.image_token}" + f"/{self.gallery_id}-{self.image_num}") page = self.request(url, fatal=False).text if page.startswith(("Invalid page", "Keep trying")): @@ -466,7 +511,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): return page def _fallback_original(self, nl, fullimg): - url = "{}?nl={}".format(fullimg, nl) + url = f"{fullimg}?nl={nl}" for _ in util.repeat(self.fallback_retries): yield url @@ -475,8 +520,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): token = self.key_start for _ in util.repeat(self.fallback_retries): - url = "{}/s/{}/{}-{}?nl={}".format( - self.root, token, self.gallery_id, num, nl) + url = f"{self.root}/s/{token}/{self.gallery_id}-{num}?nl={nl}" page = self.request(url, fatal=False).text if page.startswith(("Invalid page", "Keep trying")): @@ -486,8 +530,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): nl = data["_nl"] - @staticmethod - def _parse_image_info(url): + def _parse_image_info(self, url): for part in url.split("/")[4:]: try: _, size, width, height, _ = part.split("-") @@ -504,8 +547,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): "height": text.parse_int(height), } - @staticmethod - def _parse_original_info(info): + def _parse_original_info(self, info): parts = info.lstrip().split(" ") size = text.parse_bytes(parts[3] + parts[4][0]) @@ -527,11 +569,11 @@ class ExhentaiSearchExtractor(ExhentaiExtractor): def __init__(self, match): ExhentaiExtractor.__init__(self, match) - _, query, tag = match.groups() + _, query, tag = self.groups if tag: if "+" in tag: ns, _, tag = tag.rpartition(":") - tag = '{}:"{}$"'.format(ns, tag.replace("+", " ")) + tag = f"{ns}:\"{tag.replace('+', ' ')}$\"" else: tag += "$" self.params = {"f_search": tag, "page": 0} @@ -553,13 +595,13 @@ class ExhentaiSearchExtractor(ExhentaiExtractor): last = None page = self.request(search_url, params=params).text - for gallery in ExhentaiGalleryExtractor.pattern.finditer(page): - url = gallery.group(0) + for match in ExhentaiGalleryExtractor.pattern.finditer(page): + url = match[0] if url == last: continue last = url - data["gallery_id"] = text.parse_int(gallery.group(2)) - data["gallery_token"] = gallery.group(3) + data["gallery_id"] = text.parse_int(match[2]) + 
data["gallery_token"] = match[3] yield Message.Queue, url + "/", data next_url = text.extr(page, 'nexturl="', '"', None) diff --git a/gallery_dl/extractor/facebook.py b/gallery_dl/extractor/facebook.py index b284ee8..069ed99 100644 --- a/gallery_dl/extractor/facebook.py +++ b/gallery_dl/extractor/facebook.py @@ -6,10 +6,14 @@ """Extractors for https://www.facebook.com/""" -from .common import Extractor, Message +from .common import Extractor, Message, Dispatch from .. import text, exception +from ..cache import memcache BASE_PATTERN = r"(?:https?://)?(?:[\w-]+\.)?facebook\.com" +USER_PATTERN = (BASE_PATTERN + + r"/(?!media/|photo/|photo.php|watch/)" + r"(?:profile\.php\?id=|people/[^/?#]+/)?([^/?&#]+)") class FacebookExtractor(Extractor): @@ -20,9 +24,6 @@ class FacebookExtractor(Extractor): filename_fmt = "{id}.{extension}" archive_fmt = "{id}.{extension}" - set_url_fmt = root + "/media/set/?set={set_id}" - photo_url_fmt = root + "/photo/?fbid={photo_id}&set={set_id}" - def _init(self): headers = self.session.headers headers["Accept"] = ( @@ -37,22 +38,20 @@ class FacebookExtractor(Extractor): self.videos = self.config("videos", True) self.author_followups = self.config("author-followups", False) - @staticmethod - def decode_all(txt): + def decode_all(self, txt): return text.unescape( txt.encode().decode("unicode_escape") .encode("utf_16", "surrogatepass").decode("utf_16") ).replace("\\/", "/") - @staticmethod - def parse_set_page(set_page): + def parse_set_page(self, set_page): directory = { "set_id": text.extr( set_page, '"mediaSetToken":"', '"' ) or text.extr( set_page, '"mediasetToken":"', '"' ), - "username": FacebookExtractor.decode_all( + "username": self.decode_all( text.extr( set_page, '"user":{"__isProfile":"User","name":"', '","' ) or text.extr( @@ -62,7 +61,7 @@ class FacebookExtractor(Extractor): "user_id": text.extr( set_page, '"owner":{"__typename":"User","id":"', '"' ), - "title": FacebookExtractor.decode_all(text.extr( + "title": self.decode_all(text.extr( set_page, '"title":{"text":"', '"' )), "first_photo_id": text.extr( @@ -77,8 +76,7 @@ class FacebookExtractor(Extractor): return directory - @staticmethod - def parse_photo_page(photo_page): + def parse_photo_page(self, photo_page): photo = { "id": text.extr( photo_page, '"__isNode":"Photo","id":"', '"' @@ -88,13 +86,13 @@ class FacebookExtractor(Extractor): '"url":"https:\\/\\/www.facebook.com\\/photo\\/?fbid=', '"' ).rsplit("&set=", 1)[-1], - "username": FacebookExtractor.decode_all(text.extr( + "username": self.decode_all(text.extr( photo_page, '"owner":{"__typename":"User","name":"', '"' )), "user_id": text.extr( photo_page, '"owner":{"__typename":"User","id":"', '"' ), - "caption": FacebookExtractor.decode_all(text.extr( + "caption": self.decode_all(text.extr( photo_page, '"message":{"delight_ranges"', '"},"message_preferred_body"' @@ -103,7 +101,7 @@ class FacebookExtractor(Extractor): text.extr(photo_page, '\\"publish_time\\":', ',') or text.extr(photo_page, '"created_time":', ',') ), - "url": FacebookExtractor.decode_all(text.extr( + "url": self.decode_all(text.extr( photo_page, ',"image":{"uri":"', '","' )), "next_photo_id": text.extr( @@ -133,8 +131,7 @@ class FacebookExtractor(Extractor): return photo - @staticmethod - def parse_post_page(post_page): + def parse_post_page(self, post_page): first_photo_url = text.extr( text.extr( post_page, '"__isMedia":"Photo"', '"target_group"' @@ -148,13 +145,12 @@ class FacebookExtractor(Extractor): return post - @staticmethod - def parse_video_page(video_page): + def 
parse_video_page(self, video_page): video = { "id": text.extr( video_page, '\\"video_id\\":\\"', '\\"' ), - "username": FacebookExtractor.decode_all(text.extr( + "username": self.decode_all(text.extr( video_page, '"actors":[{"__typename":"User","name":"', '","' )), "user_id": text.extr( @@ -167,7 +163,7 @@ class FacebookExtractor(Extractor): } if not video["username"]: - video["username"] = FacebookExtractor.decode_all(text.extr( + video["username"] = self.decode_all(text.extr( video_page, '"__typename":"User","id":"' + video["user_id"] + '","name":"', '","' @@ -179,7 +175,7 @@ class FacebookExtractor(Extractor): audio = { **video, - "url": FacebookExtractor.decode_all(text.extr( + "url": self.decode_all(text.extr( text.extr( first_video_raw, "AudioChannelConfiguration", @@ -196,7 +192,7 @@ class FacebookExtractor(Extractor): first_video_raw, 'FBQualityLabel=\\"', '\\u003C\\/BaseURL>' ): resolution = raw_url.split('\\"', 1)[0] - video["urls"][resolution] = FacebookExtractor.decode_all( + video["urls"][resolution] = self.decode_all( raw_url.split('BaseURL>', 1)[1] ) @@ -224,17 +220,16 @@ class FacebookExtractor(Extractor): res = self.request(url, **kwargs) if res.url.startswith(self.root + "/login"): - raise exception.AuthenticationError( - "You must be logged in to continue viewing images." + - LEFT_OFF_TXT + raise exception.AuthRequired( + message=(f"You must be logged in to continue viewing images." + f"{LEFT_OFF_TXT}") ) if b'{"__dr":"CometErrorRoot.react"}' in res.content: - raise exception.StopExtraction( - "You've been temporarily blocked from viewing images. " - "\nPlease try using a different account, " - "using a VPN or waiting before you retry." + - LEFT_OFF_TXT + raise exception.AbortExtraction( + f"You've been temporarily blocked from viewing images.\n" + f"Please try using a different account, " + f"using a VPN or waiting before you retry.{LEFT_OFF_TXT}" ) return res @@ -248,9 +243,7 @@ class FacebookExtractor(Extractor): while i < len(all_photo_ids): photo_id = all_photo_ids[i] - photo_url = self.photo_url_fmt.format( - photo_id=photo_id, set_id=set_id - ) + photo_url = f"{self.root}/photo/?fbid={photo_id}&set={set_id}" photo_page = self.photo_page_request_wrapper(photo_url).text photo = self.parse_photo_page(photo_page) @@ -302,6 +295,36 @@ class FacebookExtractor(Extractor): i += 1 + @memcache(keyarg=1) + def _extract_profile_photos_page(self, profile): + profile_photos_url = f"{self.root}/{profile}/photos_by" + + for _ in range(self.fallback_retries + 1): + profile_photos_page = self.request(profile_photos_url).text + if set_id := self._extract_profile_set_id(profile_photos_page): + break + self.log.debug("Got empty profile photos page, retrying...") + else: + raise exception.AbortExtraction("Failed to extract profile data") + + avatar_page_url = text.extr( + profile_photos_page, ',"profilePhoto":{"url":"', '"') + + return set_id, avatar_page_url.replace("\\/", "/") + + def _extract_profile_set_id(self, profile_photos_page): + set_ids_raw = text.extr( + profile_photos_page, '"pageItems"', '"page_info"' + ) + + set_id = text.extr( + set_ids_raw, 'set=', '"' + ).rsplit("&", 1)[0] or text.extr( + set_ids_raw, '\\/photos\\/', '\\/' + ) + + return set_id + class FacebookSetExtractor(FacebookExtractor): """Base class for Facebook Set extractors""" @@ -317,13 +340,12 @@ class FacebookSetExtractor(FacebookExtractor): def items(self): set_id = self.groups[0] or self.groups[3] - path = self.groups[1] - if path: + if path := self.groups[1]: post_url = self.root + "/" + path post_page 
= self.request(post_url).text set_id = self.parse_post_page(post_page)["set_id"] - set_url = self.set_url_fmt.format(set_id=set_id) + set_url = f"{self.root}/media/set/?set={set_id}" set_page = self.request(set_url).text set_data = self.parse_set_page(set_page) if self.groups[2]: @@ -342,16 +364,15 @@ class FacebookPhotoExtractor(FacebookExtractor): def items(self): photo_id = self.groups[0] - photo_url = self.photo_url_fmt.format(photo_id=photo_id, set_id="") + photo_url = f"{self.root}/photo/?fbid={photo_id}&set=" photo_page = self.photo_page_request_wrapper(photo_url).text i = 1 photo = self.parse_photo_page(photo_page) photo["num"] = i - set_page = self.request( - self.set_url_fmt.format(set_id=photo["set_id"]) - ).text + set_url = f"{self.root}/media/set/?set={photo['set_id']}" + set_page = self.request(set_url).text directory = self.parse_set_page(set_page) @@ -362,9 +383,7 @@ class FacebookPhotoExtractor(FacebookExtractor): for comment_photo_id in photo["followups_ids"]: comment_photo = self.parse_photo_page( self.photo_page_request_wrapper( - self.photo_url_fmt.format( - photo_id=comment_photo_id, set_id="" - ) + f"{self.root}/photo/?fbid={comment_photo_id}&set=" ).text ) i += 1 @@ -399,44 +418,50 @@ class FacebookVideoExtractor(FacebookExtractor): yield Message.Url, audio["url"], audio -class FacebookProfileExtractor(FacebookExtractor): - """Base class for Facebook Profile Photos Set extractors""" - subcategory = "profile" - pattern = ( - BASE_PATTERN + - r"/(?!media/|photo/|photo.php|watch/)" - r"(?:profile\.php\?id=|people/[^/?#]+/)?" - r"([^/?&#]+)(?:/photos(?:_by)?|/videos|/posts)?/?(?:$|\?|#)" - ) - example = "https://www.facebook.com/USERNAME" +class FacebookPhotosExtractor(FacebookExtractor): + """Extractor for Facebook Profile Photos""" + subcategory = "photos" + pattern = USER_PATTERN + r"/photos(?:_by)?" 
+ example = "https://www.facebook.com/USERNAME/photos" - @staticmethod - def get_profile_photos_set_id(profile_photos_page): - set_ids_raw = text.extr( - profile_photos_page, '"pageItems"', '"page_info"' - ) + def items(self): + set_id = self._extract_profile_photos_page(self.groups[0])[0] + set_url = f"{self.root}/media/set/?set={set_id}" + set_page = self.request(set_url).text + set_data = self.parse_set_page(set_page) + return self.extract_set(set_data) - set_id = text.extr( - set_ids_raw, 'set=', '"' - ).rsplit("&", 1)[0] or text.extr( - set_ids_raw, '\\/photos\\/', '\\/' - ) - return set_id +class FacebookAvatarExtractor(FacebookExtractor): + """Extractor for Facebook Profile Avatars""" + subcategory = "avatar" + pattern = USER_PATTERN + r"/avatar" + example = "https://www.facebook.com/USERNAME/avatar" def items(self): - profile_photos_url = ( - self.root + "/" + self.groups[0] + "/photos_by" - ) - profile_photos_page = self.request(profile_photos_url).text + avatar_page_url = self._extract_profile_photos_page(self.groups[0])[1] + avatar_page = self.photo_page_request_wrapper(avatar_page_url).text - set_id = self.get_profile_photos_set_id(profile_photos_page) + avatar = self.parse_photo_page(avatar_page) + avatar["count"] = avatar["num"] = 1 + avatar["type"] = "avatar" - if set_id: - set_url = self.set_url_fmt.format(set_id=set_id) - set_page = self.request(set_url).text - set_data = self.parse_set_page(set_page) - return self.extract_set(set_data) + set_url = f"{self.root}/media/set/?set={avatar['set_id']}" + set_page = self.request(set_url).text + directory = self.parse_set_page(set_page) - self.log.debug("Profile photos set ID not found.") - return iter(()) + yield Message.Directory, directory + yield Message.Url, avatar["url"], avatar + + +class FacebookUserExtractor(Dispatch, FacebookExtractor): + """Extractor for Facebook Profiles""" + pattern = USER_PATTERN + r"/?(?:$|\?|#)" + example = "https://www.facebook.com/USERNAME" + + def items(self): + base = f"{self.root}/{self.groups[0]}/" + return self._dispatch_extractors(( + (FacebookAvatarExtractor, base + "avatar"), + (FacebookPhotosExtractor, base + "photos"), + ), ("photos",)) diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py index 8981c29..70b06e7 100644 --- a/gallery_dl/extractor/fanbox.py +++ b/gallery_dl/extractor/fanbox.py @@ -7,9 +7,8 @@ """Extractors for https://www.fanbox.cc/""" from .common import Extractor, Message -from .. import text +from .. 
import text, util from ..cache import memcache -import re BASE_PATTERN = r"(?:https?://)?(?:www\.)?fanbox\.cc" USER_PATTERN = ( @@ -41,8 +40,7 @@ class FanboxExtractor(Extractor): } self.embeds = self.config("embeds", True) - includes = self.config("metadata") - if includes: + if includes := self.config("metadata"): if isinstance(includes, str): includes = includes.split(",") elif not isinstance(includes, (list, tuple)): @@ -62,7 +60,23 @@ class FanboxExtractor(Extractor): FanboxExtractor._warning = False def items(self): - for content_body, post in self.posts(): + fee_max = self.config("fee-max") + + for item in self.posts(): + if fee_max is not None and fee_max < item["feeRequired"]: + self.log.warning("Skipping post %s (feeRequired of %s > %s)", + item["id"], item["feeRequired"], fee_max) + continue + + try: + url = "https://api.fanbox.cc/post.info?postId=" + item["id"] + body = self.request_json(url, headers=self.headers)["body"] + content_body, post = self._extract_post(body) + except Exception as exc: + self.log.warning("Skipping post %s (%s: %s)", + item["id"], exc.__class__.__name__, exc) + continue + yield Message.Directory, post yield from self._get_urls_from_post(content_body, post) @@ -72,22 +86,17 @@ class FanboxExtractor(Extractor): def _pagination(self, url): while url: url = text.ensure_http_scheme(url) - body = self.request(url, headers=self.headers).json()["body"] - for item in body["items"]: - try: - yield self._get_post_data(item["id"]) - except Exception as exc: - self.log.warning("Skipping post %s (%s: %s)", - item["id"], exc.__class__.__name__, exc) + body = self.request_json(url, headers=self.headers)["body"] + + yield from body["items"] + url = body["nextUrl"] - def _get_post_data(self, post_id): + def _extract_post(self, post): """Fetch and process post data""" - url = "https://api.fanbox.cc/post.info?postId="+post_id - post = self.request(url, headers=self.headers).json()["body"] + post["archives"] = () - content_body = post.pop("body", None) - if content_body: + if content_body := post.pop("body", None): if "html" in content_body: post["html"] = content_body["html"] if post["type"] == "article": @@ -95,29 +104,30 @@ class FanboxExtractor(Extractor): if "blocks" in content_body: content = [] # text content images = [] # image IDs in 'body' order + files = [] # file IDs in 'body' order - append = content.append - append_img = images.append for block in content_body["blocks"]: if "text" in block: - append(block["text"]) + content.append(block["text"]) if "links" in block: for link in block["links"]: - append(link["url"]) + content.append(link["url"]) if "imageId" in block: - append_img(block["imageId"]) - - if images and "imageMap" in content_body: - # reorder 'imageMap' (#2718) - image_map = content_body["imageMap"] - content_body["imageMap"] = { - image_id: image_map[image_id] - for image_id in images - if image_id in image_map - } + images.append(block["imageId"]) + if "fileId" in block: + files.append(block["fileId"]) post["content"] = "\n".join(content) + self._sort_map(content_body, "imageMap", images) + if file_map := self._sort_map(content_body, "fileMap", files): + exts = util.EXTS_ARCHIVE + post["archives"] = [ + file + for file in file_map.values() + if file.get("extension", "").lower() in exts + ] + post["date"] = text.parse_datetime(post["publishedDatetime"]) post["text"] = content_body.get("text") if content_body else None post["isCoverImage"] = False @@ -130,8 +140,7 @@ class FanboxExtractor(Extractor): try: post["plan"] = plans[fee] except KeyError: 
- fees = [f for f in plans if f >= fee] - if fees: + if fees := [f for f in plans if f >= fee]: plan = plans[min(fees)] else: plan = plans[0].copy() @@ -139,17 +148,30 @@ class FanboxExtractor(Extractor): post["plan"] = plans[fee] = plan if self._meta_comments: if post["commentCount"]: - post["comments"] = list(self._get_comment_data(post_id)) + post["comments"] = list(self._get_comment_data(post["id"])) else: post["commentd"] = () return content_body, post + def _sort_map(self, body, key, ids): + orig = body.get(key) + if not orig: + return {} if orig is None else orig + + body[key] = new = { + id: orig[id] + for id in ids + if id in orig + } + + return new + @memcache(keyarg=1) def _get_user_data(self, creator_id): url = "https://api.fanbox.cc/creator.get" params = {"creatorId": creator_id} - data = self.request(url, params=params, headers=self.headers).json() + data = self.request_json(url, params=params, headers=self.headers) user = data["body"] user.update(user.pop("user")) @@ -160,7 +182,7 @@ class FanboxExtractor(Extractor): def _get_plan_data(self, creator_id): url = "https://api.fanbox.cc/plan.listCreator" params = {"creatorId": creator_id} - data = self.request(url, params=params, headers=self.headers).json() + data = self.request_json(url, params=params, headers=self.headers) plans = {0: { "id" : "", @@ -185,7 +207,7 @@ class FanboxExtractor(Extractor): comments = [] while url: url = text.ensure_http_scheme(url) - body = self.request(url, headers=self.headers).json()["body"] + body = self.request_json(url, headers=self.headers)["body"] data = body["commentList"] comments.extend(data["items"]) url = data["nextUrl"] @@ -193,9 +215,8 @@ class FanboxExtractor(Extractor): def _get_urls_from_post(self, content_body, post): num = 0 - cover_image = post.get("coverImageUrl") - if cover_image: - cover_image = re.sub("/c/[0-9a-z_]+", "", cover_image) + if cover_image := post.get("coverImageUrl"): + cover_image = util.re("/c/[0-9a-z_]+").sub("", cover_image) final_post = post.copy() final_post["isCoverImage"] = True final_post["fileUrl"] = cover_image @@ -313,10 +334,10 @@ class FanboxExtractor(Extractor): elif provider == "twitter": url = "https://twitter.com/_/status/"+content_id elif provider == "google_forms": - templ = "https://docs.google.com/forms/d/e/{}/viewform?usp=sf_link" - url = templ.format(content_id) + url = (f"https://docs.google.com/forms/d/e/" + f"{content_id}/viewform?usp=sf_link") else: - self.log.warning("service not recognized: {}".format(provider)) + self.log.warning(f"service not recognized: {provider}") if url: final_post["embed"] = embed @@ -334,25 +355,16 @@ class FanboxCreatorExtractor(FanboxExtractor): pattern = USER_PATTERN + r"(?:/posts)?/?$" example = "https://USER.fanbox.cc/" - def __init__(self, match): - FanboxExtractor.__init__(self, match) - self.creator_id = match.group(1) or match.group(2) - def posts(self): url = "https://api.fanbox.cc/post.paginateCreator?creatorId=" - return self._pagination_creator(url + self.creator_id) + creator_id = self.groups[0] or self.groups[1] + return self._pagination_creator(url + creator_id) def _pagination_creator(self, url): - urls = self.request(url, headers=self.headers).json()["body"] + urls = self.request_json(url, headers=self.headers)["body"] for url in urls: url = text.ensure_http_scheme(url) - body = self.request(url, headers=self.headers).json()["body"] - for item in body: - try: - yield self._get_post_data(item["id"]) - except Exception as exc: - self.log.warning("Skipping post %s (%s: %s)", - item["id"], 
exc.__class__.__name__, exc) + yield from self.request_json(url, headers=self.headers)["body"] class FanboxPostExtractor(FanboxExtractor): @@ -361,12 +373,8 @@ class FanboxPostExtractor(FanboxExtractor): pattern = USER_PATTERN + r"/posts/(\d+)" example = "https://USER.fanbox.cc/posts/12345" - def __init__(self, match): - FanboxExtractor.__init__(self, match) - self.post_id = match.group(3) - def posts(self): - return (self._get_post_data(self.post_id),) + return ({"id": self.groups[2], "feeRequired": 0},) class FanboxHomeExtractor(FanboxExtractor): diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py index 6218f19..e32a86b 100644 --- a/gallery_dl/extractor/fantia.py +++ b/gallery_dl/extractor/fantia.py @@ -93,7 +93,7 @@ class FantiaExtractor(Extractor): def _get_post_data(self, post_id): """Fetch and process post data""" url = self.root+"/api/v1/posts/"+post_id - resp = self.request(url, headers=self.headers).json()["post"] + resp = self.request_json(url, headers=self.headers)["post"] return { "post_id": resp["id"], "post_url": self.root + "/posts/" + str(resp["id"]), @@ -181,10 +181,10 @@ class FantiaCreatorExtractor(FantiaExtractor): def __init__(self, match): FantiaExtractor.__init__(self, match) - self.creator_id = match.group(1) + self.creator_id = match[1] def posts(self): - url = "{}/fanclubs/{}/posts".format(self.root, self.creator_id) + url = f"{self.root}/fanclubs/{self.creator_id}/posts" return self._pagination(url) @@ -196,7 +196,7 @@ class FantiaPostExtractor(FantiaExtractor): def __init__(self, match): FantiaExtractor.__init__(self, match) - self.post_id = match.group(1) + self.post_id = match[1] def posts(self): self._csrf_token() diff --git a/gallery_dl/extractor/fapachi.py b/gallery_dl/extractor/fapachi.py index 43627e2..7ff71b0 100644 --- a/gallery_dl/extractor/fapachi.py +++ b/gallery_dl/extractor/fapachi.py @@ -31,8 +31,7 @@ class FapachiPostExtractor(Extractor): "user": self.user, "id" : self.id, } - page = self.request("{}/{}/media/{}".format( - self.root, self.user, self.id)).text + page = self.request(f"{self.root}/{self.user}/media/{self.id}").text url = self.root + text.extract( page, 'data-src="', '"', page.index('class="media-img'))[0] yield Message.Directory, data @@ -50,17 +49,16 @@ class FapachiUserExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.user = match.group(1) - self.num = text.parse_int(match.group(2), 1) + self.user = match[1] + self.num = text.parse_int(match[2], 1) def items(self): data = {"_extractor": FapachiPostExtractor} while True: - page = self.request("{}/{}/page/{}".format( - self.root, self.user, self.num)).text + url = f"{self.root}/{self.user}/page/{self.num}" + page = self.request(url).text for post in text.extract_iter(page, 'model-media-prew">', ">"): - path = text.extr(post, 'Next page' not in page: diff --git a/gallery_dl/extractor/fapello.py b/gallery_dl/extractor/fapello.py index cf18edc..b961cbe 100644 --- a/gallery_dl/extractor/fapello.py +++ b/gallery_dl/extractor/fapello.py @@ -25,11 +25,11 @@ class FapelloPostExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.root = text.root_from_url(match.group(0)) + self.root = text.root_from_url(match[0]) self.model, self.id = match.groups() def items(self): - url = "{}/{}/{}/".format(self.root, self.model, self.id) + url = f"{self.root}/{self.model}/{self.id}/" page = text.extr( self.request(url, allow_redirects=False).text, 'class="uk-align-center"', "
", None) @@ -59,15 +59,14 @@ class FapelloModelExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.root = text.root_from_url(match.group(0)) - self.model = match.group(1) + self.root = text.root_from_url(match[0]) + self.model = match[1] def items(self): num = 1 data = {"_extractor": FapelloPostExtractor} while True: - url = "{}/ajax/model/{}/page-{}/".format( - self.root, self.model, num) + url = f"{self.root}/ajax/model/{self.model}/page-{num}/" page = self.request(url).text if not page: return @@ -93,8 +92,8 @@ class FapelloPathExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.root = text.root_from_url(match.group(0)) - self.path = match.group(1) + self.root = text.root_from_url(match[0]) + self.path = match[1] def items(self): num = 1 @@ -109,8 +108,8 @@ class FapelloPathExtractor(Extractor): data = {"_extractor": FapelloModelExtractor} while True: - page = self.request("{}/ajax/{}/page-{}/".format( - self.root, self.path, num)).text + url = f"{self.root}/ajax/{self.path}/page-{num}/" + page = self.request(url).text if not page: return diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py index eb68c3e..35263a3 100644 --- a/gallery_dl/extractor/flickr.py +++ b/gallery_dl/extractor/flickr.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2023 Mike Fährmann +# Copyright 2017-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -134,8 +134,8 @@ class FlickrAlbumExtractor(FlickrExtractor): for album in self.api.photosets_getList(self.user["nsid"]): self.api._clean_info(album).update(data) - url = "https://www.flickr.com/photos/{}/albums/{}".format( - self.user["path_alias"], album["id"]) + url = (f"https://www.flickr.com/photos/{self.user['path_alias']}" + f"/albums/{album['id']}") yield Message.Queue, url, album def metadata(self): @@ -451,14 +451,13 @@ class FlickrAPI(oauth.OAuth1API): raise exception.AuthenticationError(msg) elif data["code"] == 99: raise exception.AuthorizationError(msg) - raise exception.StopExtraction("API request failed: %s", msg) + raise exception.AbortExtraction(f"API request failed: {msg}") return data def _pagination(self, method, params, key="photos"): extras = ("description,date_upload,tags,views,media," "path_alias,owner_name,") - includes = self.extractor.config("metadata") - if includes: + if includes := self.extractor.config("metadata"): if isinstance(includes, (list, tuple)): includes = ",".join(includes) elif not isinstance(includes, str): @@ -585,8 +584,7 @@ class FlickrAPI(oauth.OAuth1API): if "license" in photo: photo["license_name"] = self.LICENSES.get(photo["license"]) - @staticmethod - def _clean_info(info): + def _clean_info(self, info): info["title"] = info["title"]["_content"] info["description"] = info["description"]["_content"] return info diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index 5f90afc..dc23488 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2023 Mike Fährmann +# Copyright 2019-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -26,6 +26,9 @@ class FoolfuukaExtractor(BaseExtractor): self.remote = self._remote_direct elif self.category == "archivedmoe": self.referer = False + 
self.fixup_redirect = True + else: + self.fixup_redirect = False def items(self): yield Message.Directory, self.metadata() @@ -57,13 +60,45 @@ class FoolfuukaExtractor(BaseExtractor): """Resolve a remote media link""" page = self.request(media["remote_media_link"]).text url = text.extr(page, 'http-equiv="Refresh" content="0; url=', '"') - if url.endswith(".webm") and \ - url.startswith("https://thebarchive.com/"): - return url[:-1] + + if url.startswith("https://thebarchive.com/"): + # '.webm' -> '.web' (#5116) + if url.endswith(".webm"): + url = url[:-1] + + elif self.fixup_redirect: + # update redirect domain or filename (#7652) + path, _, filename = url.rpartition("/") + + # these boards link directly to i.4cdn.org + # -> redirect to warosu or 4plebs instead + board_domains = { + "3" : "warosu.org", + "biz": "warosu.org", + "ck" : "warosu.org", + "diy": "warosu.org", + "fa" : "warosu.org", + "ic" : "warosu.org", + "jp" : "warosu.org", + "lit": "warosu.org", + "sci": "warosu.org", + "tg" : "archive.4plebs.org", + } + board = url.split("/", 4)[3] + if board in board_domains: + domain = board_domains[board] + url = f"https://{domain}/{board}/full_image/{filename}" + + # if it's one of these archives, slice the name + elif any(archive in path for archive in ( + "b4k.", "desuarchive.", "palanq.")): + name, _, ext = filename.rpartition(".") + if len(name) > 13: + url = f"{path}/{name[:13]}.{ext}" + return url - @staticmethod - def _remote_direct(media): + def _remote_direct(self, media): return media["remote_media_link"] @@ -124,13 +159,12 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor): def metadata(self): url = self.root + "/_/api/chan/thread/" params = {"board": self.board, "num": self.thread} - self.data = self.request(url, params=params).json()[self.thread] + self.data = self.request_json(url, params=params)[self.thread] return self.data["op"] def posts(self): op = (self.data["op"],) - posts = self.data.get("posts") - if posts: + if posts := self.data.get("posts"): posts = list(posts.values()) posts.sort(key=lambda p: p["timestamp"]) return itertools.chain(op, posts) @@ -149,13 +183,12 @@ class FoolfuukaBoardExtractor(FoolfuukaExtractor): self.page = self.groups[-1] def items(self): - index_base = "{}/_/api/chan/index/?board={}&page=".format( - self.root, self.board) - thread_base = "{}/{}/thread/".format(self.root, self.board) + index_base = f"{self.root}/_/api/chan/index/?board={self.board}&page=" + thread_base = f"{self.root}/{self.board}/thread/" page = self.page for pnum in itertools.count(text.parse_int(page, 1)): - with self.request(index_base + format(pnum)) as response: + with self.request(index_base + str(pnum)) as response: try: threads = response.json() except ValueError: @@ -209,7 +242,7 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor): while True: try: - data = self.request(url, params=params).json() + data = self.request_json(url, params=params) except ValueError: return @@ -235,27 +268,17 @@ class FoolfuukaGalleryExtractor(FoolfuukaExtractor): pattern = BASE_PATTERN + r"/([^/?#]+)/gallery(?:/(\d+))?" 
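# --- editor's sketch -------------------------------------------------------
# A standalone model of the redirect fixup added in _remote() above for
# archivedmoe (#7652): redirects that point at i.4cdn.org are rewritten to a
# warosu/4plebs "full_image" URL. The mapping below is an excerpt of the
# board_domains table from the patch; the helper name and sample URL are
# illustrative only.

def fixup_redirect(url):
    board_domains = {"3": "warosu.org", "tg": "archive.4plebs.org"}  # excerpt
    filename = url.rpartition("/")[2]
    board = url.split("/", 4)[3]
    if board in board_domains:
        domain = board_domains[board]
        return f"https://{domain}/{board}/full_image/{filename}"
    return url

# fixup_redirect("https://i.4cdn.org/tg/1596554233276.png")
# -> "https://archive.4plebs.org/tg/full_image/1596554233276.png"
# ---------------------------------------------------------------------------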
example = "https://archived.moe/a/gallery" - def __init__(self, match): - FoolfuukaExtractor.__init__(self, match) - - board = match.group(match.lastindex) - if board.isdecimal(): - self.board = match.group(match.lastindex-1) - self.pages = (board,) - else: - self.board = board - self.pages = map(format, itertools.count(1)) - def metadata(self): - return {"board": self.board} + self.board = board = self.groups[-2] + return {"board": board} def posts(self): - base = "{}/_/api/chan/gallery/?board={}&page=".format( - self.root, self.board) + pnum = self.groups[-1] + pages = itertools.count(1) if pnum is None else (pnum,) + base = f"{self.root}/_/api/chan/gallery/?board={self.board}&page=" - for page in self.pages: - with self.request(base + page) as response: - posts = response.json() + for pnum in pages: + posts = self.request_json(f"{base}{pnum}") if not posts: return yield from posts diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py index bb684c2..7c59f72 100644 --- a/gallery_dl/extractor/foolslide.py +++ b/gallery_dl/extractor/foolslide.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2023 Mike Fährmann +# Copyright 2016-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -18,14 +18,13 @@ class FoolslideExtractor(BaseExtractor): def __init__(self, match): BaseExtractor.__init__(self, match) - self.gallery_url = self.root + match.group(match.lastindex) + self.page_url = self.root + self.groups[-1] def request(self, url): return BaseExtractor.request( self, url, encoding="utf-8", method="POST", data={"adult": "true"}) - @staticmethod - def parse_chapter_url(url, data): + def parse_chapter_url(self, url, data): info = url.partition("/read/")[2].rstrip("/").split("/") lang = info[1].partition("-")[0] data["lang"] = lang @@ -52,7 +51,7 @@ class FoolslideChapterExtractor(FoolslideExtractor): example = "https://read.powermanga.org/read/MANGA/en/0/123/" def items(self): - page = self.request(self.gallery_url).text + page = self.request(self.page_url).text data = self.metadata(page) imgs = self.images(page) @@ -79,7 +78,7 @@ class FoolslideChapterExtractor(FoolslideExtractor): def metadata(self, page): extr = text.extract_from(page) extr('
<h1 class="tbtitle dnone">
', '') - return self.parse_chapter_url(self.gallery_url, { + return self.parse_chapter_url(self.page_url, { "manga" : text.unescape(extr('title="', '"')).strip(), "chapter_string": text.unescape(extr('title="', '"')), }) @@ -96,7 +95,7 @@ class FoolslideMangaExtractor(FoolslideExtractor): example = "https://read.powermanga.org/series/MANGA/" def items(self): - page = self.request(self.gallery_url).text + page = self.request(self.page_url).text chapters = self.chapters(page) if not self.config("chapter-reverse", False): diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py index 565fd71..0d24f83 100644 --- a/gallery_dl/extractor/furaffinity.py +++ b/gallery_dl/extractor/furaffinity.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2020-2023 Mike Fährmann +# Copyright 2020-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,7 +8,7 @@ """Extractors for https://www.furaffinity.net/""" -from .common import Extractor, Message +from .common import Extractor, Message, Dispatch from .. import text, util BASE_PATTERN = r"(?:https?://)?(?:www\.|sfw\.)?(?:f[ux]|f?xfu)raffinity\.net" @@ -28,7 +28,7 @@ class FuraffinityExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.user = match.group(1) + self.user = match[1] self.offset = 0 def _init(self): @@ -51,8 +51,7 @@ class FuraffinityExtractor(Extractor): def items(self): metadata = self.metadata() for post_id in util.advance(self.posts(), self.offset): - post = self._parse_post(post_id) - if post: + if post := self._parse_post(post_id): if metadata: post.update(metadata) yield Message.Directory, post @@ -71,7 +70,7 @@ class FuraffinityExtractor(Extractor): return num def _parse_post(self, post_id): - url = "{}/view/{}/".format(self.root, post_id) + url = f"{self.root}/view/{post_id}/" extr = text.extract_from(self.request(url).text) if self._new_layout is None: @@ -117,8 +116,7 @@ class FuraffinityExtractor(Extractor): data["folders"] = folders = [] for folder in extr( "
<h3>Listed in Folders</h3>
", "").split(""): - folder = rh(folder) - if folder: + if folder := rh(folder): folders.append(folder) else: # old site layout @@ -147,22 +145,19 @@ class FuraffinityExtractor(Extractor): data["user"] = self.user or data["artist_url"] data["date"] = text.parse_timestamp(data["filename"].partition(".")[0]) data["description"] = self._process_description(data["_description"]) - data["thumbnail"] = "https://t.furaffinity.net/{}@600-{}.jpg".format( - post_id, path.rsplit("/", 2)[1]) - + data["thumbnail"] = (f"https://t.furaffinity.net/{post_id}@600-" + f"{path.rsplit('/', 2)[1]}.jpg") return data - @staticmethod - def _process_description(description): + def _process_description(self, description): return text.unescape(text.remove_html(description, "", "")) def _pagination(self, path, folder=None): num = 1 - folder = "" if folder is None else "/folder/{}/a".format(folder) + folder = "" if folder is None else f"/folder/{folder}/a" while True: - url = "{}/{}/{}{}/{}/".format( - self.root, path, self.user, folder, num) + url = f"{self.root}/{path}/{self.user}{folder}/{num}/" page = self.request(url).text post_id = None @@ -174,7 +169,7 @@ class FuraffinityExtractor(Extractor): num += 1 def _pagination_favorites(self): - path = "/favorites/{}/".format(self.user) + path = f"/favorites/{self.user}/" while path: page = self.request(self.root + path).text @@ -188,7 +183,7 @@ class FuraffinityExtractor(Extractor): pos = page.find('type="submit">Next') if pos >= 0: - path = text.rextract(page, '
Next 48")) < 0 and \ + (pos := page.find(">>>> Next 48 >>")) < 0: return + + path = text.rextr(page, 'href="', '"', pos) url = self.root + text.unescape(path) diff --git a/gallery_dl/extractor/furry34.py b/gallery_dl/extractor/furry34.py index e0c7fdb..a93ec75 100644 --- a/gallery_dl/extractor/furry34.py +++ b/gallery_dl/extractor/furry34.py @@ -46,8 +46,8 @@ class Furry34Extractor(BooruExtractor): post_id = post["id"] root = self.root_cdn if files[fmt][0] else self.root - post["file_url"] = url = "{}/posts/{}/{}/{}.{}".format( - root, post_id // 1000, post_id, post_id, extension) + post["file_url"] = url = \ + f"{root}/posts/{post_id // 1000}/{post_id}/{post_id}.{extension}" post["format_id"] = fmt post["format"] = extension.partition(".")[0] @@ -73,11 +73,11 @@ class Furry34Extractor(BooruExtractor): post["tags_" + types[type]] = values def _fetch_post(self, post_id): - url = "{}/api/v2/post/{}".format(self.root, post_id) - return self.request(url).json() + url = f"{self.root}/api/v2/post/{post_id}" + return self.request_json(url) def _pagination(self, endpoint, params=None): - url = "{}/api{}".format(self.root, endpoint) + url = f"{self.root}/api{endpoint}" if params is None: params = {} @@ -86,7 +86,7 @@ class Furry34Extractor(BooruExtractor): threshold = self.per_page while True: - data = self.request(url, method="POST", json=params).json() + data = self.request_json(url, method="POST", json=params) yield from data["items"] diff --git a/gallery_dl/extractor/fuskator.py b/gallery_dl/extractor/fuskator.py index beecbff..b7cf0c8 100644 --- a/gallery_dl/extractor/fuskator.py +++ b/gallery_dl/extractor/fuskator.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2023 Mike Fährmann +# Copyright 2019-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -21,13 +21,13 @@ class FuskatorGalleryExtractor(GalleryExtractor): example = "https://fuskator.com/thumbs/ID/" def __init__(self, match): - self.gallery_hash = match.group(1) - url = "{}/thumbs/{}/index.html".format(self.root, self.gallery_hash) + self.gallery_hash = match[1] + url = f"{self.root}/thumbs/{self.gallery_hash}/index.html" GalleryExtractor.__init__(self, match, url) def metadata(self, page): headers = { - "Referer" : self.gallery_url, + "Referer" : self.page_url, "X-Requested-With": "XMLHttpRequest", } auth = self.request( @@ -39,9 +39,8 @@ class FuskatorGalleryExtractor(GalleryExtractor): "hash" : self.gallery_hash, "_" : int(time.time()), } - self.data = data = self.request( - self.root + "/ajax/gal.aspx", params=params, headers=headers, - ).json() + self.data = data = self.request_json( + self.root + "/ajax/gal.aspx", params=params, headers=headers) title = text.extr(page, "", "").strip() title, _, gallery_id = title.rpartition("#") @@ -72,7 +71,7 @@ class FuskatorSearchExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.path = match.group(1) + self.path = match[1] def items(self): url = self.root + self.path @@ -87,4 +86,4 @@ class FuskatorSearchExtractor(Extractor): pages = text.extr(page, 'class="pages">', '>>><') if not pages: return - url = self.root + text.rextract(pages, 'href="', '"')[0] + url = self.root + text.rextr(pages, 'href="', '"') diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index f24b696..b152885 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# 
Copyright 2014-2023 Mike Fährmann +# Copyright 2014-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -26,11 +26,19 @@ class GelbooruBase(): def _api_request(self, params, key="post", log=False): if "s" not in params: params["s"] = "post" + params["api_key"] = self.api_key params["user_id"] = self.user_id url = self.root + "/index.php?page=dapi&q=index&json=1" - data = self.request(url, params=params).json() + try: + data = self.request_json(url, params=params) + except exception.HttpError as exc: + if exc.status == 401: + raise exception.AuthorizationError( + f"'api-key' and 'user-id' required " + f"({exc.status}: {exc.response.reason})") + raise if not key: return data @@ -73,7 +81,7 @@ class GelbooruBase(): if id: tag = "id:" + op tags = [t for t in tags if not t.startswith(tag)] - tags = "{} id:{}".format(" ".join(tags), op) + tags = f"{' '.join(tags)} id:{op}" while True: posts = self._api_request(params) @@ -113,7 +121,7 @@ class GelbooruBase(): post["_fallback"] = (url,) md5 = post["md5"] root = text.root_from_url(post["preview_url"]) - path = "/images/{}/{}/{}.webm".format(md5[0:2], md5[2:4], md5) + path = f"/images/{md5[0:2]}/{md5[2:4]}/{md5}.webm" url = root + path return url @@ -292,7 +300,7 @@ class GelbooruRedirectExtractor(GelbooruBase, Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.url_base64 = match.group(1) + self.url_base64 = match[1] def items(self): url = text.ensure_http_scheme(binascii.a2b_base64( diff --git a/gallery_dl/extractor/gelbooru_v01.py b/gallery_dl/extractor/gelbooru_v01.py index 0b96048..61d0545 100644 --- a/gallery_dl/extractor/gelbooru_v01.py +++ b/gallery_dl/extractor/gelbooru_v01.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021-2023 Mike Fährmann +# Copyright 2021-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -17,8 +17,7 @@ class GelbooruV01Extractor(booru.BooruExtractor): per_page = 20 def _parse_post(self, post_id): - url = "{}/index.php?page=post&s=view&id={}".format( - self.root, post_id) + url = f"{self.root}/index.php?page=post&s=view&id={post_id}" extr = text.extract_from(self.request(url).text) post = { @@ -92,16 +91,12 @@ class GelbooruV01TagExtractor(GelbooruV01Extractor): pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]+)" example = "https://allgirl.booru.org/index.php?page=post&s=list&tags=TAG" - def __init__(self, match): - GelbooruV01Extractor.__init__(self, match) - self.tags = match.group(match.lastindex) - def metadata(self): - return {"search_tags": text.unquote(self.tags.replace("+", " "))} + self.tags = tags = self.groups[-1] + return {"search_tags": text.unquote(tags.replace("+", " "))} def posts(self): - url = "{}/index.php?page=post&s=list&tags={}&pid=".format( - self.root, self.tags) + url = f"{self.root}/index.php?page=post&s=list&tags={self.tags}&pid=" return self._pagination(url, 'class="thumb">Pool: ", "
</h4>
") @@ -239,12 +243,9 @@ class GelbooruV02FavoriteExtractor(GelbooruV02Extractor): pattern = BASE_PATTERN + r"/index\.php\?page=favorites&s=view&id=(\d+)" example = "https://safebooru.org/index.php?page=favorites&s=view&id=12345" - def __init__(self, match): - GelbooruV02Extractor.__init__(self, match) - self.favorite_id = match.group(match.lastindex) - def metadata(self): - return {"favorite_id": text.parse_int(self.favorite_id)} + self.favorite_id = fav_id = self.groups[-1] + return {"favorite_id": text.parse_int(fav_id)} def posts(self): return self._pagination_html({ @@ -260,9 +261,5 @@ class GelbooruV02PostExtractor(GelbooruV02Extractor): pattern = BASE_PATTERN + r"/index\.php\?page=post&s=view&id=(\d+)" example = "https://safebooru.org/index.php?page=post&s=view&id=12345" - def __init__(self, match): - GelbooruV02Extractor.__init__(self, match) - self.post_id = match.group(match.lastindex) - def posts(self): - return self._pagination({"id": self.post_id}) + return self._pagination({"id": self.groups[-1]}) diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py index 4b04732..407e478 100644 --- a/gallery_dl/extractor/generic.py +++ b/gallery_dl/extractor/generic.py @@ -7,9 +7,8 @@ """Generic information extractor""" from .common import Extractor, Message -from .. import config, text +from .. import config, text, util import os.path -import re class GenericExtractor(Extractor): @@ -37,28 +36,28 @@ class GenericExtractor(Extractor): example = "generic:https://www.nongnu.org/lzip/" def __init__(self, match): - self.subcategory = match.group('domain') + self.subcategory = match['domain'] Extractor.__init__(self, match) # Strip the "g(eneric):" prefix # and inform about "forced" or "fallback" mode - if match.group('generic'): - self.url = match.group(0).partition(":")[2] + if match['generic']: + self.url = match[0].partition(":")[2] else: self.log.info("Falling back on generic information extractor.") - self.url = match.group(0) + self.url = match[0] # Make sure we have a scheme, or use https - if match.group('scheme'): - self.scheme = match.group('scheme') + if match['scheme']: + self.scheme = match['scheme'] else: self.scheme = 'https://' self.url = text.ensure_http_scheme(self.url, self.scheme) - self.path = match.group('path') + self.path = match['path'] # Used to resolve relative image urls - self.root = self.scheme + match.group('domain') + self.root = self.scheme + match['domain'] def items(self): """Get page, extract metadata & images, yield them in suitable messages @@ -172,8 +171,8 @@ class GenericExtractor(Extractor): r"(?:[^\"'<>\s]*)?" # optional query and fragment ) - imageurls_src = re.findall(imageurl_pattern_src, page) - imageurls_ext = re.findall(imageurl_pattern_ext, page) + imageurls_src = util.re(imageurl_pattern_src).findall(page) + imageurls_ext = util.re(imageurl_pattern_ext).findall(page) imageurls = imageurls_src + imageurls_ext # Resolve relative urls @@ -182,10 +181,10 @@ class GenericExtractor(Extractor): # by prepending a suitable base url. 
# # If the page contains a element, use it as base url - basematch = re.search( - r"(?i)(?:[^\"' >]+)", page) + basematch = util.re( + r"(?i)(?:[^\"' >]+)").search(page) if basematch: - self.baseurl = basematch.group('url').rstrip('/') + self.baseurl = basematch['url'].rstrip('/') # Otherwise, extract the base url from self.url else: if self.url.endswith("/"): diff --git a/gallery_dl/extractor/girlsreleased.py b/gallery_dl/extractor/girlsreleased.py new file mode 100644 index 0000000..4fc77c6 --- /dev/null +++ b/gallery_dl/extractor/girlsreleased.py @@ -0,0 +1,76 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://girlsreleased.com/""" + +from .common import Extractor, Message +from .. import text +import itertools + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?girlsreleased\.com" + + +class GirlsreleasedExtractor(Extractor): + """Base class for girlsreleased extractors""" + category = "girlsreleased" + root = "https://girlsreleased.com" + request_interval = 0.5 + request_interval_min = 0.2 + + def items(self): + data = {"_extractor": GirlsreleasedSetExtractor} + base = f"{self.root}/set/" + for set in self._pagination(): + yield Message.Queue, f"{base}{set[0]}", data + + def _pagination(self): + base = f"{self.root}/api/0.1/sets/{self._path}/{self.groups[0]}/page/" + for pnum in itertools.count(): + sets = self.request_json(f"{base}{pnum}")["sets"] + if not sets: + return + + yield from sets[1:] if pnum else sets + if len(sets) < 80: + return + + +class GirlsreleasedSetExtractor(GirlsreleasedExtractor): + """Extractor for girlsreleased galleries""" + subcategory = "set" + pattern = BASE_PATTERN + r"/set/(\d+)" + example = "https://girlsreleased.com/set/12345" + + def items(self): + url = f"{self.root}/api/0.1/set/{self.groups[0]}" + json = self.request_json(url)["set"] + data = { + "title": json["name"] or json["id"], + "id": json["id"], + "site": json["site"], + "model": [model for _, model in json["models"]], + "date": text.parse_timestamp(json["date"]), + "count": len(json["images"]), + "url": "https://girlsreleased.com/set/" + json["id"], + } + yield Message.Directory, data + for data["num"], image in enumerate(json["images"], 1): + text.nameext_from_url(image[5], data) + yield Message.Queue, image[3], data + + +class GirlsreleasedModelExtractor(GirlsreleasedExtractor): + """Extractor for girlsreleased models""" + subcategory = _path = "model" + pattern = BASE_PATTERN + r"/model/(\d+(?:/.+)?)" + example = "https://girlsreleased.com/model/12345/MODEL" + + +class GirlsreleasedSiteExtractor(GirlsreleasedExtractor): + """Extractor for girlsreleased sites""" + subcategory = _path = "site" + pattern = BASE_PATTERN + r"/site/([^/?#]+(?:/model/\d+/?.*)?)" + example = "https://girlsreleased.com/site/SITE" diff --git a/gallery_dl/extractor/girlswithmuscle.py b/gallery_dl/extractor/girlswithmuscle.py new file mode 100644 index 0000000..51b979f --- /dev/null +++ b/gallery_dl/extractor/girlswithmuscle.py @@ -0,0 +1,177 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from .common import Extractor, Message +from .. 
import text, util, exception +from ..cache import cache + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?girlswithmuscle\.com" + + +class GirlswithmuscleExtractor(Extractor): + """Base class for girlswithmuscle extractors""" + category = "girlswithmuscle" + root = "https://www.girlswithmuscle.com" + directory_fmt = ("{category}", "{model}") + filename_fmt = "{model}_{id}.{extension}" + archive_fmt = "{type}_{model}_{id}" + + def login(self): + username, password = self._get_auth_info() + if username: + self.cookies_update(self._login_impl(username, password)) + + @cache(maxage=14*86400, keyarg=1) + def _login_impl(self, username, password): + self.log.info("Logging in as %s", username) + + url = self.root + "/login/" + page = self.request(url).text + csrf_token = text.extr(page, 'name="csrfmiddlewaretoken" value="', '"') + + headers = { + "Origin" : self.root, + "Referer": url, + } + data = { + "csrfmiddlewaretoken": csrf_token, + "username": username, + "password": password, + "next": "/", + } + response = self.request( + url, method="POST", headers=headers, data=data) + + if not response.history: + raise exception.AuthenticationError() + + page = response.text + if ">Wrong username or password" in page: + raise exception.AuthenticationError() + if ">Log in<" in page: + raise exception.AuthenticationError("Account data is missing") + + return {c.name: c.value for c in response.history[0].cookies} + + +class GirlswithmusclePostExtractor(GirlswithmuscleExtractor): + """Extractor for individual posts on girlswithmuscle.com""" + subcategory = "post" + pattern = BASE_PATTERN + r"/(\d+)" + example = "https://www.girlswithmuscle.com/12345/" + + def items(self): + self.login() + + url = f"{self.root}/{self.groups[0]}/" + page = self.request(url).text + if not page: + raise exception.NotFoundError("post") + + metadata = self.metadata(page) + + if url := text.extr(page, 'class="main-image" src="', '"'): + metadata["type"] = "picture" + else: + url = text.extr(page, '', "")) + image_info = text.extr( + page, '
', "
") + uploader = text.remove_html(text.extr( + image_info, '', "
")) + + tags = text.extr(page, 'id="tags-text">', "") + score = text.parse_int(text.remove_html(text.extr( + page, "Score: ", "", "") + return "unknown" if model.startswith("Picture #") else model + + def _parse_model_list(self, model): + if model == "unknown": + return [] + else: + return [name.strip() for name in model.split(",")] + + def _parse_is_favorite(self, page): + fav_button = text.extr( + page, 'id="favorite-button">', "") + unfav_button = text.extr( + page, 'class="actionbutton unfavorite-button">', "") + + is_favorite = None + if unfav_button == "Unfavorite": + is_favorite = True + if fav_button == "Favorite": + is_favorite = False + + return is_favorite + + def _extract_comments(self, page): + comments = text.extract_iter( + page, '
', "
") + return [comment.strip() for comment in comments] + + +class GirlswithmuscleSearchExtractor(GirlswithmuscleExtractor): + """Extractor for search results on girlswithmuscle.com""" + subcategory = "search" + pattern = BASE_PATTERN + r"/images/(.*)" + example = "https://www.girlswithmuscle.com/images/?name=MODEL" + + def pages(self): + query = self.groups[0] + url = f"{self.root}/images/{query}" + response = self.request(url) + if response.history: + msg = f'Request was redirected to "{response.url}", try logging in' + raise exception.AuthorizationError(msg) + page = response.text + + match = util.re(r"Page (\d+) of (\d+)").search(page) + current, total = match.groups() + current, total = text.parse_int(current), text.parse_int(total) + + yield page + for i in range(current + 1, total + 1): + url = f"{self.root}/images/{i}/{query}" + yield self.request(url).text + + def items(self): + self.login() + for page in self.pages(): + data = { + "_extractor" : GirlswithmusclePostExtractor, + "gallery_name": text.unescape(text.extr(page, "", "<")), + } + for imgid in text.extract_iter(page, 'id="imgid-', '"'): + url = f"{self.root}/{imgid}/" + yield Message.Queue, url, data diff --git a/gallery_dl/extractor/gofile.py b/gallery_dl/extractor/gofile.py index ef9ea60..0a6c9b9 100644 --- a/gallery_dl/extractor/gofile.py +++ b/gallery_dl/extractor/gofile.py @@ -23,7 +23,7 @@ class GofileFolderExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.content_id = match.group(1) + self.content_id = match[1] def items(self): recursive = self.config("recursive") @@ -86,17 +86,16 @@ class GofileFolderExtractor(Extractor): return self._api_request("contents/" + content_id, params, headers) def _api_request(self, endpoint, params=None, headers=None, method="GET"): - response = self.request( + response = self.request_json( "https://api.gofile.io/" + endpoint, - method=method, params=params, headers=headers, - ).json() + method=method, params=params, headers=headers) if response["status"] != "ok": if response["status"] == "error-notFound": raise exception.NotFoundError("content") if response["status"] == "error-passwordRequired": raise exception.AuthorizationError("Password required") - raise exception.StopExtraction( - "%s failed (Status: %s)", endpoint, response["status"]) + raise exception.AbortExtraction( + f"{endpoint} failed (Status: {response['status']})") return response["data"] diff --git a/gallery_dl/extractor/hatenablog.py b/gallery_dl/extractor/hatenablog.py index 792f666..8e350d6 100644 --- a/gallery_dl/extractor/hatenablog.py +++ b/gallery_dl/extractor/hatenablog.py @@ -6,9 +6,8 @@ """Extractors for https://hatenablog.com""" -import re from .common import Extractor, Message -from .. import text +from .. 
import text, util BASE_PATTERN = ( @@ -28,10 +27,10 @@ class HatenablogExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.domain = match.group(1) or match.group(2) + self.domain = match[1] or match[2] def _init(self): - self._find_img = re.compile(r'<img +([^>]+)').finditer + self._find_img = util.re(r'<img +([^>]+)').finditer def _handle_article(self, article: str): extr = text.extract_from(article) @@ -43,8 +42,8 @@ class HatenablogExtractor(Extractor): '<div class="entry-content hatenablog-entry">', '</div>') images = [] - for i in self._find_img(content): - attributes = i.group(1) + for match in self._find_img(content): + attributes = match[1] if 'class="hatena-fotolife"' not in attributes: continue image = text.unescape(text.extr(attributes, 'src="', '"')) @@ -68,13 +67,13 @@ class HatenablogEntriesExtractor(HatenablogExtractor): def __init__(self, match): HatenablogExtractor.__init__(self, match) - self.path = match.group(3) + self.path = match[3] self.query = {key: value for key, value in text.parse_query( - match.group(4)).items() if self._acceptable_query(key)} + match[4]).items() if self._acceptable_query(key)} def _init(self): HatenablogExtractor._init(self) - self._find_pager_url = re.compile( + self._find_pager_url = util.re( r' class="pager-next">\s*<a href="([^"]+)').search def items(self): @@ -92,7 +91,7 @@ class HatenablogEntriesExtractor(HatenablogExtractor): yield from self._handle_full_articles(extr) match = self._find_pager_url(page) - url = text.unescape(match.group(1)) if match else None + url = text.unescape(match[1]) if match else None query = None def _handle_partial_articles(self, extr): @@ -129,7 +128,7 @@ class HatenablogEntryExtractor(HatenablogExtractor): def __init__(self, match): HatenablogExtractor.__init__(self, match) - self.path = match.group(3) + self.path = match[3] def items(self): url = "https://" + self.domain + "/entry/" + self.path diff --git a/gallery_dl/extractor/hentai2read.py b/gallery_dl/extractor/hentai2read.py index 1317ce9..ac4cd02 100644 --- a/gallery_dl/extractor/hentai2read.py +++ b/gallery_dl/extractor/hentai2read.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2023 Mike Fährmann +# Copyright 2016-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -10,7 +10,6 @@ from .common import ChapterExtractor, MangaExtractor from .. import text, util -import re class Hentai2readBase(): @@ -31,8 +30,9 @@ class Hentai2readChapterExtractor(Hentai2readBase, ChapterExtractor): chapter_id, pos = text.extract(page, 'data-cid="', '"', pos) chapter, sep, minor = self.groups[1].partition(".") - match = re.match(r"Reading (.+) \(([^)]+)\) Hentai(?: by (.*))? - " - r"([^:]+): (.+) . Page 1 ", title) + match = util.re( + r"Reading (.+) \(([^)]+)\) Hentai(?: by (.*))? - " + r"([^:]+): (.+) . 
Page 1 ").match(title) if match: manga, type, author, _, title = match.groups() else: diff --git a/gallery_dl/extractor/hentaicosplays.py b/gallery_dl/extractor/hentaicosplays.py index 4992b7b..5c2628f 100644 --- a/gallery_dl/extractor/hentaicosplays.py +++ b/gallery_dl/extractor/hentaicosplays.py @@ -44,10 +44,10 @@ class HentaicosplaysGalleryExtractor( def __init__(self, match): BaseExtractor.__init__(self, match) self.slug = self.groups[-1] - self.gallery_url = "{}/story/{}/".format(self.root, self.slug) + self.page_url = f"{self.root}/story/{self.slug}/" def _init(self): - self.session.headers["Referer"] = self.gallery_url + self.session.headers["Referer"] = self.page_url def metadata(self, page): title = text.extr(page, "<title>", "") diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py index 7e128a4..e529940 100644 --- a/gallery_dl/extractor/hentaifoundry.py +++ b/gallery_dl/extractor/hentaifoundry.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2023 Mike Fährmann +# Copyright 2015-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,7 +8,7 @@ """Extractors for https://www.hentai-foundry.com/""" -from .common import Extractor, Message +from .common import Extractor, Message, Dispatch from .. import text, util BASE_PATTERN = r"(https?://)?(?:www\.)?hentai-foundry\.com" @@ -25,8 +25,8 @@ class HentaifoundryExtractor(Extractor): per_page = 25 def __init__(self, match): - self.root = (match.group(1) or "https://") + "www.hentai-foundry.com" - self.user = match.group(2) + self.root = (match[1] or "https://") + "www.hentai-foundry.com" + self.user = match[2] Extractor.__init__(self, match) self.page_url = "" self.start_post = 0 @@ -58,7 +58,7 @@ class HentaifoundryExtractor(Extractor): num = self.start_page while True: - page = self.request("{}/page/{}".format(url, num)).text + page = self.request(f"{url}/page/{num}").text yield from text.extract_iter(page, begin, end) if 'class="pager"' not in page or 'class="last hidden"' in page: @@ -192,15 +192,11 @@ class HentaifoundryExtractor(Extractor): self.request(url, method="POST", data=data) -class HentaifoundryUserExtractor(HentaifoundryExtractor): +class HentaifoundryUserExtractor(Dispatch, HentaifoundryExtractor): """Extractor for a hentaifoundry user profile""" - subcategory = "user" pattern = BASE_PATTERN + r"/user/([^/?#]+)/profile" example = "https://www.hentai-foundry.com/user/USER/profile" - def initialize(self): - pass - def items(self): root = self.root user = "/user/" + self.user @@ -224,7 +220,7 @@ class HentaifoundryPicturesExtractor(HentaifoundryExtractor): def __init__(self, match): HentaifoundryExtractor.__init__(self, match) - self.page_url = "{}/pictures/user/{}".format(self.root, self.user) + self.page_url = f"{self.root}/pictures/user/{self.user}" class HentaifoundryScrapsExtractor(HentaifoundryExtractor): @@ -236,8 +232,7 @@ class HentaifoundryScrapsExtractor(HentaifoundryExtractor): def __init__(self, match): HentaifoundryExtractor.__init__(self, match) - self.page_url = "{}/pictures/user/{}/scraps".format( - self.root, self.user) + self.page_url = f"{self.root}/pictures/user/{self.user}/scraps" class HentaifoundryFavoriteExtractor(HentaifoundryExtractor): @@ -250,8 +245,7 @@ class HentaifoundryFavoriteExtractor(HentaifoundryExtractor): def __init__(self, match): HentaifoundryExtractor.__init__(self, match) - self.page_url = "{}/user/{}/faves/pictures".format( 
- self.root, self.user) + self.page_url = f"{self.root}/user/{self.user}/faves/pictures" class HentaifoundryTagExtractor(HentaifoundryExtractor): @@ -264,7 +258,7 @@ class HentaifoundryTagExtractor(HentaifoundryExtractor): def __init__(self, match): HentaifoundryExtractor.__init__(self, match) - self.page_url = "{}/pictures/tagged/{}".format(self.root, self.user) + self.page_url = f"{self.root}/pictures/tagged/{self.user}" def metadata(self): return {"search_tags": self.user} @@ -280,7 +274,7 @@ class HentaifoundryRecentExtractor(HentaifoundryExtractor): def __init__(self, match): HentaifoundryExtractor.__init__(self, match) - self.page_url = "{}/pictures/recent/{}".format(self.root, self.user) + self.page_url = f"{self.root}/pictures/recent/{self.user}" def metadata(self): return {"date": self.user} @@ -310,11 +304,11 @@ class HentaifoundryImageExtractor(HentaifoundryExtractor): def __init__(self, match): HentaifoundryExtractor.__init__(self, match) - self.index = match.group(3) + self.index = match[3] def items(self): - post_url = "{}/pictures/user/{}/{}/?enterAgree=1".format( - self.root, self.user, self.index) + post_url = (f"{self.root}/pictures/user/{self.user}" + f"/{self.index}/?enterAgree=1") image = self._parse_post(post_url) image["user"] = self.user yield Message.Directory, image @@ -336,7 +330,7 @@ class HentaifoundryStoriesExtractor(HentaifoundryExtractor): yield Message.Url, story["src"], story def stories(self): - url = "{}/stories/user/{}".format(self.root, self.user) + url = f"{self.root}/stories/user/{self.user}" return self._pagination(url, '
', '') @@ -351,11 +345,11 @@ class HentaifoundryStoryExtractor(HentaifoundryExtractor): def __init__(self, match): HentaifoundryExtractor.__init__(self, match) - self.index = match.group(3) + self.index = match[3] def items(self): - story_url = "{}/stories/user/{}/{}/x?enterAgree=1".format( - self.root, self.user, self.index) + story_url = (f"{self.root}/stories/user/{self.user}" + f"/{self.index}/x?enterAgree=1") story = self._parse_story(self.request(story_url).text) yield Message.Directory, story yield Message.Url, story["src"], story diff --git a/gallery_dl/extractor/hentaihand.py b/gallery_dl/extractor/hentaihand.py index f3f43c4..f4f9d86 100644 --- a/gallery_dl/extractor/hentaihand.py +++ b/gallery_dl/extractor/hentaihand.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2020-2023 Mike Fährmann +# Copyright 2020-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -20,8 +20,8 @@ class HentaihandGalleryExtractor(GalleryExtractor): example = "https://hentaihand.com/en/comic/TITLE" def __init__(self, match): - self.slug = match.group(1) - url = "{}/api/comics/{}".format(self.root, self.slug) + self.slug = match[1] + url = f"{self.root}/api/comics/{self.slug}" GalleryExtractor.__init__(self, match, url) def metadata(self, page): @@ -44,7 +44,7 @@ class HentaihandGalleryExtractor(GalleryExtractor): return data def images(self, _): - info = self.request(self.gallery_url + "/images").json() + info = self.request_json(self.page_url + "/images") return [(img["source_url"], img) for img in info["images"]] @@ -68,8 +68,8 @@ class HentaihandTagExtractor(Extractor): else: tpl = self.type + "s" - url = "{}/api/{}/{}".format(self.root, tpl, self.key) - tid = self.request(url, notfound=self.type).json()["id"] + url = f"{self.root}/api/{tpl}/{self.key}" + tid = self.request_json(url, notfound=self.type)["id"] url = self.root + "/api/comics" params = { @@ -82,10 +82,10 @@ class HentaihandTagExtractor(Extractor): "duration": "day", } while True: - info = self.request(url, params=params).json() + info = self.request_json(url, params=params) for gallery in info["data"]: - gurl = "{}/en/comic/{}".format(self.root, gallery["slug"]) + gurl = f"{self.root}/en/comic/{gallery['slug']}" gallery["_extractor"] = HentaihandGalleryExtractor yield Message.Queue, gurl, gallery diff --git a/gallery_dl/extractor/hentaihere.py b/gallery_dl/extractor/hentaihere.py index ba9558c..b894d77 100644 --- a/gallery_dl/extractor/hentaihere.py +++ b/gallery_dl/extractor/hentaihere.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2023 Mike Fährmann +# Copyright 2016-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -10,7 +10,6 @@ from .common import ChapterExtractor, MangaExtractor from .. 
import text, util -import re class HentaihereBase(): @@ -27,30 +26,30 @@ class HentaihereChapterExtractor(HentaihereBase, ChapterExtractor): def __init__(self, match): self.manga_id, self.chapter = match.groups() - url = "{}/m/S{}/{}/1".format(self.root, self.manga_id, self.chapter) + url = f"{self.root}/m/S{self.manga_id}/{self.chapter}/1" ChapterExtractor.__init__(self, match, url) def metadata(self, page): title = text.extr(page, "", "") chapter_id = text.extr(page, 'report/C', '"') chapter, sep, minor = self.chapter.partition(".") - pattern = r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by (.+) at " - match = re.match(pattern, title) + match = util.re( + r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by " + r"(.+) at ").match(title) return { - "manga": match.group(1), + "manga": match[1], "manga_id": text.parse_int(self.manga_id), "chapter": text.parse_int(chapter), "chapter_minor": sep + minor, "chapter_id": text.parse_int(chapter_id), - "type": match.group(2), - "title": match.group(3), - "author": match.group(4), + "type": match[2], + "title": match[3], + "author": match[4], "lang": "en", "language": "English", } - @staticmethod - def images(page): + def images(self, page): images = text.extr(page, "var rff_imageList = ", ";") return [ ("https://hentaicdn.com/hentai" + part, None) @@ -73,7 +72,7 @@ class HentaihereMangaExtractor(HentaihereBase, MangaExtractor): mtype, pos = text.extract( page, '[', ']', pos) manga_id = text.parse_int( - self.manga_url.rstrip("/").rpartition("/")[2][1:]) + self.page_url.rstrip("/").rpartition("/")[2][1:]) while True: marker, pos = text.extract( diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py index 286ee38..d3901ac 100644 --- a/gallery_dl/extractor/hentainexus.py +++ b/gallery_dl/extractor/hentainexus.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2024 Mike Fährmann +# Copyright 2019-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -22,8 +22,8 @@ class HentainexusGalleryExtractor(GalleryExtractor): example = "https://hentainexus.com/view/12345" def __init__(self, match): - self.gallery_id = match.group(1) - url = "{}/view/{}".format(self.root, self.gallery_id) + self.gallery_id = match[1] + url = f"{self.root}/view/{self.gallery_id}" GalleryExtractor.__init__(self, match, url) def metadata(self, page): @@ -59,7 +59,7 @@ class HentainexusGalleryExtractor(GalleryExtractor): return data def images(self, _): - url = "{}/read/{}".format(self.root, self.gallery_id) + url = f"{self.root}/read/{self.gallery_id}" page = self.request(url).text imgs = util.json_loads(self._decode(text.extr( page, 'initReader("', '"'))) @@ -78,8 +78,7 @@ class HentainexusGalleryExtractor(GalleryExtractor): pass return results - @staticmethod - def _decode(data): + def _decode(self, data): # https://hentainexus.com/static/js/reader.min.js?r=22 hostname = "hentainexus.com" primes = (2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53) @@ -118,8 +117,7 @@ class HentainexusGalleryExtractor(GalleryExtractor): return result - @staticmethod - def _join_title(data): + def _join_title(self, data): event = data['event'] artist = data['artist'] circle = data['circle'] @@ -137,18 +135,18 @@ class HentainexusGalleryExtractor(GalleryExtractor): jt = '' if event: - jt += '({}) '.format(event) + jt += f'({event}) ' if circle: - jt += '[{} ({})] '.format(circle, artist) + jt += f'[{circle} ({artist})] ' else: - jt += 
'[{}] '.format(artist) + jt += f'[{artist}] ' jt += title if parody.lower() != 'original work': - jt += ' ({})'.format(parody) + jt += f' ({parody})' if book: - jt += ' ({})'.format(book) + jt += f' ({book})' if magazine: - jt += ' ({})'.format(magazine) + jt += f' ({magazine})' return jt diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py index f15aab7..a75eee0 100644 --- a/gallery_dl/extractor/hiperdex.py +++ b/gallery_dl/extractor/hiperdex.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2020-2023 Mike Fährmann +# Copyright 2020-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,9 +9,8 @@ """Extractors for https://hiperdex.com/""" from .common import ChapterExtractor, MangaExtractor -from .. import text +from .. import text, util from ..cache import memcache -import re BASE_PATTERN = (r"((?:https?://)?(?:www\.)?" r"(?:1st)?hiper(?:dex|toon)\d?\.(?:com|net|info|top))") @@ -25,7 +24,7 @@ class HiperdexBase(): @memcache(keyarg=1) def manga_data(self, manga, page=None): if not page: - url = "{}/manga/{}/".format(self.root, manga) + url = f"{self.root}/manga/{manga}/" page = self.request(url).text extr = text.extract_from(page) @@ -80,10 +79,10 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor): return self.chapter_data(self.chapter) def images(self, page): + pattern = util.re(r'id="image-\d+"\s+(?:data-)?src="([^"]+)') return [ (url.strip(), None) - for url in re.findall( - r'id="image-\d+"\s+(?:data-)?src="([^"]+)', page) + for url in pattern.findall(page) ] @@ -100,14 +99,14 @@ class HiperdexMangaExtractor(HiperdexBase, MangaExtractor): def chapters(self, page): data = self.manga_data(self.manga, page) - self.manga_url = url = data["url"] + self.page_url = url = data["url"] - url = self.manga_url + "ajax/chapters/" + url = self.page_url + "ajax/chapters/" headers = { "Accept": "*/*", "X-Requested-With": "XMLHttpRequest", "Origin": self.root, - "Referer": "https://" + text.quote(self.manga_url[8:]), + "Referer": "https://" + text.quote(self.page_url[8:]), } html = self.request(url, method="POST", headers=headers).text @@ -130,8 +129,8 @@ class HiperdexArtistExtractor(HiperdexBase, MangaExtractor): example = "https://hiperdex.com/manga-artist/NAME/" def __init__(self, match): - self.root = text.ensure_http_scheme(match.group(1)) - MangaExtractor.__init__(self, match, self.root + match.group(2) + "/") + self.root = text.ensure_http_scheme(match[1]) + MangaExtractor.__init__(self, match, self.root + match[2] + "/") def chapters(self, page): results = [] diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index 086b77c..82bed80 100644 --- a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2023 Mike Fährmann +# Copyright 2015-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -13,7 +13,6 @@ from .nozomi import decode_nozomi from ..cache import memcache from .. 
import text, util import string -import re class HitomiExtractor(Extractor): @@ -22,6 +21,27 @@ class HitomiExtractor(Extractor): root = "https://hitomi.la" domain = "gold-usergeneratedcontent.net" + def load_nozomi(self, query, language="all", headers=None): + ns, _, tag = query.strip().partition(":") + + if ns == "female" or ns == "male": + ns = "tag/" + tag = query + elif ns == "language": + ns = "" + language = tag + tag = "index" + else: + ns = f"{ns}/" + + url = (f"https://ltn.{self.domain}/n/{ns}" + f"/{tag.replace('_', ' ')}-{language}.nozomi") + if headers is None: + headers = {} + headers["Origin"] = self.root + headers["Referer"] = f"{self.root}/" + return decode_nozomi(self.request(url, headers=headers).content) + class HitomiGalleryExtractor(HitomiExtractor, GalleryExtractor): """Extractor for hitomi.la galleries""" @@ -33,23 +53,19 @@ class HitomiGalleryExtractor(HitomiExtractor, GalleryExtractor): def __init__(self, match): GalleryExtractor.__init__(self, match, False) self.gid = gid = self.groups[0] - self.gallery_url = "https://ltn.{}/galleries/{}.js".format( - self.domain, gid) + self.page_url = f"https://ltn.{self.domain}/galleries/{gid}.js" def _init(self): - self.session.headers["Referer"] = "{}/reader/{}.html".format( - self.root, self.gid) + self.session.headers["Referer"] = f"{self.root}/reader/{self.gid}.html" def metadata(self, page): self.info = info = util.json_loads(page.partition("=")[2]) iget = info.get - language = iget("language") - if language: + if language := iget("language"): language = language.capitalize() - date = iget("date") - if date: + if date := iget("date"): date += ":00" tags = [] @@ -83,7 +99,7 @@ class HitomiGalleryExtractor(HitomiExtractor, GalleryExtractor): fmt = ext = self.config("format") or "webp" check = (fmt != "webp") - result = [] + results = [] for image in self.info["files"]: if check: ext = fmt if image.get("has" + fmt) else "webp" @@ -94,12 +110,10 @@ class HitomiGalleryExtractor(HitomiExtractor, GalleryExtractor): # https://ltn.gold-usergeneratedcontent.net/common.js inum = int(ihash[-1] + ihash[-3:-1], 16) - url = "https://{}{}.{}/{}/{}/{}.{}".format( - ext[0], gg_m.get(inum, gg_default) + 1, self.domain, - gg_b, inum, ihash, ext, - ) - result.append((url, idata)) - return result + url = (f"https://{ext[0]}{gg_m.get(inum, gg_default) + 1}." 
+ f"{self.domain}/{gg_b}/{inum}/{ihash}.{ext}") + results.append((url, idata)) + return results class HitomiTagExtractor(HitomiExtractor): @@ -123,8 +137,7 @@ class HitomiTagExtractor(HitomiExtractor): "_extractor": HitomiGalleryExtractor, "search_tags": text.unquote(self.tag.rpartition("-")[0]), } - nozomi_url = "https://ltn.{}/{}/{}.nozomi".format( - self.domain, self.type, self.tag) + nozomi_url = f"https://ltn.{self.domain}/{self.type}/{self.tag}.nozomi" headers = { "Origin": self.root, "Cache-Control": "max-age=0", @@ -133,14 +146,13 @@ class HitomiTagExtractor(HitomiExtractor): offset = 0 total = None while True: - headers["Referer"] = "{}/{}/{}.html?page={}".format( - self.root, self.type, self.tag, offset // 100 + 1) - headers["Range"] = "bytes={}-{}".format(offset, offset+99) + headers["Referer"] = (f"{self.root}/{self.type}/{self.tag}.html" + f"?page={offset // 100 + 1}") + headers["Range"] = f"bytes={offset}-{offset + 99}" response = self.request(nozomi_url, headers=headers) for gallery_id in decode_nozomi(response.content): - gallery_url = "{}/galleries/{}.html".format( - self.root, gallery_id) + gallery_url = f"{self.root}/galleries/{gallery_id}.html" yield Message.Queue, gallery_url, data offset += 100 @@ -163,8 +175,8 @@ class HitomiIndexExtractor(HitomiTagExtractor): def items(self): data = {"_extractor": HitomiGalleryExtractor} - nozomi_url = "https://ltn.{}/{}-{}.nozomi".format( - self.domain, self.tag, self.language) + nozomi_url = (f"https://ltn.{self.domain}" + f"/{self.tag}-{self.language}.nozomi") headers = { "Origin": self.root, "Cache-Control": "max-age=0", @@ -173,14 +185,13 @@ class HitomiIndexExtractor(HitomiTagExtractor): offset = 0 total = None while True: - headers["Referer"] = "{}/{}-{}.html?page={}".format( - self.root, self.tag, self.language, offset // 100 + 1) - headers["Range"] = "bytes={}-{}".format(offset, offset+99) + headers["Referer"] = (f"{self.root}/{self.tag}-{self.language}" + f".html?page={offset // 100 + 1}") + headers["Range"] = f"bytes={offset}-{offset + 99}" response = self.request(nozomi_url, headers=headers) for gallery_id in decode_nozomi(response.content): - gallery_url = "{}/galleries/{}.html".format( - self.root, gallery_id) + gallery_url = f"{self.root}/galleries/{gallery_id}.html" yield Message.Queue, gallery_url, data offset += 100 @@ -194,60 +205,46 @@ class HitomiIndexExtractor(HitomiTagExtractor): class HitomiSearchExtractor(HitomiExtractor): """Extractor for galleries from multiple tag searches on hitomi.la""" subcategory = "search" - pattern = r"(?:https?://)?hitomi\.la/search\.html\?([^/?#]+)" + pattern = r"(?:https?://)?hitomi\.la/search\.html\?([^#]+)" example = "https://hitomi.la/search.html?QUERY" - def __init__(self, match): - Extractor.__init__(self, match) - self.query = match.group(1) - self.tags = text.unquote(self.query) - def items(self): + tags = text.unquote(self.groups[0]) + data = { "_extractor": HitomiGalleryExtractor, - "search_tags": self.tags, + "search_tags": tags, } - results = [self.get_nozomi_items(tag) for tag in self.tags.split(" ")] - intersects = set.intersection(*results) - for gallery_id in sorted(intersects, reverse=True): - gallery_url = "{}/galleries/{}.html".format( - self.root, gallery_id) + for gallery_id in self.gallery_ids(tags): + gallery_url = f"{self.root}/galleries/{gallery_id}.html" yield Message.Queue, gallery_url, data - def get_nozomi_items(self, full_tag): - area, tag, language = self.get_nozomi_args(full_tag) + def gallery_ids(self, tags): + result = None + positive = [] + 
negative = [] - if area: - nozomi_url = "https://ltn.{}/n/{}/{}-{}.nozomi".format( - self.domain, area, tag, language) - else: - nozomi_url = "https://ltn.{}/n/{}-{}.nozomi".format( - self.domain, tag, language) + for tag in tags.split(): + if tag[0] == "-": + negative.append(tag[1:]) + else: + positive.append(tag) - headers = { - "Origin": self.root, - "Cache-Control": "max-age=0", - "Referer": "{}/search.html?{}".format(self.root, self.query), - } - - response = self.request(nozomi_url, headers=headers) - return set(decode_nozomi(response.content)) + for tag in positive: + ids = self.load_nozomi(tag) + if result is None: + result = set(ids) + else: + result.intersection_update(ids) - def get_nozomi_args(self, query): - ns, _, tag = query.strip().partition(":") - area = ns - language = "all" - - if ns == "female" or ns == "male": - area = "tag" - tag = query - elif ns == "language": - area = None - language = tag - tag = "index" + if result is None: + # result = set(self.load_nozomi("index")) + result = set(self.load_nozomi("language:all")) + for tag in negative: + result.difference_update(self.load_nozomi(tag)) - return area, tag.replace("_", " "), language + return sorted(result, reverse=True) if result else () @memcache(maxage=1800) @@ -257,8 +254,8 @@ def _parse_gg(extr): m = {} keys = [] - for match in re.finditer( - r"case\s+(\d+):(?:\s*o\s*=\s*(\d+))?", page): + for match in util.re_compile( + r"case\s+(\d+):(?:\s*o\s*=\s*(\d+))?").finditer(page): key, value = match.groups() keys.append(int(key)) @@ -268,11 +265,11 @@ def _parse_gg(extr): m[key] = value keys.clear() - for match in re.finditer( - r"if\s+\(g\s*===?\s*(\d+)\)[\s{]*o\s*=\s*(\d+)", page): - m[int(match.group(1))] = int(match.group(2)) + for match in util.re_compile( + r"if\s+\(g\s*===?\s*(\d+)\)[\s{]*o\s*=\s*(\d+)").finditer(page): + m[int(match[1])] = int(match[2]) - d = re.search(r"(?:var\s|default:)\s*o\s*=\s*(\d+)", page) - b = re.search(r"b:\s*[\"'](.+)[\"']", page) + d = util.re_compile(r"(?:var\s|default:)\s*o\s*=\s*(\d+)").search(page) + b = util.re_compile(r"b:\s*[\"'](.+)[\"']").search(page) - return m, b.group(1).strip("/"), int(d.group(1)) if d else 0 + return m, b[1].strip("/"), int(d[1]) if d else 0 diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py index ddfc54b..587d88c 100644 --- a/gallery_dl/extractor/hotleak.py +++ b/gallery_dl/extractor/hotleak.py @@ -70,8 +70,7 @@ class HotleakPostExtractor(HotleakExtractor): self.creator, self.type, self.id = match.groups() def posts(self): - url = "{}/{}/{}/{}".format( - self.root, self.creator, self.type, self.id) + url = f"{self.root}/{self.creator}/{self.type}/{self.id}" page = self.request(url).text page = text.extr( page, '
', '') @@ -103,10 +102,10 @@ class HotleakCreatorExtractor(HotleakExtractor): def __init__(self, match): HotleakExtractor.__init__(self, match) - self.creator = match.group(1) + self.creator = match[1] def posts(self): - url = "{}/{}".format(self.root, self.creator) + url = f"{self.root}/{self.creator}" return self._pagination(url) def _pagination(self, url): @@ -159,7 +158,7 @@ class HotleakCategoryExtractor(HotleakExtractor): self._category, self.params = match.groups() def items(self): - url = "{}/{}".format(self.root, self._category) + url = f"{self.root}/{self._category}" if self._category in ("hot", "creators"): data = {"_extractor": HotleakCreatorExtractor} @@ -178,7 +177,7 @@ class HotleakSearchExtractor(HotleakExtractor): def __init__(self, match): HotleakExtractor.__init__(self, match) - self.params = match.group(1) + self.params = match[1] def items(self): data = {"_extractor": HotleakCreatorExtractor} diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py index 8f4a10c..075e1f6 100644 --- a/gallery_dl/extractor/idolcomplex.py +++ b/gallery_dl/extractor/idolcomplex.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2023 Mike Fährmann +# Copyright 2018-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -108,8 +108,7 @@ class IdolcomplexExtractor(SankakuExtractor): pid = extr(">Post ID:", "<") created = extr(' title="', '"') - file_url = extr('>Original:', 'id=') - if file_url: + if file_url := extr('>Original:', 'id='): file_url = extr(' href="', '"') width = extr(">", "x") height = extr("", " ") @@ -159,7 +158,7 @@ class IdolcomplexTagExtractor(IdolcomplexExtractor): def __init__(self, match): IdolcomplexExtractor.__init__(self, match) - query = text.parse_query(match.group(1)) + query = text.parse_query(match[1]) self.tags = text.unquote(query.get("tags", "").replace("+", " ")) self.start_page = text.parse_int(query.get("page"), 1) self.next = text.parse_int(query.get("next"), 0) @@ -184,7 +183,7 @@ class IdolcomplexTagExtractor(IdolcomplexExtractor): tags = self.tags.split() if not self.logged_in and len(tags) > 4: - raise exception.StopExtraction( + raise exception.AbortExtraction( "Non-members can only search up to 4 tags at once") return {"search_tags": " ".join(tags)} diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py index 68360e9..171feea 100644 --- a/gallery_dl/extractor/imagebam.py +++ b/gallery_dl/extractor/imagebam.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2023 Mike Fährmann +# Copyright 2014-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,8 +9,7 @@ """Extractors for https://www.imagebam.com/""" from .common import Extractor, Message -from .. import text -import re +from .. 
import text, util class ImagebamExtractor(Extractor): @@ -20,7 +19,7 @@ class ImagebamExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.path = match.group(1) + self.path = match[1] def _init(self): self.cookies.set("nsfw_inter", "1", domain="www.imagebam.com") @@ -64,22 +63,19 @@ class ImagebamGalleryExtractor(ImagebamExtractor): image.update(data) yield Message.Url, image["url"], image - @staticmethod - def metadata(page): + def metadata(self, page): return {"title": text.unescape(text.extr( page, 'id="gallery-name">', '<').strip())} def images(self, page): - findall = re.compile(r'")[2].split()) - raise exception.StopExtraction("'%s'", msg) + raise exception.AbortExtraction(f"'{msg}'") self.log.warning("HTTP redirect to %s", response.url) return response @@ -45,11 +44,11 @@ class ImagefapGalleryExtractor(ImagefapExtractor): def __init__(self, match): ImagefapExtractor.__init__(self, match) - self.gid = match.group(1) + self.gid = match[1] self.image_id = "" def items(self): - url = "{}/gallery/{}".format(self.root, self.gid) + url = f"{self.root}/gallery/{self.gid}" page = self.request(url).text data = self.get_job_metadata(page) yield Message.Directory, data @@ -81,12 +80,12 @@ class ImagefapGalleryExtractor(ImagefapExtractor): def get_images(self): """Collect image-urls and -metadata""" - url = "{}/photo/{}/".format(self.root, self.image_id) + url = f"{self.root}/photo/{self.image_id}/" params = {"gid": self.gid, "idx": 0, "partial": "true"} headers = { "Content-Type": "application/x-www-form-urlencoded", "X-Requested-With": "XMLHttpRequest", - "Referer": "{}?pgid=&gid={}&page=0".format(url, self.image_id) + "Referer": f"{url}?pgid=&gid={self.image_id}&page=0" } num = 0 @@ -116,7 +115,7 @@ class ImagefapImageExtractor(ImagefapExtractor): def __init__(self, match): ImagefapExtractor.__init__(self, match) - self.image_id = match.group(1) + self.image_id = match[1] def items(self): url, data = self.get_image() @@ -124,7 +123,7 @@ class ImagefapImageExtractor(ImagefapExtractor): yield Message.Url, url, data def get_image(self): - url = "{}/photo/{}/".format(self.root, self.image_id) + url = f"{self.root}/photo/{self.image_id}/" page = self.request(url).text url, pos = text.extract( @@ -161,7 +160,7 @@ class ImagefapFolderExtractor(ImagefapExtractor): def items(self): for gallery_id, name, folder in self.galleries(self.folder_id): - url = "{}/gallery/{}".format(self.root, gallery_id) + url = f"{self.root}/gallery/{gallery_id}" data = { "gallery_id": gallery_id, "title" : text.unescape(name), @@ -175,14 +174,13 @@ class ImagefapFolderExtractor(ImagefapExtractor): if folder_id == "-1": folder_name = "Uncategorized" if self._id: - url = "{}/usergallery.php?userid={}&folderid=-1".format( - self.root, self.user) + url = (f"{self.root}/usergallery.php" + f"?userid={self.user}&folderid=-1") else: - url = "{}/profile/{}/galleries?folderid=-1".format( - self.root, self.user) + url = f"{self.root}/profile/{self.user}/galleries?folderid=-1" else: folder_name = None - url = "{}/organizer/{}/".format(self.root, folder_id) + url = f"{self.root}/organizer/{folder_id}/" params = {"page": 0} extr = text.extract_from(self.request(url, params=params).text) @@ -222,19 +220,17 @@ class ImagefapUserExtractor(ImagefapExtractor): for folder_id in self.folders(): if folder_id == "-1": - url = "{}/profile/{}/galleries?folderid=-1".format( - self.root, self.user) + url = f"{self.root}/profile/{self.user}/galleries?folderid=-1" else: - url = "{}/organizer/{}/".format(self.root, 
folder_id) + url = f"{self.root}/organizer/{folder_id}/" yield Message.Queue, url, data def folders(self): """Return a list of folder IDs of a user""" if self.user: - url = "{}/profile/{}/galleries".format(self.root, self.user) + url = f"{self.root}/profile/{self.user}/galleries" else: - url = "{}/usergallery.php?userid={}".format( - self.root, self.user_id) + url = f"{self.root}/usergallery.php?userid={self.user_id}" response = self.request(url) self.user = response.url.split("/")[-2] diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py index d6b36cb..0e5ce7e 100644 --- a/gallery_dl/extractor/imagehosts.py +++ b/gallery_dl/extractor/imagehosts.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2023 Mike Fährmann +# Copyright 2016-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -23,12 +23,12 @@ class ImagehostImageExtractor(Extractor): _params = None _cookies = None _encoding = None + _validate = None def __init__(self, match): Extractor.__init__(self, match) - self.page_url = "http{}://{}".format( - "s" if self._https else "", match.group(1)) - self.token = match.group(2) + self.page_url = f"http{'s' if self._https else ''}://{match[1]}" + self.token = match[2] if self._params == "simple": self._params = { @@ -57,6 +57,8 @@ class ImagehostImageExtractor(Extractor): data.update(self.metadata(page)) if self._https and url.startswith("http:"): url = "https:" + url[5:] + if self._validate is not None: + data["_http_validate"] = self._validate yield Message.Directory, data yield Message.Url, url, data @@ -164,6 +166,14 @@ class ImagevenueImageExtractor(ImagehostImageExtractor): filename, pos = text.extract(page, 'alt="', '"', pos) return url, text.unescape(filename) + def _validate(self, response): + hget = response.headers.get + return not ( + hget("content-length") == "14396" and + hget("content-type") == "image/jpeg" and + hget("last-modified") == "Mon, 04 May 2020 07:19:52 GMT" + ) + class ImagetwistImageExtractor(ImagehostImageExtractor): """Extractor for single images from imagetwist.com""" @@ -200,6 +210,26 @@ class ImagetwistGalleryExtractor(ImagehostImageExtractor): yield Message.Queue, root + path, data +class ImgadultImageExtractor(ImagehostImageExtractor): + """Extractor for single images from imgadult.com""" + category = "imgadult" + _cookies = {"img_i_d": "1"} + pattern = r"(?:https?://)?((?:www\.)?imgadult\.com/img-([0-9a-f]+)\.html)" + example = "https://imgadult.com/img-0123456789abc.html" + + def get_info(self, page): + url , pos = text.extract(page, "' src='", "'") + name, pos = text.extract(page, "alt='", "'", pos) + + if name: + name, _, rhs = name.rpartition(" image hosted at ImgAdult.com") + if not name: + name = rhs + name = text.unescape(name) + + return url, name + + class ImgspiceImageExtractor(ImagehostImageExtractor): """Extractor for single images from imgspice.com""" category = "imgspice" diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py index b926cb2..e6abdeb 100644 --- a/gallery_dl/extractor/imgbb.py +++ b/gallery_dl/extractor/imgbb.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2023 Mike Fährmann +# Copyright 2019-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -112,7 +112,7 @@ class ImgbbExtractor(Extractor): params["page"] += 1 elif not seek or 
'class="pagination-next"' not in page: return - data = self.request(endpoint, method="POST", data=params).json() + data = self.request_json(endpoint, method="POST", data=params) page = data["html"] @@ -126,8 +126,8 @@ class ImgbbAlbumExtractor(ImgbbExtractor): def __init__(self, match): ImgbbExtractor.__init__(self, match) self.album_name = None - self.album_id = match.group(1) - self.sort = text.parse_query(match.group(2)).get("sort", "date_desc") + self.album_id = match[1] + self.sort = text.parse_query(match[2]).get("sort", "date_desc") self.page_url = "https://ibb.co/album/" + self.album_id def metadata(self, page): @@ -162,9 +162,9 @@ class ImgbbUserExtractor(ImgbbExtractor): def __init__(self, match): ImgbbExtractor.__init__(self, match) - self.user = match.group(1) - self.sort = text.parse_query(match.group(2)).get("sort", "date_desc") - self.page_url = "https://{}.imgbb.com/".format(self.user) + self.user = match[1] + self.sort = text.parse_query(match[2]).get("sort", "date_desc") + self.page_url = f"https://{self.user}.imgbb.com/" def metadata(self, page): user = self._extract_user(page) @@ -191,7 +191,7 @@ class ImgbbImageExtractor(ImgbbExtractor): def __init__(self, match): ImgbbExtractor.__init__(self, match) - self.image_id = match.group(1) + self.image_id = match[1] def items(self): url = "https://ibb.co/" + self.image_id diff --git a/gallery_dl/extractor/imgbox.py b/gallery_dl/extractor/imgbox.py index 7069717..5def88d 100644 --- a/gallery_dl/extractor/imgbox.py +++ b/gallery_dl/extractor/imgbox.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2023 Mike Fährmann +# Copyright 2014-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,8 +9,7 @@ """Extractors for https://imgbox.com/""" from .common import Extractor, Message, AsynchronousMixin -from .. import text, exception -import re +from .. import text, util, exception class ImgboxExtractor(Extractor): @@ -31,18 +30,15 @@ class ImgboxExtractor(Extractor): text.nameext_from_url(imgdata["filename"], imgdata) yield Message.Url, self.get_image_url(imgpage), imgdata - @staticmethod - def get_job_metadata(): + def get_job_metadata(self): """Collect metadata for extractor-job""" return {} - @staticmethod - def get_image_keys(): + def get_image_keys(self): """Return an iterable containing all image-keys""" return [] - @staticmethod - def get_image_metadata(page): + def get_image_metadata(self, page): """Collect metadata for a downloadable file""" return text.extract_all(page, ( ("num" , '   ', ' of '), @@ -50,8 +46,7 @@ class ImgboxExtractor(Extractor): ("filename" , ' title="', '"'), ))[0] - @staticmethod - def get_image_url(page): + def get_image_url(self, page): """Extract download-url""" return text.extr(page, 'property="og:image" content="', '"') @@ -67,14 +62,15 @@ class ImgboxGalleryExtractor(AsynchronousMixin, ImgboxExtractor): def __init__(self, match): ImgboxExtractor.__init__(self, match) - self.gallery_key = match.group(1) + self.gallery_key = match[1] self.image_keys = [] def get_job_metadata(self): page = self.request(self.root + "/g/" + self.gallery_key).text if "The specified gallery could not be found." in page: raise exception.NotFoundError("gallery") - self.image_keys = re.findall(r'', page)
+        self.image_keys = util.re(
+            r'<a href="/([^"]+)"><img alt="').findall(page)
 
         title = text.extr(page, "<h2>", "</h2>")
         title, _, count = title.rpartition(" - ")
@@ -97,14 +93,13 @@ class ImgboxImageExtractor(ImgboxExtractor):
 
     def __init__(self, match):
         ImgboxExtractor.__init__(self, match)
-        self.image_key = match.group(1)
+        self.image_key = match[1]
 
     def get_image_keys(self):
         return (self.image_key,)
 
-    @staticmethod
-    def get_image_metadata(page):
-        data = ImgboxExtractor.get_image_metadata(page)
+    def get_image_metadata(self, page):
+        data = ImgboxExtractor.get_image_metadata(self, page)
         if not data["filename"]:
             raise exception.NotFoundError("image")
         return data
diff --git a/gallery_dl/extractor/imgth.py b/gallery_dl/extractor/imgth.py
index 3aa7922..7e5e6cf 100644
--- a/gallery_dl/extractor/imgth.py
+++ b/gallery_dl/extractor/imgth.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2015-2023 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -20,8 +20,8 @@ class ImgthGalleryExtractor(GalleryExtractor):
     example = "https://imgth.com/gallery/123/TITLE"
 
     def __init__(self, match):
-        self.gallery_id = gid = match.group(1)
-        url = "{}/gallery/{}/g/".format(self.root, gid)
+        self.gallery_id = gid = match[1]
+        url = f"{self.root}/gallery/{gid}/g/"
         GalleryExtractor.__init__(self, match, url)
 
     def metadata(self, page):
@@ -45,12 +45,11 @@
             thumbs = text.extr(page, '
    ', '
') for url in text.extract_iter(thumbs, '' not in page: return pnum += 1 - url = "{}/gallery/{}/g/page/{}".format( - self.root, self.gallery_id, pnum) + url = f"{self.root}/gallery/{self.gallery_id}/g/page/{pnum}" page = self.request(url).text diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index 20f8ea4..1ac76e0 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2023 Mike Fährmann +# Copyright 2015-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -21,7 +21,7 @@ class ImgurExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.key = match.group(1) + self.key = match[1] def _init(self): self.api = ImgurAPI(self) @@ -36,8 +36,8 @@ class ImgurExtractor(Extractor): elif image["is_animated"] and self.mp4 and image["ext"] == "gif": image["ext"] = "mp4" - image["url"] = url = "https://i.imgur.com/{}.{}".format( - image["id"], image["ext"]) + image["url"] = url = \ + f"https://i.imgur.com/{image['id']}.{image['ext']}" image["date"] = text.parse_datetime(image["created_at"]) image["_http_validate"] = self._validate text.nameext_from_url(url, image) @@ -131,10 +131,10 @@ class ImgurGalleryExtractor(ImgurExtractor): def items(self): if self.api.gallery(self.key)["is_album"]: - url = "{}/a/{}".format(self.root, self.key) + url = f"{self.root}/a/{self.key}" extr = ImgurAlbumExtractor else: - url = "{}/{}".format(self.root, self.key) + url = f"{self.root}/{self.key}" extr = ImgurImageExtractor yield Message.Queue, url, {"_extractor": extr} @@ -168,7 +168,7 @@ class ImgurFavoriteFolderExtractor(ImgurExtractor): def __init__(self, match): ImgurExtractor.__init__(self, match) - self.folder_id = match.group(2) + self.folder_id = match[2] def items(self): return self._items_queue(self.api.account_favorites_folder( @@ -234,16 +234,15 @@ class ImgurAPI(): self.headers = {"Authorization": "Client-ID " + self.client_id} def account_submissions(self, account): - endpoint = "/3/account/{}/submissions".format(account) + endpoint = f"/3/account/{account}/submissions" return self._pagination(endpoint) def account_favorites(self, account): - endpoint = "/3/account/{}/gallery_favorites".format(account) + endpoint = f"/3/account/{account}/gallery_favorites" return self._pagination(endpoint) def account_favorites_folder(self, account, folder_id): - endpoint = "/3/account/{}/folders/{}/favorites".format( - account, folder_id) + endpoint = f"/3/account/{account}/folders/{folder_id}/favorites" return self._pagination_v2(endpoint) def accounts_me_allposts(self): @@ -270,11 +269,11 @@ class ImgurAPI(): return self._pagination(endpoint, params) def gallery_subreddit(self, subreddit): - endpoint = "/3/gallery/r/{}".format(subreddit) + endpoint = f"/3/gallery/r/{subreddit}" return self._pagination(endpoint) def gallery_tag(self, tag): - endpoint = "/3/gallery/t/{}".format(tag) + endpoint = f"/3/gallery/t/{tag}" return self._pagination(endpoint, key="items") def image(self, image_hash): @@ -294,10 +293,9 @@ class ImgurAPI(): def _call(self, endpoint, params=None, headers=None): while True: try: - return self.extractor.request( + return self.extractor.request_json( "https://api.imgur.com" + endpoint, - params=params, headers=(headers or self.headers), - ).json() + params=params, headers=(headers or self.headers)) except exception.HttpError as exc: if exc.status not in (403, 
429) or \ b"capacity" not in exc.response.content: @@ -308,7 +306,7 @@ class ImgurAPI(): num = 0 while True: - data = self._call("{}/{}".format(endpoint, num), params)["data"] + data = self._call(f"{endpoint}/{num}", params)["data"] if key: data = data[key] if not data: diff --git a/gallery_dl/extractor/imhentai.py b/gallery_dl/extractor/imhentai.py index 1b0fba3..5ad1c30 100644 --- a/gallery_dl/extractor/imhentai.py +++ b/gallery_dl/extractor/imhentai.py @@ -38,7 +38,7 @@ class ImhentaiExtractor(BaseExtractor): yield Message.Queue, base + gallery_id, data prev = gallery_id - href = text.rextract(page, "class='page-link' href='", "'")[0] + href = text.rextr(page, "class='page-link' href='", "'") if not href or href == "#": return if href[0] == "/": @@ -85,7 +85,7 @@ class ImhentaiGalleryExtractor(ImhentaiExtractor, GalleryExtractor): def __init__(self, match): ImhentaiExtractor.__init__(self, match) self.gallery_id = self.groups[-1] - self.gallery_url = "{}/gallery/{}/".format(self.root, self.gallery_id) + self.page_url = f"{self.root}/gallery/{self.gallery_id}/" def metadata(self, page): extr = text.extract_from(page) diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py index 47e071a..45ae52e 100644 --- a/gallery_dl/extractor/inkbunny.py +++ b/gallery_dl/extractor/inkbunny.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2020-2023 Mike Fährmann +# Copyright 2020-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -109,12 +109,11 @@ class InkbunnyPoolExtractor(InkbunnyExtractor): def __init__(self, match): InkbunnyExtractor.__init__(self, match) - pid = match.group(1) - if pid: + if pid := match[1]: self.pool_id = pid self.orderby = "pool_order" else: - params = text.parse_query(match.group(2)) + params = text.parse_query(match[2]) self.pool_id = params.get("pool_id") self.orderby = params.get("orderby", "pool_order") @@ -142,19 +141,18 @@ class InkbunnyFavoriteExtractor(InkbunnyExtractor): def __init__(self, match): InkbunnyExtractor.__init__(self, match) - uid = match.group(1) - if uid: + if uid := match[1]: self.user_id = uid self.orderby = self.config("orderby", "fav_datetime") else: - params = text.parse_query(match.group(2)) + params = text.parse_query(match[2]) self.user_id = params.get("user_id") self.orderby = params.get("orderby", "fav_datetime") def metadata(self): # Lookup fav user ID as username - url = "{}/userfavorites_process.php?favs_user_id={}".format( - self.root, self.user_id) + url = (f"{self.root}/userfavorites_process.php" + f"?favs_user_id={self.user_id}") page = self.request(url).text user_link = text.extr(page, '

05}" + p2[4:] - return [(ufmt.format(num), None) for num in range(1, count + 1)] + p2 = p2[4:] + return [(f"{p1}/image{i:>05}{p2}", None) for i in range(1, count + 1)] def images_v2(self, page): + base = f"{self.root}/showimage/" results = [] while True: for path in text.extract_iter( page, ' class="picbox">*", extr.subcategory)) - cfgpath.append((extr.category, extr.subcategory)) - self.parents = parents - else: - self.parents = () - else: - self.parents = () - - if extr.basecategory: - if not cfgpath: - cfgpath.append((extr.category, extr.subcategory)) - cfgpath.append((extr.basecategory, extr.subcategory)) - - if cfgpath: + if cfgpath := self._build_config_path(parent): + if isinstance(cfgpath, list): + extr.config = extr._config_shared + extr.config_accumulate = extr._config_shared_accumulate extr._cfgpath = cfgpath - extr.config = extr._config_shared - extr.config_accumulate = extr._config_shared_accumulate - actions = extr.config("actions") - if actions: - from .actions import LoggerAdapter, parse + if actions := extr.config("actions"): + from .actions import LoggerAdapter, parse_logging self._logger_adapter = LoggerAdapter - self._logger_actions = parse(actions) + self._logger_actions = parse_logging(actions) path_proxy = output.PathfmtProxy(self) self._logger_extra = { @@ -93,16 +69,6 @@ class Job(): extr.log = self._wrap_logger(extr.log) extr.log.debug("Using %s for '%s'", extr.__class__.__name__, extr.url) - # data from parent job - if parent: - pextr = parent.extractor - - # transfer (sub)category - if pextr.config("category-transfer", pextr.categorytransfer): - extr._cfgpath = pextr._cfgpath - extr.category = pextr.category - extr.subcategory = pextr.subcategory - self.metadata_url = extr.config2("metadata-url", "url-metadata") self.metadata_http = extr.config2("metadata-http", "http-metadata") metadata_path = extr.config2("metadata-path", "path-metadata") @@ -121,8 +87,7 @@ class Job(): "current_git_head": util.git_head() } # user-supplied metadata - kwdict = extr.config("keywords") - if kwdict: + if kwdict := extr.config("keywords"): if extr.config("keywords-eval"): self.kwdict_eval = [] for key, value in kwdict.items(): @@ -134,6 +99,41 @@ class Job(): else: self.kwdict.update(kwdict) + def _build_config_path(self, parent): + extr = self.extractor + cfgpath = [] + + if parent: + pextr = parent.extractor + if extr.category == pextr.category or \ + extr.category in parent.parents: + parents = parent.parents + else: + parents = parent.parents + (pextr.category,) + self.parents = parents + + if pextr.config("category-transfer", pextr.categorytransfer): + extr.category = pextr.category + extr.subcategory = pextr.subcategory + return pextr._cfgpath + + if parents: + sub = extr.subcategory + for category in parents: + cat = f"{category}>{extr.category}" + cfgpath.append((cat, sub)) + cfgpath.append((category + ">*", sub)) + cfgpath.append((extr.category, sub)) + else: + self.parents = () + + if extr.basecategory: + if not cfgpath: + cfgpath.append((extr.category, extr.subcategory)) + cfgpath.append((extr.basecategory, extr.subcategory)) + + return cfgpath + def run(self): """Execute or run the job""" extractor = self.extractor @@ -151,9 +151,10 @@ class Job(): try: for msg in extractor: self.dispatch(msg) - except exception.StopExtraction as exc: - if exc.message: - log.error(exc.message) + except exception.StopExtraction: + pass + except exception.AbortExtraction as exc: + log.error(exc.message) self.status |= exc.code except (exception.TerminateExtraction, 
exception.RestartExtraction): raise @@ -162,10 +163,14 @@ class Job(): log.debug("", exc_info=exc) self.status |= exc.code except OSError as exc: - log.error("Unable to download data: %s: %s", - exc.__class__.__name__, exc) log.debug("", exc_info=exc) - self.status |= 128 + name = exc.__class__.__name__ + if name == "JSONDecodeError": + log.error("Failed to parse JSON data: %s: %s", name, exc) + self.status |= 1 + else: # regular OSError + log.error("Unable to download data: %s: %s", name, exc) + self.status |= 128 except Exception as exc: log.error(("An unexpected error occurred: %s - %s. " "Please run gallery-dl again with the --verbose flag, " @@ -184,6 +189,8 @@ class Job(): self.handle_finalize() extractor.finalize() + if s := extractor.status: + self.status |= s return self.status def dispatch(self, msg): @@ -195,6 +202,8 @@ class Job(): if self.pred_url(url, kwdict): self.update_kwdict(kwdict) self.handle_url(url, kwdict) + if FLAGS.FILE is not None: + FLAGS.process("FILE") elif msg[0] == Message.Directory: self.update_kwdict(msg[1]) @@ -205,7 +214,10 @@ class Job(): if self.metadata_url: kwdict[self.metadata_url] = url if self.pred_queue(url, kwdict): + self.update_kwdict(kwdict) self.handle_queue(url, kwdict) + if FLAGS.CHILD is not None: + FLAGS.process("CHILD") def handle_url(self, url, kwdict): """Handle Message.Url""" @@ -226,6 +238,8 @@ class Job(): kwdict["subcategory"] = extr.subcategory if self.metadata_http: kwdict.pop(self.metadata_http, None) + if extr.kwdict: + kwdict.update(extr.kwdict) if self.kwdict: kwdict.update(self.kwdict) if self.kwdict_eval: @@ -243,8 +257,7 @@ class Job(): if self.extractor.config(target + "-unique"): predicates.append(util.UniquePredicate()) - pfilter = self.extractor.config(target + "-filter") - if pfilter: + if pfilter := self.extractor.config(target + "-filter"): try: pred = util.FilterPredicate(pfilter, target) except (SyntaxError, ValueError, TypeError) as exc: @@ -252,8 +265,7 @@ class Job(): else: predicates.append(pred) - prange = self.extractor.config(target + "-range") - if prange: + if prange := self.extractor.config(target + "-range"): try: pred = util.RangePredicate(prange) except ValueError as exc: @@ -382,6 +394,8 @@ class DownloadJob(Job): if "post-after" in self.hooks: for callback in self.hooks["post-after"]: callback(self.pathfmt) + if FLAGS.POST is not None: + FLAGS.process("POST") self.pathfmt.set_directory(kwdict) if "post" in self.hooks: for callback in self.hooks["post"]: @@ -392,12 +406,10 @@ class DownloadJob(Job): return self.visited.add(url) - cls = kwdict.get("_extractor") - if cls: + if cls := kwdict.get("_extractor"): extr = cls.from_url(url) else: - extr = extractor.find(url) - if extr: + if extr := extractor.find(url): if self._extractor_filter is None: self._extractor_filter = self._build_extractor_filter() if not self._extractor_filter(extr): @@ -413,8 +425,7 @@ class DownloadJob(Job): else: extr._parentdir = pextr._parentdir - pmeta = pextr.config2("parent-metadata", "metadata-parent") - if pmeta: + if pmeta := pextr.config2("parent-metadata", "metadata-parent"): if isinstance(pmeta, str): data = self.kwdict.copy() if kwdict: @@ -446,9 +457,13 @@ class DownloadJob(Job): except StopIteration: pass else: + pextr.log.info("Downloading fallback URL") text.nameext_from_url(url, kwdict) + if kwdict["filename"].startswith(( + "HLS", "DASH")): + kwdict["filename"] = url.rsplit("/", 2)[-2] if url.startswith("ytdl:"): - kwdict["extension"] = "" + kwdict["extension"] = "mp4" self.handle_url(url, kwdict) break except 
exception.RestartExtraction: @@ -463,8 +478,7 @@ class DownloadJob(Job): self.archive.finalize() self.archive.close() - pathfmt = self.pathfmt - if pathfmt: + if pathfmt := self.pathfmt: hooks = self.hooks if "post-after" in hooks: for callback in hooks["post-after"]: @@ -500,8 +514,7 @@ class DownloadJob(Job): def download(self, url): """Download 'url'""" scheme = url.partition(":")[0] - downloader = self.get_downloader(scheme) - if downloader: + if downloader := self.get_downloader(scheme): try: return downloader.download(url, self.pathfmt) except OSError as exc: @@ -547,8 +560,7 @@ class DownloadJob(Job): # monkey-patch method to do nothing and always return True self.download = pathfmt.fix_extension - archive_path = cfg("archive") - if archive_path: + if archive_path := cfg("archive"): archive_table = cfg("archive-table") archive_prefix = cfg("archive-prefix") if archive_prefix is None: @@ -585,8 +597,7 @@ class DownloadJob(Job): self._archive_write_file = ("file" in events) self._archive_write_skip = ("skip" in events) - skip = cfg("skip", True) - if skip: + if skip := cfg("skip", True): self._skipexc = None if skip == "enumerate": pathfmt.check_file = pathfmt._enum_file @@ -600,8 +611,7 @@ class DownloadJob(Job): self._skipexc = SystemExit self._skipmax = text.parse_int(smax) - skip_filter = cfg("skip-filter") - if skip_filter: + if skip_filter := cfg("skip-filter"): self._skipftr = util.compile_filter(skip_filter) else: self._skipftr = None @@ -614,8 +624,7 @@ class DownloadJob(Job): if not cfg("postprocess", True): return - postprocessors = extr.config_accumulate("postprocessors") - if postprocessors: + if postprocessors := extr.config_accumulate("postprocessors"): self.hooks = collections.defaultdict(list) pp_log = self.get_logger("postprocessor") @@ -648,7 +657,26 @@ class DownloadJob(Job): clist, negate)(extr): continue - name = pp_dict.get("name") + name = pp_dict.get("name", "") + if "__init__" not in pp_dict: + name, sep, event = name.rpartition("@") + if sep: + pp_dict["name"] = name + if "event" not in pp_dict: + pp_dict["event"] = event + else: + name = event + + name, sep, mode = name.rpartition("/") + if sep: + pp_dict["name"] = name + if "mode" not in pp_dict: + pp_dict["mode"] = mode + else: + name = mode + + pp_dict["__init__"] = None + pp_cls = postprocessor.find(name) if not pp_cls: pp_log.warning("module '%s' not found", name) @@ -680,8 +708,7 @@ class DownloadJob(Job): for hook, callback in hooks.items(): self.hooks[hook].append(callback) - @staticmethod - def _call_hook(callback, condition, pathfmt): + def _call_hook(self, callback, condition, pathfmt): if condition(pathfmt.kwdict): callback(pathfmt) @@ -775,7 +802,7 @@ class KeywordJob(Job): if markers is None: markers = {markerid} elif markerid in markers: - write("{}\n \n".format(prefix[:-2])) + write(f"{prefix[:-2]}\n \n") return # ignore circular reference else: markers.add(markerid) @@ -801,7 +828,7 @@ class KeywordJob(Job): else: # string or number - write("{}\n {}\n".format(key, value)) + write(f"{key}\n {value}\n") markers.remove(markerid) @@ -816,20 +843,17 @@ class UrlJob(Job): if depth >= self.maxdepth: self.handle_queue = self.handle_url - @staticmethod - def handle_url(url, _): + def handle_url(self, url, _): stdout_write(url + "\n") - @staticmethod - def handle_url_fallback(url, kwdict): + def handle_url_fallback(self, url, kwdict): stdout_write(url + "\n") if "_fallback" in kwdict: for url in kwdict["_fallback"]: stdout_write("| " + url + "\n") def handle_queue(self, url, kwdict): - cls = 
kwdict.get("_extractor") - if cls: + if cls := kwdict.get("_extractor"): extr = cls.from_url(url) else: extr = extractor.find(url) @@ -862,20 +886,18 @@ class InfoJob(Job): return 0 def _print_multi(self, title, *values): - stdout_write("{}\n {}\n\n".format( - title, " / ".join(map(util.json_dumps, values)))) + stdout_write( + f"{title}\n {' / '.join(map(util.json_dumps, values))}\n\n") def _print_config(self, title, optname, value): optval = self.extractor.config(optname, util.SENTINEL) if optval is not util.SENTINEL: stdout_write( - "{} (custom):\n {}\n{} (default):\n {}\n\n".format( - title, util.json_dumps(optval), - title, util.json_dumps(value))) + f"{title} (custom):\n {util.json_dumps(optval)}\n" + f"{title} (default):\n {util.json_dumps(value)}\n\n") elif value: stdout_write( - "{} (default):\n {}\n\n".format( - title, util.json_dumps(value))) + f"{title} (default):\n {util.json_dumps(value)}\n\n") class DataJob(Job): @@ -912,7 +934,10 @@ class DataJob(Job): except exception.StopExtraction: pass except Exception as exc: - self.data.append((exc.__class__.__name__, str(exc))) + self.data.append((-1, { + "error" : exc.__class__.__name__, + "message": str(exc), + })) except BaseException: pass @@ -941,8 +966,7 @@ class DataJob(Job): self.data.append((Message.Queue, url, self.filter(kwdict))) def handle_queue_resolve(self, url, kwdict): - cls = kwdict.get("_extractor") - if cls: + if cls := kwdict.get("_extractor"): extr = cls.from_url(url) else: extr = extractor.find(url) diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 3c03271..963f957 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2023 Mike Fährmann +# Copyright 2017-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -40,8 +40,8 @@ class DeprecatedConfigConstAction(argparse.Action): """Set argparse const values as config values + deprecation warning""" def __call__(self, parser, namespace, values, option_string=None): sys.stderr.write( - "warning: {} is deprecated. Use {} instead.\n".format( - "/".join(self.option_strings), self.choices)) + f"Warning: {'/'.join(self.option_strings)} is deprecated. 
" + f"Use {self.choices} instead.\n") namespace.options.append(((), self.dest, self.const)) @@ -71,7 +71,7 @@ class MtimeAction(argparse.Action): def __call__(self, parser, namespace, value, option_string=None): namespace.postprocessors.append({ "name": "mtime", - "value": "{" + (self.const or value) + "}", + "value": f"{{{self.const or value}}}", }) @@ -144,7 +144,7 @@ class UgoiraAction(argparse.Action): } namespace.options.append(((), "ugoira", "original")) else: - parser.error("Unsupported Ugoira format '{}'".format(value)) + parser.error(f"Unsupported Ugoira format '{value}'") pp["name"] = "ugoira" pp["whitelist"] = ("pixiv", "danbooru") @@ -156,10 +156,17 @@ class UgoiraAction(argparse.Action): class PrintAction(argparse.Action): def __call__(self, parser, namespace, value, option_string=None): if self.const: - filename = self.const + if self.const == "-": + namespace.options.append(((), "skip", False)) + namespace.options.append(((), "download", False)) + namespace.options.append((("output",), "mode", False)) + filename = "-" base = None mode = "w" else: + if self.const is None: + namespace.options.append(((), "skip", False)) + namespace.options.append(((), "download", False)) value, path = value base, filename = os.path.split(path) mode = "a" @@ -186,7 +193,7 @@ class PrintAction(argparse.Action): if format_string[1] == "F" and format_string[-1] != "\n": format_string += "\n" elif "{" not in format_string and " " not in format_string: - format_string = "{" + format_string + "}\n" + format_string = f"{{{format_string}}}\n" elif format_string[-1] != "\n": format_string += "\n" @@ -205,12 +212,19 @@ class Formatter(argparse.HelpFormatter): def __init__(self, prog): argparse.HelpFormatter.__init__(self, prog, max_help_position=30) - def _format_action_invocation(self, action, join=", ".join): + def _format_action_invocation(self, action): opts = action.option_strings if action.metavar: opts = opts.copy() - opts[-1] += " " + action.metavar - return join(opts) + opts[-1] = f"{opts[-1]} {action.metavar}" + return ", ".join(opts) + + def _format_usage(self, usage, actions, groups, prefix): + return f"Usage: {self._prog} [OPTIONS] URL [URL...]\n" + + def format_help(self): + return self._long_break_matcher.sub( + "\n\n", self._root_section.format_help()) def _parse_option(opt): @@ -225,7 +239,6 @@ def _parse_option(opt): def build_parser(): """Build and configure an ArgumentParser object""" parser = argparse.ArgumentParser( - usage="%(prog)s [OPTION]... URL...", formatter_class=Formatter, add_help=False, ) @@ -273,6 +286,11 @@ def build_parser(): help="Delete cached login sessions, cookies, etc. for MODULE " "(ALL to delete everything)", ) + general.add_argument( + "--compat", + dest="category-map", nargs=0, action=ConfigConstAction, const="compat", + help="Restore legacy 'category' names", + ) update = parser.add_argument_group("Update Options") if util.EXECUTABLE: @@ -395,13 +413,28 @@ def build_parser(): dest="postprocessors", metavar="[EVENT:]FORMAT", action=PrintAction, const="-", default=[], help=("Write FORMAT during EVENT (default 'prepare') to standard " - "output. Examples: 'id' or 'post:{md5[:8]}'"), + "output instead of downloading files. " + "Can be used multiple times. 
" + "Examples: 'id' or 'post:{md5[:8]}'"), + ) + output.add_argument( + "--Print", + dest="postprocessors", metavar="[EVENT:]FORMAT", + action=PrintAction, const="+", + help="Like --print, but downloads files as well", ) output.add_argument( "--print-to-file", dest="postprocessors", metavar="[EVENT:]FORMAT FILE", - action=PrintAction, nargs=2, - help="Append FORMAT during EVENT to FILE", + action=PrintAction, const=None, nargs=2, + help=("Append FORMAT during EVENT to FILE instead of downloading " + "files. Can be used multiple times"), + ) + output.add_argument( + "--Print-to-file", + dest="postprocessors", metavar="[EVENT:]FORMAT FILE", + action=PrintAction, const=False, nargs=2, + help="Like --print-to-file, but downloads files as well", ) output.add_argument( "--list-modules", @@ -485,7 +518,7 @@ def build_parser(): downloader.add_argument( "-r", "--limit-rate", dest="rate", metavar="RATE", action=ConfigAction, - help="Maximum download rate (e.g. 500k or 2.5M)", + help="Maximum download rate (e.g. 500k, 2.5M, or 800k-2M)", ) downloader.add_argument( "--chunk-size", @@ -505,6 +538,12 @@ def build_parser(): help=("Number of seconds to wait between HTTP requests " "during data extraction"), ) + downloader.add_argument( + "--sleep-429", + dest="sleep-429", metavar="SECONDS", action=ConfigAction, + help=("Number of seconds to wait when receiving a " + "'429 Too Many Requests' response"), + ) downloader.add_argument( "--sleep-extractor", dest="sleep-extractor", metavar="SECONDS", action=ConfigAction, @@ -648,7 +687,7 @@ def build_parser(): selection.add_argument( "--download-archive", dest="archive", metavar="FILE", action=ConfigAction, - help=("Record all downloaded or skipped files in FILE and " + help=("Record successfully downloaded files in FILE and " "skip downloading any file already in it"), ) selection.add_argument( diff --git a/gallery_dl/output.py b/gallery_dl/output.py index 1649487..e4937f4 100644 --- a/gallery_dl/output.py +++ b/gallery_dl/output.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2023 Mike Fährmann +# Copyright 2015-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -121,8 +121,7 @@ class PathfmtProxy(): return pathfmt.__dict__.get(name) if pathfmt else None def __str__(self): - pathfmt = object.__getattribute__(self, "job").pathfmt - if pathfmt: + if pathfmt := object.__getattribute__(self, "job").pathfmt: return pathfmt.path or pathfmt.directory return "" @@ -235,8 +234,7 @@ def configure_logging(loglevel): minlevel = handler.level # file logging handler - handler = setup_logging_handler("logfile", lvl=loglevel) - if handler: + if handler := setup_logging_handler("logfile", lvl=loglevel): root.addHandler(handler) if minlevel > handler.level: minlevel = handler.level @@ -394,8 +392,7 @@ class PipeOutput(NullOutput): class TerminalOutput(): def __init__(self): - shorten = config.get(("output",), "shorten", True) - if shorten: + if shorten := config.get(("output",), "shorten", True): func = shorten_string_eaw if shorten == "eaw" else shorten_string limit = shutil.get_terminal_size().columns - OFFSET sep = CHAR_ELLIPSIES @@ -416,10 +413,10 @@ class TerminalOutput(): bdl = util.format_value(bytes_downloaded) bps = util.format_value(bytes_per_second) if bytes_total is None: - stderr_write("\r{:>7}B {:>7}B/s ".format(bdl, bps)) + stderr_write(f"\r{bdl:>7}B {bps:>7}B/s ") else: - stderr_write("\r{:>3}% {:>7}B {:>7}B/s ".format( - bytes_downloaded 
* 100 // bytes_total, bdl, bps)) + stderr_write(f"\r{bytes_downloaded * 100 // bytes_total:>3}% " + f"{bdl:>7}B {bps:>7}B/s ") class ColorOutput(TerminalOutput): @@ -431,10 +428,8 @@ class ColorOutput(TerminalOutput): if colors is None: colors = COLORS_DEFAULT - self.color_skip = "\033[{}m".format( - colors.get("skip", "2")) - self.color_success = "\r\033[{}m".format( - colors.get("success", "1;32")) + self.color_skip = f"\x1b[{colors.get('skip', '2')}m" + self.color_success = f"\r\x1b[{colors.get('success', '1;32')}m" def start(self, path): stdout_write_flush(self.shorten(path)) @@ -462,8 +457,7 @@ class CustomOutput(): if isinstance(fmt_success, list): off_success, fmt_success = fmt_success - shorten = config.get(("output",), "shorten", True) - if shorten: + if shorten := config.get(("output",), "shorten", True): func = shorten_string_eaw if shorten == "eaw" else shorten_string width = shutil.get_terminal_size().columns @@ -483,8 +477,7 @@ class CustomOutput(): self._fmt_progress_total = (options.get("progress-total") or "\r{3:>3}% {0:>7}B {1:>7}B/s ").format - @staticmethod - def _make_func(shorten, format_string, limit): + def _make_func(self, shorten, format_string, limit): fmt = format_string.format return lambda txt: fmt(shorten(txt, limit, CHAR_ELLIPSIES)) diff --git a/gallery_dl/path.py b/gallery_dl/path.py index 54cf126..795564d 100644 --- a/gallery_dl/path.py +++ b/gallery_dl/path.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021-2023 Mike Fährmann +# Copyright 2021-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,7 +9,6 @@ """Filesystem path handling""" import os -import re import shutil import functools from . 
import util, formatter, exception @@ -134,26 +133,45 @@ class PathFormat(): basedir = self.clean_path(basedir) self.basedirectory = basedir - @staticmethod - def _build_cleanfunc(chars, repl): + def _build_cleanfunc(self, chars, repl): if not chars: return util.identity elif isinstance(chars, dict): + if 0 not in chars: + chars = self._process_repl_dict(chars) + chars[0] = None + def func(x, table=str.maketrans(chars)): return x.translate(table) elif len(chars) == 1: def func(x, c=chars, r=repl): return x.replace(c, r) else: - return functools.partial( - re.compile("[" + chars + "]").sub, repl) + return functools.partial(util.re(f"[{chars}]").sub, repl) return func + def _process_repl_dict(self, chars): + # can't modify 'chars' while *directly* iterating over its keys + for char in [c for c in chars if len(c) > 1]: + if len(char) == 3 and char[1] == "-": + citer = range(ord(char[0]), ord(char[2])+1) + else: + citer = char + + repl = chars.pop(char) + for c in citer: + chars[c] = repl + + return chars + def open(self, mode="wb"): """Open file and return a corresponding file object""" try: return open(self.temppath, mode) except FileNotFoundError: + if "r" in mode: + # '.part' file no longer exists + return util.NullContext() os.makedirs(self.realdirectory) return open(self.temppath, mode) @@ -163,8 +181,7 @@ class PathFormat(): return self.check_file() return False - @staticmethod - def check_file(): + def check_file(self): return True def _enum_file(self): @@ -185,8 +202,7 @@ class PathFormat(): """Build directory path and create it if necessary""" self.kwdict = kwdict - segments = self.build_directory(kwdict) - if segments: + if segments := self.build_directory(kwdict): self.directory = directory = self.basedirectory + self.clean_path( os.sep.join(segments) + os.sep) else: @@ -263,7 +279,6 @@ class PathFormat(): def build_directory(self, kwdict): """Apply 'kwdict' to directory format strings""" segments = [] - append = segments.append strip = self.strip try: @@ -273,14 +288,13 @@ class PathFormat(): # remove trailing dots and spaces (#647) segment = segment.rstrip(strip) if segment: - append(self.clean_segment(segment)) + segments.append(self.clean_segment(segment)) return segments except Exception as exc: raise exception.DirectoryFormatError(exc) def build_directory_conditional(self, kwdict): segments = [] - append = segments.append strip = self.strip try: @@ -294,7 +308,7 @@ class PathFormat(): if strip and segment != "..": segment = segment.rstrip(strip) if segment: - append(self.clean_segment(segment)) + segments.append(self.clean_segment(segment)) return segments except Exception as exc: raise exception.DirectoryFormatError(exc) @@ -329,6 +343,11 @@ class PathFormat(): pass return 0 + def set_mtime(self, path=None): + if (mtime := (self.kwdict.get("_mtime_meta") or + self.kwdict.get("_mtime_http"))): + util.set_mtime(self.realpath if path is None else path, mtime) + def finalize(self): """Move tempfile to its target location""" if self.delete: @@ -362,6 +381,4 @@ class PathFormat(): os.unlink(self.temppath) break - mtime = self.kwdict.get("_mtime") - if mtime: - util.set_mtime(self.realpath, mtime) + self.set_mtime() diff --git a/gallery_dl/postprocessor/__init__.py b/gallery_dl/postprocessor/__init__.py index dd44a8a..1a4ce56 100644 --- a/gallery_dl/postprocessor/__init__.py +++ b/gallery_dl/postprocessor/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2023 Mike Fährmann +# Copyright 2018-2025 Mike Fährmann # # This program is free software; you can 
redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as diff --git a/gallery_dl/postprocessor/common.py b/gallery_dl/postprocessor/common.py index 3099547..8da8417 100644 --- a/gallery_dl/postprocessor/common.py +++ b/gallery_dl/postprocessor/common.py @@ -22,8 +22,7 @@ class PostProcessor(): return self.__class__.__name__ def _init_archive(self, job, options, prefix=None): - archive_path = options.get("archive") - if archive_path: + if archive_path := options.get("archive"): extr = job.extractor archive_table = options.get("archive-table") diff --git a/gallery_dl/postprocessor/compare.py b/gallery_dl/postprocessor/compare.py index c6bc54d..c3d328d 100644 --- a/gallery_dl/postprocessor/compare.py +++ b/gallery_dl/postprocessor/compare.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2020-2023 Mike Fährmann +# Copyright 2020-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -21,8 +21,7 @@ class ComparePP(PostProcessor): self._compare = self._compare_size self._equal_exc = self._equal_cnt = 0 - equal = options.get("equal") - if equal: + if equal := options.get("equal"): equal, _, emax = equal.partition(":") self._equal_max = text.parse_int(emax) if equal == "abort": @@ -62,12 +61,10 @@ class ComparePP(PostProcessor): def _compare(self, f1, f2): return self._compare_size(f1, f2) and self._compare_content(f1, f2) - @staticmethod - def _compare_size(f1, f2): + def _compare_size(self, f1, f2): return os.stat(f1).st_size == os.stat(f2).st_size - @staticmethod - def _compare_content(f1, f2): + def _compare_content(self, f1, f2): size = 16384 with open(f1, "rb") as fp1, open(f2, "rb") as fp2: while True: diff --git a/gallery_dl/postprocessor/exec.py b/gallery_dl/postprocessor/exec.py index 7d2be2b..0bfe1a2 100644 --- a/gallery_dl/postprocessor/exec.py +++ b/gallery_dl/postprocessor/exec.py @@ -10,13 +10,14 @@ from .common import PostProcessor from .. 
import util, formatter +import subprocess import os -import re if util.WINDOWS: def quote(s): - return '"' + s.replace('"', '\\"') + '"' + s = s.replace('"', '\\"') + return f'"{s}"' else: from shlex import quote @@ -26,17 +27,21 @@ class ExecPP(PostProcessor): def __init__(self, job, options): PostProcessor.__init__(self, job) - if options.get("async", False): - self._exec = self._exec_async - - args = options["command"] - if isinstance(args, str): - self.args = args - self._sub = re.compile(r"\{(_directory|_filename|_path|)\}").sub - execute = self.exec_string + if cmds := options.get("commands"): + self.cmds = [self._prepare_cmd(c) for c in cmds] + execute = self.exec_many else: - self.args = [formatter.parse(arg) for arg in args] - execute = self.exec_list + execute, self.args = self._prepare_cmd(options["command"]) + if options.get("async", False): + self._exec = self._popen + + self.session = False + self.creationflags = 0 + if options.get("session"): + if util.WINDOWS: + self.creationflags = subprocess.CREATE_NEW_PROCESS_GROUP + else: + self.session = True events = options.get("event") if events is None: @@ -47,6 +52,13 @@ class ExecPP(PostProcessor): self._init_archive(job, options) + def _prepare_cmd(self, cmd): + if isinstance(cmd, str): + self._sub = util.re(r"\{(_directory|_filename|_path|)\}").sub + return self.exec_string, cmd + else: + return self.exec_list, [formatter.parse(arg) for arg in cmd] + def exec_list(self, pathfmt): archive = self.archive kwdict = pathfmt.kwdict @@ -60,10 +72,11 @@ class ExecPP(PostProcessor): args = [arg.format_map(kwdict) for arg in self.args] args[0] = os.path.expanduser(args[0]) - self._exec(args, False) + retcode = self._exec(args, False) if archive: archive.add(kwdict) + return retcode def exec_string(self, pathfmt): archive = self.archive @@ -72,24 +85,47 @@ class ExecPP(PostProcessor): self.pathfmt = pathfmt args = self._sub(self._replace, self.args) - self._exec(args, True) + retcode = self._exec(args, True) if archive: archive.add(pathfmt.kwdict) + return retcode + + def exec_many(self, pathfmt): + if archive := self.archive: + if archive.check(pathfmt.kwdict): + return + self.archive = False + + retcode = 0 + for execute, args in self.cmds: + self.args = args + if retcode := execute(pathfmt): + # non-zero exit status + break + + if archive: + self.archive = archive + archive.add(pathfmt.kwdict) + return retcode def _exec(self, args, shell): - self.log.debug("Running '%s'", args) - retcode = util.Popen(args, shell=shell).wait() - if retcode: + if retcode := self._popen(args, shell).wait(): self.log.warning("'%s' returned with non-zero exit status (%d)", args, retcode) + return retcode - def _exec_async(self, args, shell): + def _popen(self, args, shell): self.log.debug("Running '%s'", args) - util.Popen(args, shell=shell) + return util.Popen( + args, + shell=shell, + creationflags=self.creationflags, + start_new_session=self.session, + ) def _replace(self, match): - name = match.group(1) + name = match[1] if name == "_directory": return quote(self.pathfmt.realdirectory) if name == "_filename": diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py index fbb3fb8..c74f92f 100644 --- a/gallery_dl/postprocessor/metadata.py +++ b/gallery_dl/postprocessor/metadata.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2023 Mike Fährmann +# Copyright 2019-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License 
version 2 as @@ -55,8 +55,7 @@ class MetadataPP(PostProcessor): self._json_encode = self._make_encoder(options, 4).encode ext = "json" - base_directory = options.get("base-directory") - if base_directory: + if base_directory := options.get("base-directory"): if base_directory is True: self._base = lambda p: p.basedirectory else: @@ -139,9 +138,7 @@ class MetadataPP(PostProcessor): archive.add(pathfmt.kwdict) if self.mtime: - mtime = pathfmt.kwdict.get("_mtime") - if mtime: - util.set_mtime(path, mtime) + pathfmt.set_mtime(path) def _run_stdout(self, pathfmt): self.write(sys.stdout, pathfmt.kwdict) @@ -183,8 +180,7 @@ class MetadataPP(PostProcessor): try: pathfmt.directory_formatters = self._directory_formatters pathfmt.directory_conditions = () - segments = pathfmt.build_directory(pathfmt.kwdict) - if segments: + if segments := pathfmt.build_directory(pathfmt.kwdict): directory = pathfmt.clean_path(os.sep.join(segments) + os.sep) else: directory = "." + os.sep @@ -246,8 +242,7 @@ class MetadataPP(PostProcessor): fp.write(self._json_encode(kwdict) + "\n") def _make_filter(self, options): - include = options.get("include") - if include: + if include := options.get("include"): if isinstance(include, str): include = include.split(",") return lambda d: {k: d[k] for k in include if k in d} @@ -268,8 +263,7 @@ class MetadataPP(PostProcessor): if not private: return util.filter_dict - @staticmethod - def _make_encoder(options, indent=None): + def _make_encoder(self, options, indent=None): return json.JSONEncoder( ensure_ascii=options.get("ascii", False), sort_keys=options.get("sort", False), diff --git a/gallery_dl/postprocessor/mtime.py b/gallery_dl/postprocessor/mtime.py index 6ded1e2..b1269dd 100644 --- a/gallery_dl/postprocessor/mtime.py +++ b/gallery_dl/postprocessor/mtime.py @@ -17,8 +17,7 @@ class MtimePP(PostProcessor): def __init__(self, job, options): PostProcessor.__init__(self, job) - value = options.get("value") - if value: + if value := options.get("value"): self._get = formatter.parse(value, None, util.identity).format_map else: key = options.get("key", "date") @@ -36,7 +35,7 @@ class MtimePP(PostProcessor): if mtime is None: return - pathfmt.kwdict["_mtime"] = ( + pathfmt.kwdict["_mtime_meta"] = ( util.datetime_to_timestamp(mtime) if isinstance(mtime, datetime) else text.parse_int(mtime) diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py index 5340335..33ebb75 100644 --- a/gallery_dl/postprocessor/ugoira.py +++ b/gallery_dl/postprocessor/ugoira.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2023 Mike Fährmann +# Copyright 2018-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -170,8 +170,8 @@ class UgoiraPP(PostProcessor): for frame in self._files: # update frame filename extension - frame["file"] = name = "{}.{}".format( - frame["file"].partition(".")[0], frame["ext"]) + frame["file"] = name = \ + f"{frame['file'].partition('.')[0]}.{frame['ext']}" if tempdir: # move frame into tempdir @@ -236,9 +236,7 @@ class UgoiraPP(PostProcessor): pathfmt.realpath = pathfmt.temppath else: if self.mtime: - mtime = pathfmt.kwdict.get("_mtime") - if mtime: - util.set_mtime(pathfmt.realpath, mtime) + pathfmt.set_mtime() return True def convert_to_archive(self, pathfmt, tempdir): @@ -298,8 +296,7 @@ class UgoiraPP(PostProcessor): def _exec(self, args): self.log.debug(args) out = None if self.output else subprocess.DEVNULL - retcode = 
util.Popen(args, stdout=out, stderr=out).wait() - if retcode: + if retcode := util.Popen(args, stdout=out, stderr=out).wait(): output.stderr_write("\n") self.log.error("Non-zero exit status when running %s (%s)", args, retcode) @@ -334,7 +331,7 @@ class UgoiraPP(PostProcessor): last_copy = last.copy() frames.append(last_copy) name, _, ext = last_copy["file"].rpartition(".") - last_copy["file"] = "{:>06}.{}".format(int(name)+1, ext) + last_copy["file"] = f"{int(name) + 1:>06}.{ext}" shutil.copyfile(tempdir + last["file"], tempdir + last_copy["file"]) @@ -349,10 +346,8 @@ class UgoiraPP(PostProcessor): "-f", "image2", "-ts_from_file", "2", "-pattern_type", "sequence", - "-i", "{}%06d.{}".format( - tempdir.replace("%", "%%"), - frame["file"].rpartition(".")[2] - ), + "-i", (f"{tempdir.replace('%', '%%')}%06d." + f"{frame['file'].rpartition('.')[2]}"), ] def _process_mkvmerge(self, pathfmt, tempdir): @@ -363,10 +358,8 @@ class UgoiraPP(PostProcessor): self.ffmpeg, "-f", "image2", "-pattern_type", "sequence", - "-i", "{}/%06d.{}".format( - tempdir.replace("%", "%%"), - self._frames[0]["file"].rpartition(".")[2] - ), + "-i", (f"{tempdir.replace('%', '%%')}/%06d." + f"{self._frames[0]['file'].rpartition('.')[2]}"), ] def _finalize_mkvmerge(self, pathfmt, tempdir): @@ -384,14 +377,13 @@ class UgoiraPP(PostProcessor): def _write_ffmpeg_concat(self, tempdir): content = ["ffconcat version 1.0"] - append = content.append for frame in self._frames: - append("file '{}'\nduration {}".format( - frame["file"], frame["delay"] / 1000)) + content.append(f"file '{frame['file']}'\n" + f"duration {frame['delay'] / 1000}") if self.repeat: - append("file '{}'".format(frame["file"])) - append("") + content.append(f"file '{frame['file']}'") + content.append("") ffconcat = tempdir + "/ffconcat.txt" with open(ffconcat, "w") as fp: @@ -400,14 +392,13 @@ class UgoiraPP(PostProcessor): def _write_mkvmerge_timecodes(self, tempdir): content = ["# timecode format v2"] - append = content.append delay_sum = 0 for frame in self._frames: - append(str(delay_sum)) + content.append(str(delay_sum)) delay_sum += frame["delay"] - append(str(delay_sum)) - append("") + content.append(str(delay_sum)) + content.append("") timecodes = tempdir + "/timecodes.tc" with open(timecodes, "w") as fp: @@ -416,24 +407,22 @@ class UgoiraPP(PostProcessor): def calculate_framerate(self, frames): if self._delay_is_uniform(frames): - return ("1000/{}".format(frames[0]["delay"]), None) + return (f"1000/{frames[0]['delay']}", None) if not self.uniform: gcd = self._delay_gcd(frames) if gcd >= 10: - return (None, "1000/{}".format(gcd)) + return (None, f"1000/{gcd}") return (None, None) - @staticmethod - def _delay_gcd(frames): + def _delay_gcd(self, frames): result = frames[0]["delay"] for f in frames: result = gcd(result, f["delay"]) return result - @staticmethod - def _delay_is_uniform(frames): + def _delay_is_uniform(self, frames): delay = frames[0]["delay"] for f in frames: if f["delay"] != delay: diff --git a/gallery_dl/text.py b/gallery_dl/text.py index c1dde94..a7539ad 100644 --- a/gallery_dl/text.py +++ b/gallery_dl/text.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2022 Mike Fährmann +# Copyright 2015-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,14 +8,29 @@ """Collection of functions that work on strings/text""" -import re import sys import html import time import datetime import urllib.parse +import re as 
re_module -HTML_RE = re.compile("<[^>]+>") +try: + re_compile = re_module._compiler.compile +except AttributeError: + re_compile = re_module.sre_compile.compile + +HTML_RE = re_compile(r"<[^>]+>") +PATTERN_CACHE = {} + + +def re(pattern): + """Compile a regular expression pattern""" + try: + return PATTERN_CACHE[pattern] + except KeyError: + p = PATTERN_CACHE[pattern] = re_compile(pattern) + return p def remove_html(txt, repl=" ", sep=" "): @@ -47,8 +62,13 @@ def slugify(value): Adapted from: https://github.com/django/django/blob/master/django/utils/text.py """ - value = re.sub(r"[^\w\s-]", "", str(value).lower()) - return re.sub(r"[-\s]+", "-", value).strip("-_") + value = re(r"[^\w\s-]").sub("", str(value).lower()) + return re(r"[-\s]+").sub("-", value).strip("-_") + + +def sanitize_whitespace(value): + """Replace all whitespace characters with a single space""" + return re(r"\s+").sub(" ", value.strip()) def ensure_http_scheme(url, scheme="https://"): @@ -100,7 +120,7 @@ def nameext_from_url(url, data=None): return data -def extract(txt, begin, end, pos=0): +def extract(txt, begin, end, pos=None): """Extract the text between 'begin' and 'end' from 'txt' Args: @@ -125,7 +145,7 @@ def extract(txt, begin, end, pos=0): last = txt.index(end, first) return txt[first:last], last+len(end) except Exception: - return None, pos + return None, 0 if pos is None else pos def extr(txt, begin, end, default=""): @@ -137,17 +157,26 @@ def extr(txt, begin, end, default=""): return default -def rextract(txt, begin, end, pos=-1): +def rextract(txt, begin, end, pos=None): try: lbeg = len(begin) - first = txt.rindex(begin, 0, pos) + first = txt.rindex(begin, None, pos) last = txt.index(end, first + lbeg) return txt[first + lbeg:last], first except Exception: - return None, pos + return None, -1 if pos is None else pos + + +def rextr(txt, begin, end, pos=None, default=""): + """Stripped-down version of 'rextract()'""" + try: + first = txt.rindex(begin, None, pos) + len(begin) + return txt[first:txt.index(end, first)] + except Exception: + return default -def extract_all(txt, rules, pos=0, values=None): +def extract_all(txt, rules, pos=None, values=None): """Calls extract for each rule and returns the result in a dict""" if values is None: values = {} @@ -155,10 +184,10 @@ def extract_all(txt, rules, pos=0, values=None): result, pos = extract(txt, begin, end, pos) if key: values[key] = result - return values, pos + return values, 0 if pos is None else pos -def extract_iter(txt, begin, end, pos=0): +def extract_iter(txt, begin, end, pos=None): """Yield values that would be returned by repeated calls of extract()""" try: index = txt.index @@ -173,7 +202,7 @@ def extract_iter(txt, begin, end, pos=0): return -def extract_from(txt, pos=0, default=""): +def extract_from(txt, pos=None, default=""): """Returns a function object that extracts from 'txt'""" def extr(begin, end, index=txt.index, txt=txt): nonlocal pos @@ -190,21 +219,22 @@ def extract_from(txt, pos=0, default=""): def parse_unicode_escapes(txt): """Convert JSON Unicode escapes in 'txt' into actual characters""" if "\\u" in txt: - return re.sub(r"\\u([0-9a-fA-F]{4})", _hex_to_char, txt) + return re(r"\\u([0-9a-fA-F]{4})").sub(_hex_to_char, txt) return txt def _hex_to_char(match): - return chr(int(match.group(1), 16)) + return chr(int(match[1], 16)) def parse_bytes(value, default=0, suffixes="bkmgtp"): """Convert a bytes-amount ("500k", "2.5M", ...) 
to int""" - try: - last = value[-1].lower() - except Exception: + if not value: return default + value = str(value).strip() + last = value[-1].lower() + if last in suffixes: mul = 1024 ** suffixes.index(last) value = value[:-1] @@ -279,12 +309,19 @@ def parse_query_list(qs, as_list=()): else: result[name] = [value] elif name not in result: - result[name] = unquote(value.replace("+", " ")) + result[name] = value except Exception: pass return result +def build_query(params): + return "&".join([ + f"{quote(name)}={quote(value)}" + for name, value in params.items() + ]) + + if sys.hexversion < 0x30c0000: # Python <= 3.11 def parse_timestamp(ts, default=None): @@ -307,12 +344,7 @@ else: def parse_datetime(date_string, format="%Y-%m-%dT%H:%M:%S%z", utcoffset=0): """Create a datetime object by parsing 'date_string'""" try: - if format.endswith("%z") and date_string[-3] == ":": - # workaround for Python < 3.7: +00:00 -> +0000 - ds = date_string[:-3] + date_string[-2:] - else: - ds = date_string - d = datetime.datetime.strptime(ds, format) + d = datetime.datetime.strptime(date_string, format) o = d.utcoffset() if o is not None: # convert to naive UTC diff --git a/gallery_dl/transaction_id.py b/gallery_dl/transaction_id.py index 89e3d5b..915b7b3 100644 --- a/gallery_dl/transaction_id.py +++ b/gallery_dl/transaction_id.py @@ -139,8 +139,7 @@ class ClientTransaction(): (now >> 24) & 0xFF, ) - payload = "{}!{}!{}{}{}".format( - method, path, now, keyword, self.animation_key) + payload = f"{method}!{path}!{now}{keyword}{self.animation_key}" bytes_hash = hashlib.sha256(payload.encode()).digest()[:16] num = (random.randrange(16) << 4) + int((nowf - nowi) * 16.0) diff --git a/gallery_dl/update.py b/gallery_dl/update.py index 6650ec4..273ca18 100644 --- a/gallery_dl/update.py +++ b/gallery_dl/update.py @@ -1,13 +1,12 @@ # -*- coding: utf-8 -*- -# Copyright 2024 Mike Fährmann +# Copyright 2024-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. 
import os -import re import sys from .extractor.common import Extractor, Message @@ -98,7 +97,7 @@ class UpdateJob(DownloadJob): import atexit import subprocess - cmd = 'ping 127.0.0.1 -n 5 -w 1000 & del /F "{}"'.format(path_old) + cmd = f'ping 127.0.0.1 -n 5 -w 1000 & del /F "{path_old}"' atexit.register( util.Popen, cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, @@ -184,17 +183,16 @@ class UpdateExtractor(Extractor): tag = channel exact = True - if re.match(r"\d\.\d+\.\d+", tag): + if util.re_compile(r"\d\.\d+\.\d+").match(tag): tag = "v" + tag try: path_repo = REPOS[repo or "stable"] except KeyError: - raise exception.StopExtraction("Invalid channel '%s'", repo) + raise exception.AbortExtraction(f"Invalid channel '{repo}'") path_tag = tag if tag == "latest" else "tags/" + tag - url = "{}/repos/{}/releases/{}".format( - self.root_api, path_repo, path_tag) + url = f"{self.root_api}/repos/{path_repo}/releases/{path_tag}" headers = { "Accept": "application/vnd.github+json", "User-Agent": util.USERAGENT, @@ -211,8 +209,8 @@ class UpdateExtractor(Extractor): else: binary_name = BINARIES[repo][binary] - url = "{}/{}/releases/download/{}/{}".format( - self.root, path_repo, data["tag_name"], binary_name) + url = (f"{self.root}/{path_repo}/releases/download" + f"/{data['tag_name']}/{binary_name}") yield Message.Directory, data yield Message.Url, url, data diff --git a/gallery_dl/util.py b/gallery_dl/util.py index ba31ea7..4027ac6 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2023 Mike Fährmann +# Copyright 2017-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,7 +8,6 @@ """Utility functions and classes""" -import re import os import sys import json @@ -27,11 +26,6 @@ from http.cookiejar import Cookie from email.utils import mktime_tz, parsedate_tz from . import text, version, exception -try: - re_compile = re._compiler.compile -except AttributeError: - re_compile = re.sre_compile.compile - def bencode(num, alphabet="0123456789"): """Encode an integer into a base-N encoded string""" @@ -48,8 +42,7 @@ def bdecode(data, alphabet="0123456789"): num = 0 base = len(alphabet) for c in data: - num *= base - num += alphabet.index(c) + num = num * base + alphabet.find(c) return num @@ -135,7 +128,7 @@ def false(_, __=None): return False -def noop(): +def noop(_=None): """Does nothing""" @@ -159,18 +152,17 @@ def sha1(s): def generate_token(size=16): """Generate a random token with hexadecimal digits""" - data = random.getrandbits(size * 8).to_bytes(size, "big") - return binascii.hexlify(data).decode() + return random.getrandbits(size * 8).to_bytes(size, "big").hex() def format_value(value, suffixes="kMGTPEZY"): - value = format(value) + value = str(value) value_len = len(value) index = value_len - 4 if index >= 0: offset = (value_len - 1) % 3 + 1 - return (value[:offset] + "." 
+ value[offset:offset+2] + - suffixes[index // 3]) + return (f"{value[:offset]}.{value[offset:offset+2]}" + f"{suffixes[index // 3]}") return value @@ -236,6 +228,34 @@ def to_string(value): return str(value) +def to_datetime(value): + """Convert 'value' to a datetime object""" + if not value: + return EPOCH + + if isinstance(value, datetime.datetime): + return value + + if isinstance(value, str): + try: + if value[-1] == "Z": + # compat for Python < 3.11 + value = value[:-1] + dt = datetime.datetime.fromisoformat(value) + if dt.tzinfo is None: + if dt.microsecond: + dt = dt.replace(microsecond=0) + else: + # convert to naive UTC + dt = dt.astimezone(datetime.timezone.utc).replace( + microsecond=0, tzinfo=None) + return dt + except Exception: + pass + + return text.parse_timestamp(value, EPOCH) + + def datetime_to_timestamp(dt): """Convert naive UTC datetime to Unix timestamp""" return (dt - EPOCH) / SECOND @@ -298,7 +318,32 @@ def dump_response(response, fp, headers=False, content=True, hide_auth=True): request = response.request req_headers = request.headers.copy() res_headers = response.headers.copy() - outfmt = """\ + + if hide_auth: + if authorization := req_headers.get("Authorization"): + atype, sep, _ = str(authorization).partition(" ") + req_headers["Authorization"] = f"{atype} ***" if sep else "***" + + if cookie := req_headers.get("Cookie"): + req_headers["Cookie"] = ";".join( + c.partition("=")[0] + "=***" + for c in cookie.split(";") + ) + + if set_cookie := res_headers.get("Set-Cookie"): + res_headers["Set-Cookie"] = re(r"(^|, )([^ =]+)=[^,;]*").sub( + r"\1\2=***", set_cookie) + + request_headers = "\n".join( + f"{name}: {value}" + for name, value in req_headers.items() + ) + response_headers = "\n".join( + f"{name}: {value}" + for name, value in res_headers.items() + ) + + output = f"""\ {request.method} {request.url} Status: {response.status_code} {response.reason} @@ -307,49 +352,17 @@ Request Headers {request_headers} """ if request.body: - outfmt += """ + output = f"""{output} Request Body ------------ {request.body} """ - outfmt += """ + output = f"""{output} Response Headers ---------------- {response_headers} """ - if hide_auth: - authorization = req_headers.get("Authorization") - if authorization: - atype, sep, _ = str(authorization).partition(" ") - req_headers["Authorization"] = atype + " ***" if sep else "***" - - cookie = req_headers.get("Cookie") - if cookie: - req_headers["Cookie"] = ";".join( - c.partition("=")[0] + "=***" - for c in cookie.split(";") - ) - - set_cookie = res_headers.get("Set-Cookie") - if set_cookie: - res_headers["Set-Cookie"] = re.sub( - r"(^|, )([^ =]+)=[^,;]*", r"\1\2=***", set_cookie, - ) - - fmt_nv = "{}: {}".format - - fp.write(outfmt.format( - request=request, - response=response, - request_headers="\n".join( - fmt_nv(name, value) - for name, value in req_headers.items() - ), - response_headers="\n".join( - fmt_nv(name, value) - for name, value in res_headers.items() - ), - ).encode()) + fp.write(output.encode()) if content: if headers: @@ -361,14 +374,11 @@ def extract_headers(response): headers = response.headers data = dict(headers) - hcd = headers.get("content-disposition") - if hcd: - name = text.extr(hcd, 'filename="', '"') - if name: + if hcd := headers.get("content-disposition"): + if name := text.extr(hcd, 'filename="', '"'): text.nameext_from_url(name, data) - hlm = headers.get("last-modified") - if hlm: + if hlm := headers.get("last-modified"): data["date"] = datetime.datetime(*parsedate_tz(hlm)[:6]) return data @@ -488,8 
+498,7 @@ def cookiestxt_load(fp): def cookiestxt_store(fp, cookies): """Write 'cookies' in Netscape cookies.txt format to 'fp'""" - write = fp.write - write("# Netscape HTTP Cookie File\n\n") + fp.write("# Netscape HTTP Cookie File\n\n") for cookie in cookies: if not cookie.domain: @@ -503,7 +512,7 @@ def cookiestxt_store(fp, cookies): value = cookie.value domain = cookie.domain - write("\t".join(( + fp.write("\t".join(( domain, "TRUE" if domain and domain[0] == "." else "FALSE", cookie.path, @@ -568,8 +577,7 @@ class HTTPBasicAuth(): def __init__(self, username, password): self.authorization = b"Basic " + binascii.b2a_base64( - username.encode("latin1") + b":" + str(password).encode("latin1") - )[:-1] + f"{username}:{password}".encode("latin1"), newline=False) def __call__(self, request): request.headers["Authorization"] = self.authorization @@ -611,6 +619,28 @@ class NullContext(): pass +class NullResponse(): + __slots__ = ("url", "reason") + + ok = is_redirect = is_permanent_redirect = False + cookies = headers = history = links = {} + encoding = apparent_encoding = "utf-8" + content = b"" + text = "" + status_code = 900 + close = noop + + def __init__(self, url, reason=""): + self.url = url + self.reason = str(reason) + + def __str__(self): + return "900 " + self.reason + + def json(self): + return {} + + class CustomNone(): """None-style type that supports more operations than regular None""" __slots__ = () @@ -622,15 +652,14 @@ class CustomNone(): def __call__(self, *args, **kwargs): return self - @staticmethod - def __next__(): + def __next__(self): raise StopIteration def __eq__(self, other): - return self is other + return other is self or other is None def __ne__(self, other): - return self is not other + return other is not self and other is not None __lt__ = true __le__ = true @@ -671,25 +700,40 @@ class CustomNone(): __abs__ = identity __invert__ = identity - @staticmethod - def __len__(): + def __len__(self): return 0 __int__ = __len__ __hash__ = __len__ __index__ = __len__ - @staticmethod - def __format__(_): + def __format__(self, _): return "None" - @staticmethod - def __str__(): + def __str__(self): return "None" __repr__ = __str__ +class Flags(): + + def __init__(self): + self.FILE = self.POST = self.CHILD = self.DOWNLOAD = None + + def process(self, flag): + value = self.__dict__[flag] + self.__dict__[flag] = None + + if value == "abort": + raise exception.AbortExtraction() + if value == "terminate": + raise exception.TerminateExtraction() + if value == "restart": + raise exception.RestartExtraction() + raise exception.StopExtraction() + + # v137.0 release of Firefox on 2025-04-01 has ordinal 739342 # 735506 == 739342 - 137 * 28 # v135.0 release of Chrome on 2025-04-01 has ordinal 739342 @@ -701,19 +745,30 @@ class CustomNone(): _ff_ver = (datetime.date.today().toordinal() - 735506) // 28 # _ch_ver = _ff_ver - 2 +re = text.re +re_compile = text.re_compile + NONE = CustomNone() +FLAGS = Flags() EPOCH = datetime.datetime(1970, 1, 1) SECOND = datetime.timedelta(0, 1) WINDOWS = (os.name == "nt") SENTINEL = object() EXECUTABLE = getattr(sys, "frozen", False) +SPECIAL_EXTRACTORS = {"oauth", "recursive", "generic"} + +EXTS_IMAGE = {"jpg", "jpeg", "png", "gif", "bmp", "svg", "psd", "ico", + "webp", "avif", "heic", "heif"} +EXTS_VIDEO = {"mp4", "m4v", "mov", "webm", "mkv", "ogv", "flv", "avi", "wmv"} +EXTS_ARCHIVE = {"zip", "rar", "7z", "tar", "gz", "bz2", "lzma", "xz"} + USERAGENT = "gallery-dl/" + version.__version__ -USERAGENT_FIREFOX = ("Mozilla/5.0 (Windows NT 10.0; Win64; 
x64; rv:{}.0) " - "Gecko/20100101 Firefox/{}.0").format(_ff_ver, _ff_ver) +USERAGENT_FIREFOX = (f"Mozilla/5.0 (Windows NT 10.0; Win64; x64; " + f"rv:{_ff_ver}.0) Gecko/20100101 Firefox/{_ff_ver}.0") USERAGENT_CHROME = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{}.0.0.0 " - "Safari/537.36").format(_ff_ver - 2) -SPECIAL_EXTRACTORS = {"oauth", "recursive", "generic"} + "AppleWebKit/537.36 (KHTML, like Gecko) " + f"Chrome/{_ff_ver - 2}.0.0.0 Safari/537.36") + GLOBALS = { "contains" : contains, "parse_int": text.parse_int, @@ -721,12 +776,16 @@ GLOBALS = { "datetime" : datetime.datetime, "timedelta": datetime.timedelta, "abort" : raises(exception.StopExtraction), + "error" : raises(exception.AbortExtraction), "terminate": raises(exception.TerminateExtraction), "restart" : raises(exception.RestartExtraction), "hash_sha1": sha1, "hash_md5" : md5, "std" : ModuleProxy(), - "re" : re, + "re" : text.re_module, + "exts_image" : EXTS_IMAGE, + "exts_video" : EXTS_VIDEO, + "exts_archive": EXTS_ARCHIVE, } @@ -786,10 +845,12 @@ def compile_expression_defaultdict_impl(expr, name="", globals=None): def compile_expression_tryexcept(expr, name="", globals=None): code_object = compile(expr, name, "eval") + if globals is None: + globals = GLOBALS - def _eval(locals=None, globals=(globals or GLOBALS), co=code_object): + def _eval(locals=None): try: - return eval(co, globals, locals) + return eval(code_object, globals, locals) except exception.GalleryDLException: raise except Exception: @@ -803,7 +864,7 @@ compile_expression = compile_expression_tryexcept def compile_filter(expr, name="", globals=None): if not isinstance(expr, str): - expr = "(" + ") and (".join(expr) + ")" + expr = f"({') and ('.join(expr)})" return compile_expression(expr, name, globals) @@ -826,25 +887,25 @@ def import_file(path): return __import__(name.replace("-", "_")) -def build_duration_func(duration, min=0.0): - if not duration: +def build_selection_func(value, min=0.0, conv=float): + if not value: if min: return lambda: min return None - if isinstance(duration, str): - lower, _, upper = duration.partition("-") - lower = float(lower) + if isinstance(value, str): + lower, _, upper = value.partition("-") else: try: - lower, upper = duration + lower, upper = value except TypeError: - lower, upper = duration, None + lower, upper = value, None + lower = conv(lower) if upper: - upper = float(upper) + upper = conv(upper) return functools.partial( - random.uniform, + random.uniform if lower.__class__ is float else random.randint, lower if lower > min else min, upper if upper > min else min, ) @@ -854,6 +915,9 @@ def build_duration_func(duration, min=0.0): return lambda: lower +build_duration_func = build_selection_func + + def build_extractor_filter(categories, negate=True, special=None): """Build a function that takes an Extractor class as argument and returns True if that class is allowed by 'categories' @@ -931,13 +995,13 @@ def build_proxy_map(proxies, log=None): proxies[scheme] = "http://" + proxy.lstrip("/") return proxies - if log: + if log is not None: log.warning("invalid proxy specifier: %s", proxies) def build_predicate(predicates): if not predicates: - return lambda url, kwdict: True + return true elif len(predicates) == 1: return predicates[0] return functools.partial(chain_predicates, predicates) @@ -977,8 +1041,7 @@ class RangePredicate(): return True return False - @staticmethod - def _parse(rangespec): + def _parse(self, rangespec): """Parse an integer range string and return 
the resulting ranges Examples: @@ -987,7 +1050,6 @@ class RangePredicate(): _parse("1:2,4:8:2") -> [(1,1), (4,7,2)] """ ranges = [] - append = ranges.append if isinstance(rangespec, str): rangespec = rangespec.split(",") @@ -999,7 +1061,7 @@ class RangePredicate(): elif ":" in group: start, _, stop = group.partition(":") stop, _, step = stop.partition(":") - append(range( + ranges.append(range( int(start) if start.strip() else 1, int(stop) if stop.strip() else sys.maxsize, int(step) if step.strip() else 1, @@ -1007,14 +1069,14 @@ class RangePredicate(): elif "-" in group: start, _, stop = group.partition("-") - append(range( + ranges.append(range( int(start) if start.strip() else 1, int(stop) + 1 if stop.strip() else sys.maxsize, )) else: start = int(group) - append(range(start, start+1)) + ranges.append(range(start, start+1)) return ranges @@ -1037,7 +1099,7 @@ class FilterPredicate(): """Predicate; True if evaluating the given expression returns True""" def __init__(self, expr, target="image"): - name = "<{} filter>".format(target) + name = f"<{target} filter>" self.expr = compile_filter(expr, name) def __call__(self, _, kwdict): diff --git a/gallery_dl/version.py b/gallery_dl/version.py index e543a31..af7e3c6 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -1,10 +1,10 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2023 Mike Fährmann +# Copyright 2016-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.29.7" +__version__ = "1.30.2" __variant__ = None diff --git a/gallery_dl/ytdl.py b/gallery_dl/ytdl.py index 319e781..cfc6b50 100644 --- a/gallery_dl/ytdl.py +++ b/gallery_dl/ytdl.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021-2023 Mike Fährmann +# Copyright 2021-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,7 +8,6 @@ """Helpers for interacting with youtube-dl""" -import re import shlex import itertools from . 
import text, util, exception @@ -27,14 +26,16 @@ def construct_YoutubeDL(module, obj, user_opts, system_opts=None): opts = argv = None config = obj.config - cfg = config("config-file") - if cfg: + if not config("deprecations"): + module.YoutubeDL.deprecated_feature = util.false + module.YoutubeDL.deprecation_warning = util.false + + if cfg := config("config-file"): with open(util.expand_path(cfg)) as fp: contents = fp.read() argv = shlex.split(contents, comments=True) - cmd = config("cmdline-args") - if cmd: + if cmd := config("cmdline-args"): if isinstance(cmd, str): cmd = shlex.split(cmd) argv = (argv + cmd) if argv else cmd @@ -42,7 +43,7 @@ def construct_YoutubeDL(module, obj, user_opts, system_opts=None): try: opts = parse_command_line(module, argv) if argv else user_opts except SystemExit: - raise exception.StopExtraction("Invalid command-line option") + raise exception.AbortExtraction("Invalid command-line option") if opts.get("format") is None: opts["format"] = config("format") @@ -50,28 +51,35 @@ def construct_YoutubeDL(module, obj, user_opts, system_opts=None): opts["nopart"] = not config("part", True) if opts.get("updatetime") is None: opts["updatetime"] = config("mtime", True) - if opts.get("ratelimit") is None: - opts["ratelimit"] = text.parse_bytes(config("rate"), None) if opts.get("min_filesize") is None: opts["min_filesize"] = text.parse_bytes(config("filesize-min"), None) if opts.get("max_filesize") is None: opts["max_filesize"] = text.parse_bytes(config("filesize-max"), None) + if opts.get("ratelimit") is None: + if rate := config("rate"): + func = util.build_selection_func(rate, 0, text.parse_bytes) + if hasattr(func, "args"): + opts["__gdl_ratelimit_func"] = func + else: + opts["ratelimit"] = func() or None + else: + opts["ratelimit"] = None - raw_opts = config("raw-options") - if raw_opts: + if raw_opts := config("raw-options"): opts.update(raw_opts) if config("logging", True): opts["logger"] = obj.log if system_opts: opts.update(system_opts) + opts["__gdl_initialize"] = True return module.YoutubeDL(opts) def parse_command_line(module, argv): parser, opts, args = module.parseOpts(argv) - ytdlp = (module.__name__ == "yt_dlp") + ytdlp = hasattr(module, "cookies") std_headers = module.std_headers try: @@ -141,7 +149,7 @@ def parse_command_line(module, argv): if name not in compat_opts: return False compat_opts.discard(name) - compat_opts.update(["*%s" % name]) + compat_opts.update([f"*{name}"]) return True def set_default_compat( @@ -206,7 +214,7 @@ def parse_command_line(module, argv): if "pre_process" not in parse_metadata: parse_metadata["pre_process"] = [] parse_metadata["pre_process"].append( - "title:%s" % opts.metafromtitle) + f"title:{opts.metafromtitle}") opts.parse_metadata = { k: list(itertools.chain.from_iterable(map( metadataparser_actions, v))) @@ -216,7 +224,7 @@ def parse_command_line(module, argv): if parse_metadata is None: parse_metadata = [] if opts.metafromtitle is not None: - parse_metadata.append("title:%s" % opts.metafromtitle) + parse_metadata.append(f"title:{opts.metafromtitle}") opts.parse_metadata = list(itertools.chain.from_iterable(map( metadataparser_actions, parse_metadata))) @@ -250,15 +258,13 @@ def parse_command_line(module, argv): None if opts.match_filter is None else module.match_filter_func(opts.match_filter)) - cookiesfrombrowser = getattr(opts, "cookiesfrombrowser", None) - if cookiesfrombrowser: - match = re.fullmatch(r"""(?x) + if cookiesfrombrowser := getattr(opts, "cookiesfrombrowser", None): + pattern = util.re(r"""(?x) 
(?P[^+:]+) (?:\s*\+\s*(?P[^:]+))? (?:\s*:\s*(?!:)(?P.+?))? - (?:\s*::\s*(?P.+))? - """, cookiesfrombrowser) - if match: + (?:\s*::\s*(?P.+))?""") + if match := pattern.fullmatch(cookiesfrombrowser): browser, keyring, profile, container = match.groups() if keyring is not None: keyring = keyring.upper() @@ -518,7 +524,7 @@ def legacy_postprocessors(opts, module, ytdlp, compat_opts): if len(dur) == 2 and all(t is not None for t in dur): remove_ranges.append(tuple(dur)) continue - remove_chapters_patterns.append(re.compile(regex)) + remove_chapters_patterns.append(util.re(regex)) if opts.remove_chapters or sponsorblock_query: postprocessors.append({ "key": "ModifyChapters", diff --git a/setup.py b/setup.py index 44acef9..c52d1d7 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ def check_file(fname): VERSION = re.search( r'__version__\s*=\s*"([^"]+)"', read("gallery_dl/version.py"), -).group(1) +)[1] FILES = [ (path, [f for f in files if check_file(f)]) @@ -100,13 +100,21 @@ def build_setuptools(): maintainer="Mike Fährmann", maintainer_email="mike_faehrmann@web.de", license="GPLv2", - python_requires=">=3.4", + python_requires=">=3.8", install_requires=[ "requests>=2.11.0", ], extras_require={ "video": [ - "youtube-dl", + "yt-dlp", + ], + "extra": [ + "requests[socks]", + "yt-dlp[default]", + "pyyaml", + "toml; python_version < '3.11'", + "truststore; python_version >= '3.10'", + "secretstorage; sys_platform == 'linux'", ], }, entry_points={ @@ -127,10 +135,6 @@ def build_setuptools(): "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.4", - "Programming Language :: Python :: 3.5", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", diff --git a/test/test_config.py b/test/test_config.py index be58456..5c94b1b 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright 2015-2023 Mike Fährmann +# Copyright 2015-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -223,8 +223,7 @@ class TestConfigFiles(unittest.TestCase): self.assertIsInstance(cfg, dict) self.assertTrue(cfg) - @staticmethod - def _load(name): + def _load(self, name): path = os.path.join(ROOTDIR, "docs", name) try: with open(path) as fp: diff --git a/test/test_cookies.py b/test/test_cookies.py index 9ba562c..5900473 100644 --- a/test/test_cookies.py +++ b/test/test_cookies.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright 2017-2023 Mike Fährmann +# Copyright 2017-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -14,6 +14,7 @@ from unittest import mock import time import logging +import datetime import tempfile from os.path import join @@ -70,8 +71,7 @@ class TestCookiejar(unittest.TestCase): self.assertEqual(len(cookies), 0) self.assertEqual(mock_warning.call_count, 1) - self.assertEqual(mock_warning.call_args[0][0], "cookies: %s") - self.assertIsInstance(mock_warning.call_args[0][1], exc) + self.assertIsInstance(mock_warning.call_args[0][-1], exc) class TestCookiedict(unittest.TestCase): @@ -205,27 +205,32 @@ class 
TestCookieUtils(unittest.TestCase): now = int(time.time()) log = logging.getLogger("generic") - extr.cookies.set("a", "1", expires=now-100) + extr.cookies.set("a", "1", expires=now-100, domain=".example.org") with mock.patch.object(log, "warning") as mw: self.assertFalse(extr.cookies_check(("a",))) self.assertEqual(mw.call_count, 1) - self.assertEqual(mw.call_args[0], ("Cookie '%s' has expired", "a")) + self.assertEqual(mw.call_args[0], ( + "cookies: %s/%s expired at %s", "example.org", "a", + datetime.datetime.fromtimestamp(now-100))) - extr.cookies.set("a", "1", expires=now+100) + extr.cookies.set("a", "1", expires=now+100, domain=".example.org") with mock.patch.object(log, "warning") as mw: self.assertTrue(extr.cookies_check(("a",))) self.assertEqual(mw.call_count, 1) self.assertEqual(mw.call_args[0], ( - "Cookie '%s' will expire in less than %s hour%s", "a", 1, "")) + "cookies: %s/%s will expire in less than %s hour%s", + "example.org", "a", 1, "")) - extr.cookies.set("a", "1", expires=now+100+7200) + extr.cookies.set("a", "1", expires=now+100+7200, domain=".example.org") with mock.patch.object(log, "warning") as mw: self.assertTrue(extr.cookies_check(("a",))) self.assertEqual(mw.call_count, 1) self.assertEqual(mw.call_args[0], ( - "Cookie '%s' will expire in less than %s hour%s", "a", 3, "s")) + "cookies: %s/%s will expire in less than %s hour%s", + "example.org", "a", 3, "s")) - extr.cookies.set("a", "1", expires=now+100+24*3600) + extr.cookies.set( + "a", "1", expires=now+100+24*3600, domain=".example.org") with mock.patch.object(log, "warning") as mw: self.assertTrue(extr.cookies_check(("a",))) self.assertEqual(mw.call_count, 0) diff --git a/test/test_downloader.py b/test/test_downloader.py index 5a9a20b..3e5bf84 100644 --- a/test/test_downloader.py +++ b/test/test_downloader.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright 2018-2022 Mike Fährmann +# Copyright 2018-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -163,7 +163,7 @@ class TestDownloaderConfig(unittest.TestCase): self.assertEqual(dl.timeout, 10) self.assertEqual(dl.verify, False) self.assertEqual(dl.mtime, False) - self.assertEqual(dl.rate, 42) + self.assertEqual(dl.rate(), 42) self.assertEqual(dl.part, False) @@ -332,7 +332,7 @@ class HttpRequestHandler(http.server.BaseHTTPRequestHandler): status = 206 match = re.match(r"bytes=(\d+)-", self.headers["Range"]) - start = int(match.group(1)) + start = int(match[1]) headers["Content-Range"] = "bytes {}-{}/{}".format( start, len(output)-1, len(output)) @@ -369,6 +369,8 @@ SAMPLES = { ("heic", b"????ftypheis"), ("heic", b"????ftypheix"), ("svg" , b"..."), + ("html", b" \n \n\r\t\n ..."), ("ico" , b"\x00\x00\x01\x00"), ("cur" , b"\x00\x00\x02\x00"), ("psd" , b"8BPS"), diff --git a/test/test_extractor.py b/test/test_extractor.py index dfc5ff8..bf4aa07 100644 --- a/test/test_extractor.py +++ b/test/test_extractor.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright 2018-2023 Mike Fährmann +# Copyright 2018-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -122,8 +122,8 @@ class TestExtractorModule(unittest.TestCase): extr = cls.from_url(url) except ImportError as exc: if exc.name in ("youtube_dl", "yt_dlp"): - print("Skipping '{}' category checks".format(cls.category)) - return + return 
sys.stdout.write( + f"Skipping '{cls.category}' category checks\n") raise self.assertTrue(extr, url) @@ -138,46 +138,8 @@ class TestExtractorModule(unittest.TestCase): self.assertEqual(extr.subcategory, sub, url) self.assertEqual(extr.basecategory, base, url) - @unittest.skipIf(not results, "no test data") - def test_unique_pattern_matches(self): - # collect testcase URLs - test_urls = [] - append = test_urls.append - - for result in results.all(): - if not result.get("#fail"): - append((result["#url"], result["#class"])) - - # iterate over all testcase URLs - for url, extr1 in test_urls: - matches = [] - - # ... and apply all regex patterns to each one - for extr2 in _list_classes(): - - # skip DirectlinkExtractor pattern if it isn't tested - if extr1 != DirectlinkExtractor and \ - extr2 == DirectlinkExtractor: - continue - - match = extr2.pattern.match(url) - if match: - matches.append((match, extr2)) - - # fail if more or less than 1 match happened - if len(matches) > 1: - msg = "'{}' gets matched by more than one pattern:".format(url) - for match, extr in matches: - msg += "\n\n- {}:\n{}".format( - extr.__name__, match.re.pattern) - self.fail(msg) - - elif len(matches) < 1: - msg = "'{}' isn't matched by any pattern".format(url) - self.fail(msg) - - else: - self.assertIs(extr1, matches[0][1], url) + if base not in ("reactor", "wikimedia"): + self.assertEqual(extr._cfgpath, ("extractor", cat, sub), url) def test_init(self): """Test for exceptions in Extractor.initialize() and .finalize()""" @@ -188,14 +150,16 @@ class TestExtractorModule(unittest.TestCase): if cls.category == "ytdl": continue extr = cls.from_url(cls.example) - if not extr and cls.basecategory and not cls.instances: - continue + if not extr: + if cls.basecategory and not cls.instances: + continue + self.fail(f"{cls.__name__} pattern does not match " + f"example URL '{cls.example}'") extr.request = fail_request extr.initialize() extr.finalize() - @unittest.skipIf(sys.hexversion < 0x3060000, "test fails in CI") def test_init_ytdl(self): try: extr = extractor.find("ytdl:") @@ -293,8 +257,7 @@ class TestExtractorWait(unittest.TestCase): u = self._isotime_to_seconds(until.time().isoformat()[:8]) self.assertLessEqual(o-u, 1.0) - @staticmethod - def _isotime_to_seconds(isotime): + def _isotime_to_seconds(self, isotime): parts = isotime.split(":") return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2]) diff --git a/test/test_formatter.py b/test/test_formatter.py index 646f179..3305983 100644 --- a/test/test_formatter.py +++ b/test/test_formatter.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright 2021-2023 Mike Fährmann +# Copyright 2021-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -15,11 +15,19 @@ import datetime import tempfile sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from gallery_dl import formatter, text, util # noqa E402 +from gallery_dl import formatter, text, util, config # noqa E402 + +try: + import jinja2 +except ImportError: + jinja2 = None class TestFormatter(unittest.TestCase): + def tearDown(self): + config.clear() + kwdict = { "a": "hElLo wOrLd", "b": "äöü", @@ -27,16 +35,23 @@ class TestFormatter(unittest.TestCase): "d": {"a": "foo", "b": 0, "c": None}, "i": 2, "l": ["a", "b", "c"], + "L": [ + {"name": "John Doe" , "age": 42, "email": "jd@example.org"}, + {"name": "Jane Smith" , "age": 24, "email": None}, + {"name": "Max 
Mustermann", "age": False}, + ], "n": None, "s": " \n\r\tSPACE ", + "S": " \n\r\tS P A\tC\nE ", "h": "
<p>foo </p> &amp; bar </p>
", "u": "'< / >'", "t": 1262304000, - "ds": "2010-01-01T01:00:00+0100", + "ds": "2010-01-01T01:00:00+01:00", "dt": datetime.datetime(2010, 1, 1), "dt_dst": datetime.datetime(2010, 6, 1), "i_str": "12345", "f_str": "12.45", + "lang": "en", "name": "Name", "title1": "Title", "title2": "", @@ -50,6 +65,7 @@ class TestFormatter(unittest.TestCase): self._run_test("{a!c}", "Hello world") self._run_test("{a!C}", "Hello World") self._run_test("{s!t}", "SPACE") + self._run_test("{S!t}", "S P A\tC\nE") self._run_test("{a!U}", self.kwdict["a"]) self._run_test("{u!U}", "'< / >'") self._run_test("{a!H}", self.kwdict["a"]) @@ -65,13 +81,22 @@ class TestFormatter(unittest.TestCase): self._run_test("{n!S}", "") self._run_test("{t!d}", datetime.datetime(2010, 1, 1)) self._run_test("{t!d:%Y-%m-%d}", "2010-01-01") + self._run_test("{t!D}" , datetime.datetime(2010, 1, 1)) + self._run_test("{ds!D}", datetime.datetime(2010, 1, 1)) + self._run_test("{dt!D}", datetime.datetime(2010, 1, 1)) + self._run_test("{t!D:%Y-%m-%d}", "2010-01-01") self._run_test("{dt!T}", "1262304000") self._run_test("{l!j}", '["a","b","c"]') self._run_test("{dt!j}", '"2010-01-01 00:00:00"') self._run_test("{a!g}", "hello-world") - self._run_test("{a!L}", 11) - self._run_test("{l!L}", 3) - self._run_test("{d!L}", 3) + self._run_test("{lang!L}", "English") + self._run_test("{'fr'!L}", "French") + self._run_test("{a!L}", None) + self._run_test("{a!n}", 11) + self._run_test("{l!n}", 3) + self._run_test("{d!n}", 3) + self._run_test("{s!W}", "SPACE") + self._run_test("{S!W}", "S P A C E") self._run_test("{i_str!i}", 12345) self._run_test("{i_str!f}", 12345.0) self._run_test("{f_str!f}", 12.45) @@ -201,7 +226,7 @@ class TestFormatter(unittest.TestCase): self._run_test("{j:[b:]}" , v) self._run_test("{j:[b::]}" , v) - def test_maxlen(self): + def test_specifier_maxlen(self): v = self.kwdict["a"] self._run_test("{a:L5/foo/}" , "foo") self._run_test("{a:L50/foo/}", v) @@ -209,7 +234,7 @@ class TestFormatter(unittest.TestCase): self._run_test("{a:L50/foo/>51}", "foo") self._run_test("{a:Lab/foo/}", "foo") - def test_join(self): + def test_specifier_join(self): self._run_test("{l:J}" , "abc") self._run_test("{l:J,}" , "a,b,c") self._run_test("{l:J,/}" , "a,b,c") @@ -221,7 +246,7 @@ class TestFormatter(unittest.TestCase): self._run_test("{a:J/}" , self.kwdict["a"]) self._run_test("{a:J, /}" , self.kwdict["a"]) - def test_replace(self): + def test_specifier_replace(self): self._run_test("{a:Rh/C/}" , "CElLo wOrLd") self._run_test("{a!l:Rh/C/}", "Cello world") self._run_test("{a!u:Rh/C/}", "HELLO WORLD") @@ -230,12 +255,12 @@ class TestFormatter(unittest.TestCase): self._run_test("{a!l:Rl//}" , "heo word") self._run_test("{name:Rame/othing/}", "Nothing") - def test_datetime(self): + def test_specifier_datetime(self): self._run_test("{ds:D%Y-%m-%dT%H:%M:%S%z}", "2010-01-01 00:00:00") - self._run_test("{ds:D%Y}", "2010-01-01T01:00:00+0100") + self._run_test("{ds:D%Y}", "2010-01-01T01:00:00+01:00") self._run_test("{l:D%Y}", "None") - def test_offset(self): + def test_specifier_offset(self): self._run_test("{dt:O 01:00}", "2010-01-01 01:00:00") self._run_test("{dt:O+02:00}", "2010-01-01 02:00:00") self._run_test("{dt:O-03:45}", "2009-12-31 20:15:00") @@ -246,7 +271,7 @@ class TestFormatter(unittest.TestCase): self._run_test("{ds:D%Y-%m-%dT%H:%M:%S%z/O1}", "2010-01-01 01:00:00") self._run_test("{t!d:O2}", "2010-01-01 02:00:00") - def test_offset_local(self): + def test_specifier_offset_local(self): ts = self.kwdict["dt"].replace( 
tzinfo=datetime.timezone.utc).timestamp() offset = time.localtime(ts).tm_gmtoff @@ -261,7 +286,7 @@ class TestFormatter(unittest.TestCase): self._run_test("{dt_dst:O}", str(dt)) self._run_test("{dt_dst:Olocal}", str(dt)) - def test_sort(self): + def test_specifier_sort(self): self._run_test("{l:S}" , "['a', 'b', 'c']") self._run_test("{l:Sa}", "['a', 'b', 'c']") self._run_test("{l:Sd}", "['c', 'b', 'a']") @@ -293,6 +318,19 @@ class TestFormatter(unittest.TestCase): with self.assertRaises(ValueError): self._run_test("{a:Xfoo/ */}", "hello wo *") + def test_specifier_map(self): + self._run_test("{L:Mname/}" , + "['John Doe', 'Jane Smith', 'Max Mustermann']") + self._run_test("{L:Mage/}" , + "[42, 24, False]") + + self._run_test("{a:Mname}", self.kwdict["a"]) + self._run_test("{n:Mname}", "None") + self._run_test("{title4:Mname}", "0") + + with self.assertRaises(ValueError): + self._run_test("{t:Mname", "") + def test_chain_special(self): # multiple replacements self._run_test("{a:Rh/C/RE/e/RL/l/}", "Cello wOrld") @@ -314,6 +352,9 @@ class TestFormatter(unittest.TestCase): # sort and join self._run_test("{a:S/J}", " ELLOdhlorw") + # map and join + self._run_test("{L:Mname/J-}", "John Doe-Jane Smith-Max Mustermann") + def test_separator(self): orig_separator = formatter._SEPARATOR try: @@ -420,7 +461,6 @@ class TestFormatter(unittest.TestCase): self._run_test("\fE name * 2 + ' ' + a", "{}{} {}".format( self.kwdict["name"], self.kwdict["name"], self.kwdict["a"])) - @unittest.skipIf(sys.hexversion < 0x3060000, "no fstring support") def test_fstring(self): self._run_test("\fF {a}", self.kwdict["a"]) self._run_test("\fF {name}{name} {a}", "{}{} {}".format( @@ -428,7 +468,6 @@ class TestFormatter(unittest.TestCase): self._run_test("\fF foo-'\"{a.upper()}\"'-bar", """foo-'"{}"'-bar""".format(self.kwdict["a"].upper())) - @unittest.skipIf(sys.hexversion < 0x3060000, "no fstring support") def test_template_fstring(self): with tempfile.TemporaryDirectory() as tmpdirname: path1 = os.path.join(tmpdirname, "tpl1") @@ -449,6 +488,90 @@ class TestFormatter(unittest.TestCase): with self.assertRaises(OSError): formatter.parse("\fTF /") + @unittest.skipIf(jinja2 is None, "no jinja2") + def test_jinja(self): + formatter.JinjaFormatter.env = None + + self._run_test("\fJ {{a}}", self.kwdict["a"]) + self._run_test("\fJ {{name}}{{name}} {{a}}", "{}{} {}".format( + self.kwdict["name"], self.kwdict["name"], self.kwdict["a"])) + self._run_test("\fJ foo-'\"{{a | upper}}\"'-bar", + """foo-'"{}"'-bar""".format(self.kwdict["a"].upper())) + + @unittest.skipIf(jinja2 is None, "no jinja2") + def test_template_jinja(self): + formatter.JinjaFormatter.env = None + + with tempfile.TemporaryDirectory() as tmpdirname: + path1 = os.path.join(tmpdirname, "tpl1") + path2 = os.path.join(tmpdirname, "tpl2") + + with open(path1, "w") as fp: + fp.write("{{a}}") + fmt1 = formatter.parse("\fTJ " + path1) + + with open(path2, "w") as fp: + fp.write("foo-'\"{{a | upper}}\"'-bar") + fmt2 = formatter.parse("\fTJ " + path2) + + self.assertEqual(fmt1.format_map(self.kwdict), self.kwdict["a"]) + self.assertEqual(fmt2.format_map(self.kwdict), + """foo-'"{}"'-bar""".format(self.kwdict["a"].upper())) + + with self.assertRaises(OSError): + formatter.parse("\fTJ /") + + @unittest.skipIf(jinja2 is None, "no jinja2") + def test_template_jinja_opts(self): + formatter.JinjaFormatter.env = None + + with tempfile.TemporaryDirectory() as tmpdirname: + path_filters = os.path.join(tmpdirname, "jinja_filters.py") + path_template = os.path.join(tmpdirname, 
"jinja_template.txt") + + config.set((), "jinja", { + "environment": { + "variable_start_string": "(((", + "variable_end_string" : ")))", + "keep_trailing_newline": True, + }, + "filters": path_filters, + }) + + with open(path_filters, "w") as fp: + fp.write(r""" +import re + +def datetime_format(value, format="%H:%M %d-%m-%y"): + return value.strftime(format) + +def sanitize(value): + return re.sub(r"\s+", " ", value.strip()) + +__filters__ = { + "dt_fmt": datetime_format, + "sanitize_whitespace": sanitize, +} +""") + + with open(path_template, "w") as fp: + fp.write("""\ +Present Day is ((( dt | dt_fmt("%B %d, %Y") ))) +Present Time is ((( dt | dt_fmt("%H:%M:%S") ))) + +Hello ((( s | sanitize_whitespace ))). +I hope there is enough "(((S|sanitize_whitespace)))" for you. +""") + fmt = formatter.parse("\fTJ " + path_template) + + self.assertEqual(fmt.format_map(self.kwdict), """\ +Present Day is January 01, 2010 +Present Time is 00:00:00 + +Hello SPACE. +I hope there is enough "S P A C E" for you. +""") + def test_module(self): with tempfile.TemporaryDirectory() as tmpdirname: path = os.path.join(tmpdirname, "testmod.py") @@ -488,10 +611,10 @@ def noarg(): fmt4 = formatter.parse("\fM " + path + ":lengths") self.assertEqual(fmt1.format_map(self.kwdict), "'Title' by Name") - self.assertEqual(fmt2.format_map(self.kwdict), "136") + self.assertEqual(fmt2.format_map(self.kwdict), "168") self.assertEqual(fmt3.format_map(self.kwdict), "'Title' by Name") - self.assertEqual(fmt4.format_map(self.kwdict), "136") + self.assertEqual(fmt4.format_map(self.kwdict), "168") with self.assertRaises(TypeError): self.assertEqual(fmt0.format_map(self.kwdict), "") diff --git a/test/test_job.py b/test/test_job.py index 3e6f85b..3aa28e8 100644 --- a/test/test_job.py +++ b/test/test_job.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright 2021-2023 Mike Fährmann +# Copyright 2021-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -282,7 +282,12 @@ class TestDataJob(TestJob): tjob = self.jobclass(extr, file=io.StringIO()) tjob.run() self.assertEqual( - tjob.data[-1], ("ZeroDivisionError", "division by zero")) + tjob.data[-1], + (-1, { + "error" : "ZeroDivisionError", + "message": "division by zero", + }) + ) def test_private(self): config.set(("output",), "private", True) @@ -364,7 +369,7 @@ class TestExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) self.user = {"id": 123, "name": "test"} - if match.group(1) == "self": + if match[1] == "self": self.user["self"] = self.user def items(self): diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py index 76e728c..2e39cc7 100644 --- a/test/test_postprocessor.py +++ b/test/test_postprocessor.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright 2019-2023 Mike Fährmann +# Copyright 2019-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -10,7 +10,7 @@ import os import sys import unittest -from unittest.mock import Mock, mock_open, patch +from unittest.mock import Mock, mock_open, patch, call import shutil import logging @@ -20,7 +20,7 @@ import collections from datetime import datetime sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from gallery_dl import extractor, output, path # noqa E402 +from gallery_dl import extractor, 
output, path, util # noqa E402 from gallery_dl import postprocessor, config # noqa E402 from gallery_dl.postprocessor.common import PostProcessor # noqa E402 @@ -209,7 +209,10 @@ class ExecTest(BasePostprocessorTest): self.pathfmt.realpath, self.pathfmt.realdirectory, self.pathfmt.filename), - shell=True) + shell=True, + creationflags=0, + start_new_session=False, + ) i.wait.assert_called_once_with() def test_command_list(self): @@ -231,8 +234,46 @@ class ExecTest(BasePostprocessorTest): self.pathfmt.realdirectory.upper(), ], shell=False, + creationflags=0, + start_new_session=False, ) + def test_command_many(self): + self._create({ + "commands": [ + "echo {} {_path} {_directory} {_filename} && rm {};", + ["~/script.sh", "{category}", "\fE _directory.upper()"], + ] + }) + + with patch("gallery_dl.util.Popen") as p: + i = Mock() + i.wait.return_value = 0 + p.return_value = i + self._trigger(("after",)) + + self.assertEqual(p.call_args_list, [ + call( + "echo {0} {0} {1} {2} && rm {0};".format( + self.pathfmt.realpath, + self.pathfmt.realdirectory, + self.pathfmt.filename), + shell=True, + creationflags=0, + start_new_session=False, + ), + call( + [ + os.path.expanduser("~/script.sh"), + self.pathfmt.kwdict["category"], + self.pathfmt.realdirectory.upper(), + ], + shell=False, + creationflags=0, + start_new_session=False, + ), + ]) + def test_command_returncode(self): self._create({ "command": "echo {}", @@ -264,6 +305,49 @@ class ExecTest(BasePostprocessorTest): self.assertTrue(p.called) self.assertFalse(i.wait.called) + @unittest.skipIf(util.WINDOWS, "not POSIX") + def test_session_posix(self): + self._create({ + "session": True, + "command": ["echo", "foobar"], + }) + + with patch("gallery_dl.util.Popen") as p: + i = Mock() + i.wait.return_value = 0 + p.return_value = i + self._trigger(("after",)) + + p.assert_called_once_with( + ["echo", "foobar"], + shell=False, + creationflags=0, + start_new_session=True, + ) + i.wait.assert_called_once_with() + + @unittest.skipIf(not util.WINDOWS, "not Windows") + def test_session_windows(self): + self._create({ + "session": True, + "command": ["echo", "foobar"], + }) + + with patch("gallery_dl.util.Popen") as p: + i = Mock() + i.wait.return_value = 0 + p.return_value = i + self._trigger(("after",)) + + import subprocess + p.assert_called_once_with( + ["echo", "foobar"], + shell=False, + creationflags=subprocess.CREATE_NEW_PROCESS_GROUP, + start_new_session=False, + ) + i.wait.assert_called_once_with() + class HashTest(BasePostprocessorTest): @@ -345,9 +429,7 @@ class MetadataTest(BasePostprocessorTest): path = self.pathfmt.realpath + ".JSON" m.assert_called_once_with(path, "w", encoding="utf-8") - if sys.hexversion >= 0x3060000: - # python 3.4 & 3.5 have random order without 'sort: True' - self.assertEqual(self._output(m), """{ + self.assertEqual(self._output(m), """{ "category": "test", "filename": "file", "extension": "ext", @@ -713,8 +795,7 @@ class MetadataTest(BasePostprocessorTest): } """) - @staticmethod - def _output(mock): + def _output(self, mock): return "".join( call[1][0] for call in mock.mock_calls @@ -727,32 +808,32 @@ class MtimeTest(BasePostprocessorTest): def test_mtime_datetime(self): self._create(None, {"date": datetime(1980, 1, 1)}) self._trigger() - self.assertEqual(self.pathfmt.kwdict["_mtime"], 315532800) + self.assertEqual(self.pathfmt.kwdict["_mtime_meta"], 315532800) def test_mtime_timestamp(self): self._create(None, {"date": 315532800}) self._trigger() - self.assertEqual(self.pathfmt.kwdict["_mtime"], 315532800) + 
@@ -345,9 +429,7 @@ class MetadataTest(BasePostprocessorTest):
         path = self.pathfmt.realpath + ".JSON"
         m.assert_called_once_with(path, "w", encoding="utf-8")
 
-        if sys.hexversion >= 0x3060000:
-            # python 3.4 & 3.5 have random order without 'sort: True'
-            self.assertEqual(self._output(m), """{
+        self.assertEqual(self._output(m), """{
     "category": "test",
     "filename": "file",
     "extension": "ext",
@@ -713,8 +795,7 @@ class MetadataTest(BasePostprocessorTest):
 }
 """)
 
-    @staticmethod
-    def _output(mock):
+    def _output(self, mock):
         return "".join(
             call[1][0]
             for call in mock.mock_calls
@@ -727,32 +808,32 @@ class MtimeTest(BasePostprocessorTest):
     def test_mtime_datetime(self):
         self._create(None, {"date": datetime(1980, 1, 1)})
         self._trigger()
-        self.assertEqual(self.pathfmt.kwdict["_mtime"], 315532800)
+        self.assertEqual(self.pathfmt.kwdict["_mtime_meta"], 315532800)
 
     def test_mtime_timestamp(self):
         self._create(None, {"date": 315532800})
         self._trigger()
-        self.assertEqual(self.pathfmt.kwdict["_mtime"], 315532800)
+        self.assertEqual(self.pathfmt.kwdict["_mtime_meta"], 315532800)
 
     def test_mtime_none(self):
         self._create(None, {"date": None})
         self._trigger()
-        self.assertNotIn("_mtime", self.pathfmt.kwdict)
+        self.assertNotIn("_mtime_meta", self.pathfmt.kwdict)
 
     def test_mtime_undefined(self):
         self._create(None, {})
         self._trigger()
-        self.assertNotIn("_mtime", self.pathfmt.kwdict)
+        self.assertNotIn("_mtime_meta", self.pathfmt.kwdict)
 
     def test_mtime_key(self):
         self._create({"key": "foo"}, {"foo": 315532800})
         self._trigger()
-        self.assertEqual(self.pathfmt.kwdict["_mtime"], 315532800)
+        self.assertEqual(self.pathfmt.kwdict["_mtime_meta"], 315532800)
 
     def test_mtime_value(self):
         self._create({"value": "{foo}"}, {"foo": 315532800})
         self._trigger()
-        self.assertEqual(self.pathfmt.kwdict["_mtime"], 315532800)
+        self.assertEqual(self.pathfmt.kwdict["_mtime_meta"], 315532800)
 
 
 class PythonTest(BasePostprocessorTest):
@@ -945,8 +1026,8 @@ class ZipTest(BasePostprocessorTest):
             self._trigger(("finalize",))
 
         self.assertEqual(pp.zfile.write.call_count, 3)
-        for call in pp.zfile.write.call_args_list:
-            args, kwargs = call
+        for call_args in pp.zfile.write.call_args_list:
+            args, kwargs = call_args
             self.assertEqual(len(args), 2)
             self.assertEqual(len(kwargs), 0)
             self.assertEqual(args[0], self.pathfmt.temppath)
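The renamed "_mtime_meta" key checked by the mtime tests above carries a Unix timestamp derived from a metadata field. A small illustration of that conversion (the helper below and the final os.utime() step are assumptions for clarity, not gallery-dl code):

    import os
    from datetime import datetime, timezone

    def mtime_from_metadata(kwdict, key="date"):
        value = kwdict.get(key)
        if value is None:
            return None  # leave "_mtime_meta" unset, as test_mtime_none expects
        if isinstance(value, datetime):
            value = int(value.replace(tzinfo=timezone.utc).timestamp())
        return value

    ts = mtime_from_metadata({"date": datetime(1980, 1, 1)})
    assert ts == 315532800
    # os.utime(path, (ts, ts))  # would apply it to the downloaded file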
diff --git a/test/test_results.py b/test/test_results.py
index 6e04e1d..4b1c4c1 100644
--- a/test/test_results.py
+++ b/test/test_results.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-# Copyright 2015-2023 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -80,9 +80,9 @@ class TestExtractorResults(unittest.TestCase):
     @classmethod
     def tearDownClass(cls):
         if cls._skipped:
-            print("\n\nSkipped tests:")
-            for url, exc in cls._skipped:
-                print('- {} ("{}")'.format(url, exc))
+            sys.stdout.write("\n\nSkipped tests:\n")
+            for url, reason in cls._skipped:
+                sys.stdout.write(f'- {url} ("{reason}")\n')
 
     def assertRange(self, value, range, msg=None):
         if range.step > 1:
@@ -91,6 +91,24 @@ class TestExtractorResults(unittest.TestCase):
         self.assertLessEqual(value, range.stop, msg=msg)
         self.assertGreaterEqual(value, range.start, msg=msg)
 
+    def assertLogEqual(self, expected, output):
+        if isinstance(expected, str):
+            expected = (expected,)
+        self.assertEqual(len(expected), len(output), "#log/count")
+
+        for exp, out in zip(expected, output):
+            level, name, message = out.split(":", 2)
+
+            if isinstance(exp, str):
+                return self.assertEqual(exp, message, "#log")
+
+            self.assertEqual(exp[0].lower(), level.lower(), "#log/level")
+            if len(exp) < 3:
+                self.assertEqual(exp[1], message, "#log/message")
+            else:
+                self.assertEqual(exp[1], name,    "#log/name")
+                self.assertEqual(exp[2], message, "#log/message")
+
     def _run_test(self, result):
         if result.get("#fail"):
             del result["#fail"]
@@ -145,7 +163,11 @@ class TestExtractorResults(unittest.TestCase):
                 return
 
         try:
-            tjob.run()
+            if "#log" in result:
+                with self.assertLogs() as log_info:
+                    tjob.run()
+            else:
+                tjob.run()
         except exception.StopExtraction:
             pass
         except exception.HttpError as exc:
@@ -156,6 +178,9 @@ class TestExtractorResults(unittest.TestCase):
                 self.skipTest(exc)
             raise
 
+        if "#log" in result:
+            self.assertLogEqual(result["#log"], log_info.output)
+
         if result.get("#archive", True):
             self.assertEqual(
                 len(set(tjob.archive_list)),
@@ -220,13 +245,15 @@ class TestExtractorResults(unittest.TestCase):
             for url, pat in zip(tjob.url_list, pattern):
                 self.assertRegex(url, pat, msg="#pattern")
 
-        if "#urls" in result:
-            expected = result["#urls"]
+        if "#results" in result:
+            expected = result["#results"]
             if isinstance(expected, str):
-                self.assertTrue(tjob.url_list, msg="#urls")
-                self.assertEqual(tjob.url_list[0], expected, msg="#urls")
+                self.assertTrue(tjob.url_list, msg="#results")
+                self.assertEqual(
+                    tjob.url_list[0], expected, msg="#results")
             else:
-                self.assertSequenceEqual(tjob.url_list, expected, msg="#urls")
+                self.assertSequenceEqual(
+                    tjob.url_list, expected, msg="#results")
 
         metadata = {k: v for k, v in result.items() if k[0] != "#"}
         if metadata:
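For orientation, a hypothetical result dictionary using the keys this runner handles after the rename ("#results" replaces "#urls", and "#log" is checked against assertLogs output). Every value below is invented for illustration, not a real test case from the suite:

    example_result = {
        "#url"    : "https://example.org/gallery/123",   # made-up input URL
        "#comment": "printed before the test runs",
        "#results": (                      # expected file URLs, in order
            "https://example.org/files/1.jpg",
            "https://example.org/files/2.jpg",
        ),
        # a plain string checks only the message; a (level, message) pair
        # also checks the level, and (level, name, message) the logger name
        "#log"    : ("warning", "some expected warning message"),
    }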
@@ -235,56 +262,74 @@ class TestExtractorResults(unittest.TestCase):
     def _test_kwdict(self, kwdict, tests, parent=None):
         for key, test in tests.items():
+
             if key.startswith("?"):
                 key = key[1:]
                 if key not in kwdict:
                     continue
+            if key.endswith("[*]"):
+                key = key[:-3]
+                subtest = True
+            else:
+                subtest = False
+
             path = "{}.{}".format(parent, key) if parent else key
+
             if key.startswith("!"):
                 self.assertNotIn(key[1:], kwdict, msg=path)
                 continue
+
             self.assertIn(key, kwdict, msg=path)
             value = kwdict[key]
 
-            if isinstance(test, dict):
-                self._test_kwdict(value, test, path)
-            elif isinstance(test, type):
-                self.assertIsInstance(value, test, msg=path)
-            elif isinstance(test, range):
-                self.assertRange(value, test, msg=path)
-            elif isinstance(test, set):
-                try:
-                    self.assertIn(value, test, msg=path)
-                except AssertionError:
-                    self.assertIn(type(value), test, msg=path)
-            elif isinstance(test, list):
-                subtest = False
-                for idx, item in enumerate(test):
-                    if isinstance(item, dict):
-                        subtest = True
-                        subpath = "{}[{}]".format(path, idx)
-                        self._test_kwdict(value[idx], item, subpath)
-                if not subtest:
-                    self.assertEqual(test, value, msg=path)
-            elif isinstance(test, str):
-                if test.startswith("re:"):
-                    self.assertRegex(value, test[3:], msg=path)
-                elif test.startswith("dt:"):
-                    self.assertIsInstance(value, datetime.datetime, msg=path)
-                    self.assertEqual(test[3:], str(value), msg=path)
-                elif test.startswith("type:"):
-                    self.assertEqual(test[5:], type(value).__name__, msg=path)
-                elif test.startswith("len:"):
-                    cls, _, length = test[4:].rpartition(":")
-                    if cls:
-                        self.assertEqual(
-                            cls, type(value).__name__, msg=path + "/type")
-                    self.assertEqual(int(length), len(value), msg=path)
-                else:
-                    self.assertEqual(test, value, msg=path)
+            if subtest:
+                self.assertNotIsInstance(value, str, msg=path)
+                for idx, item in enumerate(value):
+                    subpath = "{}[{}]".format(path, idx)
+                    self._test_kwdict_value(item, test, subpath)
             else:
-                self.assertEqual(test, value, msg=path)
+                self._test_kwdict_value(value, test, path)
+
+    def _test_kwdict_value(self, value, test, path):
+        if isinstance(test, dict):
+            self._test_kwdict(value, test, path)
+        elif isinstance(test, type):
+            self.assertIsInstance(value, test, msg=path)
+        elif isinstance(test, range):
+            self.assertRange(value, test, msg=path)
+        elif isinstance(test, set):
+            try:
+                self.assertIn(value, test, msg=path)
+            except AssertionError:
+                self.assertIn(type(value), test, msg=path)
+        elif isinstance(test, list):
+            subtest = False
+            for idx, item in enumerate(test):
+                if isinstance(item, dict):
+                    subtest = True
+                    subpath = "{}[{}]".format(path, idx)
+                    self._test_kwdict(value[idx], item, subpath)
+            if not subtest:
+                self.assertEqual(test, value, msg=path)
+        elif isinstance(test, str):
+            if test.startswith("re:"):
+                self.assertRegex(value, test[3:], msg=path)
+            elif test.startswith("dt:"):
+                self.assertIsInstance(value, datetime.datetime, msg=path)
+                self.assertEqual(test[3:], str(value), msg=path)
+            elif test.startswith("type:"):
+                self.assertEqual(test[5:], type(value).__name__, msg=path)
+            elif test.startswith("len:"):
+                cls, _, length = test[4:].rpartition(":")
+                if cls:
+                    self.assertEqual(
+                        cls, type(value).__name__, msg=path + "/type")
+                self.assertEqual(int(length), len(value), msg=path)
+            else:
+                self.assertEqual(test, value, msg=path)
+        else:
+            self.assertEqual(test, value, msg=path)
 
 
 class ResultJob(job.DownloadJob):
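The refactored _test_kwdict() recognizes several key prefixes and suffixes. A hypothetical metadata-test dictionary showing that syntax (keys and values invented for illustration):

    tests = {
        "id"     : int,             # plain type check
        "?title" : str,             # "?": only checked if the key is present
        "!secret": None,            # "!": the key must NOT be present
        "date"   : "dt:2010-01-01 00:00:00",  # datetime in string form
        "tags[*]": str,             # "[*]": apply the test to every element
        "width"  : range(1, 4000),  # numeric range check
    }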
" + self.assertEqual(f(txt, "<", ">"), "b") + self.assertEqual(f(txt, "X", ">"), "") + self.assertEqual(f(txt, "<", "X"), "") + + # 'pos' argument + for i in range(10, 3, -1): + self.assertEqual(f(txt, "<", ">", i), "b") + for i in range(3, 0, -1): + self.assertEqual(f(txt, "<", ">", i), "a") + + # 'default' argument + self.assertEqual(f(txt, "[", "]", -1, "none"), "none") + self.assertEqual(f(txt, "[", "]", None, "none"), "none") + self.assertEqual(f(txt, "[", "]", default="none"), "none") + + # invalid arguments + for value in INVALID: + self.assertEqual(f(value, "<" , ">") , "") + self.assertEqual(f(txt , value, ">") , "") + self.assertEqual(f(txt , "<" , value), "") + def test_extract_all(self, f=text.extract_all): txt = "[c][b][a]: xyz! [d][e" @@ -336,6 +384,8 @@ class TestText(unittest.TestCase): ) def test_parse_bytes(self, f=text.parse_bytes): + self.assertEqual(f(0), 0) + self.assertEqual(f(50), 50) self.assertEqual(f("0"), 0) self.assertEqual(f("50"), 50) self.assertEqual(f("50k"), 50 * 1024**1) @@ -343,10 +393,13 @@ class TestText(unittest.TestCase): self.assertEqual(f("50g"), 50 * 1024**3) self.assertEqual(f("50t"), 50 * 1024**4) self.assertEqual(f("50p"), 50 * 1024**5) + self.assertEqual(f(" 50p "), 50 * 1024**5) # fractions + self.assertEqual(f(123.456), 123) self.assertEqual(f("123.456"), 123) self.assertEqual(f("123.567"), 124) + self.assertEqual(f(" 123.89 "), 124) self.assertEqual(f("0.5M"), round(0.5 * 1024**2)) # invalid arguments @@ -405,8 +458,12 @@ class TestText(unittest.TestCase): # missing value self.assertEqual(f("bar"), {}) + self.assertEqual(f("bar="), {"bar": ""}) self.assertEqual(f("foo=1&bar"), {"foo": "1"}) + self.assertEqual(f("foo=1&bar="), {"foo": "1", "bar": ""}) self.assertEqual(f("foo=1&bar&baz=3"), {"foo": "1", "baz": "3"}) + self.assertEqual(f("foo=1&bar=&baz=3"), + {"foo": "1", "bar": "", "baz": "3"}) # keys with identical names self.assertEqual(f("foo=1&foo=2"), {"foo": "1"}) @@ -424,6 +481,8 @@ class TestText(unittest.TestCase): self.assertEqual(f(""), {}) self.assertEqual(f("foo=1"), {"foo": "1"}) self.assertEqual(f("foo=1&bar=2"), {"foo": "1", "bar": "2"}) + self.assertEqual(f("%C3%A4%26=%E3%81%82%E3%81%A8&%23=%3F"), + {"ä&": "あと", "#": "?"}) # missing value self.assertEqual(f("bar"), {}) @@ -441,6 +500,21 @@ class TestText(unittest.TestCase): for value in INVALID: self.assertEqual(f(value), {}) + def test_build_query(self, f=text.build_query): + # standard usage + self.assertEqual(f({}), "") + self.assertEqual(f({"foo": "1"}), "foo=1") + self.assertEqual(f({"foo": "1", "bar": "2"}), "foo=1&bar=2") + + # missing value + self.assertEqual(f({"bar": ""}), "bar=") + self.assertEqual(f({"foo": "1", "bar": ""}), "foo=1&bar=") + self.assertEqual(f({"foo": "1", "bar": "", "baz": "3"}), + "foo=1&bar=&baz=3") + + self.assertEqual(f({"ä&": "あと", "#": "?"}), + "%C3%A4%26=%E3%81%82%E3%81%A8&%23=%3F") + def test_parse_timestamp(self, f=text.parse_timestamp): null = util.datetime_utcfromtimestamp(0) value = util.datetime_utcfromtimestamp(1555816235) diff --git a/test/test_util.py b/test/test_util.py index 27f78ec..00e8c4b 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright 2015-2023 Mike Fährmann +# Copyright 2015-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -10,6 +10,7 @@ import os import sys import unittest +from unittest.mock import patch import io import time @@ 
diff --git a/test/test_util.py b/test/test_util.py
index 27f78ec..00e8c4b 100644
--- a/test/test_util.py
+++ b/test/test_util.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-# Copyright 2015-2023 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -10,6 +10,7 @@
 import os
 import sys
 import unittest
+from unittest.mock import patch
 
 import io
 import time
@@ -27,11 +28,18 @@ from gallery_dl import util, text, exception  # noqa E402
 
 class TestRange(unittest.TestCase):
 
-    def test_parse_empty(self, f=util.RangePredicate._parse):
+    def setUp(self):
+        self.predicate = util.RangePredicate("")
+
+    def test_parse_empty(self):
+        f = self.predicate._parse
+
         self.assertEqual(f(""), [])
         self.assertEqual(f([]), [])
 
-    def test_parse_digit(self, f=util.RangePredicate._parse):
+    def test_parse_digit(self):
+        f = self.predicate._parse
+
         self.assertEqual(f("2"), [range(2, 3)])
 
         self.assertEqual(
@@ -41,7 +49,9 @@ class TestRange(unittest.TestCase):
              range(4, 5)],
         )
 
-    def test_parse_range(self, f=util.RangePredicate._parse):
+    def test_parse_range(self):
+        f = self.predicate._parse
+
         self.assertEqual(f("1-2"), [range(1, 3)])
         self.assertEqual(f("2-"), [range(2, sys.maxsize)])
         self.assertEqual(f("-3"), [range(1, 4)])
@@ -61,7 +71,9 @@ class TestRange(unittest.TestCase):
              range(2, 7)],
         )
 
-    def test_parse_slice(self, f=util.RangePredicate._parse):
+    def test_parse_slice(self):
+        f = self.predicate._parse
+
         self.assertEqual(f("2:4")  , [range(2, 4)])
         self.assertEqual(f("3::")  , [range(3, sys.maxsize)])
         self.assertEqual(f(":4:")  , [range(1, 4)])
@@ -149,6 +161,10 @@ class TestPredicate(unittest.TestCase):
 
         self.assertFalse(pred(url, {"a": 2}))
 
+        pred = util.FilterPredicate("re.search(r'.+', url)")
+        self.assertTrue(pred(url, {"url": "https://example.org/"}))
+        self.assertFalse(pred(url, {"url": ""}))
+
     def test_build_predicate(self):
         pred = util.build_predicate([])
         self.assertIsInstance(pred, type(lambda: True))
@@ -390,6 +406,89 @@ def hash(value):
         self.assertEqual(expr(value), result)
 
 
+class TestDatetime(unittest.TestCase):
+
+    def test_to_datetime(self, f=util.to_datetime):
+
+        def _assert(value, expected):
+            result = f(value)
+            self.assertIsInstance(result, datetime.datetime)
+            self.assertEqual(result, expected, msg=repr(value))
+
+        dt = datetime.datetime(2010, 1, 1)
+        self.assertIs(f(dt), dt)
+
+        _assert(dt            , dt)
+        _assert(1262304000    , dt)
+        _assert(1262304000.0  , dt)
+        _assert(1262304000.123, dt)
+        _assert("1262304000"  , dt)
+
+        _assert("2010-01-01"                      , dt)
+        _assert("2010-01-01 00:00:00"             , dt)
+        _assert("2010-01-01T00:00:00"             , dt)
+        _assert("2010-01-01T00:00:00.123456"      , dt)
+        _assert("2009-12-31T19:00:00-05:00"       , dt)
+        _assert("2009-12-31T19:00:00.123456-05:00", dt)
+        _assert("2010-01-01T00:00:00Z"            , dt)
+        _assert("2010-01-01T00:00:00.123456Z"     , dt)
+
+        _assert(0    , util.EPOCH)
+        _assert(""   , util.EPOCH)
+        _assert("foo", util.EPOCH)
+        _assert(None , util.EPOCH)
+        _assert(()   , util.EPOCH)
+        _assert([]   , util.EPOCH)
+        _assert({}   , util.EPOCH)
+        _assert((1, 2, 3), util.EPOCH)
+
+    @unittest.skipIf(sys.hexversion < 0x30b0000,
+                     "extended fromisoformat timezones")
+    def test_to_datetime_tz(self, f=util.to_datetime):
+
+        def _assert(value, expected):
+            result = f(value)
+            self.assertIsInstance(result, datetime.datetime)
+            self.assertEqual(result, expected, msg=repr(value))
+
+        dt = datetime.datetime(2010, 1, 1)
+
+        _assert("2009-12-31T19:00:00-05"          , dt)
+        _assert("2009-12-31T19:00:00-0500"        , dt)
+        _assert("2009-12-31T19:00:00.123456-05"   , dt)
+        _assert("2009-12-31T19:00:00.123456-0500" , dt)
+
+    def test_datetime_to_timestamp(self, f=util.datetime_to_timestamp):
+        self.assertEqual(f(util.EPOCH), 0.0)
+        self.assertEqual(f(datetime.datetime(2010, 1, 1)), 1262304000.0)
+        self.assertEqual(f(datetime.datetime(2010, 1, 1, 0, 0, 0, 128000)),
+                         1262304000.128000)
+        with self.assertRaises(TypeError):
+            f(None)
+
+    def test_datetime_to_timestamp_string(
+            self, f=util.datetime_to_timestamp_string):
+        self.assertEqual(f(util.EPOCH), "0")
+        self.assertEqual(f(datetime.datetime(2010, 1, 1)), "1262304000")
+        self.assertEqual(f(None), "")
+
+    def test_datetime_from_timestamp(
+            self, f=util.datetime_from_timestamp):
+        self.assertEqual(f(0.0), util.EPOCH)
+        self.assertEqual(f(1262304000.0), datetime.datetime(2010, 1, 1))
+        self.assertEqual(f(1262304000.128000).replace(microsecond=0),
+                         datetime.datetime(2010, 1, 1, 0, 0, 0))
+
+    def test_datetime_utcfromtimestamp(
+            self, f=util.datetime_utcfromtimestamp):
+        self.assertEqual(f(0.0), util.EPOCH)
+        self.assertEqual(f(1262304000.0), datetime.datetime(2010, 1, 1))
+
+    def test_datetime_utcnow(
+            self, f=util.datetime_utcnow):
+        self.assertIsInstance(f(), datetime.datetime)
+
+
 class TestOther(unittest.TestCase):
 
     def test_bencode(self):
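A simplified, assumed re-implementation of the to_datetime() normalization these tests pin down; the real gallery_dl.util version may differ in details such as microsecond handling and "Z" suffix support on older Pythons:

    import datetime

    EPOCH = datetime.datetime(1970, 1, 1)

    def to_datetime(value):
        if isinstance(value, datetime.datetime):
            return value
        try:
            if isinstance(value, str) and not value.isdecimal():
                dt = datetime.datetime.fromisoformat(value)
                if dt.tzinfo is not None:
                    # normalize aware datetimes to naive UTC
                    dt = dt.astimezone(datetime.timezone.utc).replace(tzinfo=None)
                return dt
            # numbers and digit strings are Unix timestamps
            return EPOCH + datetime.timedelta(seconds=int(float(value)))
        except (TypeError, ValueError):
            return EPOCH  # anything unparseable falls back to the epoch

    assert to_datetime(1262304000) == datetime.datetime(2010, 1, 1)
    assert to_datetime("2010-01-01T00:00:00") == datetime.datetime(2010, 1, 1)
    assert to_datetime("foo") == EPOCH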
@@ -492,6 +591,7 @@ class TestOther(unittest.TestCase):
 
     def test_noop(self):
         self.assertEqual(util.noop(), None)
+        self.assertEqual(util.noop(...), None)
 
     def test_md5(self):
         self.assertEqual(util.md5(b""),
@@ -552,17 +652,21 @@ value = 123
         self.assertEqual(module.value, 123)
         self.assertIs(module.datetime, datetime)
 
-    def test_build_duration_func(self, f=util.build_duration_func):
+    def test_build_selection_func(self, f=util.build_selection_func):
 
-        def test_single(df, v):
+        def test_single(df, v, type=None):
             for _ in range(10):
                 self.assertEqual(df(), v)
+                if type is not None:
+                    self.assertIsInstance(df(), type)
 
-        def test_range(df, lower, upper):
+        def test_range(df, lower, upper, type=None):
             for __ in range(10):
                 v = df()
                 self.assertGreaterEqual(v, lower)
                 self.assertLessEqual(v, upper)
+                if type is not None:
+                    self.assertIsInstance(v, type)
 
         for v in (0, 0.0, "", None, (), []):
             self.assertIsNone(f(v))
@@ -570,16 +674,24 @@ value = 123
         for v in (0, 0.0, "", None, (), []):
             test_single(f(v, 1.0), 1.0)
 
-        test_single(f(3), 3)
-        test_single(f(3.0), 3.0)
-        test_single(f("3"), 3)
-        test_single(f("3.0-"), 3)
-        test_single(f(" 3 -"), 3)
+        test_single(f(3)      , 3  , float)
+        test_single(f(3.0)    , 3.0, float)
+        test_single(f("3")    , 3  , float)
+        test_single(f("3.0-") , 3  , float)
+        test_single(f(" 3 -") , 3  , float)
 
-        test_range(f((2, 4)), 2, 4)
-        test_range(f([2, 4]), 2, 4)
-        test_range(f("2-4"), 2, 4)
-        test_range(f(" 2.0 - 4 "), 2, 4)
+        test_range(f((2, 4))     , 2, 4, float)
+        test_range(f([2.0, 4.0]) , 2, 4, float)
+        test_range(f("2-4")      , 2, 4, float)
+        test_range(f(" 2.0 - 4 "), 2, 4, float)
+
+        pb = text.parse_bytes
+        test_single(f("3", 0, pb)    , 3, int)
+        test_single(f("3.0-", 0, pb) , 3, int)
+        test_single(f(" 3 -", 0, pb) , 3, int)
+
+        test_range(f("2k-4k", 0, pb)      , 2048, 4096, int)
+        test_range(f(" 2.0k - 4k ", 0, pb), 2048, 4096, int)
 
     def test_extractor_filter(self):
         # empty
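Assumed usage of the renamed util.build_selection_func (formerly build_duration_func), mirroring what the test exercises: a spec such as "2-4" yields a callable that picks a value from that range, and an optional conversion like text.parse_bytes lets "2k-4k" map to byte counts:

    from gallery_dl import text, util

    pick = util.build_selection_func("2-4")
    value = pick()                   # float somewhere in [2.0, 4.0]
    assert 2.0 <= value <= 4.0

    pick = util.build_selection_func("2k-4k", 0, text.parse_bytes)
    size = pick()                    # int somewhere in [2048, 4096]
    assert 2048 <= size <= 4096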
@@ -765,40 +877,16 @@ value = 123
         self.assertEqual(f(["a", "b", "c"]), "a, b, c")
         self.assertEqual(f([1, 2, 3]), "1, 2, 3")
 
-    def test_datetime_to_timestamp(self, f=util.datetime_to_timestamp):
-        self.assertEqual(f(util.EPOCH), 0.0)
-        self.assertEqual(f(datetime.datetime(2010, 1, 1)), 1262304000.0)
-        self.assertEqual(f(datetime.datetime(2010, 1, 1, 0, 0, 0, 128000)),
-                         1262304000.128000)
-        with self.assertRaises(TypeError):
-            f(None)
-
-    def test_datetime_to_timestamp_string(
-            self, f=util.datetime_to_timestamp_string):
-        self.assertEqual(f(util.EPOCH), "0")
-        self.assertEqual(f(datetime.datetime(2010, 1, 1)), "1262304000")
-        self.assertEqual(f(None), "")
-
-    def test_datetime_from_timestamp(
-            self, f=util.datetime_from_timestamp):
-        self.assertEqual(f(0.0), util.EPOCH)
-        self.assertEqual(f(1262304000.0), datetime.datetime(2010, 1, 1))
-        self.assertEqual(f(1262304000.128000).replace(microsecond=0),
-                         datetime.datetime(2010, 1, 1, 0, 0, 0))
-
-    def test_datetime_utcfromtimestamp(
-            self, f=util.datetime_utcfromtimestamp):
-        self.assertEqual(f(0.0), util.EPOCH)
-        self.assertEqual(f(1262304000.0), datetime.datetime(2010, 1, 1))
-
-    def test_datetime_utcnow(
-            self, f=util.datetime_utcnow):
-        self.assertIsInstance(f(), datetime.datetime)
-
     def test_universal_none(self):
         obj = util.NONE
 
         self.assertFalse(obj)
+        self.assertEqual(obj, obj)
+        self.assertEqual(obj, None)
+        self.assertNotEqual(obj, False)
+        self.assertNotEqual(obj, 0)
+        self.assertNotEqual(obj, "")
+
         self.assertEqual(len(obj), 0)
         self.assertEqual(int(obj), 0)
         self.assertEqual(hash(obj), 0)
@@ -873,6 +961,26 @@ value = 123
             i += 1
         self.assertEqual(i, 0)
 
+    def test_HTTPBasicAuth(self, f=util.HTTPBasicAuth):
+        class Request:
+            headers = {}
+        request = Request()
+
+        auth = f("", "")
+        auth(request)
+        self.assertEqual(request.headers["Authorization"],
+                         b"Basic Og==")
+
+        f("foo", "bar")(request)
+        self.assertEqual(request.headers["Authorization"],
+                         b"Basic Zm9vOmJhcg==")
+
+        f("ewsxcvbhnjtr",
+          "RVXQ4i9Ju5ypi86VGJ8MqhDYpDKluS0sxiSRBAG7ymB3Imok")(request)
+        self.assertEqual(request.headers["Authorization"],
+                         b"Basic ZXdzeGN2YmhuanRyOlJWWFE0aTlKdTV5cGk4NlZHSjhNc"
+                         b"WhEWXBES2x1UzBzeGlTUkJBRzd5bUIzSW1vaw==")
+
     def test_module_proxy(self):
         proxy = util.ModuleProxy()
 
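The expected Authorization values above are ordinary RFC 7617 Basic credentials; for reference, how such a header value is computed:

    import base64

    def basic_auth_header(username, password):
        # "user:pass" encoded as UTF-8, then base64, prefixed with "Basic "
        credentials = f"{username}:{password}".encode()
        return b"Basic " + base64.b64encode(credentials)

    assert basic_auth_header("", "") == b"Basic Og=="
    assert basic_auth_header("foo", "bar") == b"Basic Zm9vOmJhcg=="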
@@ -887,6 +995,16 @@ value = 123
         self.assertIs(proxy["abc.def.ghi"], util.NONE)
         self.assertIs(proxy["os.path2"], util.NONE)
 
+    def test_lazy_prompt(self):
+        prompt = util.LazyPrompt()
+
+        with patch("getpass.getpass") as p:
+            p.return_value = "***"
+            result = str(prompt)
+
+        self.assertEqual(result, "***")
+        p.assert_called_once_with()
+
     def test_null_context(self):
         with util.NullContext():
             pass
@@ -901,6 +1019,28 @@ value = 123
         except ValueError as exc:
             self.assertIs(exc, exc_orig)
 
+    def test_null_response(self):
+        response = util.NullResponse("https://example.org")
+
+        self.assertEqual(response.url, "https://example.org")
+        self.assertEqual(response.status_code, 900)
+        self.assertEqual(response.reason, "")
+        self.assertEqual(response.text, "")
+        self.assertEqual(response.content, b"")
+        self.assertEqual(response.json(), {})
+
+        self.assertFalse(response.ok)
+        self.assertFalse(response.is_redirect)
+        self.assertFalse(response.is_permanent_redirect)
+        self.assertFalse(response.history)
+
+        self.assertEqual(response.encoding, "utf-8")
+        self.assertEqual(response.apparent_encoding, "utf-8")
+        self.assertEqual(response.cookies.get("foo"), None)
+        self.assertEqual(response.headers.get("foo"), None)
+        self.assertEqual(response.links.get("next"), None)
+        self.assertEqual(response.close(), None)
+
 
 class TestExtractor():
     category = "test_category"
diff --git a/test/test_ytdl.py b/test/test_ytdl.py
index f7eb671..ecc6d2f 100644
--- a/test/test_ytdl.py
+++ b/test/test_ytdl.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-# Copyright 2022-2023 Mike Fährmann
+# Copyright 2022-2025 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -26,6 +26,7 @@ class Test_CommandlineArguments(unittest.TestCase):
             raise unittest.SkipTest("cannot import module '{}'".format(
                 cls.module_name))
         cls.default = ytdl.parse_command_line(cls.module, [])
+        cls.ytdlp = hasattr(cls.module, "cookies")
 
     def test_ignore_errors(self):
         self._("--ignore-errors" , "ignoreerrors", True)
@@ -155,21 +156,21 @@ class Test_CommandlineArguments(unittest.TestCase):
     def test_subs(self):
         opts = self._(["--convert-subs", "srt"])
         conv = {"key": "FFmpegSubtitlesConvertor", "format": "srt"}
-        if self.module_name == "yt_dlp":
+        if self.ytdlp:
             conv["when"] = "before_dl"
         self.assertEqual(opts["postprocessors"][0], conv)
 
     def test_embed(self):
         subs = {"key": "FFmpegEmbedSubtitle"}
         thumb = {"key": "EmbedThumbnail", "already_have_thumbnail": False}
-        if self.module_name == "yt_dlp":
+        if self.ytdlp:
             subs["already_have_subtitle"] = False
 
         opts = self._(["--embed-subs", "--embed-thumbnail"])
         self.assertEqual(opts["postprocessors"][:2], [subs, thumb])
 
         thumb["already_have_thumbnail"] = True
-        if self.module_name == "yt_dlp":
+        if self.ytdlp:
             subs["already_have_subtitle"] = True
             thumb["already_have_thumbnail"] = "all"
 
@@ -212,7 +213,7 @@ class Test_CommandlineArguments(unittest.TestCase):
             "--ignore-config",
         ]
 
-        if self.module_name != "yt_dlp":
+        if not self.ytdlp:
             cmdline.extend((
                 "--dump-json",
                 "--dump-single-json",
-- 
cgit v1.2.3