-rw-r--r--  CHANGELOG.md | 45
-rw-r--r--  PKG-INFO | 31
-rw-r--r--  README.rst | 14
-rw-r--r--  data/completion/_gallery-dl | 12
-rw-r--r--  data/completion/gallery-dl | 2
-rw-r--r--  data/completion/gallery-dl.fish | 12
-rw-r--r--  data/man/gallery-dl.1 | 22
-rw-r--r--  data/man/gallery-dl.conf.5 | 1147
-rw-r--r--  docs/gallery-dl.conf | 134
-rw-r--r--  gallery_dl.egg-info/PKG-INFO | 31
-rw-r--r--  gallery_dl.egg-info/SOURCES.txt | 18
-rw-r--r--  gallery_dl.egg-info/requires.txt | 16
-rw-r--r--  gallery_dl/__init__.py | 142
-rw-r--r--  gallery_dl/actions.py | 82
-rw-r--r--  gallery_dl/aes.py | 6
-rw-r--r--  gallery_dl/cache.py | 8
-rw-r--r--  gallery_dl/config.py | 46
-rw-r--r--  gallery_dl/cookies.py | 69
-rw-r--r--  gallery_dl/downloader/common.py | 12
-rw-r--r--  gallery_dl/downloader/http.py | 114
-rw-r--r--  gallery_dl/downloader/ytdl.py | 52
-rw-r--r--  gallery_dl/exception.py | 132
-rw-r--r--  gallery_dl/extractor/2ch.py | 25
-rw-r--r--  gallery_dl/extractor/2chan.py | 16
-rw-r--r--  gallery_dl/extractor/2chen.py | 6
-rw-r--r--  gallery_dl/extractor/35photo.py | 23
-rw-r--r--  gallery_dl/extractor/4archive.py | 17
-rw-r--r--  gallery_dl/extractor/4chan.py | 33
-rw-r--r--  gallery_dl/extractor/4chanarchives.py | 13
-rw-r--r--  gallery_dl/extractor/500px.py | 12
-rw-r--r--  gallery_dl/extractor/8chan.py | 19
-rw-r--r--  gallery_dl/extractor/8muses.py | 18
-rw-r--r--  gallery_dl/extractor/__init__.py | 28
-rw-r--r--  gallery_dl/extractor/adultempire.py | 6
-rw-r--r--  gallery_dl/extractor/agnph.py | 19
-rw-r--r--  gallery_dl/extractor/ao3.py | 19
-rw-r--r--  gallery_dl/extractor/arcalive.py | 40
-rw-r--r--  gallery_dl/extractor/architizer.py | 11
-rw-r--r--  gallery_dl/extractor/artstation.py | 167
-rw-r--r--  gallery_dl/extractor/aryion.py | 24
-rw-r--r--  gallery_dl/extractor/batoto.py | 18
-rw-r--r--  gallery_dl/extractor/bbc.py | 11
-rw-r--r--  gallery_dl/extractor/behance.py | 57
-rw-r--r--  gallery_dl/extractor/bilibili.py | 76
-rw-r--r--  gallery_dl/extractor/blogger.py | 58
-rw-r--r--  gallery_dl/extractor/bluesky.py | 60
-rw-r--r--  gallery_dl/extractor/booru.py | 3
-rw-r--r--  gallery_dl/extractor/boosty.py | 20
-rw-r--r--  gallery_dl/extractor/bunkr.py | 17
-rw-r--r--  gallery_dl/extractor/catbox.py | 2
-rw-r--r--  gallery_dl/extractor/chevereto.py | 4
-rw-r--r--  gallery_dl/extractor/cien.py | 15
-rw-r--r--  gallery_dl/extractor/civitai.py | 387
-rw-r--r--  gallery_dl/extractor/comick.py | 198
-rw-r--r--  gallery_dl/extractor/comicvine.py | 7
-rw-r--r--  gallery_dl/extractor/common.py | 562
-rw-r--r--  gallery_dl/extractor/cyberdrop.py | 8
-rw-r--r--  gallery_dl/extractor/danbooru.py | 53
-rw-r--r--  gallery_dl/extractor/dankefuerslesen.py | 120
-rw-r--r--  gallery_dl/extractor/desktopography.py | 8
-rw-r--r--  gallery_dl/extractor/deviantart.py | 162
-rw-r--r--  gallery_dl/extractor/directlink.py | 4
-rw-r--r--  gallery_dl/extractor/discord.py | 89
-rw-r--r--  gallery_dl/extractor/dynastyscans.py | 66
-rw-r--r--  gallery_dl/extractor/e621.py | 26
-rw-r--r--  gallery_dl/extractor/erome.py | 106
-rw-r--r--  gallery_dl/extractor/everia.py | 16
-rw-r--r--  gallery_dl/extractor/exhentai.py | 208
-rw-r--r--  gallery_dl/extractor/facebook.py | 179
-rw-r--r--  gallery_dl/extractor/fanbox.py | 130
-rw-r--r--  gallery_dl/extractor/fantia.py | 8
-rw-r--r--  gallery_dl/extractor/fapachi.py | 14
-rw-r--r--  gallery_dl/extractor/fapello.py | 19
-rw-r--r--  gallery_dl/extractor/flickr.py | 14
-rw-r--r--  gallery_dl/extractor/foolfuuka.py | 85
-rw-r--r--  gallery_dl/extractor/foolslide.py | 13
-rw-r--r--  gallery_dl/extractor/furaffinity.py | 66
-rw-r--r--  gallery_dl/extractor/furry34.py | 12
-rw-r--r--  gallery_dl/extractor/fuskator.py | 17
-rw-r--r--  gallery_dl/extractor/gelbooru.py | 18
-rw-r--r--  gallery_dl/extractor/gelbooru_v01.py | 32
-rw-r--r--  gallery_dl/extractor/gelbooru_v02.py | 73
-rw-r--r--  gallery_dl/extractor/generic.py | 29
-rw-r--r--  gallery_dl/extractor/girlsreleased.py | 76
-rw-r--r--  gallery_dl/extractor/girlswithmuscle.py | 177
-rw-r--r--  gallery_dl/extractor/gofile.py | 11
-rw-r--r--  gallery_dl/extractor/hatenablog.py | 21
-rw-r--r--  gallery_dl/extractor/hentai2read.py | 8
-rw-r--r--  gallery_dl/extractor/hentaicosplays.py | 4
-rw-r--r--  gallery_dl/extractor/hentaifoundry.py | 42
-rw-r--r--  gallery_dl/extractor/hentaihand.py | 16
-rw-r--r--  gallery_dl/extractor/hentaihere.py | 23
-rw-r--r--  gallery_dl/extractor/hentainexus.py | 26
-rw-r--r--  gallery_dl/extractor/hiperdex.py | 21
-rw-r--r--  gallery_dl/extractor/hitomi.py | 157
-rw-r--r--  gallery_dl/extractor/hotleak.py | 11
-rw-r--r--  gallery_dl/extractor/idolcomplex.py | 9
-rw-r--r--  gallery_dl/extractor/imagebam.py | 18
-rw-r--r--  gallery_dl/extractor/imagechest.py | 13
-rw-r--r--  gallery_dl/extractor/imagefap.py | 40
-rw-r--r--  gallery_dl/extractor/imagehosts.py | 38
-rw-r--r--  gallery_dl/extractor/imgbb.py | 16
-rw-r--r--  gallery_dl/extractor/imgbox.py | 29
-rw-r--r--  gallery_dl/extractor/imgth.py | 11
-rw-r--r--  gallery_dl/extractor/imgur.py | 32
-rw-r--r--  gallery_dl/extractor/imhentai.py | 4
-rw-r--r--  gallery_dl/extractor/inkbunny.py | 38
-rw-r--r--  gallery_dl/extractor/instagram.py | 147
-rw-r--r--  gallery_dl/extractor/issuu.py | 15
-rw-r--r--  gallery_dl/extractor/itaku.py | 299
-rw-r--r--  gallery_dl/extractor/itchio.py | 12
-rw-r--r--  gallery_dl/extractor/iwara.py | 440
-rw-r--r--  gallery_dl/extractor/jschan.py | 28
-rw-r--r--  gallery_dl/extractor/kabeuchi.py | 20
-rw-r--r--  gallery_dl/extractor/keenspot.py | 33
-rw-r--r--  gallery_dl/extractor/kemono.py (renamed from gallery_dl/extractor/kemonoparty.py) | 309
-rw-r--r--  gallery_dl/extractor/khinsider.py | 4
-rw-r--r--  gallery_dl/extractor/komikcast.py | 37
-rw-r--r--  gallery_dl/extractor/leakgallery.py | 141
-rw-r--r--  gallery_dl/extractor/lensdump.py | 8
-rw-r--r--  gallery_dl/extractor/lexica.py | 10
-rw-r--r--  gallery_dl/extractor/lightroom.py | 2
-rw-r--r--  gallery_dl/extractor/livedoor.py | 14
-rw-r--r--  gallery_dl/extractor/lofter.py | 2
-rw-r--r--  gallery_dl/extractor/lolisafe.py | 8
-rw-r--r--  gallery_dl/extractor/luscious.py | 14
-rw-r--r--  gallery_dl/extractor/lynxchan.py | 25
-rw-r--r--  gallery_dl/extractor/madokami.py | 93
-rw-r--r--  gallery_dl/extractor/mangadex.py | 48
-rw-r--r--  gallery_dl/extractor/mangafox.py | 4
-rw-r--r--  gallery_dl/extractor/mangahere.py | 18
-rw-r--r--  gallery_dl/extractor/manganelo.py | 4
-rw-r--r--  gallery_dl/extractor/mangapark.py | 35
-rw-r--r--  gallery_dl/extractor/mangaread.py | 20
-rw-r--r--  gallery_dl/extractor/mangasee.py | 117
-rw-r--r--  gallery_dl/extractor/mangoxo.py | 16
-rw-r--r--  gallery_dl/extractor/mastodon.py | 27
-rw-r--r--  gallery_dl/extractor/misskey.py | 107
-rw-r--r--  gallery_dl/extractor/moebooru.py | 53
-rw-r--r--  gallery_dl/extractor/motherless.py | 140
-rw-r--r--  gallery_dl/extractor/myhentaigallery.py | 6
-rw-r--r--  gallery_dl/extractor/myportfolio.py | 8
-rw-r--r--  gallery_dl/extractor/naverblog.py (renamed from gallery_dl/extractor/naver.py) | 41
-rw-r--r--  gallery_dl/extractor/naverchzzk.py | 81
-rw-r--r--  gallery_dl/extractor/naverwebtoon.py | 24
-rw-r--r--  gallery_dl/extractor/nekohouse.py | 5
-rw-r--r--  gallery_dl/extractor/newgrounds.py | 71
-rw-r--r--  gallery_dl/extractor/nhentai.py | 7
-rw-r--r--  gallery_dl/extractor/nijie.py | 47
-rw-r--r--  gallery_dl/extractor/nitter.py | 35
-rw-r--r--  gallery_dl/extractor/nozomi.py | 25
-rw-r--r--  gallery_dl/extractor/nsfwalbum.py | 16
-rw-r--r--  gallery_dl/extractor/nudostar.py | 71
-rw-r--r--  gallery_dl/extractor/oauth.py | 65
-rw-r--r--  gallery_dl/extractor/paheal.py | 28
-rw-r--r--  gallery_dl/extractor/patreon.py | 206
-rw-r--r--  gallery_dl/extractor/pexels.py | 8
-rw-r--r--  gallery_dl/extractor/philomena.py | 36
-rw-r--r--  gallery_dl/extractor/photovogue.py | 4
-rw-r--r--  gallery_dl/extractor/picarto.py | 6
-rw-r--r--  gallery_dl/extractor/pictoa.py | 4
-rw-r--r--  gallery_dl/extractor/piczel.py | 10
-rw-r--r--  gallery_dl/extractor/pillowfort.py | 19
-rw-r--r--  gallery_dl/extractor/pinterest.py | 63
-rw-r--r--  gallery_dl/extractor/pixeldrain.py | 33
-rw-r--r--  gallery_dl/extractor/pixiv.py | 267
-rw-r--r--  gallery_dl/extractor/pixnet.py | 18
-rw-r--r--  gallery_dl/extractor/plurk.py | 22
-rw-r--r--  gallery_dl/extractor/poipiku.py | 6
-rw-r--r--  gallery_dl/extractor/poringa.py | 4
-rw-r--r--  gallery_dl/extractor/pornhub.py | 40
-rw-r--r--  gallery_dl/extractor/pornpics.py | 10
-rw-r--r--  gallery_dl/extractor/postmill.py | 19
-rw-r--r--  gallery_dl/extractor/rawkuma.py | 83
-rw-r--r--  gallery_dl/extractor/reactor.py | 12
-rw-r--r--  gallery_dl/extractor/readcomiconline.py | 45
-rw-r--r--  gallery_dl/extractor/realbooru.py | 11
-rw-r--r--  gallery_dl/extractor/recursive.py | 9
-rw-r--r--  gallery_dl/extractor/redbust.py | 186
-rw-r--r--  gallery_dl/extractor/reddit.py | 44
-rw-r--r--  gallery_dl/extractor/redgifs.py | 39
-rw-r--r--  gallery_dl/extractor/rule34us.py | 16
-rw-r--r--  gallery_dl/extractor/rule34vault.py | 12
-rw-r--r--  gallery_dl/extractor/rule34xyz.py | 40
-rw-r--r--  gallery_dl/extractor/saint.py | 2
-rw-r--r--  gallery_dl/extractor/sankaku.py | 45
-rw-r--r--  gallery_dl/extractor/sankakucomplex.py | 24
-rw-r--r--  gallery_dl/extractor/schalenetwork.py (renamed from gallery_dl/extractor/koharu.py) | 51
-rw-r--r--  gallery_dl/extractor/scrolller.py | 6
-rw-r--r--  gallery_dl/extractor/seiga.py | 16
-rw-r--r--  gallery_dl/extractor/senmanga.py | 2
-rw-r--r--  gallery_dl/extractor/sexcom.py | 138
-rw-r--r--  gallery_dl/extractor/shimmie2.py | 27
-rw-r--r--  gallery_dl/extractor/shopify.py | 16
-rw-r--r--  gallery_dl/extractor/simplyhentai.py | 17
-rw-r--r--  gallery_dl/extractor/skeb.py | 32
-rw-r--r--  gallery_dl/extractor/slickpic.py | 12
-rw-r--r--  gallery_dl/extractor/slideshare.py | 12
-rw-r--r--  gallery_dl/extractor/smugmug.py | 18
-rw-r--r--  gallery_dl/extractor/soundgasm.py | 6
-rw-r--r--  gallery_dl/extractor/speakerdeck.py | 22
-rw-r--r--  gallery_dl/extractor/steamgriddb.py | 24
-rw-r--r--  gallery_dl/extractor/subscribestar.py | 32
-rw-r--r--  gallery_dl/extractor/szurubooru.py | 17
-rw-r--r--  gallery_dl/extractor/tapas.py | 19
-rw-r--r--  gallery_dl/extractor/tcbscans.py | 4
-rw-r--r--  gallery_dl/extractor/telegraph.py | 6
-rw-r--r--  gallery_dl/extractor/tenor.py | 29
-rw-r--r--  gallery_dl/extractor/tiktok.py | 29
-rw-r--r--  gallery_dl/extractor/tmohentai.py | 9
-rw-r--r--  gallery_dl/extractor/toyhouse.py | 8
-rw-r--r--  gallery_dl/extractor/tsumino.py | 49
-rw-r--r--  gallery_dl/extractor/tumblr.py | 68
-rw-r--r--  gallery_dl/extractor/tumblrgallery.py | 23
-rw-r--r--  gallery_dl/extractor/twibooru.py | 18
-rw-r--r--  gallery_dl/extractor/twitter.py | 245
-rw-r--r--  gallery_dl/extractor/unsplash.py | 23
-rw-r--r--  gallery_dl/extractor/uploadir.py | 6
-rw-r--r--  gallery_dl/extractor/urlgalleries.py | 7
-rw-r--r--  gallery_dl/extractor/urlshortener.py | 2
-rw-r--r--  gallery_dl/extractor/vanillarock.py | 4
-rw-r--r--  gallery_dl/extractor/vichan.py | 57
-rw-r--r--  gallery_dl/extractor/vipergirls.py | 20
-rw-r--r--  gallery_dl/extractor/vk.py | 82
-rw-r--r--  gallery_dl/extractor/vsco.py | 58
-rw-r--r--  gallery_dl/extractor/wallhaven.py | 39
-rw-r--r--  gallery_dl/extractor/wallpapercave.py | 9
-rw-r--r--  gallery_dl/extractor/warosu.py | 19
-rw-r--r--  gallery_dl/extractor/weasyl.py | 27
-rw-r--r--  gallery_dl/extractor/webmshare.py | 6
-rw-r--r--  gallery_dl/extractor/webtoons.py | 116
-rw-r--r--  gallery_dl/extractor/weebcentral.py | 9
-rw-r--r--  gallery_dl/extractor/weibo.py | 65
-rw-r--r--  gallery_dl/extractor/wikiart.py | 37
-rw-r--r--  gallery_dl/extractor/wikifeet.py | 13
-rw-r--r--  gallery_dl/extractor/wikimedia.py | 29
-rw-r--r--  gallery_dl/extractor/xfolio.py | 20
-rw-r--r--  gallery_dl/extractor/xhamster.py | 6
-rw-r--r--  gallery_dl/extractor/xvideos.py | 15
-rw-r--r--  gallery_dl/extractor/yiffverse.py | 12
-rw-r--r--  gallery_dl/extractor/ytdl.py | 19
-rw-r--r--  gallery_dl/extractor/zerochan.py | 16
-rw-r--r--  gallery_dl/extractor/zzup.py | 15
-rw-r--r--  gallery_dl/formatter.py | 128
-rw-r--r--  gallery_dl/job.py | 214
-rw-r--r--  gallery_dl/option.py | 71
-rw-r--r--  gallery_dl/output.py | 29
-rw-r--r--  gallery_dl/path.py | 51
-rw-r--r--  gallery_dl/postprocessor/__init__.py | 2
-rw-r--r--  gallery_dl/postprocessor/common.py | 3
-rw-r--r--  gallery_dl/postprocessor/compare.py | 11
-rw-r--r--  gallery_dl/postprocessor/exec.py | 76
-rw-r--r--  gallery_dl/postprocessor/metadata.py | 18
-rw-r--r--  gallery_dl/postprocessor/mtime.py | 5
-rw-r--r--  gallery_dl/postprocessor/ugoira.py | 53
-rw-r--r--  gallery_dl/text.py | 84
-rw-r--r--  gallery_dl/transaction_id.py | 3
-rw-r--r--  gallery_dl/update.py | 16
-rw-r--r--  gallery_dl/util.py | 260
-rw-r--r--  gallery_dl/version.py | 4
-rw-r--r--  gallery_dl/ytdl.py | 50
-rw-r--r--  setup.py | 18
-rw-r--r--  test/test_config.py | 5
-rw-r--r--  test/test_cookies.py | 25
-rw-r--r--  test/test_downloader.py | 8
-rw-r--r--  test/test_extractor.py | 59
-rw-r--r--  test/test_formatter.py | 159
-rw-r--r--  test/test_job.py | 11
-rw-r--r--  test/test_postprocessor.py | 115
-rw-r--r--  test/test_results.py | 176
-rw-r--r--  test/test_text.py | 76
-rw-r--r--  test/test_util.py | 234
-rw-r--r--  test/test_ytdl.py | 11
273 files changed, 9131 insertions, 5207 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index c7e75a8..159ff0d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,40 +1,11 @@
-## 1.29.7 - 2025-05-23
+## 1.30.2 - 2025-07-27
### Extractors
#### Additions
-- [mangadex] add `following` extractor ([#7487](https://github.com/mikf/gallery-dl/issues/7487))
-- [pixeldrain] add support for filesystem URLs ([#7473](https://github.com/mikf/gallery-dl/issues/7473))
+- [itaku] add `posts` & `bookmarks` extractors ([#7707](https://github.com/mikf/gallery-dl/issues/7707))
#### Fixes
-- [bluesky] handle posts without `record` data ([#7499](https://github.com/mikf/gallery-dl/issues/7499))
-- [civitai] fix & improve video downloads ([#7502](https://github.com/mikf/gallery-dl/issues/7502))
-- [civitai] fix exception for images without `modelVersionId` ([#7432](https://github.com/mikf/gallery-dl/issues/7432))
-- [civitai] make metadata extraction non-fatal ([#7562](https://github.com/mikf/gallery-dl/issues/7562))
-- [fanbox] use `"browser": "firefox"` by default ([#7490](https://github.com/mikf/gallery-dl/issues/7490))
-- [idolcomplex] fix pagination logic ([#7549](https://github.com/mikf/gallery-dl/issues/7549))
-- [idolcomplex] fix 429 error during login by adding a 10s delay
-- [instagram:stories] fix `post_date` metadata ([#7521](https://github.com/mikf/gallery-dl/issues/7521))
-- [motherless] fix video gallery downloads ([#7530](https://github.com/mikf/gallery-dl/issues/7530))
-- [pinterest] handle `story_pin_product_sticker_block` blocks ([#7563](https://github.com/mikf/gallery-dl/issues/7563))
-- [subscribestar] fix `content` and `title` metadata ([#7486](https://github.com/mikf/gallery-dl/issues/7486) [#7526](https://github.com/mikf/gallery-dl/issues/7526))
-#### Improvements
-- [arcalive] allow overriding default `User-Agent` header ([#7556](https://github.com/mikf/gallery-dl/issues/7556))
-- [fanbox] update API headers ([#7490](https://github.com/mikf/gallery-dl/issues/7490))
-- [flickr] add `info` option ([#4720](https://github.com/mikf/gallery-dl/issues/4720) [#6817](https://github.com/mikf/gallery-dl/issues/6817))
-- [flickr] add `profile` option
-- [instagram:stories] add `split` option ([#7521](https://github.com/mikf/gallery-dl/issues/7521))
-- [mangadex] implement login with client credentials
-- [mangadex] send `Authorization` header only when necessary
-- [mastodon] support Akkoma/Pleroma `/notice/:ID` URLs ([#7496](https://github.com/mikf/gallery-dl/issues/7496))
-- [mastodon] support Akkoma/Pleroma `/objects/:UUID` URLs ([#7497](https://github.com/mikf/gallery-dl/issues/7497))
-- [pixiv] Implement sanity handling for ugoira works ([#4327](https://github.com/mikf/gallery-dl/issues/4327) [#6297](https://github.com/mikf/gallery-dl/issues/6297) [#7285](https://github.com/mikf/gallery-dl/issues/7285) [#7434](https://github.com/mikf/gallery-dl/issues/7434))
-- [twitter:ctid] reduce chance of generating the same ID
-#### Metadata
-- [civitai] provide proper `extension` for model files ([#7432](https://github.com/mikf/gallery-dl/issues/7432))
-- [flickr] provide `license_name` metadata
-- [sankaku] support new `tags` categories ([#7333](https://github.com/mikf/gallery-dl/issues/7333) [#7553](https://github.com/mikf/gallery-dl/issues/7553))
-- [vipergirls] provide `num` and `count` metadata ([#7479](https://github.com/mikf/gallery-dl/issues/7479))
-- [vipergirls] extract more metadata & rename fields ([#7479](https://github.com/mikf/gallery-dl/issues/7479))
-### Downloaders
-- [http] fix setting `mtime` per file ([#7529](https://github.com/mikf/gallery-dl/issues/7529))
-- [ytdl] improve temp/part file handling ([#6949](https://github.com/mikf/gallery-dl/issues/6949) [#7494](https://github.com/mikf/gallery-dl/issues/7494))
-### Cookies
-- support Zen browser ([#7233](https://github.com/mikf/gallery-dl/issues/7233) [#7546](https://github.com/mikf/gallery-dl/issues/7546))
+- [kemono] support new `kemono.cr` domain ([#7902](https://github.com/mikf/gallery-dl/issues/7902) [#7909](https://github.com/mikf/gallery-dl/issues/7909) [#7911](https://github.com/mikf/gallery-dl/issues/7911) [#7913](https://github.com/mikf/gallery-dl/issues/7913) [#7904](https://github.com/mikf/gallery-dl/issues/7904))
+- [coomer] support new `coomer.st` domain ([#7907](https://github.com/mikf/gallery-dl/issues/7907) [#7909](https://github.com/mikf/gallery-dl/issues/7909) [#7911](https://github.com/mikf/gallery-dl/issues/7911) [#7904](https://github.com/mikf/gallery-dl/issues/7904))
+### Post Processors
+- [exec] use `False` as `start_new_session` default to avoid a `TypeError` ([#7899](https://github.com/mikf/gallery-dl/issues/7899))
+### Miscellaneous
+- [tests/postprocessor] fix `TypeError` when logging an error ([#6582](https://github.com/mikf/gallery-dl/issues/6582))
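
For context, a minimal sketch of an `exec` post processor configuration that this `start_new_session` default applies to; the `command` and its `{_path}.done` target are hypothetical examples:

```json
{
    "extractor": {
        "postprocessors": [
            {"name": "exec", "event": "after", "command": ["touch", "{_path}.done"]}
        ]
    }
}
```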
diff --git a/PKG-INFO b/PKG-INFO
index c022f84..550241f 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.4
Name: gallery_dl
-Version: 1.29.7
+Version: 1.30.2
Summary: Command-line program to download image galleries and collections from several image hosting sites
Home-page: https://github.com/mikf/gallery-dl
Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -18,10 +18,6 @@ Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.4
-Classifier: Programming Language :: Python :: 3.5
-Classifier: Programming Language :: Python :: 3.6
-Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
@@ -33,11 +29,18 @@ Classifier: Programming Language :: Python :: Implementation :: PyPy
Classifier: Topic :: Internet :: WWW/HTTP
Classifier: Topic :: Multimedia :: Graphics
Classifier: Topic :: Utilities
-Requires-Python: >=3.4
+Requires-Python: >=3.8
License-File: LICENSE
Requires-Dist: requests>=2.11.0
Provides-Extra: video
-Requires-Dist: youtube-dl; extra == "video"
+Requires-Dist: yt-dlp; extra == "video"
+Provides-Extra: extra
+Requires-Dist: requests[socks]; extra == "extra"
+Requires-Dist: yt-dlp[default]; extra == "extra"
+Requires-Dist: pyyaml; extra == "extra"
+Requires-Dist: toml; python_version < "3.11" and extra == "extra"
+Requires-Dist: truststore; python_version >= "3.10" and extra == "extra"
+Requires-Dist: secretstorage; sys_platform == "linux" and extra == "extra"
Dynamic: author
Dynamic: author-email
Dynamic: classifier
@@ -75,7 +78,7 @@ and powerful `filenaming capabilities <https://gdl-org.github.io/docs/formatting
Dependencies
============
-- Python_ 3.4+
+- Python_ 3.8+
- Requests_
Optional
@@ -91,6 +94,8 @@ Optional
- toml_: TOML configuration file support for Python<3.11
- SecretStorage_: GNOME keyring passwords for ``--cookies-from-browser``
- Psycopg_: PostgreSQL archive support
+- truststore_: Native system certificate support
+- Jinja_: Jinja template support
Installation
@@ -133,9 +138,9 @@ Standalone Executable
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.29.7/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.30.2/gallery-dl.exe>`__
(Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.29.7/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.30.2/gallery-dl.bin>`__
Nightly Builds
@@ -517,7 +522,7 @@ To authenticate with a ``mastodon`` instance, run *gallery-dl* with
.. _Python: https://www.python.org/downloads/
.. _PyPI: https://pypi.org/
.. _pip: https://pip.pypa.io/en/stable/
-.. _Requests: https://requests.readthedocs.io/en/master/
+.. _Requests: https://requests.readthedocs.io/en/latest/
.. _FFmpeg: https://www.ffmpeg.org/
.. _mkvmerge: https://www.matroska.org/downloads/mkvtoolnix.html
.. _yt-dlp: https://github.com/yt-dlp/yt-dlp
@@ -530,10 +535,12 @@ To authenticate with a ``mastodon`` instance, run *gallery-dl* with
.. _toml: https://pypi.org/project/toml/
.. _SecretStorage: https://pypi.org/project/SecretStorage/
.. _Psycopg: https://www.psycopg.org/
+.. _truststore: https://truststore.readthedocs.io/en/latest/
+.. _Jinja: https://jinja.palletsprojects.com/
.. _Snapd: https://docs.snapcraft.io/installing-snapd
.. _OAuth: https://en.wikipedia.org/wiki/OAuth
.. _Chocolatey: https://chocolatey.org/install
-.. _Scoop: https://scoop.sh
+.. _Scoop: https://scoop.sh/
.. |pypi| image:: https://img.shields.io/pypi/v/gallery-dl.svg
:target: https://pypi.org/project/gallery-dl/
diff --git a/README.rst b/README.rst
index 1fbdce5..3ca61b2 100644
--- a/README.rst
+++ b/README.rst
@@ -19,7 +19,7 @@ and powerful `filenaming capabilities <https://gdl-org.github.io/docs/formatting
Dependencies
============
-- Python_ 3.4+
+- Python_ 3.8+
- Requests_
Optional
@@ -35,6 +35,8 @@ Optional
- toml_: TOML configuration file support for Python<3.11
- SecretStorage_: GNOME keyring passwords for ``--cookies-from-browser``
- Psycopg_: PostgreSQL archive support
+- truststore_: Native system certificate support
+- Jinja_: Jinja template support
Installation
@@ -77,9 +79,9 @@ Standalone Executable
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.29.7/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.30.2/gallery-dl.exe>`__
(Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.29.7/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.30.2/gallery-dl.bin>`__
Nightly Builds
@@ -461,7 +463,7 @@ To authenticate with a ``mastodon`` instance, run *gallery-dl* with
.. _Python: https://www.python.org/downloads/
.. _PyPI: https://pypi.org/
.. _pip: https://pip.pypa.io/en/stable/
-.. _Requests: https://requests.readthedocs.io/en/master/
+.. _Requests: https://requests.readthedocs.io/en/latest/
.. _FFmpeg: https://www.ffmpeg.org/
.. _mkvmerge: https://www.matroska.org/downloads/mkvtoolnix.html
.. _yt-dlp: https://github.com/yt-dlp/yt-dlp
@@ -474,10 +476,12 @@ To authenticate with a ``mastodon`` instance, run *gallery-dl* with
.. _toml: https://pypi.org/project/toml/
.. _SecretStorage: https://pypi.org/project/SecretStorage/
.. _Psycopg: https://www.psycopg.org/
+.. _truststore: https://truststore.readthedocs.io/en/latest/
+.. _Jinja: https://jinja.palletsprojects.com/
.. _Snapd: https://docs.snapcraft.io/installing-snapd
.. _OAuth: https://en.wikipedia.org/wiki/OAuth
.. _Chocolatey: https://chocolatey.org/install
-.. _Scoop: https://scoop.sh
+.. _Scoop: https://scoop.sh/
.. |pypi| image:: https://img.shields.io/pypi/v/gallery-dl.svg
:target: https://pypi.org/project/gallery-dl/
diff --git a/data/completion/_gallery-dl b/data/completion/_gallery-dl
index 99fb8ad..f0d654e 100644
--- a/data/completion/_gallery-dl
+++ b/data/completion/_gallery-dl
@@ -13,6 +13,7 @@ _arguments -s -S \
{-X,--extractors}'[Load external extractors from PATH]':'<path>' \
--user-agent'[User-Agent request header]':'<ua>' \
--clear-cache'[Delete cached login sessions, cookies, etc. for MODULE (ALL to delete everything)]':'<module>' \
+--compat'[Restore legacy '\''category'\'' names]' \
{-U,--update-check}'[Check if a newer version is available]' \
{-i,--input-file}'[Download URLs found in FILE ('\''-'\'' for stdin). More than one --input-file can be specified]':'<file>':_files \
{-I,--input-file-comment}'[Download URLs found in FILE. Comment them out after they were downloaded successfully.]':'<file>':_files \
@@ -29,8 +30,10 @@ _arguments -s -S \
{-E,--extractor-info}'[Print extractor defaults and settings]' \
{-K,--list-keywords}'[Print a list of available keywords and example values for the given URLs]' \
{-e,--error-file}'[Add input URLs which returned an error to FILE]':'<file>':_files \
-{-N,--print}'[Write FORMAT during EVENT (default '\''prepare'\'') to standard output. Examples: '\''id'\'' or '\''post:{md5\[:8\]}'\'']':'<[event:]format>' \
---print-to-file'[Append FORMAT during EVENT to FILE]':'<[event:]format file>' \
+{-N,--print}'[Write FORMAT during EVENT (default '\''prepare'\'') to standard output instead of downloading files. Can be used multiple times. Examples: '\''id'\'' or '\''post:{md5\[:8\]}'\'']':'<[event:]format>' \
+--Print'[Like --print, but downloads files as well]':'<[event:]format>' \
+--print-to-file'[Append FORMAT during EVENT to FILE instead of downloading files. Can be used multiple times]':'<[event:]format file>' \
+--Print-to-file'[Like --print-to-file, but downloads files as well]':'<[event:]format file>' \
--list-modules'[Print a list of available extractor modules]' \
--list-extractors'[Print a list of extractor classes with description, (sub)category and example URL]':'<[categories]>' \
--write-log'[Write logging output to FILE]':'<file>':_files \
@@ -45,10 +48,11 @@ _arguments -s -S \
{-4,--force-ipv4}'[Make all connections via IPv4]' \
{-6,--force-ipv6}'[Make all connections via IPv6]' \
--no-check-certificate'[Disable HTTPS certificate validation]' \
-{-r,--limit-rate}'[Maximum download rate (e.g. 500k or 2.5M)]':'<rate>' \
+{-r,--limit-rate}'[Maximum download rate (e.g. 500k, 2.5M, or 800k-2M)]':'<rate>' \
--chunk-size'[Size of in-memory data chunks (default: 32k)]':'<size>' \
--sleep'[Number of seconds to wait before each download. This can be either a constant value or a range (e.g. 2.7 or 2.0-3.5)]':'<seconds>' \
--sleep-request'[Number of seconds to wait between HTTP requests during data extraction]':'<seconds>' \
+--sleep-429'[Number of seconds to wait when receiving a '\''429 Too Many Requests'\'' response]':'<seconds>' \
--sleep-extractor'[Number of seconds to wait before starting data extraction for an input URL]':'<seconds>' \
--no-part'[Do not use .part files]' \
--no-skip'[Do not skip downloads; overwrite existing files]' \
@@ -72,7 +76,7 @@ _arguments -s -S \
{-T,--terminate}'[Stop current and parent extractor runs after N consecutive file downloads were skipped]':'<n>' \
--filesize-min'[Do not download files smaller than SIZE (e.g. 500k or 2.5M)]':'<size>' \
--filesize-max'[Do not download files larger than SIZE (e.g. 500k or 2.5M)]':'<size>' \
---download-archive'[Record all downloaded or skipped files in FILE and skip downloading any file already in it]':'<file>':_files \
+--download-archive'[Record successfully downloaded files in FILE and skip downloading any file already in it]':'<file>':_files \
--range'[Index range(s) specifying which files to download. These can be either a constant value, range, or slice (e.g. '\''5'\'', '\''8-20'\'', or '\''1:24:3'\'')]':'<range>' \
--chapter-range'[Like '\''--range'\'', but applies to manga chapters and other delegated URLs]':'<range>' \
--filter'[Python expression controlling which files to download. Files for which the expression evaluates to False are ignored. Available keys are the filename-specific ones listed by '\''-K'\''. Example: --filter "image_width >= 1000 and rating in ('\''s'\'', '\''q'\'')"]':'<expr>' \
diff --git a/data/completion/gallery-dl b/data/completion/gallery-dl
index 161113c..ae4cb0f 100644
--- a/data/completion/gallery-dl
+++ b/data/completion/gallery-dl
@@ -10,7 +10,7 @@ _gallery_dl()
elif [[ "${prev}" =~ ^()$ ]]; then
COMPREPLY=( $(compgen -d -- "${cur}") )
else
- COMPREPLY=( $(compgen -W "--help --version --filename --destination --directory --extractors --user-agent --clear-cache --update-check --input-file --input-file-comment --input-file-delete --no-input --quiet --warning --verbose --get-urls --resolve-urls --dump-json --resolve-json --simulate --extractor-info --list-keywords --error-file --print --print-to-file --list-modules --list-extractors --write-log --write-unsupported --write-pages --print-traffic --no-colors --retries --http-timeout --proxy --source-address --force-ipv4 --force-ipv6 --no-check-certificate --limit-rate --chunk-size --sleep --sleep-request --sleep-extractor --no-part --no-skip --no-mtime --no-download --option --config --config-yaml --config-toml --config-create --config-status --config-open --config-ignore --ignore-config --username --password --netrc --cookies --cookies-export --cookies-from-browser --abort --terminate --filesize-min --filesize-max --download-archive --range --chapter-range --filter --chapter-filter --postprocessor --no-postprocessors --postprocessor-option --write-metadata --write-info-json --write-infojson --write-tags --zip --cbz --mtime --mtime-from-date --rename --rename-to --ugoira --ugoira-conv --ugoira-conv-lossless --ugoira-conv-copy --exec --exec-after" -- "${cur}") )
+ COMPREPLY=( $(compgen -W "--help --version --filename --destination --directory --extractors --user-agent --clear-cache --compat --update-check --input-file --input-file-comment --input-file-delete --no-input --quiet --warning --verbose --get-urls --resolve-urls --dump-json --resolve-json --simulate --extractor-info --list-keywords --error-file --print --Print --print-to-file --Print-to-file --list-modules --list-extractors --write-log --write-unsupported --write-pages --print-traffic --no-colors --retries --http-timeout --proxy --source-address --force-ipv4 --force-ipv6 --no-check-certificate --limit-rate --chunk-size --sleep --sleep-request --sleep-429 --sleep-extractor --no-part --no-skip --no-mtime --no-download --option --config --config-yaml --config-toml --config-create --config-status --config-open --config-ignore --ignore-config --username --password --netrc --cookies --cookies-export --cookies-from-browser --abort --terminate --filesize-min --filesize-max --download-archive --range --chapter-range --filter --chapter-filter --postprocessor --no-postprocessors --postprocessor-option --write-metadata --write-info-json --write-infojson --write-tags --zip --cbz --mtime --mtime-from-date --rename --rename-to --ugoira --ugoira-conv --ugoira-conv-lossless --ugoira-conv-copy --exec --exec-after" -- "${cur}") )
fi
}
diff --git a/data/completion/gallery-dl.fish b/data/completion/gallery-dl.fish
index f8bb723..8eb427a 100644
--- a/data/completion/gallery-dl.fish
+++ b/data/completion/gallery-dl.fish
@@ -7,6 +7,7 @@ complete -c gallery-dl -x -a '(__fish_complete_directories)' -s 'D' -l 'director
complete -c gallery-dl -x -a '(__fish_complete_directories)' -s 'X' -l 'extractors' -d 'Load external extractors from PATH'
complete -c gallery-dl -x -l 'user-agent' -d 'User-Agent request header'
complete -c gallery-dl -x -l 'clear-cache' -d 'Delete cached login sessions, cookies, etc. for MODULE (ALL to delete everything)'
+complete -c gallery-dl -l 'compat' -d 'Restore legacy "category" names'
complete -c gallery-dl -s 'U' -l 'update-check' -d 'Check if a newer version is available'
complete -c gallery-dl -r -F -s 'i' -l 'input-file' -d 'Download URLs found in FILE ("-" for stdin). More than one --input-file can be specified'
complete -c gallery-dl -r -F -s 'I' -l 'input-file-comment' -d 'Download URLs found in FILE. Comment them out after they were downloaded successfully.'
@@ -23,8 +24,10 @@ complete -c gallery-dl -s 's' -l 'simulate' -d 'Simulate data extraction; do not
complete -c gallery-dl -s 'E' -l 'extractor-info' -d 'Print extractor defaults and settings'
complete -c gallery-dl -s 'K' -l 'list-keywords' -d 'Print a list of available keywords and example values for the given URLs'
complete -c gallery-dl -r -F -s 'e' -l 'error-file' -d 'Add input URLs which returned an error to FILE'
-complete -c gallery-dl -x -s 'N' -l 'print' -d 'Write FORMAT during EVENT (default "prepare") to standard output. Examples: "id" or "post:{md5[:8]}"'
-complete -c gallery-dl -x -l 'print-to-file' -d 'Append FORMAT during EVENT to FILE'
+complete -c gallery-dl -x -s 'N' -l 'print' -d 'Write FORMAT during EVENT (default "prepare") to standard output instead of downloading files. Can be used multiple times. Examples: "id" or "post:{md5[:8]}"'
+complete -c gallery-dl -x -l 'Print' -d 'Like --print, but downloads files as well'
+complete -c gallery-dl -x -l 'print-to-file' -d 'Append FORMAT during EVENT to FILE instead of downloading files. Can be used multiple times'
+complete -c gallery-dl -x -l 'Print-to-file' -d 'Like --print-to-file, but downloads files as well'
complete -c gallery-dl -l 'list-modules' -d 'Print a list of available extractor modules'
complete -c gallery-dl -x -l 'list-extractors' -d 'Print a list of extractor classes with description, (sub)category and example URL'
complete -c gallery-dl -r -F -l 'write-log' -d 'Write logging output to FILE'
@@ -39,10 +42,11 @@ complete -c gallery-dl -x -l 'source-address' -d 'Client-side IP address to bind
complete -c gallery-dl -s '4' -l 'force-ipv4' -d 'Make all connections via IPv4'
complete -c gallery-dl -s '6' -l 'force-ipv6' -d 'Make all connections via IPv6'
complete -c gallery-dl -l 'no-check-certificate' -d 'Disable HTTPS certificate validation'
-complete -c gallery-dl -x -s 'r' -l 'limit-rate' -d 'Maximum download rate (e.g. 500k or 2.5M)'
+complete -c gallery-dl -x -s 'r' -l 'limit-rate' -d 'Maximum download rate (e.g. 500k, 2.5M, or 800k-2M)'
complete -c gallery-dl -x -l 'chunk-size' -d 'Size of in-memory data chunks (default: 32k)'
complete -c gallery-dl -x -l 'sleep' -d 'Number of seconds to wait before each download. This can be either a constant value or a range (e.g. 2.7 or 2.0-3.5)'
complete -c gallery-dl -x -l 'sleep-request' -d 'Number of seconds to wait between HTTP requests during data extraction'
+complete -c gallery-dl -x -l 'sleep-429' -d 'Number of seconds to wait when receiving a "429 Too Many Requests" response'
complete -c gallery-dl -x -l 'sleep-extractor' -d 'Number of seconds to wait before starting data extraction for an input URL'
complete -c gallery-dl -l 'no-part' -d 'Do not use .part files'
complete -c gallery-dl -l 'no-skip' -d 'Do not skip downloads; overwrite existing files'
@@ -67,7 +71,7 @@ complete -c gallery-dl -x -s 'A' -l 'abort' -d 'Stop current extractor run after
complete -c gallery-dl -x -s 'T' -l 'terminate' -d 'Stop current and parent extractor runs after N consecutive file downloads were skipped'
complete -c gallery-dl -x -l 'filesize-min' -d 'Do not download files smaller than SIZE (e.g. 500k or 2.5M)'
complete -c gallery-dl -x -l 'filesize-max' -d 'Do not download files larger than SIZE (e.g. 500k or 2.5M)'
-complete -c gallery-dl -r -F -l 'download-archive' -d 'Record all downloaded or skipped files in FILE and skip downloading any file already in it'
+complete -c gallery-dl -r -F -l 'download-archive' -d 'Record successfully downloaded files in FILE and skip downloading any file already in it'
complete -c gallery-dl -x -l 'range' -d 'Index range(s) specifying which files to download. These can be either a constant value, range, or slice (e.g. "5", "8-20", or "1:24:3")'
complete -c gallery-dl -x -l 'chapter-range' -d 'Like "--range", but applies to manga chapters and other delegated URLs'
complete -c gallery-dl -x -l 'filter' -d 'Python expression controlling which files to download. Files for which the expression evaluates to False are ignored. Available keys are the filename-specific ones listed by "-K". Example: --filter "image_width >= 1000 and rating in ("s", "q")"'
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index 77403b1..4979279 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2025-05-23" "1.29.7" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2025-07-27" "1.30.2" "gallery-dl Manual"
.\" disable hyphenation
.nh
@@ -41,6 +41,9 @@ User-Agent request header
.B "\-\-clear\-cache" \f[I]MODULE\f[]
Delete cached login sessions, cookies, etc. for MODULE (ALL to delete everything)
.TP
+.B "\-\-compat"
+Restore legacy 'category' names
+.TP
.B "\-U, \-\-update\-check"
Check if a newer version is available
.TP
@@ -90,10 +93,16 @@ Print a list of available keywords and example values for the given URLs
Add input URLs which returned an error to FILE
.TP
.B "\-N, \-\-print" \f[I][EVENT:]FORMAT\f[]
-Write FORMAT during EVENT (default 'prepare') to standard output. Examples: 'id' or 'post:{md5[:8]}'
+Write FORMAT during EVENT (default 'prepare') to standard output instead of downloading files. Can be used multiple times. Examples: 'id' or 'post:{md5[:8]}'
+.TP
+.B "\-\-Print" \f[I][EVENT:]FORMAT\f[]
+Like --print, but downloads files as well
.TP
.B "\-\-print\-to\-file" \f[I][EVENT:]FORMAT FILE\f[]
-Append FORMAT during EVENT to FILE
+Append FORMAT during EVENT to FILE instead of downloading files. Can be used multiple times
+.TP
+.B "\-\-Print\-to\-file" \f[I][EVENT:]FORMAT FILE\f[]
+Like --print-to-file, but downloads files as well
.TP
.B "\-\-list\-modules"
Print a list of available extractor modules
@@ -138,7 +147,7 @@ Make all connections via IPv6
Disable HTTPS certificate validation
.TP
.B "\-r, \-\-limit\-rate" \f[I]RATE\f[]
-Maximum download rate (e.g. 500k or 2.5M)
+Maximum download rate (e.g. 500k, 2.5M, or 800k-2M)
.TP
.B "\-\-chunk\-size" \f[I]SIZE\f[]
Size of in-memory data chunks (default: 32k)
@@ -149,6 +158,9 @@ Number of seconds to wait before each download. This can be either a constant va
.B "\-\-sleep\-request" \f[I]SECONDS\f[]
Number of seconds to wait between HTTP requests during data extraction
.TP
+.B "\-\-sleep\-429" \f[I]SECONDS\f[]
+Number of seconds to wait when receiving a '429 Too Many Requests' response
+.TP
.B "\-\-sleep\-extractor" \f[I]SECONDS\f[]
Number of seconds to wait before starting data extraction for an input URL
.TP
@@ -219,7 +231,7 @@ Do not download files smaller than SIZE (e.g. 500k or 2.5M)
Do not download files larger than SIZE (e.g. 500k or 2.5M)
.TP
.B "\-\-download\-archive" \f[I]FILE\f[]
-Record all downloaded or skipped files in FILE and skip downloading any file already in it
+Record successfully downloaded files in FILE and skip downloading any file already in it
.TP
.B "\-\-range" \f[I]RANGE\f[]
Index range(s) specifying which files to download. These can be either a constant value, range, or slice (e.g. '5', '8-20', or '1:24:3')
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index 1c2a2a0..12eea08 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2025-05-23" "1.29.7" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2025-07-27" "1.30.2" "gallery-dl Manual"
.\" disable hyphenation
.nh
.\" disable justification (adjust text to left margin only)
@@ -99,8 +99,8 @@ A \f[I]format string\f[] to build filenames for downloaded files with.
If this is an \f[I]object\f[], it must contain Python expressions mapping to the
filename format strings to use.
-These expressions are evaluated in the order as specified in Python 3.6+
-and in an undetermined order in Python 3.4 and 3.5.
+These expressions are evaluated in the specified order until one evaluates
+to \f[I]True\f[].
The available replacement keys depend on the extractor used. A list
of keys for a specific one can be acquired by calling *gallery-dl*
@@ -242,14 +242,15 @@ Share number of skipped downloads between parent and child extractors.
.br
* "/!? (){}"
.br
-* {" ": "_", "/": "-", "|": "-", ":": "_-_", "*": "_+_"}
+* {"/": "_", "+": "_+_", "({[": "(", "]})": ")", "a-z": "*"}
.IP "Description:" 4
-A string of characters to be replaced with the value of
+A \f[I]string\f[] of characters to be replaced with the value of
.br
\f[I]path-replace\f[]
-or an object mapping invalid/unwanted characters to their replacements
+or an \f[I]object\f[] mapping invalid/unwanted characters, character sets,
.br
+or character ranges to their replacements
for generated path segment names.
.br
@@ -461,13 +462,13 @@ response before \f[I]retrying\f[] the request.
\f[I][E621]\f[],
\f[I][foolfuuka]:search\f[],
\f[I]itaku\f[],
-\f[I]koharu\f[],
\f[I]newgrounds\f[],
\f[I][philomena]\f[],
-\f[I]pixiv:novel\f[],
+\f[I]pixiv-novel\f[],
\f[I]plurk\f[],
\f[I]poipiku\f[],
\f[I]pornpics\f[],
+\f[I]schalenetwork\f[],
\f[I]scrolller\f[],
\f[I]soundgasm\f[],
\f[I]urlgalleries\f[],
@@ -541,7 +542,7 @@ This is supported for
.br
* \f[I]booruvar\f[] (*)
.br
-* \f[I]coomerparty\f[]
+* \f[I]coomer\f[]
.br
* \f[I]danbooru\f[] (*)
.br
@@ -555,6 +556,8 @@ This is supported for
.br
* \f[I]exhentai\f[]
.br
+* \f[I]girlswithmuscle\f[]
+.br
* \f[I]horne\f[] (R)
.br
* \f[I]idolcomplex\f[]
@@ -563,9 +566,11 @@ This is supported for
.br
* \f[I]inkbunny\f[]
.br
-* \f[I]kemonoparty\f[]
+* \f[I]iwara\f[]
+.br
+* \f[I]kemono\f[]
.br
-* \f[I]koharu\f[]
+* \f[I]madokami\f[] (R)
.br
* \f[I]mangadex\f[]
.br
@@ -577,8 +582,12 @@ This is supported for
.br
* \f[I]pillowfort\f[]
.br
+* \f[I]rule34xyz\f[]
+.br
* \f[I]sankaku\f[]
.br
+* \f[I]schalenetwork\f[]
+.br
* \f[I]scrolller\f[]
.br
* \f[I]seiga\f[]
@@ -828,11 +837,6 @@ User-Agent header value used for HTTP requests.
Setting this value to \f[I]"browser"\f[] will try to automatically detect
and use the \f[I]User-Agent\f[] header of the system's default browser.
-Note:
-This option has *no* effect if
-\f[I]extractor.browser\f[]
-is enabled.
-
.SS extractor.*.browser
.IP "Type:" 6
@@ -840,12 +844,14 @@ is enabled.
.IP "Default:" 9
.br
-* \f[I]"firefox"\f[]: \f[I]artstation\f[], \f[I]fanbox\f[], \f[I]mangasee\f[], \f[I]twitter\f[]
+* \f[I]"firefox"\f[]: \f[I]artstation\f[], \f[I]behance\f[], \f[I]fanbox\f[], \f[I]twitter\f[]
.br
* \f[I]null\f[]: otherwise
.IP "Example:" 4
.br
+* "firefox/128:linux"
+.br
* "chrome:macos"
.IP "Description:" 4
@@ -855,10 +861,23 @@ by using their default HTTP headers and TLS ciphers for HTTP requests.
Optionally, the operating system used in the \f[I]User-Agent\f[] header can be
specified after a \f[I]:\f[] (\f[I]windows\f[], \f[I]linux\f[], or \f[I]macos\f[]).
+Supported browsers:
+
+.br
+* \f[I]firefox\f[]
+.br
+* \f[I]firefox/140\f[]
+.br
+* \f[I]firefox/128\f[]
+.br
+* \f[I]chrome\f[]
+.br
+* \f[I]chrome/138\f[]
+.br
+* \f[I]chrome/111\f[]
+
Note:
-This option overrides
-\f[I]user-agent\f[]
-and sets custom
+This option sets custom
\f[I]headers\f[]
and
\f[I]ciphers\f[]
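
For illustration, a config sketch applying one of the supported profiles above to a single site; the site choice is arbitrary, and the value is taken from the examples on this page:

.. code:: json

    {
        "extractor": {
            "artstation": {
                "browser": "firefox/128:linux"
            }
        }
    }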
@@ -888,7 +907,10 @@ instead of the extractor's \f[I]root\f[] domain.
.SS extractor.*.headers
.IP "Type:" 6
-\f[I]object\f[] (name -> value)
+.br
+* \f[I]"string"\f[]
+.br
+* \f[I]object\f[] (name -> value)
.IP "Default:" 9
.. code:: json
@@ -908,13 +930,22 @@ to be sent with each HTTP request,
To disable sending a header, set its value to \f[I]null\f[].
+Set this option to \f[I]"firefox"\f[] or \f[I]"chrome"\f[]
+to use these browsers' default headers.
+
.SS extractor.*.ciphers
.IP "Type:" 6
-\f[I]list\f[] of \f[I]strings\f[]
+.br
+* \f[I]string\f[]
+.br
+* \f[I]list\f[] of \f[I]strings\f[]
.IP "Example:" 4
-.. code:: json
+.br
+* "firefox"
+.br
+* .. code:: json
["ECDHE-ECDSA-AES128-GCM-SHA256",
"ECDHE-RSA-AES128-GCM-SHA256",
@@ -928,6 +959,9 @@ List of TLS/SSL cipher suites in
to be passed to
\f[I]ssl.SSLContext.set_ciphers()\f[]
+Set this option to \f[I]"firefox"\f[] or \f[I]"chrome"\f[]
+to use these browsers' default ciphers.
+
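
A combined sketch of the new string forms for both options, applying Firefox's default headers and ciphers to one site; the site choice is arbitrary:

.. code:: json

    {
        "extractor": {
            "twitter": {
                "headers": "firefox",
                "ciphers": "firefox"
            }
        }
    }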
.SS extractor.*.tls12
.IP "Type:" 6
@@ -935,7 +969,7 @@ to be passed to
.IP "Default:" 9
.br
-* \f[I]false\f[]: \f[I]artstation\f[]
+* \f[I]false\f[]: \f[I]artstation\f[], \f[I]behance\f[]
.br
* \f[I]true\f[]: otherwise
@@ -1228,9 +1262,9 @@ for available \f[I]PRAGMA\f[] statements and further details.
.SS extractor.*.actions
.IP "Type:" 6
.br
-* \f[I]object\f[] (pattern -> action(s))
+* \f[I]object\f[] (pattern -> \f[I]Action(s)\f[])
.br
-* \f[I]list\f[] of \f[I]lists\f[] with pattern -> action(s) pairs as elements
+* \f[I]list\f[] of \f[I]lists\f[] with pattern -> \f[I]Action(s)\f[] pairs as elements
.IP "Example:" 4
.. code:: json
@@ -1259,57 +1293,17 @@ for available \f[I]PRAGMA\f[] statements and further details.
.IP "Description:" 4
-Perform an \f[I]action\f[] when logging a message matched by \f[I]pattern\f[].
+Perform an \f[I]Action\f[] when logging a message matched by \f[I]pattern\f[].
\f[I]pattern\f[] is parsed as severity level (\f[I]debug\f[], \f[I]info\f[], \f[I]warning\f[], \f[I]error\f[], or integer value)
-followed by an optional \f[I]Python Regular Expression\f[]
-separated by a colon \f[I]:\f[].
+followed by an optional
+\f[I]Python Regular Expression\f[]
+separated by a colon \f[I]:\f[]
+
Using \f[I]*\f[] as level or leaving it empty
matches logging messages of all levels
(e.g. \f[I]*:<re>\f[] or \f[I]:<re>\f[]).
-\f[I]action\f[] is parsed as action type
-followed by (optional) arguments.
-
-It is possible to specify more than one \f[I]action\f[] per \f[I]pattern\f[]
-by providing them as a \f[I]list\f[]: \f[I]["<action1>", "<action2>", …]\f[]
-
-Supported Action Types:
-
-\f[I]status\f[]:
-Modify job exit status.
-.br
-Expected syntax is \f[I]<operator> <value>\f[] (e.g. \f[I]= 100\f[]).
-.br
-
-Supported operators are
-\f[I]=\f[] (assignment),
-\f[I]&\f[] (bitwise AND),
-\f[I]|\f[] (bitwise OR),
-\f[I]^\f[] (bitwise XOR).
-\f[I]level\f[]:
-Modify severity level of the current logging message.
-.br
-Can be one of \f[I]debug\f[], \f[I]info\f[], \f[I]warning\f[], \f[I]error\f[] or an integer value.
-.br
-\f[I]print\f[]:
-Write argument to stdout.
-\f[I]exec\f[]:
-Run a shell command.
-\f[I]abort\f[]:
-Stop the current extractor run.
-\f[I]terminate\f[]:
-Stop the current extractor run, including parent extractors.
-\f[I]restart\f[]:
-Restart the current extractor run.
-\f[I]wait\f[]:
-Sleep for a given \f[I]Duration\f[] or
-.br
-wait until Enter is pressed when no argument was given.
-.br
-\f[I]exit\f[]:
-Exit the program with the given argument as exit status.
-
.SS extractor.*.postprocessors
.IP "Type:" 6
@@ -1443,6 +1437,25 @@ This value gets internally used as the \f[I]verify\f[] parameter for the
\f[I]requests.request()\f[] method.
+.SS extractor.*.truststore
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Use a
+.br
+\f[I]truststore\f[]
+\f[I]SSLContext\f[] for verifying SSL/TLS certificates
+to make use of your system's native certificate stores
+.br
+instead of relying on
+\f[I]certifi\f[]
+certificates.
+
+
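
A minimal sketch enabling this for all extractors, assuming the truststore package is installed:

.. code:: json

    {
        "extractor": {
            "truststore": true
        }
    }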
.SS extractor.*.download
.IP "Type:" 6
\f[I]bool\f[]
@@ -1700,6 +1713,17 @@ Try to follow external URLs of embedded players.
Limit the number of posts/projects to download.
+.SS extractor.artstation.mviews
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]true\f[]
+
+.IP "Description:" 4
+Download \f[I].mview\f[] files.
+
+
.SS extractor.artstation.previews
.IP "Type:" 6
\f[I]bool\f[]
@@ -1708,7 +1732,7 @@ Limit the number of posts/projects to download.
\f[I]false\f[]
.IP "Description:" 4
-Download video previews.
+Download embed previews.
.SS extractor.artstation.videos
@@ -2043,7 +2067,7 @@ Possibly available formats are
\f[I]string\f[]
.IP "Default:" 9
-\f[I]"/api/_001"\f[]
+\f[I]"/api/_001_v2"\f[]
.IP "Description:" 4
API endpoint for retrieving file URLs.
@@ -2135,7 +2159,7 @@ Available types are
* \f[I]list\f[] of \f[I]strings\f[]
.IP "Default:" 9
-\f[I]["user-models", "user-posts"]\f[]
+\f[I]["user-images", "user-videos"]\f[]
.IP "Description:" 4
A (comma-separated) list of subcategories to include
@@ -2154,6 +2178,14 @@ Possible values are
It is possible to use \f[I]"all"\f[] instead of listing all values separately.
+.IP "Note:" 4
+To get a more complete set of metadata
+like \f[I]model['name']\f[] and \f[I]post['title']\f[],
+include \f[I]user-models\f[] and \f[I]user-posts\f[]
+as well as the default \f[I]user-images\f[] and \f[I]user-videos\f[]:
+
+\f[I]["user-models", "user-posts", "user-images", "user-videos"]\f[]
+
.SS extractor.civitai.metadata
.IP "Type:" 6
@@ -2169,14 +2201,14 @@ It is possible to use \f[I]"all"\f[] instead of listing all values separately.
.IP "Example:" 4
.br
-* "generation,version"
+* "generation,post,version"
.br
-* ["generation", "version"]
+* ["version", "generation"]
.IP "Description:" 4
-Extract additional \f[I]generation\f[] and \f[I]version\f[] metadata.
+Extract additional \f[I]generation\f[], \f[I]version\f[], and \f[I]post\f[] metadata.
-Note: This requires 1 additional HTTP request per image or video.
+Note: This requires 1 or more additional API requests per image or video.
.SS extractor.civitai.nsfw
@@ -2375,6 +2407,17 @@ greater than the per-page limit, gallery-dl will stop after the first
batch. The value cannot be less than 1.
+.SS extractor.dankefuerslesen.zip
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Download each chapter as a single ZIP archive instead of individual images.
+
+
.SS extractor.deviantart.auto-watch
.IP "Type:" 6
\f[I]bool\f[]
@@ -2806,6 +2849,18 @@ Discord Bot Token for API requests.
You can follow \f[I]this guide\f[] to get a token.
+.SS extractor.dynastyscans.anthology.metadata
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Extract \f[I]alert\f[], \f[I]description\f[], and \f[I]status\f[] metadata
+from an anthology's HTML page.
+
+
.SS extractor.[E621].metadata
.IP "Type:" 6
.br
@@ -2922,8 +2977,27 @@ Selects how to handle "you do not have enough GP" errors.
\f[I]null\f[]
.IP "Description:" 4
-Sets a custom image download limit and
-stops extraction when it gets exceeded.
+Set a custom image download limit and perform
+\f[I]limits-action\f[]
+when it gets exceeded.
+
+
+.SS extractor.exhentai.limits-action
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]"stop"\f[]
+
+.IP "Description:" 4
+Action to perform when the image limit is exceeded.
+
+.br
+* "stop": Stop the current extractor run.
+.br
+* "wait": Wait for user input.
+.br
+* "reset": Spend GP to reset your account's image limits.
.SS extractor.exhentai.metadata
@@ -2937,8 +3011,10 @@ stops extraction when it gets exceeded.
Load extended gallery metadata from the
\f[I]API\f[].
-Adds \f[I]archiver_key\f[], \f[I]posted\f[], and \f[I]torrents\f[].
-Makes \f[I]date\f[] and \f[I]filesize\f[] more precise.
+.br
+* Adds \f[I]archiver_key\f[], \f[I]posted\f[], and \f[I]torrents\f[]
+.br
+* Provides exact \f[I]date\f[] and \f[I]filesize\f[]
.SS extractor.exhentai.original
@@ -2964,6 +3040,9 @@ Selects an alternative source to download files from.
.br
* \f[I]"hitomi"\f[]: Download the corresponding gallery from \f[I]hitomi.la\f[]
+.br
+* \f[I]"metadata"\f[]: Load only a gallery's metadata from the
+\f[I]API\f[]
.SS extractor.exhentai.tags
@@ -2990,6 +3069,36 @@ for example \f[I]tags_artist\f[] or \f[I]tags_character\f[].
Extract comments that include photo attachments made by the author of the post.
+.SS extractor.facebook.include
+.IP "Type:" 6
+.br
+* \f[I]string\f[]
+.br
+* \f[I]list\f[] of \f[I]strings\f[]
+
+.IP "Default:" 9
+\f[I]"photos"\f[]
+
+.IP "Example:" 4
+.br
+* "avatar,photos"
+.br
+* ["avatar", "photos"]
+
+.IP "Description:" 4
+A (comma-separated) list of subcategories to include
+when processing a user profile.
+
+Supported values are
+
+.br
+* \f[I]avatar\f[]
+.br
+* \f[I]photos\f[]
+
+It is possible to use \f[I]"all"\f[] instead of listing all values separately.
+
+
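
A sketch using the list form from the example above:

.. code:: json

    {
        "extractor": {
            "facebook": {
                "include": ["avatar", "photos"]
            }
        }
    }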
.SS extractor.facebook.videos
.IP "Type:" 6
.br
@@ -3048,6 +3157,17 @@ extraction and download for YouTube, Vimeo, and SoundCloud embeds.
* \f[I]false\f[]: Ignore embeds.
+.SS extractor.fanbox.fee-max
+.IP "Type:" 6
+\f[I]integer\f[]
+
+.IP "Description:" 4
+Do not request API data or extract files from posts
+that require a fee (\f[I]feeRequired\f[]) greater than the specified amount.
+
+Note: This option has no effect on individual post URLs.
+
+
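
A sketch skipping fee-gated posts; the threshold here is a hypothetical example:

.. code:: json

    {
        "extractor": {
            "fanbox": {
                "fee-max": 500
            }
        }
    }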
.SS extractor.fanbox.metadata
.IP "Type:" 6
.br
@@ -3495,13 +3615,15 @@ Selects which API endpoints to use.
.IP "Description:" 4
Controls from which position to start the extraction process from.
+\f[I]true\f[]
+Start from the beginning.
.br
-* \f[I]true\f[]: Start from the beginning.
Log the most recent \f[I]cursor\f[] value when interrupted before reaching the end.
.br
-* \f[I]false\f[]: Start from the beginning.
-.br
-* any \f[I]string\f[]: Start from the position defined by this value.
+\f[I]false\f[]
+Start from the beginning.
+any \f[I]string\f[]
+Start from the position defined by this value.
.SS extractor.instagram.include
@@ -3651,6 +3773,42 @@ Do not download videos
Split \f[I]stories\f[] elements into separate posts.
+.SS extractor.itaku.include
+.IP "Type:" 6
+.br
+* \f[I]string\f[]
+.br
+* \f[I]list\f[] of \f[I]strings\f[]
+
+.IP "Default:" 9
+\f[I]"gallery"\f[]
+
+.IP "Example:" 4
+.br
+* "stars,gallery"
+.br
+* ["stars", "gallery"]
+
+.IP "Description:" 4
+A (comma-separated) list of subcategories to include
+when processing a user profile.
+
+Supported values are
+
+.br
+* \f[I]gallery\f[]
+.br
+* \f[I]posts\f[]
+.br
+* \f[I]followers\f[]
+.br
+* \f[I]following\f[]
+.br
+* \f[I]stars\f[]
+
+It is possible to use \f[I]"all"\f[] instead of listing all values separately.
+
+
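
A sketch selecting a subset of the supported values listed above:

.. code:: json

    {
        "extractor": {
            "itaku": {
                "include": ["gallery", "posts", "stars"]
            }
        }
    }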
.SS extractor.itaku.videos
.IP "Type:" 6
\f[I]bool\f[]
@@ -3662,7 +3820,33 @@ Split \f[I]stories\f[] elements into separate posts.
Download video files.
-.SS extractor.kemonoparty.archives
+.SS extractor.iwara.include
+.IP "Type:" 6
+.br
+* \f[I]string\f[]
+.br
+* \f[I]list\f[] of \f[I]strings\f[]
+
+.IP "Default:" 9
+\f[I]["user-images", "user-videos"]\f[]
+
+.IP "Description:" 4
+A (comma-separated) list of subcategories to include
+when processing a user profile.
+
+Possible values are
+
+.br
+* \f[I]"user-images"\f[]
+.br
+* \f[I]"user-videos"\f[]
+.br
+* \f[I]"user-playlists"\f[]
+
+It is possible to use \f[I]"all"\f[] instead of listing all values separately.
+
+
+.SS extractor.kemono.archives
.IP "Type:" 6
\f[I]bool\f[]
@@ -3676,7 +3860,7 @@ Extract additional metadata for \f[I]archives\f[] files, including
Note: This requires 1 additional HTTP request per \f[I]archives\f[] file.
-.SS extractor.kemonoparty.comments
+.SS extractor.kemono.comments
.IP "Type:" 6
\f[I]bool\f[]
@@ -3689,23 +3873,39 @@ Extract \f[I]comments\f[] metadata.
Note: This requires 1 additional HTTP request per post.
-.SS extractor.kemonoparty.duplicates
+.SS extractor.kemono.duplicates
.IP "Type:" 6
-\f[I]bool\f[]
+.br
+* \f[I]bool\f[]
+.br
+* \f[I]string\f[]
+.br
+* \f[I]list\f[] of \f[I]strings\f[]
.IP "Default:" 9
\f[I]false\f[]
+.IP "Example:" 4
+.br
+* "attachment,inline"
+.br
+* ["file", "attachment"]
+
.IP "Description:" 4
Controls how to handle duplicate files in a post.
+\f[I]true\f[]
+Download duplicates
+\f[I]false\f[]
+Ignore duplicates
+any \f[I]list\f[] or \f[I]string\f[]
+Download a duplicate file if its \f[I]type\f[] is in the given list
.br
-* \f[I]true\f[]: Download duplicates
+Ignore it otherwise
.br
-* \f[I]false\f[]: Ignore duplicates
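
A sketch using the list form from the example above, downloading duplicates only when their type is file or attachment:

.. code:: json

    {
        "extractor": {
            "kemono": {
                "duplicates": ["file", "attachment"]
            }
        }
    }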
-.SS extractor.kemonoparty.dms
+.SS extractor.kemono.dms
.IP "Type:" 6
\f[I]bool\f[]
@@ -3716,7 +3916,7 @@ Controls how to handle duplicate files in a post.
Extract a user's direct messages as \f[I]dms\f[] metadata.
-.SS extractor.kemonoparty.announcements
+.SS extractor.kemono.announcements
.IP "Type:" 6
\f[I]bool\f[]
@@ -3727,7 +3927,7 @@ Extract a user's direct messages as \f[I]dms\f[] metadata.
Extract a user's announcements as \f[I]announcements\f[] metadata.
-.SS extractor.kemonoparty.endpoint
+.SS extractor.kemono.endpoint
.IP "Type:" 6
\f[I]string\f[]
@@ -3762,7 +3962,7 @@ Provides more metadata, but might not return a creator's first/last posts.
.br
-.SS extractor.kemonoparty.favorites
+.SS extractor.kemono.favorites
.IP "Type:" 6
\f[I]string\f[]
@@ -3775,7 +3975,7 @@ Determines the type of favorites to be downloaded.
Available types are \f[I]artist\f[], and \f[I]post\f[].
-.SS extractor.kemonoparty.files
+.SS extractor.kemono.files
.IP "Type:" 6
\f[I]list\f[] of \f[I]strings\f[]
@@ -3788,7 +3988,7 @@ Determines the type and order of files to be downloaded.
Available types are \f[I]file\f[], \f[I]attachments\f[], and \f[I]inline\f[].
-.SS extractor.kemonoparty.max-posts
+.SS extractor.kemono.max-posts
.IP "Type:" 6
\f[I]integer\f[]
@@ -3799,7 +3999,7 @@ Available types are \f[I]file\f[], \f[I]attachments\f[], and \f[I]inline\f[].
Limit the number of posts to download.
-.SS extractor.kemonoparty.metadata
+.SS extractor.kemono.metadata
.IP "Type:" 6
\f[I]bool\f[]
@@ -3810,7 +4010,7 @@ Limit the number of posts to download.
Extract \f[I]username\f[] and \f[I]user_profile\f[] metadata.
-.SS extractor.kemonoparty.revisions
+.SS extractor.kemono.revisions
.IP "Type:" 6
.br
* \f[I]bool\f[]
@@ -3828,7 +4028,7 @@ Set this to \f[I]"unique"\f[] to filter out duplicate revisions.
Note: This requires 1 additional HTTP request per post.
-.SS extractor.kemonoparty.order-revisions
+.SS extractor.kemono.order-revisions
.IP "Type:" 6
\f[I]string\f[]
@@ -3876,7 +4076,7 @@ If the selected format is not available,
the first in the list gets chosen (usually mp3).
-.SS extractor.koharu.cbz
+.SS extractor.schalenetwork.cbz
.IP "Type:" 6
\f[I]bool\f[]
@@ -3890,7 +4090,7 @@ Disabling this option causes a gallery
to be downloaded as individual image files.
-.SS extractor.koharu.format
+.SS extractor.schalenetwork.format
.IP "Type:" 6
.br
* \f[I]string\f[]
@@ -3911,7 +4111,7 @@ Possible formats are
.br
-.SS extractor.koharu.tags
+.SS extractor.schalenetwork.tags
.IP "Type:" 6
\f[I]bool\f[]
@@ -4000,11 +4200,22 @@ to filter chapters by.
.SS extractor.mangadex.ratings
.IP "Type:" 6
-\f[I]list\f[] of \f[I]strings\f[]
+.br
+* \f[I]string\f[]
+.br
+* \f[I]list\f[] of \f[I]strings\f[]
.IP "Default:" 9
\f[I]["safe", "suggestive", "erotica", "pornographic"]\f[]
+.IP "Example:" 4
+.br
+* "safe"
+.br
+* "erotica,suggestive"
+.br
+* ["erotica", "suggestive"]
+
.IP "Description:" 4
List of acceptable content ratings for returned chapters.
@@ -4103,6 +4314,35 @@ Also emit metadata for text-only posts without media content.
Your access token, necessary to fetch favorited notes.
+.SS extractor.[misskey].include
+.IP "Type:" 6
+.br
+* \f[I]string\f[]
+.br
+* \f[I]list\f[] of \f[I]strings\f[]
+
+.IP "Default:" 9
+\f[I]"notes"\f[]
+
+.IP "Example:" 4
+.br
+* "avatar,background,notes"
+.br
+* ["avatar", "background", "notes"]
+
+.IP "Description:" 4
+A (comma-separated) list of subcategories to include
+when processing a user profile.
+
+Possible values are
+\f[I]"info"\f[],
+\f[I]"avatar"\f[],
+\f[I]"background"\f[],
+\f[I]"notes"\f[],
+
+It is possible to use \f[I]"all"\f[] instead of listing all values separately.
+
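+For example, a sketch using the comma-separated string form
+shown in the Example above:
+
+.. code:: json
+
+{
+"extractor": {
+"misskey": {
+"include": "avatar,notes"
+}
+}
+}
+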
+
.SS extractor.[misskey].renotes
.IP "Type:" 6
\f[I]bool\f[]
@@ -4138,7 +4378,7 @@ Extract extended \f[I]pool\f[] metadata.
Note: Not supported by all \f[I]moebooru\f[] instances.
-.SS extractor.naver.videos
+.SS extractor.naver-blog.videos
.IP "Type:" 6
\f[I]bool\f[]
@@ -4149,6 +4389,17 @@ Note: Not supported by all \f[I]moebooru\f[] instances.
Download videos.
+.SS extractor.naver-chzzk.offset
+.IP "Type:" 6
+\f[I]integer\f[]
+
+.IP "Default:" 9
+\f[I]0\f[]
+
+.IP "Description:" 4
+Custom \f[I]offset\f[] starting value when paginating over comments.
+
+
.SS extractor.newgrounds.flash
.IP "Type:" 6
\f[I]bool\f[]
@@ -4345,6 +4596,33 @@ Extract additional metadata (\f[I]source\f[], \f[I]uploader\f[])
Note: This requires 1 additional HTTP request per post.
+.SS extractor.patreon.cursor
+.IP "Type:" 6
+.br
+* \f[I]bool\f[]
+.br
+* \f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]true\f[]
+
+.IP "Example:" 4
+"03:eyJ2IjoxLCJjIjoiMzU0NDQ1MjAiLCJ0IjoiIn0=:DTcmjBoVj01o_492YBYqHhqx"
+
+.IP "Description:" 4
+Controls the position from which to start the extraction process.
+
+\f[I]true\f[]
+Start from the beginning.
+.br
+Log the most recent \f[I]cursor\f[] value when interrupted before reaching the end.
+.br
+\f[I]false\f[]
+Start from the beginning.
+any \f[I]string\f[]
+Start from the position defined by this value.
+
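+For example, a sketch resuming an interrupted run from a
+previously logged \f[I]cursor\f[] value (shortened here):
+
+.. code:: json
+
+{
+"extractor": {
+"patreon": {
+"cursor": "03:eyJ2IjoxLCJjIjoi..."
+}
+}
+}
+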
+
.SS extractor.patreon.files
.IP "Type:" 6
\f[I]list\f[] of \f[I]strings\f[]
@@ -4403,6 +4681,17 @@ Possible formats:
* \f[I]thumbnail_small\f[] (\f[I]"h":100,"w":100\f[])
+.SS extractor.patreon.user.date-max
+.IP "Type:" 6
+\f[I]Date\f[]
+
+.IP "Default:" 9
+\f[I]0\f[]
+
+.IP "Description:" 4
+Sets the \f[I]Date\f[] to start from.
+
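+For example, a sketch assuming the usual \f[I]Date\f[] formats,
+i.e. a datetime string or a UNIX timestamp:
+
+.. code:: json
+
+{
+"extractor": {
+"patreon": {
+"user": {
+"date-max": "2025-01-01T00:00:00"
+}
+}
+}
+}
+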
+
.SS extractor.[philomena].api-key
.IP "Type:" 6
\f[I]string\f[]
@@ -4592,40 +4881,6 @@ by using a third-party tool like
\f[I]gppt\f[].
-.SS extractor.pixiv.novel.covers
-.IP "Type:" 6
-\f[I]bool\f[]
-
-.IP "Default:" 9
-\f[I]false\f[]
-
-.IP "Description:" 4
-Download cover images.
-
-
-.SS extractor.pixiv.novel.embeds
-.IP "Type:" 6
-\f[I]bool\f[]
-
-.IP "Default:" 9
-\f[I]false\f[]
-
-.IP "Description:" 4
-Download embedded images.
-
-
-.SS extractor.pixiv.novel.full-series
-.IP "Type:" 6
-\f[I]bool\f[]
-
-.IP "Default:" 9
-\f[I]false\f[]
-
-.IP "Description:" 4
-When downloading a novel being part of a series,
-download all novels of that series.
-
-
.SS extractor.pixiv.metadata
.IP "Type:" 6
\f[I]bool\f[]
@@ -4753,6 +5008,125 @@ A value of \f[I]0\f[] means no limit.
Try to fetch \f[I]limit_sanity_level\f[] works via web API.
+.SS extractor.pixiv-novel.comments
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Fetch \f[I]comments\f[] metadata.
+
+Note: This requires 1 or more additional API requests per novel,
+depending on the number of comments.
+
+
+.SS extractor.pixiv-novel.covers
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Download cover images.
+
+
+.SS extractor.pixiv-novel.embeds
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Download embedded images.
+
+
+.SS extractor.pixiv-novel.full-series
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+When downloading a novel that is part of a series,
+download all novels of that series.
+
+
+.SS extractor.pixiv-novel.max-posts
+.IP "Type:" 6
+\f[I]integer\f[]
+
+.IP "Default:" 9
+\f[I]0\f[]
+
+.IP "Description:" 4
+When downloading multiple novels,
+this sets the maximum number of novels to get.
+
+A value of \f[I]0\f[] means no limit.
+
+
+.SS extractor.pixiv-novel.metadata
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Fetch extended \f[I]user\f[] metadata.
+
+
+.SS extractor.pixiv-novel.metadata-bookmark
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+For novels bookmarked by
+\f[I]your own account\f[],
+fetch bookmark tags as \f[I]tags_bookmark\f[] metadata.
+
+Note: This requires 1 additional API request per bookmarked post.
+
+
+.SS extractor.pixiv-novel.refresh-token
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Description:" 4
+The \f[I]refresh-token\f[] value you get
+from running \f[I]gallery-dl oauth:pixiv\f[] (see OAuth_) or
+by using a third-party tool like
+\f[I]gppt\f[].
+
+This can be the same value as \f[I]extractor.pixiv.refresh-token\f[].
+
+
+.SS extractor.pixiv-novel.tags
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]"japanese"\f[]
+
+.IP "Description:" 4
+Controls the \f[I]tags\f[] metadata field.
+
+.br
+* "japanese": List of Japanese tags
+.br
+* "translated": List of translated tags
+.br
+* "original": Unmodified list with both Japanese and translated tags
+
+
.SS extractor.plurk.comments
.IP "Type:" 6
\f[I]bool\f[]
@@ -5027,22 +5401,6 @@ Selects the file format to extract.
When more than one format is given, the first available one is selected.
-.SS extractor.sankaku.id-format
-.IP "Type:" 6
-\f[I]string\f[]
-
-.IP "Default:" 9
-\f[I]"numeric"\f[]
-
-.IP "Description:" 4
-Format of \f[I]id\f[] metadata fields.
-
-.br
-* \f[I]"alphanumeric"\f[] or \f[I]"alnum"\f[]: 11-character alphanumeric IDs (\f[I]y0abGlDOr2o\f[])
-.br
-* \f[I]"numeric"\f[] or \f[I]"legacy"\f[]: numeric IDs (\f[I]360451\f[])
-
-
.SS extractor.sankaku.refresh
.IP "Type:" 6
\f[I]bool\f[]
@@ -5117,6 +5475,17 @@ Download video embeds from external sites.
Download videos.
+.SS extractor.sexcom.gifs
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]true\f[]
+
+.IP "Description:" 4
+Download animated images as \f[I].gif\f[] instead of \f[I].webp\f[].
+
+
.SS extractor.skeb.article
.IP "Type:" 6
\f[I]bool\f[]
@@ -5500,13 +5869,13 @@ Download user avatars.
.SS extractor.tiktok.user.module
.IP "Type:" 6
-\f[I]string\f[]
+\f[I]Module\f[]
.IP "Default:" 9
\f[I]null\f[]
.IP "Description:" 4
-Name or filesystem path of the \f[I]ytdl\f[] Python module
+The \f[I]ytdl\f[] \f[I]Module\f[]
to extract posts from a \f[I]tiktok\f[] user profile with.
See \f[I]extractor.ytdl.module\f[].
@@ -5858,15 +6227,18 @@ Controls how to handle Cross Site Request Forgery (CSRF) tokens.
.IP "Description:" 4
Controls the position from which to start the extraction process.
+\f[I]true\f[]
+Start from the beginning.
.br
-* \f[I]true\f[]: Start from the beginning.
Log the most recent \f[I]cursor\f[] value when interrupted before reaching the end.
.br
-* \f[I]false\f[]: Start from the beginning.
-.br
-* any \f[I]string\f[]: Start from the position defined by this value.
+\f[I]false\f[]
+Start from the beginning.
+any \f[I]string\f[]
+Start from the position defined by this value.
-Note: A \f[I]cursor\f[] value from one timeline cannot be used with another.
+.IP "Note:" 4
+A \f[I]cursor\f[] value from one timeline cannot be used with another.
.SS extractor.twitter.expand
@@ -6448,6 +6820,30 @@ Use the given values as \f[I]type\f[] parameter for URLs with the specified exte
.br
+.SS extractor.webtoons.banners
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Download the active comic's \f[I]banner\f[].
+
+
+.SS extractor.webtoons.thumbnails
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Download the active episode's \f[I]thumbnail\f[].
+
+Useful for creating CBZ archives with actual source thumbnails.
+
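+For example, a sketch enabling both \f[I]banners\f[] and
+\f[I]thumbnails\f[]:
+
+.. code:: json
+
+{
+"extractor": {
+"webtoons": {
+"banners" : true,
+"thumbnails": true
+}
+}
+}
+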
+
.SS extractor.weibo.gifs
.IP "Type:" 6
.br
@@ -6593,6 +6989,17 @@ See
Location of a \f[I]ytdl\f[] configuration file to load options from.
+.SS extractor.ytdl.deprecations
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Allow \f[I]ytdl\f[] to warn about deprecated options and features.
+
+
.SS extractor.ytdl.enabled
.IP "Type:" 6
\f[I]bool\f[]
@@ -6635,12 +7042,25 @@ See
\f[I]true\f[]
.IP "Description:" 4
-Enables the use of \f[I]ytdl's\f[] \f[I]generic\f[] extractor.
+Enables the use of \f[I]ytdl's\f[] \f[I]Generic\f[] extractor.
Set this option to \f[I]"force"\f[] for the same effect as
\f[I]--force-generic-extractor\f[].
+.SS extractor.ytdl.generic-category
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]true\f[]
+
+.IP "Description:" 4
+When using \f[I]ytdl's\f[] \f[I]Generic\f[] extractor,
+change category to \f[I]"ytdl-generic"\f[] and
+set subcategory to the input URL's domain.
+
+
.SS extractor.ytdl.logging
.IP "Type:" 6
\f[I]bool\f[]
@@ -6658,10 +7078,7 @@ Note: Set \f[I]quiet\f[] and \f[I]no_warnings\f[] in
.SS extractor.ytdl.module
.IP "Type:" 6
-.br
-* \f[I]string\f[]
-.br
-* \f[I]Path\f[]
+\f[I]Module\f[]
.IP "Default:" 9
\f[I]null\f[]
@@ -6673,7 +7090,7 @@ Note: Set \f[I]quiet\f[] and \f[I]no_warnings\f[] in
* "/home/user/.local/lib/python3.13/site-packages/youtube_dl"
.IP "Description:" 4
-Name or filesystem path of the \f[I]ytdl\f[] Python module to import.
+The \f[I]ytdl\f[] \f[I]Module\f[] to import.
Setting this to \f[I]null\f[] will try to import \f[I]"yt_dlp"\f[]
followed by \f[I]"youtube_dl"\f[] as fallback.
@@ -6692,7 +7109,6 @@ followed by \f[I]"youtube_dl"\f[] as fallback.
"merge_output_format": "mkv"
}
-
.IP "Description:" 4
Additional options passed directly to the \f[I]YoutubeDL\f[] constructor.
@@ -6936,13 +7352,23 @@ Set this option to \f[I]null\f[] to disable this indicator.
.SS downloader.*.rate
.IP "Type:" 6
-\f[I]string\f[]
+.br
+* \f[I]string\f[]
+.br
+* \f[I]list\f[] with 2 \f[I]strings\f[]
.IP "Default:" 9
\f[I]null\f[]
.IP "Example:" 4
-"32000", "500k", "2.5M"
+.br
+* "32000"
+.br
+* "500k"
+.br
+* "1M - 2.5M"
+.br
+* ["1M", "2.5M"]
.IP "Description:" 4
Maximum download rate in bytes per second.
@@ -6951,6 +7377,10 @@ Possible values are valid integer or floating-point numbers
optionally followed by one of \f[I]k\f[], \f[I]m\f[], \f[I]g\f[], \f[I]t\f[], or \f[I]p\f[].
These suffixes are case-insensitive.
+If given as a range, the maximum download rate
+is chosen randomly before each download
+(see \f[I]random.randint()\f[]).
+
.SS downloader.*.retries
.IP "Type:" 6
@@ -7123,6 +7553,21 @@ Fail a download when a file does not pass
instead of downloading a potentially broken file.
+.SS downloader.http.validate-html
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]true\f[]
+
+.IP "Description:" 4
+Check for unexpected HTML responses.
+
+Fail file downloads with a \f[I]text/html\f[]
+\f[I]Content-Type header\f[]
+when expecting a media file instead.
+
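+For example, a sketch disabling this check for sites known to
+serve media files with a \f[I]text/html\f[] \f[I]Content-Type\f[]:
+
+.. code:: json
+
+{
+"downloader": {
+"http": {
+"validate-html": false
+}
+}
+}
+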
+
.SS downloader.ytdl.cmdline-args
.IP "Type:" 6
.br
@@ -7156,6 +7601,17 @@ See
Location of a \f[I]ytdl\f[] configuration file to load options from.
+.SS downloader.ytdl.deprecations
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Allow \f[I]ytdl\f[] to warn about deprecated options and features.
+
+
.SS downloader.ytdl.format
.IP "Type:" 6
\f[I]string\f[]
@@ -7204,10 +7660,7 @@ Note: Set \f[I]quiet\f[] and \f[I]no_warnings\f[] in
.SS downloader.ytdl.module
.IP "Type:" 6
-.br
-* \f[I]string\f[]
-.br
-* \f[I]Path\f[]
+\f[I]Module\f[]
.IP "Default:" 9
\f[I]null\f[]
@@ -7219,7 +7672,7 @@ Note: Set \f[I]quiet\f[] and \f[I]no_warnings\f[] in
* "/home/user/.local/lib/python3.13/site-packages/youtube_dl"
.IP "Description:" 4
-Name or filesystem path of the \f[I]ytdl\f[] Python module to import.
+The \f[I]ytdl\f[] \f[I]Module\f[] to import.
Setting this to \f[I]null\f[] will try to import \f[I]"yt_dlp"\f[]
followed by \f[I]"youtube_dl"\f[] as fallback.
@@ -7301,8 +7754,8 @@ Controls the output string format and status indicators.
.br
* \f[I]"color"\f[]: Suitable for terminals that understand ANSI escape codes and colors
.br
-* \f[I]"auto"\f[]: \f[I]"terminal"\f[] on Windows with \f[I]output.ansi\f[] disabled,
-\f[I]"color"\f[] otherwise.
+* \f[I]"auto"\f[]: \f[I]"pipe"\f[] if not on a TTY, \f[I]"terminal"\f[] on Windows with
+\f[I]output.ansi\f[] disabled, \f[I]"color"\f[] otherwise.
It is possible to use custom output format strings
.br
@@ -7759,6 +8212,26 @@ the files' metadata as well as \f[I]{_path}\f[], \f[I]{_directory}\f[],
and \f[I]{_filename}\f[].
+.SS exec.commands
+.IP "Type:" 6
+\f[I]list\f[] of \f[I]commands\f[]
+
+.IP "Example:" 4
+.. code:: json
+
+[
+["echo", "{user[account]}", "{id}"]
+["magick", "convert" "{_path}", "\\fF {_path.rpartition('.')[0]}.png"],
+"rm {}",
+]
+
+.IP "Description:" 4
+Multiple \f[I]commands\f[] to run in succession.
+
+Execution stops after the first \f[I]command\f[]
+that exits with a non-zero status.
+
+
.SS exec.event
.IP "Type:" 6
.br
@@ -7775,6 +8248,27 @@ The event(s) for which \f[I]exec.command\f[] is run.
See \f[I]metadata.event\f[] for a list of available events.
+.SS exec.session
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Start subprocesses in a new session.
+
+On Windows, this means passing
+\f[I]CREATE_NEW_PROCESS_GROUP\f[]
+as a \f[I]creationflags\f[] argument to
+\f[I]subprocess.Popen\f[]
+
+On POSIX systems, this means enabling the
+\f[I]start_new_session\f[] argument of
+\f[I]subprocess.Popen\f[]
+to have it call \f[I]setsid()\f[].
+
+
.SS hash.chunk-size
.IP "Type:" 6
\f[I]integer\f[]
@@ -8385,16 +8879,18 @@ See \f[I]metadata.event\f[] for a list of available events.
.br
* "my_module:generate_text"
.br
-* "~/.local/share/gdl-utils.py:resize"
+* "~/.local/share/gdl_utils.py:resize"
.IP "Description:" 4
The Python function to call.
-This function is specified as \f[I]<module>:<function name>\f[]
-and gets called with the current metadata dict as argument.
+This function is specified as \f[I]<module>:<function name>\f[], where
+.br
+\f[I]<module>\f[] is a \f[I]Module\f[] and
+.br
+\f[I]<function name>\f[] is the name of the function in that module.
-\f[I]module\f[] is either an importable Python module name
-or the \f[I]Path\f[] to a .py file,
+It gets called with the current metadata dict as argument.
.SS rename.from
@@ -8745,29 +9241,133 @@ Note: \f[I]null\f[] references internal extractors defined in
or by \f[I]extractor.modules\f[].
-.SS globals
+.SS extractor.category-map
.IP "Type:" 6
.br
-* \f[I]Path\f[]
+* \f[I]object\f[] (category -> category)
.br
* \f[I]string\f[]
.IP "Example:" 4
-.br
-* "~/.local/share/gdl-globals.py"
-.br
-* "gdl-globals"
+.. code:: json
+
+{
+"danbooru": "booru",
+"gelbooru": "booru"
+}
.IP "Description:" 4
-Path to or name of an
-.br
-\f[I]importable\f[]
-Python module,
-whose namespace,
+A JSON object mapping category names to their replacements.
+
+Special values:
+
.br
+* \f[I]"compat"\f[]
+.. code:: json
+
+{
+"coomer" : "coomerparty",
+"kemono" : "kemonoparty",
+"schalenetwork": "koharu",
+"naver-chzzk" : "chzzk",
+"naver-blog" : "naver",
+"naver-webtoon": "naverwebtoon",
+"pixiv-novel" : "pixiv",
+"pixiv-novel:novel" : ["pixiv", "novel"],
+"pixiv-novel:user" : ["pixiv", "novel-user"],
+"pixiv-novel:series" : ["pixiv", "novel-series"],
+"pixiv-novel:bookmark": ["pixiv", "novel-bookmark"]
+}
+
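+For example, a sketch restoring the previous category names via
+the special \f[I]"compat"\f[] value:
+
+.. code:: json
+
+{
+"extractor": {
+"category-map": "compat"
+}
+}
+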
+
+.SS extractor.config-map
+.IP "Type:" 6
+\f[I]object\f[] (category -> category)
+
+.IP "Default:" 9
+.. code:: json
+
+{
+"coomerparty" : "coomer",
+"kemonoparty" : "kemono",
+"koharu" : "schalenetwork",
+"chzzk" : "naver-chzzk",
+"naver" : "naver-blog",
+"naverwebtoon": "naver-webtoon",
+"pixiv" : "pixiv-novel"
+}
+
+.IP "Description:" 4
+Duplicate the configuration settings of extractor categories
+to other names.
+
+For example, a \f[I]"naver": "naver-blog"\f[] key-value pair will make all
+\f[I]naver\f[] config settings available for \f[I]naver-blog\f[] extractors as well.
+
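+Conversely, setting this option to an empty object disables the
+duplication entirely (a sketch; compare \f[I]remap_categories\f[]
+in \f[I]config.py\f[]):
+
+.. code:: json
+
+{
+"extractor": {
+"config-map": {}
+}
+}
+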
+
+.SS jinja.environment
+.IP "Type:" 6
+\f[I]object\f[] (name -> value)
+
+.IP "Example:" 4
+.. code:: json
+
+{
+"variable_start_string": "(((",
+"variable_end_string" : ")))",
+"keep_trailing_newline": true
+}
+
+.IP "Description:" 4
+Initialization parameters for the \f[I]jinja\f[]
+\f[I]Environment\f[]
+object.
+
+
+.SS jinja.policies
+.IP "Type:" 6
+\f[I]object\f[] (name -> value)
+
+.IP "Example:" 4
+.. code:: json
+
+{
+"urlize.rel": "nofollow noopener",
+"ext.i18n.trimmed": true
+}
+
+.IP "Description:" 4
+\f[I]jinja\f[]
+\f[I]Policies\f[] to set.
+
+
+.SS jinja.filters
+.IP "Type:" 6
+\f[I]Module\f[]
+
+.IP "Description:" 4
+A Python \f[I]Module\f[] containing custom \f[I]jinja\f[]
+\f[I]filters\f[].
+
+
+.SS jinja.tests
+.IP "Type:" 6
+\f[I]Module\f[]
+
+.IP "Description:" 4
+A Python \f[I]Module\f[] containing custom \f[I]jinja\f[]
+\f[I]tests\f[].
+
+
+.SS globals
+.IP "Type:" 6
+\f[I]Module\f[]
+
+.IP "Description:" 4
+A Python \f[I]Module\f[] whose namespace,
in addition to the \f[I]GLOBALS\f[] dict in
\f[I]util.py\f[],
-gets used as \f[I]globals parameter\f[] for compiled Python expressions.
+is used as \f[I]globals parameter\f[] for compiled Python expressions.
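+
+A sketch of such a module (file name and function are hypothetical);
+after setting \f[I]"globals": "~/.local/share/gdl_utils.py"\f[],
+its names become available in compiled Python expressions:
+
+.. code:: python
+
+# ~/.local/share/gdl_utils.py
+def slugify(value):
+    # replace any non-alphanumeric character with "-"
+    return "".join(c if c.isalnum() else "-" for c in str(value).lower())
+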
.SS cache.file
@@ -8851,6 +9451,28 @@ The list of signal names to ignore, i.e. set
as signal handler for.
+.SS signals-actions
+.IP "Type:" 6
+\f[I]object\f[] (signal -> \f[I]Action(s)\f[])
+
+.IP "Example:" 4
+.. code:: json
+
+{
+"SIGINT" : "flag download = stop",
+"SIGUSR1": [
+"print Received SIGUSR1",
+"exec notify.sh",
+"exit 127"
+]
+}
+
+.IP "Description:" 4
+\f[I]Action(s)\f[] to perform when a
+\f[I]signal\f[]
+is received.
+
+
.SS subconfigs
.IP "Type:" 6
\f[I]list\f[] of \f[I]Path\f[]
@@ -9089,6 +9711,42 @@ it will be randomly chosen with uniform distribution such that \f[I]a <= N <= b\
value (\f[I]"2.85"\f[]) or a range (\f[I]"1.5-3.0"\f[]).
+.SS Module
+.IP "Type:" 6
+.br
+* \f[I]string\f[]
+.br
+* \f[I]Path\f[]
+
+.IP "Example:" 4
+.br
+* "gdl_utils"
+.br
+* "~/.local/share/gdl/"
+.br
+* "~/.local/share/gdl_utils.py"
+
+.IP "Description:" 4
+A Python
+\f[I]Module\f[].
+
+This can be one of
+
+.br
+* the name of an
+\f[I]importable\f[]
+Python module
+.br
+* the \f[I]Path\f[] to a Python
+\f[I]package\f[]
+.br
+* the \f[I]Path\f[] to a .py file
+
+See
+\f[I]Python/Modules\f[]
+for details.
+
+
.SS Path
.IP "Type:" 6
.br
@@ -9114,12 +9772,22 @@ Simple \f[I]tilde expansion\f[]
and \f[I]environment variable expansion\f[]
is supported.
-In Windows environments, backslashes (\f[I]"\\"\f[]) can, in addition to
-forward slashes (\f[I]"/"\f[]), be used as path separators.
-Because backslashes are JSON's escape character,
-they themselves have to be escaped.
-The path \f[I]C:\\path\\to\\file.ext\f[] has therefore to be written as
-\f[I]"C:\\\\path\\\\to\\\\file.ext"\f[] if you want to use backslashes.
+.IP "Note::" 4
+In Windows environments,
+both backslashes \f[I]\\\f[] as well as forward slashes \f[I]/\f[]
+can be used as path separators.
+
+However, since backslashes are JSON's escape character,
+they themselves must be escaped as \f[I]\\\\\f[].
+
+For example, a path like \f[I]C:\\path\\to\\file.ext\f[] has to be specified as
+
+.br
+* \f[I]"C:\\\\path\\\\to\\\\file.ext"\f[] when using backslashes
+.br
+* \f[I]"C:/path/to/file.ext"\f[] when using forward slashes
+
+in a JSON file.
.SS Logging Configuration
@@ -9265,6 +9933,81 @@ Convert Pixiv Ugoira to WebM using \f[I]ffmpeg\f[]
Store files in a ZIP archive
+.SS Action
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Example:" 4
+.br
+* "exit"
+.br
+* "print Hello World"
+.br
+* "raise AbortExtraction an error occured"
+.br
+* "flag file = terminate"
+
+.IP "Description:" 4
+An \f[I]Action\f[] is parsed as Action Type
+followed by (optional) arguments.
+
+It is possible to specify more than one \f[I]action\f[]
+by providing them as a \f[I]list\f[]: \f[I]["<action1>", "<action2>", …]\f[]
+
+Supported Action Types:
+
+\f[I]status\f[]:
+Modify job exit status.
+.br
+Expected syntax is \f[I]<operator> <value>\f[] (e.g. \f[I]= 100\f[]).
+.br
+
+Supported operators are
+\f[I]=\f[] (assignment),
+\f[I]&\f[] (bitwise AND),
+\f[I]|\f[] (bitwise OR),
+\f[I]^\f[] (bitwise XOR).
+\f[I]level\f[]:
+Modify severity level of the current logging message.
+.br
+Can be one of \f[I]debug\f[], \f[I]info\f[], \f[I]warning\f[], \f[I]error\f[] or an integer value.
+.br
+\f[I]print\f[]:
+Write argument to stdout.
+\f[I]exec\f[]:
+Run a shell command.
+\f[I]abort\f[]:
+Stop the current extractor run.
+\f[I]terminate\f[]:
+Stop the current extractor run, including parent extractors.
+\f[I]restart\f[]:
+Restart the current extractor run.
+\f[I]raise\f[]:
+Raise an exception.
+
+This can be an exception defined in
+\f[I]exception.py\f[]
+or a
+\f[I]built-in exception\f[]
+(e.g. \f[I]ZeroDivisionError\f[]).
+\f[I]flag\f[]:
+Set a \f[I]flag\f[].
+
+Expected syntax is \f[I]<flag>[ = <value>]\f[] (e.g. \f[I]post = stop\f[])
+.br
+\f[I]<flag>\f[] can be one of \f[I]file\f[], \f[I]post\f[], \f[I]child\f[], \f[I]download\f[]
+.br
+\f[I]<value>\f[] can be one of \f[I]stop\f[], \f[I]abort\f[], \f[I]terminate\f[], \f[I]restart\f[] (default \f[I]stop\f[])
+.br
+\f[I]wait\f[]:
+Sleep for a given \f[I]Duration\f[] or
+.br
+wait until Enter is pressed when no argument was given.
+.br
+\f[I]exit\f[]:
+Exit the program with the given argument as exit status.
+
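+For example, multiple actions given as a \f[I]list\f[],
+combining the types above (a sketch):
+
+.. code:: json
+
+["print Rate limit reached", "wait 30", "flag download = stop"]
+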
+
.SH BUGS
https://github.com/mikf/gallery-dl/issues
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index eac3390..6541030 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -28,6 +28,7 @@
"retry-codes" : [],
"timeout" : 30.0,
"verify" : true,
+ "truststore" : false,
"download" : true,
"fallback" : true,
@@ -87,6 +88,18 @@
"jfi" : "jpg"
},
+ "category-map": {},
+ "config-map": {
+ "coomerparty" : "coomer",
+ "kemonoparty" : "kemono",
+ "koharu" : "schalenetwork",
+ "chzzk" : "naver-chzzk",
+ "naver" : "naver-blog",
+ "naverwebtoon": "naver-webtoon",
+ "pixiv" : "pixiv-novel"
+ },
+
+
"#": "===============================================================",
"#": "==== Site-specific Extractor Options ====================",
@@ -110,6 +123,7 @@
{
"external" : false,
"max-posts": null,
+ "mviews" : true,
"previews" : false,
"videos" : true,
@@ -170,7 +184,7 @@
},
"bunkr":
{
- "endpoint": "/api/_001",
+ "endpoint": "/api/_001_v2",
"tlds": false
},
"cien":
@@ -185,13 +199,13 @@
"api" : "trpc",
"files" : ["image"],
- "include" : ["user-models", "user-posts"],
+ "include" : ["user-images", "user-videos"],
"metadata": false,
"nsfw" : true,
"quality" : "original=true",
"quality-videos": "quality=100"
},
- "coomerparty":
+ "coomer":
{
"username": "",
"password": "",
@@ -211,6 +225,10 @@
{
"domain": null
},
+ "dankefuerslesen":
+ {
+ "zip": false
+ },
"deviantart":
{
"client-id" : null,
@@ -245,6 +263,18 @@
"subfolders": true
}
},
+ "discord":
+ {
+ "embeds" : ["image", "gifv", "video"],
+ "threads": true,
+ "token" : ""
+ },
+ "dynastyscans":
+ {
+ "anthology": {
+ "metadata": false
+ }
+ },
"exhentai":
{
"username": "",
@@ -255,19 +285,29 @@
"domain" : "auto",
"fav" : null,
"gp" : "resized",
- "limits" : null,
"metadata": false,
"original": true,
"source" : null,
"tags" : false,
+ "limits" : null,
+ "limits-action" : "stop",
"fallback-retries": 2
},
+ "facebook":
+ {
+ "cookies": null,
+
+ "author-followups": false,
+ "include": "photos",
+ "videos" : true
+ },
"fanbox":
{
"cookies" : null,
"comments": false,
"embeds" : true,
+ "fee-max" : null,
"metadata": false
},
"flickr":
@@ -307,6 +347,11 @@
{
"enabled": false
},
+ "girlswithmuscle":
+ {
+ "username": "",
+ "password": ""
+ },
"gofile":
{
"api-token": null,
@@ -374,9 +419,17 @@
"itaku":
{
"sleep-request": "0.5-1.5",
- "videos": true
+ "include": "gallery",
+ "videos" : true
},
- "kemonoparty":
+ "iwara":
+ {
+ "username": "",
+ "password": "",
+
+ "include": ["user-images", "user-images"]
+ },
+ "kemono":
{
"username": "",
"password": "",
@@ -399,20 +452,15 @@
"covers": false,
"format": "mp3"
},
- "koharu":
- {
- "username": "",
- "password": "",
- "sleep-request": "0.5-1.5",
-
- "cbz" : true,
- "format": ["0", "1600", "1280", "980", "780"],
- "tags" : false
- },
"luscious":
{
"gif": false
},
+ "madokami":
+ {
+ "username": "",
+ "password": ""
+ },
"mangadex":
{
"client-id" : "",
@@ -430,10 +478,14 @@
"username": "",
"password": ""
},
- "naver":
+ "naver-blog":
{
"videos": true
},
+ "naver-chzzk":
+ {
+ "offset": 0
+ },
"newgrounds":
{
"username": "",
@@ -463,8 +515,13 @@
{
"cookies": null,
+ "cursor" : true,
"files" : ["images", "image_large", "attachments", "postfile", "content"],
- "format-images": "download_url"
+ "format-images": "download_url",
+
+ "user": {
+ "date-max" : 0
+ }
},
"pexels":
{
@@ -504,7 +561,17 @@
"metadata-bookmark": false,
"sanity" : true,
"tags" : "japanese",
- "ugoira" : true,
+ "ugoira" : true
+ },
+ "pixiv-novel":
+ {
+ "refresh-token": null,
+
+ "comments" : false,
+ "max-posts": null,
+ "metadata" : false,
+ "metadata-bookmark": false,
+ "tags" : "japanese",
"covers" : false,
"embeds" : false,
@@ -555,6 +622,9 @@
},
"rule34xyz":
{
+ "username": "",
+ "password": "",
+
"format": ["10", "40", "41", "2"]
},
"sankaku":
@@ -562,7 +632,6 @@
"username": "",
"password": "",
- "id-format": "numeric",
"refresh" : false,
"tags" : false
},
@@ -571,12 +640,26 @@
"embeds": false,
"videos": true
},
+ "schalenetwork":
+ {
+ "username": "",
+ "password": "",
+ "sleep-request": "0.5-1.5",
+
+ "cbz" : true,
+ "format": ["0", "1600", "1280", "980", "780"],
+ "tags" : false
+ },
"scrolller":
{
"username": "",
"password": "",
"sleep-request": "0.5-1.5"
},
+ "sexcom":
+ {
+ "gifs": true
+ },
"skeb":
{
"article" : false,
@@ -755,7 +838,9 @@
{
"sleep-request": "0.5-1.5",
- "quality": "original"
+ "quality" : "original",
+ "banners" : false,
+ "thumbnails": false
},
"weebcentral":
{
@@ -780,9 +865,11 @@
{
"cmdline-args": null,
"config-file" : null,
+ "deprecations": false,
"enabled" : false,
"format" : null,
"generic" : true,
+ "generic-category": true,
"logging" : true,
"module" : null,
"raw-options" : null
@@ -889,6 +976,7 @@
"misskey":
{
"access-token": null,
+ "include" : ["notes"],
"renotes" : false,
"replies" : true
},
@@ -987,13 +1075,15 @@
"headers" : null,
"retry-codes" : [],
"sleep-429" : 60.0,
- "validate" : true
+ "validate" : true,
+ "validate-html" : true
},
"ytdl":
{
"cmdline-args" : null,
"config-file" : null,
+ "deprecations" : false,
"enabled" : true,
"format" : null,
"forward-cookies": true,
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index c022f84..550241f 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.4
Name: gallery_dl
-Version: 1.29.7
+Version: 1.30.2
Summary: Command-line program to download image galleries and collections from several image hosting sites
Home-page: https://github.com/mikf/gallery-dl
Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -18,10 +18,6 @@ Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.4
-Classifier: Programming Language :: Python :: 3.5
-Classifier: Programming Language :: Python :: 3.6
-Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
@@ -33,11 +29,18 @@ Classifier: Programming Language :: Python :: Implementation :: PyPy
Classifier: Topic :: Internet :: WWW/HTTP
Classifier: Topic :: Multimedia :: Graphics
Classifier: Topic :: Utilities
-Requires-Python: >=3.4
+Requires-Python: >=3.8
License-File: LICENSE
Requires-Dist: requests>=2.11.0
Provides-Extra: video
-Requires-Dist: youtube-dl; extra == "video"
+Requires-Dist: yt-dlp; extra == "video"
+Provides-Extra: extra
+Requires-Dist: requests[socks]; extra == "extra"
+Requires-Dist: yt-dlp[default]; extra == "extra"
+Requires-Dist: pyyaml; extra == "extra"
+Requires-Dist: toml; python_version < "3.11" and extra == "extra"
+Requires-Dist: truststore; python_version >= "3.10" and extra == "extra"
+Requires-Dist: secretstorage; sys_platform == "linux" and extra == "extra"
Dynamic: author
Dynamic: author-email
Dynamic: classifier
@@ -75,7 +78,7 @@ and powerful `filenaming capabilities <https://gdl-org.github.io/docs/formatting
Dependencies
============
-- Python_ 3.4+
+- Python_ 3.8+
- Requests_
Optional
@@ -91,6 +94,8 @@ Optional
- toml_: TOML configuration file support for Python<3.11
- SecretStorage_: GNOME keyring passwords for ``--cookies-from-browser``
- Psycopg_: PostgreSQL archive support
+- truststore_: Native system certificate support
+- Jinja_: Jinja template support
Installation
@@ -133,9 +138,9 @@ Standalone Executable
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.29.7/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.30.2/gallery-dl.exe>`__
(Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.29.7/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.30.2/gallery-dl.bin>`__
Nightly Builds
@@ -517,7 +522,7 @@ To authenticate with a ``mastodon`` instance, run *gallery-dl* with
.. _Python: https://www.python.org/downloads/
.. _PyPI: https://pypi.org/
.. _pip: https://pip.pypa.io/en/stable/
-.. _Requests: https://requests.readthedocs.io/en/master/
+.. _Requests: https://requests.readthedocs.io/en/latest/
.. _FFmpeg: https://www.ffmpeg.org/
.. _mkvmerge: https://www.matroska.org/downloads/mkvtoolnix.html
.. _yt-dlp: https://github.com/yt-dlp/yt-dlp
@@ -530,10 +535,12 @@ To authenticate with a ``mastodon`` instance, run *gallery-dl* with
.. _toml: https://pypi.org/project/toml/
.. _SecretStorage: https://pypi.org/project/SecretStorage/
.. _Psycopg: https://www.psycopg.org/
+.. _truststore: https://truststore.readthedocs.io/en/latest/
+.. _Jinja: https://jinja.palletsprojects.com/
.. _Snapd: https://docs.snapcraft.io/installing-snapd
.. _OAuth: https://en.wikipedia.org/wiki/OAuth
.. _Chocolatey: https://chocolatey.org/install
-.. _Scoop: https://scoop.sh
+.. _Scoop: https://scoop.sh/
.. |pypi| image:: https://img.shields.io/pypi/v/gallery-dl.svg
:target: https://pypi.org/project/gallery-dl/
diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt
index 5dc17bd..8ae28f6 100644
--- a/gallery_dl.egg-info/SOURCES.txt
+++ b/gallery_dl.egg-info/SOURCES.txt
@@ -76,10 +76,12 @@ gallery_dl/extractor/catbox.py
gallery_dl/extractor/chevereto.py
gallery_dl/extractor/cien.py
gallery_dl/extractor/civitai.py
+gallery_dl/extractor/comick.py
gallery_dl/extractor/comicvine.py
gallery_dl/extractor/common.py
gallery_dl/extractor/cyberdrop.py
gallery_dl/extractor/danbooru.py
+gallery_dl/extractor/dankefuerslesen.py
gallery_dl/extractor/desktopography.py
gallery_dl/extractor/deviantart.py
gallery_dl/extractor/directlink.py
@@ -104,6 +106,8 @@ gallery_dl/extractor/gelbooru.py
gallery_dl/extractor/gelbooru_v01.py
gallery_dl/extractor/gelbooru_v02.py
gallery_dl/extractor/generic.py
+gallery_dl/extractor/girlsreleased.py
+gallery_dl/extractor/girlswithmuscle.py
gallery_dl/extractor/gofile.py
gallery_dl/extractor/hatenablog.py
gallery_dl/extractor/hentai2read.py
@@ -130,13 +134,14 @@ gallery_dl/extractor/instagram.py
gallery_dl/extractor/issuu.py
gallery_dl/extractor/itaku.py
gallery_dl/extractor/itchio.py
+gallery_dl/extractor/iwara.py
gallery_dl/extractor/jschan.py
gallery_dl/extractor/kabeuchi.py
gallery_dl/extractor/keenspot.py
-gallery_dl/extractor/kemonoparty.py
+gallery_dl/extractor/kemono.py
gallery_dl/extractor/khinsider.py
-gallery_dl/extractor/koharu.py
gallery_dl/extractor/komikcast.py
+gallery_dl/extractor/leakgallery.py
gallery_dl/extractor/lensdump.py
gallery_dl/extractor/lexica.py
gallery_dl/extractor/lightroom.py
@@ -145,13 +150,13 @@ gallery_dl/extractor/lofter.py
gallery_dl/extractor/lolisafe.py
gallery_dl/extractor/luscious.py
gallery_dl/extractor/lynxchan.py
+gallery_dl/extractor/madokami.py
gallery_dl/extractor/mangadex.py
gallery_dl/extractor/mangafox.py
gallery_dl/extractor/mangahere.py
gallery_dl/extractor/manganelo.py
gallery_dl/extractor/mangapark.py
gallery_dl/extractor/mangaread.py
-gallery_dl/extractor/mangasee.py
gallery_dl/extractor/mangoxo.py
gallery_dl/extractor/mastodon.py
gallery_dl/extractor/message.py
@@ -160,7 +165,8 @@ gallery_dl/extractor/moebooru.py
gallery_dl/extractor/motherless.py
gallery_dl/extractor/myhentaigallery.py
gallery_dl/extractor/myportfolio.py
-gallery_dl/extractor/naver.py
+gallery_dl/extractor/naverblog.py
+gallery_dl/extractor/naverchzzk.py
gallery_dl/extractor/naverwebtoon.py
gallery_dl/extractor/nekohouse.py
gallery_dl/extractor/newgrounds.py
@@ -170,6 +176,7 @@ gallery_dl/extractor/nitter.py
gallery_dl/extractor/noop.py
gallery_dl/extractor/nozomi.py
gallery_dl/extractor/nsfwalbum.py
+gallery_dl/extractor/nudostar.py
gallery_dl/extractor/oauth.py
gallery_dl/extractor/paheal.py
gallery_dl/extractor/patreon.py
@@ -190,10 +197,12 @@ gallery_dl/extractor/poringa.py
gallery_dl/extractor/pornhub.py
gallery_dl/extractor/pornpics.py
gallery_dl/extractor/postmill.py
+gallery_dl/extractor/rawkuma.py
gallery_dl/extractor/reactor.py
gallery_dl/extractor/readcomiconline.py
gallery_dl/extractor/realbooru.py
gallery_dl/extractor/recursive.py
+gallery_dl/extractor/redbust.py
gallery_dl/extractor/reddit.py
gallery_dl/extractor/redgifs.py
gallery_dl/extractor/rule34us.py
@@ -202,6 +211,7 @@ gallery_dl/extractor/rule34xyz.py
gallery_dl/extractor/saint.py
gallery_dl/extractor/sankaku.py
gallery_dl/extractor/sankakucomplex.py
+gallery_dl/extractor/schalenetwork.py
gallery_dl/extractor/scrolller.py
gallery_dl/extractor/seiga.py
gallery_dl/extractor/senmanga.py
diff --git a/gallery_dl.egg-info/requires.txt b/gallery_dl.egg-info/requires.txt
index 44dd863..531a762 100644
--- a/gallery_dl.egg-info/requires.txt
+++ b/gallery_dl.egg-info/requires.txt
@@ -1,4 +1,18 @@
requests>=2.11.0
+[extra]
+requests[socks]
+yt-dlp[default]
+pyyaml
+
+[extra:python_version < "3.11"]
+toml
+
+[extra:python_version >= "3.10"]
+truststore
+
+[extra:sys_platform == "linux"]
+secretstorage
+
[video]
-youtube-dl
+yt-dlp
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py
index ec882c3..9ab61e5 100644
--- a/gallery_dl/__init__.py
+++ b/gallery_dl/__init__.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2023 Mike Fährmann
+# Copyright 2014-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -11,7 +11,7 @@ import logging
from . import version, config, option, output, extractor, job, util, exception
__author__ = "Mike Fährmann"
-__copyright__ = "Copyright 2014-2023 Mike Fährmann"
+__copyright__ = "Copyright 2014-2025 Mike Fährmann"
__license__ = "GPLv2"
__maintainer__ = "Mike Fährmann"
__email__ = "mike_faehrmann@web.de"
@@ -78,8 +78,7 @@ def main():
output.configure_standard_streams()
# signals
- signals = config.get((), "signals-ignore")
- if signals:
+ if signals := config.get((), "signals-ignore"):
import signal
if isinstance(signals, str):
signals = signals.split(",")
@@ -90,6 +89,10 @@ def main():
else:
signal.signal(signal_num, signal.SIG_IGN)
+ if signals := config.get((), "signals-actions"):
+ from . import actions
+ actions.parse_signals(signals)
+
# enable ANSI escape sequences on Windows
if util.WINDOWS and config.get(("output",), "ansi", output.COLORS):
from ctypes import windll, wintypes, byref
@@ -118,14 +121,12 @@ def main():
util.compile_expression = util.compile_expression_defaultdict
# format string separator
- separator = config.get((), "format-separator")
- if separator:
+ if separator := config.get((), "format-separator"):
from . import formatter
formatter._SEPARATOR = separator
# eval globals
- path = config.get((), "globals")
- if path:
+ if path := config.get((), "globals"):
util.GLOBALS.update(util.import_file(path).__dict__)
# loglevels
@@ -137,13 +138,12 @@ def main():
import platform
import requests
- extra = ""
if util.EXECUTABLE:
- extra = " - Executable ({})".format(version.__variant__)
+ extra = f" - Executable ({version.__variant__})"
+ elif git_head := util.git_head():
+ extra = " - Git HEAD: " + git_head
else:
- git_head = util.git_head()
- if git_head:
- extra = " - Git HEAD: " + git_head
+ extra = ""
log.debug("Version %s%s", __version__, extra)
log.debug("Python %s - %s",
@@ -157,10 +157,40 @@ def main():
log.debug("Configuration Files %s", config._files)
+ if args.clear_cache:
+ from . import cache
+ log = logging.getLogger("cache")
+ cnt = cache.clear(args.clear_cache)
+
+ if cnt is None:
+ log.error("Database file not available")
+ return 1
+
+ log.info("Deleted %d entr%s from '%s'",
+ cnt, "y" if cnt == 1 else "ies", cache._path())
+ return 0
+
+ if args.config:
+ if args.config == "init":
+ return config.initialize()
+ elif args.config == "status":
+ return config.status()
+ else:
+ return config.open_extern()
+
if args.print_traffic:
import requests
requests.packages.urllib3.connection.HTTPConnection.debuglevel = 1
+ if args.update:
+ from . import update
+ extr = update.UpdateExtractor.from_url("update:" + args.update)
+ ujob = update.UpdateJob(extr)
+ return ujob.run()
+
+ # category renaming
+ config.remap_categories()
+
# extractor modules
modules = config.get(("extractor",), "modules")
if modules is not None:
@@ -199,13 +229,7 @@ def main():
else:
extractor._module_iter = iter(modules[0])
- if args.update:
- from . import update
- extr = update.UpdateExtractor.from_url("update:" + args.update)
- ujob = update.UpdateJob(extr)
- return ujob.run()
-
- elif args.list_modules:
+ if args.list_modules:
extractor.modules.append("")
sys.stdout.write("\n".join(extractor.modules))
@@ -228,31 +252,8 @@ def main():
extr.example,
))
- elif args.clear_cache:
- from . import cache
- log = logging.getLogger("cache")
- cnt = cache.clear(args.clear_cache)
-
- if cnt is None:
- log.error("Database file not available")
- return 1
- else:
- log.info(
- "Deleted %d %s from '%s'",
- cnt, "entry" if cnt == 1 else "entries", cache._path(),
- )
-
- elif args.config:
- if args.config == "init":
- return config.initialize()
- elif args.config == "status":
- return config.status()
- else:
- return config.open_extern()
-
else:
- input_files = config.get((), "input-files")
- if input_files:
+ if input_files := config.get((), "input-files"):
for input_file in input_files:
if isinstance(input_file, str):
input_file = (input_file, None)
@@ -271,8 +272,7 @@ def main():
jobtype = job.UrlJob
jobtype.maxdepth = args.list_urls
if config.get(("output",), "fallback", True):
- jobtype.handle_url = \
- staticmethod(jobtype.handle_url_fallback)
+ jobtype.handle_url = jobtype.handle_url_fallback
elif args.dump_json:
jobtype = job.DataJob
jobtype.resolve = args.dump_json - 1
@@ -283,17 +283,15 @@ def main():
input_manager.log = input_log = logging.getLogger("inputfile")
# unsupported file logging handler
- handler = output.setup_logging_handler(
- "unsupportedfile", fmt="{message}")
- if handler:
+ if handler := output.setup_logging_handler(
+ "unsupportedfile", fmt="{message}"):
ulog = job.Job.ulog = logging.getLogger("unsupported")
ulog.addHandler(handler)
ulog.propagate = False
# error file logging handler
- handler = output.setup_logging_handler(
- "errorfile", fmt="{message}", mode="a")
- if handler:
+ if handler := output.setup_logging_handler(
+ "errorfile", fmt="{message}", mode="a"):
elog = input_manager.err = logging.getLogger("errorfile")
elog.addHandler(handler)
elog.propagate = False
@@ -315,6 +313,24 @@ def main():
args.loglevel < logging.ERROR:
input_manager.progress(pformat)
+ if catmap := config.interpolate(("extractor",), "category-map"):
+ if catmap == "compat":
+ catmap = {
+ "coomer" : "coomerparty",
+ "kemono" : "kemonoparty",
+ "schalenetwork": "koharu",
+ "naver-blog" : "naver",
+ "naver-chzzk" : "chzzk",
+ "naver-webtoon": "naverwebtoon",
+ "pixiv-novel" : "pixiv",
+ "pixiv-novel:novel" : ("pixiv", "novel"),
+ "pixiv-novel:user" : ("pixiv", "novel-user"),
+ "pixiv-novel:series" : ("pixiv", "novel-series"),
+ "pixiv-novel:bookmark": ("pixiv", "novel-bookmark"),
+ }
+ from .extractor import common
+ common.CATEGORY_MAP = catmap
+
# process input URLs
retval = 0
for url in input_manager:
@@ -335,13 +351,11 @@ def main():
else:
input_manager.success()
- except exception.StopExtraction:
- pass
- except exception.TerminateExtraction:
- pass
except exception.RestartExtraction:
log.debug("Restarting '%s'", url)
continue
+ except exception.ControlException:
+ pass
except exception.NoExtractorError:
log.error("Unsupported URL '%s'", url)
retval |= 64
@@ -462,16 +476,15 @@ class InputManager():
key, sep, value = line.partition("=")
if not sep:
raise exception.InputFileError(
- "Invalid KEY=VALUE pair '%s' on line %s in %s",
- line, n+1, path)
+ f"Invalid KEY=VALUE pair '{line}' "
+ f"on line {n+1} in {path}")
try:
value = util.json_loads(value.strip())
except ValueError as exc:
self.log.debug("%s: %s", exc.__class__.__name__, exc)
raise exception.InputFileError(
- "Unable to parse '%s' on line %s in %s",
- value, n+1, path)
+ f"Unable to parse '{value}' on line {n+1} in {path}")
key = key.strip().split(".")
conf.append((key[:-1], key[-1], value))
@@ -480,8 +493,7 @@ class InputManager():
# url
if " #" in line or "\t#" in line:
if strip_comment is None:
- import re
- strip_comment = re.compile(r"\s+#.*").sub
+ strip_comment = util.re(r"\s+#.*").sub
line = strip_comment("", line)
if gconf or lconf:
url = ExtendedUrl(line, gconf, lconf)
@@ -536,13 +548,11 @@ class InputManager():
"Unable to update '%s' (%s: %s)",
path, exc.__class__.__name__, exc)
- @staticmethod
- def _action_comment(lines, indicies):
+ def _action_comment(self, lines, indicies):
for i in indicies:
lines[i] = "# " + lines[i]
- @staticmethod
- def _action_delete(lines, indicies):
+ def _action_delete(self, lines, indicies):
for i in indicies:
lines[i] = ""
diff --git a/gallery_dl/actions.py b/gallery_dl/actions.py
index 668032d..971c4d9 100644
--- a/gallery_dl/actions.py
+++ b/gallery_dl/actions.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2023 Mike Fährmann
+# Copyright 2023-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,6 @@
""" """
-import re
import time
import logging
import operator
@@ -16,7 +15,7 @@ import functools
from . import util, exception
-def parse(actionspec):
+def parse_logging(actionspec):
if isinstance(actionspec, dict):
actionspec = actionspec.items()
@@ -32,7 +31,7 @@ def parse(actionspec):
for event, spec in actionspec:
level, _, pattern = event.partition(":")
- search = re.compile(pattern).search if pattern else util.true
+ search = util.re(pattern).search if pattern else util.true
if isinstance(spec, str):
type, _, args = spec.partition(" ")
@@ -74,6 +73,41 @@ def parse(actionspec):
return actions
+def parse_signals(actionspec):
+ import signal
+
+ if isinstance(actionspec, dict):
+ actionspec = actionspec.items()
+
+ for signal_name, spec in actionspec:
+ signal_num = getattr(signal, signal_name, None)
+ if signal_num is None:
+ log = logging.getLogger("gallery-dl")
+ log.warning("signal '%s' is not defined", signal_name)
+ continue
+
+ if isinstance(spec, str):
+ type, _, args = spec.partition(" ")
+ before, after = ACTIONS[type](args)
+ action = before if after is None else after
+ else:
+ actions_before = []
+ actions_after = []
+ for s in spec:
+ type, _, args = s.partition(" ")
+ before, after = ACTIONS[type](args)
+ if before is not None:
+ actions_before.append(before)
+ if after is not None:
+ actions_after.append(after)
+
+ actions = actions_before
+ actions.extend(actions_after)
+ action = _chain_actions(actions)
+
+ signal.signal(signal_num, signals_handler(action))
+
+
class LoggerAdapter():
def __init__(self, logger, job):
@@ -129,6 +163,12 @@ def _chain_actions(actions):
return _chain
+def signals_handler(action, args={}):
+ def handler(signal_num, frame):
+ action(args)
+ return handler
+
+
# --------------------------------------------------------------------
def action_print(opts):
@@ -138,7 +178,7 @@ def action_print(opts):
def action_status(opts):
- op, value = re.match(r"\s*([&|^=])=?\s*(\d+)", opts).groups()
+ op, value = util.re(r"\s*([&|^=])=?\s*(\d+)").match(opts).groups()
op = {
"&": operator.and_,
@@ -181,6 +221,36 @@ def action_wait(opts):
return None, _wait
+def action_flag(opts):
+ flag, value = util.re(
+ r"(?i)(file|post|child|download)(?:\s*[= ]\s*(.+))?"
+ ).match(opts).groups()
+ flag = flag.upper()
+ value = "stop" if value is None else value.lower()
+
+ def _flag(args):
+ util.FLAGS.__dict__[flag] = value
+ return _flag, None
+
+
+def action_raise(opts):
+ name, _, arg = opts.partition(" ")
+
+ exc = getattr(exception, name, None)
+ if exc is None:
+ import builtins
+ exc = getattr(builtins, name, Exception)
+
+ if arg:
+ def _raise(args):
+ raise exc(arg)
+ else:
+ def _raise(args):
+ raise exc()
+
+ return None, _raise
+
+
def action_abort(opts):
return None, util.raises(exception.StopExtraction)
@@ -208,8 +278,10 @@ ACTIONS = {
"abort" : action_abort,
"exec" : action_exec,
"exit" : action_exit,
+ "flag" : action_flag,
"level" : action_level,
"print" : action_print,
+ "raise" : action_raise,
"restart" : action_restart,
"status" : action_status,
"terminate": action_terminate,
diff --git a/gallery_dl/aes.py b/gallery_dl/aes.py
index 3fd1d5e..c671de9 100644
--- a/gallery_dl/aes.py
+++ b/gallery_dl/aes.py
@@ -58,7 +58,7 @@ bytes_to_intlist = list
def intlist_to_bytes(xs):
if not xs:
return b""
- return struct.pack("%dB" % len(xs), *xs)
+ return struct.pack(f"{len(xs)}B", *xs)
def unpad_pkcs7(data):
@@ -615,7 +615,7 @@ def block_product(block_x, block_y):
if len(block_x) != BLOCK_SIZE_BYTES or len(block_y) != BLOCK_SIZE_BYTES:
raise ValueError(
- "Length of blocks need to be %d bytes" % BLOCK_SIZE_BYTES)
+ f"Length of blocks need to be {BLOCK_SIZE_BYTES} bytes")
block_r = [0xE1] + [0] * (BLOCK_SIZE_BYTES - 1)
block_v = block_y[:]
@@ -639,7 +639,7 @@ def ghash(subkey, data):
if len(data) % BLOCK_SIZE_BYTES:
raise ValueError(
- "Length of data should be %d bytes" % BLOCK_SIZE_BYTES)
+ f"Length of data should be {BLOCK_SIZE_BYTES} bytes")
last_y = [0] * BLOCK_SIZE_BYTES
for i in range(0, len(data), BLOCK_SIZE_BYTES):
diff --git a/gallery_dl/cache.py b/gallery_dl/cache.py
index 923ed32..b04552e 100644
--- a/gallery_dl/cache.py
+++ b/gallery_dl/cache.py
@@ -73,7 +73,7 @@ class DatabaseCacheDecorator():
_init = True
def __init__(self, func, keyarg, maxage):
- self.key = "%s.%s" % (func.__module__, func.__name__)
+ self.key = f"{func.__module__}.{func.__name__}"
self.func = func
self.cache = {}
self.keyarg = keyarg
@@ -95,7 +95,7 @@ class DatabaseCacheDecorator():
pass
# database lookup
- fullkey = "%s-%s" % (self.key, key)
+ fullkey = f"{self.key}-{key}"
with self.database() as db:
cursor = db.cursor()
try:
@@ -128,7 +128,7 @@ class DatabaseCacheDecorator():
with self.database() as db:
db.execute(
"INSERT OR REPLACE INTO data VALUES (?,?,?)",
- ("%s-%s" % (self.key, key), pickle.dumps(value), expires),
+ (f"{self.key}-{key}", pickle.dumps(value), expires),
)
def invalidate(self, key):
@@ -139,7 +139,7 @@ class DatabaseCacheDecorator():
with self.database() as db:
db.execute(
"DELETE FROM data WHERE key=?",
- ("%s-%s" % (self.key, key),),
+ (f"{self.key}-{key}",),
)
def database(self):
diff --git a/gallery_dl/config.py b/gallery_dl/config.py
index 92e55d3..1873634 100644
--- a/gallery_dl/config.py
+++ b/gallery_dl/config.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2023 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -103,14 +103,12 @@ def open_extern():
openers = ("explorer", "notepad")
else:
openers = ("xdg-open", "open")
- editor = os.environ.get("EDITOR")
- if editor:
+ if editor := os.environ.get("EDITOR"):
openers = (editor,) + openers
import shutil
for opener in openers:
- opener = shutil.which(opener)
- if opener:
+ if opener := shutil.which(opener):
break
else:
log.warning("Unable to find a program to open '%s' with", path)
@@ -155,13 +153,38 @@ def status():
paths.append((path, status))
- fmt = "{{:<{}}} : {{}}\n".format(
- max(len(p[0]) for p in paths)).format
+ fmt = f"{{:<{max(len(p[0]) for p in paths)}}} : {{}}\n".format
for path, status in paths:
stdout_write(fmt(path, status))
+def remap_categories():
+ opts = _config.get("extractor")
+ if not opts:
+ return
+
+ cmap = opts.get("config-map")
+ if cmap is None:
+ cmap = (
+ ("coomerparty" , "coomer"),
+ ("kemonoparty" , "kemono"),
+ ("koharu" , "schalenetwork"),
+ ("naver" , "naver-blog"),
+ ("chzzk" , "naver-chzzk"),
+ ("naverwebtoon", "naver-webtoon"),
+ ("pixiv" , "pixiv-novel"),
+ )
+ elif not cmap:
+ return
+ elif isinstance(cmap, dict):
+ cmap = cmap.items()
+
+ for old, new in cmap:
+ if old in opts and new not in opts:
+ opts[new] = opts[old]
+
+
def load(files=None, strict=False, loads=util.json_loads):
"""Load JSON configuration files"""
for pathfmt in files or _default_configs:
@@ -186,8 +209,7 @@ def load(files=None, strict=False, loads=util.json_loads):
_files.append(pathfmt)
if "subconfigs" in conf:
- subconfigs = conf["subconfigs"]
- if subconfigs:
+ if subconfigs := conf["subconfigs"]:
if isinstance(subconfigs, str):
subconfigs = (subconfigs,)
load(subconfigs, strict, loads)
@@ -259,8 +281,7 @@ def accumulate(path, key, conf=_config):
result = []
try:
if key in conf:
- value = conf[key]
- if value:
+ if value := conf[key]:
if isinstance(value, list):
result.extend(value)
else:
@@ -268,8 +289,7 @@ def accumulate(path, key, conf=_config):
for p in path:
conf = conf[p]
if key in conf:
- value = conf[key]
- if value:
+ if value := conf[key]:
if isinstance(value, list):
result[:0] = value
else:
diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py
index f03ad58..5d6c3d7 100644
--- a/gallery_dl/cookies.py
+++ b/gallery_dl/cookies.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2022-2023 Mike Fährmann
+# Copyright 2022-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -26,7 +26,7 @@ from . import aes, text, util
SUPPORTED_BROWSERS_CHROMIUM = {
"brave", "chrome", "chromium", "edge", "opera", "thorium", "vivaldi"}
-SUPPORTED_BROWSERS_FIREFOX = {"firefox", "zen"}
+SUPPORTED_BROWSERS_FIREFOX = {"firefox", "librewolf", "zen"}
SUPPORTED_BROWSERS = \
SUPPORTED_BROWSERS_CHROMIUM | SUPPORTED_BROWSERS_FIREFOX | {"safari"}
@@ -43,7 +43,7 @@ def load_cookies(browser_specification):
elif browser_name in SUPPORTED_BROWSERS_CHROMIUM:
return load_cookies_chromium(browser_name, profile, keyring, domain)
else:
- raise ValueError("unknown browser '{}'".format(browser_name))
+ raise ValueError(f"unknown browser '{browser_name}'")
def load_cookies_firefox(browser_name, profile=None,
@@ -59,7 +59,7 @@ def load_cookies_firefox(browser_name, profile=None,
if container_id is False:
conditions.append("NOT INSTR(originAttributes,'userContextId=')")
elif container_id:
- uid = "%userContextId={}".format(container_id)
+ uid = f"%userContextId={container_id}"
conditions.append("originAttributes LIKE ? OR originAttributes LIKE ?")
parameters += (uid, uid + "&%")
@@ -72,7 +72,7 @@ def load_cookies_firefox(browser_name, profile=None,
parameters += (domain, "." + domain)
if conditions:
- sql = "{} WHERE ( {} )".format(sql, " ) AND ( ".join(conditions))
+ sql = f"{sql} WHERE ( {' ) AND ( '.join(conditions)} )"
with DatabaseConnection(path) as db:
cookies = [
@@ -186,7 +186,7 @@ def load_cookies_chromium(browser_name, profile=None,
))
if failed_cookies > 0:
- failed_message = " ({} could not be decrypted)".format(failed_cookies)
+ failed_message = f" ({failed_cookies} could not be decrypted)"
else:
failed_message = ""
@@ -212,8 +212,9 @@ def _firefox_cookies_database(browser_name, profile=None, container=None):
path = _find_most_recently_used_file(search_root, "cookies.sqlite")
if path is None:
- raise FileNotFoundError("Unable to find Firefox cookies database in "
- "{}".format(search_root))
+ raise FileNotFoundError(f"Unable to find {browser_name.capitalize()} "
+ f"cookies database in {search_root}")
+
_log_debug("Extracting cookies from %s", path)
if not container or container == "none":
@@ -243,8 +244,7 @@ def _firefox_cookies_database(browser_name, profile=None, container=None):
container_id = context["userContextId"]
break
else:
- raise ValueError("Unable to find Firefox container '{}'".format(
- container))
+ raise ValueError(f"Unable to find Firefox container '{container}'")
_log_debug("Only loading cookies from container '%s' (ID %s)",
container, container_id)
@@ -257,20 +257,23 @@ def _firefox_browser_directory(browser_name):
if sys.platform in ("win32", "cygwin"):
appdata = os.path.expandvars("%APPDATA%")
return {
- "firefox": join(appdata, R"Mozilla\Firefox\Profiles"),
- "zen" : join(appdata, R"zen\Profiles")
+ "firefox" : join(appdata, R"Mozilla\Firefox\Profiles"),
+ "librewolf": join(appdata, R"librewolf\Profiles"),
+ "zen" : join(appdata, R"zen\Profiles"),
}[browser_name]
elif sys.platform == "darwin":
appdata = os.path.expanduser("~/Library/Application Support")
return {
- "firefox": join(appdata, R"Firefox/Profiles"),
- "zen" : join(appdata, R"zen/Profiles")
+ "firefox" : join(appdata, R"Firefox/Profiles"),
+ "librewolf": join(appdata, R"librewolf/Profiles"),
+ "zen" : join(appdata, R"zen/Profiles"),
}[browser_name]
else:
home = os.path.expanduser("~")
return {
- "firefox": join(home, R".mozilla/firefox"),
- "zen" : join(home, R".zen")
+ "firefox" : join(home, R".mozilla/firefox"),
+ "librewolf": join(home, R".librewolf"),
+ "zen" : join(home, R".zen"),
}[browser_name]
@@ -386,8 +389,8 @@ def _chromium_cookies_database(profile, config):
path = _find_most_recently_used_file(search_root, "Cookies")
if path is None:
- raise FileNotFoundError("Unable to find {} cookies database in "
- "'{}'".format(config["browser"], search_root))
+ raise FileNotFoundError(f"Unable to find {config['browser']} cookies "
+ f"database in '{search_root}'")
return path
@@ -519,8 +522,7 @@ class LinuxChromiumCookieDecryptor(ChromiumCookieDecryptor):
self._cookie_counts = {"v10": 0, "v11": 0, "other": 0}
self._offset = (32 if meta_version >= 24 else 0)
- @staticmethod
- def derive_key(password):
+ def derive_key(self, password):
# values from
# https://chromium.googlesource.com/chromium/src/+/refs/heads
# /main/components/os_crypt/os_crypt_linux.cc
@@ -564,8 +566,7 @@ class MacChromiumCookieDecryptor(ChromiumCookieDecryptor):
self._cookie_counts = {"v10": 0, "other": 0}
self._offset = (32 if meta_version >= 24 else 0)
- @staticmethod
- def derive_key(password):
+ def derive_key(self, password):
# values from
# https://chromium.googlesource.com/chromium/src/+/refs/heads
# /main/components/os_crypt/os_crypt_mac.mm
@@ -713,9 +714,9 @@ def _get_kwallet_password(browser_keyring_name):
)
if proc.returncode != 0:
- _log_error("kwallet-query failed with return code {}. "
- "Please consult the kwallet-query man page "
- "for details".format(proc.returncode))
+ _log_error(f"kwallet-query failed with return code "
+ f"{proc.returncode}. Please consult the kwallet-query "
+ f"man page for details")
return b""
if stdout.lower().startswith(b"failed to read"):
@@ -844,7 +845,7 @@ class DataParser:
def read_bytes(self, num_bytes):
if num_bytes < 0:
- raise ParserError("invalid read of {} bytes".format(num_bytes))
+ raise ParserError(f"invalid read of {num_bytes} bytes")
end = self.cursor + num_bytes
if end > len(self._data):
raise ParserError("reached end of input")
@@ -855,8 +856,8 @@ class DataParser:
def expect_bytes(self, expected_value, message):
value = self.read_bytes(len(expected_value))
if value != expected_value:
- raise ParserError("unexpected value: {} != {} ({})".format(
- value, expected_value, message))
+ raise ParserError(f"unexpected value: {value} != {expected_value} "
+ f"({message})")
def read_uint(self, big_endian=False):
data_format = ">I" if big_endian else "<I"
@@ -877,10 +878,10 @@ class DataParser:
def skip(self, num_bytes, description="unknown"):
if num_bytes > 0:
- _log_debug("Skipping {} bytes ({}): {!r}".format(
- num_bytes, description, self.read_bytes(num_bytes)))
+ _log_debug(f"Skipping {num_bytes} bytes ({description}): "
+ f"{self.read_bytes(num_bytes)!r}")
elif num_bytes < 0:
- raise ParserError("Invalid skip of {} bytes".format(num_bytes))
+ raise ParserError(f"Invalid skip of {num_bytes} bytes")
def skip_to(self, offset, description="unknown"):
self.skip(offset - self.cursor, description)
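
The DataParser hunks above all follow one cursor-over-bytes pattern; a
condensed, runnable sketch (bodies mirror the patched lines, with the
library's ParserError simplified to ValueError):

    import struct

    class MiniParser:
        def __init__(self, data):
            self._data = data
            self.cursor = 0

        def read_bytes(self, num_bytes):
            if num_bytes < 0:
                raise ValueError(f"invalid read of {num_bytes} bytes")
            end = self.cursor + num_bytes
            if end > len(self._data):
                raise ValueError("reached end of input")
            data, self.cursor = self._data[self.cursor:end], end
            return data

        def read_uint(self, big_endian=False):
            # same struct formats as read_uint() above
            return struct.unpack(">I" if big_endian else "<I",
                                 self.read_bytes(4))[0]

    p = MiniParser(b"\x2a\x00\x00\x00rest")
    assert p.read_uint() == 42 and p.read_bytes(4) == b"rest"
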
@@ -903,7 +904,7 @@ class DatabaseConnection():
if util.WINDOWS:
path = "/" + os.path.abspath(path)
- uri = "file:{}?mode=ro&immutable=1".format(path)
+ uri = f"file:{path}?mode=ro&immutable=1"
self.database = sqlite3.connect(
uri, uri=True, isolation_level=None, check_same_thread=False)
return self.database
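
The URI that replaces the .format() call opens the file read-only, and
immutable=1 additionally tells SQLite to take no locks and expect no
-wal/-shm files, which is what makes reading a cookies database safe
while the browser still has it open. Standalone:

    import sqlite3

    def open_readonly(path):
        # mode=ro: reject writes; immutable=1: lock-free access
        uri = f"file:{path}?mode=ro&immutable=1"
        return sqlite3.connect(uri, uri=True, isolation_level=None,
                               check_same_thread=False)
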
@@ -1101,9 +1102,9 @@ def _parse_browser_specification(
browser, profile=None, keyring=None, container=None, domain=None):
browser = browser.lower()
if browser not in SUPPORTED_BROWSERS:
- raise ValueError("Unsupported browser '{}'".format(browser))
+ raise ValueError(f"Unsupported browser '{browser}'")
if keyring and keyring not in SUPPORTED_KEYRINGS:
- raise ValueError("Unsupported keyring '{}'".format(keyring))
+ raise ValueError(f"Unsupported keyring '{keyring}'")
if profile and _is_path(profile):
profile = os.path.expanduser(profile)
return browser, profile, keyring, container, domain
diff --git a/gallery_dl/downloader/common.py b/gallery_dl/downloader/common.py
index dc1219f..7cd8d10 100644
--- a/gallery_dl/downloader/common.py
+++ b/gallery_dl/downloader/common.py
@@ -21,8 +21,7 @@ class DownloaderBase():
extractor = job.extractor
self.log = job.get_logger("downloader." + self.scheme)
- opts = self._extractor_config(extractor)
- if opts:
+ if opts := self._extractor_config(extractor):
self.opts = opts
self.config = self.config_opts
@@ -60,8 +59,7 @@ class DownloaderBase():
opts = {}
for cat, sub in reversed(path):
- popts = self._extractor_opts(cat, sub)
- if popts:
+ if popts := self._extractor_opts(cat, sub):
opts.update(popts)
return opts
@@ -70,12 +68,10 @@ class DownloaderBase():
if not cfg:
return None
- copts = cfg.get(self.scheme)
- if copts:
+ if copts := cfg.get(self.scheme):
if subcategory in cfg:
try:
- sopts = cfg[subcategory].get(self.scheme)
- if sopts:
+ if sopts := cfg[subcategory].get(self.scheme):
opts = copts.copy()
opts.update(sopts)
return opts
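
All three := rewrites in this file preserve the previous merge order:
scheme-level options form the base and subcategory-specific options
override them. A rough equivalent of _extractor_opts, with the
try/except elided:

    def scheme_opts(cfg, scheme, subcategory):
        # base options for the scheme, overridden by the
        # subcategory-specific block when one exists
        if copts := cfg.get(scheme):
            if sopts := cfg.get(subcategory, {}).get(scheme):
                opts = copts.copy()
                opts.update(sopts)
                return opts
            return copts
        return None

    cfg = {"http": {"rate": "1M"}, "tag": {"http": {"rate": "500k"}}}
    assert scheme_opts(cfg, "http", "tag") == {"rate": "500k"}
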
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
index c58e2fb..4595483 100644
--- a/gallery_dl/downloader/http.py
+++ b/gallery_dl/downloader/http.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2023 Mike Fährmann
+# Copyright 2014-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -12,8 +12,9 @@ import time
import mimetypes
from requests.exceptions import RequestException, ConnectionError, Timeout
from .common import DownloaderBase
-from .. import text, util, output
+from .. import text, util, output, exception
from ssl import SSLError
+FLAGS = util.FLAGS
class HttpDownloader(DownloaderBase):
@@ -29,6 +30,7 @@ class HttpDownloader(DownloaderBase):
self.metadata = extractor.config("http-metadata")
self.progress = self.config("progress", 3.0)
self.validate = self.config("validate", True)
+ self.validate_html = self.config("validate-html", True)
self.headers = self.config("headers")
self.minsize = self.config("filesize-min")
self.maxsize = self.config("filesize-max")
@@ -68,14 +70,16 @@ class HttpDownloader(DownloaderBase):
chunk_size = 32768
self.chunk_size = chunk_size
if self.rate:
- rate = text.parse_bytes(self.rate)
- if rate:
- if rate < self.chunk_size:
- self.chunk_size = rate
- self.rate = rate
+ func = util.build_selection_func(self.rate, 0, text.parse_bytes)
+ if rmax := func.args[1] if hasattr(func, "args") else func():
+ if rmax < self.chunk_size:
+ # reduce chunk_size to allow for one iteration each second
+ self.chunk_size = rmax
+ self.rate = func
self.receive = self._receive_rate
else:
self.log.warning("Invalid rate limit (%r)", self.rate)
+ self.rate = False
if self.progress is not None:
self.receive = self._receive_rate
if self.progress < 0.0:
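
self.rate is no longer a plain integer but a callable built by
util.build_selection_func. That helper is not part of this diff;
judging from the func.args[1] access above, a single value presumably
yields a constant and a "min-max" range a per-call random pick, with
the parsed bounds kept on func.args. A hypothetical stand-in:

    import random

    def build_selection_func(value, default, parse):
        # hypothetical sketch, not gallery-dl's implementation
        if "-" in value:
            lo, hi = (parse(v) for v in value.split("-", 1))
            func = lambda: random.randint(lo, hi)
            func.args = (lo, hi)  # upper bound read as func.args[1] above
            return func
        rate = parse(value) or default
        return lambda: rate
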
@@ -88,8 +92,10 @@ class HttpDownloader(DownloaderBase):
def download(self, url, pathfmt):
try:
return self._download_impl(url, pathfmt)
- except Exception:
- output.stderr_write("\n")
+ except Exception as exc:
+ if self.downloading:
+ output.stderr_write("\n")
+ self.log.debug("", exc_info=exc)
raise
finally:
# remove file from incomplete downloads
@@ -134,16 +140,14 @@ class HttpDownloader(DownloaderBase):
# collect HTTP headers
headers = {"Accept": "*/*"}
# file-specific headers
- extra = kwdict.get("_http_headers")
- if extra:
+ if extra := kwdict.get("_http_headers"):
headers.update(extra)
# general headers
if self.headers:
headers.update(self.headers)
# partial content
- file_size = pathfmt.part_size()
- if file_size:
- headers["Range"] = "bytes={}-".format(file_size)
+ if file_size := pathfmt.part_size():
+ headers["Range"] = f"bytes={file_size}-"
# connect to (remote) source
try:
@@ -161,7 +165,7 @@ class HttpDownloader(DownloaderBase):
reason = exc.args[0].reason
cls = reason.__class__.__name__
pre, _, err = str(reason.args[-1]).partition(":")
- msg = "{}: {}".format(cls, (err or pre).lstrip())
+ msg = f"{cls}: {(err or pre).lstrip()}"
except Exception:
msg = str(exc)
continue
@@ -183,7 +187,7 @@ class HttpDownloader(DownloaderBase):
elif code == 416 and file_size: # Requested Range Not Satisfiable
break
else:
- msg = "'{} {}' for '{}'".format(code, response.reason, url)
+ msg = f"'{code} {response.reason}' for '{url}'"
challenge = util.detect_challenge(response)
if challenge is not None:
@@ -199,8 +203,8 @@ class HttpDownloader(DownloaderBase):
return False
# check for invalid responses
- validate = kwdict.get("_http_validate")
- if validate and self.validate:
+ if self.validate and \
+ (validate := kwdict.get("_http_validate")) is not None:
try:
result = validate(response)
except Exception:
@@ -214,6 +218,14 @@ class HttpDownloader(DownloaderBase):
self.release_conn(response)
self.log.warning("Invalid response")
return False
+ if self.validate_html and response.headers.get(
+ "content-type", "").startswith("text/html") and \
+ pathfmt.extension not in ("html", "htm"):
+ if response.history:
+ self.log.warning("HTTP redirect to '%s'", response.url)
+ else:
+ self.log.warning("HTML response")
+ return False
# check file size
size = text.parse_int(size, None)
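
The new validate-html option guards against servers that answer a file
request with an HTML error, challenge, or login page: a text/html
response only passes when an .html/.htm file was actually requested,
and the warning distinguishes redirects from direct HTML bodies.
Reduced to its core check:

    def is_unexpected_html(headers, extension):
        # content claims HTML, but the target file isn't an HTML document
        ctype = headers.get("content-type", "")
        return (ctype.startswith("text/html")
                and extension not in ("html", "htm"))

    assert is_unexpected_html(
        {"content-type": "text/html; charset=utf-8"}, "jpg")
    assert not is_unexpected_html({"content-type": "image/jpeg"}, "jpg")
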
@@ -265,19 +277,28 @@ class HttpDownloader(DownloaderBase):
content = response.iter_content(self.chunk_size)
+ validate_sig = kwdict.get("_http_signature")
+ validate_ext = (adjust_extension and
+ pathfmt.extension in SIGNATURE_CHECKS)
+
# check filename extension against file header
- if adjust_extension and not offset and \
- pathfmt.extension in SIGNATURE_CHECKS:
+ if not offset and (validate_ext or validate_sig):
try:
file_header = next(
content if response.raw.chunked
else response.iter_content(16), b"")
except (RequestException, SSLError) as exc:
msg = str(exc)
- output.stderr_write("\n")
continue
- if self._adjust_extension(pathfmt, file_header) and \
- pathfmt.exists():
+ if validate_sig:
+ result = validate_sig(file_header)
+ if result is not True:
+ self.release_conn(response)
+ self.log.warning(
+ result or "Invalid file signature bytes")
+ return False
+ if validate_ext and self._adjust_extension(
+ pathfmt, file_header) and pathfmt.exists():
pathfmt.temppath = ""
response.close()
return True
@@ -294,6 +315,9 @@ class HttpDownloader(DownloaderBase):
# download content
self.downloading = True
with pathfmt.open(mode) as fp:
+ if fp is None:
+ # '.part' file no longer exists
+ break
if file_header:
fp.write(file_header)
offset += len(file_header)
@@ -310,11 +334,16 @@ class HttpDownloader(DownloaderBase):
msg = str(exc)
output.stderr_write("\n")
continue
+ except exception.StopExtraction:
+ response.close()
+ return False
+ except exception.ControlException:
+ response.close()
+ raise
# check file size
if size and fp.tell() < size:
- msg = "file size mismatch ({} < {})".format(
- fp.tell(), size)
+ msg = f"file size mismatch ({fp.tell()} < {size})"
output.stderr_write("\n")
continue
@@ -323,11 +352,11 @@ class HttpDownloader(DownloaderBase):
self.downloading = False
if self.mtime:
if "_http_lastmodified" in kwdict:
- kwdict["_mtime"] = kwdict["_http_lastmodified"]
+ kwdict["_mtime_http"] = kwdict["_http_lastmodified"]
else:
- kwdict["_mtime"] = response.headers.get("Last-Modified")
+ kwdict["_mtime_http"] = response.headers.get("Last-Modified")
else:
- kwdict["_mtime"] = None
+ kwdict["_mtime_http"] = None
return True
@@ -343,14 +372,16 @@ class HttpDownloader(DownloaderBase):
"closing the connection anyway", exc.__class__.__name__, exc)
response.close()
- @staticmethod
- def receive(fp, content, bytes_total, bytes_start):
+ def receive(self, fp, content, bytes_total, bytes_start):
write = fp.write
for data in content:
write(data)
+ if FLAGS.DOWNLOAD is not None:
+ FLAGS.process("DOWNLOAD")
+
def _receive_rate(self, fp, content, bytes_total, bytes_start):
- rate = self.rate
+ rate = self.rate() if self.rate else None
write = fp.write
progress = self.progress
@@ -363,6 +394,9 @@ class HttpDownloader(DownloaderBase):
write(data)
+ if FLAGS.DOWNLOAD is not None:
+ FLAGS.process("DOWNLOAD")
+
if progress is not None:
if time_elapsed > progress:
self.out.progress(
@@ -371,7 +405,7 @@ class HttpDownloader(DownloaderBase):
int(bytes_downloaded / time_elapsed),
)
- if rate:
+ if rate is not None:
time_expected = bytes_downloaded / rate
if time_expected > time_elapsed:
time.sleep(time_expected - time_elapsed)
@@ -387,15 +421,13 @@ class HttpDownloader(DownloaderBase):
if mtype in MIME_TYPES:
return MIME_TYPES[mtype]
- ext = mimetypes.guess_extension(mtype, strict=False)
- if ext:
+ if ext := mimetypes.guess_extension(mtype, strict=False):
return ext[1:]
self.log.warning("Unknown MIME type '%s'", mtype)
return "bin"
- @staticmethod
- def _adjust_extension(pathfmt, file_header):
+ def _adjust_extension(self, pathfmt, file_header):
"""Check filename extension against file header"""
if not SIGNATURE_CHECKS[pathfmt.extension](file_header):
for ext, check in SIGNATURE_CHECKS.items():
@@ -452,12 +484,20 @@ MIME_TYPES = {
"application/x-pdf": "pdf",
"application/x-shockwave-flash": "swf",
+ "text/html": "html",
+
"application/ogg": "ogg",
# https://www.iana.org/assignments/media-types/model/obj
"model/obj": "obj",
"application/octet-stream": "bin",
}
+
+def _signature_html(s):
+ s = s[:14].lstrip()
+ return s and b"<!doctype html".startswith(s.lower())
+
+
# https://en.wikipedia.org/wiki/List_of_file_signatures
SIGNATURE_CHECKS = {
"jpg" : lambda s: s[0:3] == b"\xFF\xD8\xFF",
@@ -488,6 +528,8 @@ SIGNATURE_CHECKS = {
"7z" : lambda s: s[0:6] == b"\x37\x7A\xBC\xAF\x27\x1C",
"pdf" : lambda s: s[0:5] == b"%PDF-",
"swf" : lambda s: s[0:3] in (b"CWS", b"FWS"),
+ "html": _signature_html,
+ "htm" : _signature_html,
"blend": lambda s: s[0:7] == b"BLENDER",
# unfortunately the Wavefront .obj format doesn't have a signature,
# so we check for the existence of Blender's comment
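
Each SIGNATURE_CHECKS entry maps an extension to a predicate over the
first bytes of a file. _signature_html is the permissive one: it trims
leading whitespace, lowercases, and asks whether the (possibly
truncated) header is a prefix of b"<!doctype html", so a short first
read still matches:

    def _signature_html(s):
        s = s[:14].lstrip()
        return s and b"<!doctype html".startswith(s.lower())

    assert _signature_html(b"  <!DOCTYPE HTML><html>")
    assert _signature_html(b"<!doc")        # truncated, but a valid prefix
    assert not _signature_html(b"\x89PNG\r\n\x1a\n")
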
diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py
index 1fc2f82..69a59ff 100644
--- a/gallery_dl/downloader/ytdl.py
+++ b/gallery_dl/downloader/ytdl.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2022 Mike Fährmann
+# Copyright 2018-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -30,6 +30,7 @@ class YoutubeDLDownloader(DownloaderBase):
}
self.ytdl_instance = None
+ self.rate_dyn = None
self.forward_cookies = self.config("forward-cookies", True)
self.progress = self.config("progress", 3.0)
self.outtmpl = self.config("outtmpl")
@@ -67,18 +68,23 @@ class YoutubeDLDownloader(DownloaderBase):
for cookie in self.session.cookies:
set_cookie(cookie)
- if self.progress is not None and not ytdl_instance._progress_hooks:
- ytdl_instance.add_progress_hook(self._progress_hook)
+ if "__gdl_initialize" in ytdl_instance.params:
+ del ytdl_instance.params["__gdl_initialize"]
+
+ if self.progress is not None:
+ ytdl_instance.add_progress_hook(self._progress_hook)
+ if rlf := ytdl_instance.params.pop("__gdl_ratelimit_func", False):
+ self.rate_dyn = rlf
info_dict = kwdict.pop("_ytdl_info_dict", None)
if not info_dict:
url = url[5:]
try:
- manifest = kwdict.pop("_ytdl_manifest", None)
- if manifest:
+ if manifest := kwdict.pop("_ytdl_manifest", None):
info_dict = self._extract_manifest(
ytdl_instance, url, manifest,
- kwdict.pop("_ytdl_manifest_data", None))
+ kwdict.pop("_ytdl_manifest_data", None),
+ kwdict.pop("_ytdl_manifest_headers", None))
else:
info_dict = self._extract_info(ytdl_instance, url)
except Exception as exc:
@@ -96,8 +102,7 @@ class YoutubeDLDownloader(DownloaderBase):
else:
info_dict = info_dict["entries"][index]
- extra = kwdict.get("_ytdl_extra")
- if extra:
+ if extra := kwdict.get("_ytdl_extra"):
info_dict.update(extra)
return self._download_video(ytdl_instance, pathfmt, info_dict)
@@ -131,26 +136,31 @@ class YoutubeDLDownloader(DownloaderBase):
pathfmt.temppath = ""
return True
+ if self.rate_dyn is not None:
+ # static ratelimits are set in ytdl.construct_YoutubeDL
+ ytdl_instance.params["ratelimit"] = self.rate_dyn()
+
self.out.start(pathfmt.path)
if self.part:
- pathfmt.kwdict["extension"] = pathfmt.prefix + "part"
+ pathfmt.kwdict["extension"] = pathfmt.prefix
filename = pathfmt.build_filename(pathfmt.kwdict)
pathfmt.kwdict["extension"] = info_dict["ext"]
if self.partdir:
path = os.path.join(self.partdir, filename)
else:
path = pathfmt.realdirectory + filename
+ path = path.replace("%", "%%") + "%(ext)s"
else:
- path = pathfmt.realpath
+ path = pathfmt.realpath.replace("%", "%%")
- self._set_outtmpl(ytdl_instance, path.replace("%", "%%"))
+ self._set_outtmpl(ytdl_instance, path)
try:
ytdl_instance.process_info(info_dict)
except Exception as exc:
self.log.debug("", exc_info=exc)
return False
- pathfmt.temppath = info_dict["filepath"]
+ pathfmt.temppath = info_dict.get("filepath") or info_dict["_filename"]
return True
def _download_playlist(self, ytdl_instance, pathfmt, info_dict):
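
The reshuffled %-escaping matters because the path is handed to yt-dlp
as an output template: literal percent signs must be doubled, and for
.part downloads a live %(ext)s field is now appended after escaping so
yt-dlp substitutes the real extension. The transformation, as a sketch
(helper name hypothetical):

    def as_outtmpl(path, part=False):
        # double literal '%' so the template engine leaves it alone;
        # part files get a live %(ext)s field appended afterwards
        path = path.replace("%", "%%")
        return path + "%(ext)s" if part else path

    print(as_outtmpl("/dl/100% cotton/clip.", part=True))
    # -> /dl/100%% cotton/clip.%(ext)s
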
@@ -159,13 +169,16 @@ class YoutubeDLDownloader(DownloaderBase):
self._set_outtmpl(ytdl_instance, pathfmt.realpath)
for entry in info_dict["entries"]:
+ if self.rate_dyn is not None:
+ ytdl_instance.params["ratelimit"] = self.rate_dyn()
ytdl_instance.process_info(entry)
return True
def _extract_info(self, ytdl, url):
return ytdl.extract_info(url, download=False)
- def _extract_manifest(self, ytdl, url, manifest_type, manifest_data=None):
+ def _extract_manifest(self, ytdl, url, manifest_type, manifest_data=None,
+ headers=None):
extr = ytdl.get_info_extractor("Generic")
video_id = extr._generic_id(url)
@@ -173,9 +186,10 @@ class YoutubeDLDownloader(DownloaderBase):
if manifest_data is None:
try:
fmts, subs = extr._extract_m3u8_formats_and_subtitles(
- url, video_id, "mp4")
+ url, video_id, "mp4", headers=headers)
except AttributeError:
- fmts = extr._extract_m3u8_formats(url, video_id, "mp4")
+ fmts = extr._extract_m3u8_formats(
+ url, video_id, "mp4", headers=headers)
subs = None
else:
try:
@@ -189,9 +203,10 @@ class YoutubeDLDownloader(DownloaderBase):
if manifest_data is None:
try:
fmts, subs = extr._extract_mpd_formats_and_subtitles(
- url, video_id)
+ url, video_id, headers=headers)
except AttributeError:
- fmts = extr._extract_mpd_formats(url, video_id)
+ fmts = extr._extract_mpd_formats(
+ url, video_id, headers=headers)
subs = None
else:
if isinstance(manifest_data, str):
@@ -228,8 +243,7 @@ class YoutubeDLDownloader(DownloaderBase):
int(speed) if speed else 0,
)
- @staticmethod
- def _set_outtmpl(ytdl_instance, outtmpl):
+ def _set_outtmpl(self, ytdl_instance, outtmpl):
try:
ytdl_instance._parse_outtmpl
except AttributeError:
diff --git a/gallery_dl/exception.py b/gallery_dl/exception.py
index 6b2ce3a..5a52581 100644
--- a/gallery_dl/exception.py
+++ b/gallery_dl/exception.py
@@ -11,21 +11,26 @@
Class Hierarchy:
Exception
- +-- GalleryDLException
- +-- ExtractionError
- | +-- AuthenticationError
- | +-- AuthorizationError
- | +-- NotFoundError
- | +-- HttpError
- +-- FormatError
- | +-- FilenameFormatError
- | +-- DirectoryFormatError
- +-- FilterError
- +-- InputFileError
- +-- NoExtractorError
- +-- StopExtraction
- +-- TerminateExtraction
- +-- RestartExtraction
+ └── GalleryDLException
+ ├── ExtractionError
+ │ ├── HttpError
+ │ │ └── ChallengeError
+ │ ├── AuthorizationError
+ │ │ └── AuthRequired
+ │ ├── AuthenticationError
+ │ └── NotFoundError
+ ├── InputError
+ │ ├── FormatError
+ │ │ ├── FilenameFormatError
+ │ │ └── DirectoryFormatError
+ │ ├── FilterError
+ │ ├── InputFileError
+ │ └── NoExtractorError
+ └── ControlException
+ ├── StopExtraction
+ ├── AbortExtraction
+ ├── TerminateExtraction
+ └── RestartExtraction
"""
@@ -39,20 +44,24 @@ class GalleryDLException(Exception):
if not message:
message = self.default
elif isinstance(message, Exception):
- message = "{}: {}".format(message.__class__.__name__, message)
- if self.msgfmt and fmt:
- message = self.msgfmt.format(message)
+ message = f"{message.__class__.__name__}: {message}"
+ if fmt and self.msgfmt is not None:
+ message = self.msgfmt.replace("{}", message)
+ self.message = message
Exception.__init__(self, message)
+###############################################################################
+# Extractor Errors ############################################################
+
class ExtractionError(GalleryDLException):
"""Base class for exceptions during information extraction"""
+ code = 4
class HttpError(ExtractionError):
"""HTTP request during data extraction failed"""
default = "HTTP request failed"
- code = 4
def __init__(self, message="", response=None):
self.response = response
@@ -61,35 +70,63 @@ class HttpError(ExtractionError):
else:
self.status = response.status_code
if not message:
- message = "'{} {}' for '{}'".format(
- response.status_code, response.reason, response.url)
+ message = (f"'{response.status_code} {response.reason}' "
+ f"for '{response.url}'")
ExtractionError.__init__(self, message)
-class NotFoundError(ExtractionError):
- """Requested resource (gallery/image) could not be found"""
- msgfmt = "Requested {} could not be found"
- default = "resource (gallery/image)"
+class ChallengeError(HttpError):
code = 8
+ def __init__(self, challenge, response):
+ message = (
+ f"{challenge} ({response.status_code} {response.reason}) "
+ f"for '{response.url}'")
+ HttpError.__init__(self, message, response)
+
class AuthenticationError(ExtractionError):
"""Invalid or missing login credentials"""
- default = "Invalid or missing login credentials"
+ default = "Invalid login credentials"
code = 16
class AuthorizationError(ExtractionError):
"""Insufficient privileges to access a resource"""
- default = "Insufficient privileges to access the specified resource"
+ default = "Insufficient privileges to access this resource"
code = 16
-class FormatError(GalleryDLException):
- """Error while building output paths"""
+class AuthRequired(AuthorizationError):
+ default = "Account credentials required"
+
+ def __init__(self, required=None, message=None):
+ if required and not message:
+ if isinstance(required, str):
+ message = f"{required} required"
+ else:
+ message = f"{' or '.join(required)} required"
+ AuthorizationError.__init__(self, message)
+
+
+class NotFoundError(ExtractionError):
+ """Requested resource (gallery/image) could not be found"""
+ msgfmt = "Requested {} could not be found"
+ default = "resource (gallery/image)"
+
+
+###############################################################################
+# User Input ##################################################################
+
+class InputError(GalleryDLException):
+ """Error caused by user input and config options"""
code = 32
+class FormatError(InputError):
+ """Error while building output paths"""
+
+
class FilenameFormatError(FormatError):
"""Error while building output filenames"""
msgfmt = "Applying filename format string failed ({})"
@@ -100,40 +137,37 @@ class DirectoryFormatError(FormatError):
msgfmt = "Applying directory format string failed ({})"
-class FilterError(GalleryDLException):
+class FilterError(InputError):
"""Error while evaluating a filter expression"""
msgfmt = "Evaluating filter expression failed ({})"
- code = 32
-
-class InputFileError(GalleryDLException):
- """Error when parsing input file"""
- code = 32
- def __init__(self, message, *args):
- GalleryDLException.__init__(
- self, message % args if args else message)
+class InputFileError(InputError):
+ """Error when parsing an input file"""
-class NoExtractorError(GalleryDLException):
+class NoExtractorError(InputError):
"""No extractor can handle the given URL"""
- code = 64
-class StopExtraction(GalleryDLException):
+###############################################################################
+# Control Flow ################################################################
+
+class ControlException(GalleryDLException):
+ code = 0
+
+
+class StopExtraction(ControlException):
"""Stop data extraction"""
- def __init__(self, message=None, *args):
- GalleryDLException.__init__(self)
- self.message = message % args if args else message
- self.code = 1 if message else 0
+
+class AbortExtraction(ExtractionError, ControlException):
+ """Abort data extraction due to an error"""
-class TerminateExtraction(GalleryDLException):
+class TerminateExtraction(ControlException):
"""Terminate data extraction"""
- code = 0
-class RestartExtraction(GalleryDLException):
+class RestartExtraction(ControlException):
"""Restart data extraction"""
- code = 0
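
The regrouped hierarchy lets callers catch whole families (InputError,
ControlException) instead of enumerating leaf classes, and exit codes
now live on the family bases. AbortExtraction is the one
dual-inheritance case: it propagates like a control exception but
reports like an extraction error. A self-contained sketch of how the
code attribute resolves (the base default of 1 is assumed; it is not
visible in this hunk):

    class GalleryDLException(Exception):
        code = 1  # assumed default, not shown in the diff

    class ExtractionError(GalleryDLException):
        code = 4

    class ControlException(GalleryDLException):
        code = 0

    class AbortExtraction(ExtractionError, ControlException):
        """Abort data extraction due to an error"""

    # the MRO finds ExtractionError.code first ...
    assert AbortExtraction.code == 4
    # ... while it is still caught as control flow
    assert issubclass(AbortExtraction, ControlException)
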
diff --git a/gallery_dl/extractor/2ch.py b/gallery_dl/extractor/2ch.py
index dbbf21b..f5bb7b7 100644
--- a/gallery_dl/extractor/2ch.py
+++ b/gallery_dl/extractor/2ch.py
@@ -26,8 +26,8 @@ class _2chThreadExtractor(Extractor):
self.board, self.thread = match.groups()
def items(self):
- url = "{}/{}/res/{}.json".format(self.root, self.board, self.thread)
- posts = self.request(url).json()["threads"][0]["posts"]
+ url = f"{self.root}/{self.board}/res/{self.thread}.json"
+ posts = self.request_json(url)["threads"][0]["posts"]
op = posts[0]
title = op.get("subject") or text.remove_html(op["comment"])
@@ -40,8 +40,7 @@ class _2chThreadExtractor(Extractor):
yield Message.Directory, thread
for post in posts:
- files = post.get("files")
- if files:
+ if files := post.get("files"):
post["post_name"] = post["name"]
post["date"] = text.parse_timestamp(post["timestamp"])
del post["files"]
@@ -68,24 +67,24 @@ class _2chBoardExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.board = match.group(1)
+ self.board = match[1]
def items(self):
+ base = f"{self.root}/{self.board}"
+
# index page
- url = "{}/{}/index.json".format(self.root, self.board)
- index = self.request(url).json()
+ url = f"{base}/index.json"
+ index = self.request_json(url)
index["_extractor"] = _2chThreadExtractor
for thread in index["threads"]:
- url = "{}/{}/res/{}.html".format(
- self.root, self.board, thread["thread_num"])
+ url = f"{base}/res/{thread['thread_num']}.html"
yield Message.Queue, url, index
# pages 1..n
for n in util.advance(index["pages"], 1):
- url = "{}/{}/{}.json".format(self.root, self.board, n)
- page = self.request(url).json()
+ url = f"{base}/{n}.json"
+ page = self.request_json(url)
page["_extractor"] = _2chThreadExtractor
for thread in page["threads"]:
- url = "{}/{}/res/{}.html".format(
- self.root, self.board, thread["thread_num"])
+ url = f"{base}/res/{thread['thread_num']}.html"
yield Message.Queue, url, page
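
This commit repeatedly swaps self.request(url).json() for
self.request_json(url) (with an XML sibling, request_xml, in agnph.py
below). The helper itself is not part of this diff; presumably it is a
thin wrapper along these lines:

    import requests

    def request_json(url, **kwargs):
        # assumed shape of the new helper: perform the request,
        # fail on HTTP errors, return the decoded body
        response = requests.get(url, **kwargs)
        response.raise_for_status()
        return response.json()

    # usage mirroring the thread extractor above:
    # posts = request_json(f"{root}/{board}/res/{thread}.json")
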
diff --git a/gallery_dl/extractor/2chan.py b/gallery_dl/extractor/2chan.py
index 337ba48..9927b5a 100644
--- a/gallery_dl/extractor/2chan.py
+++ b/gallery_dl/extractor/2chan.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2017-2023 Mike Fährmann
+# Copyright 2017-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -19,7 +19,6 @@ class _2chanThreadExtractor(Extractor):
directory_fmt = ("{category}", "{board_name}", "{thread}")
filename_fmt = "{tim}.{extension}"
archive_fmt = "{board}_{thread}_{tim}"
- url_fmt = "https://{server}.2chan.net/{board}/src/{filename}"
pattern = r"(?:https?://)?([\w-]+)\.2chan\.net/([^/?#]+)/res/(\d+)"
example = "https://dec.2chan.net/12/res/12345.htm"
@@ -28,8 +27,8 @@ class _2chanThreadExtractor(Extractor):
self.server, self.board, self.thread = match.groups()
def items(self):
- url = "https://{}.2chan.net/{}/res/{}.htm".format(
- self.server, self.board, self.thread)
+ url = (f"https://{self.server}.2chan.net"
+ f"/{self.board}/res/{self.thread}.htm")
page = self.request(url).text
data = self.metadata(page)
yield Message.Directory, data
@@ -37,7 +36,8 @@ class _2chanThreadExtractor(Extractor):
if "filename" not in post:
continue
post.update(data)
- url = self.url_fmt.format_map(post)
+ url = (f"https://{post['server']}.2chan.net"
+ f"/{post['board']}/src/{post['filename']}")
yield Message.Url, url, post
def metadata(self, page):
@@ -74,8 +74,7 @@ class _2chanThreadExtractor(Extractor):
data["ext"] = "." + data["extension"]
return data
- @staticmethod
- def _extract_post(post):
+ def _extract_post(self, post):
return text.extract_all(post, (
("post", 'class="csb">' , '<'),
("name", 'class="cnm">' , '<'),
@@ -85,8 +84,7 @@ class _2chanThreadExtractor(Extractor):
("com" , '>', '</blockquote>'),
))[0]
- @staticmethod
- def _extract_image(post, data):
+ def _extract_image(self, post, data):
text.extract_all(post, (
(None , '_blank', ''),
("filename", '>', '<'),
diff --git a/gallery_dl/extractor/2chen.py b/gallery_dl/extractor/2chen.py
index 0c97889..ee3510c 100644
--- a/gallery_dl/extractor/2chen.py
+++ b/gallery_dl/extractor/2chen.py
@@ -28,7 +28,7 @@ class _2chenThreadExtractor(Extractor):
self.board, self.thread = match.groups()
def items(self):
- url = "{}/{}/{}".format(self.root, self.board, self.thread)
+ url = f"{self.root}/{self.board}/{self.thread}"
page = self.request(url, encoding="utf-8", notfound="thread").text
data = self.metadata(page)
yield Message.Directory, data
@@ -86,10 +86,10 @@ class _2chenBoardExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.board = match.group(1)
+ self.board = match[1]
def items(self):
- url = "{}/{}/catalog".format(self.root, self.board)
+ url = f"{self.root}/{self.board}/catalog"
page = self.request(url, notfound="board").text
data = {"_extractor": _2chenThreadExtractor}
for thread in text.extract_iter(
diff --git a/gallery_dl/extractor/35photo.py b/gallery_dl/extractor/35photo.py
index 773116e..ec5f0cb 100644
--- a/gallery_dl/extractor/35photo.py
+++ b/gallery_dl/extractor/35photo.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -49,14 +49,14 @@ class _35photoExtractor(Extractor):
if extra_ids:
yield from extra_ids
while params["lastId"]:
- data = self.request(url, headers=headers, params=params).json()
+ data = self.request_json(url, headers=headers, params=params)
yield from self._photo_ids(data["data"])
params["lastId"] = data["lastId"]
def _photo_data(self, photo_id):
params = {"method": "photo.getData", "photoId": photo_id}
- data = self.request(
- "https://api.35photo.pro/", params=params).json()["data"][photo_id]
+ data = self.request_json(
+ "https://api.35photo.pro/", params=params)["data"][photo_id]
info = {
"url" : data["src"],
"id" : data["photo_id"],
@@ -83,8 +83,7 @@ class _35photoExtractor(Extractor):
info["num"] = 1
yield info
- @staticmethod
- def _photo_ids(page):
+ def _photo_ids(self, page):
"""Extract unique photo IDs and return them as sorted list"""
# searching for photo-id="..." doesn't always work (see unit tests)
if not page:
@@ -105,11 +104,11 @@ class _35photoUserExtractor(_35photoExtractor):
def __init__(self, match):
_35photoExtractor.__init__(self, match)
- self.user = match.group(1)
+ self.user = match[1]
self.user_id = 0
def metadata(self):
- url = "{}/{}/".format(self.root, self.user)
+ url = f"{self.root}/{self.user}/"
page = self.request(url).text
self.user_id = text.parse_int(text.extr(page, "/user_", ".xml"))
return {
@@ -134,7 +133,7 @@ class _35photoTagExtractor(_35photoExtractor):
def __init__(self, match):
_35photoExtractor.__init__(self, match)
- self.tag = match.group(1)
+ self.tag = match[1]
def metadata(self):
return {"search_tag": text.unquote(self.tag).lower()}
@@ -143,7 +142,7 @@ class _35photoTagExtractor(_35photoExtractor):
num = 1
while True:
- url = "{}/tags/{}/list_{}/".format(self.root, self.tag, num)
+ url = f"{self.root}/tags/{self.tag}/list_{num}/"
page = self.request(url).text
prev = None
@@ -171,7 +170,7 @@ class _35photoGenreExtractor(_35photoExtractor):
self.photo_ids = None
def metadata(self):
- url = "{}/genre_{}{}".format(self.root, self.genre_id, self.new or "/")
+ url = f"{self.root}/genre_{self.genre_id}{self.new or '/'}"
page = self.request(url).text
self.photo_ids = self._photo_ids(text.extr(
page, ' class="photo', '\n'))
@@ -199,7 +198,7 @@ class _35photoImageExtractor(_35photoExtractor):
def __init__(self, match):
_35photoExtractor.__init__(self, match)
- self.photo_id = match.group(1)
+ self.photo_id = match[1]
def photos(self):
return (self.photo_id,)
diff --git a/gallery_dl/extractor/4archive.py b/gallery_dl/extractor/4archive.py
index d198369..c9be2a4 100644
--- a/gallery_dl/extractor/4archive.py
+++ b/gallery_dl/extractor/4archive.py
@@ -27,8 +27,7 @@ class _4archiveThreadExtractor(Extractor):
self.board, self.thread = match.groups()
def items(self):
- url = "{}/board/{}/thread/{}".format(
- self.root, self.board, self.thread)
+ url = f"{self.root}/board/{self.board}/thread/{self.thread}"
page = self.request(url).text
data = self.metadata(page)
posts = self.posts(page)
@@ -58,15 +57,14 @@ class _4archiveThreadExtractor(Extractor):
for post in page.split('class="postContainer')[1:]
]
- @staticmethod
- def parse(post):
+ def parse(self, post):
extr = text.extract_from(post)
data = {
"name": extr('class="name">', "</span>"),
"date": text.parse_datetime(
extr('class="dateTime postNum" >', "<").strip(),
"%Y-%m-%d %H:%M:%S"),
- "no" : text.parse_int(extr('href="#p', '"')),
+ "no" : text.parse_int(extr(">Post No.", "<")),
}
if 'class="file"' in post:
extr('class="fileText"', ">File: <a")
@@ -94,18 +92,17 @@ class _4archiveBoardExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.board = match.group(1)
- self.num = text.parse_int(match.group(2), 1)
+ self.board = match[1]
+ self.num = text.parse_int(match[2], 1)
def items(self):
data = {"_extractor": _4archiveThreadExtractor}
while True:
- url = "{}/board/{}/{}".format(self.root, self.board, self.num)
+ url = f"{self.root}/board/{self.board}/{self.num}"
page = self.request(url).text
if 'class="thread"' not in page:
return
for thread in text.extract_iter(page, 'class="thread" id="t', '"'):
- url = "{}/board/{}/thread/{}".format(
- self.root, self.board, thread)
+ url = f"{self.root}/board/{self.board}/thread/{thread}"
yield Message.Queue, url, data
self.num += 1
diff --git a/gallery_dl/extractor/4chan.py b/gallery_dl/extractor/4chan.py
index 2db6042..d81f305 100644
--- a/gallery_dl/extractor/4chan.py
+++ b/gallery_dl/extractor/4chan.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2023 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -28,9 +28,8 @@ class _4chanThreadExtractor(Extractor):
self.board, self.thread = match.groups()
def items(self):
- url = "https://a.4cdn.org/{}/thread/{}.json".format(
- self.board, self.thread)
- posts = self.request(url).json()["posts"]
+ url = f"https://a.4cdn.org/{self.board}/thread/{self.thread}.json"
+ posts = self.request_json(url)["posts"]
title = posts[0].get("sub") or text.remove_html(posts[0]["com"])
data = {
@@ -45,11 +44,23 @@ class _4chanThreadExtractor(Extractor):
post.update(data)
post["extension"] = post["ext"][1:]
post["filename"] = text.unescape(post["filename"])
- url = "https://i.4cdn.org/{}/{}{}".format(
- post["board"], post["tim"], post["ext"])
+ post["_http_signature"] = _detect_null_byte
+ url = (f"https://i.4cdn.org"
+ f"/{post['board']}/{post['tim']}{post['ext']}")
yield Message.Url, url, post
+def _detect_null_byte(signature):
+ """Return False if all file signature bytes are null"""
+ if signature:
+ if signature[0]:
+ return True
+ for byte in signature:
+ if byte:
+ return True
+ return "File data consists of null bytes"
+
+
class _4chanBoardExtractor(Extractor):
"""Extractor for 4chan boards"""
category = "4chan"
@@ -59,16 +70,16 @@ class _4chanBoardExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.board = match.group(1)
+ self.board = match[1]
def items(self):
- url = "https://a.4cdn.org/{}/threads.json".format(self.board)
- threads = self.request(url).json()
+ url = f"https://a.4cdn.org/{self.board}/threads.json"
+ threads = self.request_json(url)
for page in threads:
for thread in page["threads"]:
- url = "https://boards.4chan.org/{}/thread/{}/".format(
- self.board, thread["no"])
+ url = (f"https://boards.4chan.org"
+ f"/{self.board}/thread/{thread['no']}/")
thread["page"] = page["page"]
thread["_extractor"] = _4chanThreadExtractor
yield Message.Queue, url, thread
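
_http_headers and the new _http_signature are per-file kwdict hooks:
the http downloader (see the downloader/http.py hunks above) calls the
signature callback with the first bytes of the body and treats any
return value other than True as a warning message and a failed
download. Exercising the 4chan callback:

    def _detect_null_byte(signature):
        # as defined above, minus the first-byte fast path
        for byte in signature:
            if byte:
                return True
        return "File data consists of null bytes"

    assert _detect_null_byte(b"\xff\xd8\xff\xe0") is True
    assert _detect_null_byte(bytes(16)).startswith("File data")
    assert _detect_null_byte(b"").startswith("File data")
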
diff --git a/gallery_dl/extractor/4chanarchives.py b/gallery_dl/extractor/4chanarchives.py
index 27ac7c5..c187b41 100644
--- a/gallery_dl/extractor/4chanarchives.py
+++ b/gallery_dl/extractor/4chanarchives.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2023 Mike Fährmann
+# Copyright 2023-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -29,8 +29,7 @@ class _4chanarchivesThreadExtractor(Extractor):
self.board, self.thread = match.groups()
def items(self):
- url = "{}/board/{}/thread/{}".format(
- self.root, self.board, self.thread)
+ url = f"{self.root}/board/{self.board}/thread/{self.thread}"
page = self.request(url).text
data = self.metadata(page)
posts = self.posts(page)
@@ -66,8 +65,7 @@ class _4chanarchivesThreadExtractor(Extractor):
post["extension"] = post["url"].rpartition(".")[2]
return post
- @staticmethod
- def _extract_post(html):
+ def _extract_post(self, html):
extr = text.extract_from(html)
return {
"no" : text.parse_int(extr('', '"')),
@@ -77,8 +75,7 @@ class _4chanarchivesThreadExtractor(Extractor):
html[html.find('<blockquote'):].partition(">")[2]),
}
- @staticmethod
- def _extract_file(html, post):
+ def _extract_file(self, html, post):
extr = text.extract_from(html, html.index(">File: <"))
post["url"] = extr('href="', '"')
post["filename"] = text.unquote(extr(">", "<").rpartition(".")[0])
@@ -106,7 +103,7 @@ class _4chanarchivesBoardExtractor(Extractor):
<span><a href="'''
while True:
- url = "{}/board/{}/{}".format(self.root, self.board, pnum)
+ url = f"{self.root}/board/{self.board}/{pnum}"
page = self.request(url).text
thread = None
diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py
index 41cc0de..d1ac503 100644
--- a/gallery_dl/extractor/500px.py
+++ b/gallery_dl/extractor/500px.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -72,7 +72,7 @@ class _500pxExtractor(Extractor):
"x-csrf-token": self.cookies.get(
"x-csrf-token", domain=".500px.com"),
}
- return self.request(url, headers=headers, params=params).json()
+ return self.request_json(url, headers=headers, params=params)
def _request_graphql(self, opname, variables):
url = "https://api.500px.com/graphql"
@@ -85,8 +85,8 @@ class _500pxExtractor(Extractor):
"variables" : util.json_dumps(variables),
"query" : QUERIES[opname],
}
- return self.request(
- url, method="POST", headers=headers, json=data).json()["data"]
+ return self.request_json(
+ url, method="POST", headers=headers, json=data)["data"]
class _500pxUserExtractor(_500pxExtractor):
@@ -97,7 +97,7 @@ class _500pxUserExtractor(_500pxExtractor):
def __init__(self, match):
_500pxExtractor.__init__(self, match)
- self.user = match.group(1)
+ self.user = match[1]
def photos(self):
variables = {"username": self.user, "pageSize": 20}
@@ -207,7 +207,7 @@ class _500pxImageExtractor(_500pxExtractor):
def __init__(self, match):
_500pxExtractor.__init__(self, match)
- self.photo_id = match.group(1)
+ self.photo_id = match[1]
def photos(self):
edges = ({"node": {"legacyId": self.photo_id}},)
diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py
index 3e30ddc..0385067 100644
--- a/gallery_dl/extractor/8chan.py
+++ b/gallery_dl/extractor/8chan.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2022-2023 Mike Fährmann
+# Copyright 2022-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -23,7 +23,7 @@ class _8chanExtractor(Extractor):
root = "https://8chan.moe"
def __init__(self, match):
- self.root = "https://8chan." + match.group(1)
+ self.root = "https://8chan." + match[1]
Extractor.__init__(self, match)
@memcache()
@@ -78,9 +78,9 @@ class _8chanThreadExtractor(_8chanExtractor):
self.cookies.set(self.cookies_tos_name(), "1", domain=self.root[8:])
# fetch thread data
- url = "{}/{}/res/{}.".format(self.root, board, thread)
+ url = f"{self.root}/{board}/res/{thread}."
self.session.headers["Referer"] = url + "html"
- thread = self.request(url + "json").json()
+ thread = self.request_json(url + "json")
thread["postId"] = thread["threadId"]
thread["_http_headers"] = {"Referer": url + "html"}
@@ -116,19 +116,18 @@ class _8chanBoardExtractor(_8chanExtractor):
self.cookies.set(self.cookies_tos_name(), "1", domain=self.root[8:])
pnum = text.parse_int(pnum, 1)
- url = "{}/{}/{}.json".format(self.root, board, pnum)
- data = self.request(url).json()
+ url = f"{self.root}/{board}/{pnum}.json"
+ data = self.request_json(url)
threads = data["threads"]
while True:
for thread in threads:
thread["_extractor"] = _8chanThreadExtractor
- url = "{}/{}/res/{}.html".format(
- self.root, board, thread["threadId"])
+ url = f"{self.root}/{board}/res/{thread['threadId']}.html"
yield Message.Queue, url, thread
pnum += 1
if pnum > data["pageCount"]:
return
- url = "{}/{}/{}.json".format(self.root, board, pnum)
- threads = self.request(url).json()["threads"]
+ url = f"{self.root}/{board}/{pnum}.json"
+ threads = self.request_json(url)["threads"]
diff --git a/gallery_dl/extractor/8muses.py b/gallery_dl/extractor/8muses.py
index 68b906e..120cd8a 100644
--- a/gallery_dl/extractor/8muses.py
+++ b/gallery_dl/extractor/8muses.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -26,8 +26,8 @@ class _8musesAlbumExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.path = match.group(1)
- self.params = match.group(2) or ""
+ self.path = match[1]
+ self.params = match[2] or ""
def items(self):
url = self.root + self.path + self.params
@@ -37,8 +37,7 @@ class _8musesAlbumExtractor(Extractor):
self.request(url).text,
'id="ractive-public" type="text/plain">', '</script>'))
- images = data.get("pictures")
- if images:
+ if images := data.get("pictures"):
count = len(images)
album = self._make_album(data["album"])
yield Message.Directory, {"album": album, "count": count}
@@ -54,8 +53,7 @@ class _8musesAlbumExtractor(Extractor):
}
yield Message.Url, url, img
- albums = data.get("albums")
- if albums:
+ if albums := data.get("albums"):
for album in albums:
permalink = album.get("permalink")
if not permalink:
@@ -74,8 +72,7 @@ class _8musesAlbumExtractor(Extractor):
return
path, _, num = self.path.rstrip("/").rpartition("/")
path = path if num.isdecimal() else self.path
- url = "{}{}/{}{}".format(
- self.root, path, data["page"] + 1, self.params)
+ url = f"{self.root}{path}/{data['page'] + 1}{self.params}"
def _make_album(self, album):
return {
@@ -92,8 +89,7 @@ class _8musesAlbumExtractor(Extractor):
album["updatedAt"], "%Y-%m-%dT%H:%M:%S.%fZ"),
}
- @staticmethod
- def _unobfuscate(data):
+ def _unobfuscate(self, data):
return util.json_loads("".join([
chr(33 + (ord(c) + 14) % 94) if "!" <= c <= "~" else c
for c in text.unescape(data.strip("\t\n\r !"))
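
_unobfuscate is ROT-47: 33 + (ord(c) + 14) % 94 rotates every printable
ASCII character by 47 positions within the '!'..'~' window, which makes
the transform its own inverse. A round-trip demonstration:

    import json

    def rot47(s):
        # same character map as _unobfuscate above
        return "".join(
            chr(33 + (ord(c) + 14) % 94) if "!" <= c <= "~" else c
            for c in s)

    obfuscated = rot47(json.dumps({"pictures": ["a.jpg"]}))
    assert json.loads(rot47(obfuscated)) == {"pictures": ["a.jpg"]}
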
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 2da471e..688f0a0 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -1,13 +1,13 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2023 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
import sys
-from ..util import re_compile
+from ..text import re_compile
modules = [
"2ch",
@@ -40,9 +40,11 @@ modules = [
"chevereto",
"cien",
"civitai",
+ "comick",
"comicvine",
"cyberdrop",
"danbooru",
+ "dankefuerslesen",
"desktopography",
"deviantart",
"discord",
@@ -63,6 +65,8 @@ modules = [
"gelbooru",
"gelbooru_v01",
"gelbooru_v02",
+ "girlsreleased",
+ "girlswithmuscle",
"gofile",
"hatenablog",
"hentai2read",
@@ -88,13 +92,14 @@ modules = [
"issuu",
"itaku",
"itchio",
+ "iwara",
"jschan",
"kabeuchi",
"keenspot",
- "kemonoparty",
+ "kemono",
"khinsider",
- "koharu",
"komikcast",
+ "leakgallery",
"lensdump",
"lexica",
"lightroom",
@@ -102,19 +107,20 @@ modules = [
"lofter",
"luscious",
"lynxchan",
+ "madokami",
"mangadex",
"mangafox",
"mangahere",
"manganelo",
"mangapark",
"mangaread",
- "mangasee",
"mangoxo",
"misskey",
"motherless",
"myhentaigallery",
"myportfolio",
- "naver",
+ "naverblog",
+ "naverchzzk",
"naverwebtoon",
"nekohouse",
"newgrounds",
@@ -123,6 +129,7 @@ modules = [
"nitter",
"nozomi",
"nsfwalbum",
+ "nudostar",
"paheal",
"patreon",
"pexels",
@@ -142,9 +149,11 @@ modules = [
"pornhub",
"pornpics",
"postmill",
+ "rawkuma",
"reactor",
"readcomiconline",
"realbooru",
+ "redbust",
"reddit",
"redgifs",
"rule34us",
@@ -153,6 +162,7 @@ modules = [
"saint",
"sankaku",
"sankakucomplex",
+ "schalenetwork",
"scrolller",
"seiga",
"senmanga",
@@ -226,8 +236,7 @@ modules = [
def find(url):
"""Find a suitable extractor for the given URL"""
for cls in _list_classes():
- match = cls.pattern.match(url)
- if match:
+ if match := cls.pattern.match(url):
return cls(match)
return None
@@ -242,8 +251,7 @@ def add(cls):
def add_module(module):
"""Add all extractors in 'module' to the list of available extractors"""
- classes = _get_classes(module)
- if classes:
+ if classes := _get_classes(module):
if isinstance(classes[0].pattern, str):
for cls in classes:
cls.pattern = re_compile(cls.pattern)
diff --git a/gallery_dl/extractor/adultempire.py b/gallery_dl/extractor/adultempire.py
index c891b17..3249ae6 100644
--- a/gallery_dl/extractor/adultempire.py
+++ b/gallery_dl/extractor/adultempire.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -22,7 +22,7 @@ class AdultempireGalleryExtractor(GalleryExtractor):
def __init__(self, match):
GalleryExtractor.__init__(self, match)
- self.gallery_id = match.group(2)
+ self.gallery_id = match[2]
def _init(self):
self.cookies.set("ageConfirmed", "true", domain="www.adultempire.com")
@@ -48,4 +48,4 @@ class AdultempireGalleryExtractor(GalleryExtractor):
if len(urls) < 24:
return
params["page"] += 1
- page = self.request(self.gallery_url, params=params).text
+ page = self.request(self.page_url, params=params).text
diff --git a/gallery_dl/extractor/agnph.py b/gallery_dl/extractor/agnph.py
index 653b73f..5bb1835 100644
--- a/gallery_dl/extractor/agnph.py
+++ b/gallery_dl/extractor/agnph.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2024 Mike Fährmann
+# Copyright 2024-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,11 +9,8 @@
"""Extractors for https://agn.ph/"""
from . import booru
-from .. import text
-
-from xml.etree import ElementTree
+from .. import text, util
import collections
-import re
BASE_PATTERN = r"(?:https?://)?agn\.ph"
@@ -52,8 +49,7 @@ class AgnphExtractor(booru.BooruExtractor):
params["page"] = self.page_start
while True:
- data = self.request(url, params=params).text
- root = ElementTree.fromstring(data)
+ root = self.request_xml(url, params=params)
yield from map(self._xml_to_dict, root)
@@ -64,7 +60,7 @@ class AgnphExtractor(booru.BooruExtractor):
params["page"] += 1
def _html(self, post):
- url = "{}/gallery/post/show/{}/".format(self.root, post["id"])
+ url = f"{self.root}/gallery/post/show/{post['id']}/"
return self.request(url).text
def _tags(self, post, page):
@@ -74,7 +70,7 @@ class AgnphExtractor(booru.BooruExtractor):
return
tags = collections.defaultdict(list)
- pattern = re.compile(r'class="(.)typetag">([^<]+)')
+ pattern = util.re(r'class="(.)typetag">([^<]+)')
for tag_type, tag_name in pattern.findall(tag_container):
tags[tag_type].append(text.unquote(tag_name).replace(" ", "_"))
for key, value in tags.items():
@@ -107,7 +103,6 @@ class AgnphPostExtractor(AgnphExtractor):
example = "https://agn.ph/gallery/post/show/12345/"
def posts(self):
- url = "{}/gallery/post/show/{}/?api=xml".format(
- self.root, self.groups[0])
- post = ElementTree.fromstring(self.request(url).text)
+ url = f"{self.root}/gallery/post/show/{self.groups[0]}/?api=xml"
+ post = self.request_xml(url)
return (self._xml_to_dict(post),)
diff --git a/gallery_dl/extractor/ao3.py b/gallery_dl/extractor/ao3.py
index d3ab846..2652acb 100644
--- a/gallery_dl/extractor/ao3.py
+++ b/gallery_dl/extractor/ao3.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2024 Mike Fährmann
+# Copyright 2024-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,7 @@
"""Extractors for https://archiveofourown.org/"""
-from .common import Extractor, Message
+from .common import Extractor, Message, Dispatch
from .. import text, util, exception
from ..cache import cache
@@ -135,7 +135,7 @@ class Ao3WorkExtractor(Ao3Extractor):
self.login()
work_id = self.groups[0]
- url = "{}/works/{}".format(self.root, work_id)
+ url = f"{self.root}/works/{work_id}"
response = self.request(url, notfound="work")
if response.url.endswith("/users/login?restricted=true"):
@@ -144,7 +144,7 @@ class Ao3WorkExtractor(Ao3Extractor):
page = response.text
if len(page) < 20000 and \
'<h2 class="landmark heading">Adult Content Warning</' in page:
- raise exception.StopExtraction("Adult Content")
+ raise exception.AbortExtraction("Adult Content")
extr = text.extract_from(page)
@@ -205,8 +205,7 @@ class Ao3WorkExtractor(Ao3Extractor):
}
data["language"] = util.code_to_language(data["lang"])
- series = data["series"]
- if series:
+ if series := data["series"]:
extr = text.extract_from(series)
data["series"] = {
"prev" : extr(' class="previous" href="/works/', '"'),
@@ -249,18 +248,14 @@ class Ao3SearchExtractor(Ao3Extractor):
example = "https://archiveofourown.org/works/search?work_search[query]=air"
-class Ao3UserExtractor(Ao3Extractor):
+class Ao3UserExtractor(Dispatch, Ao3Extractor):
"""Extractor for an AO3 user profile"""
- subcategory = "user"
pattern = (BASE_PATTERN + r"/users/([^/?#]+(?:/pseuds/[^/?#]+)?)"
r"(?:/profile)?/?(?:$|\?|#)")
example = "https://archiveofourown.org/users/USER"
- def initialize(self):
- pass
-
def items(self):
- base = "{}/users/{}/".format(self.root, self.groups[0])
+ base = f"{self.root}/users/{self.groups[0]}/"
return self._dispatch_extractors((
(Ao3UserWorksExtractor , base + "works"),
(Ao3UserSeriesExtractor , base + "series"),
diff --git a/gallery_dl/extractor/arcalive.py b/gallery_dl/extractor/arcalive.py
index 3c39a1a..1df7e0f 100644
--- a/gallery_dl/extractor/arcalive.py
+++ b/gallery_dl/extractor/arcalive.py
@@ -8,7 +8,6 @@
from .common import Extractor, Message
from .. import text, util, exception
-import re
BASE_PATTERN = r"(?:https?://)?(?:www\.)?arca\.live"
@@ -27,7 +26,7 @@ class ArcaliveExtractor(Extractor):
for article in self.articles():
article["_extractor"] = ArcalivePostExtractor
board = self.board or article.get("boardSlug") or "breaking"
- url = "{}/b/{}/{}".format(self.root, board, article["id"])
+ url = f"{self.root}/b/{board}/{article['id']}"
yield Message.Queue, url, article
@@ -52,8 +51,8 @@ class ArcalivePostExtractor(ArcaliveExtractor):
post["count"] = len(files)
post["date"] = text.parse_datetime(
post["createdAt"][:19], "%Y-%m-%dT%H:%M:%S")
- post["post_url"] = post_url = "{}/b/{}/{}".format(
- self.root, post["boardSlug"], post["id"])
+ post["post_url"] = post_url = \
+ f"{self.root}/b/{post['boardSlug']}/{post['id']}"
post["_http_headers"] = {"Referer": post_url + "?p=1"}
yield Message.Directory, post
@@ -65,8 +64,8 @@ class ArcalivePostExtractor(ArcaliveExtractor):
def _extract_files(self, post):
files = []
- for video, media in self._extract_media(post["content"]):
-
+ for video, media in util.re(r"<(?:img|vide(o)) ([^>]+)").findall(
+ post["content"]):
if not self.emoticons and 'class="arca-emoticon"' in media:
continue
@@ -75,36 +74,37 @@ class ArcalivePostExtractor(ArcaliveExtractor):
if not src:
continue
- src = text.unescape(src.partition("?")[0])
+ src, _, query = text.unescape(src).partition("?")
if src[0] == "/":
if src[1] == "/":
- url = "https:" + src
+ url = "https:" + src.replace(
+ "//ac-p.namu", "//ac-o.namu", 1)
else:
url = self.root + src
else:
url = src
fallback = ()
- orig = text.extr(media, 'data-orig="', '"')
- if orig:
+ query = f"?type=orig&{query}"
+ if orig := text.extr(media, 'data-orig="', '"'):
path, _, ext = url.rpartition(".")
if ext != orig:
- fallback = (url + "?type=orig",)
+ fallback = (url + query,)
url = path + "." + orig
elif video and self.gifs:
url_gif = url.rpartition(".")[0] + ".gif"
if self.gifs_fallback:
- fallback = (url + "?type=orig",)
+ fallback = (url + query,)
url = url_gif
else:
response = self.request(
- url_gif + "?type=orig", method="HEAD", fatal=False)
+ url_gif + query, method="HEAD", fatal=False)
if response.status_code < 400:
- fallback = (url + "?type=orig",)
+ fallback = (url + query,)
url = url_gif
files.append({
- "url" : url + "?type=orig",
+ "url" : url + query,
"width" : text.parse_int(text.extr(media, 'width="', '"')),
"height": text.parse_int(text.extr(media, 'height="', '"')),
"_fallback": fallback,
@@ -112,11 +112,6 @@ class ArcalivePostExtractor(ArcaliveExtractor):
return files
- def _extract_media(self, content):
- ArcalivePostExtractor._extract_media = extr = re.compile(
- r"<(?:img|vide(o)) ([^>]+)").findall
- return extr(content)
-
class ArcaliveBoardExtractor(ArcaliveExtractor):
"""Extractor for an arca.live board's posts"""
@@ -175,9 +170,8 @@ class ArcaliveAPI():
return data
self.log.debug("Server response: %s", data)
- msg = data.get("message")
- raise exception.StopExtraction(
- "API request failed%s", ": " + msg if msg else "")
+ msg = f": {msg}" if (msg := data.get("message")) else ""
+ raise exception.AbortExtraction(f"API request failed{msg}")
def _pagination(self, endpoint, params, key):
while True:
diff --git a/gallery_dl/extractor/architizer.py b/gallery_dl/extractor/architizer.py
index 911753b..e39d3d2 100644
--- a/gallery_dl/extractor/architizer.py
+++ b/gallery_dl/extractor/architizer.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2021-2023 Mike Fährmann
+# Copyright 2021-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -24,7 +24,7 @@ class ArchitizerProjectExtractor(GalleryExtractor):
example = "https://architizer.com/projects/NAME/"
def __init__(self, match):
- url = "{}/projects/{}/".format(self.root, match.group(1))
+ url = f"{self.root}/projects/{match[1]}/"
GalleryExtractor.__init__(self, match, url)
def metadata(self, page):
@@ -68,15 +68,14 @@ class ArchitizerFirmExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.firm = match.group(1)
+ self.firm = match[1]
def items(self):
- url = url = "{}/firms/{}/?requesting_merlin=pages".format(
- self.root, self.firm)
+ url = f"{self.root}/firms/{self.firm}/?requesting_merlin=pages"
page = self.request(url).text
data = {"_extractor": ArchitizerProjectExtractor}
for project in text.extract_iter(page, '<a href="/projects/', '"'):
if not project.startswith("q/"):
- url = "{}/projects/{}".format(self.root, project)
+ url = f"{self.root}/projects/{project}"
yield Message.Queue, url, data
diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py
index f448710..fdb92c4 100644
--- a/gallery_dl/extractor/artstation.py
+++ b/gallery_dl/extractor/artstation.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2023 Mike Fährmann
+# Copyright 2018-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -25,22 +25,22 @@ class ArtstationExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.user = match.group(1) or match.group(2)
+ self.user = match[1] or match[2]
def _init(self):
self.session.headers["Cache-Control"] = "max-age=0"
+ self.mviews = self.config("mviews", True)
+ self.videos = self.config("videos", True)
+ self.external = self.config("external", False)
+ self.previews = self.config("previews", False)
+ self.max_posts = self.config("max-posts")
def items(self):
- videos = self.config("videos", True)
- previews = self.config("previews", False)
- external = self.config("external", False)
- max_posts = self.config("max-posts")
-
data = self.metadata()
projects = self.projects()
- if max_posts:
- projects = itertools.islice(projects, max_posts)
+ if self.max_posts:
+ projects = itertools.islice(projects, self.max_posts)
for project in projects:
for num, asset in enumerate(
self.get_project_assets(project["hash_id"]), 1):
@@ -50,28 +50,10 @@ class ArtstationExtractor(Extractor):
yield Message.Directory, asset
if adict["has_embedded_player"]:
- player = adict["player_embedded"]
- url = (text.extr(player, 'src="', '"') or
- text.extr(player, "src='", "'"))
- if url.startswith(self.root):
- # video clip hosted on artstation
- if videos:
- page = self.request(url).text
- url = text.extr(page, ' src="', '"')
- text.nameext_from_url(url, asset)
- yield Message.Url, url, asset
- elif url:
- # external URL
- if external:
- asset["extension"] = "mp4"
- yield Message.Url, "ytdl:" + url, asset
- else:
- self.log.debug(player)
- self.log.warning(
- "Failed to extract embedded player URL (%s)",
- adict.get("id"))
-
- if not previews:
+ if url := self._extract_embed(asset):
+ text.nameext_from_url(url, asset)
+ yield Message.Url, url, asset
+ if not self.previews:
continue
if adict["has_image"]:
@@ -79,19 +61,50 @@ class ArtstationExtractor(Extractor):
text.nameext_from_url(url, asset)
url = self._no_cache(url)
- if "/video_clips/" not in url:
+ if "/images/images/" in url:
lhs, _, rhs = url.partition("/large/")
if rhs:
- url = lhs + "/4k/" + rhs
+ url = f"{lhs}/4k/{rhs}"
asset["_fallback"] = self._image_fallback(lhs, rhs)
yield Message.Url, url, asset
- @staticmethod
- def _image_fallback(lhs, rhs):
- yield lhs + "/large/" + rhs
- yield lhs + "/medium/" + rhs
- yield lhs + "/small/" + rhs
+ def _extract_embed(self, asset):
+ adict = asset["asset"]
+ player = adict["player_embedded"]
+ url = (text.extr(player, 'src="', '"') or
+ text.extr(player, "src='", "'"))
+
+ if url.startswith(self.root):
+ # embed or video clip hosted on artstation
+ type = text.extr(adict.get("image_url", ""), "/assets/", "/")
+ if type == "marmosets":
+ if not self.mviews:
+ return
+ page = self.request(url).text
+ return text.extr(page, "marmoset.embed(", '",').strip("\"' ")
+
+ elif type:
+ if not self.videos:
+ return
+ page = self.request(url).text
+ return text.extr(page, ' src="', '"')
+
+ if url:
+ # external URL
+ if not self.external:
+ return
+ asset["extension"] = "mp4"
+ return f"ytdl:{url}"
+
+ self.log.debug(player)
+ self.log.warning("Failed to extract embedded player URL (%s)",
+ adict.get("id"))
+
+ def _image_fallback(self, lhs, rhs):
+ yield f"{lhs}/large/{rhs}"
+ yield f"{lhs}/medium/{rhs}"
+ yield f"{lhs}/small/{rhs}"
def metadata(self):
"""Return general metadata"""
@@ -102,10 +115,10 @@ class ArtstationExtractor(Extractor):
def get_project_assets(self, project_id):
"""Return all assets associated with 'project_id'"""
- url = "{}/projects/{}.json".format(self.root, project_id)
+ url = f"{self.root}/projects/{project_id}.json"
try:
- data = self.request(url).json()
+ data = self.request_json(url)
except exception.HttpError as exc:
self.log.warning(exc)
return
@@ -130,7 +143,7 @@ class ArtstationExtractor(Extractor):
def get_user_info(self, username):
"""Return metadata for a specific user"""
- url = "{}/users/{}/quick.json".format(self.root, username.lower())
+ url = f"{self.root}/users/{username.lower()}/quick.json"
response = self.request(url, notfound="user")
return response.json()
@@ -153,7 +166,7 @@ class ArtstationExtractor(Extractor):
params["page"] = 1
while True:
- data = self.request(url, **kwargs).json()
+ data = self.request_json(url, **kwargs)
yield from data["data"]
total += len(data["data"])
@@ -168,12 +181,10 @@ class ArtstationExtractor(Extractor):
"Accept" : "*/*",
"Origin" : self.root,
}
- return self.request(
- url, method="POST", headers=headers, json={},
- ).json()["public_csrf_token"]
+ return self.request_json(
+ url, method="POST", headers=headers, json={})["public_csrf_token"]
- @staticmethod
- def _no_cache(url):
+ def _no_cache(self, url):
"""Cause a cache miss to prevent Cloudflare 'optimizations'
Cloudflare's 'Polish' optimization strips image metadata and may even
@@ -199,7 +210,7 @@ class ArtstationUserExtractor(ArtstationExtractor):
example = "https://www.artstation.com/USER"
def projects(self):
- url = "{}/users/{}/projects.json".format(self.root, self.user)
+ url = f"{self.root}/users/{self.user}/projects.json"
params = {"album_id": "all"}
return self._pagination(url, params)
@@ -217,7 +228,7 @@ class ArtstationAlbumExtractor(ArtstationExtractor):
def __init__(self, match):
ArtstationExtractor.__init__(self, match)
- self.album_id = text.parse_int(match.group(3))
+ self.album_id = text.parse_int(match[3])
def metadata(self):
userinfo = self.get_user_info(self.user)
@@ -235,7 +246,7 @@ class ArtstationAlbumExtractor(ArtstationExtractor):
}
def projects(self):
- url = "{}/users/{}/projects.json".format(self.root, self.user)
+ url = f"{self.root}/users/{self.user}/projects.json"
params = {"album_id": self.album_id}
return self._pagination(url, params)
@@ -250,7 +261,7 @@ class ArtstationLikesExtractor(ArtstationExtractor):
example = "https://www.artstation.com/USER/likes"
def projects(self):
- url = "{}/users/{}/likes.json".format(self.root, self.user)
+ url = f"{self.root}/users/{self.user}/likes.json"
return self._pagination(url)
@@ -266,19 +277,17 @@ class ArtstationCollectionExtractor(ArtstationExtractor):
def __init__(self, match):
ArtstationExtractor.__init__(self, match)
- self.collection_id = match.group(2)
+ self.collection_id = match[2]
def metadata(self):
- url = "{}/collections/{}.json".format(
- self.root, self.collection_id)
+ url = f"{self.root}/collections/{self.collection_id}.json"
params = {"username": self.user}
- collection = self.request(
- url, params=params, notfound="collection").json()
+ collection = self.request_json(
+ url, params=params, notfound="collection")
return {"collection": collection, "user": self.user}
def projects(self):
- url = "{}/collections/{}/projects.json".format(
- self.root, self.collection_id)
+ url = f"{self.root}/collections/{self.collection_id}/projects.json"
params = {"collection_id": self.collection_id}
return self._pagination(url, params)
@@ -294,10 +303,9 @@ class ArtstationCollectionsExtractor(ArtstationExtractor):
url = self.root + "/collections.json"
params = {"username": self.user}
- for collection in self.request(
- url, params=params, notfound="collections").json():
- url = "{}/{}/collections/{}".format(
- self.root, self.user, collection["id"])
+ for collection in self.request_json(
+ url, params=params, notfound="collections"):
+ url = f"{self.root}/{self.user}/collections/{collection['id']}"
collection["_extractor"] = ArtstationCollectionExtractor
yield Message.Queue, url, collection
@@ -316,18 +324,16 @@ class ArtstationChallengeExtractor(ArtstationExtractor):
def __init__(self, match):
ArtstationExtractor.__init__(self, match)
- self.challenge_id = match.group(1)
- self.sorting = match.group(2) or "popular"
+ self.challenge_id = match[1]
+ self.sorting = match[2] or "popular"
def items(self):
- challenge_url = "{}/contests/_/challenges/{}.json".format(
- self.root, self.challenge_id)
- submission_url = "{}/contests/_/challenges/{}/submissions.json".format(
- self.root, self.challenge_id)
- update_url = "{}/contests/submission_updates.json".format(
- self.root)
-
- challenge = self.request(challenge_url).json()
+ base = f"{self.root}/contests/_/challenges/{self.challenge_id}"
+ challenge_url = f"{base}.json"
+ submission_url = f"{base}/submissions.json"
+ update_url = f"{self.root}/contests/submission_updates.json"
+
+ challenge = self.request_json(challenge_url)
yield Message.Directory, {"challenge": challenge}
params = {"sorting": self.sorting}
@@ -344,8 +350,7 @@ class ArtstationChallengeExtractor(ArtstationExtractor):
text.nameext_from_url(url, update)
yield Message.Url, self._no_cache(url), update
- @staticmethod
- def _id_from_url(url):
+ def _id_from_url(self, url):
"""Get an image's submission ID from its URL"""
parts = url.split("/")
return text.parse_int("".join(parts[7:10]))
@@ -362,7 +367,7 @@ class ArtstationSearchExtractor(ArtstationExtractor):
def __init__(self, match):
ArtstationExtractor.__init__(self, match)
- self.params = query = text.parse_query(match.group(1))
+ self.params = query = text.parse_query(match[1])
self.query = text.unquote(query.get("query") or query.get("q", ""))
self.sorting = query.get("sort_by", "relevance").lower()
self.tags = query.get("tags", "").split(",")
@@ -384,7 +389,7 @@ class ArtstationSearchExtractor(ArtstationExtractor):
"value" : value.split(","),
})
- url = "{}/api/v2/search/projects.json".format(self.root)
+ url = f"{self.root}/api/v2/search/projects.json"
data = {
"query" : self.query,
"page" : None,
@@ -409,13 +414,13 @@ class ArtstationArtworkExtractor(ArtstationExtractor):
def __init__(self, match):
ArtstationExtractor.__init__(self, match)
- self.query = text.parse_query(match.group(1))
+ self.query = text.parse_query(match[1])
def metadata(self):
return {"artwork": self.query}
def projects(self):
- url = "{}/projects.json".format(self.root)
+ url = f"{self.root}/projects.json"
return self._pagination(url, self.query.copy())
@@ -429,7 +434,7 @@ class ArtstationImageExtractor(ArtstationExtractor):
def __init__(self, match):
ArtstationExtractor.__init__(self, match)
- self.project_id = match.group(1)
+ self.project_id = match[1]
self.assets = None
def metadata(self):
@@ -456,8 +461,8 @@ class ArtstationFollowingExtractor(ArtstationExtractor):
example = "https://www.artstation.com/USER/following"
def items(self):
- url = "{}/users/{}/following.json".format(self.root, self.user)
+ url = f"{self.root}/users/{self.user}/following.json"
for user in self._pagination(url):
- url = "{}/{}".format(self.root, user["username"])
+ url = f"{self.root}/{user['username']}"
user["_extractor"] = ArtstationUserExtractor
yield Message.Queue, url, user
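Several hunks above swap self.request(url).json() for a new self.request_json() helper whose definition is not part of this diff. A plausible minimal sketch of such a wrapper, written as a method on the common Extractor class (name and error handling assumed, not confirmed by this patch):

    def request_json(self, url, **kwargs):
        """Issue a request and decode the response body as JSON."""
        response = self.request(url, **kwargs)
        try:
            return response.json()
        except ValueError:
            # assumed fallback; the real helper may raise instead
            self.log.debug("Server response: %s", response.text)
            return {}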
diff --git a/gallery_dl/extractor/aryion.py b/gallery_dl/extractor/aryion.py
index ca88187..8a7cb04 100644
--- a/gallery_dl/extractor/aryion.py
+++ b/gallery_dl/extractor/aryion.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2020-2023 Mike Fährmann
+# Copyright 2020-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -29,7 +29,7 @@ class AryionExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.user = match.group(1)
+ self.user = match[1]
self.recursive = True
def login(self):
@@ -61,8 +61,7 @@ class AryionExtractor(Extractor):
data = self.metadata()
for post_id in self.posts():
- post = self._parse_post(post_id)
- if post:
+ if post := self._parse_post(post_id):
if data:
post.update(data)
yield Message.Directory, post
@@ -108,10 +107,10 @@ class AryionExtractor(Extractor):
pos = page.find("Next &gt;&gt;")
if pos < 0:
return
- url = self.root + text.rextract(page, "href='", "'", pos)[0]
+ url = self.root + text.rextr(page, "href='", "'", pos)
def _parse_post(self, post_id):
- url = "{}/g4/data.php?id={}".format(self.root, post_id)
+ url = f"{self.root}/g4/data.php?id={post_id}"
with self.request(url, method="HEAD", fatal=False) as response:
if response.status_code >= 400:
@@ -141,9 +140,9 @@ class AryionExtractor(Extractor):
# fix 'Last-Modified' header
lmod = headers["last-modified"]
if lmod[22] != ":":
- lmod = "{}:{} GMT".format(lmod[:22], lmod[22:24])
+ lmod = f"{lmod[:22]}:{lmod[22:24]} GMT"
- post_url = "{}/g4/view/{}".format(self.root, post_id)
+ post_url = f"{self.root}/g4/view/{post_id}"
extr = text.extract_from(self.request(post_url).text)
title, _, artist = text.unescape(extr(
@@ -195,10 +194,10 @@ class AryionGalleryExtractor(AryionExtractor):
def posts(self):
if self.recursive:
- url = "{}/g4/gallery/{}".format(self.root, self.user)
+ url = f"{self.root}/g4/gallery/{self.user}"
return self._pagination_params(url)
else:
- url = "{}/g4/latest.php?name={}".format(self.root, self.user)
+ url = f"{self.root}/g4/latest.php?name={self.user}"
return util.advance(self._pagination_next(url), self.offset)
@@ -212,9 +211,8 @@ class AryionFavoriteExtractor(AryionExtractor):
example = "https://aryion.com/g4/favorites/USER"
def posts(self):
- url = "{}/g4/favorites/{}".format(self.root, self.user)
- return self._pagination_params(
- url, None, "class='gallery-item favorite' id='")
+ url = f"{self.root}/g4/favorites/{self.user}"
+ return self._pagination_params(url, None, "data-item-id='")
class AryionTagExtractor(AryionExtractor):
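The aryion change above also replaces text.rextract(...)[0] with text.rextr(...). Judging from the call sites, rextr returns only the extracted substring instead of a (substring, position) tuple; a hedged sketch of the presumed pair of helpers:

    def rextract(txt, begin, end, pos=None):
        """Search backwards for 'begin'; return (substring, index) or (None, pos)."""
        try:
            first = txt.rindex(begin, 0, pos) + len(begin)
            last = txt.index(end, first)
            return txt[first:last], first
        except ValueError:
            return None, pos

    def rextr(txt, begin, end, pos=None):
        """Like rextract(), but return only the substring."""
        return rextract(txt, begin, end, pos)[0]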
diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py
index a1ad3ae..50e0c5d 100644
--- a/gallery_dl/extractor/batoto.py
+++ b/gallery_dl/extractor/batoto.py
@@ -7,8 +7,7 @@
"""Extractors for https://bato.to/"""
from .common import Extractor, ChapterExtractor, MangaExtractor
-from .. import text, exception
-import re
+from .. import text, util
BASE_PATTERN = (r"(?:https?://)?("
r"(?:ba|d|f|h|j|m|w)to\.to|"
@@ -87,7 +86,7 @@ class BatotoChapterExtractor(BatotoBase, ChapterExtractor):
ChapterExtractor.__init__(self, match, False)
self._init_root()
self.chapter_id = self.groups[1]
- self.gallery_url = "{}/title/0/{}".format(self.root, self.chapter_id)
+ self.page_url = f"{self.root}/title/0/{self.chapter_id}"
def metadata(self, page):
extr = text.extract_from(page)
@@ -104,9 +103,9 @@ class BatotoChapterExtractor(BatotoBase, ChapterExtractor):
info = text.remove_html(extr('link-hover">', "</"))
info = text.unescape(info)
- match = re.match(
+ match = util.re(
r"(?i)(?:(?:Volume|S(?:eason)?)\s*(\d+)\s+)?"
- r"(?:Chapter|Episode)\s*(\d+)([\w.]*)", info)
+ r"(?:Chapter|Episode)\s*(\d+)([\w.]*)").match(info)
if match:
volume, chapter, minor = match.groups()
else:
@@ -148,14 +147,13 @@ class BatotoMangaExtractor(BatotoBase, MangaExtractor):
MangaExtractor.__init__(self, match, False)
self._init_root()
self.manga_id = self.groups[1] or self.groups[2]
- self.manga_url = "{}/title/{}".format(self.root, self.manga_id)
+ self.page_url = f"{self.root}/title/{self.manga_id}"
def chapters(self, page):
extr = text.extract_from(page)
- warning = extr(' class="alert alert-warning">', "</div><")
- if warning:
- raise exception.StopExtraction("'%s'", text.remove_html(warning))
+ if warning := extr(' class="alert alert-warning">', "</div>"):
+ self.log.warning("'%s'", text.remove_html(warning))
data = {
"manga_id": text.parse_int(self.manga_id),
@@ -178,6 +176,6 @@ class BatotoMangaExtractor(BatotoBase, MangaExtractor):
data["date"] = text.parse_datetime(
extr('time="', '"'), "%Y-%m-%dT%H:%M:%S.%fZ")
- url = "{}/title/{}".format(self.root, href)
+ url = f"{self.root}/title/{href}"
results.append((url, data.copy()))
return results
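batoto (and blogger further down) now calls util.re(pattern) instead of re.compile(pattern). The helper itself is not shown in this diff; presumably it memoizes compiled patterns so that call sites inside hot loops stay cheap. A minimal sketch under that assumption, using a hypothetical name to avoid claiming the real implementation:

    import functools
    import re

    @functools.lru_cache(maxsize=None)
    def re_cached(pattern):
        """Compile 'pattern' once; later calls with the same pattern reuse the object."""
        return re.compile(pattern)

    match = re_cached(r"(?:Chapter|Episode)\s*(\d+)").match("Chapter 12")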
diff --git a/gallery_dl/extractor/bbc.py b/gallery_dl/extractor/bbc.py
index b398152..8efb3db 100644
--- a/gallery_dl/extractor/bbc.py
+++ b/gallery_dl/extractor/bbc.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2021-2023 Mike Fährmann
+# Copyright 2021-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -33,7 +33,7 @@ class BbcGalleryExtractor(GalleryExtractor):
page, "<h1>", "</h1>").rpartition("</span>")[2]),
"description": text.unescape(text.extr(
page, 'property="og:description" content="', '"')),
- "programme": self.gallery_url.split("/")[4],
+ "programme": self.page_url.split("/")[4],
"path": list(util.unique_sequence(
element["name"]
for element in data["itemListElement"]
@@ -43,7 +43,7 @@ class BbcGalleryExtractor(GalleryExtractor):
def images(self, page):
width = self.config("width")
width = width - width % 16 if width else 1920
- dimensions = "/{}xn/".format(width)
+ dimensions = f"/{width}xn/"
results = []
for img in text.extract_iter(page, 'class="gallery__thumbnail', ">"):
@@ -60,12 +60,11 @@ class BbcGalleryExtractor(GalleryExtractor):
))
return results
- @staticmethod
- def _fallback_urls(src, max_width):
+ def _fallback_urls(self, src, max_width):
front, _, back = src.partition("/320x180_b/")
for width in (1920, 1600, 1280, 976):
if width < max_width:
- yield "{}/{}xn/{}".format(front, width, back)
+ yield f"{front}/{width}xn/{back}"
class BbcProgrammeExtractor(Extractor):
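The bbc _fallback_urls() generator above yields progressively narrower variants of an image URL. For illustration, with an assumed source URL, the rewritten f-string produces the following (the real method additionally skips widths not below max_width):

    src = "https://ichef.bbci.co.uk/images/ic/320x180_b/p0example.jpg"  # hypothetical
    front, _, back = src.partition("/320x180_b/")
    for width in (1920, 1600, 1280, 976):
        print(f"{front}/{width}xn/{back}")
    # https://ichef.bbci.co.uk/images/ic/1920xn/p0example.jpg
    # ... and likewise for 1600, 1280, and 976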
diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py
index 14598b7..4a7c074 100644
--- a/gallery_dl/extractor/behance.py
+++ b/gallery_dl/extractor/behance.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2023 Mike Fährmann
+# Copyright 2018-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -17,6 +17,8 @@ class BehanceExtractor(Extractor):
category = "behance"
root = "https://www.behance.net"
request_interval = (2.0, 4.0)
+ browser = "firefox"
+ tls12 = False
def _init(self):
self._bcp = self.cookies.get("bcp", domain="www.behance.net")
@@ -44,15 +46,15 @@ class BehanceExtractor(Extractor):
"variables": variables,
}
- return self.request(url, method="POST", headers=headers,
- json=data).json()["data"]
+ return self.request_json(
+ url, method="POST", headers=headers, json=data)["data"]
def _update(self, data):
# compress data to simple lists
- if data.get("fields") and isinstance(data["fields"][0], dict):
+ if (fields := data.get("fields")) and isinstance(fields[0], dict):
data["fields"] = [
field.get("name") or field.get("label")
- for field in data["fields"]
+ for field in fields
]
data["owners"] = [
@@ -68,6 +70,9 @@ class BehanceExtractor(Extractor):
data["date"] = text.parse_timestamp(
data.get("publishedOn") or data.get("conceived_on") or 0)
+ if creator := data.get("creator"):
+ creator["name"] = creator["url"].rpartition("/")[2]
+
# backwards compatibility
data["gallery_id"] = data["id"]
data["title"] = data["name"]
@@ -87,13 +92,12 @@ class BehanceGalleryExtractor(BehanceExtractor):
def __init__(self, match):
BehanceExtractor.__init__(self, match)
- self.gallery_id = match.group(1)
+ self.gallery_id = match[1]
def _init(self):
BehanceExtractor._init(self)
- modules = self.config("modules")
- if modules:
+ if modules := self.config("modules"):
if isinstance(modules, str):
modules = modules.split(",")
self.modules = set(modules)
@@ -114,12 +118,15 @@ class BehanceGalleryExtractor(BehanceExtractor):
def get_gallery_data(self):
"""Collect gallery info dict"""
- url = "{}/gallery/{}/a".format(self.root, self.gallery_id)
+ url = f"{self.root}/gallery/{self.gallery_id}/a"
cookies = {
- "gki": '{"feature_project_view":false,'
- '"feature_discover_login_prompt":false,'
- '"feature_project_login_prompt":false}',
+ "gk_suid": "14118261",
+ "gki": "feature_3_in_1_checkout_test:false,hire_browse_get_quote_c"
+ "ta_ab_test:false,feature_hire_dashboard_services_ab_test:f"
+ "alse,feature_show_details_jobs_row_ab_test:false,feature_a"
+ "i_freelance_project_create_flow:false,",
"ilo0": "true",
+ "originalReferrer": "",
}
page = self.request(url, cookies=cookies).text
@@ -141,9 +148,7 @@ class BehanceGalleryExtractor(BehanceExtractor):
raise exception.AuthorizationError()
return ()
- result = []
- append = result.append
-
+ results = []
for module in data["modules"]:
mtype = module["__typename"][:-6].lower()
@@ -161,7 +166,7 @@ class BehanceGalleryExtractor(BehanceExtractor):
sizes.get("fs") or
sizes.get("hd") or
sizes.get("disp"))
- append((size["url"], module))
+ results.append((size["url"], module))
elif mtype == "video":
try:
@@ -173,7 +178,7 @@ class BehanceGalleryExtractor(BehanceExtractor):
url = "ytdl:" + url
module["_ytdl_manifest"] = "hls"
module["extension"] = "mp4"
- append((url, module))
+ results.append((url, module))
continue
except Exception as exc:
self.log.debug("%s: %s", exc.__class__.__name__, exc)
@@ -194,7 +199,7 @@ class BehanceGalleryExtractor(BehanceExtractor):
self.log.debug("%s: %s", exc.__class__.__name__, exc)
url = "ytdl:" + renditions[-1]["url"]
- append((url, module))
+ results.append((url, module))
elif mtype == "mediacollection":
for component in module["components"]:
@@ -202,21 +207,21 @@ class BehanceGalleryExtractor(BehanceExtractor):
if size:
parts = size["url"].split("/")
parts[4] = "source"
- append(("/".join(parts), module))
+ results.append(("/".join(parts), module))
break
elif mtype == "embed":
- embed = module.get("originalEmbed") or module.get("fluidEmbed")
- if embed:
+ if embed := (module.get("originalEmbed") or
+ module.get("fluidEmbed")):
embed = text.unescape(text.extr(embed, 'src="', '"'))
module["extension"] = "mp4"
- append(("ytdl:" + embed, module))
+ results.append(("ytdl:" + embed, module))
elif mtype == "text":
module["extension"] = "txt"
- append(("text:" + module["text"], module))
+ results.append(("text:" + module["text"], module))
- return result
+ return results
class BehanceUserExtractor(BehanceExtractor):
@@ -228,7 +233,7 @@ class BehanceUserExtractor(BehanceExtractor):
def __init__(self, match):
BehanceExtractor.__init__(self, match)
- self.user = match.group(1)
+ self.user = match[1]
def galleries(self):
endpoint = "GetProfileProjects"
@@ -256,7 +261,7 @@ class BehanceCollectionExtractor(BehanceExtractor):
def __init__(self, match):
BehanceExtractor.__init__(self, match)
- self.collection_id = match.group(1)
+ self.collection_id = match[1]
def galleries(self):
endpoint = "GetMoodboardItemsAndRecommendations"
diff --git a/gallery_dl/extractor/bilibili.py b/gallery_dl/extractor/bilibili.py
index 597ec40..3f0acff 100644
--- a/gallery_dl/extractor/bilibili.py
+++ b/gallery_dl/extractor/bilibili.py
@@ -19,20 +19,15 @@ class BilibiliExtractor(Extractor):
def _init(self):
self.api = BilibiliAPI(self)
-
-class BilibiliUserArticlesExtractor(BilibiliExtractor):
- """Extractor for a bilibili user's articles"""
- subcategory = "user-articles"
- pattern = (r"(?:https?://)?space\.bilibili\.com/(\d+)"
- r"/(?:article|upload/opus)")
- example = "https://space.bilibili.com/12345/article"
-
def items(self):
- for article in self.api.user_articles(self.groups[0]):
+ for article in self.articles():
article["_extractor"] = BilibiliArticleExtractor
- url = "{}/opus/{}".format(self.root, article["opus_id"])
+ url = f"{self.root}/opus/{article['opus_id']}"
yield Message.Queue, url, article
+ def articles(self):
+ return ()
+
class BilibiliArticleExtractor(BilibiliExtractor):
"""Extractor for a bilibili article"""
@@ -45,12 +40,16 @@ class BilibiliArticleExtractor(BilibiliExtractor):
archive_fmt = "{id}_{num}"
def items(self):
- article = self.api.article(self.groups[0])
+ article_id = self.groups[0]
+ article = self.api.article(article_id)
# Flatten modules list
modules = {}
for module in article["detail"]["modules"]:
- del module['module_type']
+ if module["module_type"] == "MODULE_TYPE_BLOCKED":
+ self.log.warning("%s: Blocked Article\n%s", article_id,
+ module["module_blocked"].get("hint_message"))
+ del module["module_type"]
modules.update(module)
article["detail"]["modules"] = modules
@@ -64,14 +63,15 @@ class BilibiliArticleExtractor(BilibiliExtractor):
except Exception:
pass
- for paragraph in modules['module_content']['paragraphs']:
- if "pic" not in paragraph:
- continue
+ if "module_content" in modules:
+ for paragraph in modules["module_content"]["paragraphs"]:
+ if "pic" not in paragraph:
+ continue
- try:
- pics.extend(paragraph["pic"]["pics"])
- except Exception:
- pass
+ try:
+ pics.extend(paragraph["pic"]["pics"])
+ except Exception:
+ pass
article["count"] = len(pics)
yield Message.Directory, article
@@ -81,6 +81,17 @@ class BilibiliArticleExtractor(BilibiliExtractor):
yield Message.Url, url, text.nameext_from_url(url, article)
+class BilibiliUserArticlesExtractor(BilibiliExtractor):
+ """Extractor for a bilibili user's articles"""
+ subcategory = "user-articles"
+ pattern = (r"(?:https?://)?space\.bilibili\.com/(\d+)"
+ r"/(?:article|upload/opus)")
+ example = "https://space.bilibili.com/12345/article"
+
+ def articles(self):
+ return self.api.user_articles(self.groups[0])
+
+
class BilibiliUserArticlesFavoriteExtractor(BilibiliExtractor):
subcategory = "user-articles-favorite"
pattern = (r"(?:https?://)?space\.bilibili\.com"
@@ -88,18 +99,12 @@ class BilibiliUserArticlesFavoriteExtractor(BilibiliExtractor):
example = "https://space.bilibili.com/12345/favlist?fid=opus"
_warning = True
- def _init(self):
- BilibiliExtractor._init(self)
+ def articles(self):
if self._warning:
if not self.cookies_check(("SESSDATA",)):
self.log.error("'SESSDATA' cookie required")
BilibiliUserArticlesFavoriteExtractor._warning = False
-
- def items(self):
- for article in self.api.user_favlist():
- article["_extractor"] = BilibiliArticleExtractor
- url = "{}/opus/{}".format(self.root, article["opus_id"])
- yield Message.Queue, url, article
+ return self.api.user_favlist()
class BilibiliAPI():
@@ -108,11 +113,11 @@ class BilibiliAPI():
def _call(self, endpoint, params):
url = "https://api.bilibili.com/x/polymer/web-dynamic/v1" + endpoint
- data = self.extractor.request(url, params=params).json()
+ data = self.extractor.request_json(url, params=params)
- if data["code"] != 0:
+ if data["code"]:
self.extractor.log.debug("Server response: %s", data)
- raise exception.StopExtraction("API request failed")
+ raise exception.AbortExtraction("API request failed")
return data
@@ -140,8 +145,8 @@ class BilibiliAPI():
page, "window.__INITIAL_STATE__=", "};") + "}")
except Exception:
if "window._riskdata_" not in page:
- raise exception.StopExtraction(
- "%s: Unable to extract INITIAL_STATE data", article_id)
+ raise exception.AbortExtraction(
+ f"{article_id}: Unable to extract INITIAL_STATE data")
self.extractor.wait(seconds=300)
def user_favlist(self):
@@ -159,12 +164,13 @@ class BilibiliAPI():
def login_user_id(self):
url = "https://api.bilibili.com/x/space/v2/myinfo"
- data = self.extractor.request(url).json()
+ data = self.extractor.request_json(url)
if data["code"] != 0:
self.extractor.log.debug("Server response: %s", data)
- raise exception.StopExtraction("API request failed,Are you login?")
+ raise exception.AbortExtraction(
+ "API request failed. Are you logged in?")
try:
return data["data"]["profile"]["mid"]
except Exception:
- raise exception.StopExtraction("API request failed")
+ raise exception.AbortExtraction("API request failed")
diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py
index ef117da..796d9d1 100644
--- a/gallery_dl/extractor/blogger.py
+++ b/gallery_dl/extractor/blogger.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -10,7 +10,12 @@
from .common import BaseExtractor, Message
from .. import text, util
-import re
+
+
+def original(url):
+ return (util.re(r"(/|=)(?:[sw]\d+|w\d+-h\d+)(?=/|$)")
+ .sub(r"\1s0", url)
+ .replace("http:", "https:", 1))
class BloggerExtractor(BaseExtractor):
@@ -33,13 +38,12 @@ class BloggerExtractor(BaseExtractor):
blog["date"] = text.parse_datetime(blog["published"])
del blog["selfLink"]
- sub = re.compile(r"(/|=)(?:[sw]\d+|w\d+-h\d+)(?=/|$)").sub
- findall_image = re.compile(
+ findall_image = util.re(
r'src="(https?://(?:'
r'blogger\.googleusercontent\.com/img|'
r'lh\d+(?:-\w+)?\.googleusercontent\.com|'
r'\d+\.bp\.blogspot\.com)/[^"]+)').findall
- findall_video = re.compile(
+ findall_video = util.re(
r'src="(https?://www\.blogger\.com/video\.g\?token=[^"]+)').findall
metadata = self.metadata()
@@ -48,7 +52,7 @@ class BloggerExtractor(BaseExtractor):
files = findall_image(content)
for idx, url in enumerate(files):
- files[idx] = sub(r"\1s0", url).replace("http:", "https:", 1)
+ files[idx] = original(url)
if self.videos and 'id="BLOG_video-' in content:
page = self.request(post["url"]).text
@@ -98,12 +102,8 @@ class BloggerPostExtractor(BloggerExtractor):
pattern = BASE_PATTERN + r"(/\d\d\d\d/\d\d/[^/?#]+\.html)"
example = "https://BLOG.blogspot.com/1970/01/TITLE.html"
- def __init__(self, match):
- BloggerExtractor.__init__(self, match)
- self.path = match.group(match.lastindex)
-
def posts(self, blog):
- return (self.api.post_by_path(blog["id"], self.path),)
+ return (self.api.post_by_path(blog["id"], self.groups[-1]),)
class BloggerBlogExtractor(BloggerExtractor):
@@ -122,16 +122,13 @@ class BloggerSearchExtractor(BloggerExtractor):
pattern = BASE_PATTERN + r"/search/?\?q=([^&#]+)"
example = "https://BLOG.blogspot.com/search?q=QUERY"
- def __init__(self, match):
- BloggerExtractor.__init__(self, match)
- self.query = text.unquote(match.group(match.lastindex))
+ def metadata(self):
+ self.query = query = text.unquote(self.groups[-1])
+ return {"query": query}
def posts(self, blog):
return self.api.blog_search(blog["id"], self.query)
- def metadata(self):
- return {"query": self.query}
-
class BloggerLabelExtractor(BloggerExtractor):
"""Extractor for Blogger posts by label"""
@@ -139,21 +136,18 @@ class BloggerLabelExtractor(BloggerExtractor):
pattern = BASE_PATTERN + r"/search/label/([^/?#]+)"
example = "https://BLOG.blogspot.com/search/label/LABEL"
- def __init__(self, match):
- BloggerExtractor.__init__(self, match)
- self.label = text.unquote(match.group(match.lastindex))
+ def metadata(self):
+ self.label = label = text.unquote(self.groups[-1])
+ return {"label": label}
def posts(self, blog):
return self.api.blog_posts(blog["id"], self.label)
- def metadata(self):
- return {"label": self.label}
-
class BloggerAPI():
- """Minimal interface for the Blogger v3 API
+ """Minimal interface for the Blogger API v3
- Ref: https://developers.google.com/blogger
+ https://developers.google.com/blogger
"""
API_KEY = "AIzaSyCN9ax34oMMyM07g_M-5pjeDp_312eITK8"
@@ -162,27 +156,27 @@ class BloggerAPI():
self.api_key = extractor.config("api-key") or self.API_KEY
def blog_by_url(self, url):
- return self._call("blogs/byurl", {"url": url}, "blog")
+ return self._call("/blogs/byurl", {"url": url}, "blog")
def blog_posts(self, blog_id, label=None):
- endpoint = "blogs/{}/posts".format(blog_id)
+ endpoint = f"/blogs/{blog_id}/posts"
params = {"labels": label}
return self._pagination(endpoint, params)
def blog_search(self, blog_id, query):
- endpoint = "blogs/{}/posts/search".format(blog_id)
+ endpoint = f"/blogs/{blog_id}/posts/search"
params = {"q": query}
return self._pagination(endpoint, params)
def post_by_path(self, blog_id, path):
- endpoint = "blogs/{}/posts/bypath".format(blog_id)
+ endpoint = f"/blogs/{blog_id}/posts/bypath"
return self._call(endpoint, {"path": path}, "post")
def _call(self, endpoint, params, notfound=None):
- url = "https://www.googleapis.com/blogger/v3/" + endpoint
+ url = "https://www.googleapis.com/blogger/v3" + endpoint
params["key"] = self.api_key
- return self.extractor.request(
- url, params=params, notfound=notfound).json()
+ return self.extractor.request_json(
+ url, params=params, notfound=notfound)
def _pagination(self, endpoint, params):
while True:
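The new module-level original() helper in blogger rewrites a sized image URL into its full-resolution "s0" form and forces https; the regex replaces size segments such as /s1600/, /w400/, or a w640-h480 suffix. A usage sketch with a hypothetical URL:

    url = "http://blogger.googleusercontent.com/img/a/ABCD=w640-h480"  # hypothetical
    print(original(url))
    # https://blogger.googleusercontent.com/img/a/ABCD=s0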
diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py
index 6f4abd5..e2c5334 100644
--- a/gallery_dl/extractor/bluesky.py
+++ b/gallery_dl/extractor/bluesky.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2024 Mike Fährmann
+# Copyright 2024-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,7 @@
"""Extractors for https://bsky.app/"""
-from .common import Extractor, Message
+from .common import Extractor, Message, Dispatch
from .. import text, util, exception
from ..cache import cache, memcache
@@ -26,8 +26,7 @@ class BlueskyExtractor(Extractor):
root = "https://bsky.app"
def _init(self):
- meta = self.config("metadata") or ()
- if meta:
+ if meta := self.config("metadata") or ():
if isinstance(meta, str):
meta = meta.replace(" ", "").split(",")
elif not isinstance(meta, (list, tuple)):
@@ -62,9 +61,8 @@ class BlueskyExtractor(Extractor):
yield Message.Directory, post
if files:
did = post["author"]["did"]
- base = (
- "{}/xrpc/com.atproto.sync.getBlob?did={}&cid=".format(
- self.api.service_endpoint(did), did))
+ base = (f"{self.api.service_endpoint(did)}/xrpc"
+ f"/com.atproto.sync.getBlob?did={did}&cid=")
for post["num"], file in enumerate(files, 1):
post.update(file)
yield Message.Url, base + file["filename"], post
@@ -96,7 +94,7 @@ class BlueskyExtractor(Extractor):
uri = record["value"]["subject"]["uri"]
if "/app.bsky.feed.post/" in uri:
yield from self.api.get_post_thread_uri(uri, depth)
- except exception.StopExtraction:
+ except exception.ControlException:
pass # deleted post
except Exception as exc:
self.log.debug(record, exc_info=exc)
@@ -210,16 +208,12 @@ class BlueskyExtractor(Extractor):
},)
-class BlueskyUserExtractor(BlueskyExtractor):
- subcategory = "user"
+class BlueskyUserExtractor(Dispatch, BlueskyExtractor):
pattern = USER_PATTERN + r"$"
example = "https://bsky.app/profile/HANDLE"
- def initialize(self):
- pass
-
def items(self):
- base = "{}/profile/{}/".format(self.root, self.groups[0])
+ base = f"{self.root}/profile/{self.groups[0]}/"
default = ("posts" if self.config("quoted", False) or
self.config("reposts", False) else "media")
return self._dispatch_extractors((
@@ -415,11 +409,9 @@ class BlueskyAPI():
def get_feed(self, actor, feed):
endpoint = "app.bsky.feed.getFeed"
- params = {
- "feed" : "at://{}/app.bsky.feed.generator/{}".format(
- self._did_from_actor(actor), feed),
- "limit": "100",
- }
+ uri = (f"at://{self._did_from_actor(actor)}"
+ f"/app.bsky.feed.generator/{feed}")
+ params = {"feed": uri, "limit": "100"}
return self._pagination(endpoint, params)
def get_follows(self, actor):
@@ -432,16 +424,13 @@ class BlueskyAPI():
def get_list_feed(self, actor, list):
endpoint = "app.bsky.feed.getListFeed"
- params = {
- "list" : "at://{}/app.bsky.graph.list/{}".format(
- self._did_from_actor(actor), list),
- "limit": "100",
- }
+ uri = f"at://{self._did_from_actor(actor)}/app.bsky.graph.list/{list}"
+ params = {"list" : uri, "limit": "100"}
return self._pagination(endpoint, params)
def get_post_thread(self, actor, post_id):
- uri = "at://{}/app.bsky.feed.post/{}".format(
- self._did_from_actor(actor), post_id)
+ uri = (f"at://{self._did_from_actor(actor)}"
+ f"/app.bsky.feed.post/{post_id}")
depth = self.extractor.config("depth", "0")
return self.get_post_thread_uri(uri, depth)
@@ -498,7 +487,7 @@ class BlueskyAPI():
url = "https://plc.directory/" + did
try:
- data = self.extractor.request(url).json()
+ data = self.extractor.request_json(url)
for service in data["service"]:
if service["type"] == "AtprotoPersonalDataServer":
return service["serviceEndpoint"]
@@ -551,15 +540,15 @@ class BlueskyAPI():
"password" : self.password,
}
- url = "{}/xrpc/{}".format(self.root, endpoint)
+ url = f"{self.root}/xrpc/{endpoint}"
response = self.extractor.request(
url, method="POST", headers=headers, json=data, fatal=None)
data = response.json()
if response.status_code != 200:
self.log.debug("Server response: %s", data)
- raise exception.AuthenticationError('"{}: {}"'.format(
- data.get("error"), data.get("message")))
+ raise exception.AuthenticationError(
+ f"\"{data.get('error')}: {data.get('message')}\"")
_refresh_token_cache.update(self.username, data["refreshJwt"])
return "Bearer " + data["accessJwt"]
@@ -567,7 +556,7 @@ class BlueskyAPI():
def _call(self, endpoint, params, root=None):
if root is None:
root = self.root
- url = "{}/xrpc/{}".format(root, endpoint)
+ url = f"{root}/xrpc/{endpoint}"
while True:
self.authenticate()
@@ -581,16 +570,15 @@ class BlueskyAPI():
self.extractor.wait(until=until)
continue
+ msg = "API request failed"
try:
data = response.json()
- msg = "API request failed ('{}: {}')".format(
- data["error"], data["message"])
+ msg = f"{msg} ('{data['error']}: {data['message']}')"
except Exception:
- msg = "API request failed ({} {})".format(
- response.status_code, response.reason)
+ msg = f"{msg} ({response.status_code} {response.reason})"
self.extractor.log.debug("Server response: %s", response.text)
- raise exception.StopExtraction(msg)
+ raise exception.AbortExtraction(msg)
def _pagination(self, endpoint, params,
key="feed", root=None, check_empty=False):
diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py
index 7e26f38..3b97e9a 100644
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@@ -27,8 +27,7 @@ class BooruExtractor(BaseExtractor):
notes = self.config("notes", False)
fetch_html = tags or notes
- url_key = self.config("url")
- if url_key:
+ if url_key := self.config("url"):
if isinstance(url_key, (list, tuple)):
self._file_url = self._file_url_list
self._file_url_keys = url_key
diff --git a/gallery_dl/extractor/boosty.py b/gallery_dl/extractor/boosty.py
index f3e441b..e0383bf 100644
--- a/gallery_dl/extractor/boosty.py
+++ b/gallery_dl/extractor/boosty.py
@@ -144,8 +144,7 @@ class BoostyExtractor(Extractor):
url = block["url"]
sep = "&" if "?" in url else "?"
- signed_query = post.get("signedQuery")
- if signed_query:
+ if signed_query := post.get("signedQuery"):
url += sep + signed_query[1:]
sep = "&"
@@ -218,7 +217,7 @@ class BoostyFollowingExtractor(BoostyExtractor):
def items(self):
for user in self.api.user_subscriptions():
- url = "{}/{}".format(self.root, user["blog"]["blogUrl"])
+ url = f"{self.root}/{user['blog']['blogUrl']}"
user["_extractor"] = BoostyUserExtractor
yield Message.Queue, url, user
@@ -280,15 +279,14 @@ class BoostyAPI():
}
if not access_token:
- auth = self.extractor.cookies.get("auth", domain=".boosty.to")
- if auth:
+ if auth := self.extractor.cookies.get("auth", domain=".boosty.to"):
access_token = text.extr(
auth, "%22accessToken%22%3A%22", "%22")
if access_token:
self.headers["Authorization"] = "Bearer " + access_token
def blog_posts(self, username, params):
- endpoint = "/v1/blog/{}/post/".format(username)
+ endpoint = f"/v1/blog/{username}/post/"
params = self._merge_params(params, {
"limit" : "5",
"offset" : None,
@@ -298,7 +296,7 @@ class BoostyAPI():
return self._pagination(endpoint, params)
def blog_media_album(self, username, type="all", params=()):
- endpoint = "/v1/blog/{}/media_album/".format(username)
+ endpoint = f"/v1/blog/{username}/media_album/"
params = self._merge_params(params, {
"type" : type.rstrip("s"),
"limit" : "15",
@@ -318,7 +316,7 @@ class BoostyAPI():
return posts
def post(self, username, post_id):
- endpoint = "/v1/blog/{}/post/{}".format(username, post_id)
+ endpoint = f"/v1/blog/{username}/post/{post_id}"
return self._call(endpoint)
def feed_posts(self, params=None):
@@ -381,7 +379,7 @@ class BoostyAPI():
else:
self.extractor.log.debug(response.text)
- raise exception.StopExtraction("API request failed")
+ raise exception.AbortExtraction("API request failed")
def _pagination(self, endpoint, params, transform=None, key=None):
if "is_only_allowed" not in params and self.extractor.only_allowed:
@@ -418,11 +416,11 @@ class BoostyAPI():
params["offset"] = offset
def dialog(self, dialog_id):
- endpoint = "/v1/dialog/{}".format(dialog_id)
+ endpoint = f"/v1/dialog/{dialog_id}"
return self._call(endpoint)
def dialog_messages(self, dialog_id, limit=300, offset=None):
- endpoint = "/v1/dialog/{}/message/".format(dialog_id)
+ endpoint = f"/v1/dialog/{dialog_id}/message/"
params = {
"limit": limit,
"reverse": "true",
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 481e962..eba1678 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2022-2023 Mike Fährmann
+# Copyright 2022-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -61,6 +61,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
category = "bunkr"
root = "https://bunkr.si"
root_dl = "https://get.bunkrr.su"
+ root_api = "https://apidl.bunkr.ru"
archive_fmt = "{album_id}_{id|id_url}"
pattern = BASE_PATTERN + r"/a/([^/?#]+)"
example = "https://bunkr.si/a/ID"
@@ -76,9 +77,9 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
endpoint = self.config("endpoint")
if not endpoint:
- endpoint = self.root_dl + "/api/_001"
+ endpoint = self.root_api + "/api/_001_v2"
elif endpoint[0] == "/":
- endpoint = self.root_dl + endpoint
+ endpoint = self.root_api + endpoint
self.endpoint = endpoint
self.offset = 0
@@ -123,7 +124,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
pass
else:
if not DOMAINS:
- raise exception.StopExtraction(
+ raise exception.AbortExtraction(
"All Bunkr domains require solving a CF challenge")
# select alternative domain
@@ -168,7 +169,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
info[-1], "%H:%M:%S %d/%m/%Y")
yield file
- except exception.StopExtraction:
+ except exception.ControlException:
raise
except Exception as exc:
self.log.error("%s: %s", exc.__class__.__name__, exc)
@@ -180,11 +181,11 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
referer = self.root_dl + "/file/" + data_id
headers = {"Referer": referer, "Origin": self.root_dl}
- data = self.request(self.endpoint, method="POST", headers=headers,
- json={"id": data_id}).json()
+ data = self.request_json(self.endpoint, method="POST", headers=headers,
+ json={"id": data_id})
if data.get("encrypted"):
- key = "SECRET_KEY_{}".format(data["timestamp"] // 3600)
+ key = f"SECRET_KEY_{data['timestamp'] // 3600}"
file_url = util.decrypt_xor(data["url"], key.encode())
else:
file_url = data["url"]
diff --git a/gallery_dl/extractor/catbox.py b/gallery_dl/extractor/catbox.py
index 6c81f53..22f7a97 100644
--- a/gallery_dl/extractor/catbox.py
+++ b/gallery_dl/extractor/catbox.py
@@ -26,7 +26,7 @@ class CatboxAlbumExtractor(GalleryExtractor):
def metadata(self, page):
extr = text.extract_from(page)
return {
- "album_id" : self.gallery_url.rpartition("/")[2],
+ "album_id" : self.page_url.rpartition("/")[2],
"album_name" : text.unescape(extr("<h1>", "<")),
"date" : text.parse_datetime(extr(
"<p>Created ", "<"), "%B %d %Y"),
diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py
index dc963c5..1da7e23 100644
--- a/gallery_dl/extractor/chevereto.py
+++ b/gallery_dl/extractor/chevereto.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2023 Mike Fährmann
+# Copyright 2023-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -68,7 +68,7 @@ class CheveretoImageExtractor(CheveretoExtractor):
extr('url: "', '"'))
if not url or url.endswith("/loading.svg"):
pos = page.find(" download=")
- url = text.rextract(page, 'href="', '"', pos)[0]
+ url = text.rextr(page, 'href="', '"', pos)
if not url.startswith("https://"):
url = util.decrypt_xor(
url, b"seltilovessimpcity@simpcityhatesscrapers",
diff --git a/gallery_dl/extractor/cien.py b/gallery_dl/extractor/cien.py
index 27d50e7..7dfe6b6 100644
--- a/gallery_dl/extractor/cien.py
+++ b/gallery_dl/extractor/cien.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2024 Mike Fährmann
+# Copyright 2024-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -20,7 +20,7 @@ class CienExtractor(Extractor):
request_interval = (1.0, 2.0)
def __init__(self, match):
- self.root = text.root_from_url(match.group(0))
+ self.root = text.root_from_url(match[0])
Extractor.__init__(self, match)
def _init(self):
@@ -52,8 +52,7 @@ class CienArticleExtractor(CienExtractor):
example = "https://ci-en.net/creator/123/article/12345"
def items(self):
- url = "{}/creator/{}/article/{}".format(
- self.root, self.groups[0], self.groups[1])
+ url = f"{self.root}/creator/{self.groups[0]}/article/{self.groups[1]}"
page = self.request(url, notfound="article").text
files = self._extract_files(page)
@@ -121,7 +120,7 @@ class CienArticleExtractor(CienExtractor):
auth = text.extr(video, ' auth-key="', '"')
file = text.nameext_from_url(name)
- file["url"] = "{}video-web.mp4?{}".format(path, auth)
+ file["url"] = f"{path}video-web.mp4?{auth}"
file["type"] = "video"
files.append(file)
@@ -145,12 +144,12 @@ class CienArticleExtractor(CienExtractor):
"gallery_id": text.extr(gallery, ' gallery-id="', '"'),
"time" : text.extr(gallery, ' time="', '"'),
}
- data = self.request(url, params=params).json()
+ data = self.request_json(url, params=params)
url = self.root + "/api/creator/gallery/imagePath"
for params["page"], params["file_id"] in enumerate(
data["imgList"]):
- path = self.request(url, params=params).json()["path"]
+ path = self.request_json(url, params=params)["path"]
file = params.copy()
file["url"] = path
@@ -163,7 +162,7 @@ class CienCreatorExtractor(CienExtractor):
example = "https://ci-en.net/creator/123"
def items(self):
- url = "{}/creator/{}/article".format(self.root, self.groups[0])
+ url = f"{self.root}/creator/{self.groups[0]}/article"
params = text.parse_query(self.groups[1])
params["mode"] = "list"
return self._pagination_articles(url, params)
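A side note on the cien gallery loop above: the for statement assigns its loop variables directly into dict entries, updating params in place on each iteration before the request is sent. A minimal illustration of the idiom:

    params = {}
    img_list = ["a.jpg", "b.jpg"]               # hypothetical
    for params["page"], params["file_id"] in enumerate(img_list):
        print(params)
    # {'page': 0, 'file_id': 'a.jpg'}
    # {'page': 1, 'file_id': 'b.jpg'}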
diff --git a/gallery_dl/extractor/civitai.py b/gallery_dl/extractor/civitai.py
index 56fe851..dc5b777 100644
--- a/gallery_dl/extractor/civitai.py
+++ b/gallery_dl/extractor/civitai.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2024 Mike Fährmann
+# Copyright 2024-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,7 @@
"""Extractors for https://www.civitai.com/"""
-from .common import Extractor, Message
+from .common import Extractor, Message, Dispatch
from .. import text, util, exception
from ..cache import memcache
import itertools
@@ -22,9 +22,9 @@ class CivitaiExtractor(Extractor):
"""Base class for civitai extractors"""
category = "civitai"
root = "https://civitai.com"
- directory_fmt = ("{category}", "{username|user[username]}", "images")
- filename_fmt = "{file[id]|id|filename}.{extension}"
- archive_fmt = "{file[uuid]|uuid}"
+ directory_fmt = ("{category}", "{user[username]}", "images")
+ filename_fmt = "{file[id]}.{extension}"
+ archive_fmt = "{file[uuid]}"
request_interval = (0.5, 1.5)
def _init(self):
@@ -35,8 +35,7 @@ class CivitaiExtractor(Extractor):
self.log.debug("Using tRPC API")
self.api = CivitaiTrpcAPI(self)
- quality = self.config("quality")
- if quality:
+ if quality := self.config("quality"):
if not isinstance(quality, str):
quality = ",".join(quality)
self._image_quality = quality
@@ -45,8 +44,7 @@ class CivitaiExtractor(Extractor):
self._image_quality = "original=true"
self._image_ext = "png"
- quality_video = self.config("quality-videos")
- if quality_video:
+ if quality_video := self.config("quality-videos"):
if not isinstance(quality_video, str):
quality_video = ",".join(quality_video)
if quality_video[0] == "+":
@@ -59,28 +57,27 @@ class CivitaiExtractor(Extractor):
self._video_quality = "quality=100"
self._video_ext = "webm"
- metadata = self.config("metadata")
- if metadata:
+ if metadata := self.config("metadata"):
if isinstance(metadata, str):
metadata = metadata.split(",")
elif not isinstance(metadata, (list, tuple)):
- metadata = ("generation", "version")
+ metadata = ("generation", "version", "post")
self._meta_generation = ("generation" in metadata)
self._meta_version = ("version" in metadata)
+ self._meta_post = ("post" in metadata)
else:
- self._meta_generation = self._meta_version = False
+ self._meta_generation = self._meta_version = self._meta_post = \
+ False
def items(self):
- models = self.models()
- if models:
+ if models := self.models():
data = {"_extractor": CivitaiModelExtractor}
for model in models:
- url = "{}/models/{}".format(self.root, model["id"])
+ url = f"{self.root}/models/{model['id']}"
yield Message.Queue, url, data
return
- posts = self.posts()
- if posts:
+ if posts := self.posts():
for post in posts:
if "images" in post:
@@ -105,27 +102,37 @@ class CivitaiExtractor(Extractor):
yield Message.Url, file["url"], file
return
- images = self.images()
- if images:
- for image in images:
+ if images := self.images():
+ for file in images:
+
+ data = {
+ "file": file,
+ "user": file.pop("user"),
+ }
if self._meta_generation:
- image["generation"] = \
- self._extract_meta_generation(image)
+ data["generation"] = \
+ self._extract_meta_generation(file)
if self._meta_version:
- image["model"], image["version"] = \
- self._extract_meta_version(image, False)
- image["date"] = text.parse_datetime(
- image["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
-
- url = self._url(image)
- text.nameext_from_url(url, image)
- if not image["extension"]:
- image["extension"] = (
- self._video_ext if image.get("type") == "video" else
+ data["model"], data["version"] = \
+ self._extract_meta_version(file, False)
+ if "post" in file:
+ data["post"] = file.pop("post")
+ if self._meta_post and "post" not in data:
+ data["post"] = post = self._extract_meta_post(file)
+ if post:
+ post.pop("user", None)
+ file["date"] = text.parse_datetime(
+ file["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
+
+ data["url"] = url = self._url(file)
+ text.nameext_from_url(url, data)
+ if not data["extension"]:
+ data["extension"] = (
+ self._video_ext if file.get("type") == "video" else
self._image_ext)
- yield Message.Directory, image
- yield Message.Url, url, image
+ yield Message.Directory, data
+ yield Message.Url, url, data
return
def models(self):
@@ -151,12 +158,13 @@ class CivitaiExtractor(Extractor):
image["uuid"] = url
name = image.get("name")
if not name:
- mime = image.get("mimeType") or self._image_ext
- name = "{}.{}".format(image.get("id"), mime.rpartition("/")[2])
- return (
- "https://image.civitai.com/xG1nkqKTMzGDvpLrqFT7WA/{}/{}/{}".format(
- url, quality, name)
- )
+ if mime := image.get("mimeType"):
+ name = f"{image.get('id')}.{mime.rpartition('/')[2]}"
+ else:
+ ext = self._video_ext if video else self._image_ext
+ name = f"{image.get('id')}.{ext}"
+ return (f"https://image.civitai.com/xG1nkqKTMzGDvpLrqFT7WA"
+ f"/{url}/{quality}/{name}")
def _image_results(self, images):
for num, file in enumerate(images, 1):
@@ -171,10 +179,29 @@ class CivitaiExtractor(Extractor):
self._image_ext)
if "id" not in file and data["filename"].isdecimal():
file["id"] = text.parse_int(data["filename"])
+ if "date" not in file:
+ file["date"] = text.parse_datetime(
+ file["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
if self._meta_generation:
file["generation"] = self._extract_meta_generation(file)
yield data
+ def _image_reactions(self):
+ self._require_auth()
+
+ params = self.params
+ params["authed"] = True
+ params["useIndex"] = False
+ if "reactions" not in params:
+ params["reactions"] = ("Like", "Dislike", "Heart", "Laugh", "Cry")
+ return self.api.images(params)
+
+ def _require_auth(self):
+ if "Authorization" not in self.api.headers and \
+ not self.cookies.get(
+ "__Secure-civitai-token", domain=".civitai.com"):
+ raise exception.AuthRequired(("'api-key'", "cookies"))
+
def _parse_query(self, value):
return text.parse_query_list(
value, {"tags", "reactions", "baseModels", "tools", "techniques",
@@ -186,10 +213,18 @@ class CivitaiExtractor(Extractor):
except Exception as exc:
return self.log.debug("", exc_info=exc)
+ def _extract_meta_post(self, image):
+ try:
+ post = self.api.post(image["postId"])
+ post["date"] = text.parse_datetime(
+ post["publishedAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ return post
+ except Exception as exc:
+ return self.log.debug("", exc_info=exc)
+
def _extract_meta_version(self, item, is_post=True):
try:
- version_id = self._extract_version_id(item, is_post)
- if version_id:
+ if version_id := self._extract_version_id(item, is_post):
version = self.api.model_version(version_id).copy()
return version.pop("model", None), version
except Exception as exc:
@@ -197,12 +232,11 @@ class CivitaiExtractor(Extractor):
return None, None
def _extract_version_id(self, item, is_post=True):
- version_id = item.get("modelVersionId")
- if version_id:
+ if version_id := item.get("modelVersionId"):
return version_id
-
- version_ids = item.get("modelVersionIds")
- if version_ids:
+ if version_ids := item.get("modelVersionIds"):
+ return version_ids[0]
+ if version_ids := item.get("modelVersionIdsManual"):
return version_ids[0]
if is_post:
@@ -285,16 +319,15 @@ class CivitaiModelExtractor(CivitaiExtractor):
if not sep:
name = ext
ext = "bin"
- file["uuid"] = "model-{}-{}-{}".format(
- model["id"], version["id"], file["id"])
+ file["uuid"] = f"model-{model['id']}-{version['id']}-{file['id']}"
files.append({
"num" : num,
"file" : file,
"filename" : name,
"extension": ext,
- "url" : (file.get("downloadUrl") or
- "{}/api/download/models/{}".format(
- self.root, version["id"])),
+ "url" : (
+ file.get("downloadUrl") or
+ f"{self.root}/api/download/models/{version['id']}"),
"_http_headers" : {
"Authorization": self.api.headers.get("Authorization")},
"_http_validate": self._validate_file_model,
@@ -308,7 +341,7 @@ class CivitaiModelExtractor(CivitaiExtractor):
else:
params = {
"modelVersionId": version["id"],
- "prioritizedUserIds": [user["id"]],
+ "prioritizedUserIds": (user["id"],),
"period": "AllTime",
"sort": "Most Reactions",
"limit": 20,
@@ -327,8 +360,7 @@ class CivitaiModelExtractor(CivitaiExtractor):
alert = text.extr(
response.text, 'mantine-Alert-message">', "</div></div></div>")
if alert:
- msg = "\"{}\" - 'api-key' required".format(
- text.remove_html(alert))
+ msg = f"\"{text.remove_html(alert)}\" - 'api-key' required"
else:
msg = "'api-key' required to download this file"
self.log.warning(msg)
@@ -366,14 +398,26 @@ class CivitaiTagExtractor(CivitaiExtractor):
return self.api.models_tag(tag)
-class CivitaiSearchExtractor(CivitaiExtractor):
- subcategory = "search"
+class CivitaiSearchModelsExtractor(CivitaiExtractor):
+ subcategory = "search-models"
pattern = BASE_PATTERN + r"/search/models\?([^#]+)"
example = "https://civitai.com/search/models?query=QUERY"
def models(self):
- params = text.parse_query(self.groups[0])
- return self.api.models(params)
+ params = self._parse_query(self.groups[0])
+ return CivitaiSearchAPI(self).search_models(
+ params.get("query"), params.get("sortBy"), self.api.nsfw)
+
+
+class CivitaiSearchImagesExtractor(CivitaiExtractor):
+ subcategory = "search-images"
+ pattern = BASE_PATTERN + r"/search/images\?([^#]+)"
+ example = "https://civitai.com/search/images?query=QUERY"
+
+ def images(self):
+ params = self._parse_query(self.groups[0])
+ return CivitaiSearchAPI(self).search_images(
+ params.get("query"), params.get("sortBy"), self.api.nsfw)
class CivitaiModelsExtractor(CivitaiExtractor):
@@ -382,7 +426,7 @@ class CivitaiModelsExtractor(CivitaiExtractor):
example = "https://civitai.com/models"
def models(self):
- params = text.parse_query(self.groups[0])
+ params = self._parse_query(self.groups[0])
return self.api.models(params)
@@ -392,26 +436,32 @@ class CivitaiImagesExtractor(CivitaiExtractor):
example = "https://civitai.com/images"
def images(self):
- params = text.parse_query(self.groups[0])
+ params = self._parse_query(self.groups[0])
return self.api.images(params)
-class CivitaiUserExtractor(CivitaiExtractor):
- subcategory = "user"
+class CivitaiPostsExtractor(CivitaiExtractor):
+ subcategory = "posts"
+ pattern = BASE_PATTERN + r"/posts(?:/?\?([^#]+))?(?:$|#)"
+ example = "https://civitai.com/posts"
+
+ def posts(self):
+ params = self._parse_query(self.groups[0])
+ return self.api.posts(params)
+
+
+class CivitaiUserExtractor(Dispatch, CivitaiExtractor):
pattern = USER_PATTERN + r"/?(?:$|\?|#)"
example = "https://civitai.com/user/USER"
- def initialize(self):
- pass
-
def items(self):
- base = "{}/user/{}/".format(self.root, self.groups[0])
+ base = f"{self.root}/user/{self.groups[0]}/"
return self._dispatch_extractors((
(CivitaiUserModelsExtractor, base + "models"),
(CivitaiUserPostsExtractor , base + "posts"),
(CivitaiUserImagesExtractor, base + "images"),
(CivitaiUserVideosExtractor, base + "videos"),
- ), ("user-models", "user-posts"))
+ ), ("user-images", "user-videos"))
class CivitaiUserModelsExtractor(CivitaiExtractor):
@@ -446,29 +496,17 @@ class CivitaiUserImagesExtractor(CivitaiExtractor):
example = "https://civitai.com/user/USER/images"
def __init__(self, match):
- self.params = self._parse_query(match.group(2))
+ user, query = match.groups()
+ self.params = self._parse_query(query)
if self.params.get("section") == "reactions":
- self.subcategory = "reactions"
- self.images = self.images_reactions
+ self.subcategory = "reactions-images"
+ self.images = self._image_reactions
+ else:
+ self.params["username"] = text.unquote(user)
CivitaiExtractor.__init__(self, match)
def images(self):
- params = self.params
- params["username"] = text.unquote(self.groups[0])
- return self.api.images(params)
-
- def images_reactions(self):
- if "Authorization" not in self.api.headers and \
- not self.cookies.get(
- "__Secure-civitai-token", domain=".civitai.com"):
- raise exception.AuthorizationError("api-key or cookies required")
-
- params = self.params
- params["authed"] = True
- params["useIndex"] = False
- if "reactions" not in params:
- params["reactions"] = ("Like", "Dislike", "Heart", "Laugh", "Cry")
- return self.api.images(params)
+ return self.api.images(self.params)
class CivitaiUserVideosExtractor(CivitaiExtractor):
@@ -477,14 +515,40 @@ class CivitaiUserVideosExtractor(CivitaiExtractor):
pattern = USER_PATTERN + r"/videos/?(?:\?([^#]+))?"
example = "https://civitai.com/user/USER/videos"
- def images(self):
- self._image_ext = "mp4"
+ def __init__(self, match):
+ user, query = match.groups()
+ self.params = self._parse_query(query)
+ self.params["types"] = ("video",)
+ if self.params.get("section") == "reactions":
+ self.subcategory = "reactions-videos"
+ self.images = self._image_reactions
+ else:
+ self.params["username"] = text.unquote(user)
+ CivitaiExtractor.__init__(self, match)
- user, query = self.groups
- params = self._parse_query(query)
- params["types"] = ["video"]
- params["username"] = text.unquote(user)
- return self.api.images(params)
+ images = CivitaiUserImagesExtractor.images
+
+
+class CivitaiGeneratedExtractor(CivitaiExtractor):
+ """Extractor for your generated files feed"""
+ subcategory = "generated"
+ filename_fmt = "{filename}.{extension}"
+ directory_fmt = ("{category}", "generated")
+ pattern = f"{BASE_PATTERN}/generate"
+ example = "https://civitai.com/generate"
+
+ def items(self):
+ self._require_auth()
+
+ for gen in self.api.orchestrator_queryGeneratedImages():
+ gen["date"] = text.parse_datetime(
+ gen["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ yield Message.Directory, gen
+ for step in gen.pop("steps", ()):
+ for image in step.pop("images", ()):
+ data = {"file": image, **step, **gen}
+ url = image["url"]
+ yield Message.Url, url, text.nameext_from_url(url, data)
class CivitaiRestAPI():
@@ -498,8 +562,7 @@ class CivitaiRestAPI():
self.root = extractor.root + "/api"
self.headers = {"Content-Type": "application/json"}
- api_key = extractor.config("api-key")
- if api_key:
+ if api_key := extractor.config("api-key"):
extractor.log.debug("Using api_key authentication")
self.headers["Authorization"] = "Bearer " + api_key
@@ -528,12 +591,12 @@ class CivitaiRestAPI():
})
def model(self, model_id):
- endpoint = "/v1/models/{}".format(model_id)
+ endpoint = f"/v1/models/{model_id}"
return self._call(endpoint)
@memcache(keyarg=1)
def model_version(self, model_version_id):
- endpoint = "/v1/model-versions/{}".format(model_version_id)
+ endpoint = f"/v1/model-versions/{model_version_id}"
return self._call(endpoint)
def models(self, params):
@@ -572,13 +635,12 @@ class CivitaiTrpcAPI():
self.root = extractor.root + "/api/trpc/"
self.headers = {
"content-type" : "application/json",
- "x-client-version": "5.0.701",
+ "x-client-version": "5.0.920",
"x-client-date" : "",
"x-client" : "web",
"x-fingerprint" : "undefined",
}
- api_key = extractor.config("api-key")
- if api_key:
+ if api_key := extractor.config("api-key"):
extractor.log.debug("Using api_key authentication")
self.headers["Authorization"] = "Bearer " + api_key
@@ -607,11 +669,11 @@ class CivitaiTrpcAPI():
"useIndex" : True,
"period" : "AllTime",
"sort" : "Newest",
- "types" : ["image"],
+ "types" : ("image",),
"withMeta" : False, # Metadata Only
"fromPlatform" : False, # Made On-Site
"browsingLevel": self.nsfw,
- "include" : ["cosmetics"],
+ "include" : ("cosmetics",),
})
params = self._type_params(params)
@@ -690,9 +752,10 @@ class CivitaiTrpcAPI():
"followed" : False,
"draftOnly" : False,
"pending" : True,
- "include" : ["cosmetics"],
+ "include" : ("cosmetics",),
})
+ params = self._type_params(params)
return self._pagination(endpoint, params, meta)
def user(self, username):
@@ -700,6 +763,15 @@ class CivitaiTrpcAPI():
params = {"username": username}
return (self._call(endpoint, params),)
+ def orchestrator_queryGeneratedImages(self):
+ endpoint = "orchestrator.queryGeneratedImages"
+ params = {
+ "ascending": False,
+ "tags" : ("gen",),
+ "authed" : True,
+ }
+ return self._pagination(endpoint, params)
+
def _call(self, endpoint, params, meta=None):
url = self.root + endpoint
headers = self.headers
@@ -765,4 +837,107 @@ class CivitaiTrpcAPI():
def _bool(value):
- return True if value == "true" else False
+ return value == "true"
+
+
+class CivitaiSearchAPI():
+
+ def __init__(self, extractor):
+ self.extractor = extractor
+ self.root = "https://search.civitai.com"
+ self.headers = {
+ "Authorization": "Bearer ab8565e5ab8dc2d8f0d4256d204781cb63fe8b031"
+ "eb3779cbbed38a7b5308e5c",
+ "Content-Type": "application/json",
+ "X-Meilisearch-Client": "Meilisearch instant-meilisearch (v0.13.5)"
+ " ; Meilisearch JavaScript (v0.34.0)",
+ "Origin": extractor.root,
+ "Sec-Fetch-Dest": "empty",
+ "Sec-Fetch-Mode": "cors",
+ "Sec-Fetch-Site": "same-site",
+ "Priority": "u=4",
+ }
+
+ def search(self, query, type, facets, nsfw=31):
+ endpoint = "/multi-search"
+
+ query = {
+ "q" : query,
+ "indexUid": type,
+ "facets" : facets,
+ "attributesToHighlight": (),
+ "highlightPreTag" : "__ais-highlight__",
+ "highlightPostTag": "__/ais-highlight__",
+ "limit" : 51,
+ "offset": 0,
+ "filter": (self._generate_filter(nsfw),),
+ }
+
+ return self._pagination(endpoint, query)
+
+ def search_models(self, query, type=None, nsfw=31):
+ facets = (
+ "category.name",
+ "checkpointType",
+ "fileFormats",
+ "lastVersionAtUnix",
+ "tags.name",
+ "type",
+ "user.username",
+ "version.baseModel",
+ )
+ return self.search(query, type or "models_v9", facets, nsfw)
+
+ def search_images(self, query, type=None, nsfw=31):
+ facets = (
+ "aspectRatio",
+ "baseModel",
+ "createdAtUnix",
+ "tagNames",
+ "techniqueNames",
+ "toolNames",
+ "type",
+ "user.username",
+ )
+ return self.search(query, type or "images_v6", facets, nsfw)
+
+ def _call(self, endpoint, query):
+ url = self.root + endpoint
+ params = util.json_dumps({"queries": (query,)})
+
+ data = self.extractor.request_json(
+ url, method="POST", headers=self.headers, data=params)
+
+ return data["results"][0]
+
+ def _pagination(self, endpoint, query):
+ limit = query["limit"] - 1
+ threshold = limit // 2
+
+ while True:
+ data = self._call(endpoint, query)
+
+ items = data["hits"]
+ yield from items
+
+ if len(items) < threshold:
+ return
+ query["offset"] += limit
+
+ def _generate_filter(self, level):
+ fltr = []
+
+ if level & 1:
+ fltr.append("1")
+ if level & 2:
+ fltr.append("2")
+ if level & 4:
+ fltr.append("4")
+ if level & 8:
+ fltr.append("8")
+ if level & 16:
+ fltr.append("16")
+
+ if not fltr:
+ return "()"
+ return "(nsfwLevel=" + " OR nsfwLevel=".join(fltr) + ")"
diff --git a/gallery_dl/extractor/comick.py b/gallery_dl/extractor/comick.py
new file mode 100644
index 0000000..7ef4607
--- /dev/null
+++ b/gallery_dl/extractor/comick.py
@@ -0,0 +1,198 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://comick.io/"""
+
+from .common import ChapterExtractor, MangaExtractor, Message
+from .. import text
+from ..cache import memcache
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?comick\.io"
+
+
+class ComickBase():
+ """Base class for comick.io extractors"""
+ category = "comick"
+ root = "https://comick.io"
+
+ @memcache(keyarg=1)
+ def _manga_info(self, slug):
+ url = f"{self.root}/comic/{slug}"
+ page = self.request(url).text
+ data = self._extract_nextdata(page)
+ props = data["props"]["pageProps"]
+ comic = props["comic"]
+
+ genre = []
+ theme = []
+ format = ""
+ for item in comic["md_comic_md_genres"]:
+ item = item["md_genres"]
+ group = item["group"]
+ if group == "Genre":
+ genre.append(item["name"])
+ elif group == "Theme":
+ theme.append(item["name"])
+ else:
+ format = item["name"]
+
+ if mu := comic["mu_comics"]:
+ tags = [c["mu_categories"]["title"]
+ for c in mu["mu_comic_categories"]]
+ publisher = [p["mu_publishers"]["title"]
+ for p in mu["mu_comic_publishers"]]
+ else:
+ tags = publisher = ()
+
+ return {
+ "manga": comic["title"],
+ "manga_id": comic["id"],
+ "manga_hid": comic["hid"],
+ "manga_slug": slug,
+ "manga_titles": [t["title"] for t in comic["md_titles"]],
+ "artist": [a["name"] for a in props["artists"]],
+ "author": [a["name"] for a in props["authors"]],
+ "genre" : genre,
+ "theme" : theme,
+ "format": format,
+ "tags" : tags,
+ "publisher": publisher,
+ "published": text.parse_int(comic["year"]),
+ "description": comic["desc"],
+ "demographic": props["demographic"],
+ "origin": comic["iso639_1"],
+ "mature": props["matureContent"],
+ "rating": comic["content_rating"],
+ "rank" : comic["follow_rank"],
+ "score" : text.parse_float(comic["bayesian_rating"]),
+ "status": "Complete" if comic["status"] == 2 else "Ongoing",
+ "links" : comic["links"],
+ "_build_id": data["buildId"],
+ }
+
+ def _chapter_info(self, manga, chstr):
+ slug = manga['manga_slug']
+ url = (f"{self.root}/_next/data/{manga['_build_id']}"
+ f"/comic/{slug}/{chstr}.json")
+ params = {"slug": slug, "chapter": chstr}
+ return self.request_json(url, params=params)["pageProps"]
+
+
+class ComickChapterExtractor(ComickBase, ChapterExtractor):
+ """Extractor for comick.io manga chapters"""
+ archive_fmt = "{chapter_hid}_{page}"
+ pattern = BASE_PATTERN + r"/comic/([\w-]+)/(\w+-chapter-[^/?#]+)"
+ example = "https://comick.io/comic/MANGA/ID-chapter-123-en"
+
+ def metadata(self, page):
+ slug, chstr = self.groups
+ manga = self._manga_info(slug)
+ props = self._chapter_info(manga, chstr)
+
+ ch = props["chapter"]
+ self._images = ch["md_images"]
+ chapter, sep, minor = ch["chap"].partition(".")
+
+ return {
+ **manga,
+ "title" : props["chapTitle"],
+ "volume" : text.parse_int(ch["vol"]),
+ "chapter" : text.parse_int(chapter),
+ "chapter_minor" : sep + minor,
+ "chapter_id" : ch["id"],
+ "chapter_hid" : ch["hid"],
+ "chapter_string": chstr,
+ "group" : ch["group_name"],
+ "date" : text.parse_datetime(
+ ch["created_at"][:19], "%Y-%m-%dT%H:%M:%S"),
+ "date_updated" : text.parse_datetime(
+ ch["updated_at"][:19], "%Y-%m-%dT%H:%M:%S"),
+ "lang" : ch["lang"],
+ }
+
+ def images(self, page):
+ return [
+ ("https://meo.comick.pictures/" + img["b2key"], {
+ "width" : img["w"],
+ "height" : img["h"],
+ "size" : img["s"],
+ "optimized": img["optimized"],
+ })
+ for img in self._images
+ ]
+
+
+class ComickMangaExtractor(ComickBase, MangaExtractor):
+ """Extractor for comick.io manga"""
+ pattern = BASE_PATTERN + r"/comic/([\w-]+)/?(?:\?([^#]+))?"
+ example = "https://comick.io/comic/MANGA"
+
+ def items(self):
+ slug = self.groups[0]
+ manga = self._manga_info(slug)
+
+ for ch in self.chapters(manga):
+ url = (f"{self.root}/comic/{slug}"
+ f"/{ch['hid']}-chapter-{ch['chap']}-{ch['lang']}")
+
+ ch.update(manga)
+ chapter, sep, minor = ch["chap"].partition(".")
+ ch["chapter"] = text.parse_int(chapter)
+ ch["chapter_minor"] = sep + minor
+ ch["_extractor"] = ComickChapterExtractor
+
+ yield Message.Queue, url, ch
+
+ def chapters(self, manga):
+ info = True
+ slug, query = self.groups
+
+ url = f"https://api.comick.io/comic/{manga['manga_hid']}/chapters"
+ headers = {
+ "Origin": "https://comick.io",
+ "Sec-Fetch-Dest": "empty",
+ "Sec-Fetch-Mode": "cors",
+ "Sec-Fetch-Site": "same-site",
+ }
+
+ query = text.parse_query(query)
+ params = {"lang": query.get("lang") or None}
+ params["page"] = page = text.parse_int(query.get("page"), 1)
+
+ if date_order := query.get("date-order"):
+ params["date-order"] = date_order
+ elif chap_order := query.get("chap-order"):
+ params["chap-order"] = chap_order
+ else:
+ params["chap-order"] = \
+ "0" if self.config("chapter-reverse", False) else "1"
+
+ group = query.get("group", None)
+ if group == "0":
+ group = None
+
+ while True:
+ data = self.request_json(url, params=params, headers=headers)
+ limit = data["limit"]
+
+ if info:
+ info = False
+ total = data["total"] - limit * page
+ if total > limit:
+ self.log.info("Collecting %s chapters", total)
+
+ if group is None:
+ yield from data["chapters"]
+ else:
+ for ch in data["chapters"]:
+ if group in ch["group_name"]:
+ yield ch
+
+ if data["total"] <= limit * page:
+ return
+ params["page"] = page = page + 1
diff --git a/gallery_dl/extractor/comicvine.py b/gallery_dl/extractor/comicvine.py
index d076795..39397b9 100644
--- a/gallery_dl/extractor/comicvine.py
+++ b/gallery_dl/extractor/comicvine.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2021-2023 Mike Fährmann
+# Copyright 2021-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -46,7 +46,7 @@ class ComicvineTagExtractor(BooruExtractor):
}
while True:
- images = self.request(url, params=params).json()["images"]
+ images = self.request_json(url, params=params)["images"]
yield from images
if len(images) < self.per_page:
@@ -59,8 +59,7 @@ class ComicvineTagExtractor(BooruExtractor):
_file_url = operator.itemgetter("original")
- @staticmethod
- def _prepare(post):
+ def _prepare(self, post):
post["date"] = text.parse_datetime(
post["dateCreated"], "%a, %b %d %Y")
post["tags"] = [tag["name"] for tag in post["tags"] if tag["name"]]
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index c430ec1..d46152b 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2023 Mike Fährmann
+# Copyright 2014-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -17,9 +17,10 @@ import queue
import random
import getpass
import logging
-import datetime
import requests
import threading
+from datetime import datetime
+from xml.etree import ElementTree
from requests.adapters import HTTPAdapter
from .message import Message
from .. import config, output, text, util, cache, exception
@@ -35,6 +36,7 @@ class Extractor():
directory_fmt = ("{category}",)
filename_fmt = "{filename}.{extension}"
archive_fmt = ""
+ status = 0
root = ""
cookies_domain = ""
cookies_index = 0
@@ -53,6 +55,15 @@ class Extractor():
self.url = match.string
self.match = match
self.groups = match.groups()
+ self.kwdict = {}
+
+ if self.category in CATEGORY_MAP:
+ catsub = f"{self.category}:{self.subcategory}"
+ if catsub in CATEGORY_MAP:
+ self.category, self.subcategory = CATEGORY_MAP[catsub]
+ else:
+ self.category = CATEGORY_MAP[self.category]
+
self._cfgpath = ("extractor", self.category, self.subcategory)
self._parentdir = ""
@@ -125,11 +136,10 @@ class Extractor():
if first:
first = False
values = config.accumulate(extr + path, key)
- else:
- conf = config.get(extr, path[0])
- if conf:
- values[:0] = config.accumulate(
- (self.subcategory,), key, conf=conf)
+ elif conf := config.get(extr, path[0]):
+ values[:0] = config.accumulate(
+ (self.subcategory,), key, conf=conf)
+
return values
def request(self, url, method="GET", session=None,
@@ -149,17 +159,15 @@ class Extractor():
kwargs["verify"] = self._verify
if "json" in kwargs:
- json = kwargs["json"]
- if json is not None:
+ if (json := kwargs["json"]) is not None:
kwargs["data"] = util.json_dumps(json).encode()
del kwargs["json"]
- headers = kwargs.get("headers")
- if headers:
+ if headers := kwargs.get("headers"):
headers["Content-Type"] = "application/json"
else:
kwargs["headers"] = {"Content-Type": "application/json"}
- response = None
+ response = challenge = None
tries = 1
if self._interval:
@@ -172,21 +180,22 @@ class Extractor():
try:
response = session.request(method, url, **kwargs)
except requests.exceptions.ConnectionError as exc:
- code = 0
try:
reason = exc.args[0].reason
cls = reason.__class__.__name__
pre, _, err = str(reason.args[-1]).partition(":")
- msg = " {}: {}".format(cls, (err or pre).lstrip())
+ msg = f" {cls}: {(err or pre).lstrip()}"
except Exception:
msg = exc
+ code = 0
except (requests.exceptions.Timeout,
requests.exceptions.ChunkedEncodingError,
requests.exceptions.ContentDecodingError) as exc:
msg = exc
code = 0
except (requests.exceptions.RequestException) as exc:
- raise exception.HttpError(exc)
+ msg = exc
+ break
else:
code = response.status_code
if self._write_pages:
@@ -201,10 +210,10 @@ class Extractor():
response.encoding = encoding
return response
if notfound and code == 404:
+ self.status |= exception.NotFoundError.code
raise exception.NotFoundError(notfound)
- msg = "'{} {}' for '{}'".format(
- code, response.reason, response.url)
+ msg = f"'{code} {response.reason}' for '{response.url}'"
challenge = util.detect_challenge(response)
if challenge is not None:
@@ -238,13 +247,59 @@ class Extractor():
self.sleep(seconds, "retry")
tries += 1
- raise exception.HttpError(msg, response)
+ if not fatal or fatal is ...:
+ self.log.warning(msg)
+ return util.NullResponse(url, msg)
+
+ if challenge is None:
+ exc = exception.HttpError(msg, response)
+ else:
+ exc = exception.ChallengeError(challenge, response)
+ self.status |= exc.code
+ raise exc
def request_location(self, url, **kwargs):
kwargs.setdefault("method", "HEAD")
kwargs.setdefault("allow_redirects", False)
return self.request(url, **kwargs).headers.get("location", "")
+ def request_json(self, url, **kwargs):
+ response = self.request(url, **kwargs)
+
+ try:
+ return util.json_loads(response.text)
+ except Exception as exc:
+ fatal = kwargs.get("fatal", True)
+ if not fatal or fatal is ...:
+ if challenge := util.detect_challenge(response):
+ self.log.warning(challenge)
+ else:
+ self.log.warning("%s: %s", exc.__class__.__name__, exc)
+ return {}
+ raise
+
+ def request_xml(self, url, xmlns=True, **kwargs):
+ response = self.request(url, **kwargs)
+
+ if xmlns:
+ text = response.text
+ else:
+ text = response.text.replace(" xmlns=", " ns=")
+
+ parser = ElementTree.XMLParser()
+ try:
+ parser.feed(text)
+ return parser.close()
+ except Exception as exc:
+ fatal = kwargs.get("fatal", True)
+ if not fatal or fatal is ...:
+ if challenge := util.detect_challenge(response):
+ self.log.warning(challenge)
+ else:
+ self.log.warning("%s: %s", exc.__class__.__name__, exc)
+ return ElementTree.Element("")
+ raise
+
_handle_429 = util.false
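[annotation] request_json() and request_xml() fold the parse step and the
'fatal' handling into one call: with fatal=False (or ...), a failed request now
comes back as util.NullResponse, and a parse failure logs a warning and returns
an empty container instead of raising. A usage sketch with a placeholder URL:

    data = self.request_json("https://example.org/api/posts", fatal=False)
    for post in data.get("posts", ()):  # {} on invalid JSON, so .get() is safe
        ...

    root = self.request_xml("https://example.org/feed", xmlns=False)
    for item in root.iter("item"):      # empty Element on parse failure
        ...

Passing xmlns=False rewrites ' xmlns=' to ' ns=' before parsing, so elements
keep their plain tag names instead of '{namespace}tag' ones.
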
def wait(self, seconds=None, until=None, adjust=1.0,
@@ -255,7 +310,7 @@ class Extractor():
seconds = float(seconds)
until = now + seconds
elif until:
- if isinstance(until, datetime.datetime):
+ if isinstance(until, datetime):
# convert to UTC timestamp
until = util.datetime_to_timestamp(until)
else:
@@ -269,8 +324,8 @@ class Extractor():
return
if reason:
- t = datetime.datetime.fromtimestamp(until).time()
- isotime = "{:02}:{:02}:{:02}".format(t.hour, t.minute, t.second)
+ t = datetime.fromtimestamp(until).time()
+ isotime = f"{t.hour:02}:{t.minute:02}:{t.second:02}"
self.log.info("Waiting until %s (%s)", isotime, reason)
time.sleep(seconds)
@@ -295,8 +350,8 @@ class Extractor():
if input is None:
input = output.TTY_STDIN
if not input:
- raise exception.StopExtraction(
- "User input required (%s)", prompt.strip(" :"))
+ raise exception.AbortExtraction(
+ f"User input required ({prompt.strip(' :')})")
def _get_auth_info(self):
"""Return authentication information as (username, password) tuple"""
@@ -366,36 +421,31 @@ class Extractor():
elif platform == "linux":
platform = "X11; Linux x86_64"
elif platform == "macos":
- platform = "Macintosh; Intel Mac OS X 11.5"
+ platform = "Macintosh; Intel Mac OS X 15.5"
if browser == "chrome":
if platform.startswith("Macintosh"):
- platform = platform.replace(".", "_") + "_2"
+ platform = platform.replace(".", "_")
else:
browser = "firefox"
- for key, value in HTTP_HEADERS[browser]:
+ for key, value in HEADERS[browser]:
if value and "{}" in value:
- headers[key] = value.format(platform)
+ headers[key] = value.replace("{}", platform)
else:
headers[key] = value
ssl_options |= (ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 |
ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1)
- ssl_ciphers = SSL_CIPHERS[browser]
+ ssl_ciphers = CIPHERS[browser]
else:
- useragent = self.config("user-agent")
- if useragent is None or useragent == "auto":
- useragent = self.useragent
- elif useragent == "browser":
- useragent = _browser_useragent()
- elif self.useragent is not Extractor.useragent and \
- useragent is config.get(("extractor",), "user-agent"):
- useragent = self.useragent
- headers["User-Agent"] = useragent
+ headers["User-Agent"] = self.useragent
headers["Accept"] = "*/*"
headers["Accept-Language"] = "en-US,en;q=0.5"
+
ssl_ciphers = self.ciphers
+ if ssl_ciphers is not None and ssl_ciphers in CIPHERS:
+ ssl_ciphers = CIPHERS[ssl_ciphers]
if BROTLI:
headers["Accept-Encoding"] = "gzip, deflate, br"
@@ -404,26 +454,40 @@ class Extractor():
if ZSTD:
headers["Accept-Encoding"] += ", zstd"
- referer = self.config("referer", self.referer)
- if referer:
+ if referer := self.config("referer", self.referer):
if isinstance(referer, str):
headers["Referer"] = referer
elif self.root:
headers["Referer"] = self.root + "/"
- custom_headers = self.config("headers")
- if custom_headers:
+ custom_ua = self.config("user-agent")
+ if custom_ua is None or custom_ua == "auto":
+ pass
+ elif custom_ua == "browser":
+ headers["User-Agent"] = _browser_useragent()
+ elif self.useragent is Extractor.useragent and not self.browser or \
+ custom_ua is not config.get(("extractor",), "user-agent"):
+ headers["User-Agent"] = custom_ua
+
+ if custom_headers := self.config("headers"):
+ if isinstance(custom_headers, str):
+ if custom_headers in HEADERS:
+ custom_headers = HEADERS[custom_headers]
+ else:
+ self.log.error("Invalid 'headers' value '%s'",
+ custom_headers)
+ custom_headers = ()
headers.update(custom_headers)
- custom_ciphers = self.config("ciphers")
- if custom_ciphers:
+ if custom_ciphers := self.config("ciphers"):
if isinstance(custom_ciphers, list):
ssl_ciphers = ":".join(custom_ciphers)
+ elif custom_ciphers in CIPHERS:
+ ssl_ciphers = CIPHERS[custom_ciphers]
else:
ssl_ciphers = custom_ciphers
- source_address = self.config("source-address")
- if source_address:
+ if source_address := self.config("source-address"):
if isinstance(source_address, str):
source_address = (source_address, 0)
else:
@@ -436,8 +500,17 @@ class Extractor():
ssl_options |= ssl.OP_NO_TLSv1_2
self.log.debug("TLS 1.2 disabled.")
+ if self.config("truststore"):
+ try:
+ from truststore import SSLContext as ssl_ctx
+ except ImportError as exc:
+ self.log.error("%s: %s", exc.__class__.__name__, exc)
+ ssl_ctx = None
+ else:
+ ssl_ctx = None
+
adapter = _build_requests_adapter(
- ssl_options, ssl_ciphers, source_address)
+ ssl_options, ssl_ciphers, ssl_ctx, source_address)
session.mount("https://", adapter)
session.mount("http://", adapter)
@@ -448,10 +521,8 @@ class Extractor():
if self.cookies_domain is None:
return
- cookies = self.config("cookies")
- if cookies:
- select = self.config("cookies-select")
- if select:
+ if cookies := self.config("cookies"):
+ if select := self.config("cookies-select"):
if select == "rotate":
cookies = cookies[self.cookies_index % len(cookies)]
Extractor.cookies_index += 1
@@ -469,9 +540,11 @@ class Extractor():
with open(path) as fp:
cookies = util.cookiestxt_load(fp)
except Exception as exc:
- self.log.warning("cookies: %s", exc)
+ self.log.warning("cookies: Failed to load '%s' (%s: %s)",
+ cookies_source, exc.__class__.__name__, exc)
else:
- self.log.debug("Loading cookies from '%s'", cookies_source)
+ self.log.debug("cookies: Loading cookies from '%s'",
+ cookies_source)
set_cookie = self.cookies.set_cookie
for cookie in cookies:
set_cookie(cookie)
@@ -479,7 +552,7 @@ class Extractor():
elif isinstance(cookies_source, (list, tuple)):
key = tuple(cookies_source)
- cookies = _browser_cookies.get(key)
+ cookies = CACHE_COOKIES.get(key)
if cookies is None:
from ..cookies import load_cookies
@@ -489,18 +562,18 @@ class Extractor():
self.log.warning("cookies: %s", exc)
cookies = ()
else:
- _browser_cookies[key] = cookies
+ CACHE_COOKIES[key] = cookies
else:
- self.log.debug("Using cached cookies from %s", key)
+ self.log.debug("cookies: Using cached cookies from %s", key)
set_cookie = self.cookies.set_cookie
for cookie in cookies:
set_cookie(cookie)
else:
- self.log.warning(
- "Expected 'dict', 'list', or 'str' value for 'cookies' "
- "option, got '%s' (%s)",
+ self.log.error(
+ "cookies: Expected 'dict', 'list', or 'str' value for "
+ "'cookies' option, got '%s' instead (%r)",
cookies_source.__class__.__name__, cookies_source)
def cookies_store(self):
@@ -522,7 +595,8 @@ class Extractor():
util.cookiestxt_store(fp, self.cookies)
os.replace(path_tmp, path)
except OSError as exc:
- self.log.warning("cookies: %s", exc)
+ self.log.error("cookies: Failed to write to '%s' "
+ "(%s: %s)", path, exc.__class__.__name__, exc)
def cookies_update(self, cookies, domain=""):
"""Update the session's cookiejar with 'cookies'"""
@@ -568,14 +642,17 @@ class Extractor():
if diff <= 0:
self.log.warning(
- "Cookie '%s' has expired", cookie.name)
+ "cookies: %s/%s expired at %s",
+ cookie.domain.lstrip("."), cookie.name,
+ datetime.fromtimestamp(cookie.expires))
continue
elif diff <= 86400:
hours = diff // 3600
self.log.warning(
- "Cookie '%s' will expire in less than %s hour%s",
- cookie.name, hours + 1, "s" if hours else "")
+ "cookies: %s/%s will expire in less than %s hour%s",
+ cookie.domain.lstrip("."), cookie.name,
+ hours + 1, "s" if hours else "")
names.discard(cookie.name)
if not names:
@@ -590,11 +667,6 @@ class Extractor():
return util.json_loads(text.extr(
page, ' id="__NEXT_DATA__" type="application/json">', "</script>"))
- def _prepare_ddosguard_cookies(self):
- if not self.cookies.get("__ddg2", domain=self.cookies_domain):
- self.cookies.set(
- "__ddg2", util.generate_token(), domain=self.cookies_domain)
-
def _cache(self, func, maxage, keyarg=None):
# return cache.DatabaseCacheDecorator(func, maxage, keyarg)
return cache.DatabaseCacheDecorator(func, keyarg, maxage)
@@ -608,7 +680,7 @@ class Extractor():
ts = self.config(key, default)
if isinstance(ts, str):
try:
- ts = int(datetime.datetime.strptime(ts, fmt).timestamp())
+ ts = int(datetime.strptime(ts, fmt).timestamp())
except ValueError as exc:
self.log.warning("Unable to parse '%s': %s", key, exc)
ts = default
@@ -616,35 +688,12 @@ class Extractor():
fmt = self.config("date-format", "%Y-%m-%dT%H:%M:%S")
return get("date-min", dmin), get("date-max", dmax)
- def _dispatch_extractors(self, extractor_data, default=()):
- """ """
- extractors = {
- data[0].subcategory: data
- for data in extractor_data
- }
-
- include = self.config("include", default) or ()
- if include == "all":
- include = extractors
- elif isinstance(include, str):
- include = include.replace(" ", "").split(",")
-
- result = [(Message.Version, 1)]
- for category in include:
- try:
- extr, url = extractors[category]
- except KeyError:
- self.log.warning("Invalid include '%s'", category)
- else:
- result.append((Message.Queue, url, {"_extractor": extr}))
- return iter(result)
-
@classmethod
def _dump(cls, obj):
util.dump_json(obj, ensure_ascii=False, indent=2)
def _dump_response(self, response, history=True):
- """Write the response content to a .dump file in the current directory.
+ """Write the response content to a .txt file in the current directory.
The file name is derived from the response url,
replacing special characters with "_"
@@ -657,12 +706,11 @@ class Extractor():
Extractor._dump_index += 1
else:
Extractor._dump_index = 1
- Extractor._dump_sanitize = re.compile(r"[\\\\|/<>:\"?*&=#]+").sub
+ Extractor._dump_sanitize = util.re_compile(
+ r"[\\\\|/<>:\"?*&=#]+").sub
- fname = "{:>02}_{}".format(
- Extractor._dump_index,
- Extractor._dump_sanitize('_', response.url),
- )
+ fname = (f"{Extractor._dump_index:>02}_"
+ f"{Extractor._dump_sanitize('_', response.url)}")
if util.WINDOWS:
path = os.path.abspath(fname)[:255]
@@ -693,19 +741,24 @@ class GalleryExtractor(Extractor):
def __init__(self, match, url=None):
Extractor.__init__(self, match)
- self.gallery_url = self.root + self.groups[0] if url is None else url
+
+ if url is None and (path := self.groups[0]) and path[0] == "/":
+ self.page_url = f"{self.root}{path}"
+ else:
+ self.page_url = url
def items(self):
self.login()
- if self.gallery_url:
+ if self.page_url:
page = self.request(
- self.gallery_url, notfound=self.subcategory).text
+ self.page_url, notfound=self.subcategory).text
else:
page = None
data = self.metadata(page)
imgs = self.images(page)
+ assets = self.assets(page)
if "count" in data:
if self.config("page-reverse"):
@@ -727,7 +780,18 @@ class GalleryExtractor(Extractor):
images = enum(imgs, 1)
yield Message.Directory, data
- for data[self.enum], (url, imgdata) in images:
+ enum_key = self.enum
+
+ if assets:
+ for asset in assets:
+ url = asset["url"]
+ asset.update(data)
+ asset[enum_key] = 0
+ if "extension" not in asset:
+ text.nameext_from_url(url, asset)
+ yield Message.Url, url, asset
+
+ for data[enum_key], (url, imgdata) in images:
if imgdata:
data.update(imgdata)
if "extension" not in imgdata:
@@ -743,7 +807,13 @@ class GalleryExtractor(Extractor):
"""Return a dict with general metadata"""
def images(self, page):
- """Return a list of all (image-url, metadata)-tuples"""
+ """Return a list or iterable of all (image-url, metadata)-tuples"""
+
+ def assets(self, page):
+ """Return an iterable of additional gallery assets
+
+ Each asset must be a 'dict' containing at least 'url' and 'type'
+ """
class ChapterExtractor(GalleryExtractor):
@@ -768,7 +838,11 @@ class MangaExtractor(Extractor):
def __init__(self, match, url=None):
Extractor.__init__(self, match)
- self.manga_url = self.root + self.groups[0] if url is None else url
+
+ if url is None and (path := self.groups[0]) and path[0] == "/":
+ self.page_url = f"{self.root}{path}"
+ else:
+ self.page_url = url
if self.config("chapter-reverse", False):
self.reverse = not self.reverse
@@ -776,8 +850,8 @@ class MangaExtractor(Extractor):
def items(self):
self.login()
- if self.manga_url:
- page = self.request(self.manga_url, notfound=self.subcategory).text
+ if self.page_url:
+ page = self.request(self.page_url, notfound=self.subcategory).text
else:
page = None
@@ -796,6 +870,45 @@ class MangaExtractor(Extractor):
"""Return a list of all (chapter-url, metadata)-tuples"""
+class Dispatch():
+ subcategory = "user"
+ cookies_domain = None
+ finalize = Extractor.finalize
+ skip = Extractor.skip
+
+ def __iter__(self):
+ return self.items()
+
+ def initialize(self):
+ pass
+
+ def _dispatch_extractors(self, extractor_data, default=(), alt=None):
+ extractors = {
+ data[0].subcategory: data
+ for data in extractor_data
+ }
+
+ if alt is not None:
+ for sub, sub_alt in alt:
+ extractors[sub_alt] = extractors[sub]
+
+ include = self.config("include", default) or ()
+ if include == "all":
+ include = extractors
+ elif isinstance(include, str):
+ include = include.replace(" ", "").split(",")
+
+ results = [(Message.Version, 1)]
+ for category in include:
+ try:
+ extr, url = extractors[category]
+ except KeyError:
+ self.log.warning("Invalid include '%s'", category)
+ else:
+ results.append((Message.Queue, url, {"_extractor": extr}))
+ return iter(results)
+
+
class AsynchronousMixin():
"""Run info extraction in a separate thread"""
@@ -846,7 +959,7 @@ class BaseExtractor(Extractor):
if index:
self.category, self.root, info = self.instances[index-1]
if not self.root:
- self.root = text.root_from_url(self.match.group(0))
+ self.root = text.root_from_url(self.match[0])
self.config_instance = info.get
else:
self.root = group
@@ -855,8 +968,7 @@ class BaseExtractor(Extractor):
@classmethod
def update(cls, instances):
- extra_instances = config.get(("extractor",), cls.basecategory)
- if extra_instances:
+ if extra_instances := config.get(("extractor",), cls.basecategory):
for category, info in extra_instances.items():
if isinstance(info, dict) and "root" in info:
instances[category] = info
@@ -864,8 +976,7 @@ class BaseExtractor(Extractor):
pattern_list = []
instance_list = cls.instances = []
for category, info in instances.items():
- root = info["root"]
- if root:
+ if root := info["root"]:
root = root.rstrip("/")
instance_list.append((category, root, info))
@@ -898,24 +1009,35 @@ class RequestsAdapter(HTTPAdapter):
return HTTPAdapter.proxy_manager_for(self, *args, **kwargs)
-def _build_requests_adapter(ssl_options, ssl_ciphers, source_address):
- key = (ssl_options, ssl_ciphers, source_address)
+def _build_requests_adapter(
+ ssl_options, ssl_ciphers, ssl_ctx, source_address):
+
+ key = (ssl_options, ssl_ciphers, ssl_ctx, source_address)
try:
- return _adapter_cache[key]
+ return CACHE_ADAPTERS[key]
except KeyError:
pass
- if ssl_options or ssl_ciphers:
- ssl_context = urllib3.connection.create_urllib3_context(
- options=ssl_options or None, ciphers=ssl_ciphers)
- if not requests.__version__ < "2.32":
- # https://github.com/psf/requests/pull/6731
- ssl_context.load_verify_locations(requests.certs.where())
+ if ssl_options or ssl_ciphers or ssl_ctx:
+ if ssl_ctx is None:
+ ssl_context = urllib3.connection.create_urllib3_context(
+ options=ssl_options or None, ciphers=ssl_ciphers)
+ if not requests.__version__ < "2.32":
+ # https://github.com/psf/requests/pull/6731
+ ssl_context.load_verify_locations(requests.certs.where())
+ else:
+ ssl_ctx_orig = urllib3.util.ssl_.SSLContext
+ try:
+ urllib3.util.ssl_.SSLContext = ssl_ctx
+ ssl_context = urllib3.connection.create_urllib3_context(
+ options=ssl_options or None, ciphers=ssl_ciphers)
+ finally:
+ urllib3.util.ssl_.SSLContext = ssl_ctx_orig
ssl_context.check_hostname = False
else:
ssl_context = None
- adapter = _adapter_cache[key] = RequestsAdapter(
+ adapter = CACHE_ADAPTERS[key] = RequestsAdapter(
ssl_context, source_address)
return adapter
@@ -932,7 +1054,7 @@ def _browser_useragent():
server.listen(1)
host, port = server.getsockname()
- webbrowser.open("http://{}:{}/user-agent".format(host, port))
+ webbrowser.open(f"http://{host}:{port}/user-agent")
client = server.accept()[0]
server.close()
@@ -951,83 +1073,131 @@ def _browser_useragent():
return useragent.decode()
-_adapter_cache = {}
-_browser_cookies = {}
-
-
-HTTP_HEADERS = {
- "firefox": (
- ("User-Agent", "Mozilla/5.0 ({}; "
- "rv:128.0) Gecko/20100101 Firefox/128.0"),
- ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,"
- "image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8"),
- ("Accept-Language", "en-US,en;q=0.5"),
- ("Accept-Encoding", None),
- ("Referer", None),
- ("Connection", "keep-alive"),
- ("Upgrade-Insecure-Requests", "1"),
- ("Cookie", None),
- ("Sec-Fetch-Dest", "empty"),
- ("Sec-Fetch-Mode", "no-cors"),
- ("Sec-Fetch-Site", "same-origin"),
- ("TE", "trailers"),
- ),
- "chrome": (
- ("Connection", "keep-alive"),
- ("Upgrade-Insecure-Requests", "1"),
- ("User-Agent", "Mozilla/5.0 ({}) AppleWebKit/537.36 (KHTML, "
- "like Gecko) Chrome/111.0.0.0 Safari/537.36"),
- ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,"
- "image/avif,image/webp,image/apng,*/*;q=0.8,"
- "application/signed-exchange;v=b3;q=0.7"),
- ("Referer", None),
- ("Sec-Fetch-Site", "same-origin"),
- ("Sec-Fetch-Mode", "no-cors"),
- ("Sec-Fetch-Dest", "empty"),
- ("Accept-Encoding", None),
- ("Accept-Language", "en-US,en;q=0.9"),
- ("cookie", None),
- ("content-length", None),
- ),
+CACHE_ADAPTERS = {}
+CACHE_COOKIES = {}
+CATEGORY_MAP = ()
+
+
+HEADERS_FIREFOX_140 = (
+ ("User-Agent", "Mozilla/5.0 ({}; rv:140.0) Gecko/20100101 Firefox/140.0"),
+ ("Accept", "text/html,application/xhtml+xml,"
+ "application/xml;q=0.9,*/*;q=0.8"),
+ ("Accept-Language", "en-US,en;q=0.5"),
+ ("Accept-Encoding", None),
+ ("Connection", "keep-alive"),
+ ("Content-Type", None),
+ ("Content-Length", None),
+ ("Referer", None),
+ ("Origin", None),
+ ("Cookie", None),
+ ("Sec-Fetch-Dest", "empty"),
+ ("Sec-Fetch-Mode", "cors"),
+ ("Sec-Fetch-Site", "same-origin"),
+ ("TE", "trailers"),
+)
+HEADERS_FIREFOX_128 = (
+ ("User-Agent", "Mozilla/5.0 ({}; rv:128.0) Gecko/20100101 Firefox/128.0"),
+ ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,"
+ "image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8"),
+ ("Accept-Language", "en-US,en;q=0.5"),
+ ("Accept-Encoding", None),
+ ("Referer", None),
+ ("Connection", "keep-alive"),
+ ("Upgrade-Insecure-Requests", "1"),
+ ("Cookie", None),
+ ("Sec-Fetch-Dest", "empty"),
+ ("Sec-Fetch-Mode", "no-cors"),
+ ("Sec-Fetch-Site", "same-origin"),
+ ("TE", "trailers"),
+)
+HEADERS_CHROMIUM_138 = (
+ ("Connection", "keep-alive"),
+ ("sec-ch-ua", '"Not)A;Brand";v="8", "Chromium";v="138"'),
+ ("sec-ch-ua-mobile", "?0"),
+ ("sec-ch-ua-platform", '"Linux"'),
+ ("Upgrade-Insecure-Requests", "1"),
+ ("User-Agent", "Mozilla/5.0 ({}) AppleWebKit/537.36 (KHTML, "
+ "like Gecko) Chrome/138.0.0.0 Safari/537.36"),
+ ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,"
+ "image/avif,image/webp,image/apng,*/*;q=0.8,"
+ "application/signed-exchange;v=b3;q=0.7"),
+ ("Referer", None),
+ ("Sec-Fetch-Site", "same-origin"),
+ ("Sec-Fetch-Mode", "no-cors"),
+ # ("Sec-Fetch-User", "?1"),
+ ("Sec-Fetch-Dest", "empty"),
+ ("Accept-Encoding", None),
+ ("Accept-Language", "en-US,en;q=0.9"),
+)
+HEADERS_CHROMIUM_111 = (
+ ("Connection", "keep-alive"),
+ ("Upgrade-Insecure-Requests", "1"),
+ ("User-Agent", "Mozilla/5.0 ({}) AppleWebKit/537.36 (KHTML, "
+ "like Gecko) Chrome/111.0.0.0 Safari/537.36"),
+ ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,"
+ "image/avif,image/webp,image/apng,*/*;q=0.8,"
+ "application/signed-exchange;v=b3;q=0.7"),
+ ("Referer", None),
+ ("Sec-Fetch-Site", "same-origin"),
+ ("Sec-Fetch-Mode", "no-cors"),
+ ("Sec-Fetch-Dest", "empty"),
+ ("Accept-Encoding", None),
+ ("Accept-Language", "en-US,en;q=0.9"),
+ ("cookie", None),
+ ("content-length", None),
+)
+HEADERS = {
+ "firefox" : HEADERS_FIREFOX_140,
+ "firefox/140": HEADERS_FIREFOX_140,
+ "firefox/128": HEADERS_FIREFOX_128,
+ "chrome" : HEADERS_CHROMIUM_138,
+ "chrome/138" : HEADERS_CHROMIUM_138,
+ "chrome/111" : HEADERS_CHROMIUM_111,
}
-SSL_CIPHERS = {
- "firefox": (
- "TLS_AES_128_GCM_SHA256:"
- "TLS_CHACHA20_POLY1305_SHA256:"
- "TLS_AES_256_GCM_SHA384:"
- "ECDHE-ECDSA-AES128-GCM-SHA256:"
- "ECDHE-RSA-AES128-GCM-SHA256:"
- "ECDHE-ECDSA-CHACHA20-POLY1305:"
- "ECDHE-RSA-CHACHA20-POLY1305:"
- "ECDHE-ECDSA-AES256-GCM-SHA384:"
- "ECDHE-RSA-AES256-GCM-SHA384:"
- "ECDHE-ECDSA-AES256-SHA:"
- "ECDHE-ECDSA-AES128-SHA:"
- "ECDHE-RSA-AES128-SHA:"
- "ECDHE-RSA-AES256-SHA:"
- "AES128-GCM-SHA256:"
- "AES256-GCM-SHA384:"
- "AES128-SHA:"
- "AES256-SHA"
- ),
- "chrome": (
- "TLS_AES_128_GCM_SHA256:"
- "TLS_AES_256_GCM_SHA384:"
- "TLS_CHACHA20_POLY1305_SHA256:"
- "ECDHE-ECDSA-AES128-GCM-SHA256:"
- "ECDHE-RSA-AES128-GCM-SHA256:"
- "ECDHE-ECDSA-AES256-GCM-SHA384:"
- "ECDHE-RSA-AES256-GCM-SHA384:"
- "ECDHE-ECDSA-CHACHA20-POLY1305:"
- "ECDHE-RSA-CHACHA20-POLY1305:"
- "ECDHE-RSA-AES128-SHA:"
- "ECDHE-RSA-AES256-SHA:"
- "AES128-GCM-SHA256:"
- "AES256-GCM-SHA384:"
- "AES128-SHA:"
- "AES256-SHA"
- ),
+CIPHERS_FIREFOX = (
+ "TLS_AES_128_GCM_SHA256:"
+ "TLS_CHACHA20_POLY1305_SHA256:"
+ "TLS_AES_256_GCM_SHA384:"
+ "ECDHE-ECDSA-AES128-GCM-SHA256:"
+ "ECDHE-RSA-AES128-GCM-SHA256:"
+ "ECDHE-ECDSA-CHACHA20-POLY1305:"
+ "ECDHE-RSA-CHACHA20-POLY1305:"
+ "ECDHE-ECDSA-AES256-GCM-SHA384:"
+ "ECDHE-RSA-AES256-GCM-SHA384:"
+ "ECDHE-ECDSA-AES256-SHA:"
+ "ECDHE-ECDSA-AES128-SHA:"
+ "ECDHE-RSA-AES128-SHA:"
+ "ECDHE-RSA-AES256-SHA:"
+ "AES128-GCM-SHA256:"
+ "AES256-GCM-SHA384:"
+ "AES128-SHA:"
+ "AES256-SHA"
+)
+CIPHERS_CHROMIUM = (
+ "TLS_AES_128_GCM_SHA256:"
+ "TLS_AES_256_GCM_SHA384:"
+ "TLS_CHACHA20_POLY1305_SHA256:"
+ "ECDHE-ECDSA-AES128-GCM-SHA256:"
+ "ECDHE-RSA-AES128-GCM-SHA256:"
+ "ECDHE-ECDSA-AES256-GCM-SHA384:"
+ "ECDHE-RSA-AES256-GCM-SHA384:"
+ "ECDHE-ECDSA-CHACHA20-POLY1305:"
+ "ECDHE-RSA-CHACHA20-POLY1305:"
+ "ECDHE-RSA-AES128-SHA:"
+ "ECDHE-RSA-AES256-SHA:"
+ "AES128-GCM-SHA256:"
+ "AES256-GCM-SHA384:"
+ "AES128-SHA:"
+ "AES256-SHA"
+)
+CIPHERS = {
+ "firefox" : CIPHERS_FIREFOX,
+ "firefox/140": CIPHERS_FIREFOX,
+ "firefox/128": CIPHERS_FIREFOX,
+ "chrome" : CIPHERS_CHROMIUM,
+ "chrome/138" : CIPHERS_CHROMIUM,
+ "chrome/111" : CIPHERS_CHROMIUM,
}
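[annotation] The versioned HEADERS_*/CIPHERS_* profiles replace the old
HTTP_HEADERS/SSL_CIPHERS pair, and the 'headers' and 'ciphers' options now
accept a profile name where previously only an explicit mapping or cipher
string worked. The bare browser names alias the newest profile:

    HEADERS["firefox"] is HEADERS_FIREFOX_140  # True
    CIPHERS["chrome"] is CIPHERS_CHROMIUM      # True
    # in a config file: "headers": "firefox/128", "ciphers": "chrome/111"
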
diff --git a/gallery_dl/extractor/cyberdrop.py b/gallery_dl/extractor/cyberdrop.py
index e150829..b3944f7 100644
--- a/gallery_dl/extractor/cyberdrop.py
+++ b/gallery_dl/extractor/cyberdrop.py
@@ -32,7 +32,7 @@ class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor):
yield Message.Url, file["url"], file
def fetch_album(self, album_id):
- url = "{}/a/{}".format(self.root, album_id)
+ url = f"{self.root}/a/{album_id}"
page = self.request(url).text
extr = text.extract_from(page)
@@ -60,9 +60,9 @@ class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor):
def _extract_files(self, file_ids):
for file_id in file_ids:
try:
- url = "{}/api/file/info/{}".format(self.root_api, file_id)
- file = self.request(url).json()
- auth = self.request(file["auth_url"]).json()
+ url = f"{self.root_api}/api/file/info/{file_id}"
+ file = self.request_json(url)
+ auth = self.request_json(file["auth_url"])
file["url"] = auth["url"]
except Exception as exc:
self.log.warning("%s (%s: %s)",
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index 06c31b9..ff071c5 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2023 Mike Fährmann
+# Copyright 2014-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -49,8 +49,7 @@ class DanbooruExtractor(BaseExtractor):
def items(self):
# 'includes' initialization must be done here and not in '_init()'
# or it'll cause an exception with e621 when 'metadata' is enabled
- includes = self.config("metadata")
- if includes:
+ if includes := self.config("metadata"):
if isinstance(includes, (list, tuple)):
includes = ",".join(includes)
elif not isinstance(includes, str):
@@ -112,8 +111,7 @@ class DanbooruExtractor(BaseExtractor):
def items_artists(self):
for artist in self.artists():
artist["_extractor"] = DanbooruTagExtractor
- url = "{}/posts?tags={}".format(
- self.root, text.quote(artist["name"]))
+ url = f"{self.root}/posts?tags={text.quote(artist['name'])}"
yield Message.Queue, url, artist
def metadata(self):
@@ -129,7 +127,7 @@ class DanbooruExtractor(BaseExtractor):
first = True
while True:
- posts = self.request(url, params=params).json()
+ posts = self.request_json(url, params=params)
if isinstance(posts, dict):
posts = posts["posts"]
@@ -142,8 +140,7 @@ class DanbooruExtractor(BaseExtractor):
}
data = {
meta["id"]: meta
- for meta in self.request(
- url, params=params_meta).json()
+ for meta in self.request_json(url, params=params_meta)
}
for post in posts:
post.update(data[post["id"]])
@@ -157,7 +154,7 @@ class DanbooruExtractor(BaseExtractor):
return
if prefix:
- params["page"] = "{}{}".format(prefix, posts[-1]["id"])
+ params["page"] = f"{prefix}{posts[-1]['id']}"
elif params["page"]:
params["page"] += 1
else:
@@ -165,11 +162,17 @@ class DanbooruExtractor(BaseExtractor):
first = False
def _ugoira_frames(self, post):
- data = self.request("{}/posts/{}.json?only=media_metadata".format(
- self.root, post["id"])
- ).json()["media_metadata"]["metadata"]
+ data = self.request_json(
+ f"{self.root}/posts/{post['id']}.json?only=media_metadata"
+ )["media_metadata"]["metadata"]
+
+ if "Ugoira:FrameMimeType" in data:
+ ext = data["Ugoira:FrameMimeType"].rpartition("/")[2]
+ if ext == "jpeg":
+ ext = "jpg"
+ else:
+ ext = data["ZIP:ZipFileName"].rpartition(".")[2]
- ext = data["ZIP:ZipFileName"].rpartition(".")[2]
fmt = ("{:>06}." + ext).format
delays = data["Ugoira:FrameDelays"]
return [{"file": fmt(index), "delay": delay}
@@ -180,15 +183,15 @@ class DanbooruExtractor(BaseExtractor):
order = self.config("order-posts")
if not order or order in {"asc", "pool", "pool_asc", "asc_pool"}:
- params = {"tags": "ord{}:{}".format(ctype, cid)}
+ params = {"tags": f"ord{ctype}:{cid}"}
elif order in {"id", "desc_id", "id_desc"}:
- params = {"tags": "{}:{}".format(ctype, cid)}
+ params = {"tags": f"{ctype}:{cid}"}
prefix = "b"
elif order in {"desc", "desc_pool", "pool_desc"}:
- params = {"tags": "ord{}:{}".format(ctype, cid)}
+ params = {"tags": f"ord{ctype}:{cid}"}
reverse = True
elif order in {"asc_id", "id_asc"}:
- params = {"tags": "{}:{}".format(ctype, cid)}
+ params = {"tags": f"{ctype}:{cid}"}
reverse = True
posts = self._pagination("/posts.json", params, prefix)
@@ -199,8 +202,8 @@ class DanbooruExtractor(BaseExtractor):
return self._collection_enumerate(posts)
def _collection_metadata(self, cid, ctype, cname=None):
- url = "{}/{}s/{}.json".format(self.root, cname or ctype, cid)
- collection = self.request(url).json()
+ url = f"{self.root}/{cname or ctype}s/{cid}.json"
+ collection = self.request_json(url)
collection["name"] = collection["name"].replace("_", " ")
self.post_ids = collection.pop("post_ids", ())
return {ctype: collection}
@@ -315,11 +318,11 @@ class DanbooruPostExtractor(DanbooruExtractor):
example = "https://danbooru.donmai.us/posts/12345"
def posts(self):
- url = "{}/posts/{}.json".format(self.root, self.groups[-1])
- post = self.request(url).json()
+ url = f"{self.root}/posts/{self.groups[-1]}.json"
+ post = self.request_json(url)
if self.includes:
params = {"only": self.includes}
- post.update(self.request(url, params=params).json())
+ post.update(self.request_json(url, params=params))
return (post,)
@@ -357,8 +360,8 @@ class DanbooruArtistExtractor(DanbooruExtractor):
items = DanbooruExtractor.items_artists
def artists(self):
- url = "{}/artists/{}.json".format(self.root, self.groups[-1])
- return (self.request(url).json(),)
+ url = f"{self.root}/artists/{self.groups[-1]}.json"
+ return (self.request_json(url),)
class DanbooruArtistSearchExtractor(DanbooruExtractor):
@@ -375,7 +378,7 @@ class DanbooruArtistSearchExtractor(DanbooruExtractor):
params["page"] = text.parse_int(params.get("page"), 1)
while True:
- artists = self.request(url, params=params).json()
+ artists = self.request_json(url, params=params)
yield from artists
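[annotation] The ugoira change prefers 'Ugoira:FrameMimeType' for the frame
extension (normalizing 'jpeg' to 'jpg') and only falls back to the ZIP file
name when that key is absent. A worked sketch with sample metadata:

    data = {"Ugoira:FrameMimeType": "image/jpeg",
            "Ugoira:FrameDelays": [125, 125, 125]}

    ext = data["Ugoira:FrameMimeType"].rpartition("/")[2]
    if ext == "jpeg":
        ext = "jpg"
    fmt = ("{:>06}." + ext).format
    [fmt(i) for i in range(3)]
    # -> ['000000.jpg', '000001.jpg', '000002.jpg']
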
diff --git a/gallery_dl/extractor/dankefuerslesen.py b/gallery_dl/extractor/dankefuerslesen.py
new file mode 100644
index 0000000..a2b0f42
--- /dev/null
+++ b/gallery_dl/extractor/dankefuerslesen.py
@@ -0,0 +1,120 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://danke.moe/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text, util
+from ..cache import memcache
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?danke\.moe"
+
+
+class DankefuerslesenBase():
+ """Base class for dankefuerslesen extractors"""
+ category = "dankefuerslesen"
+ root = "https://danke.moe"
+
+ @memcache(keyarg=1)
+ def _manga_info(self, slug):
+ url = f"{self.root}/api/series/{slug}/"
+ return self.request_json(url)
+
+
+class DankefuerslesenChapterExtractor(DankefuerslesenBase, ChapterExtractor):
+ """Extractor for Danke fürs Lesen manga chapters"""
+ pattern = BASE_PATTERN + r"/read/manga/([\w-]+)/([\w-]+)"
+ example = "https://danke.moe/read/manga/TITLE/123/1/"
+
+ def _init(self):
+ self.zip = self.config("zip", False)
+ if self.zip:
+ self.filename_fmt = f"{self.directory_fmt[-1]}.{{extension}}"
+ self.directory_fmt = self.directory_fmt[:-1]
+
+ def metadata(self, page):
+ slug, ch = self.groups
+ manga = self._manga_info(slug)
+
+ if "-" in ch:
+ chapter, sep, minor = ch.rpartition("-")
+ ch = ch.replace("-", ".")
+ minor = "." + minor
+ else:
+ chapter = ch
+ minor = ""
+
+ data = manga["chapters"][ch]
+ group_id, self._files = next(iter(data["groups"].items()))
+
+ if not self.zip:
+ self.base = (f"{self.root}/media/manga/{slug}/chapters"
+ f"/{data['folder']}/{group_id}/")
+
+ return {
+ "manga" : manga["title"],
+ "manga_slug": manga["slug"],
+ "title" : data["title"],
+ "volume" : text.parse_int(data["volume"]),
+ "chapter" : text.parse_int(chapter),
+ "chapter_minor": minor,
+ "group" : manga["groups"][group_id].split(" & "),
+ "group_id" : text.parse_int(group_id),
+ "date" : text.parse_timestamp(data["release_date"][group_id]),
+ "lang" : util.NONE,
+ "language" : util.NONE,
+ }
+
+ def images(self, page):
+ if self.zip:
+ return ()
+
+ base = self.base
+ return [(base + file, None) for file in self._files]
+
+ def assets(self, page):
+ if self.zip:
+ slug, ch = self.groups
+ url = f"{self.root}/api/download_chapter/{slug}/{ch}/"
+ return ({
+ "type" : "archive",
+ "extension": "zip",
+ "url" : url,
+ },)
+
+
+class DankefuerslesenMangaExtractor(DankefuerslesenBase, MangaExtractor):
+ """Extractor for Danke fürs Lesen manga"""
+ chapterclass = DankefuerslesenChapterExtractor
+ reverse = False
+ pattern = BASE_PATTERN + r"/read/manga/([^/?#]+)"
+ example = "https://danke.moe/read/manga/TITLE/"
+
+ def chapters(self, page):
+ results = []
+
+ manga = self._manga_info(self.groups[0]).copy()
+ manga["lang"] = util.NONE
+ manga["language"] = util.NONE
+
+ base = f"{self.root}/read/manga/{manga['slug']}/"
+ for ch, data in manga.pop("chapters").items():
+
+ if "." in ch:
+ chapter, sep, minor = ch.rpartition(".")
+ ch = ch.replace('.', '-')
+ data["chapter"] = text.parse_int(chapter)
+ data["chapter_minor"] = sep + minor
+ else:
+ data["chapter"] = text.parse_int(ch)
+ data["chapter_minor"] = ""
+
+ manga.update(data)
+ results.append((f"{base}{ch}/1/", manga))
+
+ return results
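[annotation] DankefuerslesenChapterExtractor is the first consumer of the
GalleryExtractor.assets() hook added in the common.py hunks above: with "zip"
enabled, images() returns nothing and assets() delivers the whole chapter as a
single archive entry, which items() merges with the chapter metadata, assigns
enumeration index 0, and yields as an ordinary Message.Url. The dict it
produces (slug and chapter values are placeholders):

    ({
        "type"     : "archive",
        "extension": "zip",
        "url"      : "https://danke.moe/api/download_chapter/TITLE/123/",
    },)
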
diff --git a/gallery_dl/extractor/desktopography.py b/gallery_dl/extractor/desktopography.py
index 35bb299..364d88f 100644
--- a/gallery_dl/extractor/desktopography.py
+++ b/gallery_dl/extractor/desktopography.py
@@ -46,10 +46,10 @@ class DesktopographyExhibitionExtractor(DesktopographyExtractor):
def __init__(self, match):
DesktopographyExtractor.__init__(self, match)
- self.year = match.group(1)
+ self.year = match[1]
def items(self):
- url = "{}/exhibition-{}/".format(self.root, self.year)
+ url = f"{self.root}/exhibition-{self.year}/"
base_entry_url = "https://desktopography.net/portfolios/"
page = self.request(url).text
@@ -75,10 +75,10 @@ class DesktopographyEntryExtractor(DesktopographyExtractor):
def __init__(self, match):
DesktopographyExtractor.__init__(self, match)
- self.entry = match.group(1)
+ self.entry = match[1]
def items(self):
- url = "{}/portfolios/{}".format(self.root, self.entry)
+ url = f"{self.root}/portfolios/{self.entry}"
page = self.request(url).text
entry_data = {"entry": self.entry}
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 37f57fe..66e2a1e 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2023 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,14 +8,13 @@
"""Extractors for https://www.deviantart.com/"""
-from .common import Extractor, Message
+from .common import Extractor, Message, Dispatch
from .. import text, util, exception
from ..cache import cache, memcache
import collections
import mimetypes
import binascii
import time
-import re
BASE_PATTERN = (
r"(?:https?://)?(?:"
@@ -37,7 +36,7 @@ class DeviantartExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.user = (match.group(1) or match.group(2) or "").lower()
+ self.user = (match[1] or match[2] or "").lower()
self.offset = 0
def _init(self):
@@ -56,8 +55,7 @@ class DeviantartExtractor(Extractor):
self.group = False
self._premium_cache = {}
- unwatch = self.config("auto-unwatch")
- if unwatch:
+ if self.config("auto-unwatch"):
self.unwatch = []
self.finalize = self._unwatch_premium
else:
@@ -66,10 +64,13 @@ class DeviantartExtractor(Extractor):
if self.quality:
if self.quality == "png":
self.quality = "-fullview.png?"
- self.quality_sub = re.compile(r"-fullview\.[a-z0-9]+\?").sub
+ self.quality_sub = util.re(r"-fullview\.[a-z0-9]+\?").sub
else:
- self.quality = ",q_{}".format(self.quality)
- self.quality_sub = re.compile(r",q_\d+").sub
+ self.quality = f",q_{self.quality}"
+ self.quality_sub = util.re(r",q_\d+").sub
+
+ if self.intermediary:
+ self.intermediary_subn = util.re(r"(/f/[^/]+/[^/]+)/v\d+/.*").subn
if isinstance(self.original, str) and \
self.original.lower().startswith("image"):
@@ -116,15 +117,13 @@ class DeviantartExtractor(Extractor):
def items(self):
if self.user:
- group = self.config("group", True)
- if group:
- user = _user_details(self, self.user)
- if user:
+ if group := self.config("group", True):
+ if user := _user_details(self, self.user):
self.user = user["username"]
self.group = False
elif group == "skip":
self.log.info("Skipping group '%s'", self.user)
- raise exception.StopExtraction()
+ raise exception.AbortExtraction()
else:
self.subcategory = "group-" + self.subcategory
self.group = True
@@ -177,8 +176,7 @@ class DeviantartExtractor(Extractor):
yield self.commit(deviation, deviation["flash"])
if self.commit_journal:
- journal = self._extract_journal(deviation)
- if journal:
+ if journal := self._extract_journal(deviation):
if self.extra:
deviation["_journal"] = journal["html"]
deviation["is_original"] = True
@@ -194,7 +192,7 @@ class DeviantartExtractor(Extractor):
continue
_user_details.update(name, user)
- url = "{}/{}/avatar/".format(self.root, name)
+ url = f"{self.root}/{name}/avatar/"
comment["_extractor"] = DeviantartAvatarExtractor
yield Message.Queue, url, comment
@@ -225,7 +223,7 @@ class DeviantartExtractor(Extractor):
if txt is None:
continue
for match in DeviantartStashExtractor.pattern.finditer(txt):
- url = text.ensure_http_scheme(match.group(0))
+ url = text.ensure_http_scheme(match[0])
deviation["_extractor"] = DeviantartStashExtractor
yield Message.Queue, url, deviation
@@ -271,15 +269,14 @@ class DeviantartExtractor(Extractor):
)
# filename metadata
- sub = re.compile(r"\W").sub
+ sub = util.re(r"\W").sub
deviation["filename"] = "".join((
sub("_", deviation["title"].lower()), "_by_",
sub("_", deviation["author"]["username"].lower()), "-d",
deviation["index_base36"],
))
- @staticmethod
- def commit(deviation, target):
+ def commit(self, deviation, target):
url = target["src"]
name = target.get("filename") or url
target = target.copy()
@@ -321,7 +318,7 @@ class DeviantartExtractor(Extractor):
header = HEADER_TEMPLATE.format(
title=title,
url=url,
- userurl="{}/{}/".format(self.root, urlname),
+ userurl=f"{self.root}/{urlname}/",
username=username,
date=deviation["date"],
)
@@ -388,8 +385,7 @@ class DeviantartExtractor(Extractor):
deviations = state["@@entities"]["deviation"]
content = deviations.popitem()[1]["textContent"]
- html = self._textcontent_to_html(deviation, content)
- if html:
+ if html := self._textcontent_to_html(deviation, content):
return {"html": html}
return {"html": content["excerpt"].replace("\n", "<br />")}
@@ -431,12 +427,11 @@ class DeviantartExtractor(Extractor):
type = content["type"]
if type == "paragraph":
- children = content.get("content")
- if children:
+ if children := content.get("content"):
html.append('<p style="')
attrs = content["attrs"]
- if "textAlign" in attrs:
+ if attrs.get("textAlign"):
html.append("text-align:")
html.append(attrs["textAlign"])
html.append(";")
@@ -546,8 +541,7 @@ class DeviantartExtractor(Extractor):
self.log.warning("Unsupported content type '%s'", type)
def _tiptap_process_text(self, html, content):
- marks = content.get("marks")
- if marks:
+ if marks := content.get("marks"):
close = []
for mark in marks:
type = mark["type"]
@@ -586,8 +580,7 @@ class DeviantartExtractor(Extractor):
html.append(text.escape(content["text"]))
def _tiptap_process_children(self, html, content):
- children = content.get("content")
- if children:
+ if children := content.get("content"):
for block in children:
self._tiptap_process_content(html, block)
@@ -666,8 +659,7 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\
if content["src"].startswith("https://images-wixmp-"):
if self.intermediary and deviation["index"] <= 790677560:
# https://github.com/r888888888/danbooru/issues/4069
- intermediary, count = re.subn(
- r"(/f/[^/]+/[^/]+)/v\d+/.*",
+ intermediary, count = self.intermediary_subn(
r"/intermediary\1", content["src"], 1)
if count:
deviation["is_original"] = False
@@ -679,11 +671,10 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\
return content
- @staticmethod
- def _find_folder(folders, name, uuid):
+ def _find_folder(self, folders, name, uuid):
if uuid.isdecimal():
- match = re.compile(name.replace(
- "-", r"[^a-z0-9]+") + "$", re.IGNORECASE).match
+ match = util.re(
+ "(?i)" + name.replace("-", "[^a-z0-9]+") + "$").match
for folder in folders:
if match(folder["name"]):
return folder
@@ -702,10 +693,10 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\
raise exception.NotFoundError("folder")
def _folder_urls(self, folders, category, extractor):
- base = "{}/{}/{}/".format(self.root, self.user, category)
+ base = f"{self.root}/{self.user}/{category}/"
for folder in folders:
folder["_extractor"] = extractor
- url = "{}{}/{}".format(base, folder["folderid"], folder["name"])
+ url = f"{base}{folder['folderid']}/{folder['name']}"
yield url, folder
def _update_content_default(self, deviation, content):
@@ -748,13 +739,10 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\
deviation["_fallback"] = (content["src"],)
deviation["is_original"] = True
+ pl = binascii.b2a_base64(payload).rstrip(b'=\n').decode()
content["src"] = (
- "{}?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJub25lIn0.{}.".format(
- url,
- # base64 of 'header' is precomputed as 'eyJ0eX...'
- # binascii.b2a_base64(header).rstrip(b"=\n").decode(),
- binascii.b2a_base64(payload).rstrip(b"=\n").decode())
- )
+ # base64 of 'header' is precomputed as 'eyJ0eX...'
+ f"{url}?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJub25lIn0.{pl}.")
def _extract_comments(self, target_id, target_type="deviation"):
results = None
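[annotation] The token refactor makes the precomputed segment easier to verify:
it is the base64 form (padding stripped) of the constant JWT header
{"typ":"JWT","alg":"none"}, so only the payload half needs encoding at runtime:

    import binascii

    header = b'{"typ":"JWT","alg":"none"}'
    binascii.b2a_base64(header).rstrip(b"=\n").decode()
    # -> 'eyJ0eXAiOiJKV1QiLCJhbGciOiJub25lIn0'
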
@@ -845,8 +833,7 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\
for fmt in media["types"]
}
- tokens = media.get("token") or ()
- if tokens:
+ if tokens := media.get("token") or ():
if len(tokens) <= 1:
fmt = formats[format]
if "c" in fmt:
@@ -873,19 +860,13 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\
.replace("\\\\", "\\")
-class DeviantartUserExtractor(DeviantartExtractor):
+class DeviantartUserExtractor(Dispatch, DeviantartExtractor):
"""Extractor for an artist's user profile"""
- subcategory = "user"
pattern = BASE_PATTERN + r"/?$"
example = "https://www.deviantart.com/USER"
- def initialize(self):
- pass
-
- skip = Extractor.skip
-
def items(self):
- base = "{}/{}/".format(self.root, self.user)
+ base = f"{self.root}/{self.user}/"
return self._dispatch_extractors((
(DeviantartAvatarExtractor , base + "avatar"),
(DeviantartBackgroundExtractor, base + "banner"),
@@ -950,8 +931,8 @@ class DeviantartAvatarExtractor(DeviantartExtractor):
fmt, _, ext = fmt.rpartition(".")
if fmt:
fmt = "-" + fmt
- url = "https://a.deviantart.net/avatars{}/{}/{}/{}.{}?{}".format(
- fmt, name[0], name[1], name, ext, index)
+ url = (f"https://a.deviantart.net/avatars{fmt}"
+ f"/{name[0]}/{name[1]}/{name}.{ext}?{index}")
results.append(self._make_deviation(url, user, index, fmt))
return results
@@ -995,8 +976,8 @@ class DeviantartFolderExtractor(DeviantartExtractor):
def __init__(self, match):
DeviantartExtractor.__init__(self, match)
self.folder = None
- self.folder_id = match.group(3)
- self.folder_name = match.group(4)
+ self.folder_id = match[3]
+ self.folder_name = match[4]
def deviations(self):
folders = self.api.gallery_folders(self.user)
@@ -1049,7 +1030,7 @@ class DeviantartStashExtractor(DeviantartExtractor):
def __init__(self, match):
DeviantartExtractor.__init__(self, match)
- self.user = None
+ self.user = ""
def deviations(self, stash_id=None, stash_data=None):
if stash_id is None:
@@ -1067,8 +1048,7 @@ class DeviantartStashExtractor(DeviantartExtractor):
page = self._limited_request(url).text
if stash_id[0] == "0":
- uuid = text.extr(page, '//deviation/', '"')
- if uuid:
+ if uuid := text.extr(page, '//deviation/', '"'):
deviation = self.api.deviation(uuid)
deviation["_page"] = page
deviation["index"] = text.parse_int(text.extr(
@@ -1091,8 +1071,7 @@ class DeviantartStashExtractor(DeviantartExtractor):
yield deviation
return
- stash_data = text.extr(page, ',\\"stash\\":', ',\\"@@')
- if stash_data:
+ if stash_data := text.extr(page, ',\\"stash\\":', ',\\"@@'):
stash_data = util.json_loads(self._unescape_json(stash_data))
for sid in text.extract_iter(
@@ -1130,8 +1109,8 @@ class DeviantartCollectionExtractor(DeviantartExtractor):
def __init__(self, match):
DeviantartExtractor.__init__(self, match)
self.collection = None
- self.collection_id = match.group(3)
- self.collection_name = match.group(4)
+ self.collection_id = match[3]
+ self.collection_name = match[4]
def deviations(self):
folders = self.api.collections_folders(self.user)
@@ -1173,15 +1152,15 @@ class DeviantartStatusExtractor(DeviantartExtractor):
def deviations(self):
for status in self.api.user_statuses(self.user, self.offset):
- yield from self.status(status)
+ yield from self.process_status(status)
- def status(self, status):
+ def process_status(self, status):
for item in status.get("items") or (): # do not trust is_share
# shared deviations/statuses
if "deviation" in item:
yield item["deviation"].copy()
if "status" in item:
- yield from self.status(item["status"].copy())
+ yield from self.process_status(item["status"].copy())
# assume is_deleted == true means necessary fields are missing
if status["is_deleted"]:
self.log.warning(
@@ -1233,7 +1212,8 @@ class DeviantartTagExtractor(DeviantartExtractor):
def __init__(self, match):
DeviantartExtractor.__init__(self, match)
- self.tag = text.unquote(match.group(1))
+ self.tag = text.unquote(match[1])
+ self.user = ""
def deviations(self):
return self.api.browse_tags(self.tag, self.offset)
@@ -1282,16 +1262,16 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
def __init__(self, match):
DeviantartExtractor.__init__(self, match)
- self.type = match.group(3)
+ self.type = match[3]
self.deviation_id = \
- match.group(4) or match.group(5) or id_from_base36(match.group(6))
+ match[4] or match[5] or id_from_base36(match[6])
def deviations(self):
if self.user:
- url = "{}/{}/{}/{}".format(
- self.root, self.user, self.type or "art", self.deviation_id)
+ url = (f"{self.root}/{self.user}"
+ f"/{self.type or 'art'}/{self.deviation_id}")
else:
- url = "{}/view/{}/".format(self.root, self.deviation_id)
+ url = f"{self.root}/view/{self.deviation_id}/"
page = self._limited_request(url, notfound="deviation").text
uuid = text.extr(page, '"deviationUuid\\":\\"', '\\')
@@ -1379,7 +1359,7 @@ class DeviantartSearchExtractor(DeviantartExtractor):
response = self.request(url, params=params)
if response.history and "/users/login" in response.url:
- raise exception.StopExtraction("HTTP redirect to login page")
+ raise exception.AbortExtraction("HTTP redirect to login page")
page = response.text
for dev in DeviantartDeviationExtractor.pattern.findall(
@@ -1405,7 +1385,7 @@ class DeviantartGallerySearchExtractor(DeviantartExtractor):
def __init__(self, match):
DeviantartExtractor.__init__(self, match)
- self.query = match.group(3)
+ self.query = match[3]
def deviations(self):
self.login()
@@ -1437,7 +1417,7 @@ class DeviantartFollowingExtractor(DeviantartExtractor):
api = DeviantartOAuthAPI(self)
for user in api.user_friends(self.user):
- url = "{}/{}".format(self.root, user["user"]["username"])
+ url = f"{self.root}/{user['user']['username']}"
user["_extractor"] = DeviantartUserExtractor
yield Message.Queue, url, user
@@ -1470,8 +1450,7 @@ class DeviantartOAuthAPI():
self.folders = extractor.config("folders", False)
self.public = extractor.config("public", True)
- client_id = extractor.config("client-id")
- if client_id:
+ if client_id := extractor.config("client-id"):
self.client_id = str(client_id)
self.client_secret = extractor.config("client-secret")
else:
@@ -1585,7 +1564,7 @@ class DeviantartOAuthAPI():
def comments(self, target_id, target_type="deviation",
comment_id=None, offset=0):
"""Fetch comments posted on a target"""
- endpoint = "/comments/{}/{}".format(target_type, target_id)
+ endpoint = f"/comments/{target_type}/{target_id}"
params = {
"commentid" : comment_id,
"maxdepth" : "5",
@@ -1639,7 +1618,7 @@ class DeviantartOAuthAPI():
def deviation_metadata(self, deviations):
""" Fetch deviation metadata for a set of deviations"""
endpoint = "/deviation/metadata?" + "&".join(
- "deviationids[{}]={}".format(num, deviation["deviationid"])
+ f"deviationids[{num}]={deviation['deviationid']}"
for num, deviation in enumerate(deviations)
)
return self._call(
@@ -1746,8 +1725,8 @@ class DeviantartOAuthAPI():
if response.status_code != 200:
self.log.debug("Server response: %s", data)
- raise exception.AuthenticationError('"{}" ({})'.format(
- data.get("error_description"), data.get("error")))
+ raise exception.AuthenticationError(
+ f"\"{data.get('error_description')}\" ({data.get('error')})")
if refresh_token_key:
_refresh_token_cache.update(
refresh_token_key, data["refresh_token"])
@@ -1790,8 +1769,7 @@ class DeviantartOAuthAPI():
raise exception.AuthorizationError()
self.log.debug(response.text)
- msg = "API responded with {} {}".format(
- status, response.reason)
+ msg = f"API responded with {status} {response.reason}"
if status == 429:
if self.delay < 30:
self.delay += 1
@@ -1889,12 +1867,9 @@ class DeviantartOAuthAPI():
params["offset"] = int(params["offset"]) + len(results)
def _pagination_list(self, endpoint, params, key="results"):
- result = []
- result.extend(self._pagination(endpoint, params, False, key=key))
- return result
+ return list(self._pagination(endpoint, params, False, key=key))
- @staticmethod
- def _shared_content(results):
+ def _shared_content(self, results):
"""Return an iterable of shared deviations in 'results'"""
for result in results:
for item in result.get("items") or ():
@@ -2075,7 +2050,7 @@ class DeviantartEclipseAPI():
params["offset"] = int(params["offset"]) + len(results)
def _ids_watching(self, user):
- url = "{}/{}/about".format(self.extractor.root, user)
+ url = f"{self.extractor.root}/{user}/about"
page = self.request(url).text
gruser_id = text.extr(page, ' data-userid="', '"')
@@ -2083,8 +2058,7 @@ class DeviantartEclipseAPI():
pos = page.find('\\"name\\":\\"watching\\"')
if pos < 0:
raise exception.NotFoundError("'watching' module ID")
- module_id = text.rextract(
- page, '\\"id\\":', ',', pos)[0].strip('" ')
+ module_id = text.rextr(page, '\\"id\\":', ',', pos).strip('" ')
self._fetch_csrf_token(page)
return gruser_id, module_id
diff --git a/gallery_dl/extractor/directlink.py b/gallery_dl/extractor/directlink.py
index 4559aff..85358ba 100644
--- a/gallery_dl/extractor/directlink.py
+++ b/gallery_dl/extractor/directlink.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2017-2023 Mike Fährmann
+# Copyright 2017-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -24,9 +24,9 @@ class DirectlinkExtractor(Extractor):
example = "https://en.wikipedia.org/static/images/project-logos/enwiki.png"
def __init__(self, match):
- Extractor.__init__(self, match)
self.data = data = match.groupdict()
self.subcategory = ".".join(data["domain"].rsplit(".", 2)[-2:])
+ Extractor.__init__(self, match)
def items(self):
data = self.data
diff --git a/gallery_dl/extractor/discord.py b/gallery_dl/extractor/discord.py
index ac21fec..216e486 100644
--- a/gallery_dl/extractor/discord.py
+++ b/gallery_dl/extractor/discord.py
@@ -22,8 +22,6 @@ class DiscordExtractor(Extractor):
filename_fmt = "{message_id}_{num:>02}_{filename}.{extension}"
archive_fmt = "{message_id}_{num}"
- cdn_fmt = "https://cdn.discordapp.com/{}/{}/{}.png?size=4096"
-
server_metadata = {}
server_channels_metadata = {}
@@ -86,44 +84,50 @@ class DiscordExtractor(Extractor):
):
if message["author"].get(icon_type):
message_metadata["author_files"].append({
- "url": self.cdn_fmt.format(
- icon_path,
- message_metadata["author_id"],
- message["author"][icon_type]
- ),
+ "url": (f"https://cdn.discordapp.com/{icon_path}/"
+ f"{message_metadata['author_id']}/"
+ f"{message['author'][icon_type]}.png"
+ f"?size=4096"),
"filename": icon_type,
"extension": "png",
})
- for attachment in message["attachments"]:
- message_metadata["files"].append({
- "url": attachment["url"],
- "type": "attachment",
- })
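+        # process the message itself plus any forwarded-message snapshots
+        # (regular, reply, and thread-starter message types)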
+ message_snapshots = [message]
+ message_snapshots.extend(
+ msg["message"] for msg in message.get("message_snapshots", [])
+ if msg["message"]["type"] in (0, 19, 21)
+ )
+
+ for snapshot in message_snapshots:
+ for attachment in snapshot["attachments"]:
+ message_metadata["files"].append({
+ "url": attachment["url"],
+ "type": "attachment",
+ })
- for embed in message["embeds"]:
- if embed["type"] in self.enabled_embeds:
- for field in ("video", "image", "thumbnail"):
- if field not in embed:
- continue
- url = embed[field].get("proxy_url")
- if url is not None:
- message_metadata["files"].append({
- "url": url,
- "type": "embed",
- })
- break
-
- for num, file in enumerate(message_metadata["files"], start=1):
- text.nameext_from_url(file["url"], file)
- file["num"] = num
-
- yield Message.Directory, message_metadata
-
- for file in message_metadata["files"]:
- message_metadata_file = message_metadata.copy()
- message_metadata_file.update(file)
- yield Message.Url, file["url"], message_metadata_file
+ for embed in snapshot["embeds"]:
+ if embed["type"] in self.enabled_embeds:
+ for field in ("video", "image", "thumbnail"):
+ if field not in embed:
+ continue
+ url = embed[field].get("proxy_url")
+ if url is not None:
+ message_metadata["files"].append({
+ "url": url,
+ "type": "embed",
+ })
+ break
+
+ for num, file in enumerate(message_metadata["files"], start=1):
+ text.nameext_from_url(file["url"], file)
+ file["num"] = num
+
+ yield Message.Directory, message_metadata
+
+ for file in message_metadata["files"]:
+ message_metadata_file = message_metadata.copy()
+ message_metadata_file.update(file)
+ yield Message.Url, file["url"], message_metadata_file
def extract_channel_text(self, channel_id):
for message in self.api.get_channel_messages(channel_id):
@@ -158,7 +162,7 @@ class DiscordExtractor(Extractor):
yield from self.extract_channel(
channel["channel_id"], safe=True)
elif not safe:
- raise exception.StopExtraction(
+ raise exception.AbortExtraction(
"This channel type is not supported."
)
except exception.HttpError as exc:
@@ -215,11 +219,9 @@ class DiscordExtractor(Extractor):
):
if server.get(icon_type):
self.server_metadata["server_files"].append({
- "url": self.cdn_fmt.format(
- icon_path,
- self.server_metadata["server_id"],
- server[icon_type]
- ),
+ "url": (f"https://cdn.discordapp.com/{icon_path}/"
+ f"{self.server_metadata['server_id']}/"
+ f"{server[icon_type]}.png?size=4096"),
"filename": icon_type,
"extension": "png",
})
@@ -342,7 +344,7 @@ class DiscordAPI():
"sort_order": "desc",
"limit": THREADS_BATCH,
"offset": + offset,
- })["threads"]
+ }).get("threads", [])
return self._pagination(_method, THREADS_BATCH)
@@ -391,8 +393,7 @@ class DiscordAPI():
return
offset += len(data)
- @staticmethod
- def _raise_invalid_token():
+ def _raise_invalid_token(self):
raise exception.AuthenticationError("""Invalid or missing token.
Please provide a valid token following these instructions:
diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py
index 583869f..3e0424d 100644
--- a/gallery_dl/extractor/dynastyscans.py
+++ b/gallery_dl/extractor/dynastyscans.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2023 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -10,7 +10,6 @@
from .common import ChapterExtractor, MangaExtractor, Extractor, Message
from .. import text, util
-import re
BASE_PATTERN = r"(?:https?://)?(?:www\.)?dynasty-scans\.com"
@@ -21,7 +20,7 @@ class DynastyscansBase():
root = "https://dynasty-scans.com"
def _parse_image_page(self, image_id):
- url = "{}/images/{}".format(self.root, image_id)
+ url = f"{self.root}/images/{image_id}"
extr = text.extract_from(self.request(url).text)
date = extr("class='create_at'>", "</span>")
@@ -47,20 +46,19 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
def metadata(self, page):
extr = text.extract_from(page)
- match = re.match(
- (r"(?:<a[^>]*>)?([^<]+)(?:</a>)?" # manga name
- r"(?: ch(\d+)([^:<]*))?" # chapter info
- r"(?:: (.+))?"), # title
- extr("<h3 id='chapter-title'><b>", "</b>"),
- )
+ match = util.re(
+ r"(?:<a[^>]*>)?([^<]+)(?:</a>)?" # manga name
+ r"(?: ch(\d+)([^:<]*))?" # chapter info
+ r"(?:: (.+))?" # title
+ ).match(extr("<h3 id='chapter-title'><b>", "</b>"))
author = extr(" by ", "</a>")
group = extr('"icon-print"></i> ', '</span>')
return {
- "manga" : text.unescape(match.group(1)),
- "chapter" : text.parse_int(match.group(2)),
- "chapter_minor": match.group(3) or "",
- "title" : text.unescape(match.group(4) or ""),
+ "manga" : text.unescape(match[1]),
+ "chapter" : text.parse_int(match[2]),
+ "chapter_minor": match[3] or "",
+ "title" : text.unescape(match[4] or ""),
"author" : text.remove_html(author),
"group" : (text.remove_html(group) or
text.extr(group, ' alt="', '"')),
@@ -104,7 +102,7 @@ class DynastyscansSearchExtractor(DynastyscansBase, Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.query = match.group(1) or ""
+ self.query = match[1] or ""
def items(self):
yield Message.Directory, {}
@@ -133,3 +131,43 @@ class DynastyscansImageExtractor(DynastyscansSearchExtractor):
def images(self):
return (self.query,)
+
+
+class DynastyscansAnthologyExtractor(DynastyscansSearchExtractor):
+ """Extractor for dynasty-scans anthologies"""
+ subcategory = "anthology"
+ pattern = BASE_PATTERN + r"/anthologies/([^/?#]+)"
+ example = "https://dynasty-scans.com/anthologies/TITLE"
+
+ def items(self):
+ url = f"{self.root}/anthologies/{self.groups[0]}.atom"
+ root = self.request_xml(url, xmlns=False)
+
+ data = {
+ "_extractor": DynastyscansChapterExtractor,
+ "anthology" : root[3].text[28:],
+ }
+
+ if self.config("metadata", False):
+ page = self.request(url[:-5]).text
+ alert = text.extr(page, "<div class='alert", "</div>")
+
+ data["alert"] = text.split_html(alert)[1:] if alert else ()
+ data["status"] = text.extr(
+ page, "<small>&mdash; ", "</small>")
+ data["description"] = text.extr(
+ page, "<div class='description'>", "</div>")
+
+ for element in root:
+ if element.tag != "entry":
+ continue
+ content = element[6][0]
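+            # slices drop the "Author: ", "Scanlator: ", and "Tags: " labels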
+ data["author"] = content[0].text[8:]
+ data["scanlator"] = content[1].text[11:]
+ data["tags"] = content[2].text[6:].lower().split(", ")
+ data["title"] = element[5].text
+ data["date"] = text.parse_datetime(
+ element[1].text, "%Y-%m-%dT%H:%M:%S%z")
+ data["date_updated"] = text.parse_datetime(
+ element[2].text, "%Y-%m-%dT%H:%M:%S%z")
+ yield Message.Queue, element[4].text, data
diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py
index 76ea792..71c3b30 100644
--- a/gallery_dl/extractor/e621.py
+++ b/gallery_dl/extractor/e621.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2023 Mike Fährmann
+# Copyright 2014-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -24,8 +24,7 @@ class E621Extractor(danbooru.DanbooruExtractor):
request_interval_min = 1.0
def items(self):
- includes = self.config("metadata") or ()
- if includes:
+ if includes := self.config("metadata") or ():
if isinstance(includes, str):
includes = includes.split(",")
elif not isinstance(includes, (list, tuple)):
@@ -40,8 +39,8 @@ class E621Extractor(danbooru.DanbooruExtractor):
if not file["url"]:
md5 = file["md5"]
- file["url"] = "https://static1.{}/data/{}/{}/{}.{}".format(
- self.root[8:], md5[0:2], md5[2:4], md5, file["ext"])
+ file["url"] = (f"https://static1.{self.root[8:]}/data"
+ f"/{md5[0:2]}/{md5[2:4]}/{md5}.{file['ext']}")
if notes and post.get("has_notes"):
post["notes"] = self._get_notes(post["id"])
@@ -60,13 +59,13 @@ class E621Extractor(danbooru.DanbooruExtractor):
yield Message.Url, file["url"], post
def _get_notes(self, id):
- return self.request(
- "{}/notes.json?search[post_id]={}".format(self.root, id)).json()
+ return self.request_json(
+ f"{self.root}/notes.json?search[post_id]={id}")
@memcache(keyarg=1)
def _get_pools(self, ids):
- pools = self.request(
- "{}/pools.json?search[id]={}".format(self.root, ids)).json()
+ pools = self.request_json(
+ f"{self.root}/pools.json?search[id]={ids}")
for pool in pools:
pool["name"] = pool["name"].replace("_", " ")
return pools
@@ -75,7 +74,7 @@ class E621Extractor(danbooru.DanbooruExtractor):
BASE_PATTERN = E621Extractor.update({
"e621": {
"root": "https://e621.net",
- "pattern": r"e621\.net",
+ "pattern": r"e621\.(?:net|cc)",
},
"e926": {
"root": "https://e926.net",
@@ -109,12 +108,11 @@ class E621PoolExtractor(E621Extractor, danbooru.DanbooruPoolExtractor):
}
posts = []
- append = posts.append
for num, pid in enumerate(self.post_ids, 1):
if pid in id_to_post:
post = id_to_post[pid]
post["num"] = num
- append(post)
+ posts.append(post)
else:
self.log.warning("Post %s is unavailable", pid)
return posts
@@ -126,8 +124,8 @@ class E621PostExtractor(E621Extractor, danbooru.DanbooruPostExtractor):
example = "https://e621.net/posts/12345"
def posts(self):
- url = "{}/posts/{}.json".format(self.root, self.groups[-1])
- return (self.request(url).json()["post"],)
+ url = f"{self.root}/posts/{self.groups[-1]}.json"
+ return (self.request_json(url)["post"],)
class E621PopularExtractor(E621Extractor, danbooru.DanbooruPopularExtractor):
diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py
index 7582528..7beeac5 100644
--- a/gallery_dl/extractor/erome.py
+++ b/gallery_dl/extractor/erome.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2021-2023 Mike Fährmann
+# Copyright 2021-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -22,62 +22,20 @@ class EromeExtractor(Extractor):
filename_fmt = "{album_id} {title} {num:>02}.{extension}"
archive_fmt = "{album_id}_{num}"
root = "https://www.erome.com"
+ _cookies = True
def items(self):
- self.__cookies = True
+ base = f"{self.root}/a/"
+ data = {"_extractor": EromeAlbumExtractor}
for album_id in self.albums():
- url = "{}/a/{}".format(self.root, album_id)
-
- try:
- page = self.request(url).text
- except exception.HttpError as exc:
- self.log.warning(
- "Unable to fetch album '%s' (%s)", album_id, exc)
- continue
-
- title, pos = text.extract(
- page, 'property="og:title" content="', '"')
- pos = page.index('<div class="user-profile', pos)
- user, pos = text.extract(
- page, 'href="https://www.erome.com/', '"', pos)
- tags, pos = text.extract(
- page, '<p class="mt-10"', '</p>', pos)
-
- urls = []
- date = None
- groups = page.split('<div class="media-group"')
- for group in util.advance(groups, 1):
- url = (text.extr(group, '<source src="', '"') or
- text.extr(group, 'data-src="', '"'))
- if url:
- urls.append(url)
- if not date:
- ts = text.extr(group, '?v=', '"')
- if len(ts) > 1:
- date = text.parse_timestamp(ts)
-
- data = {
- "album_id": album_id,
- "title" : text.unescape(title),
- "user" : text.unquote(user),
- "count" : len(urls),
- "date" : date,
- "tags" : ([t.replace("+", " ")
- for t in text.extract_iter(tags, "?q=", '"')]
- if tags else ()),
- "_http_headers": {"Referer": url},
- }
-
- yield Message.Directory, data
- for data["num"], url in enumerate(urls, 1):
- yield Message.Url, url, text.nameext_from_url(url, data)
+ yield Message.Queue, f"{base}{album_id}", data
def albums(self):
return ()
def request(self, url, **kwargs):
- if self.__cookies:
- self.__cookies = False
+ if self._cookies:
+ self._cookies = False
self.cookies.update(_cookie_cache())
for _ in range(5):
@@ -106,8 +64,52 @@ class EromeAlbumExtractor(EromeExtractor):
pattern = BASE_PATTERN + r"/a/(\w+)"
example = "https://www.erome.com/a/ID"
- def albums(self):
- return (self.groups[0],)
+ def items(self):
+ album_id = self.groups[0]
+ url = f"{self.root}/a/{album_id}"
+
+ try:
+ page = self.request(url).text
+ except exception.HttpError as exc:
+ raise exception.AbortExtraction(
+ f"{album_id}: Unable to fetch album page ({exc})")
+
+ title, pos = text.extract(
+ page, 'property="og:title" content="', '"')
+ pos = page.index('<div class="user-profile', pos)
+ user, pos = text.extract(
+ page, 'href="https://www.erome.com/', '"', pos)
+ tags, pos = text.extract(
+ page, '<p class="mt-10"', '</p>', pos)
+
+ urls = []
+ date = None
+ groups = page.split('<div class="media-group"')
+ for group in util.advance(groups, 1):
+ url = (text.extr(group, '<source src="', '"') or
+ text.extr(group, 'data-src="', '"'))
+ if url:
+ urls.append(url)
+ if not date:
+ ts = text.extr(group, '?v=', '"')
+ if len(ts) > 1:
+ date = text.parse_timestamp(ts)
+
+ data = {
+ "album_id": album_id,
+ "title" : text.unescape(title),
+ "user" : text.unquote(user),
+ "count" : len(urls),
+ "date" : date,
+ "tags" : ([t.replace("+", " ")
+ for t in text.extract_iter(tags, "?q=", '"')]
+ if tags else ()),
+ "_http_headers": {"Referer": url},
+ }
+
+ yield Message.Directory, data
+ for data["num"], url in enumerate(urls, 1):
+ yield Message.Url, url, text.nameext_from_url(url, data)
class EromeUserExtractor(EromeExtractor):
@@ -116,7 +118,7 @@ class EromeUserExtractor(EromeExtractor):
example = "https://www.erome.com/USER"
def albums(self):
- url = "{}/{}".format(self.root, self.groups[0])
+ url = f"{self.root}/{self.groups[0]}"
return self._pagination(url, {})
diff --git a/gallery_dl/extractor/everia.py b/gallery_dl/extractor/everia.py
index 3bf0a74..787786e 100644
--- a/gallery_dl/extractor/everia.py
+++ b/gallery_dl/extractor/everia.py
@@ -7,8 +7,7 @@
"""Extractors for https://everia.club"""
from .common import Extractor, Message
-from .. import text
-import re
+from .. import text, util
BASE_PATTERN = r"(?:https?://)?everia\.club"
@@ -26,13 +25,13 @@ class EveriaExtractor(Extractor):
return self._pagination(self.groups[0])
def _pagination(self, path, params=None, pnum=1):
- find_posts = re.compile(r'thumbnail">\s*<a href="([^"]+)').findall
+ find_posts = util.re(r'thumbnail">\s*<a href="([^"]+)').findall
while True:
if pnum == 1:
- url = "{}{}/".format(self.root, path)
+ url = f"{self.root}{path}/"
else:
- url = "{}{}/page/{}/".format(self.root, path, pnum)
+ url = f"{self.root}{path}/page/{pnum}/"
response = self.request(url, params=params, allow_redirects=False)
if response.status_code >= 300:
@@ -50,16 +49,16 @@ class EveriaPostExtractor(EveriaExtractor):
example = "https://everia.club/0000/00/00/TITLE"
def items(self):
- url = self.root + self.groups[0]
+ url = self.root + self.groups[0] + "/"
page = self.request(url).text
content = text.extr(page, 'itemprop="text">', "<h3")
- urls = re.findall(r'img.*?src="([^"]+)', content)
+ urls = util.re(r'img.*?src="([^"]+)').findall(content)
data = {
"title": text.unescape(
text.extr(page, 'itemprop="headline">', "</h")),
"tags": list(text.extract_iter(page, 'rel="tag">', "</a>")),
- "post_url": url,
+ "post_url": text.unquote(url),
"post_category": text.extr(
page, "post-in-category-", " ").capitalize(),
"count": len(urls),
@@ -67,6 +66,7 @@ class EveriaPostExtractor(EveriaExtractor):
yield Message.Directory, data
for data["num"], url in enumerate(urls, 1):
+ url = text.unquote(url)
yield Message.Url, url, text.nameext_from_url(url, data)
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index e7ba78e..f147959 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2023 Mike Fährmann
+# Copyright 2014-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -34,7 +34,7 @@ class ExhentaiExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.version = match.group(1)
+ self.version = match[1]
def initialize(self):
domain = self.config("domain", "auto")
@@ -59,7 +59,7 @@ class ExhentaiExtractor(Extractor):
def login(self):
"""Login and set necessary cookies"""
if self.LIMIT:
- raise exception.StopExtraction("Image limit reached!")
+ raise exception.AbortExtraction("Image limit reached!")
if self.cookies_check(self.cookies_names):
return
@@ -122,10 +122,10 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
def __init__(self, match):
ExhentaiExtractor.__init__(self, match)
- self.gallery_id = text.parse_int(match.group(2) or match.group(5))
- self.gallery_token = match.group(3)
- self.image_token = match.group(4)
- self.image_num = text.parse_int(match.group(6), 1)
+ self.gallery_id = text.parse_int(match[2] or match[5])
+ self.gallery_token = match[3]
+ self.image_token = match[4]
+ self.image_num = text.parse_int(match[6], 1)
self.key_start = None
self.key_show = None
self.key_next = None
@@ -136,11 +136,13 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
source = self.config("source")
if source == "hitomi":
self.items = self._items_hitomi
+ elif source == "metadata":
+ self.items = self._items_metadata
limits = self.config("limits", False)
if limits and limits.__class__ is int:
self.limits = limits
- self._remaining = 0
+ self._limits_remaining = 0
else:
self.limits = False
@@ -176,7 +178,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
self.image_token = text.extr(gpage, 'hentai.org/s/', '"')
if not self.image_token:
self.log.debug("Page content:\n%s", gpage)
- raise exception.StopExtraction(
+ raise exception.AbortExtraction(
"Failed to extract initial image token")
ipage = self._image_page()
else:
@@ -184,7 +186,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
part = text.extr(ipage, 'hentai.org/g/', '"')
if not part:
self.log.debug("Page content:\n%s", ipage)
- raise exception.StopExtraction(
+ raise exception.AbortExtraction(
"Failed to extract gallery token")
self.gallery_token = part.split("/")[1]
gpage = self._gallery_page()
@@ -198,11 +200,12 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
for url, image in images:
data.update(image)
if self.limits:
- self._check_limits(data)
+ self._limits_check(data)
if "/fullimg" in url:
data["_http_validate"] = self._validate_response
else:
data["_http_validate"] = None
+ data["_http_signature"] = self._validate_signature
yield Message.Url, url, data
fav = self.config("fav")
@@ -218,10 +221,13 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
data = {}
from .hitomi import HitomiGalleryExtractor
- url = "https://hitomi.la/galleries/{}.html".format(self.gallery_id)
+ url = f"https://hitomi.la/galleries/{self.gallery_id}.html"
data["_extractor"] = HitomiGalleryExtractor
yield Message.Queue, url, data
+ def _items_metadata(self):
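+        # emit gallery metadata from the API without downloading any files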
+ yield Message.Directory, self.metadata_from_api()
+
def get_metadata(self, page):
"""Extract gallery metadata"""
data = self.metadata_from_page(page)
@@ -240,8 +246,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
def metadata_from_page(self, page):
extr = text.extract_from(page)
- api_url = extr('var api_url = "', '"')
- if api_url:
+ if api_url := extr('var api_url = "', '"'):
self.api_url = api_url
data = {
@@ -293,9 +298,9 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
"namespace": 1,
}
- data = self.request(self.api_url, method="POST", json=data).json()
+ data = self.request_json(self.api_url, method="POST", json=data)
if "error" in data:
- raise exception.StopExtraction(data["error"])
+ raise exception.AbortExtraction(data["error"])
return data["gmetadata"][0]
@@ -320,8 +325,8 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
data["_fallback"] = self._fallback_1280(nl, self.image_num)
except IndexError:
self.log.debug("Page content:\n%s", page)
- raise exception.StopExtraction(
- "Unable to parse image info for '%s'", url)
+ raise exception.AbortExtraction(
+ f"Unable to parse image info for '{url}'")
data["num"] = self.image_num
data["image_token"] = self.key_start = extr('var startkey="', '";')
@@ -345,7 +350,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
}
for request["page"] in range(self.image_num + 1, self.count + 1):
- page = self.request(api_url, method="POST", json=request).json()
+ page = self.request_json(api_url, method="POST", json=request)
i3 = page["i3"]
i6 = page["i6"]
@@ -371,8 +376,8 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
nl, request["page"], imgkey)
except IndexError:
self.log.debug("Page content:\n%s", page)
- raise exception.StopExtraction(
- "Unable to parse image info for '%s'", url)
+ raise exception.AbortExtraction(
+ f"Unable to parse image info for '{url}'")
data["num"] = request["page"]
data["image_token"] = imgkey
@@ -385,66 +390,106 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
request["imgkey"] = nextkey
def _validate_response(self, response):
- if not response.history and response.headers.get(
+ if response.history or not response.headers.get(
"content-type", "").startswith("text/html"):
- page = response.text
- self.log.warning("'%s'", page)
-
- if " requires GP" in page:
- gp = self.config("gp")
- if gp == "stop":
- raise exception.StopExtraction("Not enough GP")
- elif gp == "wait":
- input("Press ENTER to continue.")
- return response.url
-
- self.log.info("Falling back to non-original downloads")
- self.original = False
- return self.data["_url_1280"]
-
- if " temporarily banned " in page:
- raise exception.AuthorizationError("Temporarily Banned")
-
- self._report_limits()
- return True
-
- def _report_limits(self):
- ExhentaiExtractor.LIMIT = True
- raise exception.StopExtraction("Image limit reached!")
-
- def _check_limits(self, data):
- if not self._remaining or data["num"] % 25 == 0:
- self._update_limits()
- self._remaining -= data["cost"]
- if self._remaining <= 0:
- self._report_limits()
-
- def _check_509(self, url):
- # full 509.gif URLs
- # - https://exhentai.org/img/509.gif
- # - https://ehgt.org/g/509.gif
- if url.endswith(("hentai.org/img/509.gif",
- "ehgt.org/g/509.gif")):
- self.log.debug(url)
- self._report_limits()
+ return True
- def _update_limits(self):
+ page = response.text
+ self.log.warning("'%s'", page)
+
+ if " requires GP" in page:
+ gp = self.config("gp")
+ if gp == "stop":
+ raise exception.AbortExtraction("Not enough GP")
+ elif gp == "wait":
+ self.input("Press ENTER to continue.")
+ return response.url
+
+ self.log.info("Falling back to non-original downloads")
+ self.original = False
+ return self.data["_url_1280"]
+
+ if " temporarily banned " in page:
+ raise exception.AuthorizationError("Temporarily Banned")
+
+ self._limits_exceeded()
+ return response.url
+
+ def _validate_signature(self, signature):
+ """Return False if all file signature bytes are zero"""
+ if signature:
+ if byte := signature[0]:
+ # 60 == b"<"
+ if byte == 60 and b"<!doctype html".startswith(
+ signature[:14].lower()):
+ return "HTML response"
+ return True
+ for byte in signature:
+ if byte:
+ return True
+ return False
+
+ def _request_home(self, **kwargs):
url = "https://e-hentai.org/home.php"
- cookies = {
+ kwargs["cookies"] = {
cookie.name: cookie.value
for cookie in self.cookies
if cookie.domain == self.cookies_domain and
cookie.name != "igneous"
}
+ page = self.request(url, **kwargs).text
- page = self.request(url, cookies=cookies).text
+ # update image limits
current = text.extr(page, "<strong>", "</strong>").replace(",", "")
self.log.debug("Image Limits: %s/%s", current, self.limits)
- self._remaining = self.limits - text.parse_int(current)
+ self._limits_remaining = self.limits - text.parse_int(current)
+
+ return page
+
+ def _check_509(self, url):
+ # full 509.gif URLs
+ # - https://exhentai.org/img/509.gif
+ # - https://ehgt.org/g/509.gif
+ if url.endswith(("hentai.org/img/509.gif",
+ "ehgt.org/g/509.gif")):
+ self.log.debug(url)
+ self._limits_exceeded()
+
+ def _limits_exceeded(self):
+ msg = "Image limit exceeded!"
+ action = self.config("limits-action")
+
+ if not action or action == "stop":
+ ExhentaiExtractor.LIMIT = True
+ raise exception.AbortExtraction(msg)
+
+ self.log.warning(msg)
+ if action == "wait":
+ self.input("Press ENTER to continue.")
+ self._limits_update()
+ elif action == "reset":
+ self._limits_reset()
+ else:
+ self.log.error("Invalid 'limits-action' value '%s'", action)
+
+ def _limits_check(self, data):
+ if not self._limits_remaining or data["num"] % 25 == 0:
+ self._limits_update()
+ self._limits_remaining -= data["cost"]
+ if self._limits_remaining <= 0:
+ self._limits_exceeded()
+
+ def _limits_reset(self):
+ self.log.info("Resetting image limits")
+ self._request_home(
+ method="POST",
+ headers={"Content-Type": "application/x-www-form-urlencoded"},
+ data=b"reset_imagelimit=Reset+Quota")
+
+ _limits_update = _request_home
def _gallery_page(self):
- url = "{}/g/{}/{}/".format(
- self.root, self.gallery_id, self.gallery_token)
+ url = f"{self.root}/g/{self.gallery_id}/{self.gallery_token}/"
response = self.request(url, fatal=False)
page = response.text
@@ -457,8 +502,8 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
return page
def _image_page(self):
- url = "{}/s/{}/{}-{}".format(
- self.root, self.image_token, self.gallery_id, self.image_num)
+ url = (f"{self.root}/s/{self.image_token}"
+ f"/{self.gallery_id}-{self.image_num}")
page = self.request(url, fatal=False).text
if page.startswith(("Invalid page", "Keep trying")):
@@ -466,7 +511,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
return page
def _fallback_original(self, nl, fullimg):
- url = "{}?nl={}".format(fullimg, nl)
+ url = f"{fullimg}?nl={nl}"
for _ in util.repeat(self.fallback_retries):
yield url
@@ -475,8 +520,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
token = self.key_start
for _ in util.repeat(self.fallback_retries):
- url = "{}/s/{}/{}-{}?nl={}".format(
- self.root, token, self.gallery_id, num, nl)
+ url = f"{self.root}/s/{token}/{self.gallery_id}-{num}?nl={nl}"
page = self.request(url, fatal=False).text
if page.startswith(("Invalid page", "Keep trying")):
@@ -486,8 +530,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
nl = data["_nl"]
- @staticmethod
- def _parse_image_info(url):
+ def _parse_image_info(self, url):
for part in url.split("/")[4:]:
try:
_, size, width, height, _ = part.split("-")
@@ -504,8 +547,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
"height": text.parse_int(height),
}
- @staticmethod
- def _parse_original_info(info):
+ def _parse_original_info(self, info):
parts = info.lstrip().split(" ")
size = text.parse_bytes(parts[3] + parts[4][0])
@@ -527,11 +569,11 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):
def __init__(self, match):
ExhentaiExtractor.__init__(self, match)
- _, query, tag = match.groups()
+ _, query, tag = self.groups
if tag:
if "+" in tag:
ns, _, tag = tag.rpartition(":")
- tag = '{}:"{}$"'.format(ns, tag.replace("+", " "))
+ tag = f"{ns}:\"{tag.replace('+', ' ')}$\""
else:
tag += "$"
self.params = {"f_search": tag, "page": 0}
@@ -553,13 +595,13 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):
last = None
page = self.request(search_url, params=params).text
- for gallery in ExhentaiGalleryExtractor.pattern.finditer(page):
- url = gallery.group(0)
+ for match in ExhentaiGalleryExtractor.pattern.finditer(page):
+ url = match[0]
if url == last:
continue
last = url
- data["gallery_id"] = text.parse_int(gallery.group(2))
- data["gallery_token"] = gallery.group(3)
+ data["gallery_id"] = text.parse_int(match[2])
+ data["gallery_token"] = match[3]
yield Message.Queue, url + "/", data
next_url = text.extr(page, 'nexturl="', '"', None)
diff --git a/gallery_dl/extractor/facebook.py b/gallery_dl/extractor/facebook.py
index b284ee8..069ed99 100644
--- a/gallery_dl/extractor/facebook.py
+++ b/gallery_dl/extractor/facebook.py
@@ -6,10 +6,14 @@
"""Extractors for https://www.facebook.com/"""
-from .common import Extractor, Message
+from .common import Extractor, Message, Dispatch
from .. import text, exception
+from ..cache import memcache
BASE_PATTERN = r"(?:https?://)?(?:[\w-]+\.)?facebook\.com"
+USER_PATTERN = (BASE_PATTERN +
+ r"/(?!media/|photo/|photo.php|watch/)"
+ r"(?:profile\.php\?id=|people/[^/?#]+/)?([^/?&#]+)")
class FacebookExtractor(Extractor):
@@ -20,9 +24,6 @@ class FacebookExtractor(Extractor):
filename_fmt = "{id}.{extension}"
archive_fmt = "{id}.{extension}"
- set_url_fmt = root + "/media/set/?set={set_id}"
- photo_url_fmt = root + "/photo/?fbid={photo_id}&set={set_id}"
-
def _init(self):
headers = self.session.headers
headers["Accept"] = (
@@ -37,22 +38,20 @@ class FacebookExtractor(Extractor):
self.videos = self.config("videos", True)
self.author_followups = self.config("author-followups", False)
- @staticmethod
- def decode_all(txt):
+ def decode_all(self, txt):
return text.unescape(
txt.encode().decode("unicode_escape")
.encode("utf_16", "surrogatepass").decode("utf_16")
).replace("\\/", "/")
- @staticmethod
- def parse_set_page(set_page):
+ def parse_set_page(self, set_page):
directory = {
"set_id": text.extr(
set_page, '"mediaSetToken":"', '"'
) or text.extr(
set_page, '"mediasetToken":"', '"'
),
- "username": FacebookExtractor.decode_all(
+ "username": self.decode_all(
text.extr(
set_page, '"user":{"__isProfile":"User","name":"', '","'
) or text.extr(
@@ -62,7 +61,7 @@ class FacebookExtractor(Extractor):
"user_id": text.extr(
set_page, '"owner":{"__typename":"User","id":"', '"'
),
- "title": FacebookExtractor.decode_all(text.extr(
+ "title": self.decode_all(text.extr(
set_page, '"title":{"text":"', '"'
)),
"first_photo_id": text.extr(
@@ -77,8 +76,7 @@ class FacebookExtractor(Extractor):
return directory
- @staticmethod
- def parse_photo_page(photo_page):
+ def parse_photo_page(self, photo_page):
photo = {
"id": text.extr(
photo_page, '"__isNode":"Photo","id":"', '"'
@@ -88,13 +86,13 @@ class FacebookExtractor(Extractor):
'"url":"https:\\/\\/www.facebook.com\\/photo\\/?fbid=',
'"'
).rsplit("&set=", 1)[-1],
- "username": FacebookExtractor.decode_all(text.extr(
+ "username": self.decode_all(text.extr(
photo_page, '"owner":{"__typename":"User","name":"', '"'
)),
"user_id": text.extr(
photo_page, '"owner":{"__typename":"User","id":"', '"'
),
- "caption": FacebookExtractor.decode_all(text.extr(
+ "caption": self.decode_all(text.extr(
photo_page,
'"message":{"delight_ranges"',
'"},"message_preferred_body"'
@@ -103,7 +101,7 @@ class FacebookExtractor(Extractor):
text.extr(photo_page, '\\"publish_time\\":', ',') or
text.extr(photo_page, '"created_time":', ',')
),
- "url": FacebookExtractor.decode_all(text.extr(
+ "url": self.decode_all(text.extr(
photo_page, ',"image":{"uri":"', '","'
)),
"next_photo_id": text.extr(
@@ -133,8 +131,7 @@ class FacebookExtractor(Extractor):
return photo
- @staticmethod
- def parse_post_page(post_page):
+ def parse_post_page(self, post_page):
first_photo_url = text.extr(
text.extr(
post_page, '"__isMedia":"Photo"', '"target_group"'
@@ -148,13 +145,12 @@ class FacebookExtractor(Extractor):
return post
- @staticmethod
- def parse_video_page(video_page):
+ def parse_video_page(self, video_page):
video = {
"id": text.extr(
video_page, '\\"video_id\\":\\"', '\\"'
),
- "username": FacebookExtractor.decode_all(text.extr(
+ "username": self.decode_all(text.extr(
video_page, '"actors":[{"__typename":"User","name":"', '","'
)),
"user_id": text.extr(
@@ -167,7 +163,7 @@ class FacebookExtractor(Extractor):
}
if not video["username"]:
- video["username"] = FacebookExtractor.decode_all(text.extr(
+ video["username"] = self.decode_all(text.extr(
video_page,
'"__typename":"User","id":"' + video["user_id"] + '","name":"',
'","'
@@ -179,7 +175,7 @@ class FacebookExtractor(Extractor):
audio = {
**video,
- "url": FacebookExtractor.decode_all(text.extr(
+ "url": self.decode_all(text.extr(
text.extr(
first_video_raw,
"AudioChannelConfiguration",
@@ -196,7 +192,7 @@ class FacebookExtractor(Extractor):
first_video_raw, 'FBQualityLabel=\\"', '\\u003C\\/BaseURL>'
):
resolution = raw_url.split('\\"', 1)[0]
- video["urls"][resolution] = FacebookExtractor.decode_all(
+ video["urls"][resolution] = self.decode_all(
raw_url.split('BaseURL>', 1)[1]
)
@@ -224,17 +220,16 @@ class FacebookExtractor(Extractor):
res = self.request(url, **kwargs)
if res.url.startswith(self.root + "/login"):
- raise exception.AuthenticationError(
- "You must be logged in to continue viewing images." +
- LEFT_OFF_TXT
+ raise exception.AuthRequired(
+ message=(f"You must be logged in to continue viewing images."
+ f"{LEFT_OFF_TXT}")
)
if b'{"__dr":"CometErrorRoot.react"}' in res.content:
- raise exception.StopExtraction(
- "You've been temporarily blocked from viewing images. "
- "\nPlease try using a different account, "
- "using a VPN or waiting before you retry." +
- LEFT_OFF_TXT
+ raise exception.AbortExtraction(
+ f"You've been temporarily blocked from viewing images.\n"
+ f"Please try using a different account, "
+ f"using a VPN or waiting before you retry.{LEFT_OFF_TXT}"
)
return res
@@ -248,9 +243,7 @@ class FacebookExtractor(Extractor):
while i < len(all_photo_ids):
photo_id = all_photo_ids[i]
- photo_url = self.photo_url_fmt.format(
- photo_id=photo_id, set_id=set_id
- )
+ photo_url = f"{self.root}/photo/?fbid={photo_id}&set={set_id}"
photo_page = self.photo_page_request_wrapper(photo_url).text
photo = self.parse_photo_page(photo_page)
@@ -302,6 +295,36 @@ class FacebookExtractor(Extractor):
i += 1
+ @memcache(keyarg=1)
+ def _extract_profile_photos_page(self, profile):
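+        # memcached per profile, so the 'photos' and 'avatar' extractors
+        # share a single request to the 'photos_by' page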
+ profile_photos_url = f"{self.root}/{profile}/photos_by"
+
+ for _ in range(self.fallback_retries + 1):
+ profile_photos_page = self.request(profile_photos_url).text
+ if set_id := self._extract_profile_set_id(profile_photos_page):
+ break
+ self.log.debug("Got empty profile photos page, retrying...")
+ else:
+ raise exception.AbortExtraction("Failed to extract profile data")
+
+ avatar_page_url = text.extr(
+ profile_photos_page, ',"profilePhoto":{"url":"', '"')
+
+ return set_id, avatar_page_url.replace("\\/", "/")
+
+ def _extract_profile_set_id(self, profile_photos_page):
+ set_ids_raw = text.extr(
+ profile_photos_page, '"pageItems"', '"page_info"'
+ )
+
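+        # the set ID appears either as a 'set=' URL parameter
+        # or within an escaped '\/photos\/<id>\/' path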
+ set_id = text.extr(
+ set_ids_raw, 'set=', '"'
+ ).rsplit("&", 1)[0] or text.extr(
+ set_ids_raw, '\\/photos\\/', '\\/'
+ )
+
+ return set_id
+
class FacebookSetExtractor(FacebookExtractor):
"""Base class for Facebook Set extractors"""
@@ -317,13 +340,12 @@ class FacebookSetExtractor(FacebookExtractor):
def items(self):
set_id = self.groups[0] or self.groups[3]
- path = self.groups[1]
- if path:
+ if path := self.groups[1]:
post_url = self.root + "/" + path
post_page = self.request(post_url).text
set_id = self.parse_post_page(post_page)["set_id"]
- set_url = self.set_url_fmt.format(set_id=set_id)
+ set_url = f"{self.root}/media/set/?set={set_id}"
set_page = self.request(set_url).text
set_data = self.parse_set_page(set_page)
if self.groups[2]:
@@ -342,16 +364,15 @@ class FacebookPhotoExtractor(FacebookExtractor):
def items(self):
photo_id = self.groups[0]
- photo_url = self.photo_url_fmt.format(photo_id=photo_id, set_id="")
+ photo_url = f"{self.root}/photo/?fbid={photo_id}&set="
photo_page = self.photo_page_request_wrapper(photo_url).text
i = 1
photo = self.parse_photo_page(photo_page)
photo["num"] = i
- set_page = self.request(
- self.set_url_fmt.format(set_id=photo["set_id"])
- ).text
+ set_url = f"{self.root}/media/set/?set={photo['set_id']}"
+ set_page = self.request(set_url).text
directory = self.parse_set_page(set_page)
@@ -362,9 +383,7 @@ class FacebookPhotoExtractor(FacebookExtractor):
for comment_photo_id in photo["followups_ids"]:
comment_photo = self.parse_photo_page(
self.photo_page_request_wrapper(
- self.photo_url_fmt.format(
- photo_id=comment_photo_id, set_id=""
- )
+ f"{self.root}/photo/?fbid={comment_photo_id}&set="
).text
)
i += 1
@@ -399,44 +418,50 @@ class FacebookVideoExtractor(FacebookExtractor):
yield Message.Url, audio["url"], audio
-class FacebookProfileExtractor(FacebookExtractor):
- """Base class for Facebook Profile Photos Set extractors"""
- subcategory = "profile"
- pattern = (
- BASE_PATTERN +
- r"/(?!media/|photo/|photo.php|watch/)"
- r"(?:profile\.php\?id=|people/[^/?#]+/)?"
- r"([^/?&#]+)(?:/photos(?:_by)?|/videos|/posts)?/?(?:$|\?|#)"
- )
- example = "https://www.facebook.com/USERNAME"
+class FacebookPhotosExtractor(FacebookExtractor):
+ """Extractor for Facebook Profile Photos"""
+ subcategory = "photos"
+ pattern = USER_PATTERN + r"/photos(?:_by)?"
+ example = "https://www.facebook.com/USERNAME/photos"
- @staticmethod
- def get_profile_photos_set_id(profile_photos_page):
- set_ids_raw = text.extr(
- profile_photos_page, '"pageItems"', '"page_info"'
- )
+ def items(self):
+ set_id = self._extract_profile_photos_page(self.groups[0])[0]
+ set_url = f"{self.root}/media/set/?set={set_id}"
+ set_page = self.request(set_url).text
+ set_data = self.parse_set_page(set_page)
+ return self.extract_set(set_data)
- set_id = text.extr(
- set_ids_raw, 'set=', '"'
- ).rsplit("&", 1)[0] or text.extr(
- set_ids_raw, '\\/photos\\/', '\\/'
- )
- return set_id
+class FacebookAvatarExtractor(FacebookExtractor):
+ """Extractor for Facebook Profile Avatars"""
+ subcategory = "avatar"
+ pattern = USER_PATTERN + r"/avatar"
+ example = "https://www.facebook.com/USERNAME/avatar"
def items(self):
- profile_photos_url = (
- self.root + "/" + self.groups[0] + "/photos_by"
- )
- profile_photos_page = self.request(profile_photos_url).text
+ avatar_page_url = self._extract_profile_photos_page(self.groups[0])[1]
+ avatar_page = self.photo_page_request_wrapper(avatar_page_url).text
- set_id = self.get_profile_photos_set_id(profile_photos_page)
+ avatar = self.parse_photo_page(avatar_page)
+ avatar["count"] = avatar["num"] = 1
+ avatar["type"] = "avatar"
- if set_id:
- set_url = self.set_url_fmt.format(set_id=set_id)
- set_page = self.request(set_url).text
- set_data = self.parse_set_page(set_page)
- return self.extract_set(set_data)
+ set_url = f"{self.root}/media/set/?set={avatar['set_id']}"
+ set_page = self.request(set_url).text
+ directory = self.parse_set_page(set_page)
- self.log.debug("Profile photos set ID not found.")
- return iter(())
+ yield Message.Directory, directory
+ yield Message.Url, avatar["url"], avatar
+
+
+class FacebookUserExtractor(Dispatch, FacebookExtractor):
+ """Extractor for Facebook Profiles"""
+ pattern = USER_PATTERN + r"/?(?:$|\?|#)"
+ example = "https://www.facebook.com/USERNAME"
+
+ def items(self):
+ base = f"{self.root}/{self.groups[0]}/"
+ return self._dispatch_extractors((
+ (FacebookAvatarExtractor, base + "avatar"),
+ (FacebookPhotosExtractor, base + "photos"),
+ ), ("photos",))
diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py
index 8981c29..70b06e7 100644
--- a/gallery_dl/extractor/fanbox.py
+++ b/gallery_dl/extractor/fanbox.py
@@ -7,9 +7,8 @@
"""Extractors for https://www.fanbox.cc/"""
from .common import Extractor, Message
-from .. import text
+from .. import text, util
from ..cache import memcache
-import re
BASE_PATTERN = r"(?:https?://)?(?:www\.)?fanbox\.cc"
USER_PATTERN = (
@@ -41,8 +40,7 @@ class FanboxExtractor(Extractor):
}
self.embeds = self.config("embeds", True)
- includes = self.config("metadata")
- if includes:
+ if includes := self.config("metadata"):
if isinstance(includes, str):
includes = includes.split(",")
elif not isinstance(includes, (list, tuple)):
@@ -62,7 +60,23 @@ class FanboxExtractor(Extractor):
FanboxExtractor._warning = False
def items(self):
- for content_body, post in self.posts():
+ fee_max = self.config("fee-max")
+
+ for item in self.posts():
+ if fee_max is not None and fee_max < item["feeRequired"]:
+ self.log.warning("Skipping post %s (feeRequired of %s > %s)",
+ item["id"], item["feeRequired"], fee_max)
+ continue
+
+ try:
+ url = "https://api.fanbox.cc/post.info?postId=" + item["id"]
+ body = self.request_json(url, headers=self.headers)["body"]
+ content_body, post = self._extract_post(body)
+ except Exception as exc:
+ self.log.warning("Skipping post %s (%s: %s)",
+ item["id"], exc.__class__.__name__, exc)
+ continue
+
yield Message.Directory, post
yield from self._get_urls_from_post(content_body, post)
@@ -72,22 +86,17 @@ class FanboxExtractor(Extractor):
def _pagination(self, url):
while url:
url = text.ensure_http_scheme(url)
- body = self.request(url, headers=self.headers).json()["body"]
- for item in body["items"]:
- try:
- yield self._get_post_data(item["id"])
- except Exception as exc:
- self.log.warning("Skipping post %s (%s: %s)",
- item["id"], exc.__class__.__name__, exc)
+ body = self.request_json(url, headers=self.headers)["body"]
+
+ yield from body["items"]
+
url = body["nextUrl"]
- def _get_post_data(self, post_id):
+ def _extract_post(self, post):
"""Fetch and process post data"""
- url = "https://api.fanbox.cc/post.info?postId="+post_id
- post = self.request(url, headers=self.headers).json()["body"]
+ post["archives"] = ()
- content_body = post.pop("body", None)
- if content_body:
+ if content_body := post.pop("body", None):
if "html" in content_body:
post["html"] = content_body["html"]
if post["type"] == "article":
@@ -95,29 +104,30 @@ class FanboxExtractor(Extractor):
if "blocks" in content_body:
content = [] # text content
images = [] # image IDs in 'body' order
+ files = [] # file IDs in 'body' order
- append = content.append
- append_img = images.append
for block in content_body["blocks"]:
if "text" in block:
- append(block["text"])
+ content.append(block["text"])
if "links" in block:
for link in block["links"]:
- append(link["url"])
+ content.append(link["url"])
if "imageId" in block:
- append_img(block["imageId"])
-
- if images and "imageMap" in content_body:
- # reorder 'imageMap' (#2718)
- image_map = content_body["imageMap"]
- content_body["imageMap"] = {
- image_id: image_map[image_id]
- for image_id in images
- if image_id in image_map
- }
+ images.append(block["imageId"])
+ if "fileId" in block:
+ files.append(block["fileId"])
post["content"] = "\n".join(content)
+ self._sort_map(content_body, "imageMap", images)
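+            # collect attachments with archive extensions into 'archives'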
+ if file_map := self._sort_map(content_body, "fileMap", files):
+ exts = util.EXTS_ARCHIVE
+ post["archives"] = [
+ file
+ for file in file_map.values()
+ if file.get("extension", "").lower() in exts
+ ]
+
post["date"] = text.parse_datetime(post["publishedDatetime"])
post["text"] = content_body.get("text") if content_body else None
post["isCoverImage"] = False
@@ -130,8 +140,7 @@ class FanboxExtractor(Extractor):
try:
post["plan"] = plans[fee]
except KeyError:
- fees = [f for f in plans if f >= fee]
- if fees:
+ if fees := [f for f in plans if f >= fee]:
plan = plans[min(fees)]
else:
plan = plans[0].copy()
@@ -139,17 +148,30 @@ class FanboxExtractor(Extractor):
post["plan"] = plans[fee] = plan
if self._meta_comments:
if post["commentCount"]:
- post["comments"] = list(self._get_comment_data(post_id))
+ post["comments"] = list(self._get_comment_data(post["id"]))
else:
post["commentd"] = ()
return content_body, post
+ def _sort_map(self, body, key, ids):
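+        # reorder map entries to match their order in 'body' (#2718)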
+ orig = body.get(key)
+ if not orig:
+ return {} if orig is None else orig
+
+ body[key] = new = {
+ id: orig[id]
+ for id in ids
+ if id in orig
+ }
+
+ return new
+
@memcache(keyarg=1)
def _get_user_data(self, creator_id):
url = "https://api.fanbox.cc/creator.get"
params = {"creatorId": creator_id}
- data = self.request(url, params=params, headers=self.headers).json()
+ data = self.request_json(url, params=params, headers=self.headers)
user = data["body"]
user.update(user.pop("user"))
@@ -160,7 +182,7 @@ class FanboxExtractor(Extractor):
def _get_plan_data(self, creator_id):
url = "https://api.fanbox.cc/plan.listCreator"
params = {"creatorId": creator_id}
- data = self.request(url, params=params, headers=self.headers).json()
+ data = self.request_json(url, params=params, headers=self.headers)
plans = {0: {
"id" : "",
@@ -185,7 +207,7 @@ class FanboxExtractor(Extractor):
comments = []
while url:
url = text.ensure_http_scheme(url)
- body = self.request(url, headers=self.headers).json()["body"]
+ body = self.request_json(url, headers=self.headers)["body"]
data = body["commentList"]
comments.extend(data["items"])
url = data["nextUrl"]
@@ -193,9 +215,8 @@ class FanboxExtractor(Extractor):
def _get_urls_from_post(self, content_body, post):
num = 0
- cover_image = post.get("coverImageUrl")
- if cover_image:
- cover_image = re.sub("/c/[0-9a-z_]+", "", cover_image)
+ if cover_image := post.get("coverImageUrl"):
+ cover_image = util.re("/c/[0-9a-z_]+").sub("", cover_image)
final_post = post.copy()
final_post["isCoverImage"] = True
final_post["fileUrl"] = cover_image
@@ -313,10 +334,10 @@ class FanboxExtractor(Extractor):
elif provider == "twitter":
url = "https://twitter.com/_/status/"+content_id
elif provider == "google_forms":
- templ = "https://docs.google.com/forms/d/e/{}/viewform?usp=sf_link"
- url = templ.format(content_id)
+ url = (f"https://docs.google.com/forms/d/e/"
+ f"{content_id}/viewform?usp=sf_link")
else:
- self.log.warning("service not recognized: {}".format(provider))
+ self.log.warning(f"service not recognized: {provider}")
if url:
final_post["embed"] = embed
@@ -334,25 +355,16 @@ class FanboxCreatorExtractor(FanboxExtractor):
pattern = USER_PATTERN + r"(?:/posts)?/?$"
example = "https://USER.fanbox.cc/"
- def __init__(self, match):
- FanboxExtractor.__init__(self, match)
- self.creator_id = match.group(1) or match.group(2)
-
def posts(self):
url = "https://api.fanbox.cc/post.paginateCreator?creatorId="
- return self._pagination_creator(url + self.creator_id)
+ creator_id = self.groups[0] or self.groups[1]
+ return self._pagination_creator(url + creator_id)
def _pagination_creator(self, url):
- urls = self.request(url, headers=self.headers).json()["body"]
+ urls = self.request_json(url, headers=self.headers)["body"]
for url in urls:
url = text.ensure_http_scheme(url)
- body = self.request(url, headers=self.headers).json()["body"]
- for item in body:
- try:
- yield self._get_post_data(item["id"])
- except Exception as exc:
- self.log.warning("Skipping post %s (%s: %s)",
- item["id"], exc.__class__.__name__, exc)
+ yield from self.request_json(url, headers=self.headers)["body"]
class FanboxPostExtractor(FanboxExtractor):
@@ -361,12 +373,8 @@ class FanboxPostExtractor(FanboxExtractor):
pattern = USER_PATTERN + r"/posts/(\d+)"
example = "https://USER.fanbox.cc/posts/12345"
- def __init__(self, match):
- FanboxExtractor.__init__(self, match)
- self.post_id = match.group(3)
-
def posts(self):
- return (self._get_post_data(self.post_id),)
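+        # minimal post stub; items() fetches the full data via 'post.info'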
+ return ({"id": self.groups[2], "feeRequired": 0},)
class FanboxHomeExtractor(FanboxExtractor):
diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py
index 6218f19..e32a86b 100644
--- a/gallery_dl/extractor/fantia.py
+++ b/gallery_dl/extractor/fantia.py
@@ -93,7 +93,7 @@ class FantiaExtractor(Extractor):
def _get_post_data(self, post_id):
"""Fetch and process post data"""
url = self.root+"/api/v1/posts/"+post_id
- resp = self.request(url, headers=self.headers).json()["post"]
+ resp = self.request_json(url, headers=self.headers)["post"]
return {
"post_id": resp["id"],
"post_url": self.root + "/posts/" + str(resp["id"]),
@@ -181,10 +181,10 @@ class FantiaCreatorExtractor(FantiaExtractor):
def __init__(self, match):
FantiaExtractor.__init__(self, match)
- self.creator_id = match.group(1)
+ self.creator_id = match[1]
def posts(self):
- url = "{}/fanclubs/{}/posts".format(self.root, self.creator_id)
+ url = f"{self.root}/fanclubs/{self.creator_id}/posts"
return self._pagination(url)
@@ -196,7 +196,7 @@ class FantiaPostExtractor(FantiaExtractor):
def __init__(self, match):
FantiaExtractor.__init__(self, match)
- self.post_id = match.group(1)
+ self.post_id = match[1]
def posts(self):
self._csrf_token()
diff --git a/gallery_dl/extractor/fapachi.py b/gallery_dl/extractor/fapachi.py
index 43627e2..7ff71b0 100644
--- a/gallery_dl/extractor/fapachi.py
+++ b/gallery_dl/extractor/fapachi.py
@@ -31,8 +31,7 @@ class FapachiPostExtractor(Extractor):
"user": self.user,
"id" : self.id,
}
- page = self.request("{}/{}/media/{}".format(
- self.root, self.user, self.id)).text
+ page = self.request(f"{self.root}/{self.user}/media/{self.id}").text
url = self.root + text.extract(
page, 'data-src="', '"', page.index('class="media-img'))[0]
yield Message.Directory, data
@@ -50,17 +49,16 @@ class FapachiUserExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.user = match.group(1)
- self.num = text.parse_int(match.group(2), 1)
+ self.user = match[1]
+ self.num = text.parse_int(match[2], 1)
def items(self):
data = {"_extractor": FapachiPostExtractor}
while True:
- page = self.request("{}/{}/page/{}".format(
- self.root, self.user, self.num)).text
+ url = f"{self.root}/{self.user}/page/{self.num}"
+ page = self.request(url).text
for post in text.extract_iter(page, 'model-media-prew">', ">"):
- path = text.extr(post, '<a href="', '"')
- if path:
+ if path := text.extr(post, '<a href="', '"'):
yield Message.Queue, self.root + path, data
if '">Next page</a>' not in page:
diff --git a/gallery_dl/extractor/fapello.py b/gallery_dl/extractor/fapello.py
index cf18edc..b961cbe 100644
--- a/gallery_dl/extractor/fapello.py
+++ b/gallery_dl/extractor/fapello.py
@@ -25,11 +25,11 @@ class FapelloPostExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.root = text.root_from_url(match.group(0))
+ self.root = text.root_from_url(match[0])
self.model, self.id = match.groups()
def items(self):
- url = "{}/{}/{}/".format(self.root, self.model, self.id)
+ url = f"{self.root}/{self.model}/{self.id}/"
page = text.extr(
self.request(url, allow_redirects=False).text,
'class="uk-align-center"', "</div>", None)
@@ -59,15 +59,14 @@ class FapelloModelExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.root = text.root_from_url(match.group(0))
- self.model = match.group(1)
+ self.root = text.root_from_url(match[0])
+ self.model = match[1]
def items(self):
num = 1
data = {"_extractor": FapelloPostExtractor}
while True:
- url = "{}/ajax/model/{}/page-{}/".format(
- self.root, self.model, num)
+ url = f"{self.root}/ajax/model/{self.model}/page-{num}/"
page = self.request(url).text
if not page:
return
@@ -93,8 +92,8 @@ class FapelloPathExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.root = text.root_from_url(match.group(0))
- self.path = match.group(1)
+ self.root = text.root_from_url(match[0])
+ self.path = match[1]
def items(self):
num = 1
@@ -109,8 +108,8 @@ class FapelloPathExtractor(Extractor):
data = {"_extractor": FapelloModelExtractor}
while True:
- page = self.request("{}/ajax/{}/page-{}/".format(
- self.root, self.path, num)).text
+ url = f"{self.root}/ajax/{self.path}/page-{num}/"
+ page = self.request(url).text
if not page:
return
diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py
index eb68c3e..35263a3 100644
--- a/gallery_dl/extractor/flickr.py
+++ b/gallery_dl/extractor/flickr.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2017-2023 Mike Fährmann
+# Copyright 2017-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -134,8 +134,8 @@ class FlickrAlbumExtractor(FlickrExtractor):
for album in self.api.photosets_getList(self.user["nsid"]):
self.api._clean_info(album).update(data)
- url = "https://www.flickr.com/photos/{}/albums/{}".format(
- self.user["path_alias"], album["id"])
+ url = (f"https://www.flickr.com/photos/{self.user['path_alias']}"
+ f"/albums/{album['id']}")
yield Message.Queue, url, album
def metadata(self):
@@ -451,14 +451,13 @@ class FlickrAPI(oauth.OAuth1API):
raise exception.AuthenticationError(msg)
elif data["code"] == 99:
raise exception.AuthorizationError(msg)
- raise exception.StopExtraction("API request failed: %s", msg)
+ raise exception.AbortExtraction(f"API request failed: {msg}")
return data
def _pagination(self, method, params, key="photos"):
extras = ("description,date_upload,tags,views,media,"
"path_alias,owner_name,")
- includes = self.extractor.config("metadata")
- if includes:
+ if includes := self.extractor.config("metadata"):
if isinstance(includes, (list, tuple)):
includes = ",".join(includes)
elif not isinstance(includes, str):
@@ -585,8 +584,7 @@ class FlickrAPI(oauth.OAuth1API):
if "license" in photo:
photo["license_name"] = self.LICENSES.get(photo["license"])
- @staticmethod
- def _clean_info(info):
+ def _clean_info(self, info):
info["title"] = info["title"]["_content"]
info["description"] = info["description"]["_content"]
return info
diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py
index 5f90afc..dc23488 100644
--- a/gallery_dl/extractor/foolfuuka.py
+++ b/gallery_dl/extractor/foolfuuka.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -26,6 +26,9 @@ class FoolfuukaExtractor(BaseExtractor):
self.remote = self._remote_direct
elif self.category == "archivedmoe":
self.referer = False
+ self.fixup_redirect = True
+ else:
+ self.fixup_redirect = False
def items(self):
yield Message.Directory, self.metadata()
@@ -57,13 +60,45 @@ class FoolfuukaExtractor(BaseExtractor):
"""Resolve a remote media link"""
page = self.request(media["remote_media_link"]).text
url = text.extr(page, 'http-equiv="Refresh" content="0; url=', '"')
- if url.endswith(".webm") and \
- url.startswith("https://thebarchive.com/"):
- return url[:-1]
+
+ if url.startswith("https://thebarchive.com/"):
+ # '.webm' -> '.web' (#5116)
+ if url.endswith(".webm"):
+ url = url[:-1]
+
+ elif self.fixup_redirect:
+ # update redirect domain or filename (#7652)
+ path, _, filename = url.rpartition("/")
+
+ # these boards link directly to i.4cdn.org
+ # -> redirect to warosu or 4plebs instead
+ board_domains = {
+ "3" : "warosu.org",
+ "biz": "warosu.org",
+ "ck" : "warosu.org",
+ "diy": "warosu.org",
+ "fa" : "warosu.org",
+ "ic" : "warosu.org",
+ "jp" : "warosu.org",
+ "lit": "warosu.org",
+ "sci": "warosu.org",
+ "tg" : "archive.4plebs.org",
+ }
+ board = url.split("/", 4)[3]
+ if board in board_domains:
+ domain = board_domains[board]
+ url = f"https://{domain}/{board}/full_image/{filename}"
+
+            # for these archives, truncate long filenames to 13 characters
+ elif any(archive in path for archive in (
+ "b4k.", "desuarchive.", "palanq.")):
+ name, _, ext = filename.rpartition(".")
+ if len(name) > 13:
+ url = f"{path}/{name[:13]}.{ext}"
+
return url
- @staticmethod
- def _remote_direct(media):
+ def _remote_direct(self, media):
return media["remote_media_link"]
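The fixup above reduces to a small self-contained routine. In this sketch the board table and the 13-character cut mirror the hunk, while `fix_redirect_url` itself is an illustrative name:

def fix_redirect_url(url):
    path, _, filename = url.rpartition("/")

    # boards that link directly to i.4cdn.org -> archive mirrors
    board_domains = {
        "3": "warosu.org", "biz": "warosu.org", "ck": "warosu.org",
        "diy": "warosu.org", "fa": "warosu.org", "ic": "warosu.org",
        "jp": "warosu.org", "lit": "warosu.org", "sci": "warosu.org",
        "tg": "archive.4plebs.org",
    }
    board = url.split("/", 4)[3]
    if board in board_domains:
        return (f"https://{board_domains[board]}"
                f"/{board}/full_image/{filename}")

    # these archives expect filenames cut to 13 characters
    if any(a in path for a in ("b4k.", "desuarchive.", "palanq.")):
        name, _, ext = filename.rpartition(".")
        if len(name) > 13:
            return f"{path}/{name[:13]}.{ext}"

    return url

For example, an i.4cdn.org link for /tg/ resolves to https://archive.4plebs.org/tg/full_image/FILENAME.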
@@ -124,13 +159,12 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor):
def metadata(self):
url = self.root + "/_/api/chan/thread/"
params = {"board": self.board, "num": self.thread}
- self.data = self.request(url, params=params).json()[self.thread]
+ self.data = self.request_json(url, params=params)[self.thread]
return self.data["op"]
def posts(self):
op = (self.data["op"],)
- posts = self.data.get("posts")
- if posts:
+ if posts := self.data.get("posts"):
posts = list(posts.values())
posts.sort(key=lambda p: p["timestamp"])
return itertools.chain(op, posts)
@@ -149,13 +183,12 @@ class FoolfuukaBoardExtractor(FoolfuukaExtractor):
self.page = self.groups[-1]
def items(self):
- index_base = "{}/_/api/chan/index/?board={}&page=".format(
- self.root, self.board)
- thread_base = "{}/{}/thread/".format(self.root, self.board)
+ index_base = f"{self.root}/_/api/chan/index/?board={self.board}&page="
+ thread_base = f"{self.root}/{self.board}/thread/"
page = self.page
for pnum in itertools.count(text.parse_int(page, 1)):
- with self.request(index_base + format(pnum)) as response:
+ with self.request(index_base + str(pnum)) as response:
try:
threads = response.json()
except ValueError:
@@ -209,7 +242,7 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor):
while True:
try:
- data = self.request(url, params=params).json()
+ data = self.request_json(url, params=params)
except ValueError:
return
@@ -235,27 +268,17 @@ class FoolfuukaGalleryExtractor(FoolfuukaExtractor):
pattern = BASE_PATTERN + r"/([^/?#]+)/gallery(?:/(\d+))?"
example = "https://archived.moe/a/gallery"
- def __init__(self, match):
- FoolfuukaExtractor.__init__(self, match)
-
- board = match.group(match.lastindex)
- if board.isdecimal():
- self.board = match.group(match.lastindex-1)
- self.pages = (board,)
- else:
- self.board = board
- self.pages = map(format, itertools.count(1))
-
def metadata(self):
- return {"board": self.board}
+ self.board = board = self.groups[-2]
+ return {"board": board}
def posts(self):
- base = "{}/_/api/chan/gallery/?board={}&page=".format(
- self.root, self.board)
+ pnum = self.groups[-1]
+ pages = itertools.count(1) if pnum is None else (pnum,)
+ base = f"{self.root}/_/api/chan/gallery/?board={self.board}&page="
- for page in self.pages:
- with self.request(base + page) as response:
- posts = response.json()
+ for pnum in pages:
+ posts = self.request_json(f"{base}{pnum}")
if not posts:
return
yield from posts
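The page-selection switch above is a reusable shape: one explicit page, or all pages from 1 until an empty response. A minimal sketch, with `fetch_json` standing in for `request_json`:

import itertools

def gallery_posts(fetch_json, base, pnum=None):
    pages = itertools.count(1) if pnum is None else (pnum,)
    for pnum in pages:
        posts = fetch_json(f"{base}{pnum}")
        if not posts:  # empty page marks the end
            return
        yield from posts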
diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py
index bb684c2..7c59f72 100644
--- a/gallery_dl/extractor/foolslide.py
+++ b/gallery_dl/extractor/foolslide.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2023 Mike Fährmann
+# Copyright 2016-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -18,14 +18,13 @@ class FoolslideExtractor(BaseExtractor):
def __init__(self, match):
BaseExtractor.__init__(self, match)
- self.gallery_url = self.root + match.group(match.lastindex)
+ self.page_url = self.root + self.groups[-1]
def request(self, url):
return BaseExtractor.request(
self, url, encoding="utf-8", method="POST", data={"adult": "true"})
- @staticmethod
- def parse_chapter_url(url, data):
+ def parse_chapter_url(self, url, data):
info = url.partition("/read/")[2].rstrip("/").split("/")
lang = info[1].partition("-")[0]
data["lang"] = lang
@@ -52,7 +51,7 @@ class FoolslideChapterExtractor(FoolslideExtractor):
example = "https://read.powermanga.org/read/MANGA/en/0/123/"
def items(self):
- page = self.request(self.gallery_url).text
+ page = self.request(self.page_url).text
data = self.metadata(page)
imgs = self.images(page)
@@ -79,7 +78,7 @@ class FoolslideChapterExtractor(FoolslideExtractor):
def metadata(self, page):
extr = text.extract_from(page)
extr('<h1 class="tbtitle dnone">', '')
- return self.parse_chapter_url(self.gallery_url, {
+ return self.parse_chapter_url(self.page_url, {
"manga" : text.unescape(extr('title="', '"')).strip(),
"chapter_string": text.unescape(extr('title="', '"')),
})
@@ -96,7 +95,7 @@ class FoolslideMangaExtractor(FoolslideExtractor):
example = "https://read.powermanga.org/series/MANGA/"
def items(self):
- page = self.request(self.gallery_url).text
+ page = self.request(self.page_url).text
chapters = self.chapters(page)
if not self.config("chapter-reverse", False):
diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py
index 565fd71..0d24f83 100644
--- a/gallery_dl/extractor/furaffinity.py
+++ b/gallery_dl/extractor/furaffinity.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2020-2023 Mike Fährmann
+# Copyright 2020-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,7 @@
"""Extractors for https://www.furaffinity.net/"""
-from .common import Extractor, Message
+from .common import Extractor, Message, Dispatch
from .. import text, util
BASE_PATTERN = r"(?:https?://)?(?:www\.|sfw\.)?(?:f[ux]|f?xfu)raffinity\.net"
@@ -28,7 +28,7 @@ class FuraffinityExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.user = match.group(1)
+ self.user = match[1]
self.offset = 0
def _init(self):
@@ -51,8 +51,7 @@ class FuraffinityExtractor(Extractor):
def items(self):
metadata = self.metadata()
for post_id in util.advance(self.posts(), self.offset):
- post = self._parse_post(post_id)
- if post:
+ if post := self._parse_post(post_id):
if metadata:
post.update(metadata)
yield Message.Directory, post
@@ -71,7 +70,7 @@ class FuraffinityExtractor(Extractor):
return num
def _parse_post(self, post_id):
- url = "{}/view/{}/".format(self.root, post_id)
+ url = f"{self.root}/view/{post_id}/"
extr = text.extract_from(self.request(url).text)
if self._new_layout is None:
@@ -117,8 +116,7 @@ class FuraffinityExtractor(Extractor):
data["folders"] = folders = []
for folder in extr(
"<h3>Listed in Folders</h3>", "</section>").split("</a>"):
- folder = rh(folder)
- if folder:
+ if folder := rh(folder):
folders.append(folder)
else:
# old site layout
@@ -147,22 +145,19 @@ class FuraffinityExtractor(Extractor):
data["user"] = self.user or data["artist_url"]
data["date"] = text.parse_timestamp(data["filename"].partition(".")[0])
data["description"] = self._process_description(data["_description"])
- data["thumbnail"] = "https://t.furaffinity.net/{}@600-{}.jpg".format(
- post_id, path.rsplit("/", 2)[1])
-
+ data["thumbnail"] = (f"https://t.furaffinity.net/{post_id}@600-"
+ f"{path.rsplit('/', 2)[1]}.jpg")
return data
- @staticmethod
- def _process_description(description):
+ def _process_description(self, description):
return text.unescape(text.remove_html(description, "", ""))
def _pagination(self, path, folder=None):
num = 1
- folder = "" if folder is None else "/folder/{}/a".format(folder)
+ folder = "" if folder is None else f"/folder/{folder}/a"
while True:
- url = "{}/{}/{}{}/{}/".format(
- self.root, path, self.user, folder, num)
+ url = f"{self.root}/{path}/{self.user}{folder}/{num}/"
page = self.request(url).text
post_id = None
@@ -174,7 +169,7 @@ class FuraffinityExtractor(Extractor):
num += 1
def _pagination_favorites(self):
- path = "/favorites/{}/".format(self.user)
+ path = f"/favorites/{self.user}/"
while path:
page = self.request(self.root + path).text
@@ -188,7 +183,7 @@ class FuraffinityExtractor(Extractor):
pos = page.find('type="submit">Next</button>')
if pos >= 0:
- path = text.rextract(page, '<form action="', '"', pos)[0]
+ path = text.rextr(page, '<form action="', '"', pos)
continue
path = text.extr(page, 'right" href="', '"')
@@ -283,8 +278,7 @@ class FuraffinityFavoriteExtractor(FuraffinityExtractor):
return self._pagination_favorites()
def _parse_post(self, post_id):
- post = FuraffinityExtractor._parse_post(self, post_id)
- if post:
+ if post := FuraffinityExtractor._parse_post(self, post_id):
post["favorite_id"] = self._favorite_id
return post
@@ -298,7 +292,7 @@ class FuraffinitySearchExtractor(FuraffinityExtractor):
def __init__(self, match):
FuraffinityExtractor.__init__(self, match)
- self.query = text.parse_query(match.group(2))
+ self.query = text.parse_query(match[2])
if self.user and "q" not in self.query:
self.query["q"] = text.unquote(self.user)
@@ -321,24 +315,18 @@ class FuraffinityPostExtractor(FuraffinityExtractor):
return (post_id,)
-class FuraffinityUserExtractor(FuraffinityExtractor):
+class FuraffinityUserExtractor(Dispatch, FuraffinityExtractor):
"""Extractor for furaffinity user profiles"""
- subcategory = "user"
- cookies_domain = None
pattern = BASE_PATTERN + r"/user/([^/?#]+)"
example = "https://www.furaffinity.net/user/USER/"
- def initialize(self):
- pass
-
- skip = Extractor.skip
-
def items(self):
- base = "{}/{{}}/{}/".format(self.root, self.user)
+ base = self.root
+ user = f"{self.user}/"
return self._dispatch_extractors((
- (FuraffinityGalleryExtractor , base.format("gallery")),
- (FuraffinityScrapsExtractor , base.format("scraps")),
- (FuraffinityFavoriteExtractor, base.format("favorites")),
+ (FuraffinityGalleryExtractor , f"{base}/gallery/{user}"),
+ (FuraffinityScrapsExtractor , f"{base}/scraps/{user}"),
+ (FuraffinityFavoriteExtractor, f"{base}/favorites/{user}"),
), ("gallery",))
@@ -349,7 +337,7 @@ class FuraffinityFollowingExtractor(FuraffinityExtractor):
example = "https://www.furaffinity.net/watchlist/by/USER/"
def items(self):
- url = "{}/watchlist/by/{}/".format(self.root, self.user)
+ url = f"{self.root}/watchlist/by/{self.user}/"
data = {"_extractor": FuraffinityUserExtractor}
while True:
@@ -358,7 +346,7 @@ class FuraffinityFollowingExtractor(FuraffinityExtractor):
for path in text.extract_iter(page, '<a href="', '"'):
yield Message.Queue, self.root + path, data
- path = text.rextract(page, 'action="', '"')[0]
+ path = text.rextr(page, 'action="', '"')
if url.endswith(path):
return
url = self.root + path
@@ -382,9 +370,9 @@ class FuraffinitySubmissionsExtractor(FuraffinityExtractor):
for post_id in text.extract_iter(page, 'id="sid-', '"'):
yield post_id
- path = (text.extr(page, '<a class="button standard more" href="', '"') or # noqa 501
- text.extr(page, '<a class="more-half" href="', '"') or
- text.extr(page, '<a class="more" href="', '"'))
- if not path:
+ if (pos := page.find(">Next 48</a>")) < 0 and \
+ (pos := page.find(">&gt;&gt;&gt; Next 48 &gt;&gt;")) < 0:
return
+
+ path = text.rextr(page, 'href="', '"', pos)
url = self.root + text.unescape(path)
diff --git a/gallery_dl/extractor/furry34.py b/gallery_dl/extractor/furry34.py
index e0c7fdb..a93ec75 100644
--- a/gallery_dl/extractor/furry34.py
+++ b/gallery_dl/extractor/furry34.py
@@ -46,8 +46,8 @@ class Furry34Extractor(BooruExtractor):
post_id = post["id"]
root = self.root_cdn if files[fmt][0] else self.root
- post["file_url"] = url = "{}/posts/{}/{}/{}.{}".format(
- root, post_id // 1000, post_id, post_id, extension)
+ post["file_url"] = url = \
+ f"{root}/posts/{post_id // 1000}/{post_id}/{post_id}.{extension}"
post["format_id"] = fmt
post["format"] = extension.partition(".")[0]
@@ -73,11 +73,11 @@ class Furry34Extractor(BooruExtractor):
post["tags_" + types[type]] = values
def _fetch_post(self, post_id):
- url = "{}/api/v2/post/{}".format(self.root, post_id)
- return self.request(url).json()
+ url = f"{self.root}/api/v2/post/{post_id}"
+ return self.request_json(url)
def _pagination(self, endpoint, params=None):
- url = "{}/api{}".format(self.root, endpoint)
+ url = f"{self.root}/api{endpoint}"
if params is None:
params = {}
@@ -86,7 +86,7 @@ class Furry34Extractor(BooruExtractor):
threshold = self.per_page
while True:
- data = self.request(url, method="POST", json=params).json()
+ data = self.request_json(url, method="POST", json=params)
yield from data["items"]
diff --git a/gallery_dl/extractor/fuskator.py b/gallery_dl/extractor/fuskator.py
index beecbff..b7cf0c8 100644
--- a/gallery_dl/extractor/fuskator.py
+++ b/gallery_dl/extractor/fuskator.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -21,13 +21,13 @@ class FuskatorGalleryExtractor(GalleryExtractor):
example = "https://fuskator.com/thumbs/ID/"
def __init__(self, match):
- self.gallery_hash = match.group(1)
- url = "{}/thumbs/{}/index.html".format(self.root, self.gallery_hash)
+ self.gallery_hash = match[1]
+ url = f"{self.root}/thumbs/{self.gallery_hash}/index.html"
GalleryExtractor.__init__(self, match, url)
def metadata(self, page):
headers = {
- "Referer" : self.gallery_url,
+ "Referer" : self.page_url,
"X-Requested-With": "XMLHttpRequest",
}
auth = self.request(
@@ -39,9 +39,8 @@ class FuskatorGalleryExtractor(GalleryExtractor):
"hash" : self.gallery_hash,
"_" : int(time.time()),
}
- self.data = data = self.request(
- self.root + "/ajax/gal.aspx", params=params, headers=headers,
- ).json()
+ self.data = data = self.request_json(
+ self.root + "/ajax/gal.aspx", params=params, headers=headers)
title = text.extr(page, "<title>", "</title>").strip()
title, _, gallery_id = title.rpartition("#")
@@ -72,7 +71,7 @@ class FuskatorSearchExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.path = match.group(1)
+ self.path = match[1]
def items(self):
url = self.root + self.path
@@ -87,4 +86,4 @@ class FuskatorSearchExtractor(Extractor):
pages = text.extr(page, 'class="pages"><span>', '>&gt;&gt;<')
if not pages:
return
- url = self.root + text.rextract(pages, 'href="', '"')[0]
+ url = self.root + text.rextr(pages, 'href="', '"')
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index f24b696..b152885 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2023 Mike Fährmann
+# Copyright 2014-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -26,11 +26,19 @@ class GelbooruBase():
def _api_request(self, params, key="post", log=False):
if "s" not in params:
params["s"] = "post"
+
params["api_key"] = self.api_key
params["user_id"] = self.user_id
url = self.root + "/index.php?page=dapi&q=index&json=1"
- data = self.request(url, params=params).json()
+ try:
+ data = self.request_json(url, params=params)
+ except exception.HttpError as exc:
+ if exc.status == 401:
+ raise exception.AuthorizationError(
+ f"'api-key' and 'user-id' required "
+ f"({exc.status}: {exc.response.reason})")
+ raise
if not key:
return data
@@ -73,7 +81,7 @@ class GelbooruBase():
if id:
tag = "id:" + op
tags = [t for t in tags if not t.startswith(tag)]
- tags = "{} id:{}".format(" ".join(tags), op)
+ tags = f"{' '.join(tags)} id:{op}"
while True:
posts = self._api_request(params)
@@ -113,7 +121,7 @@ class GelbooruBase():
post["_fallback"] = (url,)
md5 = post["md5"]
root = text.root_from_url(post["preview_url"])
- path = "/images/{}/{}/{}.webm".format(md5[0:2], md5[2:4], md5)
+ path = f"/images/{md5[0:2]}/{md5[2:4]}/{md5}.webm"
url = root + path
return url
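The fallback path above follows the common booru sharding scheme: files sit under the first two hex-digit pairs of their MD5. For example:

md5 = "d41d8cd98f00b204e9800998ecf8427e"  # illustrative value
path = f"/images/{md5[0:2]}/{md5[2:4]}/{md5}.webm"
# -> '/images/d4/1d/d41d8cd98f00b204e9800998ecf8427e.webm'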
@@ -292,7 +300,7 @@ class GelbooruRedirectExtractor(GelbooruBase, Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.url_base64 = match.group(1)
+ self.url_base64 = match[1]
def items(self):
url = text.ensure_http_scheme(binascii.a2b_base64(
diff --git a/gallery_dl/extractor/gelbooru_v01.py b/gallery_dl/extractor/gelbooru_v01.py
index 0b96048..61d0545 100644
--- a/gallery_dl/extractor/gelbooru_v01.py
+++ b/gallery_dl/extractor/gelbooru_v01.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2021-2023 Mike Fährmann
+# Copyright 2021-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -17,8 +17,7 @@ class GelbooruV01Extractor(booru.BooruExtractor):
per_page = 20
def _parse_post(self, post_id):
- url = "{}/index.php?page=post&s=view&id={}".format(
- self.root, post_id)
+ url = f"{self.root}/index.php?page=post&s=view&id={post_id}"
extr = text.extract_from(self.request(url).text)
post = {
@@ -92,16 +91,12 @@ class GelbooruV01TagExtractor(GelbooruV01Extractor):
pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]+)"
example = "https://allgirl.booru.org/index.php?page=post&s=list&tags=TAG"
- def __init__(self, match):
- GelbooruV01Extractor.__init__(self, match)
- self.tags = match.group(match.lastindex)
-
def metadata(self):
- return {"search_tags": text.unquote(self.tags.replace("+", " "))}
+ self.tags = tags = self.groups[-1]
+ return {"search_tags": text.unquote(tags.replace("+", " "))}
def posts(self):
- url = "{}/index.php?page=post&s=list&tags={}&pid=".format(
- self.root, self.tags)
+ url = f"{self.root}/index.php?page=post&s=list&tags={self.tags}&pid="
return self._pagination(url, 'class="thumb"><a id="p', '"')
@@ -113,16 +108,13 @@ class GelbooruV01FavoriteExtractor(GelbooruV01Extractor):
pattern = BASE_PATTERN + r"/index\.php\?page=favorites&s=view&id=(\d+)"
example = "https://allgirl.booru.org/index.php?page=favorites&s=view&id=1"
- def __init__(self, match):
- GelbooruV01Extractor.__init__(self, match)
- self.favorite_id = match.group(match.lastindex)
-
def metadata(self):
- return {"favorite_id": text.parse_int(self.favorite_id)}
+ self.favorite_id = fav_id = self.groups[-1]
+ return {"favorite_id": text.parse_int(fav_id)}
def posts(self):
- url = "{}/index.php?page=favorites&s=view&id={}&pid=".format(
- self.root, self.favorite_id)
+ url = (f"{self.root}/index.php"
+ f"?page=favorites&s=view&id={self.favorite_id}&pid=")
return self._pagination(url, "posts[", "]")
@@ -132,9 +124,5 @@ class GelbooruV01PostExtractor(GelbooruV01Extractor):
pattern = BASE_PATTERN + r"/index\.php\?page=post&s=view&id=(\d+)"
example = "https://allgirl.booru.org/index.php?page=post&s=view&id=12345"
- def __init__(self, match):
- GelbooruV01Extractor.__init__(self, match)
- self.post_id = match.group(match.lastindex)
-
def posts(self):
- return (self._parse_post(self.post_id),)
+ return (self._parse_post(self.groups[-1]),)
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py
index 2c1174a..c12a7a2 100644
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2021-2022 Mike Fährmann
+# Copyright 2021-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -10,10 +10,7 @@
from . import booru
from .. import text, util, exception
-
-from xml.etree import ElementTree
import collections
-import re
class GelbooruV02Extractor(booru.BooruExtractor):
@@ -24,9 +21,12 @@ class GelbooruV02Extractor(booru.BooruExtractor):
self.user_id = self.config("user-id")
self.root_api = self.config_instance("root-api") or self.root
+ if self.category == "rule34":
+ self._file_url = self._file_url_rule34
+
def _api_request(self, params):
url = self.root_api + "/index.php?page=dapi&s=post&q=index"
- return ElementTree.fromstring(self.request(url, params=params).text)
+ return self.request_xml(url, params=params)
def _pagination(self, params):
params["pid"] = self.page_start
@@ -38,7 +38,7 @@ class GelbooruV02Extractor(booru.BooruExtractor):
while True:
try:
root = self._api_request(params)
- except ElementTree.ParseError:
+ except SyntaxError: # ElementTree.ParseError
if "tags" not in params or post is None:
raise
taglist = [tag for tag in params["tags"].split()
@@ -50,7 +50,9 @@ class GelbooruV02Extractor(booru.BooruExtractor):
if total is None:
try:
- total = int(root.attrib["count"])
+ self.kwdict["total"] = total = int(root.attrib["count"])
+ if "search_tags" in self.kwdict:
+ self.kwdict["search_count"] = total
self.log.debug("%s posts in total", total)
except Exception as exc:
total = 0
@@ -78,7 +80,7 @@ class GelbooruV02Extractor(booru.BooruExtractor):
params["pid"] = self.page_start * self.per_page
data = {}
- find_ids = re.compile(r"\sid=\"p(\d+)").findall
+ find_ids = util.re(r"\sid=\"p(\d+)").findall
while True:
page = self.request(url, params=params).text
@@ -92,15 +94,24 @@ class GelbooruV02Extractor(booru.BooruExtractor):
return
params["pid"] += self.per_page
- @staticmethod
- def _prepare(post):
+ def _file_url_rule34(self, post):
+ url = post["file_url"]
+
+ if text.ext_from_url(url) not in util.EXTS_VIDEO:
+ path = url.partition(".")[2]
+ post["_fallback"] = (url,)
+ post["file_url"] = url = "https://wimg." + path
+
+ return url
+
+ def _prepare(self, post):
post["tags"] = post["tags"].strip()
post["date"] = text.parse_datetime(
post["created_at"], "%a %b %d %H:%M:%S %z %Y")
def _html(self, post):
- return self.request("{}/index.php?page=post&s=view&id={}".format(
- self.root, post["id"])).text
+ url = f"{self.root}/index.php?page=post&s=view&id={post['id']}"
+ return self.request(url).text
def _tags(self, post, page):
tag_container = (text.extr(page, '<ul id="tag-', '</ul>') or
@@ -109,8 +120,7 @@ class GelbooruV02Extractor(booru.BooruExtractor):
return
tags = collections.defaultdict(list)
- pattern = re.compile(
- r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)", re.S)
+ pattern = util.re(r"(?s)tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)")
for tag_type, tag_name in pattern.findall(tag_container):
tags[tag_type].append(text.unescape(text.unquote(tag_name)))
for key, value in tags.items():
@@ -166,18 +176,13 @@ class GelbooruV02TagExtractor(GelbooruV02Extractor):
pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]*)"
example = "https://safebooru.org/index.php?page=post&s=list&tags=TAG"
- def __init__(self, match):
- GelbooruV02Extractor.__init__(self, match)
- tags = match.group(match.lastindex)
- self.tags = text.unquote(tags.replace("+", " "))
-
- def metadata(self):
- return {"search_tags": self.tags}
-
def posts(self):
- if self.tags == "all":
- self.tags = ""
- return self._pagination({"tags": self.tags})
+ self.kwdict["search_tags"] = tags = text.unquote(
+ self.groups[-1].replace("+", " "))
+
+ if tags == "all":
+ tags = ""
+ return self._pagination({"tags": tags})
class GelbooruV02PoolExtractor(GelbooruV02Extractor):
@@ -189,7 +194,7 @@ class GelbooruV02PoolExtractor(GelbooruV02Extractor):
def __init__(self, match):
GelbooruV02Extractor.__init__(self, match)
- self.pool_id = match.group(match.lastindex)
+ self.pool_id = self.groups[-1]
if self.category == "rule34":
self.posts = self._posts_pages
@@ -202,8 +207,7 @@ class GelbooruV02PoolExtractor(GelbooruV02Extractor):
return num
def metadata(self):
- url = "{}/index.php?page=pool&s=show&id={}".format(
- self.root, self.pool_id)
+ url = f"{self.root}/index.php?page=pool&s=show&id={self.pool_id}"
page = self.request(url).text
name, pos = text.extract(page, "<h4>Pool: ", "</h4>")
@@ -239,12 +243,9 @@ class GelbooruV02FavoriteExtractor(GelbooruV02Extractor):
pattern = BASE_PATTERN + r"/index\.php\?page=favorites&s=view&id=(\d+)"
example = "https://safebooru.org/index.php?page=favorites&s=view&id=12345"
- def __init__(self, match):
- GelbooruV02Extractor.__init__(self, match)
- self.favorite_id = match.group(match.lastindex)
-
def metadata(self):
- return {"favorite_id": text.parse_int(self.favorite_id)}
+ self.favorite_id = fav_id = self.groups[-1]
+ return {"favorite_id": text.parse_int(fav_id)}
def posts(self):
return self._pagination_html({
@@ -260,9 +261,5 @@ class GelbooruV02PostExtractor(GelbooruV02Extractor):
pattern = BASE_PATTERN + r"/index\.php\?page=post&s=view&id=(\d+)"
example = "https://safebooru.org/index.php?page=post&s=view&id=12345"
- def __init__(self, match):
- GelbooruV02Extractor.__init__(self, match)
- self.post_id = match.group(match.lastindex)
-
def posts(self):
- return self._pagination({"id": self.post_id})
+ return self._pagination({"id": self.groups[-1]})
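A minimal sketch of the rule34 rewrite added above as `_file_url_rule34`: non-video files are tried on the 'wimg.' host first and the original URL is kept as a fallback. The extension check here simplifies `util.EXTS_VIDEO`:

def file_url_rule34(post, video_exts=("mp4", "webm")):
    url = post["file_url"]
    if url.rpartition(".")[2].lower() not in video_exts:
        post["_fallback"] = (url,)                     # keep the original
        url = "https://wimg." + url.partition(".")[2]  # swap the subdomain
        post["file_url"] = url
    return url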
diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py
index 4b04732..407e478 100644
--- a/gallery_dl/extractor/generic.py
+++ b/gallery_dl/extractor/generic.py
@@ -7,9 +7,8 @@
"""Generic information extractor"""
from .common import Extractor, Message
-from .. import config, text
+from .. import config, text, util
import os.path
-import re
class GenericExtractor(Extractor):
@@ -37,28 +36,28 @@ class GenericExtractor(Extractor):
example = "generic:https://www.nongnu.org/lzip/"
def __init__(self, match):
- self.subcategory = match.group('domain')
+ self.subcategory = match['domain']
Extractor.__init__(self, match)
# Strip the "g(eneric):" prefix
# and inform about "forced" or "fallback" mode
- if match.group('generic'):
- self.url = match.group(0).partition(":")[2]
+ if match['generic']:
+ self.url = match[0].partition(":")[2]
else:
self.log.info("Falling back on generic information extractor.")
- self.url = match.group(0)
+ self.url = match[0]
# Make sure we have a scheme, or use https
- if match.group('scheme'):
- self.scheme = match.group('scheme')
+ if match['scheme']:
+ self.scheme = match['scheme']
else:
self.scheme = 'https://'
self.url = text.ensure_http_scheme(self.url, self.scheme)
- self.path = match.group('path')
+ self.path = match['path']
# Used to resolve relative image urls
- self.root = self.scheme + match.group('domain')
+ self.root = self.scheme + match['domain']
def items(self):
"""Get page, extract metadata & images, yield them in suitable messages
@@ -172,8 +171,8 @@ class GenericExtractor(Extractor):
r"(?:[^\"'<>\s]*)?" # optional query and fragment
)
- imageurls_src = re.findall(imageurl_pattern_src, page)
- imageurls_ext = re.findall(imageurl_pattern_ext, page)
+ imageurls_src = util.re(imageurl_pattern_src).findall(page)
+ imageurls_ext = util.re(imageurl_pattern_ext).findall(page)
imageurls = imageurls_src + imageurls_ext
# Resolve relative urls
@@ -182,10 +181,10 @@ class GenericExtractor(Extractor):
# by prepending a suitable base url.
#
# If the page contains a <base> element, use it as base url
- basematch = re.search(
- r"(?i)(?:<base\s.*?href=[\"']?)(?P<url>[^\"' >]+)", page)
+ basematch = util.re(
+ r"(?i)(?:<base\s.*?href=[\"']?)(?P<url>[^\"' >]+)").search(page)
if basematch:
- self.baseurl = basematch.group('url').rstrip('/')
+ self.baseurl = basematch['url'].rstrip('/')
# Otherwise, extract the base url from self.url
else:
if self.url.endswith("/"):
diff --git a/gallery_dl/extractor/girlsreleased.py b/gallery_dl/extractor/girlsreleased.py
new file mode 100644
index 0000000..4fc77c6
--- /dev/null
+++ b/gallery_dl/extractor/girlsreleased.py
@@ -0,0 +1,76 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://girlsreleased.com/"""
+
+from .common import Extractor, Message
+from .. import text
+import itertools
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?girlsreleased\.com"
+
+
+class GirlsreleasedExtractor(Extractor):
+ """Base class for girlsreleased extractors"""
+ category = "girlsreleased"
+ root = "https://girlsreleased.com"
+ request_interval = 0.5
+ request_interval_min = 0.2
+
+ def items(self):
+ data = {"_extractor": GirlsreleasedSetExtractor}
+ base = f"{self.root}/set/"
+        for item in self._pagination():
+            yield Message.Queue, f"{base}{item[0]}", data
+
+ def _pagination(self):
+ base = f"{self.root}/api/0.1/sets/{self._path}/{self.groups[0]}/page/"
+ for pnum in itertools.count():
+ sets = self.request_json(f"{base}{pnum}")["sets"]
+ if not sets:
+ return
+
+ yield from sets[1:] if pnum else sets
+ if len(sets) < 80:
+ return
+
+
+class GirlsreleasedSetExtractor(GirlsreleasedExtractor):
+ """Extractor for girlsreleased galleries"""
+ subcategory = "set"
+ pattern = BASE_PATTERN + r"/set/(\d+)"
+ example = "https://girlsreleased.com/set/12345"
+
+ def items(self):
+ url = f"{self.root}/api/0.1/set/{self.groups[0]}"
+        info = self.request_json(url)["set"]
+        data = {
+            "title": info["name"] or info["id"],
+            "id": info["id"],
+            "site": info["site"],
+            "model": [model for _, model in info["models"]],
+            "date": text.parse_timestamp(info["date"]),
+            "count": len(info["images"]),
+            "url": "https://girlsreleased.com/set/" + info["id"],
+        }
+        yield Message.Directory, data
+        for data["num"], image in enumerate(info["images"], 1):
+ text.nameext_from_url(image[5], data)
+ yield Message.Queue, image[3], data
+
+
+class GirlsreleasedModelExtractor(GirlsreleasedExtractor):
+ """Extractor for girlsreleased models"""
+ subcategory = _path = "model"
+ pattern = BASE_PATTERN + r"/model/(\d+(?:/.+)?)"
+ example = "https://girlsreleased.com/model/12345/MODEL"
+
+
+class GirlsreleasedSiteExtractor(GirlsreleasedExtractor):
+ """Extractor for girlsreleased sites"""
+ subcategory = _path = "site"
+ pattern = BASE_PATTERN + r"/site/([^/?#]+(?:/model/\d+/?.*)?)"
+ example = "https://girlsreleased.com/site/SITE"
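The sets API above serves 80 entries per page. A sketch of walking it directly, assuming plain `requests`; the endpoint layout is taken from the extractor:

import itertools
import requests

def iter_sets(kind, key):  # kind is "model" or "site"
    base = f"https://girlsreleased.com/api/0.1/sets/{kind}/{key}/page/"
    for pnum in itertools.count():
        sets = requests.get(f"{base}{pnum}").json()["sets"]
        if not sets:
            return
        # mirror the extractor: drop the first entry on follow-up pages
        yield from sets[1:] if pnum else sets
        if len(sets) < 80:  # short page -> last page
            return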
diff --git a/gallery_dl/extractor/girlswithmuscle.py b/gallery_dl/extractor/girlswithmuscle.py
new file mode 100644
index 0000000..51b979f
--- /dev/null
+++ b/gallery_dl/extractor/girlswithmuscle.py
@@ -0,0 +1,177 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from .common import Extractor, Message
+from .. import text, util, exception
+from ..cache import cache
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?girlswithmuscle\.com"
+
+
+class GirlswithmuscleExtractor(Extractor):
+ """Base class for girlswithmuscle extractors"""
+ category = "girlswithmuscle"
+ root = "https://www.girlswithmuscle.com"
+ directory_fmt = ("{category}", "{model}")
+ filename_fmt = "{model}_{id}.{extension}"
+ archive_fmt = "{type}_{model}_{id}"
+
+ def login(self):
+ username, password = self._get_auth_info()
+ if username:
+ self.cookies_update(self._login_impl(username, password))
+
+ @cache(maxage=14*86400, keyarg=1)
+ def _login_impl(self, username, password):
+ self.log.info("Logging in as %s", username)
+
+ url = self.root + "/login/"
+ page = self.request(url).text
+ csrf_token = text.extr(page, 'name="csrfmiddlewaretoken" value="', '"')
+
+ headers = {
+ "Origin" : self.root,
+ "Referer": url,
+ }
+ data = {
+ "csrfmiddlewaretoken": csrf_token,
+ "username": username,
+ "password": password,
+ "next": "/",
+ }
+ response = self.request(
+ url, method="POST", headers=headers, data=data)
+
+ if not response.history:
+ raise exception.AuthenticationError()
+
+ page = response.text
+ if ">Wrong username or password" in page:
+ raise exception.AuthenticationError()
+ if ">Log in<" in page:
+ raise exception.AuthenticationError("Account data is missing")
+
+ return {c.name: c.value for c in response.history[0].cookies}
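# A hedged sketch of the CSRF login flow above, using plain `requests`
# (URL and form-field names mirror the code; the rest is illustrative):
import requests

def login(root, username, password):
    s = requests.Session()
    page = s.get(root + "/login/").text
    token = page.partition(
        'name="csrfmiddlewaretoken" value="')[2].partition('"')[0]
    r = s.post(root + "/login/",
               data={"csrfmiddlewaretoken": token, "username": username,
                     "password": password, "next": "/"},
               headers={"Origin": root, "Referer": root + "/login/"})
    if not r.history:  # a successful login redirects
        raise RuntimeError("authentication failed")
    return s.cookies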
+
+
+class GirlswithmusclePostExtractor(GirlswithmuscleExtractor):
+ """Extractor for individual posts on girlswithmuscle.com"""
+ subcategory = "post"
+ pattern = BASE_PATTERN + r"/(\d+)"
+ example = "https://www.girlswithmuscle.com/12345/"
+
+ def items(self):
+ self.login()
+
+ url = f"{self.root}/{self.groups[0]}/"
+ page = self.request(url).text
+ if not page:
+ raise exception.NotFoundError("post")
+
+ metadata = self.metadata(page)
+
+ if url := text.extr(page, 'class="main-image" src="', '"'):
+ metadata["type"] = "picture"
+ else:
+ url = text.extr(page, '<source src="', '"')
+ metadata["type"] = "video"
+
+ text.nameext_from_url(url, metadata)
+ yield Message.Directory, metadata
+ yield Message.Url, url, metadata
+
+ def metadata(self, page):
+ source = text.remove_html(text.extr(
+ page, '<div id="info-source" style="display: none">', "</div>"))
+ image_info = text.extr(
+ page, '<div class="image-info">', "</div>")
+ uploader = text.remove_html(text.extr(
+ image_info, '<span class="username-html">', "</a>"))
+
+ tags = text.extr(page, 'id="tags-text">', "</div>")
+ score = text.parse_int(text.remove_html(text.extr(
+ page, "Score: <b>", "</span")))
+ model = self._extract_model(page)
+
+ return {
+ "id": self.groups[0],
+ "model": model,
+ "model_list": self._parse_model_list(model),
+ "tags": text.split_html(tags)[1::2],
+ "date": text.parse_datetime(
+ text.extr(page, 'class="hover-time" title="', '"')[:19],
+ "%Y-%m-%d %H:%M:%S"),
+ "is_favorite": self._parse_is_favorite(page),
+ "source_filename": source,
+ "uploader": uploader,
+ "score": score,
+ "comments": self._extract_comments(page),
+ }
+
+ def _extract_model(self, page):
+ model = text.extr(page, "<title>", "</title>")
+ return "unknown" if model.startswith("Picture #") else model
+
+ def _parse_model_list(self, model):
+ if model == "unknown":
+ return []
+ else:
+ return [name.strip() for name in model.split(",")]
+
+ def _parse_is_favorite(self, page):
+ fav_button = text.extr(
+ page, 'id="favorite-button">', "</span>")
+ unfav_button = text.extr(
+ page, 'class="actionbutton unfavorite-button">', "</span>")
+
+ is_favorite = None
+ if unfav_button == "Unfavorite":
+ is_favorite = True
+ if fav_button == "Favorite":
+ is_favorite = False
+
+ return is_favorite
+
+ def _extract_comments(self, page):
+ comments = text.extract_iter(
+ page, '<div class="comment-body-inner">', "</div>")
+ return [comment.strip() for comment in comments]
+
+
+class GirlswithmuscleSearchExtractor(GirlswithmuscleExtractor):
+ """Extractor for search results on girlswithmuscle.com"""
+ subcategory = "search"
+ pattern = BASE_PATTERN + r"/images/(.*)"
+ example = "https://www.girlswithmuscle.com/images/?name=MODEL"
+
+ def pages(self):
+ query = self.groups[0]
+ url = f"{self.root}/images/{query}"
+ response = self.request(url)
+ if response.history:
+ msg = f'Request was redirected to "{response.url}", try logging in'
+ raise exception.AuthorizationError(msg)
+ page = response.text
+
+ match = util.re(r"Page (\d+) of (\d+)").search(page)
+ current, total = match.groups()
+ current, total = text.parse_int(current), text.parse_int(total)
+
+ yield page
+ for i in range(current + 1, total + 1):
+ url = f"{self.root}/images/{i}/{query}"
+ yield self.request(url).text
+
+ def items(self):
+ self.login()
+ for page in self.pages():
+ data = {
+ "_extractor" : GirlswithmusclePostExtractor,
+ "gallery_name": text.unescape(text.extr(page, "<title>", "<")),
+ }
+ for imgid in text.extract_iter(page, 'id="imgid-', '"'):
+ url = f"{self.root}/{imgid}/"
+ yield Message.Queue, url, data
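The search pagination above keys off the page counter embedded in the HTML. Reduced to its core, with `fetch` as a hypothetical authenticated page request:

import re

def iter_search_pages(fetch, root, query):
    page = fetch(f"{root}/images/{query}")
    current, total = map(
        int, re.search(r"Page (\d+) of (\d+)", page).groups())
    yield page
    for i in range(current + 1, total + 1):
        yield fetch(f"{root}/images/{i}/{query}")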
diff --git a/gallery_dl/extractor/gofile.py b/gallery_dl/extractor/gofile.py
index ef9ea60..0a6c9b9 100644
--- a/gallery_dl/extractor/gofile.py
+++ b/gallery_dl/extractor/gofile.py
@@ -23,7 +23,7 @@ class GofileFolderExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.content_id = match.group(1)
+ self.content_id = match[1]
def items(self):
recursive = self.config("recursive")
@@ -86,17 +86,16 @@ class GofileFolderExtractor(Extractor):
return self._api_request("contents/" + content_id, params, headers)
def _api_request(self, endpoint, params=None, headers=None, method="GET"):
- response = self.request(
+ response = self.request_json(
"https://api.gofile.io/" + endpoint,
- method=method, params=params, headers=headers,
- ).json()
+ method=method, params=params, headers=headers)
if response["status"] != "ok":
if response["status"] == "error-notFound":
raise exception.NotFoundError("content")
if response["status"] == "error-passwordRequired":
raise exception.AuthorizationError("Password required")
- raise exception.StopExtraction(
- "%s failed (Status: %s)", endpoint, response["status"])
+ raise exception.AbortExtraction(
+ f"{endpoint} failed (Status: {response['status']})")
return response["data"]
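The gofile API reports failures through a JSON 'status' field rather than HTTP codes, so every response is checked before its 'data' payload is used. A sketch with stand-in exception types:

def unwrap(response):
    status = response["status"]
    if status == "ok":
        return response["data"]
    if status == "error-notFound":
        raise FileNotFoundError("content")
    if status == "error-passwordRequired":
        raise PermissionError("Password required")
    raise RuntimeError(f"request failed (Status: {status})")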
diff --git a/gallery_dl/extractor/hatenablog.py b/gallery_dl/extractor/hatenablog.py
index 792f666..8e350d6 100644
--- a/gallery_dl/extractor/hatenablog.py
+++ b/gallery_dl/extractor/hatenablog.py
@@ -6,9 +6,8 @@
"""Extractors for https://hatenablog.com"""
-import re
from .common import Extractor, Message
-from .. import text
+from .. import text, util
BASE_PATTERN = (
@@ -28,10 +27,10 @@ class HatenablogExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.domain = match.group(1) or match.group(2)
+ self.domain = match[1] or match[2]
def _init(self):
- self._find_img = re.compile(r'<img +([^>]+)').finditer
+ self._find_img = util.re(r'<img +([^>]+)').finditer
def _handle_article(self, article: str):
extr = text.extract_from(article)
@@ -43,8 +42,8 @@ class HatenablogExtractor(Extractor):
'<div class="entry-content hatenablog-entry">', '</div>')
images = []
- for i in self._find_img(content):
- attributes = i.group(1)
+ for match in self._find_img(content):
+ attributes = match[1]
if 'class="hatena-fotolife"' not in attributes:
continue
image = text.unescape(text.extr(attributes, 'src="', '"'))
@@ -68,13 +67,13 @@ class HatenablogEntriesExtractor(HatenablogExtractor):
def __init__(self, match):
HatenablogExtractor.__init__(self, match)
- self.path = match.group(3)
+ self.path = match[3]
self.query = {key: value for key, value in text.parse_query(
- match.group(4)).items() if self._acceptable_query(key)}
+ match[4]).items() if self._acceptable_query(key)}
def _init(self):
HatenablogExtractor._init(self)
- self._find_pager_url = re.compile(
+ self._find_pager_url = util.re(
r' class="pager-next">\s*<a href="([^"]+)').search
def items(self):
@@ -92,7 +91,7 @@ class HatenablogEntriesExtractor(HatenablogExtractor):
yield from self._handle_full_articles(extr)
match = self._find_pager_url(page)
- url = text.unescape(match.group(1)) if match else None
+ url = text.unescape(match[1]) if match else None
query = None
def _handle_partial_articles(self, extr):
@@ -129,7 +128,7 @@ class HatenablogEntryExtractor(HatenablogExtractor):
def __init__(self, match):
HatenablogExtractor.__init__(self, match)
- self.path = match.group(3)
+ self.path = match[3]
def items(self):
url = "https://" + self.domain + "/entry/" + self.path
diff --git a/gallery_dl/extractor/hentai2read.py b/gallery_dl/extractor/hentai2read.py
index 1317ce9..ac4cd02 100644
--- a/gallery_dl/extractor/hentai2read.py
+++ b/gallery_dl/extractor/hentai2read.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2023 Mike Fährmann
+# Copyright 2016-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -10,7 +10,6 @@
from .common import ChapterExtractor, MangaExtractor
from .. import text, util
-import re
class Hentai2readBase():
@@ -31,8 +30,9 @@ class Hentai2readChapterExtractor(Hentai2readBase, ChapterExtractor):
chapter_id, pos = text.extract(page, 'data-cid="', '"', pos)
chapter, sep, minor = self.groups[1].partition(".")
- match = re.match(r"Reading (.+) \(([^)]+)\) Hentai(?: by (.*))? - "
- r"([^:]+): (.+) . Page 1 ", title)
+ match = util.re(
+ r"Reading (.+) \(([^)]+)\) Hentai(?: by (.*))? - "
+ r"([^:]+): (.+) . Page 1 ").match(title)
if match:
manga, type, author, _, title = match.groups()
else:
diff --git a/gallery_dl/extractor/hentaicosplays.py b/gallery_dl/extractor/hentaicosplays.py
index 4992b7b..5c2628f 100644
--- a/gallery_dl/extractor/hentaicosplays.py
+++ b/gallery_dl/extractor/hentaicosplays.py
@@ -44,10 +44,10 @@ class HentaicosplaysGalleryExtractor(
def __init__(self, match):
BaseExtractor.__init__(self, match)
self.slug = self.groups[-1]
- self.gallery_url = "{}/story/{}/".format(self.root, self.slug)
+ self.page_url = f"{self.root}/story/{self.slug}/"
def _init(self):
- self.session.headers["Referer"] = self.gallery_url
+ self.session.headers["Referer"] = self.page_url
def metadata(self, page):
title = text.extr(page, "<title>", "</title>")
diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py
index 7e128a4..e529940 100644
--- a/gallery_dl/extractor/hentaifoundry.py
+++ b/gallery_dl/extractor/hentaifoundry.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2023 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,7 @@
"""Extractors for https://www.hentai-foundry.com/"""
-from .common import Extractor, Message
+from .common import Extractor, Message, Dispatch
from .. import text, util
BASE_PATTERN = r"(https?://)?(?:www\.)?hentai-foundry\.com"
@@ -25,8 +25,8 @@ class HentaifoundryExtractor(Extractor):
per_page = 25
def __init__(self, match):
- self.root = (match.group(1) or "https://") + "www.hentai-foundry.com"
- self.user = match.group(2)
+ self.root = (match[1] or "https://") + "www.hentai-foundry.com"
+ self.user = match[2]
Extractor.__init__(self, match)
self.page_url = ""
self.start_post = 0
@@ -58,7 +58,7 @@ class HentaifoundryExtractor(Extractor):
num = self.start_page
while True:
- page = self.request("{}/page/{}".format(url, num)).text
+ page = self.request(f"{url}/page/{num}").text
yield from text.extract_iter(page, begin, end)
if 'class="pager"' not in page or 'class="last hidden"' in page:
@@ -192,15 +192,11 @@ class HentaifoundryExtractor(Extractor):
self.request(url, method="POST", data=data)
-class HentaifoundryUserExtractor(HentaifoundryExtractor):
+class HentaifoundryUserExtractor(Dispatch, HentaifoundryExtractor):
"""Extractor for a hentaifoundry user profile"""
- subcategory = "user"
pattern = BASE_PATTERN + r"/user/([^/?#]+)/profile"
example = "https://www.hentai-foundry.com/user/USER/profile"
- def initialize(self):
- pass
-
def items(self):
root = self.root
user = "/user/" + self.user
@@ -224,7 +220,7 @@ class HentaifoundryPicturesExtractor(HentaifoundryExtractor):
def __init__(self, match):
HentaifoundryExtractor.__init__(self, match)
- self.page_url = "{}/pictures/user/{}".format(self.root, self.user)
+ self.page_url = f"{self.root}/pictures/user/{self.user}"
class HentaifoundryScrapsExtractor(HentaifoundryExtractor):
@@ -236,8 +232,7 @@ class HentaifoundryScrapsExtractor(HentaifoundryExtractor):
def __init__(self, match):
HentaifoundryExtractor.__init__(self, match)
- self.page_url = "{}/pictures/user/{}/scraps".format(
- self.root, self.user)
+ self.page_url = f"{self.root}/pictures/user/{self.user}/scraps"
class HentaifoundryFavoriteExtractor(HentaifoundryExtractor):
@@ -250,8 +245,7 @@ class HentaifoundryFavoriteExtractor(HentaifoundryExtractor):
def __init__(self, match):
HentaifoundryExtractor.__init__(self, match)
- self.page_url = "{}/user/{}/faves/pictures".format(
- self.root, self.user)
+ self.page_url = f"{self.root}/user/{self.user}/faves/pictures"
class HentaifoundryTagExtractor(HentaifoundryExtractor):
@@ -264,7 +258,7 @@ class HentaifoundryTagExtractor(HentaifoundryExtractor):
def __init__(self, match):
HentaifoundryExtractor.__init__(self, match)
- self.page_url = "{}/pictures/tagged/{}".format(self.root, self.user)
+ self.page_url = f"{self.root}/pictures/tagged/{self.user}"
def metadata(self):
return {"search_tags": self.user}
@@ -280,7 +274,7 @@ class HentaifoundryRecentExtractor(HentaifoundryExtractor):
def __init__(self, match):
HentaifoundryExtractor.__init__(self, match)
- self.page_url = "{}/pictures/recent/{}".format(self.root, self.user)
+ self.page_url = f"{self.root}/pictures/recent/{self.user}"
def metadata(self):
return {"date": self.user}
@@ -310,11 +304,11 @@ class HentaifoundryImageExtractor(HentaifoundryExtractor):
def __init__(self, match):
HentaifoundryExtractor.__init__(self, match)
- self.index = match.group(3)
+ self.index = match[3]
def items(self):
- post_url = "{}/pictures/user/{}/{}/?enterAgree=1".format(
- self.root, self.user, self.index)
+ post_url = (f"{self.root}/pictures/user/{self.user}"
+ f"/{self.index}/?enterAgree=1")
image = self._parse_post(post_url)
image["user"] = self.user
yield Message.Directory, image
@@ -336,7 +330,7 @@ class HentaifoundryStoriesExtractor(HentaifoundryExtractor):
yield Message.Url, story["src"], story
def stories(self):
- url = "{}/stories/user/{}".format(self.root, self.user)
+ url = f"{self.root}/stories/user/{self.user}"
return self._pagination(url, '<div class="storyRow">', '</tr></table>')
@@ -351,11 +345,11 @@ class HentaifoundryStoryExtractor(HentaifoundryExtractor):
def __init__(self, match):
HentaifoundryExtractor.__init__(self, match)
- self.index = match.group(3)
+ self.index = match[3]
def items(self):
- story_url = "{}/stories/user/{}/{}/x?enterAgree=1".format(
- self.root, self.user, self.index)
+ story_url = (f"{self.root}/stories/user/{self.user}"
+ f"/{self.index}/x?enterAgree=1")
story = self._parse_story(self.request(story_url).text)
yield Message.Directory, story
yield Message.Url, story["src"], story
diff --git a/gallery_dl/extractor/hentaihand.py b/gallery_dl/extractor/hentaihand.py
index f3f43c4..f4f9d86 100644
--- a/gallery_dl/extractor/hentaihand.py
+++ b/gallery_dl/extractor/hentaihand.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2020-2023 Mike Fährmann
+# Copyright 2020-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -20,8 +20,8 @@ class HentaihandGalleryExtractor(GalleryExtractor):
example = "https://hentaihand.com/en/comic/TITLE"
def __init__(self, match):
- self.slug = match.group(1)
- url = "{}/api/comics/{}".format(self.root, self.slug)
+ self.slug = match[1]
+ url = f"{self.root}/api/comics/{self.slug}"
GalleryExtractor.__init__(self, match, url)
def metadata(self, page):
@@ -44,7 +44,7 @@ class HentaihandGalleryExtractor(GalleryExtractor):
return data
def images(self, _):
- info = self.request(self.gallery_url + "/images").json()
+ info = self.request_json(self.page_url + "/images")
return [(img["source_url"], img) for img in info["images"]]
@@ -68,8 +68,8 @@ class HentaihandTagExtractor(Extractor):
else:
tpl = self.type + "s"
- url = "{}/api/{}/{}".format(self.root, tpl, self.key)
- tid = self.request(url, notfound=self.type).json()["id"]
+ url = f"{self.root}/api/{tpl}/{self.key}"
+ tid = self.request_json(url, notfound=self.type)["id"]
url = self.root + "/api/comics"
params = {
@@ -82,10 +82,10 @@ class HentaihandTagExtractor(Extractor):
"duration": "day",
}
while True:
- info = self.request(url, params=params).json()
+ info = self.request_json(url, params=params)
for gallery in info["data"]:
- gurl = "{}/en/comic/{}".format(self.root, gallery["slug"])
+ gurl = f"{self.root}/en/comic/{gallery['slug']}"
gallery["_extractor"] = HentaihandGalleryExtractor
yield Message.Queue, gurl, gallery
diff --git a/gallery_dl/extractor/hentaihere.py b/gallery_dl/extractor/hentaihere.py
index ba9558c..b894d77 100644
--- a/gallery_dl/extractor/hentaihere.py
+++ b/gallery_dl/extractor/hentaihere.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2023 Mike Fährmann
+# Copyright 2016-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -10,7 +10,6 @@
from .common import ChapterExtractor, MangaExtractor
from .. import text, util
-import re
class HentaihereBase():
@@ -27,30 +26,30 @@ class HentaihereChapterExtractor(HentaihereBase, ChapterExtractor):
def __init__(self, match):
self.manga_id, self.chapter = match.groups()
- url = "{}/m/S{}/{}/1".format(self.root, self.manga_id, self.chapter)
+ url = f"{self.root}/m/S{self.manga_id}/{self.chapter}/1"
ChapterExtractor.__init__(self, match, url)
def metadata(self, page):
title = text.extr(page, "<title>", "</title>")
chapter_id = text.extr(page, 'report/C', '"')
chapter, sep, minor = self.chapter.partition(".")
- pattern = r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by (.+) at "
- match = re.match(pattern, title)
+ match = util.re(
+ r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by "
+ r"(.+) at ").match(title)
return {
- "manga": match.group(1),
+ "manga": match[1],
"manga_id": text.parse_int(self.manga_id),
"chapter": text.parse_int(chapter),
"chapter_minor": sep + minor,
"chapter_id": text.parse_int(chapter_id),
- "type": match.group(2),
- "title": match.group(3),
- "author": match.group(4),
+ "type": match[2],
+ "title": match[3],
+ "author": match[4],
"lang": "en",
"language": "English",
}
- @staticmethod
- def images(page):
+ def images(self, page):
images = text.extr(page, "var rff_imageList = ", ";")
return [
("https://hentaicdn.com/hentai" + part, None)
@@ -73,7 +72,7 @@ class HentaihereMangaExtractor(HentaihereBase, MangaExtractor):
mtype, pos = text.extract(
page, '<span class="mngType text-danger">[', ']</span>', pos)
manga_id = text.parse_int(
- self.manga_url.rstrip("/").rpartition("/")[2][1:])
+ self.page_url.rstrip("/").rpartition("/")[2][1:])
while True:
marker, pos = text.extract(
diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py
index 286ee38..d3901ac 100644
--- a/gallery_dl/extractor/hentainexus.py
+++ b/gallery_dl/extractor/hentainexus.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2024 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -22,8 +22,8 @@ class HentainexusGalleryExtractor(GalleryExtractor):
example = "https://hentainexus.com/view/12345"
def __init__(self, match):
- self.gallery_id = match.group(1)
- url = "{}/view/{}".format(self.root, self.gallery_id)
+ self.gallery_id = match[1]
+ url = f"{self.root}/view/{self.gallery_id}"
GalleryExtractor.__init__(self, match, url)
def metadata(self, page):
@@ -59,7 +59,7 @@ class HentainexusGalleryExtractor(GalleryExtractor):
return data
def images(self, _):
- url = "{}/read/{}".format(self.root, self.gallery_id)
+ url = f"{self.root}/read/{self.gallery_id}"
page = self.request(url).text
imgs = util.json_loads(self._decode(text.extr(
page, 'initReader("', '"')))
@@ -78,8 +78,7 @@ class HentainexusGalleryExtractor(GalleryExtractor):
pass
return results
- @staticmethod
- def _decode(data):
+ def _decode(self, data):
# https://hentainexus.com/static/js/reader.min.js?r=22
hostname = "hentainexus.com"
primes = (2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53)
@@ -118,8 +117,7 @@ class HentainexusGalleryExtractor(GalleryExtractor):
return result
- @staticmethod
- def _join_title(data):
+ def _join_title(self, data):
event = data['event']
artist = data['artist']
circle = data['circle']
@@ -137,18 +135,18 @@ class HentainexusGalleryExtractor(GalleryExtractor):
jt = ''
if event:
- jt += '({}) '.format(event)
+ jt += f'({event}) '
if circle:
- jt += '[{} ({})] '.format(circle, artist)
+ jt += f'[{circle} ({artist})] '
else:
- jt += '[{}] '.format(artist)
+ jt += f'[{artist}] '
jt += title
if parody.lower() != 'original work':
- jt += ' ({})'.format(parody)
+ jt += f' ({parody})'
if book:
- jt += ' ({})'.format(book)
+ jt += f' ({book})'
if magazine:
- jt += ' ({})'.format(magazine)
+ jt += f' ({magazine})'
return jt
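For reference, _join_title assembles titles in conventional doujinshi order: "(event) [circle (artist)] title (parody) (book) (magazine)". With event "C97", circle "Circle", artist "Artist", title "Title", parody "Some Series", and no book or magazine (illustrative values), the joined result is:

    (C97) [Circle (Artist)] Title (Some Series)

An entry without a circle starts with "[Artist]" instead, and a parody of "Original Work" is omitted.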
diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py
index f15aab7..a75eee0 100644
--- a/gallery_dl/extractor/hiperdex.py
+++ b/gallery_dl/extractor/hiperdex.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2020-2023 Mike Fährmann
+# Copyright 2020-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,9 +9,8 @@
"""Extractors for https://hiperdex.com/"""
from .common import ChapterExtractor, MangaExtractor
-from .. import text
+from .. import text, util
from ..cache import memcache
-import re
BASE_PATTERN = (r"((?:https?://)?(?:www\.)?"
r"(?:1st)?hiper(?:dex|toon)\d?\.(?:com|net|info|top))")
@@ -25,7 +24,7 @@ class HiperdexBase():
@memcache(keyarg=1)
def manga_data(self, manga, page=None):
if not page:
- url = "{}/manga/{}/".format(self.root, manga)
+ url = f"{self.root}/manga/{manga}/"
page = self.request(url).text
extr = text.extract_from(page)
@@ -80,10 +79,10 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):
return self.chapter_data(self.chapter)
def images(self, page):
+ pattern = util.re(r'id="image-\d+"\s+(?:data-)?src="([^"]+)')
return [
(url.strip(), None)
- for url in re.findall(
- r'id="image-\d+"\s+(?:data-)?src="([^"]+)', page)
+ for url in pattern.findall(page)
]
@@ -100,14 +99,14 @@ class HiperdexMangaExtractor(HiperdexBase, MangaExtractor):
def chapters(self, page):
data = self.manga_data(self.manga, page)
- self.manga_url = url = data["url"]
+ self.page_url = url = data["url"]
- url = self.manga_url + "ajax/chapters/"
+ url = self.page_url + "ajax/chapters/"
headers = {
"Accept": "*/*",
"X-Requested-With": "XMLHttpRequest",
"Origin": self.root,
- "Referer": "https://" + text.quote(self.manga_url[8:]),
+ "Referer": "https://" + text.quote(self.page_url[8:]),
}
html = self.request(url, method="POST", headers=headers).text
@@ -130,8 +129,8 @@ class HiperdexArtistExtractor(HiperdexBase, MangaExtractor):
example = "https://hiperdex.com/manga-artist/NAME/"
def __init__(self, match):
- self.root = text.ensure_http_scheme(match.group(1))
- MangaExtractor.__init__(self, match, self.root + match.group(2) + "/")
+ self.root = text.ensure_http_scheme(match[1])
+ MangaExtractor.__init__(self, match, self.root + match[2] + "/")
def chapters(self, page):
results = []
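Note: the `import re` removals above pair with the new `util.re(...)` calls. A plausible sketch, assuming util.re is a memoizing wrapper around re.compile so patterns built inside methods are compiled only once (the actual helper, like the util.re_compile used further below, lives in gallery_dl/util.py):

    import functools
    import re as re_module

    @functools.lru_cache(maxsize=None)
    def re(pattern):
        # compile on first use; later calls return the cached object
        return re_module.compile(pattern)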
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index 086b77c..82bed80 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2023 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -13,7 +13,6 @@ from .nozomi import decode_nozomi
from ..cache import memcache
from .. import text, util
import string
-import re
class HitomiExtractor(Extractor):
@@ -22,6 +21,27 @@ class HitomiExtractor(Extractor):
root = "https://hitomi.la"
domain = "gold-usergeneratedcontent.net"
+ def load_nozomi(self, query, language="all", headers=None):
+ ns, _, tag = query.strip().partition(":")
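+ # "female:..."/"male:..." tags are served from the tag/ area,
+ # while "language:..." selects that language's index file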
+
+ if ns == "female" or ns == "male":
+ ns = "tag/"
+ tag = query
+ elif ns == "language":
+ ns = ""
+ language = tag
+ tag = "index"
+ else:
+ ns = f"{ns}/"
+
+ url = (f"https://ltn.{self.domain}/n/{ns}"
+ f"{tag.replace('_', ' ')}-{language}.nozomi")
+ if headers is None:
+ headers = {}
+ headers["Origin"] = self.root
+ headers["Referer"] = f"{self.root}/"
+ return decode_nozomi(self.request(url, headers=headers).content)
+
class HitomiGalleryExtractor(HitomiExtractor, GalleryExtractor):
"""Extractor for hitomi.la galleries"""
@@ -33,23 +53,19 @@ class HitomiGalleryExtractor(HitomiExtractor, GalleryExtractor):
def __init__(self, match):
GalleryExtractor.__init__(self, match, False)
self.gid = gid = self.groups[0]
- self.gallery_url = "https://ltn.{}/galleries/{}.js".format(
- self.domain, gid)
+ self.page_url = f"https://ltn.{self.domain}/galleries/{gid}.js"
def _init(self):
- self.session.headers["Referer"] = "{}/reader/{}.html".format(
- self.root, self.gid)
+ self.session.headers["Referer"] = f"{self.root}/reader/{self.gid}.html"
def metadata(self, page):
self.info = info = util.json_loads(page.partition("=")[2])
iget = info.get
- language = iget("language")
- if language:
+ if language := iget("language"):
language = language.capitalize()
- date = iget("date")
- if date:
+ if date := iget("date"):
date += ":00"
tags = []
@@ -83,7 +99,7 @@ class HitomiGalleryExtractor(HitomiExtractor, GalleryExtractor):
fmt = ext = self.config("format") or "webp"
check = (fmt != "webp")
- result = []
+ results = []
for image in self.info["files"]:
if check:
ext = fmt if image.get("has" + fmt) else "webp"
@@ -94,12 +110,10 @@ class HitomiGalleryExtractor(HitomiExtractor, GalleryExtractor):
# https://ltn.gold-usergeneratedcontent.net/common.js
inum = int(ihash[-1] + ihash[-3:-1], 16)
- url = "https://{}{}.{}/{}/{}/{}.{}".format(
- ext[0], gg_m.get(inum, gg_default) + 1, self.domain,
- gg_b, inum, ihash, ext,
- )
- result.append((url, idata))
- return result
+ url = (f"https://{ext[0]}{gg_m.get(inum, gg_default) + 1}."
+ f"{self.domain}/{gg_b}/{inum}/{ihash}.{ext}")
+ results.append((url, idata))
+ return results
class HitomiTagExtractor(HitomiExtractor):
@@ -123,8 +137,7 @@ class HitomiTagExtractor(HitomiExtractor):
"_extractor": HitomiGalleryExtractor,
"search_tags": text.unquote(self.tag.rpartition("-")[0]),
}
- nozomi_url = "https://ltn.{}/{}/{}.nozomi".format(
- self.domain, self.type, self.tag)
+ nozomi_url = f"https://ltn.{self.domain}/{self.type}/{self.tag}.nozomi"
headers = {
"Origin": self.root,
"Cache-Control": "max-age=0",
@@ -133,14 +146,13 @@ class HitomiTagExtractor(HitomiExtractor):
offset = 0
total = None
while True:
- headers["Referer"] = "{}/{}/{}.html?page={}".format(
- self.root, self.type, self.tag, offset // 100 + 1)
- headers["Range"] = "bytes={}-{}".format(offset, offset+99)
+ headers["Referer"] = (f"{self.root}/{self.type}/{self.tag}.html"
+ f"?page={offset // 100 + 1}")
+ headers["Range"] = f"bytes={offset}-{offset + 99}"
response = self.request(nozomi_url, headers=headers)
for gallery_id in decode_nozomi(response.content):
- gallery_url = "{}/galleries/{}.html".format(
- self.root, gallery_id)
+ gallery_url = f"{self.root}/galleries/{gallery_id}.html"
yield Message.Queue, gallery_url, data
offset += 100
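Each .nozomi index is, by all appearances, a flat array of 4-byte big-endian gallery IDs, which is why pagination advances the Range header in 100-byte steps (25 IDs per request). For context, a sketch of decode_nozomi from gallery_dl/extractor/nozomi.py under that assumption:

    def decode_nozomi(data):
        # unpack consecutive 4-byte big-endian integers
        for i in range(0, len(data), 4):
            yield ((data[i] << 24) | (data[i+1] << 16) |
                   (data[i+2] << 8) | data[i+3])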
@@ -163,8 +175,8 @@ class HitomiIndexExtractor(HitomiTagExtractor):
def items(self):
data = {"_extractor": HitomiGalleryExtractor}
- nozomi_url = "https://ltn.{}/{}-{}.nozomi".format(
- self.domain, self.tag, self.language)
+ nozomi_url = (f"https://ltn.{self.domain}"
+ f"/{self.tag}-{self.language}.nozomi")
headers = {
"Origin": self.root,
"Cache-Control": "max-age=0",
@@ -173,14 +185,13 @@ class HitomiIndexExtractor(HitomiTagExtractor):
offset = 0
total = None
while True:
- headers["Referer"] = "{}/{}-{}.html?page={}".format(
- self.root, self.tag, self.language, offset // 100 + 1)
- headers["Range"] = "bytes={}-{}".format(offset, offset+99)
+ headers["Referer"] = (f"{self.root}/{self.tag}-{self.language}"
+ f".html?page={offset // 100 + 1}")
+ headers["Range"] = f"bytes={offset}-{offset + 99}"
response = self.request(nozomi_url, headers=headers)
for gallery_id in decode_nozomi(response.content):
- gallery_url = "{}/galleries/{}.html".format(
- self.root, gallery_id)
+ gallery_url = f"{self.root}/galleries/{gallery_id}.html"
yield Message.Queue, gallery_url, data
offset += 100
@@ -194,60 +205,46 @@ class HitomiIndexExtractor(HitomiTagExtractor):
class HitomiSearchExtractor(HitomiExtractor):
"""Extractor for galleries from multiple tag searches on hitomi.la"""
subcategory = "search"
- pattern = r"(?:https?://)?hitomi\.la/search\.html\?([^/?#]+)"
+ pattern = r"(?:https?://)?hitomi\.la/search\.html\?([^#]+)"
example = "https://hitomi.la/search.html?QUERY"
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.query = match.group(1)
- self.tags = text.unquote(self.query)
-
def items(self):
+ tags = text.unquote(self.groups[0])
+
data = {
"_extractor": HitomiGalleryExtractor,
- "search_tags": self.tags,
+ "search_tags": tags,
}
- results = [self.get_nozomi_items(tag) for tag in self.tags.split(" ")]
- intersects = set.intersection(*results)
- for gallery_id in sorted(intersects, reverse=True):
- gallery_url = "{}/galleries/{}.html".format(
- self.root, gallery_id)
+ for gallery_id in self.gallery_ids(tags):
+ gallery_url = f"{self.root}/galleries/{gallery_id}.html"
yield Message.Queue, gallery_url, data
- def get_nozomi_items(self, full_tag):
- area, tag, language = self.get_nozomi_args(full_tag)
+ def gallery_ids(self, tags):
+ result = None
+ positive = []
+ negative = []
- if area:
- nozomi_url = "https://ltn.{}/n/{}/{}-{}.nozomi".format(
- self.domain, area, tag, language)
- else:
- nozomi_url = "https://ltn.{}/n/{}-{}.nozomi".format(
- self.domain, tag, language)
+ for tag in tags.split():
+ if tag[0] == "-":
+ negative.append(tag[1:])
+ else:
+ positive.append(tag)
- headers = {
- "Origin": self.root,
- "Cache-Control": "max-age=0",
- "Referer": "{}/search.html?{}".format(self.root, self.query),
- }
-
- response = self.request(nozomi_url, headers=headers)
- return set(decode_nozomi(response.content))
+ for tag in positive:
+ ids = self.load_nozomi(tag)
+ if result is None:
+ result = set(ids)
+ else:
+ result.intersection_update(ids)
- def get_nozomi_args(self, query):
- ns, _, tag = query.strip().partition(":")
- area = ns
- language = "all"
-
- if ns == "female" or ns == "male":
- area = "tag"
- tag = query
- elif ns == "language":
- area = None
- language = tag
- tag = "index"
+ if result is None:
+ # result = set(self.load_nozomi("index"))
+ result = set(self.load_nozomi("language:all"))
+ for tag in negative:
+ result.difference_update(self.load_nozomi(tag))
- return area, tag.replace("_", " "), language
+ return sorted(result, reverse=True) if result else ()
@memcache(maxage=1800)
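The rewritten search resolves multi-tag queries with set algebra over the per-tag ID lists: positive tags are intersected, tags prefixed with "-" are subtracted, and a purely negative query falls back to the full language:all index. For a query like "language:english female:sample -male:sample" (illustrative tag names), the effect is roughly:

    result  = set(self.load_nozomi("language:english"))
    result &= set(self.load_nozomi("female:sample"))  # intersect positives
    result -= set(self.load_nozomi("male:sample"))    # subtract negatives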
@@ -257,8 +254,8 @@ def _parse_gg(extr):
m = {}
keys = []
- for match in re.finditer(
- r"case\s+(\d+):(?:\s*o\s*=\s*(\d+))?", page):
+ for match in util.re_compile(
+ r"case\s+(\d+):(?:\s*o\s*=\s*(\d+))?").finditer(page):
key, value = match.groups()
keys.append(int(key))
@@ -268,11 +265,11 @@ def _parse_gg(extr):
m[key] = value
keys.clear()
- for match in re.finditer(
- r"if\s+\(g\s*===?\s*(\d+)\)[\s{]*o\s*=\s*(\d+)", page):
- m[int(match.group(1))] = int(match.group(2))
+ for match in util.re_compile(
+ r"if\s+\(g\s*===?\s*(\d+)\)[\s{]*o\s*=\s*(\d+)").finditer(page):
+ m[int(match[1])] = int(match[2])
- d = re.search(r"(?:var\s|default:)\s*o\s*=\s*(\d+)", page)
- b = re.search(r"b:\s*[\"'](.+)[\"']", page)
+ d = util.re_compile(r"(?:var\s|default:)\s*o\s*=\s*(\d+)").search(page)
+ b = util.re_compile(r"b:\s*[\"'](.+)[\"']").search(page)
- return m, b.group(1).strip("/"), int(d.group(1)) if d else 0
+ return m, b[1].strip("/"), int(d[1]) if d else 0
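_parse_gg recovers the subdomain map (m), the path prefix (b), and the default offset from hitomi's gg JavaScript. An illustrative input/output pair, assuming gg.js keeps the switch-statement shape these regexes target (not a verbatim copy of the script):

    page = """
    var gg = {
        m: function(g) {
            var o = 0;
            switch (g) {
                case 1166:
                case 2924: o = 1; break;
            }
            return o;
        },
        b: '1745678901/'
    };
    """
    # the patterns above would yield:
    #   m == {1166: 1, 2924: 1}, b == "1745678901", default == 0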
diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py
index ddfc54b..587d88c 100644
--- a/gallery_dl/extractor/hotleak.py
+++ b/gallery_dl/extractor/hotleak.py
@@ -70,8 +70,7 @@ class HotleakPostExtractor(HotleakExtractor):
self.creator, self.type, self.id = match.groups()
def posts(self):
- url = "{}/{}/{}/{}".format(
- self.root, self.creator, self.type, self.id)
+ url = f"{self.root}/{self.creator}/{self.type}/{self.id}"
page = self.request(url).text
page = text.extr(
page, '<div class="movie-image thumb">', '</article>')
@@ -103,10 +102,10 @@ class HotleakCreatorExtractor(HotleakExtractor):
def __init__(self, match):
HotleakExtractor.__init__(self, match)
- self.creator = match.group(1)
+ self.creator = match[1]
def posts(self):
- url = "{}/{}".format(self.root, self.creator)
+ url = f"{self.root}/{self.creator}"
return self._pagination(url)
def _pagination(self, url):
@@ -159,7 +158,7 @@ class HotleakCategoryExtractor(HotleakExtractor):
self._category, self.params = match.groups()
def items(self):
- url = "{}/{}".format(self.root, self._category)
+ url = f"{self.root}/{self._category}"
if self._category in ("hot", "creators"):
data = {"_extractor": HotleakCreatorExtractor}
@@ -178,7 +177,7 @@ class HotleakSearchExtractor(HotleakExtractor):
def __init__(self, match):
HotleakExtractor.__init__(self, match)
- self.params = match.group(1)
+ self.params = match[1]
def items(self):
data = {"_extractor": HotleakCreatorExtractor}
diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py
index 8f4a10c..075e1f6 100644
--- a/gallery_dl/extractor/idolcomplex.py
+++ b/gallery_dl/extractor/idolcomplex.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2023 Mike Fährmann
+# Copyright 2018-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -108,8 +108,7 @@ class IdolcomplexExtractor(SankakuExtractor):
pid = extr(">Post ID:", "<")
created = extr(' title="', '"')
- file_url = extr('>Original:', 'id=')
- if file_url:
+ if file_url := extr('>Original:', 'id='):
file_url = extr(' href="', '"')
width = extr(">", "x")
height = extr("", " ")
@@ -159,7 +158,7 @@ class IdolcomplexTagExtractor(IdolcomplexExtractor):
def __init__(self, match):
IdolcomplexExtractor.__init__(self, match)
- query = text.parse_query(match.group(1))
+ query = text.parse_query(match[1])
self.tags = text.unquote(query.get("tags", "").replace("+", " "))
self.start_page = text.parse_int(query.get("page"), 1)
self.next = text.parse_int(query.get("next"), 0)
@@ -184,7 +183,7 @@ class IdolcomplexTagExtractor(IdolcomplexExtractor):
tags = self.tags.split()
if not self.logged_in and len(tags) > 4:
- raise exception.StopExtraction(
+ raise exception.AbortExtraction(
"Non-members can only search up to 4 tags at once")
return {"search_tags": " ".join(tags)}
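Note: exception.StopExtraction is renamed to exception.AbortExtraction throughout this patch, with messages now pre-formatted as f-strings instead of lazy %-style logger arguments. A minimal sketch of what such an exception might look like (an assumption for illustration; the real class hierarchy is in gallery_dl/exception.py):

    class AbortExtraction(Exception):
        # raised to stop the current extractor run with an error message
        def __init__(self, message=None):
            Exception.__init__(self, message)
            self.message = message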
diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py
index 68360e9..171feea 100644
--- a/gallery_dl/extractor/imagebam.py
+++ b/gallery_dl/extractor/imagebam.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2023 Mike Fährmann
+# Copyright 2014-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,8 +9,7 @@
"""Extractors for https://www.imagebam.com/"""
from .common import Extractor, Message
-from .. import text
-import re
+from .. import text, util
class ImagebamExtractor(Extractor):
@@ -20,7 +19,7 @@ class ImagebamExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.path = match.group(1)
+ self.path = match[1]
def _init(self):
self.cookies.set("nsfw_inter", "1", domain="www.imagebam.com")
@@ -64,22 +63,19 @@ class ImagebamGalleryExtractor(ImagebamExtractor):
image.update(data)
yield Message.Url, image["url"], image
- @staticmethod
- def metadata(page):
+ def metadata(self, page):
return {"title": text.unescape(text.extr(
page, 'id="gallery-name">', '<').strip())}
def images(self, page):
- findall = re.compile(r'<a href="https://www\.imagebam\.com'
- r'(/(?:image/|view/M)[a-zA-Z0-9]+)').findall
-
+ findall = util.re(r'<a href="https://www\.imagebam\.com'
+ r'(/(?:image/|view/M)[a-zA-Z0-9]+)').findall
paths = []
while True:
paths += findall(page)
pos = page.find('rel="next" aria-label="Next')
if pos > 0:
- url = text.rextract(page, 'href="', '"', pos)[0]
- if url:
+ if url := text.rextr(page, 'href="', '"', pos):
page = self.request(url).text
continue
return paths
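Note: `text.rextract(...)[0]` call sites become `text.rextr(...)`. A sketch of the pair, assuming rextr is simply the value-only variant of rextract, which searches backwards from pos (the real helpers live in gallery_dl/text.py):

    def rextract(txt, begin, end, pos=-1):
        # like extract(), but locate the last `begin` before `pos`
        try:
            lbeg = len(begin)
            first = txt.rindex(begin, 0, pos)
            last = txt.index(end, first + lbeg)
            return txt[first + lbeg:last], first
        except (ValueError, TypeError):
            return None, pos

    def rextr(txt, begin, end, pos=-1, default=""):
        # value-only convenience wrapper
        return rextract(txt, begin, end, pos)[0] or default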
diff --git a/gallery_dl/extractor/imagechest.py b/gallery_dl/extractor/imagechest.py
index 159feba..464e489 100644
--- a/gallery_dl/extractor/imagechest.py
+++ b/gallery_dl/extractor/imagechest.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright 2020 Leonid "Bepis" Pavel
-# Copyright 2023 Mike Fährmann
+# Copyright 2023-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -23,15 +23,14 @@ class ImagechestGalleryExtractor(GalleryExtractor):
example = "https://imgchest.com/p/abcdefghijk"
def __init__(self, match):
- self.gallery_id = match.group(1)
+ self.gallery_id = match[1]
url = self.root + "/p/" + self.gallery_id
GalleryExtractor.__init__(self, match, url)
def _init(self):
- access_token = self.config("access-token")
- if access_token:
+ if access_token := self.config("access-token"):
self.api = ImagechestAPI(self, access_token)
- self.gallery_url = None
+ self.page_url = None
self.metadata = self._metadata_api
def metadata(self, page):
@@ -97,7 +96,7 @@ class ImagechestUserExtractor(Extractor):
while True:
try:
- data = self.request(url, params=params).json()["data"]
+ data = self.request_json(url, params=params)["data"]
except (TypeError, KeyError):
return
@@ -152,4 +151,4 @@ class ImagechestAPI():
else:
self.extractor.log.debug(response.text)
- raise exception.StopExtraction("API request failed")
+ raise exception.AbortExtraction("API request failed")
diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py
index dd5220d..993af7c 100644
--- a/gallery_dl/extractor/imagefap.py
+++ b/gallery_dl/extractor/imagefap.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2023 Mike Fährmann
+# Copyright 2016-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -28,10 +28,9 @@ class ImagefapExtractor(Extractor):
response = Extractor.request(self, url, **kwargs)
if response.history and response.url.endswith("/human-verification"):
- msg = text.extr(response.text, '<div class="mt-4', '<')
- if msg:
+ if msg := text.extr(response.text, '<div class="mt-4', '<'):
msg = " ".join(msg.partition(">")[2].split())
- raise exception.StopExtraction("'%s'", msg)
+ raise exception.AbortExtraction(f"'{msg}'")
self.log.warning("HTTP redirect to %s", response.url)
return response
@@ -45,11 +44,11 @@ class ImagefapGalleryExtractor(ImagefapExtractor):
def __init__(self, match):
ImagefapExtractor.__init__(self, match)
- self.gid = match.group(1)
+ self.gid = match[1]
self.image_id = ""
def items(self):
- url = "{}/gallery/{}".format(self.root, self.gid)
+ url = f"{self.root}/gallery/{self.gid}"
page = self.request(url).text
data = self.get_job_metadata(page)
yield Message.Directory, data
@@ -81,12 +80,12 @@ class ImagefapGalleryExtractor(ImagefapExtractor):
def get_images(self):
"""Collect image-urls and -metadata"""
- url = "{}/photo/{}/".format(self.root, self.image_id)
+ url = f"{self.root}/photo/{self.image_id}/"
params = {"gid": self.gid, "idx": 0, "partial": "true"}
headers = {
"Content-Type": "application/x-www-form-urlencoded",
"X-Requested-With": "XMLHttpRequest",
- "Referer": "{}?pgid=&gid={}&page=0".format(url, self.image_id)
+ "Referer": f"{url}?pgid=&gid={self.image_id}&page=0"
}
num = 0
@@ -116,7 +115,7 @@ class ImagefapImageExtractor(ImagefapExtractor):
def __init__(self, match):
ImagefapExtractor.__init__(self, match)
- self.image_id = match.group(1)
+ self.image_id = match[1]
def items(self):
url, data = self.get_image()
@@ -124,7 +123,7 @@ class ImagefapImageExtractor(ImagefapExtractor):
yield Message.Url, url, data
def get_image(self):
- url = "{}/photo/{}/".format(self.root, self.image_id)
+ url = f"{self.root}/photo/{self.image_id}/"
page = self.request(url).text
url, pos = text.extract(
@@ -161,7 +160,7 @@ class ImagefapFolderExtractor(ImagefapExtractor):
def items(self):
for gallery_id, name, folder in self.galleries(self.folder_id):
- url = "{}/gallery/{}".format(self.root, gallery_id)
+ url = f"{self.root}/gallery/{gallery_id}"
data = {
"gallery_id": gallery_id,
"title" : text.unescape(name),
@@ -175,14 +174,13 @@ class ImagefapFolderExtractor(ImagefapExtractor):
if folder_id == "-1":
folder_name = "Uncategorized"
if self._id:
- url = "{}/usergallery.php?userid={}&folderid=-1".format(
- self.root, self.user)
+ url = (f"{self.root}/usergallery.php"
+ f"?userid={self.user}&folderid=-1")
else:
- url = "{}/profile/{}/galleries?folderid=-1".format(
- self.root, self.user)
+ url = f"{self.root}/profile/{self.user}/galleries?folderid=-1"
else:
folder_name = None
- url = "{}/organizer/{}/".format(self.root, folder_id)
+ url = f"{self.root}/organizer/{folder_id}/"
params = {"page": 0}
extr = text.extract_from(self.request(url, params=params).text)
@@ -222,19 +220,17 @@ class ImagefapUserExtractor(ImagefapExtractor):
for folder_id in self.folders():
if folder_id == "-1":
- url = "{}/profile/{}/galleries?folderid=-1".format(
- self.root, self.user)
+ url = f"{self.root}/profile/{self.user}/galleries?folderid=-1"
else:
- url = "{}/organizer/{}/".format(self.root, folder_id)
+ url = f"{self.root}/organizer/{folder_id}/"
yield Message.Queue, url, data
def folders(self):
"""Return a list of folder IDs of a user"""
if self.user:
- url = "{}/profile/{}/galleries".format(self.root, self.user)
+ url = f"{self.root}/profile/{self.user}/galleries"
else:
- url = "{}/usergallery.php?userid={}".format(
- self.root, self.user_id)
+ url = f"{self.root}/usergallery.php?userid={self.user_id}"
response = self.request(url)
self.user = response.url.split("/")[-2]
diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py
index d6b36cb..0e5ce7e 100644
--- a/gallery_dl/extractor/imagehosts.py
+++ b/gallery_dl/extractor/imagehosts.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2023 Mike Fährmann
+# Copyright 2016-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -23,12 +23,12 @@ class ImagehostImageExtractor(Extractor):
_params = None
_cookies = None
_encoding = None
+ _validate = None
def __init__(self, match):
Extractor.__init__(self, match)
- self.page_url = "http{}://{}".format(
- "s" if self._https else "", match.group(1))
- self.token = match.group(2)
+ self.page_url = f"http{'s' if self._https else ''}://{match[1]}"
+ self.token = match[2]
if self._params == "simple":
self._params = {
@@ -57,6 +57,8 @@ class ImagehostImageExtractor(Extractor):
data.update(self.metadata(page))
if self._https and url.startswith("http:"):
url = "https:" + url[5:]
+ if self._validate is not None:
+ data["_http_validate"] = self._validate
yield Message.Directory, data
yield Message.Url, url, data
@@ -164,6 +166,14 @@ class ImagevenueImageExtractor(ImagehostImageExtractor):
filename, pos = text.extract(page, 'alt="', '"', pos)
return url, text.unescape(filename)
+ def _validate(self, response):
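+ # reject what appears to be ImageVenue's static "image not
+ # found" placeholder, identified by its constant size, type,
+ # and mtime headers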
+ hget = response.headers.get
+ return not (
+ hget("content-length") == "14396" and
+ hget("content-type") == "image/jpeg" and
+ hget("last-modified") == "Mon, 04 May 2020 07:19:52 GMT"
+ )
+
class ImagetwistImageExtractor(ImagehostImageExtractor):
"""Extractor for single images from imagetwist.com"""
@@ -200,6 +210,26 @@ class ImagetwistGalleryExtractor(ImagehostImageExtractor):
yield Message.Queue, root + path, data
+class ImgadultImageExtractor(ImagehostImageExtractor):
+ """Extractor for single images from imgadult.com"""
+ category = "imgadult"
+ _cookies = {"img_i_d": "1"}
+ pattern = r"(?:https?://)?((?:www\.)?imgadult\.com/img-([0-9a-f]+)\.html)"
+ example = "https://imgadult.com/img-0123456789abc.html"
+
+ def get_info(self, page):
+ url , pos = text.extract(page, "' src='", "'")
+ name, pos = text.extract(page, "alt='", "'", pos)
+
+ if name:
+ name, _, rhs = name.rpartition(" image hosted at ImgAdult.com")
+ if not name:
+ name = rhs
+ name = text.unescape(name)
+
+ return url, name
+
+
class ImgspiceImageExtractor(ImagehostImageExtractor):
"""Extractor for single images from imgspice.com"""
category = "imgspice"
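The _validate hooks above are consumed through the "_http_validate" metadata key. Roughly, under the assumption that the HTTP downloader calls the hook with the final response and discards the file on a falsy result (see gallery_dl/downloader/http.py for the real logic):

    def check_response(kwdict, response, log):
        # run the extractor-supplied validation hook, if any
        validate = kwdict.get("_http_validate")
        if validate is not None and not validate(response):
            log.warning("Invalid response")
            return False
        return True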
diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py
index b926cb2..e6abdeb 100644
--- a/gallery_dl/extractor/imgbb.py
+++ b/gallery_dl/extractor/imgbb.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -112,7 +112,7 @@ class ImgbbExtractor(Extractor):
params["page"] += 1
elif not seek or 'class="pagination-next"' not in page:
return
- data = self.request(endpoint, method="POST", data=params).json()
+ data = self.request_json(endpoint, method="POST", data=params)
page = data["html"]
@@ -126,8 +126,8 @@ class ImgbbAlbumExtractor(ImgbbExtractor):
def __init__(self, match):
ImgbbExtractor.__init__(self, match)
self.album_name = None
- self.album_id = match.group(1)
- self.sort = text.parse_query(match.group(2)).get("sort", "date_desc")
+ self.album_id = match[1]
+ self.sort = text.parse_query(match[2]).get("sort", "date_desc")
self.page_url = "https://ibb.co/album/" + self.album_id
def metadata(self, page):
@@ -162,9 +162,9 @@ class ImgbbUserExtractor(ImgbbExtractor):
def __init__(self, match):
ImgbbExtractor.__init__(self, match)
- self.user = match.group(1)
- self.sort = text.parse_query(match.group(2)).get("sort", "date_desc")
- self.page_url = "https://{}.imgbb.com/".format(self.user)
+ self.user = match[1]
+ self.sort = text.parse_query(match[2]).get("sort", "date_desc")
+ self.page_url = f"https://{self.user}.imgbb.com/"
def metadata(self, page):
user = self._extract_user(page)
@@ -191,7 +191,7 @@ class ImgbbImageExtractor(ImgbbExtractor):
def __init__(self, match):
ImgbbExtractor.__init__(self, match)
- self.image_id = match.group(1)
+ self.image_id = match[1]
def items(self):
url = "https://ibb.co/" + self.image_id
diff --git a/gallery_dl/extractor/imgbox.py b/gallery_dl/extractor/imgbox.py
index 7069717..5def88d 100644
--- a/gallery_dl/extractor/imgbox.py
+++ b/gallery_dl/extractor/imgbox.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2023 Mike Fährmann
+# Copyright 2014-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,8 +9,7 @@
"""Extractors for https://imgbox.com/"""
from .common import Extractor, Message, AsynchronousMixin
-from .. import text, exception
-import re
+from .. import text, util, exception
class ImgboxExtractor(Extractor):
@@ -31,18 +30,15 @@ class ImgboxExtractor(Extractor):
text.nameext_from_url(imgdata["filename"], imgdata)
yield Message.Url, self.get_image_url(imgpage), imgdata
- @staticmethod
- def get_job_metadata():
+ def get_job_metadata(self):
"""Collect metadata for extractor-job"""
return {}
- @staticmethod
- def get_image_keys():
+ def get_image_keys(self):
"""Return an iterable containing all image-keys"""
return []
- @staticmethod
- def get_image_metadata(page):
+ def get_image_metadata(self, page):
"""Collect metadata for a downloadable file"""
return text.extract_all(page, (
("num" , '</a> &nbsp; ', ' of '),
@@ -50,8 +46,7 @@ class ImgboxExtractor(Extractor):
("filename" , ' title="', '"'),
))[0]
- @staticmethod
- def get_image_url(page):
+ def get_image_url(self, page):
"""Extract download-url"""
return text.extr(page, 'property="og:image" content="', '"')
@@ -67,14 +62,15 @@ class ImgboxGalleryExtractor(AsynchronousMixin, ImgboxExtractor):
def __init__(self, match):
ImgboxExtractor.__init__(self, match)
- self.gallery_key = match.group(1)
+ self.gallery_key = match[1]
self.image_keys = []
def get_job_metadata(self):
page = self.request(self.root + "/g/" + self.gallery_key).text
if "The specified gallery could not be found." in page:
raise exception.NotFoundError("gallery")
- self.image_keys = re.findall(r'<a href="/([^"]+)"><img alt="', page)
+ self.image_keys = util.re(
+ r'<a href="/([^"]+)"><img alt="').findall(page)
title = text.extr(page, "<h1>", "</h1>")
title, _, count = title.rpartition(" - ")
@@ -97,14 +93,13 @@ class ImgboxImageExtractor(ImgboxExtractor):
def __init__(self, match):
ImgboxExtractor.__init__(self, match)
- self.image_key = match.group(1)
+ self.image_key = match[1]
def get_image_keys(self):
return (self.image_key,)
- @staticmethod
- def get_image_metadata(page):
- data = ImgboxExtractor.get_image_metadata(page)
+ def get_image_metadata(self, page):
+ data = ImgboxExtractor.get_image_metadata(self, page)
if not data["filename"]:
raise exception.NotFoundError("image")
return data
diff --git a/gallery_dl/extractor/imgth.py b/gallery_dl/extractor/imgth.py
index 3aa7922..7e5e6cf 100644
--- a/gallery_dl/extractor/imgth.py
+++ b/gallery_dl/extractor/imgth.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2023 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -20,8 +20,8 @@ class ImgthGalleryExtractor(GalleryExtractor):
example = "https://imgth.com/gallery/123/TITLE"
def __init__(self, match):
- self.gallery_id = gid = match.group(1)
- url = "{}/gallery/{}/g/".format(self.root, gid)
+ self.gallery_id = gid = match[1]
+ url = f"{self.root}/gallery/{gid}/g/"
GalleryExtractor.__init__(self, match, url)
def metadata(self, page):
@@ -45,12 +45,11 @@ class ImgthGalleryExtractor(GalleryExtractor):
thumbs = text.extr(page, '<ul class="thumbnails">', '</ul>')
for url in text.extract_iter(thumbs, '<img src="', '"'):
path = url.partition("/thumbs/")[2]
- yield ("{}/images/{}".format(self.root, path), None)
+ yield (f"{self.root}/images/{path}", None)
if '<li class="next">' not in page:
return
pnum += 1
- url = "{}/gallery/{}/g/page/{}".format(
- self.root, self.gallery_id, pnum)
+ url = f"{self.root}/gallery/{self.gallery_id}/g/page/{pnum}"
page = self.request(url).text
diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py
index 20f8ea4..1ac76e0 100644
--- a/gallery_dl/extractor/imgur.py
+++ b/gallery_dl/extractor/imgur.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2023 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -21,7 +21,7 @@ class ImgurExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.key = match.group(1)
+ self.key = match[1]
def _init(self):
self.api = ImgurAPI(self)
@@ -36,8 +36,8 @@ class ImgurExtractor(Extractor):
elif image["is_animated"] and self.mp4 and image["ext"] == "gif":
image["ext"] = "mp4"
- image["url"] = url = "https://i.imgur.com/{}.{}".format(
- image["id"], image["ext"])
+ image["url"] = url = \
+ f"https://i.imgur.com/{image['id']}.{image['ext']}"
image["date"] = text.parse_datetime(image["created_at"])
image["_http_validate"] = self._validate
text.nameext_from_url(url, image)
@@ -131,10 +131,10 @@ class ImgurGalleryExtractor(ImgurExtractor):
def items(self):
if self.api.gallery(self.key)["is_album"]:
- url = "{}/a/{}".format(self.root, self.key)
+ url = f"{self.root}/a/{self.key}"
extr = ImgurAlbumExtractor
else:
- url = "{}/{}".format(self.root, self.key)
+ url = f"{self.root}/{self.key}"
extr = ImgurImageExtractor
yield Message.Queue, url, {"_extractor": extr}
@@ -168,7 +168,7 @@ class ImgurFavoriteFolderExtractor(ImgurExtractor):
def __init__(self, match):
ImgurExtractor.__init__(self, match)
- self.folder_id = match.group(2)
+ self.folder_id = match[2]
def items(self):
return self._items_queue(self.api.account_favorites_folder(
@@ -234,16 +234,15 @@ class ImgurAPI():
self.headers = {"Authorization": "Client-ID " + self.client_id}
def account_submissions(self, account):
- endpoint = "/3/account/{}/submissions".format(account)
+ endpoint = f"/3/account/{account}/submissions"
return self._pagination(endpoint)
def account_favorites(self, account):
- endpoint = "/3/account/{}/gallery_favorites".format(account)
+ endpoint = f"/3/account/{account}/gallery_favorites"
return self._pagination(endpoint)
def account_favorites_folder(self, account, folder_id):
- endpoint = "/3/account/{}/folders/{}/favorites".format(
- account, folder_id)
+ endpoint = f"/3/account/{account}/folders/{folder_id}/favorites"
return self._pagination_v2(endpoint)
def accounts_me_allposts(self):
@@ -270,11 +269,11 @@ class ImgurAPI():
return self._pagination(endpoint, params)
def gallery_subreddit(self, subreddit):
- endpoint = "/3/gallery/r/{}".format(subreddit)
+ endpoint = f"/3/gallery/r/{subreddit}"
return self._pagination(endpoint)
def gallery_tag(self, tag):
- endpoint = "/3/gallery/t/{}".format(tag)
+ endpoint = f"/3/gallery/t/{tag}"
return self._pagination(endpoint, key="items")
def image(self, image_hash):
@@ -294,10 +293,9 @@ class ImgurAPI():
def _call(self, endpoint, params=None, headers=None):
while True:
try:
- return self.extractor.request(
+ return self.extractor.request_json(
"https://api.imgur.com" + endpoint,
- params=params, headers=(headers or self.headers),
- ).json()
+ params=params, headers=(headers or self.headers))
except exception.HttpError as exc:
if exc.status not in (403, 429) or \
b"capacity" not in exc.response.content:
@@ -308,7 +306,7 @@ class ImgurAPI():
num = 0
while True:
- data = self._call("{}/{}".format(endpoint, num), params)["data"]
+ data = self._call(f"{endpoint}/{num}", params)["data"]
if key:
data = data[key]
if not data:
diff --git a/gallery_dl/extractor/imhentai.py b/gallery_dl/extractor/imhentai.py
index 1b0fba3..5ad1c30 100644
--- a/gallery_dl/extractor/imhentai.py
+++ b/gallery_dl/extractor/imhentai.py
@@ -38,7 +38,7 @@ class ImhentaiExtractor(BaseExtractor):
yield Message.Queue, base + gallery_id, data
prev = gallery_id
- href = text.rextract(page, "class='page-link' href='", "'")[0]
+ href = text.rextr(page, "class='page-link' href='", "'")
if not href or href == "#":
return
if href[0] == "/":
@@ -85,7 +85,7 @@ class ImhentaiGalleryExtractor(ImhentaiExtractor, GalleryExtractor):
def __init__(self, match):
ImhentaiExtractor.__init__(self, match)
self.gallery_id = self.groups[-1]
- self.gallery_url = "{}/gallery/{}/".format(self.root, self.gallery_id)
+ self.page_url = f"{self.root}/gallery/{self.gallery_id}/"
def metadata(self, page):
extr = text.extract_from(page)
diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py
index 47e071a..45ae52e 100644
--- a/gallery_dl/extractor/inkbunny.py
+++ b/gallery_dl/extractor/inkbunny.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2020-2023 Mike Fährmann
+# Copyright 2020-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -109,12 +109,11 @@ class InkbunnyPoolExtractor(InkbunnyExtractor):
def __init__(self, match):
InkbunnyExtractor.__init__(self, match)
- pid = match.group(1)
- if pid:
+ if pid := match[1]:
self.pool_id = pid
self.orderby = "pool_order"
else:
- params = text.parse_query(match.group(2))
+ params = text.parse_query(match[2])
self.pool_id = params.get("pool_id")
self.orderby = params.get("orderby", "pool_order")
@@ -142,19 +141,18 @@ class InkbunnyFavoriteExtractor(InkbunnyExtractor):
def __init__(self, match):
InkbunnyExtractor.__init__(self, match)
- uid = match.group(1)
- if uid:
+ if uid := match[1]:
self.user_id = uid
self.orderby = self.config("orderby", "fav_datetime")
else:
- params = text.parse_query(match.group(2))
+ params = text.parse_query(match[2])
self.user_id = params.get("user_id")
self.orderby = params.get("orderby", "fav_datetime")
def metadata(self):
# Lookup fav user ID as username
- url = "{}/userfavorites_process.php?favs_user_id={}".format(
- self.root, self.user_id)
+ url = (f"{self.root}/userfavorites_process.php"
+ f"?favs_user_id={self.user_id}")
page = self.request(url).text
user_link = text.extr(page, '<a rel="author"', '</a>')
favs_username = text.extr(user_link, 'href="/', '"')
@@ -184,7 +182,7 @@ class InkbunnyUnreadExtractor(InkbunnyExtractor):
def __init__(self, match):
InkbunnyExtractor.__init__(self, match)
- self.params = text.parse_query(match.group(1))
+ self.params = text.parse_query(match[1])
def posts(self):
params = self.params.copy()
@@ -204,7 +202,7 @@ class InkbunnySearchExtractor(InkbunnyExtractor):
def __init__(self, match):
InkbunnyExtractor.__init__(self, match)
- self.params = text.parse_query(match.group(1))
+ self.params = text.parse_query(match[1])
def metadata(self):
return {"search": self.params}
@@ -218,10 +216,9 @@ class InkbunnySearchExtractor(InkbunnyExtractor):
params["dayslimit"] = pop("days", None)
params["username"] = pop("artist", None)
- favsby = pop("favsby", None)
- if favsby:
+ if favsby := pop("favsby", None):
# get user_id from user profile
- url = "{}/{}".format(self.root, favsby)
+ url = f"{self.root}/{favsby}"
page = self.request(url).text
user_id = text.extr(page, "?user_id=", "'")
params["favs_user_id"] = user_id.partition("&")[0]
@@ -241,8 +238,8 @@ class InkbunnyFollowingExtractor(InkbunnyExtractor):
def __init__(self, match):
InkbunnyExtractor.__init__(self, match)
- self.user_id = match.group(1) or \
- text.parse_query(match.group(2)).get("user_id")
+ self.user_id = match[1] or \
+ text.parse_query(match[2]).get("user_id")
def items(self):
url = self.root + "/watchlist_process.php"
@@ -276,7 +273,7 @@ class InkbunnyPostExtractor(InkbunnyExtractor):
def __init__(self, match):
InkbunnyExtractor.__init__(self, match)
- self.submission_id = match.group(1)
+ self.submission_id = match[1]
def posts(self):
submissions = self.api.detail(({"submission_id": self.submission_id},))
@@ -304,6 +301,7 @@ class InkbunnyAPI():
params = {
"submission_ids": ",".join(ids),
"show_description": "yes",
+ "show_pools": "yes",
}
submissions = [None] * len(ids)
@@ -341,7 +339,7 @@ class InkbunnyAPI():
while True:
params["sid"] = self.session_id
- data = self.extractor.request(url, params=params).json()
+ data = self.extractor.request_json(url, params=params)
if "error_code" not in data:
return data
@@ -350,7 +348,7 @@ class InkbunnyAPI():
self.authenticate(invalidate=True)
continue
- raise exception.StopExtraction(data.get("error_message"))
+ raise exception.AbortExtraction(data.get("error_message"))
def _pagination_search(self, params):
params["page"] = 1
@@ -378,7 +376,7 @@ def _authenticate_impl(api, username, password):
url = "https://inkbunny.net/api_login.php"
data = {"username": username, "password": password}
- data = api.extractor.request(url, method="POST", data=data).json()
+ data = api.extractor.request_json(url, method="POST", data=data)
if "sid" not in data:
raise exception.AuthenticationError(data.get("error_message"))
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 624bba2..6213e9a 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright 2018-2020 Leonardo Taccari
-# Copyright 2018-2023 Mike Fährmann
+# Copyright 2018-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,12 +9,11 @@
"""Extractors for https://www.instagram.com/"""
-from .common import Extractor, Message
+from .common import Extractor, Message, Dispatch
from .. import text, util, exception
from ..cache import cache, memcache
import itertools
import binascii
-import re
BASE_PATTERN = r"(?:https?://)?(?:www\.)?instagram\.com"
USER_PATTERN = BASE_PATTERN + r"/(?!(?:p|tv|reel|explore|stories)/)([^/?#]+)"
@@ -34,12 +33,12 @@ class InstagramExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.item = match.group(1)
+ self.item = match[1]
def _init(self):
self.www_claim = "0"
self.csrf_token = util.generate_token()
- self._find_tags = re.compile(r"#\w+").findall
+ self._find_tags = util.re(r"#\w+").findall
self._logged_in = True
self._cursor = None
self._user = None
@@ -56,8 +55,7 @@ class InstagramExtractor(Extractor):
self.login()
data = self.metadata()
- videos = self.config("videos", True)
- if videos:
+ if videos := self.config("videos", True):
videos_dash = (videos != "merged")
videos_headers = {"User-Agent": "Mozilla/5.0"}
previews = self.config("previews", False)
@@ -92,15 +90,14 @@ class InstagramExtractor(Extractor):
for file in files:
file.update(post)
- url = file.get("video_url")
- if url:
+ if url := file.get("video_url"):
if videos:
file["_http_headers"] = videos_headers
text.nameext_from_url(url, file)
if videos_dash:
file["_fallback"] = (url,)
file["_ytdl_manifest"] = "dash"
- url = "ytdl:dash"
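+ # hand ytdl a unique, descriptive per-file placeholder URL
+ # instead of the former bare "ytdl:dash" marker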
+ url = f"ytdl:{post['post_url']}{file['num']}.mp4"
yield Message.Url, url, file
if previews:
file["media_id"] += "p"
@@ -137,16 +134,15 @@ class InstagramExtractor(Extractor):
else:
page = None
- if page:
- raise exception.StopExtraction("HTTP redirect to %s page (%s)",
- page, url.partition("?")[0])
+ if page is not None:
+ raise exception.AbortExtraction(
+ f"HTTP redirect to {page} page ({url.partition('?')[0]})")
www_claim = response.headers.get("x-ig-set-www-claim")
if www_claim is not None:
self.www_claim = www_claim
- csrf_token = response.cookies.get("csrftoken")
- if csrf_token:
+ if csrf_token := response.cookies.get("csrftoken"):
self.csrf_token = csrf_token
return response
@@ -165,11 +161,15 @@ class InstagramExtractor(Extractor):
if "items" in post: # story or highlight
items = post["items"]
reel_id = str(post["id"]).rpartition(":")[2]
- expires = post.get("expiring_at")
+ if expires := post.get("expiring_at"):
+ post_url = f"{self.root}/stories/{post['user']['username']}/"
+ else:
+ post_url = f"{self.root}/stories/highlights/{reel_id}/"
data = {
"expires": text.parse_timestamp(expires),
"post_id": reel_id,
"post_shortcode": shortcode_from_id(reel_id),
+ "post_url": post_url,
}
if "title" in post:
data["highlight_title"] = post["title"]
@@ -180,7 +180,7 @@ class InstagramExtractor(Extractor):
data = {
"post_id" : post["pk"],
"post_shortcode": post["code"],
- "post_url": "{}/p/{}/".format(self.root, post["code"]),
+ "post_url": f"{self.root}/p/{post['code']}/",
"likes": post.get("like_count", 0),
"liked": post.get("has_liked", False),
"pinned": self._extract_pinned(post),
@@ -189,20 +189,17 @@ class InstagramExtractor(Extractor):
caption = post["caption"]
data["description"] = caption["text"] if caption else ""
- tags = self._find_tags(data["description"])
- if tags:
+ if tags := self._find_tags(data["description"]):
data["tags"] = sorted(set(tags))
- location = post.get("location")
- if location:
+ if location := post.get("location"):
slug = location["short_name"].replace(" ", "-").lower()
data["location_id"] = location["pk"]
data["location_slug"] = slug
- data["location_url"] = "{}/explore/locations/{}/{}/".format(
- self.root, location["pk"], slug)
+ data["location_url"] = \
+ f"{self.root}/explore/locations/{location['pk']}/{slug}/"
- coauthors = post.get("coauthor_producers")
- if coauthors:
+ if coauthors := post.get("coauthor_producers"):
data["coauthors"] = [
{"id" : user["pk"],
"username" : user["username"],
@@ -210,8 +207,7 @@ class InstagramExtractor(Extractor):
for user in coauthors
]
- items = post.get("carousel_media")
- if items:
+ if items := post.get("carousel_media"):
data["sidecar_media_id"] = data["post_id"]
data["sidecar_shortcode"] = data["post_shortcode"]
else:
@@ -233,8 +229,7 @@ class InstagramExtractor(Extractor):
data["post_shortcode"])
continue
- video_versions = item.get("video_versions")
- if video_versions:
+ if video_versions := item.get("video_versions"):
video = max(
video_versions,
key=lambda x: (x["width"], x["height"], x["type"]),
@@ -277,8 +272,7 @@ class InstagramExtractor(Extractor):
"edge_sidecar_to_children" not in post:
post = self.api.media(post["id"])[0]
- pinned = post.get("pinned_for_users", ())
- if pinned:
+ if pinned := post.get("pinned_for_users", ()):
for index, user in enumerate(pinned):
pinned[index] = int(user["id"])
@@ -293,7 +287,7 @@ class InstagramExtractor(Extractor):
"fullname" : owner.get("full_name"),
"post_id" : post["id"],
"post_shortcode": post["shortcode"],
- "post_url" : "{}/p/{}/".format(self.root, post["shortcode"]),
+ "post_url" : f"{self.root}/p/{post['shortcode']}/",
"post_date" : text.parse_timestamp(post["taken_at_timestamp"]),
"description": text.parse_unicode_escapes("\n".join(
edge["node"]["text"]
@@ -302,19 +296,16 @@ class InstagramExtractor(Extractor):
}
data["date"] = data["post_date"]
- tags = self._find_tags(data["description"])
- if tags:
+ if tags := self._find_tags(data["description"]):
data["tags"] = sorted(set(tags))
- location = post.get("location")
- if location:
+ if location := post.get("location"):
data["location_id"] = location["id"]
data["location_slug"] = location["slug"]
- data["location_url"] = "{}/explore/locations/{}/{}/".format(
- self.root, location["id"], location["slug"])
+ data["location_url"] = (f"{self.root}/explore/locations/"
+ f"{location['id']}/{location['slug']}/")
- coauthors = post.get("coauthor_producers")
- if coauthors:
+ if coauthors := post.get("coauthor_producers"):
data["coauthors"] = [
{"id" : user["id"],
"username": user["username"]}
@@ -358,36 +349,31 @@ class InstagramExtractor(Extractor):
return data
- @staticmethod
- def _extract_tagged_users(src, dest):
+ def _extract_tagged_users(self, src, dest):
dest["tagged_users"] = tagged_users = []
- edges = src.get("edge_media_to_tagged_user")
- if edges:
+ if edges := src.get("edge_media_to_tagged_user"):
for edge in edges["edges"]:
user = edge["node"]["user"]
tagged_users.append({"id" : user["id"],
"username" : user["username"],
"full_name": user["full_name"]})
- usertags = src.get("usertags")
- if usertags:
+ if usertags := src.get("usertags"):
for tag in usertags["in"]:
user = tag["user"]
tagged_users.append({"id" : user["pk"],
"username" : user["username"],
"full_name": user["full_name"]})
- mentions = src.get("reel_mentions")
- if mentions:
+ if mentions := src.get("reel_mentions"):
for mention in mentions:
user = mention["user"]
tagged_users.append({"id" : user.get("pk"),
"username" : user["username"],
"full_name": user["full_name"]})
- stickers = src.get("story_bloks_stickers")
- if stickers:
+ if stickers := src.get("story_bloks_stickers"):
for sticker in stickers:
sticker = sticker["bloks_sticker"]
if sticker["bloks_sticker_type"] == "mention":
@@ -430,21 +416,14 @@ class InstagramExtractor(Extractor):
user[key] = 0
-class InstagramUserExtractor(InstagramExtractor):
+class InstagramUserExtractor(Dispatch, InstagramExtractor):
"""Extractor for an Instagram user profile"""
- subcategory = "user"
pattern = USER_PATTERN + r"/?(?:$|[?#])"
example = "https://www.instagram.com/USER/"
- def initialize(self):
- pass
-
- def finalize(self):
- pass
-
def items(self):
- base = "{}/{}/".format(self.root, self.item)
- stories = "{}/stories/{}/".format(self.root, self.item)
+ base = f"{self.root}/{self.item}/"
+ stories = f"{self.root}/stories/{self.item}/"
return self._dispatch_extractors((
(InstagramInfoExtractor , base + "info/"),
(InstagramAvatarExtractor , base + "avatar/"),
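Note: the new Dispatch mixin replaces the initialize()/finalize() stubs and the repeated `subcategory = "user"` that profile-level extractors previously carried themselves. A sketch of the mixin under that assumption (the actual class comes from gallery_dl/extractor/common.py):

    class Dispatch():
        subcategory = "user"

        def initialize(self):
            # skip regular extractor setup; items() only queues
            # URLs for the sub-extractors selected below
            pass

        def finalize(self):
            pass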
@@ -522,7 +501,7 @@ class InstagramGuideExtractor(InstagramExtractor):
def __init__(self, match):
InstagramExtractor.__init__(self, match)
- self.guide_id = match.group(2)
+ self.guide_id = match[2]
def metadata(self):
return {"guide": self.api.guide(self.guide_id)}
@@ -632,7 +611,7 @@ class InstagramFollowersExtractor(InstagramExtractor):
uid = self.api.user_id(self.item)
for user in self.api.user_followers(uid):
user["_extractor"] = InstagramUserExtractor
- url = "{}/{}".format(self.root, user["username"])
+ url = f"{self.root}/{user['username']}"
yield Message.Queue, url, user
@@ -646,7 +625,7 @@ class InstagramFollowingExtractor(InstagramExtractor):
uid = self.api.user_id(self.item)
for user in self.api.user_following(uid):
user["_extractor"] = InstagramUserExtractor
- url = "{}/{}".format(self.root, user["username"])
+ url = f"{self.root}/{user['username']}"
yield Message.Queue, url, user
@@ -702,8 +681,7 @@ class InstagramAvatarExtractor(InstagramExtractor):
url = user.get("profile_pic_url_hd") or user["profile_pic_url"]
avatar = {"url": url, "width": 0, "height": 0}
- pk = user.get("profile_pic_id")
- if pk:
+ if pk := user.get("profile_pic_id"):
pk = pk.partition("_")[0]
code = shortcode_from_id(pk)
else:
@@ -751,14 +729,13 @@ class InstagramRestAPI():
return self._call(endpoint, params=params)
def guide_media(self, guide_id):
- endpoint = "/v1/guides/guide/{}/".format(guide_id)
+ endpoint = f"/v1/guides/guide/{guide_id}/"
return self._pagination_guides(endpoint)
def highlights_media(self, user_id, chunk_size=5):
reel_ids = [hl["id"] for hl in self.highlights_tray(user_id)]
- order = self.extractor.config("order-posts")
- if order:
+ if order := self.extractor.config("order-posts"):
if order in ("desc", "reverse"):
reel_ids.reverse()
elif order in ("id", "id_asc"):
@@ -773,13 +750,13 @@ class InstagramRestAPI():
reel_ids[offset : offset+chunk_size])
def highlights_tray(self, user_id):
- endpoint = "/v1/highlights/{}/highlights_tray/".format(user_id)
+ endpoint = f"/v1/highlights/{user_id}/highlights_tray/"
return self._call(endpoint)["tray"]
def media(self, shortcode):
if len(shortcode) > 28:
shortcode = shortcode[:-28]
- endpoint = "/v1/media/{}/info/".format(id_from_shortcode(shortcode))
+ endpoint = f"/v1/media/{id_from_shortcode(shortcode)}/info/"
return self._pagination(endpoint)
def reels_media(self, reel_ids):
@@ -796,7 +773,7 @@ class InstagramRestAPI():
yield media["media"]
def tags_sections(self, tag):
- endpoint = "/v1/tags/{}/sections/".format(tag)
+ endpoint = f"/v1/tags/{tag}/sections/"
data = {
"include_persistent": "0",
"max_id" : None,
@@ -815,7 +792,7 @@ class InstagramRestAPI():
@memcache(keyarg=1)
def user_by_id(self, user_id):
- endpoint = "/v1/users/{}/info/".format(user_id)
+ endpoint = f"/v1/users/{user_id}/info/"
return self._call(endpoint)["user"]
def user_id(self, screen_name, check_private=True):
@@ -847,22 +824,22 @@ class InstagramRestAPI():
return self._pagination_post(endpoint, data)
def user_collection(self, collection_id):
- endpoint = "/v1/feed/collection/{}/posts/".format(collection_id)
+ endpoint = f"/v1/feed/collection/{collection_id}/posts/"
params = {"count": 50}
return self._pagination(endpoint, params, media=True)
def user_feed(self, user_id):
- endpoint = "/v1/feed/user/{}/".format(user_id)
+ endpoint = f"/v1/feed/user/{user_id}/"
params = {"count": 30}
return self._pagination(endpoint, params)
def user_followers(self, user_id):
- endpoint = "/v1/friendships/{}/followers/".format(user_id)
+ endpoint = f"/v1/friendships/{user_id}/followers/"
params = {"count": 12}
return self._pagination_following(endpoint, params)
def user_following(self, user_id):
- endpoint = "/v1/friendships/{}/following/".format(user_id)
+ endpoint = f"/v1/friendships/{user_id}/following/"
params = {"count": 12}
return self._pagination_following(endpoint, params)
@@ -872,7 +849,7 @@ class InstagramRestAPI():
return self._pagination(endpoint, params, media=True)
def user_tagged(self, user_id):
- endpoint = "/v1/usertags/{}/feed/".format(user_id)
+ endpoint = f"/v1/usertags/{user_id}/feed/"
params = {"count": 20}
return self._pagination(endpoint, params)
@@ -893,7 +870,7 @@ class InstagramRestAPI():
"Sec-Fetch-Mode" : "cors",
"Sec-Fetch-Site" : "same-origin",
}
- return extr.request(url, **kwargs).json()
+ return extr.request_json(url, **kwargs)
def _pagination(self, endpoint, params=None, media=False):
if params is None:
@@ -987,9 +964,8 @@ class InstagramGraphqlAPI():
self.user_by_id = api.user_by_id
self.user_id = api.user_id
- @staticmethod
- def _unsupported(_=None):
- raise exception.StopExtraction("Unsupported with GraphQL API")
+ def _unsupported(self, _=None):
+ raise exception.AbortExtraction("Unsupported with GraphQL API")
def highlights_tray(self, user_id):
query_hash = "d4d88dc1500312af6f937f7b804c68c3"
@@ -1057,7 +1033,7 @@ class InstagramGraphqlAPI():
"X-Requested-With": "XMLHttpRequest",
"Referer" : extr.root + "/",
}
- return extr.request(url, params=params, headers=headers).json()["data"]
+ return extr.request_json(url, params=params, headers=headers)["data"]
def _pagination(self, query_hash, variables,
key_data="user", key_edge=None):
@@ -1075,9 +1051,10 @@ class InstagramGraphqlAPI():
if not info["has_next_page"]:
return extr._update_cursor(None)
elif not data["edges"]:
- s = "" if self.extractor.item.endswith("s") else "s"
- raise exception.StopExtraction(
- "%s'%s posts are private", self.extractor.item, s)
+ user = self.extractor.item
+ s = "" if user.endswith("s") else "s"
+ raise exception.AbortExtraction(
+ f"{user}'{s} posts are private")
variables["after"] = extr._update_cursor(info["end_cursor"])
diff --git a/gallery_dl/extractor/issuu.py b/gallery_dl/extractor/issuu.py
index abbdfd5..06c5caa 100644
--- a/gallery_dl/extractor/issuu.py
+++ b/gallery_dl/extractor/issuu.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -39,18 +39,15 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor):
doc["date"] = text.parse_datetime(
doc["originalPublishDateInISOString"], "%Y-%m-%dT%H:%M:%S.%fZ")
- self._cnt = text.parse_int(doc["pageCount"])
- self._tpl = "https://{}/{}-{}/jpg/page_{{}}.jpg".format(
- "image.isu.pub", # data["config"]["hosts"]["image"],
- doc["revisionId"],
- doc["publicationId"],
- )
+ self.count = text.parse_int(doc["pageCount"])
+ self.base = (f"https://image.isu.pub/{doc['revisionId']}-"
+ f"{doc['publicationId']}/jpg/page_")
return {"document": doc}
def images(self, page):
- fmt = self._tpl.format
- return [(fmt(i), None) for i in range(1, self._cnt + 1)]
+ return [(f"{self.base}{i}.jpg", None)
+ for i in range(1, self.count + 1)]
class IssuuUserExtractor(IssuuBase, Extractor):
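With the rewritten template, page images are simply base + page number + ".jpg". For a document with revisionId "230101120000" and publicationId "abcdef123456" (illustrative values), page 1 resolves to:

    https://image.isu.pub/230101120000-abcdef123456/jpg/page_1.jpg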
diff --git a/gallery_dl/extractor/itaku.py b/gallery_dl/extractor/itaku.py
index e602665..19ffc50 100644
--- a/gallery_dl/extractor/itaku.py
+++ b/gallery_dl/extractor/itaku.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2022-2023 Mike Fährmann
+# Copyright 2022-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,11 +8,12 @@
"""Extractors for https://itaku.ee/"""
-from .common import Extractor, Message
+from .common import Extractor, Message, Dispatch
from ..cache import memcache
-from .. import text
+from .. import text, util
BASE_PATTERN = r"(?:https?://)?itaku\.ee"
+USER_PATTERN = BASE_PATTERN + r"/profile/([^/?#]+)"
class ItakuExtractor(Extractor):
@@ -29,49 +30,166 @@ class ItakuExtractor(Extractor):
self.videos = self.config("videos", True)
def items(self):
- for post in self.posts():
-
- post["date"] = text.parse_datetime(
- post["date_added"], "%Y-%m-%dT%H:%M:%S.%fZ")
- for category, tags in post.pop("categorized_tags").items():
- post["tags_" + category.lower()] = [t["name"] for t in tags]
- post["tags"] = [t["name"] for t in post["tags"]]
-
- sections = []
- for s in post["sections"]:
- group = s["group"]
- if group:
- sections.append(group["title"] + "/" + s["title"])
+ if images := self.images():
+ for image in images:
+ image["date"] = text.parse_datetime(
+ image["date_added"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ for category, tags in image.pop("categorized_tags").items():
+ image[f"tags_{category.lower()}"] = [
+ t["name"] for t in tags]
+ image["tags"] = [t["name"] for t in image["tags"]]
+
+ sections = []
+ for s in image["sections"]:
+ if group := s["group"]:
+ sections.append(f"{group['title']}/{s['title']}")
+ else:
+ sections.append(s["title"])
+ image["sections"] = sections
+
+ if self.videos and image["video"]:
+ url = image["video"]["video"]
else:
- sections.append(s["title"])
- post["sections"] = sections
+ url = image["image"]
- if post["video"] and self.videos:
- url = post["video"]["video"]
- else:
- url = post["image"]
+ yield Message.Directory, image
+ yield Message.Url, url, text.nameext_from_url(url, image)
+ return
+
+ if posts := self.posts():
+ for post in posts:
+ images = post.pop("gallery_images") or ()
+ post["count"] = len(images)
+ post["date"] = text.parse_datetime(
+ post["date_added"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ post["tags"] = [t["name"] for t in post["tags"]]
+
+ yield Message.Directory, post
+ for post["num"], image in enumerate(images, 1):
+ post["file"] = image
+ image["date"] = text.parse_datetime(
+ image["date_added"], "%Y-%m-%dT%H:%M:%S.%fZ")
+
+ url = image["image"]
+ yield Message.Url, url, text.nameext_from_url(url, post)
+ return
+
+ if users := self.users():
+ base = f"{self.root}/profile/"
+ for user in users:
+ url = f"{base}{user['owner_username']}"
+ user["_extractor"] = ItakuUserExtractor
+ yield Message.Queue, url, user
+ return
- yield Message.Directory, post
- yield Message.Url, url, text.nameext_from_url(url, post)
+ images = posts = users = util.noop
class ItakuGalleryExtractor(ItakuExtractor):
- """Extractor for posts from an itaku user gallery"""
+ """Extractor for an itaku user's gallery"""
subcategory = "gallery"
- pattern = BASE_PATTERN + r"/profile/([^/?#]+)/gallery(?:/(\d+))?"
+ pattern = USER_PATTERN + r"/gallery(?:/(\d+))?"
example = "https://itaku.ee/profile/USER/gallery"
+ def images(self):
+ user, section = self.groups
+ return self.api.galleries_images({
+ "owner" : self.api.user_id(user),
+ "sections": section,
+ })
+
+
+class ItakuPostsExtractor(ItakuExtractor):
+ """Extractor for an itaku user's posts"""
+ subcategory = "posts"
+ directory_fmt = ("{category}", "{owner_username}", "Posts",
+ "{id}{title:? //}")
+ filename_fmt = "{file[id]}{file[title]:? //}.{extension}"
+ archive_fmt = "{id}_{file[id]}"
+ pattern = USER_PATTERN + r"/posts(?:/(\d+))?"
+ example = "https://itaku.ee/profile/USER/posts"
+
def posts(self):
- return self.api.galleries_images(*self.groups)
+ user, folder = self.groups
+ return self.api.posts({
+ "owner" : self.api.user_id(user),
+ "folders": folder,
+ })
class ItakuStarsExtractor(ItakuExtractor):
+ """Extractor for an itaku user's starred images"""
subcategory = "stars"
- pattern = BASE_PATTERN + r"/profile/([^/?#]+)/stars(?:/(\d+))?"
+ pattern = USER_PATTERN + r"/stars(?:/(\d+))?"
example = "https://itaku.ee/profile/USER/stars"
- def posts(self):
- return self.api.galleries_images_starred(*self.groups)
+ def images(self):
+ user, section = self.groups
+ return self.api.galleries_images({
+ "stars_of": self.api.user_id(user),
+ "sections": section,
+ "ordering": "-like_date",
+ }, "/user_starred_imgs")
+
+
+class ItakuFollowingExtractor(ItakuExtractor):
+ subcategory = "following"
+ pattern = USER_PATTERN + r"/following"
+ example = "https://itaku.ee/profile/USER/following"
+
+ def users(self):
+ return self.api.user_profiles({
+ "followed_by": self.api.user_id(self.groups[0]),
+ })
+
+
+class ItakuFollowersExtractor(ItakuExtractor):
+ subcategory = "followers"
+ pattern = USER_PATTERN + r"/followers"
+ example = "https://itaku.ee/profile/USER/followers"
+
+ def users(self):
+ return self.api.user_profiles({
+ "followers_of": self.api.user_id(self.groups[0]),
+ })
+
+
+class ItakuBookmarksExtractor(ItakuExtractor):
+ """Extractor for an itaku bookmarks folder"""
+ subcategory = "bookmarks"
+ pattern = USER_PATTERN + r"/bookmarks/(image|user)/(\d+)"
+ example = "https://itaku.ee/profile/USER/bookmarks/image/12345"
+
+ def _init(self):
+ if self.groups[1] == "user":
+ self.images = util.noop
+ ItakuExtractor._init(self)
+
+ def images(self):
+ return self.api.galleries_images({
+ "bookmark_folder": self.groups[2],
+ })
+
+ def users(self):
+ return self.api.user_profiles({
+ "bookmark_folder": self.groups[2],
+ })
+
+
+class ItakuUserExtractor(Dispatch, ItakuExtractor):
+ """Extractor for itaku user profiles"""
+ pattern = USER_PATTERN + r"/?(?:$|\?|#)"
+ example = "https://itaku.ee/profile/USER"
+
+ def items(self):
+ base = f"{self.root}/profile/{self.groups[0]}/"
+ return self._dispatch_extractors((
+ (ItakuGalleryExtractor , base + "gallery"),
+ (ItakuPostsExtractor , base + "posts"),
+ (ItakuFollowersExtractor, base + "followers"),
+ (ItakuFollowingExtractor, base + "following"),
+ (ItakuStarsExtractor , base + "stars"),
+ ), ("gallery",))
class ItakuImageExtractor(ItakuExtractor):
@@ -79,19 +197,51 @@ class ItakuImageExtractor(ItakuExtractor):
pattern = BASE_PATTERN + r"/images/(\d+)"
example = "https://itaku.ee/images/12345"
- def posts(self):
+ def images(self):
return (self.api.image(self.groups[0]),)
+class ItakuPostExtractor(ItakuExtractor):
+ subcategory = "post"
+ directory_fmt = ("{category}", "{owner_username}", "Posts",
+ "{id}{title:? //}")
+ filename_fmt = "{file[id]}{file[title]:? //}.{extension}"
+ archive_fmt = "{id}_{file[id]}"
+ pattern = BASE_PATTERN + r"/posts/(\d+)"
+ example = "https://itaku.ee/posts/12345"
+
+ def posts(self):
+ return (self.api.post(self.groups[0]),)
+
+
class ItakuSearchExtractor(ItakuExtractor):
subcategory = "search"
pattern = BASE_PATTERN + r"/home/images/?\?([^#]+)"
example = "https://itaku.ee/home/images?tags=SEARCH"
- def posts(self):
+ def images(self):
+ required_tags = []
+ negative_tags = []
+ optional_tags = []
+
params = text.parse_query_list(
self.groups[0], {"tags", "maturity_rating"})
- return self.api.search_images(params)
+ if tags := params.pop("tags", None):
+ for tag in tags:
+ if not tag:
+ pass
+ elif tag[0] == "-":
+ negative_tags.append(tag[1:])
+ elif tag[0] == "~":
+ optional_tags.append(tag[1:])
+ else:
+ required_tags.append(tag)
+
+ return self.api.galleries_images({
+ "required_tags": required_tags,
+ "negative_tags": negative_tags,
+ "optional_tags": optional_tags,
+ })
class ItakuAPI():
@@ -103,90 +253,77 @@ class ItakuAPI():
"Accept": "application/json, text/plain, */*",
}
- def search_images(self, params):
- endpoint = "/galleries/images/"
- required_tags = []
- negative_tags = []
- optional_tags = []
-
- for tag in params.pop("tags", None) or ():
- if not tag:
- pass
- elif tag[0] == "-":
- negative_tags.append(tag[1:])
- elif tag[0] == "~":
- optional_tags.append(tag[1:])
- else:
- required_tags.append(tag)
-
- api_params = {
- "required_tags": required_tags,
- "negative_tags": negative_tags,
- "optional_tags": optional_tags,
+ def galleries_images(self, params, path=""):
+ endpoint = f"/galleries/images{path}/"
+ params = {
+ "cursor" : None,
"date_range": "",
"maturity_rating": ("SFW", "Questionable", "NSFW"),
"ordering" : "-date_added",
"page" : "1",
"page_size" : "30",
"visibility": ("PUBLIC", "PROFILE_ONLY"),
+ **params,
}
- api_params.update(params)
- return self._pagination(endpoint, api_params, self.image)
+ return self._pagination(endpoint, params, self.image)
- def galleries_images(self, username, section=None):
- endpoint = "/galleries/images/"
+ def posts(self, params):
+ endpoint = "/posts/"
params = {
"cursor" : None,
- "owner" : self.user(username)["owner"],
- "sections" : section,
"date_range": "",
"maturity_rating": ("SFW", "Questionable", "NSFW"),
"ordering" : "-date_added",
"page" : "1",
"page_size" : "30",
- "visibility": ("PUBLIC", "PROFILE_ONLY"),
+ **params,
}
- return self._pagination(endpoint, params, self.image)
+ return self._pagination(endpoint, params)
- def galleries_images_starred(self, username, section=None):
- endpoint = "/galleries/images/user_starred_imgs/"
+ def user_profiles(self, params):
+ endpoint = "/user_profiles/"
params = {
- "cursor" : None,
- "stars_of" : self.user(username)["owner"],
- "sections" : section,
- "date_range": "",
- "ordering" : "-date_added",
- "maturity_rating": ("SFW", "Questionable", "NSFW"),
- "page" : "1",
- "page_size" : "30",
- "visibility": ("PUBLIC", "PROFILE_ONLY"),
+ "cursor" : None,
+ "ordering" : "-date_added",
+ "page" : "1",
+ "page_size": "50",
+ "sfw_only" : "false",
+ **params,
}
- return self._pagination(endpoint, params, self.image)
+ return self._pagination(endpoint, params)
def image(self, image_id):
- endpoint = "/galleries/images/{}/".format(image_id)
+ endpoint = f"/galleries/images/{image_id}/"
+ return self._call(endpoint)
+
+ def post(self, post_id):
+ endpoint = f"/posts/{post_id}/"
return self._call(endpoint)
@memcache(keyarg=1)
def user(self, username):
- return self._call("/user_profiles/{}/".format(username))
+ return self._call(f"/user_profiles/{username}/")
+
+ def user_id(self, username):
+ if username.startswith("id:"):
+ return int(username[3:])
+ return self.user(username)["owner"]
def _call(self, endpoint, params=None):
if not endpoint.startswith("http"):
endpoint = self.root + endpoint
- response = self.extractor.request(
+ return self.extractor.request_json(
endpoint, params=params, headers=self.headers)
- return response.json()
- def _pagination(self, endpoint, params, extend):
+ def _pagination(self, endpoint, params, extend=None):
data = self._call(endpoint, params)
while True:
- if extend:
+ if extend is None:
+ yield from data["results"]
+ else:
for result in data["results"]:
yield extend(result["id"])
- else:
- yield from data["results"]
url_next = data["links"].get("next")
if not url_next:
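
Note: the tag handling moved from ItakuAPI.search_images() into
ItakuSearchExtractor.images() keeps its behavior: a leading "-" excludes a
tag, a leading "~" makes it optional, anything else is required. A standalone
re-implementation for illustration:

    def classify_tags(tags):
        required, negative, optional = [], [], []
        for tag in tags:
            if not tag:
                continue                  # skip empty strings
            elif tag[0] == "-":
                negative.append(tag[1:])  # "-tag" excludes
            elif tag[0] == "~":
                optional.append(tag[1:])  # "~tag" is optional
            else:
                required.append(tag)
        return required, negative, optional

    print(classify_tags(["cat", "-dog", "~bird", ""]))
    # (['cat'], ['dog'], ['bird'])
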
diff --git a/gallery_dl/extractor/itchio.py b/gallery_dl/extractor/itchio.py
index 799dd66..6312e58 100644
--- a/gallery_dl/extractor/itchio.py
+++ b/gallery_dl/extractor/itchio.py
@@ -20,7 +20,7 @@ class ItchioGameExtractor(Extractor):
directory_fmt = ("{category}", "{user[name]}")
filename_fmt = "{game[title]} ({id}).{extension}"
archive_fmt = "{id}"
- pattern = r"(?:https?://)?(\w+).itch\.io/([\w-]+)"
+ pattern = r"(?:https?://)?(\w+)\.itch\.io/([\w-]+)"
example = "https://USER.itch.io/GAME"
def __init__(self, match):
@@ -28,7 +28,7 @@ class ItchioGameExtractor(Extractor):
Extractor.__init__(self, match)
def items(self):
- game_url = "https://{}.itch.io/{}".format(self.user, self.slug)
+ game_url = f"https://{self.user}.itch.io/{self.slug}"
page = self.request(game_url).text
params = {
@@ -39,16 +39,16 @@ class ItchioGameExtractor(Extractor):
headers = {
"Referer": game_url,
"X-Requested-With": "XMLHttpRequest",
- "Origin": "https://{}.itch.io".format(self.user),
+ "Origin": f"https://{self.user}.itch.io",
}
data = {
"csrf_token": text.unquote(self.cookies["itchio_token"]),
}
for upload_id in text.extract_iter(page, 'data-upload_id="', '"'):
- file_url = "{}/file/{}".format(game_url, upload_id)
- info = self.request(file_url, method="POST", params=params,
- headers=headers, data=data).json()
+ file_url = f"{game_url}/file/{upload_id}"
+ info = self.request_json(file_url, method="POST", params=params,
+ headers=headers, data=data)
game = info["lightbox"]["game"]
user = info["lightbox"]["user"]
diff --git a/gallery_dl/extractor/iwara.py b/gallery_dl/extractor/iwara.py
new file mode 100644
index 0000000..934b301
--- /dev/null
+++ b/gallery_dl/extractor/iwara.py
@@ -0,0 +1,440 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.iwara.tv/"""
+
+from .common import Extractor, Message, Dispatch
+from .. import text, util, exception
+from ..cache import cache, memcache
+import hashlib
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?iwara\.tv"
+USER_PATTERN = rf"{BASE_PATTERN}/profile/([^/?#]+)"
+
+
+class IwaraExtractor(Extractor):
+ """Base class for iwara.tv extractors"""
+ category = "iwara"
+ root = "https://www.iwara.tv"
+ directory_fmt = ("{category}", "{user[name]}")
+ filename_fmt = "{date} {id} {title[:200]} {filename}.{extension}"
+ archive_fmt = "{type} {user[name]} {id} {file_id}"
+
+ def _init(self):
+ self.api = IwaraAPI(self)
+
+ def items_image(self, images, user=None):
+ for image in images:
+ try:
+ if "image" in image:
+ # could extract 'date_favorited' here
+ image = image["image"]
+ if not (files := image.get("files")):
+ image = self.api.image(image["id"])
+ files = image["files"]
+
+ group_info = self.extract_media_info(image, "file", False)
+ group_info["user"] = (self.extract_user_info(image)
+ if user is None else user)
+ except Exception as exc:
+ self.status |= 1
+ self.log.error("Failed to process image %s (%s: %s)",
+ image["id"], exc.__class__.__name__, exc)
+ continue
+
+ group_info["count"] = len(files)
+ yield Message.Directory, group_info
+ for num, file in enumerate(files, 1):
+ file_info = self.extract_media_info(file, None)
+ file_id = file_info["file_id"]
+ url = (f"https://i.iwara.tv/image/original/"
+ f"{file_id}/{file_id}.{file_info['extension']}")
+ yield Message.Url, url, {**file_info, **group_info, "num": num}
+
+ def items_video(self, videos, user=None):
+ for video in videos:
+ try:
+ if "video" in video:
+ video = video["video"]
+ if "fileUrl" not in video:
+ video = self.api.video(video["id"])
+ file_url = video["fileUrl"]
+ sources = self.api.source(file_url)
+ source = next((s for s in sources
+ if s.get("name") == "Source"), None)
+                download_url = source.get("src", {}).get("download")
+
+ info = self.extract_media_info(video, "file")
+ info["count"] = info["num"] = 1
+ info["user"] = (self.extract_user_info(video)
+ if user is None else user)
+ except Exception as exc:
+ self.status |= 1
+ self.log.error("Failed to process video %s (%s: %s)",
+ video["id"], exc.__class__.__name__, exc)
+ continue
+
+ yield Message.Directory, info
+ yield Message.Url, f"https:{download_url}", info
+
+ def items_user(self, users, key=None):
+ base = f"{self.root}/profile/"
+ for user in users:
+ if key is not None:
+ user = user[key]
+ if (username := user["username"]) is None:
+ continue
+ user["type"] = "user"
+ user["_extractor"] = IwaraUserExtractor
+ yield Message.Queue, f"{base}{username}", user
+
+ def items_by_type(self, type, results):
+ if type == "image":
+ return self.items_image(results)
+ if type == "video":
+ return self.items_video(results)
+ if type == "user":
+ return self.items_user(results)
+
+ raise exception.AbortExtraction(f"Unsupported result type '{type}'")
+
+ def extract_media_info(self, item, key, include_file_info=True):
+ title = t.strip() if (t := item.get("title")) else ""
+
+ if include_file_info:
+ file_info = item if key is None else item.get(key) or {}
+ filename, _, extension = file_info.get("name", "").rpartition(".")
+
+ return {
+ "id" : item["id"],
+ "file_id" : file_info.get("id"),
+ "title" : title,
+ "filename" : filename,
+ "extension": extension,
+ "date" : text.parse_datetime(
+ file_info.get("createdAt"), "%Y-%m-%dT%H:%M:%S.%fZ"),
+ "date_updated": text.parse_datetime(
+ file_info.get("updatedAt"), "%Y-%m-%dT%H:%M:%S.%fZ"),
+ "mime" : file_info.get("mime"),
+ "size" : file_info.get("size"),
+ "width" : file_info.get("width"),
+ "height" : file_info.get("height"),
+ "duration" : file_info.get("duration"),
+ "type" : file_info.get("type"),
+ }
+ else:
+ return {
+ "id" : item["id"],
+ "title": title,
+ }
+
+ def extract_user_info(self, profile):
+ user = profile.get("user") or {}
+ return {
+ "id" : user.get("id"),
+ "name" : user.get("username"),
+ "nick" : user.get("name").strip(),
+ "status" : user.get("status"),
+ "role" : user.get("role"),
+ "premium": user.get("premium"),
+ "date" : text.parse_datetime(
+ user.get("createdAt"), "%Y-%m-%dT%H:%M:%S.000Z"),
+ "description": profile.get("body"),
+ }
+
+ def _user_params(self):
+ user, qs = self.groups
+ params = text.parse_query(qs)
+ profile = self.api.profile(user)
+ params["user"] = profile["user"]["id"]
+ return self.extract_user_info(profile), params
+
+
+class IwaraUserExtractor(Dispatch, IwaraExtractor):
+ """Extractor for iwara.tv profile pages"""
+ pattern = rf"{USER_PATTERN}/?$"
+ example = "https://www.iwara.tv/profile/USERNAME"
+
+ def items(self):
+ base = f"{self.root}/profile/{self.groups[0]}/"
+ return self._dispatch_extractors((
+ (IwaraUserImagesExtractor , f"{base}images"),
+ (IwaraUserVideosExtractor , f"{base}videos"),
+ (IwaraUserPlaylistsExtractor, f"{base}playlists"),
+ ), ("user-images", "user-videos"))
+
+
+class IwaraUserImagesExtractor(IwaraExtractor):
+ subcategory = "user-images"
+ pattern = rf"{USER_PATTERN}/images(?:\?([^#]+))?"
+ example = "https://www.iwara.tv/profile/USERNAME/images"
+
+ def items(self):
+ user, params = self._user_params()
+ return self.items_image(self.api.images(params), user)
+
+
+class IwaraUserVideosExtractor(IwaraExtractor):
+ subcategory = "user-videos"
+ pattern = rf"{USER_PATTERN}/videos(?:\?([^#]+))?"
+ example = "https://www.iwara.tv/profile/USERNAME/videos"
+
+ def items(self):
+ user, params = self._user_params()
+ return self.items_video(self.api.videos(params), user)
+
+
+class IwaraUserPlaylistsExtractor(IwaraExtractor):
+ subcategory = "user-playlists"
+ pattern = rf"{USER_PATTERN}/playlists(?:\?([^#]+))?"
+ example = "https://www.iwara.tv/profile/USERNAME/playlists"
+
+ def items(self):
+ base = f"{self.root}/playlist/"
+
+ for playlist in self.api.playlists(self._user_params()[1]):
+ playlist["type"] = "playlist"
+ playlist["_extractor"] = IwaraPlaylistExtractor
+ url = f"{base}{playlist['id']}"
+ yield Message.Queue, url, playlist
+
+
+class IwaraFollowingExtractor(IwaraExtractor):
+ subcategory = "following"
+ pattern = rf"{USER_PATTERN}/following"
+ example = "https://www.iwara.tv/profile/USERNAME/following"
+
+ def items(self):
+ uid = self.api.profile(self.groups[0])["user"]["id"]
+ return self.items_user(self.api.user_following(uid), "user")
+
+
+class IwaraFollowersExtractor(IwaraExtractor):
+ subcategory = "followers"
+ pattern = rf"{USER_PATTERN}/followers"
+ example = "https://www.iwara.tv/profile/USERNAME/followers"
+
+ def items(self):
+ uid = self.api.profile(self.groups[0])["user"]["id"]
+ return self.items_user(self.api.user_followers(uid), "follower")
+
+
+class IwaraImageExtractor(IwaraExtractor):
+ """Extractor for individual iwara.tv image pages"""
+ subcategory = "image"
+ pattern = rf"{BASE_PATTERN}/image/([^/?#]+)"
+ example = "https://www.iwara.tv/image/ID"
+
+ def items(self):
+ return self.items_image((self.api.image(self.groups[0]),))
+
+
+class IwaraVideoExtractor(IwaraExtractor):
+ """Extractor for individual iwara.tv videos"""
+ subcategory = "video"
+ pattern = rf"{BASE_PATTERN}/video/([^/?#]+)"
+ example = "https://www.iwara.tv/video/ID"
+
+ def items(self):
+ return self.items_video((self.api.video(self.groups[0]),))
+
+
+class IwaraPlaylistExtractor(IwaraExtractor):
+ """Extractor for individual iwara.tv playlist pages"""
+ subcategory = "playlist"
+ pattern = rf"{BASE_PATTERN}/playlist/([^/?#]+)"
+ example = "https://www.iwara.tv/playlist/ID"
+
+ def items(self):
+ return self.items_video(self.api.playlist(self.groups[0]))
+
+
+class IwaraFavoriteExtractor(IwaraExtractor):
+ subcategory = "favorite"
+ pattern = rf"{BASE_PATTERN}/favorites(?:/(image|video)s)?"
+ example = "https://www.iwara.tv/favorites/videos"
+
+ def items(self):
+        type = self.groups[0] or "video"
+ return self.items_by_type(type, self.api.favorites(type))
+
+
+class IwaraSearchExtractor(IwaraExtractor):
+ """Extractor for iwara.tv search pages"""
+ subcategory = "search"
+ pattern = rf"{BASE_PATTERN}/search\?([^#]+)"
+ example = "https://www.iwara.tv/search?query=QUERY&type=TYPE"
+
+ def items(self):
+ params = text.parse_query(self.groups[0])
+ type = params.get("type")
+ self.kwdict["search_tags"] = query = params.get("query")
+ return self.items_by_type(type, self.api.search(type, query))
+
+
+class IwaraTagExtractor(IwaraExtractor):
+ """Extractor for iwara.tv tag search"""
+ subcategory = "tag"
+ pattern = rf"{BASE_PATTERN}/(image|video)s(?:\?([^#]+))?"
+ example = "https://www.iwara.tv/videos?tags=TAGS"
+
+ def items(self):
+ type, qs = self.groups
+ params = text.parse_query(qs)
+ self.kwdict["search_tags"] = params.get("tags")
+ return self.items_by_type(type, self.api.media(type, params))
+
+
+class IwaraAPI():
+ """Interface for the Iwara API"""
+ root = "https://api.iwara.tv"
+
+ def __init__(self, extractor):
+ self.extractor = extractor
+ self.headers = {
+ "Referer" : f"{extractor.root}/",
+ "Content-Type": "application/json",
+ "Origin" : extractor.root,
+ }
+
+ self.username, self.password = extractor._get_auth_info()
+ if not self.username:
+ self.authenticate = util.noop
+
+ def image(self, image_id):
+ endpoint = f"/image/{image_id}"
+ return self._call(endpoint)
+
+ def video(self, video_id):
+ endpoint = f"/video/{video_id}"
+ return self._call(endpoint)
+
+ def playlist(self, playlist_id):
+ endpoint = f"/playlist/{playlist_id}"
+ return self._pagination(endpoint)
+
+ def detail(self, media):
+ endpoint = f"/{media['type']}/{media['id']}"
+ return self._call(endpoint)
+
+ def images(self, params):
+ endpoint = "/images"
+ params.setdefault("rating", "all")
+ return self._pagination(endpoint, params)
+
+ def videos(self, params):
+ endpoint = "/videos"
+ params.setdefault("rating", "all")
+ return self._pagination(endpoint, params)
+
+ def playlists(self, params):
+ endpoint = "/playlists"
+ return self._pagination(endpoint, params)
+
+ def media(self, type, params):
+ endpoint = f"/{type}s"
+ params.setdefault("rating", "all")
+ return self._pagination(endpoint, params)
+
+ def favorites(self, type):
+ if not self.username:
+ raise exception.AuthRequired("'username' & 'password'")
+ endpoint = f"/favorites/{type}s"
+ return self._pagination(endpoint)
+
+ def search(self, type, query):
+ endpoint = "/search"
+ params = {"type": type, "query": query}
+ return self._pagination(endpoint, params)
+
+ @memcache(keyarg=1)
+ def profile(self, username):
+ endpoint = f"/profile/{username}"
+ return self._call(endpoint)
+
+ def user_following(self, user_id):
+ endpoint = f"/user/{user_id}/following"
+ return self._pagination(endpoint)
+
+ def user_followers(self, user_id):
+ endpoint = f"/user/{user_id}/followers"
+ return self._pagination(endpoint)
+
+ def source(self, file_url):
+ base, _, query = file_url.partition("?")
+ if not (expires := text.extr(query, "expires=", "&")):
+ return ()
+ file_id = base.rpartition("/")[2]
+ sha_postfix = "5nFp9kmbNnHdAFhaqMvt"
+ sha_key = f"{file_id}_{expires}_{sha_postfix}"
+ hash = hashlib.sha1(sha_key.encode()).hexdigest()
+ headers = {"X-Version": hash, **self.headers}
+ return self.extractor.request_json(file_url, headers=headers)
+
+ def authenticate(self):
+ self.headers["Authorization"] = self._authenticate_impl(self.username)
+
+ @cache(maxage=3600, keyarg=1)
+ def _authenticate_impl(self, username):
+ refresh_token = _refresh_token_cache(username)
+ if refresh_token is None:
+ self.extractor.log.info("Logging in as %s", username)
+
+ url = f"{self.root}/user/login"
+ json = {
+ "email" : username,
+ "password": self.password
+ }
+ data = self.extractor.request_json(
+ url, method="POST", headers=self.headers, json=json,
+ fatal=False)
+
+ if not (refresh_token := data.get("token")):
+ self.extractor.log.debug(data)
+ raise exception.AuthenticationError(data.get("message"))
+ _refresh_token_cache.update(username, refresh_token)
+
+ self.extractor.log.info("Refreshing access token for %s", username)
+
+ url = f"{self.root}/user/token"
+ headers = {"Authorization": f"Bearer {refresh_token}", **self.headers}
+ data = self.extractor.request_json(
+ url, method="POST", headers=headers, fatal=False)
+
+ if not (access_token := data.get("accessToken")):
+ self.extractor.log.debug(data)
+ raise exception.AuthenticationError(data.get("message"))
+ return f"Bearer {access_token}"
+
+ def _call(self, endpoint, params=None, headers=None):
+ if headers is None:
+ headers = self.headers
+
+ url = self.root + endpoint
+ self.authenticate()
+ return self.extractor.request_json(url, params=params, headers=headers)
+
+ def _pagination(self, endpoint, params=None):
+ if params is None:
+ params = {}
+ params["page"] = 0
+ params["limit"] = 50
+
+ while True:
+ data = self._call(endpoint, params)
+
+ if not (results := data.get("results")):
+ break
+ yield from results
+
+ if len(results) < params["limit"]:
+ break
+ params["page"] += 1
+
+
+@cache(maxage=28*86400, keyarg=0)
+def _refresh_token_cache(username):
+ return None
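
Note: the only non-obvious part of the new IwaraAPI is the X-Version header
computed in source(): SHA-1 over the file ID, the URL's expiry value, and a
fixed postfix. A standalone version of the derivation (file ID and expiry
below are illustrative):

    import hashlib

    file_url = "https://files.iwara.tv/file/FILE_ID?expires=1700000000&h=x"
    base, _, query = file_url.partition("?")
    expires = query.partition("expires=")[2].partition("&")[0]
    file_id = base.rpartition("/")[2]

    sha_key = f"{file_id}_{expires}_5nFp9kmbNnHdAFhaqMvt"
    x_version = hashlib.sha1(sha_key.encode()).hexdigest()
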
diff --git a/gallery_dl/extractor/jschan.py b/gallery_dl/extractor/jschan.py
index 398256d..5f3e75a 100644
--- a/gallery_dl/extractor/jschan.py
+++ b/gallery_dl/extractor/jschan.py
@@ -33,27 +33,19 @@ class JschanThreadExtractor(JschanExtractor):
pattern = BASE_PATTERN + r"/([^/?#]+)/thread/(\d+)\.html"
example = "https://94chan.org/a/thread/12345.html"
- def __init__(self, match):
- JschanExtractor.__init__(self, match)
- index = match.lastindex
- self.board = match.group(index-1)
- self.thread = match.group(index)
-
def items(self):
- url = "{}/{}/thread/{}.json".format(
- self.root, self.board, self.thread)
- thread = self.request(url).json()
+ url = f"{self.root}/{self.groups[-2]}/thread/{self.groups[-1]}.json"
+ thread = self.request_json(url)
thread["threadId"] = thread["postId"]
posts = thread.pop("replies", ())
yield Message.Directory, thread
for post in itertools.chain((thread,), posts):
- files = post.pop("files", ())
- if files:
+ if files := post.pop("files", ()):
thread.update(post)
thread["count"] = len(files)
for num, file in enumerate(files):
- url = self.root + "/file/" + file["filename"]
+ url = f"{self.root}/file/{file['filename']}"
file.update(thread)
file["num"] = num
file["siteFilename"] = file["filename"]
@@ -68,14 +60,10 @@ class JschanBoardExtractor(JschanExtractor):
r"(?:/index\.html|/catalog\.html|/\d+\.html|/?$)")
example = "https://94chan.org/a/"
- def __init__(self, match):
- JschanExtractor.__init__(self, match)
- self.board = match.group(match.lastindex)
-
def items(self):
- url = "{}/{}/catalog.json".format(self.root, self.board)
- for thread in self.request(url).json():
- url = "{}/{}/thread/{}.html".format(
- self.root, self.board, thread["postId"])
+ board = self.groups[-1]
+ url = f"{self.root}/{board}/catalog.json"
+ for thread in self.request_json(url):
+ url = f"{self.root}/{board}/thread/{thread['postId']}.html"
thread["_extractor"] = JschanThreadExtractor
yield Message.Queue, url, thread
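
Note: the switch from match.lastindex arithmetic to self.groups[-2] and
self.groups[-1] works because board and thread are always the final two
capture groups, no matter how many groups BASE_PATTERN itself contributes.
Illustrative pattern (not jschan's actual BASE_PATTERN, which is generated
per site instance):

    import re

    pattern = re.compile(r"(?:https?://)?(94chan\.org|ptchan\.org)"
                         r"/([^/?#]+)/thread/(\d+)\.html")
    groups = pattern.match("https://94chan.org/a/thread/12345.html").groups()
    print(groups[-2], groups[-1])  # a 12345
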
diff --git a/gallery_dl/extractor/kabeuchi.py b/gallery_dl/extractor/kabeuchi.py
index 867f0da..c259c47 100644
--- a/gallery_dl/extractor/kabeuchi.py
+++ b/gallery_dl/extractor/kabeuchi.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2020-2023 Mike Fährmann
+# Copyright 2020-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -23,16 +23,12 @@ class KabeuchiUserExtractor(Extractor):
pattern = r"(?:https?://)?kabe-uchiroom\.com/mypage/?\?id=(\d+)"
example = "https://kabe-uchiroom.com/mypage/?id=12345"
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.user_id = match.group(1)
-
def items(self):
- base = "{}/accounts/upfile/{}/{}/".format(
- self.root, self.user_id[-1], self.user_id)
+ uid = self.groups[0]
+ base = f"{self.root}/accounts/upfile/{uid[-1]}/{uid}/"
keys = ("image1", "image2", "image3", "image4", "image5", "image6")
- for post in self.posts():
+ for post in self.posts(uid):
if post.get("is_ad") or not post["image1"]:
continue
@@ -48,8 +44,8 @@ class KabeuchiUserExtractor(Extractor):
post["num"] = ord(key[-1]) - 48
yield Message.Url, url, text.nameext_from_url(name, post)
- def posts(self):
- url = "{}/mypage/?id={}".format(self.root, self.user_id)
+ def posts(self, uid):
+ url = f"{self.root}/mypage/?id={uid}"
response = self.request(url)
if response.history and response.url == self.root + "/":
raise exception.NotFoundError("user")
@@ -57,7 +53,7 @@ class KabeuchiUserExtractor(Extractor):
return self._pagination(target_id)
def _pagination(self, target_id):
- url = "{}/get_posts.php".format(self.root)
+ url = f"{self.root}/get_posts.php"
data = {
"user_id" : "0",
"target_id" : target_id,
@@ -69,7 +65,7 @@ class KabeuchiUserExtractor(Extractor):
}
while True:
- info = self.request(url, method="POST", data=data).json()
+ info = self.request_json(url, method="POST", data=data)
datas = info["datas"]
if not datas or not isinstance(datas, list):
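
Note on the unchanged line post["num"] = ord(key[-1]) - 48 in the hunk above:
it converts the last character of an "imageN" key into its digit value
(48 == ord("0")), i.e. a terse spelling of int(key[-1]):

    key = "image3"
    print(ord(key[-1]) - 48)  # 3
    print(int(key[-1]))       # 3, equivalent
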
diff --git a/gallery_dl/extractor/keenspot.py b/gallery_dl/extractor/keenspot.py
index b8ecc14..f55a930 100644
--- a/gallery_dl/extractor/keenspot.py
+++ b/gallery_dl/extractor/keenspot.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -24,8 +24,8 @@ class KeenspotComicExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.comic = match.group(1).lower()
- self.path = match.group(2)
+ self.comic = match[1].lower()
+ self.path = match[2]
self.root = "http://" + self.comic + ".keenspot.com"
self._needle = ""
@@ -75,8 +75,7 @@ class KeenspotComicExtractor(Extractor):
self._image = '<div id="comic">'
return "http://brawlinthefamily.keenspot.com/comic/theshowdown/"
- url = text.extr(page, '<link rel="first" href="', '"')
- if url:
+ if url := text.extr(page, '<link rel="first" href="', '"'):
if self.comic == "porcelain":
self._needle = 'id="porArchivetop_"'
else:
@@ -86,7 +85,7 @@ class KeenspotComicExtractor(Extractor):
pos = page.find('id="first_day1"')
if pos >= 0:
self._next = self._next_id
- return text.rextract(page, 'href="', '"', pos)[0]
+ return text.rextr(page, 'href="', '"', pos)
pos = page.find('>FIRST PAGE<')
if pos >= 0:
@@ -95,7 +94,7 @@ class KeenspotComicExtractor(Extractor):
self._image = '<div id="comic">'
else:
self._next = self._next_id
- return text.rextract(page, 'href="', '"', pos)[0]
+ return text.rextr(page, 'href="', '"', pos)
pos = page.find('<div id="kscomicpart"')
if pos >= 0:
@@ -106,13 +105,13 @@ class KeenspotComicExtractor(Extractor):
if pos >= 0:
self._image = '</header>'
self._needle = 'class="navarchive"'
- return text.rextract(page, 'href="', '"', pos)[0]
+ return text.rextr(page, 'href="', '"', pos)
pos = page.find('id="flip_FirstDay"') # flipside
if pos >= 0:
self._image = 'class="flip_Pages ksc"'
self._needle = 'id="flip_ArcButton"'
- return text.rextract(page, 'href="', '"', pos)[0]
+ return text.rextr(page, 'href="', '"', pos)
self.log.error("Unrecognized page layout")
return None
@@ -121,22 +120,18 @@ class KeenspotComicExtractor(Extractor):
pos = page.index(self._needle) + len(self._needle)
return text.extract(page, 'href="', '"', pos)[0]
- @staticmethod
- def _next_link(page):
+ def _next_link(self, page):
return text.extr(page, '<link rel="next" href="', '"')
- @staticmethod
- def _next_id(page):
+ def _next_id(self, page):
pos = page.find('id="next_')
- return text.rextract(page, 'href="', '"', pos)[0] if pos >= 0 else None
+ return text.rextr(page, 'href="', '"', pos) if pos >= 0 else None
- @staticmethod
- def _next_lastblood(page):
+ def _next_lastblood(self, page):
pos = page.index("link rel='next'")
return text.extract(page, "href='", "'", pos)[0]
- @staticmethod
- def _next_brawl(page):
+ def _next_brawl(self, page):
pos = page.index("comic-nav-next")
- url = text.rextract(page, 'href="', '"', pos)[0]
+ url = text.rextr(page, 'href="', '"', pos)
return None if "?random" in url else url
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemono.py
index 4893f19..1e88891 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemono.py
@@ -1,40 +1,39 @@
# -*- coding: utf-8 -*-
-# Copyright 2021-2023 Mike Fährmann
+# Copyright 2021-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extractors for https://kemono.su/"""
+"""Extractors for https://kemono.cr/"""
from .common import Extractor, Message
from .. import text, util, exception
-from ..cache import cache
+from ..cache import cache, memcache
import itertools
import json
-import re
-BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.(su|party)"
+BASE_PATTERN = (r"(?:https?://)?(?:www\.|beta\.)?"
+ r"(kemono|coomer)\.(cr|s[tu]|party)")
USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)/user/([^/?#]+)"
HASH_PATTERN = r"/[0-9a-f]{2}/[0-9a-f]{2}/([0-9a-f]{64})"
-class KemonopartyExtractor(Extractor):
- """Base class for kemonoparty extractors"""
- category = "kemonoparty"
- root = "https://kemono.su"
+class KemonoExtractor(Extractor):
+ """Base class for kemono extractors"""
+ category = "kemono"
+ root = "https://kemono.cr"
directory_fmt = ("{category}", "{service}", "{user}")
filename_fmt = "{id}_{title[:180]}_{num:>02}_{filename[:180]}.{extension}"
archive_fmt = "{service}_{user}_{id}_{num}"
- cookies_domain = ".kemono.su"
+ cookies_domain = ".kemono.cr"
def __init__(self, match):
- domain = match.group(1)
- tld = match.group(2)
- self.category = domain + "party"
- self.root = text.root_from_url(match.group(0))
- self.cookies_domain = ".{}.{}".format(domain, tld)
+ if match[1] == "coomer":
+ self.category = "coomer"
+ self.root = "https://coomer.st"
+ self.cookies_domain = ".coomer.st"
Extractor.__init__(self, match)
def _init(self):
@@ -45,25 +44,33 @@ class KemonopartyExtractor(Extractor):
order = self.config("order-revisions")
self.revisions_reverse = order[0] in ("r", "a") if order else False
- self._prepare_ddosguard_cookies()
- self._find_inline = re.compile(
- r'src="(?:https?://(?:kemono|coomer)\.(?:su|party))?(/inline/[^"]+'
+ self._find_inline = util.re(
+ r'src="(?:https?://(?:kemono\.cr|coomer\.st))?(/inline/[^"]+'
r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall
self._json_dumps = json.JSONEncoder(
ensure_ascii=False, check_circular=False,
sort_keys=True, separators=(",", ":")).encode
def items(self):
- find_hash = re.compile(HASH_PATTERN).match
+ find_hash = util.re(HASH_PATTERN).match
generators = self._build_file_generators(self.config("files"))
announcements = True if self.config("announcements") else None
archives = True if self.config("archives") else False
comments = True if self.config("comments") else False
- duplicates = True if self.config("duplicates") else False
dms = True if self.config("dms") else None
max_posts = self.config("max-posts")
creator_info = {} if self.config("metadata", True) else None
- exts_archive = {"zip", "rar", "7z"}
+ exts_archive = util.EXTS_ARCHIVE
+
+ if duplicates := self.config("duplicates"):
+ if isinstance(duplicates, str):
+ duplicates = set(duplicates.split(","))
+ elif isinstance(duplicates, (list, tuple)):
+ duplicates = set(duplicates)
+ else:
+ duplicates = {"file", "attachment", "inline"}
+ else:
+ duplicates = ()
# prevent files from being sent with gzip compression
headers = {"Accept-Encoding": "identity"}
@@ -75,8 +82,8 @@ class KemonopartyExtractor(Extractor):
posts = self._revisions(posts)
for post in posts:
- headers["Referer"] = "{}/{}/user/{}/post/{}".format(
- self.root, post["service"], post["user"], post["id"])
+ headers["Referer"] = (f"{self.root}/{post['service']}/user/"
+ f"{post['user']}/post/{post['id']}")
post["_http_headers"] = headers
post["date"] = self._parse_datetime(
post.get("published") or post.get("added") or "")
@@ -84,7 +91,7 @@ class KemonopartyExtractor(Extractor):
creator_id = post["user"]
if creator_info is not None:
- key = "{}_{}".format(service, creator_id)
+ key = f"{service}_{creator_id}"
if key not in creator_info:
creator = creator_info[key] = self.api.creator_profile(
service, creator_id)
@@ -126,14 +133,13 @@ class KemonopartyExtractor(Extractor):
if "\\" in url:
file["path"] = url = url.replace("\\", "/")
- match = find_hash(url)
- if match:
- file["hash"] = hash = match.group(1)
- if not duplicates:
- if hash in hashes:
- self.log.debug("Skipping %s (duplicate)", url)
- continue
- hashes.add(hash)
+ if match := find_hash(url):
+ file["hash"] = hash = match[1]
+ if file["type"] not in duplicates and hash in hashes:
+ self.log.debug("Skipping %s %s (duplicate)",
+ file["type"], url)
+ continue
+ hashes.add(hash)
else:
file["hash"] = hash = ""
@@ -153,7 +159,7 @@ class KemonopartyExtractor(Extractor):
file["type"] = "archive"
if archives:
try:
- data = self.api.file(file["hash"])
+ data = self.api.file(hash)
data.update(file)
post_archives.append(data)
except Exception as exc:
@@ -306,29 +312,31 @@ def _validate(response):
response.content != b"not found")
-class KemonopartyUserExtractor(KemonopartyExtractor):
- """Extractor for all posts from a kemono.su user listing"""
+class KemonoUserExtractor(KemonoExtractor):
+ """Extractor for all posts from a kemono.cr user listing"""
subcategory = "user"
pattern = USER_PATTERN + r"/?(?:\?([^#]+))?(?:$|\?|#)"
- example = "https://kemono.su/SERVICE/user/12345"
+ example = "https://kemono.cr/SERVICE/user/12345"
def __init__(self, match):
- self.subcategory = match.group(3)
- KemonopartyExtractor.__init__(self, match)
+ self.subcategory = match[3]
+ KemonoExtractor.__init__(self, match)
def posts(self):
+ _, _, service, creator_id, query = self.groups
+ params = text.parse_query(query)
+ tag = params.get("tag")
+
endpoint = self.config("endpoint")
- if endpoint == "legacy":
- endpoint = self.api.creator_posts_legacy
- elif endpoint == "legacy+":
+ if endpoint == "legacy+":
endpoint = self._posts_legacy_plus
+ elif endpoint == "legacy" or tag:
+ endpoint = self.api.creator_posts_legacy
else:
endpoint = self.api.creator_posts
- _, _, service, creator_id, query = self.groups
- params = text.parse_query(query)
return endpoint(service, creator_id,
- params.get("o"), params.get("q"), params.get("tag"))
+ params.get("o"), params.get("q"), tag)
def _posts_legacy_plus(self, service, creator_id,
offset=0, query=None, tags=None):
@@ -338,11 +346,11 @@ class KemonopartyUserExtractor(KemonopartyExtractor):
service, creator_id, post["id"])["post"]
-class KemonopartyPostsExtractor(KemonopartyExtractor):
- """Extractor for kemono.su post listings"""
+class KemonoPostsExtractor(KemonoExtractor):
+ """Extractor for kemono.cr post listings"""
subcategory = "posts"
pattern = BASE_PATTERN + r"/posts()()(?:/?\?([^#]+))?"
- example = "https://kemono.su/posts"
+ example = "https://kemono.cr/posts"
def posts(self):
params = text.parse_query(self.groups[4])
@@ -350,15 +358,15 @@ class KemonopartyPostsExtractor(KemonopartyExtractor):
params.get("o"), params.get("q"), params.get("tag"))
-class KemonopartyPostExtractor(KemonopartyExtractor):
- """Extractor for a single kemono.su post"""
+class KemonoPostExtractor(KemonoExtractor):
+ """Extractor for a single kemono.cr post"""
subcategory = "post"
pattern = USER_PATTERN + r"/post/([^/?#]+)(/revisions?(?:/(\d*))?)?"
- example = "https://kemono.su/SERVICE/user/12345/post/12345"
+ example = "https://kemono.cr/SERVICE/user/12345/post/12345"
def __init__(self, match):
- self.subcategory = match.group(3)
- KemonopartyExtractor.__init__(self, match)
+ self.subcategory = match[3]
+ KemonoExtractor.__init__(self, match)
def posts(self):
_, _, service, creator_id, post_id, revision, revision_id = self.groups
@@ -379,62 +387,57 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
raise exception.NotFoundError("revision")
-class KemonopartyDiscordExtractor(KemonopartyExtractor):
- """Extractor for kemono.su discord servers"""
+class KemonoDiscordExtractor(KemonoExtractor):
+ """Extractor for kemono.cr discord servers"""
subcategory = "discord"
- directory_fmt = ("{category}", "discord", "{server}",
- "{channel_name|channel}")
+ directory_fmt = ("{category}", "discord",
+ "{server_id} {server}", "{channel_id} {channel}")
filename_fmt = "{id}_{num:>02}_{filename}.{extension}"
- archive_fmt = "discord_{server}_{id}_{num}"
- pattern = (BASE_PATTERN + r"/discord/server/(\d+)"
- r"(?:/(?:channel/)?(\d+)(?:#(.+))?|#(.+))")
- example = "https://kemono.su/discord/server/12345/12345"
+ archive_fmt = "discord_{server_id}_{id}_{num}"
+ pattern = BASE_PATTERN + r"/discord/server/(\d+)[/#](?:channel/)?(\d+)"
+ example = "https://kemono.cr/discord/server/12345/12345"
def items(self):
- self._prepare_ddosguard_cookies()
- _, _, server_id, channel_id, channel_name, channel = self.groups
-
- if channel_id is None:
- if channel.isdecimal() and len(channel) >= 16:
- key = "id"
- else:
- key = "name"
- else:
- key = "id"
- channel = channel_id
+ _, _, server_id, channel_id = self.groups
- if not channel_name or not channel_id:
- for ch in self.api.discord_server(server_id):
- if ch[key] == channel:
- break
- else:
- raise exception.NotFoundError("channel")
- channel_id = ch["id"]
- channel_name = ch["name"]
+ try:
+ server, channels = discord_server_info(self, server_id)
+ channel = channels[channel_id]
+ except Exception:
+ raise exception.NotFoundError("channel")
+
+ data = {
+ "server" : server["name"],
+ "server_id" : server["id"],
+ "channel" : channel["name"],
+ "channel_id" : channel["id"],
+ "channel_nsfw" : channel["is_nsfw"],
+ "channel_type" : channel["type"],
+ "channel_topic": channel["topic"],
+ "parent_id" : channel["parent_channel_id"],
+ }
- find_inline = re.compile(
+ find_inline = util.re(
r"https?://(?:cdn\.discordapp.com|media\.discordapp\.net)"
r"(/[A-Za-z0-9-._~:/?#\[\]@!$&'()*+,;%=]+)").findall
- find_hash = re.compile(HASH_PATTERN).match
+ find_hash = util.re(HASH_PATTERN).match
posts = self.api.discord_channel(channel_id)
- max_posts = self.config("max-posts")
- if max_posts:
+ if max_posts := self.config("max-posts"):
posts = itertools.islice(posts, max_posts)
for post in posts:
files = []
- append = files.append
for attachment in post["attachments"]:
match = find_hash(attachment["path"])
- attachment["hash"] = match.group(1) if match else ""
+ attachment["hash"] = match[1] if match else ""
attachment["type"] = "attachment"
- append(attachment)
+ files.append(attachment)
for path in find_inline(post["content"] or ""):
- append({"path": "https://cdn.discordapp.com" + path,
- "name": path, "type": "inline", "hash": ""})
+ files.append({"path": "https://cdn.discordapp.com" + path,
+ "name": path, "type": "inline", "hash": ""})
- post["channel_name"] = channel_name
+ post.update(data)
post["date"] = self._parse_datetime(post["published"])
post["count"] = len(files)
yield Message.Directory, post
@@ -455,28 +458,40 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
yield Message.Url, url, post
-class KemonopartyDiscordServerExtractor(KemonopartyExtractor):
+class KemonoDiscordServerExtractor(KemonoExtractor):
subcategory = "discord-server"
pattern = BASE_PATTERN + r"/discord/server/(\d+)$"
- example = "https://kemono.su/discord/server/12345"
+ example = "https://kemono.cr/discord/server/12345"
def items(self):
server_id = self.groups[2]
- for channel in self.api.discord_server(server_id):
- url = "{}/discord/server/{}/{}#{}".format(
- self.root, server_id, channel["id"], channel["name"])
- channel["_extractor"] = KemonopartyDiscordExtractor
- yield Message.Queue, url, channel
-
-
-class KemonopartyFavoriteExtractor(KemonopartyExtractor):
- """Extractor for kemono.su favorites"""
+ server, channels = discord_server_info(self, server_id)
+ for channel in channels.values():
+ url = (f"{self.root}/discord/server/{server_id}/"
+ f"{channel['id']}#{channel['name']}")
+ yield Message.Queue, url, {
+ "server" : server,
+ "channel" : channel,
+ "_extractor": KemonoDiscordExtractor,
+ }
+
+
+@memcache(keyarg=1)
+def discord_server_info(extr, server_id):
+ server = extr.api.discord_server(server_id)
+ return server, {
+ channel["id"]: channel
+ for channel in server.pop("channels")
+ }
+
+
+class KemonoFavoriteExtractor(KemonoExtractor):
+ """Extractor for kemono.cr favorites"""
subcategory = "favorite"
pattern = BASE_PATTERN + r"/(?:account/)?favorites()()(?:/?\?([^#]+))?"
- example = "https://kemono.su/account/favorites/artists"
+ example = "https://kemono.cr/account/favorites/artists"
def items(self):
- self._prepare_ddosguard_cookies()
self.login()
params = text.parse_query(self.groups[4])
@@ -496,13 +511,11 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor):
for user in users:
service = user["service"]
if service == "discord":
- user["_extractor"] = KemonopartyDiscordServerExtractor
- url = "{}/discord/server/{}".format(
- self.root, user["id"])
+ user["_extractor"] = KemonoDiscordServerExtractor
+ url = f"{self.root}/discord/server/{user['id']}"
else:
- user["_extractor"] = KemonopartyUserExtractor
- url = "{}/{}/user/{}".format(
- self.root, service, user["id"])
+ user["_extractor"] = KemonoUserExtractor
+ url = f"{self.root}/{service}/user/{user['id']}"
yield Message.Queue, url, user
elif type == "post":
@@ -514,16 +527,52 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor):
reverse=(order == "desc"))
for post in posts:
- post["_extractor"] = KemonopartyPostExtractor
- url = "{}/{}/user/{}/post/{}".format(
- self.root, post["service"], post["user"], post["id"])
+ post["_extractor"] = KemonoPostExtractor
+ url = (f"{self.root}/{post['service']}/user/"
+ f"{post['user']}/post/{post['id']}")
yield Message.Queue, url, post
+class KemonoArtistsExtractor(KemonoExtractor):
+ """Extractor for kemono artists"""
+ subcategory = "artists"
+ pattern = BASE_PATTERN + r"/artists(?:\?([^#]+))?"
+ example = "https://kemono.cr/artists"
+
+ def items(self):
+ params = text.parse_query(self.groups[2])
+ users = self.api.creators()
+
+ if params.get("service"):
+ service = params["service"].lower()
+ users = [user for user in users
+ if user["service"] == service]
+
+ if params.get("q"):
+ q = params["q"].lower()
+ users = [user for user in users
+ if q in user["name"].lower()]
+
+ sort = params.get("sort_by") or "favorited"
+ order = params.get("order") or "desc"
+ users.sort(key=lambda user: user[sort] or util.NONE,
+ reverse=(order != "asc"))
+
+ for user in users:
+ service = user["service"]
+ if service == "discord":
+ user["_extractor"] = KemonoDiscordServerExtractor
+ url = f"{self.root}/discord/server/{user['id']}"
+ else:
+ user["_extractor"] = KemonoUserExtractor
+ url = f"{self.root}/{service}/user/{user['id']}"
+ yield Message.Queue, url, user
+
+
class KemonoAPI():
"""Interface for the Kemono API v1.1.0
- https://kemono.su/documentation/api
+ https://kemono.cr/documentation/api
"""
def __init__(self, extractor):
@@ -539,62 +588,68 @@ class KemonoAPI():
endpoint = "/file/" + file_hash
return self._call(endpoint)
+ def creators(self):
+ endpoint = "/creators.txt"
+ return self._call(endpoint)
+
def creator_posts(self, service, creator_id,
offset=0, query=None, tags=None):
- endpoint = "/{}/user/{}".format(service, creator_id)
+ endpoint = f"/{service}/user/{creator_id}"
params = {"q": query, "tag": tags, "o": offset}
return self._pagination(endpoint, params, 50)
def creator_posts_legacy(self, service, creator_id,
offset=0, query=None, tags=None):
- endpoint = "/{}/user/{}/posts-legacy".format(service, creator_id)
+ endpoint = f"/{service}/user/{creator_id}/posts-legacy"
params = {"o": offset, "tag": tags, "q": query}
return self._pagination(endpoint, params, 50, "results")
def creator_announcements(self, service, creator_id):
- endpoint = "/{}/user/{}/announcements".format(service, creator_id)
+ endpoint = f"/{service}/user/{creator_id}/announcements"
return self._call(endpoint)
def creator_dms(self, service, creator_id):
- endpoint = "/{}/user/{}/dms".format(service, creator_id)
+ endpoint = f"/{service}/user/{creator_id}/dms"
return self._call(endpoint)
def creator_fancards(self, service, creator_id):
- endpoint = "/{}/user/{}/fancards".format(service, creator_id)
+ endpoint = f"/{service}/user/{creator_id}/fancards"
return self._call(endpoint)
def creator_post(self, service, creator_id, post_id):
- endpoint = "/{}/user/{}/post/{}".format(service, creator_id, post_id)
+ endpoint = f"/{service}/user/{creator_id}/post/{post_id}"
return self._call(endpoint)
def creator_post_comments(self, service, creator_id, post_id):
- endpoint = "/{}/user/{}/post/{}/comments".format(
- service, creator_id, post_id)
+ endpoint = f"/{service}/user/{creator_id}/post/{post_id}/comments"
return self._call(endpoint)
def creator_post_revisions(self, service, creator_id, post_id):
- endpoint = "/{}/user/{}/post/{}/revisions".format(
- service, creator_id, post_id)
+ endpoint = f"/{service}/user/{creator_id}/post/{post_id}/revisions"
return self._call(endpoint)
def creator_profile(self, service, creator_id):
- endpoint = "/{}/user/{}/profile".format(service, creator_id)
+ endpoint = f"/{service}/user/{creator_id}/profile"
return self._call(endpoint)
def creator_links(self, service, creator_id):
- endpoint = "/{}/user/{}/links".format(service, creator_id)
+ endpoint = f"/{service}/user/{creator_id}/links"
return self._call(endpoint)
def creator_tags(self, service, creator_id):
- endpoint = "/{}/user/{}/tags".format(service, creator_id)
+ endpoint = f"/{service}/user/{creator_id}/tags"
return self._call(endpoint)
def discord_channel(self, channel_id):
- endpoint = "/discord/channel/{}".format(channel_id)
+ endpoint = f"/discord/channel/{channel_id}"
return self._pagination(endpoint, {}, 150)
+ def discord_channel_lookup(self, server_id):
+ endpoint = f"/discord/channel/lookup/{server_id}"
+ return self._call(endpoint)
+
def discord_server(self, server_id):
- endpoint = "/discord/channel/lookup/{}".format(server_id)
+ endpoint = f"/discord/server/{server_id}"
return self._call(endpoint)
def account_favorites(self, type):
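
Note: the reworked "duplicates" option accepts a bool, a comma-separated
string, or a list of file types whose duplicates should be kept. A standalone
sketch of the normalization performed in items():

    def normalize_duplicates(value):
        if not value:
            return ()                     # deduplicate everything
        if isinstance(value, str):
            return set(value.split(","))  # e.g. "file,inline"
        if isinstance(value, (list, tuple)):
            return set(value)
        return {"file", "attachment", "inline"}  # True: keep all duplicates

    print(normalize_duplicates("file,inline"))  # {'file', 'inline'}
    print(normalize_duplicates(True))   # {'file', 'attachment', 'inline'}
    print(normalize_duplicates(False))  # ()
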
diff --git a/gallery_dl/extractor/khinsider.py b/gallery_dl/extractor/khinsider.py
index e779e97..9c33d4f 100644
--- a/gallery_dl/extractor/khinsider.py
+++ b/gallery_dl/extractor/khinsider.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2023 Mike Fährmann
+# Copyright 2016-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -26,7 +26,7 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.album = match.group(1)
+ self.album = match[1]
def items(self):
url = self.root + "/game-soundtracks/album/" + self.album
diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py
index 89a1b5e..816bc3d 100644
--- a/gallery_dl/extractor/komikcast.py
+++ b/gallery_dl/extractor/komikcast.py
@@ -1,36 +1,32 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2023 Mike Fährmann
+# Copyright 2018-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extractors for https://komikcast.la/"""
+"""Extractors for https://komikcast.li/"""
from .common import ChapterExtractor, MangaExtractor
-from .. import text
-import re
+from .. import text, util
BASE_PATTERN = (r"(?:https?://)?(?:www\.)?"
- r"komikcast\.(?:la|cz|lol|site|mo?e|com)")
+ r"komikcast\d*\.(?:l(?:i|a|ol)|com|cz|site|mo?e)")
class KomikcastBase():
"""Base class for komikcast extractors"""
category = "komikcast"
- root = "https://komikcast.la"
+ root = "https://komikcast.li"
- @staticmethod
- def parse_chapter_string(chapter_string, data=None):
+ def parse_chapter_string(self, chapter_string, data=None):
"""Parse 'chapter_string' value and add its info to 'data'"""
- if not data:
+ if data is None:
data = {}
- match = re.match(
- r"(?:(.*) Chapter )?0*(\d+)([^ ]*)(?: (?:- )?(.+))?",
- text.unescape(chapter_string),
- )
+ pattern = util.re(r"(?:(.*) Chapter )?0*(\d+)([^ ]*)(?: (?:- )?(.+))?")
+ match = pattern.match(text.unescape(chapter_string))
manga, chapter, data["chapter_minor"], title = match.groups()
if manga:
@@ -49,27 +45,27 @@ class KomikcastBase():
class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
"""Extractor for komikcast manga chapters"""
pattern = BASE_PATTERN + r"(/chapter/[^/?#]+/)"
- example = "https://komikcast.la/chapter/TITLE/"
+ example = "https://komikcast.li/chapter/TITLE/"
def metadata(self, page):
info = text.extr(page, "<title>", " - Komikcast<")
return self.parse_chapter_string(info)
- @staticmethod
- def images(page):
+ def images(self, page):
readerarea = text.extr(
page, '<div class="main-reading-area', '</div')
+ pattern = util.re(r"<img[^>]* src=[\"']([^\"']+)")
return [
(text.unescape(url), None)
- for url in re.findall(r"<img[^>]* src=[\"']([^\"']+)", readerarea)
+ for url in pattern.findall(readerarea)
]
class KomikcastMangaExtractor(KomikcastBase, MangaExtractor):
"""Extractor for komikcast manga"""
chapterclass = KomikcastChapterExtractor
- pattern = BASE_PATTERN + r"(/(?:komik/)?[^/?#]+)/?$"
- example = "https://komikcast.la/komik/TITLE"
+ pattern = BASE_PATTERN + r"(/(?:komik/)?[^/?#]+/?)$"
+ example = "https://komikcast.li/komik/TITLE"
def chapters(self, page):
results = []
@@ -84,8 +80,7 @@ class KomikcastMangaExtractor(KomikcastBase, MangaExtractor):
results.append((url, data.copy()))
return results
- @staticmethod
- def metadata(page):
+ def metadata(self, page):
"""Return a dict with general metadata"""
manga , pos = text.extract(page, "<title>" , " - Komikcast<")
genres, pos = text.extract(
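
Note: for reference, what the parse_chapter_string() pattern captures
(plain re.compile here; util.re is assumed to be a caching wrapper around it):

    import re

    pattern = re.compile(r"(?:(.*) Chapter )?0*(\d+)([^ ]*)(?: (?:- )?(.+))?")
    print(pattern.match("One Piece Chapter 0042.5 - Extra Title").groups())
    # ('One Piece', '42', '.5', 'Extra Title')
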
diff --git a/gallery_dl/extractor/leakgallery.py b/gallery_dl/extractor/leakgallery.py
new file mode 100644
index 0000000..c609891
--- /dev/null
+++ b/gallery_dl/extractor/leakgallery.py
@@ -0,0 +1,141 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://leakgallery.com"""
+
+from .common import Extractor, Message
+from .. import text, util
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?leakgallery\.com"
+
+
+class LeakgalleryExtractor(Extractor):
+ category = "leakgallery"
+ directory_fmt = ("{category}", "{creator}")
+ filename_fmt = "{id}_{filename}.{extension}"
+ archive_fmt = "{creator}_{id}"
+
+ def _yield_media_items(self, medias, creator=None):
+ seen = set()
+ for media in medias:
+ path = media["file_path"]
+ if path in seen:
+ continue
+ seen.add(path)
+
+ if creator is None:
+ try:
+ media["creator"] = \
+ media["profile"]["username"] or "unknown"
+ except Exception:
+ media["creator"] = "unknown"
+ else:
+ media["creator"] = creator
+
+ media["url"] = url = f"https://cdn.leakgallery.com/{path}"
+ text.nameext_from_url(url, media)
+ yield Message.Directory, media
+ yield Message.Url, url, media
+
+ def _pagination(self, type, base, params=None, creator=None, pnum=1):
+ while True:
+ try:
+ data = self.request_json(f"{base}{pnum}", params=params)
+
+ if not data:
+ return
+ if "medias" in data:
+ data = data["medias"]
+ if not data or not isinstance(data, list):
+ return
+
+ yield from self._yield_media_items(data, creator)
+ pnum += 1
+ except Exception as exc:
+ self.log.error("Failed to retrieve %s page %s: %s",
+ type, pnum, exc)
+ return
+
+
+class LeakgalleryUserExtractor(LeakgalleryExtractor):
+ """Extractor for profile posts on leakgallery.com"""
+ subcategory = "user"
+ pattern = (
+ BASE_PATTERN +
+ r"/(?!trending-medias|most-liked|random/medias)([^/?#]+)"
+ r"(?:/(Photos|Videos|All))?"
+ r"(?:/(MostRecent|MostViewed|MostLiked))?/?$"
+ )
+ example = "https://leakgallery.com/creator"
+
+ def items(self):
+ creator, mtype, msort = self.groups
+ base = f"https://api.leakgallery.com/profile/{creator}/"
+ params = {"type": mtype or "All", "sort": msort or "MostRecent"}
+ return self._pagination(creator, base, params, creator)
+
+
+class LeakgalleryTrendingExtractor(LeakgalleryExtractor):
+ """Extractor for trending posts on leakgallery.com"""
+ subcategory = "trending"
+ pattern = BASE_PATTERN + r"/trending-medias(?:/([\w-]+))?"
+ example = "https://leakgallery.com/trending-medias/Week"
+
+ def items(self):
+ period = self.groups[0] or "Last-Hour"
+ base = f"https://api.leakgallery.com/popular/media/{period}/"
+ return self._pagination("trending", base)
+
+
+class LeakgalleryMostlikedExtractor(LeakgalleryExtractor):
+ """Extractor for most liked posts on leakgallery.com"""
+ subcategory = "mostliked"
+ pattern = BASE_PATTERN + r"/most-liked"
+ example = "https://leakgallery.com/most-liked"
+
+ def items(self):
+ base = "https://api.leakgallery.com/most-liked/"
+ return self._pagination("most-liked", base)
+
+
+class LeakgalleryPostExtractor(LeakgalleryExtractor):
+ """Extractor for individual posts on leakgallery.com"""
+ subcategory = "post"
+ pattern = BASE_PATTERN + r"/([^/?#]+)/(\d+)"
+ example = "https://leakgallery.com/CREATOR/12345"
+
+ def items(self):
+ creator, post_id = self.groups
+ url = f"https://leakgallery.com/{creator}/{post_id}"
+
+ try:
+ page = self.request(url).text
+ video_urls = text.re(
+ r"https://cdn\.leakgallery\.com/content[^/?#]*/"
+ r"(?:compressed_)?watermark_[^\"]+\."
+ r"(?:mp4|mov|m4a|webm)"
+ ).findall(page)
+ image_urls = text.re(
+ r"https://cdn\.leakgallery\.com/content[^/?#]*/"
+ r"watermark_[^\"]+\.(?:jpe?g|png)"
+ ).findall(page)
+
+ seen = set()
+ for url in video_urls + image_urls:
+ if url in seen:
+ continue
+ seen.add(url)
+ data = {
+ "id": post_id,
+ "creator": creator,
+ "url": url,
+ }
+ text.nameext_from_url(url, data)
+ yield Message.Directory, data
+ yield Message.Url, url, data
+ except Exception as exc:
+ self.log.error("Failed to extract post page %s/%s: %s",
+ creator, post_id, exc)
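For reference, the user extractor above walks a page-numbered JSON API until it returns an empty result. A standalone sketch of the same loop using plain requests; the endpoint layout and the "medias"/"file_path" field names are taken from the code above, everything else is assumed:

    import requests

    def leakgallery_profile_media(creator, mtype="All", sort="MostRecent"):
        """Yield media dicts for a profile, page by page."""
        base = f"https://api.leakgallery.com/profile/{creator}/"
        pnum = 1
        while True:
            data = requests.get(f"{base}{pnum}", params={
                "type": mtype, "sort": sort}).json()
            if isinstance(data, dict):
                data = data.get("medias") or []
            if not data:
                return
            yield from data
            pnum += 1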
diff --git a/gallery_dl/extractor/lensdump.py b/gallery_dl/extractor/lensdump.py
index 72a6453..c700a29 100644
--- a/gallery_dl/extractor/lensdump.py
+++ b/gallery_dl/extractor/lensdump.py
@@ -37,9 +37,9 @@ class LensdumpAlbumExtractor(LensdumpBase, GalleryExtractor):
def __init__(self, match):
self.gallery_id, query = match.groups()
if query:
- url = "{}/a/{}/?{}".format(self.root, self.gallery_id, query)
+ url = f"{self.root}/a/{self.gallery_id}/?{query}"
else:
- url = "{}/a/{}".format(self.root, self.gallery_id)
+ url = f"{self.root}/a/{self.gallery_id}"
GalleryExtractor.__init__(self, match, url)
def metadata(self, page):
@@ -81,7 +81,7 @@ class LensdumpAlbumsExtractor(LensdumpBase, Extractor):
def items(self):
user, query = self.groups
- url = "{}/{}/".format(self.root, user)
+ url = f"{self.root}/{user}/"
if query:
params = text.parse_query(query)
else:
@@ -105,7 +105,7 @@ class LensdumpImageExtractor(LensdumpBase, Extractor):
def items(self):
key = self.groups[0]
- url = "{}/i/{}".format(self.root, key)
+ url = f"{self.root}/i/{key}"
extr = text.extract_from(self.request(url).text)
data = {
diff --git a/gallery_dl/extractor/lexica.py b/gallery_dl/extractor/lexica.py
index d55d821..6e54847 100644
--- a/gallery_dl/extractor/lexica.py
+++ b/gallery_dl/extractor/lexica.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2023 Mike Fährmann
+# Copyright 2023-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -24,7 +24,7 @@ class LexicaSearchExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.query = match.group(1)
+ self.query = match[1]
self.text = text.unquote(self.query).replace("+", " ")
def items(self):
@@ -43,7 +43,7 @@ class LexicaSearchExtractor(Extractor):
url = self.root + "/api/infinite-prompts"
headers = {
"Accept" : "application/json, text/plain, */*",
- "Referer": "{}/?q={}".format(self.root, self.query),
+ "Referer": f"{self.root}/?q={self.query}",
}
json = {
"text" : self.text,
@@ -54,8 +54,8 @@ class LexicaSearchExtractor(Extractor):
}
while True:
- data = self.request(
- url, method="POST", headers=headers, json=json).json()
+ data = self.request_json(
+ url, method="POST", headers=headers, json=json)
prompts = {
prompt["id"]: prompt
diff --git a/gallery_dl/extractor/lightroom.py b/gallery_dl/extractor/lightroom.py
index 2cbaa97..b557149 100644
--- a/gallery_dl/extractor/lightroom.py
+++ b/gallery_dl/extractor/lightroom.py
@@ -22,7 +22,7 @@ class LightroomGalleryExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.href = match.group(1)
+ self.href = match[1]
def items(self):
# Get config
diff --git a/gallery_dl/extractor/livedoor.py b/gallery_dl/extractor/livedoor.py
index e21659f..ab3be69 100644
--- a/gallery_dl/extractor/livedoor.py
+++ b/gallery_dl/extractor/livedoor.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -22,12 +22,11 @@ class LivedoorExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.user = match.group(1)
+ self.user = match[1]
def items(self):
for post in self.posts():
- images = self._images(post)
- if images:
+ if images := self._images(post):
yield Message.Directory, {"post": post}
for image in images:
yield Message.Url, image["url"], image
@@ -87,7 +86,7 @@ class LivedoorBlogExtractor(LivedoorExtractor):
example = "http://blog.livedoor.jp/USER/"
def posts(self):
- url = "{}/{}".format(self.root, self.user)
+ url = f"{self.root}/{self.user}"
while url:
extr = text.extract_from(self.request(url).text)
while True:
@@ -108,11 +107,10 @@ class LivedoorPostExtractor(LivedoorExtractor):
def __init__(self, match):
LivedoorExtractor.__init__(self, match)
- self.post_id = match.group(2)
+ self.post_id = match[2]
def posts(self):
- url = "{}/{}/archives/{}.html".format(
- self.root, self.user, self.post_id)
+ url = f"{self.root}/{self.user}/archives/{self.post_id}.html"
extr = text.extract_from(self.request(url).text)
data = extr('<rdf:RDF', '</rdf:RDF>')
body = extr('class="article-body-inner">', 'class="article-footer">')
diff --git a/gallery_dl/extractor/lofter.py b/gallery_dl/extractor/lofter.py
index b92a6ff..c20d983 100644
--- a/gallery_dl/extractor/lofter.py
+++ b/gallery_dl/extractor/lofter.py
@@ -136,7 +136,7 @@ class LofterAPI():
if info["meta"]["status"] != 200:
self.extractor.log.debug("Server response: %s", info)
- raise exception.StopExtraction("API request failed")
+ raise exception.AbortExtraction("API request failed")
return info["response"]
diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py
index 6a9f633..5233033 100644
--- a/gallery_dl/extractor/lolisafe.py
+++ b/gallery_dl/extractor/lolisafe.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2021-2023 Mike Fährmann
+# Copyright 2021-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -63,7 +63,7 @@ class LolisafeAlbumExtractor(LolisafeExtractor):
file["filename"] = file["name"] + "-" + fid
elif "id" in file:
file["name"] = file["filename"]
- file["filename"] = "{}-{}".format(file["name"], file["id"])
+ file["filename"] = f"{file['name']}-{file['id']}"
else:
file["name"], sep, file["id"] = \
file["filename"].rpartition("-")
@@ -71,8 +71,8 @@ class LolisafeAlbumExtractor(LolisafeExtractor):
yield Message.Url, url, file
def fetch_album(self, album_id):
- url = "{}/api/album/get/{}".format(self.root, album_id)
- data = self.request(url).json()
+ url = f"{self.root}/api/album/get/{album_id}"
+ data = self.request_json(url)
return data["files"], {
"album_id" : self.album_id,
diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py
index 8e73964..0cbc523 100644
--- a/gallery_dl/extractor/luscious.py
+++ b/gallery_dl/extractor/luscious.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2023 Mike Fährmann
+# Copyright 2016-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -26,15 +26,15 @@ class LusciousExtractor(Extractor):
"variables" : variables,
}
response = self.request(
- "{}/graphql/nobatch/?operationName={}".format(self.root, op),
+ f"{self.root}/graphql/nobatch/?operationName={op}",
method="POST", json=data, fatal=False,
)
if response.status_code >= 400:
self.log.debug("Server response: %s", response.text)
- raise exception.StopExtraction(
- "GraphQL query failed ('%s %s')",
- response.status_code, response.reason)
+ raise exception.AbortExtraction(
+ f"GraphQL query failed "
+ f"('{response.status_code} {response.reason}')")
return response.json()["data"]
@@ -51,7 +51,7 @@ class LusciousAlbumExtractor(LusciousExtractor):
def __init__(self, match):
LusciousExtractor.__init__(self, match)
- self.album_id = match.group(1)
+ self.album_id = match[1]
def _init(self):
self.gif = self.config("gif", False)
@@ -280,7 +280,7 @@ class LusciousSearchExtractor(LusciousExtractor):
def __init__(self, match):
LusciousExtractor.__init__(self, match)
- self.query = match.group(1)
+ self.query = match[1]
def items(self):
query = text.parse_query(self.query)
diff --git a/gallery_dl/extractor/lynxchan.py b/gallery_dl/extractor/lynxchan.py
index 85b3fef..fde2df5 100644
--- a/gallery_dl/extractor/lynxchan.py
+++ b/gallery_dl/extractor/lynxchan.py
@@ -42,22 +42,15 @@ class LynxchanThreadExtractor(LynxchanExtractor):
pattern = BASE_PATTERN + r"/([^/?#]+)/res/(\d+)"
example = "https://endchan.org/a/res/12345.html"
- def __init__(self, match):
- LynxchanExtractor.__init__(self, match)
- index = match.lastindex
- self.board = match.group(index-1)
- self.thread = match.group(index)
-
def items(self):
- url = "{}/{}/res/{}.json".format(self.root, self.board, self.thread)
- thread = self.request(url).json()
+ url = f"{self.root}/{self.groups[-2]}/res/{self.groups[-1]}.json"
+ thread = self.request_json(url)
thread["postId"] = thread["threadId"]
posts = thread.pop("posts", ())
yield Message.Directory, thread
for post in itertools.chain((thread,), posts):
- files = post.pop("files", ())
- if files:
+ if files := post.pop("files", ()):
thread.update(post)
for num, file in enumerate(files):
file.update(thread)
@@ -73,14 +66,10 @@ class LynxchanBoardExtractor(LynxchanExtractor):
pattern = BASE_PATTERN + r"/([^/?#]+)(?:/index|/catalog|/\d+|/?$)"
example = "https://endchan.org/a/"
- def __init__(self, match):
- LynxchanExtractor.__init__(self, match)
- self.board = match.group(match.lastindex)
-
def items(self):
- url = "{}/{}/catalog.json".format(self.root, self.board)
- for thread in self.request(url).json():
- url = "{}/{}/res/{}.html".format(
- self.root, self.board, thread["threadId"])
+ board = self.groups[-1]
+ url = f"{self.root}/{board}/catalog.json"
+ for thread in self.request_json(url):
+ url = f"{self.root}/{board}/res/{thread['threadId']}.html"
thread["_extractor"] = LynxchanThreadExtractor
yield Message.Queue, url, thread
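The rewrite above drops the __init__ overrides in favor of indexing self.groups from the end: BASE_PATTERN is built by LynxchanExtractor.update() and contributes a variable number of leading instance groups, so board and thread are reliably the final two. A simplified sketch with an assumed single-instance pattern:

    import re

    # one leading group for the instance domain (assumed), then board, thread
    pattern = re.compile(
        r"(?:https?://)?(endchan\.(?:org|net))/([^/?#]+)/res/(\d+)")
    groups = pattern.match("https://endchan.org/a/res/12345.html").groups()
    board, thread = groups[-2], groups[-1]
    print(board, thread)  # a 12345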
diff --git a/gallery_dl/extractor/madokami.py b/gallery_dl/extractor/madokami.py
new file mode 100644
index 0000000..e87dbba
--- /dev/null
+++ b/gallery_dl/extractor/madokami.py
@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://manga.madokami.al/"""
+
+from .common import Extractor, Message
+from .. import text, util, exception
+
+BASE_PATTERN = r"(?:https?://)?manga\.madokami\.al"
+
+
+class MadokamiExtractor(Extractor):
+ """Base class for madokami extractors"""
+ category = "madokami"
+ root = "https://manga.madokami.al"
+
+
+class MadokamiMangaExtractor(MadokamiExtractor):
+ """Extractor for madokami manga"""
+ subcategory = "manga"
+ directory_fmt = ("{category}", "{manga}")
+ archive_fmt = "{chapter_id}"
+ pattern = rf"{BASE_PATTERN}/Manga/(\w/\w{{2}}/\w{{4}}/.+)"
+ example = "https://manga.madokami.al/Manga/A/AB/ABCD/ABCDE_TITLE"
+
+ def items(self):
+ username, password = self._get_auth_info()
+ if not username:
+ raise exception.AuthRequired("'username' & 'password'")
+ self.session.auth = util.HTTPBasicAuth(username, password)
+
+ url = f"{self.root}/Manga/{self.groups[0]}"
+ page = self.request(url).text
+ extr = text.extract_from(page)
+
+ chapters = []
+ while True:
+ if not (cid := extr('<tr data-record="', '"')):
+ break
+ chapters.append({
+ "chapter_id": text.parse_int(cid),
+ "path": text.unescape(extr('href="', '"')),
+ "chapter_string": text.unescape(extr(">", "<")),
+ "size": text.parse_bytes(extr("<td>", "</td>")),
+ "date": text.parse_datetime(
+ extr("<td>", "</td>").strip(), "%Y-%m-%d %H:%M"),
+ })
+
+ if self.config("chapter-reverse"):
+ chapters.reverse()
+
+ self.kwdict.update({
+ "manga" : text.unescape(extr('itemprop="name">', "<")),
+ "year" : text.parse_int(extr(
+ 'itemprop="datePublished" content="', "-")),
+ "author": text.split_html(extr('<p class="staff', "</p>"))[1::2],
+ "genre" : text.split_html(extr("<h3>Genres</h3>", "</div>")),
+ "tags" : text.split_html(extr("<h3>Tags</h3>", "</div>")),
+ "complete": extr('span class="scanstatus">', "<").lower() == "yes",
+ })
+
+ search_chstr = text.re(
+ r"(?i)((?:v(?:ol)?\.?\s*(\d+))"
+ r"(?:\s+ch?\.?\s*(\d+)(?:-(\d+))?)?)").search
+ search_chstr_min = text.re(
+ r"(?i)(ch?\.?\s*(\d+)(?:-(\d+))?)").search
+
+ for ch in chapters:
+
+ chstr = ch["chapter_string"]
+ if match := search_chstr(chstr):
+ ch["chapter_string"], volume, chapter, end = match.groups()
+ ch["volume"] = text.parse_int(volume)
+ ch["chapter"] = text.parse_int(chapter)
+ ch["chapter_end"] = text.parse_int(end)
+ elif match := search_chstr_min(chstr):
+ ch["chapter_string"], chapter, end = match.groups()
+ ch["volume"] = 0
+ ch["chapter"] = text.parse_int(chapter)
+ ch["chapter_end"] = text.parse_int(end)
+ else:
+ ch["volume"] = ch["chapter"] = ch["chapter_end"] = 0
+
+ url = f"{self.root}{ch['path']}"
+ text.nameext_from_url(url, ch)
+
+ yield Message.Directory, ch
+ yield Message.Url, url, ch
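The two chapter-string patterns above implement a volume-first parse with a chapter-only fallback. A quick check of what each one captures (sample strings assumed):

    import re

    search_chstr = re.compile(
        r"(?i)((?:v(?:ol)?\.?\s*(\d+))"
        r"(?:\s+ch?\.?\s*(\d+)(?:-(\d+))?)?)").search
    search_chstr_min = re.compile(
        r"(?i)(ch?\.?\s*(\d+)(?:-(\d+))?)").search

    for chstr in ("Vol.03 Ch.12-13", "c045", "Oneshot"):
        if m := search_chstr(chstr):
            print(m.groups())  # ('Vol.03 Ch.12-13', '03', '12', '13')
        elif m := search_chstr_min(chstr):
            print(m.groups())  # ('c045', '045', None)
        else:
            print("no markers")  # falls through for 'Oneshot'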
diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py
index 42a508d..225560d 100644
--- a/gallery_dl/extractor/mangadex.py
+++ b/gallery_dl/extractor/mangadex.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2023 Mike Fährmann
+# Copyright 2018-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -44,7 +44,7 @@ class MangadexExtractor(Extractor):
def _items_manga(self):
data = {"_extractor": MangadexMangaExtractor}
for manga in self.manga():
- url = "{}/title/{}".format(self.root, manga["id"])
+ url = f"{self.root}/title/{manga['id']}"
yield Message.Queue, url, data
def _transform(self, chapter):
@@ -58,8 +58,7 @@ class MangadexExtractor(Extractor):
cattributes = chapter["attributes"]
mattributes = manga["attributes"]
- lang = cattributes.get("translatedLanguage")
- if lang:
+ if lang := cattributes.get("translatedLanguage"):
lang = lang.partition("-")[0]
if cattributes["chapter"]:
@@ -112,16 +111,16 @@ class MangadexChapterExtractor(MangadexExtractor):
data = self._transform(chapter)
if data.get("_external_url") and not data["count"]:
- raise exception.StopExtraction(
- "Chapter %s%s is not available on MangaDex and can instead be "
- "read on the official publisher's website at %s.",
- data["chapter"], data["chapter_minor"], data["_external_url"])
+ raise exception.AbortExtraction(
+ f"Chapter {data['chapter']}{data['chapter_minor']} is not "
+ f"available on MangaDex and can instead be read on the "
+ f"official publisher's website at {data['_external_url']}.")
yield Message.Directory, data
server = self.api.athome_server(self.uuid)
chapter = server["chapter"]
- base = "{}/data/{}/".format(server["baseUrl"], chapter["hash"])
+ base = f"{server['baseUrl']}/data/{chapter['hash']}/"
enum = util.enumerate_reversed if self.config(
"page-reverse") else enumerate
@@ -172,11 +171,11 @@ class MangadexListExtractor(MangadexExtractor):
"/01234567-89ab-cdef-0123-456789abcdef/NAME")
def __init__(self, match):
- MangadexExtractor.__init__(self, match)
- if match.group(2) == "feed":
+ if match[2] == "feed":
self.subcategory = "list-feed"
else:
self.items = self._items_manga
+ MangadexExtractor.__init__(self, match)
def chapters(self):
return self.api.list_feed(self.uuid)
@@ -199,7 +198,7 @@ class MangadexAuthorExtractor(MangadexExtractor):
def items(self):
for manga in self.api.manga_author(self.uuid):
manga["_extractor"] = MangadexMangaExtractor
- url = "{}/title/{}".format(self.root, manga["id"])
+ url = f"{self.root}/title/{manga['id']}"
yield Message.Queue, url, manga
@@ -279,8 +278,7 @@ class MangadexAPI():
@cache(maxage=900, keyarg=1)
def _authenticate_impl_client(self, username, password):
- refresh_token = _refresh_token_cache((username, "personal"))
- if refresh_token:
+ if refresh_token := _refresh_token_cache((username, "personal")):
self.extractor.log.info("Refreshing access token")
data = {
"grant_type" : "refresh_token",
@@ -301,8 +299,8 @@ class MangadexAPI():
self.extractor.log.debug("Using client-id '%s…'", self.client_id[:24])
url = ("https://auth.mangadex.org/realms/mangadex"
"/protocol/openid-connect/token")
- data = self.extractor.request(
- url, method="POST", data=data, fatal=None).json()
+ data = self.extractor.request_json(
+ url, method="POST", data=data, fatal=None)
try:
access_token = data["access_token"]
@@ -317,8 +315,7 @@ class MangadexAPI():
@cache(maxage=900, keyarg=1)
def _authenticate_impl_legacy(self, username, password):
- refresh_token = _refresh_token_cache(username)
- if refresh_token:
+ if refresh_token := _refresh_token_cache(username):
self.extractor.log.info("Refreshing access token")
url = self.root + "/auth/refresh"
json = {"token": refresh_token}
@@ -328,8 +325,8 @@ class MangadexAPI():
json = {"username": username, "password": password}
self.extractor.log.debug("Using legacy login method")
- data = self.extractor.request(
- url, method="POST", json=json, fatal=None).json()
+ data = self.extractor.request_json(
+ url, method="POST", json=json, fatal=None)
if data.get("result") != "ok":
raise exception.AuthenticationError()
@@ -354,10 +351,10 @@ class MangadexAPI():
self.extractor.wait(until=until)
continue
- msg = ", ".join('{title}: "{detail}"'.format_map(error)
+ msg = ", ".join(f'{error["title"]}: "{error["detail"]}"'
for error in response.json()["errors"])
- raise exception.StopExtraction(
- "%s %s (%s)", response.status_code, response.reason, msg)
+ raise exception.AbortExtraction(
+ f"{response.status_code} {response.reason} ({msg})")
def _pagination_chapters(self, endpoint, params=None, auth=False):
if params is None:
@@ -384,11 +381,12 @@ class MangadexAPI():
ratings = config("ratings")
if ratings is None:
ratings = ("safe", "suggestive", "erotica", "pornographic")
+ elif isinstance(ratings, str):
+ ratings = ratings.split(",")
params["contentRating[]"] = ratings
params["offset"] = 0
- api_params = config("api-parameters")
- if api_params:
+ if api_params := config("api-parameters"):
params.update(api_params)
while True:
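The new elif branch above lets the "ratings" option be given either as a list or as a comma-separated string. A minimal sketch of the normalization:

    def normalize_ratings(ratings):
        """Accept None, a comma-separated string, or a list of names."""
        if ratings is None:
            return ["safe", "suggestive", "erotica", "pornographic"]
        if isinstance(ratings, str):
            return ratings.split(",")
        return list(ratings)

    print(normalize_ratings("safe,suggestive"))  # ['safe', 'suggestive']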
diff --git a/gallery_dl/extractor/mangafox.py b/gallery_dl/extractor/mangafox.py
index 827756a..76f4b7e 100644
--- a/gallery_dl/extractor/mangafox.py
+++ b/gallery_dl/extractor/mangafox.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2017-2023 Mike Fährmann
+# Copyright 2017-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -54,7 +54,7 @@ class MangafoxChapterExtractor(ChapterExtractor):
yield text.ensure_http_scheme(text.unescape(url)), None
pnum += 2
- page = self.request("{}/{}.html".format(self.urlbase, pnum)).text
+ page = self.request(f"{self.urlbase}/{pnum}.html").text
class MangafoxMangaExtractor(MangaExtractor):
diff --git a/gallery_dl/extractor/mangahere.py b/gallery_dl/extractor/mangahere.py
index 8c94f04..151e809 100644
--- a/gallery_dl/extractor/mangahere.py
+++ b/gallery_dl/extractor/mangahere.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2023 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,8 +9,7 @@
"""Extractors for https://www.mangahere.cc/"""
from .common import ChapterExtractor, MangaExtractor
-from .. import text
-import re
+from .. import text, util
class MangahereBase():
@@ -18,7 +17,6 @@ class MangahereBase():
category = "mangahere"
root = "https://www.mangahere.cc"
root_mobile = "https://m.mangahere.cc"
- url_fmt = root_mobile + "/manga/{}/{}.html"
class MangahereChapterExtractor(MangahereBase, ChapterExtractor):
@@ -29,8 +27,8 @@ class MangahereChapterExtractor(MangahereBase, ChapterExtractor):
def __init__(self, match):
self.part, self.volume, self.chapter = match.groups()
- url = self.url_fmt.format(self.part, 1)
- ChapterExtractor.__init__(self, match, url)
+ self.base = f"{self.root_mobile}/manga/{self.part}/"
+ ChapterExtractor.__init__(self, match, f"{self.base}1.html")
def _init(self):
self.session.headers["Referer"] = self.root_mobile + "/"
@@ -65,10 +63,10 @@ class MangahereChapterExtractor(MangahereBase, ChapterExtractor):
url, pos = text.extract(page, ' src="', '"', pos)
yield text.ensure_http_scheme(text.unescape(url)), None
pnum += 2
- page = self.request(self.url_fmt.format(self.part, pnum)).text
+ page = self.request(f"{self.base}{pnum}.html").text
def _get_title(self):
- url = "{}/manga/{}/".format(self.root, self.part)
+ url = f"{self.root}/manga/{self.part}/"
page = self.request(url).text
try:
@@ -104,8 +102,8 @@ class MangahereMangaExtractor(MangahereBase, MangaExtractor):
info, pos = text.extract(page, 'class="title3">', '<', pos)
date, pos = text.extract(page, 'class="title2">', '<', pos)
- match = re.match(
- r"(?:Vol\.0*(\d+) )?Ch\.0*(\d+)(\S*)(?: - (.*))?", info)
+ match = util.re(
+ r"(?:Vol\.0*(\d+) )?Ch\.0*(\d+)(\S*)(?: - (.*))?").match(info)
if match:
volume, chapter, minor, title = match.groups()
else:
diff --git a/gallery_dl/extractor/manganelo.py b/gallery_dl/extractor/manganelo.py
index 5e92aee..a6948e3 100644
--- a/gallery_dl/extractor/manganelo.py
+++ b/gallery_dl/extractor/manganelo.py
@@ -44,7 +44,7 @@ class ManganeloChapterExtractor(ManganeloExtractor, ChapterExtractor):
def __init__(self, match):
ManganeloExtractor.__init__(self, match)
- self.gallery_url = self.root + self.groups[-1]
+ self.page_url = self.root + self.groups[-1]
def metadata(self, page):
extr = text.extract_from(page)
@@ -91,7 +91,7 @@ class ManganeloMangaExtractor(ManganeloExtractor, MangaExtractor):
def __init__(self, match):
ManganeloExtractor.__init__(self, match)
- self.manga_url = self.root + self.groups[-1]
+ self.page_url = self.root + self.groups[-1]
def chapters(self, page):
extr = text.extract_from(page)
diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py
index b11f81d..19aee33 100644
--- a/gallery_dl/extractor/mangapark.py
+++ b/gallery_dl/extractor/mangapark.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2023 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -11,7 +11,6 @@
from .common import ChapterExtractor, Extractor, Message
from .. import text, util, exception
from ..cache import memcache
-import re
BASE_PATTERN = (r"(?:https?://)?(?:www\.)?(?:"
r"(?:manga|comic|read)park\.(?:com|net|org|me|io|to)|"
@@ -22,17 +21,14 @@ BASE_PATTERN = (r"(?:https?://)?(?:www\.)?(?:"
class MangaparkBase():
"""Base class for mangapark extractors"""
category = "mangapark"
- _match_title = None
def _parse_chapter_title(self, title):
- if not self._match_title:
- MangaparkBase._match_title = re.compile(
- r"(?i)"
- r"(?:vol(?:\.|ume)?\s*(\d+)\s*)?"
- r"ch(?:\.|apter)?\s*(\d+)([^\s:]*)"
- r"(?:\s*:\s*(.*))?"
- ).match
- match = self._match_title(title)
+ match = util.re(
+ r"(?i)"
+ r"(?:vol(?:\.|ume)?\s*(\d+)\s*)?"
+ r"ch(?:\.|apter)?\s*(\d+)([^\s:]*)"
+ r"(?:\s*:\s*(.*))?"
+ ).match(title)
return match.groups() if match else (0, 0, "", "")
@memcache(keyarg=1)
@@ -68,8 +64,8 @@ class MangaparkBase():
"variables" : variables,
"operationName": opname,
}
- return self.request(
- url, method="POST", json=data).json()["data"].popitem()[1]
+ return self.request_json(
+ url, method="POST", json=data)["data"].popitem()[1]
class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor):
@@ -79,7 +75,7 @@ class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor):
example = "https://mangapark.net/title/MANGA/12345-en-ch.01"
def __init__(self, match):
- self.root = text.root_from_url(match.group(0))
+ self.root = text.root_from_url(match[0])
ChapterExtractor.__init__(self, match, False)
def metadata(self, _):
@@ -119,8 +115,8 @@ class MangaparkMangaExtractor(MangaparkBase, Extractor):
example = "https://mangapark.net/title/12345-MANGA"
def __init__(self, match):
- self.root = text.root_from_url(match.group(0))
- self.manga_id = int(match.group(1))
+ self.root = text.root_from_url(match[0])
+ self.manga_id = int(match[1])
Extractor.__init__(self, match)
def items(self):
@@ -149,8 +145,7 @@ class MangaparkMangaExtractor(MangaparkBase, Extractor):
yield Message.Queue, url, data
def chapters(self):
- source = self.config("source")
- if source:
+ if source := self.config("source"):
source_id = self._select_source(source)
self.log.debug("Requesting chapters for source_id %s", source_id)
chapters = self._extract_chapters_source(source_id)
@@ -180,8 +175,8 @@ class MangaparkMangaExtractor(MangaparkBase, Extractor):
not lang or data["lang"] == lang):
return data["id"]
- raise exception.StopExtraction(
- "'%s' does not match any available source", source)
+ raise exception.AbortExtraction(
+ f"'{source}' does not match any available source")
QUERIES = {
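The inlined _parse_chapter_title() pattern above tolerates several volume/chapter spellings. A worked example of its capture groups:

    import re

    parse_title = re.compile(
        r"(?i)"
        r"(?:vol(?:\.|ume)?\s*(\d+)\s*)?"
        r"ch(?:\.|apter)?\s*(\d+)([^\s:]*)"
        r"(?:\s*:\s*(.*))?").match

    print(parse_title("Vol.2 Ch.10.5: Interlude").groups())
    # ('2', '10', '.5', 'Interlude')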
diff --git a/gallery_dl/extractor/mangaread.py b/gallery_dl/extractor/mangaread.py
index 6970b4f..a3bdf39 100644
--- a/gallery_dl/extractor/mangaread.py
+++ b/gallery_dl/extractor/mangaread.py
@@ -7,8 +7,7 @@
"""Extractors for https://mangaread.org/"""
from .common import ChapterExtractor, MangaExtractor
-from .. import text, exception
-import re
+from .. import text, util, exception
class MangareadBase():
@@ -16,11 +15,10 @@ class MangareadBase():
category = "mangaread"
root = "https://www.mangaread.org"
- @staticmethod
- def parse_chapter_string(chapter_string, data):
- match = re.match(
- r"(?:(.+)\s*-\s*)?[Cc]hapter\s*(\d+)(\.\d+)?(?:\s*-\s*(.+))?",
- text.unescape(chapter_string).strip())
+ def parse_chapter_string(self, chapter_string, data):
+ match = util.re(
+ r"(?:(.+)\s*-\s*)?[Cc]hapter\s*(\d+)(\.\d+)?(?:\s*-\s*(.+))?"
+ ).match(text.unescape(chapter_string).strip())
manga, chapter, minor, title = match.groups()
manga = manga.strip() if manga else ""
data["manga"] = data.pop("manga", manga)
@@ -65,14 +63,14 @@ class MangareadMangaExtractor(MangareadBase, MangaExtractor):
if 'class="error404' in page:
raise exception.NotFoundError("manga")
data = self.metadata(page)
- result = []
+ results = []
for chapter in text.extract_iter(
page, '<li class="wp-manga-chapter', "</li>"):
url , pos = text.extract(chapter, '<a href="', '"')
info, _ = text.extract(chapter, ">", "</a>", pos)
self.parse_chapter_string(info, data)
- result.append((url, data.copy()))
- return result
+ results.append((url, data.copy()))
+ return results
def metadata(self, page):
extr = text.extract_from(text.extr(
@@ -84,7 +82,7 @@ class MangareadMangaExtractor(MangareadBase, MangaExtractor):
"rating" : text.parse_float(
extr('total_votes">', "</span>").strip()),
"manga_alt" : text.remove_html(
- extr("Alternative </h5>\n</div>", "</div>")).split("; "),
+ extr("Alternative\t\t</h5>\n\t</div>", "</div>")).split("; "),
"author" : list(text.extract_iter(
extr('class="author-content">', "</div>"), '"tag">', "</a>")),
"artist" : list(text.extract_iter(
diff --git a/gallery_dl/extractor/mangasee.py b/gallery_dl/extractor/mangasee.py
deleted file mode 100644
index 7261332..0000000
--- a/gallery_dl/extractor/mangasee.py
+++ /dev/null
@@ -1,117 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2021-2023 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://mangasee123.com/"""
-
-from .common import ChapterExtractor, MangaExtractor
-from .. import text, util
-
-
-class MangaseeBase():
- category = "mangasee"
- browser = "firefox"
- root = "https://mangasee123.com"
-
- @staticmethod
- def _transform_chapter(data):
- chapter = data["Chapter"]
- return {
- "title" : data["ChapterName"] or "",
- "index" : chapter[0],
- "chapter" : int(chapter[1:-1]),
- "chapter_minor": "" if chapter[-1] == "0" else "." + chapter[-1],
- "chapter_string": chapter,
- "lang" : "en",
- "language": "English",
- "date" : text.parse_datetime(
- data["Date"], "%Y-%m-%d %H:%M:%S"),
- }
-
-
-class MangaseeChapterExtractor(MangaseeBase, ChapterExtractor):
- pattern = (r"(?:https?://)?(mangasee123|manga4life)\.com"
- r"(/read-online/[^/?#]+\.html)")
- example = "https://mangasee123.com/read-online/MANGA-chapter-1-page-1.html"
-
- def __init__(self, match):
- if match.group(1) == "manga4life":
- self.category = "mangalife"
- self.root = "https://manga4life.com"
- ChapterExtractor.__init__(self, match, self.root + match.group(2))
-
- def _init(self):
- self.session.headers["Referer"] = self.gallery_url
-
- domain = self.root.rpartition("/")[2]
- cookies = self.cookies
- if not cookies.get("PHPSESSID", domain=domain):
- cookies.set("PHPSESSID", util.generate_token(13), domain=domain)
-
- def metadata(self, page):
- extr = text.extract_from(page)
- author = util.json_loads(extr('"author":', '],') + "]")
- genre = util.json_loads(extr('"genre":', '],') + "]")
- self.chapter = data = util.json_loads(extr("vm.CurChapter =", ";\r\n"))
- self.domain = extr('vm.CurPathName = "', '"')
- self.slug = extr('vm.IndexName = "', '"')
-
- data = self._transform_chapter(data)
- data["manga"] = text.unescape(extr('vm.SeriesName = "', '"'))
- data["author"] = author
- data["genre"] = genre
- return data
-
- def images(self, page):
- chapter = self.chapter["Chapter"][1:]
- if chapter[-1] == "0":
- chapter = chapter[:-1]
- else:
- chapter = chapter[:-1] + "." + chapter[-1]
-
- base = "https://{}/manga/{}/".format(self.domain, self.slug)
- if self.chapter["Directory"]:
- base += self.chapter["Directory"] + "/"
- base += chapter + "-"
-
- return [
- ("{}{:>03}.png".format(base, i), None)
- for i in range(1, int(self.chapter["Page"]) + 1)
- ]
-
-
-class MangaseeMangaExtractor(MangaseeBase, MangaExtractor):
- chapterclass = MangaseeChapterExtractor
- pattern = r"(?:https?://)?(mangasee123|manga4life)\.com(/manga/[^/?#]+)"
- example = "https://mangasee123.com/manga/MANGA"
-
- def __init__(self, match):
- if match.group(1) == "manga4life":
- self.category = "mangalife"
- self.root = "https://manga4life.com"
- MangaExtractor.__init__(self, match, self.root + match.group(2))
-
- def chapters(self, page):
- extr = text.extract_from(page)
- author = util.json_loads(extr('"author":', '],') + "]")
- genre = util.json_loads(extr('"genre":', '],') + "]")
- slug = extr('vm.IndexName = "', '"')
- chapters = util.json_loads(extr("vm.Chapters = ", ";\r\n"))
-
- result = []
- for data in map(self._transform_chapter, chapters):
- url = "{}/read-online/{}-chapter-{}{}".format(
- self.root, slug, data["chapter"], data["chapter_minor"])
- if data["index"] != "1":
- url += "-index-" + data["index"]
- url += "-page-1.html"
-
- data["manga"] = slug
- data["author"] = author
- data["genre"] = genre
- result.append((url, data))
- return result
diff --git a/gallery_dl/extractor/mangoxo.py b/gallery_dl/extractor/mangoxo.py
index b208f03..beb13ce 100644
--- a/gallery_dl/extractor/mangoxo.py
+++ b/gallery_dl/extractor/mangoxo.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -53,8 +53,7 @@ class MangoxoExtractor(Extractor):
raise exception.AuthenticationError(data.get("msg"))
return {"SESSION": self.cookies.get("SESSION")}
- @staticmethod
- def _sign_by_md5(username, password, token):
+ def _sign_by_md5(self, username, password, token):
# https://dns.mangoxo.com/libs/plugins/phoenix-ui/js/phoenix-ui.js
params = [
("username" , username),
@@ -68,8 +67,7 @@ class MangoxoExtractor(Extractor):
params.append(("sign", sign.upper()))
return params
- @staticmethod
- def _total_pages(page):
+ def _total_pages(self, page):
return text.parse_int(text.extract(page, "total :", ",")[0])
@@ -84,11 +82,11 @@ class MangoxoAlbumExtractor(MangoxoExtractor):
def __init__(self, match):
MangoxoExtractor.__init__(self, match)
- self.album_id = match.group(1)
+ self.album_id = match[1]
def items(self):
self.login()
- url = "{}/album/{}/".format(self.root, self.album_id)
+ url = f"{self.root}/album/{self.album_id}/"
page = self.request(url).text
data = self.metadata(page)
imgs = self.images(url, page)
@@ -149,12 +147,12 @@ class MangoxoChannelExtractor(MangoxoExtractor):
def __init__(self, match):
MangoxoExtractor.__init__(self, match)
- self.user = match.group(1)
+ self.user = match[1]
def items(self):
self.login()
num = total = 1
- url = "{}/{}/album/".format(self.root, self.user)
+ url = f"{self.root}/{self.user}/album/"
data = {"_extractor": MangoxoAlbumExtractor}
while True:
diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py
index 8b38474..1bab63a 100644
--- a/gallery_dl/extractor/mastodon.py
+++ b/gallery_dl/extractor/mastodon.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -22,7 +22,7 @@ class MastodonExtractor(BaseExtractor):
def __init__(self, match):
BaseExtractor.__init__(self, match)
- self.item = match.group(match.lastindex)
+ self.item = self.groups[-1]
def _init(self):
self.instance = self.root.partition("://")[2]
@@ -49,10 +49,8 @@ class MastodonExtractor(BaseExtractor):
attachments.extend(status["reblog"]["media_attachments"])
if self.cards:
- card = status.get("card")
- if card:
- url = card.get("image")
- if url:
+ if card := status.get("card"):
+ if url := card.get("image"):
card["weburl"] = card.get("url")
card["url"] = url
card["id"] = "card" + "".join(
@@ -202,7 +200,7 @@ class MastodonStatusExtractor(MastodonExtractor):
def statuses(self):
if self.groups[-2] is not None:
- url = "{}/objects/{}".format(self.root, self.item)
+ url = f"{self.root}/objects/{self.item}"
location = self.request_location(url)
self.item = location.rpartition("/")[2]
return (MastodonAPI(self).status(self.item),)
@@ -243,7 +241,7 @@ class MastodonAPI():
if "@" in username:
handle = "@" + username
else:
- handle = "@{}@{}".format(username, self.extractor.instance)
+ handle = f"@{username}@{self.extractor.instance}"
for account in self.account_search(handle, 1):
if account["acct"] == username:
@@ -263,7 +261,7 @@ class MastodonAPI():
def account_following(self, account_id):
"""Accounts which the given account is following"""
- endpoint = "/v1/accounts/{}/following".format(account_id)
+ endpoint = f"/v1/accounts/{account_id}/following"
return self._pagination(endpoint, None)
def account_lookup(self, username):
@@ -281,7 +279,7 @@ class MastodonAPI():
def account_statuses(self, account_id, only_media=True,
exclude_replies=False):
"""Statuses posted to the given account"""
- endpoint = "/v1/accounts/{}/statuses".format(account_id)
+ endpoint = f"/v1/accounts/{account_id}/statuses"
params = {"only_media" : "true" if only_media else "false",
"exclude_replies": "true" if exclude_replies else "false"}
return self._pagination(endpoint, params)
@@ -315,10 +313,9 @@ class MastodonAPI():
if code < 400:
return response
if code == 401:
- raise exception.StopExtraction(
- "Invalid or missing access token.\n"
- "Run 'gallery-dl oauth:mastodon:%s' to obtain one.",
- self.extractor.instance)
+ raise exception.AbortExtraction(
+ f"Invalid or missing access token.\nRun 'gallery-dl oauth:"
+ f"mastodon:{self.extractor.instance}' to obtain one.")
if code == 404:
raise exception.NotFoundError()
if code == 429:
@@ -327,7 +324,7 @@ class MastodonAPI():
"%Y-%m-%dT%H:%M:%S.%fZ",
))
continue
- raise exception.StopExtraction(response.json().get("error"))
+ raise exception.AbortExtraction(response.json().get("error"))
def _pagination(self, endpoint, params):
url = endpoint
diff --git a/gallery_dl/extractor/misskey.py b/gallery_dl/extractor/misskey.py
index 5385f8a..f579a2d 100644
--- a/gallery_dl/extractor/misskey.py
+++ b/gallery_dl/extractor/misskey.py
@@ -6,8 +6,9 @@
"""Extractors for Misskey instances"""
-from .common import BaseExtractor, Message
+from .common import BaseExtractor, Message, Dispatch
from .. import text, exception
+from ..cache import memcache
class MisskeyExtractor(BaseExtractor):
@@ -19,7 +20,7 @@ class MisskeyExtractor(BaseExtractor):
def __init__(self, match):
BaseExtractor.__init__(self, match)
- self.item = match.group(match.lastindex)
+ self.item = self.groups[-1]
def _init(self):
self.api = MisskeyAPI(self)
@@ -32,15 +33,13 @@ class MisskeyExtractor(BaseExtractor):
if "note" in note:
note = note["note"]
files = note.pop("files") or []
- renote = note.get("renote")
- if renote:
+ if renote := note.get("renote"):
if not self.renotes:
self.log.debug("Skipping %s (renote)", note["id"])
continue
files.extend(renote.get("files") or ())
- reply = note.get("reply")
- if reply:
+ if reply := note.get("reply"):
if not self.replies:
self.log.debug("Skipping %s (reply)", note["id"])
continue
@@ -64,6 +63,23 @@ class MisskeyExtractor(BaseExtractor):
"""Return an iterable containing all relevant Note objects"""
return ()
+ def _make_note(self, type, user, url):
+ # extract real URL from potential proxy
+ path, sep, query = url.partition("?")
+ if sep:
+ url = text.parse_query(query).get("url") or path
+
+ return {
+ "id" : type,
+ "user" : user,
+ "files": ({
+ "id" : url.rpartition("/")[2].partition(".")[0], # ID from URL
+ "url": url,
+ "createdAt": "",
+ },),
+ "createdAt": "",
+ }
+
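The proxy-unwrapping step in _make_note() above prefers the "url" query parameter of a media-proxy link over the proxy path itself. An equivalent stdlib sketch (the sample URL is assumed):

    from urllib.parse import parse_qs, urlsplit

    def unwrap_proxy(url):
        """Return the 'url' query parameter if present, else the URL."""
        parts = urlsplit(url)
        if parts.query:
            if real := parse_qs(parts.query).get("url"):
                return real[0]
        return url

    print(unwrap_proxy("https://proxy.example/image.webp"
                       "?url=https%3A%2F%2Ffiles.example%2Fabc.png"))
    # https://files.example/abc.png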
BASE_PATTERN = MisskeyExtractor.update({
"misskey.io": {
@@ -85,16 +101,67 @@ BASE_PATTERN = MisskeyExtractor.update({
})
-class MisskeyUserExtractor(MisskeyExtractor):
+class MisskeyUserExtractor(Dispatch, MisskeyExtractor):
"""Extractor for all images of a Misskey user"""
subcategory = "user"
pattern = BASE_PATTERN + r"/@([^/?#]+)/?$"
example = "https://misskey.io/@USER"
+ def items(self):
+ base = f"{self.root}/@{self.item}/"
+ return self._dispatch_extractors((
+ (MisskeyInfoExtractor , base + "info"),
+ (MisskeyAvatarExtractor , base + "avatar"),
+ (MisskeyBackgroundExtractor, base + "banner"),
+ (MisskeyNotesExtractor , base + "notes"),
+ ), ("notes",))
+
+
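MisskeyUserExtractor now acts as a dispatcher: a bare profile URL fans out into queue messages for the selected subextractors, with "notes" as the default. A rough sketch of the selection logic only; the real _dispatch_extractors() in common.py also consults the "include" config option and emits Message.Queue items, which is simplified away here:

    def dispatch(targets, default=("notes",), include=None):
        """Return (name, url) pairs for the selected subextractors."""
        names = default if include is None else include
        if names == "all":
            names = [name for name, _ in targets]
        wanted = set(names)
        return [(name, url) for name, url in targets if name in wanted]

    base = "https://misskey.io/@USER/"
    targets = (("info", base + "info"), ("avatar", base + "avatar"),
               ("background", base + "banner"), ("notes", base + "notes"))
    print(dispatch(targets))
    # [('notes', 'https://misskey.io/@USER/notes')]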
+class MisskeyNotesExtractor(MisskeyExtractor):
+ """Extractor for a Misskey user's notes"""
+ subcategory = "notes"
+ pattern = BASE_PATTERN + r"/@([^/?#]+)/notes"
+ example = "https://misskey.io/@USER/notes"
+
def notes(self):
return self.api.users_notes(self.api.user_id_by_username(self.item))
+class MisskeyInfoExtractor(MisskeyExtractor):
+ """Extractor for a Misskey user's profile data"""
+ subcategory = "info"
+ pattern = BASE_PATTERN + r"/@([^/?#]+)/info"
+ example = "https://misskey.io/@USER/info"
+
+ def items(self):
+ user = self.api.users_show(self.item)
+ return iter(((Message.Directory, user),))
+
+
+class MisskeyAvatarExtractor(MisskeyExtractor):
+ """Extractor for a Misskey user's avatar"""
+ subcategory = "avatar"
+ pattern = BASE_PATTERN + r"/@([^/?#]+)/avatar"
+ example = "https://misskey.io/@USER/avatar"
+
+ def notes(self):
+ user = self.api.users_show(self.item)
+ url = user.get("avatarUrl")
+ return (self._make_note("avatar", user, url),) if url else ()
+
+
+class MisskeyBackgroundExtractor(MisskeyExtractor):
+ """Extractor for a Misskey user's banner image"""
+ subcategory = "background"
+ pattern = BASE_PATTERN + r"/@([^/?#]+)/ba(?:nner|ckground)"
+ example = "https://misskey.io/@USER/banner"
+
+ def notes(self):
+ user = self.api.users_show(self.item)
+ url = user.get("bannerUrl")
+ return (self._make_note("background", user, url),) if url else ()
+
+
class MisskeyFollowingExtractor(MisskeyExtractor):
"""Extractor for followed Misskey users"""
subcategory = "following"
@@ -105,10 +172,9 @@ class MisskeyFollowingExtractor(MisskeyExtractor):
user_id = self.api.user_id_by_username(self.item)
for user in self.api.users_following(user_id):
user = user["followee"]
- url = self.root + "/@" + user["username"]
- host = user["host"]
- if host is not None:
- url += "@" + host
+ url = f"{self.root}/@{user['username']}"
+ if (host := user["host"]) is not None:
+ url = f"{url}@{host}"
user["_extractor"] = MisskeyUserExtractor
yield Message.Queue, url, user
@@ -144,15 +210,10 @@ class MisskeyAPI():
def __init__(self, extractor):
self.root = extractor.root
self.extractor = extractor
- self.headers = {"Content-Type": "application/json"}
self.access_token = extractor.config("access-token")
def user_id_by_username(self, username):
- endpoint = "/users/show"
- data = {"username": username}
- if "@" in username:
- data["username"], _, data["host"] = username.partition("@")
- return self._call(endpoint, data)["id"]
+ return self.users_show(username)["id"]
def users_following(self, user_id):
endpoint = "/users/following"
@@ -164,6 +225,13 @@ class MisskeyAPI():
data = {"userId": user_id}
return self._pagination(endpoint, data)
+ @memcache(keyarg=1)
+ def users_show(self, username):
+ endpoint = "/users/show"
+ username, _, host = username.partition("@")
+ data = {"username": username, "host": host or None}
+ return self._call(endpoint, data)
+
def notes_show(self, note_id):
endpoint = "/notes/show"
data = {"noteId": note_id}
@@ -177,9 +245,8 @@ class MisskeyAPI():
return self._pagination(endpoint, data)
def _call(self, endpoint, data):
- url = self.root + "/api" + endpoint
- return self.extractor.request(
- url, method="POST", headers=self.headers, json=data).json()
+ url = f"{self.root}/api{endpoint}"
+ return self.extractor.request_json(url, method="POST", json=data)
def _pagination(self, endpoint, data):
data["limit"] = 100
diff --git a/gallery_dl/extractor/moebooru.py b/gallery_dl/extractor/moebooru.py
index 9fd66e2..ba27994 100644
--- a/gallery_dl/extractor/moebooru.py
+++ b/gallery_dl/extractor/moebooru.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2020-2023 Mike Fährmann
+# Copyright 2020-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,11 +9,9 @@
"""Extractors for Moebooru based sites"""
from .booru import BooruExtractor
-from .. import text
-
+from .. import text, util
import collections
import datetime
-import re
class MoebooruExtractor(BooruExtractor):
@@ -22,13 +20,12 @@ class MoebooruExtractor(BooruExtractor):
filename_fmt = "{category}_{id}_{md5}.{extension}"
page_start = 1
- @staticmethod
- def _prepare(post):
+ def _prepare(self, post):
post["date"] = text.parse_timestamp(post["created_at"])
def _html(self, post):
- return self.request("{}/post/show/{}".format(
- self.root, post["id"])).text
+ url = f"{self.root}/post/show/{post['id']}"
+ return self.request(url).text
def _tags(self, post, page):
tag_container = text.extr(page, '<ul id="tag-', '</ul>')
@@ -36,7 +33,7 @@ class MoebooruExtractor(BooruExtractor):
return
tags = collections.defaultdict(list)
- pattern = re.compile(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'+]+)")
+ pattern = util.re(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'+]+)")
for tag_type, tag_name in pattern.findall(tag_container):
tags[tag_type].append(text.unquote(tag_name))
for key, value in tags.items():
@@ -64,7 +61,7 @@ class MoebooruExtractor(BooruExtractor):
params["limit"] = self.per_page
while True:
- posts = self.request(url, params=params).json()
+ posts = self.request_json(url, params=params)
yield from posts
if len(posts) < self.per_page:
@@ -101,15 +98,14 @@ class MoebooruTagExtractor(MoebooruExtractor):
def __init__(self, match):
MoebooruExtractor.__init__(self, match)
- tags = match.group(match.lastindex)
- self.tags = text.unquote(tags.replace("+", " "))
+ self.tags = text.unquote(self.groups[-1].replace("+", " "))
def metadata(self):
return {"search_tags": self.tags}
def posts(self):
params = {"tags": self.tags}
- return self._pagination(self.root + "/post.json", params)
+ return self._pagination(f"{self.root}/post.json", params)
class MoebooruPoolExtractor(MoebooruExtractor):
@@ -121,12 +117,12 @@ class MoebooruPoolExtractor(MoebooruExtractor):
def __init__(self, match):
MoebooruExtractor.__init__(self, match)
- self.pool_id = match.group(match.lastindex)
+ self.pool_id = self.groups[-1]
def metadata(self):
if self.config("metadata"):
- url = "{}/pool/show/{}.json".format(self.root, self.pool_id)
- pool = self.request(url).json()
+ url = f"{self.root}/pool/show/{self.pool_id}.json"
+ pool = self.request_json(url)
pool["name"] = pool["name"].replace("_", " ")
pool.pop("posts", None)
return {"pool": pool}
@@ -134,7 +130,7 @@ class MoebooruPoolExtractor(MoebooruExtractor):
def posts(self):
params = {"tags": "pool:" + self.pool_id}
- return self._pagination(self.root + "/post.json", params)
+ return self._pagination(f"{self.root}/post.json", params)
class MoebooruPostExtractor(MoebooruExtractor):
@@ -143,13 +139,9 @@ class MoebooruPostExtractor(MoebooruExtractor):
pattern = BASE_PATTERN + r"/post/show/(\d+)"
example = "https://yande.re/post/show/12345"
- def __init__(self, match):
- MoebooruExtractor.__init__(self, match)
- self.post_id = match.group(match.lastindex)
-
def posts(self):
- params = {"tags": "id:" + self.post_id}
- return self.request(self.root + "/post.json", params=params).json()
+ params = {"tags": "id:" + self.groups[-1]}
+ return self.request_json(f"{self.root}/post.json", params=params)
class MoebooruPopularExtractor(MoebooruExtractor):
@@ -162,18 +154,15 @@ class MoebooruPopularExtractor(MoebooruExtractor):
def __init__(self, match):
MoebooruExtractor.__init__(self, match)
- self.scale = match.group(match.lastindex-1)
- self.query = match.group(match.lastindex)
+ self.scale = self.groups[-2]
+ self.query = self.groups[-1]
def metadata(self):
self.params = params = text.parse_query(self.query)
if "year" in params:
- date = "{:>04}-{:>02}-{:>02}".format(
- params["year"],
- params.get("month", "01"),
- params.get("day", "01"),
- )
+ date = (f"{params['year']:>04}-{params.get('month', '01'):>02}-"
+ f"{params.get('day', '01'):>02}")
else:
date = datetime.date.today().isoformat()
@@ -189,5 +178,5 @@ class MoebooruPopularExtractor(MoebooruExtractor):
return {"date": date, "scale": scale}
def posts(self):
- url = "{}/post/popular_{}.json".format(self.root, self.scale)
- return self.request(url, params=self.params).json()
+ url = f"{self.root}/post/popular_{self.scale}.json"
+ return self.request_json(url, params=self.params)
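The date construction rewritten above relies on the ">0N" format spec, which zero-pads string values on the left. A quick worked example:

    params = {"year": "2025", "month": "7"}
    date = (f"{params['year']:>04}-{params.get('month', '01'):>02}-"
            f"{params.get('day', '01'):>02}")
    print(date)  # 2025-07-01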
diff --git a/gallery_dl/extractor/motherless.py b/gallery_dl/extractor/motherless.py
index ce83ded..c81a4d1 100644
--- a/gallery_dl/extractor/motherless.py
+++ b/gallery_dl/extractor/motherless.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2024 Mike Fährmann
+# Copyright 2024-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -24,7 +24,7 @@ class MotherlessExtractor(Extractor):
archive_fmt = "{id}"
def _extract_media(self, path):
- url = self.root + "/" + path
+ url = f"{self.root}/{path}"
page = self.request(url).text
extr = text.extract_from(page)
@@ -48,10 +48,59 @@ class MotherlessExtractor(Extractor):
"uploader": text.unescape(extr('class="username">', "<").strip()),
}
- if path and path[0] == "G":
+ if not path:
+ pass
+ elif path[0] == "G":
data["gallery_id"] = path[1:]
data["gallery_title"] = self._extract_gallery_title(
page, data["gallery_id"])
+ elif path[0] == "g":
+ data["group_id"] = path[2:]
+ data["group_title"] = self._extract_group_title(
+ page, data["group_id"])
+
+ return data
+
+ def _pagination(self, page):
+ while True:
+ for thumb in text.extract_iter(
+ page, 'class="thumb-container', "</div>"):
+ yield thumb
+
+ url = text.extr(page, '<link rel="next" href="', '"')
+ if not url:
+ return
+ page = self.request(text.unescape(url)).text
+
+ def _extract_data(self, page, category):
+ extr = text.extract_from(page)
+
+ gid = self.groups[-1]
+ if category == "gallery":
+ title = self._extract_gallery_title(page, gid)
+ else:
+ title = self._extract_group_title(page, gid)
+
+ return {
+ f"{category}_id": gid,
+ f"{category}_title": title,
+ "uploader": text.remove_html(extr(
+ f'class="{category}-member-username">', "</")),
+ "count": text.parse_int(
+ extr('<span class="active">', ")")
+ .rpartition("(")[2].replace(",", "")),
+ }
+
+ def _parse_thumb_data(self, thumb):
+ extr = text.extract_from(thumb)
+
+ data = {
+ "id" : extr('data-codename="', '"'),
+ "type" : extr('data-mediatype="', '"'),
+ "thumbnail": extr('class="static" src="', '"'),
+ "title" : extr(' alt="', '"'),
+ }
+ data["url"] = data["thumbnail"].replace("thumb", data["type"])
return data
@@ -72,13 +121,23 @@ class MotherlessExtractor(Extractor):
if title:
return text.unescape(title.strip())
- pos = page.find(' href="/G' + gallery_id + '"')
+ pos = page.find(f' href="/G{gallery_id}"')
if pos >= 0:
return text.unescape(text.extract(
page, ' title="', '"', pos)[0])
return ""
+ @memcache(keyarg=2)
+ def _extract_group_title(self, page, group_id):
+ title = text.extr(
+ text.extr(page, '<h1 class="group-bio-name">', "</h1>"),
+ ">", "<")
+ if title:
+ return text.unescape(title.strip())
+
+ return ""
+
class MotherlessMediaExtractor(MotherlessExtractor):
"""Extractor for a single image/video from motherless.com"""
@@ -109,59 +168,62 @@ class MotherlessGalleryExtractor(MotherlessExtractor):
if not type:
data = {"_extractor": MotherlessGalleryExtractor}
- yield Message.Queue, self.root + "/GI" + gid, data
- yield Message.Queue, self.root + "/GV" + gid, data
+ yield Message.Queue, f"{self.root}/GI{gid}", data
+ yield Message.Queue, f"{self.root}/GV{gid}", data
return
- url = "{}/G{}{}".format(self.root, type, gid)
+ url = f"{self.root}/G{type}{gid}"
page = self.request(url).text
- data = self._extract_gallery_data(page)
+ data = self._extract_data(page, "gallery")
for num, thumb in enumerate(self._pagination(page), 1):
file = self._parse_thumb_data(thumb)
+ thumbnail = file["thumbnail"]
if file["type"] == "video":
file = self._extract_media(file["id"])
file.update(data)
file["num"] = num
+ file["thumbnail"] = thumbnail
url = file["url"]
yield Message.Directory, file
yield Message.Url, url, text.nameext_from_url(url, file)
- def _pagination(self, page):
- while True:
- for thumb in text.extract_iter(
- page, 'class="thumb-container', "</div>"):
- yield thumb
- url = text.extr(page, '<link rel="next" href="', '"')
- if not url:
- return
- page = self.request(text.unescape(url)).text
+class MotherlessGroupExtractor(MotherlessExtractor):
+ subcategory = "group"
+ directory_fmt = ("{category}", "{uploader}",
+ "{group_id} {group_title}")
+ archive_fmt = "{group_id}_{id}"
+ pattern = BASE_PATTERN + "/g([iv]?)/?([a-z0-9_]+)/?$"
+ example = "https://motherless.com/g/abc123"
- def _extract_gallery_data(self, page):
- extr = text.extract_from(page)
- return {
- "gallery_id": self.groups[-1],
- "gallery_title": text.unescape(extr(
- "<title>", "<").rpartition(" | ")[0]),
- "uploader": text.remove_html(extr(
- 'class="gallery-member-username">', "</")),
- "count": text.parse_int(
- extr('<span class="active">', ")")
- .rpartition("(")[2].replace(",", "")),
- }
+ def items(self):
+ type, gid = self.groups
- def _parse_thumb_data(self, thumb):
- extr = text.extract_from(thumb)
+ if not type:
+ data = {"_extractor": MotherlessGroupExtractor}
+ yield Message.Queue, f"{self.root}/gi/{gid}", data
+ yield Message.Queue, f"{self.root}/gv/{gid}", data
+ return
- data = {
- "id" : extr('data-codename="', '"'),
- "type" : extr('data-mediatype="', '"'),
- "thumbnail": extr('class="static" src="', '"'),
- "title" : extr(' alt="', '"'),
- }
- data["url"] = data["thumbnail"].replace("thumb", data["type"])
+ url = f"{self.root}/g{type}/{gid}"
+ page = self.request(url).text
+ data = self._extract_data(page, "group")
- return data
+ for num, thumb in enumerate(self._pagination(page), 1):
+ file = self._parse_thumb_data(thumb)
+ thumbnail = file["thumbnail"]
+
+ file = self._extract_media(file["id"])
+
+ uploader = file.get("uploader")
+ file.update(data)
+ file["num"] = num
+ file["thumbnail"] = thumbnail
+ file["uploader"] = uploader
+ file["group"] = file["group_id"]
+ url = file["url"]
+ yield Message.Directory, file
+ yield Message.Url, url, text.nameext_from_url(url, file)
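Both gallery and group listings now share the hoisted _pagination() helper, which follows the '<link rel="next">' pointer instead of computing page URLs. A standalone sketch of that loop with plain requests:

    import re
    from html import unescape
    import requests

    def paginate(url):
        """Yield listing pages, following '<link rel="next">' links."""
        with requests.Session() as session:
            while url:
                page = session.get(url).text
                yield page
                nxt = re.search(r'<link rel="next" href="([^"]+)"', page)
                url = unescape(nxt[1]) if nxt else None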
diff --git a/gallery_dl/extractor/myhentaigallery.py b/gallery_dl/extractor/myhentaigallery.py
index f09507c..2a39dc9 100644
--- a/gallery_dl/extractor/myhentaigallery.py
+++ b/gallery_dl/extractor/myhentaigallery.py
@@ -20,12 +20,12 @@ class MyhentaigalleryGalleryExtractor(GalleryExtractor):
example = "https://myhentaigallery.com/g/12345"
def __init__(self, match):
- self.gallery_id = match.group(1)
- url = "{}/g/{}".format(self.root, self.gallery_id)
+ self.gallery_id = match[1]
+ url = f"{self.root}/g/{self.gallery_id}"
GalleryExtractor.__init__(self, match, url)
def _init(self):
- self.session.headers["Referer"] = self.gallery_url
+ self.session.headers["Referer"] = self.page_url
def metadata(self, page):
extr = text.extract_from(page)
diff --git a/gallery_dl/extractor/myportfolio.py b/gallery_dl/extractor/myportfolio.py
index 7620d08..0223d0b 100644
--- a/gallery_dl/extractor/myportfolio.py
+++ b/gallery_dl/extractor/myportfolio.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2023 Mike Fährmann
+# Copyright 2018-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -53,8 +53,7 @@ class MyportfolioGalleryExtractor(Extractor):
for data["num"], url in enumerate(imgs, 1):
yield Message.Url, url, text.nameext_from_url(url, data)
- @staticmethod
- def metadata(page):
+ def metadata(self, page):
"""Collect general image metadata"""
# og:title contains data as "<user> - <title>", but both
# <user> and <title> can contain a "-" as well, so we get the title
@@ -81,8 +80,7 @@ class MyportfolioGalleryExtractor(Extractor):
"description": text.unescape(descr),
}
- @staticmethod
- def images(page):
+ def images(self, page):
"""Extract and return a list of all image-urls"""
return (
list(text.extract_iter(page, 'js-lightbox" data-src="', '"')) or
diff --git a/gallery_dl/extractor/naver.py b/gallery_dl/extractor/naverblog.py
index 2287325..302cb63 100644
--- a/gallery_dl/extractor/naver.py
+++ b/gallery_dl/extractor/naverblog.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -14,13 +14,13 @@ import datetime
import time
-class NaverBase():
- """Base class for naver extractors"""
- category = "naver"
+class NaverBlogBase():
+ """Base class for blog.naver.com extractors"""
+ category = "naver-blog"
root = "https://blog.naver.com"
-class NaverPostExtractor(NaverBase, GalleryExtractor):
+class NaverBlogPostExtractor(NaverBlogBase, GalleryExtractor):
"""Extractor for blog posts on blog.naver.com"""
subcategory = "post"
filename_fmt = "{num:>03}.{extension}"
@@ -33,16 +33,15 @@ class NaverPostExtractor(NaverBase, GalleryExtractor):
example = "https://blog.naver.com/BLOGID/12345"
def __init__(self, match):
- blog_id = match.group(1)
- if blog_id:
+ if blog_id := match[1]:
self.blog_id = blog_id
- self.post_id = match.group(2)
+ self.post_id = match[2]
else:
- self.blog_id = match.group(3)
- self.post_id = match.group(4)
+ self.blog_id = match[3]
+ self.post_id = match[4]
- url = "{}/PostView.nhn?blogId={}&logNo={}".format(
- self.root, self.blog_id, self.post_id)
+ url = (f"{self.root}/PostView.nhn"
+ f"?blogId={self.blog_id}&logNo={self.post_id}")
GalleryExtractor.__init__(self, match, url)
def metadata(self, page):
@@ -117,13 +116,13 @@ class NaverPostExtractor(NaverBase, GalleryExtractor):
"adt" : "glad",
"lc" : "ko_KR",
}
- data = self.request(url, params=params).json()
+ data = self.request_json(url, params=params)
video = max(data["videos"]["list"],
key=lambda v: v.get("size") or 0)
files.append((video["source"], video))
-class NaverBlogExtractor(NaverBase, Extractor):
+class NaverBlogBlogExtractor(NaverBlogBase, Extractor):
"""Extractor for a user's blog on blog.naver.com"""
subcategory = "blog"
categorytransfer = True
@@ -134,17 +133,17 @@ class NaverBlogExtractor(NaverBase, Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.blog_id = match.group(1) or match.group(2)
+ self.blog_id = match[1] or match[2]
def items(self):
# fetch first post number
- url = "{}/PostList.nhn?blogId={}".format(self.root, self.blog_id)
+ url = f"{self.root}/PostList.nhn?blogId={self.blog_id}"
post_num = text.extr(
self.request(url).text, 'gnFirstLogNo = "', '"',
)
# setup params for API calls
- url = "{}/PostViewBottomTitleListAsync.nhn".format(self.root)
+ url = f"{self.root}/PostViewBottomTitleListAsync.nhn"
params = {
"blogId" : self.blog_id,
"logNo" : post_num or "0",
@@ -160,12 +159,12 @@ class NaverBlogExtractor(NaverBase, Extractor):
# loop over all posts
while True:
- data = self.request(url, params=params).json()
+ data = self.request_json(url, params=params)
for post in data["postList"]:
- post["url"] = "{}/PostView.nhn?blogId={}&logNo={}".format(
- self.root, self.blog_id, post["logNo"])
- post["_extractor"] = NaverPostExtractor
+ post["url"] = (f"{self.root}/PostView.nhn?blogId="
+ f"{self.blog_id}&logNo={post['logNo']}")
+ post["_extractor"] = NaverBlogPostExtractor
yield Message.Queue, post["url"], post
if not data["hasNextPage"]:
diff --git a/gallery_dl/extractor/naverchzzk.py b/gallery_dl/extractor/naverchzzk.py
new file mode 100644
index 0000000..de4ee7a
--- /dev/null
+++ b/gallery_dl/extractor/naverchzzk.py
@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://chzzk.naver.com/"""
+
+from .common import Extractor, Message
+from .. import text, util
+
+
+class NaverChzzkExtractor(Extractor):
+ """Base class for chzzk.naver.com extractors"""
+ category = "naver-chzzk"
+ filename_fmt = "{uid}_{id}_{num}.{extension}"
+ directory_fmt = ("{category}", "{user[userNickname]}")
+ archive_fmt = "{uid}_{id}_{num}"
+
+ def request_api(self, uid, id=None, params=None):
+ return self.request_json(
+ f"https://apis.naver.com/nng_main/nng_comment_api/v1/type"
+ f"/CHANNEL_POST/id/{uid}/comments/{id or ''}",
+ params=params)["content"]
+
+ def items(self):
+ for comment in self.comments():
+ data = comment["comment"]
+ files = data.pop("attaches") or ()
+ data["id"] = data["commentId"]
+ data["uid"] = data["objectId"]
+ data["user"] = comment["user"]
+ data["count"] = len(files)
+ data["date"] = text.parse_datetime(
+ data["createdDate"], "%Y%m%d%H%M%S")
+
+ yield Message.Directory, data
+ for data["num"], file in enumerate(files, 1):
+ if extra := file.get("extraJson"):
+ file.update(util.json_loads(extra))
+ file["date"] = text.parse_datetime(
+ file["createdDate"], "%Y-%m-%dT%H:%M:%S.%f%z")
+ file["date_updated"] = text.parse_datetime(
+ file["updatedDate"], "%Y-%m-%dT%H:%M:%S.%f%z")
+ data["file"] = file
+ url = file["attachValue"]
+ yield Message.Url, url, text.nameext_from_url(url, data)
+
+
+class NaverChzzkCommentExtractor(NaverChzzkExtractor):
+ """Extractor for individual comment from chzzk.naver.com"""
+ subcategory = "comment"
+ pattern = r"(?:https?://)?chzzk\.naver\.com/(\w+)/community/detail/(\d+)"
+ example = "https://chzzk.naver.com/0123456789abcdef/community/detail/12345"
+
+ def comments(self):
+ uid, id = self.groups
+ res = self.request_api(uid, id)
+ return ({"comment": res["comment"], "user": res["user"]},)
+
+
+class NaverChzzkCommunityExtractor(NaverChzzkExtractor):
+ """Extractor for comments from chzzk.naver.com"""
+ subcategory = "community"
+ pattern = r"(?:https?://)?chzzk\.naver\.com/(\w+)/community"
+ example = "https://chzzk.naver.com/0123456789abcdef/community"
+ request_interval = (0.5, 1.5)
+
+ def comments(self):
+ uid = self.match[1]
+ params = {
+ "limit": 10,
+ "offset": text.parse_int(self.config("offset")),
+ "pagingType": "PAGE",
+ }
+ while True:
+ comments = self.request_api(uid, params=params)["comments"]
+ yield from comments["data"]
+ if not comments["page"]["next"]:
+ return
+ params["offset"] += params["limit"]
diff --git a/gallery_dl/extractor/naverwebtoon.py b/gallery_dl/extractor/naverwebtoon.py
index 4137f5d..3211941 100644
--- a/gallery_dl/extractor/naverwebtoon.py
+++ b/gallery_dl/extractor/naverwebtoon.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright 2021 Seonghyeon Cho
-# Copyright 2022-2033 Mike Fährmann
+# Copyright 2022-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -16,13 +16,13 @@ BASE_PATTERN = (r"(?:https?://)?comic\.naver\.com"
r"/(webtoon|challenge|bestChallenge)")
-class NaverwebtoonBase():
- """Base class for naver webtoon extractors"""
- category = "naverwebtoon"
+class NaverWebtoonBase():
+ """Base class for comic.naver.com extractors"""
+ category = "naver-webtoon"
root = "https://comic.naver.com"
-class NaverwebtoonEpisodeExtractor(NaverwebtoonBase, GalleryExtractor):
+class NaverWebtoonEpisodeExtractor(NaverWebtoonBase, GalleryExtractor):
subcategory = "episode"
directory_fmt = ("{category}", "{comic}")
filename_fmt = "{episode:>03}-{num:>02}.{extension}"
@@ -32,7 +32,7 @@ class NaverwebtoonEpisodeExtractor(NaverwebtoonBase, GalleryExtractor):
def __init__(self, match):
path, query = match.groups()
- url = "{}/{}/detail?{}".format(self.root, path, query)
+ url = f"{self.root}/{path}/detail?{query}"
GalleryExtractor.__init__(self, match, url)
query = text.parse_query(query)
@@ -54,8 +54,7 @@ class NaverwebtoonEpisodeExtractor(NaverwebtoonBase, GalleryExtractor):
extr('"painters":[', ']'), '"name":"', '"')]
}
- @staticmethod
- def images(page):
+ def images(self, page):
view_area = text.extr(page, 'id="comic_view_area"', '</div>')
return [
(url, None)
@@ -64,7 +63,7 @@ class NaverwebtoonEpisodeExtractor(NaverwebtoonBase, GalleryExtractor):
]
-class NaverwebtoonComicExtractor(NaverwebtoonBase, Extractor):
+class NaverWebtoonComicExtractor(NaverWebtoonBase, Extractor):
subcategory = "comic"
categorytransfer = True
pattern = BASE_PATTERN + r"/list(?:\.nhn)?\?([^#]+)"
@@ -90,14 +89,13 @@ class NaverwebtoonComicExtractor(NaverwebtoonBase, Extractor):
}
while True:
- data = self.request(url, headers=headers, params=params).json()
+ data = self.request_json(url, headers=headers, params=params)
path = data["webtoonLevelCode"].lower().replace("_c", "C", 1)
- base = "{}/{}/detail?titleId={}&no=".format(
- self.root, path, data["titleId"])
+ base = f"{self.root}/{path}/detail?titleId={data['titleId']}&no="
for article in data["articleList"]:
- article["_extractor"] = NaverwebtoonEpisodeExtractor
+ article["_extractor"] = NaverWebtoonEpisodeExtractor
yield Message.Queue, base + str(article["no"]), article
params["page"] = data["pageInfo"]["nextPage"]
diff --git a/gallery_dl/extractor/nekohouse.py b/gallery_dl/extractor/nekohouse.py
index fe9d512..e6b0461 100644
--- a/gallery_dl/extractor/nekohouse.py
+++ b/gallery_dl/extractor/nekohouse.py
@@ -32,8 +32,7 @@ class NekohousePostExtractor(NekohouseExtractor):
def items(self):
service, user_id, post_id = self.groups
- url = "{}/{}/user/{}/post/{}".format(
- self.root, service, user_id, post_id)
+ url = f"{self.root}/{service}/user/{user_id}/post/{post_id}"
html = self.request(url).text
files = self._extract_files(html)
@@ -104,7 +103,7 @@ class NekohouseUserExtractor(NekohouseExtractor):
def items(self):
service, user_id, _ = self.groups
- creator_url = "{}/{}/user/{}".format(self.root, service, user_id)
+ creator_url = f"{self.root}/{service}/user/{user_id}"
params = {"o": 0}
data = {"_extractor": NekohousePostExtractor}
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index 648f7df..de96aa0 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2023 Mike Fährmann
+# Copyright 2018-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,11 +8,10 @@
"""Extractors for https://www.newgrounds.com/"""
-from .common import Extractor, Message
+from .common import Extractor, Message, Dispatch
from .. import text, util, exception
from ..cache import cache
import itertools
-import re
BASE_PATTERN = r"(?:https?://)?(?:www\.)?newgrounds\.com"
USER_PATTERN = r"(?:https?://)?([\w-]+)\.newgrounds\.com"
@@ -31,11 +30,11 @@ class NewgroundsExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.user = match.group(1)
- self.user_root = "https://{}.newgrounds.com".format(self.user)
+ self.user = match[1]
+ self.user_root = f"https://{self.user}.newgrounds.com"
def _init(self):
- self._extract_comment_urls = re.compile(
+ self._extract_comment_urls = util.re(
r'(?:<img |data-smartload-)src="([^"]+)').findall
self.flash = self.config("flash", True)
@@ -72,8 +71,7 @@ class NewgroundsExtractor(Extractor):
if "_multi" in post:
for data in post["_multi"]:
post["num"] += 1
- post["_index"] = "{}_{:>02}".format(
- post["index"], post["num"])
+ post["_index"] = f"{post['index']}_{post['num']:>02}"
post.update(data)
url = data["image"]
@@ -85,8 +83,7 @@ class NewgroundsExtractor(Extractor):
for url in self._extract_comment_urls(post["_comment"]):
post["num"] += 1
- post["_index"] = "{}_{:>02}".format(
- post["index"], post["num"])
+ post["_index"] = f"{post['index']}_{post['num']:>02}"
url = text.ensure_http_scheme(url)
text.nameext_from_url(url, post)
yield Message.Url, url, post
@@ -153,7 +150,7 @@ class NewgroundsExtractor(Extractor):
data["codehint"] = " "
elif result.get("requiresEmailMfa"):
email = result.get("obfuscatedEmail")
- prompt = "Email Verification Code ({}): ".format(email)
+ prompt = f"Email Verification Code ({email}): "
data["code"] = self.input(prompt)
data["codehint"] = " "
@@ -198,7 +195,10 @@ class NewgroundsExtractor(Extractor):
data["favorites"] = text.parse_int(extr(
'id="faves_load">', '<').replace(",", ""))
data["score"] = text.parse_float(extr('id="score_number">', '<'))
- data["tags"] = text.split_html(extr('<dd class="tags">', '</dd>'))
+ data["tags"] = [
+ t for t in text.split_html(extr('<dd class="tags">', '</dd>'))
+ if "(function(" not in t
+ ]
data["artist"] = [
text.extr(user, '//', '.')
for user in text.extract_iter(page, '<div class="item-user">', '>')
@@ -228,12 +228,10 @@ class NewgroundsExtractor(Extractor):
data["index"] = text.parse_int(index)
data["_index"] = index
- image_data = extr("let imageData =", "\n];")
- if image_data:
+ if image_data := extr("let imageData =", "\n];"):
data["_multi"] = self._extract_images_multi(image_data)
else:
- art_images = extr('<div class="art-images', '\n\t\t</div>')
- if art_images:
+ if art_images := extr('<div class="art-images', '\n\t\t</div>'):
data["_multi"] = self._extract_images_art(art_images, data)
return data
@@ -258,8 +256,7 @@ class NewgroundsExtractor(Extractor):
else:
yield {"image": url}
- @staticmethod
- def _extract_audio_data(extr, url):
+ def _extract_audio_data(self, extr, url):
index = url.split("/")[5]
return {
"title" : text.unescape(extr('"og:title" content="', '"')),
@@ -297,7 +294,7 @@ class NewgroundsExtractor(Extractor):
"Accept": "application/json, text/javascript, */*; q=0.01",
"X-Requested-With": "XMLHttpRequest",
}
- sources = self.request(url, headers=headers).json()["sources"]
+ sources = self.request_json(url, headers=headers)["sources"]
formats = self._video_formats(sources)
src = next(formats, "")
date = text.parse_timestamp(src.rpartition("?")[2])
@@ -319,7 +316,7 @@ class NewgroundsExtractor(Extractor):
def _video_formats(self, sources):
src = sources["360p"][0]["src"]
- sub = re.compile(r"\.360p\.\w+").sub
+ sub = util.re(r"\.360p\.\w+").sub
for fmt in self.format:
try:
@@ -345,7 +342,7 @@ class NewgroundsExtractor(Extractor):
yield fmt[1][0]["src"]
def _pagination(self, kind, pnum=1):
- url = "{}/{}".format(self.user_root, kind)
+ url = f"{self.user_root}/{kind}"
params = {
"page": text.parse_int(pnum, 1),
"isAjaxRequest": "1",
@@ -367,7 +364,7 @@ class NewgroundsExtractor(Extractor):
return
if "errors" in data:
msg = ", ".join(text.unescape(e) for e in data["errors"])
- raise exception.StopExtraction(msg)
+ raise exception.AbortExtraction(msg)
items = data.get("items")
if not items:
@@ -396,12 +393,11 @@ class NewgroundsImageExtractor(NewgroundsExtractor):
def __init__(self, match):
NewgroundsExtractor.__init__(self, match)
- if match.group(2):
- self.user = match.group(2)
- self.post_url = "https://www.newgrounds.com/art/view/{}/{}".format(
- self.user, match.group(3))
+ if match[2]:
+ self.user = match[2]
+ self.post_url = f"{self.root}/art/view/{self.user}/{match[3]}"
else:
- self.post_url = text.ensure_http_scheme(match.group(0))
+ self.post_url = text.ensure_http_scheme(match[0])
def posts(self):
return (self.post_url,)
@@ -416,7 +412,7 @@ class NewgroundsMediaExtractor(NewgroundsExtractor):
def __init__(self, match):
NewgroundsExtractor.__init__(self, match)
self.user = ""
- self.post_url = self.root + match.group(1)
+ self.post_url = self.root + match[1]
def posts(self):
return (self.post_url,)
@@ -450,15 +446,11 @@ class NewgroundsGamesExtractor(NewgroundsExtractor):
example = "https://USER.newgrounds.com/games"
-class NewgroundsUserExtractor(NewgroundsExtractor):
+class NewgroundsUserExtractor(Dispatch, NewgroundsExtractor):
"""Extractor for a newgrounds user profile"""
- subcategory = "user"
pattern = USER_PATTERN + r"/?$"
example = "https://USER.newgrounds.com"
- def initialize(self):
- pass
-
def items(self):
base = self.user_root + "/"
return self._dispatch_extractors((
@@ -486,7 +478,7 @@ class NewgroundsFavoriteExtractor(NewgroundsExtractor):
)
def _pagination_favorites(self, kind, pnum=1):
- url = "{}/favorites/{}".format(self.user_root, kind)
+ url = f"{self.user_root}/favorites/{kind}"
params = {
"page": text.parse_int(pnum, 1),
"isAjaxRequest": "1",
@@ -512,8 +504,7 @@ class NewgroundsFavoriteExtractor(NewgroundsExtractor):
def _extract_favorites(self, page):
return [
self.root + path
- for path in text.extract_iter(
- page, 'href="https://www.newgrounds.com', '"')
+ for path in text.extract_iter(page, f'href="{self.root}', '"')
]
@@ -531,8 +522,7 @@ class NewgroundsFollowingExtractor(NewgroundsFavoriteExtractor):
for url in self._pagination_favorites(kind, pnum):
yield Message.Queue, url, data
- @staticmethod
- def _extract_favorites(page):
+ def _extract_favorites(self, page):
return [
text.ensure_http_scheme(user.rpartition('"')[2])
for user in text.extract_iter(page, 'class="item-user', '"><img')
@@ -552,8 +542,7 @@ class NewgroundsSearchExtractor(NewgroundsExtractor):
self.query = text.parse_query(query)
def posts(self):
- suitabilities = self.query.get("suitabilities")
- if suitabilities:
+ if suitabilities := self.query.get("suitabilities"):
data = {"view_suitability_" + s: "on"
for s in suitabilities.split(",")}
self.request(self.root + "/suitabilities",
@@ -574,7 +563,7 @@ class NewgroundsSearchExtractor(NewgroundsExtractor):
}
while True:
- data = self.request(url, params=params, headers=headers).json()
+ data = self.request_json(url, params=params, headers=headers)
post_url = None
for post_url in text.extract_iter(data["content"], 'href="', '"'):
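The newgrounds hunks also drop `import re` in favor of `util.re(...)`, which suggests a helper that caches compiled patterns so hot paths such as `_init()` do not recompile them for every extractor instance. A minimal sketch of such a cache (an assumption about the helper, not gallery-dl's actual code):

    import re
    from functools import lru_cache

    @lru_cache(maxsize=None)
    def re_cached(pattern):
        """Compile 'pattern' once and reuse the compiled object."""
        return re.compile(pattern)

    # same pattern object on every call, no recompilation
    findall = re_cached(r'(?:<img |data-smartload-)src="([^"]+)').findall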
diff --git a/gallery_dl/extractor/nhentai.py b/gallery_dl/extractor/nhentai.py
index 0d656d0..b0ec41f 100644
--- a/gallery_dl/extractor/nhentai.py
+++ b/gallery_dl/extractor/nhentai.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2023 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -22,7 +22,7 @@ class NhentaiGalleryExtractor(GalleryExtractor):
example = "https://nhentai.net/g/12345/"
def __init__(self, match):
- url = self.root + "/api/gallery/" + match.group(1)
+ url = self.root + "/api/gallery/" + match[1]
GalleryExtractor.__init__(self, match, url)
def metadata(self, page):
@@ -87,7 +87,8 @@ class NhentaiExtractor(Extractor):
def items(self):
data = {"_extractor": NhentaiGalleryExtractor}
for gallery_id in self._pagination():
- url = "{}/g/{}/".format(self.root, gallery_id)
+ url = f"{self.root}/g/{gallery_id}/"
+ data["gallery_id"] = text.parse_int(gallery_id)
yield Message.Queue, url, data
def _pagination(self):
diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py
index b01c591..c6df835 100644
--- a/gallery_dl/extractor/nijie.py
+++ b/gallery_dl/extractor/nijie.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2023 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,7 @@
"""Extractors for nijie instances"""
-from .common import BaseExtractor, Message, AsynchronousMixin
+from .common import BaseExtractor, Message, Dispatch, AsynchronousMixin
from .. import text, exception
from ..cache import cache
@@ -23,7 +23,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
def __init__(self, match):
BaseExtractor.__init__(self, match)
- self.user_id = text.parse_int(match.group(match.lastindex))
+ self.user_id = text.parse_int(self.groups[-1])
def initialize(self):
self.cookies_domain = "." + self.root.rpartition("/")[2]
@@ -40,7 +40,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
for image_id in self.image_ids():
- url = "{}/view.php?id={}".format(self.root, image_id)
+ url = f"{self.root}/view.php?id={image_id}"
response = self.request(url, fatal=False)
if response.status_code >= 400:
continue
@@ -73,8 +73,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
def image_ids(self):
"""Collect all relevant image-ids"""
- @staticmethod
- def _extract_data(page):
+ def _extract_data(self, page):
"""Extract image metadata from 'page'"""
extr = text.extract_from(page)
keywords = text.unescape(extr(
@@ -90,8 +89,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
"tags" : keywords[2:-1],
}
- @staticmethod
- def _extract_data_horne(page):
+ def _extract_data_horne(self, page):
"""Extract image metadata from 'page'"""
extr = text.extract_from(page)
keywords = text.unescape(extr(
@@ -111,20 +109,20 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
def _extract_images(self, image_id, page):
if '&#diff_1" ' in page:
# multiple images
- url = "{}/view_popup.php?id={}".format(self.root, image_id)
+ url = f"{self.root}/view_popup.php?id={image_id}"
page = self.request(url).text
return [
text.extr(media, ' src="', '"')
for media in text.extract_iter(
page, 'href="javascript:void(0);"><', '>')
+ if ' src="' in media
]
else:
pos = page.find('id="view-center"') + 1
# do NOT use text.extr() here, as it doesn't support a pos argument
return (text.extract(page, 'itemprop="image" src="', '"', pos)[0],)
- @staticmethod
- def _extract_user_name(page):
+ def _extract_user_name(self, page):
return text.unescape(text.extr(page, "<br />", "<"))
def login(self):
@@ -141,7 +139,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
def _login_impl(self, username, password):
self.log.info("Logging in as %s", username)
- url = "{}/login_int.php".format(self.root)
+ url = f"{self.root}/login_int.php"
data = {"email": username, "password": password, "save": "on"}
response = self.request(url, method="POST", data=data)
@@ -150,7 +148,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
return self.cookies
def _pagination(self, path):
- url = "{}/{}.php".format(self.root, path)
+ url = f"{self.root}/{path}.php"
params = {"id": self.user_id, "p": 1}
while True:
@@ -177,18 +175,13 @@ BASE_PATTERN = NijieExtractor.update({
})
-class NijieUserExtractor(NijieExtractor):
+class NijieUserExtractor(Dispatch, NijieExtractor):
"""Extractor for nijie user profiles"""
- subcategory = "user"
- cookies_domain = None
pattern = BASE_PATTERN + r"/members\.php\?id=(\d+)"
example = "https://nijie.info/members.php?id=12345"
- def initialize(self):
- pass
-
def items(self):
- fmt = "{}/{{}}.php?id={}".format(self.root, self.user_id).format
+ fmt = f"{self.root}/{{}}.php?id={self.user_id}".format
return self._dispatch_extractors((
(NijieIllustrationExtractor, fmt("members_illust")),
(NijieDoujinExtractor , fmt("members_dojin")),
@@ -252,8 +245,7 @@ class NijieNuitaExtractor(NijieExtractor):
data["user_name"] = self.user_name
return data
- @staticmethod
- def _extract_user_name(page):
+ def _extract_user_name(self, page):
return text.unescape(text.extr(page, "<title>", "さんの抜いた"))
@@ -266,8 +258,7 @@ class NijieFeedExtractor(NijieExtractor):
def image_ids(self):
return self._pagination("like_user_view")
- @staticmethod
- def _extract_user_name(page):
+ def _extract_user_name(self, page):
return ""
@@ -289,7 +280,7 @@ class NijieFollowedExtractor(NijieExtractor):
for user_id in text.extract_iter(
page, '"><a href="/members.php?id=', '"'):
- user_url = "{}/members.php?id={}".format(self.root, user_id)
+ user_url = f"{self.root}/members.php?id={user_id}"
yield Message.Queue, user_url, data
if '<a rel="next"' not in page:
@@ -303,9 +294,5 @@ class NijieImageExtractor(NijieExtractor):
pattern = BASE_PATTERN + r"/view(?:_popup)?\.php\?id=(\d+)"
example = "https://nijie.info/view.php?id=12345"
- def __init__(self, match):
- NijieExtractor.__init__(self, match)
- self.image_id = match.group(match.lastindex)
-
def image_ids(self):
- return (self.image_id,)
+ return (self.groups[-1],)
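As with newgrounds above, `NijieUserExtractor` becomes a `Dispatch` subclass, shedding the boilerplate `subcategory = "user"` and the no-op `initialize()`. The idea behind `_dispatch_extractors()` is to enqueue one URL per child extractor; a simplified model of that behavior (the real helper presumably also honors include/exclude configuration):

    def dispatch_extractors(pairs):
        """Yield one ("queue", url, data) item per (extractor, url) pair."""
        for extractor_class, url in pairs:
            yield "queue", url, {"_extractor": extractor_class}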
diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py
index cfc8861..69d8299 100644
--- a/gallery_dl/extractor/nitter.py
+++ b/gallery_dl/extractor/nitter.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2022-2023 Mike Fährmann
+# Copyright 2022-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -24,15 +24,13 @@ class NitterExtractor(BaseExtractor):
self.cookies_domain = self.root.partition("://")[2]
BaseExtractor.__init__(self, match)
- lastindex = match.lastindex
- self.user = match.group(lastindex)
- self.user_id = match.group(lastindex + 1)
+ self.user = self.groups[-2]
+ self.user_id = self.groups[-1]
self.user_obj = None
def items(self):
retweets = self.config("retweets", False)
- videos = self.config("videos", True)
- if videos:
+ if videos := self.config("videos", True):
ytdl = (videos == "ytdl")
videos = True
self.cookies.set("hlsPlayback", "on", domain=self.cookies_domain)
@@ -43,11 +41,8 @@ class NitterExtractor(BaseExtractor):
self.log.debug("Skipping %s (retweet)", tweet["tweet_id"])
continue
- attachments = tweet.pop("_attach", "")
- if attachments:
+ if attachments := tweet.pop("_attach", ""):
files = []
- append = files.append
-
for url in text.extract_iter(
attachments, 'href="', '"'):
@@ -67,15 +62,12 @@ class NitterExtractor(BaseExtractor):
file = {"url": url, "_http_retry": _retry_on_404}
file["filename"], _, file["extension"] = \
name.rpartition(".")
- append(file)
+ files.append(file)
if videos and not files:
if ytdl:
- append({
- "url": "ytdl:{}/i/status/{}".format(
- self.root, tweet["tweet_id"]),
- "extension": None,
- })
+ url = f"ytdl:{self.root}/i/status/{tweet['tweet_id']}"
+ files.append({"url": url, "extension": "mp4"})
else:
for url in text.extract_iter(
attachments, 'data-url="', '"'):
@@ -88,7 +80,7 @@ class NitterExtractor(BaseExtractor):
if url[0] == "/":
url = self.root + url
- append({
+ files.append({
"url" : "ytdl:" + url,
"filename" : name.rpartition(".")[0],
"extension": "mp4",
@@ -98,7 +90,8 @@ class NitterExtractor(BaseExtractor):
attachments, '<source src="', '"'):
if url[0] == "/":
url = self.root + url
- append(text.nameext_from_url(url, {"url": url}))
+ files.append(
+ text.nameext_from_url(url, {"url": url}))
else:
files = ()
@@ -206,10 +199,10 @@ class NitterExtractor(BaseExtractor):
if self.user_id:
self.user = self.request(
- "{}/i/user/{}".format(self.root, self.user_id),
+ f"{self.root}/i/user/{self.user_id}",
allow_redirects=False,
).headers["location"].rpartition("/")[2]
- base_url = url = "{}/{}{}".format(self.root, self.user, path)
+ base_url = url = f"{self.root}/{self.user}{path}"
while True:
tweets_html = self.request(url).text.split(
@@ -285,7 +278,7 @@ class NitterTweetExtractor(NitterExtractor):
example = "https://nitter.net/USER/status/12345"
def tweets(self):
- url = "{}/i/status/{}".format(self.root, self.user)
+ url = f"{self.root}/i/status/{self.user}"
html = text.extr(self.request(url).text, 'class="main-tweet', '''\
</div>
</div></div></div>''')
diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py
index 3d1722a..21c361c 100644
--- a/gallery_dl/extractor/nozomi.py
+++ b/gallery_dl/extractor/nozomi.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -32,8 +32,8 @@ class NozomiExtractor(Extractor):
data = self.metadata()
for post_id in map(str, self.posts()):
- url = "https://j.{}/post/{}/{}/{}.json".format(
- self.domain, post_id[-1], post_id[-3:-1], post_id)
+ url = (f"https://j.{self.domain}/post"
+ f"/{post_id[-1]}/{post_id[-3:-1]}/{post_id}.json")
response = self.request(url, fatal=False)
if response.status_code >= 400:
@@ -77,8 +77,8 @@ class NozomiExtractor(Extractor):
ext = "webp"
post["extension"] = ext
- post["url"] = url = "https://{}.{}/{}/{}/{}.{}".format(
- subdomain, self.domain, did[-1], did[-3:-1], did, ext)
+ post["url"] = url = (f"https://{subdomain}.{self.domain}"
+ f"/{did[-1]}/{did[-3:-1]}/{did}.{ext}")
yield Message.Url, url, post
def posts(self):
@@ -86,7 +86,7 @@ class NozomiExtractor(Extractor):
offset = (text.parse_int(self.pnum, 1) - 1) * 256
while True:
- headers = {"Range": "bytes={}-{}".format(offset, offset+255)}
+ headers = {"Range": f"bytes={offset}-{offset + 255}"}
response = self.request(url, headers=headers)
yield from decode_nozomi(response.content)
@@ -98,8 +98,7 @@ class NozomiExtractor(Extractor):
def metadata(self):
return {}
- @staticmethod
- def _list(src):
+ def _list(self, src):
return [x["tagname_display"] for x in src] if src else ()
@@ -111,7 +110,7 @@ class NozomiPostExtractor(NozomiExtractor):
def __init__(self, match):
NozomiExtractor.__init__(self, match)
- self.post_id = match.group(1)
+ self.post_id = match[1]
def posts(self):
return (self.post_id,)
@@ -127,7 +126,7 @@ class NozomiIndexExtractor(NozomiExtractor):
def __init__(self, match):
NozomiExtractor.__init__(self, match)
index, self.pnum = match.groups()
- self.nozomi = "/{}.nozomi".format(index or "index")
+ self.nozomi = f"/{index or 'index'}.nozomi"
class NozomiTagExtractor(NozomiExtractor):
@@ -142,7 +141,7 @@ class NozomiTagExtractor(NozomiExtractor):
NozomiExtractor.__init__(self, match)
tags, self.pnum = match.groups()
self.tags = text.unquote(tags)
- self.nozomi = "/nozomi/{}.nozomi".format(self.tags)
+ self.nozomi = f"/nozomi/{self.tags}.nozomi"
def metadata(self):
return {"search_tags": self.tags}
@@ -158,7 +157,7 @@ class NozomiSearchExtractor(NozomiExtractor):
def __init__(self, match):
NozomiExtractor.__init__(self, match)
- self.tags = text.unquote(match.group(1)).split()
+ self.tags = text.unquote(match[1]).split()
def metadata(self):
return {"search_tags": self.tags}
@@ -169,7 +168,7 @@ class NozomiSearchExtractor(NozomiExtractor):
negative = []
def nozomi(path):
- url = "https://j.{}/{}.nozomi".format(self.domain, path)
+ url = f"https://j.{self.domain}/{path}.nozomi"
return decode_nozomi(self.request(url).content)
for tag in self.tags:
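The nozomi extractor keeps its trick of reading `.nozomi` index files in 256-byte windows via HTTP `Range` headers; `decode_nozomi()` then turns each window into post IDs. Assuming the format is a flat array of big-endian 32-bit integers (which is what the 256-byte window and 4-byte arithmetic imply), a sketch of one fetch-and-decode step:

    import struct
    import requests

    def fetch_post_ids(url, offset=0, size=256):
        """Fetch one window of a .nozomi index and decode its post IDs."""
        headers = {"Range": f"bytes={offset}-{offset + size - 1}"}
        data = requests.get(url, headers=headers).content
        # every 4 bytes form one big-endian unsigned 32-bit post ID
        return [n for (n,) in struct.iter_unpack(">I", data)]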
diff --git a/gallery_dl/extractor/nsfwalbum.py b/gallery_dl/extractor/nsfwalbum.py
index eb5d31f..0b84f9c 100644
--- a/gallery_dl/extractor/nsfwalbum.py
+++ b/gallery_dl/extractor/nsfwalbum.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -25,7 +25,7 @@ class NsfwalbumAlbumExtractor(GalleryExtractor):
example = "https://nsfwalbum.com/album/12345"
def __init__(self, match):
- self.album_id = match.group(2)
+ self.album_id = match[2]
GalleryExtractor.__init__(self, match)
def metadata(self, page):
@@ -53,7 +53,7 @@ class NsfwalbumAlbumExtractor(GalleryExtractor):
self.request(iframe + image_id).text,
'giraffe.annihilate("', '"')[0])
params = {"spirit": spirit, "photo": image_id}
- data = self.request(backend, params=params).json()
+ data = self.request_json(backend, params=params)
break
except Exception:
tries += 1
@@ -66,17 +66,15 @@ class NsfwalbumAlbumExtractor(GalleryExtractor):
"width" : text.parse_int(data[1]),
"height": text.parse_int(data[2]),
"_http_validate": self._validate_response,
- "_fallback": ("{}/imageProxy.php?photoId={}&spirit={}".format(
- self.root, image_id, spirit),),
+ "_fallback": (f"{self.root}/imageProxy.php"
+ f"?photoId={image_id}&spirit={spirit}",),
}
- @staticmethod
- def _validate_response(response):
+ def _validate_response(self, response):
return not response.url.endswith(
("/no_image.jpg", "/placeholder.png", "/error.jpg"))
- @staticmethod
- def _annihilate(value, base=6):
+ def _annihilate(self, value, base=6):
return "".join(
chr(ord(char) ^ base)
for char in value
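`_annihilate()` above is a plain XOR deobfuscator: every character is XORed with a small constant, and because XOR is its own inverse, one function serves as both encoder and decoder. A worked example:

    def annihilate(value, base=6):
        # XOR each character with 'base'; applying the function twice
        # returns the original string (XOR is an involution)
        return "".join(chr(ord(char) ^ base) for char in value)

    scrambled = annihilate("https://example.org/photo.jpg")
    assert annihilate(scrambled) == "https://example.org/photo.jpg"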
diff --git a/gallery_dl/extractor/nudostar.py b/gallery_dl/extractor/nudostar.py
new file mode 100644
index 0000000..467d36a
--- /dev/null
+++ b/gallery_dl/extractor/nudostar.py
@@ -0,0 +1,71 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://nudostar.tv/"""
+
+from .common import GalleryExtractor, Message
+from .. import text
+
+BASE_PATTERN = r"(?:https?://)?(?:[a-z]{2}.)?nudostar\.tv"
+
+
+class NudostarExtractor(GalleryExtractor):
+ """Base class for NudoStar extractors"""
+ category = "nudostar"
+ root = "https://nudostar.tv"
+
+
+class NudostarModelExtractor(NudostarExtractor):
+ """Extractor for NudoStar models"""
+ subcategory = "model"
+ pattern = BASE_PATTERN + r"(/models/([^/?#]+)/?)$"
+ example = "https://nudostar.tv/models/MODEL/"
+
+ def metadata(self, page):
+ names = text.extr(page, "<title>", "<").rpartition(
+ " Nude ")[0].split(" / ")
+ slug = self.groups[1]
+
+ return {
+ "gallery_id" : slug,
+ "model_slug" : slug,
+ "model_names": names,
+ "model" : names[0],
+ "title" : "",
+ }
+
+ def images(self, page):
+ path = text.extr(page, '" src="https://nudostar.tv', '"')
+ path, cnt, end = path.rsplit("_", 2)
+
+ base = f"{self.root}{path}_"
+ ext = "." + end.rpartition(".")[2]
+
+ return [
+ (f"{base}{i:04}{ext}", None)
+ for i in range(1, int(cnt)+1)
+ ]
+
+
+class NudostarImageExtractor(NudostarExtractor):
+ """Extractor for NudoStar images"""
+ subcategory = "image"
+ pattern = BASE_PATTERN + r"(/models/([^/?#]+)/(\d+)/)"
+ example = "https://nudostar.tv/models/MODEL/123/"
+
+ def items(self):
+ page = self.request(self.page_url, notfound=self.subcategory).text
+
+ img_url = text.extract(
+ page, 'src="', '"', page.index('class="headline"'))[0]
+
+ data = NudostarModelExtractor.metadata(self, page)
+ data = text.nameext_from_url(img_url, data)
+ data["num"] = text.parse_int(self.groups[2])
+ data["url"] = img_url
+
+ yield Message.Directory, data
+ yield Message.Url, img_url, data
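`NudostarModelExtractor.images()` in this new file expands a whole gallery from one sample URL: it splits a `<stem>_<count>_<num>.<ext>` path and counts upward with zero-padding. The same expansion as a standalone function (filename layout inferred from the code above):

    def expand_gallery(sample_path, root="https://nudostar.tv"):
        """Build all image URLs from one '<stem>_<count>_<num>.<ext>' path."""
        stem, count, last = sample_path.rsplit("_", 2)
        ext = "." + last.rpartition(".")[2]
        return [f"{root}{stem}_{i:04}{ext}"
                for i in range(1, int(count) + 1)]

    # "/models/foo/foo_0003_0001.jpg" -> foo_0001.jpg ... foo_0003.jpg
    urls = expand_gallery("/models/foo/foo_0003_0001.jpg")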
diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
index 815a214..2d9a061 100644
--- a/gallery_dl/extractor/oauth.py
+++ b/gallery_dl/extractor/oauth.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2017-2023 Mike Fährmann
+# Copyright 2017-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -74,8 +74,7 @@ class OAuthBase(Extractor):
"""Open 'url' in browser amd return response parameters"""
url += "?" + urllib.parse.urlencode(params)
- browser = self.config("browser", True)
- if browser:
+ if browser := self.config("browser", True):
try:
import webbrowser
browser = webbrowser.get()
@@ -83,18 +82,17 @@ class OAuthBase(Extractor):
browser = None
if browser and browser.open(url):
- name = getattr(browser, "name", None)
- if name:
+ if name := getattr(browser, "name", None):
self.log.info("Opening URL with %s:", name.capitalize())
else:
self.log.info("Please open this URL in your browser:")
- stdout_write("\n{}\n\n".format(url))
+ stdout_write(f"\n{url}\n\n")
return (recv or self.recv)()
def error(self, msg):
return self.send(
- "Remote server reported an error:\n\n{}\n".format(msg))
+ f"Remote server reported an error:\n\n{msg}\n")
def _oauth1_authorization_flow(
self, default_key, default_secret,
@@ -151,10 +149,7 @@ class OAuthBase(Extractor):
"default" if client_id == default_id else "custom",
instance or self.subcategory, client_id)
- state = "gallery-dl_{}_{}".format(
- self.subcategory,
- oauth.nonce(8),
- )
+ state = f"gallery-dl_{self.subcategory}_{oauth.nonce(8)}"
auth_params = {
"client_id" : client_id,
@@ -170,8 +165,8 @@ class OAuthBase(Extractor):
# check authorization response
if state != params.get("state"):
- self.send("'state' mismatch: expected {}, got {}.\n".format(
- state, params.get("state")))
+ self.send(f"'state' mismatch: expected {state}, "
+ f"got {params.get('state')}.\n")
return
if "error" in params:
return self.error(params)
@@ -190,8 +185,8 @@ class OAuthBase(Extractor):
data["client_id"] = client_id
data["client_secret"] = client_secret
- data = self.request(
- token_url, method="POST", data=data, auth=auth).json()
+ data = self.request_json(
+ token_url, method="POST", data=data, auth=auth)
# check token response
if "error" in data:
@@ -217,27 +212,23 @@ class OAuthBase(Extractor):
("These values have", "these values", "are", "them")
)
- msg = "\nYour {} {}\n\n{}\n\n".format(
- " and ".join("'" + n + "'" for n in names),
- _is,
- "\n".join(values),
- )
+ key = " and ".join(f"'{n}'" for n in names)
+ val = "\n".join(values)
+ msg = f"\nYour {key} {_is}\n\n{val}\n\n"
opt = self.oauth_config(names[0])
if self.cache and (opt is None or opt == "cache"):
msg += _vh + " been cached and will automatically be used.\n"
else:
- msg += "Put " + _va + " into your configuration file as \n"
+ msg += f"Put {_va} into your configuration file as \n"
msg += " and\n".join(
- "'extractor." + self.subcategory + "." + n + "'"
+ f"'extractor.{self.subcategory}.{n}'"
for n in names
)
if self.cache:
- msg += (
- "\nor set\n'extractor.{}.{}' to \"cache\""
- .format(self.subcategory, names[0])
- )
- msg += "\nto use {}.\n".format(_it)
+ msg = (f"{msg}\nor set\n'extractor."
+ f"{self.subcategory}.{names[0]}' to \"cache\"")
+ msg = f"{msg}\nto use {_it}.\n"
return msg
@@ -354,7 +345,7 @@ class OAuthMastodon(OAuthBase):
def __init__(self, match):
OAuthBase.__init__(self, match)
- self.instance = match.group(1)
+ self.instance = match[1]
def items(self):
yield Message.Version, 1
@@ -371,8 +362,8 @@ class OAuthMastodon(OAuthBase):
application["client-secret"],
application["client-id"],
application["client-secret"],
- "https://{}/oauth/authorize".format(self.instance),
- "https://{}/oauth/token".format(self.instance),
+ f"https://{self.instance}/oauth/authorize",
+ f"https://{self.instance}/oauth/token",
instance=self.instance,
key="access_token",
cache=mastodon._access_token_cache,
@@ -382,17 +373,17 @@ class OAuthMastodon(OAuthBase):
def _register(self, instance):
self.log.info("Registering application for '%s'", instance)
- url = "https://{}/api/v1/apps".format(instance)
+ url = f"https://{instance}/api/v1/apps"
data = {
"client_name": "gdl:" + oauth.nonce(8),
"redirect_uris": self.redirect_uri,
"scopes": "read",
}
- data = self.request(url, method="POST", data=data).json()
+ data = self.request_json(url, method="POST", data=data)
if "client_id" not in data or "client_secret" not in data:
- raise exception.StopExtraction(
- "Failed to register new application: '%s'", data)
+ raise exception.AbortExtraction(
+ f"Failed to register new application: '{data}'")
data["client-id"] = data.pop("client_id")
data["client-secret"] = data.pop("client_secret")
@@ -443,11 +434,11 @@ class OAuthPixiv(OAuthBase):
"redirect_uri" : "https://app-api.pixiv.net"
"/web/v1/users/auth/pixiv/callback",
}
- data = self.request(
- url, method="POST", headers=headers, data=data).json()
+ data = self.request_json(
+ url, method="POST", headers=headers, data=data)
if "error" in data:
- stdout_write("\n{}\n".format(data))
+ stdout_write(f"\n{data}\n")
if data["error"] in ("invalid_request", "invalid_grant"):
stdout_write("'code' expired, try again\n\n")
return
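The OAuth flow above builds a random `state` value (`gallery-dl_<subcategory>_<nonce>`) and refuses to continue when the redirect's `state` parameter differs, the standard CSRF guard for OAuth 2.0 authorization codes. A generic sketch of that check, independent of gallery-dl:

    import secrets

    def new_state(prefix="gallery-dl"):
        return f"{prefix}_{secrets.token_urlsafe(8)}"

    def check_state(expected, params):
        # constant-time comparison avoids leaking the token
        received = params.get("state", "")
        if not secrets.compare_digest(expected, received):
            raise ValueError(
                f"'state' mismatch: expected {expected}, got {received!r}")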
diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py
index 2330b08..5245f31 100644
--- a/gallery_dl/extractor/paheal.py
+++ b/gallery_dl/extractor/paheal.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2023 Mike Fährmann
+# Copyright 2018-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -42,7 +42,7 @@ class PahealExtractor(Extractor):
"""Return an iterable containing data of all relevant posts"""
def _extract_post(self, post_id):
- url = "{}/post/view/{}".format(self.root, post_id)
+ url = f"{self.root}/post/view/{post_id}"
extr = text.extract_from(self.request(url).text)
post = {
@@ -64,7 +64,7 @@ class PahealExtractor(Extractor):
post["width"], _, height = dimensions.partition("x")
post["height"], _, duration = height.partition(", ")
post["duration"] = text.parse_float(duration[:-1])
- post["filename"] = "{} - {}".format(post_id, post["tags"])
+ post["filename"] = f"{post_id} - {post['tags']}"
post["extension"] = ext
return post
@@ -80,10 +80,6 @@ class PahealTagExtractor(PahealExtractor):
page_start = 1
per_page = 70
- def __init__(self, match):
- PahealExtractor.__init__(self, match)
- self.tags = text.unquote(match.group(1))
-
def _init(self):
if self.config("metadata"):
self._extract_data = self._extract_data_ex
@@ -94,13 +90,14 @@ class PahealTagExtractor(PahealExtractor):
return pages * self.per_page
def get_metadata(self):
- return {"search_tags": self.tags}
+ return {"search_tags": text.unquote(self.groups[0])}
def get_posts(self):
pnum = self.page_start
+ base = f"{self.root}/post/list/{self.groups[0]}/"
+
while True:
- url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum)
- page = self.request(url).text
+ page = self.request(base + str(pnum)).text
pos = page.find("id='image-list'")
for post in text.extract_iter(
@@ -111,8 +108,7 @@ class PahealTagExtractor(PahealExtractor):
return
pnum += 1
- @staticmethod
- def _extract_data(post):
+ def _extract_data(self, post):
pid , pos = text.extract(post, "", "'")
data, pos = text.extract(post, "title='", "'", pos)
md5 , pos = text.extract(post, "/_thumbs/", "/", pos)
@@ -133,7 +129,7 @@ class PahealTagExtractor(PahealExtractor):
"tags" : text.unescape(tags),
"size" : text.parse_bytes(size[:-1]),
"date" : text.parse_datetime(date, "%B %d, %Y; %H:%M"),
- "filename" : "{} - {}".format(pid, tags),
+ "filename" : f"{pid} - {tags}",
"extension": ext,
}
@@ -149,9 +145,5 @@ class PahealPostExtractor(PahealExtractor):
r"/post/view/(\d+)")
example = "https://rule34.paheal.net/post/view/12345"
- def __init__(self, match):
- PahealExtractor.__init__(self, match)
- self.post_id = match.group(1)
-
def get_posts(self):
- return (self._extract_post(self.post_id),)
+ return (self._extract_post(self.groups[0]),)
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index 2b6742e..fb2f32c 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -35,20 +35,21 @@ class PatreonExtractor(Extractor):
self.session.headers["User-Agent"] = \
"Patreon/7.6.28 (Android; Android 11; Scale/2.10)"
- format_images = self.config("format-images")
- if format_images:
+ if format_images := self.config("format-images"):
self._images_fmt = format_images
self._images_url = self._images_url_fmt
+ self._cursor = None
+
def items(self):
generators = self._build_file_generators(self.config("files"))
for post in self.posts():
+ yield Message.Directory, post
if not post.get("current_user_can_view", True):
self.log.warning("Not allowed to view post %s", post["id"])
continue
- yield Message.Directory, post
post["num"] = 0
hashes = set()
@@ -63,18 +64,24 @@ class PatreonExtractor(Extractor):
text.nameext_from_url(name, post)
if text.ext_from_url(url) == "m3u8":
url = "ytdl:" + url
+ headers = {"referer": self.root + "/"}
post["_ytdl_manifest"] = "hls"
+ post["_ytdl_manifest_headers"] = headers
+ post["_ytdl_extra"] = {"http_headers": headers}
post["extension"] = "mp4"
yield Message.Url, url, post
else:
self.log.debug("skipping %s (%s %s)", url, fhash, kind)
+ def finalize(self):
+ if self._cursor:
+ self.log.info("Use '-o cursor=%s' to continue downloading "
+ "from the current position", self._cursor)
+
def _postfile(self, post):
- postfile = post.get("post_file")
- if postfile:
+ if postfile := post.get("post_file"):
url = postfile["url"]
- name = postfile.get("name")
- if not name:
+ if not (name := postfile.get("name")):
if url.startswith("https://stream.mux.com/"):
name = url
else:
@@ -83,11 +90,11 @@ class PatreonExtractor(Extractor):
return ()
def _images(self, post):
- for image in post.get("images") or ():
- url = self._images_url(image)
- if url:
- name = image.get("file_name") or self._filename(url) or url
- yield "image", url, name
+ if images := post.get("images"):
+ for image in images:
+ if url := self._images_url(image):
+ name = image.get("file_name") or self._filename(url) or url
+ yield "image", url, name
def _images_url(self, image):
return image.get("download_url")
@@ -99,32 +106,26 @@ class PatreonExtractor(Extractor):
return image.get("download_url")
def _image_large(self, post):
- image = post.get("image")
- if image:
- url = image.get("large_url")
- if url:
+ if image := post.get("image"):
+ if url := image.get("large_url"):
name = image.get("file_name") or self._filename(url) or url
return (("image_large", url, name),)
return ()
def _attachments(self, post):
for attachment in post.get("attachments") or ():
- url = self.request_location(attachment["url"], fatal=False)
- if url:
+ if url := self.request_location(attachment["url"], fatal=False):
yield "attachment", url, attachment["name"]
for attachment in post.get("attachments_media") or ():
- url = attachment.get("download_url")
- if url:
+ if url := attachment.get("download_url"):
yield "attachment", url, attachment["file_name"]
def _content(self, post):
- content = post.get("content")
- if content:
+ if content := post.get("content"):
for img in text.extract_iter(
content, '<img data-media-id="', '>'):
- url = text.extr(img, 'src="', '"')
- if url:
+ if url := text.extr(img, 'src="', '"'):
yield "content", url, self._filename(url) or url
def posts(self):
@@ -136,8 +137,9 @@ class PatreonExtractor(Extractor):
}
while url:
+ self._update_cursor(url)
url = text.ensure_http_scheme(url)
- posts = self.request(url, headers=headers).json()
+ posts = self.request_json(url, headers=headers)
if "included" in posts:
included = self._transform(posts["included"])
@@ -145,56 +147,67 @@ class PatreonExtractor(Extractor):
yield self._process(post, included)
if "links" not in posts:
- return
+ break
url = posts["links"].get("next")
+ self._update_cursor("")
+
+ def _init_cursor(self):
+ if cursor := self.config("cursor", True):
+ return "" if cursor is True else cursor
+ self._update_cursor = util.identity
+ return ""
+
+ def _update_cursor(self, url):
+ params = text.parse_query(url.partition("?")[2])
+ self._cursor = cursor = params.get("page[cursor]")
+ if cursor:
+ self.log.debug("Cursor: %s", cursor)
+ return cursor
+
def _process(self, post, included):
"""Process and extend a 'post' object"""
attr = post["attributes"]
attr["id"] = text.parse_int(post["id"])
- if attr.get("current_user_can_view", True):
-
- relationships = post["relationships"]
- attr["images"] = self._files(
- post, included, "images")
- attr["attachments"] = self._files(
- post, included, "attachments")
- attr["attachments_media"] = self._files(
- post, included, "attachments_media")
- attr["date"] = text.parse_datetime(
- attr["published_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
+ relationships = post["relationships"]
+ attr["images"] = self._files(
+ post, included, "images")
+ attr["attachments"] = self._files(
+ post, included, "attachments")
+ attr["attachments_media"] = self._files(
+ post, included, "attachments_media")
+ attr["date"] = text.parse_datetime(
+ attr["published_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
- try:
- attr["campaign"] = (included["campaign"][
- relationships["campaign"]["data"]["id"]])
- except Exception:
- attr["campaign"] = None
+ try:
+ attr["campaign"] = (included["campaign"][
+ relationships["campaign"]["data"]["id"]])
+ except Exception:
+ attr["campaign"] = None
- tags = relationships.get("user_defined_tags")
- attr["tags"] = [
- tag["id"].replace("user_defined;", "")
- for tag in tags["data"]
- if tag["type"] == "post_tag"
- ] if tags else []
+ tags = relationships.get("user_defined_tags")
+ attr["tags"] = [
+ tag["id"].replace("user_defined;", "")
+ for tag in tags["data"]
+ if tag["type"] == "post_tag"
+ ] if tags else []
- user = relationships["user"]
- attr["creator"] = (
- self._user(user["links"]["related"]) or
- included["user"][user["data"]["id"]])
+ user = relationships["user"]
+ attr["creator"] = (
+ self._user(user["links"]["related"]) or
+ included["user"][user["data"]["id"]])
return attr
- @staticmethod
- def _transform(included):
+ def _transform(self, included):
"""Transform 'included' into an easier to handle format"""
result = collections.defaultdict(dict)
for inc in included:
result[inc["type"]][inc["id"]] = inc["attributes"]
return result
- @staticmethod
- def _files(post, included, key):
+ def _files(self, post, included, key):
"""Build a list of files"""
files = post["relationships"].get(key)
if files and files.get("data"):
@@ -223,8 +236,7 @@ class PatreonExtractor(Extractor):
cd = response.headers.get("Content-Disposition")
return text.extr(cd, 'filename="', '"')
- @staticmethod
- def _filehash(url):
+ def _filehash(self, url):
"""Extract MD5 hash from a download URL"""
parts = url.partition("?")[0].split("/")
parts.reverse()
@@ -234,10 +246,9 @@ class PatreonExtractor(Extractor):
return part
return ""
- @staticmethod
- def _build_url(endpoint, query):
+ def _build_url(self, endpoint, query):
return (
- "https://www.patreon.com/api/" + endpoint +
+ f"https://www.patreon.com/api/{endpoint}"
"?include=campaign,access_rules,attachments,attachments_media,"
"audio,images,media,native_video_insights,poll.choices,"
@@ -267,7 +278,10 @@ class PatreonExtractor(Extractor):
"&fields[media]=id,image_urls,download_url,metadata,file_name"
"&fields[native_video_insights]=average_view_duration,"
"average_view_pct,has_preview,id,last_updated_at,num_views,"
- "preview_views,video_duration" + query +
+ "preview_views,video_duration"
+
+ f"&page[cursor]={self._init_cursor()}"
+ f"{query}"
"&json-api-version=1.0"
)
@@ -307,18 +321,16 @@ class PatreonExtractor(Extractor):
if bootstrap:
return util.json_loads(bootstrap + "}")
- bootstrap = text.extr(page, "window.patreon.bootstrap,", "});")
- if bootstrap:
+ if bootstrap := text.extr(page, "window.patreon.bootstrap,", "});"):
return util.json_loads(bootstrap + "}")
- data = text.extr(page, "window.patreon = {", "};\n")
- if data:
+ if data := text.extr(page, "window.patreon = {", "};\n"):
try:
- return util.json_loads("{" + data + "}")["bootstrap"]
+ return util.json_loads(f"{{{data}}}")["bootstrap"]
except Exception:
pass
- raise exception.StopExtraction("Unable to extract bootstrap data")
+ raise exception.AbortExtraction("Unable to extract bootstrap data")
class PatreonCreatorExtractor(PatreonExtractor):
@@ -327,57 +339,63 @@ class PatreonCreatorExtractor(PatreonExtractor):
pattern = (r"(?:https?://)?(?:www\.)?patreon\.com"
r"/(?!(?:home|create|login|signup|search|posts|messages)"
r"(?:$|[/?#]))"
- r"(?:profile/creators|(?:c/)?([^/?#]+)(?:/posts)?)"
+ r"(?:profile/creators|(?:cw?/)?([^/?#]+)(?:/posts)?)"
r"/?(?:\?([^#]+))?")
example = "https://www.patreon.com/c/USER"
def posts(self):
creator, query = self.groups
- query = text.parse_query(query)
- campaign_id = self._get_campaign_id(creator, query)
- filters = self._get_filters(query)
-
+ params = text.parse_query(query)
+ campaign_id = self._get_campaign_id(creator, params)
self.log.debug("campaign_id: %s", campaign_id)
url = self._build_url("posts", (
- "&filter[campaign_id]=" + campaign_id +
+ f"&filter[campaign_id]={campaign_id}"
"&filter[contains_exclusive_posts]=true"
- "&filter[is_draft]=false" + filters +
- "&sort=" + query.get("sort", "-published_at")
+ "&filter[is_draft]=false"
+ f"{self._get_filters(params)}"
+ f"&sort={params.get('sort', '-published_at')}"
))
return self._pagination(url)
- def _get_campaign_id(self, creator, query):
+ def _get_campaign_id(self, creator, params):
if creator and creator.startswith("id:"):
return creator[3:]
- campaign_id = query.get("c") or query.get("campaign_id")
- if campaign_id:
+ if campaign_id := params.get("c") or params.get("campaign_id"):
return campaign_id
- user_id = query.get("u")
- if user_id:
- url = "{}/user?u={}".format(self.root, user_id)
+ if user_id := params.get("u"):
+ url = f"{self.root}/user?u={user_id}"
else:
- url = "{}/{}".format(self.root, creator)
+ url = f"{self.root}/{creator}"
page = self.request(url, notfound="creator").text
try:
data = None
data = self._extract_bootstrap(page)
return data["campaign"]["data"]["id"]
+ except exception.ControlException:
+ pass
except Exception as exc:
if data:
self.log.debug(data)
- raise exception.StopExtraction(
- "Unable to extract campaign ID (%s: %s)",
- exc.__class__.__name__, exc)
+ raise exception.AbortExtraction(
+ f"Unable to extract campaign ID "
+ f"({exc.__class__.__name__}: {exc})")
+
+ # Next.js 13
+ if cid := text.extr(
+ page, r'{\"value\":{\"campaign\":{\"data\":{\"id\":\"', '\\"'):
+ return cid
- def _get_filters(self, query):
+ raise exception.AbortExtraction("Failed to extract campaign ID")
+
+ def _get_filters(self, params):
return "".join(
- "&filter[{}={}".format(key[8:], text.escape(value))
- for key, value in query.items()
+ f"&filter[{key[8:]}={text.escape(value)}"
+ for key, value in params.items()
if key.startswith("filters[")
)
@@ -389,8 +407,12 @@ class PatreonUserExtractor(PatreonExtractor):
example = "https://www.patreon.com/home"
def posts(self):
+ if date_max := self._get_date_min_max(None, None)[1]:
+ self._cursor = cursor = \
+ util.datetime_from_timestamp(date_max).isoformat()
+ self._init_cursor = lambda: cursor
+
url = self._build_url("stream", (
- "&page[cursor]=null"
"&filter[is_following]=true"
"&json-api-use-default-includes=false"
))
@@ -404,7 +426,7 @@ class PatreonPostExtractor(PatreonExtractor):
example = "https://www.patreon.com/posts/TITLE-12345"
def posts(self):
- url = "{}/posts/{}".format(self.root, self.groups[0])
+ url = f"{self.root}/posts/{self.groups[0]}"
page = self.request(url, notfound="post").text
bootstrap = self._extract_bootstrap(page)
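The largest behavioral change in the patreon diff is resumable pagination: `_init_cursor()` seeds `page[cursor]` from the `cursor` option, `_update_cursor()` records the cursor of every fetched page, and `finalize()` prints an `-o cursor=...` hint when a run stops early. A condensed model of that pattern (configuration and logging interfaces simplified):

    class CursorPaginator:
        """Remember the last pagination cursor so a run can resume."""

        def __init__(self, start_cursor=""):
            self.cursor = start_cursor  # "" means start from the top

        def pages(self, fetch):
            while True:
                page = fetch(self.cursor)   # one API page per call
                yield page["data"]
                next_cursor = page.get("next_cursor")
                if not next_cursor:
                    self.cursor = None      # finished; nothing to resume
                    return
                self.cursor = next_cursor   # checkpoint for the hint

    def finalize(paginator, log):
        if paginator.cursor:
            log.info("Use '-o cursor=%s' to continue downloading "
                     "from the current position", paginator.cursor)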
diff --git a/gallery_dl/extractor/pexels.py b/gallery_dl/extractor/pexels.py
index 804623b..f95d409 100644
--- a/gallery_dl/extractor/pexels.py
+++ b/gallery_dl/extractor/pexels.py
@@ -105,7 +105,7 @@ class PexelsImageExtractor(PexelsExtractor):
example = "https://www.pexels.com/photo/SLUG-12345/"
def posts(self):
- url = "{}/photo/{}/".format(self.root, self.groups[0])
+ url = f"{self.root}/photo/{self.groups[0]}/"
page = self.request(url).text
return (self._extract_nextdata(page)["props"]["pageProps"]["medium"],)
@@ -132,7 +132,7 @@ class PexelsAPI():
}
def collections_media(self, collection_id):
- endpoint = "/v3/collections/{}/media".format(collection_id)
+ endpoint = f"/v3/collections/{collection_id}/media"
params = {
"page" : "1",
"per_page": "24",
@@ -153,7 +153,7 @@ class PexelsAPI():
return self._pagination(endpoint, params)
def users_media_recent(self, user_id):
- endpoint = "/v3/users/{}/media/recent".format(user_id)
+ endpoint = f"/v3/users/{user_id}/media/recent"
params = {
"page" : "1",
"per_page": "24",
@@ -175,7 +175,7 @@ class PexelsAPI():
else:
self.extractor.log.debug(response.text)
- raise exception.StopExtraction("API request failed")
+ raise exception.AbortExtraction("API request failed")
def _pagination(self, endpoint, params):
while True:
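`PexelsImageExtractor.posts()` above pulls its post object from `_extract_nextdata(page)`, i.e. the JSON payload Next.js embeds in server-rendered pages. A hedged sketch of what such a helper usually does (the `__NEXT_DATA__` element id is the Next.js convention, not something this patch shows):

    import json

    def extract_nextdata(page):
        """Extract the __NEXT_DATA__ JSON blob from a Next.js page."""
        tail = page.partition(
            '<script id="__NEXT_DATA__" type="application/json">')[2]
        return json.loads(tail.partition("</script>")[0])

    # medium = extract_nextdata(html)["props"]["pageProps"]["medium"]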
diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py
index 201d4d6..8891dc0 100644
--- a/gallery_dl/extractor/philomena.py
+++ b/gallery_dl/extractor/philomena.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2021-2023 Mike Fährmann
+# Copyright 2021-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -35,8 +35,7 @@ class PhilomenaExtractor(BooruExtractor):
return url.rpartition(".")[0] + ".svg"
return url
- @staticmethod
- def _prepare(post):
+ def _prepare(self, post):
post["date"] = text.parse_datetime(
post["created_at"][:19], "%Y-%m-%dT%H:%M:%S")
@@ -66,12 +65,8 @@ class PhilomenaPostExtractor(PhilomenaExtractor):
pattern = BASE_PATTERN + r"/(?:images/)?(\d+)"
example = "https://derpibooru.org/images/12345"
- def __init__(self, match):
- PhilomenaExtractor.__init__(self, match)
- self.image_id = match.group(match.lastindex)
-
def posts(self):
- return (self.api.image(self.image_id),)
+ return (self.api.image(self.groups[-1]),)
class PhilomenaSearchExtractor(PhilomenaExtractor):
@@ -83,9 +78,9 @@ class PhilomenaSearchExtractor(PhilomenaExtractor):
def __init__(self, match):
PhilomenaExtractor.__init__(self, match)
- groups = match.groups()
- if groups[-1]:
- q = groups[-1].replace("+", " ")
+
+ if q := self.groups[-1]:
+ q = q.replace("+", " ")
for old, new in (
("-colon-" , ":"),
("-dash-" , "-"),
@@ -98,7 +93,7 @@ class PhilomenaSearchExtractor(PhilomenaExtractor):
q = q.replace(old, new)
self.params = {"q": text.unquote(text.unquote(q))}
else:
- self.params = text.parse_query(groups[-2])
+ self.params = text.parse_query(self.groups[-2])
def metadata(self):
return {"search_tags": self.params.get("q", "")}
@@ -115,18 +110,14 @@ class PhilomenaGalleryExtractor(PhilomenaExtractor):
pattern = BASE_PATTERN + r"/galleries/(\d+)"
example = "https://derpibooru.org/galleries/12345"
- def __init__(self, match):
- PhilomenaExtractor.__init__(self, match)
- self.gallery_id = match.group(match.lastindex)
-
def metadata(self):
try:
- return {"gallery": self.api.gallery(self.gallery_id)}
+ return {"gallery": self.api.gallery(self.groups[-1])}
except IndexError:
raise exception.NotFoundError("gallery")
def posts(self):
- gallery_id = "gallery_id:" + self.gallery_id
+ gallery_id = f"gallery_id:{self.groups[-1]}"
params = {"sd": "desc", "sf": gallery_id, "q": gallery_id}
return self.api.search(params)
@@ -169,18 +160,15 @@ class PhilomenaAPI():
# error
self.extractor.log.debug(response.content)
- raise exception.StopExtraction(
- "%s %s", response.status_code, response.reason)
+ raise exception.HttpError("", response)
def _pagination(self, endpoint, params):
extr = self.extractor
- api_key = extr.config("api-key")
- if api_key:
+ if api_key := extr.config("api-key"):
params["key"] = api_key
- filter_id = extr.config("filter")
- if filter_id:
+ if filter_id := extr.config("filter"):
params["filter_id"] = filter_id
elif not api_key:
params["filter_id"] = extr.config_instance("filter_id") or "2"
diff --git a/gallery_dl/extractor/photovogue.py b/gallery_dl/extractor/photovogue.py
index 2a2df5a..e604304 100644
--- a/gallery_dl/extractor/photovogue.py
+++ b/gallery_dl/extractor/photovogue.py
@@ -23,7 +23,7 @@ class PhotovogueUserExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.user_id = match.group(1)
+ self.user_id = match[1]
def items(self):
for photo in self.photos():
@@ -45,7 +45,7 @@ class PhotovogueUserExtractor(Extractor):
}
while True:
- data = self.request(url, params=params).json()
+ data = self.request_json(url, params=params)
yield from data["items"]
if not data["has_next"]:
diff --git a/gallery_dl/extractor/picarto.py b/gallery_dl/extractor/picarto.py
index cc7eee5..62ac38a 100644
--- a/gallery_dl/extractor/picarto.py
+++ b/gallery_dl/extractor/picarto.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2021-2023 Mike Fährmann
+# Copyright 2021-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -25,7 +25,7 @@ class PicartoGalleryExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.username = match.group(1)
+ self.username = match[1]
def items(self):
for post in self.posts():
@@ -62,7 +62,7 @@ class PicartoGalleryExtractor(Extractor):
}
while True:
- posts = self.request(url, params=params).json()
+ posts = self.request_json(url, params=params)
if not posts:
return
yield from posts
diff --git a/gallery_dl/extractor/pictoa.py b/gallery_dl/extractor/pictoa.py
index a8008cf..da252f3 100644
--- a/gallery_dl/extractor/pictoa.py
+++ b/gallery_dl/extractor/pictoa.py
@@ -30,7 +30,7 @@ class PictoaImageExtractor(PictoaExtractor):
def items(self):
album_id, image_id = self.groups
- url = "{}/albums/{}/{}.html".format(self.root, album_id, image_id)
+ url = f"{self.root}/albums/{album_id}/{image_id}.html"
page = self.request(url).text
album_title = text.extr(page, 'property="og:title" content="', '"')
image_url = text.extr(page, 'property="og:image" content="', '"')
@@ -55,7 +55,7 @@ class PictoaAlbumExtractor(PictoaExtractor):
def items(self):
album_id = self.groups[0]
- url = "{}/albums/{}.html".format(self.root, album_id)
+ url = f"{self.root}/albums/{album_id}.html"
page = self.request(url).text
album_data = {
diff --git a/gallery_dl/extractor/piczel.py b/gallery_dl/extractor/piczel.py
index 8a729f3..968776b 100644
--- a/gallery_dl/extractor/piczel.py
+++ b/gallery_dl/extractor/piczel.py
@@ -55,7 +55,7 @@ class PiczelExtractor(Extractor):
params = {"page": pnum}
while True:
- data = self.request(url, params=params).json()
+ data = self.request_json(url, params=params)
yield from data["data"]
@@ -71,7 +71,7 @@ class PiczelUserExtractor(PiczelExtractor):
example = "https://piczel.tv/gallery/USER"
def posts(self):
- url = "{}/api/users/{}/gallery".format(self.root_api, self.groups[0])
+ url = f"{self.root_api}/api/users/{self.groups[0]}/gallery"
return self._pagination(url)
@@ -84,7 +84,7 @@ class PiczelFolderExtractor(PiczelExtractor):
example = "https://piczel.tv/gallery/USER/12345"
def posts(self):
- url = "{}/api/gallery/folder/{}".format(self.root_api, self.groups[0])
+ url = f"{self.root_api}/api/gallery/folder/{self.groups[0]}"
return self._pagination(url)
@@ -95,5 +95,5 @@ class PiczelImageExtractor(PiczelExtractor):
example = "https://piczel.tv/gallery/image/12345"
def posts(self):
- url = "{}/api/gallery/{}".format(self.root_api, self.groups[0])
- return (self.request(url).json(),)
+ url = f"{self.root_api}/api/gallery/{self.groups[0]}"
+ return (self.request_json(url),)
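
Note: piczel is one of many extractors in this commit that switch from
self.request(url, params=params).json() to self.request_json(url,
params=params). A hedged sketch of what such a wrapper plausibly does,
written against plain requests (the real method presumably layers
gallery-dl's session, retry, and error handling on top):

    import requests

    def request_json(url, **kwargs):
        # fetch and decode in one step; raises on HTTP error status
        response = requests.get(url, timeout=30, **kwargs)
        response.raise_for_status()
        return response.json()
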
diff --git a/gallery_dl/extractor/pillowfort.py b/gallery_dl/extractor/pillowfort.py
index 5749240..05bc8e7 100644
--- a/gallery_dl/extractor/pillowfort.py
+++ b/gallery_dl/extractor/pillowfort.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2021-2023 Mike Fährmann
+# Copyright 2021-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -10,8 +10,7 @@
from .common import Extractor, Message
from ..cache import cache
-from .. import text, exception
-import re
+from .. import text, util, exception
BASE_PATTERN = r"(?:https?://)?www\.pillowfort\.social"
@@ -28,7 +27,7 @@ class PillowfortExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.item = match.group(1)
+ self.item = match[1]
def items(self):
self.login()
@@ -37,8 +36,8 @@ class PillowfortExtractor(Extractor):
external = self.config("external", False)
if inline:
- inline = re.compile(r'src="(https://img\d+\.pillowfort\.social'
- r'/posts/[^"]+)').findall
+ inline = util.re(r'src="(https://img\d+\.pillowfort\.social'
+ r'/posts/[^"]+)').findall
for post in self.posts():
if "original_post" in post and not reblogs:
@@ -126,8 +125,8 @@ class PillowfortPostExtractor(PillowfortExtractor):
example = "https://www.pillowfort.social/posts/12345"
def posts(self):
- url = "{}/posts/{}/json/".format(self.root, self.item)
- return (self.request(url).json(),)
+ url = f"{self.root}/posts/{self.item}/json/"
+ return (self.request_json(url),)
class PillowfortUserExtractor(PillowfortExtractor):
@@ -137,11 +136,11 @@ class PillowfortUserExtractor(PillowfortExtractor):
example = "https://www.pillowfort.social/USER"
def posts(self):
- url = "{}/{}/json/".format(self.root, self.item)
+ url = f"{self.root}/{self.item}/json/"
params = {"p": 1}
while True:
- posts = self.request(url, params=params).json()["posts"]
+ posts = self.request_json(url, params=params)["posts"]
yield from posts
if len(posts) < 20:
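
Note: the module-level "import re" is dropped in favor of util.re(...), a
substitution repeated throughout this commit. One plausible implementation
of such a helper is an lru_cache around re.compile, so hot patterns are
compiled only once (an assumption; the actual util.re may differ):

    import functools
    import re

    @functools.lru_cache(maxsize=None)
    def re_cached(pattern, flags=0):
        """Compile 'pattern' once and reuse the result on later calls."""
        return re.compile(pattern, flags)

    # same pattern as the inline-image extraction above
    findall = re_cached(r'src="(https://img\d+\.pillowfort\.social'
                        r'/posts/[^"]+)').findall
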
diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py
index 62fa9be..9c335ad 100644
--- a/gallery_dl/extractor/pinterest.py
+++ b/gallery_dl/extractor/pinterest.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2023 Mike Fährmann
+# Copyright 2016-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -55,6 +55,14 @@ class PinterestExtractor(Extractor):
pin.update(data)
pin["count"] = len(files)
+ for key in (
+ "description",
+ "closeup_description",
+ "closeup_unified_description",
+ ):
+ if value := pin.get(key):
+ pin[key] = value.strip()
+
yield Message.Directory, pin
for pin["num"], file in enumerate(files, 1):
url = file["url"]
@@ -84,8 +92,7 @@ class PinterestExtractor(Extractor):
if story_pin_data and self.stories:
return self._extract_story(pin, story_pin_data)
- carousel_data = pin.get("carousel_data")
- if carousel_data:
+ if carousel_data := pin.get("carousel_data"):
return self._extract_carousel(pin, carousel_data)
videos = pin.get("videos")
@@ -164,8 +171,8 @@ class PinterestExtractor(Extractor):
def _extract_image(self, page, block):
sig = block.get("image_signature") or page["image_signature"]
- url_base = "https://i.pinimg.com/originals/{}/{}/{}/{}.".format(
- sig[0:2], sig[2:4], sig[4:6], sig)
+ url_base = (f"https://i.pinimg.com/originals"
+ f"/{sig[0:2]}/{sig[2:4]}/{sig[4:6]}/{sig}.")
url_jpg = url_base + "jpg"
url_png = url_base + "png"
url_webp = url_base + "webp"
@@ -205,7 +212,7 @@ class PinterestPinExtractor(PinterestExtractor):
def __init__(self, match):
PinterestExtractor.__init__(self, match)
- self.pin_id = match.group(1)
+ self.pin_id = match[1]
self.pin = None
def metadata(self):
@@ -222,13 +229,13 @@ class PinterestBoardExtractor(PinterestExtractor):
directory_fmt = ("{category}", "{board[owner][username]}", "{board[name]}")
archive_fmt = "{board[id]}_{id}"
pattern = (BASE_PATTERN + r"/(?!pin/)([^/?#]+)"
- "/(?!_saved|_created|pins/)([^/?#]+)/?$")
+ r"/(?!_saved|_created|pins/)([^/?#]+)/?(?:$|\?|#)")
example = "https://www.pinterest.com/USER/BOARD/"
def __init__(self, match):
PinterestExtractor.__init__(self, match)
- self.user = text.unquote(match.group(1))
- self.board_name = text.unquote(match.group(2))
+ self.user = text.unquote(match[1])
+ self.board_name = text.unquote(match[2])
self.board = None
def metadata(self):
@@ -240,7 +247,7 @@ class PinterestBoardExtractor(PinterestExtractor):
pins = self.api.board_pins(board["id"])
if board["section_count"] and self.config("sections", True):
- base = "{}{}id:".format(self.root, board["url"])
+ base = f"{self.root}{board['url']}id:"
data = {"_extractor": PinterestSectionExtractor}
sections = [(base + section["id"], data)
for section in self.api.board_sections(board["id"])]
@@ -257,12 +264,11 @@ class PinterestUserExtractor(PinterestExtractor):
def __init__(self, match):
PinterestExtractor.__init__(self, match)
- self.user = text.unquote(match.group(1))
+ self.user = text.unquote(match[1])
def items(self):
for board in self.api.boards(self.user):
- url = board.get("url")
- if url:
+ if url := board.get("url"):
board["_extractor"] = PinterestBoardExtractor
yield Message.Queue, self.root + url, board
@@ -276,7 +282,7 @@ class PinterestAllpinsExtractor(PinterestExtractor):
def __init__(self, match):
PinterestExtractor.__init__(self, match)
- self.user = text.unquote(match.group(1))
+ self.user = text.unquote(match[1])
def metadata(self):
return {"user": self.user}
@@ -294,7 +300,7 @@ class PinterestCreatedExtractor(PinterestExtractor):
def __init__(self, match):
PinterestExtractor.__init__(self, match)
- self.user = text.unquote(match.group(1))
+ self.user = text.unquote(match[1])
def metadata(self):
return {"user": self.user}
@@ -314,9 +320,9 @@ class PinterestSectionExtractor(PinterestExtractor):
def __init__(self, match):
PinterestExtractor.__init__(self, match)
- self.user = text.unquote(match.group(1))
- self.board_slug = text.unquote(match.group(2))
- self.section_slug = text.unquote(match.group(3))
+ self.user = text.unquote(match[1])
+ self.board_slug = text.unquote(match[2])
+ self.section_slug = text.unquote(match[3])
self.section = None
def metadata(self):
@@ -342,7 +348,7 @@ class PinterestSearchExtractor(PinterestExtractor):
def __init__(self, match):
PinterestExtractor.__init__(self, match)
- self.search = text.unquote(match.group(1))
+ self.search = text.unquote(match[1])
def metadata(self):
return {"search": self.search}
@@ -384,12 +390,19 @@ class PinterestPinitExtractor(PinterestExtractor):
example = "https://pin.it/abcde"
def items(self):
- url = "https://api.pinterest.com/url_shortener/{}/redirect/".format(
- self.groups[0])
+ url = (f"https://api.pinterest.com/url_shortener"
+ f"/{self.groups[0]}/redirect/")
location = self.request_location(url)
- if not location or not PinterestPinExtractor.pattern.match(location):
+ if not location:
+ raise exception.NotFoundError("pin")
+ elif PinterestPinExtractor.pattern.match(location):
+ yield Message.Queue, location, {
+ "_extractor": PinterestPinExtractor}
+ elif PinterestBoardExtractor.pattern.match(location):
+ yield Message.Queue, location, {
+ "_extractor": PinterestBoardExtractor}
+ else:
raise exception.NotFoundError("pin")
- yield Message.Queue, location, {"_extractor": PinterestPinExtractor}
class PinterestAPI():
@@ -511,7 +524,7 @@ class PinterestAPI():
return self._pagination("BaseSearch", options)
def _call(self, resource, options):
- url = "{}/resource/{}Resource/get/".format(self.root, resource)
+ url = f"{self.root}/resource/{resource}Resource/get/"
params = {
"data" : util.json_dumps({"options": options}),
"source_url": "",
@@ -534,7 +547,7 @@ class PinterestAPI():
resource = self.extractor.subcategory.rpartition("-")[2]
raise exception.NotFoundError(resource)
self.extractor.log.debug("Server response: %s", response.text)
- raise exception.StopExtraction("API request failed")
+ raise exception.AbortExtraction("API request failed")
def _pagination(self, resource, options):
while True:
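
Note: PinterestPinitExtractor now also dispatches pin.it short links that
resolve to boards instead of rejecting everything that is not a pin; the
branch order matters, since a missing location means a dead link and the
pin pattern must be tried before the broader board pattern. A standalone
sketch of that classification (the regexes are simplified stand-ins for
the class-level patterns above):

    import re

    PIN = re.compile(r"https://(?:\w+\.)?pinterest\.com/pin/\d+")
    BOARD = re.compile(r"https://(?:\w+\.)?pinterest\.com/[^/?#]+/[^/?#]+")

    def classify_redirect(location):
        if not location:
            raise LookupError("pin")       # dead short link
        if PIN.match(location):
            return "pin", location
        if BOARD.match(location):
            return "board", location
        raise LookupError("pin")           # unrecognized target

    print(classify_redirect("https://www.pinterest.com/pin/12345/"))
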
diff --git a/gallery_dl/extractor/pixeldrain.py b/gallery_dl/extractor/pixeldrain.py
index 7a4d1a5..73f4b1f 100644
--- a/gallery_dl/extractor/pixeldrain.py
+++ b/gallery_dl/extractor/pixeldrain.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2023-2024 Mike Fährmann
+# Copyright 2023-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -21,8 +21,7 @@ class PixeldrainExtractor(Extractor):
archive_fmt = "{id}"
def _init(self):
- api_key = self.config("api-key")
- if api_key:
+ if api_key := self.config("api-key"):
self.session.auth = util.HTTPBasicAuth("", api_key)
def parse_datetime(self, date_string):
@@ -39,11 +38,11 @@ class PixeldrainFileExtractor(PixeldrainExtractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.file_id = match.group(1)
+ self.file_id = match[1]
def items(self):
- url = "{}/api/file/{}".format(self.root, self.file_id)
- file = self.request(url + "/info").json()
+ url = f"{self.root}/api/file/{self.file_id}"
+ file = self.request_json(url + "/info")
file["url"] = url + "?download"
file["date"] = self.parse_datetime(file["date_upload"])
@@ -64,12 +63,12 @@ class PixeldrainAlbumExtractor(PixeldrainExtractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.album_id = match.group(1)
- self.file_index = match.group(2)
+ self.album_id = match[1]
+ self.file_index = match[2]
def items(self):
- url = "{}/api/list/{}".format(self.root, self.album_id)
- album = self.request(url).json()
+ url = f"{self.root}/api/list/{self.album_id}"
+ album = self.request_json(url)
files = album["files"]
album["count"] = album["file_count"]
@@ -91,8 +90,7 @@ class PixeldrainAlbumExtractor(PixeldrainExtractor):
for num, file in enumerate(files, idx+1):
file["album"] = album
file["num"] = num
- file["url"] = url = "{}/api/file/{}?download".format(
- self.root, file["id"])
+ file["url"] = url = f"{self.root}/api/file/{file['id']}?download"
file["date"] = self.parse_datetime(file["date_upload"])
text.nameext_from_url(file["name"], file)
yield Message.Url, url, file
@@ -120,8 +118,8 @@ class PixeldrainFolderExtractor(PixeldrainExtractor):
def items(self):
recursive = self.config("recursive")
- url = "{}/api/filesystem/{}".format(self.root, self.groups[0])
- stat = self.request(url + "?stat").json()
+ url = f"{self.root}/api/filesystem/{self.groups[0]}"
+ stat = self.request_json(url + "?stat")
paths = stat["path"]
path = paths[stat["base_index"]]
@@ -143,9 +141,8 @@ class PixeldrainFolderExtractor(PixeldrainExtractor):
for child in children:
if child["type"] == "file":
num += 1
- url = "{}/api/filesystem{}?attach".format(
- self.root, child["path"])
- share_url = "{}/d{}".format(self.root, child["path"])
+ url = f"{self.root}/api/filesystem{child['path']}?attach"
+ share_url = f"{self.root}/d{child['path']}"
data = self.metadata(child)
data.update({
"id" : folder["id"],
@@ -159,7 +156,7 @@ class PixeldrainFolderExtractor(PixeldrainExtractor):
elif child["type"] == "dir":
if recursive:
- url = "{}/d{}".format(self.root, child["path"])
+ url = f"{self.root}/d{child['path']}"
child["_extractor"] = PixeldrainFolderExtractor
yield Message.Queue, url, child
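
Note: pixeldrain's API-key handling keeps the same scheme in fewer lines;
the key is sent as the password half of HTTP Basic Auth with an empty
username. Equivalent plain-requests usage (placeholder key):

    import requests

    session = requests.Session()
    session.auth = ("", "YOUR_API_KEY")   # empty user, key as password
    # info = session.get(
    #     "https://pixeldrain.com/api/file/FILE_ID/info").json()
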
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index 73c5c1c..cb0e93e 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2023 Mike Fährmann
+# Copyright 2014-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,7 @@
"""Extractors for https://www.pixiv.net/"""
-from .common import Extractor, Message
+from .common import Extractor, Message, Dispatch
from .. import text, util, exception
from ..cache import cache, memcache
from datetime import datetime, timedelta
@@ -43,6 +43,10 @@ class PixivExtractor(Extractor):
self.meta_comments = self.config("comments")
self.meta_captions = self.config("captions")
+ if self.sanity_workaround or self.meta_captions:
+ self.meta_captions_sub = util.re(
+ r'<a href="/jump\.php\?([^"]+)').sub
+
def items(self):
tags = self.config("tags", "japanese")
if tags == "original":
@@ -85,9 +89,9 @@ class PixivExtractor(Extractor):
if tag["is_registered"]]
if self.meta_captions and not work.get("caption") and \
not work.get("_mypixiv") and not work.get("_ajax"):
- body = self._request_ajax("/illust/" + str(work["id"]))
- if body:
- work["caption"] = text.unescape(body["illustComment"])
+ if body := self._request_ajax("/illust/" + str(work["id"])):
+ work["caption"] = self._sanitize_ajax_caption(
+ body["illustComment"])
if transform_tags:
transform_tags(work)
@@ -115,7 +119,7 @@ class PixivExtractor(Extractor):
return [
{
"url" : img["image_urls"]["original"],
- "suffix": "_p{:02}".format(num),
+ "suffix": f"_p{num:02}",
"_fallback": self._fallback_image(img),
}
for num, img in enumerate(meta_pages)
@@ -198,7 +202,7 @@ class PixivExtractor(Extractor):
for ext in ("jpg", "png", "gif"):
try:
- url = "{}0.{}".format(base, ext)
+ url = f"{base}0.{ext}"
self.request(url, method="HEAD")
break
except exception.HttpError:
@@ -209,8 +213,8 @@ class PixivExtractor(Extractor):
return [
{
- "url": "{}{}.{}".format(base, num, ext),
- "suffix": "_p{:02}".format(num),
+ "url": f"{base}{num}.{ext}",
+ "suffix": f"_p{num:02}",
"_ugoira_frame_index": num,
}
for num in range(len(frames))
@@ -226,9 +230,16 @@ class PixivExtractor(Extractor):
return ({"url": url},)
def _request_ajax(self, endpoint):
- url = "{}/ajax{}".format(self.root, endpoint)
+ url = f"{self.root}/ajax{endpoint}"
try:
- return self.request(url, headers=self.headers_web).json()["body"]
+ data = self.request_json(
+ url, headers=self.headers_web, fatal=False)
+ if not data.get("error"):
+ return data["body"]
+
+ self.log.debug("Server response: %s", util.json_dumps(data))
+ return self.log.error(
+ "'%s'", data.get("message") or "General Error")
except Exception:
return None
@@ -272,7 +283,7 @@ class PixivExtractor(Extractor):
translated_name = None
tags.append({"name": name, "translated_name": translated_name})
- work["caption"] = text.unescape(body["illustComment"])
+ work["caption"] = self._sanitize_ajax_caption(body["illustComment"])
work["page_count"] = count = body["pageCount"]
if count == 1:
return ({"url": url},)
@@ -280,16 +291,15 @@ class PixivExtractor(Extractor):
base, _, ext = url.rpartition("_p0.")
return [
{
- "url" : "{}_p{}.{}".format(base, num, ext),
- "suffix": "_p{:02}".format(num),
+ "url" : f"{base}_p{num}.{ext}",
+ "suffix": f"_p{num:02}",
}
for num in range(count)
]
def _extract_ajax_url(self, body):
try:
- original = body["urls"]["original"]
- if original:
+ if original := body["urls"]["original"]:
return original
except Exception:
pass
@@ -305,12 +315,18 @@ class PixivExtractor(Extractor):
for ext in ("jpg", "png", "gif"):
try:
- url = "{}_p0.{}".format(base, ext)
+ url = f"{base}_p0.{ext}"
self.request(url, method="HEAD")
return url
except exception.HttpError:
pass
+ def _sanitize_ajax_caption(self, caption):
+ if not caption:
+ return ""
+ return text.unescape(self.meta_captions_sub(
+ lambda m: '<a href="' + text.unquote(m[1]), caption))
+
def _fallback_image(self, src):
if isinstance(src, str):
urls = None
@@ -329,8 +345,7 @@ class PixivExtractor(Extractor):
if fmt in urls:
yield urls[fmt]
- @staticmethod
- def _date_from_url(url, offset=timedelta(hours=9)):
+ def _date_from_url(self, url, offset=timedelta(hours=9)):
try:
_, _, _, _, _, y, m, d, H, M, S, _ = url.split("/")
return datetime(
@@ -338,12 +353,11 @@ class PixivExtractor(Extractor):
except Exception:
return None
- @staticmethod
- def _make_work(kind, url, user):
+ def _make_work(self, kind, url, user):
p = url.split("/")
return {
- "create_date" : "{}-{}-{}T{}:{}:{}+09:00".format(
- p[5], p[6], p[7], p[8], p[9], p[10]) if len(p) > 9 else None,
+ "create_date" : (f"{p[5]}-{p[6]}-{p[7]}T{p[8]}:{p[9]}:{p[10]}"
+ f"+09:00" if len(p) > 9 else None),
"height" : 0,
"id" : kind,
"image_urls" : None,
@@ -367,23 +381,15 @@ class PixivExtractor(Extractor):
return {}
-class PixivUserExtractor(PixivExtractor):
+class PixivUserExtractor(Dispatch, PixivExtractor):
"""Extractor for a pixiv user profile"""
- subcategory = "user"
pattern = (BASE_PATTERN + r"/(?:"
r"(?:en/)?u(?:sers)?/|member\.php\?id=|(?:mypage\.php)?#id="
r")(\d+)(?:$|[?#])")
example = "https://www.pixiv.net/en/users/12345"
- def __init__(self, match):
- PixivExtractor.__init__(self, match)
- self.user_id = match.group(1)
-
- def initialize(self):
- pass
-
def items(self):
- base = "{}/users/{}/".format(self.root, self.user_id)
+ base = f"{self.root}/users/{self.groups[0]}/"
return self._dispatch_extractors((
(PixivAvatarExtractor , base + "avatar"),
(PixivBackgroundExtractor , base + "background"),
@@ -391,7 +397,10 @@ class PixivUserExtractor(PixivExtractor):
(PixivFavoriteExtractor , base + "bookmarks/artworks"),
(PixivNovelBookmarkExtractor, base + "bookmarks/novels"),
(PixivNovelUserExtractor , base + "novels"),
- ), ("artworks",))
+ ), ("artworks",), (
+ ("bookmark", "novel-bookmark"),
+ ("user" , "novel-user"),
+ ))
class PixivArtworksExtractor(PixivExtractor):
@@ -434,7 +443,9 @@ class PixivArtworksExtractor(PixivExtractor):
if self.sanity_workaround:
body = self._request_ajax(
- "/user/{}/profile/all".format(self.user_id))
+ f"/user/{self.user_id}/profile/all")
+ if not body:
+ return ()
try:
ajax_ids = list(map(int, body["illusts"]))
ajax_ids.extend(map(int, body["manga"]))
@@ -557,7 +568,7 @@ class PixivWorkExtractor(PixivExtractor):
def __init__(self, match):
PixivExtractor.__init__(self, match)
- self.illust_id = match.group(1) or match.group(2)
+ self.illust_id = match[1] or match[2]
def works(self):
works = (self.api.illust_detail(self.illust_id),)
@@ -642,7 +653,7 @@ class PixivFavoriteExtractor(PixivExtractor):
for preview in self.api.user_following(self.user_id, restrict):
user = preview["user"]
user["_extractor"] = PixivUserExtractor
- url = "https://www.pixiv.net/users/{}".format(user["id"])
+ url = f"https://www.pixiv.net/users/{user['id']}"
yield Message.Queue, url, user
@@ -657,7 +668,7 @@ class PixivRankingExtractor(PixivExtractor):
def __init__(self, match):
PixivExtractor.__init__(self, match)
- self.query = match.group(1)
+ self.query = match[1]
self.mode = self.date = None
def works(self):
@@ -693,12 +704,11 @@ class PixivRankingExtractor(PixivExtractor):
try:
self.mode = mode = mode_map[mode]
except KeyError:
- raise exception.StopExtraction("Invalid mode '%s'", mode)
+ raise exception.AbortExtraction(f"Invalid mode '{mode}'")
- date = query.get("date")
- if date:
+ if date := query.get("date"):
if len(date) == 8 and date.isdecimal():
- date = "{}-{}-{}".format(date[0:4], date[4:6], date[6:8])
+ date = f"{date[0:4]}-{date[4:6]}-{date[6:8]}"
else:
self.log.warning("invalid date '%s'", date)
date = None
@@ -746,7 +756,7 @@ class PixivSearchExtractor(PixivExtractor):
try:
self.word = query["word"]
except KeyError:
- raise exception.StopExtraction("Missing search term")
+ raise exception.AbortExtraction("Missing search term")
sort = query.get("order", "date_d")
sort_map = {
@@ -759,7 +769,7 @@ class PixivSearchExtractor(PixivExtractor):
try:
self.sort = sort = sort_map[sort]
except KeyError:
- raise exception.StopExtraction("Invalid search order '%s'", sort)
+ raise exception.AbortExtraction(f"Invalid search order '{sort}'")
target = query.get("s_mode", "s_tag_full")
target_map = {
@@ -770,7 +780,7 @@ class PixivSearchExtractor(PixivExtractor):
try:
self.target = target = target_map[target]
except KeyError:
- raise exception.StopExtraction("Invalid search mode '%s'", target)
+ raise exception.AbortExtraction(f"Invalid search mode '{target}'")
self.date_start = query.get("scd")
self.date_end = query.get("ecd")
@@ -811,7 +821,7 @@ class PixivPixivisionExtractor(PixivExtractor):
def __init__(self, match):
PixivExtractor.__init__(self, match)
- self.pixivision_id = match.group(1)
+ self.pixivision_id = match[1]
def works(self):
return (
@@ -860,18 +870,71 @@ class PixivSeriesExtractor(PixivExtractor):
yield work
+class PixivSketchExtractor(Extractor):
+ """Extractor for user pages on sketch.pixiv.net"""
+ category = "pixiv"
+ subcategory = "sketch"
+ directory_fmt = ("{category}", "sketch", "{user[unique_name]}")
+ filename_fmt = "{post_id} {id}.{extension}"
+ archive_fmt = "S{user[id]}_{id}"
+ root = "https://sketch.pixiv.net"
+ cookies_domain = ".pixiv.net"
+ pattern = r"(?:https?://)?sketch\.pixiv\.net/@([^/?#]+)"
+ example = "https://sketch.pixiv.net/@USER"
+
+ def items(self):
+ self.username = self.groups[0]
+ headers = {"Referer": f"{self.root}/@{self.username}"}
+
+ for post in self.posts():
+ media = post["media"]
+ post["post_id"] = post["id"]
+ post["date"] = text.parse_datetime(
+ post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
+ util.delete_items(post, ("id", "media", "_links"))
+
+ yield Message.Directory, post
+ post["_http_headers"] = headers
+
+ for photo in media:
+ original = photo["photo"]["original"]
+ post["id"] = photo["id"]
+ post["width"] = original["width"]
+ post["height"] = original["height"]
+
+ url = original["url"]
+ text.nameext_from_url(url, post)
+ yield Message.Url, url, post
+
+ def posts(self):
+ url = f"{self.root}/api/walls/@{self.username}/posts/public.json"
+ headers = {
+ "Accept": "application/vnd.sketch-v4+json",
+ "Referer": self.root + "/",
+ "X-Requested-With": f"{self.root}/@{self.username}",
+ }
+
+ while True:
+ data = self.request_json(url, headers=headers)
+ yield from data["data"]["items"]
+
+ next_url = data["_links"].get("next")
+ if not next_url:
+ return
+ url = self.root + next_url["href"]
+
+
+###############################################################################
+# Novels ######################################################################
+
class PixivNovelExtractor(PixivExtractor):
- """Extractor for pixiv novels"""
- subcategory = "novel"
+ """Base class for pixiv novel extractors"""
+ category = "pixiv-novel"
request_interval = (0.5, 1.5)
- pattern = BASE_PATTERN + r"/n(?:ovel/show\.php\?id=|/)(\d+)"
- example = "https://www.pixiv.net/novel/show.php?id=12345"
-
- def __init__(self, match):
- PixivExtractor.__init__(self, match)
- self.novel_id = match.group(1)
def items(self):
+ self.novel_id = self.groups[0]
+
tags = self.config("tags", "japanese")
if tags == "original":
transform_tags = None
@@ -928,7 +991,7 @@ class PixivNovelExtractor(PixivExtractor):
path.rpartition(".")[0].replace("_master1200", ""))
novel["date_url"] = self._date_from_url(url)
novel["num"] += 1
- novel["suffix"] = "_p{:02}".format(novel["num"])
+ novel["suffix"] = f"_p{novel['num']:02}"
novel["_fallback"] = (url + ".png",)
url_jpg = url + ".jpg"
text.nameext_from_url(url_jpg, novel)
@@ -960,7 +1023,7 @@ class PixivNovelExtractor(PixivExtractor):
novel.update(image)
novel["date_url"] = self._date_from_url(url)
novel["num"] += 1
- novel["suffix"] = "_p{:02}".format(novel["num"])
+ novel["suffix"] = f"_p{novel['num']:02}"
text.nameext_from_url(url, novel)
yield Message.Url, url, novel
@@ -969,10 +1032,17 @@ class PixivNovelExtractor(PixivExtractor):
novel["date_url"] = None
for illust_id in illusts:
novel["num"] += 1
- novel["suffix"] = "_p{:02}".format(novel["num"])
- url = "{}/artworks/{}".format(self.root, illust_id)
+ novel["suffix"] = f"_p{novel['num']:02}"
+ url = f"{self.root}/artworks/{illust_id}"
yield Message.Queue, url, novel
+
+class PixivNovelNovelExtractor(PixivNovelExtractor):
+ """Extractor for pixiv novels"""
+ subcategory = "novel"
+ pattern = BASE_PATTERN + r"/n(?:ovel/show\.php\?id=|/)(\d+)"
+ example = "https://www.pixiv.net/novel/show.php?id=12345"
+
def novels(self):
novel = self.api.novel_detail(self.novel_id)
if self.config("full-series") and novel["series"]:
@@ -983,7 +1053,7 @@ class PixivNovelExtractor(PixivExtractor):
class PixivNovelUserExtractor(PixivNovelExtractor):
"""Extractor for pixiv users' novels"""
- subcategory = "novel-user"
+ subcategory = "user"
pattern = USER_PATTERN + r"/novels"
example = "https://www.pixiv.net/en/users/12345/novels"
@@ -993,7 +1063,7 @@ class PixivNovelUserExtractor(PixivNovelExtractor):
class PixivNovelSeriesExtractor(PixivNovelExtractor):
"""Extractor for pixiv novel series"""
- subcategory = "novel-series"
+ subcategory = "series"
pattern = BASE_PATTERN + r"/novel/series/(\d+)"
example = "https://www.pixiv.net/novel/series/12345"
@@ -1003,86 +1073,25 @@ class PixivNovelSeriesExtractor(PixivNovelExtractor):
class PixivNovelBookmarkExtractor(PixivNovelExtractor):
"""Extractor for bookmarked pixiv novels"""
- subcategory = "novel-bookmark"
+ subcategory = "bookmark"
pattern = (USER_PATTERN + r"/bookmarks/novels"
r"(?:/([^/?#]+))?(?:/?\?([^#]+))?")
example = "https://www.pixiv.net/en/users/12345/bookmarks/novels"
- def __init__(self, match):
- PixivNovelExtractor.__init__(self, match)
- self.user_id, self.tag, self.query = match.groups()
-
def novels(self):
- if self.tag:
- tag = text.unquote(self.tag)
- else:
- tag = None
+ user_id, tag, query = self.groups
+ tag = text.unquote(tag) if tag else None
- if text.parse_query(self.query).get("rest") == "hide":
+ if text.parse_query(query).get("rest") == "hide":
restrict = "private"
else:
restrict = "public"
- return self.api.user_bookmarks_novel(self.user_id, tag, restrict)
-
-
-class PixivSketchExtractor(Extractor):
- """Extractor for user pages on sketch.pixiv.net"""
- category = "pixiv"
- subcategory = "sketch"
- directory_fmt = ("{category}", "sketch", "{user[unique_name]}")
- filename_fmt = "{post_id} {id}.{extension}"
- archive_fmt = "S{user[id]}_{id}"
- root = "https://sketch.pixiv.net"
- cookies_domain = ".pixiv.net"
- pattern = r"(?:https?://)?sketch\.pixiv\.net/@([^/?#]+)"
- example = "https://sketch.pixiv.net/@USER"
-
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.username = match.group(1)
-
- def items(self):
- headers = {"Referer": "{}/@{}".format(self.root, self.username)}
-
- for post in self.posts():
- media = post["media"]
- post["post_id"] = post["id"]
- post["date"] = text.parse_datetime(
- post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
- util.delete_items(post, ("id", "media", "_links"))
-
- yield Message.Directory, post
- post["_http_headers"] = headers
-
- for photo in media:
- original = photo["photo"]["original"]
- post["id"] = photo["id"]
- post["width"] = original["width"]
- post["height"] = original["height"]
-
- url = original["url"]
- text.nameext_from_url(url, post)
- yield Message.Url, url, post
-
- def posts(self):
- url = "{}/api/walls/@{}/posts/public.json".format(
- self.root, self.username)
- headers = {
- "Accept": "application/vnd.sketch-v4+json",
- "X-Requested-With": "{}/@{}".format(self.root, self.username),
- "Referer": self.root + "/",
- }
-
- while True:
- data = self.request(url, headers=headers).json()
- yield from data["data"]["items"]
+ return self.api.user_bookmarks_novel(user_id, tag, restrict)
- next_url = data["_links"].get("next")
- if not next_url:
- return
- url = self.root + next_url["href"]
+###############################################################################
+# API #########################################################################
class PixivAppAPI():
"""Minimal interface for the Pixiv App API for mobile devices
@@ -1288,7 +1297,7 @@ class PixivAppAPI():
self.extractor.wait(seconds=300)
continue
- raise exception.StopExtraction("API request failed: %s", error)
+ raise exception.AbortExtraction(f"API request failed: {error}")
def _pagination(self, endpoint, params,
key_items="illusts", key_data=None):
diff --git a/gallery_dl/extractor/pixnet.py b/gallery_dl/extractor/pixnet.py
index eb3edc3..75c06bb 100644
--- a/gallery_dl/extractor/pixnet.py
+++ b/gallery_dl/extractor/pixnet.py
@@ -19,12 +19,11 @@ class PixnetExtractor(Extractor):
category = "pixnet"
filename_fmt = "{num:>03}_{id}.{extension}"
archive_fmt = "{id}"
- url_fmt = ""
def __init__(self, match):
Extractor.__init__(self, match)
- self.blog, self.item_id = match.groups()
- self.root = "https://{}.pixnet.net".format(self.blog)
+ self.blog, self.item_id = self.groups
+ self.root = f"https://{self.blog}.pixnet.net"
def items(self):
url = self.url_fmt.format(self.root, self.item_id)
@@ -53,8 +52,8 @@ class PixnetExtractor(Extractor):
pnext = text.extr(page, 'class="nextBtn"', '>')
if pnext is None and 'name="albumpass">' in page:
- raise exception.StopExtraction(
- "Album %s is password-protected.", self.item_id)
+ raise exception.AbortExtraction(
+ f"Album {self.item_id} is password-protected.")
if "href" not in pnext:
return
url = self.root + text.extr(pnext, 'href="', '"')
@@ -72,12 +71,12 @@ class PixnetImageExtractor(PixnetExtractor):
def items(self):
url = "https://api.pixnet.cc/oembed"
params = {
- "url": "https://{}.pixnet.net/album/photo/{}".format(
- self.blog, self.item_id),
+ "url": (f"https://{self.blog}.pixnet.net"
+ f"/album/photo/{self.item_id}"),
"format": "json",
}
- data = self.request(url, params=params).json()
+ data = self.request_json(url, params=params)
data["id"] = text.parse_int(
data["url"].rpartition("/")[2].partition("-")[0])
data["filename"], _, data["extension"] = data["title"].rpartition(".")
@@ -91,14 +90,13 @@ class PixnetImageExtractor(PixnetExtractor):
class PixnetSetExtractor(PixnetExtractor):
"""Extractor for images from a pixnet set"""
subcategory = "set"
- url_fmt = "{}/album/set/{}"
directory_fmt = ("{category}", "{blog}",
"{folder_id} {folder_title}", "{set_id} {set_title}")
pattern = BASE_PATTERN + r"/album/set/(\d+)"
example = "https://USER.pixnet.net/album/set/12345"
def items(self):
- url = self.url_fmt.format(self.root, self.item_id)
+ url = f"{self.root}/album/set/{self.item_id}"
page = self.request(url, encoding="utf-8").text
data = self.metadata(page)
diff --git a/gallery_dl/extractor/plurk.py b/gallery_dl/extractor/plurk.py
index 0bacd54..37b9b10 100644
--- a/gallery_dl/extractor/plurk.py
+++ b/gallery_dl/extractor/plurk.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -11,7 +11,6 @@
from .common import Extractor, Message
from .. import text, util, exception
import datetime
-import re
class PlurkExtractor(Extractor):
@@ -29,8 +28,7 @@ class PlurkExtractor(Extractor):
def plurks(self):
"""Return an iterable with all relevant 'plurk' objects"""
- @staticmethod
- def _urls(obj):
+ def _urls(self, obj):
"""Extract URLs from a 'plurk' object"""
return text.extract_iter(obj["content"], ' href="', '"')
@@ -51,8 +49,8 @@ class PlurkExtractor(Extractor):
}
while True:
- info = self.request(
- url, method="POST", headers=headers, data=data).json()
+ info = self.request_json(
+ url, method="POST", headers=headers, data=data)
yield from info["responses"]
if not info["has_newer"]:
return
@@ -60,11 +58,11 @@ class PlurkExtractor(Extractor):
del data["count"]
data["from_response_id"] = info["responses"][-1]["id"] + 1
- @staticmethod
- def _load(data):
+ def _load(self, data):
if not data:
raise exception.NotFoundError("user")
- return util.json_loads(re.sub(r"new Date\(([^)]+)\)", r"\1", data))
+ return util.json_loads(
+ util.re(r"new Date\(([^)]+)\)").sub(r"\1", data))
class PlurkTimelineExtractor(PlurkExtractor):
@@ -75,10 +73,10 @@ class PlurkTimelineExtractor(PlurkExtractor):
def __init__(self, match):
PlurkExtractor.__init__(self, match)
- self.user = match.group(1)
+ self.user = match[1]
def plurks(self):
- url = "{}/{}".format(self.root, self.user)
+ url = f"{self.root}/{self.user}"
page = self.request(url).text
user_id, pos = text.extract(page, '"page_user": {"id":', ',')
plurks = self._load(text.extract(page, "_PLURKS = ", ";\n", pos)[0])
@@ -105,7 +103,7 @@ class PlurkPostExtractor(PlurkExtractor):
example = "https://www.plurk.com/p/12345"
def plurks(self):
- url = "{}/p/{}".format(self.root, self.groups[0])
+ url = f"{self.root}/p/{self.groups[0]}"
page = self.request(url).text
user, pos = text.extract(page, " GLOBAL=", "\n")
data, pos = text.extract(page, "plurk =", ";\n", pos)
diff --git a/gallery_dl/extractor/poipiku.py b/gallery_dl/extractor/poipiku.py
index e371ee2..957e316 100644
--- a/gallery_dl/extractor/poipiku.py
+++ b/gallery_dl/extractor/poipiku.py
@@ -85,8 +85,8 @@ class PoipikuExtractor(Extractor):
"MD" : "0",
"TWF": "-1",
}
- resp = self.request(
- url, method="POST", headers=headers, data=data).json()
+ resp = self.request_json(
+ url, method="POST", headers=headers, data=data)
page = resp["html"]
if (resp.get("result_num") or 0) < 0:
@@ -145,4 +145,4 @@ class PoipikuPostExtractor(PoipikuExtractor):
self.user_id, self.post_id = match.groups()
def posts(self):
- return ("/{}/{}.html".format(self.user_id, self.post_id),)
+ return (f"/{self.user_id}/{self.post_id}.html",)
diff --git a/gallery_dl/extractor/poringa.py b/gallery_dl/extractor/poringa.py
index 0149d06..da17eae 100644
--- a/gallery_dl/extractor/poringa.py
+++ b/gallery_dl/extractor/poringa.py
@@ -23,12 +23,12 @@ class PoringaExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.item = match.group(1)
+ self.item = match[1]
self.__cookies = True
def items(self):
for post_id in self.posts():
- url = "{}/posts/imagenes/{}".format(self.root, post_id)
+ url = f"{self.root}/posts/imagenes/{post_id}"
try:
response = self.request(url)
diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py
index 9800eb2..1211397 100644
--- a/gallery_dl/extractor/pornhub.py
+++ b/gallery_dl/extractor/pornhub.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,7 @@
"""Extractors for https://www.pornhub.com/"""
-from .common import Extractor, Message
+from .common import Extractor, Message, Dispatch
from .. import text, exception
BASE_PATTERN = r"(?:https?://)?(?:[\w-]+\.)?pornhub\.com"
@@ -27,7 +27,7 @@ class PornhubExtractor(Extractor):
if "/" not in path:
path += "/public"
- url = "{}/{}/{}/ajax".format(self.root, user, path)
+ url = f"{self.root}/{user}/{path}/ajax"
params = {"page": 1}
headers = {
"Referer": url[:-5],
@@ -40,8 +40,7 @@ class PornhubExtractor(Extractor):
allow_redirects=False)
if 300 <= response.status_code < 400:
- url = "{}{}/{}/ajax".format(
- self.root, response.headers["location"], path)
+ url = f"{self.root}{response.headers['location']}/{path}/ajax"
continue
yield response.text
@@ -60,7 +59,7 @@ class PornhubGalleryExtractor(PornhubExtractor):
def __init__(self, match):
PornhubExtractor.__init__(self, match)
- self.gallery_id = match.group(1)
+ self.gallery_id = match[1]
self._first = None
def items(self):
@@ -82,11 +81,11 @@ class PornhubGalleryExtractor(PornhubExtractor):
yield Message.Url, url, text.nameext_from_url(url, image)
def metadata(self):
- url = "{}/album/{}".format(
- self.root, self.gallery_id)
+ url = f"{self.root}/album/{self.gallery_id}"
extr = text.extract_from(self.request(url).text)
title = extr("<title>", "</title>")
+ self._token = extr('name="token" value="', '"')
score = extr('<div id="albumGreenBar" style="width:', '"')
views = extr('<div id="viewsPhotAlbumCounter">', '<')
tags = extr('<div id="photoTagsBox"', '<script')
@@ -105,13 +104,12 @@ class PornhubGalleryExtractor(PornhubExtractor):
}
def images(self):
- url = "{}/album/show_album_json?album={}".format(
- self.root, self.gallery_id)
- response = self.request(url)
+ url = f"{self.root}/api/v1/album/{self.gallery_id}/show_album_json"
+ params = {"token": self._token}
+ data = self.request_json(url, params=params)
- if response.content == b"Permission denied":
+ if not (images := data.get("photos")):
raise exception.AuthorizationError()
- images = response.json()
key = end = self._first
results = []
@@ -141,10 +139,10 @@ class PornhubGifExtractor(PornhubExtractor):
def __init__(self, match):
PornhubExtractor.__init__(self, match)
- self.gallery_id = match.group(1)
+ self.gallery_id = match[1]
def items(self):
- url = "{}/gif/{}".format(self.root, self.gallery_id)
+ url = f"{self.root}/gif/{self.gallery_id}"
extr = text.extract_from(self.request(url).text)
gif = {
@@ -164,21 +162,13 @@ class PornhubGifExtractor(PornhubExtractor):
yield Message.Url, gif["url"], text.nameext_from_url(gif["url"], gif)
-class PornhubUserExtractor(PornhubExtractor):
+class PornhubUserExtractor(Dispatch, PornhubExtractor):
"""Extractor for a pornhub user"""
- subcategory = "user"
pattern = BASE_PATTERN + r"/((?:users|model|pornstar)/[^/?#]+)/?$"
example = "https://www.pornhub.com/model/USER"
- def __init__(self, match):
- PornhubExtractor.__init__(self, match)
- self.user = match.group(1)
-
- def initialize(self):
- pass
-
def items(self):
- base = "{}/{}/".format(self.root, self.user)
+ base = f"{self.root}/{self.groups[0]}/"
return self._dispatch_extractors((
(PornhubPhotosExtractor, base + "photos"),
(PornhubGifsExtractor , base + "gifs"),
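
Note: PornhubGalleryExtractor.images() now reads a per-page token from the
album HTML and queries the /api/v1/album/.../show_album_json endpoint
instead of scraping the legacy JSON URL; a missing "photos" key doubles as
the permission check. Roughly, the flow looks like this (sketch only;
session handling and the exact response layout are assumptions):

    import re
    import requests

    def album_photos(root, gallery_id):
        page = requests.get(f"{root}/album/{gallery_id}").text
        m = re.search(r'name="token" value="([^"]+)"', page)
        token = m[1] if m else ""
        data = requests.get(
            f"{root}/api/v1/album/{gallery_id}/show_album_json",
            params={"token": token},
        ).json()
        return data.get("photos")   # empty/missing -> not authorized
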
diff --git a/gallery_dl/extractor/pornpics.py b/gallery_dl/extractor/pornpics.py
index 863ef3b..34a0111 100644
--- a/gallery_dl/extractor/pornpics.py
+++ b/gallery_dl/extractor/pornpics.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2023 Mike Fährmann
+# Copyright 2023-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -47,8 +47,8 @@ class PornpicsExtractor(Extractor):
}
while True:
- galleries = self.request(
- url, params=params, headers=headers).json()
+ galleries = self.request_json(
+ url, params=params, headers=headers)
yield from galleries
if len(galleries) < limit:
@@ -62,7 +62,7 @@ class PornpicsGalleryExtractor(PornpicsExtractor, GalleryExtractor):
example = "https://www.pornpics.com/galleries/TITLE-12345/"
def __init__(self, match):
- url = "{}/galleries/{}/".format(self.root, match.group(1))
+ url = f"{self.root}/galleries/{match[1]}/"
GalleryExtractor.__init__(self, match, url)
items = GalleryExtractor.items
@@ -98,7 +98,7 @@ class PornpicsTagExtractor(PornpicsExtractor):
example = "https://www.pornpics.com/tags/TAGS/"
def galleries(self):
- url = "{}/tags/{}/".format(self.root, self.groups[0])
+ url = f"{self.root}/tags/{self.groups[0]}/"
return self._pagination(url)
diff --git a/gallery_dl/extractor/postmill.py b/gallery_dl/extractor/postmill.py
index 6ea18e6..af971ab 100644
--- a/gallery_dl/extractor/postmill.py
+++ b/gallery_dl/extractor/postmill.py
@@ -6,9 +6,8 @@
"""Extractors for Postmill instances"""
-import re
from .common import BaseExtractor, Message
-from .. import text, exception
+from .. import text, util, exception
class PostmillExtractor(BaseExtractor):
@@ -21,8 +20,8 @@ class PostmillExtractor(BaseExtractor):
def _init(self):
self.instance = self.root.partition("://")[2]
self.save_link_post_body = self.config("save-link-post-body", False)
- self._search_canonical_url = re.compile(r"/f/([\w\d_]+)/(\d+)/").search
- self._search_image_tag = re.compile(
+ self._search_canonical_url = util.re(r"/f/([\w\d_]+)/(\d+)/").search
+ self._search_image_tag = util.re(
r'<a href="[^"]+"\n +class="submission__image-link"').search
def items(self):
@@ -47,8 +46,8 @@ class PostmillExtractor(BaseExtractor):
'</div>')
match = self._search_canonical_url(post_canonical_url)
- forum = match.group(1)
- id = int(match.group(2))
+ forum = match[1]
+ id = int(match[2])
is_text_post = (url[0] == "/")
is_image_post = self._search_image_tag(page) is not None
@@ -103,8 +102,8 @@ class PostmillSubmissionsExtractor(PostmillExtractor):
if response.history:
redirect_url = response.url
if redirect_url == self.root + "/login":
- raise exception.StopExtraction(
- "HTTP redirect to login page (%s)", redirect_url)
+ raise exception.AbortExtraction(
+ f"HTTP redirect to login page ({redirect_url})")
page = response.text
for nav in text.extract_iter(page,
@@ -143,8 +142,8 @@ class PostmillPostExtractor(PostmillExtractor):
def __init__(self, match):
PostmillExtractor.__init__(self, match)
- self.forum = match.group(3)
- self.post_id = match.group(4)
+ self.forum = match[3]
+ self.post_id = match[4]
def post_urls(self):
return (self.root + "/f/" + self.forum + "/" + self.post_id,)
diff --git a/gallery_dl/extractor/rawkuma.py b/gallery_dl/extractor/rawkuma.py
new file mode 100644
index 0000000..242486d
--- /dev/null
+++ b/gallery_dl/extractor/rawkuma.py
@@ -0,0 +1,83 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://rawkuma.net/"""
+
+from .common import MangaExtractor, ChapterExtractor
+from .. import text, util
+
+BASE_PATTERN = r"(?:https?://)?rawkuma\.(?:net|com)"
+
+
+class RawkumaBase():
+ """Base class for rawkuma extractors"""
+ category = "rawkuma"
+ root = "https://rawkuma.net"
+
+
+class RawkumaChapterExtractor(RawkumaBase, ChapterExtractor):
+ """Extractor for manga chapters from rawkuma.net"""
+ archive_fmt = "{chapter_id}_{page}"
+ pattern = BASE_PATTERN + r"/([^/?#]+-chapter-\d+(?:-\d+)?)"
+ example = "https://rawkuma.net/TITLE-chapter-123/"
+
+ def __init__(self, match):
+ url = f"{self.root}/{match[1]}/"
+ ChapterExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ item = util.json_loads(text.extr(page, ',"item":', "}};"))
+ title = text.rextr(
+ page, '<h1 class="entry-title', "</h1>").partition(" &#8211; ")[2]
+ date = text.extr(page, 'datetime="', '"')
+ chapter, sep, minor = item["c"].partition(".")
+
+ return {
+ "manga" : item["s"],
+ "manga_id" : text.parse_int(item["mid"]),
+ "chapter" : text.parse_int(chapter),
+ "chapter_minor": sep + minor,
+ "chapter_id" : text.parse_int(item["cid"]),
+ "title" : text.unescape(title),
+ "date" : text.parse_datetime(
+ date, "%Y-%m-%dWIB%H:%M:%S%z"),
+ "thumbnail" : item.get("t"),
+ "lang" : "ja",
+ "language" : "Japanese",
+ }
+
+ def images(self, page):
+ images = util.json_loads(text.extr(page, '","images":', '}'))
+ return [(url, None) for url in images]
+
+
+class RawkumaMangaExtractor(RawkumaBase, MangaExtractor):
+ """Extractor for manga from rawkuma.net"""
+ chapterclass = RawkumaChapterExtractor
+ pattern = BASE_PATTERN + r"/manga/([^/?#]+)"
+ example = "https://rawkuma.net/manga/TITLE/"
+
+ def __init__(self, match):
+ url = f"{self.root}/manga/{match[1]}/"
+ MangaExtractor.__init__(self, match, url)
+
+ def chapters(self, page):
+ manga = text.unescape(text.extr(page, "<title>", " &#8211; "))
+
+ results = []
+ for chbox in text.extract_iter(
+ page, '<li data-num="', "</a>"):
+ info = text.extr(chbox, '', '"')
+ chapter, _, title = info.partition(" - ")
+ chapter, sep, minor = chapter.partition(".")
+
+ results.append((text.extr(chbox, 'href="', '"'), {
+ "manga" : manga,
+ "chapter" : text.parse_int(chapter),
+ "chapter-minor": sep + minor,
+ "title" : title,
+ }))
+ return results
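
Note: both rawkuma extractors split chapter numbers with str.partition("."),
so a fractional chapter keeps its minor part as a separate suffix:

    chapter, sep, minor = "12.5".partition(".")
    print(int(chapter), repr(sep + minor))   # 12 '.5'
    chapter, sep, minor = "12".partition(".")
    print(int(chapter), repr(sep + minor))   # 12 '' (no minor part)
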
diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py
index ab555d8..483a5ba 100644
--- a/gallery_dl/extractor/reactor.py
+++ b/gallery_dl/extractor/reactor.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -23,7 +23,7 @@ class ReactorExtractor(BaseExtractor):
def __init__(self, match):
BaseExtractor.__init__(self, match)
- url = text.ensure_http_scheme(match.group(0), "http://")
+ url = text.ensure_http_scheme(match[0], "http://")
pos = url.index("/", 10)
self.root = url[:pos]
self.path = url[pos:]
@@ -176,7 +176,7 @@ class ReactorTagExtractor(ReactorExtractor):
def __init__(self, match):
ReactorExtractor.__init__(self, match)
- self.tag = match.group(match.lastindex)
+ self.tag = self.groups[-1]
def metadata(self):
return {"search_tags": text.unescape(self.tag).replace("+", " ")}
@@ -192,7 +192,7 @@ class ReactorSearchExtractor(ReactorExtractor):
def __init__(self, match):
ReactorExtractor.__init__(self, match)
- self.tag = match.group(match.lastindex)
+ self.tag = self.groups[-1]
def metadata(self):
return {"search_tags": text.unescape(self.tag).replace("+", " ")}
@@ -207,7 +207,7 @@ class ReactorUserExtractor(ReactorExtractor):
def __init__(self, match):
ReactorExtractor.__init__(self, match)
- self.user = match.group(match.lastindex)
+ self.user = self.groups[-1]
def metadata(self):
return {"user": text.unescape(self.user).replace("+", " ")}
@@ -221,7 +221,7 @@ class ReactorPostExtractor(ReactorExtractor):
def __init__(self, match):
ReactorExtractor.__init__(self, match)
- self.post_id = match.group(match.lastindex)
+ self.post_id = self.groups[-1]
def items(self):
post = self.request(self.root + self.path).text
diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py
index 2f2daca..24a0171 100644
--- a/gallery_dl/extractor/readcomiconline.py
+++ b/gallery_dl/extractor/readcomiconline.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2023 Mike Fährmann
+# Copyright 2016-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -11,7 +11,6 @@
from .common import Extractor, ChapterExtractor, MangaExtractor
from .. import text, exception
import binascii
-import re
BASE_PATTERN = r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.(?:li|to)"
@@ -37,9 +36,9 @@ class ReadcomiconlineBase():
"the CAPTCHA, and press ENTER to continue", response.url)
self.input()
else:
- raise exception.StopExtraction(
- "Redirect to \n%s\nVisit this URL in your browser and "
- "solve the CAPTCHA to continue", response.url)
+ raise exception.AbortExtraction(
+ f"Redirect to \n{response.url}\nVisit this URL in your "
+ f"browser and solve the CAPTCHA to continue")
class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
@@ -48,12 +47,8 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
pattern = BASE_PATTERN + r"(/Comic/[^/?#]+/[^/?#]+\?)([^#]+)"
example = "https://readcomiconline.li/Comic/TITLE/Issue-123?id=12345"
- def __init__(self, match):
- ChapterExtractor.__init__(self, match)
- self.params = match.group(2)
-
def _init(self):
- params = text.parse_query(self.params)
+ params = text.parse_query(self.groups[1])
quality = self.config("quality")
if quality is None or quality == "auto":
@@ -61,17 +56,18 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
params["quality"] = "hq"
else:
params["quality"] = str(quality)
+ params["readType"] = "0" # force "One page" Reading mode (#7890)
- self.gallery_url += "&".join(k + "=" + v for k, v in params.items())
+ self.page_url += "&".join(f"{k}={v}" for k, v in params.items())
self.issue_id = params.get("id")
def metadata(self, page):
comic, pos = text.extract(page, " - Read\r\n ", "\r\n")
iinfo, pos = text.extract(page, " ", "\r\n", pos)
- match = re.match(r"(?:Issue )?#(\d+)|(.+)", iinfo)
+ match = text.re(r"(?:Issue )?#(\d+)|(.+)").match(iinfo)
return {
"comic": comic,
- "issue": match.group(1) or match.group(2),
+ "issue": match[1] or match[2],
"issue_id": text.parse_int(self.issue_id),
"lang": "en",
"language": "English",
@@ -79,22 +75,21 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
def images(self, page):
results = []
- referer = {"_http_headers": {"Referer": self.gallery_url}}
- root = text.extr(page, "return baeu(l, '", "'")
+ referer = {"_http_headers": {"Referer": self.page_url}}
+ root, pos = text.extract(page, "return baeu(l, '", "'")
+ _ , pos = text.extract(page, "var pth = '", "", pos)
+ var , pos = text.extract(page, "var ", "= '", pos)
- replacements = re.findall(
- r"l = l\.replace\(/([^/]+)/g, [\"']([^\"']*)", page)
+ replacements = text.re(
+ r"l = l\.replace\(/([^/]+)/g, [\"']([^\"']*)").findall(page)
- for block in page.split("\t\tpht = '")[1:]:
- pth = text.extr(block, "", "'")
+ for path in page.split(var)[2:]:
+ path = text.extr(path, "= '", "'")
- for needle, repl in re.findall(
- r"pth = pth\.replace\(/([^/]+)/g, [\"']([^\"']*)", block):
- pth = pth.replace(needle, repl)
for needle, repl in replacements:
- pth = pth.replace(needle, repl)
+ path = path.replace(needle, repl)
- results.append((baeu(pth, root), referer))
+ results.append((baeu(path, root), referer))
return results
@@ -112,7 +107,7 @@ class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor):
page , pos = text.extract(page, ' class="listing">', '</table>', pos)
comic = comic.rpartition("information")[0].strip()
- needle = ' title="Read {} '.format(comic)
+ needle = f' title="Read {comic} '
comic = text.unescape(comic)
for item in text.extract_iter(page, ' href="', ' comic online '):
diff --git a/gallery_dl/extractor/realbooru.py b/gallery_dl/extractor/realbooru.py
index ab8a9b1..cf45578 100644
--- a/gallery_dl/extractor/realbooru.py
+++ b/gallery_dl/extractor/realbooru.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2024 Mike Fährmann
+# Copyright 2024-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -11,7 +11,6 @@
from . import booru
from .. import text, util
import collections
-import re
BASE_PATTERN = r"(?:https?://)?realbooru\.com"
@@ -22,8 +21,7 @@ class RealbooruExtractor(booru.BooruExtractor):
root = "https://realbooru.com"
def _parse_post(self, post_id):
- url = "{}/index.php?page=post&s=view&id={}".format(
- self.root, post_id)
+ url = f"{self.root}/index.php?page=post&s=view&id={post_id}"
page = self.request(url).text
extr = text.extract_from(page)
rating = extr('name="rating" content="', '"')
@@ -72,8 +70,7 @@ class RealbooruExtractor(booru.BooruExtractor):
page = post["_html"]
tag_container = text.extr(page, 'id="tagLink"', '</div>')
tags = collections.defaultdict(list)
- pattern = re.compile(
- r'<a class="(?:tag-type-)?([^"]+).*?;tags=([^"&]+)')
+ pattern = util.re(r'<a class="(?:tag-type-)?([^"]+).*?;tags=([^"&]+)')
for tag_type, tag_name in pattern.findall(tag_container):
tags[tag_type].append(text.unescape(text.unquote(tag_name)))
for key, value in tags.items():
@@ -128,7 +125,7 @@ class RealbooruPoolExtractor(RealbooruExtractor):
def metadata(self):
pool_id = self.groups[0]
- url = "{}/index.php?page=pool&s=show&id={}".format(self.root, pool_id)
+ url = f"{self.root}/index.php?page=pool&s=show&id={pool_id}"
page = self.request(url).text
name, pos = text.extract(page, "<h4>Pool: ", "</h4>")
diff --git a/gallery_dl/extractor/recursive.py b/gallery_dl/extractor/recursive.py
index 1883bbc..4762fa5 100644
--- a/gallery_dl/extractor/recursive.py
+++ b/gallery_dl/extractor/recursive.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2023 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,8 +9,7 @@
"""Recursive extractor"""
from .common import Extractor, Message
-from .. import text
-import re
+from .. import text, util
class RecursiveExtractor(Extractor):
@@ -28,5 +27,5 @@ class RecursiveExtractor(Extractor):
else:
page = self.request(text.ensure_http_scheme(url)).text
- for match in re.finditer(r"https?://[^\s\"']+", page):
- yield Message.Queue, match.group(0), {}
+ for match in util.re(r"https?://[^\s\"']+").finditer(page):
+ yield Message.Queue, match[0], {}
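
Note: the recursive extractor's URL harvest is a single permissive pattern
over the page body; trailing punctuation survives, exactly as with the
util.re version above:

    import re

    URLS = re.compile(r"https?://[^\s\"']+")
    sample = "see 'https://example.org/a' and http://example.com/b."
    print([m[0] for m in URLS.finditer(sample)])
    # ['https://example.org/a', 'http://example.com/b.']
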
diff --git a/gallery_dl/extractor/redbust.py b/gallery_dl/extractor/redbust.py
new file mode 100644
index 0000000..d00ed52
--- /dev/null
+++ b/gallery_dl/extractor/redbust.py
@@ -0,0 +1,186 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://redbust.com/"""
+
+from .common import GalleryExtractor, Extractor, Message
+from .. import text
+
+BASE_PATTERN = r"(?:https?://)?redbust\.com"
+
+
+class RedbustExtractor(Extractor):
+ """Base class for RedBust extractors"""
+ category = "redbust"
+ root = "https://redbust.com"
+ filename_fmt = "{filename}.{extension}"
+
+ def items(self):
+ data = {"_extractor": RedbustGalleryExtractor}
+ for url in self.galleries():
+ yield Message.Queue, url, data
+
+ def _pagination(self, path, page=None):
+ if page is None:
+ url = f"{self.root}{path}/"
+ base = url + "page/"
+ page = self.request(url).text
+ else:
+ base = f"{self.root}{path}/page/"
+
+ pnum = 1
+ while True:
+ for post in text.extract_iter(
+ page, '<h2 class="post-title">', "rel="):
+ yield text.extr(post, 'href="', '"')
+
+ pnum += 1
+ url = f"{base}{pnum}/"
+ if url not in page:
+ return
+ page = self.request(url).text
+
+
+class RedbustGalleryExtractor(GalleryExtractor, RedbustExtractor):
+ """Extractor for RedBust galleries"""
+ pattern = BASE_PATTERN + r"/([\w-]+)/?$"
+ example = "https://redbust.com/TITLE/"
+
+ def items(self):
+ url = f"{self.root}/{self.groups[0]}/"
+ self.page = page = self.request(url).text
+
+ self.gallery_id = gid = text.extr(
+ page, "<link rel='shortlink' href='https://redbust.com/?p=", "'")
+
+ if gid:
+ self.page_url = False
+ return GalleryExtractor.items(self)
+ else:
+ self.subcategory = "category"
+ return self._items_category(page)
+
+ def _items_category(self, _):
+ page = self.page
+ data = {"_extractor": RedbustGalleryExtractor}
+ base = f"{self.root}/{self.groups[0]}/page/"
+ pnum = 1
+
+ while True:
+ for post in text.extract_iter(
+ page, '<h2 class="post-title">', "rel="):
+ url = text.extr(post, 'href="', '"')
+ yield Message.Queue, url, data
+
+ pnum += 1
+ url = f"{base}{pnum}/"
+ if url not in page:
+ return
+ page = self.request(url).text
+
+ def metadata(self, _):
+ extr = text.extract_from(self.page)
+
+ return {
+ "gallery_id" : self.gallery_id,
+ "gallery_slug": self.groups[0],
+ "categories" : text.split_html(extr(
+ '<li class="category">', "</li>"))[::2],
+ "title" : text.unescape(extr('class="post-title">', "<")),
+ "date" : text.parse_datetime(
+ extr('class="post-byline">', "<").strip(), "%B %d, %Y"),
+ "views" : text.parse_int(extr("</b>", "v").replace(",", "")),
+ "tags" : text.split_html(extr(
+ 'class="post-tags">', "</p"))[1:],
+ }
+
+ def images(self, _):
+ results = []
+
+ for img in text.extract_iter(self.page, "'><img ", ">"):
+ if src := text.extr(img, 'src="', '"'):
+ path, _, end = src.rpartition("-")
+ if "x" in end:
+ url = f"{path}.{end.rpartition('.')[2]}"
+ data = None if src == url else {"_fallback": (src,)}
+ else:
+ url = src
+ data = None
+ results.append((url, data))
+
+ if not results:
+ # fallback for older galleries
+ for path in text.extract_iter(
+ self.page, '<img src="/wp-content/uploads/', '"'):
+ results.append(
+ (f"{self.root}/wp-content/uploads/{path}", None))
+
+ return results
+
+
+class RedbustTagExtractor(RedbustExtractor):
+ """Extractor for RedBust tag searches"""
+ subcategory = "tag"
+ pattern = BASE_PATTERN + r"/tag/([\w-]+)"
+ example = "https://redbust.com/tag/TAG/"
+
+ def galleries(self):
+ return self._pagination("/tag/" + self.groups[0])
+
+
+class RedbustArchiveExtractor(RedbustExtractor):
+ """Extractor for RedBust monthly archive collections"""
+ subcategory = "archive"
+ pattern = BASE_PATTERN + r"(/\d{4}/\d{2})"
+ example = "https://redbust.com/2010/01/"
+
+ def galleries(self):
+ return self._pagination(self.groups[0])
+
+
+class RedbustImageExtractor(RedbustExtractor):
+ """Extractor for RedBust images"""
+ subcategory = "image"
+ directory_fmt = ("{category}", "{title}")
+ pattern = BASE_PATTERN + r"/(?!tag/|\d{4}/)([\w-]+)/([\w-]+)/?$"
+ example = "https://redbust.com/TITLE/SLUG/"
+
+ def items(self):
+ gallery_slug, image_slug = self.groups
+ url = f"{self.root}/{gallery_slug}/{image_slug}/"
+ page = self.request(url).text
+
+ img_url = None
+
+ # Look for the largest image in srcset first
+ if srcset := text.extr(page, 'srcset="', '"'):
+ # Extract the largest image from srcset (typically last one)
+ urls = srcset.split(", ")
+ img_url = urls[-1].partition(" ")[0] if urls else None
+
+ # Fallback to original extraction method
+ if not img_url:
+ if entry := text.extr(page, "entry-inner ", "alt="):
+ img_url = text.extr(entry, "img src=", " ").strip("\"'")
+
+ if not img_url:
+ return
+
+ end = img_url.rpartition("-")[2]
+ data = text.nameext_from_url(img_url, {
+ "title" : text.unescape(text.extr(
+ page, 'title="Return to ', '"')),
+ "image_id" : text.extr(
+ page, "rel='shortlink' href='https://redbust.com/?p=", "'"),
+ "gallery_slug": gallery_slug,
+ "image_slug" : image_slug,
+ "num" : text.parse_int(end.partition(".")[0]),
+ "count" : 1,
+ "url" : img_url,
+ })
+
+ yield Message.Directory, data
+ yield Message.Url, img_url, data
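
A note on the `images()` logic in the new redbust.py: WordPress names resized thumbnails by appending `-WIDTHxHEIGHT` before the extension, so stripping that suffix usually yields the original upload, and the thumbnail is kept as `_fallback`. Worked through on a hypothetical URL:

    src = "https://redbust.com/wp-content/uploads/2024/05/photo-768x1024.jpg"
    path, _, end = src.rpartition("-")            # '.../photo', '768x1024.jpg'
    if "x" in end:                                # looks like a WxH thumbnail
        url = f"{path}.{end.rpartition('.')[2]}"  # '.../photo.jpg'
        data = None if src == url else {"_fallback": (src,)}
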
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index 76eadc4..c87430b 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2017-2023 Mike Fährmann
+# Copyright 2017-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -33,8 +33,7 @@ class RedditExtractor(Extractor):
previews = self.config("previews", True)
embeds = self.config("embeds", True)
- videos = self.config("videos", True)
- if videos:
+ if videos := self.config("videos", True):
if videos == "ytdl":
self._extract_video = self._extract_video_ytdl
elif videos == "dash":
@@ -139,9 +138,8 @@ class RedditExtractor(Extractor):
)):
continue
- match = match_submission(url)
- if match:
- extra.append(match.group(1))
+ if match := match_submission(url):
+ extra.append(match[1])
elif not match_user(url) and not match_subreddit(url):
if previews and "comment" not in data and \
"preview" in data:
@@ -181,8 +179,7 @@ class RedditExtractor(Extractor):
submission["id"], item["media_id"], data.get("status"))
continue
src = data["s"]
- url = src.get("u") or src.get("gif") or src.get("mp4")
- if url:
+ if url := src.get("u") or src.get("gif") or src.get("mp4"):
yield url.partition("?")[0].replace("/preview.", "/i.", 1)
else:
self.log.error(
@@ -202,8 +199,7 @@ class RedditExtractor(Extractor):
submission["id"], mid, data.get("status"))
continue
src = data["s"]
- url = src.get("u") or src.get("gif") or src.get("mp4")
- if url:
+ if url := src.get("u") or src.get("gif") or src.get("mp4"):
yield url.partition("?")[0].replace("/preview.", "/i.", 1)
else:
self.log.error(
@@ -242,8 +238,7 @@ class RedditExtractor(Extractor):
try:
for image in post["preview"]["images"]:
- variants = image.get("variants")
- if variants:
+ if variants := image.get("variants"):
if "gif" in variants:
yield variants["gif"]["source"]["url"]
if "mp4" in variants:
@@ -309,7 +304,7 @@ class RedditSubmissionExtractor(RedditExtractor):
def __init__(self, match):
RedditExtractor.__init__(self, match)
- self.submission_id = match.group(1)
+ self.submission_id = match[1]
def submissions(self):
return (self.api.submission(self.submission_id),)
@@ -326,17 +321,17 @@ class RedditImageExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- domain = match.group(1)
- self.path = match.group(2)
+ domain = match[1]
+ self.path = match[2]
if domain == "preview.redd.it":
self.domain = "i.redd.it"
self.query = ""
else:
self.domain = domain
- self.query = match.group(3) or ""
+ self.query = match[3] or ""
def items(self):
- url = "https://{}/{}{}".format(self.domain, self.path, self.query)
+ url = f"https://{self.domain}/{self.path}{self.query}"
data = text.nameext_from_url(url)
yield Message.Directory, data
yield Message.Url, url, data
@@ -355,8 +350,7 @@ class RedditRedirectExtractor(Extractor):
sub_type, subreddit, share_url = self.groups
if sub_type == "u":
sub_type = "user"
- url = "https://www.reddit.com/{}/{}/s/{}".format(
- sub_type, subreddit, share_url)
+ url = f"https://www.reddit.com/{sub_type}/{subreddit}/s/{share_url}"
location = self.request_location(url, notfound="submission")
data = {"_extractor": RedditSubmissionExtractor}
yield Message.Queue, location, data
@@ -478,8 +472,8 @@ class RedditAPI():
if response.status_code != 200:
self.log.debug("Server response: %s", data)
- raise exception.AuthenticationError('"{}: {}"'.format(
- data.get("error"), data.get("message")))
+ raise exception.AuthenticationError(
+ f"\"{data.get('error')}: {data.get('message')}\"")
return "Bearer " + data["access_token"]
def _call(self, endpoint, params):
@@ -508,7 +502,8 @@ class RedditAPI():
try:
data = response.json()
except ValueError:
- raise exception.StopExtraction(text.remove_html(response.text))
+ raise exception.AbortExtraction(
+ text.remove_html(response.text))
if "error" in data:
if data["error"] == 403:
@@ -516,7 +511,7 @@ class RedditAPI():
if data["error"] == 404:
raise exception.NotFoundError()
self.log.debug(data)
- raise exception.StopExtraction(data.get("message"))
+ raise exception.AbortExtraction(data.get("message"))
return data
def _pagination(self, endpoint, params):
@@ -573,8 +568,7 @@ class RedditAPI():
sid = self.extractor.config(key)
return self._decode(sid.rpartition("_")[2].lower()) if sid else default
- @staticmethod
- def _decode(sid):
+ def _decode(self, sid):
return util.bdecode(sid, "0123456789abcdefghijklmnopqrstuvwxyz")
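
`_decode()` above strips a `t3_`-style fullname prefix (via `sid.rpartition("_")[2]`) and interprets the remainder as a base-36 number. A sketch of what `util.bdecode` presumably does:

    def bdecode(data, alphabet="0123456789"):
        # read 'data' as digits in the given alphabet (assumption)
        num = 0
        base = len(alphabet)
        for char in data:
            num = num * base + alphabet.index(char)
        return num

    # bdecode("zz", "0123456789abcdefghijklmnopqrstuvwxyz")  -> 1295
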
diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py
index 612faac..4098c54 100644
--- a/gallery_dl/extractor/redgifs.py
+++ b/gallery_dl/extractor/redgifs.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2020-2023 Mike Fährmann
+# Copyright 2020-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -23,7 +23,7 @@ class RedgifsExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.key = match.group(1)
+ self.key = match[1]
def _init(self):
self.api = RedgifsAPI(self)
@@ -40,8 +40,7 @@ class RedgifsExtractor(Extractor):
for gif in self.gifs():
- gallery = gif.get("gallery")
- if gallery:
+ if gallery := gif.get("gallery"):
gifs = self.api.gallery(gallery)["gifs"]
enum = 1
cnt = len(gifs)
@@ -71,8 +70,7 @@ class RedgifsExtractor(Extractor):
def _formats(self, gif):
urls = gif["urls"]
for fmt in self.formats:
- url = urls.get(fmt)
- if url:
+ if url := urls.get(fmt):
url = url.replace("//thumbs2.", "//thumbs3.", 1)
text.nameext_from_url(url, gif)
yield url
@@ -94,7 +92,7 @@ class RedgifsUserExtractor(RedgifsExtractor):
def __init__(self, match):
RedgifsExtractor.__init__(self, match)
- self.query = match.group(2)
+ self.query = match[2]
def metadata(self):
return {"userName": self.key}
@@ -116,7 +114,7 @@ class RedgifsCollectionExtractor(RedgifsExtractor):
def __init__(self, match):
RedgifsExtractor.__init__(self, match)
- self.collection_id = match.group(2)
+ self.collection_id = match[2]
def metadata(self):
collection = self.api.collection_info(self.key, self.collection_id)
@@ -135,9 +133,9 @@ class RedgifsCollectionsExtractor(RedgifsExtractor):
example = "https://www.redgifs.com/users/USER/collections"
def items(self):
+ base = f"{self.root}/users/{self.key}/collections/"
for collection in self.api.collections(self.key):
- url = "{}/users/{}/collections/{}".format(
- self.root, self.key, collection["folderId"])
+ url = f"{base}{collection['folderId']}"
collection["_extractor"] = RedgifsCollectionExtractor
yield Message.Queue, url, collection
@@ -151,7 +149,7 @@ class RedgifsNichesExtractor(RedgifsExtractor):
def __init__(self, match):
RedgifsExtractor.__init__(self, match)
- self.query = match.group(2)
+ self.query = match[2]
def gifs(self):
order = text.parse_query(self.query).get("order")
@@ -223,25 +221,24 @@ class RedgifsAPI():
return self._call(endpoint)
def user(self, user, order="new"):
- endpoint = "/v2/users/{}/search".format(user.lower())
+ endpoint = f"/v2/users/{user.lower()}/search"
params = {"order": order}
return self._pagination(endpoint, params)
def collection(self, user, collection_id):
- endpoint = "/v2/users/{}/collections/{}/gifs".format(
- user, collection_id)
+ endpoint = f"/v2/users/{user}/collections/{collection_id}/gifs"
return self._pagination(endpoint)
def collection_info(self, user, collection_id):
- endpoint = "/v2/users/{}/collections/{}".format(user, collection_id)
+ endpoint = f"/v2/users/{user}/collections/{collection_id}"
return self._call(endpoint)
def collections(self, user):
- endpoint = "/v2/users/{}/collections".format(user)
+ endpoint = f"/v2/users/{user}/collections"
return self._pagination(endpoint, key="collections")
def niches(self, niche, order):
- endpoint = "/v2/niches/{}/gifs".format(niche)
+ endpoint = f"/v2/niches/{niche}/gifs"
params = {"count": 30, "order": order}
return self._pagination(endpoint, params)
@@ -257,8 +254,8 @@ class RedgifsAPI():
def _call(self, endpoint, params=None):
url = self.API_ROOT + endpoint
self.headers["Authorization"] = self._auth()
- return self.extractor.request(
- url, params=params, headers=self.headers).json()
+ return self.extractor.request_json(
+ url, params=params, headers=self.headers)
def _pagination(self, endpoint, params=None, key="gifs"):
if params is None:
@@ -278,5 +275,5 @@ class RedgifsAPI():
# https://github.com/Redgifs/api/wiki/Temporary-tokens
url = self.API_ROOT + "/v2/auth/temporary"
self.headers["Authorization"] = None
- return "Bearer " + self.extractor.request(
- url, headers=self.headers).json()["token"]
+ return "Bearer " + self.extractor.request_json(
+ url, headers=self.headers)["token"]
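
Dozens of hunks in this commit replace `self.request(...).json()` with `self.request_json(...)`. Assuming the new helper in extractor/common.py simply centralizes JSON decoding and its error handling, its shape would be roughly:

    def request_json(self, url, **kwargs):
        # hypothetical sketch; the real helper lives in extractor/common.py
        response = self.request(url, **kwargs)
        try:
            return response.json()
        except ValueError:
            self.log.warning("Invalid JSON response from %s", url)
            return {}
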
diff --git a/gallery_dl/extractor/rule34us.py b/gallery_dl/extractor/rule34us.py
index 60c1c35..a43ea4c 100644
--- a/gallery_dl/extractor/rule34us.py
+++ b/gallery_dl/extractor/rule34us.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2021-2023 Mike Fährmann
+# Copyright 2021-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,9 +9,8 @@
"""Extractors for https://rule34.us/"""
from .booru import BooruExtractor
-from .. import text
+from .. import text, util
import collections
-import re
class Rule34usExtractor(BooruExtractor):
@@ -20,11 +19,11 @@ class Rule34usExtractor(BooruExtractor):
per_page = 42
def _init(self):
- self._find_tags = re.compile(
+ self._find_tags = util.re(
r'<li class="([^-"]+)-tag"[^>]*><a href="[^;"]+;q=([^"]+)').findall
def _parse_post(self, post_id):
- url = "{}/index.php?r=posts/view&id={}".format(self.root, post_id)
+ url = f"{self.root}/index.php?r=posts/view&id={post_id}"
page = self.request(url).text
extr = text.extract_from(page)
@@ -41,7 +40,8 @@ class Rule34usExtractor(BooruExtractor):
url = post["file_url"]
if "//video-cdn1." in url:
- post["_fallback"] = (url.replace("//video-cdn1.", "//video."),)
+ post["file_url"] = url.replace("//video-cdn1.", "//video.")
+ post["_fallback"] = (url,)
post["md5"] = url.rpartition("/")[2].partition(".")[0]
tags = collections.defaultdict(list)
@@ -62,7 +62,7 @@ class Rule34usTagExtractor(Rule34usExtractor):
def __init__(self, match):
Rule34usExtractor.__init__(self, match)
- self.tags = text.unquote(match.group(1).replace("+", " "))
+ self.tags = text.unquote(match[1].replace("+", " "))
def metadata(self):
return {"search_tags": self.tags}
@@ -99,7 +99,7 @@ class Rule34usPostExtractor(Rule34usExtractor):
def __init__(self, match):
Rule34usExtractor.__init__(self, match)
- self.post_id = match.group(1)
+ self.post_id = match[1]
def posts(self):
return (self._parse_post(self.post_id),)
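
Note the semantics flip in the rule34.us `_prepare` hunk above: the rewritten `//video.` URL becomes the primary `file_url`, and the original `//video-cdn1.` address is demoted to `_fallback`, i.e. it is now only tried if the main host fails:

    url = "https://video-cdn1.rule34.us/images/ab/cd/abcd1234.mp4"  # hypothetical
    post = {"file_url": url}
    if "//video-cdn1." in url:
        post["file_url"] = url.replace("//video-cdn1.", "//video.")
        post["_fallback"] = (url,)  # CDN URL kept as second choice
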
diff --git a/gallery_dl/extractor/rule34vault.py b/gallery_dl/extractor/rule34vault.py
index 8c8abfa..14d5aef 100644
--- a/gallery_dl/extractor/rule34vault.py
+++ b/gallery_dl/extractor/rule34vault.py
@@ -30,8 +30,8 @@ class Rule34vaultExtractor(BooruExtractor):
def _file_url(self, post):
post_id = post["id"]
extension = "jpg" if post["type"] == 0 else "mp4"
- post["file_url"] = url = "{}/posts/{}/{}/{}.{}".format(
- self.root_cdn, post_id // 1000, post_id, post_id, extension)
+ post["file_url"] = url = (f"{self.root_cdn}/posts/{post_id // 1000}/"
+ f"{post_id}/{post_id}.{extension}")
return url
def _prepare(self, post):
@@ -53,11 +53,11 @@ class Rule34vaultExtractor(BooruExtractor):
post["tags_" + types[type]] = values
def _fetch_post(self, post_id):
- url = "{}/api/v2/post/{}".format(self.root, post_id)
- return self.request(url).json()
+ url = f"{self.root}/api/v2/post/{post_id}"
+ return self.request_json(url)
def _pagination(self, endpoint, params=None):
- url = "{}/api{}".format(self.root, endpoint)
+ url = f"{self.root}/api{endpoint}"
if params is None:
params = {}
@@ -67,7 +67,7 @@ class Rule34vaultExtractor(BooruExtractor):
threshold = self.per_page
while True:
- data = self.request(url, method="POST", json=params).json()
+ data = self.request_json(url, method="POST", json=params)
yield from data["items"]
diff --git a/gallery_dl/extractor/rule34xyz.py b/gallery_dl/extractor/rule34xyz.py
index 411a71a..05915ba 100644
--- a/gallery_dl/extractor/rule34xyz.py
+++ b/gallery_dl/extractor/rule34xyz.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2024 Mike Fährmann
+# Copyright 2024-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,7 +9,8 @@
"""Extractors for https://rule34.xyz/"""
from .booru import BooruExtractor
-from .. import text
+from .. import text, exception
+from ..cache import cache
import collections
BASE_PATTERN = r"(?:https?://)?rule34\.xyz"
@@ -38,8 +39,7 @@ class Rule34xyzExtractor(BooruExtractor):
}
def _init(self):
- formats = self.config("format")
- if formats:
+ if formats := self.config("format"):
if isinstance(formats, str):
formats = formats.split(",")
self.formats = formats
@@ -59,8 +59,8 @@ class Rule34xyzExtractor(BooruExtractor):
post_id = post["id"]
root = self.root_cdn if files[fmt][0] else self.root
- post["file_url"] = url = "{}/posts/{}/{}/{}.{}".format(
- root, post_id // 1000, post_id, post_id, extension)
+ post["file_url"] = url = \
+ f"{root}/posts/{post_id // 1000}/{post_id}/{post_id}.{extension}"
post["format_id"] = fmt
post["format"] = extension.partition(".")[0]
@@ -86,11 +86,11 @@ class Rule34xyzExtractor(BooruExtractor):
post["tags_" + types[type]] = values
def _fetch_post(self, post_id):
- url = "{}/api/v2/post/{}".format(self.root, post_id)
- return self.request(url).json()
+ url = f"{self.root}/api/v2/post/{post_id}"
+ return self.request_json(url)
def _pagination(self, endpoint, params=None):
- url = "{}/api{}".format(self.root, endpoint)
+ url = f"{self.root}/api{endpoint}"
if params is None:
params = {}
@@ -102,7 +102,7 @@ class Rule34xyzExtractor(BooruExtractor):
threshold = self.per_page
while True:
- data = self.request(url, method="POST", json=params).json()
+ data = self.request_json(url, method="POST", json=params)
yield from data["items"]
@@ -111,6 +111,26 @@ class Rule34xyzExtractor(BooruExtractor):
params["Skip"] += self.per_page
params["cursor"] = data["cursor"]
+ def login(self):
+ username, password = self._get_auth_info()
+ if username:
+ self.session.headers["Authorization"] = \
+ self._login_impl(username, password)
+
+ @cache(maxage=3650*86400, keyarg=1)
+ def _login_impl(self, username, password):
+ self.log.info("Logging in as %s", username)
+
+ url = f"{self.root}/api/v2/auth/signin"
+ data = {"email": username, "password": password}
+ response = self.request_json(
+ url, method="POST", json=data, fatal=False)
+
+ if jwt := response.get("jwt"):
+ return f"Bearer {jwt}"
+ raise exception.AuthenticationError(
+ (msg := response.get("message")) and f'"{msg}"')
+
class Rule34xyzPostExtractor(Rule34xyzExtractor):
subcategory = "post"
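
The new rule34.xyz `login()` follows gallery-dl's usual pattern: `_login_impl` is wrapped in `@cache(maxage=3650*86400, keyarg=1)`, so the Bearer token is memoized per username (positional argument 1) for roughly ten years, and the signin request is only repeated once the entry expires or is invalidated. A simplified in-memory sketch of such a keyed cache (an assumption; the real decorator in gallery_dl/cache.py also persists entries to disk):

    import functools
    import time

    def cache(maxage, keyarg):
        def decorator(func):
            store = {}
            @functools.wraps(func)
            def wrapper(*args):
                key = args[keyarg]                  # e.g. the username
                value, expires = store.get(key, (None, 0))
                if expires <= time.time():          # missing or stale
                    value = func(*args)
                    store[key] = (value, time.time() + maxage)
                return value
            return wrapper
        return decorator
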
diff --git a/gallery_dl/extractor/saint.py b/gallery_dl/extractor/saint.py
index 5ec2443..07d490a 100644
--- a/gallery_dl/extractor/saint.py
+++ b/gallery_dl/extractor/saint.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2024 Mike Fährmann
+# Copyright 2024-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index 3485db9..1c93cbf 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2023 Mike Fährmann
+# Copyright 2014-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -13,7 +13,6 @@ from .common import Message
from .. import text, util, exception
from ..cache import cache
import collections
-import re
BASE_PATTERN = r"(?:https?://)?" \
r"(?:(?:chan|www|beta|black|white)\.sankakucomplex\.com|sankaku\.app)" \
@@ -26,7 +25,6 @@ class SankakuExtractor(BooruExtractor):
category = "sankaku"
root = "https://sankaku.app"
filename_fmt = "{category}_{id}_{md5}.{extension}"
- cookies_domain = None
_warning = True
TAG_TYPES = {
@@ -49,7 +47,7 @@ class SankakuExtractor(BooruExtractor):
self.api = SankakuAPI(self)
if self.config("tags") == "extended":
self._tags = self._tags_extended
- self._tags_findall = re.compile(
+ self._tags_findall = util.re(
r"tag-type-([^\"' ]+).*?\?tags=([^\"'&]+)").findall
def _file_url(self, post):
@@ -80,8 +78,7 @@ class SankakuExtractor(BooruExtractor):
def _tags(self, post, page):
tags = collections.defaultdict(list)
for tag in self.api.tags(post["id"]):
- name = tag["name"]
- if name:
+ if name := tag["name"]:
tags[tag["type"]].append(name.lower().replace(" ", "_"))
types = self.TAG_TYPES
for type, values in tags.items():
@@ -92,7 +89,8 @@ class SankakuExtractor(BooruExtractor):
def _tags_extended(self, post, page):
try:
url = "https://chan.sankakucomplex.com/posts/" + post["id"]
- page = self.request(url).text
+ headers = {"Referer": url}
+ page = self.request(url, headers=headers).text
except Exception as exc:
return self.log.warning(
"%s: Failed to extract extended tag categories (%s: %s)",
@@ -126,16 +124,16 @@ class SankakuTagExtractor(SankakuExtractor):
def __init__(self, match):
SankakuExtractor.__init__(self, match)
- query = text.parse_query(match.group(1))
+ query = text.parse_query(match[1])
self.tags = text.unquote(query.get("tags", "").replace("+", " "))
if "date:" in self.tags:
# rewrite 'date:' tags (#1790)
- self.tags = re.sub(
- r"date:(\d\d)[.-](\d\d)[.-](\d\d\d\d)(?!T)",
+ self.tags = util.re(
+ r"date:(\d\d)[.-](\d\d)[.-](\d\d\d\d)(?!T)").sub(
r"date:\3-\2-\1T00:00", self.tags)
- self.tags = re.sub(
- r"date:(\d\d\d\d)[.-](\d\d)[.-](\d\d)(?!T)",
+ self.tags = util.re(
+ r"date:(\d\d\d\d)[.-](\d\d)[.-](\d\d)(?!T)").sub(
r"date:\1-\2-\3T00:00", self.tags)
def metadata(self):
@@ -156,7 +154,7 @@ class SankakuPoolExtractor(SankakuExtractor):
def __init__(self, match):
SankakuExtractor.__init__(self, match)
- self.pool_id = match.group(1)
+ self.pool_id = match[1]
def metadata(self):
pool = self.api.pools(self.pool_id)
@@ -182,7 +180,7 @@ class SankakuPostExtractor(SankakuExtractor):
def __init__(self, match):
SankakuExtractor.__init__(self, match)
- self.post_id = match.group(1)
+ self.post_id = match[1]
def posts(self):
return self.api.posts(self.post_id)
@@ -196,14 +194,14 @@ class SankakuBooksExtractor(SankakuExtractor):
def __init__(self, match):
SankakuExtractor.__init__(self, match)
- query = text.parse_query(match.group(1))
+ query = text.parse_query(match[1])
self.tags = text.unquote(query.get("tags", "").replace("+", " "))
def items(self):
params = {"tags": self.tags, "pool_type": "0"}
for pool in self.api.pools_keyset(params):
pool["_extractor"] = SankakuPoolExtractor
- url = "https://sankaku.app/books/{}".format(pool["id"])
+ url = f"https://sankaku.app/books/{pool['id']}"
yield Message.Queue, url, pool
@@ -218,19 +216,16 @@ class SankakuAPI():
"Origin" : extractor.root,
}
- if extractor.config("id-format") in ("alnum", "alphanumeric"):
- self.headers["Api-Version"] = "2"
-
self.username, self.password = extractor._get_auth_info()
if not self.username:
self.authenticate = util.noop
def notes(self, post_id):
params = {"lang": "en"}
- return self._call("/posts/{}/notes".format(post_id), params)
+ return self._call(f"/posts/{post_id}/notes", params)
def tags(self, post_id):
- endpoint = "/posts/{}/tags".format(post_id)
+ endpoint = f"/posts/{post_id}/tags"
params = {
"lang" : "en",
"page" : 1,
@@ -312,15 +307,14 @@ class SankakuAPI():
("unauthorized", "invalid-token", "invalid_token")):
_authenticate_impl.invalidate(self.username)
continue
- raise exception.StopExtraction(code)
+ raise exception.AbortExtraction(code)
return data
def _pagination(self, endpoint, params):
params["lang"] = "en"
params["limit"] = str(self.extractor.per_page)
- refresh = self.extractor.config("refresh", False)
- if refresh:
+ if refresh := self.extractor.config("refresh", False):
offset = expires = 0
from time import time
@@ -334,8 +328,7 @@ class SankakuAPI():
for post in posts:
if not expires:
- url = post["file_url"]
- if url:
+ if url := post["file_url"]:
expires = text.parse_int(
text.extr(url, "e=", "&")) - 60
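
The two `util.re(...).sub(...)` calls in `SankakuTagExtractor.__init__` normalize `date:` search tags (#1790), rewriting both `DD.MM.YYYY` and `YYYY.MM.DD` forms to `YYYY-MM-DDT00:00`. For example:

    import re
    tags = "rating:safe date:31.12.2024"
    tags = re.sub(r"date:(\d\d)[.-](\d\d)[.-](\d\d\d\d)(?!T)",
                  r"date:\3-\2-\1T00:00", tags)
    # -> "rating:safe date:2024-12-31T00:00"
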
diff --git a/gallery_dl/extractor/sankakucomplex.py b/gallery_dl/extractor/sankakucomplex.py
index 50c21e3..405e07e 100644
--- a/gallery_dl/extractor/sankakucomplex.py
+++ b/gallery_dl/extractor/sankakucomplex.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -10,7 +10,6 @@
from .common import Extractor, Message
from .. import text, util
-import re
class SankakucomplexExtractor(Extractor):
@@ -20,7 +19,7 @@ class SankakucomplexExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.path = match.group(1)
+ self.path = match[1]
class SankakucomplexArticleExtractor(SankakucomplexExtractor):
@@ -34,7 +33,7 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor):
example = "https://news.sankakucomplex.com/1970/01/01/TITLE"
def items(self):
- url = "{}/{}/?pg=X".format(self.root, self.path)
+ url = f"{self.root}/{self.path}/?pg=X"
extr = text.extract_from(self.request(url).text)
data = {
"title" : text.unescape(
@@ -64,23 +63,20 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor):
file.update(data)
yield Message.Url, url, file
- @staticmethod
- def _extract_images(content):
- orig_sub = re.compile(r"-\d+x\d+\.").sub
+ def _extract_images(self, content):
+ orig_sub = util.re(r"-\d+x\d+\.").sub
return [
orig_sub(".", url) for url in
util.unique(text.extract_iter(content, 'data-lazy-src="', '"'))
]
- @staticmethod
- def _extract_videos(content):
- return re.findall(r"<source [^>]*src=[\"']([^\"']+)", content)
+ def _extract_videos(self, content):
+ return util.re(r"<source [^>]*src=[\"']([^\"']+)").findall(content)
- @staticmethod
- def _extract_embeds(content):
+ def _extract_embeds(self, content):
return [
"ytdl:" + url for url in
- re.findall(r"<iframe [^>]*src=[\"']([^\"']+)", content)
+ util.re(r"<iframe [^>]*src=[\"']([^\"']+)").findall(content)
]
@@ -96,7 +92,7 @@ class SankakucomplexTagExtractor(SankakucomplexExtractor):
data = {"_extractor": SankakucomplexArticleExtractor}
while True:
- url = "{}/{}/page/{}/".format(self.root, self.path, pnum)
+ url = f"{self.root}/{self.path}/page/{pnum}/"
response = self.request(url, fatal=False)
if response.status_code >= 400:
return
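
`_extract_embeds` above prefixes every matched iframe source with `ytdl:` so the URLs are routed to the ytdl downloader. On a hypothetical page fragment:

    import re
    content = '<iframe width="640" src="https://www.youtube.com/embed/abc123">'
    embeds = ["ytdl:" + url for url in
              re.findall(r"<iframe [^>]*src=[\"']([^\"']+)", content)]
    # -> ['ytdl:https://www.youtube.com/embed/abc123']
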
diff --git a/gallery_dl/extractor/koharu.py b/gallery_dl/extractor/schalenetwork.py
index b60157e..d517287 100644
--- a/gallery_dl/extractor/koharu.py
+++ b/gallery_dl/extractor/schalenetwork.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2024 Mike Fährmann
+# Copyright 2024-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -22,9 +22,9 @@ BASE_PATTERN = (
)
-class KoharuExtractor(Extractor):
- """Base class for koharu extractors"""
- category = "koharu"
+class SchalenetworkExtractor(Extractor):
+ """Base class for schale.network extractors"""
+ category = "schalenetwork"
root = "https://niyaniya.moe"
root_api = "https://api.schale.network"
request_interval = (0.5, 1.5)
@@ -40,8 +40,8 @@ class KoharuExtractor(Extractor):
url_api = self.root_api + endpoint
while True:
- data = self.request(
- url_api, params=params, headers=self.headers).json()
+ data = self.request_json(
+ url_api, params=params, headers=self.headers)
try:
entries = data["entries"]
@@ -49,9 +49,8 @@ class KoharuExtractor(Extractor):
return
for entry in entries:
- url = "{}/g/{}/{}".format(
- self.root, entry["id"], entry["public_key"])
- entry["_extractor"] = KoharuGalleryExtractor
+ url = f"{self.root}/g/{entry['id']}/{entry['public_key']}"
+ entry["_extractor"] = SchalenetworkGalleryExtractor
yield Message.Queue, url, entry
try:
@@ -62,8 +61,8 @@ class KoharuExtractor(Extractor):
params["page"] += 1
-class KoharuGalleryExtractor(KoharuExtractor, GalleryExtractor):
- """Extractor for koharu galleries"""
+class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor):
+ """Extractor for schale.network galleries"""
filename_fmt = "{num:>03}.{extension}"
directory_fmt = ("{category}", "{id} {title}")
archive_fmt = "{id}_{num}"
@@ -89,7 +88,7 @@ class KoharuGalleryExtractor(KoharuExtractor, GalleryExtractor):
def __init__(self, match):
GalleryExtractor.__init__(self, match)
- self.gallery_url = None
+ self.page_url = None
def _init(self):
self.headers = {
@@ -106,9 +105,8 @@ class KoharuGalleryExtractor(KoharuExtractor, GalleryExtractor):
self.directory_fmt = ("{category}",)
def metadata(self, _):
- url = "{}/books/detail/{}/{}".format(
- self.root_api, self.groups[1], self.groups[2])
- self.data = data = self.request(url, headers=self.headers).json()
+ url = f"{self.root_api}/books/detail/{self.groups[1]}/{self.groups[2]}"
+ self.data = data = self.request_json(url, headers=self.headers)
data["date"] = text.parse_timestamp(data["created_at"] // 1000)
tags = []
@@ -142,11 +140,8 @@ class KoharuGalleryExtractor(KoharuExtractor, GalleryExtractor):
data = self.data
fmt = self._select_format(data["data"])
- url = "{}/books/data/{}/{}/{}/{}".format(
- self.root_api,
- data["id"], data["public_key"],
- fmt["id"], fmt["public_key"],
- )
+ url = (f"{self.root_api}/books/data/{data['id']}/"
+ f"{data['public_key']}/{fmt['id']}/{fmt['public_key']}")
params = {
"v": data["updated_at"],
"w": fmt["w"],
@@ -154,16 +149,16 @@ class KoharuGalleryExtractor(KoharuExtractor, GalleryExtractor):
if self.cbz:
params["action"] = "dl"
- base = self.request(
+ base = self.request_json(
url, method="POST", params=params, headers=self.headers,
- ).json()["base"]
- url = "{}?v={}&w={}".format(base, data["updated_at"], fmt["w"])
+ )["base"]
+ url = f"{base}?v={data['updated_at']}&w={fmt['w']}"
info = text.nameext_from_url(base)
if not info["extension"]:
info["extension"] = "cbz"
return ((url, info),)
- data = self.request(url, params=params, headers=self.headers).json()
+ data = self.request_json(url, params=params, headers=self.headers)
base = data["base"]
results = []
@@ -205,8 +200,8 @@ class KoharuGalleryExtractor(KoharuExtractor, GalleryExtractor):
return fmt
-class KoharuSearchExtractor(KoharuExtractor):
- """Extractor for koharu search results"""
+class SchalenetworkSearchExtractor(SchalenetworkExtractor):
+ """Extractor for schale.network search results"""
subcategory = "search"
pattern = BASE_PATTERN + r"/\?([^#]*)"
example = "https://niyaniya.moe/?s=QUERY"
@@ -217,8 +212,8 @@ class KoharuSearchExtractor(KoharuExtractor):
return self._pagination("/books", params)
-class KoharuFavoriteExtractor(KoharuExtractor):
- """Extractor for koharu favorites"""
+class SchalenetworkFavoriteExtractor(SchalenetworkExtractor):
+ """Extractor for schale.network favorites"""
subcategory = "favorite"
pattern = BASE_PATTERN + r"/favorites(?:\?([^#]*))?"
example = "https://niyaniya.moe/favorites"
diff --git a/gallery_dl/extractor/scrolller.py b/gallery_dl/extractor/scrolller.py
index 7bfc550..40f047a 100644
--- a/gallery_dl/extractor/scrolller.py
+++ b/gallery_dl/extractor/scrolller.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2024 Mike Fährmann
+# Copyright 2024-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -110,9 +110,9 @@ class ScrolllerExtractor(Extractor):
url = "https://api.scrolller.com/api/v2/graphql"
headers["Content-Type"] = "text/plain;charset=UTF-8"
- return self.request(
+ return self.request_json(
url, method="POST", headers=headers, data=util.json_dumps(data),
- ).json()["data"]
+ )["data"]
def _pagination(self, opname, variables, data=None):
if data is None:
diff --git a/gallery_dl/extractor/seiga.py b/gallery_dl/extractor/seiga.py
index ff8c505..7319731 100644
--- a/gallery_dl/extractor/seiga.py
+++ b/gallery_dl/extractor/seiga.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2023 Mike Fährmann
+# Copyright 2016-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -42,11 +42,11 @@ class SeigaExtractor(Extractor):
def get_image_url(self, image_id):
"""Get url for an image with id 'image_id'"""
- url = "{}/image/source/{}".format(self.root, image_id)
+ url = f"{self.root}/image/source/{image_id}"
location = self.request_location(url, notfound="image")
if "nicovideo.jp/login" in location:
- raise exception.StopExtraction(
- "HTTP redirect to login page (%s)", location.partition("?")[0])
+ raise exception.AbortExtraction(
+ f"HTTP redirect to login page ({location.partition('?')[0]})")
return location.replace("/o/", "/priv/", 1)
def login(self):
@@ -81,7 +81,7 @@ class SeigaExtractor(Extractor):
if "/mfa" in response.url:
page = response.text
email = text.extr(page, 'class="userAccount">', "<")
- code = self.input("Email Confirmation Code ({}): ".format(email))
+ code = self.input(f"Email Confirmation Code ({email}): ")
data = {
"otp": code,
@@ -145,7 +145,7 @@ class SeigaUserExtractor(SeigaExtractor):
}
def get_images(self):
- url = "{}/user/illust/{}".format(self.root, self.user_id)
+ url = f"{self.root}/user/illust/{self.user_id}"
params = {"sort": self.order, "page": self.start_page,
"target": "illust_all"}
@@ -187,7 +187,7 @@ class SeigaImageExtractor(SeigaExtractor):
def __init__(self, match):
SeigaExtractor.__init__(self, match)
- self.image_id = match.group(1)
+ self.image_id = match[1]
def skip(self, num):
self.start_image += num
@@ -197,7 +197,7 @@ class SeigaImageExtractor(SeigaExtractor):
self.cookies.set(
"skip_fetish_warning", "1", domain="seiga.nicovideo.jp")
- url = "{}/seiga/im{}".format(self.root, self.image_id)
+ url = f"{self.root}/seiga/im{self.image_id}"
page = self.request(url, notfound="image").text
data = text.extract_all(page, (
diff --git a/gallery_dl/extractor/senmanga.py b/gallery_dl/extractor/senmanga.py
index a8fdc4c..301652a 100644
--- a/gallery_dl/extractor/senmanga.py
+++ b/gallery_dl/extractor/senmanga.py
@@ -20,7 +20,7 @@ class SenmangaChapterExtractor(ChapterExtractor):
example = "https://raw.senmanga.com/MANGA/CHAPTER"
def _init(self):
- self.session.headers["Referer"] = self.gallery_url
+ self.session.headers["Referer"] = self.page_url
# select "All pages" viewer
self.cookies.set("viewer", "1", domain="raw.senmanga.com")
diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py
index 9e7d75d..2feb64e 100644
--- a/gallery_dl/extractor/sexcom.py
+++ b/gallery_dl/extractor/sexcom.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -12,7 +12,7 @@ from .common import Extractor, Message
from .. import text
from datetime import datetime
-BASE_PATTERN = r"(?:https?://)?(?:www\.)?sex\.com"
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?sex\.com(?:/[a-z]{2})?"
class SexcomExtractor(Extractor):
@@ -24,6 +24,8 @@ class SexcomExtractor(Extractor):
root = "https://www.sex.com"
def items(self):
+ self.gifs = self.config("gifs", True)
+
yield Message.Directory, self.metadata()
for pin in map(self._parse_pin, self.pins()):
if not pin:
@@ -38,6 +40,7 @@ class SexcomExtractor(Extractor):
pin["date"] = dt
except Exception:
pass
+ pin["tags"] = [t[1:] for t in pin["tags"]]
yield Message.Url, url, pin
@@ -63,17 +66,32 @@ class SexcomExtractor(Extractor):
url = text.urljoin(self.root, text.unescape(url))
def _parse_pin(self, url):
- response = self.request(url, fatal=False)
+ if "/pin/" in url:
+ if url[-1] != "/":
+ url += "/"
+ elif url[-1] == "/":
+ url = url[:-1]
+
+ response = self.request(url, fatal=False, allow_redirects=False)
+ location = response.headers.get("location")
+
+ if location:
+ if location[0] == "/":
+ location = self.root + location
+ if len(location) <= 25:
+ return self.log.warning(
+                    'Unable to fetch %s: redirect to homepage', url)
+ response = self.request(location, fatal=False)
+
if response.status_code >= 400:
- self.log.warning('Unable to fetch %s ("%s %s")',
- url, response.status_code, response.reason)
- return None
+ return self.log.warning('Unable to fetch %s: %s %s',
+ url, response.status_code, response.reason)
if "/pin/" in response.url:
return self._parse_pin_legacy(response)
if "/videos/" in response.url:
return self._parse_pin_video(response)
- return self._parse_pin_gifs(response)
+ return self._parse_pin_image(response)
def _parse_pin_legacy(self, response):
extr = text.extract_from(response.text)
@@ -94,7 +112,7 @@ class SexcomExtractor(Extractor):
if info:
try:
- path, _ = text.rextract(
+ path = text.rextr(
info, "src: '", "'", info.index("label: 'HD'"))
except ValueError:
path = text.extr(info, "src: '", "'")
@@ -124,20 +142,31 @@ class SexcomExtractor(Extractor):
return data
- def _parse_pin_gifs(self, response):
+ def _parse_pin_image(self, response):
extr = text.extract_from(response.text)
+ href = extr(' href="', '"').partition("?")[0]
+ title, _, type = extr("<title>", " | ").rpartition(" ")
data = {
"_http_headers": {"Referer": response.url},
- "type": "gif",
- "url": extr(' href="', '"'),
- "title": text.unescape(extr("<title>", " Gif | Sex.com<")),
+ "url": href,
+ "title": text.unescape(title),
"pin_id": text.parse_int(extr(
'rel="canonical" href="', '"').rpartition("/")[2]),
"tags": text.split_html(extr("</h1>", "</section>")),
}
- return text.nameext_from_url(data["url"], data)
+ text.nameext_from_url(href, data)
+ if type.lower() == "pic":
+ data["type"] = "picture"
+ else:
+ data["type"] = "gif"
+ if self.gifs and data["extension"] == "webp":
+ data["extension"] = "gif"
+ data["_fallback"] = (href,)
+ data["url"] = href[:-4] + "gif"
+
+ return data
def _parse_pin_video(self, response):
extr = text.extract_from(response.text)
@@ -147,6 +176,7 @@ class SexcomExtractor(Extractor):
data = {
"_ytdl_manifest": "hls",
+ "_ytdl_manifest_headers": {"Referer": response.url},
"extension": "mp4",
"type": "video",
"title": text.unescape(extr("<title>", " | Sex.com<")),
@@ -166,7 +196,7 @@ class SexcomPinExtractor(SexcomExtractor):
subcategory = "pin"
directory_fmt = ("{category}",)
pattern = (BASE_PATTERN +
- r"(/(?:pin|\w\w/(?:gif|video)s)/\d+/?)(?!.*#related$)")
+ r"(/(?:\w\w/(?:pic|gif|video)s|pin)/\d+/?)(?!.*#related$)")
example = "https://www.sex.com/pin/12345-TITLE/"
def pins(self):
@@ -185,8 +215,8 @@ class SexcomRelatedPinExtractor(SexcomPinExtractor):
return {"original_pin": pin}
def pins(self):
- url = "{}/pin/related?pinId={}&limit=24&offset=0".format(
- self.root, self.groups[1])
+ url = (f"{self.root}/pin/related?pinId={self.groups[1]}"
+ f"&limit=24&offset=0")
return self._pagination(url)
@@ -201,7 +231,7 @@ class SexcomPinsExtractor(SexcomExtractor):
return {"user": text.unquote(self.groups[0])}
def pins(self):
- url = "{}/user/{}/pins/".format(self.root, self.groups[0])
+ url = f"{self.root}/user/{self.groups[0]}/pins/"
return self._pagination(url)
@@ -216,7 +246,7 @@ class SexcomLikesExtractor(SexcomExtractor):
return {"user": text.unquote(self.groups[0])}
def pins(self):
- url = "{}/user/{}/likes/".format(self.root, self.groups[0])
+ url = f"{self.root}/user/{self.groups[0]}/likes/"
return self._pagination(url)
@@ -236,33 +266,75 @@ class SexcomBoardExtractor(SexcomExtractor):
}
def pins(self):
- url = "{}/user/{}/{}/".format(self.root, self.user, self.board)
+ url = f"{self.root}/user/{self.user}/{self.board}/"
return self._pagination(url)
class SexcomSearchExtractor(SexcomExtractor):
"""Extractor for search results on www.sex.com"""
subcategory = "search"
- directory_fmt = ("{category}", "search", "{search[query]}")
- pattern = (BASE_PATTERN + r"/((?:"
- r"(pic|gif|video)s/([^/?#]*)|search/(pic|gif|video)s"
- r")/?(?:\?([^#]+))?)")
+ directory_fmt = ("{category}", "search", "{search[search]}")
+ pattern = (BASE_PATTERN + r"/(?:"
+ r"(pic|gif|video)s(?:\?(search=[^#]+)$|/([^/?#]*))"
+ r"|search/(pic|gif|video)s"
+ r")/?(?:\?([^#]+))?")
example = "https://www.sex.com/search/pics?query=QUERY"
def _init(self):
- self.path, t1, query_alt, t2, query = self.groups
+ t1, qs1, search_alt, t2, qs2 = self.groups
- self.search = text.parse_query(query)
- self.search["type"] = t1 or t2
- if "query" not in self.search:
- self.search["query"] = query_alt or ""
+ self.params = params = text.parse_query(qs1 or qs2)
+ if "query" in params:
+ params["search"] = params.pop("query")
+ params.setdefault("sexual-orientation", "straight")
+ params.setdefault("order", "likeCount")
+ params.setdefault("search", search_alt or "")
- def metadata(self):
- return {"search": self.search}
+ self.kwdict["search"] = search = params.copy()
+ search["type"] = self.type = t1 or t2
- def pins(self):
- url = "{}/{}".format(self.root, self.path)
- return self._pagination(url)
+ def items(self):
+ root = "https://imagex1.sx.cdn.live"
+ type = self.type
+ gifs = self.config("gifs", True)
+
+ url = (f"{self.root}/portal/api/"
+ f"{'picture' if type == 'pic' else type}s/search")
+ params = self.params
+ params["page"] = text.parse_int(params.get("page"), 1)
+ params["limit"] = 40
+
+ while True:
+ data = self.request_json(url, params=params)
+
+ for pin in data["data"]:
+ path = pin["uri"]
+ pin["pin_id"] = pin.pop("id")
+ text.nameext_from_url(path, pin)
+
+ parts = path.rsplit("/", 4)
+ try:
+ pin["date_url"] = pin["date"] = datetime(
+ int(parts[1]), int(parts[2]), int(parts[3]))
+ except Exception:
+ pass
+
+ if type == "pic":
+ pin["type"] = "picture"
+ else:
+ pin["type"] = "gif"
+ if gifs and pin["extension"] == "webp":
+ pin["extension"] = "gif"
+ pin["_fallback"] = (f"{root}{path}",)
+ path = f"{path[:-4]}gif"
+
+ pin["url"] = f"{root}{path}"
+ yield Message.Directory, pin
+ yield Message.Url, pin["url"], pin
+
+ if params["page"] >= data["paging"]["numberOfPages"]:
+ break
+ params["page"] += 1
def _check_empty(response):
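
The new `gifs` option handling above (used both in `_parse_pin_image` and in the search API loop) rewrites a `.webp` URL to `.gif` and keeps the webp as `_fallback`, so an animated original is preferred but the webp still downloads if no GIF exists. With a hypothetical path:

    href = "https://imagex1.sx.cdn.live/images/2024/01/02/pin.webp"  # hypothetical
    data = {"extension": "webp"}
    if data["extension"] == "webp":
        data["extension"] = "gif"
        data["_fallback"] = (href,)       # webp kept as backup
        data["url"] = href[:-4] + "gif"   # '.../pin.gif'
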
diff --git a/gallery_dl/extractor/shimmie2.py b/gallery_dl/extractor/shimmie2.py
index d15762d..9afa706 100644
--- a/gallery_dl/extractor/shimmie2.py
+++ b/gallery_dl/extractor/shimmie2.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2023 Mike Fährmann
+# Copyright 2023-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -19,13 +19,11 @@ class Shimmie2Extractor(BaseExtractor):
archive_fmt = "{id}"
def _init(self):
- cookies = self.config_instance("cookies")
- if cookies:
+ if cookies := self.config_instance("cookies"):
domain = self.root.rpartition("/")[2]
self.cookies_update_dict(cookies, domain=domain)
- file_url = self.config_instance("file_url")
- if file_url:
+ if file_url := self.config_instance("file_url"):
self.file_url_fmt = file_url
if self.category == "giantessbooru":
@@ -110,7 +108,7 @@ class Shimmie2TagExtractor(Shimmie2Extractor):
mime = ""
while True:
- url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum)
+ url = f"{self.root}/post/list/{self.tags}/{pnum}"
page = self.request(url).text
extr = text.extract_from(page)
@@ -153,7 +151,7 @@ class Shimmie2TagExtractor(Shimmie2Extractor):
pnum += 1
if not extr(">Next<", ">"):
- if not extr("/{}'>{}<".format(pnum, pnum), ">"):
+ if not extr(f"/{pnum}'>{pnum}<", ">"):
return
def _posts_giantessbooru(self):
@@ -161,8 +159,7 @@ class Shimmie2TagExtractor(Shimmie2Extractor):
file_url_fmt = (self.root + "/index.php?q=/image/{}.jpg").format
while True:
- url = "{}/index.php?q=/post/list/{}/{}".format(
- self.root, self.tags, pnum)
+ url = f"{self.root}/index.php?q=/post/list/{self.tags}/{pnum}"
extr = text.extract_from(self.request(url).text)
while True:
@@ -184,7 +181,7 @@ class Shimmie2TagExtractor(Shimmie2Extractor):
}
pnum += 1
- if not extr("/{0}'>{0}<".format(pnum), ">"):
+ if not extr(f"/{pnum}'>{pnum}<", ">"):
return
@@ -196,18 +193,18 @@ class Shimmie2PostExtractor(Shimmie2Extractor):
def posts(self):
post_id = self.groups[-1]
- url = "{}/post/view/{}".format(self.root, post_id)
+ url = f"{self.root}/post/view/{post_id}"
page = self.request(url).text
extr = text.extract_from(page)
- quote = self._quote_type(page)
+ qt = self._quote_type(page)
post = {
"id" : post_id,
"tags" : extr(": ", "<").partition(" - ")[0].rstrip(")"),
"md5" : extr("/_thumbs/", "/"),
"file_url": self.root + (
- extr("id={0}main_image{0} src={0}".format(quote), quote) or
- extr("<source src="+quote, quote)).lstrip("."),
+ extr(f"id={qt}main_image{qt} src={qt}", qt) or
+ extr("<source src="+qt, qt)).lstrip("."),
"width" : extr("data-width=", " ").strip("\"'"),
"height" : extr("data-height=", ">").partition(
" ")[0].strip("\"'"),
@@ -221,7 +218,7 @@ class Shimmie2PostExtractor(Shimmie2Extractor):
def _posts_giantessbooru(self):
post_id = self.groups[-1]
- url = "{}/index.php?q=/post/view/{}".format(self.root, post_id)
+ url = f"{self.root}/index.php?q=/post/view/{post_id}"
extr = text.extract_from(self.request(url).text)
return ({
diff --git a/gallery_dl/extractor/shopify.py b/gallery_dl/extractor/shopify.py
index a658cac..84c9a84 100644
--- a/gallery_dl/extractor/shopify.py
+++ b/gallery_dl/extractor/shopify.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -18,10 +18,6 @@ class ShopifyExtractor(BaseExtractor):
filename_fmt = "{product[title]}_{num:>02}_{id}.{extension}"
archive_fmt = "{id}"
- def __init__(self, match):
- BaseExtractor.__init__(self, match)
- self.item_url = self.root + match.group(match.lastindex)
-
def items(self):
data = self.metadata()
yield Message.Directory, data
@@ -98,14 +94,15 @@ class ShopifyCollectionExtractor(ShopifyExtractor):
example = "https://www.fashionnova.com/collections/TITLE"
def metadata(self):
- return self.request(self.item_url + ".json").json()
+ url = f"{self.root}{self.groups[-1]}.json"
+ return self.request_json(url)
def products(self):
- url = self.item_url + "/products.json"
+ url = f"{self.root}{self.groups[-1]}/products.json"
params = {"page": 1}
while True:
- data = self.request(url, params=params).json()["products"]
+ data = self.request_json(url, params=params)["products"]
if not data:
return
yield from data
@@ -120,6 +117,7 @@ class ShopifyProductExtractor(ShopifyExtractor):
example = "https://www.fashionnova.com/collections/TITLE/products/NAME"
def products(self):
- product = self.request(self.item_url + ".json").json()["product"]
+ url = f"{self.root}{self.groups[-1]}.json"
+ product = self.request_json(url)["product"]
del product["image"]
return (product,)
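
For context, the Shopify collection extractor pages through the public `products.json` endpoint until an empty list comes back. Standalone, the loop amounts to this sketch (using requests, with the URL from the example above):

    import requests

    url = "https://www.fashionnova.com/collections/TITLE/products.json"
    page = 1
    while True:
        products = requests.get(url, params={"page": page}).json()["products"]
        if not products:
            break
        for product in products:
            print(product["title"])
        page += 1
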
diff --git a/gallery_dl/extractor/simplyhentai.py b/gallery_dl/extractor/simplyhentai.py
index 6f72291..d6541b2 100644
--- a/gallery_dl/extractor/simplyhentai.py
+++ b/gallery_dl/extractor/simplyhentai.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2023 Mike Fährmann
+# Copyright 2018-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -29,7 +29,7 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
GalleryExtractor.__init__(self, match, url)
def _init(self):
- self.session.headers["Referer"] = self.gallery_url
+ self.session.headers["Referer"] = self.page_url
def metadata(self, page):
extr = text.extract_from(page)
@@ -55,9 +55,9 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
return data
def images(self, _):
- url = self.gallery_url + "/all-pages"
+ url = self.page_url + "/all-pages"
headers = {"Accept": "application/json"}
- images = self.request(url, headers=headers).json()
+ images = self.request_json(url, headers=headers)
return [
(
urls["full"].replace("/giant_thumb_", "/"),
@@ -80,8 +80,8 @@ class SimplyhentaiImageExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.page_url = "https://old." + match.group(1)
- self.type = match.group(2)
+ self.page_url = "https://old." + match[1]
+ self.type = match[2]
def items(self):
extr = text.extract_from(self.request(self.page_url).text)
@@ -90,8 +90,7 @@ class SimplyhentaiImageExtractor(Extractor):
url = extr('&quot;image&quot;:&quot;' , '&')
url = extr("&quot;content&quot;:&quot;", "&") or url
- tags = text.extr(descr, " tagged with ", " online for free ")
- if tags:
+ if tags := text.extr(descr, " tagged with ", " online for free "):
tags = tags.split(", ")
tags[-1] = tags[-1].partition(" ")[2]
else:
@@ -123,7 +122,7 @@ class SimplyhentaiVideoExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.page_url = "https://" + match.group(1)
+ self.page_url = "https://" + match[1]
def items(self):
page = self.request(self.page_url).text
diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py
index cdccd4c..1caafd1 100644
--- a/gallery_dl/extractor/skeb.py
+++ b/gallery_dl/extractor/skeb.py
@@ -21,7 +21,7 @@ class SkebExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.user_name = match.group(1)
+ self.user_name = match[1]
def _init(self):
self.thumbnails = self.config("thumbnails", False)
@@ -81,8 +81,8 @@ class SkebExtractor(Extractor):
params["offset"] = 0
while True:
- posts = self.request(
- url, params=params, headers=self.headers).json()
+ posts = self.request_json(
+ url, params=params, headers=self.headers)
for post in posts:
parts = post["path"].split("/")
@@ -100,13 +100,13 @@ class SkebExtractor(Extractor):
params["offset"] += 30
def _pagination_users(self, endpoint, params):
- url = "{}/api{}".format(self.root, endpoint)
+ url = f"{self.root}/api{endpoint}"
params["offset"] = 0
params["limit"] = 90
while True:
- data = self.request(
- url, params=params, headers=self.headers).json()
+ data = self.request_json(
+ url, params=params, headers=self.headers)
yield from data
if len(data) < params["limit"]:
@@ -114,9 +114,8 @@ class SkebExtractor(Extractor):
params["offset"] += params["limit"]
def _get_post_data(self, user_name, post_num):
- url = "{}/api/users/{}/works/{}".format(
- self.root, user_name, post_num)
- resp = self.request(url, headers=self.headers).json()
+ url = f"{self.root}/api/users/{user_name}/works/{post_num}"
+ resp = self.request_json(url, headers=self.headers)
creator = resp["creator"]
post = {
"post_id" : resp["id"],
@@ -163,8 +162,7 @@ class SkebExtractor(Extractor):
})
if self.article and "article_image_url" in resp:
- url = resp["article_image_url"]
- if url:
+ if url := resp["article_image_url"]:
files.append({
"content_category": "article",
"file_id" : "article",
@@ -184,7 +182,7 @@ class SkebExtractor(Extractor):
"height" : info["height"],
"byte_size" : info["byte_size"],
"duration" : info["duration"],
- "frame_rate": info["frame_rate"],
+ "frame_rate": info.get("frame_rate"),
"software" : info["software"],
"extension" : info["extension"],
"is_movie" : info["is_movie"],
@@ -203,7 +201,7 @@ class SkebPostExtractor(SkebExtractor):
def __init__(self, match):
SkebExtractor.__init__(self, match)
- self.post_num = match.group(2)
+ self.post_num = match[2]
def posts(self):
return ((self.user_name, self.post_num),)
@@ -216,7 +214,7 @@ class SkebUserExtractor(SkebExtractor):
example = "https://skeb.jp/@USER"
def posts(self):
- url = "{}/api/users/{}/works".format(self.root, self.user_name)
+ url = f"{self.root}/api/users/{self.user_name}/works"
params = {"role": "creator", "sort": "date"}
posts = self._pagination(url, params)
@@ -266,9 +264,9 @@ class SkebSearchExtractor(SkebExtractor):
data = {"requests": (request,)}
while True:
- result = self.request(
+ result = self.request_json(
url, method="POST", params=params, headers=headers, json=data,
- ).json()["results"][0]
+ )["results"][0]
for post in result["hits"]:
parts = post["path"].split("/")
@@ -289,7 +287,7 @@ class SkebFollowingExtractor(SkebExtractor):
items = SkebExtractor._items_users
def users(self):
- endpoint = "/users/{}/following_creators".format(self.user_name)
+ endpoint = f"/users/{self.user_name}/following_creators"
params = {"sort": "date"}
return self._pagination_users(endpoint, params)
diff --git a/gallery_dl/extractor/slickpic.py b/gallery_dl/extractor/slickpic.py
index b2961e1..ee877f2 100644
--- a/gallery_dl/extractor/slickpic.py
+++ b/gallery_dl/extractor/slickpic.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -21,8 +21,8 @@ class SlickpicExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.user = match.group(1)
- self.root = "https://{}.slickpic.com".format(self.user)
+ self.user = match[1]
+ self.root = f"https://{self.user}.slickpic.com"
class SlickpicAlbumExtractor(SlickpicExtractor):
@@ -37,7 +37,7 @@ class SlickpicAlbumExtractor(SlickpicExtractor):
def __init__(self, match):
SlickpicExtractor.__init__(self, match)
- self.album = match.group(2)
+ self.album = match[2]
def items(self):
data = self.metadata()
@@ -72,7 +72,7 @@ class SlickpicAlbumExtractor(SlickpicExtractor):
yield Message.Url, url, img
def metadata(self):
- url = "{}/albums/{}/?wallpaper".format(self.root, self.album)
+ url = f"{self.root}/albums/{self.album}/?wallpaper"
extr = text.extract_from(self.request(url).text)
title = text.unescape(extr("<title>", "</title>"))
@@ -105,7 +105,7 @@ class SlickpicAlbumExtractor(SlickpicExtractor):
"sng" : "0",
"whq" : "1",
}
- return self.request(url, method="POST", data=data).json()["list"]
+ return self.request_json(url, method="POST", data=data)["list"]
class SlickpicUserExtractor(SlickpicExtractor):
diff --git a/gallery_dl/extractor/slideshare.py b/gallery_dl/extractor/slideshare.py
index 0722d23..c0f0e36 100644
--- a/gallery_dl/extractor/slideshare.py
+++ b/gallery_dl/extractor/slideshare.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright 2016-2017 Leonardo Taccari
-# Copyright 2017-2023 Mike Fährmann
+# Copyright 2017-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -26,8 +26,7 @@ class SlidesharePresentationExtractor(GalleryExtractor):
def __init__(self, match):
self.user, self.presentation = match.groups()
- url = "https://www.slideshare.net/{}/{}".format(
- self.user, self.presentation)
+ url = f"https://www.slideshare.net/{self.user}/{self.presentation}"
GalleryExtractor.__init__(self, match, url)
def metadata(self, page):
@@ -47,11 +46,8 @@ class SlidesharePresentationExtractor(GalleryExtractor):
def images(self, page):
slides = self.slideshow["slides"]
- begin = "{}/{}/95/{}-".format(
- slides["host"],
- slides["imageLocation"],
- slides["title"],
- )
+ begin = (f"{slides['host']}/{slides['imageLocation']}"
+ f"/95/{slides['title']}-")
end = "-1024.jpg"
return [
diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py
index 48bd918..e9c89a1 100644
--- a/gallery_dl/extractor/smugmug.py
+++ b/gallery_dl/extractor/smugmug.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2023 Mike Fährmann
+# Copyright 2018-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -71,7 +71,7 @@ class SmugmugAlbumExtractor(SmugmugExtractor):
def __init__(self, match):
SmugmugExtractor.__init__(self, match)
- self.album_id = match.group(1)
+ self.album_id = match[1]
def items(self):
album = self.api.album(self.album_id, "User")
@@ -98,7 +98,7 @@ class SmugmugImageExtractor(SmugmugExtractor):
def __init__(self, match):
SmugmugExtractor.__init__(self, match)
- self.image_id = match.group(3)
+ self.image_id = match[3]
def items(self):
image = self.api.image(self.image_id, "ImageSizeDetails")
@@ -197,7 +197,7 @@ class SmugmugAPI(oauth.OAuth1API):
return self._expansion(endpoint, "Node", params)
def _call(self, endpoint, params=None, domain=API_DOMAIN):
- url = "https://{}/api/v2/{}".format(domain, endpoint)
+ url = f"https://{domain}/api/v2/{endpoint}"
params = params or {}
if self.api_key:
params["APIKey"] = self.api_key
@@ -211,9 +211,9 @@ class SmugmugAPI(oauth.OAuth1API):
if data["Code"] == 404:
raise exception.NotFoundError()
if data["Code"] == 429:
- raise exception.StopExtraction("Rate limit reached")
+ raise exception.AbortExtraction("Rate limit reached")
self.log.debug(data)
- raise exception.StopExtraction("API request failed")
+ raise exception.AbortExtraction("API request failed")
def _expansion(self, endpoint, expands, params=None):
endpoint = self._extend(endpoint, expands)
@@ -234,14 +234,12 @@ class SmugmugAPI(oauth.OAuth1API):
return
params["start"] += params["count"]
- @staticmethod
- def _extend(endpoint, expands):
+ def _extend(self, endpoint, expands):
if expands:
endpoint += "?_expand=" + expands
return endpoint
- @staticmethod
- def _apply_expansions(data, expands):
+ def _apply_expansions(self, data, expands):
def unwrap(response):
locator = response["Locator"]
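The smugmug hunk above also shows the other systematic change in this patch: `exception.StopExtraction` becomes `exception.AbortExtraction`, and printf-style lazy arguments give way to pre-formatted f-strings. A minimal sketch of the two calling conventions (the class bodies are assumptions; only the names and call styles appear in this diff):

class StopExtraction(Exception):
    # old convention: message template plus arguments, formatted here
    def __init__(self, message=None, *args):
        Exception.__init__(self, message % args if args else message)

class AbortExtraction(Exception):
    # new convention: callers pass one fully formatted message
    pass

# before: raise StopExtraction("Invalid sort '%s'", sort)
# after : raise AbortExtraction(f"Invalid sort '{sort}'")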
diff --git a/gallery_dl/extractor/soundgasm.py b/gallery_dl/extractor/soundgasm.py
index 7c75aaa..79ab74d 100644
--- a/gallery_dl/extractor/soundgasm.py
+++ b/gallery_dl/extractor/soundgasm.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2022-2023 Mike Fährmann
+# Copyright 2022-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -58,7 +58,7 @@ class SoundgasmAudioExtractor(SoundgasmExtractor):
self.user, self.slug = match.groups()
def sounds(self):
- return ("{}/u/{}/{}".format(self.root, self.user, self.slug),)
+ return (f"{self.root}/u/{self.user}/{self.slug}",)
class SoundgasmUserExtractor(SoundgasmExtractor):
@@ -69,7 +69,7 @@ class SoundgasmUserExtractor(SoundgasmExtractor):
def __init__(self, match):
SoundgasmExtractor.__init__(self, match)
- self.user = match.group(1)
+ self.user = match[1]
def sounds(self):
page = self.request(self.root + "/user/" + self.user).text
diff --git a/gallery_dl/extractor/speakerdeck.py b/gallery_dl/extractor/speakerdeck.py
index 3210fd8..b809b7f 100644
--- a/gallery_dl/extractor/speakerdeck.py
+++ b/gallery_dl/extractor/speakerdeck.py
@@ -9,8 +9,7 @@
"""Extractors for https://speakerdeck.com/"""
from .common import GalleryExtractor
-from .. import text
-import re
+from .. import text, util
class SpeakerdeckPresentationExtractor(GalleryExtractor):
@@ -24,31 +23,30 @@ class SpeakerdeckPresentationExtractor(GalleryExtractor):
pattern = r"(?:https?://)?(?:www\.)?speakerdeck\.com/([^/?#]+)/([^/?#]+)"
example = "https://speakerdeck.com/USER/PRESENTATION"
- def __init__(self, match):
- GalleryExtractor.__init__(self, match, "")
- self.user, self.presentation = match.groups()
-
def metadata(self, _):
+ user, presentation = self.groups
+
url = self.root + "/oembed.json"
params = {
- "url": "{}/{}/{}".format(self.root, self.user, self.presentation),
+ "url": f"{self.root}/{user}/{presentation}",
}
- data = self.request(url, params=params).json()
+ data = self.request_json(url, params=params)
self.presentation_id = text.extr(
data["html"], 'src="//speakerdeck.com/player/', '"')
return {
- "user": self.user,
- "presentation": self.presentation,
+ "user": user,
+ "presentation": presentation,
"presentation_id": self.presentation_id,
"title": data["title"],
"author": data["author_name"],
}
def images(self, _):
- url = "{}/player/{}".format(self.root, self.presentation_id)
- page = re.sub(r"\s+", " ", self.request(url).text)
+ url = f"{self.root}/player/{self.presentation_id}"
+ page = self.request(url).text
+ page = util.re(r"\s+").sub(" ", page)
return [
(url, None)
for url in text.extract_iter(page, 'js-sd-slide" data-url="', '"')
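Likewise, `import re` plus `re.compile(...)` is replaced by `util.re(...)` throughout. Assuming `util.re` is a caching wrapper around `re.compile`, a minimal sketch:

import functools
import re as re_module

@functools.lru_cache(maxsize=None)
def re(pattern):
    # compile each pattern once; repeated calls reuse the cached object
    return re_module.compile(pattern)

# usage as in the speakerdeck hunk above:
page = re(r"\s+").sub(" ", "some   spaced\ttext")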
diff --git a/gallery_dl/extractor/steamgriddb.py b/gallery_dl/extractor/steamgriddb.py
index c120ee5..e17b9fd 100644
--- a/gallery_dl/extractor/steamgriddb.py
+++ b/gallery_dl/extractor/steamgriddb.py
@@ -72,9 +72,9 @@ class SteamgriddbExtractor(Extractor):
yield Message.Url, url, text.nameext_from_url(url, asset)
def _call(self, endpoint, **kwargs):
- data = self.request(self.root + endpoint, **kwargs).json()
+ data = self.request_json(self.root + endpoint, **kwargs)
if not data["success"]:
- raise exception.StopExtraction(data["error"])
+ raise exception.AbortExtraction(data["error"])
return data["data"]
@@ -83,11 +83,11 @@ class SteamgriddbAssetsExtractor(SteamgriddbExtractor):
def __init__(self, match):
SteamgriddbExtractor.__init__(self, match)
- list_type = match.group(1)
- id = int(match.group(2))
+ list_type = match[1]
+ id = int(match[2])
self.game_id = id if list_type == "game" else None
self.collection_id = id if list_type == "collection" else None
- self.page = int(match.group(3) or 1)
+ self.page = int(p) if (p := match[3]) else 1
def assets(self):
limit = 48
@@ -96,7 +96,7 @@ class SteamgriddbAssetsExtractor(SteamgriddbExtractor):
sort = self.config("sort", "score_desc")
if sort not in ("score_desc", "score_asc", "score_old_desc",
"score_old_asc", "age_desc", "age_asc"):
- raise exception.StopExtraction("Invalid sort '%s'", sort)
+ raise exception.AbortExtraction(f"Invalid sort '{sort}'")
json = {
"static" : self.config("static", True),
@@ -149,7 +149,7 @@ class SteamgriddbAssetsExtractor(SteamgriddbExtractor):
for i in value:
if i not in valid_values:
- raise exception.StopExtraction("Invalid %s '%s'", type_name, i)
+ raise exception.AbortExtraction(f"Invalid {type_name} '{i}'")
return value
@@ -162,15 +162,15 @@ class SteamgriddbAssetExtractor(SteamgriddbExtractor):
def __init__(self, match):
SteamgriddbExtractor.__init__(self, match)
- self.asset_type = match.group(1)
- self.asset_id = match.group(2)
+ self.asset_type = match[1]
+ self.asset_id = match[2]
def assets(self):
endpoint = "/api/public/asset/" + self.asset_type + "/" + self.asset_id
asset = self._call(endpoint)["asset"]
if asset is None:
- raise exception.NotFoundError("asset ({}:{})".format(
- self.asset_type, self.asset_id))
+ raise exception.NotFoundError(
+ f"asset ({self.asset_type}:{self.asset_id})")
return (asset,)
@@ -211,7 +211,7 @@ class SteamgriddbIconsExtractor(SteamgriddbAssetsExtractor):
asset_type = "icon"
pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/icons(?:/(\d+))?"
example = "https://www.steamgriddb.com/game/1234/icons"
- valid_dimensions = ["{0}x{0}".format(i) for i in (8, 10, 14, 16, 20, 24,
+ valid_dimensions = [f"{i}x{i}" for i in (8, 10, 14, 16, 20, 24,
28, 32, 35, 40, 48, 54, 56, 57, 60, 64, 72, 76, 80, 90,
96, 100, 114, 120, 128, 144, 150, 152, 160, 180, 192,
194, 256, 310, 512, 768, 1024)]
diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py
index a83f2da..989e6cc 100644
--- a/gallery_dl/extractor/subscribestar.py
+++ b/gallery_dl/extractor/subscribestar.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2020-2023 Mike Fährmann
+# Copyright 2020-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -11,7 +11,6 @@
from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache
-import re
BASE_PATTERN = r"(?:https?://)?(?:www\.)?subscribestar\.(com|adult)"
@@ -45,8 +44,7 @@ class SubscribestarExtractor(Extractor):
if "<html><body>" in content:
data["content"] = content = text.extr(
content, "<body>", "</body>")
- data["title"] = text.unescape(
- text.rextract(content, "<h1>", "</h1>")[0] or "")
+ data["title"] = text.unescape(text.rextr(content, "<h1>", "</h1>"))
yield Message.Directory, data
for num, item in enumerate(media, 1):
@@ -67,8 +65,8 @@ class SubscribestarExtractor(Extractor):
if response.history and (
"/verify_subscriber" in response.url or
"/age_confirmation_warning" in response.url):
- raise exception.StopExtraction(
- "HTTP redirect to %s", response.url)
+ raise exception.AbortExtraction(
+ f"HTTP redirect to {response.url}")
content = response.content
if len(content) < 250 and b">redirected<" in content:
@@ -117,11 +115,10 @@ class SubscribestarExtractor(Extractor):
}
def check_errors(response):
- errors = response.json().get("errors")
- if errors:
+ if errors := response.json().get("errors"):
self.log.debug(errors)
try:
- msg = '"{}"'.format(errors.popitem()[1])
+ msg = f'"{errors.popitem()[1]}"'
except Exception:
msg = None
raise exception.AuthenticationError(msg)
@@ -148,8 +145,7 @@ class SubscribestarExtractor(Extractor):
def _media_from_post(self, html):
media = []
- gallery = text.extr(html, 'data-gallery="', '"')
- if gallery:
+ if gallery := text.extr(html, 'data-gallery="', '"'):
for item in util.json_loads(text.unescape(gallery)):
if "/previews" in item["url"]:
self._warn_preview()
@@ -159,8 +155,8 @@ class SubscribestarExtractor(Extractor):
attachments = text.extr(
html, 'class="uploads-docs"', 'class="post-edit_form"')
if attachments:
- for att in re.split(
- r'class="doc_preview[" ]', attachments)[1:]:
+ for att in util.re(r'class="doc_preview[" ]').split(
+ attachments)[1:]:
media.append({
"id" : text.parse_int(text.extr(
att, 'data-upload-id="', '"')),
@@ -173,8 +169,8 @@ class SubscribestarExtractor(Extractor):
audios = text.extr(
html, 'class="uploads-audios"', 'class="post-edit_form"')
if audios:
- for audio in re.split(
- r'class="audio_preview-data[" ]', audios)[1:]:
+ for audio in util.re(r'class="audio_preview-data[" ]').split(
+ audios)[1:]:
media.append({
"id" : text.parse_int(text.extr(
audio, 'data-upload-id="', '"')),
@@ -224,7 +220,7 @@ class SubscribestarUserExtractor(SubscribestarExtractor):
def posts(self):
needle_next_page = 'data-role="infinite_scroll-next_page" href="'
- page = self.request("{}/{}".format(self.root, self.item)).text
+ page = self.request(f"{self.root}/{self.item}").text
while True:
posts = page.split('<div class="post ')[1:]
@@ -235,7 +231,7 @@ class SubscribestarUserExtractor(SubscribestarExtractor):
url = text.extr(posts[-1], needle_next_page, '"')
if not url:
return
- page = self.request(self.root + text.unescape(url)).json()["html"]
+ page = self.request_json(self.root + text.unescape(url))["html"]
class SubscribestarPostExtractor(SubscribestarExtractor):
@@ -245,7 +241,7 @@ class SubscribestarPostExtractor(SubscribestarExtractor):
example = "https://www.subscribestar.com/posts/12345"
def posts(self):
- url = "{}/posts/{}".format(self.root, self.item)
+ url = f"{self.root}/posts/{self.item}"
return (self.request(url).text,)
def _data_from_post(self, html):
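Collapsing `text.rextract(content, "<h1>", "</h1>")[0] or ""` into `text.rextr(content, "<h1>", "</h1>")` suggests `rextr` returns the last match's extracted text directly, defaulting to an empty string. A sketch under that assumption:

def rextract(txt, begin, end):
    # search from the right; return (value, position), (None, -1) on failure
    try:
        pos = txt.rindex(begin) + len(begin)
        return txt[pos:txt.index(end, pos)], pos
    except ValueError:
        return None, -1

def rextr(txt, begin, end, default=""):
    # convenience form: just the value, with a default instead of None
    return rextract(txt, begin, end)[0] or default

assert rextr("<h1>a</h1><h1>title</h1>", "<h1>", "</h1>") == "title"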
diff --git a/gallery_dl/extractor/szurubooru.py b/gallery_dl/extractor/szurubooru.py
index 1713509..190ccbf 100644
--- a/gallery_dl/extractor/szurubooru.py
+++ b/gallery_dl/extractor/szurubooru.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2023 Mike Fährmann
+# Copyright 2023-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -26,17 +26,15 @@ class SzurubooruExtractor(booru.BooruExtractor):
"Content-Type": "application/json",
}
- username = self.config("username")
- if username:
- token = self.config("token")
- if token:
+ if username := self.config("username"):
+ if token := self.config("token"):
value = username + ":" + token
self.headers["Authorization"] = "Token " + \
binascii.b2a_base64(value.encode())[:-1].decode()
def _api_request(self, endpoint, params=None):
url = self.root + "/api" + endpoint
- return self.request(url, headers=self.headers, params=params).json()
+ return self.request_json(url, headers=self.headers, params=params)
def _pagination(self, endpoint, params):
params["offset"] = 0
@@ -58,20 +56,17 @@ class SzurubooruExtractor(booru.BooruExtractor):
url = self.root + "/" + url
return url
- @staticmethod
- def _prepare(post):
+ def _prepare(self, post):
post["date"] = text.parse_datetime(
post["creationTime"], "%Y-%m-%dT%H:%M:%S.%fZ")
tags = []
- append = tags.append
tags_categories = collections.defaultdict(list)
-
for tag in post["tags"]:
tag_type = tag["category"].rpartition("_")[2]
tag_name = tag["names"][0]
tags_categories[tag_type].append(tag_name)
- append(tag_name)
+ tags.append(tag_name)
post["tags"] = tags
for category, tags in tags_categories.items():
diff --git a/gallery_dl/extractor/tapas.py b/gallery_dl/extractor/tapas.py
index 35a346d..d823f6a 100644
--- a/gallery_dl/extractor/tapas.py
+++ b/gallery_dl/extractor/tapas.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2021-2023 Mike Fährmann
+# Copyright 2021-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -67,7 +67,7 @@ class TapasExtractor(Extractor):
def request_api(self, url, params=None):
headers = {"Accept": "application/json, text/javascript, */*;"}
- return self.request(url, params=params, headers=headers).json()["data"]
+ return self.request_json(url, params=params, headers=headers)["data"]
class TapasEpisodeExtractor(TapasExtractor):
@@ -79,14 +79,13 @@ class TapasEpisodeExtractor(TapasExtractor):
self.login()
episode_id = self.groups[0]
- url = "{}/episode/{}".format(self.root, episode_id)
+ url = f"{self.root}/episode/{episode_id}"
data = self.request_api(url)
episode = data["episode"]
if not episode.get("free") and not episode.get("unlocked"):
raise exception.AuthorizationError(
- "{}: Episode '{}' not unlocked".format(
- episode_id, episode["title"]))
+ f"{episode_id}: Episode '{episode['title']}' not unlocked")
html = data["html"]
episode["series"] = self._extract_series(html)
@@ -106,11 +105,11 @@ class TapasEpisodeExtractor(TapasExtractor):
yield Message.Url, url, text.nameext_from_url(url, episode)
def _extract_series(self, html):
- series_id = text.rextract(html, 'data-series-id="', '"')[0]
+ series_id = text.rextr(html, 'data-series-id="', '"')
try:
return self._cache[series_id]
except KeyError:
- url = "{}/series/{}".format(self.root, series_id)
+ url = f"{self.root}/series/{series_id}"
series = self._cache[series_id] = self.request_api(url)
return series
@@ -123,12 +122,12 @@ class TapasSeriesExtractor(TapasExtractor):
def items(self):
self.login()
- url = "{}/series/{}".format(self.root, self.groups[0])
+ url = f"{self.root}/series/{self.groups[0]}"
series_id, _, episode_id = text.extr(
self.request(url).text, 'content="tapastic://series/', '"',
).partition("/episodes/")
- url = "{}/series/{}/episodes".format(self.root, series_id)
+ url = f"{self.root}/series/{series_id}/episodes"
params = {
"eid" : episode_id,
"page" : 1,
@@ -157,7 +156,7 @@ class TapasCreatorExtractor(TapasExtractor):
def items(self):
self.login()
- url = "{}/{}/series".format(self.root, self.groups[0])
+ url = f"{self.root}/{self.groups[0]}/series"
page = self.request(url).text
page = text.extr(page, '<ul class="content-list-wrap', "</ul>")
diff --git a/gallery_dl/extractor/tcbscans.py b/gallery_dl/extractor/tcbscans.py
index 71431ad..6dcb153 100644
--- a/gallery_dl/extractor/tcbscans.py
+++ b/gallery_dl/extractor/tcbscans.py
@@ -19,7 +19,7 @@ class TcbscansChapterExtractor(ChapterExtractor):
example = "https://tcbscans.me/chapters/12345/MANGA-chapter-123"
def __init__(self, match):
- self.root = text.root_from_url(match.group(0))
+ self.root = text.root_from_url(match[0])
ChapterExtractor.__init__(self, match)
def images(self, page):
@@ -48,7 +48,7 @@ class TcbscansMangaExtractor(MangaExtractor):
example = "https://tcbscans.me/mangas/123/MANGA"
def __init__(self, match):
- self.root = text.root_from_url(match.group(0))
+ self.root = text.root_from_url(match[0])
MangaExtractor.__init__(self, match)
def chapters(self, page):
diff --git a/gallery_dl/extractor/telegraph.py b/gallery_dl/extractor/telegraph.py
index 468840b..2713621 100644
--- a/gallery_dl/extractor/telegraph.py
+++ b/gallery_dl/extractor/telegraph.py
@@ -44,7 +44,7 @@ class TelegraphGalleryExtractor(GalleryExtractor):
num_zeroes = len(str(len(figures)))
num = 0
- result = []
+ results = []
for figure in figures:
url, pos = text.extract(figure, 'src="', '"')
if url.startswith("/embed/"):
@@ -54,10 +54,10 @@ class TelegraphGalleryExtractor(GalleryExtractor):
caption, pos = text.extract(figure, "<figcaption>", "<", pos)
num += 1
- result.append((url, {
+ results.append((url, {
"url" : url,
"caption" : text.unescape(caption) if caption else "",
"num" : num,
"num_formatted": str(num).zfill(num_zeroes),
}))
- return result
+ return results
diff --git a/gallery_dl/extractor/tenor.py b/gallery_dl/extractor/tenor.py
index 7273eac..7e1f802 100644
--- a/gallery_dl/extractor/tenor.py
+++ b/gallery_dl/extractor/tenor.py
@@ -32,22 +32,22 @@ class TenorExtractor(Extractor):
self.formats = formats
def items(self):
- meta = self.metadata()
-
for gif in self.gifs():
- fmt = self._extract_format(gif)
- if not fmt:
+
+ if not (fmt := self._extract_format(gif)):
self.log.warning("%s: Selected format(s) not available",
gif.get("id"))
continue
url = fmt["url"]
+ gif["id_format"] = url.rsplit("/", 2)[1]
+ gif["format"] = fmt["name"]
gif["width"], gif["height"] = fmt["dims"]
+ gif["duration"] = fmt["duration"]
+ gif["size"] = fmt["size"]
gif["title"] = gif["h1_title"][:-4]
gif["description"] = gif.pop("content_description", "")
gif["date"] = text.parse_timestamp(gif["created"])
- if meta:
- gif.update(meta)
yield Message.Directory, gif
yield Message.Url, url, text.nameext_from_url(url, gif)
@@ -56,7 +56,9 @@ class TenorExtractor(Extractor):
media_formats = gif["media_formats"]
for fmt in self.formats:
if fmt in media_formats:
- return media_formats[fmt]
+ media = media_formats[fmt]
+ media["name"] = fmt
+ return media
def _search_results(self, query):
url = "https://tenor.googleapis.com/v2/search"
@@ -91,7 +93,7 @@ class TenorExtractor(Extractor):
}
while True:
- data = self.request(url, params=params, headers=headers).json()
+ data = self.request_json(url, params=params, headers=headers)
yield from data["results"]
@@ -112,7 +114,7 @@ class TenorImageExtractor(TenorExtractor):
example = "https://tenor.com/view/SLUG-1234567890"
def gifs(self):
- url = "{}/view/{}".format(self.root, self.groups[0])
+ url = f"{self.root}/view/{self.groups[0]}"
page = self.request(url).text
pos = page.index('id="store-cache"')
data = util.json_loads(text.extract(page, ">", "</script>", pos)[0])
@@ -125,17 +127,14 @@ class TenorSearchExtractor(TenorExtractor):
pattern = BASE_PATTERN + r"search/([^/?#]+)"
example = "https://tenor.com/search/QUERY"
- def metadata(self):
+ def gifs(self):
query = text.unquote(self.groups[0])
rest, _, last = query.rpartition("-")
if last == "gifs":
query = rest
- self.search_tags = query.replace("-", " ")
-
- return {"search_tags": self.search_tags}
+ self.kwdict["search_tags"] = search_tags = query.replace("-", " ")
- def gifs(self):
- return self._search_results(self.search_tags)
+ return self._search_results(search_tags)
class TenorUserExtractor(TenorExtractor):
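In the tenor hunk, `gif["id_format"] = url.rsplit("/", 2)[1]` takes the second-to-last path segment of the selected format's URL. A worked example with a hypothetical URL of that shape (the exact tenor CDN layout is an assumption):

url = "https://media.tenor.com/AbCdEfGh123/tenor.gif"
# rsplit("/", 2) -> ['https://media.tenor.com', 'AbCdEfGh123', 'tenor.gif']
assert url.rsplit("/", 2)[1] == "AbCdEfGh123"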
diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py
index b9783c4..973bd22 100644
--- a/gallery_dl/extractor/tiktok.py
+++ b/gallery_dl/extractor/tiktok.py
@@ -52,7 +52,7 @@ class TiktokExtractor(Extractor):
if "imagePost" in post:
if not original_title:
- title = "TikTok photo #{}".format(post["id"])
+ title = f"TikTok photo #{post['id']}"
img_list = post["imagePost"]["images"]
for i, img in enumerate(img_list, 1):
url = img["imageURL"]["urlList"][0]
@@ -83,7 +83,7 @@ class TiktokExtractor(Extractor):
if ytdl_media:
if not original_title:
- title = "TikTok {} #{}".format(ytdl_media, post["id"])
+ title = f"TikTok {ytdl_media} #{post['id']}"
post.update({
"type" : ytdl_media,
"image" : None,
@@ -104,7 +104,12 @@ class TiktokExtractor(Extractor):
tries = 0
while True:
try:
- html = self.request(url).text
+ response = self.request(url)
+ if response.history and "/login" in response.url:
+ raise exception.AuthorizationError(
+ "HTTP redirect to login page "
+ f"('{response.url.partition('?')[0]}')")
+ html = response.text
data = text.extr(
html, '<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" '
'type="application/json">', '</script>')
@@ -128,7 +133,7 @@ class TiktokExtractor(Extractor):
post.update({
"type" : "audio",
"image" : None,
- "title" : post["desc"] or "TikTok audio #{}".format(post["id"]),
+ "title" : post["desc"] or f"TikTok audio #{post['id']}",
"duration" : audio.get("duration"),
"num" : 0,
"img_id" : "",
@@ -167,7 +172,7 @@ class TiktokPostExtractor(TiktokExtractor):
def urls(self):
user, post_id = self.groups
- url = "{}/@{}/video/{}".format(self.root, user or "", post_id)
+ url = f"{self.root}/@{user or ''}/video/{post_id}"
return (url,)
@@ -241,11 +246,17 @@ class TiktokUserExtractor(TiktokExtractor):
set_cookie(cookie)
user_name = self.groups[0]
- profile_url = "{}/@{}".format(self.root, user_name)
+ profile_url = f"{self.root}/@{user_name}"
if self.avatar:
- avatar_url, avatar = self._generate_avatar(user_name, profile_url)
- yield Message.Directory, avatar
- yield Message.Url, avatar_url, avatar
+ try:
+ avatar_url, avatar = self._generate_avatar(
+ user_name, profile_url)
+ except Exception as exc:
+ self.log.warning("Unable to extract 'avatar' URL (%s: %s)",
+ exc.__class__.__name__, exc)
+ else:
+ yield Message.Directory, avatar
+ yield Message.Url, avatar_url, avatar
with ytdl_instance as ydl:
info_dict = ydl._YoutubeDL__extract_info(
diff --git a/gallery_dl/extractor/tmohentai.py b/gallery_dl/extractor/tmohentai.py
index 9c29727..ef441d3 100644
--- a/gallery_dl/extractor/tmohentai.py
+++ b/gallery_dl/extractor/tmohentai.py
@@ -20,15 +20,14 @@ class TmohentaiGalleryExtractor(GalleryExtractor):
example = "https://tmohentai.com/contents/12345a67b89c0"
def __init__(self, match):
- self.gallery_id = match.group(1)
- url = "{}/contents/{}".format(self.root, self.gallery_id)
+ self.gallery_id = match[1]
+ url = f"{self.root}/contents/{self.gallery_id}"
GalleryExtractor.__init__(self, match, url)
def images(self, page):
- fmt = "https://imgrojo.tmohentai.com/contents/{}/{{:>03}}.webp".format(
- self.gallery_id).format
+ base = f"https://imgrojo.tmohentai.com/contents/{self.gallery_id}/"
cnt = page.count('class="lanzador')
- return [(fmt(i), None) for i in range(0, cnt)]
+ return [(f"{base}{i:>03}.webp", None) for i in range(0, cnt)]
def metadata(self, page):
extr = text.extract_from(page)
diff --git a/gallery_dl/extractor/toyhouse.py b/gallery_dl/extractor/toyhouse.py
index cee0d9d..7add79a 100644
--- a/gallery_dl/extractor/toyhouse.py
+++ b/gallery_dl/extractor/toyhouse.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2022-2023 Mike Fährmann
+# Copyright 2022-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -23,7 +23,7 @@ class ToyhouseExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.user = match.group(1)
+ self.user = match[1]
self.offset = 0
def items(self):
@@ -108,7 +108,7 @@ class ToyhouseArtExtractor(ToyhouseExtractor):
example = "https://www.toyhou.se/USER/art"
def posts(self):
- return self._pagination("/{}/art".format(self.user))
+ return self._pagination(f"/{self.user}/art")
def metadata(self):
return {"user": self.user}
@@ -124,6 +124,6 @@ class ToyhouseImageExtractor(ToyhouseExtractor):
example = "https://toyhou.se/~images/12345"
def posts(self):
- url = "{}/~images/{}".format(self.root, self.user)
+ url = f"{self.root}/~images/{self.user}"
return (self._parse_post(
self.request(url).text, '<img class="mw-100" src="'),)
diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py
index b196aeb..8732c60 100644
--- a/gallery_dl/extractor/tsumino.py
+++ b/gallery_dl/extractor/tsumino.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -30,7 +30,7 @@ class TsuminoBase():
@cache(maxage=14*86400, keyarg=1)
def _login_impl(self, username, password):
self.log.info("Logging in as %s", username)
- url = "{}/Account/Login".format(self.root)
+ url = f"{self.root}/Account/Login"
headers = {"Referer": url}
data = {"Username": username, "Password": password}
@@ -47,8 +47,8 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
example = "https://www.tsumino.com/entry/12345"
def __init__(self, match):
- self.gallery_id = match.group(1)
- url = "{}/entry/{}".format(self.root, self.gallery_id)
+ self.gallery_id = match[1]
+ url = f"{self.root}/entry/{self.gallery_id}"
GalleryExtractor.__init__(self, match, url)
def metadata(self, page):
@@ -81,14 +81,14 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
}
def images(self, page):
- url = "{}/Read/Index/{}?page=1".format(self.root, self.gallery_id)
- headers = {"Referer": self.gallery_url}
+ url = f"{self.root}/Read/Index/{self.gallery_id}?page=1"
+ headers = {"Referer": self.page_url}
response = self.request(url, headers=headers, fatal=False)
if "/Auth/" in response.url:
- raise exception.StopExtraction(
- "Failed to get gallery JSON data. Visit '%s' in a browser "
- "and solve the CAPTCHA to continue.", response.url)
+ raise exception.AbortExtraction(
+ f"Failed to get gallery JSON data. Visit '{response.url}' "
+ f"in a browser and solve the CAPTCHA to continue.")
page = response.text
tpl, pos = text.extract(page, 'data-cdn="', '"')
@@ -109,19 +109,19 @@ class TsuminoSearchExtractor(TsuminoBase, Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.query = match.group(1)
+ self.query = match[1]
def items(self):
for gallery in self.galleries():
- url = "{}/entry/{}".format(self.root, gallery["id"])
+ url = f"{self.root}/entry/{gallery['id']}"
gallery["_extractor"] = TsuminoGalleryExtractor
yield Message.Queue, url, gallery
def galleries(self):
"""Return all gallery results matching 'self.query'"""
- url = "{}/Search/Operate?type=Book".format(self.root)
+ url = f"{self.root}/Search/Operate?type=Book"
headers = {
- "Referer": "{}/".format(self.root),
+ "Referer": f"{self.root}/",
"X-Requested-With": "XMLHttpRequest",
}
data = {
@@ -137,8 +137,8 @@ class TsuminoSearchExtractor(TsuminoBase, Extractor):
data.update(self._parse(self.query))
while True:
- info = self.request(
- url, method="POST", headers=headers, data=data).json()
+ info = self.request_json(
+ url, method="POST", headers=headers, data=data)
for gallery in info["data"]:
yield gallery["entry"]
@@ -155,11 +155,10 @@ class TsuminoSearchExtractor(TsuminoBase, Extractor):
return self._parse_simple(query)
return self._parse_jsurl(query)
except Exception as exc:
- raise exception.StopExtraction(
- "Invalid search query '%s' (%s)", query, exc)
+ raise exception.AbortExtraction(
+ f"Invalid search query '{query}' ({exc})")
- @staticmethod
- def _parse_simple(query):
+ def _parse_simple(self, query):
"""Parse search query with format '?<key>=value>'"""
key, _, value = query.partition("=")
tag_types = {
@@ -179,8 +178,7 @@ class TsuminoSearchExtractor(TsuminoBase, Extractor):
"Tags[0][Exclude]": "false",
}
- @staticmethod
- def _parse_jsurl(data):
+ def _parse_jsurl(self, data):
"""Parse search query in JSURL format
Nested lists and dicts are handled in a special way to deal
@@ -196,9 +194,8 @@ class TsuminoSearchExtractor(TsuminoBase, Extractor):
nonlocal i
if data[i] != expected:
- error = "bad JSURL syntax: expected '{}', got {}".format(
- expected, data[i])
- raise ValueError(error)
+ raise ValueError(
+ f"bad JSURL syntax: expected '{expected}', got {data[i]}")
i += 1
def decode():
@@ -295,11 +292,11 @@ class TsuminoSearchExtractor(TsuminoBase, Extractor):
def expand(key, value):
if isinstance(value, list):
for index, cvalue in enumerate(value):
- ckey = "{}[{}]".format(key, index)
+ ckey = f"{key}[{index}]"
yield from expand(ckey, cvalue)
elif isinstance(value, dict):
for ckey, cvalue in value.items():
- ckey = "{}[{}]".format(key, ckey)
+ ckey = f"{key}[{ckey}]"
yield from expand(ckey, cvalue)
else:
yield key, value
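The `expand()` helper above flattens nested search parameters into the bracketed form keys the Tsumino endpoint expects. The same generator, standalone, with a sample input:

def expand(key, value):
    # recursively yield (key, value) pairs, indexing into lists and dicts
    if isinstance(value, list):
        for index, cvalue in enumerate(value):
            yield from expand(f"{key}[{index}]", cvalue)
    elif isinstance(value, dict):
        for ckey, cvalue in value.items():
            yield from expand(f"{key}[{ckey}]", cvalue)
    else:
        yield key, value

print(dict(expand("Tags", [{"Type": 1, "Text": "tag"}])))
# {'Tags[0][Type]': 1, 'Tags[0][Text]': 'tag'}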
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index a2cce83..d9f1ea2 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2023 Mike Fährmann
+# Copyright 2016-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -11,7 +11,6 @@
from .common import Extractor, Message
from .. import text, util, oauth, exception
from datetime import datetime, date, timedelta
-import re
BASE_PATTERN = (
@@ -35,11 +34,10 @@ class TumblrExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- name = match.group(2)
- if name:
+ if name := match[2]:
self.blog = name + ".tumblr.com"
else:
- self.blog = match.group(1) or match.group(3)
+ self.blog = match[1] or match[3]
def _init(self):
self.api = TumblrAPI(self)
@@ -66,16 +64,16 @@ class TumblrExtractor(Extractor):
blog = None
# pre-compile regular expressions
- self._sub_video = re.compile(
+ self._sub_video = util.re(
r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)").sub
if self.inline:
- self._sub_image = re.compile(
+ self._sub_image = util.re(
r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)").sub
- self._subn_orig_image = re.compile(r"/s\d+x\d+/").subn
- _findall_image = re.compile('<img src="([^"]+)"').findall
- _findall_video = re.compile('<source src="([^"]+)"').findall
+ self._subn_orig_image = util.re(r"/s\d+x\d+/").subn
+ _findall_image = util.re('<img src="([^"]+)"').findall
+ _findall_video = util.re('<source src="([^"]+)"').findall
for post in self.posts():
if self.date_min > post["timestamp"]:
@@ -140,8 +138,7 @@ class TumblrExtractor(Extractor):
if url and url.startswith("https://a.tumblr.com/"):
posts.append(self._prepare(url, post.copy()))
- url = post.get("video_url") # type "video"
- if url:
+ if url := post.get("video_url"): # type "video"
posts.append(self._prepare(
self._original_video(url), post.copy()))
@@ -161,8 +158,7 @@ class TumblrExtractor(Extractor):
posts.append(self._prepare(url, post.copy()))
if self.external: # external links
- url = post.get("permalink_url") or post.get("url")
- if url:
+ if url := post.get("permalink_url") or post.get("url"):
post["extension"] = None
posts.append((Message.Queue, url, post.copy()))
del post["extension"]
@@ -192,21 +188,18 @@ class TumblrExtractor(Extractor):
types = types.split(",")
types = frozenset(types)
- invalid = types - POST_TYPES
- if invalid:
+ if invalid := types - POST_TYPES:
types = types & POST_TYPES
self.log.warning("Invalid post types: '%s'",
"', '".join(sorted(invalid)))
return types
- @staticmethod
- def _prepare(url, post):
+ def _prepare(self, url, post):
text.nameext_from_url(url, post)
post["hash"] = post["filename"].partition("_")[2]
return Message.Url, url, post
- @staticmethod
- def _prepare_image(url, post):
+ def _prepare_image(self, url, post):
text.nameext_from_url(url, post)
# try ".gifv" (#3095)
@@ -227,8 +220,7 @@ class TumblrExtractor(Extractor):
return Message.Url, url, post
- @staticmethod
- def _prepare_avatar(url, post, blog):
+ def _prepare_avatar(self, url, post, blog):
text.nameext_from_url(url, post)
post["num"] = post["count"] = 1
post["blog"] = blog
@@ -292,15 +284,14 @@ class TumblrPostExtractor(TumblrExtractor):
def __init__(self, match):
TumblrExtractor.__init__(self, match)
- self.post_id = match.group(4)
+ self.post_id = match[4]
self.reblogs = True
self.date_min = 0
def posts(self):
return self.api.posts(self.blog, {"id": self.post_id})
- @staticmethod
- def _setup_posttypes():
+ def _setup_posttypes(self):
return POST_TYPES
@@ -312,7 +303,7 @@ class TumblrTagExtractor(TumblrExtractor):
def __init__(self, match):
TumblrExtractor.__init__(self, match)
- self.tag = text.unquote(match.group(4).replace("-", " "))
+ self.tag = text.unquote(match[4].replace("-", " "))
def posts(self):
return self.api.posts(self.blog, {"tag": self.tag})
@@ -326,7 +317,7 @@ class TumblrDayExtractor(TumblrExtractor):
def __init__(self, match):
TumblrExtractor.__init__(self, match)
- year, month, day = match.group(4).split("/")
+ year, month, day = match[4].split("/")
self.ordinal = date(int(year), int(month), int(day)).toordinal()
def _init(self):
@@ -386,7 +377,7 @@ class TumblrAPI(oauth.OAuth1API):
try:
return self.BLOG_CACHE[blog]
except KeyError:
- endpoint = "/v2/blog/{}/info".format(blog)
+ endpoint = f"/v2/blog/{blog}/info"
params = {"api_key": self.api_key} if self.api_key else None
self.BLOG_CACHE[blog] = blog = self._call(endpoint, params)["blog"]
return blog
@@ -394,9 +385,9 @@ class TumblrAPI(oauth.OAuth1API):
def avatar(self, blog, size="512"):
"""Retrieve a blog avatar"""
if self.api_key:
- return "{}/v2/blog/{}/avatar/{}?api_key={}".format(
- self.ROOT, blog, size, self.api_key)
- endpoint = "/v2/blog/{}/avatar".format(blog)
+ return (f"{self.ROOT}/v2/blog/{blog}/avatar/{size}"
+ f"?api_key={self.api_key}")
+ endpoint = f"/v2/blog/{blog}/avatar"
params = {"size": size}
return self._call(
endpoint, params, allow_redirects=False)["avatar_url"]
@@ -412,12 +403,12 @@ class TumblrAPI(oauth.OAuth1API):
if self.before and params["offset"]:
self.log.warning("'offset' and 'date-max' cannot be used together")
- endpoint = "/v2/blog/{}/posts".format(blog)
+ endpoint = f"/v2/blog/{blog}/posts"
return self._pagination(endpoint, params, blog=blog, cache=True)
def likes(self, blog):
"""Retrieve liked posts"""
- endpoint = "/v2/blog/{}/likes".format(blog)
+ endpoint = f"/v2/blog/{blog}/likes"
params = {"limit": "50", "before": self.before}
if self.api_key:
params["api_key"] = self.api_key
@@ -504,18 +495,17 @@ class TumblrAPI(oauth.OAuth1API):
continue
t = (datetime.now() + timedelta(0, float(reset))).time()
- raise exception.StopExtraction(
- "Aborting - Rate limit will reset at %s",
- "{:02}:{:02}:{:02}".format(t.hour, t.minute, t.second))
+ raise exception.AbortExtraction(
+ f"Aborting - Rate limit will reset at "
+ f"{t.hour:02}:{t.minute:02}:{t.second:02}")
# hourly rate limit
- reset = response.headers.get("x-ratelimit-perhour-reset")
- if reset:
+ if reset := response.headers.get("x-ratelimit-perhour-reset"):
self.log.info("Hourly API rate limit exceeded")
self.extractor.wait(seconds=reset)
continue
- raise exception.StopExtraction(data)
+ raise exception.AbortExtraction(data)
def _pagination(self, endpoint, params,
blog=None, key="posts", cache=False):
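The tumblr extractor stores the bound methods of compiled patterns (`.sub`, `.subn`, `.findall`) rather than the pattern objects, so hot loops call them without an attribute lookup. An illustration of that pattern with the video regex from the hunk above (the replacement template is an assumption about how the original-resolution URL is derived):

import re

_sub_video = re.compile(
    r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
    r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)").sub

print(_sub_video(r"https://\1.\2", "https://vt.tumblr.com/tumblr_abc_480.mp4"))
# https://vt.tumblr.com/tumblr_abc.mp4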
diff --git a/gallery_dl/extractor/tumblrgallery.py b/gallery_dl/extractor/tumblrgallery.py
index 448625e..26868ec 100644
--- a/gallery_dl/extractor/tumblrgallery.py
+++ b/gallery_dl/extractor/tumblrgallery.py
@@ -20,13 +20,11 @@ class TumblrgalleryExtractor(GalleryExtractor):
root = "https://tumblrgallery.xyz"
referer = False
- @staticmethod
- def _urls_from_page(page):
+ def _urls_from_page(self, page):
return text.extract_iter(
page, '<div class="report"> <a class="xx-co-me" href="', '"')
- @staticmethod
- def _data_from_url(url):
+ def _data_from_url(self, url):
filename = text.nameext_from_url(url)["filename"]
parts = filename.split("_")
try:
@@ -43,7 +41,7 @@ class TumblrgalleryTumblrblogExtractor(TumblrgalleryExtractor):
def __init__(self, match):
TumblrgalleryExtractor.__init__(self, match)
- self.gallery_id = text.parse_int(match.group(2))
+ self.gallery_id = text.parse_int(match[2])
def metadata(self, page):
return {
@@ -52,10 +50,11 @@ class TumblrgalleryTumblrblogExtractor(TumblrgalleryExtractor):
}
def images(self, _):
- page_num = 1
+ base = f"{self.root}/tumblrblog/gallery/{self.gallery_id}/"
+ pnum = 1
+
while True:
- url = "{}/tumblrblog/gallery/{}/{}.html".format(
- self.root, self.gallery_id, page_num)
+ url = f"{base}{pnum}.html"
response = self.request(url, allow_redirects=False, fatal=False)
if response.status_code >= 300:
@@ -63,7 +62,7 @@ class TumblrgalleryTumblrblogExtractor(TumblrgalleryExtractor):
for url in self._urls_from_page(response.text):
yield url, self._data_from_url(url)
- page_num += 1
+ pnum += 1
class TumblrgalleryPostExtractor(TumblrgalleryExtractor):
@@ -74,7 +73,7 @@ class TumblrgalleryPostExtractor(TumblrgalleryExtractor):
def __init__(self, match):
TumblrgalleryExtractor.__init__(self, match)
- self.gallery_id = text.parse_int(match.group(2))
+ self.gallery_id = text.parse_int(match[2])
def metadata(self, page):
return {
@@ -99,7 +98,7 @@ class TumblrgallerySearchExtractor(TumblrgalleryExtractor):
def __init__(self, match):
TumblrgalleryExtractor.__init__(self, match)
- self.search_term = match.group(2)
+ self.search_term = match[2]
def metadata(self, page):
return {
@@ -114,7 +113,7 @@ class TumblrgallerySearchExtractor(TumblrgalleryExtractor):
for gallery_id in text.extract_iter(
page, '<div class="title"><a href="post/', '.html'):
- url = "{}/post/{}.html".format(self.root, gallery_id)
+ url = f"{self.root}/post/{gallery_id}.html"
post_page = self.request(url).text
for url in self._urls_from_page(post_page):
diff --git a/gallery_dl/extractor/twibooru.py b/gallery_dl/extractor/twibooru.py
index 3b0ea36..4f9fe84 100644
--- a/gallery_dl/extractor/twibooru.py
+++ b/gallery_dl/extractor/twibooru.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2022-2023 Mike Fährmann
+# Copyright 2022-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -36,8 +36,7 @@ class TwibooruExtractor(BooruExtractor):
return post["view_url"].rpartition(".")[0] + ".svg"
return post["view_url"]
- @staticmethod
- def _prepare(post):
+ def _prepare(self, post):
post["date"] = text.parse_datetime(
post["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ")
@@ -55,7 +54,7 @@ class TwibooruPostExtractor(TwibooruExtractor):
def __init__(self, match):
TwibooruExtractor.__init__(self, match)
- self.post_id = match.group(1)
+ self.post_id = match[1]
def posts(self):
return (self.api.post(self.post_id),)
@@ -104,7 +103,7 @@ class TwibooruGalleryExtractor(TwibooruExtractor):
def __init__(self, match):
TwibooruExtractor.__init__(self, match)
- self.gallery_id = match.group(1)
+ self.gallery_id = match[1]
def metadata(self):
return {"gallery": self.api.gallery(self.gallery_id)}
@@ -155,18 +154,15 @@ class TwibooruAPI():
# error
self.extractor.log.debug(response.content)
- raise exception.StopExtraction(
- "%s %s", response.status_code, response.reason)
+ raise exception.HttpError("", response)
def _pagination(self, endpoint, params):
extr = self.extractor
- api_key = extr.config("api-key")
- if api_key:
+ if api_key := extr.config("api-key"):
params["key"] = api_key
- filter_id = extr.config("filter")
- if filter_id:
+ if filter_id := extr.config("filter"):
params["filter_id"] = filter_id
elif not api_key:
params["filter_id"] = "2"
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 896bf28..7252d05 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2023 Mike Fährmann
+# Copyright 2016-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,12 +8,11 @@
"""Extractors for https://x.com/"""
-from .common import Extractor, Message
+from .common import Extractor, Message, Dispatch
from .. import text, util, exception
from ..cache import cache, memcache
import itertools
import random
-import re
BASE_PATTERN = (r"(?:https?://)?(?:www\.|mobile\.)?"
r"(?:(?:[fv]x)?twitter|(?:fix(?:up|v))?x)\.com")
@@ -32,7 +31,7 @@ class TwitterExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.user = match.group(1)
+ self.user = match[1]
def _init(self):
self.unavailable = self.config("unavailable", False)
@@ -72,21 +71,16 @@ class TwitterExtractor(Extractor):
self.login()
self.api = TwitterAPI(self)
metadata = self.metadata()
-
- if self.config("expand"):
- tweets = self._expand_tweets(self.tweets())
- self.tweets = lambda : tweets
-
- if self.config("unique", True):
- seen_tweets = set()
- else:
- seen_tweets = None
+ seen_tweets = set() if self.config("unique", True) else None
if self.twitpic:
- self._find_twitpic = re.compile(
+ self._find_twitpic = util.re(
r"https?(://twitpic\.com/(?!photos/)\w+)").findall
- for tweet in self.tweets():
+ tweets = self.tweets()
+ if self.config("expand"):
+ tweets = self._expand_tweets(tweets)
+ for tweet in tweets:
if "legacy" in tweet:
data = tweet["legacy"]
@@ -129,6 +123,12 @@ class TwitterExtractor(Extractor):
tdata.update(metadata)
tdata["count"] = len(files)
yield Message.Directory, tdata
+
+ del tdata["source_id"]
+ del tdata["sensitive_flags"]
+ if "source_user" in tdata:
+ del tdata["source_user"]
+
for tdata["num"], file in enumerate(files, 1):
file.update(tdata)
url = file.pop("url")
@@ -170,8 +170,29 @@ class TwitterExtractor(Extractor):
return files
def _extract_media(self, tweet, entities, files):
+ flags_tweet = None
+
for media in entities:
+ if "sensitive_media_warning" in media:
+ flags_media = media["sensitive_media_warning"]
+
+ flags = []
+ if "adult_content" in flags_media:
+ flags.append("Nudity")
+ if "other" in flags_media:
+ flags.append("Sensitive")
+ if "graphic_violence" in flags_media:
+ flags.append("Violence")
+
+ if flags_tweet is None:
+ flags_tweet = set(flags)
+ else:
+ flags_tweet.update(flags)
+ flags_media = flags
+ else:
+ flags_media = ()
+
if "ext_media_availability" in media:
ext = media["ext_media_availability"]
if ext.get("status") == "Unavailable":
@@ -180,38 +201,22 @@ class TwitterExtractor(Extractor):
if not self.unavailable:
continue
- mtype = media.get("type")
- descr = media.get("ext_alt_text")
- width = media["original_info"].get("width", 0)
- height = media["original_info"].get("height", 0)
-
if "video_info" in media:
if self.videos == "ytdl":
- files.append({
- "url": "ytdl:{}/i/web/status/{}".format(
- self.root, tweet["id_str"]),
- "type" : mtype,
- "width" : width,
- "height" : height,
- "extension" : None,
- "description": descr,
- })
+ url = f"ytdl:{self.root}/i/web/status/{tweet['id_str']}"
+ file = {"url": url, "extension": "mp4"}
elif self.videos:
video_info = media["video_info"]
variant = max(
video_info["variants"],
key=lambda v: v.get("bitrate", 0),
)
- files.append({
- "url" : variant["url"],
- "type" : mtype,
- "width" : width,
- "height" : height,
- "bitrate" : variant.get("bitrate", 0),
- "duration" : video_info.get(
+ file = {
+ "url" : variant["url"],
+ "bitrate" : variant.get("bitrate", 0),
+ "duration": video_info.get(
"duration_millis", 0) / 1000,
- "description": descr,
- })
+ }
elif "media_url_https" in media:
url = media["media_url_https"]
if url[-4] == ".":
@@ -219,16 +224,37 @@ class TwitterExtractor(Extractor):
base += "?format=" + fmt + "&name="
else:
base = url.rpartition("=")[0] + "="
- files.append(text.nameext_from_url(url, {
- "url" : base + self._size_image,
- "type" : mtype,
- "width" : width,
- "height" : height,
- "_fallback" : self._image_fallback(base),
- "description": descr,
- }))
+ file = text.nameext_from_url(url, {
+ "url" : base + self._size_image,
+ "_fallback": self._image_fallback(base),
+ })
else:
files.append({"url": media["media_url"]})
+ continue
+
+ file["type"] = media.get("type")
+ file["width"] = media["original_info"].get("width", 0)
+ file["height"] = media["original_info"].get("height", 0)
+ file["description"] = media.get("ext_alt_text")
+ file["sensitive_flags"] = flags_media
+ self._extract_media_source(file, media)
+ files.append(file)
+
+ tweet["sensitive_flags"] = \
+ () if flags_tweet is None else sorted(flags_tweet)
+
+ def _extract_media_source(self, dest, media):
+ dest["source_id"] = 0
+
+ if "source_status_id_str" in media:
+ try:
+ dest["source_id"] = text.parse_int(
+ media["source_status_id_str"])
+ dest["source_user"] = self._transform_user(
+ media["additional_media_info"]["source_user"]
+ ["user_results"]["result"])
+ except Exception:
+ pass
def _image_fallback(self, base):
for fmt in self._size_fallback:
@@ -252,8 +278,7 @@ class TwitterExtractor(Extractor):
bvals = {bval["key"]: bval["value"]
for bval in card["binding_values"]}
- cbl = self.cards_blacklist
- if cbl:
+ if cbl := self.cards_blacklist:
if name in cbl:
return
if "vanity_url" in bvals:
@@ -288,7 +313,7 @@ class TwitterExtractor(Extractor):
if self.cards == "ytdl":
tweet_id = tweet.get("rest_id") or tweet["id_str"]
- url = "ytdl:{}/i/web/status/{}".format(self.root, tweet_id)
+ url = f"ytdl:{self.root}/i/web/status/{tweet_id}"
files.append({"url": url})
def _extract_twitpic(self, tweet, files):
@@ -313,8 +338,8 @@ class TwitterExtractor(Extractor):
response = self.request(url, fatal=False)
if response.status_code >= 400:
continue
- url = text.extr(response.text, 'name="twitter:image" value="', '"')
- if url:
+ if url := text.extr(
+ response.text, 'name="twitter:image" value="', '"'):
files.append({"url": url})
def _transform_tweet(self, tweet):
@@ -354,12 +379,14 @@ class TwitterExtractor(Extractor):
tget("in_reply_to_status_id_str")),
"conversation_id": text.parse_int(
tget("conversation_id_str")),
+ "source_id" : 0,
"date" : date,
"author" : author,
"user" : self._user or author,
"lang" : legacy["lang"],
"source" : text.extr(source, ">", "<") if source else "",
"sensitive" : tget("possibly_sensitive"),
+ "sensitive_flags": tget("sensitive_flags"),
"favorite_count": tget("favorite_count"),
"quote_count" : tget("quote_count"),
"reply_count" : tget("reply_count"),
@@ -383,12 +410,10 @@ class TwitterExtractor(Extractor):
content = tget("full_text") or tget("text") or ""
entities = legacy["entities"]
- hashtags = entities.get("hashtags")
- if hashtags:
+ if hashtags := entities.get("hashtags"):
tdata["hashtags"] = [t["text"] for t in hashtags]
- mentions = entities.get("user_mentions")
- if mentions:
+ if mentions := entities.get("user_mentions"):
tdata["mentions"] = [{
"id": text.parse_int(u["id_str"]),
"name": u["screen_name"],
@@ -396,8 +421,7 @@ class TwitterExtractor(Extractor):
} for u in mentions]
content = text.unescape(content)
- urls = entities.get("urls")
- if urls:
+ if urls := entities.get("urls"):
for url in urls:
try:
content = content.replace(url["url"], url["expanded_url"])
@@ -417,9 +441,11 @@ class TwitterExtractor(Extractor):
tdata["reply_to"] = legacy["in_reply_to_screen_name"]
if "quoted_by" in legacy:
tdata["quote_by"] = legacy["quoted_by"]
+ if "extended_entities" in legacy:
+ self._extract_media_source(
+ tdata, legacy["extended_entities"]["media"][0])
if tdata["retweet_id"]:
- tdata["content"] = "RT @{}: {}".format(
- author["name"], tdata["content"])
+ tdata["content"] = f"RT @{author['name']}: {tdata['content']}"
tdata["date_original"] = text.parse_timestamp(
((tdata["retweet_id"] >> 22) + 1288834974657) // 1000)
@@ -466,8 +492,7 @@ class TwitterExtractor(Extractor):
}
descr = user["description"]
- urls = entities["description"].get("urls")
- if urls:
+ if urls := entities["description"].get("urls"):
for url in urls:
try:
descr = descr.replace(url["url"], url["expanded_url"])
@@ -577,27 +602,18 @@ class TwitterExtractor(Extractor):
return self.cookies_update(_login_impl(self, username, password))
-class TwitterUserExtractor(TwitterExtractor):
+class TwitterUserExtractor(Dispatch, TwitterExtractor):
"""Extractor for a Twitter user"""
- subcategory = "user"
pattern = (BASE_PATTERN + r"/(?!search)(?:([^/?#]+)/?(?:$|[?#])"
r"|i(?:/user/|ntent/user\?user_id=)(\d+))")
example = "https://x.com/USER"
- def __init__(self, match):
- TwitterExtractor.__init__(self, match)
- user_id = match.group(2)
- if user_id:
- self.user = "id:" + user_id
-
- def initialize(self):
- pass
-
- def finalize(self):
- pass
-
def items(self):
- base = "{}/{}/".format(self.root, self.user)
+ user, user_id = self.groups
+ if user_id is not None:
+ user = "id:" + user_id
+
+ base = f"{self.root}/{user}/"
return self._dispatch_extractors((
(TwitterInfoExtractor , base + "info"),
(TwitterAvatarExtractor , base + "photo"),
@@ -663,12 +679,12 @@ class TwitterTimelineExtractor(TwitterExtractor):
self.api._user_id_by_screen_name(self.user)
# build search query
- query = "from:{} max_id:{}".format(self._user["name"], tweet_id)
+ query = f"from:{self._user['name']} max_id:{tweet_id}"
if self.retweets:
query += " include:retweets include:nativeretweets"
if state <= 2:
- self._cursor_prefix = "2_{}/".format(tweet_id)
+ self._cursor_prefix = f"2_{tweet_id}/"
if reset:
self._cursor = self._cursor_prefix
@@ -684,7 +700,7 @@ class TwitterTimelineExtractor(TwitterExtractor):
if state <= 3:
# yield unfiltered search results
- self._cursor_prefix = "3_{}/".format(tweet_id)
+ self._cursor_prefix = f"3_{tweet_id}/"
if reset:
self._cursor = self._cursor_prefix
@@ -704,7 +720,7 @@ class TwitterTimelineExtractor(TwitterExtractor):
return self.api.user_media
if strategy == "with_replies":
return self.api.user_tweets_and_replies
- raise exception.StopExtraction("Invalid strategy '%s'", strategy)
+ raise exception.AbortExtraction(f"Invalid strategy '{strategy}'")
class TwitterTweetsExtractor(TwitterExtractor):
@@ -847,7 +863,7 @@ class TwitterHashtagExtractor(TwitterExtractor):
example = "https://x.com/hashtag/NAME"
def items(self):
- url = "{}/search?q=%23{}".format(self.root, self.user)
+ url = f"{self.root}/search?q=%23{self.user}"
data = {"_extractor": TwitterSearchExtractor}
yield Message.Queue, url, data
@@ -898,11 +914,10 @@ class TwitterTweetExtractor(TwitterExtractor):
def __init__(self, match):
TwitterExtractor.__init__(self, match)
- self.tweet_id = match.group(2)
+ self.tweet_id = match[2]
def tweets(self):
- conversations = self.config("conversations")
- if conversations:
+ if conversations := self.config("conversations"):
self._accessible = (conversations == "accessible")
return self._tweets_conversation(self.tweet_id)
@@ -919,8 +934,8 @@ class TwitterTweetExtractor(TwitterExtractor):
try:
self._assign_user(tweet["core"]["user_results"]["result"])
except KeyError:
- raise exception.StopExtraction(
- "'%s'", tweet.get("reason") or "Unavailable")
+ raise exception.AbortExtraction(
+ f"'{tweet.get('reason') or 'Unavailable'}'")
yield tweet
@@ -977,7 +992,7 @@ class TwitterQuotesExtractor(TwitterExtractor):
example = "https://x.com/USER/status/12345/quotes"
def items(self):
- url = "{}/search?q=quoted_tweet_id:{}".format(self.root, self.user)
+ url = f"{self.root}/search?q=quoted_tweet_id:{self.user}"
data = {"_extractor": TwitterSearchExtractor}
yield Message.Queue, url, data
@@ -1055,8 +1070,7 @@ class TwitterImageExtractor(Extractor):
TwitterExtractor._init_sizes(self)
def items(self):
- base = "https://pbs.twimg.com/media/{}?format={}&name=".format(
- self.id, self.fmt)
+ base = f"https://pbs.twimg.com/media/{self.id}?format={self.fmt}&name="
data = {
"filename": self.id,
@@ -1233,7 +1247,7 @@ class TwitterAPI():
raise exception.AuthorizationError("NSFW Tweet")
if reason == "Protected":
raise exception.AuthorizationError("Protected Tweet")
- raise exception.StopExtraction("Tweet unavailable ('%s')", reason)
+ raise exception.AbortExtraction(f"Tweet unavailable ('{reason}')")
return tweet
@@ -1391,7 +1405,7 @@ class TwitterAPI():
("viewer", "communities_timeline", "timeline"))
def live_event_timeline(self, event_id):
- endpoint = "/2/live_event/timeline/{}.json".format(event_id)
+ endpoint = f"/2/live_event/timeline/{event_id}.json"
params = self.params.copy()
params["timeline_id"] = "recap"
params["urt"] = "true"
@@ -1399,7 +1413,7 @@ class TwitterAPI():
return self._pagination_legacy(endpoint, params)
def live_event(self, event_id):
- endpoint = "/1.1/live_event/1/{}/timeline.json".format(event_id)
+ endpoint = f"/1.1/live_event/1/{event_id}/timeline.json"
params = self.params.copy()
params["count"] = "0"
params["urt"] = "true"
@@ -1484,9 +1498,9 @@ class TwitterAPI():
return user["rest_id"]
except KeyError:
if "unavailable_message" in user:
- raise exception.NotFoundError("{} ({})".format(
- user["unavailable_message"].get("text"),
- user.get("reason")), False)
+ raise exception.NotFoundError(
+ f"{user['unavailable_message'].get('text')} "
+ f"({user.get('reason')})", False)
else:
raise exception.NotFoundError("user")
@@ -1543,8 +1557,7 @@ class TwitterAPI():
headers=self.headers, fatal=None)
# update 'x-csrf-token' header (#1170)
- csrf_token = response.cookies.get("ct0")
- if csrf_token:
+ if csrf_token := response.cookies.get("ct0"):
self.headers["x-csrf-token"] = csrf_token
remaining = int(response.headers.get("x-rate-limit-remaining", 6))
@@ -1614,13 +1627,12 @@ class TwitterAPI():
except Exception:
pass
- raise exception.StopExtraction(
- "%s %s (%s)", response.status_code, response.reason, errors)
+ raise exception.AbortExtraction(
+ f"{response.status_code} {response.reason} ({errors})")
def _pagination_legacy(self, endpoint, params):
extr = self.extractor
- cursor = extr._init_cursor()
- if cursor:
+ if cursor := extr._init_cursor():
params["cursor"] = cursor
original_retweets = (extr.retweets == "original")
bottom = ("cursor-bottom-", "sq-cursor-bottom")
@@ -1701,8 +1713,7 @@ class TwitterAPI():
yield tweet
if "quoted_status_id_str" in tweet:
- quoted = tweets.get(tweet["quoted_status_id_str"])
- if quoted:
+ if quoted := tweets.get(tweet["quoted_status_id_str"]):
quoted = quoted.copy()
quoted["author"] = users[quoted["user_id_str"]]
quoted["quoted_by"] = tweet["user"]["screen_name"]
@@ -1722,8 +1733,7 @@ class TwitterAPI():
pinned_tweet = extr.pinned
params = {"variables": None}
- cursor = extr._init_cursor()
- if cursor:
+ if cursor := extr._init_cursor():
variables["cursor"] = cursor
if features is None:
features = self.features_pagination
@@ -1772,8 +1782,7 @@ class TwitterAPI():
except LookupError:
extr.log.debug(data)
- user = extr._user_obj
- if user:
+ if user := extr._user_obj:
user = user["legacy"]
if user.get("blocked_by"):
if self.headers["x-twitter-auth-type"] and \
@@ -1784,14 +1793,12 @@ class TwitterAPI():
extr.log.info("Retrying API request as guest")
continue
raise exception.AuthorizationError(
- "{} blocked your account".format(
- user["screen_name"]))
+ f"{user['screen_name']} blocked your account")
elif user.get("protected"):
raise exception.AuthorizationError(
- "{}'s Tweets are protected".format(
- user["screen_name"]))
+ f"{user['screen_name']}'s Tweets are protected")
- raise exception.StopExtraction(
+ raise exception.AbortExtraction(
"Unable to retrieve Tweets from this timeline")
tweets = []
@@ -1924,8 +1931,7 @@ class TwitterAPI():
def _pagination_users(self, endpoint, variables, path=None):
extr = self.extractor
- cursor = extr._init_cursor()
- if cursor:
+ if cursor := extr._init_cursor():
variables["cursor"] = cursor
params = {
"variables": None,
@@ -1970,7 +1976,7 @@ class TwitterAPI():
def _handle_ratelimit(self, response):
rl = self.extractor.config("ratelimit")
if rl == "abort":
- raise exception.StopExtraction("Rate limit exceeded")
+ raise exception.AbortExtraction("Rate limit exceeded")
elif rl and isinstance(rl, str) and rl.startswith("wait:"):
until = None
seconds = text.parse_float(rl.partition(":")[2]) or 60.0
@@ -2000,8 +2006,7 @@ def _login_impl(extr, username, password):
method="POST", fatal=None)
# update 'x-csrf-token' header (#5945)
- csrf_token = response.cookies.get("ct0")
- if csrf_token:
+ if csrf_token := response.cookies.get("ct0"):
headers["x-csrf-token"] = csrf_token
try:
@@ -2019,7 +2024,7 @@ def _login_impl(extr, username, password):
errors = []
for error in data.get("errors") or ():
msg = error.get("message")
- errors.append('"{}"'.format(msg) if msg else "Unknown error")
+ errors.append(f'"{msg}"' if msg else "Unknown error")
extr.log.debug(response.text)
raise exception.AuthenticationError(", ".join(errors))
@@ -2154,7 +2159,7 @@ def _login_impl(extr, username, password):
raise exception.AuthenticationError(
"No 'auth_token' cookie received")
else:
- raise exception.StopExtraction("Unrecognized subtask %s", subtask)
+ raise exception.AbortExtraction(f"Unrecognized subtask {subtask}")
inputs = {"subtask_id": subtask}
inputs.update(data)
@@ -2163,7 +2168,7 @@ def _login_impl(extr, username, password):
"subtask_inputs": [inputs],
}
- extr.sleep(random.uniform(1.0, 3.0), "login ({})".format(subtask))
+ extr.sleep(random.uniform(1.0, 3.0), f"login ({subtask})")
flow_token, subtask = process(data)
return {
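The new `sensitive_flags` handling in `_extract_media` maps Twitter's `sensitive_media_warning` keys to readable labels per media item and aggregates them per tweet. The per-item mapping from that hunk, condensed into a standalone function:

def media_flags(media):
    warning = media.get("sensitive_media_warning")
    if warning is None:
        return ()
    flags = []
    if "adult_content" in warning:
        flags.append("Nudity")
    if "other" in warning:
        flags.append("Sensitive")
    if "graphic_violence" in warning:
        flags.append("Violence")
    return flags

print(media_flags({"sensitive_media_warning": {"adult_content": True}}))
# ['Nudity']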
diff --git a/gallery_dl/extractor/unsplash.py b/gallery_dl/extractor/unsplash.py
index a1b87b9..cf6631f 100644
--- a/gallery_dl/extractor/unsplash.py
+++ b/gallery_dl/extractor/unsplash.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2021-2023 Mike Fährmann
+# Copyright 2021-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -26,7 +26,7 @@ class UnsplashExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.item = match.group(1)
+ self.item = match[1]
def items(self):
fmt = self.config("format") or "raw"
@@ -48,8 +48,7 @@ class UnsplashExtractor(Extractor):
yield Message.Directory, photo
yield Message.Url, url, photo
- @staticmethod
- def metadata():
+ def metadata(self):
return None
def skip(self, num):
@@ -62,7 +61,7 @@ class UnsplashExtractor(Extractor):
params["page"] = self.page_start
while True:
- photos = self.request(url, params=params).json()
+ photos = self.request_json(url, params=params)
if results:
photos = photos["results"]
yield from photos
@@ -79,8 +78,8 @@ class UnsplashImageExtractor(UnsplashExtractor):
example = "https://unsplash.com/photos/ID"
def photos(self):
- url = "{}/napi/photos/{}".format(self.root, self.item)
- return (self.request(url).json(),)
+ url = f"{self.root}/napi/photos/{self.item}"
+ return (self.request_json(url),)
class UnsplashUserExtractor(UnsplashExtractor):
@@ -90,7 +89,7 @@ class UnsplashUserExtractor(UnsplashExtractor):
example = "https://unsplash.com/@USER"
def photos(self):
- url = "{}/napi/users/{}/photos".format(self.root, self.item)
+ url = f"{self.root}/napi/users/{self.item}/photos"
params = {"order_by": "latest"}
return self._pagination(url, params)
@@ -102,7 +101,7 @@ class UnsplashFavoriteExtractor(UnsplashExtractor):
example = "https://unsplash.com/@USER/likes"
def photos(self):
- url = "{}/napi/users/{}/likes".format(self.root, self.item)
+ url = f"{self.root}/napi/users/{self.item}/likes"
params = {"order_by": "latest"}
return self._pagination(url, params)
@@ -115,13 +114,13 @@ class UnsplashCollectionExtractor(UnsplashExtractor):
def __init__(self, match):
UnsplashExtractor.__init__(self, match)
- self.title = match.group(2) or ""
+ self.title = match[2] or ""
def metadata(self):
return {"collection_id": self.item, "collection_title": self.title}
def photos(self):
- url = "{}/napi/collections/{}/photos".format(self.root, self.item)
+ url = f"{self.root}/napi/collections/{self.item}/photos"
params = {"order_by": "latest"}
return self._pagination(url, params)
@@ -134,7 +133,7 @@ class UnsplashSearchExtractor(UnsplashExtractor):
def __init__(self, match):
UnsplashExtractor.__init__(self, match)
- self.query = match.group(2)
+ self.query = match[2]
def photos(self):
url = self.root + "/napi/search/photos"
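Note: this file (like most below) swaps self.request(url).json() for
self.request_json(url). A rough sketch of what such a wrapper can look
like, written against plain requests; the real helper lives in
gallery_dl/extractor/common.py and may handle decoding errors differently:

    import requests

    class Client:
        def __init__(self):
            self.session = requests.Session()

        def request(self, url, **kwargs):
            response = self.session.get(url, **kwargs)
            response.raise_for_status()
            return response

        def request_json(self, url, **kwargs):
            # one central place to decode (and log) JSON responses,
            # instead of calling .json() at every call site
            return self.request(url, **kwargs).json()

The request_xml() calls introduced in vipergirls.py below follow the same
pattern, with xml.etree.ElementTree parsing in place of .json().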
diff --git a/gallery_dl/extractor/uploadir.py b/gallery_dl/extractor/uploadir.py
index ce34e7d..d06c2ad 100644
--- a/gallery_dl/extractor/uploadir.py
+++ b/gallery_dl/extractor/uploadir.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2022-2023 Mike Fährmann
+# Copyright 2022-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -24,10 +24,10 @@ class UploadirFileExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.file_id = match.group(1)
+ self.file_id = match[1]
def items(self):
- url = "{}/u/{}".format(self.root, self.file_id)
+ url = f"{self.root}/u/{self.file_id}"
response = self.request(url, method="HEAD", allow_redirects=False)
if 300 <= response.status_code < 400:
diff --git a/gallery_dl/extractor/urlgalleries.py b/gallery_dl/extractor/urlgalleries.py
index ebfeb9d..4369ac6 100644
--- a/gallery_dl/extractor/urlgalleries.py
+++ b/gallery_dl/extractor/urlgalleries.py
@@ -23,8 +23,7 @@ class UrlgalleriesGalleryExtractor(GalleryExtractor):
_, blog_alt, blog, self.gallery_id = self.groups
if not blog:
blog = blog_alt
- url = "https://urlgalleries.net/b/{}/porn-gallery-{}/?a=10000".format(
- blog, self.gallery_id)
+ url = f"{self.root}/b/{blog}/porn-gallery-{self.gallery_id}/?a=10000"
with self.request(url, allow_redirects=False, fatal=...) as response:
if 300 <= response.status_code < 500:
@@ -38,7 +37,7 @@ class UrlgalleriesGalleryExtractor(GalleryExtractor):
data = self.metadata(page)
data["count"] = len(imgs)
- root = "https://urlgalleries.net/b/" + blog
+ root = self.root
yield Message.Directory, data
for data["num"], img in enumerate(imgs, 1):
page = self.request(root + img).text
@@ -54,7 +53,7 @@ class UrlgalleriesGalleryExtractor(GalleryExtractor):
"_rprt": extr(' title="', '"'), # report button
"title": text.unescape(extr(' title="', '"').strip()),
"date" : text.parse_datetime(
- extr(" images in gallery | ", "<"), "%B %d, %Y %H:%M"),
+ extr(" images in gallery | ", "<"), "%B %d, %Y"),
}
def images(self, page):
diff --git a/gallery_dl/extractor/urlshortener.py b/gallery_dl/extractor/urlshortener.py
index 0478ef0..7a9269d 100644
--- a/gallery_dl/extractor/urlshortener.py
+++ b/gallery_dl/extractor/urlshortener.py
@@ -40,5 +40,5 @@ class UrlshortenerLinkExtractor(UrlshortenerExtractor):
location = self.request_location(
url, headers=self.config_instance("headers"), notfound="URL")
if not location:
- raise exception.StopExtraction("Unable to resolve short URL")
+ raise exception.AbortExtraction("Unable to resolve short URL")
yield Message.Queue, location, {}
diff --git a/gallery_dl/extractor/vanillarock.py b/gallery_dl/extractor/vanillarock.py
index 1ce969f..e0107f3 100644
--- a/gallery_dl/extractor/vanillarock.py
+++ b/gallery_dl/extractor/vanillarock.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -19,7 +19,7 @@ class VanillarockExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.path = match.group(1)
+ self.path = match[1]
class VanillarockPostExtractor(VanillarockExtractor):
diff --git a/gallery_dl/extractor/vichan.py b/gallery_dl/extractor/vichan.py
index 654c451..f99b5de 100644
--- a/gallery_dl/extractor/vichan.py
+++ b/gallery_dl/extractor/vichan.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2022-2023 Mike Fährmann
+# Copyright 2022-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -38,21 +38,18 @@ class VichanThreadExtractor(VichanExtractor):
pattern = BASE_PATTERN + r"/([^/?#]+)/res/(\d+)"
example = "https://8kun.top/a/res/12345.html"
- def __init__(self, match):
- VichanExtractor.__init__(self, match)
- index = match.lastindex
- self.board = match.group(index-1)
- self.thread = match.group(index)
-
def items(self):
- url = "{}/{}/res/{}.json".format(self.root, self.board, self.thread)
- posts = self.request(url).json()["posts"]
+ board = self.groups[-2]
+ thread = self.groups[-1]
+ url = f"{self.root}/{board}/res/{thread}.json"
+ posts = self.request_json(url)["posts"]
+
title = posts[0].get("sub") or text.remove_html(posts[0]["com"])
process = (self._process_8kun if self.category == "8kun" else
self._process)
data = {
- "board" : self.board,
- "thread": self.thread,
+ "board" : board,
+ "thread": thread,
"title" : text.unescape(title)[:50],
"num" : 0,
}
@@ -68,25 +65,25 @@ class VichanThreadExtractor(VichanExtractor):
def _process(self, post, data):
post.update(data)
- post["extension"] = post["ext"][1:]
- post["url"] = "{}/{}/src/{}{}".format(
- self.root, post["board"], post["tim"], post["ext"])
- return Message.Url, post["url"], post
+ ext = post["ext"]
+ post["extension"] = ext[1:]
+ post["url"] = url = \
+ f"{self.root}/{post['board']}/src/{post['tim']}{ext}"
+ return Message.Url, url, post
- @staticmethod
- def _process_8kun(post, data):
+ def _process_8kun(self, post, data):
post.update(data)
- post["extension"] = post["ext"][1:]
-
+ ext = post["ext"]
tim = post["tim"]
+
if len(tim) > 16:
- post["url"] = "https://media.128ducks.com/file_store/{}{}".format(
- tim, post["ext"])
+ url = f"https://media.128ducks.com/file_store/{tim}{ext}"
else:
- post["url"] = "https://media.128ducks.com/{}/src/{}{}".format(
- post["board"], tim, post["ext"])
+ url = f"https://media.128ducks.com/{post['board']}/src/{tim}{ext}"
- return Message.Url, post["url"], post
+ post["url"] = url
+ post["extension"] = ext[1:]
+ return Message.Url, url, post
class VichanBoardExtractor(VichanExtractor):
@@ -95,18 +92,14 @@ class VichanBoardExtractor(VichanExtractor):
pattern = BASE_PATTERN + r"/([^/?#]+)(?:/index|/catalog|/\d+|/?$)"
example = "https://8kun.top/a/"
- def __init__(self, match):
- VichanExtractor.__init__(self, match)
- self.board = match.group(match.lastindex)
-
def items(self):
- url = "{}/{}/threads.json".format(self.root, self.board)
- threads = self.request(url).json()
+ board = self.groups[-1]
+ url = f"{self.root}/{board}/threads.json"
+ threads = self.request_json(url)
for page in threads:
for thread in page["threads"]:
- url = "{}/{}/res/{}.html".format(
- self.root, self.board, thread["no"])
+ url = f"{self.root}/{board}/res/{thread['no']}.html"
thread["page"] = page["page"]
thread["_extractor"] = VichanThreadExtractor
yield Message.Queue, url, thread
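Note: replacing the match.lastindex arithmetic with self.groups[-2] and
self.groups[-1] works because trailing groups keep a fixed position from
the end of the tuple, no matter how many optional groups BASE_PATTERN
contributes at the front. A small demonstration under that assumption
(simplified pattern, not the extractor's actual one):

    import re

    pattern = re.compile(r"(?:https?://)?([\w.]+)/([^/?#]+)/res/(\d+)")
    m = pattern.match("https://8kun.top/a/res/12345.html")
    board, thread = m.groups()[-2], m.groups()[-1]
    assert (board, thread) == ("a", "12345")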
diff --git a/gallery_dl/extractor/vipergirls.py b/gallery_dl/extractor/vipergirls.py
index 1dd3482..e53ecf4 100644
--- a/gallery_dl/extractor/vipergirls.py
+++ b/gallery_dl/extractor/vipergirls.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2023 Mike Fährmann
+# Copyright 2023-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -12,8 +12,6 @@ from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache
-from xml.etree import ElementTree
-
BASE_PATTERN = r"(?:https?://)?(?:www\.)?vipergirls\.to"
@@ -27,8 +25,7 @@ class VipergirlsExtractor(Extractor):
cookies_names = ("vg_userid", "vg_password")
def _init(self):
- domain = self.config("domain")
- if domain:
+ if domain := self.config("domain"):
pos = domain.find("://")
if pos >= 0:
self.root = domain.rstrip("/")
@@ -47,8 +44,7 @@ class VipergirlsExtractor(Extractor):
forum_title = root[1].attrib["title"]
thread_title = root[2].attrib["title"]
- like = self.config("like")
- if like:
+ if like := self.config("like"):
user_hash = root[0].get("hash")
if len(user_hash) < 16:
self.log.warning("Login required to like posts")
@@ -90,7 +86,7 @@ class VipergirlsExtractor(Extractor):
def _login_impl(self, username, password):
self.log.info("Logging in as %s", username)
- url = "{}/login.php?do=login".format(self.root)
+ url = f"{self.root}/login.php?do=login"
data = {
"vb_login_username": username,
"vb_login_password": password,
@@ -129,8 +125,8 @@ class VipergirlsThreadExtractor(VipergirlsExtractor):
self.thread_id, self.page = match.groups()
def posts(self):
- url = "{}/vr.php?t={}".format(self.root, self.thread_id)
- return ElementTree.fromstring(self.request(url).text)
+ url = f"{self.root}/vr.php?t={self.thread_id}"
+ return self.request_xml(url)
class VipergirlsPostExtractor(VipergirlsExtractor):
@@ -146,5 +142,5 @@ class VipergirlsPostExtractor(VipergirlsExtractor):
self.page = 0
def posts(self):
- url = "{}/vr.php?p={}".format(self.root, self.post_id)
- return ElementTree.fromstring(self.request(url).text)
+ url = f"{self.root}/vr.php?p={self.post_id}"
+ return self.request_xml(url)
diff --git a/gallery_dl/extractor/vk.py b/gallery_dl/extractor/vk.py
index ea034a7..0f323e1 100644
--- a/gallery_dl/extractor/vk.py
+++ b/gallery_dl/extractor/vk.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2021-2023 Mike Fährmann
+# Copyright 2021-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,8 +9,7 @@
"""Extractors for https://vk.com/"""
from .common import Extractor, Message
-from .. import text, exception
-import re
+from .. import text, util, exception
BASE_PATTERN = r"(?:https://)?(?:www\.|m\.)?vk\.com"
@@ -27,12 +26,17 @@ class VkExtractor(Extractor):
def _init(self):
self.offset = text.parse_int(self.config("offset"))
+ def finalize(self):
+ if self.offset:
+ self.log.info("Use '-o offset=%s' to continue downloading "
+ "from the current position", self.offset)
+
def skip(self, num):
self.offset += num
return num
def items(self):
- sub = re.compile(r"/imp[fg]/").sub
+ subn = util.re(r"/imp[fg]/").subn
sizes = "wzyxrqpo"
data = self.metadata()
@@ -54,9 +58,12 @@ class VkExtractor(Extractor):
self.log.warning("no photo URL found (%s)", photo.get("id"))
continue
- photo["url"] = sub("/", url.partition("?")[0])
- # photo["url"] = url
- photo["_fallback"] = (url,)
+ url_sub, count = subn("/", url.partition("?")[0])
+ if count:
+ photo["_fallback"] = (url,)
+ photo["url"] = url = url_sub
+ else:
+ photo["url"] = url
try:
_, photo["width"], photo["height"] = photo[size]
@@ -67,8 +74,8 @@ class VkExtractor(Extractor):
photo["id"] = photo["id"].rpartition("_")[2]
photo.update(data)
- text.nameext_from_url(photo["url"], photo)
- yield Message.Url, photo["url"], photo
+ text.nameext_from_url(url, photo)
+ yield Message.Url, url, photo
def _pagination(self, photos_id):
url = self.root + "/al_photos.php"
@@ -86,10 +93,13 @@ class VkExtractor(Extractor):
}
while True:
- payload = self.request(
- url, method="POST", headers=headers, data=data,
- ).json()["payload"][1]
+ response = self.request(
+ url, method="POST", headers=headers, data=data)
+ if response.history and "/challenge.html" in response.url:
+ raise exception.AbortExtraction(
+ f"HTTP redirect to 'challenge' page:\n{response.url}")
+ payload = response.json()["payload"][1]
if len(payload) < 4:
self.log.debug(payload)
raise exception.AuthorizationError(
@@ -98,18 +108,19 @@ class VkExtractor(Extractor):
total = payload[1]
photos = payload[3]
- data["offset"] += len(photos)
- if data["offset"] >= total:
+ offset_next = self.offset + len(photos)
+ if offset_next >= total:
# the last chunk of photos also contains the first few photos
# again if 'total' is not a multiple of 10
- extra = total - data["offset"]
- if extra:
+ if extra := total - offset_next:
del photos[extra:]
yield from photos
+ self.offset = 0
return
yield from photos
+ data["offset"] = self.offset = offset_next
class VkPhotosExtractor(VkExtractor):
@@ -131,26 +142,34 @@ class VkPhotosExtractor(VkExtractor):
if self.user_id:
user_id = self.user_id
prefix = "public" if user_id[0] == "-" else "id"
- url = "{}/{}{}".format(self.root, prefix, user_id.lstrip("-"))
+ url = f"{self.root}/{prefix}{user_id.lstrip('-')}"
data = self._extract_profile(url)
else:
- url = "{}/{}".format(self.root, self.user_name)
+ url = f"{self.root}/{self.user_name}"
data = self._extract_profile(url)
self.user_id = data["user"]["id"]
return data
def _extract_profile(self, url):
- extr = text.extract_from(self.request(url).text)
- return {"user": {
- "name": text.unescape(extr(
- 'rel="canonical" href="https://vk.com/', '"')),
+ page = self.request(url).text
+ extr = text.extract_from(page)
+
+ user = {
+ "id" : extr('property="og:url" content="https://vk.com/id', '"'),
"nick": text.unescape(extr(
- '<h1 class="page_name">', "<")).replace("\xa0", " "),
- "info": text.unescape(text.remove_html(extr(
- '<span class="current_text">', '</span'))),
- "id" : (extr('<a href="/albums', '"') or
- extr('data-from-id="', '"')),
- }}
+ "<title>", " | VK</title>")),
+ "info": text.unescape(extr(
+ ',"activity":"', '","')).replace("\\/", "/"),
+ "name": extr('href="https://m.vk.com/', '"'),
+ }
+
+ if user["id"]:
+ user["group"] = False
+ else:
+ user["group"] = True
+ user["id"] = extr('data-from-id="', '"')
+
+ return {"user": user}
class VkAlbumExtractor(VkExtractor):
@@ -165,8 +184,7 @@ class VkAlbumExtractor(VkExtractor):
self.user_id, self.album_id = match.groups()
def photos(self):
- return self._pagination("album{}_{}".format(
- self.user_id, self.album_id))
+ return self._pagination(f"album{self.user_id}_{self.album_id}")
def metadata(self):
return {
@@ -184,10 +202,10 @@ class VkTaggedExtractor(VkExtractor):
def __init__(self, match):
VkExtractor.__init__(self, match)
- self.user_id = match.group(1)
+ self.user_id = match[1]
def photos(self):
- return self._pagination("tag{}".format(self.user_id))
+ return self._pagination(f"tag{self.user_id}")
def metadata(self):
return {"user": {"id": self.user_id}}
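Note: the switch from re.sub() to subn() above is what enables the new
fallback logic, since subn() also reports whether anything was replaced.
A standalone sketch of the same idea (rewrite is an illustrative name):

    import re

    subn = re.compile(r"/imp[fg]/").subn

    def rewrite(url):
        """Return (primary URL, fallback URLs) for a VK photo URL."""
        clean, count = subn("/", url.partition("?")[0])
        if count:            # proxy segment removed: keep the original
            return clean, (url,)
        return url, ()

    assert rewrite("https://vk.com/impf/a.jpg?size=x") == \
        ("https://vk.com/a.jpg", ("https://vk.com/impf/a.jpg?size=x",))
    assert rewrite("https://vk.com/a.jpg") == ("https://vk.com/a.jpg", ())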
diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py
index 524bd81..42839a8 100644
--- a/gallery_dl/extractor/vsco.py
+++ b/gallery_dl/extractor/vsco.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,7 @@
"""Extractors for https://vsco.co/"""
-from .common import Extractor, Message
+from .common import Extractor, Message, Dispatch
from .. import text, util
BASE_PATTERN = r"(?:https?://)?(?:www\.)?vsco\.co"
@@ -25,7 +25,7 @@ class VscoExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.user = match.group(1).lower()
+ self.user = match[1].lower()
def items(self):
videos = self.config("videos", True)
@@ -47,7 +47,7 @@ class VscoExtractor(Extractor):
base = img["responsive_url"].partition("/")[2]
cdn, _, path = base.partition("/")
if cdn.startswith("aws"):
- url = "https://image-{}.vsco.co/{}".format(cdn, path)
+ url = f"https://image-{cdn}.vsco.co/{path}"
elif cdn.isdecimal():
url = "https://image.vsco.co/" + base
elif img["responsive_url"].startswith("http"):
@@ -79,11 +79,11 @@ class VscoExtractor(Extractor):
def _extract_preload_state(self, url):
page = self.request(url, notfound=self.subcategory).text
return util.json_loads(text.extr(page, "__PRELOADED_STATE__ = ", "<")
- .replace('"prevPageToken":undefined,', ''))
+ .replace('":undefined', '":null'))
def _pagination(self, url, params, token, key, extra=None):
headers = {
- "Referer" : "{}/{}".format(self.root, self.user),
+ "Referer" : f"{self.root}/{self.user}",
"Authorization" : "Bearer " + token,
"X-Client-Platform": "web",
"X-Client-Build" : "1",
@@ -93,7 +93,7 @@ class VscoExtractor(Extractor):
yield from map(self._transform_media, extra)
while True:
- data = self.request(url, params=params, headers=headers).json()
+ data = self.request_json(url, params=params, headers=headers)
medias = data.get(key)
if not medias:
return
@@ -109,8 +109,7 @@ class VscoExtractor(Extractor):
yield from medias
params["page"] += 1
- @staticmethod
- def _transform_media(media):
+ def _transform_media(self, media):
if "responsiveUrl" not in media:
return None
media["_id"] = media["id"]
@@ -122,8 +121,7 @@ class VscoExtractor(Extractor):
media["image_meta"] = media.get("imageMeta")
return media
- @staticmethod
- def _transform_video(media):
+ def _transform_video(self, media):
media["is_video"] = True
media["grid_name"] = ""
media["video_url"] = media["playback_url"]
@@ -132,17 +130,13 @@ class VscoExtractor(Extractor):
return media
-class VscoUserExtractor(VscoExtractor):
+class VscoUserExtractor(Dispatch, VscoExtractor):
"""Extractor for a vsco user profile"""
- subcategory = "user"
pattern = USER_PATTERN + r"/?$"
example = "https://vsco.co/USER"
- def initialize(self):
- pass
-
def items(self):
- base = "{}/{}/".format(self.root, self.user)
+ base = f"{self.root}/{self.user}/"
return self._dispatch_extractors((
(VscoAvatarExtractor , base + "avatar"),
(VscoGalleryExtractor , base + "gallery"),
@@ -158,12 +152,12 @@ class VscoGalleryExtractor(VscoExtractor):
example = "https://vsco.co/USER/gallery"
def images(self):
- url = "{}/{}/gallery".format(self.root, self.user)
+ url = f"{self.root}/{self.user}/gallery"
data = self._extract_preload_state(url)
tkn = data["users"]["currentUser"]["tkn"]
sid = str(data["sites"]["siteByUsername"][self.user]["site"]["id"])
- url = "{}/api/3.0/medias/profile".format(self.root)
+ url = f"{self.root}/api/3.0/medias/profile"
params = {
"site_id" : sid,
"limit" : "14",
@@ -182,14 +176,14 @@ class VscoCollectionExtractor(VscoExtractor):
example = "https://vsco.co/USER/collection/1"
def images(self):
- url = "{}/{}/collection/1".format(self.root, self.user)
+ url = f"{self.root}/{self.user}/collection/1"
data = self._extract_preload_state(url)
tkn = data["users"]["currentUser"]["tkn"]
cid = (data["sites"]["siteByUsername"][self.user]
["site"]["siteCollectionId"])
- url = "{}/api/2.0/collections/{}/medias".format(self.root, cid)
+ url = f"{self.root}/api/2.0/collections/{cid}/medias"
params = {"page": 2, "size": "20"}
return self._pagination(url, params, tkn, "medias", (
data["medias"]["byId"][mid["id"]]["media"]
@@ -207,7 +201,7 @@ class VscoSpaceExtractor(VscoExtractor):
example = "https://vsco.co/spaces/a1b2c3d4e5f"
def images(self):
- url = "{}/spaces/{}".format(self.root, self.user)
+ url = f"{self.root}/spaces/{self.user}"
data = self._extract_preload_state(url)
tkn = data["users"]["currentUser"]["tkn"]
@@ -221,14 +215,14 @@ class VscoSpaceExtractor(VscoExtractor):
space = data["spaces"]["byId"][sid]
space["postsList"] = [posts[pid] for pid in space["postsList"]]
- url = "{}/grpc/spaces/{}/posts".format(self.root, sid)
+ url = f"{self.root}/grpc/spaces/{sid}/posts"
params = {}
return self._pagination(url, params, tkn, space)
def _pagination(self, url, params, token, data):
headers = {
"Accept" : "application/json",
- "Referer" : "{}/spaces/{}".format(self.root, self.user),
+ "Referer" : f"{self.root}/spaces/{self.user}",
"Content-Type" : "application/json",
"Authorization": "Bearer " + token,
}
@@ -244,7 +238,7 @@ class VscoSpaceExtractor(VscoExtractor):
return
params["cursor"] = cursor["postcursorcontext"]["postId"]
- data = self.request(url, params=params, headers=headers).json()
+ data = self.request_json(url, params=params, headers=headers)
class VscoSpacesExtractor(VscoExtractor):
@@ -254,7 +248,7 @@ class VscoSpacesExtractor(VscoExtractor):
example = "https://vsco.co/USER/spaces"
def items(self):
- url = "{}/{}/spaces".format(self.root, self.user)
+ url = f"{self.root}/{self.user}/spaces"
data = self._extract_preload_state(url)
tkn = data["users"]["currentUser"]["tkn"]
@@ -267,12 +261,12 @@ class VscoSpacesExtractor(VscoExtractor):
"Authorization": "Bearer " + tkn,
}
# this would theoretically need to be paginated
- url = "{}/grpc/spaces/user/{}".format(self.root, uid)
- data = self.request(url, headers=headers).json()
+ url = f"{self.root}/grpc/spaces/user/{uid}"
+ data = self.request_json(url, headers=headers)
for space in data["spacesWithRoleList"]:
space = space["space"]
- url = "{}/spaces/{}".format(self.root, space["id"])
+ url = f"{self.root}/spaces/{space['id']}"
space["_extractor"] = VscoSpaceExtractor
yield Message.Queue, url, space
@@ -284,7 +278,7 @@ class VscoAvatarExtractor(VscoExtractor):
example = "https://vsco.co/USER/avatar"
def images(self):
- url = "{}/{}/gallery".format(self.root, self.user)
+ url = f"{self.root}/{self.user}/gallery"
page = self.request(url).text
piid = text.extr(page, '"profileImageId":"', '"')
@@ -312,7 +306,7 @@ class VscoImageExtractor(VscoExtractor):
example = "https://vsco.co/USER/media/0123456789abcdef"
def images(self):
- url = "{}/{}/media/{}".format(self.root, self.user, self.groups[1])
+ url = f"{self.root}/{self.user}/media/{self.groups[1]}"
data = self._extract_preload_state(url)
media = data["medias"]["byId"].popitem()[1]["media"]
return (self._transform_media(media),)
@@ -325,7 +319,7 @@ class VscoVideoExtractor(VscoExtractor):
example = "https://vsco.co/USER/video/012345678-9abc-def0"
def images(self):
- url = "{}/{}/video/{}".format(self.root, self.user, self.groups[1])
+ url = f"{self.root}/{self.user}/video/{self.groups[1]}"
data = self._extract_preload_state(url)
media = data["medias"]["byId"].popitem()[1]["media"]
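Note: VscoUserExtractor now inherits from Dispatch instead of defining its
own no-op initialize(); a profile URL only fans out to the per-section
extractors. A toy sketch of that dispatch shape (names are illustrative,
not the actual Dispatch implementation):

    def dispatch_extractors(base, registry, include):
        """Yield (url, extractor class) pairs for the enabled sections."""
        for name, cls in registry:
            if name in include:
                yield base + name, cls

    registry = [("gallery", "VscoGalleryExtractor"),
                ("spaces", "VscoSpacesExtractor")]
    for url, cls in dispatch_extractors("https://vsco.co/USER/",
                                        registry, {"gallery"}):
        print(url, "->", cls)   # https://vsco.co/USER/gallery -> ...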
diff --git a/gallery_dl/extractor/wallhaven.py b/gallery_dl/extractor/wallhaven.py
index e5b764a..f0f27e0 100644
--- a/gallery_dl/extractor/wallhaven.py
+++ b/gallery_dl/extractor/wallhaven.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2023 Mike Fährmann
+# Copyright 2018-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,7 @@
"""Extractors for https://wallhaven.cc/"""
-from .common import Extractor, Message
+from .common import Extractor, Message, Dispatch
from .. import text, exception
@@ -39,8 +39,7 @@ class WallhavenExtractor(Extractor):
"""Return general metadata"""
return ()
- @staticmethod
- def _transform(wp):
+ def _transform(self, wp):
wp["url"] = wp.pop("path")
if "tags" in wp:
wp["tags"] = [t["name"] for t in wp["tags"]]
@@ -61,7 +60,7 @@ class WallhavenSearchExtractor(WallhavenExtractor):
def __init__(self, match):
WallhavenExtractor.__init__(self, match)
- self.params = text.parse_query(match.group(1))
+ self.params = text.parse_query(match[1])
def wallpapers(self):
return self.api.search(self.params)
@@ -88,21 +87,13 @@ class WallhavenCollectionExtractor(WallhavenExtractor):
return {"username": self.username, "collection_id": self.collection_id}
-class WallhavenUserExtractor(WallhavenExtractor):
+class WallhavenUserExtractor(Dispatch, WallhavenExtractor):
"""Extractor for a wallhaven user"""
- subcategory = "user"
pattern = r"(?:https?://)?wallhaven\.cc/user/([^/?#]+)/?$"
example = "https://wallhaven.cc/user/USER"
- def __init__(self, match):
- WallhavenExtractor.__init__(self, match)
- self.username = match.group(1)
-
- def initialize(self):
- pass
-
def items(self):
- base = "{}/user/{}/".format(self.root, self.username)
+ base = f"{self.root}/user/{self.groups[0]}/"
return self._dispatch_extractors((
(WallhavenUploadsExtractor , base + "uploads"),
(WallhavenCollectionsExtractor, base + "favorites"),
@@ -117,13 +108,13 @@ class WallhavenCollectionsExtractor(WallhavenExtractor):
def __init__(self, match):
WallhavenExtractor.__init__(self, match)
- self.username = match.group(1)
+ self.username = match[1]
def items(self):
+ base = f"{self.root}/user/{self.username}/favorites/"
for collection in self.api.collections(self.username):
collection["_extractor"] = WallhavenCollectionExtractor
- url = "https://wallhaven.cc/user/{}/favorites/{}".format(
- self.username, collection["id"])
+ url = f"{base}{collection['id']}"
yield Message.Queue, url, collection
@@ -137,7 +128,7 @@ class WallhavenUploadsExtractor(WallhavenExtractor):
def __init__(self, match):
WallhavenExtractor.__init__(self, match)
- self.username = match.group(1)
+ self.username = match[1]
def wallpapers(self):
params = {"q": "@" + self.username}
@@ -156,7 +147,7 @@ class WallhavenImageExtractor(WallhavenExtractor):
def __init__(self, match):
WallhavenExtractor.__init__(self, match)
- self.wallpaper_id = match.group(1)
+ self.wallpaper_id = match[1]
def wallpapers(self):
return (self.api.info(self.wallpaper_id),)
@@ -184,7 +175,7 @@ class WallhavenAPI():
return self._call(endpoint)["data"]
def collection(self, username, collection_id):
- endpoint = "/v1/collections/{}/{}".format(username, collection_id)
+ endpoint = f"/v1/collections/{username}/{collection_id}"
return self._pagination(endpoint)
def collections(self, username):
@@ -209,9 +200,9 @@ class WallhavenAPI():
continue
self.extractor.log.debug("Server response: %s", response.text)
- raise exception.StopExtraction(
- "API request failed (%s %s)",
- response.status_code, response.reason)
+ raise exception.AbortExtraction(
+ f"API request failed "
+ f"({response.status_code} {response.reason})")
def _pagination(self, endpoint, params=None, metadata=None):
if params is None:
diff --git a/gallery_dl/extractor/wallpapercave.py b/gallery_dl/extractor/wallpapercave.py
index 796f3f8..65fca24 100644
--- a/gallery_dl/extractor/wallpapercave.py
+++ b/gallery_dl/extractor/wallpapercave.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright 2021 David Hoppenbrouwers
-# Copyright 2023 Mike Fährmann
+# Copyright 2023-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -32,8 +32,8 @@ class WallpapercaveImageExtractor(Extractor):
if path is None:
try:
- path = text.rextract(
- page, 'href="', '"', page.index('id="tdownload"'))[0]
+ path = text.rextr(
+ page, 'href="', '"', page.index('id="tdownload"'), None)
except Exception:
pass
else:
@@ -44,8 +44,7 @@ class WallpapercaveImageExtractor(Extractor):
if path is None:
for wp in text.extract_iter(
page, 'class="wallpaper" id="wp', '</picture>'):
- path = text.rextract(wp, ' src="', '"')[0]
- if path:
+ if path := text.rextr(wp, ' src="', '"'):
image = text.nameext_from_url(path)
yield Message.Directory, image
yield Message.Url, self.root + path, image
diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py
index 61a36d5..8ae2a49 100644
--- a/gallery_dl/extractor/warosu.py
+++ b/gallery_dl/extractor/warosu.py
@@ -18,7 +18,7 @@ class WarosuThreadExtractor(Extractor):
subcategory = "thread"
root = "https://warosu.org"
directory_fmt = ("{category}", "{board}", "{thread} - {title}")
- filename_fmt = "{tim}-{filename}.{extension}"
+ filename_fmt = "{tim} {filename}.{extension}"
archive_fmt = "{board}_{thread}_{tim}"
pattern = r"(?:https?://)?(?:www\.)?warosu\.org/([^/]+)/thread/(\d+)"
example = "https://warosu.org/a/thread/12345"
@@ -28,7 +28,7 @@ class WarosuThreadExtractor(Extractor):
self.board, self.thread = match.groups()
def items(self):
- url = "{}/{}/thread/{}".format(self.root, self.board, self.thread)
+ url = f"{self.root}/{self.board}/thread/{self.thread}"
page = self.request(url).text
data = self.metadata(page)
posts = self.posts(page)
@@ -42,6 +42,9 @@ class WarosuThreadExtractor(Extractor):
if "image" in post:
for key in ("w", "h", "no", "time", "tim"):
post[key] = text.parse_int(post[key])
+ dt = text.parse_timestamp(post["time"])
+ # avoid zero-padding 'day' with %d
+ post["now"] = dt.strftime(f"%a, %b {dt.day}, %Y %H:%M:%S")
post.update(data)
yield Message.Url, post["image"], post
@@ -64,7 +67,8 @@ class WarosuThreadExtractor(Extractor):
def parse(self, post):
"""Build post object by extracting data from an HTML post"""
data = self._extract_post(post)
- if "<span class=fileinfo>" in post and self._extract_image(post, data):
+ if '<span class="fileinfo' in post and \
+ self._extract_image(post, data):
part = data["image"].rpartition("/")[2]
data["tim"], _, data["extension"] = part.partition(".")
data["ext"] = "." + data["extension"]
@@ -76,24 +80,25 @@ class WarosuThreadExtractor(Extractor):
"no" : extr("id=p", ">"),
"name": extr("class=postername>", "<").strip(),
"time": extr("class=posttime title=", "000>"),
- "now" : extr("", "<").strip(),
"com" : text.unescape(text.remove_html(extr(
"<blockquote>", "</blockquote>").strip())),
}
def _extract_image(self, post, data):
extr = text.extract_from(post)
- data["fsize"] = extr("<span class=fileinfo> File: ", ", ")
+ extr('<span class="fileinfo">', "")
+ data["fsize"] = extr("File: ", ", ")
data["w"] = extr("", "x")
data["h"] = extr("", ", ")
data["filename"] = text.unquote(extr(
"", "<").rstrip().rpartition(".")[0])
extr("<br>", "")
- url = extr("<a href=", ">")
- if url:
+ if url := extr("<a href=", ">"):
if url[0] == "/":
data["image"] = self.root + url
+ elif "warosu." not in url:
+ return False
else:
data["image"] = url
return True
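Note: the strftime comment above deserves a concrete example. "%d" always
zero-pads the day of the month, so the patch builds the format string with
an f-string to embed the unpadded day directly:

    from datetime import datetime, timezone

    dt = datetime.fromtimestamp(1704067200, tz=timezone.utc)  # 2024-01-01
    assert dt.strftime("%a, %b %d, %Y") == "Mon, Jan 01, 2024"
    assert dt.strftime(f"%a, %b {dt.day}, %Y") == "Mon, Jan 1, 2024"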
diff --git a/gallery_dl/extractor/weasyl.py b/gallery_dl/extractor/weasyl.py
index 9f6b021..a69f3a8 100644
--- a/gallery_dl/extractor/weasyl.py
+++ b/gallery_dl/extractor/weasyl.py
@@ -20,8 +20,7 @@ class WeasylExtractor(Extractor):
root = "https://www.weasyl.com"
useragent = util.USERAGENT
- @staticmethod
- def populate_submission(data):
+ def populate_submission(self, data):
# Some submissions don't have content and can be skipped
if "submission" in data["media"]:
data["url"] = data["media"]["submission"][0]["url"]
@@ -35,12 +34,12 @@ class WeasylExtractor(Extractor):
self.session.headers['X-Weasyl-API-Key'] = self.config("api-key")
def request_submission(self, submitid):
- return self.request(
- "{}/api/submissions/{}/view".format(self.root, submitid)).json()
+ return self.request_json(
+ f"{self.root}/api/submissions/{submitid}/view")
def retrieve_journal(self, journalid):
- data = self.request(
- "{}/api/journals/{}/view".format(self.root, journalid)).json()
+ data = self.request_json(
+ f"{self.root}/api/journals/{journalid}/view")
data["extension"] = "html"
data["html"] = "text:" + data["content"]
data["date"] = text.parse_datetime(data["posted_at"])
@@ -48,14 +47,14 @@ class WeasylExtractor(Extractor):
def submissions(self, owner_login, folderid=None):
metadata = self.config("metadata")
- url = "{}/api/users/{}/gallery".format(self.root, owner_login)
+ url = f"{self.root}/api/users/{owner_login}/gallery"
params = {
"nextid" : None,
"folderid": folderid,
}
while True:
- data = self.request(url, params=params).json()
+ data = self.request_json(url, params=params)
for submission in data["submissions"]:
if metadata:
submission = self.request_submission(
@@ -77,7 +76,7 @@ class WeasylSubmissionExtractor(WeasylExtractor):
def __init__(self, match):
WeasylExtractor.__init__(self, match)
- self.submitid = match.group(1)
+ self.submitid = match[1]
def items(self):
data = self.request_submission(self.submitid)
@@ -93,7 +92,7 @@ class WeasylSubmissionsExtractor(WeasylExtractor):
def __init__(self, match):
WeasylExtractor.__init__(self, match)
- self.owner_login = match.group(1)
+ self.owner_login = match[1]
def items(self):
yield Message.Directory, {"owner_login": self.owner_login}
@@ -129,7 +128,7 @@ class WeasylJournalExtractor(WeasylExtractor):
def __init__(self, match):
WeasylExtractor.__init__(self, match)
- self.journalid = match.group(1)
+ self.journalid = match[1]
def items(self):
data = self.retrieve_journal(self.journalid)
@@ -146,12 +145,12 @@ class WeasylJournalsExtractor(WeasylExtractor):
def __init__(self, match):
WeasylExtractor.__init__(self, match)
- self.owner_login = match.group(1)
+ self.owner_login = match[1]
def items(self):
yield Message.Directory, {"owner_login": self.owner_login}
- url = "{}/journals/{}".format(self.root, self.owner_login)
+ url = f"{self.root}/journals/{self.owner_login}"
page = self.request(url).text
for journalid in text.extract_iter(page, 'href="/journal/', '/'):
data = self.retrieve_journal(journalid)
@@ -200,5 +199,5 @@ class WeasylFavoriteExtractor(WeasylExtractor):
pos = page.index('">Next (', pos)
except ValueError:
return
- path = text.unescape(text.rextract(page, 'href="', '"', pos)[0])
+ path = text.unescape(text.rextr(page, 'href="', '"', pos))
params = None
diff --git a/gallery_dl/extractor/webmshare.py b/gallery_dl/extractor/webmshare.py
index 7e2b5ea..cc41b03 100644
--- a/gallery_dl/extractor/webmshare.py
+++ b/gallery_dl/extractor/webmshare.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2022-2023 Mike Fährmann
+# Copyright 2022-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -25,10 +25,10 @@ class WebmshareVideoExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.video_id = match.group(1)
+ self.video_id = match[1]
def items(self):
- url = "{}/{}".format(self.root, self.video_id)
+ url = f"{self.root}/{self.video_id}"
extr = text.extract_from(self.request(url).text)
data = {
diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py
index 8ff32af..49a94b5 100644
--- a/gallery_dl/extractor/webtoons.py
+++ b/gallery_dl/extractor/webtoons.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright 2020 Leonardo Taccari
-# Copyright 2021-2023 Mike Fährmann
+# Copyright 2021-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -19,6 +19,9 @@ LANG_PATTERN = BASE_PATTERN + r"/(([^/?#]+)"
class WebtoonsBase():
category = "webtoons"
root = "https://www.webtoons.com"
+ directory_fmt = ("{category}", "{comic}")
+ filename_fmt = "{episode_no}-{num:>02}{type:?-//}.{extension}"
+ archive_fmt = "{title_no}_{episode_no}_{num}"
cookies_domain = ".webtoons.com"
request_interval = (0.5, 1.5)
@@ -32,20 +35,19 @@ class WebtoonsBase():
"ageGatePass": "true",
})
+ _init = setup_agegate_cookies
+
def request(self, url, **kwargs):
response = Extractor.request(self, url, **kwargs)
if response.history and "/ageGate" in response.url:
- raise exception.StopExtraction(
- "HTTP redirect to age gate check ('%s')", response.url)
+ raise exception.AbortExtraction(
+ f"HTTP redirect to age gate check ('{response.url}')")
return response
class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
"""Extractor for an episode on webtoons.com"""
subcategory = "episode"
- directory_fmt = ("{category}", "{comic}")
- filename_fmt = "{episode_no}-{num:>02}.{extension}"
- archive_fmt = "{title_no}_{episode_no}_{num}"
pattern = (LANG_PATTERN + r"/([^/?#]+)/([^/?#]+)/[^/?#]+)"
r"/viewer\?([^#'\"]+)")
example = ("https://www.webtoons.com/en/GENRE/TITLE/NAME/viewer"
@@ -54,11 +56,11 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
def _init(self):
self.setup_agegate_cookies()
- path, self.lang, self.genre, self.comic, query = self.groups
+ base, self.lang, self.genre, self.comic, query = self.groups
params = text.parse_query(query)
self.title_no = params.get("title_no")
self.episode_no = params.get("episode_no")
- self.gallery_url = "{}/{}/viewer?{}".format(self.root, path, query)
+ self.page_url = f"{self.root}/{base}/viewer?{query}"
def metadata(self, page):
extr = text.extract_from(page)
@@ -66,19 +68,19 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
descr = extr('<meta property="og:description" content="', '"')
if extr('<div class="subj_info"', '\n'):
- comic_name = extr('>', '<')
+ comic_name = extr(">", "<")
episode_name = extr('<h1 class="subj_episode" title="', '"')
else:
comic_name = episode_name = ""
if extr('<span class="tx _btnOpenEpisodeList ', '"'):
- episode = extr('>#', '<')
+ episode = extr(">#", "<")
else:
episode = ""
- if extr('<span class="author"', '\n'):
- username = extr('/u/', '"')
- author_name = extr('<span>', '</span>')
+ if extr('<span class="author"', "\n"):
+ username = extr("/u/", '"')
+ author_name = extr("<span>", "</span>")
else:
username = author_name = ""
@@ -122,65 +124,75 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
if type is False:
url = path
elif type:
- url = "{}?type={}".format(path, type)
+ url = f"{path}?type={type}"
- url = url.replace("://webtoon-phinf.", "://swebtoon-phinf.")
- results.append((url, None))
+ results.append((_url(url), None))
return results
+ def assets(self, page):
+ if self.config("thumbnails", False):
+ active = text.extr(page, 'class="on ', '</a>')
+ url = _url(text.extr(active, 'data-url="', '"'))
+ return ({"url": url, "type": "thumbnail"},)
+
class WebtoonsComicExtractor(WebtoonsBase, Extractor):
"""Extractor for an entire comic on webtoons.com"""
subcategory = "comic"
categorytransfer = True
+ filename_fmt = "{type}.{extension}"
+ archive_fmt = "{title_no}_{type}"
pattern = LANG_PATTERN + r"/([^/?#]+)/([^/?#]+))/list\?([^#]+)"
example = "https://www.webtoons.com/en/GENRE/TITLE/list?title_no=123"
- def _init(self):
- self.setup_agegate_cookies()
-
- self.path, self.lang, self.genre, self.comic, query = self.groups
- params = text.parse_query(query)
- self.title_no = params.get("title_no")
- self.page_no = text.parse_int(params.get("page"), 1)
-
def items(self):
- page = None
- data = {
- "_extractor": WebtoonsEpisodeExtractor,
- "title_no" : text.parse_int(self.title_no),
- }
-
- while True:
- path = "/{}/list?title_no={}&page={}".format(
- self.path, self.title_no, self.page_no)
-
- if page is not None and path not in page:
- return
+ kw = self.kwdict
+ base, kw["lang"], kw["genre"], kw["comic"], query = self.groups
+ params = text.parse_query(query)
+ kw["title_no"] = title_no = text.parse_int(params.get("title_no"))
+ kw["page"] = page_no = text.parse_int(params.get("page"), 1)
- response = self.request(self.root + path)
- if response.history:
- parts = response.url.split("/")
- self.path = "/".join(parts[3:-1])
+ path = f"/{base}/list?title_no={title_no}&page={page_no}"
+ response = self.request(self.root + path)
+ if response.history:
+ parts = response.url.split("/")
+ base = "/".join(parts[3:-1])
+ page = response.text
- page = response.text
- data["page"] = self.page_no
+ if self.config("banners") and (asset := self._asset_banner(page)):
+ yield Message.Directory, asset
+ yield Message.Url, asset["url"], asset
+ data = {"_extractor": WebtoonsEpisodeExtractor}
+ while True:
for url in self.get_episode_urls(page):
params = text.parse_query(url.rpartition("?")[2])
data["episode_no"] = text.parse_int(params.get("episode_no"))
yield Message.Queue, url, data
- self.page_no += 1
+ kw["page"] = page_no = page_no + 1
+ path = f"/{base}/list?title_no={title_no}&page={page_no}"
+ if path not in page:
+ return
+ page = self.request(self.root + path).text
def get_episode_urls(self, page):
"""Extract and return all episode urls in 'page'"""
- page = text.extr(page, 'id="_listUl"', '</ul>')
+ page = text.extr(page, 'id="_listUl"', "</ul>")
return [
- match.group(0)
+ match[0]
for match in WebtoonsEpisodeExtractor.pattern.finditer(page)
]
+ def _asset_banner(self, page):
+ try:
+ pos = page.index('<span class="thmb')
+ except Exception:
+ return
+
+ url = _url(text.extract(page, 'src="', '"', pos)[0])
+ return text.nameext_from_url(url, {"url": url, "type": "banner"})
+
class WebtoonsArtistExtractor(WebtoonsBase, Extractor):
"""Extractor for webtoons.com artists"""
@@ -189,8 +201,6 @@ class WebtoonsArtistExtractor(WebtoonsBase, Extractor):
example = "https://www.webtoons.com/p/community/LANG/u/ARTIST"
def items(self):
- self.setup_agegate_cookies()
-
for comic in self.comics():
comic["_extractor"] = WebtoonsComicExtractor
comic_url = self.root + comic["extra"]["episodeListPath"]
@@ -200,13 +210,11 @@ class WebtoonsArtistExtractor(WebtoonsBase, Extractor):
lang, artist = self.groups
language = util.code_to_language(lang).upper()
- url = "{}/p/community/{}/u/{}".format(
- self.root, lang, artist)
+ url = f"{self.root}/p/community/{lang}/u/{artist}"
page = self.request(url).text
creator_id = text.extr(page, '\\"creatorId\\":\\"', '\\')
- url = "{}/p/community/api/v1/creator/{}/titles".format(
- self.root, creator_id)
+ url = f"{self.root}/p/community/api/v1/creator/{creator_id}/titles"
params = {
"language": language,
"nextSize": "50",
@@ -214,6 +222,10 @@ class WebtoonsArtistExtractor(WebtoonsBase, Extractor):
headers = {
"language": language,
}
- data = self.request(url, params=params, headers=headers).json()
+ data = self.request_json(url, params=params, headers=headers)
return data["result"]["titles"]
+
+
+def _url(url):
+ return url.replace("://webtoon-phinf.", "://swebtoon-phinf.")
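Note: "_init = setup_agegate_cookies" binds an existing method under the
hook name the extractor machinery invokes, so every WebtoonsBase subclass
sets the age-gate cookies without needing a wrapper method. A toy version
of the aliasing pattern, assuming a base class that calls _init() on
startup:

    class Base:
        def __init__(self):
            self._init()

        def _init(self):
            pass

    class AgeGated(Base):
        def setup_agegate_cookies(self):
            self.cookies = {"ageGatePass": "true"}

        _init = setup_agegate_cookies   # alias, not a copy

    assert AgeGated().cookies == {"ageGatePass": "true"}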
diff --git a/gallery_dl/extractor/weebcentral.py b/gallery_dl/extractor/weebcentral.py
index cacefd6..03cbf29 100644
--- a/gallery_dl/extractor/weebcentral.py
+++ b/gallery_dl/extractor/weebcentral.py
@@ -22,7 +22,7 @@ class WeebcentralBase():
@memcache(keyarg=1)
def _extract_manga_data(self, manga_id):
- url = "{}/series/{}".format(self.root, manga_id)
+ url = f"{self.root}/series/{manga_id}"
page = self.request(url).text
extr = text.extract_from(page)
@@ -64,7 +64,7 @@ class WeebcentralChapterExtractor(WeebcentralBase, ChapterExtractor):
return data
def images(self, page):
- referer = self.gallery_url
+ referer = self.page_url
url = referer + "/images"
params = {
"is_prev" : "False",
@@ -98,12 +98,9 @@ class WeebcentralMangaExtractor(WeebcentralBase, MangaExtractor):
pattern = BASE_PATTERN + r"/series/(\w+)"
example = "https://weebcentral.com/series/01J7ABCDEFGHIJKLMNOPQRSTUV/TITLE"
- def __init__(self, match):
- MangaExtractor.__init__(self, match, False)
-
def chapters(self, _):
manga_id = self.groups[0]
- referer = "{}/series/{}".format(self.root, manga_id)
+ referer = f"{self.root}/series/{manga_id}"
url = referer + "/full-chapter-list"
headers = {
"Accept" : "*/*",
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
index 3ed5a06..823e8e0 100644
--- a/gallery_dl/extractor/weibo.py
+++ b/gallery_dl/extractor/weibo.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,7 @@
"""Extractors for https://www.weibo.com/"""
-from .common import Extractor, Message
+from .common import Extractor, Message, Dispatch
from .. import text, util, exception
from ..cache import cache
import random
@@ -46,9 +46,9 @@ class WeiboExtractor(Extractor):
if response.history:
if "login.sina.com" in response.url:
- raise exception.StopExtraction(
- "HTTP redirect to login page (%s)",
- response.url.partition("?")[0])
+ raise exception.AbortExtraction(
+ f"HTTP redirect to login page "
+ f"({response.url.partition('?')[0]})")
if "passport.weibo.com" in response.url:
self._sina_visitor_system(response)
response = Extractor.request(self, url, **kwargs)
@@ -98,22 +98,20 @@ class WeiboExtractor(Extractor):
yield Message.Url, file["url"], file
def _extract_status(self, status, files):
- append = files.append
-
if "mix_media_info" in status:
for item in status["mix_media_info"]["items"]:
type = item.get("type")
if type == "video":
if self.videos:
- append(self._extract_video(item["data"]["media_info"]))
+ files.append(self._extract_video(
+ item["data"]["media_info"]))
elif type == "pic":
- append(item["data"]["largest"].copy())
+ files.append(item["data"]["largest"].copy())
else:
self.log.warning("Unknown media type '%s'", type)
return
- pic_ids = status.get("pic_ids")
- if pic_ids:
+ if pic_ids := status.get("pic_ids"):
pics = status["pic_infos"]
for pic_id in pic_ids:
pic = pics[pic_id]
@@ -121,22 +119,22 @@ class WeiboExtractor(Extractor):
if pic_type == "gif" and self.gifs:
if self.gifs_video:
- append({"url": pic["video"]})
+ files.append({"url": pic["video"]})
else:
- append(pic["largest"].copy())
+ files.append(pic["largest"].copy())
elif pic_type == "livephoto" and self.livephoto:
- append(pic["largest"].copy())
- append({"url": pic["video"]})
+ files.append(pic["largest"].copy())
+ files.append({"url": pic["video"]})
else:
- append(pic["largest"].copy())
+ files.append(pic["largest"].copy())
if "page_info" in status:
info = status["page_info"]
if "media_info" in info and self.videos:
if info.get("type") != "5" or self.movies:
- append(self._extract_video(info["media_info"]))
+ files.append(self._extract_video(info["media_info"]))
else:
self.log.debug("%s: Ignoring 'movie' video", status["id"])
@@ -151,25 +149,24 @@ class WeiboExtractor(Extractor):
return media["play_info"].copy()
def _status_by_id(self, status_id):
- url = "{}/ajax/statuses/show?id={}".format(self.root, status_id)
- return self.request(url).json()
+ url = f"{self.root}/ajax/statuses/show?id={status_id}"
+ return self.request_json(url)
def _user_id(self):
if len(self.user) >= 10 and self.user.isdecimal():
return self.user[-10:]
else:
- url = "{}/ajax/profile/info?{}={}".format(
- self.root,
- "screen_name" if self._prefix == "n" else "custom",
- self.user)
- return self.request(url).json()["data"]["user"]["idstr"]
+ url = (f"{self.root}/ajax/profile/info?"
+ f"{'screen_name' if self._prefix == 'n' else 'custom'}="
+ f"{self.user}")
+ return self.request_json(url)["data"]["user"]["idstr"]
def _pagination(self, endpoint, params):
url = self.root + "/ajax" + endpoint
headers = {
"X-Requested-With": "XMLHttpRequest",
"X-XSRF-TOKEN": None,
- "Referer": "{}/u/{}".format(self.root, params["uid"]),
+ "Referer": f"{self.root}/u/{params['uid']}",
}
while True:
@@ -181,25 +178,23 @@ class WeiboExtractor(Extractor):
if not data.get("ok"):
self.log.debug(response.content)
if "since_id" not in params: # first iteration
- raise exception.StopExtraction(
- '"%s"', data.get("msg") or "unknown error")
+ raise exception.AbortExtraction(
+ f'"{data.get("msg") or "unknown error"}"')
data = data["data"]
statuses = data["list"]
yield from statuses
# videos, newvideo
- cursor = data.get("next_cursor")
- if cursor:
+ if cursor := data.get("next_cursor"):
if cursor == -1:
return
params["cursor"] = cursor
continue
# album
- since_id = data.get("since_id")
- if since_id:
- params["sinceid"] = data["since_id"]
+ if since_id := data.get("since_id"):
+ params["sinceid"] = since_id
continue
# home, article
@@ -235,7 +230,7 @@ class WeiboExtractor(Extractor):
"a" : "incarnate",
"t" : data["tid"],
"w" : "3" if data.get("new_tid") else "2",
- "c" : "{:>03}".format(data.get("confidence") or 100),
+ "c" : f"{data.get('confidence') or 100:>03}",
"gc" : "",
"cb" : "cross_domain",
"from" : "weibo",
@@ -257,8 +252,8 @@ class WeiboUserExtractor(WeiboExtractor):
# pass
def items(self):
- base = "{}/u/{}?tabtype=".format(self.root, self._user_id())
- return self._dispatch_extractors((
+ base = f"{self.root}/u/{self._user_id()}?tabtype="
+ return Dispatch._dispatch_extractors(self, (
(WeiboHomeExtractor , base + "home"),
(WeiboFeedExtractor , base + "feed"),
(WeiboVideosExtractor , base + "video"),
diff --git a/gallery_dl/extractor/wikiart.py b/gallery_dl/extractor/wikiart.py
index 938c048..830d880 100644
--- a/gallery_dl/extractor/wikiart.py
+++ b/gallery_dl/extractor/wikiart.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -23,7 +23,7 @@ class WikiartExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.lang = match.group(1)
+ self.lang = match[1]
def items(self):
data = self.metadata()
@@ -54,7 +54,7 @@ class WikiartExtractor(Extractor):
params.update(extra_params)
while True:
- data = self.request(url, headers=headers, params=params).json()
+ data = self.request_json(url, headers=headers, params=params)
items = data.get(key)
if not items:
return
@@ -73,17 +73,16 @@ class WikiartArtistExtractor(WikiartExtractor):
def __init__(self, match):
WikiartExtractor.__init__(self, match)
- self.artist_name = match.group(2)
+ self.artist_name = match[2]
self.artist = None
def metadata(self):
- url = "{}/{}/{}?json=2".format(self.root, self.lang, self.artist_name)
- self.artist = self.request(url).json()
+ url = f"{self.root}/{self.lang}/{self.artist_name}?json=2"
+ self.artist = self.request_json(url)
return {"artist": self.artist}
def paintings(self):
- url = "{}/{}/{}/mode/all-paintings".format(
- self.root, self.lang, self.artist_name)
+ url = f"{self.root}/{self.lang}/{self.artist_name}/mode/all-paintings"
return self._pagination(url)
@@ -95,16 +94,14 @@ class WikiartImageExtractor(WikiartArtistExtractor):
def __init__(self, match):
WikiartArtistExtractor.__init__(self, match)
- self.title = match.group(3)
+ self.title = match[3]
def paintings(self):
title, sep, year = self.title.rpartition("-")
if not sep or not year.isdecimal():
title = self.title
- url = "{}/{}/Search/{} {}".format(
- self.root, self.lang,
- self.artist.get("artistName") or self.artist_name, title,
- )
+ url = (f"{self.root}/{self.lang}/Search/"
+ f"{self.artist.get('artistName') or self.artist_name} {title}")
return self._pagination(url, stop=True)
@@ -117,15 +114,14 @@ class WikiartArtworksExtractor(WikiartExtractor):
def __init__(self, match):
WikiartExtractor.__init__(self, match)
- self.group = match.group(2)
- self.type = match.group(3)
+ self.group = match[2]
+ self.type = match[3]
def metadata(self):
return {"group": self.group, "type": self.type}
def paintings(self):
- url = "{}/{}/paintings-by-{}/{}".format(
- self.root, self.lang, self.group, self.type)
+ url = f"{self.root}/{self.lang}/paintings-by-{self.group}/{self.type}"
return self._pagination(url)
@@ -137,12 +133,11 @@ class WikiartArtistsExtractor(WikiartExtractor):
def __init__(self, match):
WikiartExtractor.__init__(self, match)
- self.group = match.group(2)
- self.type = match.group(3)
+ self.group = match[2]
+ self.type = match[3]
def items(self):
- url = "{}/{}/App/Search/Artists-by-{}".format(
- self.root, self.lang, self.group)
+ url = f"{self.root}/{self.lang}/App/Search/Artists-by-{self.group}"
params = {"json": "3", "searchterm": self.type}
for artist in self._pagination(url, params, "Artists"):
diff --git a/gallery_dl/extractor/wikifeet.py b/gallery_dl/extractor/wikifeet.py
index f7bfeb2..31dc9cd 100644
--- a/gallery_dl/extractor/wikifeet.py
+++ b/gallery_dl/extractor/wikifeet.py
@@ -21,11 +21,11 @@ class WikifeetGalleryExtractor(GalleryExtractor):
example = "https://www.wikifeet.com/CELEB"
def __init__(self, match):
- self.root = text.root_from_url(match.group(0))
+ self.root = text.root_from_url(match[0])
if "wikifeetx.com" in self.root:
self.category = "wikifeetx"
self.type = "men" if "://men." in self.root else "women"
- self.celeb = match.group(1)
+ self.celeb = match[1]
GalleryExtractor.__init__(self, match, self.root + "/" + self.celeb)
def metadata(self, page):
@@ -50,9 +50,11 @@ class WikifeetGalleryExtractor(GalleryExtractor):
"S": "Soles",
"B": "Barefoot",
}
- ufmt = "https://pics.wikifeet.com/" + self.celeb + "-Feet-{}.jpg"
+
+ gallery = text.extr(page, '"gallery":[', '],')
+ base = f"https://pics.wikifeet.com/{self.celeb}-Feet-"
return [
- (ufmt.format(data["pid"]), {
+ (f"{base}{data['pid']}.jpg", {
"pid" : data["pid"],
"width" : data["pw"],
"height": data["ph"],
@@ -61,6 +63,5 @@ class WikifeetGalleryExtractor(GalleryExtractor):
for tag in data["tags"] if tag in tagmap
],
})
- for data in
- util.json_loads("[" + text.extr(page, '"gallery":[', '],') + "]")
+ for data in util.json_loads(f"[{gallery}]")
]
diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py
index 3b23f3a..e927bc1 100644
--- a/gallery_dl/extractor/wikimedia.py
+++ b/gallery_dl/extractor/wikimedia.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Ailothaen
-# Copyright 2024 Mike Fährmann
+# Copyright 2024-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -27,14 +27,17 @@ class WikimediaExtractor(BaseExtractor):
if self.category == "wikimedia":
self.category = self.root.split(".")[-2]
elif self.category in ("fandom", "wikigg"):
- self.category = "{}-{}".format(
- self.category, self.root.partition(".")[0].rpartition("/")[2])
+ self.category = (
+ f"{self.category}-"
+ f"{self.root.partition('.')[0].rpartition('/')[2]}")
self.per_page = self.config("limit", 50)
+ if useragent := self.config_instance("useragent"):
+ self.useragent = useragent
+
def _init(self):
- api_path = self.config_instance("api-path")
- if api_path:
+ if api_path := self.config_instance("api-path"):
if api_path[0] == "/":
self.api_url = self.root + api_path
else:
@@ -50,10 +53,9 @@ class WikimediaExtractor(BaseExtractor):
response = self.request(url, method="HEAD", fatal=None)
if response.status_code < 400:
return url
- raise exception.StopExtraction("Unable to find API endpoint")
+ raise exception.AbortExtraction("Unable to find API endpoint")
- @staticmethod
- def prepare(image):
+ def prepare(self, image):
"""Adjust the content of an image object"""
image["metadata"] = {
m["name"]: m["value"]
@@ -107,17 +109,15 @@ class WikimediaExtractor(BaseExtractor):
)
while True:
- data = self.request(url, params=params).json()
+ data = self.request_json(url, params=params)
# ref: https://www.mediawiki.org/wiki/API:Errors_and_warnings
- error = data.get("error")
- if error:
+ if error := data.get("error"):
self.log.error("%s: %s", error["code"], error["info"])
return
# MediaWiki will emit warnings for non-fatal mistakes such as
# invalid parameter instead of raising an error
- warnings = data.get("warnings")
- if warnings:
+ if warnings := data.get("warnings"):
self.log.debug("MediaWiki returned warnings: %s", warnings)
try:
@@ -187,6 +187,7 @@ BASE_PATTERN = WikimediaExtractor.update({
"root": "https://azurlane.koumakan.jp",
"pattern": r"azurlane\.koumakan\.jp",
"api-path": "/w/api.php",
+ "useragent": "Googlebot-Image/1.0",
},
})
@@ -238,7 +239,7 @@ class WikimediaArticleExtractor(WikimediaExtractor):
}
def prepare(self, image):
- WikimediaExtractor.prepare(image)
+ WikimediaExtractor.prepare(self, image)
image["page"] = self.title
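Note: since prepare() is no longer a @staticmethod, the subclass override
above has to pass self explicitly when calling the base implementation
through the class. A minimal illustration of why:

    class Base:
        def prepare(self, image):
            image["prepared"] = True

    class Article(Base):
        def prepare(self, image):
            Base.prepare(self, image)   # unbound call needs explicit self
            image["page"] = "TITLE"

    image = {}
    Article().prepare(image)
    assert image == {"prepared": True, "page": "TITLE"}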
diff --git a/gallery_dl/extractor/xfolio.py b/gallery_dl/extractor/xfolio.py
index a1a5be3..12f437a 100644
--- a/gallery_dl/extractor/xfolio.py
+++ b/gallery_dl/extractor/xfolio.py
@@ -38,7 +38,7 @@ class XfolioExtractor(Extractor):
response = Extractor.request(self, url, **kwargs)
if "/system/recaptcha" in response.url:
- raise exception.StopExtraction("Bot check / CAPTCHA page")
+ raise exception.AbortExtraction("Bot check / CAPTCHA page")
return response
@@ -47,13 +47,10 @@ class XfolioWorkExtractor(XfolioExtractor):
subcategory = "work"
pattern = BASE_PATTERN + r"/portfolio/([^/?#]+)/works/(\d+)"
example = "https://xfolio.jp/portfolio/USER/works/12345"
- ref_fmt = ("{}/fullscale_image?image_id={}&work_id={}")
- url_fmt = ("{}/user_asset.php?id={}&work_id={}"
- "&work_image_id={}&type=work_image")
def items(self):
creator, work_id = self.groups
- url = "{}/portfolio/{}/works/{}".format(self.root, creator, work_id)
+ url = f"{self.root}/portfolio/{creator}/works/{work_id}"
html = self.request(url).text
work = self._extract_data(html)
@@ -98,10 +95,11 @@ class XfolioWorkExtractor(XfolioExtractor):
files.append({
"image_id" : image_id,
"extension": "jpg",
- "url": self.url_fmt.format(
- self.root, image_id, work_id, image_id),
- "_http_headers": {"Referer": self.ref_fmt.format(
- self.root, image_id, work_id)},
+ "url": (f"{self.root}/user_asset.php?id={image_id}&work_id="
+ f"{work_id}&work_image_id={image_id}&type=work_image"),
+ "_http_headers": {"Referer": (
+ f"{self.root}/fullscale_image"
+ f"?image_id={image_id}&work_id={work_id}")},
})
return files
@@ -113,7 +111,7 @@ class XfolioUserExtractor(XfolioExtractor):
example = "https://xfolio.jp/portfolio/USER"
def works(self):
- url = "{}/portfolio/{}/works".format(self.root, self.groups[0])
+ url = f"{self.root}/portfolio/{self.groups[0]}/works"
while True:
html = self.request(url).text
@@ -136,7 +134,7 @@ class XfolioSeriesExtractor(XfolioExtractor):
def works(self):
creator, series_id = self.groups
- url = "{}/portfolio/{}/series/{}".format(self.root, creator, series_id)
+ url = f"{self.root}/portfolio/{creator}/series/{series_id}"
html = self.request(url).text
return [
diff --git a/gallery_dl/extractor/xhamster.py b/gallery_dl/extractor/xhamster.py
index 4d69d3d..6c97175 100644
--- a/gallery_dl/extractor/xhamster.py
+++ b/gallery_dl/extractor/xhamster.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -20,7 +20,7 @@ class XhamsterExtractor(Extractor):
category = "xhamster"
def __init__(self, match):
- self.root = "https://" + match.group(1)
+ self.root = "https://" + match[1]
Extractor.__init__(self, match)
@@ -106,7 +106,7 @@ class XhamsterUserExtractor(XhamsterExtractor):
example = "https://xhamster.com/users/USER/photos"
def items(self):
- url = "{}/users/{}/photos".format(self.root, self.groups[1])
+ url = f"{self.root}/users/{self.groups[1]}/photos"
data = {"_extractor": XhamsterGalleryExtractor}
while url:
diff --git a/gallery_dl/extractor/xvideos.py b/gallery_dl/extractor/xvideos.py
index da9d6b0..6c016ec 100644
--- a/gallery_dl/extractor/xvideos.py
+++ b/gallery_dl/extractor/xvideos.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2017-2023 Mike Fährmann
+# Copyright 2017-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -33,8 +33,7 @@ class XvideosGalleryExtractor(XvideosBase, GalleryExtractor):
def __init__(self, match):
self.user, self.gallery_id = match.groups()
- url = "{}/profiles/{}/photos/{}".format(
- self.root, self.user, self.gallery_id)
+ url = f"{self.root}/profiles/{self.user}/photos/{self.gallery_id}"
GalleryExtractor.__init__(self, match, url)
def metadata(self, page):
@@ -70,7 +69,7 @@ class XvideosGalleryExtractor(XvideosBase, GalleryExtractor):
return
while len(results) % 500 == 0:
- path = text.rextract(page, ' href="', '"', page.find(">Next</"))[0]
+ path = text.rextr(page, ' href="', '"', page.find(">Next</"))
if not path:
break
page = self.request(self.root + path).text
@@ -92,10 +91,10 @@ class XvideosUserExtractor(XvideosBase, Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.user = match.group(1)
+ self.user = match[1]
def items(self):
- url = "{}/profiles/{}".format(self.root, self.user)
+ url = f"{self.root}/profiles/{self.user}"
page = self.request(url, notfound=self.subcategory).text
data = util.json_loads(text.extr(
page, "xv.conf=", ";</script>"))["data"]
@@ -116,7 +115,7 @@ class XvideosUserExtractor(XvideosBase, Extractor):
]
galleries.sort(key=lambda x: x["id"])
+ base = f"{self.root}/profiles/{self.user}/photos/"
for gallery in galleries:
- url = "https://www.xvideos.com/profiles/{}/photos/{}".format(
- self.user, gallery["id"])
+ url = f"{base}{gallery['id']}"
yield Message.Queue, url, gallery
diff --git a/gallery_dl/extractor/yiffverse.py b/gallery_dl/extractor/yiffverse.py
index 2b14341..1595b4d 100644
--- a/gallery_dl/extractor/yiffverse.py
+++ b/gallery_dl/extractor/yiffverse.py
@@ -46,8 +46,8 @@ class YiffverseExtractor(BooruExtractor):
post_id = post["id"]
root = self.root_cdn if files[fmt][0] else self.root
- post["file_url"] = url = "{}/posts/{}/{}/{}.{}".format(
- root, post_id // 1000, post_id, post_id, extension)
+ post["file_url"] = url = \
+ f"{root}/posts/{post_id // 1000}/{post_id}/{post_id}.{extension}"
post["format_id"] = fmt
post["format"] = extension.partition(".")[0]
@@ -73,11 +73,11 @@ class YiffverseExtractor(BooruExtractor):
post["tags_" + types[type]] = values
def _fetch_post(self, post_id):
- url = "{}/api/v2/post/{}".format(self.root, post_id)
- return self.request(url).json()
+ url = f"{self.root}/api/v2/post/{post_id}"
+ return self.request_json(url)
def _pagination(self, endpoint, params=None):
- url = "{}/api{}".format(self.root, endpoint)
+ url = f"{self.root}/api{endpoint}"
if params is None:
params = {}
@@ -87,7 +87,7 @@ class YiffverseExtractor(BooruExtractor):
threshold = self.per_page
while True:
- data = self.request(url, method="POST", json=params).json()
+ data = self.request_json(url, method="POST", json=params)
yield from data["items"]
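
The yiffverse file URLs above shard posts into thousand-sized CDN directories via integer division. A small sketch of that layout (the root URL is illustrative):

    # posts/<id // 1000>/<id>/<id>.<ext> -- one directory per 1000 posts
    def file_url(root, post_id, extension):
        return f"{root}/posts/{post_id // 1000}/{post_id}/{post_id}.{extension}"

    print(file_url("https://cdn.example", 123456, "png"))
    # https://cdn.example/posts/123/123456/123456.png
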
diff --git a/gallery_dl/extractor/ytdl.py b/gallery_dl/extractor/ytdl.py
index 168845e..eb33b65 100644
--- a/gallery_dl/extractor/ytdl.py
+++ b/gallery_dl/extractor/ytdl.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2021-2023 Mike Fährmann
+# Copyright 2021-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -28,7 +28,7 @@ class YoutubeDLExtractor(Extractor):
self.ytdl_module_name = ytdl_module.__name__
# find suitable youtube_dl extractor
- self.ytdl_url = url = match.group(1)
+ self.ytdl_url = url = match[1]
generic = config.interpolate(("extractor", "ytdl"), "generic", True)
if generic == "force":
self.ytdl_ie_key = "Generic"
@@ -42,8 +42,14 @@ class YoutubeDLExtractor(Extractor):
raise exception.NoExtractorError()
self.force_generic_extractor = False
- # set subcategory to youtube_dl extractor's key
- self.subcategory = self.ytdl_ie_key
+ if self.ytdl_ie_key == "Generic" and config.interpolate(
+ ("extractor", "ytdl"), "generic-category", True):
+ # set subcategory to URL domain
+ self.category = "ytdl-generic"
+ self.subcategory = url[url.rfind("/", None, 8)+1:url.find("/", 8)]
+ else:
+ # set subcategory to youtube_dl extractor's key
+ self.subcategory = self.ytdl_ie_key
Extractor.__init__(self, match)
def items(self):
@@ -76,8 +82,7 @@ class YoutubeDLExtractor(Extractor):
ytdl_module, self, user_opts, extr_opts)
# transfer cookies to ytdl
- cookies = self.cookies
- if cookies:
+ if cookies := self.cookies:
set_cookie = ytdl_instance.cookiejar.set_cookie
for cookie in cookies:
set_cookie(cookie)
@@ -89,7 +94,7 @@ class YoutubeDLExtractor(Extractor):
ytdl_instance.get_info_extractor(self.ytdl_ie_key),
False, {}, True)
except ytdl_module.utils.YoutubeDLError:
- raise exception.StopExtraction("Failed to extract video data")
+ raise exception.AbortExtraction("Failed to extract video data")
if not info_dict:
return
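
The new "generic-category" branch above derives the subcategory from the URL's domain using only string slicing: rfind("/", None, 8) hits the "//" of the scheme within the first 8 characters, and find("/", 8) the first path slash after it. A standalone sketch, assuming the URL has a scheme and a path:

    def domain(url):
        return url[url.rfind("/", None, 8) + 1:url.find("/", 8)]

    print(domain("https://example.org/watch?v=1"))  # example.org
    print(domain("http://sub.example.net/a/b"))     # sub.example.net
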
diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py
index 0ad73c0..3341594 100644
--- a/gallery_dl/extractor/zerochan.py
+++ b/gallery_dl/extractor/zerochan.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2022-2023 Mike Fährmann
+# Copyright 2022-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -12,7 +12,6 @@ from .booru import BooruExtractor
from ..cache import cache
from .. import text, util, exception
import collections
-import re
BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net"
@@ -63,7 +62,7 @@ class ZerochanExtractor(BooruExtractor):
return response.cookies
def _parse_entry_html(self, entry_id):
- url = "{}/{}".format(self.root, entry_id)
+ url = f"{self.root}/{entry_id}"
page = self.request(url).text
try:
@@ -102,7 +101,7 @@ class ZerochanExtractor(BooruExtractor):
return data
def _parse_entry_api(self, entry_id):
- url = "{}/{}?json".format(self.root, entry_id)
+ url = f"{self.root}/{entry_id}?json"
txt = self.request(url).text
try:
item = util.json_loads(txt)
@@ -127,7 +126,7 @@ class ZerochanExtractor(BooruExtractor):
return data
def _parse_json(self, txt):
- txt = re.sub(r"[\x00-\x1f\x7f]", "", txt)
+ txt = util.re(r"[\x00-\x1f\x7f]").sub("", txt)
main, _, tags = txt.partition('tags": [')
item = {}
@@ -174,8 +173,7 @@ class ZerochanTagExtractor(ZerochanExtractor):
self.posts = self.posts_api
self.session.headers["User-Agent"] = util.USERAGENT
- exts = self.config("extensions")
- if exts:
+ if exts := self.config("extensions"):
if isinstance(exts, str):
exts = exts.split(",")
self.exts = exts
@@ -239,7 +237,7 @@ class ZerochanTagExtractor(ZerochanExtractor):
self.log.warning("HTTP redirect to %s", url)
if self.config("redirects"):
continue
- raise exception.StopExtraction()
+ raise exception.AbortExtraction()
data = response.json()
try:
@@ -278,7 +276,7 @@ class ZerochanImageExtractor(ZerochanExtractor):
def __init__(self, match):
ZerochanExtractor.__init__(self, match)
- self.image_id = match.group(1)
+ self.image_id = match[1]
def posts(self):
post = self._parse_entry_html(self.image_id)
diff --git a/gallery_dl/extractor/zzup.py b/gallery_dl/extractor/zzup.py
index 20454b4..7393931 100644
--- a/gallery_dl/extractor/zzup.py
+++ b/gallery_dl/extractor/zzup.py
@@ -25,7 +25,7 @@ class ZzupGalleryExtractor(GalleryExtractor):
if subdomain == "up.":
self.root = "https://up.zzup.com"
self.images = self.images_v2
- url = "{}{}/index.html".format(self.root, path)
+ url = f"{self.root}{path}/index.html"
GalleryExtractor.__init__(self, match, url)
def metadata(self, page):
@@ -41,22 +41,23 @@ class ZzupGalleryExtractor(GalleryExtractor):
page = self.request(self.root + path).text
url = self.root + text.extr(page, '\n<a href="', '"')
p1, _, p2 = url.partition("/image0")
- ufmt = p1 + "/image{:>05}" + p2[4:]
- return [(ufmt.format(num), None) for num in range(1, count + 1)]
+ p2 = p2[4:]
+ return [(f"{p1}/image{i:>05}{p2}", None) for i in range(1, count + 1)]
def images_v2(self, page):
+ base = f"{self.root}/showimage/"
results = []
while True:
for path in text.extract_iter(
page, ' class="picbox"><a target="_blank" href="', '"'):
- results.append(("{}/showimage/{}/zzup.com.jpg".format(
- self.root, "/".join(path.split("/")[2:-2])), None))
+ url = f"{base}{'/'.join(path.split('/')[2:-2])}/zzup.com.jpg"
+ results.append((url, None))
pos = page.find("glyphicon-arrow-right")
if pos < 0:
break
- path = text.rextract(page, ' href="', '"', pos)[0]
- page = self.request(text.urljoin(self.gallery_url, path)).text
+ path = text.rextr(page, ' href="', '"', pos)
+ page = self.request(text.urljoin(self.page_url, path)).text
return results
diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py
index 6affc3e..7a49049 100644
--- a/gallery_dl/formatter.py
+++ b/gallery_dl/formatter.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2021-2023 Mike Fährmann
+# Copyright 2021-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -28,21 +28,17 @@ def parse(format_string, default=NONE, fmt=format):
except KeyError:
pass
- cls = StringFormatter
- if format_string.startswith("\f"):
+ if format_string and format_string[0] == "\f":
kind, _, format_string = format_string.partition(" ")
- kind = kind[1:]
-
- if kind == "T":
- cls = TemplateFormatter
- elif kind == "TF":
- cls = TemplateFStringFormatter
- elif kind == "E":
- cls = ExpressionFormatter
- elif kind == "M":
- cls = ModuleFormatter
- elif kind == "F":
- cls = FStringFormatter
+ try:
+ cls = _FORMATTERS[kind[1:]]
+ except KeyError:
+ import logging
+ logging.getLogger("formatter").error(
+ "Invalid formatter type '%s'", kind[1:])
+ cls = StringFormatter
+ else:
+ cls = StringFormatter
formatter = _CACHE[key] = cls(format_string, default, fmt)
return formatter
@@ -208,6 +204,48 @@ class ExpressionFormatter():
self.format_map = util.compile_expression(expression)
+class FStringFormatter():
+ """Generate text by evaluating an f-string literal"""
+
+ def __init__(self, fstring, default=NONE, fmt=None):
+ self.format_map = util.compile_expression(f'f"""{fstring}"""')
+
+
+def _init_jinja():
+ import jinja2
+ from . import config
+
+ if opts := config.get((), "jinja"):
+ JinjaFormatter.env = env = jinja2.Environment(
+ **opts.get("environment") or {})
+ else:
+ JinjaFormatter.env = jinja2.Environment()
+ return
+
+ if policies := opts.get("policies"):
+ env.policies.update(policies)
+
+ if path := opts.get("filters"):
+ module = util.import_file(path).__dict__
+ env.filters.update(
+ module["__filters__"] if "__filters__" in module else module)
+
+ if path := opts.get("tests"):
+ module = util.import_file(path).__dict__
+ env.tests.update(
+ module["__tests__"] if "__tests__" in module else module)
+
+
+class JinjaFormatter():
+ """Generate text by evaluating a Jinja template string"""
+ env = None
+
+ def __init__(self, source, default=NONE, fmt=None):
+ if self.env is None:
+ _init_jinja()
+ self.format_map = self.env.from_string(source).render
+
+
class ModuleFormatter():
"""Generate text by calling an external function"""
@@ -217,13 +255,6 @@ class ModuleFormatter():
self.format_map = getattr(module, function_name)
-class FStringFormatter():
- """Generate text by evaluating an f-string literal"""
-
- def __init__(self, fstring, default=NONE, fmt=None):
- self.format_map = util.compile_expression('f"""' + fstring + '"""')
-
-
class TemplateFormatter(StringFormatter):
"""Read format_string from file"""
@@ -242,6 +273,15 @@ class TemplateFStringFormatter(FStringFormatter):
FStringFormatter.__init__(self, fstring, default, fmt)
+class TemplateJinjaFormatter(JinjaFormatter):
+ """Generate text by evaluating a Jinja template"""
+
+ def __init__(self, path, default=NONE, fmt=None):
+ with open(util.expand_path(path)) as fp:
+ source = fp.read()
+ JinjaFormatter.__init__(self, source, default, fmt)
+
+
def parse_field_name(field_name):
if field_name[0] == "'":
return "_lit", (operator.itemgetter(field_name[1:-1]),)
@@ -302,7 +342,7 @@ def _parse_optional(format_spec, default):
fmt = _build_format_func(format_spec, default)
def optional(obj):
- return before + fmt(obj) + after if obj else ""
+ return f"{before}{fmt(obj)}{after}" if obj else ""
return optional
@@ -385,6 +425,27 @@ def _parse_join(format_spec, default):
return apply_join
+def _parse_map(format_spec, default):
+ key, _, format_spec = format_spec.partition(_SEPARATOR)
+ key = key[1:]
+ fmt = _build_format_func(format_spec, default)
+
+ def map_(obj):
+ if not obj or isinstance(obj, str):
+ return fmt(obj)
+
+ results = []
+ for item in obj:
+ if isinstance(item, dict):
+ value = item.get(key, ...)
+ results.append(default if value is ... else value)
+ else:
+ results.append(item)
+ return fmt(results)
+
+ return map_
+
+
def _parse_replace(format_spec, default):
old, new, format_spec = format_spec.split(_SEPARATOR, 2)
old = old[1:]
@@ -463,8 +524,7 @@ class Literal():
# __getattr__, __getattribute__, and __class_getitem__
# are all slower than regular __getitem__
- @staticmethod
- def __getitem__(key):
+ def __getitem__(self, key):
return key
@@ -472,6 +532,18 @@ _literal = Literal()
_CACHE = {}
_SEPARATOR = "/"
+_FORMATTERS = {
+ "E" : ExpressionFormatter,
+ "F" : FStringFormatter,
+ "J" : JinjaFormatter,
+ "M" : ModuleFormatter,
+ "S" : StringFormatter,
+ "T" : TemplateFormatter,
+ "TF": TemplateFStringFormatter,
+ "FT": TemplateFStringFormatter,
+ "TJ": TemplateJinjaFormatter,
+ "JT": TemplateJinjaFormatter,
+}
_GLOBALS = {
"_env": lambda: os.environ,
"_lit": lambda: _literal,
@@ -485,12 +557,15 @@ _CONVERSIONS = {
"C": string.capwords,
"j": util.json_dumps,
"t": str.strip,
- "L": len,
+ "n": len,
+ "L": util.code_to_language,
"T": util.datetime_to_timestamp_string,
"d": text.parse_timestamp,
+ "D": util.to_datetime,
"U": text.unescape,
"H": lambda s: text.unescape(text.remove_html(s)),
"g": text.slugify,
+ "W": text.sanitize_whitespace,
"S": util.to_string,
"s": str,
"r": repr,
@@ -506,6 +581,7 @@ _FORMAT_SPECIFIERS = {
"D": _parse_datetime,
"J": _parse_join,
"L": _parse_maxlen,
+ "M": _parse_map,
"O": _parse_offset,
"R": _parse_replace,
"S": _parse_sort,
diff --git a/gallery_dl/job.py b/gallery_dl/job.py
index a88f536..3176eb4 100644
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2023 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -28,6 +28,7 @@ from . import (
)
from .extractor.message import Message
stdout_write = output.stdout_write
+FLAGS = util.FLAGS
class Job():
@@ -47,41 +48,16 @@ class Job():
self.kwdict = {}
self.kwdict_eval = False
- cfgpath = []
- if parent:
- if extr.category == parent.extractor.category or \
- extr.category in parent.parents:
- parents = parent.parents
- else:
- parents = parent.parents + (parent.extractor.category,)
-
- if parents:
- for category in parents:
- cat = "{}>{}".format(category, extr.category)
- cfgpath.append((cat, extr.subcategory))
- cfgpath.append((category + ">*", extr.subcategory))
- cfgpath.append((extr.category, extr.subcategory))
- self.parents = parents
- else:
- self.parents = ()
- else:
- self.parents = ()
-
- if extr.basecategory:
- if not cfgpath:
- cfgpath.append((extr.category, extr.subcategory))
- cfgpath.append((extr.basecategory, extr.subcategory))
-
- if cfgpath:
+ if cfgpath := self._build_config_path(parent):
+ if isinstance(cfgpath, list):
+ extr.config = extr._config_shared
+ extr.config_accumulate = extr._config_shared_accumulate
extr._cfgpath = cfgpath
- extr.config = extr._config_shared
- extr.config_accumulate = extr._config_shared_accumulate
- actions = extr.config("actions")
- if actions:
- from .actions import LoggerAdapter, parse
+ if actions := extr.config("actions"):
+ from .actions import LoggerAdapter, parse_logging
self._logger_adapter = LoggerAdapter
- self._logger_actions = parse(actions)
+ self._logger_actions = parse_logging(actions)
path_proxy = output.PathfmtProxy(self)
self._logger_extra = {
@@ -93,16 +69,6 @@ class Job():
extr.log = self._wrap_logger(extr.log)
extr.log.debug("Using %s for '%s'", extr.__class__.__name__, extr.url)
- # data from parent job
- if parent:
- pextr = parent.extractor
-
- # transfer (sub)category
- if pextr.config("category-transfer", pextr.categorytransfer):
- extr._cfgpath = pextr._cfgpath
- extr.category = pextr.category
- extr.subcategory = pextr.subcategory
-
self.metadata_url = extr.config2("metadata-url", "url-metadata")
self.metadata_http = extr.config2("metadata-http", "http-metadata")
metadata_path = extr.config2("metadata-path", "path-metadata")
@@ -121,8 +87,7 @@ class Job():
"current_git_head": util.git_head()
}
# user-supplied metadata
- kwdict = extr.config("keywords")
- if kwdict:
+ if kwdict := extr.config("keywords"):
if extr.config("keywords-eval"):
self.kwdict_eval = []
for key, value in kwdict.items():
@@ -134,6 +99,41 @@ class Job():
else:
self.kwdict.update(kwdict)
+ def _build_config_path(self, parent):
+ extr = self.extractor
+ cfgpath = []
+
+ if parent:
+ pextr = parent.extractor
+ if extr.category == pextr.category or \
+ extr.category in parent.parents:
+ parents = parent.parents
+ else:
+ parents = parent.parents + (pextr.category,)
+ self.parents = parents
+
+ if pextr.config("category-transfer", pextr.categorytransfer):
+ extr.category = pextr.category
+ extr.subcategory = pextr.subcategory
+ return pextr._cfgpath
+
+ if parents:
+ sub = extr.subcategory
+ for category in parents:
+ cat = f"{category}>{extr.category}"
+ cfgpath.append((cat, sub))
+ cfgpath.append((category + ">*", sub))
+ cfgpath.append((extr.category, sub))
+ else:
+ self.parents = ()
+
+ if extr.basecategory:
+ if not cfgpath:
+ cfgpath.append((extr.category, extr.subcategory))
+ cfgpath.append((extr.basecategory, extr.subcategory))
+
+ return cfgpath
+
def run(self):
"""Execute or run the job"""
extractor = self.extractor
@@ -151,9 +151,10 @@ class Job():
try:
for msg in extractor:
self.dispatch(msg)
- except exception.StopExtraction as exc:
- if exc.message:
- log.error(exc.message)
+ except exception.StopExtraction:
+ pass
+ except exception.AbortExtraction as exc:
+ log.error(exc.message)
self.status |= exc.code
except (exception.TerminateExtraction, exception.RestartExtraction):
raise
@@ -162,10 +163,14 @@ class Job():
log.debug("", exc_info=exc)
self.status |= exc.code
except OSError as exc:
- log.error("Unable to download data: %s: %s",
- exc.__class__.__name__, exc)
log.debug("", exc_info=exc)
- self.status |= 128
+ name = exc.__class__.__name__
+ if name == "JSONDecodeError":
+ log.error("Failed to parse JSON data: %s: %s", name, exc)
+ self.status |= 1
+ else: # regular OSError
+ log.error("Unable to download data: %s: %s", name, exc)
+ self.status |= 128
except Exception as exc:
log.error(("An unexpected error occurred: %s - %s. "
"Please run gallery-dl again with the --verbose flag, "
@@ -184,6 +189,8 @@ class Job():
self.handle_finalize()
extractor.finalize()
+ if s := extractor.status:
+ self.status |= s
return self.status
def dispatch(self, msg):
@@ -195,6 +202,8 @@ class Job():
if self.pred_url(url, kwdict):
self.update_kwdict(kwdict)
self.handle_url(url, kwdict)
+ if FLAGS.FILE is not None:
+ FLAGS.process("FILE")
elif msg[0] == Message.Directory:
self.update_kwdict(msg[1])
@@ -205,7 +214,10 @@ class Job():
if self.metadata_url:
kwdict[self.metadata_url] = url
if self.pred_queue(url, kwdict):
+ self.update_kwdict(kwdict)
self.handle_queue(url, kwdict)
+ if FLAGS.CHILD is not None:
+ FLAGS.process("CHILD")
def handle_url(self, url, kwdict):
"""Handle Message.Url"""
@@ -226,6 +238,8 @@ class Job():
kwdict["subcategory"] = extr.subcategory
if self.metadata_http:
kwdict.pop(self.metadata_http, None)
+ if extr.kwdict:
+ kwdict.update(extr.kwdict)
if self.kwdict:
kwdict.update(self.kwdict)
if self.kwdict_eval:
@@ -243,8 +257,7 @@ class Job():
if self.extractor.config(target + "-unique"):
predicates.append(util.UniquePredicate())
- pfilter = self.extractor.config(target + "-filter")
- if pfilter:
+ if pfilter := self.extractor.config(target + "-filter"):
try:
pred = util.FilterPredicate(pfilter, target)
except (SyntaxError, ValueError, TypeError) as exc:
@@ -252,8 +265,7 @@ class Job():
else:
predicates.append(pred)
- prange = self.extractor.config(target + "-range")
- if prange:
+ if prange := self.extractor.config(target + "-range"):
try:
pred = util.RangePredicate(prange)
except ValueError as exc:
@@ -382,6 +394,8 @@ class DownloadJob(Job):
if "post-after" in self.hooks:
for callback in self.hooks["post-after"]:
callback(self.pathfmt)
+ if FLAGS.POST is not None:
+ FLAGS.process("POST")
self.pathfmt.set_directory(kwdict)
if "post" in self.hooks:
for callback in self.hooks["post"]:
@@ -392,12 +406,10 @@ class DownloadJob(Job):
return
self.visited.add(url)
- cls = kwdict.get("_extractor")
- if cls:
+ if cls := kwdict.get("_extractor"):
extr = cls.from_url(url)
else:
- extr = extractor.find(url)
- if extr:
+ if extr := extractor.find(url):
if self._extractor_filter is None:
self._extractor_filter = self._build_extractor_filter()
if not self._extractor_filter(extr):
@@ -413,8 +425,7 @@ class DownloadJob(Job):
else:
extr._parentdir = pextr._parentdir
- pmeta = pextr.config2("parent-metadata", "metadata-parent")
- if pmeta:
+ if pmeta := pextr.config2("parent-metadata", "metadata-parent"):
if isinstance(pmeta, str):
data = self.kwdict.copy()
if kwdict:
@@ -446,9 +457,13 @@ class DownloadJob(Job):
except StopIteration:
pass
else:
+ pextr.log.info("Downloading fallback URL")
text.nameext_from_url(url, kwdict)
+ if kwdict["filename"].startswith((
+ "HLS", "DASH")):
+ kwdict["filename"] = url.rsplit("/", 2)[-2]
if url.startswith("ytdl:"):
- kwdict["extension"] = ""
+ kwdict["extension"] = "mp4"
self.handle_url(url, kwdict)
break
except exception.RestartExtraction:
@@ -463,8 +478,7 @@ class DownloadJob(Job):
self.archive.finalize()
self.archive.close()
- pathfmt = self.pathfmt
- if pathfmt:
+ if pathfmt := self.pathfmt:
hooks = self.hooks
if "post-after" in hooks:
for callback in hooks["post-after"]:
@@ -500,8 +514,7 @@ class DownloadJob(Job):
def download(self, url):
"""Download 'url'"""
scheme = url.partition(":")[0]
- downloader = self.get_downloader(scheme)
- if downloader:
+ if downloader := self.get_downloader(scheme):
try:
return downloader.download(url, self.pathfmt)
except OSError as exc:
@@ -547,8 +560,7 @@ class DownloadJob(Job):
# monkey-patch method to do nothing and always return True
self.download = pathfmt.fix_extension
- archive_path = cfg("archive")
- if archive_path:
+ if archive_path := cfg("archive"):
archive_table = cfg("archive-table")
archive_prefix = cfg("archive-prefix")
if archive_prefix is None:
@@ -585,8 +597,7 @@ class DownloadJob(Job):
self._archive_write_file = ("file" in events)
self._archive_write_skip = ("skip" in events)
- skip = cfg("skip", True)
- if skip:
+ if skip := cfg("skip", True):
self._skipexc = None
if skip == "enumerate":
pathfmt.check_file = pathfmt._enum_file
@@ -600,8 +611,7 @@ class DownloadJob(Job):
self._skipexc = SystemExit
self._skipmax = text.parse_int(smax)
- skip_filter = cfg("skip-filter")
- if skip_filter:
+ if skip_filter := cfg("skip-filter"):
self._skipftr = util.compile_filter(skip_filter)
else:
self._skipftr = None
@@ -614,8 +624,7 @@ class DownloadJob(Job):
if not cfg("postprocess", True):
return
- postprocessors = extr.config_accumulate("postprocessors")
- if postprocessors:
+ if postprocessors := extr.config_accumulate("postprocessors"):
self.hooks = collections.defaultdict(list)
pp_log = self.get_logger("postprocessor")
@@ -648,7 +657,26 @@ class DownloadJob(Job):
clist, negate)(extr):
continue
- name = pp_dict.get("name")
+ name = pp_dict.get("name", "")
+ if "__init__" not in pp_dict:
+ name, sep, event = name.rpartition("@")
+ if sep:
+ pp_dict["name"] = name
+ if "event" not in pp_dict:
+ pp_dict["event"] = event
+ else:
+ name = event
+
+ name, sep, mode = name.rpartition("/")
+ if sep:
+ pp_dict["name"] = name
+ if "mode" not in pp_dict:
+ pp_dict["mode"] = mode
+ else:
+ name = mode
+
+ pp_dict["__init__"] = None
+
pp_cls = postprocessor.find(name)
if not pp_cls:
pp_log.warning("module '%s' not found", name)
@@ -680,8 +708,7 @@ class DownloadJob(Job):
for hook, callback in hooks.items():
self.hooks[hook].append(callback)
- @staticmethod
- def _call_hook(callback, condition, pathfmt):
+ def _call_hook(self, callback, condition, pathfmt):
if condition(pathfmt.kwdict):
callback(pathfmt)
@@ -775,7 +802,7 @@ class KeywordJob(Job):
if markers is None:
markers = {markerid}
elif markerid in markers:
- write("{}\n <circular reference>\n".format(prefix[:-2]))
+ write(f"{prefix[:-2]}\n <circular reference>\n")
return # ignore circular reference
else:
markers.add(markerid)
@@ -801,7 +828,7 @@ class KeywordJob(Job):
else:
# string or number
- write("{}\n {}\n".format(key, value))
+ write(f"{key}\n {value}\n")
markers.remove(markerid)
@@ -816,20 +843,17 @@ class UrlJob(Job):
if depth >= self.maxdepth:
self.handle_queue = self.handle_url
- @staticmethod
- def handle_url(url, _):
+ def handle_url(self, url, _):
stdout_write(url + "\n")
- @staticmethod
- def handle_url_fallback(url, kwdict):
+ def handle_url_fallback(self, url, kwdict):
stdout_write(url + "\n")
if "_fallback" in kwdict:
for url in kwdict["_fallback"]:
stdout_write("| " + url + "\n")
def handle_queue(self, url, kwdict):
- cls = kwdict.get("_extractor")
- if cls:
+ if cls := kwdict.get("_extractor"):
extr = cls.from_url(url)
else:
extr = extractor.find(url)
@@ -862,20 +886,18 @@ class InfoJob(Job):
return 0
def _print_multi(self, title, *values):
- stdout_write("{}\n {}\n\n".format(
- title, " / ".join(map(util.json_dumps, values))))
+ stdout_write(
+ f"{title}\n {' / '.join(map(util.json_dumps, values))}\n\n")
def _print_config(self, title, optname, value):
optval = self.extractor.config(optname, util.SENTINEL)
if optval is not util.SENTINEL:
stdout_write(
- "{} (custom):\n {}\n{} (default):\n {}\n\n".format(
- title, util.json_dumps(optval),
- title, util.json_dumps(value)))
+ f"{title} (custom):\n {util.json_dumps(optval)}\n"
+ f"{title} (default):\n {util.json_dumps(value)}\n\n")
elif value:
stdout_write(
- "{} (default):\n {}\n\n".format(
- title, util.json_dumps(value)))
+ f"{title} (default):\n {util.json_dumps(value)}\n\n")
class DataJob(Job):
@@ -912,7 +934,10 @@ class DataJob(Job):
except exception.StopExtraction:
pass
except Exception as exc:
- self.data.append((exc.__class__.__name__, str(exc)))
+ self.data.append((-1, {
+ "error" : exc.__class__.__name__,
+ "message": str(exc),
+ }))
except BaseException:
pass
@@ -941,8 +966,7 @@ class DataJob(Job):
self.data.append((Message.Queue, url, self.filter(kwdict)))
def handle_queue_resolve(self, url, kwdict):
- cls = kwdict.get("_extractor")
- if cls:
+ if cls := kwdict.get("_extractor"):
extr = cls.from_url(url)
else:
extr = extractor.find(url)
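
The config-path logic factored into _build_config_path() above layers lookups for child extractors: options are searched under "<parent>><child>" and "<parent>>*" before the child's own category. A simplified illustration of the resulting search order:

    def build_config_path(category, subcategory, parents):
        cfgpath = []
        for parent in parents:
            cfgpath.append((f"{parent}>{category}", subcategory))
            cfgpath.append((f"{parent}>*", subcategory))
        cfgpath.append((category, subcategory))
        return cfgpath

    print(build_config_path("twitter", "media", ("reddit",)))
    # [('reddit>twitter', 'media'), ('reddit>*', 'media'), ('twitter', 'media')]
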
diff --git a/gallery_dl/option.py b/gallery_dl/option.py
index 3c03271..963f957 100644
--- a/gallery_dl/option.py
+++ b/gallery_dl/option.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2017-2023 Mike Fährmann
+# Copyright 2017-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -40,8 +40,8 @@ class DeprecatedConfigConstAction(argparse.Action):
"""Set argparse const values as config values + deprecation warning"""
def __call__(self, parser, namespace, values, option_string=None):
sys.stderr.write(
- "warning: {} is deprecated. Use {} instead.\n".format(
- "/".join(self.option_strings), self.choices))
+ f"Warning: {'/'.join(self.option_strings)} is deprecated. "
+ f"Use {self.choices} instead.\n")
namespace.options.append(((), self.dest, self.const))
@@ -71,7 +71,7 @@ class MtimeAction(argparse.Action):
def __call__(self, parser, namespace, value, option_string=None):
namespace.postprocessors.append({
"name": "mtime",
- "value": "{" + (self.const or value) + "}",
+ "value": f"{{{self.const or value}}}",
})
@@ -144,7 +144,7 @@ class UgoiraAction(argparse.Action):
}
namespace.options.append(((), "ugoira", "original"))
else:
- parser.error("Unsupported Ugoira format '{}'".format(value))
+ parser.error(f"Unsupported Ugoira format '{value}'")
pp["name"] = "ugoira"
pp["whitelist"] = ("pixiv", "danbooru")
@@ -156,10 +156,17 @@ class UgoiraAction(argparse.Action):
class PrintAction(argparse.Action):
def __call__(self, parser, namespace, value, option_string=None):
if self.const:
- filename = self.const
+ if self.const == "-":
+ namespace.options.append(((), "skip", False))
+ namespace.options.append(((), "download", False))
+ namespace.options.append((("output",), "mode", False))
+ filename = "-"
base = None
mode = "w"
else:
+ if self.const is None:
+ namespace.options.append(((), "skip", False))
+ namespace.options.append(((), "download", False))
value, path = value
base, filename = os.path.split(path)
mode = "a"
@@ -186,7 +193,7 @@ class PrintAction(argparse.Action):
if format_string[1] == "F" and format_string[-1] != "\n":
format_string += "\n"
elif "{" not in format_string and " " not in format_string:
- format_string = "{" + format_string + "}\n"
+ format_string = f"{{{format_string}}}\n"
elif format_string[-1] != "\n":
format_string += "\n"
@@ -205,12 +212,19 @@ class Formatter(argparse.HelpFormatter):
def __init__(self, prog):
argparse.HelpFormatter.__init__(self, prog, max_help_position=30)
- def _format_action_invocation(self, action, join=", ".join):
+ def _format_action_invocation(self, action):
opts = action.option_strings
if action.metavar:
opts = opts.copy()
- opts[-1] += " " + action.metavar
- return join(opts)
+ opts[-1] = f"{opts[-1]} {action.metavar}"
+ return ", ".join(opts)
+
+ def _format_usage(self, usage, actions, groups, prefix):
+ return f"Usage: {self._prog} [OPTIONS] URL [URL...]\n"
+
+ def format_help(self):
+ return self._long_break_matcher.sub(
+ "\n\n", self._root_section.format_help())
def _parse_option(opt):
@@ -225,7 +239,6 @@ def _parse_option(opt):
def build_parser():
"""Build and configure an ArgumentParser object"""
parser = argparse.ArgumentParser(
- usage="%(prog)s [OPTION]... URL...",
formatter_class=Formatter,
add_help=False,
)
@@ -273,6 +286,11 @@ def build_parser():
help="Delete cached login sessions, cookies, etc. for MODULE "
"(ALL to delete everything)",
)
+ general.add_argument(
+ "--compat",
+ dest="category-map", nargs=0, action=ConfigConstAction, const="compat",
+ help="Restore legacy 'category' names",
+ )
update = parser.add_argument_group("Update Options")
if util.EXECUTABLE:
@@ -395,13 +413,28 @@ def build_parser():
dest="postprocessors", metavar="[EVENT:]FORMAT",
action=PrintAction, const="-", default=[],
help=("Write FORMAT during EVENT (default 'prepare') to standard "
- "output. Examples: 'id' or 'post:{md5[:8]}'"),
+ "output instead of downloading files. "
+ "Can be used multiple times. "
+ "Examples: 'id' or 'post:{md5[:8]}'"),
+ )
+ output.add_argument(
+ "--Print",
+ dest="postprocessors", metavar="[EVENT:]FORMAT",
+ action=PrintAction, const="+",
+ help="Like --print, but downloads files as well",
)
output.add_argument(
"--print-to-file",
dest="postprocessors", metavar="[EVENT:]FORMAT FILE",
- action=PrintAction, nargs=2,
- help="Append FORMAT during EVENT to FILE",
+ action=PrintAction, const=None, nargs=2,
+ help=("Append FORMAT during EVENT to FILE instead of downloading "
+ "files. Can be used multiple times"),
+ )
+ output.add_argument(
+ "--Print-to-file",
+ dest="postprocessors", metavar="[EVENT:]FORMAT FILE",
+ action=PrintAction, const=False, nargs=2,
+ help="Like --print-to-file, but downloads files as well",
)
output.add_argument(
"--list-modules",
@@ -485,7 +518,7 @@ def build_parser():
downloader.add_argument(
"-r", "--limit-rate",
dest="rate", metavar="RATE", action=ConfigAction,
- help="Maximum download rate (e.g. 500k or 2.5M)",
+ help="Maximum download rate (e.g. 500k, 2.5M, or 800k-2M)",
)
downloader.add_argument(
"--chunk-size",
@@ -506,6 +539,12 @@ def build_parser():
"during data extraction"),
)
downloader.add_argument(
+ "--sleep-429",
+ dest="sleep-429", metavar="SECONDS", action=ConfigAction,
+ help=("Number of seconds to wait when receiving a "
+ "'429 Too Many Requests' response"),
+ )
+ downloader.add_argument(
"--sleep-extractor",
dest="sleep-extractor", metavar="SECONDS", action=ConfigAction,
help=("Number of seconds to wait before starting data extraction "
@@ -648,7 +687,7 @@ def build_parser():
selection.add_argument(
"--download-archive",
dest="archive", metavar="FILE", action=ConfigAction,
- help=("Record all downloaded or skipped files in FILE and "
+ help=("Record successfully downloaded files in FILE and "
"skip downloading any file already in it"),
)
selection.add_argument(
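
The --print/--Print pairs above differ only in their argparse 'const': the lowercase variants also disable downloading by appending config overrides. A toy sketch of that const-dispatch pattern (heavily reduced; the real PrintAction also builds the full postprocessor dict and handles files):

    import argparse

    class PrintAction(argparse.Action):
        def __call__(self, parser, namespace, value, option_string=None):
            if self.const == "-":          # --print: no downloads
                namespace.options.append(("download", False))
            namespace.postprocessors.append({"name": "print", "value": value})

    parser = argparse.ArgumentParser()
    parser.set_defaults(options=[], postprocessors=[])
    parser.add_argument("--print", dest="postprocessors",
                        action=PrintAction, const="-")
    parser.add_argument("--Print", dest="postprocessors",
                        action=PrintAction, const="+")

    args = parser.parse_args(["--print", "id"])
    print(args.options)         # [('download', False)]
    print(args.postprocessors)  # [{'name': 'print', 'value': 'id'}]
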
diff --git a/gallery_dl/output.py b/gallery_dl/output.py
index 1649487..e4937f4 100644
--- a/gallery_dl/output.py
+++ b/gallery_dl/output.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2023 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -121,8 +121,7 @@ class PathfmtProxy():
return pathfmt.__dict__.get(name) if pathfmt else None
def __str__(self):
- pathfmt = object.__getattribute__(self, "job").pathfmt
- if pathfmt:
+ if pathfmt := object.__getattribute__(self, "job").pathfmt:
return pathfmt.path or pathfmt.directory
return ""
@@ -235,8 +234,7 @@ def configure_logging(loglevel):
minlevel = handler.level
# file logging handler
- handler = setup_logging_handler("logfile", lvl=loglevel)
- if handler:
+ if handler := setup_logging_handler("logfile", lvl=loglevel):
root.addHandler(handler)
if minlevel > handler.level:
minlevel = handler.level
@@ -394,8 +392,7 @@ class PipeOutput(NullOutput):
class TerminalOutput():
def __init__(self):
- shorten = config.get(("output",), "shorten", True)
- if shorten:
+ if shorten := config.get(("output",), "shorten", True):
func = shorten_string_eaw if shorten == "eaw" else shorten_string
limit = shutil.get_terminal_size().columns - OFFSET
sep = CHAR_ELLIPSIES
@@ -416,10 +413,10 @@ class TerminalOutput():
bdl = util.format_value(bytes_downloaded)
bps = util.format_value(bytes_per_second)
if bytes_total is None:
- stderr_write("\r{:>7}B {:>7}B/s ".format(bdl, bps))
+ stderr_write(f"\r{bdl:>7}B {bps:>7}B/s ")
else:
- stderr_write("\r{:>3}% {:>7}B {:>7}B/s ".format(
- bytes_downloaded * 100 // bytes_total, bdl, bps))
+ stderr_write(f"\r{bytes_downloaded * 100 // bytes_total:>3}% "
+ f"{bdl:>7}B {bps:>7}B/s ")
class ColorOutput(TerminalOutput):
@@ -431,10 +428,8 @@ class ColorOutput(TerminalOutput):
if colors is None:
colors = COLORS_DEFAULT
- self.color_skip = "\033[{}m".format(
- colors.get("skip", "2"))
- self.color_success = "\r\033[{}m".format(
- colors.get("success", "1;32"))
+ self.color_skip = f"\x1b[{colors.get('skip', '2')}m"
+ self.color_success = f"\r\x1b[{colors.get('success', '1;32')}m"
def start(self, path):
stdout_write_flush(self.shorten(path))
@@ -462,8 +457,7 @@ class CustomOutput():
if isinstance(fmt_success, list):
off_success, fmt_success = fmt_success
- shorten = config.get(("output",), "shorten", True)
- if shorten:
+ if shorten := config.get(("output",), "shorten", True):
func = shorten_string_eaw if shorten == "eaw" else shorten_string
width = shutil.get_terminal_size().columns
@@ -483,8 +477,7 @@ class CustomOutput():
self._fmt_progress_total = (options.get("progress-total") or
"\r{3:>3}% {0:>7}B {1:>7}B/s ").format
- @staticmethod
- def _make_func(shorten, format_string, limit):
+ def _make_func(self, shorten, format_string, limit):
fmt = format_string.format
return lambda txt: fmt(shorten(txt, limit, CHAR_ELLIPSIES))
diff --git a/gallery_dl/path.py b/gallery_dl/path.py
index 54cf126..795564d 100644
--- a/gallery_dl/path.py
+++ b/gallery_dl/path.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2021-2023 Mike Fährmann
+# Copyright 2021-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,7 +9,6 @@
"""Filesystem path handling"""
import os
-import re
import shutil
import functools
from . import util, formatter, exception
@@ -134,26 +133,45 @@ class PathFormat():
basedir = self.clean_path(basedir)
self.basedirectory = basedir
- @staticmethod
- def _build_cleanfunc(chars, repl):
+ def _build_cleanfunc(self, chars, repl):
if not chars:
return util.identity
elif isinstance(chars, dict):
+ if 0 not in chars:
+ chars = self._process_repl_dict(chars)
+ chars[0] = None
+
def func(x, table=str.maketrans(chars)):
return x.translate(table)
elif len(chars) == 1:
def func(x, c=chars, r=repl):
return x.replace(c, r)
else:
- return functools.partial(
- re.compile("[" + chars + "]").sub, repl)
+ return functools.partial(util.re(f"[{chars}]").sub, repl)
return func
+ def _process_repl_dict(self, chars):
+ # can't modify 'chars' while *directly* iterating over its keys
+ for char in [c for c in chars if len(c) > 1]:
+ if len(char) == 3 and char[1] == "-":
+ citer = range(ord(char[0]), ord(char[2])+1)
+ else:
+ citer = char
+
+ repl = chars.pop(char)
+ for c in citer:
+ chars[c] = repl
+
+ return chars
+
def open(self, mode="wb"):
"""Open file and return a corresponding file object"""
try:
return open(self.temppath, mode)
except FileNotFoundError:
+ if "r" in mode:
+ # '.part' file no longer exists
+ return util.NullContext()
os.makedirs(self.realdirectory)
return open(self.temppath, mode)
@@ -163,8 +181,7 @@ class PathFormat():
return self.check_file()
return False
- @staticmethod
- def check_file():
+ def check_file(self):
return True
def _enum_file(self):
@@ -185,8 +202,7 @@ class PathFormat():
"""Build directory path and create it if necessary"""
self.kwdict = kwdict
- segments = self.build_directory(kwdict)
- if segments:
+ if segments := self.build_directory(kwdict):
self.directory = directory = self.basedirectory + self.clean_path(
os.sep.join(segments) + os.sep)
else:
@@ -263,7 +279,6 @@ class PathFormat():
def build_directory(self, kwdict):
"""Apply 'kwdict' to directory format strings"""
segments = []
- append = segments.append
strip = self.strip
try:
@@ -273,14 +288,13 @@ class PathFormat():
# remove trailing dots and spaces (#647)
segment = segment.rstrip(strip)
if segment:
- append(self.clean_segment(segment))
+ segments.append(self.clean_segment(segment))
return segments
except Exception as exc:
raise exception.DirectoryFormatError(exc)
def build_directory_conditional(self, kwdict):
segments = []
- append = segments.append
strip = self.strip
try:
@@ -294,7 +308,7 @@ class PathFormat():
if strip and segment != "..":
segment = segment.rstrip(strip)
if segment:
- append(self.clean_segment(segment))
+ segments.append(self.clean_segment(segment))
return segments
except Exception as exc:
raise exception.DirectoryFormatError(exc)
@@ -329,6 +343,11 @@ class PathFormat():
pass
return 0
+ def set_mtime(self, path=None):
+ if (mtime := (self.kwdict.get("_mtime_meta") or
+ self.kwdict.get("_mtime_http"))):
+ util.set_mtime(self.realpath if path is None else path, mtime)
+
def finalize(self):
"""Move tempfile to its target location"""
if self.delete:
@@ -362,6 +381,4 @@ class PathFormat():
os.unlink(self.temppath)
break
- mtime = self.kwdict.get("_mtime")
- if mtime:
- util.set_mtime(self.realpath, mtime)
+ self.set_mtime()
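
The dict form of the path-replacement options now accepts "a-c"-style range keys, expanded by _process_repl_dict() before being fed to str.maketrans(). A module-level sketch of that expansion:

    def process_repl_dict(chars):
        # expand multi-char keys ("a-c" ranges, "!?" groups) into
        # per-codepoint entries acceptable to str.maketrans()
        for char in [c for c in chars if len(c) > 1]:
            if len(char) == 3 and char[1] == "-":
                citer = range(ord(char[0]), ord(char[2]) + 1)
            else:
                citer = char
            repl = chars.pop(char)
            for c in citer:
                chars[c] = repl
        return chars

    table = str.maketrans(process_repl_dict({"a-c": "_", "!?": None}))
    print("a!b?c".translate(table))  # ___
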
diff --git a/gallery_dl/postprocessor/__init__.py b/gallery_dl/postprocessor/__init__.py
index dd44a8a..1a4ce56 100644
--- a/gallery_dl/postprocessor/__init__.py
+++ b/gallery_dl/postprocessor/__init__.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2023 Mike Fährmann
+# Copyright 2018-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
diff --git a/gallery_dl/postprocessor/common.py b/gallery_dl/postprocessor/common.py
index 3099547..8da8417 100644
--- a/gallery_dl/postprocessor/common.py
+++ b/gallery_dl/postprocessor/common.py
@@ -22,8 +22,7 @@ class PostProcessor():
return self.__class__.__name__
def _init_archive(self, job, options, prefix=None):
- archive_path = options.get("archive")
- if archive_path:
+ if archive_path := options.get("archive"):
extr = job.extractor
archive_table = options.get("archive-table")
diff --git a/gallery_dl/postprocessor/compare.py b/gallery_dl/postprocessor/compare.py
index c6bc54d..c3d328d 100644
--- a/gallery_dl/postprocessor/compare.py
+++ b/gallery_dl/postprocessor/compare.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2020-2023 Mike Fährmann
+# Copyright 2020-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -21,8 +21,7 @@ class ComparePP(PostProcessor):
self._compare = self._compare_size
self._equal_exc = self._equal_cnt = 0
- equal = options.get("equal")
- if equal:
+ if equal := options.get("equal"):
equal, _, emax = equal.partition(":")
self._equal_max = text.parse_int(emax)
if equal == "abort":
@@ -62,12 +61,10 @@ class ComparePP(PostProcessor):
def _compare(self, f1, f2):
return self._compare_size(f1, f2) and self._compare_content(f1, f2)
- @staticmethod
- def _compare_size(f1, f2):
+ def _compare_size(self, f1, f2):
return os.stat(f1).st_size == os.stat(f2).st_size
- @staticmethod
- def _compare_content(f1, f2):
+ def _compare_content(self, f1, f2):
size = 16384
with open(f1, "rb") as fp1, open(f2, "rb") as fp2:
while True:
diff --git a/gallery_dl/postprocessor/exec.py b/gallery_dl/postprocessor/exec.py
index 7d2be2b..0bfe1a2 100644
--- a/gallery_dl/postprocessor/exec.py
+++ b/gallery_dl/postprocessor/exec.py
@@ -10,13 +10,14 @@
from .common import PostProcessor
from .. import util, formatter
+import subprocess
import os
-import re
if util.WINDOWS:
def quote(s):
- return '"' + s.replace('"', '\\"') + '"'
+ s = s.replace('"', '\\"')
+ return f'"{s}"'
else:
from shlex import quote
@@ -26,17 +27,21 @@ class ExecPP(PostProcessor):
def __init__(self, job, options):
PostProcessor.__init__(self, job)
- if options.get("async", False):
- self._exec = self._exec_async
-
- args = options["command"]
- if isinstance(args, str):
- self.args = args
- self._sub = re.compile(r"\{(_directory|_filename|_path|)\}").sub
- execute = self.exec_string
+ if cmds := options.get("commands"):
+ self.cmds = [self._prepare_cmd(c) for c in cmds]
+ execute = self.exec_many
else:
- self.args = [formatter.parse(arg) for arg in args]
- execute = self.exec_list
+ execute, self.args = self._prepare_cmd(options["command"])
+ if options.get("async", False):
+ self._exec = self._popen
+
+ self.session = False
+ self.creationflags = 0
+ if options.get("session"):
+ if util.WINDOWS:
+ self.creationflags = subprocess.CREATE_NEW_PROCESS_GROUP
+ else:
+ self.session = True
events = options.get("event")
if events is None:
@@ -47,6 +52,13 @@ class ExecPP(PostProcessor):
self._init_archive(job, options)
+ def _prepare_cmd(self, cmd):
+ if isinstance(cmd, str):
+ self._sub = util.re(r"\{(_directory|_filename|_path|)\}").sub
+ return self.exec_string, cmd
+ else:
+ return self.exec_list, [formatter.parse(arg) for arg in cmd]
+
def exec_list(self, pathfmt):
archive = self.archive
kwdict = pathfmt.kwdict
@@ -60,10 +72,11 @@ class ExecPP(PostProcessor):
args = [arg.format_map(kwdict) for arg in self.args]
args[0] = os.path.expanduser(args[0])
- self._exec(args, False)
+ retcode = self._exec(args, False)
if archive:
archive.add(kwdict)
+ return retcode
def exec_string(self, pathfmt):
archive = self.archive
@@ -72,24 +85,47 @@ class ExecPP(PostProcessor):
self.pathfmt = pathfmt
args = self._sub(self._replace, self.args)
- self._exec(args, True)
+ retcode = self._exec(args, True)
if archive:
archive.add(pathfmt.kwdict)
+ return retcode
+
+ def exec_many(self, pathfmt):
+ if archive := self.archive:
+ if archive.check(pathfmt.kwdict):
+ return
+ self.archive = False
+
+ retcode = 0
+ for execute, args in self.cmds:
+ self.args = args
+ if retcode := execute(pathfmt):
+ # non-zero exit status
+ break
+
+ if archive:
+ self.archive = archive
+ archive.add(pathfmt.kwdict)
+ return retcode
def _exec(self, args, shell):
- self.log.debug("Running '%s'", args)
- retcode = util.Popen(args, shell=shell).wait()
- if retcode:
+ if retcode := self._popen(args, shell).wait():
self.log.warning("'%s' returned with non-zero exit status (%d)",
args, retcode)
+ return retcode
- def _exec_async(self, args, shell):
+ def _popen(self, args, shell):
self.log.debug("Running '%s'", args)
- util.Popen(args, shell=shell)
+ return util.Popen(
+ args,
+ shell=shell,
+ creationflags=self.creationflags,
+ start_new_session=self.session,
+ )
def _replace(self, match):
- name = match.group(1)
+ name = match[1]
if name == "_directory":
return quote(self.pathfmt.realdirectory)
if name == "_filename":
diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py
index fbb3fb8..c74f92f 100644
--- a/gallery_dl/postprocessor/metadata.py
+++ b/gallery_dl/postprocessor/metadata.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -55,8 +55,7 @@ class MetadataPP(PostProcessor):
self._json_encode = self._make_encoder(options, 4).encode
ext = "json"
- base_directory = options.get("base-directory")
- if base_directory:
+ if base_directory := options.get("base-directory"):
if base_directory is True:
self._base = lambda p: p.basedirectory
else:
@@ -139,9 +138,7 @@ class MetadataPP(PostProcessor):
archive.add(pathfmt.kwdict)
if self.mtime:
- mtime = pathfmt.kwdict.get("_mtime")
- if mtime:
- util.set_mtime(path, mtime)
+ pathfmt.set_mtime(path)
def _run_stdout(self, pathfmt):
self.write(sys.stdout, pathfmt.kwdict)
@@ -183,8 +180,7 @@ class MetadataPP(PostProcessor):
try:
pathfmt.directory_formatters = self._directory_formatters
pathfmt.directory_conditions = ()
- segments = pathfmt.build_directory(pathfmt.kwdict)
- if segments:
+ if segments := pathfmt.build_directory(pathfmt.kwdict):
directory = pathfmt.clean_path(os.sep.join(segments) + os.sep)
else:
directory = "." + os.sep
@@ -246,8 +242,7 @@ class MetadataPP(PostProcessor):
fp.write(self._json_encode(kwdict) + "\n")
def _make_filter(self, options):
- include = options.get("include")
- if include:
+ if include := options.get("include"):
if isinstance(include, str):
include = include.split(",")
return lambda d: {k: d[k] for k in include if k in d}
@@ -268,8 +263,7 @@ class MetadataPP(PostProcessor):
if not private:
return util.filter_dict
- @staticmethod
- def _make_encoder(options, indent=None):
+ def _make_encoder(self, options, indent=None):
return json.JSONEncoder(
ensure_ascii=options.get("ascii", False),
sort_keys=options.get("sort", False),
diff --git a/gallery_dl/postprocessor/mtime.py b/gallery_dl/postprocessor/mtime.py
index 6ded1e2..b1269dd 100644
--- a/gallery_dl/postprocessor/mtime.py
+++ b/gallery_dl/postprocessor/mtime.py
@@ -17,8 +17,7 @@ class MtimePP(PostProcessor):
def __init__(self, job, options):
PostProcessor.__init__(self, job)
- value = options.get("value")
- if value:
+ if value := options.get("value"):
self._get = formatter.parse(value, None, util.identity).format_map
else:
key = options.get("key", "date")
@@ -36,7 +35,7 @@ class MtimePP(PostProcessor):
if mtime is None:
return
- pathfmt.kwdict["_mtime"] = (
+ pathfmt.kwdict["_mtime_meta"] = (
util.datetime_to_timestamp(mtime)
if isinstance(mtime, datetime) else
text.parse_int(mtime)
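
The old '_mtime' key is split into '_mtime_meta' (set by this postprocessor) and '_mtime_http' (presumably the HTTP Last-Modified value); PathFormat.set_mtime() above prefers the metadata-derived timestamp. The precedence in one line:

    def pick_mtime(kwdict):
        # metadata-derived timestamp wins over the HTTP one
        return kwdict.get("_mtime_meta") or kwdict.get("_mtime_http")

    print(pick_mtime({"_mtime_http": 1700000000}))  # 1700000000
    print(pick_mtime({"_mtime_meta": 1600000000,
                      "_mtime_http": 1700000000}))  # 1600000000
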
diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py
index 5340335..33ebb75 100644
--- a/gallery_dl/postprocessor/ugoira.py
+++ b/gallery_dl/postprocessor/ugoira.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2023 Mike Fährmann
+# Copyright 2018-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -170,8 +170,8 @@ class UgoiraPP(PostProcessor):
for frame in self._files:
# update frame filename extension
- frame["file"] = name = "{}.{}".format(
- frame["file"].partition(".")[0], frame["ext"])
+ frame["file"] = name = \
+ f"{frame['file'].partition('.')[0]}.{frame['ext']}"
if tempdir:
# move frame into tempdir
@@ -236,9 +236,7 @@ class UgoiraPP(PostProcessor):
pathfmt.realpath = pathfmt.temppath
else:
if self.mtime:
- mtime = pathfmt.kwdict.get("_mtime")
- if mtime:
- util.set_mtime(pathfmt.realpath, mtime)
+ pathfmt.set_mtime()
return True
def convert_to_archive(self, pathfmt, tempdir):
@@ -298,8 +296,7 @@ class UgoiraPP(PostProcessor):
def _exec(self, args):
self.log.debug(args)
out = None if self.output else subprocess.DEVNULL
- retcode = util.Popen(args, stdout=out, stderr=out).wait()
- if retcode:
+ if retcode := util.Popen(args, stdout=out, stderr=out).wait():
output.stderr_write("\n")
self.log.error("Non-zero exit status when running %s (%s)",
args, retcode)
@@ -334,7 +331,7 @@ class UgoiraPP(PostProcessor):
last_copy = last.copy()
frames.append(last_copy)
name, _, ext = last_copy["file"].rpartition(".")
- last_copy["file"] = "{:>06}.{}".format(int(name)+1, ext)
+ last_copy["file"] = f"{int(name) + 1:>06}.{ext}"
shutil.copyfile(tempdir + last["file"],
tempdir + last_copy["file"])
@@ -349,10 +346,8 @@ class UgoiraPP(PostProcessor):
"-f", "image2",
"-ts_from_file", "2",
"-pattern_type", "sequence",
- "-i", "{}%06d.{}".format(
- tempdir.replace("%", "%%"),
- frame["file"].rpartition(".")[2]
- ),
+ "-i", (f"{tempdir.replace('%', '%%')}%06d."
+ f"{frame['file'].rpartition('.')[2]}"),
]
def _process_mkvmerge(self, pathfmt, tempdir):
@@ -363,10 +358,8 @@ class UgoiraPP(PostProcessor):
self.ffmpeg,
"-f", "image2",
"-pattern_type", "sequence",
- "-i", "{}/%06d.{}".format(
- tempdir.replace("%", "%%"),
- self._frames[0]["file"].rpartition(".")[2]
- ),
+ "-i", (f"{tempdir.replace('%', '%%')}/%06d."
+ f"{self._frames[0]['file'].rpartition('.')[2]}"),
]
def _finalize_mkvmerge(self, pathfmt, tempdir):
@@ -384,14 +377,13 @@ class UgoiraPP(PostProcessor):
def _write_ffmpeg_concat(self, tempdir):
content = ["ffconcat version 1.0"]
- append = content.append
for frame in self._frames:
- append("file '{}'\nduration {}".format(
- frame["file"], frame["delay"] / 1000))
+ content.append(f"file '{frame['file']}'\n"
+ f"duration {frame['delay'] / 1000}")
if self.repeat:
- append("file '{}'".format(frame["file"]))
- append("")
+ content.append(f"file '{frame['file']}'")
+ content.append("")
ffconcat = tempdir + "/ffconcat.txt"
with open(ffconcat, "w") as fp:
@@ -400,14 +392,13 @@ class UgoiraPP(PostProcessor):
def _write_mkvmerge_timecodes(self, tempdir):
content = ["# timecode format v2"]
- append = content.append
delay_sum = 0
for frame in self._frames:
- append(str(delay_sum))
+ content.append(str(delay_sum))
delay_sum += frame["delay"]
- append(str(delay_sum))
- append("")
+ content.append(str(delay_sum))
+ content.append("")
timecodes = tempdir + "/timecodes.tc"
with open(timecodes, "w") as fp:
@@ -416,24 +407,22 @@ class UgoiraPP(PostProcessor):
def calculate_framerate(self, frames):
if self._delay_is_uniform(frames):
- return ("1000/{}".format(frames[0]["delay"]), None)
+ return (f"1000/{frames[0]['delay']}", None)
if not self.uniform:
gcd = self._delay_gcd(frames)
if gcd >= 10:
- return (None, "1000/{}".format(gcd))
+ return (None, f"1000/{gcd}")
return (None, None)
- @staticmethod
- def _delay_gcd(frames):
+ def _delay_gcd(self, frames):
result = frames[0]["delay"]
for f in frames:
result = gcd(result, f["delay"])
return result
- @staticmethod
- def _delay_is_uniform(frames):
+ def _delay_is_uniform(self, frames):
delay = frames[0]["delay"]
for f in frames:
if f["delay"] != delay:
diff --git a/gallery_dl/text.py b/gallery_dl/text.py
index c1dde94..a7539ad 100644
--- a/gallery_dl/text.py
+++ b/gallery_dl/text.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2022 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,14 +8,29 @@
"""Collection of functions that work on strings/text"""
-import re
import sys
import html
import time
import datetime
import urllib.parse
+import re as re_module
-HTML_RE = re.compile("<[^>]+>")
+try:
+ re_compile = re_module._compiler.compile
+except AttributeError:
+ re_compile = re_module.sre_compile.compile
+
+HTML_RE = re_compile(r"<[^>]+>")
+PATTERN_CACHE = {}
+
+
+def re(pattern):
+ """Compile a regular expression pattern"""
+ try:
+ return PATTERN_CACHE[pattern]
+ except KeyError:
+ p = PATTERN_CACHE[pattern] = re_compile(pattern)
+ return p
def remove_html(txt, repl=" ", sep=" "):
@@ -47,8 +62,13 @@ def slugify(value):
Adapted from:
https://github.com/django/django/blob/master/django/utils/text.py
"""
- value = re.sub(r"[^\w\s-]", "", str(value).lower())
- return re.sub(r"[-\s]+", "-", value).strip("-_")
+ value = re(r"[^\w\s-]").sub("", str(value).lower())
+ return re(r"[-\s]+").sub("-", value).strip("-_")
+
+
+def sanitize_whitespace(value):
+ """Replace all whitespace characters with a single space"""
+ return re(r"\s+").sub(" ", value.strip())
def ensure_http_scheme(url, scheme="https://"):
@@ -100,7 +120,7 @@ def nameext_from_url(url, data=None):
return data
-def extract(txt, begin, end, pos=0):
+def extract(txt, begin, end, pos=None):
"""Extract the text between 'begin' and 'end' from 'txt'
Args:
@@ -125,7 +145,7 @@ def extract(txt, begin, end, pos=0):
last = txt.index(end, first)
return txt[first:last], last+len(end)
except Exception:
- return None, pos
+ return None, 0 if pos is None else pos
def extr(txt, begin, end, default=""):
@@ -137,17 +157,26 @@ def extr(txt, begin, end, default=""):
return default
-def rextract(txt, begin, end, pos=-1):
+def rextract(txt, begin, end, pos=None):
try:
lbeg = len(begin)
- first = txt.rindex(begin, 0, pos)
+ first = txt.rindex(begin, None, pos)
last = txt.index(end, first + lbeg)
return txt[first + lbeg:last], first
except Exception:
- return None, pos
+ return None, -1 if pos is None else pos
+
+
+def rextr(txt, begin, end, pos=None, default=""):
+ """Stripped-down version of 'rextract()'"""
+ try:
+ first = txt.rindex(begin, None, pos) + len(begin)
+ return txt[first:txt.index(end, first)]
+ except Exception:
+ return default
-def extract_all(txt, rules, pos=0, values=None):
+def extract_all(txt, rules, pos=None, values=None):
"""Calls extract for each rule and returns the result in a dict"""
if values is None:
values = {}
@@ -155,10 +184,10 @@ def extract_all(txt, rules, pos=0, values=None):
result, pos = extract(txt, begin, end, pos)
if key:
values[key] = result
- return values, pos
+ return values, 0 if pos is None else pos
-def extract_iter(txt, begin, end, pos=0):
+def extract_iter(txt, begin, end, pos=None):
"""Yield values that would be returned by repeated calls of extract()"""
try:
index = txt.index
@@ -173,7 +202,7 @@ def extract_iter(txt, begin, end, pos=0):
return
-def extract_from(txt, pos=0, default=""):
+def extract_from(txt, pos=None, default=""):
"""Returns a function object that extracts from 'txt'"""
def extr(begin, end, index=txt.index, txt=txt):
nonlocal pos
@@ -190,21 +219,22 @@ def extract_from(txt, pos=0, default=""):
def parse_unicode_escapes(txt):
"""Convert JSON Unicode escapes in 'txt' into actual characters"""
if "\\u" in txt:
- return re.sub(r"\\u([0-9a-fA-F]{4})", _hex_to_char, txt)
+ return re(r"\\u([0-9a-fA-F]{4})").sub(_hex_to_char, txt)
return txt
def _hex_to_char(match):
- return chr(int(match.group(1), 16))
+ return chr(int(match[1], 16))
def parse_bytes(value, default=0, suffixes="bkmgtp"):
"""Convert a bytes-amount ("500k", "2.5M", ...) to int"""
- try:
- last = value[-1].lower()
- except Exception:
+ if not value:
return default
+ value = str(value).strip()
+ last = value[-1].lower()
+
if last in suffixes:
mul = 1024 ** suffixes.index(last)
value = value[:-1]
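
How the reworked parse_bytes() behaves; these values match the test_text.py
cases further down in this diff:

    from gallery_dl import text

    assert text.parse_bytes("50k") == 50 * 1024
    assert text.parse_bytes(" 50p ") == 50 * 1024**5  # whitespace now stripped
    assert text.parse_bytes(123.456) == 123           # non-strings pass through str()
    assert text.parse_bytes("") == 0                  # falsy input -> default
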
@@ -279,12 +309,19 @@ def parse_query_list(qs, as_list=()):
else:
result[name] = [value]
elif name not in result:
- result[name] = unquote(value.replace("+", " "))
+ result[name] = value
except Exception:
pass
return result
+def build_query(params):
+ return "&".join([
+ f"{quote(name)}={quote(value)}"
+ for name, value in params.items()
+ ])
+
+
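
build_query() is the counterpart to parse_query_list(); quote() is assumed to
alias urllib.parse.quote elsewhere in this module:

    build_query({"tags": "blue sky", "page": "2"})
    # -> 'tags=blue%20sky&page=2'
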
if sys.hexversion < 0x30c0000:
# Python <= 3.11
def parse_timestamp(ts, default=None):
@@ -307,12 +344,7 @@ else:
def parse_datetime(date_string, format="%Y-%m-%dT%H:%M:%S%z", utcoffset=0):
"""Create a datetime object by parsing 'date_string'"""
try:
- if format.endswith("%z") and date_string[-3] == ":":
- # workaround for Python < 3.7: +00:00 -> +0000
- ds = date_string[:-3] + date_string[-2:]
- else:
- ds = date_string
- d = datetime.datetime.strptime(ds, format)
+ d = datetime.datetime.strptime(date_string, format)
o = d.utcoffset()
if o is not None:
# convert to naive UTC
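
With the pre-3.7 "%z" colon workaround gone, the parser reduces to a sketch
like the following (the trailing conversion to naive UTC is assumed from the
surrounding context):

    import datetime

    def parse_datetime(date_string, format="%Y-%m-%dT%H:%M:%S%z"):
        d = datetime.datetime.strptime(date_string, format)
        o = d.utcoffset()
        if o is not None:  # timezone-aware -> naive UTC
            d = d.replace(tzinfo=None, microsecond=0) - o
        return d

    parse_datetime("2010-01-01T01:00:00+01:00")  # datetime(2010, 1, 1, 0, 0)
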
diff --git a/gallery_dl/transaction_id.py b/gallery_dl/transaction_id.py
index 89e3d5b..915b7b3 100644
--- a/gallery_dl/transaction_id.py
+++ b/gallery_dl/transaction_id.py
@@ -139,8 +139,7 @@ class ClientTransaction():
(now >> 24) & 0xFF,
)
- payload = "{}!{}!{}{}{}".format(
- method, path, now, keyword, self.animation_key)
+ payload = f"{method}!{path}!{now}{keyword}{self.animation_key}"
bytes_hash = hashlib.sha256(payload.encode()).digest()[:16]
num = (random.randrange(16) << 4) + int((nowf - nowi) * 16.0)
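
A sketch of the payload hashing step above; the concrete method, path, and key
values are hypothetical placeholders for what the surrounding class computes:

    import hashlib

    method, path, now = "GET", "/i/api/endpoint", 1735689600
    keyword, animation_key = "keyword", "key"  # placeholders
    payload = f"{method}!{path}!{now}{keyword}{animation_key}"
    bytes_hash = hashlib.sha256(payload.encode()).digest()[:16]  # first 16 bytes
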
diff --git a/gallery_dl/update.py b/gallery_dl/update.py
index 6650ec4..273ca18 100644
--- a/gallery_dl/update.py
+++ b/gallery_dl/update.py
@@ -1,13 +1,12 @@
# -*- coding: utf-8 -*-
-# Copyright 2024 Mike Fährmann
+# Copyright 2024-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
import os
-import re
import sys
from .extractor.common import Extractor, Message
@@ -98,7 +97,7 @@ class UpdateJob(DownloadJob):
import atexit
import subprocess
- cmd = 'ping 127.0.0.1 -n 5 -w 1000 & del /F "{}"'.format(path_old)
+ cmd = f'ping 127.0.0.1 -n 5 -w 1000 & del /F "{path_old}"'
atexit.register(
util.Popen, cmd, shell=True,
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
@@ -184,17 +183,16 @@ class UpdateExtractor(Extractor):
tag = channel
exact = True
- if re.match(r"\d\.\d+\.\d+", tag):
+ if util.re_compile(r"\d\.\d+\.\d+").match(tag):
tag = "v" + tag
try:
path_repo = REPOS[repo or "stable"]
except KeyError:
- raise exception.StopExtraction("Invalid channel '%s'", repo)
+ raise exception.AbortExtraction(f"Invalid channel '{repo}'")
path_tag = tag if tag == "latest" else "tags/" + tag
- url = "{}/repos/{}/releases/{}".format(
- self.root_api, path_repo, path_tag)
+ url = f"{self.root_api}/repos/{path_repo}/releases/{path_tag}"
headers = {
"Accept": "application/vnd.github+json",
"User-Agent": util.USERAGENT,
@@ -211,8 +209,8 @@ class UpdateExtractor(Extractor):
else:
binary_name = BINARIES[repo][binary]
- url = "{}/{}/releases/download/{}/{}".format(
- self.root, path_repo, data["tag_name"], binary_name)
+ url = (f"{self.root}/{path_repo}/releases/download"
+ f"/{data['tag_name']}/{binary_name}")
yield Message.Directory, data
yield Message.Url, url, data
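
For the default channel this resolves to a GitHub API lookup roughly like the
following (repository path assumed):

    root_api = "https://api.github.com"
    path_repo = "mikf/gallery-dl"  # REPOS["stable"], assumed
    path_tag = "latest"            # or "tags/v1.30.2" for a pinned release
    url = f"{root_api}/repos/{path_repo}/releases/{path_tag}"
    # -> https://api.github.com/repos/mikf/gallery-dl/releases/latest
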
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index ba31ea7..4027ac6 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2017-2023 Mike Fährmann
+# Copyright 2017-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,6 @@
"""Utility functions and classes"""
-import re
import os
import sys
import json
@@ -27,11 +26,6 @@ from http.cookiejar import Cookie
from email.utils import mktime_tz, parsedate_tz
from . import text, version, exception
-try:
- re_compile = re._compiler.compile
-except AttributeError:
- re_compile = re.sre_compile.compile
-
def bencode(num, alphabet="0123456789"):
"""Encode an integer into a base-N encoded string"""
@@ -48,8 +42,7 @@ def bdecode(data, alphabet="0123456789"):
num = 0
base = len(alphabet)
for c in data:
- num *= base
- num += alphabet.index(c)
+ num = num * base + alphabet.find(c)
return num
@@ -135,7 +128,7 @@ def false(_, __=None):
return False
-def noop():
+def noop(_=None):
"""Does nothing"""
@@ -159,18 +152,17 @@ def sha1(s):
def generate_token(size=16):
"""Generate a random token with hexadecimal digits"""
- data = random.getrandbits(size * 8).to_bytes(size, "big")
- return binascii.hexlify(data).decode()
+ return random.getrandbits(size * 8).to_bytes(size, "big").hex()
def format_value(value, suffixes="kMGTPEZY"):
- value = format(value)
+ value = str(value)
value_len = len(value)
index = value_len - 4
if index >= 0:
offset = (value_len - 1) % 3 + 1
- return (value[:offset] + "." + value[offset:offset+2] +
- suffixes[index // 3])
+ return (f"{value[:offset]}.{value[offset:offset+2]}"
+ f"{suffixes[index // 3]}")
return value
@@ -236,6 +228,34 @@ def to_string(value):
return str(value)
+def to_datetime(value):
+ """Convert 'value' to a datetime object"""
+ if not value:
+ return EPOCH
+
+ if isinstance(value, datetime.datetime):
+ return value
+
+ if isinstance(value, str):
+ try:
+ if value[-1] == "Z":
+ # compat for Python < 3.11
+ value = value[:-1]
+ dt = datetime.datetime.fromisoformat(value)
+ if dt.tzinfo is None:
+ if dt.microsecond:
+ dt = dt.replace(microsecond=0)
+ else:
+ # convert to naive UTC
+ dt = dt.astimezone(datetime.timezone.utc).replace(
+ microsecond=0, tzinfo=None)
+ return dt
+ except Exception:
+ pass
+
+ return text.parse_timestamp(value, EPOCH)
+
+
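
A few illustrative calls for the new to_datetime(); every result is a naive
UTC datetime:

    to_datetime("2010-01-01T01:00:00+01:00")  # datetime(2010, 1, 1, 0, 0)
    to_datetime("2010-01-01T00:00:00Z")       # datetime(2010, 1, 1, 0, 0)
    to_datetime(1262304000)  # datetime(2010, 1, 1) via text.parse_timestamp
    to_datetime("")          # EPOCH, i.e. datetime(1970, 1, 1)
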
def datetime_to_timestamp(dt):
"""Convert naive UTC datetime to Unix timestamp"""
return (dt - EPOCH) / SECOND
@@ -298,7 +318,32 @@ def dump_response(response, fp, headers=False, content=True, hide_auth=True):
request = response.request
req_headers = request.headers.copy()
res_headers = response.headers.copy()
- outfmt = """\
+
+ if hide_auth:
+ if authorization := req_headers.get("Authorization"):
+ atype, sep, _ = str(authorization).partition(" ")
+ req_headers["Authorization"] = f"{atype} ***" if sep else "***"
+
+ if cookie := req_headers.get("Cookie"):
+ req_headers["Cookie"] = ";".join(
+ c.partition("=")[0] + "=***"
+ for c in cookie.split(";")
+ )
+
+ if set_cookie := res_headers.get("Set-Cookie"):
+ res_headers["Set-Cookie"] = re(r"(^|, )([^ =]+)=[^,;]*").sub(
+ r"\1\2=***", set_cookie)
+
+ request_headers = "\n".join(
+ f"{name}: {value}"
+ for name, value in req_headers.items()
+ )
+ response_headers = "\n".join(
+ f"{name}: {value}"
+ for name, value in res_headers.items()
+ )
+
+ output = f"""\
{request.method} {request.url}
Status: {response.status_code} {response.reason}
@@ -307,49 +352,17 @@ Request Headers
{request_headers}
"""
if request.body:
- outfmt += """
+ output = f"""{output}
Request Body
------------
{request.body}
"""
- outfmt += """
+ output = f"""{output}
Response Headers
----------------
{response_headers}
"""
- if hide_auth:
- authorization = req_headers.get("Authorization")
- if authorization:
- atype, sep, _ = str(authorization).partition(" ")
- req_headers["Authorization"] = atype + " ***" if sep else "***"
-
- cookie = req_headers.get("Cookie")
- if cookie:
- req_headers["Cookie"] = ";".join(
- c.partition("=")[0] + "=***"
- for c in cookie.split(";")
- )
-
- set_cookie = res_headers.get("Set-Cookie")
- if set_cookie:
- res_headers["Set-Cookie"] = re.sub(
- r"(^|, )([^ =]+)=[^,;]*", r"\1\2=***", set_cookie,
- )
-
- fmt_nv = "{}: {}".format
-
- fp.write(outfmt.format(
- request=request,
- response=response,
- request_headers="\n".join(
- fmt_nv(name, value)
- for name, value in req_headers.items()
- ),
- response_headers="\n".join(
- fmt_nv(name, value)
- for name, value in res_headers.items()
- ),
- ).encode())
+ fp.write(output.encode())
if content:
if headers:
@@ -361,14 +374,11 @@ def extract_headers(response):
headers = response.headers
data = dict(headers)
- hcd = headers.get("content-disposition")
- if hcd:
- name = text.extr(hcd, 'filename="', '"')
- if name:
+ if hcd := headers.get("content-disposition"):
+ if name := text.extr(hcd, 'filename="', '"'):
text.nameext_from_url(name, data)
- hlm = headers.get("last-modified")
- if hlm:
+ if hlm := headers.get("last-modified"):
data["date"] = datetime.datetime(*parsedate_tz(hlm)[:6])
return data
@@ -488,8 +498,7 @@ def cookiestxt_load(fp):
def cookiestxt_store(fp, cookies):
"""Write 'cookies' in Netscape cookies.txt format to 'fp'"""
- write = fp.write
- write("# Netscape HTTP Cookie File\n\n")
+ fp.write("# Netscape HTTP Cookie File\n\n")
for cookie in cookies:
if not cookie.domain:
@@ -503,7 +512,7 @@ def cookiestxt_store(fp, cookies):
value = cookie.value
domain = cookie.domain
- write("\t".join((
+ fp.write("\t".join((
domain,
"TRUE" if domain and domain[0] == "." else "FALSE",
cookie.path,
@@ -568,8 +577,7 @@ class HTTPBasicAuth():
def __init__(self, username, password):
self.authorization = b"Basic " + binascii.b2a_base64(
- username.encode("latin1") + b":" + str(password).encode("latin1")
- )[:-1]
+ f"{username}:{password}".encode("latin1"), newline=False)
def __call__(self, request):
request.headers["Authorization"] = self.authorization
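
The rewritten constructor builds the same header bytes as before; newline=False
keeps b2a_base64 from appending "\n" instead of slicing it off afterwards:

    import binascii

    b"Basic " + binascii.b2a_base64(b"user:pass", newline=False)
    # -> b'Basic dXNlcjpwYXNz'
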
@@ -611,6 +619,28 @@ class NullContext():
pass
+class NullResponse():
+ __slots__ = ("url", "reason")
+
+ ok = is_redirect = is_permanent_redirect = False
+ cookies = headers = history = links = {}
+ encoding = apparent_encoding = "utf-8"
+ content = b""
+ text = ""
+ status_code = 900
+ close = noop
+
+ def __init__(self, url, reason=""):
+ self.url = url
+ self.reason = str(reason)
+
+ def __str__(self):
+ return "900 " + self.reason
+
+ def json(self):
+ return {}
+
+
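
NullResponse mimics just enough of a requests.Response to stand in when no real
response exists; status code 900 sits outside the valid HTTP range, so it
cannot collide with a real status:

    resp = NullResponse("https://example.org/", "connection failed")
    resp.ok           # False
    resp.status_code  # 900
    resp.json()       # {}
    str(resp)         # '900 connection failed'
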
class CustomNone():
"""None-style type that supports more operations than regular None"""
__slots__ = ()
@@ -622,15 +652,14 @@ class CustomNone():
def __call__(self, *args, **kwargs):
return self
- @staticmethod
- def __next__():
+ def __next__(self):
raise StopIteration
def __eq__(self, other):
- return self is other
+ return other is self or other is None
def __ne__(self, other):
- return self is not other
+ return other is not self and other is not None
__lt__ = true
__le__ = true
@@ -671,25 +700,40 @@ class CustomNone():
__abs__ = identity
__invert__ = identity
- @staticmethod
- def __len__():
+ def __len__(self):
return 0
__int__ = __len__
__hash__ = __len__
__index__ = __len__
- @staticmethod
- def __format__(_):
+ def __format__(self, _):
return "None"
- @staticmethod
- def __str__():
+ def __str__(self):
return "None"
__repr__ = __str__
+class Flags():
+
+ def __init__(self):
+ self.FILE = self.POST = self.CHILD = self.DOWNLOAD = None
+
+ def process(self, flag):
+ value = self.__dict__[flag]
+ self.__dict__[flag] = None
+
+ if value == "abort":
+ raise exception.AbortExtraction()
+ if value == "terminate":
+ raise exception.TerminateExtraction()
+ if value == "restart":
+ raise exception.RestartExtraction()
+ raise exception.StopExtraction()
+
+
# v137.0 release of Firefox on 2025-04-01 has ordinal 739342
# 735506 == 739342 - 137 * 28
# v135.0 release of Chrome on 2025-04-01 has ordinal 739342
@@ -701,19 +745,30 @@ class CustomNone():
_ff_ver = (datetime.date.today().toordinal() - 735506) // 28
# _ch_ver = _ff_ver - 2
+re = text.re
+re_compile = text.re_compile
+
NONE = CustomNone()
+FLAGS = Flags()
EPOCH = datetime.datetime(1970, 1, 1)
SECOND = datetime.timedelta(0, 1)
WINDOWS = (os.name == "nt")
SENTINEL = object()
EXECUTABLE = getattr(sys, "frozen", False)
+SPECIAL_EXTRACTORS = {"oauth", "recursive", "generic"}
+
+EXTS_IMAGE = {"jpg", "jpeg", "png", "gif", "bmp", "svg", "psd", "ico",
+ "webp", "avif", "heic", "heif"}
+EXTS_VIDEO = {"mp4", "m4v", "mov", "webm", "mkv", "ogv", "flv", "avi", "wmv"}
+EXTS_ARCHIVE = {"zip", "rar", "7z", "tar", "gz", "bz2", "lzma", "xz"}
+
USERAGENT = "gallery-dl/" + version.__version__
-USERAGENT_FIREFOX = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:{}.0) "
- "Gecko/20100101 Firefox/{}.0").format(_ff_ver, _ff_ver)
+USERAGENT_FIREFOX = (f"Mozilla/5.0 (Windows NT 10.0; Win64; x64; "
+ f"rv:{_ff_ver}.0) Gecko/20100101 Firefox/{_ff_ver}.0")
USERAGENT_CHROME = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
- "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{}.0.0.0 "
- "Safari/537.36").format(_ff_ver - 2)
-SPECIAL_EXTRACTORS = {"oauth", "recursive", "generic"}
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
+ f"Chrome/{_ff_ver - 2}.0.0.0 Safari/537.36")
+
GLOBALS = {
"contains" : contains,
"parse_int": text.parse_int,
@@ -721,12 +776,16 @@ GLOBALS = {
"datetime" : datetime.datetime,
"timedelta": datetime.timedelta,
"abort" : raises(exception.StopExtraction),
+ "error" : raises(exception.AbortExtraction),
"terminate": raises(exception.TerminateExtraction),
"restart" : raises(exception.RestartExtraction),
"hash_sha1": sha1,
"hash_md5" : md5,
"std" : ModuleProxy(),
- "re" : re,
+ "re" : text.re_module,
+ "exts_image" : EXTS_IMAGE,
+ "exts_video" : EXTS_VIDEO,
+ "exts_archive": EXTS_ARCHIVE,
}
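
The new GLOBALS entries become available inside filter expressions. A hedged
sketch of how such an expression is evaluated against a file's metadata:

    expr = compile_expression("extension in exts_image")
    expr({"extension": "png"})  # True
    expr({"extension": "mp4"})  # False
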
@@ -786,10 +845,12 @@ def compile_expression_defaultdict_impl(expr, name="<expr>", globals=None):
def compile_expression_tryexcept(expr, name="<expr>", globals=None):
code_object = compile(expr, name, "eval")
+ if globals is None:
+ globals = GLOBALS
- def _eval(locals=None, globals=(globals or GLOBALS), co=code_object):
+ def _eval(locals=None):
try:
- return eval(co, globals, locals)
+ return eval(code_object, globals, locals)
except exception.GalleryDLException:
raise
except Exception:
@@ -803,7 +864,7 @@ compile_expression = compile_expression_tryexcept
def compile_filter(expr, name="<filter>", globals=None):
if not isinstance(expr, str):
- expr = "(" + ") and (".join(expr) + ")"
+ expr = f"({') and ('.join(expr)})"
return compile_expression(expr, name, globals)
@@ -826,25 +887,25 @@ def import_file(path):
return __import__(name.replace("-", "_"))
-def build_duration_func(duration, min=0.0):
- if not duration:
+def build_selection_func(value, min=0.0, conv=float):
+ if not value:
if min:
return lambda: min
return None
- if isinstance(duration, str):
- lower, _, upper = duration.partition("-")
- lower = float(lower)
+ if isinstance(value, str):
+ lower, _, upper = value.partition("-")
else:
try:
- lower, upper = duration
+ lower, upper = value
except TypeError:
- lower, upper = duration, None
+ lower, upper = value, None
+ lower = conv(lower)
if upper:
- upper = float(upper)
+ upper = conv(upper)
return functools.partial(
- random.uniform,
+ random.uniform if lower.__class__ is float else random.randint,
lower if lower > min else min,
upper if upper > min else min,
)
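
build_selection_func() generalizes the old duration helper to any numeric
range; the conversion function decides between a float and an integer
distribution (illustrative return values):

    build_selection_func("2.0-4.5")
    # -> functools.partial(random.uniform, 2.0, 4.5)
    build_selection_func("500k-1M", 0, text.parse_bytes)
    # -> functools.partial(random.randint, 512000, 1048576)
    build_selection_func(3)
    # -> lambda returning 3.0 (single value, no range)
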
@@ -854,6 +915,9 @@ def build_duration_func(duration, min=0.0):
return lambda: lower
+build_duration_func = build_selection_func
+
+
def build_extractor_filter(categories, negate=True, special=None):
"""Build a function that takes an Extractor class as argument
and returns True if that class is allowed by 'categories'
@@ -931,13 +995,13 @@ def build_proxy_map(proxies, log=None):
proxies[scheme] = "http://" + proxy.lstrip("/")
return proxies
- if log:
+ if log is not None:
log.warning("invalid proxy specifier: %s", proxies)
def build_predicate(predicates):
if not predicates:
- return lambda url, kwdict: True
+ return true
elif len(predicates) == 1:
return predicates[0]
return functools.partial(chain_predicates, predicates)
@@ -977,8 +1041,7 @@ class RangePredicate():
return True
return False
- @staticmethod
- def _parse(rangespec):
+ def _parse(self, rangespec):
"""Parse an integer range string and return the resulting ranges
Examples:
@@ -987,7 +1050,6 @@ class RangePredicate():
_parse("1:2,4:8:2") -> [(1,1), (4,7,2)]
"""
ranges = []
- append = ranges.append
if isinstance(rangespec, str):
rangespec = rangespec.split(",")
@@ -999,7 +1061,7 @@ class RangePredicate():
elif ":" in group:
start, _, stop = group.partition(":")
stop, _, step = stop.partition(":")
- append(range(
+ ranges.append(range(
int(start) if start.strip() else 1,
int(stop) if stop.strip() else sys.maxsize,
int(step) if step.strip() else 1,
@@ -1007,14 +1069,14 @@ class RangePredicate():
elif "-" in group:
start, _, stop = group.partition("-")
- append(range(
+ ranges.append(range(
int(start) if start.strip() else 1,
int(stop) + 1 if stop.strip() else sys.maxsize,
))
else:
start = int(group)
- append(range(start, start+1))
+ ranges.append(range(start, start+1))
return ranges
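
The docstring examples, spelled out as the ranges _parse() now produces:

    _parse("-2,4,6-8,10-")
    # -> [range(1, 3), range(4, 5), range(6, 9), range(10, sys.maxsize)]
    _parse("1:2,4:8:2")
    # -> [range(1, 2), range(4, 8, 2)]
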
@@ -1037,7 +1099,7 @@ class FilterPredicate():
"""Predicate; True if evaluating the given expression returns True"""
def __init__(self, expr, target="image"):
- name = "<{} filter>".format(target)
+ name = f"<{target} filter>"
self.expr = compile_filter(expr, name)
def __call__(self, _, kwdict):
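
compile_filter() joins a list of expressions with 'and' before compiling, so a
FilterPredicate accepts either a single string or a list:

    exprs = ["width >= 1000", "extension in exts_image"]
    source = f"({') and ('.join(exprs)})"
    # -> "(width >= 1000) and (extension in exts_image)"
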
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index e543a31..af7e3c6 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -1,10 +1,10 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2023 Mike Fährmann
+# Copyright 2016-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-__version__ = "1.29.7"
+__version__ = "1.30.2"
__variant__ = None
diff --git a/gallery_dl/ytdl.py b/gallery_dl/ytdl.py
index 319e781..cfc6b50 100644
--- a/gallery_dl/ytdl.py
+++ b/gallery_dl/ytdl.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2021-2023 Mike Fährmann
+# Copyright 2021-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,6 @@
"""Helpers for interacting with youtube-dl"""
-import re
import shlex
import itertools
from . import text, util, exception
@@ -27,14 +26,16 @@ def construct_YoutubeDL(module, obj, user_opts, system_opts=None):
opts = argv = None
config = obj.config
- cfg = config("config-file")
- if cfg:
+ if not config("deprecations"):
+ module.YoutubeDL.deprecated_feature = util.false
+ module.YoutubeDL.deprecation_warning = util.false
+
+ if cfg := config("config-file"):
with open(util.expand_path(cfg)) as fp:
contents = fp.read()
argv = shlex.split(contents, comments=True)
- cmd = config("cmdline-args")
- if cmd:
+ if cmd := config("cmdline-args"):
if isinstance(cmd, str):
cmd = shlex.split(cmd)
argv = (argv + cmd) if argv else cmd
@@ -42,7 +43,7 @@ def construct_YoutubeDL(module, obj, user_opts, system_opts=None):
try:
opts = parse_command_line(module, argv) if argv else user_opts
except SystemExit:
- raise exception.StopExtraction("Invalid command-line option")
+ raise exception.AbortExtraction("Invalid command-line option")
if opts.get("format") is None:
opts["format"] = config("format")
@@ -50,28 +51,35 @@ def construct_YoutubeDL(module, obj, user_opts, system_opts=None):
opts["nopart"] = not config("part", True)
if opts.get("updatetime") is None:
opts["updatetime"] = config("mtime", True)
- if opts.get("ratelimit") is None:
- opts["ratelimit"] = text.parse_bytes(config("rate"), None)
if opts.get("min_filesize") is None:
opts["min_filesize"] = text.parse_bytes(config("filesize-min"), None)
if opts.get("max_filesize") is None:
opts["max_filesize"] = text.parse_bytes(config("filesize-max"), None)
+ if opts.get("ratelimit") is None:
+ if rate := config("rate"):
+ func = util.build_selection_func(rate, 0, text.parse_bytes)
+ if hasattr(func, "args"):
+ opts["__gdl_ratelimit_func"] = func
+ else:
+ opts["ratelimit"] = func() or None
+ else:
+ opts["ratelimit"] = None
- raw_opts = config("raw-options")
- if raw_opts:
+ if raw_opts := config("raw-options"):
opts.update(raw_opts)
if config("logging", True):
opts["logger"] = obj.log
if system_opts:
opts.update(system_opts)
+ opts["__gdl_initialize"] = True
return module.YoutubeDL(opts)
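
The new ratelimit wiring distinguishes fixed rates from random ranges by
duck-typing: functools.partial objects expose .args, plain lambdas do not.
The "__gdl_ratelimit_func" key suggests ranged limits are re-evaluated later,
presumably per download:

    import functools
    import random

    fixed = lambda: 42
    ranged = functools.partial(random.randint, 512000, 1048576)

    hasattr(fixed, "args")   # False -> evaluate once, pass a fixed ratelimit
    hasattr(ranged, "args")  # True  -> keep the callable for later re-rolls
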
def parse_command_line(module, argv):
parser, opts, args = module.parseOpts(argv)
- ytdlp = (module.__name__ == "yt_dlp")
+ ytdlp = hasattr(module, "cookies")
std_headers = module.std_headers
try:
@@ -141,7 +149,7 @@ def parse_command_line(module, argv):
if name not in compat_opts:
return False
compat_opts.discard(name)
- compat_opts.update(["*%s" % name])
+ compat_opts.update([f"*{name}"])
return True
def set_default_compat(
@@ -206,7 +214,7 @@ def parse_command_line(module, argv):
if "pre_process" not in parse_metadata:
parse_metadata["pre_process"] = []
parse_metadata["pre_process"].append(
- "title:%s" % opts.metafromtitle)
+ f"title:{opts.metafromtitle}")
opts.parse_metadata = {
k: list(itertools.chain.from_iterable(map(
metadataparser_actions, v)))
@@ -216,7 +224,7 @@ def parse_command_line(module, argv):
if parse_metadata is None:
parse_metadata = []
if opts.metafromtitle is not None:
- parse_metadata.append("title:%s" % opts.metafromtitle)
+ parse_metadata.append(f"title:{opts.metafromtitle}")
opts.parse_metadata = list(itertools.chain.from_iterable(map(
metadataparser_actions, parse_metadata)))
@@ -250,15 +258,13 @@ def parse_command_line(module, argv):
None if opts.match_filter is None
else module.match_filter_func(opts.match_filter))
- cookiesfrombrowser = getattr(opts, "cookiesfrombrowser", None)
- if cookiesfrombrowser:
- match = re.fullmatch(r"""(?x)
+ if cookiesfrombrowser := getattr(opts, "cookiesfrombrowser", None):
+ pattern = util.re(r"""(?x)
(?P<name>[^+:]+)
(?:\s*\+\s*(?P<keyring>[^:]+))?
(?:\s*:\s*(?!:)(?P<profile>.+?))?
- (?:\s*::\s*(?P<container>.+))?
- """, cookiesfrombrowser)
- if match:
+ (?:\s*::\s*(?P<container>.+))?""")
+ if match := pattern.fullmatch(cookiesfrombrowser):
browser, keyring, profile, container = match.groups()
if keyring is not None:
keyring = keyring.upper()
@@ -518,7 +524,7 @@ def legacy_postprocessors(opts, module, ytdlp, compat_opts):
if len(dur) == 2 and all(t is not None for t in dur):
remove_ranges.append(tuple(dur))
continue
- remove_chapters_patterns.append(re.compile(regex))
+ remove_chapters_patterns.append(util.re(regex))
if opts.remove_chapters or sponsorblock_query:
postprocessors.append({
"key": "ModifyChapters",
diff --git a/setup.py b/setup.py
index 44acef9..c52d1d7 100644
--- a/setup.py
+++ b/setup.py
@@ -28,7 +28,7 @@ def check_file(fname):
VERSION = re.search(
r'__version__\s*=\s*"([^"]+)"',
read("gallery_dl/version.py"),
-).group(1)
+)[1]
FILES = [
(path, [f for f in files if check_file(f)])
@@ -100,13 +100,21 @@ def build_setuptools():
maintainer="Mike Fährmann",
maintainer_email="mike_faehrmann@web.de",
license="GPLv2",
- python_requires=">=3.4",
+ python_requires=">=3.8",
install_requires=[
"requests>=2.11.0",
],
extras_require={
"video": [
- "youtube-dl",
+ "yt-dlp",
+ ],
+ "extra": [
+ "requests[socks]",
+ "yt-dlp[default]",
+ "pyyaml",
+ "toml; python_version < '3.11'",
+ "truststore; python_version >= '3.10'",
+ "secretstorage; sys_platform == 'linux'",
],
},
entry_points={
@@ -127,10 +135,6 @@ def build_setuptools():
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
- "Programming Language :: Python :: 3.4",
- "Programming Language :: Python :: 3.5",
- "Programming Language :: Python :: 3.6",
- "Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
diff --git a/test/test_config.py b/test/test_config.py
index be58456..5c94b1b 100644
--- a/test/test_config.py
+++ b/test/test_config.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-# Copyright 2015-2023 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -223,8 +223,7 @@ class TestConfigFiles(unittest.TestCase):
self.assertIsInstance(cfg, dict)
self.assertTrue(cfg)
- @staticmethod
- def _load(name):
+ def _load(self, name):
path = os.path.join(ROOTDIR, "docs", name)
try:
with open(path) as fp:
diff --git a/test/test_cookies.py b/test/test_cookies.py
index 9ba562c..5900473 100644
--- a/test/test_cookies.py
+++ b/test/test_cookies.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-# Copyright 2017-2023 Mike Fährmann
+# Copyright 2017-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -14,6 +14,7 @@ from unittest import mock
import time
import logging
+import datetime
import tempfile
from os.path import join
@@ -70,8 +71,7 @@ class TestCookiejar(unittest.TestCase):
self.assertEqual(len(cookies), 0)
self.assertEqual(mock_warning.call_count, 1)
- self.assertEqual(mock_warning.call_args[0][0], "cookies: %s")
- self.assertIsInstance(mock_warning.call_args[0][1], exc)
+ self.assertIsInstance(mock_warning.call_args[0][-1], exc)
class TestCookiedict(unittest.TestCase):
@@ -205,27 +205,32 @@ class TestCookieUtils(unittest.TestCase):
now = int(time.time())
log = logging.getLogger("generic")
- extr.cookies.set("a", "1", expires=now-100)
+ extr.cookies.set("a", "1", expires=now-100, domain=".example.org")
with mock.patch.object(log, "warning") as mw:
self.assertFalse(extr.cookies_check(("a",)))
self.assertEqual(mw.call_count, 1)
- self.assertEqual(mw.call_args[0], ("Cookie '%s' has expired", "a"))
+ self.assertEqual(mw.call_args[0], (
+ "cookies: %s/%s expired at %s", "example.org", "a",
+ datetime.datetime.fromtimestamp(now-100)))
- extr.cookies.set("a", "1", expires=now+100)
+ extr.cookies.set("a", "1", expires=now+100, domain=".example.org")
with mock.patch.object(log, "warning") as mw:
self.assertTrue(extr.cookies_check(("a",)))
self.assertEqual(mw.call_count, 1)
self.assertEqual(mw.call_args[0], (
- "Cookie '%s' will expire in less than %s hour%s", "a", 1, ""))
+ "cookies: %s/%s will expire in less than %s hour%s",
+ "example.org", "a", 1, ""))
- extr.cookies.set("a", "1", expires=now+100+7200)
+ extr.cookies.set("a", "1", expires=now+100+7200, domain=".example.org")
with mock.patch.object(log, "warning") as mw:
self.assertTrue(extr.cookies_check(("a",)))
self.assertEqual(mw.call_count, 1)
self.assertEqual(mw.call_args[0], (
- "Cookie '%s' will expire in less than %s hour%s", "a", 3, "s"))
+ "cookies: %s/%s will expire in less than %s hour%s",
+ "example.org", "a", 3, "s"))
- extr.cookies.set("a", "1", expires=now+100+24*3600)
+ extr.cookies.set(
+ "a", "1", expires=now+100+24*3600, domain=".example.org")
with mock.patch.object(log, "warning") as mw:
self.assertTrue(extr.cookies_check(("a",)))
self.assertEqual(mw.call_count, 0)
diff --git a/test/test_downloader.py b/test/test_downloader.py
index 5a9a20b..3e5bf84 100644
--- a/test/test_downloader.py
+++ b/test/test_downloader.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-# Copyright 2018-2022 Mike Fährmann
+# Copyright 2018-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -163,7 +163,7 @@ class TestDownloaderConfig(unittest.TestCase):
self.assertEqual(dl.timeout, 10)
self.assertEqual(dl.verify, False)
self.assertEqual(dl.mtime, False)
- self.assertEqual(dl.rate, 42)
+ self.assertEqual(dl.rate(), 42)
self.assertEqual(dl.part, False)
@@ -332,7 +332,7 @@ class HttpRequestHandler(http.server.BaseHTTPRequestHandler):
status = 206
match = re.match(r"bytes=(\d+)-", self.headers["Range"])
- start = int(match.group(1))
+ start = int(match[1])
headers["Content-Range"] = "bytes {}-{}/{}".format(
start, len(output)-1, len(output))
@@ -369,6 +369,8 @@ SAMPLES = {
("heic", b"????ftypheis"),
("heic", b"????ftypheix"),
("svg" , b"<?xml"),
+ ("html", b"<!DOCTYPE html><html>...</html>"),
+ ("html", b" \n \n\r\t\n <!DOCTYPE html><html>...</html>"),
("ico" , b"\x00\x00\x01\x00"),
("cur" , b"\x00\x00\x02\x00"),
("psd" , b"8BPS"),
diff --git a/test/test_extractor.py b/test/test_extractor.py
index dfc5ff8..bf4aa07 100644
--- a/test/test_extractor.py
+++ b/test/test_extractor.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-# Copyright 2018-2023 Mike Fährmann
+# Copyright 2018-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -122,8 +122,8 @@ class TestExtractorModule(unittest.TestCase):
extr = cls.from_url(url)
except ImportError as exc:
if exc.name in ("youtube_dl", "yt_dlp"):
- print("Skipping '{}' category checks".format(cls.category))
- return
+ return sys.stdout.write(
+ f"Skipping '{cls.category}' category checks\n")
raise
self.assertTrue(extr, url)
@@ -138,46 +138,8 @@ class TestExtractorModule(unittest.TestCase):
self.assertEqual(extr.subcategory, sub, url)
self.assertEqual(extr.basecategory, base, url)
- @unittest.skipIf(not results, "no test data")
- def test_unique_pattern_matches(self):
- # collect testcase URLs
- test_urls = []
- append = test_urls.append
-
- for result in results.all():
- if not result.get("#fail"):
- append((result["#url"], result["#class"]))
-
- # iterate over all testcase URLs
- for url, extr1 in test_urls:
- matches = []
-
- # ... and apply all regex patterns to each one
- for extr2 in _list_classes():
-
- # skip DirectlinkExtractor pattern if it isn't tested
- if extr1 != DirectlinkExtractor and \
- extr2 == DirectlinkExtractor:
- continue
-
- match = extr2.pattern.match(url)
- if match:
- matches.append((match, extr2))
-
- # fail if more or less than 1 match happened
- if len(matches) > 1:
- msg = "'{}' gets matched by more than one pattern:".format(url)
- for match, extr in matches:
- msg += "\n\n- {}:\n{}".format(
- extr.__name__, match.re.pattern)
- self.fail(msg)
-
- elif len(matches) < 1:
- msg = "'{}' isn't matched by any pattern".format(url)
- self.fail(msg)
-
- else:
- self.assertIs(extr1, matches[0][1], url)
+ if base not in ("reactor", "wikimedia"):
+ self.assertEqual(extr._cfgpath, ("extractor", cat, sub), url)
def test_init(self):
"""Test for exceptions in Extractor.initialize() and .finalize()"""
@@ -188,14 +150,16 @@ class TestExtractorModule(unittest.TestCase):
if cls.category == "ytdl":
continue
extr = cls.from_url(cls.example)
- if not extr and cls.basecategory and not cls.instances:
- continue
+ if not extr:
+ if cls.basecategory and not cls.instances:
+ continue
+ self.fail(f"{cls.__name__} pattern does not match "
+ f"example URL '{cls.example}'")
extr.request = fail_request
extr.initialize()
extr.finalize()
- @unittest.skipIf(sys.hexversion < 0x3060000, "test fails in CI")
def test_init_ytdl(self):
try:
extr = extractor.find("ytdl:")
@@ -293,8 +257,7 @@ class TestExtractorWait(unittest.TestCase):
u = self._isotime_to_seconds(until.time().isoformat()[:8])
self.assertLessEqual(o-u, 1.0)
- @staticmethod
- def _isotime_to_seconds(isotime):
+ def _isotime_to_seconds(self, isotime):
parts = isotime.split(":")
return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2])
diff --git a/test/test_formatter.py b/test/test_formatter.py
index 646f179..3305983 100644
--- a/test/test_formatter.py
+++ b/test/test_formatter.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-# Copyright 2021-2023 Mike Fährmann
+# Copyright 2021-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -15,11 +15,19 @@ import datetime
import tempfile
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from gallery_dl import formatter, text, util # noqa E402
+from gallery_dl import formatter, text, util, config # noqa E402
+
+try:
+ import jinja2
+except ImportError:
+ jinja2 = None
class TestFormatter(unittest.TestCase):
+ def tearDown(self):
+ config.clear()
+
kwdict = {
"a": "hElLo wOrLd",
"b": "äöü",
@@ -27,16 +35,23 @@ class TestFormatter(unittest.TestCase):
"d": {"a": "foo", "b": 0, "c": None},
"i": 2,
"l": ["a", "b", "c"],
+ "L": [
+ {"name": "John Doe" , "age": 42, "email": "jd@example.org"},
+ {"name": "Jane Smith" , "age": 24, "email": None},
+ {"name": "Max Mustermann", "age": False},
+ ],
"n": None,
"s": " \n\r\tSPACE ",
+ "S": " \n\r\tS P A\tC\nE ",
"h": "<p>foo </p> &amp; bar <p> </p>",
"u": "&#x27;&lt; / &gt;&#x27;",
"t": 1262304000,
- "ds": "2010-01-01T01:00:00+0100",
+ "ds": "2010-01-01T01:00:00+01:00",
"dt": datetime.datetime(2010, 1, 1),
"dt_dst": datetime.datetime(2010, 6, 1),
"i_str": "12345",
"f_str": "12.45",
+ "lang": "en",
"name": "Name",
"title1": "Title",
"title2": "",
@@ -50,6 +65,7 @@ class TestFormatter(unittest.TestCase):
self._run_test("{a!c}", "Hello world")
self._run_test("{a!C}", "Hello World")
self._run_test("{s!t}", "SPACE")
+ self._run_test("{S!t}", "S P A\tC\nE")
self._run_test("{a!U}", self.kwdict["a"])
self._run_test("{u!U}", "'< / >'")
self._run_test("{a!H}", self.kwdict["a"])
@@ -65,13 +81,22 @@ class TestFormatter(unittest.TestCase):
self._run_test("{n!S}", "")
self._run_test("{t!d}", datetime.datetime(2010, 1, 1))
self._run_test("{t!d:%Y-%m-%d}", "2010-01-01")
+ self._run_test("{t!D}" , datetime.datetime(2010, 1, 1))
+ self._run_test("{ds!D}", datetime.datetime(2010, 1, 1))
+ self._run_test("{dt!D}", datetime.datetime(2010, 1, 1))
+ self._run_test("{t!D:%Y-%m-%d}", "2010-01-01")
self._run_test("{dt!T}", "1262304000")
self._run_test("{l!j}", '["a","b","c"]')
self._run_test("{dt!j}", '"2010-01-01 00:00:00"')
self._run_test("{a!g}", "hello-world")
- self._run_test("{a!L}", 11)
- self._run_test("{l!L}", 3)
- self._run_test("{d!L}", 3)
+ self._run_test("{lang!L}", "English")
+ self._run_test("{'fr'!L}", "French")
+ self._run_test("{a!L}", None)
+ self._run_test("{a!n}", 11)
+ self._run_test("{l!n}", 3)
+ self._run_test("{d!n}", 3)
+ self._run_test("{s!W}", "SPACE")
+ self._run_test("{S!W}", "S P A C E")
self._run_test("{i_str!i}", 12345)
self._run_test("{i_str!f}", 12345.0)
self._run_test("{f_str!f}", 12.45)
@@ -201,7 +226,7 @@ class TestFormatter(unittest.TestCase):
self._run_test("{j:[b:]}" , v)
self._run_test("{j:[b::]}" , v)
- def test_maxlen(self):
+ def test_specifier_maxlen(self):
v = self.kwdict["a"]
self._run_test("{a:L5/foo/}" , "foo")
self._run_test("{a:L50/foo/}", v)
@@ -209,7 +234,7 @@ class TestFormatter(unittest.TestCase):
self._run_test("{a:L50/foo/>51}", "foo")
self._run_test("{a:Lab/foo/}", "foo")
- def test_join(self):
+ def test_specifier_join(self):
self._run_test("{l:J}" , "abc")
self._run_test("{l:J,}" , "a,b,c")
self._run_test("{l:J,/}" , "a,b,c")
@@ -221,7 +246,7 @@ class TestFormatter(unittest.TestCase):
self._run_test("{a:J/}" , self.kwdict["a"])
self._run_test("{a:J, /}" , self.kwdict["a"])
- def test_replace(self):
+ def test_specifier_replace(self):
self._run_test("{a:Rh/C/}" , "CElLo wOrLd")
self._run_test("{a!l:Rh/C/}", "Cello world")
self._run_test("{a!u:Rh/C/}", "HELLO WORLD")
@@ -230,12 +255,12 @@ class TestFormatter(unittest.TestCase):
self._run_test("{a!l:Rl//}" , "heo word")
self._run_test("{name:Rame/othing/}", "Nothing")
- def test_datetime(self):
+ def test_specifier_datetime(self):
self._run_test("{ds:D%Y-%m-%dT%H:%M:%S%z}", "2010-01-01 00:00:00")
- self._run_test("{ds:D%Y}", "2010-01-01T01:00:00+0100")
+ self._run_test("{ds:D%Y}", "2010-01-01T01:00:00+01:00")
self._run_test("{l:D%Y}", "None")
- def test_offset(self):
+ def test_specifier_offset(self):
self._run_test("{dt:O 01:00}", "2010-01-01 01:00:00")
self._run_test("{dt:O+02:00}", "2010-01-01 02:00:00")
self._run_test("{dt:O-03:45}", "2009-12-31 20:15:00")
@@ -246,7 +271,7 @@ class TestFormatter(unittest.TestCase):
self._run_test("{ds:D%Y-%m-%dT%H:%M:%S%z/O1}", "2010-01-01 01:00:00")
self._run_test("{t!d:O2}", "2010-01-01 02:00:00")
- def test_offset_local(self):
+ def test_specifier_offset_local(self):
ts = self.kwdict["dt"].replace(
tzinfo=datetime.timezone.utc).timestamp()
offset = time.localtime(ts).tm_gmtoff
@@ -261,7 +286,7 @@ class TestFormatter(unittest.TestCase):
self._run_test("{dt_dst:O}", str(dt))
self._run_test("{dt_dst:Olocal}", str(dt))
- def test_sort(self):
+ def test_specifier_sort(self):
self._run_test("{l:S}" , "['a', 'b', 'c']")
self._run_test("{l:Sa}", "['a', 'b', 'c']")
self._run_test("{l:Sd}", "['c', 'b', 'a']")
@@ -293,6 +318,19 @@ class TestFormatter(unittest.TestCase):
with self.assertRaises(ValueError):
self._run_test("{a:Xfoo/ */}", "hello wo *")
+ def test_specifier_map(self):
+ self._run_test("{L:Mname/}" ,
+ "['John Doe', 'Jane Smith', 'Max Mustermann']")
+ self._run_test("{L:Mage/}" ,
+ "[42, 24, False]")
+
+ self._run_test("{a:Mname}", self.kwdict["a"])
+ self._run_test("{n:Mname}", "None")
+ self._run_test("{title4:Mname}", "0")
+
+ with self.assertRaises(ValueError):
+ self._run_test("{t:Mname", "")
+
def test_chain_special(self):
# multiple replacements
self._run_test("{a:Rh/C/RE/e/RL/l/}", "Cello wOrld")
@@ -314,6 +352,9 @@ class TestFormatter(unittest.TestCase):
# sort and join
self._run_test("{a:S/J}", " ELLOdhlorw")
+ # map and join
+ self._run_test("{L:Mname/J-}", "John Doe-Jane Smith-Max Mustermann")
+
def test_separator(self):
orig_separator = formatter._SEPARATOR
try:
@@ -420,7 +461,6 @@ class TestFormatter(unittest.TestCase):
self._run_test("\fE name * 2 + ' ' + a", "{}{} {}".format(
self.kwdict["name"], self.kwdict["name"], self.kwdict["a"]))
- @unittest.skipIf(sys.hexversion < 0x3060000, "no fstring support")
def test_fstring(self):
self._run_test("\fF {a}", self.kwdict["a"])
self._run_test("\fF {name}{name} {a}", "{}{} {}".format(
@@ -428,7 +468,6 @@ class TestFormatter(unittest.TestCase):
self._run_test("\fF foo-'\"{a.upper()}\"'-bar",
"""foo-'"{}"'-bar""".format(self.kwdict["a"].upper()))
- @unittest.skipIf(sys.hexversion < 0x3060000, "no fstring support")
def test_template_fstring(self):
with tempfile.TemporaryDirectory() as tmpdirname:
path1 = os.path.join(tmpdirname, "tpl1")
@@ -449,6 +488,90 @@ class TestFormatter(unittest.TestCase):
with self.assertRaises(OSError):
formatter.parse("\fTF /")
+ @unittest.skipIf(jinja2 is None, "no jinja2")
+ def test_jinja(self):
+ formatter.JinjaFormatter.env = None
+
+ self._run_test("\fJ {{a}}", self.kwdict["a"])
+ self._run_test("\fJ {{name}}{{name}} {{a}}", "{}{} {}".format(
+ self.kwdict["name"], self.kwdict["name"], self.kwdict["a"]))
+ self._run_test("\fJ foo-'\"{{a | upper}}\"'-bar",
+ """foo-'"{}"'-bar""".format(self.kwdict["a"].upper()))
+
+ @unittest.skipIf(jinja2 is None, "no jinja2")
+ def test_template_jinja(self):
+ formatter.JinjaFormatter.env = None
+
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ path1 = os.path.join(tmpdirname, "tpl1")
+ path2 = os.path.join(tmpdirname, "tpl2")
+
+ with open(path1, "w") as fp:
+ fp.write("{{a}}")
+ fmt1 = formatter.parse("\fTJ " + path1)
+
+ with open(path2, "w") as fp:
+ fp.write("foo-'\"{{a | upper}}\"'-bar")
+ fmt2 = formatter.parse("\fTJ " + path2)
+
+ self.assertEqual(fmt1.format_map(self.kwdict), self.kwdict["a"])
+ self.assertEqual(fmt2.format_map(self.kwdict),
+ """foo-'"{}"'-bar""".format(self.kwdict["a"].upper()))
+
+ with self.assertRaises(OSError):
+ formatter.parse("\fTJ /")
+
+ @unittest.skipIf(jinja2 is None, "no jinja2")
+ def test_template_jinja_opts(self):
+ formatter.JinjaFormatter.env = None
+
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ path_filters = os.path.join(tmpdirname, "jinja_filters.py")
+ path_template = os.path.join(tmpdirname, "jinja_template.txt")
+
+ config.set((), "jinja", {
+ "environment": {
+ "variable_start_string": "(((",
+ "variable_end_string" : ")))",
+ "keep_trailing_newline": True,
+ },
+ "filters": path_filters,
+ })
+
+ with open(path_filters, "w") as fp:
+ fp.write(r"""
+import re
+
+def datetime_format(value, format="%H:%M %d-%m-%y"):
+ return value.strftime(format)
+
+def sanitize(value):
+ return re.sub(r"\s+", " ", value.strip())
+
+__filters__ = {
+ "dt_fmt": datetime_format,
+ "sanitize_whitespace": sanitize,
+}
+""")
+
+ with open(path_template, "w") as fp:
+ fp.write("""\
+Present Day is ((( dt | dt_fmt("%B %d, %Y") )))
+Present Time is ((( dt | dt_fmt("%H:%M:%S") )))
+
+Hello ((( s | sanitize_whitespace ))).
+I hope there is enough "(((S|sanitize_whitespace)))" for you.
+""")
+ fmt = formatter.parse("\fTJ " + path_template)
+
+ self.assertEqual(fmt.format_map(self.kwdict), """\
+Present Day is January 01, 2010
+Present Time is 00:00:00
+
+Hello SPACE.
+I hope there is enough "S P A C E" for you.
+""")
+
def test_module(self):
with tempfile.TemporaryDirectory() as tmpdirname:
path = os.path.join(tmpdirname, "testmod.py")
@@ -488,10 +611,10 @@ def noarg():
fmt4 = formatter.parse("\fM " + path + ":lengths")
self.assertEqual(fmt1.format_map(self.kwdict), "'Title' by Name")
- self.assertEqual(fmt2.format_map(self.kwdict), "136")
+ self.assertEqual(fmt2.format_map(self.kwdict), "168")
self.assertEqual(fmt3.format_map(self.kwdict), "'Title' by Name")
- self.assertEqual(fmt4.format_map(self.kwdict), "136")
+ self.assertEqual(fmt4.format_map(self.kwdict), "168")
with self.assertRaises(TypeError):
self.assertEqual(fmt0.format_map(self.kwdict), "")
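
A quick map of the formatter changes exercised above (kwdict values as defined
in this test class):

    # {a!n}         -> 11          length; replaces the old {a!L}
    # {lang!L}      -> "English"   !L now resolves language codes to names
    # {S!W}         -> "S P A C E" collapses whitespace runs to single spaces
    # {L:Mname/}    -> ['John Doe', 'Jane Smith', 'Max Mustermann']
    # {L:Mname/J-}  -> 'John Doe-Jane Smith-Max Mustermann'
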
diff --git a/test/test_job.py b/test/test_job.py
index 3e6f85b..3aa28e8 100644
--- a/test/test_job.py
+++ b/test/test_job.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-# Copyright 2021-2023 Mike Fährmann
+# Copyright 2021-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -282,7 +282,12 @@ class TestDataJob(TestJob):
tjob = self.jobclass(extr, file=io.StringIO())
tjob.run()
self.assertEqual(
- tjob.data[-1], ("ZeroDivisionError", "division by zero"))
+ tjob.data[-1],
+ (-1, {
+ "error" : "ZeroDivisionError",
+ "message": "division by zero",
+ })
+ )
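
Errors in the job's data output are now structured entries instead of bare
tuples, so a consumer can key off the -1 marker:

    entry = (-1, {"error": "ZeroDivisionError", "message": "division by zero"})
    if entry[0] == -1:
        print(entry[1]["error"])  # ZeroDivisionError
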
def test_private(self):
config.set(("output",), "private", True)
@@ -364,7 +369,7 @@ class TestExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
self.user = {"id": 123, "name": "test"}
- if match.group(1) == "self":
+ if match[1] == "self":
self.user["self"] = self.user
def items(self):
diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py
index 76e728c..2e39cc7 100644
--- a/test/test_postprocessor.py
+++ b/test/test_postprocessor.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -10,7 +10,7 @@
import os
import sys
import unittest
-from unittest.mock import Mock, mock_open, patch
+from unittest.mock import Mock, mock_open, patch, call
import shutil
import logging
@@ -20,7 +20,7 @@ import collections
from datetime import datetime
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from gallery_dl import extractor, output, path # noqa E402
+from gallery_dl import extractor, output, path, util # noqa E402
from gallery_dl import postprocessor, config # noqa E402
from gallery_dl.postprocessor.common import PostProcessor # noqa E402
@@ -209,7 +209,10 @@ class ExecTest(BasePostprocessorTest):
self.pathfmt.realpath,
self.pathfmt.realdirectory,
self.pathfmt.filename),
- shell=True)
+ shell=True,
+ creationflags=0,
+ start_new_session=False,
+ )
i.wait.assert_called_once_with()
def test_command_list(self):
@@ -231,8 +234,46 @@ class ExecTest(BasePostprocessorTest):
self.pathfmt.realdirectory.upper(),
],
shell=False,
+ creationflags=0,
+ start_new_session=False,
)
+ def test_command_many(self):
+ self._create({
+ "commands": [
+ "echo {} {_path} {_directory} {_filename} && rm {};",
+ ["~/script.sh", "{category}", "\fE _directory.upper()"],
+ ]
+ })
+
+ with patch("gallery_dl.util.Popen") as p:
+ i = Mock()
+ i.wait.return_value = 0
+ p.return_value = i
+ self._trigger(("after",))
+
+ self.assertEqual(p.call_args_list, [
+ call(
+ "echo {0} {0} {1} {2} && rm {0};".format(
+ self.pathfmt.realpath,
+ self.pathfmt.realdirectory,
+ self.pathfmt.filename),
+ shell=True,
+ creationflags=0,
+ start_new_session=False,
+ ),
+ call(
+ [
+ os.path.expanduser("~/script.sh"),
+ self.pathfmt.kwdict["category"],
+ self.pathfmt.realdirectory.upper(),
+ ],
+ shell=False,
+ creationflags=0,
+ start_new_session=False,
+ ),
+ ])
+
def test_command_returncode(self):
self._create({
"command": "echo {}",
@@ -264,6 +305,49 @@ class ExecTest(BasePostprocessorTest):
self.assertTrue(p.called)
self.assertFalse(i.wait.called)
+ @unittest.skipIf(util.WINDOWS, "not POSIX")
+ def test_session_posix(self):
+ self._create({
+ "session": True,
+ "command": ["echo", "foobar"],
+ })
+
+ with patch("gallery_dl.util.Popen") as p:
+ i = Mock()
+ i.wait.return_value = 0
+ p.return_value = i
+ self._trigger(("after",))
+
+ p.assert_called_once_with(
+ ["echo", "foobar"],
+ shell=False,
+ creationflags=0,
+ start_new_session=True,
+ )
+ i.wait.assert_called_once_with()
+
+ @unittest.skipIf(not util.WINDOWS, "not Windows")
+ def test_session_windows(self):
+ self._create({
+ "session": True,
+ "command": ["echo", "foobar"],
+ })
+
+ with patch("gallery_dl.util.Popen") as p:
+ i = Mock()
+ i.wait.return_value = 0
+ p.return_value = i
+ self._trigger(("after",))
+
+ import subprocess
+ p.assert_called_once_with(
+ ["echo", "foobar"],
+ shell=False,
+ creationflags=subprocess.CREATE_NEW_PROCESS_GROUP,
+ start_new_session=False,
+ )
+ i.wait.assert_called_once_with()
+
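
The two tests pin down how the new "session" option maps onto Popen arguments
per platform; a sketch of the selection logic they imply:

    import os
    import subprocess

    def session_kwargs(session):
        if not session:
            return {"creationflags": 0, "start_new_session": False}
        if os.name == "nt":  # Windows has no setsid(); use a new process group
            return {"creationflags": subprocess.CREATE_NEW_PROCESS_GROUP,
                    "start_new_session": False}
        return {"creationflags": 0, "start_new_session": True}
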
class HashTest(BasePostprocessorTest):
@@ -345,9 +429,7 @@ class MetadataTest(BasePostprocessorTest):
path = self.pathfmt.realpath + ".JSON"
m.assert_called_once_with(path, "w", encoding="utf-8")
- if sys.hexversion >= 0x3060000:
- # python 3.4 & 3.5 have random order without 'sort: True'
- self.assertEqual(self._output(m), """{
+ self.assertEqual(self._output(m), """{
"category": "test",
"filename": "file",
"extension": "ext",
@@ -713,8 +795,7 @@ class MetadataTest(BasePostprocessorTest):
}
""")
- @staticmethod
- def _output(mock):
+ def _output(self, mock):
return "".join(
call[1][0]
for call in mock.mock_calls
@@ -727,32 +808,32 @@ class MtimeTest(BasePostprocessorTest):
def test_mtime_datetime(self):
self._create(None, {"date": datetime(1980, 1, 1)})
self._trigger()
- self.assertEqual(self.pathfmt.kwdict["_mtime"], 315532800)
+ self.assertEqual(self.pathfmt.kwdict["_mtime_meta"], 315532800)
def test_mtime_timestamp(self):
self._create(None, {"date": 315532800})
self._trigger()
- self.assertEqual(self.pathfmt.kwdict["_mtime"], 315532800)
+ self.assertEqual(self.pathfmt.kwdict["_mtime_meta"], 315532800)
def test_mtime_none(self):
self._create(None, {"date": None})
self._trigger()
- self.assertNotIn("_mtime", self.pathfmt.kwdict)
+ self.assertNotIn("_mtime_meta", self.pathfmt.kwdict)
def test_mtime_undefined(self):
self._create(None, {})
self._trigger()
- self.assertNotIn("_mtime", self.pathfmt.kwdict)
+ self.assertNotIn("_mtime_meta", self.pathfmt.kwdict)
def test_mtime_key(self):
self._create({"key": "foo"}, {"foo": 315532800})
self._trigger()
- self.assertEqual(self.pathfmt.kwdict["_mtime"], 315532800)
+ self.assertEqual(self.pathfmt.kwdict["_mtime_meta"], 315532800)
def test_mtime_value(self):
self._create({"value": "{foo}"}, {"foo": 315532800})
self._trigger()
- self.assertEqual(self.pathfmt.kwdict["_mtime"], 315532800)
+ self.assertEqual(self.pathfmt.kwdict["_mtime_meta"], 315532800)
class PythonTest(BasePostprocessorTest):
@@ -945,8 +1026,8 @@ class ZipTest(BasePostprocessorTest):
self._trigger(("finalize",))
self.assertEqual(pp.zfile.write.call_count, 3)
- for call in pp.zfile.write.call_args_list:
- args, kwargs = call
+ for call_args in pp.zfile.write.call_args_list:
+ args, kwargs = call_args
self.assertEqual(len(args), 2)
self.assertEqual(len(kwargs), 0)
self.assertEqual(args[0], self.pathfmt.temppath)
diff --git a/test/test_results.py b/test/test_results.py
index 6e04e1d..4b1c4c1 100644
--- a/test/test_results.py
+++ b/test/test_results.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-# Copyright 2015-2023 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -80,9 +80,9 @@ class TestExtractorResults(unittest.TestCase):
@classmethod
def tearDownClass(cls):
if cls._skipped:
- print("\n\nSkipped tests:")
- for url, exc in cls._skipped:
- print('- {} ("{}")'.format(url, exc))
+ sys.stdout.write("\n\nSkipped tests:\n")
+ for url, reason in cls._skipped:
+ sys.stdout.write(f'- {url} ("{reason}")\n')
def assertRange(self, value, range, msg=None):
if range.step > 1:
@@ -91,6 +91,24 @@ class TestExtractorResults(unittest.TestCase):
self.assertLessEqual(value, range.stop, msg=msg)
self.assertGreaterEqual(value, range.start, msg=msg)
+ def assertLogEqual(self, expected, output):
+ if isinstance(expected, str):
+ expected = (expected,)
+ self.assertEqual(len(expected), len(output), "#log/count")
+
+ for exp, out in zip(expected, output):
+ level, name, message = out.split(":", 2)
+
+ if isinstance(exp, str):
+ return self.assertEqual(exp, message, "#log")
+
+ self.assertEqual(exp[0].lower(), level.lower(), "#log/level")
+ if len(exp) < 3:
+ self.assertEqual(exp[1], message, "#log/message")
+ else:
+ self.assertEqual(exp[1], name , "#log/name")
+ self.assertEqual(exp[2], message, "#log/message")
+
def _run_test(self, result):
if result.get("#fail"):
del result["#fail"]
@@ -145,7 +163,11 @@ class TestExtractorResults(unittest.TestCase):
return
try:
- tjob.run()
+ if "#log" in result:
+ with self.assertLogs() as log_info:
+ tjob.run()
+ else:
+ tjob.run()
except exception.StopExtraction:
pass
except exception.HttpError as exc:
@@ -156,6 +178,9 @@ class TestExtractorResults(unittest.TestCase):
self.skipTest(exc)
raise
+ if "#log" in result:
+ self.assertLogEqual(result["#log"], log_info.output)
+
if result.get("#archive", True):
self.assertEqual(
len(set(tjob.archive_list)),
@@ -220,13 +245,15 @@ class TestExtractorResults(unittest.TestCase):
for url, pat in zip(tjob.url_list, pattern):
self.assertRegex(url, pat, msg="#pattern")
- if "#urls" in result:
- expected = result["#urls"]
+ if "#results" in result:
+ expected = result["#results"]
if isinstance(expected, str):
- self.assertTrue(tjob.url_list, msg="#urls")
- self.assertEqual(tjob.url_list[0], expected, msg="#urls")
+ self.assertTrue(tjob.url_list, msg="#results")
+ self.assertEqual(
+ tjob.url_list[0], expected, msg="#results")
else:
- self.assertSequenceEqual(tjob.url_list, expected, msg="#urls")
+ self.assertSequenceEqual(
+ tjob.url_list, expected, msg="#results")
metadata = {k: v for k, v in result.items() if k[0] != "#"}
if metadata:
@@ -235,56 +262,74 @@ class TestExtractorResults(unittest.TestCase):
def _test_kwdict(self, kwdict, tests, parent=None):
for key, test in tests.items():
+
if key.startswith("?"):
key = key[1:]
if key not in kwdict:
continue
+ if key.endswith("[*]"):
+ key = key[:-3]
+ subtest = True
+ else:
+ subtest = False
+
path = "{}.{}".format(parent, key) if parent else key
+
if key.startswith("!"):
self.assertNotIn(key[1:], kwdict, msg=path)
continue
+
self.assertIn(key, kwdict, msg=path)
value = kwdict[key]
- if isinstance(test, dict):
- self._test_kwdict(value, test, path)
- elif isinstance(test, type):
- self.assertIsInstance(value, test, msg=path)
- elif isinstance(test, range):
- self.assertRange(value, test, msg=path)
- elif isinstance(test, set):
- try:
- self.assertIn(value, test, msg=path)
- except AssertionError:
- self.assertIn(type(value), test, msg=path)
- elif isinstance(test, list):
- subtest = False
- for idx, item in enumerate(test):
- if isinstance(item, dict):
- subtest = True
- subpath = "{}[{}]".format(path, idx)
- self._test_kwdict(value[idx], item, subpath)
- if not subtest:
- self.assertEqual(test, value, msg=path)
- elif isinstance(test, str):
- if test.startswith("re:"):
- self.assertRegex(value, test[3:], msg=path)
- elif test.startswith("dt:"):
- self.assertIsInstance(value, datetime.datetime, msg=path)
- self.assertEqual(test[3:], str(value), msg=path)
- elif test.startswith("type:"):
- self.assertEqual(test[5:], type(value).__name__, msg=path)
- elif test.startswith("len:"):
- cls, _, length = test[4:].rpartition(":")
- if cls:
- self.assertEqual(
- cls, type(value).__name__, msg=path + "/type")
- self.assertEqual(int(length), len(value), msg=path)
- else:
- self.assertEqual(test, value, msg=path)
+ if subtest:
+ self.assertNotIsInstance(value, str, msg=path)
+ for idx, item in enumerate(value):
+ subpath = "{}[{}]".format(path, idx)
+ self._test_kwdict_value(item, test, subpath)
else:
+ self._test_kwdict_value(value, test, path)
+
+ def _test_kwdict_value(self, value, test, path):
+ if isinstance(test, dict):
+ self._test_kwdict(value, test, path)
+ elif isinstance(test, type):
+ self.assertIsInstance(value, test, msg=path)
+ elif isinstance(test, range):
+ self.assertRange(value, test, msg=path)
+ elif isinstance(test, set):
+ try:
+ self.assertIn(value, test, msg=path)
+ except AssertionError:
+ self.assertIn(type(value), test, msg=path)
+ elif isinstance(test, list):
+ subtest = False
+ for idx, item in enumerate(test):
+ if isinstance(item, dict):
+ subtest = True
+ subpath = "{}[{}]".format(path, idx)
+ self._test_kwdict(value[idx], item, subpath)
+ if not subtest:
self.assertEqual(test, value, msg=path)
+ elif isinstance(test, str):
+ if test.startswith("re:"):
+ self.assertRegex(value, test[3:], msg=path)
+ elif test.startswith("dt:"):
+ self.assertIsInstance(value, datetime.datetime, msg=path)
+ self.assertEqual(test[3:], str(value), msg=path)
+ elif test.startswith("type:"):
+ self.assertEqual(test[5:], type(value).__name__, msg=path)
+ elif test.startswith("len:"):
+ cls, _, length = test[4:].rpartition(":")
+ if cls:
+ self.assertEqual(
+ cls, type(value).__name__, msg=path + "/type")
+ self.assertEqual(int(length), len(value), msg=path)
+ else:
+ self.assertEqual(test, value, msg=path)
+ else:
+ self.assertEqual(test, value, msg=path)
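
The new "[*]" key suffix applies a single test to every element of a list
value, for example:

    tests = {"tags[*]": str}             # each kwdict["tags"] entry must be a str
    tests = {"?files[*]": {"url": str}}  # optional key; each dict needs a str url
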
class ResultJob(job.DownloadJob):
@@ -402,27 +447,31 @@ class TestPathfmt():
class TestFormatter(formatter.StringFormatter):
- @staticmethod
- def _noop(_):
- return ""
-
def _apply_simple(self, key, fmt):
if key == "extension" or "_parse_optional." in repr(fmt):
- return self._noop
-
- def wrap(obj):
- return fmt(obj[key])
+ def wrap(obj):
+ try:
+ return fmt(obj[key])
+ except KeyError:
+ return ""
+ else:
+ def wrap(obj):
+ return fmt(obj[key])
return wrap
def _apply(self, key, funcs, fmt):
if key == "extension" or "_parse_optional." in repr(fmt):
- return self._noop
-
- def wrap(obj):
- obj = obj[key]
- for func in funcs:
- obj = func(obj)
- return fmt(obj)
+ def wrap(obj):
+ obj = obj[key] if key in obj else ""
+ for func in funcs:
+ obj = func(obj)
+ return fmt(obj)
+ else:
+ def wrap(obj):
+ obj = obj[key]
+ for func in funcs:
+ obj = func(obj)
+ return fmt(obj)
return wrap
@@ -457,7 +506,10 @@ def generate_tests():
"""Dynamically generate extractor unittests"""
def _generate_method(result):
def test(self):
- print("\n" + result["#url"])
+ sys.stdout.write(f"\n{result['#url']}\n")
+ if "#comment" in result:
+ sys.stdout.write(f"# {result['#comment']}\n")
+
try:
self._run_test(result)
except KeyboardInterrupt as exc:
diff --git a/test/test_text.py b/test/test_text.py
index d42507c..13029d2 100644
--- a/test/test_text.py
+++ b/test/test_text.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-# Copyright 2015-2022 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -23,6 +23,20 @@ INVALID_ALT = ((), [], {}, None, "")
class TestText(unittest.TestCase):
+ def test_re(self):
+ p1 = text.re_compile("foo")
+ p2 = text.re("foo")
+ p3 = text.re("foo")
+
+ Pattern = text.re_module.Pattern
+ self.assertIsInstance(p1, Pattern)
+ self.assertIsInstance(p2, Pattern)
+ self.assertIsInstance(p3, Pattern)
+
+ self.assertEqual(p1, p2)
+ self.assertIsNot(p1, p2)
+ self.assertIs(p2, p3)
+
def test_remove_html(self, f=text.remove_html):
result = "Hello World."
@@ -92,6 +106,17 @@ class TestText(unittest.TestCase):
self.assertEqual(f(1), "1")
self.assertEqual(f(2.3), "23")
+ def test_sanitize_whitespace(self, f=text.sanitize_whitespace):
+ self.assertEqual(f("Hello World"), "Hello World")
+ self.assertEqual(f("Hello\tWorld"), "Hello World")
+ self.assertEqual(f(" Hello World "), "Hello World")
+ self.assertEqual(f("\tHello \n\tWorld "), "Hello World")
+
+ self.assertEqual(f(""), "")
+ self.assertEqual(f(" "), "")
+ self.assertEqual(f(" "), "")
+ self.assertEqual(f(" \t\n "), "")
+
def test_ensure_http_scheme(self, f=text.ensure_http_scheme):
result = "https://example.org/filename.ext"
@@ -241,6 +266,29 @@ class TestText(unittest.TestCase):
self.assertEqual(f(txt , value, ">") , (None, -1))
self.assertEqual(f(txt , "<" , value), (None, -1))
+ def test_rextr(self, f=text.rextr):
+ txt = "<a><b>"
+ self.assertEqual(f(txt, "<", ">"), "b")
+ self.assertEqual(f(txt, "X", ">"), "")
+ self.assertEqual(f(txt, "<", "X"), "")
+
+ # 'pos' argument
+ for i in range(10, 3, -1):
+ self.assertEqual(f(txt, "<", ">", i), "b")
+ for i in range(3, 0, -1):
+ self.assertEqual(f(txt, "<", ">", i), "a")
+
+ # 'default' argument
+ self.assertEqual(f(txt, "[", "]", -1, "none"), "none")
+ self.assertEqual(f(txt, "[", "]", None, "none"), "none")
+ self.assertEqual(f(txt, "[", "]", default="none"), "none")
+
+ # invalid arguments
+ for value in INVALID:
+ self.assertEqual(f(value, "<" , ">") , "")
+ self.assertEqual(f(txt , value, ">") , "")
+ self.assertEqual(f(txt , "<" , value), "")
+
def test_extract_all(self, f=text.extract_all):
txt = "[c][b][a]: xyz! [d][e"
@@ -336,6 +384,8 @@ class TestText(unittest.TestCase):
)
def test_parse_bytes(self, f=text.parse_bytes):
+ self.assertEqual(f(0), 0)
+ self.assertEqual(f(50), 50)
self.assertEqual(f("0"), 0)
self.assertEqual(f("50"), 50)
self.assertEqual(f("50k"), 50 * 1024**1)
@@ -343,10 +393,13 @@ class TestText(unittest.TestCase):
self.assertEqual(f("50g"), 50 * 1024**3)
self.assertEqual(f("50t"), 50 * 1024**4)
self.assertEqual(f("50p"), 50 * 1024**5)
+ self.assertEqual(f(" 50p "), 50 * 1024**5)
# fractions
+ self.assertEqual(f(123.456), 123)
self.assertEqual(f("123.456"), 123)
self.assertEqual(f("123.567"), 124)
+ self.assertEqual(f(" 123.89 "), 124)
self.assertEqual(f("0.5M"), round(0.5 * 1024**2))
# invalid arguments
@@ -405,8 +458,12 @@ class TestText(unittest.TestCase):
# missing value
self.assertEqual(f("bar"), {})
+ self.assertEqual(f("bar="), {"bar": ""})
self.assertEqual(f("foo=1&bar"), {"foo": "1"})
+ self.assertEqual(f("foo=1&bar="), {"foo": "1", "bar": ""})
self.assertEqual(f("foo=1&bar&baz=3"), {"foo": "1", "baz": "3"})
+ self.assertEqual(f("foo=1&bar=&baz=3"),
+ {"foo": "1", "bar": "", "baz": "3"})
# keys with identical names
self.assertEqual(f("foo=1&foo=2"), {"foo": "1"})
@@ -424,6 +481,8 @@ class TestText(unittest.TestCase):
self.assertEqual(f(""), {})
self.assertEqual(f("foo=1"), {"foo": "1"})
self.assertEqual(f("foo=1&bar=2"), {"foo": "1", "bar": "2"})
+ self.assertEqual(f("%C3%A4%26=%E3%81%82%E3%81%A8&%23=%3F"),
+ {"ä&": "あと", "#": "?"})
# missing value
self.assertEqual(f("bar"), {})
@@ -441,6 +500,21 @@ class TestText(unittest.TestCase):
for value in INVALID:
self.assertEqual(f(value), {})
+ def test_build_query(self, f=text.build_query):
+ # standard usage
+ self.assertEqual(f({}), "")
+ self.assertEqual(f({"foo": "1"}), "foo=1")
+ self.assertEqual(f({"foo": "1", "bar": "2"}), "foo=1&bar=2")
+
+ # missing value
+ self.assertEqual(f({"bar": ""}), "bar=")
+ self.assertEqual(f({"foo": "1", "bar": ""}), "foo=1&bar=")
+ self.assertEqual(f({"foo": "1", "bar": "", "baz": "3"}),
+ "foo=1&bar=&baz=3")
+
+ self.assertEqual(f({"ä&": "あと", "#": "?"}),
+ "%C3%A4%26=%E3%81%82%E3%81%A8&%23=%3F")
+
def test_parse_timestamp(self, f=text.parse_timestamp):
null = util.datetime_utcfromtimestamp(0)
value = util.datetime_utcfromtimestamp(1555816235)
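The new test_build_query() fixes the expected encoding: keys and values are
both percent-encoded, insertion order is preserved, and empty values keep
their trailing '='. A standalone equivalent built on urllib (the real
text.build_query may be implemented differently):

    from urllib.parse import quote

    def build_query(params):
        # quote(..., safe="") also encodes '/', '&', '#', and '?'
        return "&".join(
            f"{quote(key, safe='')}={quote(value, safe='')}"
            for key, value in params.items())

    assert build_query({"foo": "1", "bar": ""}) == "foo=1&bar="
    assert build_query({"ä&": "あと"}) == "%C3%A4%26=%E3%81%82%E3%81%A8"
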
diff --git a/test/test_util.py b/test/test_util.py
index 27f78ec..00e8c4b 100644
--- a/test/test_util.py
+++ b/test/test_util.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-# Copyright 2015-2023 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -10,6 +10,7 @@
import os
import sys
import unittest
+from unittest.mock import patch
import io
import time
@@ -27,11 +28,18 @@ from gallery_dl import util, text, exception # noqa E402
class TestRange(unittest.TestCase):
- def test_parse_empty(self, f=util.RangePredicate._parse):
+ def setUp(self):
+ self.predicate = util.RangePredicate("")
+
+ def test_parse_empty(self):
+ f = self.predicate._parse
+
self.assertEqual(f(""), [])
self.assertEqual(f([]), [])
- def test_parse_digit(self, f=util.RangePredicate._parse):
+ def test_parse_digit(self):
+ f = self.predicate._parse
+
self.assertEqual(f("2"), [range(2, 3)])
self.assertEqual(
@@ -41,7 +49,9 @@ class TestRange(unittest.TestCase):
range(4, 5)],
)
- def test_parse_range(self, f=util.RangePredicate._parse):
+ def test_parse_range(self):
+ f = self.predicate._parse
+
self.assertEqual(f("1-2"), [range(1, 3)])
self.assertEqual(f("2-"), [range(2, sys.maxsize)])
self.assertEqual(f("-3"), [range(1, 4)])
@@ -61,7 +71,9 @@ class TestRange(unittest.TestCase):
range(2, 7)],
)
- def test_parse_slice(self, f=util.RangePredicate._parse):
+ def test_parse_slice(self):
+ f = self.predicate._parse
+
self.assertEqual(f("2:4") , [range(2, 4)])
self.assertEqual(f("3::") , [range(3, sys.maxsize)])
self.assertEqual(f(":4:") , [range(1, 4)])
@@ -149,6 +161,10 @@ class TestPredicate(unittest.TestCase):
self.assertFalse(pred(url, {"a": 2}))
+ pred = util.FilterPredicate("re.search(r'.+', url)")
+ self.assertTrue(pred(url, {"url": "https://example.org/"}))
+ self.assertFalse(pred(url, {"url": ""}))
+
def test_build_predicate(self):
pred = util.build_predicate([])
self.assertIsInstance(pred, type(lambda: True))
@@ -390,6 +406,89 @@ def hash(value):
self.assertEqual(expr(value), result)
+class TestDatetime(unittest.TestCase):
+
+ def test_to_datetime(self, f=util.to_datetime):
+
+ def _assert(value, expected):
+ result = f(value)
+ self.assertIsInstance(result, datetime.datetime)
+ self.assertEqual(result, expected, msg=repr(value))
+
+ dt = datetime.datetime(2010, 1, 1)
+ self.assertIs(f(dt), dt)
+
+ _assert(dt , dt)
+ _assert(1262304000 , dt)
+ _assert(1262304000.0 , dt)
+ _assert(1262304000.123, dt)
+ _assert("1262304000" , dt)
+
+ _assert("2010-01-01" , dt)
+ _assert("2010-01-01 00:00:00" , dt)
+ _assert("2010-01-01T00:00:00" , dt)
+ _assert("2010-01-01T00:00:00.123456" , dt)
+ _assert("2009-12-31T19:00:00-05:00" , dt)
+ _assert("2009-12-31T19:00:00.123456-05:00", dt)
+ _assert("2010-01-01T00:00:00Z" , dt)
+ _assert("2010-01-01T00:00:00.123456Z" , dt)
+
+ _assert(0 , util.EPOCH)
+ _assert("" , util.EPOCH)
+ _assert("foo", util.EPOCH)
+ _assert(None , util.EPOCH)
+ _assert(() , util.EPOCH)
+ _assert([] , util.EPOCH)
+ _assert({} , util.EPOCH)
+ _assert((1, 2, 3), util.EPOCH)
+
+ @unittest.skipIf(sys.hexversion < 0x30b0000,
+ "extended fromisoformat timezones")
+ def test_to_datetime_tz(self, f=util.to_datetime):
+
+ def _assert(value, expected):
+ result = f(value)
+ self.assertIsInstance(result, datetime.datetime)
+ self.assertEqual(result, expected, msg=repr(value))
+
+ dt = datetime.datetime(2010, 1, 1)
+
+ _assert("2009-12-31T19:00:00-05" , dt)
+ _assert("2009-12-31T19:00:00-0500" , dt)
+ _assert("2009-12-31T19:00:00.123456-05" , dt)
+ _assert("2009-12-31T19:00:00.123456-0500" , dt)
+
+ def test_datetime_to_timestamp(self, f=util.datetime_to_timestamp):
+ self.assertEqual(f(util.EPOCH), 0.0)
+ self.assertEqual(f(datetime.datetime(2010, 1, 1)), 1262304000.0)
+ self.assertEqual(f(datetime.datetime(2010, 1, 1, 0, 0, 0, 128000)),
+ 1262304000.128000)
+ with self.assertRaises(TypeError):
+ f(None)
+
+ def test_datetime_to_timestamp_string(
+ self, f=util.datetime_to_timestamp_string):
+ self.assertEqual(f(util.EPOCH), "0")
+ self.assertEqual(f(datetime.datetime(2010, 1, 1)), "1262304000")
+ self.assertEqual(f(None), "")
+
+ def test_datetime_from_timestamp(
+ self, f=util.datetime_from_timestamp):
+ self.assertEqual(f(0.0), util.EPOCH)
+ self.assertEqual(f(1262304000.0), datetime.datetime(2010, 1, 1))
+ self.assertEqual(f(1262304000.128000).replace(microsecond=0),
+ datetime.datetime(2010, 1, 1, 0, 0, 0))
+
+ def test_datetime_utcfromtimestamp(
+ self, f=util.datetime_utcfromtimestamp):
+ self.assertEqual(f(0.0), util.EPOCH)
+ self.assertEqual(f(1262304000.0), datetime.datetime(2010, 1, 1))
+
+ def test_datetime_utcnow(
+ self, f=util.datetime_utcnow):
+ self.assertIsInstance(f(), datetime.datetime)
+
+
class TestOther(unittest.TestCase):
def test_bencode(self):
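The TestDatetime block pins down util.to_datetime(): datetime objects pass
through unchanged, numbers and numeric strings are treated as Unix timestamps
(fractions dropped), ISO 8601 strings are normalized to naive UTC, and
everything else falls back to util.EPOCH. A behavior-compatible sketch (the
real implementation may be structured differently):

    import datetime

    EPOCH = datetime.datetime(1970, 1, 1)

    def to_datetime(value):
        if isinstance(value, datetime.datetime):
            return value
        try:  # Unix timestamp: int, float, or numeric string
            return EPOCH + datetime.timedelta(seconds=int(float(value)))
        except (TypeError, ValueError):
            pass
        try:  # ISO 8601 string; normalize offsets to naive UTC
            dt = datetime.datetime.fromisoformat(
                str(value).replace("Z", "+00:00"))
            if dt.tzinfo is not None:
                dt = dt.astimezone(datetime.timezone.utc).replace(tzinfo=None)
            return dt.replace(microsecond=0)
        except ValueError:
            return EPOCH

    dt = to_datetime("2009-12-31T19:00:00-05:00")
    assert dt == datetime.datetime(2010, 1, 1)
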
@@ -492,6 +591,7 @@ class TestOther(unittest.TestCase):
def test_noop(self):
self.assertEqual(util.noop(), None)
+ self.assertEqual(util.noop(...), None)
def test_md5(self):
self.assertEqual(util.md5(b""),
@@ -552,17 +652,21 @@ value = 123
self.assertEqual(module.value, 123)
self.assertIs(module.datetime, datetime)
- def test_build_duration_func(self, f=util.build_duration_func):
+ def test_build_selection_func(self, f=util.build_selection_func):
- def test_single(df, v):
+ def test_single(df, v, type=None):
for _ in range(10):
self.assertEqual(df(), v)
+ if type is not None:
+ self.assertIsInstance(df(), type)
- def test_range(df, lower, upper):
+ def test_range(df, lower, upper, type=None):
for __ in range(10):
v = df()
self.assertGreaterEqual(v, lower)
self.assertLessEqual(v, upper)
+ if type is not None:
+ self.assertIsInstance(v, type)
for v in (0, 0.0, "", None, (), []):
self.assertIsNone(f(v))
@@ -570,16 +674,24 @@ value = 123
for v in (0, 0.0, "", None, (), []):
test_single(f(v, 1.0), 1.0)
- test_single(f(3), 3)
- test_single(f(3.0), 3.0)
- test_single(f("3"), 3)
- test_single(f("3.0-"), 3)
- test_single(f(" 3 -"), 3)
+ test_single(f(3) , 3 , float)
+ test_single(f(3.0) , 3.0, float)
+ test_single(f("3") , 3 , float)
+ test_single(f("3.0-") , 3 , float)
+ test_single(f(" 3 -"), 3 , float)
- test_range(f((2, 4)), 2, 4)
- test_range(f([2, 4]), 2, 4)
- test_range(f("2-4"), 2, 4)
- test_range(f(" 2.0 - 4 "), 2, 4)
+ test_range(f((2, 4)) , 2, 4, float)
+ test_range(f([2.0, 4.0]) , 2, 4, float)
+ test_range(f("2-4") , 2, 4, float)
+ test_range(f(" 2.0 - 4 "), 2, 4, float)
+
+ pb = text.parse_bytes
+ test_single(f("3", 0, pb) , 3, int)
+ test_single(f("3.0-", 0, pb) , 3, int)
+ test_single(f(" 3 -", 0, pb), 3, int)
+
+ test_range(f("2k-4k", 0, pb) , 2048, 4096, int)
+ test_range(f(" 2.0k - 4k ", 0, pb), 2048, 4096, int)
def test_extractor_filter(self):
# empty
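build_duration_func() is now the more general build_selection_func(): it
still returns None for falsy input (or a constant default), but accepts an
optional converter such as text.parse_bytes and preserves the converter's
return type when picking from a range. A sketch consistent with the
assertions above (the real util implementation may differ):

    import random

    def build_selection_func(value, default=None, conv=float):
        if not value:
            # Nothing configured: constant default, or disabled entirely
            return (lambda: default) if default else None
        if isinstance(value, (tuple, list)):
            lower, upper = conv(value[0]), conv(value[1])
        elif isinstance(value, str):
            lo, _, hi = value.partition("-")
            lower = conv(lo.strip())
            upper = conv(hi.strip()) if hi.strip() else None
        else:
            lower, upper = conv(value), None
        if upper is None:
            return lambda: lower                         # fixed value
        if isinstance(lower, float) or isinstance(upper, float):
            return lambda: random.uniform(lower, upper)  # e.g. sleep times
        return lambda: random.randint(lower, upper)      # e.g. byte rates
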
@@ -765,40 +877,16 @@ value = 123
self.assertEqual(f(["a", "b", "c"]), "a, b, c")
self.assertEqual(f([1, 2, 3]), "1, 2, 3")
- def test_datetime_to_timestamp(self, f=util.datetime_to_timestamp):
- self.assertEqual(f(util.EPOCH), 0.0)
- self.assertEqual(f(datetime.datetime(2010, 1, 1)), 1262304000.0)
- self.assertEqual(f(datetime.datetime(2010, 1, 1, 0, 0, 0, 128000)),
- 1262304000.128000)
- with self.assertRaises(TypeError):
- f(None)
-
- def test_datetime_to_timestamp_string(
- self, f=util.datetime_to_timestamp_string):
- self.assertEqual(f(util.EPOCH), "0")
- self.assertEqual(f(datetime.datetime(2010, 1, 1)), "1262304000")
- self.assertEqual(f(None), "")
-
- def test_datetime_from_timestamp(
- self, f=util.datetime_from_timestamp):
- self.assertEqual(f(0.0), util.EPOCH)
- self.assertEqual(f(1262304000.0), datetime.datetime(2010, 1, 1))
- self.assertEqual(f(1262304000.128000).replace(microsecond=0),
- datetime.datetime(2010, 1, 1, 0, 0, 0))
-
- def test_datetime_utcfromtimestamp(
- self, f=util.datetime_utcfromtimestamp):
- self.assertEqual(f(0.0), util.EPOCH)
- self.assertEqual(f(1262304000.0), datetime.datetime(2010, 1, 1))
-
- def test_datetime_utcnow(
- self, f=util.datetime_utcnow):
- self.assertIsInstance(f(), datetime.datetime)
-
def test_universal_none(self):
obj = util.NONE
self.assertFalse(obj)
+ self.assertEqual(obj, obj)
+ self.assertEqual(obj, None)
+ self.assertNotEqual(obj, False)
+ self.assertNotEqual(obj, 0)
+ self.assertNotEqual(obj, "")
+
self.assertEqual(len(obj), 0)
self.assertEqual(int(obj), 0)
self.assertEqual(hash(obj), 0)
@@ -873,6 +961,26 @@ value = 123
i += 1
self.assertEqual(i, 0)
+ def test_HTTPBasicAuth(self, f=util.HTTPBasicAuth):
+ class Request:
+ headers = {}
+ request = Request()
+
+ auth = f("", "")
+ auth(request)
+ self.assertEqual(request.headers["Authorization"],
+ b"Basic Og==")
+
+ f("foo", "bar")(request)
+ self.assertEqual(request.headers["Authorization"],
+ b"Basic Zm9vOmJhcg==")
+
+ f("ewsxcvbhnjtr",
+ "RVXQ4i9Ju5ypi86VGJ8MqhDYpDKluS0sxiSRBAG7ymB3Imok")(request)
+ self.assertEqual(request.headers["Authorization"],
+ b"Basic ZXdzeGN2YmhuanRyOlJWWFE0aTlKdTV5cGk4NlZHSjhNc"
+ b"WhEWXBES2x1UzBzeGlTUkJBRzd5bUIzSW1vaw==")
+
def test_module_proxy(self):
proxy = util.ModuleProxy()
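util.HTTPBasicAuth mirrors the callable-auth protocol from requests: the
object precomputes the header value once and attaches it when called with a
request. A minimal equivalent consistent with the expected header bytes above
(the real class may differ in details):

    import base64

    class HTTPBasicAuth:
        def __init__(self, username, password):
            # b64encode(b"user:pass"); empty credentials yield b"Basic Og=="
            self.authorization = b"Basic " + base64.b64encode(
                f"{username}:{password}".encode("latin-1"))

        def __call__(self, request):
            request.headers["Authorization"] = self.authorization
            return request
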
@@ -887,6 +995,16 @@ value = 123
self.assertIs(proxy["abc.def.ghi"], util.NONE)
self.assertIs(proxy["os.path2"], util.NONE)
+ def test_lazy_prompt(self):
+ prompt = util.LazyPrompt()
+
+ with patch("getpass.getpass") as p:
+ p.return_value = "***"
+ result = str(prompt)
+
+ self.assertEqual(result, "***")
+ p.assert_called_once_with()
+
def test_null_context(self):
with util.NullContext():
pass
@@ -901,6 +1019,28 @@ value = 123
except ValueError as exc:
self.assertIs(exc, exc_orig)
+ def test_null_response(self):
+ response = util.NullResponse("https://example.org")
+
+ self.assertEqual(response.url, "https://example.org")
+ self.assertEqual(response.status_code, 900)
+ self.assertEqual(response.reason, "")
+ self.assertEqual(response.text, "")
+ self.assertEqual(response.content, b"")
+ self.assertEqual(response.json(), {})
+
+ self.assertFalse(response.ok)
+ self.assertFalse(response.is_redirect)
+ self.assertFalse(response.is_permanent_redirect)
+ self.assertFalse(response.history)
+
+ self.assertEqual(response.encoding, "utf-8")
+ self.assertEqual(response.apparent_encoding, "utf-8")
+ self.assertEqual(response.cookies.get("foo"), None)
+ self.assertEqual(response.headers.get("foo"), None)
+ self.assertEqual(response.links.get("next"), None)
+ self.assertEqual(response.close(), None)
+
class TestExtractor():
category = "test_category"
diff --git a/test/test_ytdl.py b/test/test_ytdl.py
index f7eb671..ecc6d2f 100644
--- a/test/test_ytdl.py
+++ b/test/test_ytdl.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-# Copyright 2022-2023 Mike Fährmann
+# Copyright 2022-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -26,6 +26,7 @@ class Test_CommandlineArguments(unittest.TestCase):
raise unittest.SkipTest("cannot import module '{}'".format(
cls.module_name))
cls.default = ytdl.parse_command_line(cls.module, [])
+ cls.ytdlp = hasattr(cls.module, "cookies")
def test_ignore_errors(self):
self._("--ignore-errors" , "ignoreerrors", True)
@@ -155,21 +156,21 @@ class Test_CommandlineArguments(unittest.TestCase):
def test_subs(self):
opts = self._(["--convert-subs", "srt"])
conv = {"key": "FFmpegSubtitlesConvertor", "format": "srt"}
- if self.module_name == "yt_dlp":
+ if self.ytdlp:
conv["when"] = "before_dl"
self.assertEqual(opts["postprocessors"][0], conv)
def test_embed(self):
subs = {"key": "FFmpegEmbedSubtitle"}
thumb = {"key": "EmbedThumbnail", "already_have_thumbnail": False}
- if self.module_name == "yt_dlp":
+ if self.ytdlp:
subs["already_have_subtitle"] = False
opts = self._(["--embed-subs", "--embed-thumbnail"])
self.assertEqual(opts["postprocessors"][:2], [subs, thumb])
thumb["already_have_thumbnail"] = True
- if self.module_name == "yt_dlp":
+ if self.ytdlp:
subs["already_have_subtitle"] = True
thumb["already_have_thumbnail"] = "all"
@@ -212,7 +213,7 @@ class Test_CommandlineArguments(unittest.TestCase):
"--ignore-config",
]
- if self.module_name != "yt_dlp":
+ if not self.ytdlp:
cmdline.extend((
"--dump-json",
"--dump-single-json",